"""Pull the ASCII numbers out of a UTF-8 text file.

Run as a script with the input path as the only argument. The result is
written to ``./polish_wiki_excerpt.out``.
"""

import re
import sys

# "19" followed by two digits and the literal " r." (template: "19XX r.").
_DATE_RE = re.compile(r"19\d\d r\.")

# A maximal run of ASCII digits. The explicit class keeps non-ASCII digit
# characters (e.g. superscripts) from being treated as part of a number.
_NUMBER_RE = re.compile(r"[0-9]+")

OUTPUT_PATH = "./polish_wiki_excerpt.out"


def solve_dates(lines):
    """Return the concatenation of every line that contains a "19XX r." date.

    This is the date-filter variant that used to be defined under the name
    ``solve`` and was shadowed by the number extractor below, so it could
    never be called. It is kept under its own name.

    Args:
        lines: Iterable of text lines; line terminators are kept as-is.

    Returns:
        The matching lines joined with no separator (each line carries its
        own terminator, if it had one).
    """
    # A regex search finds the pattern anywhere in the line, including at the
    # very end and after a false start such as "11999 r.".
    return "".join(line for line in lines if _DATE_RE.search(line))


def solve(lines):
    """Extract the ASCII numbers of each line, separated by spaces.

    Args:
        lines: Iterable of text lines.

    Returns:
        One output row per input line that contains at least one number; the
        numbers of a row are joined by single spaces and the rows are joined
        by ``"\\n"``. Lines without numbers produce no row. An empty string
        is returned when no line contains a number.
    """
    per_line = (_NUMBER_RE.findall(line) for line in lines)
    # Each line is scanned independently, so a number at the very end of a
    # line (no trailing newline) is kept and nothing leaks into the next line.
    return "\n".join(" ".join(numbers) for numbers in per_line if numbers)


def main():
    """Read the file named by argv[1], run ``solve``, and write the output."""
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} INPUT_FILE")

    with open(sys.argv[1], encoding="utf-8") as f:
        lines = f.readlines()

    sol = solve(lines)

    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        f.write(sol)
        f.write("\n")  # the expected output file ends with a newline


if __name__ == "__main__":
    main()