"""Collect unigram and bigram counts from an xz-compressed TSV file.

Each input line is expected to have eight tab-separated columns; the last
two (called ``l_context`` and ``r_context`` here) are joined with a space,
tokenized, and counted.  The resulting ``Counter`` objects are pickled to
disk.  Importing or running this module triggers the computation, because
``get_stats()`` is called at module level.
"""
import lzma
import pickle
import sys
from collections import Counter, deque

import regex as re
from tqdm import tqdm

# A word is a maximal run of Unicode letters and apostrophes.  Compiled once
# at import time so the per-line hot loop does not go through the pattern
# cache on every call.
_WORD_RE = re.compile(r'[\p{L}\']+')

# Number of tab-separated columns every input line must have.
_EXPECTED_COLUMNS = 8


def get_words(text):
    """Yield each word (run of Unicode letters/apostrophes) found in *text*.

    Words are yielded in order of appearance, exactly as matched: no case
    folding or other normalization is applied.
    """
    for match in _WORD_RE.finditer(text):
        yield match.group(0)


def get_ngrams(iterable, n):
    """Yield every window of *n* consecutive items of *iterable* as a tuple.

    Windows overlap (step of one item).  Nothing is yielded when *iterable*
    has fewer than *n* items or when *n* is less than 1.
    """
    if n < 1:
        # No window of non-positive size exists; return early instead of
        # buffering the whole input for nothing.
        return
    # A bounded deque drops the oldest item automatically on append, so the
    # window never has to be copied to slide it forward.
    window = deque(maxlen=n)
    for item in iterable:
        window.append(item)
        if len(window) == n:
            yield tuple(window)


def _dump_pickle(obj, path):
    """Pickle *obj* to *path* using the highest available protocol."""
    with open(path, "wb") as file:
        pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)


def get_stats(
    input_path="train/in.tsv.xz",
    word_stats_path="word_stats.pickle",
    bigram_stats_path="bigram_stats.pickle",
):
    """Count words and bigrams in the input file and pickle both counters.

    Args:
        input_path: xz-compressed, UTF-8, tab-separated file to read.
        word_stats_path: where the word ``Counter`` is pickled.
        bigram_stats_path: where the bigram ``Counter`` (keys are 2-tuples
            of words) is pickled.

    Raises:
        ValueError: if a line does not have exactly eight tab-separated
            columns.
    """
    word_stats = Counter()
    bigram_stats = Counter()

    with lzma.open(input_path, mode="rt", encoding="utf-8") as file:
        for line_no, line in enumerate(tqdm(file), start=1):
            fields = line.split("\t")
            if len(fields) != _EXPECTED_COLUMNS:
                raise ValueError(
                    f"{input_path}: line {line_no} has {len(fields)} "
                    f"tab-separated columns, expected {_EXPECTED_COLUMNS}"
                )
            l_context, r_context = fields[-2:]

            # The two contexts are joined with a single space, so the last
            # word of the left context and the first word of the right
            # context are counted as a bigram.
            # NOTE(review): the file is read line by line and both fields
            # are stripped, so no real newline can remain for this replace
            # to act on.  If the data encodes line breaks as the
            # two-character sequence backslash + "n", they are NOT handled
            # here -- confirm the intent against the data.
            text = f"{l_context.strip()} {r_context.strip()}".replace("\n", " ")

            # Tokenize once and reuse the result for both counters.
            words = list(get_words(text))
            word_stats.update(words)
            bigram_stats.update(get_ngrams(words, 2))

    _dump_pickle(word_stats, word_stats_path)
    _dump_pickle(bigram_stats, bigram_stats_path)


# Kept unconditional on purpose: the script has always computed the
# statistics as soon as the module is executed.
get_stats()