challenging-america-word-ga.../bigrams_train.py

40 lines
1.1 KiB
Python

import sys
import lzma
import regex as re
import pickle
from tqdm import tqdm
from collections import Counter
def get_words(text):
    """Lazily yield each word found in ``text``, in order of appearance.

    A word is a maximal run of Unicode letters (``\\p{L}``) and apostrophes;
    every other character acts as a separator and is dropped.
    """
    # ``re`` is the third-party ``regex`` package here; ``\p{L}`` relies on it.
    yield from (
        found.group(0) for found in re.finditer(r'[\p{L}\']+', text)
    )
def get_ngrams(iterable, n):
    """Yield every run of ``n`` consecutive items of ``iterable`` as a tuple.

    Windows overlap and advance one item at a time, so ``"abcd"`` with
    ``n=2`` gives ``("a", "b")``, ``("b", "c")``, ``("c", "d")``.

    Args:
        iterable: Any iterable; it is consumed lazily, in a single pass.
        n: Window length.

    Yields:
        Tuples of length ``n``, in input order. Nothing is yielded when the
        input holds fewer than ``n`` items or when ``n < 1``.
    """
    # Function-scope import: the module itself only imports Counter.
    from collections import deque

    if n < 1:
        # No window of non-positive length exists; stop before buffering
        # anything rather than accumulating the whole input for no output.
        return

    # A bounded deque drops the oldest item automatically on append, so the
    # window never has to be re-sliced into a fresh list.
    window = deque(maxlen=n)
    for item in iterable:
        window.append(item)
        if len(window) == n:
            yield tuple(window)
def get_stats(
    train_path="train/in.tsv.xz",
    word_stats_path="word_stats.pickle",
    bigram_stats_path="bigram_stats.pickle",
):
    """Count words and word bigrams in the training file and pickle both.

    Each input line must split into exactly eight tab-separated columns;
    only the last two (left and right context) are used. They are stripped,
    joined with a single space, and tokenised with ``get_words``. Because the
    two contexts are joined first, the last word of the left context and the
    first word of the right context are counted as a bigram. Bigrams never
    span two input lines.

    Args:
        train_path: xz-compressed, UTF-8, tab-separated input file.
        word_stats_path: Output path for the pickled word ``Counter``.
        bigram_stats_path: Output path for the pickled bigram ``Counter``
            (keys are 2-tuples of words).

    Raises:
        ValueError: If a line does not have exactly eight columns.
    """
    word_stats = Counter()
    bigram_stats = Counter()

    with lzma.open(train_path, mode="rt", encoding="utf-8") as corpus:
        for line in tqdm(corpus):
            # Strict unpacking on purpose: a malformed row fails loudly.
            _, _, _, _, _, _, l_context, r_context = line.split("\t")
            # NOTE(review): this replaces real newline characters only; if
            # the corpus encodes line breaks as a literal backslash + "n",
            # they pass through untouched -- confirm against the data.
            text = f"{l_context.strip()} {r_context.strip()}".replace("\n", " ")
            # Tokenise once and feed both counters from the same list
            # instead of running the regex over the text twice.
            words = list(get_words(text))
            word_stats.update(words)
            bigram_stats.update(get_ngrams(words, 2))

    with open(word_stats_path, "wb") as out:
        pickle.dump(word_stats, out, protocol=pickle.HIGHEST_PROTOCOL)
    with open(bigram_stats_path, "wb") as out:
        pickle.dump(bigram_stats, out, protocol=pickle.HIGHEST_PROTOCOL)
# Script entry point: the counting pass runs as soon as this module is
# executed (there is no __main__ guard, so importing it runs it as well).
get_stats()