challenging-america-word-ga.../bigrams_train.py

import sys
import lzma
import regex as re
import pickle
from tqdm import tqdm
from collections import Counter

def get_words(text):
    """Lazily yield every word found in ``text``, in order of appearance.

    A word is a maximal run of characters that are either Unicode letters
    (the ``L`` property class) or apostrophes; everything else acts as a
    separator.  Note that ``re`` in this module is the third-party ``regex``
    package, which is what provides Unicode property classes.
    """
    word_matches = re.finditer(r'[\p{L}\']+', text)
    yield from (found.group(0) for found in word_matches)

def get_ngrams(iterable, n):
    """Yield each run of ``n`` consecutive items of ``iterable`` as a tuple.

    The window advances one item at a time, so successive n-grams overlap by
    ``n - 1`` items.  Nothing is yielded when the input has fewer than ``n``
    items, or when ``n`` is less than 1.
    """
    window = []
    for element in iterable:
        window.append(element)
        if len(window) != n:
            # Window is not full yet; keep collecting.
            continue
        yield tuple(window)
        # Slide forward: drop the oldest item to make room for the next one.
        del window[0]


def get_stats(
    train_path="train/in.tsv.xz",
    word_stats_path="word_stats.pickle",
    bigram_stats_path="bigram_stats.pickle",
):
    """Count words and word bigrams in the training file and pickle the counts.

    Every line of the xz-compressed, UTF-8, tab-separated file at
    ``train_path`` must split into exactly eight fields.  The last two fields
    are taken as the left and right context; they are stripped, joined with a
    single space and tokenized with ``get_words``.  Because the two contexts
    are joined before tokenizing, one bigram per line spans the boundary
    between them.

    Args:
        train_path: Path of the xz-compressed TSV input.
        word_stats_path: Where the pickled ``Counter`` of single words is
            written.
        bigram_stats_path: Where the pickled ``Counter`` of ``(word, word)``
            tuples is written.

    Raises:
        ValueError: If a line does not contain exactly eight tab-separated
            fields.
    """
    word_stats = Counter()
    bigram_stats = Counter()

    with lzma.open(train_path, mode="rt", encoding="utf-8") as file:
        for line in tqdm(file):
            _, _, _, _, _, _, l_context, r_context = line.split("\t")
            # NOTE(review): this replaces real newline characters, but a line
            # obtained by iterating the file only carries a trailing newline,
            # which strip() has already removed -- so the call is effectively
            # a no-op.  If the data encodes line breaks as the two-character
            # sequence backslash + "n", the pattern was probably meant to be
            # "\\n".  Confirm against the data before changing it, since that
            # alters the collected counts.
            text = f"{l_context.strip()} {r_context.strip()}".replace("\n", " ")
            # Tokenize once and reuse the result for both counters instead of
            # running the regex over the same text twice.
            words = list(get_words(text))
            word_stats.update(words)
            bigram_stats.update(get_ngrams(words, 2))

    with open(word_stats_path, "wb") as file:
        pickle.dump(word_stats, file, protocol=pickle.HIGHEST_PROTOCOL)
    with open(bigram_stats_path, "wb") as file:
        pickle.dump(bigram_stats, file, protocol=pickle.HIGHEST_PROTOCOL)


# Run the corpus pass only when this file is executed as a script, so that
# importing the module for its helper functions neither reads the training
# data nor writes the pickle files.
if __name__ == "__main__":
    get_stats()