# Corpus statistics script: counts word and bigram frequencies from the
# training corpus and pickles the resulting Counters.
import lzma
import pickle
import sys
from collections import Counter

import regex as re
from tqdm import tqdm
def get_words(text):
    """Yield each word-like token found in *text*, in order of appearance.

    A token is a maximal run of Unicode letters and/or apostrophes, so
    contractions such as "don't" come through as a single token.
    """
    # \p{L} (any Unicode letter) requires the third-party `regex` module;
    # the stdlib `re` does not support Unicode property classes.
    for match in re.finditer(r'[\p{L}\']+', text):
        yield match.group(0)
def get_ngrams(iterable, n):
    """Yield every contiguous n-gram of *iterable* as a tuple.

    Produces nothing when the input has fewer than *n* items. The input
    is consumed lazily, one item at a time.
    """
    window = ()
    for element in iterable:
        # Slide a fixed-size tuple window across the stream.
        window += (element,)
        if len(window) == n:
            yield window
            window = window[1:]
def get_stats():
    """Build word and bigram frequency tables from the training corpus.

    Reads ``train/in.tsv.xz`` (8 tab-separated columns per line; only the
    last two — left and right context — are used), counts unigram and
    bigram frequencies over the concatenated contexts, and pickles the
    two Counters to ``word_stats.pickle`` and ``bigram_stats.pickle``.

    Raises:
        FileNotFoundError: if the input archive is missing.
        ValueError: if a line does not have exactly 8 tab-separated fields.
    """
    word_stats = Counter()
    bigram_stats = Counter()

    with lzma.open("train/in.tsv.xz", mode="rt", encoding="utf-8") as file:
        for line in tqdm(file):
            _, _, _, _, _, _, l_context, r_context = line.split("\t")
            # The right context ends with the line's trailing newline;
            # strip the edges and flatten any embedded newlines to spaces.
            text = f"{l_context.strip()} {r_context.strip()}".replace("\n", " ")
            # Tokenize once and reuse — the original called get_words()
            # twice per line, running the regex scan over every line twice.
            words = list(get_words(text))
            word_stats.update(words)
            bigram_stats.update(get_ngrams(words, 2))

    with open("word_stats.pickle", "wb") as file:
        pickle.dump(word_stats, file, protocol=pickle.HIGHEST_PROTOCOL)
    with open("bigram_stats.pickle", "wb") as file:
        pickle.dump(bigram_stats, file, protocol=pickle.HIGHEST_PROTOCOL)
# Guard the corpus pass so importing this module for its helpers
# (get_words, get_ngrams) does not trigger the full I/O-heavy run.
if __name__ == "__main__":
    get_stats()