# Corpus statistics script: counts word and bigram frequencies from the
# training corpus and pickles the resulting Counters.
import lzma
import pickle
import sys
from collections import Counter

import regex as re
from tqdm import tqdm
def get_words(text):
    """Yield each word-like token found in *text*, in order of appearance.

    A token is a maximal run of Unicode letters and/or apostrophes, so
    contractions such as "don't" come through as a single token.
    """
    # \p{L} (any Unicode letter) requires the third-party `regex` module;
    # the stdlib `re` does not support Unicode property classes.
    for match in re.finditer(r'[\p{L}\']+', text):
        yield match.group(0)
def get_ngrams(iterable, n):
    """Yield every contiguous n-gram of *iterable* as a tuple.

    Produces nothing when the input has fewer than *n* items. The input
    is consumed lazily, one item at a time.
    """
    window = ()
    for element in iterable:
        # Slide a fixed-size tuple window across the stream.
        window += (element,)
        if len(window) == n:
            yield window
            window = window[1:]
def get_stats():
    """Build word and bigram frequency tables from the training corpus.

    Reads ``train/in.tsv.xz`` (8 tab-separated columns per line; only the
    last two — left and right context — are used), counts unigram and
    bigram frequencies over the concatenated contexts, and pickles the
    two Counters to ``word_stats.pickle`` and ``bigram_stats.pickle``.

    Raises:
        FileNotFoundError: if the input archive is missing.
        ValueError: if a line does not have exactly 8 tab-separated fields.
    """
    word_stats = Counter()
    bigram_stats = Counter()

    with lzma.open("train/in.tsv.xz", mode="rt", encoding="utf-8") as file:
        for line in tqdm(file):
            _, _, _, _, _, _, l_context, r_context = line.split("\t")
            # The right context ends with the line's trailing newline;
            # strip the edges and flatten any embedded newlines to spaces.
            text = f"{l_context.strip()} {r_context.strip()}".replace("\n", " ")
            # Tokenize once and reuse — the original called get_words()
            # twice per line, running the regex scan over every line twice.
            words = list(get_words(text))
            word_stats.update(words)
            bigram_stats.update(get_ngrams(words, 2))

    with open("word_stats.pickle", "wb") as file:
        pickle.dump(word_stats, file, protocol=pickle.HIGHEST_PROTOCOL)
    with open("bigram_stats.pickle", "wb") as file:
        pickle.dump(bigram_stats, file, protocol=pickle.HIGHEST_PROTOCOL)
# Guard the corpus pass so importing this module for its helpers
# (get_words, get_ngrams) does not trigger the full I/O-heavy run.
if __name__ == "__main__":
    get_stats()