60 lines
1.9 KiB
Python
60 lines
1.9 KiB
Python
import lzma
|
|
import regex as re
|
|
import string
|
|
import queue
|
|
# text = lzma.open('train/in.tsv.xz').read()
|
|
# Trigram occurrence counts, keyed by "first_second_third" strings.
trigrams = {}
# Bigram occurrence counts, keyed by "first_second" strings.
bigrams = {}
|
|
def read_file(file):
    """Yield every line of *file* as a list of punctuation-free word tokens.

    For each line, literal ``\\n`` escape sequences (a backslash followed by
    ``n``) are turned into spaces, the real trailing newline is removed, all
    characters in ``string.punctuation`` are stripped, and the remainder is
    split on spaces and tabs.

    Empty tokens are never produced: leading/trailing whitespace, runs of
    mixed spaces and tabs, and blank lines no longer leak ``""`` entries into
    the result (a blank line yields an empty list).

    Args:
        file: Any iterable of text lines (e.g. a file opened in text mode).

    Yields:
        list[str]: The non-empty tokens of one line, in order.
    """
    # Build the punctuation-stripping table once instead of once per line.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    for line in file:
        # The literal "\n" escapes must be replaced *before* punctuation is
        # stripped: the backslash is itself punctuation, so stripping first
        # would leave a stray "n" glued to the neighbouring word.
        text = line.replace("\\n", " ").replace("\n", "")
        text = text.translate(strip_punctuation).replace("\t", " ")
        # Split on single spaces and discard the empty pieces produced by
        # consecutive, leading, or trailing separators.
        yield [token for token in text.split(" ") if token]
|
|
|
|
def get_words(file):
    """Flatten *file* into one continuous stream of word tokens.

    Each line is tokenised by ``read_file``; the tokens of consecutive lines
    are yielded one after another, with no marker at line boundaries.

    Args:
        file: Any iterable of text lines accepted by ``read_file``.

    Yields:
        str: The next token of the input.
    """
    for line_tokens in read_file(file):
        for token in line_tokens:
            yield token
|
|
|
|
def set_bigram_count(first_word, second_word, bigrams):
    """Increment the occurrence count of one bigram in *bigrams*.

    The bigram is stored under the key ``"<first_word>_<second_word>"``;
    a key that is not present yet starts at 1.

    Args:
        first_word: First word of the pair.
        second_word: Second word of the pair.
        bigrams: Mapping from bigram key to count; modified in place.
    """
    key = f"{first_word}_{second_word}"
    bigrams[key] = bigrams.get(key, 0) + 1
|
|
|
|
def set_trigram_count(first_word, second_word, third_word, trigrams):
    """Increment the occurrence count of one trigram in *trigrams*.

    The trigram is stored under the key
    ``"<first_word>_<second_word>_<third_word>"``; a key that is not present
    yet starts at 1.

    Args:
        first_word: First word of the triple.
        second_word: Second word of the triple.
        third_word: Third word of the triple.
        trigrams: Mapping from trigram key to count; modified in place.
    """
    key = f"{first_word}_{second_word}_{third_word}"
    trigrams[key] = trigrams.get(key, 0) + 1
|
|
|
|
# Number of leading corpus tokens to scan (token indices 0..10000).
MAX_WORDS = 10001

# The two most recently seen non-empty words; None until that many words
# have been read, so no n-gram is counted with a missing component.
previous_word = None
before_previous_word = None

# Count bigrams and trigrams over the first MAX_WORDS tokens of the corpus,
# filling the module-level `bigrams` and `trigrams` dictionaries.
with lzma.open('train/in.tsv.xz', mode='rt') as file:
    for index, word in enumerate(get_words(file)):
        if index >= MAX_WORDS:
            break
        if not word:
            # Skip empty tokens (if the tokenizer emits any) so they never
            # become part of an n-gram key such as "a_" or "a__b".
            continue
        if previous_word is not None:
            set_bigram_count(previous_word, word, bigrams)
            if before_previous_word is not None:
                set_trigram_count(before_previous_word, previous_word, word, trigrams)
        # Slide the two-word history forward by one position.
        before_previous_word, previous_word = previous_word, word
|
|
|
|
# NOTE(review): `text` is not used anywhere below; presumably a placeholder
# for a later lookup - confirm before removing.
text = "one of the"

# Spot-check two specific n-grams. `.get(..., 0)` prints 0 when the n-gram
# did not occur in the scanned prefix instead of raising KeyError and
# aborting the rest of the report.
print(bigrams.get("political_thirst", 0))
print(trigrams.get("to_political_thirst", 0))

# Report every trigram that occurred more than once.
for trigram, count in trigrams.items():
    if count > 1:
        print(trigram, count)

# Report every bigram that occurred more than six times.
for bigram, count in bigrams.items():
    if count > 6:
        print(bigram, count)
|