# challenging-america-word-ga.../run.py
#
# 104 lines
# 3.7 KiB
# Python

import lzma
import regex as re
import string
# text = lzma.open('train/in.tsv.xz').read()
# Module-level state shared by the counting code below.
bigrams = {}   # "first_second" key -> number of occurrences
trigrams = {}  # "first_second_third" key -> number of occurrences
words = []     # tokens accumulated by the first-line tokenization further down
pos = 0        # not referenced by the live code visible in this file
index = 0      # only referenced from the commented-out experiments
def read_file(file):
    """Yield the list of word tokens for each line of *file*.

    Each line is normalised before it is split:

    * the literal backslash-n sequence embedded in the text becomes a space
      and the real newline character is removed;
    * every character in ``string.punctuation`` is deleted;
    * the remaining text is split on runs of spaces and tabs.

    Empty tokens (caused by leading/trailing whitespace, a tab next to a
    space, or whitespace left at a line edge once punctuation is removed)
    are dropped, so a blank line yields an empty list.

    Args:
        file: Iterable of text lines, e.g. a file object opened in text mode.

    Yields:
        list: The non-empty tokens of one line, in their original order.
    """
    # Loop-invariant helpers: build them once instead of once per line.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    separators = re.compile(r'[ \t]+')
    for line in file:
        # The escaped newline must be replaced before punctuation removal,
        # because the backslash itself is a punctuation character.
        text = line.replace("\\n", " ").replace("\n", "")
        text = text.translate(strip_punctuation)
        yield [token for token in separators.split(text) if token]
def get_words(file):
    """Yield every token of *file* one by one, line after line.

    Flattens the per-line token lists produced by ``read_file`` into a
    single stream of words.
    """
    for line_tokens in read_file(file):
        for token in line_tokens:
            yield token
def set_bigram_count(first_word, second_word, bigrams):
    """Increment the occurrence count of a bigram in *bigrams*.

    Args:
        first_word: First word of the pair.
        second_word: Second word of the pair.
        bigrams: Mapping from ``"first_second"`` keys to counts; it is
            updated in place. A key that is not present yet starts at 1.
    """
    # Build the key once; ``get`` covers both the first and later sightings.
    key = f"{first_word}_{second_word}"
    bigrams[key] = bigrams.get(key, 0) + 1
def set_trigram_count(first_word, second_word, third_word, trigrams):
    """Increment the occurrence count of a trigram in *trigrams*.

    Args:
        first_word: First word of the triple.
        second_word: Second word of the triple.
        third_word: Third word of the triple.
        trigrams: Mapping from ``"first_second_third"`` keys to counts; it
            is updated in place. A key that is not present yet starts at 1.
    """
    # Build the key once; ``get`` covers both the first and later sightings.
    key = f"{first_word}_{second_word}_{third_word}"
    trigrams[key] = trigrams.get(key, 0) + 1
# Count bigrams and trigrams over the beginning of the training data using a
# sliding window over the two most recently seen words.
with lzma.open('train/in.tsv.xz', mode='rt') as file:
    # None (rather than "") marks "no word seen yet", so an empty token can
    # never be mistaken for a missing predecessor.
    before_previous = None
    previous = None
    for i_, word in enumerate(get_words(file)):
        if previous is not None:
            set_bigram_count(previous, word, bigrams)
            if before_previous is not None:
                set_trigram_count(before_previous, previous, word, trigrams)
        # Slide the window forward by one word.
        before_previous, previous = previous, word
        # Debug limit: stop once the word with index 100 has been processed.
        if i_ == 100:
            break
print(trigrams)
# Tokenise only the first line of the training data and show the result.
# The shared read_file helper is used so the tokenisation rules live in a
# single place instead of being duplicated here.
with lzma.open('train/in.tsv.xz', mode='rt') as file:
    for line_words in read_file(file):
        words += line_words
        print(words)
        # Only the first line is wanted.
        break
# with lzma.open('train/in.tsv.xz', mode='rt') as file:
# for line in file:
# # print(line.replace("\\n"," ").replace("\n"," "))
# words += re.sub(' +|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ")
# print(words)
# last_two_words = []
# for i_, word in enumerate(words):
# if i_ + 2 < len(words):
# if f"{words[i_+1]}_{words[i_+2]}" not in bigrams:
# bigrams[f"{words[i_+1]}_{words[i_+2]}"] = 1
# else:
# bigrams[f"{words[i_+1]}_{words[i_+2]}"] += 1
# if f"{words[i_]}_{words[i_+1]}_{words[i_+2]}" not in trigrams:
# trigrams[f"{words[i_]}_{words[i_+1]}_{words[i_+2]}"] = 1
# else:
# trigrams[f"{words[i_]}_{words[i_+1]}_{words[i_+2]}"] += 1
# else:
# last_two_words = [words[-2]]+[words[-1]]
# print(last_two_words)
# words = []
# # print(words)
# # print(re.sub(' +|\t', ' ', line).replace("\\n", " ").replace("\n","").split(" "))
# # break
# if index == 2:
# break
# index += 1
# text = "one of the"
# print(bigrams["political_thirst"])
# print(trigrams["to_political_thirst"])
# for trigram in trigrams:
# if trigrams[trigram] > 1:
# print(trigram, trigrams[trigram])
# for bigram in bigrams:
# if bigrams[bigram] > 6:
# print(bigram, bigrams[bigram])