Zrobione dodawanie bigramow i trigramow na tablicach.
This commit is contained in:
parent
d5888a3d7a
commit
993eaaa168
104
run.py
104
run.py
@ -1,17 +1,14 @@
|
|||||||
import lzma
|
import lzma
|
||||||
import regex as re
|
import regex as re
|
||||||
import string
|
import string
|
||||||
|
import queue
|
||||||
# text = lzma.open('train/in.tsv.xz').read()
|
# text = lzma.open('train/in.tsv.xz').read()
|
||||||
trigrams = {}
|
trigrams = {}
|
||||||
bigrams = {}
|
bigrams = {}
|
||||||
pos = 0
|
|
||||||
index = 0
|
|
||||||
words = []
|
|
||||||
def read_file(file):
|
def read_file(file):
|
||||||
for line in file:
|
for line in file:
|
||||||
yield re.sub(' +|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ")
|
yield re.sub(' +|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ")
|
||||||
|
|
||||||
|
|
||||||
def get_words(file):
|
def get_words(file):
|
||||||
for words in read_file(file):
|
for words in read_file(file):
|
||||||
yield from words
|
yield from words
|
||||||
@ -30,74 +27,33 @@ def set_trigram_count(first_word, second_word, third_word, trigrams):
|
|||||||
|
|
||||||
with lzma.open('train/in.tsv.xz', mode='rt') as file:
|
with lzma.open('train/in.tsv.xz', mode='rt') as file:
|
||||||
wordNo = 1
|
wordNo = 1
|
||||||
first_word = ""
|
word_bi_last = ""
|
||||||
second_word = ""
|
words = ["", "", ""]
|
||||||
third_word = ""
|
for i_, word in enumerate(get_words(file)):
|
||||||
for i_, word in enumerate(get_words(file)):
|
if len(word_bi_last) > 0:
|
||||||
if wordNo == 1:
|
set_bigram_count(word_bi_last, word, bigrams)
|
||||||
first_word = word
|
if i_ == 1:
|
||||||
if len(third_word) > 0:
|
words[0]=word_bi_last
|
||||||
set_bigram_count(third_word, first_word, bigrams)
|
words[1]=word
|
||||||
if len(second_word) > 0:
|
elif i_ == 2:
|
||||||
set_trigram_count(second_word, third_word, first_word, trigrams)
|
words[2]=word
|
||||||
|
set_trigram_count(words[0], words[1], words[2], trigrams)
|
||||||
elif wordNo == 2:
|
elif i_ > 2:
|
||||||
second_word = word
|
words[0]=words[1]
|
||||||
set_bigram_count(first_word, second_word, bigrams)
|
words[1]=words[2]
|
||||||
if len(third_word) > 0:
|
words[2]=word
|
||||||
set_trigram_count(third_word, first_word, second_word, trigrams)
|
set_trigram_count(words[0], words[1], words[2], trigrams)
|
||||||
|
word_bi_last = word
|
||||||
elif wordNo == 3:
|
|
||||||
third_word = word
|
|
||||||
set_bigram_count(second_word, third_word, bigrams)
|
|
||||||
set_trigram_count(first_word, second_word, third_word, trigrams)
|
|
||||||
wordNo = 0
|
|
||||||
|
|
||||||
wordNo += 1
|
|
||||||
if i_ == 100:
|
|
||||||
break
|
|
||||||
print(trigrams)
|
|
||||||
|
|
||||||
with lzma.open('train/in.tsv.xz', mode='rt') as file:
|
|
||||||
for line in file:
|
|
||||||
words += re.sub(' +|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ")
|
|
||||||
print(words)
|
|
||||||
break
|
|
||||||
|
|
||||||
# with lzma.open('train/in.tsv.xz', mode='rt') as file:
|
|
||||||
# for line in file:
|
|
||||||
# # print(line.replace("\\n"," ").replace("\n"," "))
|
|
||||||
# words += re.sub(' +|\t', ' ', line.replace("\\n"," ").replace("\n","").translate(str.maketrans('','', string.punctuation))).split(" ")
|
|
||||||
# print(words)
|
|
||||||
# last_two_words = []
|
|
||||||
# for i_, word in enumerate(words):
|
|
||||||
# if i_ + 2 < len(words):
|
|
||||||
# if f"{words[i_+1]}_{words[i_+2]}" not in bigrams:
|
|
||||||
# bigrams[f"{words[i_+1]}_{words[i_+2]}"] = 1
|
|
||||||
# else:
|
|
||||||
# bigrams[f"{words[i_+1]}_{words[i_+2]}"] += 1
|
|
||||||
|
|
||||||
# if f"{words[i_]}_{words[i_+1]}_{words[i_+2]}" not in trigrams:
|
|
||||||
# trigrams[f"{words[i_]}_{words[i_+1]}_{words[i_+2]}"] = 1
|
|
||||||
# else:
|
|
||||||
# trigrams[f"{words[i_]}_{words[i_+1]}_{words[i_+2]}"] += 1
|
|
||||||
# else:
|
|
||||||
# last_two_words = [words[-2]]+[words[-1]]
|
|
||||||
# print(last_two_words)
|
|
||||||
# words = []
|
|
||||||
# # print(words)
|
|
||||||
# # print(re.sub(' +|\t', ' ', line).replace("\\n", " ").replace("\n","").split(" "))
|
|
||||||
# # break
|
|
||||||
# if index == 2:
|
|
||||||
# break
|
|
||||||
# index += 1
|
|
||||||
|
|
||||||
# text = "one of the"
|
if i_ == 10000:
|
||||||
# print(bigrams["political_thirst"])
|
break
|
||||||
# print(trigrams["to_political_thirst"])
|
|
||||||
# for trigram in trigrams:
|
text = "one of the"
|
||||||
# if trigrams[trigram] > 1:
|
print(bigrams["political_thirst"])
|
||||||
# print(trigram, trigrams[trigram])
|
print(trigrams["to_political_thirst"])
|
||||||
# for bigram in bigrams:
|
for trigram in trigrams:
|
||||||
# if bigrams[bigram] > 6:
|
if trigrams[trigram] > 1:
|
||||||
# print(bigram, bigrams[bigram])
|
print(trigram, trigrams[trigram])
|
||||||
|
for bigram in bigrams:
|
||||||
|
if bigrams[bigram] > 6:
|
||||||
|
print(bigram, bigrams[bigram])
|
||||||
|
Loading…
Reference in New Issue
Block a user