452662 trigram
This commit is contained in:
parent
66ceb65baf
commit
6bb3f18cf6
62
run.py
62
run.py
@ -72,31 +72,13 @@ for i in range(len(expected)):
|
|||||||
# In[9]:
|
# In[9]:
|
||||||
|
|
||||||
|
|
||||||
from collections import defaultdict
|
|
||||||
from nltk import ngrams
|
|
||||||
from nltk.tokenize import word_tokenize
|
|
||||||
|
|
||||||
model = defaultdict(lambda: defaultdict(float))
|
|
||||||
dictionary = set()
|
|
||||||
for line in corpus[:100000]:
|
|
||||||
tokens = word_tokenize(line)
|
|
||||||
for word1, word2, word3, word4 in ngrams(tokens, n=4, pad_right=True, pad_left=True):
|
|
||||||
if word1 and word2 and word3 and word4:
|
|
||||||
model[(word2, word3, word4)][word1] += 1
|
|
||||||
model[(word1, word2, word3)][word4] += 1
|
|
||||||
dictionary.update([word1, word2, word3, word4])
|
|
||||||
|
|
||||||
|
|
||||||
# In[15]:
|
|
||||||
|
|
||||||
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from nltk import trigrams
|
from nltk import trigrams
|
||||||
from nltk.tokenize import word_tokenize
|
from nltk.tokenize import word_tokenize
|
||||||
|
|
||||||
model_trigram = defaultdict(lambda: defaultdict(float))
|
model_trigram = defaultdict(lambda: defaultdict(float))
|
||||||
dictionary_trigram = set()
|
dictionary_trigram = set()
|
||||||
for line in corpus[:100000]:
|
for line in corpus[:200000]:
|
||||||
tokens = word_tokenize(line)
|
tokens = word_tokenize(line)
|
||||||
for word1, word2, word3 in trigrams(tokens, pad_right=True, pad_left=True):
|
for word1, word2, word3 in trigrams(tokens, pad_right=True, pad_left=True):
|
||||||
if word1 and word2 and word3:
|
if word1 and word2 and word3:
|
||||||
@ -105,7 +87,7 @@ for line in corpus[:100000]:
|
|||||||
dictionary_trigram.update([word1, word2, word3])
|
dictionary_trigram.update([word1, word2, word3])
|
||||||
|
|
||||||
|
|
||||||
# In[18]:
|
# In[10]:
|
||||||
|
|
||||||
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
@ -114,7 +96,7 @@ from nltk.tokenize import word_tokenize
|
|||||||
|
|
||||||
model_bigram = defaultdict(lambda: defaultdict(float))
|
model_bigram = defaultdict(lambda: defaultdict(float))
|
||||||
dictionary_bigram = set()
|
dictionary_bigram = set()
|
||||||
for line in corpus[:100000]:
|
for line in corpus[:200000]:
|
||||||
tokens = word_tokenize(line)
|
tokens = word_tokenize(line)
|
||||||
for word1, word2 in bigrams(tokens, pad_right=True, pad_left=True):
|
for word1, word2 in bigrams(tokens, pad_right=True, pad_left=True):
|
||||||
if word1 and word2:
|
if word1 and word2:
|
||||||
@ -126,16 +108,6 @@ for line in corpus[:100000]:
|
|||||||
# In[11]:
|
# In[11]:
|
||||||
|
|
||||||
|
|
||||||
smoothing = 0.0001
|
|
||||||
for trio in model:
|
|
||||||
count_sum = sum(model[trio].values()) + smoothing * len(dictionary)
|
|
||||||
for token in model[trio]:
|
|
||||||
model[trio][token] = (model[trio][token] + smoothing) / count_sum
|
|
||||||
|
|
||||||
|
|
||||||
# In[17]:
|
|
||||||
|
|
||||||
|
|
||||||
smoothing = 0.0001
|
smoothing = 0.0001
|
||||||
for trio in model_trigram:
|
for trio in model_trigram:
|
||||||
count_sum = sum(model_trigram[trio].values()) + smoothing * len(dictionary_trigram)
|
count_sum = sum(model_trigram[trio].values()) + smoothing * len(dictionary_trigram)
|
||||||
@ -143,7 +115,7 @@ for trio in model_trigram:
|
|||||||
model_trigram[trio][token] = (model_trigram[trio][token] + smoothing) / count_sum
|
model_trigram[trio][token] = (model_trigram[trio][token] + smoothing) / count_sum
|
||||||
|
|
||||||
|
|
||||||
# In[19]:
|
# In[12]:
|
||||||
|
|
||||||
|
|
||||||
smoothing = 0.0001
|
smoothing = 0.0001
|
||||||
@ -153,12 +125,12 @@ for trio in model_bigram:
|
|||||||
model_bigram[trio][token] = (model_bigram[trio][token] + smoothing) / count_sum
|
model_bigram[trio][token] = (model_bigram[trio][token] + smoothing) / count_sum
|
||||||
|
|
||||||
|
|
||||||
# In[21]:
|
# In[19]:
|
||||||
|
|
||||||
|
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
|
||||||
default = "the:0.30000 of:0.20000 and:0.10000 to:0.10000 in:0.10000 a:0.10000 :0.10000"
|
default = "the:0.10000 of:0.05000 and:0.01000 to:0.01000 in:0.01000 a:0.01000 :0.81000"
|
||||||
|
|
||||||
data = read_xz_file("dev-0\\in.tsv.xz")
|
data = read_xz_file("dev-0\\in.tsv.xz")
|
||||||
corpus_before=[]
|
corpus_before=[]
|
||||||
@ -172,16 +144,11 @@ with open("dev-0\\out.tsv", "w", encoding="utf-8") as output:
|
|||||||
tokens = word_tokenize(text)
|
tokens = word_tokenize(text)
|
||||||
prediction = ""
|
prediction = ""
|
||||||
|
|
||||||
if len(tokens) >= 4:
|
if len(tokens) >= 3:
|
||||||
results = dict(model[(tokens[0], tokens[1], tokens[2])])
|
results = dict(model_trigram[(tokens[0], tokens[1])])
|
||||||
if results:
|
if results:
|
||||||
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
|
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
|
||||||
|
|
||||||
if prediction == "":
|
|
||||||
trigram_results = dict(model_trigram[(tokens[0], tokens[1])])
|
|
||||||
if trigram_results:
|
|
||||||
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(trigram_results).most_common(6))
|
|
||||||
|
|
||||||
if prediction == "":
|
if prediction == "":
|
||||||
bigram_results = dict(model_bigram[tokens[0]])
|
bigram_results = dict(model_bigram[tokens[0]])
|
||||||
if bigram_results:
|
if bigram_results:
|
||||||
@ -199,12 +166,12 @@ with open("dev-0\\out.tsv", "w", encoding="utf-8") as output:
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
# In[23]:
|
# In[22]:
|
||||||
|
|
||||||
|
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
|
||||||
default = "the:0.30000 of:0.20000 and:0.10000 to:0.10000 in:0.10000 a:0.10000 :0.10000"
|
default = "the:0.10000 of:0.05000 and:0.01000 to:0.01000 in:0.01000 a:0.01000 :0.81000"
|
||||||
|
|
||||||
data = read_xz_file("test-A\\in.tsv.xz")
|
data = read_xz_file("test-A\\in.tsv.xz")
|
||||||
corpus_before=[]
|
corpus_before=[]
|
||||||
@ -218,16 +185,11 @@ with open("test-A\\out.tsv", "w", encoding="utf-8") as output:
|
|||||||
tokens = word_tokenize(text)
|
tokens = word_tokenize(text)
|
||||||
prediction = ""
|
prediction = ""
|
||||||
|
|
||||||
if len(tokens) >= 4:
|
if len(tokens) >= 3:
|
||||||
results = dict(model[(tokens[0], tokens[1], tokens[2])])
|
results = dict(model_trigram[(tokens[0], tokens[1])])
|
||||||
if results:
|
if results:
|
||||||
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
|
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
|
||||||
|
|
||||||
if prediction == "":
|
|
||||||
trigram_results = dict(model_trigram[(tokens[0], tokens[1])])
|
|
||||||
if trigram_results:
|
|
||||||
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(trigram_results).most_common(6))
|
|
||||||
|
|
||||||
if prediction == "":
|
if prediction == "":
|
||||||
bigram_results = dict(model_bigram[tokens[0]])
|
bigram_results = dict(model_bigram[tokens[0]])
|
||||||
if bigram_results:
|
if bigram_results:
|
||||||
|
910
test-A/out.tsv
910
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user