452662
This commit is contained in:
parent
77806b3f25
commit
3283b55c35
10270
dev-0/out.tsv
10270
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
108
run.py
108
run.py
@ -21,8 +21,8 @@ def read_tsv_file(file_path):
|
||||
data = []
|
||||
with open(file_path, 'r', encoding='utf-8') as file:
|
||||
for line in file:
|
||||
line = line.strip().split('\t')
|
||||
data.append(line)
|
||||
line = line.strip().split('\t') # Rozdziel linie na elementy za pomocą tabulatora
|
||||
data.append(line) # Dodaj elementy do listy danych
|
||||
return data
|
||||
|
||||
|
||||
@ -87,16 +87,40 @@ for line in corpus[:100000]:
|
||||
dictionary.update([word1, word2, word3, word4])
|
||||
|
||||
|
||||
# In[10]:
|
||||
# In[15]:
|
||||
|
||||
|
||||
model2 = model.copy()
|
||||
from collections import defaultdict
|
||||
from nltk import trigrams
|
||||
from nltk.tokenize import word_tokenize
|
||||
|
||||
model_trigram = defaultdict(lambda: defaultdict(float))
|
||||
dictionary_trigram = set()
|
||||
for line in corpus[:100000]:
|
||||
tokens = word_tokenize(line)
|
||||
for word1, word2, word3 in trigrams(tokens, pad_right=True, pad_left=True):
|
||||
if word1 and word2 and word3:
|
||||
model_trigram[(word2, word3)][word1] += 1
|
||||
model_trigram[(word1, word2)][word3] += 1
|
||||
dictionary_trigram.update([word1, word2, word3])
|
||||
|
||||
|
||||
# In[ ]:
|
||||
# In[18]:
|
||||
|
||||
|
||||
len(model)
|
||||
from collections import defaultdict
|
||||
from nltk import bigrams
|
||||
from nltk.tokenize import word_tokenize
|
||||
|
||||
model_bigram = defaultdict(lambda: defaultdict(float))
|
||||
dictionary_bigram = set()
|
||||
for line in corpus[:100000]:
|
||||
tokens = word_tokenize(line)
|
||||
for word1, word2 in bigrams(tokens, pad_right=True, pad_left=True):
|
||||
if word1 and word2:
|
||||
model_bigram[word2][word1] += 1
|
||||
model_bigram[word1][word2] += 1
|
||||
dictionary_bigram.update([word1, word2])
|
||||
|
||||
|
||||
# In[11]:
|
||||
@ -109,7 +133,27 @@ for trio in model:
|
||||
model[trio][token] = (model[trio][token] + smoothing) / count_sum
|
||||
|
||||
|
||||
# In[12]:
|
||||
# In[17]:
|
||||
|
||||
|
||||
smoothing = 0.0001
|
||||
for trio in model_trigram:
|
||||
count_sum = sum(model_trigram[trio].values()) + smoothing * len(dictionary_trigram)
|
||||
for token in model_trigram[trio]:
|
||||
model_trigram[trio][token] = (model_trigram[trio][token] + smoothing) / count_sum
|
||||
|
||||
|
||||
# In[19]:
|
||||
|
||||
|
||||
smoothing = 0.0001
|
||||
for trio in model_bigram:
|
||||
count_sum = sum(model_bigram[trio].values()) + smoothing * len(dictionary_bigram)
|
||||
for token in model_bigram[trio]:
|
||||
model_bigram[trio][token] = (model_bigram[trio][token] + smoothing) / count_sum
|
||||
|
||||
|
||||
# In[21]:
|
||||
|
||||
|
||||
from collections import Counter
|
||||
@ -126,21 +170,36 @@ for i in range(len(data)):
|
||||
with open("dev-0\\out.tsv", "w", encoding="utf-8") as output:
|
||||
for text in corpus_before:
|
||||
tokens = word_tokenize(text)
|
||||
if len(tokens) < 4:
|
||||
prediction = default
|
||||
prediction = ""
|
||||
|
||||
if len(tokens) >= 4:
|
||||
results = dict(model[(tokens[0], tokens[1], tokens[2])])
|
||||
if not results:
|
||||
prediction = default
|
||||
if results:
|
||||
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
|
||||
|
||||
if prediction == "":
|
||||
trigram_results = dict(model_trigram[(tokens[0], tokens[1])])
|
||||
if trigram_results:
|
||||
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(trigram_results).most_common(6))
|
||||
|
||||
if prediction == "":
|
||||
bigram_results = dict(model_bigram[tokens[0]])
|
||||
if bigram_results:
|
||||
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
|
||||
|
||||
prediction = ' '.join(
|
||||
f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
|
||||
if prediction == "":
|
||||
prediction = default
|
||||
|
||||
output.write(str(prediction.replace("\n", "").strip() + "\n"))
|
||||
|
||||
|
||||
# In[13]:
|
||||
# In[ ]:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# In[23]:
|
||||
|
||||
|
||||
from collections import Counter
|
||||
@ -157,17 +216,26 @@ for i in range(len(data)):
|
||||
with open("test-A\\out.tsv", "w", encoding="utf-8") as output:
|
||||
for text in corpus_before:
|
||||
tokens = word_tokenize(text)
|
||||
if len(tokens) < 4:
|
||||
prediction = default
|
||||
prediction = ""
|
||||
|
||||
if len(tokens) >= 4:
|
||||
results = dict(model[(tokens[0], tokens[1], tokens[2])])
|
||||
if not results:
|
||||
prediction = default
|
||||
if results:
|
||||
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
|
||||
|
||||
if prediction == "":
|
||||
trigram_results = dict(model_trigram[(tokens[0], tokens[1])])
|
||||
if trigram_results:
|
||||
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(trigram_results).most_common(6))
|
||||
|
||||
if prediction == "":
|
||||
bigram_results = dict(model_bigram[tokens[0]])
|
||||
if bigram_results:
|
||||
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
|
||||
|
||||
prediction = ' '.join(
|
||||
f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
|
||||
if prediction == "":
|
||||
prediction = default
|
||||
|
||||
output.write(str(prediction.replace("\n", "").strip() + "\n"))
|
||||
|
||||
|
||||
|
7284
test-A/out.tsv
7284
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user