This commit is contained in:
s452662 2024-04-23 21:23:41 +02:00
parent 77806b3f25
commit 3283b55c35
3 changed files with 8867 additions and 8799 deletions

File diff suppressed because it is too large Load Diff

112
run.py
View File

@ -21,8 +21,8 @@ def read_tsv_file(file_path):
# Read the file at file_path and collect its lines as rows, each row a list of tab-separated string fields.
# NOTE(review): the two statement pairs inside the loop look like the before/after sides of the diff
# (the second pair only adds trailing comments) — confirm against run.py.
data = []
with open(file_path, 'r', encoding='utf-8') as file:
for line in file:
# strip() removes the trailing newline and also any leading/trailing tabs, so empty edge fields are dropped.
line = line.strip().split('\t')
data.append(line)
line = line.strip().split('\t') # Split the line into fields on the tab character
data.append(line) # Append the fields to the data list
return data
@ -87,16 +87,40 @@ for line in corpus[:100000]:
dictionary.update([word1, word2, word3, word4])
# In[10]:
# In[15]:
model2 = model.copy()
# Build trigram counts from the first 100000 entries of `corpus`.
from collections import defaultdict
from nltk import trigrams
from nltk.tokenize import word_tokenize
# Maps a pair of words to {word: count}; missing counts start at 0.0.
model_trigram = defaultdict(lambda: defaultdict(float))
# Every word that took part in a counted trigram; its size feeds the smoothing denominator later in the file.
dictionary_trigram = set()
for line in corpus[:100000]:
tokens = word_tokenize(line)
for word1, word2, word3 in trigrams(tokens, pad_right=True, pad_left=True):
# Skip trigrams with any falsy slot — presumably the padding symbols (None by nltk default; confirm).
if word1 and word2 and word3:
# Count word1 as the word preceding the pair (word2, word3) ...
model_trigram[(word2, word3)][word1] += 1
# ... and word3 as the word following the pair (word1, word2), in the same table.
# NOTE(review): both directions share one key space, so a lookup by a word pair returns a mix of
# "word before" and "word after" counts — confirm this is intended.
model_trigram[(word1, word2)][word3] += 1
dictionary_trigram.update([word1, word2, word3])
# In[ ]:
# In[18]:
len(model)
# Build bigram counts from the first 100000 entries of `corpus`.
from collections import defaultdict
from nltk import bigrams
from nltk.tokenize import word_tokenize
# Maps a single word to {word: count}; missing counts start at 0.0.
model_bigram = defaultdict(lambda: defaultdict(float))
# Every word that took part in a counted bigram; its size feeds the smoothing denominator later in the file.
dictionary_bigram = set()
for line in corpus[:100000]:
tokens = word_tokenize(line)
for word1, word2 in bigrams(tokens, pad_right=True, pad_left=True):
# Skip bigrams with a falsy slot — presumably the padding symbols (None by nltk default; confirm).
if word1 and word2:
# Count word1 as the word preceding word2 ...
model_bigram[word2][word1] += 1
# ... and word2 as the word following word1, in the same table.
# NOTE(review): both directions share one key space, so a lookup by a word returns a mix of
# "word before" and "word after" counts — confirm this is intended.
model_bigram[word1][word2] += 1
dictionary_bigram.update([word1, word2])
# In[11]:
@ -109,7 +133,27 @@ for trio in model:
model[trio][token] = (model[trio][token] + smoothing) / count_sum
# In[12]:
# In[17]:
# Convert the raw trigram counts into smoothed relative frequencies, in place.
smoothing = 0.0001
# `trio` is a key of model_trigram, i.e. a pair of words.
for trio in model_trigram:
# Denominator: total count for this key plus `smoothing` once per word in dictionary_trigram.
count_sum = sum(model_trigram[trio].values()) + smoothing * len(dictionary_trigram)
for token in model_trigram[trio]:
# Only tokens already stored under this key are rescaled; unseen tokens get no entry.
model_trigram[trio][token] = (model_trigram[trio][token] + smoothing) / count_sum
# In[19]:
# Convert the raw bigram counts into smoothed relative frequencies, in place.
smoothing = 0.0001
# `trio` is a key of model_bigram, i.e. a single word (the name is carried over from the other smoothing loops).
for trio in model_bigram:
# Denominator: total count for this key plus `smoothing` once per word in dictionary_bigram.
count_sum = sum(model_bigram[trio].values()) + smoothing * len(dictionary_bigram)
for token in model_bigram[trio]:
# Only tokens already stored under this key are rescaled; unseen tokens get no entry.
model_bigram[trio][token] = (model_bigram[trio][token] + smoothing) / count_sum
# In[21]:
from collections import Counter
@ -126,21 +170,36 @@ for i in range(len(data)):
# Write one prediction line per text in `corpus_before` to dev-0\out.tsv.
# NOTE(review): this hunk appears to interleave removed and added diff lines without +/- markers,
# so it is not runnable as shown — confirm the live version against run.py.
# NOTE(review): the backslash in the path is a directory separator only on Windows.
with open("dev-0\\out.tsv", "w", encoding="utf-8") as output:
for text in corpus_before:
tokens = word_tokenize(text)
if len(tokens) < 4:
prediction = default
prediction = ""
results = dict(model[(tokens[0], tokens[1], tokens[2])])
if not results:
prediction = default
# With at least four tokens, look up `model` by the first three tokens.
if len(tokens) >= 4:
results = dict(model[(tokens[0], tokens[1], tokens[2])])
if results:
# Format the six highest-valued entries as space-separated "term:prob" pairs, prob rounded to 5 places.
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
# Nothing found yet: fall back to model_trigram keyed by the first two tokens.
# NOTE(review): tokens[1] raises IndexError when the text has fewer than two tokens — confirm inputs.
if prediction == "":
trigram_results = dict(model_trigram[(tokens[0], tokens[1])])
if trigram_results:
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(trigram_results).most_common(6))
# Still nothing: fall back to model_bigram keyed by the first token.
# NOTE(review): tokens[0] raises IndexError when the token list is empty — confirm inputs.
if prediction == "":
bigram_results = dict(model_bigram[tokens[0]])
if bigram_results:
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
prediction = ' '.join(
f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
# Last resort: use `default`, which is defined elsewhere in the file.
if prediction == "":
prediction = default
# Remove newlines and surrounding whitespace so each prediction occupies exactly one output line.
output.write(str(prediction.replace("\n", "").strip() + "\n"))
# In[13]:
# In[ ]:
# In[23]:
from collections import Counter
@ -157,17 +216,26 @@ for i in range(len(data)):
# Write one prediction line per text in `corpus_before` to test-A\out.tsv.
# NOTE(review): this hunk appears to interleave removed and added diff lines without +/- markers,
# so it is not runnable as shown — confirm the live version against run.py.
# NOTE(review): the backslash in the path is a directory separator only on Windows.
with open("test-A\\out.tsv", "w", encoding="utf-8") as output:
for text in corpus_before:
tokens = word_tokenize(text)
if len(tokens) < 4:
prediction = default
prediction = ""
results = dict(model[(tokens[0], tokens[1], tokens[2])])
if not results:
prediction = default
# With at least four tokens, look up `model` by the first three tokens.
if len(tokens) >= 4:
results = dict(model[(tokens[0], tokens[1], tokens[2])])
if results:
# Format the six highest-valued entries as space-separated "term:prob" pairs, prob rounded to 5 places.
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
# Nothing found yet: fall back to model_trigram keyed by the first two tokens.
# NOTE(review): tokens[1] raises IndexError when the text has fewer than two tokens — confirm inputs.
if prediction == "":
trigram_results = dict(model_trigram[(tokens[0], tokens[1])])
if trigram_results:
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(trigram_results).most_common(6))
# Still nothing: fall back to model_bigram keyed by the first token.
# NOTE(review): tokens[0] raises IndexError when the token list is empty — confirm inputs.
if prediction == "":
bigram_results = dict(model_bigram[tokens[0]])
if bigram_results:
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
prediction = ' '.join(
f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
# Last resort: use `default`, which is defined elsewhere in the file.
if prediction == "":
prediction = default
# Remove newlines and surrounding whitespace so each prediction occupies exactly one output line.
output.write(str(prediction.replace("\n", "").strip() + "\n"))

File diff suppressed because it is too large Load Diff