This commit is contained in:
s452662 2024-04-23 21:23:41 +02:00
parent 77806b3f25
commit 3283b55c35
3 changed files with 8867 additions and 8799 deletions

File diff suppressed because it is too large Load Diff

112
run.py
View File

@@ -21,8 +21,8 @@ def read_tsv_file(file_path):
data = [] data = []
with open(file_path, 'r', encoding='utf-8') as file: with open(file_path, 'r', encoding='utf-8') as file:
for line in file: for line in file:
line = line.strip().split('\t') line = line.strip().split('\t') # Rozdziel linie na elementy za pomocą tabulatora
data.append(line) data.append(line) # Dodaj elementy do listy danych
return data return data
@@ -87,16 +87,40 @@ for line in corpus[:100000]:
dictionary.update([word1, word2, word3, word4]) dictionary.update([word1, word2, word3, word4])
# In[10]: # In[15]:
model2 = model.copy() from collections import defaultdict
from nltk import trigrams
from nltk.tokenize import word_tokenize
model_trigram = defaultdict(lambda: defaultdict(float))
dictionary_trigram = set()
for line in corpus[:100000]:
tokens = word_tokenize(line)
for word1, word2, word3 in trigrams(tokens, pad_right=True, pad_left=True):
if word1 and word2 and word3:
model_trigram[(word2, word3)][word1] += 1
model_trigram[(word1, word2)][word3] += 1
dictionary_trigram.update([word1, word2, word3])
# In[ ]: # In[18]:
len(model) from collections import defaultdict
from nltk import bigrams
from nltk.tokenize import word_tokenize
model_bigram = defaultdict(lambda: defaultdict(float))
dictionary_bigram = set()
for line in corpus[:100000]:
tokens = word_tokenize(line)
for word1, word2 in bigrams(tokens, pad_right=True, pad_left=True):
if word1 and word2:
model_bigram[word2][word1] += 1
model_bigram[word1][word2] += 1
dictionary_bigram.update([word1, word2])
# In[11]: # In[11]:
@@ -109,7 +133,27 @@ for trio in model:
model[trio][token] = (model[trio][token] + smoothing) / count_sum model[trio][token] = (model[trio][token] + smoothing) / count_sum
# In[12]: # In[17]:
smoothing = 0.0001
for trio in model_trigram:
count_sum = sum(model_trigram[trio].values()) + smoothing * len(dictionary_trigram)
for token in model_trigram[trio]:
model_trigram[trio][token] = (model_trigram[trio][token] + smoothing) / count_sum
# In[19]:
smoothing = 0.0001
for trio in model_bigram:
count_sum = sum(model_bigram[trio].values()) + smoothing * len(dictionary_bigram)
for token in model_bigram[trio]:
model_bigram[trio][token] = (model_bigram[trio][token] + smoothing) / count_sum
# In[21]:
from collections import Counter from collections import Counter
@@ -126,21 +170,36 @@ for i in range(len(data)):
with open("dev-0\\out.tsv", "w", encoding="utf-8") as output: with open("dev-0\\out.tsv", "w", encoding="utf-8") as output:
for text in corpus_before: for text in corpus_before:
tokens = word_tokenize(text) tokens = word_tokenize(text)
if len(tokens) < 4: prediction = ""
prediction = default
results = dict(model[(tokens[0], tokens[1], tokens[2])]) if len(tokens) >= 4:
if not results: results = dict(model[(tokens[0], tokens[1], tokens[2])])
prediction = default if results:
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
if prediction == "":
trigram_results = dict(model_trigram[(tokens[0], tokens[1])])
if trigram_results:
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(trigram_results).most_common(6))
if prediction == "":
bigram_results = dict(model_bigram[tokens[0]])
if bigram_results:
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
prediction = ' '.join(
f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
if prediction == "": if prediction == "":
prediction = default prediction = default
output.write(str(prediction.replace("\n", "").strip() + "\n")) output.write(str(prediction.replace("\n", "").strip() + "\n"))
# In[13]: # In[ ]:
# In[23]:
from collections import Counter from collections import Counter
@@ -157,17 +216,26 @@ for i in range(len(data)):
with open("test-A\\out.tsv", "w", encoding="utf-8") as output: with open("test-A\\out.tsv", "w", encoding="utf-8") as output:
for text in corpus_before: for text in corpus_before:
tokens = word_tokenize(text) tokens = word_tokenize(text)
if len(tokens) < 4: prediction = ""
prediction = default
results = dict(model[(tokens[0], tokens[1], tokens[2])]) if len(tokens) >= 4:
if not results: results = dict(model[(tokens[0], tokens[1], tokens[2])])
prediction = default if results:
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
if prediction == "":
trigram_results = dict(model_trigram[(tokens[0], tokens[1])])
if trigram_results:
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(trigram_results).most_common(6))
if prediction == "":
bigram_results = dict(model_bigram[tokens[0]])
if bigram_results:
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
prediction = ' '.join(
f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
if prediction == "": if prediction == "":
prediction = default prediction = default
output.write(str(prediction.replace("\n", "").strip() + "\n")) output.write(str(prediction.replace("\n", "").strip() + "\n"))

File diff suppressed because it is too large Load Diff