#!/usr/bin/env python
# coding: utf-8
# In[1]:
import lzma

def read_xz_file(file_path):
    data = []
    with lzma.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            # Lowercase and flatten each row: the corpus contains literal "\n"
            # escape sequences, soft hyphens (\xad) and hyphenated line breaks,
            # all of which are stripped or turned into spaces.
            line = line.lower().replace("-\\n", "").replace("\\n", " ").replace("\xad", "").replace("\\\\n", " ").replace("\\\\", " ").replace("\n", " ")
            data.append(line)
    return data
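
# Each returned entry is one flattened, lowercased row that is still
# tab-separated, so individual columns can be recovered later with
# .split("\t").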
# In[2]:
def read_tsv_file(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip().split('\t')  # split the line into fields on tabs
            data.append(line)  # add the fields to the data list
    return data
# In[3]:
file_path = "train\\in.tsv.xz"
# In[4]:
data = read_xz_file(file_path)
# In[5]:
expected = read_tsv_file("train\\expected.tsv")
# In[6]:
corpus_before = []
corpus_after = []
for i in range(len(data)):
    # columns 6 and 7 of in.tsv hold the text before and after the gap
    corpus_before.append(str(data[i].split("\t")[6]))
    corpus_after.append(str(data[i].split("\t")[7]))
# In[7]:
for i in range(len(expected)):
    # each row of expected.tsv is a one-element list; take the word itself
    # (str() on the list would embed brackets and quotes into the corpus)
    expected[i] = expected[i][0].lower()
# In[8]:
corpus = []
for i in range(len(expected)):
    corpus.append(corpus_before[i] + " " + expected[i] + " " + corpus_after[i])
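
# Illustration (hypothetical row): if corpus_before[i] ends with "the quick"
# and expected[i] is "brown", the training line reads "... the quick brown ...",
# so n-grams spanning the gap word are seen during training.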
# In[9]:
from collections import defaultdict
from nltk import trigrams
from nltk.tokenize import word_tokenize

model_trigram = defaultdict(lambda: defaultdict(float))
dictionary_trigram = set()

for line in corpus[:200000]:
    tokens = word_tokenize(line)
    for word1, word2, word3 in trigrams(tokens, pad_right=True, pad_left=True):
        if word1 and word2 and word3:
            # count each trigram in both directions: predicting the first
            # word from the pair that follows it, and the last word from
            # the pair that precedes it
            model_trigram[(word2, word3)][word1] += 1
            model_trigram[(word1, word2)][word3] += 1
            dictionary_trigram.update([word1, word2, word3])
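
# Illustration (hypothetical tokens): after training, the raw count of
# "new york city" sits in model_trigram[("new", "york")]["city"] (forward)
# and in model_trigram[("york", "city")]["new"] (backward).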
# In[10]:
from collections import defaultdict
from nltk import bigrams
from nltk.tokenize import word_tokenize

model_bigram = defaultdict(lambda: defaultdict(float))
dictionary_bigram = set()

for line in corpus[:200000]:
    tokens = word_tokenize(line)
    for word1, word2 in bigrams(tokens, pad_right=True, pad_left=True):
        if word1 and word2:
            # as with the trigram model, count in both directions
            model_bigram[word2][word1] += 1
            model_bigram[word1][word2] += 1
            dictionary_bigram.update([word1, word2])
# In[11]:
smoothing = 0.0001
for trio in model_trigram:
    count_sum = sum(model_trigram[trio].values()) + smoothing * len(dictionary_trigram)
    for token in model_trigram[trio]:
        model_trigram[trio][token] = (model_trigram[trio][token] + smoothing) / count_sum
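
# Add-k smoothing as applied above, with k = 0.0001 and V the trigram
# vocabulary:
#     P(w | context) = (count(context, w) + k) / (sum_w' count(context, w') + k * |V|)
# Only words actually observed after a context are stored, so unseen
# words receive no explicit smoothed mass here.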
# In[12]:
smoothing = 0.0001
for context in model_bigram:
    count_sum = sum(model_bigram[context].values()) + smoothing * len(dictionary_bigram)
    for token in model_bigram[context]:
        model_bigram[context][token] = (model_bigram[context][token] + smoothing) / count_sum
# In[19]:
from collections import Counter
default = "the:0.10000 of:0.05000 and:0.01000 to:0.01000 in:0.01000 a:0.01000 :0.81000"
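# Output format: space-separated "word:probability" pairs; the trailing
# ":0.81000" entry (no word) assigns the remaining probability mass to
# all other words.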
data = read_xz_file("dev-0\\in.tsv.xz")
corpus_before = []
corpus_after = []
for i in range(len(data)):
    corpus_before.append(str(data[i].split("\t")[6]))
    corpus_after.append(str(data[i].split("\t")[7]))
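
# Back-off chain for each dev-0 row: trigram distribution conditioned on
# the end of the left context, then the bigram distribution, then the
# fixed default distribution.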
with open("dev-0\\out.tsv", "w", encoding="utf-8") as output:
for text in corpus_before:
tokens = word_tokenize(text)
2024-04-23 21:23:41 +02:00
prediction = ""
2024-04-23 20:46:13 +02:00
2024-04-24 14:20:00 +02:00
if len(tokens) >= 3:
results = dict(model_trigram[(tokens[0], tokens[1])])
2024-04-23 21:23:41 +02:00
if results:
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
if prediction == "":
bigram_results = dict(model_bigram[tokens[0]])
if bigram_results:
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
2024-04-23 20:46:13 +02:00
if prediction == "":
prediction = default
2024-04-23 21:23:41 +02:00
2024-04-23 20:46:13 +02:00
output.write(str(prediction.replace("\n", "").strip() + "\n"))
# In[ ]:

# In[22]:
from collections import Counter
default = "the:0.10000 of:0.05000 and:0.01000 to:0.01000 in:0.01000 a:0.01000 :0.81000"
data = read_xz_file("test-A\\in.tsv.xz")
corpus_before = []
corpus_after = []
for i in range(len(data)):
    corpus_before.append(str(data[i].split("\t")[6]))
    corpus_after.append(str(data[i].split("\t")[7]))
with open("test-A\\out.tsv", "w", encoding="utf-8") as output:
for text in corpus_before:
tokens = word_tokenize(text)
2024-04-23 21:23:41 +02:00
prediction = ""
2024-04-23 20:46:13 +02:00
2024-04-24 14:20:00 +02:00
if len(tokens) >= 3:
results = dict(model_trigram[(tokens[0], tokens[1])])
2024-04-23 21:23:41 +02:00
if results:
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
if prediction == "":
bigram_results = dict(model_bigram[tokens[0]])
if bigram_results:
prediction = ' '.join(f"{term}:{round(prob, 5)}" for term, prob in Counter(bigram_results).most_common(6))
2024-04-23 20:46:13 +02:00
if prediction == "":
prediction = default
2024-04-23 21:23:41 +02:00
2024-04-23 20:46:13 +02:00
output.write(str(prediction.replace("\n", "").strip() + "\n"))
# In[ ]: