fourgram 100k corpus
This commit is contained in:
parent 1450ea4378
commit 7743dd2472
dev-0/out.tsv: 20804 lines (file diff suppressed because it is too large)
run.py: 178 lines (new file)

@@ -0,0 +1,178 @@
#!/usr/bin/env python
# coding: utf-8


# In[1]:

import lzma


def read_xz_file(file_path):
    data = []
    with lzma.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            # lowercase and strip literal "\n" escapes, soft hyphens,
            # stray backslashes and real newlines left over in the corpus
            line = line.lower().replace("-\\n", "").replace("\\n", " ").replace("\xad", "").replace("\\\\n", " ").replace("\\\\", " ").replace("\n", " ")
            data.append(line)
    return data

# In[2]:

def read_tsv_file(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip().split('\t')  # split the line into fields on tabs
            data.append(line)  # add the fields to the data list
    return data

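# The next cells load the compressed training input (train\in.tsv.xz) and the
# expected gap words (train\expected.tsv); paths use Windows-style backslashes.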
# In[3]:

file_path = "train\\in.tsv.xz"


# In[4]:

data = read_xz_file(file_path)


# In[5]:

expected = read_tsv_file("train\\expected.tsv")

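# Fields 6 and 7 (0-based) of each training line hold the text before and
# after the gap; the next cells stitch them together with the expected word
# in between, producing one full training line per example.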
# In[6]:

corpus_before = []
corpus_after = []
for i in range(len(data)):
    corpus_before.append(str(data[i].split("\t")[6]))
    corpus_after.append(str(data[i].split("\t")[7]))

# In[7]:

for i in range(len(expected)):
    # read_tsv_file returns a list of fields per line, so take the gap word
    # itself rather than the str() of the whole list
    expected[i] = expected[i][0].lower()

# In[8]:

corpus = []
for i in range(len(expected)):
    corpus.append(corpus_before[i] + " " + expected[i] + " " + corpus_after[i])

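# The next cell trains the fourgram model on the first 100k rebuilt lines:
# every window (w1, w2, w3, w4) adds a forward count model[(w1, w2, w3)][w4]
# (the word that follows a trigram) and a backward count
# model[(w2, w3, w4)][w1] (the word that precedes a trigram), and every seen
# token is collected in `dictionary` for smoothing later.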
# In[9]:

from collections import defaultdict
from nltk import ngrams
from nltk.tokenize import word_tokenize


model = defaultdict(lambda: defaultdict(float))
dictionary = set()
for line in corpus[:100000]:
    tokens = word_tokenize(line)
    for word1, word2, word3, word4 in ngrams(tokens, n=4, pad_right=True, pad_left=True):
        if word1 and word2 and word3 and word4:
            model[(word2, word3, word4)][word1] += 1
            model[(word1, word2, word3)][word4] += 1
            dictionary.update([word1, word2, word3, word4])

# In[10]:

model2 = model.copy()


# In[ ]:

len(model)

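# The next cell converts the raw counts into probabilities with additive
# (add-k) smoothing: each stored count gets +smoothing, and the denominator is
# the trigram's total count plus smoothing * len(dictionary), which reserves
# probability mass for continuations that were never observed.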
# In[11]:

smoothing = 0.0001
for trio in model:
    count_sum = sum(model[trio].values()) + smoothing * len(dictionary)
    for token in model[trio]:
        model[trio][token] = (model[trio][token] + smoothing) / count_sum

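# The remaining cells generate the submissions: for each dev-0 / test-A line
# the left context is tokenized, the trigram key is looked up in the model,
# and the six most probable continuations are written as "word:prob" pairs;
# short or unseen contexts fall back to a fixed default distribution.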
# In[12]:

from collections import Counter

default = "the:0.30000 of:0.20000 and:0.10000 to:0.10000 in:0.10000 a:0.10000 :0.10000"

data = read_xz_file("dev-0\\in.tsv.xz")
corpus_before = []
corpus_after = []
for i in range(len(data)):
    corpus_before.append(str(data[i].split("\t")[6]))
    corpus_after.append(str(data[i].split("\t")[7]))

with open("dev-0\\out.tsv", "w", encoding="utf-8") as output:
    for text in corpus_before:
        tokens = word_tokenize(text)
        if len(tokens) < 4:
            # not enough context for a trigram key, fall back to the default
            prediction = default
        else:
            results = dict(model[(tokens[0], tokens[1], tokens[2])])
            prediction = ' '.join(
                f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
            if not results or prediction == "":
                prediction = default
        output.write(prediction.replace("\n", "").strip() + "\n")

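# Same prediction procedure, repeated for the test-A split.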
# In[13]:

from collections import Counter

default = "the:0.30000 of:0.20000 and:0.10000 to:0.10000 in:0.10000 a:0.10000 :0.10000"

data = read_xz_file("test-A\\in.tsv.xz")
corpus_before = []
corpus_after = []
for i in range(len(data)):
    corpus_before.append(str(data[i].split("\t")[6]))
    corpus_after.append(str(data[i].split("\t")[7]))

with open("test-A\\out.tsv", "w", encoding="utf-8") as output:
    for text in corpus_before:
        tokens = word_tokenize(text)
        if len(tokens) < 4:
            # not enough context for a trigram key, fall back to the default
            prediction = default
        else:
            results = dict(model[(tokens[0], tokens[1], tokens[2])])
            prediction = ' '.join(
                f"{term}:{round(prob, 5)}" for term, prob in Counter(results).most_common(6))
            if not results or prediction == "":
                prediction = default
        output.write(prediction.replace("\n", "").strip() + "\n")

# In[ ]:

test-A/out.tsv: 14464 lines (file diff suppressed because it is too large)