#!/usr/bin/env python
# coding: utf-8
# # retroc2
# In[1]:


import lzma
from stop_words import get_stop_words
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.linear_model import LinearRegression


# In[68]:


def read_data(filename):
    # Read the xz-compressed TSV and split it into tab-separated fields;
    # the trailing [:-1] drops the empty entry left by the final newline.
    with lzma.open(filename) as f:
        all_data = f.read().decode('utf-8').split('\n')
    return [line.split('\t') for line in all_data][:-1]


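# Only every 250th training row is kept; the dense TF-IDF matrix built below
# would be very large otherwise.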
train_data = read_data('train/train.tsv.xz')[::250]


# In[69]:


train_data[0]


# In[70]:


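# Polish stop words, plus a few single-letter tokens added explicitly.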
stop_words = get_stop_words('pl') + ['a', 'u', 'i', 'z', 'w', 'o']
print(stop_words)


# In[71]:


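# Tokenize the text field (index 4) of each training row; set() removes
# duplicate tokens within a document.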
train_data_tokenized = [list(set(gensim.utils.tokenize(x[4], lowercase=True))) for x in train_data]


# In[72]:


train_data_tokenized[0]


# In[73]:


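# Poor man's stemming: drop stop words, then keep only the first six
# characters of each remaining token.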
train_data_stemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in train_data_tokenized]
train_data_stemmatized[0]


# In[74]:


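# Build TF-IDF vectors over the space-joined "stems" of the training documents.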
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([' '.join(i) for i in train_data_stemmatized])


# In[75]:


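# Densify the sparse TF-IDF matrix into a DataFrame; this is memory-hungry,
# which is why the training data was subsampled above.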
feature_names = vectorizer.get_feature_names_out()  # get_feature_names() was removed in newer scikit-learn
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)


# In[76]:


len(train_data)


# In[77]:


df[:10]


# In[78]:


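# Quick sanity check: transform an arbitrary phrase with the fitted vocabulary.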
vectorizer.transform(['__ ma kota']).toarray()[0]


# In[79]:


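# The first two columns appear to hold the lower and upper bound of the
# publication date; the regression target is their midpoint.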
train_Y = [(float(x[0]) + float(x[1])) / 2 for x in train_data]


# In[80]:


model = LinearRegression()  # define the model
model.fit(df, train_Y)  # fit the model


# In[81]:


model.predict(df[:10])


# In[82]:


with open('dev-0/in.tsv', "r", encoding="utf-8") as f:
    dev_0_data = [line.rstrip() for line in f]

# Apply the same tokenization and truncation-based stemming as for training.
dev_0_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in dev_0_data]
dev_0_data_stemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in dev_0_data_tokenized]
dev_0_data = [' '.join(i) for i in dev_0_data_stemmatized]


# In[83]:


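# transform() returns a sparse matrix; toarray() densifies it to match the
# dense DataFrame the model was trained on.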
y_predicted = model.predict(vectorizer.transform(dev_0_data).toarray())


# In[84]:


y_predicted[:10]


# In[92]:


f = open("dev-0/out.tsv", "a")
|
|
|
|
for i in y_predicted:
|
|
|
|
f.write(str(round(i, 11)) + '\n')
|
|
|
|
f.close()


# In[86]:


with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
    e = [line.rstrip() for line in f]


# In[94]:


import math

# RMSE between the predicted dates and dev-0/expected.tsv.
t = []
for i in range(len(y_predicted)):
    tmp = (float(y_predicted[i]) - float(e[i])) ** 2
    t.append(tmp)
print(math.sqrt(sum(t) / len(y_predicted)))


# In[88]:


with open('test-A/in.tsv', "r", encoding="utf-8") as f:
    test_A_data = [line.rstrip() for line in f]

# Same preprocessing pipeline as for the training and dev-0 data.
test_A_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in test_A_data]
test_A_data_stemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in test_A_data_tokenized]
test_A_data = [' '.join(i) for i in test_A_data_stemmatized]


# In[89]:


y_test_predicted = model.predict(vectorizer.transform(test_A_data).toarray())


# In[90]:


y_test_predicted[:10]


# In[93]:


f = open("test-A/out.tsv", "a")
|
|
|
|
for i in y_test_predicted:
|
|
|
|
f.write(str(round(i, 11)) + '\n')
|
|
|
|
f.close()


# In[ ]: