Compare commits

...

3 Commits

Author  SHA1        Message           Date
Zosia   aa6998f037  update script     2021-05-18 22:46:19 +02:00
Zosia   ad632af707  add script        2021-05-18 22:41:52 +02:00
Zosia   56217b104a  add output files  2021-05-18 22:40:40 +02:00
4 changed files with 35895 additions and 0 deletions

dev-0/out.tsv (new file, 20000 additions)
File diff suppressed because it is too large

retroc2.ipynb (new file, 1475 additions)
File diff suppressed because it is too large

retroc2.py (new file, 200 additions)

@@ -0,0 +1,200 @@
#!/usr/bin/env python
# coding: utf-8
# # retroc2
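# A linear-regression baseline: predict the year of a Polish text from
# TF-IDF features over crudely stemmed tokens (the target is the midpoint
# of the date range in the first two training columns).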
# In[1]:
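# Dependencies: the stop-words package (pip install stop-words), gensim,
# scikit-learn and pandas.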
import lzma
import csv
from stop_words import get_stop_words
import gensim
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.linear_model import LinearRegression
# In[68]:
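# Load the xz-compressed training TSV; keep only every 250th row, which
# keeps the dense TF-IDF DataFrame built below small enough to hold in memory.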
def read_data(filename):
    # Read the xz-compressed TSV and split it into rows of tab-separated
    # fields; drop the last element, an empty row left by the trailing newline.
    with lzma.open(filename, 'rt', encoding='UTF-8') as f:
        all_data = f.read().split('\n')
    return [line.split('\t') for line in all_data][:-1]
train_data = read_data('train/train.tsv.xz')[::250]
# In[69]:
train_data[0]
# In[70]:
stop_words = get_stop_words('pl') + ['a', 'u', 'i', 'z', 'w', 'o']
print(stop_words)
# In[71]:
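# Tokenize field 4 (the document text), lowercased; set() deduplicates
# tokens within each document.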
train_data_tokenized = [list(set(gensim.utils.tokenize(x[4], lowercase = True))) for x in train_data]
# In[72]:
train_data_tokenized[0]
# In[73]:
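# Poor man's stemming: remove stop words, then truncate each token to its
# first 6 characters.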
train_data_stemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in train_data_tokenized]
train_data_stemmatized[0]
# In[74]:
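# Fit TF-IDF on the space-joined pseudo-stems of each document.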
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([' '.join(i) for i in train_data_stemmatized])
# In[75]:
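# Densify the TF-IDF matrix into a DataFrame (memory-hungry, hence the
# subsampling above). Note: get_feature_names() was removed in
# scikit-learn 1.2; newer versions use get_feature_names_out().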
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
# In[76]:
len(train_data)
# In[77]:
df[:10]
# In[78]:
vectorizer.transform(['__ ma kota']).toarray()[0]
# In[79]:
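# Regression target: the midpoint of each document's date range (fields 0 and 1).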
train_Y = [(float(x[0]) + float(x[1])) / 2 for x in train_data]
# In[80]:
model = LinearRegression()  # define the model
model.fit(df, train_Y)  # fit the model to the TF-IDF features
# In[81]:
model.predict(df[:10])
# In[82]:
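# Preprocess dev-0 with the same tokenize / stop-word / 6-character pipeline
# as the training data.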
with open('dev-0/in.tsv', "r", encoding="utf-8") as f:
    dev_0_data = [line.rstrip() for line in f]
dev_0_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase = True))) for x in dev_0_data]
dev_0_data_stemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in dev_0_data_tokenized]
dev_0_data = [' '.join(i) for i in dev_0_data_stemmatized]
# In[83]:
y_predicted = model.predict(vectorizer.transform(dev_0_data).toarray())
# In[84]:
y_predicted[:10]
# In[92]:
with open("dev-0/out.tsv", "w") as f:  # "w": rerunning should overwrite, not append
    for i in y_predicted:
        f.write(str(round(i, 11)) + '\n')
# In[86]:
with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
    e = [line.rstrip() for line in f]
# In[94]:
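# RMSE on dev-0 against expected.tsv. (sklearn.metrics.mean_squared_error
# with squared=False would give the same result once e is cast to float.)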
import math
squared_errors = [(float(p) - float(y)) ** 2 for p, y in zip(y_predicted, e)]
print(math.sqrt(sum(squared_errors) / len(y_predicted)))
# In[88]:
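# Same preprocessing for test-A; predictions are written to test-A/out.tsv below.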
with open('test-A/in.tsv', "r", encoding="utf-8") as f:
    test_A_data = [line.rstrip() for line in f]
test_A_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase = True))) for x in test_A_data]
test_A_data_stemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in test_A_data_tokenized]
test_A_data = [' '.join(i) for i in test_A_data_stemmatized]
# In[89]:
y_test_predicted = model.predict(vectorizer.transform(test_A_data).toarray())
# In[90]:
y_test_predicted[:10]
# In[93]:
with open("test-A/out.tsv", "w") as f:  # "w": rerunning should overwrite, not append
    for i in y_test_predicted:
        f.write(str(round(i, 11)) + '\n')
# In[ ]:

test-A/out.tsv (new file, 14220 additions)
File diff suppressed because it is too large