Compare commits

3 Commits

Author | SHA1 | Date
---|---|---
| aa6998f037 |
| ad632af707 |
| 56217b104a |
20000  dev-0/out.tsv  Normal file
File diff suppressed because it is too large
1475  retroc2.ipynb  Normal file
File diff suppressed because it is too large
200  retroc2.py  Normal file

@@ -0,0 +1,200 @@
#!/usr/bin/env python
# coding: utf-8

# # retroc2

# In[1]:


import lzma
import csv
from stop_words import get_stop_words
import gensim
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.linear_model import LinearRegression


# In[68]:


def read_data(filename):
    all_data = lzma.open(filename).read().decode('UTF-8').split('\n')
    return [line.split('\t') for line in all_data][:-1]


# keep only every 250th row so the dense TF-IDF matrix built later stays manageable
train_data = read_data('train/train.tsv.xz')[::250]
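# rows are tab-separated; judging by the indexing used below, columns 0 and 1
# hold the lower and upper bound of a text's date and column 4 holds the text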


# In[69]:


train_data[0]


# In[70]:


# Polish stop words, extended with bare single-letter function words
stop_words = get_stop_words('pl') + ['a', 'u', 'i', 'z', 'w', 'o']
print(stop_words)


# In[71]:


train_data_tokenized = [list(set(gensim.utils.tokenize(x[4], lowercase=True))) for x in train_data]
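# note: set() keeps each token at most once per document, so the TF part of
# the TF-IDF computed below effectively reduces to binary presence/absence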


# In[72]:


train_data_tokenized[0]


# In[73]:


train_data_stemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in train_data_tokenized]
train_data_stemmatized[0]
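# the "stemming" above is plain truncation to the first 6 characters: a crude,
# dictionary-free way to merge Polish inflected forms, e.g.
# 'wydarzeniach'[:6] == 'wydarzenia'[:6] == 'wydarz'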


# In[74]:


vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([' '.join(i) for i in train_data_stemmatized])
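# fit_transform learns the vocabulary and IDF weights and returns a sparse
# document-term matrix; the token lists are joined back into strings because
# TfidfVectorizer expects raw text input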


# In[75]:


feature_names = vectorizer.get_feature_names()  # on scikit-learn >= 1.0 this is get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)
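# densifying is only feasible thanks to the ::250 subsample; a leaner sketch,
# assuming LinearRegression accepts sparse input (scikit-learn's does), would
# skip the DataFrame entirely:
#     model = LinearRegression().fit(vectors, train_Y)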


# In[76]:


len(train_data)


# In[77]:


df[:10]


# In[78]:


# sanity check on an unseen phrase ('__ ma kota' is roughly '__ has a cat')
vectorizer.transform(['__ ma kota']).toarray()[0]


# In[79]:


train_Y = [(float(x[0]) + float(x[1])) / 2 for x in train_data]
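# the regression target: apparently the midpoint of the date interval given by
# columns 0 and 1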


# In[80]:


model = LinearRegression()  # define the model
model.fit(df, train_Y)  # fit the model


# In[81]:


model.predict(df[:10])
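# in-sample sanity check: these predictions for the first ten training rows
# should land near the corresponding values of train_Y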


# In[82]:


with open('dev-0/in.tsv', "r", encoding="utf-8") as f:
    dev_0_data = [line.rstrip() for line in f]

# the dev set must go through exactly the same tokenize -> stop-word -> 6-char
# prefix pipeline that was used on the training data
dev_0_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in dev_0_data]
dev_0_data_stemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in dev_0_data_tokenized]
dev_0_data = [' '.join(i) for i in dev_0_data_stemmatized]


# In[83]:


y_predicted = model.predict(vectorizer.transform(dev_0_data).toarray())
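# .toarray() densifies the transformed matrix to match the dense frame the
# model was fitted on; a model fitted on sparse input would not need this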


# In[84]:


y_predicted[:10]


# In[92]:


# 'w' rather than 'a': appending would duplicate rows on every re-run
with open("dev-0/out.tsv", "w") as f:
    for i in y_predicted:
        f.write(str(round(i, 11)) + '\n')


# In[86]:


with open('dev-0/expected.tsv', "r", encoding="utf-8") as f:
    e = [line.rstrip() for line in f]


# In[94]:


import math

# RMSE of the predicted dates against dev-0/expected.tsv
t = []
for i in range(len(y_predicted)):
    tmp = (float(y_predicted[i]) - float(e[i])) ** 2
    t.append(tmp)
print(math.sqrt(sum(t) / len(y_predicted)))
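# an equivalent one-liner, assuming scikit-learn's metrics module is available:
#     from sklearn.metrics import mean_squared_error
#     print(math.sqrt(mean_squared_error([float(x) for x in e], y_predicted)))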


# In[88]:


with open('test-A/in.tsv', "r", encoding="utf-8") as f:
    test_A_data = [line.rstrip() for line in f]

# same preprocessing pipeline as for dev-0
test_A_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in test_A_data]
test_A_data_stemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in test_A_data_tokenized]
test_A_data = [' '.join(i) for i in test_A_data_stemmatized]


# In[89]:


y_test_predicted = model.predict(vectorizer.transform(test_A_data).toarray())


# In[90]:


y_test_predicted[:10]


# In[93]:


# 'w' rather than 'a', as above
with open("test-A/out.tsv", "w") as f:
    for i in y_test_predicted:
        f.write(str(round(i, 11)) + '\n')


# In[ ]:
14220  test-A/out.tsv  Normal file
File diff suppressed because it is too large