forked from kubapok/retroc2
first commit
This commit is contained in:
parent
647c099815
commit
525b78298b
20000
dev-0/out.tsv
Normal file
20000
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
11563
dev-1/out.tsv
Normal file
11563
dev-1/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
99
run.py
Normal file
99
run.py
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
import lzma
|
||||||
|
from math import sqrt
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.linear_model import LinearRegression
|
||||||
|
from sklearn.pipeline import make_pipeline
|
||||||
|
from sklearn.metrics import mean_squared_error
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.metrics import accuracy_score
|
||||||
|
|
||||||
|
#print("lista polskich stop words")
|
||||||
|
|
||||||
|
# Polish stop words handed to TfidfVectorizer(stop_words=...) so that these
# function words are dropped from the extracted features.
# NOTE: the list previously held the single entry "zeznowu", which is the two
# words "ze" and "znowu" fused together; it is not a word and could never match
# a token, so both words are now listed separately.
pol_stop_words = [
    "aczkolwiek", "albo", "ależ",
    "bardziej", "bardzo", "bowiem", "byli", "bynajmniej", "była", "było",
    "były", "będzie", "będą",
    "cali", "cała", "cały", "ciebie", "cokolwiek", "czasami", "czasem",
    "czemu", "czyli",
    "daleko", "dlaczego", "dlatego", "dobrze", "dokąd", "dość", "dużo",
    "dwaj", "dwie", "dwoje", "dziś", "dzisiaj",
    "gdyby", "gdyż", "gdzie", "gdziekolwiek", "gdzieś",
    "inna", "inne", "inny", "innych",
    "jakaś", "jakby", "jaki", "jakichś", "jakie", "jakiś", "jakiż",
    "jakkolwiek", "jako", "jakoś", "jeden", "jedna", "jedno", "jednak",
    "jednakże", "jego", "jej", "jemu", "jest", "jestem", "jeszcze", "jeśli",
    "jeżeli", "już", "ją",
    "każdy", "kiedy", "kilka", "kimś", "kto", "ktokolwiek", "ktoś", "która",
    "które", "którego", "której", "który", "których", "którym", "którzy", "ku",
    "lecz", "mają", "mimo", "między", "mnie", "mogą", "moim", "moja", "moje",
    "może", "możliwe", "można", "musi",
    "nami", "nasi", "nasz", "nasza", "nasze", "naszego", "naszych",
    "natomiast", "natychmiast", "nawet", "nich", "niego", "niej", "niemu",
    "nigdy", "nimi",
    "obok", "około", "oraz", "owszem",
    "pana", "pani", "podczas", "pomimo", "ponad", "ponieważ", "powinien",
    "powinna", "powinni", "powinno", "poza", "prawie", "przecież", "przed",
    "przede", "przedtem", "przez", "przy",
    "roku", "również", "sama", "są", "skąd", "sobie", "sobą", "sposób",
    "swoje",
    "taka", "taki", "takie", "także", "tego", "teraz", "tobą", "tobie",
    "toteż", "trzeba", "tutaj", "twoi", "twoim", "twoja", "twoje", "twym",
    "twój", "tych", "tylko",
    "wami", "wasz", "wasza", "wasze", "według", "wiele", "wielu", "więc",
    "więcej", "wszyscy", "wszystkich", "wszystkie", "wszystkim", "wszystko",
    "wtedy", "właśnie",
    "zapewne", "zawsze", "ze", "znowu", "znów", "został", "żaden", "żadna",
    "żadne", "żadnych", "żeby",
]
|
||||||
|
|
||||||
|
def rmse(Y_true, Y_predicted):
    """Return the root-mean-square error between two sequences of numbers.

    Both arguments are converted to float NumPy arrays first, so plain lists
    and tuples are accepted in addition to arrays, and values are compared
    position by position (pandas index labels are ignored).

    Args:
        Y_true: Reference values (any array-like of numbers).
        Y_predicted: Predicted values, same length as ``Y_true``.

    Returns:
        The square root of the sum of squared differences divided by
        ``len(Y_true)``.
    """
    y_true = np.asarray(Y_true, dtype=float)
    y_pred = np.asarray(Y_predicted, dtype=float)
    return np.sqrt(np.sum((y_true - y_pred) ** 2) / len(y_true))
|
||||||
|
|
||||||
|
# --- Load the training data and fit the model -------------------------------
# Imported here so that this section stays self-contained.
import csv

# The training set is an xz-compressed, header-less TSV with five columns.
# Column names are Polish: Poczatek = start, Koniec = end, Tytul = title,
# Wyd = (presumably) publisher/edition - confirm against the dataset
# description, Tresc = content.
with lzma.open('train/train.tsv.xz', 'rt', encoding="utf-8") as f:
    data = pd.read_csv(
        f,
        sep='\t',
        names=['Poczatek', 'Koniec', 'Tytul', 'Wyd', 'Tresc'],
        # The fields are raw text, not CSV-quoted values. With the default
        # quoting a stray '"' opens a quoted field that can swallow tabs and
        # newlines, silently merging rows; QUOTE_NONE treats '"' as an
        # ordinary character.
        quoting=csv.QUOTE_NONE,
    )

# Regression target ('Rok' = year): the midpoint of the start/end values.
data['Rok'] = (data['Poczatek'].astype(float) + data['Koniec'].astype(float)) / 2

x = data['Tresc']
y = data['Rok']

# TF-IDF features (with the Polish stop words removed) fed to an ordinary
# least-squares linear regression.
model = make_pipeline(TfidfVectorizer(stop_words=pol_stop_words), LinearRegression())
model.fit(x, y)
|
||||||
|
|
||||||
|
# --- Predict every data split and write the results -------------------------
def _predict_file(fitted_model, in_path, out_path):
    """Predict one value per line of ``in_path`` and write them to ``out_path``.

    Args:
        fitted_model: Object exposing ``predict(list_of_str)``.
        in_path: UTF-8 text file; every line (trailing newline included) is
            passed to the model as one document.
        out_path: Destination file; receives ``str(prediction)`` per line, in
            input order.

    Returns:
        A ``(lines, predictions)`` tuple with the lines read and the values
        returned by the model.
    """
    with open(in_path, 'r', encoding='utf8') as f:
        lines = f.readlines()

    predictions = fitted_model.predict(lines)

    with open(out_path, 'wt', encoding='utf8') as f:
        f.writelines(str(value) + '\n' for value in predictions)

    return lines, predictions


# The expected.tsv files are intentionally not read here: nothing in this
# script uses them, and test-A has none.
in_dev0, out_dev0 = _predict_file(model, 'dev-0/in.tsv', 'dev-0/out.tsv')
in_dev1, out_dev1 = _predict_file(model, 'dev-1/in.tsv', 'dev-1/out.tsv')
in_test, out_test = _predict_file(model, 'test-A/in.tsv', 'test-A/out.tsv')
|
||||||
|
|
||||||
|
def evaluate_rmse(expected_path="dev-0/expected.tsv", predicted_path="dev-0/out.tsv"):
    """Return the RMSE between two files holding one number per line.

    This replaces evaluation code that was disabled by wrapping it in a string
    literal and that compared raw text lines instead of numbers. It is not
    called automatically, so running the script behaves as before.

    Assumes every non-blank line of both files parses as a float - confirm
    against the actual file format before relying on it.

    Args:
        expected_path: File with the reference values.
        predicted_path: File with the predicted values.

    Returns:
        The root-mean-square error as a float.

    Raises:
        ValueError: If the files are empty, differ in length, or contain a
            line that is not a number.
    """
    with open(expected_path, encoding='utf-8') as f:
        y_true = [float(line) for line in f if line.strip()]
    with open(predicted_path, encoding='utf-8') as f:
        y_pred = [float(line) for line in f if line.strip()]

    if not y_true:
        raise ValueError(f"no values found in {expected_path}")
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"length mismatch: {len(y_true)} expected vs {len(y_pred)} predicted"
        )

    squared_error = sum((t - p) ** 2 for t, p in zip(y_true, y_pred))
    return sqrt(squared_error / len(y_true))
|
||||||
|
|
14220
test-A/out.tsv
Normal file
14220
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user