import lzma
from math import sqrt

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

# Polish stop-word list for the TF-IDF vectorizer.
pol_stop_words = [
    "aczkolwiek", "albo", "ależ",
    "bardziej", "bardzo", "bowiem", "byli", "bynajmniej", "była", "było", "były", "będzie", "będą",
    "cali", "cała", "cały", "ciebie", "cokolwiek", "czasami", "czasem", "czemu", "czyli",
    "daleko", "dlaczego", "dlatego", "dobrze", "dokąd", "dość", "dużo", "dwaj", "dwie", "dwoje", "dziś", "dzisiaj",
    "gdyby", "gdyż", "gdzie", "gdziekolwiek", "gdzieś",
    "inna", "inne", "inny", "innych",
    "jakaś", "jakby", "jaki", "jakichś", "jakie", "jakiś", "jakiż", "jakkolwiek", "jako", "jakoś", "jeden", "jedna",
    "jedno", "jednak", "jednakże", "jego", "jej", "jemu", "jest", "jestem", "jeszcze", "jeśli", "jeżeli", "już", "ją",
    "każdy",
    "kiedy", "kilka", "kimś", "kto", "ktokolwiek", "ktoś", "która", "które", "którego", "której", "który", "których",
    "którym", "którzy", "ku",
    "lecz", "mają", "mimo", "między", "mnie", "mogą", "moim", "moja", "moje", "może", "możliwe", "można", "musi",
    "nami", "nasi", "nasz", "nasza", "nasze", "naszego", "naszych", "natomiast", "natychmiast", "nawet", "nich",
    "niego", "niej", "niemu", "nigdy", "nimi",
    "obok", "około", "oraz", "owszem",
    "pana", "pani", "podczas", "pomimo", "ponad", "ponieważ", "powinien", "powinna", "powinni", "powinno", "poza",
    "prawie", "przecież", "przed", "przede", "przedtem", "przez", "przy",
    "roku", "również", "sama",
    "są", "skąd", "sobie", "sobą", "sposób", "swoje",
    "taka", "taki", "takie", "także", "tego", "teraz", "tobą", "tobie", "toteż", "trzeba", "tutaj", "twoi", "twoim",
    "twoja", "twoje", "twym", "twój", "tych", "tylko",
    "wami", "wasz", "wasza", "wasze", "według", "wiele", "wielu", "więc", "więcej", "wszyscy", "wszystkich",
    "wszystkie", "wszystkim", "wszystko", "wtedy", "właśnie",
    "zapewne", "zawsze", "zeznowu", "znów", "został", "żaden", "żadna", "żadne", "żadnych", "żeby",
]


def rmse(Y_true, Y_predicted):
    """Root-mean-square error between true and predicted values."""
    return np.sqrt(np.sum((Y_true - Y_predicted) ** 2) / len(Y_true))


# Load the training data (xz-compressed TSV).
with lzma.open('train/train.tsv.xz', 'rt', encoding='utf-8') as f:
    data = pd.read_csv(f, sep='\t', names=['Poczatek', 'Koniec', 'Tytul', 'Wyd', 'Tresc'])
# data = data[0:5000]  # uncomment to work on a smaller sample

# Target: the year, taken as the midpoint of the publication period.
data['Rok'] = (data['Poczatek'].astype(float) + data['Koniec'].astype(float)) / 2

# Model: TF-IDF features over the text column fed into a linear regression,
# fitted on the training data.
x = data['Tresc']
y = data['Rok']
model = make_pipeline(TfidfVectorizer(stop_words=pol_stop_words), LinearRegression())
model.fit(x, y)

# Predict on dev-0.
with open('dev-0/in.tsv', 'r', encoding='utf8') as f:
    in_dev0 = f.readlines()
with open('dev-0/expected.tsv', 'r', encoding='utf8') as f:
    expected_dev0 = f.readlines()
out_dev0 = model.predict(in_dev0)
with open('dev-0/out.tsv', 'wt') as f:
    for prediction in out_dev0:
        f.write(str(prediction) + '\n')

# Predict on dev-1.
with open('dev-1/in.tsv', 'r', encoding='utf8') as f:
    in_dev1 = f.readlines()
with open('dev-1/expected.tsv', 'r', encoding='utf8') as f:
    expected_dev1 = f.readlines()
out_dev1 = model.predict(in_dev1)
with open('dev-1/out.tsv', 'wt') as f:
    for prediction in out_dev1:
        f.write(str(prediction) + '\n')

# Predict on test-A.
with open('test-A/in.tsv', 'r', encoding='utf8') as f:
    in_test = f.readlines()
out_test = model.predict(in_test)
with open('test-A/out.tsv', 'wt') as f:
    for prediction in out_test:
        f.write(str(prediction) + '\n')

# Optional evaluation of the dev-0 predictions (disabled): the lines must be
# parsed as floats before computing the error.
'''
y_true = []
with open("dev-0/expected.tsv", encoding='utf-8') as f:
    for line in f.readlines():
        y_true.append(float(line))
y_pred = []
with open("dev-0/out.tsv", encoding='utf-8') as f:
    for line in f.readlines():
        y_pred.append(float(line))
print(sqrt(mean_squared_error(y_true, y_pred)))
'''
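# Illustrative sketch: the rmse() helper defined above can also score the dev-0
# predictions directly in memory, assuming each line of dev-0/expected.tsv holds
# a single numeric year (expected_dev0 is already loaded above).
expected_dev0_years = np.array([float(line) for line in expected_dev0])
print('dev-0 RMSE:', rmse(expected_dev0_years, out_dev0))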