#!/usr/bin/env python
# coding: utf-8

# # retroc2

# In[1]:


import lzma
from stop_words import get_stop_words
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.linear_model import LinearRegression


# In[68]:


def read_data(filename):
    # read the xz-compressed TSV, split into lines and fields; drop the trailing empty line
    with lzma.open(filename) as f:
        all_data = f.read().decode('UTF-8').split('\n')
    return [line.split('\t') for line in all_data][:-1]

# subsample every 250th training example to keep the TF-IDF matrix small
train_data = read_data('train/train.tsv.xz')[::250]


# In[69]:


train_data[0]


# In[70]:


# Polish stop words, extended with a few single-letter function words
stop_words = get_stop_words('pl') + ['a', 'u', 'i', 'z', 'w', 'o']
print(stop_words)


# In[71]:


# tokenize the text field (column 4) and deduplicate tokens within each document
train_data_tokenized = [list(set(gensim.utils.tokenize(x[4], lowercase=True))) for x in train_data]


# In[72]:


train_data_tokenized[0]


# In[73]:


# crude prefix "stemming": drop stop words and truncate each token to its first 6 characters
train_data_stemmatized = [list(set(w[:6] for w in set(i) - set(stop_words))) for i in train_data_tokenized]
train_data_stemmatized[0]


# In[74]:


vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([' '.join(i) for i in train_data_stemmatized])


# In[75]:


feature_names = vectorizer.get_feature_names()  # in scikit-learn >= 1.0 this is get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)


# In[76]:


len(train_data)


# In[77]:


df[:10]


# In[78]:


vectorizer.transform(['__ ma kota']).toarray()[0]


# In[79]:


# the target is the midpoint of the date range in the first two columns
train_Y = [(float(x[0]) + float(x[1])) / 2 for x in train_data]


# In[80]:


model = LinearRegression()  # define the model
model.fit(df, train_Y)  # fit the model


# In[81]:


model.predict(df[:10])


# In[82]:


with open('dev-0/in.tsv', 'r', encoding='utf-8') as f:
    dev_0_data = [line.rstrip() for line in f]

# apply the same tokenization and prefix "stemming" as for the training data
dev_0_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in dev_0_data]
dev_0_data_stemmatized = [list(set(w[:6] for w in set(i) - set(stop_words))) for i in dev_0_data_tokenized]
dev_0_data = [' '.join(i) for i in dev_0_data_stemmatized]


# In[83]:


y_predicted = model.predict(vectorizer.transform(dev_0_data).toarray())


# In[84]:


y_predicted[:10]


# In[92]:


# 'w' instead of 'a' so that reruns overwrite the output rather than appending to it
with open('dev-0/out.tsv', 'w') as f:
    for i in y_predicted:
        f.write(str(round(i, 11)) + '\n')


# In[86]:


with open('dev-0/expected.tsv', 'r', encoding='utf-8') as f:
    e = [line.rstrip() for line in f]


# In[94]:


import math

# RMSE between the predicted and expected years
t = []
for i in range(len(y_predicted)):
    tmp = (float(y_predicted[i]) - float(e[i])) ** 2
    t.append(tmp)
print(math.sqrt(sum(t) / len(y_predicted)))


# In[88]:


with open('test-A/in.tsv', 'r', encoding='utf-8') as f:
    test_A_data = [line.rstrip() for line in f]

test_A_data_tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in test_A_data]
test_A_data_stemmatized = [list(set(w[:6] for w in set(i) - set(stop_words))) for i in test_A_data_tokenized]
test_A_data = [' '.join(i) for i in test_A_data_stemmatized]


# In[89]:


y_test_predicted = model.predict(vectorizer.transform(test_A_data).toarray())


# In[90]:


y_test_predicted[:10]


# In[93]:


with open('test-A/out.tsv', 'w') as f:
    for i in y_test_predicted:
        f.write(str(round(i, 11)) + '\n')
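
# In[ ]:


# A minimal refactoring sketch (not part of the original run): the dev-0 and
# test-A preprocessing cells above are identical, so they could share one
# helper. `preprocess` is a hypothetical name introduced here for illustration;
# it relies only on `gensim` and `stop_words` already defined above.
def preprocess(lines):
    tokenized = [list(set(gensim.utils.tokenize(x, lowercase=True))) for x in lines]
    stemmed = [list(set(w[:6] for w in set(toks) - set(stop_words))) for toks in tokenized]
    return [' '.join(toks) for toks in stemmed]

# Usage sketch: applying preprocess() to the raw lines read from dev-0/in.tsv
# or test-A/in.tsv would reproduce dev_0_data / test_A_data above.


# In[ ]:


# Equivalent RMSE check, as a sketch: assuming scikit-learn >= 0.22 (which
# added the squared=False option), this should match the manual loop above.
from sklearn.metrics import mean_squared_error

rmse = mean_squared_error([float(x) for x in e], y_predicted, squared=False)
print(rmse)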