#!/usr/bin/env python
# coding: utf-8
"""Fit a TF-IDF + linear-regression pipeline and write dev/test predictions.

Reads ``train/train.tsv``, fits the model on at most the first
``TRAIN_ROW_LIMIT`` rows (text in column 4, target in column 0), writes one
prediction per line to ``dev-0/out.tsv`` and ``test-A/out.tsv``, and prints
the mean squared error on the dev set.
"""

import os

import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

# Only the first rows of the training file are used for fitting.
TRAIN_ROW_LIMIT = 40000
# Column positions inside train/train.tsv (the file has no header row).
TRAIN_TEXT_COLUMN = 4
TRAIN_TARGET_COLUMN = 0


def _read_train_table(path):
    """Read a header-less TSV, skipping (with a warning) malformed rows.

    ``error_bad_lines=False`` was deprecated in pandas 1.3 and removed in
    2.0; ``on_bad_lines="warn"`` is its replacement with the same
    skip-and-warn behaviour.  The legacy keyword is kept as a fallback so
    the script still runs on older pandas releases.
    """
    try:
        return pd.read_csv(path, header=None, sep="\t", on_bad_lines="warn")
    except TypeError:
        # pandas < 1.3 does not know the ``on_bad_lines`` keyword.
        return pd.read_csv(path, header=None, sep="\t", error_bad_lines=False)


def _write_predictions(path, predictions):
    """Write each prediction on its own line to ``path``."""
    with open(path, "wt") as out_file:
        out_file.writelines(str(value) + "\n" for value in predictions)


def _export_notebook_if_interactive():
    """Re-export the notebook as a script, but only when run inside IPython.

    ``get_ipython`` exists only in IPython/Jupyter sessions; calling it from
    a plain ``python`` run raises ``NameError``, so that case is skipped.
    """
    try:
        shell = get_ipython()
    except NameError:
        return
    shell.system('jupyter nbconvert --to script run.ipynb')


def main():
    """Train the model, evaluate on dev-0, and write dev/test predictions."""
    train = _read_train_table('train/train.tsv')
    print(len(train))
    train = train.head(TRAIN_ROW_LIMIT)

    x_train = train[TRAIN_TEXT_COLUMN]
    y_train = train[TRAIN_TARGET_COLUMN]

    x_dev_data = pd.read_csv('dev-0/in.tsv', header=None, sep='\t')
    x_dev = x_dev_data[0]
    # NOTE(review): these two assignments set the entries labelled 19999 and
    # 20000 to placeholder text. Presumably a workaround for rows that are
    # missing or empty after parsing -- confirm against dev-0/in.tsv.
    x_dev[19999] = "to jest tekst testowy"
    x_dev[20000] = "a ten tekst jest najbardziej testowy"
    y_dev = pd.read_csv('dev-0/expected.tsv', header=None, sep='\t')

    model = make_pipeline(TfidfVectorizer(), LinearRegression())
    model.fit(x_train, y_train)

    dev_predicted = model.predict(x_dev)
    _write_predictions('dev-0/out.tsv', dev_predicted)
    # Score the in-memory predictions directly instead of parsing the file
    # that was just written; arguments are in (y_true, y_pred) order.
    print(mean_squared_error(y_dev, dev_predicted))

    # Each raw line of the test file (trailing newline included) is one
    # document passed to the pipeline.
    with open('test-A/in.tsv', 'r', encoding='utf-8') as test_file:
        x_test = test_file.readlines()
    test_predicted = model.predict(x_test)
    _write_predictions('test-A/out.tsv', test_predicted)

    _export_notebook_if_interactive()


if __name__ == "__main__":
    main()