#!/usr/bin/env python
# coding: utf-8
"""TF-IDF + linear-regression baseline.

The script fits a ``TfidfVectorizer`` -> ``LinearRegression`` pipeline on the
text column of ``train/train.tsv``; the regression target is the mean of the
first two columns of that file.  It then predicts for the ``dev-0``,
``dev-1`` and ``test-A`` splits, prints the RMSE on both dev splits and writes
one ``out.tsv`` per split.
"""

import csv
import math
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

# Only the first TRAIN_ROWS training rows are used to fit the model.
TRAIN_ROWS = 30000
# Label of the training-file column that holds the text.
TEXT_COLUMN = 4
# Splits that come with expected values, and the split that does not.
DEV_SPLITS = ("dev-0", "dev-1")
TEST_SPLIT = "test-A"
OUTPUT_NAME = "out.tsv"
PREDICTION_FORMAT = "%1.8f"


def read_split(path: str) -> pd.DataFrame:
    """Read a headerless, tab-separated file with quoting disabled.

    Malformed rows are dropped with a warning.  ``on_bad_lines="warn"`` is the
    replacement for ``error_bad_lines=False``, which pandas 2.0 removed.
    """
    return pd.read_csv(
        path,
        header=None,
        sep="\t",
        quoting=csv.QUOTE_NONE,
        on_bad_lines="warn",
    )


def mean_target(train: pd.DataFrame) -> pd.Series:
    """Return the row-wise mean of the first two columns of *train*."""
    return (train.iloc[:, 0] + train.iloc[:, 1]) / 2


def check_same_length(split: str, inputs, expected) -> None:
    """Raise ``ValueError`` when *inputs* and *expected* differ in length.

    Dropping malformed rows can leave the two files of a split misaligned;
    failing here gives a clearer message than the metric call would.
    """
    if len(inputs) != len(expected):
        raise ValueError(
            f"{split}: {len(inputs)} input rows but "
            f"{len(expected)} expected rows"
        )


def rmse(expected, predicted) -> float:
    """Return the root mean squared error between the two sequences.

    The square root is taken explicitly because the ``squared=False``
    argument of ``mean_squared_error`` is deprecated in scikit-learn.
    """
    return math.sqrt(mean_squared_error(expected, predicted))


def save_predictions(predictions, directory: str) -> str:
    """Write *predictions* to ``out.tsv`` inside *directory*; return the path."""
    path = os.path.join(directory, OUTPUT_NAME)
    np.savetxt(path, predictions, delimiter="\t", fmt=PREDICTION_FORMAT)
    return path


def main() -> None:
    """Train the model, report dev RMSE and write all prediction files."""
    train = pd.read_csv("train/train.tsv", header=None, sep="\t")
    train_x = train[TEXT_COLUMN].iloc[:TRAIN_ROWS]
    train_y_mean = mean_target(train).iloc[:TRAIN_ROWS]

    # Load every split before fitting so that a missing or misaligned file
    # fails before the (slow) training step.
    inputs = {}
    expected = {}
    for split in DEV_SPLITS:
        inputs[split] = read_split(os.path.join(split, "in.tsv"))
        expected[split] = read_split(os.path.join(split, "expected.tsv"))
        check_same_length(split, inputs[split], expected[split])
    inputs[TEST_SPLIT] = read_split(os.path.join(TEST_SPLIT, "in.tsv"))

    model = make_pipeline(TfidfVectorizer(), LinearRegression())
    model.fit(train_x, train_y_mean)

    predictions = {
        split: model.predict(frame[0]) for split, frame in inputs.items()
    }

    rmse_dev0, rmse_dev1 = (
        rmse(expected[split], predictions[split]) for split in DEV_SPLITS
    )
    print(rmse_dev0, rmse_dev1)

    # One file per split: writing everything to a single out.tsv would leave
    # only the last set of predictions on disk.
    for split, predicted in predictions.items():
        save_predictions(predicted, split)
    # The top-level out.tsv keeps holding the test predictions for anything
    # that still reads it from the working directory.
    save_predictions(predictions[TEST_SPLIT], os.curdir)


if __name__ == "__main__":
    main()