#!/usr/bin/env python
# coding: utf-8
"""Fit a TF-IDF + linear-regression pipeline and write predictions.

The script reads ``train/train.tsv``, fits the model on the first
``TRAIN_LIMIT`` rows (text from column ``TEXT_COLUMN``, target from column
``TARGET_COLUMN``), and writes one prediction per line to ``<split>/out.tsv``
for every evaluated split.
"""

import os

import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

TRAIN_PATH = 'train/train.tsv'
TRAIN_LIMIT = 30000  # only the first rows are used for fitting
TEXT_COLUMN = 4      # positional column holding the input text
TARGET_COLUMN = 0    # positional column holding the regression target


def readFile(filename):
    """Return the stripped first tab-separated field of every line.

    Args:
        filename: Path to a UTF-8 encoded TSV file.

    Returns:
        A list of strings, one per line of the file, in file order.
    """
    with open(filename, 'r', encoding="utf-8") as file:
        return [line.split("\t")[0].strip() for line in file]


def _load_train(path=TRAIN_PATH, limit=TRAIN_LIMIT):
    """Read the headerless training TSV and return its first ``limit`` rows.

    Malformed rows are skipped with a warning. The total number of rows
    read is printed before truncation.
    """
    read_kwargs = {'header': None, 'sep': '\t'}
    try:
        # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in
        # 2.0; ``on_bad_lines='warn'`` keeps the old skip-and-warn behavior.
        frame = pd.read_csv(path, on_bad_lines='warn', **read_kwargs)
    except TypeError:
        # pandas < 1.3 does not know ``on_bad_lines``.
        frame = pd.read_csv(path, error_bad_lines=False, **read_kwargs)
    print(len(frame))
    return frame[:limit]


def _predict_split(model, split):
    """Predict for ``<split>/in.tsv`` and write ``<split>/out.tsv``.

    Every split goes through ``readFile`` so that all inputs are
    preprocessed identically before they reach the model.
    """
    predictions = model.predict(readFile(split + '/in.tsv'))
    with open(split + '/out.tsv', 'wt') as f:
        f.writelines(str(value) + '\n' for value in predictions)


def _export_notebook():
    """Re-export ``run.ipynb`` as a script when running inside IPython."""
    try:
        shell = get_ipython()  # noqa: F821 - only defined under IPython
    except NameError:
        # Plain ``python`` run: there is no notebook shell to call.
        return
    if shell is not None:
        shell.system('jupyter nbconvert --to script run.ipynb')


def main():
    """Train the model and write predictions for each available split."""
    train = _load_train()

    model = make_pipeline(TfidfVectorizer(), LinearRegression())
    model.fit(train[TEXT_COLUMN], train[TARGET_COLUMN])

    _predict_split(model, 'dev-0')
    # NOTE(review): the original had a second "dev" cell that bound
    # ``x_dev1`` but re-read dev-0/in.tsv and rewrote dev-0/out.tsv. It
    # presumably targeted dev-1, so that split is handled only when its
    # input exists - confirm against the dataset layout.
    if os.path.exists('dev-1/in.tsv'):
        _predict_split(model, 'dev-1')
    _predict_split(model, 'test-A')

    _export_notebook()


if __name__ == "__main__":
    main()