#!/usr/bin/env python
# coding: utf-8
"""Fit a TF-IDF + linear-regression model and write predictions for each split.

The script reads ``train/train.tsv`` (tab-separated, no header row), using
column 0 as the regression target (``start``) and column 4 as the input
text (``text``). It fits a ``TfidfVectorizer`` followed by a
``LinearRegression`` on that data, then writes one prediction per input
line to ``out.tsv`` for each of ``dev-0``, ``dev-1`` and ``test-A``.
"""

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline


def _write_predictions(vectorizer, regressor, in_path, out_path):
    """Predict a value for every line of *in_path* and write them to *out_path*.

    Args:
        vectorizer: Fitted text vectorizer exposing ``transform``.
        regressor: Fitted estimator exposing ``predict``.
        in_path: UTF-8 text file; each line is treated as one document.
        out_path: Destination file; receives one prediction per line.

    Returns:
        The predictions returned by ``regressor.predict``.
    """
    # Context managers guarantee both handles are closed even when
    # vectorizing, predicting or writing raises.
    with open(in_path, "r", encoding="utf-8") as src:
        # Each line (trailing newline included) is passed as one document.
        lines = src.readlines()

    predictions = regressor.predict(vectorizer.transform(lines))

    # The output consists only of formatted numbers, so the explicit
    # encoding merely makes the write independent of the locale default.
    with open(out_path, "w", encoding="utf-8") as dst:
        dst.writelines(f"{y}\n" for y in predictions)

    return predictions


# --- Training data --------------------------------------------------------
colnames = ['start', 'text']
data = pd.read_csv('train/train.tsv', sep='\t', names=colnames, usecols=[0, 4])

x_train = data['text']
y_train = data['start']

# --- Feature extraction ---------------------------------------------------
# fit_transform learns the vocabulary/IDF weights and returns the training
# matrix in a single pass, so no separate transform() call is needed.
tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)
x_train_prepared = tfidf_vectorizer.fit_transform(x_train.values)

# --- Model ----------------------------------------------------------------
lr = LinearRegression()
model = lr.fit(x_train_prepared, y_train)

# --- dev-0 ----------------------------------------------------------------
# NOTE(review): the expected values are loaded but never used below;
# presumably intended for an evaluation step (mean_squared_error is
# imported but not called) - confirm.
y_dev0_exp = pd.read_csv('dev-0/expected.tsv', sep='\t', names=['text'])
y_dev0_pred = _write_predictions(
    tfidf_vectorizer, model, "dev-0/in.tsv", 'dev-0/out.tsv'
)

# --- dev-1 ----------------------------------------------------------------
y_dev1_exp = pd.read_csv('dev-1/expected.tsv', sep='\t', names=['text'])
y_dev1_pred = _write_predictions(
    tfidf_vectorizer, model, "dev-1/in.tsv", 'dev-1/out.tsv'
)

# --- test-A ---------------------------------------------------------------
y_test_pred = _write_predictions(
    tfidf_vectorizer, model, "test-A/in.tsv", 'test-A/out.tsv'
)