# retroc2/retroc2.ipynb
# 2022-05-17 22:26:42 +02:00
#
# 4.2 KiB

import os
import sklearn
import pandas as pd
from gzip import open as open_gz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
def predict_year(x, path_out, model):
    """Run ``model`` on ``x`` and save the predictions, one per line.

    Args:
        x: Input samples, passed unchanged to ``model.predict``.
        path_out: Path of the text file to write; it is opened in ``'wt'``
            mode, so any existing content is replaced.
        model: Any object exposing a ``predict(x)`` method that returns an
            iterable of predictions.

    Returns:
        None. The only effect is the file written at ``path_out``.
    """
    predictions = model.predict(x)
    with open(path_out, 'wt') as out:
        # Each prediction is rendered with str() and terminated by a newline.
        out.writelines(str(value) + '\n' for value in predictions)
# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
# Load the training set. The TSV is read without a header row, so the column
# names are supplied explicitly.
with open('train/train.tsv', 'r', encoding='utf8') as file:
    train = pd.read_csv(file, sep='\t', names=['Date1', 'Date2', 'Title', 'Author', 'Text'])

#train = train[0:10000]
train_x = train['Text']
# Regression target: the midpoint of the two date columns.
# NOTE(review): presumably Date1/Date2 bound the dating interval of each text;
# confirm against the dataset description.
train['Date'] = (train['Date1'].astype(float) + train['Date2'].astype(float))/2
# FIX: the midpoint computed above used to be discarded and the model was
# fitted on 'Date1' alone; train on the midpoint column that was built for it.
train_y = train['Date']
# TF-IDF features over the raw text feeding an ordinary least-squares model.
model = make_pipeline(TfidfVectorizer(), LinearRegression())
model.fit(train_x, train_y)
# Notebook cell output, kept for reference only. In the export it appeared as
# bare code and raised NameError because ``Pipeline`` is never imported:
# Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
#                 ('linearregression', LinearRegression())])

# ---------------------------------------------------------------------------
# Load the inputs to predict on (first column of each headerless TSV)
# ---------------------------------------------------------------------------
with open('dev-0/in.tsv', 'r', encoding='utf8') as file:
    x_dev0 = pd.read_csv(file, header=None, sep='\t')
x_dev0 = x_dev0[0]
# NOTE(review): labels 19999 and 20000 are set to placeholder strings.
# Presumably this pads the series so the number of predictions matches the
# expected line count of dev-0 (read_csv's default quote handling can merge
# lines); TODO confirm, and consider reading with quoting=csv.QUOTE_NONE
# instead of patching rows by hand.
x_dev0[19999] = 'nie jest'
x_dev0[20000] = 'nie wiem'
with open('dev-1/in.tsv', 'r', encoding='utf8') as file:
    x_dev1 = pd.read_csv(file, header=None, sep='\t')
x_dev1 = x_dev1[0]
with open('test-A/in.tsv', 'r', encoding='utf8') as file:
    x_test = pd.read_csv(file, header=None, sep='\t')
x_test = x_test[0]

# Disabled evaluation on dev-0, left as found.
# NOTE(review): before re-enabling, sep='/t' looks like a typo for '\t', and
# mean_squared_error returns the MSE by default, not the RMSE the variable
# name suggests.
#y_dev = pd.read_csv('dev-0/out.tsv',header = None, sep = '/t',engine = 'python')
#y_dev = y_dev[0]
#y_dev_exp = pd.read_csv('dev-0/expected.tsv',header = None, sep = '/t',engine = 'python')
#y_dev_exp = y_dev_exp[0]
#RMSE_dev = mean_squared_error(y_dev_exp, y_dev)

# ---------------------------------------------------------------------------
# Write one prediction per line next to each input file
# ---------------------------------------------------------------------------
predict_year(x_dev0, 'dev-0/out.tsv', model)
predict_year(x_dev1, 'dev-1/out.tsv', model)
predict_year(x_test, 'test-A/out.tsv', model)