retroc2/linear.ipynb
2021-05-18 00:37:50 +02:00

2.2 KiB

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from stop_words import get_stop_words
import pandas as pd
import numpy as np
# Load the training set; column names are supplied here rather than read
# from the file.
train = pd.read_csv(
    'train/train.tsv',
    sep="\t",
    names=['start_date', 'end_date', 'title', 'sort_title', 'data'],
)

# Regression target: the midpoint between each row's start and end date.
mean = (train['start_date'] + train['end_date']) / 2

# TF-IDF features built from the 'data' column, using the Polish stop-word
# list from the stop_words package.
vectorizer = TfidfVectorizer(stop_words=get_stop_words('polish'))
tv = vectorizer.fit_transform(train['data'])

# Fit a linear regression from the TF-IDF features to the midpoint target;
# fit() returns the estimator itself, so it can be bound directly.
linear = LinearRegression().fit(tv, mean)
def getData(directory):
    """Read a UTF-8 text file and return its lines as a list.

    Args:
        directory: Path handed straight to ``open``; despite the name it
            must point at a file, not a folder.

    Returns:
        A list of strings, one per line of the file, each keeping its
        trailing newline. An empty file yields an empty list.
    """
    with open(directory, encoding="utf-8") as handle:
        # Iterating a text file yields the same lines as readlines().
        lines = list(handle)
    return lines
# --- dev-0 split ---------------------------------------------------------
# Read the raw input lines and wrap them in a single-column DataFrame
# (column 0 holds one document per row).
tmp_dev = getData('dev-0/in.tsv')
dataFrame_dev = pd.DataFrame(data=tmp_dev)
# Vectorize with the vocabulary already fitted on the training data
# (transform, not fit_transform) and predict the date for each document.
evaluate_dev = linear.predict(vectorizer.transform(dataFrame_dev[0]))
# Write this split's own predictions, one value per line.
np.savetxt('dev-0/out.tsv', evaluate_dev, fmt='%f', delimiter='\n')

# --- test-A split --------------------------------------------------------
# Same pipeline as above, applied to the test inputs.
tmp_test = getData('test-A/in.tsv')
dataFrame_test = pd.DataFrame(data=tmp_test)
evaluate_test = linear.predict(vectorizer.transform(dataFrame_test[0]))
# Write this split's own predictions, one value per line.
np.savetxt('test-A/out.tsv', evaluate_test, fmt='%f', delimiter='\n')