retroc2-mkostrzewski/run.py
2022-05-17 23:30:07 +02:00

61 lines
1.7 KiB
Python

import pandas as pd
import csv
from statistics import mean
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.pl.stop_words import STOP_WORDS as pl_stop
def write_file(fname, data):
    """Write every value of ``data`` on its own line of the file ``fname``.

    The file is created or truncated. Each value is converted with ``str()``
    and terminated by a single line feed, so the output uses one consistent
    line ending and an empty ``data`` yields an empty file.

    Args:
        fname: Path of the output file.
        data: Iterable of values to write, one per line.
    """
    # newline='' disables newline translation, so the line feed is written
    # verbatim on every platform; the encoding matches the one get_data uses.
    with open(fname, 'w', newline='', encoding='utf8') as f:
        f.writelines(f'{value}\n' for value in data)
def get_data(fname):
    """Read the UTF-8 text file ``fname`` and return its lines as a list.

    Every element is one line exactly as the file iterator yields it, i.e.
    including its trailing newline when the line has one.
    """
    with open(fname, mode='r', encoding='utf8') as handle:
        lines = list(handle)
    return lines
def preprocess_year(data, colnames):
    """Return, for every row of ``data``, the rounded mean of ``colnames``.

    Args:
        data: Table indexable by a list of column names and supporting
            row-wise ``apply`` (``main`` passes a pandas DataFrame).
        colnames: Names of the columns to average.

    Returns:
        The result of the row-wise ``apply``: one value per row, computed
        with ``statistics.mean`` followed by the built-in ``round``.
    """
    def rounded_mean(row):
        return round(mean(row))

    selected = data[colnames]
    return selected.apply(rounded_mean, axis=1)
def vectorize(data):
    """Fit a TF-IDF vectorizer on the ``'text'`` column of ``data``.

    Args:
        data: Table whose ``'text'`` column holds the documents
            (``main`` passes a pandas DataFrame).

    Returns:
        The fitted ``TfidfVectorizer`` (lower-casing enabled, Polish stop
        words removed).
    """
    # scikit-learn >= 1.2 validates ``stop_words`` and accepts only
    # 'english', a list or None; the spaCy stop words are a set, which makes
    # fit() raise InvalidParameterError. A sorted list works on old and new
    # versions alike and keeps the parameter deterministic.
    stop_words = sorted(pl_stop)
    vectorizer = TfidfVectorizer(lowercase=True, stop_words=stop_words)
    vectorizer.fit(data['text'].to_numpy().ravel())
    return vectorizer
def train_model(data, vec):
    """Fit a linear regression that maps ``vec`` to ``data['year_avg']``.

    Args:
        data: Table providing the ``'year_avg'`` target column.
        vec: Feature matrix passed to ``LinearRegression.fit`` as ``X``.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    targets = data['year_avg']
    regressor = LinearRegression()
    regressor.fit(vec, targets)
    return regressor
def predict_and_save(folder_name, model, vectorizer):
    """Predict for ``<folder_name>/in.tsv`` and write ``<folder_name>/out.tsv``.

    Each line of the input file is treated as one document: the lines are
    transformed with ``vectorizer``, fed to ``model.predict`` and the
    predictions are handed to ``write_file``.

    Args:
        folder_name: Directory containing ``in.tsv``; ``out.tsv`` is written
            to the same directory.
        model: Fitted estimator exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
    """
    input_path = f'{folder_name}/in.tsv'
    output_path = f'{folder_name}/out.tsv'

    # Column 0 of the one-column frame holds the raw input lines.
    frame = pd.DataFrame(get_data(input_path))
    documents = frame[0].to_numpy().ravel()

    features = vectorizer.transform(documents)
    write_file(output_path, model.predict(features))
def main():
    """Train on ``train/train.tsv.xz`` and write predictions for all splits.

    Reads the first 50000 rows of the headerless, tab-separated training
    file, fits the TF-IDF vectorizer and the regression model on them, then
    writes predictions for ``test-A``, ``dev-0`` and ``dev-1`` in that order.
    """
    column_names = ['begin', 'end', 'title_norm', 'symbol', 'text']
    train = pd.read_csv(
        'train/train.tsv.xz',
        sep='\t',
        nrows=50000,
        header=None,
        names=column_names,
    )

    vectorizer = vectorize(train)
    features = vectorizer.transform(train['text'].to_numpy().ravel())

    # Preprocessing: the regression target is the rounded mean of the
    # 'begin' and 'end' columns.
    train['year_avg'] = preprocess_year(train, ['begin', 'end'])
    model = train_model(train, features)

    for folder in ('test-A', 'dev-0', 'dev-1'):
        predict_and_save(folder, model, vectorizer)
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger training and file writes.
if __name__ == '__main__':
    main()