61 lines
1.7 KiB
Python
61 lines
1.7 KiB
Python
import pandas as pd
|
|
import csv
|
|
from statistics import mean
|
|
from sklearn.linear_model import LinearRegression
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from spacy.lang.pl.stop_words import STOP_WORDS as pl_stop
|
|
|
|
|
|
def write_file(fname, data):
    """Write every item of ``data`` to ``fname``, one value per line.

    Args:
        fname: Path of the output file; it is created or overwritten.
        data: Iterable of values (e.g. model predictions). Each value
            becomes one single-column row of the file.
    """
    with open(fname, 'w', newline='', encoding='utf8') as f:
        # One single-field row per value with an explicit '\n' terminator.
        # Writing a lone row with delimiter='\n' instead would end the file
        # with '\r\n' (the csv module's default line terminator), mixing
        # line endings, and a newline is not a delimiter that every Python
        # version accepts.
        tsv_output = csv.writer(f, delimiter='\t', lineterminator='\n')
        tsv_output.writerows([value] for value in data)
|
|
|
|
|
|
def get_data(fname):
    """Read the UTF-8 text file ``fname`` and return its lines as a list.

    Each returned string keeps its trailing line ending.
    """
    with open(fname, mode='r', encoding='utf8') as handle:
        lines = list(handle)
    return lines
|
|
|
|
|
|
def preprocess_year(data, colnames):
    """Return, for every row of ``data``, the rounded mean of ``colnames``.

    Args:
        data: Table that contains the columns listed in ``colnames``.
        colnames: Labels of the columns to average within each row.

    Returns:
        A Series holding one rounded mean per row of ``data``.
    """
    def rounded_mean(row):
        # ``row`` holds the selected column values of a single record.
        return round(mean(row))

    selected = data[colnames]
    return selected.apply(rounded_mean, axis=1)
|
|
|
|
|
|
def vectorize(data):
    """Fit a TF-IDF vectorizer on the ``text`` column of ``data``.

    Args:
        data: Table with a ``text`` column holding the documents.

    Returns:
        The fitted ``TfidfVectorizer`` (lowercasing enabled, Polish stop
        words from spaCy removed).
    """
    # spaCy ships its stop words as a set, whereas scikit-learn documents
    # ``stop_words`` as 'english', a list or None, and its parameter
    # validation rejects other types. ``sorted`` turns the collection into
    # a list with a stable order.
    vectorizer = TfidfVectorizer(lowercase=True, stop_words=sorted(pl_stop))
    vectorizer.fit(data['text'].to_numpy().ravel())
    return vectorizer
|
|
|
|
|
|
def train_model(data, vec):
    """Fit a linear regression from the features ``vec`` to ``data['year_avg']``.

    Args:
        data: Table providing the ``year_avg`` target column.
        vec: Feature matrix with one row per row of ``data``.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    targets = data['year_avg']
    regressor = LinearRegression()
    regressor.fit(vec, targets)
    return regressor
|
|
|
|
|
|
def predict_and_save(folder_name, model, vectorizer):
    """Predict for ``<folder_name>/in.tsv`` and store the results next to it.

    The lines of ``in.tsv`` are transformed with ``vectorizer``, passed to
    ``model.predict`` and the predictions are written to
    ``<folder_name>/out.tsv``.

    Args:
        folder_name: Directory that contains ``in.tsv`` and receives
            ``out.tsv``.
        model: Fitted estimator exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
    """
    input_path = f'{folder_name}/in.tsv'
    output_path = f'{folder_name}/out.tsv'

    # Each input line is treated as one document (column 0 of the frame).
    frame = pd.DataFrame(get_data(input_path))
    documents = frame[0].to_numpy().ravel()

    features = vectorizer.transform(documents)
    write_file(output_path, model.predict(features))
|
|
|
|
|
|
def main():
    """Train the year-prediction model and write predictions for each split.

    Reads the first 50000 rows of ``train/train.tsv.xz``, fits the TF-IDF
    vectorizer and the regression model on them, then writes predictions
    for the ``test-A``, ``dev-0`` and ``dev-1`` folders.
    """
    column_names = ['begin', 'end', 'title_norm', 'symbol', 'text']
    train = pd.read_csv(
        'train/train.tsv.xz',
        sep='\t',
        nrows=50000,
        header=None,
        names=column_names,
    )

    vectorizer = vectorize(train)
    tfidfs = vectorizer.transform(train['text'].to_numpy().ravel())

    # Preprocessing: the regression target is the rounded mean of the
    # 'begin' and 'end' columns.
    train['year_avg'] = preprocess_year(train, ['begin', 'end'])
    model = train_model(train, tfidfs)

    for folder in ('test-A', 'dev-0', 'dev-1'):
        predict_and_save(folder, model, vectorizer)
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger training and file output.
if __name__ == '__main__':
    main()