Add solution
This commit is contained in:
parent
33b70ce7b1
commit
0143180d81
20000
dev-0/out.tsv
Normal file
20000
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
11563
dev-1/out.tsv
Normal file
11563
dev-1/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
61
run.py
Normal file
61
run.py
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import csv
|
||||||
|
from statistics import mean
|
||||||
|
from sklearn.linear_model import LinearRegression
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from spacy.lang.pl.stop_words import STOP_WORDS as pl_stop
|
||||||
|
|
||||||
|
|
||||||
|
def write_file(fname, data):
    """Write every item of ``data`` to ``fname``, one value per line.

    Args:
        fname: Path of the output file; it is created or overwritten.
        data: Iterable of values (e.g. model predictions). Each value is
            converted with ``str()``.

    Every line, including the last one, is terminated by a single ``\\n``.
    The previous implementation abused ``csv.writer`` with ``'\\n'`` as the
    field delimiter, which terminated the file with ``\\r\\n`` (the csv
    default line terminator) and could wrap values in CSV quotes.
    """
    # newline='' disables newline translation so '\n' is written verbatim
    # on every platform.
    with open(fname, 'w', encoding='utf8', newline='') as f:
        f.writelines(f'{str(value)}\n' for value in data)
|
||||||
|
|
||||||
|
|
||||||
|
def get_data(fname):
    """Return the lines of the UTF-8 text file ``fname`` as a list.

    Each list element is one line of the file, with its line ending kept.
    """
    with open(fname, mode='r', encoding='utf8') as handle:
        # Iterating a text file yields the same lines as readlines().
        lines = list(handle)
    return lines
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_year(data, colnames):
    """Compute, for every row, the rounded mean of the given columns.

    Args:
        data: DataFrame holding the columns named in ``colnames``.
        colnames: List of column labels to average within each row.

    Returns:
        The result of applying ``round(mean(row))`` to every row of
        ``data[colnames]``.
    """
    def rounded_mean(row):
        # statistics.mean followed by the built-in round().
        return round(mean(row))

    selected = data[colnames]
    return selected.apply(rounded_mean, axis=1)
|
||||||
|
|
||||||
|
|
||||||
|
def vectorize(data):
    """Fit a TF-IDF vectorizer on the ``text`` column of ``data``.

    Args:
        data: DataFrame with a ``text`` column of documents.

    Returns:
        The fitted ``TfidfVectorizer`` (lowercasing enabled, Polish stop
        words from spaCy removed).
    """
    # spaCy exposes STOP_WORDS as a set, while scikit-learn validates
    # ``stop_words`` as 'english', a list or None (a set is rejected by the
    # parameter validation in scikit-learn >= 1.2). Convert to a list;
    # sorting keeps the argument deterministic between runs.
    vectorizer = TfidfVectorizer(lowercase=True, stop_words=sorted(pl_stop))

    vectorizer.fit(data['text'].to_numpy().ravel())
    return vectorizer
|
||||||
|
|
||||||
|
|
||||||
|
def train_model(data, vec):
    """Fit a linear regression that predicts ``data['year_avg']`` from ``vec``.

    Args:
        data: DataFrame providing the ``year_avg`` target column.
        vec: Feature matrix passed to ``LinearRegression.fit`` as ``X``.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    targets = data['year_avg']
    # fit() returns the estimator itself, so it can be returned directly.
    return LinearRegression().fit(vec, targets)
|
||||||
|
|
||||||
|
|
||||||
|
def predict_and_save(folder_name, model, vectorizer):
    """Predict for ``<folder_name>/in.tsv`` and save to ``<folder_name>/out.tsv``.

    Args:
        folder_name: Directory containing ``in.tsv``; ``out.tsv`` is written
            into the same directory.
        model: Fitted estimator exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
    """
    # Every line of in.tsv is treated as one document (column 0 of the frame).
    input_frame = pd.DataFrame(get_data(f'{folder_name}/in.tsv'))
    documents = input_frame[0].to_numpy().ravel()

    features = vectorizer.transform(documents)
    write_file(f'{folder_name}/out.tsv', model.predict(features))
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Train the year regressor and write predictions for all data splits.

    Reads the first 50000 rows of ``train/train.tsv.xz``, fits a TF-IDF
    vectorizer and a linear regression on them, then writes predictions
    for the ``test-A``, ``dev-0`` and ``dev-1`` folders.
    """
    column_names = ['begin', 'end', 'title_norm', 'symbol', 'text']
    train = pd.read_csv(
        'train/train.tsv.xz',
        sep='\t',
        nrows=50000,
        header=None,
        names=column_names,
    )

    vectorizer = vectorize(train)
    tfidfs = vectorizer.transform(train['text'].to_numpy().ravel())

    # Preprocessing: the regression target is the rounded mean of the
    # 'begin' and 'end' columns.
    train['year_avg'] = preprocess_year(train, ['begin', 'end'])
    model = train_model(train, tfidfs)

    for folder in ('test-A', 'dev-0', 'dev-1'):
        predict_and_save(folder, model, vectorizer)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Run the pipeline only when executed as a script, so importing this
    # module (e.g. to reuse its helpers) does not trigger training.
    main()
|
14220
test-A/out.tsv
Normal file
14220
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user