Add solution
This commit is contained in:
parent
33b70ce7b1
commit
0143180d81
20000
dev-0/out.tsv
Normal file
20000
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
11563
dev-1/out.tsv
Normal file
11563
dev-1/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
61
run.py
Normal file
61
run.py
Normal file
@ -0,0 +1,61 @@
|
||||
import pandas as pd
|
||||
import csv
|
||||
from statistics import mean
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from spacy.lang.pl.stop_words import STOP_WORDS as pl_stop
|
||||
|
||||
|
||||
def write_file(fname, data):
    """Write every item of *data* to *fname*, one item per line.

    Args:
        fname: Path of the output file; it is created or overwritten.
        data: Iterable of values; each one is converted with ``str()``.

    An empty *data* produces an empty file.
    """
    # newline='' disables platform newline translation, so every line ends
    # with a plain '\n' regardless of the operating system.
    with open(fname, 'w', newline='', encoding='utf8') as f:
        f.writelines(f'{value}\n' for value in data)
|
||||
|
||||
|
||||
def get_data(fname):
    """Return the lines of the UTF-8 text file *fname* as a list.

    Each returned string keeps its trailing line terminator.
    """
    with open(fname, 'r', encoding='utf8') as handle:
        lines = list(handle)
    return lines
|
||||
|
||||
|
||||
def preprocess_year(data, colnames):
    """Return the row-wise mean of *colnames*, rounded to the nearest integer.

    Args:
        data: DataFrame holding the columns to average.
        colnames: List of column labels whose values are averaged per row.

    Returns:
        An int64 Series indexed like *data*. Ties are rounded half-to-even.

    Raises:
        ValueError: If any averaged value is missing (NaN), because NaN
            cannot be converted to an integer.
    """
    # skipna=False keeps a missing value visible as NaN instead of silently
    # averaging the remaining columns; the integer cast below then fails.
    row_means = data[colnames].mean(axis=1, skipna=False)
    return row_means.round().astype('int64')
|
||||
|
||||
|
||||
def vectorize(data):
    """Fit a TF-IDF vectorizer on the ``text`` column of *data*.

    Args:
        data: DataFrame with a ``text`` column of documents.

    Returns:
        The fitted ``TfidfVectorizer`` (lowercasing, with the stop words
        from ``pl_stop`` removed).
    """
    # scikit-learn documents stop_words as 'english', a list, or None.
    # pl_stop is presumably a set (TODO confirm), so it is converted to a
    # list; sorting keeps the order deterministic.
    vectorizer = TfidfVectorizer(lowercase=True, stop_words=sorted(pl_stop))

    vectorizer.fit(data['text'])
    return vectorizer
|
||||
|
||||
|
||||
def train_model(data, vec):
    """Fit a linear regression predicting ``data['year_avg']`` from *vec*.

    Args:
        data: DataFrame providing the ``year_avg`` target column.
        vec: Feature matrix passed to ``LinearRegression.fit`` as the inputs.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    targets = data['year_avg']
    regressor = LinearRegression()
    regressor.fit(vec, targets)
    return regressor
|
||||
|
||||
|
||||
def predict_and_save(folder_name, model, vectorizer):
    """Predict for ``<folder_name>/in.tsv`` and write ``<folder_name>/out.tsv``.

    Args:
        folder_name: Directory containing ``in.tsv``; ``out.tsv`` is written
            into the same directory.
        model: Fitted estimator exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Each line of ``in.tsv`` is treated as one document, and one prediction
    per document is passed to ``write_file``.
    """
    documents = get_data(f'{folder_name}/in.tsv')
    out_path = f'{folder_name}/out.tsv'

    # Nothing to predict: still produce the output file, without handing
    # the model a zero-row matrix.
    if not documents:
        write_file(out_path, [])
        return

    # The list of lines is passed to the vectorizer directly; wrapping it
    # in a DataFrame first only added an extra copy of the corpus.
    test_tfidfs = vectorizer.transform(documents)
    predicted = model.predict(test_tfidfs)

    write_file(out_path, predicted)
|
||||
|
||||
|
||||
def main():
    """Train the year regressor and write predictions for every data split.

    Reads the first 50000 rows of ``train/train.tsv.xz``, fits the TF-IDF
    vectorizer and the regression model on them, then writes predictions
    for the ``test-A``, ``dev-0`` and ``dev-1`` folders, in that order.
    """
    columns = ['begin', 'end', 'title_norm', 'symbol', 'text']
    # NOTE(review): read_csv applies its default quote handling here; if the
    # TSV text can contain bare double quotes, rows may be parsed
    # incorrectly -- confirm against the data.
    train = pd.read_csv(
        'train/train.tsv.xz',
        sep='\t',
        nrows=50000,
        header=None,
        names=columns,
    )

    vectorizer = vectorize(train)
    tfidfs = vectorizer.transform(train['text'].to_numpy().ravel())

    # Preprocessing: the regression target is the rounded mean of the
    # 'begin' and 'end' columns.
    train['year_avg'] = preprocess_year(train, columns[:2])
    model = train_model(train, tfidfs)

    for folder_name in ('test-A', 'dev-0', 'dev-1'):
        predict_and_save(folder_name, model, vectorizer)
|
||||
|
||||
|
||||
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not start training as a side effect.
if __name__ == '__main__':
    main()
|
14220
test-A/out.tsv
Normal file
14220
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user