This commit is contained in:
s444501 2022-05-17 23:28:23 +02:00
parent 33b70ce7b1
commit 736197e51b
4 changed files with 45837 additions and 0 deletions

20000
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

11563
dev-1/out.tsv Normal file

File diff suppressed because it is too large Load Diff

54
run.py Normal file
View File

@ -0,0 +1,54 @@
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
# End-to-end script: fit a linear regression on TF-IDF features of the
# training texts, then write one prediction per input line for each of the
# dev-0, dev-1 and test-A sets.


def _read_lines(path):
    """Return all lines of the UTF-8 text file at *path*.

    Lines are returned exactly as ``readlines()`` yields them (trailing
    newline included), so each input line maps to exactly one document and
    therefore to exactly one output prediction.
    """
    with open(path, encoding='utf8') as file:
        return file.readlines()


print('Loading train.tsv...')
train = pd.read_csv('train/train.tsv', sep='\t', header=None)
# Regression target: the midpoint of columns 0 and 1.
# NOTE(review): presumably these are the start/end of a date range - confirm.
train_date = (train[0] + train[1]) / 2
# Column 4 holds the document text used as model input.
train_text = train[4]
# For a quick debug run, slice train_text and train_date to the same length
# (e.g. the first 1000 rows) before vectorizing.

print('Loading dev-0...')
dev0_text = _read_lines('dev-0/in.tsv')

print('Loading dev-1...')
dev1_text = _read_lines('dev-1/in.tsv')

print('Loading test...')
test_text = _read_lines('test-A/in.tsv')

print('Vectorizing training text...')
# max_df=0.90 drops terms that occur in more than 90% of training documents.
vc = TfidfVectorizer(max_df=0.90)
vectorized_text = vc.fit_transform(train_text)

print('Training model...')
model = LinearRegression()
model.fit(vectorized_text, train_date)

# The evaluation sets are only transformed (never fitted), so they share the
# vocabulary and IDF weights learned from the training text.
print('Predicting dev0...')
vectorized_dev0 = vc.transform(dev0_text)
out_dev0 = model.predict(vectorized_dev0)

print('Predicting dev1...')
vectorized_dev1 = vc.transform(dev1_text)
out_dev1 = model.predict(vectorized_dev1)

print('Predicting test-A...')
vectorized_test = vc.transform(test_text)
out_test = model.predict(vectorized_test)

print('Saving to file')
# One prediction per line, written in fixed-point notation.
np.savetxt('dev-0/out.tsv', out_dev0, fmt='%f')
np.savetxt('dev-1/out.tsv', out_dev1, fmt='%f')
np.savetxt('test-A/out.tsv', out_test, fmt='%f')

14220
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff