# 55 lines, 1.5 KiB, Python
import pandas as pd
|
|
import numpy as np
|
|
from sklearn.linear_model import LinearRegression
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
print('Loading train.tsv...')
# The file is read without a header row, so pandas labels the columns with
# the integers 0, 1, 2, ...
# NOTE(review): with the default quoting, a '"' inside a field is treated as a
# quote character; if the text column can contain stray quotes, confirm that
# rows are not being merged (quoting=csv.QUOTE_NONE may be required).
train = pd.read_csv('train/train.tsv', header=None, sep='\t')

# Model input: the contents of column 4.
train_text = train[4]
# Regression target: the midpoint of the values in columns 0 and 1.
train_date = (train[0] + train[1]) / 2

# Debug: uncomment both lines to fit on the first 1000 rows only.
# train_text = train_text[:1000]
# train_date = train_date[:1000]
|
|
|
|
def _read_lines(path):
    """Return every line of the UTF-8 text file at *path* as a list of str.

    Line terminators are kept, exactly as ``file.readlines()`` returns them.
    """
    with open(path, encoding='utf8') as file:
        # In text mode readlines() already yields str objects, so no extra
        # per-line str() conversion is needed.
        return file.readlines()


# One input document per line; the three splits share the same loader.
print('Loading dev-0...')
dev0_text = _read_lines('dev-0/in.tsv')

print('Loading dev-1...')
dev1_text = _read_lines('dev-1/in.tsv')

print('Loading test...')
test_text = _read_lines('test-A/in.tsv')
|
|
|
|
print('Vectorizing training text...')
# max_df=0.90: terms occurring in more than 90% of the training documents are
# left out of the vocabulary.
vc = TfidfVectorizer(max_df=0.90)
# Learn the vocabulary/IDF weights and build the training matrix in one pass.
vectorized_text = vc.fit_transform(train_text)

print('Training model...')
# fit() returns the estimator itself, so construction and fitting can be
# chained into a single expression.
model = LinearRegression().fit(vectorized_text, train_date)
|
|
|
|
def _vectorize_and_predict(texts):
    """Return ``(tfidf_matrix, predictions)`` for *texts*.

    Uses the already-fitted module-level ``vc`` (vectorizer) and ``model``
    (regressor); neither of them is refitted here.
    """
    features = vc.transform(texts)
    return features, model.predict(features)


print('Predicting dev0...')
vectorized_dev0, out_dev0 = _vectorize_and_predict(dev0_text)

print('Predicting dev1...')
vectorized_dev1, out_dev1 = _vectorize_and_predict(dev1_text)

print('Predicting test-A...')
vectorized_test, out_test = _vectorize_and_predict(test_text)
|
|
|
|
print('Saving to file')
# Write each split's predictions next to its input file, in fixed-point
# notation ('%f').
for out_path, predictions in (
    ('dev-0/out.tsv', out_dev0),
    ('dev-1/out.tsv', out_dev1),
    ('test-A/out.tsv', out_test),
):
    np.savetxt(out_path, predictions, fmt='%f')
|
|
|