retroc2/solution.py

56 lines
1.5 KiB
Python
Raw Normal View History

2021-04-27 20:34:48 +02:00
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
import pickle
# Paths of the pickled artifacts that train() writes.
filename = 'regressor.sav'
vec_file = 'vectorizer.pickle'

# Fresh, unfitted estimators; train() fits them in place and classify()
# reads them. To reuse previously pickled artifacts instead of retraining,
# replace the two assignments below with:
#   regressor = pickle.load(open(filename, 'rb'))
#   vectorizer = pickle.load(open(vec_file, 'rb'))
regressor = LinearRegression()
vectorizer = TfidfVectorizer()
2021-04-27 20:34:48 +02:00
def train():
    """Fit the module-level vectorizer and regressor on ``train/train.tsv``.

    Every line of the training file is right-stripped and split on tabs.
    Columns 0 and 1 are parsed as floats (start and end date), column 4 is
    used as the document text, and the regression target is the midpoint of
    the two dates.

    Side effects:
        Fits ``vectorizer`` and ``regressor`` in place, pickles them to
        ``vec_file`` and ``filename`` respectively, and prints progress
        messages to stdout.

    Raises:
        IndexError: if a row does not have a column 4 after splitting.
        ValueError: if a date column cannot be converted to ``float``.
    """
    docs_preprocessed = []
    y = []
    with open('train/train.tsv') as f:
        for line in f:
            row = line.rstrip().split('\t')
            start_date = row[0]
            end_parts = row[1].split(' ')
            if len(end_parts) > 1:
                # The second column holds more than the end date: only the
                # part before the first space is treated as the date, and the
                # token right after it is spliced into the row at index 4.
                # NOTE(review): presumably a workaround for rows where a tab
                # was replaced by a space -- confirm against the data.
                row.insert(4, end_parts[1])
            end_date = end_parts[0]
            docs_preprocessed.append(row[4])
            # Target is the midpoint of the dated interval (already a float).
            y.append((float(start_date) + float(end_date)) / 2)

    print('Fitting vectorizer...')
    x = vectorizer.fit_transform(docs_preprocessed)
    # Context managers make sure the pickle files are flushed and closed.
    with open(vec_file, 'wb') as vec_out:
        pickle.dump(vectorizer, vec_out)
    print('DONE!')

    print('Fitting regressor...')
    regressor.fit(x, y)
    with open(filename, 'wb') as model_out:
        pickle.dump(regressor, model_out)
    print('DONE!')
2021-04-27 20:34:48 +02:00
def classify(path):
    """Predict a value for every line of ``<path>in.tsv`` into ``<path>out.tsv``.

    Args:
        path: Prefix that is concatenated directly with ``'in.tsv'`` and
            ``'out.tsv'``; for a directory it must therefore end with a
            path separator (e.g. ``'dev-0/'``).

    Side effects:
        Prints a progress message and (over)writes ``<path>out.tsv`` with
        one prediction per line, formatted with ``%f``.

    The module-level ``vectorizer`` and ``regressor`` are used as they are,
    so they must have been fitted (see ``train``) before this is called.
    """
    print("Predicting for", path)
    with open(path + 'in.tsv') as f:
        docs = [line.rstrip() for line in f]

    # NOTE(review): each whole input line is vectorized here, whereas train()
    # fits on column 4 only -- confirm that in.tsv contains just the text.
    test_x = vectorizer.transform(docs)
    predictions = regressor.predict(test_x)

    with open(path + 'out.tsv', 'w') as out:
        out.writelines("%f\n" % prediction for prediction in predictions)
# Fit the models, then write predictions for each evaluation split.
train()
for split_dir in ('dev-0/', 'dev-1/', 'test-A/'):
    classify(split_dir)
2021-04-27 20:34:48 +02:00