Compare commits
5 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
762fcdebfa | ||
|
1a52693ead | ||
|
1287c86fce | ||
e328161046 | |||
|
6a796f68ee |
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
.idea
|
||||||
|
train.tsv
|
||||||
|
*.sav
|
||||||
|
*.pickle
|
20000
dev-0/out.tsv
Normal file
20000
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
11563
dev-1/out.tsv
Normal file
11563
dev-1/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
55
solution.py
Normal file
55
solution.py
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.linear_model import LinearRegression
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
# Paths the fitted objects are pickled to by train().
filename = 'regressor.sav'
vec_file = 'vectorizer.pickle'

# Fresh, unfitted estimators; train() fits them in place and classify() reads
# them. To reuse artifacts saved by an earlier run instead of retraining,
# replace the constructors below with the commented-out loads.
vectorizer = TfidfVectorizer()
# vectorizer = pickle.load(open(vec_file, 'rb'))
regressor = LinearRegression()
# regressor = pickle.load(open(filename, 'rb'))
|
||||||
|
|
||||||
|
|
||||||
|
def train():
    """Fit the module-level vectorizer and regressor, then pickle both.

    Reads 'train/train.tsv' line by line. Each line is split on tabs:
    column 0 is parsed as the start value, column 1 as the end value, and
    column 4 is used as the document text. The regression target is the
    midpoint of the start and end values.

    The fitted vectorizer is written to ``vec_file`` and the fitted
    regressor to ``filename``. Nothing is returned.

    Raises:
        IndexError: if a line has no column 1 or, after the repair step
            described below, no column 4.
        ValueError: if the start or end value cannot be parsed as a float.
    """
    texts = []
    targets = []
    with open('train/train.tsv') as f:
        # Stream the file instead of materialising every line up front.
        for line in f:
            row = line.rstrip().split('\t')
            start_date = row[0]
            end_parts = row[1].split(' ')
            if len(end_parts) > 1:
                # Column 1 holds more than the end value; its second
                # space-separated token is moved to position 4, where the
                # text is read from below.
                # NOTE(review): presumably repairs rows where a tab was
                # replaced by a space - confirm against the data set.
                row.insert(4, end_parts[1])
            texts.append(row[4])
            targets.append((float(start_date) + float(end_parts[0])) / 2)

    print('Fitting vectorizer...')
    x = vectorizer.fit_transform(texts)
    # Context managers guarantee the pickle files are flushed and closed.
    with open(vec_file, 'wb') as vec_out:
        pickle.dump(vectorizer, vec_out)
    print('DONE!')

    print('Fitting regressor...')
    regressor.fit(x, targets)
    with open(filename, 'wb') as model_out:
        pickle.dump(regressor, model_out)
    print('DONE!')
|
||||||
|
|
||||||
|
|
||||||
|
def classify(path):
    """Predict a value for every line of ``path + 'in.tsv'``.

    Each input line (trailing whitespace removed) is transformed with the
    module-level vectorizer and scored with the module-level regressor.
    Predictions are written one per line, formatted with ``%f``, to
    ``path + 'out.tsv'``.

    Args:
        path: directory prefix, concatenated directly with the file names,
            so it must already end with a path separator (e.g. 'dev-0/').
    """
    print("Predicting for", path)

    lines = []
    with open(path + 'in.tsv') as source:
        for raw in source:
            lines.append(raw.rstrip())

    features = vectorizer.transform(lines)
    predicted = regressor.predict(features)

    with open(path + 'out.tsv', 'w') as sink:
        for value in predicted:
            sink.write("%f\n" % value)
|
||||||
|
|
||||||
|
|
||||||
|
# Fit the models once, then write predictions for every evaluation split.
train()
for split_dir in ('dev-0/', 'dev-1/', 'test-A/'):
    classify(split_dir)
|
||||||
|
|
14220
test-A/out.tsv
Normal file
14220
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user