forked from kubapok/retroc2
Compare commits
No commits in common. "master" and "master" have entirely different histories.
20000
dev-0/out.tsv
20000
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
11563
dev-1/out.tsv
11563
dev-1/out.tsv
File diff suppressed because it is too large
Load Diff
50
run.py
50
run.py
@ -1,50 +0,0 @@
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.linear_model import LinearRegression
|
||||
|
||||
def _read_lines(path):
    """Return all lines of the UTF-8 text file at *path*, newlines included."""
    with open(path, 'r', encoding='utf8') as f:
        return f.readlines()


def _predict_to_file(in_path, out_path):
    """Predict a value for every line of *in_path* and write them to *out_path*.

    Each input line is treated as one document. It is transformed with the
    module-level ``vectorizer`` and scored with the module-level ``model``
    (both fitted below, before this helper is first called). Predictions are
    written one per line, in input order.
    """
    features = vectorizer.transform(_read_lines(in_path))
    predictions = model.predict(features)
    with open(out_path, 'wt', encoding='utf8') as f:
        f.writelines(str(p) + '\n' for p in predictions)


# Training documents: one document per line of train.tsv.
train_data = _read_lines('train/train.tsv')

# Regression targets: tab-separated column 5 of meta.tsv, one row per training
# document. Parse to float explicitly so the model never receives raw strings
# (float() also tolerates a trailing newline if this is the last column).
expected_years = [
    float(row.split('\t')[5]) for row in _read_lines('train/meta.tsv')
]

# Tokens are either purely alphabetic words or 4-digit numbers with an
# optional short lowercase suffix and optional dots.
vectorizer = TfidfVectorizer(token_pattern=r"\b[a-zA-Z]+\b|[0-9]{4}[\.]?[a-z]{0,3}[\.]?")
train = vectorizer.fit_transform(train_data)
model = LinearRegression()
model.fit(train, expected_years)

# Same predict-and-dump procedure for every evaluation split.
for split in ('dev-0', 'dev-1', 'test-A'):
    _predict_to_file(f'{split}/in.tsv', f'{split}/out.tsv')

print("finished")
|
14220
test-A/out.tsv
14220
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
1729
train.ipynb
1729
train.ipynb
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user