forked from kubapok/retroc2
first solution
This commit is contained in:
parent
647c099815
commit
dd3261c0d5
20000
dev-0/out.tsv
Normal file
20000
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
11563
dev-1/out.tsv
Normal file
11563
dev-1/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
48
run.py
Normal file
48
run.py
Normal file
@@ -0,0 +1,48 @@
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.linear_model import LinearRegression
|
||||
|
||||
# Train a TF-IDF + linear-regression model on train/ and write one numeric
# prediction per input line for the dev-0, dev-1 and test-A splits.

# Index of the tab-separated column in train/meta.tsv used as the regression
# target (the original script stores these values as "expected_years").
TARGET_COLUMN = 5


def _read_lines(path):
    """Return all lines of the UTF-8 text file at *path*, newlines kept."""
    with open(path, 'r', encoding='utf8') as f:
        return f.readlines()


def _predict_to_file(in_path, out_path):
    """Predict a value for every line of *in_path* and write them to *out_path*.

    Uses the module-level ``vectorizer`` and ``model``, which must already be
    fitted. The output file gets one prediction per line, in input order.
    """
    features = vectorizer.transform(_read_lines(in_path))
    with open(out_path, 'wt', encoding='utf8') as f:
        f.writelines(str(p) + '\n' for p in model.predict(features))


train_data = _read_lines('train/train.tsv')

# Convert the targets to float explicitly instead of handing raw strings to
# the regressor; float() also tolerates a trailing newline if the target
# happens to be the last column of the row.
expected_years = [
    float(row.split('\t')[TARGET_COLUMN])
    for row in _read_lines('train/meta.tsv')
]

# Tokens are either purely alphabetic words or 4-digit numbers with an
# optional short lowercase suffix and optional dots.
vectorizer = TfidfVectorizer(token_pattern=r"\b[a-zA-Z]+\b|[0-9]{4}[\.]?[a-z]{0,3}[\.]?")
train = vectorizer.fit_transform(train_data)
model = LinearRegression()
model.fit(train, expected_years)

# Same pipeline for every evaluation split: read in.tsv, write out.tsv.
for split in ('dev-0', 'dev-1', 'test-A'):
    _predict_to_file(split + '/in.tsv', split + '/out.tsv')
|
14220
test-A/out.tsv
Normal file
14220
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
1729
train.ipynb
Normal file
1729
train.ipynb
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user