Compare commits
2 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
9eac7fe04b | ||
|
dd3261c0d5 |
20000
dev-0/out.tsv
Normal file
20000
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
11563
dev-1/out.tsv
Normal file
11563
dev-1/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
50
run.py
Normal file
50
run.py
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.linear_model import LinearRegression
|
||||||
|
|
||||||
|
# Read the training file into a list of lines (line endings are kept).
with open('train/train.tsv', 'r', encoding='utf8') as train_file:
    train_data = list(train_file)
|
||||||
|
|
||||||
|
# Load the regression targets: keep only the sixth tab-separated field
# (index 5) of every metadata row and parse it as a number right away, so the
# model receives numeric labels rather than raw strings that may still carry
# a trailing newline. float() ignores surrounding whitespace.
with open('train/meta.tsv', 'r', encoding='utf8') as f:
    expected_years = [float(row.split('\t')[5]) for row in f]
|
||||||
|
#
|
||||||
|
# A token is either a purely alphabetic word or a run of four digits that may
# be followed by a dot, up to three lowercase letters and another dot.
vectorizer = TfidfVectorizer(
    token_pattern=r"\b[a-zA-Z]+\b|[0-9]{4}[\.]?[a-z]{0,3}[\.]?",
)
# Fit the TF-IDF vocabulary on the training lines and vectorize them.
train = vectorizer.fit_transform(train_data)
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(train, expected_years)
|
||||||
|
|
||||||
|
# Predict for the dev-0 split: vectorize its lines with the already fitted
# vectorizer, then run the fitted model on the resulting matrix.
with open('dev-0/in.tsv', 'r', encoding='utf8') as f:
    dev_0 = vectorizer.transform(f.readlines())
predicted_dev_0 = model.predict(dev_0)

# Write one prediction per line.
with open('dev-0/out.tsv', 'wt') as f:
    f.writelines(str(p) + '\n' for p in predicted_dev_0)
|
||||||
|
|
||||||
|
# Predict for the dev-1 split: vectorize its lines with the already fitted
# vectorizer, then run the fitted model on the resulting matrix.
with open('dev-1/in.tsv', 'r', encoding='utf8') as f:
    dev_1 = vectorizer.transform(f.readlines())
predicted_dev_1 = model.predict(dev_1)

# Write one prediction per line.
with open('dev-1/out.tsv', 'wt') as f:
    f.writelines(str(p) + '\n' for p in predicted_dev_1)
|
||||||
|
|
||||||
|
# Predict for the test-A split: vectorize its lines with the already fitted
# vectorizer, then run the fitted model on the resulting matrix.
with open('test-A/in.tsv', 'r', encoding='utf8') as f:
    test_A = vectorizer.transform(f.readlines())
predicted_test_A = model.predict(test_A)

# Write one prediction per line.
with open('test-A/out.tsv', 'wt') as f:
    f.writelines(str(p) + '\n' for p in predicted_test_A)

# Signal that all three output files have been written.
print("finished")
|
14220
test-A/out.tsv
Normal file
14220
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
1729
train.ipynb
Normal file
1729
train.ipynb
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user