33 lines
1.0 KiB
Python
33 lines
1.0 KiB
Python
from sklearn.naive_bayes import MultinomialNB
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.preprocessing import LabelEncoder
|
|
from sklearn.pipeline import Pipeline
|
|
|
|
# Train a TF-IDF + multinomial naive Bayes text classifier on the documents in
# train/in.tsv (labels in train/expected.tsv, one per line) and write the
# predicted label for every document of the train, dev-0 and test-A splits to
# the matching out.tsv file, one label per line.


def _read_lines(path):
    """Return the lines of the text file at *path*, line terminators included."""
    # An explicit encoding keeps decoding independent of the platform locale.
    with open(path, encoding='utf-8') as f:
        return f.readlines()


def _write_predictions(path, documents):
    """Predict a label for each item of *documents* and write them to *path*.

    One label is written per line, in the order of *documents*.
    """
    # The classifier is trained on integer codes; map the predicted codes back
    # to the label text read from train/expected.tsv so that the output uses
    # the same label vocabulary as the expected files.
    labels = label_encoder.inverse_transform(clf.predict(documents))
    # Predict before opening the file so a failed prediction does not leave a
    # truncated output file behind.
    with open(path, 'w', encoding='utf-8') as writer:
        writer.writelines(str(label) + '\n' for label in labels)


data_train_X = _read_lines('train/in.tsv')
# Strip the line terminator from each label: otherwise a final line without a
# trailing newline would be encoded as a class of its own.
raw_train_labels = [
    line.rstrip('\n') for line in _read_lines('train/expected.tsv')
]
data_dev_X = _read_lines('dev-0/in.tsv')
data_test_X = _read_lines('test-A/in.tsv')

# Keep the fitted encoder: it is needed later to decode the predictions.
label_encoder = LabelEncoder()
data_train_Y = label_encoder.fit_transform(raw_train_labels)

model = Pipeline(steps=[('tfidf', TfidfVectorizer()), ('bayes', MultinomialNB())])
clf = model.fit(data_train_X, data_train_Y)

_write_predictions('train/out.tsv', data_train_X)
_write_predictions('dev-0/out.tsv', data_dev_X)
_write_predictions('test-A/out.tsv', data_test_X)