31 lines
1.1 KiB
Python
31 lines
1.1 KiB
Python
import csv

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
|
|
|
|
# Load the train / dev-0 / test-A splits. Column 0 of each in.tsv is used as
# the document text; column 0 of train/expected.tsv supplies the labels.

# Read options for the free-text input files:
#   * quoting=csv.QUOTE_NONE treats '"' as an ordinary character. With the
#     default quoting, an unbalanced quote inside a document makes the parser
#     absorb the following line breaks into one field, so rows no longer
#     correspond one-to-one with the lines of expected.tsv (and out.tsv would
#     end up with fewer rows than in.tsv).
#   * keep_default_na=False keeps documents such as "NA" or "null" as text
#     instead of converting them to NaN (a float, not a document).
_TEXT_READ_OPTIONS = {
    'sep': '\t',
    'header': None,
    'quoting': csv.QUOTE_NONE,
    'keep_default_na': False,
}

trainX = pd.read_csv('./train/in.tsv', **_TEXT_READ_OPTIONS)
trainY = pd.read_csv('./train/expected.tsv', sep='\t', header=None)

train_X = trainX[0]
train_Y = trainY[0]

# Documents and labels are paired purely by position. Fail loudly on a
# mismatch instead of truncating the labels to a hard-coded row count
# (previously 289541): truncation only hides the symptom and can silently
# pair documents with the wrong labels.
# NOTE(review): the old slice presumably compensated for quote-merged rows;
# confirm the row counts now agree on the real data.
if len(train_X) != len(train_Y):
    raise ValueError(
        f'train/in.tsv has {len(train_X)} rows but train/expected.tsv has '
        f'{len(train_Y)}; documents and labels must align one-to-one'
    )

test_X_dev0 = pd.read_csv('./dev-0/in.tsv', **_TEXT_READ_OPTIONS)[0]
# Gold labels for dev-0; loaded here but not used further in this script.
test_Y_dev0 = pd.read_csv('./dev-0/expected.tsv', sep='\t', header=None)

test_X_A = pd.read_csv('./test-A/in.tsv', **_TEXT_READ_OPTIONS)[0]
|
|
|
|
# Token-count features. The vectorizer is fitted on the training documents
# only; the dev-0 and test-A documents are then transformed with that same
# fitted vectorizer.
vectorizer = CountVectorizer()
features_train = vectorizer.fit_transform(train_X)
features_test_dev0, features_test_testA = (
    vectorizer.transform(documents)
    for documents in (test_X_dev0, test_X_A)
)
|
|
|
|
# Train a multinomial Naive Bayes classifier on the training count matrix
# (fit() returns the estimator itself, so construction and fitting chain),
# then predict labels for both held-out splits.
model = MultinomialNB().fit(features_train, train_Y)

y_dev0_pred, y_testA_pred = (
    model.predict(feature_matrix)
    for feature_matrix in (features_test_dev0, features_test_testA)
)
|
|
|
|
def _write_predictions(predictions, out_path):
    """Write *predictions* to *out_path* as tab-separated text, one per row, without an index column."""
    frame = pd.DataFrame(predictions)
    frame.to_csv(out_path, header=None, sep='\t', index=False)


# Each split's predictions are written next to that split's input file.
_write_predictions(y_dev0_pred, './dev-0/out.tsv')
_write_predictions(y_testA_pred, './test-A/out.tsv')
|