sport-text-classification-b.../bayes.py
2021-05-06 10:35:19 +02:00

25 lines
826 B
Python

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
def _read_tsv(path, tolerate_bad_lines=False):
    """Load a header-less, tab-separated file into a DataFrame.

    Args:
        path: Location of the TSV file.
        tolerate_bad_lines: When true, malformed rows (rows with too many
            fields) are dropped with a warning instead of aborting the load.

    Returns:
        A DataFrame whose columns are labelled 0, 1, ...
    """
    options = {"header": None, "sep": "\t"}
    if not tolerate_bad_lines:
        return pd.read_csv(path, **options)
    try:
        # pandas >= 1.3 spelling. "warn" reproduces what a lone
        # error_bad_lines=False did: skip the row and report it.
        return pd.read_csv(path, on_bad_lines="warn", **options)
    except TypeError:
        # Older pandas rejects on_bad_lines; the legacy flag still exists
        # there (it was removed in pandas 2.0).
        return pd.read_csv(path, error_bad_lines=False, **options)


# NOTE(review): pandas' default quoting treats '"' as a quote character. If
# the inputs contain unbalanced quotes, rows could be merged and the number of
# predictions would no longer match the number of input lines -- confirm
# against the data before relying on the outputs.
df = _read_tsv("train/train.tsv", tolerate_bad_lines=True)
dev0 = _read_tsv("dev-0/in.tsv")
testA = _read_tsv("test-A/in.tsv")

# Evaluation inputs: the text to classify is the first column.
dev0X = dev0.iloc[:, 0].tolist()
testAX = testA.iloc[:, 0].tolist()

# Training data: column 0 holds the label, column 1 the text.
Y = df.iloc[:, 0].tolist()
X = df.iloc[:, 1].tolist()

# TF-IDF features feeding a multinomial naive Bayes classifier.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(X, Y)

dev0_predicted = model.predict(dev0X)
testA_predicted = model.predict(testAX)

# One prediction per line, without index or header.
pd.Series(dev0_predicted).to_csv("dev-0/out.tsv", sep="\t", index=False, header=False)
pd.Series(testA_predicted).to_csv("test-A/out.tsv", sep="\t", index=False, header=False)