sport-text-classification-b.../Untitled.ipynb
2021-06-20 20:05:08 +02:00

2.3 KiB

from sklearn.naive_bayes import GaussianNB
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

r_in = './train/train.tsv'
r_ind_ev = './dev-0/in.tsv'


def read_documents(path):
    """Load an unlabeled input file, one document per physical line.

    The file is read with plain line iteration instead of a CSV parser so
    that every line yields exactly one row: lines containing extra tab
    characters, unbalanced quote characters, or nothing at all are kept
    rather than skipped. A prediction file produced from the result
    therefore has as many entries as the input has lines.

    Args:
        path: Path to a UTF-8 encoded text file.

    Returns:
        A DataFrame with a single column ``0`` holding the text of each
        line, without its line terminator.
    """
    # newline="\n" splits on LF only; the CR of a CRLF terminator is
    # stripped together with the LF.
    with open(path, encoding="utf-8", newline="\n") as handle:
        documents = [line.rstrip("\r\n") for line in handle]
    return pd.DataFrame({0: documents})


def read_labeled_documents(path):
    """Load a training file whose lines have the form ``label<TAB>text``.

    Each line is split on its first tab only, so tabs inside the text stay
    part of the document instead of invalidating the row. Lines without any
    tab (including blank lines) carry no usable label and are dropped;
    dropping training rows does not affect the alignment of predictions.

    Args:
        path: Path to a UTF-8 encoded text file.

    Returns:
        A DataFrame with the labels in column ``0`` and the texts in
        column ``1``. Labels are converted to a numeric dtype when every
        label parses as a number; otherwise they remain strings.
    """
    labels = []
    texts = []
    with open(path, encoding="utf-8", newline="\n") as handle:
        for line in handle:
            label, separator, text = line.rstrip("\r\n").partition("\t")
            if not separator:
                continue
            labels.append(label)
            texts.append(text)

    frame = pd.DataFrame({0: labels, 1: texts})
    try:
        frame[0] = pd.to_numeric(frame[0])
    except (ValueError, TypeError):
        # Non-numeric class names: keep the labels as strings.
        pass
    return frame


if __name__ == "__main__":
    # Training data: label in column 0, document text in column 1.
    tsv_read = read_labeled_documents(r_in)
    # Dev data: one document per line in column 0.
    tsv_read_dev = read_documents(r_ind_ev)

    y_train = tsv_read[0].values
    X_train = tsv_read[1].values
    X_dev = tsv_read_dev[0].values

    # Fit the TF-IDF vocabulary on the training texts only; the dev and
    # test texts are projected onto that same vocabulary with transform().
    vectorizer = TfidfVectorizer()
    counts = vectorizer.fit_transform(X_train)

    classifier = MultinomialNB()
    classifier.fit(counts, y_train)

    counts2 = vectorizer.transform(X_dev)
    predictions = classifier.predict(counts2)

    # One predicted label per input line, newline separated.
    predictions.tofile("./dev-0/out.tsv", sep='\n')

    tsv_read_test_in = read_documents('./test-A/in.tsv')
    X_test = tsv_read_test_in[0].values

    counts3 = vectorizer.transform(X_test)
    predictions_test_A = classifier.predict(counts3)
    predictions_test_A.tofile('./test-A/out.tsv', sep='\n')
b'Skipping line 25706: expected 2 fields, saw 3\nSkipping line 58881: expected 2 fields, saw 3\nSkipping line 73761: expected 2 fields, saw 3\n'
b'Skipping line 1983: expected 1 fields, saw 2\nSkipping line 5199: expected 1 fields, saw 2\n'