sport-text-classification-b.../classifier.py

35 lines
939 B
Python
Raw Normal View History

2021-04-19 19:44:17 +02:00
from sklearn.feature_extraction.text import TfidfVectorizer
2021-04-19 19:17:10 +02:00
from sklearn.naive_bayes import MultinomialNB
# Shared Naive Bayes model: fitted in train(), used for prediction in classify().
classifier = MultinomialNB()
2021-04-19 20:00:40 +02:00
# Shared TF-IDF vectorizer: train() fits it on the training documents and
# classify() reuses it to transform the documents it predicts on.
vectorizer = TfidfVectorizer()
2021-04-19 19:17:10 +02:00
def train():
    """Fit the module-level ``vectorizer`` and ``classifier`` on ``train/train.tsv``.

    Every non-blank line of the file must hold at least two tab-separated
    columns: an integer label followed by the document text. Only the first
    two columns are used; any further columns are ignored. Blank lines are
    skipped.

    Raises:
        ValueError: if a non-blank line has no text column, or if its label
            cannot be parsed as an integer.
    """
    labels = []
    texts = []
    # Explicit UTF-8 so that reading does not depend on the platform locale.
    with open('train/train.tsv', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            row = raw_line.rstrip()
            if not row:
                # Blank (or whitespace-only) lines carry no example.
                continue
            fields = row.split('\t')
            if len(fields) < 2:
                raise ValueError(
                    "train/train.tsv line %d: expected '<label>\\t<text>'"
                    % line_no
                )
            labels.append(int(fields[0]))
            texts.append(fields[1])
    x = vectorizer.fit_transform(texts)
    classifier.fit(x, labels)
2021-04-19 19:17:10 +02:00
def classify(path):
    """Predict a label for each line of ``path + 'in.tsv'``.

    One prediction is written per input line, in input order, to
    ``path + 'out.tsv'``. The file names are appended directly to ``path``,
    so a directory must be given with its trailing separator
    (e.g. ``'dev-0/'``).

    Args:
        path: Prefix placed in front of ``in.tsv`` and ``out.tsv``.
    """
    # Explicit UTF-8 so that reading does not depend on the platform locale.
    with open(path + 'in.tsv', encoding='utf-8') as f:
        docs = [line.rstrip() for line in f]

    if docs:
        test_x = vectorizer.transform(docs)
        predictions = classifier.predict(test_x)
    else:
        # Nothing to predict: skip the model (scikit-learn estimators
        # generally reject zero-sample input) and emit an empty output file.
        predictions = []

    with open(path + 'out.tsv', 'w', encoding='utf-8') as file:
        file.writelines("%i\n" % prediction for prediction in predictions)
2021-04-19 19:17:10 +02:00
# Script driver: fit the model once, then write predictions for each
# evaluation split in turn.
train()
for split_dir in ('dev-0/', 'test-A/'):
    classify(split_dir)