sport-text-classification-b.../classifier.py

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Module-level model state shared by train() and classify():
# train() fits both objects, classify() reuses them for prediction.
vectorizer = TfidfVectorizer()  # turns raw text into TF-IDF features
classifier = MultinomialNB()  # multinomial naive Bayes over those features


def train():
    """Fit the module-level vectorizer and classifier on ``train/train.tsv``.

    Each non-blank line is expected to hold an integer label, a tab, and the
    document text. Everything after the FIRST tab is used as the document, so
    text that itself contains tabs is kept whole; this matches ``classify``,
    which feeds entire input lines to the vectorizer. Blank lines are skipped,
    and a line that carries a label but no text contributes an empty document.

    Raises:
        ValueError: if the part of a line before the first tab is not an
            integer.
    """
    labels = []
    documents = []
    with open('train/train.tsv') as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # Nothing to learn from a blank line (e.g. at end of file).
                continue
            # partition() splits on the first tab only and yields '' for the
            # text when rstrip() has removed a trailing tab, so there is no
            # index that can go out of range.
            label, _, text = line.partition('\t')
            labels.append(int(label))
            documents.append(text)
    x = vectorizer.fit_transform(documents)
    classifier.fit(x, labels)


def classify(path):
    """Predict a label for every line of ``path + 'in.tsv'``.

    Each input line, with trailing whitespace removed, is treated as one
    document. Documents are vectorized with the module-level ``vectorizer``
    and labelled by the module-level ``classifier``; ``train()`` must have
    been called first so both are fitted. Predictions are written to
    ``path + 'out.tsv'``, one integer per line, in input order.

    Args:
        path: Prefix prepended verbatim to the file names, so a directory
            must include its trailing separator (e.g. ``'dev-0/'``).
    """
    source_name = path + 'in.tsv'
    target_name = path + 'out.tsv'

    with open(source_name) as source:
        samples = list(map(str.rstrip, source))

    labels = classifier.predict(vectorizer.transform(samples))

    # The output file is opened only after prediction has succeeded.
    with open(target_name, 'w') as sink:
        sink.writelines("%i\n" % label for label in labels)

# Script driver: fit the model once, then label each evaluation split.
train()
for _split_dir in ('dev-0/', 'test-A/'):
    classify(_split_dir)
Change to tfidf 2021-04-19 19:44:17 +02:00			`from sklearn.feature_extraction.text import TfidfVectorizer`
no message 2021-04-19 19:17:10 +02:00			`from sklearn.naive_bayes import MultinomialNB`

			`classifier = MultinomialNB()`
Refactor 2021-04-19 20:00:40 +02:00			`vectorizer = TfidfVectorizer()`


no message 2021-04-19 19:17:10 +02:00			`def train():`
			`with open('train/train.tsv') as f:`
			`docs = [line.rstrip() for line in f]`
			`docs_preprocessed = []`
			`y = []`
			`for doc in docs:`
Refactor 2021-04-19 20:00:40 +02:00			`y_with_doc = doc.split('\t')`
			`y.append(y_with_doc[0])`
			`doc = y_with_doc[1]`
Fix preprocessing 2021-04-28 20:21:12 +02:00			`docs_preprocessed.append(doc)`
Refactor 2021-04-19 20:00:40 +02:00			`y = [int(value) for value in y]`
			`x = vectorizer.fit_transform(docs_preprocessed)`
			`classifier.fit(x, y)`

no message 2021-04-19 19:17:10 +02:00
			`def classify(path):`
			`with open(path + 'in.tsv') as f:`
			`docs = [line.rstrip() for line in f]`
Refactor 2021-04-19 20:00:40 +02:00			`test_x = vectorizer.transform(docs)`
			`predictions = classifier.predict(test_x)`
no message 2021-04-19 19:17:10 +02:00			`with open(path + 'out.tsv', 'w') as file:`
Refactor 2021-04-19 20:00:40 +02:00			`for prediction in predictions:`
			`file.write("%i\n" % prediction)`
no message 2021-04-19 19:17:10 +02:00

			`train()`
			`classify('dev-0/')`
Fix preprocessing 2021-04-28 20:21:12 +02:00			`classify('test-A/')`