from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
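
# Trains a Multinomial Naive Bayes classifier on TF-IDF features built from
# train/train.tsv and writes one prediction per line to dev-0/out.tsv and test-A/out.tsv.
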
stopwords = []
# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
with open('stopwords') as f:
    stopwords = [line.rstrip() for line in f]

classifier = MultinomialNB()
vectorizer = TfidfVectorizer()


def preprocess(doc):
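    # Lowercase the document, drop stopwords and empty tokens, then rejoin into one string.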
    doc = doc.lower().split(' ')
    doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
    doc = ' '.join(doc)
    return doc


def train():
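    # Each line of train/train.tsv is "<label>\t<document>".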
    with open('train/train.tsv') as f:
        docs = [line.rstrip() for line in f]
    docs_preprocessed = []
    y = []
    for doc in docs:
        y_with_doc = doc.split('\t')
        y.append(y_with_doc[0])
        doc = y_with_doc[1]
        docs_preprocessed.append(preprocess(doc))
    y = [int(value) for value in y]
    # Learn the TF-IDF vocabulary on the training documents and fit the classifier.
    x = vectorizer.fit_transform(docs_preprocessed)
    classifier.fit(x, y)


def classify(path):
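    # Reads <path>in.tsv, predicts a label for each document, writes the labels to <path>out.tsv.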
    with open(path + 'in.tsv') as f:
        docs = [line.rstrip() for line in f]
    docs_preprocessed = []
    for doc in docs:
        docs_preprocessed.append(preprocess(doc))
    # Vectorize the preprocessed documents with the vocabulary fitted during training.
    test_x = vectorizer.transform(docs_preprocessed)
    predictions = classifier.predict(test_x)
    with open(path + 'out.tsv', 'w') as file:
        for prediction in predictions:
            file.write("%i\n" % prediction)


train()
classify('dev-0/')
classify('test-A/')