"""Train a TF-IDF + multinomial naive Bayes text classifier and label TSV files.

Running this module reads ``stopwords`` and ``train/train.tsv`` relative to
the current working directory, fits the model, and then writes an ``out.tsv``
of predictions for the ``in.tsv`` found under ``dev-0/`` and ``test-A/``.
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
# NOTE(review): the data files are assumed to be UTF-8; the encoding is passed
# explicitly so the result does not depend on the platform default.
with open('stopwords', encoding='utf-8') as f:
    stopwords = [line.rstrip() for line in f]

# Set copy of ``stopwords`` used for membership tests in ``preprocess``; it is
# built once here, so later mutations of the list are not reflected.
_stopword_set = frozenset(stopwords)

classifier = MultinomialNB()
vectorizer = TfidfVectorizer()


def preprocess(doc):
    """Lowercase *doc* and drop stopwords and empty tokens.

    The text is split on single spaces, so runs of spaces yield empty tokens,
    which are discarded together with every token found in the stopword list.

    Args:
        doc: Raw document text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    tokens = doc.lower().split(' ')
    kept = [word for word in tokens if word != '' and word not in _stopword_set]
    return ' '.join(kept)


def train():
    """Fit the module-level vectorizer and classifier on ``train/train.tsv``.

    Each line is expected to hold an integer label, a tab, and the document
    text; only the first two tab-separated fields are used.

    Raises:
        IndexError: If a line has no second tab-separated field.
        ValueError: If a label cannot be parsed as an integer.
    """
    labels = []
    texts = []
    with open('train/train.tsv', encoding='utf-8') as f:
        for line in f:
            fields = line.rstrip().split('\t')
            labels.append(int(fields[0]))
            texts.append(preprocess(fields[1]))

    features = vectorizer.fit_transform(texts)
    classifier.fit(features, labels)


def classify(path):
    """Predict a label for every line of ``in.tsv`` and write ``out.tsv``.

    Args:
        path: String prefix prepended to ``in.tsv`` and ``out.tsv``; it is
            concatenated as-is, so a directory needs its trailing separator
            (e.g. ``'dev-0/'``).
    """
    with open(path + 'in.tsv', encoding='utf-8') as f:
        docs_preprocessed = [preprocess(line.rstrip()) for line in f]

    # Transform the *preprocessed* documents so prediction sees the same
    # stopword-filtered text the vectorizer was fitted on in train().
    test_x = vectorizer.transform(docs_preprocessed)
    predictions = classifier.predict(test_x)

    with open(path + 'out.tsv', 'w', encoding='utf-8') as file:
        file.writelines(f"{int(prediction)}\n" for prediction in predictions)


train()
classify('dev-0/')
classify('test-A/')