"""Train a TF-IDF + Multinomial Naive Bayes classifier and write predictions.

The model is fitted on the ``train`` split of the
``paranormal-or-skeptic-ISI-public`` dataset and then used to predict labels
for the ``dev-0`` and ``test-A`` splits.  Predictions are written as one
integer per line to ``out.tsv`` next to each split's input file.

On dev-0 the score was 0.7367223065250379.  The paths look a bit odd because
this was developed on Google Colab.
"""
import lzma

import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder


def openXZ(path):
    """Return all lines of the xz-compressed text file at *path*.

    Lines keep their trailing newline characters.  The file is decoded as
    UTF-8 explicitly so the result does not depend on the platform locale.
    """
    with lzma.open(path, mode='rt', encoding='utf-8') as f:
        return f.readlines()


def readFile(path):
    """Return all lines of the plain text file at *path*.

    Lines keep their trailing newline characters.  The file is decoded as
    UTF-8 explicitly so the result does not depend on the platform locale.
    """
    with open(path, encoding='utf-8') as source:
        return source.readlines()


# Documents to classify (dev-0 split) and the training data.
lines = openXZ('./paranormal-or-skeptic-ISI-public/dev-0/in.tsv.xz')
inData = openXZ('./paranormal-or-skeptic-ISI-public/train/in.tsv.xz')
expData = readFile('./paranormal-or-skeptic-ISI-public/train/expected.tsv')

# Strip the line terminator before encoding so that a final line without a
# trailing newline is not treated as a separate class.
expected = LabelEncoder().fit_transform(
    [label.rstrip('\n') for label in expData]
)

# Each input line is treated as one document: TF-IDF features feed the
# Naive Bayes classifier.
pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())
model = pipeline.fit(inData, expected)

# dev-0 predictions, one encoded label per line.
result = model.predict(lines)
np.savetxt('./paranormal-or-skeptic-ISI-public/dev-0/out.tsv',
           result, fmt='%d', delimiter='\n')

# test-A predictions, same format.
lines = openXZ('./paranormal-or-skeptic-ISI-public/test-A/in.tsv.xz')
result = model.predict(lines)
np.savetxt('./paranormal-or-skeptic-ISI-public/test-A/out.tsv',
           result, fmt='%d', delimiter='\n')