"""Polish-text document classification: Doc2Vec embeddings + MultinomialNB.

Reads `train/train.tsv` (label<TAB>text per line), trains a Doc2Vec (DBOW)
embedding model and a Multinomial Naive Bayes classifier on the MinMax-scaled
document vectors, then labels `<path>in.tsv` and writes `<path>out.tsv`.
"""

import string

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.cluster import KMeans  # NOTE(review): unused here; kept in case other code relies on it
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler, normalize  # NOTE(review): `normalize` unused

# Fitted on the training vectors in train(); reused verbatim in classify().
scaler = MinMaxScaler()

# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
# A set makes the per-token membership test O(1) instead of O(len(stopwords)).
with open('stopwords') as f:
    stopwords = set(line.rstrip() for line in f)

classifier = MultinomialNB()
d2v_model = None  # assigned by train(); read by classify()


def _preprocess(text):
    """Tokenize one document: lowercase, strip punctuation chars, drop
    stopwords and empty tokens. Returns a list of token strings."""
    tokens = text.lower().split(' ')
    tokens = [''.join(ch for ch in word if ch not in string.punctuation)
              for word in tokens]
    return [word for word in tokens if word not in stopwords and word != '']


def train():
    """Fit the module-level Doc2Vec model, scaler, and NB classifier.

    Expects `train/train.tsv` with one `label<TAB>text` record per line;
    labels must be integers. Side effects: rebinds `d2v_model`, fits
    `scaler` and `classifier` in place.
    """
    with open('train/train.tsv') as f:
        docs = [line.rstrip() for line in f]

    docs_preprocessed = []
    y = []
    for doc in docs:
        # maxsplit=1 keeps the whole text even if it contains further tabs
        # (the original split dropped everything after a second tab).
        label, text = doc.split('\t', 1)
        y.append(int(label))
        docs_preprocessed.append(_preprocess(text))

    tagged_documents = [TaggedDocument(tokens, [i])
                        for i, tokens in enumerate(docs_preprocessed)]
    global d2v_model
    d2v_model = Doc2Vec(tagged_documents, epochs=300, dm=0)

    # MultinomialNB requires non-negative features, hence the MinMax scaling.
    X = scaler.fit_transform(d2v_model.dv.vectors)
    classifier.fit(X, y)


def classify(path):
    """Label the documents in `<path>in.tsv`, writing one integer label per
    line to `<path>out.tsv`. Requires train() to have run first."""
    with open(path + 'in.tsv') as f:
        docs = [line.rstrip() for line in f]

    test_vectors = [d2v_model.infer_vector(_preprocess(doc)) for doc in docs]

    # BUG FIX: the classifier was trained on MinMax-scaled vectors, so the
    # same fitted scaler must be applied here. Clip at 0 because out-of-range
    # inferred vectors can scale below the training minimum, and MultinomialNB
    # rejects negative feature values.
    test_vectors = scaler.transform(test_vectors).clip(min=0)

    results = classifier.predict(test_vectors)

    with open(path + 'out.tsv', 'w') as file:
        for result in results:
            file.write("%i\n" % result)


train()
classify('dev-0/')
# classify('test-A/')