"""Train a TF-IDF + multinomial naive Bayes classifier and write predictions.

Running this file fits the model on ``train/train.tsv``, writes one predicted
label per input line for the ``dev-0`` and ``test-A`` sets, and finally
invokes the ``./geval`` tool on ``dev-0``.
"""
import os

import gensim as gensim
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from stop_words import get_stop_words


def read_train_file(inDirectory: str) -> pd.DataFrame:
    """Load the labelled training data from a header-less tab-separated file.

    Args:
        inDirectory: Path of the TSV file to read (a file path, despite the
            parameter name). The first column is loaded as ``Y`` and the
            second as ``X``.

    Returns:
        A DataFrame with the columns ``Y`` and ``X``.
    """
    return pd.read_csv(inDirectory, sep="\t", names=['Y', 'X'])


def read_evaluate_file(inDirectory: str) -> pd.DataFrame:
    """Load the documents to classify, one document per line.

    The file is read as raw text rather than through ``pd.read_csv`` with
    ``sep="\\n"``: recent pandas releases reject a newline separator with a
    ``ValueError``, and CSV parsing may skip blank lines or interpret quote
    characters, which would leave the predictions misaligned with the input
    lines.

    Args:
        inDirectory: Path of the text file to read (a file path, despite the
            parameter name).

    Returns:
        A DataFrame with a single string column ``X`` holding exactly one row
        per line of the file, in file order.
    """
    with open(inDirectory, encoding="utf-8") as handle:
        # Strip only the line terminator so every line, including an empty
        # one, still yields exactly one row (and thus one prediction).
        documents = [line.rstrip("\n") for line in handle]
    return pd.DataFrame({'X': documents})


def naivBayersWithTFIDFTrain(inDirectory: str) -> None:
    """Fit the module-level vectorizer and classifier on the training file.

    Args:
        inDirectory: Path of the training TSV passed to ``read_train_file``.
    """
    df_train = read_train_file(inDirectory)
    # fit_transform learns the vocabulary/IDF weights that the later
    # evaluation calls reuse through vectorizer.transform.
    train_vectorized_corpus = vectorizer.fit_transform(df_train['X'])
    gnb.fit(train_vectorized_corpus, df_train['Y'])


def naivBayersWithTFIDFEvaluate(inDirectory: str, outDirectory: str) -> None:
    """Predict a label for every input line and write the labels to a file.

    Must be called after ``naivBayersWithTFIDFTrain`` because it relies on
    the already fitted module-level ``vectorizer`` and ``gnb``.

    Args:
        inDirectory: Path of the file with one document per line.
        outDirectory: Path of the output file; it receives one integer label
            per line, in input order.
    """
    df_evaluate = read_evaluate_file(inDirectory)
    evaluate_vectorized_corpus = vectorizer.transform(df_evaluate['X'])
    predictions = gnb.predict(evaluate_vectorized_corpus)
    with open(outDirectory, 'w') as file:
        file.writelines("%i\n" % label for label in predictions)


# Shared model state used by the train/evaluate functions above.
vectorizer = TfidfVectorizer(stop_words=get_stop_words('polish'))
gnb = MultinomialNB()

# Script pipeline: train once, then label both evaluation sets.
naivBayersWithTFIDFTrain("train/train.tsv")
naivBayersWithTFIDFEvaluate("dev-0/in.tsv", "dev-0/out.tsv")
naivBayersWithTFIDFEvaluate("test-A/in.tsv", "test-A/out.tsv")

# Invoke the geval tool on the dev-0 directory.
os.system("./geval -t dev-0")