"""Train a TF-IDF + multinomial naive Bayes classifier and label a TSV file.

The script fits the model on a gzipped, tab-separated training file (integer
label in the first column, text in the rest of the line), then predicts a
label for every row of ``<working dir>/in.tsv`` and writes the predictions,
one per line, to ``<working dir>/out.tsv``.

The working directory is taken from the first command-line argument.
"""
import gzip
import os
import sys

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

IN_FILE_NAME = "in.tsv"
OUT_FILE_NAME = "out.tsv"
# Default location of the training data, resolved against the current
# working directory of the process (not against the directory given to main).
TRAIN_FILE_PATH = "train/train.tsv.gz"


def _read_input_table(in_path):
    """Read the headerless TSV at *in_path*, skipping malformed rows with a warning."""
    try:
        # pandas >= 1.3 spelling; "warn" reproduces the old
        # error_bad_lines=False behaviour (skip the row and emit a warning).
        return pd.read_table(in_path, on_bad_lines="warn", header=None)
    except TypeError:
        # Older pandas does not know on_bad_lines and rejects it as an
        # unexpected keyword argument; fall back to the legacy option.
        return pd.read_table(in_path, error_bad_lines=False, header=None)


def _load_training_data(train_path):
    """Return ``(texts, labels)`` arrays parsed from the gzipped TSV at *train_path*.

    Each non-empty line must look like ``<int label>\\t<text>``. Everything
    after the first tab is used as the document text.

    Raises:
        ValueError: if the first column of a line is not an integer.
    """
    texts = []
    labels = []
    # newline="\n" keeps line splitting identical to iterating the binary
    # stream: a stray "\r" inside a document does not start a new record.
    with gzip.open(train_path, "rt", encoding="utf-8", newline="\n") as handle:
        for raw_line in handle:
            record = raw_line.rstrip("\n")
            if not record:
                # Tolerate blank lines (e.g. a trailing one) instead of
                # failing on int("").
                continue
            label, _, text = record.partition("\t")
            labels.append(int(label))
            texts.append(text)
    return np.asarray(texts), np.asarray(labels)


def main(dirname, train_path=TRAIN_FILE_PATH):
    """Fit the classifier and write predictions for ``dirname/in.tsv``.

    Args:
        dirname: Directory that contains ``in.tsv``; ``out.tsv`` is written
            to the same directory.
        train_path: Path of the gzipped training TSV. Defaults to
            ``train/train.tsv.gz`` relative to the current working directory.

    Raises:
        FileNotFoundError: if ``dirname/in.tsv`` does not exist.
    """
    in_path = os.path.join(dirname, IN_FILE_NAME)
    if not os.path.exists(in_path):
        raise FileNotFoundError(f"Path {in_path} does not exist!")

    table = _read_input_table(in_path)
    X_train, y_train = _load_training_data(train_path)

    # Only the first column of the input file is classified.
    # NOTE(review): rows skipped as malformed get no prediction, so out.tsv
    # can have fewer lines than in.tsv -- confirm that is acceptable.
    X = table[0].values

    model = make_pipeline(TfidfVectorizer(), MultinomialNB())
    model.fit(X_train, y_train)
    pred = model.predict(X)

    # One prediction per line, no trailing newline after the last one.
    pred.tofile(os.path.join(dirname, OUT_FILE_NAME), sep="\n")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        raise Exception("Name of working dir not specified!")
    main(sys.argv[1])