"""Train a TF-IDF + multinomial naive Bayes text classifier and write predictions.

The training set is read from a gzipped, tab-separated file whose two columns
are loaded as ``isBall`` (label) and ``text`` (document).  The fitted model is
then applied to each evaluation input file and one prediction per line is
written next to it.
"""

import gzip
import lzma

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

TRAIN_PATH = 'train/train.tsv.gz'
TRAIN_COLUMNS = ['isBall', 'text']

# (input file, output file) pairs scored with the fitted model, in order.
EVAL_FILES = (
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
)


def readFile(filename):
    """Return the first tab-separated field of every line in *filename*.

    The file is decoded as UTF-8 and each extracted field has its surrounding
    whitespace stripped.  Lines are returned in file order.
    """
    with open(filename, 'r', encoding="utf-8") as dev_in:
        return [line.split("\t")[0].strip() for line in dev_in]


def writePred(filename, predictions):
    """Write each item of *predictions* to *filename*, one per line.

    Items are converted with ``str``.  The file is written as UTF-8 so the
    output encoding matches the one ``readFile`` uses for input.
    """
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.writelines(str(pred) + "\n" for pred in predictions)


def _load_training_frame(path):
    """Load the gzipped training TSV at *path* into a DataFrame.

    Malformed rows are skipped with a warning.  Rows whose ``text`` value is
    missing are dropped, because ``TfidfVectorizer`` rejects NaN documents.
    """
    with gzip.open(path, 'rb') as f:
        try:
            # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in
            # 2.0; ``on_bad_lines="warn"`` is its skip-and-warn replacement.
            data = pd.read_csv(f, sep='\t', names=TRAIN_COLUMNS,
                               on_bad_lines='warn')
        except TypeError:
            # pandas < 1.3 does not know ``on_bad_lines``; rewind in case any
            # bytes were consumed and use the legacy keyword instead.
            f.seek(0)
            data = pd.read_csv(f, sep='\t', names=TRAIN_COLUMNS,
                               error_bad_lines=False)
    return data.dropna(subset=['text'])


def main():
    """Fit the classifier on the training set and write all prediction files."""
    data = _load_training_frame(TRAIN_PATH)
    x = np.asarray(data['text'])
    y = np.asarray(data['isBall'])

    model = make_pipeline(TfidfVectorizer(), MultinomialNB())
    model.fit(x, y)

    for in_path, out_path in EVAL_FILES:
        documents = readFile(in_path)
        writePred(out_path, model.predict(documents))


if __name__ == "__main__":
    main()