"""Train a TF-IDF + logistic-regression text classifier and write predictions.

The script reads training documents from ``train/in.tsv.xz`` and integer
labels from ``train/expected.tsv``, fits a ``TfidfVectorizer`` followed by a
``LogisticRegression`` model, and then writes one prediction per line to
``out.tsv`` for each of the ``dev-0``, ``dev-1`` and ``test-A`` directories.
"""
import lzma

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


def readFile(filename):
    """Return the first tab-separated field of every line in *filename*.

    The file is read as UTF-8. Each returned string has surrounding
    whitespace stripped; any further tab-separated columns are ignored.
    """
    with open(filename, 'r', encoding="utf-8") as dev_in:
        return [line.split("\t")[0].strip() for line in dev_in]


def writePred(filename, predictions):
    """Write each item of *predictions* to *filename*, one per line.

    Items are converted with ``str()``. The file is overwritten if it
    already exists.
    """
    # Explicit encoding keeps the output independent of the platform default.
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.writelines(str(pred) + "\n" for pred in predictions)


# Training documents: one document per line, the whole stripped line is used.
# (The former 5000-line cap was never active because its counter was never
# incremented, so the complete file is read here as well.)
with lzma.open('train/in.tsv.xz', 'rt', encoding="utf-8") as f:
    X_train = [line.strip() for line in f]

# Training labels: one integer per line, aligned by position with X_train.
with open('train/expected.tsv', 'rt', encoding="utf-8") as f2:
    Y_train = [int(line.strip()) for line in f2]

# Fail fast with a clear message instead of only failing inside model.fit()
# after the (expensive) TF-IDF fit has already been done.
if len(X_train) != len(Y_train):
    raise ValueError(
        "train/in.tsv.xz has %d documents but train/expected.tsv has %d labels"
        % (len(X_train), len(Y_train))
    )

vectorizer = TfidfVectorizer()
document_vectors = vectorizer.fit_transform(X_train)

model = LogisticRegression()
model.fit(document_vectors, Y_train)

# Same read -> transform -> predict -> write pipeline for every evaluation
# split; the vectorizer is only applied here, never re-fitted.
for split in ('dev-0', 'dev-1', 'test-A'):
    X_dev = vectorizer.transform(readFile(split + '/in.tsv'))
    predictions = model.predict(X_dev)
    writePred(split + '/out.tsv', predictions)