import csv

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBClassifier


def load_data(path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Each line has leading and trailing whitespace (including the newline)
    removed. Empty lines are kept as empty strings so that the result stays
    aligned line-for-line with the file.
    """
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f]


def save_data(path, data):
    """Write the element at index 1 of every row in *data* to *path*.

    One value is written per line. Rows must be indexable and have at least
    two elements; in this script they are rows of a ``predict_proba`` result.
    """
    with open(path, "w", encoding="utf-8") as out:
        # str() is used deliberately (rather than an f-string) so the text
        # form of each value is exactly what str() produces for its type.
        out.writelines(str(row[1]) + "\n" for row in data)


def _predict_to_file(clf, vectorizer, in_path, out_path):
    """Vectorize the documents in *in_path*, predict, and save to *out_path*."""
    documents = load_data(in_path)
    features = vectorizer.transform(documents)
    save_data(out_path, clf.predict_proba(features))


def program():
    """Train a bag-of-words XGBoost classifier and write predictions.

    Reads training documents from ``train/in.tsv`` and their labels from
    ``train/expected.tsv`` (one per line), fits a ``CountVectorizer`` and an
    ``XGBClassifier``, then writes column 1 of ``predict_proba`` for the
    ``dev-0``, ``dev-1`` and ``test-A`` splits to ``<split>/out.tsv``.
    """
    train = load_data("train/in.tsv")
    y = load_data("train/expected.tsv")

    # Labels are read as strings. Map them to integer codes 0..n_classes-1,
    # which is the form XGBClassifier expects for its targets. np.unique
    # sorts the distinct labels, so code 1 is the second label in sorted
    # order (presumably the positive class for "0"/"1" labels -- verify
    # against the data).
    _, y_encoded = np.unique(y, return_inverse=True)

    vectorizer = CountVectorizer()
    x_train = vectorizer.fit_transform(train)
    clf = XGBClassifier().fit(x_train, y_encoded)

    for split in ("dev-0", "dev-1", "test-A"):
        _predict_to_file(
            clf, vectorizer, split + "/in.tsv", split + "/out.tsv"
        )


# Runs the whole pipeline whenever this module is executed or imported.
program()