#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Train a bag-of-words XGBoost classifier and score two evaluation sets.

The script reads labelled texts from ``train/train.tsv.gz`` (columns: label,
text), fits a ``CountVectorizer`` and an ``XGBClassifier`` on them, then
scores ``dev-0/in.tsv`` and ``test-A/in.tsv``.  For every input line it
writes the second column of ``predict_proba`` to the matching ``out.tsv``,
one value per line.
"""
import csv

import numpy as np
import pandas as pd


def read_train(path):
    """Load the gzip-compressed training TSV at *path*.

    Returns a DataFrame with the columns ``y`` (label) and ``text``.

    ``keep_default_na=False`` and ``dtype=str`` keep texts such as ``NA``,
    ``null`` or ``123`` as plain strings; otherwise pandas would turn them
    into NaN / numbers, which ``CountVectorizer`` cannot tokenize.
    """
    frame = pd.read_csv(
        path,
        compression="gzip",
        delimiter="\t",
        header=None,
        names=["y", "text"],
        quoting=csv.QUOTE_NONE,
        dtype={"text": str},
        keep_default_na=False,
    )
    # Belt and braces: any value that still ended up missing becomes "".
    frame["text"] = frame["text"].fillna("")
    return frame


def read_texts(path):
    """Load a one-column TSV of texts at *path* and return it as a Series.

    The result has exactly one entry per input line.  Blank lines are kept
    (as empty strings) instead of being skipped, so that the predictions
    written later stay aligned line-by-line with the input file.
    """
    frame = pd.read_csv(
        path,
        delimiter="\t",
        header=None,
        names=["text"],
        quoting=csv.QUOTE_NONE,
        dtype={"text": str},
        keep_default_na=False,
        # pandas drops blank lines by default, which would shift every
        # following prediction up by one line in the output file.
        skip_blank_lines=False,
    )
    return frame["text"].fillna("")


def write_positive_class_probabilities(path, probabilities):
    """Write the second element of each row of *probabilities* to *path*.

    *probabilities* is an iterable of indexable rows (for example the array
    returned by ``predict_proba``).  Each value is written with ``str()``
    on its own line.
    """
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(f"{row[1]}\n" for row in probabilities)


def main():
    """Train on the training set and write predictions for dev-0 and test-A."""
    # The ML dependencies are imported here rather than at module level so
    # the I/O helpers above stay importable without them installed.
    from sklearn.feature_extraction.text import CountVectorizer
    from xgboost import XGBClassifier

    # load data:
    train = read_train("train/train.tsv.gz")
    texts = train["text"]
    y = train["y"]

    # train
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(texts)
    clf = XGBClassifier().fit(X_train_counts, y)
    print(texts[0])
    print(len(texts))
    print(len(y))

    # predict
    dev0 = read_texts("dev-0/in.tsv")
    testA = read_texts("test-A/in.tsv")
    predicted_dev0 = clf.predict_proba(count_vect.transform(dev0))
    predicted_testA = clf.predict_proba(count_vect.transform(testA))
    # Sanity check: number of inputs vs. number of predictions.
    print(len(dev0))
    print(len(predicted_dev0))

    write_positive_class_probabilities("dev-0/out.tsv", predicted_dev0)
    write_positive_class_probabilities("test-A/out.tsv", predicted_testA)


if __name__ == "__main__":
    main()