#!/usr/bin/env python
"""Train an XGBoost classifier on bag-of-words features and score dev-0.

Reads training documents from ``train/in.tsv`` (one document per line) and
their labels from ``train/expected.tsv``, fits a ``CountVectorizer`` plus an
``XGBClassifier`` on a fixed random subset of the training data, and prints,
for every line of ``dev-0/in.tsv``, one value per line: ``1 - P(first class)``.
"""
import csv
import sys

import numpy as np
from numpy import loadtxt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Seed and held-out fraction of the train/held-out split; they determine
# which training rows the model is fitted on.
SEED = 7
HELD_OUT_FRACTION = 0.33


def _read_lines(path):
    """Return the lines of *path* as a list, line endings included."""
    with open(path) as handle:
        return list(handle)


def _read_labels(path):
    """Return the labels stored in *path* as a flat NumPy array.

    The file is parsed with the default ``csv`` dialect and the resulting
    rows are flattened, so a single-column file yields one label per line.
    """
    with open(path) as handle:
        return np.ravel(list(csv.reader(handle)))


def main():
    """Fit the model and print one probability per dev-0 document."""
    train_docs = _read_lines("train/in.tsv")
    dev_docs = _read_lines("dev-0/in.tsv")
    raw_labels = _read_labels("train/expected.tsv")

    vectorizer = CountVectorizer()
    features = vectorizer.fit_transform(train_docs)

    # The labels are read as strings. Encode them to integers 0..k-1 (classes
    # in sorted order) so the classifier receives numeric targets; recent
    # XGBoost releases refuse non-integer class labels.
    targets = LabelEncoder().fit_transform(raw_labels)

    # train_test_split returns (X_train, X_test, y_train, y_test) in that
    # order. The held-out part is not used for anything below, but the split
    # is kept so the model is trained on the same subset of rows.
    X_train, _X_held_out, y_train, _y_held_out = train_test_split(
        features,
        targets,
        test_size=HELD_OUT_FRACTION,
        random_state=SEED,
    )

    model = XGBClassifier()
    model.fit(X_train, y_train)

    # Reuse the vocabulary learned on the training documents.
    dev_features = vectorizer.transform(dev_docs)
    for class_probabilities in model.predict_proba(dev_features):
        # Complement of the first class's probability; for a two-class
        # problem this is the probability of the second class.
        print(1 - class_probabilities[0])


if __name__ == "__main__":
    main()