47 lines
1.3 KiB
Python
47 lines
1.3 KiB
Python
|
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Train an XGBoost classifier on word counts and write class probabilities.

The script reads labelled texts from ``train/train.tsv.gz`` (columns: label,
text), fits a ``CountVectorizer`` + ``XGBClassifier`` pipeline on them, and
then writes one probability per line to ``dev-0/out.tsv`` and
``test-A/out.tsv`` for the texts found in the matching ``in.tsv`` files.
"""

import csv

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBClassifier


def _write_second_class_probs(path, probabilities):
    """Write column 1 of each row of *probabilities* to *path*, one per line.

    Column 1 of ``predict_proba`` output is the probability of the second
    class (presumably the positive label -- confirm against the training
    labels).
    """
    with open(path, "w") as out:
        out.writelines(str(row[1]) + "\n" for row in probabilities)


count_vect = CountVectorizer()

# --- Load data -------------------------------------------------------------
# QUOTE_NONE: quote characters inside the texts are ordinary characters and
# must not be interpreted as CSV field quoting.
train = pd.read_csv(
    "train/train.tsv.gz",
    compression="gzip",
    delimiter="\t",
    header=None,
    names=["y", "text"],
    quoting=csv.QUOTE_NONE,
)
texts = train["text"]
y = train["y"]

# --- Train -----------------------------------------------------------------
X_train_counts = count_vect.fit_transform(texts)
clf = XGBClassifier().fit(X_train_counts, y)

# Quick sanity output: first training text and the sizes of both columns.
print(texts[0])
print(len(texts))
print(len(y))

# --- Predict ---------------------------------------------------------------
# NOTE(review): pandas skips blank lines by default, so if an input file can
# contain empty lines the number of predictions would differ from the number
# of input lines -- the length prints below help spot that. Confirm whether
# the inputs can contain blank lines.
dev0 = pd.read_csv(
    "dev-0/in.tsv",
    delimiter="\t",
    header=None,
    names=["text"],
    quoting=csv.QUOTE_NONE,
)["text"]
testA = pd.read_csv(
    "test-A/in.tsv",
    delimiter="\t",
    header=None,
    names=["text"],
    quoting=csv.QUOTE_NONE,
)["text"]

# Reuse the vocabulary learned on the training set (transform, not fit).
dev0_new_counts = count_vect.transform(dev0)
testA_new_counts = count_vect.transform(testA)

predicted_dev0 = clf.predict_proba(dev0_new_counts)
predicted_testA = clf.predict_proba(testA_new_counts)

# Number of inputs and number of predictions should match.
print(len(dev0))
print(len(predicted_dev0))

# --- Write results ---------------------------------------------------------
_write_second_class_probs("dev-0/out.tsv", predicted_dev0)
_write_second_class_probs("test-A/out.tsv", predicted_testA)
|
||
|
|