petite-difference-challenge2/train.py

67 lines
2.2 KiB
Python

# Report how many rows the training input has.  The notebook did this with the
# IPython shell escape `!xzcat train/in.tsv.xz | wc -l`, which is a syntax
# error in a plain .py file; counting newline bytes in the decompressed stream
# prints the same number that `wc -l` would.
import lzma

with lzma.open("train/in.tsv.xz", "rb") as _train_stream:
    print(sum(
        _chunk.count(b"\n")
        for _chunk in iter(lambda: _train_stream.read(1 << 20), b"")
    ))
import gzip
import re
import string
import ftfy
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy.sparse import hstack
import csv
import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB
from sklearn.neural_network import MLPClassifier
def load_set(path, isTest):
    """Load one data split from the directory ``path``.

    ``<path>/in.tsv.xz`` is read as a tab-separated file without a header row
    into the columns ``text`` and ``date``.  Quoting is disabled
    (``csv.QUOTE_NONE``) so quote characters inside the text are kept as
    ordinary characters instead of being interpreted as field delimiters.

    Args:
        path: Directory holding ``in.tsv.xz`` and, for labelled splits,
            ``expected.tsv``.
        isTest: When true, only the inputs are read and returned.

    Returns:
        The input ``DataFrame`` alone when ``isTest`` is true; otherwise a
        ``(dataset, expected)`` tuple, where ``expected`` is a ``DataFrame``
        with a single categorical column ``class`` read from
        ``<path>/expected.tsv``.
    """
    dataset = pd.read_csv(
        path + "/in.tsv.xz",
        delimiter="\t",
        header=None,
        names=["text", "date"],
        quoting=csv.QUOTE_NONE,
    )
    if isTest:
        # Unlabelled split: there is no expected.tsv to read.
        return dataset
    expected = pd.read_csv(
        path + "/expected.tsv",
        header=None,
        names=["class"],
        dtype="category",
    )
    return dataset, expected
# Read every split up front.  The three labelled splits come back as an
# (inputs, labels) pair; test-A is loaded as inputs only.
train_set, expected_train = load_set("train", isTest=False)
dev_set, expected_dev = load_set("dev-0", isTest=False)
dev_set_1, expected_dev_1 = load_set("dev-1", isTest=False)
test_set = load_set("test-A", isTest=True)

# Bag of 1- to 3-grams over the raw text, with English stop words dropped and
# accents folded to ASCII.  The vocabulary is learned from the training split.
vectorize = CountVectorizer(
    strip_accents="ascii",
    stop_words="english",
    ngram_range=(1, 3),
)
X = vectorized = vectorize.fit_transform(train_set["text"])
y = expected_train["class"]

# NOTE: despite its name, `bayes` is a logistic-regression model; the name is
# kept because the prediction code refers to it.
bayes = LogisticRegression(max_iter=1000).fit(X, y)
def predict_data(data):
    """Return clipped class probabilities for every row of ``data``.

    The ``text`` column of ``data`` is transformed with the module-level
    ``vectorize`` (already fitted on the training split) and scored with the
    module-level ``bayes`` model.

    Args:
        data: Object indexable by ``"text"`` that yields the documents to
            score (the data sets produced by ``load_set`` in this file).

    Returns:
        A 1-D array holding column 1 of ``bayes.predict_proba`` for each row,
        where values below 0.05 are replaced by 0.00000005 and values above
        0.95 by 0.99999995.
    """
    # The original first called `prepare_data(data)` and discarded the result.
    # No such function is defined in this file, so the call raised NameError;
    # training also uses the raw `text` column, so nothing is lost by
    # dropping it.
    features = vectorize.transform(data["text"])
    predicted = bayes.predict_proba(features)[:, 1]
    # Snap confident predictions towards 0 / 1 without ever emitting an exact
    # 0 or 1.
    predicted[predicted < 0.05] = 0.00000005
    predicted[predicted > 0.95] = 0.99999995
    return predicted
# Score every evaluation split with the fitted model.
dev_predicted = predict_data(dev_set)
# Fixed: the original passed `dev_set1`, which is never defined (the data set
# loaded from dev-1 is named `dev_set_1`).
dev_predicted_1 = predict_data(dev_set_1)
test_predicted = predict_data(test_set)

# The original then ran `item.strip()` over each array; the items are numpy
# floats, which have no `strip`, so that step raised AttributeError and is
# removed -- the arrays can be written as they are.
#
# Eight decimals are required: with the original '%f' (six decimals) the
# clipped values 0.00000005 and 0.99999995 were written as 0.000000 and
# 1.000000, undoing the clipping done in predict_data.
np.savetxt('test-A/out.tsv', test_predicted, fmt='%.8f')
np.savetxt('dev-0/out.tsv', dev_predicted, fmt='%.8f')
np.savetxt('dev-1/out.tsv', dev_predicted_1, fmt='%.8f')
# Download the geval evaluator and score the dev-0 predictions.  The notebook
# used IPython shell escapes (`!wget ...`, `!chmod ...`, `!./geval ...`), which
# are a syntax error in a plain .py file; the same three commands are run here
# through subprocess with explicit argument lists (no shell involved).
# NOTE(review): this fetches a binary and executes it without any integrity
# check -- consider pinning a checksum or vendoring the tool.
import subprocess

subprocess.run(["wget", "https://gonito.net/get/bin/geval"], check=True)
subprocess.run(["chmod", "u+x", "geval"], check=True)
subprocess.run(["./geval", "-t", "dev-0"], check=True)