2020-05-26 11:50:16 +02:00
|
|
|
!xzcat train/in.tsv.xz | wc -l
|
|
|
|
|
|
|
|
import gzip
|
|
|
|
import re
|
|
|
|
import string
|
|
|
|
import ftfy
|
|
|
|
import datetime
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
import seaborn as sns
|
|
|
|
import pandas as pd
|
|
|
|
import numpy as np
|
|
|
|
from scipy.sparse import hstack
|
|
|
|
import csv
|
|
|
|
import datetime
|
|
|
|
|
|
|
|
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
|
|
|
|
from sklearn.pipeline import Pipeline
|
|
|
|
from sklearn.linear_model import SGDClassifier, LogisticRegression
|
|
|
|
from sklearn.svm import LinearSVC
|
|
|
|
from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB
|
|
|
|
from sklearn.neural_network import MLPClassifier
|
|
|
|
|
|
|
|
def load_set(path, isTest):
    """Load one data split from the directory *path*.

    Reads ``<path>/in.tsv.xz`` as a header-less, tab-separated table whose
    two columns are named ``text`` and ``date``.  Unless *isTest* is true,
    ``<path>/expected.tsv`` is read as well, into a single categorical
    column named ``class``.

    Args:
        path: Directory of the split, without a trailing slash.
        isTest: True when the split has no ``expected.tsv`` to load.

    Returns:
        The input ``DataFrame`` alone when *isTest* is true, otherwise the
        tuple ``(dataset, expected)``.
    """
    # QUOTE_NONE makes the parser treat quote characters as ordinary text
    # instead of field delimiters.
    dataset = pd.read_csv(
        path + "/in.tsv.xz",
        delimiter="\t",
        header=None,
        names=["text", "date"],
        quoting=csv.QUOTE_NONE,
    )
    if isTest:
        return dataset

    expected = pd.read_csv(
        path + "/expected.tsv",
        header=None,
        names=["class"],
        dtype="category",
    )
    return dataset, expected
|
|
|
|
|
|
|
|
# Labelled splits come back as an (inputs, expected) pair.
train_set, expected_train = load_set("train", isTest=False)
dev_set, expected_dev = load_set("dev-0", isTest=False)
dev_set_1, expected_dev_1 = load_set("dev-1", isTest=False)

# The test split is loaded without an expected.tsv, so only inputs return.
test_set = load_set("test-A", isTest=True)
|
|
|
|
|
|
|
|
# Token counts over 1- to 3-grams, with English stop words dropped and
# accented characters folded to ASCII.
vectorize = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    strip_accents='ascii',
)
vectorized = vectorize.fit_transform(train_set["text"])

X, y = vectorized, expected_train["class"]

# NOTE: despite its name, `bayes` holds a logistic-regression classifier.
# fit() returns the estimator itself, so the chained call binds the
# trained model.
bayes = LogisticRegression(max_iter=1000).fit(X, y)
|
|
|
|
|
|
|
|
def predict_data(data):
    """Return the predicted probability of the second class for each row.

    The ``text`` column of *data* is transformed with the module-level
    ``vectorize`` and scored with the module-level ``bayes`` classifier.
    The text is vectorized as-is, which matches how the vectorizer was
    fitted on the training split.

    Args:
        data: Table with a ``text`` column (as returned by ``load_set``).

    Returns:
        1-D float array holding column 1 of ``predict_proba`` (the
        classifier's second class).  Scores below 0.05 are replaced by
        0.00000005 and scores above 0.95 by 0.99999995; everything in
        between is returned unchanged.
    """
    features = vectorize.transform(data["text"])
    predicted = bayes.predict_proba(features)[:, 1]

    # Push confident scores almost all the way to 0 / 1.
    predicted[predicted < 0.05] = 0.00000005
    predicted[predicted > 0.95] = 0.99999995
    return predicted
|
|
|
|
|
|
|
|
# Score every split that gets an out.tsv.
dev_predicted = predict_data(dev_set)
# The dev-1 inputs are bound to `dev_set_1` (with the second underscore).
dev_predicted_1 = predict_data(dev_set_1)
test_predicted = predict_data(test_set)
|
|
|
|
|
|
|
|
# predict_data() returns float arrays, so there is no string whitespace to
# strip; just make sure each result is a float ndarray before saving.
test_predicted = np.asarray(test_predicted, dtype=float)
dev_predicted = np.asarray(dev_predicted, dtype=float)
# Both spellings are bound: `dev_predicted_1` is the one written to disk,
# `dev_predicted1` is kept so existing references to it still resolve.
dev_predicted_1 = dev_predicted1 = np.asarray(dev_predicted_1, dtype=float)
|
|
|
|
|
|
|
|
# One score per line, fixed-point formatted, next to each split's inputs.
for out_path, predictions in (
    ('test-A/out.tsv', test_predicted),
    ('dev-0/out.tsv', dev_predicted),
    ('dev-1/out.tsv', dev_predicted_1),
):
    np.savetxt(out_path, predictions, fmt='%f')
|
|
|
|
|
|
|
|
!wget https://gonito.net/get/bin/geval
|
|
|
|
!chmod u+x geval
|
|
|
|
|
|
|
|
!./geval -t "dev-0"
|