!xzcat train/in.tsv.xz | wc -l import gzip import re import string import ftfy import datetime import matplotlib.pyplot as plt import seaborn as sns import pandas as pd import numpy as np from scipy.sparse import hstack import csv import datetime from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer from sklearn.pipeline import Pipeline from sklearn.linear_model import SGDClassifier, LogisticRegression from sklearn.svm import LinearSVC from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB from sklearn.neural_network import MLPClassifier def load_set(path, isTest): dataset = pd.read_csv(path+"/in.tsv.xz", delimiter="\t",header=None,names=["text","date"],quoting=csv.QUOTE_NONE) if not isTest: expected = pd.read_csv(path+"/expected.tsv",header=None,names=["class"],dtype="category") return dataset, expected return dataset train_set, expected_train = load_set("train", False) dev_set, expected_dev = load_set("dev-0", False) dev_set_1, expected_dev_1 = load_set("dev-1", False) test_set = load_set("test-A", True) vectorize = CountVectorizer(stop_words='english',ngram_range=(1,3),strip_accents='ascii') vectorized = vectorize.fit_transform(train_set["text"]) X = vectorized y = expected_train["class"] bayes = LogisticRegression(max_iter=1000) bayes.fit(X,y) def predict_data(data): prepared = prepare_data(data) vectorized = vectorize.transform(data["text"]) predicted = bayes.predict_proba(vectorized)[:,1] predicted[predicted < 0.05] = 0.05 predicted[predicted > 0.95] = 0.95 return predicted dev_predicted = predict_data(dev_set) dev_predicted_1 = predict_data(dev_set1) test_predicted = predict_data(test_set) test_predicted = np.array([item.strip() for item in test_predicted]) dev_predicted = np.array([item.strip() for item in dev_predicted]) dev_predicted1 = np.array([item.strip() for item in dev_predicted_1]) np.savetxt('test-A/out.tsv', test_predicted, '%f') np.savetxt('dev-0/out.tsv', dev_predicted, '%f') np.savetxt('dev-1/out.tsv', dev_predicted_1, '%f') !wget https://gonito.net/get/bin/geval !chmod u+x geval !./geval -t "dev-0"