
# Loading Data


In [0]:
# Count the lines of the compressed training input by streaming the
# decompressed content through `wc -l` (nothing is unpacked to disk).
!xzcat train/in.tsv.xz | wc -l

289579


In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy.sparse import hstack
import csv
import datetime

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB

In [0]:
def load_set(path, isTest):
  """Load one split of the dataset from the directory ``path``.

  Reads ``<path>/in.tsv.xz`` as a headerless, tab-separated file with two
  columns, ``text`` and ``date``. Quoting is disabled (``csv.QUOTE_NONE``) so
  quote characters inside the text are kept literally instead of being
  treated as field delimiters.

  The ``date`` column is interpreted as Unix timestamps in seconds and
  converted to timezone-naive ``datetime64`` values expressed in UTC.

  Args:
    path: Directory containing ``in.tsv.xz`` and, for labelled splits,
      ``expected.tsv``.
    isTest: When true the split is unlabelled and only the inputs are
      returned.

  Returns:
    ``dataset`` when ``isTest`` is true; otherwise the tuple
    ``(dataset, expected)`` where ``expected`` is a DataFrame with a single
    categorical ``class`` column read from ``expected.tsv``.
  """
  dataset = pd.read_csv(
    path + "/in.tsv.xz",
    delimiter="\t",
    header=None,
    names=["text", "date"],
    quoting=csv.QUOTE_NONE,
  )
  # Vectorised epoch -> datetime conversion. Unlike a per-row
  # datetime.fromtimestamp() call this does not depend on the local timezone
  # of the machine running the notebook, so results are reproducible.
  dataset["date"] = pd.to_datetime(dataset["date"], unit="s")
  if isTest:
    return dataset
  expected = pd.read_csv(
    path + "/expected.tsv",
    header=None,
    names=["class"],
    dtype="category",
  )
  return dataset, expected

**Load all sets**

In [0]:
# Labelled splits yield (inputs, labels); the unlabelled test split yields inputs only.
train_set, expected_train = load_set("train", isTest=False)
dev_set, expected_dev = load_set("dev-0", isTest=False)
test_set = load_set("test-A", isTest=True)

# Prepare data

In [0]:
def prepare_data(data):
  """Add ``day``, ``month`` and ``year`` columns taken from ``data["date"]``.

  The columns are written onto ``data`` itself (the frame is modified in
  place) and that same frame is returned.
  """
  stamps = data["date"].dt
  for part in ("day", "month", "year"):
    data[part] = getattr(stamps, part)
  return data

In [0]:
# Add the day/month/year columns to the training frame. prepare_data writes
# the columns onto the frame it is given and returns that same frame.
train_set = prepare_data(train_set)

In [0]:
# Show five random rows as a sanity check. No random_state is passed, so the
# rows shown differ from run to run.
train_set.sample(5)

Unnamed: 0,text,date,day,month,year
103770,Holy crap. I don't think I've seen or heard o...,2010-07-16 19:27:08,16,7,2010
240391,You lost all pretense of civility with your ar...,2010-09-30 12:18:36,30,9,2010
220910,What do people think of ghost adventures? Cur...,2012-08-21 19:59:56,21,8,2012
39644,Congrats on getting the joke.,2011-07-29 18:19:46,29,7,2011
220867,We live in a world where any media can be copi...,2012-07-18 08:53:24,18,7,2012


# Train

In [0]:
# Token-count features over 1- to 3-grams, with English stop words removed and
# accents folded to ASCII. The fitted vectorizer is reused at prediction time.
vectorize = CountVectorizer(
  strip_accents='ascii',
  stop_words='english',
  ngram_range=(1, 3),
)
vectorized = vectorize.fit_transform(train_set["text"])

In [0]:
# Feature matrix: the n-gram counts computed from the training text.
X = vectorized
# Target labels: the "class" column loaded alongside the training inputs.
y = expected_train["class"]

In [0]:
# Multinomial naive Bayes on the count features. alpha is the smoothing
# parameter; 0.4 was presumably picked by experimentation - TODO confirm.
bayes = MultinomialNB(alpha=0.4)
# Left as the last expression so the fitted estimator's repr is displayed.
bayes.fit(X,y)

MultinomialNB(alpha=0.4, class_prior=None, fit_prior=True)

# Predict and evaluate

In [0]:
def predict_data(data):
  """Predict a class label for every row of ``data``.

  ``data`` is first passed through ``prepare_data`` (which adds the date-part
  columns to it), then its ``text`` column is transformed with the
  already-fitted ``vectorize`` and classified with the fitted ``bayes``
  model. Only the text features are fed to the classifier.

  Returns whatever ``bayes.predict`` returns for the transformed text.
  """
  features = vectorize.transform(prepare_data(data)["text"])
  return bayes.predict(features)

In [0]:
# Predict labels for the dev split so they can be compared with expected_dev.
dev_predicted = predict_data(dev_set)

In [0]:
# Accuracy on the dev split: the fraction of predictions equal to the expected class.
(dev_predicted == expected_dev["class"]).mean()

0.8201820940819423

In [0]:
# Predict labels for the unlabelled test split; these are only written to disk.
test_predicted = predict_data(test_set)

**Clean output for saving**

In [0]:
# Remove surrounding whitespace from every predicted label in both result sets.
test_predicted, dev_predicted = (
  np.array([label.strip() for label in labels])
  for labels in (test_predicted, dev_predicted)
)

**Save to file**


In [0]:
# Write one predicted label per line. '%s' is used instead of '%c' because
# '%c' raises TypeError for any label that is not exactly one character long;
# for single-character labels the files written are the same.
np.savetxt('test-A/out.tsv', test_predicted, fmt='%s')
np.savetxt('dev-0/out.tsv', dev_predicted, fmt='%s')

**Check geval output**

In [0]:
!wget https://gonito.net/get/bin/geval
!chmod u+x geval

In [0]:
# Run geval against the dev-0 split (presumably it scores dev-0/out.tsv
# against dev-0/expected.tsv - confirm against the geval documentation).
!./geval -t "dev-0"

0.8202
