15 KiB
15 KiB
!git clone git://gonito.net/paranormal-or-skeptic
Cloning into 'paranormal-or-skeptic'... remote: Enumerating objects: 3583, done.[K remote: Counting objects: 100% (3583/3583), done.[K remote: Compressing objects: 100% (3188/3188), done.[K remote: Total 3583 (delta 789), reused 2704 (delta 338) Receiving objects: 100% (3583/3583), 202.38 MiB | 4.18 MiB/s, done. Resolving deltas: 100% (789/789), done.
Loading Data
!xzcat train/in.tsv.xz | wc -l
289579
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy.sparse import hstack
import csv
import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB
from sklearn.neural_network import MLPClassifier
def load_set(path, isTest):
    """Load one split of the dataset from the directory ``path``.

    Reads ``<path>/in.tsv.xz`` as a headerless, tab-separated file with two
    columns (``text`` and ``date``) and converts the ``date`` column from
    Unix timestamps (seconds) to pandas datetimes.

    The conversion is done with a single vectorised call and interprets the
    timestamps as UTC, so the resulting dates do not depend on the local
    time zone of the machine running the code.

    Args:
        path: Directory containing ``in.tsv.xz`` and, for labelled splits,
            ``expected.tsv``.
        isTest: When true, ``expected.tsv`` is not read and only the inputs
            are returned.

    Returns:
        The inputs ``DataFrame`` alone when ``isTest`` is true; otherwise a
        ``(dataset, expected)`` tuple where ``expected`` is a one-column
        ``DataFrame`` named ``class`` with categorical dtype.
    """
    dataset = pd.read_csv(
        path + "/in.tsv.xz",
        delimiter="\t",
        header=None,
        names=["text", "date"],
        # The text column is free-form; quote characters must be kept as
        # literal text rather than treated as field delimiters.
        quoting=csv.QUOTE_NONE,
    )
    dataset["date"] = pd.to_datetime(dataset["date"], unit="s")
    if isTest:
        return dataset
    expected = pd.read_csv(
        path + "/expected.tsv",
        header=None,
        names=["class"],
        dtype="category",
    )
    return dataset, expected
Load all sets
# Labelled splits unpack into (inputs, labels); the test split yields inputs only.
train_set, expected_train = load_set("train", isTest=False)
dev_set, expected_dev = load_set("dev-0", isTest=False)
test_set = load_set("test-A", isTest=True)
Prepare data
def prepare_data(data):
    """Add ``day``, ``month`` and ``year`` columns derived from ``data["date"]``.

    The frame is modified in place and the same object is returned, so
    callers may either rebind the result or rely on the side effect.
    """
    stamps = data["date"].dt
    for part in ("day", "month", "year"):
        data[part] = getattr(stamps, part)
    return data
# Add the day/month/year columns to the training inputs.
train_set = prepare_data(train_set)
# Display five randomly chosen rows as a quick sanity check of the new columns.
train_set.sample(5)
text | date | day | month | year | |
---|---|---|---|---|---|
112652 | As i hovered over that link I was expecting r/... | 2012-03-23 13:34:29 | 23 | 3 | 2012 |
172265 | Caesarean section is now the new natural child... | 2012-04-19 14:28:59 | 19 | 4 | 2012 |
150100 | The Somerton Man reminds me of the [Lead Masks... | 2012-08-04 21:21:56 | 4 | 8 | 2012 |
153335 | As a skeptic, I demand this man provide eviden... | 2012-06-20 04:44:02 | 20 | 6 | 2012 |
149621 | It's a fucking bug. | 2012-11-15 02:29:24 | 15 | 11 | 2012 |
Train a logistic regression classifier on word n-gram counts
# Bag-of-n-grams features: unigrams to trigrams, English stop words removed,
# accents folded to ASCII.
vectorize = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    strip_accents='ascii',
)
X = vectorized = vectorize.fit_transform(train_set["text"])
y = expected_train["class"]

# NOTE: despite its name, ``bayes`` holds a logistic regression model; the
# name is kept because other cells refer to it.
bayes = LogisticRegression(max_iter=1000)
bayes.fit(X, y)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2', random_state=None, solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)
Predict and evaluate
def predict_data(data):
    """Return clipped class probabilities for the rows of ``data``.

    The ``text`` column is transformed with the already-fitted module-level
    ``vectorize`` and scored with the module-level ``bayes`` model. Column 1
    of ``predict_proba`` is taken (the second class in the model's class
    ordering) and the values are limited to the range ``[0.05, 0.95]``.

    Args:
        data: Frame with ``text`` and ``date`` columns. It gains ``day``,
            ``month`` and ``year`` columns as a side effect.

    Returns:
        A one-dimensional NumPy array with one probability per input row.
    """
    # Called only for its in-place effect on ``data``; the derived columns
    # are not used as model features here.
    prepare_data(data)
    features = vectorize.transform(data["text"])
    scores = bayes.predict_proba(features)[:, 1]
    # NOTE(review): the clipping presumably guards against extreme penalties
    # from a likelihood-based metric when a confident prediction is wrong.
    return np.clip(scores, 0.05, 0.95, out=scores)
# Score the development split; the bare name on the last line makes the
# notebook display the resulting probability array.
dev_predicted = predict_data(dev_set)
dev_predicted
array([0.05 , 0.75847969, 0.86484399, ..., 0.0650311 , 0.95 , 0.37791457])
# Score the unlabelled test split with the same fitted vectorizer and model.
test_predicted = predict_data(test_set)
Clean output for saving
# The predictions are numeric NumPy arrays, so there is no text to strip:
# calling ``.strip()`` on a float raises AttributeError. Normalise them to
# flat float arrays instead, so they are written one value per line.
test_predicted = np.asarray(test_predicted, dtype=float).ravel()
dev_predicted = np.asarray(dev_predicted, dtype=float).ravel()
Save to file
# Write one probability per line in fixed-point notation ('%f') next to the
# inputs of each split.
np.savetxt('test-A/out.tsv', test_predicted, fmt='%f')
np.savetxt('dev-0/out.tsv', dev_predicted, fmt='%f')
Evaluate the dev-0 predictions with geval
!wget https://gonito.net/get/bin/geval
!chmod u+x geval
!./geval -t "dev-0"
Likelihood 0.6707 Accuracy 0.8151 F1.0 0.7197 Precision 0.7762 Recall 0.6710