# -*- coding: utf-8 -*-
"""Paranormal or skeptic.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1JI_RWapDbABFZPc4NDhU-zQlZiIiXk58

The script loads the train / dev-0 / test-A splits, fits a bag-of-n-grams
logistic-regression classifier on the training texts, writes clipped class
probabilities to ``dev-0/out.tsv`` and ``test-A/out.tsv`` and finally runs
``geval`` on the dev-0 split.
"""

import csv
import datetime
import lzma
import subprocess
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


def run_command(args):
    """Run an external command the way a notebook ``!cmd`` cell would.

    A missing executable or a non-zero exit status is reported on stdout
    instead of raising, so one failed shell step does not abort the script.

    Args:
        args: The command and its arguments as a list of strings.

    Returns:
        The command's exit status, or ``None`` if it could not be started.
    """
    try:
        completed = subprocess.run(args, check=False)
    except OSError as err:
        print(f"warning: could not run {args[0]!r}: {err}")
        return None
    if completed.returncode != 0:
        print(f"warning: {' '.join(args)} exited with status {completed.returncode}")
    return completed.returncode


# NOTE(review): by default the clone lands in ./paranormal-or-skeptic, while
# every path below is relative to the current directory. Presumably the
# notebook was run from inside the checkout -- confirm.
run_command(["git", "clone", "git://gonito.net/paranormal-or-skeptic"])

# ---------------------------------------------------------------------------
# Loading Data
# ---------------------------------------------------------------------------

# Print the number of training examples (replaces `xzcat ... | wc -l`).
with lzma.open("train/in.tsv.xz", "rb") as train_file:
    print(sum(1 for _ in train_file))


def load_set(path, isTest):
    """Load one data split from the directory *path*.

    Args:
        path: Directory containing ``in.tsv.xz`` (and ``expected.tsv`` for
            labelled splits).
        isTest: When true, no ``expected.tsv`` is read and only the inputs
            are returned.

    Returns:
        ``dataset`` when *isTest* is true, otherwise ``(dataset, expected)``.
        ``dataset`` has the columns ``text`` and ``date``; ``expected`` has a
        single categorical column ``class``.
    """
    root = Path(path)
    dataset = pd.read_csv(
        root / "in.tsv.xz",
        delimiter="\t",
        header=None,
        names=["text", "date"],
        # The texts may contain quote characters that must not be
        # interpreted as field quoting.
        quoting=csv.QUOTE_NONE,
    )
    # The raw column holds epoch seconds. Convert in one vectorised call;
    # the result is in UTC, so it does not depend on the machine's time zone.
    dataset["date"] = pd.to_datetime(dataset["date"], unit="s")
    if isTest:
        return dataset
    expected = pd.read_csv(
        root / "expected.tsv",
        header=None,
        names=["class"],
        dtype="category",
    )
    return dataset, expected


# Load all sets
train_set, expected_train = load_set("train", False)
dev_set, expected_dev = load_set("dev-0", False)
test_set = load_set("test-A", True)

# ---------------------------------------------------------------------------
# Prepare data
# ---------------------------------------------------------------------------


def prepare_data(data):
    """Return a copy of *data* with ``day``, ``month`` and ``year`` columns.

    The new columns are derived from the ``date`` column; the input frame is
    left untouched.
    """
    dates = data["date"].dt
    return data.assign(day=dates.day, month=dates.month, year=dates.year)


train_set = prepare_data(train_set)
print(train_set.sample(5))

# ---------------------------------------------------------------------------
# Train
# ---------------------------------------------------------------------------

vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    strip_accents='ascii',
)
X = vectorizer.fit_transform(train_set["text"])
y = expected_train["class"]

classifier = LogisticRegression(max_iter=1000)
classifier.fit(X, y)

# ---------------------------------------------------------------------------
# Predict and evaluate
# ---------------------------------------------------------------------------


def predict_data(data):
    """Return clipped class probabilities for every row of *data*.

    Only the ``text`` column is used. The returned values are the
    probabilities of the second class in ``classifier.classes_``, limited to
    the range [0.05, 0.95].
    """
    features = vectorizer.transform(data["text"])
    probabilities = classifier.predict_proba(features)[:, 1]
    return np.clip(probabilities, 0.05, 0.95)


dev_predicted = predict_data(dev_set)
print(dev_predicted)

test_predicted = predict_data(test_set)

# Save to file. The predictions are already float arrays, so no string
# clean-up is needed before writing them.
np.savetxt('test-A/out.tsv', test_predicted, fmt='%f')
np.savetxt('dev-0/out.tsv', dev_predicted, fmt='%f')

# Check geval output
run_command(["wget", "https://gonito.net/get/bin/geval"])
run_command(["chmod", "u+x", "geval"])
run_command(["./geval", "-t", "dev-0"])