paranormal-or-skeptic/paranormal_or_skeptic.py

93 lines
2.4 KiB
Python

# -*- coding: utf-8 -*-
"""Paranormal or skeptic.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1JI_RWapDbABFZPc4NDhU-zQlZiIiXk58
# Loading Data
"""
# Print how many lines the compressed training input holds.
# The notebook did this with the IPython shell escape
# `!xzcat train/in.tsv.xz | wc -l`, which is a SyntaxError when this file is
# run as a plain Python module, so the count is done with the stdlib instead.
import lzma

with lzma.open("train/in.tsv.xz", "rb") as _train_in:
    # Count newline bytes chunk by chunk (the same quantity `wc -l` reports)
    # so the decompressed file never has to fit in memory at once.
    print(sum(_chunk.count(b"\n") for _chunk in iter(lambda: _train_in.read(1 << 20), b"")))
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy.sparse import hstack
import csv
import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB
def load_set(path, isTest):
    """Load one split of the dataset from the directory *path*.

    Reads ``<path>/in.tsv.xz`` as a header-less, tab-separated file whose two
    columns are named ``text`` and ``date``. Quote characters are treated as
    ordinary text (``csv.QUOTE_NONE``). The ``date`` column is interpreted as
    Unix timestamps in seconds and converted to pandas datetimes.

    Args:
        path: Directory holding ``in.tsv.xz`` and, for labelled splits,
            ``expected.tsv``.
        isTest: When true, only the input frame is loaded and returned.

    Returns:
        The input ``DataFrame`` alone when *isTest* is true; otherwise a
        ``(dataset, expected)`` tuple where *expected* is a single-column
        frame (``class``, categorical dtype) read from ``<path>/expected.tsv``.
    """
    dataset = pd.read_csv(
        f"{path}/in.tsv.xz",
        delimiter="\t",
        header=None,
        names=["text", "date"],
        quoting=csv.QUOTE_NONE,
    )
    # Vectorised conversion that reads the timestamps as UTC. Going through
    # datetime.fromtimestamp() row by row is slow and uses the machine's
    # local timezone, so derived day/month/year values could differ between
    # machines.
    dataset["date"] = pd.to_datetime(dataset["date"], unit="s")
    if isTest:
        return dataset
    expected = pd.read_csv(
        f"{path}/expected.tsv",
        header=None,
        names=["class"],
        dtype="category",
    )
    return dataset, expected
"""**Load all sets**"""
# Labelled splits come back as (inputs, expected labels).
train_set, expected_train = load_set("train", isTest=False)
dev_set, expected_dev = load_set("dev-0", isTest=False)
# The test split is loaded without labels, so only the input frame is returned.
test_set = load_set("test-A", isTest=True)
"""# Prepare data"""
def prepare_data(data):
    """Add ``day``, ``month`` and ``year`` columns taken from ``data["date"]``.

    The frame is modified in place and the same object is returned.
    """
    stamps = data["date"].dt
    for part in ("day", "month", "year"):
        data[part] = getattr(stamps, part)
    return data
# Add the date-derived columns to the training frame.
train_set = prepare_data(train_set)
# Notebook preview of five random rows; as a bare expression its result is
# discarded when this file runs outside an interactive cell.
train_set.sample(5)
"""# Train"""
# Bag-of-words features: English stop words removed, accents folded to ASCII,
# and word n-grams of length one to three.
vectorize = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    strip_accents='ascii',
)
# Fit the vocabulary on the training text; X and vectorized name the same matrix.
X = vectorized = vectorize.fit_transform(train_set["text"])
y = expected_train["class"]
# fit() returns the estimator itself, so construction and training can be chained.
bayes = MultinomialNB(alpha=0.4).fit(X, y)
"""# Predict and evaluate"""
def predict_data(data):
    """Return the trained classifier's predictions for ``data["text"]``.

    Uses the module-level ``vectorize`` (already fitted) to turn the text into
    features and the module-level ``bayes`` model to classify them.
    """
    # Called for its side effect of adding the date-derived columns to *data*;
    # the model itself only consumes the text column.
    prepare_data(data)
    features = vectorize.transform(data["text"])
    return bayes.predict(features)
dev_predicted = predict_data(dev_set)
# Share of dev rows whose predicted label equals the expected one. As a bare
# expression the value is only displayed when run in a notebook cell.
(dev_predicted == expected_dev["class"]).mean()
test_predicted = predict_data(test_set)
"""**Clean output for saving**"""
# Remove any surrounding whitespace from each predicted label so every output
# line holds the bare label only.
test_predicted = np.array([item.strip() for item in test_predicted])
dev_predicted = np.array([item.strip() for item in dev_predicted])
"""**Save to file**"""
# One label per line. '%s' writes the same text as '%c' for single-character
# labels, but also works for longer labels, where '%c' raises a TypeError.
np.savetxt('test-A/out.tsv', test_predicted, fmt='%s')
np.savetxt('dev-0/out.tsv', dev_predicted, fmt='%s')
"""**Check geval output**"""
# Download the geval evaluator, make it executable and score the dev-0 output.
# The notebook ran these steps through IPython `!` shell escapes, which are a
# SyntaxError in a plain Python module; they are run as subprocesses instead.
import subprocess

# NOTE(review): this fetches and executes a binary without verifying it;
# consider pinning a checksum before running it outside a sandbox.
for _command in (
    ["wget", "https://gonito.net/get/bin/geval"],
    ["chmod", "u+x", "geval"],
    ["./geval", "-t", "dev-0"],
):
    # check=False mirrors the notebook, where a failing shell line did not
    # stop the following ones from running.
    subprocess.run(_command, check=False)