Naive Bayes first try

This commit is contained in:
Th3NiKo 2020-03-13 01:24:43 +01:00
parent fa78136b5f
commit 35d0bbd849
16 changed files with 3111 additions and 296974 deletions


@@ -0,0 +1,385 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"train = pd.read_csv(\"train/in.tsv.xz\",header=None, compression='xz',sep=\"\\t\", names=[\"text\",\"time\"])\n",
"expected = pd.read_csv(\"train/expected.tsv\", header=None)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"train[\"expected\"] = expected"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 185478.000000\n",
"mean 303.405056\n",
"std 494.328936\n",
"min 3.000000\n",
"25% 68.000000\n",
"50% 151.000000\n",
"75% 341.000000\n",
"max 10251.000000\n",
"Name: text, dtype: float64"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train[train[\"expected\"]==' S'][\"text\"].str.len().describe()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 104063.000000\n",
"mean 298.150995\n",
"std 504.984133\n",
"min 3.000000\n",
"25% 65.000000\n",
"50% 146.000000\n",
"75% 330.000000\n",
"max 10161.000000\n",
"Name: text, dtype: float64"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train[train[\"expected\"]==' P'][\"text\"].str.len().describe()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/th3niko/nltk_data...\n",
"[nltk_data] Unzipping tokenizers/punkt.zip.\n"
]
}
],
"source": [
"import string\n",
"from nltk import word_tokenize\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"stopwords = set(stopwords.words('english'))\n",
"nltk.download(\"punkt\")\n",
"\n",
"def clean_text(text):\n",
" text = word_tokenize(text)\n",
" text = [word.lower() for word in text if word.isalpha()]\n",
" punct = str.maketrans('','',string.punctuation)\n",
" text = [word.translate(punct) for word in text]\n",
" text = [word for word in text if not word in stopwords]\n",
" return text\n",
"\n",
"train['text'] = train['text'].apply(clean_text)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 [medical, issues, recently]\n",
"1 [supposedly, aluminum, barium, strontium, used...\n",
"2 [nobel, prizes, make, rich]\n",
"3 [came, article, stayed, doctor]\n",
"4 [resorted, insults, got, owned, directly, afte...\n",
" ... \n",
"289536 [really, baby, shampoo, actually, highly, alka...\n",
"289537 [gives, example, brendan, reilly, doctor, came...\n",
"289538 [ca, fix, stupidity]\n",
"289539 [excellent, points, also, looking, bit, progra...\n",
"289540 [earlier, year, may, couple, days, ago, nov]\n",
"Name: text, Length: 289541, dtype: object"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train['text']"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"def counter(text):\n",
" cnt = Counter()\n",
" for msgs in text:\n",
" for msg in msgs:\n",
" cnt[msg] += 1\n",
" return cnt\n",
"\n",
"text_cnt_s = counter(train[train['expected']==' S']['text'])\n",
"text_cnt_p = counter(train[train['expected']==' P']['text'])"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"text_s = text_cnt_s.most_common(100)\n",
"text_p = text_cnt_p.most_common(100)\n",
"text_s = pd.DataFrame(text_s,columns = ['words','counts'])\n",
"text_p = pd.DataFrame(text_p,columns = ['words','counts'])"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/th3niko/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
"of pandas will change to not sort by default.\n",
"\n",
"To accept the future behavior, pass 'sort=False'.\n",
"\n",
"To retain the current behavior and silence the warning, pass 'sort=True'.\n",
"\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>counts1</th>\n",
" <th>counts2</th>\n",
" <th>dataset</th>\n",
" <th>words1</th>\n",
" <th>words2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>39094.0</td>\n",
" <td>NaN</td>\n",
" <td>s</td>\n",
" <td>would</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>36978.0</td>\n",
" <td>NaN</td>\n",
" <td>s</td>\n",
" <td>like</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>36461.0</td>\n",
" <td>NaN</td>\n",
" <td>s</td>\n",
" <td>people</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>29143.0</td>\n",
" <td>NaN</td>\n",
" <td>s</td>\n",
" <td>one</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>26827.0</td>\n",
" <td>NaN</td>\n",
" <td>s</td>\n",
" <td>think</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>95</td>\n",
" <td>NaN</td>\n",
" <td>3007.0</td>\n",
" <td>p</td>\n",
" <td>NaN</td>\n",
" <td>kind</td>\n",
" </tr>\n",
" <tr>\n",
" <td>96</td>\n",
" <td>NaN</td>\n",
" <td>2990.0</td>\n",
" <td>p</td>\n",
" <td>NaN</td>\n",
" <td>show</td>\n",
" </tr>\n",
" <tr>\n",
" <td>97</td>\n",
" <td>NaN</td>\n",
" <td>2970.0</td>\n",
" <td>p</td>\n",
" <td>NaN</td>\n",
" <td>far</td>\n",
" </tr>\n",
" <tr>\n",
" <td>98</td>\n",
" <td>NaN</td>\n",
" <td>2964.0</td>\n",
" <td>p</td>\n",
" <td>NaN</td>\n",
" <td>feel</td>\n",
" </tr>\n",
" <tr>\n",
" <td>99</td>\n",
" <td>NaN</td>\n",
" <td>2915.0</td>\n",
" <td>p</td>\n",
" <td>NaN</td>\n",
" <td>try</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>200 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" counts1 counts2 dataset words1 words2\n",
"0 39094.0 NaN s would NaN\n",
"1 36978.0 NaN s like NaN\n",
"2 36461.0 NaN s people NaN\n",
"3 29143.0 NaN s one NaN\n",
"4 26827.0 NaN s think NaN\n",
".. ... ... ... ... ...\n",
"95 NaN 3007.0 p NaN kind\n",
"96 NaN 2990.0 p NaN show\n",
"97 NaN 2970.0 p NaN far\n",
"98 NaN 2964.0 p NaN feel\n",
"99 NaN 2915.0 p NaN try\n",
"\n",
"[200 rows x 5 columns]"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"concatenated = pd.concat([text_s.assign(dataset='s'), text_p.assign(dataset='p')])\n",
"concatenated\n",
"sns.set(style=\"whitegrid\")\n",
"g = sns.catplot(x=\"words\", y=\"counts\", data=concatenated,\n",
" height=6, kind=\"bar\", palette=\"muted\",style=\"dataset\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
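The two describe() outputs above give the class sizes (185478 'S' rows vs 104063 'P' rows, 289541 in total), which fix the class prior that train.py later estimates. A minimal sketch of that arithmetic (variable names here are illustrative, not from the repository):

# Class sizes taken from the describe() outputs above.
skeptic_docs = 185478
paranormal_docs = 104063

# Prior probability of the skeptic class, estimated from
# document frequencies the same way train.py does it.
p_skeptic = skeptic_docs / (skeptic_docs + paranormal_docs)
print(round(p_skeptic, 3))  # ~0.641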

Binary file not shown (image, 109 KiB).

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

onlyP.txt (104076 lines)

File diff suppressed because one or more lines are too long

onlyS.txt (185503 lines)

File diff suppressed because one or more lines are too long

predict.py Normal file (32 lines)

@@ -0,0 +1,32 @@
#!/usr/bin/python3
import sys
import pickle
from math import log
from tokenizer import tokenize

model = pickle.load(open("model.pkl", "rb"))
pskeptic, vocabulary_size, skeptic_words_total, paranormal_words_total, skeptic_count, paranormal_count = model

for line in sys.stdin:
    document = line.rstrip()
    fields = document.split('\t')
    document = fields[0]
    terms = tokenize(document)
    # Start from the log class priors.
    log_prob_skeptic = log(pskeptic)
    log_prob_paranormal = log(1 - pskeptic)
    for term in terms:
        # Add-one (Laplace) smoothing keeps unseen terms from zeroing out the score.
        log_prob_skeptic += log((skeptic_count.get(term, 0) + 1) / (skeptic_words_total + vocabulary_size))
        log_prob_paranormal += log((paranormal_count.get(term, 0) + 1) / (paranormal_words_total + vocabulary_size))
    if log_prob_skeptic > log_prob_paranormal:
        print('S')
    else:
        print('P')
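The classifier above relies on add-one smoothing so that a term unseen in one class contributes a small but finite log-probability instead of log(0). A self-contained illustration of that per-term update (the counts below are made up for the example):

from math import log

skeptic_count = {"ghost": 4}   # hypothetical per-class term counts
skeptic_words_total = 1000     # hypothetical total tokens in the class
vocabulary_size = 500          # hypothetical |V|

term = "ufo"  # unseen in the skeptic class
contribution = log((skeptic_count.get(term, 0) + 1) / (skeptic_words_total + vocabulary_size))
print(contribution)  # log(1/1500) = about -7.31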

@@ -1,14 +0,0 @@
#!/usr/bin/env python3
import pandas as pd
import re
import sys
# sort | uniq -c
#train = pd.read_csv("./train/in.tsv.xz", delimiter='\t')
#import sys
#for line in sys.stdin
#if re.search(r'UFO', line) print("P")
for line in sys.stdin:
if re.search(r'(video|paranormal|happened|alien|camera|ghost|sleep|dream|moving|sky|contact|sightings|footage|photo|phenomena|phenomenon|spirit|shadow|board|window|creepy|wake|eye|film|circles|lol|extraterrestrial|floating|disclosure|civilization|record|glitch|driving|ufo|flash|sharing)', line.lower()):
print("P")
else:
print("S")

File diff suppressed because it is too large

tokenizer.py Normal file (25 lines)

@@ -0,0 +1,25 @@
#!/usr/bin/python3
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
import re
import string

wordlist = set(nltk.corpus.words.words())  # currently unused
porter = PorterStemmer()                   # currently unused
stop_words = set(stopwords.words('english'))
printable = set(string.printable)

def tokenize(d):
    # Collapse URLs into a single placeholder token.
    d = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 'thereisasimplelinkinside', d, flags=re.MULTILINE)
    # Strip literal "\n" sequences and noisy punctuation.
    d = re.sub(r'\\n', ' ', d)
    d = re.sub(r'\*|\'|\"|\/|~|_|=|-', ' ', d)
    # Drop non-printable characters.
    d = ''.join(filter(lambda x: x in printable, d))
    tokenized = word_tokenize(d)
    lower = [w.lower() for w in tokenized]
    words = [w for w in lower if w not in stop_words]
    return words
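For reference, a small usage sketch of the tokenizer (the input string is made up, and the exact output depends on the installed NLTK data):

from tokenizer import tokenize

print(tokenize('I saw a strange light at https://example.com last night!\\n'))
# e.g. ['saw', 'strange', 'light', 'thereisasimplelinkinside', 'last', 'night', '!']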

train.py Normal file (58 lines)

@@ -0,0 +1,58 @@
#!/usr/bin/python3
import sys
import pickle
from tokenizer import tokenize

def train():
    documents_total = 0
    skeptic_documents_total = 0
    vocabulary = set()
    skeptic_words_total = 0
    paranormal_words_total = 0
    skeptic_count = {}
    paranormal_count = {}
    for line in sys.stdin:
        # Each input line is "label<TAB>document".
        line = line.rstrip()
        fields = line.split('\t')
        label = fields[0].strip()
        document = fields[1]
        terms = tokenize(document)
        for t in terms:
            vocabulary.add(t)
        documents_total += 1
        if label == 'S':
            skeptic_documents_total += 1
            skeptic_words_total += len(terms)
            for term in terms:
                skeptic_count[term] = skeptic_count.get(term, 0) + 1
        else:
            paranormal_words_total += len(terms)
            for term in terms:
                paranormal_count[term] = paranormal_count.get(term, 0) + 1
    # Class prior P(S), estimated from document frequencies.
    pskeptic = skeptic_documents_total / documents_total
    vocabulary_size = len(vocabulary)
    model = (pskeptic, vocabulary_size, skeptic_words_total, paranormal_words_total, skeptic_count, paranormal_count)
    pickle.dump(model, open("model.pkl", "wb"))

train()
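Once train.py has consumed the labelled TSV on stdin, the pickled tuple can be inspected directly; a minimal sketch (the 'ghost' probe term is arbitrary):

import pickle

with open('model.pkl', 'rb') as f:
    (pskeptic, vocabulary_size, skeptic_words_total,
     paranormal_words_total, skeptic_count, paranormal_count) = pickle.load(f)

print(f'P(S) = {pskeptic:.3f}, |V| = {vocabulary_size}')
print(f"'ghost' counts: S={skeptic_count.get('ghost', 0)}, P={paranormal_count.get('ghost', 0)}")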

@@ -1,12 +0,0 @@
#!/bin/bash
input="../mostUsedP.txt"
while IFS= read -r line
do
p=`xzcat in.tsv.xz | paste expected.tsv - |grep "P.* $line" | wc -l`
s=`xzcat in.tsv.xz | paste expected.tsv - |grep "S.* $line" | wc -l`
diff=$((p-s))
if [ $p -ge $s ]
then
echo "$line, $diff"
fi
done < "$input"

@@ -1,202 +0,0 @@
video, 1790
UFO, 3604
saw, 958
light, 1910
paranormal, 1871
looks, 459
happened, 569
story, 324
night, 1327
alien, 1511
house, 1054
camera, 1611
aliens, 794
experience, 342
lights, 1214
looked, 193
object, 508
came, 1026
UFOs, 1097
room, 273
seeing, 99
ghost, 1301
videos, 645
nI, 0
sleep, 503
weird, 608
flying, 584
picture, 718
dream, 1191
stories, 385
moving, 494
space, 268
felt, 10
strange, 436
objects, 531
experiences, 519
technology, 189
watching, 8
sky, 769
fake, 698
military, 235
dont, 223
door, 401
contact, 333
planet, 45
sightings, 620
phone, 114
craft, 681
footage, 612
advanced, 176
cool, 83
dreams, 532
ghosts, 319
pictures, 455
experienced, 300
eyes, 97
photo, 1113
moved, 254
phenomena, 273
phenomenon, 220
air, 298
image, 174
happening, 116
spirit, 470
travel, 305
video, 1790
dark, 384
bed, 328
reports, 95
walking, 138
beings, 233
ET, 562
shadow, 449
nThe, 0
Looks, 36
board, 151
scared, 322
night, 1327
bright, 348
house, 1054
spirits, 369
photos, 511
Very, 42
sitting, 42
lived, 51
story, 324
thats, 127
video, 1790
speed, 101
window, 366
plane, 258
creepy, 444
shape, 397
cameras, 302
wake, 180
sighting, 1073
passed, 24
eye, 58
woke, 267
activity, 64
dad, 89
film, 479
Sounds, 5
feet, 43
fake, 698
standing, 33
happened, 569
UFO, 3604
fly, 648
ufo, 721
voice, 95
night, 1327
circles, 122
lol, 310
seconds, 135
extraterrestrial, 267
experience, 342
paralysis, 332
aircraft, 247
room, 273
brother, 29
haunted, 335
youtube, 30
story, 324
Ghost, 238
spot, 79
paranormal, 1871
house, 1054
scary, 136
distance, 176
nIf, 0
witness, 495
freaked, 236
witnesses, 224
music, 34
weather, 9
images, 125
cant, 78
NASA, 60
walked, 52
sky, 769
floating, 168
noise, 251
disclosure, 254
miles, 78
civilization, 125
Ouija, 175
record, 133
visit, 217
audio, 113
appeared, 103
incident, 91
slowly, 24
stars, 84
glitch, 602
corner, 141
orbs, 254
lens, 282
visiting, 83
town, 36
camera, 1611
location, 205
hoax, 380
visited, 97
aliens, 794
light, 1910
ship, 144
recording, 248
abduction, 239
experience, 342
UFOs, 1097
floor, 32
driving, 19
didnt, 119
UFO, 3604
project, 19
communicate, 29
radar, 77
visible, 54
ball, 480
planes, 75
street, 30
flash, 377
room, 273
sharing, 271
balloon, 539
presence, 26
entity, 140
filmed, 193
sleeping, 70
witnessed, 138
Aliens, 95
reflection, 260
lucid, 135
digital, 138
light, 1910
entities, 172
recorded, 74
fake, 698
memories, 51
aliens, 794
flight, 51