Compare commits
1 Commit
Author | SHA1 | Date
---|---|---
Th3NiKo | 35d0bbd849 |
@@ -0,0 +1,385 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "train = pd.read_csv(\"train/in.tsv.xz\", header=None, compression='xz', sep=\"\\t\", names=[\"text\", \"time\"])\n",
    "expected = pd.read_csv(\"train/expected.tsv\", header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "train[\"expected\"] = expected"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    185478.000000\n",
       "mean        303.405056\n",
       "std         494.328936\n",
       "min           3.000000\n",
       "25%          68.000000\n",
       "50%         151.000000\n",
       "75%         341.000000\n",
       "max       10251.000000\n",
       "Name: text, dtype: float64"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train[train[\"expected\"]==' S'][\"text\"].str.len().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    104063.000000\n",
       "mean        298.150995\n",
       "std         504.984133\n",
       "min           3.000000\n",
       "25%          65.000000\n",
       "50%         146.000000\n",
       "75%         330.000000\n",
       "max       10161.000000\n",
       "Name: text, dtype: float64"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train[train[\"expected\"]==' P'][\"text\"].str.len().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /home/th3niko/nltk_data...\n",
      "[nltk_data]   Unzipping tokenizers/punkt.zip.\n"
     ]
    }
   ],
   "source": [
    "import string\n",
    "from nltk import word_tokenize\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "stopwords = set(stopwords.words('english'))\n",
    "nltk.download(\"punkt\")\n",
    "\n",
    "def clean_text(text):\n",
    "    text = word_tokenize(text)\n",
    "    text = [word.lower() for word in text if word.isalpha()]\n",
    "    punct = str.maketrans('', '', string.punctuation)\n",
    "    text = [word.translate(punct) for word in text]\n",
    "    text = [word for word in text if not word in stopwords]\n",
    "    return text\n",
    "\n",
    "train['text'] = train['text'].apply(clean_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0                               [medical, issues, recently]\n",
       "1         [supposedly, aluminum, barium, strontium, used...\n",
       "2                               [nobel, prizes, make, rich]\n",
       "3                           [came, article, stayed, doctor]\n",
       "4         [resorted, insults, got, owned, directly, afte...\n",
       "                                ...                        \n",
       "289536    [really, baby, shampoo, actually, highly, alka...\n",
       "289537    [gives, example, brendan, reilly, doctor, came...\n",
       "289538                                 [ca, fix, stupidity]\n",
       "289539    [excellent, points, also, looking, bit, progra...\n",
       "289540         [earlier, year, may, couple, days, ago, nov]\n",
       "Name: text, Length: 289541, dtype: object"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train['text']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "def counter(text):\n",
    "    cnt = Counter()\n",
    "    for msgs in text:\n",
    "        for msg in msgs:\n",
    "            cnt[msg] += 1\n",
    "    return cnt\n",
    "\n",
    "text_cnt_s = counter(train[train['expected']==' S']['text'])\n",
    "text_cnt_p = counter(train[train['expected']==' P']['text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_s = text_cnt_s.most_common(100)\n",
    "text_p = text_cnt_p.most_common(100)\n",
    "text_s = pd.DataFrame(text_s, columns=['words', 'counts'])\n",
    "text_p = pd.DataFrame(text_p, columns=['words', 'counts'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/th3niko/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
      "of pandas will change to not sort by default.\n",
      "\n",
      "To accept the future behavior, pass 'sort=False'.\n",
      "\n",
      "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
      "\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>counts1</th>\n",
       "      <th>counts2</th>\n",
       "      <th>dataset</th>\n",
       "      <th>words1</th>\n",
       "      <th>words2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>39094.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>s</td>\n",
       "      <td>would</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>36978.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>s</td>\n",
       "      <td>like</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>36461.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>s</td>\n",
       "      <td>people</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>29143.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>s</td>\n",
       "      <td>one</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>26827.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>s</td>\n",
       "      <td>think</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>95</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3007.0</td>\n",
       "      <td>p</td>\n",
       "      <td>NaN</td>\n",
       "      <td>kind</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>96</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2990.0</td>\n",
       "      <td>p</td>\n",
       "      <td>NaN</td>\n",
       "      <td>show</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>97</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2970.0</td>\n",
       "      <td>p</td>\n",
       "      <td>NaN</td>\n",
       "      <td>far</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>98</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2964.0</td>\n",
       "      <td>p</td>\n",
       "      <td>NaN</td>\n",
       "      <td>feel</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>99</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2915.0</td>\n",
       "      <td>p</td>\n",
       "      <td>NaN</td>\n",
       "      <td>try</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>200 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    counts1  counts2 dataset  words1 words2\n",
       "0   39094.0      NaN       s   would    NaN\n",
       "1   36978.0      NaN       s    like    NaN\n",
       "2   36461.0      NaN       s  people    NaN\n",
       "3   29143.0      NaN       s     one    NaN\n",
       "4   26827.0      NaN       s   think    NaN\n",
       "..      ...      ...     ...     ...    ...\n",
       "95      NaN   3007.0       p     NaN   kind\n",
       "96      NaN   2990.0       p     NaN   show\n",
       "97      NaN   2970.0       p     NaN    far\n",
       "98      NaN   2964.0       p     NaN   feel\n",
       "99      NaN   2915.0       p     NaN    try\n",
       "\n",
       "[200 rows x 5 columns]"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "concatenated = pd.concat([text_s.assign(dataset='s'), text_p.assign(dataset='p')])\n",
    "concatenated\n",
    "sns.set(style=\"whitegrid\")\n",
"g = sns.catplot(x=\"words\", y=\"counts\", data=concatenated,\n",
|
||||||
|
" height=6, kind=\"bar\", palette=\"muted\",style=\"dataset\")"
|
||||||
|
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
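The top-100 counters built at the end of this notebook line up with the mostUsed*.txt word lists touched by this commit. Below is a minimal sketch of how such lists could be dumped from the notebook's counters; the file names and the "word, count" line format are assumptions inferred from the lists elsewhere in this diff, not something the notebook itself records:

# Hypothetical export of the notebook's Counter objects to word lists.
# Assumes text_cnt_s / text_cnt_p from the cells above are in scope.
def dump_most_common(cnt, path, n=1500):
    with open(path, "w") as f:
        for word, count in cnt.most_common(n):
            f.write(f"{word}, {count}\n")

dump_most_common(text_cnt_s, "mostUsedS.txt")
dump_most_common(text_cnt_p, "mostUsedP.txt")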
Binary file not shown (added image, 109 KiB).
2656 dev-0/out.tsv (file diff suppressed because it is too large)
1500 mostUsed.txt (file diff suppressed because it is too large)
1525 mostUsedP.txt (file diff suppressed because it is too large)
1531 mostUsedS.txt (file diff suppressed because it is too large)
@@ -0,0 +1,32 @@
#!/usr/bin/python3

import sys
import pickle
from math import log
from tokenizer import tokenize

model = pickle.load(open("model.pkl", "rb"))
pskeptic, vocabulary_size, skeptic_words_total, paranormal_words_total, skeptic_count, paranormal_count = model


for line in sys.stdin:
    document = line.rstrip()
    fields = document.split('\t')
    document = fields[0]
    terms = tokenize(document)

    log_prob_skeptic = log(pskeptic)
    log_prob_paranormal = log(1 - pskeptic)

    for term in terms:
        if term not in skeptic_count:
            skeptic_count[term] = 0
        if term not in paranormal_count:
            paranormal_count[term] = 0
        log_prob_skeptic += log((skeptic_count[term] + 1) / (skeptic_words_total + vocabulary_size))
        log_prob_paranormal += log((paranormal_count[term] + 1) / (paranormal_words_total + vocabulary_size))

    if log_prob_skeptic > log_prob_paranormal:
        print('S')
    else:
        print('P')
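For reference, the loop above implements add-one (Laplace) smoothed multinomial Naive Bayes. Writing p_S for pskeptic, c_S(t) and c_P(t) for the per-class term counts, N_S and N_P for skeptic_words_total and paranormal_words_total, and |V| for vocabulary_size, the two scores being compared are:

\log P(S \mid d) \propto \log p_S + \sum_{t \in d} \log \frac{c_S(t) + 1}{N_S + |V|}

\log P(P \mid d) \propto \log (1 - p_S) + \sum_{t \in d} \log \frac{c_P(t) + 1}{N_P + |V|}

and the script prints S exactly when the first score is larger.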
14 solve.py
@@ -1,14 +0,0 @@
#!/usr/bin/env python3
import pandas as pd
import re
import sys
# sort | uniq -c
#train = pd.read_csv("./train/in.tsv.xz", delimiter='\t')
#import sys
#for line in sys.stdin
#if re.search(r'UFO', line) print("P")
for line in sys.stdin:
    if re.search(r'(video|paranormal|happened|alien|camera|ghost|sleep|dream|moving|sky|contact|sightings|footage|photo|phenomena|phenomenon|spirit|shadow|board|window|creepy|wake|eye|film|circles|lol|extraterrestrial|floating|disclosure|civilization|record|glitch|driving|ufo|flash|sharing)', line.lower()):
        print("P")
    else:
        print("S")
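The deleted solve.py baseline simply fires on paranormal-flavoured keywords. A tiny, hypothetical demo of the behaviour it had (the sample strings and the abbreviated pattern are invented for illustration):

# Hypothetical demo of the deleted keyword baseline; the pattern is
# abbreviated from the full alternation in solve.py above.
import re

pattern = r'(video|paranormal|alien|ghost|ufo)'
for doc in ["I filmed a strange video last night", "A meta-analysis of homeopathy trials"]:
    print("P" if re.search(pattern, doc.lower()) else "S")
# -> P, then S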
2566 test-A/out.tsv (file diff suppressed because it is too large)
@@ -0,0 +1,25 @@
#!/usr/bin/python3

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
import re
import string


wordlist = set(nltk.corpus.words.words())
porter = PorterStemmer()
stop_words = set(stopwords.words('english'))
printable = set(string.printable)

def tokenize(d):
    d = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 'thereisasimplelinkinside', d, flags=re.MULTILINE)
    d = re.sub(r'\\n', ' ', d)
    d = re.sub(r'\*|\'|\"|\/|~|_|=|-', ' ', d)
    d = ''.join(filter(lambda x: x in printable, d))
    tokenized = word_tokenize(d)
    #tokenized = re.split(r'\/|\\| ', d)
    lower = [w.lower() for w in tokenized]
    words = [w for w in lower if not w in stop_words]
    return words
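A quick smoke test for tokenize() from the file above; the sample sentence is invented and the exact token list is an assumption (it depends on the installed NLTK data), but it illustrates the URL substitution and stop-word filtering:

# Hypothetical usage; run from the same directory as the tokenizer module.
from tokenizer import tokenize

print(tokenize("I saw a UFO at https://example.com last night!"))
# Plausible output (an assumption, not a recorded result):
# ['saw', 'ufo', 'thereisasimplelinkinside', 'last', 'night', '!']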
@@ -0,0 +1,58 @@
#!/usr/bin/python3

import sys
import pickle
from tokenizer import tokenize


def train():
    documents_total = 0
    skeptic_documents_total = 0

    vocabulary = set()

    skeptic_words_total = 0
    paranormal_words_total = 0

    skeptic_count = {}
    paranormal_count = {}

    for line in sys.stdin:
        line = line.rstrip()
        fields = line.split('\t')
        label = fields[0].strip()
        document = fields[1]
        terms = tokenize(document)

        for t in terms:
            vocabulary.add(t)

        documents_total += 1
        if label == 'S':
            skeptic_documents_total += 1
            skeptic_words_total += len(terms)
            for term in terms:
                if term in skeptic_count:
                    skeptic_count[term] += 1
                else:
                    skeptic_count[term] = 1
        else:
            paranormal_words_total += len(terms)
            for term in terms:
                if term in paranormal_count:
                    paranormal_count[term] += 1
                else:
                    paranormal_count[term] = 1


    pskeptic = skeptic_documents_total / documents_total
    vocabulary_size = len(vocabulary)

    model = (pskeptic, vocabulary_size, skeptic_words_total, paranormal_words_total, skeptic_count, paranormal_count)
    pickle.dump(model, open("model.pkl", "wb"))

    print(paranormal_count)
    print(skeptic_words_total)


train()
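A minimal, hypothetical sanity check for the model.pkl this trainer writes; the unpacking order mirrors the tuple dumped by train(), and the variable names are local to this sketch:

# Hypothetical check of model.pkl produced by the trainer above.
import pickle

with open("model.pkl", "rb") as f:
    pskeptic, vocab_size, s_total, p_total, s_count, p_count = pickle.load(f)

assert 0.0 < pskeptic < 1.0                           # class prior is a probability
assert vocab_size >= max(len(s_count), len(p_count))  # vocabulary covers both classes
print(pskeptic, vocab_size, s_total, p_total)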
@@ -1,12 +0,0 @@
#!/bin/bash
input="../mostUsedP.txt"
while IFS= read -r line
do
    p=`xzcat in.tsv.xz | paste expected.tsv - | grep "P.* $line" | wc -l`
    s=`xzcat in.tsv.xz | paste expected.tsv - | grep "S.* $line" | wc -l`
    diff=$((p-s))
    if [ $p -ge $s ]
    then
        echo "$line, $diff"
    fi
done < "$input"
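A rough Python equivalent of the deleted shell loop above, under the assumption that ../mostUsedP.txt holds one word per line and that it runs inside a split directory next to in.tsv.xz and expected.tsv:

# Hypothetical re-implementation of the shell tally: for each word it
# counts labelled "label<TAB>text" lines matching "P.* word" vs "S.* word",
# mirroring the two grep pipelines.
import lzma
import re

with open("../mostUsedP.txt") as f:
    words = [w.strip() for w in f if w.strip()]

with lzma.open("in.tsv.xz", "rt") as texts, open("expected.tsv") as labels:
    rows = [lab.rstrip("\n") + "\t" + txt for lab, txt in zip(labels, texts)]

for word in words:
    p = sum(bool(re.search(rf"P.* {re.escape(word)}", row)) for row in rows)
    s = sum(bool(re.search(rf"S.* {re.escape(word)}", row)) for row in rows)
    if p >= s:
        print(f"{word}, {p - s}")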
@@ -1,202 +0,0 @@
video, 1790
UFO, 3604
saw, 958
light, 1910
paranormal, 1871
looks, 459
happened, 569
story, 324
night, 1327
alien, 1511
house, 1054
camera, 1611
aliens, 794
experience, 342
lights, 1214
looked, 193
object, 508
came, 1026
UFOs, 1097
room, 273
seeing, 99
ghost, 1301
videos, 645
nI, 0
sleep, 503
weird, 608
flying, 584
picture, 718
dream, 1191
stories, 385
moving, 494
space, 268
felt, 10
strange, 436
objects, 531
experiences, 519
technology, 189
watching, 8
sky, 769
fake, 698
military, 235
dont, 223
door, 401
contact, 333
planet, 45
sightings, 620
phone, 114
craft, 681
footage, 612
advanced, 176
cool, 83
dreams, 532
ghosts, 319
pictures, 455
experienced, 300
eyes, 97
photo, 1113
moved, 254
phenomena, 273
phenomenon, 220
air, 298
image, 174
happening, 116
spirit, 470
travel, 305
video, 1790
dark, 384
bed, 328
reports, 95
walking, 138
beings, 233
ET, 562
shadow, 449
nThe, 0
Looks, 36
board, 151
scared, 322
night, 1327
bright, 348
house, 1054
spirits, 369
photos, 511
Very, 42
sitting, 42
lived, 51
story, 324
thats, 127
video, 1790
speed, 101
window, 366
plane, 258
creepy, 444
shape, 397
cameras, 302
wake, 180
sighting, 1073
passed, 24
eye, 58
woke, 267
activity, 64
dad, 89
film, 479
Sounds, 5
feet, 43
fake, 698
standing, 33
happened, 569
UFO, 3604
fly, 648
ufo, 721
voice, 95
night, 1327
circles, 122
lol, 310
seconds, 135
extraterrestrial, 267
experience, 342
paralysis, 332
aircraft, 247
room, 273
brother, 29
haunted, 335
youtube, 30
story, 324
Ghost, 238
spot, 79
paranormal, 1871
house, 1054
scary, 136
distance, 176
nIf, 0
witness, 495
freaked, 236
witnesses, 224
music, 34
weather, 9
images, 125
cant, 78
NASA, 60
walked, 52
sky, 769
floating, 168
noise, 251
disclosure, 254
miles, 78
civilization, 125
Ouija, 175
record, 133
visit, 217
audio, 113
appeared, 103
incident, 91
slowly, 24
stars, 84
glitch, 602
corner, 141
orbs, 254
lens, 282
visiting, 83
town, 36
camera, 1611
location, 205
hoax, 380
visited, 97
aliens, 794
light, 1910
ship, 144
recording, 248
abduction, 239
experience, 342
UFOs, 1097
floor, 32
driving, 19
didnt, 119
UFO, 3604
project, 19
communicate, 29
radar, 77
visible, 54
ball, 480
planes, 75
street, 30
flash, 377
room, 273
sharing, 271
balloon, 539
presence, 26
entity, 140
filmed, 193
sleeping, 70
witnessed, 138
Aliens, 95
reflection, 260
lucid, 135
digital, 138
light, 1910
entities, 172
recorded, 74
fake, 698
memories, 51
aliens, 794
flight, 51