Compare commits

1 commit

Author | SHA1 | Message | Date
Th3NiKo | 35d0bbd849 | Naive bayes first try | 2020-03-13 01:24:43 +01:00
9 changed files with 10926 additions and 0 deletions

New file (Jupyter notebook)

@@ -0,0 +1,385 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"train = pd.read_csv(\"train/in.tsv.xz\",header=None, compression='xz',sep=\"\\t\", names=[\"text\",\"time\"])\n",
"expected = pd.read_csv(\"train/expected.tsv\", header=None)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"train[\"expected\"] = expected"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 185478.000000\n",
"mean 303.405056\n",
"std 494.328936\n",
"min 3.000000\n",
"25% 68.000000\n",
"50% 151.000000\n",
"75% 341.000000\n",
"max 10251.000000\n",
"Name: text, dtype: float64"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train[train[\"expected\"]==' S'][\"text\"].str.len().describe()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 104063.000000\n",
"mean 298.150995\n",
"std 504.984133\n",
"min 3.000000\n",
"25% 65.000000\n",
"50% 146.000000\n",
"75% 330.000000\n",
"max 10161.000000\n",
"Name: text, dtype: float64"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train[train[\"expected\"]==' P'][\"text\"].str.len().describe()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/th3niko/nltk_data...\n",
"[nltk_data] Unzipping tokenizers/punkt.zip.\n"
]
}
],
"source": [
"import string\n",
"from nltk import word_tokenize\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"stopwords = set(stopwords.words('english'))\n",
"nltk.download(\"punkt\")\n",
"\n",
"def clean_text(text):\n",
" text = word_tokenize(text)\n",
" text = [word.lower() for word in text if word.isalpha()]\n",
" punct = str.maketrans('','',string.punctuation)\n",
" text = [word.translate(punct) for word in text]\n",
" text = [word for word in text if not word in stopwords]\n",
" return text\n",
"\n",
"train['text'] = train['text'].apply(clean_text)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 [medical, issues, recently]\n",
"1 [supposedly, aluminum, barium, strontium, used...\n",
"2 [nobel, prizes, make, rich]\n",
"3 [came, article, stayed, doctor]\n",
"4 [resorted, insults, got, owned, directly, afte...\n",
" ... \n",
"289536 [really, baby, shampoo, actually, highly, alka...\n",
"289537 [gives, example, brendan, reilly, doctor, came...\n",
"289538 [ca, fix, stupidity]\n",
"289539 [excellent, points, also, looking, bit, progra...\n",
"289540 [earlier, year, may, couple, days, ago, nov]\n",
"Name: text, Length: 289541, dtype: object"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train['text']"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"def counter(text):\n",
" cnt = Counter()\n",
" for msgs in text:\n",
" for msg in msgs:\n",
" cnt[msg] += 1\n",
" return cnt\n",
"\n",
"text_cnt_s = counter(train[train['expected']==' S']['text'])\n",
"text_cnt_p = counter(train[train['expected']==' P']['text'])"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"text_s = text_cnt_s.most_common(100)\n",
"text_p = text_cnt_p.most_common(100)\n",
"text_s = pd.DataFrame(text_s,columns = ['words','counts'])\n",
"text_p = pd.DataFrame(text_p,columns = ['words','counts'])"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/th3niko/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
"of pandas will change to not sort by default.\n",
"\n",
"To accept the future behavior, pass 'sort=False'.\n",
"\n",
"To retain the current behavior and silence the warning, pass 'sort=True'.\n",
"\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>counts1</th>\n",
" <th>counts2</th>\n",
" <th>dataset</th>\n",
" <th>words1</th>\n",
" <th>words2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>39094.0</td>\n",
" <td>NaN</td>\n",
" <td>s</td>\n",
" <td>would</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>36978.0</td>\n",
" <td>NaN</td>\n",
" <td>s</td>\n",
" <td>like</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>36461.0</td>\n",
" <td>NaN</td>\n",
" <td>s</td>\n",
" <td>people</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>29143.0</td>\n",
" <td>NaN</td>\n",
" <td>s</td>\n",
" <td>one</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>26827.0</td>\n",
" <td>NaN</td>\n",
" <td>s</td>\n",
" <td>think</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>95</td>\n",
" <td>NaN</td>\n",
" <td>3007.0</td>\n",
" <td>p</td>\n",
" <td>NaN</td>\n",
" <td>kind</td>\n",
" </tr>\n",
" <tr>\n",
" <td>96</td>\n",
" <td>NaN</td>\n",
" <td>2990.0</td>\n",
" <td>p</td>\n",
" <td>NaN</td>\n",
" <td>show</td>\n",
" </tr>\n",
" <tr>\n",
" <td>97</td>\n",
" <td>NaN</td>\n",
" <td>2970.0</td>\n",
" <td>p</td>\n",
" <td>NaN</td>\n",
" <td>far</td>\n",
" </tr>\n",
" <tr>\n",
" <td>98</td>\n",
" <td>NaN</td>\n",
" <td>2964.0</td>\n",
" <td>p</td>\n",
" <td>NaN</td>\n",
" <td>feel</td>\n",
" </tr>\n",
" <tr>\n",
" <td>99</td>\n",
" <td>NaN</td>\n",
" <td>2915.0</td>\n",
" <td>p</td>\n",
" <td>NaN</td>\n",
" <td>try</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>200 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" counts1 counts2 dataset words1 words2\n",
"0 39094.0 NaN s would NaN\n",
"1 36978.0 NaN s like NaN\n",
"2 36461.0 NaN s people NaN\n",
"3 29143.0 NaN s one NaN\n",
"4 26827.0 NaN s think NaN\n",
".. ... ... ... ... ...\n",
"95 NaN 3007.0 p NaN kind\n",
"96 NaN 2990.0 p NaN show\n",
"97 NaN 2970.0 p NaN far\n",
"98 NaN 2964.0 p NaN feel\n",
"99 NaN 2915.0 p NaN try\n",
"\n",
"[200 rows x 5 columns]"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"concatenated = pd.concat([text_s.assign(dataset='s'), text_p.assign(dataset='p')])\n",
"concatenated\n",
"sns.set(style=\"whitegrid\")\n",
"g = sns.catplot(x=\"words\", y=\"counts\", data=concatenated,\n",
" height=6, kind=\"bar\", palette=\"muted\",style=\"dataset\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
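A side note on the final plotting cell: the displayed table (counts1/counts2 and words1/words2 columns full of NaNs) indicates the two frequency frames had mismatched column names when the concat was executed, which is also what triggered the pandas sorting warning in the cell's stderr. Below is a minimal sketch of the tidy long-form reshape that cell aims for, using small stand-in frames (the text_p counts are invented for illustration):

import pandas as pd
import seaborn as sns

# Stand-ins for the notebook's most_common(100) frames; values are illustrative
text_s = pd.DataFrame({"words": ["would", "like"], "counts": [39094, 36978]})
text_p = pd.DataFrame({"words": ["would", "one"], "counts": [12000, 9000]})

# Identical column names in both frames, so concat aligns rows instead of
# padding with NaN; sort=False silences the FutureWarning seen above
concatenated = pd.concat([text_s.assign(dataset="s"),
                          text_p.assign(dataset="p")], sort=False)

# catplot has no style= parameter; hue= is what splits the bars by class
g = sns.catplot(x="words", y="counts", hue="dataset", data=concatenated,
                height=6, kind="bar", palette="muted")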

Binary image file added (109 KiB; preview not shown).

dev-0/out.tsv (new file, 5272 lines)

File diff suppressed because it is too large.

in-header.tsv (new file, 1 line)

@@ -0,0 +1 @@
PostText	Timestamp

out-header.tsv (new file, 1 line)

@@ -0,0 +1 @@
Label

predict.py (new file, 32 lines)

@@ -0,0 +1,32 @@
#!/usr/bin/python3
import sys
import pickle
from math import log
from tokenizer import tokenize

# Load the model tuple written by train.py
model = pickle.load(open("model.pkl", "rb"))
pskeptic, vocabulary_size, skeptic_words_total, paranormal_words_total, skeptic_count, paranormal_count = model

for line in sys.stdin:
    document = line.rstrip()
    fields = document.split('\t')
    document = fields[0]
    terms = tokenize(document)
    # Start from the log priors, then add per-term log-likelihoods
    # with add-one (Laplace) smoothing
    log_prob_skeptic = log(pskeptic)
    log_prob_paranormal = log(1 - pskeptic)
    for term in terms:
        log_prob_skeptic += log((skeptic_count.get(term, 0) + 1) / (skeptic_words_total + vocabulary_size))
        log_prob_paranormal += log((paranormal_count.get(term, 0) + 1) / (paranormal_words_total + vocabulary_size))
    if log_prob_skeptic > log_prob_paranormal:
        print('S')
    else:
        print('P')
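For intuition, here is a small worked example of the same add-one smoothed scoring that predict.py performs. All counts below are toy values invented for illustration; only the prior mirrors the training data seen in the notebook (185478 skeptic out of 289541 documents, about 0.64):

from math import log

# Toy model values, invented for illustration
pskeptic = 185478 / 289541            # prior P(S) from the notebook's class sizes
vocabulary_size = 5
skeptic_count = {"doctor": 3, "study": 2}
paranormal_count = {"ghost": 4}
skeptic_words_total = 5
paranormal_words_total = 4

def score(terms, prior, counts, words_total):
    # log P(class) + sum over terms of log P(term | class), add-one smoothed
    s = log(prior)
    for t in terms:
        s += log((counts.get(t, 0) + 1) / (words_total + vocabulary_size))
    return s

terms = ["doctor", "ghost"]
s_score = score(terms, pskeptic, skeptic_count, skeptic_words_total)
p_score = score(terms, 1 - pskeptic, paranormal_count, paranormal_words_total)
print('S' if s_score > p_score else 'P')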

test-A/out.tsv (new file, 5152 lines)

File diff suppressed because it is too large.

tokenizer.py (new file, 25 lines)

@@ -0,0 +1,25 @@
#!/usr/bin/python3
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
import re
import string

wordlist = set(nltk.corpus.words.words())  # currently unused
porter = PorterStemmer()                   # currently unused
stop_words = set(stopwords.words('english'))
printable = set(string.printable)

def tokenize(d):
    # Replace URLs with a single placeholder token
    d = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 'thereisasimplelinkinside', d, flags=re.MULTILINE)
    # Drop escaped newlines and selected punctuation, keep printable chars only
    d = re.sub(r'\\n', ' ', d)
    d = re.sub(r'\*|\'|\"|\/|~|_|=|-', ' ', d)
    d = ''.join(filter(lambda x: x in printable, d))
    tokenized = word_tokenize(d)
    #tokenized = re.split(r'\/|\\| ', d)
    # Lowercase and remove English stopwords
    lower = [w.lower() for w in tokenized]
    words = [w for w in lower if w not in stop_words]
    return words
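A quick sanity check of tokenize() (the sample sentence is made up; NLTK's punkt, stopwords, and words corpora must already be downloaded):

from tokenizer import tokenize

print(tokenize("I saw a GHOST near http://example.com last night!"))
# Stopwords ('I', 'a') are dropped, the rest is lowercased, and the URL is
# collapsed into the placeholder token, giving roughly:
# ['saw', 'ghost', 'near', 'thereisasimplelinkinside', 'last', 'night', '!']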

train.py (new file, 58 lines)

@@ -0,0 +1,58 @@
#!/usr/bin/python3
import sys
import pickle
from tokenizer import tokenize

def train():
    documents_total = 0
    skeptic_documents_total = 0
    vocabulary = set()
    skeptic_words_total = 0
    paranormal_words_total = 0
    skeptic_count = {}
    paranormal_count = {}
    # Each stdin line is label<TAB>text; accumulate per-class word counts
    for line in sys.stdin:
        line = line.rstrip()
        fields = line.split('\t')
        label = fields[0].strip()
        document = fields[1]
        terms = tokenize(document)
        for t in terms:
            vocabulary.add(t)
        documents_total += 1
        if label == 'S':
            skeptic_documents_total += 1
            skeptic_words_total += len(terms)
            for term in terms:
                if term in skeptic_count:
                    skeptic_count[term] += 1
                else:
                    skeptic_count[term] = 1
        else:
            paranormal_words_total += len(terms)
            for term in terms:
                if term in paranormal_count:
                    paranormal_count[term] += 1
                else:
                    paranormal_count[term] = 1
    # Prior P(S) plus the parameters needed for add-one smoothing at predict time
    pskeptic = skeptic_documents_total / documents_total
    vocabulary_size = len(vocabulary)
    model = (pskeptic, vocabulary_size, skeptic_words_total, paranormal_words_total, skeptic_count, paranormal_count)
    pickle.dump(model, open("model.pkl", "wb"))
    print(paranormal_count)
    print(skeptic_words_total)

train()
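train.py expects label<TAB>text lines on stdin and writes model.pkl; predict.py then reads raw text lines and prints one label per line. A hypothetical smoke test under those assumptions (the sample lines are invented, and both scripts plus the NLTK data must be available locally):

import subprocess

# Two invented training lines in the label<TAB>text format train.py parses
sample = ("S\tI think there is a rational explanation for this\n"
          "P\tThe ghost moved my keys again last night\n")
subprocess.run(["python3", "train.py"], input=sample.encode(), check=True)

# predict.py loads the freshly written model.pkl and labels each input line
out = subprocess.run(["python3", "predict.py"],
                     input=b"strange lights and a ghost in the old house\n",
                     capture_output=True, check=True)
print(out.stdout.decode())  # prints 'S' or 'P'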