paranormal-or-skeptic/.ipynb_checkpoints/Untitled-checkpoint.ipynb
2020-03-13 01:24:43 +01:00

386 lines
11 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"train = pd.read_csv(\"train/in.tsv.xz\",header=None, compression='xz',sep=\"\\t\", names=[\"text\",\"time\"])\n",
"expected = pd.read_csv(\"train/expected.tsv\", header=None)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"train[\"expected\"] = expected"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 185478.000000\n",
"mean 303.405056\n",
"std 494.328936\n",
"min 3.000000\n",
"25% 68.000000\n",
"50% 151.000000\n",
"75% 341.000000\n",
"max 10251.000000\n",
"Name: text, dtype: float64"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train[train[\"expected\"]==' S'][\"text\"].str.len().describe()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 104063.000000\n",
"mean 298.150995\n",
"std 504.984133\n",
"min 3.000000\n",
"25% 65.000000\n",
"50% 146.000000\n",
"75% 330.000000\n",
"max 10161.000000\n",
"Name: text, dtype: float64"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train[train[\"expected\"]==' P'][\"text\"].str.len().describe()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/th3niko/nltk_data...\n",
"[nltk_data] Unzipping tokenizers/punkt.zip.\n"
]
}
],
"source": [
"import string\n",
"from nltk import word_tokenize\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"stopwords = set(stopwords.words('english'))\n",
"nltk.download(\"punkt\")\n",
"\n",
"def clean_text(text):\n",
" text = word_tokenize(text)\n",
" text = [word.lower() for word in text if word.isalpha()]\n",
" punct = str.maketrans('','',string.punctuation)\n",
" text = [word.translate(punct) for word in text]\n",
" text = [word for word in text if not word in stopwords]\n",
" return text\n",
"\n",
"train['text'] = train['text'].apply(clean_text)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 [medical, issues, recently]\n",
"1 [supposedly, aluminum, barium, strontium, used...\n",
"2 [nobel, prizes, make, rich]\n",
"3 [came, article, stayed, doctor]\n",
"4 [resorted, insults, got, owned, directly, afte...\n",
" ... \n",
"289536 [really, baby, shampoo, actually, highly, alka...\n",
"289537 [gives, example, brendan, reilly, doctor, came...\n",
"289538 [ca, fix, stupidity]\n",
"289539 [excellent, points, also, looking, bit, progra...\n",
"289540 [earlier, year, may, couple, days, ago, nov]\n",
"Name: text, Length: 289541, dtype: object"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train['text']"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"def counter(text):\n",
" cnt = Counter()\n",
" for msgs in text:\n",
" for msg in msgs:\n",
" cnt[msg] += 1\n",
" return cnt\n",
"\n",
"text_cnt_s = counter(train[train['expected']==' S']['text'])\n",
"text_cnt_p = counter(train[train['expected']==' P']['text'])"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"text_s = text_cnt_s.most_common(100)\n",
"text_p = text_cnt_p.most_common(100)\n",
"text_s = pd.DataFrame(text_s,columns = ['words','counts'])\n",
"text_p = pd.DataFrame(text_p,columns = ['words','counts'])"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/th3niko/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
"of pandas will change to not sort by default.\n",
"\n",
"To accept the future behavior, pass 'sort=False'.\n",
"\n",
"To retain the current behavior and silence the warning, pass 'sort=True'.\n",
"\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>counts1</th>\n",
" <th>counts2</th>\n",
" <th>dataset</th>\n",
" <th>words1</th>\n",
" <th>words2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>39094.0</td>\n",
" <td>NaN</td>\n",
" <td>s</td>\n",
" <td>would</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>36978.0</td>\n",
" <td>NaN</td>\n",
" <td>s</td>\n",
" <td>like</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>36461.0</td>\n",
" <td>NaN</td>\n",
" <td>s</td>\n",
" <td>people</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>29143.0</td>\n",
" <td>NaN</td>\n",
" <td>s</td>\n",
" <td>one</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>26827.0</td>\n",
" <td>NaN</td>\n",
" <td>s</td>\n",
" <td>think</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>95</td>\n",
" <td>NaN</td>\n",
" <td>3007.0</td>\n",
" <td>p</td>\n",
" <td>NaN</td>\n",
" <td>kind</td>\n",
" </tr>\n",
" <tr>\n",
" <td>96</td>\n",
" <td>NaN</td>\n",
" <td>2990.0</td>\n",
" <td>p</td>\n",
" <td>NaN</td>\n",
" <td>show</td>\n",
" </tr>\n",
" <tr>\n",
" <td>97</td>\n",
" <td>NaN</td>\n",
" <td>2970.0</td>\n",
" <td>p</td>\n",
" <td>NaN</td>\n",
" <td>far</td>\n",
" </tr>\n",
" <tr>\n",
" <td>98</td>\n",
" <td>NaN</td>\n",
" <td>2964.0</td>\n",
" <td>p</td>\n",
" <td>NaN</td>\n",
" <td>feel</td>\n",
" </tr>\n",
" <tr>\n",
" <td>99</td>\n",
" <td>NaN</td>\n",
" <td>2915.0</td>\n",
" <td>p</td>\n",
" <td>NaN</td>\n",
" <td>try</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>200 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" counts1 counts2 dataset words1 words2\n",
"0 39094.0 NaN s would NaN\n",
"1 36978.0 NaN s like NaN\n",
"2 36461.0 NaN s people NaN\n",
"3 29143.0 NaN s one NaN\n",
"4 26827.0 NaN s think NaN\n",
".. ... ... ... ... ...\n",
"95 NaN 3007.0 p NaN kind\n",
"96 NaN 2990.0 p NaN show\n",
"97 NaN 2970.0 p NaN far\n",
"98 NaN 2964.0 p NaN feel\n",
"99 NaN 2915.0 p NaN try\n",
"\n",
"[200 rows x 5 columns]"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"concatenated = pd.concat([text_s.assign(dataset='s'), text_p.assign(dataset='p')])\n",
"concatenated\n",
"sns.set(style=\"whitegrid\")\n",
"g = sns.catplot(x=\"words\", y=\"counts\", data=concatenated,\n",
" height=6, kind=\"bar\", palette=\"muted\",style=\"dataset\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}