386 lines
11 KiB
Plaintext
386 lines
11 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
import pandas as pd

# Load the training inputs (xz-compressed TSV with a text and a time column)
# and the matching expected labels.
train = pd.read_csv(
    "train/in.tsv.xz",
    header=None,
    compression="xz",
    sep="\t",
    names=["text", "time"],
)
expected = pd.read_csv("train/expected.tsv", header=None)
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
# Attach the labels to the training frame. `expected` is a one-column
# DataFrame; assign its column-0 Series explicitly rather than the whole
# DataFrame — assigning a DataFrame to a single column is fragile and
# behaves inconsistently across pandas versions.
train["expected"] = expected[0]
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 34,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"count 185478.000000\n",
|
||
"mean 303.405056\n",
|
||
"std 494.328936\n",
|
||
"min 3.000000\n",
|
||
"25% 68.000000\n",
|
||
"50% 151.000000\n",
|
||
"75% 341.000000\n",
|
||
"max 10251.000000\n",
|
||
"Name: text, dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 34,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
# Length distribution of texts labelled ' S' (note: labels carry a leading
# space as loaded). `.loc` avoids chained indexing.
train.loc[train["expected"] == ' S', "text"].str.len().describe()
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"count 104063.000000\n",
|
||
"mean 298.150995\n",
|
||
"std 504.984133\n",
|
||
"min 3.000000\n",
|
||
"25% 65.000000\n",
|
||
"50% 146.000000\n",
|
||
"75% 330.000000\n",
|
||
"max 10161.000000\n",
|
||
"Name: text, dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 35,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
# Length distribution of texts labelled ' P' (leading space, as loaded).
train.loc[train["expected"] == ' P', "text"].str.len().describe()
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 39,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[nltk_data] Downloading package punkt to /home/th3niko/nltk_data...\n",
|
||
"[nltk_data] Unzipping tokenizers/punkt.zip.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

# Download the NLTK data this cell needs up front: punkt powers
# word_tokenize, and the stopwords corpus backs the stop-word list
# (the original only downloaded punkt and assumed stopwords was present).
nltk.download("punkt")
nltk.download("stopwords")

# Use a distinct name for the stop-word set so it does not shadow the
# imported `stopwords` corpus module (the original rebound the module name,
# which breaks any later use of the corpus reader).
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Tokenize a raw text string into cleaned word tokens.

    Steps: NLTK word-tokenize, keep purely alphabetic tokens, lowercase,
    and drop English stop words. Returns a list of tokens.

    The original also ran a punctuation-strip `str.translate` pass, but
    `str.isalpha()` already rejects any token containing punctuation,
    so that pass was a no-op and has been removed.
    """
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha()]
    return [word for word in tokens if word not in STOPWORDS]

train['text'] = train['text'].apply(clean_text)
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 40,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"0 [medical, issues, recently]\n",
|
||
"1 [supposedly, aluminum, barium, strontium, used...\n",
|
||
"2 [nobel, prizes, make, rich]\n",
|
||
"3 [came, article, stayed, doctor]\n",
|
||
"4 [resorted, insults, got, owned, directly, afte...\n",
|
||
" ... \n",
|
||
"289536 [really, baby, shampoo, actually, highly, alka...\n",
|
||
"289537 [gives, example, brendan, reilly, doctor, came...\n",
|
||
"289538 [ca, fix, stupidity]\n",
|
||
"289539 [excellent, points, also, looking, bit, progra...\n",
|
||
"289540 [earlier, year, may, couple, days, ago, nov]\n",
|
||
"Name: text, Length: 289541, dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 40,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
# Peek at the cleaned token lists.
train['text']
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 45,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
from collections import Counter

def counter(text):
    """Count token occurrences across an iterable of token lists.

    Parameters
    ----------
    text : iterable of iterables
        Each element is a sequence of hashable tokens (e.g. the cleaned
        word lists in train['text']).

    Returns
    -------
    collections.Counter
        Token -> total occurrence count over all lists.
    """
    cnt = Counter()
    for msgs in text:
        # Counter.update adds counts for every element of the iterable —
        # the stdlib idiom for the original hand-rolled inner loop.
        cnt.update(msgs)
    return cnt
|
||
"\n",
|
||
# Per-class token frequencies (labels keep their leading space as loaded).
text_cnt_s = counter(train.loc[train['expected'] == ' S', 'text'])
text_cnt_p = counter(train.loc[train['expected'] == ' P', 'text'])
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 58,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
# Top-100 tokens per class, as small DataFrames ready for plotting.
text_s = pd.DataFrame(text_cnt_s.most_common(100), columns=['words', 'counts'])
text_p = pd.DataFrame(text_cnt_p.most_common(100), columns=['words', 'counts'])
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 53,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import matplotlib.pyplot as plt\n",
|
||
"%matplotlib inline\n",
|
||
"import seaborn as sns"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 57,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/home/th3niko/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
|
||
"of pandas will change to not sort by default.\n",
|
||
"\n",
|
||
"To accept the future behavior, pass 'sort=False'.\n",
|
||
"\n",
|
||
"To retain the current behavior and silence the warning, pass 'sort=True'.\n",
|
||
"\n",
|
||
" \"\"\"Entry point for launching an IPython kernel.\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>counts1</th>\n",
|
||
" <th>counts2</th>\n",
|
||
" <th>dataset</th>\n",
|
||
" <th>words1</th>\n",
|
||
" <th>words2</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <td>0</td>\n",
|
||
" <td>39094.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>s</td>\n",
|
||
" <td>would</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>1</td>\n",
|
||
" <td>36978.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>s</td>\n",
|
||
" <td>like</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>2</td>\n",
|
||
" <td>36461.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>s</td>\n",
|
||
" <td>people</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>3</td>\n",
|
||
" <td>29143.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>s</td>\n",
|
||
" <td>one</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>4</td>\n",
|
||
" <td>26827.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>s</td>\n",
|
||
" <td>think</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>95</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>3007.0</td>\n",
|
||
" <td>p</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>kind</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>96</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2990.0</td>\n",
|
||
" <td>p</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>show</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>97</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2970.0</td>\n",
|
||
" <td>p</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>far</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>98</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2964.0</td>\n",
|
||
" <td>p</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>feel</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <td>99</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2915.0</td>\n",
|
||
" <td>p</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>try</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>200 rows × 5 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" counts1 counts2 dataset words1 words2\n",
|
||
"0 39094.0 NaN s would NaN\n",
|
||
"1 36978.0 NaN s like NaN\n",
|
||
"2 36461.0 NaN s people NaN\n",
|
||
"3 29143.0 NaN s one NaN\n",
|
||
"4 26827.0 NaN s think NaN\n",
|
||
".. ... ... ... ... ...\n",
|
||
"95 NaN 3007.0 p NaN kind\n",
|
||
"96 NaN 2990.0 p NaN show\n",
|
||
"97 NaN 2970.0 p NaN far\n",
|
||
"98 NaN 2964.0 p NaN feel\n",
|
||
"99 NaN 2915.0 p NaN try\n",
|
||
"\n",
|
||
"[200 rows x 5 columns]"
|
||
]
|
||
},
|
||
"execution_count": 57,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
# Stack the per-class top-word tables, tagging each row with its class.
# sort=False silences the pandas FutureWarning (visible in this cell's
# stderr output) about sorting on the non-concatenation axis.
concatenated = pd.concat(
    [text_s.assign(dataset='s'), text_p.assign(dataset='p')],
    sort=False,
)
sns.set(style="whitegrid")
# `style=` is not a catplot/barplot parameter (it belongs to relational
# plots like relplot); `hue=` is the correct way to color bars by class.
g = sns.catplot(x="words", y="counts", hue="dataset", data=concatenated,
                height=6, kind="bar", palette="muted")
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.7.4"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 4
|
||
}
|