s444417-paranormal-or-skept.../run.ipynb
2022-05-10 23:41:58 +02:00

282 lines
7.1 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
"import lzma\n",
"import sys\n",
"from io import StringIO\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import pandas as pd\n",
"import numpy\n",
"\n",
"pathX = \"./train/in.tsv.xz\"\n",
"# pathX = \"./train/in.tsv\"\n",
"pathY = \"./train/expected.tsv\"\n",
"nrows = 10000"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
"# data = lzma.open(pathX, mode='rt', encoding='utf-8').read()\n",
"# stringIO = StringIO(data)\n",
"# df = pd.read_csv(stringIO, sep=\"\\t\", header=None)\n",
"df = pd.read_csv(pathX, sep='\\t', nrows=nrows, header=None)\n",
"df = df.drop(df.columns[1], axis=1)\n",
"topics = pd.read_csv(pathY, sep='\\t', nrows=nrows, header=None)"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10000\n",
"10000\n"
]
}
],
"source": [
"print(len(df.index))\n",
"\n",
"print(len(topics.index))\n"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8910</th>\n",
" <td>What? It isn't a fake memo. It's a real memo. ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0\n",
"8910 What? It isn't a fake memo. It's a real memo. ..."
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sample()"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['00', '000', '00000001', ..., 'αsynuclein', 'ಠ_ಠ', 'fibrosis'],\n",
" dtype=object)"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectorizer = TfidfVectorizer(lowercase=True, stop_words=['english'])\n",
"X = vectorizer.fit_transform(df.to_numpy().ravel())\n",
"vectorizer.get_feature_names_out()\n"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"# vectorizer.transform(\"Ala ma kotka\".lower().split())"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
"df = df.reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
"tfidfVector = vectorizer.transform(df[0])\n",
"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Python310\\lib\\site-packages\\sklearn\\utils\\validation.py:993: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
" y = column_or_1d(y, warn=True)\n",
"c:\\Python310\\lib\\site-packages\\sklearn\\utils\\validation.py:593: FutureWarning: np.matrix usage is deprecated in 1.0 and will raise a TypeError in 1.2. Please convert to a numpy array with np.asarray. For more information see: https://numpy.org/doc/stable/reference/generated/numpy.matrix.html\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"GaussianNB()"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"from sklearn.naive_bayes import GaussianNB\n",
"\n",
"gnb = GaussianNB()\n",
"gnb.fit(tfidfVector.todense(), topics)"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
"testXPath = \"./dev-0/in.tsv.xz\"\n",
"testYPath = \"./dev-0/expected.tsv\"\n",
"\n",
"testX = pd.read_csv(testXPath, sep='\\t', nrows=nrows, header=None)\n",
"\n",
"testY = pd.read_csv(testYPath, sep='\\t', nrows=nrows, header=None)\n",
"testXtfidfVector = vectorizer.transform(testX[0])\n"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
"testXPath = \"./test-A/in.tsv.xz\"\n",
"testYPath = \"./test-A/expected.tsv\"\n",
"\n",
"testX = pd.read_csv(testXPath, sep='\\t', nrows=nrows, header=None)\n",
"\n",
"# testY = pd.read_csv(testYPath, sep='\\t', nrows=nrows, header=None)\n",
"testXtfidfVector = vectorizer.transform(testX[0])\n"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Python310\\lib\\site-packages\\sklearn\\utils\\validation.py:593: FutureWarning: np.matrix usage is deprecated in 1.0 and will raise a TypeError in 1.2. Please convert to a numpy array with np.asarray. For more information see: https://numpy.org/doc/stable/reference/generated/numpy.matrix.html\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0 1 0 ... 0 0 1]\n"
]
}
],
"source": [
"pred = gnb.predict(testXtfidfVector.todense())\n",
"print(pred)\n",
"\n",
"import csv\n",
"with open(testYPath, 'w', newline='') as f_output:\n",
" tsv_output = csv.writer(f_output, delimiter='\\n')\n",
" tsv_output.writerow(pred)"
]
}
],
"metadata": {
"interpreter": {
"hash": "369f2c481f4da34e4445cda3fffd2e751bd1c4d706f27375911949ba6bb62e1c"
},
"kernelspec": {
"display_name": "Python 3.10.4 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}