Systemy_dialogowe/evaluate/preprocess.ipynb

218 lines
5.8 KiB
Plaintext
Raw Normal View History

2022-04-26 18:33:20 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "dc7f5718",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "c22f479e",
"metadata": {},
"outputs": [],
"source": [
"# Symbols deleted from every utterance. '.', '?' and '!' are not listed, so they are kept.\n",
"_PUNCTUATION = \"£§@#$%^&*()_-+={}[]:;\\\"'|\\\\<>,/~`\"\n",
"_DROP_PUNCTUATION = str.maketrans(\"\", \"\", _PUNCTUATION)\n",
"# Lower-case Polish letters are folded to ASCII; ASCII digits are deleted.\n",
"_FOLD_POLISH_DROP_DIGITS = str.maketrans(\"ąćęłńóśźż\", \"acelnoszz\", \"0123456789\")\n",
"\n",
"\n",
"def preprocess(line):\n",
"    \"\"\"Normalise one utterance.\n",
"\n",
"    Steps, in order: delete the symbols listed in ``_PUNCTUATION``,\n",
"    lower-case, delete ASCII digits and fold Polish diacritics to ASCII,\n",
"    collapse runs of spaces/tabs into a single space, strip trailing spaces.\n",
"    Leading whitespace is collapsed to one space but not removed.\n",
"\n",
"    Args:\n",
"        line: Raw utterance text (``str``).\n",
"\n",
"    Returns:\n",
"        The normalised string.\n",
"    \"\"\"\n",
"    # translate() removes each listed character on its own, so '<' and the\n",
"    # backslash are stripped as well; the former regex alternation only matched\n",
"    # them as part of the literal sequence '||<'.\n",
"    txt = line.translate(_DROP_PUNCTUATION).lower()\n",
"    # Folding happens after lower(), so upper-case Polish letters are covered too.\n",
"    txt = txt.translate(_FOLD_POLISH_DROP_DIGITS)\n",
"    txt = re.sub(\"[ \\t]+\", \" \", txt)\n",
"    # '$' also matches just before a final newline, so spaces ahead of it go too.\n",
"    return re.sub(\" +$\", \"\", txt)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4e6b43e5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Witam</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Co możesz dla mnie zrobić?</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Jakie są moje repozytoria?</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ok. co nowego w Zajęcia AI?</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ok. co nowego w Zajęcia AI?</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>586</th>\n",
" <td>upewniam się</td>\n",
" </tr>\n",
" <tr>\n",
" <th>587</th>\n",
" <td>pokaż mi raport</td>\n",
" </tr>\n",
" <tr>\n",
" <th>588</th>\n",
" <td>zmienić</td>\n",
" </tr>\n",
" <tr>\n",
" <th>589</th>\n",
" <td>Tak</td>\n",
" </tr>\n",
" <tr>\n",
" <th>590</th>\n",
" <td>elo</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>591 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" 0\n",
"0 Witam\n",
"1 Co możesz dla mnie zrobić?\n",
"2 Jakie są moje repozytoria?\n",
"3 ok. co nowego w Zajęcia AI?\n",
"4 ok. co nowego w Zajęcia AI?\n",
".. ...\n",
"586 upewniam się \n",
"587 pokaż mi raport \n",
"588 zmienić \n",
"589 Tak \n",
"590 elo\n",
"\n",
"[591 rows x 1 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read every field as text and switch off pandas' default NA markers, so utterances\n",
"# such as 'NA', 'nan', 'None' or 'n/a' stay as typed instead of being turned into\n",
"# NaN and then rewritten as 'null'. Only empty fields are treated as missing.\n",
"preprocessed = pd.read_csv(\n",
"    'in.tsv',\n",
"    sep='\\t',\n",
"    header=None,\n",
"    dtype=str,\n",
"    keep_default_na=False,\n",
"    na_values=[''],\n",
")\n",
"# Empty fields become the placeholder text 'null', as before.\n",
"preprocessed = preprocessed.fillna('null')\n",
"preprocessed"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ca7e6cca",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 witam\n",
"1 co mozesz dla mnie zrobic?\n",
"2 jakie sa moje repozytoria?\n",
"3 ok. co nowego w zajecia ai?\n",
"4 ok. co nowego w zajecia ai?\n",
" ... \n",
"586 upewniam sie\n",
"587 pokaz mi raport\n",
"588 zmienic\n",
"589 tak\n",
"590 elo\n",
"Length: 591, dtype: object"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def _clean_first_column(row):\n",
"    \"\"\"Return the preprocessed text stored under label 0 of one row.\"\"\"\n",
"    return preprocess(row[0])\n",
"\n",
"\n",
"# axis=1 hands each row to the helper, giving one cleaned string per row.\n",
"# The name `preprocessed` is rebound from the loaded frame to that result.\n",
"preprocessed = preprocessed.apply(_clean_first_column, axis=1)\n",
"preprocessed"
]
},
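{
"cell_type": "code",
"execution_count": 5,
"id": "d5153b79",
"metadata": {},
"outputs": [],
"source": [
"# NOTE: uncommenting the line below overwrites 'in.tsv' - the same file this notebook\n",
"# reads its input from - with the preprocessed text (no index, no header row).\n",
"# preprocessed.to_csv('in.tsv', sep='\\t', index=False, header=False)"
]
}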
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}