Systemy_dialogowe/evaluate/preprocess.ipynb
2022-04-26 18:33:20 +02:00

218 lines
5.8 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "dc7f5718",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "c22f479e",
"metadata": {},
"outputs": [],
"source": [
"# Symbols removed outright; sentence punctuation such as . ? ! is kept.\n",
"_SYMBOLS = re.compile(r\"[£§@#$%^&*()_\\-+={}\\[\\]:;\\\"'|\\\\<>,/~`]\")\n",
"# Lowercase Polish diacritics and their unaccented ASCII replacements.\n",
"_POLISH_TO_ASCII = str.maketrans(\"ąćęłńóśźż\", \"acelnoszz\")\n",
"\n",
"\n",
"def preprocess(line):\n",
"    \"\"\"Normalise one line of text.\n",
"\n",
"    Steps, in order: delete the symbols in ``_SYMBOLS``, lowercase, delete\n",
"    ASCII digits, collapse runs of spaces/tabs into one space, drop trailing\n",
"    spaces, and replace Polish diacritics with plain ASCII letters.\n",
"\n",
"    The symbol set is a raw-string character class, so a lone ``<`` or a\n",
"    lone backslash is removed as well; the earlier alternation pattern only\n",
"    matched those characters as part of the literal sequence ``||<``.\n",
"\n",
"    Args:\n",
"        line: The text to clean (a ``str``).\n",
"\n",
"    Returns:\n",
"        The cleaned text.\n",
"    \"\"\"\n",
"    txt = _SYMBOLS.sub(\"\", line)\n",
"    txt = txt.lower()\n",
"    txt = re.sub(\"[0-9]\", \"\", txt)\n",
"    txt = re.sub(\"[ \\t]+\", \" \", txt)\n",
"    # Only trailing spaces are trimmed; a leading space is not removed.\n",
"    txt = re.sub(\" +$\", \"\", txt)\n",
"    # Safe after lower(): only lowercase diacritics can remain here.\n",
"    return txt.translate(_POLISH_TO_ASCII)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4e6b43e5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Witam</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Co możesz dla mnie zrobić?</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Jakie są moje repozytoria?</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>ok. co nowego w Zajęcia AI?</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ok. co nowego w Zajęcia AI?</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>586</th>\n",
" <td>upewniam się</td>\n",
" </tr>\n",
" <tr>\n",
" <th>587</th>\n",
" <td>pokaż mi raport</td>\n",
" </tr>\n",
" <tr>\n",
" <th>588</th>\n",
" <td>zmienić</td>\n",
" </tr>\n",
" <tr>\n",
" <th>589</th>\n",
" <td>Tak</td>\n",
" </tr>\n",
" <tr>\n",
" <th>590</th>\n",
" <td>elo</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>591 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" 0\n",
"0 Witam\n",
"1 Co możesz dla mnie zrobić?\n",
"2 Jakie są moje repozytoria?\n",
"3 ok. co nowego w Zajęcia AI?\n",
"4 ok. co nowego w Zajęcia AI?\n",
".. ...\n",
"586 upewniam się \n",
"587 pokaż mi raport \n",
"588 zmienić \n",
"589 Tak \n",
"590 elo\n",
"\n",
"[591 rows x 1 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load in.tsv (tab-separated, no header row) and replace missing values with the string 'null'.\n",
"preprocessed = pd.read_csv(\n",
"    'in.tsv',\n",
"    sep='\\t',\n",
"    header=None,\n",
").fillna('null')\n",
"preprocessed"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ca7e6cca",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 witam\n",
"1 co mozesz dla mnie zrobic?\n",
"2 jakie sa moje repozytoria?\n",
"3 ok. co nowego w zajecia ai?\n",
"4 ok. co nowego w zajecia ai?\n",
" ... \n",
"586 upewniam sie\n",
"587 pokaz mi raport\n",
"588 zmienic\n",
"589 tak\n",
"590 elo\n",
"Length: 591, dtype: object"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Run preprocess on column 0 of every row; the result is a Series of cleaned strings.\n",
"preprocessed = preprocessed.apply(\n",
"    lambda record: preprocess(record[0]),\n",
"    axis='columns',\n",
")\n",
"preprocessed"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d5153b79",
"metadata": {},
"outputs": [],
"source": [
"# NOTE: enabling the line below overwrites in.tsv, the same file the load cell reads.\n",
"# preprocessed.to_csv('in.tsv', sep='\\t', index=False, header=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}