# Symbols that preprocess() deletes outright. They are kept as a plain string
# and turned into a character class with re.escape() so that each symbol is
# matched on its own; no hand-written backslash escaping is involved.
_STRIPPED_SYMBOLS = "£§@#$%^&*()_-+={}[]:;\"'|\\<>,/~`–"
_STRIPPED_SYMBOLS_RE = re.compile("[" + re.escape(_STRIPPED_SYMBOLS) + "]")

_DIGIT_RE = re.compile(r"[0-9]")
_BLANK_RUN_RE = re.compile(r"[ \t]+")
# `$` also matches just before a final newline, so spaces preceding a trailing
# "\n" are removed as well while the newline itself is kept.
_TRAILING_SPACES_RE = re.compile(r" +$")

# Lower-case Polish letters mapped to their ASCII base letters. Upper-case
# variants need no entry because the text is lower-cased before translation.
_POLISH_TO_ASCII = str.maketrans("ąćęłńóśźż", "acelnoszz")


def preprocess(line):
    """Normalize one line of text.

    Steps, in order:
      1. delete every character listed in ``_STRIPPED_SYMBOLS``
         (``.``, ``?`` and ``!`` are not in that list and are kept),
      2. lower-case the text,
      3. delete ASCII digits,
      4. collapse each run of spaces/tabs into a single space,
      5. drop trailing spaces (leading whitespace is left untouched),
      6. replace Polish diacritic letters with their ASCII base letters.

    Args:
        line: the input string.

    Returns:
        The normalized string.
    """
    txt = _STRIPPED_SYMBOLS_RE.sub("", line)
    txt = txt.lower()
    txt = _DIGIT_RE.sub("", txt)
    txt = _BLANK_RUN_RE.sub(" ", txt)
    txt = _TRAILING_SPACES_RE.sub("", txt)
    return txt.translate(_POLISH_TO_ASCII)
\n", " | 0 | \n", "
---|---|
0 | \n", "Witam | \n", "
1 | \n", "Co możesz dla mnie zrobić? | \n", "
2 | \n", "Jakie są moje repozytoria? | \n", "
3 | \n", "ok. co nowego w Zajęcia AI? | \n", "
4 | \n", "ok. co nowego w Zajęcia AI? | \n", "
... | \n", "... | \n", "
586 | \n", "upewniam się | \n", "
587 | \n", "pokaż mi raport | \n", "
588 | \n", "zmienić | \n", "
589 | \n", "Tak | \n", "
590 | \n", "elo | \n", "
591 rows × 1 columns
\n", "