import os
import re

import pandas as pd

# Cleans our data:
# extracts intents for lab8 from .tsv files with dialogues.

# Separators that join several intents inside one cell. The pattern is kept
# exactly as it was; note that the bare "AND" alternative also matches inside
# longer upper-case words -- NOTE(review): confirm no intent name contains "AND".
INTENT_SEPARATOR = re.compile(r"( & |&| AND |AND)")


def removeParenthesis(text):
    """Return ``text`` with every parenthesised section removed.

    Nested parentheses are handled: everything from an opening ``(`` up to
    its matching ``)`` is dropped, brackets included.

    A ``)`` that has no matching ``(`` is kept as an ordinary character and
    does not affect the rest of the string (previously it pushed the depth
    counter below zero, which silently discarded all following text).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    kept = []
    depth = 0
    for letter in text:
        if letter == "(":
            depth += 1
        if depth == 0:
            kept.append(letter)
        # Only close a parenthesis that was actually opened.
        if letter == ")" and depth > 0:
            depth -= 1
    return "".join(kept)


def _cleanIntent(value):
    """Normalise a single cell of the intent column.

    Strips parenthesised arguments, replaces the intent separators with
    spaces and removes duplicate tokens while keeping first-occurrence order,
    so the result is identical on every run. Missing values are returned
    untouched instead of being turned into the literal string "nan".
    """
    if pd.isna(value):
        return value
    text = removeParenthesis(str(value))
    text = INTENT_SEPARATOR.sub(" ", text)
    # dict.fromkeys de-duplicates like set() but preserves insertion order.
    return " ".join(dict.fromkeys(text.split()))


def processFile(pathIn, pathOut, fileName):
    """Clean the third column of one dialogue file and write the result.

    The file is read as a header-less, tab-separated table; only the third
    column (position 2) is rewritten, all other columns are passed through.
    The output keeps the same file name and is also header-less and
    tab-separated.

    Args:
        pathIn: Directory containing the input file.
        pathOut: Directory the cleaned file is written to (created if missing).
        fileName: Name of the file inside ``pathIn``.

    Raises:
        FileNotFoundError: If the input file does not exist.
        IndexError: If the file has fewer than three columns.
    """
    encoding = "utf-8"  # use "cp1250" instead for files saved in that code page
    dialog_df = pd.read_csv(
        os.path.join(pathIn, fileName), sep="\t", header=None, encoding=encoding
    )

    intent_column = dialog_df.columns[2]
    dialog_df[intent_column] = dialog_df[intent_column].map(_cleanIntent)

    # Create the target directory only once there is something to write.
    if pathOut:
        os.makedirs(pathOut, exist_ok=True)
    dialog_df.to_csv(
        os.path.join(pathOut, fileName), sep="\t", index=False, header=False
    )


def cleanAllData(pathIn="./data_v2/", pathOut="./empty_data/"):
    """Clean every dialogue file that matches the expected naming scheme.

    Tries ``dialog-II-JJ-NN.tsv`` and ``dialog-II-JJ-NN(test).tsv`` for
    II in 11..15, JJ in 0..19 and NN = 1. Combinations that do not exist on
    disk are skipped silently; any other failure is reported and the run
    continues with the next file.

    Args:
        pathIn: Directory with the raw dialogue files.
        pathOut: Directory for the cleaned files.

    Returns:
        List of file names that were cleaned and written successfully.
    """
    written = []
    for i in range(11, 16):
        for j in range(20):
            for nr in range(1, 2):
                stem = f"dialog-{i:02d}-{j:02d}-{nr:02d}"
                for fileName in (stem + ".tsv", stem + "(test)" + ".tsv"):
                    try:
                        processFile(pathIn, pathOut, fileName)
                    except FileNotFoundError:
                        # Not every id combination exists; this is expected.
                        continue
                    except Exception as error:
                        # Best effort: report the problem instead of hiding it.
                        print(f"Skipping {fileName}: {error!r}")
                        continue
                    written.append(fileName)
    return written


if __name__ == "__main__":
    cleanAllData()