2022-05-17 16:42:09 +02:00
|
|
|
{
|
|
|
|
"cells": [
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 3,
|
|
|
|
"id": "0c094232-f694-44bb-b40b-804b9702b439",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"import pandas as pd\n",
|
|
|
|
"import regex\n",
|
|
|
|
"\n",
|
|
|
|
"# cleans our data\n",
|
|
|
|
"# extracts intents for lab8 from dtsv files with dialogues\n",
|
|
|
|
"def removeParenthesis(text):\n",
|
|
|
|
" resultText = \"\"\n",
|
|
|
|
" leftParCount = 0\n",
|
|
|
|
" for letter in text:\n",
|
|
|
|
" if letter == \"(\": leftParCount += 1\n",
|
|
|
|
" if (leftParCount == 0):\n",
|
|
|
|
" resultText += letter\n",
|
|
|
|
" if letter == \")\": leftParCount -= 1\n",
|
|
|
|
" return resultText\n",
|
|
|
|
"\n",
|
|
|
|
"def processFile(pathIn, pathOut, fileName):\n",
|
|
|
|
" # path = './lab/dialog-15-04-01.tsv'\n",
|
|
|
|
" encoding = \"utf-8\"\n",
|
|
|
|
" # encoding = \"cp1250\"\n",
|
|
|
|
" dialog_df = pd.read_csv(pathIn + fileName, sep='\\t', header=None, encoding=encoding)\n",
|
|
|
|
" dialog_df.iloc[:,2] = dialog_df.iloc[:,2].apply(lambda x : removeParenthesis(str(x)))\n",
|
|
|
|
" dialog_df.iloc[:,2] = dialog_df.iloc[:,2].apply(lambda x : regex.sub(r\"( & |&| AND |AND)\", \" \", str(x)))\n",
|
|
|
|
" dialog_df.iloc[:,2] = dialog_df.iloc[:,2].apply(lambda x : \" \".join(list(set(x.split()))))\n",
|
|
|
|
" dialog_df.to_csv(pathOut + fileName, sep=\"\\t\", index=False, header=None)\n",
|
|
|
|
"\n",
|
|
|
|
"def cleanAllData():\n",
|
|
|
|
" pathOut = './empty_data/'\n",
|
|
|
|
" pathIn = \"./data_v2/\"\n",
|
|
|
|
" i = 0\n",
|
|
|
|
" j = 0\n",
|
|
|
|
" nr = 0\n",
|
|
|
|
" for i in range(11,16):\n",
|
|
|
|
" for j in range(20):\n",
|
|
|
|
" for nr in range(1,2):\n",
|
|
|
|
" fileName = \"dialog-\" + str(i).zfill(2) + \"-\" + str(j).zfill(2) + \"-\" + str(nr).zfill(2) + \".tsv\"\n",
|
|
|
|
" try:\n",
|
|
|
|
" processFile(pathIn, pathOut, fileName)\n",
|
|
|
|
" except:\n",
|
|
|
|
" pass\n",
|
|
|
|
" try:\n",
|
|
|
|
" fileName = \"dialog-\" + str(i).zfill(2) + \"-\" + str(j).zfill(2) + \"-\" + str(nr).zfill(2) + \"(test)\" + \".tsv\"\n",
|
|
|
|
" processFile(pathIn, pathOut, fileName)\n",
|
|
|
|
" except:\n",
|
|
|
|
" pass\n",
|
|
|
|
"\n",
|
|
|
|
"cleanAllData()\n",
|
|
|
|
"\n"
|
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"metadata": {
|
|
|
|
"kernelspec": {
|
|
|
|
"display_name": "Python 3 (ipykernel)",
|
|
|
|
"language": "python",
|
|
|
|
"name": "python3"
|
|
|
|
},
|
|
|
|
"language_info": {
|
|
|
|
"codemirror_mode": {
|
|
|
|
"name": "ipython",
|
|
|
|
"version": 3
|
|
|
|
},
|
|
|
|
"file_extension": ".py",
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
"name": "python",
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
"pygments_lexer": "ipython3",
|
2022-05-18 00:06:14 +02:00
|
|
|
"version": "3.9.7"
|
2022-05-17 16:42:09 +02:00
|
|
|
}
|
|
|
|
},
|
|
|
|
"nbformat": 4,
|
|
|
|
"nbformat_minor": 5
|
|
|
|
}
|