SystemyDialogowe/clean_data.ipynb

82 lines
2.7 KiB
Plaintext
Raw Normal View History

2022-05-17 16:42:09 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "0c094232-f694-44bb-b40b-804b9702b439",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import pandas as pd\n",
"import regex\n",
"\n",
"# Cleans our data:\n",
"# extracts intents for lab8 from the TSV files with dialogues.\n",
"def removeParenthesis(text):\n",
"    \"\"\"Return ``text`` with every parenthesised group removed.\n",
"\n",
"    Nested groups are handled by tracking the nesting depth; the brackets\n",
"    themselves are dropped together with their content. Text after an\n",
"    unclosed \"(\" is dropped up to the end of the string.\n",
"\n",
"    An unmatched \")\" is kept in the output and leaves the depth untouched.\n",
"    (Letting the depth go negative would silently discard all the text that\n",
"    follows the stray bracket.)\n",
"\n",
"    Args:\n",
"        text: string to clean.\n",
"\n",
"    Returns:\n",
"        The characters of ``text`` that lie outside any parentheses.\n",
"    \"\"\"\n",
"    kept = []\n",
"    depth = 0\n",
"    for letter in text:\n",
"        if letter == \"(\":\n",
"            depth += 1\n",
"        elif letter == \")\" and depth > 0:\n",
"            # Closes a group that is currently open.\n",
"            depth -= 1\n",
"        elif depth == 0:\n",
"            # Ordinary character (or a stray \")\") outside any group.\n",
"            kept.append(letter)\n",
"    return \"\".join(kept)\n",
"\n",
"def processFile(pathIn, pathOut, fileName):\n",
"    \"\"\"Clean the third column of one tab-separated file and save the result.\n",
"\n",
"    The file ``pathIn + fileName`` is read without a header row. Every value\n",
"    in the third column (presumably the intent annotation - confirm against\n",
"    the data) is converted to ``str`` and then:\n",
"\n",
"    1. parenthesised parts are removed with ``removeParenthesis``;\n",
"    2. the separators \"&\" and \"AND\" are replaced by a single space;\n",
"    3. duplicate whitespace-separated tokens are dropped.\n",
"\n",
"    Tokens keep the order of their first occurrence, so the output is the\n",
"    same on every run. (Joining a ``set`` of strings yields an order that can\n",
"    change between interpreter sessions.)\n",
"\n",
"    The result is written to ``pathOut + fileName`` as a tab-separated file\n",
"    without index and without header row.\n",
"\n",
"    Args:\n",
"        pathIn: prefix of the input path; concatenated as-is with fileName.\n",
"        pathOut: prefix of the output path; concatenated as-is with fileName.\n",
"        fileName: name of the file, used for both input and output.\n",
"\n",
"    Raises:\n",
"        Exceptions from ``pd.read_csv``, from the column indexing and from\n",
"        ``DataFrame.to_csv`` propagate to the caller.\n",
"    \"\"\"\n",
"    encoding = \"utf-8\"\n",
"    # encoding = \"cp1250\"\n",
"    dialog_df = pd.read_csv(pathIn + fileName, sep='\\t', header=None, encoding=encoding)\n",
"\n",
"    def cleanCell(value):\n",
"        # NOTE(review): str() turns a missing value into the text \"nan\" - confirm\n",
"        # that this is the wanted output for empty cells.\n",
"        withoutArgs = removeParenthesis(str(value))\n",
"        separated = regex.sub(r\"( & |&| AND |AND)\", \" \", withoutArgs)\n",
"        # dict.fromkeys de-duplicates while preserving first-occurrence order.\n",
"        return \" \".join(dict.fromkeys(separated.split()))\n",
"\n",
"    dialog_df.iloc[:, 2] = dialog_df.iloc[:, 2].apply(cleanCell)\n",
"    dialog_df.to_csv(pathOut + fileName, sep=\"\\t\", index=False, header=None)\n",
"\n",
"def cleanAllData():\n",
"    \"\"\"Run ``processFile`` on every dialogue file found in ``./data_v2/``.\n",
"\n",
"    Candidate names are generated as ``dialog-II-JJ-NN.tsv`` and\n",
"    ``dialog-II-JJ-NN(test).tsv`` for II in 11..15, JJ in 0..19 and NN = 1\n",
"    (all zero-padded to two digits). Candidates that do not exist are skipped\n",
"    quietly. A file that exists but cannot be processed is reported on\n",
"    standard output and skipped, so one bad file does not stop the run and\n",
"    does not go unnoticed either.\n",
"\n",
"    Cleaned files are written under ``./empty_data/`` with the same name; the\n",
"    directory is created when it is missing.\n",
"    \"\"\"\n",
"    pathOut = './empty_data/'\n",
"    pathIn = \"./data_v2/\"\n",
"    # Without the output directory every write would fail.\n",
"    os.makedirs(pathOut, exist_ok=True)\n",
"    for i in range(11, 16):\n",
"        for j in range(20):\n",
"            for nr in range(1, 2):\n",
"                stem = \"dialog-\" + str(i).zfill(2) + \"-\" + str(j).zfill(2) + \"-\" + str(nr).zfill(2)\n",
"                for fileName in (stem + \".tsv\", stem + \"(test)\" + \".tsv\"):\n",
"                    if not os.path.isfile(pathIn + fileName):\n",
"                        # Most generated names have no file behind them.\n",
"                        continue\n",
"                    try:\n",
"                        processFile(pathIn, pathOut, fileName)\n",
"                    except Exception as error:\n",
"                        # Keep going, but make the failure visible.\n",
"                        print(\"Skipping \" + fileName + \": \" + repr(error))\n",
"\n",
"# Entry point of the cell: run the cleaning pass over all dialogue files.\n",
"cleanAllData()\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2022-05-18 00:06:14 +02:00
"version": "3.9.7"
2022-05-17 16:42:09 +02:00
}
},
"nbformat": 4,
"nbformat_minor": 5
}