SystemyDialogowe/mark_data.ipynb

328 lines
12 KiB
Plaintext
Raw Normal View History

2022-05-18 00:06:14 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"id": "db5f348a-d8b8-451e-8ff7-7b7b0fb917fc",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import regex as re\n",
"\n",
"# sample values\n",
"\n",
"titles = [\n",
" \"Batman\",\n",
" \"Batmana\",\n",
" \"Ambulans\",\n",
" \"Bunkier strachu\",\n",
" \"Córka\",\n",
" \"Curka\"\n",
" \"Uncharted\",\n",
" \"Inni ludzie\",\n",
" \"Śmierć na Nilu\",\n",
" \"Skarb Mikołajka\",\n",
" ]\n",
"\n",
"dates = [\n",
" \"w najbliższy poniedziałek\", \n",
" \"w najbliższy wtorek\", \n",
" \"w najbliższą środę\", \n",
" \"w najbliższy czwartek\", \n",
" \"w najbliższy piątek\",\n",
" \"w najbliższą sobotę\",\n",
" \"w najbliższą niedzielę\",\n",
" \"w poniedziałek\", \n",
" \"we wtorek\", \n",
" \"w środę\", \n",
" \"w czwartek\", \n",
" \"w piątek\",\n",
" \"w sobotę\",\n",
" \"w niedzielę\",\n",
" \"na najbliższy poniedziałek\", \n",
" \"na najbliższy wtorek\", \n",
" \"na najbliższą środę\", \n",
" \"na najbliższy czwartek\", \n",
" \"na najbliższy piątek\",\n",
" \"na najbliższą sobotę\",\n",
" \"na najbliższą niedzielę\",\n",
" \"najbliższy poniedziałek\", \n",
" \"najbliższy wtorek\", \n",
" \"najbliższa środa\", \n",
" \"najbliższy czwartek\", \n",
" \"najbliższy piątek\",\n",
" \"najbliższa sobota\",\n",
" \"najbliższa niedziela\",\n",
" \"na jutro\", \n",
" \"jutro\", \n",
" \"w dniu jutrzejszym\",\n",
" \"po jutrze\", \n",
" \"pojutrze\", \n",
" \"za dwa dni\", \n",
" \"za trzy dni\",\n",
" \"za tydzień\", \n",
" \"dzisiaj\",\n",
" \"dziś\",\n",
" r\"na dzień [0-9]{1,2}[ /][0-9]{1,2}\",\n",
" r\"na [0-9]{1,2}[ /][0-9]{1,2}\",\n",
" r\"dzień [0-9]{1,2}[ /][0-9]{1,2}\",\n",
" r\"dnia [0-9]{1,2}[ /][0-9]{1,2}\",\n",
" r\"[0-9]{1,2}[ /][0-9]{1,2}\",\n",
" ]\n",
"\n",
"times = [\n",
" r\"wybieram: [0-9]{1,2}[:][0-9]{1,2}\",\n",
" r\"wybieram [0-9]{1,2}[:][0-9]{1,2}\",\n",
" r\"na godzinę [0-9]{1,2}[:][0-9]{1,2}\",\n",
" r\"na godzina [0-9]{1,2}[:][0-9]{1,2}\",\n",
" r\"o godzinie [0-9]{1,2}[:][0-9]{1,2}\",\n",
" r\"o [0-9]{1,2}[:][0-9]{1,2}\",\n",
" r\"o [0-9]{2}\",\n",
" r\"na [0-9]{1,2}\",\n",
" r\"na [0-9]{1,2}[:][0-9]{1,2}\",\n",
" r\"godzina [0-9]{1,2}[:][0-9]{1,2}\",\n",
" r\"[0-9]{1,2}[:][0-9]{1,2}\",\n",
" \"rano\",\n",
" \"wieczorem\",\n",
" \"w południe\",\n",
" \"po południu\",\n",
" \"popołudniu\",\n",
" \"w nocy\",\n",
" \"pierwszą\",\n",
" \"drugą\",\n",
" \"trzecią\",\n",
" \"piątą\",\n",
" \"szóstą\",\n",
" \"siódmą\",\n",
" \"ósmą\",\n",
" \"dziewiatą\",\n",
" \"dziesiątą\",\n",
" \"jedenastą\",\n",
" \"dwunastą\",\n",
" \"trzynastą\",\n",
" \"czternastą\",\n",
" \"piętnastą\",\n",
" \"szesnastą\",\n",
" \"siedemnastą\",\n",
" \"osiemnastą\",\n",
" \"dziewiętnastą\",\n",
" \"dwudziestą pierwszą\",\n",
" \"dwudziestą drugą\",\n",
" \"dwudziestą trzecią\",\n",
" \"dwudziestą czwartą\",\n",
" \"o północy\"\n",
" ]\n",
"\n",
"quantities = [\n",
" r\"chcę [1-9][0-9]{0,1}\",\n",
" r\"poproszę [1-9][0-9]{0,1}\",\n",
" r\"[1-9][0-9]{0,1}\",\n",
"]\n",
"\n",
"seats = [\n",
" r'[a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2}',\n",
" r'[a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2}',\n",
" r'[a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2}',\n",
" r'[a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2}',\n",
" r'[a-zA-Z][0-9]{1,2}',\n",
" r'[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2}',\n",
" r'[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2}',\n",
" r'[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2}',\n",
" r'[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2}',\n",
" r'[a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}',\n",
" r'[a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}',\n",
" r'[a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}',\n",
" r'[a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}', \n",
"]\n",
"\n",
"areas = [\n",
" \"u góry po prawej\",\n",
" \"u góry po lewej\",\n",
" \"u góry na środku\",\n",
" \"na środku po prawej\",\n",
" \"na środku po lewej\",\n",
" \"na dole po prawej\",\n",
" \"na dole po lewej\",\n",
" \"na dole na środku\",\n",
" \"po lewej\",\n",
" \"po prawej\",\n",
" \"na środku\",\n",
" \"lewo\",\n",
" \"prawo\",\n",
" \"środek\",\n",
" \"blisko od ekranu\",\n",
" \"daleko od ekranu\",\n",
" \"blisko ekranu\",\n",
" \"daleko ekranu\",\n",
"]\n",
"\n",
"genres = [\n",
" 'akcja',\n",
" 'akcji'\n",
" 'dramat',\n",
" 'komedia',\n",
" 'Horror',\n",
" 'Thriller',\n",
" 'science fiction',\n",
" 'romans',\n",
" 'bajka',\n",
" 'rodzinny',\n",
" 'animowany',\n",
" 'bajka',\n",
" 'komedia romantyczna',\n",
"\n",
"]\n",
"\n",
"# slots names\n",
"slots = [\n",
" \"tytul\",\n",
" \"date\",\n",
" \"time\",\n",
" \"quantity\",\n",
" \"seats\",\n",
" \"area\",\n",
" \"genres\",\n",
" ]\n",
"\n",
"def removePunctation(text):\n",
" return re.sub(r'[!@#$,\\\"\\'\\?\\'\\\"\\|.]', '', text)\n",
"\n",
"def dummyTextAnnotation(text, sampelValuesList, slotName):\n",
" textJoin = text\n",
" for sampleValue in sampelValuesList:\n",
" if re.search(r\"(^|\\s)\" + sampleValue + r\"($|\\s)\", textJoin):\n",
" textJoin = re.sub(r'(^|\\s)(' + sampleValue + r')($|\\s)', f\"\\g<1> **/start** {slotName} \\g<2> **/end** \\g<3>\", textJoin)\n",
" break # can one sentence has only one slot of a given type?\n",
" return textJoin\n",
"\n",
"def parseAnnotation(text, intent, textLen, cleanText, isTest, pathOut):\n",
" textTokenize = text.split()\n",
" slotLabel = \"\"\n",
" col1 = [i for i in range(1, textLen + 1)]\n",
" col2 = []\n",
" col3 = [intent for _ in range(textLen)]\n",
" col4 = []\n",
" isSlot = False\n",
" slotVal = None\n",
" annotation = \"\"\n",
" i = 0\n",
" while i < len(textTokenize):\n",
" if textTokenize[i] == \"**/start**\":\n",
" isSlot = True\n",
" slotVal = textTokenize[i + 1]\n",
" if len(slotLabel) > 0:\n",
" slotLabel += f','\n",
" i += 2\n",
" annotation = \"B-\"\n",
" elif textTokenize[i] == \"**/end**\":\n",
" isSlot = False\n",
" slotLabel += \":\" + slotVal\n",
" slotVal = None\n",
" i += 1\n",
" elif isSlot:\n",
" slotLabel += textTokenize[i]\n",
" col2.append(textTokenize[i])\n",
" col4.append(annotation + slotVal)\n",
" annotation = \"I-\"\n",
" i += 1\n",
" elif not isSlot:\n",
" col2.append(textTokenize[i])\n",
" col4.append(\"NoLabel\")\n",
" i += 1\n",
" df = pd.DataFrame(list(zip(col1, col2, col3, col4)))\n",
" path = \"\"\n",
" if isTest: path = pathOut + 'test.conllu'\n",
" else: path = pathOut + 'train.conllu'\n",
" with open(path, \"a\", encoding=\"utf-8\") as outputFile:\n",
" outputFile.write(f\"# text: {cleanText}\\n# intent: {intent}\\n# slots: {slotLabel}\\n\")\n",
" df.to_csv(path, header=None, index=None, sep='\\t', mode='a')\n",
" with open(path, \"a\", encoding=\"utf-8\") as outputFile:\n",
" outputFile.write(f\"\\n\")\n",
" \n",
"\n",
"def processFile(pathIn, pathOut, fileName, isTest):\n",
" # path = './data/dialog-16-01-01.tsv'\n",
" encoding = \"utf-8\"\n",
" # encoding = \"cp1250\"\n",
" dialog_df = pd.read_csv(pathIn + fileName, sep='\\t', header=None, encoding=encoding)\n",
" \n",
" dialog_df = dialog_df.reset_index() # make sure indexes pair with number of rows\n",
" for _, row in dialog_df.iterrows():\n",
" if row[0].strip() == \"user\":\n",
" text = removePunctation(row[1]).lower()\n",
" \n",
" # movies annotation\n",
" annotatedText = dummyTextAnnotation(text, titles, slots[0])\n",
"\n",
" # dates annotation\n",
" annotatedText = dummyTextAnnotation(annotatedText, dates, slots[1])\n",
"\n",
" # time\n",
" annotatedText = dummyTextAnnotation(annotatedText, times, slots[2])\n",
"\n",
" # quantity\n",
" annotatedText = dummyTextAnnotation(annotatedText, quantities, slots[3])\n",
"\n",
" # seats \n",
" annotatedText = dummyTextAnnotation(annotatedText, seats, slots[4])\n",
"\n",
" # area\n",
" annotatedText = dummyTextAnnotation(annotatedText, areas, slots[5])\n",
" \n",
" #genre\n",
" annotatedText = dummyTextAnnotation(annotatedText, genres, slots[5])\n",
" \n",
" parseAnnotation(annotatedText, row[2], len(row[1].split()), text, isTest, pathOut)\n",
" \n",
" # dialog_df.to_csv(pathOut + \"test.conllu\", sep=\"\\t\", index=False, header=None)\n",
"\n",
"def annotateData():\n",
" pathOut = './train_data/'\n",
" pathIn = \"./empty_data/\"\n",
" i = 0\n",
" j = 0\n",
" nr = 0\n",
" for i in range(11,16):\n",
" for j in range(20):\n",
" for nr in range(1,5):\n",
" fileName = \"dialog-\" + str(i).zfill(2) + \"-\" + str(j).zfill(2) + \"-\" + str(nr).zfill(2) + \".tsv\"\n",
" try:\n",
" processFile(pathIn, pathOut, fileName, False)\n",
" except:\n",
" pass\n",
" try:\n",
" fileName = \"dialog-\" + str(i).zfill(2) + \"-\" + str(j).zfill(2) + \"-\" + str(nr).zfill(2) + \"(test)\" + \".tsv\"\n",
" processFile(pathIn, pathOut, fileName, True)\n",
" except:\n",
" pass\n",
"annotateData()\n",
"# processFile(\"./empty_data/\", './train_data/', \"dialog-12-01-01.tsv\", isTest=True)\n",
"# testText = \"dobrze dokonano rezerwacji na film transformer numer twojej rezeracji to 123890\"\n",
"# print(dummyTextAnnotation(testText, sampelValuesList=titles, slotName=slots[0]))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}