"""Rule-based ("dummy") slot annotation of cinema-booking dialogs.

Reads dialog TSV files (speaker, utterance, intent), marks slot values in the
user utterances using the sample values / regular expressions defined below,
and appends the result in a CoNLL-U-like format to ``train.conllu`` or
``test.conllu``.
"""
import os
import sys

import pandas as pd

try:
    import regex as re
except ImportError:
    # Every pattern below only uses syntax that the standard library module
    # supports as well, so fall back to it when ``regex`` is not installed.
    import re

# --------------------------------------------------------------------------
# Sample values
#
# Within each list the FIRST matching entry wins (see dummyTextAnnotation),
# so longer / more specific entries must come before their shorter prefixes
# or suffixes.
# --------------------------------------------------------------------------

titles = [
    "Batman",
    "Batmana",
    "Ambulans",
    "Bunkier strachu",
    "Córka",
    "Curka",  # fixed: a missing comma used to glue this to "Uncharted"
    "Uncharted",
    "Inni ludzie",
    "Śmierć na Nilu",
    "Skarb Mikołajka",
]

dates = [
    "w najbliższy poniedziałek",
    "w najbliższy wtorek",
    "w najbliższą środę",
    "w najbliższy czwartek",
    "w najbliższy piątek",
    "w najbliższą sobotę",
    "w najbliższą niedzielę",
    "w poniedziałek",
    "we wtorek",
    "w środę",
    "w czwartek",
    "w piątek",
    "w sobotę",
    "w niedzielę",
    "na najbliższy poniedziałek",
    "na najbliższy wtorek",
    "na najbliższą środę",
    "na najbliższy czwartek",
    "na najbliższy piątek",
    "na najbliższą sobotę",
    "na najbliższą niedzielę",
    "najbliższy poniedziałek",
    "najbliższy wtorek",
    "najbliższa środa",
    "najbliższy czwartek",
    "najbliższy piątek",
    "najbliższa sobota",
    "najbliższa niedziela",
    "na jutro",
    "jutro",
    "w dniu jutrzejszym",
    "po jutrze",
    "pojutrze",
    "za dwa dni",
    "za trzy dni",
    "za tydzień",
    "dzisiaj",
    "dziś",
    r"na dzień [0-9]{1,2}[ /][0-9]{1,2}",
    r"na [0-9]{1,2}[ /][0-9]{1,2}",
    r"dzień [0-9]{1,2}[ /][0-9]{1,2}",
    r"dnia [0-9]{1,2}[ /][0-9]{1,2}",
    r"[0-9]{1,2}[ /][0-9]{1,2}",
]

times = [
    r"wybieram: [0-9]{1,2}[:][0-9]{1,2}",
    r"wybieram [0-9]{1,2}[:][0-9]{1,2}",
    r"na godzinę [0-9]{1,2}[:][0-9]{1,2}",
    r"na godzina [0-9]{1,2}[:][0-9]{1,2}",
    r"o godzinie [0-9]{1,2}[:][0-9]{1,2}",
    r"o [0-9]{1,2}[:][0-9]{1,2}",
    r"o [0-9]{2}",
    r"na [0-9]{1,2}",
    r"na [0-9]{1,2}[:][0-9]{1,2}",
    r"godzina [0-9]{1,2}[:][0-9]{1,2}",
    r"[0-9]{1,2}[:][0-9]{1,2}",
    "rano",
    "wieczorem",
    "w południe",
    "po południu",
    "popołudniu",
    "w nocy",
    # Compound hours go before the single-word ones; otherwise "pierwszą"
    # would win over "dwudziestą pierwszą" and only half the value is tagged.
    "dwudziestą pierwszą",
    "dwudziestą drugą",
    "dwudziestą trzecią",
    "dwudziestą czwartą",
    "pierwszą",
    "drugą",
    "trzecią",
    "czwartą",  # was missing from the original list
    "piątą",
    "szóstą",
    "siódmą",
    "ósmą",
    "dziewiątą",  # correct spelling; the diacritic-less variant is kept below
    "dziewiatą",
    "dziesiątą",
    "jedenastą",
    "dwunastą",
    "trzynastą",
    "czternastą",
    "piętnastą",
    "szesnastą",
    "siedemnastą",
    "osiemnastą",
    "dziewiętnastą",
    "dwudziestą",  # was missing from the original list
    "o północy",
]

quantities = [
    r"chcę [1-9][0-9]{0,1}",
    r"poproszę [1-9][0-9]{0,1}",
    r"[1-9][0-9]{0,1}",
]

# One seat is a letter followed by one or two digits. The list holds 5..1
# space-separated seats, then 5..2 seats separated by "," and by ", ".
# NOTE(review): removePunctation() strips commas before annotation, so the
# two comma-separated families can never match text coming from processFile.
_SEAT = r"[a-zA-Z][0-9]{1,2}"
seats = (
    [" ".join([_SEAT] * count) for count in range(5, 0, -1)]
    + [",".join([_SEAT] * count) for count in range(5, 1, -1)]
    + [", ".join([_SEAT] * count) for count in range(5, 1, -1)]
)

areas = [
    "u góry po prawej",
    "u góry po lewej",
    "u góry na środku",
    "na środku po prawej",
    "na środku po lewej",
    "na dole po prawej",
    "na dole po lewej",
    "na dole na środku",
    "po lewej",
    "po prawej",
    "na środku",
    "lewo",
    "prawo",
    "środek",
    "blisko od ekranu",
    "daleko od ekranu",
    "blisko ekranu",
    "daleko ekranu",
]

genres = [
    "komedia romantyczna",  # must precede "komedia" (first match wins)
    "akcja",
    "akcji",  # fixed: a missing comma used to glue this to "dramat"
    "dramat",
    "komedia",
    "Horror",
    "Thriller",
    "science fiction",
    "romans",
    "bajka",
    "rodzinny",
    "animowany",
]

# slot names
slots = [
    "tytul",
    "date",
    "time",
    "quantity",
    "seats",
    "area",
    "genres",
]

# Marker tokens that wrap an annotated value inside the working text:
#   "... **/start** <slot name> <value tokens> **/end** ..."
SLOT_START = "**/start**"
SLOT_END = "**/end**"

_PUNCTUATION = re.compile(r"[!@#$,\"'?|.]")
_ANNOTATED_SPAN = re.compile(r"(\*\*/start\*\* .*? \*\*/end\*\*)")


def removePunctation(text):
    """Return ``text`` with the characters ``! @ # $ , " ' ? | .`` removed."""
    return _PUNCTUATION.sub("", text)


def dummyTextAnnotation(text, sampelValuesList, slotName):
    """Wrap occurrences of the first matching sample value in slot markers.

    Args:
        text: Utterance, possibly already containing markers from earlier
            passes for other slots.
        sampelValuesList: Regular-expression fragments tried in order; only
            the first one that matches is applied (all of its occurrences).
        slotName: Slot name written right after the start marker.

    Returns:
        ``text`` with every whitespace-delimited match rewritten to
        ``" **/start** <slotName> <match> **/end** "``, or ``text`` unchanged
        when nothing matches.

    Matching is case-insensitive because processFile lower-cases utterances
    while some sample values ("Batman", "Horror") contain capitals. Spans that
    are already annotated are skipped, so a later pass cannot nest a second
    annotation inside an existing one (which made parseAnnotation raise).
    """
    # re.split with a capturing group yields plain text at even indices and
    # the already annotated spans at odd indices.
    segments = _ANNOTATED_SPAN.split(text)
    replacement = rf"\g<1> {SLOT_START} {slotName} \g<2> {SLOT_END} \g<3>"
    for sampleValue in sampelValuesList:
        pattern = re.compile(r"(^|\s)(" + sampleValue + r")($|\s)", re.IGNORECASE)
        plainSegments = segments[::2]
        if any(pattern.search(segment) for segment in plainSegments):
            segments[::2] = [
                pattern.sub(replacement, segment) for segment in plainSegments
            ]
            return "".join(segments)
    return text


def parseAnnotation(text, intent, textLen, cleanText, isTest, pathOut):
    """Append one annotated sentence to the train or test ``.conllu`` file.

    Args:
        text: Utterance with ``**/start** <slot> ... **/end**`` markers as
            produced by dummyTextAnnotation.
        intent: Intent label repeated on every token row.
        textLen: Kept for backward compatibility and ignored; rows are
            numbered from the tokens actually found, so a wrong count can no
            longer truncate the output silently.
        cleanText: Marker-free utterance written to the ``# text:`` line.
        isTest: Write to ``test.conllu`` when true, else ``train.conllu``.
        pathOut: Prefix (normally a directory ending with a separator) that
            the file name is appended to.

    The file receives three ``#`` header lines (text, intent, slots), one
    tab-separated row ``index, token, intent, label`` per token and a blank
    line. Labels are ``B-<slot>`` for the first token of a value,
    ``I-<slot>`` for the following ones and ``NoLabel`` elsewhere.
    """
    tokens = []
    labels = []
    slotEntries = []
    slotName = None
    slotWords = []

    stream = iter(text.split())
    for token in stream:
        if token == SLOT_START:
            # The token right after the start marker is the slot name.
            slotName = next(stream, None)
            slotWords = []
        elif token == SLOT_END:
            if slotName is not None:
                # Join with spaces so multi-word values stay readable.
                slotEntries.append(" ".join(slotWords) + ":" + slotName)
            slotName = None
        elif slotName is not None:
            labels.append(("I-" if slotWords else "B-") + slotName)
            slotWords.append(token)
            tokens.append(token)
        else:
            tokens.append(token)
            labels.append("NoLabel")

    slotLabel = ",".join(slotEntries)
    rows = pd.DataFrame(
        {
            "id": range(1, len(tokens) + 1),
            "token": tokens,
            "intent": [intent] * len(tokens),
            "label": labels,
        }
    )

    path = pathOut + ("test.conllu" if isTest else "train.conllu")
    with open(path, "a", encoding="utf-8") as outputFile:
        outputFile.write(
            f"# text: {cleanText}\n# intent: {intent}\n# slots: {slotLabel}\n"
        )
    rows.to_csv(path, header=False, index=False, sep="\t", mode="a", encoding="utf-8")
    with open(path, "a", encoding="utf-8") as outputFile:
        outputFile.write("\n")


def processFile(pathIn, pathOut, fileName, isTest):
    """Annotate every user utterance of one dialog file.

    The file ``pathIn + fileName`` is read as a header-less, tab-separated
    table: column 0 is the speaker, column 1 the utterance, column 2 the
    intent. Only rows whose speaker is ``"user"`` are annotated and appended
    (via parseAnnotation) under ``pathOut``.
    """
    dialog_df = pd.read_csv(pathIn + fileName, sep="\t", header=None, encoding="utf-8")

    # Order matters: each pass sees the markers left by the previous ones.
    annotators = (
        (titles, slots[0]),
        (dates, slots[1]),
        (times, slots[2]),
        (quantities, slots[3]),
        (seats, slots[4]),
        (areas, slots[5]),
        (genres, slots[6]),  # fixed: genres were labelled with the "area" slot
    )

    for row in dialog_df.itertuples(index=False, name=None):
        speaker, utterance, intent = row[0], row[1], row[2]
        # Empty cells are read as NaN (a float); skip them instead of
        # crashing and losing the rest of the file.
        if not isinstance(speaker, str) or speaker.strip() != "user":
            continue
        if not isinstance(utterance, str):
            continue

        text = removePunctation(utterance).lower()
        annotatedText = text
        for sampleValues, slotName in annotators:
            annotatedText = dummyTextAnnotation(annotatedText, sampleValues, slotName)

        parseAnnotation(
            annotatedText, intent, len(utterance.split()), text, isTest, pathOut
        )


def annotateData(pathIn="./empty_data/", pathOut="./train_data/"):
    """Annotate all dialog files found under ``pathIn``.

    Candidate names are ``dialog-II-JJ-NN.tsv`` (training) and
    ``dialog-II-JJ-NN(test).tsv`` (test) for II in 11..15, JJ in 0..19 and
    NN in 1..4. Candidates that do not exist are skipped. A file that fails
    while being processed is reported on stderr and the run continues with
    the next one.

    NOTE: output files are opened in append mode, so running this twice
    duplicates every sentence; remove ``train.conllu`` / ``test.conllu``
    first if a fresh data set is wanted.
    """
    # Without the output directory every write failed and the old bare
    # ``except: pass`` hid it, producing no data at all.
    outputDir = os.path.dirname(pathOut)
    if outputDir:
        os.makedirs(outputDir, exist_ok=True)

    for i in range(11, 16):
        for j in range(20):
            for nr in range(1, 5):
                stem = f"dialog-{i:02d}-{j:02d}-{nr:02d}"
                candidates = ((stem + ".tsv", False), (stem + "(test).tsv", True))
                for fileName, isTest in candidates:
                    if not os.path.isfile(pathIn + fileName):
                        continue
                    try:
                        processFile(pathIn, pathOut, fileName, isTest)
                    except Exception as error:  # best effort: keep going
                        print(f"Skipping {fileName}: {error!r}", file=sys.stderr)


# Notebook kernels and scripts both run with __name__ == "__main__".
if __name__ == "__main__":
    annotateData()

# Examples:
#   processFile("./empty_data/", "./train_data/", "dialog-12-01-01.tsv", isTest=True)
#   print(dummyTextAnnotation("poproszę bilet na batman", sampelValuesList=titles, slotName=slots[0]))