SystemyDialogowe/mark_data.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "db5f348a-d8b8-451e-8ff7-7b7b0fb917fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "import regex as re\n",
    "\n",
    "# sample values\n",
    "\n",
    "# Movie titles used as sample values for the tytul slot.\n",
    "# NOTE: processFile lowercases each utterance before matching, so these\n",
    "# capitalised entries can only match if the matcher ignores case.\n",
    "titles = [\n",
    "    \"Batman\",\n",
    "    \"Batmana\",\n",
    "    \"Ambulans\",\n",
    "    \"Bunkier strachu\",\n",
    "    \"Córka\",\n",
    "    \"Curka\",\n",
    "    \"Uncharted\",\n",
    "    \"Inni ludzie\",\n",
    "    \"Śmierć na Nilu\",\n",
    "    \"Skarb Mikołajka\",\n",
    "]\n",
    "\n",
    "# Date expressions for the date slot: literal phrases plus regex patterns\n",
    "# for two numbers of one or two digits separated by a space or a slash\n",
    "# (presumably day and month -- confirm against the dialog data).\n",
    "# dummyTextAnnotation tries the entries in list order and stops at the\n",
    "# first one that matches, so the order of this list matters.\n",
    "dates = [\n",
    "    \"w najbliższy poniedziałek\", \n",
    "    \"w najbliższy wtorek\", \n",
    "    \"w najbliższą środę\", \n",
    "    \"w najbliższy czwartek\", \n",
    "    \"w najbliższy piątek\",\n",
    "    \"w najbliższą sobotę\",\n",
    "    \"w najbliższą niedzielę\",\n",
    "    \"w poniedziałek\", \n",
    "    \"we wtorek\", \n",
    "    \"w środę\", \n",
    "    \"w czwartek\", \n",
    "    \"w piątek\",\n",
    "    \"w sobotę\",\n",
    "    \"w niedzielę\",\n",
    "    \"na najbliższy poniedziałek\", \n",
    "    \"na najbliższy wtorek\", \n",
    "    \"na najbliższą środę\", \n",
    "    \"na najbliższy czwartek\", \n",
    "    \"na najbliższy piątek\",\n",
    "    \"na najbliższą sobotę\",\n",
    "    \"na najbliższą niedzielę\",\n",
    "    \"najbliższy poniedziałek\", \n",
    "    \"najbliższy wtorek\", \n",
    "    \"najbliższa środa\", \n",
    "    \"najbliższy czwartek\", \n",
    "    \"najbliższy piątek\",\n",
    "    \"najbliższa sobota\",\n",
    "    \"najbliższa niedziela\",\n",
    "    \"na jutro\", \n",
    "    \"jutro\", \n",
    "    \"w dniu jutrzejszym\",\n",
    "    \"po jutrze\", \n",
    "    \"pojutrze\", \n",
    "    \"za dwa dni\", \n",
    "    \"za trzy dni\",\n",
    "    \"za tydzień\", \n",
    "    \"dzisiaj\",\n",
    "    \"dziś\",\n",
    "    r\"na dzień [0-9]{1,2}[ /][0-9]{1,2}\",\n",
    "    r\"na [0-9]{1,2}[ /][0-9]{1,2}\",\n",
    "    r\"dzień [0-9]{1,2}[ /][0-9]{1,2}\",\n",
    "    r\"dnia [0-9]{1,2}[ /][0-9]{1,2}\",\n",
    "    r\"[0-9]{1,2}[ /][0-9]{1,2}\",\n",
    "    ]\n",
    "\n",
    "# Time expressions for the time slot: regex patterns for clock times,\n",
    "# parts of the day, and spelled-out hours.\n",
    "# dummyTextAnnotation stops at the first entry that matches, so compound\n",
    "# hours are listed before the single-word ordinals they contain; otherwise\n",
    "# only the last word of the compound would be tagged.\n",
    "times = [\n",
    "    r\"wybieram: [0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    r\"wybieram [0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    r\"na godzinę [0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    r\"na godzina [0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    r\"o godzinie [0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    r\"o [0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    r\"o [0-9]{2}\",\n",
    "    r\"na [0-9]{1,2}\",\n",
    "    r\"na [0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    r\"godzina [0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    r\"[0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    \"rano\",\n",
    "    \"wieczorem\",\n",
    "    \"w południe\",\n",
    "    \"po południu\",\n",
    "    \"popołudniu\",\n",
    "    \"w nocy\",\n",
    "    # compound hours first (see the ordering note above)\n",
    "    \"dwudziestą pierwszą\",\n",
    "    \"dwudziestą drugą\",\n",
    "    \"dwudziestą trzecią\",\n",
    "    \"dwudziestą czwartą\",\n",
    "    \"pierwszą\",\n",
    "    \"drugą\",\n",
    "    \"trzecią\",\n",
    "    \"czwartą\",\n",
    "    \"piątą\",\n",
    "    \"szóstą\",\n",
    "    \"siódmą\",\n",
    "    \"ósmą\",\n",
    "    \"dziewiątą\",\n",
    "    \"dziesiątą\",\n",
    "    \"jedenastą\",\n",
    "    \"dwunastą\",\n",
    "    \"trzynastą\",\n",
    "    \"czternastą\",\n",
    "    \"piętnastą\",\n",
    "    \"szesnastą\",\n",
    "    \"siedemnastą\",\n",
    "    \"osiemnastą\",\n",
    "    \"dziewiętnastą\",\n",
    "    \"dwudziestą\",\n",
    "    \"o północy\",\n",
    "]\n",
    "\n",
    "# Patterns for the quantity slot: a number from 1 to 99 (no leading\n",
    "# zero), optionally preceded by one of the listed request words.\n",
    "quantities = [\n",
    "    r\"chcę [1-9][0-9]{0,1}\",\n",
    "    r\"poproszę [1-9][0-9]{0,1}\",\n",
    "    r\"[1-9][0-9]{0,1}\",\n",
    "]\n",
    "\n",
    "# Seat-list patterns for the seats slot.  A seat code is one letter followed\n",
    "# by one or two digits.  The list holds five down to one codes separated by\n",
    "# a space, then five down to two codes separated by a comma, then five down\n",
    "# to two codes separated by a comma and a space.\n",
    "seats = [\n",
    "    separator.join([r'[a-zA-Z][0-9]{1,2}'] * seatCount)\n",
    "    for separator, fewestSeats in ((' ', 1), (',', 2), (', ', 2))\n",
    "    for seatCount in range(5, fewestSeats - 1, -1)\n",
    "]\n",
    "\n",
    "# Phrases for the area slot (position in the auditorium).  Two-part\n",
    "# phrases are listed before the shorter phrases they contain, which matters\n",
    "# because dummyTextAnnotation stops at the first entry that matches.\n",
    "areas = [\n",
    "    \"u góry po prawej\",\n",
    "    \"u góry po lewej\",\n",
    "    \"u góry na środku\",\n",
    "    \"na środku po prawej\",\n",
    "    \"na środku po lewej\",\n",
    "    \"na dole po prawej\",\n",
    "    \"na dole po lewej\",\n",
    "    \"na dole na środku\",\n",
    "    \"po lewej\",\n",
    "    \"po prawej\",\n",
    "    \"na środku\",\n",
    "    \"lewo\",\n",
    "    \"prawo\",\n",
    "    \"środek\",\n",
    "    \"blisko od ekranu\",\n",
    "    \"daleko od ekranu\",\n",
    "    \"blisko ekranu\",\n",
    "    \"daleko ekranu\",\n",
    "]\n",
    "\n",
    "# Film genres used as sample values for the genres slot.\n",
    "# All entries are lowercase because processFile lowercases the utterance\n",
    "# before matching.  The two-word genre comes before the single word it\n",
    "# starts with, because dummyTextAnnotation stops at the first entry that\n",
    "# matches.\n",
    "genres = [\n",
    "    'komedia romantyczna',\n",
    "    'akcja',\n",
    "    'akcji',\n",
    "    'dramat',\n",
    "    'komedia',\n",
    "    'horror',\n",
    "    'thriller',\n",
    "    'science fiction',\n",
    "    'romans',\n",
    "    'bajka',\n",
    "    'rodzinny',\n",
    "    'animowany',\n",
    "]\n",
    "\n",
    "# Slot names written into the B-/I- labels of the output.\n",
    "# processFile picks them by position (slots[0], slots[1], ...), so the\n",
    "# order of this list must not change.\n",
    "slots = [\n",
    "    \"tytul\",\n",
    "    \"date\",\n",
    "    \"time\",\n",
    "    \"quantity\",\n",
    "    \"seats\",\n",
    "    \"area\",\n",
    "    \"genres\",\n",
    "    ]\n",
    "\n",
    "def removePunctation(text):\n",
    "    \"\"\"Return ``text`` without the characters ! @ # $ , ? | . and both quote marks.\n",
    "\n",
    "    Other characters, including ':' and '/', are left in place.\n",
    "    \"\"\"\n",
    "    punctuation = re.compile(r'[!@#$,\\\"\\'\\?\\'\\\"\\|.]')\n",
    "    return punctuation.sub('', text)\n",
    "\n",
    "def dummyTextAnnotation(text, sampelValuesList, slotName):\n",
    "    \"\"\"Wrap occurrences of the first matching sample value in slot markers.\n",
    "\n",
    "    Every entry of ``sampelValuesList`` is used as a regular expression and\n",
    "    tried in list order.  An entry matches only when it is delimited by\n",
    "    whitespace or by the start/end of ``text``.  For the first entry that\n",
    "    matches, each occurrence is rewritten to\n",
    "    ``**/start** <slotName> <matched text> **/end**`` and the remaining\n",
    "    entries are not tried.\n",
    "\n",
    "    Args:\n",
    "        text: utterance to annotate (may already contain markers).\n",
    "        sampelValuesList: literal phrases or regex patterns, in priority order.\n",
    "        slotName: name written right after the start marker.\n",
    "\n",
    "    Returns:\n",
    "        The annotated text, or ``text`` unchanged when no entry matches.\n",
    "    \"\"\"\n",
    "    # Raw f-string: the group references must reach the regex engine with\n",
    "    # their backslash intact; in a plain string \"\\g\" is an invalid escape.\n",
    "    replacement = rf\"\\g<1> **/start** {slotName} \\g<2> **/end** \\g<3>\"\n",
    "    for sampleValue in sampelValuesList:\n",
    "        # Case-insensitive, so capitalised samples still match utterances\n",
    "        # that were lowercased by the caller.\n",
    "        pattern = re.compile(r\"(^|\\s)(\" + sampleValue + r\")($|\\s)\", flags=re.IGNORECASE)\n",
    "        annotated, hits = pattern.subn(replacement, text)\n",
    "        if hits:\n",
    "            return annotated\n",
    "    # NOTE(review): a later pass can still match inside a span tagged by an\n",
    "    # earlier pass (e.g. a bare number inside a time), producing nested markers.\n",
    "    return text\n",
    "\n",
    "def parseAnnotation(text, intent, textLen, cleanText, isTest, pathOut):\n",
    "    \"\"\"Turn marker-annotated text into one CoNLL-U style block and append it to a file.\n",
    "\n",
    "    ``text`` may contain ``**/start** <slot> ... **/end**`` spans.  Tokens\n",
    "    inside a span are labelled ``B-<slot>`` (first token) and ``I-<slot>``\n",
    "    (following tokens); every other token is labelled ``NoLabel``.  The block\n",
    "    consists of three header lines (text, intent, slots), one tab-separated\n",
    "    row per token (id, token, intent, label) and a trailing blank line.\n",
    "\n",
    "    Args:\n",
    "        text: annotated utterance; it is split on whitespace.\n",
    "        intent: value written to the header and to every token row.\n",
    "        textLen: kept for backward compatibility.  Token ids are numbered\n",
    "            from the tokens actually emitted, so no row is lost when this\n",
    "            count disagrees with them.\n",
    "        cleanText: utterance written to the ``# text:`` header line.\n",
    "        isTest: append to ``test.conllu`` when true, otherwise ``train.conllu``.\n",
    "        pathOut: prefix prepended to the output file name.\n",
    "    \"\"\"\n",
    "    tokens = []\n",
    "    labels = []\n",
    "    slotEntries = []  # one \"<value>:<slot>\" string per annotated span\n",
    "    slotName = None   # name of the span currently open, if any\n",
    "    slotTokens = []\n",
    "    prefix = \"B-\"\n",
    "\n",
    "    words = iter(text.split())\n",
    "    for word in words:\n",
    "        if word == \"**/start**\":\n",
    "            # A start marker inside an open span closes that span first.\n",
    "            if slotName is not None:\n",
    "                slotEntries.append(\" \".join(slotTokens) + \":\" + slotName)\n",
    "            # The token right after the marker is the slot name.\n",
    "            slotName = next(words, None)\n",
    "            slotTokens = []\n",
    "            prefix = \"B-\"\n",
    "        elif word == \"**/end**\":\n",
    "            # An end marker without an open span is ignored.\n",
    "            if slotName is not None:\n",
    "                slotEntries.append(\" \".join(slotTokens) + \":\" + slotName)\n",
    "            slotName = None\n",
    "        elif slotName is not None:\n",
    "            tokens.append(word)\n",
    "            labels.append(prefix + slotName)\n",
    "            slotTokens.append(word)\n",
    "            prefix = \"I-\"\n",
    "        else:\n",
    "            tokens.append(word)\n",
    "            labels.append(\"NoLabel\")\n",
    "    if slotName is not None:\n",
    "        # Span left open at the end of the text.\n",
    "        slotEntries.append(\" \".join(slotTokens) + \":\" + slotName)\n",
    "\n",
    "    slotLabel = \",\".join(slotEntries)\n",
    "    rows = [\n",
    "        (position, token, intent, label)\n",
    "        for position, (token, label) in enumerate(zip(tokens, labels), start=1)\n",
    "    ]\n",
    "    df = pd.DataFrame(rows)\n",
    "\n",
    "    path = pathOut + ('test.conllu' if isTest else 'train.conllu')\n",
    "    # The file is always opened in append mode, so repeated calls (and\n",
    "    # repeated runs) keep adding blocks to the same file.\n",
    "    with open(path, \"a\", encoding=\"utf-8\") as outputFile:\n",
    "        outputFile.write(f\"# text: {cleanText}\\n# intent: {intent}\\n# slots: {slotLabel}\\n\")\n",
    "    df.to_csv(path, header=False, index=False, sep='\\t', mode='a', encoding=\"utf-8\")\n",
    "    with open(path, \"a\", encoding=\"utf-8\") as outputFile:\n",
    "        outputFile.write(\"\\n\")\n",
    "    \n",
    "\n",
    "def processFile(pathIn, pathOut, fileName, isTest):\n",
    "    \"\"\"Annotate every user turn of one dialog file and append it to the output.\n",
    "\n",
    "    The file ``pathIn + fileName`` is read as a headerless, tab-separated,\n",
    "    UTF-8 table.  Column 0 is compared with ``user`` to select rows, column 1\n",
    "    is the utterance and column 2 is passed on as the intent.  Each selected\n",
    "    utterance is stripped of punctuation, lowercased, annotated slot by slot\n",
    "    and handed to parseAnnotation.\n",
    "\n",
    "    Args:\n",
    "        pathIn: prefix prepended to ``fileName`` to locate the input.\n",
    "        pathOut: output prefix forwarded to parseAnnotation.\n",
    "        fileName: name of the dialog file.\n",
    "        isTest: forwarded to parseAnnotation to choose the output file.\n",
    "    \"\"\"\n",
    "    dialog_df = pd.read_csv(pathIn + fileName, sep='\\t', header=None, encoding=\"utf-8\")\n",
    "\n",
    "    # (sample values, slot name) pairs, applied in this order.\n",
    "    annotationPasses = [\n",
    "        (titles, slots[0]),\n",
    "        (dates, slots[1]),\n",
    "        (times, slots[2]),\n",
    "        (quantities, slots[3]),\n",
    "        (seats, slots[4]),\n",
    "        (areas, slots[5]),\n",
    "        (genres, slots[6]),\n",
    "    ]\n",
    "\n",
    "    for _, row in dialog_df.iterrows():\n",
    "        speaker, utterance = row[0], row[1]\n",
    "        # Empty cells are read as NaN (a float); skip such rows instead of\n",
    "        # failing on them and losing the rest of the file.\n",
    "        if not isinstance(speaker, str) or not isinstance(utterance, str):\n",
    "            continue\n",
    "        if speaker.strip() != \"user\":\n",
    "            continue\n",
    "\n",
    "        text = removePunctation(utterance).lower()\n",
    "        annotatedText = text\n",
    "        for sampleValues, slotName in annotationPasses:\n",
    "            annotatedText = dummyTextAnnotation(annotatedText, sampleValues, slotName)\n",
    "\n",
    "        # Token count of the cleaned text, i.e. of the tokens that get labelled.\n",
    "        parseAnnotation(annotatedText, row[2], len(text.split()), text, isTest, pathOut)\n",
    "\n",
    "def annotateData():\n",
    "    \"\"\"Annotate all dialog files from ./empty_data/ into ./train_data/.\n",
    "\n",
    "    Input names follow ``dialog-<ii>-<jj>-<nn>.tsv`` (training) and\n",
    "    ``dialog-<ii>-<jj>-<nn>(test).tsv`` (test) with ii in 11..15, jj in 0..19\n",
    "    and nn in 1..4, each zero-padded to two digits.  Combinations for which\n",
    "    no file exists are skipped; a file that fails to process is reported\n",
    "    and skipped.\n",
    "\n",
    "    NOTE: parseAnnotation opens the output files in append mode, so running\n",
    "    this function again appends the same sentences a second time.\n",
    "    \"\"\"\n",
    "    pathOut = './train_data/'\n",
    "    pathIn = \"./empty_data/\"\n",
    "    os.makedirs(pathOut, exist_ok=True)\n",
    "    for i in range(11, 16):\n",
    "        for j in range(20):\n",
    "            for nr in range(1, 5):\n",
    "                stem = f\"dialog-{i:02d}-{j:02d}-{nr:02d}\"\n",
    "                for fileName, isTest in ((stem + \".tsv\", False), (stem + \"(test).tsv\", True)):\n",
    "                    # Most name combinations do not exist; that is expected.\n",
    "                    if not os.path.isfile(pathIn + fileName):\n",
    "                        continue\n",
    "                    try:\n",
    "                        processFile(pathIn, pathOut, fileName, isTest)\n",
    "                    except Exception as error:\n",
    "                        # Keep going, but make the failure visible.\n",
    "                        print(f\"Skipping {fileName}: {error!r}\")\n",
    "# Run the annotation over all dialog files when the cell is executed.\n",
    "annotateData()\n",
    "# processFile(\"./empty_data/\", './train_data/', \"dialog-12-01-01.tsv\", isTest=True)\n",
    "# testText = \"dobrze dokonano rezerwacji na film transformer numer twojej rezeracji to 123890\"\n",
    "# print(dummyTextAnnotation(testText, sampelValuesList=titles, slotName=slots[0]))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}