SystemyDialogowe/.ipynb_checkpoints/mark_data-checkpoint.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "db5f348a-d8b8-451e-8ff7-7b7b0fb917fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "import regex as re\n",
    "\n",
    "# sample values\n",
    "\n",
    "# Sample movie titles used as patterns for the title slot.\n",
    "titles = [\n",
    "    \"Batman\",\n",
    "    \"Batmana\",\n",
    "    \"Ambulans\",\n",
    "    \"Bunkier strachu\",\n",
    "    \"Córka\",\n",
    "    \"Curka\",  # comma required: without it Python fuses this literal with the next one\n",
    "    \"Uncharted\",\n",
    "    \"Inni ludzie\",\n",
    "    \"Śmierć na Nilu\",\n",
    "    \"Skarb Mikołajka\",\n",
    "]\n",
    "\n",
    "# Date expressions used as patterns for the date slot. Each entry is a regex fragment;\n",
    "# dummyTextAnnotation tries them in list order and stops at the first one that hits.\n",
    "dates = [\n",
    "    \"w najbliższy poniedziałek\", \n",
    "    \"w najbliższy wtorek\", \n",
    "    \"w najbliższą środę\", \n",
    "    \"w najbliższy czwartek\", \n",
    "    \"w najbliższy piątek\",\n",
    "    \"w najbliższą sobotę\",\n",
    "    \"w najbliższą niedzielę\",\n",
    "    \"w poniedziałek\", \n",
    "    \"we wtorek\", \n",
    "    \"w środę\", \n",
    "    \"w czwartek\", \n",
    "    \"w piątek\",\n",
    "    \"w sobotę\",\n",
    "    \"w niedzielę\",\n",
    "    \"na najbliższy poniedziałek\", \n",
    "    \"na najbliższy wtorek\", \n",
    "    \"na najbliższą środę\", \n",
    "    \"na najbliższy czwartek\", \n",
    "    \"na najbliższy piątek\",\n",
    "    \"na najbliższą sobotę\",\n",
    "    \"na najbliższą niedzielę\",\n",
    "    \"najbliższy poniedziałek\", \n",
    "    \"najbliższy wtorek\", \n",
    "    \"najbliższa środa\", \n",
    "    \"najbliższy czwartek\", \n",
    "    \"najbliższy piątek\",\n",
    "    \"najbliższa sobota\",\n",
    "    \"najbliższa niedziela\",\n",
    "    \"na jutro\", \n",
    "    \"jutro\", \n",
    "    \"w dniu jutrzejszym\",\n",
    "    \"po jutrze\", \n",
    "    \"pojutrze\", \n",
    "    \"za dwa dni\", \n",
    "    \"za trzy dni\",\n",
    "    \"za tydzień\", \n",
    "    \"dzisiaj\",\n",
    "    \"dziś\",\n",
    "    r\"na dzień [0-9]{1,2}[ /][0-9]{1,2}\",\n",
    "    r\"na [0-9]{1,2}[ /][0-9]{1,2}\",\n",
    "    r\"dzień [0-9]{1,2}[ /][0-9]{1,2}\",\n",
    "    r\"dnia [0-9]{1,2}[ /][0-9]{1,2}\",\n",
    "    r\"[0-9]{1,2}[ /][0-9]{1,2}\",\n",
    "    ]\n",
    "\n",
    "# Time-of-day expressions used as patterns for the time slot. Each entry is a regex\n",
    "# fragment; they are tried in list order and the first hit wins, so compound hours are\n",
    "# listed before the single words they end with.\n",
    "times = [\n",
    "    r\"wybieram: [0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    r\"wybieram [0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    r\"na godzinę [0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    r\"na godzina [0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    r\"o godzinie [0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    r\"o [0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    r\"o [0-9]{2}\",\n",
    "    r\"na [0-9]{1,2}\",\n",
    "    r\"na [0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    r\"godzina [0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    r\"[0-9]{1,2}[:][0-9]{1,2}\",\n",
    "    \"rano\",\n",
    "    \"wieczorem\",\n",
    "    \"w południe\",\n",
    "    \"po południu\",\n",
    "    \"popołudniu\",\n",
    "    \"w nocy\",\n",
    "    # Compound hours first, otherwise only their last word would be tagged.\n",
    "    \"dwudziestą pierwszą\",\n",
    "    \"dwudziestą drugą\",\n",
    "    \"dwudziestą trzecią\",\n",
    "    \"dwudziestą czwartą\",\n",
    "    \"dwudziestą\",\n",
    "    \"pierwszą\",\n",
    "    \"drugą\",\n",
    "    \"trzecią\",\n",
    "    \"czwartą\",\n",
    "    \"piątą\",\n",
    "    \"szóstą\",\n",
    "    \"siódmą\",\n",
    "    \"ósmą\",\n",
    "    \"dziewiątą\",\n",
    "    \"dziewiatą\",  # spelling without the diacritic, kept so existing matches still work\n",
    "    \"dziesiątą\",\n",
    "    \"jedenastą\",\n",
    "    \"dwunastą\",\n",
    "    \"trzynastą\",\n",
    "    \"czternastą\",\n",
    "    \"piętnastą\",\n",
    "    \"szesnastą\",\n",
    "    \"siedemnastą\",\n",
    "    \"osiemnastą\",\n",
    "    \"dziewiętnastą\",\n",
    "    \"o północy\",\n",
    "]\n",
    "\n",
    "# Number expressions used as patterns for the quantity slot: a number from 1 to 99,\n",
    "# optionally preceded by a request verb.\n",
    "quantities = [\n",
    "    r\"chcę [1-9][0-9]{0,1}\",\n",
    "    r\"poproszę [1-9][0-9]{0,1}\",\n",
    "    r\"[1-9][0-9]{0,1}\",\n",
    "]\n",
    "\n",
    "# Seat identifiers used as patterns for the seats slot: one letter followed by one or\n",
    "# two digits, in runs of up to five seats separated by a space, a comma, or a comma\n",
    "# and a space.\n",
    "# NOTE(review): processFile strips commas before matching, so the comma variants can\n",
    "# only hit when a caller passes text that still contains them.\n",
    "seats = [\n",
    "    r'[a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2}',\n",
    "    r'[a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2}',\n",
    "    r'[a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2}',\n",
    "    r'[a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2}',\n",
    "    r'[a-zA-Z][0-9]{1,2}',\n",
    "    r'[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2}',\n",
    "    r'[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2}',\n",
    "    r'[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2}',\n",
    "    r'[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2}',\n",
    "    r'[a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}',\n",
    "    r'[a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}',\n",
    "    r'[a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}',\n",
    "    r'[a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}',    \n",
    "]\n",
    "\n",
    "# Seating-area phrases used as patterns for the area slot. Longer phrases come first\n",
    "# because matching stops at the first entry that hits.\n",
    "areas = [\n",
    "    \"u góry po prawej\",\n",
    "    \"u góry po lewej\",\n",
    "    \"u góry na środku\",\n",
    "    \"na środku po prawej\",\n",
    "    \"na środku po lewej\",\n",
    "    \"na dole po prawej\",\n",
    "    \"na dole po lewej\",\n",
    "    \"na dole na środku\",\n",
    "    \"po lewej\",\n",
    "    \"po prawej\",\n",
    "    \"na środku\",\n",
    "    \"lewo\",\n",
    "    \"prawo\",\n",
    "    \"środek\",\n",
    "    \"blisko od ekranu\",\n",
    "    \"daleko od ekranu\",\n",
    "    \"blisko ekranu\",\n",
    "    \"daleko ekranu\",\n",
    "]\n",
    "\n",
    "# Genre names used as patterns for the genres slot. Entries are tried in list order\n",
    "# and the first hit wins, so the two-word genre precedes its one-word prefix.\n",
    "genres = [\n",
    "    'akcja',\n",
    "    'akcji',  # comma required: without it Python fuses this literal with the next one\n",
    "    'dramat',\n",
    "    'komedia romantyczna',\n",
    "    'komedia',\n",
    "    'Horror',\n",
    "    'Thriller',\n",
    "    'science fiction',\n",
    "    'romans',\n",
    "    'bajka',\n",
    "    'rodzinny',\n",
    "    'animowany',\n",
    "]\n",
    "\n",
    "# Slot names. processFile picks them by position, so the order here must stay in\n",
    "# step with the order of the sample-value lists above.\n",
    "slots = [\n",
    "    \"tytul\",\n",
    "    \"date\",\n",
    "    \"time\",\n",
    "    \"quantity\",\n",
    "    \"seats\",\n",
    "    \"area\",\n",
    "    \"genres\",\n",
    "    ]\n",
    "\n",
    "def removePunctation(text):\n",
    "    \"\"\"Return `text` with selected punctuation characters deleted.\n",
    "\n",
    "    Removed: exclamation mark, at sign, hash, dollar, comma, double and single\n",
    "    quotes, question mark, pipe and full stop. Everything else is kept as is.\n",
    "    \"\"\"\n",
    "    punctuationPattern = r'[!@#$,\\\"\\'\\?\\'\\\"\\|.]'\n",
    "    strippedText = re.sub(punctuationPattern, '', text)\n",
    "    return strippedText\n",
    "\n",
    "def dummyTextAnnotation(text, sampelValuesList, slotName):\n",
    "    \"\"\"Wrap the first matching sample value in `text` with slot markers.\n",
    "\n",
    "    Samples are regex fragments tried in list order. A hit must be delimited by\n",
    "    whitespace or the string boundaries. Only the first sample that hits is\n",
    "    applied (to every occurrence of it); later samples are not tried.\n",
    "\n",
    "    Matching ignores case, because processFile lowercases the utterance while\n",
    "    several sample values are capitalised.\n",
    "\n",
    "    Returns the text with ` **/start** <slotName> <match> **/end** ` around each\n",
    "    hit, or the unchanged text when no sample matches.\n",
    "\n",
    "    NOTE: text that already carries markers from an earlier call is searched\n",
    "    as is, so a sample can also hit inside an existing slot span.\n",
    "    \"\"\"\n",
    "    for sampleValue in sampelValuesList:\n",
    "        # One pattern for both the test and the substitution keeps them in agreement.\n",
    "        pattern = r\"(^|\\s)(\" + sampleValue + r\")($|\\s)\"\n",
    "        if re.search(pattern, text, flags=re.IGNORECASE):\n",
    "            # Raw f-string: the group references stay literal for the regex engine\n",
    "            # instead of being parsed as (invalid) Python string escapes.\n",
    "            replacement = rf\"\\g<1> **/start** {slotName} \\g<2> **/end** \\g<3>\"\n",
    "            return re.sub(pattern, replacement, text, flags=re.IGNORECASE)\n",
    "    return text\n",
    "\n",
    "def parseAnnotation(text, intent, textLen, cleanText, isTest, pathOut):\n",
    "    \"\"\"Turn marker-annotated text into token rows and append one record to a .conllu file.\n",
    "\n",
    "    text      -- utterance with `**/start** <slot> ... **/end**` markers around slot values.\n",
    "    intent    -- intent label, written to the header and to every token row.\n",
    "    textLen   -- accepted for backward compatibility only; rows are numbered from the\n",
    "                 tokens actually found, so a too-small value can no longer drop tokens.\n",
    "    cleanText -- text written to the `# text:` header line.\n",
    "    isTest    -- True appends to test.conllu, False to train.conllu.\n",
    "    pathOut   -- prefix concatenated as is in front of the file name.\n",
    "\n",
    "    Each row is `index<TAB>token<TAB>intent<TAB>label`, where label is B-<slot> for the\n",
    "    first token of a slot, I-<slot> for the following ones and NoLabel elsewhere.\n",
    "    The record is a three-line `#` header, the rows, and one blank line.\n",
    "    \"\"\"\n",
    "    words = text.split()\n",
    "    tokens = []\n",
    "    labels = []\n",
    "    slotLabel = \"\"\n",
    "    slotVal = None\n",
    "    isSlot = False\n",
    "    prefix = \"\"\n",
    "    i = 0\n",
    "    while i < len(words):\n",
    "        word = words[i]\n",
    "        if word == \"**/start**\":\n",
    "            # The token right after the marker is the slot name, not utterance text.\n",
    "            isSlot = True\n",
    "            slotVal = words[i + 1]\n",
    "            if slotLabel:\n",
    "                slotLabel += \",\"\n",
    "            prefix = \"B-\"\n",
    "            i += 2\n",
    "        elif word == \"**/end**\":\n",
    "            isSlot = False\n",
    "            slotLabel += \":\" + slotVal\n",
    "            slotVal = None\n",
    "            i += 1\n",
    "        else:\n",
    "            tokens.append(word)\n",
    "            if isSlot:\n",
    "                slotLabel += word\n",
    "                labels.append(prefix + slotVal)\n",
    "                prefix = \"I-\"  # every further token of this slot continues it\n",
    "            else:\n",
    "                labels.append(\"NoLabel\")\n",
    "            i += 1\n",
    "\n",
    "    # Size the id and intent columns from the parsed tokens, not from the caller's count.\n",
    "    rowCount = len(tokens)\n",
    "    df = pd.DataFrame(list(zip(range(1, rowCount + 1), tokens, [intent] * rowCount, labels)))\n",
    "\n",
    "    path = pathOut + ('test.conllu' if isTest else 'train.conllu')\n",
    "    with open(path, \"a\", encoding=\"utf-8\") as outputFile:\n",
    "        outputFile.write(f\"# text: {cleanText}\\n# intent: {intent}\\n# slots: {slotLabel}\\n\")\n",
    "    df.to_csv(path, header=None, index=None, sep='\\t', mode='a')\n",
    "    with open(path, \"a\", encoding=\"utf-8\") as outputFile:\n",
    "        outputFile.write(\"\\n\")\n",
    "    \n",
    "\n",
    "def processFile(pathIn, pathOut, fileName, isTest):\n",
    "    \"\"\"Annotate the user utterances of one dialog file and append them to the output set.\n",
    "\n",
    "    The file pathIn + fileName is read as a headerless, tab-separated, UTF-8 table.\n",
    "    Only rows whose column 0 equals `user` (after stripping) are processed: column 1 is\n",
    "    the utterance and column 2 is passed on as the intent. Each utterance is stripped of\n",
    "    punctuation, lowercased, annotated slot by slot and written by parseAnnotation to\n",
    "    test.conllu (isTest true) or train.conllu under pathOut.\n",
    "    \"\"\"\n",
    "    encoding = \"utf-8\"\n",
    "    dialog_df = pd.read_csv(pathIn + fileName, sep='\\t', header=None, encoding=encoding)\n",
    "\n",
    "    # (sample values, slot name) pairs, applied in this order to every utterance.\n",
    "    annotationPasses = [\n",
    "        (titles, slots[0]),\n",
    "        (dates, slots[1]),\n",
    "        (times, slots[2]),\n",
    "        (quantities, slots[3]),\n",
    "        (seats, slots[4]),\n",
    "        (areas, slots[5]),\n",
    "        (genres, slots[6]),  # previously slots[5], which labelled genres as area\n",
    "    ]\n",
    "\n",
    "    dialog_df = dialog_df.reset_index()\n",
    "    for _, row in dialog_df.iterrows():\n",
    "        if row[0].strip() != \"user\":\n",
    "            continue\n",
    "        text = removePunctation(row[1]).lower()\n",
    "\n",
    "        annotatedText = text\n",
    "        for sampleValues, slotName in annotationPasses:\n",
    "            annotatedText = dummyTextAnnotation(annotatedText, sampleValues, slotName)\n",
    "\n",
    "        parseAnnotation(annotatedText, row[2], len(row[1].split()), text, isTest, pathOut)\n",
    "\n",
    "def annotateData():\n",
    "    \"\"\"Annotate every dialog file found in ./empty_data/ into ./train_data/.\n",
    "\n",
    "    Candidate names are enumerated as dialog-II-JJ-NN.tsv (II 11..15, JJ 00..19,\n",
    "    NN 01..04), each with a `(test)` variant that goes to the test set. Names with\n",
    "    no file on disk are skipped. A file that fails to process is reported and the\n",
    "    run continues with the next one, instead of the failure being hidden.\n",
    "    \"\"\"\n",
    "    pathOut = './train_data/'\n",
    "    pathIn = \"./empty_data/\"\n",
    "    for i in range(11, 16):\n",
    "        for j in range(20):\n",
    "            for nr in range(1, 5):\n",
    "                baseName = \"dialog-\" + str(i).zfill(2) + \"-\" + str(j).zfill(2) + \"-\" + str(nr).zfill(2)\n",
    "                candidates = ((baseName + \".tsv\", False), (baseName + \"(test)\" + \".tsv\", True))\n",
    "                for fileName, isTest in candidates:\n",
    "                    # The enumeration is a guess at the names; a missing file is not an error.\n",
    "                    if not os.path.isfile(pathIn + fileName):\n",
    "                        continue\n",
    "                    try:\n",
    "                        processFile(pathIn, pathOut, fileName, isTest)\n",
    "                    except Exception as error:\n",
    "                        # Best effort: keep the batch going, but make the failure visible.\n",
    "                        print(f\"Skipping {fileName}: {error!r}\")\n",
    "annotateData()\n",
    "# processFile(\"./empty_data/\", './train_data/', \"dialog-12-01-01.tsv\", isTest=True)\n",
    "# testText = \"dobrze dokonano rezerwacji na film transformer numer twojej rezeracji to 123890\"\n",
    "# print(dummyTextAnnotation(testText, sampelValuesList=titles, slotName=slots[0]))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
add model mlu 2022-05-18 00:06:14 +02:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"execution_count": 10,`
			`"id": "db5f348a-d8b8-451e-8ff7-7b7b0fb917fc",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import pandas as pd\n",`
			`"import regex as re\n",`
			`"\n",`
			`"# sample values\n",`
			`"\n",`
			`"titles = [\n",`
			`" \"Batman\",\n",`
			`" \"Batmana\",\n",`
			`" \"Ambulans\",\n",`
			`" \"Bunkier strachu\",\n",`
			`" \"Córka\",\n",`
			`" \"Curka\"\n",`
			`" \"Uncharted\",\n",`
			`" \"Inni ludzie\",\n",`
			`" \"Śmierć na Nilu\",\n",`
			`" \"Skarb Mikołajka\",\n",`
			`" ]\n",`
			`"\n",`
			`"dates = [\n",`
			`" \"w najbliższy poniedziałek\", \n",`
			`" \"w najbliższy wtorek\", \n",`
			`" \"w najbliższą środę\", \n",`
			`" \"w najbliższy czwartek\", \n",`
			`" \"w najbliższy piątek\",\n",`
			`" \"w najbliższą sobotę\",\n",`
			`" \"w najbliższą niedzielę\",\n",`
			`" \"w poniedziałek\", \n",`
			`" \"we wtorek\", \n",`
			`" \"w środę\", \n",`
			`" \"w czwartek\", \n",`
			`" \"w piątek\",\n",`
			`" \"w sobotę\",\n",`
			`" \"w niedzielę\",\n",`
			`" \"na najbliższy poniedziałek\", \n",`
			`" \"na najbliższy wtorek\", \n",`
			`" \"na najbliższą środę\", \n",`
			`" \"na najbliższy czwartek\", \n",`
			`" \"na najbliższy piątek\",\n",`
			`" \"na najbliższą sobotę\",\n",`
			`" \"na najbliższą niedzielę\",\n",`
			`" \"najbliższy poniedziałek\", \n",`
			`" \"najbliższy wtorek\", \n",`
			`" \"najbliższa środa\", \n",`
			`" \"najbliższy czwartek\", \n",`
			`" \"najbliższy piątek\",\n",`
			`" \"najbliższa sobota\",\n",`
			`" \"najbliższa niedziela\",\n",`
			`" \"na jutro\", \n",`
			`" \"jutro\", \n",`
			`" \"w dniu jutrzejszym\",\n",`
			`" \"po jutrze\", \n",`
			`" \"pojutrze\", \n",`
			`" \"za dwa dni\", \n",`
			`" \"za trzy dni\",\n",`
			`" \"za tydzień\", \n",`
			`" \"dzisiaj\",\n",`
			`" \"dziś\",\n",`
			`" r\"na dzień [0-9]{1,2}[ /][0-9]{1,2}\",\n",`
			`" r\"na [0-9]{1,2}[ /][0-9]{1,2}\",\n",`
			`" r\"dzień [0-9]{1,2}[ /][0-9]{1,2}\",\n",`
			`" r\"dnia [0-9]{1,2}[ /][0-9]{1,2}\",\n",`
			`" r\"[0-9]{1,2}[ /][0-9]{1,2}\",\n",`
			`" ]\n",`
			`"\n",`
			`"times = [\n",`
			`" r\"wybieram: [0-9]{1,2}[:][0-9]{1,2}\",\n",`
			`" r\"wybieram [0-9]{1,2}[:][0-9]{1,2}\",\n",`
			`" r\"na godzinę [0-9]{1,2}[:][0-9]{1,2}\",\n",`
			`" r\"na godzina [0-9]{1,2}[:][0-9]{1,2}\",\n",`
			`" r\"o godzinie [0-9]{1,2}[:][0-9]{1,2}\",\n",`
			`" r\"o [0-9]{1,2}[:][0-9]{1,2}\",\n",`
			`" r\"o [0-9]{2}\",\n",`
			`" r\"na [0-9]{1,2}\",\n",`
			`" r\"na [0-9]{1,2}[:][0-9]{1,2}\",\n",`
			`" r\"godzina [0-9]{1,2}[:][0-9]{1,2}\",\n",`
			`" r\"[0-9]{1,2}[:][0-9]{1,2}\",\n",`
			`" \"rano\",\n",`
			`" \"wieczorem\",\n",`
			`" \"w południe\",\n",`
			`" \"po południu\",\n",`
			`" \"popołudniu\",\n",`
			`" \"w nocy\",\n",`
			`" \"pierwszą\",\n",`
			`" \"drugą\",\n",`
			`" \"trzecią\",\n",`
			`" \"piątą\",\n",`
			`" \"szóstą\",\n",`
			`" \"siódmą\",\n",`
			`" \"ósmą\",\n",`
			`" \"dziewiatą\",\n",`
			`" \"dziesiątą\",\n",`
			`" \"jedenastą\",\n",`
			`" \"dwunastą\",\n",`
			`" \"trzynastą\",\n",`
			`" \"czternastą\",\n",`
			`" \"piętnastą\",\n",`
			`" \"szesnastą\",\n",`
			`" \"siedemnastą\",\n",`
			`" \"osiemnastą\",\n",`
			`" \"dziewiętnastą\",\n",`
			`" \"dwudziestą pierwszą\",\n",`
			`" \"dwudziestą drugą\",\n",`
			`" \"dwudziestą trzecią\",\n",`
			`" \"dwudziestą czwartą\",\n",`
			`" \"o północy\"\n",`
			`" ]\n",`
			`"\n",`
			`"quantities = [\n",`
			`" r\"chcę [1-9][0-9]{0,1}\",\n",`
			`" r\"poproszę [1-9][0-9]{0,1}\",\n",`
			`" r\"[1-9][0-9]{0,1}\",\n",`
			`"]\n",`
			`"\n",`
			`"seats = [\n",`
			`" r'[a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2}',\n",`
			`" r'[a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2}',\n",`
			`" r'[a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2}',\n",`
			`" r'[a-zA-Z][0-9]{1,2} [a-zA-Z][0-9]{1,2}',\n",`
			`" r'[a-zA-Z][0-9]{1,2}',\n",`
			`" r'[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2}',\n",`
			`" r'[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2}',\n",`
			`" r'[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2}',\n",`
			`" r'[a-zA-Z][0-9]{1,2},[a-zA-Z][0-9]{1,2}',\n",`
			`" r'[a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}',\n",`
			`" r'[a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}',\n",`
			`" r'[a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}',\n",`
			`" r'[a-zA-Z][0-9]{1,2}, [a-zA-Z][0-9]{1,2}', \n",`
			`"]\n",`
			`"\n",`
			`"areas = [\n",`
			`" \"u góry po prawej\",\n",`
			`" \"u góry po lewej\",\n",`
			`" \"u góry na środku\",\n",`
			`" \"na środku po prawej\",\n",`
			`" \"na środku po lewej\",\n",`
			`" \"na dole po prawej\",\n",`
			`" \"na dole po lewej\",\n",`
			`" \"na dole na środku\",\n",`
			`" \"po lewej\",\n",`
			`" \"po prawej\",\n",`
			`" \"na środku\",\n",`
			`" \"lewo\",\n",`
			`" \"prawo\",\n",`
			`" \"środek\",\n",`
			`" \"blisko od ekranu\",\n",`
			`" \"daleko od ekranu\",\n",`
			`" \"blisko ekranu\",\n",`
			`" \"daleko ekranu\",\n",`
			`"]\n",`
			`"\n",`
			`"genres = [\n",`
			`" 'akcja',\n",`
			`" 'akcji'\n",`
			`" 'dramat',\n",`
			`" 'komedia',\n",`
			`" 'Horror',\n",`
			`" 'Thriller',\n",`
			`" 'science fiction',\n",`
			`" 'romans',\n",`
			`" 'bajka',\n",`
			`" 'rodzinny',\n",`
			`" 'animowany',\n",`
			`" 'bajka',\n",`
			`" 'komedia romantyczna',\n",`
			`"\n",`
			`"]\n",`
			`"\n",`
			`"# slots names\n",`
			`"slots = [\n",`
			`" \"tytul\",\n",`
			`" \"date\",\n",`
			`" \"time\",\n",`
			`" \"quantity\",\n",`
			`" \"seats\",\n",`
			`" \"area\",\n",`
			`" \"genres\",\n",`
			`" ]\n",`
			`"\n",`
			`"def removePunctation(text):\n",`
			`" return re.sub(r'[!@#$,\\\"\\'\\?\\'\\\"\\\|.]', '', text)\n",`
			`"\n",`
			`"def dummyTextAnnotation(text, sampelValuesList, slotName):\n",`
			`" textJoin = text\n",`
			`" for sampleValue in sampelValuesList:\n",`
			`" if re.search(r\"(^\|\\s)\" + sampleValue + r\"($\|\\s)\", textJoin):\n",`
			`" textJoin = re.sub(r'(^\|\\s)(' + sampleValue + r')($\|\\s)', f\"\\g<1> /start {slotName} \\g<2> /end \\g<3>\", textJoin)\n",`
			`" break # can one sentence has only one slot of a given type?\n",`
			`" return textJoin\n",`
			`"\n",`
			`"def parseAnnotation(text, intent, textLen, cleanText, isTest, pathOut):\n",`
			`" textTokenize = text.split()\n",`
			`" slotLabel = \"\"\n",`
			`" col1 = [i for i in range(1, textLen + 1)]\n",`
			`" col2 = []\n",`
			`" col3 = [intent for _ in range(textLen)]\n",`
			`" col4 = []\n",`
			`" isSlot = False\n",`
			`" slotVal = None\n",`
			`" annotation = \"\"\n",`
			`" i = 0\n",`
			`" while i < len(textTokenize):\n",`
			`" if textTokenize[i] == \"/start\":\n",`
			`" isSlot = True\n",`
			`" slotVal = textTokenize[i + 1]\n",`
			`" if len(slotLabel) > 0:\n",`
			`" slotLabel += f','\n",`
			`" i += 2\n",`
			`" annotation = \"B-\"\n",`
			`" elif textTokenize[i] == \"/end\":\n",`
			`" isSlot = False\n",`
			`" slotLabel += \":\" + slotVal\n",`
			`" slotVal = None\n",`
			`" i += 1\n",`
			`" elif isSlot:\n",`
			`" slotLabel += textTokenize[i]\n",`
			`" col2.append(textTokenize[i])\n",`
			`" col4.append(annotation + slotVal)\n",`
			`" annotation = \"I-\"\n",`
			`" i += 1\n",`
			`" elif not isSlot:\n",`
			`" col2.append(textTokenize[i])\n",`
			`" col4.append(\"NoLabel\")\n",`
			`" i += 1\n",`
			`" df = pd.DataFrame(list(zip(col1, col2, col3, col4)))\n",`
			`" path = \"\"\n",`
			`" if isTest: path = pathOut + 'test.conllu'\n",`
			`" else: path = pathOut + 'train.conllu'\n",`
			`" with open(path, \"a\", encoding=\"utf-8\") as outputFile:\n",`
			`" outputFile.write(f\"# text: {cleanText}\\n# intent: {intent}\\n# slots: {slotLabel}\\n\")\n",`
			`" df.to_csv(path, header=None, index=None, sep='\\t', mode='a')\n",`
			`" with open(path, \"a\", encoding=\"utf-8\") as outputFile:\n",`
			`" outputFile.write(f\"\\n\")\n",`
			`" \n",`
			`"\n",`
			`"def processFile(pathIn, pathOut, fileName, isTest):\n",`
			`" # path = './data/dialog-16-01-01.tsv'\n",`
			`" encoding = \"utf-8\"\n",`
			`" # encoding = \"cp1250\"\n",`
			`" dialog_df = pd.read_csv(pathIn + fileName, sep='\\t', header=None, encoding=encoding)\n",`
			`" \n",`
			`" dialog_df = dialog_df.reset_index() # make sure indexes pair with number of rows\n",`
			`" for _, row in dialog_df.iterrows():\n",`
			`" if row[0].strip() == \"user\":\n",`
			`" text = removePunctation(row[1]).lower()\n",`
			`" \n",`
			`" # movies annotation\n",`
			`" annotatedText = dummyTextAnnotation(text, titles, slots[0])\n",`
			`"\n",`
			`" # dates annotation\n",`
			`" annotatedText = dummyTextAnnotation(annotatedText, dates, slots[1])\n",`
			`"\n",`
			`" # time\n",`
			`" annotatedText = dummyTextAnnotation(annotatedText, times, slots[2])\n",`
			`"\n",`
			`" # quantity\n",`
			`" annotatedText = dummyTextAnnotation(annotatedText, quantities, slots[3])\n",`
			`"\n",`
			`" # seats \n",`
			`" annotatedText = dummyTextAnnotation(annotatedText, seats, slots[4])\n",`
			`"\n",`
			`" # area\n",`
			`" annotatedText = dummyTextAnnotation(annotatedText, areas, slots[5])\n",`
			`" \n",`
			`" #genre\n",`
			`" annotatedText = dummyTextAnnotation(annotatedText, genres, slots[5])\n",`
			`" \n",`
			`" parseAnnotation(annotatedText, row[2], len(row[1].split()), text, isTest, pathOut)\n",`
			`" \n",`
			`" # dialog_df.to_csv(pathOut + \"test.conllu\", sep=\"\\t\", index=False, header=None)\n",`
			`"\n",`
			`"def annotateData():\n",`
			`" pathOut = './train_data/'\n",`
			`" pathIn = \"./empty_data/\"\n",`
			`" i = 0\n",`
			`" j = 0\n",`
			`" nr = 0\n",`
			`" for i in range(11,16):\n",`
			`" for j in range(20):\n",`
			`" for nr in range(1,5):\n",`
			`" fileName = \"dialog-\" + str(i).zfill(2) + \"-\" + str(j).zfill(2) + \"-\" + str(nr).zfill(2) + \".tsv\"\n",`
			`" try:\n",`
			`" processFile(pathIn, pathOut, fileName, False)\n",`
			`" except:\n",`
			`" pass\n",`
			`" try:\n",`
			`" fileName = \"dialog-\" + str(i).zfill(2) + \"-\" + str(j).zfill(2) + \"-\" + str(nr).zfill(2) + \"(test)\" + \".tsv\"\n",`
			`" processFile(pathIn, pathOut, fileName, True)\n",`
			`" except:\n",`
			`" pass\n",`
			`"annotateData()\n",`
			`"# processFile(\"./empty_data/\", './train_data/', \"dialog-12-01-01.tsv\", isTest=True)\n",`
			`"# testText = \"dobrze dokonano rezerwacji na film transformer numer twojej rezeracji to 123890\"\n",`
			`"# print(dummyTextAnnotation(testText, sampelValuesList=titles, slotName=slots[0]))"`
			`]`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "Python 3 (ipykernel)",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
add grammar 2022-06-05 18:56:12 +02:00			`"version": "3.9.7"`
add model mlu 2022-05-18 00:06:14 +02:00			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 5`
			`}`
No results found.