# SystemyDialogowe-ProjektMag.../tasks/zad8/pl/annotate.py

import pandas as pd
import regex as re

# sample values

# Movie titles (including misspellings and inflected forms) recognised as
# values of the "title" slot.
# dummyTextAnnotation() stops at the first entry that matches, so a longer
# title must be listed before any shorter title it starts with.
titles = [
    "batmana",
    "batman",
    "na noże",
    # Must precede plain "zorro", otherwise only the first word gets annotated.
    "zorro - jak to było na prawdę",
    "zorro",
    "transformer",
    "podróż na księżyc",
    "krzyk",
    "to nie wypanda",
    "inni ludzie",
    "ambulans",
    "uncharted",
    # The comma after this entry was missing, which silently merged it with
    # the following title into a single string that could never match.
    "nasze magiczne encanto",
    "fdantastyczne zwierznera",
    "cud guadlupe",
    "piosenki o miłości",
    # processFile() lower-cases the utterance before matching, so the
    # lower-case spelling is the one that can actually match there; the
    # capitalised variant is kept for callers that pass original-case text.
    "cud guadalupe",
    "cud Guadalupe",
    "historia mojej żony",
    "matki równoległe",
    "wielka wolność",
    "najgorszy człowiek na świecie",
]

# Phrases and regex patterns recognised as values of the "date" slot.
# Order matters: dummyTextAnnotation() uses the first entry that matches,
# so prefixed variants come before their bare counterparts.

# "Nearest <weekday>" in the accusative case (used after "w" / "na").
_nearestWeekdayAccusative = (
    "najbliższy poniedziałek",
    "najbliższy wtorek",
    "najbliższą środę",
    "najbliższy czwartek",
    "najbliższy piątek",
    "najbliższą sobotę",
    "najbliższą niedzielę",
)

# "On <weekday>" without the "nearest" adjective.
_plainWeekday = (
    "w poniedziałek",
    "we wtorek",
    "w środę",
    "w czwartek",
    "w piątek",
    "w sobotę",
    "w niedzielę",
)

# "Nearest <weekday>" in the nominative case.
_nearestWeekdayNominative = (
    "najbliższy poniedziałek",
    "najbliższy wtorek",
    "najbliższa środa",
    "najbliższy czwartek",
    "najbliższy piątek",
    "najbliższa sobota",
    "najbliższa niedziela",
)

# Expressions relative to the current day.
_relativeDay = (
    "na jutro",
    "jutro",
    "w dniu jutrzejszym",
    "po jutrze",
    "pojutrze",
    "za dwa dni",
    "za trzy dni",
    "za tydzień",
    "dzisiaj",
    "dziś",
)

# Two numbers of one or two digits separated by a space or a slash,
# optionally introduced by one of the prefixes below.
_numericDate = r"[0-9]{1,2}[ /][0-9]{1,2}"
_numericDatePrefixes = ("na dzień ", "na ", "dzień ", "dnia ", "")

dates = (
    ["w " + phrase for phrase in _nearestWeekdayAccusative]
    + list(_plainWeekday)
    + ["na " + phrase for phrase in _nearestWeekdayAccusative]
    + list(_nearestWeekdayNominative)
    + list(_relativeDay)
    + [prefix + _numericDate for prefix in _numericDatePrefixes]
)

# Phrases and regex patterns recognised as values of the "time" slot.
# dummyTextAnnotation() annotates with the first entry that matches and then
# stops, so more specific entries must come before the entries they contain.
times = [
    # Clock times written with digits, most specific prefix first.
    r"wybieram: [0-9]{1,2}[:][0-9]{1,2}",
    r"wybieram [0-9]{1,2}[:][0-9]{1,2}",
    r"na godzinę [0-9]{1,2}[:][0-9]{1,2}",
    r"na godzina [0-9]{1,2}[:][0-9]{1,2}",
    r"o godzinie [0-9]{1,2}[:][0-9]{1,2}",
    r"o [0-9]{1,2}[:][0-9]{1,2}",
    r"o [0-9]{2}",
    r"na [0-9]{1,2}",
    r"na [0-9]{1,2}[:][0-9]{1,2}",
    r"godzina [0-9]{1,2}[:][0-9]{1,2}",
    r"[0-9]{1,2}[:][0-9]{1,2}",
    # Parts of the day.
    "rano",
    "wieczorem",
    "w południe",
    "po południu",
    "popołudniu",
    "w nocy",
    # Two-word hours go before the one-word hours: previously "pierwszą",
    # "drugą" and "trzecią" matched first, so "dwudziestą pierwszą" was
    # annotated as just "pierwszą".
    "dwudziestą pierwszą",
    "dwudziestą drugą",
    "dwudziestą trzecią",
    "dwudziestą czwartą",
    # One-word hours.
    "pierwszą",
    "drugą",
    "trzecią",
    "czwartą",  # was missing from the sequence
    "piątą",
    "szóstą",
    "siódmą",
    "ósmą",
    "dziewiątą",  # correct spelling, previously absent
    "dziewiatą",  # misspelled variant kept so inputs typed this way still match
    "dziesiątą",
    "jedenastą",
    "dwunastą",
    "trzynastą",
    "czternastą",
    "piętnastą",
    "szesnastą",
    "siedemnastą",
    "osiemnastą",
    "dziewiętnastą",
    "dwudziestą",  # was missing; listed after the "dwudziestą ..." compounds
    "o północy",
]

# Regex patterns recognised as values of the "quantity" slot: a number from
# 1 to 99, optionally introduced by "chcę" or "poproszę".
# Prefixed variants come first so they win over the bare number.
_quantityNumber = r"[1-9][0-9]{0,1}"
quantities = [
    prefix + _quantityNumber
    for prefix in ("chcę ", "poproszę ", "")
]

# Regex patterns recognised as values of the "seats" slot.
# A single seat is one letter followed by one or two digits; a seat list is
# up to five seats separated by " ", "," or ", ".
_singleSeat = r'[a-zA-Z][0-9]{1,2}'


def _seatLists(separator, shortest):
    """Return seat-list patterns from five seats down to `shortest` seats."""
    return [separator.join([_singleSeat] * count) for count in range(5, shortest - 1, -1)]


# Longer lists come first within each separator group so that the longest
# possible run of seats is matched.
seats = (
    _seatLists(" ", 1)      # space-separated, including the single seat
    + _seatLists(",", 2)    # comma-separated
    + _seatLists(", ", 2)   # comma-and-space-separated
)

# Phrases recognised as values of the "area" slot (a region of the room).
# Two-part positions ("<row> <side>") come before the one-part phrases they
# contain, because dummyTextAnnotation() uses the first entry that matches.
_areaRows = ("u góry", "na środku", "na dole")
_areaSides = ("po prawej", "po lewej", "na środku")

areas = [
    f"{row} {side}"
    for row in _areaRows
    for side in _areaSides
    if row != side  # there is no "na środku na środku"
] + [
    "po lewej",
    "po prawej",
    "na środku",
    "lewo",
    "prawo",
    "środek",
    "blisko od ekranu",
    "daleko od ekranu",
    "blisko ekranu",
    "daleko ekranu",
]

# Slot names, in the order in which processFile() applies the sample lists:
# titles, dates, times, quantities, seats, areas.
slots = ["title", "date", "time", "quantity", "seats", "area"]

# Characters stripped from utterances: ! @ # $ , " ' ? | .
_punctuationPattern = re.compile(r'[!@#$,\"\'\?\'\"\|.]')


def removePunctation(text):
    """Return `text` with every listed punctuation character deleted.

    Characters are removed, not replaced by spaces, so "a,b" becomes "ab".
    Colons, slashes and hyphens are left untouched.
    """
    cleaned = _punctuationPattern.sub('', text)
    return cleaned

def dummyTextAnnotation(text, sampelValuesList, slotName):
    """Wrap the first recognised sample value in slot markers.

    The samples are tried in list order. The first one that occurs in `text`
    as a whole phrase (delimited by whitespace or the string boundaries) is
    wrapped as ``**/start** <slotName> <value> **/end**`` at every occurrence
    of that sample, and no further samples are tried.

    Args:
        text: Utterance to annotate; may already contain markers from
            earlier passes.
        sampelValuesList: Sample values; each entry is inserted into a regex
            unescaped, so it may be a literal phrase or a regex pattern.
        slotName: Slot name written right after the start marker.

    Returns:
        The annotated text, or `text` unchanged when no sample matches.
    """
    # Raw f-string: "\g<1>" in a non-raw literal is an invalid escape
    # sequence, which newer Python versions warn about.
    replacement = rf"\g<1> **/start** {slotName} \g<2> **/end** \g<3>"
    for sampleValue in sampelValuesList:
        pattern = r"(^|\s)(" + sampleValue + r")($|\s)"
        # subn reports whether anything matched, so one pass replaces the
        # former search-then-substitute pair.
        annotated, matchCount = re.subn(pattern, replacement, text)
        if matchCount:
            # Only one sample per slot type is annotated in a sentence.
            return annotated
    return text

def _splitSlotMarkers(tokens):
    """Turn marker-delimited tokens into words, BIO tags and a slot summary.

    A slot is written as ``**/start** <slotName> <token>... **/end**``.

    Returns:
        A tuple ``(words, tags, slotSummary)``: the tokens without markers
        and slot names, one tag per word ("B-<slot>", "I-<slot>" or
        "NoLabel"), and a comma-separated ``<value>:<slot>`` summary of the
        closed slots.
    """
    words = []
    tags = []
    slotEntries = []
    slotName = None   # name of the slot currently open, if any
    slotTokens = []   # words collected for the open slot
    prefix = ""
    i = 0
    while i < len(tokens):
        token = tokens[i]
        if token == "**/start**":
            # The token right after the start marker is the slot name.
            slotName = tokens[i + 1]
            slotTokens = []
            prefix = "B-"
            i += 2
        elif token == "**/end**":
            # Join the value with spaces; it used to be concatenated without
            # any separator ("wnajbliższypiątek").
            slotEntries.append(" ".join(slotTokens) + ":" + slotName)
            slotName = None
            i += 1
        else:
            words.append(token)
            if slotName is not None:
                slotTokens.append(token)
                tags.append(prefix + slotName)
                prefix = "I-"  # every further word of this slot continues it
            else:
                tags.append("NoLabel")
            i += 1
    return words, tags, ",".join(slotEntries)


def parseAnnotation(text, intent, textLen, cleanText, isTest, pathOut):
    """Append one annotated utterance to the train or test CoNLL-U file.

    Writes three comment lines (text, intent, slots), then one tab-separated
    row per token (index, token, intent, tag), then an empty line.

    Args:
        text: Utterance containing ``**/start** <slot> ... **/end**`` markers.
        intent: Intent label repeated on every token row.
        textLen: Upper bound on the number of token rows written.
        cleanText: Marker-free utterance written to the ``# text:`` line.
        isTest: Selects ``test.conllu`` when true, ``train.conllu`` otherwise.
        pathOut: Output directory prefix; the file name is appended to it
            directly, so it should end with a path separator.
    """
    words, tags, slotLabel = _splitSlotMarkers(text.split())

    # zip() stops at its shortest input, so at most textLen rows are produced.
    rows = list(zip(range(1, textLen + 1), words, [intent] * textLen, tags))
    df = pd.DataFrame(rows)

    path = pathOut + ('test.conllu' if isTest else 'train.conllu')
    with open(path, "a", encoding="utf-8") as outputFile:
        outputFile.write(f"# text: {cleanText}\n# intent: {intent}\n# slots: {slotLabel}\n")
    # pandas appends the token table to the same file.
    df.to_csv(path, header=None, index=None, sep='\t', mode='a')
    with open(path, "a", encoding="utf-8") as outputFile:
        # An empty line terminates the sentence block.
        outputFile.write("\n")


def processFile(pathIn, pathOut, fileName, isTest):
    """Annotate every user turn of one dialogue file and append it to the output.

    The file `pathIn + fileName` is read as a header-less, tab-separated,
    UTF-8 table. Column 0 is compared with "user", column 1 is the utterance
    and column 2 is passed on as the intent.

    Args:
        pathIn: Input directory prefix, joined to `fileName` by concatenation.
        pathOut: Output directory prefix handed to parseAnnotation().
        fileName: Name of the dialogue TSV file.
        isTest: Forwarded to parseAnnotation() to pick the output file.
    """
    dialogFrame = pd.read_csv(pathIn + fileName, sep='\t', header=None, encoding="utf-8")
    # Renumber the rows so the index lines up with the row count.
    dialogFrame = dialogFrame.reset_index()

    # Sample lists paired with their slot names, in application order.
    annotationPasses = list(zip((titles, dates, times, quantities, seats, areas), slots))

    for _, turn in dialogFrame.iterrows():
        if turn[0].strip() != "user":
            continue  # only user turns are annotated

        cleanText = removePunctation(turn[1]).lower()

        annotatedText = cleanText
        for sampleValues, slotName in annotationPasses:
            annotatedText = dummyTextAnnotation(annotatedText, sampleValues, slotName)

        # The token count is taken from the raw utterance, before clean-up.
        rawTokenCount = len(turn[1].split())
        parseAnnotation(annotatedText, turn[2], rawTokenCount, cleanText, isTest, pathOut)

def annotateData():
    """Annotate every dialogue file that follows the naming scheme.

    Tries ``dialog-II-JJ-NN.tsv`` (training data) and
    ``dialog-II-JJ-NN(test).tsv`` (test data) for II in 16..19, JJ in 0..19
    and NN in 1..4, reading from ``./data/clean/`` and writing to
    ``./tasks/zad8/pl/``. Files that do not exist are skipped silently; any
    other failure is reported on stderr and the run continues with the next
    file.
    """
    import sys

    pathOut = './tasks/zad8/pl/'
    pathIn = "./data/clean/"
    for i in range(16, 20):
        for j in range(20):
            for nr in range(1, 5):
                stem = f"dialog-{i:02d}-{j:02d}-{nr:02d}"
                candidates = ((stem + ".tsv", False), (stem + "(test).tsv", True))
                for fileName, isTest in candidates:
                    try:
                        processFile(pathIn, pathOut, fileName, isTest)
                    except FileNotFoundError:
                        # Most name combinations have no file; that is expected.
                        continue
                    except Exception as error:
                        # Keep the best-effort behaviour, but no longer hide
                        # real problems (and let KeyboardInterrupt through).
                        print(f"Skipping {fileName}: {error!r}", file=sys.stderr)
# Batch mode over all dialogue files (disabled):
# annotateData()

# Annotate a single dialogue file as training data.
processFile(
    pathIn="./data/clean/",
    pathOut='./tasks/zad8/pl/',
    fileName="dialog-20-01-01.tsv",
    isTest=False,
)

# Manual check of the title annotation (disabled):
# testText = "dobrze dokonano rezerwacji na film transformer numer twojej rezeracji to 123890"
# print(dummyTextAnnotation(testText, sampelValuesList=titles, slotName=slots[0]))