284 lines
8.4 KiB
Python
284 lines
8.4 KiB
Python
|
import pandas as pd
|
||
|
import regex as re
|
||
|
|
||
|
# sample values
|
||
|
|
||
|
# Sample movie titles recognised as "title" slot values.
# processFile lower-cases every utterance before matching, so each entry has to
# be lower-case to be matchable at all.
titles = [
    "batmana",
    "batman",
    "na noże",
    # Longer title first: dummyTextAnnotation stops at the first sample that
    # matches, so the plain "zorro" entry would otherwise shadow this one.
    "zorro - jak to było na prawdę",
    "zorro",
    "transformer",
    "podróż na księżyc",
    "krzyk",
    "to nie wypanda",
    "inni ludzie",
    "ambulans",
    "uncharted",
    # The comma after this entry was missing, which silently fused it with the
    # following title through implicit string concatenation.
    "nasze magiczne encanto",
    "fdantastyczne zwierznera",
    "cud guadlupe",
    "piosenki o miłości",
    "cud guadalupe",
    "historia mojej żony",
    "matki równoległe",
    "wielka wolność",
    "najgorszy człowiek na świecie",
]
|
||
|
|
||
|
# Building blocks for the "date" sample values (plain phrases and regex
# fragments).  The final list order matters because matching stops at the
# first sample that hits.
_nextWeekdayAfterPreposition = [
    "najbliższy poniedziałek",
    "najbliższy wtorek",
    "najbliższą środę",
    "najbliższy czwartek",
    "najbliższy piątek",
    "najbliższą sobotę",
    "najbliższą niedzielę",
]
_weekdayWithPreposition = [
    "w poniedziałek",
    "we wtorek",
    "w środę",
    "w czwartek",
    "w piątek",
    "w sobotę",
    "w niedzielę",
]
_nextWeekdayBare = [
    "najbliższy poniedziałek",
    "najbliższy wtorek",
    "najbliższa środa",
    "najbliższy czwartek",
    "najbliższy piątek",
    "najbliższa sobota",
    "najbliższa niedziela",
]
_relativeDays = [
    "na jutro",
    "jutro",
    "w dniu jutrzejszym",
    "po jutrze",
    "pojutrze",
    "za dwa dni",
    "za trzy dni",
    "za tydzień",
    "dzisiaj",
    "dziś",
]
# Two 1-2 digit numbers separated by a space or a slash.
_numericDate = r"[0-9]{1,2}[ /][0-9]{1,2}"

dates = (
    ["w " + phrase for phrase in _nextWeekdayAfterPreposition]
    + _weekdayWithPreposition
    + ["na " + phrase for phrase in _nextWeekdayAfterPreposition]
    + _nextWeekdayBare
    + _relativeDays
    # Most specific prefix first, bare number pair last.
    + [prefix + _numericDate for prefix in ("na dzień ", "na ", "dzień ", "dnia ", "")]
)
|
||
|
|
||
|
# Sample values for the "time" slot: regex fragments for numeric times followed
# by plain phrases.  Order matters because matching stops at the first sample
# that hits, so more specific entries must precede the ones they contain.
times = [
    r"wybieram: [0-9]{1,2}[:][0-9]{1,2}",
    r"wybieram [0-9]{1,2}[:][0-9]{1,2}",
    r"na godzinę [0-9]{1,2}[:][0-9]{1,2}",
    r"na godzina [0-9]{1,2}[:][0-9]{1,2}",
    r"o godzinie [0-9]{1,2}[:][0-9]{1,2}",
    r"o [0-9]{1,2}[:][0-9]{1,2}",
    r"o [0-9]{2}",
    r"na [0-9]{1,2}",
    r"na [0-9]{1,2}[:][0-9]{1,2}",
    r"godzina [0-9]{1,2}[:][0-9]{1,2}",
    r"[0-9]{1,2}[:][0-9]{1,2}",
    "rano",
    "wieczorem",
    "w południe",
    "po południu",
    "popołudniu",
    "w nocy",
    # Two-word hours come before the single-word ones; otherwise e.g.
    # "dwudziestą pierwszą" would be claimed by the shorter "pierwszą".
    "dwudziestą pierwszą",
    "dwudziestą drugą",
    "dwudziestą trzecią",
    "dwudziestą czwartą",
    "dwudziestą",  # was missing from the list
    "pierwszą",
    "drugą",
    "trzecią",
    "czwartą",  # was missing from the list
    "piątą",
    "szóstą",
    "siódmą",
    "ósmą",
    "dziewiątą",
    # Original (misspelled) entry kept so input typed that way still matches.
    "dziewiatą",
    "dziesiątą",
    "jedenastą",
    "dwunastą",
    "trzynastą",
    "czternastą",
    "piętnastą",
    "szesnastą",
    "siedemnastą",
    "osiemnastą",
    "dziewiętnastą",
    "o północy",
]
|
||
|
|
||
|
# Regex fragments for the "quantity" slot: a number from 1 to 99, optionally
# introduced by "chcę" or "poproszę".  Prefixed variants come first.
_ticketCount = r"[1-9][0-9]{0,1}"
quantities = [prefix + _ticketCount for prefix in ("chcę ", "poproszę ", "")]
|
||
|
|
||
|
# Regex fragments for the "seats" slot.  A single seat is one ASCII letter
# followed by one or two digits; lists of seats are tried longest first, for
# each of the separators " ", "," and ", ".
# NOTE(review): processFile strips commas via removePunctation before matching,
# so the comma-separated variants presumably never match - confirm.
_singleSeat = r'[a-zA-Z][0-9]{1,2}'
seats = (
    [" ".join([_singleSeat] * count) for count in range(5, 0, -1)]
    + [",".join([_singleSeat] * count) for count in range(5, 1, -1)]
    + [", ".join([_singleSeat] * count) for count in range(5, 1, -1)]
)
|
||
|
|
||
|
# Sample phrases for the "area" slot, one per line.  Longer phrases are listed
# before the shorter ones they contain because matching stops at the first hit.
areas = """
u góry po prawej
u góry po lewej
u góry na środku
na środku po prawej
na środku po lewej
na dole po prawej
na dole po lewej
na dole na środku
po lewej
po prawej
na środku
lewo
prawo
środek
blisko od ekranu
daleko od ekranu
blisko ekranu
daleko ekranu
""".strip().splitlines()
|
||
|
|
||
|
# slots names
|
||
|
# Slot names, in the order processFile applies the sample lists
# (titles, dates, times, quantities, seats, areas).
slots = "title date time quantity seats area".split()
|
||
|
|
||
|
def removePunctation(text):
    """Return ``text`` with the characters ``! @ # $ , " ' ? | .`` deleted.

    The characters are removed outright (not replaced by a space), so tokens
    on either side of a removed character are joined together.
    """
    punctuationClass = re.compile(r'[!@#$,\"\'\?\'\"\|.]')
    return punctuationClass.sub('', text)
|
||
|
|
||
|
def dummyTextAnnotation(text, sampelValuesList, slotName):
    """Wrap the first matching sample value in ``**/start** <slot> ... **/end**`` markers.

    The samples are tried in list order.  Each one is treated as a regular
    expression fragment that must be delimited by whitespace or by the
    start/end of the text.  As soon as one sample matches, every occurrence of
    it is wrapped and the remaining samples are skipped, so at most one sample
    per slot type is annotated.

    Spans already wrapped by an earlier call are left untouched.  Without that
    protection a later pass could match inside an earlier annotation (for
    example a bare number inside an annotated time) and produce nested markers
    that parseAnnotation cannot handle.

    Args:
        text: Text to annotate; may already contain markers from earlier calls.
        sampelValuesList: Regex fragments / literal phrases, most specific first.
        slotName: Slot name written right after the ``**/start**`` marker.

    Returns:
        The annotated text, or ``text`` unchanged when no sample matches.
    """
    # Raw f-string: "\g<1>" has to reach the regex engine verbatim; in a
    # non-raw literal "\g" is an invalid escape sequence.
    replacement = rf"\g<1> **/start** {slotName} \g<2> **/end** \g<3>"

    # With one capturing group, split() alternates between free text (even
    # indexes) and already annotated spans (odd indexes).
    pieces = re.split(r"(?s)(\*\*/start\*\*.*?\*\*/end\*\*)", text)

    for sampleValue in sampelValuesList:
        # One pattern serves both the test and the substitution; the sample is
        # grouped so that it becomes \g<2> in the replacement.
        pattern = re.compile(r"(^|\s)(" + sampleValue + r")($|\s)")
        matched = False
        for index in range(0, len(pieces), 2):
            pieces[index], hits = pattern.subn(replacement, pieces[index])
            matched = matched or hits > 0
        if matched:
            break  # only one sample per slot type is annotated
    return "".join(pieces)
|
||
|
|
||
|
def parseAnnotation(text, intent, textLen, cleanText, isTest, pathOut):
    """Turn marker-annotated text into one labelled block and append it to a file.

    ``text`` is split on whitespace.  ``**/start** <slot>`` opens a slot and
    ``**/end**`` closes it; the markers and the slot name are not emitted as
    tokens.  The first token inside a slot is labelled ``B-<slot>``, the
    following ones ``I-<slot>``, and tokens outside any slot ``NoLabel``.

    The appended block consists of three comment lines (``# text:``,
    ``# intent:``, ``# slots:``), one tab-separated row per token
    (position, token, intent, label) and a terminating blank line.  The
    ``# slots:`` line lists ``value:slot`` pairs separated by commas, with the
    tokens of a multi-word value joined by single spaces.

    Args:
        text: Annotated text as produced by dummyTextAnnotation.
        intent: Value written into the intent column and the header.
        textLen: Kept for backward compatibility and no longer used.  Rows are
            numbered from the tokens actually found, so a mismatching count
            cannot silently drop tokens.
        cleanText: Text written on the ``# text:`` header line.
        isTest: Append to ``test.conllu`` when true, else to ``train.conllu``.
        pathOut: Prefix (directory plus trailing separator) of the output file.
    """
    tokens = []
    labels = []
    slotEntries = []   # finished "value:slot" pairs for the "# slots:" line
    valueTokens = []   # tokens of the slot that is currently open
    slotName = None    # None while outside of a slot
    prefix = "B-"

    words = iter(text.split())
    for word in words:
        if word == "**/start**":
            # The token right after the marker is the slot name, not text.
            slotName = next(words, None)
            valueTokens = []
            prefix = "B-"
        elif word == "**/end**":
            # An end marker without an open slot is ignored instead of
            # raising while building the label.
            if slotName is not None:
                slotEntries.append(" ".join(valueTokens) + ":" + slotName)
            slotName = None
        elif slotName is not None:
            valueTokens.append(word)
            tokens.append(word)
            labels.append(prefix + slotName)
            prefix = "I-"
        else:
            tokens.append(word)
            labels.append("NoLabel")

    slotLabel = ",".join(slotEntries)
    rows = [
        (position, token, intent, label)
        for position, (token, label) in enumerate(zip(tokens, labels), start=1)
    ]
    df = pd.DataFrame(rows)

    path = pathOut + ('test.conllu' if isTest else 'train.conllu')
    with open(path, "a", encoding="utf-8") as outputFile:
        outputFile.write(f"# text: {cleanText}\n# intent: {intent}\n# slots: {slotLabel}\n")
    df.to_csv(path, header=None, index=None, sep='\t', mode='a', encoding="utf-8")
    # A blank line separates this block from the next one appended to the file.
    with open(path, "a", encoding="utf-8") as outputFile:
        outputFile.write("\n")
|
||
|
|
||
|
|
||
|
def processFile(pathIn, pathOut, fileName, isTest):
    """Annotate every "user" row of one dialog TSV file and append the result.

    The file ``pathIn + fileName`` is read as a header-less, tab-separated
    table.  For each row whose column 0 (stripped) equals ``"user"``, the text
    in column 1 is stripped of punctuation and lower-cased, run through
    dummyTextAnnotation once per slot type (title, date, time, quantity,
    seats, area - in that order) and handed to parseAnnotation together with
    column 2 as the intent.

    Args:
        pathIn: Prefix (directory plus trailing separator) of the input file.
        pathOut: Output prefix forwarded to parseAnnotation.
        fileName: Name of the TSV file to read.
        isTest: Forwarded to parseAnnotation to pick the output file.
    """
    # The files are read as UTF-8; "cp1250" was the alternative noted here.
    dialog_df = pd.read_csv(pathIn + fileName, sep='\t', header=None, encoding="utf-8")
    dialog_df = dialog_df.reset_index()  # make sure indexes pair with number of rows

    # Sample list paired with its slot name, in the order they are applied.
    samplesBySlot = list(zip((titles, dates, times, quantities, seats, areas), slots))

    for _, row in dialog_df.iterrows():
        if row[0].strip() != "user":
            continue

        text = removePunctation(row[1]).lower()

        annotatedText = text
        for sampleValues, slotName in samplesBySlot:
            annotatedText = dummyTextAnnotation(annotatedText, sampleValues, slotName)

        # The token count is taken from the raw (unstripped) utterance.
        parseAnnotation(annotatedText, row[2], len(row[1].split()), text, isTest, pathOut)
|
||
|
|
||
|
def annotateData():
    """Run processFile over the whole range of dialog files.

    File names follow ``dialog-<i>-<j>-<nr>.tsv`` with two-digit, zero-padded
    numbers for ``i`` in 16..19, ``j`` in 0..19 and ``nr`` in 1..4.  For every
    combination the plain file is processed as training data and the
    ``(test)`` variant as test data.

    Processing stays best-effort: a missing file is skipped silently and any
    other failure is logged with its traceback before moving on to the next
    file.  Unlike the previous bare ``except``, KeyboardInterrupt and
    SystemExit are no longer swallowed.
    """
    import logging

    logger = logging.getLogger(__name__)
    pathOut = './tasks/zad8/pl/'
    pathIn = "./data/clean/"

    for i in range(16, 20):
        for j in range(20):
            for nr in range(1, 5):
                baseName = f"dialog-{i:02d}-{j:02d}-{nr:02d}"
                variants = ((baseName + ".tsv", False), (baseName + "(test).tsv", True))
                for fileName, isTest in variants:
                    try:
                        processFile(pathIn, pathOut, fileName, isTest)
                    except FileNotFoundError:
                        # Missing files are skipped silently, as before.
                        continue
                    except Exception:
                        # Keep going, but make real errors visible.
                        logger.exception("Failed to process %s", pathIn + fileName)
|
||
|
# Script entry: annotate a single dialog file as training data.
# To process the whole range of dialog files instead, call annotateData().
processFile(
    pathIn="./data/clean/",
    pathOut='./tasks/zad8/pl/',
    fileName="dialog-20-01-01.tsv",
    isTest=False,
)

# Manual check of the title annotator:
# testText = "dobrze dokonano rezerwacji na film transformer numer twojej rezeracji to 123890"
# print(dummyTextAnnotation(testText, sampelValuesList=titles, slotName=slots[0]))
|