"""Clean the dialogue TSV files and extract their intents (lab 8).

For every dialogue file the third column is stripped of parenthesised
argument lists, the ``&`` / ``AND`` connectives are replaced by spaces and
repeated tokens are removed.  The cleaned copy is written under the same
file name into the output directory.
"""

import logging
import os

import pandas as pd

try:
    import regex
except ImportError:
    # ``regex`` is a third-party package; the only pattern used here is a
    # plain alternation that the standard-library ``re`` handles identically.
    import re as regex


_logger = logging.getLogger(__name__)

# Connectives separating the labels in the third column.  Compiled once
# instead of once per cell.
# NOTE(review): the bare "AND" alternative also matches inside longer
# upper-case tokens (e.g. "BRAND") -- confirm that this is intended.
_CONNECTIVES = regex.compile(r"( & |&| AND |AND)")


def removeParenthesis(text):
    """Return ``text`` without parenthesised parts (nesting is supported).

    Everything from an opening ``(`` up to its matching ``)`` is dropped,
    brackets included.  An unmatched ``)`` is kept as an ordinary character
    and no longer swallows the text that follows it; text after an
    unclosed ``(`` is dropped.
    """
    kept = []
    depth = 0
    for letter in text:
        if letter == "(":
            depth += 1
        elif letter == ")" and depth > 0:
            # Closes a group that is currently being skipped.
            depth -= 1
            continue
        if depth == 0:
            kept.append(letter)
    return "".join(kept)


def _uniqueTokens(text):
    """Join the whitespace-separated tokens of ``text``, dropping repeats.

    The first occurrence of each token keeps its position, so the result is
    the same on every run (a plain ``set`` yields a hash-dependent order).
    """
    seen = set()
    ordered = []
    for token in text.split():
        if token not in seen:
            seen.add(token)
            ordered.append(token)
    return " ".join(ordered)


def _cleanIntents(cell):
    """Apply the three cleaning steps to one cell of the third column."""
    # ``str`` mirrors the original handling: a missing cell (NaN) becomes
    # the literal text "nan".
    text = removeParenthesis(str(cell))
    text = _CONNECTIVES.sub(" ", text)
    return _uniqueTokens(text)


def processFile(pathIn, pathOut, fileName):
    """Clean the third column of one TSV file and write the result.

    Args:
        pathIn: Input directory; concatenated directly with ``fileName``,
            so it has to end with a path separator.
        pathOut: Output directory, same convention as ``pathIn``.  It must
            already exist.
        fileName: Name of the file to read from ``pathIn`` and to write to
            ``pathOut``.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        IndexError: If the file has fewer than three columns.
    """
    # The files are read as UTF-8 (an earlier revision experimented with
    # cp1250).
    encoding = "utf-8"
    dialog_df = pd.read_csv(pathIn + fileName, sep="\t", header=None,
                            encoding=encoding)

    # Replace the whole column rather than assigning through ``iloc`` so the
    # cleaned strings are accepted whatever dtype pandas inferred on read.
    intentColumn = dialog_df.columns[2]
    dialog_df[intentColumn] = dialog_df[intentColumn].apply(_cleanIntents)

    dialog_df.to_csv(pathOut + fileName, sep="\t", index=False, header=False)


def cleanAllData():
    """Clean every ``dialog-XX-YY-ZZ[(test)].tsv`` file found in ``./data/``.

    All name combinations with XX in 16..19, YY in 0..19 and ZZ in 1..4 are
    tried, each in a plain and a ``(test)`` variant.  Names without a
    matching file are skipped silently; any other failure is logged and the
    remaining files are still processed.
    """
    pathOut = './data/clean/'
    pathIn = "./data/"

    for i in range(16, 20):
        for j in range(20):
            for nr in range(1, 5):
                baseName = "dialog-{:02d}-{:02d}-{:02d}".format(i, j, nr)
                for fileName in (baseName + ".tsv",
                                 baseName + "(test)" + ".tsv"):
                    if not os.path.isfile(pathIn + fileName):
                        # Most combinations simply do not exist.
                        continue
                    try:
                        processFile(pathIn, pathOut, fileName)
                    except Exception as err:
                        # Best-effort batch: report the problem and go on.
                        _logger.warning("Skipping %s: %s", fileName, err)


cleanAllData()
# To clean a single file instead:
# processFile("./data/", './data/clean/', 'dialog-16-09-01.tsv')