# (file size: 2.7 KiB)
import pandas as pd
import regex
# Cleans our data: extracts the intents for lab8 from the .tsv files
# that contain the dialogues.
def removeParenthesis(text):
    """Return ``text`` with every parenthesised span removed.

    Nesting is handled by tracking the current parenthesis depth: a
    character is kept only while the depth is zero, so the parentheses
    themselves and everything between them are dropped. An opening
    parenthesis that is never closed removes the rest of the string.

    A stray ``)`` that has no matching ``(`` is kept as an ordinary
    character and does not change the depth.

    Args:
        text: The string to clean.

    Returns:
        The characters of ``text`` that lie outside all parentheses.
    """
    kept = []
    depth = 0
    for letter in text:
        if letter == "(":
            depth += 1
        if depth == 0:
            kept.append(letter)
        # Only close a parenthesis that was actually opened. Letting the
        # depth go negative would discard all text after a stray ")".
        if letter == ")" and depth > 0:
            depth -= 1
    return "".join(kept)
def processFile(pathIn, pathOut, fileName):
    """Clean the third column of one tab-separated dialogue file.

    Reads ``pathIn + fileName`` as a header-less, UTF-8, tab-separated
    file, rewrites the column at index 2 and writes the whole table to
    ``pathOut + fileName`` (tab-separated, no index, no header row).

    Each cell of column 2 is cleaned as follows:
      1. it is converted with ``str`` and parenthesised spans are removed
         with ``removeParenthesis``;
      2. the separators ``&`` and ``AND`` (with or without surrounding
         spaces) are replaced by a single space;
      3. the whitespace-separated tokens are de-duplicated and joined
         back with single spaces.

    Args:
        pathIn: Prefix prepended to ``fileName`` to form the input path
            (plain string concatenation, so include the trailing slash).
        pathOut: Prefix prepended to ``fileName`` to form the output path.
        fileName: Name of the file to read and to write.
    """
    encoding = "utf-8"
    separatorPattern = regex.compile(r"( & |&| AND |AND)")

    def cleanCell(value):
        # NOTE(review): str() turns a missing cell (NaN) into the literal
        # text "nan" -- confirm whether that is intended.
        withoutParens = removeParenthesis(str(value))
        spaced = separatorPattern.sub(" ", withoutParens)
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output is reproducible; iterating a set of strings yields an order
        # that can differ from one interpreter run to the next.
        return " ".join(dict.fromkeys(spaced.split()))

    dialog_df = pd.read_csv(pathIn + fileName, sep='\t', header=None, encoding=encoding)
    dialog_df.iloc[:, 2] = dialog_df.iloc[:, 2].apply(cleanCell)
    dialog_df.to_csv(pathOut + fileName, sep="\t", index=False, header=False)
def cleanAllData():
    """Run ``processFile`` over every candidate dialogue file name.

    Candidate names have the form ``dialog-II-JJ-NN.tsv`` with ``II`` in
    11..15, ``JJ`` in 0..19 and ``NN`` fixed to 01 (all zero-padded to two
    digits). Each name is also tried with a ``(test)`` suffix before the
    extension. Files are read from ``./data_v2/`` and written to
    ``./empty_data/``.

    A candidate that does not exist is skipped silently. Any other
    failure while processing a file is reported on stderr and the batch
    continues with the next candidate.
    """
    import sys

    pathOut = './empty_data/'
    pathIn = "./data_v2/"
    for i in range(11, 16):
        for j in range(20):
            for nr in range(1, 2):
                stem = "dialog-" + str(i).zfill(2) + "-" + str(j).zfill(2) + "-" + str(nr).zfill(2)
                for fileName in (stem + ".tsv", stem + "(test)" + ".tsv"):
                    try:
                        processFile(pathIn, pathOut, fileName)
                    except FileNotFoundError:
                        # Expected: not every name in the grid exists.
                        continue
                    except Exception as err:
                        # Stay best-effort, but do not hide real problems
                        # (malformed file, missing column, unwritable output).
                        print("Skipping " + fileName + ": " + repr(err), file=sys.stderr)
# Module-level call: the whole batch runs as soon as this file is executed
# (or imported).
cleanAllData()