SystemyDialogowe/clean_data.ipynb

2.7 KiB

import pandas as pd
import regex

# Cleans our data:
# extracts intents for lab8 from the .tsv files with dialogues.
def removeParenthesis(text):
    """Return *text* with every parenthesised group removed.

    Everything from an opening "(" up to its matching ")" is dropped,
    including the parentheses themselves; nested groups are handled by
    tracking the nesting depth. Text after an unclosed "(" is dropped up
    to the end of the string.

    A ")" that has no matching "(" is kept as an ordinary character and
    does not affect the nesting depth, so it can no longer cause the text
    that follows it to be discarded.
    """
    kept = []
    depth = 0
    for letter in text:
        if letter == "(":
            depth += 1
        elif letter == ")" and depth > 0:
            # Closes a group we are currently inside of.
            depth -= 1
        elif depth == 0:
            # Outside every group (this also covers a stray ")").
            kept.append(letter)
    return "".join(kept)

def processFile(pathIn, pathOut, fileName):
    """Clean the third column of one tab-separated dialogue file.

    Reads ``pathIn + fileName`` as a header-less TSV, rewrites the column
    at index 2 and writes the result to ``pathOut + fileName`` (again
    tab-separated, without header or index). The paths are joined by plain
    string concatenation, so both directories must end with a separator.

    Each cell of the column is cleaned as follows:
      1. parenthesised groups are removed (``removeParenthesis``);
      2. the separators "&" and "AND" are replaced by a space;
      3. the remaining whitespace-separated tokens are de-duplicated and
         joined with single spaces, keeping first-occurrence order so the
         output is identical from run to run.

    NOTE(review): cells are passed through ``str()`` first, so a missing
    value presumably ends up as the literal text "nan" - confirm this is
    intended.
    """
    def cleanCell(value):
        text = removeParenthesis(str(value))
        text = regex.sub(r"( & |&| AND |AND)", " ", text)
        # dict.fromkeys drops duplicates while preserving insertion order;
        # a set would yield a different token order on every run.
        return " ".join(dict.fromkeys(text.split()))

    # path = './lab/dialog-15-04-01.tsv'
    encoding = "utf-8"  # alternative left by the original author: "cp1250"
    dialog_df = pd.read_csv(pathIn + fileName, sep='\t', header=None, encoding=encoding)
    dialog_df.iloc[:, 2] = dialog_df.iloc[:, 2].apply(cleanCell)
    dialog_df.to_csv(pathOut + fileName, sep="\t", index=False, header=None)

def cleanAllData(pathIn="./data_v2/", pathOut="./empty_data/"):
    """Run ``processFile`` over every expected dialogue file.

    File names follow the pattern ``dialog-AA-BB-CC.tsv`` with AA in
    11..15, BB in 00..19 and CC fixed at 01. For every combination the
    plain file is tried first, then its ``(test)`` variant
    ``dialog-AA-BB-CC(test).tsv``.

    Parameters:
        pathIn: directory prefix the input files are read from.
        pathOut: directory prefix the cleaned files are written to.
    Both are concatenated directly with the file name and therefore must
    end with a path separator.

    Most name combinations are not expected to exist, so a missing input
    file is skipped silently. Any other failure is reported on stdout and
    the batch continues with the next file.
    """
    for i in range(11, 16):
        for j in range(20):
            for nr in range(1, 2):
                stem = f"dialog-{i:02d}-{j:02d}-{nr:02d}"
                for fileName in (stem + ".tsv", stem + "(test)" + ".tsv"):
                    try:
                        processFile(pathIn, pathOut, fileName)
                    except FileNotFoundError:
                        # This combination simply has no file.
                        continue
                    except Exception as err:
                        # Keep the batch best-effort, but do not hide
                        # real problems (bad encoding, malformed rows,
                        # missing output directory, ...).
                        print(f"Skipping {fileName}: {type(err).__name__}: {err}")

# Run the whole cleaning pass immediately when this code is executed.
cleanAllData()