SystemyDialogowe-ProjektMag.../tasks/zad8/pl/cleanData.py

import pandas as pd
import regex

# Cleans our dialogue data:
# normalises the intent labels for lab8 in the .tsv files with dialogues.
def removeParenthesis(text):
    """Return ``text`` with every parenthesised segment removed.

    Characters are dropped from an opening "(" up to and including its
    matching ")"; nesting is handled by tracking the current depth.
    An unclosed "(" removes everything up to the end of the string.
    A stray ")" that has no matching "(" is kept as an ordinary character
    and does not affect the text that follows it.

    Args:
        text: String to clean.

    Returns:
        The characters of ``text`` that lie outside any parentheses.
    """
    kept = []
    depth = 0
    for letter in text:
        if letter == "(":
            depth += 1
        elif letter == ")" and depth > 0:
            # Closes a group we are currently inside of.
            depth -= 1
        elif depth == 0:
            # Outside any group (this includes an unmatched ")"): keep it.
            # The depth never goes negative, so an unmatched ")" cannot
            # suppress the remainder of the string.
            kept.append(letter)
    return "".join(kept)

def processFile(pathIn, pathOut, fileName):
    """Clean the third column of one dialogue TSV file and save a copy.

    Reads ``pathIn + fileName`` as a headerless, tab-separated UTF-8 file,
    rewrites the third column (index 2) and writes the whole table to
    ``pathOut + fileName``, again tab-separated and without header or index.

    Every cell of the third column is processed as follows:
      1. it is converted to ``str`` and parenthesised parts are stripped
         with ``removeParenthesis``,
      2. the "&" / "AND" separators are replaced by a single space,
      3. duplicate whitespace-separated tokens are removed, keeping the
         order of first occurrence, and the rest is joined with spaces.

    Args:
        pathIn: Input directory prefix. It is concatenated with ``fileName``
            as-is, so it has to end with a path separator.
        pathOut: Output directory prefix, used the same way as ``pathIn``.
        fileName: Name of the file to read and of the file to write.
    """
    # path = './data/dialog-16-01-01.tsv'
    encoding = "utf-8"
    # encoding = "cp1250"

    def cleanCell(value):
        text = removeParenthesis(str(value))
        text = regex.sub(r"( & |&| AND |AND)", " ", text)
        # dict.fromkeys de-duplicates while keeping insertion order. A set
        # would also de-duplicate, but its iteration order for strings can
        # change from run to run, making the output file non-reproducible.
        return " ".join(dict.fromkeys(text.split()))

    dialog_df = pd.read_csv(pathIn + fileName, sep='\t', header=None, encoding=encoding)
    dialog_df.iloc[:, 2] = dialog_df.iloc[:, 2].apply(cleanCell)
    dialog_df.to_csv(pathOut + fileName, sep="\t", index=False, header=None)

def cleanAllData():
    """Clean every known dialogue file from ``./data/`` into ``./data/clean/``.

    The directory is not listed. Instead every candidate name
    ``dialog-II-JJ-NN.tsv`` and ``dialog-II-JJ-NN(test).tsv`` is tried for
    II in 16..19, JJ in 00..19 and NN in 01..04, and each one is passed to
    ``processFile``.

    Candidates that raise ``FileNotFoundError`` are skipped silently, since
    most generated names are not expected to exist. Any other failure is
    logged with its traceback and the run continues with the next candidate,
    so one broken file does not stop the whole pass.
    """
    import logging  # local import: only needed to report failures

    pathOut = './data/clean/'
    pathIn = "./data/"
    for i in range(16, 20):
        for j in range(20):
            for nr in range(1, 5):
                stem = "dialog-" + str(i).zfill(2) + "-" + str(j).zfill(2) + "-" + str(nr).zfill(2)
                # Each dialogue may exist as a regular and/or a "(test)" file.
                for fileName in (stem + ".tsv", stem + "(test)" + ".tsv"):
                    try:
                        processFile(pathIn, pathOut, fileName)
                    except FileNotFoundError:
                        # Expected for name combinations that were never recorded.
                        continue
                    except Exception:
                        # Still best-effort, but no longer hides real problems
                        # (and no longer traps KeyboardInterrupt/SystemExit).
                        logging.exception("Could not clean %s", pathIn + fileName)

# Module-level call: the whole cleaning pass runs whenever this file is
# executed or imported.
cleanAllData()

#processFile("./data/", './data/clean/', 'dialog-16-09-01.tsv')