48 lines
1.7 KiB
Python
48 lines
1.7 KiB
Python
import pandas as pd
|
|
import regex
|
|
|
|
# cleans our data
|
|
# extracts intents for lab8 from TSV files with dialogues
|
|
def removeParenthesis(text):
    """Return ``text`` with every parenthesised span removed.

    Everything from an opening ``(`` up to and including its matching
    ``)`` is dropped; nested parentheses are handled by tracking the
    nesting depth. An opening parenthesis that is never closed removes
    the remainder of the string.

    A closing ``)`` with no matching ``(`` is kept in the output and does
    not affect the nesting depth, so the text that follows it is preserved.

    Args:
        text: The string to strip.

    Returns:
        A new string containing only the characters found outside
        parentheses.
    """
    kept = []
    depth = 0
    for letter in text:
        if letter == "(":
            depth += 1
        if depth == 0:
            kept.append(letter)
        # Only close a span that was actually opened; letting the depth go
        # negative would suppress every character after a stray ")".
        if letter == ")" and depth > 0:
            depth -= 1
    return "".join(kept)
|
|
|
|
def processFile(pathIn, pathOut, fileName):
    """Clean the third column of one tab-separated dialogue file.

    The file ``pathIn + fileName`` is read as header-less, UTF-8 encoded TSV.
    Each cell of the column at index 2 (presumably the intent annotations,
    per the comment at the top of this file) is cleaned by:

    1. removing every parenthesised span via ``removeParenthesis``;
    2. replacing the separators ``&`` and ``AND`` with a single space;
    3. dropping duplicate whitespace-separated tokens.

    The result is written to ``pathOut + fileName`` as TSV without an index
    or a header row.

    Args:
        pathIn: Directory prefix of the input file; it is concatenated with
            ``fileName`` as-is, so it must end with a path separator.
        pathOut: Directory prefix of the output file, concatenated the same
            way.
        fileName: Name of the file to read and to write.
    """
    def _cleanCell(value):
        # str() also stringifies non-text cells, so a missing value (NaN)
        # ends up as the literal token "nan".
        text = removeParenthesis(str(value))
        # NOTE(review): the pattern has no word boundaries, so "AND" is also
        # replaced when it occurs inside a longer upper-case token.
        text = regex.sub(r"( & |&| AND |AND)", " ", text)
        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # would emit the tokens in an order that varies between runs.
        return " ".join(dict.fromkeys(text.split()))

    # path = './data/dialog-16-01-01.tsv'
    encoding = "utf-8"
    # encoding = "cp1250"
    dialog_df = pd.read_csv(pathIn + fileName, sep='\t', header=None, encoding=encoding)
    dialog_df.iloc[:, 2] = dialog_df.iloc[:, 2].apply(_cleanCell)
    dialog_df.to_csv(pathOut + fileName, sep="\t", index=False, header=False)
|
|
|
|
def cleanAllData():
    """Clean every dialogue file found under ``./data/``.

    File names follow ``dialog-II-JJ-NN.tsv`` and ``dialog-II-JJ-NN(test).tsv``
    with ``II`` in 16..19, ``JJ`` in 00..19 and ``NN`` in 01..04. Each name
    that exists on disk is passed to ``processFile`` and the cleaned copy is
    written under ``./data/clean/``.

    The run is best-effort: names that do not exist are skipped, and a file
    that fails to process is reported through ``logging`` without stopping
    the remaining files.
    """
    import logging
    import os

    pathOut = './data/clean/'
    pathIn = "./data/"
    log = logging.getLogger(__name__)

    # Without the output directory every single write would fail.
    os.makedirs(pathOut, exist_ok=True)

    for i in range(16, 20):
        for j in range(20):
            for nr in range(1, 5):
                stem = "dialog-{:02d}-{:02d}-{:02d}".format(i, j, nr)
                for fileName in (stem + ".tsv", stem + "(test)" + ".tsv"):
                    # Most combinations of the three counters have no file;
                    # that is expected and not worth reporting.
                    if not os.path.isfile(pathIn + fileName):
                        continue
                    try:
                        processFile(pathIn, pathOut, fileName)
                    except Exception as err:
                        # Keep going, but do not hide real failures
                        # (malformed file, too few columns, write error...).
                        log.warning("could not clean %s: %s", pathIn + fileName, err)
|
|
|
|
# Entry point: the full cleaning pass runs as soon as this module is loaded
# (the call is unconditional, so it also runs on import).
cleanAllData()
|
|
|
|
# processFile("./data/", './data/clean/', 'dialog-16-09-01.tsv')