SystemyDialogowe-ProjektMag.../tasks/zad8/pl/cleanData.py

48 lines
1.7 KiB
Python
Raw Permalink Normal View History

import pandas as pd
import regex
# Cleans our dialogue data:
# extracts the intents for lab 8 from the TSV files containing the dialogues.
def removeParenthesis(text):
    """Return *text* with every parenthesised span removed.

    The nesting depth of round brackets is tracked while scanning the string;
    a character is kept only while the depth is zero, so the brackets and
    everything they enclose (including nested groups) are dropped.

    Unbalanced input is handled as follows:

    * a ``)`` with no matching ``(`` is kept as ordinary text and leaves the
      depth untouched, so the text after it is preserved;
    * a ``(`` that is never closed drops the remainder of the string.

    Args:
        text: The string to strip.

    Returns:
        The characters of *text* that lie outside all parentheses.
    """
    keptChars = []
    depth = 0
    for letter in text:
        if letter == "(":
            depth += 1
        elif letter == ")" and depth > 0:
            # Closes a currently open group; never let the depth go negative,
            # otherwise everything after a stray ")" would be discarded.
            depth -= 1
        elif depth == 0:
            keptChars.append(letter)
    return "".join(keptChars)
def processFile(pathIn, pathOut, fileName):
    """Clean the third column of one tab-separated dialogue file.

    Reads ``pathIn + fileName`` as a header-less, UTF-8 encoded TSV file,
    rewrites every cell of the column at index 2 and writes the whole table
    to ``pathOut + fileName``, again as TSV without header or index.

    Each cell of that column is processed in three steps:

    1. it is converted to ``str`` and parenthesised spans are removed with
       ``removeParenthesis``;
    2. the separators ``&`` and ``AND`` (with or without surrounding spaces)
       are replaced by a single space;
    3. the remaining whitespace-separated tokens are de-duplicated and joined
       with single spaces.  The first occurrence of each token keeps its
       position, so the output is identical from run to run.

    NOTE(review): the ``AND`` alternative also matches inside longer
    upper-case words, and empty cells (read by pandas as NaN) end up as the
    literal text "nan" because of the ``str`` conversion -- confirm both are
    acceptable for the data at hand.

    Args:
        pathIn: Directory prefix of the input file (concatenated as-is, so it
            must end with a path separator).
        pathOut: Directory prefix of the output file (same convention).
        fileName: Name of the file, used for both input and output.

    Raises:
        Nothing is caught here; any error raised while reading, transforming
        or writing the file propagates to the caller.
    """
    def cleanCell(value):
        withoutArgs = removeParenthesis(str(value))
        separated = regex.sub(r"( & |&| AND |AND)", " ", withoutArgs)
        # dict.fromkeys removes duplicates while keeping insertion order; a
        # plain set would yield the tokens in an order that varies per run.
        return " ".join(dict.fromkeys(separated.split()))

    dialog_df = pd.read_csv(pathIn + fileName, sep="\t", header=None, encoding="utf-8")
    dialog_df.iloc[:, 2] = dialog_df.iloc[:, 2].apply(cleanCell)
    dialog_df.to_csv(pathOut + fileName, sep="\t", index=False, header=False)
def cleanAllData():
    """Clean every known dialogue file from ``./data/`` into ``./data/clean/``.

    Candidate file names have the form ``dialog-II-JJ-NN.tsv`` with ``II`` in
    16..19, ``JJ`` in 00..19 and ``NN`` in 01..04; for each of them the
    variant ``dialog-II-JJ-NN(test).tsv`` is tried as well.  Every candidate
    that exists is passed to ``processFile``.

    Candidates that do not exist are skipped silently, since most
    combinations are not expected to be present.  A file that exists but
    cannot be processed is reported on standard error and the run continues
    with the next one, so a real failure is no longer hidden.
    """
    import os
    import sys

    pathIn = "./data/"
    pathOut = "./data/clean/"

    for i in range(16, 20):
        for j in range(20):
            for nr in range(1, 5):
                stem = f"dialog-{i:02d}-{j:02d}-{nr:02d}"
                for fileName in (stem + ".tsv", stem + "(test).tsv"):
                    if not os.path.isfile(pathIn + fileName):
                        continue
                    try:
                        processFile(pathIn, pathOut, fileName)
                    except Exception as err:
                        # Best effort: keep going, but make the failure visible.
                        print(
                            f"cleanAllData: failed to process {fileName}: {err!r}",
                            file=sys.stderr,
                        )
# Run the full cleaning pass; this executes whenever the file is run or imported.
cleanAllData()
# Single-file variant, handy for debugging one dialogue at a time:
#processFile("./data/", './data/clean/', 'dialog-16-09-01.tsv')