"""Collect the "user" rows of the cleaned dialog TSV files into train/test TSVs."""
import logging
from itertools import chain, product
from pathlib import Path

import pandas as pd

try:
    # `true` is not referenced anywhere in this script.  The import is kept so
    # the name stays available to anything that expects it here, but a missing
    # sympy installation must not prevent the export from running.
    from sympy import true  # noqa: F401
except ImportError:
    pass

logger = logging.getLogger(__name__)


def flatten(t):
    """Return one flat list holding the items of every sub-iterable of *t*."""
    return list(chain.from_iterable(t))


# Disabled token cleaner, kept for reference:
#
# def getStrCleaned(rawMessage):
#     # / and : is needed for date and time recognition
#     punctuation = '!"#$%&\'()*+,-.;<=>?@[\\\\]^_`{|}~'
#     messageLower = rawMessage.lower()
#     # new_str = re.sub(' +', ' ', new_str)
#     for char in punctuation:
#         messageLower = messageLower.replace(char,'')
#     return messageLower.split()


def _readUserRows(fileName):
    """Read one dialog TSV and return ``(column 1, column 2)`` of its user rows.

    Only rows whose first column equals ``"user"`` are kept, and rows with a
    missing value in column 1 or column 2 are dropped.  Both lists are built
    before anything is returned, so they always have the same length.

    Raises:
        OSError: the file cannot be opened (``FileNotFoundError`` if absent).
        ValueError: pandas cannot parse the file (empty, ragged, bad encoding).
        KeyError: the file has fewer than three columns.
    """
    df = pd.read_csv(fileName, sep="\t", header=None, encoding="utf-8")
    # dropna() returns a new frame; the result has to be kept to take effect.
    dfUser = df[df[0] == "user"].dropna(subset=[1, 2])
    return dfUser[1].tolist(), dfUser[2].tolist()


def getData(testData, pathIn="./data/clean/"):
    """Gather the user rows of every available dialog file.

    File names follow ``dialog-AA-BB-CC.tsv`` (``dialog-AA-BB-CC(test).tsv``
    when *testData* is true) with AA in 16..19, BB in 0..19 and CC in 1..4,
    each zero-padded to two digits.  Not every combination is expected to
    exist: missing files are skipped silently.  Files that exist but cannot be
    read or lack the expected columns are skipped with a logged warning.

    Args:
        testData: if true, read the ``(test)`` variant of each file.
        pathIn: directory that contains the dialog files.

    Returns:
        A tuple ``(Xdata, Ydata)`` of two equally long flat lists holding
        column 1 and column 2 of all user rows, in file-name order.
    """
    inputDir = Path(pathIn)
    suffix = "(test).tsv" if testData else ".tsv"
    Xdata = []
    Ydata = []
    for i, j, nr in product(range(16, 20), range(20), range(1, 5)):
        fileName = inputDir / f"dialog-{i:02d}-{j:02d}-{nr:02d}{suffix}"
        try:
            xs, ys = _readUserRows(fileName)
        except FileNotFoundError:
            # Expected: the loops enumerate every possible file name.
            continue
        except (OSError, ValueError, KeyError) as err:
            logger.warning("Skipping %s: %s", fileName, err)
            continue
        # Extend both lists together so X and Y can never get out of step.
        Xdata.extend(xs)
        Ydata.extend(ys)
    return Xdata, Ydata


def main():
    """Write the train and test sets as header-less, index-less TSV files."""
    x, y = getData(False)
    xTest, yTest = getData(True)

    df = pd.DataFrame(list(zip(x, y)))
    dfTest = pd.DataFrame(list(zip(xTest, yTest)))

    df.to_csv('tasks/zad8/pl/dataSentence/train.tsv', sep="\t", index=False, header=False)
    dfTest.to_csv('tasks/zad8/pl/dataSentence/test.tsv', sep="\t", index=False, header=False)


if __name__ == "__main__":
    main()