2022-05-18 16:18:36 +02:00
|
|
|
import pandas as pd
|
2022-05-31 00:23:12 +02:00
|
|
|
from sympy import true
|
2022-05-18 16:18:36 +02:00
|
|
|
|
|
|
|
def flatten(t):
    """Return one flat list holding the items of every sub-iterable in *t*.

    Only one level of nesting is removed; the order of items is preserved.
    """
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat
|
|
|
|
|
2022-05-31 00:23:12 +02:00
|
|
|
# def getStrCleaned(rawMessage):
|
|
|
|
# # / and : is needed for date and time recognition
|
|
|
|
# punctuation = '!"#$%&\'()*+,-.;<=>?@[\\\\]^_`{|}~'
|
|
|
|
# messageLower = rawMessage.lower()
|
|
|
|
# # new_str = re.sub(' +', ' ', new_str)
|
|
|
|
# for char in punctuation:
|
|
|
|
# messageLower = messageLower.replace(char,'')
|
|
|
|
# return messageLower.split()
|
|
|
|
|
|
|
|
def getData(testData, pathIn="./data/clean/"):
    """Collect columns 1 and 2 of the "user" rows from the dialog TSV files.

    Files are looked up in *pathIn* as ``dialog-II-JJ-NN.tsv`` (or
    ``dialog-II-JJ-NN(test).tsv`` when *testData* is truthy) for II in 16..19,
    JJ in 00..19 and NN in 01..04. Each file is read as a headerless,
    tab-separated table.

    Args:
        testData: If truthy, read the "(test)"-suffixed files instead of the
            plain ones.
        pathIn: Directory that contains the dialog files.

    Returns:
        A tuple ``(Xdata, Ydata)`` of two flat lists of equal length: the
        column-1 and column-2 values of every row whose column 0 equals
        ``"user"`` and whose columns 1 and 2 are both present.

    Name combinations for which no file exists are skipped silently. Files
    that exist but cannot be read or lack the expected columns are skipped
    with a warning.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import warnings

    Xdata = []
    Ydata = []
    suffix = "(test).tsv" if testData else ".tsv"

    for i in range(16, 20):
        for j in range(20):
            for nr in range(1, 5):
                fileName = os.path.join(
                    pathIn,
                    "dialog-{:02d}-{:02d}-{:02d}{}".format(i, j, nr, suffix),
                )
                try:
                    df = pd.read_csv(
                        fileName, sep='\t', header=None, encoding="utf-8"
                    )
                    # dropna() returns a new frame; the result must be kept.
                    dfUser = df[df[0] == "user"].dropna(subset=[1, 2])
                    # Extract both columns before touching the result lists so
                    # a failure can never leave Xdata and Ydata misaligned.
                    xValues = dfUser[1].tolist()
                    yValues = dfUser[2].tolist()
                except FileNotFoundError:
                    # Most index combinations simply have no file.
                    continue
                except (OSError, ValueError, KeyError) as err:
                    # ValueError covers pandas parser / empty-data errors and
                    # decoding errors; KeyError means a column is missing.
                    warnings.warn(
                        "Skipping {}: {!r}".format(fileName, err),
                        stacklevel=2,
                    )
                    continue
                Xdata.extend(xValues)
                Ydata.extend(yValues)

    return Xdata, Ydata
|
|
|
|
|
2022-05-31 00:23:12 +02:00
|
|
|
# Load both splits: getData(True) reads the "(test)"-suffixed dialog files,
# getData(False) the plain ones.
x, y = getData(False)
xTest, yTest = getData(True)

# Pair the two lists returned by getData element by element.
dataTuples = [*zip(x, y)]
testDataTuples = [*zip(xTest, yTest)]

df = pd.DataFrame(dataTuples)
dfTest = pd.DataFrame(testDataTuples)

# Both outputs are written as tab-separated files without index or header.
_tsvOptions = {"sep": "\t", "index": False, "header": None}
df.to_csv('tasks/zad8/pl/dataSentence/train.tsv', **_tsvOptions)
dfTest.to_csv('tasks/zad8/pl/dataSentence/test.tsv', **_tsvOptions)
|