SystemyDialogowe-ProjektMag.../tasks/zad8/pl/intentClassification.py

50 lines
1.7 KiB
Python
Raw Permalink Normal View History

2022-05-18 16:18:36 +02:00
import os

import pandas as pd
from sympy import true  # NOTE(review): appears unused in the visible code - possibly an accidental auto-import; confirm before removing
2022-05-18 16:18:36 +02:00
def flatten(t):
    """Return a single list with the items of every sub-iterable of ``t``.

    Exactly one level of nesting is removed; items that are themselves
    containers are copied into the result unchanged.

    Args:
        t: An iterable whose elements are themselves iterable.

    Returns:
        A new list holding the items of each element of ``t``, in order.
    """
    return [item for sublist in t for item in sublist]
2022-05-31 00:23:12 +02:00
# Disabled helper, kept for reference:
# def getStrCleaned(rawMessage):
#     # '/' and ':' are not stripped because they are needed for date and time recognition
#     punctuation = '!"#$%&\'()*+,-.;<=>?@[\\\\]^_`{|}~'
#     messageLower = rawMessage.lower()
#     # new_str = re.sub(' +', ' ', new_str)
#     for char in punctuation:
#         messageLower = messageLower.replace(char,'')
#     return messageLower.split()
def getData(testData, pathIn="./data/clean/"):
    """Collect columns 1 and 2 of the ``"user"`` rows from the dialog TSV files.

    Every file name of the form ``dialog-II-JJ-NN.tsv`` is probed inside
    ``pathIn`` (``II`` in 16..19, ``JJ`` in 00..19, ``NN`` in 01..04); when
    ``testData`` is truthy, ``(test)`` is inserted before the extension.
    Files are read as header-less, tab-separated UTF-8 tables. Only rows whose
    column 0 equals ``"user"`` are used, and rows with a missing value in
    column 1 or column 2 are dropped so both returned lists stay aligned.
    Presumably column 1 is the utterance text and column 2 its intent label -
    TODO confirm against the data format.

    Files that do not exist, are empty, or have fewer than three columns are
    skipped; any other read/parse error propagates to the caller.

    Args:
        testData: If truthy, read the ``(test)`` variant of each file.
        pathIn: Directory containing the dialog files.

    Returns:
        A tuple ``(Xdata, Ydata)`` of two flat lists of equal length holding
        the column-1 and column-2 values of the selected rows, in file order.
    """
    Xdata = []
    Ydata = []
    suffix = "(test).tsv" if testData else ".tsv"
    for i in range(16, 20):
        for j in range(20):
            for nr in range(1, 5):
                fileName = os.path.join(
                    pathIn, f"dialog-{i:02d}-{j:02d}-{nr:02d}{suffix}"
                )
                try:
                    df = pd.read_csv(fileName, sep='\t', header=None, encoding="utf-8")
                except (FileNotFoundError, pd.errors.EmptyDataError):
                    # Not every (i, j, nr) combination has a file; skipping
                    # missing or empty ones is the intended best-effort scan.
                    continue
                if not {0, 1, 2}.issubset(df.columns):
                    # Too few columns: skip the whole file rather than adding
                    # X values that have no matching Y value.
                    continue
                userRows = df[df[0] == "user"].dropna(subset=[1, 2])
                Xdata.extend(userRows[1].tolist())
                Ydata.extend(userRows[2].tolist())
    return Xdata, Ydata
2022-05-31 00:23:12 +02:00
# Build the training and test sets from the dialog files and store each one
# as a header-less, two-column TSV file.
x, y = getData(False)
xTest, yTest = getData(True)

# Pair each column-1 value with its column-2 value: one tuple per output row.
dataTuples = list(zip(x, y))
testDataTuples = list(zip(xTest, yTest))
df = pd.DataFrame(dataTuples)
dfTest = pd.DataFrame(testDataTuples)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('tasks/zad8/pl/dataSentence', exist_ok=True)

# header=False is the documented way to omit the header row.
df.to_csv('tasks/zad8/pl/dataSentence/train.tsv', sep="\t", index=False, header=False)
dfTest.to_csv('tasks/zad8/pl/dataSentence/test.tsv', sep="\t", index=False, header=False)