DSIC-Bayes-continuous/datapreparator.py

39 lines
1.2 KiB
Python
Raw Permalink Normal View History

2021-05-26 15:25:00 +02:00
from sklearn.model_selection import train_test_split
2021-05-26 13:32:48 +02:00
from copy import deepcopy
import pandas as pd
2021-05-26 15:25:00 +02:00
import typing
2021-05-26 13:32:48 +02:00
class DataPreparator:
    """Stateless helpers that turn the raw genre feature table into model input.

    Every method is a static method, so it can be called either on the class
    (``DataPreparator.prepare_data(df)``) or on an instance.
    """

    # Maps the textual genre label to the integer class id used as the target.
    genre_dict = {
        "blues": 1,
        "classical": 2,
        "country": 3,
        "disco": 4,
        "hiphop": 5,
        "jazz": 6,
        "metal": 7,
        "pop": 8,
        "reggae": 9,
        "rock": 10,
    }

    @staticmethod
    def prepare_data(df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with an integer ``genre`` column in front.

        The ``label`` column is translated through ``genre_dict`` and the
        result is inserted as the first column, named ``genre``. The
        ``filename``, ``label`` and ``length`` columns are then dropped.
        The input frame is not modified.

        Raises:
            KeyError: if a label is not present in ``genre_dict`` or one of
                the ``filename``/``label``/``length`` columns is missing.
            ValueError: if *df* already has a ``genre`` column.
        """
        data = df.copy()
        genre = df["label"].apply(lambda label: DataPreparator.genre_dict[label])
        # The 4th positional argument of DataFrame.insert is
        # ``allow_duplicates``, not a dtype, so the integer type is enforced
        # explicitly here instead of passing 'int' to insert().
        data.insert(0, "genre", genre.astype("int64"))
        return data.drop(columns=["filename", "label", "length"])

    @staticmethod
    def train_test_split(
        df: pd.DataFrame,
    ) -> typing.Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
        """Split *df* into train and test parts (80% / 20%).

        Only ``chroma_stft_mean``, ``chroma_stft_var`` and ``rms_mean`` are
        used as features; ``genre`` is the target.

        Returns:
            ``X_train, X_test, Y_train, Y_test`` as produced by
            ``sklearn.model_selection.train_test_split``.
        """
        X = df[["chroma_stft_mean", "chroma_stft_var", "rms_mean"]]
        Y = df["genre"]
        # Not recursion: class-level names are not visible inside a method
        # body, so this resolves to the module-level function imported from
        # sklearn.model_selection.
        # random_state=0 is the same seed the previous ``False`` evaluated
        # to; it keeps the split reproducible.
        return train_test_split(X, Y, test_size=0.20, random_state=0)

    @staticmethod
    def print_df_info(df: pd.DataFrame) -> None:
        """Print, for every known genre, how many rows of *df* carry its id."""
        for key, code in DataPreparator.genre_dict.items():
            count = int((df["genre"] == code).sum())
            print(f"Key: {key}\tCount: {count}")