# Mirror of https://github.com/andre-wojtowicz/uci-ml-to-r.git
# (synced 2024-07-22 07:35:30 +02:00; 62 lines, 1.9 KiB, R)
# ---- preprocess-data ----
#
# Step 2 of the pipeline. For every entry listed in DATASETS.DIR this script
# sources that dataset's preprocessing script, calls the preprocess.dataset()
# function it is expected to define, normalises column names and factor
# levels, prints statistics and saves the result with saveRDS().
#
# All upper-case constants and the helpers setup.logger(),
# print.dataset.statistics() and flog.*() are not defined here; presumably
# they come from init.R / utils.R -- verify there.

source("init.R")
source("utils.R")

setup.logger(LOGGER.OUTPUT.S2.FILE, LOGGER.OVERWRITE.EXISTING.FILES)

flog.info("Step 2: preprocess dataset collection")

# Convert arbitrary labels into lower-case, syntactically valid, unique names.
#
# A single leading or trailing dot is stripped, make.names() replaces invalid
# characters (and, with allow_ = FALSE, underscores) by dots, and the result
# is lower-cased. Uniqueness is enforced AFTER lower-casing: doing it before
# (as make.names(unique = TRUE) would) lets labels that differ only in case,
# e.g. "A" and "a", collapse into duplicates, which yields duplicate column
# names or silently merged factor levels.
#
# labels: character vector of column names or factor levels.
# Returns a character vector of the same length.
normalize.labels = function(labels)
{
    trimmed = gsub("^\\.|\\.$", "", labels)
    lowered = tolower(make.names(trimmed, unique = FALSE, allow_ = FALSE))

    make.unique(lowered)
}

for (dir.name in dir(DATASETS.DIR))
{
    flog.info(paste("Dataset:", dir.name))

    # orig.dir is not used below; presumably the per-dataset preprocessing
    # script reads it as a global -- keep it defined (TODO confirm).
    orig.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.ORIGINAL.DIR)
    dest.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.PREPROCESSED.DIR)
    dest.file.path = file.path(dest.dir, DATASET.PREPROCESSED.OUTPUT.FILE)

    # Scalar, short-circuiting `||` is the correct operator for an `if`
    # condition; `|` is the element-wise form.
    if (!file.exists(dest.file.path) || OVERWRITE.OUTPUT.FILES)
    {
        r.src.file = file.path(DATASETS.DIR, dir.name,
                               DATASET.PREPROCESSING.SCRIPT)

        # NOTE(review): source() evaluates in the global environment, so if a
        # script fails to define preprocess.dataset() the definition left by
        # the previous dataset would be called instead.
        source(r.src.file)
        dataset = preprocess.dataset() # custom per-dataset preprocessing

        # change column names
        colnames(dataset) = normalize.labels(colnames(dataset))

        # change factor levels
        for (name in colnames(dataset))
        {
            if (is.factor(dataset[[name]]))
            {
                levels(dataset[[name]]) =
                    normalize.labels(levels(dataset[[name]]))
            }
        }

        print.dataset.statistics(dataset)

        if (!dir.exists(dest.dir))
        {
            # recursive = TRUE so that missing parent directories do not
            # make the subsequent saveRDS() fail.
            dir.create(dest.dir, recursive = TRUE)
        }

        saveRDS(dataset, dest.file.path)

        flog.info(paste("Created preprocessed file",
                        DATASET.PREPROCESSED.OUTPUT.FILE))
    } else {
        flog.warn(paste("Target file", basename(dest.file.path),
                        "already exists, skipping"))
    }

    # visual separator between datasets in the log
    flog.info(paste(rep("*", 25), collapse = ""))
}