uci-ml-to-r/s2-preprocess-data.R

62 lines
1.9 KiB
R

# ---- preprocess-data ----
source("init.R")
source("utils.R")
setup.logger(LOGGER.OUTPUT.S2.FILE, LOGGER.OVERWRITE.EXISTING.FILES)
flog.info("Step 2: preprocess dataset collection")
for (dir.name in dir(DATASETS.DIR))
{
flog.info(paste("Dataset:", dir.name))
orig.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.ORIGINAL.DIR)
dest.dir = gsub(DATASET.NAME.PATTERN, dir.name, DATASET.PREPROCESSED.DIR)
dest.file.path = file.path(dest.dir, DATASET.PREPROCESSED.OUTPUT.FILE)
if (!file.exists(dest.file.path) | OVERWRITE.OUTPUT.FILES)
{
r.src.file = file.path(DATASETS.DIR, dir.name,
DATASET.PREPROCESSING.SCRIPT)
source(r.src.file)
dataset = preprocess.dataset() # custom per-dataset preprocessing
# change column names
colnames(dataset) = tolower(
make.names(
gsub("^\\.|\\.$", "", colnames(dataset)),
unique = TRUE, allow_ = FALSE))
# change factor levels
for (name in colnames(dataset))
{
if (any(class(dataset[[name]]) == "factor"))
{
levels(dataset[[name]]) = tolower(
make.names(
gsub("^\\.|\\.$", "",
levels(dataset[[name]])),
unique = TRUE, allow_ = FALSE))
}
}
print.dataset.statistics(dataset)
if (!dir.exists(dest.dir))
{
dir.create(dest.dir)
}
saveRDS(dataset, dest.file.path)
flog.info(paste("Created preprocessed file",
DATASET.PREPROCESSED.OUTPUT.FILE))
} else {
flog.warn(paste("Target file", basename(dest.file.path),
"already exists, skipping"))
}
flog.info(paste(rep("*", 25), collapse = ""))
}