mirror of
https://github.com/andre-wojtowicz/uci-ml-to-r.git
synced 2024-11-03 14:20:28 +01:00
34 lines
1.9 KiB
R
34 lines
1.9 KiB
R
preprocess.dataset = function()
|
|
{
|
|
csv.file = "spambase.data"
|
|
|
|
dataset = read.csv(file.path(orig.dir, csv.file),
|
|
header = FALSE)
|
|
|
|
colnames(dataset) = c("word_freq_make", "word_freq_address", "word_freq_all",
|
|
"word_freq_3d", "word_freq_our", "word_freq_over",
|
|
"word_freq_remove", "word_freq_internet",
|
|
"word_freq_order", "word_freq_mail", "word_freq_receive",
|
|
"word_freq_will", "word_freq_people", "word_freq_report",
|
|
"word_freq_addresses", "word_freq_free",
|
|
"word_freq_business", "word_freq_email", "word_freq_you",
|
|
"word_freq_credit", "word_freq_your", "word_freq_font",
|
|
"word_freq_000", "word_freq_money", "word_freq_hp",
|
|
"word_freq_hpl", "word_freq_george", "word_freq_650",
|
|
"word_freq_lab", "word_freq_labs", "word_freq_telnet",
|
|
"word_freq_857", "word_freq_data", "word_freq_415",
|
|
"word_freq_85", "word_freq_technology", "word_freq_1999",
|
|
"word_freq_parts", "word_freq_pm", "word_freq_direct",
|
|
"word_freq_cs", "word_freq_meeting", "word_freq_original",
|
|
"word_freq_project", "word_freq_re", "word_freq_edu",
|
|
"word_freq_table", "word_freq_conference", "char_freq_;",
|
|
"char_freq_(", "char_freq_[", "char_freq_!", "char_freq_$",
|
|
"char_freq_#", "capital_run_length_average",
|
|
"capital_run_length_longest", "capital_run_length_total",
|
|
"class")
|
|
|
|
dataset = dataset %>%
|
|
mutate(class = factor(class))
|
|
|
|
return(dataset)
|
|
} |