Here are two custom functions that I will use later: `predict_table` builds a table of predictions next to actuals (the `_real` columns cube the values to undo the cube-root transformation of `total`), and `pull_rmse` extracts the RMSE on both scales.

```r
# Generate prediction tables
predict_table <- function(model, data, tidy_flag) {
  if (tidy_flag == TRUE) {
    result <- model %>%
      predict(data) %>%
      rename(pred = .pred) %>%
      mutate(
        actual = data$total,
        pred_real = pred^3,
        actual_real = actual^3
      )
  } else {
    result <- model %>%
      predict(data) %>%
      as_tibble_col(column_name = "pred") %>%
      mutate(
        actual = data$total,
        pred_real = pred^3,
        actual_real = actual^3
      )
  }
  result
}

# Extract RMSE for models
pull_rmse <- function(result_table) {
  rmse_result <- rmse(result_table, pred, actual) %>%
    pull(.estimate)
  rmse_result_real <- rmse(result_table, pred_real, actual_real) %>%
    pull(.estimate)
  result <- c(rmse = rmse_result, real_rmse = rmse_result_real)
  result  # return both metrics as a named vector
}
```
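As a quick illustration of how the two helpers chain together, here is a minimal sketch; `fitted_model` is a hypothetical placeholder for any fitted model, and the data frame passed in must contain the `total` column that `predict_table` reads.

```r
# Hypothetical usage; `fitted_model` is a placeholder, not defined in this post.
pred_tbl <- predict_table(fitted_model, test_data, tidy_flag = TRUE)
pull_rmse(pred_tbl)
# -> a named vector: c(rmse = ..., real_rmse = ...)
```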

## Baseline

The baseline is simply the average of the `total` column. Since this predicts a constant mean for every row, the training RMSE is essentially the standard deviation of `total`, so any real model should beat it.

```r
base_train_pred <-
  tibble(
    actual = train_data$total,
    actual_real = train_data$total^3
  ) %>%
  mutate(pred = mean(actual), pred_real = mean(actual_real))

base_test_pred <-
  tibble(
    actual = test_data$total,
    actual_real = test_data$total^3
  ) %>%
  mutate(pred = mean(actual), pred_real = mean(actual_real))

base_train_rmse <- pull_rmse(base_train_pred)
print(base_train_rmse)
##       rmse  real_rmse
##   2.032927 181.063306

base_test_rmse <- pull_rmse(base_test_pred)
print(base_test_rmse)
##      rmse real_rmse
##   2.02608 182.61370
```

## Decision trees with tidymodels

Tidymodels splits the work across several packages: `parsnip` for modelling, `workflow` for, well… the workflow, `tune` for parameter tuning, and `yardstick` for performance metrics. I was also curious about the timing, so I recorded the time as well.
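Note that the workflow below references `prep_recipe` and `train_cv`, both created earlier in the post. For context, a minimal sketch of what such objects might look like, assuming 10-fold cross-validation and a bare-bones recipe:

```r
library(tidymodels)

# Hypothetical reconstruction: the real prep_recipe and train_cv come from an
# earlier section of the post; this only illustrates their general shape.
prep_recipe <- recipe(total ~ ., data = train_data)  # preprocessing steps go here
train_cv    <- vfold_cv(train_data, v = 10)          # 10-fold CV resamples
```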

```r
# Cost complexity values for the decision tree parameter grid
tree_cp <- seq(0.01, 0.1, 0.01)

set.seed(25)
tree_tidy_time1 <- Sys.time()

# Specify model
tree_engine <-
  decision_tree(mode = "regression", cost_complexity = tune()) %>%
  set_engine("rpart")

# Set workflow (preprocess & model)
tree_workflow <-
  workflow() %>%
  add_recipe(prep_recipe) %>%
  add_model(tree_engine)

# Tune parameters with cross-validation
tree_tune <- tune_grid(
  tree_workflow,
  resamples = train_cv,
  grid = data.frame(cost_complexity = tree_cp),
  metrics = metric_set(rmse)
)

# Fit again with the best parameter
tree_best <-
  finalize_workflow(tree_workflow, select_best(tree_tune)) %>%
  fit(train_data)

tree_tidy_time2 <- Sys.time()
print(tree_tidy_time2 - tree_tidy_time1)
## Time difference of 1.376683 mins
```

It takes around 1 minute and 20 seconds to cross-validate the ten candidate values of the cost-complexity parameter. Once that is done, I can predict the target variable and examine model performance with RMSE. Here I used the custom functions `predict_table` and `pull_rmse` to complete the task.

```r
tree_tidy_train_pred <- predict_table(tree_best, train_data, TRUE)
tree_tidy_train_rmse <- pull_rmse(tree_tidy_train_pred)
print(tree_tidy_train_rmse)
##       rmse  real_rmse
##   1.078724 116.106006

tree_tidy_test_pred <- predict_table(tree_best, test_data, TRUE)
tree_tidy_test_rmse <- pull_rmse(tree_tidy_test_pred)
print(tree_tidy_test_rmse)
##       rmse  real_rmse
##   1.074347 118.205989
```

## Decision trees with caret

```r
set.seed(25)
tree_caret_time1 <- Sys.time()

tree_caret <- train(
  total ~ .,
  data = train_data_caret,
  method = "rpart",
  trControl = ctrl_caret,
  metric = "RMSE",
  tuneGrid = data.frame(cp = tree_cp)
)

tree_caret_time2 <- Sys.time()
print(tree_caret_time2 - tree_caret_time1)
## Time difference of 4.469931 secs
```

Wooooow! It only takes about 4.5 seconds, and the code is much shorter. A single `train` call covers the model (`method = "rpart"`), the cross-validation (`trControl = ctrl_caret`), and the parameter tuning (`tuneGrid = data.frame(cp = tree_cp)`).
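The `ctrl_caret` object was likewise defined earlier in the post; a minimal sketch of such a control object, assuming the same 10-fold cross-validation, might look like:

```r
library(caret)

# Hypothetical reconstruction of the control object; the actual ctrl_caret
# was created in an earlier section of the post.
ctrl_caret <- trainControl(method = "cv", number = 10)
```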

```r
tree_caret_train_pred <- predict_table(tree_caret, train_data_caret, FALSE)
tree_caret_train_rmse <- pull_rmse(tree_caret_train_pred)
print(tree_caret_train_rmse)
##       rmse  real_rmse
##   1.078724 116.106006

tree_caret_test_pred <- predict_table(tree_caret, test_data_caret, FALSE)
tree_caret_test_rmse <- pull_rmse(tree_caret_test_pred)
print(tree_caret_test_rmse)
##       rmse  real_rmse
##   1.074347 118.205989
```

## Compare models

```r
rbind(
  base_train_rmse, base_test_rmse,
  tree_tidy_train_rmse, tree_tidy_test_rmse,
  tree_caret_train_rmse, tree_caret_test_rmse
)
##                           rmse real_rmse
## base_train_rmse       2.032927  181.0633
## base_test_rmse        2.026080  182.6137
## tree_tidy_train_rmse  1.078724  116.1060
## tree_tidy_test_rmse   1.074347  118.2060
## tree_caret_train_rmse 1.078724  116.1060
## tree_caret_test_rmse  1.074347  118.2060
```

As you can see, the decision tree results are identical regardless of the library, since I split the data and set up cross-validation the same way, and both tidymodels and caret use `rpart` as the underlying engine. So it seems strange that tidymodels takes over a minute while caret needs only 4–5 seconds to fit the same decision tree.
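One quick way to confirm that both frameworks delegate to the same engine is to pull out the underlying fit objects; a sketch using `extract_fit_engine()` from workflows and caret's `finalModel` slot:

```r
# Both should be rpart objects with the same splits, which is consistent
# with the identical RMSE values in the comparison table above.
class(extract_fit_engine(tree_best))  # "rpart"
class(tree_caret$finalModel)          # "rpart"
```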