We are happy to announce the release of the rules package on CRAN. rules is another “parsnip-adjacent” package that enables a specific class of models within the tidymodels infrastructure. rules currently contains three models:
- C5_rules(): classification rule sets based on the C5.0 model.
- cubist_rules(): regression rules using Cubist.
- rule_fit(): classification or regression rules using the RuleFit model.
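To give a sense of the interface, here is a minimal sketch of how each of the three model types can be specified before fitting or tuning. The cubist_rules() arguments and the "Cubist" engine match the worked example later in this post; the C5_rules() and rule_fit() arguments (trees, penalty) and the "C5.0" and "xrf" engines are my reading of the package documentation, so see ?C5_rules and ?rule_fit for the authoritative signatures.

library(tidymodels)
library(rules)

# NOTE: the C5_rules()/rule_fit() argument names below are assumptions taken
# from the package documentation; only cubist_rules() is shown in this post.

# Boosted C5.0 rule sets for classification
c5_spec <-
  C5_rules(trees = 10) %>%
  set_engine("C5.0")

# Cubist rules for regression (committees plus a nearest-neighbor adjustment)
cubist_spec <-
  cubist_rules(committees = 5, neighbors = 7) %>%
  set_engine("Cubist")

# RuleFit: rules harvested from a tree ensemble, then filtered via a lasso penalty
rulefit_spec <-
  rule_fit(trees = 10, penalty = 0.01) %>%
  set_mode("regression") %>%
  set_engine("xrf")

These specifications can then be passed to fit(), a workflow, or tune_grid() in the usual parsnip fashion, as the Cubist example below demonstrates.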
If you aren’t familiar with rule-based models, there is a companion blog post that summarizes how they work.
Install rules from CRAN like so:
install.packages("rules")
Then attach it for use via:
library(rules)
Here’s an example of creating Cubist regression rules via the parsnip package:
library(tidymodels)
#> ── Attaching packages ──────────────────────────────────── tidymodels 0.1.0 ──
#> ✓ broom 0.5.6 ✓ recipes 0.1.12
#> ✓ dials 0.0.6 ✓ rsample 0.0.6
#> ✓ dplyr 0.8.5 ✓ tibble 3.0.1
#> ✓ ggplot2 3.3.0 ✓ tune 0.1.0
#> ✓ infer 0.5.1 ✓ workflows 0.1.1
#> ✓ parsnip 0.1.1 ✓ yardstick 0.0.6
#> ✓ purrr 0.3.4
#> ── Conflicts ─────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::accumulate() masks foreach::accumulate()
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x ggplot2::margin() masks dials::margin()
#> x recipes::step() masks stats::step()
#> x purrr::when() masks foreach::when()
library(rules)
data(car_prices, package = "modeldata")
set.seed(9932)
car_split <- initial_split(car_prices)
car_tr <- training(car_split)
car_te <- testing(car_split)
# A single rule set:
cubist_mod <-
  cubist_rules(neighbors = 7) %>%
  set_engine("Cubist")

cubist_fit <-
  cubist_mod %>%
  fit(log10(Price) ~ ., data = car_tr)
summary(cubist_fit$fit)
#>
#> Call:
#> cubist.default(x = x, y = y, committees = 1)
#>
#>
#> Cubist [Release 2.07 GPL Edition] Wed May 20 21:39:22 2020
#> ---------------------------------
#>
#> Target attribute `outcome'
#>
#> Read 603 cases (18 attributes) from undefined.data
#>
#> Model:
#>
#> Rule 1: [210 cases, mean 4.116360, range 3.94295 to 4.2505, est err 0.030756]
#>
#> if
#> Cylinder <= 4
#> Saab <= 0
#> then
#> outcome = 4.115185 + 0.12 Saab - 3.5e-06 Mileage + 0.017 Cylinder
#> - 0.087 hatchback - 0.029 Chevy + 0.046 wagon + 0.028 Leather
#> + 0.041 Cadillac - 0.024 sedan + 0.027 convertible
#> + 0.006 Doors + 0.012 Buick
#>
#> Rule 2: [8 cases, mean 4.207121, range 4.13308 to 4.26696, est err 0.006589]
#>
#> if
#> Cylinder > 4
#> Saturn > 0
#> then
#> outcome = 3.88624 + 0.057 Cylinder + 0.2 Saab + 0.141 Cadillac
#> - 3.8e-06 Mileage - 0.054 sedan + 0.094 convertible
#> - 0.085 hatchback + 0.019 Doors + 0.04 Buick + 0.014 Cruise
#> + 0.01 Leather + 0.007 Sound + 0.007 Saturn
#>
#> Rule 3: [33 cases, mean 4.229076, range 4.16741 to 4.29184, est err 0.012903]
#>
#> if
#> Cylinder > 4
#> Cruise <= 0
#> then
#> outcome = 4.265627 - 3.7e-06 Mileage + 0.039 Chevy
#>
#> Rule 4: [94 cases, mean 4.272727, range 4.18913 to 4.4427, est err 0.034717]
#>
#> if
#> Mileage > 3946
#> Cylinder > 4
#> Doors > 2
#> Cruise > 0
#> Buick <= 0
#> Cadillac <= 0
#> Saturn <= 0
#> then
#> outcome = 4.037203 + 0.051 Cylinder - 4.3e-06 Mileage + 0.061 Saab
#> + 0.044 Cadillac - 0.016 sedan + 0.029 convertible
#> - 0.026 hatchback + 0.006 Doors - 0.009 Chevy + 0.012 Buick
#> + 0.004 Cruise
#>
#> Rule 5: [57 cases, mean 4.314541, range 4.17208 to 4.42864, est err 0.049758]
#>
#> if
#> Buick > 0
#> then
#> outcome = 4.389884 - 3e-06 Mileage
#>
#> Rule 6: [9 cases, mean 4.341528, range 4.23957 to 4.66962, est err 0.036309]
#>
#> if
#> Mileage <= 3946
#> Cylinder > 4
#> Cadillac <= 0
#> then
#> outcome = 3.439093 + 5.28e-05 Mileage + 0.129 Cylinder
#>
#> Rule 7: [43 cases, mean 4.354487, range 4.1778 to 4.60071, est err 0.031792]
#>
#> if
#> Cylinder > 4
#> Doors <= 2
#> Cruise > 0
#> convertible <= 0
#> then
#> outcome = 3.40984 + 0.13 Cylinder + 0.116 Chevy - 2.7e-06 Mileage
#> + 0.037 Sound + 0.031 Leather
#>
#> Rule 8: [85 cases, mean 4.462877, range 4.34723 to 4.58348, est err 0.023398]
#>
#> if
#> Saab > 0
#> then
#> outcome = 4.522928 - 3.4e-06 Mileage + 0.064 Saab - 0.021 Doors
#> - 0.035 sedan + 0.009 Cylinder + 0.022 Cadillac
#> - 0.024 hatchback + 0.015 convertible - 0.004 Chevy
#> + 0.006 Buick
#>
#> Rule 9: [60 cases, mean 4.592824, range 4.44778 to 4.84976, est err 0.041948]
#>
#> if
#> Cadillac > 0
#> then
#> outcome = 4.774347 - 0.103 Doors + 0.036 Cylinder - 3.4e-06 Mileage
#>
#> Rule 10: [7 cases, mean 4.625017, range 4.58911 to 4.6727, est err 0.006627]
#>
#> if
#> Cylinder > 4
#> Cadillac <= 0
#> convertible > 0
#> then
#> outcome = 4.693132 - 3.9e-06 Mileage
#>
#>
#> Evaluation on training data (603 cases):
#>
#> Average |error| 0.032526
#> Relative |error| 0.23
#> Correlation coefficient 0.97
#>
#>
#> Attribute usage:
#>            Conds  Model
#>
#>             67%    84%    Cylinder
#>             49%    66%    Saab
#>             28%    66%    Cadillac
#>             28%    17%    Cruise
#>             25%    66%    Buick
#>             23%    75%    Doors
#>             17%   100%    Mileage
#>             17%     1%    Saturn
#>              8%    66%    convertible
#>                    77%    Chevy
#>                    66%    hatchback
#>                    66%    sedan
#>                    43%    Leather
#>                    35%    wagon
#>                     8%    Sound
#>
#>
#> Time: 0.0 secs
predict(cubist_fit, car_te %>% select(-Price))
#> # A tibble: 201 x 1
#> .pred
#> <dbl>
#> 1 4.32
#> 2 4.49
#> 3 4.54
#> 4 4.54
#> 5 4.43
#> 6 4.43
#> 7 4.46
#> 8 4.44
#> 9 4.37
#> 10 4.48
#> # … with 191 more rows
The functions also work with the tune package. To optimize our model, the primary tuning parameters are the number of committees (similar to boosting iterations) and the number of nearest neighbors used to adjust the model predictions.
cb_grid <- expand.grid(committees = 1:30, neighbors = c(1, 3, 5, 7, 9))
set.seed(8226)
car_folds <- vfold_cv(car_tr)
cubist_mod <-
  cubist_rules(neighbors = tune(), committees = tune()) %>%
  set_engine("Cubist")

car_tune_res <-
  cubist_mod %>%
  tune_grid(log10(Price) ~ ., resamples = car_folds, grid = cb_grid)
car_tune_res %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  mutate(neighbors = factor(neighbors)) %>%
  ggplot(aes(x = committees, y = mean, col = neighbors)) +
  geom_point() +
  geom_line() +
  scale_color_brewer(palette = "Dark2") +
  theme(legend.position = "top")
show_best(car_tune_res, metric = "rmse")
#> # A tibble: 5 x 7
#> committees neighbors .metric .estimator mean n std_err
#> <int> <dbl> <chr> <chr> <dbl> <int> <dbl>
#> 1 9 9 rmse standard 0.0395 10 0.00133
#> 2 5 9 rmse standard 0.0395 10 0.00132
#> 3 11 9 rmse standard 0.0395 10 0.00133
#> 4 13 9 rmse standard 0.0395 10 0.00132
#> 5 8 9 rmse standard 0.0395 10 0.00131
smallest_rmse <- select_best(car_tune_res, metric = "rmse")
smallest_rmse
#> # A tibble: 1 x 2
#> committees neighbors
#> <int> <dbl>
#> 1 9 9
final_cb_mod <-
  cubist_mod %>%
  finalize_model(smallest_rmse) %>%
  fit(log10(Price) ~ ., data = car_tr)
Most of the benefit of using committees appears within the first 10 iterations, and the nearest-neighbor adjustment was important for obtaining good performance.
The test set results look good and are consistent with the resampling estimate of RMSE:
test_pred <-
  predict(final_cb_mod, car_te) %>%
  bind_cols(car_te %>% select(Price)) %>%
  mutate(Price = log10(Price))
test_pred %>% rmse(Price, .pred)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 rmse standard 0.0382
ggplot(test_pred, aes(x = .pred, y = Price)) +
  geom_abline(col = "green", lty = 2) +
  geom_point(alpha = 0.5) +
  coord_fixed(ratio = 1)
I’d like to thank Karl Holub for creating the xrf package and for accepting my PRs and changes.