Copyright (c) Microsoft Corporation.
Licensed under the MIT License.
This notebook builds on the output from “Basic models” by including regressor variables in the ARIMA model(s). We
fit the following model types:
ar_trend
includes only a linear trend over time.
ar_reg
allows stepwise selection of independent regressors.
ar_reg_price
: rather than allowing the algorithm to select from the 11 price variables, we use
only the price relevant to each brand. This is to guard against possible overfitting, something that classical
stepwise procedures are wont to do.
ar_reg_price_trend
is the same as ar_reg_price
, but including a linear trend.
As part of the modelling, we also compute a new independent variable maxpricediff
, the log-ratio of
the price of this brand compared to the best competing price. A positive maxpricediff
means this
brand is cheaper than all the other brands, and a negative maxpricediff
means it is more expensive.
srcdir <- here::here("R_utils")
for(src in dir(srcdir, full.names=TRUE)) source(src)
load_objects("grocery_sales", "data.Rdata")
cl <- make_cluster(libs=c("tidyr", "dplyr", "fable", "tsibble", "feasts"))
# add extra regression variables to training and test datasets
add_regvars <- function(df)
{
df %>%
group_by(store, brand) %>%
group_modify(~ {
pricevars <- grep("price", names(.x), value=TRUE)
thispricevar <- unique(paste0("price", .y$brand))
best_other_price <- do.call(pmin, .x[setdiff(pricevars, thispricevar)])
.x$price <- .x[[thispricevar]]
.x$maxpricediff <- log(best_other_price/.x$price)
.x
}) %>%
ungroup() %>%
mutate(week=yearweek(week)) %>% # need to recreate this variable because of tsibble/vctrs issues
as_tsibble(week, key=c(store, brand))
}
oj_trainreg <- parallel::parLapply(cl, oj_train, add_regvars)
oj_testreg <- parallel::parLapply(cl, oj_test, add_regvars)
save_objects(oj_trainreg, oj_testreg,
example="grocery_sales", file="data_reg.Rdata")
oj_modelset_reg <- parallel::parLapply(cl, oj_trainreg, function(df)
{
model(df,
ar_trend=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0) + trend()),
ar_reg=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0) + deal + feat + maxpricediff +
price1 + price2 + price3 + price4 + price5 + price6 + price7 + price8 + price9 + price10 + price11),
ar_reg_price=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0) + deal + feat + maxpricediff + price),
ar_reg_price_trend=ARIMA(logmove ~ pdq() + PDQ(0, 0, 0) + trend() + deal + feat + maxpricediff + price),
.safely=FALSE
)
})
oj_fcast_reg <- parallel::clusterMap(cl, get_forecasts, oj_modelset_reg, oj_testreg)
destroy_cluster(cl)
save_objects(oj_modelset_reg, oj_fcast_reg,
example="grocery_sales", file="model_reg.Rdata")
do.call(rbind, oj_fcast_reg) %>%
mutate_at(-(1:3), exp) %>%
eval_forecasts()
This shows that the models incorporating price are a significant improvement over the previous naive models. The
model that uses stepwise selection to choose the best price variable does worse than the one where we choose the
price beforehand, confirming the suspicion that stepwise leads to overfitting in this case.
LS0tCnRpdGxlOiBBUklNQS1SZWdyZXNzaW9uIG1vZGVscwpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpfQ29weXJpZ2h0IChjKSBNaWNyb3NvZnQgQ29ycG9yYXRpb24uXzxici8+Cl9MaWNlbnNlZCB1bmRlciB0aGUgTUlUIExpY2Vuc2UuXwoKYGBge3IsIGVjaG89RkFMU0UsIHJlc3VsdHM9ImhpZGUiLCBtZXNzYWdlPUZBTFNFfQpsaWJyYXJ5KHRpZHlyKQpsaWJyYXJ5KGRwbHlyKQpsaWJyYXJ5KHRzaWJibGUpCmxpYnJhcnkoZmVhc3RzKQpsaWJyYXJ5KGZhYmxlKQpgYGAKClRoaXMgbm90ZWJvb2sgYnVpbGRzIG9uIHRoZSBvdXRwdXQgZnJvbSAiQmFzaWMgbW9kZWxzIiBieSBpbmNsdWRpbmcgcmVncmVzc29yIHZhcmlhYmxlcyBpbiB0aGUgQVJJTUEgbW9kZWwocykuIFdlIGZpdCB0aGUgZm9sbG93aW5nIG1vZGVsIHR5cGVzOgoKLSBgYXJfdHJlbmRgIGluY2x1ZGVzIG9ubHkgYSBsaW5lYXIgdHJlbmQgb3ZlciB0aW1lLgotIGBhcl9yZWdgIGFsbG93cyBzdGVwd2lzZSBzZWxlY3Rpb24gb2YgaW5kZXBlbmRlbnQgcmVncmVzc29ycy4KLSBgYXJfcmVnX3ByaWNlYDogcmF0aGVyIHRoYW4gYWxsb3dpbmcgdGhlIGFsZ29yaXRobSB0byBzZWxlY3QgZnJvbSB0aGUgMTEgcHJpY2UgdmFyaWFibGVzLCB3ZSB1c2Ugb25seSB0aGUgcHJpY2UgcmVsZXZhbnQgdG8gZWFjaCBicmFuZC4gVGhpcyBpcyB0byBndWFyZCBhZ2FpbnN0IHBvc3NpYmxlIG92ZXJmaXR0aW5nLCBzb21ldGhpbmcgdGhhdCBjbGFzc2ljYWwgc3RlcHdpc2UgcHJvY2VkdXJlcyBhcmUgd29udCB0byBkby4KLSBgYXJfcmVnX3ByaWNlX3RyZW5kYCBpcyB0aGUgc2FtZSBhcyBgYXJfcmVnX3ByaWNlYCwgYnV0IGluY2x1ZGluZyBhIGxpbmVhciB0cmVuZC4KCkFzIHBhcnQgb2YgdGhlIG1vZGVsbGluZywgd2UgYWxzbyBjb21wdXRlIGEgbmV3IGluZGVwZW5kZW50IHZhcmlhYmxlIGBtYXhwcmljZWRpZmZgLCB0aGUgbG9nLXJhdGlvIG9mIHRoZSBwcmljZSBvZiB0aGlzIGJyYW5kIGNvbXBhcmVkIHRvIHRoZSBiZXN0IGNvbXBldGluZyBwcmljZS4gQSBwb3NpdGl2ZSBgbWF4cHJpY2VkaWZmYCBtZWFucyB0aGlzIGJyYW5kIGlzIGNoZWFwZXIgdGhhbiBhbGwgdGhlIG90aGVyIGJyYW5kcywgYW5kIGEgbmVnYXRpdmUgYG1heHByaWNlZGlmZmAgbWVhbnMgaXQgaXMgbW9yZSBleHBlbnNpdmUuCgpgYGB7cn0Kc3JjZGlyIDwtIGhlcmU6OmhlcmUoIlJfdXRpbHMiKQpmb3Ioc3JjIGluIGRpcihzcmNkaXIsIGZ1bGwubmFtZXM9VFJVRSkpIHNvdXJjZShzcmMpCgpsb2FkX29iamVjdHMoImdyb2Nlcnlfc2FsZXMiLCAiZGF0YS5SZGF0YSIpCgpjbCA8LSBtYWtlX2NsdXN0ZXIobGlicz1jKCJ0aWR5ciIsICJkcGx5ciIsICJmYWJsZSIsICJ0c2liYmxlIiwgImZlYXN0cyIpKQoKIyBhZGQgZXh0cmEgcmVncmVzc2lvbiB2YXJpYWJsZXMgdG8gdHJhaW5pbmcgYW5kIHRlc3QgZGF0YXNldHMKYWRkX3JlZ3ZhcnMgPC0gZnVuY3Rpb24oZGYpCnsKICAgIGRmICU+JQogICAgICAgIGdyb3VwX2J5KHN0b3JlLCBicmFuZCkgJT4lCiAgICAgICAgZ3JvdXBfbW9kaWZ5KH4gewogICAgICAgICAgICBwcmljZXZhcnMgPC0gZ3JlcCgicHJpY2UiLCBuYW1lcygueCksIHZhbHVlPVRSVUUpCiAgICAgICAgICAgIHRoaXNwcmljZXZhciA8LSB1bmlxdWUocGFzdGUwKCJwcmljZSIsIC55JGJyYW5kKSkKICAgICAgICAgICAgYmVzdF9vdGhlcl9wcmljZSA8LSBkby5jYWxsKHBtaW4sIC54W3NldGRpZmYocHJpY2V2YXJzLCB0aGlzcHJpY2V2YXIpXSkKICAgICAgICAgICAgLngkcHJpY2UgPC0gLnhbW3RoaXNwcmljZXZhcl1dCiAgICAgICAgICAgIC54JG1heHByaWNlZGlmZiA8LSBsb2coYmVzdF9vdGhlcl9wcmljZS8ueCRwcmljZSkKICAgICAgICAgICAgLngKICAgICAgICB9KSAlPiUKICAgICAgICB1bmdyb3VwKCkgJT4lCiAgICAgICAgbXV0YXRlKHdlZWs9eWVhcndlZWsod2VlaykpICU+JSAgIyBuZWVkIHRvIHJlY3JlYXRlIHRoaXMgdmFyaWFibGUgYmVjYXVzZSBvZiB0c2liYmxlL3ZjdHJzIGlzc3VlcwogICAgICAgIGFzX3RzaWJibGUod2Vlaywga2V5PWMoc3RvcmUsIGJyYW5kKSkKfQoKb2pfdHJhaW5yZWcgPC0gcGFyYWxsZWw6OnBhckxhcHBseShjbCwgb2pfdHJhaW4sIGFkZF9yZWd2YXJzKQpval90ZXN0cmVnIDwtIHBhcmFsbGVsOjpwYXJMYXBwbHkoY2wsIG9qX3Rlc3QsIGFkZF9yZWd2YXJzKQoKc2F2ZV9vYmplY3RzKG9qX3RyYWlucmVnLCBval90ZXN0cmVnLAogICAgICAgICAgICAgZXhhbXBsZT0iZ3JvY2VyeV9zYWxlcyIsIGZpbGU9ImRhdGFfcmVnLlJkYXRhIikKCm9qX21vZGVsc2V0X3JlZyA8LSBwYXJhbGxlbDo6cGFyTGFwcGx5KGNsLCBval90cmFpbnJlZywgZnVuY3Rpb24oZGYpCnsKICAgIG1vZGVsKGRmLAogICAgICAgIGFyX3RyZW5kPUFSSU1BKGxvZ21vdmUgfiBwZHEoKSArIFBEUSgwLCAwLCAwKSArIHRyZW5kKCkpLAoKICAgICAgICBhcl9yZWc9QVJJTUEobG9nbW92ZSB+IHBkcSgpICsgUERRKDAsIDAsIDApICsgZGVhbCArIGZlYXQgKyBtYXhwcmljZWRpZmYgKwogICAgICAgICAgICBwcmljZTEgKyBwcmljZTIgKyBwcmljZTMgKyBwcmljZTQgKyBwcmljZTUgKyBwcmljZTYgKyBwcmljZTcgKyBwcmljZTggKyBwcmljZTkgKyBwcmljZTEwICsgcHJpY2UxMSksCgogICAgICAgIGFyX3JlZ19wcmljZT1BUklNQShsb2dtb3ZlIH4gcGRxKCkgKyBQRFEoMCwgMCwgMCkgKyBkZWFsICsgZmVhdCArIG1heHByaWNlZGlmZiArIHByaWNlKSwKCiAgICAgICAgYXJfcmVnX3ByaWNlX3RyZW5kPUFSSU1BKGxvZ21vdmUgfiBwZHEoKSArIFBEUSgwLCAwLCAwKSArIHRyZW5kKCkgKyBkZWFsICsgZmVhdCArIG1heHByaWNlZGlmZiArIHByaWNlKSwKCiAgICAgICAgLnNhZmVseT1GQUxTRQogICAgKQp9KQoKb2pfZmNhc3RfcmVnIDwtIHBhcmFsbGVsOjpjbHVzdGVyTWFwKGNsLCBnZXRfZm9yZWNhc3RzLCBval9tb2RlbHNldF9yZWcsIG9qX3Rlc3RyZWcpCgpkZXN0cm95X2NsdXN0ZXIoY2wpCgpzYXZlX29iamVjdHMob2pfbW9kZWxzZXRfcmVnLCBval9mY2FzdF9yZWcsCiAgICAgICAgICAgICBleGFtcGxlPSJncm9jZXJ5X3NhbGVzIiwgZmlsZT0ibW9kZWxfcmVnLlJkYXRhIikKCmRvLmNhbGwocmJpbmQsIG9qX2ZjYXN0X3JlZykgJT4lCiAgICBtdXRhdGVfYXQoLSgxOjMpLCBleHApICU+JQogICAgZXZhbF9mb3JlY2FzdHMoKQpgYGAKClRoaXMgc2hvd3MgdGhhdCB0aGUgbW9kZWxzIGluY29ycG9yYXRpbmcgcHJpY2UgYXJlIGEgc2lnbmlmaWNhbnQgaW1wcm92ZW1lbnQgb3ZlciB0aGUgcHJldmlvdXMgbmFpdmUgbW9kZWxzLiBUaGUgbW9kZWwgdGhhdCB1c2VzIHN0ZXB3aXNlIHNlbGVjdGlvbiB0byBjaG9vc2UgdGhlIGJlc3QgcHJpY2UgdmFyaWFibGUgZG9lcyB3b3JzZSB0aGFuIHRoZSBvbmUgd2hlcmUgd2UgY2hvb3NlIHRoZSBwcmljZSBiZWZvcmVoYW5kLCBjb25maXJtaW5nIHRoZSBzdXNwaWNpb24gdGhhdCBzdGVwd2lzZSBsZWFkcyB0byBvdmVyZml0dGluZyBpbiB0aGlzIGNhc2UuCg==