# Model A: high AUC, badly calibrated (overconfident at high risk)
# Model B: lower AUC, well calibrated
# Simulate a cohort with a known true risk, then two sets of model
# predictions derived from that risk by adding noise on the logit scale.
n <- 1000
set.seed(42)
true_p <- rbeta(n, 1.5, 8) # true risk: mostly low, some high
y <- rbinom(n, 1, true_p) # binary outcome drawn from the true risk

# Model A: logit-scale noise with mean 0.5 and sd 0.8, so predictions are
# shifted upward on average; the result is capped at 0.99.
pred_a <- pmin(plogis(qlogis(true_p) + rnorm(n, 0.5, 0.8)), 0.99)
# Model B: zero-mean logit-scale noise with a larger sd (1.1).
pred_b <- plogis(qlogis(true_p) + rnorm(n, 0, 1.1))

# AUC of `pred` for the 0/1 outcome `response`.
# `levels` and `direction` are fixed (0 = control, 1 = case, higher
# prediction = more likely a case) so pROC never auto-selects the direction
# and silently reports a flipped AUC.
auc_of <- function(response, pred) {
  roc_obj <- pROC::roc(
    response, pred,
    levels = c(0, 1), direction = "<", quiet = TRUE
  )
  as.numeric(pROC::auc(roc_obj))
}
auc_a <- auc_of(y, pred_a)
auc_b <- auc_of(y, pred_b)
# Decile calibration plot: for each model, bin its predictions into deciles
# and compare the mean predicted risk with the observed event rate per bin.

# Legend labels are built from the computed AUCs (formatted like ".84") so
# the numbers shown always match the data actually plotted.
fmt_auc <- function(auc) sub("^0", "", sprintf("%.2f", auc))
model_labels <- c(
  A = paste0("A (AUC=", fmt_auc(auc_a), ", miscalibrated)"),
  B = paste0("B (AUC=", fmt_auc(auc_b), ", calibrated)")
)

bind_rows(
  tibble(model = model_labels[["A"]], pred = pred_a, y = y),
  tibble(model = model_labels[["B"]], pred = pred_b, y = y)
) |>
  # Deciles must be computed within each model; ranking the pooled
  # predictions would give bins that are not deciles of either model.
  group_by(model) |>
  mutate(decile = ntile(pred, 10)) |>
  group_by(model, decile) |>
  summarise(pred_mean = mean(pred), obs = mean(y), .groups = "drop") |>
  ggplot(aes(pred_mean, obs, color = model)) +
  # Dashed identity line marks perfect calibration (predicted == observed).
  geom_abline(linetype = 2, color = "#64748b") +
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  # Name the colours by label so the mapping does not rely on sort order.
  scale_color_manual(
    values = setNames(c("#e63946", "#0891b2"), unname(model_labels))
  ) +
  labs(
    title = "Model A has higher AUC but is miscalibrated — Model B is safer for decisions",
    x = "Mean predicted risk", y = "Observed event rate", color = NULL
  ) +
  theme_di()