diff --git a/.DS_Store b/.DS_Store new file mode 100755 index 0000000..32a24ca Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..5b6a065 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata diff --git a/AH_code.Rproj b/AH_code.Rproj new file mode 100755 index 0000000..066341e --- /dev/null +++ b/AH_code.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 4 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/NB/nb_boot.R b/NB/nb_boot.R new file mode 100644 index 0000000..587762f --- /dev/null +++ b/NB/nb_boot.R @@ -0,0 +1,176 @@ + +# load libraries +library(tidyverse) +library(rsample) +library(reshape2) +library(ggplot2) +library(survminer) # for plotting theme +library(ggsci) # for color palette + +# funs +path_funs <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/NB" +source(paste0(path_funs, "/net_benefit_fun.R")) +source(paste0(path_funs, "/net_benefit_wrapper_fun.R")) + +# data +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/updating models" +load(paste0(path_data, "/recalibrated_models_default.Rdata")) + +test.data.short <- test.data %>% mutate(lille.death.risk = 1 - Lille.surv, + lille.death.risk.updated = 1 -lille.surv.updated, + CLIF.death.risk = 1 - CLIF.surv, + clif.death.risk.updated = 1 - clif.surv.updated, + meld.death.risk = 1 - MELD.surv, + meld.death.risk.updated = 1 - meld.surv.updated, + meld.death.risk2 = 1 - MELD.surv2) + +################################################################## +############### Calculate bootstrapped NB diff ################## +################################################################## +thresholds = seq(0.01, 0.99, 0.04) +n_boot <- 1000 + + +set.seed(353) +bt_resamples <- bootstraps(test.data.short, times = n_boot, strata = "D90_DTH") + +# models +m_lille <- map_df(bt_resamples$splits, net_benefit_treated_wrapper, model = "Lille") +m_lille$boot_id <- rep(1:n_boot, each = length(thresholds) * 2) + +m_cliff <- map_df(bt_resamples$splits, net_benefit_treated_wrapper, model = "CLIF-C ACLF") +m_cliff$boot_id <- rep(1:n_boot, each = length(thresholds) * 2) + +meld1 <- map_df(bt_resamples$splits, net_benefit_treated_wrapper, model = "MELD 1") +meld1$boot_id <- rep(1:n_boot, each = length(thresholds) * 2) + +meld2 <- map_df(bt_resamples$splits, net_benefit_treated_wrapper, model = "MELD 2") +meld2$boot_id <- rep(1:n_boot, each = length(thresholds) * 2) + +combined_df <- rbind(m_lille, m_cliff, meld1, meld2) + +alpha <- 0.05 # significance level + +data_wide <- dcast(combined_df, threshold + model + boot_id ~ status, value.var = "NB") %>% + group_by(threshold, boot_id, model) %>% + mutate(diff_NB = Original - Updated) %>% + ungroup() %>% + group_by(threshold, model) %>% + summarize(meanNB_diff = mean(diff_NB), + low = quantile(diff_NB, alpha / 2), + high = quantile(diff_NB, 1 - alpha / 2)) + +############################################# +###################### Plots ############### +############################################# + +# Original vs Updated comparison plots +combined_df %>% group_by(threshold, status, model) %>% + summarize(meanNB = mean(NB), + low = quantile(NB, alpha/2, na.rm = T), + high = quantile(NB, 1 - alpha/2, na.rm = T), + mean_NB_all = mean(NB_all)) %>% + ggplot(aes(x = threshold, y = meanNB)) + + geom_line(aes(col = status)) + 
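+  # dashed line = net benefit of treating all; dotted line = treat none (NB = 0)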
+ #geom_ribbon(aes(ymin = low, ymax = high, fill = model, linetype = NA),
+ # alpha = 0.3, show.legend = F) +
+ geom_line(aes(y = mean_NB_all), linetype = "dashed") +
+ geom_hline(yintercept = 0, linetype = "dotted") +
+ coord_cartesian(ylim = c(-0.05, 0.4)) +
+ facet_wrap(.~ model) +
+ theme_classic2()
+
+# Updated comparison plots
+updated <- combined_df %>%
+ filter(model %in% c("CLIF-C ACLF", "Lille", "MELD 1")) %>%
+ group_by(threshold, status, model) %>%
+ summarize(meanNB = mean(NB),
+ low = quantile(NB, alpha/2, na.rm = T),
+ high = quantile(NB, 1 - alpha/2, na.rm = T),
+ mean_NB_all = mean(NB_all)) %>%
+ filter(status == "Updated") %>%
+ mutate(model = ifelse(model == "MELD 1", "MELD", model)) %>%
+ ggplot(aes(x = threshold, y = meanNB)) +
+ geom_line(aes(col = model), lwd = 1) +
+ #geom_ribbon(aes(ymin = low, ymax = high, fill = model, linetype = NA),
+ # alpha = 0.3, show.legend = F) +
+ geom_line(aes(y = mean_NB_all), linetype = "dashed") +
+ geom_hline(yintercept = 0, linetype = "dotted") +
+ coord_cartesian(ylim = c(-0.05, 0.3)) +
+ labs(col = "Updated Scores", y = "Net Benefit", x = "Threshold Risk Probability") +
+ #facet_wrap(.~ model) +
+ scale_color_brewer(palette = "Dark2") +
+ scale_fill_brewer(palette = "Dark2") +
+ theme_classic2()
+
+
+# Original comparison plots
+library(RColorBrewer)
+customs_cols <- brewer.pal(6, "Dark2")
+customs_cols
+
+original <- combined_df %>%
+ #filter(model %in% c("CLIF-C ACLF", "Lille", "MELD 1")) %>%
+ group_by(threshold, status, model) %>%
+ summarize(meanNB = mean(NB),
+ low = quantile(NB, alpha/2, na.rm = T),
+ high = quantile(NB, 1 - alpha/2, na.rm = T),
+ mean_NB_all = mean(NB_all)) %>%
+ filter(status == "Original") %>%
+ #mutate(model = ifelse(model == "MELD 1", "MELD", model)) %>%
+ ggplot(aes(x = threshold, y = meanNB)) +
+ geom_line(aes(col = model), lwd = 1) +
+ #geom_ribbon(aes(ymin = low, ymax = high, fill = model, linetype = NA),
+ # alpha = 0.3, show.legend = F) +
+ geom_line(aes(y = mean_NB_all), linetype = "dashed") +
+ geom_hline(yintercept = 0, linetype = "dotted") +
+ coord_cartesian(ylim = c(-0.05, 0.3)) +
+ labs(col = "Original Scores", y = "Net Benefit", x = "Threshold Risk Probability") +
+ #facet_wrap(.~ model) +
+ #scale_color_brewer(palette = "Dark2", type = "diverging") +
+ scale_color_manual(values = customs_cols[-3]) +
+ theme_classic2()
+
+combined_df %>%
+ #filter(model %in% c("CLIF-C ACLF", "Lille", "MELD 1")) %>%
+ group_by(threshold, status, model) %>%
+ summarize(meanNB = mean(NB),
+ low = quantile(NB, alpha/2, na.rm = T),
+ high = quantile(NB, 1 - alpha/2, na.rm = T),
+ mean_NB_all = mean(NB_all),
+ mean_FPR = mean(FPR)) %>%
+ #filter(status == "Original") %>%
+ filter(model == "Lille" & threshold == 0.45)
+
+data_wide %>% filter(model == "Lille" & threshold == 0.45)
+
+tp <- sum(test.data.short$lille.death.risk >= 0.45 & test.data.short$D90_DTH == 1)
+tp
+fp <- sum(test.data.short$lille.death.risk >= 0.45 & test.data.short$D90_DTH == 0)
+fp
+fp/tp
+
+tp <- sum(test.data.short$lille.death.risk.updated >= 0.45 & test.data.short$D90_DTH == 1)
+tp
+fp <- sum(test.data.short$lille.death.risk.updated >= 0.45 & test.data.short$D90_DTH == 0)
+fp
+
+fp/tp
+
+
+
+#################
+ggplot(data_wide, aes(x = threshold, y = meanNB_diff)) +
+ geom_line(lwd = 1) +
+ geom_ribbon(aes(ymin = low, ymax = high, linetype = NA),
+ alpha = 0.3, show.legend = F) +
+ geom_hline(yintercept = 0, linetype = "dashed") +
+ labs(y = "Difference in Net Benefit", x = "Threshold Risk Probability") +
+ facet_wrap(.~ model, scales = "free_y") +
+ theme_classic2()
+
+# lille threshold
+data_wide %>% filter(model == "Lille" & threshold == 0.45)
+
+data_wide %>% filter(model == "MELD 1" & threshold == 0.45)
+#################
\ No newline at end of file
diff --git a/NB/net_benefit_fun.R b/NB/net_benefit_fun.R new file mode 100644 index 0000000..4601779 --- /dev/null +++ b/NB/net_benefit_fun.R @@ -0,0 +1,82 @@
+# function to calculate the NB of the treated, i.e. those
+# receiving treatment (risk > risk_threshold)
+net_benefit_treated <- function(pred_y, obs_y, risk_threshold){
+ # pred_y = (vector) of the predicted probability of outcome, must be 0 < pred_y < 1
+ # obs_y = (vector) of the observed outcome, must be 1 = event and 0 = non-event
+ # risk_threshold = (vector) of the risk thresholds, must be 0 < risk_threshold < 1
+
+ # start checks
+ # if (!all(pred_y >= 0) | !all(pred_y <= 1)){
+ # stop("The predicted values must range between 0 and 1")
+ # }
+
+ if (!all(between(pred_y, 0, 1))){
+ stop("The predicted values must range between 0 and 1")
+ }
+
+ if (!all(obs_y == 0 | obs_y == 1)){
+ stop("Outcome must be coded 0 (non event) and 1 (event)")
+ }
+
+ # if (!all(risk_threshold >= 0) | !all(risk_threshold <= 1)){
+ # stop("The risk threshold must range between 0 and 1")
+ # }
+
+ if (!all(between(risk_threshold, 0, 1))){
+ stop("The risk threshold must range between 0 and 1")
+ }
+
+ # end checks
+
+ # number of observations
+ N <- length(obs_y)
+
+ # prevalence Pr(obs_y = 1)
+ rho = mean(obs_y == 1)
+
+ # initialize empty vectors
+ tpr <- numeric(length(risk_threshold))
+ fpr <- numeric(length(risk_threshold))
+ nb <- numeric(length(risk_threshold))
+ snb <- numeric(length(risk_threshold))
+ U_all <- numeric(length(risk_threshold))
+ sU_all <- numeric(length(risk_threshold))
+
+ # loop through each risk threshold and calculate tpr, fpr, nb, snb, U_all, sU_all
+ for(i in 1:length(risk_threshold)){
+
+ # true positive rate Pr(pred_y >= risk_threshold | obs_y = 1)
+ tpr[i] <- sum(pred_y >= risk_threshold[i] & obs_y == 1)/sum(obs_y == 1)
+
+ # false positive rate Pr(pred_y >= risk_threshold | obs_y = 0)
+ fpr[i] <- sum(pred_y >= risk_threshold[i] & obs_y == 0)/sum(obs_y == 0) # check again it works ok
+
+ # net benefit
+ nb[i] = tpr[i] * rho - (risk_threshold[i]/(1 - risk_threshold[i])) * (1 - rho) * fpr[i]
+
+ # standardized net benefit
+ snb[i] = nb[i]/rho
+
+ # treat all NB = (U_all - U_none)
+ U_all[i] = rho - (1 - rho) * (risk_threshold[i]/(1- risk_threshold[i]))
+
+ # treat all sNB
+ sU_all[i] = U_all[i]/rho
+
+ }
+
+ # expected utility treat none
+ U_none = 0
+
+ output = data.frame("threshold" = risk_threshold,
+ "TPR" = tpr,
+ "FPR" = fpr,
+ "NB" = nb,
+ "sNB" = snb,
+ "rho" = rho,
+ "NB_none" = U_none,
+ "NB_all" = U_all,
+ "sNB_all" = sU_all)
+ return(output)
+
+}
diff --git a/NB/net_benefit_wrapper_fun.R b/NB/net_benefit_wrapper_fun.R new file mode 100644 index 0000000..ed4d29c --- /dev/null +++ b/NB/net_benefit_wrapper_fun.R @@ -0,0 +1,28 @@
+
+
+net_benefit_treated_wrapper <- function(splits, model){
+
+ x <- analysis(splits)
+
+ if (model == "Lille"){
+ nb_original <- net_benefit_treated(x$lille.death.risk, x$D90_DTH, risk_threshold = thresholds)
+ nb_update <- net_benefit_treated(x$lille.death.risk.updated, x$D90_DTH, risk_threshold = thresholds)
+ } else if (model == "CLIF-C ACLF"){
+ nb_original <- net_benefit_treated(x$CLIF.death.risk, x$D90_DTH, risk_threshold = thresholds)
+ nb_update <- net_benefit_treated(x$clif.death.risk.updated, x$D90_DTH, risk_threshold = thresholds)
+ } else if ( model == "MELD 
1"){ + nb_original <- net_benefit_treated(x$meld.death.risk, x$D90_DTH, risk_threshold = thresholds) + nb_update <- net_benefit_treated(x$meld.death.risk.updated, x$D90_DTH, risk_threshold = thresholds) + } else if ( model == "MELD 2"){ + nb_original <- net_benefit_treated(x$meld.death.risk2, x$D90_DTH, risk_threshold = thresholds) + nb_update <- net_benefit_treated(x$meld.death.risk.updated, x$D90_DTH, risk_threshold = thresholds) + } + + nb_original$status <- "Original" + nb_update$status <- "Updated" + + nb <- rbind(nb_original, nb_update) + nb$model <- model + + return(nb) +} diff --git a/README.md b/README.md old mode 100644 new mode 100755 index be637f8..4acf3bb --- a/README.md +++ b/README.md @@ -23,4 +23,9 @@ The repository contains several folders, with the following files in them: - subgroup_comparison.R. File that is used to analyse model performance in sub-groups of patients. In order to perform the analysis, the files import_data.R and prognostic_scores.R should always be run first. Generally, files may only work after another file is run first. As the data is not published, however, I would recommend using this code as a guideline for comparable analyses, rather than as a perfect example of how model validations and comparisons should always be done. +<<<<<<< HEAD + +---- +======= +>>>>>>> origin diff --git a/grid_arrange_fun.R b/grid_arrange_fun.R new file mode 100644 index 0000000..9343696 --- /dev/null +++ b/grid_arrange_fun.R @@ -0,0 +1,44 @@ + +# https://cran.r-project.org/web/packages/egg/vignettes/Ecosystem.html + +grid_arrange_shared_legend <- + function(..., + ncol = length(list(...)), + nrow = 1, + position = c("bottom", "right")) { + + plots <- list(...) + position <- match.arg(position) + g <- + ggplotGrob(plots[[1]] + theme(legend.position = position))$grobs + legend <- g[[which(sapply(g, function(x) + x$name) == "guide-box")]] + lheight <- sum(legend$height) + lwidth <- sum(legend$width) + gl <- lapply(plots, function(x) + x + theme(legend.position = "none")) + gl <- c(gl, ncol = ncol, nrow = nrow) + + combined <- switch( + position, + "bottom" = arrangeGrob( + do.call(arrangeGrob, gl), + legend, + ncol = 1, + heights = unit.c(unit(1, "npc") - lheight, lheight) + ), + "right" = arrangeGrob( + do.call(arrangeGrob, gl), + legend, + ncol = 2, + widths = unit.c(unit(1, "npc") - lwidth, lwidth) + ) + ) + + grid.newpage() + grid.draw(combined) + + # return gtable invisibly + invisible(combined) + + } diff --git a/original models/Calibration.R b/original models/Calibration.R old mode 100644 new mode 100755 index 507a51e..20440f2 --- a/original models/Calibration.R +++ b/original models/Calibration.R @@ -1,36 +1,122 @@ # This script performs all calculations of model calibration -library(rms) +library(ggplot2) +library(dplyr) +library(survminer) # for plotting theme + +# funs +path_funs <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/funs" +source(paste0(path_funs, "/calibration_fun.R")) +source(paste0(path_funs, "/val.prob.ci.2_wrapper.R")) + +# data +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/" +load(paste0(path_data, "original_models.Rdata")) + +############################################# +############### Calculate calibration ###### +############################################# -# Create calibration plots and compute corresponding slopes and intercepts for all models # MELD_1 survival function -val.prob(stph.meld$MELD.surv, stph.meld$D90_surv, pl = TRUE, smooth = FALSE, logistic.cal = TRUE, - xlab = "Predicted 
survival probability", ylab = "Actual survival probability", - legendloc = FALSE, statloc = FALSE) +cal_MELD.surv <- calibration(stph.meld$MELD.surv, y = stph.meld$D90_surv) +cal_MELD.surv$Score <- "MELD_1" # MELD_2 survival function -val.prob(stph.meld$MELD.surv2, stph.meld$D90_surv, pl = TRUE, smooth = FALSE, logistic.cal = TRUE, - xlab = "Predicted survival probability", ylab = "Actual survival probability", - legendloc = FALSE, statloc = FALSE) +cal_MELD.surv2 <- calibration(stph.meld$MELD.surv2, y = stph.meld$D90_surv) +cal_MELD.surv2$Score <- "MELD_2" -# MELD 3.0 -val.prob(stph.meld$MELD3.surv, stph.meld$D90_surv, pl = TRUE, smooth = FALSE, logistic.cal = TRUE, - xlab = "Predicted survival probability", ylab = "Actual survival probability", - legendloc = FALSE, statloc = FALSE) +# MELD VanDerwerken +cal_MELD.VanDerwerken <- calibration(stph.meld$MELD_Van, y = stph.meld$D90_surv) +cal_MELD.VanDerwerken$Score <- "MELD VanDerwerken" +# MELD 3.0 +sum(is.na(stph.meld$MELD3.surv)) # 10 missing values due to Albumin.MELD +cal_MELD3.surv <- calibration(stph.meld$MELD3.surv[!is.na(stph.meld$MELD3.surv)], y = stph.meld$D90_surv[!is.na(stph.meld$MELD3.surv)]) +cal_MELD3.surv$Score <- "MELD 3.0" + # Lille -val.prob(stph.lille$Lille.surv, stph.lille$D90_surv, pl = TRUE, smooth = FALSE, logistic.cal = TRUE, - xlab = "Predicted survival probability", ylab = "Actual survival probability", - legendloc = FALSE, statloc = FALSE) +cal_Lille <- calibration(stph.lille$Lille.surv, y = stph.lille$D90_surv) +cal_Lille$Score <- "Lille" # CLIF-C ACLF -val.prob(stph.clif$CLIF.surv, stph.clif$D90_surv, pl = TRUE, smooth = FALSE, logistic.cal = TRUE, - xlab = "Predicted survival probability", ylab = "Actual survival probability", - legendloc = FALSE, statloc = FALSE) +cal_CLIF <- calibration(stph.clif$CLIF.surv, y = stph.clif$D90_surv) +cal_CLIF$Score <- "CLIF-C ACLF" + +# combine dfs +df_cal <- rbind(cal_MELD.surv, cal_MELD.surv2, cal_MELD.VanDerwerken, cal_MELD3.surv, cal_Lille, cal_CLIF) + +############################################# +###################### Plots ############### +############################################# + +# plot without ribbon and without MELD 3.0 +df_cal %>% filter(Score != "MELD 3.0") %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + #geom_ribbon(aes(ymin = lower, ymax = upper, linetype = NA), + # alpha = 0.3, show.legend = F) + + #scale_fill_manual("", values = col) + + #scale_color_manual(name = "Score", values = col) + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed proportion") + + xlab("Predicted probability") + + theme_classic() + +# plot with ribbon +p_calibration <- df_cal %>% filter(Score != "MELD 3.0") %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + facet_grid(. 
~ Score) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_ribbon(aes(ymin = lower, ymax = upper, fill = Score, linetype = NA), + alpha = 0.3, show.legend = F) + + #scale_fill_manual("", values = col) + + #scale_color_manual(name = "Score", values = col) + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + coord_equal() + + xlim(0, 1) + + ylim(0, 1) + + labs(y = "Observed survival proportion", x = "Predicted survival probability") + + theme_classic2() + + theme(legend.position = "none") + +############################################# +############## Summary Stats ############### +############################################# + +# compute calibration slopes and intercepts + +# MELD_1 survival function +MELD1_stats <- val.prob.ci.2_wrapper(stph.meld$MELD.surv, stph.meld$D90_surv, "MELD_1") + +# MELD_2 survival function +MELD2_stats <- val.prob.ci.2_wrapper(stph.meld$MELD.surv2, stph.meld$D90_surv, "MELD_2") + +# MELD VanDerwerken +MELD_Van_stats <- val.prob.ci.2_wrapper(stph.meld$MELD_Van, stph.meld$D90_surv, "MELD VanDerwerken") + +# Lille +Lille_stats <- val.prob.ci.2_wrapper(stph.lille$Lille.surv, stph.lille$D90_surv, "Lille") + +#CLIF-C ACLF +clif_stats <- val.prob.ci.2_wrapper(stph.clif$CLIF.surv, y = stph.clif$D90_surv, "CLIF-C ACLF") + + +df_stats <- rbind(MELD1_stats, MELD2_stats, MELD_Van_stats, Lille_stats, clif_stats) %>% relocate(Score) + +#df_stats$Model <- fct_reorder(df_stats$Model, df_stats$ Point.estimate) +# adding to plot +#p_calibration + geom_text(data = df_stats %>% filter(stat == "intercept"), +# aes(0.1, 0.90, label = +# paste0("Int (95% CI): ", Point.estimate, " (", Lower.confidence.limit, "-", Upper.confidence.limit, ")"), +# hjust = 0), col = "black") #+ + + +library(xtable) +xtable((df_stats)) + + -# Calculate the Hosmer-Lemeshow goodness-of-fit statistic for all models -library(ResourceSelection) -hoslem.test(stph.meld$D90_surv, stph.meld$MELD.surv) -hoslem.test(stph.meld$D90_surv, stph.meld$MELD.surv2) -hoslem.test(stph.lille$D90_surv, stph.lille$Lille.surv) -hoslem.test(stph.clif$D90_surv, stph.clif$CLIF.surv) diff --git a/original models/Calibration_6M.R b/original models/Calibration_6M.R new file mode 100644 index 0000000..efec9f2 --- /dev/null +++ b/original models/Calibration_6M.R @@ -0,0 +1,99 @@ +# This script performs all calculations of model calibration +library(ggplot2) +library(dplyr) +library(survminer) # for plotting theme + +# funs +path_funs <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/funs" +source(paste0(path_funs, "/calibration_fun.R")) +source(paste0(path_funs, "/val.prob.ci.2_wrapper.R")) + +# data +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/" +load(paste0(path_data, "original_models_6M.Rdata")) + +############################################# +############### Calculate calibration ###### +############################################# + +# MELD_1 survival function +cal_MELD.surv <- calibration(stph.meld$MELD.surv_6M, y = stph.meld$M6_surv) +cal_MELD.surv$Score <- "MELD_1" + +# Lille +cal_Lille <- calibration(stph.lille$Lille.surv_6M, y = stph.lille$M6_surv) +cal_Lille$Score <- "Lille" + +# CLIF-C ACLF +cal_CLIF <- calibration(stph.clif$CLIF.surv_6M, y = stph.clif$M6_surv) +cal_CLIF$Score <- "CLIF-C ACLF" + +# combine dfs +df_cal <- rbind(cal_MELD.surv, cal_Lille, cal_CLIF) + +############################################# +###################### Plots ############### +############################################# + 
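+# --- editorial sketch (assumption): calibration() is sourced from calibration_fun.R, which is
+# --- not part of this diff. The plotting code below expects a data frame with columns
+# --- pred, obs, lower and upper; a minimal loess-based stand-in (named calibration_sketch so
+# --- it does not mask the sourced function) could look like this.
+calibration_sketch <- function(pred, y, conf = 0.95){
+  fit <- loess(y ~ pred)                    # smooth observed outcome against predictions
+  pr  <- predict(fit, se = TRUE)            # smoothed proportion and its standard error
+  z   <- qnorm(1 - (1 - conf)/2)
+  out <- data.frame(pred  = pred,
+                    obs   = pr$fit,
+                    lower = pr$fit - z * pr$se.fit,
+                    upper = pr$fit + z * pr$se.fit)
+  out[order(out$pred), ]                    # sort so geom_line() draws a single smooth curve
+}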
+# plot without ribbon and without MELD 3.0 +df_cal %>% filter(Score != "MELD 3.0") %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + #geom_ribbon(aes(ymin = lower, ymax = upper, linetype = NA), + # alpha = 0.3, show.legend = F) + + #scale_fill_manual("", values = col) + + #scale_color_manual(name = "Score", values = col) + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed proportion") + + xlab("Predicted probability") + + theme_classic() + +# plot with ribbon +p_calibration <- df_cal %>% filter(Score != "MELD 3.0") %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_ribbon(aes(ymin = lower, ymax = upper, fill = Score, linetype = NA), + alpha = 0.3, show.legend = F) + + #scale_fill_manual("", values = col) + + #scale_color_manual(name = "Score", values = col) + + facet_grid(. ~ Score) + + scale_color_manual(values = c("#1B9E77", "#D95F02", "#E7298A")) + #palette = "Dark2") + + scale_fill_manual(values = c("#1B9E77", "#D95F02", "#E7298A")) + #palette = "Dark2") + + coord_equal() + + xlim(0, 1) + + ylim(0, 1) + + labs(y = "Observed 6-month survival proportion", x = "Predicted 6-month survival probability") + + theme_classic2() + + theme(legend.position = "none") + + +############################################# +############## Summary Stats ############### +############################################# +# compute calibration slopes and intercepts + +# MELD_1 survival function +MELD_stats <- val.prob.ci.2_wrapper(stph.meld$MELD.surv_6M, stph.meld$M6_surv, "MELD_1") + +# Lille +Lille_stats <- val.prob.ci.2_wrapper(stph.lille$Lille.surv_6M, stph.lille$M6_surv, "Lille") + +#CLIF-C ACLF +clif_stats <- val.prob.ci.2_wrapper(stph.clif$CLIF.surv_6M, y = stph.clif$M6_surv, "CLIF-C ACLF") + +df_stats <- rbind(MELD_stats, Lille_stats, clif_stats) %>% relocate(Score) + +# plot +p_calibration + geom_text(data = df_stats %>% filter(stat == "intercept"), + aes(0.05, 0.95, label = + paste0("Intercept (95% CI): ", Point.estimate, " (", Lower.confidence.limit, "-", Upper.confidence.limit, ")"), + hjust = 0), col = "black") + + geom_text(data = df_stats %>% filter(stat == "slope"), + aes(0.05, 0.90, label = + paste0("Slope (95% CI): ", Point.estimate, " (", Lower.confidence.limit, "-", Upper.confidence.limit, ")"), + hjust = 0), col = "black") + + diff --git a/original models/Calibration_imp.R b/original models/Calibration_imp.R new file mode 100644 index 0000000..5225924 --- /dev/null +++ b/original models/Calibration_imp.R @@ -0,0 +1,104 @@ +# This script performs all calculations of model calibration +library(ggplot2) +library(dplyr) +library(survminer) # for plotting theme + +# funs +path_funs <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/funs" +source(paste0(path_funs, "/calibration_fun.R")) + +# imputed data +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/" +load(paste0(path_data, "imputed_orig_scores.Rdata")) + +############################################# +############### Calculate calibration ###### +############################################# +imp_index <- group_split(imp_data2, .imp) + +imp_ind <- max(imp_data2$.imp) + +imp_index_Bili7 <- group_split(imp_data2_Biliday7, .imp) + +imp_ind_Bili7 <- max(imp_data2_Biliday7$.imp) + +# MELD_1 +cal_MELD.surv1 <- tibble() +for(i in 1:imp_ind){ + # MELD 1 + temp <- calibration(imp_index[[i]]$MELD.surv1, y = 
imp_index[[i]]$D90_surv) + temp$Score <- "MELD_1" + temp$.imp <- i + cal_MELD.surv1 <- rbind(cal_MELD.surv1, temp) +} +rm(temp) + +# MELD 2 +cal_MELD.surv2 <- tibble() +for(i in 1:imp_ind){ + temp <- calibration(imp_index[[i]]$MELD.surv2, y = imp_index[[i]]$D90_surv) + temp$Score <- "MELD_2" + temp$.imp <- i + cal_MELD.surv2 <- rbind(cal_MELD.surv2, temp) +} + +rm(temp) + +# MELD VanDerwerken +cal_MELD.VanDerwerken <- tibble() +for(i in 1:imp_ind){ + temp <- calibration(imp_index[[i]]$MELD_Van, y = imp_index[[i]]$D90_surv) + temp$Score <- "MELD VanDerwerken" + temp$.imp <- i + cal_MELD.VanDerwerken <- rbind(cal_MELD.VanDerwerken, temp) +} + +rm(temp) + +# Lille +cal_Lille <- tibble() +for(i in 1:imp_ind_Bili7){ + temp <- calibration(imp_index_Bili7[[i]]$Lille.surv, y = imp_index_Bili7[[i]]$D90_surv) + temp$Score <- "Lille" + temp$.imp <- i + cal_Lille <- rbind(cal_Lille, temp) +} + +rm(temp) + +# "CLIF-C ACLF" +cal_CLIF <- tibble() +for(i in 1:imp_ind){ + temp <- calibration(imp_index[[i]]$CLIF.surv, y = imp_index[[i]]$D90_surv) + temp$Score <- "CLIF-C ACLF" + temp$.imp <- i + cal_CLIF <- rbind(cal_CLIF, temp) +} + +# combine dfs +df_cal <- rbind(cal_MELD.surv1, cal_MELD.surv2, cal_MELD.VanDerwerken, cal_Lille, cal_CLIF) + +############################################# +###################### Plots ############### +############################################# +# plot without ribbon +df_cal %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + geom_line(aes(group = .imp), lwd = 0.7) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + #geom_ribbon(aes(ymin = lower, ymax = upper, linetype = NA), + # alpha = 0.3, show.legend = F) + + #scale_fill_manual("", values = col) + + #scale_color_manual(name = "Score", values = col) + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + facet_grid(. 
~ Score) + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed proportion") + + xlab("Predicted probability") + + theme_classic() + + theme(legend.position = "none") + + + diff --git a/original models/Calibration_imp_sens.R b/original models/Calibration_imp_sens.R new file mode 100644 index 0000000..415e3f0 --- /dev/null +++ b/original models/Calibration_imp_sens.R @@ -0,0 +1,124 @@ + +# This script performs all calculations of model calibration +library(ggplot2) +library(dplyr) +library(survminer) # for plotting theme + +# funs +path_funs <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/funs" +source(paste0(path_funs, "/calibration_fun.R")) + +# imputed data +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/" +load(paste0(path_data, "imputed_sens_orig_scores.Rdata")) + +############################################# +############### Calculate calibration ###### +############################################# +imp_index <- group_split(imp_data3, .imp) + +imp_ind <- max(imp_data3$.imp) +delta <- unique(imp_data3$delta) + +# # MELD_1 +# cal_MELD.surv1 <- tibble() +# for(i in 1:imp_ind){ +# # MELD 1 +# temp <- calibration(imp_index[[i]]$MELD.surv1, y = imp_index[[i]]$D90_surv) +# temp$Score <- "MELD_1" +# temp$delta <- rep(unique(imp_index[[i]]$delta), each = 1068) +# temp$.imp <- i +# cal_MELD.surv1 <- rbind(cal_MELD.surv1, temp) +# } +# rm(temp) +# +# # MELD 2 +# cal_MELD.surv2 <- tibble() +# for(i in 1:imp_ind){ +# temp <- calibration(imp_index[[i]]$MELD.surv2, y = imp_index[[i]]$D90_surv) +# temp$Score <- "MELD_2" +# temp$delta <- rep(unique(imp_index[[i]]$delta), each = 1068) +# temp$.imp <- i +# cal_MELD.surv2 <- rbind(cal_MELD.surv2, temp) +# } +# +# rm(temp) +# +# # MELD VanDerwerken +# cal_MELD.VanDerwerken <- tibble() +# for(i in 1:imp_ind){ +# temp <- calibration(imp_index[[i]]$MELD_Van, y = imp_index[[i]]$D90_surv) +# temp$Score <- "MELD VanDerwerken" +# temp$delta <- rep(unique(imp_index[[i]]$delta), each = 1068) +# temp$.imp <- i +# cal_MELD.VanDerwerken <- rbind(cal_MELD.VanDerwerken, temp) +# } +# +# rm(temp) + +# Lille +cal_Lille <- tibble() +for(i in 1:imp_ind){ + for(g in 1:length(delta)){ + d <- unique(imp_index[[i]]$delta) + df <- imp_index[[i]] %>% filter(delta == d[g]) + + temp <- calibration(df$Lille.surv, y = df$D90_surv) + temp$Score <- "Lille" + temp$delta <- unique(df$delta) + temp$.imp <- i + cal_Lille <- rbind(cal_Lille, temp) + } +} + +# rm(temp) +# +# # "CLIF-C ACLF" +# cal_CLIF <- tibble() +# for(i in 1:imp_ind){ +# temp <- calibration(imp_index[[i]]$CLIF.surv, y = imp_index[[i]]$D90_surv) +# temp$Score <- "CLIF-C ACLF" +# temp$delta <- rep(unique(imp_index[[i]]$delta), each = 1068) +# temp$.imp <- i +# cal_CLIF <- rbind(cal_CLIF, temp) +# } +# +# # combine dfs +# df_cal <- rbind(cal_MELD.surv1, cal_MELD.surv2, cal_MELD.VanDerwerken, cal_Lille, cal_CLIF) + +############################################# +###################### Plots ############### +############################################# +# distribution of predictions +p5 <- cal_Lille %>% ggplot(.) 
+ + geom_boxplot(aes(x = pred, fill = as.factor(as.numeric(delta)))) + + scale_fill_brewer(palette = "Dark2") + + labs(fill = expression(delta)) + + xlab("Predicted survival distribution - Lille") + + theme_classic() + + theme(axis.text.y = element_blank(), + axis.ticks.y = element_blank()) + +p6 <- cal_Lille %>% filter(Score == "Lille") %>% + mutate(grp = paste0(delta, "_", .imp), + delta = as.numeric(delta)) %>% + ggplot(., aes(x = pred, y = obs, group = grp)) + + geom_line(aes(group = grp)) + + geom_line(aes(col = as.factor(delta)), alpha = 0.5) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + #geom_ribbon(aes(ymin = lower, ymax = upper, linetype = NA), + # alpha = 0.3, show.legend = F) + + #scale_fill_manual("", values = col) + + #scale_color_manual(name = "Score", values = col) + + #facet_grid(. ~ delta) + + xlim(0, 1) + + ylim(0, 1) + + labs(col = expression(delta)) + + scale_color_brewer(palette = "Dark2") + + ylab("Observed proportion") + + xlab("Predicted probability") + + theme_classic() + + guides(colour = guide_legend(override.aes = list(alpha = 1))) + #theme(legend.position = "none") + + diff --git a/original models/Discimination_imp.R b/original models/Discimination_imp.R new file mode 100644 index 0000000..2086c4c --- /dev/null +++ b/original models/Discimination_imp.R @@ -0,0 +1,106 @@ +# This script performs all calculations for assessing discrimination + +library(pROC) +library(dplyr) +library(ggplot2) +library(tidyr) +library(forcats) +library(purrr) + +# data +#path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/" +#load(paste0(path_data, "original_models.Rdata")) + +# imputed data +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/" +load(paste0(path_data, "imputed_orig_scores.Rdata")) + +############################################# +############### Calculate calibration ###### +############################################# +imp_index <- group_split(imp_data2, .imp) + +imp_ind <- max(imp_data2$.imp) + +imp_index_Bili7 <- group_split(imp_data2_Biliday7, .imp) + +imp_ind_Bili7 <- max(imp_data2_Biliday7$.imp) + +# MELD_1 +cal_MELD.surv1 <- tibble() +for(i in 1:imp_ind){ + # MELD 1 + temp <- roc(imp_index[[i]]$D90_DTH, imp_index[[i]]$MELD.surv1) + temp2 <- data.frame(sensitivities = temp$sensitivities, + specificities = temp$specificities, + thresholds = temp$thresholds) + temp2$Score <- "MELD" + temp2$.imp <- i + cal_MELD.surv1 <- rbind(cal_MELD.surv1, temp2) +} +rm(temp2) + +# MELD VanDerwerken +cal_MELD.VanDerwerken <- tibble() +for(i in 1:imp_ind){ + temp <- roc(imp_index[[i]]$D90_DTH, imp_index[[i]]$MELD_Van) + temp2 <- data.frame(sensitivities = temp$sensitivities, + specificities = temp$specificities, + thresholds = temp$thresholds) + temp2$Score <- "MELD VanDerwerken" + temp2$.imp <- i + cal_MELD.VanDerwerken <- rbind(cal_MELD.VanDerwerken, temp2) +} +rm(temp2) + +# Lille +cal_Lille <- tibble() +for(i in 1:imp_ind_Bili7){ + temp <- roc(imp_index_Bili7[[i]]$D90_DTH, imp_index_Bili7[[i]]$Lille.surv) + temp2 <- data.frame(sensitivities = temp$sensitivities, + specificities = temp$specificities, + thresholds = temp$thresholds) + temp2$Score <- "Lille" + temp2$.imp <- i + cal_Lille <- rbind(cal_Lille, temp2) +} +rm(temp2) + +# "CLIF-C ACLF" +cal_CLIF <- tibble() +for(i in 1:imp_ind){ + temp <- roc(imp_index[[i]]$D90_DTH, imp_index[[i]]$CLIF.surv) + temp2 <- data.frame(sensitivities = temp$sensitivities, + specificities = temp$specificities, + thresholds = temp$thresholds) + 
temp2$Score <- "CLIF-C ACLF" + temp2$.imp <- i + cal_CLIF <- rbind(cal_CLIF, temp2) +} +rm(temp2) + +# combine dfs +df_cal <- rbind(cal_MELD.surv1, cal_MELD.VanDerwerken, cal_Lille, cal_CLIF) + +############################################# +###################### Plots ############### +############################################# +# plot without ribbon +df_cal %>% + ggplot(., aes(x = 1 - specificities, y = sensitivities, col = Score)) + + geom_line(aes(group = .imp), lwd = 0.7) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + #geom_ribbon(aes(ymin = lower, ymax = upper, linetype = NA), + # alpha = 0.3, show.legend = F) + + #scale_fill_manual("", values = col) + + #scale_color_manual(name = "Score", values = col) + + facet_grid(. ~ Score) + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + xlim(0, 1) + + ylim(0, 1) + + ylab("Sensitivity") + + xlab("Specificity") + + theme_classic() + + theme(legend.position = "none") + diff --git a/original models/Discrimination.R b/original models/Discrimination.R deleted file mode 100644 index 3d3b3ef..0000000 --- a/original models/Discrimination.R +++ /dev/null @@ -1,88 +0,0 @@ -# This script performs all calculations for assessing discrimination -library(tidyverse) -library(pROC) -library(dplyr) -library(ggplot2) -library(runway) - -# AUROC calculation and figure for MELD -roc_meld <- roc(stph.meld$D90_DTH, stph.meld$MELD.surv) -auc_meld <- auc(roc_meld) -auc_meld_ci <- ci.auc(roc_meld) # Confidence intervals - -roc_plot(stph.meld, "D90_surv", "MELD.surv", ci = TRUE, plot_title = "ROC curve for the MELD score") - -# AUROC calculation and figure for MELD 3.0 -roc_meld3 <- roc(stph.meld$D90_DTH, stph.meld$MELD3.surv) -auc_meld3 <- auc(roc_meld3) -auc_meld3_ci <- ci.auc(roc_meld3) # Confidence intervals - -roc_plot(stph.meld, "D90_surv", "MELD3.surv", ci = TRUE, plot_title = "ROC curve for the MELD score") - -# AUROC calculation and figure for Lille -roc_lille <- roc(stph.lille$D90_DTH, stph.lille$Lille.surv) -auc_lille <- auc(roc_lille) -auc_lille_ci <- ci.auc(roc_lille) - -roc_plot(stph.lille, "D90_surv", "Lille.surv", ci = TRUE, plot_title = "ROC curve for the Lille score") - -# AUROC calculation and figure for CLIF-C ACLF -roc_clif <- roc(stph.clif$D90_DTH, stph.clif$CLIF.surv) -auc_clif <- auc(roc_clif) -auc_clif_ci <- ci.auc(roc_clif) - -roc_plot(stph.clif, "D90_surv", "CLIF.surv", ci = TRUE, plot_title = "ROC curve for the CLIF-C ACLF score") - -# ROC plot of all models combined -library(ggROC) -ggroc(list(MELD = roc_meld, CLIF = roc_clif, Lille = roc_lille)) - -##### -# Formally ompare the c-statistics across models using bootstrap method -compareroc.mc <- roc.test(roc_clif, roc_meld) # comparison between MELD and CLIF -compareroc.ml <- roc.test(roc_meld, roc_lille) # comparison between MELD and Lille -compareroc.cl <- roc.test(roc_clif, roc_lille) # comparison between CLIF and Lille - -# Tabulate the p-values -roc_pvalues <- c(compareroc.mc$p.value, compareroc.ml$p.value, compareroc.cl$p.value) -names(roc_pvalues) <- c("p-value MELD-CLIF", "p-value MELD-Lille", "p-value CLIF-Lille") -roc_pvalues - -##### -# NRI calculations -library(nricens) - -# Define events and probability vectors -event <- stph.c$D90_surv -p.MELD <- stph.c$MELD.surv -p.MELD2 <- stph.c$MELD.surv2 -p.LILLE <- stph.c$Lille.surv -p.CLIF <- stph.c$CLIF.surv - -# Define cut-off points -cut_lille <- 1 - (exp(-0.45)/(1 + exp(-0.45))) -cut_meld <- 0.707^(exp(2.5 - 1.127)) -cut_meld2 <- 0.98465^(exp(0.1635*(25 - 10))) -cut_clif <- 
exp(-0.0079 * exp(0.0869*51)) - -# Calculate NRI for all models -# MELD_1 and Lille -NRI_ML <- nribin(event = event, p.std = p.MELD, p.new = p.LILLE, cut = cut_meld, niter = 0, updown = 'category') -# MELD_2 and Lille -NRI_M2L <- nribin(event = event, p.std = p.MELD2, p.new = p.LILLE, cut = cut_meld2, niter = 0, updown = 'category') -# CLIF-C ACLF and Lille -NRI_CL <- nribin(event = event, p.std = p.CLIF, p.new = p.LILLE, cut = cut_clif, niter = 0, updown = 'category') -# MELD_1 and CLIF-C ACLF -NRI_MC <- nribin(event = event, p.std = p.MELD, p.new = p.CLIF, cut = cut_meld, niter = 0, updown = 'category') -# MELD_2 and CLIF-C ACLF -NRI_M2C <- nribin(event = event, p.std = p.MELD2, p.new = p.CLIF, cut = cut_meld2, niter = 0, updown = 'category') -# Lille and MELD_1 -NRI_LM <- nribin(event = event, p.std = p.LILLE, p.new = p.MELD, cut = cut_lille, niter = 0, updown = 'category') -# Lille and MELD_2 -NRI_LM2 <- nribin(event = event, p.std = p.LILLE, p.new = p.MELD2, cut = cut_lille, niter = 0, updown = 'category') -# Lille and CLIF-C ACLf -NRI_LC <- nribin(event = event, p.std = p.LILLE, p.new = p.CLIF, cut = cut_lille, niter = 0, updown = 'category') -# CLIF-C ACLF and MELD_1 -NRI_CM <- nribin(event = event, p.std = p.CLIF, p.new = p.MELD, cut = cut_clif, niter = 0, updown = 'category') -# CLIF-C ACLF and MELD_2 -NRI_CM2 <- nribin(event = event, p.std = p.CLIF, p.new = p.MELD2, cut = cut_clif, niter = 0, updown = 'category') diff --git a/original models/Discrimination_6M.R b/original models/Discrimination_6M.R new file mode 100644 index 0000000..6bb5d77 --- /dev/null +++ b/original models/Discrimination_6M.R @@ -0,0 +1,126 @@ +# This script performs all calculations for assessing discrimination +library(pROC) +library(dplyr) +library(ggplot2) +library(tidyr) +library(forcats) +library(purrr) + +# data +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/" +load(paste0(path_data, "original_models_6M.Rdata")) + +############################################# +############### Calculate AUCs ############# +############################################# + +# MELD (survival function 1 and 2 do not matter here) +roc_meld <- roc(stph.meld$M6_DTH, stph.meld$MELD.surv_6M) + +# Lille +roc_lille <- roc(stph.lille$M6_DTH, stph.lille$Lille.surv_6M) + +# CLIF-C ACLF +roc_clif <- roc(stph.clif$M6_DTH, stph.clif$CLIF.surv_6M) + +############################################# +###################### Plots ############### +############################################# +roc.list <- list("CLIF-C ACLF" = roc_clif, + "Lille" = roc_lille, + "MELD" = roc_meld) + +g.list <- ggroc(roc.list) + +# ROC plot of all models combined +g.list + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + theme_classic() + +# faceting +g.list + + facet_grid(. ~ name) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + theme_classic() + + theme(legend.position="none") + +# add confidence bands +ci.list <- lapply(roc.list, ci.se, specificities = seq(0, 1, l = 25)) + +dat.ci.list <- lapply(ci.list, function(ciobj) + data.frame(x = as.numeric(rownames(ciobj)), + lower = ciobj[, 1], + upper = ciobj[, 3])) + +df <- plyr::ldply(dat.ci.list, data.frame, .id = "name") + +# rorder based on AUC scores +#df_cal$Score <- factor(df_cal$Score, labels = levels(data_wide$condition)) +ggroc(roc.list) + + facet_grid(. 
~ name) + + theme_minimal() + + geom_ribbon(data = df, aes(x = x, ymin = lower, ymax = upper, fill = name), alpha = 0.3, inherit.aes = F) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + labs(x = "Specificity", y = "Sensitivity") + + coord_equal() + + theme_classic() + + theme(legend.position = "none") + +############################################# +###################### AUC ################# +############################################# +# CI +#auc_meld <- auc(roc_meld) +#auc_meld_ci <- ci.auc(roc_meld) # Confidence intervals +#roc_plot(stph.meld, "D90_surv", "MELD.surv", ci = TRUE, plot_title = "ROC curve for the MELD score") + +df_AUC <- as.data.frame(map_dfr(roc.list, ci.auc)) +rownames(df_AUC) <- c("low_CL", "mean", "upper_CL") + +df_AUC2 <- tibble::rownames_to_column(df_AUC, var = "AUC") + +df3 <- gather(df_AUC2, condition, measurement, "CLIF-C ACLF":"MELD", factor_key = TRUE) + +#df3 <- gather(df_AUC2, condition, measurement, MELD:Lille, factor_key = TRUE) + +data_wide <- spread(df3, AUC, measurement) %>% arrange(mean) + +# reorder factor levels +data_wide$condition <- fct_reorder(data_wide$condition, data_wide$mean) + + +# p_auc <- ggplot(data_wide, aes(x = mean, y = condition, col = condition)) + +# geom_point(lwd = 2) + +# coord_cartesian(xlim = c(0.5, 0.86)) + +# geom_errorbar(aes(xmin = low_CL, xmax = upper_CL), +# alpha = 1, show.legend = F, lwd = 1, width = 0.5) + +# labs(y = "Score", col = "Score", x = "AUC with 95% limits") + +# scale_color_brewer(palette = "Dark2") + +# scale_fill_brewer(palette = "Dark2") + +# theme_classic2() + +# theme(legend.position = "none") + +##### +p_roc <- ggroc(roc.list, lwd = 1.1) + + facet_grid(. ~ name) + + geom_ribbon(data = df, aes(x = x, ymin = lower, ymax = upper, fill = name), alpha = 0.3, inherit.aes = F) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + labs(x = "Specificity", y = "Sensitivity") + + coord_equal() + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + theme_classic2() + + theme(legend.position = "none") + +data_wide$name <- data_wide$condition +data_wide$low_CL <- round(data_wide$low_CL, 2) +data_wide$mean <- round(data_wide$mean, 2) +data_wide$upper_CL <- round(data_wide$upper_CL, 2) + +p_roc + geom_text(data = data_wide, + aes(0.02, 0.10, label = paste0("AUC (95% CI): ", mean, " (", low_CL, "-", upper_CL, ")" ), + hjust = 1), col = "black") #+ + + diff --git a/original models/Discrimination_uo.R b/original models/Discrimination_uo.R new file mode 100644 index 0000000..90f5a52 --- /dev/null +++ b/original models/Discrimination_uo.R @@ -0,0 +1,158 @@ +# This script performs all calculations for assessing discrimination + +library(pROC) +library(dplyr) +library(ggplot2) +library(tidyr) +library(forcats) +library(purrr) + +# data +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/" +load(paste0(path_data, "original_models.Rdata")) + +############################################# +############### Calculate AUCs ############# +############################################# + +# MELD (survival function 1 and 2 do not matter here) +roc_meld <- roc(stph.meld$D90_DTH, stph.meld$MELD.surv) + +# MELD 3.0 +roc_meld3 <- roc(stph.meld$D90_DTH, stph.meld$MELD3.surv) + +# MELD from VanDerwerken et al 2021 +roc_meld.VanDerwerken <- roc(stph.meld$D90_DTH, stph.meld$MELD_Van) + +# Lille +roc_lille <- roc(stph.lille$D90_DTH, stph.lille$Lille.surv) + +# CLIF-C ACLF +roc_clif <- roc(stph.clif$D90_DTH, stph.clif$CLIF.surv) + 
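+# --- editorial note (sketch): the predictors above are survival probabilities while the
+# --- outcome is 90-day death, so cases should have *lower* predicted values than controls.
+# --- pROC::roc() auto-detects this, but spelling out levels/direction guards against a
+# --- silently reversed curve in resamples or subgroups; the call below (hypothetical object
+# --- name roc_meld_explicit) is assumed to be equivalent to roc_meld above.
+roc_meld_explicit <- roc(stph.meld$D90_DTH, stph.meld$MELD.surv,
+                         levels = c(0, 1), direction = ">")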
+############################################# +###################### Plots ############### +############################################# +roc.list <- list("CLIF-C ACLF" = roc_clif, + "Lille" = roc_lille, + "MELD" = roc_meld, + "MELD VanDerwerken" = roc_meld.VanDerwerken) + +#roc.list <- list("MELD" = roc_meld, +# "MELD VanDerwerken" = roc_meld.VanDerwerken, +# "CLIF-C ACLF" = roc_clif, +# "Lille" = roc_lille) + +#setwd(path_data) +#save(roc.list, file = "ROC_original.Rdata") + +g.list <- ggroc(roc.list) + +# ROC plot of all models combined +g.list + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + theme_classic() + +# faceting +g.list + + facet_grid(. ~ name) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + theme_classic() + + theme(legend.position="none") + +# add confidence bands +ci.list <- lapply(roc.list, ci.se, specificities = seq(0, 1, l = 25)) + +dat.ci.list <- lapply(ci.list, function(ciobj) + data.frame(x = as.numeric(rownames(ciobj)), + lower = ciobj[, 1], + upper = ciobj[, 3])) + +df <- plyr::ldply(dat.ci.list, data.frame, .id = "name") + +# rorder based on AUC scores +#df_cal$Score <- factor(df_cal$Score, labels = levels(data_wide$condition)) +ggroc(roc.list) + + facet_grid(. ~ name) + + theme_minimal() + + geom_ribbon(data = df, aes(x = x, ymin = lower, ymax = upper, fill = name), alpha = 0.3, inherit.aes = F) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + labs(x = "Specificity", y = "Sensitivity") + + coord_equal() + + theme_classic() + + theme(legend.position = "none") + +# To have all the curves of the same color, use aes="group": +#g.group <- ggroc(roc.list, aes="group") +#g.group +#g.group + facet_grid(.~name) + +############################################# +###################### AUC ############### +############################################# +# CI +#auc_meld <- auc(roc_meld) +#auc_meld_ci <- ci.auc(roc_meld) # Confidence intervals +#roc_plot(stph.meld, "D90_surv", "MELD.surv", ci = TRUE, plot_title = "ROC curve for the MELD score") + +df_AUC <- as.data.frame(map_dfr(roc.list, ci.auc)) +rownames(df_AUC) <- c("low_CL", "mean", "upper_CL") + +df_AUC2 <- tibble::rownames_to_column(df_AUC, var = "AUC") + + +df3 <- gather(df_AUC2, condition, measurement, `CLIF-C ACLF`:`MELD VanDerwerken`, factor_key = TRUE) + +#df3 <- gather(df_AUC2, condition, measurement, MELD:Lille, factor_key = TRUE) + +data_wide <- spread(df3, AUC, measurement) %>% arrange(mean) + +# reorder factor levels +data_wide$condition <- fct_reorder(data_wide$condition, data_wide$mean) + +p_auc <- ggplot(data_wide, aes(x = mean, y = condition, col = condition)) + + geom_point(lwd = 2) + + coord_cartesian(xlim = c(0.5, 0.86)) + + geom_errorbar(aes(xmin = low_CL, xmax = upper_CL), + alpha = 1, show.legend = F, lwd = 1, width = 0.5) + + labs(y = "Score", col = "Score", x = "AUC with 95% limits") + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + theme_classic2() + + theme(legend.position = "none") + +##### +p_roc <- ggroc(roc.list, lwd = 1.1) + + facet_grid(. 
~ name) + + geom_ribbon(data = df, aes(x = x, ymin = lower, ymax = upper, fill = name), alpha = 0.3, inherit.aes = F) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + labs(x = "Specificity", y = "Sensitivity") + + coord_equal() + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + theme_classic2() + + theme(legend.position = "none") + +data_wide$name <- data_wide$condition +data_wide$low_CL <- round(data_wide$low_CL, 2) +data_wide$mean <- round(data_wide$mean, 2) +data_wide$upper_CL <- round(data_wide$upper_CL, 2) + +p_roc + geom_text(data = data_wide, + aes(0.02, 0.10, label = paste0("AUC (95% CI): ", mean, " (", low_CL, "-", upper_CL, ")" ), + hjust = 1), col = "black") #+ +#theme_classic() + + +##### +# Formally compare the c-statistics across models using bootstrap method +compareroc.mc <- roc.test(roc_clif, roc_meld) # comparison between MELD and CLIF +compareroc.ml <- roc.test(roc_meld, roc_lille) # comparison between MELD and Lille +compareroc.cl <- roc.test(roc_clif, roc_lille) # comparison between CLIF and Lille + +# Tabulate the p-values +roc_pvalues <- c(compareroc.mc$p.value, compareroc.ml$p.value, compareroc.cl$p.value) +names(roc_pvalues) <- c("p-value MELD-CLIF", "p-value MELD-Lille", "p-value CLIF-Lille") +roc_pvalues \ No newline at end of file diff --git a/original models/ROC_original.Rdata b/original models/ROC_original.Rdata new file mode 100755 index 0000000..03005cc Binary files /dev/null and b/original models/ROC_original.Rdata differ diff --git a/original models/clinical_utility.R b/original models/clinical_utility.R old mode 100644 new mode 100755 index 631f20d..9a4d6c6 --- a/original models/clinical_utility.R +++ b/original models/clinical_utility.R @@ -1,115 +1,115 @@ -# This script performs all calculations for decision curve analysis. 
- -# add necessary functions to the directory -source("nb_diff.R") -source("dca.R") # dca.R is taken from decisioncurveanalysis.org - -# Since decision curves look better when working with mortality probability rather than survival, first -# define variables corresponding to 90-day mortality -stph.c$clif.mort <- 1 - stph.c$CLIF.surv -stph.c$meld.mort <- 1 - stph.c$MELD.surv -stph.c$meld.mort2 <- 1 - stph.c$MELD.surv2 -stph.c$lille.mort <- 1 - stph.c$Lille.surv -stph.c$meld3.mort <- 1 - stph.c$MELD3.surv - -# Perform and plot DCA for the original models -full_dca <- dca(data = stph.c, outcome = "D90_DTH", predictors = c("clif.mort", "meld.mort", "meld.mort2", "lille.mort"), xstop = 0.75) -nb_data <- full_dca$net.benefit - -plot(nb_data$threshold, nb_data$none, type = "l", lwd = 2, xlab = "Threshold mortality probability", ylab = "Net benefit", ylim = c(-0.05, 0.25)) -lines(nb_data$threshold, nb_data$all, type = "l", col = 8, lwd = 2) -lines(nb_data$threshold, nb_data$meld.mort, type = "l", col = "darkblue", lwd = 2) -lines(nb_data$threshold, nb_data$meld.mort2, type = "l", col = "darkgreen", lwd = 2) -lines(nb_data$threshold, nb_data$clif.mort, type = "l", col = "darkred", lwd = 2) -lines(nb_data$threshold, nb_data$lille.mort, type = "l", col = "orange", lwd = 2) -# Add a legend -legend("topright", cex = 0.8, legend = c("Treat none", "Treat all", "MELD_1", "MELD_2", "CLIF-C ACLF", "Lille"), - col = c(17, 8, "darkblue", "darkgreen", "darkred", "orange"), lwd = c(2, 2, 2, 2, 2, 2)) - -# Create a table of NB values for a set of reasonable threshold probabilities -table_output <- dca(data = stph.c, outcome = "D90_DTH", - predictors = c("clif.mort", "meld.mort", "lille.mort"), - xstart = 0.25, xstop = 0.75, xby = 0.10, graph = F) -table_output - -###### -# Formally compare the NB values for different values of the threshold probability using a bootstrap approach -# The code is based on that proposed by Zhang et al., 2018 -library(boot) -set.seed(34) -R <- 500 # Number of bootstrap samples - -# Compare the CLIF-C ACLF and MELD models -boot.diff.cm <- boot(data=stph.c, statistic = nb_diff, - R = R, outcome = "D90_DTH", pred1 = "clif.mort", - pred2 = "meld.mort", xstart = 0.25, xstop = 0.75, - step = 0.05) -pvalue.cm <- NULL - -for(i in 1:length(boot.diff.cm$t0)){ - pvalue.cm <- c(pvalue.cm, mean(abs(boot.diff.cm$t[,i] - boot.diff.cm$t0[i]) > abs(boot.diff.cm$t0[i]))) -} - -# Compare the MELD and Lille model -boot.diff.ml <- boot(data=stph.c, statistic = nb_diff, - R = R, outcome = "D90_DTH", pred1 = "lille.mort", - pred2 = "meld.mort", xstart = 0.25, xstop = 0.75, - step = 0.05) -pvalue.ml <- NULL - -for(i in 1:length(boot.diff.ml$t0)){ - pvalue.ml <- c(pvalue.ml, mean(abs(boot.diff.ml$t[,i] - boot.diff.ml$t0[i]) > abs(boot.diff.ml$t0[i]))) -} - -# Compare the CLIF-C ACLF and Lille model -boot.diff.cl <- boot(data=stph.c, statistic = nb_diff, - R = R, outcome = "D90_DTH", pred1 = "clif.mort", - pred2 = "lille.mort", xstart = 0.25, xstop = 0.75, - step = 0.05) -pvalue.cl <- NULL - -for(i in 1:length(boot.diff.cl$t0)){ - pvalue.cl <- c(pvalue.cl, mean(abs(boot.diff.cl$t[,i] - boot.diff.cl$t0[i]) > abs(boot.diff.cl$t0[i]))) -} - -# Compare CLIF-C ACLF with the MELD_2 -boot.diff.cm2 <- boot(data=stph.c, statistic = nb_diff, - R = R, outcome = "D90_DTH", pred1 = "clif.mort", - pred2 = "meld.mort2", xstart = 0.25, xstop = 0.75, - step = 0.05) -pvalue.cm2 <- NULL - -for(i in 1:length(boot.diff.cm2$t0)){ - pvalue.cm2 <- c(pvalue.cm2, mean(abs(boot.diff.cm2$t[,i] - boot.diff.cm2$t0[i]) > 
abs(boot.diff.cm2$t0[i]))) -} - -# Compare Lille and the MELD_2 -boot.diff.m2l <- boot(data=stph.c, statistic = nb_diff, - R = R, outcome = "D90_DTH", pred1 = "lille.mort", - pred2 = "meld.mort2", xstart = 0.25, xstop = 0.75, - step = 0.05) -pvalue.m2l <- NULL - -for(i in 1:length(boot.diff.m2l$t0)){ - pvalue.m2l <- c(pvalue.m2l, mean(abs(boot.diff.m2l$t[,i] - boot.diff.m2l$t0[i]) > abs(boot.diff.m2l$t0[i]))) -} - -# Compare both MELD options -boot.diff.mm <- boot(data=stph.c, statistic = nb_diff, - R = R, outcome = "D90_DTH", pred1 = "meld.mort2", - pred2 = "meld.mort", xstart = 0.25, xstop = 0.75, - step = 0.05) -pvalue.mm <- NULL - -for(i in 1:length(boot.diff.mm$t0)){ - pvalue.mm <- c(pvalue.mm, mean(abs(boot.diff.mm$t[,i] - boot.diff.mm$t0[i]) > abs(boot.diff.mm$t0[i]))) -} - -# Append p-values together in a dataframe for simplicity -pvalues_dca <- data.frame("threshold probability" = seq(from = 0.25, to = 0.75, by = 0.05), - "p-values Meld-CLIF" = pvalue.cm, - "p-values MELD-Lille" = pvalue.ml, - "p-values CLIF-Lille" = pvalue.cl, - "p-values MELD2-CLIF" = pvalue.cm2, - "p-values MELD2-Lille" = pvalue.m2l, - "p-values MELD-MELD2" = pvalue.mm) +# # This script performs all calculations for decision curve analysis. +# +# # add necessary functions to the directory +# source("nb_diff.R") +# source("dca.R") # dca.R is taken from decisioncurveanalysis.org +# +# # Since decision curves look better when working with mortality probability rather than survival, first +# # define variables corresponding to 90-day mortality +# stph.c$clif.mort <- 1 - stph.c$CLIF.surv +# stph.c$meld.mort <- 1 - stph.c$MELD.surv +# stph.c$meld.mort2 <- 1 - stph.c$MELD.surv2 +# stph.c$lille.mort <- 1 - stph.c$Lille.surv +# stph.c$meld3.mort <- 1 - stph.c$MELD3.surv +# +# # Perform and plot DCA for the original models +# full_dca <- dca(data = stph.c, outcome = "D90_DTH", predictors = c("clif.mort", "meld.mort", "meld.mort2", "lille.mort"), xstop = 0.75) +# nb_data <- full_dca$net.benefit +# +# plot(nb_data$threshold, nb_data$none, type = "l", lwd = 2, xlab = "Threshold mortality probability", ylab = "Net benefit", ylim = c(-0.05, 0.25)) +# lines(nb_data$threshold, nb_data$all, type = "l", col = 8, lwd = 2) +# lines(nb_data$threshold, nb_data$meld.mort, type = "l", col = "darkblue", lwd = 2) +# lines(nb_data$threshold, nb_data$meld.mort2, type = "l", col = "darkgreen", lwd = 2) +# lines(nb_data$threshold, nb_data$clif.mort, type = "l", col = "darkred", lwd = 2) +# lines(nb_data$threshold, nb_data$lille.mort, type = "l", col = "orange", lwd = 2) +# # Add a legend +# legend("topright", cex = 0.8, legend = c("Treat none", "Treat all", "MELD_1", "MELD_2", "CLIF-C ACLF", "Lille"), +# col = c(17, 8, "darkblue", "darkgreen", "darkred", "orange"), lwd = c(2, 2, 2, 2, 2, 2)) +# +# # Create a table of NB values for a set of reasonable threshold probabilities +# table_output <- dca(data = stph.c, outcome = "D90_DTH", +# predictors = c("clif.mort", "meld.mort", "lille.mort"), +# xstart = 0.25, xstop = 0.75, xby = 0.10, graph = F) +# table_output +# +# ###### +# # Formally compare the NB values for different values of the threshold probability using a bootstrap approach +# # The code is based on that proposed by Zhang et al., 2018 +# library(boot) +# set.seed(34) +# R <- 500 # Number of bootstrap samples +# +# # Compare the CLIF-C ACLF and MELD models +# boot.diff.cm <- boot(data=stph.c, statistic = nb_diff, +# R = R, outcome = "D90_DTH", pred1 = "clif.mort", +# pred2 = "meld.mort", xstart = 0.25, xstop = 0.75, +# step = 0.05) 
+# pvalue.cm <- NULL +# +# for(i in 1:length(boot.diff.cm$t0)){ +# pvalue.cm <- c(pvalue.cm, mean(abs(boot.diff.cm$t[,i] - boot.diff.cm$t0[i]) > abs(boot.diff.cm$t0[i]))) +# } +# +# # Compare the MELD and Lille model +# boot.diff.ml <- boot(data=stph.c, statistic = nb_diff, +# R = R, outcome = "D90_DTH", pred1 = "lille.mort", +# pred2 = "meld.mort", xstart = 0.25, xstop = 0.75, +# step = 0.05) +# pvalue.ml <- NULL +# +# for(i in 1:length(boot.diff.ml$t0)){ +# pvalue.ml <- c(pvalue.ml, mean(abs(boot.diff.ml$t[,i] - boot.diff.ml$t0[i]) > abs(boot.diff.ml$t0[i]))) +# } +# +# # Compare the CLIF-C ACLF and Lille model +# boot.diff.cl <- boot(data=stph.c, statistic = nb_diff, +# R = R, outcome = "D90_DTH", pred1 = "clif.mort", +# pred2 = "lille.mort", xstart = 0.25, xstop = 0.75, +# step = 0.05) +# pvalue.cl <- NULL +# +# for(i in 1:length(boot.diff.cl$t0)){ +# pvalue.cl <- c(pvalue.cl, mean(abs(boot.diff.cl$t[,i] - boot.diff.cl$t0[i]) > abs(boot.diff.cl$t0[i]))) +# } +# +# # Compare CLIF-C ACLF with the MELD_2 +# boot.diff.cm2 <- boot(data=stph.c, statistic = nb_diff, +# R = R, outcome = "D90_DTH", pred1 = "clif.mort", +# pred2 = "meld.mort2", xstart = 0.25, xstop = 0.75, +# step = 0.05) +# pvalue.cm2 <- NULL +# +# for(i in 1:length(boot.diff.cm2$t0)){ +# pvalue.cm2 <- c(pvalue.cm2, mean(abs(boot.diff.cm2$t[,i] - boot.diff.cm2$t0[i]) > abs(boot.diff.cm2$t0[i]))) +# } +# +# # Compare Lille and the MELD_2 +# boot.diff.m2l <- boot(data=stph.c, statistic = nb_diff, +# R = R, outcome = "D90_DTH", pred1 = "lille.mort", +# pred2 = "meld.mort2", xstart = 0.25, xstop = 0.75, +# step = 0.05) +# pvalue.m2l <- NULL +# +# for(i in 1:length(boot.diff.m2l$t0)){ +# pvalue.m2l <- c(pvalue.m2l, mean(abs(boot.diff.m2l$t[,i] - boot.diff.m2l$t0[i]) > abs(boot.diff.m2l$t0[i]))) +# } +# +# # Compare both MELD options +# boot.diff.mm <- boot(data=stph.c, statistic = nb_diff, +# R = R, outcome = "D90_DTH", pred1 = "meld.mort2", +# pred2 = "meld.mort", xstart = 0.25, xstop = 0.75, +# step = 0.05) +# pvalue.mm <- NULL +# +# for(i in 1:length(boot.diff.mm$t0)){ +# pvalue.mm <- c(pvalue.mm, mean(abs(boot.diff.mm$t[,i] - boot.diff.mm$t0[i]) > abs(boot.diff.mm$t0[i]))) +# } +# +# # Append p-values together in a dataframe for simplicity +# pvalues_dca <- data.frame("threshold probability" = seq(from = 0.25, to = 0.75, by = 0.05), +# "p-values Meld-CLIF" = pvalue.cm, +# "p-values MELD-Lille" = pvalue.ml, +# "p-values CLIF-Lille" = pvalue.cl, +# "p-values MELD2-CLIF" = pvalue.cm2, +# "p-values MELD2-Lille" = pvalue.m2l, +# "p-values MELD-MELD2" = pvalue.mm) diff --git a/original models/complete_cases_models.Rdata b/original models/complete_cases_models.Rdata new file mode 100755 index 0000000..6ff94c7 Binary files /dev/null and b/original models/complete_cases_models.Rdata differ diff --git a/original models/dca.r b/original models/dca.r deleted file mode 100644 index 9c17e58..0000000 --- a/original models/dca.r +++ /dev/null @@ -1,232 +0,0 @@ -#' Function that performs and plots the dca for prognostic models -#' Code taken from decisioncurveanalysis.org, referenced by -#' Vickers AJ, Elkin EB. Decision curve analysis: a novel method for evaluating prediction models. Medical Decision Making. 2006;26(6):565-74. 
-#' -#' @param data dataset that includes the predictor and outcome variables -#' @param outcome 0/1 outcome variable -#' @param predictors vector of predicted risks of the models to plot -#' @param xstart lowest value of threshold probability to be looked at -#' @param xstop highest value of threshold probability to consider -#' @param xby stepsize for looking at different threshold probabilities -#' @param ymin minimum of y-axis in the plot -#' @param probablity -#' @param harm -#' @param graph TRUE if we want a graph of NB curves to print -#' @param intervention -#' @param interventionper -#' @param smooth -#' @param loess.span - - -dca <- function(data, outcome, predictors, xstart=0.01, xstop=0.99, xby=0.01, - ymin=-0.05, probability=NULL, harm=NULL,graph=TRUE, intervention=FALSE, - interventionper=100, smooth=FALSE,loess.span=0.10) { - - # LOADING REQUIRED LIBRARIES - require(stats) - - # data MUST BE A DATA FRAME - if (class(data)!="data.frame") { - stop("Input data must be class data.frame") - } - - #ONLY KEEPING COMPLETE CASES - data=data[complete.cases(data[append(outcome,predictors)]),append(outcome,predictors)] - - # outcome MUST BE CODED AS 0 AND 1 - if (max(data[[outcome]])>1 | min(data[[outcome]])<0) { - stop("outcome cannot be less than 0 or greater than 1") - } - # xstart IS BETWEEN 0 AND 1 - if (xstart<0 | xstart>1) { - stop("xstart must lie between 0 and 1") - } - - # xstop IS BETWEEN 0 AND 1 - if (xstop<0 | xstop>1) { - stop("xstop must lie between 0 and 1") - } - - # xby IS BETWEEN 0 AND 1 - if (xby<=0 | xby>=1) { - stop("xby must lie between 0 and 1") - } - - # xstart IS BEFORE xstop - if (xstart>=xstop) { - stop("xstop must be larger than xstart") - } - - #STORING THE NUMBER OF PREDICTORS SPECIFIED - pred.n=length(predictors) - - #IF probability SPECIFIED ENSURING THAT EACH PREDICTOR IS INDICATED AS A YES OR NO - if (length(probability)>0 & pred.n!=length(probability)) { - stop("Number of probabilities specified must be the same as the number of predictors being checked.") - } - - #IF harm SPECIFIED ENSURING THAT EACH PREDICTOR HAS A SPECIFIED HARM - if (length(harm)>0 & pred.n!=length(harm)) { - stop("Number of harms specified must be the same as the number of predictors being checked.") - } - - #INITIALIZING DEFAULT VALUES FOR PROBABILITES AND HARMS IF NOT SPECIFIED - if (length(harm)==0) { - harm=rep(0,pred.n) - } - if (length(probability)==0) { - probability=rep(TRUE,pred.n) - } - - - #CHECKING THAT EACH probability ELEMENT IS EQUAL TO YES OR NO, - #AND CHECKING THAT PROBABILITIES ARE BETWEEN 0 and 1 - #IF NOT A PROB THEN CONVERTING WITH A LOGISTIC REGRESSION - for(m in 1:pred.n) { - if (probability[m]!=TRUE & probability[m]!=FALSE) { - stop("Each element of probability vector must be TRUE or FALSE") - } - if (probability[m]==TRUE & (max(data[predictors[m]])>1 | min(data[predictors[m]])<0)) { - stop(paste(predictors[m],"must be between 0 and 1 OR sepcified as a non-probability in the probability option",sep=" ")) - } - if(probability[m]==FALSE) { - model=NULL - pred=NULL - model=glm(data.matrix(data[outcome]) ~ data.matrix(data[predictors[m]]), family=binomial("logit")) - pred=data.frame(model$fitted.values) - pred=data.frame(pred) - names(pred)=predictors[m] - data=cbind(data[names(data)!=predictors[m]],pred) - print(paste(predictors[m],"converted to a probability with logistic regression. Due to linearity assumption, miscalibration may occur.",sep=" ")) - } - } - - # THE PREDICTOR NAMES CANNOT BE EQUAL TO all OR none. 
- if (length(predictors[predictors=="all" | predictors=="none"])) { - stop("Prediction names cannot be equal to all or none.") - } - - ######### CALCULATING NET BENEFIT ######### - N=dim(data)[1] - event.rate=colMeans(data[outcome]) - - # CREATING DATAFRAME THAT IS ONE LINE PER THRESHOLD PER all AND none STRATEGY - nb=data.frame(seq(from=xstart, to=xstop, by=xby)) - names(nb)="threshold" - interv=nb - - nb["all"]=event.rate - (1-event.rate)*nb$threshold/(1-nb$threshold) - nb["none"]=0 - - # CYCLING THROUGH EACH PREDICTOR AND CALCULATING NET BENEFIT - for(m in 1:pred.n){ - for(t in 1:length(nb$threshold)){ - # COUNTING TRUE POSITIVES AT EACH THRESHOLD - tp=mean(data[data[[predictors[m]]]>=nb$threshold[t],outcome])*sum(data[[predictors[m]]]>=nb$threshold[t]) - # COUNTING FALSE POSITIVES AT EACH THRESHOLD - fp=(1-mean(data[data[[predictors[m]]]>=nb$threshold[t],outcome]))*sum(data[[predictors[m]]]>=nb$threshold[t]) - #setting TP and FP to 0 if no observations meet threshold prob. - if (sum(data[[predictors[m]]]>=nb$threshold[t])==0) { - tp=0 - fp=0 - } - - # CALCULATING NET BENEFIT - nb[t,predictors[m]]=tp/N - fp/N*(nb$threshold[t]/(1-nb$threshold[t])) - harm[m] - } - interv[predictors[m]]=(nb[predictors[m]] - nb["all"])*interventionper/(interv$threshold/(1-interv$threshold)) - } - - # CYCLING THROUGH EACH PREDICTOR AND SMOOTH NET BENEFIT AND INTERVENTIONS AVOIDED - for(m in 1:pred.n) { - if (smooth==TRUE){ - lws=loess(data.matrix(nb[!is.na(nb[[predictors[m]]]),predictors[m]]) ~ data.matrix(nb[!is.na(nb[[predictors[m]]]),"threshold"]),span=loess.span) - nb[!is.na(nb[[predictors[m]]]),paste(predictors[m],"_sm",sep="")]=lws$fitted - - lws=loess(data.matrix(interv[!is.na(nb[[predictors[m]]]),predictors[m]]) ~ data.matrix(interv[!is.na(nb[[predictors[m]]]),"threshold"]),span=loess.span) - interv[!is.na(nb[[predictors[m]]]),paste(predictors[m],"_sm",sep="")]=lws$fitted - } - } - - # PLOTTING GRAPH IF REQUESTED - if (graph==TRUE) { - require(graphics) - - # PLOTTING INTERVENTIONS AVOIDED IF REQUESTED - if(intervention==TRUE) { - # initialize the legend label, color, and width using the standard specs of the none and all lines - legendlabel <- NULL - legendcolor <- NULL - legendwidth <- NULL - legendpattern <- NULL - - #getting maximum number of avoided interventions - ymax=max(interv[predictors],na.rm = TRUE) - - #INITIALIZING EMPTY PLOT WITH LABELS - plot(x=nb$threshold, y=nb$all, type="n" ,xlim=c(xstart, xstop), ylim=c(ymin, ymax), xlab="Threshold probability", ylab=paste("Net reduction in interventions per",interventionper,"patients")) - - #PLOTTING INTERVENTIONS AVOIDED FOR EACH PREDICTOR - for(m in 1:pred.n) { - if (smooth==TRUE){ - lines(interv$threshold,data.matrix(interv[paste(predictors[m],"_sm",sep="")]),col=m,lty=2) - } else { - lines(interv$threshold,data.matrix(interv[predictors[m]]),col=m,lty=2) - } - - # adding each model to the legend - legendlabel <- c(legendlabel, predictors[m]) - legendcolor <- c(legendcolor, m) - legendwidth <- c(legendwidth, 1) - legendpattern <- c(legendpattern, 2) - } - } else { - # PLOTTING NET BENEFIT IF REQUESTED - - # initialize the legend label, color, and width using the standard specs of the none and all lines - legendlabel <- c("None", "All") - legendcolor <- c(17, 8) - legendwidth <- c(2, 2) - legendpattern <- c(1, 1) - - #getting maximum net benefit - ymax=max(nb[names(nb)!="threshold"],na.rm = TRUE) - - # inializing new benfit plot with treat all option - plot(x=nb$threshold, y=nb$all, type="l", col=8, lwd=2 ,xlim=c(xstart, xstop), ylim=c(ymin, 
ymax), xlab="Threshold probability", ylab="Net benefit") - # adding treat none option - lines(x=nb$threshold, y=nb$none,lwd=2) - #PLOTTING net benefit FOR EACH PREDICTOR - for(m in 1:pred.n) { - if (smooth==TRUE){ - lines(nb$threshold,data.matrix(nb[paste(predictors[m],"_sm",sep="")]),col=m,lty=2) - } else { - lines(nb$threshold,data.matrix(nb[predictors[m]]),col=m,lty=2) - } - # adding each model to the legend - legendlabel <- c(legendlabel, predictors[m]) - legendcolor <- c(legendcolor, m) - legendwidth <- c(legendwidth, 1) - legendpattern <- c(legendpattern, 2) - } - } - # then add the legend - legend("topright", legendlabel, cex=0.8, col=legendcolor, lwd=legendwidth, lty=legendpattern) - - } - - #RETURNING RESULTS - results=list() - results$N=N - results$predictors=data.frame(cbind(predictors,harm,probability)) - names(results$predictors)=c("predictor","harm.applied","probability") - results$interventions.avoided.per=interventionper - results$net.benefit=nb - results$interventions.avoided=interv - - return(results) - -} - - - diff --git a/original models/imputed_orig_scores.Rdata b/original models/imputed_orig_scores.Rdata new file mode 100644 index 0000000..a58f11a Binary files /dev/null and b/original models/imputed_orig_scores.Rdata differ diff --git a/original models/imputed_sens_orig_scores.Rdata b/original models/imputed_sens_orig_scores.Rdata new file mode 100644 index 0000000..673edcc Binary files /dev/null and b/original models/imputed_sens_orig_scores.Rdata differ diff --git a/original models/nb_diff.R b/original models/nb_diff.R deleted file mode 100644 index 19dca3b..0000000 --- a/original models/nb_diff.R +++ /dev/null @@ -1,28 +0,0 @@ -#' Function that calculates the difference between net benefit and calculates the p-value for -#' formal comparison between the net benefits for different values of the threshold probability -#' The idea for this function is taken from: -#' Zhang Z, Rousson V, Lee WC, et al. Decision curve analysis: a technical note. Ann Transl Med. 2018;6(15):308. 
doi:10.21037/atm.2018.07.02 -#' link: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6123195/ -#' -#' @param data dataset that includes the predictor and outcome variables -#' @param ii indices parameter for bootstrap -#' @param outcome 0/1 outcome variable -#' @param pred1 first predictor for survival -#' @param pred2 second predictor for survival -#' @param xstart lowest value of threshold probability to be looked at -#' @param xstop highest value of threshold probability to consider -#' @param step step size for looking at different threshold probabilities - -nb_diff <- function(data, ii, outcome, pred1, pred2, xstart, xstop, step){ - dd <- data[ii,] - nb1 <- dca(data = dd, outcome = outcome, predictors = pred1, - xstart = 0.25, xstop = 0.75, xby = 0.05, graph = F) - nb2 <- dca(data = dd, outcome = outcome, predictors = pred2, - xstart = 0.25, xstop = 0.75, xby = 0.05, graph = F) - - nb.diff <- nb2$net.benefit[,4] - nb1$net.benefit[,4] - return(nb.diff) -} - - - diff --git a/original models/original_models.Rdata b/original models/original_models.Rdata new file mode 100755 index 0000000..afe8845 Binary files /dev/null and b/original models/original_models.Rdata differ diff --git a/original models/original_models_6M.Rdata b/original models/original_models_6M.Rdata new file mode 100644 index 0000000..a23af4a Binary files /dev/null and b/original models/original_models_6M.Rdata differ diff --git a/original models/plots_imputation_sens.R b/original models/plots_imputation_sens.R new file mode 100644 index 0000000..300e697 --- /dev/null +++ b/original models/plots_imputation_sens.R @@ -0,0 +1,46 @@ + +library(ggplot2) +library(gridExtra) + + +path_imp <- "~/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/pre-analysis/" + +# imputed data - sensitivity analysis +load(paste0(path_imp, "imputed_data_sens.Rdata")) + + +p1 <- densityplot(imp.all[[1]], ~ Bilirubin.day.7, lwd = 3, xlab = "", ylab = "", + main = expression(paste(delta, " = 0"))) +p2 <- densityplot(imp.all[[2]], ~ Bilirubin.day.7, lwd = 3, xlab = "", ylab = "", + main = expression(paste(delta, " = -50"))) +p3 <- densityplot(imp.all[[3]], ~ Bilirubin.day.7, lwd = 3, xlab = "", ylab = "", + main = expression(paste(delta, " = -100"))) +p4 <- densityplot(imp.all[[4]], ~ Bilirubin.day.7, lwd = 3, xlab = "", ylab = "", + main = expression(paste(delta, " = -200"))) + +library(patchwork) +p_prelim <- grid.arrange(p1, p2, p3, p4, nrow = 1, + bottom = textGrob("Bilirubin day 7", gp = gpar(fontsize = 11), vjust = -3), + left = textGrob("Density", gp = gpar(fontsize = 11), rot = 90, vjust = 1.5, hjust = 0) +) + +grid.arrange(p_prelim, p5, p6, nrow = 2, ncol = 3, layout_matrix = rbind(c(1,1), c(2,3))) + + + +top_row <- plot_grid(p1, p2, p3, p4, labels = "a", nrow = 1) +title <- ggdraw() + + draw_label("Bilirubin day 7", x = 0.5, y = 0.5, size = 11) + +top_row_with_title <- plot_grid( + title, top_row, + ncol = 1 + # rel_heights values control vertical title margins +) + +top_row_with_title + +bottom_row <- plot_grid(p5, p6, labels = c('b', 'c'), nrow = 1) +plot_grid(top_row_with_title, bottom_row, nrow = 2, rel_heights = c(4, 4)) + + diff --git a/original models/prognostic_scores.R b/original models/prognostic_scores.R old mode 100644 new mode 100755 index 69b7afb..c2d9c4d --- a/original models/prognostic_scores.R +++ b/original models/prognostic_scores.R @@ -1,11 +1,13 @@ ###### -# This script calculates the prognostic scores (MELD, Lille, CLIF-C ACLF) and corresponding survival probabilities. 
+# This script calculates the prognostic scores (MELD, Lille, CLIF-C ACLF) and corresponding 90-day survival probabilities. library(tidyverse) library(dplyr) library(ggplot2) -###### -# MELD score calculations +############################################# +############### MELD score ################## +############################################# +##### # First constrain INR, Creatinine, and Bilirubin to be within certain bandwidth stph.meld$INR <- ifelse(stph.meld$INR < 1, 1, stph.meld$INR) stph.meld$Creatinine.mg.dl.MELD <- ifelse(stph.meld$Creatinine.mg.dl < 1, 1, stph.meld$Creatinine.mg.dl) @@ -13,8 +15,9 @@ stph.meld$Bilirubin.mg.dl.MELD <- ifelse(stph.meld$Bilirubin.mg.dl < 1, 1, stph. stph.meld$Creatinine.mg.dl.MELD <- ifelse(stph.meld$Creatinine.mg.dl > 4, 4, stph.meld$Creatinine.mg.dl) # Calculate basic MELD score -stph.meld$MELD.calc <- 0.378*log(stph.meld$Bilirubin.mg.dl.MELD) + - 1.120*log(stph.meld$INR) + 0.957*log(stph.meld$Creatinine.mg.dl.MELD) +stph.meld$MELD.calc <- 0.378*log(stph.meld$Bilirubin.mg.dl.MELD) + + 1.120*log(stph.meld$INR) + + 0.957*log(stph.meld$Creatinine.mg.dl.MELD) # Round score to the nearest tenth stph.meld$MELD.calc <- round(stph.meld$MELD.calc, 1) @@ -48,8 +51,16 @@ stph.meld$MELD.calc <- ifelse(stph.meld$MELD.calc > 40, 40, stph.meld$MELD.calc) stph.meld$MELD.surv <- 0.707^(exp((stph.meld$MELD.calc/10) - 1.127)) stph.meld$MELD.surv2 <- 0.98465^(exp(0.1635*(stph.meld$MELD.calc - 10))) +# Extract 90-day survival probability from VanDerwerke et al, 2021 +library(readxl) +MELD_VanDerwerken <- read_excel("~/IDrive-Sync/Projects/MIMAH/data/MELD_VanDerwerken.xlsx") + +for(i in 1:nrow(stph.meld)){ + stph.meld$MELD_Van[i] <- MELD_VanDerwerken[stph.meld$MELD.calc[i] - 5, ]$SURV +} + #### -# Calculate the MELD3.0 score (sex-adjusted version of the MELD) +# Calculate the MELD 3.0 score (sex-adjusted version of the MELD) # Constrain Albumin stph.meld$Albumin.MELD <- ifelse(stph.meld$Albumin < 1.5, 1.5, stph.meld$Albumin) stph.meld$Albumin.MELD <- ifelse(stph.meld$Albumin > 3.5, 3.5, stph.meld$Albumin) @@ -63,8 +74,10 @@ stph.meld$MELD3 <- 1.33*stph.meld$Gender + 4.56*log(stph.meld$Bilirubin.mg.dl.ME stph.meld$MELD3 <- round(stph.meld$MELD3, 0) stph.meld$MELD3.surv <- 0.946^(exp(0.17698*stph.meld$MELD3 - 3.56)) +############################################# +######### CLIF-C ACLF Score ################# +############################################# ##### -# CLIF-C ACLF Score calculations # Start by calculating organ-failure sub-scores stph.clif$liver.score <- ifelse(stph.clif$Bilirubin.mg.dl < 6, 1, 2) stph.clif$liver.score <- ifelse(stph.clif$Bilirubin.mg.dl < 12, stph.clif$liver.score, 3) @@ -88,8 +101,10 @@ stph.clif$CLIF.OF <- stph.clif$liver.score + stph.clif$kidney.score + stph.clif$CLIF.C <- 10*(0.33*stph.clif$CLIF.OF + 0.04*stph.clif$Age + 0.63*log(stph.clif$WBC) - 2) stph.clif$CLIF.surv <- exp(-0.0079 * exp(0.0869*stph.clif$CLIF.C)) -###### -# Lille score calculations +############################################# +################### Lille score ############# +############################################# +##### # First create renal insufficiency dummy stph.lille$ren.insuf <- ifelse(stph.lille$Creatinine.mg.dl < 1.3, 0, 1) @@ -97,19 +112,56 @@ stph.lille$ren.insuf <- ifelse(stph.lille$Creatinine.mg.dl < 1.3, 0, 1) stph.lille$delta.bili <- stph.lille$Bilirubin.Merged - stph.lille$Bilirubin.day.7 # Calculate the Lille score and corresponding survival probability -stph.lille$LILLE <- 3.19 - 0.101*stph.lille$Age + 0.147*stph.lille$Albumin + 
0.0165*stph.lille$delta.bili - - 0.206*stph.lille$ren.insuf - 0.0065*stph.lille$Bilirubin.Merged - 0.0096*stph.lille$protime +stph.lille$LILLE <- 3.19 - + 0.101*stph.lille$Age.at.randomisation..calc. + + 0.147*stph.lille$Albumin + 0.0165*stph.lille$delta.bili - + 0.206*stph.lille$ren.insuf - + 0.0065*stph.lille$Bilirubin.Merged - + 0.0096*stph.lille$protime stph.lille$Lille.surv <- 1 - (exp(-stph.lille$LILLE)/(1 + exp(-stph.lille$LILLE))) + +#stph.lille[240,] %>% select(Bilirubin.Merged, Bilirubin.mg.dl, Bilirubin.day.7, Bilirubin.mg.dl.day.7, +# Lille...R, Lille.index, LILLE, Lille.surv, Age.at.randomisation..calc., Albumin, Creatinine, protime) ##### # Create dataframe of all complete cases including prognostic scores (used later) stph.c <- merge(stph.meld, stph.clif, by = "Subject") stph.c <- merge(stph.c, stph.lille, by = "Subject") +# +#path <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/" +#setwd(path) +#save(stph.meld, stph.lille, stph.clif, file = "original_models.Rdata") +#save(stph.c, file = "complete_cases_models.Rdata") +#save(stph, file = "full_sample.Rdata") + # Tabulate the calculated prognostic scores and survival probabilities library(table1) -table1::table1(~MELD.calc + MELD.surv + MELD.surv2 + MELD3.surv, data = stph.meld) -table1::table1(~LILLE + Lille.surv, data = stph.lille) -table1::table1(~CLIF.C + CLIF.surv, data = stph.clif) - - +tb_MELD <- table1(~ MELD.calc + MELD.surv + MELD.surv2 + MELD3.surv | factor(D90_surv), data = stph.meld) +tb_Lille <- table1(~ LILLE + Lille.surv | factor(D90_surv), data = stph.lille) +tb_CLIF <- table1(~ CLIF.C + CLIF.surv| factor(D90_surv), data = stph.clif) + +library(xtable) +xtable(as_tibble(tb_MELD)) +xtable(as_tibble(tb_Lille)) +xtable(as_tibble(tb_CLIF)) + +# KM estimates and plots +library(ggsurvfit) +survfit2(Surv(Time_to_death_from_rand, Death_event) ~ 1, data = stph) %>% + ggsurvfit() + + labs( + x = "Days", + y = "Overall survival probability" + ) + + add_confidence_interval() + + add_risktable() + +summary(survfit(Surv(Time_to_death_from_rand, Death_event) ~ 1, data = stph), times = 90) +summary(survfit(Surv(Time_to_death_from_rand, Death_event) ~ 1, data = stph), times = 183) + +survfit(Surv(Time_to_death_from_rand, Death_event) ~ 1, data = stph) %>% + gtsummary::tbl_survfit( + times = 90, + label_header = "**1-year survival (95% CI)**" + ) diff --git a/original models/prognostic_scores_6M.R b/original models/prognostic_scores_6M.R new file mode 100644 index 0000000..3da8e72 --- /dev/null +++ b/original models/prognostic_scores_6M.R @@ -0,0 +1,102 @@ +###### +# This script calculates the prognostic scores (MELD, Lille, CLIF-C ACLF) and corresponding 90-day survival probabilities. 
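+# (Note: the survival probabilities computed below are evaluated at the 6-month
+# horizon; see MELD.surv_6M, CLIF.surv_6M and Lille.surv_6M further down.)
+#
+# Optional sketch, not part of the original script: the value constraints below
+# use two separate ifelse() calls per variable, and for creatinine the second
+# call starts again from the raw column, so the lower bound is overwritten.
+# A single clamp helper would keep both bounds, e.g.
+#   clamp <- function(x, lo, hi) pmin(pmax(x, lo), hi)
+#   stph.meld$Creatinine.mg.dl.MELD <- clamp(stph.meld$Creatinine.mg.dl, 1, 4)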
+library(tidyverse) +library(dplyr) +library(ggplot2) + +############################################# +############### MELD score ################## +############################################# +##### +# First constrain INR, Creatinine, and Bilirubin to be within certain bandwidth +stph.meld$INR <- ifelse(stph.meld$INR < 1, 1, stph.meld$INR) +stph.meld$Creatinine.mg.dl.MELD <- ifelse(stph.meld$Creatinine.mg.dl < 1, 1, stph.meld$Creatinine.mg.dl) +stph.meld$Bilirubin.mg.dl.MELD <- ifelse(stph.meld$Bilirubin.mg.dl < 1, 1, stph.meld$Bilirubin.mg.dl) +stph.meld$Creatinine.mg.dl.MELD <- ifelse(stph.meld$Creatinine.mg.dl > 4, 4, stph.meld$Creatinine.mg.dl) + +# Calculate basic MELD score +stph.meld$MELD.calc <- 0.378*log(stph.meld$Bilirubin.mg.dl.MELD) + + 1.120*log(stph.meld$INR) + + 0.957*log(stph.meld$Creatinine.mg.dl.MELD) + +# Round score to the nearest tenth +stph.meld$MELD.calc <- round(stph.meld$MELD.calc, 1) + +# Multiply score by ten +stph.meld$MELD.calc <- 10*stph.meld$MELD.calc + +# Constrain serum sodium to be within certain bandwidth +stph.meld$Sodium <- ifelse(stph.meld$Sodium < 125, 125, stph.meld$Sodium) +stph.meld$Sodium <- ifelse(stph.meld$Sodium > 137, 137, stph.meld$Sodium) + +# Recalculate to get the MELD-Na score +for(i in 1:nrow(stph.meld)){ + if(stph.meld$MELD.calc[i] > 11){ + temp <- stph.meld$MELD.calc[i] + 1.32*(137 - stph.meld$Sodium[i]) - + 0.033*stph.meld$MELD.calc[i]*(137 - stph.meld$Sodium[i]) + stph.meld$MELD.calc[i] <- temp + } else { + stph.meld$MELD.calc[i] <- stph.meld$MELD.calc[i] + } +} + +# Round to nearest integer +stph.meld$MELD.calc <- round(stph.meld$MELD.calc, 0) + +# Constrain MELD to be between 6 and 40 +stph.meld$MELD.calc <- ifelse(stph.meld$MELD.calc < 6, 6, stph.meld$MELD.calc) +stph.meld$MELD.calc <- ifelse(stph.meld$MELD.calc > 40, 40, stph.meld$MELD.calc) + +# Calculate 90-day survival probability based on MELD-Na score (2 different functions) +stph.meld$MELD.surv_6M <- 0.621^(exp((stph.meld$MELD.calc/10) - 1.127)) +#stph.meld$MELD.surv2 <- 0.98465^(exp(0.1635*(stph.meld$MELD.calc - 10))) + +############################################# +######### CLIF-C ACLF Score ################# +############################################# +##### +# Start by calculating organ-failure sub-scores +stph.clif$liver.score <- ifelse(stph.clif$Bilirubin.mg.dl < 6, 1, 2) +stph.clif$liver.score <- ifelse(stph.clif$Bilirubin.mg.dl < 12, stph.clif$liver.score, 3) + +stph.clif$kidney.score <- ifelse(stph.clif$Creatinine.mg.dl < 2, 1, 2) +stph.clif$kidney.score <- ifelse(stph.clif$Creatinine.mg.dl < 3.5, stph.clif$kidney.score, 3) + +stph.clif$brain.score <- ifelse(stph.clif$HE == 0, 1, 2) +stph.clif$brain.score <- ifelse(stph.clif$HE > 2, 3, stph.clif$brain.score) + +stph.clif$coag.score <- ifelse(stph.clif$INR < 2, 1, 2) +stph.clif$coag.score <- ifelse(stph.clif$INR < 2.5, stph.clif$coag.score, 3) + +stph.clif$circ.score <- ifelse(stph.clif$MAP >= 70, 1, 2) + +# Calculate CLIF-OF score, which is the sum of sub-scores +stph.clif$CLIF.OF <- stph.clif$liver.score + stph.clif$kidney.score + + stph.clif$brain.score + stph.clif$coag.score + stph.clif$circ.score + 1 # respiratory score equal to 1 for all patients + +# Calculate the CLIF-C ACLF score and corresponding survival +stph.clif$CLIF.C <- 10*(0.33*stph.clif$CLIF.OF + 0.04*stph.clif$Age + 0.63*log(stph.clif$WBC) - 2) +stph.clif$CLIF.surv_6M <- exp(-0.0115 * exp(0.0824*stph.clif$CLIF.C)) + +############################################# +################### Lille score ############# 
+############################################# +##### +# First create renal insufficiency dummy +stph.lille$ren.insuf <- ifelse(stph.lille$Creatinine.mg.dl < 1.3, 0, 1) + +# Calculate the change in bilirubin +stph.lille$delta.bili <- stph.lille$Bilirubin.Merged - stph.lille$Bilirubin.day.7 + +# Calculate the Lille score and corresponding survival probability +stph.lille$LILLE <- 3.19 - 0.101*stph.lille$Age + 0.147*stph.lille$Albumin + 0.0165*stph.lille$delta.bili - + 0.206*stph.lille$ren.insuf - 0.0065*stph.lille$Bilirubin.Merged - 0.0096*stph.lille$protime +stph.lille$Lille.surv_6M <- 1 - (exp(-stph.lille$LILLE)/(1 + exp(-stph.lille$LILLE))) + + +#path <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/" +#setwd(path) +#save(stph.meld, stph.lille, stph.clif, file = "original_models_6M.Rdata") + + + diff --git a/original models/prognostic_scores_imp.R b/original models/prognostic_scores_imp.R new file mode 100644 index 0000000..78bb7ef --- /dev/null +++ b/original models/prognostic_scores_imp.R @@ -0,0 +1,160 @@ + +library(readxl) +library(dplyr) + +############################################# +################ load data ################## +############################################# +# imputed data path +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/pre-analysis/" + +# imputed data - default +load(paste0(path_data, "imputed_data.Rdata")) + +# imputed data - sensitivity analysis +load(paste0(path_data, "imputed_data_sens.Rdata")) + +##### +# Extract 90-day survival probability from VanDerwerke et al, 2021 +MELD_VanDerwerken <- read_excel("~/IDrive-Sync/Projects/MIMAH/data/MELD_VanDerwerken.xlsx") + +############################################# +############### MELD score ################## +############################################# +# data preparation +imp_data2 <- imp_data %>% + dplyr::mutate( + # Survival indicator + D90_surv = 1 - D90_DTH, + # MELD + INR = ifelse(INR < 1, 1, INR), + Creatinine.mg.dl.MELD = ifelse(Creatinine.mg.dl < 1, 1, Creatinine.mg.dl), + Bilirubin.mg.dl.MELD = ifelse(Bilirubin.mg.dl < 1, 1, Bilirubin.mg.dl), + Creatinine.mg.dl.MELD = ifelse(Creatinine.mg.dl > 4, 4, Creatinine.mg.dl), + MELD.calc = (0.378 * log(Bilirubin.mg.dl.MELD) + 1.120 * log(INR) + 0.957 * log(Creatinine.mg.dl.MELD)), + MELD.calc = 10 * round(MELD.calc, 1), + Sodium = ifelse(Sodium < 125, 125, Sodium), + Sodium = ifelse(Sodium > 137, 137, Sodium), + MELD.calc = ifelse(MELD.calc > 11, + (MELD.calc + 1.32 * (137 - Sodium) - 0.033 * MELD.calc * (137 - Sodium)), + MELD.calc), + MELD.calc = round(MELD.calc, 0), + MELD.calc = ifelse(MELD.calc < 6, 6, MELD.calc), + MELD.calc = ifelse(MELD.calc > 40, 40, MELD.calc), + MELD.surv1 = 0.707^(exp((MELD.calc/10) - 1.127)), + MELD.surv2= 0.98465^(exp(0.1635*(MELD.calc - 10))), + MELD_Van = MELD_VanDerwerken[MELD.calc - 5, ]$SURV, + Albumin.MELD = ifelse(Albumin < 1.5, 1.5, Albumin), + Albumin.MELD = ifelse(Albumin > 3.5, 3.5, Albumin), + MELD3 = 1.33 * Gender + + 4.56 * log(Bilirubin.mg.dl.MELD) + + 0.82 * (137 - Sodium) - + (0.24 * (137 - Sodium) * log(Bilirubin.mg.dl.MELD)) + + 9.09 * log(INR) + + 11.14 * log(Creatinine.mg.dl.MELD) + + 1.85 * (3.5 - Albumin.MELD) - + (1.83 * (3.5 - Albumin.MELD) * log(Creatinine.mg.dl.MELD)) + 6, + MELD3 = round(MELD3, 0), + MELD3.surv = 0.946^(exp(0.17698*MELD3 - 3.56)), + # CLIF + liver.score = ifelse(Bilirubin.mg.dl < 6, 1, 2), + liver.score = ifelse(Bilirubin.mg.dl < 12, liver.score, 3), + kidney.score = ifelse(Creatinine.mg.dl < 2, 1, 2), + kidney.score = 
ifelse(Creatinine.mg.dl < 3.5, kidney.score, 3), + brain.score = ifelse(HE == 0, 1, 2), + brain.score = ifelse(HE > 2, 3, brain.score), + coag.score = ifelse(INR < 2, 1, 2), + coag.score = ifelse(INR < 2.5, coag.score, 3), + circ.score = ifelse(MAP >= 70, 1, 2), + CLIF.OF = liver.score + kidney.score + brain.score + coag.score + circ.score + 1, + CLIF.C = 10 * (0.33 * CLIF.OF + 0.04 * Age + 0.63 * log(WBC) - 2), + CLIF.surv = exp(-0.0079 * exp(0.0869 * CLIF.C)), + # Lille + #ren.insuf = ifelse(Creatinine.mg.dl < 1.3, 0, 1), + #Bilirubin.Merged = Bilirubin.mg.dl * 17, + #delta.bili = Bilirubin.Merged - Bilirubin.day.7, + #LILLE = 3.19 - 0.101 * Age + 0.147 * Albumin + 0.0165 * delta.bili - 0.206 * ren.insuf - 0.0065 * Bilirubin.Merged - 0.0096 * protime, + #Lille.surv = 1 - (exp(-LILLE)/(1 + exp(-LILLE))) + + ) + +colnames(imp_data2) + +imp_data2_Biliday7 <- imp_data_day7 %>% + dplyr::mutate( + # Survival indicator + D90_surv = 1 - D90_DTH, + # Lille + ren.insuf = ifelse(Creatinine.mg.dl < 1.3, 0, 1), + Bilirubin.Merged = Bilirubin.mg.dl * 17, + delta.bili = Bilirubin.Merged - Bilirubin.day.7, + LILLE = 3.19 - 0.101 * Age + 0.147 * Albumin + 0.0165 * delta.bili - 0.206 * ren.insuf - 0.0065 * Bilirubin.Merged - 0.0096 * protime, + Lille.surv = 1 - (exp(-LILLE)/(1 + exp(-LILLE))) + ) + +#setwd("~/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models") +#save(imp_data2, imp_data2_Biliday7, file = "imputed_orig_scores.Rdata") + +# data preparation sensitivity analysis +imp_data3 <- imp_sens_df %>% + dplyr::mutate( + # Survival indicator + D90_surv = 1 - D90_DTH, + # MELD + INR = ifelse(INR < 1, 1, INR), + Creatinine.mg.dl.MELD = ifelse(Creatinine.mg.dl < 1, 1, Creatinine.mg.dl), + Bilirubin.mg.dl.MELD = ifelse(Bilirubin.mg.dl < 1, 1, Bilirubin.mg.dl), + Creatinine.mg.dl.MELD = ifelse(Creatinine.mg.dl > 4, 4, Creatinine.mg.dl), + MELD.calc = (0.378 * log(Bilirubin.mg.dl.MELD) + 1.120 * log(INR) + 0.957 * log(Creatinine.mg.dl.MELD)), + MELD.calc = 10 * round(MELD.calc, 1), + Sodium = ifelse(Sodium < 125, 125, Sodium), + Sodium = ifelse(Sodium > 137, 137, Sodium), + MELD.calc = ifelse(MELD.calc > 11, + (MELD.calc + 1.32 * (137 - Sodium) - 0.033 * MELD.calc * (137 - Sodium)), + MELD.calc), + MELD.calc = round(MELD.calc, 0), + MELD.calc = ifelse(MELD.calc < 6, 6, MELD.calc), + MELD.calc = ifelse(MELD.calc > 40, 40, MELD.calc), + MELD.surv1 = 0.707^(exp((MELD.calc/10) - 1.127)), + MELD.surv2= 0.98465^(exp(0.1635*(MELD.calc - 10))), + MELD_Van = MELD_VanDerwerken[MELD.calc - 5, ]$SURV, + Albumin.MELD = ifelse(Albumin < 1.5, 1.5, Albumin), + Albumin.MELD = ifelse(Albumin > 3.5, 3.5, Albumin), + MELD3 = 1.33 * Gender + + 4.56 * log(Bilirubin.mg.dl.MELD) + + 0.82 * (137 - Sodium) - + (0.24 * (137 - Sodium) * log(Bilirubin.mg.dl.MELD)) + + 9.09 * log(INR) + + 11.14 * log(Creatinine.mg.dl.MELD) + + 1.85 * (3.5 - Albumin.MELD) - + (1.83 * (3.5 - Albumin.MELD) * log(Creatinine.mg.dl.MELD)) + 6, + MELD3 = round(MELD3, 0), + MELD3.surv = 0.946^(exp(0.17698*MELD3 - 3.56)), + # CLIF + liver.score = ifelse(Bilirubin.mg.dl < 6, 1, 2), + liver.score = ifelse(Bilirubin.mg.dl < 12, liver.score, 3), + kidney.score = ifelse(Creatinine.mg.dl < 2, 1, 2), + kidney.score = ifelse(Creatinine.mg.dl < 3.5, kidney.score, 3), + brain.score = ifelse(HE == 0, 1, 2), + brain.score = ifelse(HE > 2, 3, brain.score), + coag.score = ifelse(INR < 2, 1, 2), + coag.score = ifelse(INR < 2.5, coag.score, 3), + circ.score = ifelse(MAP >= 70, 1, 2), + CLIF.OF = liver.score + kidney.score + brain.score + coag.score + circ.score + 1, 
+ CLIF.C = 10 * (0.33 * CLIF.OF + 0.04 * Age + 0.63 * log(WBC) - 2), + CLIF.surv = exp(-0.0079 * exp(0.0869 * CLIF.C)), + # Lille + ren.insuf = ifelse(Creatinine.mg.dl < 1.3, 0, 1), + Bilirubin.Merged = Bilirubin.mg.dl * 17, + delta.bili = Bilirubin.Merged - Bilirubin.day.7, + LILLE = 3.19 - 0.101 * Age + 0.147 * Albumin + 0.0165 * delta.bili - 0.206 * ren.insuf - 0.0065 * Bilirubin.Merged - 0.0096 * protime, + Lille.surv = 1 - (exp(-LILLE)/(1 + exp(-LILLE))) + + ) + +colnames(imp_data3) +dim(imp_data3) + +#setwd("~/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models") +#save(imp_data3, file = "imputed_sens_orig_scores.Rdata") + diff --git a/original models/sample_size.R b/original models/sample_size.R new file mode 100644 index 0000000..7fd89c6 --- /dev/null +++ b/original models/sample_size.R @@ -0,0 +1,10 @@ +# Sample sizes for original analysis + +# data +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/original models/" +load(paste0(path_data, "original_models.Rdata")) + +# overall sample size per model +nrow(stph.meld); nrow(stph.lille); nrow(stph.clif) + + diff --git a/original models/val_data/models_validation.Rdata b/original models/val_data/models_validation.Rdata new file mode 100644 index 0000000..b29b482 Binary files /dev/null and b/original models/val_data/models_validation.Rdata differ diff --git a/original models/val_data/prognostic_scores_validation.R b/original models/val_data/prognostic_scores_validation.R new file mode 100644 index 0000000..296e5aa --- /dev/null +++ b/original models/val_data/prognostic_scores_validation.R @@ -0,0 +1,142 @@ + +############################################# +# Load data +path <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/pre-analysis/val_data/" +load(paste0(path, "full_sample_Global_AlcHep.Rdata")) + +############################################# +############### MELD score ################## +############################################# +##### +# First constrain INR, Creatinine, and Bilirubin to be within certain bandwidth +Global_AlcHep.meld$INR.MELD <- ifelse(Global_AlcHep.meld$INR < 1, 1, Global_AlcHep.meld$INR) +Global_AlcHep.meld$Creatinine.MELD <- ifelse(Global_AlcHep.meld$Creatinine < 1, 1, + ifelse(Global_AlcHep.meld$Creatinine > 4, 4, + Global_AlcHep.meld$Creatinine)) +#Global_AlcHep.meld$Bilirubin.mg.dl.MELD <- Global_AlcHep.meld$Bilirubin / 10 +Global_AlcHep.meld$Bilirubin.MELD <- ifelse(Global_AlcHep.meld$Bilirubin < 1, 1, + Global_AlcHep.meld$Bilirubin) + +# Calculate basic MELD score +Global_AlcHep.meld$MELD.calc <- 0.378 * log(Global_AlcHep.meld$Bilirubin.MELD) + + 1.120 * log(Global_AlcHep.meld$INR.MELD) + + 0.957 * log(Global_AlcHep.meld$Creatinine.MELD) + +# Round score to the nearest tenth +Global_AlcHep.meld$MELD.calc <- round(Global_AlcHep.meld$MELD.calc, 1) + +# Multiply score by ten +Global_AlcHep.meld$MELD.calc <- 10 * Global_AlcHep.meld$MELD.calc + +# Constrain serum sodium to be within certain bandwidth +Global_AlcHep.meld$Sodium <- ifelse(Global_AlcHep.meld$Sodium < 125, 125, Global_AlcHep.meld$Sodium) +Global_AlcHep.meld$Sodium <- ifelse(Global_AlcHep.meld$Sodium > 137, 137, Global_AlcHep.meld$Sodium) + +# Recalculate to get the MELD-Na score +for(i in 1:nrow(Global_AlcHep.meld)){ + if(Global_AlcHep.meld$MELD.calc[i] > 11){ + temp <- Global_AlcHep.meld$MELD.calc[i] + 1.32 * (137 - Global_AlcHep.meld$Sodium[i]) - + 0.033 * Global_AlcHep.meld$MELD.calc[i]*(137 - Global_AlcHep.meld$Sodium[i]) + Global_AlcHep.meld$MELD.calc[i] <- temp + } else { + 
Global_AlcHep.meld$MELD.calc[i] <- Global_AlcHep.meld$MELD.calc[i] + } +} + +# Round to nearest integer +Global_AlcHep.meld$MELD.calc <- round(Global_AlcHep.meld$MELD.calc, 0) + +# Constrain MELD to be between 6 and 40 +Global_AlcHep.meld$MELD.calc <- ifelse(Global_AlcHep.meld$MELD.calc < 6, 6, Global_AlcHep.meld$MELD.calc) +Global_AlcHep.meld$MELD.calc <- ifelse(Global_AlcHep.meld$MELD.calc > 40, 40, Global_AlcHep.meld$MELD.calc) + +# Calculate 90-day survival probability based on MELD-Na score (2 different functions) +Global_AlcHep.meld$MELD.surv <- 0.707^(exp((Global_AlcHep.meld$MELD.calc/10) - 1.127)) +Global_AlcHep.meld$MELD.surv2 <- 0.98465^(exp(0.1635*(Global_AlcHep.meld$MELD.calc - 10))) + +# Extract 90-day survival probability from VanDerwerke et al, 2021 +library(readxl) +MELD_VanDerwerken <- read_excel("~/IDrive-Sync/Projects/MIMAH/data/MELD_VanDerwerken.xlsx") + +for(i in 1:nrow(Global_AlcHep.meld)){ + Global_AlcHep.meld$MELD_Van[i] <- MELD_VanDerwerken[Global_AlcHep.meld$MELD.calc[i] - 5, ]$SURV +} + +#### +# Calculate the MELD 3.0 score (sex-adjusted version of the MELD) +# Constrain Albumin +Global_AlcHep.meld$Albumin.MELD <- ifelse(Global_AlcHep.meld$Albumin < 1.5, 1.5, Global_AlcHep.meld$Albumin) +Global_AlcHep.meld$Albumin.MELD <- ifelse(Global_AlcHep.meld$Albumin> 3.5, 3.5, Global_AlcHep.meld$Albumin) + +# Calculate MELD 3.0, round to nearest integer, and calculate corresponding survival +#Global_AlcHep.meld$MELD3 <- 1.33 * Global_AlcHep.meld$Gender + +# 4.56 * log(Global_AlcHep.meld$Bilirubin.mg.dl.MELD) + +# 0.82 * (137 - Global_AlcHep.meld$Sodium) - +# (0.24 * (137 - Global_AlcHep.meld$Sodium) * log(Global_AlcHep.meld$Bilirubin.mg.dl.MELD)) + +# 9.09 * log(Global_AlcHep.meld$INR) + +# 11.14 * log(Global_AlcHep.meld$Creatinine.mg.dl.MELD) + +# 1.85 * (3.5 - Global_AlcHep.meld$Albumin.MELD) - +# (1.83 * (3.5 - Global_AlcHep.meld$Albumin.MELD) * log(Global_AlcHep.meld$Creatinine.mg.dl.MELD)) +#+ 6 + +#stph.meld$MELD3 <- round(stph.meld$MELD3, 0) +#stph.meld$MELD3.surv <- 0.946^(exp(0.17698*stph.meld$MELD3 - 3.56)) + +############################################# +################### Lille score ############# +############################################# +##### +# First create renal insufficiency dummy +Global_AlcHep.lille$ren.insuf <- ifelse(Global_AlcHep.lille$Creatinine <= 1.3, 0, 1) + +# convert from mg/L to micromole per litre (μmol/L) +Global_AlcHep.lille$Bilirubin.Lille <- Global_AlcHep.lille$Bilirubin * 17.1 #5.5506216696 +Global_AlcHep.lille$Bilirubin.day.7.Lille <- Global_AlcHep.lille$Bilirubin.day.7 * 17.1 #5.5506216696 + +# convert from g/dL to g/L +Global_AlcHep.lille$Albumin.Lille <- Global_AlcHep.lille$Albumin * 10 + +# Calculate the change in bilirubin +Global_AlcHep.lille$delta.bili <- Global_AlcHep.lille$Bilirubin.Lille - Global_AlcHep.lille$Bilirubin.day.7.Lille + +# Calculate the Lille score and corresponding survival probability +Global_AlcHep.lille$LILLE <- 3.19 - + 0.101 * Global_AlcHep.lille$Age + + 0.147 * Global_AlcHep.lille$Albumin.Lille + + 0.0165 * Global_AlcHep.lille$delta.bili - + 0.206 * Global_AlcHep.lille$ren.insuf - + 0.0065 * Global_AlcHep.lille$Bilirubin.Lille - + 0.0096 * Global_AlcHep.lille$protime + +Global_AlcHep.lille$Lille.risk <- (exp(-Global_AlcHep.lille$LILLE)/(1 + exp(-Global_AlcHep.lille$LILLE))) +Global_AlcHep.lille$Lille.surv <- 1 - Global_AlcHep.lille$Lille.risk + +#Global_AlcHep.lille[100,] %>% +# select(Bilirubin, Bilirubin.Lille, Age, Bilirubin.day.7, Bilirubin.day.7.Lille, +# Albumin, Creatinine, protime, LILLE, 
Lille.risk) + +############################################# +############# Add updated models ############ +############################################# +Global_AlcHep.meld <- Global_AlcHep.meld %>% + mutate(meld_updated = 2.54 - 0.09 * MELD.calc, + meld.surv.updated = 1/(1 + exp(- meld_updated))) +Global_AlcHep.lille <- Global_AlcHep.lille %>% + mutate(lille_updated = 0.98 + 0.36 * LILLE, + lille.surv.updated = 1/(1 + exp(- lille_updated))) + +#path <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/val_data" +#setwd(path) +#save(Global_AlcHep.meld, Global_AlcHep.lille, file = "models_validation.Rdata") + +# Tabulate the calculated prognostic scores and survival probabilities +library(table1) +tb_MELD <- table1(~ MELD.calc + MELD.surv + MELD.surv2 | factor(D90_surv), data = Global_AlcHep.meld) +tb_Lille <- table1(~ LILLE + Lille.surv | factor(D90_surv), data = Global_AlcHep.lille) + +library(xtable) +xtable(as_tibble(tb_MELD)) +xtable(as_tibble(tb_Lille)) + + + diff --git a/pre-analysis/.DS_Store b/pre-analysis/.DS_Store new file mode 100755 index 0000000..5008ddf Binary files /dev/null and b/pre-analysis/.DS_Store differ diff --git a/pre-analysis/full_sample.Rdata b/pre-analysis/full_sample.Rdata new file mode 100755 index 0000000..bd82c7d Binary files /dev/null and b/pre-analysis/full_sample.Rdata differ diff --git a/pre-analysis/import_data.R b/pre-analysis/import_data.R old mode 100644 new mode 100755 index 7e54aa9..063f060 --- a/pre-analysis/import_data.R +++ b/pre-analysis/import_data.R @@ -4,9 +4,11 @@ library(tidyverse) library(dplyr) # Load data -load("stopah_plus_scores.Rdata") +path <- "/Users/work/IDrive-Sync/Projects/MIMAH/data" +load(paste0(path, "/stopah_plus_scores.Rdata")) stph <- rename(data_stph_prelim3) + # Rename relevant variables stph <- rename(stph, Bilirubin.mg.dl = Bilirubin.Merged..mg.dL..Merged..calc.) 
stph <- rename(stph, Creatinine = Creatinine...Merged) @@ -16,12 +18,47 @@ stph <- rename(stph, INR = INR...Merged.clinical.and.calc) stph <- rename(stph, protime = Prothrombin.Time..patient....Merged) # Prothrombin time stph <- rename(stph, HE = Hepatic.Encephalopathy...Merged) # Brain function stph <- rename(stph, Sodium = Sodium...Merged) - + # Create survival variable stph$D90_surv <- 1 - stph$D90_DTH # Transform creatinine into mg/dl -stph$Creatinine.mg.dl <- 0.0113*stph$Creatinine +stph$Creatinine.mg.dl <- 0.0113 * stph$Creatinine + +# Time-to-event +stph <- stph %>% + mutate(Date_of_death = as.Date(Date_of_death, format = "%d/%m/%y"), + Date_rand = as.Date(Treatment.start.date..randomisation.date.if.rx.start.date.missing., format = "%d/%m/%y"), + Date_Last_day_study_contact = as.Date(Last_day_study_contact, format = "%d-%B-%y"), + Death_event = ifelse(is.na(Date_of_death), 0, 1), + Time_to_death_from_rand = ifelse(Death_event == 1, + (Date_of_death - Date_rand), (Date_Last_day_study_contact - Date_rand))) %>% + mutate(M6_DTH = ifelse((Death_event == 1 & Time_to_death_from_rand <= 183), 1, 0), # Calculate 6-month death indicator + M6_surv = 1 - M6_DTH) # Calculate 6-month survival indicator + +#stph2$Time_to_death_from_rand +#stph2$Death_event +#stph2 %>% filter(Death_event == 0 & Time_to_death_from_rand < 90) %>% +# select(Time_to_death_from_rand, Death_event, Date_of_death, +# Time.to.death..calculated.by.MRIS.retrieved.data., Date_Last_day_study_contact, Date_rand) #%>% + #summarise(n()) + +#stph$Last_day_study_contact[1] +#as.Date(stph$Last_day_study_contact[1], format = "%d-%B-%y") +#stph2 <- stph %>% mutate(Event = ifelse(is.na(Time.to.death..calculated.by.MRIS.retrieved.data.), 0, 1)) +#stph2 %>% filter(Time.to.death..calculated.by.MRIS.retrieved.data. < 90) +#range(stph$Time.to.death..calculated.by.MRIS.retrieved.data., na.rm = T) +#stph$Y1_DTH +#plot(density(na.omit(stph$Time.to.death..calculated.by.MRIS.retrieved.data.))) +#as.Date(Date_of_death, format = "%d/%m/%y") +#ind <-str_detect(colnames(stph), "rand") +#stph[,ind] + +# check last day contact +#The trial is currently recruiting. Recruitment commenced +#on 1 February 2011 and will finish on 28 February 2014 - +#from https://link.springer.com/content/pdf/10.1186/1745-6215-14-262.pdf +max(stph$Date_Last_day_study_contact) # Handle missing data and create data frames for each of the prognostic scores stph.meld <- stph[complete.cases(stph$Bilirubin.mg.dl, stph$Creatinine, stph$INR, stph$Sodium),] @@ -30,13 +67,44 @@ stph.clif <- stph[complete.cases(stph$Bilirubin.mg.dl, stph$Creatinine, stph$INR stph.lille <- stph[complete.cases(stph$Bilirubin.day.7, stph$Bilirubin.Merged, stph$Creatinine, stph$protime, stph$Albumin),] +#path <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/pre-analysis/" +#setwd(path) +#save(stph, file = "full_sample.Rdata") + # Create table of descriptive statistics and number of missing values per variable library(table1) # Some factor variables stph$HE_f <- as.factor(stph$HE) stph$D90_surv_f <- as.factor(stph$D90_surv) stph$Gender_f <- as.factor(stph$Gender) -table1::table1(~Bilirubin.mg.dl + Creatinine.mg.dl + Albumin + WBC + protime + - INR + HE_f + Bilirubin.day.7 + Gender_f + D90_surv_f + Sodium, data = stph) +# table stratified by outcome +tb1 <- table1::table1(~ Age.at.randomisation..calc. 
+ Bilirubin.mg.dl + Creatinine.mg.dl + Albumin + WBC + protime + + INR + HE_f + Bilirubin.day.7 + Gender_f + Sodium | factor(D90_surv_f), data = stph) + +library(xtable) +xtable(as_tibble(tb1)) + +# 90-day death rate +mean(stph$D90_DTH) + +# KM estimates and plots +library(ggsurvfit) +survfit2(Surv(Time_to_death_from_rand, Death_event) ~ 1, data = stph) %>% + ggsurvfit() + + labs( + x = "Days", + y = "Overall survival probability" + ) + + add_confidence_interval() + + add_risktable() + +summary(survfit(Surv(Time_to_death_from_rand, Death_event) ~ 1, data = stph), times = 90) +summary(survfit(Surv(Time_to_death_from_rand, Death_event) ~ 1, data = stph), times = 183) + +survfit(Surv(Time_to_death_from_rand, Death_event) ~ 1, data = stph) %>% + gtsummary::tbl_survfit( + times = 90, + label_header = "**1-year survival (95% CI)**" + ) diff --git a/pre-analysis/imputation.R b/pre-analysis/imputation.R new file mode 100644 index 0000000..b1aec17 --- /dev/null +++ b/pre-analysis/imputation.R @@ -0,0 +1,168 @@ + +################################################## +###################### pkgs ###################### +################################################## +library(mice) +library(dplyr) +library(purrr) +library(forcats) +library(VIM) +library(ggplot2) +library(gridExtra) + +################################################## +###################### data ###################### +################################################## +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/pre-analysis/" +load(paste0(path_data, "full_sample.Rdata")) + +# select patients that died within 7 days of randomisation +dead_before_day7 <- stph %>% filter(Time_to_death_from_rand <= 7 & Death_event == 1) + +# these are 49 in total +dead_before_day7 %>% summarise(n()) + +# exclude patients that died within 7 days from analysis +stph_day7 <- stph %>% filter(! (Subject %in% dead_before_day7$Subject)) + +################################################## +################## logistic ###################### +################################################## +# association between MELD and missing bilirubin day 7 +stph_missing_day7 <- stph_day7 %>% mutate(miss_bili_day7 = ifelse(is.na(Bilirubin.day.7), 1, 0)) + +fit <- glm(miss_bili_day7 ~ MELD, data = stph_missing_day7, family = binomial("logit")) +exp(coef(fit)) # odds ratio +exp(confint.default(fit)) # CIs +summary(fit) + +################################################## +################ imputation ###################### +################################################## +# vars to use in the imputation +vars_MELD <- c("Creatinine.mg.dl", "Bilirubin.mg.dl", "INR", "Sodium") +vars_MELD3 <- c(vars_MELD, "Albumin", "Gender") +vars_Lille_without_Bili7 <- c("Age", "Albumin", "Bilirubin.mg.dl", "protime") +vars_CLIF <- c("Bilirubin.mg.dl", "Creatinine.mg.dl", "HE", "INR", "MAP", "Age", "WBC") + +vars_Lille_with_Bili7 <- c("Age", "Albumin", "Bilirubin.day.7", "Bilirubin.mg.dl", "protime") + +# select dataframe for all models excluding Bilirubin.day.7 +stph_short <- stph %>% + rename(Age = Age.at.randomisation..calc.) %>% + select(D90_DTH, all_of(vars_MELD), all_of(vars_MELD3), all_of(vars_Lille_without_Bili7), all_of(vars_CLIF)) + +#select dataframe for Lille +stph_short_BiliDay7 <- stph %>% + filter(! (Subject %in% dead_before_day7$Subject)) %>% #exclude patients that died within 7 days from analysis + rename(Age = Age.at.randomisation..calc.) 
%>% + select(D90_DTH, all_of(vars_MELD), all_of(vars_MELD3), all_of(vars_Lille_with_Bili7), all_of(vars_CLIF)) + +# create df for plot of missing values +stph_plot <- stph %>% + rename(Age = Age.at.randomisation..calc.) %>% + select(D90_DTH, all_of(vars_MELD), all_of(vars_MELD3), all_of(vars_Lille_with_Bili7), all_of(vars_CLIF)) + +# inspect the missing data pattern +#md.pattern(stph_short, rotate.names = T) + +# missingness plot +a <- aggr(stph_plot, plot = F) +#plot(a, numbers = TRUE, prop = FALSE, only.miss = T, combined = T) + +a$missings %>% filter(Count != 0) %>% # filter out complete obs + mutate(Variable = + case_when( + Variable == "Bilirubin.mg.dl" ~ "Bilirubin day 0", + Variable == "Creatinine.mg.dl" ~ "Creatinine", + Variable == "Bilirubin.day.7" ~ "Bilirubin day 7", + Variable == "proctime" ~ "Prothrombin time", + TRUE ~ Variable)) %>% + mutate(Variable = fct_reorder(Variable, Count)) %>% # reorder factor + ggplot(., aes(x = Variable, y = Count)) + + geom_bar(stat = "identity") + + geom_text(aes(label = Count), hjust = -0.5, size = 3.5) + + labs(x = "") + + coord_flip() + + theme_classic(12) + + theme(axis.text.x = element_blank(), + axis.ticks.x = element_blank()) + +# multiple imputation +imp <- mice(stph_short, m = 20, method = "pmm", seed = 5678) + +imp_day7 <- mice(stph_short_BiliDay7, m = 20, method = "pmm", seed = 5678) + +# predictor matrix +# https://www.gerkovink.com/miceVignettes/Convergence_pooling/Convergence_and_pooling.html +imp$pred + +# inspect the convergence of the algorithm +plot(imp) + +# imputation method +imp$method #pmm method + +# diagnostic checking +stripplot(imp, Bilirubin.day.7 ~ .imp, pch = 20, cex = 2) +stripplot(imp) + +summary(stph_short$Bilirubin.day.7) +summary(with(imp, mean(Bilirubin.day.7))) + +densityplot(imp, ~ Bilirubin.day.7) + +# collection of the m imputed data sets +imp_data <- complete(imp, action = "long") +imp_data_day7 <- complete(imp_day7, action = "long") + +imp_data[which(is.na(imp_data)),] + +#setwd("~/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/pre-analysis") +#save(imp_data, imp_data_day7, file = "imputed_data.Rdata") + +########################################################### +################### Sensitivity analysis ################## +########################################################### +# create vector that represent the following adjustment values +#for Bili day 7: 0 for MAR, and -50, -100, and -200 for MNAR. 
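+# Each delta below is added to the imputed Bilirubin.day.7 values through
+# mice's post-processing step (the post argument); squeeze() then constrains
+# the shifted values to c(0, 1061), presumably the plausible range for this
+# variable, so the MNAR scenarios shift the imputations downwards without
+# producing negative bilirubin values.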
+delta <- c(0, -50, -100, -200) + +# perform a dry run (using maxit = 0) +ini <- mice(stph_short_BiliDay7, maxit = 0) + +imp.all <- vector("list", length(delta)) +post <- ini$post +for (i in 1:length(delta)){ + d <- delta[i] + cmd <- paste("imp[[j]][,i] <- squeeze(imp[[j]][,i] +", d, ",c(0, 1061))") #squeeze values to observed ranges + post["Bilirubin.day.7"] <- cmd + imp <- mice(stph_short_BiliDay7, post = post, m = 20, maxit = 30, seed = i, print = FALSE) + imp.all[[i]] <- imp +} + +#bwplot(imp.all[[1]]) +#bwplot(imp.all[[4]]) + +p1 <- densityplot(imp.all[[1]], ~ Bilirubin.day.7, lwd = 3, xlab = "", ylab = "", main = expression(paste(delta, " = 0"))) +p2 <- densityplot(imp.all[[2]], ~ Bilirubin.day.7, lwd = 3, xlab = "", ylab = "", main = expression(paste(delta, " = -50"))) +p3 <- densityplot(imp.all[[3]], ~ Bilirubin.day.7, lwd = 3, xlab = "", ylab = "", main = expression(paste(delta, " = -100"))) +p4 <- densityplot(imp.all[[4]], ~ Bilirubin.day.7, lwd = 3, xlab = "", ylab = "", main = expression(paste(delta, " = -200"))) + +grid.arrange(p1, p2, p3, p4, nrow = 1, + bottom = textGrob("Bilirubin day 7", gp = gpar(fontsize = 13), vjust = -1.5), + left = textGrob("Density", gp = gpar(fontsize = 13), rot = 90, vjust = 1.5, hjust = 0) + ) + +#densityplot(imp.all[[1]], ~ Bilirubin.day.7, lwd = 3) +#densityplot(imp.all[[4]], ~ Bilirubin.day.7, lwd = 3) +#summary(with(imp.all[[3]], range(Bilirubin.day.7))) + +## post-process +names(imp.all) <- delta # add names to list +complete_list <- lapply(imp.all, complete, action = "long") +imp_sens_df <- map_df(complete_list, ~ as.data.frame(.x), .id = "delta") + +#setwd("~/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/pre-analysis") +#save(imp.all, imp_sens_df, file = "imputed_data_sens.Rdata") + diff --git a/pre-analysis/imputed_data.Rdata b/pre-analysis/imputed_data.Rdata new file mode 100644 index 0000000..76dcbf7 Binary files /dev/null and b/pre-analysis/imputed_data.Rdata differ diff --git a/pre-analysis/imputed_data_sens.Rdata b/pre-analysis/imputed_data_sens.Rdata new file mode 100644 index 0000000..0372d9f Binary files /dev/null and b/pre-analysis/imputed_data_sens.Rdata differ diff --git a/pre-analysis/missing_data.R b/pre-analysis/missing_data.R old mode 100644 new mode 100755 diff --git a/pre-analysis/val_data/full_sample_Global_AlcHep.Rdata b/pre-analysis/val_data/full_sample_Global_AlcHep.Rdata new file mode 100644 index 0000000..99682fb Binary files /dev/null and b/pre-analysis/val_data/full_sample_Global_AlcHep.Rdata differ diff --git a/pre-analysis/val_data/import_data_validation.R b/pre-analysis/val_data/import_data_validation.R new file mode 100644 index 0000000..16a4ea3 --- /dev/null +++ b/pre-analysis/val_data/import_data_validation.R @@ -0,0 +1,94 @@ +###### +# This script reads in the Global_AlcHep data and creates workable data frames +library(readxl) +library(tidyverse) +library(dplyr) + +# Load data +path <- "/Users/work/IDrive-Sync/Projects/MIMAH/data" +Global_AlcHep <- read_excel(paste0(path, "/Global_AlcHep.xlsx")) + +# Rename relevant variables +Global_AlcHep <- rename(Global_AlcHep, Bilirubin = Bili_admission) # mg/L +Global_AlcHep <- rename(Global_AlcHep, Creatinine = "Creatinine\r\nadmission") # mg/dL or mg/L +Global_AlcHep <- rename(Global_AlcHep, Albumin = Alb_admission) #g/dL +Global_AlcHep <- rename(Global_AlcHep, INR = INR_admission) +Global_AlcHep <- rename(Global_AlcHep, protime = PT_admission) # Prothrombin time (sec) +Global_AlcHep <- rename(Global_AlcHep, Sodium = "Sodium at admission") +Global_AlcHep <- 
rename(Global_AlcHep, Bilirubin.day.7 = "Total bili day 7") + +# to check again - the discrepancy +#diff <- (Global_AlcHep$`INR admission` - Global_AlcHep$INR_admission) +#ind <- which(diff !=0 ) +#Global_AlcHep[ind, ] + +# Create survival variable +Global_AlcHep <- rename(Global_AlcHep, D90_surv = "Alive at day 90\r\n") +Global_AlcHep$D90_DTH <- 1 - Global_AlcHep$D90_surv + +# Time-to-event +Global_AlcHep <- Global_AlcHep %>% + mutate(Date_of_death = as.Date(`Date of death`, format = "%d/%m/%y"), + Date_rand = as.Date(`Date of admission`, format = "%d/%m/%y"), + Date_Last_day_study_contact = as.Date(`Last follow up`, format = "%d-%B-%y"), + Death_event = ifelse(is.na(Date_of_death), 0, 1), + Time_to_death_from_rand = ifelse(Death_event == 1, + (Date_of_death - Date_rand), (Date_Last_day_study_contact - Date_rand))) + +# Handle missing data and create data frames for each of the prognostic scores +Global_AlcHep.meld <- Global_AlcHep[complete.cases(Global_AlcHep$Bilirubin, + Global_AlcHep$Creatinine, + Global_AlcHep$INR, + Global_AlcHep$Sodium),] + +Global_AlcHep.lille <- Global_AlcHep[complete.cases(Global_AlcHep$Bilirubin.day.7, + Global_AlcHep$Bilirubin, + Global_AlcHep$Creatinine, + Global_AlcHep$protime, + Global_AlcHep$Albumin),] + +#path <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/pre-analysis/val_data" +#setwd(path) +#save(Global_AlcHep.meld, Global_AlcHep.lille, file = "full_sample_Global_AlcHep.Rdata") + +# Create table of descriptive statistics and number of missing values per variable +library(table1) +# Some factor variables +Global_AlcHep$D90_surv_f <- as.factor(Global_AlcHep$D90_surv) +Global_AlcHep$Gender_f <- as.factor(Global_AlcHep$Gender) +# table stratified by outcome +tb1 <- table1::table1(~ Age + Bilirubin + Creatinine + Albumin + protime + + INR + Bilirubin.day.7 + Gender_f + Sodium | factor(D90_surv_f), data = Global_AlcHep) +tb1 + + +library(xtable) +xtable(as_tibble(tb1)) + +# 90-day death rate +mean(Global_AlcHep$D90_DTH) + +# +unique(Global_AlcHep$Country) + +# KM estimates and plots +library(ggsurvfit) +survfit2(Surv(Time_to_death_from_rand, Death_event) ~ 1, data = Global_AlcHep) %>% + ggsurvfit() + + labs( + x = "Days", + y = "Overall survival probability" + ) + + add_confidence_interval() + + add_risktable() + +summary(survfit(Surv(Time_to_death_from_rand, Death_event) ~ 1, data = Global_AlcHep), times = 90) +summary(survfit(Surv(Time_to_death_from_rand, Death_event) ~ 1, data = Global_AlcHep), times = 183) + +survfit(Surv(Time_to_death_from_rand, Death_event) ~ 1, data = Global_AlcHep) %>% + gtsummary::tbl_survfit( + times = 90, + label_header = "**1-year survival (95% CI)**" + ) + + diff --git a/subgroup comparison/.DS_Store b/subgroup comparison/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/subgroup comparison/.DS_Store differ diff --git a/subgroup comparison/subgroup_age.R b/subgroup comparison/subgroup_age.R new file mode 100755 index 0000000..7fb31bb --- /dev/null +++ b/subgroup comparison/subgroup_age.R @@ -0,0 +1,170 @@ + +library(pROC) +library(ggplot2) +library(dplyr) + +# data original models +#path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/original models/" +#load(paste0(path_data, "original_models.Rdata")) + +# data recalibrated models +path_data_rec <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/updating models" +load(paste0(path_data_rec, "/recalibrated_models_default.Rdata")) + +# funs +path_funs <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/funs" 
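+# calibration_fun.R provides the calibration() helper used for the calibration
+# curves further below.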
+source(paste0(path_funs, "/calibration_fun.R")) + + +age_cut_off <- 49 # stratify by the median age of 49 years + +test.y <- test.data[test.data$Age.at.randomisation..calc. < age_cut_off,] +test.o <- test.data[test.data$Age.at.randomisation..calc. >= age_cut_off,] + +# choose age-group to calculate statistics +age_group <- "o" # "y" or "o" + +if(age_group == "y"){ + dataset <- test.y +}else if(age_group == "o"){ + dataset <- test.o +} + +#### Discrimination ############### +# MELD updated +roc_meld <- roc(dataset$D90_DTH, dataset$meld.surv.updated) +# CLIF - updated +roc_clif <- roc(dataset$D90_DTH, dataset$clif.surv.updated) +# Lille - updated +roc_lille <- roc(dataset$D90_DTH, dataset$lille.surv.updated) + +## Plots ############### +roc.list <- list("MELD updated" = roc_meld, + #"MELD 3.0" = roc_meld3, + "CLIF-C ACLF updated" = roc_clif, + "Lille updated" = roc_lille) + +#save(roc.list, file = "ROC_updated.Rdata") + +g.list <- ggroc(roc.list) + +# ROC plot of all models combined +g.list + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + theme_classic() + +# faceting +g.list + + facet_grid(. ~ name) + + ggtitle(paste0("age_group ", age_group)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + theme_classic() + + theme(legend.position="none") + +# add confidence bands +ci.list <- lapply(roc.list, ci.se, specificities = seq(0, 1, l = 25)) + +dat.ci.list <- lapply(ci.list, function(ciobj) + data.frame(x = as.numeric(rownames(ciobj)), + lower = ciobj[, 1], + upper = ciobj[, 3])) + +df <- plyr::ldply(dat.ci.list, data.frame, .id = "name") + +ggroc(roc.list) + + facet_grid(. ~ name) + + theme_minimal() + + ggtitle(paste0("age_group ", age_group)) + + geom_ribbon(data = df, aes(x = x, ymin = lower, ymax = upper, fill = name), alpha = 0.3, inherit.aes = F) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + labs(x = "Specificity", y = "Sensitivity") + + coord_equal() + + theme_classic() + + theme(legend.position = "none") + +############################################# +###################### AUC ################# +############################################# +df_AUC <- as.data.frame(map_dfr(roc.list, ci.auc)) +rownames(df_AUC) <- c("low_CL", "mean", "upper_CL") + +df_AUC2 <- tibble::rownames_to_column(df_AUC, var = "AUC") + +df3 <- gather(df_AUC2, condition, measurement, `MELD updated`:`Lille updated`, factor_key = TRUE) +data_wide <- spread(df3, AUC, measurement) %>% arrange(mean) + +# reorder factor levels +data_wide$condition <- fct_reorder(data_wide$condition, data_wide$mean) + +p_auc_young <- ggplot(data_wide, aes(x = mean, y = condition, col = condition)) + + #ggtitle(age_group) + + geom_point(lwd = 2) + + coord_cartesian(xlim = c(0.5, 0.86)) + + geom_errorbar(aes(xmin = low_CL, xmax = upper_CL), + alpha = 1, show.legend = F, lwd = 1, width = 0.5) + + labs(y = "Score", col = "Score", x = "AUC with 95% limits") + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + theme_classic() + +#### Calibration ############### +# MELD updated +cal_meld <- calibration(dataset$meld.surv.updated, dataset$D90_surv) +cal_meld$Score <- "MELD updated" +# CLIF - updated +cal_clif <- calibration(dataset$clif.surv.updated, dataset$D90_surv) +cal_clif$Score <- "CLIF-C ACLF updated" +# Lille - updated +cal_lille <- calibration(dataset$lille.surv.updated, dataset$D90_surv) +cal_lille$Score <- "Lille updated" + +# combine dfs +df_cal <- rbind(cal_meld, cal_clif, cal_lille) + +df_cal$Score 
<- factor(df_cal$Score, labels = levels(data_wide$condition)) + +# plot without ribbon +df_cal %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + ggtitle(paste0("age_group ", age_group)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + #geom_ribbon(aes(ymin = lower, ymax = upper, linetype = NA), + # alpha = 0.3, show.legend = F) + + #scale_fill_manual("", values = col) + + #scale_color_manual(name = "Score", values = col) + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed proportion") + + xlab("Predicted probability") + + theme_classic() + +# plot with ribbon +p_calibration_young <- df_cal %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_ribbon(aes(ymin = lower, ymax = upper, fill = Score, linetype = NA), + alpha = 0.3, show.legend = F) + + #scale_fill_manual("", values = col) + + #scale_color_manual(name = "Score", values = col) + + facet_grid(. ~ Score) + + coord_equal() + + xlim(0, 1) + + ylim(0, 1) + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + ylab("Observed survival proportion") + + xlab("Predicted survival probability") + + theme_classic() + + +##### combine plots +source("/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/grid_arrange_fun.R") +library(gtable) +p1 <- grid_arrange_shared_legend(p_auc_young, p_calibration_young) + +p2 <- grid_arrange_shared_legend(p_auc_old, p_calibration_old) + diff --git a/subgroup comparison/subgroup_comparison.R b/subgroup comparison/subgroup_comparison.R old mode 100644 new mode 100755 index fa77439..e332a76 --- a/subgroup comparison/subgroup_comparison.R +++ b/subgroup comparison/subgroup_comparison.R @@ -7,11 +7,12 @@ source("dca.R") ###### # Comparison of ROC between MELD3.0 and original MELD -roc_meld_u <- roc(test.meld$D90_surv, test.meld$meld.surv.updated) -roc_meld3 <- roc(stph.meld$D90_DTH, stph.meld$MELD3.surv) +#roc_meld_u <- roc(test.meld$D90_surv, test.meld$meld.surv.updated) +#roc_meld3 <- roc(stph.meld$D90_DTH, stph.meld$MELD3.surv) + +#roc.test(roc_meld3, roc_meld) +#roc.test(roc_meld3, roc_meld_u) -roc.test(roc_meld3, roc_meld) -roc.test(roc_meld3, roc_meld_u) ###### # Sub-group based on sex diff --git a/subgroup comparison/subgroup_sex.R b/subgroup comparison/subgroup_sex.R new file mode 100755 index 0000000..b310152 --- /dev/null +++ b/subgroup comparison/subgroup_sex.R @@ -0,0 +1,207 @@ + +library(pROC) +library(survminer) # for plotting theme + +# data original models +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/" +load(paste0(path_data, "original_models.Rdata")) + +# data recalibrated models +path_data_rec <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/updating models" +load(paste0(path_data_rec, "/recalibrated_models_default.Rdata")) + +# funs +path_funs <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/funs" +source(paste0(path_funs, "/calibration_fun.R")) + +################################################################################ +############################ Sub-group based on sex ############################ +################################################################################ +# Calculations on the MELD 3.0 score - first create two separate dfs per sex +stph.meld.males <- stph.meld[stph.meld$Gender == 0,] +stph.meld.females <- stph.meld[stph.meld$Gender == 1,] + +# Calculations on the updated MELD score +test.males <- test.data[test.data$Gender == 0,] 
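# Optional sanity check (assumes Gender is coded 0 = male and 1 = female, as
# implied by the *.males / *.females names above and below); reports the coding
# and the resulting subgroup sizes before the sex-stratified analysis is run:
table(stph.meld$Gender, useNA = "ifany")
table(test.data$Gender, useNA = "ifany")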
+test.females <- test.data[test.data$Gender == 1,] + +# choose sex to calculate statistics +sex <- "m" # or "f +#sex <- "m" # or "f + +if(sex == "m"){ + dataset_original <- stph.meld.males + dataset_updated <- test.males + }else if(sex == "f"){ + dataset_original <- stph.meld.females + dataset_updated <- test.females + } + +#### Discrimination ############### +# MELD 3.0 - original +roc_meld3 <- roc(dataset_original$D90_DTH, dataset_original$MELD3.surv) +# MELD updated +roc_meld <- roc(dataset_updated$D90_DTH, dataset_updated$meld.surv.updated) +# CLIF - updated +roc_clif <- roc(dataset_updated$D90_DTH, dataset_updated$clif.surv.updated) +# Lille - updated +roc_lille <- roc(dataset_updated$D90_DTH, dataset_updated$lille.surv.updated) + + +## Plots ############### +roc.list <- list("MELD updated" = roc_meld, + "MELD 3.0" = roc_meld3, + "CLIF-C ACLF updated" = roc_clif, + "Lille updated" = roc_lille) + +#save(roc.list, file = "ROC_updated.Rdata") + +g.list <- ggroc(roc.list) + +# ROC plot of all models combined +g.list + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + theme_classic() + +# faceting +g.list + + facet_grid(. ~ name) + + ggtitle(sex) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + theme_classic2() + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + theme_classic() + + theme(legend.position="none") + +# add confidence bands +ci.list <- lapply(roc.list, ci.se, specificities = seq(0, 1, l = 25)) + +dat.ci.list <- lapply(ci.list, function(ciobj) + data.frame(x = as.numeric(rownames(ciobj)), + lower = ciobj[, 1], + upper = ciobj[, 3])) + +df <- plyr::ldply(dat.ci.list, data.frame, .id = "name") + +ggroc(roc.list) + + facet_grid(. 
~ name) + + theme_minimal() + + ggtitle(sex) + + geom_ribbon(data = df, aes(x = x, ymin = lower, ymax = upper, fill = name), alpha = 0.3, inherit.aes = F) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + labs(x = "Specificity", y = "Sensitivity") + + coord_equal() + + theme_classic() + + theme(legend.position = "none") + +############################################# +###################### AUC ################# +############################################# +df_AUC <- as.data.frame(map_dfr(roc.list, ci.auc)) +rownames(df_AUC) <- c("low_CL", "mean", "upper_CL") + +df_AUC2 <- tibble::rownames_to_column(df_AUC, var = "AUC") + +df3 <- gather(df_AUC2, condition, measurement, `MELD updated`:`Lille updated`, factor_key = TRUE) +data_wide <- spread(df3, AUC, measurement) %>% arrange(mean) + +# reorder factor levels +data_wide$condition <- fct_reorder(data_wide$condition, data_wide$mean) + +p_auc_females <- ggplot(data_wide, aes(x = mean, y = condition, col = condition)) + + #ggtitle(sex) + + geom_point(lwd = 2) + + coord_cartesian(xlim = c(0.5, 0.86)) + + geom_errorbar(aes(xmin = low_CL, xmax = upper_CL), + alpha = 1, show.legend = F, lwd = 1, width = 0.5) + + labs(y = "Score", col = "Score", x = "AUC with 95% limits") + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + theme_classic2() + + theme(legend.position = "none") + theme_classic() + +#### Calibration ############### +# MELD 3.0 - original +cal_meld3 <- calibration(dataset_original$MELD3.surv[!is.na(dataset_original$MELD3.surv)], dataset_original$D90_surv[!is.na(dataset_original$MELD3.surv)]) +cal_meld3$Score <- "MELD 3.0" +# MELD updated +cal_meld <- calibration(dataset_updated$meld.surv.updated, dataset_updated$D90_surv) +cal_meld$Score <- "MELD updated" +# CLIF - updated +cal_clif <- calibration(dataset_updated$clif.surv.updated, dataset_updated$D90_surv) +cal_clif$Score <- "CLIF-C ACLF updated" +# Lille - updated +cal_lille <- calibration(dataset_updated$lille.surv.updated, dataset_updated$D90_surv) +cal_lille$Score <- "Lille updated" + +# combine dfs +df_cal <- rbind(cal_meld3, cal_meld, cal_clif, cal_lille) + +df_cal$Score <- factor(df_cal$Score, labels = levels(data_wide$condition)) + +# plot without ribbon +df_cal %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + ggtitle(sex) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + #geom_ribbon(aes(ymin = lower, ymax = upper, linetype = NA), + # alpha = 0.3, show.legend = F) + + #scale_fill_manual("", values = col) + + #scale_color_manual(name = "Score", values = col) + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed proportion") + + xlab("Predicted probability") + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + theme_classic() + +# plot with ribbon +p_calibration_females <- df_cal %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + #ggtitle(sex) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_ribbon(aes(ymin = lower, ymax = upper, fill = Score, linetype = NA), + alpha = 0.3, show.legend = F) + + #scale_fill_manual("", values = col) + + #scale_color_manual(name = "Score", values = col) + + facet_grid(. 
~ Score) + + coord_equal() + + xlim(0, 1) + + ylim(0, 1) + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + ylab("Observed survival proportion") + + xlab("Predicted survival probability") + + theme_classic() + +# +source("/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/grid_arrange_fun.R") +library(gtable) +p1 <- grid_arrange_shared_legend(p_auc_males, p_calibration_males) +p2 <- grid_arrange_shared_legend(p_auc_females, p_calibration_females) + theme_bw() + +# plot with ribbon +df_cal %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + ggtitle(sex) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_ribbon(aes(ymin = lower, ymax = upper, fill = Score, linetype = NA), + alpha = 0.3, show.legend = F) + + #scale_fill_manual("", values = col) + + #scale_color_manual(name = "Score", values = col) + + facet_grid(. ~ Score) + + coord_equal() + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed proportion") + + xlab("Predicted probability") + + theme_classic() + diff --git a/updating models/.DS_Store b/updating models/.DS_Store new file mode 100755 index 0000000..5008ddf Binary files /dev/null and b/updating models/.DS_Store differ diff --git a/updating models/Calibration_updated.R b/updating models/Calibration_updated.R new file mode 100755 index 0000000..2b9c08c --- /dev/null +++ b/updating models/Calibration_updated.R @@ -0,0 +1,96 @@ +# This script performs all calculations of model calibration +library(ggplot2) +library(survminer) # for plotting theme + +path_funs <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/funs" +source(paste0(path_funs, "/calibration_fun.R")) +source(paste0(path_funs, "/val.prob.ci.2_wrapper.R")) + +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/updating models" +load(paste0(path_data, "/recalibrated_models_default.Rdata")) + +############################################# +############### Calculate calibration ###### +############################################# +# MELD +cal_MELD.surv <- calibration(test.data$meld.surv.updated, y = test.data$D90_surv) +cal_MELD.surv$Score <- "MELD" + +# Lille +cal_Lille <- calibration(test.data$lille.surv.updated, y = test.data$D90_surv) +cal_Lille$Score <- "Lille" + +# CLIF-C ACLF +cal_CLIF <- calibration(test.data$clif.surv.updated, y = test.data$D90_surv) +cal_CLIF$Score <- "CLIF-C ACLF" + +# combine dfs +df_cal <- rbind(cal_MELD.surv, cal_Lille, cal_CLIF) + +############################################# +###################### Plots ############### +############################################# + +# plot without ribbon and without MELD 3.0 +df_cal %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + #geom_ribbon(aes(ymin = lower, ymax = upper, linetype = NA), + # alpha = 0.3, show.legend = F) + + #scale_fill_manual("", values = col) + + #scale_color_manual(name = "Score", values = col) + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed proportion") + + xlab("Predicted probability") + + theme_classic() + +# plot with ribbon +p_calibration <- df_cal %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_ribbon(aes(ymin = lower, ymax = upper, fill = Score, linetype = NA), + alpha = 0.3, show.legend = F) + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + #scale_fill_manual("", values = col) + + 
#scale_color_manual(name = "Score", values = col) + + facet_grid(. ~ Score) + + coord_equal() + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed survival proportion") + + xlab("Predicted survival probability") + + theme_classic2() + + theme(legend.position = "none") + +############################################# +############## Summary Stats ############### +############################################# + +# compute calibration slopes and intercepts + +# MELD_1 survival function +MELD_stats <- val.prob.ci.2_wrapper(test.data$meld.surv.updated, test.data$D90_surv, "MELD") + +# Lille +Lille_stats <- val.prob.ci.2_wrapper(test.data$lille.surv.updated, test.data$D90_surv, "Lille") + +#CLIF-C ACLF +clif_stats <- val.prob.ci.2_wrapper(test.data$clif.surv.updated, y = test.data$D90_surv, "CLIF-C ACLF") + + +df_stats <- rbind(MELD_stats, Lille_stats, clif_stats) %>% relocate(Score) + + +p_calibration + geom_text(data = df_stats %>% filter(stat == "intercept"), + aes(0.1, 0.95, label = + paste0("Intercept (95% CI): ", Point.estimate, " (", Lower.confidence.limit, "-", Upper.confidence.limit, ")"), + hjust = 0), col = "black") + + geom_text(data = df_stats %>% filter(stat == "slope"), + aes(0.1, 0.90, label = + paste0("Slope (95% CI): ", Point.estimate, " (", Lower.confidence.limit, "-", Upper.confidence.limit, ")"), + hjust = 0), col = "black") + diff --git a/updating models/Calibration_updated_sens.R b/updating models/Calibration_updated_sens.R new file mode 100755 index 0000000..2642cc0 --- /dev/null +++ b/updating models/Calibration_updated_sens.R @@ -0,0 +1,125 @@ + +# This script performs all calculations of model calibration +library(ggplot2) + +path_funs <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/funs" +source(paste0(path_funs, "/calibration_fun.R")) +source(paste0(path_funs, "/val.prob.ci.2_wrapper.R")) + +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/updating models" +load(paste0(path_data, "/recalibrated_models_sens.Rdata")) + +############################################# +############### Calculate calibration ###### +############################################# +# MELD +cal_MELD.surv <- calibration(test.meld$meld.surv.updated, y = test.meld$D90_surv) +cal_MELD.surv$Score <- "MELD" + +# Lille +cal_Lille <- calibration(test.lille$lille.surv.updated, y = test.lille$D90_surv) +cal_Lille$Score <- "Lille" + +# CLIF-C ACLF +cal_CLIF <- calibration(test.clif$clif.surv.updated, y = test.clif$D90_surv) +cal_CLIF$Score <- "CLIF-C ACLF" + +############################################# +############### Calculate stats ############ +############################################# +# MELD +cal_MELD.stats <- val.prob.ci.2_wrapper(test.meld$meld.surv.updated, y = test.meld$D90_surv, "MELD") + +# Lille +cal_Lille.stats <- val.prob.ci.2_wrapper(test.lille$lille.surv.updated, y = test.lille$D90_surv, "Lille") + +# CLIF-C ACLF +cal_CLIF.stats <- val.prob.ci.2_wrapper(test.clif$clif.surv.updated, y = test.clif$D90_surv, "CLIF-C ACLF") + +############################################# +###################### Plots ############### +############################################# +# plot with ribbon +p1 <- cal_MELD.surv %>% + ggplot(., aes(x = pred, y = obs)) + + geom_line(linewidth = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_ribbon(aes(ymin = lower, ymax = upper, linetype = NA), + alpha = 0.3, show.legend = F) + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + #scale_fill_manual("", values = col) + + 
#scale_color_manual(name = "Score", values = col) + + facet_grid(. ~ Score) + + coord_equal() + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed proportion") + + xlab("Predicted probability") + + theme_classic2() + + theme_classic() + + geom_text(data = cal_MELD.stats %>% filter(stat == "intercept"), + aes(0.1, 0.90, label = + paste0("Intercept: ", Point.estimate, " (", Lower.confidence.limit, "-", Upper.confidence.limit, ")"), + hjust = 0), col = "black") + + geom_text(data = cal_MELD.stats %>% filter(stat == "slope"), + aes(0.1, 0.85, label = + paste0("Slope: ", Point.estimate, " (", Lower.confidence.limit, "-", Upper.confidence.limit, ")"), + hjust = 0), col = "black") + + +p2 <- cal_Lille %>% + ggplot(., aes(x = pred, y = obs)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_ribbon(aes(ymin = lower, ymax = upper, linetype = NA), + alpha = 0.3, show.legend = F) + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + #scale_fill_manual("", values = col) + + #scale_color_manual(name = "Score", values = col) + + facet_grid(. ~ Score) + + coord_equal() + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed proportion") + + xlab("Predicted probability") + + theme_classic2() + + theme_classic() + + geom_text(data = cal_Lille.stats %>% filter(stat == "intercept"), + aes(0.1, 0.90, label = + paste0("Intercept: ", Point.estimate, " (", Lower.confidence.limit, "-", Upper.confidence.limit, ")"), + hjust = 0), col = "black") + + geom_text(data = cal_Lille.stats %>% filter(stat == "slope"), + aes(0.1, 0.85, label = + paste0("Slope: ", Point.estimate, " (", Lower.confidence.limit, "-", Upper.confidence.limit, ")"), + hjust = 0), col = "black") + +p3 <- cal_CLIF %>% + ggplot(., aes(x = pred, y = obs)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_ribbon(aes(ymin = lower, ymax = upper, linetype = NA), + alpha = 0.3, show.legend = F) + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + #scale_fill_manual("", values = col) + + #scale_color_manual(name = "Score", values = col) + + facet_grid(. 
~ Score) + + coord_equal() + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed proportion") + + xlab("Predicted probability") + + theme_classic2() + + theme_classic() + + geom_text(data = cal_CLIF.stats %>% filter(stat == "intercept"), + aes(0.1, 0.90, label = + paste0("Intercept: ", Point.estimate, " (", Lower.confidence.limit, "-", Upper.confidence.limit, ")"), + hjust = 0), col = "black") + + geom_text(data = cal_CLIF.stats %>% filter(stat == "slope"), + aes(0.1, 0.85, label = + paste0("Slope: ", Point.estimate, " (", Lower.confidence.limit, "-", Upper.confidence.limit, ")"), + hjust = 0), col = "black") + +ggarrange(p1, p2, p3, nrow = 1, ncol = 3, common.legend = TRUE) diff --git a/updating models/Discrimination_updated.R b/updating models/Discrimination_updated.R new file mode 100755 index 0000000..5c36b61 --- /dev/null +++ b/updating models/Discrimination_updated.R @@ -0,0 +1,116 @@ + +library(pROC) +library(dplyr) +library(ggplot2) +library(purrr) +library(tidyr) +library(forcats) +#library(ggROC) + +# load data +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/updating models" +load(paste0(path_data, "/recalibrated_models_default.Rdata")) + +############################################# +############### Calculate AUCs ############# +############################################# + +# MELD (survival function 1 and 2 do not matter here) +roc_meld <- roc(test.data$D90_surv, test.data$meld.surv.updated) + +# MELD 3.0 +#roc_meld3 <- roc(stph.meld$D90_DTH, stph.meld$MELD3.surv) + +# MELD from VanDerwerken et al 2021 +#roc_meld.VanDerwerken <- roc(stph.meld$D90_DTH, stph.meld$MELD_Van) + +# Lille +roc_lille <- roc(test.data$D90_surv, test.data$lille.surv.updated) + +# CLIF-C ACLF +roc_clif <- roc(test.data$D90_surv, test.data$clif.surv.updated) + +############################################# +###################### Plots ############### +############################################# +roc.list <- list(#"MELD VanDerwerken" = roc_meld.VanDerwerken, + "CLIF-C ACLF" = roc_clif, + "Lille" = roc_lille, + "MELD" = roc_meld) +#setwd(path_data) +#save(roc.list, file = "ROC_updated.Rdata") + +g.list <- ggroc(roc.list) + +# ROC plot of all models combined +g.list + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + theme_classic() + +# faceting +g.list + + facet_grid(. ~ name) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + theme_classic() + + theme(legend.position="none") + +# add confidence bands +ci.list <- lapply(roc.list, ci.se, specificities = seq(0, 1, l = 25)) + +dat.ci.list <- lapply(ci.list, function(ciobj) + data.frame(x = as.numeric(rownames(ciobj)), + lower = ciobj[, 1], + upper = ciobj[, 3])) + +df <- plyr::ldply(dat.ci.list, data.frame, .id = "name") + +pl <- ggroc(roc.list) + + facet_grid(. 
~ name) + + theme_minimal() + + geom_ribbon(data = df, aes(x = x, ymin = lower, ymax = upper, fill = name), alpha = 0.3, inherit.aes = F) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + labs(x = "Specificity", y = "Sensitivity") + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + coord_equal() + + theme_classic() + + theme(legend.position = "none") + +data_wide$name <- data_wide$condition +data_wide$low_CL <- round(data_wide$low_CL, 2) +data_wide$mean <- round(data_wide$mean, 2) +data_wide$upper_CL <- round(data_wide$upper_CL, 2) + +pl + geom_text(data = data_wide, aes(0.05, 0.25, label = paste0("AUC (95% CI): ", mean, " (", low_CL, "-", upper_CL, ")" ), + hjust = 1), col = "black") + +############################################# +###################### AUC ################# +############################################# +# CI +#auc_meld <- auc(roc_meld) +#auc_meld_ci <- ci.auc(roc_meld) # Confidence intervals +#roc_plot(stph.meld, "D90_surv", "MELD.surv", ci = TRUE, plot_title = "ROC curve for the MELD score") + +df_AUC <- as.data.frame(map_dfr(roc.list, ci.auc)) +rownames(df_AUC) <- c("low_CL", "mean", "upper_CL") + +df_AUC2 <- tibble::rownames_to_column(df_AUC, var = "AUC") + +df3 <- gather(df_AUC2, condition, measurement, `CLIF-C ACLF`:MELD, factor_key = TRUE) +data_wide <- spread(df3, AUC, measurement) %>% arrange(mean) + +# reorder factor levels +data_wide$condition <- fct_reorder(data_wide$condition, data_wide$mean) + +ggplot(data_wide, aes(x = mean, y = condition, col = condition)) + + geom_point(lwd = 2) + + coord_cartesian(xlim = c(0.5, 0.86)) + + geom_errorbar(aes(xmin = low_CL, xmax = upper_CL), + alpha = 1, show.legend = F, lwd = 1, width = 0.5) + + labs(y = "Score", col = "Score", x = "AUC with 95% limits") + + theme_classic() + + diff --git a/updating models/Discrimination_updated_sens.R b/updating models/Discrimination_updated_sens.R new file mode 100755 index 0000000..edc481d --- /dev/null +++ b/updating models/Discrimination_updated_sens.R @@ -0,0 +1,117 @@ + +library(pROC) +library(dplyr) +library(ggplot2) +#library(ggROC) + +# load data +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/updating models" +load(paste0(path_data, "/recalibrated_models_sens.Rdata")) + +############################################# +############### Calculate AUCs ############# +############################################# + +# MELD (survival function 1 and 2 do not matter here) +roc_meld <- roc(test.meld$D90_surv, test.meld$meld.surv.updated) + +# MELD 3.0 +#roc_meld3 <- roc(stph.meld$D90_DTH, stph.meld$MELD3.surv) + +# MELD from VanDerwerken et al 2021 +#roc_meld.VanDerwerken <- roc(stph.meld$D90_DTH, stph.meld$MELD_Van) + +# Lille +roc_lille <- roc(test.lille$D90_surv, test.lille$lille.surv.updated) + +# CLIF-C ACLF +roc_clif <- roc(test.clif$D90_surv, test.clif$clif.surv.updated) + +############################################# +###################### Plots ############### +############################################# +roc.list <- list("CLIF-C ACLF" = roc_clif, + #"MELD VanDerwerken" = roc_meld.VanDerwerken, + "Lille" = roc_lille, + "MELD" = roc_meld) +#save(roc.list, file = "ROC_updated.Rdata") + +g.list <- ggroc(roc.list) + +# ROC plot of all models combined +g.list + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + theme_classic() + +# faceting +g.list + + facet_grid(. 
~ name) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + theme_classic() + + theme(legend.position="none") + +# add confidence bands +ci.list <- lapply(roc.list, ci.se, specificities = seq(0, 1, l = 25)) + +dat.ci.list <- lapply(ci.list, function(ciobj) + data.frame(x = as.numeric(rownames(ciobj)), + lower = ciobj[, 1], + upper = ciobj[, 3])) + +df <- plyr::ldply(dat.ci.list, data.frame, .id = "name") + + +pl <- ggroc(roc.list) + + facet_grid(. ~ name) + + theme_minimal() + + geom_ribbon(data = df, aes(x = x, ymin = lower, ymax = upper, fill = name), alpha = 0.3, inherit.aes = F) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + labs(x = "Specificity", y = "Sensitivity") + + scale_color_brewer(palette = "Dark2") + + scale_fill_brewer(palette = "Dark2") + + coord_equal() + + theme_classic() + + theme(legend.position = "none") + +data_wide$name <- data_wide$condition +data_wide$low_CL <- round(data_wide$low_CL, 2) +data_wide$mean <- round(data_wide$mean, 2) +data_wide$upper_CL <- round(data_wide$upper_CL, 2) +pl + geom_text(data = data_wide, aes(0.05, 0.25, label = paste0("AUC (95% CI): ", mean, " (", low_CL, "-", upper_CL, ")" ), + hjust = 1), col = "black") + +############################################# +###################### AUC ################# +############################################# +# CI +#auc_meld <- auc(roc_meld) +#auc_meld_ci <- ci.auc(roc_meld) # Confidence intervals +#roc_plot(stph.meld, "D90_surv", "MELD.surv", ci = TRUE, plot_title = "ROC curve for the MELD score") + +df_AUC <- as.data.frame(map_dfr(roc.list, ci.auc)) +rownames(df_AUC) <- c("low_CL", "mean", "upper_CL") + +df_AUC2 <- tibble::rownames_to_column(df_AUC, var = "AUC") + +df3 <- gather(df_AUC2, condition, measurement, `CLIF-C ACLF`:MELD, factor_key = TRUE) +data_wide <- spread(df3, AUC, measurement) %>% arrange(mean) + +# reorder factor levels +data_wide$condition <- fct_reorder(data_wide$condition, data_wide$mean) + +ggplot(data_wide, aes(x = mean, y = condition, col = condition)) + + geom_point(lwd = 2) + + coord_cartesian(xlim = c(0.5, 0.86)) + + geom_errorbar(aes(xmin = low_CL, xmax = upper_CL), + alpha = 1, show.legend = F, lwd = 1, width = 0.5) + + labs(y = "Score", col = "Score", x = "AUC with 95% limits") + + theme_classic() + + + + + + + diff --git a/updating models/ROC_updated.Rdata b/updating models/ROC_updated.Rdata new file mode 100755 index 0000000..8b3f290 Binary files /dev/null and b/updating models/ROC_updated.Rdata differ diff --git a/updating models/agreement.R b/updating models/agreement.R new file mode 100644 index 0000000..898a2c6 --- /dev/null +++ b/updating models/agreement.R @@ -0,0 +1,267 @@ + +library(dplyr) +library(ggplot2) +library(ggpubr) +library(tidyr) +library(forcats) +library(purrr) +library(survminer) +library(psych) + +### data +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/updating models" +load(paste0(path_data, "/recalibrated_models_default.Rdata")) + +# subset data +data <- test.data %>% + select(Subject, meld.surv.updated, lille.surv.updated, clif.surv.updated, D90_surv) + +############################################# +################### ICC #################### +############################################# +coords <- c(0.25, 1) + +icc_meld_lille <- ICC(data[,c("meld.surv.updated", "lille.surv.updated")]) + +ICC_meld_lille <- paste0("ICC = ", round(icc_meld_lille$results$ICC[3], 2), + ", p = ", signif(icc_meld_lille$results$p[3], 2)) + +p1 <- ggscatter(data, x = 
"meld.surv.updated", y = "lille.surv.updated", + add = "reg.line", conf.int = TRUE, + cor.coef = F, cor.method = "pearson", cor.coef.coord = coords, + color = "grey", alpha = 0.7, + add.params = list(color = "purple2")) + + geom_abline(intercept = 0, slope = 1, linetype = "dashed") + + labs(x = "MELD", y = "Lille") + + coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + + annotate("text", x = coords[1], y = coords[2], label = ICC_meld_lille, size = 4, fontface = 2) + + theme_classic2() + +icc_clif_meld <- ICC(data[,c("meld.surv.updated", "clif.surv.updated")]) + +ICC_CLIF_MELD <- paste0("ICC = ", round(icc_clif_meld$results$ICC[3], 2), ", p = ", signif(icc_clif_meld$results$p[3], 2)) + +p2 <- ggscatter(data, x = "meld.surv.updated", y = "clif.surv.updated", + add = "reg.line", conf.int = TRUE, + cor.coef = F, cor.method = "pearson", cor.coef.coord = coords, + color = "grey", alpha = 0.7, + add.params = list(color = "purple2")) + + geom_abline(intercept = 0, slope = 1, linetype = "dashed") + + labs(x = "MELD", y = "CLIF-C ACLF") + + coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + + annotate("text", x = coords[1], y = coords[2], label = ICC_CLIF_MELD, size = 4, fontface = 2) + + theme_classic2() + +icc_clif_lille <- ICC(data[,c("clif.surv.updated", "lille.surv.updated")]) + +ICC_CLIF_lille <- paste0("ICC = ", round(icc_clif_lille$results$ICC[3], 2), ", p = ", signif(icc_clif_lille$results$p[3], 2)) + +p3 <- ggscatter(data, x = "lille.surv.updated", y = "clif.surv.updated", + add = "reg.line", conf.int = TRUE, + cor.coef = F, cor.method = "pearson", cor.coef.coord = coords, + color = "grey", alpha = 0.7, + add.params = list(color = "purple2")) + + geom_abline(intercept = 0, slope = 1, linetype = "dashed") + + labs(x = "Lille", y = "CLIF-C ACLF") + + coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + + annotate("text", x = coords[1], y = coords[2], label = ICC_CLIF_lille, size = 4, fontface = 2) + + theme_classic2() + +############################################# +################ Correlation ############### +############################################# +#coords <- c(0.01, 1) +# +# p1 <- ggscatter(data, x = "meld.surv.updated", y = "lille.surv.updated", +# add = "reg.line", conf.int = TRUE, +# cor.coef = TRUE, cor.method = "pearson", cor.coef.coord = coords, +# color = "grey", alpha = 0.7, +# add.params = list(color = "purple2")) + +# geom_abline(intercept = 0, slope = 1, linetype = "dashed") + +# labs(x = "MELD", y = "Lille") + +# coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + +# theme_classic2() +# +# p2 <- ggscatter(data, x = "meld.surv.updated", y = "clif.surv.updated", +# add = "reg.line", conf.int = TRUE, +# cor.coef = TRUE, cor.method = "pearson", cor.coef.coord = coords, +# color = "grey", alpha = 0.7, +# add.params = list(color = "purple2")) + +# geom_abline(intercept = 0, slope = 1, linetype = "dashed") + +# labs(x = "MELD", y = "CLIF-C ACLF") + +# coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + +# theme_classic2() +# +# p3 <- ggscatter(data, x = "lille.surv.updated", y = "clif.surv.updated", +# add = "reg.line", conf.int = TRUE, +# cor.coef = TRUE, cor.method = "pearson", cor.coef.coord = coords, +# color = "grey", alpha = 0.7, +# add.params = list(color = "purple2")) + +# geom_abline(intercept = 0, slope = 1, linetype = "dashed") + +# labs(x = "Lille", y = "CLIF-C ACLF") + +# coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + +# theme_classic2() + +############################################# +################ Correlation ############### 
+############################################# +# long format +data_long <- gather(data, model, probs, meld.surv.updated:clif.surv.updated, factor_key = TRUE) + +data_long2 <- data_long %>% + group_by(Subject) %>% + mutate(dev = sd(probs), # sd + mad_dev = mad(probs), #median absolute deviation + max_min_dev = max(probs) - min(probs)) # range + +# plot +p4 <- data_long2 %>% + #mutate(Subject = fct_reorder(as.factor(Subject), desc(max_min_dev))) %>% + ggplot(.) + + geom_point(aes(x = max_min_dev, y = reorder(Subject, max_min_dev))) + + labs(x = "Range probabilities ", y = "Subject") + + theme_classic2() + + theme(axis.text.y = element_blank(), + axis.ticks.y = element_blank()) + +p4 + +# overall survival rate +data_long2 %>% ungroup() %>% summarise(mean(D90_surv)) + +# fun +cut_off_surv_rate <- function(data, thres, var_name){ + # calculates mean of `var_name` and sample size for chosen `thres` value + + df <- data %>% + filter(max_min_dev >= thres) %>% + ungroup() %>% + summarise(sum_var = mean({{ var_name }}, na.rm = T)) + + df_n <- data %>% + filter(max_min_dev >= thres) %>% + ungroup() %>% + summarise(n = length(unique(Subject))) + + out <- data.frame(surv_rate = df$sum_var, thres = thres, n = df_n$n) + + return(out) +} + +data_long23 <- data_long2 %>% distinct(Subject, .keep_all = T) +data_long23%>% filter(Subject == 1005003) + + +thresholds <- seq(0, 0.5, by = 0.01) +res <- map_df(thresholds, cut_off_surv_rate, data = data_long2, var_name = D90_surv) + +# plot +ind <- c(1, 5, 9, 13, 25, 41, 49) # select a subset to visualise +p5 <- ggplot(res, aes(y = surv_rate, x = thres)) + + geom_point() + + geom_line() + + geom_text(data = res[ind, ], aes(label = n), check_overlap = TRUE, nudge_y = 0.04, nudge_x = 0.015) + + labs(x = "Range probabilities", y = "Survival Rate") + + theme_classic2() + +p5 + +# fun +sum_fun <- function(df1, df2, thres){ + # choses ids based the `thres` value + + df_prelim <- df1 %>% filter(max_min_dev > thres) %>% arrange(Subject) + + df_final <- df2 %>% filter(Subject %in% df_prelim$Subject) + + df_final$threshold <- thres + + return(df_final) +} + +res <- map_df(thresholds, sum_fun, df1 = data_long2, df2 = test.data) + +vars <- c("Subject", "Bilirubin.mg.dl", "Bilirubin.day.7", "delta.bili", "INR", + "Creatinine.mg.dl", "Albumin", "WBC", "INR", "protime", "HE", + "MAP", "Sodium", "CLIF.OF", + "kidney.score", "liver.score", "brain.score", "coag.score", "circ.score", + "Gender.x", "Age.at.randomisation..calc..x") + +sum_df <- res %>% + group_by(threshold) %>% + select(all_of(vars)) %>% + summarise_all(mean) %>% + gather(., variable, value, Bilirubin.mg.dl:Age.at.randomisation..calc..x, factor_key=TRUE) + +vars_to_keep <- c("CLIF.OF") + +p6 <- sum_df %>% + filter(variable %in% vars_to_keep) %>% + mutate(variable = ifelse(variable == "CLIF.OF", "CLIF OF", NA)) %>% + ggplot(., aes(x = threshold, y = value)) + + geom_point() + + geom_line() + + facet_wrap(. 
~ variable, scales = "free_y") + + labs(x = "Range probabilities", y = "") + + theme_classic2() + + theme(strip.background = element_blank()) + +p6 + +vars_to_remove <- c("Gender.x", "Age.at.randomisation..calc..x", + "liver.score", "kidney.score", "brain.score", "coag.score", + "circ.score", + "protime", "delta.bili") + +#library(gridExtra) +ggarrange(p1, p2, p3, p4, p5, p6, nrow = 2, ncol = 3, labels = "auto", align = "hv") + +################# +################# +################# + +sum_df %>% filter(!(variable %in% vars_to_remove)) %>% + filter(!variable %in% "CLIF.OF") %>% + mutate(variable = case_when(variable == "Bilirubin.mg.dl" ~ "Bilirubin (mg/dL)", + variable == "Bilirubin.day.7" ~ "Bilirubin (day 7)", + variable == "Creatinine.mg.dl" ~ "Creatinine (mg/dL)", + TRUE ~ variable)) %>% + ggplot(., aes(x = threshold, y = value)) + + geom_point() + + geom_line() + + facet_wrap(. ~ variable, scales = "free_y") + + labs(x = "Range probabilities", y = "") + + theme_bw() + + theme(strip.background = element_blank()) + +############################################# +########### ICC exploration ################ +############################################# +#https://www.datanovia.com/en/lessons/intraclass-correlation-coefficient-in-r/ +# library(psych) +# ICC(data[,c(2:4)]) +# +# ICC(data[,c(2,4)]) +# +#cor.test(data[,c(2)],data[,c(4)], method = c("pearson")) +#ICC(data[,c(2,4)]) +# +#cor.test(data[,c(2)],data[,c(3)], method = c("pearson")) +#ICC(data[,c(2,3)]) +# +#cor.test(data[,c(3)],data[,c(4)], method = c("pearson")) +#ICC(data[,c(3,4)]) +# +# #The intra-class correlation coefficient was computed to assess the agreement between three scores +# #in predicting 90-day survival. There was a poor agreement between the three scores, +# # using the two-way mixed-effects model and “single rater” unit, kappa = 0.2, p = 0.056. +# +# #ICC estimates and their 95% confident intervals were calculated using the R package psych (2.3.6) based on a "single rating" unit, +# #two-way mixed-effects model. 
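# A minimal sketch of the psych::ICC call summarised above, kept commented out
# like the rest of this exploratory block. Row 3 of the results table is the
# two-way mixed-effects, single-rater coefficient (ICC3) that the plots at the
# top of this script pull out via results$ICC[3]:
# icc_fit <- psych::ICC(data[, c("meld.surv.updated", "lille.surv.updated", "clif.surv.updated")])
# icc_fit$results[3, c("type", "ICC", "p")]   # "ICC3" row: point estimate and p-value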
- add https://www.sciencedirect.com/science/article/pii/S1556370716000158 +# +# #library("irr") # an alternative package +# #icc(data[,c(2:4)], model = "oneway", +# # type = "consistency", unit = "single") + + + diff --git a/updating models/compare_newmodels.R b/updating models/compare_newmodels.R old mode 100644 new mode 100755 diff --git a/updating models/compare_oldnew.R b/updating models/compare_oldnew.R old mode 100644 new mode 100755 index 2f34387..639b545 --- a/updating models/compare_oldnew.R +++ b/updating models/compare_oldnew.R @@ -6,13 +6,13 @@ library(pROC) ###### # Comparison of areas under the ROC curves # MELD -roc_meld_new <- roc(test.data.c$D90_surv, test.data.c$meld.surv.updated) -roc_meld_old <- roc(test.data.c$D90_surv, test.data.c$MELD.surv) +roc_meld_new <- roc(test.data$D90_surv, test.data$meld.surv.updated) +roc_meld_old <- roc(test.data$D90_surv, test.data$MELD.surv) roc.test(roc_meld_old, roc_meld_new) # CLIF-C ACLF -roc_clif_new <- roc(test.data.c$D90_surv, test.data.c$clif.surv.updated) -roc_clif_old <- roc(test.data.c$D90_surv, test.data.c$CLIF.surv) +roc_clif_new <- roc(test.data$D90_surv, test.data$clif.surv.updated) +roc_clif_old <- roc(test.data$D90_surv, test.data$CLIF.surv) roc.test(roc_clif_new, roc_clif_old) # Lille diff --git a/updating models/comparisons.R b/updating models/comparisons.R new file mode 100755 index 0000000..3c89041 --- /dev/null +++ b/updating models/comparisons.R @@ -0,0 +1,144 @@ + +library(ggsci) # for color palette +library(grid) +library(gridExtra) + +path1 <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/" +load(paste0(path1, "ROC_original.Rdata")) + +roc_list_orig <- roc.list + +path2 <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/updating models/" +load(paste0(path2, "ROC_updated.Rdata")) + +roc_list_update <- roc.list + + +#model <- "meld" # meld, lille, clif + +model <- "clif" # meld, lille, clif + +if(model == "meld"){ + # MELD + model_list <- list("meld_orig" = roc_list_orig$MELD, + "meld_update" = roc_list_update$MELD) + + P_value_boot <- roc.test(roc_list_orig$MELD, roc_list_update$MELD, method = "bootstrap") # bootstrap p_value + +}else if(model == "lille"){ + #Lille + model_list <- list("lille_orig" = roc_list_orig$Lille, + "lille_update" = roc_list_update$Lille) + + P_value_boot <- roc.test(roc_list_orig$Lille, roc_list_update$Lille, method = "bootstrap") + +}else{ + # CLIF + model_list <- list("clif_orig" = roc_list_orig$`CLIF-C ACLF`, + "clif_update" = roc_list_update$`CLIF-C ACLF`) + + P_value_boot <- roc.test(roc_list_orig$`CLIF-C ACLF`, roc_list_update$`CLIF-C ACLF`, method = "bootstrap") + +} + +g.list <- ggroc(model_list) + +# ROC plot of all models combined +g.list + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + theme_classic() + +# add confidence bands +ci.list <- lapply(model_list, ci.se, specificities = seq(0, 1, l = 25)) + +dat.ci.list <- lapply(ci.list, function(ciobj) + data.frame(x = as.numeric(rownames(ciobj)), + lower = ciobj[, 1], + upper = ciobj[, 3])) + +df <- plyr::ldply(dat.ci.list, data.frame, .id = "name") + +# individual plot +ggroc(model_list) + + ggtitle(model) + + facet_grid(. ~ name) + + +ggroc(model_list) + + ggtitle(model) + + #facet_grid(. 
~ name) + + theme_minimal() + + geom_ribbon(data = df, aes(x = x, ymin = lower, ymax = upper, fill = name), alpha = 0.3, inherit.aes = F) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + #geom_text(x = 0.5, y = 0.5, label = paste0("p.value")) + + annotate(geom = "text", x = 0.3, y = 0.5, label = paste0("p.value = ", round(P_value_boot$p.value, 2)), fontsize = 12) + + labs(x = "Specificity", y = "Sensitivity") + + scale_fill_jco() + + scale_color_jco() + + annotate(geom = "text", x = 0.3, y = 0.5, label = paste0("p.value ", round(P_value_boot$p.value, 2)), fontsize = 12) + + labs(x = "Specificity", y = "Sensitivity") + + coord_equal() + + theme_classic() + + theme(legend.position = "none") + + +#### arranged plots +linewidth <- 1.3 +text_size <- 4.5 + +roc1 <- ggroc(model_list, lwd = linewidth) + + ggtitle("CLIF-C ACLF") + + #facet_grid(. ~ name) + + #theme_minimal() + + geom_ribbon(data = df, aes(x = x, ymin = lower, ymax = upper, fill = name), alpha = 0.3, inherit.aes = F) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + #geom_text(x = 0.5, y = 0.5, label = paste0("p.value")) + + annotate(geom = "text", x = 0.3, y = 0.5, label = paste0("p.value = ", round(P_value_boot$p.value, 2)), + size = text_size) + + labs(x = " ", y = " ") + + scale_fill_jco() + + scale_color_jco() + + coord_equal() + + theme_classic2() + + theme(legend.position = "none") + +roc2 <- ggroc(model_list, lwd = linewidth) + + ggtitle("Lille") + + #facet_grid(. ~ name) + + #theme_minimal() + + geom_ribbon(data = df, aes(x = x, ymin = lower, ymax = upper, fill = name), alpha = 0.3, inherit.aes = F) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + #geom_text(x = 0.5, y = 0.5, label = paste0("p.value")) + + annotate(geom = "text", x = 0.3, y = 0.5, label = paste0("p.value = ", round(P_value_boot$p.value, 2)), + size = text_size) + + labs(x = " ", y = " ") + + scale_fill_jco() + + scale_color_jco() + + coord_equal() + + theme_classic2() + + theme(legend.position = "none") + +roc3 <- ggroc(model_list, lwd = linewidth) + + ggtitle("MELD") + + #facet_grid(. ~ name) + + #theme_minimal() + + geom_ribbon(data = df, aes(x = x, ymin = lower, ymax = upper, fill = name), alpha = 0.3, inherit.aes = F) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + #geom_text(x = 0.5, y = 0.5, label = paste0("p.value")) + + annotate(geom = "text", x = 0.3, y = 0.5, label = paste0("p.value = ", round(P_value_boot$p.value, 2)), + size = text_size) + + labs(x = " ", y = " ") + + scale_fill_jco() + + scale_color_jco() + + coord_equal() + + theme_classic2() + + theme(legend.position = "none") + +grid.arrange(arrangeGrob(roc1), + arrangeGrob(roc2), + arrangeGrob(roc3), + ncol = 3, left = "Sensitivity", bottom = "Specificity") + + + + diff --git a/updating models/recalibrate.R b/updating models/recalibrate.R old mode 100644 new mode 100755 index f48b864..3051b4c --- a/updating models/recalibrate.R +++ b/updating models/recalibrate.R @@ -3,206 +3,94 @@ # find recalibration parameters and the survival probability is re-calculated using these parameters. 
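# Illustrative sketch on simulated data (the names below are hypothetical, not
# from this script): an existing score is refit with a logistic model and the
# survival probability is recomputed from the new intercept and slope, which is
# the same pattern the MELD, Lille and CLIF-C ACLF blocks below apply to the
# real data.
set.seed(1)
toy <- data.frame(score = rnorm(200, mean = 20, sd = 5))
toy$D90_surv <- rbinom(200, 1, plogis(4 - 0.2 * toy$score))      # simulated 90-day survival
toy_recal <- glm(D90_surv ~ score, family = binomial("logit"), data = toy)
toy_lp <- coef(toy_recal)[1] + coef(toy_recal)[2] * toy$score    # updated linear predictor
toy$surv.updated <- plogis(toy_lp)                               # same as 1/(1 + exp(-toy_lp))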
library(rms) library(pROC) -source("val_prob_confidence.R") + +# load data +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/" +load(paste0(path_data, "complete_cases_models.Rdata")) # Set random seed -set.seed(34) +set.seed(111) ###### # Split complete-case sample in training and test observations -dt <- sort(sample(nrow(stph.c), nrow(stph.c)*.8)) -test.data.c <- stph.c[dt,] -train.data.c <- stph.c[-dt,] - +fraction <- 0.7 +dt <- sort(sample(nrow(stph.c), nrow(stph.c)*fraction)) +test.data <- stph.c[dt,] +train.data <- stph.c[-dt,] + +############################################# +############### MELD score ################## +############################################# # Fit a logistic regression model on the training data -meld_regr <- glm(D90_surv ~ MELD.calc, family = binomial, data = train.data.c) +meld_regr <- glm(D90_surv ~ MELD.calc, family = binomial("logit"), data = train.data) # Save the regression coefficients (alpha and beta) ic_meld_c <- meld_regr$coefficients[1] slope_meld_c <- meld_regr$coefficients[2] # Calculate adjusted MELD score and survival probability on training set -train.data.c$updated.meld <- ic_meld_c + slope_meld_c*train.data.c$MELD.calc -train.data.c$meld.surv.updated <- 1/(1+exp(-train.data.c$updated.meld)) +train.data$updated.meld <- ic_meld_c + slope_meld_c*train.data$MELD.calc +train.data$meld.surv.updated <- 1/(1 + exp(-train.data$updated.meld)) # Calculate adjusted scores on the test set and assess performance -test.data.c$updated.meld <- ic_meld_c + slope_meld_c*test.data.c$MELD.calc -test.data.c$meld.surv.updated <- 1/(1+exp(-test.data.c$updated.meld)) - -roc_meld_uc <- roc(test.data.c$D90_surv, test.data.c$meld.surv.updated) -ci.auc(roc_meld_uc) +test.data$updated.meld <- ic_meld_c + slope_meld_c*test.data$MELD.calc +test.data$meld.surv.updated <- 1/(1 + exp(-test.data$updated.meld)) -val.prob(test.data.c$meld.surv.updated, test.data.c$D90_surv, pl = TRUE, smooth = FALSE, logistic.cal = TRUE, - xlab = "Predicted survival probability", ylab = "Actual survival probability", - legendloc = FALSE, statloc = FALSE) - -# Lille score re-calibration -# Run logistic regression and save regression coefficientss -lille_regr <- glm(D90_surv ~ LILLE, family = binomial, data = train.data.c) +############################################# +################### Lille score ############# +############################################# +# Run logistic regression and save regression coefficients +lille_regr <- glm(D90_surv ~ LILLE, family = binomial("logit"), data = train.data) ic_lille_c <- lille_regr$coefficients[1] slope_lille_c <- lille_regr$coefficients[2] # Update scores in training set -train.data.c$updated.lille <- ic_lille_c + slope_lille_c*train.data.c$LILLE -train.data.c$lille.surv.updated <- 1/(1 + exp(-train.data.c$updated.lille)) +train.data$updated.lille <- ic_lille_c + slope_lille_c*train.data$LILLE +train.data$lille.surv.updated <- 1/(1 + exp(-train.data$updated.lille)) # Assess performance on the test set -test.data.c$updated.lille <- ic_lille_c + slope_lille_c*test.data.c$LILLE -test.data.c$lille.surv.updated <- 1/(1 + exp(-test.data.c$updated.lille)) - -roc_lille_uc <- roc(test.data.c$D90_surv, test.data.c$lille.surv.updated) -ci.auc(roc_lille_uc) +test.data$updated.lille <- ic_lille_c + slope_lille_c*test.data$LILLE +test.data$lille.surv.updated <- 1/(1 + exp(-test.data$updated.lille)) -val.prob(test.data.c$lille.surv.updated, test.data.c$D90_surv, pl = TRUE, smooth = FALSE, logistic.cal = TRUE, - xlab = 
"Predicted survival probability", ylab = "Actual survival probability", - legendloc = FALSE, statloc = FALSE) - -# CLIF-C ACLF score re-calibration -# Run logistic regression and save regression coefficientss -clif_regr <- glm(D90_surv ~ CLIF.C, family = binomial, data = train.data.c) +############################################# +######### CLIF-C ACLF Score ################# +############################################# +# Run logistic regression and save regression coefficients +clif_regr <- glm(D90_surv ~ CLIF.C, family = binomial("logit"), data = train.data) ic_clif_c <- clif_regr$coefficients[1] slope_clif_c <- clif_regr$coefficients[2] # Update scores in training set -train.data.c$updated.clif <- ic_clif_c + slope_clif_c*train.data.c$CLIF.C -train.data.c$clif.surv.updated <- 1/(1 + exp(-train.data.c$updated.clif)) +train.data$updated.clif <- ic_clif_c + slope_clif_c*train.data$CLIF.C +train.data$clif.surv.updated <- 1/(1 + exp(-train.data$updated.clif)) # Assess performance on the test set -test.data.c$updated.clif <- ic_clif_c + slope_clif_c*test.data.c$CLIF.C -test.data.c$clif.surv.updated <- 1/(1 + exp(-test.data.c$updated.clif)) - -roc_clif_uc <- roc(test.data.c$D90_surv, test.data.c$clif.surv.updated) -ci.auc(roc_clif_uc) - -val.prob(test.data.c$clif.surv.updated, test.data.c$D90_surv, pl = TRUE, smooth = FALSE, logistic.cal = TRUE, - xlab = "Predicted survival probability", ylab = "Actual survival probability", - legendloc = FALSE, statloc = FALSE) - -# Confidence intervals/SEs for the slope and intercept -# val_prob_confidence is used to get SEs for the calibration intercepts and slopes (and CIs in the plots) -val_prob_confidence(test.data.c$meld.surv.updated, test.data.c$D90_surv, pl = TRUE, smooth = FALSE, logistic.cal = TRUE, - xlab = "Predicted survival probability", ylab = "Actual survival probability", - legendloc = T, roundstats = 3) - -val_prob_confidence(test.data.c$clif.surv.updated, test.data.c$D90_surv, pl = TRUE, smooth = FALSE, logistic.cal = TRUE, - xlab = "Predicted survival probability", ylab = "Actual survival probability", - legendloc = T, dostats = TRUE, roundstats = 3) - -val_prob_confidence(test.data.c$lille.surv.updated, test.data.c$D90_surv, pl = TRUE, smooth = FALSE, logistic.cal = TRUE, - xlab = "Predicted survival probability", ylab = "Actual survival probability", - legendloc = T, dostats = TRUE) - -###### -# Sensitivity analysis; split data on full stph sample and then select the complete cases per model -# Split full sample in training and test observations -dt <- sort(sample(nrow(stph), nrow(stph)*.8)) -test.data <- stph[dt,] -train.data <- stph[-dt,] - -# MELD score re-calibration -test.meld <- stph.meld[stph.meld$Subject %in% test.data$Subject,] -train.meld <- stph.meld[stph.meld$Subject %in% train.data$Subject,] - -# Fit a logistic regression model on the training data -meld_regr <- glm(D90_surv ~ MELD.calc, family = binomial, data = train.meld) - -# Save the regression coefficients (alpha and beta) -ic_meld <- meld_regr$coefficients[1] -slope_meld <- meld_regr$coefficients[2] - -# Calculate adjusted MELD score and survival probability on training set -train.meld$updated.meld <- ic_meld + slope_meld*train.meld$MELD.calc -train.meld$meld.surv.updated <- 1/(1+exp(-train.meld$updated.meld)) +test.data$updated.clif <- ic_clif_c + slope_clif_c*test.data$CLIF.C +test.data$clif.surv.updated <- 1/(1 + exp(-test.data$updated.clif)) -# Calculate adjusted scores on the test set and assess performance -test.meld$updated.meld <- ic_meld + 
slope_meld*test.meld$MELD.calc -test.meld$meld.surv.updated <- 1/(1+exp(-test.meld$updated.meld)) - -roc_meld_u <- roc(test.meld$D90_surv, test.meld$meld.surv.updated) -ci.auc(roc_meld_u) - -val.prob(test.meld$meld.surv.updated, test.meld$D90_surv, pl = TRUE, smooth = FALSE, logistic.cal = TRUE, - xlab = "Predicted survival probability", ylab = "Actual survival probability", - legendloc = FALSE, statloc = FALSE) - -# Finally, add the updated scores to the full MELD df -stph.meld$updated.meld <- ic_meld + slope_meld*stph.meld$MELD.calc -stph.meld$meld.surv.updated <- 1/(1+exp(-stph.meld$updated.meld)) - -# Lille score re-calibration -# Split Lille dataframe into training and test -test.lille <- stph.lille[stph.lille$Subject %in% test.data$Subject,] -train.lille <- stph.lille[stph.lille$Subject %in% train.data$Subject,] - -# Run logistic regression and save regression coefficientss -lille_regr <- glm(D90_surv ~ LILLE, family = binomial, data = train.lille) - -ic_lille <- lille_regr$coefficients[1] -slope_lille <- lille_regr$coefficients[2] - -# Update scores in training set and assess performance -train.lille$updated.lille <- ic_lille + slope_lille*train.lille$LILLE -train.lille$lille.surv.updated <- 1/(1 + exp(-train.lille$updated.lille)) - -# Assess performance on the test set -test.lille$updated.lille <- ic_lille + slope_lille*test.lille$LILLE -test.lille$lille.surv.updated <- 1/(1 + exp(-test.lille$updated.lille)) - -roc_lille_u <- roc(test.lille$D90_surv, test.lille$lille.surv.updated) -ci.auc(roc_lille_u) - -val.prob(test.lille$lille.surv.updated, test.lille$D90_surv, pl = TRUE, smooth = FALSE, logistic.cal = TRUE, - xlab = "Predicted survival probability", ylab = "Actual survival probability", - legendloc = FALSE, statloc = FALSE) - -# Finally, add the updated scores to the full Lille df -stph.lille$updated.lille <- ic_lille + slope_lille*stph.lille$LILLE -stph.lille$lille.surv.updated <- 1/(1 + exp(-stph.lille$updated.lille)) - -# CLIF-C ACLF score re-calibration -# Split CLIF-C ACLF dataframe into training and test -test.clif <- stph.clif[stph.clif$Subject %in% test.data$Subject,] -train.clif <- stph.clif[stph.clif$Subject %in% train.data$Subject,] - -# Run logistic regression and save regression coefficientss -clif_regr <- glm(D90_surv ~ CLIF.C, family = binomial, data = train.clif) - -ic_clif <- clif_regr$coefficients[1] -slope_clif <- clif_regr$coefficients[2] - -# Update scores in training set and assess performance -train.clif$updated.clif <- ic_clif + slope_clif*train.clif$CLIF.C -train.clif$clif.surv.updated <- 1/(1 + exp(-train.clif$updated.clif)) - -# Assess performance on the test set -test.clif$updated.clif <- ic_clif + slope_clif*test.clif$CLIF.C -test.clif$clif.surv.updated <- 1/(1 + exp(-test.clif$updated.clif)) +############################################# +######### collect all params ################ +############################################# +df_meld <- data.frame(intercept = ic_meld_c, slope = slope_meld_c) +df_meld$Score <- "MELD" -roc_clif_u <- roc(test.clif$D90_surv, test.clif$clif.surv.updated) -ci.auc(roc_clif_u) +df_lille <- data.frame(intercept = ic_lille_c, slope = slope_lille_c) +df_lille$Score <- "Lille" -val.prob(test.clif$clif.surv.updated, test.clif$D90_surv, pl = TRUE, smooth = FALSE, logistic.cal = TRUE, - xlab = "Predicted survival probability", ylab = "Actual survival probability", - legendloc = FALSE, statloc = FALSE) +df_clif <- data.frame(intercept = ic_clif_c, slope = slope_clif_c) +df_clif$Score <- "CLIF-C ACLF" -# Finally, add the 
updated scores to the full CLIF-C ACLF df -stph.clif$updated.clif <- ic_clif + slope_clif*stph.clif$CLIF.C -stph.clif$clif.surv.updated <- 1/(1 + exp(-stph.clif$updated.clif)) +df <- rbind(df_meld, df_lille, df_clif) +rownames(df) <- NULL -# Confidence intervals/SEs for the slope and intercept -# val_prob_confidence is used to get SEs for the calibration intercepts and slopes (and CIs in the plots) -val_prob_confidence(test.meld$meld.surv.updated, test.meld$D90_surv, pl = TRUE, smooth = FALSE, logistic.cal = TRUE, - xlab = "Predicted survival probability", ylab = "Actual survival probability", - legendloc = T, dostats = TRUE, roundstats = 3) +library(xtable) +xtable(df[c(3, 1, 2)]) -val_prob_confidence(test.clif$clif.surv.updated, test.clif$D90_surv, pl = TRUE, smooth = FALSE, logistic.cal = TRUE, - xlab = "Predicted survival probability", ylab = "Actual survival probability", - legendloc = T, dostats = TRUE, roundstats = 3) +#path_to_save <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/updating models/" +#setwd(path_to_save) +#save(test.data, file = "recalibrated_models_default.Rdata") -val_prob_confidence(test.lille$lille.surv.updated, test.lille$D90_surv, pl = TRUE, smooth = FALSE, logistic.cal = TRUE, - xlab = "Predicted survival probability", ylab = "Actual survival probability", - legendloc = T, dostats = TRUE, roundstats = 3) diff --git a/updating models/recalibrated_models_default.Rdata b/updating models/recalibrated_models_default.Rdata new file mode 100755 index 0000000..491b659 Binary files /dev/null and b/updating models/recalibrated_models_default.Rdata differ diff --git a/updating models/recalibrated_models_sens.Rdata b/updating models/recalibrated_models_sens.Rdata new file mode 100755 index 0000000..441b32f Binary files /dev/null and b/updating models/recalibrated_models_sens.Rdata differ diff --git a/updating models/recalibration_sens.R b/updating models/recalibration_sens.R new file mode 100755 index 0000000..25461bc --- /dev/null +++ b/updating models/recalibration_sens.R @@ -0,0 +1,121 @@ + +############################################# +############### Sensitivity Analysis ####### +############################################# + +# load full data +path_data1 <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/pre-analysis/" +load(paste0(path_data1, "full_sample.Rdata")) + +# load data with original models +path_data2 <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/" +load(paste0(path_data2, "original_models.Rdata")) + +#split data on full stph sample and then select the complete cases per model + +# Split full sample in training and test observations +## make sure to use the same seed and fraction as in recalibrate.R script +fraction <- 0.7 + +#dt <- sort(sample(nrow(stph), nrow(stph) * fraction)) +#test.data <- stph[dt,] +#train.data <- stph[-dt,] + +############################################# +############### MELD score ################## +############################################# +# MELD score re-calibration - subset data +#test.meld <- stph.meld[stph.meld$Subject %in% test.data$Subject,] +#train.meld <- stph.meld[stph.meld$Subject %in% train.data$Subject,] + +set.seed(111) # Set random seed +dt <- sort(sample(nrow(stph.meld), nrow(stph.meld) * fraction)) +test.meld <- stph.meld[dt,] +train.meld <- stph.meld[-dt,] + +# Fit a logistic regression model on the training data +meld_regr <- glm(D90_surv ~ MELD.calc, family = binomial("logit"), data = train.meld) + +# Save the regression coefficients (alpha 
and beta) +ic_meld <- meld_regr$coefficients[1] +slope_meld <- meld_regr$coefficients[2] + +# Update scores on training set +train.meld$updated.meld <- ic_meld + slope_meld*train.meld$MELD.calc +train.meld$meld.surv.updated <- 1/(1 + exp(-train.meld$updated.meld)) + +# Update scores on test set +test.meld$updated.meld <- ic_meld + slope_meld*test.meld$MELD.calc +test.meld$meld.surv.updated <- 1/(1 + exp(-test.meld$updated.meld)) + +############################################# +############### Lille score ################# +############################################# +# Lille score re-calibration - subset data +#test.lille <- stph.lille[stph.lille$Subject %in% test.data$Subject,] +#train.lille <- stph.lille[stph.lille$Subject %in% train.data$Subject,] + +set.seed(111) +dt <- sort(sample(nrow(stph.lille), nrow(stph.lille) * fraction)) +test.lille <- stph.lille[dt,] +train.lille <- stph.lille[-dt,] + +# Run logistic regression and save regression coefficients +lille_regr <- glm(D90_surv ~ LILLE, family = binomial("logit"), data = train.lille) + +# Save the regression coefficients (alpha and beta) +ic_lille <- lille_regr$coefficients[1] +slope_lille <- lille_regr$coefficients[2] + +# Update scores on training set +train.lille$updated.lille <- ic_lille + slope_lille*train.lille$LILLE +train.lille$lille.surv.updated <- 1/(1 + exp(-train.lille$updated.lille)) + +# Update scores on test set +test.lille$updated.lille <- ic_lille + slope_lille*test.lille$LILLE +test.lille$lille.surv.updated <- 1/(1 + exp(-test.lille$updated.lille)) + +############################################# +############### CLIF-C ACLF score ########### +############################################# +# CLIF-C ACLF score re-calibration - subset data +#test.clif <- stph.clif[stph.clif$Subject %in% test.data$Subject,] +#train.clif <- stph.clif[stph.clif$Subject %in% train.data$Subject,] + +set.seed(111) +dt <- sort(sample(nrow(stph.clif), nrow(stph.clif) * fraction)) +test.clif <- stph.clif[dt,] +train.clif <- stph.clif[-dt,] + +# Run logistic regression and save regression coefficients +clif_regr <- glm(D90_surv ~ CLIF.C, family = binomial("logit"), data = train.clif) + +# Save the regression coefficients (alpha and beta) +ic_clif <- clif_regr$coefficients[1] +slope_clif <- clif_regr$coefficients[2] + +# Update scores on training set +train.clif$updated.clif <- ic_clif + slope_clif*train.clif$CLIF.C +train.clif$clif.surv.updated <- 1/(1 + exp(-train.clif$updated.clif)) + +# Assess performance on the test set +test.clif$updated.clif <- ic_clif + slope_clif*test.clif$CLIF.C +test.clif$clif.surv.updated <- 1/(1 + exp(-test.clif$updated.clif)) + +# overall sample size per model +nrow(stph.meld); nrow(stph.lille); nrow(stph.clif) + +# train/test sample size +model <- c("MELD", "Lille", "CLIF") +train_size <- c(nrow(train.meld), nrow(train.lille), nrow(train.clif)) +test_size <- c(nrow(test.meld), nrow(test.lille), nrow(test.clif)) +df <- data.frame(model, train_size, test_size) +df + +library(xtable) +xtable(t(df)) + +#path_to_save <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/updating models/" +#setwd(path_to_save) +#save(test.meld, test.lille, test.clif, file = "recalibrated_models_sens.Rdata") + diff --git a/updating models/val/Calibration_severity.R b/updating models/val/Calibration_severity.R new file mode 100644 index 0000000..38ad8ed --- /dev/null +++ b/updating models/val/Calibration_severity.R @@ -0,0 +1,165 @@ + +library(ggplot2) +library(dplyr) +library(survminer) # for plotting theme + +path_funs <- 
"/Users/work/IDrive-Sync/Projects/MIMAH/code/funs" +source(paste0(path_funs, "/calibration_fun.R")) +source(paste0(path_funs, "/val.prob.ci.2_wrapper.R")) + +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/val_data" +load(paste0(path_data, "/models_validation.Rdata")) + +############################################# +############### Calculate calibration ###### +############################################# + +# stratify by severity +severity <- "non-severe" # "severe" or non-severe + +if(severity == "severe"){ + data_meld <- Global_AlcHep.meld %>% filter(`DF at admission` >= 32) + data_lille <- Global_AlcHep.lille %>% filter(`DF at admission` >= 32) + }else{ + data_meld <- Global_AlcHep.meld %>% filter(`DF at admission` < 32) + data_lille <- Global_AlcHep.lille %>% filter(`DF at admission` < 32) + } + + +############################################# +# MELD 1 survival function +cal_MELD.surv <- calibration(data_meld$MELD.surv, y = data_meld$D90_surv) +cal_MELD.surv$Score <- "MELD 1" + +# MELD_2 survival function +cal_MELD.surv2 <- calibration(data_meld$MELD.surv2, y = data_meld$D90_surv) +cal_MELD.surv2$Score <- "MELD 2" + +# MELD_2 survival function +cal_MELD.surv2 <- calibration(data_meld$MELD.surv2, y = data_meld$D90_surv) +cal_MELD.surv2$Score <- "MELD 2" + +# MELD VanDerwerken +cal_MELD.VanDerwerken <- calibration(data_meld$MELD_Van, y = data_meld$D90_surv) +cal_MELD.VanDerwerken$Score <- "MELD VanDerwerken" + +# MELD updated +cal_MELD.updated <- calibration(data_meld$meld.surv.updated, y = data_meld$D90_surv) +cal_MELD.updated$Score <- "MELD updated" + +############################################# +# Lille +cal_Lille <- calibration(data_lille$Lille.surv, y = data_lille$D90_surv) +cal_Lille$Score <- "Lille original" + +# Lille updated +cal_Lille.updated <- calibration(data_lille$lille.surv.updated, y = data_lille$D90_surv) +cal_Lille.updated$Score <- "Lille updated" + +# combine dfs +df_cal <- rbind(cal_MELD.surv, cal_MELD.surv2, cal_MELD.VanDerwerken, cal_MELD.updated, cal_Lille, cal_Lille.updated) + +df_cal <- df_cal %>% mutate(Status = ifelse(Score == "Lille updated" | Score == "MELD updated", "Updated", "Original")) + +############################################# + +if(severity == "severe"){ + df_cal_severe <- df_cal + df_cal_severe$severity <- "Severe AH" + }else{ + df_cal_non_severe <- df_cal + df_cal_non_severe$severity <- "Non-severe AH" +} + +df_cal_final <- rbind(df_cal_severe, df_cal_non_severe) + +############################################# + +# plot with ribbon +# display.brewer.pal(n = 8, name = 'Dark2') +col_updated <- brewer.pal(n = 8, name = "Dark2")[c(2, 3)] + +p_calibration_updated <- df_cal_final %>% filter(Status == "Updated") %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_ribbon(aes(ymin = lower, ymax = upper, fill = Score, linetype = NA), + alpha = 0.3, show.legend = F) + + #scale_color_brewer(palette = "Dark2") + + #scale_fill_brewer(palette = "Dark2") + + scale_fill_manual("", values = col_updated) + + scale_color_manual(name = "Score", values = col_updated) + + facet_grid(severity ~ Score) + + coord_equal() + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed survival proportion") + + xlab("Predicted survival probability") + + theme_classic2() + + theme(legend.position = "none") + + scale_x_continuous(labels = c(0, 0.25, 0.5, 0.75, 1)) + +p_calibration_updated + +col_original <- brewer.pal(n = 8, name = "Dark2")[c(2, 
4, 5, 3)] + +p_calibration_original <- df_cal_final %>% filter(Status == "Original") %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_ribbon(aes(ymin = lower, ymax = upper, fill = Score, linetype = NA), + alpha = 0.3, show.legend = F) + + #scale_color_brewer(palette = "Dark2") + + #scale_fill_brewer(palette = "Dark2") + + scale_fill_manual("", values = col_original) + + scale_color_manual(name = "Score", values = col_original) + + facet_grid(severity ~ Score) + + coord_equal() + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed survival proportion") + + xlab(" ") + + theme_classic2() + + theme(legend.position = "none") + + #scale_y_continuous(labels = scientific) + scale_x_continuous(labels = c(0, 0.25, 0.5, 0.75, 1.00)) + +p_calibration_original + +ggarrange(p_calibration_original, p_calibration_updated, nrow = 1) + +cowplot::plot_grid(p_calibration_original, p_calibration_updated, nrow = 1) + + +# re-arrange factor levels +df_cal_final2 <- df_cal_final %>% + mutate(Score = factor(Score, + levels = c("Lille updated", "MELD updated", "Lille original", "MELD VanDerwerken", "MELD 1", "MELD 2"))) + +display.brewer.pal(n = 8, name = 'Paired') + +col_2 <- brewer.pal(n = 8, name = "Paired")[c(1, 2, 5, 6, 7, 8)] + +df_cal_final2 %>% #filter(Status == "Original") %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_ribbon(aes(ymin = lower, ymax = upper, fill = Score, linetype = NA), + alpha = 0.3, show.legend = F) + + #scale_color_brewer(palette = "Paired") + + #scale_fill_brewer(palette = "Paired") + + scale_fill_manual("", values = col_2) + + scale_color_manual(name = "Score", values = col_2) + + facet_grid(severity ~ Score) + + coord_equal() + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed survival proportion") + + xlab("Predicted survival probability") + + #theme_minimal() + + theme_classic2() + + theme(legend.position = "none") + + #scale_y_continuous(labels = scientific) + scale_x_continuous(labels = c(0, 0.25, 0.5, 0.75, 1.00)) + + diff --git a/updating models/val/Calibration_temporal.R b/updating models/val/Calibration_temporal.R new file mode 100644 index 0000000..79c8085 --- /dev/null +++ b/updating models/val/Calibration_temporal.R @@ -0,0 +1,125 @@ + +# This script evaluated all calculations of model calibration +library(ggplot2) +library(dplyr) +library(survminer) # for plotting theme + +path_funs <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/funs" +source(paste0(path_funs, "/calibration_fun.R")) +source(paste0(path_funs, "/val.prob.ci.2_wrapper.R")) + +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/val_data" +load(paste0(path_data, "/models_validation.Rdata")) + +############################################# +############### Calculate calibration ###### +############################################# + +data_meld <- Global_AlcHep.meld %>% filter(`Date of admission`> "2015-01-01") +data_lille <- Global_AlcHep.lille %>% filter(`Date of admission`> "2015-01-01") + +############################################# +# MELD 1 survival function +cal_MELD.surv <- calibration(data_meld$MELD.surv, y = data_meld$D90_surv) +cal_MELD.surv$Score <- "MELD 1" + +# MELD_2 survival function +cal_MELD.surv2 <- calibration(data_meld$MELD.surv2, y = data_meld$D90_surv) +cal_MELD.surv2$Score <- "MELD 2" + +# MELD_2 survival function +cal_MELD.surv2 <- 
calibration(data_meld$MELD.surv2, y = data_meld$D90_surv) +cal_MELD.surv2$Score <- "MELD 2" + +# MELD VanDerwerken +cal_MELD.VanDerwerken <- calibration(data_meld$MELD_Van, y = data_meld$D90_surv) +cal_MELD.VanDerwerken$Score <- "MELD VanDerwerken" + +# MELD updated +cal_MELD.updated <- calibration(data_meld$meld.surv.updated, y = data_meld$D90_surv) +cal_MELD.updated$Score <- "MELD updated" + +############################################# +# Lille +cal_Lille <- calibration(data_lille$Lille.surv, y = data_lille$D90_surv) +cal_Lille$Score <- "Lille original" + +# Lille updated +cal_Lille.updated <- calibration(data_lille$lille.surv.updated, y = data_lille$D90_surv) +cal_Lille.updated$Score <- "Lille updated" + +# combine dfs +df_cal <- rbind(cal_MELD.surv, cal_MELD.surv2, cal_MELD.VanDerwerken, cal_MELD.updated, cal_Lille, cal_Lille.updated) + +df_cal <- df_cal %>% mutate(Status = ifelse(Score == "Lille updated" | Score == "MELD updated", "Updated", "Original")) + +############################################# +# plot without ribbon and without MELD 3.0 +df_cal %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + #geom_ribbon(aes(ymin = lower, ymax = upper, linetype = NA), + # alpha = 0.3, show.legend = F) + + #scale_fill_manual("", values = col) + + #scale_color_manual(name = "Score", values = col) + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed proportion") + + xlab("Predicted probability") + + theme_classic() + +# plot with ribbon +# display.brewer.pal(n = 8, name = 'Dark2') +col_updated <- brewer.pal(n = 8, name = "Dark2")[c(2, 3)] + +p_calibration_updated <- df_cal %>% filter(Status == "Updated") %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_ribbon(aes(ymin = lower, ymax = upper, fill = Score, linetype = NA), + alpha = 0.3, show.legend = F) + + #scale_color_brewer(palette = "Dark2") + + #scale_fill_brewer(palette = "Dark2") + + scale_fill_manual("", values = col_updated) + + scale_color_manual(name = "Score", values = col_updated) + + facet_grid(. ~ Score) + + coord_equal() + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed survival proportion") + + xlab("Predicted survival probability") + + theme_classic2() + + theme(legend.position = "none") + + scale_x_continuous(labels = c(0, 0.25, 0.5, 0.75, 1)) + +p_calibration_updated + +col_original <- brewer.pal(n = 8, name = "Dark2")[c(2, 4, 5, 3)] + +p_calibration_original <- df_cal %>% filter(Status == "Original") %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_ribbon(aes(ymin = lower, ymax = upper, fill = Score, linetype = NA), + alpha = 0.3, show.legend = F) + + #scale_color_brewer(palette = "Dark2") + + #scale_fill_brewer(palette = "Dark2") + + scale_fill_manual("", values = col_original) + + scale_color_manual(name = "Score", values = col_original) + + facet_grid(. 
~ Score) + + coord_equal() + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed survival proportion") + + xlab(" ") + + theme_classic2() + + theme(legend.position = "none") + + #scale_y_continuous(labels = scientific) + scale_x_continuous(labels = c(0, 0.25, 0.5, 0.75, 1.00)) + +p_calibration_original + +ggarrange(p_calibration_original, p_calibration_updated, nrow = 2) + +cowplot::plot_grid(p_calibration_original, p_calibration_updated, nrow = 2, scale = 1) \ No newline at end of file diff --git a/updating models/val/Calibration_validation.R b/updating models/val/Calibration_validation.R new file mode 100644 index 0000000..f959a66 --- /dev/null +++ b/updating models/val/Calibration_validation.R @@ -0,0 +1,172 @@ + +# This script evaluated all calculations of model calibration +library(ggplot2) +library(dplyr) +library(survminer) # for plotting theme +library(RColorBrewer) + +path_funs <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/funs" +source(paste0(path_funs, "/calibration_fun.R")) +source(paste0(path_funs, "/val.prob.ci.2_wrapper.R")) + +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/val_data" +load(paste0(path_data, "/models_validation.Rdata")) + +############################################# +############### Calculate calibration ###### +############################################# + +data_to_use <- "all" # all, severe, non_severe "no" + +if(data_to_use == "all"){ + data_meld <- Global_AlcHep.meld + data_lille <- Global_AlcHep.lille + }else if(data_to_use == "severe"){ + data_meld <- Global_AlcHep.meld %>% filter(`DF at admission` >= 32) + data_lille <- Global_AlcHep.lille %>% filter(`DF at admission` >= 32) + }else if(data_to_use == "non_severe"){ + data_meld <- Global_AlcHep.meld %>% filter(`DF at admission` < 32) + data_lille <- Global_AlcHep.lille %>% filter(`DF at admission` < 32) + } + +############################################# +# MELD 1 survival function +cal_MELD.surv <- calibration(data_meld$MELD.surv, y = data_meld$D90_surv) +cal_MELD.surv$Score <- "MELD 1" + +# MELD_2 survival function +cal_MELD.surv2 <- calibration(data_meld$MELD.surv2, y = data_meld$D90_surv) +cal_MELD.surv2$Score <- "MELD 2" + +# MELD VanDerwerken +cal_MELD.VanDerwerken <- calibration(data_meld$MELD_Van, y = data_meld$D90_surv) +cal_MELD.VanDerwerken$Score <- "MELD VanDerwerken" + +# MELD updated +cal_MELD.updated <- calibration(data_meld$meld.surv.updated, y = data_meld$D90_surv) +cal_MELD.updated$Score <- "MELD updated" + +############################################# +# Lille +cal_Lille <- calibration(data_lille$Lille.surv, y = data_lille$D90_surv) +cal_Lille$Score <- "Lille original" + +# Lille updated +cal_Lille.updated <- calibration(data_lille$lille.surv.updated, y = data_lille$D90_surv) +cal_Lille.updated$Score <- "Lille updated" + +# combine dfs +df_cal <- rbind(cal_MELD.surv, cal_MELD.surv2, cal_MELD.VanDerwerken, cal_MELD.updated, cal_Lille, cal_Lille.updated) + +df_cal <- df_cal %>% mutate(Status = ifelse(Score == "Lille updated" | Score == "MELD updated", "Updated", "Original")) + +############################################# +# plot without ribbon and without MELD 3.0 +df_cal %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + #geom_ribbon(aes(ymin = lower, ymax = upper, linetype = NA), + # alpha = 0.3, show.legend = F) + + #scale_fill_manual("", values = col) + + #scale_color_manual(name = "Score", values = col) + + xlim(0, 1) + + ylim(0, 1) + + 
ylab("Observed proportion") + + xlab("Predicted probability") + + theme_classic() + +# plot with ribbon +# display.brewer.pal(n = 8, name = 'Dark2') +col_updated <- brewer.pal(n = 8, name = "Dark2")[c(2, 3)] + +p_calibration_updated <- df_cal %>% filter(Status == "Updated") %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_ribbon(aes(ymin = lower, ymax = upper, fill = Score, linetype = NA), + alpha = 0.3, show.legend = F) + + #scale_color_brewer(palette = "Dark2") + + #scale_fill_brewer(palette = "Dark2") + + scale_fill_manual("", values = col_updated) + + scale_color_manual(name = "Score", values = col_updated) + + facet_grid(. ~ Score) + + coord_equal() + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed survival proportion") + + xlab("Predicted survival probability") + + theme_classic2() + + theme(legend.position = "none") + + scale_x_continuous(labels = c(0, 0.25, 0.5, 0.75, 1)) + +p_calibration_updated + +col_original <- brewer.pal(n = 8, name = "Dark2")[c(2, 4, 5, 3)] + +p_calibration_original <- df_cal %>% filter(Status == "Original") %>% + ggplot(., aes(x = pred, y = obs, col = Score)) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed") + + geom_ribbon(aes(ymin = lower, ymax = upper, fill = Score, linetype = NA), + alpha = 0.3, show.legend = F) + + #scale_color_brewer(palette = "Dark2") + + #scale_fill_brewer(palette = "Dark2") + + scale_fill_manual("", values = col_original) + + scale_color_manual(name = "Score", values = col_original) + + facet_grid(. ~ Score) + + coord_equal() + + xlim(0, 1) + + ylim(0, 1) + + ylab("Observed survival proportion") + + xlab(" ") + + theme_classic2() + + theme(legend.position = "none") + + #scale_y_continuous(labels = scientific) + scale_x_continuous(labels = c(0, 0.25, 0.5, 0.75, 1.00)) + +p_calibration_original + +ggarrange(p_calibration_original, p_calibration_updated, nrow = 2) + +cowplot::plot_grid(p_calibration_original, p_calibration_updated, nrow = 2, scale = 1) + +############################################# +############## Summary Stats ############### +############################################# + +# compute calibration slopes and intercepts +# MELD updated +MELD_stats <- val.prob.ci.2_wrapper(data_meld$meld.surv.updated, data_meld$D90_surv, "MELD updated") + +# Lille updated +Lille_stats <- val.prob.ci.2_wrapper(data_lille$lille.surv.updated, data_lille$D90_surv, "Lille updated") + +df_stats <- rbind(MELD_stats, Lille_stats) %>% relocate(Score) + +p_calibration_updated + geom_text(data = df_stats %>% filter(stat == "intercept"), + aes(0.1, 0.95, label = + paste0("Intercept (95% CI): ", Point.estimate, " (", Lower.confidence.limit, "-", Upper.confidence.limit, ")"), + hjust = 0), col = "black") + + geom_text(data = df_stats %>% filter(stat == "slope"), + aes(0.1, 0.90, label = + paste0("Slope (95% CI): ", Point.estimate, " (", Lower.confidence.limit, "-", Upper.confidence.limit, ")"), + hjust = 0), col = "black") + + +##### original models +# MELD +MELD_stats1 <- val.prob.ci.2_wrapper(data_meld$MELD.surv, data_meld$D90_surv, "MELD 1") +MELD_stats2 <- val.prob.ci.2_wrapper(data_meld$MELD.surv2, data_meld$D90_surv, "MELD 2") +MELD_Van <- val.prob.ci.2_wrapper(data_meld$MELD_Van, data_meld$D90_surv, "MELD Van") + +# Lille updated +Lille_stats_orig <- val.prob.ci.2_wrapper(data_lille$Lille.surv, data_lille$D90_surv, "Lille") + +df_original_stats <- rbind(MELD_stats1,MELD_stats2, MELD_Van, Lille_stats_orig) + 
+df_original_stats %>% filter(stat == "slope") +library(xtable) +xtable((df_original_stats)) + + diff --git a/updating models/val/Discrimiation_validation.R b/updating models/val/Discrimiation_validation.R new file mode 100644 index 0000000..ec24340 --- /dev/null +++ b/updating models/val/Discrimiation_validation.R @@ -0,0 +1,130 @@ +library(pROC) +library(dplyr) +library(ggplot2) +library(purrr) +library(tidyr) +library(forcats) +#library(ggROC) + +# load data +path_data <- "/Users/work/IDrive-Sync/Projects/MIMAH/code/AH_code/AH_code/original models/val_data" +load(paste0(path_data, "/models_validation.Rdata")) + +############################################# +############### Calculate AUCs ############# +############################################# + +data_to_use <- "all" # all, severe, non-severe "no" + +if(data_to_use == "all"){ + data_meld <- Global_AlcHep.meld + data_lille <- Global_AlcHep.lille +}else if(data_to_use == "severe"){ + data_meld <- Global_AlcHep.meld %>% filter(`DF at admission` >= 32) + data_lille <- Global_AlcHep.lille %>% filter(`DF at admission` >= 32) +}else if(data_to_use == "non-severe"){ + data_meld <- Global_AlcHep.meld %>% filter(`DF at admission` < 32) + data_lille <- Global_AlcHep.lille %>% filter(`DF at admission` < 32) +} + +############################################# +# MELD original +roc_meld_original <- roc(data_meld$D90_surv, data_meld$MELD.surv) + +# MELD updated +roc_meld_updated <- roc(data_meld$D90_surv, data_meld$meld.surv.updated) + +# Lille original +roc_lille_original <- roc(data_lille$D90_surv, data_lille$LILLE) + +# Lille updated +roc_lille_updated <- roc(data_lille$D90_surv, data_lille$lille.surv.updated) + +############################################# +###################### Plots ############### +############################################# +roc.list <- list( + "Lille original" = roc_lille_original, + "Lille updated" = roc_lille_updated, + "MELD original" = roc_meld_original, + "MELD updated" = roc_meld_updated) + +#setwd(path_data) +#save(roc.list, file = "ROC_updated.Rdata") + +g.list <- ggroc(roc.list) + +# ROC plot of all models combined +g.list + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + theme_classic() + +# faceting +g.list + + facet_grid(. ~ name) + + geom_line(lwd = 1) + + geom_abline(slope = 1, intercept = 1, linetype = "dashed") + + theme_classic() + + theme(legend.position="none") + +# add confidence bands +ci.list <- lapply(roc.list, ci.se, specificities = seq(0, 1, l = 25)) + +dat.ci.list <- lapply(ci.list, function(ciobj) + data.frame(x = as.numeric(rownames(ciobj)), + lower = ciobj[, 1], + upper = ciobj[, 3])) + +df <- plyr::ldply(dat.ci.list, data.frame, .id = "name") + +pl <- ggroc(roc.list, lwd = 1.1) + + facet_grid(. 
~ name) +
+  #theme_minimal() +
+  geom_ribbon(data = df, aes(x = x, ymin = lower, ymax = upper, fill = name), alpha = 0.3, inherit.aes = F) +
+  geom_abline(slope = 1, intercept = 1, linetype = "dashed") +
+  labs(x = "Specificity", y = "Sensitivity") +
+  scale_color_brewer(palette = "Dark2") +
+  scale_fill_brewer(palette = "Dark2") +
+  coord_equal() +
+  theme_classic2() +
+  theme(legend.position = "none")
+
+#############################################
+###################### AUC #################
+#############################################
+# CI
+#auc_meld <- auc(roc_meld)
+#auc_meld_ci <- ci.auc(roc_meld) # Confidence intervals
+#roc_plot(stph.meld, "D90_surv", "MELD.surv", ci = TRUE, plot_title = "ROC curve for the MELD score")
+
+df_AUC <- as.data.frame(map_dfr(roc.list, ci.auc))
+rownames(df_AUC) <- c("low_CL", "mean", "upper_CL")
+
+df_AUC2 <- tibble::rownames_to_column(df_AUC, var = "AUC")
+
+df3 <- gather(df_AUC2, condition, measurement, `Lille original`:`MELD updated`, factor_key = TRUE)
+data_wide <- spread(df3, AUC, measurement) %>% arrange(mean)
+
+# reorder factor levels
+data_wide$condition <- fct_reorder(data_wide$condition, data_wide$mean)
+
+ggplot(data_wide, aes(x = mean, y = condition, col = condition)) +
+  geom_point(lwd = 2) +
+  coord_cartesian(xlim = c(0.5, 0.86)) +
+  geom_errorbar(aes(xmin = low_CL, xmax = upper_CL),
+                alpha = 1, show.legend = F, lwd = 1, width = 0.5) +
+  labs(y = "Score", col = "Score", x = "AUC with 95% limits") +
+  theme_classic()
+
+# label the faceted ROC plot with AUC and 95% CI (uses data_wide from the AUC section above)
+data_wide$name <- data_wide$condition
+data_wide$low_CL <- round(data_wide$low_CL, 2)
+data_wide$mean <- round(data_wide$mean, 2)
+data_wide$upper_CL <- round(data_wide$upper_CL, 2)
+
+pl + geom_text(data = data_wide, aes(0.05, 0.25, label = paste0("AUC (95% CI): ", mean, " (", low_CL, "-", upper_CL, ")"),
+                                     hjust = 1), col = "black")
diff --git a/updating models/val_prob_confidence.R b/updating models/val_prob_confidence.R old mode 100644 new mode 100755
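# --- Note on the recalibration step (sketch, refers to recalibration_sens.R above) ---
# The manual update  linear predictor = intercept + slope * score,
# survival = 1/(1 + exp(-linear predictor))  is logistic recalibration.
# An equivalent, more compact form uses predict() and plogis(); assuming the
# train.meld / test.meld split created in that script, this refits the same
# model and reproduces test.meld$meld.surv.updated.
meld_recal <- glm(D90_surv ~ MELD.calc, family = binomial("logit"), data = train.meld)
surv_updated_check <- plogis(predict(meld_recal, newdata = test.meld))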
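# --- Possible refactor of Calibration_severity.R (sketch, not in the repo) ---
# Both severity strata can be built in one pass instead of re-running the script
# with the `severity` flag flipped by hand; assumes Global_AlcHep.meld and the
# project's calibration() function as loaded in that script. The same pattern
# extends to the Lille data and the other MELD variants.
library(dplyr)
library(purrr)
cal_meld_by_severity <- map_dfr(c("Severe AH", "Non-severe AH"), function(lab) {
  d <- if (lab == "Severe AH") {
    Global_AlcHep.meld %>% filter(`DF at admission` >= 32)
  } else {
    Global_AlcHep.meld %>% filter(`DF at admission` < 32)
  }
  out <- calibration(d$meld.surv.updated, y = d$D90_surv)
  out$Score <- "MELD updated"
  out$severity <- lab
  out
})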
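# --- Possible addition to Discrimiation_validation.R (sketch) ---
# A formal paired comparison of the original and updated AUCs can be added with
# pROC::roc.test() (DeLong test); roc_meld_* and roc_lille_* are the roc()
# objects created in that script.
library(pROC)
roc.test(roc_meld_original, roc_meld_updated, paired = TRUE, method = "delong")
roc.test(roc_lille_original, roc_lille_updated, paired = TRUE, method = "delong")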