Setup

#clear
  knitr::opts_chunk$set(warning = F, message = F)
  source('P:\\flunm\\K1M\\flublok_pretrial\\flublok_pretrial_cfg.R')

Load Cohort File

7 Methods

  1. Simple Random
  2. 2-strata randomization
  3. Pair-matched design
  4. K-means cluster, a prior covariates
  5. K-means cluster, PCA
  6. Re-randomization

Variable Spefications

Select key covariates

Covariates from LASSO

final.vars <- coef(cv_fit, s = "lambda.min")[, 1]

final.vars <- final.vars[final.vars!=0]
final.vars <- final.vars[2:length(final.vars)] %>% #remove intercept  
  sort(.)

final.vars <- names(final.vars)

final.vars

mdshosps / mds_hospptyr / fac_hosp_num are all very similar so just lept mds_hospptyr.

## Variables selected from 2015 analysis and LASSO:  pcthmo agghighcfs obs_rehosprate acuindex2 mds_hospptyr fac_ls fac_age_85 fac_white fac_black fac_diet fac_dem
#Only for proportional variables, continuous measures dont have a percentage point diff
pr_vars <- c('female', 'white', 'black', 'hispanic', 'asian', 'mds_resp',
             'mds_dem', 'M3K0510C2', 'age_65', 'age_85')
pr_var_lbls <- c('Female*', 'White', 'Black' ,'Hispanic*', 'Asian*',
                 'Resp. diagnosis', 'Dementia diagnosis', 'Mech. Diet', 
                 'Age >=65 years', 'Age >=85 years*')

cat('Outlier definition, difference in standardized group means:', pr_reject, '\n')
cat('Variables to test for outliers:', paste0(pr_vars, collapse=', '), '\n')

Functions for randomization

Method 2. Simple stratified randomization - Race, Size

Method 3. Pair-matched Randomization - Mahalanobis Distance

Method 4. K-means clustering, key variables

Method 5. K-means, PCA cluster eigen-weighted

Method 6. Re-randomization

Find Acceptable Cut-Off Value

st_run <- Sys.time()

  df_rerand <- df_cov %>%
    select(-dc_hosp_any) %>%
    distinct(.) %>%
    mutate_at(vars(-fac_id), scale)

 # Find M-distance cutoff empirically   
  do_it <- function(x) {

    sim_iter <- rnd_allot(df_fac) %>%
      as.data.frame(.) %>%
      inner_join(df_rerand, ., by=c('fac_id'='id')) %>%
      select(-fac_id) %>%
      as.data.frame(.)

    mdis_iter <- mahalanobis(colMeans(sim_iter[sim_iter$group=='a', fac_key_vars], na.rm=T), 
                               colMeans(sim_iter[sim_iter$group=='b', fac_key_vars], na.rm=T), 
                                  cov= cov(sim_iter[, fac_key_vars])) * 
                   ((nrow(sim_iter[sim_iter$group=='a', ]) * nrow(sim_iter[sim_iter$group=='b', ])) / (nrow(sim_iter)))
  }

  m6_chk_mdis <- sapply(1:n_rndms, do_it)
  
  m6_cutoff <- quantile(m6_chk_mdis, 0.001)  
  
st_end <- Sys.time()

cat('Method 6. Re-randomization cut-off', round(m6_cutoff, 3), ' \n')
st_end - st_run

Perform Re-randomizations

st_run <- Sys.time()

  m6_res <- list()
  
  m6_res$delta <- matrix(NA, nrow = n_re_rndms, ncol = length(chk_id_vars))  
  m6_res$stdev <- df_trial[, chk_id_vars] %>% 
      map_dfr(., sd, na.rm=T) %>%
      t(.)

  df_m6 <- df_trial %>%
    select(fac_id, fac_key_vars) %>%
    distinct()   
  
  do_it <- function(x)  {
    repeat { # search for good rerand
    
      chk_rand <- rnd_allot(df_fac) %>%
        inner_join(df_m6[, c('fac_id', fac_key_vars)], ., by=c('fac_id'='id')) 
    
      mdis_iter <- Rfast::mahala(colMeans(chk_rand[chk_rand$group=='a', fac_key_vars], na.rm=T), 
                                 colMeans(chk_rand[chk_rand$group=='b', fac_key_vars], na.rm=T), 
                                 sigma= cov(chk_rand[, fac_key_vars])) * 
                       ((nrow(chk_rand[chk_rand$group=='a', ]) * nrow(chk_rand[chk_rand$group=='b', ])) / (nrow(chk_rand)))
      
      if(mdis_iter<m6_cutoff) {
        break
      }
    }
  
  sim_iter <- inner_join(df_trial[, c('fac_id', chk_id_vars)], 
                         chk_rand[, c('fac_id', 'group')], 
                         by=c('fac_id')) %>%
    select(group, chk_id_vars)
  
  delta <- do_rand(sim_iter)
  return(delta)
}

m6_res$delta[1:n_rndms, ] <- t(sapply(1L:n_rndms, do_it))
m6_res$smd <- t(apply(m6_res$delta, 1, function(x) x / t(m6_res$stdev)))

st_end <- Sys.time()

cat(paste0(n_rndms), 'Method 6. Re-randomizations, \n')
st_end - st_run

Permute M6

st_run <- Sys.time()

m6_rtest <- vector(mode = 'double', length = n_permutes)

for (i  in 1:n_permutes) {
  
      repeat { # search for good rerand
    
      chk_rand <- rnd_allot(df_fac) %>%
        inner_join(df_m6[, c('fac_id', fac_key_vars)], ., by=c('fac_id'='id')) 
    
      mdis_iter <- Rfast::mahala(colMeans(chk_rand[chk_rand$group=='a', fac_key_vars], na.rm=T), 
                                 colMeans(chk_rand[chk_rand$group=='b', fac_key_vars], na.rm=T), 
                                 sigma= cov(chk_rand[, fac_key_vars])) * 
                       ((nrow(chk_rand[chk_rand$group=='a', ]) * nrow(chk_rand[chk_rand$group=='b', ])) / (nrow(chk_rand)))
      
      if(mdis_iter<m6_cutoff) {
        break
      }
    }
  
  sim_1 <- df_trial %>%
    select(fac_id, dc_hosp_any) %>%
    inner_join(., chk_rand[, c('fac_id', 'group')], by=c('fac_id')) %>%
    mutate(assign = if_else(group=='a', 1L, 0L)) %>%
    na.omit(.)
  
  
  iter <- lme4::glmer(dc_hosp_any ~ assign + (1 | fac_id), 
                      nAGQ=0, control=glmerControl(optimizer = "nloptwrap"), 
                      data = sim_1, family = 'binomial')
  
  m6_rtest[i] <- mean(coef(iter)$fac_id[['assign']])
}


st_end <- Sys.time()
st_end - st_run
save.image(file='flublok_pretrial_sims.RData')

Evaluation of Methods

Distribution of Variables

Gather checking variables from Methods

fct_lvls <- rndm_methods

for (i in 1:6) { #loop to create smd objects
  assign(paste0('m', i, '_diff'), get(paste0('m', i, '_res'))$delta %>%
     as.data.frame(.) %>%
    mutate(Method = fct_lvls[i]))
}

diff_all <- mget(ls()[str_detect(ls(), '_diff')]) #get all objects with '_diff' names

diff_all <- bind_rows(diff_all) %>%
  mutate(Method = factor(Method, levels= fct_lvls)) %>%
  na.omit(.)

colnames(diff_all) <- c(chk_id_vars, 'Method')

diff_all_grp <- gather(diff_all, key = 'var', value = 'x', -Method) %>%
  mutate(var = factor(var, levels=chk_id_vars, labels = chk_id_lbls))

Compute range of values

cats <- c(0.025, 0.25, 0.5, 0.75, 0.975)

quant_vals <- function(x) {
  y <- quantile(x, probs=cats)
  return(y)
}
cats_2 <- c(rep(cats[1], 6*length(chk_id_vars)), 
          rep(cats[2], 6*length(chk_id_vars)), 
          rep(cats[3], 6*length(chk_id_vars)), 
          rep(cats[4], 6*length(chk_id_vars)), 
          rep(cats[5], 6*length(chk_id_vars)))

sum_diff_grp <- diff_all_grp %>%
  split(., list(.$Method, .$var)) %>%
  purrr::map(~quant_vals(.$x)) %>%
  bind_rows(.) %>%
  pivot_longer(., 
               cols = everything(),
               values_to = 'x') %>%
  mutate(Quintile = cats_2) %>%
  separate(., name, into = c('Method', 'var'), sep = "([\\.])") 

Table of Quintiles

tab_quint <- sum_diff_grp %>%
  pivot_wider(., id_cols = c(Method, var), names_from = Quintile, values_from = x) %>%
  select(var, everything()) 

tab_quint[, 3:7] <- round(tab_quint[, 3:7], 3) 

DT::datatable(tab_quint)

Variables greater 0.2 SMD

fct_lvls <- rndm_methods

for (i in 1:6) { #loop to create smd objects
  assign(paste0('m', i, '_smd'), get(paste0('m', i, '_res'))$smd %>%
     as.data.frame(.) %>%
    mutate(Method = fct_lvls[i]))
}

smd_all <- mget(ls()[str_detect(ls(), '_smd')]) #get all objects with '_smd' names

smd_all <- bind_rows(smd_all) %>%
  mutate(Method = factor(Method, levels= fct_lvls)) %>%
  na.omit(.)

colnames(smd_all) <- c(chk_id_vars, 'Method')

smd_all_grp <- gather(smd_all, key = 'var', value = 'x', -Method) %>%
  mutate(var = factor(var, levels=chk_id_vars, labels = chk_id_lbls))

Table % of variables with >0.2 SMD

prop_outlier <- smd_all_grp %>%
  mutate(out = if_else(abs(x)>0.2, 1L, 0L)) %>%
  group_by(Method, var) %>%
  summarize(pr_out = (sum(out) / n())*100) %>%
  pivot_wider(., id_cols = c(var), names_from = Method, values_from = pr_out)

DT::datatable(prop_outlier)

Variance reduction

Permutation Results

save.image(file='flublok_pretrial_sims.RData')