################ Infrastructure to get initial solutions ###############

# Extract the model parameters from an initial solution
# I - the data matrix
# Format: rows are genes (or parcels, or voxels); columns are time points of many patients
# Column name format: "subject_name;TRX", where X is the time point index (e.g., "subj1;TR3")
# selected_bic_rows - the rows of the initial bicluster
# selected_bic_cols - the columns of the initial bicluster
get_initial_solution_I_as_input <- function(I, selected_bic_rows, selected_bic_cols, window_size = 1, start_tr = 0){
  # create the objects of our model
  Z_cols_to_subjects = sapply(colnames(I), function(x){strsplit(x, split = ';')[[1]][1]})
  n_subj = length(unique(Z_cols_to_subjects))
  n_tr = sum(Z_cols_to_subjects == Z_cols_to_subjects[1])
  n_voxels = nrow(I)
  # H: row (voxel) membership indicator
  H = rep(0, n_voxels); names(H) = rownames(I)
  H[selected_bic_rows] = 1
  # P: per-subject indicator (currently not returned)
  P = rep(1, n_subj); names(P) = unique(Z_cols_to_subjects)
  # C: subjects x time points membership matrix
  C = matrix(nrow = n_subj, ncol = n_tr); C[,] = 0
  colnames(C) = paste("TR", (1 + start_tr):(n_tr + start_tr), sep = '')
  rownames(C) = unique(Z_cols_to_subjects)
  C[t(sapply(selected_bic_cols, function(x){strsplit(x, split = ';')[[1]]}))] = 1
  # HS: voxels x subjects membership matrix
  HS = matrix(nrow = n_voxels, ncol = n_subj); HS[,] = 0
  rownames(HS) = rownames(I); colnames(HS) = unique(Z_cols_to_subjects)
  HS[selected_bic_rows,] = 1
  # set the hyper-parameters of H and C
  pi_v = length(selected_bic_rows) / nrow(I)
  pi_1_1 = sum(C[,1]) / nrow(C)
  C_counts = apply(C, 1, analyze_C_row, window_size = window_size)
  # rows of C_counts hold the (1->1, 0->1, 1->0, 0->0) transition counts per subject
  pi_1_to_1 = sum(C_counts[1,]) / (sum(C_counts[1,]) + sum(C_counts[3,]))
  pi_0_to_1 = sum(C_counts[2,]) / (sum(C_counts[2,]) + sum(C_counts[4,]))
  return(list(I = I, C = C, H = H, HS = HS, pi_1_1 = pi_1_1, w = window_size,
              pi_v = pi_v, pi_1_to_1 = pi_1_to_1, pi_0_to_1 = pi_0_to_1, C_counts = C_counts))
}
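# Minimal usage sketch (the toy matrix and names below are illustrative only, not part of the
# original pipeline): two subjects, ten TRs each, column names following the "subject;TRX"
# convention, and a small seed bicluster.
if (FALSE) {
  set.seed(1)
  toy_I = matrix(rbinom(20 * 20, 1, 0.2), nrow = 20,
                 dimnames = list(paste0("v", 1:20),
                                 paste0(rep(c("subjA", "subjB"), each = 10), ";TR", rep(1:10, 2))))
  toy_sol = get_initial_solution_I_as_input(toy_I,
                                            selected_bic_rows = paste0("v", 1:5),
                                            selected_bic_cols = c("subjA;TR1", "subjA;TR2", "subjB;TR1"))
  toy_sol$C        # subjects x TRs membership
  toy_sol$pi_v     # fraction of rows in the seed bicluster
}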
# Extract time point model parameters from a solution vector x.
# x is a binary vector of a specific subject; a value of 1 means that the specific time
# point is in the module.
analyze_C_row <- function(x, window_size = 1){
  pi_0_to_1 = 0; pi_1_to_1 = 0; pi_0_to_0 = 0; pi_1_to_0 = 0
  n_window_has_1 = 0
  #print(x)
  for (i in 1:length(x)){
    if (i == 1){next}
    wind_ind = i - window_size; wind_ind = max(wind_ind, 1)
    if (sum(x[wind_ind:(i-1)]) > 0){
      pi_1_to_1 = pi_1_to_1 + x[i]
      pi_1_to_0 = pi_1_to_0 + (1 - x[i])
    }
    else{
      pi_0_to_1 = pi_0_to_1 + x[i]
      pi_0_to_0 = pi_0_to_0 + (1 - x[i])
    }
  }
  return (c(pi_1_to_1, pi_0_to_1, pi_1_to_0, pi_0_to_0))
}

# Set the default (non-informative) priors for the hyperparameters
get_default_hyper_parameter_priors <- function(n_tr = 100, n_v = 500, n_subj = 5){
  pr_params = list()
  pr_params[["pi_v"]] = c(1, 1)
  pr_params[['pi_1_1']] = c(1, 1)
  pr_params[['pi_0_to_1']] = c(1, 1)
  pr_params[['pi_1_to_1']] = c(1, 1)
  pr_params[['pi_s']] = c(1, 1)
  pr_params[['pi_s_c']] = c(1, 1)
  pr_params[['pi_lc']] = c(1, 1)
  pr_params[['pi_lc_c']] = c(1, 1)
  # mu0, p0, v0, ss0
  pr_params[['norm_bg']] = c(0, 0, 1, 1)
  pr_params[['norm_bic']] = c(0, 0, 1, 1)
  return (pr_params)
}

get_hyper_parameter_modes_from_prior <- function(sol, pr_params){
  sol$pi_v = get_hp_mode(pr_params[['pi_v']])
  sol$pi_1_1 = get_hp_mode(pr_params[['pi_1_1']])
  sol$pi_0_to_1 = get_hp_mode(pr_params[['pi_0_to_1']])
  sol$pi_1_to_1 = get_hp_mode(pr_params[['pi_1_to_1']])
  sol$pi_s = rep(get_hp_mode(pr_params[['pi_s']]), ncol(sol$HS))
  sol$pi_s_c = get_hp_mode(pr_params[['pi_s_c']])
  sol$pi_lc = get_hp_mode(pr_params[['pi_lc']])
  sol$pi_lc_c = get_hp_mode(pr_params[['pi_lc_c']])
  sol$norm_bg = get_norm_dist_hp_mode(pr_params[['norm_bg']])
  sol$norm_bic = get_norm_dist_hp_mode(pr_params[['norm_bic']])
  return (sol)
}

# Mode of a Beta(a,b) prior; Beta(1,1) has no unique mode, so return 0.5
get_hp_mode <- function(params){
  if (all(params == 1)){return (0.5)}
  return((params[1] - 1) / (sum(params) - 2))
}

# params: mu0, p0, v0, ss0
# returns c(mean, sd) implied by the prior parameters
get_norm_dist_hp_mode <- function(params){
  precision = params[3] / params[4]
  return (c(params[1], sqrt(1 / (precision * params[3]))))
}
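# Worked sketch of the prior-mode helpers (values below are arithmetic checks only,
# not outputs recorded from a real run):
if (FALSE) {
  get_hp_mode(c(1, 1))                  # uniform Beta(1,1) -> 0.5 by the special case
  get_hp_mode(c(3, 2))                  # Beta(3,2) mode = (3-1)/(3+2-2) = 2/3
  get_norm_dist_hp_mode(c(0, 0, 1, 1))  # default normal prior -> mean 0, sd 1
}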
################ The Gibbs process starts here ###############

# 1. sample the hyper-parameters
sample_pi_v <- function(sol, pr_params, reps = 10){
  return (rbeta(reps, pr_params[['pi_v']][1] + sum(sol$H), pr_params[['pi_v']][2] + sum(1 - sol$H)))
}

sample_pi_1_1 <- function(sol, pr_params, reps = 10){
  return (rbeta(reps, pr_params[['pi_1_1']][1] + sum(sol$C[,1]), pr_params[['pi_1_1']][2] + sum(1 - sol$C[,1])))
}

sample_pi_1_to_1 <- function(sol, pr_params, reps = 10){
  return (rbeta(reps, pr_params[['pi_1_to_1']][1] + sum(sol$C_counts[1,]), pr_params[['pi_1_to_1']][2] + sum(sol$C_counts[3,])))
}

sample_pi_0_to_1 <- function(sol, pr_params, reps = 10){
  return (rbeta(reps, pr_params[['pi_0_to_1']][1] + sum(sol$C_counts[2,]), pr_params[['pi_0_to_1']][2] + sum(sol$C_counts[4,])))
}

sample_pi_lc <- function(sol, pr_params, reps = 1, return_counts = F){
  count1 = 0; count2 = 0
  # go over subjects and count the 1s and 0s inside the bicluster cells
  for (s in 1:nrow(sol$C)){
    currs = rownames(sol$C)[s]
    curr_rows = which(sol$HS[,s] == 1)
    currtr = which(sol$C[s,] == 1)
    curr_cols = (s - 1) * ncol(sol$C) + currtr
    count1 = count1 + sum(sol$I[curr_rows, curr_cols])
    count2 = count2 + sum(1 - sol$I[curr_rows, curr_cols])
  }
  if (return_counts){return (c(count1, count2))}
  print (paste(count1, count2, rbeta(reps, pr_params[['pi_lc']][1] + count1, pr_params[['pi_lc']][2] + count2), sep = ' '))
  return (rbeta(reps, pr_params[['pi_lc']][1] + count1, pr_params[['pi_lc']][2] + count2))
}

sample_pi_lc_c <- function(sol, pr_params, reps = 1){
  pi_lc_counts = sample_pi_lc(sol, pr_params, reps = 1, return_counts = T)
  count1 = sum(c(sol$I)) - pi_lc_counts[1]
  count2 = sum(1 - c(sol$I)) - pi_lc_counts[2]
  return (rbeta(reps, pr_params[['pi_lc_c']][1] + count1, pr_params[['pi_lc_c']][2] + count2))
}

sample_norm_bic <- function(sol, pr_params, reps = 1, return_vecs = F){
  # collect the bicluster cells, going over subjects
  x = c()
  for (s in 1:nrow(sol$C)){
    currs = rownames(sol$C)[s]
    curr_rows = which(sol$HS[,s] == 1)
    currtr = which(sol$C[s,] == 1)
    curr_cols = (s - 1) * ncol(sol$C) + currtr
    x = c(x, sol$I[curr_rows, curr_cols])
  }
  if (length(x) == 0){return(get_norm_dist_hp_mode(pr_params[['norm_bic']]))}
  # normal-gamma posterior update
  p0 = pr_params[['norm_bic']][2]
  n = length(x)
  m0 = pr_params[['norm_bic']][1]
  v0 = pr_params[['norm_bic']][3]
  ss0 = pr_params[['norm_bic']][4]
  meanx = mean(x)
  p_n = p0 + n
  m_n = (n * meanx + p0 * m0) / p_n
  v_n = v0 + n
  ss_n = ss0 + sum((x - meanx)^2) + ((n * p0) * (meanx - m0)^2) / p_n
  print (paste(m_n, p_n, v_n, ss_n))
  new_prec = rgamma(1, v_n / 2, ss_n / 2)
  # the sampled precision is immediately replaced by the gamma mode
  new_prec = (v_n - 2) / (ss_n)
  new_sd = sqrt(1 / (new_prec * p_n))
  new_mean = rnorm(1, m_n, new_sd)
  return (c(new_mean, sqrt(1 / (new_prec))))
}

sample_norm_bg <- function(sol, pr_params, reps = 1, return_vecs = F){
  # collect the background cells, going over subjects
  x = c()
  for (s in 1:nrow(sol$C)){
    currs = rownames(sol$C)[s]
    curr_rows = which(sol$HS[,s] != 1)
    currtr = which(sol$C[s,] != 1)
    curr_cols = (s - 1) * ncol(sol$C) + currtr
    x = c(x, sol$I[curr_rows,])
    x = c(x, sol$I[-curr_rows, curr_cols])
  }
  p0 = pr_params[['norm_bg']][2]
  n = length(x)
  m0 = pr_params[['norm_bg']][1]
  v0 = pr_params[['norm_bg']][3]
  ss0 = pr_params[['norm_bg']][4]
  meanx = mean(x)
  p_n = p0 + n
  m_n = (n * meanx + p0 * m0) / p_n
  v_n = v0 + n
  ss_n = ss0 + sum((x - meanx)^2) + ((n * p0) * (meanx - m0)^2) / p_n
  new_prec = rgamma(1, v_n / 2, ss_n / 2)
  # the sampled precision is immediately replaced by the gamma mode
  new_prec = (v_n - 2) / (ss_n)
  new_sd = sqrt(1 / (new_prec * p_n))
  new_mean = rnorm(1, m_n, new_sd)
  return (c(new_mean, sqrt(1 / (new_prec))))
}

sample_pi_s <- function(s, sol, pr_params, reps = 1){
  count1 = sum(sol$H * sol$HS[,s])
  count2 = sum(sol$H * (1 - sol$HS[,s]))
  return (rbeta(reps, pr_params[['pi_s']][1] + count1, pr_params[['pi_s']][2] + count2))
}
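# Sketch of the conjugate Beta update behind the pi_* samplers (the counts below are made
# up for illustration): with a Beta(1,1) prior and a membership vector H containing k ones
# out of n entries, sample_pi_v draws from Beta(1 + k, 1 + n - k).
if (FALSE) {
  toy = list(H = c(rep(1, 7), rep(0, 13)))      # 7 of 20 rows in the bicluster
  toy_priors = list(pi_v = c(1, 1))
  set.seed(1)
  sample_pi_v(toy, toy_priors, reps = 5)        # five draws from Beta(8, 14)
}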
sample_pi_s_c <- function(sol, pr_params, reps = 1){
  count1 = 0; count2 = 0
  for (i in which(sol$H == 0)){
    count1 = count1 + sum(sol$HS[i,])
    count2 = count2 + sum(1 - sol$HS[i,])
  }
  return (rbeta(reps, pr_params[['pi_s_c']][1] + count1, pr_params[['pi_s_c']][2] + count2))
}

sample_Hv_posterior <- function(sol, v, reps = 1){
  p_Hv1_HSv = sol$pi_v * prod(sol$pi_s^sol$HS[v,]) * prod((1 - sol$pi_s)^(1 - sol$HS[v,]))
  p_Hv0_HSv = (1 - sol$pi_v) * (sol$pi_s_c^sum(sol$HS[v,])) * ((1 - sol$pi_s_c)^sum(1 - sol$HS[v,]))
  p = p_Hv1_HSv / (p_Hv1_HSv + p_Hv0_HSv)
  return (rbinom(reps, 1, prob = p))
}

sample_HS_posterior <- function(sol, v, s, reps = 1){
  curr_p_s = sol$pi_s[s]
  if (sol$H[v] == 0){curr_p_s = sol$pi_s_c}
  Cts_inds = which(sol$C[s,] == 1)
  currI = sol$I[v, grepl(pattern = paste(rownames(sol$C)[s], ';', sep = ''), colnames(sol$I))][Cts_inds]
  p_HSvs1_Ivs = curr_p_s
  p_HSvs0_Ivs = (1 - curr_p_s)
  log_p_HSvs1_Ivs = log(curr_p_s)
  log_p_HSvs0_Ivs = log(1 - curr_p_s)
  if (length(currI) > 0){
    an = get_two_group_analysis(currI, sol, method = sol$method, use.log = F)
    p_HSvs1_Ivs = curr_p_s * an[1]
    p_HSvs0_Ivs = (1 - curr_p_s) * an[2]
    an = get_two_group_analysis(currI, sol, method = sol$method, use.log = T)
    log_p_HSvs1_Ivs = log(curr_p_s) + an[1]
    log_p_HSvs0_Ivs = log(1 - curr_p_s) + an[2]
  }
  p = p_HSvs1_Ivs / (p_HSvs1_Ivs + p_HSvs0_Ivs)
  # fall back to the log-scale computation when the direct ratio underflows
  ratio1 = exp(log_p_HSvs0_Ivs - log_p_HSvs1_Ivs)
  p1 = exp(-1 * log(1 + ratio1))
  if (is.na(p) || is.na(rbinom(reps, 1, prob = p))){p = p1}
  if (is.na(p)){print (paste(c("####", p, p_HSvs1_Ivs, p_HSvs0_Ivs)))}
  return (rbinom(reps, 1, prob = p))
}

sample_C_1_posterior <- function(sol, s, reps = 1){
  # first time point of this subject, only rows with HS==1
  currI = sol$I[sol$HS[,s] == 1, which(grepl(pattern = paste(rownames(sol$C)[s], ';', sep = ''), colnames(sol$I)))[1]]
  an = get_two_group_analysis(currI, sol, method = sol$method, use.log = F)
  p_Cts1_window_Its = an[1]; p_Cts0_window_Its = an[2]
  an = get_two_group_analysis(currI, sol, method = sol$method, use.log = T)
  log_p_Cts1_window_Its = an[1]; log_p_Cts0_window_Its = an[2]
  curr_w = sol$C[s, 2:(1 + sol$w)]
  p_C1s1_window_I1s = p_Cts1_window_Its * sol$pi_1_1 * (sol$pi_1_to_1^sum(curr_w)) * ((1 - sol$pi_1_to_1)^sum(1 - curr_w))
  p_C1s0_window_I1s = p_Cts0_window_Its * (1 - sol$pi_1_1)
  log_p_C1s1_window_I1s = log_p_Cts1_window_Its + log(sol$pi_1_1) + sum(curr_w) * log(sol$pi_1_to_1) + sum(1 - curr_w) * log((1 - sol$pi_1_to_1))
  log_p_C1s0_window_I1s = log_p_Cts0_window_Its + log(1 - sol$pi_1_1)
  curr_sum = sum(curr_w)
  # if the window contains only zeroes
  if (curr_sum == 0){
    p_C1s0_window_I1s = p_C1s0_window_I1s * ((1 - sol$pi_0_to_1)^sol$w)
    log_p_C1s0_window_I1s = log_p_C1s0_window_I1s + sol$w * log(1 - sol$pi_0_to_1)
  }
  # if the window contains only ones
  if (curr_sum == sol$w){
    p_C1s0_window_I1s = p_C1s0_window_I1s * sol$pi_0_to_1 * (sol$pi_1_to_1^(sol$w - 1))
    log_p_C1s0_window_I1s = log_p_C1s0_window_I1s + log(sol$pi_0_to_1) + (sol$w - 1) * log(sol$pi_1_to_1)
  }
  # here we know we have at least one 1, and not all are 1's
  if (curr_sum > 0 && curr_sum < sol$w){
    l = min(which(curr_w == 1))
    curr_w = curr_w[-c(1:l)]
    p_C1s0_window_I1s = p_C1s0_window_I1s * ((1 - sol$pi_0_to_1)^(l - 1)) * sol$pi_0_to_1 *
      (sol$pi_1_to_1^sum(curr_w)) * ((1 - sol$pi_1_to_1)^sum(1 - curr_w))
    log_p_C1s0_window_I1s = log_p_C1s0_window_I1s + (l - 1) * log((1 - sol$pi_0_to_1)) + log(sol$pi_0_to_1) +
      sum(curr_w) * log(sol$pi_1_to_1) + sum(1 - curr_w) * log(1 - sol$pi_1_to_1)
  }
  p = p_C1s1_window_I1s / (p_C1s0_window_I1s + p_C1s1_window_I1s)
  # fallback based on the data-likelihood log ratio when the direct computation fails
  ratio1 = exp(log_p_Cts0_window_Its - log_p_Cts1_window_Its)
  p1 = exp(-1 * log(1 + ratio1))
  if (is.na(p) || is.na(rbinom(reps, 1, prob = p))){p = p1}
  if (is.na(p)){p = as.numeric(p_Cts1_window_Its > p_Cts0_window_Its)}
  return (rbinom(reps, 1, prob = p))
}
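# Sketch of the log-ratio fallback used by the samplers above (numbers are illustrative):
# when the two unnormalized probabilities underflow to 0, p = A/(A+B) becomes NaN, but
# p = exp(-log(1 + exp(logB - logA))) is still well defined.
if (FALSE) {
  logA = -800; logB = -805                 # e.g., log-likelihoods of two competing states
  A = exp(logA); B = exp(logB)             # both underflow to 0
  A / (A + B)                              # NaN
  exp(-log(1 + exp(logB - logA)))          # ~0.993, the intended posterior probability
}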
sample_Ct_posterior <- function(sol, t, s, reps = 1){
  currI = sol$I[sol$HS[,s] == 1, which(grepl(pattern = paste(rownames(sol$C)[s], ';', sep = ''), colnames(sol$I)))[t]]
  #print (dim(currI));print (length(currI))
  upper_tr = min(t + sol$w, ncol(sol$C))
  curr_w_size = upper_tr - t
  an = get_two_group_analysis(currI, sol, method = sol$method, use.log = F)
  p_Cts1_window_Its = an[1]; p_Cts0_window_Its = an[2]
  an = get_two_group_analysis(currI, sol, method = sol$method, use.log = T)
  log_p_Cts1_window_Its = an[1]; log_p_Cts0_window_Its = an[2]
  previous_w = sol$C[s, max(1, t - sol$w):(t - 1)]
  if(any(is.na(sol$C))){print('C has NAs :('); print (table(is.na(sol$C)))}
  #print (previous_w)
  previous_has_1 = sum(previous_w) > 0
  initial_p1 = sol$pi_1_to_1
  if (!previous_has_1){initial_p1 = sol$pi_0_to_1}
  p_Cts1_window_Its = p_Cts1_window_Its * initial_p1
  p_Cts0_window_Its = p_Cts0_window_Its * (1 - initial_p1)
  log_p_Cts1_window_Its = log_p_Cts1_window_Its + log(initial_p1)
  log_p_Cts0_window_Its = log_p_Cts0_window_Its + log(1 - initial_p1)
  if(curr_w_size > 0){
    curr_w = sol$C[s, (t + 1):upper_tr]
    curr_w_has_1 = rep(1, length(curr_w))
    p_Cts1_window_Its = p_Cts1_window_Its * (sol$pi_1_to_1^sum(curr_w)) * ((1 - sol$pi_1_to_1)^sum(1 - curr_w))
    for(nextt in (t + 1):(t + curr_w_size)){
      if (sum(sol$C[s, max(1, nextt - sol$w):(nextt - 1)]) > 0 && curr_w[nextt - t] == 1){
        curr_w_has_1[nextt - t] = sol$pi_1_to_1
      }
      if (sum(sol$C[s, max(1, nextt - sol$w):(nextt - 1)]) > 0 && curr_w[nextt - t] == 0){
        curr_w_has_1[nextt - t] = 1 - sol$pi_1_to_1
      }
      if (sum(sol$C[s, max(1, nextt - sol$w):(nextt - 1)]) == 0 && curr_w[nextt - t] == 1){
        curr_w_has_1[nextt - t] = sol$pi_0_to_1
      }
      if (sum(sol$C[s, max(1, nextt - sol$w):(nextt - 1)]) == 0 && curr_w[nextt - t] == 0){
        curr_w_has_1[nextt - t] = 1 - sol$pi_0_to_1
      }
    }
    p_Cts0_window_Its = p_Cts0_window_Its * prod(curr_w_has_1)
    log_p_Cts0_window_Its = log_p_Cts0_window_Its + sum(log(curr_w_has_1))
  }
  p = p_Cts1_window_Its / (p_Cts0_window_Its + p_Cts1_window_Its)
  ratio1 = exp(log_p_Cts0_window_Its - log_p_Cts1_window_Its)
  p1 = exp(-1 * log(1 + ratio1))
  if (is.na(p) || is.na(rbinom(reps, 1, prob = p))){p = p1}
  if (is.na(p)){p = as.numeric(p_Cts1_window_Its > p_Cts0_window_Its)}
  if (is.na(p)){print (paste('#####', p, ratio1, p1, log_p_Cts0_window_Its, log_p_Cts1_window_Its))}
  if (is.nan(p)){print (paste('#####', p, ratio1, p1, log_p_Cts0_window_Its, log_p_Cts1_window_Its))}
  return (rbinom(reps, 1, prob = p))
}

# This is the basic function that calculates the probability/frequency
# of the two group model
get_two_group_analysis <- function(currI, sol, method = 'binary', use.log = F){
  # initial tests
  if (length(currI) == 0 && !use.log){return(c(1, 1))}
  if (length(currI) == 0 && use.log){return(c(0, 0))}
  if (method == 'norm' && !use.log){
    p_Cts1_window_Its = prod(dnorm(currI, mean = sol$norm_bic[1], sd = sol$norm_bic[2]))
    p_Cts0_window_Its = prod(dnorm(currI, mean = sol$norm_bg[1], sd = sol$norm_bg[2]))
    return(c(p_Cts1_window_Its, p_Cts0_window_Its))
  }
  if (method == 'norm' && use.log){
    p_Cts1_window_Its = sum(dnorm(currI, mean = sol$norm_bic[1], sd = sol$norm_bic[2], log = T))
    p_Cts0_window_Its = sum(dnorm(currI, mean = sol$norm_bg[1], sd = sol$norm_bg[2], log = T))
    return(c(p_Cts1_window_Its, p_Cts0_window_Its))
  }
  # return the default calculations - the binary model
  if (!use.log){
    p_Cts1_window_Its = (sol$pi_lc^sum(currI)) * ((1 - sol$pi_lc)^sum(1 - currI))
    p_Cts0_window_Its = (sol$pi_lc_c^sum(currI)) * ((1 - sol$pi_lc_c)^sum(1 - currI))
    return(c(p_Cts1_window_Its, p_Cts0_window_Its))
  } else {
    log_p_Cts1_window_Its = sum(currI) * log(sol$pi_lc) + sum(1 - currI) * log(1 - sol$pi_lc)
    log_p_Cts0_window_Its = sum(currI) * log(sol$pi_lc_c) + sum(1 - currI) * log(1 - sol$pi_lc_c)
    return (c(log_p_Cts1_window_Its, log_p_Cts0_window_Its))
  }
}
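# Sketch of the two-group likelihood under the binary model (the sol values below are made
# up): the first returned entry is the likelihood of the data under the bicluster rate
# pi_lc, the second under the background rate pi_lc_c.
if (FALSE) {
  toy_sol = list(pi_lc = 0.8, pi_lc_c = 0.2)
  v = c(1, 1, 1, 0)
  get_two_group_analysis(v, toy_sol, method = 'binary', use.log = F)  # c(0.8^3*0.2, 0.2^3*0.8)
  get_two_group_analysis(v, toy_sol, method = 'binary', use.log = T)  # the same, in log scale
}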
perform_gibbs_runs_fast <- function(sol, reps1 = 1, reps2 = 100, pr_params, method = 'binary', use.log = F){
  iters = list()
  iters[[1]] = sol
  if (reps2 < 2){return(iters)}
  for (i in 2:(reps2 + 1)){
    print (paste("######## iteration ", i, " ########"))
    currsol = iters[[i - 1]]
    # define the new object
    newsol = list()
    newsol$H = currsol$H; newsol$HS = currsol$HS; newsol$C = currsol$C; newsol$I = currsol$I
    newsol$w = currsol$w; newsol$method = method; newsol$use.log = use.log
    # sample hyper-parameters
    newsol$pi_v = sample_pi_v(currsol, pr_params, reps = reps1)
    if (is.na(newsol$pi_v)){print ('newsol$pi_v is na'); break}
    newsol$pi_1_1 = sample_pi_1_1(currsol, pr_params, reps = reps1)
    if (is.na(newsol$pi_1_1)){print ('newsol$pi_1_1 is na'); break}
    newsol$pi_1_to_1 = sample_pi_1_to_1(currsol, pr_params, reps = reps1)
    if (is.na(newsol$pi_1_to_1)){print ('newsol$pi_1_to_1 is na'); break}
    newsol$pi_0_to_1 = sample_pi_0_to_1(currsol, pr_params, reps = reps1)
    if (is.na(newsol$pi_0_to_1)){print ('newsol$pi_0_to_1 is na'); break}
    if (method == 'binary'){
      newsol$pi_lc = sample_pi_lc(currsol, pr_params, reps = reps1)
      newsol$pi_lc_c = sample_pi_lc_c(currsol, pr_params, reps = reps1)
      #print(newsol$pi_lc);print(newsol$pi_lc_c)
    }
    if (method == 'norm'){
      newsol$norm_bic = sample_norm_bic(currsol, pr_params, reps = reps1)
      newsol$norm_bg = sample_norm_bg(currsol, pr_params, reps = reps1)
      print (newsol$norm_bic); print(newsol$norm_bg)
      if (any(is.na(newsol$norm_bic))){print('within normal dist has na'); break}
      if (any(is.na(newsol$norm_bg))){print('within normal dist has na'); break}
    }
    newsol$pi_s = currsol$pi_s
    newsol$pi_s = sapply(1:length(currsol$pi_s), sample_pi_s, sol = currsol, pr_params = pr_params, reps = reps1)
    if (any(is.na(newsol$pi_s))){print('newsol$pi_s has nas'); break}
    newsol$pi_s_c = sample_pi_s_c(currsol, pr_params, reps = reps1)
    newsol$H = sapply(1:length(newsol$H), function(x, sol, reps){sample_Hv_posterior(sol, x, reps = reps)}, sol = newsol, reps = reps1)
    for (l in 1:ncol(newsol$HS)){
      newsol$HS[,l] = sapply(1:nrow(newsol$HS), function(x, sol, l, reps){sample_HS_posterior(sol, x, l, reps = reps)}, sol = newsol, l = l, reps = reps1)
    }
    if (any(is.na(newsol$HS))){print('newsol$HS has nas'); break}
    newsol$C[,1] = sapply(1:nrow(newsol$C), function(x, sol, reps){sample_C_1_posterior(sol, x, reps = reps)}, sol = newsol, reps = reps1)
    for (k in 1:nrow(newsol$C)){
      newsol$C[k, 2:ncol(newsol$C)] = sapply(2:ncol(newsol$C), function(x, sol, k, reps){sample_Ct_posterior(sol, x, k, reps = reps)}, sol = newsol, k = k, reps = reps1)
    }
    if (any(is.na(newsol$C))){print('newsol$C has nas'); break}
    newsol$C_counts = apply(newsol$C, 1, analyze_C_row, window_size = currsol$w)
    print (table(c(newsol$C), useNA = 'always'))
    if (sum(newsol$C) == 0 || sum(newsol$HS) == 0){print (paste ('Gibbs iteration', i, ' got an empty bic'))}
    iters[[i]] = newsol
  }
  return (iters)
}
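# Sketch of how the chain returned by perform_gibbs_runs_fast is typically inspected
# (assumes 'iters' was produced by a previous call; the trace below is illustrative):
if (FALSE) {
  # iters = perform_gibbs_runs_fast(initial_sol, reps2 = 50, pr_params = pr_params, method = 'binary')
  pi_lc_trace = sapply(iters, function(x){x$pi_lc})   # bicluster "on" rate per iteration
  plot(pi_lc_trace, type = 'l')
  rowMeans(sapply(iters, function(x){x$H}))           # per-row posterior inclusion frequency
}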
##################### Implementation of the wrapper ##############

wrapper2_algo <- function(I, initial_algo = 'bimax', debi_thr1 = 0.5, debi_thr2 = 0.8, ...){
  mode(I) = 'numeric'
  if (initial_algo == 'bimax'){
    initial_sol = run_bimax(I, bimax_min_row = 10, bimax_min_col = 20, number = 2000)
  }
  else{
    initial_sol = run_isa(I, min_row = 10, min_col = 10)
  }
  initial_sol_red = initial_sol
  if (length(initial_sol_red$rows) > 1){
    initial_sol_red = reduce_overlaps_debi(initial_sol$rows, initial_sol$cols, thr = debi_thr1)
  }
  initial_improved = multisample_biclustering_wrapper(I, initial_sol_red$rows, initial_sol_red$cols, ...)
  initial_improved_red = reduce_overlaps_debi(initial_improved[[1]], initial_improved[[2]], thr = debi_thr2)
  initial_improved_red[[3]] = initial_improved[[3]][names(initial_improved_red[[1]])]
  initial_improved_red[[4]] = initial_improved[[4]][names(initial_improved_red[[1]])]
  return (initial_improved_red)
}

multisample_biclustering_wrapper <- function(Z, bic_rows_list, bic_cols_list, model = 'binary', fchange_thr = 0.5,
                                             w = 1, gibbs_reps = 50, min_row = 5, min_col = 10, save_iters = T, ...){
  # reduce overlaps
  reduced_overlaps = reduce_overlaps_debi(bic_rows_list, bic_cols_list, thr = 0.5)
  bic_rows_list = reduced_overlaps$rows
  bic_cols_list = reduced_overlaps$cols
  print (paste('number of bics before gibbs', length(bic_rows_list)))
  #if (gibbs_reps <1){
  #  return (list(bic_rows = bic_rows_list , bic_cols = bic_cols_list))
  #}
  mode(Z) = 'numeric'
  bic_rows = list(); bic_cols = list(); bic_mats = list(); bics_params = list()
  for (n in names(bic_cols_list)){
    ptm <- proc.time()
    algo_out = improve_bicluster(Z, bic_rows_list[[n]], bic_cols_list[[n]], model = model, gibbs_reps = gibbs_reps, window_size = w)
    if (model == 'binary'){
      curr_pi_lc = mean(sapply(algo_out$iters, function(x){x$pi_lc}))
      curr_pi_lc_c = mean(sapply(algo_out$iters, function(x){x$pi_lc_c}))
      print (paste('curr bic pi_lc: ', curr_pi_lc, curr_pi_lc_c))
      if (gibbs_reps > 0 && (curr_pi_lc / curr_pi_lc_c < fchange_thr || curr_pi_lc < 0.4)){break}
    }
    if (model == 'norm' && gibbs_reps > 0){
      curr_norm_dist = rowMeans(sapply(algo_out$iters, function(x){x$norm_bic}))
      if (curr_norm_dist[1] < fchange_thr){next}
    }
    gibbs_solution = algo_out$gibbs_solution
    if (length(gibbs_solution$bic_rows) < min_row){next}
    if (length(gibbs_solution$bic_cols) < min_col){next}
    print(paste("Found a bicluster, it took (sec): ", proc.time() - ptm)[3])
    print (paste("bicluster size:", length(gibbs_solution$bic_rows), length(gibbs_solution$bic_cols), sep = " "))
    if (length(gibbs_solution$bic_rows) < min_row){break}
    if (length(gibbs_solution$bic_cols) < min_col){break}
    bic_rows[[n]] = gibbs_solution$bic_rows
    bic_cols[[n]] = gibbs_solution$bic_cols
    bic_mats[[n]] = gibbs_solution$bic_I
    if (save_iters){bics_params[[n]] = algo_out$iters}
  }
  return (list(bic_rows = bic_rows, bic_cols = bic_cols, bic_mats = bic_mats, bics_params = bics_params))
}

# Greedily keep the largest biclusters and drop the others that overlap them by more than thr
reduce_overlaps_debi <- function(bic_rows_list, bic_cols_list, thr = 0.5, max_num_bics = 10){
  #print (paste('reducing overlaps, num bics is: ',length(bic_rows_list)))
  #print (class(bic_rows_list))
  if (length(bic_rows_list) == 0){return (list(rows = bic_rows_list, cols = bic_cols_list))}
  in_pool = rep(T, length(bic_rows_list))
  bic_size = sapply(bic_rows_list, length) * sapply(bic_cols_list, length)
  kept_bic_rows = list()
  kept_bic_cols = list()
  while (T){
    if (length(kept_bic_rows) == max_num_bics){break}
    if (sum(in_pool) == 0){break}
    # next bic is the largest
    curr_size = max(bic_size[in_pool])
    next_ind = which(in_pool & bic_size == curr_size)[1]
    kept_bic_rows[[as.character(next_ind)]] = bic_rows_list[[next_ind]]
    kept_bic_cols[[as.character(next_ind)]] = bic_cols_list[[next_ind]]
    in_pool[next_ind] = F
    # go over the remaining bics and remove overlapping ones from the pool
    for (i in 1:length(in_pool)){
      if (!in_pool[i]){next}
      curr_rows_overlap = intersect(bic_rows_list[[next_ind]], bic_rows_list[[i]])
      curr_cols_overlap = intersect(bic_cols_list[[next_ind]], bic_cols_list[[i]])
      overlap_size = length(curr_rows_overlap) * length(curr_cols_overlap)
      if (overlap_size / bic_size[i] > thr){in_pool[i] = F}
    }
  }
  # if the bics had names then keep them
  if (!is.null(names(bic_rows_list))){
    kept_bic_rows = bic_rows_list[as.numeric(names(kept_bic_rows))]
    kept_bic_cols = bic_cols_list[as.numeric(names(kept_bic_cols))]
  }
  return (list(rows = kept_bic_rows, cols = kept_bic_cols))
}
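# Sketch of the greedy overlap reduction (toy biclusters below are illustrative): the
# second bicluster is fully contained in the first, so only the larger one is kept.
if (FALSE) {
  toy_rows = list(b1 = paste0("v", 1:10), b2 = paste0("v", 1:5), b3 = paste0("v", 20:25))
  toy_cols = list(b1 = paste0("c", 1:10), b2 = paste0("c", 1:5), b3 = paste0("c", 20:25))
  reduce_overlaps_debi(toy_rows, toy_cols, thr = 0.5)$rows   # keeps b1 and b3, drops b2
}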
improve_bicluster <- function(Z, rows, cols, model = 'binary', window_size = 1, start_tr = 0, gibbs_reps = 50){
  sol = get_initial_solution_I_as_input(Z, rows, cols, window_size = window_size)
  pr_params = get_default_hyper_parameter_priors(ncol(sol$C), nrow(sol$HS), nrow(sol$C))
  initial_sol = get_hyper_parameter_modes_from_prior(sol, pr_params)
  initial_sol$method = model
  iters = perform_gibbs_runs_fast(initial_sol, reps2 = gibbs_reps, pr_params = pr_params, method = model)
  gibbs_solution = extract_gibbs_solution(Z, iters, ignore_first = 0, eps2 = 0.5, eps1 = 0.5)
  if (model == 'binary'){print (table(Z[gibbs_solution$bic_rows, gibbs_solution$bic_cols]) / length(Z[gibbs_solution$bic_rows, gibbs_solution$bic_cols]))}
  print (dim(Z[gibbs_solution$bic_rows, gibbs_solution$bic_cols]))
  if (gibbs_reps <= 1){gibbs_solution = extract_gibbs_solution(Z, iters[1])}
  return (list(iters = iters, gibbs_solution = gibbs_solution))
}

### helper methods to run extant algorithms
run_bimax <- function(I, number = 1000, bimax_min_row = 10, bimax_min_col = 20){
  # binarize the matrix (top decile) if it is not already binary
  if (!all(I == 1 | I == 0)){I = I >= quantile(c(I), 0.9)}
  bimax_sol = biclust(I, method = BCBimax(), minr = bimax_min_row, minc = bimax_min_col, number = number)
  print ('after running bimax')
  bics = bicluster(I, bimax_sol)
  bics = bics[sapply(bics, length) > 0]
  if(length(bics) == 0){
    print ('bimax could not find an initial solution, trying smaller bics')
    bimax_min_col = bimax_min_col - 2
    while (length(bics) == 0 && bimax_min_col >= 6){
      print (paste(bimax_min_row, bimax_min_col, number))
      bimax_sol = biclust(I, method = BCBimax(), minr = bimax_min_row, minc = bimax_min_col, number = number)
      bics = bicluster(I, bimax_sol)
      bics = bics[sapply(bics, length) > 0]
      bimax_min_col = bimax_min_col - 2
    }
  }
  if(length(bics) == 0){
    bimax_min_col = 8
    while (length(bics) == 0 && bimax_min_col >= 5){
      bimax_min_row = bimax_min_col
      print (paste(bimax_min_row, bimax_min_col, number))
      bimax_sol = biclust(I, method = BCBimax(), minr = bimax_min_row, minc = bimax_min_col, number = number)
      bics = bicluster(I, bimax_sol)
      bics = bics[sapply(bics, length) > 0]
      bimax_min_col = bimax_min_col - 1
    }
  }
  rows = list(); cols = list()
  if (length(bics) == 0){return (list(rows = rows, cols = cols))}
  for (i in 1:length(bics)){
    rows[[i]] = rownames(bics[[i]]); cols[[i]] = colnames(bics[[i]])
  }
  return (list(rows = rows, cols = cols))
}

run_isa <- function(I, min_row, min_col, direction = c('up', 'up')){
  isa.result = isa(I, direction = direction, thr.row = c(0.5, 1, 1.5, 2), thr.col = c(0.5, 1, 1.5, 2), no.seeds = 100)
  rows_mat = isa.result$rows
  cols_mat = isa.result$columns
  to_keep = apply(rows_mat > 0, 2, sum) >= min_row & apply(cols_mat > 0, 2, sum) >= min_col
  rows_mat = rows_mat[, to_keep, drop = F]
  cols_mat = cols_mat[, to_keep, drop = F]
  rows = list(); cols = list()
  if (ncol(rows_mat) == 0){return (list(rows = rows, cols = cols))}
  for (j in 1:ncol(rows_mat)){
    rows[[j]] = rownames(I)[rows_mat[, j] > 0]
    cols[[j]] = colnames(I)[cols_mat[, j] > 0]
  }
  return (list(rows = rows, cols = cols))
}
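# These helpers rely on external packages that this file assumes are already attached
# (most likely 'biclust' for biclust()/BCBimax()/bicluster() and 'isa2' for isa(); treat
# the library names as an assumption). A minimal sketch:
if (FALSE) {
  library(biclust)   # assumed source of biclust(), BCBimax(), bicluster()
  library(isa2)      # assumed source of isa()
  seeds = run_bimax(toy_I, number = 100, bimax_min_row = 5, bimax_min_col = 5)
  length(seeds$rows)   # number of seed biclusters found
}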
##################### Implementation of the masker (this is the complete TWIGS algorithm) ##############

# "musbic" = multi-sample biclustering
# initial_method = how to get the initial solution
# ... = parameters for the gibbs sampler
one_musbic_step <- function(I, min_row = 5, min_col = 5, gibbs_reps = 50, initial_method = 'bimax',
                            method = 'binary', initial_method_thr = 1, bimax_min_col = 20, ...){
  if (initial_method == 'bimax'){
    first_b = get_bicluster_bimax(I, method = method, thr = initial_method_thr, bimax_min_col = bimax_min_col)
  }
  else if (initial_method == 'plaid'){
    first_b = get_bicluster_plaid(I, method = method, thr = initial_method_thr)
  }
  else{
    first_b = get_bicluster_ISA(I, min_row, min_col)
  }
  print (paste('initial solution size is : ', nrow(first_b), ncol(first_b)))
  #print (mode(I))
  #print (I[rownames(first_b),colnames(first_b)])
  if (length(first_b) == 0){return (NULL)}
  sol = get_initial_solution_I_as_input(I, rownames(first_b), colnames(first_b), ...)
  pr_params = get_default_hyper_parameter_priors(ncol(sol$C), nrow(sol$HS), nrow(sol$C))
  initial_sol = get_hyper_parameter_modes_from_prior(sol, pr_params)
  initial_sol$method = method
  iters = perform_gibbs_runs_fast(initial_sol, reps2 = gibbs_reps, pr_params = pr_params, method = method)
  gibbs_solution = extract_gibbs_solution(I, iters, ignore_first = 0, eps2 = 0.5, eps1 = 0.5)
  if (method == 'binary'){print (table(I[gibbs_solution$bic_rows, gibbs_solution$bic_cols]) / length(I[gibbs_solution$bic_rows, gibbs_solution$bic_cols]))}
  print (dim(I[gibbs_solution$bic_rows, gibbs_solution$bic_cols]))
  if (gibbs_reps <= 1){gibbs_solution = extract_gibbs_solution(I, iters[1])}
  return (list(iters = iters, gibbs_solution = gibbs_solution))
}

# Assumption: gibbs_reps > 1
one_musbic_step_recursive_normal <- function(I, min_row = 5, min_col = 5, gibbs_reps = 50, initial_method = 'bimax', initial_method_thr = 1, ...){
  # this flow uses the normal model ('norm')
  initial_sol = one_musbic_step(I, min_row, min_col, gibbs_reps, initial_method,
                                method = 'norm', initial_method_thr = initial_method_thr, ...)
  curr_norm_dist = rowMeans(sapply(initial_sol$iters, function(x){x$norm_bic}))
  curr_bg_dist = rowMeans(sapply(initial_sol$iters, function(x){x$norm_bg}))
  # rerun the flow
  nextI = I[initial_sol$gibbs_solution$bic_rows,]
  hist(nextI)
  next_sol = one_musbic_step(nextI, min_row, min_col, gibbs_reps, initial_method,
                             method = 'norm', initial_method_thr = initial_method_thr, ...)
  # inspect the solution
  next_norm_dist = rowMeans(sapply(next_sol$iters, function(x){x$norm_bic}))
  next_bg_dist = rowMeans(sapply(next_sol$iters, function(x){x$norm_bg}))
  if (abs(next_bg_dist[1] - next_norm_dist[1]) < 0.3){return(initial_sol)}
  return (next_sol)
}
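# Sketch of a single masker step on a binary matrix (toy_I as in the earlier sketch; all
# values are illustrative): the returned list holds the Gibbs chain ('iters') and the
# thresholded consensus solution ('gibbs_solution').
if (FALSE) {
  step_out = one_musbic_step(toy_I, min_row = 5, min_col = 5, gibbs_reps = 20,
                             initial_method = 'bimax', method = 'binary')
  step_out$gibbs_solution$bic_rows
  step_out$gibbs_solution$bic_cols
}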
run_complete_twigs_algorithm <- function(I, fchange_thr = 0.5, min_row = 10, min_col = 10, max_num_bics = 20, gibbs_reps = 50, ...){
  mode(I) = 'numeric'
  bic_rows = list(); bic_cols = list(); bic_mats = list(); bics_params = list()
  count = 1
  while (T){
    ptm <- proc.time()
    algo_out = one_musbic_step(I, min_row = min_row, min_col = min_col, gibbs_reps = gibbs_reps, ...)
    if (is.null(algo_out)){break}
    curr_pi_lc = mean(sapply(algo_out$iters, function(x){x$pi_lc}))
    curr_pi_lc_c = mean(sapply(algo_out$iters, function(x){x$pi_lc_c}))
    print (paste('curr bic pi_lc: ', curr_pi_lc, curr_pi_lc_c))
    if (gibbs_reps > 0 && curr_pi_lc < max(curr_pi_lc_c * fchange_thr, 0.5)){break}
    gibbs_solution = algo_out$gibbs_solution
    print(paste("Found a bicluster, it took (sec): ", proc.time() - ptm)[3])
    print (paste("bicluster size:", length(gibbs_solution$bic_rows), length(gibbs_solution$bic_cols), sep = " "))
    if (length(gibbs_solution$bic_rows) < min_row){break}
    if (length(gibbs_solution$bic_cols) < min_col){break}
    bic_rows[[count]] = gibbs_solution$bic_rows
    bic_cols[[count]] = gibbs_solution$bic_cols
    bic_mats[[count]] = gibbs_solution$bic_I
    bics_params[[count]] = algo_out$iters
    # calculate the residual
    HS = gibbs_solution$HS; C = gibbs_solution$C
    for (i in 1:nrow(C)){
      curr_subj = rownames(C)[i]
      curr_TRs = colnames(I)[grepl(pattern = paste(curr_subj, ';', sep = ''), colnames(I))]
      curr_TRs = curr_TRs[C[curr_subj,] == 1]
      I[HS[, curr_subj] == 1, curr_TRs] = 0
    }
    I[bic_rows[[count]], bic_cols[[count]]] = 0
    count = count + 1
    if (count > max_num_bics){break}
  }
  return (list(bic_rows = bic_rows, bic_cols = bic_cols, bic_mats = bic_mats, bics_params = bics_params))
}

run_complete_twigs_algorithm_normal <- function(Z, mean_thr = 0.3, min_row = 5, min_col = 10, max_num_bics = 10, gibbs_reps = 50, ...){
  bic_rows = list(); bic_cols = list(); bic_mats = list(); bics_params = list()
  count = 1
  while (T){
    ptm <- proc.time()
    algo_out = one_musbic_step(Z, min_row = min_row, min_col = min_col, method = 'norm', gibbs_reps = gibbs_reps, ...)
    if (is.null(algo_out)){break}
    curr_norm_dist = rowMeans(sapply(algo_out$iters, function(x){x$norm_bic}))
    curr_bg_dist = rowMeans(sapply(algo_out$iters, function(x){x$norm_bg}))
    print (paste(c('curr bic within distribution: ', curr_norm_dist)))
    if (gibbs_reps > 0 && curr_norm_dist[1] < mean_thr){break}
    gibbs_solution = algo_out$gibbs_solution
    print(paste("Found a bicluster, it took (sec): ", proc.time() - ptm)[3])
    print (paste("bicluster size:", length(gibbs_solution$bic_rows), length(gibbs_solution$bic_cols), sep = " "))
    if (length(gibbs_solution$bic_rows) < min_row){break}
    if (length(gibbs_solution$bic_cols) < min_col){break}
    bic_rows[[count]] = gibbs_solution$bic_rows
    bic_cols[[count]] = gibbs_solution$bic_cols
    bic_mats[[count]] = gibbs_solution$bic_I
    bics_params[[count]] = algo_out$iters
    # calculate the residual matrix
    HS = gibbs_solution$HS; C = gibbs_solution$C
    for (i in 1:nrow(C)){
      curr_subj = rownames(C)[i]
      curr_TRs = colnames(Z)[grepl(pattern = paste(curr_subj, ';', sep = ''), colnames(Z))]
      curr_TRs = curr_TRs[C[curr_subj,] == 1]
      if (length(curr_TRs) == 0 || sum(HS[, curr_subj]) == 0){next}
      Z[HS[, curr_subj] == 1, curr_TRs] = Z[HS[, curr_subj] == 1, curr_TRs] - curr_norm_dist[1]
      # alternative normalization, added May 2015
      #currmean = curr_norm_dist[1]
      #currsig = curr_norm_dist[2]
      #currbgsig = curr_bg_dist[2]
      #Z[HS[,curr_subj]==1,curr_TRs] = ((Z[HS[,curr_subj]==1,curr_TRs] - currmean)/currsig)*currbgsig
    }
    count = count + 1
    if (count > max_num_bics){break}
  }
  return (list(bic_rows = bic_rows, bic_cols = bic_cols, bic_mats = bic_mats, bics_params = bics_params))
}
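# Sketch of the masking step used above (toy numbers): once a bicluster is reported, its
# cells are zeroed (binary flow) or mean-subtracted (normal flow) so that the next
# iteration searches the residual matrix.
if (FALSE) {
  M = matrix(1, nrow = 4, ncol = 4, dimnames = list(paste0("v", 1:4), paste0("s1;TR", 1:4)))
  M[c("v1", "v2"), c("s1;TR1", "s1;TR2")] = 0    # binary masking of a found bicluster
  M
}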
extract_gibbs_solution <- function(I, iters, eps1 = 0.5, eps2 = 0.5, ignore_first = 0){
  # optionally drop burn-in iterations
  if (length(iters) > ignore_first && ignore_first > 1){iters = iters[-c(1:ignore_first)]}
  H = iters[[1]]$H; HS = iters[[1]]$HS; C = iters[[1]]$C
  mode(H) = 'numeric'; mode(HS) = 'numeric'; mode(C) = 'numeric'
  pi_1_1 = mean(sapply(1:length(iters), function(x, iters){iters[[x]]$pi_1_1}, iters = iters))
  pi_1_to_1 = mean(sapply(1:length(iters), function(x, iters){iters[[x]]$pi_1_to_1}, iters = iters))
  pi_0_to_1 = mean(sapply(1:length(iters), function(x, iters){iters[[x]]$pi_0_to_1}, iters = iters))
  pi_lc = mean(sapply(1:length(iters), function(x, iters){iters[[x]]$pi_lc}, iters = iters))
  pi_lc_c = mean(sapply(1:length(iters), function(x, iters){iters[[x]]$pi_lc_c}, iters = iters))
  pi_s = rowMeans(sapply(1:length(iters), function(x, iters){iters[[x]]$pi_s}, iters = iters))
  pi_s_c = mean(sapply(1:length(iters), function(x, iters){iters[[x]]$pi_s_c}, iters = iters))
  # average the indicator variables over the iterations
  if (length(iters) > 2){
    n = length(iters)
    for (i in 2:n){
      H = H + iters[[i]]$H
      HS = HS + iters[[i]]$HS
      C = C + iters[[i]]$C
    }
    H = H / n; C = C / n; HS = HS / n
  }
  bic_cols = c()
  bic_rows = c()
  for (i in 1:nrow(C)){
    for (j in 1:ncol(C)){
      if (C[i, j] > eps2){
        bic_cols = c(bic_cols, paste(rownames(C)[i], ';', colnames(C)[j], sep = ''))
      }
    }
  }
  bic_rows = rownames(I)[(which(H > eps1))]
  bic_I = I[bic_rows, bic_cols]
  HS = HS > eps1; H = H > eps1; C = C > eps2
  return(list(H = H, HS = HS, C = C, bic_rows = bic_rows, bic_cols = bic_cols, bic_I = bic_I,
              pi_1_1 = pi_1_1, pi_1_to_1 = pi_1_to_1, pi_0_to_1 = pi_0_to_1,
              pi_lc = pi_lc, pi_lc_c = pi_lc_c, pi_s = pi_s, pi_s_c = pi_s_c))
}

get_bicluster_bimax <- function(I, bimax_min_row = 10, bimax_min_col = 10, number = 1000, method = 'binary', thr = 1){
  min_row = bimax_min_row; min_col = bimax_min_col
  if (method != 'binary'){
    bimax_thr = max(quantile(c(I), 0.9), thr)
    print(paste("bimax thr is:", bimax_thr))
    I = I >= bimax_thr
  }
  bimax_sol = biclust(I, method = BCBimax(), minr = bimax_min_row, minc = bimax_min_col, number = number)
  print ('after running bimax')
  bics = bicluster(I, bimax_sol)
  bics = bics[sapply(bics, length) > 0]
  if(length(bics) == 0){
    print ('bimax could not find an initial solution, trying smaller bics')
    bimax_min_col = bimax_min_col - 2
    while (length(bics) == 0 && bimax_min_col >= 6){
      print (paste(bimax_min_row, bimax_min_col, number))
      bimax_sol = biclust(I, method = BCBimax(), minr = bimax_min_row, minc = bimax_min_col, number = number)
      bics = bicluster(I, bimax_sol)
      bics = bics[sapply(bics, length) > 0]
      bimax_min_col = bimax_min_col - 2
    }
  }
  if(length(bics) == 0){
    bimax_min_col = 8
    while (length(bics) == 0 && bimax_min_col >= 5){
      bimax_min_row = bimax_min_col
      print (paste(bimax_min_row, bimax_min_col, number))
      bimax_sol = biclust(I, method = BCBimax(), minr = bimax_min_row, minc = bimax_min_col, number = number)
      bics = bicluster(I, bimax_sol)
      bics = bics[sapply(bics, length) > 0]
      bimax_min_col = bimax_min_col - 1
    }
  }
  print (length(bics))
  if(length(bics) == 0){
    print ('bimax could not find an initial solution, setting a random starting point')
    bimax_first_b = I[sample(rownames(I))[1:min_row], sample(colnames(I))[1:min_col]]
  }
  else{
    bic_sizes = sapply(bics, length)
    bic_num_samples = sapply(bics, function(x){length(unique(sapply(colnames(x), getSubjFromColName)))})
    #print (sapply(bics,function(x){unique(strsplit(colnames(x),split=';')[[1]][1])}))
    plot (bic_sizes, bic_num_samples)
    # select the bicluster that spans the largest number of columns
    bic_sizes = sapply(bics, ncol)
    #print (bic_sizes)
    ind = which(bic_sizes == max(bic_sizes))[1]
    #ind = which(bic_num_samples == max(bic_num_samples))[1]
    bimax_first_b = bics[[ind]]
  }
  return (bimax_first_b)
}

getSubjFromColName <- function(x){
  strsplit(x, split = ';')[[1]][1]
}
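# Sketch of the consensus rule used by extract_gibbs_solution (toy numbers): indicators
# are averaged across iterations and kept when their posterior frequency exceeds eps.
if (FALSE) {
  H_draws = rbind(c(1, 1, 0), c(1, 0, 0), c(1, 1, 0))   # 3 iterations, 3 rows
  colMeans(H_draws)          # posterior inclusion frequencies: 1.00, 0.67, 0.00
  colMeans(H_draws) > 0.5    # rows kept with eps1 = 0.5
  getSubjFromColName("subjA;TR3")   # "subjA"
}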
get_bicluster_ISA <- function(I, min_row, min_col){
  isa.result = isa(I, direction = c('up', 'up'), thr.row = c(0.5, 1, 1.5), thr.col = c(0.5, 1, 1.5), no.seeds = 100)
  curr_rows = isa.result$rows > 0
  curr_cols = isa.result$columns > 0
  if (!is.null(dim(curr_cols)) && length(curr_rows) > 0){
    print (dim(curr_cols)); print(dim(curr_rows))
    to_keep = colSums(curr_rows) > min_row & colSums(curr_cols) > min_col
    if (all(!to_keep)){
      curr_rows = NULL; curr_cols = NULL
    }
    else{
      curr_rows = curr_rows[, to_keep]; curr_cols = curr_cols[, to_keep]
      #print (paste(colSums(curr_rows),colSums(curr_cols),sep='...'))
    }
  }
  print (length(curr_rows))
  isa_b = NULL
  if (!is.null(dim(curr_cols))){
    # get the bicluster with the largest number of columns
    bicluster_sizes = apply(curr_rows, 2, sum) * apply(curr_cols, 2, sum)
    bicluster_sizes = apply(curr_cols, 2, sum)
    ind = which(bicluster_sizes == max(bicluster_sizes))[1]
    isa_b = I[curr_rows[, ind], curr_cols[, ind]]
  }
  if (is.null(dim(curr_cols)) && length(curr_rows) > 0){
    isa_b = I[curr_rows, curr_cols]
  }
  if(length(isa_b) == 0){
    print ('isa could not find an initial solution, setting a random starting point')
    isa_b = I[sample(rownames(I))[1:min_row], sample(colnames(I))[1:min_col]]
  }
  return (isa_b)
}
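# End-to-end usage sketch (everything here is illustrative; the 'biclust'/'isa2' packages
# and the toy matrix from the first sketch are assumptions): the binary flow masks found
# biclusters with zeros, the normal flow subtracts the estimated bicluster mean.
if (FALSE) {
  res_bin = run_complete_twigs_algorithm(toy_I, min_row = 5, min_col = 5, gibbs_reps = 20)
  length(res_bin$bic_rows)                    # number of biclusters reported
  res_norm = run_complete_twigs_algorithm_normal(scale(toy_I), min_row = 5, min_col = 5, gibbs_reps = 20)
}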