setwd('Z:/html/adeptus/scripts') # load libraries cran_libs = c('hash','e1071','pROC','randomForest','parallel','ROCR') bioconductor_libs = c('CMA','preprocessCore','DO.db') for (p in (c(bioconductor_libs,cran_libs))){library(p,character.only=T)} # load objects and read file: # these are required data for the analysis data_directory = '../data/' # load the microarray data x_microarray = get(load(paste(data_directory,'ge_x_microarray.RData',sep=''))) y_microarray = get(load(paste(data_directory,'ge_y_microarray.RData',sep=''))) sample2dataset_microarray = get(load(paste(data_directory,'sample2dataset.RData',sep=''))) sample2dataset_microarray = sample2dataset_microarray[rownames(y_microarray)] # load rnaseq data x_rnaseq = get(load(paste(data_directory,'ge_x_rnaseq.RData',sep=''))) y_rnaseq = get(load(paste(data_directory,'ge_y_rnaseq.RData',sep=''))) # load the microarray-based classifier # (code to learn such models is given below) microarray_model = get(load(paste(data_directory,'model_single_svm_gene_matrix_5000genes_fs100.RData',sep=''))) # load data for generating the networks # load gene2name gene2name = as.matrix(read.table(paste(data_directory,'gene2name.txt',sep=''),header=F,row.names=1,stringsAsFactors=F))[,1] # load gene to pathway associations, the following command will # add two objects to the working directory: all_pathway_genes, and a list called pathways # that maps a pathway name into its genes (entrez ids) load(paste(data_directory,'PathwayInfo.RData',sep='')) # load the selected gene sets disease_gene_sets = as.matrix(read.table(paste(data_directory,'supp_table_1.txt',sep=''),header=T,row.names=NULL,stringsAsFactors=F,sep='\t')) # load drug target info load(paste(data_directory,'drug2gene_names.RData',sep='')) # load the cosmic analysis data (and parse it into a list) cosmic_data = as.matrix(read.table(paste(data_directory,'gene2cancer_cosmic.txt',sep=''),row.names=NULL)) site2genes = list() for (i in 1:nrow(cosmic_data)){ g = cosmic_data[i,1]; site = cosmic_data[i,2] site2genes[[site]] = unname(c(site2genes[[site]],g)) } # load code scripts_directory = './' source(paste(scripts_directory,'FUNCTIONS_expression_data_analysis_helper_methods.R',sep='')) source(paste(scripts_directory,'FUNCTIONS_general_helper_methods.R',sep='')) source(paste(scripts_directory,'FUNCTIONS_classification_methods.R',sep='')) source(paste(scripts_directory,'FUNCTIONS_multilabel_classification.R',sep='')) ################################################################################################################################## ########################################## Part 1: classification analysis ####################################################### ################################################################################################################################## ###### Leave Dataset Out Cross Validation (LDO-CV) ####### # regroup datasets to run leave 15-datasets out CV newd = getKfoldLeaveDatasetOutVector(sample2dataset_microarray,15) # OPTIONAL: focus on the top variance genes (to save running time) sds = apply(x_microarray,1,sd) topk = 5000 sd_thr = sort(sds,decreasing=T)[topk] to_keep = which(sds >= sd_thr) x_microarray = x_microarray[to_keep,]; gc() # run LDO-CV using the simple binary relevance classifier # parameters: # In unix you can set num cores to more than 1 # The default classifier used by this code is a multilable chain classifier # By setting the internal parameters: cvfold=1, secondClassifier = NULL # We get the binary relevance classifier. # A simpler implementation for binary relevance is available in FUNCTIONS_multilabel_classification.R # The base classifer here is linear SVM (from e1071) coupled with feature selection to save time LDO15_results = getMultitaskLeaveDatasetOutClassificationPerformance_multicore(x_microarray, y_microarray,newd,numCores=1,baseClassifier=featureSelectionClassifier, baseClassifierArgs = list(numFeatures=2,classification_function = svm,probability=T), baseClassifierPredArgs = list(probability=T),secondClassifier = NULL,cvfolds=1) # Get the classification scores: PN-ROC, PB-ROC, and SMQ curr_preds = LDO15_results[['predictions']] curr_y = LDO15_results[['y']] pos_neg_perf = getMultitaskPerformancePositivesVsNegatives(curr_preds,curr_y,sample2dataset_microarray[rownames(curr_y)]) pn_rocs = pos_neg_perf[[1]] pb_rocs = pos_neg_perf[[3]] table(pn_rocs>0.7 & pb_rocs > 0.7) SMQ_scores = getZscoresDatasetAnalysisPvalue( getDatasetWilcoxonTest(curr_preds,curr_y,sample2dataset_microarray[rownames(curr_y)])) ###### Validation on an external dataset ####### # Learn Single-SVM classifiers using the microarray data microarray_model = multitaskChainClassifier(t(x_microarray),y_microarray, baseClassifier=featureSelectionClassifier,baseClassifierArgs = list(numFeatures=100,classification_function = svm,probability=T), baseClassifierPredArgs = list(probability=T),secondClassifier = NULL,cvfolds=1) rnaseq_predictions = getModelPredictionsUsingQuantileNormalizationWithTraining(microarray_model,x_microarray,x_rnaseq) # calculate performance covered_samples = intersect(rownames(rnaseq_predictions),rownames(y_rnaseq)) rnaseq_predictions = rnaseq_predictions[covered_samples,] y_rnaseq = y_rnaseq[covered_samples,] by_disease_preds = getMultitaskPerformanceMeasures(rnaseq_predictions,y_rnaseq,useROCR=F,asMat=T) ################################################################################################################################## ########################## Part 2: Integrative analysis for generating gene networks ############################################# ################################################################################################################################## # This code create an output file for each of the following diseases: # ALL* # Breast cancer # leukemia # lung cancer* # colorectal cancer* # (*) = the diseases analyzed in the study # The resulting file for each disease contains a table with the following columns: # 1. The gene name (string) # 2. Differential expression pattern (differential in ancestor, or up- or down- regulated vs. negatives and bgcs) # 3. is druggable (0 or 1) # 4. is related to the disease in the cosmic analysis (0 or 1) # 5. is a pathway gene (0 or 1) # Additional columns to be loaded to the enhancedGraphics Cytoscape app # see http://www.cgl.ucsf.edu/cytoscape/utilities3/enhancedcg.shtml for a tutorial # The way these data were analyzed (for each disease): # 1. Load the gene set into a GeneMANIA search, use only protein and genetic interactions # 2. Load the file (the output of the analysis below) as an attribute file for the nodes # Make sure you use gene names in the relations when uploading the file - GeneMANIA's defaule # node labels are not the gene names # 3. Create a custom graphics property for the nodes # Add Chart 2 as a passthrough mapper # Again, see the enhanced graphics tutorial for details. out_file_name = '../../gene_based_matrices/disease_specific_inspection_' mappings = c("colorectal cancer","lung squamous cell carcinoma",'breast cancer',"leukemia","acute lymphocytic leukemia") names(mappings) = c("large_intestine",'lung','breast',"haematopoietic_and_lymphoid_tissue","haematopoietic_and_lymphoid_tissue") d_ancs = as.list(DOANCESTOR) all_p_gene_names = gene2name[all_pathway_genes] disease_names = colnames(y_microarray) names(disease_names) = sapply(disease_names,term2name) for (i in 1:length(mappings)){ # get up - genes currd = mappings[i]; currdoid = disease_names[names(disease_names) == currd] curr_ancestors = intersect(disease_names ,d_ancs[[currdoid]]) curr_diseases = c(currd,sapply(curr_ancestors,term2name)) curr_diseases = intersect(curr_diseases,unique(disease_gene_sets[,1])) curr_all_genes = unique(disease_gene_sets[is.element(disease_gene_sets[,1],set=curr_diseases),2]) # add roc filter (for colorectal cancer) if (currd == "colorectal cancer"){ curr_d_gene_scores = disease_gene_sets[disease_gene_sets[,1] == currd,c(1:2,4:5)] low_score_genes = unique(curr_d_gene_scores[as.numeric(curr_d_gene_scores[,3]) < 0.8,2]) curr_all_genes = setdiff(curr_all_genes,low_score_genes) } curr_disease_matrix = matrix(nrow = length(curr_all_genes), ncol = 5) rownames(curr_disease_matrix) = curr_all_genes # 1. differentiality curr_disease_matrix[,1] = "Differential in ancestor" currd_genes = disease_gene_sets[which(disease_gene_sets[,1] == currd),2:3] currd_genes = currd_genes[is.element(currd_genes[,1],set=curr_all_genes),] curr_disease_matrix[currd_genes[,1],1] = currd_genes[,2] # 2. is druggable? curr_disease_matrix[,2] = as.numeric(is.element(curr_all_genes, set = drug_target_genes)) # 3. appears in the cosmic analysis? curr_cosmic_genes = site2genes[[names(mappings)[i]]] curr_disease_matrix[,3] = as.numeric(is.element(curr_all_genes, set = curr_cosmic_genes)) # 4. is a pathway gene? curr_disease_matrix[,4] = as.numeric(is.element(curr_all_genes, set = all_p_gene_names)) # 5. create a pie chart info for cytoscape for (j in 1:nrow(curr_disease_matrix)){ cols = c('cyan','cyan','white','white') if (grepl(curr_disease_matrix[j,1],pattern="vs bgc: up,")){cols[2] = 'red'} if (grepl(curr_disease_matrix[j,1],pattern="vs bgc: down,")){cols[2] = 'green'} if (grepl(curr_disease_matrix[j,1],pattern="negatives: up")){cols[1] = 'red'} if (grepl(curr_disease_matrix[j,1],pattern="vs negatives: down")){cols[1] = 'green'} if(curr_disease_matrix[j,2]=="1"){cols[3]='blue'} if(curr_disease_matrix[j,3]=="1"){cols[4]='black'} curr_disease_matrix[j,5] = paste('piechart: attributelist="a1,a2,a3,a4" colorlist="',paste(cols,collapse=','), '" labelsize=0',sep="") } for (j in 1:4){curr_disease_matrix = cbind(curr_disease_matrix,rep(1,nrow(curr_disease_matrix)))} colnames(curr_disease_matrix) = c('differentiality','drugbank_druggable','cosmic_mutations','is_pathway_gene',"Chart",'a1','a2','a3','a4') # write results to a file curr_name = paste(gsub(currd,pattern = ' ', replace = '_'),'_gene_data_for_cytoscape.txt',sep='') write.table(curr_disease_matrix,file = curr_name, sep = '\t', row.names=T,col.names=T,quote=F) }