setwd('Z:/html/adeptus/scripts')
# load libraries
cran_libs = c('hash','e1071','pROC','randomForest','parallel','ROCR')
bioconductor_libs = c('CMA','preprocessCore','DO.db')
for (p in (c(bioconductor_libs,cran_libs))){library(p,character.only=T)}
# load objects and read file:
# these are required data for the analysis
data_directory = '../data/'
# load the microarray data
x_microarray = get(load(paste(data_directory,'ge_x_microarray.RData',sep='')))
y_microarray = get(load(paste(data_directory,'ge_y_microarray.RData',sep='')))
sample2dataset_microarray = get(load(paste(data_directory,'sample2dataset.RData',sep='')))
sample2dataset_microarray = sample2dataset_microarray[rownames(y_microarray)]
# load rnaseq data
x_rnaseq = get(load(paste(data_directory,'ge_x_rnaseq.RData',sep='')))
y_rnaseq = get(load(paste(data_directory,'ge_y_rnaseq.RData',sep='')))
# load the microarray-based classifier
# (code to learn such models is given below)
microarray_model = get(load(paste(data_directory,'model_single_svm_gene_matrix_5000genes_fs100.RData',sep='')))
# load data for generating the networks
# load gene2name
gene2name = as.matrix(read.table(paste(data_directory,'gene2name.txt',sep=''),header=F,row.names=1,stringsAsFactors=F))[,1]
# load gene to pathway associations, the following command will
# add two objects to the working directory: all_pathway_genes, and a list called pathways
# that maps a pathway name into its genes (entrez ids)
load(paste(data_directory,'PathwayInfo.RData',sep=''))
# load the selected gene sets
disease_gene_sets = as.matrix(read.table(paste(data_directory,'supp_table_1.txt',sep=''),header=T,row.names=NULL,stringsAsFactors=F,sep='\t'))
# load drug target info
load(paste(data_directory,'drug2gene_names.RData',sep=''))
# load the cosmic analysis data (and parse it into a list)
cosmic_data = as.matrix(read.table(paste(data_directory,'gene2cancer_cosmic.txt',sep=''),row.names=NULL))
site2genes = list()
for (i in 1:nrow(cosmic_data)){
g = cosmic_data[i,1]; site = cosmic_data[i,2]
site2genes[[site]] = unname(c(site2genes[[site]],g))
}
# load code
scripts_directory = './'
source(paste(scripts_directory,'FUNCTIONS_expression_data_analysis_helper_methods.R',sep=''))
source(paste(scripts_directory,'FUNCTIONS_general_helper_methods.R',sep=''))
source(paste(scripts_directory,'FUNCTIONS_classification_methods.R',sep=''))
source(paste(scripts_directory,'FUNCTIONS_multilabel_classification.R',sep=''))
##################################################################################################################################
########################################## Part 1: classification analysis #######################################################
##################################################################################################################################
###### Leave Dataset Out Cross Validation (LDO-CV) #######
# regroup datasets to run leave 15-datasets out CV
newd = getKfoldLeaveDatasetOutVector(sample2dataset_microarray,15)
# OPTIONAL: focus on the top variance genes (to save running time)
sds = apply(x_microarray,1,sd)
topk = 5000
sd_thr = sort(sds,decreasing=T)[topk]
to_keep = which(sds >= sd_thr)
x_microarray = x_microarray[to_keep,]; gc()
# run LDO-CV using the simple binary relevance classifier
# parameters:
# In unix you can set num cores to more than 1
# The default classifier used by this code is a multilable chain classifier
# By setting the internal parameters: cvfold=1, secondClassifier = NULL
# We get the binary relevance classifier.
# A simpler implementation for binary relevance is available in FUNCTIONS_multilabel_classification.R
# The base classifer here is linear SVM (from e1071) coupled with feature selection to save time
LDO15_results = getMultitaskLeaveDatasetOutClassificationPerformance_multicore(x_microarray,
y_microarray,newd,numCores=1,baseClassifier=featureSelectionClassifier,
baseClassifierArgs = list(numFeatures=2,classification_function = svm,probability=T),
baseClassifierPredArgs = list(probability=T),secondClassifier = NULL,cvfolds=1)
# Get the classification scores: PN-ROC, PB-ROC, and SMQ
curr_preds = LDO15_results[['predictions']]
curr_y = LDO15_results[['y']]
pos_neg_perf = getMultitaskPerformancePositivesVsNegatives(curr_preds,curr_y,sample2dataset_microarray[rownames(curr_y)])
pn_rocs = pos_neg_perf[[1]]
pb_rocs = pos_neg_perf[[3]]
table(pn_rocs>0.7 & pb_rocs > 0.7)
SMQ_scores = getZscoresDatasetAnalysisPvalue(
getDatasetWilcoxonTest(curr_preds,curr_y,sample2dataset_microarray[rownames(curr_y)]))
###### Validation on an external dataset #######
# Learn Single-SVM classifiers using the microarray data
microarray_model = multitaskChainClassifier(t(x_microarray),y_microarray,
baseClassifier=featureSelectionClassifier,baseClassifierArgs = list(numFeatures=100,classification_function = svm,probability=T),
baseClassifierPredArgs = list(probability=T),secondClassifier = NULL,cvfolds=1)
rnaseq_predictions = getModelPredictionsUsingQuantileNormalizationWithTraining(microarray_model,x_microarray,x_rnaseq)
# calculate performance
covered_samples = intersect(rownames(rnaseq_predictions),rownames(y_rnaseq))
rnaseq_predictions = rnaseq_predictions[covered_samples,]
y_rnaseq = y_rnaseq[covered_samples,]
by_disease_preds = getMultitaskPerformanceMeasures(rnaseq_predictions,y_rnaseq,useROCR=F,asMat=T)
##################################################################################################################################
########################## Part 2: Integrative analysis for generating gene networks #############################################
##################################################################################################################################
# This code create an output file for each of the following diseases:
# ALL*
# Breast cancer
# leukemia
# lung cancer*
# colorectal cancer*
# (*) = the diseases analyzed in the study
# The resulting file for each disease contains a table with the following columns:
# 1. The gene name (string)
# 2. Differential expression pattern (differential in ancestor, or up- or down- regulated vs. negatives and bgcs)
# 3. is druggable (0 or 1)
# 4. is related to the disease in the cosmic analysis (0 or 1)
# 5. is a pathway gene (0 or 1)
# Additional columns to be loaded to the enhancedGraphics Cytoscape app
# see http://www.cgl.ucsf.edu/cytoscape/utilities3/enhancedcg.shtml for a tutorial
# The way these data were analyzed (for each disease):
# 1. Load the gene set into a GeneMANIA search, use only protein and genetic interactions
# 2. Load the file (the output of the analysis below) as an attribute file for the nodes
# Make sure you use gene names in the relations when uploading the file - GeneMANIA's defaule
# node labels are not the gene names
# 3. Create a custom graphics property for the nodes
# Add Chart 2 as a passthrough mapper
# Again, see the enhanced graphics tutorial for details.
out_file_name = '../../gene_based_matrices/disease_specific_inspection_'
mappings = c("colorectal cancer","lung squamous cell carcinoma",'breast cancer',"leukemia","acute lymphocytic leukemia")
names(mappings) = c("large_intestine",'lung','breast',"haematopoietic_and_lymphoid_tissue","haematopoietic_and_lymphoid_tissue")
d_ancs = as.list(DOANCESTOR)
all_p_gene_names = gene2name[all_pathway_genes]
disease_names = colnames(y_microarray)
names(disease_names) = sapply(disease_names,term2name)
for (i in 1:length(mappings)){
# get up - genes
currd = mappings[i]; currdoid = disease_names[names(disease_names) == currd]
curr_ancestors = intersect(disease_names ,d_ancs[[currdoid]])
curr_diseases = c(currd,sapply(curr_ancestors,term2name))
curr_diseases = intersect(curr_diseases,unique(disease_gene_sets[,1]))
curr_all_genes = unique(disease_gene_sets[is.element(disease_gene_sets[,1],set=curr_diseases),2])
# add roc filter (for colorectal cancer)
if (currd == "colorectal cancer"){
curr_d_gene_scores = disease_gene_sets[disease_gene_sets[,1] == currd,c(1:2,4:5)]
low_score_genes = unique(curr_d_gene_scores[as.numeric(curr_d_gene_scores[,3]) < 0.8,2])
curr_all_genes = setdiff(curr_all_genes,low_score_genes)
}
curr_disease_matrix = matrix(nrow = length(curr_all_genes), ncol = 5)
rownames(curr_disease_matrix) = curr_all_genes
# 1. differentiality
curr_disease_matrix[,1] = "Differential in ancestor"
currd_genes = disease_gene_sets[which(disease_gene_sets[,1] == currd),2:3]
currd_genes = currd_genes[is.element(currd_genes[,1],set=curr_all_genes),]
curr_disease_matrix[currd_genes[,1],1] = currd_genes[,2]
# 2. is druggable?
curr_disease_matrix[,2] = as.numeric(is.element(curr_all_genes, set = drug_target_genes))
# 3. appears in the cosmic analysis?
curr_cosmic_genes = site2genes[[names(mappings)[i]]]
curr_disease_matrix[,3] = as.numeric(is.element(curr_all_genes, set = curr_cosmic_genes))
# 4. is a pathway gene?
curr_disease_matrix[,4] = as.numeric(is.element(curr_all_genes, set = all_p_gene_names))
# 5. create a pie chart info for cytoscape
for (j in 1:nrow(curr_disease_matrix)){
cols = c('cyan','cyan','white','white')
if (grepl(curr_disease_matrix[j,1],pattern="vs bgc: up,")){cols[2] = 'red'}
if (grepl(curr_disease_matrix[j,1],pattern="vs bgc: down,")){cols[2] = 'green'}
if (grepl(curr_disease_matrix[j,1],pattern="negatives: up")){cols[1] = 'red'}
if (grepl(curr_disease_matrix[j,1],pattern="vs negatives: down")){cols[1] = 'green'}
if(curr_disease_matrix[j,2]=="1"){cols[3]='blue'}
if(curr_disease_matrix[j,3]=="1"){cols[4]='black'}
curr_disease_matrix[j,5] = paste('piechart: attributelist="a1,a2,a3,a4" colorlist="',paste(cols,collapse=','), '" labelsize=0',sep="")
}
for (j in 1:4){curr_disease_matrix = cbind(curr_disease_matrix,rep(1,nrow(curr_disease_matrix)))}
colnames(curr_disease_matrix) = c('differentiality','drugbank_druggable','cosmic_mutations','is_pathway_gene',"Chart",'a1','a2','a3','a4')
# write results to a file
curr_name = paste(gsub(currd,pattern = ' ', replace = '_'),'_gene_data_for_cytoscape.txt',sep='')
write.table(curr_disease_matrix,file = curr_name, sep = '\t', row.names=T,col.names=T,quote=F)
}