-
Notifications
You must be signed in to change notification settings - Fork 6
/
README.txt
159 lines (132 loc) · 8.17 KB
/
README.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# README
# CellDistinguisher
# Tested/run on Linux
#
# Generic input
# exprLinear: a matrix of linear expression values with probe sets (or genes) as rows and samples as columns.
# genesymb: a vector of gene names associated with the probe sets or NULL if not available.
#
# To run, get the software and your data as described below. Start R at the command line, then copy-paste the commands.
#############################
# Run CellDistinguisher using a local text file/matrix with linear gene expression data
# Gene expression data from pooled normal tissues was obtained from the RNA-Seq Atlas (http://medicalgenomics.org/rna_seq_atlas).
# Mixed samples were generated by combining the expression values of known genes from
# five tissues: adipose, colon, heart, hypothalamus, kidney.
#
# Input provided to run CellDistinguisher: expression_values.mixed_samples.tsv
# Two additional files are provided for comparing them to the results of the deconvolution
# based on distinguishers:
# 1. Known composition of the mixed samples: known_fractions.tsv
# 2. Known expression signatures of the pure tissues: expression_signatures.pure_tissues.tsv
#############################
R
## Get the software. You will need the gtools and Matrix packages.
## You may also want the CellMix and GEOquery packages. Note that if
## the following approach using remote access does not work for
## installing the CellDistinguisher package, you can download the
## package directly from
## https://github.com/GeneralElectric/CellDistinguisher/archive/master.tar.gz
## and then install it with any approach that uses local files.
{
  ## Install (only when missing) and attach the packages used below.
  ## requireNamespace() merely probes whether a package can be loaded, so
  ## nothing has to be wrapped in suppressWarnings(); library() then attaches
  ## the package and stops with a clear error if that is still impossible.
  if (!requireNamespace("gtools", quietly = TRUE)) {
    install.packages("gtools")
  }
  library("gtools")

  if (!requireNamespace("Matrix", quietly = TRUE)) {
    install.packages("Matrix")
  }
  library("Matrix")

  ## CellDistinguisher is installed from GitHub, which needs devtools.
  ## devtools is only needed for this one call, so it is used through its
  ## namespace rather than attached to the search path.
  if (!requireNamespace("CellDistinguisher", quietly = TRUE)) {
    if (!requireNamespace("devtools", quietly = TRUE)) {
      install.packages("devtools")
    }
    devtools::install_github("GeneralElectric/CellDistinguisher")
  }
  library("CellDistinguisher")
}
# Read the example data shipped with the CellDistinguisher package. The first
# column of the file contains gene symbols (instead of probe IDs) and is used
# as the row names of the matrix.
inDataFile <- system.file("extdata", "expression_values.mixed_samples.tsv", package = "CellDistinguisher")
# system.file() returns "" when the file cannot be found; fail with a clear
# message instead of letting read.table() report an obscure error.
if (!nzchar(inDataFile)) {
  stop("expression_values.mixed_samples.tsv was not found in the installed CellDistinguisher package", call. = FALSE)
}
inData <- as.matrix(read.table(inDataFile, sep = "\t", header = TRUE, row.names = 1))
# Expression data is often stored as log values. In such cases, make sure it is transformed back to linear.
# Remove rows in which every element is 0: keep a row only if it has at least
# one non-zero value. na.rm = TRUE keeps missing values from turning the row
# selector into NA, and drop = FALSE keeps the result a matrix even when a
# single row survives.
exprLinear <- inData[rowSums(inData != 0, na.rm = TRUE) > 0, , drop = FALSE]
# Compute the distinguishers. With the call below we are asking for 1-100 distinguishers for each of 5 cell classes.
# expressionQuantileForFilter = 0.999: filter out probe sets that have expression values above the 99.9th percentile.
# expressionConcentrationRatio = 0.333: filter out probe sets for which the second largest expression value
# among the heterogeneous samples is less than or equal to 0.333 of the first largest.
distinguishers <- gecd_CellDistinguisher(
  exprLinear,
  genesymb = NULL,
  numCellClasses = 5,
  minDistinguisherAlternatives = 1,
  maxDistinguisherAlternatives = 100,
  minAlternativesLengthsNormalized = 0.5,
  expressionQuantileForFilter = 0.999,
  expressionConcentrationRatio = 0.333,
  verbose = 0
)
# Run deconvolution using these distinguishers
deconvolution <- gecd_DeconvolutionByDistinguishers(
  exprLinear,
  distinguishers$bestDistinguishers,
  nonNegativeOnly = TRUE,
  convexSolution = TRUE,
  verbose = 0
)
# Alternatively, one of the deconvolution algorithms of CellMix (ssKL or ssFrobenius) can be used.
# CellMix is optional, so this step is skipped with a message when the package is not installed.
if (requireNamespace("CellMix", quietly = TRUE)) {
  library(CellMix)
  deconvolutionSSKL <- gecd_DeconvolutionCellMix(exprLinear, distinguishers$bestDistinguishers, method = "ssKL", maxIter = 5)
} else {
  message("CellMix is not installed; skipping the ssKL deconvolution.")
}
# Get the sample composition: samples as columns, cell types as rows
# For a large number of samples, it is easier to use the transposed form of the sample composition matrix
sampleCompositions <- t(deconvolution$sampleCompositions)
# Expression signatures of pure cell types/subtypes/activities
celltypeSignatures <- deconvolution$cellSubclassSignatures
# Output #1 - list of distinguisher genes for each cell class/type
write.table(distinguishers$bestDistinguishers, file = "distinguishers.tsv", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)
# Output #2 - sample composition: the fraction of each of the 5 cell types in every sample
# col.names = NA writes an empty header field above the row names, so the header
# lines up with the data columns (the same convention as Output #3).
write.table(sampleCompositions, file = "sample_composition.tsv", sep = "\t", col.names = NA, row.names = TRUE, quote = FALSE)
# Output #3 - computed expression signatures of pure cell types
write.table(round(celltypeSignatures, 4), file = "celltype_signatures.tsv", sep = "\t", col.names = NA, row.names = TRUE, quote = FALSE)
###############################################################################
# Run CellDistinguisher using a GEO accession number to download the study data
###############################################################################
######################################################################
### GSE19830
### http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE19830
###
### Shen-Orr SS, Tibshirani R, Khatri P, Bodian DL, Staedtler F, Perry
### NM, Hastie T, Sarwal MM, Davis MM, Butte AJ. Cell type-specific
### gene expression differences in complex tissues. Nat Methods. 2010
### Apr;7(4):287-9. doi: 10.1038/nmeth.1439. Epub 2010 Mar 7. PubMed
### PMID: 20208531; PubMed Central PMCID: PMC3699332.
###
### Title: Expression data from pure/mixed brain, liver and lung to
### test feasability and sensitivity of statistical deconvolution
###
### Organism: Rattus norvegicus
### Experiment type: Expression profiling by array
###
### Tissue samples from the brain, liver and lung of a single rat were
### analyzed using expression arrays (Affymetrix) in triplicate.
### Homogenates of those three tissues were then mixed
### together at the cRNA level and the gene expression in each
### mixed sample was measured the same way.
R
## The GEOquery package must already be installed for these commands to work.
library("GEOquery")
## Select wget through both download-method options.
options(
  download.file.method = "wget",
  download.file.method.GEOquery = "wget"
)
## Fetch the study from GEO and hand it to the CellDistinguisher data loader.
GSE19830GEO <- getGEO("GSE19830", GSEMatrix = FALSE)
GSE19830Data <- gecd_DataLoader$GSEGeneric(
  GSE19830GEO,
  retrievedAs = "log2",
  genesymbColname = "Gene Symbol"
)
## Where several alternative gene symbols are listed, separated by "//",
## keep only the first one.
GSE19830Data[[1]]$genesymb <- sub(
  "(.*?) //.*",
  "\\1",
  GSE19830Data[[1]]$genesymb
)
## Copy the components of the first loaded data set into individually named
## objects and store them in an RData file for later sessions.
GSE19830ExprLinear <- GSE19830Data[[1]]$exprLinear
GSE19830genesymb <- GSE19830Data[[1]]$genesymb
GSE19830SampleInfo <- GSE19830Data[[1]]$sampleInfo
GSE19830ProbesetInfo <- GSE19830Data[[1]]$probesetInfo
save(
  GSE19830ExprLinear,
  GSE19830genesymb,
  GSE19830SampleInfo,
  GSE19830ProbesetInfo,
  file = "GSE19830.RData"
)
# Next time just load the RData file
load("GSE19830.RData")
exprLinear <- GSE19830ExprLinear
exprLinearClasses <- NULL
sampleInfo <- GSE19830SampleInfo
probesetInfo <- GSE19830ProbesetInfo
genesymb <- GSE19830genesymb
# Compute 1-100 distinguishers for each of 3 cell classes, leaving out the first 9 sample columns.
# NOTE(review): the first 9 columns are presumably the pure-tissue arrays
# (3 tissues in triplicate) - confirm against sampleInfo.
MyDistinguishers <- gecd_CellDistinguisher(
  exprLinear[, -(1:9)],
  genesymb = genesymb,
  numCellClasses = 3,
  probesWithGenesOnly = TRUE,
  minDistinguisherAlternatives = 1,
  maxDistinguisherAlternatives = 100,
  minAlternativesLengthsNormalized = 0.5,
  expressionQuantileForFilter = 0.999,
  expressionConcentrationRatio = 0.333,
  verbose = 0
)
head(MyDistinguishers$bestDistinguishersGeneNames)
# Output #1 - list of distinguisher genes for each cell class/type
write.table(MyDistinguishers$bestDistinguishersGeneNames, file = "distinguishers.tsv", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)
## Perform deconvolution with default method (on all samples, including the 9 left out above)
MyDeconvolution <- gecd_DeconvolutionByDistinguishers(
  exprLinear,
  MyDistinguishers$bestDistinguishers,
  nonNegativeOnly = TRUE,
  convexSolution = TRUE
)
sampleCompositions <- MyDeconvolution$sampleCompositions
## Transpose the matrix to have the cell types as columns, samples as rows
sampleCompositions <- t(sampleCompositions)
## Cell/tissue types are labeled with the top distinguisher gene
colnames(sampleCompositions) <- MyDistinguishers$bestDistinguishersGeneNames[1, ]
# Output #2: Sample composition saved into a tsv file.
# The matrix is transposed back while writing, so the file has cell types as rows
# and samples as columns. col.names = NA writes an empty header field above the
# row names so that the header lines up with the data columns.
# NOTE(review): the file name says "top50dist" although maxDistinguisherAlternatives is 100 - confirm.
write.table(t(round(sampleCompositions, 3)), col.names = NA, file = "rat_tissue_sample_compositions_top50dist.txt", sep = "\t", quote = FALSE)