First we will make up, errrr, simulate data

set.seed(13342143)
sessionInfo()
## R version 3.2.2 (2015-08-14)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.5 (Yosemite)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] magrittr_1.5    formatR_1.2     tools_3.2.2     htmltools_0.2.6
##  [5] yaml_2.1.13     stringi_0.5-5   rmarkdown_0.7   knitr_1.11     
##  [9] stringr_1.0.0   digest_0.6.8    evaluate_0.7.2
setwd("~/Desktop/Projects/NGS-course/SLDC")

# simulate read counts under a log normal distribution
go.1 <- rlnorm(n = 1e4, meanlog = 3, sdlog = 1)
head(go.1)
## [1] 104.032753  21.115618  13.296879   7.716048  28.856369  33.456710
hist(go.1, col = "grey", main = "", las = 1, breaks = 50)

Now we will simulate data for males and females under equal expression

#Male data
M.exp <- go.1 + rnorm(1e4, mean = 0, sd = 10)
M.exp[M.exp < 0.1] <- 0.1 # we will be using log2, and don't want the world to implode.

#Female data
F.exp <- go.1 + rnorm(1e4, mean = 0, sd = 10)
F.exp[F.exp < 0.1] <- 0.1

# Combine into data.frame
df.2 <- data.frame(exp = go.1, male = M.exp, female = F.exp, chr = as.factor(rep(1:10)))
head(df.2)
##          exp       male    female chr
## 1 104.032753 108.777455 104.35075   1
## 2  21.115618  26.920815  35.79136   2
## 3  13.296879   8.237346  11.17147   3
## 4   7.716048   0.100000  25.14156   4
## 5  28.856369  38.871033  26.13981   5
## 6  33.456710  39.967444  27.40130   6
tail(df.2)
##             exp      male    female chr
## 9995  22.001704 25.884432 35.098484   5
## 9996  12.251902 21.353553  7.301167   6
## 9997   3.623728  0.100000  0.100000   7
## 9998   1.507560  1.495755 23.806271   8
## 9999  14.583080 15.142078 10.117831   9
## 10000 51.379336 46.607079 45.080487  10
str(df.2)
## 'data.frame':    10000 obs. of  4 variables:
##  $ exp   : num  104.03 21.12 13.3 7.72 28.86 ...
##  $ male  : num  108.78 26.92 8.24 0.1 38.87 ...
##  $ female: num  104.4 35.8 11.2 25.1 26.1 ...
##  $ chr   : Factor w/ 10 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...

We can explore the data a bit:

plot(M.exp, F.exp, pch = 19, las = 1, col = rgb(0, 0, 0, 0.2))

#### Make your own M:A plot
plot(log2(go.1), log2(M.exp) - log2(F.exp), pch = 19, col = rgb(0, 0, 0, 0.2), ylim = c(-10, 10), xlim = c(0, 10), las = 1, xlab = expression(paste("log Expression")), ylab = expression(paste("log"[2], " Male - log"[2], " Female")))

# plot males count density
plot(density(log2(M.exp)), lwd = 3, main = "") 
# overlay female count density
lines(density(log2(F.exp)), lwd = 3, main = "", col = "red", lty = 2) 
# overlay the base expression
lines(density(log2(df.2$exp)), lwd = 3, col = "dodgerblue", lty = 4) 

Note the bimodality? That is a consequence of not wanting the world to implode. We’ll get rid of that in short order.

Let’s plot the log2 expression for males and females by chromosome:

par(mfrow = c(1, 2))
# Chromosome 1 is the sex chromosome
boxplot(log2(df.2$male) ~ df.2$chr, las =1, pch = 19, col = c("red", rep("dodgerblue", 9)), ylim = c(-1, 9), main = "male", outline = FALSE) 
boxplot(log2(df.2$female) ~ df.2$chr, las =1, pch = 19, col = c("red", rep("dodgerblue", 9)), ylim = c(-1, 9), main = "female", outline = FALSE)

par(mfrow = c(1, 1))

Cool, Males and Females show the same expression by chromosome. We can look at this other ways:

# This is chromosome by chromosome
boxplot((log2(df.2$male) - log2(df.2$female)) ~ df.2$chr, pch = 19, col = c("red", rep("dodgerblue", 9)), las = 1, ylim = c(-3, 3), main = expression(paste("log"[2], "Male - log"[2], "Female")), outline = FALSE) 

# Autosomes vs. sex-chromosome
plot(density(log2(df.2$male[df.2$chr != 1]) - log2(df.2$female[df.2$chr != 1])), lwd = 3, las = 1, main = "", xlab = "")
lines(density(log2(df.2$male[df.2$chr == 1]) - log2(df.2$female[df.2$chr == 1])), lwd = 3, col = "red")

Now let’s add a dosage effect that is equal in both sexes (70% reduction)

df.2$M.z <- df.2$male
df.2$M.z[df.2$chr == 1] <- df.2$male[df.2$chr == 1] * 0.7

df.2$F.z <- df.2$female
df.2$F.z[df.2$chr == 1] <- df.2$female[df.2$chr == 1] * 0.7
head(df.2)
##          exp       male    female chr       M.z      F.z
## 1 104.032753 108.777455 104.35075   1 76.144219 73.04553
## 2  21.115618  26.920815  35.79136   2 26.920815 35.79136
## 3  13.296879   8.237346  11.17147   3  8.237346 11.17147
## 4   7.716048   0.100000  25.14156   4  0.100000 25.14156
## 5  28.856369  38.871033  26.13981   5 38.871033 26.13981
## 6  33.456710  39.967444  27.40130   6 39.967444 27.40130
par(mfrow = c(1, 2))
boxplot(log2(df.2$M.z) ~ df.2$chr, las =1, pch = 19, col = c("red", rep("dodgerblue", 9)), ylim = c(-1, 9), main = "male", outline = FALSE)

boxplot(log2(df.2$F.z) ~ df.2$chr, las =1, pch = 19, col = c("red", rep("dodgerblue", 9)), ylim = c(-1, 9), main = "female", outline = FALSE)

par(mfrow = c(1, 1))

The effect is pretty visible in the chromosome plots

Now let’s look at what happens if we double male expression

df.2$trt <- df.2$male
df.2$trt[df.2$chr == 1] <- df.2$male[df.2$chr == 1] * 2
head(df.2)
##          exp       male    female chr       M.z      F.z        trt
## 1 104.032753 108.777455 104.35075   1 76.144219 73.04553 217.554910
## 2  21.115618  26.920815  35.79136   2 26.920815 35.79136  26.920815
## 3  13.296879   8.237346  11.17147   3  8.237346 11.17147   8.237346
## 4   7.716048   0.100000  25.14156   4  0.100000 25.14156   0.100000
## 5  28.856369  38.871033  26.13981   5 38.871033 26.13981  38.871033
## 6  33.456710  39.967444  27.40130   6 39.967444 27.40130  39.967444
par(mfrow = c(1, 2))
boxplot(log2(df.2$trt) ~ df.2$chr, las =1, pch = 19, col = c("red", rep("dodgerblue", 9)), ylim = c(-1, 10), main = "male", outline = FALSE)

boxplot(log2(df.2$F.z) ~ df.2$chr, las =1, pch = 19, col = c("red", rep("dodgerblue", 9)), ylim = c(-1, 10), main = "female", outline = FALSE)

par(mfrow = c(1, 1))

You can really see male expression go up:

# with density plots for male vs. female
plot(density(log2(df.2$trt[df.2$chr != 1]) - log2(df.2$F.z[df.2$chr != 1])), lwd = 3, las = 1, main = "", xlab = "", ylim = c(0, 0.65))
lines(density(log2(df.2$trt[df.2$chr == 1]) - log2(df.2$F.z[df.2$chr == 1])), lwd = 3, col = "red")

# and with M:F boxplots
boxplot((log2(df.2$trt) - log2(df.2$female)) ~ df.2$chr, pch = 19, col = c("red", rep("dodgerblue", 9)), las = 1, ylim = c(-3, 4), main = expression(paste("log"[2], "Male - log"[2], "Female")), outline = FALSE)

Now let’s play with some imaginary data from the silkmoth Bombyx mori

# T <- FALSE
# T
# Bmori.dat <- read.csv("Data/Bmori-data.csv", header = T)
# head(Bmori.dat) 
Bmori.dat <- read.csv("Data/Bmori-data.csv", header = TRUE)
head(Bmori.dat)
##   X      DNA    rpkm.f67    rpkm.m68    rpkm.f69    rpkm.m70   Assembly
## 1 1 gene7136   416.36491   358.92354   413.79858   382.33994 ASM15162v1
## 2 2 gene7137 16777.64269 17743.90956 15144.15295 16193.52100 ASM15162v1
## 3 3 gene6997   120.59783   124.01753   123.03134   123.73535 ASM15162v1
## 4 4 gene7279    56.31174    76.32931    85.97140    59.51128 ASM15162v1
## 5 5 gene7168    99.88249   106.29217    98.38808   136.62353 ASM15162v1
## 6 6 gene7023     0.00000     0.00000     0.00000     0.00000 ASM15162v1
##   GenBank.Accession.version       NCBI.name chr
## 1                DF090316.1 GPS_001612900.1  13
## 2                DF090316.1 GPS_001612900.1  13
## 3                DF090316.1 GPS_001612900.1  13
## 4                DF090316.1 GPS_001612900.1  13
## 5                DF090316.1 GPS_001612900.1  13
## 6                DF090316.1 GPS_001612900.1  13
# TRUE <- FALSE
# TRUE <- F
# TRUE <- "anything"
str(Bmori.dat)
## 'data.frame':    12690 obs. of  10 variables:
##  $ X                        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DNA                      : Factor w/ 12690 levels "gene100","gene1000",..: 9584 9585 9435 9742 9619 9465 9462 9383 9503 9508 ...
##  $ rpkm.f67                 : num  416.4 16777.6 120.6 56.3 99.9 ...
##  $ rpkm.m68                 : num  358.9 17743.9 124 76.3 106.3 ...
##  $ rpkm.f69                 : num  413.8 15144.2 123 86 98.4 ...
##  $ rpkm.m70                 : num  382.3 16193.5 123.7 59.5 136.6 ...
##  $ Assembly                 : Factor w/ 1 level "ASM15162v1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ GenBank.Accession.version: Factor w/ 176 levels "DF090316.1","DF090317.1",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ NCBI.name                : Factor w/ 176 levels "GPS_001612571.1",..: 141 141 141 141 141 141 141 141 141 141 ...
##  $ chr                      : int  13 13 13 13 13 13 13 13 13 13 ...

The data are:

# Make your own M:A plots
M.Mcont_Fcont<- log2(Bmori.dat$rpkm.m68) - log2(Bmori.dat$rpkm.f67) 
A.Mcont_Fcont <- log2((Bmori.dat$rpkm.m68 + Bmori.dat$rpkm.f67) / 2)

M.Mcont_Fexp <- log2(Bmori.dat$rpkm.m68) - log2(Bmori.dat$rpkm.f69)
A.Mcont_Fexp <- log2((Bmori.dat$rpkm.m68 + Bmori.dat$rpkm.f69) / 2)

M.Mexp_Fcont <- log2(Bmori.dat$rpkm.m70) - log2(Bmori.dat$rpkm.f67)
A.Mexp_Fcont <- log2((Bmori.dat$rpkm.m70 + Bmori.dat$rpkm.f67) / 2)

M.Mexp_Fexp <- log2(Bmori.dat$rpkm.m70) - log2(Bmori.dat$rpkm.f69)
A.Mexp_Fexp <- log2((Bmori.dat$rpkm.m70 + Bmori.dat$rpkm.f69) / 2)

# make the four pairwise plots
par(mfrow = c(2, 2))
plot(A.Mcont_Fcont, M.Mcont_Fcont, main = "Mcont - Fcont", pch = 19, col = rgb(0, 0, 0, 0.1), cex = 0.8, ylim = c(-4, 6), xlim = c(-5, 17), ylab = expression(paste("log"[2], " expression")), xlab = "")
abline(h = 0, lwd = 3, lty = 1, col = "red")

plot(A.Mcont_Fcont, M.Mcont_Fexp, main = "Mcont - Fexp", pch = 19, col = rgb(0, 0, 0, 0.1), cex = 0.8, ylim = c(-8, 8), xlim = c(-5, 17), xlab = "")
abline(h = 0, lwd = 3, lty = 1, col = "red")

plot(A.Mexp_Fcont, M.Mexp_Fcont, main = "Mexp - Fcont", pch = 19, col = rgb(0, 0, 0, 0.1), cex = 0.8, ylim = c(-6, 8), xlim = c(-5, 17), ylab = expression(paste("log"[2], " expression")), xlab = "Expected count")
abline(h = 0, lwd = 3, lty = 1, col = "red")

plot(A.Mexp_Fexp, M.Mexp_Fexp, main = "Mexp - Fexp", pch = 19, col = rgb(0, 0, 0, 0.1), cex = 0.8, ylim = c(-5, 7), xlim = c(-5, 17), xlab = "Expected count")
abline(h = 0, lwd = 3, lty = 1, col = "red")

par(mfrow = c(1, 1))

Now we plot Male and female control groups by chromosome

par(mfrow = c(1, 2))
boxplot(log2(Bmori.dat$rpkm.m68[Bmori.dat$rpkm.m68 >= 1]) ~ Bmori.dat$chr[Bmori.dat$rpkm.m68 >= 1], pch = 19, notch = TRUE, col = c("red", rep("grey", 27)), na.rm = TRUE, las = 1, ylim = c(-1, 15), main = "Control (male)", ylab = expression(paste("log"[2]," expression (RPKM)")), outline = FALSE)

boxplot(log2(Bmori.dat$rpkm.m70[Bmori.dat$rpkm.m70 >= 1]) ~ Bmori.dat$chr[Bmori.dat$rpkm.m70 >= 1],  pch = 19, notch = TRUE, col = c("red", rep("grey", 27)), na.rm = TRUE, las = 1, ylim = c(-1, 15), main = expression(paste(italic("Expr"), " knockdown (male)")), ylab = expression(paste("log"[2]," expression (RPKM)")), outline = FALSE)

par(mfrow = c(1, 1))

We’ll write a function to speed things up a bit:

plot.ratios <- function(chr = Bmori.dat$chr, minRPKM = 1, sampA, sampB, ...) {
    rpkm <- data.frame(chr, sampA, sampB)
    filter <- (sampA >= minRPKM & sampB >= minRPKM)
    rpkm <- rpkm[filter,]
    rpkm$lratio <- log2(rpkm$sampA) - log2(rpkm$sampB)
    boxplot(rpkm$lratio ~ rpkm$chr, ...)
}
plot.ratios(sampA = Bmori.dat$rpkm.m70, sampB=Bmori.dat$rpkm.f69, minRPKM = 0.1,  main = expression(paste(italic(Expr), " Male vs. Female")), outline = FALSE,  col = c("red", rep("grey", 27)), notch = TRUE, ylab = expression(paste("log" [2], "(Male) - log" [2], "(Female)")), las = 1)

#### Clearly the experimental treatment has resulted in the upregulation of the sex chromosome

We can also plot sex-chromosome vs. autosomes:

plot.density.za <- function(chr = Bmori.dat$chr, minRPKM = 1, sampA, sampB, ...) {
    rpkm <- data.frame(chr, sampA, sampB)
    filter <- (sampA > minRPKM & sampB > minRPKM)
    rpkm <- rpkm[filter,]
    rpkm$lratio <- log2(rpkm$sampA) - log2(rpkm$sampB)
    rpkm$za <- ifelse(rpkm$chr == 1, "Z", "A")
    Zvals <- rpkm$lratio[rpkm$za == "Z"]
    Avals <- rpkm$lratio[rpkm$za == "A"]
    plot(density(Avals), las = 1, ...)
    abline(v = median(Avals), lty = 2, lwd = 3)
    lines(density(Zvals), col = "red", lwd = 3)
    abline(v = median(Zvals), lty = 2, col = "red", lwd = 3)
}

plot.density.za(sampA = Bmori.dat$rpkm.m70, sampB = Bmori.dat$rpkm.f69, minRPKM = 1,  main = expression(paste(italic("Masc"), ": Male vs. Female ")), xlim = c(-2,2), lwd = 3, xlab = expression(paste("Log"[2], " (Male:Female)")), ylab = "")
legend("topright", legend = c("Z Chromosome", "Autosomes"), lty = 1, col = c("red", "black"), bty = "n", lwd = 3)

get.za.stats <- function(chr = Bmori.dat$chr, minRPKM = 1, samp, plot.it = FALSE, ...) {
    rpkm <- data.frame(chr, samp)
    rpkm <- rpkm[samp > minRPKM,]
    rpkm$za <- ifelse(rpkm$chr == 1, "Z", "A")
    if (plot.it == TRUE) {
        boxplot(log2(rpkm$samp) ~ rpkm$za, ...)
    }
    Zvals <- rpkm$samp[rpkm$za == "Z"]
    Avals <- rpkm$samp[rpkm$za == "A"]
    Zmean <- round(mean(Zvals), digits = 3)
    Zmedian <- round(median(Zvals), digits = 3)
    Amean <- round(mean(Avals), digits = 3)
    Amedian <- round(median(Avals), digits = 3)
    za.mean <- round(Zmean/Amean, digits = 4)
    za.median <- round(Zmedian/Amedian, digits = 4)
    wilcox <- wilcox.test(Zvals, Avals)
    out.stats <- list(  "Zmean" = Zmean, "Amean" = Amean, "Z:A_mean"= za.mean, "Zmedian" = Zmedian,  "Amedian" = Amedian, "Z:A_median" = za.median, "ZvA_MWU-p" = signif(wilcox$p.value, digits = 4), "n_Zgenes" = length(Zvals), "n_Agenes" = length(Avals) 
        )               
    return(out.stats)
}
sapply(Bmori.dat[3:6], FUN = function(x) get.za.stats(samp = x))
##            rpkm.f67  rpkm.m68  rpkm.f69  rpkm.m70
## Zmean      206.727   218.62    228.218   316.713 
## Amean      418.812   425.255   402.138   416.213 
## Z:A_mean   0.4936    0.5141    0.5675    0.7609  
## Zmedian    54.129    55.004    61.61     85.748  
## Amedian    104.553   101.347   102.622   99.977  
## Z:A_median 0.5177    0.5427    0.6004    0.8577  
## ZvA_MWU-p  2.189e-10 1.794e-08 1.123e-06 0.04087 
## n_Zgenes   408       415       435       443     
## n_Agenes   9710      9782      9962      9833