From ffdceb69aba9221123b6272f7745ef20fbda6ff2 Mon Sep 17 00:00:00 2001 From: fwip Date: Wed, 14 Mar 2018 15:52:04 -0700 Subject: [PATCH 01/15] Add general.R to repository --- code_build.R | 2 +- code_matrix.R | 2 +- code_overlap.R | 2 +- general.R | 2732 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 2735 insertions(+), 3 deletions(-) create mode 100644 general.R diff --git a/code_build.R b/code_build.R index 04fae35..0d05c25 100644 --- a/code_build.R +++ b/code_build.R @@ -7,7 +7,7 @@ # # ########################################################################################################## -source("../../code/general.R") +source("general.R") library(caTools) source("code_ML.R") diff --git a/code_matrix.R b/code_matrix.R index 9137340..691a1a2 100644 --- a/code_matrix.R +++ b/code_matrix.R @@ -7,7 +7,7 @@ # # ########################################################################################################## -source("../../code/general.R") +source("general.R") library(caTools) library(Matrix) source("code_ML.R") diff --git a/code_overlap.R b/code_overlap.R index ddf4c88..8ae85f2 100644 --- a/code_overlap.R +++ b/code_overlap.R @@ -7,7 +7,7 @@ # # ########################################################################################################## -source("../../code/general.R") +source("general.R") library(caTools) source("code_ML.R") diff --git a/general.R b/general.R new file mode 100644 index 0000000..5df0495 --- /dev/null +++ b/general.R @@ -0,0 +1,2732 @@ +#library(CGtools) + +chrom_info <- list(human = paste("chr", c(1:22, "X"), sep=""), # hg18 + mouse = paste("chr", c(1:19, "X"), sep=""), # mm9 + fly = c("2L", "2R", "3L", "3R", "4", "X"), # dm3 + fugu = "chrUn"); + +#chromsize_info <- list(human = chromSE_HSA_NCBI36, # hg18 +# mouse = chromSE_MM_NCBIm37, # mm9 +# fly = chromSE_DMEL_R5, #dm3 +# fugu = data.frame(seqname="chrUn", start=1, end=400509343)); + +id <- function() paste("WM", gsub("-", "", Sys.Date()), sep="") + +#runmean <- function(x,k) { +# if ((k %% 2) == 0) { k <- k+1; } +# hk <- k %/% 2; +# +# res <- diff(cumsum(c(0,x)),k)/k +# +# #c(rep(NA, hk), res, rep(NA, hk)); +# c(rep(res[1], hk), res, rep(res[length(res)], hk)); +#} + +runsum <- function(x,k) { + if ((k %% 2) == 0) { k <- k+1; } + hk <- k %/% 2; + + res <- diff(cumsum(c(0,x)),k) + + #c(rep(NA, hk), res, rep(NA, hk)); + c(rep(res[1], hk), res, rep(res[length(res)], hk)); +} + +runsd <- function(x,k) { + if ((k %% 2) == 0) { k <- k+1; } + hk <- k %/% 2; + + a<-diff(c(0,cumsum(x^2)),k)*k + b<-diff(c(0,cumsum(x)),k)^2 + d<-(a-b)/(k*(k-1)) + res <- sqrt(d) + + #c(rep(NA, hk), res, rep(NA, hk)); + c(rep(res[1], hk), res, rep(res[length(res)], hk)); +} + +runvar <- function(x,k) { + if ((k %% 2) == 0) { k <- k+1; } + hk <- k %/% 2; + + a<-diff(c(0,cumsum(x^2)),k)*k + b<-diff(c(0,cumsum(x)),k)^2 + d<-(a-b)/(k*(k-1)) + res <- d + + #c(rep(NA, hk), res, rep(NA, hk)); + c(rep(res[1], hk), res, rep(res[length(res)], hk)); +} + +# Function to calculate a running t-statistic for samples with (unknown) unequal +# variance and size (i.e., like in Welch t-test) +runt <- function(x, k) { + if ((k %% 2) != 0) { k <- k+1; } + hk <- k %/% 2; + + #m <- runmean(x, k=hk) + m <- runsum(x, k=hk)/hk + m1 <- c(rep(NA, hk), m) + m2 <- c(m, rep(NA, hk)) + s <- runvar(x, k=hk) + s1 <- c(rep(NA, hk), s) + s2 <- c(s, rep(NA, hk)) + + res <- (m1 - m2) / sqrt(s1/hk + s2/hk) + hhk <- hk %/% 2; + res[(hhk+1):(length(res)-(hhk+1))] +} + +## Originally by Elzo de Wit (does not work properly) +#runcor <- function(a,b,n) { +# ab <- a * b +# prod <- diff(c(0, cumsum(ab)), n) +# #sub <- n * runmean(a,n) * runmean(b,n) +# sub <- n * (runsum(a, k=n)/n) * (runsum(b, k=n)/n) +# bottom <- (n-1) * runsd(a,n) * runsd(b,n) +# (prod-sub)/bottom +#} + + +odd <- function(x) { x[x %% 2 == 1] } +even <- function(x) { x[x %% 2 == 0] } + +readGFF <- function(file, purge=c(), nlines = NULL, skip = 0, verbose=FALSE) { + if(is.null(nlines)) + nlines <- as.numeric(system(paste("cat \"", file, "\" | wc -l", sep=''), intern=TRUE)) + + if(verbose) + cat(paste("Reading in", nlines, "lines...")); + + what_list <- list(seqname='c', source='c', feature='c', start=0, end=0, + score=0.1, strand='0', frame='c', attribute='c'); + + if (length(purge) > 0) { what_list[purge] <- list(NULL); } + + data <- scan(file, nmax=nlines, skip=skip, what=what_list, sep='\t', quiet=TRUE); + + if (length(purge) > 0) { data[purge] <- NULL } + + class(data) <- "data.frame"; + row.names(data) <- 1:nlines; + + if(verbose) + cat("done.\n"); + + data$score <- as.numeric(data$score); + if (all(is.na(data$score))) { data$score <- 0; } + + invisible(data); +} + + +writeGFF <- function(data, file) { + n <- nrow(data); + newdata <- data.frame( seqname= rep(".", n), source = rep(".", n), feature = rep(".", n), + start = rep(0, n), end = rep(0, n), score = rep(0, n), + strand = rep(".", n), frame = rep(".", n), attribute = rep(".", n) ) + for (i in colnames(data)) { + if (i %in% names(newdata)) { + newdata[,i] <- data[,i]; + } + } + + write.table(newdata, file=file, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE); +} + +readBED <- function(file, purge=c(), nlines = NULL, skip = 0, verbose = FALSE) { + if(is.null(nlines)) + nlines <- as.numeric(system(paste("cat \"", file, "\" | wc -l", sep=''), intern=TRUE)) + + if(verbose) + cat(paste("Reading in", nlines, "lines...")); + + what_list <- list(seqname='c', start=0, end=0, name='c', score='c', strand='c'); + + if (length(purge) > 0) { what_list[purge] <- list(NULL); } + + data <- scan(file, nmax=nlines, skip=skip, what=what_list, sep='\t', quiet=TRUE); + + if (length(purge) > 0) { data[purge] <- NULL } + + class(data) <- "data.frame"; + row.names(data) <- 1:nlines; + + if(verbose) + cat("done.\n"); + + data$score <- as.numeric(data$score); + if (all(is.na(data$score))) { data$score <- 0; } + + invisible(data); +} + +readBED4 <- function(file, purge=c(), nlines = NULL, skip = 0, verbose = FALSE) { + if(is.null(nlines)) + nlines <- as.numeric(system(paste("cat \"", file, "\" | wc -l", sep=''), intern=TRUE)) + + if(verbose) + cat(paste("Reading in", nlines, "lines...")); + + what_list <- list(seqname='c', start=0, end=0, score='c'); + + if (length(purge) > 0) { what_list[purge] <- list(NULL); } + + data <- scan(file, nmax=nlines, skip=skip, what=what_list, sep='\t', quiet=TRUE); + + if (length(purge) > 0) { data[purge] <- NULL } + + class(data) <- "data.frame"; + row.names(data) <- 1:nlines; + + if(verbose) + cat("done.\n"); + + data$score <- as.numeric(data$score); + if (all(is.na(data$score))) { data$score <- 0; } + + invisible(data); +} + + +writeBED <- function(data, file, ...) { + n <- nrow(data); + newdata <- data.frame( seqname= rep(".", n), start = rep(0, n), end = rep(0, n), + name = rep(".", n), score = rep(0, n), strand = rep(".", n) ) + for (i in colnames(data)) { + if (i %in% names(newdata)) { + newdata[,i] <- data[,i]; + } + } + + write.table(newdata, file=file, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE, ...); +} + +writeWIG <- function(data, file) { + n <- nrow(data); + chroms <- unique(data$seqname); + span <- median(data$end-data$start); + append <- FALSE; + for (chr in chroms) { + write(paste("variableStep chrom=", chr, " span=", span, sep=""), file=file, append=append) + write.table(data[data$seqname==chr, c("start", "score")], file=file, + quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE, append=TRUE); + append <- TRUE; + } +} + +get_overlap <- function(regions, feats, modus=1, minOverlap=0.5) { + region_feats = NULL; + # We loop over regions, because we generally have + # less regions than features. + for (i in 1:nrow(regions)) { + if (i %% 100 == 0) { cat(paste(i, "\r")); } + + chr_hits <- feats$seqname == regions$seqname[i]; + cont_hits <- 1; # By default, this is satisfied. Only used for modus 3. + # Options: 1 => fully contained, 2 => partly contained, + # 3 => fraction contained at least 'minOverlap' + if (modus == 1) { + start_hits <- feats$start > regions$start[i]; + end_hits <- feats$end < regions$end[i]; + } else if (modus == 2 || modus == 3) { + start_hits <- feats$end > regions$start[i]; + end_hits <- feats$start < regions$end[i]; + + if (modus == 3) { + # Find position of feature relative to current region + pend <- pmin(feats$end, regions$end[i]) + pstart <- pmax(feats$start, regions$start[i]) + overlap <- (pend - pstart) / (feats$end - feats$start) + cont_hits <- overlap >= minOverlap + } + } else { + error("Unknown modus: use either 1, 2 or 3") + } + hits <- chr_hits & start_hits & end_hits & cont_hits + + if (any(hits, na.rm=TRUE)) { + hit_feats <- feats[which(hits),] + feats <- feats[-which(hits),]; + hit_feats$frame = paste("Region", i); + region_feats <- rbind(region_feats, hit_feats); + } + } + cat("\n"); + + list(fg=region_feats, bg=feats) +} + + +plot_regional_expression <- function(regions, file, delta=FALSE) { + genes <- readGFF("/data/people/meuleman/projects/cons/LADs_fourth_strike/data/Ensembl50_Mmus_NCBI37_genes.gff") + exons <- readGFF("/data/people/meuleman/projects/cons/LADs_fourth_strike/data/Ensembl50_Mmus_NCBI37_exons.gff") + data_path <- "/data/datasets/Wouter/Efroni2008_AffyTiling/mm16tpmap/"; # Trailing slash! + + if (delta) { + pdf(file=file, width=10, version="1.4") + par(mar=c(4,4,2,2)) + par(mfrow=c(3,1)) + } else { + pdf(file=file, width=14) + par(mar=c(4,4,2,2)) + par(mfrow=c(2,1)) + } + + for (region in regions) { + print(region); + items <- unlist(strsplit(region, "[:-]")); + chr <- items[1]; + start <- as.numeric(items[2]); + end <- as.numeric(items[3]); + + exons_chr <- exons[exons$seqname==chr,]; + genes_chr <- genes[genes$seqname==chr,]; + + ES_name <- load(paste(data_path, chr, "_ES.RData", sep="")) + ES <- get(ES_name) + NP_name <- load(paste(data_path, chr, "_NP.RData", sep="")) + NP <- get(NP_name) + + #ES$score[ES$score <= 0] <- NA + #NP$score[NP$score <= 0] <- NA + + ES$score <- ES$score - 4 + NP$score <- NP$score - 4 + + sel <- (NP$start > start & NP$end < end) + ymax <- max(c(ES$score[sel], NP$score[sel]), na.rm=T); + + plot(0, type="n", xlim=c(start,end), ylim=c(-ymax/5,ymax), xlab="Genomic location", + ylab="signal", col=2, main=paste("ESC (", region, ")", sep="")) + rect(ES$start, 0, ES$end, ES$score, border="NA", col=2) + # Genes + rect(genes_chr$start[genes_chr$strand=="+"], -1, genes_chr$end[genes_chr$strand=="+"], -ymax/10, border=TRUE) + rect(genes_chr$start[genes_chr$strand=="-"], -ymax/10, genes_chr$end[genes_chr$strand=="-"], -ymax/5, border=TRUE) + # Exons + rect(exons_chr$start[exons_chr$strand=="+"], -1, exons_chr$end[exons_chr$strand=="+"], -ymax/10, + col=as.factor(exons_chr$attribute[exons_chr$strand=="+"])) + rect(exons_chr$start[exons_chr$strand=="-"], -ymax/10, exons_chr$end[exons_chr$strand=="-"], -ymax/5, + col=as.factor(exons_chr$attribute[exons_chr$strand=="-"])) + + plot(0, type="n", xlim=c(start,end), ylim=c(-ymax/5,ymax), xlab="Genomic location", + ylab="signal", col=3, main=paste("NPC (", region, ")", sep="")) + rect(NP$start, 0, NP$end, NP$score, border="NA", col=3) + # Genes + rect(genes_chr$start[genes_chr$strand=="+"], -1, genes_chr$end[genes_chr$strand=="+"], -ymax/10, border=TRUE) + rect(genes_chr$start[genes_chr$strand=="-"], -ymax/10, genes_chr$end[genes_chr$strand=="-"], -ymax/5, border=TRUE) + # Exons + rect(exons_chr$start[exons_chr$strand=="+"], -1, exons_chr$end[exons_chr$strand=="+"], -ymax/10, + col=as.factor(exons_chr$attribute[exons_chr$strand=="+"])) + rect(exons_chr$start[exons_chr$strand=="-"], -ymax/10, exons_chr$end[exons_chr$strand=="-"], -ymax/5, + col=as.factor(exons_chr$attribute[exons_chr$strand=="-"])) + + if (delta) { + ratio_name <- load(paste(data_path, chr, ".RData", sep="")) + ratio <- get(ratio_name) + dScore <- ratio$score + + nonNA <- (!is.na(ES$score) & !is.na(NP$score) & !is.na(dScore)); + ES <- ES[nonNA,] + NP <- NP[nonNA,] + dScore <- dScore[nonNA] + + # Used for running medians + ES$start <- NP$start <- runmed(NP$start, k=49) + ES$end <- NP$end <- runmed(NP$end, k=49) + ES$score <- runmed(ES$score, k=49) + NP$score <- runmed(NP$score, k=49) + dScore <- runmed(dScore, k=49); + + #dScore <- log2(ES$score/NP$score); + sel <- (NP$start > start & NP$end < end) + + yrange <- range(c(0, dScore[sel]), finite=TRUE); + plot(0, type="n", xlim=c(start,end), ylim=yrange, xlab="Genomic location", + ylab="signal", main=paste("Z-score (", region, ")", sep="")) + + use_alpha <- FALSE; + if (use_alpha) { + signal <- ES$score+NP$score + alphas <- (signal - min(signal)) / (max(signal) - min(signal)) + rect(ES$start[dScore>0], 0, ES$end[dScore>0], dScore[dScore>0], + border="NA", col=rgb(1,0,0,alphas[dScore>0])) + rect(NP$start[dScore<0], dScore[dScore<0], NP$end[dScore<0], 0, + border="NA", col=rgb(0,1,0,alphas[dScore<0])) + } else { + rect(ES$start[dScore>0], 0, ES$end[dScore>0], dScore[dScore>0], + border="NA", col=rgb(1,0,0)) + rect(NP$start[dScore<0], dScore[dScore<0], NP$end[dScore<0], 0, + border="NA", col=rgb(0,1,0)) + } + } + } + + dev.off() +} + + +CGintvlOverlap_Wouter <- function (domains, features, remapIndices = TRUE, fun=mean, ...) { + allseq = unique(c(domains$seqname, features$seqname)) + res = vector(mode = "list", length = length(allseq)) + names(res) = allseq + warnings <- c() + for (chr in allseq) { + print(chr) + d <- domains[domains$seqname == chr, ] + fOrder <- which(features$seqname == chr) + f <- features[fOrder, ] + res[[chr]] <- .domOverlap(d$start, d$end, f$start, f$end, + ...) + warnings <- c(warnings, res[[chr]]$warn) + res[[chr]] <- res[[chr]]$overlap + + #res[[chr]] <- lapply(res[[chr]], function(v) median(f$score[v], na.rm=T)) + res[[chr]] <- lapply(res[[chr]], function(v) fun(f$score[v])) + #res[[chr]] <- lapply(res[[chr]], function(v) f$score[v]) + #res[[chr]] <- lapply(res[[chr]], function(v) min(f$score[v])) +# if (remapIndices) +# res[[chr]] <- lapply(res[[chr]], function(v) fOrder[v]) + } + res +} + +# For every (extended) domain, the function finds the features contained in it. +# Then, the index of the nearest neighbour feature (based on domain and feature +# centers) is returned. If no features are found, NA is returned. +CGintvlOverlap_Wouter_closest <- function (domains, features, ...) { + allseq = unique(c(domains$seqname, features$seqname)) + res = vector(mode = "list", length = length(allseq)) + names(res) = allseq + warnings <- c() + for (chr in allseq) { + print(chr) + d <- domains[domains$seqname == chr, ] + fOrder <- which(features$seqname == chr) + f <- features[fOrder, ] + dctr <- (d$start+d$end)/2 + fctr <- (f$start+f$end)/2 + res[[chr]] <- .domOverlap(d$start, d$end, f$start, f$end, ...) + warnings <- c(warnings, res[[chr]]$warn) + res[[chr]] <- res[[chr]]$overlap + + di <- 0; + res[[chr]] <- lapply(res[[chr]], function(v) { + di <<- di + 1; + if (length(v) > 0) + fOrder[v[which.min(abs(fctr[v] - dctr[di]))]] + else + NA + }) + } + res +} + + +fix_overlap <- function(x.in) { + # create matrix to determine queue size (overlap) + x.q <- rbind(cbind(x.in$start, 1, 1:nrow(x.in)), + cbind(x.in$end+1, -1, 1:nrow(x.in))) + # sort + x.q <- x.q[order(x.q[,1], x.q[,2]),] + x.q <- cbind(x.q, queue=cumsum(x.q[,2])) + + end <- which(x.q[,4] == 0); + start <- c(1, end+1); + start <- start[-length(start)]; + + # This removes all overlapping features! + #diff_res <- apply(cbind(start, end), 1, diff) + #diff_hits <- which(diff_res > 1) + #diff_sel <- unlist(apply(cbind(start[diff_hits], + # end[diff_hits]), + # 1, function(x) x[1]:x[2])) + #x.in[-unique(x.q[diff_sel,3]),] + + data.frame(start=x.q[start,1], end=x.q[end,1]-1) +} + +plot_delta_dom_sizes <- function(delta_doms_pos, delta_doms_neg, chr_sizes, file="delta_doms_sizes.pdf") { + pdf(file=file) + + delta_doms <- rbind(delta_doms_pos, delta_doms_neg); + sizes <- delta_doms$end-delta_doms$start+1 + + empty_sizes <- chr_sizes; empty_sizes[names(chr_sizes)] <- 0; + + delta_sizes <- pos_delta_sizes <- neg_delta_sizes <- empty_sizes + tmp_sizes <- tapply(sizes, delta_doms$seqname, function(x) sum(x)) + delta_sizes[names(tmp_sizes)] <- tmp_sizes + + print(cbind(median(sizes), length(sizes))); + plot(dens <- density(log10(sizes)), lwd=2, + main="Size of delta Lamin domains (+ & -)", xlab="log10(bp)", ylab="Density") + abline(v=median(log10(sizes)), lwd=2, col=2) + + barplot(round(delta_sizes/chr_sizes*100, 2), las=2, ylab="% of chromosome size", + main=paste("delta Lamin fractions (+ & -)\nGenome-wide: ", + round(sum(as.numeric(delta_sizes))/sum(as.numeric(chr_sizes))*100, 2), "%", sep="")) + + delta_sizes <- chr_sizes + tmp_sizes <- tapply(sizes, delta_doms$seqname, function(x) sum(x)) + delta_sizes[names(delta_sizes)] <- 0; + delta_sizes[names(tmp_sizes)] <- tmp_sizes + + pos_sizes <- delta_doms_pos$end-delta_doms_pos$start+1 + pos_delta_sizes <- empty_sizes + tmp_sizes <- tapply(pos_sizes, delta_doms_pos$seqname, function(x) sum(x)) + pos_delta_sizes[names(tmp_sizes)] <- tmp_sizes + print(cbind(median(pos_sizes), length(pos_sizes))); + plot(dens <- density(log10(pos_sizes)), lwd=2, + main="Size of delta Lamin domains (+)", xlab="log10(bp)", ylab="Density") + abline(v=median(log10(pos_sizes)), lwd=2, col=2) + + pos_delta_frac <- round(pos_delta_sizes/chr_sizes*100, 2) + barplot(pos_delta_frac, las=2, ylab="% of chromosome size", col="blue", + main=paste("delta Lamin fractions (+)\nGenome-wide: ", + round(sum(as.numeric(pos_delta_sizes))/sum(as.numeric(chr_sizes))*100, 2), "%", sep="")) + + neg_sizes <- delta_doms_neg$end-delta_doms_neg$start+1 + neg_delta_sizes <- empty_sizes + tmp_sizes <- tapply(neg_sizes, delta_doms_neg$seqname, function(x) sum(x)) + neg_delta_sizes[names(tmp_sizes)] <- tmp_sizes + print(cbind(median(neg_sizes), length(neg_sizes))); + plot(dens <- density(log10(neg_sizes)), lwd=2, + main="Size of delta Lamin domains (-)", xlab="log10(bp)", ylab="Density") + abline(v=median(log10(neg_sizes)), lwd=2, col=2) + + neg_delta_frac <- round(neg_delta_sizes/chr_sizes*100, 2) + barplot(neg_delta_frac, las=2, ylab="% of chromosome size", col="orange", + main=paste("delta Lamin fractions (-)\nGenome-wide: ", + round(sum(as.numeric(neg_delta_sizes))/sum(as.numeric(chr_sizes))*100, 2), "%", sep="")) + + pos_neg_mat <- rbind(pos_delta_frac, neg_delta_frac) + barplot(pos_neg_mat, las=2, ylab="% of chromosome size", col=c("blue","orange"), + main=paste("delta Lamin fractions (+ & -)\nGenome-wide: ", + round(sum(as.numeric(delta_sizes))/sum(as.numeric(chr_sizes))*100, 2), "%", sep="")) + legend("topright", "(x,y)", fill=c("blue","orange"), legend=c("+", "-")) + + dev.off() +} + +plot_detail <- function(ES, NP, NP_ES, delta_doms_pos, delta_doms_neg, chr, from, to, id="default", path="plots", pdf=FALSE, lwd=1) { + if (pdf) { + pdf(file=paste(path, "/detail_", chr, "_", from, "-", to, "_", id, ".pdf", sep=""), width=15) + } else { +# png(file=paste(path, "/detail_", chr, "_", from, "-", to, "_", id, ".png", sep=""), width=960) + } + + ES_chr <- ES[ES$seqname==chr,] + NP_chr <- NP[ES$seqname==chr,] + NP_ES_chr <- NP_ES[ES$seqname==chr,] + delta_doms_neg_chr <- delta_doms_neg[delta_doms_neg$seqname==chr,] + delta_doms_pos_chr <- delta_doms_pos[delta_doms_pos$seqname==chr,] + + ext <- (to - from) + print(paste("Plotting region ", chr, ":", from-ext, "-", to+ext, sep="")) + + par(mfrow=c(4,1), mar=c(0,2,0,1)) + plot(ES_chr$start, runmed(ES_chr$score, k=5), type="h", col="orange", xlim=c(from-ext, to+ext), xaxt="n", xaxs="i", lwd=lwd) + abline(v=c(from,to), lty=2) + plot(NP_chr$start, runmed(NP_chr$score, k=5), type="h", col="blue", xlim=c(from-ext, to+ext), xaxt="n", xaxs="i", lwd=lwd) + abline(v=c(from,to), lty=2) + plot(NP_ES_chr$start, runmed(NP_ES_chr$score, k=5), type="h", col=ifelse(runmed(NP_ES_chr$score, k=5) > 0, "blue", "orange"), xlim=c(from-ext, to+ext), xaxt="n", xaxs="i", lwd=lwd) + abline(v=c(from,to), lty=2) + plot(0, type="n", xlim=c(from-ext, to+ext), ylim=c(0, 2), yaxt="n", axes=FALSE, xlab="", ylab="", xaxs="i") + par(mar=c(8,2,0,1), new=TRUE) + plot(0, type="n", xlim=c(from-ext, to+ext), ylim=c(0, 2), yaxt="n", xaxs="i", xlab=paste("Chromosome", sub("chr", "", chr), "(bp)")) + if (!is.null(delta_doms_neg_chr)) + rect(delta_doms_neg_chr$start, 1.5, delta_doms_neg_chr$end, 1.7, col="grey") + if (!is.null(delta_doms_pos_chr)) + rect(delta_doms_pos_chr$start, 1.8, delta_doms_pos_chr$end, 2, col="grey") + +# dev.off() +} + + +plot_three_singles <- function(ES, NP, EF, ES_doms, NP_doms, EF_doms, id="default", path="plots", pdf=FALSE) { + chroms <- unique(ES$seqname) + for (i in chroms) { + print(i) + + if (pdf) { + pdf(file=paste(path, "/three_singles_", id, "_", i, ".pdf", sep=""), width=15) + } else { + png(file=paste(path, "/three_singles_", id, "_", i, ".png", sep=""), width=960) + } + + par(mfrow=c(3,1)) + + ylim <- quantile(ES$score[ES$seqname == i], probs=c(0.01,0.99)) + plot(ES$start[ES$seqname == i], runmed(ES$score[ES$seqname == i], 49), type="h", bty="n", + xlab="genomic location (bp)", ylab="LaminB1 interaction", + ylim=ylim, main="ES cells", col="orange") + if (!is.null(ES_doms) && (nrow(ES_doms[ES_doms$seqname == i,]) > 0)) + rect(ES_doms$start[ES_doms$seqname == i], ylim[1], ES_doms$end[ES_doms$seqname == i], ylim[1]+0.2, col = "grey"); + + ylim <- quantile(NP$score[NP$seqname == i], probs=c(0.01,0.99)) + plot(NP$start[NP$seqname == i], runmed(NP$score[NP$seqname == i], 49), type="h", bty="n", + xlab="genomic location (bp)", ylab="LaminB1 interaction", + ylim=ylim, main="NP cells", col="blue") + if (!is.null(NP_doms) && (nrow(NP_doms[NP_doms$seqname == i,]) > 0)) + rect(NP_doms$start[NP_doms$seqname == i], ylim[1], NP_doms$end[NP_doms$seqname == i], ylim[1]+0.2, col = "grey"); + + ylim <- quantile(EF$score[EF$seqname == i], probs=c(0.01,0.99)) + plot(EF$start[EF$seqname == i], runmed(EF$score[EF$seqname == i], 49), type="h", bty="n", + xlab="genomic location (bp)", ylab="LaminB1 interaction", + ylim=ylim, main="EF cells", col="magenta") + if (!is.null(EF_doms) && (nrow(EF_doms[EF_doms$seqname == i,]) > 0)) + rect(EF_doms$start[EF_doms$seqname == i], ylim[1], EF_doms$end[EF_doms$seqname == i], ylim[1]+0.2, col = "grey"); + + dev.off() + } +} + +plot_detail_specific <- function(signal1, signal2, delta, Entrez, sel) { + chr <- min(sel) + plot_detail(signal1, signal2, delta, NULL, NULL, chr=Entrez$seqname[chr], + from=Entrez$start[min(sel)], to=Entrez$end[max(sel)]) + rect(Entrez$start[Entrez$seqname==Entrez$seqname[chr] & Entrez$strand=="+"], 1, + Entrez$end[Entrez$seqname==Entrez$seqname[chr] & Entrez$strand=="+"], 1.2, col=2) + rect(Entrez$start[Entrez$seqname==Entrez$seqname[chr] & Entrez$strand=="-"], 0.7, + Entrez$end[Entrez$seqname==Entrez$seqname[chr] & Entrez$strand=="-"], 0.9, col=2) +} + +plot_detail_specific_Ensembl <- function(signal1, signal2, delta, Entrez, Ensembl, sel) { + chr <- min(sel) + plot_detail(signal1, signal2, delta, NULL, NULL, chr=Entrez$seqname[chr], + from=Entrez$start[min(sel)], to=Entrez$end[max(sel)]) + + rect(Entrez$start[Entrez$seqname==Entrez$seqname[chr] & Entrez$strand=="+"], 1, + Entrez$end[Entrez$seqname==Entrez$seqname[chr] & Entrez$strand=="+"], 1.2, col=2) + rect(Entrez$start[Entrez$seqname==Entrez$seqname[chr] & Entrez$strand=="-"], 0.7, + Entrez$end[Entrez$seqname==Entrez$seqname[chr] & Entrez$strand=="-"], 0.9, col=2) + + rect(Ensembl$start[Ensembl$seqname==Entrez$seqname[chr] & Ensembl$strand=="+"], 0.4, + Ensembl$end[Ensembl$seqname==Entrez$seqname[chr] & Ensembl$strand=="+"], 0.6, col=rgb(0,1,0,0.3)) + rect(Ensembl$start[Ensembl$seqname==Entrez$seqname[chr] & Ensembl$strand=="-"], 0.3, + Ensembl$end[Ensembl$seqname==Entrez$seqname[chr] & Ensembl$strand=="-"], 0.5, col=rgb(0,1,0,0.3)) +} + +plot_single_and_comb <- function(ES, NP, NP_ES, ES_doms, NP_doms, delta_doms_pos, delta_doms_neg, comb="delta", id="default", path="plots", pdf=FALSE) { + chroms <- unique(ES$seqname) + for (i in chroms) { + print(i) + + if (pdf) { + pdf(file=paste(path, "/single_and_", comb, "_", id, "_", i, ".pdf", sep=""), width=15) + } else { + png(file=paste(path, "/single_and_", comb, "_", id, "_", i, ".png", sep=""), width=960) + } + + par(mfrow=c(3,1)) + + ylim <- quantile(ES$score[ES$seqname == i], probs=c(0.01,0.99)) + plot(ES$start[ES$seqname == i], runmed(ES$score[ES$seqname == i], 49), type="h", bty="n", + xlab="genomic location (bp)", ylab="LaminB1 interaction", + ylim=ylim, main="ES cells", col="orange") + if (!is.null(ES_doms) && (nrow(ES_doms[ES_doms$seqname == i,]) > 0)) + rect(ES_doms$start[ES_doms$seqname == i], ylim[1], ES_doms$end[ES_doms$seqname == i], ylim[1]+0.2, col = "grey"); + + ylim <- quantile(NP$score[ES$seqname == i], probs=c(0.01,0.99)) + plot(NP$start[NP$seqname == i], runmed(NP$score[NP$seqname == i], 49), type="h", bty="n", + xlab="genomic location (bp)", ylab="LaminB1 interaction", + ylim=ylim, main="NP cells", col="blue") + if (!is.null(NP_doms) && (nrow(NP_doms[NP_doms$seqname == i,]) > 0)) + rect(NP_doms$start[NP_doms$seqname == i], ylim[1], NP_doms$end[NP_doms$seqname == i], ylim[1]+0.2, col = "grey"); + + ylim <- quantile(NP_ES$score[ES$seqname == i], probs=c(0.01,0.99)) + delta_signal <- runmed(NP_ES$score[NP_ES$seqname == i], 49); + delta_col <- ifelse(delta_signal > 0, "blue", "orange") + plot(NP_ES$start[NP_ES$seqname == i], delta_signal, type="h", bty="n", + xlab="genomic location (bp)", ylab="LaminB1 interaction", + ylim=ylim, main=paste(comb, "Lamin"), col=delta_col) + + if (nrow(delta_doms_pos[delta_doms_pos$seqname == i,]) > 0) { + rect(delta_doms_pos$start[delta_doms_pos$seqname == i], ylim[2], + delta_doms_pos$end[delta_doms_pos$seqname == i], ylim[2]+0.2, col = "grey"); + } + if (nrow(delta_doms_neg[delta_doms_neg$seqname == i,]) > 0) { + rect(delta_doms_neg$start[delta_doms_neg$seqname == i], ylim[1], + delta_doms_neg$end[delta_doms_neg$seqname == i], ylim[1]+0.2, col = "grey"); + } + + dev.off() + } +} + +### OK (OLD) +#subAlign <- function(aln,idx){ +# if (is.logical(idx)) idx <- which(idx); +# +# newIntvlIdx <- rep(NA, nrow(aln$relDom)) +# newIntvlIdx[idx] <- 1:length(idx) +# +# alnIdx <- (aln$aln$relDomIdx %in% rownames(aln$relDom)[idx]) +# aln$aln <- aln$aln[alnIdx,] +# aln$aln$relDomIdx <- newIntvlIdx[aln$aln$relDomIdx]; +# +# aln$relDom <- aln$relDom[idx,] +# +# aln +#} + +subAlign <- function(aln,idx){ + if (is.logical(idx)) idx <- which(idx); + + newIntvlIdx <- rep(NA, nrow(aln$relDom)) + newIntvlIdx[idx] <- 1:length(idx) + + aln$relDom <- aln$relDom[idx,] + aln$aln <- aln$aln[aln$aln$relDomIdx %in% idx,] + aln$aln$relDomIdx <- newIntvlIdx[aln$aln$relDomIdx]; + + aln +} + +plot_profile <- function(GFF, LADs=NULL, chr="chr18", col="black", main="", type="h", + plot_LADs=TRUE, plot_stats=FALSE, plot_ylab=TRUE, k=49, ylim=NULL, sample=FALSE, ...) { + if (is.null(ylim)) ylim <- quantile(GFF$score[GFF$seqname == chr], probs=c(0.01,0.99)) + pos <- GFF$start[GFF$seqname == chr]; + scores <- runmed(GFF$score[GFF$seqname == chr], k); + if (sample) { + sel <- sample(1:length(pos), round(length(pos)/5)) + } else { + sel <- 1:length(pos); + } + if (length(col) > 1) col <- col[sel]; + + plot(pos[sel], scores[sel], type=type, bty="n", + ylab=ifelse(plot_ylab, "LaminB1 interaction (log2)", ""), xaxt="n", ylim=ylim, col=col, ...) + mtext(main, 3, -1, cex=1.5) + if (!is.null(LADs) && (nrow(LADs[LADs$seqname == chr,]) > 0)) { + if (plot_LADs) { + rect(LADs$start[LADs$seqname == chr], ylim[1], LADs$end[LADs$seqname == chr], ylim[1]+0.4, + col = grey(0.5), border=NA); + } + if (plot_stats) { + med <- median(LADs$end-LADs$start+1) + legend("topright", "(x,y)", bty="n", inset=c(0.1,-0.05), + legend=c(paste(nrow(LADs), "LADs"), paste("Median size: ", med %/% 1e3, "kb", sep=""))) + } + } +} + +## Detailed profile +plot_binding_detail <- function(chr, from, to, lwd=1, extend=0.5) { + par(mfrow=c(4,1), mar=c(0,5,0,1)) + i <- chr + + from <- from - extend*(to - from) + to <- to + extend*(to - from) + + ylim <- quantile(ES$score[ES$seqname == i], probs=c(0.05,0.95)) + plot(ES$start[ES$seqname == i], runmed(ES$score[ES$seqname == i], 9), type="h", bty="n", lwd=lwd, lend=2, + ylab="", xaxt="n", ylim=ylim, col="orange", xlim=c(from,to), cex.axis=1.5) + #mtext("Embryonic stem cells", 3, -1) + + ylim <- quantile(NP$score[NP$seqname == i], probs=c(0.05,0.95)) + plot(NP$start[NP$seqname == i], runmed(NP$score[NP$seqname == i], 9), type="h", bty="n", lwd=lwd, lend=2, + ylab="", xaxt="n", ylim=ylim, col="blue", xlim=c(from,to), cex.axis=1.5) + #mtext("Neuro-precursor cells", 3, -1) + + ylim <- quantile(AC$score[AC$seqname == i], probs=c(0.05,0.95)) + plot(AC$start[AC$seqname == i], runmed(AC$score[AC$seqname == i], 9), type="h", bty="n", lwd=lwd, lend=2, + ylab="", xaxt="n", ylim=ylim, col="magenta", xlim=c(from,to), cex.axis=1.5) + #mtext("Astrocyte cells", 3, -1) + + mtext("LaminB1 interaction (log2)", 2, -2, cex=1.5, at=0.62, outer=TRUE) + + par(mar=c(5,5,0,1)) + plot(AC$start[AC$seqname == i], runmed(AC$score[AC$seqname == i], 49), + type="n", bty="n", ylim=c(0.4, 1), xlab="", ylab="", xlim=c(from,to), yaxt="n", cex.axis=1.5, cex.lab=1.5) + mtext(paste("Genomic location (bp, ", chr, ")", sep=""), 1, 3, cex=1.5) + + rect(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.77, Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.78, col=1); + rect(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.57, Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.58, col=1); + + text(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.87, labels=Entrez_full$frame[Entrez_full$seqname == i & Entrez_full$strand == "+"], adj=c(0,0), col=1, cex=1.5) + text(Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.67, labels=Entrez_full$frame[Entrez_full$seqname == i & Entrez_full$strand == "-"], adj=c(1,0), col=1, cex=1.5) + + rect(UTRs_full$start[UTRs_full$seqname == i & UTRs_full$strand == "+"], 0.74, UTRs_full$end[UTRs_full$seqname == i & UTRs_full$strand == "+"], 0.81, col=1); + rect(UTRs_full$start[UTRs_full$seqname == i & UTRs_full$strand == "-"], 0.54, UTRs_full$end[UTRs_full$seqname == i & UTRs_full$strand == "-"], 0.61, col=1); + + rect(CDSs_full$start[CDSs_full$seqname == i & CDSs_full$strand == "+"], 0.7, CDSs_full$end[CDSs_full$seqname == i & CDSs_full$strand == "+"], 0.85, col=1); + rect(CDSs_full$start[CDSs_full$seqname == i & CDSs_full$strand == "-"], 0.5, CDSs_full$end[CDSs_full$seqname == i & CDSs_full$strand == "-"], 0.65, col=1); +} + +## Detailed profile, including H3K36me3 data +plot_binding_detail_plus_K36 <- function(chr, from, to, lwd=1, extend=0.5, ylim_LB1=NULL, ylim_K36=NULL, + axes_LB1=TRUE, axes_K36=TRUE, emph_used_genes=FALSE, cols=c("orange", "blue", "magenta")) { + par(mfrow=c(3,1), oma=c(5,0,0,0)) + i <- chr + + load(paste("../../data/external_data/Mikkelsen_Nat07/H3K36me3_data/mm9/", chr, ".ES.K36.RData", sep="")) + names(K36) <- c("seqname", "start", "end", "score") + #K36$score <- log2(K36$score) + K36 <- K36[K36$score>2,] + ES_K36 <- K36; + + load(paste("../../data/external_data/Mikkelsen_Nat07/H3K36me3_data/mm9/", chr, ".NP.K36.RData", sep="")) + names(K36) <- c("seqname", "start", "end", "score") + #K36$score <- log2(K36$score) + K36 <- K36[K36$score>2,] + NP_K36 <- K36; + + if (is.null(ylim_LB1)) + ylim_LB1 <- quantile(c(ES$score[ES$seqname == i], NP$score[NP$seqname == i], AC$score[AC$seqname == i]), probs=c(0.05,0.95)) + if (is.null(ylim_K36)) + ylim_K36 <- quantile(c(ES_K36$score[ES_K36$seqname == i], NP_K36$score[NP_K36$seqname == i]), probs=c(0.01,0.99)) + + from <- from - extend*(to - from) + to <- to + extend*(to - from) + + if (axes_LB1) { + #yaxt_LB1="s"; ylab_LB1=expression(log[2](LaminB1~interaction)); + yaxt_LB1="s"; ylab_LB1=""; # We want to plot one large label over all 3 profiles... + } else { + yaxt_LB1="n"; ylab_LB1=""; + } + + par(mar=c(5,5,3,5)) + plot(ES$start[ES$seqname == i], runmed(ES$score[ES$seqname == i], 9), type="h", bty="n", lwd=lwd, lend=2, + xlab="", xaxt="n", ylim=ylim_LB1, col=cols[1], xlim=c(from,to), cex.axis=1.5, yaxt=yaxt_LB1, ylab=ylab_LB1, cex.lab=2, cex.axis=2) + #mtext("Embryonic stem cells", 3, -1) + par(mar=c(0,5,15,5), new=TRUE) + plot(0, type="n", bty="n", xlab="", ylab="", xaxt="n", yaxt="n", xlim=c(from, to), ylim=ylim_K36, yaxs="i") + #rect(0, 0, 1e9, 100, col="orange", border=NA) + lines(ES_K36$start[ES_K36$seqname == i], ES_K36$score[ES_K36$seqname == i], type="h", bty="n", lend=2, col="black") + lines(c(ylim_K36[1], 1e9), rep(ylim_K36[1], 2), type="l", col=cols[1], lwd=3); + if (axes_K36) { + axis(4, cex.axis=2) + mtext("H3K36me3", 4, line=3, cex=1.5) + } + + par(mar=c(5,5,3,5)) + plot(NP$start[NP$seqname == i], runmed(NP$score[NP$seqname == i], 9), type="h", bty="n", lwd=lwd, lend=2, + xlab="", xaxt="n", ylim=ylim_LB1, col=cols[2], xlim=c(from,to), cex.axis=1.5, yaxt=yaxt_LB1, ylab=ylab_LB1, cex.lab=2, cex.axis=2) + #mtext("Neuro-precursor cells", 3, -1) + par(mar=c(0,5,15,5), new=TRUE) + plot(0, type="n", bty="n", xlab="", ylab="", xaxt="n", yaxt="n", xlim=c(from, to), ylim=ylim_K36, yaxs="i") + #rect(0, 0, 1e9, 100, col="blue", border=NA) + lines(NP_K36$start[NP_K36$seqname == i], NP_K36$score[NP_K36$seqname == i], type="h", bty="n", lend=2, col="black") + lines(c(ylim_K36[1], 1e9), rep(ylim_K36[1], 2), type="l", col=cols[2], lwd=3); + if (axes_K36) { + axis(4, cex.axis=2) + mtext("H3K36me3", 4, line=3, cex=1.5) + } + + par(mar=c(5,5,3,5)) + plot(AC$start[AC$seqname == i], runmed(AC$score[AC$seqname == i], 9), type="h", bty="n", lwd=lwd, lend=2, + xlab="", xaxt="n", ylim=ylim_LB1, col=cols[3], xlim=c(from,to), cex.axis=1.5, yaxt=yaxt_LB1, ylab=ylab_LB1, cex.lab=2, cex.axis=2) + #mtext("Astrocyte cells", 3, -1) + + #yaxt_LB1="s"; ylab_LB1=expression(log[2](LaminB1~interaction)); + + mtext(expression(log[2](LaminB1~interaction)), 2, -2, cex=1.5, at=0.55, outer=TRUE) + #mtext("H3K36me3 (ChIP-seq data)", 4, -2, cex=1.5, at=0.53, outer=TRUE) + + #par(mar=c(5,5,0,5)) + par(new=TRUE) + par(mar=c(0,5,3,5)) + options("scipen"=5) + plot(AC$start[AC$seqname == i], runmed(AC$score[AC$seqname == i], 49), + type="n", bty="n", ylim=c(0.4, 2.1), xlab="", ylab="", xlim=c(from,to), yaxt="n", cex.axis=2) + options("scipen"=0) + mtext(paste("chromosomal location (bp, ", chr, ")", sep=""), 1, 3.5, cex=1.5) + + rect(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.77, Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.78, border=NA, col=1); + rect(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.57, Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.58, border=NA, col=1); + + text(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.88, labels=Entrez_full$frame[Entrez_full$seqname == i & Entrez_full$strand == "+"], adj=c(0,0), col=1, cex=2) + text(Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.68, labels=Entrez_full$frame[Entrez_full$seqname == i & Entrez_full$strand == "-"], adj=c(1,0), col=1, cex=2) + + rect(UTRs_full$start[UTRs_full$seqname == i & UTRs_full$strand == "+"], 0.74, UTRs_full$end[UTRs_full$seqname == i & UTRs_full$strand == "+"], 0.81, border=NA, col=1); + rect(UTRs_full$start[UTRs_full$seqname == i & UTRs_full$strand == "-"], 0.54, UTRs_full$end[UTRs_full$seqname == i & UTRs_full$strand == "-"], 0.61, border=NA, col=1); + + rect(CDSs_full$start[CDSs_full$seqname == i & CDSs_full$strand == "+"], 0.7, CDSs_full$end[CDSs_full$seqname == i & CDSs_full$strand == "+"], 0.85, border=NA, col=1); + rect(CDSs_full$start[CDSs_full$seqname == i & CDSs_full$strand == "-"], 0.5, CDSs_full$end[CDSs_full$seqname == i & CDSs_full$strand == "-"], 0.65, border=NA, col=1); + + # Again, only genes used in analysis, in red. + if (emph_used_genes) { + rect(Entrez$start[Entrez$seqname == i & Entrez$strand == "+"], 0.77, Entrez$end[Entrez$seqname == i & Entrez$strand == "+"], 0.78, border=NA, col=2); + rect(Entrez$start[Entrez$seqname == i & Entrez$strand == "-"], 0.57, Entrez$end[Entrez$seqname == i & Entrez$strand == "-"], 0.58, border=NA, col=2); + + text(Entrez$start[Entrez$seqname == i & Entrez$strand == "+"], 0.88, labels=Entrez$frame[Entrez$seqname == i & Entrez$strand == "+"], adj=c(0,0), col=2, cex=2) + text(Entrez$end[Entrez$seqname == i & Entrez$strand == "-"], 0.68, labels=Entrez$frame[Entrez$seqname == i & Entrez$strand == "-"], adj=c(1,0), col=2, cex=2) + + rect(UTRs$start[UTRs$seqname == i & UTRs$strand == "+"], 0.74, UTRs$end[UTRs$seqname == i & UTRs$strand == "+"], 0.81, border=NA, col=2); + rect(UTRs$start[UTRs$seqname == i & UTRs$strand == "-"], 0.54, UTRs$end[UTRs$seqname == i & UTRs$strand == "-"], 0.61, border=NA, col=2); + + rect(CDSs$start[CDSs$seqname == i & CDSs$strand == "+"], 0.7, CDSs$end[CDSs$seqname == i & CDSs$strand == "+"], 0.85, border=NA, col=2); + rect(CDSs$start[CDSs$seqname == i & CDSs$strand == "-"], 0.5, CDSs$end[CDSs$seqname == i & CDSs$strand == "-"], 0.65, border=NA, col=2); + } + + #sel_genes <- Entrez_full$frame[Entrez_full$seqname == i & Entrez_full$start > from & Entrez_full$end < to] + sel_genes <- Entrez$frame[Entrez$seqname == i & Entrez$start > from & Entrez$end < to] + invisible(sel_genes) +} + +## Detailed profile, including H3K36me3 data +plot_binding_detail_plus_K36_NP_AC <- function(signal1=NP, signal2=AC, chr, from, to, lwd=1, extend=0.5, ylim_LB1=NULL, ylim_K36=NULL, + axes_LB1=TRUE, axes_K36=TRUE, emph_used_genes=FALSE, cols=c("orange", "blue", "magenta"), + ylab_text=expression(log[2](LaminB1~interaction))) { + par(mfrow=c(2,1), oma=c(5,0,0,0)) + i <- chr + + load(paste("../../data/external_data/Mikkelsen_Nat07/H3K36me3_data/mm9/", chr, ".NP.K36.RData", sep="")) + names(K36) <- c("seqname", "start", "end", "score") + #K36$score <- log2(K36$score) + K36 <- K36[K36$score>2,] + signal1_K36 <- K36; + + if (is.null(ylim_LB1)) + ylim_LB1 <- quantile(c(signal1$score[signal1$seqname == i], signal2$score[signal2$seqname == i]), probs=c(0.05,0.95)) + if (is.null(ylim_K36)) + ylim_K36 <- quantile(c(signal1_K36$score[signal1_K36$seqname == i]), probs=c(0.01,0.99)) + + from <- from - extend*(to - from) + to <- to + extend*(to - from) + + if (axes_LB1) { + #yaxt_LB1="s"; ylab_LB1=expression(log[2](LaminB1~interaction)); + yaxt_LB1="s"; ylab_LB1=""; # We want to plot one large label over all 3 profiles... + } else { + yaxt_LB1="n"; ylab_LB1=""; + } + + par(mar=c(5,5,3,5)) + plot(signal1$start[signal1$seqname == i], runmed(signal1$score[signal1$seqname == i], 9), type="h", bty="n", lwd=lwd, lend=2, + xlab="", xaxt="n", ylim=ylim_LB1, col=cols[2], xlim=c(from,to), cex.axis=1.5, yaxt=yaxt_LB1, ylab=ylab_LB1, cex.lab=2, cex.axis=2) + #mtext("Neuro-precursor cells", 3, -1) + par(mar=c(0,5,15,5), new=TRUE) + plot(0, type="n", bty="n", xlab="", ylab="", xaxt="n", yaxt="n", xlim=c(from, to), ylim=ylim_K36, yaxs="i") + #rect(0, 0, 1e9, 100, col="blue", border=NA) + lines(signal1_K36$start[signal1_K36$seqname == i], signal1_K36$score[signal1_K36$seqname == i], type="h", bty="n", lend=2, col="black") + lines(c(ylim_K36[1], 1e9), rep(ylim_K36[1], 2), type="l", col=cols[2], lwd=3); + if (axes_K36) { + axis(4, cex.axis=2) + mtext("H3K36me3", 4, line=3, cex=1.5) + } + + par(mar=c(5,5,3,5)) + plot(signal2$start[signal2$seqname == i], runmed(signal2$score[signal2$seqname == i], 9), type="h", bty="n", lwd=lwd, lend=2, + xlab="", xaxt="n", ylim=ylim_LB1, col=cols[3], xlim=c(from,to), cex.axis=1.5, yaxt=yaxt_LB1, ylab=ylab_LB1, cex.lab=2, cex.axis=2) + #mtext("Astrocytes", 3, -1) + + #yaxt_LB1="s"; ylab_LB1=expression(log[2](LaminB1~interaction)); + + #mtext(expression(log[2](LaminB1~interaction)), 2, -2, cex=1.5, at=0.55, outer=TRUE) + mtext(ylab_text, 2, -2, cex=1.5, at=0.55, outer=TRUE) + #mtext("H3K36me3 (ChIP-seq data)", 4, -2, cex=1.5, at=0.53, outer=TRUE) + + #par(mar=c(5,5,0,5)) + par(new=TRUE) + par(mar=c(0,5,3,5)) + options("scipen"=5) + plot(signal2$start[signal2$seqname == i], runmed(signal2$score[signal2$seqname == i], 49), + type="n", bty="n", ylim=c(0.4, 2.1), xlab="", ylab="", xlim=c(from,to), yaxt="n", cex.axis=2) + options("scipen"=0) + mtext(paste("chromosomal location (bp, ", chr, ")", sep=""), 1, 3.5, cex=1.5) + + rect(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.77, Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.78, border=NA, col=1); + rect(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.57, Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.58, border=NA, col=1); + + text(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.88, labels=Entrez_full$frame[Entrez_full$seqname == i & Entrez_full$strand == "+"], adj=c(0,0), col=1, cex=2) + text(Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.68, labels=Entrez_full$frame[Entrez_full$seqname == i & Entrez_full$strand == "-"], adj=c(1,0), col=1, cex=2) + + rect(UTRs_full$start[UTRs_full$seqname == i & UTRs_full$strand == "+"], 0.74, UTRs_full$end[UTRs_full$seqname == i & UTRs_full$strand == "+"], 0.81, border=NA, col=1); + rect(UTRs_full$start[UTRs_full$seqname == i & UTRs_full$strand == "-"], 0.54, UTRs_full$end[UTRs_full$seqname == i & UTRs_full$strand == "-"], 0.61, border=NA, col=1); + + rect(CDSs_full$start[CDSs_full$seqname == i & CDSs_full$strand == "+"], 0.7, CDSs_full$end[CDSs_full$seqname == i & CDSs_full$strand == "+"], 0.85, border=NA, col=1); + rect(CDSs_full$start[CDSs_full$seqname == i & CDSs_full$strand == "-"], 0.5, CDSs_full$end[CDSs_full$seqname == i & CDSs_full$strand == "-"], 0.65, border=NA, col=1); + + # Again, only genes used in analysis, in red. + if (emph_used_genes) { + rect(Entrez$start[Entrez$seqname == i & Entrez$strand == "+"], 0.77, Entrez$end[Entrez$seqname == i & Entrez$strand == "+"], 0.78, border=NA, col=2); + rect(Entrez$start[Entrez$seqname == i & Entrez$strand == "-"], 0.57, Entrez$end[Entrez$seqname == i & Entrez$strand == "-"], 0.58, border=NA, col=2); + + text(Entrez$start[Entrez$seqname == i & Entrez$strand == "+"], 0.88, labels=Entrez$frame[Entrez$seqname == i & Entrez$strand == "+"], adj=c(0,0), col=2, cex=2) + text(Entrez$end[Entrez$seqname == i & Entrez$strand == "-"], 0.68, labels=Entrez$frame[Entrez$seqname == i & Entrez$strand == "-"], adj=c(1,0), col=2, cex=2) + + rect(UTRs$start[UTRs$seqname == i & UTRs$strand == "+"], 0.74, UTRs$end[UTRs$seqname == i & UTRs$strand == "+"], 0.81, border=NA, col=2); + rect(UTRs$start[UTRs$seqname == i & UTRs$strand == "-"], 0.54, UTRs$end[UTRs$seqname == i & UTRs$strand == "-"], 0.61, border=NA, col=2); + + rect(CDSs$start[CDSs$seqname == i & CDSs$strand == "+"], 0.7, CDSs$end[CDSs$seqname == i & CDSs$strand == "+"], 0.85, border=NA, col=2); + rect(CDSs$start[CDSs$seqname == i & CDSs$strand == "-"], 0.5, CDSs$end[CDSs$seqname == i & CDSs$strand == "-"], 0.65, border=NA, col=2); + } + + #sel_genes <- Entrez_full$frame[Entrez_full$seqname == i & Entrez_full$start > from & Entrez_full$end < to] + sel_genes <- Entrez$frame[Entrez$seqname == i & Entrez$start > from & Entrez$end < to] + invisible(sel_genes) +} + +# Used to shift doms half a inter-dom size up, i.e., rotate them circularly over the genome. +shiftDoms <- function(doms, seq_dims) { + shift_doms <- NULL; + for (chr in unique(doms$seqname)) { + end <- seq_dims$end[seq_dims$seqname==chr] + dom_chr <- doms[doms$seqname == chr,] + n <- nrow(dom_chr) + + ctr <- floor((dom_chr$start + dom_chr$end)/2) + ictr <- floor((dom_chr$end[1:(n-1)] + dom_chr$start[2:n])/2) + ictr <- c(ictr, floor((dom_chr$end[n] + end)/2)); + + dom_chr$start <- ctr; + dom_chr$end <- ictr; + + shift_doms <- rbind(shift_doms, dom_chr); + } + invisible(shift_doms) +} + +combineAlign <- function(aln1, aln2) { + aln2$aln$relDomIdx <- aln2$aln$relDomIdx + nrow(aln1$relDom) + aln <- list(aln = rbind(aln1$aln, aln2$aln), + relDom = rbind(aln1$relDom, aln2$relDom)); + invisible(aln); +} + +func.mem.use <- function() { + sum_size <- 0; + for (i in ls(envir=parent.frame())) { + sum_size <- sum_size + object.size(eval(parse(text=i), envir=parent.frame())); + } + + print(sum_size, units="auto") + invisible(sum_size); +} + +mem.use <- function() { + sort(sapply(ls(envir=.GlobalEnv), function(x) object.size(eval(parse(text=x))))) +} + +# Convert genomic locations back to probe indices +get_probe_coords <- function(GFF, template=NP_ES) { + probes <- CGintvlOverlap(GFF, template, type="center", checkData=FALSE) + ranges <- lapply(probes, function(x) lapply(x, range, na.rm=T)) + probe_coords <- NULL; + for (i in names(ranges)) { + probe_coords <- rbind(probe_coords, data.frame(start=unlist(lapply(ranges[[i]], "[[", 1)), + end=unlist(lapply(ranges[[i]], "[[", 2)))) + } + + probe_coords[probe_coords == Inf | probe_coords == -Inf] <- NA; + + invisible(probe_coords) +} + +# Used to extend GFF based on number of array probes. Used in synteny analysis of intergenic regions. +ext_probe_coords <- function(probe_coords, template=NP_ES, upstr=25, downstr=25) { + chrom_mins <- tapply(1:nrow(template), template$seqname, min) + chrom_maxs <- tapply(1:nrow(template), template$seqname, max) + + probe_coords$start <- pmax(probe_coords$start - upstr, + chrom_mins[template$seqname[probe_coords$start]], na.rm=T) + probe_coords$end <- pmin(probe_coords$end + downstr, + chrom_maxs[template$seqname[probe_coords$end]], na.rm=T) + + invisible(probe_coords) +} + +# Inverse of get_probe_coords() +get_chr_coords <- function(probe_coords, template=NP_ES) { + res <- data.frame(seqname=template$seqname[probe_coords[,1]], + start=template$start[probe_coords[,1]], + end=template$end[probe_coords[,2]], + stringsAsFactors=FALSE) + + hits <- which(is.na(res$start)) + chrom_mins <- tapply(template$start, template$seqname, min) +# res$start[hits] <- chrom_mins[res$seqname[hits]] + + hits <- which(is.na(res$end)) + chrom_maxs <- tapply(template$end, template$seqname, max) +# res$end[hits] <- chrom_maxs[res$seqname[hits]] + + invisible(res) +} + +get_dom_border_feats <- function(doms, feats, winsize=10000) { + bndr <- CGintvlBndr(doms) + bndrExt <- CGintvlExtend(bndr, upstr=winsize, downstr=0, chromStartEnd=chromSE_MM_NCBIm37) + + feats$center <- floor((feats$start + feats$end)/2) + feats$start <- feats$end <- feats$center; + + # Return doms + bndrfeat <- unlist(CGintvlOverlap(feats, bndrExt, type="minimal")) + + # Create 0/1 vector, also taking care of duplicates + res <- rep(FALSE, nrow(bndr)) + res[bndrfeat] <- TRUE; + + invisible(res) +} + +get_dom_border_feats_outward <- function(doms, feats, winsize=10000) { + bndr <- CGintvlBndr(doms) + bndrExt <- CGintvlExtend(bndr, upstr=winsize, downstr=0, chromStartEnd=chromSE_MM_NCBIm37) + + feats5 <- feats[feats$strand=="-",]; feats5$start <- feats5$end # Select appropriate TSSs + feats3 <- feats[feats$strand=="+",]; feats3$end <- feats3$start # Select appropriate TSSs + + bndr5Ext <- bndrExt[seq(1,nrow(bndrExt), 2),] + bndr3Ext <- bndrExt[seq(2,nrow(bndrExt), 2),] + + # Return doms + bndr5feat <- unlist(CGintvlOverlap(feats5, bndr5Ext, type="minimal")) + bndr3feat <- unlist(CGintvlOverlap(feats3, bndr3Ext, type="minimal")) + + # Create 0/1 vector, also taking care of duplicates + res <- rep(FALSE, nrow(bndrExt)) + res[seq(1, nrow(bndrExt), 2)][bndr5feat] <- TRUE + res[seq(2, nrow(bndrExt), 2)][bndr3feat] <- TRUE + + invisible(res) +} + +# OK +get_bindata <- function(aln, bins=seq(-25*1e3, 25*1e3, length.out=11)) { + aln$aln$center <- (aln$aln$start + aln$aln$end)/2 + + # Retain only features of which the center lies within a bin. + aln$aln <- aln$aln[aln$aln$center < max(bins) & aln$aln$center > min(bins),] + + # Do actual binning, making sure the bins extend to the extremes + binning <- cut(c(range(bins), aln$aln$center), breaks=bins) + binning <- binning[3:length(binning)] # Remove extremes + aln$aln$centerbin <- binning # Assign features to bins + + # For each domain, obtain features and calculate median value per bin. + print("Building binned dataset...") + binned <- tapply(1:nrow(aln$aln), aln$aln$relDomIdx, function(x) { + perc <- round((max(x) / nrow(aln$aln) * 100), 0) + cat(paste("\r", perc, "%", sep="")) + tapply(aln$aln$score[x], aln$aln$centerbin[x], median, na.rm=T) + }) + cat("\n"); + + # Convert result into bin (this could probably be done more elegantly) + binmat <- matrix(NA, nrow=length(binned), ncol=length(bins)-1) + i <- 1; + feh <- lapply(binned, function(x) { + binmat[i,] <<- x; + i <<- i+1 + }) + rownames(binmat) <- names(binned) + colnames(binmat) <- levels(binning); + + invisible(list(binmat=binmat, limits=range(bins))) +} + +get_probedata <- function(doms_probes, signal, numprobes=10, direction="both") { + binmat_probes_prime5 <- t(apply(doms_probes, 1, function(x) x[1] + c(-numprobes:numprobes))) + binmat_probes_prime3 <- t(apply(doms_probes, 1, function(x) x[2] - c(-numprobes:numprobes))) + + binmat_prime5 <- t(apply(binmat_probes_prime5, 1, function(x) signal$score[x])) + binmat_prime3 <- t(apply(binmat_probes_prime3, 1, function(x) signal$score[x])) + + if (direction == "both") { + n <- nrow(doms_probes)*2 + binmat <- matrix(NA, ncol=numprobes*2+1, nrow=n) + binmat[seq(1, n/2)*2-1,] <- binmat_prime5; + binmat[seq(1, n/2)*2,] <- binmat_prime3; + } else { + n <- nrow(doms_probes) + if (direction == "prime5") { + binmat <- binmat_prime5; + } else { + binmat <- binmat_prime3; + } + } + + rownames(binmat) <- 1:n + colnames(binmat) <- c(-numprobes:numprobes) + + invisible(list(binmat=binmat, limits=c(-numprobes,numprobes)*1200)) +} + +doFisherGeneric <- function(set1, set2) { + A <- sum(set1 & set2, na.rm=T) + B <- sum(set1 & !set2, na.rm=T) + C <- sum(!set1 & set2, na.rm=T) + D <- sum(!set1 & !set2, na.rm=T) + + tab <- matrix(c(A,C,B,D), ncol=2, dimnames=list(c("in set1","not in set1"), c("in set2","not in set2"))) + print(tab) + perc_set1 <- A/(A+B)*100; + perc_set2 <- C/(C+D)*100; + print(paste("Set 1:", perc_set1)) + print(paste("Set 2:", perc_set2)) + + res_twosided <- fisher.test(tab, alt="two.sided")$p.value + print(res_twosided) + + print(paste("Enrichment ratio:", perc_set1/perc_set2)) + res_enrich <- fisher.test(tab, alt="greater")$p.value + print(res_enrich) + + print(paste("Depletion ratio:", perc_set2/perc_set1)) + res_deplet <- fisher.test(tab, alt="less")$p.value + print(res_deplet) + + invisible(list(perc_set1=perc_set1, perc_set2=perc_set2, ratio=perc_set1/perc_set2, + p=res_twosided, p_enrich=res_enrich, p_deplet=res_deplet)) +} + +doHyperGeneric <- function(set1, set2) { + q <- length(which(set1 & set2)) + m <- length(which(set2)) + n <- length(which(!set2)) + k <- length(which(set1)) + + print(paste("Enrichment ratio:", (q / k) / ((m-q) / (m+n-k)))) + res_enrich <- phyper(q-1, m, n, k, lower.tail=FALSE) + print(res_enrich) + + print(paste("Depletion ratio:", ((m-q) / (m+n-k)) / (q / k))) + res_deplet <- phyper(q, m, n, k, lower.tail=TRUE) + print(res_deplet) + + invisible(list(enrich=res_enrich, deplet=res_deplet)) +} + +likelihood.gauss <- function (parameters, sample) { + sample <- sample[!is.na(sample) & is.finite(sample)]; + if (length(parameters) != 5) + stop ("parameters must be vector of length 5") + prop = parameters[1] + if (prop < 0) prop = 0 + if (prop > 1) prop = 1 + m1 = parameters[2] + s1 = parameters[3] + m2 = parameters[4] + s2 = parameters[5] + + #print(parameters) + return (-sum(log( prop*dnorm(sample, m1, s1) + (1-prop)*dnorm(sample, m2, s2) ))) +} + +# Not in use anymore +find_center_optim <- function(data, plot=FALSE) { + dens <- density(data, na.rm=T) + slopeSign <- diff(dens$y) > 0 + slopeSignChange <- diff(slopeSign) < 0 + h <- which(slopeSignChange == TRUE) + 1 + print(paste(length(h), "modi found.")) + + ord <- order(dens$y[h], decreasing=TRUE) + sel <- which(ord %in% c(1,2)) + center <- dens$x[mean(h[sel])] + + out <- optim(c(0.5, dens$x[h[sel[1]]], 1, dens$x[h[sel[2]]], 1), likelihood.gauss, sample = data) + + if (plot) { + plot(dens) + abline(v=dens$x[h[sel]], col=2) + abline(v=center, col=3) + curve(dnorm(x, mean=out$par[2], sd=out$par[3]), col=4, add=T) + curve(dnorm(x, mean=out$par[4], sd=out$par[5]), col=4, add=T) + } + + invisible(out); +} + +# Not in use anymore +find_center_cent <- function(data, plot=FALSE) { + dens <- density(data, na.rm=T) + slopeSign <- diff(dens$y) > 0 + slopeSignChange <- diff(slopeSign) < 0 + h <- which(slopeSignChange == TRUE) + 1 + print(paste(length(h), "modi found.")) + + ord <- order(dens$y[h], decreasing=TRUE) + sel <- which(ord %in% c(1,2)) + center <- dens$x[mean(h[sel])] + + if (plot) { + plot(dens) + abline(v=dens$x[h[sel]], col=2) + abline(v=center, col=3) + } + + invisible(center); +} + +# Not in use anymore +find_center_dip_max <- function(data, plot=FALSE) { + dens <- density(data, na.rm=T) + slopeSign <- diff(-dens$y) > 0 + slopeSignChange <- diff(slopeSign) < 0 + h <- which(slopeSignChange == TRUE) + 1 + print(paste(length(h), "dips found.")) + dip <- h[which.max(dens$y[h])] + #dip <- h[1]; # Use the first dip found. + + if (plot) { + plot(dens) + abline(v=dens$x[dip], col=2) + } + + invisible(dens$x[dip]); +} + +# Currently used method +find_center_dip <- function(data, plot=FALSE) { + dens <- density(data, na.rm=T) + + slopeSign <- diff(dens$y) > 0 + slopeSignChange <- diff(slopeSign) < 0 + modi <- which(slopeSignChange == TRUE) + 1 + print(paste(length(modi), "modi found.")) + + ord <- order(dens$y[modi], decreasing=TRUE) + sel <- sort(ord[1:2]) + + slopeSign <- diff(-dens$y) > 0 + slopeSignChange <- diff(slopeSign) < 0 + dips <- which(slopeSignChange == TRUE) + 1 + print(paste(length(dips), "dips found.")) + + # Select minimal dip inbetween two maximum modi + dip_sel <- which(dips > modi[sel[1]] & dips < modi[sel[2]]); + dip <- dips[dip_sel[which.min(dens$y[dips[dip_sel]])]] + + if (plot) { + plot(dens) + abline(v=dens$x[dip], col=2) + } + + invisible(dens$x[dip]); +} + +# Not in use anymore +find_center_clust <- function(data, plot=FALSE) { + data <- data[!is.na(data) & is.finite(data)] + clust <- kmeans(data, centers=2) + rng <- sort(unlist(tapply(data, clust$cluster, range))); + center <- mean(rng[2:3]) + dens <- density(data, na.rm=T) + + if (plot) { + plot(dens) + abline(v=clust$centers, col=2) + abline(v=center, col=3) + } + + invisible(center); +} + +cond.norm <- function (x, conditions = NULL, invariantset = NULL, dat.log.scale = TRUE, method = c("quantile", "global")){ + method <- method[1] + method.available = c("quantile", "global") + method.use <- method.available[charmatch(method, method.available)] + if (is.na(method.use)) { + stop(paste("This method, '", as.character(method), "', is not (yet) implemented. Please use 'quantile' or 'global'.", + sep = "")) + } + if (method.use == "quantile") { + if (is.null(invariantset)) { + x.dist <- apply(apply(x, 1, sort), 1, median) + x.norm <- t(apply(x, 1, function(xx, y) y[rank(xx)], + y = x.dist)) + } else { + n.col <- ncol(x) + x.rank <- t(apply(x, 1, rank)) + if (length(invariantset) == 1) { + invset <- order(apply(x.rank, 2, mad, constant = 1))[1:invariantset] + } + else { + invset <- invariantset + } + mins <- where.is.min(x.rank) + maxs <- where.is.max(x.rank) + if (is.logical(invset)) { + invset[c(mins, maxs)] <- TRUE + } + else { + invset <- unique(c(invset, mins, maxs)) + invset <- sort(invset) + } + inv.x <- x[, invset] + inv.x.dist <- apply(apply(inv.x, 1, sort), 1, median) + x.norm <- t(apply(x, 1, function(y, invset, inv.x.dist) approx(sort(rank(y)[invset]), + inv.x.dist, rank(y))$y, invset = invset, inv.x.dist = inv.x.dist)) + + if (!is.null(conditions)) { + if (nrow(x) != length(conditions)) { + stop("The length of the vector 'conditions' should correspond to the number of rows of the data. NOTE that the genes are in the columns and the conditions in the rows.") + } + cond.lev <- unique(conditions) + for (i in cond.lev) { + which.lev <- (conditions == i) + if (sum(which.lev) > 1) { + x.lev <- x.norm[which.lev, ] + x.lev.dist <- apply(apply(x.lev, 1, sort), + 1, median) + x.lev.norm <- t(apply(x.lev, 1, function(xx, + y) y[rank(xx)], y = x.lev.dist)) + x.norm[which.lev, ] <- x.lev.norm + } + } + } + } + } + else { + if (method.use == "global") { + x.mdn <- median(as.vector(x)) + x.mad <- median(apply(x, 1, mad)) + x.norm <- t(apply(x, 1, function(y) (y - median(y))/mad(y))) + x.norm <- x.mdn + x.norm * x.mad + } + else { + warning("Check that the function is correct! The method specified could not be found.") + } + } + x.norm +} + +### MUCH simpler versions of the cond.norm() function above. +### Also, this separates the empirical distribution from the actual normalisation, which can be convenient. +get_quant_norm_dist <- function (x) { + dist <- apply(apply(x, 1, sort), 1, median) + invisible(dist) +} + +do_quant_norm <- function (x, dist=NULL) { + if (is.null(dist)) dist <- get_quant_norm_dist(x); + + x.norm <- t(apply(x, 1, function(xx, y) y[rank(xx)], y = dist)) + invisible(x.norm) +} + + +plotfile <- function(filename="Rplot", type="png", width=7, height=7, device="bitmap", warn=TRUE) { + format <- switch(type, + pdf = "pdfwrite", + png = "png16m", + type); + + if (device == "bitmap") { + bitmap(file=paste(filename, "_", id(), ".", type, sep=""), + type=format, res=144, taa=ifelse(type=="png16m", 4, NA), width=width, height=height) + } else { + eval(parse(text=device))(file=paste(filename, "_", id(), ".", type, sep=""), + res=144, width=width, height=height) + } + + if (warn) print("Do not forget to use 'dev.off()' afterwards!") +} + + +get_delta_stats_orig_single <- function(delta, NS_mean, NS_sd, probes, p.adjust.method="BH", signif=0.05) { + ## Obtain significantly dissociated genes or intergenic regions + lengths <- (probes$end-probes$start+1) + means <- apply(probes, 1, function(x) { + if (is.na(x[1])) NA else + mean(delta[x[1]:x[2]], na.rm=T) + }) + p_as <- pnorm(means, mean=NS_mean, sd=NS_sd/sqrt(2*lengths), lower.tail=FALSE) + p_dis <- pnorm(means, mean=NS_mean, sd=NS_sd/sqrt(2*lengths), lower.tail=TRUE) + hits_as <- (p.adjust(p_as, method=p.adjust.method) < signif) + hits_dis <- (p.adjust(p_dis, method=p.adjust.method) < signif) + + list(p_as=p_as, p_dis=p_dis, hits_as=hits_as, hits_dis=hits_dis) +} + +get_delta_stats_orig <- function(delta, NS_mean, NS_sd, probes, probes_IG, + p.adjust.method="BH", signif=0.05) { + genes_res <- get_delta_stats_orig_single(delta, NS_mean, NS_sd, probes, p.adjust.method, signif) + IG_res <- get_delta_stats_orig_single(delta, NS_mean, NS_sd, probes_IG, p.adjust.method, signif) + + list(p_as=genes_res$p_as, p_dis=genes_res$p_dis, hits_as=genes_res$hits_as, hits_dis=genes_res$hits_dis, + IG_p_as=IG_res$p_as, IG_p_dis=IG_res$p_dis, IG_hits_as=IG_res$hits_as, IG_hits_dis=IG_res$hits_dis) +} + +## Obtain significantly dissociated/associated genes/IGs +get_delta_stats_single <- function(delta, NS_mean, NS_sd, probes, + p.adjust.method="BH", signif=0.05, acfs=NULL) { + lengths <- (probes$end-probes$start+1) + means <- apply(probes, 1, function(x) { + if (is.na(x[1])) NA else mean(delta[x[1]:x[2]], na.rm=T) + }) + + if (is.null(acfs)) + acfs <- acf(delta, lag.max=max(lengths, na.rm=T), plot=FALSE) + + NS_var <- NS_sd^2 + + covs <- acfs$acf * NS_var # Scale correlation by SD based on null distribution + #covs <- acfs$acf * var(delta) # Scale correlation by SD based on delta distribution + + kcovs <- c(0, sapply(2:max(lengths, na.rm=T), function(k) sum(covs[2:k] * (k-1):1))) + sumvars <- ((NS_var)*lengths) + (2 * kcovs[lengths]) + #sumvars <- (var(delta)*lengths) + (2 * kcovs[lengths]) + + z <- ((means*lengths) - (NS_mean*lengths)) / sqrt(sumvars) + + p_as <- pnorm(means*lengths, mean=NS_mean*lengths, sd=sqrt(sumvars), lower.tail=FALSE) + hits_as <- (p.adjust(p_as, method=p.adjust.method) < signif) + + p_dis <- pnorm(means*lengths, mean=NS_mean*lengths, sd=sqrt(sumvars), lower.tail=TRUE) + hits_dis <- (p.adjust(p_dis, method=p.adjust.method) < signif) + + # For a two-tailed test do both the lower-tailed and the upper-tailed test and + # double the P-value of the smaller of the two results. + p <- pmin(p_as, p_dis)*2 + hits <- (p.adjust(p, method=p.adjust.method) < signif) + + list(z=z, p=p, p_as=p_as, p_dis=p_dis, hits=hits, hits_as=hits_as, hits_dis=hits_dis) +} + + +## Obtain significantly dissociated/associated genes/IGs +get_delta_stats_single_mean <- function(delta, NS_mean, NS_sd, probes, + p.adjust.method="BH", signif=0.05, acfs=NULL) { + lengths <- (probes$end-probes$start+1) + means <- apply(probes, 1, function(x) { + if (is.na(x[1])) NA else mean(delta[x[1]:x[2]], na.rm=T) + }) + + if (is.null(acfs)) + acfs <- acf(delta, lag.max=max(lengths, na.rm=T), plot=FALSE) + + NS_var <- NS_sd^2 + + covs <- acfs$acf * NS_var # Scale correlation by SD based on null distribution + #covs <- acfs$acf * var(delta) # Scale correlation by SD based on delta distribution +} + +# +#### 1 +# kcovs <- NS_var * 2 * c(0, sapply(2:max(lengths, na.rm=T), function(k) sum(covs[2:k] * (k-1):1))) +# sumvars <- (lengths*NS_var) + kcovs[lengths] +# p_as <- pnorm(means*lengths, mean=NS_mean*lengths, sd=sqrt(sumvars), lower.tail=FALSE) +# p_as[1:10] +# +# +#### 2 +# kcovs <- NS_var * 2 * c(0, sapply(2:max(lengths, na.rm=T), function(k) sum(covs[2:k] * (k-1):1))) +# sumvars <- (NS_var/lengths) + (kcovs[lengths]/(lengths^2)) +# p_as <- pnorm(means, mean=NS_mean, sd=sqrt(sumvars), lower.tail=FALSE) +# p_as[1:10] +# +# +# +#### OK +# kcovs <- c(0, sapply(2:max(lengths, na.rm=T), function(k) sum(covs[2:k] * (k-1):1))) +# sumvars <- (NS_var*(lengths+2*kcovs[lengths])) +# p_as <- pnorm(means*lengths, mean=NS_mean*lengths, sd=sqrt(sumvars), lower.tail=FALSE) +# p_as[1:10] +# +#### OK +# kcovs <- c(0, sapply(2:max(lengths, na.rm=T), function(k) sum(covs[2:k] * (k-1):1))) +# sumvars <- (NS_var/lengths) + (NS_var*2*kcovs[lengths])/(lengths^2) +# p_as <- pnorm(means, mean=NS_mean, sd=sqrt(sumvars), lower.tail=FALSE) +# p_as[1:10] +# +#### OK +# kcovs <- c(0, sapply(2:max(lengths, na.rm=T), function(k) sum(covs[2:k] * (k-1):1))) +# sumvars <- (NS_var + (NS_var*2*kcovs[lengths])/lengths) / lengths +# p_as <- pnorm(means, mean=NS_mean, sd=sqrt(sumvars), lower.tail=FALSE) +# p_as[1:10] +# +#### OK +# kcovs <- c(0, sapply(2:max(lengths, na.rm=T), function(k) sum(covs[2:k] * (k-1):1))) +# sumvars <- (NS_var * (lengths + 2*kcovs[lengths]))/lengths^2 +# p_as <- pnorm(means, mean=NS_mean, sd=sqrt(sumvars), lower.tail=FALSE) +# p_as[1:10] +# +#### OK +# kcovs <- c(0, sapply(2:max(lengths, na.rm=T), function(k) sum(covs[2:k] * (k-1):1)/(k/2))) +# sumvars <- (NS_var * (1 + kcovs[lengths]))/lengths +# p_as <- pnorm(means, mean=NS_mean, sd=sqrt(sumvars), lower.tail=FALSE) +# p_as[1:10] +# + +get_delta_stats <- function(delta, NS_mean, NS_sd, probes, probes_IG, + p.adjust.method="BH", signif=0.05, acfs=NULL) { + genes_res <- get_delta_stats_single(delta, NS_mean, NS_sd, probes, p.adjust.method, signif, acfs) + IG_res <- get_delta_stats_single(delta, NS_mean, NS_sd, probes_IG, p.adjust.method, signif, acfs) + + ## 'Correct' NA results for intergenic regions: these result from the IGs being too small to calculate significance. + #IG_res$hits[is.na(IG_res$hits)] <- TRUE + #IG_res$hits_as[is.na(IG_res$hits_as)] <- TRUE + #IG_res$hits_dis[is.na(IG_res$hits_dis)] <- TRUE + + list(z=genes_res$z, p=genes_res$p, p_as=genes_res$p_as, p_dis=genes_res$p_dis, + hits=genes_res$hits, hits_as=genes_res$hits_as, hits_dis=genes_res$hits_dis, + IG_z=IG_res$z, IG_p=IG_res$p, IG_p_as=IG_res$p_as, IG_p_dis=IG_res$p_dis, + IG_hits=IG_res$hits, IG_hits_as=IG_res$hits_as, IG_hits_dis=IG_res$hits_dis) +} + +## Obtain significantly dissociated/associated running windows +get_delta_stats_win <- function(delta, NS_mean, NS_sd, k=10, + p.adjust.method="BH", signif=0.05) { + means <- runmean(delta, k=k); + + acfs <- acf(delta, lag.max=k, plot=FALSE) + + NS_var <- NS_sd^2 + + covs <- acfs$acf * NS_var # Scale correlation by SD based on null distribution + #covs <- acfs$acf * var(delta) # Scale correlation by SD based on delta distribution + + kcovs <- c(0, sapply(2:k, function(l) sum(covs[2:l] * (l-1):1))) + sumvars <- ((NS_var)*k) + (2 * kcovs[k]) + #sumvars <- (var(delta)*k) + (2 * kcovs[k]) + + z <- ((means*k) - (NS_mean*k)) / sqrt(sumvars) + + p_as <- pnorm(means*k, mean=NS_mean*k, sd=sqrt(sumvars), lower.tail=FALSE) + hits_as <- (p.adjust(p_as, method=p.adjust.method) < signif) + + p_dis <- pnorm(means*k, mean=NS_mean*k, sd=sqrt(sumvars), lower.tail=TRUE) + hits_dis <- (p.adjust(p_dis, method=p.adjust.method) < signif) + + # For a two-tailed test do both the lower-tailed and the upper-tailed test and + # double the P-value of the smaller of the two results. + p <- pmin(p_as, p_dis)*2 + hits <- (p.adjust(p, method=p.adjust.method) < signif) + + list(z=z, p=p, p_as=p_as, p_dis=p_dis, hits=hits, hits_as=hits_as, hits_dis=hits_dis) +} + + +Davies.Harte.FD <- function (n, sigma = 1, delta = 0) { + N <- 2^ceiling(log2(n)) + M <- 2*N + + S.X.tau <- sigma^2 / pi * sin(pi*delta) * gamma(1-2*delta) * + exp(lgamma((0:N)+delta) - lgamma((0:N)+1-delta)) + + if (delta < 0) + S.X.tau[1] <- -S.X.tau[1] + + S <- Re(fft(c(S.X.tau, S.X.tau[N:2]))) + + Z <- rnorm(M) + + Y <- double(M) + Y[1] <- Z[1] * sqrt(M*S[1]) + Y[2:N] <- (Z[2*(2:N)-1] + Z[2*(2:N)]*1i) * sqrt(M*S[2:N]/2) + Y[N+1] <- Z[M] * sqrt(M*S[N+1]) + Y[(N+2):M] <- Conj(Y[N:2]) + + Y <- Re(fft(Y, inverse = T)) / M + + return(Y[1:n]) +} + +## Make synteny mapping between master (e.g., mouse) and slave (e.g., human) species. +make_synteny_maps <- function(master, slave, master_template, slave_template, + master_species="mouse", slave_species="human", + format="png", sample=TRUE, info="", master_col="black", + slave_col=NULL, lwd=0.2, ylim=NULL, to_file=TRUE, outputdir="misc_figures", + postfun=NULL, postpar=NULL, prefun=NULL, prepar=NULL) { + + master_chroms <- chrom_info[[master_species]]; + master_chromsizes <- chromsize_info[[master_species]]; + + ncols <- round(sqrt(length(master_chroms))) + nrows <- ceiling(sqrt(length(master_chroms))) + tot <- ncols * nrows; + colwidths <- tapply(master_chroms, rep(1:ncols, each=nrows)[1:length(master_chroms)], + function(x) max(master_template$start[master_template$seqname %in% x])) + + if (to_file) { + type <- format; + if (format == "pdf") type <- "pdfwrite"; + if (format == "png") type <- "png16m"; + bitmap(file=paste(outputdir, "/synteny_maps_", info, "_", id(), ".", format, sep=""), + type=type, width=14*ncols, height=7*nrows, res=144, taa=4) + } + + par(oma=c(2,3,2,0), mar=c(0,2,0,2)) + nf <- layout(matrix(1:(tot*3), 3*nrows, ncols, byrow = FALSE), heights=rep(c(3,3,1.5), tot), widths=colwidths/sum(colwidths)) + + if (length(master_col) == 1) master_col <- rep(master_col, length(master)) + + if (is.null(slave_col)) { + slave_col <- rep("#000000", length(unique(slave_template$seqname))) + names(slave_col) <- sort(unique(slave_template$seqname)); + slave_chroms <- chrom_info[[slave_species]]; + slave_col[slave_chroms] <- rainbow(length(slave_chroms)); + slave_col=slave_col[slave_template$seqname] + } + + #xlim <- range(master_chromsizes[, c("start","end")]) + colwidths_chr <- rep(colwidths, each=nrows); + names(colwidths_chr) <- master_chroms; + + for (chr in master_chroms) { + print(chr) + sel <- which(master_template$seqname == chr) + if (sample) sel <- sample(sel, round(length(sel)/5)) + + xlim <- c(0, colwidths_chr[chr]); + if (!is.null(prefun)) prefun(prepar); + plot(master_template$start[sel], master[sel], type="h", xlim=xlim, ylim=ylim, + xaxt="n", bty="n", xlab="", ylab="", cex.axis=0.8, lwd=lwd, col=master_col[sel]) + if (!is.null(postfun)) postfun(postpar); + + mtext(paste(" ", chr), 3, 0, cex=1, line=-0.2, adj=0) + if (!is.null(prefun)) prefun(prepar); + plot(master_template$start[sel], slave[sel], type="h", xlim=xlim, ylim=ylim, + xaxt="n", bty="n", xlab="", ylab="", cex.axis=0.8, lwd=lwd, col=slave_col[sel]) + if (!is.null(postfun)) postfun(postpar); + + axDat <- prettyTicks(c(0, max(master_template$start[master_template$seqname == chr])), ntick = 7) + plot(master_template$start[master_template$seqname == chr], + rep(1, length(which(master_template$seqname == chr))), type="n", + bty="n", axes=FALSE, ylim=c(-0.3, 1), xlab="", ylab="", xlim=xlim) + axis(1, at = axDat$at, labels = axDat$labels, cex.axis=1, line=-2.5, cex.axis=0.8, padj=-1.2) + } + + mtext(paste("genomic location (", axDat$xunit, ")", sep=""), 1, line=0.5, cex=1.5, outer=TRUE) + mtext(text=expression(log[2](LaminB1~interaction)), 2, 1, cex=1.5, outer=TRUE) + + if (to_file) dev.off() +} + +make_synteny_maps_chr <- function(master, slave, master_template, slave_template, + master_species="mouse", slave_species="human", + chr="chr1", xlim=NULL, ylim=NULL, format="png", + sample=TRUE, main="", info="", lwd=0.2, + master_col="black", slave_col=NULL, + width=14, to_file=TRUE, outputdir="misc_figures", + prefun=NULL, prepar=NULL, postfun=NULL, postpar=NULL) { + master_chroms <- chrom_info[[master_species]]; + master_chromsizes <- chromsize_info[[master_species]]; + + if (is.null(xlim)) { + xrange <- "full"; + xlim <- as.numeric(master_chromsizes[master_chromsizes$seqname==chr,c("start","end")]) + } else { + xrange <- paste(xlim, collapse="-") + } + + if (is.null(ylim)) { + ylim_master <- quantile(master, prob=c(0.001,0.999), na.rm=T) + ylim_slave <- quantile(slave, prob=c(0.001,0.999), na.rm=T) + } else { + ylim_master <- ylim_slave <- NULL; + } + + if (to_file) { + type <- format; + if (format == "pdf") type <- "pdfwrite"; + if (format == "png") type <- "png16m"; + bitmap(file=paste(outputdir, "/synteny_maps_", chr, "_", xrange, "_", + info, "_", id(), ".", format, sep=""), + type=type, width=width, height=3, res=144, taa=4) + } + + nf <- layout(matrix(1:3, 3, 1, byrow = FALSE), heights=c(3,3,1.5)) + #par(oma=c(2,4,2,0), mar=c(0,2,0,0)) + par(mar=c(0,7,0,1), oma=c(0,0,ifelse(nchar(main) > 0, 2, 1),0)) + + if (length(master_col) == 1) master_col <- rep(master_col, length(master)) + + if (is.null(slave_col)) { + slave_col <- rep("#000000", length(unique(slave_template$seqname))) + names(slave_col) <- sort(unique(slave_template$seqname)); + slave_chroms <- chrom_info[[slave_species]]; + slave_col[slave_chroms] <- rainbow(length(slave_chroms)); + slave_col=slave_col[slave_template$seqname] + } + + sel <- which(master_template$seqname == chr) + if (sample) sel <- sample(sel, round(length(sel)/5)) + + + plot(0, type="n", xlim=xlim, ylim=ylim_master, xaxt="n", bty="n", xlab="", ylab="", cex.axis=2) + if (!is.null(prefun)) prefun(prepar); + lines(master_template$start[sel], master[sel], type="h", lwd=lwd, col=master_col[sel]) + #mtext(paste(" ", chr), 3, 0, cex=2, line=-0.2, adj=0) + if (!is.null(postfun)) postfun(postpar); + + plot(0, type="n", xlim=xlim, ylim=ylim_slave, xaxt="n", bty="n", xlab="", ylab="", cex.axis=2) + if (!is.null(prefun)) prefun(prepar); + lines(master_template$start[sel], slave[sel], type="h", lwd=lwd, col=slave_col[sel]) + if (!is.null(postfun)) postfun(postpar); + + axDat <- prettyTicks(xlim, ntick = round(width/2)) + plot(master_template$start[master_template$seqname == chr], + rep(1, length(which(master_template$seqname == chr))), type="n", + bty="n", axes=FALSE, ylim=c(0, 1), xlab="", ylab="", xlim=xlim) + axis(1, at = axDat$at, labels = axDat$labels, cex.axis=2.5, line=-7, padj=0.5) + + mtext(text=expression(log[2](nuclear~lamina~interaction)), side=2, -3, cex=2, at=0.6, outer=TRUE) + mtext(paste("location on ", master_species, " chromosome ", sub("chr", "", chr), + " (", axDat$xunit, ")", sep=""), side=3, -7, cex=2) + + mtext(main, side=3, cex=2, outer=TRUE) + + if (to_file) dev.off() +} + +make_ext_synteny_maps_chr <- function(master, slave, master_template, slave_template, + master_species="mouse", slave_species="human", + chr="chr1", xlim=NULL, format="png", sample=TRUE, + info="", master_col="black", slave_col=NULL, + lwd=0.2, ylim=NULL, width=14, to_file=TRUE, + outputdir="misc_figures", + ext_dat=NULL, postfun=NULL, postpar=NULL, + prefun=NULL, prepar=NULL) { + master_chroms <- chrom_info[[master_species]]; + master_chromsizes <- chromsize_info[[master_species]]; + + if (is.null(xlim)) { + xrange <- "full"; + xlim <- as.numeric(master_chromsizes[master_chromsizes$seqname==chr,c("start","end")]) + } else { + xrange <- paste(xlim, collapse="-") + } + + if (is.null(ylim)) { + ylim_master <- quantile(master, prob=c(0.001,0.999), na.rm=T) + ylim_slave <- quantile(slave, prob=c(0.001,0.999), na.rm=T) + if (is.null(ext_dat)) { + ylim_ext <- NULL; + } else { + ylim_ext <- lapply(ext_dat, function(x) quantile(x, prob=c(0.001,0.999), na.rm=T)) + } + } else { + ylim_master <- ylim_slave <- ylim_ext <- NULL; + } + + if (to_file) { + type <- format; + if (format == "pdf") type <- "pdfwrite"; + if (format == "png") type <- "png16m"; + bitmap(file=paste(outputdir, "/synteny_maps_", chr, "_", xrange, "_", + info, "_", id(), ".", format, sep=""), + type=type, width=width, height=3, res=144, taa=4) + } + + nf <- layout(matrix(1:(length(ext_dat)+3), (length(ext_dat)+3), 1, byrow = FALSE), + heights=c(3,3,rep(1.5, length(ext_dat)),1.5)) + #par(oma=c(2,4,2,0), mar=c(0,2,0,0)) + par(mar=c(0,7,0,1), oma=c(0,0,1,0)) + + if (length(master_col) == 1) master_col <- rep(master_col, length(master)) + + if (is.null(slave_col)) { + slave_col <- rep("#000000", length(unique(slave_template$seqname))) + names(slave_col) <- sort(unique(slave_template$seqname)); + slave_chroms <- chrom_info[[slave_species]]; + slave_col[slave_chroms] <- rainbow(length(slave_chroms)); + slave_col=slave_col[slave_template$seqname] + } + + sel <- which(master_template$seqname == chr) + if (sample) sel <- sample(sel, round(length(sel)/5)) + + plot(0, type="n", xlim=xlim, ylim=ylim_master, xaxt="n", bty="n", xlab="", ylab="", cex.axis=2) + if (!is.null(prefun)) prefun(prepar); + lines(master_template$start[sel], master[sel], type="h", lwd=lwd, col=master_col[sel]) + #mtext(paste(" ", chr), 3, 0, cex=2, line=-0.2, adj=0) + legend("topleft", "(x,y)", inset=0.001, legend=master_species, cex=2, bty="n") + if (!is.null(postfun)) postfun(postpar); + + plot(0, type="n", xlim=xlim, ylim=ylim_slave, xaxt="n", bty="n", xlab="", ylab="", cex.axis=2) + if (!is.null(prefun)) prefun(prepar); + lines(master_template$start[sel], slave[sel], type="h", lwd=lwd, col=slave_col[sel]) + legend("topleft", "(x,y)", inset=0.001, legend=slave_species, cex=2, bty="n") + if (!is.null(postfun)) postfun(postpar); + + for (i in 1:length(ext_dat)) { + plot(0, type="n", xlim=xlim, ylim=ylim_ext[[i]], xaxt="n", bty="n", xlab="", ylab="", cex.axis=2) + if (!is.null(prefun)) prefun(prepar); + lines(master_template$start[sel], ext_dat[[i]][sel], type="h", lwd=lwd, col="black") + legend("topleft", "(x,y)", inset=0.001, legend=names(ext_dat)[i], cex=2, bty="n") + if (!is.null(postfun)) postfun(postpar); + } + + axDat <- prettyTicks(xlim, ntick = round(width/2)) + plot(master_template$start[master_template$seqname == chr], + rep(1, length(which(master_template$seqname == chr))), type="n", + bty="n", axes=FALSE, ylim=c(0, 1), xlab="", ylab="", xlim=xlim) + axis(1, at = axDat$at, labels = axDat$labels, cex.axis=2.5, line=-7, padj=0.5) + + mtext(paste("location on ", master_species, " chromosome ", sub("chr", "", chr), + " (", axDat$xunit, ")", sep=""), side=3, -7, cex=2) + mtext(text=expression(log[2](nuclear~lamina~interaction)), 2, -3, cex=2, at=0.6, outer=TRUE) + + if (to_file) dev.off() +} + + +draw_rainbow_bar_outertext <- function(template, chroms, format="png", header="Human\nchroms", to_file=TRUE, + mar=c(2,2,2,6), info="", outputdir="misc_figures", main_cex=2.5) { + if (to_file) { + type <- format; + if (format == "pdf") type <- "pdfwrite"; + if (format == "png") type <- "png16m"; + bitmap(file=paste(outputdir, "/rainbow_bar_outertext_", info, "_", id(), ".", format, sep=""), + type=type, res=144, taa=4, height=7, width=1) + } + + par(oma=c(0,0,4,0), mar=mar) + + chroms_cols <- rep("#000000", length(unique(template$seqname))) + names(chroms_cols) <- sort(unique(template$seqname)); + chroms_cols[chroms] <- rainbow(length(chroms)); + + n <- length(chroms); + cols <- chroms_cols[chroms] + + coly1 <- rev((1:n) / n) + coly2 <- coly1 - 1/n + + plot(0, type="n", xlim=c(0,1), ylim=c(0,1), xaxs="i", yaxs="i", axes=FALSE, xlab="", ylab="") + + rect(0, coly1, 1, coly2, col=cols, border=NA) + grid(nx=1, ny=n, col="black", lty=1); box(); + mtext(side=3, header, cex=main_cex, font=2, adj=0.5, line=-0.75, outer=TRUE) + + tickMarks <- ((0:n)/n)+(1/(2*n)); tickMarks <- tickMarks[-length(tickMarks)] + labels <- chroms + axis(side=4, at=tickMarks, labels=rev(labels), las=2, cex.axis=2) + #axis(side=1, at=tickMarks[sel], labels=c(2, 5, ">8"), cex.axis=2, padj=0.3) + + if (to_file) dev.off() +} + +draw_rainbow_bar <- function(template, chroms, format="png", header="Human\nchroms", to_file=TRUE, + mar=c(2,2,2,2), info="", outputdir="misc_figures", main_cex=2.5) { + if (to_file) { + type <- format; + if (format == "pdf") type <- "pdfwrite"; + if (format == "png") type <- "png16m"; + bitmap(file=paste(outputdir, "/rainbow_bar_", info, "_", id(), ".", format, sep=""), + type=type, res=144, taa=4, height=7, width=1) + + par(oma=c(0,0,4,0), mar=mar) + } + + chroms_cols <- rep("#000000", length(unique(template$seqname))) + names(chroms_cols) <- sort(unique(template$seqname)); + chroms_cols[chroms] <- rainbow(length(chroms)); + + n <- length(chroms); + cols <- chroms_cols[chroms] + + coly1 <- rev((1:n) / n) + coly2 <- coly1 - 1/n + + plot(0, type="n", xlim=c(0,1), ylim=c(0,1), xaxs="i", yaxs="i", axes=FALSE, xlab="", ylab="") + + rect(0, coly1, 1, coly2, col=cols, border=NA) + grid(nx=1, ny=n, col="black", lty=1); box(); + mtext(side=3, header, cex=main_cex, font=2, adj=0.5, line=-0.75, outer=TRUE) + + text(0.5, coly1-(1/(2*n)), col="white", sub("chr", "", chroms), cex=2.5, font=2, adj=0.5) + + if (to_file) dev.off() +} + +draw_rainbow_box <- function(template, chroms, format="png", header="Human chromosomes", to_file=TRUE, + mar=c(2,2,2,2), info="", outputdir="misc_figures", main_cex=2) { + n <- length(chroms); + ncols <- round(sqrt(n)) + nrows <- ceiling(sqrt(n)) + tot <- ncols * nrows; + + if (to_file) { + type <- format; + if (format == "pdf") type <- "pdfwrite"; + if (format == "png") type <- "png16m"; + bitmap(file=paste(outputdir, "/rainbow_box_", info, "_", id(), ".", format, sep=""), + type=type, res=144, taa=4, height=nrows/2, width=ncols) + } + + par(mar=mar) + + mat <- matrix(1:tot, nrow=nrows, ncol=ncols); + mat <- mat[nrows:1,] + + chroms_cols <- rep("#FFFFFF", tot); + names(chroms_cols) <- chroms; + chroms_cols[chroms] <- rainbow(n); + + image(mat, col=rev(chroms_cols), axes=FALSE); + grid(nx=nrows, ny=ncols, col="black", lty=1); box(); + mtext(side=3, header, cex=main_cex, font=2, adj=0, line=0.5) + + text(x=rep((0:(nrows-1))/(nrows-1), ncols), + y=rep(((ncols-1):0)/(ncols-1), each=nrows), labels=chroms, cex=3) + + if (to_file) dev.off() +} + + +draw_HMMcall_box <- function(calls=c("LAD", "iLAD"), cols=c("red", "green"), textcols=c("black", "black"), + format="png", header="Human lamina status", to_file=TRUE, mar=c(2,2,2,2), + main_cex=2, info="", outputdir="misc_figures") { + + if (to_file) { + type <- format; + if (format == "pdf") type <- "pdfwrite"; + if (format == "png") type <- "png16m"; + bitmap(file=paste(outputdir, "/HMMcall_box_", info, "_", id(), ".", format, sep=""), + type=type, res=144, taa=4, height=1, width=length(calls)) + } + + par(mar=mar) + + n <- length(calls) + mat <- matrix(1:n, nrow=n); + + image(mat, col=cols, axes=FALSE); + grid(nx=n, ny=1, col="black", lty=1); box(); + mtext(side=3, header, cex=main_cex, font=2, adj=0, line=0.5) + + text((0:(n-1))/(n-1), rep(0, n), calls, cex=2, col=textcols) + + if (to_file) dev.off() +} + +#violin.plot <- function (x, x.scaling = 1, autoscale = 0.995, +# individual.scaling = FALSE, median.bar = "grey50", x.pos, cex.names=1, ...) { +## Author: Guillaume Filion. +## Date: June 7, 2011. +## x: a list/data.frame of values. +## x.scaling: an x scaling factor to apply to the densities. +## autoscale: the minimum proportion of the data to be displayed on the plot. +## ...: density, angle, border, col, and lty are passed to polygon(). +# +# if (autoscale < 0 || autoscale > 1) +# stop ("'autoscale' outside the interval [0,1]") +# +# m <- length(x) +# if (missing(x.pos)) +# x.pos <- 1:m +# densities <- lapply(X = x, FUN = density, na.rm = TRUE) +# +# # Compute nice spacing parameters between plots. +# y.sup <- sapply(X = densities, FUN = function (x) max(x$y)) +# x.min <- min(sapply(X = densities, FUN = function (x) min(x$x))) +# x.max <- max(sapply(X = densities, FUN = function (x) max(x$x))) +# if(individual.scaling) +# y.sup <- y.sup/.8 +# else +# y.sup <- rep(max(y.sup), length(y.sup)) +# +# extrargs = list(...) +# # The density, angle, border, col, and lty parameters are passed +# # to polygon(). The rest is passed to plot(). +# pol.index <- match(names(extrargs), +# c("density", "angle", "border", "col", "lty")) +# plotargs <- list() +# polargs <- list() +# if (length(extrargs) > 0) { +# for (i in 1:length(extrargs)) { +# if (is.na(pol.index[i])) +# plotargs[[names(extrargs)[i]]] <- extrargs[[i]] +# else +# polargs[[names(extrargs)[i]]] <- extrargs[[i]] +# } +# } +# # Set the axis labels to null if not specified. +# if (is.null(plotargs[["xlab"]])) +# plotargs[["xlab"]] <- "" +# if (is.null(plotargs[["ylab"]])) +# plotargs[["ylab"]] <- "" +# +# # Find a nice y zoom if ylim is not specified. +# if (is.null(plotargs[["ylim"]])) { +# ymin <- min(sapply(X = x, FUN = quantile, prob = (1 - autoscale)/2, +# na.rm = TRUE)) +# ymax <- max(sapply(X = x, FUN = quantile, prob = 0.5 + autoscale/2, +# na.rm = TRUE)) +# plotargs[["ylim"]] <- c(ymin, ymax) +# } +# +# # Overwrite type, xaxt, x, and y. +# plotargs[["type"]] <- 'n' +# plotargs[["xaxt"]] <- 'n' +# plotargs[["x"]] <- c(min(x.pos)-.6*x.scaling, max(x.pos)+.6*x.scaling) +# plotargs[["y"]] <- c(x.min, x.max) +# +# # Plot the frame. +# eval(as.call(c(plot, plotargs))) +# +# # Manually recycle the parameters passed to polygon(). +# if (is.null(polargs[["col"]])) +# polargs[["col"]] <- 'black' +# for (i in 1:length(polargs)) +# polargs[[i]] <- rep(eval(polargs[[i]]), length.out = m) +# +# # Set the default border parameter to the value of col. +# if (is.null(polargs[["border"]])) +# polargs[["border"]] <- polargs[["col"]] +# # Set color to black if not specified. +# if (is.null(polargs[["col"]])) +# polargs[["col"]] <- "black" +# +# # Plot the x-axis. +# axis(side = 1, at = 1:m, labels = names(x), cex.axis=cex.names) +# +# # Plot the violins. +# polargs.i <- list() +# for (i in 1:m) { +# polargs.i[["x"]] <- x.scaling*(c(densities[[i]]$y, +# -rev(densities[[i]]$y))) / (2.1*y.sup[i]) + x.pos[i] +# polargs.i[["y"]] <- c(densities[[i]]$x, rev(densities[[i]]$x)) +# for (a in names(polargs)) +# polargs.i[[a]] <- polargs[[a]][i] +# eval(as.call(c(polygon, polargs.i))) +# +# # Plot the median bar. +# if (!is.na(median.bar)) { +# medians <- sapply(X = x, FUN = median, na.rm = TRUE) +# segments(x0 = x.pos[i], x1 = x.pos[i]-.4, y0 = medians[i], +# y1 = medians[i], lwd = 2, lty = 1, col = median.bar) +# segments(x0 = x.pos[i], x1 = x.pos[i]+.4, y0 = medians[i], +# y1 = medians[i], lwd = 2, lty = 1, col = median.bar) +# } +# } +#} + +violin.plot <- function (x, names, x.scaling = 1, autoscale = 0.995, + individual.scaling = FALSE, median.bar = "grey50", x.pos, ...) { +# Author: Guillaume Filion. +# Date: June 7, 2011. +# x: a list/data.frame of values. +# x.scaling: an x scaling factor to apply to the densities. +# autoscale: the minimum proportion of the data to be displayed on the plot. +# ...: density, angle, border, col, and lty are passed to polygon(). + + if (autoscale < 0 || autoscale > 1) + stop ("'autoscale' outside the interval [0,1]") + + m <- length(x) + if (missing(x.pos)) + x.pos <- 1:m + densities <- lapply(X = x, FUN = density, na.rm = TRUE) + + # Compute nice spacing parameters between plots. + y.sup <- sapply(X = densities, FUN = function (x) max(x$y)) + x.min <- min(sapply(X = densities, FUN = function (x) min(x$x))) + x.max <- max(sapply(X = densities, FUN = function (x) max(x$x))) + if(individual.scaling) + y.sup <- y.sup/.8 + else + y.sup <- rep(max(y.sup), length(y.sup)) + + extrargs = list(...) + # The density, angle, border, col, and lty parameters are passed + # to polygon(). The rest is passed to plot(). + pol.index <- match(names(extrargs), + c("density", "angle", "border", "col", "lty")) + plotargs <- list() + polargs <- list() + if (length(extrargs) > 0) { + for (i in 1:length(extrargs)) { + if (is.na(pol.index[i])) + plotargs[[names(extrargs)[i]]] <- extrargs[[i]] + else + polargs[[names(extrargs)[i]]] <- extrargs[[i]] + } + } + # Set the axis labels to null if not specified. + if (is.null(plotargs[["xlab"]])) + plotargs[["xlab"]] <- "" + if (is.null(plotargs[["ylab"]])) + plotargs[["ylab"]] <- "" + + # Find a nice y zoom if ylim is not specified. + if (is.null(plotargs[["ylim"]])) { + ymin <- min(sapply(X = x, FUN = quantile, prob = (1 - autoscale)/2, + na.rm = TRUE)) + ymax <- max(sapply(X = x, FUN = quantile, prob = 0.5 + autoscale/2, + na.rm = TRUE)) + plotargs[["ylim"]] <- c(ymin, ymax) + } + + # Overwrite type, xaxt, x, and y. + plotargs[["type"]] <- 'n' + plotargs[["xaxt"]] <- 'n' + plotargs[["x"]] <- c(min(x.pos)-.6*x.scaling, max(x.pos)+.6*x.scaling) + plotargs[["y"]] <- c(x.min, x.max) + + # Plot the frame. + eval(as.call(c(plot, plotargs))) + + # Manually recycle the parameters passed to polygon(). + if (is.null(polargs[["col"]])) + polargs[["col"]] <- 'black' + for (i in 1:length(polargs)) + polargs[[i]] <- rep(eval(polargs[[i]]), length.out = m) + + # Set the default border parameter to the value of col. + if (is.null(polargs[["border"]])) + polargs[["border"]] <- polargs[["col"]] + # Set color to black if not specified. + if (is.null(polargs[["col"]])) + polargs[["col"]] <- "black" + + # Plot the x-axis. + axis(side = 1, at = 1:m, labels = names) + + # Plot the violins. + polargs.i <- list() + for (i in 1:m) { + polargs.i[["x"]] <- x.scaling*(c(densities[[i]]$y, + -rev(densities[[i]]$y))) / (2.1*y.sup[i]) + x.pos[i] + polargs.i[["y"]] <- c(densities[[i]]$x, rev(densities[[i]]$x)) + for (a in names(polargs)) + polargs.i[[a]] <- polargs[[a]][i] + eval(as.call(c(polygon, polargs.i))) + + # Plot the median bar. + if (!is.na(median.bar)) { + medians <- sapply(X = x, FUN = median, na.rm = TRUE) + segments(x0 = x.pos[i], x1 = x.pos[i]-.4, y0 = medians[i], + y1 = medians[i], lwd = 2, lty = 1, col = median.bar) + segments(x0 = x.pos[i], x1 = x.pos[i]+.4, y0 = medians[i], + y1 = medians[i], lwd = 2, lty = 1, col = median.bar) + } + } +} + +get_pref_stats_one_side <- function(hits, IG_hits) { + n <- length(hits)*2 + all_hits <- rep(NA, n) + all_hits[seq(1, n, 2)] <- IG_hits + all_hits[seq(2, n, 2)] <- hits + + gene_ind <- rep(c(0,1), n/2) + + res <- doFisherGeneric(all_hits, gene_ind) + + invisible(res) +} + +get_pref_stats <- function(stats) { + print("Dissociation") + pref_dis <- get_pref_stats_one_side(stats$hits_dis, stats$IG_hits_dis) + print("Association") + pref_as <- get_pref_stats_one_side(stats$hits_as, stats$IG_hits_as) + + invisible(list(pref_dis=pref_dis, pref_as=pref_as)) +} + +get_adj_stats_one_side <- function(hits, IG_hits, VERBOSE=TRUE) { + # In case intergenic regions are too small to test for significance, + # we previously assigned 'NA' to their coordinates. Here, we change their + # as/dis status to 'TRUE', in order to be able to glue adjacent dis/as genes together. + IG_hits[is.na(IG_hits)] <- TRUE; + + n <- length(hits)*2 + all_hits <- rep(NA, n) + all_hits[seq(1, n, 2)] <- IG_hits + all_hits[seq(2, n, 2)] <- hits + + sel <- which(all_hits) + adj <- data.frame(start=sel[c(1, which(diff(sel) != 1)+1)], + end=sel[c(which(diff(sel) != 1), length(sel))]); + + adj_full <- adj; + + # If the start or end position is odd, it is in IG. + # Because we're intereseted in genes only, we correct this by adding 1 to start and/or subtracting 1 from end. + adj$start <- ifelse(adj$start %% 2 == 1, adj$start+1, adj$start) + adj$end <- ifelse(adj$end %% 2 == 1, adj$end-1, adj$end) + + sel <- which(adj$start %% 2 == 0 & adj$end %% 2 == 0 & adj$start == adj$end) + + if (VERBOSE) { + cat("% of genes that move in isolation: "); + cat(length(sel) / length(which(hits)) * 100) + cat("\n"); + cat("% of these genes that move without their intergenic regions: "); + cat(length(which(adj_full$start[sel] %% 2 == 0 & adj_full$end[sel] %% 2 == 0)) / length(sel) * 100) + cat("\n"); + cat("% of all genes that move in isolation and without their intergenic regions: "); + cat(length(which(adj_full$start[sel] %% 2 == 0 & adj_full$end[sel] %% 2 == 0)) / length(which(hits)) * 100) + cat("\n") + } + + # Remove IG-only regions and translate indices back to gene indices + adj <- adj[adj$end-adj$start >= 0,] + adj$start <- adj$start/2 + adj$end <- adj$end/2 + + ls <- (adj$end-adj$start+1) + if (VERBOSE) print(table(ls)*as.numeric(names(table(ls)))/length(which(hits))*100) + adj$length <- ls + + invisible(list(adj=adj, adj_full=adj_full)) +} + +get_adj_stats <- function(stats, VERBOSE=TRUE) { + if (VERBOSE) print("Dissociating genes") + adj_dis <- get_adj_stats_one_side(stats$hits_dis, stats$IG_hits_dis, VERBOSE) + if (VERBOSE) print("Associating genes") + adj_as <- get_adj_stats_one_side(stats$hits_as, stats$IG_hits_as, VERBOSE) + + invisible(list(adj_dis=adj_dis$adj, adj_as=adj_as$adj, + adj_dis_full=adj_dis$adj_full, adj_as_full=adj_as$adj_full)) +} + +get_flanks <- function(probes, n=10, inner=FALSE) { + probes_5prime <- probes_3prime <- probes; + + if (inner) { + probes_5prime$end <- probes$start + n; + probes_3prime$start <- probes$end - n; + } else { + probes_5prime$end <- probes$start; + probes_5prime$start <- probes$start - n; + probes_3prime$end <- probes$end + n; + probes_3prime$start <- probes$end; + } + + probes_flanks <- rbind(probes_5prime, probes_3prime) + probes_flanks <- probes_flanks[order(probes_flanks$start),] + + # Make sure there are no overlapping flanks: OK + m <- nrow(probes_flanks) + res <- quantile(probes_flanks$start[2:m] - probes_flanks$end[1:(m-1)]) + if (any(res < 0)) { + warning("Overlapping flanks"); + print(res); + } + + invisible(probes_flanks) +} + +### Used to calculate probe spacing in GFF +probe_spacing <- function(GFF) { + summary(unlist(tapply(1:nrow(GFF), GFF$seqname, function(x) + GFF$start[x[-1]] - GFF$end[x[-length(x)]]))) +} + + +### Used to write diagnostics to the 'diag' object. +add_diag <- function(name, value) { + if (length(value) == 1) { + cat(paste(name, ": ", value, "\n", sep="")) + } else { + cat(paste(name, ":\n", sep="")) + print(value) + } + diag[[name]] <<- value; +} + +### Perform circular permutation using two binary sets: +### Rotate the one and calculate overlap with the other. +### Then, compare to 'real' overlap. +get_circ_perm_p <- function(set1, set2, n=length(set1), do_sample=FALSE) { + if (is.logical(set1)) set1 <- which(set1); + if (is.logical(set2)) set2 <- which(set2); + + if (do_sample) { + offsets <- sort(sample(seq.int(n-1), 1e4)) + } else { + offsets <- seq.int(n-1) + } + rnd_overlap <- c(); + for (i in 1:length(offsets)) { + offset <- offsets[i]; + if ((i/length(offsets)*100) %% 10 == 0) + cat(paste(i/length(offsets)*100, "%\r", sep="")) + + rnd <- (((set2+offset)-1) %% n)+1 + rnd_overlap <- c(rnd_overlap, length(intersect(set1, rnd))) + } + cat("\n"); + + ovl <- length(intersect(set1, set2)) + p_le <- length(which(rnd_overlap <= ovl)) / length(rnd_overlap) + p_gr <- length(which(rnd_overlap > ovl)) / length(rnd_overlap) + p <- min(p_le, p_gr)*2 + + res <- list(ovl=ovl, n=length(rnd_overlap), p_le=p_le, p_gr=p_gr, p=p, summary(rnd_overlap)) +} + +get_conc_circ_perm_p <- function(calls1, calls2, n=length(calls1), do_sample=FALSE) { + if (do_sample) { + offsets <- sort(sample(seq.int(n-1)+1, 1e4)) + } else { + offsets <- seq.int(n-1)+1 + } + + rnd_overlap <- c(); + for (i in 1:length(offsets)) { + offset <- offsets[i]; + if ((i/length(offsets)*100) %% 10 == 0) + cat(paste(i/length(offsets)*100, "%\r", sep="")) + + rnd <- calls2[c(offset:n,1:(offset-1))] + rnd_overlap <- c(rnd_overlap, sum(calls1 == rnd)) + } + cat("\n"); + + ovl <- sum(calls1 == calls2) + + p_le <- length(which(rnd_overlap <= ovl)) / length(rnd_overlap) + p_gr <- length(which(rnd_overlap > ovl)) / length(rnd_overlap) + p <- min(p_le, p_gr)*2 + + res <- list(ovl=ovl, n=length(rnd_overlap), p_le=p_le, p_gr=p_gr, p=p, sum=summary(rnd_overlap)) +} + +# define wideScreen function +wideScreen <- function(ncols) { + if (missing(ncols)) + ncols <- as.integer(Sys.getenv("COLUMNS")) + options(width=ncols); +} + +#### Function to read in (large) .bed files and merge overlapping regions. +# NOTE: simply ignore the fact that UCSC coordinates are 0-based. +# We won't notice this on our probe-level resolution anyway. +readFixMergeBED <- function(file, purge=c(), verbose=FALSE) { + alldat <- NULL; + cnt <- 0; + step <- 1e4; + rcnt <- 0; + cracks <- c(); + + n <- as.numeric(system(paste("cat \"", file, "\" | wc -l", sep=''), intern=TRUE)) + print(paste("Total number of lines:", n)); + + for (i in 1:ceiling(n/step)) { + if (verbose) + print(paste("Processing lines", (cnt*step)+1, "to", min(n, (cnt+1)*step))) + else + cat("."); + + toread <- min(step, n - (cnt*step)); + dat <- readBED(file, nlines=toread, skip=cnt*step, purge=purge); + + # Fix cases where start > end (this happens sometimes..) + hits <- which(dat$start > dat$end) + tmp <- dat$start[hits]; + dat$start[hits] <- dat$end[hits]; + dat$end[hits] <- tmp; + + # Sort data + dat <- dat[order(dat$seqname, dat$start),] + + ## We extend genes by +/- 30bp, as this is half of our probe width. + ## This is to facilitate further merging of annotations with small (< 60bp) gaps. + #dat$start <- dat$start - 30; + #dat$end <- dat$end + 30; + + # Merge overlapping entries. + dat <- Merge(dat) + left <- round((nrow(dat)/toread)*100, 2) + if (verbose) + print(paste(left, "percent left after merging")) + + cracks <- c(cracks, rcnt + nrow(dat)); + alldat <- rbind(alldat, dat); + rcnt <- nrow(alldat); + + cnt <- cnt+1; + } + if (!verbose) cat("\n") + + tmps <- list(); + idxs <- c(); + for (i in 1:(length(cracks)-1)) { + crack <- cracks[i]; # Select crack to be processed + idx <- crack+c(-100:100); # Expand crack + idx <- idx[idx %in% 1:nrow(alldat)] # Correct for boundaries of alldat + idxs <- c(idxs, idx); # Remembers positions + tmp <- alldat[idx,]; # Create temp object + tmp <- tmp[order(tmp$seqname, tmp$start),] # Sort object + tmps <- rbind(tmps, Merge(tmp)) # Merge and append to tmp collection + } + alldat <- alldat[-idxs,] # Remove crack (neighbour) positions from alldat + alldat <- rbind(alldat, tmps) # Append new merged data + alldat <- alldat[order(alldat$seqname, alldat$start),] # Sort all data. + + print(paste(nrow(alldat), "(", round((nrow(alldat) / n) * 100, 2), "%) rows left after merging"), sep="") + + invisible(alldat); +} + +### Function to calculate confidence intervals ###NEED TO CHECK! +get_conf_int <- function(x, dist="norm") { + if (dist == "norm") + qnorm(0.975) * (sd(x) / sqrt(length(x))) + else + qt(0.975, df=length(x)-1) * (sd(x) / sqrt(length(x))) +} + +### Function to scale up/down a heatmap. Adapted from the original R heatmap() function. +heatmap.scale <- +function (x, Rowv = NULL, Colv = if (symm) "Rowv" else NULL, + distfun = dist, hclustfun = hclust, reorderfun = function(d, + w) reorder(d, w), do_reorder=TRUE, add.expr, symm = FALSE, revC = identical(Colv, + "Rowv"), scale = c("row", "column", "none"), na.rm = TRUE, + margins = c(5, 5), ColSideColors, RowSideColors, cexRow = 0.2 + + 1/log10(nr), cexCol = 0.2 + 1/log10(nc), labRow = NULL, + labCol = NULL, main = NULL, xlab = NULL, ylab = NULL, keep.dendro = FALSE, + verbose = getOption("verbose"), width=4, height=4, cex.lab=NULL, ...) +{ + scale <- if (symm && missing(scale)) + "none" + else match.arg(scale) + if (length(di <- dim(x)) != 2 || !is.numeric(x)) + stop("'x' must be a numeric matrix") + nr <- di[1] + nc <- di[2] + if (nr <= 1 || nc <= 1) + stop("'x' must have at least 2 rows and 2 columns") + if (!is.numeric(margins) || length(margins) != 2) + stop("'margins' must be a numeric vector of length 2") + doRdend <- !identical(Rowv, NA) + doCdend <- !identical(Colv, NA) + if (is.null(Rowv)) + Rowv <- rowMeans(x, na.rm = na.rm) + if (is.null(Colv)) + Colv <- colMeans(x, na.rm = na.rm) + if (doRdend) { + if (inherits(Rowv, "dendrogram")) + ddr <- Rowv + else { + hcr <- hclustfun(distfun(x)) + ddr <- as.dendrogram(hcr) + if (!is.logical(Rowv) || Rowv) + ddr <- reorderfun(ddr, Rowv) + } + if (nr != length(rowInd <- order.dendrogram(ddr))) + stop("row dendrogram ordering gave index of wrong length") + } + else rowInd <- 1L:nr + if (doCdend) { + if (inherits(Colv, "dendrogram")) + ddc <- Colv + else if (identical(Colv, "Rowv")) { + if (nr != nc) + stop("Colv = \"Rowv\" but nrow(x) != ncol(x)") + ddc <- ddr + } + else { + hcc <- hclustfun(distfun(if (symm) + x + else t(x))) + ddc <- as.dendrogram(hcc) + if (!is.logical(Colv) || Colv) + ddc <- reorderfun(ddc, Colv) + } + if (nc != length(colInd <- order.dendrogram(ddc))) + stop("column dendrogram ordering gave index of wrong length") + } + else colInd <- 1L:nc + + # Force no-reorder when neccessary. + if (!do_reorder) { + colInd <- 1L:nc; + rowInd <- 1L:nr; + } + + x <- x[rowInd, colInd] + labRow <- if (is.null(labRow)) + if (is.null(rownames(x))) + (1L:nr)[rowInd] + else rownames(x) + else labRow[rowInd] + labCol <- if (is.null(labCol)) + if (is.null(colnames(x))) + (1L:nc)[colInd] + else colnames(x) + else labCol[colInd] + if (scale == "row") { + x <- sweep(x, 1, rowMeans(x, na.rm = na.rm)) + sx <- apply(x, 1, sd, na.rm = na.rm) + x <- sweep(x, 1, sx, "/") + } + else if (scale == "column") { + x <- sweep(x, 2, colMeans(x, na.rm = na.rm)) + sx <- apply(x, 2, sd, na.rm = na.rm) + x <- sweep(x, 2, sx, "/") + } + lmat <- rbind(c(NA, 3), 2:1) + #lwid <- c(if (doRdend) 1 else 0.05, 4) + lwid <- c(if (doRdend) 1 else 0.05, width) + #lhei <- c((if (doCdend) 1 else 0.05) + if (!is.null(main)) 0.2 else 0,4) + lhei <- c((if (doCdend) 1 else 0.05) + if (!is.null(main)) 0.2 else 0, height) + if (!missing(ColSideColors)) { + if (!is.character(ColSideColors) || length(ColSideColors) != + nc) + stop("'ColSideColors' must be a character vector of length ncol(x)") + lmat <- rbind(lmat[1, ] + 1, c(NA, 1), lmat[2, ] + 1) + lhei <- c(lhei[1], 0.2, lhei[2]) + } + if (!missing(RowSideColors)) { + if (!is.character(RowSideColors) || length(RowSideColors) != + nr) + stop("'RowSideColors' must be a character vector of length nrow(x)") + lmat <- cbind(lmat[, 1] + 1, c(rep(NA, nrow(lmat) - 1), + 1), lmat[, 2] + 1) + lwid <- c(lwid[1], 0.2, lwid[2]) + } + lmat[is.na(lmat)] <- 0 + if (verbose) { + cat("layout: widths = ", lwid, ", heights = ", lhei, + "; lmat=\n") + print(lmat) + } + op <- par(no.readonly = TRUE) + on.exit(par(op)) + layout(lmat, widths = lwid, heights = lhei, respect = TRUE) + if (!missing(RowSideColors)) { + par(mar = c(margins[1], 0, 0, 0.5)) + image(rbind(1L:nr), col = RowSideColors[rowInd], axes = FALSE) + } + if (!missing(ColSideColors)) { + par(mar = c(0.5, 0, 0, margins[2])) + image(cbind(1L:nc), col = ColSideColors[colInd], axes = FALSE) + } + par(mar = c(margins[1], 0, 0, margins[2])) + if (!symm || scale != "none") + x <- t(x) + if (revC) { + iy <- nr:1 + ddr <- rev(ddr) + x <- x[, iy] + } + else iy <- 1L:nr + image(1L:nc, 1L:nr, x, xlim = 0.5 + c(0, nc), ylim = 0.5 + + c(0, nr), axes = FALSE, xlab = "", ylab = "", ...) + axis(1, 1L:nc, labels = labCol, las = 2, line = -0.5, tick = 0, + cex.axis = cexCol) + if (!is.null(xlab)) + mtext(xlab, side = 1, line = margins[1] - 1.25, cex=cex.lab) + axis(4, iy, labels = labRow, las = 2, line = -0.5, tick = 0, + cex.axis = cexRow) + if (!is.null(ylab)) + mtext(ylab, side = 4, line = margins[2] - 1.25, cex=cex.lab) + if (!missing(add.expr)) + eval(substitute(add.expr)) + par(mar = c(margins[1], 0, 0, 0)) + if (doRdend) + plot(ddr, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none") + else frame() + par(mar = c(0, 0, if (!is.null(main)) 1 else 0, margins[2])) + if (doCdend) + plot(ddc, axes = FALSE, xaxs = "i", leaflab = "none") + else if (!is.null(main)) + frame() + if (!is.null(main)) + title(main, cex.main = 1.5 * op[["cex.main"]]) + invisible(list(rowInd = rowInd, colInd = colInd, Rowv = if (keep.dendro && + doRdend) ddr, Colv = if (keep.dendro && doCdend) ddc)) +} + +### Obtain optimal clustering for a distance matrix. +get_optimal_ordering <- function(distmat) { + library(cba) + + hc <- hclust(distmat, method="average"); + optim <- order.optimal(distmat, hc$merge); + + invisible(as.numeric(optim$order)); +} + +### Peak finder +# Taken from https://gist.github.com/jamiefolson/5831746 +which.peaks <- function(x,partial=TRUE,decreasing=FALSE){ + if (decreasing){ + if (partial){ + which(diff(c(TRUE,diff(x)<=0,FALSE))>0) + }else { + which(diff(diff(x)<=0)>0) + } + }else { + if (partial){ + which(diff(c(TRUE,diff(x)>=0,FALSE))<0) + }else { + which(diff(diff(x)>=0)<0) + } + + } +} + +# plot pretty ticks +prettyTicks <- function (dataRange, ntick = 6) { + XDIV = 1 + while (XDIV < diff(dataRange)) XDIV = XDIV * 1000 + XDIV = XDIV/1000 + if ((diff(dataRange)/XDIV) <= 2) + XDIV = XDIV/1000 + Xscl <- c(floor(dataRange[1]/XDIV) * XDIV, ceiling(dataRange[2]/XDIV) * XDIV) + xunit = switch(as.character(log10(XDIV)), `0` = "bp", `3` = "kb", + `6` = "Mb", `9` = "Gb", "scale not recognized") + Lab <- pretty(c(Xscl[1]/XDIV, Xscl[2]/XDIV), n = ntick) + Tic <- Lab * XDIV + return(list(at = Tic, labels = Lab, xunit = xunit)) +} + +## Make plots for signals across all chromosomes. +make_chr_plot <- function(coords, signal, type="h", assembly="hg19", format="png", + sample=FALSE, nam="", ylab="", col="black", lwd=0.2, + ylim=range(signal, na.rm=T), to_file=TRUE, + outputdir="misc_figures", postfun=NULL, ...) { + + if (ylab == "") ylab <- nam; + + # Obtain names and sizes of chromosomes + tmp <- read.delim(paste(assembly, "genome", sep="."), header=FALSE, row.names=1) + chroms <- rownames(tmp); + chromsizes <- tmp[,1]; names(chromsizes) <- chroms; + + ncols <- round(sqrt(length(chroms))) + nrows <- ceiling(sqrt(length(chroms))) + tot <- ncols * nrows; + colwidths <- tapply(chroms, rep(1:ncols, each=nrows)[1:length(chroms)], + function(x) max(coords$start[coords$seqname %in% x])) + + if (to_file) { + dir.create(outputdir, recursive=TRUE, showWarnings=FALSE); + plotfile(paste(outputdir, "/chr_plot_", nam, sep=""), + type=format, width=7*ncols, height=5*nrows) + } + + par(oma=c(2,3,2,0), mar=c(0,2,0,1)) + nf <- layout(matrix(1:(tot*2), 2*nrows, ncols, byrow = FALSE), + heights=rep(c(3,1), tot), widths=colwidths/sum(colwidths)) + + if (length(col) == 1) col <- rep(col, length(signal)) + + colwidths_chr <- rep(colwidths, each=nrows); + names(colwidths_chr) <- chroms; + + for (chr in chroms) { + print(chr) + sel <- which(coords$seqname == chr) + if (sample) sel <- sort(sample(sel, round(length(sel)/5))) + + xlim <- c(0, colwidths_chr[chr]); + plot(coords$start[sel], signal[sel], type=type, xlim=xlim, ylim=ylim, + xaxt="n", xaxs="i", bty="n", xlab="", ylab="", adj=1, las=2, + cex.axis=0.8, lwd=lwd, col=col[sel], ...) + mtext(paste(" ", chr), 3, 0, cex=1, line=-1.5, adj=0) + + axDat <- prettyTicks(c(0, max(coords$start[coords$seqname == chr])), ntick = 7) + if (!is.null(postfun)) postfun(range(axDat$at)); + + plot(coords$start[coords$seqname == chr], + rep(1, length(which(coords$seqname == chr))), type="n", + xaxs="i", bty="n", axes=FALSE, ylim=c(-0.3, 1), xlab="", ylab="", xlim=xlim) + axis(1, at = axDat$at, labels = axDat$labels, cex.axis=1, cex.axis=0.8, padj=-1.2, line=-3) + } + + mtext(paste("genomic location (", axDat$xunit, ")", sep=""), 1, line=0.5, cex=1.2, outer=TRUE) + #mtext(text=expression(log[2](LaminB1~interaction)), 2, 1, cex=1.2, outer=TRUE) + mtext(text=ylab, 2, 1, cex=1.2, outer=TRUE) + + if (to_file) dev.off() +} + + From 7cb8d926ccb039fa5a424fbef61eb6d67142883e Mon Sep 17 00:00:00 2001 From: fwip Date: Thu, 15 Mar 2018 10:52:47 -0700 Subject: [PATCH 02/15] Add preliminary dockerfile --- Dockerfile | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..5a87dc3 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,10 @@ +FROM hotspot2 +RUN apk add --no-cache \ + g++ \ + R \ + R-dev +RUN Rscript -e 'install.packages("caTools", repo="http://cran.rstudio.com")' + +WORKDIR masterlist + +ADD . . From 9f721f2f44c45d00bbc022263fce29355b37e25e Mon Sep 17 00:00:00 2001 From: fwip Date: Thu, 15 Mar 2018 11:29:46 -0700 Subject: [PATCH 03/15] code_build.R: read file from commandlint --- code_build.R | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/code_build.R b/code_build.R index 0d05c25..20aeba3 100644 --- a/code_build.R +++ b/code_build.R @@ -1,3 +1,4 @@ +#!/usr/bin/env Rscript ########################################################################################################## # # # Method for generating a master list / Index of DNaseI hypersensitivity sites. # @@ -18,6 +19,7 @@ if (length(args)==0) { stop("No arguments supplied.") } else { eval(parse(text=args[[1]])) # parse first argument: chunknum + eval(parse(text=args[[2]])) # parse first argument: filepath } chunk <- paste("chunk", sprintf("%04d", chunknum), ".bed", sep=""); @@ -31,20 +33,13 @@ options(warn=2) ## Example line: # chr11 52983767 52983861 id-708944 0.0622013 52983811 LN4866 52983810 -#FILEPATH=paste("/net/seq/data/projects/ENCODE3_publications/erynes/ENCODEpaper2017/SubsetOf644/masterLists/", -# "FDR0.0010/outdir_MLfromPeakClumps_1g_MA21_DHSwidthThreshold20bp/chunks_10kb_corrected/", sep="/"); -#chunks <- dir(FILEPATH, pattern="^chunk.*.txt"); - -#FILEPATH="/net/seq/data/projects/ENCODE3_publications/erynes/ENCODEpaper2017/SubsetOf665/varWidthPeaks/chunks_5kb_min500elementsPerChunk/"; -FILEPATH="/net/seq/data/projects/ENCODE3_publications/erynes/ENCODEpaper2017/SubsetOf665/varWidthPeaks/chunks_10kb_min500elementsPerChunk/"; -#chunks <- dir(FILEPATH, pattern="^chunk.*.bed"); dir.create("DHSs_all", showWarnings=FALSE, recursive=TRUE) dir.create("peaks_all", showWarnings=FALSE, recursive=TRUE) #for (chunk in chunks) { # Load in data per chunk, each separated by at least 10kb - peaks <- read.delim(paste(FILEPATH, chunk, sep="/"), header=FALSE, as.is=T) + peaks <- read.delim(filepath, header=FALSE, as.is=T) #colnames(peaks) <- c("seqname", "start", "end", "ID", "score", "density_summit", "sampleID", "wavelet_summit") colnames(peaks) <- c("seqname", "start", "end", "sampleID", "score", "density_summit", "wavelet_summit") peaks <- peaks[order(peaks$wavelet_summit),] # Order peaks by wavelet summit first From 3882c1ea5ad3416a790c01bea65fa0d3823707ff Mon Sep 17 00:00:00 2001 From: fwip Date: Thu, 15 Mar 2018 11:32:02 -0700 Subject: [PATCH 04/15] Fix up existing scripts --- code_construct_matrix.sh | 31 ++++++++++---------- code_gen_masterlist.sh | 63 ++++++++++++++++++++-------------------- 2 files changed, 47 insertions(+), 47 deletions(-) diff --git a/code_construct_matrix.sh b/code_construct_matrix.sh index 2258f5e..dc1321d 100755 --- a/code_construct_matrix.sh +++ b/code_construct_matrix.sh @@ -1,3 +1,4 @@ +#!/bin/bash ########################################################################################################## # # # Method for generating a master list / Index of DNaseI hypersensitivity sites. # @@ -8,37 +9,37 @@ ########################################################################################################## -if [[ $# -eq 0 ]] ; then - echo 'Provide the name/version of the file, e.g. WM20180301' +if [[ $# -le 2 ]] ; then + echo 'Provide the name/version of the file, e.g. WM20180301, and the number of chunks' exit 1 fi NAME=$1; +numchunks=$2 -rm -f matrix_bin_all_${NAME}.txt.gz; -for i in $(seq -f "%04g" 1 5000); do - echo ${i} +rm -f "matrix_bin_all_${NAME}.txt.gz" +for i in $(seq -f "%04g" 1 "$numchunks"); do + echo "$i" if [[ -f "DHSs_all/chunk${i}.bed" ]] ; then #paste <(cut -f 1-3 DHSs_all/chunk${i}.bed) matrix_bin_all/chunk${i}.bed | gzip -c - >> matrix_bin_all_${NAME}.txt.gz - paste <(cut -f 4 DHSs_all/chunk${i}.bed) matrix_bin_all/chunk${i}.bed | gzip -c - >> matrix_bin_all_${NAME}.txt.gz + paste <(cut -f 4 "DHSs_all/chunk${i}.bed") "matrix_bin_all/chunk${i}.bed" | gzip -c - >> "matrix_bin_all_${NAME}.txt.gz" fi done -rm -f matrix_bin_nonovl_core_${NAME}.txt.gz; -for i in $(seq -f "%04g" 1 5000); do - echo ${i} +rm -f "matrix_bin_nonovl_core_${NAME}.txt.gz" +for i in $(seq -f "%04g" 1 "$numchunks"); do + echo "$i" if [[ -f "DHSs_nonovl_core/chunk${i}.bed" ]] ; then #paste <(cut -f 1-3 DHSs_nonovl_core/chunk${i}.bed) matrix_bin_nonovl_core/chunk${i}.bed | gzip -c - >> matrix_bin_nonovl_core_${NAME}.txt.gz - paste <(cut -f 4 DHSs_nonovl_core/chunk${i}.bed) matrix_bin_nonovl_core/chunk${i}.bed | gzip -c - >> matrix_bin_nonovl_core_${NAME}.txt.gz + paste <(cut -f 4 "DHSs_nonovl_core/chunk${i}.bed") "matrix_bin_nonovl_core/chunk${i}.bed" | gzip -c - >> "matrix_bin_nonovl_core_${NAME}.txt.gz" fi done -rm -f matrix_bin_nonovl_any_${NAME}.txt.gz; -for i in $(seq -f "%04g" 1 5000); do - echo ${i} +rm -f "matrix_bin_nonovl_any_${NAME}.txt.gz" +for i in $(seq -f "%04g" 1 "$numchunks"); do + echo "$i" if [[ -f "DHSs_nonovl_any/chunk${i}.bed" ]] ; then #paste <(cut -f 1-3 DHSs_nonovl_any/chunk${i}.bed) matrix_bin_nonovl_any/chunk${i}.bed | gzip -c - >> matrix_bin_nonovl_any_${NAME}.txt.gz - paste <(cut -f 4 DHSs_nonovl_any/chunk${i}.bed) matrix_bin_nonovl_any/chunk${i}.bed | gzip -c - >> matrix_bin_nonovl_any_${NAME}.txt.gz + paste <(cut -f 4 "DHSs_nonovl_any/chunk${i}.bed") "matrix_bin_nonovl_any/chunk${i}.bed" | gzip -c - >> "matrix_bin_nonovl_any_${NAME}.txt.gz" fi done - diff --git a/code_gen_masterlist.sh b/code_gen_masterlist.sh index 9012843..571c025 100755 --- a/code_gen_masterlist.sh +++ b/code_gen_masterlist.sh @@ -1,3 +1,4 @@ +#!/bin/bash ########################################################################################################## # # # Method for generating a master list / Index of DNaseI hypersensitivity sites. # @@ -16,7 +17,7 @@ NAME=$1; TYPES="all nonovl_any nonovl_core"; for TYPE in ${TYPES}; do - echo $TYPE; + echo "$TYPE" FILE_CHUNKIDS="masterlist_DHSs_${NAME}_${TYPE}_chunkIDs.txt"; FILE_INDEXIDS="masterlist_DHSs_${NAME}_${TYPE}_indexIDs.txt"; @@ -26,22 +27,22 @@ for TYPE in ${TYPES}; do ### Create final master list if ! [[ -f "$FILE_CHUNKIDS" ]] ; then echo "Concatenating DHS chunks" - cat DHSs_${TYPE}/* | sort-bed - > ${FILE_CHUNKIDS} + cat "DHSs_${TYPE}"/* | sort-bed - > "${FILE_CHUNKIDS}" fi - + ### Generate label mapping - if ! [[ -f "masterlist_DHSs_${NAME}_all_chunkIDs2indexIDs.txt" && ${TYPE}=="all" ]] ; then + if ! [[ -f "masterlist_DHSs_${NAME}_all_chunkIDs2indexIDs.txt" && ${TYPE} == "all" ]] ; then echo "Generating unique DHS identifiers" - ./run_name_master_list.sh ${FILE_CHUNKIDS}; + ./run_name_master_list.sh "${FILE_CHUNKIDS}" fi - + ### Apply mapping if ! [[ -f "$FILE_INDEXIDS" ]] ; then echo "Mapping chunk identifiers to final DHS identifiers" awk 'BEGIN{ FS = OFS = "\t" } - FNR == NR { split($0, f, /\t/); map[f[2]] = f[1]; next } - { if ($4 in map) { $4 = map[$4] } } - { print }' masterlist_DHSs_${NAME}_all_chunkIDs2indexIDs.txt ${FILE_CHUNKIDS} > ${FILE_INDEXIDS} + FNR == NR { split($0, f, /\t/); map[f[2]] = f[1]; next } + { if ($4 in map) { $4 = map[$4] } } + { print }' "masterlist_DHSs_${NAME}_all_chunkIDs2indexIDs.txt" "${FILE_CHUNKIDS}" > "${FILE_INDEXIDS}" fi ### Create browser loadable BED12 files @@ -49,9 +50,9 @@ for TYPE in ${TYPES}; do echo "Constructing BED12 file" #echo "browser position chr6:26020208-26022677" > ${FILE_BED} #echo "track name='Master list DHSs ${NAME} ${TYPE}' description='Master list DHSs ${NAME} ${TYPE}' visibility=2 useScore=1" >> ${FILE_BED} - cat ${FILE_INDEXIDS} | awk ' + awk "${FILE_INDEXIDS}" ' function round(x) { if(x=="NA") { return 0 } else { return int(x + 0.5) } } - BEGIN { OFS="\t"; } + BEGIN { OFS="\t"; } { if ($10 == "NA" || $11 == "NA") { thickStart=$2 @@ -64,51 +65,49 @@ for TYPE in ${TYPES}; do $9=($9 >= $3 ? $3-1 : $9) $10=($10 <= $2 ? $2+1 : $10) $11=($11 >= $3 ? $3-1 : $11) - + thickStart= $9-1 thickEnd = $9+1 - + blockCount=1 blockSizes=1 blockStarts=0 - + blockSize2=$9-$10 blockStart2=$10-$2 - if (blockSize2 != 0 && blockStart2 < $3-$2-1) { - blockCount+=1; - blockSizes=blockSizes","blockSize2; - blockStarts=blockStarts","blockStart2 + if (blockSize2 != 0 && blockStart2 < $3-$2-1) { + blockCount+=1; + blockSizes=blockSizes","blockSize2; + blockStarts=blockStarts","blockStart2 } - + blockSize3=$11-$9 blockStart3=$9-$2 - if (blockSize3 != 0 && blockStart3 < $3-$2-1) { - blockCount+=1; - blockSizes=blockSizes","blockSize3; - blockStarts=blockStarts","blockStart3 + if (blockSize3 != 0 && blockStart3 < $3-$2-1) { + blockCount+=1; + blockSizes=blockSizes","blockSize3; + blockStarts=blockStarts","blockStart3 } - + blockSize4=1 blockStart4=$3-$2-1 blockCount+=1; - blockSizes=blockSizes","blockSize4; + blockSizes=blockSizes","blockSize4; blockStarts=blockStarts","blockStart4 } - + score=round(log($5+1)/log(10)*500) score=(score > 1000 ? 1000 : score) print $1, $2, $3, $4, score, ".", thickStart, thickEnd, "0,0,0", blockCount, blockSizes, blockStarts - }' > ${FILE_BED} + }' > "${FILE_BED}" fi - + if ! [[ -f "$FILE_BIGBED" ]] ; then echo "Converting BED file to BIGBED file" - bedToBigBed -type=bed12 ${FILE_BED} <(tail -n +2 hg38.genome) ${FILE_BIGBED} + bedToBigBed -type=bed12 "${FILE_BED}" <(tail -n +2 hg38.genome) "${FILE_BIGBED}" fi echo "Load this track in the UCSC browser using the following:" echo "track type=bigBed name=master_list_${NAME}_${TYPE} useScore=1 visibility=2 itemRgb='On' bigDataUrl=https://encode:collabor8@resources.altius.org/~meuleman/ML_tracks/${FILE_BIGBED}" -done; - - +done From b0e6e4eb15396d1b6e4b9e450451def884d842de Mon Sep 17 00:00:00 2001 From: fwip Date: Thu, 15 Mar 2018 11:32:30 -0700 Subject: [PATCH 05/15] Add script to run masterlist generation --- run_sequential.sh | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100755 run_sequential.sh diff --git a/run_sequential.sh b/run_sequential.sh new file mode 100755 index 0000000..97150a6 --- /dev/null +++ b/run_sequential.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +dir=$1 +outname=${2:-masterlist} + +outdir=Rout +mkdir -p "$outdir" +numchunks=$(find "$dir" -maxdepth 1 -name 'chunk*.bed' | wc -l) + +# Do something +i=1 +for chunkfile in "$dir"/chunk*.bed ; do + R CMD BATCH \ + --no-save --no-restore \ + "--args chunknum=$i filepath=$chunkfile" \ + ./code_build.R \ + "$outdir/output_build_chunk_$i.Rout" + (i++) +done + +# Do something else +for i in $(seq 1 "$numchunks") ; do + R CMD BATCH --no-save --no-restore "--args chunknum=${i}" \ + code_overlap.R "Rout/output_overlap_chunk_${i}.Rout" +done + +# Generate masterlist +./code_gen_masterlist.sh "$outname" + +# Make a matrix +./code_construct_matrix.sh "$outname" "$numchunks" From 8ab350301747f2863db59dec408611ca9510837c Mon Sep 17 00:00:00 2001 From: fwip Date: Thu, 15 Mar 2018 11:33:25 -0700 Subject: [PATCH 06/15] Add script to run all of hotspot2 & masterlist gen --- run_from_scratch.sh | 110 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100755 run_from_scratch.sh diff --git a/run_from_scratch.sh b/run_from_scratch.sh new file mode 100755 index 0000000..fa286fc --- /dev/null +++ b/run_from_scratch.sh @@ -0,0 +1,110 @@ +#! /bin/bash + +set -e -u -o pipefail + +if [[ $# != 4 && $# != 5 ]]; then + echo -e "Usage: $0 fileOfBAMfiles outdir chrom_sizes.bed mappable.starch [min_varWidth_peak_width]" + exit 2 +fi + +FILE_OF_BAM_FILES=$1 +OUTDIR=$2 +CHROM_FILTER=$3 +MAPPABLE_REGIONS=$4 + +if [ -n "$5" ]; then + MIN_VARWIDTH_PEAK_WIDTH=$5 +else + MIN_VARWIDTH_PEAK_WIDTH="20" +fi + +# Sanity check on input filenames. +linenum=1 +while read -r line +do + fname=$line + if [ ! -s "$fname" ]; then + echo -e "Error: on line $linenum of $FILE_OF_BAM_FILES, file not found, or the file is empty ($fname)." + exit 2 + fi + ((linenum++)) +done < "$FILE_OF_BAM_FILES" + +mkdir -p "$OUTDIR" + +# Input files and parameter values +#CENTER_SITES=/home/erynes/topics/ENCODEpaper2017/CenterSitesFiles/centerSites_halfWin100bp_K36mapOnlyMinusBlacklist_chrs1-22XY.hg38.starch +#MAPPABLE_REGIONS=/home/erynes/topics/ENCODEpaper2017/GRCh38_36merMappableOnly_minusBlacklist_chrs1-22XY.starch +#CHROM_FILTER=/home/erynes/topics/ENCODEpaper2017/chromLengths_GRCh38_only1-22XY.bed3 +CALL_THRESHOLD=1.0 # write all sites to disk, including sites with FDR = 1 +HOTSPOT_FDR_THRESHOLD=0.0010 # 0.1% (not 1%) +CENTER_SITES=$(mktemp) + +function exe_check(){ + for exe in "$@"; do + if [ ! -x "$exe" ]; then + echo -e "Error: $exe was not found, or execution privileges for it are not set." + exit 2 + fi + done +} + +# Executables and scripts +HOTSPOT2=$(which hotspot2.sh) +EXTRACT=$(which extractCenterSites.sh) + +exe_check "$HOTSPOT2" "$EXTRACT" + +# Create center sites +"$EXTRACT" -c "$CHROM_FILTER" -o "$CENTER_SITES" + + +# Run hotspot2.sh +while read -r BAM +do + bname=$(basename "$BAM" .bam) + varWidthSpec="varWidth_${MIN_VARWIDTH_PEAK_WIDTH}_${bname}" + { + unset TMPDIR + "$HOTSPOT2" -c "$CHROM_FILTER" -C "$CENTER_SITES" -M "$MAPPABLE_REGIONS" -p "$varWidthSpec" -f "$HOTSPOT_FDR_THRESHOLD" -F "$CALL_THRESHOLD" "$BAM" "$OUTDIR" + } +done < "$FILE_OF_BAM_FILES" + + +OUTFILE=allPeaks.starch +{ + echo -e "Collating peaks..." + bedops -u "${OUTDIR}"/*.peaks.starch \ + | starch - \ + > "${OUTDIR}/$OUTFILE" + if ! mkdir -p "${OUTDIR}/subdir" ; then + echo -e "Failed to create directory ${OUTDIR}/subdir." + exit 2 + fi + echo -e '"Chunking" collated peaks for eventual use in master list creation...' + cd "${OUTDIR}/subdir" || exit 2 + unstarch "../${OUTFILE}" \ + | awk -v minChunkSep=10000 -v minChunkSep2=4000 -v minPerChunk=500 -v maxPerChunk=100000 \ + 'BEGIN{n=1; nWritten=0; outf="chunk0001.bed"}{ + if(NR > 1){ + if(\$1 != prevChr || \$2-prev3 > minChunkSep){ + if(nWritten >= minPerChunk){ + outf = sprintf("chunk%04d.bed", ++n); + nWritten = 0; + } + } + else{ + if(\$2-prev3 > minChunkSep2 && nWritten > maxPerChunk){ + outf = sprintf("chunk%04d.bed",++n); + nWritten=0; + } + } + } + print \$0 > outf; + nWritten++; prevChr = \$1; prev3 = \$3; + }' + echo -e "Done!" +} + +# Run the masterlist stuff +./run_sequential.sh From 11baeed6f2f563be317fa239e221e6d9491c3676 Mon Sep 17 00:00:00 2001 From: fwip Date: Thu, 15 Mar 2018 13:30:34 -0700 Subject: [PATCH 07/15] scratch script fixups --- run_from_scratch.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/run_from_scratch.sh b/run_from_scratch.sh index fa286fc..7a77261 100755 --- a/run_from_scratch.sh +++ b/run_from_scratch.sh @@ -12,7 +12,7 @@ OUTDIR=$2 CHROM_FILTER=$3 MAPPABLE_REGIONS=$4 -if [ -n "$5" ]; then +if [ $# = 5 ]; then MIN_VARWIDTH_PEAK_WIDTH=$5 else MIN_VARWIDTH_PEAK_WIDTH="20" @@ -38,7 +38,7 @@ mkdir -p "$OUTDIR" #CHROM_FILTER=/home/erynes/topics/ENCODEpaper2017/chromLengths_GRCh38_only1-22XY.bed3 CALL_THRESHOLD=1.0 # write all sites to disk, including sites with FDR = 1 HOTSPOT_FDR_THRESHOLD=0.0010 # 0.1% (not 1%) -CENTER_SITES=$(mktemp) +CENTER_SITES="$OUTDIR/centersites.starch" function exe_check(){ for exe in "$@"; do @@ -56,8 +56,11 @@ EXTRACT=$(which extractCenterSites.sh) exe_check "$HOTSPOT2" "$EXTRACT" # Create center sites +echo "Creating center sites" "$EXTRACT" -c "$CHROM_FILTER" -o "$CENTER_SITES" +ls -l "$CENTER_SITES" + # Run hotspot2.sh while read -r BAM @@ -107,4 +110,4 @@ OUTFILE=allPeaks.starch } # Run the masterlist stuff -./run_sequential.sh +./run_sequential.sh "$OUTDIR" "$OUTDIR" From 5ab7173d26a7c060acfc2fae2291881ec1498665 Mon Sep 17 00:00:00 2001 From: fwip Date: Fri, 16 Mar 2018 13:13:22 -0700 Subject: [PATCH 08/15] Add perl to dependencies --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 5a87dc3..b277735 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,7 @@ FROM hotspot2 RUN apk add --no-cache \ g++ \ + perl \ R \ R-dev RUN Rscript -e 'install.packages("caTools", repo="http://cran.rstudio.com")' From 2a83bfbcca7ec6dbf949b4103312a40d70e285d2 Mon Sep 17 00:00:00 2001 From: fwip Date: Fri, 16 Mar 2018 13:14:28 -0700 Subject: [PATCH 09/15] Fix matrix arg parsing --- code_build.R | 2 +- code_construct_matrix.sh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/code_build.R b/code_build.R index 20aeba3..6a2de98 100644 --- a/code_build.R +++ b/code_build.R @@ -19,7 +19,7 @@ if (length(args)==0) { stop("No arguments supplied.") } else { eval(parse(text=args[[1]])) # parse first argument: chunknum - eval(parse(text=args[[2]])) # parse first argument: filepath + filepath <- args[[2]] # parse second argument: filepath } chunk <- paste("chunk", sprintf("%04d", chunknum), ".bed", sep=""); diff --git a/code_construct_matrix.sh b/code_construct_matrix.sh index dc1321d..bad13ea 100755 --- a/code_construct_matrix.sh +++ b/code_construct_matrix.sh @@ -8,8 +8,9 @@ # # ########################################################################################################## +set -e -o pipefail -if [[ $# -le 2 ]] ; then +if [[ $# -lt 2 ]] ; then echo 'Provide the name/version of the file, e.g. WM20180301, and the number of chunks' exit 1 fi From 2142fc07bf1b6c768422d4cad235de9648dc5ba9 Mon Sep 17 00:00:00 2001 From: fwip Date: Fri, 16 Mar 2018 13:15:15 -0700 Subject: [PATCH 10/15] Add namingDHS src dir --- .../scripts/assign_genomic_feature_labels.pl | 341 ++++++++++++++++++ .../scripts/make_chromosome_edge_sites.pl | 32 ++ 2 files changed, 373 insertions(+) create mode 100755 namingDHS_src/scripts/assign_genomic_feature_labels.pl create mode 100755 namingDHS_src/scripts/make_chromosome_edge_sites.pl diff --git a/namingDHS_src/scripts/assign_genomic_feature_labels.pl b/namingDHS_src/scripts/assign_genomic_feature_labels.pl new file mode 100755 index 0000000..464be1c --- /dev/null +++ b/namingDHS_src/scripts/assign_genomic_feature_labels.pl @@ -0,0 +1,341 @@ +#!/usr/bin/perl -w +use strict; +use warnings; +use Getopt::Long; +use List::Util qw( min ); + +# Assign DHS identifiers for a semi-human-readable index. + +############ Paths to required programs. ########## +my $thisScriptFile = $0; +chomp( my $bedops = `which bedops` ); +chomp( my $bedmap = `which bedmap` ); +chomp( my $sort_bed = `which sort-bed` ); +chomp( my $closest_features = `which closest-features` ); +unless (-x $bedops) { + die "Failed to find bedops program\n"; +} + +chomp( my $scriptdir = `dirname $0` ); # a relative path is fine too +my $makeChromosomeEdgeSites = "$scriptdir/make_chromosome_edge_sites.pl"; + +# Generate ONE fake 1-bp DHS at each end for the namer, +# bedops -u with real input master list, +# and removed by -n 100% on output, with names not even reserved? +# but some namespace reserved. +# Hmmm... just 1 placeholder at the very ends could be kept for naming but how to keep track? +# WE keep track, because only WE assign the numbers. +# So we reserve those numbers and their existence puts some elbow room around to avoid too-long names. + +########## Decimal numbering, but could extend to anything else. ########## +my @symbols = (); +my $base = 10; +for (my $i = 0; $i < $base; ++$i) { + push @symbols, $i; +} +my $symbolString = join("", @symbols ); +my $zeroSymbol = $symbols[0]; + +########## Usage in two distinct modes, new or updated label assignments. ########## +my $usageText =< labelled.txt + +Options: +### Mode 1 : denovo Lay out numbers proportionally to a decent approximation of DHS space ### +INPUT: + --sizes hg38.chrom.sizes + --regions bed/starch file of DHSs + + Prefix will be the chromosome name without leading 'chr', followed by a dot. + +OUTPUT: + STDOUT bed4 with labels, requires sorting + --edgefile Filename to write pseudo-DHS at chrom boundaries for future labelling (SAVE THIS FILE!) + +### Mode 2 : update Make new labels consistent with previous ones ### +INPUT: + --regions bed/starch file of new DHSs + --previous bed/starch (BED4) of previously-labelled DHSs + --retiredIDs optional text file of retired IDs that may or may not be in the previous BED file + --edgefile From first denovo pass, possibly edited if additional chromosomes in new genome build + + Prefix will be taken from adjacent DHSs or pseudo-DHSs via closest-features. + +OUTPUT: + STDOUT bed4 with labels, requires sorting +EOusage + +# TODO? [--overlap] bedmap overlap options to determine when to re-use an existing label (ONCE!) +# (Currently requiring exact overlap). +# For master-list DHSs, at least be stringent enough that no two input regions will get the same old name! + +########## Read command-line options. ########## +my $chromSizesFile; +my $assignRegionsFile; +my $previousRegionsFile; +my $edgeCaseLabelsFile; +my $retiredIDFile; + +GetOptions( "sizes=s" => \$chromSizesFile, + "regions=s" => \$assignRegionsFile, + "previous=s" => \$previousRegionsFile, + "edgefile=s" => \$edgeCaseLabelsFile, + "retiredIDs=s" => \$retiredIDFile ) +or die $usageText; + + +# Two options always required, the regions to label and a special file for handling chromosome edge cases. +unless (defined($assignRegionsFile)) { + die "Error: No regions file specified for label assignment.\n$usageText"; +} +unless (defined($edgeCaseLabelsFile)) { + die "Error: No edge file specified, required for input or output depending on assignment mode.\n$usageText"; +} + +#Read optional retired ids here before doing anything else +my %retiredIDs = (); + +# Existence of a previous file determines which mode we run in. +if (defined( $previousRegionsFile )) { + # Try to make new labels fit nicely in between the previous labels. + unless (-e $previousRegionsFile) { + die "Failed to find $previousRegionsFile\n"; + } + # Labels to avoid re-assigning + readRetiredIDs(); + # Make new labels consistent with exact-matching and neighboring previous regions + updateLabelAssignments(); + +} else { + # Try to make new label numbers spread out evenly across the genome. + unless (defined($chromSizesFile)) { + die "Error: No genome.chrom.sizes file specified.\n$usageText\n"; + } + # Need the fake edge markers sorted into the real input regions, but diverted on output. + # Could do this more cryptically with awk instead of external script, + # e.g. "awk '{print \$1\"\t0\t1\n\"\$1\"\t\"(\$2-1)\"\t\"\$2}' chrom.sizes"; + my %regionsByPrefix = (); + my $cmd = "$makeChromosomeEdgeSites $chromSizesFile | sort-bed - | $bedops -u - $assignRegionsFile"; + open (my $in, "$cmd |") or die "$!"; + while (<$in>) { + chomp; + my $line = $_; + my ($chrom,$min0,$max1,@etc) = split /\t/; + my $labelPrefix = $chrom; + if ($chrom =~ /^chr(.*)$/) { + $labelPrefix = $1; + } + $labelPrefix .= "."; + my $location = join("\t", ($chrom, $min0, $max1) ); + push @{$regionsByPrefix{$labelPrefix}}, $location; + } + close $in; + # Do the thing + if (-e $edgeCaseLabelsFile) { + die "Warning: $edgeCaseLabelsFile would be overwritten!\n" + . " Please move it out of the way first,\n" + . " because we don't want to clobber it by accident here.\n"; + } + open (my $outEdges, "| $sort_bed - > $edgeCaseLabelsFile") or die "$!"; + for my $labelPrefix (sort keys %regionsByPrefix) { + recursiveHierarchicalNumbering( $regionsByPrefix{$labelPrefix}, $symbolString, $labelPrefix, $outEdges, 1 ); + } + close $outEdges; +} + +######## Assign new labels consistent with existing labels. ######### +sub updateLabelAssignments { + my %fillIns = (); + my $cmd = "$bedops -u $previousRegionsFile $edgeCaseLabelsFile " + . "| $closest_features $assignRegionsFile - " + . "| $bedmap --echo --echo-map --exact - $previousRegionsFile "; + open (my $in, "$cmd |") or die "$!"; + while (<$in>) { + chomp; + my $mappedLine = $_; + my ($element,$prev5,$prev3,$exactMatch) = split /\|/; + #NOT USED: my ($elementChrom,$elementMin0,$elementMax1) = split /\t/, $element; + if (defined($exactMatch) and (length($exactMatch)>0)) { + # If the match criteria was made more lax, we still want the old region for the old label + print "$exactMatch\n"; + } else { + # Assign a new name in between the nearest flanking old names + my ($prev5Chrom,$prev5Min0,$prev5Max1,$prev5Name) = split /\t/, $prev5; + my ($prev3Chrom,$prev3Min0,$prev3Max1,$prev3Name) = split /\t/, $prev3; + unless (defined($prev5Name) and defined($prev3Name)) { + die "Missing a named neighbor in ($mappedLine)\n"; + } + my ($commonPrefix, $uniqueLeft, $uniqueRight) = compareFlankingIDs( $prev5Name, $prev3Name ); + my $key = "$commonPrefix|$uniqueLeft|$uniqueRight"; # just the ids would work too + push @{$fillIns{$key}}, $element; + } + } + close $in; + + foreach my $context (keys %fillIns) { + my $listref = $fillIns{$context}; + my ($commonPrefix, $uniqueLeft, $uniqueRight) = split /\|/, $context; + my @list = @$listref; + my $numvals = scalar(@list); + my @symsL = split //, $uniqueLeft; + my @symsR = split //, $uniqueRight; + # Zero-extend the boundary values to least common depth + while (scalar(@symsL) < scalar(@symsR)) { + push @symsL, $zeroSymbol; + } + while (scalar(@symsR) < scalar(@symsL)) { + push @symsR, $zeroSymbol; + } + # Assuming decimal digits because it makes it really easy. + # But can generalize if time permits, preferably in Python... + my $limitL = int( join( '', @symsL ) ); + my $limitR = int( join( '', @symsR ) ); + if ($limitR < $limitL) { + # Could happen if assembly changed + ($limitL, $limitR) = ($limitR, $limitL); + # If these ids are far away or random, interpolation will still work just be kinda nonsensical. + } + my @middlestrings = (); + for (my $i = $limitL + 1; $i < $limitR; ++$i) { + push @middlestrings, "$i"; + } + @middlestrings = excludeUsedIDs( $commonPrefix, \@middlestrings ); + while (scalar(@middlestrings) < $numvals) { + @middlestrings = (); + $limitL *= 10; + $limitR *= 10; + for (my $i = $limitL + 1; $i < $limitR; ++$i) { + push @middlestrings, "$i"; + } + @middlestrings = excludeUsedIDs( $commonPrefix, \@middlestrings ); + } + my @indices = getIndicesForSplittingList( scalar(@list), scalar(@middlestrings) ); + my $prevIndex = 0; + for (my $i = 0; $i < scalar(@list); ++$i) { + my $region = $list[$i]; + my $nextIndex = $indices[$i]; + # If there's a lot of space, we don't want new IDs too close to old ones or each other + # That way we limit the likelihood of having to make longer IDs. + my $useIndex = int( ($prevIndex + $nextIndex) / 2 ); + my $newID = $commonPrefix . $middlestrings[$useIndex]; + print "$region\t$newID\n"; + $prevIndex = $nextIndex; + } + } +} + +####### Remove retired labels from a list of candidates for assignment. ####### +sub excludeUsedIDs { + my ($prefix, $suffixlistref) = @_; + my @result = (); + foreach my $candidate (@$suffixlistref) { + while ($candidate =~ /^(.*)$zeroSymbol$/) { + $candidate = $1; + } + unless( defined( $retiredIDs{"$prefix$candidate"} ) ) { + push @result, $candidate; + } + } + return @result; +} + +####### Find the maximal common prefix of neighboring identifiers. ####### +sub compareFlankingIDs { + my ( $prev5, $prev3 ) = @_; + my $minlen = min( length($prev5), length($prev3) ); + my $common = 0; + while (($common < $minlen) && (substr($prev5,$common,1) eq substr($prev3,$common,1))) { + ++$common; + } + my $commonPrefix = substr( $prev5, 0, $common ); + die "assertion failed" unless ($commonPrefix eq substr( $prev3, 0, $common )); + my $uniqueLeft = substr( $prev5, $common ); + my $uniqueRight = substr( $prev3, $common ); + #print "common( $prev5, $prev3 ) = ($commonPrefix, $uniqueLeft, $uniqueRight)\n"; + return ($commonPrefix, $uniqueLeft, $uniqueRight); +} + +####### Method for spreading out namespace over a big region list the very first time ####### +sub recursiveHierarchicalNumbering { + my ($regionsRef, $symbolString, $prefix, $fileHandleForEdges, $startNumberingWithOne ) = @_; + my @list = @$regionsRef; + if (@list < 1) { + # NOP + } elsif (@list == 1) { + my $line = $list[0]; + my $name = removeTrailingZeros( $prefix ); + my ($chrom,$min0,$max1) = split /\t/, $line; + if ($max1 - $min0 == 1) { + # Fake edge case, keeping out of main output + print $fileHandleForEdges "$line\t$name\n"; + } else { + # just writing to stdout for now + print "$line\t$name\n"; + } + } else { + my @indices = getIndicesForSplittingList( length($symbolString) - $startNumberingWithOne, scalar(@list) ); + my $prevIndex = 0; + for (my $i = 0; $i < scalar(@indices); ++$i) { + my $nextIndex = $indices[$i]; + my @slice = @list[$prevIndex..($nextIndex-1)]; + my $symbol = substr( $symbolString, $i + $startNumberingWithOne, 1 ); + recursiveHierarchicalNumbering( \@slice, $symbolString, "$prefix$symbol", $fileHandleForEdges, 0 ); + $prevIndex = $nextIndex; + } + } +} + +####### Spread a small list more-or-less evenly across a larger list. ####### +sub getIndicesForSplittingList { + my ($numGroups, $origLength) = @_; + my $groups = $numGroups; + my $remaining = $origLength; + my $index = 0; + my @result = (); + while ($groups > 0) { + my $size = int( 0.5 + ($remaining / $groups) ); + $index += $size; + $remaining -= $size; + push @result, $index; + --$groups; + } + return @result; +} + + +####### Avoid re-using IDs already assigned in any previous version. ####### +sub readRetiredIDs { + # optional list of things that might not be in the old-id BED file + if (defined( $retiredIDFile )) { + open (my $ids, '<', $retiredIDFile) or die "$!"; + while (<$ids>) { + chomp; + $retiredIDs{removeTrailingZeros($_)} = 1; + } + close $ids; + } + # Also add ids from the previous file and the edge-case file + for my $bedFile ($previousRegionsFile, $edgeCaseLabelsFile) { + open (my $ids, "$bedops -u $bedFile |") or die "$!"; + while (<$ids>) { + chomp; + my ($chrom, $min0, $max1, $prevId) = split /\t/; + if (defined($prevId)) { + $retiredIDs{removeTrailingZeros($prevId)} = 1; + } + } + close $ids; + } +} + +# Like representing the decimal fraction of real numbers, +# adding zeroes doesn't change the ID number to be something different. +sub removeTrailingZeros { + my $prevId = shift; + while ($prevId =~ /^(.*)$zeroSymbol$/) { + $prevId = $1; + } + return $prevId; +} + diff --git a/namingDHS_src/scripts/make_chromosome_edge_sites.pl b/namingDHS_src/scripts/make_chromosome_edge_sites.pl new file mode 100755 index 0000000..9720d16 --- /dev/null +++ b/namingDHS_src/scripts/make_chromosome_edge_sites.pl @@ -0,0 +1,32 @@ +#!/usr/bin/perl -w +use strict; +use warnings; +use Getopt::Long; + +# Generate ONE fake 1-bp DHS at each end for the namer, +# bedops -u with real input master list, +# and removed by -n 100% on output, with names not even reserved? +# but some namespace reserved. +# Hmmm... just 1 placeholder at the very ends could be kept for naming but how to keep track? +# WE keep track, because only WE assign the numbers. +# So we reserve those numbers and their existence puts some elbow room around to avoid too-long names. +# +# equivalently, +# awk '{print $1"\t0\t1\n"$1"\t"($2-1)"\t"$2}' chrom.sizes + + +unless (@ARGV > 0) { + die "Usage: $0 genome.chrom.sizes\n"; +} + +my $chromSizesFile = $ARGV[0]; +open (my $in, '<', $chromSizesFile) or die "$!"; +while (<$in>) { + chomp; + my ($chrom, $size) = split /\t/; + print join("\t", ($chrom, 0, 1)) . "\n"; + print join("\t", ($chrom, $size - 1, $size)) . "\n"; +} +close $in; + + From 81f042f22e92cabe13bd5ec680c25b06931211f3 Mon Sep 17 00:00:00 2001 From: fwip Date: Fri, 16 Mar 2018 13:15:36 -0700 Subject: [PATCH 11/15] Update masterlist code --- code_gen_masterlist.sh | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/code_gen_masterlist.sh b/code_gen_masterlist.sh index 571c025..3e7449c 100755 --- a/code_gen_masterlist.sh +++ b/code_gen_masterlist.sh @@ -8,12 +8,15 @@ # # ########################################################################################################## +set -e -o pipefail + if [[ $# -eq 0 ]] ; then echo 'Provide the name/version of the file, e.g. WM20180301' exit 1 fi NAME=$1; +CHROM_FILE=$2 TYPES="all nonovl_any nonovl_core"; for TYPE in ${TYPES}; do @@ -31,26 +34,28 @@ for TYPE in ${TYPES}; do fi ### Generate label mapping - if ! [[ -f "masterlist_DHSs_${NAME}_all_chunkIDs2indexIDs.txt" && ${TYPE} == "all" ]] ; then - echo "Generating unique DHS identifiers" - ./run_name_master_list.sh "${FILE_CHUNKIDS}" - fi - - ### Apply mapping - if ! [[ -f "$FILE_INDEXIDS" ]] ; then - echo "Mapping chunk identifiers to final DHS identifiers" - awk 'BEGIN{ FS = OFS = "\t" } - FNR == NR { split($0, f, /\t/); map[f[2]] = f[1]; next } - { if ($4 in map) { $4 = map[$4] } } - { print }' "masterlist_DHSs_${NAME}_all_chunkIDs2indexIDs.txt" "${FILE_CHUNKIDS}" > "${FILE_INDEXIDS}" - fi + # if ! [[ -f "masterlist_DHSs_${NAME}_all_chunkIDs2indexIDs.txt" && ${TYPE} == "all" ]] ; then + # echo "Generating unique DHS identifiers" + # ./run_name_master_list.sh "${FILE_CHUNKIDS}" "$CHROM_FILE" + # fi + + # ### Apply mapping + # if ! [[ -f "$FILE_INDEXIDS" ]] ; then + # echo "Mapping chunk identifiers to final DHS identifiers" + # awk 'BEGIN{ FS = OFS = "\t" } + # FNR == NR { split($0, f, /\t/); map[f[2]] = f[1]; next } + # { if ($4 in map) { $4 = map[$4] } } + # { print }' "masterlist_DHSs_${NAME}_all_chunkIDs2indexIDs.txt" "${FILE_CHUNKIDS}" > "${FILE_INDEXIDS}" + # fi + + FILE_INDEXIDS=$FILE_CHUNKIDS ### Create browser loadable BED12 files if ! [[ -f "$FILE_BED" ]] ; then echo "Constructing BED12 file" #echo "browser position chr6:26020208-26022677" > ${FILE_BED} #echo "track name='Master list DHSs ${NAME} ${TYPE}' description='Master list DHSs ${NAME} ${TYPE}' visibility=2 useScore=1" >> ${FILE_BED} - awk "${FILE_INDEXIDS}" ' + awk ' function round(x) { if(x=="NA") { return 0 } else { return int(x + 0.5) } } BEGIN { OFS="\t"; } { @@ -99,12 +104,13 @@ for TYPE in ${TYPES}; do score=round(log($5+1)/log(10)*500) score=(score > 1000 ? 1000 : score) print $1, $2, $3, $4, score, ".", thickStart, thickEnd, "0,0,0", blockCount, blockSizes, blockStarts - }' > "${FILE_BED}" + }' "${FILE_INDEXIDS}" > "${FILE_BED}" fi if ! [[ -f "$FILE_BIGBED" ]] ; then echo "Converting BED file to BIGBED file" - bedToBigBed -type=bed12 "${FILE_BED}" <(tail -n +2 hg38.genome) "${FILE_BIGBED}" + #bedToBigBed -type=bed12 "${FILE_BED}" "$CHROM_FILE" "${FILE_BIGBED}" + bedToBigBed -type=bed12 "${FILE_BED}" <(cut -f1,3 "$CHROM_FILE") "${FILE_BIGBED}" fi echo "Load this track in the UCSC browser using the following:" From 9b59832156fe3cde53a2f177e25e7d5b7d10597e Mon Sep 17 00:00:00 2001 From: fwip Date: Fri, 16 Mar 2018 13:16:55 -0700 Subject: [PATCH 12/15] Fix awk & dir errors in run_from_scratch --- run_from_scratch.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/run_from_scratch.sh b/run_from_scratch.sh index 7a77261..b59725f 100755 --- a/run_from_scratch.sh +++ b/run_from_scratch.sh @@ -74,6 +74,7 @@ do done < "$FILE_OF_BAM_FILES" +here=$(dirname "$(readlink -f "$0")") OUTFILE=allPeaks.starch { echo -e "Collating peaks..." @@ -90,24 +91,26 @@ OUTFILE=allPeaks.starch | awk -v minChunkSep=10000 -v minChunkSep2=4000 -v minPerChunk=500 -v maxPerChunk=100000 \ 'BEGIN{n=1; nWritten=0; outf="chunk0001.bed"}{ if(NR > 1){ - if(\$1 != prevChr || \$2-prev3 > minChunkSep){ + if($1 != prevChr || $2-prev3 > minChunkSep){ if(nWritten >= minPerChunk){ outf = sprintf("chunk%04d.bed", ++n); nWritten = 0; } } else{ - if(\$2-prev3 > minChunkSep2 && nWritten > maxPerChunk){ + if($2-prev3 > minChunkSep2 && nWritten > maxPerChunk){ outf = sprintf("chunk%04d.bed",++n); nWritten=0; } } } - print \$0 > outf; - nWritten++; prevChr = \$1; prev3 = \$3; + print $0 > outf; + nWritten++; prevChr = $1; prev3 = $3; }' echo -e "Done!" } +cd "$here" + # Run the masterlist stuff -./run_sequential.sh "$OUTDIR" "$OUTDIR" +"$here/run_sequential.sh" "$OUTDIR/subdir" "$CHROM_FILTER" From 50fb58239bb873b6c16b83e2710cc4ed1fcdea2c Mon Sep 17 00:00:00 2001 From: fwip Date: Fri, 16 Mar 2018 13:19:17 -0700 Subject: [PATCH 13/15] Fix bugs in run_sequential, ignore matrix code --- run_sequential.sh | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/run_sequential.sh b/run_sequential.sh index 97150a6..52e89c2 100755 --- a/run_sequential.sh +++ b/run_sequential.sh @@ -1,7 +1,10 @@ #!/bin/bash +set -e -o pipefail + dir=$1 -outname=${2:-masterlist} +chrom_file=$2 +outname=${3:-masterlist} outdir=Rout mkdir -p "$outdir" @@ -12,20 +15,28 @@ i=1 for chunkfile in "$dir"/chunk*.bed ; do R CMD BATCH \ --no-save --no-restore \ - "--args chunknum=$i filepath=$chunkfile" \ + "--args chunknum=$i $chunkfile" \ ./code_build.R \ "$outdir/output_build_chunk_$i.Rout" - (i++) + ((i++)) done # Do something else for i in $(seq 1 "$numchunks") ; do R CMD BATCH --no-save --no-restore "--args chunknum=${i}" \ - code_overlap.R "Rout/output_overlap_chunk_${i}.Rout" + ./code_overlap.R "$outdir/output_overlap_chunk_${i}.Rout" done # Generate masterlist -./code_gen_masterlist.sh "$outname" +./code_gen_masterlist.sh "$(basename "$outname")" "$chrom_file" +# Copy output files out of the container +cp -r \ + DHS* \ + Rout \ + masterlist_DHSs* \ + peaks_* \ + "$dir" # Make a matrix -./code_construct_matrix.sh "$outname" "$numchunks" +# TODO: This comes later +# ./code_construct_matrix.sh "$(basename "$outname")" "$numchunks" From 9c2a91245a665df4916dc5b93f97f9ae61a425b1 Mon Sep 17 00:00:00 2001 From: fwip Date: Fri, 16 Mar 2018 14:24:19 -0700 Subject: [PATCH 14/15] Remove perl scripts and dependency --- Dockerfile | 1 - .../scripts/assign_genomic_feature_labels.pl | 341 ------------------ .../scripts/make_chromosome_edge_sites.pl | 32 -- 3 files changed, 374 deletions(-) delete mode 100755 namingDHS_src/scripts/assign_genomic_feature_labels.pl delete mode 100755 namingDHS_src/scripts/make_chromosome_edge_sites.pl diff --git a/Dockerfile b/Dockerfile index b277735..5a87dc3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,6 @@ FROM hotspot2 RUN apk add --no-cache \ g++ \ - perl \ R \ R-dev RUN Rscript -e 'install.packages("caTools", repo="http://cran.rstudio.com")' diff --git a/namingDHS_src/scripts/assign_genomic_feature_labels.pl b/namingDHS_src/scripts/assign_genomic_feature_labels.pl deleted file mode 100755 index 464be1c..0000000 --- a/namingDHS_src/scripts/assign_genomic_feature_labels.pl +++ /dev/null @@ -1,341 +0,0 @@ -#!/usr/bin/perl -w -use strict; -use warnings; -use Getopt::Long; -use List::Util qw( min ); - -# Assign DHS identifiers for a semi-human-readable index. - -############ Paths to required programs. ########## -my $thisScriptFile = $0; -chomp( my $bedops = `which bedops` ); -chomp( my $bedmap = `which bedmap` ); -chomp( my $sort_bed = `which sort-bed` ); -chomp( my $closest_features = `which closest-features` ); -unless (-x $bedops) { - die "Failed to find bedops program\n"; -} - -chomp( my $scriptdir = `dirname $0` ); # a relative path is fine too -my $makeChromosomeEdgeSites = "$scriptdir/make_chromosome_edge_sites.pl"; - -# Generate ONE fake 1-bp DHS at each end for the namer, -# bedops -u with real input master list, -# and removed by -n 100% on output, with names not even reserved? -# but some namespace reserved. -# Hmmm... just 1 placeholder at the very ends could be kept for naming but how to keep track? -# WE keep track, because only WE assign the numbers. -# So we reserve those numbers and their existence puts some elbow room around to avoid too-long names. - -########## Decimal numbering, but could extend to anything else. ########## -my @symbols = (); -my $base = 10; -for (my $i = 0; $i < $base; ++$i) { - push @symbols, $i; -} -my $symbolString = join("", @symbols ); -my $zeroSymbol = $symbols[0]; - -########## Usage in two distinct modes, new or updated label assignments. ########## -my $usageText =< labelled.txt - -Options: -### Mode 1 : denovo Lay out numbers proportionally to a decent approximation of DHS space ### -INPUT: - --sizes hg38.chrom.sizes - --regions bed/starch file of DHSs - - Prefix will be the chromosome name without leading 'chr', followed by a dot. - -OUTPUT: - STDOUT bed4 with labels, requires sorting - --edgefile Filename to write pseudo-DHS at chrom boundaries for future labelling (SAVE THIS FILE!) - -### Mode 2 : update Make new labels consistent with previous ones ### -INPUT: - --regions bed/starch file of new DHSs - --previous bed/starch (BED4) of previously-labelled DHSs - --retiredIDs optional text file of retired IDs that may or may not be in the previous BED file - --edgefile From first denovo pass, possibly edited if additional chromosomes in new genome build - - Prefix will be taken from adjacent DHSs or pseudo-DHSs via closest-features. - -OUTPUT: - STDOUT bed4 with labels, requires sorting -EOusage - -# TODO? [--overlap] bedmap overlap options to determine when to re-use an existing label (ONCE!) -# (Currently requiring exact overlap). -# For master-list DHSs, at least be stringent enough that no two input regions will get the same old name! - -########## Read command-line options. ########## -my $chromSizesFile; -my $assignRegionsFile; -my $previousRegionsFile; -my $edgeCaseLabelsFile; -my $retiredIDFile; - -GetOptions( "sizes=s" => \$chromSizesFile, - "regions=s" => \$assignRegionsFile, - "previous=s" => \$previousRegionsFile, - "edgefile=s" => \$edgeCaseLabelsFile, - "retiredIDs=s" => \$retiredIDFile ) -or die $usageText; - - -# Two options always required, the regions to label and a special file for handling chromosome edge cases. -unless (defined($assignRegionsFile)) { - die "Error: No regions file specified for label assignment.\n$usageText"; -} -unless (defined($edgeCaseLabelsFile)) { - die "Error: No edge file specified, required for input or output depending on assignment mode.\n$usageText"; -} - -#Read optional retired ids here before doing anything else -my %retiredIDs = (); - -# Existence of a previous file determines which mode we run in. -if (defined( $previousRegionsFile )) { - # Try to make new labels fit nicely in between the previous labels. - unless (-e $previousRegionsFile) { - die "Failed to find $previousRegionsFile\n"; - } - # Labels to avoid re-assigning - readRetiredIDs(); - # Make new labels consistent with exact-matching and neighboring previous regions - updateLabelAssignments(); - -} else { - # Try to make new label numbers spread out evenly across the genome. - unless (defined($chromSizesFile)) { - die "Error: No genome.chrom.sizes file specified.\n$usageText\n"; - } - # Need the fake edge markers sorted into the real input regions, but diverted on output. - # Could do this more cryptically with awk instead of external script, - # e.g. "awk '{print \$1\"\t0\t1\n\"\$1\"\t\"(\$2-1)\"\t\"\$2}' chrom.sizes"; - my %regionsByPrefix = (); - my $cmd = "$makeChromosomeEdgeSites $chromSizesFile | sort-bed - | $bedops -u - $assignRegionsFile"; - open (my $in, "$cmd |") or die "$!"; - while (<$in>) { - chomp; - my $line = $_; - my ($chrom,$min0,$max1,@etc) = split /\t/; - my $labelPrefix = $chrom; - if ($chrom =~ /^chr(.*)$/) { - $labelPrefix = $1; - } - $labelPrefix .= "."; - my $location = join("\t", ($chrom, $min0, $max1) ); - push @{$regionsByPrefix{$labelPrefix}}, $location; - } - close $in; - # Do the thing - if (-e $edgeCaseLabelsFile) { - die "Warning: $edgeCaseLabelsFile would be overwritten!\n" - . " Please move it out of the way first,\n" - . " because we don't want to clobber it by accident here.\n"; - } - open (my $outEdges, "| $sort_bed - > $edgeCaseLabelsFile") or die "$!"; - for my $labelPrefix (sort keys %regionsByPrefix) { - recursiveHierarchicalNumbering( $regionsByPrefix{$labelPrefix}, $symbolString, $labelPrefix, $outEdges, 1 ); - } - close $outEdges; -} - -######## Assign new labels consistent with existing labels. ######### -sub updateLabelAssignments { - my %fillIns = (); - my $cmd = "$bedops -u $previousRegionsFile $edgeCaseLabelsFile " - . "| $closest_features $assignRegionsFile - " - . "| $bedmap --echo --echo-map --exact - $previousRegionsFile "; - open (my $in, "$cmd |") or die "$!"; - while (<$in>) { - chomp; - my $mappedLine = $_; - my ($element,$prev5,$prev3,$exactMatch) = split /\|/; - #NOT USED: my ($elementChrom,$elementMin0,$elementMax1) = split /\t/, $element; - if (defined($exactMatch) and (length($exactMatch)>0)) { - # If the match criteria was made more lax, we still want the old region for the old label - print "$exactMatch\n"; - } else { - # Assign a new name in between the nearest flanking old names - my ($prev5Chrom,$prev5Min0,$prev5Max1,$prev5Name) = split /\t/, $prev5; - my ($prev3Chrom,$prev3Min0,$prev3Max1,$prev3Name) = split /\t/, $prev3; - unless (defined($prev5Name) and defined($prev3Name)) { - die "Missing a named neighbor in ($mappedLine)\n"; - } - my ($commonPrefix, $uniqueLeft, $uniqueRight) = compareFlankingIDs( $prev5Name, $prev3Name ); - my $key = "$commonPrefix|$uniqueLeft|$uniqueRight"; # just the ids would work too - push @{$fillIns{$key}}, $element; - } - } - close $in; - - foreach my $context (keys %fillIns) { - my $listref = $fillIns{$context}; - my ($commonPrefix, $uniqueLeft, $uniqueRight) = split /\|/, $context; - my @list = @$listref; - my $numvals = scalar(@list); - my @symsL = split //, $uniqueLeft; - my @symsR = split //, $uniqueRight; - # Zero-extend the boundary values to least common depth - while (scalar(@symsL) < scalar(@symsR)) { - push @symsL, $zeroSymbol; - } - while (scalar(@symsR) < scalar(@symsL)) { - push @symsR, $zeroSymbol; - } - # Assuming decimal digits because it makes it really easy. - # But can generalize if time permits, preferably in Python... - my $limitL = int( join( '', @symsL ) ); - my $limitR = int( join( '', @symsR ) ); - if ($limitR < $limitL) { - # Could happen if assembly changed - ($limitL, $limitR) = ($limitR, $limitL); - # If these ids are far away or random, interpolation will still work just be kinda nonsensical. - } - my @middlestrings = (); - for (my $i = $limitL + 1; $i < $limitR; ++$i) { - push @middlestrings, "$i"; - } - @middlestrings = excludeUsedIDs( $commonPrefix, \@middlestrings ); - while (scalar(@middlestrings) < $numvals) { - @middlestrings = (); - $limitL *= 10; - $limitR *= 10; - for (my $i = $limitL + 1; $i < $limitR; ++$i) { - push @middlestrings, "$i"; - } - @middlestrings = excludeUsedIDs( $commonPrefix, \@middlestrings ); - } - my @indices = getIndicesForSplittingList( scalar(@list), scalar(@middlestrings) ); - my $prevIndex = 0; - for (my $i = 0; $i < scalar(@list); ++$i) { - my $region = $list[$i]; - my $nextIndex = $indices[$i]; - # If there's a lot of space, we don't want new IDs too close to old ones or each other - # That way we limit the likelihood of having to make longer IDs. - my $useIndex = int( ($prevIndex + $nextIndex) / 2 ); - my $newID = $commonPrefix . $middlestrings[$useIndex]; - print "$region\t$newID\n"; - $prevIndex = $nextIndex; - } - } -} - -####### Remove retired labels from a list of candidates for assignment. ####### -sub excludeUsedIDs { - my ($prefix, $suffixlistref) = @_; - my @result = (); - foreach my $candidate (@$suffixlistref) { - while ($candidate =~ /^(.*)$zeroSymbol$/) { - $candidate = $1; - } - unless( defined( $retiredIDs{"$prefix$candidate"} ) ) { - push @result, $candidate; - } - } - return @result; -} - -####### Find the maximal common prefix of neighboring identifiers. ####### -sub compareFlankingIDs { - my ( $prev5, $prev3 ) = @_; - my $minlen = min( length($prev5), length($prev3) ); - my $common = 0; - while (($common < $minlen) && (substr($prev5,$common,1) eq substr($prev3,$common,1))) { - ++$common; - } - my $commonPrefix = substr( $prev5, 0, $common ); - die "assertion failed" unless ($commonPrefix eq substr( $prev3, 0, $common )); - my $uniqueLeft = substr( $prev5, $common ); - my $uniqueRight = substr( $prev3, $common ); - #print "common( $prev5, $prev3 ) = ($commonPrefix, $uniqueLeft, $uniqueRight)\n"; - return ($commonPrefix, $uniqueLeft, $uniqueRight); -} - -####### Method for spreading out namespace over a big region list the very first time ####### -sub recursiveHierarchicalNumbering { - my ($regionsRef, $symbolString, $prefix, $fileHandleForEdges, $startNumberingWithOne ) = @_; - my @list = @$regionsRef; - if (@list < 1) { - # NOP - } elsif (@list == 1) { - my $line = $list[0]; - my $name = removeTrailingZeros( $prefix ); - my ($chrom,$min0,$max1) = split /\t/, $line; - if ($max1 - $min0 == 1) { - # Fake edge case, keeping out of main output - print $fileHandleForEdges "$line\t$name\n"; - } else { - # just writing to stdout for now - print "$line\t$name\n"; - } - } else { - my @indices = getIndicesForSplittingList( length($symbolString) - $startNumberingWithOne, scalar(@list) ); - my $prevIndex = 0; - for (my $i = 0; $i < scalar(@indices); ++$i) { - my $nextIndex = $indices[$i]; - my @slice = @list[$prevIndex..($nextIndex-1)]; - my $symbol = substr( $symbolString, $i + $startNumberingWithOne, 1 ); - recursiveHierarchicalNumbering( \@slice, $symbolString, "$prefix$symbol", $fileHandleForEdges, 0 ); - $prevIndex = $nextIndex; - } - } -} - -####### Spread a small list more-or-less evenly across a larger list. ####### -sub getIndicesForSplittingList { - my ($numGroups, $origLength) = @_; - my $groups = $numGroups; - my $remaining = $origLength; - my $index = 0; - my @result = (); - while ($groups > 0) { - my $size = int( 0.5 + ($remaining / $groups) ); - $index += $size; - $remaining -= $size; - push @result, $index; - --$groups; - } - return @result; -} - - -####### Avoid re-using IDs already assigned in any previous version. ####### -sub readRetiredIDs { - # optional list of things that might not be in the old-id BED file - if (defined( $retiredIDFile )) { - open (my $ids, '<', $retiredIDFile) or die "$!"; - while (<$ids>) { - chomp; - $retiredIDs{removeTrailingZeros($_)} = 1; - } - close $ids; - } - # Also add ids from the previous file and the edge-case file - for my $bedFile ($previousRegionsFile, $edgeCaseLabelsFile) { - open (my $ids, "$bedops -u $bedFile |") or die "$!"; - while (<$ids>) { - chomp; - my ($chrom, $min0, $max1, $prevId) = split /\t/; - if (defined($prevId)) { - $retiredIDs{removeTrailingZeros($prevId)} = 1; - } - } - close $ids; - } -} - -# Like representing the decimal fraction of real numbers, -# adding zeroes doesn't change the ID number to be something different. -sub removeTrailingZeros { - my $prevId = shift; - while ($prevId =~ /^(.*)$zeroSymbol$/) { - $prevId = $1; - } - return $prevId; -} - diff --git a/namingDHS_src/scripts/make_chromosome_edge_sites.pl b/namingDHS_src/scripts/make_chromosome_edge_sites.pl deleted file mode 100755 index 9720d16..0000000 --- a/namingDHS_src/scripts/make_chromosome_edge_sites.pl +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl -w -use strict; -use warnings; -use Getopt::Long; - -# Generate ONE fake 1-bp DHS at each end for the namer, -# bedops -u with real input master list, -# and removed by -n 100% on output, with names not even reserved? -# but some namespace reserved. -# Hmmm... just 1 placeholder at the very ends could be kept for naming but how to keep track? -# WE keep track, because only WE assign the numbers. -# So we reserve those numbers and their existence puts some elbow room around to avoid too-long names. -# -# equivalently, -# awk '{print $1"\t0\t1\n"$1"\t"($2-1)"\t"$2}' chrom.sizes - - -unless (@ARGV > 0) { - die "Usage: $0 genome.chrom.sizes\n"; -} - -my $chromSizesFile = $ARGV[0]; -open (my $in, '<', $chromSizesFile) or die "$!"; -while (<$in>) { - chomp; - my ($chrom, $size) = split /\t/; - print join("\t", ($chrom, 0, 1)) . "\n"; - print join("\t", ($chrom, $size - 1, $size)) . "\n"; -} -close $in; - - From f48e3623e94fb7d311088b445491fb92db7f2f52 Mon Sep 17 00:00:00 2001 From: fwip Date: Fri, 16 Mar 2018 14:40:46 -0700 Subject: [PATCH 15/15] Remove general.R dependency --- code_build.R | 1 - code_matrix.R | 1 - code_overlap.R | 1 - general.R | 2732 ------------------------------------------------ 4 files changed, 2735 deletions(-) delete mode 100644 general.R diff --git a/code_build.R b/code_build.R index 6a2de98..8dca21e 100644 --- a/code_build.R +++ b/code_build.R @@ -8,7 +8,6 @@ # # ########################################################################################################## -source("general.R") library(caTools) source("code_ML.R") diff --git a/code_matrix.R b/code_matrix.R index 691a1a2..cd8f6d9 100644 --- a/code_matrix.R +++ b/code_matrix.R @@ -7,7 +7,6 @@ # # ########################################################################################################## -source("general.R") library(caTools) library(Matrix) source("code_ML.R") diff --git a/code_overlap.R b/code_overlap.R index 8ae85f2..df0b6af 100644 --- a/code_overlap.R +++ b/code_overlap.R @@ -7,7 +7,6 @@ # # ########################################################################################################## -source("general.R") library(caTools) source("code_ML.R") diff --git a/general.R b/general.R deleted file mode 100644 index 5df0495..0000000 --- a/general.R +++ /dev/null @@ -1,2732 +0,0 @@ -#library(CGtools) - -chrom_info <- list(human = paste("chr", c(1:22, "X"), sep=""), # hg18 - mouse = paste("chr", c(1:19, "X"), sep=""), # mm9 - fly = c("2L", "2R", "3L", "3R", "4", "X"), # dm3 - fugu = "chrUn"); - -#chromsize_info <- list(human = chromSE_HSA_NCBI36, # hg18 -# mouse = chromSE_MM_NCBIm37, # mm9 -# fly = chromSE_DMEL_R5, #dm3 -# fugu = data.frame(seqname="chrUn", start=1, end=400509343)); - -id <- function() paste("WM", gsub("-", "", Sys.Date()), sep="") - -#runmean <- function(x,k) { -# if ((k %% 2) == 0) { k <- k+1; } -# hk <- k %/% 2; -# -# res <- diff(cumsum(c(0,x)),k)/k -# -# #c(rep(NA, hk), res, rep(NA, hk)); -# c(rep(res[1], hk), res, rep(res[length(res)], hk)); -#} - -runsum <- function(x,k) { - if ((k %% 2) == 0) { k <- k+1; } - hk <- k %/% 2; - - res <- diff(cumsum(c(0,x)),k) - - #c(rep(NA, hk), res, rep(NA, hk)); - c(rep(res[1], hk), res, rep(res[length(res)], hk)); -} - -runsd <- function(x,k) { - if ((k %% 2) == 0) { k <- k+1; } - hk <- k %/% 2; - - a<-diff(c(0,cumsum(x^2)),k)*k - b<-diff(c(0,cumsum(x)),k)^2 - d<-(a-b)/(k*(k-1)) - res <- sqrt(d) - - #c(rep(NA, hk), res, rep(NA, hk)); - c(rep(res[1], hk), res, rep(res[length(res)], hk)); -} - -runvar <- function(x,k) { - if ((k %% 2) == 0) { k <- k+1; } - hk <- k %/% 2; - - a<-diff(c(0,cumsum(x^2)),k)*k - b<-diff(c(0,cumsum(x)),k)^2 - d<-(a-b)/(k*(k-1)) - res <- d - - #c(rep(NA, hk), res, rep(NA, hk)); - c(rep(res[1], hk), res, rep(res[length(res)], hk)); -} - -# Function to calculate a running t-statistic for samples with (unknown) unequal -# variance and size (i.e., like in Welch t-test) -runt <- function(x, k) { - if ((k %% 2) != 0) { k <- k+1; } - hk <- k %/% 2; - - #m <- runmean(x, k=hk) - m <- runsum(x, k=hk)/hk - m1 <- c(rep(NA, hk), m) - m2 <- c(m, rep(NA, hk)) - s <- runvar(x, k=hk) - s1 <- c(rep(NA, hk), s) - s2 <- c(s, rep(NA, hk)) - - res <- (m1 - m2) / sqrt(s1/hk + s2/hk) - hhk <- hk %/% 2; - res[(hhk+1):(length(res)-(hhk+1))] -} - -## Originally by Elzo de Wit (does not work properly) -#runcor <- function(a,b,n) { -# ab <- a * b -# prod <- diff(c(0, cumsum(ab)), n) -# #sub <- n * runmean(a,n) * runmean(b,n) -# sub <- n * (runsum(a, k=n)/n) * (runsum(b, k=n)/n) -# bottom <- (n-1) * runsd(a,n) * runsd(b,n) -# (prod-sub)/bottom -#} - - -odd <- function(x) { x[x %% 2 == 1] } -even <- function(x) { x[x %% 2 == 0] } - -readGFF <- function(file, purge=c(), nlines = NULL, skip = 0, verbose=FALSE) { - if(is.null(nlines)) - nlines <- as.numeric(system(paste("cat \"", file, "\" | wc -l", sep=''), intern=TRUE)) - - if(verbose) - cat(paste("Reading in", nlines, "lines...")); - - what_list <- list(seqname='c', source='c', feature='c', start=0, end=0, - score=0.1, strand='0', frame='c', attribute='c'); - - if (length(purge) > 0) { what_list[purge] <- list(NULL); } - - data <- scan(file, nmax=nlines, skip=skip, what=what_list, sep='\t', quiet=TRUE); - - if (length(purge) > 0) { data[purge] <- NULL } - - class(data) <- "data.frame"; - row.names(data) <- 1:nlines; - - if(verbose) - cat("done.\n"); - - data$score <- as.numeric(data$score); - if (all(is.na(data$score))) { data$score <- 0; } - - invisible(data); -} - - -writeGFF <- function(data, file) { - n <- nrow(data); - newdata <- data.frame( seqname= rep(".", n), source = rep(".", n), feature = rep(".", n), - start = rep(0, n), end = rep(0, n), score = rep(0, n), - strand = rep(".", n), frame = rep(".", n), attribute = rep(".", n) ) - for (i in colnames(data)) { - if (i %in% names(newdata)) { - newdata[,i] <- data[,i]; - } - } - - write.table(newdata, file=file, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE); -} - -readBED <- function(file, purge=c(), nlines = NULL, skip = 0, verbose = FALSE) { - if(is.null(nlines)) - nlines <- as.numeric(system(paste("cat \"", file, "\" | wc -l", sep=''), intern=TRUE)) - - if(verbose) - cat(paste("Reading in", nlines, "lines...")); - - what_list <- list(seqname='c', start=0, end=0, name='c', score='c', strand='c'); - - if (length(purge) > 0) { what_list[purge] <- list(NULL); } - - data <- scan(file, nmax=nlines, skip=skip, what=what_list, sep='\t', quiet=TRUE); - - if (length(purge) > 0) { data[purge] <- NULL } - - class(data) <- "data.frame"; - row.names(data) <- 1:nlines; - - if(verbose) - cat("done.\n"); - - data$score <- as.numeric(data$score); - if (all(is.na(data$score))) { data$score <- 0; } - - invisible(data); -} - -readBED4 <- function(file, purge=c(), nlines = NULL, skip = 0, verbose = FALSE) { - if(is.null(nlines)) - nlines <- as.numeric(system(paste("cat \"", file, "\" | wc -l", sep=''), intern=TRUE)) - - if(verbose) - cat(paste("Reading in", nlines, "lines...")); - - what_list <- list(seqname='c', start=0, end=0, score='c'); - - if (length(purge) > 0) { what_list[purge] <- list(NULL); } - - data <- scan(file, nmax=nlines, skip=skip, what=what_list, sep='\t', quiet=TRUE); - - if (length(purge) > 0) { data[purge] <- NULL } - - class(data) <- "data.frame"; - row.names(data) <- 1:nlines; - - if(verbose) - cat("done.\n"); - - data$score <- as.numeric(data$score); - if (all(is.na(data$score))) { data$score <- 0; } - - invisible(data); -} - - -writeBED <- function(data, file, ...) { - n <- nrow(data); - newdata <- data.frame( seqname= rep(".", n), start = rep(0, n), end = rep(0, n), - name = rep(".", n), score = rep(0, n), strand = rep(".", n) ) - for (i in colnames(data)) { - if (i %in% names(newdata)) { - newdata[,i] <- data[,i]; - } - } - - write.table(newdata, file=file, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE, ...); -} - -writeWIG <- function(data, file) { - n <- nrow(data); - chroms <- unique(data$seqname); - span <- median(data$end-data$start); - append <- FALSE; - for (chr in chroms) { - write(paste("variableStep chrom=", chr, " span=", span, sep=""), file=file, append=append) - write.table(data[data$seqname==chr, c("start", "score")], file=file, - quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE, append=TRUE); - append <- TRUE; - } -} - -get_overlap <- function(regions, feats, modus=1, minOverlap=0.5) { - region_feats = NULL; - # We loop over regions, because we generally have - # less regions than features. - for (i in 1:nrow(regions)) { - if (i %% 100 == 0) { cat(paste(i, "\r")); } - - chr_hits <- feats$seqname == regions$seqname[i]; - cont_hits <- 1; # By default, this is satisfied. Only used for modus 3. - # Options: 1 => fully contained, 2 => partly contained, - # 3 => fraction contained at least 'minOverlap' - if (modus == 1) { - start_hits <- feats$start > regions$start[i]; - end_hits <- feats$end < regions$end[i]; - } else if (modus == 2 || modus == 3) { - start_hits <- feats$end > regions$start[i]; - end_hits <- feats$start < regions$end[i]; - - if (modus == 3) { - # Find position of feature relative to current region - pend <- pmin(feats$end, regions$end[i]) - pstart <- pmax(feats$start, regions$start[i]) - overlap <- (pend - pstart) / (feats$end - feats$start) - cont_hits <- overlap >= minOverlap - } - } else { - error("Unknown modus: use either 1, 2 or 3") - } - hits <- chr_hits & start_hits & end_hits & cont_hits - - if (any(hits, na.rm=TRUE)) { - hit_feats <- feats[which(hits),] - feats <- feats[-which(hits),]; - hit_feats$frame = paste("Region", i); - region_feats <- rbind(region_feats, hit_feats); - } - } - cat("\n"); - - list(fg=region_feats, bg=feats) -} - - -plot_regional_expression <- function(regions, file, delta=FALSE) { - genes <- readGFF("/data/people/meuleman/projects/cons/LADs_fourth_strike/data/Ensembl50_Mmus_NCBI37_genes.gff") - exons <- readGFF("/data/people/meuleman/projects/cons/LADs_fourth_strike/data/Ensembl50_Mmus_NCBI37_exons.gff") - data_path <- "/data/datasets/Wouter/Efroni2008_AffyTiling/mm16tpmap/"; # Trailing slash! - - if (delta) { - pdf(file=file, width=10, version="1.4") - par(mar=c(4,4,2,2)) - par(mfrow=c(3,1)) - } else { - pdf(file=file, width=14) - par(mar=c(4,4,2,2)) - par(mfrow=c(2,1)) - } - - for (region in regions) { - print(region); - items <- unlist(strsplit(region, "[:-]")); - chr <- items[1]; - start <- as.numeric(items[2]); - end <- as.numeric(items[3]); - - exons_chr <- exons[exons$seqname==chr,]; - genes_chr <- genes[genes$seqname==chr,]; - - ES_name <- load(paste(data_path, chr, "_ES.RData", sep="")) - ES <- get(ES_name) - NP_name <- load(paste(data_path, chr, "_NP.RData", sep="")) - NP <- get(NP_name) - - #ES$score[ES$score <= 0] <- NA - #NP$score[NP$score <= 0] <- NA - - ES$score <- ES$score - 4 - NP$score <- NP$score - 4 - - sel <- (NP$start > start & NP$end < end) - ymax <- max(c(ES$score[sel], NP$score[sel]), na.rm=T); - - plot(0, type="n", xlim=c(start,end), ylim=c(-ymax/5,ymax), xlab="Genomic location", - ylab="signal", col=2, main=paste("ESC (", region, ")", sep="")) - rect(ES$start, 0, ES$end, ES$score, border="NA", col=2) - # Genes - rect(genes_chr$start[genes_chr$strand=="+"], -1, genes_chr$end[genes_chr$strand=="+"], -ymax/10, border=TRUE) - rect(genes_chr$start[genes_chr$strand=="-"], -ymax/10, genes_chr$end[genes_chr$strand=="-"], -ymax/5, border=TRUE) - # Exons - rect(exons_chr$start[exons_chr$strand=="+"], -1, exons_chr$end[exons_chr$strand=="+"], -ymax/10, - col=as.factor(exons_chr$attribute[exons_chr$strand=="+"])) - rect(exons_chr$start[exons_chr$strand=="-"], -ymax/10, exons_chr$end[exons_chr$strand=="-"], -ymax/5, - col=as.factor(exons_chr$attribute[exons_chr$strand=="-"])) - - plot(0, type="n", xlim=c(start,end), ylim=c(-ymax/5,ymax), xlab="Genomic location", - ylab="signal", col=3, main=paste("NPC (", region, ")", sep="")) - rect(NP$start, 0, NP$end, NP$score, border="NA", col=3) - # Genes - rect(genes_chr$start[genes_chr$strand=="+"], -1, genes_chr$end[genes_chr$strand=="+"], -ymax/10, border=TRUE) - rect(genes_chr$start[genes_chr$strand=="-"], -ymax/10, genes_chr$end[genes_chr$strand=="-"], -ymax/5, border=TRUE) - # Exons - rect(exons_chr$start[exons_chr$strand=="+"], -1, exons_chr$end[exons_chr$strand=="+"], -ymax/10, - col=as.factor(exons_chr$attribute[exons_chr$strand=="+"])) - rect(exons_chr$start[exons_chr$strand=="-"], -ymax/10, exons_chr$end[exons_chr$strand=="-"], -ymax/5, - col=as.factor(exons_chr$attribute[exons_chr$strand=="-"])) - - if (delta) { - ratio_name <- load(paste(data_path, chr, ".RData", sep="")) - ratio <- get(ratio_name) - dScore <- ratio$score - - nonNA <- (!is.na(ES$score) & !is.na(NP$score) & !is.na(dScore)); - ES <- ES[nonNA,] - NP <- NP[nonNA,] - dScore <- dScore[nonNA] - - # Used for running medians - ES$start <- NP$start <- runmed(NP$start, k=49) - ES$end <- NP$end <- runmed(NP$end, k=49) - ES$score <- runmed(ES$score, k=49) - NP$score <- runmed(NP$score, k=49) - dScore <- runmed(dScore, k=49); - - #dScore <- log2(ES$score/NP$score); - sel <- (NP$start > start & NP$end < end) - - yrange <- range(c(0, dScore[sel]), finite=TRUE); - plot(0, type="n", xlim=c(start,end), ylim=yrange, xlab="Genomic location", - ylab="signal", main=paste("Z-score (", region, ")", sep="")) - - use_alpha <- FALSE; - if (use_alpha) { - signal <- ES$score+NP$score - alphas <- (signal - min(signal)) / (max(signal) - min(signal)) - rect(ES$start[dScore>0], 0, ES$end[dScore>0], dScore[dScore>0], - border="NA", col=rgb(1,0,0,alphas[dScore>0])) - rect(NP$start[dScore<0], dScore[dScore<0], NP$end[dScore<0], 0, - border="NA", col=rgb(0,1,0,alphas[dScore<0])) - } else { - rect(ES$start[dScore>0], 0, ES$end[dScore>0], dScore[dScore>0], - border="NA", col=rgb(1,0,0)) - rect(NP$start[dScore<0], dScore[dScore<0], NP$end[dScore<0], 0, - border="NA", col=rgb(0,1,0)) - } - } - } - - dev.off() -} - - -CGintvlOverlap_Wouter <- function (domains, features, remapIndices = TRUE, fun=mean, ...) { - allseq = unique(c(domains$seqname, features$seqname)) - res = vector(mode = "list", length = length(allseq)) - names(res) = allseq - warnings <- c() - for (chr in allseq) { - print(chr) - d <- domains[domains$seqname == chr, ] - fOrder <- which(features$seqname == chr) - f <- features[fOrder, ] - res[[chr]] <- .domOverlap(d$start, d$end, f$start, f$end, - ...) - warnings <- c(warnings, res[[chr]]$warn) - res[[chr]] <- res[[chr]]$overlap - - #res[[chr]] <- lapply(res[[chr]], function(v) median(f$score[v], na.rm=T)) - res[[chr]] <- lapply(res[[chr]], function(v) fun(f$score[v])) - #res[[chr]] <- lapply(res[[chr]], function(v) f$score[v]) - #res[[chr]] <- lapply(res[[chr]], function(v) min(f$score[v])) -# if (remapIndices) -# res[[chr]] <- lapply(res[[chr]], function(v) fOrder[v]) - } - res -} - -# For every (extended) domain, the function finds the features contained in it. -# Then, the index of the nearest neighbour feature (based on domain and feature -# centers) is returned. If no features are found, NA is returned. -CGintvlOverlap_Wouter_closest <- function (domains, features, ...) { - allseq = unique(c(domains$seqname, features$seqname)) - res = vector(mode = "list", length = length(allseq)) - names(res) = allseq - warnings <- c() - for (chr in allseq) { - print(chr) - d <- domains[domains$seqname == chr, ] - fOrder <- which(features$seqname == chr) - f <- features[fOrder, ] - dctr <- (d$start+d$end)/2 - fctr <- (f$start+f$end)/2 - res[[chr]] <- .domOverlap(d$start, d$end, f$start, f$end, ...) - warnings <- c(warnings, res[[chr]]$warn) - res[[chr]] <- res[[chr]]$overlap - - di <- 0; - res[[chr]] <- lapply(res[[chr]], function(v) { - di <<- di + 1; - if (length(v) > 0) - fOrder[v[which.min(abs(fctr[v] - dctr[di]))]] - else - NA - }) - } - res -} - - -fix_overlap <- function(x.in) { - # create matrix to determine queue size (overlap) - x.q <- rbind(cbind(x.in$start, 1, 1:nrow(x.in)), - cbind(x.in$end+1, -1, 1:nrow(x.in))) - # sort - x.q <- x.q[order(x.q[,1], x.q[,2]),] - x.q <- cbind(x.q, queue=cumsum(x.q[,2])) - - end <- which(x.q[,4] == 0); - start <- c(1, end+1); - start <- start[-length(start)]; - - # This removes all overlapping features! - #diff_res <- apply(cbind(start, end), 1, diff) - #diff_hits <- which(diff_res > 1) - #diff_sel <- unlist(apply(cbind(start[diff_hits], - # end[diff_hits]), - # 1, function(x) x[1]:x[2])) - #x.in[-unique(x.q[diff_sel,3]),] - - data.frame(start=x.q[start,1], end=x.q[end,1]-1) -} - -plot_delta_dom_sizes <- function(delta_doms_pos, delta_doms_neg, chr_sizes, file="delta_doms_sizes.pdf") { - pdf(file=file) - - delta_doms <- rbind(delta_doms_pos, delta_doms_neg); - sizes <- delta_doms$end-delta_doms$start+1 - - empty_sizes <- chr_sizes; empty_sizes[names(chr_sizes)] <- 0; - - delta_sizes <- pos_delta_sizes <- neg_delta_sizes <- empty_sizes - tmp_sizes <- tapply(sizes, delta_doms$seqname, function(x) sum(x)) - delta_sizes[names(tmp_sizes)] <- tmp_sizes - - print(cbind(median(sizes), length(sizes))); - plot(dens <- density(log10(sizes)), lwd=2, - main="Size of delta Lamin domains (+ & -)", xlab="log10(bp)", ylab="Density") - abline(v=median(log10(sizes)), lwd=2, col=2) - - barplot(round(delta_sizes/chr_sizes*100, 2), las=2, ylab="% of chromosome size", - main=paste("delta Lamin fractions (+ & -)\nGenome-wide: ", - round(sum(as.numeric(delta_sizes))/sum(as.numeric(chr_sizes))*100, 2), "%", sep="")) - - delta_sizes <- chr_sizes - tmp_sizes <- tapply(sizes, delta_doms$seqname, function(x) sum(x)) - delta_sizes[names(delta_sizes)] <- 0; - delta_sizes[names(tmp_sizes)] <- tmp_sizes - - pos_sizes <- delta_doms_pos$end-delta_doms_pos$start+1 - pos_delta_sizes <- empty_sizes - tmp_sizes <- tapply(pos_sizes, delta_doms_pos$seqname, function(x) sum(x)) - pos_delta_sizes[names(tmp_sizes)] <- tmp_sizes - print(cbind(median(pos_sizes), length(pos_sizes))); - plot(dens <- density(log10(pos_sizes)), lwd=2, - main="Size of delta Lamin domains (+)", xlab="log10(bp)", ylab="Density") - abline(v=median(log10(pos_sizes)), lwd=2, col=2) - - pos_delta_frac <- round(pos_delta_sizes/chr_sizes*100, 2) - barplot(pos_delta_frac, las=2, ylab="% of chromosome size", col="blue", - main=paste("delta Lamin fractions (+)\nGenome-wide: ", - round(sum(as.numeric(pos_delta_sizes))/sum(as.numeric(chr_sizes))*100, 2), "%", sep="")) - - neg_sizes <- delta_doms_neg$end-delta_doms_neg$start+1 - neg_delta_sizes <- empty_sizes - tmp_sizes <- tapply(neg_sizes, delta_doms_neg$seqname, function(x) sum(x)) - neg_delta_sizes[names(tmp_sizes)] <- tmp_sizes - print(cbind(median(neg_sizes), length(neg_sizes))); - plot(dens <- density(log10(neg_sizes)), lwd=2, - main="Size of delta Lamin domains (-)", xlab="log10(bp)", ylab="Density") - abline(v=median(log10(neg_sizes)), lwd=2, col=2) - - neg_delta_frac <- round(neg_delta_sizes/chr_sizes*100, 2) - barplot(neg_delta_frac, las=2, ylab="% of chromosome size", col="orange", - main=paste("delta Lamin fractions (-)\nGenome-wide: ", - round(sum(as.numeric(neg_delta_sizes))/sum(as.numeric(chr_sizes))*100, 2), "%", sep="")) - - pos_neg_mat <- rbind(pos_delta_frac, neg_delta_frac) - barplot(pos_neg_mat, las=2, ylab="% of chromosome size", col=c("blue","orange"), - main=paste("delta Lamin fractions (+ & -)\nGenome-wide: ", - round(sum(as.numeric(delta_sizes))/sum(as.numeric(chr_sizes))*100, 2), "%", sep="")) - legend("topright", "(x,y)", fill=c("blue","orange"), legend=c("+", "-")) - - dev.off() -} - -plot_detail <- function(ES, NP, NP_ES, delta_doms_pos, delta_doms_neg, chr, from, to, id="default", path="plots", pdf=FALSE, lwd=1) { - if (pdf) { - pdf(file=paste(path, "/detail_", chr, "_", from, "-", to, "_", id, ".pdf", sep=""), width=15) - } else { -# png(file=paste(path, "/detail_", chr, "_", from, "-", to, "_", id, ".png", sep=""), width=960) - } - - ES_chr <- ES[ES$seqname==chr,] - NP_chr <- NP[ES$seqname==chr,] - NP_ES_chr <- NP_ES[ES$seqname==chr,] - delta_doms_neg_chr <- delta_doms_neg[delta_doms_neg$seqname==chr,] - delta_doms_pos_chr <- delta_doms_pos[delta_doms_pos$seqname==chr,] - - ext <- (to - from) - print(paste("Plotting region ", chr, ":", from-ext, "-", to+ext, sep="")) - - par(mfrow=c(4,1), mar=c(0,2,0,1)) - plot(ES_chr$start, runmed(ES_chr$score, k=5), type="h", col="orange", xlim=c(from-ext, to+ext), xaxt="n", xaxs="i", lwd=lwd) - abline(v=c(from,to), lty=2) - plot(NP_chr$start, runmed(NP_chr$score, k=5), type="h", col="blue", xlim=c(from-ext, to+ext), xaxt="n", xaxs="i", lwd=lwd) - abline(v=c(from,to), lty=2) - plot(NP_ES_chr$start, runmed(NP_ES_chr$score, k=5), type="h", col=ifelse(runmed(NP_ES_chr$score, k=5) > 0, "blue", "orange"), xlim=c(from-ext, to+ext), xaxt="n", xaxs="i", lwd=lwd) - abline(v=c(from,to), lty=2) - plot(0, type="n", xlim=c(from-ext, to+ext), ylim=c(0, 2), yaxt="n", axes=FALSE, xlab="", ylab="", xaxs="i") - par(mar=c(8,2,0,1), new=TRUE) - plot(0, type="n", xlim=c(from-ext, to+ext), ylim=c(0, 2), yaxt="n", xaxs="i", xlab=paste("Chromosome", sub("chr", "", chr), "(bp)")) - if (!is.null(delta_doms_neg_chr)) - rect(delta_doms_neg_chr$start, 1.5, delta_doms_neg_chr$end, 1.7, col="grey") - if (!is.null(delta_doms_pos_chr)) - rect(delta_doms_pos_chr$start, 1.8, delta_doms_pos_chr$end, 2, col="grey") - -# dev.off() -} - - -plot_three_singles <- function(ES, NP, EF, ES_doms, NP_doms, EF_doms, id="default", path="plots", pdf=FALSE) { - chroms <- unique(ES$seqname) - for (i in chroms) { - print(i) - - if (pdf) { - pdf(file=paste(path, "/three_singles_", id, "_", i, ".pdf", sep=""), width=15) - } else { - png(file=paste(path, "/three_singles_", id, "_", i, ".png", sep=""), width=960) - } - - par(mfrow=c(3,1)) - - ylim <- quantile(ES$score[ES$seqname == i], probs=c(0.01,0.99)) - plot(ES$start[ES$seqname == i], runmed(ES$score[ES$seqname == i], 49), type="h", bty="n", - xlab="genomic location (bp)", ylab="LaminB1 interaction", - ylim=ylim, main="ES cells", col="orange") - if (!is.null(ES_doms) && (nrow(ES_doms[ES_doms$seqname == i,]) > 0)) - rect(ES_doms$start[ES_doms$seqname == i], ylim[1], ES_doms$end[ES_doms$seqname == i], ylim[1]+0.2, col = "grey"); - - ylim <- quantile(NP$score[NP$seqname == i], probs=c(0.01,0.99)) - plot(NP$start[NP$seqname == i], runmed(NP$score[NP$seqname == i], 49), type="h", bty="n", - xlab="genomic location (bp)", ylab="LaminB1 interaction", - ylim=ylim, main="NP cells", col="blue") - if (!is.null(NP_doms) && (nrow(NP_doms[NP_doms$seqname == i,]) > 0)) - rect(NP_doms$start[NP_doms$seqname == i], ylim[1], NP_doms$end[NP_doms$seqname == i], ylim[1]+0.2, col = "grey"); - - ylim <- quantile(EF$score[EF$seqname == i], probs=c(0.01,0.99)) - plot(EF$start[EF$seqname == i], runmed(EF$score[EF$seqname == i], 49), type="h", bty="n", - xlab="genomic location (bp)", ylab="LaminB1 interaction", - ylim=ylim, main="EF cells", col="magenta") - if (!is.null(EF_doms) && (nrow(EF_doms[EF_doms$seqname == i,]) > 0)) - rect(EF_doms$start[EF_doms$seqname == i], ylim[1], EF_doms$end[EF_doms$seqname == i], ylim[1]+0.2, col = "grey"); - - dev.off() - } -} - -plot_detail_specific <- function(signal1, signal2, delta, Entrez, sel) { - chr <- min(sel) - plot_detail(signal1, signal2, delta, NULL, NULL, chr=Entrez$seqname[chr], - from=Entrez$start[min(sel)], to=Entrez$end[max(sel)]) - rect(Entrez$start[Entrez$seqname==Entrez$seqname[chr] & Entrez$strand=="+"], 1, - Entrez$end[Entrez$seqname==Entrez$seqname[chr] & Entrez$strand=="+"], 1.2, col=2) - rect(Entrez$start[Entrez$seqname==Entrez$seqname[chr] & Entrez$strand=="-"], 0.7, - Entrez$end[Entrez$seqname==Entrez$seqname[chr] & Entrez$strand=="-"], 0.9, col=2) -} - -plot_detail_specific_Ensembl <- function(signal1, signal2, delta, Entrez, Ensembl, sel) { - chr <- min(sel) - plot_detail(signal1, signal2, delta, NULL, NULL, chr=Entrez$seqname[chr], - from=Entrez$start[min(sel)], to=Entrez$end[max(sel)]) - - rect(Entrez$start[Entrez$seqname==Entrez$seqname[chr] & Entrez$strand=="+"], 1, - Entrez$end[Entrez$seqname==Entrez$seqname[chr] & Entrez$strand=="+"], 1.2, col=2) - rect(Entrez$start[Entrez$seqname==Entrez$seqname[chr] & Entrez$strand=="-"], 0.7, - Entrez$end[Entrez$seqname==Entrez$seqname[chr] & Entrez$strand=="-"], 0.9, col=2) - - rect(Ensembl$start[Ensembl$seqname==Entrez$seqname[chr] & Ensembl$strand=="+"], 0.4, - Ensembl$end[Ensembl$seqname==Entrez$seqname[chr] & Ensembl$strand=="+"], 0.6, col=rgb(0,1,0,0.3)) - rect(Ensembl$start[Ensembl$seqname==Entrez$seqname[chr] & Ensembl$strand=="-"], 0.3, - Ensembl$end[Ensembl$seqname==Entrez$seqname[chr] & Ensembl$strand=="-"], 0.5, col=rgb(0,1,0,0.3)) -} - -plot_single_and_comb <- function(ES, NP, NP_ES, ES_doms, NP_doms, delta_doms_pos, delta_doms_neg, comb="delta", id="default", path="plots", pdf=FALSE) { - chroms <- unique(ES$seqname) - for (i in chroms) { - print(i) - - if (pdf) { - pdf(file=paste(path, "/single_and_", comb, "_", id, "_", i, ".pdf", sep=""), width=15) - } else { - png(file=paste(path, "/single_and_", comb, "_", id, "_", i, ".png", sep=""), width=960) - } - - par(mfrow=c(3,1)) - - ylim <- quantile(ES$score[ES$seqname == i], probs=c(0.01,0.99)) - plot(ES$start[ES$seqname == i], runmed(ES$score[ES$seqname == i], 49), type="h", bty="n", - xlab="genomic location (bp)", ylab="LaminB1 interaction", - ylim=ylim, main="ES cells", col="orange") - if (!is.null(ES_doms) && (nrow(ES_doms[ES_doms$seqname == i,]) > 0)) - rect(ES_doms$start[ES_doms$seqname == i], ylim[1], ES_doms$end[ES_doms$seqname == i], ylim[1]+0.2, col = "grey"); - - ylim <- quantile(NP$score[ES$seqname == i], probs=c(0.01,0.99)) - plot(NP$start[NP$seqname == i], runmed(NP$score[NP$seqname == i], 49), type="h", bty="n", - xlab="genomic location (bp)", ylab="LaminB1 interaction", - ylim=ylim, main="NP cells", col="blue") - if (!is.null(NP_doms) && (nrow(NP_doms[NP_doms$seqname == i,]) > 0)) - rect(NP_doms$start[NP_doms$seqname == i], ylim[1], NP_doms$end[NP_doms$seqname == i], ylim[1]+0.2, col = "grey"); - - ylim <- quantile(NP_ES$score[ES$seqname == i], probs=c(0.01,0.99)) - delta_signal <- runmed(NP_ES$score[NP_ES$seqname == i], 49); - delta_col <- ifelse(delta_signal > 0, "blue", "orange") - plot(NP_ES$start[NP_ES$seqname == i], delta_signal, type="h", bty="n", - xlab="genomic location (bp)", ylab="LaminB1 interaction", - ylim=ylim, main=paste(comb, "Lamin"), col=delta_col) - - if (nrow(delta_doms_pos[delta_doms_pos$seqname == i,]) > 0) { - rect(delta_doms_pos$start[delta_doms_pos$seqname == i], ylim[2], - delta_doms_pos$end[delta_doms_pos$seqname == i], ylim[2]+0.2, col = "grey"); - } - if (nrow(delta_doms_neg[delta_doms_neg$seqname == i,]) > 0) { - rect(delta_doms_neg$start[delta_doms_neg$seqname == i], ylim[1], - delta_doms_neg$end[delta_doms_neg$seqname == i], ylim[1]+0.2, col = "grey"); - } - - dev.off() - } -} - -### OK (OLD) -#subAlign <- function(aln,idx){ -# if (is.logical(idx)) idx <- which(idx); -# -# newIntvlIdx <- rep(NA, nrow(aln$relDom)) -# newIntvlIdx[idx] <- 1:length(idx) -# -# alnIdx <- (aln$aln$relDomIdx %in% rownames(aln$relDom)[idx]) -# aln$aln <- aln$aln[alnIdx,] -# aln$aln$relDomIdx <- newIntvlIdx[aln$aln$relDomIdx]; -# -# aln$relDom <- aln$relDom[idx,] -# -# aln -#} - -subAlign <- function(aln,idx){ - if (is.logical(idx)) idx <- which(idx); - - newIntvlIdx <- rep(NA, nrow(aln$relDom)) - newIntvlIdx[idx] <- 1:length(idx) - - aln$relDom <- aln$relDom[idx,] - aln$aln <- aln$aln[aln$aln$relDomIdx %in% idx,] - aln$aln$relDomIdx <- newIntvlIdx[aln$aln$relDomIdx]; - - aln -} - -plot_profile <- function(GFF, LADs=NULL, chr="chr18", col="black", main="", type="h", - plot_LADs=TRUE, plot_stats=FALSE, plot_ylab=TRUE, k=49, ylim=NULL, sample=FALSE, ...) { - if (is.null(ylim)) ylim <- quantile(GFF$score[GFF$seqname == chr], probs=c(0.01,0.99)) - pos <- GFF$start[GFF$seqname == chr]; - scores <- runmed(GFF$score[GFF$seqname == chr], k); - if (sample) { - sel <- sample(1:length(pos), round(length(pos)/5)) - } else { - sel <- 1:length(pos); - } - if (length(col) > 1) col <- col[sel]; - - plot(pos[sel], scores[sel], type=type, bty="n", - ylab=ifelse(plot_ylab, "LaminB1 interaction (log2)", ""), xaxt="n", ylim=ylim, col=col, ...) - mtext(main, 3, -1, cex=1.5) - if (!is.null(LADs) && (nrow(LADs[LADs$seqname == chr,]) > 0)) { - if (plot_LADs) { - rect(LADs$start[LADs$seqname == chr], ylim[1], LADs$end[LADs$seqname == chr], ylim[1]+0.4, - col = grey(0.5), border=NA); - } - if (plot_stats) { - med <- median(LADs$end-LADs$start+1) - legend("topright", "(x,y)", bty="n", inset=c(0.1,-0.05), - legend=c(paste(nrow(LADs), "LADs"), paste("Median size: ", med %/% 1e3, "kb", sep=""))) - } - } -} - -## Detailed profile -plot_binding_detail <- function(chr, from, to, lwd=1, extend=0.5) { - par(mfrow=c(4,1), mar=c(0,5,0,1)) - i <- chr - - from <- from - extend*(to - from) - to <- to + extend*(to - from) - - ylim <- quantile(ES$score[ES$seqname == i], probs=c(0.05,0.95)) - plot(ES$start[ES$seqname == i], runmed(ES$score[ES$seqname == i], 9), type="h", bty="n", lwd=lwd, lend=2, - ylab="", xaxt="n", ylim=ylim, col="orange", xlim=c(from,to), cex.axis=1.5) - #mtext("Embryonic stem cells", 3, -1) - - ylim <- quantile(NP$score[NP$seqname == i], probs=c(0.05,0.95)) - plot(NP$start[NP$seqname == i], runmed(NP$score[NP$seqname == i], 9), type="h", bty="n", lwd=lwd, lend=2, - ylab="", xaxt="n", ylim=ylim, col="blue", xlim=c(from,to), cex.axis=1.5) - #mtext("Neuro-precursor cells", 3, -1) - - ylim <- quantile(AC$score[AC$seqname == i], probs=c(0.05,0.95)) - plot(AC$start[AC$seqname == i], runmed(AC$score[AC$seqname == i], 9), type="h", bty="n", lwd=lwd, lend=2, - ylab="", xaxt="n", ylim=ylim, col="magenta", xlim=c(from,to), cex.axis=1.5) - #mtext("Astrocyte cells", 3, -1) - - mtext("LaminB1 interaction (log2)", 2, -2, cex=1.5, at=0.62, outer=TRUE) - - par(mar=c(5,5,0,1)) - plot(AC$start[AC$seqname == i], runmed(AC$score[AC$seqname == i], 49), - type="n", bty="n", ylim=c(0.4, 1), xlab="", ylab="", xlim=c(from,to), yaxt="n", cex.axis=1.5, cex.lab=1.5) - mtext(paste("Genomic location (bp, ", chr, ")", sep=""), 1, 3, cex=1.5) - - rect(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.77, Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.78, col=1); - rect(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.57, Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.58, col=1); - - text(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.87, labels=Entrez_full$frame[Entrez_full$seqname == i & Entrez_full$strand == "+"], adj=c(0,0), col=1, cex=1.5) - text(Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.67, labels=Entrez_full$frame[Entrez_full$seqname == i & Entrez_full$strand == "-"], adj=c(1,0), col=1, cex=1.5) - - rect(UTRs_full$start[UTRs_full$seqname == i & UTRs_full$strand == "+"], 0.74, UTRs_full$end[UTRs_full$seqname == i & UTRs_full$strand == "+"], 0.81, col=1); - rect(UTRs_full$start[UTRs_full$seqname == i & UTRs_full$strand == "-"], 0.54, UTRs_full$end[UTRs_full$seqname == i & UTRs_full$strand == "-"], 0.61, col=1); - - rect(CDSs_full$start[CDSs_full$seqname == i & CDSs_full$strand == "+"], 0.7, CDSs_full$end[CDSs_full$seqname == i & CDSs_full$strand == "+"], 0.85, col=1); - rect(CDSs_full$start[CDSs_full$seqname == i & CDSs_full$strand == "-"], 0.5, CDSs_full$end[CDSs_full$seqname == i & CDSs_full$strand == "-"], 0.65, col=1); -} - -## Detailed profile, including H3K36me3 data -plot_binding_detail_plus_K36 <- function(chr, from, to, lwd=1, extend=0.5, ylim_LB1=NULL, ylim_K36=NULL, - axes_LB1=TRUE, axes_K36=TRUE, emph_used_genes=FALSE, cols=c("orange", "blue", "magenta")) { - par(mfrow=c(3,1), oma=c(5,0,0,0)) - i <- chr - - load(paste("../../data/external_data/Mikkelsen_Nat07/H3K36me3_data/mm9/", chr, ".ES.K36.RData", sep="")) - names(K36) <- c("seqname", "start", "end", "score") - #K36$score <- log2(K36$score) - K36 <- K36[K36$score>2,] - ES_K36 <- K36; - - load(paste("../../data/external_data/Mikkelsen_Nat07/H3K36me3_data/mm9/", chr, ".NP.K36.RData", sep="")) - names(K36) <- c("seqname", "start", "end", "score") - #K36$score <- log2(K36$score) - K36 <- K36[K36$score>2,] - NP_K36 <- K36; - - if (is.null(ylim_LB1)) - ylim_LB1 <- quantile(c(ES$score[ES$seqname == i], NP$score[NP$seqname == i], AC$score[AC$seqname == i]), probs=c(0.05,0.95)) - if (is.null(ylim_K36)) - ylim_K36 <- quantile(c(ES_K36$score[ES_K36$seqname == i], NP_K36$score[NP_K36$seqname == i]), probs=c(0.01,0.99)) - - from <- from - extend*(to - from) - to <- to + extend*(to - from) - - if (axes_LB1) { - #yaxt_LB1="s"; ylab_LB1=expression(log[2](LaminB1~interaction)); - yaxt_LB1="s"; ylab_LB1=""; # We want to plot one large label over all 3 profiles... - } else { - yaxt_LB1="n"; ylab_LB1=""; - } - - par(mar=c(5,5,3,5)) - plot(ES$start[ES$seqname == i], runmed(ES$score[ES$seqname == i], 9), type="h", bty="n", lwd=lwd, lend=2, - xlab="", xaxt="n", ylim=ylim_LB1, col=cols[1], xlim=c(from,to), cex.axis=1.5, yaxt=yaxt_LB1, ylab=ylab_LB1, cex.lab=2, cex.axis=2) - #mtext("Embryonic stem cells", 3, -1) - par(mar=c(0,5,15,5), new=TRUE) - plot(0, type="n", bty="n", xlab="", ylab="", xaxt="n", yaxt="n", xlim=c(from, to), ylim=ylim_K36, yaxs="i") - #rect(0, 0, 1e9, 100, col="orange", border=NA) - lines(ES_K36$start[ES_K36$seqname == i], ES_K36$score[ES_K36$seqname == i], type="h", bty="n", lend=2, col="black") - lines(c(ylim_K36[1], 1e9), rep(ylim_K36[1], 2), type="l", col=cols[1], lwd=3); - if (axes_K36) { - axis(4, cex.axis=2) - mtext("H3K36me3", 4, line=3, cex=1.5) - } - - par(mar=c(5,5,3,5)) - plot(NP$start[NP$seqname == i], runmed(NP$score[NP$seqname == i], 9), type="h", bty="n", lwd=lwd, lend=2, - xlab="", xaxt="n", ylim=ylim_LB1, col=cols[2], xlim=c(from,to), cex.axis=1.5, yaxt=yaxt_LB1, ylab=ylab_LB1, cex.lab=2, cex.axis=2) - #mtext("Neuro-precursor cells", 3, -1) - par(mar=c(0,5,15,5), new=TRUE) - plot(0, type="n", bty="n", xlab="", ylab="", xaxt="n", yaxt="n", xlim=c(from, to), ylim=ylim_K36, yaxs="i") - #rect(0, 0, 1e9, 100, col="blue", border=NA) - lines(NP_K36$start[NP_K36$seqname == i], NP_K36$score[NP_K36$seqname == i], type="h", bty="n", lend=2, col="black") - lines(c(ylim_K36[1], 1e9), rep(ylim_K36[1], 2), type="l", col=cols[2], lwd=3); - if (axes_K36) { - axis(4, cex.axis=2) - mtext("H3K36me3", 4, line=3, cex=1.5) - } - - par(mar=c(5,5,3,5)) - plot(AC$start[AC$seqname == i], runmed(AC$score[AC$seqname == i], 9), type="h", bty="n", lwd=lwd, lend=2, - xlab="", xaxt="n", ylim=ylim_LB1, col=cols[3], xlim=c(from,to), cex.axis=1.5, yaxt=yaxt_LB1, ylab=ylab_LB1, cex.lab=2, cex.axis=2) - #mtext("Astrocyte cells", 3, -1) - - #yaxt_LB1="s"; ylab_LB1=expression(log[2](LaminB1~interaction)); - - mtext(expression(log[2](LaminB1~interaction)), 2, -2, cex=1.5, at=0.55, outer=TRUE) - #mtext("H3K36me3 (ChIP-seq data)", 4, -2, cex=1.5, at=0.53, outer=TRUE) - - #par(mar=c(5,5,0,5)) - par(new=TRUE) - par(mar=c(0,5,3,5)) - options("scipen"=5) - plot(AC$start[AC$seqname == i], runmed(AC$score[AC$seqname == i], 49), - type="n", bty="n", ylim=c(0.4, 2.1), xlab="", ylab="", xlim=c(from,to), yaxt="n", cex.axis=2) - options("scipen"=0) - mtext(paste("chromosomal location (bp, ", chr, ")", sep=""), 1, 3.5, cex=1.5) - - rect(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.77, Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.78, border=NA, col=1); - rect(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.57, Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.58, border=NA, col=1); - - text(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.88, labels=Entrez_full$frame[Entrez_full$seqname == i & Entrez_full$strand == "+"], adj=c(0,0), col=1, cex=2) - text(Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.68, labels=Entrez_full$frame[Entrez_full$seqname == i & Entrez_full$strand == "-"], adj=c(1,0), col=1, cex=2) - - rect(UTRs_full$start[UTRs_full$seqname == i & UTRs_full$strand == "+"], 0.74, UTRs_full$end[UTRs_full$seqname == i & UTRs_full$strand == "+"], 0.81, border=NA, col=1); - rect(UTRs_full$start[UTRs_full$seqname == i & UTRs_full$strand == "-"], 0.54, UTRs_full$end[UTRs_full$seqname == i & UTRs_full$strand == "-"], 0.61, border=NA, col=1); - - rect(CDSs_full$start[CDSs_full$seqname == i & CDSs_full$strand == "+"], 0.7, CDSs_full$end[CDSs_full$seqname == i & CDSs_full$strand == "+"], 0.85, border=NA, col=1); - rect(CDSs_full$start[CDSs_full$seqname == i & CDSs_full$strand == "-"], 0.5, CDSs_full$end[CDSs_full$seqname == i & CDSs_full$strand == "-"], 0.65, border=NA, col=1); - - # Again, only genes used in analysis, in red. - if (emph_used_genes) { - rect(Entrez$start[Entrez$seqname == i & Entrez$strand == "+"], 0.77, Entrez$end[Entrez$seqname == i & Entrez$strand == "+"], 0.78, border=NA, col=2); - rect(Entrez$start[Entrez$seqname == i & Entrez$strand == "-"], 0.57, Entrez$end[Entrez$seqname == i & Entrez$strand == "-"], 0.58, border=NA, col=2); - - text(Entrez$start[Entrez$seqname == i & Entrez$strand == "+"], 0.88, labels=Entrez$frame[Entrez$seqname == i & Entrez$strand == "+"], adj=c(0,0), col=2, cex=2) - text(Entrez$end[Entrez$seqname == i & Entrez$strand == "-"], 0.68, labels=Entrez$frame[Entrez$seqname == i & Entrez$strand == "-"], adj=c(1,0), col=2, cex=2) - - rect(UTRs$start[UTRs$seqname == i & UTRs$strand == "+"], 0.74, UTRs$end[UTRs$seqname == i & UTRs$strand == "+"], 0.81, border=NA, col=2); - rect(UTRs$start[UTRs$seqname == i & UTRs$strand == "-"], 0.54, UTRs$end[UTRs$seqname == i & UTRs$strand == "-"], 0.61, border=NA, col=2); - - rect(CDSs$start[CDSs$seqname == i & CDSs$strand == "+"], 0.7, CDSs$end[CDSs$seqname == i & CDSs$strand == "+"], 0.85, border=NA, col=2); - rect(CDSs$start[CDSs$seqname == i & CDSs$strand == "-"], 0.5, CDSs$end[CDSs$seqname == i & CDSs$strand == "-"], 0.65, border=NA, col=2); - } - - #sel_genes <- Entrez_full$frame[Entrez_full$seqname == i & Entrez_full$start > from & Entrez_full$end < to] - sel_genes <- Entrez$frame[Entrez$seqname == i & Entrez$start > from & Entrez$end < to] - invisible(sel_genes) -} - -## Detailed profile, including H3K36me3 data -plot_binding_detail_plus_K36_NP_AC <- function(signal1=NP, signal2=AC, chr, from, to, lwd=1, extend=0.5, ylim_LB1=NULL, ylim_K36=NULL, - axes_LB1=TRUE, axes_K36=TRUE, emph_used_genes=FALSE, cols=c("orange", "blue", "magenta"), - ylab_text=expression(log[2](LaminB1~interaction))) { - par(mfrow=c(2,1), oma=c(5,0,0,0)) - i <- chr - - load(paste("../../data/external_data/Mikkelsen_Nat07/H3K36me3_data/mm9/", chr, ".NP.K36.RData", sep="")) - names(K36) <- c("seqname", "start", "end", "score") - #K36$score <- log2(K36$score) - K36 <- K36[K36$score>2,] - signal1_K36 <- K36; - - if (is.null(ylim_LB1)) - ylim_LB1 <- quantile(c(signal1$score[signal1$seqname == i], signal2$score[signal2$seqname == i]), probs=c(0.05,0.95)) - if (is.null(ylim_K36)) - ylim_K36 <- quantile(c(signal1_K36$score[signal1_K36$seqname == i]), probs=c(0.01,0.99)) - - from <- from - extend*(to - from) - to <- to + extend*(to - from) - - if (axes_LB1) { - #yaxt_LB1="s"; ylab_LB1=expression(log[2](LaminB1~interaction)); - yaxt_LB1="s"; ylab_LB1=""; # We want to plot one large label over all 3 profiles... - } else { - yaxt_LB1="n"; ylab_LB1=""; - } - - par(mar=c(5,5,3,5)) - plot(signal1$start[signal1$seqname == i], runmed(signal1$score[signal1$seqname == i], 9), type="h", bty="n", lwd=lwd, lend=2, - xlab="", xaxt="n", ylim=ylim_LB1, col=cols[2], xlim=c(from,to), cex.axis=1.5, yaxt=yaxt_LB1, ylab=ylab_LB1, cex.lab=2, cex.axis=2) - #mtext("Neuro-precursor cells", 3, -1) - par(mar=c(0,5,15,5), new=TRUE) - plot(0, type="n", bty="n", xlab="", ylab="", xaxt="n", yaxt="n", xlim=c(from, to), ylim=ylim_K36, yaxs="i") - #rect(0, 0, 1e9, 100, col="blue", border=NA) - lines(signal1_K36$start[signal1_K36$seqname == i], signal1_K36$score[signal1_K36$seqname == i], type="h", bty="n", lend=2, col="black") - lines(c(ylim_K36[1], 1e9), rep(ylim_K36[1], 2), type="l", col=cols[2], lwd=3); - if (axes_K36) { - axis(4, cex.axis=2) - mtext("H3K36me3", 4, line=3, cex=1.5) - } - - par(mar=c(5,5,3,5)) - plot(signal2$start[signal2$seqname == i], runmed(signal2$score[signal2$seqname == i], 9), type="h", bty="n", lwd=lwd, lend=2, - xlab="", xaxt="n", ylim=ylim_LB1, col=cols[3], xlim=c(from,to), cex.axis=1.5, yaxt=yaxt_LB1, ylab=ylab_LB1, cex.lab=2, cex.axis=2) - #mtext("Astrocytes", 3, -1) - - #yaxt_LB1="s"; ylab_LB1=expression(log[2](LaminB1~interaction)); - - #mtext(expression(log[2](LaminB1~interaction)), 2, -2, cex=1.5, at=0.55, outer=TRUE) - mtext(ylab_text, 2, -2, cex=1.5, at=0.55, outer=TRUE) - #mtext("H3K36me3 (ChIP-seq data)", 4, -2, cex=1.5, at=0.53, outer=TRUE) - - #par(mar=c(5,5,0,5)) - par(new=TRUE) - par(mar=c(0,5,3,5)) - options("scipen"=5) - plot(signal2$start[signal2$seqname == i], runmed(signal2$score[signal2$seqname == i], 49), - type="n", bty="n", ylim=c(0.4, 2.1), xlab="", ylab="", xlim=c(from,to), yaxt="n", cex.axis=2) - options("scipen"=0) - mtext(paste("chromosomal location (bp, ", chr, ")", sep=""), 1, 3.5, cex=1.5) - - rect(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.77, Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.78, border=NA, col=1); - rect(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.57, Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.58, border=NA, col=1); - - text(Entrez_full$start[Entrez_full$seqname == i & Entrez_full$strand == "+"], 0.88, labels=Entrez_full$frame[Entrez_full$seqname == i & Entrez_full$strand == "+"], adj=c(0,0), col=1, cex=2) - text(Entrez_full$end[Entrez_full$seqname == i & Entrez_full$strand == "-"], 0.68, labels=Entrez_full$frame[Entrez_full$seqname == i & Entrez_full$strand == "-"], adj=c(1,0), col=1, cex=2) - - rect(UTRs_full$start[UTRs_full$seqname == i & UTRs_full$strand == "+"], 0.74, UTRs_full$end[UTRs_full$seqname == i & UTRs_full$strand == "+"], 0.81, border=NA, col=1); - rect(UTRs_full$start[UTRs_full$seqname == i & UTRs_full$strand == "-"], 0.54, UTRs_full$end[UTRs_full$seqname == i & UTRs_full$strand == "-"], 0.61, border=NA, col=1); - - rect(CDSs_full$start[CDSs_full$seqname == i & CDSs_full$strand == "+"], 0.7, CDSs_full$end[CDSs_full$seqname == i & CDSs_full$strand == "+"], 0.85, border=NA, col=1); - rect(CDSs_full$start[CDSs_full$seqname == i & CDSs_full$strand == "-"], 0.5, CDSs_full$end[CDSs_full$seqname == i & CDSs_full$strand == "-"], 0.65, border=NA, col=1); - - # Again, only genes used in analysis, in red. - if (emph_used_genes) { - rect(Entrez$start[Entrez$seqname == i & Entrez$strand == "+"], 0.77, Entrez$end[Entrez$seqname == i & Entrez$strand == "+"], 0.78, border=NA, col=2); - rect(Entrez$start[Entrez$seqname == i & Entrez$strand == "-"], 0.57, Entrez$end[Entrez$seqname == i & Entrez$strand == "-"], 0.58, border=NA, col=2); - - text(Entrez$start[Entrez$seqname == i & Entrez$strand == "+"], 0.88, labels=Entrez$frame[Entrez$seqname == i & Entrez$strand == "+"], adj=c(0,0), col=2, cex=2) - text(Entrez$end[Entrez$seqname == i & Entrez$strand == "-"], 0.68, labels=Entrez$frame[Entrez$seqname == i & Entrez$strand == "-"], adj=c(1,0), col=2, cex=2) - - rect(UTRs$start[UTRs$seqname == i & UTRs$strand == "+"], 0.74, UTRs$end[UTRs$seqname == i & UTRs$strand == "+"], 0.81, border=NA, col=2); - rect(UTRs$start[UTRs$seqname == i & UTRs$strand == "-"], 0.54, UTRs$end[UTRs$seqname == i & UTRs$strand == "-"], 0.61, border=NA, col=2); - - rect(CDSs$start[CDSs$seqname == i & CDSs$strand == "+"], 0.7, CDSs$end[CDSs$seqname == i & CDSs$strand == "+"], 0.85, border=NA, col=2); - rect(CDSs$start[CDSs$seqname == i & CDSs$strand == "-"], 0.5, CDSs$end[CDSs$seqname == i & CDSs$strand == "-"], 0.65, border=NA, col=2); - } - - #sel_genes <- Entrez_full$frame[Entrez_full$seqname == i & Entrez_full$start > from & Entrez_full$end < to] - sel_genes <- Entrez$frame[Entrez$seqname == i & Entrez$start > from & Entrez$end < to] - invisible(sel_genes) -} - -# Used to shift doms half a inter-dom size up, i.e., rotate them circularly over the genome. -shiftDoms <- function(doms, seq_dims) { - shift_doms <- NULL; - for (chr in unique(doms$seqname)) { - end <- seq_dims$end[seq_dims$seqname==chr] - dom_chr <- doms[doms$seqname == chr,] - n <- nrow(dom_chr) - - ctr <- floor((dom_chr$start + dom_chr$end)/2) - ictr <- floor((dom_chr$end[1:(n-1)] + dom_chr$start[2:n])/2) - ictr <- c(ictr, floor((dom_chr$end[n] + end)/2)); - - dom_chr$start <- ctr; - dom_chr$end <- ictr; - - shift_doms <- rbind(shift_doms, dom_chr); - } - invisible(shift_doms) -} - -combineAlign <- function(aln1, aln2) { - aln2$aln$relDomIdx <- aln2$aln$relDomIdx + nrow(aln1$relDom) - aln <- list(aln = rbind(aln1$aln, aln2$aln), - relDom = rbind(aln1$relDom, aln2$relDom)); - invisible(aln); -} - -func.mem.use <- function() { - sum_size <- 0; - for (i in ls(envir=parent.frame())) { - sum_size <- sum_size + object.size(eval(parse(text=i), envir=parent.frame())); - } - - print(sum_size, units="auto") - invisible(sum_size); -} - -mem.use <- function() { - sort(sapply(ls(envir=.GlobalEnv), function(x) object.size(eval(parse(text=x))))) -} - -# Convert genomic locations back to probe indices -get_probe_coords <- function(GFF, template=NP_ES) { - probes <- CGintvlOverlap(GFF, template, type="center", checkData=FALSE) - ranges <- lapply(probes, function(x) lapply(x, range, na.rm=T)) - probe_coords <- NULL; - for (i in names(ranges)) { - probe_coords <- rbind(probe_coords, data.frame(start=unlist(lapply(ranges[[i]], "[[", 1)), - end=unlist(lapply(ranges[[i]], "[[", 2)))) - } - - probe_coords[probe_coords == Inf | probe_coords == -Inf] <- NA; - - invisible(probe_coords) -} - -# Used to extend GFF based on number of array probes. Used in synteny analysis of intergenic regions. -ext_probe_coords <- function(probe_coords, template=NP_ES, upstr=25, downstr=25) { - chrom_mins <- tapply(1:nrow(template), template$seqname, min) - chrom_maxs <- tapply(1:nrow(template), template$seqname, max) - - probe_coords$start <- pmax(probe_coords$start - upstr, - chrom_mins[template$seqname[probe_coords$start]], na.rm=T) - probe_coords$end <- pmin(probe_coords$end + downstr, - chrom_maxs[template$seqname[probe_coords$end]], na.rm=T) - - invisible(probe_coords) -} - -# Inverse of get_probe_coords() -get_chr_coords <- function(probe_coords, template=NP_ES) { - res <- data.frame(seqname=template$seqname[probe_coords[,1]], - start=template$start[probe_coords[,1]], - end=template$end[probe_coords[,2]], - stringsAsFactors=FALSE) - - hits <- which(is.na(res$start)) - chrom_mins <- tapply(template$start, template$seqname, min) -# res$start[hits] <- chrom_mins[res$seqname[hits]] - - hits <- which(is.na(res$end)) - chrom_maxs <- tapply(template$end, template$seqname, max) -# res$end[hits] <- chrom_maxs[res$seqname[hits]] - - invisible(res) -} - -get_dom_border_feats <- function(doms, feats, winsize=10000) { - bndr <- CGintvlBndr(doms) - bndrExt <- CGintvlExtend(bndr, upstr=winsize, downstr=0, chromStartEnd=chromSE_MM_NCBIm37) - - feats$center <- floor((feats$start + feats$end)/2) - feats$start <- feats$end <- feats$center; - - # Return doms - bndrfeat <- unlist(CGintvlOverlap(feats, bndrExt, type="minimal")) - - # Create 0/1 vector, also taking care of duplicates - res <- rep(FALSE, nrow(bndr)) - res[bndrfeat] <- TRUE; - - invisible(res) -} - -get_dom_border_feats_outward <- function(doms, feats, winsize=10000) { - bndr <- CGintvlBndr(doms) - bndrExt <- CGintvlExtend(bndr, upstr=winsize, downstr=0, chromStartEnd=chromSE_MM_NCBIm37) - - feats5 <- feats[feats$strand=="-",]; feats5$start <- feats5$end # Select appropriate TSSs - feats3 <- feats[feats$strand=="+",]; feats3$end <- feats3$start # Select appropriate TSSs - - bndr5Ext <- bndrExt[seq(1,nrow(bndrExt), 2),] - bndr3Ext <- bndrExt[seq(2,nrow(bndrExt), 2),] - - # Return doms - bndr5feat <- unlist(CGintvlOverlap(feats5, bndr5Ext, type="minimal")) - bndr3feat <- unlist(CGintvlOverlap(feats3, bndr3Ext, type="minimal")) - - # Create 0/1 vector, also taking care of duplicates - res <- rep(FALSE, nrow(bndrExt)) - res[seq(1, nrow(bndrExt), 2)][bndr5feat] <- TRUE - res[seq(2, nrow(bndrExt), 2)][bndr3feat] <- TRUE - - invisible(res) -} - -# OK -get_bindata <- function(aln, bins=seq(-25*1e3, 25*1e3, length.out=11)) { - aln$aln$center <- (aln$aln$start + aln$aln$end)/2 - - # Retain only features of which the center lies within a bin. - aln$aln <- aln$aln[aln$aln$center < max(bins) & aln$aln$center > min(bins),] - - # Do actual binning, making sure the bins extend to the extremes - binning <- cut(c(range(bins), aln$aln$center), breaks=bins) - binning <- binning[3:length(binning)] # Remove extremes - aln$aln$centerbin <- binning # Assign features to bins - - # For each domain, obtain features and calculate median value per bin. - print("Building binned dataset...") - binned <- tapply(1:nrow(aln$aln), aln$aln$relDomIdx, function(x) { - perc <- round((max(x) / nrow(aln$aln) * 100), 0) - cat(paste("\r", perc, "%", sep="")) - tapply(aln$aln$score[x], aln$aln$centerbin[x], median, na.rm=T) - }) - cat("\n"); - - # Convert result into bin (this could probably be done more elegantly) - binmat <- matrix(NA, nrow=length(binned), ncol=length(bins)-1) - i <- 1; - feh <- lapply(binned, function(x) { - binmat[i,] <<- x; - i <<- i+1 - }) - rownames(binmat) <- names(binned) - colnames(binmat) <- levels(binning); - - invisible(list(binmat=binmat, limits=range(bins))) -} - -get_probedata <- function(doms_probes, signal, numprobes=10, direction="both") { - binmat_probes_prime5 <- t(apply(doms_probes, 1, function(x) x[1] + c(-numprobes:numprobes))) - binmat_probes_prime3 <- t(apply(doms_probes, 1, function(x) x[2] - c(-numprobes:numprobes))) - - binmat_prime5 <- t(apply(binmat_probes_prime5, 1, function(x) signal$score[x])) - binmat_prime3 <- t(apply(binmat_probes_prime3, 1, function(x) signal$score[x])) - - if (direction == "both") { - n <- nrow(doms_probes)*2 - binmat <- matrix(NA, ncol=numprobes*2+1, nrow=n) - binmat[seq(1, n/2)*2-1,] <- binmat_prime5; - binmat[seq(1, n/2)*2,] <- binmat_prime3; - } else { - n <- nrow(doms_probes) - if (direction == "prime5") { - binmat <- binmat_prime5; - } else { - binmat <- binmat_prime3; - } - } - - rownames(binmat) <- 1:n - colnames(binmat) <- c(-numprobes:numprobes) - - invisible(list(binmat=binmat, limits=c(-numprobes,numprobes)*1200)) -} - -doFisherGeneric <- function(set1, set2) { - A <- sum(set1 & set2, na.rm=T) - B <- sum(set1 & !set2, na.rm=T) - C <- sum(!set1 & set2, na.rm=T) - D <- sum(!set1 & !set2, na.rm=T) - - tab <- matrix(c(A,C,B,D), ncol=2, dimnames=list(c("in set1","not in set1"), c("in set2","not in set2"))) - print(tab) - perc_set1 <- A/(A+B)*100; - perc_set2 <- C/(C+D)*100; - print(paste("Set 1:", perc_set1)) - print(paste("Set 2:", perc_set2)) - - res_twosided <- fisher.test(tab, alt="two.sided")$p.value - print(res_twosided) - - print(paste("Enrichment ratio:", perc_set1/perc_set2)) - res_enrich <- fisher.test(tab, alt="greater")$p.value - print(res_enrich) - - print(paste("Depletion ratio:", perc_set2/perc_set1)) - res_deplet <- fisher.test(tab, alt="less")$p.value - print(res_deplet) - - invisible(list(perc_set1=perc_set1, perc_set2=perc_set2, ratio=perc_set1/perc_set2, - p=res_twosided, p_enrich=res_enrich, p_deplet=res_deplet)) -} - -doHyperGeneric <- function(set1, set2) { - q <- length(which(set1 & set2)) - m <- length(which(set2)) - n <- length(which(!set2)) - k <- length(which(set1)) - - print(paste("Enrichment ratio:", (q / k) / ((m-q) / (m+n-k)))) - res_enrich <- phyper(q-1, m, n, k, lower.tail=FALSE) - print(res_enrich) - - print(paste("Depletion ratio:", ((m-q) / (m+n-k)) / (q / k))) - res_deplet <- phyper(q, m, n, k, lower.tail=TRUE) - print(res_deplet) - - invisible(list(enrich=res_enrich, deplet=res_deplet)) -} - -likelihood.gauss <- function (parameters, sample) { - sample <- sample[!is.na(sample) & is.finite(sample)]; - if (length(parameters) != 5) - stop ("parameters must be vector of length 5") - prop = parameters[1] - if (prop < 0) prop = 0 - if (prop > 1) prop = 1 - m1 = parameters[2] - s1 = parameters[3] - m2 = parameters[4] - s2 = parameters[5] - - #print(parameters) - return (-sum(log( prop*dnorm(sample, m1, s1) + (1-prop)*dnorm(sample, m2, s2) ))) -} - -# Not in use anymore -find_center_optim <- function(data, plot=FALSE) { - dens <- density(data, na.rm=T) - slopeSign <- diff(dens$y) > 0 - slopeSignChange <- diff(slopeSign) < 0 - h <- which(slopeSignChange == TRUE) + 1 - print(paste(length(h), "modi found.")) - - ord <- order(dens$y[h], decreasing=TRUE) - sel <- which(ord %in% c(1,2)) - center <- dens$x[mean(h[sel])] - - out <- optim(c(0.5, dens$x[h[sel[1]]], 1, dens$x[h[sel[2]]], 1), likelihood.gauss, sample = data) - - if (plot) { - plot(dens) - abline(v=dens$x[h[sel]], col=2) - abline(v=center, col=3) - curve(dnorm(x, mean=out$par[2], sd=out$par[3]), col=4, add=T) - curve(dnorm(x, mean=out$par[4], sd=out$par[5]), col=4, add=T) - } - - invisible(out); -} - -# Not in use anymore -find_center_cent <- function(data, plot=FALSE) { - dens <- density(data, na.rm=T) - slopeSign <- diff(dens$y) > 0 - slopeSignChange <- diff(slopeSign) < 0 - h <- which(slopeSignChange == TRUE) + 1 - print(paste(length(h), "modi found.")) - - ord <- order(dens$y[h], decreasing=TRUE) - sel <- which(ord %in% c(1,2)) - center <- dens$x[mean(h[sel])] - - if (plot) { - plot(dens) - abline(v=dens$x[h[sel]], col=2) - abline(v=center, col=3) - } - - invisible(center); -} - -# Not in use anymore -find_center_dip_max <- function(data, plot=FALSE) { - dens <- density(data, na.rm=T) - slopeSign <- diff(-dens$y) > 0 - slopeSignChange <- diff(slopeSign) < 0 - h <- which(slopeSignChange == TRUE) + 1 - print(paste(length(h), "dips found.")) - dip <- h[which.max(dens$y[h])] - #dip <- h[1]; # Use the first dip found. - - if (plot) { - plot(dens) - abline(v=dens$x[dip], col=2) - } - - invisible(dens$x[dip]); -} - -# Currently used method -find_center_dip <- function(data, plot=FALSE) { - dens <- density(data, na.rm=T) - - slopeSign <- diff(dens$y) > 0 - slopeSignChange <- diff(slopeSign) < 0 - modi <- which(slopeSignChange == TRUE) + 1 - print(paste(length(modi), "modi found.")) - - ord <- order(dens$y[modi], decreasing=TRUE) - sel <- sort(ord[1:2]) - - slopeSign <- diff(-dens$y) > 0 - slopeSignChange <- diff(slopeSign) < 0 - dips <- which(slopeSignChange == TRUE) + 1 - print(paste(length(dips), "dips found.")) - - # Select minimal dip inbetween two maximum modi - dip_sel <- which(dips > modi[sel[1]] & dips < modi[sel[2]]); - dip <- dips[dip_sel[which.min(dens$y[dips[dip_sel]])]] - - if (plot) { - plot(dens) - abline(v=dens$x[dip], col=2) - } - - invisible(dens$x[dip]); -} - -# Not in use anymore -find_center_clust <- function(data, plot=FALSE) { - data <- data[!is.na(data) & is.finite(data)] - clust <- kmeans(data, centers=2) - rng <- sort(unlist(tapply(data, clust$cluster, range))); - center <- mean(rng[2:3]) - dens <- density(data, na.rm=T) - - if (plot) { - plot(dens) - abline(v=clust$centers, col=2) - abline(v=center, col=3) - } - - invisible(center); -} - -cond.norm <- function (x, conditions = NULL, invariantset = NULL, dat.log.scale = TRUE, method = c("quantile", "global")){ - method <- method[1] - method.available = c("quantile", "global") - method.use <- method.available[charmatch(method, method.available)] - if (is.na(method.use)) { - stop(paste("This method, '", as.character(method), "', is not (yet) implemented. Please use 'quantile' or 'global'.", - sep = "")) - } - if (method.use == "quantile") { - if (is.null(invariantset)) { - x.dist <- apply(apply(x, 1, sort), 1, median) - x.norm <- t(apply(x, 1, function(xx, y) y[rank(xx)], - y = x.dist)) - } else { - n.col <- ncol(x) - x.rank <- t(apply(x, 1, rank)) - if (length(invariantset) == 1) { - invset <- order(apply(x.rank, 2, mad, constant = 1))[1:invariantset] - } - else { - invset <- invariantset - } - mins <- where.is.min(x.rank) - maxs <- where.is.max(x.rank) - if (is.logical(invset)) { - invset[c(mins, maxs)] <- TRUE - } - else { - invset <- unique(c(invset, mins, maxs)) - invset <- sort(invset) - } - inv.x <- x[, invset] - inv.x.dist <- apply(apply(inv.x, 1, sort), 1, median) - x.norm <- t(apply(x, 1, function(y, invset, inv.x.dist) approx(sort(rank(y)[invset]), - inv.x.dist, rank(y))$y, invset = invset, inv.x.dist = inv.x.dist)) - - if (!is.null(conditions)) { - if (nrow(x) != length(conditions)) { - stop("The length of the vector 'conditions' should correspond to the number of rows of the data. NOTE that the genes are in the columns and the conditions in the rows.") - } - cond.lev <- unique(conditions) - for (i in cond.lev) { - which.lev <- (conditions == i) - if (sum(which.lev) > 1) { - x.lev <- x.norm[which.lev, ] - x.lev.dist <- apply(apply(x.lev, 1, sort), - 1, median) - x.lev.norm <- t(apply(x.lev, 1, function(xx, - y) y[rank(xx)], y = x.lev.dist)) - x.norm[which.lev, ] <- x.lev.norm - } - } - } - } - } - else { - if (method.use == "global") { - x.mdn <- median(as.vector(x)) - x.mad <- median(apply(x, 1, mad)) - x.norm <- t(apply(x, 1, function(y) (y - median(y))/mad(y))) - x.norm <- x.mdn + x.norm * x.mad - } - else { - warning("Check that the function is correct! The method specified could not be found.") - } - } - x.norm -} - -### MUCH simpler versions of the cond.norm() function above. -### Also, this separates the empirical distribution from the actual normalisation, which can be convenient. -get_quant_norm_dist <- function (x) { - dist <- apply(apply(x, 1, sort), 1, median) - invisible(dist) -} - -do_quant_norm <- function (x, dist=NULL) { - if (is.null(dist)) dist <- get_quant_norm_dist(x); - - x.norm <- t(apply(x, 1, function(xx, y) y[rank(xx)], y = dist)) - invisible(x.norm) -} - - -plotfile <- function(filename="Rplot", type="png", width=7, height=7, device="bitmap", warn=TRUE) { - format <- switch(type, - pdf = "pdfwrite", - png = "png16m", - type); - - if (device == "bitmap") { - bitmap(file=paste(filename, "_", id(), ".", type, sep=""), - type=format, res=144, taa=ifelse(type=="png16m", 4, NA), width=width, height=height) - } else { - eval(parse(text=device))(file=paste(filename, "_", id(), ".", type, sep=""), - res=144, width=width, height=height) - } - - if (warn) print("Do not forget to use 'dev.off()' afterwards!") -} - - -get_delta_stats_orig_single <- function(delta, NS_mean, NS_sd, probes, p.adjust.method="BH", signif=0.05) { - ## Obtain significantly dissociated genes or intergenic regions - lengths <- (probes$end-probes$start+1) - means <- apply(probes, 1, function(x) { - if (is.na(x[1])) NA else - mean(delta[x[1]:x[2]], na.rm=T) - }) - p_as <- pnorm(means, mean=NS_mean, sd=NS_sd/sqrt(2*lengths), lower.tail=FALSE) - p_dis <- pnorm(means, mean=NS_mean, sd=NS_sd/sqrt(2*lengths), lower.tail=TRUE) - hits_as <- (p.adjust(p_as, method=p.adjust.method) < signif) - hits_dis <- (p.adjust(p_dis, method=p.adjust.method) < signif) - - list(p_as=p_as, p_dis=p_dis, hits_as=hits_as, hits_dis=hits_dis) -} - -get_delta_stats_orig <- function(delta, NS_mean, NS_sd, probes, probes_IG, - p.adjust.method="BH", signif=0.05) { - genes_res <- get_delta_stats_orig_single(delta, NS_mean, NS_sd, probes, p.adjust.method, signif) - IG_res <- get_delta_stats_orig_single(delta, NS_mean, NS_sd, probes_IG, p.adjust.method, signif) - - list(p_as=genes_res$p_as, p_dis=genes_res$p_dis, hits_as=genes_res$hits_as, hits_dis=genes_res$hits_dis, - IG_p_as=IG_res$p_as, IG_p_dis=IG_res$p_dis, IG_hits_as=IG_res$hits_as, IG_hits_dis=IG_res$hits_dis) -} - -## Obtain significantly dissociated/associated genes/IGs -get_delta_stats_single <- function(delta, NS_mean, NS_sd, probes, - p.adjust.method="BH", signif=0.05, acfs=NULL) { - lengths <- (probes$end-probes$start+1) - means <- apply(probes, 1, function(x) { - if (is.na(x[1])) NA else mean(delta[x[1]:x[2]], na.rm=T) - }) - - if (is.null(acfs)) - acfs <- acf(delta, lag.max=max(lengths, na.rm=T), plot=FALSE) - - NS_var <- NS_sd^2 - - covs <- acfs$acf * NS_var # Scale correlation by SD based on null distribution - #covs <- acfs$acf * var(delta) # Scale correlation by SD based on delta distribution - - kcovs <- c(0, sapply(2:max(lengths, na.rm=T), function(k) sum(covs[2:k] * (k-1):1))) - sumvars <- ((NS_var)*lengths) + (2 * kcovs[lengths]) - #sumvars <- (var(delta)*lengths) + (2 * kcovs[lengths]) - - z <- ((means*lengths) - (NS_mean*lengths)) / sqrt(sumvars) - - p_as <- pnorm(means*lengths, mean=NS_mean*lengths, sd=sqrt(sumvars), lower.tail=FALSE) - hits_as <- (p.adjust(p_as, method=p.adjust.method) < signif) - - p_dis <- pnorm(means*lengths, mean=NS_mean*lengths, sd=sqrt(sumvars), lower.tail=TRUE) - hits_dis <- (p.adjust(p_dis, method=p.adjust.method) < signif) - - # For a two-tailed test do both the lower-tailed and the upper-tailed test and - # double the P-value of the smaller of the two results. - p <- pmin(p_as, p_dis)*2 - hits <- (p.adjust(p, method=p.adjust.method) < signif) - - list(z=z, p=p, p_as=p_as, p_dis=p_dis, hits=hits, hits_as=hits_as, hits_dis=hits_dis) -} - - -## Obtain significantly dissociated/associated genes/IGs -get_delta_stats_single_mean <- function(delta, NS_mean, NS_sd, probes, - p.adjust.method="BH", signif=0.05, acfs=NULL) { - lengths <- (probes$end-probes$start+1) - means <- apply(probes, 1, function(x) { - if (is.na(x[1])) NA else mean(delta[x[1]:x[2]], na.rm=T) - }) - - if (is.null(acfs)) - acfs <- acf(delta, lag.max=max(lengths, na.rm=T), plot=FALSE) - - NS_var <- NS_sd^2 - - covs <- acfs$acf * NS_var # Scale correlation by SD based on null distribution - #covs <- acfs$acf * var(delta) # Scale correlation by SD based on delta distribution -} - -# -#### 1 -# kcovs <- NS_var * 2 * c(0, sapply(2:max(lengths, na.rm=T), function(k) sum(covs[2:k] * (k-1):1))) -# sumvars <- (lengths*NS_var) + kcovs[lengths] -# p_as <- pnorm(means*lengths, mean=NS_mean*lengths, sd=sqrt(sumvars), lower.tail=FALSE) -# p_as[1:10] -# -# -#### 2 -# kcovs <- NS_var * 2 * c(0, sapply(2:max(lengths, na.rm=T), function(k) sum(covs[2:k] * (k-1):1))) -# sumvars <- (NS_var/lengths) + (kcovs[lengths]/(lengths^2)) -# p_as <- pnorm(means, mean=NS_mean, sd=sqrt(sumvars), lower.tail=FALSE) -# p_as[1:10] -# -# -# -#### OK -# kcovs <- c(0, sapply(2:max(lengths, na.rm=T), function(k) sum(covs[2:k] * (k-1):1))) -# sumvars <- (NS_var*(lengths+2*kcovs[lengths])) -# p_as <- pnorm(means*lengths, mean=NS_mean*lengths, sd=sqrt(sumvars), lower.tail=FALSE) -# p_as[1:10] -# -#### OK -# kcovs <- c(0, sapply(2:max(lengths, na.rm=T), function(k) sum(covs[2:k] * (k-1):1))) -# sumvars <- (NS_var/lengths) + (NS_var*2*kcovs[lengths])/(lengths^2) -# p_as <- pnorm(means, mean=NS_mean, sd=sqrt(sumvars), lower.tail=FALSE) -# p_as[1:10] -# -#### OK -# kcovs <- c(0, sapply(2:max(lengths, na.rm=T), function(k) sum(covs[2:k] * (k-1):1))) -# sumvars <- (NS_var + (NS_var*2*kcovs[lengths])/lengths) / lengths -# p_as <- pnorm(means, mean=NS_mean, sd=sqrt(sumvars), lower.tail=FALSE) -# p_as[1:10] -# -#### OK -# kcovs <- c(0, sapply(2:max(lengths, na.rm=T), function(k) sum(covs[2:k] * (k-1):1))) -# sumvars <- (NS_var * (lengths + 2*kcovs[lengths]))/lengths^2 -# p_as <- pnorm(means, mean=NS_mean, sd=sqrt(sumvars), lower.tail=FALSE) -# p_as[1:10] -# -#### OK -# kcovs <- c(0, sapply(2:max(lengths, na.rm=T), function(k) sum(covs[2:k] * (k-1):1)/(k/2))) -# sumvars <- (NS_var * (1 + kcovs[lengths]))/lengths -# p_as <- pnorm(means, mean=NS_mean, sd=sqrt(sumvars), lower.tail=FALSE) -# p_as[1:10] -# - -get_delta_stats <- function(delta, NS_mean, NS_sd, probes, probes_IG, - p.adjust.method="BH", signif=0.05, acfs=NULL) { - genes_res <- get_delta_stats_single(delta, NS_mean, NS_sd, probes, p.adjust.method, signif, acfs) - IG_res <- get_delta_stats_single(delta, NS_mean, NS_sd, probes_IG, p.adjust.method, signif, acfs) - - ## 'Correct' NA results for intergenic regions: these result from the IGs being too small to calculate significance. - #IG_res$hits[is.na(IG_res$hits)] <- TRUE - #IG_res$hits_as[is.na(IG_res$hits_as)] <- TRUE - #IG_res$hits_dis[is.na(IG_res$hits_dis)] <- TRUE - - list(z=genes_res$z, p=genes_res$p, p_as=genes_res$p_as, p_dis=genes_res$p_dis, - hits=genes_res$hits, hits_as=genes_res$hits_as, hits_dis=genes_res$hits_dis, - IG_z=IG_res$z, IG_p=IG_res$p, IG_p_as=IG_res$p_as, IG_p_dis=IG_res$p_dis, - IG_hits=IG_res$hits, IG_hits_as=IG_res$hits_as, IG_hits_dis=IG_res$hits_dis) -} - -## Obtain significantly dissociated/associated running windows -get_delta_stats_win <- function(delta, NS_mean, NS_sd, k=10, - p.adjust.method="BH", signif=0.05) { - means <- runmean(delta, k=k); - - acfs <- acf(delta, lag.max=k, plot=FALSE) - - NS_var <- NS_sd^2 - - covs <- acfs$acf * NS_var # Scale correlation by SD based on null distribution - #covs <- acfs$acf * var(delta) # Scale correlation by SD based on delta distribution - - kcovs <- c(0, sapply(2:k, function(l) sum(covs[2:l] * (l-1):1))) - sumvars <- ((NS_var)*k) + (2 * kcovs[k]) - #sumvars <- (var(delta)*k) + (2 * kcovs[k]) - - z <- ((means*k) - (NS_mean*k)) / sqrt(sumvars) - - p_as <- pnorm(means*k, mean=NS_mean*k, sd=sqrt(sumvars), lower.tail=FALSE) - hits_as <- (p.adjust(p_as, method=p.adjust.method) < signif) - - p_dis <- pnorm(means*k, mean=NS_mean*k, sd=sqrt(sumvars), lower.tail=TRUE) - hits_dis <- (p.adjust(p_dis, method=p.adjust.method) < signif) - - # For a two-tailed test do both the lower-tailed and the upper-tailed test and - # double the P-value of the smaller of the two results. - p <- pmin(p_as, p_dis)*2 - hits <- (p.adjust(p, method=p.adjust.method) < signif) - - list(z=z, p=p, p_as=p_as, p_dis=p_dis, hits=hits, hits_as=hits_as, hits_dis=hits_dis) -} - - -Davies.Harte.FD <- function (n, sigma = 1, delta = 0) { - N <- 2^ceiling(log2(n)) - M <- 2*N - - S.X.tau <- sigma^2 / pi * sin(pi*delta) * gamma(1-2*delta) * - exp(lgamma((0:N)+delta) - lgamma((0:N)+1-delta)) - - if (delta < 0) - S.X.tau[1] <- -S.X.tau[1] - - S <- Re(fft(c(S.X.tau, S.X.tau[N:2]))) - - Z <- rnorm(M) - - Y <- double(M) - Y[1] <- Z[1] * sqrt(M*S[1]) - Y[2:N] <- (Z[2*(2:N)-1] + Z[2*(2:N)]*1i) * sqrt(M*S[2:N]/2) - Y[N+1] <- Z[M] * sqrt(M*S[N+1]) - Y[(N+2):M] <- Conj(Y[N:2]) - - Y <- Re(fft(Y, inverse = T)) / M - - return(Y[1:n]) -} - -## Make synteny mapping between master (e.g., mouse) and slave (e.g., human) species. -make_synteny_maps <- function(master, slave, master_template, slave_template, - master_species="mouse", slave_species="human", - format="png", sample=TRUE, info="", master_col="black", - slave_col=NULL, lwd=0.2, ylim=NULL, to_file=TRUE, outputdir="misc_figures", - postfun=NULL, postpar=NULL, prefun=NULL, prepar=NULL) { - - master_chroms <- chrom_info[[master_species]]; - master_chromsizes <- chromsize_info[[master_species]]; - - ncols <- round(sqrt(length(master_chroms))) - nrows <- ceiling(sqrt(length(master_chroms))) - tot <- ncols * nrows; - colwidths <- tapply(master_chroms, rep(1:ncols, each=nrows)[1:length(master_chroms)], - function(x) max(master_template$start[master_template$seqname %in% x])) - - if (to_file) { - type <- format; - if (format == "pdf") type <- "pdfwrite"; - if (format == "png") type <- "png16m"; - bitmap(file=paste(outputdir, "/synteny_maps_", info, "_", id(), ".", format, sep=""), - type=type, width=14*ncols, height=7*nrows, res=144, taa=4) - } - - par(oma=c(2,3,2,0), mar=c(0,2,0,2)) - nf <- layout(matrix(1:(tot*3), 3*nrows, ncols, byrow = FALSE), heights=rep(c(3,3,1.5), tot), widths=colwidths/sum(colwidths)) - - if (length(master_col) == 1) master_col <- rep(master_col, length(master)) - - if (is.null(slave_col)) { - slave_col <- rep("#000000", length(unique(slave_template$seqname))) - names(slave_col) <- sort(unique(slave_template$seqname)); - slave_chroms <- chrom_info[[slave_species]]; - slave_col[slave_chroms] <- rainbow(length(slave_chroms)); - slave_col=slave_col[slave_template$seqname] - } - - #xlim <- range(master_chromsizes[, c("start","end")]) - colwidths_chr <- rep(colwidths, each=nrows); - names(colwidths_chr) <- master_chroms; - - for (chr in master_chroms) { - print(chr) - sel <- which(master_template$seqname == chr) - if (sample) sel <- sample(sel, round(length(sel)/5)) - - xlim <- c(0, colwidths_chr[chr]); - if (!is.null(prefun)) prefun(prepar); - plot(master_template$start[sel], master[sel], type="h", xlim=xlim, ylim=ylim, - xaxt="n", bty="n", xlab="", ylab="", cex.axis=0.8, lwd=lwd, col=master_col[sel]) - if (!is.null(postfun)) postfun(postpar); - - mtext(paste(" ", chr), 3, 0, cex=1, line=-0.2, adj=0) - if (!is.null(prefun)) prefun(prepar); - plot(master_template$start[sel], slave[sel], type="h", xlim=xlim, ylim=ylim, - xaxt="n", bty="n", xlab="", ylab="", cex.axis=0.8, lwd=lwd, col=slave_col[sel]) - if (!is.null(postfun)) postfun(postpar); - - axDat <- prettyTicks(c(0, max(master_template$start[master_template$seqname == chr])), ntick = 7) - plot(master_template$start[master_template$seqname == chr], - rep(1, length(which(master_template$seqname == chr))), type="n", - bty="n", axes=FALSE, ylim=c(-0.3, 1), xlab="", ylab="", xlim=xlim) - axis(1, at = axDat$at, labels = axDat$labels, cex.axis=1, line=-2.5, cex.axis=0.8, padj=-1.2) - } - - mtext(paste("genomic location (", axDat$xunit, ")", sep=""), 1, line=0.5, cex=1.5, outer=TRUE) - mtext(text=expression(log[2](LaminB1~interaction)), 2, 1, cex=1.5, outer=TRUE) - - if (to_file) dev.off() -} - -make_synteny_maps_chr <- function(master, slave, master_template, slave_template, - master_species="mouse", slave_species="human", - chr="chr1", xlim=NULL, ylim=NULL, format="png", - sample=TRUE, main="", info="", lwd=0.2, - master_col="black", slave_col=NULL, - width=14, to_file=TRUE, outputdir="misc_figures", - prefun=NULL, prepar=NULL, postfun=NULL, postpar=NULL) { - master_chroms <- chrom_info[[master_species]]; - master_chromsizes <- chromsize_info[[master_species]]; - - if (is.null(xlim)) { - xrange <- "full"; - xlim <- as.numeric(master_chromsizes[master_chromsizes$seqname==chr,c("start","end")]) - } else { - xrange <- paste(xlim, collapse="-") - } - - if (is.null(ylim)) { - ylim_master <- quantile(master, prob=c(0.001,0.999), na.rm=T) - ylim_slave <- quantile(slave, prob=c(0.001,0.999), na.rm=T) - } else { - ylim_master <- ylim_slave <- NULL; - } - - if (to_file) { - type <- format; - if (format == "pdf") type <- "pdfwrite"; - if (format == "png") type <- "png16m"; - bitmap(file=paste(outputdir, "/synteny_maps_", chr, "_", xrange, "_", - info, "_", id(), ".", format, sep=""), - type=type, width=width, height=3, res=144, taa=4) - } - - nf <- layout(matrix(1:3, 3, 1, byrow = FALSE), heights=c(3,3,1.5)) - #par(oma=c(2,4,2,0), mar=c(0,2,0,0)) - par(mar=c(0,7,0,1), oma=c(0,0,ifelse(nchar(main) > 0, 2, 1),0)) - - if (length(master_col) == 1) master_col <- rep(master_col, length(master)) - - if (is.null(slave_col)) { - slave_col <- rep("#000000", length(unique(slave_template$seqname))) - names(slave_col) <- sort(unique(slave_template$seqname)); - slave_chroms <- chrom_info[[slave_species]]; - slave_col[slave_chroms] <- rainbow(length(slave_chroms)); - slave_col=slave_col[slave_template$seqname] - } - - sel <- which(master_template$seqname == chr) - if (sample) sel <- sample(sel, round(length(sel)/5)) - - - plot(0, type="n", xlim=xlim, ylim=ylim_master, xaxt="n", bty="n", xlab="", ylab="", cex.axis=2) - if (!is.null(prefun)) prefun(prepar); - lines(master_template$start[sel], master[sel], type="h", lwd=lwd, col=master_col[sel]) - #mtext(paste(" ", chr), 3, 0, cex=2, line=-0.2, adj=0) - if (!is.null(postfun)) postfun(postpar); - - plot(0, type="n", xlim=xlim, ylim=ylim_slave, xaxt="n", bty="n", xlab="", ylab="", cex.axis=2) - if (!is.null(prefun)) prefun(prepar); - lines(master_template$start[sel], slave[sel], type="h", lwd=lwd, col=slave_col[sel]) - if (!is.null(postfun)) postfun(postpar); - - axDat <- prettyTicks(xlim, ntick = round(width/2)) - plot(master_template$start[master_template$seqname == chr], - rep(1, length(which(master_template$seqname == chr))), type="n", - bty="n", axes=FALSE, ylim=c(0, 1), xlab="", ylab="", xlim=xlim) - axis(1, at = axDat$at, labels = axDat$labels, cex.axis=2.5, line=-7, padj=0.5) - - mtext(text=expression(log[2](nuclear~lamina~interaction)), side=2, -3, cex=2, at=0.6, outer=TRUE) - mtext(paste("location on ", master_species, " chromosome ", sub("chr", "", chr), - " (", axDat$xunit, ")", sep=""), side=3, -7, cex=2) - - mtext(main, side=3, cex=2, outer=TRUE) - - if (to_file) dev.off() -} - -make_ext_synteny_maps_chr <- function(master, slave, master_template, slave_template, - master_species="mouse", slave_species="human", - chr="chr1", xlim=NULL, format="png", sample=TRUE, - info="", master_col="black", slave_col=NULL, - lwd=0.2, ylim=NULL, width=14, to_file=TRUE, - outputdir="misc_figures", - ext_dat=NULL, postfun=NULL, postpar=NULL, - prefun=NULL, prepar=NULL) { - master_chroms <- chrom_info[[master_species]]; - master_chromsizes <- chromsize_info[[master_species]]; - - if (is.null(xlim)) { - xrange <- "full"; - xlim <- as.numeric(master_chromsizes[master_chromsizes$seqname==chr,c("start","end")]) - } else { - xrange <- paste(xlim, collapse="-") - } - - if (is.null(ylim)) { - ylim_master <- quantile(master, prob=c(0.001,0.999), na.rm=T) - ylim_slave <- quantile(slave, prob=c(0.001,0.999), na.rm=T) - if (is.null(ext_dat)) { - ylim_ext <- NULL; - } else { - ylim_ext <- lapply(ext_dat, function(x) quantile(x, prob=c(0.001,0.999), na.rm=T)) - } - } else { - ylim_master <- ylim_slave <- ylim_ext <- NULL; - } - - if (to_file) { - type <- format; - if (format == "pdf") type <- "pdfwrite"; - if (format == "png") type <- "png16m"; - bitmap(file=paste(outputdir, "/synteny_maps_", chr, "_", xrange, "_", - info, "_", id(), ".", format, sep=""), - type=type, width=width, height=3, res=144, taa=4) - } - - nf <- layout(matrix(1:(length(ext_dat)+3), (length(ext_dat)+3), 1, byrow = FALSE), - heights=c(3,3,rep(1.5, length(ext_dat)),1.5)) - #par(oma=c(2,4,2,0), mar=c(0,2,0,0)) - par(mar=c(0,7,0,1), oma=c(0,0,1,0)) - - if (length(master_col) == 1) master_col <- rep(master_col, length(master)) - - if (is.null(slave_col)) { - slave_col <- rep("#000000", length(unique(slave_template$seqname))) - names(slave_col) <- sort(unique(slave_template$seqname)); - slave_chroms <- chrom_info[[slave_species]]; - slave_col[slave_chroms] <- rainbow(length(slave_chroms)); - slave_col=slave_col[slave_template$seqname] - } - - sel <- which(master_template$seqname == chr) - if (sample) sel <- sample(sel, round(length(sel)/5)) - - plot(0, type="n", xlim=xlim, ylim=ylim_master, xaxt="n", bty="n", xlab="", ylab="", cex.axis=2) - if (!is.null(prefun)) prefun(prepar); - lines(master_template$start[sel], master[sel], type="h", lwd=lwd, col=master_col[sel]) - #mtext(paste(" ", chr), 3, 0, cex=2, line=-0.2, adj=0) - legend("topleft", "(x,y)", inset=0.001, legend=master_species, cex=2, bty="n") - if (!is.null(postfun)) postfun(postpar); - - plot(0, type="n", xlim=xlim, ylim=ylim_slave, xaxt="n", bty="n", xlab="", ylab="", cex.axis=2) - if (!is.null(prefun)) prefun(prepar); - lines(master_template$start[sel], slave[sel], type="h", lwd=lwd, col=slave_col[sel]) - legend("topleft", "(x,y)", inset=0.001, legend=slave_species, cex=2, bty="n") - if (!is.null(postfun)) postfun(postpar); - - for (i in 1:length(ext_dat)) { - plot(0, type="n", xlim=xlim, ylim=ylim_ext[[i]], xaxt="n", bty="n", xlab="", ylab="", cex.axis=2) - if (!is.null(prefun)) prefun(prepar); - lines(master_template$start[sel], ext_dat[[i]][sel], type="h", lwd=lwd, col="black") - legend("topleft", "(x,y)", inset=0.001, legend=names(ext_dat)[i], cex=2, bty="n") - if (!is.null(postfun)) postfun(postpar); - } - - axDat <- prettyTicks(xlim, ntick = round(width/2)) - plot(master_template$start[master_template$seqname == chr], - rep(1, length(which(master_template$seqname == chr))), type="n", - bty="n", axes=FALSE, ylim=c(0, 1), xlab="", ylab="", xlim=xlim) - axis(1, at = axDat$at, labels = axDat$labels, cex.axis=2.5, line=-7, padj=0.5) - - mtext(paste("location on ", master_species, " chromosome ", sub("chr", "", chr), - " (", axDat$xunit, ")", sep=""), side=3, -7, cex=2) - mtext(text=expression(log[2](nuclear~lamina~interaction)), 2, -3, cex=2, at=0.6, outer=TRUE) - - if (to_file) dev.off() -} - - -draw_rainbow_bar_outertext <- function(template, chroms, format="png", header="Human\nchroms", to_file=TRUE, - mar=c(2,2,2,6), info="", outputdir="misc_figures", main_cex=2.5) { - if (to_file) { - type <- format; - if (format == "pdf") type <- "pdfwrite"; - if (format == "png") type <- "png16m"; - bitmap(file=paste(outputdir, "/rainbow_bar_outertext_", info, "_", id(), ".", format, sep=""), - type=type, res=144, taa=4, height=7, width=1) - } - - par(oma=c(0,0,4,0), mar=mar) - - chroms_cols <- rep("#000000", length(unique(template$seqname))) - names(chroms_cols) <- sort(unique(template$seqname)); - chroms_cols[chroms] <- rainbow(length(chroms)); - - n <- length(chroms); - cols <- chroms_cols[chroms] - - coly1 <- rev((1:n) / n) - coly2 <- coly1 - 1/n - - plot(0, type="n", xlim=c(0,1), ylim=c(0,1), xaxs="i", yaxs="i", axes=FALSE, xlab="", ylab="") - - rect(0, coly1, 1, coly2, col=cols, border=NA) - grid(nx=1, ny=n, col="black", lty=1); box(); - mtext(side=3, header, cex=main_cex, font=2, adj=0.5, line=-0.75, outer=TRUE) - - tickMarks <- ((0:n)/n)+(1/(2*n)); tickMarks <- tickMarks[-length(tickMarks)] - labels <- chroms - axis(side=4, at=tickMarks, labels=rev(labels), las=2, cex.axis=2) - #axis(side=1, at=tickMarks[sel], labels=c(2, 5, ">8"), cex.axis=2, padj=0.3) - - if (to_file) dev.off() -} - -draw_rainbow_bar <- function(template, chroms, format="png", header="Human\nchroms", to_file=TRUE, - mar=c(2,2,2,2), info="", outputdir="misc_figures", main_cex=2.5) { - if (to_file) { - type <- format; - if (format == "pdf") type <- "pdfwrite"; - if (format == "png") type <- "png16m"; - bitmap(file=paste(outputdir, "/rainbow_bar_", info, "_", id(), ".", format, sep=""), - type=type, res=144, taa=4, height=7, width=1) - - par(oma=c(0,0,4,0), mar=mar) - } - - chroms_cols <- rep("#000000", length(unique(template$seqname))) - names(chroms_cols) <- sort(unique(template$seqname)); - chroms_cols[chroms] <- rainbow(length(chroms)); - - n <- length(chroms); - cols <- chroms_cols[chroms] - - coly1 <- rev((1:n) / n) - coly2 <- coly1 - 1/n - - plot(0, type="n", xlim=c(0,1), ylim=c(0,1), xaxs="i", yaxs="i", axes=FALSE, xlab="", ylab="") - - rect(0, coly1, 1, coly2, col=cols, border=NA) - grid(nx=1, ny=n, col="black", lty=1); box(); - mtext(side=3, header, cex=main_cex, font=2, adj=0.5, line=-0.75, outer=TRUE) - - text(0.5, coly1-(1/(2*n)), col="white", sub("chr", "", chroms), cex=2.5, font=2, adj=0.5) - - if (to_file) dev.off() -} - -draw_rainbow_box <- function(template, chroms, format="png", header="Human chromosomes", to_file=TRUE, - mar=c(2,2,2,2), info="", outputdir="misc_figures", main_cex=2) { - n <- length(chroms); - ncols <- round(sqrt(n)) - nrows <- ceiling(sqrt(n)) - tot <- ncols * nrows; - - if (to_file) { - type <- format; - if (format == "pdf") type <- "pdfwrite"; - if (format == "png") type <- "png16m"; - bitmap(file=paste(outputdir, "/rainbow_box_", info, "_", id(), ".", format, sep=""), - type=type, res=144, taa=4, height=nrows/2, width=ncols) - } - - par(mar=mar) - - mat <- matrix(1:tot, nrow=nrows, ncol=ncols); - mat <- mat[nrows:1,] - - chroms_cols <- rep("#FFFFFF", tot); - names(chroms_cols) <- chroms; - chroms_cols[chroms] <- rainbow(n); - - image(mat, col=rev(chroms_cols), axes=FALSE); - grid(nx=nrows, ny=ncols, col="black", lty=1); box(); - mtext(side=3, header, cex=main_cex, font=2, adj=0, line=0.5) - - text(x=rep((0:(nrows-1))/(nrows-1), ncols), - y=rep(((ncols-1):0)/(ncols-1), each=nrows), labels=chroms, cex=3) - - if (to_file) dev.off() -} - - -draw_HMMcall_box <- function(calls=c("LAD", "iLAD"), cols=c("red", "green"), textcols=c("black", "black"), - format="png", header="Human lamina status", to_file=TRUE, mar=c(2,2,2,2), - main_cex=2, info="", outputdir="misc_figures") { - - if (to_file) { - type <- format; - if (format == "pdf") type <- "pdfwrite"; - if (format == "png") type <- "png16m"; - bitmap(file=paste(outputdir, "/HMMcall_box_", info, "_", id(), ".", format, sep=""), - type=type, res=144, taa=4, height=1, width=length(calls)) - } - - par(mar=mar) - - n <- length(calls) - mat <- matrix(1:n, nrow=n); - - image(mat, col=cols, axes=FALSE); - grid(nx=n, ny=1, col="black", lty=1); box(); - mtext(side=3, header, cex=main_cex, font=2, adj=0, line=0.5) - - text((0:(n-1))/(n-1), rep(0, n), calls, cex=2, col=textcols) - - if (to_file) dev.off() -} - -#violin.plot <- function (x, x.scaling = 1, autoscale = 0.995, -# individual.scaling = FALSE, median.bar = "grey50", x.pos, cex.names=1, ...) { -## Author: Guillaume Filion. -## Date: June 7, 2011. -## x: a list/data.frame of values. -## x.scaling: an x scaling factor to apply to the densities. -## autoscale: the minimum proportion of the data to be displayed on the plot. -## ...: density, angle, border, col, and lty are passed to polygon(). -# -# if (autoscale < 0 || autoscale > 1) -# stop ("'autoscale' outside the interval [0,1]") -# -# m <- length(x) -# if (missing(x.pos)) -# x.pos <- 1:m -# densities <- lapply(X = x, FUN = density, na.rm = TRUE) -# -# # Compute nice spacing parameters between plots. -# y.sup <- sapply(X = densities, FUN = function (x) max(x$y)) -# x.min <- min(sapply(X = densities, FUN = function (x) min(x$x))) -# x.max <- max(sapply(X = densities, FUN = function (x) max(x$x))) -# if(individual.scaling) -# y.sup <- y.sup/.8 -# else -# y.sup <- rep(max(y.sup), length(y.sup)) -# -# extrargs = list(...) -# # The density, angle, border, col, and lty parameters are passed -# # to polygon(). The rest is passed to plot(). -# pol.index <- match(names(extrargs), -# c("density", "angle", "border", "col", "lty")) -# plotargs <- list() -# polargs <- list() -# if (length(extrargs) > 0) { -# for (i in 1:length(extrargs)) { -# if (is.na(pol.index[i])) -# plotargs[[names(extrargs)[i]]] <- extrargs[[i]] -# else -# polargs[[names(extrargs)[i]]] <- extrargs[[i]] -# } -# } -# # Set the axis labels to null if not specified. -# if (is.null(plotargs[["xlab"]])) -# plotargs[["xlab"]] <- "" -# if (is.null(plotargs[["ylab"]])) -# plotargs[["ylab"]] <- "" -# -# # Find a nice y zoom if ylim is not specified. -# if (is.null(plotargs[["ylim"]])) { -# ymin <- min(sapply(X = x, FUN = quantile, prob = (1 - autoscale)/2, -# na.rm = TRUE)) -# ymax <- max(sapply(X = x, FUN = quantile, prob = 0.5 + autoscale/2, -# na.rm = TRUE)) -# plotargs[["ylim"]] <- c(ymin, ymax) -# } -# -# # Overwrite type, xaxt, x, and y. -# plotargs[["type"]] <- 'n' -# plotargs[["xaxt"]] <- 'n' -# plotargs[["x"]] <- c(min(x.pos)-.6*x.scaling, max(x.pos)+.6*x.scaling) -# plotargs[["y"]] <- c(x.min, x.max) -# -# # Plot the frame. -# eval(as.call(c(plot, plotargs))) -# -# # Manually recycle the parameters passed to polygon(). -# if (is.null(polargs[["col"]])) -# polargs[["col"]] <- 'black' -# for (i in 1:length(polargs)) -# polargs[[i]] <- rep(eval(polargs[[i]]), length.out = m) -# -# # Set the default border parameter to the value of col. -# if (is.null(polargs[["border"]])) -# polargs[["border"]] <- polargs[["col"]] -# # Set color to black if not specified. -# if (is.null(polargs[["col"]])) -# polargs[["col"]] <- "black" -# -# # Plot the x-axis. -# axis(side = 1, at = 1:m, labels = names(x), cex.axis=cex.names) -# -# # Plot the violins. -# polargs.i <- list() -# for (i in 1:m) { -# polargs.i[["x"]] <- x.scaling*(c(densities[[i]]$y, -# -rev(densities[[i]]$y))) / (2.1*y.sup[i]) + x.pos[i] -# polargs.i[["y"]] <- c(densities[[i]]$x, rev(densities[[i]]$x)) -# for (a in names(polargs)) -# polargs.i[[a]] <- polargs[[a]][i] -# eval(as.call(c(polygon, polargs.i))) -# -# # Plot the median bar. -# if (!is.na(median.bar)) { -# medians <- sapply(X = x, FUN = median, na.rm = TRUE) -# segments(x0 = x.pos[i], x1 = x.pos[i]-.4, y0 = medians[i], -# y1 = medians[i], lwd = 2, lty = 1, col = median.bar) -# segments(x0 = x.pos[i], x1 = x.pos[i]+.4, y0 = medians[i], -# y1 = medians[i], lwd = 2, lty = 1, col = median.bar) -# } -# } -#} - -violin.plot <- function (x, names, x.scaling = 1, autoscale = 0.995, - individual.scaling = FALSE, median.bar = "grey50", x.pos, ...) { -# Author: Guillaume Filion. -# Date: June 7, 2011. -# x: a list/data.frame of values. -# x.scaling: an x scaling factor to apply to the densities. -# autoscale: the minimum proportion of the data to be displayed on the plot. -# ...: density, angle, border, col, and lty are passed to polygon(). - - if (autoscale < 0 || autoscale > 1) - stop ("'autoscale' outside the interval [0,1]") - - m <- length(x) - if (missing(x.pos)) - x.pos <- 1:m - densities <- lapply(X = x, FUN = density, na.rm = TRUE) - - # Compute nice spacing parameters between plots. - y.sup <- sapply(X = densities, FUN = function (x) max(x$y)) - x.min <- min(sapply(X = densities, FUN = function (x) min(x$x))) - x.max <- max(sapply(X = densities, FUN = function (x) max(x$x))) - if(individual.scaling) - y.sup <- y.sup/.8 - else - y.sup <- rep(max(y.sup), length(y.sup)) - - extrargs = list(...) - # The density, angle, border, col, and lty parameters are passed - # to polygon(). The rest is passed to plot(). - pol.index <- match(names(extrargs), - c("density", "angle", "border", "col", "lty")) - plotargs <- list() - polargs <- list() - if (length(extrargs) > 0) { - for (i in 1:length(extrargs)) { - if (is.na(pol.index[i])) - plotargs[[names(extrargs)[i]]] <- extrargs[[i]] - else - polargs[[names(extrargs)[i]]] <- extrargs[[i]] - } - } - # Set the axis labels to null if not specified. - if (is.null(plotargs[["xlab"]])) - plotargs[["xlab"]] <- "" - if (is.null(plotargs[["ylab"]])) - plotargs[["ylab"]] <- "" - - # Find a nice y zoom if ylim is not specified. - if (is.null(plotargs[["ylim"]])) { - ymin <- min(sapply(X = x, FUN = quantile, prob = (1 - autoscale)/2, - na.rm = TRUE)) - ymax <- max(sapply(X = x, FUN = quantile, prob = 0.5 + autoscale/2, - na.rm = TRUE)) - plotargs[["ylim"]] <- c(ymin, ymax) - } - - # Overwrite type, xaxt, x, and y. - plotargs[["type"]] <- 'n' - plotargs[["xaxt"]] <- 'n' - plotargs[["x"]] <- c(min(x.pos)-.6*x.scaling, max(x.pos)+.6*x.scaling) - plotargs[["y"]] <- c(x.min, x.max) - - # Plot the frame. - eval(as.call(c(plot, plotargs))) - - # Manually recycle the parameters passed to polygon(). - if (is.null(polargs[["col"]])) - polargs[["col"]] <- 'black' - for (i in 1:length(polargs)) - polargs[[i]] <- rep(eval(polargs[[i]]), length.out = m) - - # Set the default border parameter to the value of col. - if (is.null(polargs[["border"]])) - polargs[["border"]] <- polargs[["col"]] - # Set color to black if not specified. - if (is.null(polargs[["col"]])) - polargs[["col"]] <- "black" - - # Plot the x-axis. - axis(side = 1, at = 1:m, labels = names) - - # Plot the violins. - polargs.i <- list() - for (i in 1:m) { - polargs.i[["x"]] <- x.scaling*(c(densities[[i]]$y, - -rev(densities[[i]]$y))) / (2.1*y.sup[i]) + x.pos[i] - polargs.i[["y"]] <- c(densities[[i]]$x, rev(densities[[i]]$x)) - for (a in names(polargs)) - polargs.i[[a]] <- polargs[[a]][i] - eval(as.call(c(polygon, polargs.i))) - - # Plot the median bar. - if (!is.na(median.bar)) { - medians <- sapply(X = x, FUN = median, na.rm = TRUE) - segments(x0 = x.pos[i], x1 = x.pos[i]-.4, y0 = medians[i], - y1 = medians[i], lwd = 2, lty = 1, col = median.bar) - segments(x0 = x.pos[i], x1 = x.pos[i]+.4, y0 = medians[i], - y1 = medians[i], lwd = 2, lty = 1, col = median.bar) - } - } -} - -get_pref_stats_one_side <- function(hits, IG_hits) { - n <- length(hits)*2 - all_hits <- rep(NA, n) - all_hits[seq(1, n, 2)] <- IG_hits - all_hits[seq(2, n, 2)] <- hits - - gene_ind <- rep(c(0,1), n/2) - - res <- doFisherGeneric(all_hits, gene_ind) - - invisible(res) -} - -get_pref_stats <- function(stats) { - print("Dissociation") - pref_dis <- get_pref_stats_one_side(stats$hits_dis, stats$IG_hits_dis) - print("Association") - pref_as <- get_pref_stats_one_side(stats$hits_as, stats$IG_hits_as) - - invisible(list(pref_dis=pref_dis, pref_as=pref_as)) -} - -get_adj_stats_one_side <- function(hits, IG_hits, VERBOSE=TRUE) { - # In case intergenic regions are too small to test for significance, - # we previously assigned 'NA' to their coordinates. Here, we change their - # as/dis status to 'TRUE', in order to be able to glue adjacent dis/as genes together. - IG_hits[is.na(IG_hits)] <- TRUE; - - n <- length(hits)*2 - all_hits <- rep(NA, n) - all_hits[seq(1, n, 2)] <- IG_hits - all_hits[seq(2, n, 2)] <- hits - - sel <- which(all_hits) - adj <- data.frame(start=sel[c(1, which(diff(sel) != 1)+1)], - end=sel[c(which(diff(sel) != 1), length(sel))]); - - adj_full <- adj; - - # If the start or end position is odd, it is in IG. - # Because we're intereseted in genes only, we correct this by adding 1 to start and/or subtracting 1 from end. - adj$start <- ifelse(adj$start %% 2 == 1, adj$start+1, adj$start) - adj$end <- ifelse(adj$end %% 2 == 1, adj$end-1, adj$end) - - sel <- which(adj$start %% 2 == 0 & adj$end %% 2 == 0 & adj$start == adj$end) - - if (VERBOSE) { - cat("% of genes that move in isolation: "); - cat(length(sel) / length(which(hits)) * 100) - cat("\n"); - cat("% of these genes that move without their intergenic regions: "); - cat(length(which(adj_full$start[sel] %% 2 == 0 & adj_full$end[sel] %% 2 == 0)) / length(sel) * 100) - cat("\n"); - cat("% of all genes that move in isolation and without their intergenic regions: "); - cat(length(which(adj_full$start[sel] %% 2 == 0 & adj_full$end[sel] %% 2 == 0)) / length(which(hits)) * 100) - cat("\n") - } - - # Remove IG-only regions and translate indices back to gene indices - adj <- adj[adj$end-adj$start >= 0,] - adj$start <- adj$start/2 - adj$end <- adj$end/2 - - ls <- (adj$end-adj$start+1) - if (VERBOSE) print(table(ls)*as.numeric(names(table(ls)))/length(which(hits))*100) - adj$length <- ls - - invisible(list(adj=adj, adj_full=adj_full)) -} - -get_adj_stats <- function(stats, VERBOSE=TRUE) { - if (VERBOSE) print("Dissociating genes") - adj_dis <- get_adj_stats_one_side(stats$hits_dis, stats$IG_hits_dis, VERBOSE) - if (VERBOSE) print("Associating genes") - adj_as <- get_adj_stats_one_side(stats$hits_as, stats$IG_hits_as, VERBOSE) - - invisible(list(adj_dis=adj_dis$adj, adj_as=adj_as$adj, - adj_dis_full=adj_dis$adj_full, adj_as_full=adj_as$adj_full)) -} - -get_flanks <- function(probes, n=10, inner=FALSE) { - probes_5prime <- probes_3prime <- probes; - - if (inner) { - probes_5prime$end <- probes$start + n; - probes_3prime$start <- probes$end - n; - } else { - probes_5prime$end <- probes$start; - probes_5prime$start <- probes$start - n; - probes_3prime$end <- probes$end + n; - probes_3prime$start <- probes$end; - } - - probes_flanks <- rbind(probes_5prime, probes_3prime) - probes_flanks <- probes_flanks[order(probes_flanks$start),] - - # Make sure there are no overlapping flanks: OK - m <- nrow(probes_flanks) - res <- quantile(probes_flanks$start[2:m] - probes_flanks$end[1:(m-1)]) - if (any(res < 0)) { - warning("Overlapping flanks"); - print(res); - } - - invisible(probes_flanks) -} - -### Used to calculate probe spacing in GFF -probe_spacing <- function(GFF) { - summary(unlist(tapply(1:nrow(GFF), GFF$seqname, function(x) - GFF$start[x[-1]] - GFF$end[x[-length(x)]]))) -} - - -### Used to write diagnostics to the 'diag' object. -add_diag <- function(name, value) { - if (length(value) == 1) { - cat(paste(name, ": ", value, "\n", sep="")) - } else { - cat(paste(name, ":\n", sep="")) - print(value) - } - diag[[name]] <<- value; -} - -### Perform circular permutation using two binary sets: -### Rotate the one and calculate overlap with the other. -### Then, compare to 'real' overlap. -get_circ_perm_p <- function(set1, set2, n=length(set1), do_sample=FALSE) { - if (is.logical(set1)) set1 <- which(set1); - if (is.logical(set2)) set2 <- which(set2); - - if (do_sample) { - offsets <- sort(sample(seq.int(n-1), 1e4)) - } else { - offsets <- seq.int(n-1) - } - rnd_overlap <- c(); - for (i in 1:length(offsets)) { - offset <- offsets[i]; - if ((i/length(offsets)*100) %% 10 == 0) - cat(paste(i/length(offsets)*100, "%\r", sep="")) - - rnd <- (((set2+offset)-1) %% n)+1 - rnd_overlap <- c(rnd_overlap, length(intersect(set1, rnd))) - } - cat("\n"); - - ovl <- length(intersect(set1, set2)) - p_le <- length(which(rnd_overlap <= ovl)) / length(rnd_overlap) - p_gr <- length(which(rnd_overlap > ovl)) / length(rnd_overlap) - p <- min(p_le, p_gr)*2 - - res <- list(ovl=ovl, n=length(rnd_overlap), p_le=p_le, p_gr=p_gr, p=p, summary(rnd_overlap)) -} - -get_conc_circ_perm_p <- function(calls1, calls2, n=length(calls1), do_sample=FALSE) { - if (do_sample) { - offsets <- sort(sample(seq.int(n-1)+1, 1e4)) - } else { - offsets <- seq.int(n-1)+1 - } - - rnd_overlap <- c(); - for (i in 1:length(offsets)) { - offset <- offsets[i]; - if ((i/length(offsets)*100) %% 10 == 0) - cat(paste(i/length(offsets)*100, "%\r", sep="")) - - rnd <- calls2[c(offset:n,1:(offset-1))] - rnd_overlap <- c(rnd_overlap, sum(calls1 == rnd)) - } - cat("\n"); - - ovl <- sum(calls1 == calls2) - - p_le <- length(which(rnd_overlap <= ovl)) / length(rnd_overlap) - p_gr <- length(which(rnd_overlap > ovl)) / length(rnd_overlap) - p <- min(p_le, p_gr)*2 - - res <- list(ovl=ovl, n=length(rnd_overlap), p_le=p_le, p_gr=p_gr, p=p, sum=summary(rnd_overlap)) -} - -# define wideScreen function -wideScreen <- function(ncols) { - if (missing(ncols)) - ncols <- as.integer(Sys.getenv("COLUMNS")) - options(width=ncols); -} - -#### Function to read in (large) .bed files and merge overlapping regions. -# NOTE: simply ignore the fact that UCSC coordinates are 0-based. -# We won't notice this on our probe-level resolution anyway. -readFixMergeBED <- function(file, purge=c(), verbose=FALSE) { - alldat <- NULL; - cnt <- 0; - step <- 1e4; - rcnt <- 0; - cracks <- c(); - - n <- as.numeric(system(paste("cat \"", file, "\" | wc -l", sep=''), intern=TRUE)) - print(paste("Total number of lines:", n)); - - for (i in 1:ceiling(n/step)) { - if (verbose) - print(paste("Processing lines", (cnt*step)+1, "to", min(n, (cnt+1)*step))) - else - cat("."); - - toread <- min(step, n - (cnt*step)); - dat <- readBED(file, nlines=toread, skip=cnt*step, purge=purge); - - # Fix cases where start > end (this happens sometimes..) - hits <- which(dat$start > dat$end) - tmp <- dat$start[hits]; - dat$start[hits] <- dat$end[hits]; - dat$end[hits] <- tmp; - - # Sort data - dat <- dat[order(dat$seqname, dat$start),] - - ## We extend genes by +/- 30bp, as this is half of our probe width. - ## This is to facilitate further merging of annotations with small (< 60bp) gaps. - #dat$start <- dat$start - 30; - #dat$end <- dat$end + 30; - - # Merge overlapping entries. - dat <- Merge(dat) - left <- round((nrow(dat)/toread)*100, 2) - if (verbose) - print(paste(left, "percent left after merging")) - - cracks <- c(cracks, rcnt + nrow(dat)); - alldat <- rbind(alldat, dat); - rcnt <- nrow(alldat); - - cnt <- cnt+1; - } - if (!verbose) cat("\n") - - tmps <- list(); - idxs <- c(); - for (i in 1:(length(cracks)-1)) { - crack <- cracks[i]; # Select crack to be processed - idx <- crack+c(-100:100); # Expand crack - idx <- idx[idx %in% 1:nrow(alldat)] # Correct for boundaries of alldat - idxs <- c(idxs, idx); # Remembers positions - tmp <- alldat[idx,]; # Create temp object - tmp <- tmp[order(tmp$seqname, tmp$start),] # Sort object - tmps <- rbind(tmps, Merge(tmp)) # Merge and append to tmp collection - } - alldat <- alldat[-idxs,] # Remove crack (neighbour) positions from alldat - alldat <- rbind(alldat, tmps) # Append new merged data - alldat <- alldat[order(alldat$seqname, alldat$start),] # Sort all data. - - print(paste(nrow(alldat), "(", round((nrow(alldat) / n) * 100, 2), "%) rows left after merging"), sep="") - - invisible(alldat); -} - -### Function to calculate confidence intervals ###NEED TO CHECK! -get_conf_int <- function(x, dist="norm") { - if (dist == "norm") - qnorm(0.975) * (sd(x) / sqrt(length(x))) - else - qt(0.975, df=length(x)-1) * (sd(x) / sqrt(length(x))) -} - -### Function to scale up/down a heatmap. Adapted from the original R heatmap() function. -heatmap.scale <- -function (x, Rowv = NULL, Colv = if (symm) "Rowv" else NULL, - distfun = dist, hclustfun = hclust, reorderfun = function(d, - w) reorder(d, w), do_reorder=TRUE, add.expr, symm = FALSE, revC = identical(Colv, - "Rowv"), scale = c("row", "column", "none"), na.rm = TRUE, - margins = c(5, 5), ColSideColors, RowSideColors, cexRow = 0.2 + - 1/log10(nr), cexCol = 0.2 + 1/log10(nc), labRow = NULL, - labCol = NULL, main = NULL, xlab = NULL, ylab = NULL, keep.dendro = FALSE, - verbose = getOption("verbose"), width=4, height=4, cex.lab=NULL, ...) -{ - scale <- if (symm && missing(scale)) - "none" - else match.arg(scale) - if (length(di <- dim(x)) != 2 || !is.numeric(x)) - stop("'x' must be a numeric matrix") - nr <- di[1] - nc <- di[2] - if (nr <= 1 || nc <= 1) - stop("'x' must have at least 2 rows and 2 columns") - if (!is.numeric(margins) || length(margins) != 2) - stop("'margins' must be a numeric vector of length 2") - doRdend <- !identical(Rowv, NA) - doCdend <- !identical(Colv, NA) - if (is.null(Rowv)) - Rowv <- rowMeans(x, na.rm = na.rm) - if (is.null(Colv)) - Colv <- colMeans(x, na.rm = na.rm) - if (doRdend) { - if (inherits(Rowv, "dendrogram")) - ddr <- Rowv - else { - hcr <- hclustfun(distfun(x)) - ddr <- as.dendrogram(hcr) - if (!is.logical(Rowv) || Rowv) - ddr <- reorderfun(ddr, Rowv) - } - if (nr != length(rowInd <- order.dendrogram(ddr))) - stop("row dendrogram ordering gave index of wrong length") - } - else rowInd <- 1L:nr - if (doCdend) { - if (inherits(Colv, "dendrogram")) - ddc <- Colv - else if (identical(Colv, "Rowv")) { - if (nr != nc) - stop("Colv = \"Rowv\" but nrow(x) != ncol(x)") - ddc <- ddr - } - else { - hcc <- hclustfun(distfun(if (symm) - x - else t(x))) - ddc <- as.dendrogram(hcc) - if (!is.logical(Colv) || Colv) - ddc <- reorderfun(ddc, Colv) - } - if (nc != length(colInd <- order.dendrogram(ddc))) - stop("column dendrogram ordering gave index of wrong length") - } - else colInd <- 1L:nc - - # Force no-reorder when neccessary. - if (!do_reorder) { - colInd <- 1L:nc; - rowInd <- 1L:nr; - } - - x <- x[rowInd, colInd] - labRow <- if (is.null(labRow)) - if (is.null(rownames(x))) - (1L:nr)[rowInd] - else rownames(x) - else labRow[rowInd] - labCol <- if (is.null(labCol)) - if (is.null(colnames(x))) - (1L:nc)[colInd] - else colnames(x) - else labCol[colInd] - if (scale == "row") { - x <- sweep(x, 1, rowMeans(x, na.rm = na.rm)) - sx <- apply(x, 1, sd, na.rm = na.rm) - x <- sweep(x, 1, sx, "/") - } - else if (scale == "column") { - x <- sweep(x, 2, colMeans(x, na.rm = na.rm)) - sx <- apply(x, 2, sd, na.rm = na.rm) - x <- sweep(x, 2, sx, "/") - } - lmat <- rbind(c(NA, 3), 2:1) - #lwid <- c(if (doRdend) 1 else 0.05, 4) - lwid <- c(if (doRdend) 1 else 0.05, width) - #lhei <- c((if (doCdend) 1 else 0.05) + if (!is.null(main)) 0.2 else 0,4) - lhei <- c((if (doCdend) 1 else 0.05) + if (!is.null(main)) 0.2 else 0, height) - if (!missing(ColSideColors)) { - if (!is.character(ColSideColors) || length(ColSideColors) != - nc) - stop("'ColSideColors' must be a character vector of length ncol(x)") - lmat <- rbind(lmat[1, ] + 1, c(NA, 1), lmat[2, ] + 1) - lhei <- c(lhei[1], 0.2, lhei[2]) - } - if (!missing(RowSideColors)) { - if (!is.character(RowSideColors) || length(RowSideColors) != - nr) - stop("'RowSideColors' must be a character vector of length nrow(x)") - lmat <- cbind(lmat[, 1] + 1, c(rep(NA, nrow(lmat) - 1), - 1), lmat[, 2] + 1) - lwid <- c(lwid[1], 0.2, lwid[2]) - } - lmat[is.na(lmat)] <- 0 - if (verbose) { - cat("layout: widths = ", lwid, ", heights = ", lhei, - "; lmat=\n") - print(lmat) - } - op <- par(no.readonly = TRUE) - on.exit(par(op)) - layout(lmat, widths = lwid, heights = lhei, respect = TRUE) - if (!missing(RowSideColors)) { - par(mar = c(margins[1], 0, 0, 0.5)) - image(rbind(1L:nr), col = RowSideColors[rowInd], axes = FALSE) - } - if (!missing(ColSideColors)) { - par(mar = c(0.5, 0, 0, margins[2])) - image(cbind(1L:nc), col = ColSideColors[colInd], axes = FALSE) - } - par(mar = c(margins[1], 0, 0, margins[2])) - if (!symm || scale != "none") - x <- t(x) - if (revC) { - iy <- nr:1 - ddr <- rev(ddr) - x <- x[, iy] - } - else iy <- 1L:nr - image(1L:nc, 1L:nr, x, xlim = 0.5 + c(0, nc), ylim = 0.5 + - c(0, nr), axes = FALSE, xlab = "", ylab = "", ...) - axis(1, 1L:nc, labels = labCol, las = 2, line = -0.5, tick = 0, - cex.axis = cexCol) - if (!is.null(xlab)) - mtext(xlab, side = 1, line = margins[1] - 1.25, cex=cex.lab) - axis(4, iy, labels = labRow, las = 2, line = -0.5, tick = 0, - cex.axis = cexRow) - if (!is.null(ylab)) - mtext(ylab, side = 4, line = margins[2] - 1.25, cex=cex.lab) - if (!missing(add.expr)) - eval(substitute(add.expr)) - par(mar = c(margins[1], 0, 0, 0)) - if (doRdend) - plot(ddr, horiz = TRUE, axes = FALSE, yaxs = "i", leaflab = "none") - else frame() - par(mar = c(0, 0, if (!is.null(main)) 1 else 0, margins[2])) - if (doCdend) - plot(ddc, axes = FALSE, xaxs = "i", leaflab = "none") - else if (!is.null(main)) - frame() - if (!is.null(main)) - title(main, cex.main = 1.5 * op[["cex.main"]]) - invisible(list(rowInd = rowInd, colInd = colInd, Rowv = if (keep.dendro && - doRdend) ddr, Colv = if (keep.dendro && doCdend) ddc)) -} - -### Obtain optimal clustering for a distance matrix. -get_optimal_ordering <- function(distmat) { - library(cba) - - hc <- hclust(distmat, method="average"); - optim <- order.optimal(distmat, hc$merge); - - invisible(as.numeric(optim$order)); -} - -### Peak finder -# Taken from https://gist.github.com/jamiefolson/5831746 -which.peaks <- function(x,partial=TRUE,decreasing=FALSE){ - if (decreasing){ - if (partial){ - which(diff(c(TRUE,diff(x)<=0,FALSE))>0) - }else { - which(diff(diff(x)<=0)>0) - } - }else { - if (partial){ - which(diff(c(TRUE,diff(x)>=0,FALSE))<0) - }else { - which(diff(diff(x)>=0)<0) - } - - } -} - -# plot pretty ticks -prettyTicks <- function (dataRange, ntick = 6) { - XDIV = 1 - while (XDIV < diff(dataRange)) XDIV = XDIV * 1000 - XDIV = XDIV/1000 - if ((diff(dataRange)/XDIV) <= 2) - XDIV = XDIV/1000 - Xscl <- c(floor(dataRange[1]/XDIV) * XDIV, ceiling(dataRange[2]/XDIV) * XDIV) - xunit = switch(as.character(log10(XDIV)), `0` = "bp", `3` = "kb", - `6` = "Mb", `9` = "Gb", "scale not recognized") - Lab <- pretty(c(Xscl[1]/XDIV, Xscl[2]/XDIV), n = ntick) - Tic <- Lab * XDIV - return(list(at = Tic, labels = Lab, xunit = xunit)) -} - -## Make plots for signals across all chromosomes. -make_chr_plot <- function(coords, signal, type="h", assembly="hg19", format="png", - sample=FALSE, nam="", ylab="", col="black", lwd=0.2, - ylim=range(signal, na.rm=T), to_file=TRUE, - outputdir="misc_figures", postfun=NULL, ...) { - - if (ylab == "") ylab <- nam; - - # Obtain names and sizes of chromosomes - tmp <- read.delim(paste(assembly, "genome", sep="."), header=FALSE, row.names=1) - chroms <- rownames(tmp); - chromsizes <- tmp[,1]; names(chromsizes) <- chroms; - - ncols <- round(sqrt(length(chroms))) - nrows <- ceiling(sqrt(length(chroms))) - tot <- ncols * nrows; - colwidths <- tapply(chroms, rep(1:ncols, each=nrows)[1:length(chroms)], - function(x) max(coords$start[coords$seqname %in% x])) - - if (to_file) { - dir.create(outputdir, recursive=TRUE, showWarnings=FALSE); - plotfile(paste(outputdir, "/chr_plot_", nam, sep=""), - type=format, width=7*ncols, height=5*nrows) - } - - par(oma=c(2,3,2,0), mar=c(0,2,0,1)) - nf <- layout(matrix(1:(tot*2), 2*nrows, ncols, byrow = FALSE), - heights=rep(c(3,1), tot), widths=colwidths/sum(colwidths)) - - if (length(col) == 1) col <- rep(col, length(signal)) - - colwidths_chr <- rep(colwidths, each=nrows); - names(colwidths_chr) <- chroms; - - for (chr in chroms) { - print(chr) - sel <- which(coords$seqname == chr) - if (sample) sel <- sort(sample(sel, round(length(sel)/5))) - - xlim <- c(0, colwidths_chr[chr]); - plot(coords$start[sel], signal[sel], type=type, xlim=xlim, ylim=ylim, - xaxt="n", xaxs="i", bty="n", xlab="", ylab="", adj=1, las=2, - cex.axis=0.8, lwd=lwd, col=col[sel], ...) - mtext(paste(" ", chr), 3, 0, cex=1, line=-1.5, adj=0) - - axDat <- prettyTicks(c(0, max(coords$start[coords$seqname == chr])), ntick = 7) - if (!is.null(postfun)) postfun(range(axDat$at)); - - plot(coords$start[coords$seqname == chr], - rep(1, length(which(coords$seqname == chr))), type="n", - xaxs="i", bty="n", axes=FALSE, ylim=c(-0.3, 1), xlab="", ylab="", xlim=xlim) - axis(1, at = axDat$at, labels = axDat$labels, cex.axis=1, cex.axis=0.8, padj=-1.2, line=-3) - } - - mtext(paste("genomic location (", axDat$xunit, ")", sep=""), 1, line=0.5, cex=1.2, outer=TRUE) - #mtext(text=expression(log[2](LaminB1~interaction)), 2, 1, cex=1.2, outer=TRUE) - mtext(text=ylab, 2, 1, cex=1.2, outer=TRUE) - - if (to_file) dev.off() -} - -