10 changes: 10 additions & 0 deletions Dockerfile
@@ -0,0 +1,10 @@
FROM hotspot2
RUN apk add --no-cache \
g++ \
R \
R-dev
RUN Rscript -e 'install.packages("caTools", repo="http://cran.rstudio.com")'

WORKDIR masterlist

ADD . .
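
For context, a minimal sketch of how this image might be built and used; the hotspot2 base image path, the masterlist tag, and the mounted data paths are all illustrative, and the example assumes hotspot2.sh and its dependencies are already on the PATH in the base image.

# Build the hotspot2 base image first (path is hypothetical), then this image on top of it.
docker build -t hotspot2 path/to/hotspot2
docker build -t masterlist .

# Run the end-to-end pipeline inside the container, mounting a host directory
# that holds the BAM list, chromosome sizes, and mappability file.
docker run --rm -v "$PWD/data:/data" masterlist \
    ./run_from_scratch.sh /data/bamfiles.txt /data/outdir /data/chrom_sizes.bed /data/mappable.starch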
12 changes: 3 additions & 9 deletions code_build.R
@@ -1,3 +1,4 @@
#!/usr/bin/env Rscript
##########################################################################################################
# #
# Method for generating a master list / Index of DNaseI hypersensitivity sites. #
@@ -7,7 +8,6 @@
# #
##########################################################################################################

source("../../code/general.R")
library(caTools)
source("code_ML.R")

@@ -18,6 +18,7 @@ if (length(args)==0) {
stop("No arguments supplied.")
} else {
eval(parse(text=args[[1]])) # parse first argument: chunknum
filepath <- args[[2]] # parse second argument: filepath
}

chunk <- paste("chunk", sprintf("%04d", chunknum), ".bed", sep="");
@@ -31,20 +32,13 @@ options(warn=2)

## Example line:
# chr11 52983767 52983861 id-708944 0.0622013 52983811 LN4866 52983810
#FILEPATH=paste("/net/seq/data/projects/ENCODE3_publications/erynes/ENCODEpaper2017/SubsetOf644/masterLists/",
# "FDR0.0010/outdir_MLfromPeakClumps_1g_MA21_DHSwidthThreshold20bp/chunks_10kb_corrected/", sep="/");
#chunks <- dir(FILEPATH, pattern="^chunk.*.txt");

#FILEPATH="/net/seq/data/projects/ENCODE3_publications/erynes/ENCODEpaper2017/SubsetOf665/varWidthPeaks/chunks_5kb_min500elementsPerChunk/";
FILEPATH="/net/seq/data/projects/ENCODE3_publications/erynes/ENCODEpaper2017/SubsetOf665/varWidthPeaks/chunks_10kb_min500elementsPerChunk/";
#chunks <- dir(FILEPATH, pattern="^chunk.*.bed");

dir.create("DHSs_all", showWarnings=FALSE, recursive=TRUE)
dir.create("peaks_all", showWarnings=FALSE, recursive=TRUE)

#for (chunk in chunks) {
# Load in data per chunk, each separated by at least 10kb
peaks <- read.delim(paste(FILEPATH, chunk, sep="/"), header=FALSE, as.is=T)
peaks <- read.delim(filepath, header=FALSE, as.is=T)
#colnames(peaks) <- c("seqname", "start", "end", "ID", "score", "density_summit", "sampleID", "wavelet_summit")
colnames(peaks) <- c("seqname", "start", "end", "sampleID", "score", "density_summit", "wavelet_summit")
peaks <- peaks[order(peaks$wavelet_summit),] # Order peaks by wavelet summit first
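With this change, code_build.R no longer hard-codes the chunk directory: the chunk BED file is passed explicitly as the second argument, while the first argument is still evaluated as R code to set chunknum. A sketch of an invocation, with a hypothetical chunk path:

# The first argument is evaluated as R code to set chunknum; the second is the chunk BED to read.
Rscript code_build.R 'chunknum=1' outdir/subdir/chunk0001.bed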
32 changes: 17 additions & 15 deletions code_construct_matrix.sh
@@ -1,3 +1,4 @@
#!/bin/bash
##########################################################################################################
# #
# Method for generating a master list / Index of DNaseI hypersensitivity sites. #
@@ -7,38 +8,39 @@
# #
##########################################################################################################

set -e -o pipefail

if [[ $# -eq 0 ]] ; then
echo 'Provide the name/version of the file, e.g. WM20180301'
if [[ $# -lt 2 ]] ; then
echo 'Provide the name/version of the file, e.g. WM20180301, and the number of chunks'
exit 1
fi

NAME=$1;
numchunks=$2

rm -f matrix_bin_all_${NAME}.txt.gz;
for i in $(seq -f "%04g" 1 5000); do
echo ${i}
rm -f "matrix_bin_all_${NAME}.txt.gz"
for i in $(seq -f "%04g" 1 "$numchunks"); do
echo "$i"
if [[ -f "DHSs_all/chunk${i}.bed" ]] ; then
#paste <(cut -f 1-3 DHSs_all/chunk${i}.bed) matrix_bin_all/chunk${i}.bed | gzip -c - >> matrix_bin_all_${NAME}.txt.gz
paste <(cut -f 4 DHSs_all/chunk${i}.bed) matrix_bin_all/chunk${i}.bed | gzip -c - >> matrix_bin_all_${NAME}.txt.gz
paste <(cut -f 4 "DHSs_all/chunk${i}.bed") "matrix_bin_all/chunk${i}.bed" | gzip -c - >> "matrix_bin_all_${NAME}.txt.gz"
fi
done

rm -f matrix_bin_nonovl_core_${NAME}.txt.gz;
for i in $(seq -f "%04g" 1 5000); do
echo ${i}
rm -f "matrix_bin_nonovl_core_${NAME}.txt.gz"
for i in $(seq -f "%04g" 1 "$numchunks"); do
echo "$i"
if [[ -f "DHSs_nonovl_core/chunk${i}.bed" ]] ; then
#paste <(cut -f 1-3 DHSs_nonovl_core/chunk${i}.bed) matrix_bin_nonovl_core/chunk${i}.bed | gzip -c - >> matrix_bin_nonovl_core_${NAME}.txt.gz
paste <(cut -f 4 DHSs_nonovl_core/chunk${i}.bed) matrix_bin_nonovl_core/chunk${i}.bed | gzip -c - >> matrix_bin_nonovl_core_${NAME}.txt.gz
paste <(cut -f 4 "DHSs_nonovl_core/chunk${i}.bed") "matrix_bin_nonovl_core/chunk${i}.bed" | gzip -c - >> "matrix_bin_nonovl_core_${NAME}.txt.gz"
fi
done

rm -f matrix_bin_nonovl_any_${NAME}.txt.gz;
for i in $(seq -f "%04g" 1 5000); do
echo ${i}
rm -f "matrix_bin_nonovl_any_${NAME}.txt.gz"
for i in $(seq -f "%04g" 1 "$numchunks"); do
echo "$i"
if [[ -f "DHSs_nonovl_any/chunk${i}.bed" ]] ; then
#paste <(cut -f 1-3 DHSs_nonovl_any/chunk${i}.bed) matrix_bin_nonovl_any/chunk${i}.bed | gzip -c - >> matrix_bin_nonovl_any_${NAME}.txt.gz
paste <(cut -f 4 DHSs_nonovl_any/chunk${i}.bed) matrix_bin_nonovl_any/chunk${i}.bed | gzip -c - >> matrix_bin_nonovl_any_${NAME}.txt.gz
paste <(cut -f 4 "DHSs_nonovl_any/chunk${i}.bed") "matrix_bin_nonovl_any/chunk${i}.bed" | gzip -c - >> "matrix_bin_nonovl_any_${NAME}.txt.gz"
fi
done
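
An illustrative call of the updated script; the chunk count (621 here) is a placeholder for however many chunk files the upstream chunking step produced:

# NAME tags the three output matrix archives; numchunks bounds the zero-padded chunk loop.
bash code_construct_matrix.sh WM20180301 621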

53 changes: 28 additions & 25 deletions code_gen_masterlist.sh
@@ -1,3 +1,4 @@
#!/bin/bash
##########################################################################################################
# #
# Method for generating a master list / Index of DNaseI hypersensitivity sites. #
@@ -7,16 +8,19 @@
# #
##########################################################################################################

set -e -o pipefail

if [[ $# -lt 2 ]] ; then
echo 'Provide the name/version of the file, e.g. WM20180301, and the chromosome sizes file'
exit 1
fi

NAME=$1;
CHROM_FILE=$2

TYPES="all nonovl_any nonovl_core";
for TYPE in ${TYPES}; do
echo $TYPE;
echo "$TYPE"

FILE_CHUNKIDS="masterlist_DHSs_${NAME}_${TYPE}_chunkIDs.txt";
FILE_INDEXIDS="masterlist_DHSs_${NAME}_${TYPE}_indexIDs.txt";
@@ -26,7 +30,7 @@ for TYPE in ${TYPES}; do
### Create final master list
if ! [[ -f "$FILE_CHUNKIDS" ]] ; then
echo "Concatenating DHS chunks"
cat DHSs_${TYPE}/* | sort-bed - > ${FILE_CHUNKIDS}
cat "DHSs_${TYPE}"/* | sort-bed - > "${FILE_CHUNKIDS}"
fi

#### Generate label mapping
@@ -50,9 +54,9 @@ for TYPE in ${TYPES}; do
echo "Constructing BED12 file"
#echo "browser position chr6:26020208-26022677" > ${FILE_BED}
#echo "track name='Master list DHSs ${NAME} ${TYPE}' description='Master list DHSs ${NAME} ${TYPE}' visibility=2 useScore=1" >> ${FILE_BED}
cat ${FILE_INDEXIDS} | awk '
awk '
function round(x) { if(x=="NA") { return 0 } else { return int(x + 0.5) } }
BEGIN { OFS="\t"; }
BEGIN { OFS="\t"; }
{
if ($10 == "NA" || $11 == "NA") {
thickStart=$2
@@ -65,51 +69,50 @@
$9=($9 >= $3 ? $3-1 : $9)
$10=($10 <= $2 ? $2+1 : $10)
$11=($11 >= $3 ? $3-1 : $11)

thickStart= $9-1
thickEnd = $9+1

blockCount=1
blockSizes=1
blockStarts=0

blockSize2=$9-$10
blockStart2=$10-$2
if (blockSize2 != 0 && blockStart2 < $3-$2-1) {
blockCount+=1;
blockSizes=blockSizes","blockSize2;
blockStarts=blockStarts","blockStart2
if (blockSize2 != 0 && blockStart2 < $3-$2-1) {
blockCount+=1;
blockSizes=blockSizes","blockSize2;
blockStarts=blockStarts","blockStart2
}

blockSize3=$11-$9
blockStart3=$9-$2
if (blockSize3 != 0 && blockStart3 < $3-$2-1) {
blockCount+=1;
blockSizes=blockSizes","blockSize3;
blockStarts=blockStarts","blockStart3
if (blockSize3 != 0 && blockStart3 < $3-$2-1) {
blockCount+=1;
blockSizes=blockSizes","blockSize3;
blockStarts=blockStarts","blockStart3
}

blockSize4=1
blockStart4=$3-$2-1
blockCount+=1;
blockSizes=blockSizes","blockSize4;
blockSizes=blockSizes","blockSize4;
blockStarts=blockStarts","blockStart4
}

score=round(log($5+1)/log(10)*500)
score=(score > 1000 ? 1000 : score)
print $1, $2, $3, $4, score, ".", thickStart, thickEnd, "0,0,0", blockCount, blockSizes, blockStarts
}' > ${FILE_BED}
}' "${FILE_INDEXIDS}" > "${FILE_BED}"
fi

if ! [[ -f "$FILE_BIGBED" ]] ; then
echo "Converting BED file to BIGBED file"
bedToBigBed -type=bed12 ${FILE_BED} <(tail -n +2 hg38.genome) ${FILE_BIGBED}
#bedToBigBed -type=bed12 "${FILE_BED}" "$CHROM_FILE" "${FILE_BIGBED}"
bedToBigBed -type=bed12 "${FILE_BED}" <(cut -f1,3 "$CHROM_FILE") "${FILE_BIGBED}"
fi

echo "Load this track in the UCSC browser using the following:"
echo "track type=bigBed name=master_list_${NAME}_${TYPE} useScore=1 visibility=2 itemRgb='On' bigDataUrl=https://encode:collabor8@resources.altius.org/~meuleman/ML_tracks/${FILE_BIGBED}"

done;


done
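A sketch of how the revised script might be invoked, assuming a BED-style chromosome sizes file whose columns 1 and 3 give the chromosome name and length (the script cuts those two fields for bedToBigBed); the filename is illustrative:

# Produces the chunkID/indexID tables plus BED12 and bigBed tracks for each TYPE.
bash code_gen_masterlist.sh WM20180301 chrom_sizes.bed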
1 change: 0 additions & 1 deletion code_matrix.R
@@ -7,7 +7,6 @@
# #
##########################################################################################################

source("../../code/general.R")
library(caTools)
library(Matrix)
source("code_ML.R")
1 change: 0 additions & 1 deletion code_overlap.R
@@ -7,7 +7,6 @@
# #
##########################################################################################################

source("../../code/general.R")
library(caTools)
source("code_ML.R")

116 changes: 116 additions & 0 deletions run_from_scratch.sh
@@ -0,0 +1,116 @@
#! /bin/bash

set -e -u -o pipefail

if [[ $# != 4 && $# != 5 ]]; then
echo -e "Usage: $0 fileOfBAMfiles outdir chrom_sizes.bed mappable.starch [min_varWidth_peak_width]"
exit 2
fi

FILE_OF_BAM_FILES=$1
OUTDIR=$2
CHROM_FILTER=$3
MAPPABLE_REGIONS=$4

if [ $# = 5 ]; then
MIN_VARWIDTH_PEAK_WIDTH=$5
else
MIN_VARWIDTH_PEAK_WIDTH="20"
fi

# Sanity check on input filenames.
linenum=1
while read -r line
do
fname=$line
if [ ! -s "$fname" ]; then
echo -e "Error: on line $linenum of $FILE_OF_BAM_FILES, file not found, or the file is empty ($fname)."
exit 2
fi
((linenum++))
done < "$FILE_OF_BAM_FILES"

mkdir -p "$OUTDIR"

# Input files and parameter values
#CENTER_SITES=/home/erynes/topics/ENCODEpaper2017/CenterSitesFiles/centerSites_halfWin100bp_K36mapOnlyMinusBlacklist_chrs1-22XY.hg38.starch
#MAPPABLE_REGIONS=/home/erynes/topics/ENCODEpaper2017/GRCh38_36merMappableOnly_minusBlacklist_chrs1-22XY.starch
#CHROM_FILTER=/home/erynes/topics/ENCODEpaper2017/chromLengths_GRCh38_only1-22XY.bed3
CALL_THRESHOLD=1.0 # write all sites to disk, including sites with FDR = 1
HOTSPOT_FDR_THRESHOLD=0.0010 # 0.1% (not 1%)
CENTER_SITES="$OUTDIR/centersites.starch"

function exe_check(){
for exe in "$@"; do
if [ ! -x "$exe" ]; then
echo -e "Error: $exe was not found, or execution privileges for it are not set."
exit 2
fi
done
}

# Executables and scripts
HOTSPOT2=$(which hotspot2.sh)
EXTRACT=$(which extractCenterSites.sh)

exe_check "$HOTSPOT2" "$EXTRACT"

# Create center sites
echo "Creating center sites"
"$EXTRACT" -c "$CHROM_FILTER" -o "$CENTER_SITES"

ls -l "$CENTER_SITES"


# Run hotspot2.sh
while read -r BAM
do
bname=$(basename "$BAM" .bam)
varWidthSpec="varWidth_${MIN_VARWIDTH_PEAK_WIDTH}_${bname}"
{
unset TMPDIR
"$HOTSPOT2" -c "$CHROM_FILTER" -C "$CENTER_SITES" -M "$MAPPABLE_REGIONS" -p "$varWidthSpec" -f "$HOTSPOT_FDR_THRESHOLD" -F "$CALL_THRESHOLD" "$BAM" "$OUTDIR"
}
done < "$FILE_OF_BAM_FILES"


here=$(dirname "$(readlink -f "$0")")
OUTFILE=allPeaks.starch
{
echo -e "Collating peaks..."
bedops -u "${OUTDIR}"/*.peaks.starch \
| starch - \
> "${OUTDIR}/$OUTFILE"
if ! mkdir -p "${OUTDIR}/subdir" ; then
echo -e "Failed to create directory ${OUTDIR}/subdir."
exit 2
fi
echo -e '"Chunking" collated peaks for eventual use in master list creation...'
cd "${OUTDIR}/subdir" || exit 2
unstarch "../${OUTFILE}" \
| awk -v minChunkSep=10000 -v minChunkSep2=4000 -v minPerChunk=500 -v maxPerChunk=100000 \
'BEGIN{n=1; nWritten=0; outf="chunk0001.bed"}{
if(NR > 1){
if($1 != prevChr || $2-prev3 > minChunkSep){
if(nWritten >= minPerChunk){
outf = sprintf("chunk%04d.bed", ++n);
nWritten = 0;
}
}
else{
if($2-prev3 > minChunkSep2 && nWritten > maxPerChunk){
outf = sprintf("chunk%04d.bed",++n);
nWritten=0;
}
}
}
print $0 > outf;
nWritten++; prevChr = $1; prev3 = $3;
}'
echo -e "Done!"
}

cd "$here"

# Run the masterlist stuff
"$here/run_sequential.sh" "$OUTDIR/subdir" "$CHROM_FILTER"