diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..5a87dc3
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,10 @@
+FROM hotspot2
+RUN apk add --no-cache \
+    g++ \
+    R \
+    R-dev
+RUN Rscript -e 'install.packages("caTools", repos="https://cran.rstudio.com")'
+
+WORKDIR masterlist
+
+COPY . .
diff --git a/code_build.R b/code_build.R
index 04fae35..8dca21e 100644
--- a/code_build.R
+++ b/code_build.R
@@ -1,3 +1,4 @@
+#!/usr/bin/env Rscript
 ##########################################################################################################
 #                                                                                                        #
 #          Method for generating a master list / Index of DNaseI hypersensitivity sites.                 #
@@ -7,7 +8,6 @@
 #                                                                                                        #
 ##########################################################################################################
 
-source("../../code/general.R")
 library(caTools)
 
 source("code_ML.R")
@@ -18,6 +18,7 @@ if (length(args)==0) {
   stop("No arguments supplied.")
 } else {
   eval(parse(text=args[[1]]))  # parse first argument: chunknum
+  filepath <- args[[2]]        # parse second argument: filepath
 }
 
 chunk <- paste("chunk", sprintf("%04d", chunknum), ".bed", sep="");
@@ -31,20 +32,13 @@ options(warn=2)
 
 ## Example line:
 # chr11	52983767	52983861	id-708944	0.0622013	52983811	LN4866	52983810
 
-#FILEPATH=paste("/net/seq/data/projects/ENCODE3_publications/erynes/ENCODEpaper2017/SubsetOf644/masterLists/",
-#               "FDR0.0010/outdir_MLfromPeakClumps_1g_MA21_DHSwidthThreshold20bp/chunks_10kb_corrected/", sep="/");
-#chunks <- dir(FILEPATH, pattern="^chunk.*.txt");
-
-#FILEPATH="/net/seq/data/projects/ENCODE3_publications/erynes/ENCODEpaper2017/SubsetOf665/varWidthPeaks/chunks_5kb_min500elementsPerChunk/";
-FILEPATH="/net/seq/data/projects/ENCODE3_publications/erynes/ENCODEpaper2017/SubsetOf665/varWidthPeaks/chunks_10kb_min500elementsPerChunk/";
-#chunks <- dir(FILEPATH, pattern="^chunk.*.bed");
 dir.create("DHSs_all", showWarnings=FALSE, recursive=TRUE)
 dir.create("peaks_all", showWarnings=FALSE, recursive=TRUE)
 
 #for (chunk in chunks) {
   # Load in data per chunk, each separated by at least 10kb
-  peaks <- read.delim(paste(FILEPATH, chunk, sep="/"), header=FALSE, as.is=T)
+  peaks <- read.delim(filepath, header=FALSE, as.is=T)
   #colnames(peaks) <- c("seqname", "start", "end", "ID", "score", "density_summit", "sampleID", "wavelet_summit")
   colnames(peaks) <- c("seqname", "start", "end", "sampleID", "score", "density_summit", "wavelet_summit")
   peaks <- peaks[order(peaks$wavelet_summit),]  # Order peaks by wavelet summit first
diff --git a/code_construct_matrix.sh b/code_construct_matrix.sh
index 2258f5e..bad13ea 100755
--- a/code_construct_matrix.sh
+++ b/code_construct_matrix.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
 ##########################################################################################################
 #                                                                                                        #
 #          Method for generating a master list / Index of DNaseI hypersensitivity sites.                 #
@@ -7,38 +8,39 @@
 #                                                                                                        #
 ##########################################################################################################
 
+set -e -o pipefail
 
-if [[ $# -eq 0 ]] ; then
-  echo 'Provide the name/version of the file, e.g. WM20180301'
+if [[ $# -lt 2 ]] ; then
+  echo 'Provide the name/version of the file, e.g. WM20180301, and the number of chunks'
   exit 1
 fi
 NAME=$1;
+numchunks=$2
 
-rm -f matrix_bin_all_${NAME}.txt.gz;
-for i in $(seq -f "%04g" 1 5000); do
-  echo ${i}
+rm -f "matrix_bin_all_${NAME}.txt.gz"
+for i in $(seq -f "%04g" 1 "$numchunks"); do
+  echo "$i"
   if [[ -f "DHSs_all/chunk${i}.bed" ]] ; then
     #paste <(cut -f 1-3 DHSs_all/chunk${i}.bed) matrix_bin_all/chunk${i}.bed | gzip -c - >> matrix_bin_all_${NAME}.txt.gz
-    paste <(cut -f 4 DHSs_all/chunk${i}.bed) matrix_bin_all/chunk${i}.bed | gzip -c - >> matrix_bin_all_${NAME}.txt.gz
+    paste <(cut -f 4 "DHSs_all/chunk${i}.bed") "matrix_bin_all/chunk${i}.bed" | gzip -c - >> "matrix_bin_all_${NAME}.txt.gz"
   fi
 done
 
-rm -f matrix_bin_nonovl_core_${NAME}.txt.gz;
-for i in $(seq -f "%04g" 1 5000); do
-  echo ${i}
+rm -f "matrix_bin_nonovl_core_${NAME}.txt.gz"
+for i in $(seq -f "%04g" 1 "$numchunks"); do
+  echo "$i"
   if [[ -f "DHSs_nonovl_core/chunk${i}.bed" ]] ; then
     #paste <(cut -f 1-3 DHSs_nonovl_core/chunk${i}.bed) matrix_bin_nonovl_core/chunk${i}.bed | gzip -c - >> matrix_bin_nonovl_core_${NAME}.txt.gz
-    paste <(cut -f 4 DHSs_nonovl_core/chunk${i}.bed) matrix_bin_nonovl_core/chunk${i}.bed | gzip -c - >> matrix_bin_nonovl_core_${NAME}.txt.gz
+    paste <(cut -f 4 "DHSs_nonovl_core/chunk${i}.bed") "matrix_bin_nonovl_core/chunk${i}.bed" | gzip -c - >> "matrix_bin_nonovl_core_${NAME}.txt.gz"
   fi
 done
 
-rm -f matrix_bin_nonovl_any_${NAME}.txt.gz;
-for i in $(seq -f "%04g" 1 5000); do
-  echo ${i}
+rm -f "matrix_bin_nonovl_any_${NAME}.txt.gz"
+for i in $(seq -f "%04g" 1 "$numchunks"); do
+  echo "$i"
   if [[ -f "DHSs_nonovl_any/chunk${i}.bed" ]] ; then
     #paste <(cut -f 1-3 DHSs_nonovl_any/chunk${i}.bed) matrix_bin_nonovl_any/chunk${i}.bed | gzip -c - >> matrix_bin_nonovl_any_${NAME}.txt.gz
-    paste <(cut -f 4 DHSs_nonovl_any/chunk${i}.bed) matrix_bin_nonovl_any/chunk${i}.bed | gzip -c - >> matrix_bin_nonovl_any_${NAME}.txt.gz
+    paste <(cut -f 4 "DHSs_nonovl_any/chunk${i}.bed") "matrix_bin_nonovl_any/chunk${i}.bed" | gzip -c - >> "matrix_bin_nonovl_any_${NAME}.txt.gz"
   fi
 done
-
diff --git a/code_gen_masterlist.sh b/code_gen_masterlist.sh
index 2a4c557..0bbdb62 100755
--- a/code_gen_masterlist.sh
+++ b/code_gen_masterlist.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
 ##########################################################################################################
 #                                                                                                        #
 #          Method for generating a master list / Index of DNaseI hypersensitivity sites.                 #
@@ -7,16 +8,19 @@
 #                                                                                                        #
 ##########################################################################################################
 
+set -e -o pipefail
+
 if [[ $# -eq 0 ]] ; then
   echo 'Provide the name/version of the file, e.g. WM20180301'
   exit 1
 fi
 NAME=$1;
+CHROM_FILE=$2
 
 TYPES="all nonovl_any nonovl_core";
 for TYPE in ${TYPES}; do
-  echo $TYPE;
+  echo "$TYPE"
 
   FILE_CHUNKIDS="masterlist_DHSs_${NAME}_${TYPE}_chunkIDs.txt";
   FILE_INDEXIDS="masterlist_DHSs_${NAME}_${TYPE}_indexIDs.txt";
@@ -26,7 +30,7 @@ for TYPE in ${TYPES}; do
   ### Create final master list
   if ! [[ -f "$FILE_CHUNKIDS" ]] ; then
     echo "Concatenating DHS chunks"
-    cat DHSs_${TYPE}/* | sort-bed - > ${FILE_CHUNKIDS}
+    cat "DHSs_${TYPE}"/* | sort-bed - > "${FILE_CHUNKIDS}"
   fi
 
   #### Generate label mapping
@@ -50,9 +54,9 @@ for TYPE in ${TYPES}; do
     echo "Constructing BED12 file"
     #echo "browser position chr6:26020208-26022677" > ${FILE_BED}
     #echo "track name='Master list DHSs ${NAME} ${TYPE}' description='Master list DHSs ${NAME} ${TYPE}' visibility=2 useScore=1" >> ${FILE_BED}
-    cat ${FILE_INDEXIDS} | awk '
+    awk '
     function round(x) { if(x=="NA") { return 0 } else { return int(x + 0.5) } }
-    BEGIN { OFS="\t"; } 
+    BEGIN { OFS="\t"; }
     {
       if ($10 == "NA" || $11 == "NA") {
         thickStart=$2
@@ -65,51 +69,50 @@ for TYPE in ${TYPES}; do
         $9=($9 >= $3 ? $3-1 : $9)
         $10=($10 <= $2 ? $2+1 : $10)
         $11=($11 >= $3 ? $3-1 : $11)
-        
+
         thickStart= $9-1
         thickEnd  = $9+1
-        
+
         blockCount=1
         blockSizes=1
         blockStarts=0
-        
+
         blockSize2=$9-$10
         blockStart2=$10-$2
-        if (blockSize2 != 0 && blockStart2 < $3-$2-1) { 
-          blockCount+=1; 
-          blockSizes=blockSizes","blockSize2; 
-          blockStarts=blockStarts","blockStart2 
+        if (blockSize2 != 0 && blockStart2 < $3-$2-1) {
+          blockCount+=1;
+          blockSizes=blockSizes","blockSize2;
+          blockStarts=blockStarts","blockStart2
         }
-        
+
         blockSize3=$11-$9
         blockStart3=$9-$2
-        if (blockSize3 != 0 && blockStart3 < $3-$2-1) { 
-          blockCount+=1; 
-          blockSizes=blockSizes","blockSize3; 
-          blockStarts=blockStarts","blockStart3 
+        if (blockSize3 != 0 && blockStart3 < $3-$2-1) {
+          blockCount+=1;
+          blockSizes=blockSizes","blockSize3;
+          blockStarts=blockStarts","blockStart3
         }
-        
+
         blockSize4=1
         blockStart4=$3-$2-1
         blockCount+=1;
-        blockSizes=blockSizes","blockSize4; 
+        blockSizes=blockSizes","blockSize4;
         blockStarts=blockStarts","blockStart4
       }
-      
+
       score=round(log($5+1)/log(10)*500)
       score=(score > 1000 ? 1000 : score)
       print $1, $2, $3, $4, score, ".", thickStart, thickEnd, "0,0,0", blockCount, blockSizes, blockStarts
-    }' > ${FILE_BED}
+    }' "${FILE_INDEXIDS}" > "${FILE_BED}"
   fi
-  
+
   if ! [[ -f "$FILE_BIGBED" ]] ; then
     echo "Converting BED file to BIGBED file"
-    bedToBigBed -type=bed12 ${FILE_BED} <(tail -n +2 hg38.genome) ${FILE_BIGBED}
+    #bedToBigBed -type=bed12 "${FILE_BED}" "$CHROM_FILE" "${FILE_BIGBED}"
+    bedToBigBed -type=bed12 "${FILE_BED}" <(cut -f1,3 "$CHROM_FILE") "${FILE_BIGBED}"
   fi
 
   echo "Load this track in the UCSC browser using the following:"
   echo "track type=bigBed name=master_list_${NAME}_${TYPE} useScore=1 visibility=2 itemRgb='On' bigDataUrl=https://encode:collabor8@resources.altius.org/~meuleman/ML_tracks/${FILE_BIGBED}"
-done;
-
-
+done
diff --git a/code_matrix.R b/code_matrix.R
index 9137340..cd8f6d9 100644
--- a/code_matrix.R
+++ b/code_matrix.R
@@ -7,7 +7,6 @@
 #                                                                                                        #
 ##########################################################################################################
 
-source("../../code/general.R")
 library(caTools)
 library(Matrix)
 source("code_ML.R")
diff --git a/code_overlap.R b/code_overlap.R
index ddf4c88..df0b6af 100644
--- a/code_overlap.R
+++ b/code_overlap.R
@@ -7,7 +7,6 @@
 #                                                                                                        #
 ##########################################################################################################
 
-source("../../code/general.R")
 library(caTools)
 source("code_ML.R")
 
diff --git a/run_from_scratch.sh b/run_from_scratch.sh
new file mode 100755
index 0000000..b59725f
--- /dev/null
+++ b/run_from_scratch.sh
@@ -0,0 +1,116 @@
+#! /bin/bash
+
+set -e -u -o pipefail
+
+if [[ $# != 4 && $# != 5 ]]; then
+  echo -e "Usage:  $0 fileOfBAMfiles outdir chrom_sizes.bed mappable.starch [min_varWidth_peak_width]"
+  exit 2
+fi
+
+FILE_OF_BAM_FILES=$1
+OUTDIR=$2
+CHROM_FILTER=$3
+MAPPABLE_REGIONS=$4
+
+if [ $# = 5 ]; then
+  MIN_VARWIDTH_PEAK_WIDTH=$5
+else
+  MIN_VARWIDTH_PEAK_WIDTH="20"
+fi
+
+# Sanity check on input filenames.
+linenum=1
+while read -r line
+do
+  fname=$line
+  if [ ! -s "$fname" ]; then
+    echo -e "Error:  on line $linenum of $FILE_OF_BAM_FILES, file not found, or the file is empty ($fname)."
+    exit 2
+  fi
+  ((linenum++))
+done < "$FILE_OF_BAM_FILES"
+
+mkdir -p "$OUTDIR"
+
+# Input files and parameter values
+#CENTER_SITES=/home/erynes/topics/ENCODEpaper2017/CenterSitesFiles/centerSites_halfWin100bp_K36mapOnlyMinusBlacklist_chrs1-22XY.hg38.starch
+#MAPPABLE_REGIONS=/home/erynes/topics/ENCODEpaper2017/GRCh38_36merMappableOnly_minusBlacklist_chrs1-22XY.starch
+#CHROM_FILTER=/home/erynes/topics/ENCODEpaper2017/chromLengths_GRCh38_only1-22XY.bed3
+CALL_THRESHOLD=1.0           # write all sites to disk, including sites with FDR = 1
+HOTSPOT_FDR_THRESHOLD=0.0010 # 0.1% (not 1%)
+CENTER_SITES="$OUTDIR/centersites.starch"
+
+function exe_check(){
+  for exe in "$@"; do
+    if [ ! -x "$exe" ]; then
+      echo -e "Error:  $exe was not found, or execution privileges for it are not set."
+      exit 2
+    fi
+  done
+}
+
+# Executables and scripts
+HOTSPOT2=$(which hotspot2.sh)
+EXTRACT=$(which extractCenterSites.sh)
+
+exe_check "$HOTSPOT2" "$EXTRACT"
+
+# Create center sites
+echo "Creating center sites"
+"$EXTRACT" -c "$CHROM_FILTER" -o "$CENTER_SITES"
+
+ls -l "$CENTER_SITES"
+
+
+# Run hotspot2.sh
+while read -r BAM
+do
+  bname=$(basename "$BAM" .bam)
+  varWidthSpec="varWidth_${MIN_VARWIDTH_PEAK_WIDTH}_${bname}"
+  {
+    unset TMPDIR
+    "$HOTSPOT2" -c "$CHROM_FILTER" -C "$CENTER_SITES" -M "$MAPPABLE_REGIONS" -p "$varWidthSpec" -f "$HOTSPOT_FDR_THRESHOLD" -F "$CALL_THRESHOLD" "$BAM" "$OUTDIR"
+  }
+done < "$FILE_OF_BAM_FILES"
+
+
+here=$(dirname "$(readlink -f "$0")")
+OUTFILE=allPeaks.starch
+{
+  echo -e "Collating peaks..."
+  bedops -u "${OUTDIR}"/*.peaks.starch \
+    | starch - \
+    > "${OUTDIR}/$OUTFILE"
+  if ! mkdir -p "${OUTDIR}/subdir" ; then
+    echo -e "Failed to create directory ${OUTDIR}/subdir."
+    exit 2
+  fi
+  echo -e '"Chunking" collated peaks for eventual use in master list creation...'
+  cd "${OUTDIR}/subdir" || exit 2
+  unstarch "../${OUTFILE}" \
+    | awk -v minChunkSep=10000 -v minChunkSep2=4000 -v minPerChunk=500 -v maxPerChunk=100000 \
+      'BEGIN{n=1; nWritten=0; outf="chunk0001.bed"}{
+        if(NR > 1){
+          if($1 != prevChr || $2-prev3 > minChunkSep){
+            if(nWritten >= minPerChunk){
+              outf = sprintf("chunk%04d.bed", ++n);
+              nWritten = 0;
+            }
+          }
+          else{
+            if($2-prev3 > minChunkSep2 && nWritten > maxPerChunk){
+              outf = sprintf("chunk%04d.bed",++n);
+              nWritten=0;
+            }
+          }
+        }
+        print $0 > outf;
+        nWritten++; prevChr = $1; prev3 = $3;
+      }'
+  echo -e "Done!"
+}
+
+cd "$here"
+
+# Run the masterlist stuff
+"$here/run_sequential.sh" "$OUTDIR/subdir" "$CHROM_FILTER"
diff --git a/run_sequential.sh b/run_sequential.sh
new file mode 100755
index 0000000..52e89c2
--- /dev/null
+++ b/run_sequential.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -e -o pipefail
+
+dir=$1
+chrom_file=$2
+outname=${3:-masterlist}
+
+outdir=Rout
+mkdir -p "$outdir"
+numchunks=$(find "$dir" -maxdepth 1 -name 'chunk*.bed' | wc -l)
+
+# Build the per-chunk DHS calls (code_build.R reads one chunk file per invocation)
+i=1
+for chunkfile in "$dir"/chunk*.bed ; do
+  R CMD BATCH \
+    --no-save --no-restore \
+    "--args chunknum=$i $chunkfile" \
+    ./code_build.R \
+    "$outdir/output_build_chunk_$i.Rout"
+  ((i++))
+done
+
+# Resolve overlaps between the per-chunk DHS calls (code_overlap.R)
+for i in $(seq 1 "$numchunks") ; do
+  R CMD BATCH --no-save --no-restore "--args chunknum=${i}" \
+    ./code_overlap.R "$outdir/output_overlap_chunk_${i}.Rout"
+done
+
+# Generate masterlist
+./code_gen_masterlist.sh "$(basename "$outname")" "$chrom_file"
+
+# Copy output files out of the container
+cp -r \
+  DHS* \
+  Rout \
+  masterlist_DHSs* \
+  peaks_* \
+  "$dir"
+# Make a matrix
+# TODO: This comes later
+# ./code_construct_matrix.sh "$(basename "$outname")" "$numchunks"