From 0a5cf20ea3a46149e3465c98a9eb1d2443d16922 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 13 Aug 2020 13:43:05 -0400 Subject: [PATCH 01/66] fix name --- pepatac_output_schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pepatac_output_schema.yaml b/pepatac_output_schema.yaml index 11f400b8..3a74c02d 100644 --- a/pepatac_output_schema.yaml +++ b/pepatac_output_schema.yaml @@ -1,4 +1,4 @@ -description: objects produced by PEPPRO pipeline. +description: objects produced by PEPATAC pipeline. properties: samples: type: array From 5bd07e1086dc541693a0efbde3c93dfba634f288 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 10 Jun 2021 08:59:41 -0400 Subject: [PATCH 02/66] Make checkinstall more concise --- checkinstall | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/checkinstall b/checkinstall index d703cf3a..ec573fa9 100755 --- a/checkinstall +++ b/checkinstall @@ -3,12 +3,13 @@ # PEPATAC pipeline installation check # -if [[ $# -gt 0 ]] ; then +if [ $# -gt 0 ] ; then echo "Usage: checkinstall" exit 1 fi set -o pipefail +# set -e echo -e "-----------------------------------------------------------" echo -e " " @@ -49,28 +50,28 @@ GREEN='\033[0;32m' YELLOW='\033[0;33m' NC='\033[0m' # No Color -function fail { +fail() { printf "${RED}\u2716 $@${NC}\n" } -function success { +success() { printf "${GREEN}\xE2\x9C\x94 $@${NC}\n" } -function warn { - printf "${YELLOW}$@${NC}\n" +warn() { + printf "${YELLOW}\u26A0 $@${NC}\n" } ################################################################################ echo -e "Checking base requirements... " -echo -e " " + BASE_REQS=0 declare -a requiredPkgs=("refgenie" "looper") for package in ${requiredPkgs[@]}; do if ! pip_show $package; then - echo $(fail "ERROR: PEPATAC requires the Python package, $package. Try pip install $package and checkinstall again.") + echo $(fail "ERROR: PEPATAC requires the Python package, $package. 
Try pip install $package.") printf "\n" exit 1 fi @@ -84,7 +85,6 @@ fi ################################################################################ echo -e "-----------------------------------------------------------" echo -e "Checking native installation... " -echo -e " " NATIVE_INSTALL=0 # Check Python @@ -125,9 +125,9 @@ while IFS= read -r line; do rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') - if ! pip_show "${package}"; then - echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") - printf "\n" + if ! pip_show "${package}" 2&>/dev/null ; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package.") + # printf "\n" NATIVE_INSTALL=1 BULKER_INSTALL=1 else @@ -172,7 +172,7 @@ declare -a requiredCommands=("perl" "awk" "grep" "sed" "bedtools" "bowtie2" "fse for cmd in ${requiredCommands[@]}; do if ! is_executable $cmd; then echo $(warn "WARNING: Install $cmd and checkinstall again.") - printf "\n" + # printf "\n" NATIVE_INSTALL=1 else echo -e $(success "SUCCESS: ${cmd}") @@ -210,7 +210,6 @@ done ################################################################################ echo -e "-----------------------------------------------------------" echo -e "Checking conda installation... " -echo -e " " CONDA_INSTALL=0 if ! is_executable "conda"; then @@ -347,7 +346,6 @@ fi ################################################################################ echo -e "-----------------------------------------------------------" echo -e "Checking bulker installation... " -echo -e " " BULKER_INSTALL=0 if ! 
is_executable "bulker"; then @@ -385,24 +383,23 @@ fi ################################################################################ echo -e "-----------------------------------------------------------" echo -e " PEPATAC checkinstall results " -echo -e " " if [ "$NATIVE_INSTALL" -eq 0 ]; then - echo -e $(success "SUCCESS: PEPATAC can be run utilizing native installations!") + echo -e $(success "SUCCESS: PEPATAC can be run using native installations!") else - echo -e $(fail "WARNING: PEPATAC cannot be run utilizing native installations!") + echo -e $(fail "ERROR: PEPATAC cannot be run using native installations.") fi if [ "$CONDA_INSTALL" -eq 0 ]; then - echo -e $(success "SUCCESS: PEPATAC can be run utilizing conda installation!") + echo -e $(success "SUCCESS: PEPATAC can be run using conda installation!") else - echo -e $(fail "WARNING: PEPATAC cannot be run via conda!") + echo -e $(fail "ERROR: PEPATAC cannot be run via conda.") fi if [ "$BULKER_INSTALL" -eq 0 ]; then - echo -e $(success "SUCCESS: PEPATAC can be run utilizing bulker!") + echo -e $(success "SUCCESS: PEPATAC can be run using bulker!") else - echo -e $(fail "WARNING: PEPATAC cannot be run utilizing bulker!") + echo -e $(fail "ERROR: PEPATAC cannot be run using bulker.") fi if [ "$NATIVE_INSTALL" -eq 1 ] && [ "$CONDA_INSTALL" -eq 1 ] && [ "$BULKER_INSTALL" -eq 1 ]; then From 2b7335eae9833590406341a5d160cd6ed46a87fe Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Thu, 10 Jun 2021 15:57:51 -0400 Subject: [PATCH 03/66] add alignment stats to glossary --- docs/glossary.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/glossary.md b/docs/glossary.md index 66857527..7418a1b0 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -4,6 +4,22 @@ Information regarding universal ATAC-seq [terms and definitions](https://www.enc Additional information relevant to `PEPATAC` output and interpretation includes: +## Alignment statistics + +- **Raw reads**: The original number of reads in 
the input files. +- **Trimmed reads**: The number of reads remaining after trimming. +- **Trim loss rate**: The percentage of total reads that were trimmed. i.e. $\frac{(Raw\_reads - Trimmed\_reads)\times100}{Raw\_reads}$ +- **Mapped reads**: The number of reads mapped to the primary genome. +- **QC filtered reads**: The number of reads removed due to poor MAPQ values (i.e. <10). +- **Aligned reads**: (Mapped_reads - QC_filtered_reads) +- **Alignment rate**: The percentage of trimmed reads that mapped to the primary genome. i.e. $\frac{Aligned\_reads}{Trimmed\_reads}$ $\times$ 100. In this case, trimmed reads represent the maximum number of reads that even have the potential to be mapped. +- **Total efficiency**: The percentage of raw reads that mapped to the primary genome. i.e. $\frac{Aligned\_reads}{Raw\_reads}$ $\times$ 100. Here we include even the reads that required trimming, which gives an idea of how well your sample preparation went if, for example, a large number of reads required trimming. If the efficiency is very poor but you had a high alignment rate, it would suggest an issue with sample prep because so many reads were trimmed. +- **Unmapped reads**: The number of trimmed reads that remain unmapped following prealignment and primary alignment. i.e. Trimmed_reads - Aligned_reads(prealignments) - Mapped_reads +- **Duplicate reads**: The number of duplicate reads removed from the mapped reads. +- **Dedup aligned reads**: The number of aligned reads following duplicate removal. i.e. Aligned_reads - Duplicate_reads +- **Dedup alignment rate**: The number of deduplicated, aligned reads out of the number of trimmed reads. i.e. $\frac{Dedup\_aligned\_reads}{Trimmed\_reads}$ +- **Dedup total efficiency**: The number of deduplicated, aligned reads out of the number of raw reads. i.e. $\frac{Dedup\_aligned\_reads}{Raw\_reads}$ + ## Peak calling output The pipeline's default peak caller is `MACS2`.
More [detailed descriptions regarding `MACS2` output](https://github.com/taoliu/MACS) may be discovered in the `MACS2` repository. From 7111aeda439a9e22ad133889782febf02dfd8e46 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Thu, 17 Jun 2021 10:24:23 -0400 Subject: [PATCH 04/66] update docs --- docs/compute-resources.md | 36 ++++++++++++++++++++++++++++++++++++ mkdocs.yml | 3 ++- 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 docs/compute-resources.md diff --git a/docs/compute-resources.md b/docs/compute-resources.md new file mode 100644 index 00000000..b43ae714 --- /dev/null +++ b/docs/compute-resources.md @@ -0,0 +1,36 @@ +# How to configure computing resources + +`PEPATAC` comes packaged with default compute settings (memory, cores, and time) for both the sample and project-level pipeline. These values will automatically be populated by `looper` based on the input file size. In that way, smaller samples (e.g. fewer reads) will request fewer resources and vice-versa for large samples. You can also specify these values on the command-line. + +## Configure computing resource requests at the command-line + +You can specify the memory (`-M`) and number of cores (`-P`) directly on the command-line. +```console +pipelines/pepatac.py -O /path/to/processed/data/ -S "compute_example" -I /path/to/fastq.fq -G "hg38" -P 16 -M 16000 +``` + +## Configure computing resource requests with `looper` + +Default computing resource requests are defined in the [resources-sample.tsv](https://github.com/databio/pepatac/blob/master/resources-sample.tsv) and [resources-project.tsv](https://github.com/databio/pepatac/blob/master/resources-project.tsv) for sample and project-level pipeline calls, respectively. `Looper` checks these files based on the `size_dependent_variables` section in the `sample` and `project` `pipeline_interface.yaml` files.
For default pipeline settings, these resources should be more than sufficient, but for different pipeline settings you may desire to request different resources. This could be accomplished two ways: +1. You can [override universal compute settings when you call `looper` by specifying the resources using the `--compute` variable](https://looper.readthedocs.io/en/latest/variable-namespaces/#5-compute): +```console +looper run --compute mem=24000 time=00-12:00:00 --cpus-per-task=36 --ntasks=1 +``` +2. You could modify the `resources-sample.tsv` or `resources-project.tsv` manually and `looper` will use these updated values. +`resources-sample.tsv` default: +| max_file_size | cores | mem | time | +|---------------|-------|-------|-------------| +| 0.05 | 4 | 10000 | 00-03:00:00 | +| 0.5 | 8 | 12000 | 00-08:00:00 | +| 1 | 16 | 16000 | 00-12:00:00 | +| 10 | 32 | 24000 | 01-00:00:00 | +| NaN | 32 | 32000 | 02-00:00:00 | + +`resources-project.tsv` default: +| max_file_size | cores | mem | time | +|---------------|-------|-------|-------------| +| 0.05 | 1 | 16000 | 00-01:00:00 | +| 0.5 | 1 | 32000 | 00-01:00:00 | +| 1 | 1 | 56000 | 00-01:00:00 | +| 10 | 1 | 64000 | 00-01:00:00 | +| NaN | 1 | 64000 | 00-02:00:00 | \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 9602a248..be1988a8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -26,7 +26,8 @@ nav: - Configure assets: 'assets.md' - Configure prealignments: 'prealignments.md' - Configure project files: 'peps.md' - - Configure seqOutBias assets: 'sob.md' + - Configure seqOutBias assets: 'sob.md' + - Configure computing resources: 'compute-resources.md' - Download SRA data: 'download.md' - Use custom reference data: 'annotation.md' - Use reference peaks: 'reference_peaks.md' From 84badc0471575bda3480a21ce9c28316d1b4bb26 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 21 Jun 2021 13:33:40 -0400 Subject: [PATCH 05/66] use refgenie looper plugin --- pipelines/pepatac.py | 434 +++++++++++++++------------------ 
sample_pipeline_interface.yaml | 26 +- 2 files changed, 215 insertions(+), 245 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 7f459c75..1ba3e18d 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -5,7 +5,7 @@ __author__ = ["Jin Xu", "Nathan Sheffield", "Jason Smith"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.9.16" +__version__ = "0.10.0" from argparse import ArgumentParser @@ -40,47 +40,31 @@ def parse_arguments(): parser = ArgumentParser(description='PEPATAC version ' + __version__) parser = pypiper.add_pypiper_args(parser, groups= ['pypiper', 'looper', 'ngs'], - required=["input", "genome", "sample-name", "output-parent"]) + required=["input", "genome", "sample-name", "output-parent", + "chrom-sizes", "primary-index"]) # Pipeline-specific arguments + parser.add_argument("--trimmer", dest="trimmer", type=str.lower, + default="skewer", choices=TRIMMERS, + help="Name of read trimming program.") + parser.add_argument("--aligner", dest="aligner", type=str.lower, default="bowtie2", choices=ALIGNERS, - help="Name of read aligner") - + help="Name of read aligner.") + + parser.add_argument("--deduplicator", dest="deduplicator", type=str.lower, + default="samblaster", choices=DEDUPLICATORS, + help="Name of deduplicator program.") + parser.add_argument("--peak-caller", dest="peak_caller", type=str.lower, default="macs2", choices=PEAK_CALLERS, - help="Name of peak caller") + help="Name of peak caller.") parser.add_argument("-gs", "--genome-size", default="2.7e9", type=str.lower, help="Effective genome size. It can be 1.0e+9 " "or 1000000000: e.g. human (2.7e9), mouse (1.87e9), " "C. elegans (9e7), fruitfly (1.2e8). 
Default:2.7e9") - parser.add_argument("--trimmer", dest="trimmer", type=str.lower, - default="skewer", choices=TRIMMERS, - help="Name of read trimming program") - - parser.add_argument("--prealignments", default=[], type=str, - nargs="+", - help="Space-delimited list of reference genomes to " - "align to before primary alignment.") - - parser.add_argument("--deduplicator", dest="deduplicator", type=str.lower, - default="samblaster", choices=DEDUPLICATORS, - help="Name of deduplicator program") - - parser.add_argument("--TSS-name", default=None, - dest="TSS_name", type=str, - help="Path to TSS annotation file.") - - parser.add_argument("--blacklist", default=None, - dest="blacklist", type=str, - help="Path to genomic region blacklist file") - - parser.add_argument("--anno-name", default=None, - dest="anno_name", type=str, - help="Path to reference annotation file (BED format) for calculating FRiF") - parser.add_argument("--peak-type", default="fixed", dest="peak_type", choices=PEAK_TYPES, type=str.lower, help="Call variable or fixed width peaks.\n" @@ -93,11 +77,12 @@ def parse_arguments(): parser.add_argument("--frip-ref-peaks", default=None, dest="frip_ref_peaks", type=str, - help="Path to reference peak set (BED format) for calculating FRiP") + help="Path to reference peak set (BED format) " + "for calculating FRiP.") parser.add_argument("--motif", action='store_true', dest="motif", - help="Perform motif enrichment analysis") + help="Perform motif enrichment analysis.") parser.add_argument("--sob", action='store_true', dest="sob", default=False, @@ -120,11 +105,11 @@ def parse_arguments(): parser.add_argument("--keep", action='store_true', dest="keep", - help="Enable this flag to keep prealignment BAM files") + help="Enable this flag to keep prealignment BAM files.") parser.add_argument("--noFIFO", action='store_true', dest="no_fifo", - help="Do NOT use named pipes during prealignments") + help="Do NOT use named pipes during prealignments.") 
parser.add_argument("--lite", dest="lite", action='store_true', help="Only keep minimal, essential output to conserve " @@ -134,15 +119,56 @@ def parse_arguments(): help="Skip FastQC. Useful for bugs in FastQC " "that appear with some sequence read files.") + # Genome assets + prealignment_index = parser.add_mutually_exclusive_group(required=False) + prealignment_index.add_argument("--prealignment-bowtie2-index", + default=[], type=str, nargs="+", dest="prealignment_bowtie2_index", + help="Space-delimited list of paths to bowtie2 " + "prefixes to align to before primary alignment " + "(minus trailing .X.bt2).") + prealignment_index.add_argument("--prealignment-bwa-index", default=[], + type=str, nargs="+", dest="prealignment_bwa_index", + help="Space-delimited list of paths to bwa " + "index directories to align to before primary alignment.") + + primary_index = parser.add_mutually_exclusive_group(required=True) + primary_index.add_argument("--bowtie2-index", default=None, + dest="bowtie2_index", type=str, + help="Path to primary genome bowtie2 prefix " + "(minus trailing .X.bt2).") + primary_index.add_argument("--bwa-index", default=None, + dest="bwa_index", type=str, + help="Path to primary genome bwa index " + "directory.") + + parser.add_argument("--chrom-sizes", default=None, required=True, + dest="chrom_sizes", type=str, + help="Path to primary genome chromosome sizes file.") + + parser.add_argument("--TSS-name", default=None, + dest="TSS_name", type=str, + help="Path to TSS annotation file.") + + parser.add_argument("--blacklist", default=None, + dest="blacklist", type=str, + help="Path to genomic region blacklist file.") + + parser.add_argument("--anno-name", default=None, + dest="anno_name", type=str, + help="Path to reference annotation file (BED format) " + "for calculating FRiF.") + + parser.add_argument("--search-file", default=None, + dest="search_file", type=str, + help="Required for seqOutBias (--sob). 
" + "Path to tallymer index search file built " + "with the same read length as the input.") + parser.add_argument("-V", "--version", action="version", version="%(prog)s {v}".format(v=__version__)) args = parser.parse_args() - # TODO: determine if it's safe to handle this requirement with argparse. - # It may be that communication between pypiper and a pipeline via - # the pipeline interface (and/or) looper, and how the partial argument - # parsing is handled, that makes this more favorable. if not args.input: parser.print_help() raise SystemExit @@ -234,7 +260,7 @@ def _align(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, sub_outdir = os.path.join(outfolder, aligndir) ngstk.make_dir(sub_outdir) - bamname = "{}_{}.bam".format(args.sample_name, assembly_identifier) + bamname = f"{args.sample_name}_{assembly_identifier}.bam" all_mapped_bam = os.path.join(sub_outdir, args.sample_name + "_" + assembly_identifier + "_all.bam") mapped_bam = os.path.join(sub_outdir, bamname) @@ -466,92 +492,6 @@ def check_commands(commands, ignore=''): return False -def _add_resources(args, res, asset_dict=None): - """ - Add additional resources needed for pipeline. - - :param argparse.Namespace args: binding between option name and argument, - e.g. from parsing command-line options - :param pm.config.resources res: pipeline manager resources list - :param asset_dict list: list of dictionary of assets to add - """ - - rgc = RGC(select_genome_config(res.get("genome_config"))) - - key_errors = [] - exist_errors = [] - required_list = [] - - # Check that bowtie2/bwa indicies exist for specified prealignments - for reference in args.prealignments: - for asset in [GENOME_IDX_KEY]: - try: - res[asset] = rgc.seek(reference, asset) - except KeyError: - err_msg = "{} for {} is missing from REFGENIE config file." - pm.fail_pipeline(KeyError(err_msg.format(asset, reference))) - except: - err_msg = "{} for {} does not exist." 
- pm.fail_pipeline(IOError(err_msg.format(asset, reference))) - - # Check specified assets - if not asset_dict: - return res, rgc - else: - for item in asset_dict: - pm.debug("item: {}".format(item)) # DEBUG - asset = item["asset_name"] - seek_key = item["seek_key"] or item["asset_name"] - tag = item["tag_name"] or "default" - arg = item["arg"] - user_arg = item["user_arg"] - req = item["required"] - - if arg and hasattr(args, arg) and getattr(args, arg): - res[seek_key] = os.path.abspath(getattr(args, arg)) - else: - try: - pm.debug("{} - {}.{}:{}".format(args.genome_assembly, - asset, - seek_key, - tag)) # DEBUG - res[seek_key] = rgc.seek(args.genome_assembly, - asset_name=str(asset), - tag_name=str(tag), - seek_key=str(seek_key)) - except KeyError: - key_errors.append(item) - if req: - required_list.append(item) - except: - exist_errors.append(item) - if req: - required_list.append(item) - - if len(key_errors) > 0 or len(exist_errors) > 0: - pm.info("Some assets are not found. You can update your REFGENIE " - "config file or point directly to the file using the noted " - "command-line arguments:") - - if len(key_errors) > 0: - if required_list: - err_msg = "Required assets missing from REFGENIE config file: {}" - pm.fail_pipeline(IOError(err_msg.format(", ".join(["{asset_name}.{seek_key}:{tag_name}".format(**x) for x in required_list])))) - else: - warning_msg = "Optional assets missing from REFGENIE config file: {}" - pm.info(warning_msg.format(", ".join(["{asset_name}.{seek_key}:{tag_name}".format(**x) for x in key_errors]))) - - if len(exist_errors) > 0: - if required_list: - err_msg = "Required assets not existing: {}" - pm.fail_pipeline(IOError(err_msg.format(", ".join(["{asset_name}.{seek_key}:{tag_name} (--{user_arg})".format(**x) for x in required_list])))) - else: - warning_msg = "Optional assets not existing: {}" - pm.info(warning_msg.format(", ".join(["{asset_name}.{seek_key}:{tag_name} (--{user_arg})".format(**x) for x in exist_errors]))) - - return 
res, rgc - - ################################################################################ # Pipeline MAIN # ################################################################################ @@ -661,66 +601,111 @@ def main(): pm.fail_pipeline(RuntimeError(err_msg)) if args.input2 and not args.paired_end: - err_msg = "Incompatible settings: You specified single-end, but provided --input2." + err_msg = (f"Incompatible settings: You specified single-end, " + f"but provided --input2.") pm.fail_pipeline(RuntimeError(err_msg)) - ############################################################################ - # Set up reference resources according to primary genome. # + # Set up reference resources # ############################################################################ if args.aligner.lower() == "bwa": GENOME_IDX_KEY = "bwa_index" else: GENOME_IDX_KEY = "bowtie2_index" - check_list = [ - {"asset_name":"fasta", "seek_key":"chrom_sizes", - "tag_name":"default", "arg":None, "user_arg":None, - "required":True}, - {"asset_name":"fasta", "seek_key":None, - "tag_name":"default", "arg":None, "user_arg":None, - "required":True}, - {"asset_name":GENOME_IDX_KEY, "seek_key":None, - "tag_name":"default", "arg":None, "user_arg":None, - "required":True} - ] - # If user specifies TSS file, use that instead of the refgenie asset - if not args.TSS_name: - check_list.append( - {"asset_name":"refgene_anno", "seek_key":"refgene_tss", - "tag_name":"default", "arg":"TSS_name", "user_arg":"TSS-name", - "required":False} - ) - # If user specifies feature annotation file, - # use that instead of the refgenie managed asset - if not args.anno_name: - check_list.append( - {"asset_name":"feat_annotation", "seek_key":"feat_annotation", - "tag_name":"default", "arg":"anno_name", "user_arg":"anno-name", - "required":False} - ) - # If user specifies blacklist file, - # use that instead of the refgenie managed asset - if not args.blacklist: - check_list.append( - {"asset_name":"blacklist", 
"seek_key":"blacklist", - "tag_name":"default", "arg":"blacklist", "user_arg":"blacklist", - "required":False} - ) - res, rgc = _add_resources(args, res, check_list) - - # If the user specifies optional files, add those to our resources + + # Add prealignment genome annotation files to resources + res.prealignment_index = [] + if args.prealignment_bowtie2_index and args.prealignment_bwa_index: + err_msg = (f"Incompatible prealignment settings: You specified a " + f"bowtie2 and bwa index.") + pm.fail_pipeline(RuntimeError(err_msg)) + elif args.prealignment_bowtie2_index: + if args.aligner.lower() == "bwa": + err_msg = (f"Incompatible settings: You specified bwa as your " + f"aligner but are using --prealignment-bowtie2-index " + f"to pass indices.") + pm.fail_pipeline(RuntimeError(err_msg)) + for index in args.prealignment_bowtie2_index: + if not os.path.exists(os.path.dirname(index)): + err_msg = (f"Could not find {index}.") + pm.info(IOError(err_msg)) + else: + res.prealignment_index.append(index) + elif args.prealignment_bwa_index: + if args.aligner.lower() == "bowtie2": + err_msg = (f"Incompatible settings: You specified bowtie2 as your " + f"aligner but are using --prealignment-bwa " + f"to pass indices.") + pm.fail_pipeline(RuntimeError(err_msg)) + for index in args.prealignment_bwa_index: + if not os.path.exists(os.path.dirname(index)): + err_msg = (f"Could not find {index}.") + pm.info(IOError(err_msg)) + else: + res.prealignment_index.append(index) + else: + pm.warning(f"Unable to find any prealignment indices. 
If this appears " + f"incorrect, confirm you passed the full path to each " + f"index directory prefix.") + + # Add primary genome annotation files to resources + if (args.bowtie2_index and args.bwa_index): + err_msg = (f"Incompatible settings: You specified a bowtie2 and " + f"bwa index.") + pm.fail_pipeline(RuntimeError(err_msg)) + elif (os.path.exists(os.path.dirname(args.bowtie2_index)) and not + args.bwa_index): + if args.aligner.lower() == "bwa": + err_msg = (f"Incompatible settings: You specified bwa as your " + f"aligner but are using --bwa-index " + f"to specify the index.") + pm.fail_pipeline(RuntimeError(err_msg)) + else: + res.genome_index = args.bowtie2_index + elif (os.path.exists(os.path.dirname(args.bwa_index)) and not + args.bowtie2_index): + if args.aligner.lower() == "bowtie2": + err_msg = (f"Incompatible settings: You specified bowtie2 as your " + f"aligner but are using --bowtie2-index " + f"to specify the index.") + pm.fail_pipeline(RuntimeError(err_msg)) + else: + res.genome_index = args.bwa_index + else: + err_msg = (f"A genome index file for {args.genome_assembly} " + f"for {args.aligner} is required.") + pm.fail_pipeline(IOError(err_msg)) + + if (args.chrom_sizes and os.path.isfile(args.chrom_sizes) and + os.stat(args.chrom_sizes).st_size > 0): + res.chrom_sizes = args.chrom_sizes + else: + err_msg = (f"A chromosome sizes file for {args.genome_assembly} " + f"is required.") + pm.fail_pipeline(IOError(err_msg)) + + # Add optional files to resources + if args.sob and not args.search_file: + err_msg = (f"You specified --sob but did not include the path to" + f"the tallymer index search file. 
Specify this with" + f"--search-file ") + pm.fail_pipeline(RuntimeError(err_msg)) + if (args.search_file and os.path.isfile(args.search_file) and + os.stat(args.search_file).st_size > 0): + res.search_file = args.search_file + if (args.blacklist and os.path.isfile(args.blacklist) and os.stat(args.blacklist).st_size > 0): res.blacklist = args.blacklist - if (args.frip_ref_peaks and os.path.isfile(args.frip_ref_peaks) and - os.stat(args.frip_ref_peaks).st_size > 0): - res.frip_ref_peaks = args.frip_ref_peaks if (args.TSS_name and os.path.isfile(args.TSS_name) and os.stat(args.TSS_name).st_size > 0): res.refgene_tss = args.TSS_name if (args.anno_name and os.path.isfile(args.anno_name) and os.stat(args.anno_name).st_size > 0): res.feat_annotation = args.anno_name + if (args.frip_ref_peaks and os.path.isfile(args.frip_ref_peaks) and + os.stat(args.frip_ref_peaks).st_size > 0): + res.frip_ref_peaks = args.frip_ref_peaks # Adapter file can be set in the config; if left null, we use a default. res.adapters = res.adapters or tool_path("NexteraPE-PE.fa") @@ -728,9 +713,16 @@ def main(): # Report utilized assets assets_file = os.path.join(param.outfolder, "assets.tsv") for asset in res: - message = "{}\t{}".format(asset, os.path.expandvars(res[asset])) - report_message(pm, assets_file, message) - + if isinstance(res[asset], list): + for a in res[asset]: + message = "{}\t{}".format(asset, os.path.expandvars(a)) + pm.debug(message) + report_message(pm, assets_file, message) + else: + message = "{}\t{}".format(asset, os.path.expandvars(res[asset])) + pm.debug(message) + report_message(pm, assets_file, message) + # Report primary genome message = "genome\t{}".format(args.genome_assembly) report_message(pm, assets_file, message) @@ -982,53 +974,43 @@ def check_trim(): # Keep track of the unmapped files in order to compress them after final # alignment. 
to_compress = [] - if len(args.prealignments) == 0: - print("You may use `--prealignments` to align to references before " - "the genome alignment step. See docs.") + if len(res.prealignment_index) == 0: + print("You may use `--prealignment-bowtie2-index` or " + "`--prealignment-bwa-index` to align to references before " + "the genome alignment step. " + "See http://pepatac.databio.org/en/latest/ for documentation.") else: - print("Prealignment assemblies: " + str(args.prealignments)) # Loop through any prealignment references and map to them sequentially - for reference in args.prealignments: - genome_index = os.path.join(rgc.seek(reference, GENOME_IDX_KEY)) - if not os.path.exists(os.path.dirname(genome_index)): - msg = "No {} index found in {}; skipping.".format( - reference, os.path.dirname(genome_index)) - print(msg) - else: - if not genome_index.endswith(reference): - genome_index = os.path.join( - os.path.dirname(rgc.seek(reference, GENOME_IDX_KEY)), - reference) - if args.aligner.lower() == "bwa": - genome_index += ".fa" - if args.no_fifo: - unmap_fq1, unmap_fq2 = _align( - args, tools, args.paired_end, False, - unmap_fq1, unmap_fq2, reference, - assembly=genome_index, - outfolder=param.outfolder, - aligndir="prealignments", - bt2_opts_txt=param.bowtie2_pre.params, - bwa_opts_txt=param.bwa_pre.params) - to_compress.append(unmap_fq1) - if args.paired_end: - to_compress.append(unmap_fq2) - else: - unmap_fq1, unmap_fq2 = _align( - args, tools, args.paired_end, True, - unmap_fq1, unmap_fq2, reference, - assembly=genome_index, - outfolder=param.outfolder, - aligndir="prealignments", - bt2_opts_txt=param.bowtie2_pre.params, - bwa_opts_txt=param.bwa_pre.params) - to_compress.append(unmap_fq1) - if args.paired_end: - to_compress.append(unmap_fq2) + for count, genome_index in enumerate(res.prealignment_index): + pm.info(f"Aligning with {args.aligner} to {genome_index}.") + assembly_identifier = f"prealignment_{count}" + if args.no_fifo: + unmap_fq1, unmap_fq2 = 
_align( + args, tools, args.paired_end, False, + unmap_fq1, unmap_fq2, assembly_identifier, + assembly=genome_index, + outfolder=param.outfolder, + aligndir="prealignments", + bt2_opts_txt=param.bowtie2_pre.params, + bwa_opts_txt=param.bwa_pre.params) + to_compress.append(unmap_fq1) + if args.paired_end: + to_compress.append(unmap_fq2) + else: + unmap_fq1, unmap_fq2 = _align( + args, tools, args.paired_end, True, + unmap_fq1, unmap_fq2, assembly_identifier, + assembly=genome_index, + outfolder=param.outfolder, + aligndir="prealignments", + bt2_opts_txt=param.bowtie2_pre.params, + bwa_opts_txt=param.bwa_pre.params) + to_compress.append(unmap_fq1) + if args.paired_end: + to_compress.append(unmap_fq2) pm.timestamp("### Compress all unmapped read files") # Confirm pairing is complete - # Confirm pairing is complete def no_handle(fq): fpath = str(Path(fq).resolve()) pm.debug("fq: {}".format(fpath)) @@ -1080,7 +1062,6 @@ def no_handle(fq): # Map to primary genome # ############################################################################ pm.timestamp("### Map to genome") - if args.aligner.lower() == "bwa": if not param.bwa.params: @@ -1106,18 +1087,10 @@ def no_handle(fq): if os.path.exists(unmap_fq2 + ".gz"): unmap_fq2 = unmap_fq2 + ".gz" - genome_index = os.path.join(rgc.seek(args.genome_assembly, GENOME_IDX_KEY)) - if not genome_index.endswith(args.genome_assembly): - genome_index = os.path.join( - os.path.dirname(rgc.seek(args.genome_assembly, GENOME_IDX_KEY)), - args.genome_assembly) - if args.aligner.lower() == "bwa": - genome_index += ".fa" - if args.aligner.lower() == "bwa": cmd = tools.bwa + " mem -t " + str(pm.cores) cmd += " " + bwa_options - cmd += " " + genome_index + cmd += " " + res.genome_index cmd += " " + unmap_fq1 if args.paired_end: cmd += " " + unmap_fq2 @@ -1129,7 +1102,7 @@ def no_handle(fq): cmd = tools.bowtie2 + " -p " + str(pm.cores) cmd += " " + bt2_options cmd += " --rg-id " + args.sample_name - cmd += " -x " + genome_index + cmd += " -x " 
+ res.genome_index if args.paired_end: cmd += " -1 " + unmap_fq1 + " -2 " + unmap_fq2 else: @@ -1477,19 +1450,6 @@ def post_dup_aligned_reads(dedup_log): else: read_len = int(pm.get_stat("Read_length")) - # At this point we can check for seqOutBias required indicies. - # Can't do it earlier because we haven't determined the read_length of - # interest for mappability purposes. - if args.sob: - pm.debug("read_len: {}".format(read_len)) # DEBUG - search_asset = [{"asset_name":"tallymer_index", - "seek_key":"search_file", - "tag_name":read_len, - "arg":"search_file", - "user_arg":"search-file", - "required":True}] - res, rgc = _add_resources(args, res, search_asset) - # Calculate size of genome if not pm.get_stat("Genome_size") or args.new_start: genome_size = int(pm.checkprint( diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index 49998829..a2b3ff3a 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -4,42 +4,52 @@ path: pipelines/pepatac.py input_schema: pepatac_input_schema.yaml output_schema: pepatac_output_schema.yaml command_template: > - {pipeline.path} + python {pipeline.path} --sample-name {sample.sample_name} --genome {sample.genome} --input {sample.read1} --single-or-paired {sample.read_type} + --chrom-sizes {refgenie.fasta.chrom_sizes} -O {looper.results_subdir} -P {compute.cores} -M {compute.mem} {% if sample.read2 is defined %} --input2 {sample.read2} {% endif %} + {% if sample.trimmer is defined %} --trimmer {sample.trimmer} {% endif %} {% if sample.aligner is defined %} --aligner {sample.aligner} {% endif %} + {% if sample.aligner == "bowtie2" %} --bowtie2-index {refgenie.bowtie2_index.dir} {% endif %} + {% if sample.aligner == "bwa" %} --bwa-index {refgenie.bwa_index.dir} {% endif %} + {% if sample.deduplicator is defined %} --deduplicator {sample.deduplicator} {% endif %} {% if sample.peak_caller is defined %} --peak-caller {sample.peak_caller} {% endif %} {% if sample.macs_genome_size is 
defined %} --genome-size {sample.macs_genome_size} {% endif %} - {% if sample.trimmer is defined %} --trimmer {sample.trimmer} {% endif %} - {% if sample.prealignments is defined %} --prealignments {sample.prealignments} {% endif %} - {% if sample.deduplicator is defined %} --deduplicator {sample.deduplicator} {% endif %} - {% if sample.TSS_name is defined %} --TSS-name {sample.TSS_name} {% endif %} - {% if sample.blacklist is defined %} --blacklist {sample.blacklist} {% endif %} - {% if sample.anno_name is defined %} --anno-name {sample.anno_name} {% endif %} {% if sample.peak_type is defined %} --peak-type {sample.peak_type} {% endif %} {% if sample.extend is defined %} --extend {sample.extend} {% endif %} {% if sample.frip_ref_peaks is defined %} --frip-ref-peaks {sample.frip_ref_peaks} {% endif %} {% if sample.motif is defined %} --motif {% endif %} {% if sample.sob is defined %} --sob {% endif %} + {% if sample.sob is defined %} --search-file {refgenie.tallymer_index.search_file} {% endif %} {% if sample.no_scale is defined %} --no-scale {% endif %} {% if sample.prioritize is defined %} --prioritize {% endif %} {% if sample.keep is defined %} --keep {% endif %} {% if sample.no_fifo is defined %} --noFIFO {% endif %} {% if sample.lite is defined %} --lite {% endif %} {% if sample.skipqc is defined %} --skipqc {% endif %} + {% if sample.prealignment_bowtie2_index is defined %} --prealignment-bowtie2-index {refgenie.bowtie2_index.dir} {% endif %} + {% if sample.prealignment_bwa_index is defined %} --prealignment-bwa-index {refgenie.bwa_index.bwa_index} {% endif %} + --TSS-name {refgenie.refgene_anno.refgene_tss} + --blacklist {refgenie.blacklist} + --anno-name {refgenie.feat_annotation} + compute: singularity_image: ${SIMAGES}pepatac conda_env: pepatac docker_image: databio/pepatac bulker_crate: databio/pepatac:1.0.7 size_dependent_variables: resources-sample.tsv - bioconductor: readFunName: runCOCOA readFunPath: BiocProject/runCOCOA.R +var_templates: + 
refgenie_config: "$REFGENIE" +pre_submit: + python_functions: + - refgenconf.looper_refgenie_populate From 281df8bf218b306fa7636e7de37bc7e462bda604 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 22 Jun 2021 09:56:41 -0400 Subject: [PATCH 06/66] update doc layout --- docs/compute-resources.md | 36 ++++++++++- docs/install.md | 129 ++------------------------------------ docs/run-bulker.md | 89 ++++++++++++++++++++++++++ docs/run-conda.md | 115 +++++++++++++++++++++++++++++++++ docs/run-container.md | 93 ++++++++++----------------- mkdocs.yml | 8 +-- 6 files changed, 281 insertions(+), 189 deletions(-) create mode 100644 docs/run-bulker.md create mode 100644 docs/run-conda.md diff --git a/docs/compute-resources.md b/docs/compute-resources.md index b43ae714..c25837de 100644 --- a/docs/compute-resources.md +++ b/docs/compute-resources.md @@ -2,6 +2,29 @@ `PEPATAC` comes packaged with default compute settings (memory, cores, and time) for both the sample and project-level pipeline. These values will automatically be populated by `looper` based on the input file size. In that way, smaller samples (e.g. fewer reads) will request less resources and vice-versa for large samples. You can also specify these values on the command-line. +## Default computing options + +When you run your `PEPATAC` project using `looper run`, by default it will simply run each sample locally. You can change that using `looper run --package COMPUTE_PACKAGE`, where `COMPUTE_PACKAGE` is an option as described below. This enables you to adjust your computing preferences on-the-fly. You have several built-in packages, which you can view by typing `divvy list`. Default packages include: + +- `--package slurm`. Submit the jobs to a SLURM cluster using `sbatch`. +- `--package sge`. Submit the jobs to a SGE cluster using `qsub`. + +To show how this works, let's run the example project using the `slurm` compute package. 
Use `-d` for a dry run to create the submit scripts but not run them: + +```console +cd pepatac +looper run examples/test_project/test_config.yaml -d \ + --package slurm +``` + +This will produce a job script: + +```console +cat pepatac_test/submission/PEPATAC_test1.sub +``` + +If all looks well, run looper without `-d` to actually submit the jobs. + ## Configure computing resource requests at the command-line You can specify the the memory (`-M`) and number of cores (`-P`) directly on the command-line. @@ -11,7 +34,18 @@ pipelines/pepatac.py -O /path/to/processed/data/ -S "compute_example" -I /path/t ## Configure computing resource requests with `looper` -Default computing resource requests are defined in the [resources-sample.tsv](https://github.com/databio/pepatac/blob/master/resources-sample.tsv) and [resources-project.tsv](https://github.com/databio/pepatac/blob/master/resources-project.tsv) for sample and project-level pipeline calls, respectively. `Looper` checks these files based on the `size_dependent_variables` section in the `sample` and `project` `pipeline_interface.yaml` files. For default pipeline settings, these resources should be more than sufficient, but for different pipeline settings you may desire to request different resources. This could be accomplished two ways: +`PEPATAC` uses a standardized computing configuration called [divvy](https://divvy.databio.org). The instructions for changing these computing configuration options are universal for any software that relies on `divvy`. + +To customize your compute packages, you first create a `divvy` computing configuration file and point an environment variable (`DIVCFG`) to that file: + +```console +export DIVCFG="divvy_config.yaml" +divvy init $DIVCFG +``` + +Next, you edit that config file to add in any compute packages you need. `PEPATAC` will then give you access to any of your custom packages with `looper run --package COMPUTE_PACKAGE`. 
For complete instructions on how to create a custom compute package, read [how to configure divvy](https://divvy.databio.org/en/latest/configuration/). + +Default computing resource requests for `PEPATAC` are defined in the [resources-sample.tsv](https://github.com/databio/pepatac/blob/master/resources-sample.tsv) and [resources-project.tsv](https://github.com/databio/pepatac/blob/master/resources-project.tsv) for sample and project-level pipeline calls, respectively. `Looper` checks these files based on the `size_dependent_variables` section in the `sample` and `project` `pipeline_interface.yaml` files. For default pipeline settings, these resources should be more than sufficient, but for different pipeline settings you may desire to request different resources. This could be accomplished two ways: 1. You can [override universal compute settings when you call `looper` by specifying the resources using the `--compute` variable](https://looper.readthedocs.io/en/latest/variable-namespaces/#5-compute): ```console looper run --compute mem=24000 time=00-12:00:00 --cpus-per-task=36 --ntasks=1 diff --git a/docs/install.md b/docs/install.md index 94722e56..e0d8d05f 100644 --- a/docs/install.md +++ b/docs/install.md @@ -1,127 +1,8 @@ # Install and run PEPATAC -## 1: Clone the `PEPATAC` pipeline +We provide several methods to setup `PEPATAC`. A fundamental challenge of any complex pipeline is that they rely on many independent tools. Installing all of these from scratch can be a chore, although the common use of many of the required bioinformatic tools means they are likely to already be available on an HPC or server. Installation can also be addressed through the use of containers, although that requires setting up and learning to use containers. No single approach appears to resolve all challenges for all users, but we've done our best to provide various ways to ease setup as much as possible. 
-``` -git clone https://github.com/databio/pepatac.git -``` - -## 2: Install required software - -You have a few options for software prerequisites: 1) use containers, 2) install via `conda`, or 3) install all prerequisites natively. If you want to use containers, you can use either the [multi-container environment manager, `bulker`](https://bulker.databio.org/en/latest/) with `docker` or `singularity`, or just use either `docker` or `singularity` -- see instructions in [how to run PEPATAC with containers](run-container.md). Otherwise, follow these instructions to install the requirements with `conda`: - -### Tools - -You will need some common bioinformatics tools installed: [bedtools (v2.25.0+)](http://bedtools.readthedocs.io/en/latest/), [bowtie2 (v2.2.9+)](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml), [preseq (v2.0+)](http://smithlabresearch.org/software/preseq/), [samblaster (v0.1.24+)](https://github.com/GregoryFaust/samblaster), [samtools (v1.7+)](http://www.htslib.org/), [skewer (v0.1.126+)](https://github.com/relipmoc/skewer), [UCSC tools](http://hgdownload.soe.ucsc.edu/admin/exe/) (wigToBigWig, bigWigCat, bedToBigBed), [pigz (v2.3.4+)](https://zlib.net/pigz/). - -Optionally, `PEPATAC` can report on fastq quality ([FastQC](https://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqc)) and utilize swappable tools for adapter removal ([trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic)), deduplication ([picard](https://broadinstitute.github.io/picard/)), and signal track generation ([seqOutBias](https://github.com/guertinlab/seqOutBias), [bedGraphToBigWig](http://hgdownload.soe.ucsc.edu/admin/exe/), and [bigWigMerge](http://hgdownload.soe.ucsc.edu/admin/exe/)). - -The easiest and preferred way is to utilize `conda` to install all the tools in a single command, albeit be prepared for this initial installation process to take more than an hour to complete. 
- -From the `pepatac/` directory: -```{bash} -conda env create -f requirements-conda.yml -``` - -Note: The subsequent steps all assume you have installed using `conda`. Alternatively, you can follow instructions to install each individual program natively. If you need additional direction with this approach, see the [detailed installation instructions](detailed-install.md). - -### Python packages - -`PEPATAC` uses several Python packages under the hood. Not all of these are available through `conda`, so we'll ensure they are installed ourselves to the `pepatac` `conda` environment. From the `pepatac/` directory: - -```{bash} -conda activate pepatac -unset PYTHONPATH -python -m pip install --ignore-installed --upgrade -r requirements.txt -``` - -### R packages - -`PEPATAC` uses `R` to generate quality control and read/peak annotation plots. We have packaged the `pepatac` specific `R` code into a supporting package called [PEPATACr](https://github.com/databio/pepatac/tree/master/PEPATACr). The `PEPATACr` package relies on a few additional packages which can be installed to the `conda` environment. - -To ensure these packages are installed to the `pepatac` `conda` environment, make sure to point your `R_LIBS` environment variable to the `conda` environment `R` library. For example: -```{bash} -conda activate pepatac -unset R_LIBS -export R_LIBS="$CONDA_PREFIX/lib/R/library" -``` - -From the `pepatac/` directory, open `R` and install the following packages: -```{R} -install.packages("optigrab") -devtools::install_github("databio/GenomicDistributions") -install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.2.tar.gz", repos=NULL) -devtools::install(file.path("PEPATACr/"), dependencies=TRUE, repos="https://cloud.r-project.org/") -``` - -## 3: Download `refgenie` assets - -PEPATAC uses [`refgenie`](http://refgenie.databio.org/) assets for alignment. 
If you haven't already, initialize a refgenie config file like this: - -```console -export REFGENIE=/path/to/your_genome_folder/genome_config.yaml -refgenie init -c $REFGENIE -``` - -Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists. - -Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). Download these required assets with this command: - -```console -refgenie pull hg38/fasta hg38/bowtie2_index hg38/refgene_anno hg38/ensembl_gtf hg38/ensembl_rb -refgenie build hg38/feat_annotation -``` - -PEPATAC also requires a `bowtie2_index` asset for any pre-alignment genomes: - -```console -refgenie pull rCRSd/bowtie2_index -refgenie pull human_repeats/bowtie2_index -``` - -## 4: Use `looper` to run the sample processing pipeline - -Start by running the example project (`test_config.yaml`) in the `examples/test_project/` folder. `PEPATAC` uses a project management tool called `looper` to run the sample-level pipeline across each sample in a project. Let's use the `-d` argument to first try a dry run, which will create job scripts for every sample in a project, but will not execute them: - -From the `pepatac/` folder: -``` -looper run -d examples/test_project/test_config.yaml -``` - -If that looked good, let's actually run the example by taking out the `-d` flag: -``` -looper run examples/test_project/test_config.yaml -``` - -Or, if you're using [`bulker`](https://bulker.databio.org/en/latest/) to run the pipeline in containers: - -``` -bulker activate databio/pepatac -looper run examples/test_project/test_config.yaml -``` - -There are lots of other cool things you can do with looper, like dry runs, report results, check on pipeline run status, clean intermediate files to save disk space, lump multiple samples into one job, and more. 
For details, consult the [looper docs](http://looper.databio.org/). - -## 5: Use `looper` to run the project level pipeline - -`PEPATAC` also includes a project-level processing pipeline to do things like: - - - [Plot alignment statistics](files/examples/gold/summary/gold_alignmentPercent.pdf) for all samples in the project together for easy visualization - - [Plot TSS enrichment scores](files/examples/gold/summary/gold_TSSEnrichment.pdf) for all the samples in the project in a single figure - - [Produce a consensus peak set](consensus_peaks.md) for the project - - [Produce a count table](count_table.md) using the consensus peak set for all the samples in a project - -`looper runp examples/test_project/test_config.yaml` - -This should take < a minute on the test sample and will generate a summary/ directory containing project level output in the parent project directory. In this small example, there won't be a consensus peak set or count table because it is only a single sample. To see more, you can [run through the extended tutorial](tutorial.md) to see this in action. - -# Next steps - -This is just the beginning. For your next step, take a look at one of these user guides: - -- [Configuring custom project files](peps.md) -- [Extended tutorial for running multiple samples](tutorial.md) -- [Running the pipeline with containers](run-container.md) -- See other detailed user guide links in the side menu - -Any questions? Feel free to [reach out to us](contact.md). Otherwise, go analyze some ATAC-seq! +1. [Run the pipeline using the multi-container environment manager, `bulker`.](run-bulker.md) +2. [Run the pipeline using a single, monolithic container.](run-container.md) +3. [Run the pipeline in a conda environment.](run-conda.md) +4. 
[Run the pipeline natively.](detailed-install.md) diff --git a/docs/run-bulker.md b/docs/run-bulker.md new file mode 100644 index 00000000..5def8be0 --- /dev/null +++ b/docs/run-bulker.md @@ -0,0 +1,89 @@ +# Run PEPATAC with a multi-container manager. + +Whether you are using `docker` or `singularity`, we have a solution to run the pipeline using containers that reduces the installation burden. + +In addition to cloning the `PEPATAC` repository, this requires the installation and configuration of a single python package, our [multi-container environment manager `bulker`](https://bulker.databio.org/en/latest/). We support using `bulker` for a few reasons: + +1. It simplifies container use by wrapping the complexities of `docker` or `singularity` calls so that you can use a containerized program without even realizing you're using a container. You can call a program at the command line the same way you would *without* using bulker. +2. Similar to a dockerfile, you can distribute sets of tools *but* as a separate set of containers, not a single, unwieldy, and monolithic container. +3. Since `bulker` commands behave like native commands, a workflow becomes automatically containerized with bulker. +4. Finally, this makes bulker environments very portable, since the only requirement for native-like command use is `docker` or `singularity`. + +If you would still prefer using a single container, we do provide a [PEPATAC dockerfile](https://github.com/databio/pepatac/blob/master/containers/pepatac.Dockerfile) and support for [running the pipeline using a single, monolithic container.](run-container.md). + +## Running `PEPATAC` using `bulker` + +### 1: Clone the `PEPATAC` pipeline + +```console +git clone https://github.com/databio/pepatac.git +``` + +### 2: Initialize `refgenie` and download assets + +`PEPATAC` uses [`refgenie`](http://refgenie.databio.org/) assets for alignment. 
Because assets are user-dependent, these files must still exist outside of a container system. We need to [install and initialize a refgenie config file.](http://refgenie.databio.org/en/latest/install/). For example: + +```console +pip install refgenie +export REFGENIE=/path/to/your_genome_folder/genome_config.yaml +refgenie init -c $REFGENIE +``` + +Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists. + +Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). + +```console +refgenie pull hg38/fasta hg38/bowtie2_index hg38/refgene_anno hg38/ensembl_gtf hg38/ensembl_rb +refgenie build hg38/feat_annotation +``` + +`PEPATAC` also requires a `bowtie2_index` asset for any pre-alignment genomes: + +```console +refgenie pull rCRSd/bowtie2_index +refgenie pull human_repeats/bowtie2_index +``` + +### 3. Install and configure `bulker` + +Check out [the `bulker` setup guide to install bulker](https://bulker.databio.org/en/latest/install/) on your system. It is a straightforward python package with a few configuration steps required prior to use with `PEPATAC`. + +### 4. Load the `PEPATAC` crate + +We've already produced a `bulker` crate for `PEPATAC` that requires all software needed to run the pipeline. We can load this crate directly from the [`bulker registry`](http://hub.bulker.io/): +```console +bulker load databio/pepatac:1.0.7 -r +``` + +### 5. Activate the `PEPATAC` crate + +Now that we've loaded the `PEPATAC` crate, we need to activate that specific crate so its included tools are available. +```console +bulker activate databio/pepatac:1.0.7 +``` +Now, you can run any of the commands in the crate as if they were natively installed, **but they're actually running in containers**! + +### 6. 
Run the pipeline + +Now we simply run the pipeline as we would with a native installation, without having needed to install any additional tools! + +#### Run the pipeline at the command line + +From the `pepatac/` repository folder: +```console +pipelines/pepatac.py --single-or-paired paired \ + --prealignments rCRSd human_repeats \ + --genome hg38 \ + --sample-name test1 \ + --input examples/data/test1_r1.fastq.gz \ + --input2 examples/data/test1_r2.fastq.gz \ + --genome-size hs \ + -O $HOME/pepatac_test +``` +#### Run the pipeline using looper + +Since `bulker` automatically directs any calls to required software to instead be executed in containers, we can just run our project the exact same way we would when we installed everything natively! +```console +looper run examples/test_project/test_config.yaml +``` \ No newline at end of file diff --git a/docs/run-conda.md b/docs/run-conda.md new file mode 100644 index 00000000..115a0218 --- /dev/null +++ b/docs/run-conda.md @@ -0,0 +1,115 @@ +# Run PEPATAC in a conda environment. + +We also enable setup of the pipeline using conda. As with container-based approaches, some native installation is required for complete setup. + +## 1: Clone the `PEPATAC` pipeline + +```console +git clone https://github.com/databio/pepatac.git +``` + +## 2: Install bioinformatic tools + +You will need some common bioinformatics tools installed: [bedtools (v2.25.0+)](http://bedtools.readthedocs.io/en/latest/), [bowtie2 (v2.2.9+)](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml), [preseq (v2.0+)](http://smithlabresearch.org/software/preseq/), [samblaster (v0.1.24+)](https://github.com/GregoryFaust/samblaster), [samtools (v1.7+)](http://www.htslib.org/), [skewer (v0.1.126+)](https://github.com/relipmoc/skewer), [UCSC tools](http://hgdownload.soe.ucsc.edu/admin/exe/) (wigToBigWig, bigWigCat, bedToBigBed), [pigz (v2.3.4+)](https://zlib.net/pigz/). 
+ +Optionally, `PEPATAC` can report on fastq quality ([FastQC](https://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqc)) and utilize swappable tools for adapter removal ([trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic)), deduplication ([picard](https://broadinstitute.github.io/picard/)), and signal track generation ([seqOutBias](https://github.com/guertinlab/seqOutBias), [bedGraphToBigWig](http://hgdownload.soe.ucsc.edu/admin/exe/), and [bigWigMerge](http://hgdownload.soe.ucsc.edu/admin/exe/)). + +Be prepared for this initial installation process to take more than an hour to complete. + +From the `pepatac/` directory: +```{bash} +conda env create -f requirements-conda.yml +``` + +Note: The subsequent steps all assume you have installed using `conda`. Alternatively, you can [follow instructions to install each individual program natively](detailed-install.md). + +## 3. Install python packages + +`PEPATAC` uses several Python packages under the hood. Not all of these are available through `conda`, so we'll ensure they are installed ourselves to the `pepatac` `conda` environment. From the `pepatac/` directory: + +```{bash} +conda activate pepatac +unset PYTHONPATH +python -m pip install --ignore-installed --upgrade -r requirements.txt +``` + +## 4. Install R packages + +`PEPATAC` uses `R` to generate quality control and read/peak annotation plots. We have packaged the `pepatac` specific `R` code into a supporting package called [PEPATACr](https://github.com/databio/pepatac/tree/master/PEPATACr). The `PEPATACr` package relies on a few additional packages which can be installed to the `conda` environment. + +To ensure these packages are installed to the `pepatac` `conda` environment, make sure to point your `R_LIBS` environment variable to the `conda` environment `R` library. 
For example: +```{bash} +conda activate pepatac +unset R_LIBS +export R_LIBS="$CONDA_PREFIX/lib/R/library" +``` + +From the `pepatac/` directory, open `R` and install the following packages: +```{R} +install.packages("optigrab") +devtools::install_github("databio/GenomicDistributions") +install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.2.tar.gz", repos=NULL) +devtools::install(file.path("PEPATACr/"), dependencies=TRUE, repos="https://cloud.r-project.org/") +``` + +## 5: Initialize `refgenie` and download assets + +PEPATAC uses [`refgenie`](http://refgenie.databio.org/) assets for alignment. If you haven't already, initialize a refgenie config file like this: + +```console +export REFGENIE=/path/to/your_genome_folder/genome_config.yaml +refgenie init -c $REFGENIE +``` + +Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists. + +Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). Download these required assets with this command: + +```console +refgenie pull hg38/fasta hg38/bowtie2_index hg38/refgene_anno hg38/ensembl_gtf hg38/ensembl_rb +refgenie build hg38/feat_annotation +``` + +PEPATAC also requires a `bowtie2_index` asset for any pre-alignment genomes: + +```console +refgenie pull rCRSd/bowtie2_index +refgenie pull human_repeats/bowtie2_index +``` + +## 6: Use `looper` to run the sample processing pipeline + +Start by running the example project (`test_config.yaml`) in the `examples/test_project/` folder. `PEPATAC` uses a project management tool called `looper` to run the sample-level pipeline across each sample in a project. 
Let's use the `-d` argument to first try a dry run, which will create job scripts for every sample in a project, but will not execute them: + +From the `pepatac/` folder: +```console +looper run -d examples/test_project/test_config.yaml +``` + +If that looked good, let's actually run the example by taking out the `-d` flag: +```console +looper run examples/test_project/test_config.yaml +``` + +Or, if you're using [`bulker`](https://bulker.databio.org/en/latest/) to run the pipeline in containers: + +```console +bulker activate databio/pepatac +looper run examples/test_project/test_config.yaml +``` + +There are lots of other cool things you can do with looper, like dry runs, report results, check on pipeline run status, clean intermediate files to save disk space, lump multiple samples into one job, and more. For details, consult the [looper docs](http://looper.databio.org/). + +## 7: Use `looper` to run the project level pipeline + +`PEPATAC` also includes a project-level processing pipeline to do things like: + + - [Plot alignment statistics](files/examples/gold/summary/gold_alignmentPercent.pdf) for all samples in the project together for easy visualization + - [Plot TSS enrichment scores](files/examples/gold/summary/gold_TSSEnrichment.pdf) for all the samples in the project in a single figure + - [Produce a consensus peak set](consensus_peaks.md) for the project + - [Produce a count table](count_table.md) using the consensus peak set for all the samples in a project + +`looper runp examples/test_project/test_config.yaml` + +This should take < a minute on the test sample and will generate a summary/ directory containing project level output in the parent project directory. In this small example, there won't be a consensus peak set or count table because it is only a single sample. To see more, you can [run through the extended tutorial](tutorial.md) to see this in action. 
+ diff --git a/docs/run-container.md b/docs/run-container.md index 61786224..157e9176 100644 --- a/docs/run-container.md +++ b/docs/run-container.md @@ -1,101 +1,74 @@ -# Run PEPATAC in a container +# Run PEPATAC in a container. -Whether you are using `docker` or `singularity`, we have a solution to run the pipeline using containers that dramatically reduces the installation burden. +A popular approach is installing all dependencies in a container and just use that single container. This container can be used with either `docker` or `singularity`. You can run `PEPATAC` as an individual pipeline on a single sample using these containers by directly calling `docker run` or `singularity exec`. Or, you can rely on `looper`, which is already set up to run any pipeline in existing containers using the `divvy` templating system. -In addition to cloning the `PEPATAC` repository, this requires the installation and configuration of a single python package, our [multi-container environment manager `bulker`](https://bulker.databio.org/en/latest/). We support using `bulker` for a few reasons: +## Running `PEPATAC` using a single, monolithic container. -1. It simplifies container use by wrapping the complexities of `docker` or -`singularity` calls so that you can use a containerized program without even -realizing you're using a container. You can call a program at the command line -the same as your would *without* using bulker. -2. Similar to a dockerfile, you can distribute sets of tools *but* as a separate set of containers, not a single, unwieldy, and monolithic container. -3. Since `bulker` commands behave like native commands, a workflow becomes automatically containerized with bulker. -4. Finally, this makes bulker environments very portable, since the only requirement for native-like command use is `docker` or `singularity`. 
+### 1: Clone the `PEPATAC` pipeline -Yet, if you would still prefer using a single container, we do still provide a [PEPATAC dockerfile](https://github.com/databio/pepatac/blob/master/containers/pepatac.Dockerfile) and support for [running the pipeline in this manner](run-container.md#running-pepatac-without-bulker). - -## Running `PEPATAC` using `bulker` - -### 1. Install and configure `bulker` - -Check out [the `bulker` setup guide to install bulker](https://bulker.databio.org/en/latest/install/) on your system. It is a straightforward python package with a few configuration steps required prior to use with `PEPATAC`. - -### 2. Load the `PEPATAC` crate - -We've already produced a `bulker` crate for `PEPATAC` that requires all software needed to run the pipeline. We can load this crate directly from the [`bulker registry`](http://hub.bulker.io/): -``` -bulker load databio/pepatac:1.0.6 -r +```console +git clone https://github.com/databio/pepatac.git ``` -### 3. Activate the `PEPATAC` crate +### 2: Initialize `refgenie` and download assets -Now that we've loaded the `PEPATAC` crate, we need to activate that specific crate so its included tools are available. -``` -bulker activate databio/pepatac:1.0.6 -``` -Now, you can run any of the commands in the crate as if they were natively installed, **but they're actually running in containers**! +`PEPATAC` uses [`refgenie`](http://refgenie.databio.org/) assets for alignment. Because assets are user-dependent, these files must still exist outside of a container system. We need to [install and initialize a refgenie config file.](http://refgenie.databio.org/en/latest/install/). For example: -### 4. Run the pipeline +```console +pip install refgenie +export REFGENIE=/path/to/your_genome_folder/genome_config.yaml +refgenie init -c $REFGENIE +``` -Now we simply run the pipeline like normal, but we wouldn't have needed to install any additional tools! 
+Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists. -#### Run the pipeline at the command line -``` -pipelines/pepatac.py --single-or-paired paired \ - --prealignments rCRSd human_repeats \ - --genome hg38 \ - --sample-name test1 \ - --input examples/data/test1_r1.fastq.gz \ - --input2 examples/data/test1_r2.fastq.gz \ - --genome-size hs \ - -O $HOME/pepatac_test -``` -#### Run the pipeline using looper +Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). -Since `bulker` automatically direct any calls to required software to instead be executed in containers, we can just run our project the exact same way we would when we installed everything natively! -``` -looper run examples/test_project/test_config.yaml +```console +refgenie pull hg38/fasta hg38/bowtie2_index hg38/refgene_anno hg38/ensembl_gtf hg38/ensembl_rb +refgenie build hg38/feat_annotation ``` -## Running `PEPATAC` without `bulker` +`PEPATAC` also requires a `bowtie2_index` asset for any pre-alignment genomes: -You can run `PEPATAC` as an individual pipeline on a single sample using these containers by directly calling `docker run` or `singularity exec`. Or, you can rely on `looper`, which is already set up to run any pipeline in existing containers using the `divvy` templating system. Instructions for both follow: +```console +refgenie pull rCRSd/bowtie2_index +refgenie pull human_repeats/bowtie2_index +``` -First, make sure your environment is set up to run either docker or singularity containers. Then, pull the container image: +### 3. Pull the container image. 
**Docker**: You can pull the docker [databio/pepatac image](https://hub.docker.com/r/databio/pepatac/) from dockerhub like this: -``` +```console docker pull databio/pepatac ``` Or build the image using the included Dockerfile (you can use a recipe in the included Makefile): -``` +```console cd pepatac/ make docker ``` **Singularity**: You can [download the singularity image](http://big.databio.org/simages/pepatac) or build it from the docker image using the Makefile: -``` +```console cd pepatac/ make singularity ``` Now you'll need to tell the pipeline where you saved the singularity image. You can either create an environment variable called `$SIMAGES` that points to the folder where your image is stored, or you can tweak the `pipeline_interface.yaml` file so that the `compute.singularity_image` attribute is pointing to the right location on disk. -If your containers are set up correctly, then won't need to install any additional software. - -## Running individual samples in a container +### 4. Run individual samples in a container Individual jobs can be run in a container by simply running the `pepatac.py` command through `docker run` or `singularity exec`. You can run containers either on your local computer, or in an HPC environment, as long as you have `docker` or `singularity` installed. You will need to include any volumes that contain data required by the pipeline. For example, to utilize `refgenie` assets you'll need to ensure the volume containing those files is available. In the following example, we are including an environment variable (`$GENOMES`) which points to such a directory. For example, run it locally in singularity like this: -``` +```console singularity exec --bind $GENOMES $SIMAGES/pepatac pipelines/pepatac.py --help ``` With `docker`, you can use: -``` +```console docker run --rm -it databio/pepatac pipelines/pepatac.py --help ``` Be sure to mount the volumes you need with `--volume`. If you're utilizing any environment variables (e.g. 
`$GENOMES`), don't forget to include those in your docker command with the `-e` option. @@ -108,7 +81,7 @@ The pipeline has been successfully run in both a Linux and MacOS environment. Wi In the first example, we're mounting our home user directory (`/home/jps3ag/`) which contains the parent directories to our `refgenie` assets (`$GENOMES`) and to the pipeline itself. We'll also provide the pipeline two environment variables, `$GENOMES` and `$HOME`. Here's that example command in a Linux environment to run the test example through the pipeline: -``` +```console docker run --rm -it --volume /home/jps3ag/:/home/jps3ag/ \ -e GENOMES='/home/jps3ag/genomes/' \ -e HOME='/home/jps3ag/' \ @@ -132,7 +105,7 @@ This necessitates a few minor changes to run that same example: Remember to [allocate sufficient memory](https://docs.docker.com/docker-for-mac/#advanced) (6-8GB should generally be adequate) in Docker for Mac. -``` +```console docker run --rm -it --volume /Users/jps3ag/:/Users/jps3ag/ \ -e GENOMES="/Users/jps3ag/genomes" \ -e HOME="/Users/jps3ag/" \ @@ -173,6 +146,6 @@ Third, close your instance when finished. singularity instance stop pepatac_instance ``` -## Running multiple samples in a container with looper +### 5. Running multiple samples in a container with looper To run multiple samples in a container, you simply need to configure `looper` to use a container-compatible template. The looper documentation has instructions for [running jobs in containers](http://looper.databio.org/en/latest/containers/). 
\ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index be1988a8..4deeedb0 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -16,13 +16,13 @@ nav: - Getting Started: - Introduction: 'README.md' - Features: 'features.md' - - Install and run test example: 'install.md' + - Install: 'install.md' - Extended tutorial: 'tutorial.md' - How-to Guides: - - Detailed install guide: 'detailed-install.md' - - Run directly: 'run-directly.md' - - Run on cluster: 'run-cluster.md' + - Run using bulker: 'run-bulker.md' - Run using containers: 'run-container.md' + - Run using conda: 'run-conda.md' + - Run natively: 'detailed-install.md' - Configure assets: 'assets.md' - Configure prealignments: 'prealignments.md' - Configure project files: 'peps.md' From b8799d60c04fabae38d972e6ea2a801661e2c1e7 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 22 Jun 2021 11:20:43 -0400 Subject: [PATCH 07/66] use summits file --- pipelines/pepatac.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 7f459c75..6fa58c1c 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -1963,6 +1963,7 @@ def report_peak_count(): peak_input_file = shift_bed shift_bed_gz = shift_bed + ".gz" peak_bed = os.path.join(peak_folder, args.sample_name + "_peaks.bed") + summits_bed = os.path.join(peak_folder, args.sample_name + "_summits.bed") chr_order = os.path.join(peak_folder, "chr_order.txt") chr_keep = os.path.join(peak_folder, "chr_keep.txt") @@ -2185,16 +2186,24 @@ def report_peak_count(): fixed_peak_file = os.path.join(peak_folder, args.sample_name + "_peaks_fixedWidth.narrowPeak") # If using fixed peaks, extend from summit - if args.peak_type == "fixed": + if args.peak_type == "fixed" and args.peak_caller == "macs2": + temp = tempfile.NamedTemporaryFile(dir=peak_folder, delete=False) # extend peaks from summit by 'extend' # start extend from center of peak - cmd = ("awk -v OFS='" + "\t" + + cmd1 = ("awk -v OFS='" 
+ "\t" + "' '{$2 = int(($3 - $2)/2 + $2 - " + str(args.extend) + "); " + "$3 = int($2 + " + str(2*args.extend) + - "); print}' " + peak_output_file + " > " + fixed_peak_file) + "); print $1, $2, $3}' " + summits_bed + " > " + + temp.name) + # reconstruct narrowPeak file + cmd2 = ("paste " + temp.name + + " <(awk -v OFS='\t' '{print $4, $5, $6, " + + "$7, $8, $9, $10}' " + peak_output_file + ")" + + " >> " + fixed_peak_file) peak_output_file = fixed_peak_file - pm.run(cmd, peak_output_file) + pm.run([cmd1, cmd2], peak_output_file) + pm.clean_add(temp.name) # remove overlapping peaks, peaks extending beyond chromosomes, # and normalize score From fa54e027451c35e889e5bc42cec5434704fd0809 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 22 Jun 2021 12:32:22 -0400 Subject: [PATCH 08/66] include ability to perform checks without grabbing repo --- checkinstall | 164 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 113 insertions(+), 51 deletions(-) diff --git a/checkinstall b/checkinstall index ec573fa9..9e7a6890 100755 --- a/checkinstall +++ b/checkinstall @@ -111,60 +111,118 @@ if ! is_executable "pip"; then BULKER_INSTALL=1 fi -while IFS= read -r line; do - [ "${line:0:1}" = "#" ] && continue - IFS='>=' read -r -a array <<< "$line" - package=${array[0]} - required=${array[2]} - required=$(trim ${required}) - IFS='.' read -r -a required_version <<< "$required" - declare -i rmajor - declare -i rminor - declare -i rpatch - rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') - rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') - rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') - - if ! pip_show "${package}" 2&>/dev/null ; then - echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. 
Try pip install $package.") - # printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 - else - installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') - installed=$(trim ${installed}) - IFS='.' read -r -a installed_version <<< "$installed" - declare -i imajor - declare -i iminor - declare -i ipatch - imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') - iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') - ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') - - if ! [ -z "$required" ]; then - if [ $imajor -lt $rmajor ]; then - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 - elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 - elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 +if [ -f "requirements.txt" ]; then + while IFS= read -r line; do + [ "${line:0:1}" = "#" ] && continue + IFS='>=' read -r -a array <<< "$line" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! 
pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. 
Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi else - echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") fi + fi + done < "requirements.txt" +else + declare -a requiredPkgs=("attmap>=0.12.9" "bio>=0.2.4" "codecov>=2.0" "colorama>=0.3.9" "Cython>=0.29" "cykhash>=1.0.2 " "divvy>=0.5.0" "eido>=0.1.3" "hypothesis==4.38.0" "jinja2" "jsonschema>=3.0.1" "logmuse>=0.2.5" "looper>=1.2.1" "MACS2>=2.2.7.1" "numpy>=1.17" "oyaml" "pararead" "pandas>=0.20.2" "peppy>=0.31.0" "piper" "psutil" "pysam>=0.13" "python-Levenshtein>=0.12.0" "pyyaml>=3.13" "refgenconf>=0.7.0" "refgenie" "ubiquerg>=0.6.1" "yacman>=0.6.7") + + for package in ${requiredPkgs[@]}; do + IFS='>=' read -r -a array <<< "$package" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}" 2&>/dev/null ; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package.") + # printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 else - echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' 
read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. 
Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi fi - fi -done < "requirements.txt" + done +fi # Check tool installation declare -a requiredCommands=("perl" "awk" "grep" "sed" "bedtools" "bowtie2" "fseq" "macs2" "preseq" "samblaster" "samtools" "skewer" "bedToBigBed" "bigWigCat" "wigToBigWig" "Rscript") @@ -358,7 +416,11 @@ else fi CWD=$(pwd) -crate=$(grep 'bulker_crate' $CWD/sample_pipeline_interface.yaml | awk '{print $2}') + +if [ -f "$CWD/sample_pipeline_interface.yaml" ]; then + crate=$(grep 'bulker_crate' $CWD/sample_pipeline_interface.yaml | awk '{print $2}') +else: + crate="databio/pepatac:1.0.7" echo "crate: ${crate}" yes n | bulker load $crate From aeb649f82929a29151ba139e08fddde3a9865dd5 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 22 Jun 2021 12:33:58 -0400 Subject: [PATCH 09/66] update fixed width peak creation --- pipelines/pepatac.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 6fa58c1c..8a291168 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -2197,10 +2197,9 @@ def report_peak_count(): "); print $1, $2, $3}' " + summits_bed + " > " + temp.name) # reconstruct narrowPeak file - cmd2 = ("paste " + temp.name + - " <(awk -v OFS='\t' '{print $4, $5, $6, " + - "$7, $8, $9, $10}' " + peak_output_file + ")" + - " >> " + fixed_peak_file) + cmd2 = ("awk -v OFS='\t' '{print $4, $5, $6, $7, $8, $9, $10}' " + + peak_output_file + " | paste " + temp.name + " - " + + " > " + fixed_peak_file) peak_output_file = fixed_peak_file pm.run([cmd1, cmd2], peak_output_file) pm.clean_add(temp.name) From ab4a1887f5f9d08cdc06ba51832bbba1c08111d1 Mon Sep 17 00:00:00 2001 From: jpsmith5 
Date: Tue, 22 Jun 2021 12:38:47 -0400 Subject: [PATCH 10/66] set rgc.seek runtime warnings to false --- pipelines/pepatac.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 8a291168..b460db62 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -486,7 +486,7 @@ def _add_resources(args, res, asset_dict=None): for reference in args.prealignments: for asset in [GENOME_IDX_KEY]: try: - res[asset] = rgc.seek(reference, asset) + res[asset] = rgc.seek(reference, asset, strict_exists=False) except KeyError: err_msg = "{} for {} is missing from REFGENIE config file." pm.fail_pipeline(KeyError(err_msg.format(asset, reference))) @@ -518,7 +518,8 @@ def _add_resources(args, res, asset_dict=None): res[seek_key] = rgc.seek(args.genome_assembly, asset_name=str(asset), tag_name=str(tag), - seek_key=str(seek_key)) + seek_key=str(seek_key), + strict_exists=False) except KeyError: key_errors.append(item) if req: @@ -989,7 +990,8 @@ def check_trim(): print("Prealignment assemblies: " + str(args.prealignments)) # Loop through any prealignment references and map to them sequentially for reference in args.prealignments: - genome_index = os.path.join(rgc.seek(reference, GENOME_IDX_KEY)) + genome_index = os.path.join( + rgc.seek(reference, GENOME_IDX_KEY, strict_exists=False)) if not os.path.exists(os.path.dirname(genome_index)): msg = "No {} index found in {}; skipping.".format( reference, os.path.dirname(genome_index)) @@ -997,7 +999,9 @@ def check_trim(): else: if not genome_index.endswith(reference): genome_index = os.path.join( - os.path.dirname(rgc.seek(reference, GENOME_IDX_KEY)), + os.path.dirname(rgc.seek(reference, + GENOME_IDX_KEY, + strict_exists=False)), reference) if args.aligner.lower() == "bwa": genome_index += ".fa" @@ -1106,10 +1110,13 @@ def no_handle(fq): if os.path.exists(unmap_fq2 + ".gz"): unmap_fq2 = unmap_fq2 + ".gz" - genome_index = 
os.path.join(rgc.seek(args.genome_assembly, GENOME_IDX_KEY)) + genome_index = os.path.join( + rgc.seek(args.genome_assembly, GENOME_IDX_KEY, strict_exists=False)) if not genome_index.endswith(args.genome_assembly): genome_index = os.path.join( - os.path.dirname(rgc.seek(args.genome_assembly, GENOME_IDX_KEY)), + os.path.dirname(rgc.seek(args.genome_assembly, + GENOME_IDX_KEY, + strict_exists=False)), args.genome_assembly) if args.aligner.lower() == "bwa": genome_index += ".fa" From 8b99b2b15244fa6a6d8d66fe980549e134bd129e Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 22 Jun 2021 12:42:22 -0400 Subject: [PATCH 11/66] fix missing fi --- checkinstall | 1 + 1 file changed, 1 insertion(+) diff --git a/checkinstall b/checkinstall index 9e7a6890..077d4ed8 100755 --- a/checkinstall +++ b/checkinstall @@ -421,6 +421,7 @@ if [ -f "$CWD/sample_pipeline_interface.yaml" ]; then crate=$(grep 'bulker_crate' $CWD/sample_pipeline_interface.yaml | awk '{print $2}') else: crate="databio/pepatac:1.0.7" +fi echo "crate: ${crate}" yes n | bulker load $crate From 41937be9ebecd8a14c556959c345306452f062f1 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 22 Jun 2021 12:42:49 -0400 Subject: [PATCH 12/66] fix colon typo --- checkinstall | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/checkinstall b/checkinstall index 077d4ed8..36513bdb 100755 --- a/checkinstall +++ b/checkinstall @@ -419,7 +419,7 @@ CWD=$(pwd) if [ -f "$CWD/sample_pipeline_interface.yaml" ]; then crate=$(grep 'bulker_crate' $CWD/sample_pipeline_interface.yaml | awk '{print $2}') -else: +else crate="databio/pepatac:1.0.7" fi echo "crate: ${crate}" From 541b182cd48f6858e25b42940836fe291e98ab41 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 22 Jun 2021 13:21:07 -0400 Subject: [PATCH 13/66] modify bulker checks and check for docker or singularity first --- checkinstall | 91 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 28 deletions(-) diff --git a/checkinstall 
b/checkinstall index 36513bdb..c8e80900 100755 --- a/checkinstall +++ b/checkinstall @@ -406,41 +406,76 @@ echo -e "-----------------------------------------------------------" echo -e "Checking bulker installation... " BULKER_INSTALL=0 -if ! is_executable "bulker"; then - echo $(warn "WARNING: To use bulker, pip install bulker and checkinstall again.") - printf "\n" - BULKER_INSTALL=1 +if ! is_executable "docker"; then + DOCKER=1 else - BULKER_INSTALL=0 - echo -e $(success "SUCCESS: bulker") + DOCKER=0 fi -CWD=$(pwd) - -if [ -f "$CWD/sample_pipeline_interface.yaml" ]; then - crate=$(grep 'bulker_crate' $CWD/sample_pipeline_interface.yaml | awk '{print $2}') +if ! is_executable "singularity"; then + SINGULARITY=1 else - crate="databio/pepatac:1.0.7" + SINGULARITY=0 fi -echo "crate: ${crate}" - -yes n | bulker load $crate -if [ $? -eq 0 ]; then - echo $(warn "WARNING: Could not bulker load ${crate}. Check out https://bulker.databio.org/en/latest/install/.") - printf "\n" - BULKER_INSTALL=1 +if [ "$DOCKER" -eq 0 ]; then + echo -e $(success "SUCCESS: PEPATAC can be run using docker.") fi -isActivatable=$(bulker run ${crate} $CWD/pipelines/pepatac.py --help) -#echo "isActivatable: ${isActivatable}" -if [ -v "$isActivatable" ]; then - echo $(warn "WARNING: Could not activate the bulker crate, ${crate}. Check out https://bulker.databio.org/en/latest/install/.") - printf "\n" - BULKER_INSTALL=1 +if [ "$SINGULARITY" -eq 0 ]; then + echo -e $(success "SUCCESS: PEPATAC can be run using singularity.") +fi + +if [ "$DOCKER" -eq 1 ] && [ "$SINGULARITY" -eq 1 ]; then + echo -e $(fail "ERROR: PEPATAC cannot be run using bulker. Please install docker or singularity.") else - BULKER_INSTALL=0 - echo -e $(success "SUCCESS: bulker run ${crate}") + if ! 
is_executable "bulker"; then + echo $(warn "WARNING: To use bulker, pip install bulker and checkinstall again.") + printf "\n" + BULKER_INSTALL=1 + else + BULKER_INSTALL=0 + echo -e $(success "SUCCESS: bulker") + fi + + CWD=$(pwd) + + if [ -f "$CWD/sample_pipeline_interface.yaml" ]; then + crate=$(grep 'bulker_crate' $CWD/sample_pipeline_interface.yaml | awk '{print $2}') + else + crate="databio/pepatac:1.0.7" + fi + echo "crate: ${crate}" + + yes n | bulker load $crate + + if [ $? -eq 0 ]; then + echo $(warn "WARNING: Could not bulker load ${crate}. Check out https://bulker.databio.org/en/latest/install/.") + printf "\n" + BULKER_INSTALL=1 + fi + + if [ -f "$CWD/pipelines/pepatac.py" ]; then + isActivatable=$(bulker run ${crate} $CWD/pipelines/pepatac.py --help) + #echo "isActivatable: ${isActivatable}" + if [ -v "$isActivatable" ]; then + echo $(warn "WARNING: Could not activate the bulker crate, ${crate}. Check out https://bulker.databio.org/en/latest/install/.") + printf "\n" + BULKER_INSTALL=1 + else + BULKER_INSTALL=0 + echo -e $(success "SUCCESS: bulker run ${crate}") + fi + else + isActivatable=$(bulker run ${crate} bowtie2 --help) + if [ -v "$isActivatable" ]; then + echo $(warn "WARNING: Could not activate the bulker crate, ${crate}. 
Check out https://bulker.databio.org/en/latest/install/.") + printf "\n" + BULKER_INSTALL=1 + else + BULKER_INSTALL=0 + echo -e $(success "SUCCESS: bulker run ${crate}") + fi fi ################################################################################ @@ -453,13 +488,13 @@ else echo -e $(fail "ERROR: PEPATAC cannot be run using native installations.") fi -if [ "$CONDA_INSTALL" -eq 0 ]; then +if [ "$CONDA_INSTALL" -eq 0 ]; then echo -e $(success "SUCCESS: PEPATAC can be run using conda installation!") else echo -e $(fail "ERROR: PEPATAC cannot be run via conda.") fi -if [ "$BULKER_INSTALL" -eq 0 ]; then +if [ "$BULKER_INSTALL" -eq 0 ]; then echo -e $(success "SUCCESS: PEPATAC can be run using bulker!") else echo -e $(fail "ERROR: PEPATAC cannot be run using bulker.") From 3697f5bdb0a952651c451635493ae14c92621a91 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 22 Jun 2021 13:28:07 -0400 Subject: [PATCH 14/66] fix missing fi --- checkinstall | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/checkinstall b/checkinstall index c8e80900..4d3b547f 100755 --- a/checkinstall +++ b/checkinstall @@ -419,15 +419,16 @@ else fi if [ "$DOCKER" -eq 0 ]; then - echo -e $(success "SUCCESS: PEPATAC can be run using docker.") + echo -e $(success "SUCCESS: docker.") fi if [ "$SINGULARITY" -eq 0 ]; then - echo -e $(success "SUCCESS: PEPATAC can be run using singularity.") + echo -e $(success "SUCCESS: singularity.") fi if [ "$DOCKER" -eq 1 ] && [ "$SINGULARITY" -eq 1 ]; then echo -e $(fail "ERROR: PEPATAC cannot be run using bulker. Please install docker or singularity.") + BULKER_INSTALL=1 else if ! 
is_executable "bulker"; then echo $(warn "WARNING: To use bulker, pip install bulker and checkinstall again.") @@ -445,6 +446,7 @@ else else crate="databio/pepatac:1.0.7" fi + echo "crate: ${crate}" yes n | bulker load $crate @@ -475,6 +477,7 @@ else else BULKER_INSTALL=0 echo -e $(success "SUCCESS: bulker run ${crate}") + fi fi fi From c451204618fce3f2b87177d24e34081dba786c75 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 23 Jun 2021 09:48:51 -0400 Subject: [PATCH 15/66] update bulker checks --- checkinstall | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/checkinstall b/checkinstall index 4d3b547f..62ae0f57 100755 --- a/checkinstall +++ b/checkinstall @@ -419,15 +419,21 @@ else fi if [ "$DOCKER" -eq 0 ]; then - echo -e $(success "SUCCESS: docker.") + CMD_CHECK=$(docker --help) + if [ $? -eq 0 ]; then + echo -e $(success "SUCCESS: docker.") + else + echo -e $(warn "WARNING: Docker is a recognized command, but does not appear to be active. Please ensure docker is running and checkinstall again.") + DOCKER=1 + fi fi if [ "$SINGULARITY" -eq 0 ]; then echo -e $(success "SUCCESS: singularity.") fi - + if [ "$DOCKER" -eq 1 ] && [ "$SINGULARITY" -eq 1 ]; then - echo -e $(fail "ERROR: PEPATAC cannot be run using bulker. Please install docker or singularity.") + echo -e $(fail "ERROR: bulker") BULKER_INSTALL=1 else if ! is_executable "bulker"; then @@ -458,25 +464,28 @@ else fi if [ -f "$CWD/pipelines/pepatac.py" ]; then - isActivatable=$(bulker run ${crate} $CWD/pipelines/pepatac.py --help) - #echo "isActivatable: ${isActivatable}" - if [ -v "$isActivatable" ]; then + CMD_CHECK=$(bulker run ${crate} $CWD/pipelines/pepatac.py --help) + EXIT_CODE=$(echo $?) + isActivatable=$(echo "${EXIT_CODE}" | awk '{ print $1+0; exit }') + if [ "$isActivatable" -eq 0 ]; then + BULKER_INSTALL=0 + echo -e $(success "SUCCESS: bulker run ${crate}") + else echo $(warn "WARNING: Could not activate the bulker crate, ${crate}. 
Check out https://bulker.databio.org/en/latest/install/.") printf "\n" BULKER_INSTALL=1 - else - BULKER_INSTALL=0 - echo -e $(success "SUCCESS: bulker run ${crate}") fi else - isActivatable=$(bulker run ${crate} bowtie2 --help) - if [ -v "$isActivatable" ]; then + CMD_CHECK=$(bulker run ${crate} bowtie2 --help) + EXIT_CODE=$(echo $?) + isActivatable=$(echo "${EXIT_CODE}" | awk '{ print $1+0; exit }') + if [ "$isActivatable" -eq 0 ]; then + BULKER_INSTALL=0 + echo -e $(success "SUCCESS: bulker run ${crate}") + else echo $(warn "WARNING: Could not activate the bulker crate, ${crate}. Check out https://bulker.databio.org/en/latest/install/.") printf "\n" BULKER_INSTALL=1 - else - BULKER_INSTALL=0 - echo -e $(success "SUCCESS: bulker run ${crate}") fi fi fi From f78fb6b5dd85c6dad736f15b25a9592a22396ee3 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 23 Jun 2021 10:55:27 -0400 Subject: [PATCH 16/66] fix numbered list --- docs/consensus_peaks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/consensus_peaks.md b/docs/consensus_peaks.md index f6ed4b1c..344190e7 100644 --- a/docs/consensus_peaks.md +++ b/docs/consensus_peaks.md @@ -4,7 +4,7 @@ When a `PEP` project contains more than one sample, the `PEPATAC` project proces For example: `looper runp examples/test_project/test_config.yaml` -For the consensus peak generation, the pipeline performs the following steps: +For the consensus peak generation, the pipeline performs the following steps: 1. Overlapping peaks are identified among all the project samples. 2. For each set of overlapping peaks, the consensus peak's coordinates are defined as the coordinates of the peak with the maximum score among the set of overlapping peaks. 3. Any peaks that extend beyond chromosomes are trimmed. 
From 4ee4866d5cad911f732df56942b8e9546ed994ff Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 23 Jun 2021 15:29:04 -0400 Subject: [PATCH 17/66] embed equations --- docs/glossary.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/glossary.md b/docs/glossary.md index 7418a1b0..41ccf6bf 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -8,17 +8,17 @@ Additional information relevant to `PEPATAC` output and interpretation includes: - **Raw reads**: The original number of reads in the input files. - **Trimmed reads**: The number of reads remaining after trimming. -- **Trim loss rate**: The percentage of total reads that was trimmed. i.e. $\frac{(Raw\_reads - Trimmed\_reads)\times100}{Raw\_reads}$ +- **Trim loss rate**: The percentage of total reads that was trimmed. i.e. - **Mapped reads**: The number of reads mapped to primary genome. - **QC filtered reads**: The number of reads removed due to poor MAPQ values (i.e. <10). - **Aligned reads**: (Mapped_reads - QC_filtered_reads) -- **Alignment rate**: The percentage of trimmed reads that mapped to the primary genome. i.e. $\frac{Aligned\_reads}{Trimmed\_reads}$ $\times$ 100. In this case, trimmed reads represent the maximum number of reads that even have the potential to be mapped. -- **Total efficiency** - The percentage of raw reads that mapped to the primary genome. i.e. $\frac{Aligned\_reads}{Raw\_reads}$ $\times$ 100. Here we're looking at even reads that necessitated trimming, which gives an idea of how well your sample preparation was if, for example, a large number of reads had required trimming. If the efficiency is very poor but you had a high alignment rate, it would suggest an issue with sample prep because so many reads were trimmed. +- **Alignment rate**: The percentage of trimmed reads that mapped to the primary genome. i.e. . In this case, trimmed reads represent the maximum number of reads that even have the potential to be mapped. 
+- **Total efficiency** - The percentage of raw reads that mapped to the primary genome. i.e. . Here we're looking at even reads that necessitated trimming, which gives an idea of how well your sample preparation was if, for example, a large number of reads had required trimming. If the efficiency is very poor but you had a high alignment rate, it would suggest an issue with sample prep because so many reads were trimmed. - **Unmapped reads**: The number of trimmed reads that remains unmapped following prealignment and primary alignment. i.e. Trimmed_reads - Aligned_reads(prealignments) - Mapped_reads - **Duplicate reads**: The number of duplicate reads removed from the mapped reads. - **Dedup aligned reads**: The number of aligned reads following duplicate removal. i.e. Aligned_reads - Duplicate_reads -- **Dedup alignment rate**: The number of deduplicated, aligned reads out of the number of trimmed reads. i.e. $\frac{Dedup\_aligned\_reads}{Trimmed\_reads}$ -- **Dedup total efficiency**: The number of deduplicated, aligned reads out of the number of raw reads. i.e. $\frac{Dedup\_aligned\_reads}{Raw\_reads}$ +- **Dedup alignment rate**: The number of deduplicated, aligned reads out of the number of trimmed reads. i.e. +- **Dedup total efficiency**: The number of deduplicated, aligned reads out of the number of raw reads. i.e. 
## Peak calling output From 5ed0ee98d3b09fd0966295d2a769585aef8129a5 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Thu, 24 Jun 2021 13:31:10 -0400 Subject: [PATCH 18/66] add variable macs2 param customization --- pipelines/pepatac.py | 7 ++++++- pipelines/pepatac.yaml | 6 ++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index b460db62..e47946ab 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -2137,7 +2137,12 @@ def report_peak_count(): ("-n", args.sample_name), ("-g", args.genome_size) ] - cmd_base.extend(param.macs2.params.split()) + if args.peak_type == "fixed": + cmd_base.extend(param.macs2.params.split()) + elif args.peak_type == "variable": + cmd_base.extend(param.macs2_variable.params.split()) + else: + cmd_base.extend(param.macs2.params.split()) cmd = build_command(cmd_base) # Call peaks and report peak count. diff --git a/pipelines/pepatac.yaml b/pipelines/pepatac.yaml index 8c6975e3..51897db1 100644 --- a/pipelines/pepatac.yaml +++ b/pipelines/pepatac.yaml @@ -74,6 +74,12 @@ parameters: # parameters passed to bioinformatic tools, subclassed by tool # -q: The qvalue (minimum FDR) cutoff to call significant regions. # --shift: Assign an arbitrary shift in bp. See MACS documentation. # --nomodel: Will bybass building the shifting model. + macs2_variable: + params: '-f BED -q 0.01 --shift 0 --nomodel' + # -f: Format of tag file. + # -q: The qvalue (minimum FDR) cutoff to call significant regions. + # --shift: Assign an arbitrary shift in bp. See MACS documentation. + # --nomodel: Will bybass building the shifting model. fseq: params: '-of npf -l 600 -t 4.0 -s 1' # -of: narrowPeak as output format. 
From fc49b6082d9abed1febe2112bcfdf3234bafd6e3 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 25 Jun 2021 08:24:43 -0400 Subject: [PATCH 19/66] update curl use --- checkinstall | 215 +++++++++++++++++++-------------------------------- 1 file changed, 80 insertions(+), 135 deletions(-) diff --git a/checkinstall b/checkinstall index 62ae0f57..1ea05b3b 100755 --- a/checkinstall +++ b/checkinstall @@ -112,117 +112,65 @@ if ! is_executable "pip"; then fi if [ -f "requirements.txt" ]; then - while IFS= read -r line; do - [ "${line:0:1}" = "#" ] && continue - IFS='>=' read -r -a array <<< "$line" - package=${array[0]} - required=${array[2]} - required=$(trim ${required}) - IFS='.' read -r -a required_version <<< "$required" - declare -i rmajor - declare -i rminor - declare -i rpatch - rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') - rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') - rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') - - if ! pip_show "${package}"; then - echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 - else - installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') - installed=$(trim ${installed}) - IFS='.' read -r -a installed_version <<< "$installed" - declare -i imajor - declare -i iminor - declare -i ipatch - imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') - iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') - ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') - - if ! [ -z "$required" ]; then - if [ $imajor -lt $rmajor ]; then - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. 
Try pip install --upgrade $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 - elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 - elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 - else - echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") - fi - else - echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") - fi - fi - done < "requirements.txt" + REQS="requirements.txt" else - declare -a requiredPkgs=("attmap>=0.12.9" "bio>=0.2.4" "codecov>=2.0" "colorama>=0.3.9" "Cython>=0.29" "cykhash>=1.0.2 " "divvy>=0.5.0" "eido>=0.1.3" "hypothesis==4.38.0" "jinja2" "jsonschema>=3.0.1" "logmuse>=0.2.5" "looper>=1.2.1" "MACS2>=2.2.7.1" "numpy>=1.17" "oyaml" "pararead" "pandas>=0.20.2" "peppy>=0.31.0" "piper" "psutil" "pysam>=0.13" "python-Levenshtein>=0.12.0" "pyyaml>=3.13" "refgenconf>=0.7.0" "refgenie" "ubiquerg>=0.6.1" "yacman>=0.6.7") - - for package in ${requiredPkgs[@]}; do - IFS='>=' read -r -a array <<< "$package" - package=${array[0]} - required=${array[2]} - required=$(trim ${required}) - IFS='.' read -r -a required_version <<< "$required" - declare -i rmajor - declare -i rminor - declare -i rpatch - rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') - rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') - rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') - - if ! 
pip_show "${package}" 2&>/dev/null ; then - echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package.") - # printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 - else - installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') - installed=$(trim ${installed}) - IFS='.' read -r -a installed_version <<< "$installed" - declare -i imajor - declare -i iminor - declare -i ipatch - imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') - iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') - ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + REQS=$(curl https://raw.githubusercontent.com/databio/pepatac/master/requirements.txt) +fi - if ! [ -z "$required" ]; then - if [ $imajor -lt $rmajor ]; then - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 - elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 - elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 - else - echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") - fi +while IFS= read -r line; do + [ "${line:0:1}" = "#" ] && continue + IFS='>=' read -r -a array <<< "$line" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + IFS='.' 
read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. 
Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 else - echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") fi - done -fi + fi +done < $REQS # Check tool installation declare -a requiredCommands=("perl" "awk" "grep" "sed" "bedtools" "bowtie2" "fseq" "macs2" "preseq" "samblaster" "samtools" "skewer" "bedToBigBed" "bigWigCat" "wigToBigWig" "Rscript") @@ -303,6 +251,12 @@ else CONDA_INSTALL=1 fi + if [ -f "requirements.txt" ]; then + REQS="requirements.txt" + else + REQS=$(curl https://raw.githubusercontent.com/databio/pepatac/master/requirements.txt) + fi + while IFS= read -r line; do [ "${line:0:1}" = "#" ] && continue IFS='>=' read -r -a array <<< "$line" @@ -355,7 +309,7 @@ else echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") fi fi - done < "requirements.txt" + done < $REQS # Check tool installation declare -a requiredCommands=("perl" "awk" "grep" "sed" "bedtools" "bowtie2" "fseq" "macs2" "preseq" "samblaster" "samtools" "skewer" "bedToBigBed" "bigWigCat" "wigToBigWig" "Rscript") @@ -448,45 +402,36 @@ else CWD=$(pwd) if [ -f "$CWD/sample_pipeline_interface.yaml" ]; then - crate=$(grep 'bulker_crate' $CWD/sample_pipeline_interface.yaml | awk '{print $2}') + IFACE="$CWD/sample_pipeline_interface.yaml" else - crate="databio/pepatac:1.0.7" + IFACE=$(curl https://raw.githubusercontent.com/databio/pepatac/master/sample_pipeline_interface.yaml) fi - echo "crate: ${crate}" - - yes n | bulker load $crate + CRATE=$(grep 'bulker_crate' $IFACE | awk '{print $2}') + yes n | bulker load $CRATE if [ $? -eq 0 ]; then - echo $(warn "WARNING: Could not bulker load ${crate}. 
Check out https://bulker.databio.org/en/latest/install/.") + echo $(warn "WARNING: Could not bulker load ${CRATE}. Check out https://bulker.databio.org/en/latest/install/.") printf "\n" BULKER_INSTALL=1 fi if [ -f "$CWD/pipelines/pepatac.py" ]; then - CMD_CHECK=$(bulker run ${crate} $CWD/pipelines/pepatac.py --help) - EXIT_CODE=$(echo $?) - isActivatable=$(echo "${EXIT_CODE}" | awk '{ print $1+0; exit }') - if [ "$isActivatable" -eq 0 ]; then - BULKER_INSTALL=0 - echo -e $(success "SUCCESS: bulker run ${crate}") - else - echo $(warn "WARNING: Could not activate the bulker crate, ${crate}. Check out https://bulker.databio.org/en/latest/install/.") - printf "\n" - BULKER_INSTALL=1 - fi + PIPELINE="$CWD/pipelines/pepatac.py" else - CMD_CHECK=$(bulker run ${crate} bowtie2 --help) - EXIT_CODE=$(echo $?) - isActivatable=$(echo "${EXIT_CODE}" | awk '{ print $1+0; exit }') - if [ "$isActivatable" -eq 0 ]; then - BULKER_INSTALL=0 - echo -e $(success "SUCCESS: bulker run ${crate}") - else - echo $(warn "WARNING: Could not activate the bulker crate, ${crate}. Check out https://bulker.databio.org/en/latest/install/.") - printf "\n" - BULKER_INSTALL=1 - fi + PIPELINE=$(curl https://raw.githubusercontent.com/databio/pepatac/master/pipelines/pepatac.py) + fi + + CMD_CHECK=$(bulker run ${CRATE} $PIPELINE --help) + EXIT_CODE=$(echo $?) + isActivatable=$(echo "${EXIT_CODE}" | awk '{ print $1+0; exit }') + if [ "$isActivatable" -eq 0 ]; then + BULKER_INSTALL=0 + echo -e $(success "SUCCESS: bulker run ${CRATE}") + else + echo $(warn "WARNING: Could not activate the bulker crate, ${CRATE}. 
Check out https://bulker.databio.org/en/latest/install/.") + printf "\n" + BULKER_INSTALL=1 fi fi From a711cf1e0f8bd16122b902dd331344d912f5c462 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 25 Jun 2021 08:43:01 -0400 Subject: [PATCH 20/66] refgenie populate commands --- pipelines/pepatac.py | 168 ++++----------------------------- sample_pipeline_interface.yaml | 75 +++++++++------ 2 files changed, 66 insertions(+), 177 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index aac6c08b..8aa8c999 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -120,27 +120,16 @@ def parse_arguments(): "that appear with some sequence read files.") # Genome assets - prealignment_index = parser.add_mutually_exclusive_group(required=False) - prealignment_index.add_argument("--prealignment-bowtie2-index", - default=[], type=str, nargs="+", dest="prealignment_bowtie2_index", - help="Space-delimited list of paths to bowtie2 " - "prefixes to align to before primary alignment " - "(minus trailing .X.bt2).") - prealignment_index.add_argument("--prealignment-bwa-index", default=[], - type=str, nargs="+", dest="prealignment_bwa_index", - help="Space-delimited list of paths to bwa " - "index directories to align to before primary alignment.") - - primary_index = parser.add_mutually_exclusive_group(required=True) - primary_index.add_argument("--bowtie2-index", default=None, - dest="bowtie2_index", type=str, - help="Path to primary genome bowtie2 prefix " - "(minus trailing .X.bt2).") - primary_index.add_argument("--bwa-index", default=None, - dest="bwa_index", type=str, - help="Path to primary genome bwa index " - "directory.") - + parser.add_argument("--prealignments", default=[], type=str, + nargs="+", + help="Space-delimited list of reference genomes to " + "align to before primary alignment.") + # Genome assets + parser.add_argument("--genome-index", default=None, required=True, + dest="genome_index", type=str, + help="Path to primary genome index file. 
Either a " + "bowtie2 or bwa index.") + parser.add_argument("--chrom-sizes", default=None, required=True, dest="chrom_sizes", type=str, help="Path to primary genome chromosome sizes file.") @@ -614,76 +603,11 @@ def main(): GENOME_IDX_KEY = "bowtie2_index" # Add prealignment genome annotation files to resources - res.prealignment_index = [] - if args.prealignment_bowtie2_index and args.prealignment_bwa_index: - err_msg = (f"Incompatible prealignment settings: You specified a " - f"bowtie2 and bwa index.") - pm.fail_pipeline(RuntimeError(err_msg)) - elif args.prealignment_bowtie2_index: - if args.aligner.lower() == "bwa": - err_msg = (f"Incompatible settings: You specified bwa as your " - f"aligner but are using --prealignment-bowtie2-index " - f"to pass indices.") - pm.fail_pipeline(RuntimeError(err_msg)) - for index in args.prealignment_bowtie2_index: - if not os.path.exists(os.path.dirname(index)): - err_msg = (f"Could not find {index}.") - pm.info(IOError(err_msg)) - else: - res.prealignment_index.append(index) - elif args.prealignment_bwa_index: - if args.aligner.lower() == "bowtie2": - err_msg = (f"Incompatible settings: You specified bowtie2 as your " - f"aligner but are using --prealignment-bwa " - f"to pass indices.") - pm.fail_pipeline(RuntimeError(err_msg)) - for index in args.prealignment_bwa_index: - if not os.path.exists(os.path.dirname(index)): - err_msg = (f"Could not find {index}.") - pm.info(IOError(err_msg)) - else: - res.prealignment_index.append(index) - else: - pm.warning(f"Unable to find any prealignment indices. 
If this appears " - f"incorrect, confirm you passed the full path to each " - f"index directory prefix.") - + pm.info(f"prealignments: {args.prealignments}") + # Add primary genome annotation files to resources - if (args.bowtie2_index and args.bwa_index): - err_msg = (f"Incompatible settings: You specified a bowtie2 and " - f"bwa index.") - pm.fail_pipeline(RuntimeError(err_msg)) - elif (os.path.exists(os.path.dirname(args.bowtie2_index)) and not - args.bwa_index): - if args.aligner.lower() == "bwa": - err_msg = (f"Incompatible settings: You specified bwa as your " - f"aligner but are using --bwa-index " - f"to specify the index.") - pm.fail_pipeline(RuntimeError(err_msg)) - else: - res.genome_index = args.bowtie2_index - elif (os.path.exists(os.path.dirname(args.bwa_index)) and not - args.bowtie2_index): - if args.aligner.lower() == "bowtie2": - err_msg = (f"Incompatible settings: You specified bowtie2 as your " - f"aligner but are using --bowtie2-index " - f"to specify the index.") - pm.fail_pipeline(RuntimeError(err_msg)) - else: - res.genome_index = args.bwa_index - else: - err_msg = (f"A genome index file for {args.genome_assembly} " - f"for {args.aligner} is required.") - pm.fail_pipeline(IOError(err_msg)) - - if (args.chrom_sizes and os.path.isfile(args.chrom_sizes) and - os.stat(args.chrom_sizes).st_size > 0): - res.chrom_sizes = args.chrom_sizes - else: - err_msg = (f"A chromosome sizes file for {args.genome_assembly} " - f"is required.") - pm.fail_pipeline(IOError(err_msg)) - + pm.info(f"primary genome index: {args.genome_index}") + # Add optional files to resources if args.sob and not args.search_file: err_msg = (f"You specified --sob but did not include the path to" @@ -981,7 +905,6 @@ def check_trim(): "See http://pepatac.databio.org/en/latest/ for documentation.") else: # Loop through any prealignment references and map to them sequentially -<<<<<<< HEAD for count, genome_index in enumerate(res.prealignment_index): pm.info(f"Aligning with 
{args.aligner} to {genome_index}.") assembly_identifier = f"prealignment_{count}" @@ -1009,48 +932,6 @@ def check_trim(): to_compress.append(unmap_fq1) if args.paired_end: to_compress.append(unmap_fq2) -======= - for reference in args.prealignments: - genome_index = os.path.join( - rgc.seek(reference, GENOME_IDX_KEY, strict_exists=False)) - if not os.path.exists(os.path.dirname(genome_index)): - msg = "No {} index found in {}; skipping.".format( - reference, os.path.dirname(genome_index)) - print(msg) - else: - if not genome_index.endswith(reference): - genome_index = os.path.join( - os.path.dirname(rgc.seek(reference, - GENOME_IDX_KEY, - strict_exists=False)), - reference) - if args.aligner.lower() == "bwa": - genome_index += ".fa" - if args.no_fifo: - unmap_fq1, unmap_fq2 = _align( - args, tools, args.paired_end, False, - unmap_fq1, unmap_fq2, reference, - assembly=genome_index, - outfolder=param.outfolder, - aligndir="prealignments", - bt2_opts_txt=param.bowtie2_pre.params, - bwa_opts_txt=param.bwa_pre.params) - to_compress.append(unmap_fq1) - if args.paired_end: - to_compress.append(unmap_fq2) - else: - unmap_fq1, unmap_fq2 = _align( - args, tools, args.paired_end, True, - unmap_fq1, unmap_fq2, reference, - assembly=genome_index, - outfolder=param.outfolder, - aligndir="prealignments", - bt2_opts_txt=param.bowtie2_pre.params, - bwa_opts_txt=param.bwa_pre.params) - to_compress.append(unmap_fq1) - if args.paired_end: - to_compress.append(unmap_fq2) ->>>>>>> dev pm.timestamp("### Compress all unmapped read files") # Confirm pairing is complete @@ -1130,20 +1011,6 @@ def no_handle(fq): if os.path.exists(unmap_fq2 + ".gz"): unmap_fq2 = unmap_fq2 + ".gz" -<<<<<<< HEAD -======= - genome_index = os.path.join( - rgc.seek(args.genome_assembly, GENOME_IDX_KEY, strict_exists=False)) - if not genome_index.endswith(args.genome_assembly): - genome_index = os.path.join( - os.path.dirname(rgc.seek(args.genome_assembly, - GENOME_IDX_KEY, - strict_exists=False)), - 
args.genome_assembly) - if args.aligner.lower() == "bwa": - genome_index += ".fa" - ->>>>>>> dev if args.aligner.lower() == "bwa": cmd = tools.bwa + " mem -t " + str(pm.cores) cmd += " " + bwa_options @@ -2147,7 +2014,12 @@ def report_peak_count(): ("-n", args.sample_name), ("-g", args.genome_size) ] - cmd_base.extend(param.macs2.params.split()) + if args.peak_type == "fixed": + cmd_base.extend(param.macs2.params.split()) + elif args.peak_type == "variable": + cmd_base.extend(param.macs2_variable.params.split()) + else: + cmd_base.extend(param.macs2.params.split()) cmd = build_command(cmd_base) # Call peaks and report peak count. diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index a2b3ff3a..68b05427 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -5,51 +5,68 @@ input_schema: pepatac_input_schema.yaml output_schema: pepatac_output_schema.yaml command_template: > python {pipeline.path} - --sample-name {sample.sample_name} - --genome {sample.genome} - --input {sample.read1} - --single-or-paired {sample.read_type} - --chrom-sizes {refgenie.fasta.chrom_sizes} - -O {looper.results_subdir} - -P {compute.cores} - -M {compute.mem} - {% if sample.read2 is defined %} --input2 {sample.read2} {% endif %} - {% if sample.trimmer is defined %} --trimmer {sample.trimmer} {% endif %} - {% if sample.aligner is defined %} --aligner {sample.aligner} {% endif %} - {% if sample.aligner == "bowtie2" %} --bowtie2-index {refgenie.bowtie2_index.dir} {% endif %} - {% if sample.aligner == "bwa" %} --bwa-index {refgenie.bwa_index.dir} {% endif %} - {% if sample.deduplicator is defined %} --deduplicator {sample.deduplicator} {% endif %} - {% if sample.peak_caller is defined %} --peak-caller {sample.peak_caller} {% endif %} - {% if sample.macs_genome_size is defined %} --genome-size {sample.macs_genome_size} {% endif %} - {% if sample.peak_type is defined %} --peak-type {sample.peak_type} {% endif %} - {% if sample.extend is 
defined %} --extend {sample.extend} {% endif %} - {% if sample.frip_ref_peaks is defined %} --frip-ref-peaks {sample.frip_ref_peaks} {% endif %} + --output_parent {{ looper.results_subdir }} + --cores {{ compute.cores }} + --mem {{ compute.mem }} + --sample_name {{ sample.sample_name }} + --input {{ sample.read1 }} + {% if sample.read2 is defined %} --input2 {{ sample.read2 }} {% endif %} + --single_or_paired {{ sample.read_type }} + --genome {{ sample.genome }} + --chrom_sizes {{ refgenie[sample.genome]["fasta"]["chrom_sizes"] }} + --TSS_name {{ refgenie[sample.genome]["refgene_anno"]["refgene_tss"] }} + --blacklist {{ refgenie[sample.genome]["blacklist"] }} + --anno_name {{ refgenie[sample.genome]["feat_annotation"] }} + {% if sample.trimmer is defined %} --trimmer {{ sample.trimmer }} {% endif %} + {% if sample.aligner is defined %} --aligner {{ sample.aligner }} {% endif %} + {% if sample.aligner == "bowtie2" %} + {% if sample.genome_index is defined %} + --genome_index {{ sample.genome_index }} + {% else %} + --genome_index {{ refgenie[sample.genome]["bowtie2_index"]["dir"] }} + refgenie + --prealignments {% for p in sample.prealignments %} + {{ p }} {{ refgenie[p]["bowtie2_index"]["dir"] }}{% endfor %} + {% endif %} + {% else %} + {% if sample.genome_index is defined %} + --genome_index {{ sample.genome_index }} + {% else %} + --genome_index {{ refgenie[sample.genome]["bwa_index"]["dir"] }} + refgenie + --prealignments {% for p in sample.prealignments %} + {{ p }} {{ refgenie[p]["bwa_index"]["dir"] }}{% endfor %} + {% endif %} + {% endif %} + {% if sample.deduplicator is defined %} --deduplicator {{ sample.deduplicator }} {% endif %} + {% if sample.peak_caller is defined %} --peak_caller {{ sample.peak_caller }} {% endif %} + {% if sample.peak_type is defined %} --peak_type {{ sample.peak_type }} {% endif %} + {% if sample.extend is defined %} --extend {{ sample.extend }} {% endif %} + {% if sample.genome_size is defined %} --genome_size {{ sample.genome_size 
}} {% endif %} + {% if sample.frip_ref_peaks is defined %} --frip-ref-peaks {{ sample.frip_ref_peaks }} {% endif %} {% if sample.motif is defined %} --motif {% endif %} {% if sample.sob is defined %} --sob {% endif %} - {% if sample.sob is defined %} --search-file {refgenie.tallymer_index.search_file} {% endif %} - {% if sample.no_scale is defined %} --no-scale {% endif %} - {% if sample.prioritize is defined %} --prioritize {% endif %} + {% if sample.sob is defined %} --search_file {{ refgenie.tallymer_index.search_file }} {% endif %} + {% if sample.no_scale is defined %} --no_scale {% endif %} + {% if sample.prioritize is defined %} --prioritize {% endif %} {% endif %} {% if sample.keep is defined %} --keep {% endif %} {% if sample.no_fifo is defined %} --noFIFO {% endif %} {% if sample.lite is defined %} --lite {% endif %} {% if sample.skipqc is defined %} --skipqc {% endif %} - {% if sample.prealignment_bowtie2_index is defined %} --prealignment-bowtie2-index {refgenie.bowtie2_index.dir} {% endif %} - {% if sample.prealignment_bwa_index is defined %} --prealignment-bwa-index {refgenie.bwa_index.bwa_index} {% endif %} - --TSS-name {refgenie.refgene_anno.refgene_tss} - --blacklist {refgenie.blacklist} - --anno-name {refgenie.feat_annotation} compute: singularity_image: ${SIMAGES}pepatac conda_env: pepatac - docker_image: databio/pepatac - bulker_crate: databio/pepatac:1.0.7 + docker_image: databio/pepatac + bulker_crate: databio/pepatac 1.0.7 size_dependent_variables: resources-sample.tsv bioconductor: readFunName: runCOCOA readFunPath: BiocProject/runCOCOA.R var_templates: refgenie_config: "$REFGENIE" + custom_template: sample_template.jinja pre_submit: python_functions: - refgenconf.looper_refgenie_populate +# - looper.write_custom_template From 6f0f855f81be614ec712eede4cc6f08cfbdc3066 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 25 Jun 2021 08:44:06 -0400 Subject: [PATCH 21/66] fix curl based crate check --- checkinstall | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/checkinstall b/checkinstall index 1ea05b3b..e73257f7 100755 --- a/checkinstall +++ b/checkinstall @@ -407,7 +407,7 @@ else IFACE=$(curl https://raw.githubusercontent.com/databio/pepatac/master/sample_pipeline_interface.yaml) fi - CRATE=$(grep 'bulker_crate' $IFACE | awk '{print $2}') + CRATE=$(echo $IFACE | tr " " "\n" | grep -A1 'bulker_crate' | tail -n 1) yes n | bulker load $CRATE if [ $? -eq 0 ]; then From 77ac14a224db21a9537f8f73f8febb4d3be9500d Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 25 Jun 2021 09:08:31 -0400 Subject: [PATCH 22/66] add docs on configuring peak calling settings --- docs/configure-peak-callers.md | 215 +++++++++++++++++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 216 insertions(+) create mode 100644 docs/configure-peak-callers.md diff --git a/docs/configure-peak-callers.md b/docs/configure-peak-callers.md new file mode 100644 index 00000000..b0cee35b --- /dev/null +++ b/docs/configure-peak-callers.md @@ -0,0 +1,215 @@ +# Configure peak callers + +`PEPATAC` has the ability to use five different peak callers out of the box. If you're running the pipeline via containers, you should already have this capability, but if you installed requirements natively, you may still need to install additional peak callers should you wish to use them. + +## Default peak caller + +`PEPATAC` uses `MACS2` as the default peak caller. You do not need to specify any additional parameters to use `MACS2`. The default approach is to define fixed width peaks, to [facilitate the generation of consensus peaks, which you can read more about here](consensus_peaks.md). + +For a single sample, you may wish to enable variable width peak calling to achieve a more nuanced and specific peak call for a particular sample. Or maybe you don't wish to generate consensus peaks at all, and you want this for all your samples. You can still use `MACS2` and specify `PEPATAC` to use `--peak-type variable` to call peaks in this manner. 
+Here's an example with the included test sample: +```yaml +# This project config file describes your project. See looper docs for details. +name: test_project # The name that summary files will be prefaced with + +pep_version: 2.0.0 +sample_table: test_annotation.csv # sheet listing all samples in the project + +looper: # relative paths are relative to this config file + output_dir: pepatac_test + pipeline_interfaces: ../../project_pipeline_interface.yaml # PATH to the directory where looper will find the pipeline repository. + +sample_modifiers: + append: + pipeline_interfaces: ../../sample_pipeline_interface.yaml + derive: + attributes: [read1, read2] + sources: + test_data_R1: "examples/data/{sample_name}_r1.fastq.gz" + test_data_R2: "examples/data/{sample_name}_r2.fastq.gz" + imply: + - if: + organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] + then: + genome: hg38 + macs_genome_size: hs + prealignments: rCRSd + aligner: bowtie2 # [options: bowtie2(default), bwa] + deduplicator: samblaster # [options: samblaster(default), picard, samtools] + trimmer: skewer # [options: skewer (default), pyadapt, trimmomatic] + peak_caller: macs2 # [options: macs2 (default), fseq, genrich, hmmratac, homer] + peak_type: variable # [options: fixed(default), variable] + extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. + frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run +``` +Furthermore, you can customize the settings for variable width peak calling in the [pipeline configuration file, pepatac.yaml](https://github.com/databio/pepatac/blob/master/pipelines/pepatac.yaml). 
+For example, you can change the 'params:' line for macs2_variable settings: +```yaml +macs2_variable: + params: '-f BED -q 0.01 --shift 0 --nomodel' +``` + +## Specifying a different peak caller + +For each of the alternative peak callers, you can specify them on the command line with the `--peak-caller --peak-type variable` variables, or you can specify them in your PEP project configuration file. You must tell `PEPATAC` to use `--peak-type variable` when specifying alternative peak callers. + +Here are examples with the included test sample: + +1. F-Seq +```yaml +# This project config file describes your project. See looper docs for details. +name: test_project # The name that summary files will be prefaced with + +pep_version: 2.0.0 +sample_table: test_annotation.csv # sheet listing all samples in the project + +looper: # relative paths are relative to this config file + output_dir: pepatac_test + pipeline_interfaces: ../../project_pipeline_interface.yaml # PATH to the directory where looper will find the pipeline repository. + +sample_modifiers: + append: + pipeline_interfaces: ../../sample_pipeline_interface.yaml + derive: + attributes: [read1, read2] + sources: + test_data_R1: "examples/data/{sample_name}_r1.fastq.gz" + test_data_R2: "examples/data/{sample_name}_r2.fastq.gz" + imply: + - if: + organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] + then: + genome: hg38 + macs_genome_size: hs + prealignments: rCRSd + aligner: bowtie2 # [options: bowtie2(default), bwa] + deduplicator: samblaster # [options: samblaster(default), picard, samtools] + trimmer: skewer # [options: skewer (default), pyadapt, trimmomatic] + peak_caller: fseq # [options: macs2 (default), fseq, genrich, hmmratac, homer] + peak_type: variable # [options: fixed(default), variable] + extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. + frip_ref_peaks: None # Default. 
Use an external reference set of peaks instead of the peaks called from this run +``` + +2. HMMRATAC +```yaml +# This project config file describes your project. See looper docs for details. +name: test_project # The name that summary files will be prefaced with + +pep_version: 2.0.0 +sample_table: test_annotation.csv # sheet listing all samples in the project + +looper: # relative paths are relative to this config file + output_dir: pepatac_test + pipeline_interfaces: ../../project_pipeline_interface.yaml # PATH to the directory where looper will find the pipeline repository. + +sample_modifiers: + append: + pipeline_interfaces: ../../sample_pipeline_interface.yaml + derive: + attributes: [read1, read2] + sources: + test_data_R1: "examples/data/{sample_name}_r1.fastq.gz" + test_data_R2: "examples/data/{sample_name}_r2.fastq.gz" + imply: + - if: + organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] + then: + genome: hg38 + macs_genome_size: hs + prealignments: rCRSd + aligner: bowtie2 # [options: bowtie2(default), bwa] + deduplicator: samblaster # [options: samblaster(default), picard, samtools] + trimmer: skewer # [options: skewer (default), pyadapt, trimmomatic] + peak_caller: hmmratac # [options: macs2 (default), fseq, genrich, hmmratac, homer] + peak_type: variable # [options: fixed(default), variable] + extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. + frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run +``` + +3. HOMER +```yaml +# This project config file describes your project. See looper docs for details. 
+name: test_project # The name that summary files will be prefaced with + +pep_version: 2.0.0 +sample_table: test_annotation.csv # sheet listing all samples in the project + +looper: # relative paths are relative to this config file + output_dir: pepatac_test + pipeline_interfaces: ../../project_pipeline_interface.yaml # PATH to the directory where looper will find the pipeline repository. + +sample_modifiers: + append: + pipeline_interfaces: ../../sample_pipeline_interface.yaml + derive: + attributes: [read1, read2] + sources: + test_data_R1: "examples/data/{sample_name}_r1.fastq.gz" + test_data_R2: "examples/data/{sample_name}_r2.fastq.gz" + imply: + - if: + organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] + then: + genome: hg38 + macs_genome_size: hs + prealignments: rCRSd + aligner: bowtie2 # [options: bowtie2(default), bwa] + deduplicator: samblaster # [options: samblaster(default), picard, samtools] + trimmer: skewer # [options: skewer (default), pyadapt, trimmomatic] + peak_caller: homer # [options: macs2 (default), fseq, genrich, hmmratac, homer] + peak_type: variable # [options: fixed(default), variable] + extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. + frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run +``` + +4. Genrich +```yaml +# This project config file describes your project. See looper docs for details. +name: test_project # The name that summary files will be prefaced with + +pep_version: 2.0.0 +sample_table: test_annotation.csv # sheet listing all samples in the project + +looper: # relative paths are relative to this config file + output_dir: pepatac_test + pipeline_interfaces: ../../project_pipeline_interface.yaml # PATH to the directory where looper will find the pipeline repository. 
+ +sample_modifiers: + append: + pipeline_interfaces: ../../sample_pipeline_interface.yaml + derive: + attributes: [read1, read2] + sources: + test_data_R1: "examples/data/{sample_name}_r1.fastq.gz" + test_data_R2: "examples/data/{sample_name}_r2.fastq.gz" + imply: + - if: + organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] + then: + genome: hg38 + macs_genome_size: hs + prealignments: rCRSd + aligner: bowtie2 # [options: bowtie2(default), bwa] + deduplicator: samblaster # [options: samblaster(default), picard, samtools] + trimmer: skewer # [options: skewer (default), pyadapt, trimmomatic] + peak_caller: genrich # [options: macs2 (default), fseq, genrich, hmmratac, homer] + peak_type: variable # [options: fixed(default), variable] + extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. + frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run +``` + +Each peak caller also has a separate parameterization option in the [pipeline configuration file, pepatac.yaml](https://github.com/databio/pepatac/blob/master/pipelines/pepatac.yaml). 
+ +For example: +```yaml + fseq: + params: '-of npf -l 600 -t 4.0 -s 1' + genrich: + params: '' + # -j: ATAC-seq mode on by default in the main pipeline, pepatac.py + hmmratac: + params: '--fragmem True --upper 10 --lower 5 --peaks True --window 500000' + homer_findpeaks: + params: '-minDist 150 -region' +``` diff --git a/mkdocs.yml b/mkdocs.yml index 4deeedb0..71e5cba9 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -25,6 +25,7 @@ nav: - Run natively: 'detailed-install.md' - Configure assets: 'assets.md' - Configure prealignments: 'prealignments.md' + - Configure peak callers: 'configure-peak-callers.md' - Configure project files: 'peps.md' - Configure seqOutBias assets: 'sob.md' - Configure computing resources: 'compute-resources.md' From e46dca6ce4e5b8360b2c21c40c053b6e1327d846 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 25 Jun 2021 09:11:13 -0400 Subject: [PATCH 23/66] use single braces --- sample_pipeline_interface.yaml | 57 +++++++++++++++++----------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index 68b05427..d3a7c775 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -5,55 +5,56 @@ input_schema: pepatac_input_schema.yaml output_schema: pepatac_output_schema.yaml command_template: > python {pipeline.path} - --output_parent {{ looper.results_subdir }} - --cores {{ compute.cores }} - --mem {{ compute.mem }} - --sample_name {{ sample.sample_name }} - --input {{ sample.read1 }} - {% if sample.read2 is defined %} --input2 {{ sample.read2 }} {% endif %} - --single_or_paired {{ sample.read_type }} - --genome {{ sample.genome }} - --chrom_sizes {{ refgenie[sample.genome]["fasta"]["chrom_sizes"] }} - --TSS_name {{ refgenie[sample.genome]["refgene_anno"]["refgene_tss"] }} - --blacklist {{ refgenie[sample.genome]["blacklist"] }} - --anno_name {{ refgenie[sample.genome]["feat_annotation"] }} - {% if sample.trimmer is defined %} --trimmer {{ 
sample.trimmer }} {% endif %} - {% if sample.aligner is defined %} --aligner {{ sample.aligner }} {% endif %} + --output_parent { looper.results_subdir } + --cores { compute.cores } + --mem { compute.mem } + --sample_name { sample.sample_name } + --input { sample.read1 } + {% if sample.read2 is defined %} --input2 { sample.read2 } {% endif %} + --single_or_paired { sample.read_type } + --genome { sample.genome } + --chrom_sizes { refgenie[sample.genome]["fasta"]["chrom_sizes"] } + --TSS_name { refgenie[sample.genome]["refgene_anno"]["refgene_tss"] } + --blacklist { refgenie[sample.genome]["blacklist"] } + --anno_name { refgenie[sample.genome]["feat_annotation"] } + {% if sample.trimmer is defined %} --trimmer { sample.trimmer } {% endif %} + {% if sample.aligner is defined %} --aligner { sample.aligner } {% endif %} {% if sample.aligner == "bowtie2" %} {% if sample.genome_index is defined %} - --genome_index {{ sample.genome_index }} + --genome_index { sample.genome_index } {% else %} - --genome_index {{ refgenie[sample.genome]["bowtie2_index"]["dir"] }} + --genome_index { refgenie[sample.genome]["bowtie2_index"]["dir"] } refgenie --prealignments {% for p in sample.prealignments %} - {{ p }} {{ refgenie[p]["bowtie2_index"]["dir"] }}{% endfor %} + { p } { refgenie[p]["bowtie2_index"]["dir"] }{% endfor %} {% endif %} {% else %} {% if sample.genome_index is defined %} - --genome_index {{ sample.genome_index }} + --genome_index { sample.genome_index } {% else %} - --genome_index {{ refgenie[sample.genome]["bwa_index"]["dir"] }} + --genome_index { refgenie[sample.genome]["bwa_index"]["dir"] } refgenie --prealignments {% for p in sample.prealignments %} - {{ p }} {{ refgenie[p]["bwa_index"]["dir"] }}{% endfor %} + { p } { refgenie[p]["bwa_index"]["dir"] }{% endfor %} {% endif %} {% endif %} - {% if sample.deduplicator is defined %} --deduplicator {{ sample.deduplicator }} {% endif %} - {% if sample.peak_caller is defined %} --peak_caller {{ sample.peak_caller }} {% endif 
%} - {% if sample.peak_type is defined %} --peak_type {{ sample.peak_type }} {% endif %} - {% if sample.extend is defined %} --extend {{ sample.extend }} {% endif %} - {% if sample.genome_size is defined %} --genome_size {{ sample.genome_size }} {% endif %} - {% if sample.frip_ref_peaks is defined %} --frip-ref-peaks {{ sample.frip_ref_peaks }} {% endif %} + {% if sample.deduplicator is defined %} --deduplicator { sample.deduplicator } {% endif %} + {% if sample.peak_caller is defined %} --peak_caller { sample.peak_caller } {% endif %} + {% if sample.peak_type is defined %} --peak_type { sample.peak_type } {% endif %} + {% if sample.extend is defined %} --extend { sample.extend } {% endif %} + {% if sample.genome_size is defined %} --genome_size { sample.genome_size } {% endif %} + {% if sample.frip_ref_peaks is defined %} --frip-ref-peaks { sample.frip_ref_peaks } {% endif %} {% if sample.motif is defined %} --motif {% endif %} {% if sample.sob is defined %} --sob {% endif %} - {% if sample.sob is defined %} --search_file {{ refgenie.tallymer_index.search_file }} {% endif %} + {% if sample.sob is defined %} --search_file { refgenie.tallymer_index.search_file } {% endif %} {% if sample.no_scale is defined %} --no_scale {% endif %} - {% if sample.prioritize is defined %} --prioritize {% endif %} {% endif %} + {% if sample.prioritize is defined %} --prioritize {% endif %} {% if sample.keep is defined %} --keep {% endif %} {% if sample.no_fifo is defined %} --noFIFO {% endif %} {% if sample.lite is defined %} --lite {% endif %} {% if sample.skipqc is defined %} --skipqc {% endif %} + compute: singularity_image: ${SIMAGES}pepatac conda_env: pepatac From 9be0e0a738bd15d8365a3570aac479dcbe9cd036 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 25 Jun 2021 09:17:11 -0400 Subject: [PATCH 24/66] change headers --- docs/configure-peak-callers.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/configure-peak-callers.md 
b/docs/configure-peak-callers.md index b0cee35b..de7cd679 100644 --- a/docs/configure-peak-callers.md +++ b/docs/configure-peak-callers.md @@ -55,7 +55,7 @@ For each of the alternative peak callers, you can specify them on the command li Here are examples with the included test sample: -1. F-Seq +### F-Seq ```yaml # This project config file describes your project. See looper docs for details. name: test_project # The name that summary files will be prefaced with @@ -91,7 +91,7 @@ sample_modifiers: frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run ``` -2. HMMRATAC +### HMMRATAC ```yaml # This project config file describes your project. See looper docs for details. name: test_project # The name that summary files will be prefaced with @@ -127,7 +127,7 @@ sample_modifiers: frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run ``` -3. HOMER +### HOMER ```yaml # This project config file describes your project. See looper docs for details. name: test_project # The name that summary files will be prefaced with @@ -163,7 +163,7 @@ sample_modifiers: frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run ``` -4. Genrich +### Genrich ```yaml # This project config file describes your project. See looper docs for details. 
name: test_project # The name that summary files will be prefaced with From 99291ffca174b64a3a01b72e0185f26bd891b755 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 25 Jun 2021 09:22:27 -0400 Subject: [PATCH 25/66] update refgenie asset references --- sample_pipeline_interface.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index d3a7c775..a049e275 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -13,29 +13,29 @@ command_template: > {% if sample.read2 is defined %} --input2 { sample.read2 } {% endif %} --single_or_paired { sample.read_type } --genome { sample.genome } - --chrom_sizes { refgenie[sample.genome]["fasta"]["chrom_sizes"] } - --TSS_name { refgenie[sample.genome]["refgene_anno"]["refgene_tss"] } - --blacklist { refgenie[sample.genome]["blacklist"] } - --anno_name { refgenie[sample.genome]["feat_annotation"] } + --chrom_sizes { refgenie[sample.genome].fasta.chrom_sizes"] } + --TSS_name { refgenie[sample.genome].refgene_anno.refgene_tss"] } + --blacklist { refgenie[sample.genome].blacklist"] } + --anno_name { refgenie[sample.genome].feat_annotation"] } {% if sample.trimmer is defined %} --trimmer { sample.trimmer } {% endif %} {% if sample.aligner is defined %} --aligner { sample.aligner } {% endif %} {% if sample.aligner == "bowtie2" %} {% if sample.genome_index is defined %} --genome_index { sample.genome_index } {% else %} - --genome_index { refgenie[sample.genome]["bowtie2_index"]["dir"] } + --genome_index { refgenie[sample.genome].bowtie2_index.dir"] } refgenie --prealignments {% for p in sample.prealignments %} - { p } { refgenie[p]["bowtie2_index"]["dir"] }{% endfor %} + { p } { refgenie[p].bowtie2_index.dir"] }{% endfor %} {% endif %} {% else %} {% if sample.genome_index is defined %} --genome_index { sample.genome_index } {% else %} - --genome_index { refgenie[sample.genome]["bwa_index"]["dir"] } + 
--genome_index { refgenie[sample.genome].bwa_index.dir"] } refgenie --prealignments {% for p in sample.prealignments %} - { p } { refgenie[p]["bwa_index"]["dir"] }{% endfor %} + { p } { refgenie[p].bwa_index.dir"] }{% endfor %} {% endif %} {% endif %} {% if sample.deduplicator is defined %} --deduplicator { sample.deduplicator } {% endif %} @@ -46,7 +46,7 @@ command_template: > {% if sample.frip_ref_peaks is defined %} --frip-ref-peaks { sample.frip_ref_peaks } {% endif %} {% if sample.motif is defined %} --motif {% endif %} {% if sample.sob is defined %} --sob {% endif %} - {% if sample.sob is defined %} --search_file { refgenie.tallymer_index.search_file } {% endif %} + {% if sample.sob is defined %} --search_file { refgenie[sample.genome].tallymer_index.search_file } {% endif %} {% if sample.no_scale is defined %} --no_scale {% endif %} {% if sample.prioritize is defined %} --prioritize {% endif %} {% if sample.keep is defined %} --keep {% endif %} @@ -66,7 +66,7 @@ bioconductor: readFunPath: BiocProject/runCOCOA.R var_templates: refgenie_config: "$REFGENIE" - custom_template: sample_template.jinja +# custom_template: sample_template.jinja pre_submit: python_functions: - refgenconf.looper_refgenie_populate From ac31bc86b363674e936dc3e343266386f50166bf Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 25 Jun 2021 09:26:35 -0400 Subject: [PATCH 26/66] add refgenie key to config; adjust refgenie asset in command template --- examples/test_project/test_config.yaml | 1 + sample_pipeline_interface.yaml | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/test_project/test_config.yaml b/examples/test_project/test_config.yaml index 10edc173..37c7865c 100644 --- a/examples/test_project/test_config.yaml +++ b/examples/test_project/test_config.yaml @@ -30,3 +30,4 @@ sample_modifiers: extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. frip_ref_peaks: None # Default. 
Use an external reference set of peaks instead of the peaks called from this run +refgenie: \ No newline at end of file diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index a049e275..44fba524 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -13,29 +13,29 @@ command_template: > {% if sample.read2 is defined %} --input2 { sample.read2 } {% endif %} --single_or_paired { sample.read_type } --genome { sample.genome } - --chrom_sizes { refgenie[sample.genome].fasta.chrom_sizes"] } - --TSS_name { refgenie[sample.genome].refgene_anno.refgene_tss"] } - --blacklist { refgenie[sample.genome].blacklist"] } - --anno_name { refgenie[sample.genome].feat_annotation"] } + --chrom_sizes { refgenie[sample.genome].fasta.chrom_sizes } + --TSS_name { refgenie[sample.genome].refgene_anno.refgene_tss } + --blacklist { refgenie[sample.genome].blacklist } + --anno_name { refgenie[sample.genome].feat_annotation } {% if sample.trimmer is defined %} --trimmer { sample.trimmer } {% endif %} {% if sample.aligner is defined %} --aligner { sample.aligner } {% endif %} {% if sample.aligner == "bowtie2" %} {% if sample.genome_index is defined %} --genome_index { sample.genome_index } {% else %} - --genome_index { refgenie[sample.genome].bowtie2_index.dir"] } + --genome_index { refgenie[sample.genome].bowtie2_index.dir } refgenie --prealignments {% for p in sample.prealignments %} - { p } { refgenie[p].bowtie2_index.dir"] }{% endfor %} + { p } { refgenie[p].bowtie2_index.dir }{% endfor %} {% endif %} {% else %} {% if sample.genome_index is defined %} --genome_index { sample.genome_index } {% else %} - --genome_index { refgenie[sample.genome].bwa_index.dir"] } + --genome_index { refgenie[sample.genome].bwa_index.dir } refgenie --prealignments {% for p in sample.prealignments %} - { p } { refgenie[p].bwa_index.dir"] }{% endfor %} + { p } { refgenie[p].bwa_index.dir }{% endfor %} {% endif %} {% endif %} {% if sample.deduplicator is defined 
%} --deduplicator { sample.deduplicator } {% endif %} From 9737313adbf7887a9af37e9aae5d0a29e30a9212 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 25 Jun 2021 09:29:12 -0400 Subject: [PATCH 27/66] refgenie key no longer required --- examples/test_project/test_config.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/test_project/test_config.yaml b/examples/test_project/test_config.yaml index 37c7865c..46ed2e47 100644 --- a/examples/test_project/test_config.yaml +++ b/examples/test_project/test_config.yaml @@ -29,5 +29,3 @@ sample_modifiers: peak_type: fixed # Default. [options: variable] extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run - -refgenie: \ No newline at end of file From 6c883d79a9da09a245531fe5da8bccf1aacb8a7d Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 25 Jun 2021 10:59:04 -0400 Subject: [PATCH 28/66] make prealignments list --- examples/test_project/test_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/test_project/test_config.yaml b/examples/test_project/test_config.yaml index 46ed2e47..25a1a3e9 100644 --- a/examples/test_project/test_config.yaml +++ b/examples/test_project/test_config.yaml @@ -22,7 +22,7 @@ sample_modifiers: then: genome: hg38 macs_genome_size: hs - prealignments: rCRSd + prealignments: ["rCRSd"] aligner: bowtie2 # Default. [options: bwa] deduplicator: samblaster # Default. [options: picard] trimmer: skewer # Default. 
[options: pyadapt, trimmomatic] From bb2540afac4bd605cebc0477f3f13d27863bf47b Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 25 Jun 2021 11:06:12 -0400 Subject: [PATCH 29/66] add missing colon --- sample_pipeline_interface.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index 44fba524..c0e7b732 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -24,7 +24,7 @@ command_template: > --genome_index { sample.genome_index } {% else %} --genome_index { refgenie[sample.genome].bowtie2_index.dir } - refgenie + refgenie: --prealignments {% for p in sample.prealignments %} { p } { refgenie[p].bowtie2_index.dir }{% endfor %} {% endif %} @@ -33,7 +33,7 @@ command_template: > --genome_index { sample.genome_index } {% else %} --genome_index { refgenie[sample.genome].bwa_index.dir } - refgenie + refgenie: --prealignments {% for p in sample.prealignments %} { p } { refgenie[p].bwa_index.dir }{% endfor %} {% endif %} From 61dac8bdba3eeeb4b8b897e95e43812423584489 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 25 Jun 2021 14:09:42 -0400 Subject: [PATCH 30/66] update refgenie asset reference in command template --- examples/test_project/test_config.yaml | 9 +++++++++ sample_pipeline_interface.yaml | 6 ++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/test_project/test_config.yaml b/examples/test_project/test_config.yaml index 25a1a3e9..6285536f 100644 --- a/examples/test_project/test_config.yaml +++ b/examples/test_project/test_config.yaml @@ -29,3 +29,12 @@ sample_modifiers: peak_type: fixed # Default. [options: variable] extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. frip_ref_peaks: None # Default. 
Use an external reference set of peaks instead of the peaks called from this run + +refgenie: + tag_overrides: + hg38: + blacklist: "default" + feat_annotation: "default" + path_overrides: + - registry_path: "hg38/blacklist" + value: "/project/shefflab/genomes_v04_210301/alias/hg38/blacklist/default/hg38_blacklist.bed.gz" diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index c0e7b732..644986c4 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -15,8 +15,8 @@ command_template: > --genome { sample.genome } --chrom_sizes { refgenie[sample.genome].fasta.chrom_sizes } --TSS_name { refgenie[sample.genome].refgene_anno.refgene_tss } - --blacklist { refgenie[sample.genome].blacklist } - --anno_name { refgenie[sample.genome].feat_annotation } + --blacklist { refgenie[sample.genome].blacklist.blacklist } + --anno_name { refgenie[sample.genome].feat_annotation.feat_annotation } {% if sample.trimmer is defined %} --trimmer { sample.trimmer } {% endif %} {% if sample.aligner is defined %} --aligner { sample.aligner } {% endif %} {% if sample.aligner == "bowtie2" %} @@ -24,7 +24,6 @@ command_template: > --genome_index { sample.genome_index } {% else %} --genome_index { refgenie[sample.genome].bowtie2_index.dir } - refgenie: --prealignments {% for p in sample.prealignments %} { p } { refgenie[p].bowtie2_index.dir }{% endfor %} {% endif %} @@ -33,7 +32,6 @@ command_template: > --genome_index { sample.genome_index } {% else %} --genome_index { refgenie[sample.genome].bwa_index.dir } - refgenie: --prealignments {% for p in sample.prealignments %} { p } { refgenie[p].bwa_index.dir }{% endfor %} {% endif %} From 7930a9779816f112307639cb679281940aef7868 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 25 Jun 2021 14:10:21 -0400 Subject: [PATCH 31/66] update example test config --- examples/test_project/test_config.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/examples/test_project/test_config.yaml 
b/examples/test_project/test_config.yaml index 6285536f..25a1a3e9 100644 --- a/examples/test_project/test_config.yaml +++ b/examples/test_project/test_config.yaml @@ -29,12 +29,3 @@ sample_modifiers: peak_type: fixed # Default. [options: variable] extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run - -refgenie: - tag_overrides: - hg38: - blacklist: "default" - feat_annotation: "default" - path_overrides: - - registry_path: "hg38/blacklist" - value: "/project/shefflab/genomes_v04_210301/alias/hg38/blacklist/default/hg38_blacklist.bed.gz" From 525611b597099929dfa772d3e5fec027661e002d Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 25 Jun 2021 14:48:07 -0400 Subject: [PATCH 32/66] modify prealignment to use updated arguments --- pipelines/pepatac.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 8aa8c999..9683a57b 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -5,7 +5,7 @@ __author__ = ["Jin Xu", "Nathan Sheffield", "Jason Smith"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.10.0" +__version__ = "0.11.0" from argparse import ArgumentParser @@ -125,12 +125,12 @@ def parse_arguments(): help="Space-delimited list of reference genomes to " "align to before primary alignment.") # Genome assets - parser.add_argument("--genome-index", default=None, required=True, + parser.add_argument("--genome-index", default=None, dest="genome_index", type=str, help="Path to primary genome index file. 
Either a " "bowtie2 or bwa index.") - parser.add_argument("--chrom-sizes", default=None, required=True, + parser.add_argument("--chrom-sizes", default=None, dest="chrom_sizes", type=str, help="Path to primary genome chromosome sizes file.") @@ -604,10 +604,12 @@ def main(): # Add prealignment genome annotation files to resources pm.info(f"prealignments: {args.prealignments}") + res.prealignment_index = args.prealignments # Add primary genome annotation files to resources pm.info(f"primary genome index: {args.genome_index}") - + res.genome_index = args.genome_index + # Add optional files to resources if args.sob and not args.search_file: err_msg = (f"You specified --sob but did not include the path to" @@ -897,6 +899,16 @@ def check_trim(): # Keep track of the unmapped files in order to compress them after final # alignment. + def pairs(l): + ''' + Iterate over a list in pairs + ''' + i = iter(l) + prev = next(i) + for item in i: + yield prev, item + prev = item + to_compress = [] if len(res.prealignment_index) == 0: print("You may use `--prealignment-bowtie2-index` or " @@ -905,13 +917,12 @@ def check_trim(): "See http://pepatac.databio.org/en/latest/ for documentation.") else: # Loop through any prealignment references and map to them sequentially - for count, genome_index in enumerate(res.prealignment_index): - pm.info(f"Aligning with {args.aligner} to {genome_index}.") - assembly_identifier = f"prealignment_{count}" + for genome, genome_index in pairs(res.prealignment_index): + pm.debug(f"Aligning with {args.aligner} to {genome_index}") if args.no_fifo: unmap_fq1, unmap_fq2 = _align( args, tools, args.paired_end, False, - unmap_fq1, unmap_fq2, assembly_identifier, + unmap_fq1, unmap_fq2, genome, assembly=genome_index, outfolder=param.outfolder, aligndir="prealignments", @@ -923,7 +934,7 @@ def check_trim(): else: unmap_fq1, unmap_fq2 = _align( args, tools, args.paired_end, True, - unmap_fq1, unmap_fq2, assembly_identifier, + unmap_fq1, unmap_fq2, genome, 
assembly=genome_index, outfolder=param.outfolder, aligndir="prealignments", From e2e9f77e48fbb659c2f13ad576cb8329ca22d3fc Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 25 Jun 2021 14:48:17 -0400 Subject: [PATCH 33/66] fix hyphens --- sample_pipeline_interface.yaml | 41 ++++++++++------------------------ 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index 644986c4..c2503ccc 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -5,54 +5,37 @@ input_schema: pepatac_input_schema.yaml output_schema: pepatac_output_schema.yaml command_template: > python {pipeline.path} - --output_parent { looper.results_subdir } + --output-parent { looper.results_subdir } --cores { compute.cores } --mem { compute.mem } - --sample_name { sample.sample_name } + --sample-name { sample.sample_name } --input { sample.read1 } {% if sample.read2 is defined %} --input2 { sample.read2 } {% endif %} - --single_or_paired { sample.read_type } + --single-or-paired { sample.read_type } --genome { sample.genome } - --chrom_sizes { refgenie[sample.genome].fasta.chrom_sizes } - --TSS_name { refgenie[sample.genome].refgene_anno.refgene_tss } + --chrom-sizes { refgenie[sample.genome].fasta.chrom_sizes } + --TSS-name { refgenie[sample.genome].refgene_anno.refgene_tss } --blacklist { refgenie[sample.genome].blacklist.blacklist } - --anno_name { refgenie[sample.genome].feat_annotation.feat_annotation } + --anno-name { refgenie[sample.genome].feat_annotation.feat_annotation } {% if sample.trimmer is defined %} --trimmer { sample.trimmer } {% endif %} {% if sample.aligner is defined %} --aligner { sample.aligner } {% endif %} - {% if sample.aligner == "bowtie2" %} - {% if sample.genome_index is defined %} - --genome_index { sample.genome_index } - {% else %} - --genome_index { refgenie[sample.genome].bowtie2_index.dir } - --prealignments {% for p in sample.prealignments %} - { p } { 
refgenie[p].bowtie2_index.dir }{% endfor %} - {% endif %} - {% else %} - {% if sample.genome_index is defined %} - --genome_index { sample.genome_index } - {% else %} - --genome_index { refgenie[sample.genome].bwa_index.dir } - --prealignments {% for p in sample.prealignments %} - { p } { refgenie[p].bwa_index.dir }{% endfor %} - {% endif %} - {% endif %} + {% if sample.aligner == "bowtie2" %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% else %} --genome-index { refgenie[sample.genome].bowtie2_index.dir } --prealignments {% for p in sample.prealignments %} { p } { refgenie[p].bowtie2_index.dir }{% endfor %} {% endif %} {% else %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% else %} --genome-index { refgenie[sample.genome].bwa_index.dir } --prealignments {% for p in sample.prealignments %} { p } { refgenie[p].bwa_index.dir }{% endfor %} {% endif %} {% endif %} {% if sample.deduplicator is defined %} --deduplicator { sample.deduplicator } {% endif %} - {% if sample.peak_caller is defined %} --peak_caller { sample.peak_caller } {% endif %} - {% if sample.peak_type is defined %} --peak_type { sample.peak_type } {% endif %} + {% if sample.peak_caller is defined %} --peak-caller { sample.peak_caller } {% endif %} + {% if sample.peak_type is defined %} --peak-type { sample.peak_type } {% endif %} {% if sample.extend is defined %} --extend { sample.extend } {% endif %} - {% if sample.genome_size is defined %} --genome_size { sample.genome_size } {% endif %} + {% if sample.genome_size is defined %} --genome-size { sample.genome_size } {% endif %} {% if sample.frip_ref_peaks is defined %} --frip-ref-peaks { sample.frip_ref_peaks } {% endif %} {% if sample.motif is defined %} --motif {% endif %} {% if sample.sob is defined %} --sob {% endif %} - {% if sample.sob is defined %} --search_file { refgenie[sample.genome].tallymer_index.search_file } {% endif %} - {% if sample.no_scale is defined %} --no_scale 
{% endif %} + {% if sample.sob is defined %} --search-file { refgenie[sample.genome].tallymer_index.search_file } {% endif %} + {% if sample.no_scale is defined %} --no-scale {% endif %} {% if sample.prioritize is defined %} --prioritize {% endif %} {% if sample.keep is defined %} --keep {% endif %} {% if sample.no_fifo is defined %} --noFIFO {% endif %} {% if sample.lite is defined %} --lite {% endif %} {% if sample.skipqc is defined %} --skipqc {% endif %} - compute: singularity_image: ${SIMAGES}pepatac conda_env: pepatac From 6be839f39cf1a82a3f1cfae317849d040f88c079 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 25 Jun 2021 16:03:23 -0400 Subject: [PATCH 34/66] update chrom sizes asset use --- pipelines/pepatac.py | 20 ++++++++++++++++++-- requirements.txt | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 9683a57b..015a1ecd 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -603,12 +603,22 @@ def main(): GENOME_IDX_KEY = "bowtie2_index" # Add prealignment genome annotation files to resources - pm.info(f"prealignments: {args.prealignments}") + pm.debug(f"prealignments: {args.prealignments}") res.prealignment_index = args.prealignments # Add primary genome annotation files to resources - pm.info(f"primary genome index: {args.genome_index}") res.genome_index = args.genome_index + if not res.genome_index.endswith(args.genome_assembly): + # Replace last occurrence of . 
with genome name + res.genome_index = (res.genome_index[:res.genome_index.rfind(".")] + + args.genome_assembly) + if args.aligner.lower() == "bwa": + res.genome_index += ".fa" + pm.debug(f"primary genome index: {args.genome_index}") + + if (args.chrom_sizes and os.path.isfile(args.chrom_sizes) and + os.stat(args.chrom_sizes).st_size > 0): + res.chrom_sizes = args.chrom_sizes # Add optional files to resources if args.sob and not args.search_file: @@ -918,6 +928,12 @@ def pairs(l): else: # Loop through any prealignment references and map to them sequentially for genome, genome_index in pairs(res.prealignment_index): + if not genome_index.endswith(genome): + # Replace last occurrence of . with genome name + genome_index = genome_index[:genome_index.rfind(".")] + genome + #genome_index = genome_index.replace('.',genome) + if args.aligner.lower() == "bwa": + genome_index += ".fa" pm.debug(f"Aligning with {args.aligner} to {genome_index}") if args.no_fifo: unmap_fq1, unmap_fq2 = _align( diff --git a/requirements.txt b/requirements.txt index 9488d5e1..7a60931f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,6 +24,6 @@ pysam>=0.13 python-Levenshtein>=0.12.0 pyyaml>=3.13 refgenconf>=0.7.0 -refgenie +#refgenie ubiquerg>=0.6.1 yacman>=0.6.7 From 695da808cbb43675042d11be5269521b1715e965 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 25 Jun 2021 16:43:30 -0400 Subject: [PATCH 35/66] check for refgenie asset existence in cmd template or override --- examples/test_project/test_config.yaml | 9 ++++++--- sample_pipeline_interface.yaml | 6 +++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/test_project/test_config.yaml b/examples/test_project/test_config.yaml index 25a1a3e9..ac7efc59 100644 --- a/examples/test_project/test_config.yaml +++ b/examples/test_project/test_config.yaml @@ -14,13 +14,13 @@ sample_modifiers: derive: attributes: [read1, read2] sources: - test_data_R1: "examples/data/{sample_name}_r1.fastq.gz" - test_data_R2: 
"examples/data/{sample_name}_r2.fastq.gz" + test_data_R1: "examples/data/{sample_name}_chr22_r1.fastq.gz" + test_data_R2: "examples/data/{sample_name}_chr22_r2.fastq.gz" imply: - if: organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] then: - genome: hg38 + genome: hg38_chr22 macs_genome_size: hs prealignments: ["rCRSd"] aligner: bowtie2 # Default. [options: bwa] @@ -29,3 +29,6 @@ sample_modifiers: peak_type: fixed # Default. [options: variable] extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run + #TSS_name: "/project/shefflab/genomes_v04_210301/alias/hg38/refgene_anno/default/hg38_TSS.bed" + blacklist: /project/shefflab/genomes_v04_210301/alias/hg38/blacklist/default/hg38_blacklist.bed.gz + anno_name: /project/shefflab/genomes_v04_210301/alias/hg38/feat_annotation/default/hg38_annotations.bed.gz diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index c2503ccc..728d1def 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -14,9 +14,9 @@ command_template: > --single-or-paired { sample.read_type } --genome { sample.genome } --chrom-sizes { refgenie[sample.genome].fasta.chrom_sizes } - --TSS-name { refgenie[sample.genome].refgene_anno.refgene_tss } - --blacklist { refgenie[sample.genome].blacklist.blacklist } - --anno-name { refgenie[sample.genome].feat_annotation.feat_annotation } + {% if sample.TSS_name is defined %} --TSS-name { sample.TSS_name } {% elif refgenie[sample.genome].refgene_anno.refgene_tss is defined %} --TSS-name { refgenie[sample.genome].refgene_anno.refgene_tss } {% endif %} + {% if sample.blacklist is defined %} --blacklist { sample.blacklist } {% else %} { refgenie[sample.genome].blacklist.blacklist } {% endif %} + {% if sample.anno_name is defined %} --anno-name { sample.anno_name } {% else %} { 
refgenie[sample.genome].feat_annotation.feat_annotation } {% endif %} {% if sample.trimmer is defined %} --trimmer { sample.trimmer } {% endif %} {% if sample.aligner is defined %} --aligner { sample.aligner } {% endif %} {% if sample.aligner == "bowtie2" %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% else %} --genome-index { refgenie[sample.genome].bowtie2_index.dir } --prealignments {% for p in sample.prealignments %} { p } { refgenie[p].bowtie2_index.dir }{% endfor %} {% endif %} {% else %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% else %} --genome-index { refgenie[sample.genome].bwa_index.dir } --prealignments {% for p in sample.prealignments %} { p } { refgenie[p].bwa_index.dir }{% endfor %} {% endif %} {% endif %} From 4e213a232fdf4d6b99901ad8245e4f0cef8e7342 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 28 Jun 2021 13:26:34 -0400 Subject: [PATCH 36/66] check for asset existence and use if present --- sample_pipeline_interface.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index 728d1def..9789a0f1 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -14,9 +14,9 @@ command_template: > --single-or-paired { sample.read_type } --genome { sample.genome } --chrom-sizes { refgenie[sample.genome].fasta.chrom_sizes } - {% if sample.TSS_name is defined %} --TSS-name { sample.TSS_name } {% elif refgenie[sample.genome].refgene_anno.refgene_tss is defined %} --TSS-name { refgenie[sample.genome].refgene_anno.refgene_tss } {% endif %} - {% if sample.blacklist is defined %} --blacklist { sample.blacklist } {% else %} { refgenie[sample.genome].blacklist.blacklist } {% endif %} - {% if sample.anno_name is defined %} --anno-name { sample.anno_name } {% else %} { refgenie[sample.genome].feat_annotation.feat_annotation } {% endif %} + {% if sample.TSS_name is defined %} 
--TSS-name { sample.TSS_name } {% elif refgenie[sample.genome].refgene_anno is defined %} --TSS-name { refgenie[sample.genome].refgene_anno.refgene_tss } {% endif %} + {% if sample.blacklist is defined %} --blacklist { sample.blacklist } {% elif refgenie[sample.genome].blacklist is defined %} { refgenie[sample.genome].blacklist.blacklist } {% endif %} + {% if sample.anno_name is defined %} --anno-name { sample.anno_name } {% elif refgenie[sample.genome].feat_annotation is defined %} { refgenie[sample.genome].feat_annotation.feat_annotation } {% endif %} {% if sample.trimmer is defined %} --trimmer { sample.trimmer } {% endif %} {% if sample.aligner is defined %} --aligner { sample.aligner } {% endif %} {% if sample.aligner == "bowtie2" %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% else %} --genome-index { refgenie[sample.genome].bowtie2_index.dir } --prealignments {% for p in sample.prealignments %} { p } { refgenie[p].bowtie2_index.dir }{% endfor %} {% endif %} {% else %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% else %} --genome-index { refgenie[sample.genome].bwa_index.dir } --prealignments {% for p in sample.prealignments %} { p } { refgenie[p].bwa_index.dir }{% endfor %} {% endif %} {% endif %} From 8099a0777554cdc260a0706dd1e8763ccc628ff0 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 28 Jun 2021 13:45:09 -0400 Subject: [PATCH 37/66] add asset checks for genome indicies --- sample_pipeline_interface.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index 9789a0f1..c68d478f 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -13,13 +13,14 @@ command_template: > {% if sample.read2 is defined %} --input2 { sample.read2 } {% endif %} --single-or-paired { sample.read_type } --genome { sample.genome } - --chrom-sizes { 
refgenie[sample.genome].fasta.chrom_sizes } + {% if sample.chrom_sizes is defined %} --chrom-sizes { sample.chrom_sizes } {% elif refgenie[sample.genome].fasta is defined %} --chrom-sizes { refgenie[sample.genome].fasta.chrom_sizes } {% endif %} {% if sample.TSS_name is defined %} --TSS-name { sample.TSS_name } {% elif refgenie[sample.genome].refgene_anno is defined %} --TSS-name { refgenie[sample.genome].refgene_anno.refgene_tss } {% endif %} - {% if sample.blacklist is defined %} --blacklist { sample.blacklist } {% elif refgenie[sample.genome].blacklist is defined %} { refgenie[sample.genome].blacklist.blacklist } {% endif %} - {% if sample.anno_name is defined %} --anno-name { sample.anno_name } {% elif refgenie[sample.genome].feat_annotation is defined %} { refgenie[sample.genome].feat_annotation.feat_annotation } {% endif %} + {% if sample.blacklist is defined %} --blacklist { sample.blacklist } {% elif refgenie[sample.genome].blacklist is defined %} --blacklist { refgenie[sample.genome].blacklist.blacklist } {% endif %} + {% if sample.anno_name is defined %} --anno-name { sample.anno_name } {% elif refgenie[sample.genome].feat_annotation is defined %} --anno-name { refgenie[sample.genome].feat_annotation.feat_annotation } {% endif %} {% if sample.trimmer is defined %} --trimmer { sample.trimmer } {% endif %} {% if sample.aligner is defined %} --aligner { sample.aligner } {% endif %} - {% if sample.aligner == "bowtie2" %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% else %} --genome-index { refgenie[sample.genome].bowtie2_index.dir } --prealignments {% for p in sample.prealignments %} { p } { refgenie[p].bowtie2_index.dir }{% endfor %} {% endif %} {% else %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% else %} --genome-index { refgenie[sample.genome].bwa_index.dir } --prealignments {% for p in sample.prealignments %} { p } { refgenie[p].bwa_index.dir }{% endfor %} {% endif %} {% endif %} 
+ {% if sample.aligner == "bowtie2" %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% elif refgenie[sample.genome].bowtie2_index is defined %} --genome-index { refgenie[sample.genome].bowtie2_index.dir } {% endif %} {% else %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% elif refgenie[sample.genome].bwa_index is defined %} --genome-index { refgenie[sample.genome].bwa_index.dir } {% endif %} {% endif %} + {% if sample.aligner == "bowtie2" %} --prealignments {% for p in sample.prealignments %} { p } { refgenie[p].bowtie2_index.dir }{% endfor %} {% else %} --prealignments {% for p in sample.prealignments %} { p } { refgenie[p].bwa_index.dir }{% endfor %} {% endif %} {% if sample.deduplicator is defined %} --deduplicator { sample.deduplicator } {% endif %} {% if sample.peak_caller is defined %} --peak-caller { sample.peak_caller } {% endif %} {% if sample.peak_type is defined %} --peak-type { sample.peak_type } {% endif %} @@ -28,7 +29,7 @@ command_template: > {% if sample.frip_ref_peaks is defined %} --frip-ref-peaks { sample.frip_ref_peaks } {% endif %} {% if sample.motif is defined %} --motif {% endif %} {% if sample.sob is defined %} --sob {% endif %} - {% if sample.sob is defined %} --search-file { refgenie[sample.genome].tallymer_index.search_file } {% endif %} + {% if sample.sob is defined %} {% if refgenie[sample.genome].tallymer_index is defined %} --search-file { refgenie[sample.genome].tallymer_index.search_file } {% endif %} {% endif %} {% if sample.no_scale is defined %} --no-scale {% endif %} {% if sample.prioritize is defined %} --prioritize {% endif %} {% if sample.keep is defined %} --keep {% endif %} From d67465d2dd37eeb571d528fe5eaedde53b38e6bd Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 28 Jun 2021 13:45:35 -0400 Subject: [PATCH 38/66] drop unused plugins --- sample_pipeline_interface.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/sample_pipeline_interface.yaml 
b/sample_pipeline_interface.yaml index c68d478f..77a3248c 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -48,8 +48,6 @@ bioconductor: readFunPath: BiocProject/runCOCOA.R var_templates: refgenie_config: "$REFGENIE" -# custom_template: sample_template.jinja pre_submit: python_functions: - refgenconf.looper_refgenie_populate -# - looper.write_custom_template From 7b6ce24d3212c9b1ce41a350adf0942cc513ad59 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 28 Jun 2021 15:59:30 -0400 Subject: [PATCH 39/66] change normalized file sample naming --- PEPATACr/R/PEPATACr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R index 7d53940d..7b09c59b 100644 --- a/PEPATACr/R/PEPATACr.R +++ b/PEPATACr/R/PEPATACr.R @@ -1719,7 +1719,7 @@ reducePeaks <- function(input, chr_sizes, output=NA, normalize=FALSE) { final[score < 0, score := 0] # save final peak set if (is.na(output)) { - fwrite(final, paste0(sampleName(input), + fwrite(final, paste0(sampleName(input, 1), "_peaks_normalized.narrowPeak"), sep="\t", col.names=FALSE) } else { From 5adcb94a0c61e6fd98f8d4779de2407608fe8a1a Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 28 Jun 2021 16:47:40 -0400 Subject: [PATCH 40/66] split prealignment name and path by equals sign --- pipelines/pepatac.py | 4 +++- sample_pipeline_interface.yaml | 16 ++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 015a1ecd..55a146a3 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -927,7 +927,9 @@ def pairs(l): "See http://pepatac.databio.org/en/latest/ for documentation.") else: # Loop through any prealignment references and map to them sequentially - for genome, genome_index in pairs(res.prealignment_index): + for prealignment in res.prealignment_index: + pm.debug(f"prealignment: {prealignment}") + genome, genome_index = prealignment.split('=') if not 
genome_index.endswith(genome): # Replace last occurrence of . with genome name genome_index = genome_index[:genome_index.rfind(".")] + genome diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index 77a3248c..e4562a9b 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -17,15 +17,15 @@ command_template: > {% if sample.TSS_name is defined %} --TSS-name { sample.TSS_name } {% elif refgenie[sample.genome].refgene_anno is defined %} --TSS-name { refgenie[sample.genome].refgene_anno.refgene_tss } {% endif %} {% if sample.blacklist is defined %} --blacklist { sample.blacklist } {% elif refgenie[sample.genome].blacklist is defined %} --blacklist { refgenie[sample.genome].blacklist.blacklist } {% endif %} {% if sample.anno_name is defined %} --anno-name { sample.anno_name } {% elif refgenie[sample.genome].feat_annotation is defined %} --anno-name { refgenie[sample.genome].feat_annotation.feat_annotation } {% endif %} - {% if sample.trimmer is defined %} --trimmer { sample.trimmer } {% endif %} - {% if sample.aligner is defined %} --aligner { sample.aligner } {% endif %} - {% if sample.aligner == "bowtie2" %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% elif refgenie[sample.genome].bowtie2_index is defined %} --genome-index { refgenie[sample.genome].bowtie2_index.dir } {% endif %} {% else %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% elif refgenie[sample.genome].bwa_index is defined %} --genome-index { refgenie[sample.genome].bwa_index.dir } {% endif %} {% endif %} - {% if sample.aligner == "bowtie2" %} --prealignments {% for p in sample.prealignments %} { p } { refgenie[p].bowtie2_index.dir }{% endfor %} {% else %} --prealignments {% for p in sample.prealignments %} { p } { refgenie[p].bwa_index.dir }{% endfor %} {% endif %} + {% if sample.trimmer is defined %} --trimmer { sample.trimmer } {% else %} --trimmer "skewer" {% endif %} + {% if 
sample.aligner is defined %} --aligner { sample.aligner } {% set aligner = sample.aligner %} {% else %} --aligner "bowtie2" {% set aligner = "bowtie2" %} {% endif %} + {% if aligner == "bowtie2" or sample.aligner == "bowtie2" %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% elif refgenie[sample.genome].bowtie2_index is defined %} --genome-index { refgenie[sample.genome].bowtie2_index.dir } {% endif %} {% else %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% elif refgenie[sample.genome].bwa_index is defined %} --genome-index { refgenie[sample.genome].bwa_index.dir } {% endif %} {% endif %} + {% if aligner == "bowtie2" or sample.aligner == "bowtie2" %} --prealignments {% for p in sample.prealignments %} { p ~ '=' ~ refgenie[p].bowtie2_index.dir } {% endfor %} {% else %} --prealignments {% for p in sample.prealignments %} { p ~ '=' ~ refgenie[p].bwa_index.dir } {% endfor %} {% endif %} {% if sample.deduplicator is defined %} --deduplicator { sample.deduplicator } {% endif %} - {% if sample.peak_caller is defined %} --peak-caller { sample.peak_caller } {% endif %} - {% if sample.peak_type is defined %} --peak-type { sample.peak_type } {% endif %} - {% if sample.extend is defined %} --extend { sample.extend } {% endif %} - {% if sample.genome_size is defined %} --genome-size { sample.genome_size } {% endif %} + {% if sample.peak_caller is defined %} --peak-caller { sample.peak_caller } {% else %} --peak-caller "macs2" {% endif %} + {% if sample.peak_type is defined %} --peak-type { sample.peak_type } {% else %} --peak-type "fixed" {% endif %} + {% if sample.extend is defined %} --extend { sample.extend } {% else %} --extend 250 {% endif %} + {% if sample.genome_size is defined %} --genome-size { sample.genome_size } {% else %} --genome-size "2.7e9" {% endif %} {% if sample.frip_ref_peaks is defined %} --frip-ref-peaks { sample.frip_ref_peaks } {% endif %} {% if sample.motif is defined %} --motif {% 
endif %} {% if sample.sob is defined %} --sob {% endif %} From 28a5eb302ba915f9a0c7e4c0421167ddc36fa522 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 28 Jun 2021 16:49:22 -0400 Subject: [PATCH 41/66] prealignments must be array --- examples/test_project/test_config.yaml | 5 +---- pepatac_input_schema.yaml | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/test_project/test_config.yaml b/examples/test_project/test_config.yaml index ac7efc59..fa0fb0f0 100644 --- a/examples/test_project/test_config.yaml +++ b/examples/test_project/test_config.yaml @@ -28,7 +28,4 @@ sample_modifiers: trimmer: skewer # Default. [options: pyadapt, trimmomatic] peak_type: fixed # Default. [options: variable] extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. - frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run - #TSS_name: "/project/shefflab/genomes_v04_210301/alias/hg38/refgene_anno/default/hg38_TSS.bed" - blacklist: /project/shefflab/genomes_v04_210301/alias/hg38/blacklist/default/hg38_blacklist.bed.gz - anno_name: /project/shefflab/genomes_v04_210301/alias/hg38/feat_annotation/default/hg38_annotations.bed.gz + frip_ref_peaks: None # Default. 
Use an external reference set of peaks instead of the peaks called from this run \ No newline at end of file diff --git a/pepatac_input_schema.yaml b/pepatac_input_schema.yaml index 92d64226..7472b587 100644 --- a/pepatac_input_schema.yaml +++ b/pepatac_input_schema.yaml @@ -21,7 +21,7 @@ properties: type: string description: "Refgenie genome registry identifier" prealignments: - type: ["string", "array"] + type: ["array"] descrption: "Refgenie genome registry identifiers" read_type: type: string From c9ce2222a76870139e2a9eac6092f8fde88b01d7 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 29 Jun 2021 11:17:48 -0400 Subject: [PATCH 42/66] pass sample name explicitly to reducePeaks function --- PEPATACr/R/PEPATACr.R | 6 +++--- pipelines/pepatac.py | 13 +------------ tools/PEPATAC.R | 22 +++++++++++++--------- 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R index 7b09c59b..f1042eb0 100644 --- a/PEPATACr/R/PEPATACr.R +++ b/PEPATACr/R/PEPATACr.R @@ -1640,12 +1640,13 @@ narrowPeakToBigBed <- function(input=input, chr_sizes=chr_sizes, #' peaks. It also trims peaks extending beyond the bounds of the chromosome. #' #' @param input Path to narrowPeak file +#' @param sample_name Sample name character string #' @param chr_sizes Genome chromosome sizes file. #' @param output Output file name. #' @param normalize Remove overlaps and normalize the score. 
#' @keywords reduce fixed peaks #' @export -reducePeaks <- function(input, chr_sizes, output=NA, normalize=FALSE) { +reducePeaks <- function(input, sample_name, chr_sizes, output=NA, normalize=FALSE) { info <- file.info(file.path(input)) if (file.exists(file.path(input)) && info$size != 0) { peaks <- fread(file.path(input)) @@ -1719,8 +1720,7 @@ reducePeaks <- function(input, chr_sizes, output=NA, normalize=FALSE) { final[score < 0, score := 0] # save final peak set if (is.na(output)) { - fwrite(final, paste0(sampleName(input, 1), - "_peaks_normalized.narrowPeak"), + fwrite(final, paste0(sample_name, "_peaks_normalized.narrowPeak"), sep="\t", col.names=FALSE) } else { fwrite(final, output, sep="\t", col.names=FALSE) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 55a146a3..2dd1d0ee 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -907,18 +907,6 @@ def check_trim(): unmap_genome_bam = os.path.join( map_genome_folder, args.sample_name + "_unmap.bam") - # Keep track of the unmapped files in order to compress them after final - # alignment. - def pairs(l): - ''' - Iterate over a list in pairs - ''' - i = iter(l) - prev = next(i) - for item in i: - yield prev, item - prev = item - to_compress = [] if len(res.prealignment_index) == 0: print("You may use `--prealignment-bowtie2-index` or " @@ -2129,6 +2117,7 @@ def report_peak_count(): cmd = build_command([tools.Rscript, (tool_path("PEPATAC.R"), "reduce"), ("-i", peak_output_file), + ("-s", args.sample_name), ("-c", res.chrom_sizes), "--normalize" ]) diff --git a/tools/PEPATAC.R b/tools/PEPATAC.R index d100c4f3..256444d1 100755 --- a/tools/PEPATAC.R +++ b/tools/PEPATAC.R @@ -588,6 +588,7 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { "Version: ", version, "\n\n", "Command: reduce \t reduce overlapping peaks\n\n", " -i, --input\t\t Path to narrowPeak file.\n", + " -s, --sample_name\t\t Sample name character string.\n", " -c, --chr_sizes\t Genome chromosome sizes file. 
.\n", " -o, --output\t\t Output file (optional).\n", " -n, --normalize\t\t Normalize scores.\n" @@ -604,18 +605,21 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { message(usage) quit() } else { - input <- opt_get(name = c("input", "i"), required=TRUE, - description="Path to narrowPeak file.") - chr_sizes <- opt_get(name = c("chr_sizes", "c"), required=TRUE, - description="Genome chromosome sizes file. .") - output <- opt_get(name = c("output", "o"), required=FALSE, - description="Output file.") - normalize <- opt_get(name = c("normalize", "n"), required=FALSE, - default=FALSE, - description="Normalize scores.") + input <- opt_get(name = c("input", "i"), required=TRUE, + description="Path to narrowPeak file.") + sample_name <- opt_get(name = c("sample_name", "s"), required=TRUE, + description="Sample name character string.") + chr_sizes <- opt_get(name = c("chr_sizes", "c"), required=TRUE, + description="Genome chromosome sizes file. .") + output <- opt_get(name = c("output", "o"), required=FALSE, + description="Output file.") + normalize <- opt_get(name = c("normalize", "n"), required=FALSE, + default=FALSE, + description="Normalize scores.") #print(message(paste0("Normalize: ", normalize))) if (is.na(output)) {output <- NA} PEPATACr::reducePeaks(input=input, + sample_name=sample_name, chr_sizes=chr_sizes, output=output, normalize=normalize) From 2d25bbaf8106d6843afb6a3862fea05efa6c4ca9 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 29 Jun 2021 13:14:16 -0400 Subject: [PATCH 43/66] update blacklist reduce peaks step --- pipelines/pepatac.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 2dd1d0ee..de7ba745 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -2179,6 +2179,7 @@ def report_peak_count(): cmd1 = build_command([tools.Rscript, (tool_path("PEPATAC.R"), "reduce"), ("-i", filter_peak), + ("-s", args.sample_name), ("-c", res.chrom_sizes) ]) cmd2 = ("touch " + blacklist_target) From 
03a184cf04bcf93fe8a652aa42a39750e36d50ee Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 29 Jun 2021 13:14:51 -0400 Subject: [PATCH 44/66] fix file.path use in reducePeaks --- PEPATACr/R/PEPATACr.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R index f1042eb0..0c1719d4 100644 --- a/PEPATACr/R/PEPATACr.R +++ b/PEPATACr/R/PEPATACr.R @@ -1720,7 +1720,8 @@ reducePeaks <- function(input, sample_name, chr_sizes, output=NA, normalize=FALS final[score < 0, score := 0] # save final peak set if (is.na(output)) { - fwrite(final, paste0(sample_name, "_peaks_normalized.narrowPeak"), + file_path <- file.path(dirname(input), sample_name) + fwrite(final, paste0(file_path, "_peaks_normalized.narrowPeak"), sep="\t", col.names=FALSE) } else { fwrite(final, output, sep="\t", col.names=FALSE) From bdb0d9301c00b38f44c2a9a1b9ed07a9eab62da1 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 29 Jun 2021 13:28:36 -0400 Subject: [PATCH 45/66] change prealignments to array --- examples/gold_atac/metadata/gold_config.yaml | 137 +++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 examples/gold_atac/metadata/gold_config.yaml diff --git a/examples/gold_atac/metadata/gold_config.yaml b/examples/gold_atac/metadata/gold_config.yaml new file mode 100644 index 00000000..b344b34e --- /dev/null +++ b/examples/gold_atac/metadata/gold_config.yaml @@ -0,0 +1,137 @@ +name: gold_atac + +pep_version: 2.0.0 +sample_table: gold_sample_table.csv + +looper: + output_dir: "$PROCESSED/pepatac/gold_atac/default" + pipeline_interfaces: "$CODE/pepatac/project_pipeline_interface.yaml" + +sample_modifiers: + append: + pipeline_interfaces: "$CODE/pepatac/sample_pipeline_interface.yaml" + derive: + attributes: [read1, read2] + sources: + SRA: "/scratch/jps3dp/tools/databio/pepatac/examples/gold_atac/metadata/{SRR}.bam" + SRA_1: "$SRAFQ/{SRR}_1.fastq.gz" + SRA_2: "$SRAFQ/{SRR}_2.fastq.gz" + imply: + - if: + organism: ["human", "Homo 
sapiens", "Human", "Homo_sapiens"] + then: + genome: hg38 + genome_size: hs + prealignments: ["rCRSd"] + +project_modifiers: + amend: + genrich: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/peak_caller/genrich" + sample_modifiers: + append: + peak_caller: genrich + peak_type: variable + hmmratac: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/peak_caller/hmmratac" + sample_modifiers: + append: + peak_caller: hmmratac + peak_type: variable + homer: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/peak_caller/homer" + sample_modifiers: + append: + peak_caller: homer + peak_type: variable + fseq: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/peak_caller/fseq" + sample_modifiers: + append: + peak_caller: fseq + peak_type: variable + fseq2: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/peak_caller/fseq2" + sample_modifiers: + append: + peak_caller: fseq2 + peak_type: variable + variable: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/peak_caller/macs2/variable" + sample_modifiers: + append: + peak_caller: macs2 + peak_type: variable + bwa: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/aligner/bwa" + sample_modifiers: + append: + aligner: bwa + picard: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/dedup/picard" + sample_modifiers: + append: + deduplicator: picard + samtools: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/dedup/samtools" + sample_modifiers: + append: + deduplicator: samtools + motif: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/motif" + sample_modifiers: + append: + motif: True + noscale: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/no-scale" + sample_modifiers: + append: + no-scale: True + prioritize: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/prioritize" + sample_modifiers: + append: + prioritize: True + sob: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/sob/default" + sample_modifiers: + append: + sob: True + sob_noscale: + looper: + output_dir: 
"$PROCESSED/pepatac/gold_atac/sob/no-scale" + sample_modifiers: + append: + sob: True + no-scale: True + skipqc: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/skipqc" + sample_modifiers: + append: + skipqc: True + nofifo: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/noFIFO" + sample_modifiers: + append: + noFIFO: True + lite: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/lite" + sample_modifiers: + append: + lite: True From 4c92a49fcd4eb600b9ecbe516787e2e2d6d0ce68 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 30 Jun 2021 16:53:01 -0400 Subject: [PATCH 46/66] rework CLI arguments and refgenie populate integration for prealignments --- docs/run-bulker.md | 49 +++++++++++++-- examples/test_project/test_config.yaml | 17 ++--- .../test_project/test_config_refgenie.yaml | 24 +++++++ pipelines/pepatac.py | 63 +++++++++++-------- sample_pipeline_interface.yaml | 2 +- 5 files changed, 114 insertions(+), 41 deletions(-) create mode 100644 examples/test_project/test_config_refgenie.yaml diff --git a/docs/run-bulker.md b/docs/run-bulker.md index 5def8be0..3be6841f 100644 --- a/docs/run-bulker.md +++ b/docs/run-bulker.md @@ -19,9 +19,13 @@ If you would still prefer using a single container, we do provide a [PEPATAC doc git clone https://github.com/databio/pepatac.git ``` -### 2: Initialize `refgenie` and download assets +### 2: Get genome assets -`PEPATAC` uses [`refgenie`](http://refgenie.databio.org/) assets for alignment. Because assets are user-dependent, these files must still exist outside of a container system. We need to [install and initialize a refgenie config file.](http://refgenie.databio.org/en/latest/install/). For example: +We [recommend `refgenie` to manage all required and optional genome assets](run-bulker.md#2a-initialize-refgenie-and-download-assets). However, [`PEPATAC` can also accept file paths to any of the assets](run-bulker.md#2b-download-assets). 
+ +#### 2a: Initialize `refgenie` and download assets + +`PEPATAC` can utilize [`refgenie`](http://refgenie.databio.org/) assets. Because assets are user-dependent, these files must still exist outside of a container system. Therefore, we need to [install and initialize a refgenie config file.](http://refgenie.databio.org/en/latest/install/). For example: ```console pip install refgenie @@ -45,6 +49,17 @@ refgenie pull rCRSd/bowtie2_index refgenie pull human_repeats/bowtie2_index ``` +#### 2b: Download assets + +If you prefer not to use `refgenie`, you can also download and construct assets manually. The minimum required assets for a genome includes: + - a chromosome sizes file: a text file containing "chr" and "size" columns. + - a [`bowtie2` genome index](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-build-indexer). + +Optional assets include: + - a TSS annotation file: a BED file containing "chr", "start", "end", "gene name", "score", and "strand" columns. + - a region blacklist: e.g. [the ENCODE blacklist](https://github.com/Boyle-Lab/Blacklist) + - a [genomic feature annotation file](annotation.md) + ### 3. Install and configure `bulker` Check out [the `bulker` setup guide to install bulker](https://bulker.databio.org/en/latest/install/) on your system. It is a straightforward python package with a few configuration steps required prior to use with `PEPATAC`. @@ -70,20 +85,46 @@ Now we simply run the pipeline like you would with a native installation, but we #### Run the pipeline at the command line -From the `pepatac/` repository folder: +If you are using `refgenie`, you can grab the path to the `--chrom-sizes` and `--genome-index` files as follows: +```console +refgenie seek hg38/fasta.chrom_sizes +refgenie seek hg38/bowtie2_index.dir +refgenie seek rCRSd/bowtie2_index.dir +``` + +Alternatively, if you are *not* using `refgenie`, you can still grab premade `--chrom-sizes` and `--genome-index` files from the refgenie servers. 
`Refgenie` uses algorithmically derived genome digests under-the-hood to unambiguously define genomes. That's what you'll see being used in the example below when we manually download these assets. Therefore, `2230c535660fb4774114bfa966a62f823fdb6d21acf138d4` is the digest for the human readable alias, "hg38", and `94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4` is the digest for "rCRSd." +```console +wget -O hg38.fasta.tgz http://rg.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta?tag=default +wget -O hg38.bowtie2_index.tgz http://rg.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/bowtie2_index?tag=default +wget -O rCRSd.bowtie2_index.tgz http://refgenomes.databio.org/v3/assets/archive/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index?tag=default +``` + +From the `pepatac/` repository folder (using the manually downloaded genome assets): ```console pipelines/pepatac.py --single-or-paired paired \ - --prealignments rCRSd human_repeats \ + --prealignments rCRSd=default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4 \ --genome hg38 \ + --genome-index default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 \ + --chrom-sizes default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes \ --sample-name test1 \ --input examples/data/test1_r1.fastq.gz \ --input2 examples/data/test1_r2.fastq.gz \ --genome-size hs \ -O $HOME/pepatac_test ``` + +With a single core, this will take 20-30 minutes to complete. + #### Run the pipeline using looper Since `bulker` automatically direct any calls to required software to instead be executed in containers, we can just run our project the exact same way we would when we installed everything natively! 
+ +**Run the pipeline with looper and manual asset specifications** ```console looper run examples/test_project/test_config.yaml +``` + +**Run the pipeline with looper and refgenie** +```console +looper run examples/test_project/test_config_refgenie.yaml ``` \ No newline at end of file diff --git a/examples/test_project/test_config.yaml b/examples/test_project/test_config.yaml index fa0fb0f0..f9c959a6 100644 --- a/examples/test_project/test_config.yaml +++ b/examples/test_project/test_config.yaml @@ -14,18 +14,13 @@ sample_modifiers: derive: attributes: [read1, read2] sources: - test_data_R1: "examples/data/{sample_name}_chr22_r1.fastq.gz" - test_data_R2: "examples/data/{sample_name}_chr22_r2.fastq.gz" + test_data_R1: "examples/data/{sample_name}_r1.fastq.gz" + test_data_R2: "examples/data/{sample_name}_r2.fastq.gz" imply: - if: organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] then: - genome: hg38_chr22 - macs_genome_size: hs - prealignments: ["rCRSd"] - aligner: bowtie2 # Default. [options: bwa] - deduplicator: samblaster # Default. [options: picard] - trimmer: skewer # Default. [options: pyadapt, trimmomatic] - peak_type: fixed # Default. [options: variable] - extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. - frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run \ No newline at end of file + genome: hg38 + genome_index: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 + chrom_sizes: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes + prealignment_index: ["rCRSd=default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4"] diff --git a/examples/test_project/test_config_refgenie.yaml b/examples/test_project/test_config_refgenie.yaml new file mode 100644 index 00000000..badfa4ae --- /dev/null +++ b/examples/test_project/test_config_refgenie.yaml @@ -0,0 +1,24 @@ +# This project config file describes your project. 
See looper docs for details. +name: test_project # The name that summary files will be prefaced with + +pep_version: 2.0.0 +sample_table: test_annotation.csv # sheet listing all samples in the project + +looper: # relative paths are relative to this config file + output_dir: pepatac_test + pipeline_interfaces: ../../project_pipeline_interface.yaml # PATH to the directory where looper will find the pipeline repository. + +sample_modifiers: + append: + pipeline_interfaces: ../../sample_pipeline_interface.yaml + derive: + attributes: [read1, read2] + sources: + test_data_R1: "examples/data/{sample_name}_r1.fastq.gz" + test_data_R2: "examples/data/{sample_name}_r2.fastq.gz" + imply: + - if: + organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] + then: + genome: hg38 + prealignment_names: ["rCRSd"] diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index de7ba745..ee2caf07 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -5,7 +5,7 @@ __author__ = ["Jin Xu", "Nathan Sheffield", "Jason Smith"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.11.0" +__version__ = "0.10.0" from argparse import ArgumentParser @@ -40,8 +40,8 @@ def parse_arguments(): parser = ArgumentParser(description='PEPATAC version ' + __version__) parser = pypiper.add_pypiper_args(parser, groups= ['pypiper', 'looper', 'ngs'], - required=["input", "genome", "sample-name", "output-parent", - "chrom-sizes", "primary-index"]) + required=["input", "genome", "sample_name", "output_parent", + "chrom_sizes", "genome_index"]) # Pipeline-specific arguments parser.add_argument("--trimmer", dest="trimmer", type=str.lower, @@ -119,18 +119,25 @@ def parse_arguments(): help="Skip FastQC. 
Useful for bugs in FastQC " "that appear with some sequence read files.") - # Genome assets - parser.add_argument("--prealignments", default=[], type=str, + # Prealignment genome assets + parser.add_argument("--prealignment-names", default=[], type=str, + nargs="+", + help="Space-delimited list of prealignment genome " + "names to align to before primary alignment.") + + parser.add_argument("--prealignment-index", default=[], type=str, nargs="+", - help="Space-delimited list of reference genomes to " - "align to before primary alignment.") + help="Space-delimited list of prealignment genome " + "name and index files delimited by an equals sign " + "to align to before primary alignment. " + "e.g. rCRSd=/path/to/bowtie2_index/.") # Genome assets - parser.add_argument("--genome-index", default=None, + parser.add_argument("--genome-index", default=None, required=True, dest="genome_index", type=str, help="Path to primary genome index file. Either a " "bowtie2 or bwa index.") - parser.add_argument("--chrom-sizes", default=None, + parser.add_argument("--chrom-sizes", default=None, required=True, dest="chrom_sizes", type=str, help="Path to primary genome chromosome sizes file.") @@ -603,22 +610,28 @@ def main(): GENOME_IDX_KEY = "bowtie2_index" # Add prealignment genome annotation files to resources - pm.debug(f"prealignments: {args.prealignments}") - res.prealignment_index = args.prealignments + if args.prealignment_index: + pm.debug(f"prealignments: {args.prealignment_index}") + res.prealignment_index = args.prealignment_index + else: + res.prealignment_index = None # Add primary genome annotation files to resources res.genome_index = args.genome_index - if not res.genome_index.endswith(args.genome_assembly): + + if res.genome_index.endswith("."): # Replace last occurrence of . 
with genome name - res.genome_index = (res.genome_index[:res.genome_index.rfind(".")] + - args.genome_assembly) + res.genome_index = os.path.abspath(( + res.genome_index[:res.genome_index.rfind(".")] + + args.genome_assembly) + ) if args.aligner.lower() == "bwa": res.genome_index += ".fa" pm.debug(f"primary genome index: {args.genome_index}") if (args.chrom_sizes and os.path.isfile(args.chrom_sizes) and os.stat(args.chrom_sizes).st_size > 0): - res.chrom_sizes = args.chrom_sizes + res.chrom_sizes = os.path.abspath(args.chrom_sizes) # Add optional files to resources if args.sob and not args.search_file: @@ -628,20 +641,20 @@ def main(): pm.fail_pipeline(RuntimeError(err_msg)) if (args.search_file and os.path.isfile(args.search_file) and os.stat(args.search_file).st_size > 0): - res.search_file = args.search_file + res.search_file = os.path.abspath(args.search_file) if (args.blacklist and os.path.isfile(args.blacklist) and os.stat(args.blacklist).st_size > 0): - res.blacklist = args.blacklist + res.blacklist = os.path.abspath(args.blacklist) if (args.TSS_name and os.path.isfile(args.TSS_name) and os.stat(args.TSS_name).st_size > 0): - res.refgene_tss = args.TSS_name + res.refgene_tss = os.path.abspath(args.TSS_name) if (args.anno_name and os.path.isfile(args.anno_name) and os.stat(args.anno_name).st_size > 0): - res.feat_annotation = args.anno_name + res.feat_annotation = os.path.abspath(args.anno_name) if (args.frip_ref_peaks and os.path.isfile(args.frip_ref_peaks) and os.stat(args.frip_ref_peaks).st_size > 0): - res.frip_ref_peaks = args.frip_ref_peaks + res.frip_ref_peaks = os.path.abspath(args.frip_ref_peaks) # Adapter file can be set in the config; if left null, we use a default. 
res.adapters = res.adapters or tool_path("NexteraPE-PE.fa") @@ -908,19 +921,19 @@ def check_trim(): map_genome_folder, args.sample_name + "_unmap.bam") to_compress = [] - if len(res.prealignment_index) == 0: - print("You may use `--prealignment-bowtie2-index` or " - "`--prealignment-bwa-index` to align to references before " - "the genome alignment step. " + if len(res.prealignment_index) == 0 or res.prealignment_index is None: + print("You may use `--prealignment-index` to align to references " + "before the genome alignment step. " "See http://pepatac.databio.org/en/latest/ for documentation.") else: # Loop through any prealignment references and map to them sequentially for prealignment in res.prealignment_index: pm.debug(f"prealignment: {prealignment}") genome, genome_index = prealignment.split('=') - if not genome_index.endswith(genome): + if genome_index.endswith("."): # Replace last occurrence of . with genome name genome_index = genome_index[:genome_index.rfind(".")] + genome + genome_index = os.path.abspath(genome_index) #genome_index = genome_index.replace('.',genome) if args.aligner.lower() == "bwa": genome_index += ".fa" diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index e4562a9b..baca0805 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -20,7 +20,7 @@ command_template: > {% if sample.trimmer is defined %} --trimmer { sample.trimmer } {% else %} --trimmer "skewer" {% endif %} {% if sample.aligner is defined %} --aligner { sample.aligner } {% set aligner = sample.aligner %} {% else %} --aligner "bowtie2" {% set aligner = "bowtie2" %} {% endif %} {% if aligner == "bowtie2" or sample.aligner == "bowtie2" %} {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% elif refgenie[sample.genome].bowtie2_index is defined %} --genome-index { refgenie[sample.genome].bowtie2_index.dir } {% endif %} {% else %} {% if sample.genome_index is defined %} --genome-index { 
sample.genome_index } {% elif refgenie[sample.genome].bwa_index is defined %} --genome-index { refgenie[sample.genome].bwa_index.dir } {% endif %} {% endif %} - {% if aligner == "bowtie2" or sample.aligner == "bowtie2" %} --prealignments {% for p in sample.prealignments %} { p ~ '=' ~ refgenie[p].bowtie2_index.dir } {% endfor %} {% else %} --prealignments {% for p in sample.prealignments %} { p ~ '=' ~ refgenie[p].bwa_index.dir } {% endfor %} {% endif %} + {% if sample.prealignment_index is defined %} --prealignment-index { sample.prealignment_index } {% elif aligner == "bowtie2" or sample.aligner == "bowtie2" and sample.prealignment_names is defined %} --prealignment-index {% for p in sample.prealignment_names %} { p ~ '=' ~ refgenie[p].bowtie2_index.dir } {% endfor %} {% else %} --prealignment-index {% for p in sample.prealignment_names %} { p ~ '=' ~ refgenie[p].bwa_index.dir } {% endfor %} {% endif %} {% if sample.deduplicator is defined %} --deduplicator { sample.deduplicator } {% endif %} {% if sample.peak_caller is defined %} --peak-caller { sample.peak_caller } {% else %} --peak-caller "macs2" {% endif %} {% if sample.peak_type is defined %} --peak-type { sample.peak_type } {% else %} --peak-type "fixed" {% endif %} From 01df97566b051a1b23adbc27d2da422e9d731232 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 5 Jul 2021 14:08:41 -0400 Subject: [PATCH 47/66] update installation and use docs for sans refgenie approaches --- docs/run-bulker.md | 31 ++- docs/run-cluster.md | 30 ++- docs/run-conda.md | 55 ++++-- docs/run-container.md | 69 ++++--- docs/run-directly.md | 9 +- docs/tutorial.md | 230 ++++++++++++++++++++++- examples/tutorial/tutorial.yaml | 5 +- examples/tutorial/tutorial_refgenie.yaml | 30 +++ 8 files changed, 391 insertions(+), 68 deletions(-) create mode 100644 examples/tutorial/tutorial_refgenie.yaml diff --git a/docs/run-bulker.md b/docs/run-bulker.md index 3be6841f..ff490575 100644 --- a/docs/run-bulker.md +++ b/docs/run-bulker.md @@ -46,10 
+46,9 @@ refgenie build hg38/feat_annotation ```console refgenie pull rCRSd/bowtie2_index -refgenie pull human_repeats/bowtie2_index ``` -#### 2b: Download assets +#### 2b: Download assets manually If you prefer not to use `refgenie`, you can also download and construct assets manually. The minimum required assets for a genome includes: - a chromosome sizes file: a text file containing "chr" and "size" columns. @@ -79,7 +78,7 @@ bulker activate databio/pepatac:1.0.7 ``` Now, you can run any of the commands in the crate as if they were natively installed, **but they're actually running in containers**! -### 6. Run the pipeline +### 6. Run the sample-level pipeline Now we simply run the pipeline like you would with a native installation, but we wouldn't have needed to install any additional tools! @@ -92,7 +91,7 @@ refgenie seek hg38/bowtie2_index.dir refgenie seek rCRSd/bowtie2_index.dir ``` -Alternatively, if you are *not* using `refgenie`, you can still grab premade `--chrom-sizes` and `--genome-index` files from the refgenie servers. `Refgenie` uses algorithmically derived genome digests under-the-hood to unambiguously define genomes. That's what you'll see being used in the example below when we manually download these assets. Therefore, `2230c535660fb4774114bfa966a62f823fdb6d21acf138d4` is the digest for the human readable alias, "hg38", and `94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4` is the digest for "rCRSd." +Alternatively, if you are *not* using `refgenie`, you can still grab premade `--chrom-sizes` and `--genome-index` files from the `refgenie` servers. `Refgenie` uses algorithmically derived genome digests under-the-hood to unambiguously define genomes. That's what you'll see being used in the example below when we manually download these assets. Therefore, `2230c535660fb4774114bfa966a62f823fdb6d21acf138d4` is the digest for the human readable alias, "hg38", and `94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4` is the digest for "rCRSd." 
```console wget -O hg38.fasta.tgz http://rg.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta?tag=default wget -O hg38.bowtie2_index.tgz http://rg.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/bowtie2_index?tag=default @@ -102,7 +101,7 @@ wget -O rCRSd.bowtie2_index.tgz http://refgenomes.databio.org/v3/assets/archive From the `pepatac/` repository folder (using the manually downloaded genome assets): ```console pipelines/pepatac.py --single-or-paired paired \ - --prealignments rCRSd=default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4 \ + --prealignment-index rCRSd=default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4 \ --genome hg38 \ --genome-index default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 \ --chrom-sizes default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes \ @@ -113,11 +112,11 @@ pipelines/pepatac.py --single-or-paired paired \ -O $HOME/pepatac_test ``` -With a single core, this will take 20-30 minutes to complete. +With a single processor, this will take 20-30 minutes to complete. #### Run the pipeline using looper -Since `bulker` automatically direct any calls to required software to instead be executed in containers, we can just run our project the exact same way we would when we installed everything natively! +Since `bulker` automatically directs any calls to required software to instead be executed in containers, we can just run our project the exact same way we would when we installed everything natively! 
**Run the pipeline with looper and manual asset specifications** ```console @@ -127,4 +126,20 @@ looper run examples/test_project/test_config.yaml **Run the pipeline with looper and refgenie** ```console looper run examples/test_project/test_config_refgenie.yaml -``` \ No newline at end of file +``` + +### 7. Run the project-level pipeline + +`PEPATAC` also includes a project-level processing pipeline to do things like: + + - [Plot alignment statistics](files/examples/gold/summary/gold_alignmentPercent.pdf) for all samples in the project together for easy visualization + - [Plot TSS enrichment scores](files/examples/gold/summary/gold_TSSEnrichment.pdf) for all the samples in the project in a single figure + - [Produce a consensus peak set](consensus_peaks.md) for the project + - [Produce a count table](count_table.md) using the consensus peak set for all the samples in a project + +From the `pepatac/` repository folder (using the manually downloaded genome assets): +```console +looper runp examples/test_project/test_config.yaml +``` + +This should take less than a minute on the test sample and will generate a `summary/` directory containing project-level output in the parent project directory. In this small example, there won't be a consensus peak set or count table because it is only a single sample. To see more, you can [run through the extended tutorial](tutorial.md) to see this in action. diff --git a/docs/run-cluster.md b/docs/run-cluster.md index d3975dc1..76fb98c7 100644 --- a/docs/run-cluster.md +++ b/docs/run-cluster.md @@ -2,15 +2,15 @@ ## Default computing options -When you run your PEPATAC project using `looper run`, by default it will simply run each sample locally. You can change that using `looper run --package COMPUTE_PACKAGE`, where COMPUTE_PACKAGE is an option described below. This enables you to adjust your computing preferences on-the-fly. You have several built-in packages, which you can view by typing `divvy list`.
Default packages include: +When you run your `PEPATAC` project using `looper run`, by default it will simply run each sample locally. You can change that using `looper run --package COMPUTE_PACKAGE`, where `COMPUTE_PACKAGE` is an option described below. This enables you to adjust your computing preferences on-the-fly. You have several built-in packages, which you can view by typing `divvy list`. Default packages include: -- `--package slurm`. Submit the jobs to a SLURM cluster using `sbatch`. -- `--package sge`. Submit the jobs to a SGE cluster using `qsub`. +- `--package slurm`. Submit the jobs to a `SLURM` cluster using `sbatch`. +- `--package sge`. Submit the jobs to an `SGE` cluster using `qsub`. -To show how this works, let's run the example project using the `slurm` compute package. Used `-d` for a dry run to create the submits scripts but not run them: +To show how this works, let's run the example project using the `slurm` compute package. Use `-d` for a dry run to create the submit scripts without running them. +Using the manually downloaded assets (from the `pepatac/` repository): ```console -cd pepatac looper run examples/test_project/test_config.yaml -d \ --package slurm ``` @@ -18,14 +18,26 @@ looper run examples/test_project/test_config.yaml -d \ This will produce a job script: ```console -cat $PROCESSED/pepatac_test/submission/PEPATAC_test1.sub +cat pepatac_test/submission/PEPATAC_test1.sub ``` -If all looks well, run looper without `-d` to actually submit the jobs. Read more to [learn how to run PEPATAC in containers](run-container.md). +If all looks well, run looper without `-d` to actually submit the jobs. Read more to [learn how to run `PEPATAC` in containers](run-container.md).
+ +Using `refgenie` managed assets (from the `pepatac/` repository): +```console +looper run examples/test_project/test_config_refgenie.yaml -d \ + --package slurm +``` + +This will produce a job script: + +```console +cat pepatac_test/submission/PEPATAC_test1.sub +``` ## Customizing compute options -These default computing options may not fit your needs exactly. PEPATAC allows you to very easily change templates or add your own, so you can run PEPATAC in any possible computing environment. PEPATAC uses a standardized computing configuration called [divvy](https://divvy.databio.org). The instructions for changing these computing configuration options are universal for any software that relies on `divvy`. +These default computing options may not fit your needs exactly. `PEPATAC` allows you to very easily change templates or add your own, so you can run `PEPATAC` in any possible computing environment. `PEPATAC` uses a standardized computing configuration called [`divvy`](https://divvy.databio.org). The instructions for changing these computing configuration options are universal for any software that relies on `divvy`. To customize your compute packages, you first create a `divvy` computing configuration file and point an environment variable (`DIVCFG`) to that file: @@ -34,4 +46,4 @@ export DIVCFG="divvy_config.yaml" divvy init $DIVCFG ``` -Next, you edit that config file to add in any compute packages you need. PEPATAC will then give you access to any of your custom packages with `looper --package `. For complete instructions on how to create a custom compute package, read [how to configure divvy](https://divvy.databio.org/en/latest/configuration/). +Next, you edit that config file to add in any compute packages you need. `PEPATAC` will then give you access to any of your custom packages with `looper --package `. For complete instructions on how to create a custom compute package, read [how to configure divvy](https://divvy.databio.org/en/latest/configuration/). 
diff --git a/docs/run-conda.md b/docs/run-conda.md index 115a0218..91a4bc6e 100644 --- a/docs/run-conda.md +++ b/docs/run-conda.md @@ -1,6 +1,6 @@ # Run PEPATAC in a conda environment. -We also enable setup of the pipeline using conda. As with container-based approaches, some native installation is required for complete setup. +We also enable setup of the pipeline using `conda`. As with container-based approaches, some native installation is required for complete setup. ## 1: Clone the `PEPATAC` pipeline @@ -16,7 +16,7 @@ Optionally, `PEPATAC` can report on fastq quality ([FastQC](https://www.bioinfor Be prepared for this initial installation process to take more than an hour to complete. -From the `pepatac/` directory: +From the `pepatac/` repository directory: ```{bash} conda env create -f requirements-conda.yml ``` @@ -52,11 +52,14 @@ install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistrib devtools::install(file.path("PEPATACr/"), dependencies=TRUE, repos="https://cloud.r-project.org/") ``` -## 5: Initialize `refgenie` and download assets +## 5: Get genome assets -PEPATAC uses [`refgenie`](http://refgenie.databio.org/) assets for alignment. If you haven't already, initialize a refgenie config file like this: +### 5a: Initialize `refgenie` and download assets + +`PEPATAC` can utilize [`refgenie`](http://refgenie.databio.org/) assets. Because assets are user-dependent, these files must still be available natively. Therefore, we need to [install and initialize a refgenie config file.](http://refgenie.databio.org/en/latest/install/). 
For example: ```console +pip install refgenie export REFGENIE=/path/to/your_genome_folder/genome_config.yaml refgenie init -c $REFGENIE ``` @@ -70,35 +73,52 @@ refgenie pull hg38/fasta hg38/bowtie2_index hg38/refgene_anno hg38/ensembl_gtf h refgenie build hg38/feat_annotation ``` -PEPATAC also requires a `bowtie2_index` asset for any pre-alignment genomes: +`PEPATAC` also requires a `bowtie2_index` asset for any pre-alignment genomes: ```console refgenie pull rCRSd/bowtie2_index -refgenie pull human_repeats/bowtie2_index ``` +### 5b: Download assets manually + +If you prefer not to use `refgenie`, you can also download and construct assets manually. The minimum required assets for a genome includes: + - a chromosome sizes file: a text file containing "chr" and "size" columns. + - a [`bowtie2` genome index](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-build-indexer). + +Optional assets include: + - a TSS annotation file: a BED file containing "chr", "start", "end", "gene name", "score", and "strand" columns. + - a region blacklist: e.g. [the ENCODE blacklist](https://github.com/Boyle-Lab/Blacklist) + - a [genomic feature annotation file](annotation.md) + ## 6: Use `looper` to run the sample processing pipeline -Start by running the example project (`test_config.yaml`) in the `examples/test_project/` folder. `PEPATAC` uses a project management tool called `looper` to run the sample-level pipeline across each sample in a project. Let's use the `-d` argument to first try a dry run, which will create job scripts for every sample in a project, but will not execute them: +Start by running the example project (`test_config.yaml`) in the `examples/test_project/` folder. `PEPATAC` can utilize a project management tool called `looper` to run the sample-level pipeline across each sample in a project. 
Let's use the `-d` argument to first try a dry run, which will create job scripts for every sample in a project, but will not execute them: -From the `pepatac/` folder: +If you are using `refgenie`, you can grab the path to the `--chrom-sizes` and `--genome-index` files as follows: ```console -looper run -d examples/test_project/test_config.yaml +refgenie seek hg38/fasta.chrom_sizes +refgenie seek hg38/bowtie2_index.dir +refgenie seek rCRSd/bowtie2_index.dir ``` -If that looked good, let's actually run the example by taking out the `-d` flag: +Alternatively, if you are *not* using `refgenie`, you can still grab premade `--chrom-sizes` and `--genome-index` files from the refgenie servers. `Refgenie` uses algorithmically derived genome digests under-the-hood to unambiguously define genomes. That's what you'll see being used in the example below when we manually download these assets. Therefore, `2230c535660fb4774114bfa966a62f823fdb6d21acf138d4` is the digest for the human readable alias, "hg38", and `94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4` is the digest for "rCRSd." 
```console -looper run examples/test_project/test_config.yaml +wget -O hg38.fasta.tgz http://rg.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta?tag=default +wget -O hg38.bowtie2_index.tgz http://rg.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/bowtie2_index?tag=default +wget -O rCRSd.bowtie2_index.tgz http://refgenomes.databio.org/v3/assets/archive/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index?tag=default ``` -Or, if you're using [`bulker`](https://bulker.databio.org/en/latest/) to run the pipeline in containers: +From the `pepatac/` repository folder (using the manually downloaded genome assets): +```console +looper run -d examples/test_project/test_config.yaml +``` +If that looked good, let's actually run the example by taking out the `-d` flag: ```console -bulker activate databio/pepatac looper run examples/test_project/test_config.yaml ``` -There are lots of other cool things you can do with looper, like dry runs, report results, check on pipeline run status, clean intermediate files to save disk space, lump multiple samples into one job, and more. For details, consult the [looper docs](http://looper.databio.org/). +There are lots of other cool things you can do with `looper`, like dry runs, report results, check on pipeline run status, clean intermediate files to save disk space, lump multiple samples into one job, and more. For details, consult the [looper docs](http://looper.databio.org/). 
## 7: Use `looper` to run the project level pipeline @@ -109,7 +129,10 @@ There are lots of other cool things you can do with looper, like dry runs, repor - [Produce a consensus peak set](consensus_peaks.md) for the project - [Produce a count table](count_table.md) using the consensus peak set for all the samples in a project -`looper runp examples/test_project/test_config.yaml` +From the `pepatac/` repository folder (using the manually downloaded genome assets): +```console +looper runp examples/test_project/test_config.yaml +``` -This should take < a minute on the test sample and will generate a summary/ directory containing project level output in the parent project directory. In this small example, there won't be a consensus peak set or count table because it is only a single sample. To see more, you can [run through the extended tutorial](tutorial.md) to see this in action. +This should take less than a minute on the test sample and will generate a `summary/` directory containing project-level output in the parent project directory. In this small example, there won't be a consensus peak set or count table because it is only a single sample. To see more, you can [run through the extended tutorial](tutorial.md) to see this in action. diff --git a/docs/run-container.md b/docs/run-container.md index 157e9176..2761e31f 100644 --- a/docs/run-container.md +++ b/docs/run-container.md @@ -10,9 +10,13 @@ A popular approach is installing all dependencies in a container and just use th git clone https://github.com/databio/pepatac.git ``` -### 2: Initialize `refgenie` and download assets +### 2: Get genome assets -`PEPATAC` uses [`refgenie`](http://refgenie.databio.org/) assets for alignment. Because assets are user-dependent, these files must still exist outside of a container system. We need to [install and initialize a refgenie config file.](http://refgenie.databio.org/en/latest/install/).
For example: +We [recommend `refgenie` to manage all required and optional genome assets](run-bulker.md#2a-initialize-refgenie-and-download-assets). However, [`PEPATAC` can also accept file paths to any of the assets](run-bulker.md#2b-download-assets-manually). + +#### 2a: Initialize `refgenie` and download assets + +`PEPATAC` can use [`refgenie`](http://refgenie.databio.org/) assets for alignment and annotation. Because assets are user-dependent, these files must still exist outside of a container system. We need to [install and initialize a refgenie config file](http://refgenie.databio.org/en/latest/install/). For example: ```console pip install refgenie @@ -33,26 +37,41 @@ refgenie build hg38/feat_annotation ```console refgenie pull rCRSd/bowtie2_index -refgenie pull human_repeats/bowtie2_index +``` + +#### 2b: Download assets manually + +If you prefer not to use `refgenie`, you can also download and construct assets manually. Again, because these are user-defined assets, they must exist outside of any container system. The minimum required assets for a genome include: + - a chromosome sizes file: a text file containing "chr" and "size" columns. + - a [`bowtie2` genome index](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-build-indexer). + +Optional assets include: + - a TSS annotation file: a BED file containing "chr", "start", "end", "gene name", "score", and "strand" columns. + - a region blacklist: e.g. [the ENCODE blacklist](https://github.com/Boyle-Lab/Blacklist) + - a [genomic feature annotation file](annotation.md) + +You can obtain the minimally required pre-constructed `--chrom-sizes` and `--genome-index` files from the `refgenie` servers. `Refgenie` uses algorithmically derived genome digests under-the-hood to unambiguously define genomes. That's what you'll see being used in the example below when we manually download these assets.
Therefore, `2230c535660fb4774114bfa966a62f823fdb6d21acf138d4` is the digest for the human readable alias, "hg38", and `94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4` is the digest for "rCRSd." +```console +wget -O hg38.fasta.tgz http://rg.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta?tag=default +wget -O hg38.bowtie2_index.tgz http://rg.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/bowtie2_index?tag=default +wget -O rCRSd.bowtie2_index.tgz http://refgenomes.databio.org/v3/assets/archive/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index?tag=default ``` ### 3. Pull the container image. -**Docker**: You can pull the docker [databio/pepatac image](https://hub.docker.com/r/databio/pepatac/) from dockerhub like this: +**Docker**: You can pull the docker [databio/pepatac image](https://hub.docker.com/r/databio/pepatac/) from `dockerhub` like this: ```console docker pull databio/pepatac ``` -Or build the image using the included Dockerfile (you can use a recipe in the included Makefile): +Or build the image using the included `Dockerfile` (you can use a recipe in the included `Makefile` in the `pepatac/` repository): ```console -cd pepatac/ make docker ``` -**Singularity**: You can [download the singularity image](http://big.databio.org/simages/pepatac) or build it from the docker image using the Makefile: +**Singularity**: You can [download the `singularity` image](http://big.databio.org/simages/pepatac) or build it from the docker image using the `Makefile`: ```console -cd pepatac/ make singularity ``` @@ -62,7 +81,7 @@ Now you'll need to tell the pipeline where you saved the singularity image. You Individual jobs can be run in a container by simply running the `pepatac.py` command through `docker run` or `singularity exec`. You can run containers either on your local computer, or in an HPC environment, as long as you have `docker` or `singularity` installed. 
You will need to include any volumes that contain data required by the pipeline. For example, to utilize `refgenie` assets you'll need to ensure the volume containing those files is available. In the following example, we are including an environment variable (`$GENOMES`) which points to such a directory. -For example, run it locally in singularity like this: +For example, run it locally in `singularity` like this: ```console singularity exec --bind $GENOMES $SIMAGES/pepatac pipelines/pepatac.py --help ``` @@ -71,24 +90,25 @@ With `docker`, you can use: ```console docker run --rm -it databio/pepatac pipelines/pepatac.py --help ``` -Be sure to mount the volumes you need with `--volume`. If you're utilizing any environment variables (e.g. `$GENOMES`), don't forget to include those in your docker command with the `-e` option. +Be sure to mount the volumes you need with `--volume`. If you're utilizing any environment variables (e.g. `$REFGENIE`), don't forget to include those in your docker command with the `-e` option. ### Container details #### Using `docker` -The pipeline has been successfully run in both a Linux and MacOS environment. With `docker` you need to bind mount your volume that contains the pipeline and your `refgenie` assets locations, as well as provide the container the same environment variables your host environment is using. +The pipeline has been successfully run in both a `Linux` and `MacOS` environment. With `docker` you need to bind mount your volume that contains the pipeline and your genome assets locations, as well as provide the container the same environment variables your host environment is using. -In the first example, we're mounting our home user directory (`/home/jps3ag/`) which contains the parent directories to our `refgenie` assets (`$GENOMES`) and to the pipeline itself. We'll also provide the pipeline two environment variables, `$GENOMES` and `$HOME`. 
+In the first example, we're mounting our home user directory (`/home/jps3ag/`) which contains the parent directories to our genome assets and to the pipeline itself. We'll also provide the pipeline environment variables, such as `$HOME`. -Here's that example command in a Linux environment to run the test example through the pipeline: +Here's that example command in a Linux environment to run the test example through the pipeline (using the manually downloaded genome assets): ```console docker run --rm -it --volume /home/jps3ag/:/home/jps3ag/ \ - -e GENOMES='/home/jps3ag/genomes/' \ -e HOME='/home/jps3ag/' \ databio/pepatac \ /home/jps3ag/src/pepatac/pipelines/pepatac.py --single-or-paired paired \ - --prealignments rCRSd human_repeats \ + --prealignment-index rCRSd=default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4 \ --genome hg38 \ + --genome-index /home/jps3ag/src/pepatac/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 \ + --chrom-sizes /home/jps3ag/src/pepatac/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes \ --sample-name test1 \ --input /home/jps3ag/src/pepatac/examples/data/test1_r1.fastq.gz \ --input2 /home/jps3ag/src/pepatac/examples/data/test1_r2.fastq.gz \ @@ -96,7 +116,7 @@ docker run --rm -it --volume /home/jps3ag/:/home/jps3ag/ \ -O $HOME/pepatac_test ``` -In this second example, we'll perform the same command in a Mac environment using [Docker for Mac](https://docs.docker.com/v17.12/docker-for-mac/install/). +In this second example, we'll perform the same command in a `MacOS` environment using [`Docker` for `Mac`](https://docs.docker.com/v17.12/docker-for-mac/install/). 
This necessitates a few minor changes to run that same example: @@ -107,12 +127,13 @@ Remember to [allocate sufficient memory](https://docs.docker.com/docker-for-mac/ ```console docker run --rm -it --volume /Users/jps3ag/:/Users/jps3ag/ \ - -e GENOMES="/Users/jps3ag/genomes" \ -e HOME="/Users/jps3ag/" \ databio/pepatac \ /Users/jps3ag/src/pepatac/pipelines/pepatac.py --single-or-paired paired \ - --prealignments rCRSd human_repeats \ + --prealignment-index rCRSd=default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4 \ --genome hg38 \ + --genome-index /Users/jps3ag/src/pepatac/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 \ + --chrom-sizes /Users/jps3ag/src/pepatac/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes \ --sample-name test1 \ --input /Users/jps3ag/src/pepatac/examples/data/test1_r1.fastq.gz \ --input2 /Users/jps3ag/src/pepatac/examples/data/test1_r2.fastq.gz \ @@ -122,18 +143,20 @@ docker run --rm -it --volume /Users/jps3ag/:/Users/jps3ag/ \ #### Using `singularity` -First, build a singularity container from the docker image and create a running instance (be sure to mount your directories containing your `$GENOMES` folder and pipeline. -``` +First, build a singularity container from the docker image and create a running instance: +```console singularity build pepatac docker://databio/pepatac:latest singularity instance start -B /home/jps3ag/:/home/jps3aq/ pepatac pepatac_instance ``` Second, run your command. 
-``` +```console singularity exec instance://pepatac_instance \ - /home/jps3ag/src/pepatac/pipelines/pepatac.py --single-or-paired single \ - --prealignments rCRSd human_repeats \ + /home/jps3ag/src/pepatac/pipelines/pepatac.py --single-or-paired paired \ + --prealignment-index rCRSd=/home/jps3ag/src/pepatac/default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4 \ --genome hg38 \ + --genome-index /home/jps3ag/src/pepatac/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 \ + --chrom-sizes /home/jps3ag/src/pepatac/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes \ --sample-name test1 \ --input /home/jps3ag/src/pepatac/examples/data/test1_r1.fastq.gz \ --input2 /home/jps3ag/src/pepatac/examples/data/test1_r2.fastq.gz \ diff --git a/docs/run-directly.md b/docs/run-directly.md index 03e1fbfe..ec8fc49e 100644 --- a/docs/run-directly.md +++ b/docs/run-directly.md @@ -1,11 +1,14 @@ # Run the pipeline script directly -The pipeline at its core is just a python script, and you can run it on the command line for a single sample (see [command-line usage](usage.md)), which you can also get on the command line by running `pipelines/pepatac.py --help`. You just need to pass a few command-line parameters to specify sample name, reference genome, input files, etc. Here's the basic command to run a small test example through the pipeline: +The pipeline at its core is just a python script, and you can run it on the command line for a single sample (see [command-line usage](usage.md)), which you can also get on the command line by running `pipelines/pepatac.py --help`. You just need to pass a few command-line parameters to specify sample name, reference genome, input files, etc.
Here's the [basic command to run a small test example through the pipeline](run-bulker.md#run-the-pipeline-at-the-command-line): +From the `pepatac/` repository folder (using the manually downloaded genome assets): ```console pipelines/pepatac.py --single-or-paired paired \ - --prealignments rCRSd human_repeats \ + --prealignment-index rCRSd=default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4 \ --genome hg38 \ + --genome-index default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 \ + --chrom-sizes default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes \ --sample-name test1 \ --input examples/data/test1_r1.fastq.gz \ --input2 examples/data/test1_r2.fastq.gz \ @@ -13,4 +16,4 @@ pipelines/pepatac.py --single-or-paired paired \ -O $HOME/pepatac_test ``` -This example should take about 15 minutes to complete. +This example should take about 20-30 minutes to complete. diff --git a/docs/tutorial.md b/docs/tutorial.md index ec2aec6b..a2ac46d6 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -1,6 +1,10 @@ # PEPATAC pipeline step-by-step guide -Welcome to the `PEPATAC` extended tutorial! Use this to learn the ropes. We'll use two provided ATAC-seq datasets and run through the step-by-step process of setting up the configuration files, running the pipeline, and looking over the results together. To use this tutorial, you should have a basic familiarity with [working in a command line driven environment](http://matt.might.net/articles/basic-unix/). You also need to have already installed `PEPATAC` prerequisites, which you can do following the [install instructions](install.md). +Welcome to the `PEPATAC` extended tutorial! Use this to learn the ropes. We'll use two provided ATAC-seq datasets and run through the step-by-step process of setting up the configuration files, running the pipeline, and looking over the results together.
To use this tutorial, you should have a basic familiarity with [working in a command line driven environment](http://matt.might.net/articles/basic-unix/). You also need to have already installed `PEPATAC` prerequisites, which you can do following the various [install instructions](install.md). This tutorial will follow two approaches to running the pipeline: +1. [Using `refgenie` managed genome assets, which we recommend](tutorial.md#tutorial-using-refgenie-managed-genome-assets) OR +2. [Using manually curated genome assets which do *not* require the installation and setup of `refgenie`](tutorial.md#tutorial-using-manually-downloaded-and-curated-genome-assets). + +# Tutorial using `refgenie` managed genome assets ## 1: Set up folders @@ -14,6 +18,212 @@ Let's point an environment variable to our tutorial location (change to match yo export TUTORIAL=/path/to/your/pepatac_tutorial ``` +Let's move into our newly created directory and create a few more folders that we'll use later. +```console +cd pepatac_tutorial/ +mkdir data +mkdir genomes +mkdir processed +mkdir templates +mkdir tools +cd tools/ +``` + +Time to get PEPATAC! +```console +git clone https://github.com/databio/pepatac.git +``` +Success! If you had any issues, feel free to [reach out to us with questions](contact.md). Otherwise, let's move on to installing additional software. + +## 2: Initialize `refgenie` and download assets + +As described in the various installation guides, `PEPATAC` can utilize [`refgenie`](http://refgenie.databio.org/) assets. Because assets are user-dependent, these files must always exist outside of any container system or alongside a native installation. Therefore, we still need to [install and initialize a refgenie config file.](http://refgenie.databio.org/en/latest/install/). 
For example: + +```console +pip install refgenie +export REFGENIE=/path/to/your/pepatac_tutorial/refgenie_config.yaml +refgenie init -c $REFGENIE +``` + +Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists. + +Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). + +```console +refgenie pull hg38/fasta hg38/bowtie2_index hg38/refgene_anno hg38/ensembl_gtf hg38/ensembl_rb +refgenie build hg38/feat_annotation +``` + +`PEPATAC` also requires a `bowtie2_index` asset for any pre-alignment genomes: + +```console +refgenie pull rCRSd/bowtie2_index +``` + +## 3: Download tutorial read files + +We're going to work with some files a little larger than the test data included in the pipeline so we can see all the features included in a full run of the pipeline. Go ahead and download the [tutorial fastq files](http://big.databio.org/pepatac/). +```console +wget http://big.databio.org/pepatac/tutorial1_r1.fastq.gz +wget http://big.databio.org/pepatac/tutorial1_r2.fastq.gz +wget http://big.databio.org/pepatac/tutorial2_r1.fastq.gz +wget http://big.databio.org/pepatac/tutorial2_r2.fastq.gz +``` + +To simplify the rest of this tutorial, let's put those files in a standard location we'll use for the rest of this guide. +```console +mv tutorial1_r1.fastq.gz pepatac/examples/data/ +mv tutorial1_r2.fastq.gz pepatac/examples/data/ +mv tutorial2_r1.fastq.gz pepatac/examples/data/ +mv tutorial2_r2.fastq.gz pepatac/examples/data/ +``` + +## 4: Configure project files + +We're going to use `looper` to analyze our data. For that, we need to pass looper a configuration file. This project config file describes your project. See [`looper` docs](https://looper.readthedocs.io/en/latest/) for details. 
A [configuration file has been provided for you in the pipeline repository itself, named `tutorial_refgenie.yaml`](https://github.com/databio/pepatac/blob/master/examples/tutorial/tutorial_refgenie.yaml). This configuration file also points to our sample. In this case, we've provided a sample for you with the pipeline. You don't have to do anything else at this point and may [skip right to running the sample if you'd like](tutorial.md#5-using-looper-to-run-the-sample-processing-pipeline). Otherwise, we'll briefly touch on what those configuration files look like. +You can open the configuration file in your favorite text editor if you'd like to look closer. For the purposes of the tutorial you may safely move past this step should you choose. +```console +nano tutorial_refgenie.yaml +``` +The following is what you should see in that configuration file. +```console +name: PEPATAC_tutorial + +pep_version: 2.0.0 +sample_table: tutorial.csv + +looper: + output_dir: "${TUTORIAL}/processed/" + pipeline_interfaces: ["${TUTORIAL}/tools/pepatac/project_pipeline_interface.yaml"] + +sample_modifiers: + append: + pipeline_interfaces: ["${TUTORIAL}/tools/pepatac/sample_pipeline_interface.yaml"] + derive: + attributes: [read1, read2] + sources: + # Obtain tutorial data from http://big.databio.org/pepatac/ then set + # path to your local saved files + R1: "${TUTORIAL}/tools/pepatac/examples/data/{sample_name}_r1.fastq.gz" + R2: "${TUTORIAL}/tools/pepatac/examples/data/{sample_name}_r2.fastq.gz" + imply: + - if: + organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] + then: + genome: hg38 + prealignment_names: ["rCRSd"] + deduplicator: samblaster # Default. [options: picard] + trimmer: skewer # Default. [options: pyadapt, trimmomatic] + peak_type: fixed # Default. [options: variable] + extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. + frip_ref_peaks: None # Default.
Use an external reference set of peaks instead of the peaks called from this run +``` +There is also a sample annotation file referenced in our configuration file. The sample annotation file contains metadata and other information about our sample. Just like before, [this file, named `tutorial.csv` has been provided](https://github.com/databio/pepatac/blob/master/examples/tutorial/tutorial.csv). You may check it out if you wish, otherwise we're all set. +If you open `tutorial.csv`, you should see the following: +```console +sample_name,protocol,organism,read1,read2,read_type +tutorial1,ATAC,human,R1,R2,paired +tutorial2,ATAC,human,R1,R2,paired +``` +That's it! Let's analyze that sample! + +## 5: Using `looper` to run the sample processing pipeline +Looper requires a few variables and configuration files to work for the specific user. Let's get those set up now. `Looper` uses [`divvy`](http://code.databio.org/divvy) to manage computing resource configuration so that projects and pipelines can easily travel among environments. For more detailed information, [check out the `looper` docs](https://looper.readthedocs.io/en/latest/cluster-computing/). Let's set it up. + +We should still be in the `tools/` subdirectory, but let's move up one level. +```console +cd ../ +touch compute_config.yaml +``` +Open that file in your favorite text editor. We'll add in the following example for running locally. You'll need to edit this file further for your own setup and you can [learn more about that in the `looper` docs](https://looper.readthedocs.io/en/latest/index.html). 
+```console +nano compute_config.yaml +``` +Paste the following into compute_config.yaml +```console +adapters: + CODE: looper.command + JOBNAME: looper.job_name + CORES: compute.cores + LOGFILE: looper.log_file + TIME: compute.time + MEM: compute.mem + +compute_packages: + default: + submission_template: templates/localhost_template.sub + submission_command: sh +``` +Now, let's close and save that file and create an environment variable pointing to our configuration file. +```console +export DIVCFG="/path/to/pepatac_tutorial/compute_config.yaml" +``` +(Remember to add `DIVCFG` to your `.bashrc` or `.profile` to ensure it persists). +The `Looper` environment configuration file points to submission template(s) in order to know how to run samples locally or using cluster resources. If you'd like to learn more, check out the [`DIVCFG` configuration file and submission templates](http://code.databio.org/divvy). We're going to simply set up a local template for the purposes of this tutorial. You can also easily create [templates for cluster or container use as well](https://github.com/pepkit/divcfg/tree/master/templates)! +Let's change to our `templates/` directory to make our first submission template. +```console +cd templates/ +nano localhost_template.sub +``` +Paste the following into the localhost_template.sub: +```console +#!/bin/bash + +echo 'Compute node:' `hostname` +echo 'Start time:' `date +'%Y-%m-%d %T'` + +{ +{CODE} +} | tee {LOGFILE} --ignore-interrupts +``` + +Save and close that file, and change into the `tools/pepatac/` folder. +```console +cd ../tools/pepatac/ +``` +Now, we'll use `looper` to run the sample locally. +```console +looper run examples/tutorial/tutorial_refgenie.yaml +``` +Congratulations! Your first samples should be running through the pipeline now. Running both samples locally should take 30-50 minutes in total depending on your system. + +After the pipeline is finished, we can look through the output directory together.
We've provided an example breakdown of just such a directory in the [browse output page](browse_output.md). + +## 6: Use `looper` to run the project level pipeline +The pipeline also includes project level analyses that work on all samples concurrently. This allows for analyses that require output produced by individual sample analysis. We'll run the project analysis much like we run the sample analysis: +```console +looper runp examples/tutorial/tutorial_refgenie.yaml +``` +This should take about a minute on the tutorial samples and will generate a `summary/` directory containing project level output in the parent project directory. You can [browse the tutorial data](browse_output.md) to see the example output. + +## 7: Generate an `HTML` report using `looper` + +Let's take full advantage of `looper` and generate a pipeline `HTML` report that makes all our results easy to view and browse. If you'd like to skip right to the results and see what it looks like, [check out the tutorial results](files/examples/tutorial/PEPATAC_tutorial_summary.html). Otherwise, let's generate a report ourselves. + +Using the same configuration file we used to run the samples through the pipeline, we'll now employ the `report` function of `looper`. +```console +looper report examples/tutorial/tutorial_refgenie.yaml +``` +That's it! Easy, right? `Looper` conveniently provides you with the location where the HTML report is produced. You may either open the report with your preferred internet browser using the PATH returned with `looper report`, or we can change directories to the report's location and open it there. Let's go ahead and change into the directory that contains the report. +```console +cd $TUTORIAL/processed/ +firefox PEPATAC_tutorial_summary.html +``` +The `HTML` report contains a summary page that integrates the project level summary table and any project level objects including: raw aligned reads, percent aligned reads, TSS enrichment scores, and library complexity plots.
The status page lists all the samples in this project along with their current status, a link to their log files, the time it took to run the sample and the peak memory used during the run. The objects page provides links to separate pages for each object type. On each object page, all the individual samples' objects are provided. Similarly, the samples page contains links to individual pages for each sample. The sample pages list the individual summary statistics for that sample as well as links to log files, command logs, and summary files. The sample pages also provide links and thumbnails for any individual objects generated for that sample. Of course, all of these files are present in the sample directory, but the report provides easy access to them all. + +# Tutorial using manually downloaded and curated genome assets + +## 1: Set up folders + +From an open terminal, let's first create a directory we'll use to run through this guide: +```console +mkdir pepatac_tutorial +``` + +Let's point an environment variable to our tutorial location (change to match your local path) to link our tutorial samples with your local environment. +```console +export TUTORIAL=/path/to/your/pepatac_tutorial +``` Let's move into our newly created directory and create a few more folders that we'll use later. ```console @@ -51,7 +261,12 @@ mv tutorial2_r1.fastq.gz pepatac/examples/data/ mv tutorial2_r2.fastq.gz pepatac/examples/data/ ``` -## 3: Configure project files +### 2: Get genome assets + +We [recommend `refgenie` to manage all required and optional genome assets](run-bulker.md#2a-initialize-refgenie-and-download-assets). However, [`PEPATAC` can also accept file paths to any of the assets](run-bulker.md#2b-download-assets). + + +## 4: Configure project files We're going to use `looper` to analyze our data. For that, we need to pass looper a configuration file. This project config file describes your project. 
See [`looper` docs](https://looper.readthedocs.io/en/latest/) for details. A [configuration file has been provided for you in the pipeline repository itself, conveniently named `tutorial.yaml`](https://github.com/databio/pepatac/blob/master/examples/tutorial/tutorial.yaml). This configuration file also points to our sample. In this case, we've provided a sample for you with the pipeline. You don't have to do anything else at this point and may [skip right to running the sample if you'd like](tutorial.md#3-using-looper-to-run-the-pipeline). Otherwise, we'll briefly touch on what those configuration files look like. You can open the configuration file in your favorite text editor if you'd like to look closer. For the purposes of the tutorial you may safely move past this step should you choose. @@ -84,8 +299,9 @@ sample_modifiers: organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] then: genome: hg38 - macs_genome_size: hs - prealignments: rCRSd human_repeats + genome_index: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 + chrom_sizes: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes + prealignment_index: ["rCRSd=default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4"] deduplicator: samblaster # Default. [options: picard] trimmer: skewer # Default. [options: pyadapt, trimmomatic] peak_type: fixed # Default. [options: variable] @@ -102,7 +318,7 @@ tutorial2,ATAC,human,R1,R2,paired That's it! Let's analyze that sample! -## 4: Using `looper` to run the sample processing pipeline +## 5: Using `looper` to run the sample processing pipeline Looper requires a few variables and configuration files to work for the specific user. Let's get those set up now. `Looper` uses [`divvy`](http://code.databio.org/divvy) to manage computing resource configuration so that projects and pipelines can easily travel among environments. 
For more detailed information, [check out the `looper` docs](https://looper.readthedocs.io/en/latest/cluster-computing/). Let's set it up. We should still be in the `tools/` subdirectory, but let's move up one level. @@ -164,14 +380,14 @@ Congratulations! Your first samples should be running through the pipeline now. After the pipeline is finished, we can look through the output directory together. We've provided an example breakdown of just such a directory in the [browse output page](browse_output.md). -## 5: Use `looper` to run the project level pipeline +## 6: Use `looper` to run the project level pipeline The pipeline also includes project level analyses that work on all samples concurrently. This allows for analyses that require output produced by individual sample analysis. We'll run the project analysis much like we run the sample analysis: ```console looper runp examples/tutorial/tutorial.yaml ``` This should take about a minute on the tutorial samples and will generate a `summary/` directory containing project level output in the parent project directory. You can [browse the tutorial data](browse_output.md) to see the example output. -## 6: Generate an `HTML` report using `looper` +## 7: Generate an `HTML` report using `looper` Let's take full advantage of `looper` and generate a pipeline `HTML` report that makes all our results easy to view and browse. If you'd like to skip right to the results and see what it looks like, [check out the tutorial results](files/examples/tutorial/PEPATAC_tutorial_summary.html). Otherwise, let's generate a report ourselves. 
diff --git a/examples/tutorial/tutorial.yaml b/examples/tutorial/tutorial.yaml index 0289c607..85b1bd1b 100644 --- a/examples/tutorial/tutorial.yaml +++ b/examples/tutorial/tutorial.yaml @@ -22,8 +22,9 @@ sample_modifiers: organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] then: genome: hg38 - macs_genome_size: hs - prealignments: rCRSd human_repeats + genome_index: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 + chrom_sizes: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes + prealignment_index: ["rCRSd=default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4"] deduplicator: samblaster # Default. [options: picard] trimmer: skewer # Default. [options: pyadapt, trimmomatic] peak_type: fixed # Default. [options: variable] diff --git a/examples/tutorial/tutorial_refgenie.yaml b/examples/tutorial/tutorial_refgenie.yaml new file mode 100644 index 00000000..40f7e585 --- /dev/null +++ b/examples/tutorial/tutorial_refgenie.yaml @@ -0,0 +1,30 @@ +name: PEPATAC_tutorial + +pep_version: 2.0.0 +sample_table: tutorial.csv + +looper: + output_dir: "${TUTORIAL}/processed/" + pipeline_interfaces: ["${TUTORIAL}/tools/pepatac/project_pipeline_interface.yaml"] + +sample_modifiers: + append: + pipeline_interfaces: ["${TUTORIAL}/tools/pepatac/sample_pipeline_interface.yaml"] + derive: + attributes: [read1, read2] + sources: + # Obtain tutorial data from http://big.databio.org/pepatac/ then set + # path to your local saved files + R1: "${TUTORIAL}/tools/pepatac/examples/data/{sample_name}_r1.fastq.gz" + R2: "${TUTORIAL}/tools/pepatac/examples/data/{sample_name}_r2.fastq.gz" + imply: + - if: + organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] + then: + genome: hg38 + prealignment_names: ["rCRSd"] + deduplicator: samblaster # Default. [options: picard] + trimmer: skewer # Default. [options: pyadapt, trimmomatic] + peak_type: fixed # Default. [options: variable] + extend: "250" # Default. 
For fixed-width peaks, extend this distance up- and down-stream. + frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run From e5f8257683d4d7d61f7c3233e91fe6450293ef9c Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 12 Jul 2021 09:30:25 -0400 Subject: [PATCH 48/66] update gold example for with or without refgenie --- examples/gold_atac/metadata/gold_config.yaml | 7 +- .../metadata/gold_config_refgenie.yaml | 136 ++++++++++++++++++ 2 files changed, 141 insertions(+), 2 deletions(-) create mode 100644 examples/gold_atac/metadata/gold_config_refgenie.yaml diff --git a/examples/gold_atac/metadata/gold_config.yaml b/examples/gold_atac/metadata/gold_config.yaml index b344b34e..c5c505aa 100644 --- a/examples/gold_atac/metadata/gold_config.yaml +++ b/examples/gold_atac/metadata/gold_config.yaml @@ -1,3 +1,4 @@ +# From the `pepatac/` directory: `looper run examples/gold_atac/metadata/gold_config.yaml` name: gold_atac pep_version: 2.0.0 @@ -21,8 +22,10 @@ sample_modifiers: organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] then: genome: hg38 - genome_size: hs - prealignments: ["rCRSd"] + # Manually download genome assets from refgenie servers. 
See http://pepatac.databio.org/en/latest/assets/ + genome_index: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 + chrom_sizes: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes + prealignment_index: ["rCRSd=default/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4"] project_modifiers: amend: diff --git a/examples/gold_atac/metadata/gold_config_refgenie.yaml b/examples/gold_atac/metadata/gold_config_refgenie.yaml new file mode 100644 index 00000000..f269d0e5 --- /dev/null +++ b/examples/gold_atac/metadata/gold_config_refgenie.yaml @@ -0,0 +1,136 @@ +name: gold_atac + +pep_version: 2.0.0 +sample_table: gold_sample_table.csv + +looper: + output_dir: "$PROCESSED/pepatac/gold_atac/default" + pipeline_interfaces: "$CODE/pepatac/project_pipeline_interface.yaml" + +sample_modifiers: + append: + pipeline_interfaces: "$CODE/pepatac/sample_pipeline_interface.yaml" + derive: + attributes: [read1, read2] + sources: + SRA: "/scratch/jps3dp/tools/databio/pepatac/examples/gold_atac/metadata/{SRR}.bam" + SRA_1: "$SRAFQ/{SRR}_1.fastq.gz" + SRA_2: "$SRAFQ/{SRR}_2.fastq.gz" + imply: + - if: + organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] + then: + genome: hg38 + prealignments: ["rCRSd"] + +project_modifiers: + amend: + genrich: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/peak_caller/genrich" + sample_modifiers: + append: + peak_caller: genrich + peak_type: variable + hmmratac: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/peak_caller/hmmratac" + sample_modifiers: + append: + peak_caller: hmmratac + peak_type: variable + homer: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/peak_caller/homer" + sample_modifiers: + append: + peak_caller: homer + peak_type: variable + fseq: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/peak_caller/fseq" + sample_modifiers: + append: + peak_caller: fseq + peak_type: variable + fseq2: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/peak_caller/fseq2" + sample_modifiers: + 
append: + peak_caller: fseq2 + peak_type: variable + variable: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/peak_caller/macs2/variable" + sample_modifiers: + append: + peak_caller: macs2 + peak_type: variable + bwa: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/aligner/bwa" + sample_modifiers: + append: + aligner: bwa + picard: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/dedup/picard" + sample_modifiers: + append: + deduplicator: picard + samtools: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/dedup/samtools" + sample_modifiers: + append: + deduplicator: samtools + motif: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/motif" + sample_modifiers: + append: + motif: True + noscale: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/no-scale" + sample_modifiers: + append: + no-scale: True + prioritize: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/prioritize" + sample_modifiers: + append: + prioritize: True + sob: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/sob/default" + sample_modifiers: + append: + sob: True + sob_noscale: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/sob/no-scale" + sample_modifiers: + append: + sob: True + no-scale: True + skipqc: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/skipqc" + sample_modifiers: + append: + skipqc: True + nofifo: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/noFIFO" + sample_modifiers: + append: + noFIFO: True + lite: + looper: + output_dir: "$PROCESSED/pepatac/gold_atac/lite" + sample_modifiers: + append: + lite: True From 6897f39f264370b3a33f8ddd1762b3ead6528d5a Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 12 Jul 2021 09:30:41 -0400 Subject: [PATCH 49/66] expand assets description and how to obtain --- docs/assets.md | 91 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 80 insertions(+), 11 deletions(-) diff --git a/docs/assets.md b/docs/assets.md index a7e2345a..c155d623 100644 --- a/docs/assets.md +++ 
b/docs/assets.md @@ -1,23 +1,92 @@ # Genome assets -`PEPATAC` relies on `refgenie` managed assets to streamline sample processing, where once these assets are built by refgenie there is no need to specify them in calls to `PEPATAC`. These managed assets include the following: -- [`bowtie2_index`](http://refgenie.databio.org/en/latest/available_assets/#bowtie2_index) -- [`blacklist`](http://refgenie.databio.org/en/latest/available_assets/#blacklist) -- `chrom_sizes` - built by default when you build/pull the `fasta` asset -- [`fasta`](http://refgenie.databio.org/en/latest/available_assets/#fasta) -- [`feat_annotation`](http://refgenie.databio.org/en/latest/available_assets/#feat_annotation) -- [`refgene_anno`](http://refgenie.databio.org/en/latest/available_assets/#refgene_anno) - builds the TSS annotation file concurrently +`PEPATAC` can use either manually constructed or `refgenie` managed assets. `Refgenie` streamlines sample processing, where once assets are built by `refgenie` there is minimal argument calls to `PEPATAC` to use all assets. Pipeline assets include: -Additionally, should you prefer `bwa` for alignment, you would use the [`refgenie bwa_index`](http://refgenie.databio.org/en/latest/available_assets/#bwa_index) as well. +**Required** -## Using local copies instead of `refgenie` managed assets +| `PEPATAC` argument | `refgenie` asset name | Description | +|--------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------| +| `--genome-index` | [`bowtie2_index`](http://refgenie.databio.org/en/latest/available_assets/#bowtie2_index) | A genome index file constructed from `bowtie2-build` | +| | [`bwa_index`](https://refgenie.databio.org/en/latest/available_assets/#bwa_index) | A genome index file constructed from `bwa index`. 
Required when using `bwa` (optional) for alignment. | +| `--chrom-sizes` | With `refgenie`, this asset is built automatically when you build/pull the [`fasta`](http://refgenie.databio.org/en/latest/available_assets/#fasta) asset. | A text file containing "chr" and "size" columns. | -For additionally flexibility, several of the `refgenie` managed assets may be manually specified. Why is this helpful? Maybe you want to compare custom TSS annotations for a one-off project. Or, perhaps you want to identify the fraction of your reads in a set of custom features relevant only to a specific sample. If these are resources you're not planning on reusing over and over again, maybe you simply don't want to have `refgenie` track them long term. +**Optional** -Custom blacklisted regions may be specified using the `--blacklist `. The blacklisted region file must simply be a `BED` formatted file to function correctly. The [`refgenie blacklist` asset](http://refgenie.databio.org/en/latest/available_assets/#blacklist) is intended to utilize the [ENCODE blacklists](https://github.com/Boyle-Lab/Blacklist) by default. +| `PEPATAC` argument | `refgenie` asset name | Description | +|------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `--prealignment-names` | Human readable genome alias(es) for `refgenie` managed `bowtie2_index` asset(s). | A space-delimited list of genome names. *e.g.* ["rCRSd", "human_repeats"] | +| `--prealignment-index` | [`bowtie2_index`](http://refgenie.databio.org/en/latest/available_assets/#bowtie2_index) | A genome index file constructed from `bowtie2-build`. 
Used for manually pointing to prealignment genome indices when using `bowtie2` (default) for alignment. | +| | [`bwa_index`](https://refgenie.databio.org/en/latest/available_assets/#bwa_index) | A genome index file constructed from `bwa index`. Used for manually pointing to prealignment genome indices when using `bwa` for alignment. | +| `--TSS-name` | [`refgene_anno`](http://refgenie.databio.org/en/latest/available_assets/#refgene_anno). `refgenie` `build/pull` the TSS annotation file with this asset. | Transcription start site (TSS) annotations. *e.g.* [refGene.txt.gz](https://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/refGene.txt.gz) | +| `--blacklist` | [`blacklist`](http://refgenie.databio.org/en/latest/available_assets/#blacklist) | A region blacklist. *e.g.* [the ENCODE blacklist](https://github.com/Boyle-Lab/Blacklist) | +| `--anno-name` | [`feat_annotation`](annotation.md) | A BED-style file with "chr", "start", "end", "genomic feature name", "score" and "strand" columns. | +| `--search-file` | [`tallymer_index`](https://refgenie.databio.org/en/latest/available_assets/#tallymer_index) The `search_file` is built from this `refgenie` asset. | File used to search an index of k-mers in the genome of the same size as input read lengths. Only required for `--sob` argument (*i.e.* using `seqOutBias` for enzyme bias correction). | + +## Using `refgenie` managed assets + +`PEPATAC` can utilize [`refgenie`](http://refgenie.databio.org/) assets. Because assets are user-dependent, these files must be available natively. Therefore, you need to [install and initialize a refgenie config file](http://refgenie.databio.org/en/latest/install/). For example: + +```console +pip install refgenie +export REFGENIE=/path/to/your_genome_folder/genome_config.yaml +refgenie init -c $REFGENIE +``` + +Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists. + +Next, pull the assets you need. 
Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). Download all standard assets for `hg38` like so: + +```console +refgenie pull hg38/fasta hg38/bowtie2_index hg38/refgene_anno hg38/ensembl_gtf hg38/ensembl_rb hg38/blacklist +refgenie build hg38/feat_annotation +``` + +`PEPATAC` also requires a `bowtie2_index` asset for any prealignment genomes: + +```console +refgenie pull rCRSd/bowtie2_index human_repeats/bowtie2_index +``` + +If you prefer `bwa` for alignment, you would use the [`refgenie bwa_index`](http://refgenie.databio.org/en/latest/available_assets/#bwa_index) instead. + +Furthermore, you can [learn more about using `seqOutBias` and the required `tallymer_index` here](sob.md). + +### Example using `refgenie` managed assets + +When using `refgenie`, you only need to provide the `--genome` and `--prealignment-names` arguments to supply the pipeline with every required index and optional annotation file that exists for those genomes. This means the TSS file, feature annotation file, and blacklist will all be used without needing to directly specify the paths to these files. + +From the `pepatac/` repository directory: +```console +looper run examples/test_project/test_config_refgenie.yaml +``` + +## Using manually managed assets + +Assets may also be managed manually and specified directly to the pipeline. While this frees you from needing `refgenie` installed and initialized, it does require a few more arguments to be specified. + +Custom blacklisted regions may be specified using the `--blacklist `. The blacklisted region file must simply be a `BED` formatted file to function correctly. The [`refgenie blacklist` asset](http://refgenie.databio.org/en/latest/available_assets/#blacklist) uses the [ENCODE blacklists](https://github.com/Boyle-Lab/Blacklist) by default. 
The TSS annotation file may be specified using `--TSS-name `. This file is also a `BED` formatted file. The `feat_annotation` asset may also be directly specified using `--anno-name `. Read [more about using custom reference data](annotation.md). +### Example using manually managed assets + +Even when *not* using `refgenie`, you can still grab premade `--chrom-sizes` and `--genome-index` files from the `refgenie` servers. `Refgenie` uses algorithmically derived genome digests under-the-hood to unambiguously define genomes. That's what you'll see being used in the example below when we manually download these assets. Therefore, `2230c535660fb4774114bfa966a62f823fdb6d21acf138d4` is the digest for the human readable alias, "hg38", and `94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4` is the digest for "rCRSd." +```console +wget -O hg38.fasta.tgz http://rg.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta?tag=default +wget -O hg38.bowtie2_index.tgz http://rg.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/bowtie2_index?tag=default +wget -O rCRSd.bowtie2_index.tgz http://refgenomes.databio.org/v3/assets/archive/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index?tag=default +``` + +Then, extract these files: +```console +tar xvf hg38.fasta.tgz +tar xvf hg38.bowtie2_index.tgz +tar xvf rCRSd.bowtie2_index.tgz +``` +From the `pepatac/` repository folder (using the manually downloaded genome assets): +```console +looper run examples/test_project/test_config.yaml +``` From dfce2b74f9deb341b7b412100c12b30001cd7561 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 12 Jul 2021 09:30:59 -0400 Subject: [PATCH 50/66] include untar step --- docs/run-bulker.md | 7 +++++++ docs/run-conda.md | 9 ++++++++- docs/run-container.md | 7 +++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/docs/run-bulker.md b/docs/run-bulker.md index ff490575..4f6deb4a 100644 --- a/docs/run-bulker.md +++ 
b/docs/run-bulker.md @@ -98,6 +98,13 @@ wget -O hg38.bowtie2_index.tgz http://rg.databio.org/v3/assets/archive/2230c535 wget -O rCRSd.bowtie2_index.tgz http://refgenomes.databio.org/v3/assets/archive/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index?tag=default ``` +Then, extract these files: +```console +tar xvf hg38.fasta.tgz +tar xvf hg38.bowtie2_index.tgz +tar xvf rCRSd.bowtie2_index.tgz +``` + From the `pepatac/` repository folder (using the manually downloaded genome assets): ```console pipelines/pepatac.py --single-or-paired paired \ diff --git a/docs/run-conda.md b/docs/run-conda.md index 91a4bc6e..e9ec20aa 100644 --- a/docs/run-conda.md +++ b/docs/run-conda.md @@ -101,13 +101,20 @@ refgenie seek hg38/bowtie2_index.dir refgenie seek rCRSd/bowtie2_index.dir ``` -Alternatively, if you are *not* using `refgenie`, you can still grab premade `--chrom-sizes` and `--genome-index` files from the refgenie servers. `Refgenie` uses algorithmically derived genome digests under-the-hood to unambiguously define genomes. That's what you'll see being used in the example below when we manually download these assets. Therefore, `2230c535660fb4774114bfa966a62f823fdb6d21acf138d4` is the digest for the human readable alias, "hg38", and `94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4` is the digest for "rCRSd." +Alternatively, if you are *not* using `refgenie`, you can still grab premade `--chrom-sizes` and `--genome-index` files from the `refgenie` servers. `Refgenie` uses algorithmically derived genome digests under-the-hood to unambiguously define genomes. That's what you'll see being used in the example below when we manually download these assets. Therefore, `2230c535660fb4774114bfa966a62f823fdb6d21acf138d4` is the digest for the human readable alias, "hg38", and `94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4` is the digest for "rCRSd." 
```console wget -O hg38.fasta.tgz http://rg.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta?tag=default wget -O hg38.bowtie2_index.tgz http://rg.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/bowtie2_index?tag=default wget -O rCRSd.bowtie2_index.tgz http://refgenomes.databio.org/v3/assets/archive/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index?tag=default ``` +Then, extract these files: +```console +tar xvf hg38.fasta.tgz +tar xvf hg38.bowtie2_index.tgz +tar xvf rCRSd.bowtie2_index.tgz +``` + From the `pepatac/` repository folder (using the manually downloaded genome assets): ```console looper run -d examples/test_project/test_config.yaml diff --git a/docs/run-container.md b/docs/run-container.md index 2761e31f..da5dae30 100644 --- a/docs/run-container.md +++ b/docs/run-container.md @@ -57,6 +57,13 @@ wget -O hg38.bowtie2_index.tgz http://rg.databio.org/v3/assets/archive/2230c535 wget -O rCRSd.bowtie2_index.tgz http://refgenomes.databio.org/v3/assets/archive/94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4/bowtie2_index?tag=default ``` +Then, extract these files: +```console +tar xvf hg38.fasta.tgz +tar xvf hg38.bowtie2_index.tgz +tar xvf rCRSd.bowtie2_index.tgz +``` + ### 3. Pull the container image. 
**Docker**: You can pull the docker [databio/pepatac image](https://hub.docker.com/r/databio/pepatac/) from `dockerhub` like this: From 8ea756e1fa79262956e2087e614419b95e0246be Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 12 Jul 2021 09:31:40 -0400 Subject: [PATCH 51/66] update command template logic --- pipelines/pepatac.py | 2 +- sample_pipeline_interface.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index ee2caf07..bcff87fd 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ PEPATAC - ATACseq pipeline """ diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index baca0805..77f6c9d5 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -4,7 +4,7 @@ path: pipelines/pepatac.py input_schema: pepatac_input_schema.yaml output_schema: pepatac_output_schema.yaml command_template: > - python {pipeline.path} + {pipeline.path} --output-parent { looper.results_subdir } --cores { compute.cores } --mem { compute.mem } From d16ec5494440f2cb338bd2b70261a289db8350ee Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 13 Jul 2021 12:09:30 -0400 Subject: [PATCH 52/66] fix prealignment_names argument --- examples/gold_atac/metadata/gold_config_refgenie.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/gold_atac/metadata/gold_config_refgenie.yaml b/examples/gold_atac/metadata/gold_config_refgenie.yaml index f269d0e5..59346b08 100644 --- a/examples/gold_atac/metadata/gold_config_refgenie.yaml +++ b/examples/gold_atac/metadata/gold_config_refgenie.yaml @@ -4,7 +4,7 @@ pep_version: 2.0.0 sample_table: gold_sample_table.csv looper: - output_dir: "$PROCESSED/pepatac/gold_atac/default" + output_dir: "$PROCESSED/pepatac/gold_atac/default_refgenie" pipeline_interfaces: "$CODE/pepatac/project_pipeline_interface.yaml" sample_modifiers: @@ -21,7 +21,7 @@ 
sample_modifiers: organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] then: genome: hg38 - prealignments: ["rCRSd"] + prealignment_names: ["rCRSd"] project_modifiers: amend: From 35c208253a1a50fbb619866179b7065f0580cf03 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 13 Jul 2021 12:17:18 -0400 Subject: [PATCH 53/66] reduce smaller file default resource requests --- resources-sample.tsv | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/resources-sample.tsv b/resources-sample.tsv index 084c7b0a..4ccdc290 100644 --- a/resources-sample.tsv +++ b/resources-sample.tsv @@ -1,6 +1,6 @@ max_file_size cores mem time -0.05 4 10000 00-03:00:00 -0.5 8 12000 00-08:00:00 -1 16 16000 00-12:00:00 -10 32 24000 01-00:00:00 -NaN 32 32000 02-00:00:00 +0.05 4 8000 00-03:00:00 +0.5 8 10000 00-06:00:00 +1 12 12000 00-08:00:00 +10 24 16000 00-16:00:00 +NaN 32 32000 01-00:00:00 From 5f1ff8fa9e62ff90a81c328bd81546e79748a58f Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 14 Jul 2021 14:28:45 -0400 Subject: [PATCH 54/66] update version and R package inclusion --- containers/pepatac.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/containers/pepatac.Dockerfile b/containers/pepatac.Dockerfile index ac10320a..e1c5a80a 100644 --- a/containers/pepatac.Dockerfile +++ b/containers/pepatac.Dockerfile @@ -5,7 +5,7 @@ FROM phusion/baseimage:master LABEL maintainer Jason Smith "jasonsmith@virginia.edu" # Version info -LABEL version 0.9.16 +LABEL version 0.10.0 # Use baseimage-docker's init system. 
CMD ["/sbin/my_init"] From d041ed7b5a93eddfab24788771fd120c5f818e30 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 19 Jul 2021 13:20:27 -0400 Subject: [PATCH 55/66] update bulker crate --- sample_pipeline_interface.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index 77f6c9d5..9e8c1ef3 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -41,7 +41,7 @@ compute: singularity_image: ${SIMAGES}pepatac conda_env: pepatac docker_image: databio/pepatac - bulker_crate: databio/pepatac 1.0.7 + bulker_crate: databio/pepatac:1.0.8 size_dependent_variables: resources-sample.tsv bioconductor: readFunName: runCOCOA From 9344ab97183ad1339c545a2097b19490f8308e7c Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 19 Jul 2021 13:20:43 -0400 Subject: [PATCH 56/66] update deprecated guides use in FRiF plotting --- PEPATACr/R/PEPATACr.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R index 0c1719d4..f78d1054 100644 --- a/PEPATACr/R/PEPATACr.R +++ b/PEPATACr/R/PEPATACr.R @@ -836,7 +836,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size, group=feature, color=feature)) + #geom_line(aes(linetype=feature), size=2, alpha=0.5) + geom_line(size=2, alpha=0.5) + - guides(linetype = FALSE) + + guides(linetype = "none") + labs(x=expression(log[10]("number of bases")), y="FRiF") + theme_PEPATAC() @@ -885,7 +885,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size, p <- ggplot(covDF, aes(x=log10(cumSize), y=frip, group=feature, color=feature)) + geom_line(size=2, alpha=0.5) + - guides(linetype = FALSE) + + guides(linetype = "none") + labs(x=expression(log[10]("number of bases")), y="FRiF") + theme_PEPATAC() @@ -919,7 +919,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size, aes(x=log10(cumSize), y=frip, group=feature, color=feature)) + geom_line(aes(linetype=feature), size=2, 
alpha=0.5) + - guides(linetype = FALSE) + + guides(linetype = "none") + labs(x=expression(log[10]("number of bases")), y="FRiF") + theme_PEPATAC() From 5269fa7ebf296c17839946bce83711afad672950 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 20 Jul 2021 15:28:29 -0400 Subject: [PATCH 57/66] fix lists --- docs/run-container.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/run-container.md b/docs/run-container.md index da5dae30..91deba16 100644 --- a/docs/run-container.md +++ b/docs/run-container.md @@ -42,13 +42,13 @@ refgenie pull rCRSd/bowtie2_index #### 2b: Download assets manually If you prefer not to use `refgenie`, you can also download and construct assets manually. Again, because these are user-defined assets, they must exist outside of any container system. The minimum required assets for a genome includes: - - a chromosome sizes file: a text file containing "chr" and "size" columns. - - a [`bowtie2` genome index](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-build-indexer). +- a chromosome sizes file: a text file containing "chr" and "size" columns. +- a [`bowtie2` genome index](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-build-indexer). Optional assets include: - - a TSS annotation file: a BED file containing "chr", "start", "end", "gene name", "score", and "strand" columns. - - a region blacklist: e.g. [the ENCODE blacklist](https://github.com/Boyle-Lab/Blacklist) - - a [genomic feature annotation file](annotation.md) +- a TSS annotation file: a BED file containing "chr", "start", "end", "gene name", "score", and "strand" columns. +- a region blacklist: e.g. [the ENCODE blacklist](https://github.com/Boyle-Lab/Blacklist) +- a [genomic feature annotation file](annotation.md) You can obtain the minimally required pre-constructed `--chrom-sizes` and `--genome-index` files from the `refgenie` servers. 
`Refgenie` uses algorithmically derived genome digests under-the-hood to unambiguously define genomes. That's what you'll see being used in the example below when we manually download these assets. Therefore, `2230c535660fb4774114bfa966a62f823fdb6d21acf138d4` is the digest for the human readable alias, "hg38", and `94e0d21feb576e6af61cd2a798ad30682ef2428bb7eabbb4` is the digest for "rCRSd." ```console From 55b0d80c7540f4ee1bc8be4d2f1ec5dce5513c13 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 20 Jul 2021 16:03:37 -0400 Subject: [PATCH 58/66] fix assets lists --- docs/run-bulker.md | 14 ++++++++------ docs/run-conda.md | 6 ++++-- docs/run-container.md | 3 ++- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/docs/run-bulker.md b/docs/run-bulker.md index 4f6deb4a..6457f365 100644 --- a/docs/run-bulker.md +++ b/docs/run-bulker.md @@ -50,14 +50,16 @@ refgenie pull rCRSd/bowtie2_index #### 2b: Download assets manually -If you prefer not to use `refgenie`, you can also download and construct assets manually. The minimum required assets for a genome includes: - - a chromosome sizes file: a text file containing "chr" and "size" columns. - - a [`bowtie2` genome index](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-build-indexer). +If you prefer not to use `refgenie`, you can also download and construct assets manually. The minimum required assets for a genome includes: + +- a chromosome sizes file: a text file containing "chr" and "size" columns. +- a [`bowtie2` genome index](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-build-indexer). Optional assets include: - - a TSS annotation file: a BED file containing "chr", "start", "end", "gene name", "score", and "strand" columns. - - a region blacklist: e.g. 
[the ENCODE blacklist](https://github.com/Boyle-Lab/Blacklist) - - a [genomic feature annotation file](annotation.md) + +- a TSS annotation file: a BED file containing "chr", "start", "end", "gene name", "score", and "strand" columns. +- a region blacklist: e.g. [the ENCODE blacklist](https://github.com/Boyle-Lab/Blacklist) +- a [genomic feature annotation file](annotation.md) ### 3. Install and configure `bulker` diff --git a/docs/run-conda.md b/docs/run-conda.md index e9ec20aa..ae441d5b 100644 --- a/docs/run-conda.md +++ b/docs/run-conda.md @@ -81,11 +81,13 @@ refgenie pull rCRSd/bowtie2_index ### 5b: Download assets manually -If you prefer not to use `refgenie`, you can also download and construct assets manually. The minimum required assets for a genome includes: +If you prefer not to use `refgenie`, you can also download and construct assets manually. The minimum required assets for a genome includes: + - a chromosome sizes file: a text file containing "chr" and "size" columns. - a [`bowtie2` genome index](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-build-indexer). -Optional assets include: +Optional assets include: + - a TSS annotation file: a BED file containing "chr", "start", "end", "gene name", "score", and "strand" columns. - a region blacklist: e.g. [the ENCODE blacklist](https://github.com/Boyle-Lab/Blacklist) - a [genomic feature annotation file](annotation.md) diff --git a/docs/run-container.md b/docs/run-container.md index 91deba16..d47b3d5f 100644 --- a/docs/run-container.md +++ b/docs/run-container.md @@ -45,7 +45,8 @@ If you prefer not to use `refgenie`, you can also download and construct assets - a chromosome sizes file: a text file containing "chr" and "size" columns. - a [`bowtie2` genome index](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-build-indexer). 
-Optional assets include: +Optional assets include: + - a TSS annotation file: a BED file containing "chr", "start", "end", "gene name", "score", and "strand" columns. - a region blacklist: e.g. [the ENCODE blacklist](https://github.com/Boyle-Lab/Blacklist) - a [genomic feature annotation file](annotation.md) From da9543ada8709875f2d834baf1d8421f7f120e33 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 20 Jul 2021 16:03:48 -0400 Subject: [PATCH 59/66] update genome assets header --- docs/detailed-install.md | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/docs/detailed-install.md b/docs/detailed-install.md index 5809d464..e89f8202 100644 --- a/docs/detailed-install.md +++ b/docs/detailed-install.md @@ -216,11 +216,15 @@ Fantastic! Now that we have the pipeline and its requirements installed, we're r ## 4. Download a reference genome -Before we analyze anything, we also need a reference genome. `PEPATAC` uses [`refgenie`](http://refgenie.databio.org/) assets for alignment. If you haven't already, initialize a refgenie config file like this: +Before we analyze anything, we also need a reference genome. You can use our recommended approach, `refgenie`, or download the assets manually. + +### 4a: Initialize `refgenie` and download assets + +`PEPATAC` can utilize [`refgenie`](http://refgenie.databio.org/) assets. Because assets are user-dependent, these files must still be available natively. Therefore, we need to [install and initialize a refgenie config file.](http://refgenie.databio.org/en/latest/install/). For example: ```console -pip install --user refgenie -export REFGENIE=your_genome_folder/genome_config.yaml +pip install refgenie +export REFGENIE=/path/to/your_genome_folder/genome_config.yaml refgenie init -c $REFGENIE ``` @@ -229,12 +233,26 @@ Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it pers Next, pull the assets you need. 
Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). Download these required assets with this command: ```console -refgenie pull hg38/bowtie2_index refgene_anno feat_annotation +refgenie pull hg38/fasta hg38/bowtie2_index hg38/refgene_anno hg38/ensembl_gtf hg38/ensembl_rb +refgenie build hg38/feat_annotation ``` -PEPATAC also requires `bowtie2_index` for any pre-alignment genomes: +`PEPATAC` also requires a `bowtie2_index` asset for any pre-alignment genomes: ```console refgenie pull rCRSd/bowtie2_index -refgenie pull human_repeats/bowtie2_index ``` + +### 4b: Download assets manually + +If you prefer not to use `refgenie`, you can also download and construct assets manually. The minimum required assets for a genome include: + + - a chromosome sizes file: a text file containing "chr" and "size" columns. + - a [`bowtie2` genome index](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-build-indexer). + +Optional assets include: + + - a TSS annotation file: a BED file containing "chr", "start", "end", "gene name", "score", and "strand" columns. + - a region blacklist: e.g. 
[the ENCODE blacklist](https://github.com/Boyle-Lab/Blacklist) + - a [genomic feature annotation file](annotation.md) + From 3c15d80a44b7339abf1560dab12b7ba958a7d20a Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 20 Jul 2021 16:14:21 -0400 Subject: [PATCH 60/66] update changelog and usage docs --- docs/changelog.md | 15 +++++++++ docs/usage.md | 84 ++++++++++++++++++++++++++++------------------- usage.txt | 84 ++++++++++++++++++++++++++++------------------- 3 files changed, 117 insertions(+), 66 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index c9e4e109..bfe964bc 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,6 +1,21 @@ # Change log All notable changes to this project will be documented in this file. +## [0.10.0] -- 2021-07-20 + +### Changed + - Make refgenie optional + - Use looper refgenie populate plugin to integrate refgenie + - Update docs to reflect changes to refgenie requirement and installation + - Fix file.path in reducePeaks() and take sample name argument + - Update blacklist reduce peaks step + - Rework CLI arguments to be fully explicit + - Update settings in R + - Update dockerfile to most recent refgenie usage and R package changes + - Reduce smaller file resource requests + +### Added + - Add assets descriptions and how to obtain to docs ## [0.9.16] -- 2021-05-18 diff --git a/docs/usage.md b/docs/usage.md index e0818e3e..fef96249 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -7,22 +7,26 @@ `python pipelines/pepatac.py --help` ```{console} usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] - [--logdev] [-C CONFIG_FILE] -O PARENT_OUTPUT_FOLDER - [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] -S SAMPLE_NAME -I + [--logdev] [-C CONFIG_FILE] [-O PARENT_OUTPUT_FOLDER] + [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] [-S SAMPLE_NAME] -I INPUT_FILES [INPUT_FILES ...] 
[-I2 [INPUT_FILES2 [INPUT_FILES2 ...]]] -G GENOME_ASSEMBLY - [-Q SINGLE_OR_PAIRED] [--aligner {bowtie2,bwa}] - [--peak-caller {fseq,fseq2,genrich,hmmratac,homer,macs2}] - [-gs GENOME_SIZE] [--trimmer {trimmomatic,pyadapt,skewer}] - [--prealignments PREALIGNMENTS [PREALIGNMENTS ...]] + [-Q SINGLE_OR_PAIRED] + [--trimmer {trimmomatic,pyadapt,skewer}] + [--aligner {bowtie2,bwa}] [--deduplicator {picard,samblaster,samtools}] - [--TSS-name TSS_NAME] [--blacklist BLACKLIST] - [--anno-name ANNO_NAME] [--peak-type {fixed,variable}] + [--peak-caller {fseq,fseq2,genrich,hmmratac,homer,macs2}] + [-gs GENOME_SIZE] [--peak-type {fixed,variable}] [--extend EXTEND] [--frip-ref-peaks FRIP_REF_PEAKS] [--motif] [--sob] [--no-scale] [--prioritize] [--keep] - [--noFIFO] [--lite] [--skipqc] [-V] + [--noFIFO] [--lite] [--skipqc] + [--prealignment-names PREALIGNMENT_NAMES [PREALIGNMENT_NAMES ...]] + [--prealignment-index PREALIGNMENT_INDEX [PREALIGNMENT_INDEX ...]] + --genome-index GENOME_INDEX --chrom-sizes CHROM_SIZES + [--TSS-name TSS_NAME] [--blacklist BLACKLIST] + [--anno-name ANNO_NAME] [--search-file SEARCH_FILE] [-V] -PEPATAC version 0.9.16 +PEPATAC version 0.10.0 optional arguments: -h, --help show this help message and exit @@ -37,45 +41,40 @@ optional arguments: -C CONFIG_FILE, --config CONFIG_FILE Pipeline configuration file (YAML). Relative paths are with respect to the pipeline script. + -O PARENT_OUTPUT_FOLDER, --output-parent PARENT_OUTPUT_FOLDER + Parent output directory of project -M MEMORY_LIMIT, --mem MEMORY_LIMIT Memory limit for processes accepting such. Default units are megabytes unless specified using the suffix [K|M|G|T]. 
-P NUMBER_OF_CORES, --cores NUMBER_OF_CORES Number of cores for parallelized processes + -S SAMPLE_NAME, --sample-name SAMPLE_NAME + Name for sample to run -I2 [INPUT_FILES2 [INPUT_FILES2 ...]], --input2 [INPUT_FILES2 [INPUT_FILES2 ...]] Secondary input files, such as read2 -Q SINGLE_OR_PAIRED, --single-or-paired SINGLE_OR_PAIRED Single- or paired-end sequencing protocol + --trimmer {trimmomatic,pyadapt,skewer} + Name of read trimming program. --aligner {bowtie2,bwa} - Name of read aligner + Name of read aligner. + --deduplicator {picard,samblaster,samtools} + Name of deduplicator program. --peak-caller {fseq,fseq2,genrich,hmmratac,homer,macs2} - Name of peak caller + Name of peak caller. -gs GENOME_SIZE, --genome-size GENOME_SIZE Effective genome size. It can be 1.0e+9 or 1000000000: e.g. human (2.7e9), mouse (1.87e9), C. elegans (9e7), fruitfly (1.2e8). Default:2.7e9 - --trimmer {trimmomatic,pyadapt,skewer} - Name of read trimming program - --prealignments PREALIGNMENTS [PREALIGNMENTS ...] - Space-delimited list of reference genomes to align to - before primary alignment. - --deduplicator {picard,samblaster,samtools} - Name of deduplicator program - --TSS-name TSS_NAME Path to TSS annotation file. - --blacklist BLACKLIST - Path to genomic region blacklist file - --anno-name ANNO_NAME - Path to reference annotation file (BED format) for - calculating FRiF --peak-type {fixed,variable} Call variable or fixed width peaks. Fixed width requires MACS2. --extend EXTEND How far to extend fixed width peaks up and downstream. --frip-ref-peaks FRIP_REF_PEAKS Path to reference peak set (BED format) for - calculating FRiP - --motif Perform motif enrichment analysis + calculating FRiP. + --motif Perform motif enrichment analysis. --sob Use seqOutBias to produce signal tracks, incorporate mappability information, and account for Tn5 bias. 
--no-scale Do not scale signal tracks: Default is to scale by @@ -84,19 +83,38 @@ optional arguments: --prioritize Plot cFRiF/FRiF using mutually exclusive priority ranked features based on the order of feature appearance in the feature annotation asset. - --keep Enable this flag to keep prealignment BAM files - --noFIFO Do NOT use named pipes during prealignments + --keep Enable this flag to keep prealignment BAM files. + --noFIFO Do NOT use named pipes during prealignments. --lite Only keep minimal, essential output to conserve disk space. --skipqc Skip FastQC. Useful for bugs in FastQC that appear with some sequence read files. + --prealignment-names PREALIGNMENT_NAMES [PREALIGNMENT_NAMES ...] + Space-delimited list of prealignment genome names to + align to before primary alignment. + --prealignment-index PREALIGNMENT_INDEX [PREALIGNMENT_INDEX ...] + Space-delimited list of prealignment genome name and + index files delimited by an equals sign to align to + before primary alignment. e.g. + rCRSd=/path/to/bowtie2_index/. + --genome-index GENOME_INDEX + Path to primary genome index file. Either a bowtie2 or + bwa index. + --chrom-sizes CHROM_SIZES + Path to primary genome chromosome sizes file. + --TSS-name TSS_NAME Path to TSS annotation file. + --blacklist BLACKLIST + Path to genomic region blacklist file. + --anno-name ANNO_NAME + Path to reference annotation file (BED format) for + calculating FRiF. + --search-file SEARCH_FILE + Required for seqOutBias (--sob). Path to tallymer + index search file built with the same read length as + the input. -V, --version show program's version number and exit required named arguments: - -O PARENT_OUTPUT_FOLDER, --output-parent PARENT_OUTPUT_FOLDER - Parent output directory of project - -S SAMPLE_NAME, --sample-name SAMPLE_NAME - Name for sample to run -I INPUT_FILES [INPUT_FILES ...], --input INPUT_FILES [INPUT_FILES ...] 
One or more primary input files -G GENOME_ASSEMBLY, --genome GENOME_ASSEMBLY diff --git a/usage.txt b/usage.txt index 980fc5ec..0c4a2d66 100644 --- a/usage.txt +++ b/usage.txt @@ -1,20 +1,24 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] - [--logdev] [-C CONFIG_FILE] -O PARENT_OUTPUT_FOLDER - [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] -S SAMPLE_NAME -I + [--logdev] [-C CONFIG_FILE] [-O PARENT_OUTPUT_FOLDER] + [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] [-S SAMPLE_NAME] -I INPUT_FILES [INPUT_FILES ...] [-I2 [INPUT_FILES2 [INPUT_FILES2 ...]]] -G GENOME_ASSEMBLY - [-Q SINGLE_OR_PAIRED] [--aligner {bowtie2,bwa}] - [--peak-caller {fseq,fseq2,genrich,hmmratac,homer,macs2}] - [-gs GENOME_SIZE] [--trimmer {trimmomatic,pyadapt,skewer}] - [--prealignments PREALIGNMENTS [PREALIGNMENTS ...]] + [-Q SINGLE_OR_PAIRED] + [--trimmer {trimmomatic,pyadapt,skewer}] + [--aligner {bowtie2,bwa}] [--deduplicator {picard,samblaster,samtools}] - [--TSS-name TSS_NAME] [--blacklist BLACKLIST] - [--anno-name ANNO_NAME] [--peak-type {fixed,variable}] + [--peak-caller {fseq,fseq2,genrich,hmmratac,homer,macs2}] + [-gs GENOME_SIZE] [--peak-type {fixed,variable}] [--extend EXTEND] [--frip-ref-peaks FRIP_REF_PEAKS] [--motif] [--sob] [--no-scale] [--prioritize] [--keep] - [--noFIFO] [--lite] [--skipqc] [-V] + [--noFIFO] [--lite] [--skipqc] + [--prealignment-names PREALIGNMENT_NAMES [PREALIGNMENT_NAMES ...]] + [--prealignment-index PREALIGNMENT_INDEX [PREALIGNMENT_INDEX ...]] + --genome-index GENOME_INDEX --chrom-sizes CHROM_SIZES + [--TSS-name TSS_NAME] [--blacklist BLACKLIST] + [--anno-name ANNO_NAME] [--search-file SEARCH_FILE] [-V] -PEPATAC version 0.9.16 +PEPATAC version 0.10.0 optional arguments: -h, --help show this help message and exit @@ -29,45 +33,40 @@ optional arguments: -C CONFIG_FILE, --config CONFIG_FILE Pipeline configuration file (YAML). Relative paths are with respect to the pipeline script. 
+ -O PARENT_OUTPUT_FOLDER, --output-parent PARENT_OUTPUT_FOLDER + Parent output directory of project -M MEMORY_LIMIT, --mem MEMORY_LIMIT Memory limit for processes accepting such. Default units are megabytes unless specified using the suffix [K|M|G|T]. -P NUMBER_OF_CORES, --cores NUMBER_OF_CORES Number of cores for parallelized processes + -S SAMPLE_NAME, --sample-name SAMPLE_NAME + Name for sample to run -I2 [INPUT_FILES2 [INPUT_FILES2 ...]], --input2 [INPUT_FILES2 [INPUT_FILES2 ...]] Secondary input files, such as read2 -Q SINGLE_OR_PAIRED, --single-or-paired SINGLE_OR_PAIRED Single- or paired-end sequencing protocol + --trimmer {trimmomatic,pyadapt,skewer} + Name of read trimming program. --aligner {bowtie2,bwa} - Name of read aligner + Name of read aligner. + --deduplicator {picard,samblaster,samtools} + Name of deduplicator program. --peak-caller {fseq,fseq2,genrich,hmmratac,homer,macs2} - Name of peak caller + Name of peak caller. -gs GENOME_SIZE, --genome-size GENOME_SIZE Effective genome size. It can be 1.0e+9 or 1000000000: e.g. human (2.7e9), mouse (1.87e9), C. elegans (9e7), fruitfly (1.2e8). Default:2.7e9 - --trimmer {trimmomatic,pyadapt,skewer} - Name of read trimming program - --prealignments PREALIGNMENTS [PREALIGNMENTS ...] - Space-delimited list of reference genomes to align to - before primary alignment. - --deduplicator {picard,samblaster,samtools} - Name of deduplicator program - --TSS-name TSS_NAME Path to TSS annotation file. - --blacklist BLACKLIST - Path to genomic region blacklist file - --anno-name ANNO_NAME - Path to reference annotation file (BED format) for - calculating FRiF --peak-type {fixed,variable} Call variable or fixed width peaks. Fixed width requires MACS2. --extend EXTEND How far to extend fixed width peaks up and downstream. --frip-ref-peaks FRIP_REF_PEAKS Path to reference peak set (BED format) for - calculating FRiP - --motif Perform motif enrichment analysis + calculating FRiP. + --motif Perform motif enrichment analysis. 
--sob Use seqOutBias to produce signal tracks, incorporate mappability information, and account for Tn5 bias. --no-scale Do not scale signal tracks: Default is to scale by @@ -76,19 +75,38 @@ optional arguments: --prioritize Plot cFRiF/FRiF using mutually exclusive priority ranked features based on the order of feature appearance in the feature annotation asset. - --keep Enable this flag to keep prealignment BAM files - --noFIFO Do NOT use named pipes during prealignments + --keep Enable this flag to keep prealignment BAM files. + --noFIFO Do NOT use named pipes during prealignments. --lite Only keep minimal, essential output to conserve disk space. --skipqc Skip FastQC. Useful for bugs in FastQC that appear with some sequence read files. + --prealignment-names PREALIGNMENT_NAMES [PREALIGNMENT_NAMES ...] + Space-delimited list of prealignment genome names to + align to before primary alignment. + --prealignment-index PREALIGNMENT_INDEX [PREALIGNMENT_INDEX ...] + Space-delimited list of prealignment genome name and + index files delimited by an equals sign to align to + before primary alignment. e.g. + rCRSd=/path/to/bowtie2_index/. + --genome-index GENOME_INDEX + Path to primary genome index file. Either a bowtie2 or + bwa index. + --chrom-sizes CHROM_SIZES + Path to primary genome chromosome sizes file. + --TSS-name TSS_NAME Path to TSS annotation file. + --blacklist BLACKLIST + Path to genomic region blacklist file. + --anno-name ANNO_NAME + Path to reference annotation file (BED format) for + calculating FRiF. + --search-file SEARCH_FILE + Required for seqOutBias (--sob). Path to tallymer + index search file built with the same read length as + the input. 
-V, --version show program's version number and exit required named arguments: - -O PARENT_OUTPUT_FOLDER, --output-parent PARENT_OUTPUT_FOLDER - Parent output directory of project - -S SAMPLE_NAME, --sample-name SAMPLE_NAME - Name for sample to run -I INPUT_FILES [INPUT_FILES ...], --input INPUT_FILES [INPUT_FILES ...] One or more primary input files -G GENOME_ASSEMBLY, --genome GENOME_ASSEMBLY From 069ebc6c8d095200788b8243391331de4b37481d Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 20 Jul 2021 16:15:14 -0400 Subject: [PATCH 61/66] update dockerfile --- containers/pepatac.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/containers/pepatac.Dockerfile b/containers/pepatac.Dockerfile index e1c5a80a..1adb2dc3 100644 --- a/containers/pepatac.Dockerfile +++ b/containers/pepatac.Dockerfile @@ -87,8 +87,8 @@ RUN pip install virtualenv && \ # Install R RUN apt update -qq && \ - DEBIAN_FRONTEND=noninteractive apt --assume-yes install --no-install-recommends dirmngr && \ - apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 && \ + DEBIAN_FRONTEND=noninteractive apt --assume-yes install --no-install-recommends dirmngr +RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 && \ add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/" RUN DEBIAN_FRONTEND=noninteractive apt-get --assume-yes install r-base r-base-dev r-base-core r-recommended && \ From 095090055ef8a5d759832858b842070e4d889405 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 21 Jul 2021 13:39:40 -0400 Subject: [PATCH 62/66] fix local bulker crate identification --- checkinstall | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/checkinstall b/checkinstall index e73257f7..2d6647bc 100755 --- a/checkinstall +++ b/checkinstall @@ -67,7 +67,7 @@ echo -e "Checking base requirements... 
" BASE_REQS=0 -declare -a requiredPkgs=("refgenie" "looper") +declare -a requiredPkgs=("looper") for package in ${requiredPkgs[@]}; do if ! pip_show $package; then @@ -401,14 +401,14 @@ else CWD=$(pwd) - if [ -f "$CWD/sample_pipeline_interface.yaml" ]; then - IFACE="$CWD/sample_pipeline_interface.yaml" + if [ -f "sample_pipeline_interface.yaml" ]; then + IFACE="sample_pipeline_interface.yaml" + CRATE=$(cat $IFACE | grep 'bulker_crate' | tr " " "\n" | tail -n 1) else IFACE=$(curl https://raw.githubusercontent.com/databio/pepatac/master/sample_pipeline_interface.yaml) + CRATE=$(echo $IFACE | tr " " "\n" | grep -A1 'bulker_crate' | tail -n 1) fi - CRATE=$(echo $IFACE | tr " " "\n" | grep -A1 'bulker_crate' | tail -n 1) - yes n | bulker load $CRATE if [ $? -eq 0 ]; then echo $(warn "WARNING: Could not bulker load ${CRATE}. Check out https://bulker.databio.org/en/latest/install/.") From b607db690ecb7c96a3adb78c71652e257a1d6596 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 21 Jul 2021 13:54:21 -0400 Subject: [PATCH 63/66] update installs to include install checks --- docs/install.md | 13 +++++++++++++ docs/run-bulker.md | 21 +++++++++++++++++---- docs/run-conda.md | 18 +++++++++++++++--- docs/run-container.md | 20 +++++++++++++++++--- 4 files changed, 62 insertions(+), 10 deletions(-) diff --git a/docs/install.md b/docs/install.md index e0d8d05f..ec3abe68 100644 --- a/docs/install.md +++ b/docs/install.md @@ -6,3 +6,16 @@ We provide several methods to setup `PEPATAC`. A fundamental challenge of any co 2. [Run the pipeline using a single, monolithic container.](run-container.md) 3. [Run the pipeline in a conda environment.](run-conda.md) 4. [Run the pipeline natively.](detailed-install.md) + +## Confirm installation + +After setting up your environment to run `PEPATAC`, you can confirm which means of running the pipeline are now executable using the included `checkinstall` script. 
This can either be run directly from the `pepatac/` repository: + +```console +./checkinstall +``` + +or from the web: +```console +curl -sSL https://raw.githubusercontent.com/databio/pepatac/master/checkinstall | bash +``` \ No newline at end of file diff --git a/docs/run-bulker.md b/docs/run-bulker.md index 6457f365..dfb5989c 100644 --- a/docs/run-bulker.md +++ b/docs/run-bulker.md @@ -65,14 +65,27 @@ Optional assets include: Check out [the `bulker` setup guide to install bulker](https://bulker.databio.org/en/latest/install/) on your system. It is a straightforward python package with a few configuration steps required prior to use with `PEPATAC`. -### 4. Load the `PEPATAC` crate +### 4. Confirm installation + +After setting up your environment to run `PEPATAC` with `bulker`, you can confirm the pipeline is now executable with `bulker` using the included `checkinstall` script. This can either be run directly from the `pepatac/` repository... + +```console +./checkinstall +``` + +or from the web: +```console +curl -sSL https://raw.githubusercontent.com/databio/pepatac/master/checkinstall | bash +``` + +### 5. Load the `PEPATAC` crate We've already produced a `bulker` crate for `PEPATAC` that requires all software needed to run the pipeline. We can load this crate directly from the [`bulker registry`](http://hub.bulker.io/): ```console bulker load databio/pepatac:1.0.7 -r ``` -### 5. Activate the `PEPATAC` crate +### 6. Activate the `PEPATAC` crate Now that we've loaded the `PEPATAC` crate, we need to activate that specific crate so its included tools are available. ```console @@ -80,7 +93,7 @@ bulker activate databio/pepatac:1.0.7 ``` Now, you can run any of the commands in the crate as if they were natively installed, **but they're actually running in containers**! -### 6. Run the sample-level pipeline +### 7. Run the sample-level pipeline Now we simply run the pipeline like you would with a native installation, but we wouldn't have needed to install any additional tools!
@@ -137,7 +150,7 @@ looper run examples/test_project/test_config.yaml looper run examples/test_project/test_config_refgenie.yaml ``` -### 7: Run the project level pipeline +### 8: Run the project level pipeline `PEPATAC` also includes a project-level processing pipeline to do things like: diff --git a/docs/run-conda.md b/docs/run-conda.md index ae441d5b..d13c76f8 100644 --- a/docs/run-conda.md +++ b/docs/run-conda.md @@ -92,7 +92,20 @@ Optional assets include: - a region blacklist: e.g. [the ENCODE blacklist](https://github.com/Boyle-Lab/Blacklist) - a [genomic feature annotation file](annotation.md) -## 6: Use `looper` to run the sample processing pipeline +## 6. Confirm installation + +After setting up your environment to run `PEPATAC` with `conda`, you can confirm the pipeline is now executable with `conda` using the included `checkinstall` script. This can either be run directly from the `pepatac/` repository... + +```console +./checkinstall +``` + +or from the web: +```console +curl -sSL https://raw.githubusercontent.com/databio/pepatac/master/checkinstall | bash +``` + +## 7: Use `looper` to run the sample processing pipeline Start by running the example project (`test_config.yaml`) in the `examples/test_project/` folder. `PEPATAC` can utilize a project management tool called `looper` to run the sample-level pipeline across each sample in a project. Let's use the `-d` argument to first try a dry run, which will create job scripts for every sample in a project, but will not execute them: @@ -129,7 +142,7 @@ looper run examples/test_project/test_config.yaml There are lots of other cool things you can do with `looper`, like dry runs, report results, check on pipeline run status, clean intermediate files to save disk space, lump multiple samples into one job, and more. For details, consult the [looper docs](http://looper.databio.org/).
-## 7: Use `looper` to run the project level pipeline +## 8: Use `looper` to run the project level pipeline `PEPATAC` also includes a project-level processing pipeline to do things like: @@ -144,4 +157,3 @@ looper runp examples/test_project/test_config.yaml ``` This should take < a minute on the test sample and will generate a `summary/` directory containing project level output in the parent project directory. In this small example, there won't be a consensus peak set or count table because it is only a single sample. To see more, you can [run through the extended tutorial](tutorial.md) to see this in action. - diff --git a/docs/run-container.md b/docs/run-container.md index d47b3d5f..62650d3d 100644 --- a/docs/run-container.md +++ b/docs/run-container.md @@ -85,6 +85,19 @@ make singularity Now you'll need to tell the pipeline where you saved the singularity image. You can either create an environment variable called `$SIMAGES` that points to the folder where your image is stored, or you can tweak the `pipeline_interface.yaml` file so that the `compute.singularity_image` attribute is pointing to the right location on disk. +### 6. Confirm installation + +After setting up your environment to run `PEPATAC` using containers, you can confirm the pipeline is now executable with your container system using the included `checkinstall` script. This can either be run directly from the `pepatac/` repository... + +```console +./checkinstall +``` + +or from the web: +```console +curl -sSL https://raw.githubusercontent.com/databio/pepatac/master/checkinstall | bash +``` + ### 4. Run individual samples in a container Individual jobs can be run in a container by simply running the `pepatac.py` command through `docker run` or `singularity exec`. You can run containers either on your local computer, or in an HPC environment, as long as you have `docker` or `singularity` installed. You will need to include any volumes that contain data required by the pipeline.
For example, to utilize `refgenie` assets you'll need to ensure the volume containing those files is available. In the following example, we are including an environment variable (`$GENOMES`) which points to such a directory. @@ -100,6 +113,10 @@ docker run --rm -it databio/pepatac pipelines/pepatac.py --help ``` Be sure to mount the volumes you need with `--volume`. If you're utilizing any environment variables (e.g. `$REFGENIE`), don't forget to include those in your docker command with the `-e` option. +### 5. Running multiple samples in a container with looper + +To run multiple samples in a container, you simply need to configure `looper` to use a container-compatible template. The looper documentation has instructions for [running jobs in containers](http://looper.databio.org/en/latest/containers/). + ### Container details #### Using `docker` @@ -177,6 +194,3 @@ Third, close your instance when finished. singularity instance stop pepatac_instance ``` -### 5. Running multiple samples in a container with looper - -To run multiple samples in a container, you simply need to configure `looper` to use a container-compatible template. The looper documentation has instructions for [running jobs in containers](http://looper.databio.org/en/latest/containers/). \ No newline at end of file From a78cb4a780ae6a6c1230a5ec878e9b81b1e2300e Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 21 Jul 2021 13:54:31 -0400 Subject: [PATCH 64/66] add info on install check --- docs/changelog.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/changelog.md b/docs/changelog.md index bfe964bc..db12bdd8 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -16,6 +16,7 @@ All notable changes to this project will be documented in this file. 
### Added - Add assets descriptions and how to obtain to docs + - Add script to check native, conda, and container-based installs ## [0.9.16] -- 2021-05-18 From e7fc95a63428d65063aa0fb43ae17a20c91e0220 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 21 Jul 2021 13:54:41 -0400 Subject: [PATCH 65/66] include container-engine check --- checkinstall | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/checkinstall b/checkinstall index 2d6647bc..736534a9 100755 --- a/checkinstall +++ b/checkinstall @@ -451,6 +451,18 @@ else echo -e $(fail "ERROR: PEPATAC cannot be run via conda.") fi +if [ "$DOCKER" -eq 0 ]; then + echo -e $(success "SUCCESS: PEPATAC can be run using docker!") +else + echo -e $(fail "ERROR: PEPATAC cannot be run using docker.") +fi + +if [ "$SINGULARITY" -eq 0 ]; then + echo -e $(success "SUCCESS: PEPATAC can be run using singularity!") +else + echo -e $(fail "ERROR: PEPATAC cannot be run using singularity.") +fi + if [ "$BULKER_INSTALL" -eq 0 ]; then echo -e $(success "SUCCESS: PEPATAC can be run using bulker!") else From a0e5ef68e5d50f39802593bde03b6a3865bab11b Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 21 Jul 2021 13:59:43 -0400 Subject: [PATCH 66/66] add checkinstall section --- docs/detailed-install.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/detailed-install.md b/docs/detailed-install.md index e89f8202..664a96f5 100644 --- a/docs/detailed-install.md +++ b/docs/detailed-install.md @@ -256,3 +256,15 @@ Optional assets include: - a region blacklist: e.g. [the ENCODE blacklist](https://github.com/Boyle-Lab/Blacklist) - a [genomic feature annotation file](annotation.md) +### 5: Confirm installation + +After setting up your environment to run `PEPATAC`, you can confirm which means of running the pipeline are now executable using the included `checkinstall` script. 
This can either be run directly from the `pepatac/` repository: + +```console +./checkinstall +``` + +or from the web: +```console +curl -sSL https://raw.githubusercontent.com/databio/pepatac/master/checkinstall | bash +``` \ No newline at end of file