diff --git a/README.md b/README.md index f4cd568d..3086debf 100644 --- a/README.md +++ b/README.md @@ -11,10 +11,16 @@ Documentation is written using [mkdocs](https://www.mkdocs.org/) and themed with Each tool gets a `nav` section in `mkdocs.yml`, which maps to its own section/tab in the rendered documentation. So to add a new page, change titles, or change structure, edit `mkdocs.yml`. To edit the documentation itself, edit the `.md` documentation files in the subfolders under `/docs`. -### Prereqs +### Prerequisites +```bash +pip install mkdocs-material mkdocstrings[python] mkdocs-jupyter ``` -pip install mkdocs-material + +You'll also need to install the packages being documented (peppy, looper, pipestat, pypiper, geofetch, eido, yacman) for the API documentation to build correctly: + +```bash +pip install peppy looper pipestat pypiper geofetch eido yacman ``` @@ -22,12 +28,14 @@ pip install mkdocs-material I recommend previewing your changes locally before deploying. You can get a hot-reload server going by cloning this repository, and then just running: -``` +```bash mkdocs serve ``` You can also use `mkdocs build` to build a portable local version of the docs. +The documentation now uses **mkdocstrings** for Python API documentation and **mkdocs-jupyter** for Jupyter notebooks. These plugins automatically generate documentation from the source code and render notebooks, so the build process is now a single step. + ### Publishing updates @@ -35,37 +43,42 @@ The documentation is published automatically upon commits to `master` using a Gi ## FAQ +### Python API Documentation -### Updating automatic documentation - -In the past, I had a plugin that would auto-document 2 things: 1. Python docs using lucidoc, and 2. Jupyter notebooks. This plugin was neat, but it caused me a lot of maintenance issues as well. So now, I've made it much simpler; now it's no longer a plugin, just a simple Python script. Update all the auto-generated docs (stored in `docs/autodoc_build`) by running the update script manually: +Python API documentation is now automatically generated using **mkdocstrings** during the build process. No separate script is needed. The API docs are defined in markdown files (e.g., `docs/peppy/code/python-api.md`) using the `:::` syntax: -```console -python update_python_autodocs.py +```markdown +::: peppy.Project + options: + docstring_style: google + show_source: true ``` -#### Configuring lucidoc rendering +This syntax tells mkdocstrings to extract and render the documentation for the specified class or function directly from the source code. -Auto-generated Python documentation with `lucidoc` rendering is configured in the `lucidoc` sections of `mkdocs.yml`. +### Jupyter Notebooks + +Jupyter notebooks are now rendered automatically using the **mkdocs-jupyter** plugin. Configure which notebooks to include in the `plugins` section of `mkdocs.yml`: ```yaml -lucidoc: - peppy: path/to/output.md +plugins: +- mkdocs-jupyter: + include: + - peppy/notebooks/*.ipynb + - looper/notebooks/*.ipynb ``` -#### Configuring jupyter rendering +Notebooks are rendered directly from `.ipynb` files during the build - no conversion step is needed. -Configure jupyter notebeeoks in the `jupyter` section, where you specify a list of `in` (for `.ipynb` files) and `out` (for `.md` files) locations. 
+### CLI Usage Documentation -```yaml -jupyter: - - in: path/to/notebook_folder1 - out: path/to/rendered_folder1 - - in: path/to/notebook_folder2 - out: path/to/rendered_folder2 -``` - -There, you can specify which folders contain notebooks, and to where they should be rendered as markdown. +CLI usage documentation for geofetch can be updated manually when needed using the helper script: + +```bash +python scripts/generate_cli_usage_docs.py +``` + +This script reads the template at `docs/geofetch/usage-template.md.tpl` and runs `geofetch --help` to generate `docs/geofetch/code/usage.md`. This only needs to be run when the CLI interface changes. ### Can we version the docs? diff --git a/autodoc.py b/autodoc.py deleted file mode 100644 index 5231f588..00000000 --- a/autodoc.py +++ /dev/null @@ -1,102 +0,0 @@ -# This script will auto-generate documentation for Python code, CLI usage, and Jupyter notebooks -# It is intended to be run as a pre-build step in a MkDocs project -# It will read the mkdocs.yml file for configuration -# It will use the lucidoc package to auto-generate documentation for Python code -# It will use the subprocess package to run CLI commands and capture the output -# It will use the nbconvert package to convert Jupyter notebooks to markdown - -import lucidoc -import yaml -import subprocess -import glob -import nbconvert -import os -from pathlib import Path - -import argparse - -parser = argparse.ArgumentParser(description="Description of your program") -parser.add_argument( - "--x-usage", - help="Exclude usage", - required=False, - default=False, - action="store_true", -) -parser.add_argument( - "--x-lucidoc", - help="Exclude lucidoc", - required=False, - default=False, - action="store_true", -) -parser.add_argument( - "--x-jupyter", - help="Exclude jupyter", - required=False, - default=False, - action="store_true", -) - -args = vars(parser.parse_args()) - -print(args) - -# Read the mkdocs config -with open("mkdocs.yml") as stream: - cfg = yaml.safe_load(stream) - - -if "autodoc" not in cfg: - print("No autodoc configuration found in mkdocs.yml") - exit(1) -else: - cfg = cfg["autodoc"] - -# Process auto-documented Python code -if args["x_lucidoc"] is False and "lucidoc" in cfg: - for bundle in cfg["lucidoc"]: - print(f"Documenting lucidoc '{bundle['pkg']}' at {bundle['outfile']}") - lucidoc.run_lucidoc(parse_style="rst", **bundle) -else: - print("Skipping lucidoc") - - -usage_tpl = """ -\n`{cmd}` -\n -```console -{usage} -``` -""" - -# Process CLI usage -if args["x_usage"] is False and "cli_usage" in cfg: - for item in cfg["cli_usage"]: - result = "" - with open(item["template"], "r") as file: - result = file.read() - for cmd in item["commands"]: - print(f"Documenting command '{cmd}' to '{item['outfile']}'") - usage = subprocess.check_output(cmd, shell=True).decode("utf-8") - content = usage_tpl.format(cmd=cmd, usage=usage) - result += content - with open(item["outfile"], "w") as file: - file.write(result) -else: - print("Skipping usage documentation") - -# # Render Juptyer notebooks to markdown -if args["x_jupyter"] is False and "jupyter" in cfg: - for item in cfg["jupyter"]: - files = glob.glob(f"docs/{item['in']}/*.ipynb") - for nb in files: - bn, _ = os.path.splitext(os.path.basename(nb)) - out = f"docs/{item['out']}/{bn}.md" - print(f"Converting '{nb}' to '{out}'") - md_result = nbconvert.exporters.export(nbconvert.MarkdownExporter(), nb)[0] - Path(os.path.dirname(out)).mkdir(parents=True, exist_ok=True) - with open(out, "w") as stream: - stream.write(md_result) 
-else: - print("Skipping jupyter notebooks") diff --git a/docs/eido/code/cli.md b/docs/eido/code/cli.md index 0bdfc283..061bfa47 100644 --- a/docs/eido/code/cli.md +++ b/docs/eido/code/cli.md @@ -42,7 +42,7 @@ eido inspect peppro_paper.yaml Sections: name, pep_version, sample_table, looper, sample_modifiers - - To inspect a specific sample, one needs to provide the sample name (via `-n`/`--sample-name` oprional argument) + - To inspect a specific sample, one needs to provide the sample name (via `-n`/`--sample-name` optional argument) ```bash @@ -213,6 +213,8 @@ eido convert peppro_paper.yaml --format csv H9_treated_PRO-seq_1,hg38,human,['$CODE/peppro/sample_pipeline_interface.yaml'],human_rDNA,PRO,/Users/mstolarczyk/H9_200nM_romidepsin_rep1_PE1.fastq.gz,PAIRED,H9 treated PRO-seq,H9_treated_PRO-seq_1,H9_200nM_romidepsin_rep1,200 nM romidepsin,8,/Users/mstolarczyk/H9_200nM_romidepsin_rep1_PE2.fastq.gz H9_treated_PRO-seq_2,hg38,human,['$CODE/peppro/sample_pipeline_interface.yaml'],human_rDNA,PRO,/Users/mstolarczyk/H9_200nM_romidepsin_rep2_PE1.fastq.gz,PAIRED,H9 treated PRO-seq,H9_treated_PRO-seq_2,H9_200nM_romidepsin_rep2,200 nM romidepsin,8,/Users/mstolarczyk/H9_200nM_romidepsin_rep2_PE2.fastq.gz H9_treated_PRO-seq_3,hg38,human,['$CODE/peppro/sample_pipeline_interface.yaml'],human_rDNA,PRO,/Users/mstolarczyk/H9_200nM_romidepsin_rep3_PE1.fastq.gz,PAIRED,H9 treated PRO-seq,H9_treated_PRO-seq_3,H9_200nM_romidepsin_rep3,200 nM romidepsin,8,/Users/mstolarczyk/H9_200nM_romidepsin_rep3_PE2.fastq.gz + + H9_PRO-seq_10,hg38,human,['$CODE/peppro/sample_pipeline_interface.yaml'],human_rDNA,PRO,/Users/mstolarczyk/H9_PRO-seq_10pct_PE1.fastq.gz,PAIRED,10% subset H9 PRO-seq 2,H9_PRO-seq_10,H9_PRO-seq_10pct,DMSO,8,/Users/mstolarczyk/H9_PRO-seq_10pct_PE2.fastq.gz H9_PRO-seq_20,hg38,human,['$CODE/peppro/sample_pipeline_interface.yaml'],human_rDNA,PRO,/Users/mstolarczyk/H9_PRO-seq_20pct_PE1.fastq.gz,PAIRED,20% subset H9 PRO-seq 2,H9_PRO-seq_20,H9_PRO-seq_20pct,DMSO,8,/Users/mstolarczyk/H9_PRO-seq_20pct_PE2.fastq.gz H9_PRO-seq_30,hg38,human,['$CODE/peppro/sample_pipeline_interface.yaml'],human_rDNA,PRO,/Users/mstolarczyk/H9_PRO-seq_30pct_PE1.fastq.gz,PAIRED,30% subset H9 PRO-seq 2,H9_PRO-seq_30,H9_PRO-seq_30pct,DMSO,8,/Users/mstolarczyk/H9_PRO-seq_30pct_PE2.fastq.gz diff --git a/docs/eido/code/plugin-api-docs.md b/docs/eido/code/plugin-api-docs.md index 3bf7ec1b..bf596cfe 100644 --- a/docs/eido/code/plugin-api-docs.md +++ b/docs/eido/code/plugin-api-docs.md @@ -1,95 +1,42 @@ - +# Eido Built-in Filters API - +Eido provides built-in filter functions that can transform PEP projects into different output formats. These filters are useful for converting PEPs to various representations like YAML, CSV, or other formats. +### Available Filters -# Package `eido` Documentation +Eido includes several built-in filters for converting and exporting PEP data: +- **basic_pep_filter**: Returns the basic PEP representation +- **yaml_pep_filter**: Converts PEP to YAML format +- **csv_pep_filter**: Exports sample tables as CSV +- **yaml_samples_pep_filter**: Exports only sample data as YAML -Project configuration +## API Reference -```python -def basic_pep_filter(p, **kwargs) -> Dict[str, str] -``` +### Filter Functions -Basic PEP filter, that does not convert the Project object. +::: eido.basic_pep_filter + options: + docstring_style: google + show_source: true + show_signature: true -This filter can save the PEP representation to file, if kwargs include `path`. 
-#### Parameters: +::: eido.yaml_pep_filter + options: + docstring_style: google + show_source: true + show_signature: true -- `p` (`peppy.Project`): a Project to run filter on +::: eido.csv_pep_filter + options: + docstring_style: google + show_source: true + show_signature: true - - - -```python -def yaml_pep_filter(p, **kwargs) -> Dict[str, str] -``` - -YAML PEP filter, that returns Project object representation. - -This filter can save the YAML to file, if kwargs include `path`. -#### Parameters: - -- `p` (`peppy.Project`): a Project to run filter on - - - - -```python -def csv_pep_filter(p, **kwargs) -> Dict[str, str] -``` - -CSV PEP filter, that returns Sample object representations - -This filter can save the CSVs to files, if kwargs include -`sample_table_path` and/or `subsample_table_path`. -#### Parameters: - -- `p` (`peppy.Project`): a Project to run filter on - - - - -```python -def yaml_samples_pep_filter(p, **kwargs) -> Dict[str, str] -``` - -YAML samples PEP filter, that returns only Sample object representations. - -This filter can save the YAML to file, if kwargs include `path`. -#### Parameters: - -- `p` (`peppy.Project`): a Project to run filter on - - - - - - - -*Version Information: `eido` v0.2.2, generated by `lucidoc` v0.4.4* \ No newline at end of file +::: eido.yaml_samples_pep_filter + options: + docstring_style: google + show_source: true + show_signature: true diff --git a/docs/eido/code/python-api.md b/docs/eido/code/python-api.md index 5559572f..e4cd56b0 100644 --- a/docs/eido/code/python-api.md +++ b/docs/eido/code/python-api.md @@ -1,213 +1,71 @@ - - - - - # Package `eido` Documentation +## Package Overview -Project configuration - -## Class `EidoValidationError` -Object was not validated successfully according to schema. - - -```python -def __init__(self, message, errors_by_type) -``` - -Initialize self. See help(type(self)) for accurate signature. - - - -```python -def validate_project(project, schema) -``` - -Validate a project object against a schema -#### Parameters: - -- `project` (`peppy.Project`): a project object to validate -- `schema` (`str | dict`): schema dict to validate against or a path to onefrom the error. Useful when used ith large projects - - - - -```python -def validate_sample(project, sample_name, schema) -``` - -Validate the selected sample object against a schema -#### Parameters: - -- `project` (`peppy.Project`): a project object to validate -- `sample_name` (`str | int`): name or index of the sample to validate -- `schema` (`str | dict`): schema dict to validate against or a path to one +The `eido` package provides validation and filtering tools for PEPs (Portable Encapsulated Projects). It enables schema-based validation of project metadata and provides flexible filtering mechanisms. 
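For the filtering side of the package, a minimal sketch using `get_available_pep_filters` and `convert_project`, both covered in the API reference; the project path and the `"csv"` target name are illustrative, so check the returned filter list for the names registered in your installation:

```python
from eido import convert_project, get_available_pep_filters
from peppy import Project

# Load a project (path is illustrative)
prj = Project("project_config.yaml")

# See which target formats (filters) are registered
print(get_available_pep_filters())

# Convert the project with one of the registered filters;
# "csv" is illustrative -- pick a name from the list printed above
converted = convert_project(prj, target_format="csv")
```

Per the filter documentation, keyword arguments such as `path` can be forwarded through `plugin_kwargs` to write the converted representation to disk instead of only returning it.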
+### Key Features +- **Schema Validation**: Validate PEPs against JSON schemas +- **Sample Filtering**: Filter samples based on custom criteria +- **Config Validation**: Validate project configuration separately +- **Extensible Filtering**: Support for custom filter plugins +- **Error Reporting**: Detailed validation error messages +### Installation -```python -def validate_config(project, schema) +```bash +pip install eido ``` -Validate the config part of the Project object against a schema -#### Parameters: - -- `project` (`peppy.Project`): a project object to validate -- `schema` (`str | dict`): schema dict to validate against or a path to one - - - +### Quick Example ```python -def read_schema(schema) -``` - -Safely read schema from YAML-formatted file. - -If the schema imports any other schemas, they will be read recursively. -#### Parameters: - -- `schema` (`str | Mapping`): path to the schema fileor schema in a dict form - +from eido import validate_project +from peppy import Project -#### Returns: +# Load a project +prj = Project("project_config.yaml") -- `list[dict]`: read schemas - - -#### Raises: - -- `TypeError`: if the schema arg is neither a Mapping nor a file path orif the 'imports' sections in any of the schemas is not a list - - - - -```python -def inspect_project(p, sample_names=None, max_attr=10) +# Validate against a schema +validate_project(prj, "schema.yaml") ``` -Print inspection info: Project or, if sample_names argument is provided, matched samples -#### Parameters: - -- `p` (`peppy.Project`): project to inspect -- `sample_names` (`Iterable[str]`): list of samples to inspect -- `max_attr` (`int`): max number of sample attributes to display - - - - -```python -def get_available_pep_filters() -``` - -Get a list of available target formats -#### Returns: - -- `List[str]`: a list of available formats - - - - -```python -def convert_project(prj, target_format, plugin_kwargs=None) -``` - -Convert a `peppy.Project` object to a selected format -#### Parameters: - -- `prj` (`peppy.Project`): a Project object to convert -- `plugin_kwargs` (`dict`): kwargs to pass to the plugin function -- `target_format` (`str`): the format to convert the Project object to - - -#### Raises: - -- `EidoFilterError`: if the requested filter is not defined - - - - -```python -def validate_input_files(project, schemas, sample_name=None) -``` - -Determine which of the required and optional files are missing. - -The names of the attributes that are required and/or deemed as inputs -are sourced from the schema, more specifically from `required_files` -and `files` sections in samples section: -- If any of the required files are missing, this function raises an error. -- If any of the optional files are missing, the function raises a warning. -Note, this function also performs Sample object validation with jsonschema. -#### Parameters: - -- `project` (`peppy.Project`): project that defines the samples to validate -- `schema` (`str | dict`): schema dict to validate against or a path to one -- `sample_name` (`str | int`): name or index of the sample to validate. If None,validate all samples in the project - - -#### Raises: - -- `PathAttrNotFoundError`: if any required sample attribute is missing - - - - -```python -def get_input_files_size(sample, schema) -``` - -Determine which of this Sample's required attributes/files are missing and calculate sizes of the files (inputs). 
- -The names of the attributes that are required and/or deemed as inputs -are sourced from the schema, more specifically from required_input_attrs -and input_attrs sections in samples section. Note, this function does -perform actual Sample object validation with jsonschema. -#### Parameters: - -- `sample` (`peppy.Sample`): sample to investigate -- `schema` (`list[dict] | str`): schema dict to validate against or a path to one - - -#### Returns: - -- `dict`: dictionary with validation data, i.e missing,required_inputs, all_inputs, input_file_size - - -#### Raises: +## API Reference -- `ValidationError`: if any required sample attribute is missing +### Validation Functions +::: eido.validate_project + options: + docstring_style: google + show_source: true + show_signature: true +::: eido.validate_sample + options: + docstring_style: google + show_source: true + show_signature: true +::: eido.validate_config + options: + docstring_style: google + show_source: true + show_signature: true +### Schema Functions +::: eido.read_schema + options: + docstring_style: google + show_source: true + show_signature: true +### Exceptions -*Version Information: `eido` v0.2.2, generated by `lucidoc` v0.4.4* \ No newline at end of file +::: eido.EidoValidationError + options: + docstring_style: google + show_source: true + show_signature: true + merge_init_into_class: true diff --git a/docs/eido/nunjucks.js b/docs/eido/nunjucks.js index 1f0d873d..5898ecd3 100644 --- a/docs/eido/nunjucks.js +++ b/docs/eido/nunjucks.js @@ -7356,7 +7356,7 @@ function defined(value) { exports.defined = defined; /** - * Returns `true` if the operand (one) is divisble by the test's argument + * Returns `true` if the operand (one) is divisible by the test's argument * (two). * @param { number } one * @param { number } two diff --git a/docs/geofetch/changelog.md b/docs/geofetch/changelog.md index 2846978b..69c97337 100644 --- a/docs/geofetch/changelog.md +++ b/docs/geofetch/changelog.md @@ -1,5 +1,20 @@ # Changelog +## [0.12.8] -- 2025-07-08 +- Updated docs +- Fixed parsing nested items. [[143](https://github.com/pepkit/geofetch/issues/143)] +- Added pypiper to requirements [[142](https://github.com/pepkit/geofetch/issues/142)] +- Fixed white spaces in reference genome string [[141](https://github.com/pepkit/geofetch/issues/141)] +- Added version in CLI help [[135](https://github.com/pepkit/geofetch/issues/135)] +- Updated SRA convert to use looper > 2.0.0 and fully automate process + +## [0.12.7] -- 2024-09-11 +- Updated Python support 3.13 +- Cleaned code and Readme + +## [0.12.6] -- 2024-02-05 +- Updated support for Windows in Prefetch (Note: Some functionality may still be unavailable on Windows) + ## [0.12.5] -- 2023-11-29 - Fixed bug, where description was not populated in PEP diff --git a/docs/geofetch/code/howto-sra-to-fastq.md b/docs/geofetch/code/howto-sra-to-fastq.md index a0140420..6e6ba7ac 100644 --- a/docs/geofetch/code/howto-sra-to-fastq.md +++ b/docs/geofetch/code/howto-sra-to-fastq.md @@ -1,79 +1,137 @@ ## How to extract fastq files from SRA +1. 
Install geofetch + + +```bash +pip install geofetch +``` + + Defaulting to user installation because normal site-packages is not writeable + Requirement already satisfied: geofetch in /home/bnt4me/.local/lib/python3.10/site-packages (0.12.7) + Requirement already satisfied: colorama>=0.3.9 in /usr/lib/python3/dist-packages (from geofetch) (0.4.4) + Requirement already satisfied: coloredlogs>=15.0.1 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (15.0.1) + Requirement already satisfied: logmuse>=0.2.6 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (0.2.7) + Requirement already satisfied: pandas>=1.5.3 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (2.2.2) + Requirement already satisfied: peppy>=0.40.6 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (0.40.7) + Requirement already satisfied: piper>=0.14.4 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (0.14.4) + Requirement already satisfied: requests>=2.28.1 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (2.31.0) + Requirement already satisfied: rich>=12.5.1 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (13.7.1) + Requirement already satisfied: ubiquerg>=0.6.2 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (0.8.1) + Requirement already satisfied: xmltodict>=0.13.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (0.13.0) + Requirement already satisfied: humanfriendly>=9.1 in /home/bnt4me/.local/lib/python3.10/site-packages (from coloredlogs>=15.0.1->geofetch) (10.0) + Requirement already satisfied: numpy>=1.22.4 in /home/bnt4me/.local/lib/python3.10/site-packages (from pandas>=1.5.3->geofetch) (1.25.2) + Requirement already satisfied: python-dateutil>=2.8.2 in /home/bnt4me/.local/lib/python3.10/site-packages (from pandas>=1.5.3->geofetch) (2.8.2) + Requirement already satisfied: pytz>=2020.1 in /usr/lib/python3/dist-packages (from pandas>=1.5.3->geofetch) (2022.1) + Requirement already satisfied: tzdata>=2022.7 in /home/bnt4me/.local/lib/python3.10/site-packages (from pandas>=1.5.3->geofetch) (2023.3) + Requirement already satisfied: pyyaml in /usr/lib/python3/dist-packages (from peppy>=0.40.6->geofetch) (5.4.1) + Requirement already satisfied: pephubclient>=0.4.2 in /home/bnt4me/.local/lib/python3.10/site-packages (from peppy>=0.40.6->geofetch) (0.4.2) + Requirement already satisfied: psutil in /home/bnt4me/.local/lib/python3.10/site-packages (from piper>=0.14.4->geofetch) (5.9.4) + Requirement already satisfied: yacman>=0.9.3 in /home/bnt4me/.local/lib/python3.10/site-packages (from piper>=0.14.4->geofetch) (0.9.3) + Requirement already satisfied: pipestat>=0.11.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from piper>=0.14.4->geofetch) (0.12.1) + Requirement already satisfied: charset-normalizer<4,>=2 in /home/bnt4me/.local/lib/python3.10/site-packages (from requests>=2.28.1->geofetch) (3.0.1) + Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests>=2.28.1->geofetch) (3.3) + Requirement already satisfied: urllib3<3,>=1.21.1 in /home/bnt4me/.local/lib/python3.10/site-packages (from requests>=2.28.1->geofetch) (1.26.18) + Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests>=2.28.1->geofetch) (2020.6.20) + Requirement already satisfied: markdown-it-py>=2.2.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from rich>=12.5.1->geofetch) (3.0.0) + Requirement 
already satisfied: pygments<3.0.0,>=2.13.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from rich>=12.5.1->geofetch) (2.17.2) + Requirement already satisfied: mdurl~=0.1 in /home/bnt4me/.local/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich>=12.5.1->geofetch) (0.1.2) + Requirement already satisfied: typer>=0.7.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from pephubclient>=0.4.2->peppy>=0.40.6->geofetch) (0.9.4) + Requirement already satisfied: pydantic>2.5.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from pephubclient>=0.4.2->peppy>=0.40.6->geofetch) (2.7.3) + Requirement already satisfied: jsonschema in /home/bnt4me/.local/lib/python3.10/site-packages (from pipestat>=0.11.0->piper>=0.14.4->geofetch) (4.23.0) + Requirement already satisfied: eido in /home/bnt4me/.local/lib/python3.10/site-packages (from pipestat>=0.11.0->piper>=0.14.4->geofetch) (0.2.4) + Requirement already satisfied: jinja2 in /usr/lib/python3/dist-packages (from pipestat>=0.11.0->piper>=0.14.4->geofetch) (3.0.3) + Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.2->pandas>=1.5.3->geofetch) (1.16.0) + Requirement already satisfied: attmap>=0.13.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from yacman>=0.9.3->piper>=0.14.4->geofetch) (0.13.2) + Requirement already satisfied: oyaml in /home/bnt4me/.local/lib/python3.10/site-packages (from yacman>=0.9.3->piper>=0.14.4->geofetch) (1.0) + Requirement already satisfied: attrs>=22.2.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from jsonschema->pipestat>=0.11.0->piper>=0.14.4->geofetch) (25.3.0) + Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /home/bnt4me/.local/lib/python3.10/site-packages (from jsonschema->pipestat>=0.11.0->piper>=0.14.4->geofetch) (2025.4.1) + Requirement already satisfied: referencing>=0.28.4 in /home/bnt4me/.local/lib/python3.10/site-packages (from jsonschema->pipestat>=0.11.0->piper>=0.14.4->geofetch) (0.36.2) + Requirement already satisfied: rpds-py>=0.7.1 in /home/bnt4me/.local/lib/python3.10/site-packages (from jsonschema->pipestat>=0.11.0->piper>=0.14.4->geofetch) (0.24.0) + Requirement already satisfied: annotated-types>=0.4.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from pydantic>2.5.0->pephubclient>=0.4.2->peppy>=0.40.6->geofetch) (0.6.0) + Requirement already satisfied: pydantic-core==2.18.4 in /home/bnt4me/.local/lib/python3.10/site-packages (from pydantic>2.5.0->pephubclient>=0.4.2->peppy>=0.40.6->geofetch) (2.18.4) + Requirement already satisfied: typing-extensions>=4.6.1 in /home/bnt4me/.local/lib/python3.10/site-packages (from pydantic>2.5.0->pephubclient>=0.4.2->peppy>=0.40.6->geofetch) (4.8.0) + Requirement already satisfied: click<9.0.0,>=7.1.1 in /usr/lib/python3/dist-packages (from typer>=0.7.0->pephubclient>=0.4.2->peppy>=0.40.6->geofetch) (8.0.3) + + [notice] A new release of pip is available: 23.2.1 -> 25.1.1 + [notice] To update, run: python3 -m pip install --upgrade pip + + ```bash geofetch --version ``` - geofetch 0.12.4 + geofetch 0.12.8 1) Download SRA files and PEP using GEOfetch -Add flags: -a) `--add-convert-modifier` (To add looper configurations for conversion) -b) `--discard-soft` (To delete soft files. We don't need them :D) +Add flags: +1) `--add-convert-modifier` (To add looper configurations for conversion) +2) `--discard-soft` (To delete soft files. 
We don't need them :D) ```bash geofetch -i GSE67303 -n red_algae -m `pwd` --add-convert-modifier --discard-soft ``` - Metadata folder: /home/bnt4me/virginia/repos/geofetch/docs_jupyter/red_algae - Trying GSE67303 (not a file) as accession... - Skipped 0 accessions. Starting now. - Processing accession 1 of 1: 'GSE67303' - Processed 4 samples. - Expanding metadata list... - Found SRA Project accession: SRP056574 - Downloading SRP056574 sra metadata - Parsing SRA file to download SRR records - Getting SRR: SRR1930183 in (GSE67303) - - 2023-08-01T17:04:12 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores. - 2023-08-01T17:04:12 prefetch.2.11.3: 1) Downloading 'SRR1930183'... - 2023-08-01T17:04:12 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability. - 2023-08-01T17:04:12 prefetch.2.11.3: Downloading via HTTPS... - 2023-08-01T17:04:14 prefetch.2.11.3: HTTPS download succeed - 2023-08-01T17:04:15 prefetch.2.11.3: 'SRR1930183' is valid - 2023-08-01T17:04:15 prefetch.2.11.3: 1) 'SRR1930183' was downloaded successfully - 2023-08-01T17:04:15 prefetch.2.11.3: 'SRR1930183' has 0 unresolved dependencies - Getting SRR: SRR1930184 in (GSE67303) - - 2023-08-01T17:04:15 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores. - 2023-08-01T17:04:16 prefetch.2.11.3: 1) Downloading 'SRR1930184'... - 2023-08-01T17:04:16 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability. - 2023-08-01T17:04:16 prefetch.2.11.3: Downloading via HTTPS... - 2023-08-01T17:04:17 prefetch.2.11.3: HTTPS download succeed - 2023-08-01T17:04:18 prefetch.2.11.3: 'SRR1930184' is valid - 2023-08-01T17:04:18 prefetch.2.11.3: 1) 'SRR1930184' was downloaded successfully - 2023-08-01T17:04:18 prefetch.2.11.3: 'SRR1930184' has 0 unresolved dependencies - Getting SRR: SRR1930185 in (GSE67303) - - 2023-08-01T17:04:19 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores. - 2023-08-01T17:04:19 prefetch.2.11.3: 1) Downloading 'SRR1930185'... - 2023-08-01T17:04:19 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability. - 2023-08-01T17:04:19 prefetch.2.11.3: Downloading via HTTPS... - 2023-08-01T17:04:22 prefetch.2.11.3: HTTPS download succeed - 2023-08-01T17:04:22 prefetch.2.11.3: 'SRR1930185' is valid - 2023-08-01T17:04:22 prefetch.2.11.3: 1) 'SRR1930185' was downloaded successfully - 2023-08-01T17:04:22 prefetch.2.11.3: 'SRR1930185' has 0 unresolved dependencies - Getting SRR: SRR1930186 in (GSE67303) - - 2023-08-01T17:04:22 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores. - 2023-08-01T17:04:23 prefetch.2.11.3: 1) Downloading 'SRR1930186'... - 2023-08-01T17:04:23 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability. - 2023-08-01T17:04:23 prefetch.2.11.3: Downloading via HTTPS... 
- 2023-08-01T17:04:25 prefetch.2.11.3: HTTPS download succeed - 2023-08-01T17:04:25 prefetch.2.11.3: 'SRR1930186' is valid - 2023-08-01T17:04:25 prefetch.2.11.3: 1) 'SRR1930186' was downloaded successfully - 2023-08-01T17:04:25 prefetch.2.11.3: 'SRR1930186' has 0 unresolved dependencies - Finished processing 1 accession(s) - Cleaning soft files ... - Creating complete project annotation sheets and config file... - Sample annotation sheet: /home/bnt4me/virginia/repos/geofetch/docs_jupyter/red_algae/GSE67303_PEP/GSE67303_PEP_raw.csv . Saved! - File has been saved successfully - Config file: /home/bnt4me/virginia/repos/geofetch/docs_jupyter/red_algae/GSE67303_PEP/GSE67303_PEP.yaml + [INFO] [00:54:23] Metadata folder: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae + [INFO] [00:54:24] Trying GSE67303 (not a file) as accession... + [INFO] [00:54:24] Skipped 0 accessions. Starting now. + [INFO] [00:54:24] Processing accession 1 of 1: 'GSE67303' + [INFO] [00:54:24] Processed 4 samples. + [INFO] [00:54:24] Expanding metadata list... + [INFO] [00:54:24] Found SRA Project accession: SRP056574 + [INFO] [00:54:24] Downloading SRP056574 sra metadata + [INFO] [00:54:25] Parsing SRA file to download SRR records + [INFO] [00:54:25] Getting SRR: SRR1930183 in (GSE67303) + + 2025-07-10T04:54:26 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores. + 2025-07-10T04:54:26 prefetch.2.11.3: 1) Downloading 'SRR1930183'... + 2025-07-10T04:54:26 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability. + 2025-07-10T04:54:26 prefetch.2.11.3: Downloading via HTTPS... + 2025-07-10T04:54:31 prefetch.2.11.3: HTTPS download succeed + 2025-07-10T04:54:31 prefetch.2.11.3: 'SRR1930183' is valid + 2025-07-10T04:54:31 prefetch.2.11.3: 1) 'SRR1930183' was downloaded successfully + 2025-07-10T04:54:31 prefetch.2.11.3: 'SRR1930183' has 0 unresolved dependencies + [INFO] [00:54:31] Getting SRR: SRR1930184 in (GSE67303) + + 2025-07-10T04:54:32 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores. + 2025-07-10T04:54:32 prefetch.2.11.3: 1) Downloading 'SRR1930184'... + 2025-07-10T04:54:32 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability. + 2025-07-10T04:54:32 prefetch.2.11.3: Downloading via HTTPS... + 2025-07-10T04:54:36 prefetch.2.11.3: HTTPS download succeed + 2025-07-10T04:54:36 prefetch.2.11.3: 'SRR1930184' is valid + 2025-07-10T04:54:36 prefetch.2.11.3: 1) 'SRR1930184' was downloaded successfully + 2025-07-10T04:54:36 prefetch.2.11.3: 'SRR1930184' has 0 unresolved dependencies + [INFO] [00:54:36] Getting SRR: SRR1930185 in (GSE67303) + + 2025-07-10T04:54:37 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores. + 2025-07-10T04:54:37 prefetch.2.11.3: 1) Downloading 'SRR1930185'... + 2025-07-10T04:54:37 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability. + 2025-07-10T04:54:37 prefetch.2.11.3: Downloading via HTTPS... 
+ 2025-07-10T04:54:45 prefetch.2.11.3: HTTPS download succeed + 2025-07-10T04:54:45 prefetch.2.11.3: 'SRR1930185' is valid + 2025-07-10T04:54:45 prefetch.2.11.3: 1) 'SRR1930185' was downloaded successfully + 2025-07-10T04:54:45 prefetch.2.11.3: 'SRR1930185' has 0 unresolved dependencies + [INFO] [00:54:45] Getting SRR: SRR1930186 in (GSE67303) + + 2025-07-10T04:54:46 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores. + 2025-07-10T04:54:46 prefetch.2.11.3: 1) Downloading 'SRR1930186'... + 2025-07-10T04:54:46 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability. + 2025-07-10T04:54:46 prefetch.2.11.3: Downloading via HTTPS... + 2025-07-10T04:54:52 prefetch.2.11.3: HTTPS download succeed + 2025-07-10T04:54:52 prefetch.2.11.3: 'SRR1930186' is valid + 2025-07-10T04:54:52 prefetch.2.11.3: 1) 'SRR1930186' was downloaded successfully + 2025-07-10T04:54:52 prefetch.2.11.3: 'SRR1930186' has 0 unresolved dependencies + [INFO] [00:54:52] Finished processing 1 accession(s) + [INFO] [00:54:52] Cleaning soft files ... + [INFO] [00:54:52] Creating complete project annotation sheets and config file... + [INFO] [00:54:52] Sample annotation sheet: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/GSE67303_PEP/GSE67303_PEP_raw.csv . Saved! + [INFO] [00:54:52] File has been saved successfully + [INFO] [00:54:52] Config file: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/GSE67303_PEP/GSE67303_PEP.yaml + [INFO] [00:54:52] Looper config file: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/looper_config.yaml Let's see if files were downloaded: @@ -83,10 +141,10 @@ Let's see if files were downloaded: ls ``` - build python-usage.ipynb SRR1930184 - code raw-data-downloading.ipynb SRR1930185 - how_to_fastq_from_sra.ipynb red_algae SRR1930186 + fq_folder raw-data-downloading.ipynb SRR1930185 + howto-sra-to-fastq.ipynb red_algae SRR1930186 processed-data-downloading.ipynb SRR1930183 + python-usage.ipynb SRR1930184 now let's check how does our config file looks like: @@ -173,7 +231,7 @@ cat ./red_algae/GSE67303_PEP/GSE67303_PEP.yaml derive: attributes: [read1, read2, SRR_files] sources: - SRA: "${SRABAM}/{srr}.bam" + SRA: "${SRARAW}/{srr}/{srr}.sra" FQ: "${SRAFQ}/{srr}.fastq.gz" FQ1: "${SRAFQ}/{srr}_1.fastq.gz" FQ2: "${SRAFQ}/{srr}_2.fastq.gz" @@ -196,23 +254,6 @@ cat ./red_algae/GSE67303_PEP/GSE67303_PEP.yaml then: read1: FQ1 - project_modifiers: - amend: - sra_convert: - looper: - results_subdir: sra_convert_results - sample_modifiers: - append: - SRR_files: SRA - pipeline_interfaces: ${CODE}/geofetch/pipeline_interface_convert.yaml - derive: - attributes: [read1, read2, SRR_files] - sources: - SRA: "${SRARAW}/{srr}/{srr}.sra" - FQ: "${SRAFQ}/{srr}.fastq.gz" - FQ1: "${SRAFQ}/{srr}_1.fastq.gz" - FQ2: "${SRAFQ}/{srr}_2.fastq.gz" - @@ -241,300 +282,385 @@ mkdir fq_folder export SRAFQ=`pwd`/fq_folder ``` +### Now install looper if you don't have it + ```bash -# Unfortunately you have to pull gefetch folder from github, and set CODE variable: -mkdir code && cd code && git clone https://github.com/pepkit/geofetch.git && export CODE=`pwd` && cd .. 
+# pip install looper ``` ```bash -ls +looper --version ``` - build processed-data-downloading.ipynb SRR1930183 - code python-usage.ipynb SRR1930184 - fq_folder raw-data-downloading.ipynb SRR1930185 - how_to_fastq_from_sra.ipynb red_algae SRR1930186 + 2.0.1 +  -### Now install looper if you don't have it +Let's check where is looper config file and whats inside: ```bash -looper --version +ls red_algae ``` - looper 1.4.3 -  + GSE67303_PEP looper_config.yaml output_dir ```bash -ls red_algae +cat red_algae/looper_config.yaml ``` - GSE67303_PEP + pep_config: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/GSE67303_PEP/GSE67303_PEP.yaml + output_dir: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir + pipeline_interfaces: + - /home/bnt4me/.local/lib/python3.10/site-packages/geofetch/templates/pipeline_interface_convert.yaml + +Geofetch automatically generated paths to pep_config and pipeline interfaces that are embedded into geofetch ```bash -looper run red_algae/GSE67303_PEP/GSE67303_PEP.yaml -a sra_convert -p local --output-dir . +looper run --config ./red_algae/looper_config.yaml -p local --output-dir . ``` - Looper version: 1.4.3 + Looper version: 2.0.1 Command: run - Using default config. No config found in env var: ['DIVCFG'] - Using amendments: sra_convert + Using default divvy config. You may specify in env var: ['DIVCFG'] Activating compute package 'local' - Pipestat compatible: False ## [1 of 4] sample: cm_bluelight_rep1; pipeline: sra_convert - Writing script to /home/bnt4me/virginia/repos/geofetch/docs_jupyter/submission/sra_convert_cm_bluelight_rep1.sub - Job script (n=1; 0.06Gb): ./submission/sra_convert_cm_bluelight_rep1.sub - Compute node: bnt4me-Precision-5560 - Start time: 2023-08-01 13:06:42 - Using outfolder: ./sra_convert_results/SRR1930183 + Writing script to /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/submission/sra_convert_cm_bluelight_rep1.sub + Job script (n=1; 0.00Gb): /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/submission/sra_convert_cm_bluelight_rep1.sub + Compute node: alex-laptop + Start time: 2025-07-10 00:59:02 + Using outfolder: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930183 + No pipestat output schema was supplied to PipestatManager. 
+ Initializing results file '/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930183/stats.yaml' ### Pipeline run code and environment: - * Command: `/home/bnt4me/virginia/venv/jupyter/bin/sraconvert --srr /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930183/SRR1930183.sra -O ./sra_convert_results` - * Compute host: bnt4me-Precision-5560 - * Working dir: /home/bnt4me/virginia/repos/geofetch/docs_jupyter - * Outfolder: ./sra_convert_results/SRR1930183/ - * Pipeline started at: (08-01 13:06:42) elapsed: 0.0 _TIME_ + * Command: `/home/bnt4me/.local/bin/sraconvert --srr /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930183/SRR1930183.sra -O /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline` + * Compute host: `alex-laptop` + * Working dir: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks` + * Outfolder: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930183/` + * Log file: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930183/sra_convert_log.md` + * Start time: (07-10 00:59:03) elapsed: 0.0 _TIME_ ### Version log: - * Python version: 3.10.6 - * Pypiper dir: `/home/bnt4me/virginia/venv/jupyter/lib/python3.10/site-packages/pypiper` - * Pypiper version: 0.12.3 - * Pipeline dir: `/home/bnt4me/virginia/venv/jupyter/bin` - * Pipeline version: None + * Python version: `3.10.12` + * Pypiper dir: `/home/bnt4me/.local/lib/python3.10/site-packages/pypiper` + * Pypiper version: `0.14.4` + * Pipeline dir: `/home/bnt4me/.local/bin` + * Pipeline version: ### Arguments passed to pipeline: * `bamfolder`: `` * `config_file`: `sraconvert.yaml` * `format`: `fastq` - * `fqfolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder` + * `fqfolder`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder` * `keep_sra`: `False` * `logdev`: `False` * `mode`: `convert` - * `output_parent`: `./sra_convert_results` + * `output_parent`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline` * `recover`: `False` * `sample_name`: `None` * `silent`: `False` - * `srafolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter` - * `srr`: `['/home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930183/SRR1930183.sra']` + * `srafolder`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks` + * `srr`: `['/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930183/SRR1930183.sra']` * `verbosity`: `None` + ### Initialized Pipestat Object: + + * PipestatManager (sra_convert) + * Backend: File + * - results: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930183/stats.yaml + * - status: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930183 + * Multiple Pipelines Allowed: False + * Pipeline name: sra_convert + * Pipeline type: sample + * Status Schema key: None + * Results formatter: default_formatter + * Results schema source: None + * Status schema source: None + * Records count: 2 + * Sample name: DEFAULT_SAMPLE_NAME + + ---------------------------------------- Processing 1 of 1 files: SRR1930183 - Target to produce: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder/SRR1930183_1.fastq.gz` + Target to produce: 
`/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder/SRR1930183_1.fastq.gz` - > `fasterq-dump /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930183/SRR1930183.sra -O /home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder` (744928) + > `fasterq-dump /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930183/SRR1930183.sra -O /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder` (871210)
     spots read      : 1,068,319
     reads read      : 2,136,638
     reads written   : 2,136,638
     
- Command completed. Elapsed time: 0:00:02. Running peak memory: 0.08GB. - PID: 744928; Command: fasterq-dump; Return code: 0; Memory used: 0.08GB + Command completed. Elapsed time: 0:00:02. Running peak memory: 0.069GB. + PID: 871210; Command: fasterq-dump; Return code: 0; Memory used: 0.069GB Already completed files: [] ### Pipeline completed. Epilogue * Elapsed time (this run): 0:00:02 * Total elapsed time (all runs): 0:00:02 - * Peak memory (this run): 0.0803 GB - * Pipeline completed time: 2023-08-01 13:06:44 + * Peak memory (this run): 0.0685 GB + * Pipeline completed time: 2025-07-10 00:59:05 + Using default schema: /home/bnt4me/.local/bin/pipestat_output_schema.yaml ## [2 of 4] sample: cm_bluelight_rep2; pipeline: sra_convert - Writing script to /home/bnt4me/virginia/repos/geofetch/docs_jupyter/submission/sra_convert_cm_bluelight_rep2.sub - Job script (n=1; 0.04Gb): ./submission/sra_convert_cm_bluelight_rep2.sub - Compute node: bnt4me-Precision-5560 - Start time: 2023-08-01 13:06:44 - Using outfolder: ./sra_convert_results/SRR1930184 + Writing script to /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/submission/sra_convert_cm_bluelight_rep2.sub + Job script (n=1; 0.00Gb): /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/submission/sra_convert_cm_bluelight_rep2.sub + Compute node: alex-laptop + Start time: 2025-07-10 00:59:06 + Using outfolder: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930184 + No pipestat output schema was supplied to PipestatManager. + Initializing results file '/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930184/stats.yaml' ### Pipeline run code and environment: - * Command: `/home/bnt4me/virginia/venv/jupyter/bin/sraconvert --srr /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930184/SRR1930184.sra -O ./sra_convert_results` - * Compute host: bnt4me-Precision-5560 - * Working dir: /home/bnt4me/virginia/repos/geofetch/docs_jupyter - * Outfolder: ./sra_convert_results/SRR1930184/ - * Pipeline started at: (08-01 13:06:45) elapsed: 0.0 _TIME_ + * Command: `/home/bnt4me/.local/bin/sraconvert --srr /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930184/SRR1930184.sra -O /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline` + * Compute host: `alex-laptop` + * Working dir: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks` + * Outfolder: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930184/` + * Log file: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930184/sra_convert_log.md` + * Start time: (07-10 00:59:06) elapsed: 0.0 _TIME_ ### Version log: - * Python version: 3.10.6 - * Pypiper dir: `/home/bnt4me/virginia/venv/jupyter/lib/python3.10/site-packages/pypiper` - * Pypiper version: 0.12.3 - * Pipeline dir: `/home/bnt4me/virginia/venv/jupyter/bin` - * Pipeline version: None + * Python version: `3.10.12` + * Pypiper dir: `/home/bnt4me/.local/lib/python3.10/site-packages/pypiper` + * Pypiper version: `0.14.4` + * Pipeline dir: `/home/bnt4me/.local/bin` + * Pipeline version: ### Arguments passed to pipeline: * `bamfolder`: `` * `config_file`: `sraconvert.yaml` * `format`: `fastq` - * `fqfolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder` + * `fqfolder`: 
`/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder` * `keep_sra`: `False` * `logdev`: `False` * `mode`: `convert` - * `output_parent`: `./sra_convert_results` + * `output_parent`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline` * `recover`: `False` * `sample_name`: `None` * `silent`: `False` - * `srafolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter` - * `srr`: `['/home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930184/SRR1930184.sra']` + * `srafolder`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks` + * `srr`: `['/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930184/SRR1930184.sra']` * `verbosity`: `None` + ### Initialized Pipestat Object: + + * PipestatManager (sra_convert) + * Backend: File + * - results: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930184/stats.yaml + * - status: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930184 + * Multiple Pipelines Allowed: False + * Pipeline name: sra_convert + * Pipeline type: sample + * Status Schema key: None + * Results formatter: default_formatter + * Results schema source: None + * Status schema source: None + * Records count: 2 + * Sample name: DEFAULT_SAMPLE_NAME + + ---------------------------------------- Processing 1 of 1 files: SRR1930184 - Target to produce: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder/SRR1930184_1.fastq.gz` + Target to produce: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder/SRR1930184_1.fastq.gz` - > `fasterq-dump /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930184/SRR1930184.sra -O /home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder` (744973) + > `fasterq-dump /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930184/SRR1930184.sra -O /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder` (871261)
     spots read      : 762,229
     reads read      : 1,524,458
     reads written   : 1,524,458
     
- Command completed. Elapsed time: 0:00:02. Running peak memory: 0.012GB. - PID: 744973; Command: fasterq-dump; Return code: 0; Memory used: 0.012GB + Command completed. Elapsed time: 0:00:02. Running peak memory: 0.083GB. + PID: 871261; Command: fasterq-dump; Return code: 0; Memory used: 0.083GB Already completed files: [] ### Pipeline completed. Epilogue * Elapsed time (this run): 0:00:02 * Total elapsed time (all runs): 0:00:02 - * Peak memory (this run): 0.0118 GB - * Pipeline completed time: 2023-08-01 13:06:47 + * Peak memory (this run): 0.0832 GB + * Pipeline completed time: 2025-07-10 00:59:08 + + + Using default schema: /home/bnt4me/.local/bin/pipestat_output_schema.yaml ## [3 of 4] sample: cm_darkness_rep1; pipeline: sra_convert - Writing script to /home/bnt4me/virginia/repos/geofetch/docs_jupyter/submission/sra_convert_cm_darkness_rep1.sub - Job script (n=1; 0.09Gb): ./submission/sra_convert_cm_darkness_rep1.sub - Compute node: bnt4me-Precision-5560 - Start time: 2023-08-01 13:06:47 - Using outfolder: ./sra_convert_results/SRR1930185 + Writing script to /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/submission/sra_convert_cm_darkness_rep1.sub + Job script (n=1; 0.00Gb): /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/submission/sra_convert_cm_darkness_rep1.sub + Compute node: alex-laptop + Start time: 2025-07-10 00:59:08 + Using outfolder: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930185 + No pipestat output schema was supplied to PipestatManager. + Initializing results file '/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930185/stats.yaml' ### Pipeline run code and environment: - * Command: `/home/bnt4me/virginia/venv/jupyter/bin/sraconvert --srr /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930185/SRR1930185.sra -O ./sra_convert_results` - * Compute host: bnt4me-Precision-5560 - * Working dir: /home/bnt4me/virginia/repos/geofetch/docs_jupyter - * Outfolder: ./sra_convert_results/SRR1930185/ - * Pipeline started at: (08-01 13:06:47) elapsed: 0.0 _TIME_ + * Command: `/home/bnt4me/.local/bin/sraconvert --srr /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930185/SRR1930185.sra -O /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline` + * Compute host: `alex-laptop` + * Working dir: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks` + * Outfolder: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930185/` + * Log file: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930185/sra_convert_log.md` + * Start time: (07-10 00:59:09) elapsed: 0.0 _TIME_ ### Version log: - * Python version: 3.10.6 - * Pypiper dir: `/home/bnt4me/virginia/venv/jupyter/lib/python3.10/site-packages/pypiper` - * Pypiper version: 0.12.3 - * Pipeline dir: `/home/bnt4me/virginia/venv/jupyter/bin` - * Pipeline version: None + * Python version: `3.10.12` + * Pypiper dir: `/home/bnt4me/.local/lib/python3.10/site-packages/pypiper` + * Pypiper version: `0.14.4` + * Pipeline dir: `/home/bnt4me/.local/bin` + * Pipeline version: ### Arguments passed to pipeline: * `bamfolder`: `` * `config_file`: `sraconvert.yaml` * `format`: `fastq` - * `fqfolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder` + * `fqfolder`: 
`/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder` * `keep_sra`: `False` * `logdev`: `False` * `mode`: `convert` - * `output_parent`: `./sra_convert_results` + * `output_parent`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline` * `recover`: `False` * `sample_name`: `None` * `silent`: `False` - * `srafolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter` - * `srr`: `['/home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930185/SRR1930185.sra']` + * `srafolder`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks` + * `srr`: `['/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930185/SRR1930185.sra']` * `verbosity`: `None` + ### Initialized Pipestat Object: + + * PipestatManager (sra_convert) + * Backend: File + * - results: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930185/stats.yaml + * - status: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930185 + * Multiple Pipelines Allowed: False + * Pipeline name: sra_convert + * Pipeline type: sample + * Status Schema key: None + * Results formatter: default_formatter + * Results schema source: None + * Status schema source: None + * Records count: 2 + * Sample name: DEFAULT_SAMPLE_NAME + + ---------------------------------------- Processing 1 of 1 files: SRR1930185 - Target to produce: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder/SRR1930185_1.fastq.gz` + Target to produce: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder/SRR1930185_1.fastq.gz` - > `fasterq-dump /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930185/SRR1930185.sra -O /home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder` (745021) + > `fasterq-dump /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930185/SRR1930185.sra -O /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder` (871306)
     spots read      : 1,707,508
     reads read      : 3,415,016
     reads written   : 3,415,016
     
- Command completed. Elapsed time: 0:00:03. Running peak memory: 0.079GB. - PID: 745021; Command: fasterq-dump; Return code: 0; Memory used: 0.079GB + Command completed. Elapsed time: 0:00:04. Running peak memory: 0.07GB. + PID: 871306; Command: fasterq-dump; Return code: 0; Memory used: 0.07GB Already completed files: [] ### Pipeline completed. Epilogue - * Elapsed time (this run): 0:00:03 - * Total elapsed time (all runs): 0:00:03 - * Peak memory (this run): 0.0793 GB - * Pipeline completed time: 2023-08-01 13:06:50 + * Elapsed time (this run): 0:00:04 + * Total elapsed time (all runs): 0:00:04 + * Peak memory (this run): 0.0701 GB + * Pipeline completed time: 2025-07-10 00:59:13 + Using default schema: /home/bnt4me/.local/bin/pipestat_output_schema.yaml ## [4 of 4] sample: cm_darkness_rep2; pipeline: sra_convert - - - Writing script to /home/bnt4me/virginia/repos/geofetch/docs_jupyter/submission/sra_convert_cm_darkness_rep2.sub - Job script (n=1; 0.07Gb): ./submission/sra_convert_cm_darkness_rep2.sub - Compute node: bnt4me-Precision-5560 - Start time: 2023-08-01 13:06:50 - Using outfolder: ./sra_convert_results/SRR1930186 + Writing script to /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/submission/sra_convert_cm_darkness_rep2.sub + Job script (n=1; 0.00Gb): /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/submission/sra_convert_cm_darkness_rep2.sub + Compute node: alex-laptop + Start time: 2025-07-10 00:59:13 + Using outfolder: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930186 + No pipestat output schema was supplied to PipestatManager. + Initializing results file '/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930186/stats.yaml' ### Pipeline run code and environment: - * Command: `/home/bnt4me/virginia/venv/jupyter/bin/sraconvert --srr /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930186/SRR1930186.sra -O ./sra_convert_results` - * Compute host: bnt4me-Precision-5560 - * Working dir: /home/bnt4me/virginia/repos/geofetch/docs_jupyter - * Outfolder: ./sra_convert_results/SRR1930186/ - * Pipeline started at: (08-01 13:06:51) elapsed: 0.0 _TIME_ + * Command: `/home/bnt4me/.local/bin/sraconvert --srr /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930186/SRR1930186.sra -O /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline` + * Compute host: `alex-laptop` + * Working dir: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks` + * Outfolder: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930186/` + * Log file: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930186/sra_convert_log.md` + * Start time: (07-10 00:59:14) elapsed: 0.0 _TIME_ ### Version log: - * Python version: 3.10.6 - * Pypiper dir: `/home/bnt4me/virginia/venv/jupyter/lib/python3.10/site-packages/pypiper` - * Pypiper version: 0.12.3 - * Pipeline dir: `/home/bnt4me/virginia/venv/jupyter/bin` - * Pipeline version: None + * Python version: `3.10.12` + * Pypiper dir: `/home/bnt4me/.local/lib/python3.10/site-packages/pypiper` + * Pypiper version: `0.14.4` + * Pipeline dir: `/home/bnt4me/.local/bin` + * Pipeline version: ### Arguments passed to pipeline: * `bamfolder`: `` * `config_file`: `sraconvert.yaml` * `format`: `fastq` - * `fqfolder`: 
`/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder` + * `fqfolder`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder` * `keep_sra`: `False` * `logdev`: `False` * `mode`: `convert` - * `output_parent`: `./sra_convert_results` + * `output_parent`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline` * `recover`: `False` * `sample_name`: `None` * `silent`: `False` - * `srafolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter` - * `srr`: `['/home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930186/SRR1930186.sra']` + * `srafolder`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks` + * `srr`: `['/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930186/SRR1930186.sra']` * `verbosity`: `None` + ### Initialized Pipestat Object: + + * PipestatManager (sra_convert) + * Backend: File + * - results: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930186/stats.yaml + * - status: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930186 + * Multiple Pipelines Allowed: False + * Pipeline name: sra_convert + * Pipeline type: sample + * Status Schema key: None + * Results formatter: default_formatter + * Results schema source: None + * Status schema source: None + * Records count: 2 + * Sample name: DEFAULT_SAMPLE_NAME + + ---------------------------------------- Processing 1 of 1 files: SRR1930186 - Target to produce: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder/SRR1930186_1.fastq.gz` + Target to produce: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder/SRR1930186_1.fastq.gz` - > `fasterq-dump /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930186/SRR1930186.sra -O /home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder` (745069) + > `fasterq-dump /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930186/SRR1930186.sra -O /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder` (871369)
     spots read      : 1,224,029
     reads read      : 2,448,058
     reads written   : 2,448,058
     
- Command completed. Elapsed time: 0:00:02. Running peak memory: 0.081GB. - PID: 745069; Command: fasterq-dump; Return code: 0; Memory used: 0.081GB + Command completed. Elapsed time: 0:00:02. Running peak memory: 0.083GB. + PID: 871369; Command: fasterq-dump; Return code: 0; Memory used: 0.083GB Already completed files: [] ### Pipeline completed. Epilogue * Elapsed time (this run): 0:00:02 * Total elapsed time (all runs): 0:00:02 - * Peak memory (this run): 0.0813 GB - * Pipeline completed time: 2023-08-01 13:06:53 + * Peak memory (this run): 0.0832 GB + * Pipeline completed time: 2025-07-10 00:59:16 + Using default schema: /home/bnt4me/.local/bin/pipestat_output_schema.yaml + + Looper finished Samples valid for job generation: 4 of 4 - Commands submitted: 4 of 4 - Jobs submitted: 4  @@ -553,3 +679,5 @@ ls SRR1930183_1.fastq SRR1930184_1.fastq SRR1930185_1.fastq SRR1930186_1.fastq SRR1930183_2.fastq SRR1930184_2.fastq SRR1930185_2.fastq SRR1930186_2.fastq + +Everything was executed successfully and SRA files were converted into fastq files diff --git a/docs/geofetch/code/python-api.md b/docs/geofetch/code/python-api.md index 76d1a3ba..f2007487 100644 --- a/docs/geofetch/code/python-api.md +++ b/docs/geofetch/code/python-api.md @@ -1,278 +1,55 @@ - - - - - # Package `geofetch` Documentation - Package-level data -## Class `Finder` -Class for finding GSE accessions in special period of time. Additionally, user can add specific filters for the search, while initialization of the class - - -```python -def __init__(self, filters: str=None, retmax: int=10000000) -``` - - -#### Parameters: - -- `filters` (``): filters that have to be added to the query.Filter Patterns can be found here: https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Using_the_Advanced_Search_Pag -- `retmax` (``): maximum number of retrieved accessions. - - - - -```python -def find_differences(old_list: list, new_list: list) -> list -``` - -Compare 2 lists and search for elements that are not in old list -#### Parameters: - -- `old_list` (``): old list of elements -- `new_list` (``): new list of elements - - -#### Returns: - -- ``: list of elements that are not in old list but are in new_list - - - - -```python -def generate_file(self, file_path: str, gse_list: list=None) -``` - -Save the list of GSE accessions stored in this Finder object to a given file -#### Parameters: - -- `file_path` (``): root to the file where gse accessions have to be saved -- `gse_list` (``): list of gse accessions - - -#### Returns: - -- ``: NoReturn - - - - -```python -def get_gse_all(self) -> list -``` - -Get list of all gse accession available in GEO -#### Returns: - -- ``: list of gse accession - - - - -```python -def get_gse_by_date(self, start_date: str, end_date: str=None) -> list -``` - -Search gse accessions by providing start date and end date. By default, the last date is today. -#### Parameters: - -- `start_date` (``): 'YYYY/MM/DD'] -- `end_date` (``): 'YYYY/MM/DD'] - - -#### Returns: - -- ``: list of gse accessions - - - - -```python -def get_gse_by_day_count(self, n_days: int=1) -> list -``` - -Get list of gse accessions that were uploaded or updated in last X days -#### Parameters: - -- `n_days` (``): number of days from now [e.g. 
5] - - -#### Returns: - -- ``: list of gse accession - - - +## Package Overview -```python -def get_gse_id_by_query(self, url: str) -> list -``` - -Run esearch (ncbi search tool) by specifying URL and retrieve gse list result -#### Parameters: - -- `url` (``): url of the query - - -#### Returns: - -- ``: list of gse ids - - - - -```python -def get_gse_last_3_month(self) -> list -``` - -Get list of gse accession that were uploaded or updated in last 3 month -#### Returns: - -- ``: list of gse accession - - - - -```python -def get_gse_last_week(self) -> list -``` - -Get list of gse accession that were uploaded or updated in last week -#### Returns: - -- ``: list of gse accession - - - - -```python -def uid_to_gse(uid: str) -> str -``` - -UID to GES accession converter -#### Parameters: - -- `uid` (``): uid string (Unique Identifier Number in GEO) - - -#### Returns: +The `geofetch` package provides tools for downloading metadata and data from Gene Expression Omnibus (GEO) and Sequence Read Archive (SRA). It can convert GEO/SRA metadata into PEP format for easy integration with other PEPkit tools. -- ``: GSE id string +### Key Features +- **GEO/SRA Download**: Fetch metadata and raw data from NCBI repositories +- **PEP Generation**: Automatically create PEP-formatted project configs +- **Flexible Filtering**: Search and filter GEO datasets by date and criteria +- **SRA Integration**: Download and convert SRA data to FASTQ format +- **Processed Data**: Download processed data matrices from GEO +### Installation - -## Class `Geofetcher` -Class to download or get projects, metadata, data from GEO and SRA - - -```python -def __init__(self, name: str='', metadata_root: str='', metadata_folder: str='', just_metadata: bool=False, refresh_metadata: bool=False, config_template: str=None, pipeline_samples: str=None, pipeline_project: str=None, skip: int=0, acc_anno: bool=False, use_key_subset: bool=False, processed: bool=False, data_source: str='samples', filter: str=None, filter_size: str=None, geo_folder: str='.', split_experiments: bool=False, bam_folder: str='', fq_folder: str='', sra_folder: str='', bam_conversion: bool=False, picard_path: str='', input: str=None, const_limit_project: int=50, const_limit_discard: int=1000, attr_limit_truncate: int=500, max_soft_size: str='1GB', discard_soft: bool=False, add_dotfile: bool=False, disable_progressbar: bool=False, add_convert_modifier: bool=False, opts=None, max_prefetch_size=None, **kwargs) +```bash +pip install geofetch ``` -Constructor -#### Parameters: - -- `input` (``): GSEnumber or path to the input file -- `name` (``): Specify a project name. Defaults to GSE number or name of accessions file name -- `metadata_root` (``): Specify a parent folder location to store metadata.The project name will be added as a subfolder [Default: $SRAMETA:] -- `metadata_folder` (``): Specify an absolute folder location to store metadata. No subfolder will be added.Overrides value of --metadata-root [Default: Not used (--metadata-root is used by default)] -- `just_metadata` (``): If set, don't actually run downloads, just create metadata -- `refresh_metadata` (``): If set, re-download metadata even if it exists. -- `config_template` (``): Project config yaml file template. -- `pipeline_samples` (``): Specify one or more filepaths to SAMPLES pipeline interface yaml files.These will be added to the project config file to make it immediately compatible with looper. 
[Default: null] -- `pipeline_project` (``): Specify one or more filepaths to PROJECT pipeline interface yaml files.These will be added to the project config file to make it immediately compatible with looper. [Default: null] -- `acc_anno` (``): Produce annotation sheets for each accession.Project combined PEP for the whole project won't be produced. -- `discard_soft` (``): Create project without downloading soft files on the disc -- `add_dotfile` (``): Add .pep.yaml file that points .yaml PEP file -- `disable_progressbar` (``): Set true to disable progressbar - - - +### Quick Example ```python -def fetch_all(self, input: str, name: str=None) -> Union[NoReturn, peppy.project.Project] -``` +from geofetch import Geofetcher -Main function driver/workflow Function that search, filters, downloads and save data and metadata from GEO and SRA -#### Parameters: +# Initialize geofetcher +gf = Geofetcher() -- `input` (``): GSE or input file with gse's -- `name` (``): Name of the project - - -#### Returns: - -- ``: NoReturn or peppy Project - - - - -```python -def fetch_processed_one(self, gse_file_content: list, gsm_file_content: list, gsm_filter_list: dict) -> Tuple +# Fetch a GEO series +gf.fetch_all(input="GSE####", name="my_project") ``` -Fetche one processed GSE project and return its metadata -#### Parameters: - -- `gsm_file_content` (``): gse soft file content -- `gse_file_content` (``): gsm soft file content -- `gsm_filter_list` (``): list of gsm that have to be downloaded - - -#### Returns: - -- ``: Tuple of project list of gsm samples and gse samples - - - - -```python -def get_projects(self, input: str, just_metadata: bool=True, discard_soft: bool=True) -> dict -``` - -Function for fetching projects from GEO|SRA and receiving peppy project -#### Parameters: - -- `input` (``): GSE number, or path to file of GSE numbers -- `just_metadata` (``): process only metadata -- `discard_soft` (``): clean run, without downloading soft files - - -#### Returns: - -- ``: peppy project or list of project, if acc_anno is set. - +## API Reference +### Geofetcher Class +The main class for fetching data from GEO/SRA: +::: geofetch.Geofetcher + options: + docstring_style: google + show_source: true + show_signature: true + merge_init_into_class: true +### Finder Class +Class for searching and finding GSE accessions: -*Version Information: `geofetch` v0.12.6, generated by `lucidoc` v0.4.4* \ No newline at end of file +::: geofetch.Finder + options: + docstring_style: google + show_source: true + show_signature: true + merge_init_into_class: true diff --git a/docs/geofetch/gse-finder.md b/docs/geofetch/gse-finder.md index b6d6f436..352f0d56 100644 --- a/docs/geofetch/gse-finder.md +++ b/docs/geofetch/gse-finder.md @@ -11,7 +11,7 @@ is a geofetch class that provides functions to find and retrieve a list of GSE ( ___ ## Tutorial -0) Initiale Finder object. +0) Initial Finder object. ```python from geofetch import Finder gse_obj = Finder() diff --git a/docs/geofetch/howto-prefetch.md b/docs/geofetch/howto-prefetch.md new file mode 100644 index 00000000..2ccc2128 --- /dev/null +++ b/docs/geofetch/howto-prefetch.md @@ -0,0 +1,11 @@ +# How to install `prefetch` + +To install the prefetch tool, you need to install the NCBI SRA +Toolkit. Prefetch is a part of this toolkit and is used to download +data from the Sequence Read Archive (SRA). The installation process +varies depending on your operating system. 
+ +### Documentation: +The best way to install the NCBI SRA Toolkit is to follow the official instructions: + +[Official NCBI SRA Toolkit Installation Guide](https://github.com/ncbi/sra-tools/wiki/01.-Downloading-SRA-Toolkit) \ No newline at end of file diff --git a/docs/geofetch/notebooks/howto-sra-to-fastq.ipynb b/docs/geofetch/notebooks/howto-sra-to-fastq.ipynb index 86758e77..879a39a9 100644 --- a/docs/geofetch/notebooks/howto-sra-to-fastq.ipynb +++ b/docs/geofetch/notebooks/howto-sra-to-fastq.ipynb @@ -8,17 +8,100 @@ "## How to extract fastq files from SRA" ] }, + { + "cell_type": "markdown", + "id": "e9f15e8e35494e0b", + "metadata": {}, + "source": [ + "1. Install geofetch" + ] + }, { "cell_type": "code", "execution_count": 1, + "id": "4baa57aa54695983", + "metadata": { + "ExecuteTime": { + "end_time": "2025-07-10T04:29:46.304220Z", + "start_time": "2025-07-10T04:29:46.117835Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "Requirement already satisfied: geofetch in /home/bnt4me/.local/lib/python3.10/site-packages (0.12.7)\n", + "Requirement already satisfied: colorama>=0.3.9 in /usr/lib/python3/dist-packages (from geofetch) (0.4.4)\n", + "Requirement already satisfied: coloredlogs>=15.0.1 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (15.0.1)\n", + "Requirement already satisfied: logmuse>=0.2.6 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (0.2.7)\n", + "Requirement already satisfied: pandas>=1.5.3 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (2.2.2)\n", + "Requirement already satisfied: peppy>=0.40.6 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (0.40.7)\n", + "Requirement already satisfied: piper>=0.14.4 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (0.14.4)\n", + "Requirement already satisfied: requests>=2.28.1 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (2.31.0)\n", + "Requirement already satisfied: rich>=12.5.1 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (13.7.1)\n", + "Requirement already satisfied: ubiquerg>=0.6.2 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (0.8.1)\n", + "Requirement already satisfied: xmltodict>=0.13.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from geofetch) (0.13.0)\n", + "Requirement already satisfied: humanfriendly>=9.1 in /home/bnt4me/.local/lib/python3.10/site-packages (from coloredlogs>=15.0.1->geofetch) (10.0)\n", + "Requirement already satisfied: numpy>=1.22.4 in /home/bnt4me/.local/lib/python3.10/site-packages (from pandas>=1.5.3->geofetch) (1.25.2)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /home/bnt4me/.local/lib/python3.10/site-packages (from pandas>=1.5.3->geofetch) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/lib/python3/dist-packages (from pandas>=1.5.3->geofetch) (2022.1)\n", + "Requirement already satisfied: tzdata>=2022.7 in /home/bnt4me/.local/lib/python3.10/site-packages (from pandas>=1.5.3->geofetch) (2023.3)\n", + "Requirement already satisfied: pyyaml in /usr/lib/python3/dist-packages (from peppy>=0.40.6->geofetch) (5.4.1)\n", + "Requirement already satisfied: pephubclient>=0.4.2 in /home/bnt4me/.local/lib/python3.10/site-packages (from peppy>=0.40.6->geofetch) (0.4.2)\n", + "Requirement already satisfied: psutil in /home/bnt4me/.local/lib/python3.10/site-packages (from 
piper>=0.14.4->geofetch) (5.9.4)\n", + "Requirement already satisfied: yacman>=0.9.3 in /home/bnt4me/.local/lib/python3.10/site-packages (from piper>=0.14.4->geofetch) (0.9.3)\n", + "Requirement already satisfied: pipestat>=0.11.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from piper>=0.14.4->geofetch) (0.12.1)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /home/bnt4me/.local/lib/python3.10/site-packages (from requests>=2.28.1->geofetch) (3.0.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests>=2.28.1->geofetch) (3.3)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/bnt4me/.local/lib/python3.10/site-packages (from requests>=2.28.1->geofetch) (1.26.18)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests>=2.28.1->geofetch) (2020.6.20)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from rich>=12.5.1->geofetch) (3.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from rich>=12.5.1->geofetch) (2.17.2)\n", + "Requirement already satisfied: mdurl~=0.1 in /home/bnt4me/.local/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich>=12.5.1->geofetch) (0.1.2)\n", + "Requirement already satisfied: typer>=0.7.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from pephubclient>=0.4.2->peppy>=0.40.6->geofetch) (0.9.4)\n", + "Requirement already satisfied: pydantic>2.5.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from pephubclient>=0.4.2->peppy>=0.40.6->geofetch) (2.7.3)\n", + "Requirement already satisfied: jsonschema in /home/bnt4me/.local/lib/python3.10/site-packages (from pipestat>=0.11.0->piper>=0.14.4->geofetch) (4.23.0)\n", + "Requirement already satisfied: eido in /home/bnt4me/.local/lib/python3.10/site-packages (from pipestat>=0.11.0->piper>=0.14.4->geofetch) (0.2.4)\n", + "Requirement already satisfied: jinja2 in /usr/lib/python3/dist-packages (from pipestat>=0.11.0->piper>=0.14.4->geofetch) (3.0.3)\n", + "Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.2->pandas>=1.5.3->geofetch) (1.16.0)\n", + "Requirement already satisfied: attmap>=0.13.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from yacman>=0.9.3->piper>=0.14.4->geofetch) (0.13.2)\n", + "Requirement already satisfied: oyaml in /home/bnt4me/.local/lib/python3.10/site-packages (from yacman>=0.9.3->piper>=0.14.4->geofetch) (1.0)\n", + "Requirement already satisfied: attrs>=22.2.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from jsonschema->pipestat>=0.11.0->piper>=0.14.4->geofetch) (25.3.0)\n", + "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /home/bnt4me/.local/lib/python3.10/site-packages (from jsonschema->pipestat>=0.11.0->piper>=0.14.4->geofetch) (2025.4.1)\n", + "Requirement already satisfied: referencing>=0.28.4 in /home/bnt4me/.local/lib/python3.10/site-packages (from jsonschema->pipestat>=0.11.0->piper>=0.14.4->geofetch) (0.36.2)\n", + "Requirement already satisfied: rpds-py>=0.7.1 in /home/bnt4me/.local/lib/python3.10/site-packages (from jsonschema->pipestat>=0.11.0->piper>=0.14.4->geofetch) (0.24.0)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /home/bnt4me/.local/lib/python3.10/site-packages (from pydantic>2.5.0->pephubclient>=0.4.2->peppy>=0.40.6->geofetch) (0.6.0)\n", + "Requirement already satisfied: 
pydantic-core==2.18.4 in /home/bnt4me/.local/lib/python3.10/site-packages (from pydantic>2.5.0->pephubclient>=0.4.2->peppy>=0.40.6->geofetch) (2.18.4)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in /home/bnt4me/.local/lib/python3.10/site-packages (from pydantic>2.5.0->pephubclient>=0.4.2->peppy>=0.40.6->geofetch) (4.8.0)\n", + "Requirement already satisfied: click<9.0.0,>=7.1.1 in /usr/lib/python3/dist-packages (from typer>=0.7.0->pephubclient>=0.4.2->peppy>=0.40.6->geofetch) (8.0.3)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "pip install geofetch" + ] + }, + { + "cell_type": "code", + "execution_count": 24, "id": "5d04aca7", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2025-07-10T04:29:51.896318Z", + "start_time": "2025-07-10T04:29:51.884023Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "geofetch 0.12.4\n" + "geofetch 0.12.8\n" ] } ], @@ -33,76 +116,79 @@ "source": [ "1) Download SRA files and PEP using GEOfetch\n", "\n", - "Add flags: \n", - "a) `--add-convert-modifier` (To add looper configurations for conversion)\n", - "b) `--discard-soft` (To delete soft files. We don't need them :D)" + "Add flags:\n", + "1) `--add-convert-modifier` (To add looper configurations for conversion)\n", + "2) `--discard-soft` (To delete soft files. We don't need them :D)" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 25, "id": "5d1d2a6a", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Metadata folder: /home/bnt4me/virginia/repos/geofetch/docs_jupyter/red_algae\n", - "Trying GSE67303 (not a file) as accession...\n", - "Skipped 0 accessions. 
Starting now.\n", - "\u001B[38;5;200mProcessing accession 1 of 1: 'GSE67303'\u001B[0m\n", - "Processed 4 samples.\n", - "Expanding metadata list...\n", - "Found SRA Project accession: SRP056574\n", - "Downloading SRP056574 sra metadata\n", - "Parsing SRA file to download SRR records\n", - "Getting SRR: SRR1930183 in (GSE67303)\n", - "\n", - "2023-08-01T17:04:12 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.\n", - "2023-08-01T17:04:12 prefetch.2.11.3: 1) Downloading 'SRR1930183'...\n", - "2023-08-01T17:04:12 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.\n", - "2023-08-01T17:04:12 prefetch.2.11.3: Downloading via HTTPS...\n", - "2023-08-01T17:04:14 prefetch.2.11.3: HTTPS download succeed\n", - "2023-08-01T17:04:15 prefetch.2.11.3: 'SRR1930183' is valid\n", - "2023-08-01T17:04:15 prefetch.2.11.3: 1) 'SRR1930183' was downloaded successfully\n", - "2023-08-01T17:04:15 prefetch.2.11.3: 'SRR1930183' has 0 unresolved dependencies\n", - "Getting SRR: SRR1930184 in (GSE67303)\n", - "\n", - "2023-08-01T17:04:15 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.\n", - "2023-08-01T17:04:16 prefetch.2.11.3: 1) Downloading 'SRR1930184'...\n", - "2023-08-01T17:04:16 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.\n", - "2023-08-01T17:04:16 prefetch.2.11.3: Downloading via HTTPS...\n", - "2023-08-01T17:04:17 prefetch.2.11.3: HTTPS download succeed\n", - "2023-08-01T17:04:18 prefetch.2.11.3: 'SRR1930184' is valid\n", - "2023-08-01T17:04:18 prefetch.2.11.3: 1) 'SRR1930184' was downloaded successfully\n", - "2023-08-01T17:04:18 prefetch.2.11.3: 'SRR1930184' has 0 unresolved dependencies\n", - "Getting SRR: SRR1930185 in (GSE67303)\n", - "\n", - "2023-08-01T17:04:19 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.\n", - "2023-08-01T17:04:19 prefetch.2.11.3: 1) Downloading 'SRR1930185'...\n", - "2023-08-01T17:04:19 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.\n", - "2023-08-01T17:04:19 prefetch.2.11.3: Downloading via HTTPS...\n", - "2023-08-01T17:04:22 prefetch.2.11.3: HTTPS download succeed\n", - "2023-08-01T17:04:22 prefetch.2.11.3: 'SRR1930185' is valid\n", - "2023-08-01T17:04:22 prefetch.2.11.3: 1) 'SRR1930185' was downloaded successfully\n", - "2023-08-01T17:04:22 prefetch.2.11.3: 'SRR1930185' has 0 unresolved dependencies\n", - "Getting SRR: SRR1930186 in (GSE67303)\n", - "\n", - "2023-08-01T17:04:22 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.\n", - "2023-08-01T17:04:23 prefetch.2.11.3: 1) Downloading 'SRR1930186'...\n", - "2023-08-01T17:04:23 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.\n", - "2023-08-01T17:04:23 prefetch.2.11.3: Downloading via HTTPS...\n", - "2023-08-01T17:04:25 prefetch.2.11.3: HTTPS download succeed\n", - "2023-08-01T17:04:25 prefetch.2.11.3: 'SRR1930186' is valid\n", - "2023-08-01T17:04:25 prefetch.2.11.3: 1) 'SRR1930186' was downloaded successfully\n", - "2023-08-01T17:04:25 prefetch.2.11.3: 'SRR1930186' has 
0 unresolved dependencies\n", - "Finished processing 1 accession(s)\n", - "Cleaning soft files ...\n", - "Creating complete project annotation sheets and config file...\n", - "\u001B[92mSample annotation sheet: /home/bnt4me/virginia/repos/geofetch/docs_jupyter/red_algae/GSE67303_PEP/GSE67303_PEP_raw.csv . Saved!\u001B[0m\n", - "\u001B[92mFile has been saved successfully\u001B[0m\n", - " Config file: /home/bnt4me/virginia/repos/geofetch/docs_jupyter/red_algae/GSE67303_PEP/GSE67303_PEP.yaml\n" + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:23]\u001b[0m Metadata folder: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:24]\u001b[0m Trying GSE67303 (not a file) as accession...\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:24]\u001b[0m Skipped 0 accessions. Starting now.\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:24]\u001b[0m \u001b[38;5;200mProcessing accession 1 of 1: 'GSE67303'\u001b[0m\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:24]\u001b[0m Processed 4 samples.\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:24]\u001b[0m Expanding metadata list...\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:24]\u001b[0m Found SRA Project accession: SRP056574\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:24]\u001b[0m Downloading SRP056574 sra metadata\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:25]\u001b[0m Parsing SRA file to download SRR records\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:25]\u001b[0m Getting SRR: SRR1930183 in (GSE67303)\n", + "\n", + "2025-07-10T04:54:26 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.\n", + "2025-07-10T04:54:26 prefetch.2.11.3: 1) Downloading 'SRR1930183'...\n", + "2025-07-10T04:54:26 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.\n", + "2025-07-10T04:54:26 prefetch.2.11.3: Downloading via HTTPS...\n", + "2025-07-10T04:54:31 prefetch.2.11.3: HTTPS download succeed\n", + "2025-07-10T04:54:31 prefetch.2.11.3: 'SRR1930183' is valid\n", + "2025-07-10T04:54:31 prefetch.2.11.3: 1) 'SRR1930183' was downloaded successfully\n", + "2025-07-10T04:54:31 prefetch.2.11.3: 'SRR1930183' has 0 unresolved dependencies\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:31]\u001b[0m Getting SRR: SRR1930184 in (GSE67303)\n", + "\n", + "2025-07-10T04:54:32 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.\n", + "2025-07-10T04:54:32 prefetch.2.11.3: 1) Downloading 'SRR1930184'...\n", + "2025-07-10T04:54:32 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.\n", + "2025-07-10T04:54:32 prefetch.2.11.3: Downloading via HTTPS...\n", + "2025-07-10T04:54:36 prefetch.2.11.3: HTTPS download succeed\n", + "2025-07-10T04:54:36 prefetch.2.11.3: 'SRR1930184' is valid\n", + "2025-07-10T04:54:36 prefetch.2.11.3: 1) 'SRR1930184' was downloaded successfully\n", + "2025-07-10T04:54:36 prefetch.2.11.3: 'SRR1930184' has 0 unresolved dependencies\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:36]\u001b[0m Getting SRR: SRR1930185 in (GSE67303)\n", + "\n", + "2025-07-10T04:54:37 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.\n", + "2025-07-10T04:54:37 prefetch.2.11.3: 1) Downloading 
'SRR1930185'...\n", + "2025-07-10T04:54:37 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.\n", + "2025-07-10T04:54:37 prefetch.2.11.3: Downloading via HTTPS...\n", + "2025-07-10T04:54:45 prefetch.2.11.3: HTTPS download succeed\n", + "2025-07-10T04:54:45 prefetch.2.11.3: 'SRR1930185' is valid\n", + "2025-07-10T04:54:45 prefetch.2.11.3: 1) 'SRR1930185' was downloaded successfully\n", + "2025-07-10T04:54:45 prefetch.2.11.3: 'SRR1930185' has 0 unresolved dependencies\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:45]\u001b[0m Getting SRR: SRR1930186 in (GSE67303)\n", + "\n", + "2025-07-10T04:54:46 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.\n", + "2025-07-10T04:54:46 prefetch.2.11.3: 1) Downloading 'SRR1930186'...\n", + "2025-07-10T04:54:46 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.\n", + "2025-07-10T04:54:46 prefetch.2.11.3: Downloading via HTTPS...\n", + "2025-07-10T04:54:52 prefetch.2.11.3: HTTPS download succeed\n", + "2025-07-10T04:54:52 prefetch.2.11.3: 'SRR1930186' is valid\n", + "2025-07-10T04:54:52 prefetch.2.11.3: 1) 'SRR1930186' was downloaded successfully\n", + "2025-07-10T04:54:52 prefetch.2.11.3: 'SRR1930186' has 0 unresolved dependencies\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:52]\u001b[0m Finished processing 1 accession(s)\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:52]\u001b[0m Cleaning soft files ...\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:52]\u001b[0m Creating complete project annotation sheets and config file...\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:52]\u001b[0m \u001b[92mSample annotation sheet: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/GSE67303_PEP/GSE67303_PEP_raw.csv . 
Saved!\u001b[0m\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:52]\u001b[0m \u001b[92mFile has been saved successfully\u001b[0m\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:52]\u001b[0m Config file: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/GSE67303_PEP/GSE67303_PEP.yaml\n", + "\u001b[1;30m[INFO]\u001b[0m \u001b[32m[00:54:52]\u001b[0m Looper config file: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/looper_config.yaml\n" ] } ], @@ -120,7 +206,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 34, "id": "37def9a3", "metadata": {}, "outputs": [ @@ -128,10 +214,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001B[0m\u001B[01;34mbuild\u001B[0m python-usage.ipynb \u001B[01;34mSRR1930184\u001B[0m\n", - "\u001B[01;34mcode\u001B[0m raw-data-downloading.ipynb \u001B[01;34mSRR1930185\u001B[0m\n", - "how_to_fastq_from_sra.ipynb \u001B[01;34mred_algae\u001B[0m \u001B[01;34mSRR1930186\u001B[0m\n", - "processed-data-downloading.ipynb \u001B[01;34mSRR1930183\u001B[0m\n" + "\u001b[0m\u001b[01;34mfq_folder\u001b[0m raw-data-downloading.ipynb \u001b[01;34mSRR1930185\u001b[0m\n", + "howto-sra-to-fastq.ipynb \u001b[01;34mred_algae\u001b[0m \u001b[01;34mSRR1930186\u001b[0m\n", + "processed-data-downloading.ipynb \u001b[01;34mSRR1930183\u001b[0m\n", + "python-usage.ipynb \u001b[01;34mSRR1930184\u001b[0m\n" ] } ], @@ -149,7 +235,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 27, "id": "c13991dd", "metadata": {}, "outputs": [ @@ -234,7 +320,7 @@ " derive:\n", " attributes: [read1, read2, SRR_files]\n", " sources:\n", - " SRA: \"${SRABAM}/{srr}.bam\"\n", + " SRA: \"${SRARAW}/{srr}/{srr}.sra\"\n", " FQ: \"${SRAFQ}/{srr}.fastq.gz\"\n", " FQ1: \"${SRAFQ}/{srr}_1.fastq.gz\"\n", " FQ2: \"${SRAFQ}/{srr}_2.fastq.gz\"\n", @@ -257,23 +343,6 @@ " then:\n", " read1: FQ1\n", "\n", - "project_modifiers:\n", - " amend:\n", - " sra_convert:\n", - " looper:\n", - " results_subdir: sra_convert_results\n", - " sample_modifiers:\n", - " append:\n", - " SRR_files: SRA\n", - " pipeline_interfaces: ${CODE}/geofetch/pipeline_interface_convert.yaml\n", - " derive:\n", - " attributes: [read1, read2, SRR_files]\n", - " sources:\n", - " SRA: \"${SRARAW}/{srr}/{srr}.sra\"\n", - " FQ: \"${SRAFQ}/{srr}.fastq.gz\"\n", - " FQ1: \"${SRAFQ}/{srr}_1.fastq.gz\"\n", - " FQ2: \"${SRAFQ}/{srr}_2.fastq.gz\"\n", - "\n", "\n", "\n", "\n" @@ -297,7 +366,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 28, "id": "d4af5280", "metadata": {}, "outputs": [], @@ -308,7 +377,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 29, "id": "981f6073", "metadata": {}, "outputs": [], @@ -319,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 30, "id": "c2cb5330", "metadata": {}, "outputs": [], @@ -328,356 +397,457 @@ "export SRAFQ=`pwd`/fq_folder" ] }, + { + "cell_type": "markdown", + "id": "d03578ac", + "metadata": {}, + "source": [ + "### Now install looper if you don't have it" + ] + }, { "cell_type": "code", - "execution_count": 11, - "id": "45bee81f", + "execution_count": 16, + "id": "8d80a1a9", "metadata": {}, "outputs": [], "source": [ - "# Unfortunately you have to pull gefetch folder from github, and set CODE variable:\n", - "mkdir code && cd code && git clone https://github.com/pepkit/geofetch.git && export CODE=`pwd` && cd .." 
+ "# pip install looper" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "1153dab2", + "execution_count": 31, + "id": "b4aa8176", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001B[0m\u001B[01;34mbuild\u001B[0m processed-data-downloading.ipynb \u001B[01;34mSRR1930183\u001B[0m\n", - "\u001B[01;34mcode\u001B[0m python-usage.ipynb \u001B[01;34mSRR1930184\u001B[0m\n", - "\u001B[01;34mfq_folder\u001B[0m raw-data-downloading.ipynb \u001B[01;34mSRR1930185\u001B[0m\n", - "how_to_fastq_from_sra.ipynb \u001B[01;34mred_algae\u001B[0m \u001B[01;34mSRR1930186\u001B[0m\n" + "2.0.1\n", + "\u001b[0m\n" ] } ], "source": [ - "ls" + "looper --version" ] }, { "cell_type": "markdown", - "id": "d03578ac", + "id": "fcee556a", "metadata": {}, "source": [ - "### Now install looper if you don't have it" + "Let's check where is looper config file and whats inside:" ] }, { "cell_type": "code", - "execution_count": 13, - "id": "b4aa8176", + "execution_count": 36, + "id": "0bcd03a7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "looper 1.4.3\n", - "\u001B[0m\n" + "\u001b[0m\u001b[01;34mGSE67303_PEP\u001b[0m looper_config.yaml \u001b[01;34moutput_dir\u001b[0m\n" ] } ], "source": [ - "looper --version" + "ls red_algae" ] }, { "cell_type": "code", - "execution_count": 14, - "id": "0bcd03a7", + "execution_count": 38, + "id": "75413be5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001B[0m\u001B[01;34mGSE67303_PEP\u001B[0m\n" + "pep_config: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/GSE67303_PEP/GSE67303_PEP.yaml\n", + "output_dir: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir\n", + "pipeline_interfaces:\n", + " - /home/bnt4me/.local/lib/python3.10/site-packages/geofetch/templates/pipeline_interface_convert.yaml\n" ] } ], "source": [ - "ls red_algae" + "cat red_algae/looper_config.yaml" + ] + }, + { + "cell_type": "markdown", + "id": "d031eca0", + "metadata": {}, + "source": [ + "Geofetch automatically generated paths to pep_config and pipeline interfaces that are embedded into geofetch" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "a9a67e5c", + "execution_count": 35, + "id": "3c622442", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Looper version: 1.4.3\n", + "Looper version: 2.0.1\n", "Command: run\n", - "Using default config. No config found in env var: ['DIVCFG']\n", - "Using amendments: sra_convert\n", + "Using default divvy config. 
You may specify in env var: ['DIVCFG']\n", "Activating compute package 'local'\n", - "Pipestat compatible: False\n", - "\u001B[36m## [1 of 4] sample: cm_bluelight_rep1; pipeline: sra_convert\u001B[0m\n", - "Writing script to /home/bnt4me/virginia/repos/geofetch/docs_jupyter/submission/sra_convert_cm_bluelight_rep1.sub\n", - "Job script (n=1; 0.06Gb): ./submission/sra_convert_cm_bluelight_rep1.sub\n", - "Compute node: bnt4me-Precision-5560\n", - "Start time: 2023-08-01 13:06:42\n", - "Using outfolder: ./sra_convert_results/SRR1930183\n", + "\u001b[36m## [1 of 4] sample: cm_bluelight_rep1; pipeline: sra_convert\u001b[0m\n", + "Writing script to /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/submission/sra_convert_cm_bluelight_rep1.sub\n", + "Job script (n=1; 0.00Gb): /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/submission/sra_convert_cm_bluelight_rep1.sub\n", + "Compute node: alex-laptop\n", + "Start time: 2025-07-10 00:59:02\n", + "Using outfolder: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930183\n", + "No pipestat output schema was supplied to PipestatManager.\n", + "Initializing results file '/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930183/stats.yaml'\n", "### Pipeline run code and environment:\n", "\n", - "* Command: `/home/bnt4me/virginia/venv/jupyter/bin/sraconvert --srr /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930183/SRR1930183.sra -O ./sra_convert_results`\n", - "* Compute host: bnt4me-Precision-5560\n", - "* Working dir: /home/bnt4me/virginia/repos/geofetch/docs_jupyter\n", - "* Outfolder: ./sra_convert_results/SRR1930183/\n", - "* Pipeline started at: (08-01 13:06:42) elapsed: 0.0 _TIME_\n", + "* Command: `/home/bnt4me/.local/bin/sraconvert --srr /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930183/SRR1930183.sra -O /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline`\n", + "* Compute host: `alex-laptop`\n", + "* Working dir: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks`\n", + "* Outfolder: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930183/`\n", + "* Log file: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930183/sra_convert_log.md`\n", + "* Start time: (07-10 00:59:03) elapsed: 0.0 _TIME_\n", "\n", "### Version log:\n", "\n", - "* Python version: 3.10.6\n", - "* Pypiper dir: `/home/bnt4me/virginia/venv/jupyter/lib/python3.10/site-packages/pypiper`\n", - "* Pypiper version: 0.12.3\n", - "* Pipeline dir: `/home/bnt4me/virginia/venv/jupyter/bin`\n", - "* Pipeline version: None\n", + "* Python version: `3.10.12`\n", + "* Pypiper dir: `/home/bnt4me/.local/lib/python3.10/site-packages/pypiper`\n", + "* Pypiper version: `0.14.4`\n", + "* Pipeline dir: `/home/bnt4me/.local/bin`\n", + "* Pipeline version: \n", "\n", "### Arguments passed to pipeline:\n", "\n", "* `bamfolder`: ``\n", "* `config_file`: `sraconvert.yaml`\n", "* `format`: `fastq`\n", - "* `fqfolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder`\n", + "* `fqfolder`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder`\n", "* `keep_sra`: `False`\n", "* `logdev`: `False`\n", "* `mode`: `convert`\n", - "* `output_parent`: `./sra_convert_results`\n", + "* `output_parent`: 
`/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline`\n", "* `recover`: `False`\n", "* `sample_name`: `None`\n", "* `silent`: `False`\n", - "* `srafolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter`\n", - "* `srr`: `['/home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930183/SRR1930183.sra']`\n", + "* `srafolder`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks`\n", + "* `srr`: `['/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930183/SRR1930183.sra']`\n", "* `verbosity`: `None`\n", "\n", + "### Initialized Pipestat Object:\n", + "\n", + "* PipestatManager (sra_convert)\n", + "* Backend: File\n", + "* - results: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930183/stats.yaml\n", + "* - status: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930183\n", + "* Multiple Pipelines Allowed: False\n", + "* Pipeline name: sra_convert\n", + "* Pipeline type: sample\n", + "* Status Schema key: None\n", + "* Results formatter: default_formatter\n", + "* Results schema source: None\n", + "* Status schema source: None\n", + "* Records count: 2\n", + "* Sample name: DEFAULT_SAMPLE_NAME\n", + "\n", + "\n", "----------------------------------------\n", "\n", "Processing 1 of 1 files: SRR1930183\n", - "Target to produce: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder/SRR1930183_1.fastq.gz` \n", + "Target to produce: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder/SRR1930183_1.fastq.gz` \n", "\n", - "> `fasterq-dump /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930183/SRR1930183.sra -O /home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder` (744928)\n", + "> `fasterq-dump /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930183/SRR1930183.sra -O /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder` (871210)\n", "
\n",
       "spots read      : 1,068,319\n",
       "reads read      : 2,136,638\n",
       "reads written   : 2,136,638\n",
       "
\n", - "Command completed. Elapsed time: 0:00:02. Running peak memory: 0.08GB. \n", - " PID: 744928;\tCommand: fasterq-dump;\tReturn code: 0;\tMemory used: 0.08GB\n", + "Command completed. Elapsed time: 0:00:02. Running peak memory: 0.069GB. \n", + " PID: 871210;\tCommand: fasterq-dump;\tReturn code: 0;\tMemory used: 0.069GB\n", "\n", "Already completed files: []\n", "\n", "### Pipeline completed. Epilogue\n", "* Elapsed time (this run): 0:00:02\n", "* Total elapsed time (all runs): 0:00:02\n", - "* Peak memory (this run): 0.0803 GB\n", - "* Pipeline completed time: 2023-08-01 13:06:44\n", - "\u001B[36m## [2 of 4] sample: cm_bluelight_rep2; pipeline: sra_convert\u001B[0m\n", - "Writing script to /home/bnt4me/virginia/repos/geofetch/docs_jupyter/submission/sra_convert_cm_bluelight_rep2.sub\n", - "Job script (n=1; 0.04Gb): ./submission/sra_convert_cm_bluelight_rep2.sub\n", - "Compute node: bnt4me-Precision-5560\n", - "Start time: 2023-08-01 13:06:44\n", - "Using outfolder: ./sra_convert_results/SRR1930184\n", + "* Peak memory (this run): 0.0685 GB\n", + "* Pipeline completed time: 2025-07-10 00:59:05\n", + "Using default schema: /home/bnt4me/.local/bin/pipestat_output_schema.yaml\n", + "\u001b[36m## [2 of 4] sample: cm_bluelight_rep2; pipeline: sra_convert\u001b[0m\n", + "Writing script to /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/submission/sra_convert_cm_bluelight_rep2.sub\n", + "Job script (n=1; 0.00Gb): /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/submission/sra_convert_cm_bluelight_rep2.sub\n", + "Compute node: alex-laptop\n", + "Start time: 2025-07-10 00:59:06\n", + "Using outfolder: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930184\n", + "No pipestat output schema was supplied to PipestatManager.\n", + "Initializing results file '/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930184/stats.yaml'\n", "### Pipeline run code and environment:\n", "\n", - "* Command: `/home/bnt4me/virginia/venv/jupyter/bin/sraconvert --srr /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930184/SRR1930184.sra -O ./sra_convert_results`\n", - "* Compute host: bnt4me-Precision-5560\n", - "* Working dir: /home/bnt4me/virginia/repos/geofetch/docs_jupyter\n", - "* Outfolder: ./sra_convert_results/SRR1930184/\n", - "* Pipeline started at: (08-01 13:06:45) elapsed: 0.0 _TIME_\n", + "* Command: `/home/bnt4me/.local/bin/sraconvert --srr /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930184/SRR1930184.sra -O /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline`\n", + "* Compute host: `alex-laptop`\n", + "* Working dir: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks`\n", + "* Outfolder: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930184/`\n", + "* Log file: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930184/sra_convert_log.md`\n", + "* Start time: (07-10 00:59:06) elapsed: 0.0 _TIME_\n", "\n", "### Version log:\n", "\n", - "* Python version: 3.10.6\n", - "* Pypiper dir: `/home/bnt4me/virginia/venv/jupyter/lib/python3.10/site-packages/pypiper`\n", - "* Pypiper version: 0.12.3\n", - "* Pipeline dir: `/home/bnt4me/virginia/venv/jupyter/bin`\n", - "* Pipeline version: None\n", + "* Python version: `3.10.12`\n", + "* Pypiper dir: 
`/home/bnt4me/.local/lib/python3.10/site-packages/pypiper`\n", + "* Pypiper version: `0.14.4`\n", + "* Pipeline dir: `/home/bnt4me/.local/bin`\n", + "* Pipeline version: \n", "\n", "### Arguments passed to pipeline:\n", "\n", "* `bamfolder`: ``\n", "* `config_file`: `sraconvert.yaml`\n", "* `format`: `fastq`\n", - "* `fqfolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder`\n", + "* `fqfolder`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder`\n", "* `keep_sra`: `False`\n", "* `logdev`: `False`\n", "* `mode`: `convert`\n", - "* `output_parent`: `./sra_convert_results`\n", + "* `output_parent`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline`\n", "* `recover`: `False`\n", "* `sample_name`: `None`\n", "* `silent`: `False`\n", - "* `srafolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter`\n", - "* `srr`: `['/home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930184/SRR1930184.sra']`\n", + "* `srafolder`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks`\n", + "* `srr`: `['/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930184/SRR1930184.sra']`\n", "* `verbosity`: `None`\n", "\n", + "### Initialized Pipestat Object:\n", + "\n", + "* PipestatManager (sra_convert)\n", + "* Backend: File\n", + "* - results: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930184/stats.yaml\n", + "* - status: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930184\n", + "* Multiple Pipelines Allowed: False\n", + "* Pipeline name: sra_convert\n", + "* Pipeline type: sample\n", + "* Status Schema key: None\n", + "* Results formatter: default_formatter\n", + "* Results schema source: None\n", + "* Status schema source: None\n", + "* Records count: 2\n", + "* Sample name: DEFAULT_SAMPLE_NAME\n", + "\n", + "\n", "----------------------------------------\n", "\n", "Processing 1 of 1 files: SRR1930184\n", - "Target to produce: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder/SRR1930184_1.fastq.gz` \n", + "Target to produce: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder/SRR1930184_1.fastq.gz` \n", "\n", - "> `fasterq-dump /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930184/SRR1930184.sra -O /home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder` (744973)\n", + "> `fasterq-dump /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930184/SRR1930184.sra -O /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder` (871261)\n", "
\n",
       "spots read      : 762,229\n",
       "reads read      : 1,524,458\n",
       "reads written   : 1,524,458\n",
       "
\n", - "Command completed. Elapsed time: 0:00:02. Running peak memory: 0.012GB. \n", - " PID: 744973;\tCommand: fasterq-dump;\tReturn code: 0;\tMemory used: 0.012GB\n", + "Command completed. Elapsed time: 0:00:02. Running peak memory: 0.083GB. \n", + " PID: 871261;\tCommand: fasterq-dump;\tReturn code: 0;\tMemory used: 0.083GB\n", "\n", "Already completed files: []\n", "\n", "### Pipeline completed. Epilogue\n", "* Elapsed time (this run): 0:00:02\n", "* Total elapsed time (all runs): 0:00:02\n", - "* Peak memory (this run): 0.0118 GB\n", - "* Pipeline completed time: 2023-08-01 13:06:47\n", - "\u001B[36m## [3 of 4] sample: cm_darkness_rep1; pipeline: sra_convert\u001B[0m\n", - "Writing script to /home/bnt4me/virginia/repos/geofetch/docs_jupyter/submission/sra_convert_cm_darkness_rep1.sub\n", - "Job script (n=1; 0.09Gb): ./submission/sra_convert_cm_darkness_rep1.sub\n", - "Compute node: bnt4me-Precision-5560\n", - "Start time: 2023-08-01 13:06:47\n", - "Using outfolder: ./sra_convert_results/SRR1930185\n", + "* Peak memory (this run): 0.0832 GB\n", + "* Pipeline completed time: 2025-07-10 00:59:08\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using default schema: /home/bnt4me/.local/bin/pipestat_output_schema.yaml\n", + "\u001b[36m## [3 of 4] sample: cm_darkness_rep1; pipeline: sra_convert\u001b[0m\n", + "Writing script to /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/submission/sra_convert_cm_darkness_rep1.sub\n", + "Job script (n=1; 0.00Gb): /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/submission/sra_convert_cm_darkness_rep1.sub\n", + "Compute node: alex-laptop\n", + "Start time: 2025-07-10 00:59:08\n", + "Using outfolder: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930185\n", + "No pipestat output schema was supplied to PipestatManager.\n", + "Initializing results file '/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930185/stats.yaml'\n", "### Pipeline run code and environment:\n", "\n", - "* Command: `/home/bnt4me/virginia/venv/jupyter/bin/sraconvert --srr /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930185/SRR1930185.sra -O ./sra_convert_results`\n", - "* Compute host: bnt4me-Precision-5560\n", - "* Working dir: /home/bnt4me/virginia/repos/geofetch/docs_jupyter\n", - "* Outfolder: ./sra_convert_results/SRR1930185/\n", - "* Pipeline started at: (08-01 13:06:47) elapsed: 0.0 _TIME_\n", + "* Command: `/home/bnt4me/.local/bin/sraconvert --srr /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930185/SRR1930185.sra -O /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline`\n", + "* Compute host: `alex-laptop`\n", + "* Working dir: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks`\n", + "* Outfolder: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930185/`\n", + "* Log file: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930185/sra_convert_log.md`\n", + "* Start time: (07-10 00:59:09) elapsed: 0.0 _TIME_\n", "\n", "### Version log:\n", "\n", - "* Python version: 3.10.6\n", - "* Pypiper dir: `/home/bnt4me/virginia/venv/jupyter/lib/python3.10/site-packages/pypiper`\n", - "* Pypiper version: 0.12.3\n", - "* Pipeline dir: `/home/bnt4me/virginia/venv/jupyter/bin`\n", - "* Pipeline version: None\n", 
+ "* Python version: `3.10.12`\n", + "* Pypiper dir: `/home/bnt4me/.local/lib/python3.10/site-packages/pypiper`\n", + "* Pypiper version: `0.14.4`\n", + "* Pipeline dir: `/home/bnt4me/.local/bin`\n", + "* Pipeline version: \n", "\n", "### Arguments passed to pipeline:\n", "\n", "* `bamfolder`: ``\n", "* `config_file`: `sraconvert.yaml`\n", "* `format`: `fastq`\n", - "* `fqfolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder`\n", + "* `fqfolder`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder`\n", "* `keep_sra`: `False`\n", "* `logdev`: `False`\n", "* `mode`: `convert`\n", - "* `output_parent`: `./sra_convert_results`\n", + "* `output_parent`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline`\n", "* `recover`: `False`\n", "* `sample_name`: `None`\n", "* `silent`: `False`\n", - "* `srafolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter`\n", - "* `srr`: `['/home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930185/SRR1930185.sra']`\n", + "* `srafolder`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks`\n", + "* `srr`: `['/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930185/SRR1930185.sra']`\n", "* `verbosity`: `None`\n", "\n", + "### Initialized Pipestat Object:\n", + "\n", + "* PipestatManager (sra_convert)\n", + "* Backend: File\n", + "* - results: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930185/stats.yaml\n", + "* - status: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930185\n", + "* Multiple Pipelines Allowed: False\n", + "* Pipeline name: sra_convert\n", + "* Pipeline type: sample\n", + "* Status Schema key: None\n", + "* Results formatter: default_formatter\n", + "* Results schema source: None\n", + "* Status schema source: None\n", + "* Records count: 2\n", + "* Sample name: DEFAULT_SAMPLE_NAME\n", + "\n", + "\n", "----------------------------------------\n", "\n", "Processing 1 of 1 files: SRR1930185\n", - "Target to produce: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder/SRR1930185_1.fastq.gz` \n", + "Target to produce: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder/SRR1930185_1.fastq.gz` \n", "\n", - "> `fasterq-dump /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930185/SRR1930185.sra -O /home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder` (745021)\n", + "> `fasterq-dump /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930185/SRR1930185.sra -O /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder` (871306)\n", "
\n",
       "spots read      : 1,707,508\n",
       "reads read      : 3,415,016\n",
       "reads written   : 3,415,016\n",
       "
\n", - "Command completed. Elapsed time: 0:00:03. Running peak memory: 0.079GB. \n", - " PID: 745021;\tCommand: fasterq-dump;\tReturn code: 0;\tMemory used: 0.079GB\n", + "Command completed. Elapsed time: 0:00:04. Running peak memory: 0.07GB. \n", + " PID: 871306;\tCommand: fasterq-dump;\tReturn code: 0;\tMemory used: 0.07GB\n", "\n", "Already completed files: []\n", "\n", "### Pipeline completed. Epilogue\n", - "* Elapsed time (this run): 0:00:03\n", - "* Total elapsed time (all runs): 0:00:03\n", - "* Peak memory (this run): 0.0793 GB\n", - "* Pipeline completed time: 2023-08-01 13:06:50\n", - "\u001B[36m## [4 of 4] sample: cm_darkness_rep2; pipeline: sra_convert\u001B[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Writing script to /home/bnt4me/virginia/repos/geofetch/docs_jupyter/submission/sra_convert_cm_darkness_rep2.sub\n", - "Job script (n=1; 0.07Gb): ./submission/sra_convert_cm_darkness_rep2.sub\n", - "Compute node: bnt4me-Precision-5560\n", - "Start time: 2023-08-01 13:06:50\n", - "Using outfolder: ./sra_convert_results/SRR1930186\n", + "* Elapsed time (this run): 0:00:04\n", + "* Total elapsed time (all runs): 0:00:04\n", + "* Peak memory (this run): 0.0701 GB\n", + "* Pipeline completed time: 2025-07-10 00:59:13\n", + "Using default schema: /home/bnt4me/.local/bin/pipestat_output_schema.yaml\n", + "\u001b[36m## [4 of 4] sample: cm_darkness_rep2; pipeline: sra_convert\u001b[0m\n", + "Writing script to /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/submission/sra_convert_cm_darkness_rep2.sub\n", + "Job script (n=1; 0.00Gb): /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/submission/sra_convert_cm_darkness_rep2.sub\n", + "Compute node: alex-laptop\n", + "Start time: 2025-07-10 00:59:13\n", + "Using outfolder: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930186\n", + "No pipestat output schema was supplied to PipestatManager.\n", + "Initializing results file '/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930186/stats.yaml'\n", "### Pipeline run code and environment:\n", "\n", - "* Command: `/home/bnt4me/virginia/venv/jupyter/bin/sraconvert --srr /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930186/SRR1930186.sra -O ./sra_convert_results`\n", - "* Compute host: bnt4me-Precision-5560\n", - "* Working dir: /home/bnt4me/virginia/repos/geofetch/docs_jupyter\n", - "* Outfolder: ./sra_convert_results/SRR1930186/\n", - "* Pipeline started at: (08-01 13:06:51) elapsed: 0.0 _TIME_\n", + "* Command: `/home/bnt4me/.local/bin/sraconvert --srr /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930186/SRR1930186.sra -O /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline`\n", + "* Compute host: `alex-laptop`\n", + "* Working dir: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks`\n", + "* Outfolder: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930186/`\n", + "* Log file: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930186/sra_convert_log.md`\n", + "* Start time: (07-10 00:59:14) elapsed: 0.0 _TIME_\n", "\n", "### Version log:\n", "\n", - "* Python version: 3.10.6\n", - "* Pypiper dir: `/home/bnt4me/virginia/venv/jupyter/lib/python3.10/site-packages/pypiper`\n", - "* Pypiper version: 0.12.3\n", - 
"* Pipeline dir: `/home/bnt4me/virginia/venv/jupyter/bin`\n", - "* Pipeline version: None\n", + "* Python version: `3.10.12`\n", + "* Pypiper dir: `/home/bnt4me/.local/lib/python3.10/site-packages/pypiper`\n", + "* Pypiper version: `0.14.4`\n", + "* Pipeline dir: `/home/bnt4me/.local/bin`\n", + "* Pipeline version: \n", "\n", "### Arguments passed to pipeline:\n", "\n", "* `bamfolder`: ``\n", "* `config_file`: `sraconvert.yaml`\n", "* `format`: `fastq`\n", - "* `fqfolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder`\n", + "* `fqfolder`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder`\n", "* `keep_sra`: `False`\n", "* `logdev`: `False`\n", "* `mode`: `convert`\n", - "* `output_parent`: `./sra_convert_results`\n", + "* `output_parent`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline`\n", "* `recover`: `False`\n", "* `sample_name`: `None`\n", "* `silent`: `False`\n", - "* `srafolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter`\n", - "* `srr`: `['/home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930186/SRR1930186.sra']`\n", + "* `srafolder`: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks`\n", + "* `srr`: `['/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930186/SRR1930186.sra']`\n", "* `verbosity`: `None`\n", "\n", + "### Initialized Pipestat Object:\n", + "\n", + "* PipestatManager (sra_convert)\n", + "* Backend: File\n", + "* - results: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930186/stats.yaml\n", + "* - status: /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/red_algae/output_dir/results_pipeline/SRR1930186\n", + "* Multiple Pipelines Allowed: False\n", + "* Pipeline name: sra_convert\n", + "* Pipeline type: sample\n", + "* Status Schema key: None\n", + "* Results formatter: default_formatter\n", + "* Results schema source: None\n", + "* Status schema source: None\n", + "* Records count: 2\n", + "* Sample name: DEFAULT_SAMPLE_NAME\n", + "\n", + "\n", "----------------------------------------\n", "\n", "Processing 1 of 1 files: SRR1930186\n", - "Target to produce: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder/SRR1930186_1.fastq.gz` \n", + "Target to produce: `/home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder/SRR1930186_1.fastq.gz` \n", "\n", - "> `fasterq-dump /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930186/SRR1930186.sra -O /home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder` (745069)\n", + "> `fasterq-dump /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/SRR1930186/SRR1930186.sra -O /home/bnt4me/virginia/repos/pepspec/docs/geofetch/notebooks/fq_folder` (871369)\n", "
\n",
       "spots read      : 1,224,029\n",
       "reads read      : 2,448,058\n",
       "reads written   : 2,448,058\n",
       "
\n", - "Command completed. Elapsed time: 0:00:02. Running peak memory: 0.081GB. \n", - " PID: 745069;\tCommand: fasterq-dump;\tReturn code: 0;\tMemory used: 0.081GB\n", + "Command completed. Elapsed time: 0:00:02. Running peak memory: 0.083GB. \n", + " PID: 871369;\tCommand: fasterq-dump;\tReturn code: 0;\tMemory used: 0.083GB\n", "\n", "Already completed files: []\n", "\n", "### Pipeline completed. Epilogue\n", "* Elapsed time (this run): 0:00:02\n", "* Total elapsed time (all runs): 0:00:02\n", - "* Peak memory (this run): 0.0813 GB\n", - "* Pipeline completed time: 2023-08-01 13:06:53\n", + "* Peak memory (this run): 0.0832 GB\n", + "* Pipeline completed time: 2025-07-10 00:59:16\n", + "Using default schema: /home/bnt4me/.local/bin/pipestat_output_schema.yaml\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "\n", "Looper finished\n", "Samples valid for job generation: 4 of 4\n", - "Commands submitted: 4 of 4\n", - "Jobs submitted: 4\n", - "\u001B[0m\n" + "\u001b[0m\n" ] } ], "source": [ - "looper run red_algae/GSE67303_PEP/GSE67303_PEP.yaml -a sra_convert -p local --output-dir ." + "looper run --config ./red_algae/looper_config.yaml -p local --output-dir ." ] }, { @@ -690,7 +860,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 39, "id": "2a79f578", "metadata": {}, "outputs": [], @@ -700,7 +870,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 40, "id": "fefdf187", "metadata": {}, "outputs": [ @@ -716,6 +886,14 @@ "source": [ "ls" ] + }, + { + "cell_type": "markdown", + "id": "c5f46e8c", + "metadata": {}, + "source": [ + "Everything was executed sucessfully and SRA files were converted into fastq files" + ] } ], "metadata": { diff --git a/docs/looper/advanced-guide/advanced-metadata.md b/docs/looper/advanced-guide/advanced-metadata.md index b93fe5eb..242ec9cb 100644 --- a/docs/looper/advanced-guide/advanced-metadata.md +++ b/docs/looper/advanced-guide/advanced-metadata.md @@ -2,7 +2,7 @@ We already covered how you can specify sample metadata using either a [simple csv file](../user-tutorial/initialize.md) or a [PEP](../user-tutorial/metadata.md). But in that tutorial we covered only the basic features of PEPs. -PEPs are actually a lot more powerful, and many of those featuers are useful for looper projects. +PEPs are actually a lot more powerful, and many of those features are useful for looper projects. Here, we'll show you a few of the more advanced features of PEPs and explain how they can be useful with looper. We still won't cover everything here, though. If you want to see *all* the features of PEP, you should consult the [detailed PEP documentation](../../spec/simple-example.md). diff --git a/docs/looper/advanced-guide/advanced-run-options.md b/docs/looper/advanced-guide/advanced-run-options.md index 040d537e..d137c98f 100644 --- a/docs/looper/advanced-guide/advanced-run-options.md +++ b/docs/looper/advanced-guide/advanced-run-options.md @@ -244,11 +244,10 @@ Keys in the `cli.` section *must* match the long argument parser opt Looper provides several ways to select (filter) samples, so you only submit certain ones. -### Sample selection +### Sample selection by inclusion -You can use `--sel-incl` with `--sel-attr`. You choose the attribute you want you want to select using, with `--sel-attr`. Then, you choose which values you want to include, with ``--sel-incl`. 
- -For example, +To submit only certain samples, specify the sample attribute with `--sel-attr` and the values the attribute can take `--sel-incl`. +For example, to choose only samples where the `species` attribute is `human`, `mouse`, or `fly`: ```console looper run \ @@ -256,12 +255,18 @@ looper run \ --sel-incl human mouse fly ``` -This command would only run jobs for samples that have `human`, `mouse`, or `fly` as the value of the `species` attribute. +Similarly, to submit only one sample, with `sample_name` as `sample`, you could use: + +```console +looper run \ + --sel-attr sample_name + --sel-incl sample1 +``` -### Sample exclusion +### Sample selection by exclusion If more convenient to *exclude* samples by filter, you can use the analogous arguments `--sel-attr` with `--sel-excl`. - +This will ### Toggling sample jobs through the sample table diff --git a/docs/looper/changelog.md b/docs/looper/changelog.md index 82748d60..f5e8cac2 100644 --- a/docs/looper/changelog.md +++ b/docs/looper/changelog.md @@ -127,7 +127,7 @@ This release breaks backwards compatibility for Looper versions < 2.0.0 - fix `looper table` failing without `sample.protocol` ### Changed -- correct `--looper_conifg` to `--looper-config` +- correct `--looper_config` to `--looper-config` ## [1.5.0] -- 2023-08-09 diff --git a/docs/looper/code/python-api.md b/docs/looper/code/python-api.md deleted file mode 100644 index 01ecb554..00000000 --- a/docs/looper/code/python-api.md +++ /dev/null @@ -1,1013 +0,0 @@ - - - - - -# Package `looper` Documentation - -Project configuration, particularly for logging. - -Project-scope constants may reside here, but more importantly, some setup here -will provide a logging infrastructure for all of the project's modules. -Individual modules and classes may provide separate configuration on a more -local level, but this will at least provide a foundation. - - -## Class `Project` -Looper-specific Project. - -#### Parameters: - -- `cfg` (`str`): path to configuration file with data fromwhich Project is to be built -- `amendments` (`Iterable[str]`): name indicating amendment to use, optional -- `divcfg_path` (`str`): path to an environment configuration YAML filespecifying compute settings. -- `permissive` (`bool`): Whether a error should be thrown ifa sample input file(s) do not exist or cannot be open. -- `compute_env_file` (`str`): Environment configuration YAML file specifyingcompute settings. - - -```python -def __init__(self, cfg=None, amendments=None, divcfg_path=None, **kwargs) -``` - -Initialize self. See help(type(self)) for accurate signature. - - - -```python -def amendments(self) -``` - -Return currently active list of amendments or None if none was activated -#### Returns: - -- `Iterable[str]`: a list of currently active amendment names - - - - -```python -def cli_pifaces(self) -``` - -Collection of pipeline interface sources specified in object constructor -#### Returns: - -- `list[str]`: collection of pipeline interface sources - - - - -```python -def config(self) -``` - -Get the config mapping -#### Returns: - -- `Mapping`: config. May be formatted to comply with the mostrecent version specifications - - - - -```python -def config_file(self) -``` - -Get the config file path -#### Returns: - -- `str`: path to the config file - - - - -```python -def description(self) -``` - - - -```python -def from_dict(cls, pep_dictionary: dict) -``` - -Init a peppy project instance from a dictionary representation of an already processed PEP. 
-#### Parameters: - -- `pep_dictionary` (`Dict[Any]`): dict,_samples: list | dict, _subsamples: list[list | dict]} - - - - -```python -def from_pandas(cls, samples_df: pandas.core.frame.DataFrame, sub_samples_df: List[pandas.core.frame.DataFrame]=None, config: dict=None) -``` - -Init a peppy project instance from a pandas Dataframe -#### Parameters: - -- `samples_df` (``): in-memory pandas DataFrame object of samples -- `sub_samples_df` (``): in-memory list of pandas DataFrame objects of sub-samples -- `config` (``): dict of yaml file - - - - -```python -def from_pep_config(cls, cfg: str=None, amendments: Union[str, Iterable[str]]=None, sample_table_index: Union[str, Iterable[str]]=None, subsample_table_index: Union[str, Iterable[str]]=None, defer_samples_creation: bool=False) -``` - -Init a peppy project instance from a yaml file -#### Parameters: - -- `cfg` (`str`): Project config file (YAML) or sample table (CSV/TSV)with one row per sample to constitute project -- `sample_table_index` (`str | Iterable[str]`): name of the columns to setthe sample_table index to -- `subsample_table_index` (`str | Iterable[str]`): name of the columns to setthe subsample_table index to -- `amendments` (`str | Iterable[str]`): names of the amendments to activate -- `amendments` (`Iterable[str]`): amendments to use within configuration file -- `defer_samples_creation` (`bool`): whether the sample creation should be skipped - - - - -```python -def from_sample_yaml(cls, yaml_file: str) -``` - -Init a peppy project instance from a yaml file -#### Parameters: - -- `yaml_file` (`str`): path to yaml file - - - - -```python -def get_sample_piface(self, sample_name) -``` - -Get a list of pipeline interfaces associated with the specified sample. - -Note that only valid pipeline interfaces will show up in the -result (ones that exist on disk/remotely and validate successfully -against the schema) -#### Parameters: - -- `sample_name` (`str`): name of the sample to retrieve list ofpipeline interfaces for - - -#### Returns: - -- `list[looper.PipelineInterface]`: collection of validpipeline interfaces associated with selected sample - - - - -```python -def get_schemas(pifaces, schema_key='input_schema') -``` - -Get the list of unique schema paths for a list of pipeline interfaces -#### Parameters: - -- `pifaces` (`str | Iterable[str]`): pipeline interfaces to searchschemas for -- `schema_key` (`str`): where to look for schemas in the piface - - -#### Returns: - -- `Iterable[str]`: unique list of schema file paths - - - - -```python -def is_sample_table_large(self) -``` - - - -```python -def list_amendments(self) -``` - -Return a list of available amendments or None if not declared -#### Returns: - -- `Iterable[str]`: a list of available amendment names - - - - -```python -def make_project_dirs(self) -``` - -Create project directory structure if it doesn't exist. 
- - - -```python -def name(self) -``` - - - -```python -def output_dir(self) -``` - -Output directory for the project, specified in object constructor -#### Returns: - -- `str`: path to the output directory - - - - -```python -def pep_version(self) -``` - -The declared PEP version string - -It is validated to make sure it is a valid PEP version string -#### Returns: - -- `str`: PEP version string - - -#### Raises: - -- `InvalidConfigFileException`: in case of invalid PEP version - - - - -```python -def piface_key(self) -``` - -Name of the pipeline interface attribute for this project -#### Returns: - -- `str`: name of the pipeline interface attribute - - - - -```python -def populate_pipeline_outputs(self) -``` - -Populate project and sample output attributes based on output schemas that pipeline interfaces point to. - - - -```python -def results_folder(self) -``` - -Path to the results folder for the project -#### Returns: - -- `str`: path to the results folder in the output folder - - - - -```python -def sample_name_colname(self) -``` - -**Deprecated, please use `Project.sample_table_index` instead** - -Name of the effective sample name containing column in the sample table. -It is "sample_name" by default, but when it's missing it could be -replaced by the selected sample table index, defined on the -object instantiation stage. -#### Returns: - -- `str`: name of the column that consist of sample identifiers - - - - -```python -def sample_table(self) -``` - -Get sample table. If any sample edits were performed, it will be re-generated -#### Returns: - -- `pandas.DataFrame`: a data frame with current samples attributes - - - - -```python -def sample_table_index(self) -``` - -The effective sample table index. - -It is `sample_name` by default, but could be overwritten by the selected sample table index, -defined on the object instantiation stage or in the project configuration file -via `sample_table_index` field. -That's the sample table index selection priority order: -1. Constructor specified -2. Config specified -3. Default: `sample_table` -#### Returns: - -- `str`: name of the column that consist of sample identifiers - - - - -```python -def samples(self) -``` - -Generic/base Sample instance for each of this Project's samples. -#### Returns: - -- `Iterable[Sample]`: Sample instance for eachof this Project's samples - - - - -```python -def selected_compute_package(self) -``` - -Compute package name specified in object constructor -#### Returns: - -- `str`: compute package name - - - - -```python -def set_sample_piface(self, sample_piface: Union[List[str], str]) -> NoReturn -``` - -Add sample pipeline interfaces variable to object -#### Parameters: - -- `sample_piface` (`list | str`): sample pipeline interface - - - - -```python -def submission_folder(self) -``` - -Path to the submission folder for the project -#### Returns: - -- `str`: path to the submission in the output folder - - - - -```python -def subsample_table(self) -``` - -Get subsample table -#### Returns: - -- `pandas.DataFrame`: a data frame with subsample attributes - - - - -```python -def subsample_table_index(self) -``` - -The effective subsample table indexes. - -It is `[subasample_name, sample_name]` by default, -but could be overwritten by the selected subsample table indexes, -defined on the object instantiation stage or in the project configuration file -via `subsample_table_index` field. -That's the subsample table indexes selection priority order: -1. Constructor specified -2. Config specified -3. 
Default: `[subasample_name, sample_name]` -#### Returns: - -- `List[str]`: names of the columns that consist of sample and subsample identifiers - - - - -## Class `PipelineInterface` -This class parses, holds, and returns information for a yaml file that specifies how to interact with each individual pipeline. This includes both resources to request for cluster job submission, as well as arguments to be passed from the sample annotation metadata to the pipeline - -#### Parameters: - -- `config` (`str | Mapping`): path to file from which to parseconfiguration data, or pre-parsed configuration data. -- `pipeline_type` (`str`): type of the pipeline,must be either 'sample' or 'project'. - - -```python -def __init__(self, config, pipeline_type=None) -``` - -Object constructor -#### Parameters: - -- `entries` (`Iterable[(str, object)] | Mapping[str, object]`): YAML collectionof key-value pairs. -- `filepath` (`str`): Path to the YAML config file. -- `yamldata` (`str`): YAML-formatted string -- `locked` (`bool`): Whether to initialize as locked (providing write capability) -- `wait_max` (`int`): how long to wait for creating an object when the filethat data will be read from is locked -- `strict_ro_locks` (`bool`): By default, we allow RO filesystems that can't be locked.Turn on strict_ro_locks to error if locks cannot be enforced on readonly filesystems. -- `skip_read_lock` (`bool`): whether the file should not be locked for readingwhen object is created in read only mode -- `schema_source` (`str`): path or a URL to a jsonschema in YAML format to usefor optional config validation. If this argument is provided the object is always validated at least once, at the object creation stage. -- `validate_on_write` (`bool`): a boolean indicating whether the object should bevalidated every time the `write` method is executed, which is a way of preventing invalid config writing -- `create_file` (`str`): Create an empty file at filepath upon data load. - - - - -```python -def choose_resource_package(self, namespaces, file_size) -``` - -Select resource bundle for given input file size to given pipeline. -#### Parameters: - -- `file_size` (`float`): Size of input data (in gigabytes). -- `namespaces` (`Mapping[Mapping[str]]`): namespaced variables to passas a context for fluid attributes command rendering - - -#### Returns: - -- `MutableMapping`: resource bundle appropriate for given pipeline,for given input file size - - -#### Raises: - -- `ValueError`: if indicated file size is negative, or if thefile size value specified for any resource package is negative -- `InvalidResourceSpecificationException`: if no defaultresource package specification is provided - - - - -```python -def copy(self) -``` - -Copy self to a new object. - - - -```python -def exp(self) -``` - -Returns a copy of the object's data elements with env vars and user vars expanded. Use it like: object.exp["item"] - - - -```python -def get_pipeline_schemas(self, schema_key='input_schema') -``` - -Get path to the pipeline schema. -#### Parameters: - -- `schema_key` (`str`): where to look for schemas in the pipeline iface - - -#### Returns: - -- `str`: absolute path to the pipeline schema file - - - - -```python -def pipeline_name(self) -``` - - - -```python -def rebase(self, *args, **kwargs) -``` - - - -```python -def render_var_templates(self, namespaces) -``` - -Render path templates under 'var_templates' in this pipeline interface. 
-#### Parameters: - -- `namespaces` (`dict`): namespaces to use for rendering - - - - -```python -def reset(self, *args, **kwargs) -``` - - - -```python -def settings(self) -``` - - - -```python -def write(self, *args, **kwargs) -``` - - - -## Class `SubmissionConductor` -Collects and then submits pipeline jobs. - -This class holds a 'pool' of commands to submit as a single cluster job. -Eager to submit a job, each instance's collection of commands expands until -it reaches the 'pool' has been filled, and it's therefore time to submit the -job. The pool fills as soon as a fill criteria has been reached, which can -be either total input file size or the number of individual commands. - - -```python -def __init__(self, pipeline_interface, prj, delay=0, extra_args=None, extra_args_override=None, ignore_flags=False, compute_variables=None, max_cmds=None, max_size=None, max_jobs=None, automatic=True, collate=False) -``` - -Create a job submission manager. - -The most critical inputs are the pipeline interface and the pipeline -key, which together determine which provide critical pipeline -information like resource allocation packages and which pipeline will -be overseen by this instance, respectively. -#### Parameters: - -- `pipeline_interface` (`PipelineInterface`): Collection of importantdata for one or more pipelines, like resource allocation packages and option/argument specifications -- `prj` (``): Project with which each sample being considered isassociated (what generated each sample) -- `delay` (`float`): Time (in seconds) to wait before submitting a jobonce it's ready -- `extra_args` (`str`): string to pass to each job generated,for example additional pipeline arguments -- `extra_args_override` (`str`): string to pass to each job generated,for example additional pipeline arguments. This deactivates the 'extra' functionality that appends strings defined in Sample.command_extra and Project.looper.command_extra to the command template. -- `ignore_flags` (`bool`): Whether to ignore flag files present inthe sample folder for each sample considered for submission -- `compute_variables` (`dict[str]`): A dict with variables that will be madeavailable to the compute package. For example, this should include the name of the cluster partition to which job or jobs will be submitted -- `max_cmds` (`int | NoneType`): Upper bound on number of commands toinclude in a single job script. -- `max_size` (`int | float | NoneType`): Upper bound on total filesize of inputs used by the commands lumped into single job script. -- `max_jobs` (`int | float | NoneType`): Upper bound on total number of jobs togroup samples for submission. -- `automatic` (`bool`): Whether the submission should be automatic oncethe pool reaches capacity. -- `collate` (`bool`): Whether a collate job is to be submitted (runs onthe project level, rather that on the sample level) - - - - -```python -def add_sample(self, sample, rerun=False) -``` - -Add a sample for submission to this conductor. -#### Parameters: - -- `sample` (`peppy.Sample`): sample to be included with this conductor'scurrently growing collection of command submissions -- `rerun` (`bool`): whether the given sample is being rerun rather thanrun for the first time - - -#### Returns: - -- `bool`: Indication of whether the given sample was added tothe current 'pool.' - - -#### Raises: - -- `TypeError`: If sample subtype is provided but does not extendthe base Sample class, raise a TypeError. 
- - - - -```python -def failed_samples(self) -``` - - - -```python -def is_project_submittable(self, force=False) -``` - -Check whether the current project has been already submitted -#### Parameters: - -- `frorce` (`bool`): whether to force the project submission (ignore status/flags) - - - - -```python -def num_cmd_submissions(self) -``` - -Return the number of commands that this conductor has submitted. -#### Returns: - -- `int`: Number of commands submitted so far. - - - - -```python -def num_job_submissions(self) -``` - -Return the number of jobs that this conductor has submitted. -#### Returns: - -- `int`: Number of jobs submitted so far. - - - - -```python -def submit(self, force=False) -``` - -Submit one or more commands as a job. - -This call will submit the commands corresponding to the current pool -of samples if and only if the argument to 'force' evaluates to a -true value, or the pool of samples is full. -#### Parameters: - -- `force` (`bool`): Whether submission should be done/simulated evenif this conductor's pool isn't full. - - -#### Returns: - -- `bool`: Whether a job was submitted (or would've been ifnot for dry run) - - - - -```python -def write_script(self, pool, size) -``` - -Create the script for job submission. -#### Parameters: - -- `pool` (`Iterable[peppy.Sample]`): collection of sample instances -- `size` (`float`): cumulative size of the given pool - - -#### Returns: - -- `str`: Path to the job submission script created. - - - - -## Class `ComputingConfiguration` -Represents computing configuration objects. - -The ComputingConfiguration class provides a computing configuration object -that is an *in memory* representation of a `divvy` computing configuration -file. This object has various functions to allow a user to activate, modify, -and retrieve computing configuration files, and use these values to populate -job submission script templates. - -#### Parameters: - -- `entries` (`str | Iterable[(str, object)] | Mapping[str, object]`): configCollection of key-value pairs. -- `filepath` (`str`): YAML file specifying computing package data. (the`DIVCFG` file) - - -```python -def __init__(self, entries=None, wait_max=None, strict_ro_locks=False, schema_source=None, validate_on_write=False) -``` - -Object constructor -#### Parameters: - -- `entries` (`Iterable[(str, object)] | Mapping[str, object]`): YAML collectionof key-value pairs. -- `yamldata` (`str`): YAML-formatted string -- `wait_max` (`int`): how long to wait for creating an object when the filethat data will be read from is locked -- `strict_ro_locks` (`bool`): By default, we allow RO filesystems that can't be locked.Turn on strict_ro_locks to error if locks cannot be enforced on readonly filesystems. -- `skip_read_lock` (`bool`): whether the file should not be locked for readingwhen object is created in read only mode -- `schema_source` (`str`): path or a URL to a jsonschema in YAML format to usefor optional config validation. If this argument is provided the object is always validated at least once, at the object creation stage. -- `validate_on_write` (`bool`): a boolean indicating whether the object should bevalidated every time the `write` method is executed, which is a way of preventing invalid config writing - - - - -```python -def activate_package(self, package_name) -``` - -Activates a compute package. - -This copies the computing attributes from the configuration file into -the `compute` attribute, where the class stores current compute -settings. 
-#### Parameters: - -- `package_name` (`str`): name for non-resource compute bundle,the name of a subsection in an environment configuration file - - -#### Returns: - -- `bool`: success flag for attempt to establish compute settings - - - - -```python -def clean_start(self, package_name) -``` - -Clear current active settings and then activate the given package. -#### Parameters: - -- `package_name` (`str`): name of the resource package to activate - - -#### Returns: - -- `bool`: success flag - - - - -```python -def compute_env_var(self) -``` - -Environment variable through which to access compute settings. -#### Returns: - -- `list[str]`: names of candidate environment variables, for whichvalue may be path to compute settings file; first found is used. - - - - -```python -def default_config_file(self) -``` - -Path to default compute environment settings file. -#### Returns: - -- `str`: Path to default compute settings file - - - - -```python -def exp(self) -``` - -Returns a copy of the object's data elements with env vars and user vars expanded. Use it like: object.exp["item"] - - - -```python -def from_obj(cls, entries: object, **kwargs) -``` - -Initialize from a Python object (dict, list, or primitive). -#### Parameters: - -- `entries` (`obj`): object to initialize from. -- `kwargs` (``): Keyword arguments to pass to the constructor. - - - - -```python -def from_yaml_data(cls, yamldata, **kwargs) -``` - -Initialize from a YAML string. -#### Parameters: - -- `yamldata` (`str`): YAML-formatted string. -- `kwargs` (``): Keyword arguments to pass to the constructor. - - - - -```python -def from_yaml_file(cls, filepath: str, create_file: bool=False, **kwargs) -``` - -Initialize from a YAML file. -#### Parameters: - -- `filepath` (`str`): Path to the YAML config file. -- `create_file` (`str`): Create a file at filepath if it doesn't exist. -- `kwargs` (``): Keyword arguments to pass to the constructor. - - - - -```python -def get_active_package(self) -> yacman.yacman_future.FutureYAMLConfigManager -``` - -Returns settings for the currently active compute package -#### Returns: - -- `YAMLConfigManager`: data defining the active compute package - - - - -```python -def get_adapters(self) -> yacman.yacman_future.FutureYAMLConfigManager -``` - -Get current adapters, if defined. - -Adapters are sourced from the 'adapters' section in the root of the -divvy configuration file and updated with an active compute -package-specific set of adapters, if any defined in 'adapters' section -under currently active compute package. -#### Returns: - -- `YAMLConfigManager`: current adapters mapping - - - - -```python -def list_compute_packages(self) -``` - -Returns a list of available compute packages. -#### Returns: - -- `set[str]`: names of available compute packages - - - - -```python -def rebase(self, *args, **kwargs) -``` - - - -```python -def reset(self, *args, **kwargs) -``` - - - -```python -def reset_active_settings(self) -``` - -Clear out current compute settings. -#### Returns: - -- `bool`: success flag - - - - -```python -def settings(self) -``` - - - -```python -def submit(self, output_path, extra_vars=None) -``` - - - -```python -def template(self) -``` - -Get the currently active submission template. -#### Returns: - -- `str`: submission script content template for current state - - - - -```python -def templates_folder(self) -``` - -Path to folder with default submission templates. 
-#### Returns: - -- `str`: path to folder with default submission templates - - - - -```python -def update_packages(self, config_file) -``` - -Parse data from divvy configuration file. - -Given a divvy configuration file, this function will update (not -overwrite) existing compute packages with existing values. It does not -affect any currently active settings. -#### Parameters: - -- `config_file` (`str`): path to file with new divvy configuration data - - - - -```python -def write(self, filename=None) -``` - - - -```python -def write_script(self, output_path, extra_vars=None) -``` - -Given currently active settings, populate the active template to write a submission script. Additionally use the current adapters to adjust the select of the provided variables -#### Parameters: - -- `output_path` (`str`): Path to file to write as submission script -- `extra_vars` (`Iterable[Mapping]`): A list of Dict objects withkey-value pairs with which to populate template fields. These will override any values in the currently active compute package. - - -#### Returns: - -- `str`: Path to the submission script file - - - - -```python -def select_divvy_config(filepath) -``` - -Selects the divvy config file path to load. - -This uses a priority ordering to first choose a config file path if -it's given, but if not, then look in a priority list of environment -variables and choose the first available file path to return. If none of -these options succeed, the default config path will be returned. -#### Parameters: - -- `filepath` (`str | NoneType`): direct file path specification - - -#### Returns: - -- `str`: path to the config file to read - - - - - - - -*Version Information: `looper` v2.0.0a1, generated by `lucidoc` v0.4.4* \ No newline at end of file diff --git a/docs/looper/code/python-api.md.disabled b/docs/looper/code/python-api.md.disabled new file mode 100644 index 00000000..f93650e5 --- /dev/null +++ b/docs/looper/code/python-api.md.disabled @@ -0,0 +1,52 @@ +# Package `looper` Documentation + +## Package Overview + +The `looper` package is a pipeline submission engine that parses sample inputs and submits pipelines for each sample. It provides a unified interface for running bioinformatics pipelines across samples defined in PEPs. 
+ +### Key Features + +- **Pipeline Execution**: Submit and manage pipeline jobs for multiple samples +- **Compute Management**: Configure and use different compute environments +- **Pipeline Interfaces**: Define how pipelines connect to sample metadata +- **Job Tracking**: Monitor and manage pipeline execution status + +### Installation + +```bash +pip install looper +``` + +### Quick Example + +```python +from looper import Project + +# Initialize with a project config file +prj = Project(cfg="project_config.yaml") + +# Run pipelines +prj.run() +``` + +## API Reference + +### Project Class + +The main class for managing looper projects: + +::: looper.Project + options: + docstring_style: google + show_source: true + show_signature: true + merge_init_into_class: true + +### PipelineInterface Class + +::: looper.PipelineInterface + options: + docstring_style: google + show_source: true + show_signature: true + merge_init_into_class: true diff --git a/docs/pephub/README.md b/docs/pephub/README.md index 11aa125a..e5a5064c 100644 --- a/docs/pephub/README.md +++ b/docs/pephub/README.md @@ -11,6 +11,7 @@ PEPhub is an open-source database, web interface, and API for sharing, retrievin - **Public user interface**: https://pephub.databio.org/ - **API**: https://pephub-api.databio.org/api/v1/docs +- **DEV API**: https://pephub-api-dev.databio.org/api/v1/docs ## Features at-a-glance @@ -35,10 +36,10 @@ Choose your adventure:
-- :fontawesome-regular-user: [**User guide**](user/getting-started.md) +- [**User guide**](user/getting-started.md) Teaches you how to use PEPhub to manage, share, and validate your sample metadata. -- :fontawesome-solid-code: [**Developer guide**](developer/setup.md) +- [**Developer guide**](developer/setup.md) Teaches you how to contribute to PEPhub, build tools on the PEPhub API, or deploy your own instance.
diff --git a/docs/pephub/changelog.md b/docs/pephub/changelog.md
index ce93ce6c..6ac72729 100644
--- a/docs/pephub/changelog.md
+++ b/docs/pephub/changelog.md
@@ -153,7 +153,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 ### Fixed
 - sample table would exhibit odd, erratic behavior if column names were left blank
-- alnding page styling was not otpimal
+- landing page styling was not optimal
 
 ## [0.9.6] - 07-20-2023
 
diff --git a/docs/pephub/developer/changelog.md b/docs/pephub/developer/changelog.md
index 7c39b03c..272f6eda 100644
--- a/docs/pephub/developer/changelog.md
+++ b/docs/pephub/developer/changelog.md
@@ -72,7 +72,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 ### Fixed
 - sample table would exhibit odd, erratic behavior if column names were left blank
-- alnding page styling was not otpimal
+- landing page styling was not optimal
 
 ## [0.9.6] - 2023-07-20
 
diff --git a/docs/pephub/developer/pepdbagent/database_version_migration.md b/docs/pephub/developer/pepdbagent/database_version_migration.md
new file mode 100644
index 00000000..0a2bd69d
--- /dev/null
+++ b/docs/pephub/developer/pepdbagent/database_version_migration.md
@@ -0,0 +1,80 @@
+# 🔧 Database Version Migration and Upgrade Instructions
+
+To change the database version smoothly and seamlessly, we use Alembic.
+This tool allows us to manage database schema changes in a consistent, version-controlled manner.
+As a result, users who have built their own PEPhub instances can easily upgrade to the latest version without losing
+any data or needing to manually interact with the database.
+
+## 👷 Database schema change - for database developers
+
+If you change the database schema in the `db_utils.py` file, proceed with the following steps:
+
+### 0. **Set up the database URL in `alembic.ini`:**
+
+```text
+sqlalchemy.url = postgresql+psycopg://user:password@localhost/dbname
+```
+
+By default, it is set to the testing database. Credentials are in the README.md in the test folder of the repository.
+
+### 1. **Create a new migration script**:
+
+When you modify your SQLAlchemy models, follow these steps to keep the database schema in sync:
+
+a. Modify models: update your SQLAlchemy models in your code.
+b. Generate a migration:
+```bash
+alembic revision --autogenerate -m "Describe your change"
+```
+
+### 2. **Edit the migration script**:
+Open the newly created migration script and edit the `upgrade()` and `downgrade()` functions to define the changes you want to make to the database schema.
+- The `upgrade()` function should contain the code to apply the changes.
+- The `downgrade()` function should contain the code to revert the changes.
+
+### 3. **Apply the migration**
+
+Run the migration to apply it to your database. Create a small Python script that connects to the database
+with `pepdbagent`, passing the parameter `run_migrations=True`:
+
+```python
+from pepdbagent import PEPDatabaseAgent
+
+db_agent = PEPDatabaseAgent(
+    host="localhost",
+    port=5432,
+    database="pep-db",
+    user=None,
+    password=None,
+    run_migrations=True,
+)
+```
+
+This will run all migrations of the database, including the one you just created.
+
+
+### **Version Control**
+
+Each migration script has a unique identifier and tracks schema changes. Always commit these scripts to version control.
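To make step 2 of the workflow above more concrete, here is a minimal sketch of what an auto-generated migration script might look like after editing. The revision IDs, table name (`projects`), and column (`description`) are hypothetical placeholders, not actual pepdbagent schema objects; adapt them to whatever change you made in `db_utils.py`.

```python
"""Add description column to projects (hypothetical example).

Revision ID: abc123def456
Revises: 0123456789ab
"""
import sqlalchemy as sa
from alembic import op

# Revision identifiers, used by Alembic (normally auto-generated).
revision = "abc123def456"
down_revision = "0123456789ab"
branch_labels = None
depends_on = None


def upgrade() -> None:
    # Apply the schema change: add a nullable column so existing rows remain valid.
    op.add_column("projects", sa.Column("description", sa.Text(), nullable=True))


def downgrade() -> None:
    # Revert the schema change.
    op.drop_column("projects", "description")
```

Once the script looks right, apply it as shown in step 3, or run Alembic directly with `alembic upgrade head` if you prefer.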
+ + +## 🧙‍♂️ Database schema change - for PEPhub users + +If you are changing database schema in `db_utils.py` file and schema in the database changed, you should +run the following script before connecting to the database: + +```python +from pepdbagent import PEPDatabaseAgent + +db_agent = PEPDatabaseAgent( + host="localhost", + port=5432, + database="pep-db", + user=None, + password=None, + run_migrations=True, +) +``` + +This will run all migrations of the database, including the one you just created. diff --git a/docs/pephub/img/github_orgs.png b/docs/pephub/img/github_orgs.png new file mode 100644 index 00000000..13224bc7 Binary files /dev/null and b/docs/pephub/img/github_orgs.png differ diff --git a/docs/pephub/user/organization.md b/docs/pephub/user/organization.md new file mode 100644 index 00000000..e5986603 --- /dev/null +++ b/docs/pephub/user/organization.md @@ -0,0 +1,19 @@ +# PEPhub organizations + +PEPhub allows users to create and manage PEPs within an organization. PEPhub organizations are linked to GitHub +organizations, and all members of a linked GitHub organization are designated as admins for the projects in the +corresponding PEPhub organization. By default, all projects in a PEPhub organization are public. However, admins +can make projects private, restricting visibility to members of the organization. + +### Logging in +When you log in to PEPhub through GitHub, all your organizations are fetched, and you are automatically added as an admin. + +!!! warning "Warning!" + PEPhub currently does not support GitHub user organizations that are set to private (hidden) in the user settings. + +!!! tip "Set GitHub org Membership Visibility" + + 1. Go to the GitHub page. + 2. Navigate to the organization’s page. (e.g. https://github.com/databio) + 3. Click People (in the organization's menu bar). + 4. Locate your username and ensure your membership is public.![../img/github_orgs.png](../img/github_orgs.png) \ No newline at end of file diff --git a/docs/pephub/user/pephubclient/phc_schemas.md b/docs/pephub/user/pephubclient/phc_schemas.md new file mode 100644 index 00000000..ff46a0b0 --- /dev/null +++ b/docs/pephub/user/pephubclient/phc_schemas.md @@ -0,0 +1,224 @@ + + + +# module `schema.py` + + + +**Global Variables** +--------------- +- **PEPHUB_SCHEMA_VERSION_URL** +- **PEPHUB_SCHEMA_VERSIONS_URL** +- **PEPHUB_SCHEMA_NEW_SCHEMA_URL** +- **PEPHUB_SCHEMA_NEW_VERSION_URL** +- **PEPHUB_SCHEMA_RECORD_URL** +- **LATEST_VERSION** + + +--- + +## class `PEPHubSchema` +Class for managing schemas in PEPhub and provides methods for getting, creating, updating and removing schemas records and schema versions. + + +### function `__init__` + +```python +__init__(jwt_data: str = None) +``` + + +- :param jwt_data: jwt token for authorization + + + + +--- + + +### function `add_version` + +```python +add_version( + namespace: str, + schema_name: str, + schema_value: dict, + version: str = '1.0.0', + contributors: str = None, + release_notes: str = None, + tags: Optional[str, List[str], dict] = None +) → None +``` + +Add new version to the schema registry + + +- :param namespace: Namespace of the schema +- :param schema_name: Name of the schema record +- :param schema_value: Schema value itself in dict format +- :param version: First version of the schema +- :param contributors: Schema contributors of current version +- :param release_notes: Release notes for current version +- :param tags: Tags of the current version. 
Can be str, list[str], or dict + +:raise: ResponseError if status not 202. :return: None + +--- + + +### function `create_schema` + +```python +create_schema( + namespace: str, + schema_name: str, + schema_value: dict, + version: str = '1.0.0', + description: str = None, + maintainers: str = None, + contributors: str = None, + release_notes: str = None, + tags: Optional[str, List[str], dict] = None, + lifecycle_stage: str = None, + private: bool = False +) → None +``` + +Create a new schema record + version in the database + + +- :param namespace: Namespace of the schema +- :param schema_name: Name of the schema record +- :param schema_value: Schema value itself in dict format +- :param version: First version of the schema +- :param description: Schema description +- :param maintainers: Schema maintainers +- :param contributors: Schema contributors of current version +- :param release_notes: Release notes for current version +- :param tags: Tags of the current version. Can be str, list[str], or dict +- :param lifecycle_stage: Stage of the schema record +- :param private: Weather project should be public or private. Default: False (public) + +:raise: ResponseError if status not 202. :return: None + +--- + + +### function `delete_schema` + +```python +delete_schema(namespace: str, schema_name: str) → None +``` + +Delete schema from the database + + +- :param namespace: Namespace of the schema +- :param schema_name: Name of the schema version + +--- + + +### function `delete_version` + +```python +delete_version(namespace: str, schema_name: str, version: str) → None +``` + +Delete schema Version + + +- :param namespace: Namespace of the schema +- :param schema_name: Name of the schema +- :param version: Schema version + +:raise: ResponseError if status not 202. :return: None + +--- + + +### function `get` + +```python +get(namespace: str, schema_name: str, version: str = 'latest') → dict +``` + +Get schema value for specific schema version. + + +- :param: namespace: namespace of schema +- :param: schema_name: name of schema +- :param: version: version of schema + +:return: Schema object as dictionary + +--- + + +### function `get_versions` + +```python +get_versions(namespace: str, schema_name: str) → SchemaVersionResult +``` + +Get list of versions + + +- :param namespace: Namespace of the schema record +- :param schema_name: Name of the schema record + +:return: { pagination: PaginationResult results: List[SchemaVersionAnnotation] } + +--- + + +### function `update_record` + +```python +update_record( + namespace: str, + schema_name: str, + update_fields: Union[dict, UpdateSchemaRecordFields] +) → None +``` + +Update schema registry data + + +- :param namespace: Namespace of the schema +- :param schema_name: Name of the schema version +- :param update_fields: dict or pydantic model UpdateSchemaRecordFields: { maintainers: str, lifecycle_stage: str, private: bool, name: str, description: str, } + +:raise: ResponseError if status not 202. :return: None + +--- + + +### function `update_version` + +```python +update_version( + namespace: str, + schema_name: str, + version: str, + update_fields: Union[dict, UpdateSchemaVersionFields] +) → None +``` + +Update released version of the schema. 
+
+
+- :param namespace: Namespace of the schema
+
+- :param schema_name: Name of the schema version
+- :param version: Schema version
+- :param update_fields: dict or pydantic model UpdateSchemaVersionFields: { contributors: str, schema_value: str, release_notes: str, }
+
+:raise: ResponseError if status not 202. :return: None
+
+
+
+---
+
+_This file was automatically generated via [lazydocs](https://github.com/ml-tooling/lazydocs)._
diff --git a/docs/pephub/user/version-control.md b/docs/pephub/user/version-control.md
index e7859593..98043909 100644
--- a/docs/pephub/user/version-control.md
+++ b/docs/pephub/user/version-control.md
@@ -8,7 +8,7 @@ PEPhub automatically records a history of your files whenever changes are made.
 ![alt text](../img/menu-history.png)
 
-Selecting this option will bring up the *History Interface*, which will provide buttons allowing you to view or delete entries from your history table. If you choose the `View` button for an entry, it will show you the PEP at that point in history. It also opens a new interface that will allow you to click `Restore` to overwright your current PEP with the historical version you are currently viewing, or you can `Download` the table as it was at that point in history.
+Selecting this option will bring up the *History Interface*, which will provide buttons allowing you to view or delete entries from your history table. If you choose the `View` button for an entry, it will show you the PEP at that point in history. It also opens a new interface that will allow you to click `Restore` to overwrite your current PEP with the historical version you are currently viewing, or you can `Download` the table as it was at that point in history.
 
 ![alt text](../img/history-interface.png)
 
diff --git a/docs/pephub/user/views.md b/docs/pephub/user/views.md
index 88b94936..822f0c4b 100644
--- a/docs/pephub/user/views.md
+++ b/docs/pephub/user/views.md
@@ -2,7 +2,7 @@
 
 ## What are views?
 
-Large tables (*e.g.* >5,000 rows) can be unweildy with PEPhup. It can be hard to find the elements you're looking for. To address this, PEPhub provides the *Views* feature. Views provide a way to look at a subset of a large table (basically, a filtered table).
+Large tables (*e.g.* >5,000 rows) can be unwieldy with PEPhub. It can be hard to find the elements you're looking for. To address this, PEPhub provides the *Views* feature. Views provide a way to look at a subset of a large table (basically, a filtered table).
## How to create a view diff --git a/docs/peppy/code/feature5_amend.md b/docs/peppy/code/feature5_amend.md index 0439c758..da78cebb 100644 --- a/docs/peppy/code/feature5_amend.md +++ b/docs/peppy/code/feature5_amend.md @@ -18,11 +18,15 @@ sample_table = examples_dir + "sample_table.csv" %cat $sample_table | column -t -s, | cat ``` - sample_name protocol organism time file_path - pig_0h RRBS pig 0 source1 - pig_1h RRBS pig 1 source1 - frog_0h RRBS frog 0 source1 - frog_1h RRBS frog 1 source1 + sample_name protocol organism time file_path + + pig_0h RRBS pig 0 source1 + + pig_1h RRBS pig 1 source1 + + frog_0h RRBS frog 0 source1 + + frog_1h RRBS frog 1 source1 ## Solution @@ -35,22 +39,22 @@ project_config_file = examples_dir + "project_config.yaml" %cat $project_config_file ``` - pep_version: "2.0.0" - sample_table: sample_table.csv - output_dir: $HOME/hello_looper_results - - sample_modifiers: - derive: - attributes: [file_path] - sources: - source1: /data/lab/project/{organism}_{time}h.fastq - source2: /path/from/collaborator/weirdNamingScheme_{external_id}.fastq - project_modifiers: - amend: - newLib: - sample_table: sample_table_newLib.csv - newLib2: - sample_table: sample_table_newLib2.csv + pep_version: "2.0.0" + sample_table: sample_table.csv + output_dir: $HOME/hello_looper_results + + sample_modifiers: + derive: + attributes: [file_path] + sources: + source1: /data/lab/project/{organism}_{time}h.fastq + source2: /path/from/collaborator/weirdNamingScheme_{external_id}.fastq + project_modifiers: + amend: + newLib: + sample_table: sample_table_newLib.csv + newLib2: + sample_table: sample_table_newLib2.csv Obviously, the amendments functionality can be combined with other `peppy` package options, e.g. imply and derive sample modifiers. The derive modifier is used in the example considered here (`derive` key in the `sample_modifiers` section of the config file). @@ -64,11 +68,11 @@ sample_table = examples_dir + "sample_table_newLib.csv" %cat $sample_table | column -t -s, | cat ``` - sample_name protocol organism time file_path - pig_0h ABCD pig 0 source1 - pig_1h ABCD pig 1 source1 - frog_0h ABCD frog 0 source1 - frog_1h ABCD frog 1 source1 + sample_name protocol organism time file_path + pig_0h ABCD pig 0 source1 + pig_1h ABCD pig 1 source1 + frog_0h ABCD frog 0 source1 + frog_1h ABCD frog 1 source1 @@ -77,11 +81,11 @@ sample_table = examples_dir + "sample_table_newLib2.csv" %cat $sample_table | column -t -s, | cat ``` - sample_name protocol organism time file_path - pig_0h EFGH pig 0 source1 - pig_1h EFGH pig 1 source1 - frog_0h EFGH frog 0 source1 - frog_1h EFGH frog 1 source1 + sample_name protocol organism time file_path + pig_0h EFGH pig 0 source1 + pig_1h EFGH pig 1 source1 + frog_0h EFGH frog 0 source1 + frog_1h EFGH frog 1 source1 ## Code diff --git a/docs/peppy/code/python-api.md b/docs/peppy/code/python-api.md index 66061391..12976153 100644 --- a/docs/peppy/code/python-api.md +++ b/docs/peppy/code/python-api.md @@ -1,698 +1,52 @@ - - - - - # Package `peppy` Documentation -Project configuration, particularly for logging. - -Project-scope constants may reside here, but more importantly, some setup here -will provide a logging infrastructure for all of the project's modules. -Individual modules and classes may provide separate configuration on a more -local level, but this will at least provide a foundation. - - -## Class `Project` -A class to model a Project (collection of samples and metadata). 
- -#### Parameters: - -- `cfg` (`str`): Project config file (YAML) or sample table (CSV/TSV)with one row per sample to constitute project -- `sample_table_index` (`str | Iterable[str]`): name of the columns to setthe sample_table index to -- `subsample_table_index` (`str | Iterable[str]`): name of the columns to setthe subsample_table index to -- `amendments` (`str | Iterable[str]`): names of the amendments to activate -- `amendments` (`Iterable[str]`): amendments to use within configuration file -- `defer_samples_creation` (`bool`): whether the sample creation should be skipped - - -#### Examples: - -```python - from peppy import Project - prj = Project(cfg="ngs.yaml") - samples = prj.samples -``` - - -```python -def __init__(self, cfg: str=None, amendments: Union[str, Iterable[str]]=None, sample_table_index: Union[str, Iterable[str]]=None, subsample_table_index: Union[str, Iterable[str]]=None, defer_samples_creation: bool=False) -``` - -Initialize self. See help(type(self)) for accurate signature. - - - -```python -def activate_amendments(self, amendments) -``` - -Update settings based on amendment-specific values. - -This method will update Project attributes, adding new values -associated with the amendments indicated, and in case of collision with -an existing key/attribute the amendments' values will be favored. -#### Parameters: - -- `amendments` (`Iterable[str]`): A string with amendmentnames to be activated - - -#### Returns: - -- `peppy.Project`: Updated Project instance - - -#### Raises: - -- `TypeError`: if argument to amendment parameter is null -- `NotImplementedError`: if this call is made on a project notcreated from a config file - - - - -```python -def add_samples(self, samples) -``` - -Add list of Sample objects -#### Parameters: - -- `samples` (`peppy.Sample | Iterable[peppy.Sample]`): samples to add - - - - -```python -def amendments(self) -``` - -Return currently active list of amendments or None if none was activated -#### Returns: - -- `Iterable[str]`: a list of currently active amendment names - - - - -```python -def attr_constants(self) -``` - -Update each Sample with constants declared by a Project. If Project does not declare constants, no update occurs. - - - -```python -def attr_derive(self, attrs=None) -``` - -Set derived attributes for all Samples tied to this Project instance - - - -```python -def attr_imply(self) -``` - -Infer value for additional field(s) from other field(s). - -Add columns/fields to the sample based on values in those already-set -that the sample's project defines as indicative of implications for -additional data elements for the sample. - - - -```python -def attr_merge(self) -``` - -Merge sample subannotations (from subsample table) with sample annotations (from sample_table) - - - -```python -def attr_remove(self) -``` - -Remove declared attributes from all samples that have them defined - - - -```python -def attr_synonyms(self) -``` - -Copy attribute values for all samples to a new one - - - -```python -def config(self) -``` - -Get the config mapping -#### Returns: - -- `Mapping`: config. May be formatted to comply with the mostrecent version specifications - - - - -```python -def config_file(self) -``` - -Get the config file path -#### Returns: - -- `str`: path to the config file - - - - -```python -def copy(self) -``` - -Copy self to a new object. 
- - - -```python -def create_samples(self, modify: bool=False) -``` - -Populate Project with Sample objects - - - -```python -def deactivate_amendments(self) -``` - -Bring the original project settings back. -#### Returns: - -- `peppy.Project`: Updated Project instance - - -#### Raises: - -- `NotImplementedError`: if this call is made on a project notcreated from a config file - - - - -```python -def description(self) -``` - - - -```python -def from_dict(cls, pep_dictionary: dict) -``` - -Init a peppy project instance from a dictionary representation of an already processed PEP. -#### Parameters: - -- `pep_dictionary` (`Dict[Any]`): dict,_samples: list | dict, _subsamples: list[list | dict]} - - - - -```python -def from_pandas(cls, samples_df: pandas.core.frame.DataFrame, sub_samples_df: List[pandas.core.frame.DataFrame]=None, config: dict=None) -``` - -Init a peppy project instance from a pandas Dataframe -#### Parameters: - -- `samples_df` (``): in-memory pandas DataFrame object of samples -- `sub_samples_df` (``): in-memory list of pandas DataFrame objects of sub-samples -- `config` (``): dict of yaml file - - - - -```python -def from_pep_config(cls, cfg: str=None, amendments: Union[str, Iterable[str]]=None, sample_table_index: Union[str, Iterable[str]]=None, subsample_table_index: Union[str, Iterable[str]]=None, defer_samples_creation: bool=False) -``` - -Init a peppy project instance from a yaml file -#### Parameters: - -- `cfg` (`str`): Project config file (YAML) or sample table (CSV/TSV)with one row per sample to constitute project -- `sample_table_index` (`str | Iterable[str]`): name of the columns to setthe sample_table index to -- `subsample_table_index` (`str | Iterable[str]`): name of the columns to setthe subsample_table index to -- `amendments` (`str | Iterable[str]`): names of the amendments to activate -- `amendments` (`Iterable[str]`): amendments to use within configuration file -- `defer_samples_creation` (`bool`): whether the sample creation should be skipped - - - - -```python -def from_sample_yaml(cls, yaml_file: str) -``` - -Init a peppy project instance from a yaml file -#### Parameters: - -- `yaml_file` (`str`): path to yaml file - - - - -```python -def get_description(self) -``` - -Infer project description from config file. - -The provided description has to be of class coercible to string -#### Returns: - -- `str`: inferred name for project. - - -#### Raises: - -- `InvalidConfigFileException`: if description is not of classcoercible to string - - - - -```python -def get_sample(self, sample_name) -``` - -Get an individual sample object from the project. - -Will raise a ValueError if the sample is not found. -In the case of multiple samples with the same name (which is not -typically allowed), a warning is raised and the first sample is returned -#### Parameters: - -- `sample_name` (`str`): The name of a sample to retrieve - - -#### Returns: - -- `peppy.Sample`: The requested Sample object - - -#### Raises: - -- `ValueError`: if there's no sample with the specified name defined - - - - -```python -def get_samples(self, sample_names) -``` - -Returns a list of sample objects given a list of sample names -#### Parameters: - -- `sample_names` (`list`): A list of sample names to retrieve - - -#### Returns: - -- `list[peppy.Sample]`: A list of Sample objects - +## Package Overview +The `peppy` package provides a Python interface for working with Portable Encapsulated Projects (PEPs). 
A PEP is a standardized format for organizing metadata for biological samples and sample-intensive data. +### Key Features -```python -def infer_name(self) -``` - -Infer project name from config file path. - -First assume the name is the folder in which the config file resides, -unless that folder is named "metadata", in which case the project name -is the parent of that folder. -#### Returns: - -- `str`: inferred name for project. - - -#### Raises: - -- `InvalidConfigFileException`: if the project lacks both a name anda configuration file (no basis, then, for inference) -- `InvalidConfigFileException`: if specified Project name is invalid - - - - -```python -def is_sample_table_large(self) -``` - +- **Project Management**: Create and manage collections of samples with metadata +- **Sample Access**: Retrieve individual samples and their attributes +- **Amendments**: Activate different project configurations +- **Validation**: Validate projects against schemas +### Installation -```python -def list_amendments(self) +```bash +pip install peppy ``` -Return a list of available amendments or None if not declared -#### Returns: - -- `Iterable[str]`: a list of available amendment names - - - +### Quick Example ```python -def load_samples(self) -``` - -Read the sample_table and subsample_tables into dataframes and store in the object root. The values sourced from the project config can be overwritten by the optional arguments. - +from peppy import Project +# Initialize with a project config file +prj = Project(cfg="ngs.yaml") -```python -def modify_samples(self) +# Access samples +samples = prj.samples ``` -Perform any sample modifications defined in the config. - - - -```python -def name(self) -``` - - - -```python -def parse_config_file(self, cfg_path: str=None, amendments: Iterable[str]=None) -``` - -Parse provided yaml config file and check required fields exist. -#### Parameters: - -- `cfg_path` (`str`): path to the config file to read and parse -- `amendments` (`Iterable[str]`): Name of amendments to activate - - -#### Raises: - -- `KeyError`: if config file lacks required section(s) - - - - -```python -def pep_version(self) -``` - -The declared PEP version string - -It is validated to make sure it is a valid PEP version string -#### Returns: - -- `str`: PEP version string - - -#### Raises: - -- `InvalidConfigFileException`: in case of invalid PEP version - - - - -```python -def remove_samples(self, sample_names) -``` - -Remove Samples from Project -#### Parameters: - -- `sample_names` (`Iterable[str]`): sample names to remove - - - - -```python -def sample_name_colname(self) -``` - -**Deprecated, please use `Project.sample_table_index` instead** - -Name of the effective sample name containing column in the sample table. -It is "sample_name" by default, but when it's missing it could be -replaced by the selected sample table index, defined on the -object instantiation stage. -#### Returns: - -- `str`: name of the column that consist of sample identifiers - - - - -```python -def sample_table(self) -``` - -Get sample table. If any sample edits were performed, it will be re-generated -#### Returns: - -- `pandas.DataFrame`: a data frame with current samples attributes - - - - -```python -def sample_table_index(self) -``` - -The effective sample table index. - -It is `sample_name` by default, but could be overwritten by the selected sample table index, -defined on the object instantiation stage or in the project configuration file -via `sample_table_index` field. 
-That's the sample table index selection priority order: -1. Constructor specified -2. Config specified -3. Default: `sample_table` -#### Returns: - -- `str`: name of the column that consist of sample identifiers - - - - -```python -def samples(self) -``` - -Generic/base Sample instance for each of this Project's samples. -#### Returns: - -- `Iterable[Sample]`: Sample instance for eachof this Project's samples - - - - -```python -def subsample_table(self) -``` - -Get subsample table -#### Returns: - -- `pandas.DataFrame`: a data frame with subsample attributes - - - - -```python -def subsample_table_index(self) -``` - -The effective subsample table indexes. - -It is `[subasample_name, sample_name]` by default, -but could be overwritten by the selected subsample table indexes, -defined on the object instantiation stage or in the project configuration file -via `subsample_table_index` field. -That's the subsample table indexes selection priority order: -1. Constructor specified -2. Config specified -3. Default: `[subasample_name, sample_name]` -#### Returns: - -- `List[str]`: names of the columns that consist of sample and subsample identifiers - - - - -```python -def to_dict(self, extended: bool=False, orient: Literal['dict', 'list', 'series', 'split', 'tight', 'records', 'index']='dict') -> dict -``` - -Convert the Project object to a dictionary. -#### Parameters: - -- `extended` (`bool`): whether to produce complete project dict (used to reinit the project) -- `orient` (`Literal`): orientation of the returned df - - -#### Returns: - -- `dict`: a dictionary representation of the Project object - - - - -## Class `Sample` -Class to model Samples based on a pandas Series. - -#### Parameters: - -- `series` (`Mapping | pandas.core.series.Series`): Sample's data. - - -```python -def __init__(self, series, prj=None) -``` - -Initialize self. See help(type(self)) for accurate signature. - - - -```python -def attributes(self) -``` - - - -```python -def copy(self) -``` - -Copy self to a new object. - - - -```python -def derive_attribute(self, data_sources, attr_name) -``` - -Uses the template path provided in the project config section "data_sources" to piece together an actual path by substituting variables (encoded by "{variable}"") with sample attributes. -#### Parameters: - -- `data_sources` (`Mapping`): mapping from key name (as a value ina cell of a tabular data structure) to, e.g., filepath -- `attr_name` (`str`): Name of sample attribute(equivalently, sample sheet column) specifying a derived column. - - -#### Returns: - -- `str`: regex expansion of data source specified in configuration,with variable substitutions made - - -#### Raises: - -- `ValueError`: if argument to data_sources parameter is null/empty - - - - -```python -def get_sheet_dict(self) -``` - -Create a K-V pairs for items originally passed in via the sample sheet. This is useful for summarizing; it provides a representation of the sample that excludes things like config files and derived entries. -#### Returns: - -- `OrderedDict`: mapping from name to value for data elementsoriginally provided via the sample sheet (i.e., the a map-like representation of the instance, excluding derived items) - - - - -```python -def project(self) -``` - -Get the project mapping -#### Returns: - -- `peppy.Project`: project object the sample was created from - - - - -```python -def to_dict(self, add_prj_ref=False) -``` - -Serializes itself as dict object. 
-#### Parameters: - -- `add_prj_ref` (`bool`): whether the project reference bound do theSample object should be included in the YAML representation - - -#### Returns: - -- `dict`: dict representation of this Sample - - - - -```python -def to_yaml(self, path, add_prj_ref=False) -``` - -Serializes itself in YAML format. -#### Parameters: - -- `path` (`str`): A file path to write yaml to; provide this orthe subs_folder_path -- `add_prj_ref` (`bool`): whether the project reference bound do theSample object should be included in the YAML representation - - - - -## Class `PeppyError` -Base error type for peppy custom errors. - - -```python -def __init__(self, msg) -``` - -Initialize self. See help(type(self)) for accurate signature. - +## API Reference +### Project Class +The main class for working with PEPs: +::: peppy.Project + options: + docstring_style: google + show_source: true + show_signature: true + merge_init_into_class: true +### Sample Class -*Version Information: `peppy` v0.40.1, generated by `lucidoc` v0.4.4* \ No newline at end of file +::: peppy.Sample + options: + docstring_style: google + show_source: true + show_signature: true + merge_init_into_class: true diff --git a/docs/peppy/code/tutorial.md b/docs/peppy/code/tutorial.md index bc2c0332..8cedf886 100644 --- a/docs/peppy/code/tutorial.md +++ b/docs/peppy/code/tutorial.md @@ -1,6 +1,6 @@ # Basic PEP example -This vignette will show you a simple example PEP-formatted project, and how to read it into python using the `peppy` package. This example comes from the [example_peps repsitory](https://github.com/pepkit/example_peps) in the [example_basic](https://github.com/pepkit/example_peps/tree/master/example_basic) folder. +This vignette will show you a simple example PEP-formatted project, and how to read it into python using the `peppy` package. This example comes from the [example_peps repository](https://github.com/pepkit/example_peps) in the [example_basic](https://github.com/pepkit/example_peps/tree/master/example_basic) folder. Start by importing `peppy`, and then let's take a look at the configuration file that defines our project: diff --git a/docs/pipestat/README.md b/docs/pipestat/README.md index 002084ab..fff007fc 100644 --- a/docs/pipestat/README.md +++ b/docs/pipestat/README.md @@ -14,122 +14,9 @@ Pipestat standardizes reporting of pipeline results. It provides 1) a standard s ## How does pipestat work? -A pipeline author defines all the outputs produced by a pipeline by writing a JSON-schema. The pipeline then uses pipestat to report pipeline outputs as the pipeline runs, either via the Python API or command line interface. The user configures results to be stored either in a [YAML-formatted file](https://yaml.org/spec/1.2/spec.html) or a [PostgreSQL database](https://www.postgresql.org/). The results are recorded according to the pipestat specification, in a standard, pipeline-agnostic way. This way, downstream software can use this specification to create universal tools for analyzing, monitoring, and visualizing pipeline results that will work with any pipeline or workflow. +A pipeline author defines all the outputs produced by a pipeline by writing a JSON-schema. The pipeline then uses pipestat to report pipeline outputs as the pipeline runs, either via the Python API or command line interface. The user configures results to be stored either in a [YAML-formatted file](https://yaml.org/spec/1.2/spec.html), a [PostgreSQL database](https://www.postgresql.org/) or on [PEPhub](https://pephub.databio.org/). 
The results are recorded according to the pipestat specification, in a standard, pipeline-agnostic way. This way, downstream software can use this specification to create universal tools for analyzing, monitoring, and visualizing pipeline results that will work with any pipeline or workflow. - -## Installing pipestat - -### Minimal install for file backend - -Install pipestat from PyPI with `pip`: - -``` -pip install pipestat -``` - -Confirm installation by calling `pipestat -h` on the command line. If the `pipestat` executable is not in your `$PATH`, append this to your `.bashrc` or `.profile` (or `.bash_profile` on macOS): - -```console -export PATH=~/.local/bin:$PATH -``` - -### Optional dependencies for database backend - -Pipestat can use either a file or a database as the backend for recording results. The default installation only provides file backend. To install dependencies required for the database backend: - -``` -pip install pipestat['dbbackend'] -``` - -### Optional dependencies for pipestat reader - -To install dependencies for the included `pipestatreader` submodule: - -``` -pip install pipestat['pipestatreader'] -``` - -## Set environment variables - - - -```console -export PIPESTAT_RESULTS_SCHEMA=output_schema.yaml -export PIPESTAT_RECORD_IDENTIFIER=my_record -export PIPESTAT_RESULTS_FILE=results_file.yaml -``` - -When setting environment variables like this, you will need to provide an `output_schema.yaml` file in your current working directory with the following example data: - -```yaml -title: An example Pipestat output schema -description: A pipeline using pipestat to report sample and project results. -type: object -properties: - pipeline_name: "default_pipeline_name" - samples: - type: object - properties: - result_name: - type: string - description: "ResultName" -``` - -## Pipeline results reporting and retrieval - -These examples assume the above environment variables are set. - -### Command-line usage - -```console -# Report a result: -pipestat report -i result_name -v 1.1 - -# Retrieve the result: -pipestat retrieve -r my_record -``` - -### Python usage - -```python -import pipestat - -# Report a result -psm = pipestat.PipestatManager() -psm.report(values={"result_name": 1.1}) - -# Retrieve a result -psm = pipestat.PipestatManager() -psm.retrieve_one(result_identifier="result_name") -``` - -## Pipeline status management - -### From command line: - - - -```console -# Set status -pipestat status set running - -# Get status -pipestat status get -``` - -### Python usage - - -```python -import pipestat - -# Set status -psm = pipestat.PipestatManager() -psm.set_status(status_identifier="running") - -# Get status -psm = pipestat.PipestatManager() -psm.get_status() -``` +## Quick start +Check out the [quickstart guide](./code/api-quickstar.md). See [API Usage](./code/python-tutorial.md) and [CLI Usage](./code/cli.md). diff --git a/docs/pipestat/backends.md b/docs/pipestat/backends.md new file mode 100644 index 00000000..e03c8e75 --- /dev/null +++ b/docs/pipestat/backends.md @@ -0,0 +1,79 @@ +# Back-end types + + +The pipestat specification describes three backend types for storing results: a [YAML-formatted file](https://yaml.org/spec/1.2/spec.html), a [PostgreSQL database](https://www.postgresql.org/) or reporting results to [PEPhub](https://pephub.databio.org/). This flexibility makes pipestat useful for a wide variety of use cases. Some users just need a simple text file for smaller-scale needs, which is convenient and universal, requiring no database infrastructure. 
For larger-scale systems, a database back-end is necessary. The pipestat specification provides a layer that spans the three possibilities, so that reports can be made in the same way, regardless of which back-end is used in a particular use case. + +By using the `pipestat` package to write results, the pipeline author need not be concerned with database connections or dealing with racefree file writing, as these tasks are already implemented. The user who runs the pipeline will simply configure the pipestat backend as required. + +Both backends organize the results in a hierarchy which is *always* structured this way: + +![Result hierarchy](img/result_hierarchy.svg) + + + +## File + +The changes reported using the `report` method of `PipestatManger` will be securely written to the file. Currently only [YAML](https://yaml.org/) format is supported. + +Example: + +```python +psm = PipestatManager(results_file_path="result_file.yaml", schema_path=schema_file) +``` + +For the YAML file backend, each file represents a namespace. The file always begins with a single top-level key which indicates the namespace. Second-level keys correspond to the record identifiers; third-level keys correspond to result identifiers, which point to the reported values. The values can then be any of the allowed pipestat data types, which include both basic and advanced data types. + +```yaml +default_pipeline_name: + project: {} + sample: + sample_1: + meta: + pipestat_modified_time: '2025-10-01 12:48:58' + pipestat_created_time: '2025-10-01 12:48:58' + number_of_things: '12' +``` + +## PostgreSQL database +This option gives the user the possibility to use a fully fledged database to back `PipestatManager`. + +Example: + +```python +psm = PipestatManager(config_file="config_file.yaml", schema_path=schema_file) +``` +where the config file has the following (example) values: + +```yaml +schema_path: sample_output_schema.yaml +database: + dialect: postgresql + driver: psycopg + name: pipestat-test + user: postgres + password: pipestat-password + host: 127.0.0.1 + port: 5432 + +``` + +For the PostgreSQL backend, the name of the database is configurable and defined in the [config file](config.md) in `database.name`. The database is structured like this: + +- The namespace corresponds to the name of the table. +- The record identifier is indicated in the *unique* `record_identifier` column in that table. +- Each result is specified as a column in the table, with the column name corresponding to the result identifier +- The values in the cells for a record and result identifier correspond to the actual data values reported for the given result. + +![RDB hierarchy](img/db_hierarchy.svg) + + + +## PEP on PEPhub +This option gives the user the possibility to use [PEPhub](https://pephub.databio.org/) as a backend for results. + +```python +psm = PipestatManager(pephub_path=pephubpath, schema_path="sample_output_schema.yaml") +``` + + +All three backends *can* be configured using the config file. However, the PostgreSQL backend *must* use a config file. \ No newline at end of file diff --git a/docs/pipestat/code/python-api.md b/docs/pipestat/code/python-api.md index f89ec240..c5df5176 100644 --- a/docs/pipestat/code/python-api.md +++ b/docs/pipestat/code/python-api.md @@ -1,1062 +1,65 @@ - - - - - # Package `pipestat` Documentation -## Class `PipestatError` -Base exception type for this package - - -## Class `SamplePipestatManager` -Pipestat standardizes reporting of pipeline results and pipeline status management. 
It formalizes a way for pipeline developers and downstream tools developers to communicate -- results produced by a pipeline can easily and reliably become an input for downstream analyses. A PipestatManager object exposes an API for interacting with the results and pipeline status and can be backed by either a YAML-formatted file or a database. - - -```python -def __init__(self, **kwargs) -``` - -Initialize the PipestatManager object -#### Parameters: - -- `record_identifier` (`str`): record identifier to report for. Thiscreates a weak bound to the record, which can be overridden in this object method calls -- `schema_path` (`str`): path to the output schema that formalizesthe results structure -- `results_file_path` (`str`): YAML file to report into, if file isused as the object back-end -- `database_only` (`bool`): whether the reported data should not bestored in the memory, but only in the database -- `config_file` (`str`): path to the configuration file -- `config_dict` (`dict`): a mapping with the config file content -- `flag_file_dir` (`str`): path to directory containing flag files -- `show_db_logs` (`bool`): Defaults to False, toggles showing database logs -- `pipeline_type` (`str`): "sample" or "project" -- `pipeline_name` (`str`): name of the current pipeline, defaults to -- `result_formatter` (`str`): function for formatting result -- `multi_pipelines` (`bool`): allows for running multiple pipelines for one file backend -- `output_dir` (`str`): target directory for report generation via summarize and table generation via table. - - - - -```python -def clear_status(self, *args, **kwargs) -``` - - - -```python -def config_path(self) -``` - -Config path. None if the config was not provided or if provided as a mapping of the config contents -#### Returns: - -- `str`: path to the provided config - - - - -```python -def count_records(self, *args, **kwargs) -``` - - - -```python -def data(self) -``` - -Data object -#### Returns: - -- `yacman.YAMLConfigManager`: the object that stores the reported data - - - - -```python -def db_url(self) -``` - -Database URL, generated based on config credentials -#### Returns: - -- `str`: database URL - - -#### Raises: - -- `PipestatDatabaseError`: if the object is not backed by a database - - - - -```python -def file(self) -``` - -File path that the object is reporting the results into -#### Returns: - -- `str`: file path that the object is reporting the results into - - - - -```python -def get_status(self, *args, **kwargs) -``` - - - -```python -def highlighted_results(self) -``` - -Highlighted results -#### Returns: - -- `List[str]`: a collection of highlighted results - - - - -```python -def initialize_dbbackend(*args, **kwargs) -``` - - - -```python -def link(self, *args, **kwargs) -``` - - - -```python -def list_recent_results(self, *args, **kwargs) -``` - - - -```python -def output_dir(self) -``` - -Output directory for report and stats generation -#### Returns: - -- `str`: path to output_dir - - - - -```python -def pipeline_name(self) -``` - -Pipeline name -#### Returns: - -- `str`: Pipeline name - - - - -```python -def pipeline_type(self) -``` - -Pipeline type: "sample" or "project" -#### Returns: - -- `str`: pipeline type - - - - -```python -def project_name(self) -``` - -Project name the object writes the results to -#### Returns: - -- `str`: project name the object writes the results to - - - - -```python -def record_count(self) -``` - -Number of records reported -#### Returns: - -- `int`: number of records reported - - - +## Package 
Overview -```python -def record_identifier(self) -``` - -Pipeline type: "sample" or "project" -#### Returns: - -- `str`: pipeline type - - - - -```python -def remove(self, *args, **kwargs) -``` - - - -```python -def remove_record(self, *args, **kwargs) -``` - - - -```python -def report(self, *args, **kwargs) -``` - - - -```python -def result_schemas(self) -``` - -Result schema mappings -#### Returns: - -- `dict`: schemas that formalize the structure of each resultin a canonical jsonschema way - - - - -```python -def retrieve_one(self, *args, **kwargs) -``` - - - -```python -def schema(self) -``` - -Schema mapping -#### Returns: - -- `ParsedSchema`: schema object that formalizes the results structure - - - - -```python -def schema_path(self) -``` - -Schema path -#### Returns: - -- `str`: path to the provided schema - - - - -```python -def select_distinct(self, *args, **kwargs) -``` - - - -```python -def select_records(self, *args, **kwargs) -``` - - - -```python -def set_status(self, *args, **kwargs) -``` +The `pipestat` package standardizes reporting of pipeline results and pipeline status management. It provides a formal way for pipeline developers and downstream tools to communicate pipeline outputs and status. +### Key Features +- **Results Reporting**: Standardized API for reporting pipeline results +- **Status Management**: Track pipeline execution status +- **Backend Flexibility**: Store results in YAML files or databases +- **Schema Validation**: Validate results against defined schemas +- **Multi-pipeline Support**: Manage results from multiple pipelines -```python -def status_schema(self) -``` - -Status schema mapping -#### Returns: - -- `dict`: schema that formalizes the pipeline status structure - - - - -```python -def status_schema_source(self) -``` - -Status schema source -#### Returns: - -- `dict`: source of the schema that formalizesthe pipeline status structure - - - - -```python -def summarize(self, *args, **kwargs) -``` - - - -```python -def table(self, *args, **kwargs) -``` - - - -## Class `ProjectPipestatManager` -Pipestat standardizes reporting of pipeline results and pipeline status management. It formalizes a way for pipeline developers and downstream tools developers to communicate -- results produced by a pipeline can easily and reliably become an input for downstream analyses. A PipestatManager object exposes an API for interacting with the results and pipeline status and can be backed by either a YAML-formatted file or a database. - - -```python -def __init__(self, **kwargs) -``` - -Initialize the PipestatManager object -#### Parameters: - -- `record_identifier` (`str`): record identifier to report for. 
Thiscreates a weak bound to the record, which can be overridden in this object method calls -- `schema_path` (`str`): path to the output schema that formalizesthe results structure -- `results_file_path` (`str`): YAML file to report into, if file isused as the object back-end -- `database_only` (`bool`): whether the reported data should not bestored in the memory, but only in the database -- `config_file` (`str`): path to the configuration file -- `config_dict` (`dict`): a mapping with the config file content -- `flag_file_dir` (`str`): path to directory containing flag files -- `show_db_logs` (`bool`): Defaults to False, toggles showing database logs -- `pipeline_type` (`str`): "sample" or "project" -- `pipeline_name` (`str`): name of the current pipeline, defaults to -- `result_formatter` (`str`): function for formatting result -- `multi_pipelines` (`bool`): allows for running multiple pipelines for one file backend -- `output_dir` (`str`): target directory for report generation via summarize and table generation via table. +### Installation - - - -```python -def clear_status(self, *args, **kwargs) -``` - - - -```python -def config_path(self) -``` - -Config path. None if the config was not provided or if provided as a mapping of the config contents -#### Returns: - -- `str`: path to the provided config - - - - -```python -def count_records(self, *args, **kwargs) +```bash +pip install pipestat ``` - +### Quick Example ```python -def data(self) -``` - -Data object -#### Returns: +from pipestat import SamplePipestatManager -- `yacman.YAMLConfigManager`: the object that stores the reported data +# Initialize with a schema and results file +psm = SamplePipestatManager( + schema_path="output_schema.yaml", + results_file_path="results.yaml" +) - - - -```python -def db_url(self) +# Report a result +psm.report(record_identifier="sample1", values={"result_name": 42}) ``` -Database URL, generated based on config credentials -#### Returns: +## API Reference -- `str`: database URL - - -#### Raises: - -- `PipestatDatabaseError`: if the object is not backed by a database - - - - -```python -def file(self) -``` - -File path that the object is reporting the results into -#### Returns: - -- `str`: file path that the object is reporting the results into - - - - -```python -def get_status(self, *args, **kwargs) -``` - - - -```python -def highlighted_results(self) -``` - -Highlighted results -#### Returns: - -- `List[str]`: a collection of highlighted results - - - - -```python -def initialize_dbbackend(*args, **kwargs) -``` - - - -```python -def link(self, *args, **kwargs) -``` - - - -```python -def list_recent_results(self, *args, **kwargs) -``` - - - -```python -def output_dir(self) -``` - -Output directory for report and stats generation -#### Returns: - -- `str`: path to output_dir - - - - -```python -def pipeline_name(self) -``` - -Pipeline name -#### Returns: - -- `str`: Pipeline name - - - - -```python -def pipeline_type(self) -``` - -Pipeline type: "sample" or "project" -#### Returns: - -- `str`: pipeline type - - - - -```python -def project_name(self) -``` - -Project name the object writes the results to -#### Returns: - -- `str`: project name the object writes the results to - - - - -```python -def record_count(self) -``` - -Number of records reported -#### Returns: - -- `int`: number of records reported - - - - -```python -def record_identifier(self) -``` - -Pipeline type: "sample" or "project" -#### Returns: - -- `str`: pipeline type - - - - -```python -def remove(self, *args, **kwargs) -``` - - 
- -```python -def remove_record(self, *args, **kwargs) -``` - - - -```python -def report(self, *args, **kwargs) -``` - - - -```python -def result_schemas(self) -``` - -Result schema mappings -#### Returns: - -- `dict`: schemas that formalize the structure of each resultin a canonical jsonschema way - - - - -```python -def retrieve_one(self, *args, **kwargs) -``` - - - -```python -def schema(self) -``` - -Schema mapping -#### Returns: - -- `ParsedSchema`: schema object that formalizes the results structure - - - - -```python -def schema_path(self) -``` - -Schema path -#### Returns: - -- `str`: path to the provided schema - - - - -```python -def select_distinct(self, *args, **kwargs) -``` - - - -```python -def select_records(self, *args, **kwargs) -``` - - - -```python -def set_status(self, *args, **kwargs) -``` - - - -```python -def status_schema(self) -``` - -Status schema mapping -#### Returns: - -- `dict`: schema that formalizes the pipeline status structure - - - - -```python -def status_schema_source(self) -``` - -Status schema source -#### Returns: - -- `dict`: source of the schema that formalizesthe pipeline status structure - - - - -```python -def summarize(self, *args, **kwargs) -``` - - - -```python -def table(self, *args, **kwargs) -``` - - - -## Class `PipestatBoss` -PipestatBoss simply holds Sample or Project Managers that are child classes of PipestatManager. :param list[str] pipeline_list: list that holds pipeline types, e.g. ['sample','project'] :param str record_identifier: record identifier to report for. This creates a weak bound to the record, which can be overridden in this object method calls :param str schema_path: path to the output schema that formalizes the results structure :param str results_file_path: YAML file to report into, if file is used as the object back-end :param bool database_only: whether the reported data should not be stored in the memory, but only in the database :param str | dict config: path to the configuration file or a mapping with the config file content :param str flag_file_dir: path to directory containing flag files :param bool show_db_logs: Defaults to False, toggles showing database logs :param str pipeline_type: "sample" or "project" :param str result_formatter: function for formatting result :param bool multi_pipelines: allows for running multiple pipelines for one file backend :param str output_dir: target directory for report generation via summarize and table generation via table. - - -```python -def __init__(self, pipeline_list: Optional[list]=None, **kwargs) -``` - -Initialize self. See help(type(self)) for accurate signature. - - - -## Class `PipestatManager` -Pipestat standardizes reporting of pipeline results and pipeline status management. It formalizes a way for pipeline developers and downstream tools developers to communicate -- results produced by a pipeline can easily and reliably become an input for downstream analyses. A PipestatManager object exposes an API for interacting with the results and pipeline status and can be backed by either a YAML-formatted file or a database. 
- - -```python -def __init__(self, project_name: Optional[str]=None, record_identifier: Optional[str]=None, schema_path: Optional[str]=None, results_file_path: Optional[str]=None, database_only: Optional[bool]=True, config_file: Optional[str]=None, config_dict: Optional[dict]=None, flag_file_dir: Optional[str]=None, show_db_logs: bool=False, pipeline_type: Optional[str]=None, pipeline_name: Optional[str]=None, result_formatter: staticmethod=, multi_pipelines: bool=False, output_dir: Optional[str]=None) -``` - -Initialize the PipestatManager object -#### Parameters: - -- `record_identifier` (`str`): record identifier to report for. Thiscreates a weak bound to the record, which can be overridden in this object method calls -- `schema_path` (`str`): path to the output schema that formalizesthe results structure -- `results_file_path` (`str`): YAML file to report into, if file isused as the object back-end -- `database_only` (`bool`): whether the reported data should not bestored in the memory, but only in the database -- `config_file` (`str`): path to the configuration file -- `config_dict` (`dict`): a mapping with the config file content -- `flag_file_dir` (`str`): path to directory containing flag files -- `show_db_logs` (`bool`): Defaults to False, toggles showing database logs -- `pipeline_type` (`str`): "sample" or "project" -- `pipeline_name` (`str`): name of the current pipeline, defaults to -- `result_formatter` (`str`): function for formatting result -- `multi_pipelines` (`bool`): allows for running multiple pipelines for one file backend -- `output_dir` (`str`): target directory for report generation via summarize and table generation via table. - - - - -```python -def check_multi_results(self) -``` - - - -```python -def clear_status(self, *args, **kwargs) -``` - - - -```python -def config_path(self) -``` - -Config path. 
None if the config was not provided or if provided as a mapping of the config contents -#### Returns: - -- `str`: path to the provided config - - - - -```python -def count_records(self, *args, **kwargs) -``` - - - -```python -def data(self) -``` - -Data object -#### Returns: - -- `yacman.YAMLConfigManager`: the object that stores the reported data - - - - -```python -def db_url(self) -``` - -Database URL, generated based on config credentials -#### Returns: - -- `str`: database URL - - -#### Raises: - -- `PipestatDatabaseError`: if the object is not backed by a database - - - - -```python -def file(self) -``` - -File path that the object is reporting the results into -#### Returns: - -- `str`: file path that the object is reporting the results into - - - - -```python -def get_status(self, *args, **kwargs) -``` - - - -```python -def highlighted_results(self) -``` - -Highlighted results -#### Returns: - -- `List[str]`: a collection of highlighted results - - - - -```python -def initialize_dbbackend(*args, **kwargs) -``` - - - -```python -def initialize_filebackend(self, record_identifier, results_file_path, flag_file_dir) -``` - - - -```python -def link(self, *args, **kwargs) -``` - - - -```python -def list_recent_results(self, *args, **kwargs) -``` - - - -```python -def output_dir(self) -``` - -Output directory for report and stats generation -#### Returns: - -- `str`: path to output_dir - - - - -```python -def pipeline_name(self) -``` - -Pipeline name -#### Returns: - -- `str`: Pipeline name - - - - -```python -def pipeline_type(self) -``` - -Pipeline type: "sample" or "project" -#### Returns: - -- `str`: pipeline type - - - - -```python -def process_schema(self, schema_path) -``` - - - -```python -def project_name(self) -``` - -Project name the object writes the results to -#### Returns: - -- `str`: project name the object writes the results to - - - - -```python -def record_count(self) -``` - -Number of records reported -#### Returns: - -- `int`: number of records reported - - - - -```python -def record_identifier(self) -``` - -Pipeline type: "sample" or "project" -#### Returns: - -- `str`: pipeline type - - - - -```python -def remove(self, *args, **kwargs) -``` - - - -```python -def remove_record(self, *args, **kwargs) -``` - - - -```python -def report(self, *args, **kwargs) -``` - - - -```python -def resolve_results_file_path(self, results_file_path) -``` - -Replace {record_identifier} in results_file_path if it exists. 
-#### Parameters: - -- `results_file_path` (`str`): YAML file to report into, if file isused as the object back-end - - - - -```python -def result_schemas(self) -``` - -Result schema mappings -#### Returns: - -- `dict`: schemas that formalize the structure of each resultin a canonical jsonschema way - - - - -```python -def retrieve_history(self, record_identifier: str=None, result_identifier: Union[str, List[str], NoneType]=None) -> Union[Any, Dict[str, Any]] -``` - -Retrieve a single record's history -#### Parameters: - -- `record_identifier` (`str`): single record_identifier -- `result_identifier` (`str`): single result_identifier or list of result identifiers - - -#### Returns: - -- ``: a mapping with filtered historical results - - - - -```python -def retrieve_many(self, record_identifiers: List[str], result_identifier: Optional[str]=None) -> Union[Any, Dict[str, Any]] -``` - - -#### Parameters: - -- `record_identifiers` (``): list of record identifiers -- `result_identifier` (`str`): single record_identifier - - -#### Returns: - -- ``: a mapping with filteredresults reported for the record - - - - -```python -def retrieve_one(self, *args, **kwargs) -``` - - - -```python -def schema(self) -``` - -Schema mapping -#### Returns: - -- `ParsedSchema`: schema object that formalizes the results structure - - - - -```python -def schema_path(self) -``` - -Schema path -#### Returns: - -- `str`: path to the provided schema - - - - -```python -def select_distinct(self, *args, **kwargs) -``` - - - -```python -def select_records(self, *args, **kwargs) -``` - - - -```python -def set_status(self, *args, **kwargs) -``` - - - -```python -def status_schema(self) -``` - -Status schema mapping -#### Returns: - -- `dict`: schema that formalizes the pipeline status structure - - - - -```python -def status_schema_source(self) -``` - -Status schema source -#### Returns: - -- `dict`: source of the schema that formalizesthe pipeline status structure - - - - -```python -def summarize(self, *args, **kwargs) -``` - - - -```python -def table(self, *args, **kwargs) -``` +### SamplePipestatManager Class +The main class for managing sample-level pipeline results: +::: pipestat.SamplePipestatManager + options: + docstring_style: google + show_source: true + show_signature: true + merge_init_into_class: true +### ProjectPipestatManager Class +::: pipestat.ProjectPipestatManager + options: + docstring_style: google + show_source: true + show_signature: true + merge_init_into_class: true +### Exceptions -*Version Information: `pipestat` v0.9.2, generated by `lucidoc` v0.4.4* \ No newline at end of file +::: pipestat.PipestatError + options: + docstring_style: google + show_source: true + show_signature: true + merge_init_into_class: true diff --git a/docs/pipestat/code/python-tutorial.md b/docs/pipestat/code/python-tutorial.md index 47a82232..dd1b5557 100644 --- a/docs/pipestat/code/python-tutorial.md +++ b/docs/pipestat/code/python-tutorial.md @@ -15,7 +15,7 @@ To make your Python pipeline pipestat-compatible, you first need to initialize p ## Back-end types -Two types of back-ends are currently supported: +Three types of back-ends are currently supported: 1. a **file** (pass a file path to the constructor) The changes reported using the `report` method of `PipestatManger` will be securely written to the file. Currently only [YAML](https://yaml.org/) format is supported. @@ -23,6 +23,9 @@ The changes reported using the `report` method of `PipestatManger` will be secur 2. 
a **PostgreSQL database** (pass a path to the pipestat config to the constructor) This option gives the user the possibility to use a fully fledged database to back `PipestatManager`. +3. a **PEP on PEPhub** (pass a pep path to the constructor, e.g. `psm = PipestatManager(pephub_path=pephubpath)`) +This option gives the user the possibility to use PEPhub as a backend for results. + ## Initializing a pipestat session diff --git a/docs/pipestat/code/reporting-objects.md b/docs/pipestat/code/reporting-objects.md index 0ea22426..d39d2269 100644 --- a/docs/pipestat/code/reporting-objects.md +++ b/docs/pipestat/code/reporting-objects.md @@ -5,10 +5,6 @@ This tutorial will show you how pipestat can report not just primitive types, bu First create a `pipestat.PipestatManager` object with our example schema: -```python - -``` - ```python import pipestat @@ -93,9 +89,3 @@ psm.retrieve_one("sample1", "mydict")['toplevel']['value'] 456 - - - -```python - -``` diff --git a/docs/pipestat/configuration.md b/docs/pipestat/configuration.md index 53c94857..04f84de8 100644 --- a/docs/pipestat/configuration.md +++ b/docs/pipestat/configuration.md @@ -44,6 +44,13 @@ Beginning with v0.10.0, there is also support for reporting results directly to psm = PipestatManager(pephub_path="databio/pipestat_demo:default", schema_path=my_schema_file_path) ``` +You can also place this in the configuration file: + +```yaml +pephub_path: "databio/pipestat_demo:default" +schema_path: sample_output_schema.yaml + +``` Apart from that, there are many other *optional* configuration points that have defaults. Please refer to the [environment variables reference](http://pipestat.databio.org/en/dev/env_vars/) to learn about the the optional configuration options and their meaning. diff --git a/docs/pipestat/install.md b/docs/pipestat/install.md new file mode 100644 index 00000000..68f52fa5 --- /dev/null +++ b/docs/pipestat/install.md @@ -0,0 +1,32 @@ + +# Installing pipestat + +### Minimal install for file backend + +Install pipestat from PyPI with `pip`: + +``` +pip install pipestat +``` + +Confirm installation by calling `pipestat -h` on the command line. If the `pipestat` executable is not in your `$PATH`, append this to your `.bashrc` or `.profile` (or `.bash_profile` on macOS): + +```console +export PATH=~/.local/bin:$PATH +``` + +### Optional dependencies for database backend + +Pipestat can use either a file or a database as the backend for recording results. The default installation only provides file backend. To install dependencies required for the database backend: + +``` +pip install pipestat['dbbackend'] +``` + +### Optional dependencies for pipestat reader + +To install dependencies for the included `pipestatreader` submodule: + +``` +pip install pipestat['pipestatreader'] +``` \ No newline at end of file diff --git a/docs/pipestat/pipestat-schema.md b/docs/pipestat/pipestat-schema.md index dc90e838..95e4d00e 100644 --- a/docs/pipestat/pipestat-schema.md +++ b/docs/pipestat/pipestat-schema.md @@ -1,7 +1,168 @@ # How to write a pipestat schema +## Introduction Pipestat requires a schema, in which all the results that the pipeline can report are specified. -It is written in JSON schema +It is written in [JSON schema](https://cswr.github.io/JsonSchema/spec/basic_types/) which defines specific data types: + +### Data types + +Each *result* reported by a pipeline must have a specified data type. 
The supported basic types include: + +- string +- number +- integer +- boolean +- null + +Pipestat also extends the json schema vocabulary by adding two _additional_ types, which are common results of a pipeline: `image` and `file`. These types require reporting objects with the following attributes: + +- `file`: + - `path`: path to the reported file + - `title`: human readable description of the file +- `image`: + - `path`: path to the reported image, usually PDF + - `thumbnail`: path to the reported thumbnail, usually PNG or JPEG + - `title`: human readable description of the image + + +### Complex objects +Pipestat also supports reporting more [complex objects](./code/reporting-objects.md) + +### Unsupported data types + +`tuples` are currently unsupported for reporting and retrieving. + +## A simple example + +The pipestat output schema is a YAML-formatted file. The top level keys are the unique result identifiers. The associated values are jsonschema types. The `type` attribute is required. This is an example of a minimal component, specifying only an identifier, and its type: + +```yaml +result_identifier: + type: +``` + +Here, `result_identifier` can be whatever name you want to use to identify this result. Here's a simple schema example that showcases most of the supported types: + +```yaml +title: Example Pipestat Output Schema +description: A pipeline that uses pipestat to report sample level results. +type: object +properties: + pipeline_name: "default_pipeline_name" + samples: + type: array + properties: # result identifiers are properties of the samples object + number_of_things: + type: integer + description: "Number of things" + percentage_of_things: + type: number + description: "Percentage of things" + name_of_something: + type: string + description: "Name of something" + switch_value: + type: boolean + description: "Is the switch on or off" +``` + +The top level schema is of `type` `object`. It contains properties that define `samples`. Here, the `samples`'s properties are the results. So in the above example, the results that can be reported are: `number_of_things`,`percentage_of_things`,`name_of_something`, and `switch_value`. + +## A more complex example +Here's a more complex schema example that showcases some of the more advanced jsonschema features: + +```yaml +title: An example Pipestat output schema +description: A pipeline that uses pipestat to report sample and project level results. 
+type: object +properties: + pipeline_name: "default_pipeline_name" + samples: + type: array + items: + type: object + properties: + number_of_things: + type: integer + description: "Number of things" + percentage_of_things: + type: number + description: "Percentage of things" + name_of_something: + type: string + description: "Name of something" + switch_value: + type: boolean + description: "Is the switch on or off" + md5sum: + type: string + description: "MD5SUM of an object" + highlight: true + collection_of_images: + description: "This store collection of values or objects" + type: array + items: + properties: + prop1: + description: "This is an example file" + $ref: "#/$defs/file" + output_file_in_object: + type: object + properties: + prop1: + description: "This is an example file" + $ref: "#/$defs/file" + prop2: + description: "This is an example image" + $ref: "#/$defs/image" + description: "Object output" + output_file_in_object_nested: + type: object + description: First Level + properties: + prop1: + type: object + description: Second Level + properties: + prop2: + type: integer + description: Third Level + output_file: + $ref: "#/$defs/file" + description: "This a path to the output file" + output_image: + $ref: "#/$defs/image" + description: "This a path to the output image" +$defs: + image: + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + file: + type: object + object_type: file + properties: + path: + type: string + title: + type: string + required: + - path + - title + +``` + +In this example, we define reusable type definitions in `image` and `file`. For more details, see [pipestat specification](pipestat-specification.md). 
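To make this schema concrete, here is a minimal, hypothetical sketch of how a pipeline might report results that conform to it, using the same `report` call shown in the quick example earlier in these docs; the file paths, record identifier, and reported values are all placeholders:

```python
from pipestat import SamplePipestatManager

# Assumes the schema above has been saved as output_schema.yaml (placeholder paths)
psm = SamplePipestatManager(
    schema_path="output_schema.yaml",
    results_file_path="results.yaml",
)

# A plain integer result
psm.report(record_identifier="sample1", values={"number_of_things": 250})

# A `file`-type result is reported as an object carrying the attributes
# marked as required in the corresponding $defs entry (path, title)
psm.report(
    record_identifier="sample1",
    values={"output_file": {"path": "aligned.bam", "title": "Aligned reads"}},
)
```

The schema is what pipestat validates these reported values against, so every result identifier used in `report` should appear under `properties` in the schema.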
\ No newline at end of file diff --git a/docs/pipestat/pipestat-specification.md b/docs/pipestat/pipestat-specification.md index 03176472..1641c903 100644 --- a/docs/pipestat/pipestat-specification.md +++ b/docs/pipestat/pipestat-specification.md @@ -70,20 +70,22 @@ type: object properties: pipeline_name: "default_pipeline_name" samples: - type: object - properties: # result identifiers are properties of the samples object - number_of_things: - type: integer - description: "Number of things" - percentage_of_things: - type: number - description: "Percentage of things" - name_of_something: - type: string - description: "Name of something" - switch_value: - type: boolean - description: "Is the switch on or off" + type: array + items: + type: object + properties: # result identifiers are properties of the samples object + number_of_things: + type: integer + description: "Number of things" + percentage_of_things: + type: number + description: "Percentage of things" + name_of_something: + type: string + description: "Name of something" + switch_value: + type: boolean + description: "Is the switch on or off" ``` Here's a more complex schema example that showcases some of the more advanced jsonschema features: diff --git a/docs/pipestat/report_statuses.md b/docs/pipestat/report_statuses.md new file mode 100644 index 00000000..1ba330e8 --- /dev/null +++ b/docs/pipestat/report_statuses.md @@ -0,0 +1,50 @@ +# Reporting record identifier/sample statuses + +Ensure you set a directory to contain the status files if using a file backend: + +```python +psm = PipestatManager(results_file_path="results.yaml",schema_path="output_schema.yaml", flag_file_dir="./flags/") + +``` + +Now, when running your pipeline you can simply set the status based of the current record: + +```python +psm.set_status(record_identifier="sample1", status_identifier="completed") +``` + +All statuses are defined in the schemas/status_schema.yaml file: + +```yaml +running: + description: "the pipeline is running" + color: [30, 144, 255] # dodgerblue +completed: + description: "the pipeline has completed" + color: [50, 205, 50] # limegreen +failed: + description: "the pipeline has failed" + color: [220, 20, 60] # crimson +waiting: + description: "the pipeline is waiting" + color: [240, 230, 140] # khaki +partial: + description: "the pipeline stopped before completion point" + color: [169, 169, 169] # darkgray + +``` + + +### Coming from Looper? Make sure to set this flag directory in the looper config file: + +``` +pep_config: ./metadata/pep_config.yaml # pephub registry path or local path +output_dir: ./results +pipeline_interfaces: + - pipeline/pipeline_interface.yaml +pipestat: + project_name: count_lines + results_file_path: results.yaml + flag_file_dir: results/flags + +``` \ No newline at end of file diff --git a/docs/pipestat/results_records.md b/docs/pipestat/results_records.md new file mode 100644 index 00000000..787c6733 --- /dev/null +++ b/docs/pipestat/results_records.md @@ -0,0 +1,20 @@ +# Terminology + +Key concepts when using pipestat: + +### Samples/Records: + +- *record identifier*. An identifier for a particular pipeline run, such as a sample name. If you are using pipestat in tandem with looper, record_identifier = sample_name. + +### Results: + +- *result identifier*. The name of a result, such as `aligned_read_count` or `duplication_rate`. +- *result*: An element produced by a pipeline. Results have defined data types, described herein. +- *value*. The actual data for an output result for a given record. 
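For example, here is a hypothetical sketch (placeholder file names, assuming an output schema that declares `aligned_read_count`) of how these terms map onto the Python API:

```python
from pipestat import SamplePipestatManager

# Placeholder schema and results files
psm = SamplePipestatManager(schema_path="output_schema.yaml", results_file_path="results.yaml")

# "sample1" is the record identifier; "aligned_read_count" is the result identifier;
# 2000000 is the value reported for that result.
psm.report(record_identifier="sample1", values={"aligned_read_count": 2000000})

# Retrieve the value back for the same record and result
psm.retrieve_one("sample1", "aligned_read_count")
```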
+ + +### Misc: + +- *namespace*: A way to group results that belong together. In the api, this is referenced via `pipeline_name`. This is typically an identifier for a particular pipeline, like `rnaseq-pipeline`. All results from this pipeline will share this namespace. +- *pipestat specification*: the way to structure a set of results stored from one or more pipeline runs. +- *backend*. The technology underlying the result storage, which can be either a simple file or a database. diff --git a/docs/pipestat/summarize.md b/docs/pipestat/summarize.md new file mode 100644 index 00000000..751257a9 --- /dev/null +++ b/docs/pipestat/summarize.md @@ -0,0 +1,26 @@ +# Sharing reported results + +Pipestat currently has the ability to create HTML reports of reported pipeline results. + + +```python + +from pipestat import PipestatManager + +psm = PipestatManager(schema_path="sample_output_schema.yaml", results_file_path="my_results.yaml") + +psm.summarize(output_dir="/home/output_dir") + +# You can also create a portable version to share via email etc +psm.summarize(output_dir="/home/output_dir",portable= True ) + +``` + + +Similarly this can be accomplished via the CLI: + +```shell + +pipestat summarize --results-file my_results.yaml --schema output_schema.yaml --portable + +``` \ No newline at end of file diff --git a/docs/pipestat/usage.md b/docs/pipestat/usage.md index 4ce18568..38ea24b2 100644 --- a/docs/pipestat/usage.md +++ b/docs/pipestat/usage.md @@ -5,16 +5,13 @@ Pipestat offers a CLI that can be access via the `pipestat` command in the shell Here you can see the command-line usage instructions for the main command and for each subcommand: ## `pipestat --help` ```console -Configure by setting PIPESTAT_CONFIG env var -version: 0.6.0 -usage: pipestat [-h] [--version] [--silent] [--verbosity V] [--logdev] - {report,inspect,remove,retrieve,status,init,summarize,link,serve} - ... +version: 0.12.1 +usage: pipestat [-h] [--version] [--silent] [--verbosity V] [--logdev] {report,inspect,remove,retrieve,status,init,summarize,link,serve,history} ... pipestat - report pipeline results positional arguments: - {report,inspect,remove,retrieve,status,init,summarize,link,serve} + {report,inspect,remove,retrieve,status,init,summarize,link,serve,history} report Report a result. inspect Inspect a database. remove Remove a result. @@ -24,6 +21,7 @@ positional arguments: summarize Generates HTML Report link Create symlinks of reported files serve Initializes pipestatreader API + history Retrieve history of reported results for one record identifier options: -h, --help show this help message and exit @@ -32,17 +30,15 @@ options: --verbosity V Set logging level (1-5 or logging module level name) --logdev Expand content of logging message format. -Pipestat standardizes reporting of pipeline results and pipeline status -management. It formalizes a way for pipeline developers and downstream tools -developers to communicate -- results produced by a pipeline can easily and -reliably become an input for downstream analyses. A PipestatManager object -exposes an API for interacting with the results and pipeline status and can be -backed by either a YAML-formatted file or a database. +Pipestat standardizes reporting of pipeline results and pipeline status management. +It formalizes a way for pipeline developers and downstream tools developers to communicate +-- results produced by a pipeline can easily andreliably become an input for downstream analyses. 
+A PipestatManager object exposes an API for interacting with the results and pipeline status and +can be backed by either a YAML-formatted file or a database. ``` ## `pipestat report --help` ```console -Configure by setting PIPESTAT_CONFIG env var usage: pipestat report [-h] [-n N] [-f F] [-c C] [-a] [-s S] [--status-schema ST] [--flag-dir FD] [-p P] -i I [-r R] -v V [-o] [-t] @@ -83,7 +79,6 @@ options: ## `pipestat inspect --help` ```console -Configure by setting PIPESTAT_CONFIG env var usage: pipestat inspect [-h] [-n N] [-f F] [-c C] [-a] [-s S] [--status-schema ST] [--flag-dir FD] [-p P] [-d] @@ -115,7 +110,6 @@ options: ## `pipestat remove --help` ```console -Configure by setting PIPESTAT_CONFIG env var usage: pipestat remove [-h] [-n N] [-f F] [-c C] [-a] [-s S] [--status-schema ST] [--flag-dir FD] [-p P] -i I [-r R] @@ -151,7 +145,6 @@ options: ## `pipestat retrieve --help` ```console -Configure by setting PIPESTAT_CONFIG env var usage: pipestat retrieve [-h] [-n N] [-f F] [-c C] [-a] [-s S] [--status-schema ST] [--flag-dir FD] [-p P] [-r R] @@ -185,7 +178,6 @@ options: ## `pipestat status --help` ```console -Configure by setting PIPESTAT_CONFIG env var usage: pipestat status [-h] {set,get} ... Manage pipeline status. @@ -201,7 +193,6 @@ options: ## `pipestat status get --help` ```console -Configure by setting PIPESTAT_CONFIG env var usage: pipestat status get [-h] [-n N] [-f F] [-c C] [-a] [-s S] [--status-schema ST] [--flag-dir FD] [-r R] [-p P] @@ -235,7 +226,6 @@ options: ## `pipestat status set --help` ```console -Configure by setting PIPESTAT_CONFIG env var usage: pipestat status set [-h] [-n N] [-f F] [-c C] [-a] [-s S] [--status-schema ST] [--flag-dir FD] [-r R] [-p P] status_identifier diff --git a/docs/pypiper/cli.md b/docs/pypiper/cli.md index 67537d5a..0764e9f5 100644 --- a/docs/pypiper/cli.md +++ b/docs/pypiper/cli.md @@ -49,7 +49,7 @@ With that said, there are a few universal (Pypiper-added) options that are frequ ## Customizing `add_pypiper_args()` -There are two ways to modulate the arguments added by `add_pypiper_args()` function: the `groups` argument, which lets you add argument groups; or the `args` argument, which lets you add arguments indvidually. By default, `add_pypiper_args()` add all arguments listed in the `pypiper` group. You may instead pass a list of one or more of these groups of arguments (to `groups`) or individual arguments (to `args`) to customize exactly the set of built-in options your pipeline implements. +There are two ways to modulate the arguments added by `add_pypiper_args()` function: the `groups` argument, which lets you add argument groups; or the `args` argument, which lets you add arguments individually. By default, `add_pypiper_args()` add all arguments listed in the `pypiper` group. You may instead pass a list of one or more of these groups of arguments (to `groups`) or individual arguments (to `args`) to customize exactly the set of built-in options your pipeline implements. For example, `parser.add_pypiper_args(parser, groups=['pypiper', 'common'])` will add all arguments listed under `pypiper` and `common` below: diff --git a/docs/pypiper/code/basic-pipeline.md b/docs/pypiper/code/basic-pipeline.md index 35e7c3d1..df7ff381 100644 --- a/docs/pypiper/code/basic-pipeline.md +++ b/docs/pypiper/code/basic-pipeline.md @@ -8,7 +8,7 @@ Pypiper is simple but powerful. Your pipeline is a python script, let's call it * `PipelineManager.run()`: The primary workhorse function; runs a command. 
* `PipelineManager.stop_pipeline()`: Terminate the pipeline. -That's all you need to create a powerful pipeline. You can find in-depth reference documentation for each method in the API. In particular, most of Pypiper’s power comes from the `run` method, which has a series of options outlined in [dedicated documentation on the run method](../advanced-run-method.md). +That's all you need to create a powerful pipeline. You can find in-depth reference documentation for each method in the API. In particular, most of Pypiper’s power comes from the `run` method, which has a series of options outlined in [dedicated documentation on the run method](/advanced-run-method). To write your first basic pipeline, first `import pypiper`, then specify an output folder and create a new `PipelineManager` object: diff --git a/docs/pypiper/code/hello-world.md b/docs/pypiper/code/hello-world.md index 941f873d..fe5c59da 100644 --- a/docs/pypiper/code/hello-world.md +++ b/docs/pypiper/code/hello-world.md @@ -111,7 +111,7 @@ This output is printed to your screen and also recorded in a log file (called `` * output.txt * stats.tsv -These files are explained in more detail in the reference section [outputs explained](../outputs.md). +These files are explained in more detail in the reference section [outputs explained](outputs). What's next? That depends on if you're interested in just *running* pypiper pipelines, or if you want to *develop* pypiper pipelines. The next sections are a series of HOW-TO articles that address each of these scenarios. diff --git a/docs/pypiper/code/ngstk-api.md b/docs/pypiper/code/ngstk-api.md index 01b7042e..4159b655 100644 --- a/docs/pypiper/code/ngstk-api.md +++ b/docs/pypiper/code/ngstk-api.md @@ -1,1014 +1,44 @@ - +# NGSTk API Documentation - +NGSTk (Next-Generation Sequencing Toolkit) is a toolkit class that provides helper functions for building command strings used in NGS pipelines. It can be configured with a YAML configuration file to specify custom tool paths, or it will use tools from the system PATH. +### Key Features -# Package `pypiper` Documentation +- **Command Building**: Generate command strings for common NGS tools +- **Configuration Management**: Use custom tool paths via YAML config +- **Tool Integration**: Built-in support for common tools like samtools, bedtools, etc. +- **Pipeline Integration**: Works seamlessly with PipelineManager -## Class `NGSTk` -Class to hold functions to build command strings used during pipeline runs. Object can be instantiated with a string of a path to a yaml `pipeline config file`. Since NGSTk inherits from `AttMapEcho`, the passed config file and its elements will be accessible through the NGSTk object as attributes under `config` (e.g. `NGSTk.tools.java`). In case no `config_file` argument is passed, all commands will be returned assuming the tool is in the user's $PATH. +### Installation -#### Parameters: +NGSTk is included with pypiper: -- `config_file` (`str`): Path to pipeline yaml config file (optional). -- `pm` (`pypiper.PipelineManager`): A PipelineManager with which to associate this toolkit instance;that is, essentially a source from which to grab paths to tools, resources, etc. 
- - -#### Examples: - -```console - from pypiper.ngstk import NGSTk as tk - tk = NGSTk() - tk.samtools_index("sample.bam") - # returns: samtools index sample.bam - - # Using a configuration file (custom executable location): - from pypiper.ngstk import NGSTk - tk = NGSTk("pipeline_config_file.yaml") - tk.samtools_index("sample.bam") - # returns: /home/.local/samtools/bin/samtools index sample.bam -``` - - -```python -def __init__(self, config_file=None, pm=None) -``` - -Initialize self. See help(type(self)) for accurate signature. - - - -```python -def add_track_to_hub(self, sample_name, track_url, track_hub, colour, five_prime='') -``` - - - -```python -def bam2fastq(self, input_bam, output_fastq, output_fastq2=None, unpaired_fastq=None) -``` - -Create command to convert BAM(s) to FASTQ(s). -#### Parameters: - -- `input_bam` (`str`): Path to sequencing reads file to convert -- `output_fastq` (``): Path to FASTQ to write -- `output_fastq2` (``): Path to (R2) FASTQ to write -- `unpaired_fastq` (``): Path to unpaired FASTQ to write - - -#### Returns: - -- `str`: Command to convert BAM(s) to FASTQ(s) - - - - -```python -def bam_conversions(self, bam_file, depth=True) -``` - -Sort and index bam files for later use. -#### Parameters: - -- `depth` (`bool`): also calculate coverage over each position - - - - -```python -def bam_to_bed(self, input_bam, output_bed) -``` - - - -```python -def bam_to_bigwig(self, input_bam, output_bigwig, genome_sizes, genome, tagmented=False, normalize=False, norm_factor=1000) -``` - -Convert a BAM file to a bigWig file. -#### Parameters: - -- `input_bam` (`str`): path to BAM file to convert -- `output_bigwig` (`str`): path to which to write file in bigwig format -- `genome_sizes` (`str`): path to file with chromosome size information -- `genome` (`str`): name of genomic assembly -- `tagmented` (`bool`): flag related to read-generating protocol -- `normalize` (`bool`): whether to normalize coverage -- `norm_factor` (`int`): number of bases to use for normalization - - -#### Returns: - -- `list[str]`: sequence of commands to execute - - - - -```python -def bam_to_fastq(self, bam_file, out_fastq_pre, paired_end) -``` - -Build command to convert BAM file to FASTQ file(s) (R1/R2). -#### Parameters: - -- `bam_file` (`str`): path to BAM file with sequencing reads -- `out_fastq_pre` (`str`): path prefix for output FASTQ file(s) -- `paired_end` (`bool`): whether the given file contains paired-endor single-end sequencing reads - - -#### Returns: - -- `str`: file conversion command, ready to run - - - - -```python -def bam_to_fastq_awk(self, bam_file, out_fastq_pre, paired_end, zipmode=False) -``` - -This converts bam file to fastq files, but using awk. As of 2016, this is much faster than the standard way of doing this using Picard, and also much faster than the bedtools implementation as well; however, it does no sanity checks and assumes the reads (for paired data) are all paired (no singletons), in the correct order. -#### Parameters: - -- `zipmode` (`bool`): Should the output be zipped? - - - - -```python -def bam_to_fastq_bedtools(self, bam_file, out_fastq_pre, paired_end) -``` - -Converts bam to fastq; A version using bedtools - - - -```python -def bowtie2_map(self, input_fastq1, output_bam, log, metrics, genome_index, max_insert, cpus, input_fastq2=None) -``` - - - -```python -def calc_frip(self, input_bam, input_bed, threads=4) -``` - -Calculate fraction of reads in peaks. 
- -A file of with a pool of sequencing reads and a file with peak call -regions define the operation that will be performed. Thread count -for samtools can be specified as well. -#### Parameters: - -- `input_bam` (`str`): sequencing reads file -- `input_bed` (`str`): file with called peak regions -- `threads` (`int`): number of threads samtools may use - - -#### Returns: - -- `float`: fraction of reads in peaks defined in given peaks file - - - - -```python -def calculate_frip(self, input_bam, input_bed, output, cpus=4) -``` - - - -```python -def center_peaks_on_motifs(self, peak_file, genome, window_width, motif_file, output_bed) -``` - - - -```python -def check_command(self, command) -``` - -Check if command can be called. - - - -```python -def check_fastq(self, input_files, output_files, paired_end) -``` - -Returns a follow sanity-check function to be run after a fastq conversion. Run following a command that will produce the fastq files. - -This function will make sure any input files have the same number of reads as the -output files. - - - -```python -def check_trim(self, trimmed_fastq, paired_end, trimmed_fastq_R2=None, fastqc_folder=None) -``` - -Build function to evaluate read trimming, and optionally run fastqc. - -This is useful to construct an argument for the 'follow' parameter of -a PipelineManager's 'run' method. -#### Parameters: - -- `trimmed_fastq` (`str`): Path to trimmed reads file. -- `paired_end` (`bool`): Whether the processing is being done withpaired-end sequencing data. -- `trimmed_fastq_R2` (`str`): Path to read 2 file for the paired-end case. -- `fastqc_folder` (`str`): Path to folder within which to place fastqcoutput files; if unspecified, fastqc will not be run. - - -#### Returns: - -- `callable`: Function to evaluate read trimming and possibly runfastqc. - - - - -```python -def count_concordant(self, aligned_bam) -``` - -Count only reads that "aligned concordantly exactly 1 time." -#### Parameters: - -- `aligned_bam` (`str`): File for which to count mapped reads. - - - - -```python -def count_fail_reads(self, file_name, paired_end) -``` - -Counts the number of reads that failed platform/vendor quality checks. -#### Parameters: - -- `paired_end` (``): This parameter is ignored; samtools automatically correctly responds dependingon the data in the bamfile. We leave the option here just for consistency, since all the other counting functions require the parameter. This makes it easier to swap counting functions during pipeline development. - - - - -```python -def count_flag_reads(self, file_name, flag, paired_end) -``` - -Counts the number of reads with the specified flag. -#### Parameters: - -- `file_name` (`str`): name of reads file -- `flag` (`str`): sam flag value to be read -- `paired_end` (`bool`): This parameter is ignored; samtools automatically correctly responds dependingon the data in the bamfile. We leave the option here just for consistency, since all the other counting functions require the parameter. This makes it easier to swap counting functions during pipeline development. - - - - -```python -def count_lines(self, file_name) -``` - -Uses the command-line utility wc to count the number of lines in a file. For MacOS, must strip leading whitespace from wc. -#### Parameters: - -- `file_name` (`str`): name of file whose lines are to be counted - - - - -```python -def count_lines_zip(self, file_name) -``` - -Uses the command-line utility wc to count the number of lines in a file. For MacOS, must strip leading whitespace from wc. 
For compressed files. -#### Parameters: - -- `file` (``): file_name - - - - -```python -def count_mapped_reads(self, file_name, paired_end) -``` - -Mapped_reads are not in fastq format, so this one doesn't need to accommodate fastq, and therefore, doesn't require a paired-end parameter because it only uses samtools view. Therefore, it's ok that it has a default parameter, since this is discarded. -#### Parameters: - -- `file_name` (`str`): File for which to count mapped reads. -- `paired_end` (`bool`): This parameter is ignored; samtools automatically correctly responds dependingon the data in the bamfile. We leave the option here just for consistency, since all the other counting functions require the parameter. This makes it easier to swap counting functions during pipeline development. - - -#### Returns: - -- `int`: Either return code from samtools view command, or -1 to indicate an error state. - - - - -```python -def count_multimapping_reads(self, file_name, paired_end) -``` - -Counts the number of reads that mapped to multiple locations. Warning: currently, if the alignment software includes the reads at multiple locations, this function will count those more than once. This function is for software that randomly assigns, but flags reads as multimappers. -#### Parameters: - -- `file_name` (`str`): name of reads file -- `paired_end` (``): This parameter is ignored; samtools automatically correctly responds dependingon the data in the bamfile. We leave the option here just for consistency, since all the other counting functions require the parameter. This makes it easier to swap counting functions during pipeline development. - - - - -```python -def count_reads(self, file_name, paired_end) -``` - -Count reads in a file. - -Paired-end reads count as 2 in this function. -For paired-end reads, this function assumes that the reads are split -into 2 files, so it divides line count by 2 instead of 4. -This will thus give an incorrect result if your paired-end fastq files -are in only a single file (you must divide by 2 again). -#### Parameters: - -- `file_name` (`str`): Name/path of file whose reads are to be counted. -- `paired_end` (`bool`): Whether the file contains paired-end reads. - - - - -```python -def count_unique_mapped_reads(self, file_name, paired_end) -``` - -For a bam or sam file with paired or or single-end reads, returns the number of mapped reads, counting each read only once, even if it appears mapped at multiple locations. -#### Parameters: - -- `file_name` (`str`): name of reads file -- `paired_end` (`bool`): True/False paired end data - - -#### Returns: - -- `int`: Number of uniquely mapped reads. - - - - -```python -def count_unique_reads(self, file_name, paired_end) -``` - -Sometimes alignment software puts multiple locations for a single read; if you just count those reads, you will get an inaccurate count. This is _not_ the same as multimapping reads, which may or may not be actually duplicated in the bam file (depending on the alignment software). This function counts each read only once. This accounts for paired end or not for free because pairs have the same read name. In this function, a paired-end read would count as 2 reads. - - - -```python -def count_uniquelymapping_reads(self, file_name, paired_end) -``` - -Counts the number of reads that mapped to a unique position. -#### Parameters: - -- `file_name` (`str`): name of reads file -- `paired_end` (`bool`): This parameter is ignored. 
- - - - -```python -def fastqc(self, file, output_dir) -``` - -Create command to run fastqc on a FASTQ file -#### Parameters: - -- `file` (`str`): Path to file with sequencing reads -- `output_dir` (`str`): Path to folder in which to place output - - -#### Returns: - -- `str`: Command with which to run fastqc - - - - -```python -def fastqc_rename(self, input_bam, output_dir, sample_name) -``` - -Create pair of commands to run fastqc and organize files. - -The first command returned is the one that actually runs fastqc when -it's executed; the second moves the output files to the output -folder for the sample indicated. -#### Parameters: - -- `input_bam` (`str`): Path to file for which to run fastqc. -- `output_dir` (`str`): Path to folder in which fastqc output will bewritten, and within which the sample's output folder lives. -- `sample_name` (`str`): Sample name, which determines subfolder withinoutput_dir for the fastqc files. - - -#### Returns: - -- `list[str]`: Pair of commands, to run fastqc and then move the files totheir intended destination based on sample name. - - - - -```python -def filter_peaks_mappability(self, peaks, alignability, filtered_peaks) -``` - - - -```python -def filter_reads(self, input_bam, output_bam, metrics_file, paired=False, cpus=16, Q=30) -``` - -Remove duplicates, filter for >Q, remove multiple mapping reads. For paired-end reads, keep only proper pairs. - - - -```python -def genome_wide_coverage(self, input_bam, genome_windows, output) -``` - - - -```python -def get_chrs_from_bam(self, file_name) -``` - -Uses samtools to grab the chromosomes from the header that are contained in this bam file. - - - -```python -def get_file_size(self, filenames) -``` - -Get size of all files in string (space-separated) in megabytes (Mb). -#### Parameters: - -- `filenames` (`str`): a space-separated string of filenames - - - - -```python -def get_fragment_sizes(self, bam_file) -``` - - - -```python -def get_frip(self, sample) -``` - -Calculates the fraction of reads in peaks for a given sample. -#### Parameters: - -- `sample` (`pipelines.Sample`): Sample object with "peaks" attribute. - - - - -```python -def get_input_ext(self, input_file) -``` - -Get the extension of the input_file. Assumes you're using either .bam or .fastq/.fq or .fastq.gz/.fq.gz. - - - -```python -def get_mitochondrial_reads(self, bam_file, output, cpus=4) -``` - - - -```python -def get_peak_number(self, sample) -``` - -Counts number of peaks from a sample's peak file. -#### Parameters: - -- `sample` (`pipelines.Sample`): Sample object with "peaks" attribute. - - - - -```python -def get_read_type(self, bam_file, n=10) -``` - -Gets the read type (single, paired) and length of bam file. -#### Parameters: - -- `bam_file` (`str`): Bam file to determine read attributes. -- `n` (`int`): Number of lines to read from bam file. - - -#### Returns: - -- `str, int`: tuple of read type and read length - - - - -```python -def homer_annotate_pPeaks(self, peak_file, genome, motif_file, output_bed) -``` - - - -```python -def homer_find_motifs(self, peak_file, genome, output_dir, size=150, length='8,10,12,14,16', n_motifs=12) -``` - - - -```python -def htseq_count(self, input_bam, gtf, output) -``` - - - -```python -def index_bam(self, input_bam) -``` - - - -```python -def input_to_fastq(self, input_file, sample_name, paired_end, fastq_folder, output_file=None, multiclass=False, zipmode=False) -``` - -Builds a command to convert input file to fastq, for various inputs. 
- -Takes either .bam, .fastq.gz, or .fastq input and returns -commands that will create the .fastq file, regardless of input type. -This is useful to made your pipeline easily accept any of these input -types seamlessly, standardizing you to fastq which is still the -most common format for adapter trimmers, etc. You can specify you want -output either zipped or not. -Commands will place the output fastq file in given `fastq_folder`. -#### Parameters: - -- `input_file` (`str`): filename of input you want to convert to fastq -- `multiclass` (`bool`): Are both read1 and read2 included in a singlefile? User should not need to set this; it will be inferred and used in recursive calls, based on number files, and the paired_end arg. -- `zipmode` (`bool`): Should the output be .fastq.gz? Otherwise, just fastq - - -#### Returns: - -- `str`: A command (to be run with PipelineManager) that will ensureyour fastq file exists. - - - - -```python -def kallisto(self, input_fastq, output_dir, output_bam, transcriptome_index, cpus, input_fastq2=None, size=180, b=200) -``` - - - -```python -def link_to_track_hub(self, track_hub_url, file_name, genome) -``` - - - -```python -def macs2_call_peaks(self, treatment_bams, output_dir, sample_name, genome, control_bams=None, broad=False, paired=False, pvalue=None, qvalue=None, include_significance=None) -``` - -Use MACS2 to call peaks. -#### Parameters: - -- `treatment_bams` (`str | Iterable[str]`): Paths to files with data toregard as treatment. -- `output_dir` (`str`): Path to output folder. -- `sample_name` (`str`): Name for the sample involved. -- `genome` (`str`): Name of the genome assembly to use. -- `control_bams` (`str | Iterable[str]`): Paths to files with data toregard as control -- `broad` (`bool`): Whether to do broad peak calling. -- `paired` (`bool`): Whether reads are paired-end -- `pvalue` (`float | NoneType`): Statistical significance measure topass as --pvalue to peak calling with MACS -- `qvalue` (`float | NoneType`): Statistical significance measure topass as --qvalue to peak calling with MACS -- `include_significance` (`bool | NoneType`): Whether to pass astatistical significance argument to peak calling with MACS; if omitted, this will be True if the peak calling is broad or if either p-value or q-value is specified; default significance specification is a p-value of 0.001 if a significance is to be specified but no value is provided for p-value or q-value. - - -#### Returns: - -- `str`: Command to run. - - - - -```python -def macs2_call_peaks_atacseq(self, treatment_bam, output_dir, sample_name, genome) -``` - - - -```python -def macs2_plot_model(self, r_peak_model_file, sample_name, output_dir) -``` - - - -```python -def make_dir(self, path) -``` - -Forge path to directory, creating intermediates as needed. -#### Parameters: - -- `path` (`str`): Path to create. - - - - -```python -def make_sure_path_exists(self, path) -``` - -Alias for make_dir - - - -```python -def mark_duplicates(self, aligned_file, out_file, metrics_file, remove_duplicates='True') -``` - - - -```python -def merge_bams(self, input_bams, merged_bam, in_sorted='TRUE', tmp_dir=None) -``` - -Combine multiple files into one. - -The tmp_dir parameter is important because on poorly configured -systems, the default can sometimes fill up. -#### Parameters: - -- `input_bams` (`Iterable[str]`): Paths to files to combine -- `merged_bam` (`str`): Path to which to write combined result. 
-- `in_sorted` (`bool | str`): Whether the inputs are sorted -- `tmp_dir` (`str`): Path to temporary directory. - - - - -```python -def merge_bams_samtools(self, input_bams, merged_bam) -``` - - - -```python -def merge_fastq(self, inputs, output, run=False, remove_inputs=False) -``` - -Merge FASTQ files (zipped or not) into one. -#### Parameters: - -- `inputs` (`Iterable[str]`): Collection of paths to files to merge. -- `output` (`str`): Path to single output file. -- `run` (`bool`): Whether to run the command. -- `remove_inputs` (`bool`): Whether to keep the original files. - - -#### Returns: - -- `NoneType | str`: Null if running the command, otherwise thecommand itself - - -#### Raises: - -- `ValueError`: Raise ValueError if the call is such thatinputs are to be deleted but command is not run. - - - - -```python -def merge_or_link(self, input_args, raw_folder, local_base='sample') -``` - -Standardizes various input possibilities by converting either .bam, .fastq, or .fastq.gz files into a local file; merging those if multiple files given. -#### Parameters: - -- `input_args` (`list`): This is a list of arguments, each one is aclass of inputs (which can in turn be a string or a list). Typically, input_args is a list with 2 elements: first a list of read1 files; second an (optional!) list of read2 files. -- `raw_folder` (`str`): Name/path of folder for the merge/link. -- `local_base` (`str`): Usually the sample name. This (plus fileextension) will be the name of the local file linked (or merged) by this function. - - - - -```python -def move_file(self, old, new) -``` - - - -```python -def parse_bowtie_stats(self, stats_file) -``` - -Parses Bowtie2 stats file, returns series with values. -#### Parameters: - -- `stats_file` (`str `): Bowtie2 output file with alignment statistics. - - - - -```python -def parse_duplicate_stats(self, stats_file) -``` - -Parses sambamba markdup output, returns series with values. -#### Parameters: - -- `stats_file` (`str`): sambamba output file with duplicate statistics. - - - - -```python -def parse_qc(self, qc_file) -``` - -Parse phantompeakqualtools (spp) QC table and return quality metrics. -#### Parameters: - -- `qc_file` (`str`): Path to phantompeakqualtools output file, whichcontains sample quality measurements. - - - - -```python -def picard_mark_duplicates(self, input_bam, output_bam, metrics_file, temp_dir='.') -``` - - - -```python -def plot_atacseq_insert_sizes(self, bam, plot, output_csv, max_insert=1500, smallest_insert=30) -``` - -Heavy inspiration from here: https://github.com/dbrg77/ATAC/blob/master/ATAC_seq_read_length_curve_fitting.ipynb - - - -```python -def preseq_coverage(self, bam_file, output_prefix) -``` - - - -```python -def preseq_curve(self, bam_file, output_prefix) -``` - - - -```python -def preseq_extrapolate(self, bam_file, output_prefix) -``` - - - -```python -def remove_file(self, file_name) -``` - - - -```python -def run_spp(self, input_bam, output, plot, cpus) -``` - -Run the SPP read peak analysis tool. -#### Parameters: - -- `input_bam` (`str`): Path to reads file -- `output` (`str`): Path to output file -- `plot` (`str`): Path to plot file -- `cpus` (`int`): Number of processors to use - - -#### Returns: - -- `str`: Command with which to run SPP - - - - -```python -def sam_conversions(self, sam_file, depth=True) -``` - -Convert sam files to bam files, then sort and index them for later use. 
-#### Parameters: - -- `depth` (`bool`): also calculate coverage over each position - - - - -```python -def sambamba_remove_duplicates(self, input_bam, output_bam, cpus=16) -``` - - - -```python -def samtools_index(self, bam_file) -``` - -Index a bam file. - - - -```python -def samtools_view(self, file_name, param, postpend='') -``` - -Run samtools view, with flexible parameters and post-processing. - -This is used internally to implement the various count_reads functions. -#### Parameters: - -- `file_name` (`str`): file_name -- `param` (`str`): String of parameters to pass to samtools view -- `postpend` (`str`): String to append to the samtools command;useful to add cut, sort, wc operations to the samtools view output. - - - - -```python -def shift_reads(self, input_bam, genome, output_bam) -``` - - - -```python -def simple_frip(self, input_bam, input_bed, threads=4) -``` - - - -```python -def skewer(self, input_fastq1, output_prefix, output_fastq1, log, cpus, adapters, input_fastq2=None, output_fastq2=None) -``` - -Create commands with which to run skewer. -#### Parameters: - -- `input_fastq1` (`str`): Path to input (read 1) FASTQ file -- `output_prefix` (`str`): Prefix for output FASTQ file names -- `output_fastq1` (`str`): Path to (read 1) output FASTQ file -- `log` (`str`): Path to file to which to write logging information -- `cpus` (`int | str`): Number of processing cores to allow -- `adapters` (`str`): Path to file with sequencing adapters -- `input_fastq2` (`str`): Path to read 2 input FASTQ file -- `output_fastq2` (`str`): Path to read 2 output FASTQ file - - -#### Returns: - -- `list[str]`: Sequence of commands to run to trim reads withskewer and rename files as desired. - - - - -```python -def slurm_footer(self) -``` - - - -```python -def slurm_header(self, job_name, output, queue='shortq', n_tasks=1, time='10:00:00', cpus_per_task=8, mem_per_cpu=2000, nodes=1, user_mail='', mail_type='end') -``` - - - -```python -def slurm_submit_job(self, job_file) -``` - - - -```python -def sort_index_bam(self, input_bam, output_bam) -``` - - - -```python -def spp_call_peaks(self, treatment_bam, control_bam, treatment_name, control_name, output_dir, broad, cpus, qvalue=None) -``` - -Build command for R script to call peaks with SPP. -#### Parameters: - -- `treatment_bam` (`str`): Path to file with data for treatment sample. -- `control_bam` (`str`): Path to file with data for control sample. -- `treatment_name` (`str`): Name for the treatment sample. -- `control_name` (`str`): Name for the control sample. -- `output_dir` (`str`): Path to folder for output. -- `broad` (`str | bool`): Whether to specify broad peak calling mode. -- `cpus` (`int`): Number of cores the script may use. -- `qvalue` (`float`): FDR, as decimal value - - -#### Returns: - -- `str`: Command to run. - - - - -```python -def topHat_map(self, input_fastq, output_dir, genome, transcriptome, cpus) -``` - - - -```python -def trimmomatic(self, input_fastq1, output_fastq1, cpus, adapters, log, input_fastq2=None, output_fastq1_unpaired=None, output_fastq2=None, output_fastq2_unpaired=None) -``` - - - -```python -def validate_bam(self, input_bam) +```bash +pip install pypiper ``` -Wrapper for Picard's ValidateSamFile. -#### Parameters: - -- `input_bam` (`str`): Path to file to validate. - - -#### Returns: - -- `str`: Command to run for the validation. 
- - - +### Quick Example ```python -def zinba_call_peaks(self, treatment_bed, control_bed, cpus, tagmented=False) -``` +from pypiper.ngstk import NGSTk +# Initialize NGSTk +tk = NGSTk() - -```python -def ziptool(self) +# Generate a command +cmd = tk.samtools_index("sample.bam") +# Returns: "samtools index sample.bam" ``` -Returns the command to use for compressing/decompressing. -#### Returns: - -- `str`: Either 'gzip' or 'pigz' if installed and multiple cores - - - - - +## API Reference +### NGSTk Class -*Version Information: `pypiper` v0.14.1, generated by `lucidoc` v0.4.4* \ No newline at end of file +::: pypiper.ngstk.NGSTk + options: + docstring_style: google + show_source: true + show_signature: true + merge_init_into_class: true diff --git a/docs/pypiper/code/python-api.md b/docs/pypiper/code/python-api.md index a90c6ca4..93e424ec 100644 --- a/docs/pypiper/code/python-api.md +++ b/docs/pypiper/code/python-api.md @@ -1,433 +1,50 @@ - - - - - # Package `pypiper` Documentation -## Class `PipelineManager` -Base class for instantiating a PipelineManager object, the main class of Pypiper. +## Package Overview -#### Parameters: +The `pypiper` package provides a framework for building robust, restartable bioinformatics pipelines. It handles common pipeline tasks like checkpointing, logging, and resource monitoring. -- `name` (`str`): Choose a name for your pipeline;it's used to name the output files, flags, etc. -- `outfolder` (`str`): Folder in which to store the results. -- `args` (`argparse.Namespace`): Optional args object from ArgumentParser;Pypiper will simply record these arguments from your script -- `multi` (`bool`): Enables running multiple pipelines in one scriptor for interactive use. It simply disables the tee of the output, so you won't get output logged to a file. -- `dirty` (`bool`): Overrides the pipeline's clean_add()manual parameters, to *never* clean up intermediate files automatically. Useful for debugging; all cleanup files are added to manual cleanup script. -- `recover` (`bool`): Specify recover mode, to overwrite lock files.If pypiper encounters a locked target, it will ignore the lock and recompute this step. Useful to restart a failed pipeline. -- `new_start` (`bool`): start over and run every command even if output exists -- `force_follow` (`bool`): Force run all follow functionseven if the preceding command is not run. By default, following functions are only run if the preceding command is run. -- `cores` (`int`): number of processors to use, default 1 -- `mem` (`str`): amount of memory to use. Default units are megabytes unlessspecified using the suffix [K|M|G|T]." -- `config_file` (`str`): path to pipeline configuration file, optional -- `output_parent` (`str`): path to folder in which output folder will live -- `overwrite_checkpoints` (`bool`): Whether to override the stage-skippinglogic provided by the checkpointing system. This is useful if the calls to this manager's run() method will be coming from a class that implements pypiper.Pipeline, as such a class will handle checkpointing logic automatically, and will set this to True to protect from a case in which a restart begins upstream of a stage for which a checkpoint file already exists, but that depends on the upstream stage and thus should be rerun if it's "parent" is rerun. 
-- `pipestat_record_identifier` (`str`): record_identifier to report results via pipestat -- `pipestat_schema` (`str`): output schema used by pipestat to report results -- `pipestat_results_file` (`str`): path to file backend for reporting results -- `pipestat_config_file` (`str`): path to pipestat configuration file -- `pipestat_pipeline_type` (`str`): Sample or Project level pipeline -- `pipestat_result_formatter` (``): function used to style reported results, defaults to result_formatter_markdown +### Key Features +- **Automatic Checkpointing**: Resume pipelines from where they left off +- **Resource Monitoring**: Track memory and CPU usage +- **Result Reporting**: Integrate with pipestat for standardized results +- **Container Support**: Run commands in Docker containers +- **Pipeline Management**: Built-in logging and status tracking -#### Raises: +### Installation -- `TypeError`: if start or stop point(s) are provided both directly andvia args namespace, or if both stopping types (exclusive/prospective and inclusive/retrospective) are provided. - - -```python -def __init__(self, name, outfolder, version=None, args=None, multi=False, dirty=False, recover=False, new_start=False, force_follow=False, cores=1, mem='1000M', config_file=None, output_parent=None, overwrite_checkpoints=False, logger_kwargs=None, pipestat_record_identifier=None, pipestat_schema=None, pipestat_results_file=None, pipestat_config=None, pipestat_pipeline_type=None, pipestat_result_formatter=None, **kwargs) +```bash +pip install pypiper ``` -Initialize self. See help(type(self)) for accurate signature. - - +### Quick Example ```python -def callprint(self, cmd, shell=None, lock_file=None, nofail=False, container=None) -``` - -Prints the command, and then executes it, then prints the memory use and return code of the command. - -Uses python's subprocess.Popen() to execute the given command. The shell argument is simply -passed along to Popen(). You should use shell=False (default) where possible, because this enables memory -profiling. You should use shell=True if you require shell functions like redirects (>) or stars (*), but this -will prevent the script from monitoring memory use. The pipes (|) will be used to split the command into -subprocesses run within python, so the memory profiling is possible. -cmd can also be a series (a dict object) of multiple commands, which will be run in succession. -#### Parameters: - -- `cmd` (`str | Iterable[str]`): Bash command(s) to be run. -- `lock_file` (`str`): a lock file name -- `nofail` (`bool`): FalseNofail can be used to implement non-essential parts of the pipeline; if these processes fail, they will not cause the pipeline to bail out. -- `shell` (`bool`): None (will tryto determine based on the command) -- `container` (``): Named Docker container in which to execute. -- `container` (``): str - - - - -```python -def checkprint(self, cmd, shell=None, nofail=False) -``` - -Just like callprint, but checks output -- so you can get a variable in python corresponding to the return value of the command you call. This is equivalent to running subprocess.check_output() instead of subprocess.call(). -#### Parameters: - -- `cmd` (`str | Iterable[str]`): Bash command(s) to be run. -- `shell` (`bool | str`): If command requires should be run in its own shell. 
Optional.Default: "guess" -- `run()` will try to guess if the command should be run in a shell (based on the presence of a pipe (|) or redirect (>), To force a process to run as a direct subprocess, set `shell` to False; to force a shell, set True. -- `nofail` (`bool`): FalseNofail can be used to implement non-essential parts of the pipeline; if these processes fail, they will not cause the pipeline to bail out. - - -#### Returns: - -- `str`: text output by the executed subprocess (check_output) - - - +from pypiper import PipelineManager -```python -def clean_add(self, regex, conditional=False, manual=False) -``` - -Add files (or regexs) to a cleanup list, to delete when this pipeline completes successfully. When making a call with run that produces intermediate files that should be deleted after the pipeline completes, you flag these files for deletion with this command. Files added with clean_add will only be deleted upon success of the pipeline. -#### Parameters: - -- `regex` (`str`): A unix-style regular expression that matches files to delete(can also be a file name). -- `conditional` (`bool`): True means the files will only be deleted if no otherpipelines are currently running; otherwise they are added to a manual cleanup script called {pipeline_name}_cleanup.sh -- `manual` (`bool`): True means the files will just be added to a manual cleanup script. - - - - -```python -def complete(self) -``` - -Stop a completely finished pipeline. - - - -```python -def critical(self, msg, *args, **kwargs) -``` - - - -```python -def debug(self, msg, *args, **kwargs) -``` - - - -```python -def error(self, msg, *args, **kwargs) -``` +# Initialize a pipeline +pm = PipelineManager( + name="my_pipeline", + outfolder="results/" +) +# Run a command +pm.run("echo 'Hello, world!'") - -```python -def fail_pipeline(self, exc: Exception, dynamic_recover: bool=False) -``` - -If the pipeline does not complete, this function will stop the pipeline gracefully. It sets the status flag to failed and skips the normal success completion procedure. -#### Parameters: - -- `exc` (`Exception`): Exception to raise. -- `dynamic_recover` (`bool`): Whether to recover e.g. for job termination. - - - - -```python -def fatal(self, msg, *args, **kwargs) -``` - - - -```python -def get_container(self, image, mounts) -``` - - - -```python -def get_elapsed_time(self) -``` - -Parse the pipeline profile file, collect the unique and last duplicated runtimes and sum them up. In case the profile is not found, an estimate is calculated (which is correct only in case the pipeline was not rerun) -#### Returns: - -- `int`: sum of runtimes in seconds - - - - -```python -def get_stat(self, key) -``` - -Returns a stat that was previously reported. This is necessary for reporting new stats that are derived from two stats, one of which may have been reported by an earlier run. For example, if you first use report_result to report (number of trimmed reads), and then in a later stage want to report alignment rate, then this second stat (alignment rate) will require knowing the first stat (number of trimmed reads); however, that may not have been calculated in the current pipeline run, so we must retrieve it from the stats.yaml output file. This command will retrieve such previously reported stats if they were not already calculated in the current pipeline run. 
-#### Parameters: - -- `key` (``): key of stat to retrieve - - - - -```python -def halt(self, checkpoint=None, finished=False, raise_error=True) -``` - -Stop the pipeline before completion point. -#### Parameters: - -- `checkpoint` (`str`): Name of stage just reached or just completed. -- `finished` (`bool`): Whether the indicated stage was just finished(True), or just reached (False) -- `raise_error` (`bool`): Whether to raise an exception to trulyhalt execution. - - - - -```python -def halted(self) -``` - -Is the managed pipeline in a paused/halted state? -#### Returns: - -- `bool`: Whether the managed pipeline is in a paused/halted state. - - - - -```python -def info(self, msg, *args, **kwargs) +# Stop the pipeline +pm.stop_pipeline() ``` +## API Reference +### PipelineManager Class -```python -def make_sure_path_exists(path) -``` - -Creates all directories in a path if it does not exist. -#### Parameters: - -- `path` (`str`): Path to create. - - -#### Raises: - -- `Exception`: if the path creation attempt hits an error with code indicating a cause other than pre-existence. - - - - -```python -def pipestat(self) -``` - -`pipestat.PipestatManager` object to use for pipeline results reporting and status management - -Depending on the object configuration it can report to -a YAML-formatted file or PostgreSQL database. Please refer to pipestat -documentation for more details: http://pipestat.databio.org/ -#### Returns: - -- `pipestat.PipestatManager`: object to use for results reporting - - - - -```python -def process_counter(self) -``` - -Increments process counter with regard to the follow state: if currently executed command is a follow function of another one, the counter is not incremented. -#### Returns: - -- `str | int`: current counter state, a number if the counter has been incremented or a number of the previous process plus "f" otherwise - - - - -```python -def remove_container(self, container) -``` - - - -```python -def report_object(self, key, filename, anchor_text=None, anchor_image=None, annotation=None, nolog=False, result_formatter=None, force_overwrite=True) -``` - -Writes a key:value pair to self.pipeline_stats_file. Note: this function will be deprecated. Using report_result is recommended. -#### Parameters: - -- `key` (`str`): name (key) of the object -- `filename` (`str`): relative path to the file (relative to parentoutput dir) -- `anchor_text` (`str`): text used as the link anchor test or caption torefer to the object. If not provided, defaults to the key. -- `anchor_image` (`str`): a path to an HTML-displayable image thumbnail(so, .png or .jpg, for example). If a path, the path should be relative to the parent output dir. -- `annotation` (`str`): By default, the figures will be annotated withthe pipeline name, so you can tell which pipeline records which figures. If you want, you can change this. -- `nolog` (`bool`): Turn on this flag to NOT print this result in thelogfile. Use sparingly in case you will be printing the result in a different format. -- `result_formatter` (`str`): function for formatting via pipestat backend -- `force_overwrite` (`bool`): overwrite results if they already exist? - - -#### Returns: - -- `str reported_result`: the reported result is returned as a list of formatted strings. - - - - -```python -def report_result(self, key, value, nolog=False, result_formatter=None, force_overwrite=True) -``` - -Writes a key:value pair to self.pipeline_stats_file. 
-#### Parameters: - -- `key` (`str`): name (key) of the stat -- `value` (`dict`): value of the stat to report. -- `nolog` (`bool`): Turn on this flag to NOT print this result in thelogfile. Use sparingly in case you will be printing the result in a different format. -- `result_formatter` (`str`): function for formatting via pipestat backend -- `force_overwrite` (`bool`): overwrite results if they already exist? - - -#### Returns: - -- `str reported_result`: the reported result is returned as a list of formatted strings. - - - - -```python -def run(self, cmd, target=None, lock_name=None, shell=None, nofail=False, clean=False, follow=None, container=None, default_return_code=0) -``` - -The primary workhorse function of PipelineManager, this runs a command. - -This is the command execution function, which enforces -race-free file-locking, enables restartability, and multiple pipelines -can produce/use the same files. The function will wait for the file -lock if it exists, and not produce new output (by default) if the -target output file already exists. If the output is to be created, -it will first create a lock file to prevent other calls to run -(for example, in parallel pipelines) from touching the file while it -is being created. It also records the memory of the process and -provides some logging output. -#### Parameters: - -- `cmd` (`str | list[str]`): Shell command(s) to be run. -- `target` (`str | Sequence[str]`): Output file(s) to produce, optional.If all target files exist, the command will not be run. If no target is given, a lock_name must be provided. -- `lock_name` (`str`): Name of lock file. Optional. -- `shell` (`bool`): If command requires should be run in its own shell.Optional. Default: None --will try to determine whether the command requires a shell. -- `nofail` (`bool`): Whether the pipeline proceed past a nonzero return froma process, default False; nofail can be used to implement non-essential parts of the pipeline; if a 'nofail' command fails, the pipeline is free to continue execution. -- `clean` (`bool`): True means the target file will be automatically addedto an auto cleanup list. Optional. -- `follow` (`callable`): Function to call after executing (each) command. -- `container` (`str`): Name for Docker container in which to run commands. -- `default_return_code` (`Any`): Return code to use, might be used to discriminatebetween runs that did not execute any commands and runs that did. - - -#### Returns: - -- `int`: Return code of process. If a list of commands is passed,this is the maximum of all return codes for all commands. - - - - -```python -def start_pipeline(self, args=None, multi=False) -``` - -Initialize setup. Do some setup, like tee output, print some diagnostics, create temp files. You provide only the output directory (used for pipeline stats, log, and status flag files). - - - -```python -def stop_pipeline(self, status='completed') -``` - -Terminate the pipeline. - -This is the "healthy" pipeline completion function. -The normal pipeline completion function, to be run by the pipeline -at the end of the script. It sets status flag to completed and records -some time and memory statistics to the log file. - - - -```python -def time_elapsed(time_since) -``` - -Returns the number of seconds that have elapsed since the time_since parameter. -#### Parameters: - -- `time_since` (`float`): Time as a float given by time.time(). 
- - - - -```python -def timestamp(self, message='', checkpoint=None, finished=False, raise_error=True) -``` - -Print message, time, and time elapsed, perhaps creating checkpoint. - -This prints your given message, along with the current time, and time -elapsed since the previous timestamp() call. If you specify a -HEADING by beginning the message with "###", it surrounds the message -with newlines for easier readability in the log file. If a checkpoint -is designated, an empty file is created corresponding to the name -given. Depending on how this manager's been configured, the value of -the checkpoint, and whether this timestamp indicates initiation or -completion of a group of pipeline steps, this call may stop the -pipeline's execution. -#### Parameters: - -- `message` (`str`): Message to timestamp. -- `checkpoint` (`str`): Name of checkpoint; this tends to be somethingthat reflects the processing logic about to be or having just been completed. Provision of an argument to this parameter means that a checkpoint file will be created, facilitating arbitrary starting and stopping point for the pipeline as desired. -- `finished` (`bool`): Whether this call represents the completion of aconceptual unit of a pipeline's processing -- `raise_error` (``): Whether to raise exception ifcheckpoint or current state indicates that a halt should occur. - - - - -```python -def warning(self, msg, *args, **kwargs) -``` - - - - - -*Version Information: `pypiper` v0.14.1, generated by `lucidoc` v0.4.4* \ No newline at end of file +::: pypiper.PipelineManager + options: + docstring_style: google + show_source: true + show_signature: true + merge_init_into_class: true diff --git a/docs/spec/howto-multi-value-attributes.md b/docs/spec/howto-multi-value-attributes.md index 39dc42e2..be7c1fe3 100644 --- a/docs/spec/howto-multi-value-attributes.md +++ b/docs/spec/howto-multi-value-attributes.md @@ -38,7 +38,7 @@ sample_table: annotation.csv subsample_table: subsample_table.csv ``` -Make sure the `sample_name` column of this table matche the `sample_name` column in your sample_table, and then include any columns that require multiple values. `PEP` will automatically include all of these values as appropriate. +Make sure the `sample_name` column of this table matches the `sample_name` column in your sample_table, and then include any columns that require multiple values. `PEP` will automatically include all of these values as appropriate. Here's a simple example of a PEP that uses subsamples. If you define `annotation.csv` like this: diff --git a/docs/ubiquerg/README.md b/docs/ubiquerg/README.md new file mode 100644 index 00000000..01daf142 --- /dev/null +++ b/docs/ubiquerg/README.md @@ -0,0 +1,22 @@ +# ubiquerg + +![Run pytests](https://github.com/pepkit/ubiquerg/workflows/Run%20pytests/badge.svg) +[![codecov](https://codecov.io/gh/pepkit/ubiquerg/branch/master/graph/badge.svg)](https://codecov.io/gh/pepkit/ubiquerg) +[![PEP compatible](https://pepkit.github.io/img/PEP-compatible-green.svg)](https://pepkit.github.io) + +Ubiquerg is a utility package with a collection of helpful universally useful functions. The name means work (erg) everywhere (ubique), indicating our intention for these to be low-level functions that can be used in lots of different places.
Functions are divided into groups, including: + +- collection +- environment +- files +- paths +- system +- web +- cli_tools + +## Development guidelines + +- Ubiquerg should have no dependencies outside of standard built-in python modules. Please do not add any functions that introduce a new dependency. +- Functions should be generic. They should perform basic, low-level processing that is not specific to a particular application. +- Functions should only be added to ubiquerg if they are used in at least 2 existing modules. +- Functions should be kept relatively small and simple (ideally <50 lines of code). diff --git a/docs/ubiquerg/changelog.md b/docs/ubiquerg/changelog.md new file mode 100644 index 00000000..a48f1bdb --- /dev/null +++ b/docs/ubiquerg/changelog.md @@ -0,0 +1,159 @@ +# Changelog + +This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. + +## [0.8.2] -- 2025-12-01 + +### Changed +- Removed veracitools as test dependency, and all other test dependencies. + +## [0.8.1] - 2025-03-05 + +### Added +- Ported deep_update from yacman + +## [0.8.0] - 2024-04-02 +### Changed +- Expanded `mkabs` function to handle more cases +- Allow `is_url` to work on Path objects +- Remove `mock` test requirement in favor of importing `unittest.mock as mock` + +## [0.7.0] - 2024-01-02 + +### Added +- Experimental support for three-locking. + + +## [0.6.3] - 2023-08-08 +### Fixed +- Incorrect read of registry path. [Issue 35](https://github.com/pepkit/ubiquerg/issues/35) + +## [0.6.2] - 2021-01-28 + +### Fixed +- `is_url` function; [Issue 32](https://github.com/pepkit/ubiquerg/issues/32) + +## [0.6.1] - 2020-07-01 + +### Changed +- file locking enhancements + +## [0.6.0] - 2020-06-23 + +### Added +- file locking utilities: + - `create_lock` + - `remove_lock` + - `make_lock_path` + - `create_file_racefree` + - `wait_for_lock` + +## [0.5.2] - 2020-05-15 +### Changed +- in `size` function, if a file is not found a warning is issued, instead of a message + +### Added +- `VersionInHelpParser` methods characterizing the instance: + - `arg_defaults` + - `dests_by_subparser` + - `subcommands` + - `top_level_args` + - `subparsers` + +## [0.5.1] - 2020-03-30 +### Added +- `uniqify` function +- support for collection of paths in `size` function + +### Fixed +- path expansion issues; [Issue #24](https://github.com/pepkit/ubiquerg/issues/24) + +### Changed + +## [0.5.0] - 2019-10-17 +### Added +- add `asciify_dict` function + +## [0.4.9] - 2019-09-17 +### Added +- add `--version` argument in `VersionInHelpParser` +- add `untar` function +- add `mkabs` function + +## [0.4.8] - 2019-08-27 +### Added +- `parse_registry_path` function. + +## [0.4.7] - 2019-08-09 +### Fixed +- `is_writable` function; [Issue 16](https://github.com/pepkit/ubiquerg/issues/16) + +## [0.4.6] - 2019-08-08 +### Added +- file/directory size checker +- `is_writable` function + +## [0.4.5] - 2019-07-01 +### Changed +- If argument to callability checker is a file, require executability; if it's a folder, it's not callable. +### Fixed +- Populate intended field in error message for bad argument to callability checker. + +## [0.4.4] - 2019-06-20 +### Added +- Command callability checker + +## [0.4.3] - 2019-06-06 +### Changed +- To avoid implicitly wrapping `input()` in `eval`, never use `2to3`. 
+ +## [0.4.2] - 2019-06-06 +### Fixed +- More robust handling of terminal interaction in `query_yes_no` + +## [0.4] - 2019-05-31 +### Added +- `checksum`, using md5 +- `query_yes_no` to facilitate binary user terminal interaction + +## [0.3] - 2019-05-29 +### Added +- `TmpEnv`: environment variable context manager + +## [0.2.1] - 2019-05-16 +### Changed +- Use `is_url` to determine slash behavior in `expandpath` + +## [0.2] - 2019-05-16 +### Added +- `is_url` + +## [0.1] - 2019-05-08 +### Changed +- Remove `ExpectContext`; see [`veracitools`](https://github.com/pepkit/veracitools) + +## [0.0.5.1] - 2019-05-08 +### Fixed +- Control exports to fix a docs build issue; see [Issue 2](https://github.com/pepkit/ubiquerg/issues/2) + +## [0.0.5] - 2019-05-08 +### Added +- `expandpath` utility for dealing with user and environment variables in paths + +## [0.0.4] - 2019-05-03 +### Added +- `ExpectContext` for uniform test execution, regardless of whether expectation is an ordinary object or an exception +### Changed +- When minimum item count exceeds pool size and/or the "pool" of items is empty, `powerset` returns an empty collection rather than a collection with a single empty element. + +## [0.0.3] - 2019-05-02 +### Added +- CLI optarg string builder (`build_cli_extra`) +- `powerset` (all subsets of a collection) + +## [0.0.2] - 2019-05-01 +## Changed +- Restrict offerings to most generic functionality. + +## [0.0.1] - 2019-04-30 +- First release version diff --git a/docs/ubiquerg/code/python-api.md b/docs/ubiquerg/code/python-api.md new file mode 100644 index 00000000..4fb9dffd --- /dev/null +++ b/docs/ubiquerg/code/python-api.md @@ -0,0 +1,77 @@ +# Package `ubiquerg` Documentation + +## Package Overview + +Ubiquerg is a utility package with a collection of helpful universally useful functions. The name means work (erg) everywhere (ubique), indicating the intention for these to be low-level functions that can be used in lots of different places. + +### Installation + +```bash +pip install ubiquerg +``` + +## API Reference + +### CLI Tools + +::: ubiquerg.cli_tools + options: + docstring_style: google + show_source: true + show_signature: true + +### Collection Utilities + +::: ubiquerg.collection + options: + docstring_style: google + show_source: true + show_signature: true + +### Environment Utilities + +::: ubiquerg.environment + options: + docstring_style: google + show_source: true + show_signature: true + +### File Operations + +::: ubiquerg.files + options: + docstring_style: google + show_source: true + show_signature: true + +### File Locking + +::: ubiquerg.file_locking + options: + docstring_style: google + show_source: true + show_signature: true + +### Path Utilities + +::: ubiquerg.paths + options: + docstring_style: google + show_source: true + show_signature: true + +### System Utilities + +::: ubiquerg.system + options: + docstring_style: google + show_source: true + show_signature: true + +### Web Utilities + +::: ubiquerg.web + options: + docstring_style: google + show_source: true + show_signature: true diff --git a/docs/yacman/README.md b/docs/yacman/README.md new file mode 100644 index 00000000..809161d3 --- /dev/null +++ b/docs/yacman/README.md @@ -0,0 +1,32 @@ +yacman
+![Run pytests](https://github.com/databio/yacman/workflows/Run%20pytests/badge.svg) +![Test locking parallel](https://github.com/databio/yacman/workflows/Test%20locking%20parallel/badge.svg) +[![codecov](https://codecov.io/gh/databio/yacman/branch/master/graph/badge.svg)](https://codecov.io/gh/databio/yacman) +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![Anaconda-Server Badge](https://anaconda.org/conda-forge/yacman/badges/version.svg)](https://anaconda.org/conda-forge/yacman) + +Yacman is a YAML configuration manager. It provides convenient tools for dealing with YAML configuration files and is part of the PEP ecosystem. Several PEP tools use yacman for managing configuration files. + +## Quick start + +```python +from yacman import YAMLConfigManager, write_lock + +# Create from a file +ym = YAMLConfigManager.from_yaml_file("config.yaml") + +# Access values +print(ym["my_key"]) + +# Update and write safely +ym["new_key"] = "new_value" +with write_lock(ym) as locked_ym: + locked_ym.rebase() + locked_ym.write() +``` + +## Documentation + +- [Tutorial](notebooks/tutorial.ipynb) - Interactive notebook with features and usage examples +- [API documentation](code/python-api.md) - Detailed API reference +- [Upgrading guide](upgrading.md) - How to upgrade from v0.x to v1.0 diff --git a/docs/yacman/changelog.md b/docs/yacman/changelog.md new file mode 100644 index 00000000..428b99fb --- /dev/null +++ b/docs/yacman/changelog.md @@ -0,0 +1,219 @@ +# Changelog + +This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. + +## [1.0.0] -- Unreleased + +### Changed +- Renamed `FutureYAMLConfigManager` to `YAMLConfigManager` (the "future" is now!) +- `FutureYAMLConfigManager` is still available as a deprecated alias with a warning (will be removed in v1.1.0) + +### Fixed +- Simplified dependencies +- Removed deprecated code (`IK` constant, `_warn_deprecated` function) +- Removed deprecated properties (`alias_dict`, `_raw_alias_dict`) + +### Removed +- jsonschema validation +- attmap support + +## [0.9.4] -- 2025-11-03 + +### Added +- Python 3.13 support + +### Fixed +- Missing import for `urlopen` from `urllib.request` +- Missing import for `Mapping` from `collections.abc` + +### Removed +- Deprecated Python < 3.7 compatibility code + +## [0.9.3] -- 2024-02-01 + +### Added +- New `FutureYAMLConfigManager` object, prep for v1. +- Improved file locking system with `read_lock` and `write_lock` context managers +- New `from_x` object construction API. + + +## [0.9.2] -- 2023-10-05 + +## Added +- new functionality for handling names of config files + +## Fixed +- bugs with selecting a config file +- bug with exiting Python on system interrupt + +## [0.9.1] -- 2023-06-15 + +## Added +- `.priority_get()` function on `YAMLConfigManager` object + +## [0.9.0] -- 2022-05-04 + +This is a transition release that is compatible with the 0.X series, and also provides new capability from 1.0, which will not be backwards-compatible + +## Added +- new `YAMLConfigManager` object, to replace YacAttMap, which will be the new interface in 1.0. + +## [0.8.4] -- 2021-12-02 +## Fixed +- a bug that prevented writing a readonly file to an external path + +## [0.8.3] -- 2021-09-20 +## Fixed +- removed use2to3 for compatibility with setuptools upgrade. 
+ +## [0.8.2] -- 2021-06-28 +## Fixed +- if file is empty, initialize its contents to an empty dict, which prevents failure +- check for previously applied path to `yaml.SafeLoader` before patching + +## [0.8.1] -- 2021-03-18 +## Fixed +- Clarified message for `__internal` key. + + +## [0.8.0] -- 2021-03-10 +### Added +- jsonschema validation support. The `YacAttMap` contents can be validated when object is constructed and on every call to the `write` method +- `__internal` key in `YacAttMap` object, which stores a `attamp.AttMap` of meta attributes. `__internal` can be accessed in clients as: `yacman.IK` + +## Deprecated +- use of the following properties, which should be accessed via `__internal` key from now on: + - `YacAttMap.file_path` + - `YacAttMap.writable` + +## [0.7.1] -- 2021-02-22 +### Added +- environment variables expansion in provided paths in `select_config` function +### Fixed +- issues with locking nonexistent files; [#41](https://github.com/databio/yacman/issues/41) + +## [0.7.0] -- 2020-08-28 +### Added +- `AliasedYacAttMap` class that supports top-level key aliases + +## [0.6.9] -- 2020-07-01 +### Changed +- improved file locking +### Removed +- possibility to provide a file path as `entries` in the `YacAttMap` constructor + +## [0.6.8] -- 2020-06-25 +### Changed +- extended lock wait time and the frequency of checks +- drop Python 2 support +### Fixed +- a problem with file locking after other process unlocked it before the timeout + + +## [0.6.7] -- 2020-02-07 +### Changed +- load_yaml function can accommodate URLs. + + +## [0.6.6] -- 2019-12-13 +### Added +- possibility to use `YacAttMap` in a context manager even if it was _not_ read from a file, but a file path attribute has been set + + +## [0.6.5] -- 2019-12-02 + +### Added +- context manager functionality to `YacAttMap` class + +### Changed +- method name: `unlock` to `make_readonly` +- `make_writable` behavior: it re-reads the source file now + +## [0.6.4] -- 2019-11-04 + +### Added +- distribute license file with the package + +## [0.6.3] -- 2019-10-22 + +### Fixed +- silent lock creation failures in case the lock directory does not exist; [#24](https://github.com/databio/yacman/issues/24) + +### Added +- `YacAttMap` properties: `file_path` and `writable` + +## [0.6.2] -- 2019-10-10 + +### Changed + +- in `select_config` always use default config file path when no valid path is determined + +## [0.6.1] -- 2019-10-08 + +### Added +- `strict_env` argument to the `select_config` function + +### Changed +- in `select_config` use the `default_config_filepath` even if no `config_env_vars` were specified + +## [0.6.0] -- 2019-10-02 + +### Added +- add support for multi-user context operation +- `writable` argument to create the object in a read-only or writable mode +- `wait_max` argument to specify the wait time for the lock before raising a `RuntimeError` +- `unlock` method +- `make_writable` method + +### Changed +- entries argument accepting a file path becomes deprecated and throws a `DeprecationWarning` and will be removed altogether in the future release + +## [0.5.2] -- 2019-08-20 + +### Changed +- Force all indexes to be strings (not floats or ints). + +## [0.5.1] -- 2019-08-02 + +### Added +- Allow providing a yaml string to constructor. + +## [0.5.0] -- 2019-06-18 + +### Added +- Improve constructor to allow either a dict or a filepath +- Make printing prettier + +## [0.4.2] -- 2019-06-18 + +### Changed +- Parameterize existence check for `select_config`. 
+
+## [0.4.1] -- 2019-06-14
+
+### Changed
+- Parameterize behavior when `select_config` filepath argument does not exist.
+
+## [0.4.0] -- 2019-06-07
+
+### Fixed
+- Fix bug when building a `YacAttMap` with a filepath in Python 2.7: [Issue 6](https://github.com/databio/yacman/issues/6)
+
+### Changed
+- Defer exception handling from `load_yaml` to client code.
+
+## [0.3.0] -- 2019-06-04
+
+### Added
+- Allow a YacAttMap to remember its own path so it can use `write` without an argument.
+
+## [0.2.0] -- 2019-05-21
+
+### Changed
+- Changed `select_load` to just `select` so you load on your own.
+
+### Fixed
+- Fixed packaging bug
+
+## [0.1.0] -- 2019-05-15
+- First functional public release of `yacman`.
diff --git a/docs/yacman/code/python-api.md b/docs/yacman/code/python-api.md
new file mode 100644
index 00000000..372b3f8e
--- /dev/null
+++ b/docs/yacman/code/python-api.md
@@ -0,0 +1,125 @@
+# Package `yacman` Documentation
+
+## Package Overview
+
+Yacman is a YAML configuration manager that provides convenience tools for dealing with YAML configuration files. It is designed for safe, concurrent access to configuration data, with file locking support and a flexible attribute-based access pattern.
+
+### Key Features
+
+- **Attribute-Based Access**: Access YAML data as object attributes
+- **File Locking**: Race-free reading and writing in multi-user contexts
+- **Flexible Construction**: Create from files, strings, or dictionaries
+- **Path Expansion**: Automatically expand environment variables and paths
+- **Alias Support**: Define custom aliases for configuration keys
+- **Context Managers**: Safe read and write operations with locking
+
+### Installation
+
+```bash
+pip install yacman
+```
+
+### Quick Example
+
+```python
+from yacman import YAMLConfigManager, write_lock
+
+# Create from a file
+ym = YAMLConfigManager.from_yaml_file("config.yaml")
+
+# Access values
+print(ym["my_key"])
+
+# Update and write safely
+ym["new_key"] = "new_value"
+with write_lock(ym) as locked_ym:
+    locked_ym.rebase()
+    locked_ym.write()
+```
+
+## API Reference
+
+### YAMLConfigManager Class
+
+The main class for managing YAML configuration files with locking support:
+
+::: yacman.YAMLConfigManager
+    options:
+      docstring_style: google
+      show_source: true
+      show_signature: true
+      merge_init_into_class: true
+
+### Context Managers
+
+Yacman provides context managers for safe file locking. These are re-exported from the `ubiquerg` package for convenience.
+
+#### `write_lock(config_manager)`
+
+Context manager for write operations with exclusive locking. Prevents other processes from reading or writing the file while you hold the lock.
+
+**Parameters:**
+- `config_manager` (YAMLConfigManager): The configuration manager instance to lock
+
+**Returns:**
+- YAMLConfigManager: The locked configuration manager
+
+**Usage:**
+```python
+from yacman import YAMLConfigManager, write_lock
+
+ym = YAMLConfigManager.from_yaml_file("config.yaml")
+ym["key"] = "value"
+
+with write_lock(ym) as locked_ym:
+    locked_ym.rebase()  # Sync with any file changes
+    locked_ym.write()   # Write to disk
+```
+
+#### `read_lock(config_manager)`
+
+Context manager for read operations with shared locking. Multiple processes can hold read locks simultaneously, but no process can hold a write lock while read locks exist.
+
+**Parameters:**
+
+- `config_manager` (YAMLConfigManager): The configuration manager instance to lock
+
+**Returns:**
+
+- YAMLConfigManager: The locked configuration manager
+
+**Usage:**
+
+```python
+from yacman import YAMLConfigManager, read_lock
+
+ym = YAMLConfigManager.from_yaml_file("config.yaml")
+
+with read_lock(ym) as locked_ym:
+    locked_ym.rebase()  # Sync with file
+    print(locked_ym.to_dict())
+```
+
+**Note:** These context managers are provided by the `ubiquerg` package and re-exported by yacman for convenience. For more details on the locking implementation, see the [ubiquerg documentation](https://github.com/databio/ubiquerg).
+
+See the [tutorial](../notebooks/tutorial.ipynb) for more examples.
+
+### Utility Functions
+
+Yacman provides several utility functions for working with YAML files and paths:
+
+- `load_yaml(filepath)`: Load a YAML file and return its contents as a dictionary
+- `select_config(config_filepath, config_env_vars, default_config_filepath)`: Select a configuration file from multiple sources
+- `expandpath(path)`: Expand environment variables and user home directory in a path
+
+These functions are available in the `yacman` module. See the source code or [tutorial](../notebooks/tutorial.ipynb) for usage examples.
+
+## Deprecated Classes (v0.x)
+
+The following classes are deprecated in v1.0 and maintained only for backwards compatibility. Use `YAMLConfigManager` instead:
+
+- `YacAttMap` - Replaced by `YAMLConfigManager`
+- `AliasedYacAttMap` - Use `YAMLConfigManager` instead
+
+See the [upgrading guide](../upgrading.md) for migration instructions.
diff --git a/docs/yacman/img/yacman_bug.svg b/docs/yacman/img/yacman_bug.svg
new file mode 100644
index 00000000..c5774141
--- /dev/null
+++ b/docs/yacman/img/yacman_bug.svg
@@ -0,0 +1,146 @@
+[SVG image content not reproduced here]
diff --git a/docs/yacman/img/yacman_logo.svg b/docs/yacman/img/yacman_logo.svg
new file mode 100644
index 00000000..3ee88264
--- /dev/null
+++ b/docs/yacman/img/yacman_logo.svg
@@ -0,0 +1,174 @@
+[SVG image content not reproduced here]
diff --git a/docs/yacman/notebooks/tutorial.ipynb b/docs/yacman/notebooks/tutorial.ipynb
new file mode 100644
index 00000000..8f29d5de
--- /dev/null
+++ b/docs/yacman/notebooks/tutorial.ipynb
@@ -0,0 +1,434 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Yacman Tutorial: YAMLConfigManager\n",
+    "\n",
+    "This tutorial shows you the features of the `yacman` package using the modern v1.0 API with `YAMLConfigManager`.\n",
+    "\n",
+    "First, let's prepare some test data:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import yaml\n",
+    "from yacman import YAMLConfigManager, write_lock, read_lock\n",
+    "\n",
+    "# Sample data for demonstrations\n",
+    "yaml_dict = {\n",
+    "    'cfg_version': 1.0,\n",
+    "    'database': {\n",
+    "        'host': 'localhost',\n",
+    "        'port': 5432,\n",
+    "        'name': 'mydb'\n",
+    "    },\n",
+    "    'features': ['logging', 'caching', 'monitoring']\n",
+    "}\n",
+    "\n",
+    "yaml_str = \"\"\"\\\n",
+    "cfg_version: 1.0\n",
+    "database:\n",
+    "  host: localhost\n",
+    "  port: 5432\n",
+    "  name: mydb\n",
+    "features:\n",
+    "  - logging\n",
+    "  - caching\n",
+    "  - monitoring\n",
+    "\"\"\"\n",
+    "\n",
+    "# Create a test file\n",
+    "filepath = \"test_config.yaml\"\n",
+    "with open(filepath, 'w') as f:\n",
+    "    yaml.dump(yaml_dict, f)"
+   ]
+  },
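+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick sanity check, we can read the file back with yacman's `load_yaml` helper, a plain function that returns the parsed contents as a dictionary (see the utility functions in the [API documentation](../code/python-api.md)):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from yacman import load_yaml\n",
+    "\n",
+    "# load_yaml parses the YAML file and returns a plain dict,\n",
+    "# confirming what we just wrote to disk\n",
+    "load_yaml(filepath)"
+   ]
+  },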
{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating YAMLConfigManager objects\n", + "\n", + "There are several ways to create a `YAMLConfigManager` object in v1.0:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. From a YAML file\n", + "\n", + "Use `from_yaml_file()` to load configuration from a file:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ym = YAMLConfigManager.from_yaml_file(filepath)\n", + "print(ym.to_dict())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. From a dictionary\n", + "\n", + "Use `from_obj()` to create from a Python dictionary:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ym = YAMLConfigManager.from_obj(yaml_dict)\n", + "print(ym.to_dict())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. From a YAML string\n", + "\n", + "Use `from_yaml_data()` to parse a YAML-formatted string:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ym = YAMLConfigManager.from_yaml_data(yaml_str)\n", + "print(ym.to_dict())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Accessing configuration values\n", + "\n", + "You can access values using dictionary-style syntax:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ym = YAMLConfigManager.from_yaml_file(filepath)\n", + "\n", + "# Access top-level keys\n", + "print(f\"Config version: {ym['cfg_version']}\")\n", + "print(f\"Features: {ym['features']}\")\n", + "\n", + "# Access nested values\n", + "print(f\"Database host: {ym['database']['host']}\")\n", + "print(f\"Database port: {ym['database']['port']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## File locking and safe writes\n", + "\n", + "YAMLConfigManager provides race-free writing with file locking, making it safe for multi-user/multi-process contexts." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write locks\n", + "\n", + "Use `write_lock()` for exclusive write access:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ym = YAMLConfigManager.from_yaml_file(filepath)\n", + "\n", + "# Modify the configuration\n", + "ym['new_feature'] = 'authentication'\n", + "ym['database']['timeout'] = 30\n", + "\n", + "# Write with lock\n", + "with write_lock(ym) as locked_ym:\n", + " locked_ym.rebase() # Capture any changes since file was loaded\n", + " locked_ym.write()\n", + "\n", + "print(\"Configuration written successfully!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read locks\n", + "\n", + "Use `read_lock()` for shared read access. 
Multiple processes can hold read locks simultaneously:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ym = YAMLConfigManager.from_yaml_file(filepath)\n", + "\n", + "# Rebase to sync with file changes\n", + "with read_lock(ym) as locked_ym:\n", + " locked_ym.rebase()\n", + " print(f\"Current config: {locked_ym.to_dict()}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reset vs Rebase\n", + "\n", + "- `rebase()`: Replays in-memory changes on top of file contents\n", + "- `reset()`: Discards in-memory changes and loads from file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ym = YAMLConfigManager.from_yaml_file(filepath)\n", + "\n", + "# Make an in-memory change\n", + "ym['temp_value'] = 'will be discarded'\n", + "print(f\"Before reset: {ym.get('temp_value')}\")\n", + "\n", + "# Reset discards in-memory changes\n", + "with read_lock(ym) as locked_ym:\n", + " locked_ym.reset()\n", + "\n", + "print(f\"After reset: {ym.get('temp_value')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Updating from another object\n", + "\n", + "You can merge configuration from a dictionary into an existing YAMLConfigManager:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ym = YAMLConfigManager.from_yaml_file(filepath)\n", + "\n", + "# Update with additional configuration\n", + "overrides = {\n", + " 'database': {'host': 'production.example.com'},\n", + " 'debug': False\n", + "}\n", + "\n", + "ym.update_from_obj(overrides)\n", + "print(ym.to_dict())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Environment variable expansion\n", + "\n", + "YAMLConfigManager can expand environment variables in configuration values using the `.exp` property:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Set an environment variable\n", + "os.environ['DB_HOST'] = 'prod-server.example.com'\n", + "\n", + "# Create config with environment variable reference\n", + "config_with_env = {\n", + " 'database': {\n", + " 'host': '${DB_HOST}',\n", + " 'port': 5432\n", + " }\n", + "}\n", + "\n", + "ym = YAMLConfigManager.from_obj(config_with_env)\n", + "\n", + "# Access without expansion\n", + "print(f\"Raw value: {ym['database']['host']}\")\n", + "\n", + "# Access with expansion\n", + "print(f\"Expanded value: {ym.exp['database']['host']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Converting to YAML string\n", + "\n", + "You can serialize the configuration back to a YAML string:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ym = YAMLConfigManager.from_obj(yaml_dict)\n", + "print(ym.to_yaml())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Complete example: Configuration management workflow\n", + "\n", + "Here's a complete example showing a typical configuration management workflow:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 1. 
Load base configuration\n", + "config_file = \"app_config.yaml\"\n", + "base_config = {\n", + " 'app_name': 'MyApp',\n", + " 'version': '1.0.0',\n", + " 'database': {\n", + " 'host': 'localhost',\n", + " 'port': 5432\n", + " },\n", + " 'cache': {\n", + " 'enabled': True,\n", + " 'ttl': 3600\n", + " }\n", + "}\n", + "\n", + "# Save initial config\n", + "with open(config_file, 'w') as f:\n", + " yaml.dump(base_config, f)\n", + "\n", + "# 2. Load and modify configuration\n", + "ym = YAMLConfigManager.from_yaml_file(config_file)\n", + "print(\"Loaded configuration:\")\n", + "print(ym.to_yaml())\n", + "\n", + "# 3. Apply environment-specific overrides\n", + "env_overrides = {\n", + " 'database': {'host': 'prod-db.example.com'},\n", + " 'cache': {'ttl': 7200}\n", + "}\n", + "ym.update_from_obj(env_overrides)\n", + "\n", + "# 4. Add new configuration\n", + "ym['features'] = ['logging', 'metrics', 'tracing']\n", + "ym['deployment'] = {'region': 'us-east-1'}\n", + "\n", + "# 5. Save with write lock\n", + "with write_lock(ym) as locked_ym:\n", + " locked_ym.rebase()\n", + " locked_ym.write()\n", + "\n", + "print(\"\\nFinal configuration:\")\n", + "print(ym.to_yaml())\n", + "\n", + "# Cleanup\n", + "import os\n", + "os.remove(config_file)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "Key takeaways:\n", + "\n", + "1. **Creation**: Use `from_yaml_file()`, `from_obj()`, or `from_yaml_data()` to create YAMLConfigManager objects\n", + "2. **Access**: Use dictionary-style syntax to access configuration values\n", + "3. **Writing**: Always use `write_lock()` context manager with `rebase()` before `write()`\n", + "4. **Reading**: Use `read_lock()` for safe concurrent reads\n", + "5. **Updates**: Use `update_from_obj()` to merge configurations\n", + "6. **Environment variables**: Use `.exp` property to expand environment variables\n", + "\n", + "For more details, see the [API documentation](../code/python-api.md)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Cleanup test files\n", + "import os\n", + "if os.path.exists(filepath):\n", + " os.remove(filepath)\n", + "if os.path.exists('test_config.yaml'):\n", + " os.remove('test_config.yaml')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/yacman/upgrading.md b/docs/yacman/upgrading.md new file mode 100644 index 00000000..c2e963af --- /dev/null +++ b/docs/yacman/upgrading.md @@ -0,0 +1,158 @@ +# Upgrading to yacman v1.0 + +## Overview + +Yacman v1.0 provides two major feature upgrades: + +1. **New constructor pattern**: Constructors now use `YAMLConfigManager.from_x(...)` methods to make object creation clearer +2. 
**Separate read/write locks**: Locks are now separated into read locks and write locks, allowing multiple simultaneous readers + +## Upgrading from v0.9.3 to v1.0.0 + +### Step 1: Update imports + +If you were using `FutureYAMLConfigManager` in v0.9.3, update your imports: + +**Before (v0.9.3):** +```python +from yacman import FutureYAMLConfigManager as YAMLConfigManager +``` + +**After (v1.0.0):** +```python +from yacman import YAMLConfigManager +``` + +### Step 2: Update context managers + +Context managers now use explicit `write_lock` or `read_lock` functions: + +```python +from yacman import write_lock, read_lock +``` + +**Before (v0.9.3):** +```python +with ym as locked_ym: + locked_ym.write() +``` + +**After (v1.0.0):** +```python +with write_lock(ym) as locked_ym: + locked_ym.rebase() + locked_ym.write() +``` + +**Important**: In v1.0, you must call `rebase()` before `write()` if you want to allow for multiple processes that may have written to the file since you read it in. + +### Step 3: Update constructors + +You can no longer create a `YAMLConfigManager` object directly. Use the `from_x()` constructor methods instead: + +**Before (v0.9.3):** +```python +ym = YAMLConfigManager(filepath="config.yaml") +ym = YAMLConfigManager(entries={"key": "value"}) +``` + +**After (v1.0.0):** +```python +from yacman import YAMLConfigManager + +# From a file +ym = YAMLConfigManager.from_yaml_file("config.yaml") + +# From a dictionary +data = {"key": "value"} +ym = YAMLConfigManager.from_obj(data) + +# From a YAML string +yaml_data = "key: value" +ym = YAMLConfigManager.from_yaml_data(yaml_data) +``` + +### Step 4: Update file loading with overrides + +In the past, you could load from a file and overwrite some attributes with a dict, all from the constructor. 
This is now more explicit: + +**Before (v0.9.3):** +```python +ym = YAMLConfigManager(filepath="config.yaml", entries={"override_key": "value"}) +``` + +**After (v1.0.0):** +```python +ym = YAMLConfigManager.from_yaml_file("config.yaml") +ym.update_from_obj({"override_key": "value"}) +``` + +## Complete examples + +### Example 1: Basic usage with locks + +```python +from yacman import YAMLConfigManager, write_lock, read_lock + +data = { + "my_list": [1, 2, 3], + "my_int": 8, + "my_str": "hello world!", + "my_dict": {"nested_val": 15} +} + +# Create from object +ym = YAMLConfigManager.from_obj(data) + +# Access values +print(ym["my_list"]) +print(ym["my_int"]) +print(ym["my_dict"]) + +# Modify and write with a write lock +ym["new_var"] = 15 + +with write_lock(ym) as locked_ym: + locked_ym.rebase() # Capture any changes since file was loaded + locked_ym.write() +``` + +### Example 2: Read locks + +```python +# Use a read lock to rebase +# This will replay any in-memory updates on top of whatever is re-read from the file +with read_lock(ym) as locked_ym: + locked_ym.rebase() + +# Use a read lock to reset the in-memory object to whatever is on disk +with read_lock(ym) as locked_ym: + locked_ym.reset() +``` + +### Example 3: Expanding environment variables + +To expand environment variables in values, use the `.exp` attribute: + +```python +ym = YAMLConfigManager.from_yaml_file("config.yaml") +expanded_value = ym.exp["path_with_env_vars"] +``` + +## Migration checklist + +- [ ] Update imports from `FutureYAMLConfigManager` to `YAMLConfigManager` +- [ ] Replace direct constructor calls with `from_yaml_file()`, `from_obj()`, or `from_yaml_data()` +- [ ] Update context managers to use `write_lock()` or `read_lock()` +- [ ] Add `rebase()` calls before `write()` in write-lock contexts +- [ ] Replace combined file+override constructors with explicit `update_from_obj()` calls +- [ ] Test all file I/O operations +- [ ] Verify environment variable expansion works correctly + +## Deprecated features + +The following are deprecated in v1.0 and will be removed in future versions: + +- Direct instantiation of `YAMLConfigManager` (use `from_x()` methods instead) +- Using YAMLConfigManager object as a context manager directly (use `write_lock()` or `read_lock()`) +- `YacAttMap` class (replaced by `YAMLConfigManager`) diff --git a/mkdocs.yml b/mkdocs.yml index 5b12fc8f..994935ab 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -51,7 +51,7 @@ markdown_extensions: custom_fences: - name: mermaid class: mermaid - format: "!!python/name:pymdownx.superfences.fence_code_format" + format: !!python/name:pymdownx.superfences.fence_code_format extra_css: - stylesheets/extra.css @@ -123,6 +123,7 @@ nav: - Specifying samples to download: geofetch/file-specification.md - Set SRA data download location: geofetch/howto-location.md - Run SRA convert: geofetch/code/howto-sra-to-fastq.md + - Install prefetch: geofetch/howto-prefetch.md - Reference: - How to cite: citations.md - API: geofetch/code/python-api.md @@ -163,7 +164,7 @@ nav: - Reference: - Reference: - CLI Usage: looper/usage.md - - API: looper/code/python-api.md + # - API: looper/code/python-api.md # Temporarily disabled due to wildcard import issue - How to cite: citations.md - FAQ: looper/faq.md - Support: looper/support.md @@ -180,12 +181,14 @@ nav: - How to use views: pephub/user/views.md - PEP of PEPs (POP): pephub/user/pops.md - Accessing GEO metadata: pephub/user/geo.md + - PEPhub organization: pephub/user/organization.md - PEPHubClient: - PEPhubClient: 
pephub/user/pephubclient/README.md - Quickstart: pephub/user/pephubclient/tutorial.md - Python API: pephub/user/pephubclient/phc_usage.md - Python API samples: pephub/user/pephubclient/phc_samples_usage.md - Python API views: pephub/user/pephubclient/phc_views_usage.md + - Python API schemas: pephub/user/pephubclient/phc_schemas.md - CLI usage: pephub/user/pephubclient/cli.md - Changelog: pephub/user/pephubclient/changelog.md - Developer guide: @@ -201,6 +204,7 @@ nav: - PEPembed: pephub/developer/pepembed/README.md - pepdbagent: - pepdbagent: pephub/developer/pepdbagent/README.md + - Database version migration: pephub/developer/pepdbagent/database_version_migration.md - Database tutorial: pephub/developer/pepdbagent/db_tutorial.md - Changelog: pephub/developer/pepdbagent/changelog.md - geopephub: pephub/developer/geopephub.md @@ -229,16 +233,22 @@ nav: - Changelog: peppy/changelog.md - Pipestat: - Pipestat: pipestat/README.md + - User guide: + - Quickstart: pipestat/code/api-quickstart.md + - Use Python API: pipestat/code/python-tutorial.md + - Use command line interface: pipestat/code/cli.md - How-to guides: - - Quickstart: pipestat/code/api-quickstart.md + - Install: pipestat/install.md - Write a pipestat schema: pipestat/pipestat-schema.md - - Use Python API: pipestat/code/python-tutorial.md - - Use command line interface: pipestat/code/cli.md - - Report objects as results: pipestat/code/reporting-objects.md + - Reporting pipeline status: pipestat/report_statuses.md - Configure pipestat: pipestat/configuration.md - - Multi pipelines and result files: pipestat/multi.md + - Advanced: + - Multi pipelines and result files: pipestat/multi.md + - Report objects as results: pipestat/code/reporting-objects.md + - Summarize reported results: pipestat/summarize.md - Reference: - - How to cite: citations.md + - Terminology - Results and Record Identifiers: pipestat/results_records.md + - Backends: pipestat/backends.md - CLI usage: pipestat/usage.md - Configuration format: pipestat/config.md - Testing configuration: pipestat/testing.md @@ -247,6 +257,7 @@ nav: - Python API: pipestat/code/python-api.md - Support: https://github.com/pepkit/pipestat/issues - Contributing: pipestat/contributing.md + - How to cite: citations.md - Changelog: pipestat/changelog.md - Pypiper: - Pypiper: pypiper/README.md @@ -274,56 +285,52 @@ nav: - Support: pypiper/support.md - Contributing: pypiper/contributing.md - Changelog: pypiper/changelog.md + - Yacman: + - Yacman: yacman/README.md + - Getting started: + - What is yacman?: yacman/README.md + - Tutorial: yacman/notebooks/tutorial.ipynb + - Upgrading to v1.0: yacman/upgrading.md + - Reference: + - How to cite: citations.md + - API: yacman/code/python-api.md + - Changelog: yacman/changelog.md + - Support: https://github.com/databio/yacman/issues + - Ubiquerg: + - Ubiquerg: ubiquerg/README.md + - Reference: + - How to cite: citations.md + - API: ubiquerg/code/python-api.md + - Changelog: ubiquerg/changelog.md + - Support: https://github.com/pepkit/ubiquerg/issues - How to cite: citations.md - Stats: statistics.md -autodoc: - lucidoc: - - pkg: peppy - outfile: docs/peppy/code/python-api.md - - pkg: looper - outfile: docs/looper/code/python-api.md - - pkg: pipestat - outfile: docs/pipestat/code/python-api.md - - pkg: pypiper - outfile: docs/pypiper/code/python-api.md - whitelist: - - PipelineManager - - pkg: pypiper - outfile: docs/pypiper/code/ngstk-api.md - whitelist: - - NGSTk - - pkg: geofetch - outfile: docs/geofetch/code/python-api.md - - pkg: eido - outfile: 
docs/eido/code/python-api.md - blacklist: - - basic_pep_filter - - yaml_pep_filter - - csv_pep_filter - - yaml_samples_pep_filter - - outfile: docs/eido/code/plugin-api-docs.md - pkg: eido - whitelist: - - basic_pep_filter - - yaml_pep_filter - - csv_pep_filter - - yaml_samples_pep_filter - jupyter: - - in: eido/notebooks - out: eido/code - - in: geofetch/notebooks - out: geofetch/code - - in: looper/notebooks - out: looper/code - - in: peppy/notebooks - out: peppy/code - - in: pipestat/notebooks - out: pipestat/code - - in: pypiper/notebooks - out: pypiper/code - cli_usage: - - template: docs/geofetch/usage-template.md.tpl - outfile: docs/geofetch/code/usage.md - commands: - - geofetch --help +plugins: +- mkdocs-jupyter: + include: + - eido/notebooks/*.ipynb + - geofetch/notebooks/*.ipynb + - looper/notebooks/*.ipynb + - peppy/notebooks/*.ipynb + - pipestat/notebooks/*.ipynb + - pypiper/notebooks/*.ipynb + - yacman/notebooks/*.ipynb + ignore_h1_titles: True +- mkdocstrings: + default_handler: python + handlers: + python: + options: + docstring_style: google + show_symbol_type_heading: true + show_symbol_type_toc: true + show_root_heading: true + show_root_full_path: false + show_source: true + show_signature: true + show_if_no_docstring: false + separate_signature: true + merge_init_into_class: true + allow_inspection: true + show_bases: false diff --git a/scripts/generate_cli_usage_docs.py b/scripts/generate_cli_usage_docs.py new file mode 100755 index 00000000..f196172e --- /dev/null +++ b/scripts/generate_cli_usage_docs.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +""" +Generate geofetch CLI usage documentation. + +Run manually when geofetch CLI interface changes: + python scripts/generate_cli_usage_docs.py +""" +import subprocess + +template = "docs/geofetch/usage-template.md.tpl" +outfile = "docs/geofetch/code/usage.md" +command = "geofetch --help" + +print(f"Generating CLI usage documentation for: {command}") + +with open(template) as f: + result = f.read() + +usage = subprocess.check_output(command, shell=True).decode("utf-8") +result += f"\n`{command}`\n\n```console\n{usage}\n```\n" + +with open(outfile, "w") as f: + f.write(result) + +print(f"✓ Generated {outfile}")