diff --git a/geofetch/cli.py b/geofetch/cli.py index a56407c..037ad54 100644 --- a/geofetch/cli.py +++ b/geofetch/cli.py @@ -8,13 +8,25 @@ def _safe_echo(var): - """Returns an environment variable if it exists, or an empty string if not""" + """Return an environment variable if it exists, or an empty string if not. + + Args: + var: Environment variable name. + + Returns: + Environment variable value or empty string. + """ return os.getenv(var, "") def _parse_cmdl(cmdl): - """ - parser + """Parse command line arguments. + + Args: + cmdl: Command line arguments. + + Returns: + Parsed arguments. """ parser = VersionInHelpParser( description="Automatic GEO and SRA data downloader", @@ -207,7 +219,7 @@ def _parse_cmdl(cmdl): help="""Optional: Filter size for processed files that are stored as sample repository [Default: None]. Works only for sample data. - Supported input formats : 12B, 12KB, 12MB, 12GB. + Supported input formats : 12B, 12KB, 12MB, 12GB. Ignored unless 'processed' flag is set.""", ) diff --git a/geofetch/finder.py b/geofetch/finder.py index 587e1ae..f806782 100644 --- a/geofetch/finder.py +++ b/geofetch/finder.py @@ -27,49 +27,57 @@ class Finder: - """ - Class for finding GSE accessions in special period of time. + """Class for finding GSE accessions in a specific period of time. + Additionally, user can add specific filters for the search, - while initialization of the class + during initialization of the class. """ def __init__(self, filters: str = None, retmax: int = RETMAX): - """ - :param filters: filters that have to be added to the query. - Filter Patterns can be found here: - https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Using_the_Advanced_Search_Pag - :param retmax: maximum number of retrieved accessions. + """Initialize Finder. + + Args: + filters: Filters that have to be added to the query. + Filter Patterns can be found here: + https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Using_the_Advanced_Search_Pag. + retmax: Maximum number of retrieved accessions. """ self.query_customized_ending = ETOOLS_ENDING.format(retmax=retmax) self.query_filter_str = self._create_filter_str(filters) self.last_result = [] def get_gse_all(self) -> list: - """ - Get list of all gse accession available in GEO - :return: list of gse accession + """Get list of all GSE accessions available in GEO. + + Returns: + List of GSE accessions. """ return self.get_gse_id_by_query(url=self._compose_url()) def get_gse_last_3_month(self) -> list: - """ - Get list of gse accession that were uploaded or updated in last 3 month - :return: list of gse accession + """Get list of GSE accessions that were uploaded or updated in the last 3 months. + + Returns: + List of GSE accessions. """ return self.get_gse_id_by_query(url=self._compose_url(THREE_MONTH_FILTER)) def get_gse_last_week(self) -> list: - """ - Get list of gse accession that were uploaded or updated in last week - :return: list of gse accession + """Get list of GSE accessions that were uploaded or updated in the last week. + + Returns: + List of GSE accessions. """ return self.get_gse_by_day_count(7) def get_gse_by_day_count(self, n_days: int = 1) -> list: - """ - Get list of gse accessions that were uploaded or updated in last X days - :param n_days: number of days from now [e.g. 5] - :return: list of gse accession + """Get list of GSE accessions that were uploaded or updated in the last X days. + + Args: + n_days: Number of days from now [e.g. 5]. + + Returns: + List of GSE accessions.
""" today = datetime.today() start_date = today - timedelta(days=n_days) @@ -77,11 +85,16 @@ def get_gse_by_day_count(self, n_days: int = 1) -> list: return self.get_gse_by_date(start_date_str) def get_gse_by_date(self, start_date: str, end_date: str = None) -> list: - """ - Search gse accessions by providing start date and end date. By default, the last date is today. - :param start_date: the oldest date of update (from YYYY/MM/DD to now) [input format: 'YYYY/MM/DD'] - :param end_date: the nearest date of update (from __ to YYYY/MM/DD) [input format: 'YYYY/MM/DD'] - :return: list of gse accessions + """Search gse accessions by providing start date and end date. + + By default, the last date is today. + + Args: + start_date: The oldest date of update (from YYYY/MM/DD to now) [input format: 'YYYY/MM/DD']. + end_date: The nearest date of update (from __ to YYYY/MM/DD) [input format: 'YYYY/MM/DD']. + + Returns: + List of gse accessions. """ if end_date is None: end_date = TODAY_DATE @@ -89,10 +102,13 @@ def get_gse_by_date(self, start_date: str, end_date: str = None) -> list: return self.get_gse_id_by_query(url=self._compose_url(new_date_filter)) def get_gse_id_by_query(self, url: str) -> list: - """ - Run esearch (ncbi search tool) by specifying URL and retrieve gse list result - :param url: url of the query - :return: list of gse ids + """Run esearch (ncbi search tool) by specifying URL and retrieve gse list result. + + Args: + url: Url of the query. + + Returns: + List of gse ids. """ uids_list = self._run_search_query(url) gse_id_list = [self.uid_to_gse(d) for d in uids_list] @@ -101,30 +117,39 @@ def get_gse_id_by_query(self, url: str) -> list: @staticmethod def uid_to_gse(uid: str) -> str: - """ - UID to GES accession converter - :param uid: uid string (Unique Identifier Number in GEO) - :return: GSE id string + """Convert UID to GSE accession. + + Args: + uid: Uid string (Unique Identifier Number in GEO). + + Returns: + GSE id string. """ uid_regex = re.compile(r"[1-9]+0+([1-9]+[0-9]*)") return "GSE" + uid_regex.match(uid).group(1) @staticmethod def find_differences(old_list: list, new_list: list) -> list: - """ - Compare 2 lists and search for elements that are not in old list - :param old_list: old list of elements - :param new_list: new list of elements - :return: list of elements that are not in old list but are in new_list + """Compare 2 lists and search for elements that are not in old list. + + Args: + old_list: Old list of elements. + new_list: New list of elements. + + Returns: + List of elements that are not in old list but are in new_list. """ return list(set(new_list) - set(old_list)) @staticmethod def _run_search_query(url: str) -> list: - """ - Run get request and return list of uids found - :param url: url of the query - :return: list of UIDs + """Run get request and return list of uids found. + + Args: + url: Url of the query. + + Returns: + List of UIDs. """ x = requests.get(url) if x.status_code != 200: @@ -143,20 +168,26 @@ def _run_search_query(url: str) -> list: @staticmethod def _create_filter_str(filters: str = None) -> str: - """ - Tune filter for url request - :param filters: filter should look like here: https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Using_the_Advanced_Search_Pag - :return: tuned filter string + """Tune filter for url request. + + Args: + filters: Filter should look like here: https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Using_the_Advanced_Search_Pag. + + Returns: + Tuned filter string. 
""" if filters == "" or filters is None: return "" return f"+(AND+{filters})" def _compose_url(self, date_filter: str = None) -> str: - """ - Compose final url by adding date filter - :param date_filter: date filter that has to be used in the query - :return: string of final url + """Compose final url by adding date filter. + + Args: + date_filter: Date filter that has to be used in the query. + + Returns: + String of final url. """ if date_filter is None: date_filter = "" @@ -164,11 +195,11 @@ def _compose_url(self, date_filter: str = None) -> str: return f"{ETOOLS_GEO_GSE_BASE}{self.query_filter_str}{date_filter}{self.query_customized_ending}" def generate_file(self, file_path: str, gse_list: list = None): - """ - Save the list of GSE accessions stored in this Finder object to a given file - :param file_path: root to the file where gse accessions have to be saved - :param gse_list: list of gse accessions - :return: NoReturn + """Save the list of GSE accessions stored in this Finder object to a given file. + + Args: + file_path: Root to the file where gse accessions have to be saved. + gse_list: List of gse accessions. """ if gse_list is None: gse_list = self.last_result diff --git a/geofetch/geofetch.py b/geofetch/geofetch.py index e841716..be5eb97 100755 --- a/geofetch/geofetch.py +++ b/geofetch/geofetch.py @@ -70,9 +70,7 @@ class Geofetcher: - """ - Class to download or get projects, metadata, data from GEO and SRA - """ + """Class to download or get projects, metadata, data from GEO and SRA.""" def __init__( self, @@ -111,72 +109,67 @@ def __init__( max_prefetch_size=None, **kwargs, ): - """ - Constructor - - :param input: GSEnumber or path to the input file - :param name: Specify a project name. Defaults to GSE number or name of accessions file name - :param metadata_root: Specify a parent folder location to store metadata. - The project name will be added as a subfolder [Default: $SRAMETA:] - :param metadata_folder: Specify an absolute folder location to store metadata. No subfolder will be added. - Overrides value of --metadata-root [Default: Not used (--metadata-root is used by default)] - :param just_metadata: If set, don't actually run downloads, just create metadata - :param refresh_metadata: If set, re-download metadata even if it exists. - :param config_template: Project config yaml file template. - :param pipeline_samples: Specify one or more filepaths to SAMPLES pipeline interface yaml files. + """Constructor. + + Args: + input: GSEnumber or path to the input file. + name: Specify a project name. Defaults to GSE number or name of accessions file name. + metadata_root: Specify a parent folder location to store metadata. + The project name will be added as a subfolder [Default: $SRAMETA:]. + metadata_folder: Specify an absolute folder location to store metadata. No subfolder will be added. + Overrides value of --metadata-root [Default: Not used (--metadata-root is used by default)]. + just_metadata: If set, don't actually run downloads, just create metadata. + refresh_metadata: If set, re-download metadata even if it exists. + config_template: Project config yaml file template. + pipeline_samples: Specify one or more filepaths to SAMPLES pipeline interface yaml files. These will be added to the project config file to make it immediately compatible with looper. - [Default: null] - :param pipeline_project: Specify one or more filepaths to PROJECT pipeline interface yaml files. + [Default: null]. 
+ pipeline_project: Specify one or more filepaths to PROJECT pipeline interface yaml files. These will be added to the project config file to make it immediately compatible with looper. - [Default: null] - :param acc_anno: Produce annotation sheets for each accession. + [Default: null]. + acc_anno: Produce annotation sheets for each accession. Project combined PEP for the whole project won't be produced. - :param discard_soft: Create project without downloading soft files on the disc - :param add_dotfile: Add .pep.yaml file that points .yaml PEP file - :param disable_progressbar: Set true to disable progressbar - - :param const_limit_project: Optional: Limit of the number of the constant sample characters - that should not be in project yaml. [Default: 50] - :param const_limit_discard: Optional: Limit of the number of the constant sample characters - that should not be discarded [Default: 250] - :param attr_limit_truncate: Optional: Limit of the number of sample characters. + discard_soft: Create project without downloading soft files on the disc. + add_dotfile: Add .pep.yaml file that points .yaml PEP file. + disable_progressbar: Set true to disable progressbar. + const_limit_project: Optional: Limit of the number of the constant sample characters + that should not be in project yaml. [Default: 50]. + const_limit_discard: Optional: Limit of the number of the constant sample characters + that should not be discarded [Default: 250]. + attr_limit_truncate: Optional: Limit of the number of sample characters. Any attribute with more than X characters will truncate to the first X, where X is a number of characters - [Default: 500] - - :param max_soft_size: Optional: Max size of soft file. - Supported input formats : 12B, 12KB, 12MB, 12GB. [Default value: 1GB] - - :param processed: Download processed da_soft_sizeta [Default: download raw data]. - :param data_source: Specifies the source of data on the GEO record to retrieve processed data, + [Default: 500]. + max_soft_size: Optional: Max size of soft file. + Supported input formats : 12B, 12KB, 12MB, 12GB. [Default value: 1GB]. + processed: Download processed data [Default: download raw data]. + data_source: Specifies the source of data on the GEO record to retrieve processed data, which may be attached to the collective series entity, or to individual samples. Allowable values are: - samples, series or both (all). Ignored unless 'processed' flag is set. [Default: samples] - :param filter: Filter regex for processed filenames [Default: None].Ignored unless 'processed' flag is set. - :param filter_size: Filter size for processed files that are stored as sample repository [Default: None]. + samples, series or both (all). Ignored unless 'processed' flag is set. [Default: samples]. + filter: Filter regex for processed filenames [Default: None].Ignored unless 'processed' flag is set. + filter_size: Filter size for processed files that are stored as sample repository [Default: None]. Works only for sample data. Supported input formats : 12B, 12KB, 12MB, 12GB. Ignored unless 'processed' flag is set. - :param geo_folder: Specify a location to store processed GEO files. - Ignored unless 'processed' flag is set.[Default: $GEODATA:] - - :param split_experiments: Split SRR runs into individual samples. By default, SRX experiments with multiple SRR + geo_folder: Specify a location to store processed GEO files. + Ignored unless 'processed' flag is set.[Default: $GEODATA:]. + split_experiments: Split SRR runs into individual samples. 
By default, SRX experiments with multiple SRR Runs will have a single entry in the annotation table, with each run as a separate row in the - subannotation table. This setting instead treats each run as a separate sample [Works with raw data] - :param bam_folder: Optional: Specify folder of bam files. Geofetch will not download sra files when - corresponding bam files already exist. [Default: $SRABAM:] [Works with raw data] - :param fq_folder: Optional: Specify folder of fastq files. Geofetch will not download sra files when corresponding - fastq files already exist. [Default: $SRAFQ:] [Works with raw data] - :param use_key_subset: Use just the keys defined in this module when writing out metadata. [Works with raw data] - :param sra_folder: Optional: Specify a location to store sra files - [Default: $SRARAW:" + safe_echo("SRARAW") + ] - :param bam_conversion: Optional: set True to convert bam files [Works with raw data] - :param picard_path: Specify a path to the picard jar, if you want to convert fastq to bam - [Default: $PICARD:" + safe_echo("PICARD") + "] [Works with raw data] - :param add_convert_modifier: Add looper SRA convert modifier to config file. - - :param skip: Skip some accessions. [Default: no skip]. - :param opts: opts object [Optional] - :param str | int max_prefetch_size: argmuent to prefetch command's --max-size option; - for reference: https://github.com/ncbi/sra-tools/wiki/08.-prefetch-and-fasterq-dump#check-the-maximum-size-limit-of-the-prefetch-tool - :param kwargs: other values + subannotation table. This setting instead treats each run as a separate sample [Works with raw data]. + bam_folder: Optional: Specify folder of bam files. Geofetch will not download sra files when + corresponding bam files already exist. [Default: $SRABAM:] [Works with raw data]. + fq_folder: Optional: Specify folder of fastq files. Geofetch will not download sra files when corresponding + fastq files already exist. [Default: $SRAFQ:] [Works with raw data]. + use_key_subset: Use just the keys defined in this module when writing out metadata. [Works with raw data]. + sra_folder: Optional: Specify a location to store sra files + [Default: $SRARAW:" + safe_echo("SRARAW") + ]. + bam_conversion: Optional: set True to convert bam files [Works with raw data]. + picard_path: Specify a path to the picard jar, if you want to convert fastq to bam + [Default: $PICARD:" + safe_echo("PICARD") + "] [Works with raw data]. + add_convert_modifier: Add looper SRA convert modifier to config file. + skip: Skip some accessions. [Default: no skip]. + opts: opts object [Optional]. + max_prefetch_size: Argument to prefetch command's --max-size option; + for reference: https://github.com/ncbi/sra-tools/wiki/08.-prefetch-and-fasterq-dump#check-the-maximum-size-limit-of-the-prefetch-tool. + **kwargs: Other values. """ global _LOGGER @@ -290,12 +283,15 @@ def __init__( def get_projects( self, input: str, just_metadata: bool = True, discard_soft: bool = True ) -> dict: - """ - Function for fetching projects from GEO|SRA and receiving peppy project - :param input: GSE number, or path to file of GSE numbers - :param just_metadata: process only metadata - :param discard_soft: clean run, without downloading soft files - :return: peppy project or list of project, if acc_anno is set. + """Function for fetching projects from GEO|SRA and receiving peppy project. + + Args: + input: GSE number, or path to file of GSE numbers. + just_metadata: Process only metadata. + discard_soft: Clean run, without downloading soft files. 
+ + Returns: + Peppy project or list of project, if acc_anno is set. """ self.just_metadata = just_metadata self.just_object = True @@ -358,12 +354,16 @@ def get_projects( return new_pr_dict def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Project]: - """ - Main function driver/workflow - Function that search, filters, downloads and save data and metadata from GEO and SRA - :param input: GSE or input file with gse's - :param name: Name of the project - :return: NoReturn or peppy Project + """Main function driver/workflow. + + Function that searches, filters, downloads, and saves data and metadata from GEO and SRA. + + Args: + input: GSE or input file with gse's. + name: Name of the project. + + Returns: + NoReturn or peppy Project. """ if name is not None: @@ -577,12 +577,15 @@ def _process_sra_meta( gsm_enter_dict: dict = None, gsm_metadata: dict = None, ): - """ - Create srp multitable and update gsm_metadata based on srp - :param srp_list_result: list of srp got from sra file - :param gsm_enter_dict: gsm enter content - :param gsm_metadata: dict of samples of gsm - :return: srp multitable + """Create srp multitable and update gsm_metadata based on srp. + + Args: + srp_list_result: List of srp got from sra file. + gsm_enter_dict: Gsm enter content. + gsm_metadata: Dict of samples of gsm. + + Returns: + Srp multitable. """ gsm_multi_table = {} runs = [] @@ -658,10 +661,10 @@ def _process_sra_meta( return gsm_multi_table, gsm_metadata, runs def _download_raw_data(self, run_name: str) -> NoReturn: - """ - Download raw data from SRA by providing run name + """Download raw data from SRA by providing run name. - :param run_name: Run name from SRA + Args: + run_name: Run name from SRA. """ bam_file = ( "" @@ -712,12 +715,15 @@ def fetch_processed_one( gsm_file_content: list, gsm_filter_list: dict, ) -> Tuple: - """ - Fetche one processed GSE project and return its metadata - :param gsm_file_content: gse soft file content - :param gse_file_content: gsm soft file content - :param gsm_filter_list: list of gsm that have to be downloaded - :return: Tuple of project list of gsm samples and gse samples + """Fetch one processed GSE project and return its metadata. + + Args: + gsm_file_content: Gsm soft file content. + gse_file_content: Gse soft file content. + gsm_filter_list: List of gsm that have to be downloaded. + + Returns: + Tuple of project list of gsm samples and gse samples. """ ( meta_processed_samples, @@ -746,15 +752,19 @@ def _generate_processed_meta( meta_processed_series: list, gse_meta_dict: Union[dict, None] = None, ) -> dict: - """ - Generate and save PEPs for processed accessions. GEO has data in GSE and GSM, - conditions are used to decide which PEPs will be saved. - :param name: name of the folder/file where PEP will be saved - :param meta_processed_samples: - :param meta_processed_series: - :param gse_meta_dict: dict of metadata fetched from one experiment. - Used to add this data to config file. - :return: dict of objects if just_object is set, otherwise dicts of None + """Generate and save PEPs for processed accessions. + + GEO has data in GSE and GSM, conditions are used to decide which PEPs will be saved. + + Args: + name: Name of the folder/file where PEP will be saved. + meta_processed_samples: Metadata for processed samples. + meta_processed_series: Metadata for processed series. + gse_meta_dict: Dict of metadata fetched from one experiment. + Used to add this data to config file.
+ + Returns: + Dict of objects if just_object is set, otherwise dicts of None. """ return_objects = {f"{name}_samples": None, f"{name}_series": None} @@ -813,12 +823,12 @@ def _generate_processed_meta( def _download_processed_data( self, acc_gse: str, meta_processed_samples: list, meta_processed_series: list ) -> NoReturn: - """ - Download processed data from GEO by providing project annotation list - :param acc_gse: accession number of the project - :param meta_processed_samples: list of annotation of samples - :param meta_processed_series: list of annotation of series - :return: Noreturn + """Download processed data from GEO by providing project annotation list. + + Args: + acc_gse: Accession number of the project. + meta_processed_samples: List of annotation of samples. + meta_processed_series: List of annotation of series. """ data_geo_folder = os.path.join(self.geo_folder, acc_gse) _LOGGER.debug("Data folder: " + data_geo_folder) @@ -851,22 +861,26 @@ def _download_processed_data( self._download_processed_file(file_url, data_geo_folder) def _expand_metadata_dict(self, metadata_dict: dict) -> dict: - """ - Expand all lists of all items in the dict by creating new items or joining them + """Expand all lists of all items in the dict by creating new items or joining them. + + Args: + metadata_dict: Metadata dict. - :param metadata_dict: metadata dict - :return: expanded metadata dict + Returns: + Expanded metadata dict. """ prj_list = _dict_to_list_converter(proj_dict=metadata_dict) prj_list = self._expand_metadata_list(prj_list) return _dict_to_list_converter(proj_list=prj_list) def _expand_metadata_list(self, metadata_list: list) -> list: - """ - Expanding all lists of all items in the list by creating new items or joining them + """Expand all lists of all items in the list by creating new items or joining them. + + Args: + metadata_list: List of dicts that store metadata. - :param list metadata_list: list of dicts that store metadata - :return list: expanded metadata list + Returns: + Expanded metadata list. """ _LOGGER.info("Expanding metadata list...") list_of_keys = _get_list_of_keys(metadata_list) @@ -877,13 +891,16 @@ def _expand_metadata_list(self, metadata_list: list) -> list: return metadata_list def _expand_metadata_list_item(self, metadata_list: list, dict_key: str): - """ - Expand list of one element (item) in the list by creating new items or joining them + """Expand list of one element (item) in the list by creating new items or joining them. + ["first1: fff", ...] -> separate columns - :param list metadata_list: list of dicts that store metadata - :param str dict_key: key in the dictionaries that have to be expanded - :return list: expanded metadata list + Args: + metadata_list: List of dicts that store metadata. + dict_key: Key in the dictionaries that have to be expanded. + + Returns: + Expanded metadata list. """ try: element_is_list = any( @@ -982,13 +999,15 @@ def _expand_metadata_list_item(self, metadata_list: list, dict_key: str): return metadata_list def _write_gsm_annotation(self, gsm_metadata: dict, file_annotation: str) -> str: - """ - Write metadata sheet out as an annotation file. + """Write metadata sheet out as an annotation file. + + Args: + gsm_metadata: The data to write, parsed from a file + with metadata/annotation information. + file_annotation: The path to the file to write. 
- :param Mapping gsm_metadata: the data to write, parsed from a file - with metadata/annotation information - :param str file_annotation: the path to the file to write - :return str: path to the file + Returns: + Path to the file. """ keys = list(list(gsm_metadata.values())[0].keys()) fp = expandpath(file_annotation) @@ -1010,15 +1029,17 @@ def _write_processed_annotation( just_object: bool = False, gse_meta_dict: dict = None, ) -> Union[NoReturn, peppy.Project]: - """ - Save annotation file by providing list of dictionaries with files metadata - - :param list processed_metadata: list of dictionaries with files metadata - :param str file_annotation_path: the path to the metadata file that has to be saved - :param just_object: True, if you want to get peppy object without saving file - :param gse_meta_dict: dict of metadata fetched from one experiment. - Used to add this data to config file. - :return: none, or peppy project + """Save annotation file by providing list of dictionaries with files metadata. + + Args: + processed_metadata: List of dictionaries with files metadata. + file_annotation_path: The path to the metadata file that has to be saved. + just_object: True, if you want to get peppy object without saving file. + gse_meta_dict: Dict of metadata fetched from one experiment. + Used to add this data to config file. + + Returns: + None, or peppy project. """ if len(processed_metadata) == 0: _LOGGER.info( @@ -1084,11 +1105,13 @@ def _write_processed_annotation( @staticmethod def _find_genome(metadata_list: list) -> list: - """ - Create new genome column by searching joining few columns + """Create new genome column by searching and joining a few columns. + + Args: + metadata_list: List with metadata dict. - :param metadata_list: list with metadata dict - :return: list with metadata dict where genome column was added + Returns: + List with metadata dict where genome column was added. """ list_keys = _get_list_of_keys(metadata_list) genome_keys = [ @@ -1111,15 +1134,18 @@ def _write_raw_annotation_new( subannot_dict: dict = None, gse_meta_dict: dict = None, ) -> Union[None, peppy.Project]: - """ - Combine individual accessions into project-level annotations, and writing - individual accession files (if requested) - - :param name: Name of the run, project, or acc --> will influence name of the folder where project will be created - :param metadata_dict: dictionary of sample annotations - :param subannot_dict: dictionary of subsample annotations - :param gse_meta_dict: dict of experiment metadata that was sotred in gse - :return: none or peppy object + """Combine individual accessions into project-level annotations. + + Write individual accession files (if requested). + + Args: + name: Name of the run, project, or acc --> will influence name of the folder where project will be created. + metadata_dict: Dictionary of sample annotations. + subannot_dict: Dictionary of subsample annotations. + gse_meta_dict: Dict of experiment metadata that was stored in gse. + + Returns: + None or peppy object. """ try: assert len(metadata_dict) > 0 @@ -1242,13 +1268,15 @@ def _create_config_processed( proj_meta: list, meta_in_series: dict = True, ) -> str: - """ - Compose and generate config file content + """Compose and generate config file content. + + Args: + file_annotation_path: Path to the annotation file. + proj_meta: Common metadata that has to be added to the config file. + meta_in_series: Metadata in series.
- :param file_annotation_path: root to the annotation file - :param proj_meta: common metadata that has to added to config file - :param meta_in_series: - :return: generated, complete config file content + Returns: + Generated, complete config file content. """ geofetchdir = os.path.dirname(__file__) @@ -1287,14 +1315,16 @@ def _create_config_processed( def _create_config_raw( self, proj_meta, proj_root_sample, subanot_path_yaml, meta_in_series=None ): - """ - Compose and generate config file content for raw data + """Compose and generate config file content for raw data. + + Args: + proj_meta: Common metadata that has to be added to the config file. + proj_root_sample: Path to sampletable file. + subanot_path_yaml: Path to subannotation file. + meta_in_series: Metadata in series. - :param proj_meta: root to the annotation file - :param proj_root_sample: path to sampletable file - :param subanot_path_yaml: path to subannotation file - :param meta_in_series: - :return: generated, complete config file content + Returns: + Generated, complete config file content. """ meta_list_str = [ f'{list(i.keys())[0]}: "{_sanitize_config_string(list(i.values())[0])}"' @@ -1349,12 +1379,15 @@ def _create_config_raw( @staticmethod def _check_sample_name_standard(metadata_dict: dict) -> dict: - """ - Standardize sample name and checking if it exists - (This function is used for raw data) + """Standardize sample name and check if it exists. - :param metadata_dict: metadata dict - :return: metadata dict with standardize sample names + This function is used for raw data. + + Args: + metadata_dict: Metadata dict. + + Returns: + Metadata dict with standardized sample names. """ fixed_dict = {} for key_sample, value_sample in metadata_dict.items(): @@ -1376,14 +1409,16 @@ def _separate_common_meta( del_limit: int = 1000, attr_limit_truncate: int = 500, ) -> tuple: - """ - Separate experiment(project) metadata from sample metadata + """Separate experiment(project) metadata from sample metadata. - :param list or dict meta_list: list of dictionaries of samples - :param int max_len: threshold of the length of the common value that can be stored in the sample table - :param int del_limit: threshold of the length of the common value that have to be deleted - :param int attr_limit_truncate: max length of the attribute in the sample csv - :return set: Return is a set of list, where 1 list (or dict) is + Args: + meta_list: List of dictionaries of samples. + max_len: Threshold of the length of the common value that can be stored in the sample table. + del_limit: Threshold of the length of the common value that has to be deleted. + attr_limit_truncate: Max length of the attribute in the sample csv. + + Returns: + Return is a set of list, where 1 list (or dict) is list of samples metadata dictionaries and 2: list of common samples metadata dictionaries that are linked to the project. """ @@ -1452,11 +1487,12 @@ def _separate_common_meta( return meta_list, new_meta_project def _download_SRA_file(self, run_name: str): - """ - Download SRA file by ising 'prefetch' utility from the SRA Toolkit - more info: (http://www.ncbi.nlm.nih.gov/books/NBK242621/) + """Download SRA file using 'prefetch' utility from the SRA Toolkit. - :param str run_name: SRR number of the SRA file + More info: (http://www.ncbi.nlm.nih.gov/books/NBK242621/) + + Args: + run_name: SRR number of the SRA file.
""" # Set up a simple loop to try a few times in case of failure @@ -1479,11 +1515,11 @@ def _download_SRA_file(self, run_name: str): time.sleep(t * 2) def _sra_to_bam_conversion_sam_dump(self, bam_file: str, run_name: str) -> NoReturn: - """ - Convert SRA file to BAM file by using samtools function "sam-dump" + """Convert SRA file to BAM file by using samtools function "sam-dump". - :param str bam_file: path to BAM file that has to be created - :param str run_name: SRR number of the SRA file that has to be converted + Args: + bam_file: Path to BAM file that has to be created. + run_name: SRR number of the SRA file that has to be converted. """ _LOGGER.info("Converting to bam: " + run_name) sra_file = os.path.join(self.sra_folder, run_name + ".sra") @@ -1506,12 +1542,14 @@ def _sra_to_bam_conversion_sam_dump(self, bam_file: str, run_name: str) -> NoRet def _sra_to_bam_conversion_fastq_damp( self, bam_file: str, run_name: str, picard_path: str = None ) -> NoReturn: - """ - Convert SRA file to BAM file by using fastq-dump - (is used when sam-dump fails, yielding an empty bam file. Here fastq -> bam conversion is used) - :param str bam_file: path to BAM file that has to be created - :param str run_name: SRR number of the SRA file that has to be converted - :param str picard_path: Path to The Picard toolkit. More info: https://broadinstitute.github.io/picard/ + """Convert SRA file to BAM file by using fastq-dump. + + Used when sam-dump fails, yielding an empty bam file. Here fastq -> bam conversion is used. + + Args: + bam_file: Path to BAM file that has to be created. + run_name: SRR number of the SRA file that has to be converted. + picard_path: Path to The Picard toolkit. More info: https://broadinstitute.github.io/picard/. """ # check to make sure it worked @@ -1547,16 +1585,17 @@ def _sra_to_bam_conversion_fastq_damp( def _write_subannotation( self, tabular_data: dict, filepath: str, column_names: list = None ): - """ - Write one or more tables to a given CSV filepath. - - :param tabular_data: Mapping | Iterable[Mapping]: single KV pair collection, or collection - of such collections, to write to disk as tabular data - :param str filepath: path to file to write, possibly with environment - variables included, e.g. from a config file - :param Iterable[str] column_names: collection of names for columns to - write - :return str: path to file written + """Write one or more tables to a given CSV filepath. + + Args: + tabular_data: Single KV pair collection, or collection + of such collections, to write to disk as tabular data. + filepath: Path to file to write, possibly with environment + variables included, e.g. from a config file. + column_names: Collection of names for columns to write. + + Returns: + Path to file written. """ _LOGGER.info(f"Sample subannotation sheet: {filepath}") fp = expandpath(filepath) @@ -1576,12 +1615,13 @@ def _write_subannotation( def _download_file( self, file_url: str, data_folder: str, new_name: str = None, sleep_after=0.5 ) -> NoReturn: - """ - Given an url for a file, downloading file to specified folder - :param str file_url: the URL of the file to download - :param str data_folder: path to the folder where data should be downloaded - :param float sleep_after: time to sleep after downloading - :param str new_name: new file name in the + """Given a url for a file, download file to specified folder. + + Args: + file_url: The URL of the file to download. + data_folder: Path to the folder where data should be downloaded. 
+ sleep_after: Time to sleep after downloading. + new_name: New file name. """ filename = os.path.basename(file_url) if new_name is None: @@ -1606,11 +1646,14 @@ def _download_file( def _get_list_of_processed_files( self, file_gse_content: list, file_gsm_content: list ) -> tuple: - """ - Given a paths to GSE and GSM metafile create a list of dicts of metadata of processed files - :param list file_gse_content: list of lines of gse metafile - :param list file_gsm_content: list of lines of gse metafile - :return: tuple[list of metadata of processed sample files and series files] + """Given paths to GSE and GSM metafiles, create a list of dicts of metadata of processed files. + + Args: + file_gse_content: List of lines of gse metafile. + file_gsm_content: List of lines of gsm metafile. + + Returns: + Tuple[list of metadata of processed sample files and series files]. """ tar_re = re.compile(r".*\.tar$") gse_numb = None @@ -1778,11 +1821,14 @@ def _get_list_of_processed_files( return meta_processed_samples, meta_processed_series def _run_filter(self, meta_list: list, col_name: str = "file") -> list: - """ - Filters files and metadata using Regular expression filter - :param meta_list: list of composed metadata - :param col_name: name of the column where file names are stored - :return: metadata list after file_name filter + """Filter files and metadata using a regular expression filter. + + Args: + meta_list: List of composed metadata. + col_name: Name of the column where file names are stored. + + Returns: + Metadata list after file_name filter. """ filtered_list = [] for meta_elem in meta_list: @@ -1796,11 +1842,14 @@ def _run_filter(self, meta_list: list, col_name: str = "file") -> list: return filtered_list def _run_size_filter(self, meta_list, col_name="file_size"): - """ - Filters files and metadata by file size column specified in meta_list - :param meta_list: list of composed metadata - :param col_name: name of the column where is size information stored - :return: metadata list after size filter + """Filter files and metadata by file size column specified in meta_list. + + Args: + meta_list: List of composed metadata. + col_name: Name of the column where the size information is stored. + + Returns: + Metadata list after size filter. """ if self.filter_size is not None: filtered_list = [] @@ -1819,12 +1868,15 @@ def _run_size_filter(self, meta_list, col_name="file_size"): return filtered_list def _download_processed_file(self, file_url: str, data_folder: str) -> bool: - """ - Given a url for a file, download it, and extract anything passing the filter. - :param str file_url: the URL of the file to download - :param str data_folder: the local folder where the file should be saved - :return bool: True if the file is downloaded successfully; false if it does - not pass filters and is not downloaded. + """Given a url for a file, download it, and extract anything passing the filter. + + Args: + file_url: The URL of the file to download. + data_folder: The local folder where the file should be saved. + + Returns: + True if the file is downloaded successfully; false if it does + not pass filters and is not downloaded. """ if not self.geo_folder: @@ -1862,12 +1914,12 @@ def _download_processed_file(self, file_url: str, data_folder: str) -> bool: raise e def _get_SRA_meta(self, file_gse_content: list, gsm_metadata, file_sra=None): - """ - Parse out the SRA project identifier from the GSE file + """Parse out the SRA project identifier from the GSE file.
- :param list file_gse_content: list of content of file_sde_content - :param dict gsm_metadata: dict of GSM metadata - :param str file_sra: full path to SRA.csv metafile that has to be downloaded + Args: + file_gse_content: List of lines of the GSE soft file. + gsm_metadata: Dict of GSM metadata. + file_sra: Full path to SRA.csv metafile that has to be downloaded. """ # acc_SRP = None @@ -1951,10 +2003,13 @@ def _get_SRA_meta(self, file_gse_content: list, gsm_metadata, file_sra=None): return [] def _get_SRP_list(self, srp_number: str) -> list: - """ - Get a list of srp by using requests and xml searching and getting list of dicts of SRRs - :param str srp_number: SRP number - :return: list of dicts of SRRs + """Get a list of srp by using requests and xml searching and getting list of dicts of SRRs. + + Args: + srp_number: SRP number. + + Returns: + List of dicts of SRRs. """ if not srp_number: _LOGGER.info("No srp number in this accession found") @@ -1996,13 +2051,15 @@ def _get_SRP_list(self, srp_number: str) -> list: def _read_gsm_metadata( self, acc_GSE: str, acc_GSE_list: dict, file_gsm_content: list ) -> dict: - """ - A simple state machine to parse SOFT formatted files (Here, the GSM file) + """A simple state machine to parse SOFT formatted files (Here, the GSM file). + + Args: + acc_GSE: GSE number (Series accession). + acc_GSE_list: List of GSE. + file_gsm_content: List of contents of gsm file. - :param str acc_GSE: GSE number (Series accession) - :param dict acc_GSE_list: list of GSE - :param list file_gsm_content: list of contents of gsm file - :return dict: dictionary of experiment information (gsm_metadata) + Returns: + Dictionary of experiment information (gsm_metadata). """ gsm_metadata = {} @@ -2095,12 +2152,13 @@ def _write( msg_pre: str = None, omit_newline: bool = False, ): - """ - Save new file (used for config file) - :param f_var_value: path to the file - :param content: content of the file - :param msg_pre: msg that have to be printed - :param omit_newline: omit new line + """Save new file (used for config file). + + Args: + f_var_value: Path to the file. + content: Content of the file. + msg_pre: Message that has to be printed. + omit_newline: Omit new line. """ fp = expandpath(f_var_value) _LOGGER.info((msg_pre or "") + fp) diff --git a/geofetch/sraconvert.py b/geofetch/sraconvert.py index 7b05a34..abd4f7b 100755 --- a/geofetch/sraconvert.py +++ b/geofetch/sraconvert.py @@ -86,16 +86,25 @@ def _parse_cmdl(cmdl): def safe_echo(var): - """Returns an environment variable if it exists, or an empty string if not""" + """Return an environment variable if it exists, or an empty string if not. + + Args: + var: Environment variable name. + + Returns: + Environment variable value or empty string. + """ return os.getenv(var, "") def uniqify(seq): # Dave Kirby - """ - Return only unique items in a sequence, preserving order + """Return only unique items in a sequence, preserving order. + + Args: + seq: List of items to uniqify. - :param list seq: List of items to uniqify - :return list[object]: Original list with duplicates removed + Returns: + Original list with duplicates removed. """ # Order preserving seen = set() diff --git a/geofetch/utils.py b/geofetch/utils.py index 66bdd88..00d870c 100644 --- a/geofetch/utils.py +++ b/geofetch/utils.py @@ -33,14 +33,18 @@ def build_prefetch_command( def is_known_type(accn: str = None, typename: str = None): - """ - Determine if the given accession is of a known type. + """Determine if the given accession is of a known type.
+ + Args: + accn: Accession of interest. + typename: Check this typename for known status rather + than parsing an accession. + + Returns: + Whether the given accession is of a known type. - :param str accn: accession of interest - :param str typename: check this typename for known status rather - than parsing an accession - :return bool: whether the given accession is of a known type. - :raise TypeError: if neither argument is provided or one/both are empty. + Raises: + TypeError: If neither argument is provided or one/both are empty. """ if not (accn or typename): raise TypeError("Specify either accession or accession typename") @@ -54,19 +58,19 @@ def is_known_type(accn: str = None, typename: str = None): def parse_accessions(input_arg, metadata_folder, just_metadata=False, max_size=None): - """ - Create a list of GSE accessions, either from file or a single value. + """Create a list of GSE accessions, either from file or a single value. This will be a dict, with the GSE# as the key, and corresponding value is a list of GSM# specifying the samples we're interested in from that GSE#. An empty sample list means we should get all samples from that GSE#. This loop will create this dict. - :param input_arg: Input argument (GSE, or file) - :param str metadata_folder: path to folder for accession metadata - :param bool just_metadata: whether to only process metadata, not the - actual data associated with the accession - :param str | int max_size: argument for prefetch command's --max-size option + Args: + input_arg: Input argument (GSE, or file). + metadata_folder: Path to folder for accession metadata. + just_metadata: Whether to only process metadata, not the + actual data associated with the accession. + max_size: Argument for prefetch command's --max-size option. """ acc_GSE_list = {} @@ -134,11 +138,13 @@ def parse_accessions(input_arg, metadata_folder, just_metadata=False, max_size=N def parse_SOFT_line(line: str) -> dict: - """ - Parse SOFT formatted line, returning a dictionary with the key-value pair. + """Parse SOFT formatted line, returning a dictionary with the key-value pair. - :param str line: A SOFT-formatted line to parse ( !key = value ) - :return dict[str, str]: A python Dict object representing the key-value. + Args: + line: A SOFT-formatted line to parse ( !key = value ). + + Returns: + A python Dict object representing the key-value. """ elems = line[1:].split("=") return {elems[0].rstrip(): "=".join(elems[1:]).lstrip()} @@ -148,11 +154,11 @@ class AccessionException(Exception): """Exceptional condition(s) dealing with accession number(s).""" def __init__(self, reason: str = ""): - """ - Optionally provide explanation for exceptional condition. + """Optionally provide explanation for exceptional condition. - :param str reason: some context or perhaps just a value that - could not be interpreted as an accession + Args: + reason: Some context or perhaps just a value that + could not be interpreted as an accession. """ super(AccessionException, self).__init__(reason) @@ -161,11 +167,11 @@ class SoftFileException(Exception): """Exceptional condition(s) dealing with accession number(s).""" def __init__(self, reason: str = ""): - """ - Optionally provide explanation for exceptional condition. + """Optionally provide explanation for exceptional condition. - :param str reason: some context or perhaps just a value that - could not be interpreted as an accession + Args: + reason: Some context or perhaps just a value that + could not be interpreted as an accession. 
""" super(SoftFileException, self).__init__(reason) @@ -176,16 +182,17 @@ class Accession(object): _LOGGER = logging.getLogger("{}.{}".format(__name__, "Accession")) def __init__(self, accn, strict=True): - """ - Create an instance with an accession and optionally a validation - strictness flag. - - :param str accn: accession - :param bool strict: strictness of the validation (whether to require - that the accession type is known here) - :raise AccessionException: if the given accession value isn't - prefixed with three characters followed by an integer, or if - strict validation is required and the accession type is unknown + """Create an instance with an accession and optionally a validation strictness flag. + + Args: + accn: Accession. + strict: Strictness of the validation (whether to require + that the accession type is known here). + + Raises: + AccessionException: If the given accession value isn't + prefixed with three characters followed by an integer, or if + strict validation is required and the accession type is unknown. """ typename, number = self._validate(accn) if strict and not is_known_type(accn): @@ -203,15 +210,17 @@ def fetch_metadata( clean: bool = False, max_soft_size: int = 1073741824, ) -> list: - """ - Fetch the metadata associated with this accession. - - :param str typename: type indicating URL format, use type - parsed at construction if unspecified - :param str outpath: path to file to which to write output, optional - :param bool clean: if true, files won't be saved - :param int max_soft_size: max soft file size in bytes - :return: list of lines in soft file + """Fetch the metadata associated with this accession. + + Args: + typename: Type indicating URL format, use type + parsed at construction if unspecified. + outpath: Path to file to which to write output, optional. + clean: If true, files won't be saved. + max_soft_size: Max soft file size in bytes. + + Returns: + List of lines in soft file. """ typename = (typename or self.typename).upper() @@ -283,10 +292,13 @@ def fetch_metadata( @staticmethod def _validate(accn: str): - """ - Determine if given value looks like an accession. - :param str accn: ordinary accession identifier. - :return: typename, number + """Determine if given value looks like an accession. + + Args: + accn: Ordinary accession identifier. + + Returns: + typename, number. """ typename, number = split_accn(accn) if len(typename) != 3: @@ -305,16 +317,15 @@ def _validate(accn: str): @staticmethod def accn_type_exception(accn: str, typename: str, include_known: bool = True): - """ - Create an exception instance based on an accession and a - parsed unknown typename. - - :param str accn: accession identifier from which unknown typename - was parsed - :param str typename: unknown typename that was parsed - :param bool include_known: whether to include the known - typenames in the exception message - :return AccessionException: the exception instance + """Create an exception instance based on an accession and a parsed unknown typename. + + Args: + accn: Accession identifier from which unknown typename was parsed. + typename: Unknown typename that was parsed. + include_known: Whether to include the known typenames in the exception message. + + Returns: + The exception instance. 
""" message = "Unknown accn type for '{}': '{}'".format(accn, typename) if include_known: @@ -323,22 +334,26 @@ def accn_type_exception(accn: str, typename: str, include_known: bool = True): def split_accn(accn: str): - """ - Split accession into prefix and number, leaving suffix as text - and converting the type prefix to uppercase. + """Split accession into prefix and number, leaving suffix as text and converting the type prefix to uppercase. + + Args: + accn: Ordinary accession identifier. - :param str accn: ordinary accession identifier. - :return str, str: prefix and integral suffix + Returns: + Prefix and integral suffix. """ typename, number_text = accn[:3], accn[3:] return typename.upper(), number_text def convert_size(size_str: str) -> int: - """ - Converting size, that was provided as string with suffix - :param str size_str: size as string with suffix: gb, mb, kb or b - :return int: size as int value in bytes + """Convert size, that was provided as string with suffix. + + Args: + size_str: Size as string with suffix: gb, mb, kb or b. + + Returns: + Size as int value in bytes. """ abbreviation_dict = {"gb": 1073741824, "mb": 1048576, "kb": 1024, "b": 1} supported_formats = r"(\dgb|\dmb|\db|\dkb)$" @@ -362,10 +377,10 @@ def convert_size(size_str: str) -> int: def clean_soft_files(meta_dir: str): - """ - Cleaning, deleting all soft files after downloading files - and creating PEPs - :param str meta_dir: Path to the metadata files + """Clean, delete all soft files after downloading files and creating PEPs. + + Args: + meta_dir: Path to the metadata files. """ try: dir_files = os.listdir(meta_dir) @@ -398,11 +413,13 @@ def run_subprocess(*args, **kwargs): def _get_list_of_keys(list_of_dict: list): - """ - Getting list of all keys that are in the dictionaries in the list + """Get list of all keys that are in the dictionaries in the list. - :param list list_of_dict: list of dicts with metadata - :return list: list of dictionary keys + Args: + list_of_dict: List of dicts with metadata. + + Returns: + List of dictionary keys. """ dict_keys = {"sample_name": None} @@ -415,19 +432,26 @@ def _get_list_of_keys(list_of_dict: list): def _get_value(all_line: str): - """ - :param all_line: string with key value. (e.g. '!Series_geo_accession = GSE188720') - :return: value (e.g. GSE188720) + """Extract value from SOFT format line. + + Args: + all_line: String with key value. (e.g. '!Series_geo_accession = GSE188720'). + + Returns: + Value (e.g. GSE188720). """ line_value = all_line.split("= ")[-1] return line_value.split(": ")[-1].rstrip("\n") def _read_tar_filelist(raw_text: str) -> dict: - """ - Creating list for supplementary files that are listed in "filelist.txt" - :param str raw_text: path to the file with information about files that are zipped ("filelist.txt") - :return dict: dict of supplementary file names and additional information + """Create list for supplementary files that are listed in "filelist.txt". + + Args: + raw_text: Path to the file with information about files that are zipped ("filelist.txt"). + + Returns: + Dict of supplementary file names and additional information. """ f = StringIO(raw_text) files_info = {} @@ -450,10 +474,13 @@ def _read_tar_filelist(raw_text: str) -> dict: def _check_file_existance(meta_processed_sample: list) -> list: - """ - Checking if last element of the list has files. 
If list of files is empty deleting it - :param: meta_processed_sample: list with metadata dictionary - :return: list with metadata dictionary after processing + """Check if the last element of the list has files. If the list of files is empty, delete it. + + Args: + meta_processed_sample: List with metadata dictionary. + + Returns: + List with metadata dictionary after processing. """ nb = len(meta_processed_sample) - 1 if nb > -1: @@ -464,11 +491,11 @@ def _check_file_existance(meta_processed_sample: list) -> list: def _separate_list_of_files(meta_list: Union[list, dict], col_name: str = "files"): - """ - This method is separating list of files (dict value) or just simple dict - into two different dicts - :param col_name: column name that should be added with filenames - :param meta_list: list, or dict with metadata + """Separate list of files (dict value) or just simple dict into two different dicts. + + Args: + col_name: Column name that should be added with filenames. + meta_list: List, or dict with metadata. """ separated_list = [] if isinstance(meta_list, list): @@ -493,22 +520,23 @@ def _separate_list_of_files(meta_list: Union[list, dict], col_name: str = "files def _update_columns( metadata: dict, experiment_name: str, sample_name: str, read_type: str ) -> dict: - """ - Update the metadata associated with a particular experiment. + """Update the metadata associated with a particular experiment. For the experiment indicated, this function updates the value (mapping), including new data and populating columns used by looper based on existing values in the mapping. - :param Mapping metadata: the key-value mapping to update - :param str experiment_name: name of the experiment from which these - data came and are associated; the key in the metadata mapping - for which the value is to be updated - :param str sample_name: name of the sample with which these data are - associated - :param str read_type: usually "single" or "paired," an indication of the - type of sequencing reads for this experiment - :return: updated metadata + Args: + metadata: The key-value mapping to update. + experiment_name: Name of the experiment from which these + data came and are associated; the key in the metadata mapping + for which the value is to be updated. + sample_name: Name of the sample with which these data are associated. + read_type: Usually "single" or "paired," an indication of the + type of sequencing reads for this experiment. + + Returns: + Updated metadata. """ exp = metadata[experiment_name] @@ -538,10 +566,13 @@ def _update_columns( def _sanitize_config_string(text: str) -> str: - """ - Function that sanitizes text in config file. - :param text: Any string that have to be sanitized - :return: sanitized strings + """Sanitize text in config file. + + Args: + text: Any string that has to be sanitized. + + Returns: + Sanitized string. """ new_str = text new_str = new_str.replace('"', '\\"') @@ -550,10 +581,13 @@ def _sanitize_config_string(text: str) -> str: def _sanitize_name(name_str: str) -> str: - """ - Function that sanitizes strings. (Replace all odd characters) - :param str name_str: Any string value that has to be sanitized. - :return: sanitized strings + """Sanitize strings by replacing all odd characters. + + Args: + name_str: Any string value that has to be sanitized. + + Returns: + Sanitized string.
""" new_str = name_str punctuation1 = r"""!"#$%&'()*,./:;<=>?@[\]^_`{|}~""" @@ -564,18 +598,24 @@ def _sanitize_name(name_str: str) -> str: def _create_dot_yaml(file_path: str, yaml_path: str) -> NoReturn: - """ - Function that creates .pep.yaml file that points to actual yaml file - :param str file_path: Path to the .pep.yaml file that we want to create - :param str yaml_path: path or name of the actual yaml file + """Create .pep.yaml file that points to actual yaml file. + + Args: + file_path: Path to the .pep.yaml file that we want to create. + yaml_path: Path or name of the actual yaml file. """ with open(file_path, "w+") as file: file.writelines(f"config_file: {yaml_path}") def _which(program: str): - """ - return str: the path to a program to make sure it exists + """Return the path to a program to make sure it exists. + + Args: + program: Program name. + + Returns: + The path to the program if it exists. """ import os @@ -597,13 +637,17 @@ def is_exe(fp): def _dict_to_list_converter( proj_dict: Dict = None, proj_list: List = None ) -> Union[Dict, List]: - """ - Converter project dict to list and vice versa + """Convert project dict to list and vice versa. + dict -> list list -> dict - :param proj_dict: project dictionary - :param proj_list: project list - :return: converted values + + Args: + proj_dict: Project dictionary. + proj_list: Project list. + + Returns: + Converted values. """ if proj_dict is not None: new_meta_list = [] @@ -627,10 +671,13 @@ def _dict_to_list_converter( def _standardize_colnames(meta_list: Union[list, dict]) -> Union[list, dict]: - """ - Standardize column names by lower-casing and underscore - :param list meta_list: list of dictionaries of samples - :return : list of dictionaries of samples with standard colnames + """Standardize column names by lower-casing and underscore. + + Args: + meta_list: List of dictionaries of samples. + + Returns: + List of dictionaries of samples with standard colnames. """ # check if meta_list is dict and converting it to list input_is_dict = False @@ -659,9 +706,7 @@ def _standardize_colnames(meta_list: Union[list, dict]) -> Union[list, dict]: def _separate_file_url(meta_list): - """ - This method is adding dict key without file_name without path - """ + """Add dict key without file_name without path.""" separated_list = [] for meta_elem in meta_list: new_dict = meta_elem.copy() @@ -686,8 +731,15 @@ def _separate_file_url(meta_list): def make_sample_name_unique( sanit_name: str, separated_list: list, new_number: int = 1 ) -> str: - """ - Check if name is unique for current sample + """Check if name is unique for current sample. + + Args: + sanit_name: Sanitized name. + separated_list: List of separated samples. + new_number: Number to append if name is not unique. + + Returns: + Unique sample name. """ if sanit_name not in [f["sample_name"] for f in separated_list]: return sanit_name @@ -698,14 +750,18 @@ def make_sample_name_unique( def _filter_gsm(meta_processed_samples: list, gsm_list: dict) -> list: - """ - Getting metadata list of all samples of one experiment and filtering it - by the list of GSM that was specified in the input files. - And then changing names of the sample names. + """Get metadata list of all samples of one experiment and filter it. + + Filter by the list of GSM that was specified in the input files. + And then change names of the sample names. + + Args: + meta_processed_samples: List of metadata dicts of samples. + gsm_list: List of dicts where GSM (samples) are keys and + sample names are values. 
Where values can be empty string. - :param meta_processed_samples: list of metadata dicts of samples - :param gsm_list: list of dicts where GSM (samples) are keys and - sample names are values. Where values can be empty string + Returns: + Filtered list of samples. """ if gsm_list.keys(): @@ -723,12 +779,13 @@ def _filter_gsm(meta_processed_samples: list, gsm_list: dict) -> list: def _unify_list_keys(processed_meta_list: list) -> list: - """ - Unifying list of dicts with metadata, so every dict will have - same keys + """Unify list of dicts with metadata, so every dict will have same keys. + + Args: + processed_meta_list: List of dicts with metadata. - :param list processed_meta_list: list of dicts with metadata - :return list: list of unified dicts with metadata + Returns: + List of unified dicts with metadata. """ list_of_keys = _get_list_of_keys(processed_meta_list) for k in list_of_keys: @@ -739,10 +796,13 @@ def _unify_list_keys(processed_meta_list: list) -> list: def gse_content_to_dict(gse_content: List[str]) -> Dict[str, dict]: - """ - Unpack gse soft file to dict - :param gse_content: list of strings of gse soft file - :return: dict of gse content + """Unpack gse soft file to dict. + + Args: + gse_content: List of strings of gse soft file. + + Returns: + Dict of gse content. """ gse_dict = {} for line in gse_content: @@ -761,10 +821,10 @@ def gse_content_to_dict(gse_content: List[str]) -> Dict[str, dict]: def is_prefetch_callable() -> bool: - """ - Test if the prefetch command can be run. + """Test if the prefetch command can be run. - :return: True if it is available. + Returns: + True if it is available. """ try: # Option -V means display version and then quit.