From e7d1cf3059579482ccf27d757adffd13d7cb6d8a Mon Sep 17 00:00:00 2001 From: Sam M Date: Fri, 8 Aug 2025 14:13:46 -0700 Subject: [PATCH 1/4] Add basic error handling in the main fetch_all loop --- geofetch/geofetch.py | 240 ++++++++++++++++++++++--------------------- 1 file changed, 122 insertions(+), 118 deletions(-) diff --git a/geofetch/geofetch.py b/geofetch/geofetch.py index e841716..9e8d1e6 100755 --- a/geofetch/geofetch.py +++ b/geofetch/geofetch.py @@ -401,139 +401,143 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje description="Processing... ", disable=self.disable_progressbar, ): - ncount += 1 - if ncount <= self.skip: - continue - elif ncount == self.skip + 1: - _LOGGER.info(f"Skipped {self.skip} accessions. Starting now.") - - if not self.just_object or not self.acc_anno: - _LOGGER.info( - f"\033[38;5;200mProcessing accession {ncount} of {nkeys}: '{acc_GSE}'\033[0m" - ) + try: + ncount += 1 + if ncount <= self.skip: + continue + elif ncount == self.skip + 1: + _LOGGER.info(f"Skipped {self.skip} accessions. Starting now.") - if len(re.findall(GSE_PATTERN, acc_GSE)) != 1: - _LOGGER.debug(len(re.findall(GSE_PATTERN, acc_GSE))) - _LOGGER.warning( - "This does not appear to be a correctly formatted GSE accession! " - "Continue anyway..." - ) + if not self.just_object or not self.acc_anno: + _LOGGER.info( + f"\033[38;5;200mProcessing accession {ncount} of {nkeys}: '{acc_GSE}'\033[0m" + ) - if len(acc_GSE_list[acc_GSE]) > 0: - _LOGGER.info( - f"Limit to: {list(acc_GSE_list[acc_GSE])}" - ) # a list of GSM#s - - # For each GSE acc, produce a series of metadata files - file_gse = os.path.join(self.metadata_expanded, acc_GSE + "_GSE.soft") - file_gsm = os.path.join(self.metadata_expanded, acc_GSE + "_GSM.soft") - file_sra = os.path.join(self.metadata_expanded, acc_GSE + "_SRA.csv") - - if not os.path.isfile(file_gse) or self.refresh_metadata: - file_gse_content = Accession(acc_GSE).fetch_metadata( - file_gse, - clean=self.discard_soft, - max_soft_size=self.max_soft_size, - ) - else: - _LOGGER.info(f"Found previous GSE file: {file_gse}") - gse_file_obj = open(file_gse, "r") - file_gse_content = gse_file_obj.read().split("\n") - file_gse_content = [elem for elem in file_gse_content if len(elem) > 0] - - file_gse_content_dict = gse_content_to_dict(file_gse_content) - - if not os.path.isfile(file_gsm) or self.refresh_metadata: - file_gsm_content = Accession(acc_GSE).fetch_metadata( - file_gsm, - typename="GSM", - clean=self.discard_soft, - max_soft_size=self.max_soft_size, - ) - else: - _LOGGER.info(f"Found previous GSM file: {file_gsm}") - gsm_file_obj = open(file_gsm, "r") - file_gsm_content = gsm_file_obj.read().split("\n") - file_gsm_content = [elem for elem in file_gsm_content if len(elem) > 0] - - gsm_enter_dict = acc_GSE_list[acc_GSE] - - # download processed data - if self.processed: - ( - meta_processed_samples, - meta_processed_series, - ) = self.fetch_processed_one( - gse_file_content=file_gse_content, - gsm_file_content=file_gsm_content, - gsm_filter_list=gsm_enter_dict, - ) + if len(re.findall(GSE_PATTERN, acc_GSE)) != 1: + _LOGGER.debug(len(re.findall(GSE_PATTERN, acc_GSE))) + _LOGGER.warning( + "This does not appear to be a correctly formatted GSE accession! " + "Continue anyway..." 
+ ) - # download processed files: - if not self.just_metadata: - self._download_processed_data( - acc_gse=acc_GSE, - meta_processed_samples=meta_processed_samples, - meta_processed_series=meta_processed_series, + if len(acc_GSE_list[acc_GSE]) > 0: + _LOGGER.info( + f"Limit to: {list(acc_GSE_list[acc_GSE])}" + ) # a list of GSM#s + + # For each GSE acc, produce a series of metadata files + file_gse = os.path.join(self.metadata_expanded, acc_GSE + "_GSE.soft") + file_gsm = os.path.join(self.metadata_expanded, acc_GSE + "_GSM.soft") + file_sra = os.path.join(self.metadata_expanded, acc_GSE + "_SRA.csv") + + if not os.path.isfile(file_gse) or self.refresh_metadata: + file_gse_content = Accession(acc_GSE).fetch_metadata( + file_gse, + clean=self.discard_soft, + max_soft_size=self.max_soft_size, ) + else: + _LOGGER.info(f"Found previous GSE file: {file_gse}") + gse_file_obj = open(file_gse, "r") + file_gse_content = gse_file_obj.read().split("\n") + file_gse_content = [elem for elem in file_gse_content if len(elem) > 0] + + file_gse_content_dict = gse_content_to_dict(file_gse_content) + + if not os.path.isfile(file_gsm) or self.refresh_metadata: + file_gsm_content = Accession(acc_GSE).fetch_metadata( + file_gsm, + typename="GSM", + clean=self.discard_soft, + max_soft_size=self.max_soft_size, + ) + else: + _LOGGER.info(f"Found previous GSM file: {file_gsm}") + gsm_file_obj = open(file_gsm, "r") + file_gsm_content = gsm_file_obj.read().split("\n") + file_gsm_content = [elem for elem in file_gsm_content if len(elem) > 0] - # generating PEPs for processed files: - if self.acc_anno: - self._generate_processed_meta( - acc_GSE, + gsm_enter_dict = acc_GSE_list[acc_GSE] + + # download processed data + if self.processed: + ( meta_processed_samples, meta_processed_series, - gse_meta_dict=file_gse_content_dict, + ) = self.fetch_processed_one( + gse_file_content=file_gse_content, + gsm_file_content=file_gsm_content, + gsm_filter_list=gsm_enter_dict, ) - else: - # adding metadata from current experiment to the project - processed_metadata_samples.extend(meta_processed_samples) - processed_metadata_series.extend(meta_processed_series) + # download processed files: + if not self.just_metadata: + self._download_processed_data( + acc_gse=acc_GSE, + meta_processed_samples=meta_processed_samples, + meta_processed_series=meta_processed_series, + ) - else: - # read gsm metadata - gsm_metadata = self._read_gsm_metadata( - acc_GSE, acc_GSE_list, file_gsm_content - ) + # generating PEPs for processed files: + if self.acc_anno: + self._generate_processed_meta( + acc_GSE, + meta_processed_samples, + meta_processed_series, + gse_meta_dict=file_gse_content_dict, + ) - # download sra metadata - srp_list_result = self._get_SRA_meta( - file_gse_content, gsm_metadata, file_sra - ) - if not srp_list_result: - _LOGGER.info("No SRP data, continuing ....") - _LOGGER.warning("No raw pep will be created! 
....") - # delete current acc if no raw data was found - # del metadata_dict[acc_GSE] - pass - else: - _LOGGER.info("Parsing SRA file to download SRR records") - gsm_multi_table, gsm_metadata, runs = self._process_sra_meta( - srp_list_result, gsm_enter_dict, gsm_metadata - ) + else: + # adding metadata from current experiment to the project + processed_metadata_samples.extend(meta_processed_samples) + processed_metadata_series.extend(meta_processed_series) - # download raw data: - if not self.just_metadata: - for run in runs: - # download raw data - _LOGGER.info(f"Getting SRR: {run} in ({acc_GSE})") - self._download_raw_data(run) else: - _LOGGER.info("Dry run, no data will be downloaded") - - # save one project - if self.acc_anno and nkeys > 1: - self._write_raw_annotation_new( - name=acc_GSE, - metadata_dict=gsm_metadata, - subannot_dict=gsm_multi_table, - gse_meta_dict=file_gse_content_dict, + # read gsm metadata + gsm_metadata = self._read_gsm_metadata( + acc_GSE, acc_GSE_list, file_gsm_content ) - else: - metadata_dict_combined.update(gsm_metadata) - subannotation_dict_combined.update(gsm_multi_table) + # download sra metadata + srp_list_result = self._get_SRA_meta( + file_gse_content, gsm_metadata, file_sra + ) + if not srp_list_result: + _LOGGER.info("No SRP data, continuing ....") + _LOGGER.warning("No raw pep will be created! ....") + # delete current acc if no raw data was found + # del metadata_dict[acc_GSE] + pass + else: + _LOGGER.info("Parsing SRA file to download SRR records") + gsm_multi_table, gsm_metadata, runs = self._process_sra_meta( + srp_list_result, gsm_enter_dict, gsm_metadata + ) + + # download raw data: + if not self.just_metadata: + for run in runs: + # download raw data + _LOGGER.info(f"Getting SRR: {run} in ({acc_GSE})") + self._download_raw_data(run) + else: + _LOGGER.info("Dry run, no data will be downloaded") + + # save one project + if self.acc_anno and nkeys > 1: + self._write_raw_annotation_new( + name=acc_GSE, + metadata_dict=gsm_metadata, + subannot_dict=gsm_multi_table, + gse_meta_dict=file_gse_content_dict, + ) + + else: + metadata_dict_combined.update(gsm_metadata) + subannotation_dict_combined.update(gsm_multi_table) + except Exception as e: + _LOGGER.warning(f"Couldn't process {acc_GSE}: {e}") + continue _LOGGER.info(f"Finished processing {len(acc_GSE_list)} accession(s)") From 2e6578ab4065e4ceff2a50e1455ef686c05776e9 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 5 Nov 2025 20:50:58 -0500 Subject: [PATCH 2/4] fixed pr comments + lint + test python version --- .github/workflows/run-pytest.yml | 2 +- geofetch/geofetch.py | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 3d158c5..2fa4ad3 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -11,7 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ["3.8", "3.12"] + python-version: ["3.9", "3.13"] os: [ubuntu-latest] steps: diff --git a/geofetch/geofetch.py b/geofetch/geofetch.py index 9e8d1e6..e6be294 100755 --- a/geofetch/geofetch.py +++ b/geofetch/geofetch.py @@ -438,9 +438,11 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje ) else: _LOGGER.info(f"Found previous GSE file: {file_gse}") - gse_file_obj = open(file_gse, "r") - file_gse_content = gse_file_obj.read().split("\n") - file_gse_content = [elem for elem in file_gse_content if len(elem) > 0] + with open(file_gse, "r") as gse_file_obj: 
+ file_gse_content = gse_file_obj.read().split("\n") + file_gse_content = [ + elem for elem in file_gse_content if len(elem) > 0 + ] file_gse_content_dict = gse_content_to_dict(file_gse_content) @@ -453,9 +455,11 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje ) else: _LOGGER.info(f"Found previous GSM file: {file_gsm}") - gsm_file_obj = open(file_gsm, "r") - file_gsm_content = gsm_file_obj.read().split("\n") - file_gsm_content = [elem for elem in file_gsm_content if len(elem) > 0] + with open(file_gsm, "r") as gsm_file_obj: + file_gsm_content = gsm_file_obj.read().split("\n") + file_gsm_content = [ + elem for elem in file_gsm_content if len(elem) > 0 + ] gsm_enter_dict = acc_GSE_list[acc_GSE] @@ -536,7 +540,7 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Proje metadata_dict_combined.update(gsm_metadata) subannotation_dict_combined.update(gsm_multi_table) except Exception as e: - _LOGGER.warning(f"Couldn't process {acc_GSE}: {e}") + _LOGGER.warning(f"Couldn't process {acc_GSE}: {e}", exc_info=True) continue _LOGGER.info(f"Finished processing {len(acc_GSE_list)} accession(s)") From 846564adc9a04c5f79271e1933285015aabe98a0 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 1 Dec 2025 16:04:09 -0500 Subject: [PATCH 3/4] fixed series return object --- geofetch/geofetch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/geofetch/geofetch.py b/geofetch/geofetch.py index e6be294..5f8c66b 100755 --- a/geofetch/geofetch.py +++ b/geofetch/geofetch.py @@ -810,7 +810,7 @@ def _generate_processed_meta( f"{name}_series", name + EXP_SUPP_METADATA_FILE, ) - self._write_processed_annotation( + return_objects[f"{name}_series"] = self._write_processed_annotation( meta_processed_series, pep_acc_path_exp, just_object=self.just_object, From edbcca2e0e3d763103e3944bbeb64281c8ec0ca9 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 1 Dec 2025 18:42:13 -0500 Subject: [PATCH 4/4] updated version --- geofetch/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/geofetch/_version.py b/geofetch/_version.py index 5e44a42..8f52a6e 100644 --- a/geofetch/_version.py +++ b/geofetch/_version.py @@ -1 +1 @@ -__version__ = "0.12.8" +__version__ = "0.12.9"
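
A minimal sketch of the per-accession error-handling pattern that patches 1 and 2 converge on. The names process_all, process_one, and read_soft_lines are hypothetical stand-ins for illustration, not geofetch API; the real loop body is fetch_all above.

    import logging

    _LOGGER = logging.getLogger(__name__)

    def process_all(accessions, process_one):
        """Process each accession independently; one failure must not abort the batch."""
        failed = []
        for acc in accessions:
            try:
                process_one(acc)
            except Exception as e:
                # Patch 2's logging fix: exc_info=True attaches the full
                # traceback to the warning, not just the exception message.
                _LOGGER.warning(f"Couldn't process {acc}: {e}", exc_info=True)
                failed.append(acc)
        _LOGGER.info(f"Finished processing {len(accessions)} accession(s)")
        return failed

    def read_soft_lines(path):
        # Patch 2's file-handling fix: 'with' guarantees the handle is
        # closed even if the read raises, unlike the bare open() calls
        # that patch 1 still carried.
        with open(path, "r") as fh:
            return [line for line in fh.read().split("\n") if line]

Catching broad Exception and continuing is a deliberate trade-off for batch fetching: a malformed SOFT file or a transient network error on one GSE should surface as a warning with a traceback, not kill a run spanning hundreds of accessions.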
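
Patch 3's change is easiest to see in isolation: _write_processed_annotation returns the project object for the series PEP, but before the fix that return value was discarded, so the series project never landed in the mapping the method hands back. A simplified sketch under that reading; generate_processed_meta and write_annotation are hypothetical simplifications of _generate_processed_meta and _write_processed_annotation.

    def generate_processed_meta(name, samples, series, write_annotation):
        return_objects = {f"{name}_samples": write_annotation(samples)}
        # Before patch 3 this call's result was dropped; capturing it is
        # what makes the series project reachable to the method's callers.
        return_objects[f"{name}_series"] = write_annotation(series)
        return return_objects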