From c0233677015ae3a785c592e460a4f4712dfba0ab Mon Sep 17 00:00:00 2001 From: Michael Skinnider Date: Sun, 1 Mar 2026 09:21:38 -0500 Subject: [PATCH 1/4] remove invalid candidate SMILES --- src/clm/commands/write_structural_prior_CV.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/clm/commands/write_structural_prior_CV.py b/src/clm/commands/write_structural_prior_CV.py index 7a48a018..317dcc95 100644 --- a/src/clm/commands/write_structural_prior_CV.py +++ b/src/clm/commands/write_structural_prior_CV.py @@ -242,6 +242,25 @@ def write_structural_prior_CV( logger.info("Reading sample file from generative model") gen = read_csv_file(sample_file) + # some SMILES may be invalid when tabulate_molecules used a different + # rdkit version -- validate only generated SMILES that are candidates to + # match a test molecule + gen_sorted = gen.sort_values("mass", kind="stable") + gen_masses = gen_sorted["mass"].values + lo_vals = test["mass_range"].apply(lambda r: r[0]).values + hi_vals = test["mass_range"].apply(lambda r: r[1]).values + lefts = np.searchsorted(gen_masses, lo_vals, side="left") + rights = np.searchsorted(gen_masses, hi_vals, side="right") + candidate_positions = set() + for l, r in zip(lefts, rights): + candidate_positions.update(range(l, r)) + candidates = gen_sorted.iloc[sorted(candidate_positions)] + invalid_idx = candidates.index[ + candidates["smiles"].progress_apply(lambda s: clean_mol( + s, raise_error=False) is None) + ] + gen = gen.drop(invalid_idx) + inputs = {"model": gen.assign(source="model")} if pubchem_file: From b4eece5ec78714f5e250b3b21d5c01137ba801d4 Mon Sep 17 00:00:00 2001 From: Michael Skinnider Date: Sun, 1 Mar 2026 09:49:52 -0500 Subject: [PATCH 2/4] precommit formatting --- src/clm/commands/write_structural_prior_CV.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/clm/commands/write_structural_prior_CV.py b/src/clm/commands/write_structural_prior_CV.py index 317dcc95..e5339697 
100644 --- a/src/clm/commands/write_structural_prior_CV.py +++ b/src/clm/commands/write_structural_prior_CV.py @@ -252,12 +252,13 @@ def write_structural_prior_CV( lefts = np.searchsorted(gen_masses, lo_vals, side="left") rights = np.searchsorted(gen_masses, hi_vals, side="right") candidate_positions = set() - for l, r in zip(lefts, rights): - candidate_positions.update(range(l, r)) + for left, right in zip(lefts, rights): + candidate_positions.update(range(left, right)) candidates = gen_sorted.iloc[sorted(candidate_positions)] invalid_idx = candidates.index[ - candidates["smiles"].progress_apply(lambda s: clean_mol( - s, raise_error=False) is None) + candidates["smiles"].progress_apply( + lambda s: clean_mol(s, raise_error=False) is None + ) ] gen = gen.drop(invalid_idx) From e575c6fbb55257ebd65e6d013269fff2083c78b6 Mon Sep 17 00:00:00 2001 From: "Michael A. Skinnider" Date: Sun, 1 Mar 2026 09:50:30 -0500 Subject: [PATCH 3/4] more precommit --- src/clm/commands/write_structural_prior_CV.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/clm/commands/write_structural_prior_CV.py b/src/clm/commands/write_structural_prior_CV.py index e5339697..3dc31395 100644 --- a/src/clm/commands/write_structural_prior_CV.py +++ b/src/clm/commands/write_structural_prior_CV.py @@ -242,8 +242,8 @@ def write_structural_prior_CV( logger.info("Reading sample file from generative model") gen = read_csv_file(sample_file) - # some SMILES may be invalid when tabulate_molecules used a different - # rdkit version -- validate only generated SMILES that are candidates to + # some SMILES may be invalid when tabulate_molecules used a different + # rdkit version -- validate only generated SMILES that are candidates to # match a test molecule gen_sorted = gen.sort_values("mass", kind="stable") gen_masses = gen_sorted["mass"].values From 89d48039c17476686aca5bc1a85557c8c3237fab Mon Sep 17 00:00:00 2001 From: Seungchan An Date: Tue, 3 Mar 2026 17:08:00 -0500 Subject: [PATCH 
4/4] log removal of invalid SMILES with examples --- src/clm/commands/write_structural_prior_CV.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/clm/commands/write_structural_prior_CV.py b/src/clm/commands/write_structural_prior_CV.py index 3dc31395..e5c1b855 100644 --- a/src/clm/commands/write_structural_prior_CV.py +++ b/src/clm/commands/write_structural_prior_CV.py @@ -260,8 +260,24 @@ def write_structural_prior_CV( lambda s: clean_mol(s, raise_error=False) is None ) ] + gen = gen.drop(invalid_idx) + n_candidates = len(candidates) + n_invalid = len(invalid_idx) + + # log if invalid SMILES were detected and removed + if n_invalid > 0: + # gen no longer holds the dropped rows, so read them from candidates + examples = candidates.loc[invalid_idx, "smiles"].head(5).tolist() + + logger.warning( + f"Removed {n_invalid} invalid SMILES among " + f"{n_candidates} candidates to match a test molecule " + f"(possibly due to a different rdkit version). " + f"Examples: {examples}" + ) + inputs = {"model": gen.assign(source="model")} if pubchem_file: