1+ # This script depends on having already run ring-data.csv
2+ # It will inspect the output of that script and generate a control dataset
3+ # This dataset will have the same number of molecules, but all molecules will be free
4+ # from rings entirely. Each molecule in this dataset will have a number of
5+ # bonds and double bonds corresponding to the number of bonds/double bonds in the ring data.
6+
7+ import random
8+ import networkx as nx
9+ import pandas as pd
10+ from rdkit import Chem
11+ from rdkit .Chem import RWMol
12+ from pathlib import Path
13+
14+ def random_tree_prufer (n , max_degree = 4 ):
15+ """Generate a random tree with n nodes using Prüfer sequence. Reject if any node exceeds max_degree."""
16+ while True :
17+ seq = [random .randint (0 , n - 1 ) for _ in range (n - 2 )]
18+ deg = [1 ] * n
19+ for v in seq :
20+ deg [v ] += 1
21+ if any (d > max_degree for d in deg ):
22+ # print("rejecting sequence")
23+ continue
24+
25+ G = nx .from_prufer_sequence (seq )
26+
27+ return G
28+
29+
30+ def assign_double_bonds (G , D , max_valence = 4 ):
31+ """Randomly assign D double bonds to edges of G while obeying valence constraint."""
32+ edges = list (G .edges ())
33+ if D > len (edges ):
34+ return None # impossible
35+ elif D == 0 :
36+ return []
37+ for _ in range (10 * D ):
38+ candidate_edges = set (random .sample (edges , D ))
39+ # Count valence
40+ bond_counts = {node : 0 for node in G .nodes ()}
41+ for u , v in G .edges ():
42+ bond_order = 2 if (u , v ) in candidate_edges or (v , u ) in candidate_edges else 1
43+ bond_counts [u ] += bond_order
44+ bond_counts [v ] += bond_order
45+ if all (val <= max_valence for val in bond_counts .values ()):
46+ # print("Double bond success")
47+ return candidate_edges
48+ else :
49+ # print('rejecting double bond configuration')
50+ continue
51+
52+
53+ def graph_to_smiles_with_double (G , double_edges ):
54+ mol = RWMol ()
55+ for _ in G .nodes ():
56+ mol .AddAtom (Chem .Atom ("C" ))
57+ for u , v in G .edges ():
58+ btype = Chem .BondType .DOUBLE if (u , v ) in double_edges or (v , u ) in double_edges else Chem .BondType .SINGLE
59+ mol .AddBond (int (u ), int (v ), btype )
60+ m = mol .GetMol ()
61+ Chem .SanitizeMol (m )
62+ return Chem .MolToSmiles (m )
63+
64+
65+ def sample_acyclic_hydrocarbons (B , D ):
66+ """Sample n_samples of acyclic hydrocarbons with B total bonds, D of which are double bonds."""
67+ assert B >= 1 , "At least one bond required"
68+ assert D <= B , "Double bonds must be ≤ total bonds"
69+ n_carbons = B + 1 # from tree: nodes = edges + 1
70+
71+ smiles_set = set ()
72+ while len (smiles_set ) < 1 :
73+ G = random_tree_prufer (n_carbons , max_degree = 4 )
74+ double_bonds = assign_double_bonds (G , D , max_valence = 4 )
75+ if double_bonds is None :
76+ continue
77+ try :
78+ smi = graph_to_smiles_with_double (G , double_bonds )
79+ smiles_set .add (smi )
80+ except Exception :
81+ continue # skip invalid molecules
82+ return list (smiles_set )[0 ]
83+
84+ def get_pah_metadata ():
85+ pah_data = pd .read_csv ("../data/polycyclic_hydrocarbons/selected_compas_3x.csv" )
86+ # Take the parameters we need for the control
87+ return pah_data [["bonds" , "double_bonds" , "atoms" ]]
88+
89+ def sample_acyclic_smiles (bond_metadata ):
90+ smiles = bond_metadata .apply (lambda row :
91+ sample_acyclic_hydrocarbons (row ["bonds" ],
92+ row ["double_bonds" ]),
93+ axis = 1 )
94+ return smiles
95+
96+ def write_mols_to_file (ach_df ):
97+
98+ Path ("../data/acyclic_hydrocarbons/" ).mkdir (parents = True , exist_ok = True )
99+
100+ ach_df ["mol" ] = ach_df ["smiles" ].apply (Chem .MolFromSmiles )
101+ ach_df ["filename" ] = ach_df .index .to_series ().map (lambda x : f"../data/acyclic_hydrocarbons/ach_{ x } .mol" )
102+ ach_df .apply (lambda row : Chem .MolToMolFile (row ["mol" ], row ["filename" ]), axis = 1 )
103+ metadata_file = "../data/acyclic_hydrocarbons/sampled_ach.csv"
104+ ach_df [["filename" , "double_bonds" , "bonds" , "atoms" ]].to_csv (metadata_file )
105+
106+ if __name__ == "__main__" :
107+ # Set random seed for sampling
108+ random .seed (137 )
109+ # Get controls from ring data
110+ ach_data = get_pah_metadata ()
111+ # Sample acyclic smiles
112+ ach_data ["smiles" ] = sample_acyclic_smiles (ach_data )
113+ # Write data and meta data to file
114+ write_mols_to_file (ach_data )
115+
0 commit comments