Skip to content

Commit 1a8d815

Browse files
committed
Sampling random branched alkenes with bond counts derived from the ring data
1 parent 4136e5e commit 1a8d815

File tree

1 file changed

+115
-0
lines changed

1 file changed

+115
-0
lines changed

scripts/alkene-sampling.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# This script depends on having already run ring-data.csv
2+
# It will inspect the output of that script and generate a control dataset
3+
# This dataset will have the same number of molecules, but all molecules will be free
4+
# from rings entirely. Each molecule in this dataset will have a number of
5+
# bonds and double bonds corresponding to the number of bonds/double bonds in the ring data.
6+
7+
import random
8+
import networkx as nx
9+
import pandas as pd
10+
from rdkit import Chem
11+
from rdkit.Chem import RWMol
12+
from pathlib import Path
13+
14+
def random_tree_prufer(n, max_degree=4):
15+
"""Generate a random tree with n nodes using Prüfer sequence. Reject if any node exceeds max_degree."""
16+
while True:
17+
seq = [random.randint(0, n - 1) for _ in range(n - 2)]
18+
deg = [1] * n
19+
for v in seq:
20+
deg[v] += 1
21+
if any(d > max_degree for d in deg):
22+
# print("rejecting sequence")
23+
continue
24+
25+
G = nx.from_prufer_sequence(seq)
26+
27+
return G
28+
29+
30+
def assign_double_bonds(G, D, max_valence=4):
31+
"""Randomly assign D double bonds to edges of G while obeying valence constraint."""
32+
edges = list(G.edges())
33+
if D > len(edges):
34+
return None # impossible
35+
elif D == 0:
36+
return []
37+
for _ in range(10*D):
38+
candidate_edges = set(random.sample(edges, D))
39+
# Count valence
40+
bond_counts = {node: 0 for node in G.nodes()}
41+
for u, v in G.edges():
42+
bond_order = 2 if (u, v) in candidate_edges or (v, u) in candidate_edges else 1
43+
bond_counts[u] += bond_order
44+
bond_counts[v] += bond_order
45+
if all(val <= max_valence for val in bond_counts.values()):
46+
# print("Double bond success")
47+
return candidate_edges
48+
else:
49+
# print('rejecting double bond configuration')
50+
continue
51+
52+
53+
def graph_to_smiles_with_double(G, double_edges):
54+
mol = RWMol()
55+
for _ in G.nodes():
56+
mol.AddAtom(Chem.Atom("C"))
57+
for u, v in G.edges():
58+
btype = Chem.BondType.DOUBLE if (u, v) in double_edges or (v, u) in double_edges else Chem.BondType.SINGLE
59+
mol.AddBond(int(u), int(v), btype)
60+
m = mol.GetMol()
61+
Chem.SanitizeMol(m)
62+
return Chem.MolToSmiles(m)
63+
64+
65+
def sample_acyclic_hydrocarbons(B, D):
66+
"""Sample n_samples of acyclic hydrocarbons with B total bonds, D of which are double bonds."""
67+
assert B >= 1, "At least one bond required"
68+
assert D <= B, "Double bonds must be ≤ total bonds"
69+
n_carbons = B + 1 # from tree: nodes = edges + 1
70+
71+
smiles_set = set()
72+
while len(smiles_set) < 1:
73+
G = random_tree_prufer(n_carbons, max_degree=4)
74+
double_bonds = assign_double_bonds(G, D, max_valence=4)
75+
if double_bonds is None:
76+
continue
77+
try:
78+
smi = graph_to_smiles_with_double(G, double_bonds)
79+
smiles_set.add(smi)
80+
except Exception:
81+
continue # skip invalid molecules
82+
return list(smiles_set)[0]
83+
84+
def get_pah_metadata():
85+
pah_data = pd.read_csv("../data/polycyclic_hydrocarbons/selected_compas_3x.csv")
86+
# Take the parameters we need for the control
87+
return pah_data[["bonds", "double_bonds", "atoms"]]
88+
89+
def sample_acyclic_smiles(bond_metadata):
90+
smiles = bond_metadata.apply(lambda row:
91+
sample_acyclic_hydrocarbons(row["bonds"],
92+
row["double_bonds"]),
93+
axis=1)
94+
return smiles
95+
96+
def write_mols_to_file(ach_df):
97+
98+
Path("../data/acyclic_hydrocarbons/").mkdir(parents=True, exist_ok=True)
99+
100+
ach_df["mol"] = ach_df["smiles"].apply(Chem.MolFromSmiles)
101+
ach_df["filename"] = ach_df.index.to_series().map(lambda x: f"../data/acyclic_hydrocarbons/ach_{x}.mol")
102+
ach_df.apply(lambda row: Chem.MolToMolFile(row["mol"], row["filename"]), axis=1)
103+
metadata_file = "../data/acyclic_hydrocarbons/sampled_ach.csv"
104+
ach_df[["filename", "double_bonds", "bonds", "atoms"]].to_csv(metadata_file)
105+
106+
if __name__ == "__main__":
107+
# Set random seed for sampling
108+
random.seed(137)
109+
# Get controls from ring data
110+
ach_data = get_pah_metadata()
111+
# Sample acyclic smiles
112+
ach_data["smiles"] = sample_acyclic_smiles(ach_data)
113+
# Write data and meta data to file
114+
write_mols_to_file(ach_data)
115+

0 commit comments

Comments
 (0)