diff --git a/chebai/callbacks/epoch_metrics.py b/chebai/callbacks/epoch_metrics.py index c1cf7bd3..76d6a8fd 100644 --- a/chebai/callbacks/epoch_metrics.py +++ b/chebai/callbacks/epoch_metrics.py @@ -62,7 +62,8 @@ def update(self, preds: torch.Tensor, labels: torch.Tensor) -> None: labels (torch.Tensor): Ground truth labels. """ tps = torch.sum( - torch.logical_and(preds > self.threshold, labels.to(torch.bool)), dim=0 + torch.logical_and(preds > self.threshold, labels.to(torch.bool)), + dim=0, ) self.true_positives += tps self.positive_predictions += torch.sum(preds > self.threshold, dim=0) diff --git a/chebai/models/base.py b/chebai/models/base.py index e657963f..7653f13c 100644 --- a/chebai/models/base.py +++ b/chebai/models/base.py @@ -4,6 +4,7 @@ import torch from lightning.pytorch.core.module import LightningModule +from lightning.pytorch.utilities.rank_zero import rank_zero_info from chebai.preprocessing.structures import XYData @@ -106,7 +107,8 @@ def _get_prediction_and_labels( Returns: Tuple[torch.Tensor, torch.Tensor]: Predictions and labels. """ - return output, labels + # cast labels to int + return output, labels.to(torch.int) if labels is not None else labels def _process_labels_in_batch(self, batch: XYData) -> torch.Tensor: """ @@ -158,6 +160,13 @@ def _process_for_loss( """ return model_output, labels, loss_kwargs + def on_train_epoch_start(self) -> None: + # pass current epoch to datamodule if it has the attribute curr_epoch (for PubChemBatched dataset) + rank_zero_info(f"Starting epoch {self.current_epoch}") + if hasattr(self.trainer.datamodule, "curr_epoch"): + rank_zero_info(f"Setting datamodule.curr_epoch to {self.current_epoch}") + self.trainer.datamodule.curr_epoch = self.current_epoch + def training_step( self, batch: XYData, batch_idx: int ) -> Dict[str, Union[torch.Tensor, Any]]: @@ -310,6 +319,8 @@ def _execute( for metric_name, metric in metrics.items(): metric.update(pr, tar) self._log_metrics(prefix, metrics, len(batch)) + if isinstance(d, dict) and "loss" not in d: + print(f"d has keys {d.keys()}, log={log}, criterion={self.criterion}") return d def _log_metrics(self, prefix: str, metrics: torch.nn.Module, batch_size: int): diff --git a/chebai/models/classic_ml.py b/chebai/models/classic_ml.py new file mode 100644 index 00000000..c63d94a6 --- /dev/null +++ b/chebai/models/classic_ml.py @@ -0,0 +1,97 @@ +import os +import pickle as pkl +from typing import Any, Dict, List, Optional + +import numpy as np +import torch +import tqdm +from sklearn.exceptions import NotFittedError +from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression + +from chebai.models.base import ChebaiBaseNet + +LR_MODEL_PATH = os.path.join("models", "LR") + + +class LogisticRegression(ChebaiBaseNet): + """ + Logistic Regression model using scikit-learn, wrapped to fit the ChebaiBaseNet interface. 
+ """ + + def __init__( + self, + out_dim: int, + input_dim: int, + only_predict_classes: Optional[List] = None, + n_classes=1528, + **kwargs, + ): + super().__init__(out_dim=out_dim, input_dim=input_dim, **kwargs) + self.models = [ + SklearnLogisticRegression(solver="liblinear") for _ in range(n_classes) + ] + # indices of classes (in the dataset used for training) where a model should be trained + self.only_predict_classes = only_predict_classes + + def forward(self, x: Dict[str, Any], **kwargs) -> torch.Tensor: + print( + f"forward called with x[features].shape {x['features'].shape}, self.training {self.training}" + ) + if self.training: + self.fit_sklearn(x["features"], x["labels"]) + preds = [] + for model in self.models: + try: + p = torch.from_numpy(model.predict(x["features"])).float() + p = p.to(x["features"].device) + preds.append(p) + except NotFittedError: + preds.append( + torch.zeros((x["features"].shape[0]), device=(x["features"].device)) + ) + except AttributeError: + preds.append( + torch.zeros((x["features"].shape[0]), device=(x["features"].device)) + ) + preds = torch.stack(preds, dim=1) + print(f"preds shape {preds.shape}") + return preds.squeeze(-1) + + def fit_sklearn(self, X, y): + """ + Fit the underlying sklearn model. X and y should be numpy arrays. + """ + for i, model in tqdm.tqdm(enumerate(self.models), desc="Fitting models"): + import os + + if os.path.exists(os.path.join(LR_MODEL_PATH, f"LR_model_{i}.pkl")): + print(f"Loading model {i} from file") + self.models[i] = pkl.load( + open(os.path.join(LR_MODEL_PATH, f"LR_model_{i}.pkl"), "rb") + ) + else: + if ( + self.only_predict_classes and i not in self.only_predict_classes + ): # only try these classes + continue + try: + model.fit(X, y[:, i]) + except ValueError: + self.models[i] = PlaceholderModel() + # dump + pkl.dump( + model, open(os.path.join(LR_MODEL_PATH, f"LR_model_{i}.pkl"), "wb") + ) + + def configure_optimizers(self, **kwargs): + pass + + +class PlaceholderModel: + """Acts like a trained model, but isn't. 
Use this if training fails and you need a placeholder.""" + + def __init__(self, default_prediction=1): + self.default_prediction = default_prediction + + def predict(self, preds): + return np.ones(preds.shape[0]) * self.default_prediction diff --git a/chebai/models/electra.py b/chebai/models/electra.py index 589f0b02..c053db1c 100644 --- a/chebai/models/electra.py +++ b/chebai/models/electra.py @@ -224,6 +224,7 @@ def __init__( config: Optional[Dict[str, Any]] = None, pretrained_checkpoint: Optional[str] = None, load_prefix: Optional[str] = None, + freeze_electra: bool = False, **kwargs: Any, ): # Remove this property in order to prevent it from being stored as a @@ -262,6 +263,10 @@ def __init__( else: self.electra = ElectraModel(config=self.config) + if freeze_electra: + for param in self.electra.parameters(): + param.requires_grad = False + def _process_for_loss( self, model_output: Dict[str, Tensor], diff --git a/chebai/models/lstm.py b/chebai/models/lstm.py index 3a0949c4..96ecc944 100644 --- a/chebai/models/lstm.py +++ b/chebai/models/lstm.py @@ -1,7 +1,7 @@ import logging from torch import nn -from torch.nn.utils.rnn import pack_padded_sequence +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence from chebai.models.base import ChebaiBaseNet @@ -9,23 +9,47 @@ class ChemLSTM(ChebaiBaseNet): - def __init__(self, in_d, out_d, num_classes, **kwargs): - super().__init__(num_classes, **kwargs) - self.lstm = nn.LSTM(in_d, out_d, batch_first=True) - self.embedding = nn.Embedding(800, 100) + def __init__( + self, + out_d, + in_d, + num_classes, + criterion: nn.Module = None, + num_layers=6, + dropout=0.2, + **kwargs, + ): + super().__init__( + out_dim=out_d, + input_dim=in_d, + criterion=criterion, + num_classes=num_classes, + **kwargs, + ) + self.lstm = nn.LSTM( + in_d, + out_d, + batch_first=True, + dropout=dropout, + bidirectional=True, + num_layers=num_layers, + ) + self.embedding = nn.Embedding(1400, in_d) self.output = nn.Sequential( - nn.Linear(out_d, in_d), + nn.Linear(out_d * 2, out_d), nn.ReLU(), nn.Dropout(0.2), - nn.Linear(in_d, num_classes), + nn.Linear(out_d, num_classes), ) - def forward(self, data): - x = data.x - x_lens = data.lens + def forward(self, data, *args, **kwargs): + x = data["features"] + x_lens = data["model_kwargs"]["lens"] x = self.embedding(x) x = pack_padded_sequence(x, x_lens, batch_first=True, enforce_sorted=False) - x = self.lstm(x)[1][0] - # = pad_packed_sequence(x, batch_first=True)[0] + x = self.lstm(x)[0] + x = pad_packed_sequence(x, batch_first=True)[0][ + :, 0 + ] # reduce sequence dimension to first element x = self.output(x) - return x.squeeze(0) + return x diff --git a/chebai/preprocessing/bin/smiles_token/tokens.txt b/chebai/preprocessing/bin/smiles_token/tokens.txt index 9ce39f9d..960173cd 100644 --- a/chebai/preprocessing/bin/smiles_token/tokens.txt +++ b/chebai/preprocessing/bin/smiles_token/tokens.txt @@ -984,3 +984,3390 @@ p [ClH2+] [BrH2+] [IH2+] +[RuH+2] +[RuH2+2] +[p-] +[15NH] +[Fe+4] +[11CH3] +[P@H] +[Ru+8] +[15n] +[15nH] +[Er+3] +[14CH2] +[Si+3] +[B@@-] +[76Br] +[IH+] +[128Ba] +[BiH] +[14cH] +[14c] +[13NH2] +[Nb+5] +[IH] +[14CH] +[ReH] +[18FH] +[c+] +[RuH2] +[Ru+6] +[IrH4] +[Pt+] +[Mo+2] +[20OH] +[Tc+3] +b +[Dy+3] +[195Pt] +[p+] +[si] +[18OH] +[36Ar] +[68Ga+3] +[RuH3] +[66Ga] +[Al+2] +[18C] +[Nb+3] +[siH] +[75As+3] +[Mn+] +[ClH4+3] +[68Ga] +[Ru+5] +[Mo+] +[Tc+4] +[11CH2] +[211At] +[77Br] +[99Tc+3] +[oH+] +[Nb-2] +[InH] +[P-2] +[184Hf] +[B@-] +[PoH] +[124I] +[14C@H] +[Si@@H] +[35Cl] +[W+] +[37Cl] +[Bi+2] +[13CH4] 
+[18F-] +[15NH3+] +[Si@H] +[Nb+2] +[98Tc+5] +[Ta+2] +[Rh-] +[151Eu+3] +[RuH] +[63Ni+2] +[NiH] +[PdH2] +[52Mn] +[16OH-] +[Fe+6] +[64Cu] +[194Os] +[Ir+] +[13C-] +[121I] +[Tm+3] +[19BH2] +[Sn+3] +[AlH2+] +[186Re] +[XeH] +[Os+6] +[15N+] +[122I] +[99Tc+6] +[GaH] +[12CH3] +[12C@H] +[AlH2-] +[16OH] +[GeH2-] +[49Ti] +[SiH-2] +[14C@@H] +[11CH4] +[197Hg+] +[Rh+] +[Th+2] +[Yb+2] +[145Eu] +[Cu-] +[RuH+3] +[20CH2] +[SnH2+2] +[136Ba] +[188Re] +[b-] +[se+] +[212Pb+2] +[Ga-] +[WH2] +[232Th] +[225Ac] +[89Zr] +[214Bi] +[pH+] +[TlH] +[99Tc+4] +[10CH2] +[AlH6-3] +[12CH2] +[123IH] +[14C@@] +[6Li+] +[SnH+] +[SnH+3] +[Tb+4] +[99Tc+5] +[125IH] +[144Pm] +[IrH2] +[10BH-] +[10BH2] +[60Co+3] +[14C-] +[NiH2] +[140Ce] +[125I-] +[177Lu+3] +[169Lu] +[85Sr] +[OsH6] +[7Li+] +[18o] +[InH4-] +[OsH2] +[In-] +[11CH] +[ClH] +[13CH2-] +[35P] +[15NH4+] +[RhH+2] +[86Rb+] +[166Ho+3] +[RuH+] +[75Br] +[SiH2-2] +[I@-] +[227Th] +[90Y] +[11c] +[11cH] +[PtH+] +[FeH] +[si-] +[213Bi+3] +[Os+5] +[Te@] +[64Cu+2] +[SbH+] +[14nH] +[14n] +[99Tc+7] +[12C@@H] +[192Bi] +[PtH] +[TaH2] +[32Cl] +[153Sm] +[255Fm] +[133IH] +[12C@] +[AlH-] +[61Cu] +[52Ti] +[117Sn+4] +[83Rb+] +[18O-] +[238Pu] +[165Dy] +[AlH+2] +[16N+] +[141Cs] +[67Cu+2] +[239Am] +[B@H-] +[201Hg] +[231Th] +[126Te] +[17OH] +[66Zn+2] +[Ge-2] +[98Tc+7] +[15n+] +[203Hg+] +[124I-] +[Ge@@] +[207At] +[Tc+5] +[177Lu] +[111In+3] +[CoH2] +[PdH+] +[12c] +[10CH3] +[YH] +[TaH3] +[TaH5] +[12CH] +[Tc+2] +[244Am] +[68Ge] +[35SH] +[RhH] +[MoH2] +[34SH] +[111In] +[RuH4] +[17C] +[Se@] +[65Zn+2] +[15N-] +[PtH2] +[135I] +[123Xe] +[62Zn] +[122Sb] +[si+] +[137La] +[ZrH2] +[53Mn] +[111In-] +[125Cs] +[Tc+6] +[106Pd] +[194Ir] +[159Gd] +[FeH4] +[141Sm] +[111InH3] +[Tc+] +[si+2] +[64Zn+2] +[te+] +[HgH] +[Pd-] +[Zr-2] +[10B-] +[10BH] +[8BH2] +[85Sr+2] +[IrH+2] +[PbH2+2] +[Re-2] +[12B] +[Zr+] +[10BH3] +[11BH3] +[91Y] +[218AtH] +[Ge@] +[CuH+] +[86Y] +[170Yb] +[63Cu+2] +[164Dy] +[173Ta] +[16C] +[ClH+2] +[153Gd+3] +[OsH] +[11C-] +[231Pa] +[TiH] +[229Th] +[72Zn] +[ZrH] +[67Cu] +[14O] +[156Eu] +[155Sm] +[138Ce] +[B@@H-] +[MnH2] +[16NH2] +[51Mn] +[42K] +[MoH5] +[128Sn] +[ClH2+2] +[17F] +[77BrH] +[16n+] +[ZnH+] +[153Sm+3] +[100Tc+4] +[94Ru] +[98Tc] +[IrH3] +[132La] +[242Am] +[14NH] +[162Er] +[208Bi] +[127Xe] +[11CH3-] +[Os+7] +[137Cs+] +[201Tl+] +[13CH+] +[ClH+3] +[129Cs] +[105Rh+3] +[127Sb+3] +[131Cs] +[168Yb] +[17NH] +[9C-] +[33SH2] +[13NH] +[Ge@@H] +[105Ru] +[PdH] +[82Br] +[12cH] +[41Ca] +[184Ir] +[82Rb+] +[14NH2] +[94Zr+4] +[74Se] +[80Br] +[123Te] +[70Zn] +[Tc+7] +[160Dy] +[P@@H] +[148Pm] +[64Zn] +[136Eu+3] +[SnH2+] +[232U] +[234U] +[246Cm] +[24Mg] +[Se@@] +[142Sm] +[68GaH] +[40K+] +[173Yb] +[45Ca+2] +[126IH] +[55Fe+3] +[Ta-2] +[151Nd] +[91Sr] +[Bi-2] +[130Te] +[GaH4-] +[BrH] +[SbH-] +[13CH3+] +[RhH2] +[38Cl-] +[75Ge] +[239Pu] +[ReH7] +[99Tc+2] +[RhH3] +[26Mg] +[Os+8] +[CuH2] +[122Xe] +[Pr+] +[74As] +[239Th] +[SeH2+] +[17OH2] +[136Cs+] +[13CH3-] +[IrH] +[11B-] +[Te@@] +[195Pt+2] +[134Cs+] +[TiH2] +[90Nb] +[146Eu] +[45Ca] +[15NH3] +[SnH-] +[176W] +[110Ru] +[237Pu] +[RuH6] +[217Bi] +[11C@@H] +[150Sm] +[179Lu] +[65Cu+] +[180W] +[132Te] +[90Sr+2] +[14c-] +[213BiH] +[145Pm] +[131SbH3] +[60Co] +[66Ga+3] +[225Ra] +[165Er] +[147Sm] +[129Sb] +[179Hf] +[129Cs+] +[AuH3] +[92Nb] +[GeH6-2] +[233Ra] +[FeH2] +[149Pm] +[ZnH2] +[99Ru] +[AgH] +[1HH] +[200Hg] +[16CH2] +[131I-] +[248Cf] +[CuH] +[232Pa] +[135I-] +[Ge@H] +[AuH] +[67Ga] +[193Pt+4] +[125Te+4] +[7Be] +[10c] +[WH] +[22CH3-] +[105Rh] +[OsH-] +[TaH] +[237Np] +[47V] +[191Pt+2] +[127Cs] +[13O] +[15NH+] +[135Ba] +[67GaH3] +[15OH] +[151Sm] +[18CH2] +[145Nd] +[97Zr] +[249Cf] +[100Tc+] 
+[I@@-] +[57Fe+2] +[102Pd] +[52Fe+3] +[181Ta+2] +[123I-] +[127I-] +[202Bi] +[106Ru] +[174Yb] +[81Rb+] +[150Pm] +[22C] +[143La] +[66Ni] +[126Sb] +[68GaH3] +[13c-] +[35S-] +[12C-] +[62Cu] +[183Hf] +[VH2] +[182Ta] +[15n-] +[230U] +[253Fm] +[90Y+3] +[237Am] +[173Lu] +[71Ge] +[204TlH] +[SbH2+] +[172Er] +[144Ce] +[107Ag] +[34s] +[CeH3] +[131I+2] +[59Fe+2] +[Mn-2] +[96Tc] +[68Cu] +[25Mg+2] +[105Ag] +[76Se] +[245Bk] +[111InH2] +[93Mo] +[154Gd] +[127Sn] +[Cl@-] +[76As] +[101Mo] +[152Gd] +[193Pt+2] +[12CH4] +[99Y+3] +[173Tm] +[9CH] +[113In+3] +[237U] +[88Sr+2] +[176Yb] +[75BrH] +[BiH4] +[15NH2+] +[242Cm] +[12BH2] +[59Fe] +[14NH3] +[79Kr] +[siH-] +[TcH4] +[69Zn] +[177Hf] +[89Zr+4] +[CrH2] +[125Sb] +[41Ar] +[70Ga] +[69Ga] +[78As] +[143Nd] +[51Cr+3] +[73AsH3] +[167Tm] +[13NH3] +[126SbH3] +[74AsH3] +[WH4] +[9c] +[100Mo] +[199PbH2] +[115Sb] +[176Lu] +[99Ru+2] +[100Pd] +[240Np] +[198Au] +[233Np] +[130I-] +[NbH3] +[95Y] +[16n] +[196Bi] +[181Os] +[CoH+] +[MnH+] +[10Be] +[44Ca+2] +[183Ta] +[155Gd] +[140Ba+2] +[77AsH3] +[235U] +[86Zr] +[131Te] +[17O-] +[17FH] +[250Bk] +[125Xe] +[AsH+] +[187Re] +[79BrH] +[192Ir] +[169Er+3] +[147Tb] +[AlH2-2] +[186Os] +[11CH3+] +[15nH+] +[152Sm+3] +[40PH] +[101Pd] +[47Ti] +[CoH+2] +[53Cr+6] +[227Ac] +[182Re] +[40Ar] +[191Pt+4] +[241Am] +[227Th+4] +[YH2] +[CoH3] +[149Gd] +[137Ba+2] +[39K+] +[Zr-3] +[161Er] +[Os-3] +[181Ta] +[49Ca] +[169Yb] +[45K] +[184W] +[196Au] +[179Ta] +[72Se] +[80Se] +[14CH4] +[210Tl] +[37SH2] +[FeH3] +[62Zn+2] +[15NH-] +[Re-] +[194Au] +[87Sr+2] +[131Ba] +[104Cd] +[131IH] +[124Xe] +[BiH2+2] +[88Nb] +[175Yb+3] +[240U] +[193Pt] +[62Cu+2] +[32P+] +[32PH] +[8B] +[132Cs+] +[LaH3] +[236Np] +[siH+] +[Zr-] +[18OH-] +[134Cs] +[ClH3+2] +[42K+] +[42Ca] +[94Tc+7] +[192Os] +[22Na+] +[38K] +[109Ag] +[136Eu] +[22Na] +[121Sn+2] +[173Hf] +[120I] +[149Tb] +[203Hg+2] +[139Pr] +[73Se] +[240Cm] +[162Dy] +[39Ar] +[89Nb] +[Cd-] +[115Cd] +[253Cf] +[235Pu] +[144Cs] +[18OH3+] +[186Ta] +[115Ag] +[169Yb+3] +[77Kr] +[TiH+] +[138Nd] +[18n] +[34SH2] +[39S] +[92Y] +[135Ce] +[236Pu] +[92Zr] +[50Ti] +[65Ga] +[189Os] +[184Os] +[15CH4] +[131Cs+] +[151Tb] +[38Ar] +[99Mo] +[161Gd] +[CrH+2] +[CoH] +[203PbH] +[81Rb] +[163Dy] +[166Tm] +[bH-] +[31SH] +[86Sr] +[189Ir] +[171Tm] +[194Pb] +[204Hg+] +[231U] +[ZnH] +[59Ni] +[19FH] +[13C+] +[118Sb] +[28Mg+2] +[22c] +[241Cm] +[144Ce+4] +[44Sc] +[38Cl] +[187Ir] +[148Eu] +[57Co+2] +[201TlH3] +[153Pm] +[203PbH2] +[36Cl] +[69Ga+3] +[Co-] +[81Br] +[95Tc+4] +[22CH2] +[170Tm] +[234Np] +[110Sn] +[SH2] +[36ClH] +[TiH4] +[218Pb] +[141Cs+] +[223Ac] +[104Tc] +[239Np] +[198Au+3] +[130SbH3] +[198Bi] +[134Xe] +[109Pd] +[153Gd] +[203Bi] +[253Es] +[XeH2] +[244Cm] +[79Rb+] +[141Pr+3] +[15NH2-] +[86Tc] +[103Pd+2] +[17c] +[82Br-] +[20CH] +[112Pd] +[165Tm] +[89Y+3] +[174Lu] +[23Na+] +[164Ho] +[201Au] +[115In] +[99Tc+] +[19B] +[238Am] +[127Te] +[133I-] +[130Xe] +[83Sr+2] +[184Ta] +[240Am] +[15C] +[197Hg+2] +[186Lu] +[155Eu] +[178Yb] +[35Cl-] +[166Ho] +[70AsH3] +[58Co+2] +[14CH2-] +[137Pr] +[135IH] +[99Y] +[85Rb+] +[13OH] +[90Tc] +[Sn@] +[113In] +[95Ru] +[ReH4] +[15C@@H] +[15CH2] +[109Pd+2] +[47Ca+2] +[17C-] +[17CH] +[58Co] +[38PH3] +[134Ce] +[71Zn] +[110Pd] +[148Nd] +[14N+] +[CrH3] +[58Fe+2] +[235U+2] +[167Er] +[178Ta] +[101Tc] +[130Cs] +[122I-] +[CuH2-] +[158Gd] +[238Th] +[238Np] +[160Tb] +[168Er] +[83BrH] +[246Am] +[199Pb] +[79SeH2] +[157Dy] +[9C] +[FeH6] +[76Kr] +[243Am] +[34S-] +[88Rb+] +[WH3] +[MoH] +[13CH-] +[40PH3] +[218Rn] +[59Co+3] +[172Tm] +[209Bi] +[199Tl+] +[66Ge] +[95Zr] +[71As] +[46Ti] +[232Np] +[48Sc] +[90Zr] +[123I+2] +[159Ho] +[40Ca] +[44K+] +[ZrH2+2] +[19C] 
+[195Tl] +[126Ba] +[159Gd+3] +[167Yb] +[12C@@] +[13OH2] +[195Ir] +[109Cd] +[109Cd+2] +[87Y] +[35s] +[148Tb] +[81BrH] +[ZrH3] +[162Tm] +[206Bi] +[72AsH3] +[146Nd] +[239U] +[246Bk] +[87Rb+] +[177W] +[176Hf] +[GaH-] +[156Ho] +[101Rh] +[212Bi] +[257Md] +[190Os] +[OsH4] +[46Ca] +[250Es] +[70As] +[57Co] +[55Fe+2] +[122SbH3] +[156Sm] +[ThH4] +[94Mo] +[181Re] +[105Pd] +[13N+] +[139Ba] +[30PH3] +[120I-] +[155Dy] +[84BrH] +[116In] +[PtH4] +[60Ni+2] +[186W] +[107Cd] +[46Sc] +[11C@H] +[95Tc] +[67Zn+2] +[13B] +[112Sn] +[128I] +[193Au] +[103Ru+2] +[136Ce] +[195Pb] +[89Sr+2] +[210PoH2] +[70Se] +[138Xe] +[35SH2] +[UH2] +[BH+] +[61Co] +[VH] +[178W] +[124IH] +[185Ir] +[99Rh] +[18O-2] +[209PbH2] +[120IH] +[91Zr] +[Hf+] +[15C-] +[OsH3] +[119SbH3] +[148Sm] +[149Sm] +[118Pd+2] +[BH4+] +[NiH+] +[29Al] +[58Co+3] +[142Pr] +[212PbH2] +[144Ce+3] +[47Sc] +[200Pb] +[224Rn] +[133Ba] +[53Cr] +[7Be+2] +[26AlH3] +[188Pt] +[12NH3] +[77As] +[182Hf] +[33PH] +[193Os] +[248Cm] +[113Sn] +[121SnH2] +[110Cd] +[43K+] +[NbH2] +[116Te] +[168Tm] +[165Dy+3] +[154Sm] +[162Yb] +[89Rb+] +[47Ca] +[18CH3] +[135Cs+] +[223Fr] +[61Ni] +[24Na+] +[174Hf+4] +[167Ho] +[84Rb+] +[50Cr] +[153Eu] +[38PH] +[194Bi] +[ReH3] +[60Co+2] +[110In] +[77Ge] +[177Re] +[211Bi] +[94Nb] +[222Ra] +[159Dy] +[136Cs] +[ReH6] +[170Lu] +[129I+2] +[61Cu+] +[134Te] +[HgH2] +[93Y] +[BiH2+] +[MnH] +[CeH] +[18o+] +[39ClH] +[EuH3] +[148Gd] +[133Xe] +[142Nd] +[36SH] +[Cl@@-] +[209BiH3] +[210BiH3] +[200Bi] +[SiH4-] +[11CH-] +[52V] +[58Ni] +[185W] +[249Bk] +[72BrH] +[185Ta] +[251Es] +[158Eu] +[243Pu] +[205Pb] +[84Sr] +[37Ar] +[82BrH] +[79Rb] +[208TlH] +[207Bi] +[172Lu] +[15OH2] +[157Tb] +[244Cf] +[15CH] +[95Nb] +[83Kr] +[110Ag+] +[77Br-] +[199TlH] +[17OH-] +[86Y+3] +[90Mo] +[65Cu+2] +[202Hg] +[171Lu] +[13NH2-] +[178Lu] +[212Ra] +[10CH4] +[9CH4] +[171Er] +[125Sn] +[P@@H+] +[142Ce] +[254Fm] +[67Ge] +[87Y+3] +[108Pd] +[104Rh] +[201Bi] +[18CH] +[64Ni] +[181Hf] +[156Dy] +[35S-2] +[151Pm] +[182Ir] +[71Se] +[88Kr] +[56Ni] +[60Fe] +[161Ho] +[NiH2+2] +[84Kr] +[234Pu] +[179W] +[217At] +[54Fe] +[37Cl-] +[MoH4] +[71Ga] +[238U] +[127Cs+] +[76BrH] +[157Ho] +[100Tc] +[234Pa] +[218PoH2] +[17O+] +[HgH+] +[230Th] +[77se] +[35ClH] +[18O+] +[Os-] +[34Cl-] +[228Ac] +[195Pt+4] +[132I-] +[189Re] +[142Ba+2] +[Ta+] +[45Ti] +[254Es] +[203TlH] +[122IH] +[142Pm] +[136Nd] +[80Kr] +[102Ag] +[32ClH] +[13cH-] +[124Sb] +[27Mg] +[113Ag] +[228Pa] +[144Nd] +[44Ca] +[P@H+] +[54Cr] +[246Cf] +[155Tb] +[124Sn] +[201TlH] +[155Ho] +[TiH+3] +[20Ne] +[201Pb] +[166Dy] +[138Cs] +[162Ho] +[211Rn] +[204Tl] +[186Pt] +[228Th] +[170Tm+3] +[100Rh] +[193Ir] +[213Bi] +[157Lu] +[142Ba] +[36SH2] +[15O+] +[129IH] +[230Pu] +[19OH2] +[154Eu+3] +[157Sm] +[195Hg] +[175Yb] +[121Xe] +[112Ag] +[15O-2] +[ClH3+3] +[37ClH] +[252Cf] +[158Dy] +[40K] +[78BrH] +[111Cd+2] +[103Pd] +[88Rb] +[132Xe] +[190Ir] +[22Ne] +[31P-3] +[57Co+3] +[72As] +[122Te] +[90Zr+4] +[57Mn] +[175Hf] +[198Pb] +[96Mo] +[152Dy] +[203Pb] +[34ClH] +[102Rh] +[194Hg] +[233U+4] +[187W] +[54Mn] +[117Sb] +[139Nd] +[117Cd] +[126Sb+3] +[54Fe+3] +[235Np] +[15CH3] +[16CH3] +[SeH5] +[128Te] +[194Tl] +[204Pb] +[200Tl] +[106Rh] +[87Sr] +[125I+2] +[56Co] +[172Hf] +[18C@@H] +[78AsH3] +[49V] +[112In] +[102Ru] +[178Hf] +[167Dy] +[104Pd] +[220Fr] +[14CH-] +[31PH3] +[210PbH2] +[147Eu] +[43Sc] +[31PH] +[191Ir] +[191Os] +[YbH2] +[164Er] +[9Li] +[22nH] +[68Zn] +[132Cs] +[81Se] +[69As] +[86Kr] +[245Am] +[131Sb] +[51Ti] +[58Fe+3] +[166Yb] +[208PbH2] +[InH-] +[157Gd+3] +[144Pr] +[218At] +[164Dy+3] +[117In] +[202Pb] +[94Zr] +[149Eu] +[238Cm] +[139Ce] +[AlH5-2] +[245Pu] +[75Br-] +[82Sr+2] +[94Tc] 
+[141Pm] +[28Mg] +[133Ba+2] +[114Sn] +[PtH2+2] +[172Yb] +[245Cm] +[103Ag] +[142La] +[169Er] +[32PH3] +[233U] +[74BrH] +[203Pb+2] +[133Te] +[52Cr] +[Zr-4] +[18C-] +[63Ni] +[135La] +[97Tc] +[208Tl] +[89Zr+3] +[16O+] +[97Ru] +[44K] +[48Cr] +[151Gd] +[130Cs+] +[141La] +[205Bi+3] +[103Ru] +[108Cd] +[131La] +[141Ce+3] +[38K+] +[94Y] +[66Cu] +[16OH2] +[14CH3-] +[204Hg] +[224Ac] +[205Bi] +[113I] +[36Cl-] +[170Hf] +[82Rb] +[31S] +[83Rb] +[65Ni] +[74Br-] +[139Cs] +[70Ge] +[106Cd] +[160Gd] +[75SeH] +[199Au] +[84Rb] +[107Rh] +[210Bi] +[121Te] +[188Ir] +[ThH2] +[GeH5-] +[116SbH3] +[21NH3] +[88Y] +[138Pr] +[117SnH2] +[156Gd] +[141Ce] +[19Ne] +[191Pt] +[55Fe] +[118Pd] +[14OH2] +[202PbH2] +[80Sr] +[82Se-2] +[240Pu] +[104Ag] +[114In+3] +[210At] +[196Pb] +[197Pb] +[209Pb] +[210Pb] +[211Pb] +[212Pb] +[213Pb] +[214Pb] +[147Pm] +[126I-] +[141Pr] +[203Tl+] +[SmH3] +[76AsH3] +[24Na] +[107Pd] +[121I-] +[258Md] +[103Rh] +[226Th] +[236U] +[174Ta] +[228Rn] +[138Ba] +[154Tb] +[136Pr] +[80BrH] +[146Ce] +[182W] +[188Os] +[131Xe] +[132Ba] +[252Fm] +[83Se] +[140Ba] +[51Fe] +[246Pu] +[106Ag] +[38SH2] +[48Ca] +[58Fe] +[16NH3] +[63Zn] +[111Sn] +[62Ga] +[44Ti] +[76Br-] +[181W] +[KrH] +[141Nd] +[60Cu] +[9cH] +[56Mn] +[209Tl] +[137Ba] +[248Am] +[216Bi] +[Ti-] +[128Sb] +[146Gd] +[82Kr] +[53Ni] +[108Ag] +[145Gd] +[229Rn] +[85Kr] +[211PbH2] +[180Os] +[166Er] +[81Br-] +[SeH4] +[242Pu] +[154Eu] +[ScH3] +[41Ca+2] +[129I-] +[72Br-] +[75As+5] +[43K] +[116Sb] +[120Te] +[150Nd] +[130Sb] +[195Au] +[175Tm] +[As@] +[ClH2+3] +[73Ga] +[254Cf] +[69Ge] +[247Cm] +[83Sr] +[RuH5] +[98Nb] +[147Nd] +[150Eu] +[MoH3] +[119In] +[144Pr+3] +[97Mo] +[129Te] +[188W] +[206Tl] +[149Nd] +[200Pt] +[82Se+6] +[97Nb] +[149Pr] +[198Hg] +[49Cr] +[135Xe] +[52Fe] +[177Yb] +[48V] +[62Ni] +[21Ne] +[185Os] +[178Re] +[62Co] +[120Sb] +[EuH2] +[182Os] +[127Sb] +[221Fr] +[244Pu] +[68Ge+4] +[197Tl] +[172Ta] +[80Br-] +[BiH+] +[170Er] +[123Sn] +[161Dy] +[202Tl] +[89Sr] +[147Gd] +[150Tb] +[43Ca+2] +[BiH3+2] +[96Zr] +[98Tc+4] +[110Te] +[89Kr] +[145Pr] +[49Sc] +[17NH4+] +[180Hf] +[44Sc+3] +[73As] +[140La] +[137Ce] +[119Sb] +[247Bk] +[76Ge] +[121Sn] +[220Ra] +[156Tb] +[208Tl+] +[153Tb] +[16O-] +[130IH] +[20CH3] +[187Os] +[14NH4+] +[50Cr+3] +[81Sr] +[222Fr] +[55Co] +[41K] +[72Ga] +[78Se] +[137Xe] +[103Cd] +[93Zr] +[126Xe] +[80Rb] +[176Ta] +[199Pt] +[205PbH2] +[197Pt] +[200Au] +[120Xe] +[136Xe] +[20C] +[100Tc+5] +[157Gd] +[17B] +[198Tl] +[SnH2-] +[127IH] +[65Cu] +[186Ir] +[193Hg] +[132IH] +[147Pr] +[145Sm] +[122Sn] +[161Tb] +[110Ag] +[250Cf] +[33PH3] +[241Pu] +[32SH2] +[185Re] +[78Ge] +[106Ru+3] +[146Sm] +[109In] +[17NH3] +[233Pa] +[134IH] +[92Sr+2] +[BH3+] +[64Ga] +[92Sr] +[82Se+4] +[62Cu+] +[226Ac] +[171Yb] +[34S-2] +[249Cm] +[56Fe] +[227Ra] +[143Ce] +[226Rn] +[64Cu+] +[152Tb] +[34S+] +[207Tl] +[111Ag] +[227Pa] +[157Eu] +[184Re] +[72Ge] +[SnH+2] +[117Sn+2] +[230Pa] +[78Kr] +[134Ba] +[199Hg] +[13CH2+] +[250Cm] +[183Re] +[121IH] +[251Cf] +[81Kr] +[125Cs+] +[208Pb] +[143Pm] +[114In] +[113Sn+4] +[82Sr] +[74Ge] +[UH3] +[52Mn+2] +[114Cd] +[33ClH] +[79Br-] +[22CH4] +[70Zn+2] +[144Sm] +[124Te] +[seH+] +[51Cr+6] +[152Sm] +[130Ba] +[Po@] +[174Hf] +[141Ba] +[128IH] +[27Al+3] +[234Th] +[88Zr] +[111IH] +[177Ta] +[191Os+4] +[152Eu] +[48Ti] +[87Kr] +[91Y+3] +[180Ta] +[128Xe] +[143Cs] +[86Rb] +[45K+] +[180Re] +[126Sn] +[146Pm] +[143Pr] +[116Cd] +[89Rb] +[230Ra] +[WH6] +[167Tm+3] +[96Nb] +[92Mo] +[57Ni] +[189Pt] +[134La] +[79Se] +[38ClH] +[125Sn+4] +[243Cm] +[257Fm] +[85Br] +[206Pb] +[138Cs+] +[175Ta] +[16nH] +[138La] +[112Cd] +[93Tc] +[28SiH3] +[166Tb] +[161Tb+3] +[158Tb] +[90Sr] +[32PH2] 
+[CaH2]
diff --git a/chebai/preprocessing/collate.py b/chebai/preprocessing/collate.py index ecbcb876..b420ef47 100644 --- a/chebai/preprocessing/collate.py +++ b/chebai/preprocessing/collate.py @@ -130,7 +130,7 @@ def process_label_rows(self, labels: Tuple) -> torch.Tensor: """ return pad_sequence( [ - torch.tensor([v if v is not None else False for v in row]) + torch.tensor([bool(v) if v is not None else False for v in row]) for row in labels ], batch_first=True,
diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py index 3ac0a803..68254007 100644 --- a/chebai/preprocessing/datasets/base.py +++ b/chebai/preprocessing/datasets/base.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Tuple, Union import lightning as pl +import numpy as np import pandas as pd import torch import tqdm @@ -256,6 +257,9 @@ def dataloader(self, kind: str, **kwargs) -> DataLoader: Returns: DataLoader: A DataLoader object. """ + rank_zero_info( + f"Loading {kind} data... (datamodule.current_epoch={self.curr_epoch if hasattr(self, 'curr_epoch') else 'N/A'})" + ) dataset = self.load_processed_data(kind) if "ids" in kwargs: ids = kwargs.pop("ids") @@ -446,6 +450,7 @@ def setup(self, *args, **kwargs) -> None: rank_zero_info(f"Check for processed data in {self.processed_dir}") rank_zero_info(f"Cross-validation enabled: {self.use_inner_cross_validation}") + rank_zero_info(f"Looking for files: {self.processed_file_names}") if any( not os.path.isfile(os.path.join(self.processed_dir, f)) for f in self.processed_file_names @@ -717,11 +722,18 @@ class _DynamicDataset(XYBaseDataModule, ABC): Args: dynamic_data_split_seed (int, optional): The seed for random data splitting. Defaults to 42. splits_file_path (str, optional): Path to the splits CSV file. Defaults to None. + apply_label_filter (Optional[str]): Path to a classes.txt file - only labels that are in the label filter + file will be used (in that order). All labels in the label filter have to be present in the dataset. This filter + is only active when loading splits from a CSV file. Defaults to None. + apply_id_filter (Optional[str]): Path to a data.pt file from a different dataset - only IDs that are in the + id filter file will be used. Defaults to None. This filter is only active when loading splits from a CSV file. **kwargs: Additional keyword arguments passed to XYBaseDataModule. Attributes: dynamic_data_split_seed (int): The seed for random data splitting, default is 42. splits_file_path (Optional[str]): Path to the CSV file containing split assignments. + apply_label_filter (Optional[str]): Path to a classes.txt file for label filtering. + apply_id_filter (Optional[str]): Path to a data.pt file for ID filtering.
""" # ---- Index for columns of processed `data.pkl` (should be derived from `_graph_to_raw_dataset` method) ------ @@ -731,6 +743,8 @@ class _DynamicDataset(XYBaseDataModule, ABC): def __init__( self, + apply_label_filter: Optional[str] = None, + apply_id_filter: Optional[str] = None, **kwargs, ): super(_DynamicDataset, self).__init__(**kwargs) @@ -744,6 +758,8 @@ def __init__( self.splits_file_path = self._validate_splits_file_path( kwargs.get("splits_file_path", None) ) + self.apply_label_filter = apply_label_filter + self.apply_id_filter = apply_id_filter self._data_pkl_filename: str = "data.pkl" @staticmethod @@ -1161,6 +1177,27 @@ def _retrieve_splits_from_csv(self) -> None: ) df_data = pd.DataFrame(data) + if self.apply_id_filter: + print(f"Applying ID filter from {self.apply_id_filter}...") + with open(self.apply_id_filter, "r") as f: + id_filter = [ + line["ident"] + for line in torch.load(self.apply_id_filter, weights_only=False) + ] + df_data = df_data[df_data["ident"].isin(id_filter)] + + if self.apply_label_filter: + print(f"Applying label filter from {self.apply_label_filter}...") + with open(self.apply_label_filter, "r") as f: + label_filter = [line.strip() for line in f] + with open(os.path.join(self.processed_dir_main, "classes.txt"), "r") as cf: + classes = [line.strip() for line in cf] + # reorder labels + old_labels = np.stack(df_data["labels"]) + label_mapping = [classes.index(lbl) for lbl in label_filter] + new_labels = old_labels[:, label_mapping] + df_data["labels"] = list(new_labels) + train_ids = splits_df[splits_df["split"] == "train"]["id"] validation_ids = splits_df[splits_df["split"] == "validation"]["id"] test_ids = splits_df[splits_df["split"] == "test"]["id"] diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py index c59b8387..edcc8c41 100644 --- a/chebai/preprocessing/datasets/chebi.py +++ b/chebai/preprocessing/datasets/chebi.py @@ -15,7 +15,7 @@ from abc import ABC from collections import OrderedDict from itertools import cycle, permutations, product -from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, Generator, List, Literal, Optional, Union import numpy as np import pandas as pd @@ -115,15 +115,14 @@ class _ChEBIDataExtractor(_DynamicDataset, ABC): chebi_version will be used for training, validation and test. Defaults to None. single_class (int, optional): The ID of the single class to predict. If not set, all available labels will be predicted. Defaults to None. - dynamic_data_split_seed (int, optional): The seed for random data splitting. Defaults to 42. - splits_file_path (str, optional): Path to the splits CSV file. Defaults to None. + subset (Literal["2_STAR", "3_STAR"], optional): If set, only use entities that are part of the given subset. **kwargs: Additional keyword arguments (passed to XYBaseDataModule). Attributes: single_class (Optional[int]): The ID of the single class to predict. chebi_version_train (Optional[int]): The version of ChEBI to use for training and validation. - dynamic_data_split_seed (int): The seed for random data splitting, default is 42. - splits_file_path (Optional[str]): Path to csv file containing split assignments. + subset (Optional[Literal["2_STAR", "3_STAR"]]): If set, only use entities that are part of the given subset. 
+ """ # ---- Index for columns of processed `data.pkl` (derived from `_graph_to_raw_dataset` method) ------ @@ -139,6 +138,7 @@ def __init__( self, chebi_version_train: Optional[int] = None, single_class: Optional[int] = None, + subset: Optional[Literal["2_STAR", "3_STAR"]] = None, augment_smiles: bool = False, aug_smiles_variations: Optional[int] = None, **kwargs, @@ -162,6 +162,8 @@ def __init__( self.aug_smiles_variations = aug_smiles_variations # predict only single class (given as id of one of the classes present in the raw data set) self.single_class = single_class + self.subset = subset + super(_ChEBIDataExtractor, self).__init__(**kwargs) # use different version of chebi for training and validation (if not None) # (still uses self.chebi_version for test set) @@ -277,7 +279,9 @@ def _extract_class_hierarchy(self, data_path: str) -> "nx.DiGraph": and term_doc.id.prefix == "CHEBI" ): term_dict = term_callback(term_doc) - if term_dict: + if term_dict and ( + not self.subset or term_dict["subset"] == self.subset + ): elements.append(term_dict) g = nx.DiGraph() @@ -617,6 +621,20 @@ def base_dir(self) -> str: """ return os.path.join("data", f"chebi_v{self.chebi_version}") + @property + def processed_dir_main(self) -> str: + """ + Returns the main directory path where processed data is stored. + + Returns: + str: The path to the main processed data directory, based on the base directory and the instance's name. + """ + return os.path.join( + self.base_dir, + self._name if self.subset is None else f"{self._name}_{self.subset}", + "processed", + ) + @property def processed_dir(self) -> str: """ @@ -946,6 +964,22 @@ class ChEBIOver50Partial(ChEBIOverXPartial, ChEBIOver50): pass +class ChEBIOverXFingerprints(ChEBIOverX): + """A class that uses Fingerprints for the processed data (used for fixed-length ML models).""" + + READER = dr.FingerprintReader + + +class ChEBIOver100Fingerprints(ChEBIOverXFingerprints, ChEBIOver100): + """ + A class for extracting data from the ChEBI dataset with Fingerprints reader and a threshold of 100. + + Inherits from ChEBIOverXFingerprints and ChEBIOver100. 
+ """ + + pass + + class JCIExtendedBPEData(JCIExtendedBase): READER = dr.ChemBPEReader @@ -994,6 +1028,7 @@ def term_callback(doc: "fastobo.term.TermFrame") -> Union[Dict, bool]: parents = [] name = None smiles = None + subset = None for clause in doc: if isinstance(clause, fastobo.term.PropertyValueClause): t = clause.property_value @@ -1013,6 +1048,8 @@ def term_callback(doc: "fastobo.term.TermFrame") -> Union[Dict, bool]: parents.append(chebi_to_int(str(clause.term))) elif isinstance(clause, fastobo.term.NameClause): name = str(clause.name) + elif isinstance(clause, fastobo.term.SubsetClause): + subset = str(clause.subset) if isinstance(clause, fastobo.term.IsObsoleteClause): if clause.obsolete: @@ -1025,6 +1062,7 @@ def term_callback(doc: "fastobo.term.TermFrame") -> Union[Dict, bool]: "has_part": parts, "name": name, "smiles": smiles, + "subset": subset, } diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py index 48dd1efc..1491463c 100644 --- a/chebai/preprocessing/datasets/pubchem.py +++ b/chebai/preprocessing/datasets/pubchem.py @@ -14,7 +14,7 @@ import tempfile import time from datetime import datetime -from typing import Generator, List, Optional, Tuple, Type +from typing import Generator, List, Optional, Tuple, Type, Union import numpy as np import pandas as pd @@ -176,12 +176,12 @@ def raw_file_names(self) -> List[str]: return ["smiles.txt"] @property - def processed_file_names(self) -> List[str]: + def processed_file_names_dict(self) -> List[str]: """ Returns: List[str]: List of processed data file names. """ - return ["test.pt", "train.pt", "validation.pt"] + return {"train": "train.pt", "test": "test.pt", "validation": "validation.pt"} def _set_processed_data_props(self): """ @@ -212,6 +212,146 @@ def _perform_data_preparation(self, *args, **kwargs): print("Done") +class PubChemBatched(PubChem): + """Store train data as batches of 10m, validation and test should each be 100k max""" + + READER: Type[dr.ChemDataReader] = dr.ChemDataReader + + def __init__(self, train_batch_size=1_000_000, *args, **kwargs): + super(PubChemBatched, self).__init__(*args, **kwargs) + self.curr_epoch = 0 + self.train_batch_size = train_batch_size + if self._k != self.FULL: + self.val_batch_size = ( + 100_000 + if self.validation_split * self._k > 100_000 + else int(self.validation_split * self._k) + ) + self.test_batch_size = ( + 100_000 + if self.test_split * self._k > 100_000 + else int(self.test_split * self._k) + ) + else: + self.val_batch_size = 100_000 + self.test_batch_size = 100_000 + + @property + def processed_file_names_dict(self) -> List[str]: + """ + Returns: + List[str]: List of processed data file names. + """ + train_samples = ( + self._k if self._k != self.FULL else 120_000_000 # estimated PubChem size + ) # estimate size + train_samples -= self.val_batch_size + self.test_batch_size + train_batches = ( + {"train": "train.pt"} + if train_samples <= self.train_batch_size + else { + f"train_{i}": f"train_{i}.pt" + for i in range(train_samples // self.train_batch_size) + } + ) + train_batches["test"] = "test.pt" + train_batches["validation"] = "validation.pt" + return train_batches + + def _tokenize_batched(self, data): + """ + Load data from a file and return a list of dictionaries, batched in 1,000,000 entries. + + Args: + path (str): The path to the input file. + batch_size (int): The size of each batch. + batch_idx (int): The index of the batch to load. + + Returns: + List: A list of dictionaries containing the features and labels. 
+ """ + print(f"Processing {len(data)} lines...") + batch = [] + for i, d in enumerate(tqdm.tqdm(data, total=len(data))): + if d["features"] is not None: + batch.append(self.reader.to_data(d)) + if i % self.train_batch_size == 0 and i > 0: + print(f"Generating batch {i // self.train_batch_size - 1}") + batch = [b for b in batch if b["features"] is not None] + if self.n_token_limit is not None: + batch = [ + b for b in batch if len(b["features"]) <= self.n_token_limit + ] + yield batch + batch = [] + print("Generating final batch") + batch = [b for b in batch if b["features"] is not None] + if self.n_token_limit is not None: + batch = [b for b in batch if len(b["features"]) <= self.n_token_limit] + yield batch + + def setup_processed(self): + """ + Prepares processed data and saves them as Torch tensors. + """ + filename = os.path.join(self.raw_dir, self.raw_file_names[0]) + print("Load data from file", filename) + data_not_tokenized = [entry for entry in self._load_dict(filename)] + print("Create splits") + train, test = train_test_split( + data_not_tokenized, test_size=self.test_batch_size + self.val_batch_size + ) + del data_not_tokenized + test, val = train_test_split(test, train_size=self.test_batch_size) + # Save first (and only) test batch + torch.save( + next(self._tokenize_batched(test)), + os.path.join(self.processed_dir, self.processed_file_names_dict["test"]), + ) + # save first (and only) validation batch + torch.save( + next(self._tokenize_batched(val)), + os.path.join( + self.processed_dir, self.processed_file_names_dict["validation"] + ), + ) + + # batch training if necessary + if len(train) > self.train_batch_size: + for i, batch in enumerate(self._tokenize_batched(train)): + torch.save(batch, os.path.join(self.processed_dir, f"train_{i}.pt")) + else: + torch.save( + next(self._tokenize_batched(train)), + os.path.join(self.processed_dir, "train.pt"), + ) + + self.reader.on_finish() + + def train_dataloader(self, *args, **kwargs) -> Union[DataLoader, List[DataLoader]]: + """ + Returns the train DataLoader. This swaps the training batch for each epoch. + + Args: + *args: Additional positional arguments. + **kwargs: Additional keyword arguments. + + Returns: + DataLoader: A DataLoader object for training data. + """ + return self.dataloader( + ( + "train" + if "train" in self.processed_file_names_dict + else f"train_{self.curr_epoch}" + ), + shuffle=True, + num_workers=self.num_workers, + persistent_workers=True, + **kwargs, + ) + + class PubChemDissimilar(PubChem): """ Subset of PubChem, but choosing the most dissimilar molecules (according to fingerprint) diff --git a/chebai/preprocessing/reader.py b/chebai/preprocessing/reader.py index e308a2e1..22b91a0e 100644 --- a/chebai/preprocessing/reader.py +++ b/chebai/preprocessing/reader.py @@ -202,8 +202,12 @@ def _read_data(self, raw_data: str) -> List[int]: except Exception as e: print(f"RDKit failed to process {raw_data}") print(f"\t{e}") - - return [self._get_token_index(v[1]) for v in _tokenize(raw_data)] + try: + return [self._get_token_index(v[1]) for v in _tokenize(raw_data)] + except ValueError as e: + print(f"could not process {raw_data}") + print(f"\t{e}") + return None class DeepChemDataReader(ChemDataReader): @@ -377,3 +381,33 @@ def name(cls) -> str: def _read_data(self, raw_data: str) -> List[int]: """Convert characters in raw data to their ordinal values.""" return [ord(s) for s in raw_data] + + +class FingerprintReader(DataReader): + """ + Data reader for chemical data using RDKit fingerprints. 
+ + Args: + fingerprint_size: Number of bits in the generated fingerprint. Defaults to 1024. + kwargs: Additional keyword arguments passed on to DataReader. + """ + + COLLATOR = DefaultCollator + + def __init__(self, fingerprint_size=1024, *args, **kwargs): + super().__init__(*args, **kwargs) + self.fingerprint_size = fingerprint_size + + @classmethod + def name(cls) -> str: + """Returns the name of the data reader.""" + return "rdkit_fingerprint" + + def _read_data(self, raw_data: str) -> List[int]: + """Generate an RDKit fingerprint (as a list of 0/1 ints) from raw SMILES data.""" + mol = Chem.MolFromSmiles(raw_data.strip()) + if mol is None: + raise ValueError(f"Invalid SMILES: {raw_data}") + return [ + int(bit) + for bit in Chem.RDKFingerprint(mol, fpSize=self.fingerprint_size).ToBitString() + ]
diff --git a/chebai/result/generate_class_properties.py b/chebai/result/generate_class_properties.py index 8c8f96bf..6a043e5a 100644 --- a/chebai/result/generate_class_properties.py +++ b/chebai/result/generate_class_properties.py @@ -121,6 +121,7 @@ def generate_props( model_config_file_path: str, data_config_file_path: str, output_path: str | None = None, + apply_id_filter: str | None = None, ) -> None: """ Run inference on validation set, compute TPV/NPV per class, and save to JSON. @@ -132,11 +133,13 @@ data_config_file_path: Path to yaml config file of the data. output_path: Optional path where to write the JSON metrics file. Defaults to '/classes.json'. + apply_id_filter: Optional path to a (data.pt) file containing IDs to filter the dataset. This is useful for comparing datasets with different IDs. """ data_cls_path, data_cls_kwargs = parse_config_file(data_config_file_path) data_module: XYBaseDataModule = load_data_instance( data_cls_path, data_cls_kwargs ) + data_module.apply_id_filter = apply_id_filter splits_file_path = Path(data_module.processed_dir_main, "splits.csv") if data_module.splits_file_path is None: @@ -168,17 +171,26 @@ raise ValueError(f"Unknown data partition: {data_partition}") print(f"Running inference on {data_partition} data...") - classes_file = Path(data_module.processed_dir_main) / "classes.txt" + if data_module.apply_label_filter is not None: + classes_file = data_module.apply_label_filter + else: + classes_file = Path(data_module.processed_dir_main) / "classes.txt" class_names = self.load_class_labels(classes_file) num_classes = len(class_names) metrics_obj_dict: dict[str, torchmetrics.Metric] = { - "cm": MultilabelConfusionMatrix(num_labels=num_classes), - "f1": MultilabelF1Score(num_labels=num_classes, average=None), + "cm": MultilabelConfusionMatrix(num_labels=num_classes).to( + device=model.device + ), + "f1": MultilabelF1Score(num_labels=num_classes, average=None).to( + device=model.device + ), } for batch_idx, batch in enumerate(data_loader): + batch = batch.to(device=model.device) data = model._process_batch(batch, batch_idx=batch_idx) - labels = data["labels"] + labels = data["labels"].to(device=model.device) + data["features"][0].to(device=model.device) model_output = model(data, **data.get("model_kwargs", {})) preds, targets = model._get_prediction_and_labels( data, labels, model_output @@ -213,6 +225,7 @@ def generate( model_config_file_path: str, data_config_file_path: str, output_path: str | None = None, + apply_id_filter: str | None = None, ) -> None: """ CLI command to generate JSON with metrics on validation set.
@@ -237,11 +250,13 @@ def generate( model_config_file_path, data_config_file_path, output_path, + apply_id_filter=apply_id_filter, ) if __name__ == "__main__": - # _generate_classes_props_json.py generate \ + # Usage: + # generate_classes_properties.py generate \ # --data_partition "val" \ # --model_ckpt_path "model/ckpt/path" \ # --model_config_file_path "model/config/file/path" \
diff --git a/chebai/trainer/CustomTrainer.py b/chebai/trainer/CustomTrainer.py index 2ecee680..f7c7c3e2 100644 --- a/chebai/trainer/CustomTrainer.py +++ b/chebai/trainer/CustomTrainer.py @@ -4,8 +4,11 @@ import pandas as pd import torch from lightning import LightningModule, Trainer +from lightning.fabric.utilities.data import _set_sampler_epoch from lightning.fabric.utilities.types import _PATH from lightning.pytorch.loggers import WandbLogger +from lightning.pytorch.loops.fit_loop import _FitLoop +from lightning.pytorch.trainer import call from torch.nn.utils.rnn import pad_sequence from chebai.loggers.custom import CustomLogger @@ -39,6 +42,9 @@ def __init__(self, *args, **kwargs): log_kwargs[log_key] = log_value self.logger.log_hyperparams(log_kwargs) + # use custom fit loop (https://lightning.ai/docs/pytorch/LTS/extensions/loops.html#overriding-the-default-loops) + self.fit_loop = LoadDataLaterFitLoop(self, self.min_epochs, self.max_epochs) + def _resolve_logging_argument(self, key: str, value: Any) -> Tuple[str, Any]: """ Resolves logging arguments, handling nested structures such as lists and complex objects. @@ -147,3 +153,34 @@ def log_dir(self) -> Optional[str]: dirpath = self.strategy.broadcast(dirpath) return dirpath + + +class LoadDataLaterFitLoop(_FitLoop): + + def on_advance_start(self) -> None: + """Calls the ``on_train_epoch_start`` hook **before** the dataloaders are set up, so that the + dataloaders can get information from the model. For example, the on_train_epoch_start hook sets the + curr_epoch attribute of the PubChemBatched dataset. With Lightning's default loop order, the + dataloaders would always load batch 0 first, run an epoch, then read the epoch number (usually 0, + unless resuming from a checkpoint), and only then load batch 0 again (or some other batch). With this + implementation, the dataloaders are set up after the epoch number is set, so that the correct + batch is loaded.""" + trainer = self.trainer + + # update the epoch value for all samplers + assert self._combined_loader is not None + for i, dl in enumerate(self._combined_loader.flattened): + _set_sampler_epoch(dl, self.epoch_progress.current.processed) + + if not self.restarted_mid_epoch and not self.restarted_on_epoch_end: + if not self.restarted_on_epoch_start: + self.epoch_progress.increment_ready() + + call._call_callback_hooks(trainer, "on_train_epoch_start") + call._call_lightning_module_hook(trainer, "on_train_epoch_start") + + self.epoch_progress.increment_started() + + # this is usually at the front of advance_start, but here we need it at the end + # might need to set up data again depending on `trainer.reload_dataloaders_every_n_epochs` + self.setup_data()
diff --git a/configs/model/electra.yml b/configs/model/electra.yml index c3cf2fdf..663a8fa1 100644 --- a/configs/model/electra.yml +++ b/configs/model/electra.yml @@ -3,7 +3,7 @@ init_args: optimizer_kwargs: lr: 1e-3 config: - vocab_size: 1400 + vocab_size: 4400 max_position_embeddings: 1800 num_attention_heads: 8 num_hidden_layers: 6
diff --git a/configs/model/lstm.yml b/configs/model/lstm.yml new file mode 100644 index 00000000..9ee3f183 --- /dev/null +++ b/configs/model/lstm.yml @@ -0,0 +1,7 @@ +class_path: chebai.models.lstm.ChemLSTM +init_args: + in_d: 100 + out_d: 100 + num_classes: 1528 + optimizer_kwargs: + lr: 1e-3
diff --git a/tests/unit/dataset_classes/testChebiTermCallback.py b/tests/unit/dataset_classes/testChebiTermCallback.py index 8680760e..9ea77177 100644 --- a/tests/unit/dataset_classes/testChebiTermCallback.py +++ b/tests/unit/dataset_classes/testChebiTermCallback.py @@ -36,6 +36,7 @@ def test_process_valid_terms(self) -> None: "has_part": set(), "name": "Compound A", "smiles": "C1=CC=CC=C1", + "subset": "2_STAR", } actual_dict: Dict[str, Any] = term_callback(
diff --git a/tests/unit/readers/testChemDataReader.py b/tests/unit/readers/testChemDataReader.py index 5e6fb099..ec018f00 100644 --- a/tests/unit/readers/testChemDataReader.py +++ b/tests/unit/readers/testChemDataReader.py @@ -97,12 +97,15 @@ def test_read_data_with_new_token(self) -> None: def test_read_data_with_invalid_input(self) -> None: """ Test the _read_data method with an invalid input. - The invalid token should raise an error or be handled appropriately. + The invalid token should result in a return value of None. """ raw_data = "%INVALID%" - with self.assertRaises(ValueError): - self.reader._read_data(raw_data) + result = self.reader._read_data(raw_data) + self.assertIsNone( + result, + "The output for invalid token '%INVALID%' should be None.", + ) @patch("builtins.open", new_callable=mock_open) def test_finish_method_for_new_tokens(self, mock_file: mock_open) -> None:
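Note (not part of the diff): the change set above wires a fingerprint-based "classic ML" path into chebai, where FingerprintReader turns each SMILES string into a fixed-length bit vector and the scikit-learn-backed LogisticRegression wrapper fits one binary classifier per class. The following is a minimal, self-contained sketch of that idea using RDKit and scikit-learn directly; the helper name smiles_to_fingerprint and the toy data are illustrative only and do not appear in the repository.

import numpy as np
from rdkit import Chem
from sklearn.linear_model import LogisticRegression


def smiles_to_fingerprint(smiles: str, fp_size: int = 1024) -> np.ndarray:
    # Convert a SMILES string into a fixed-length RDKit bit vector of 0/1 ints.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Invalid SMILES: {smiles}")
    bits = Chem.RDKFingerprint(mol, fpSize=fp_size).ToBitString()
    return np.array([int(b) for b in bits], dtype=np.int8)


# Toy data: two molecules, two independent binary labels per molecule.
smiles = ["CCO", "c1ccccc1"]
X = np.stack([smiles_to_fingerprint(s) for s in smiles])
y = np.array([[1, 0], [0, 1]])  # shape: (n_samples, n_classes)

# One liblinear LogisticRegression per class, mirroring the per-class wrapper.
models = [
    LogisticRegression(solver="liblinear").fit(X, y[:, i]) for i in range(y.shape[1])
]
preds = np.stack([m.predict(X) for m in models], axis=1)
print(preds)  # per-class 0/1 predictions, shape (n_samples, n_classes)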