Spaces:

HassounLab
/

MVP

Sleeping

App Files Files Community

yzhouchen001 committed on Oct 24

Commit

94aa6f9

1 Parent(s): c65d76d

partial push

Browse files

Files changed (25) hide show

.gitignore +176 -0
massspecgym/__init__.py +0 -0
massspecgym/data/__init__.py +8 -0
massspecgym/data/data_module.py +102 -0
massspecgym/data/datasets.py +225 -0
massspecgym/data/transforms.py +208 -0
massspecgym/definitions.py +27 -0
massspecgym/models/__init__.py +0 -0
massspecgym/models/base.py +180 -0
massspecgym/models/de_novo/__init__.py +6 -0
massspecgym/models/de_novo/base.py +241 -0
massspecgym/models/de_novo/dummy.py +46 -0
massspecgym/models/de_novo/random.py +1750 -0
massspecgym/models/de_novo/smiles_tranformer.py +200 -0
massspecgym/models/layers.py +101 -0
massspecgym/models/retrieval/__init__.py +13 -0
massspecgym/models/retrieval/base.py +206 -0
massspecgym/models/retrieval/deepsets.py +101 -0
massspecgym/models/retrieval/fingerprint_ffn.py +65 -0
massspecgym/models/retrieval/from_dict.py +67 -0
massspecgym/models/retrieval/random.py +22 -0
massspecgym/models/simulation/__init__.py +0 -0
massspecgym/models/simulation/base.py +63 -0
massspecgym/models/tokenizers.py +156 -0
massspecgym/utils.py +484 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,176 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# data
+data/*
+experiments/main_result/*
+experiments/old/*
+experiments/test_dir/*
+my_notebooks/*
+other/*
+.cache/
+!data/.gitkeep
+!experiments/.gitkeep
+!data/sample/

massspecgym/__init__.py ADDED Viewed

File without changes

massspecgym/data/__init__.py ADDED Viewed

	@@ -0,0 +1,8 @@

+from .datasets import MassSpecDataset, RetrievalDataset
+from .data_module import MassSpecDataModule
+__all__ = [
+    "MassSpecDataset",
+    "RetrievalDataset",
+    "MassSpecDataModule"
+]

massspecgym/data/data_module.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import typing as T
+import pandas as pd
+import numpy as np
+import pytorch_lightning as pl
+import massspecgym.utils as utils
+from pathlib import Path
+from typing import Optional
+from torch.utils.data.dataset import Subset
+from torch.utils.data.dataloader import DataLoader
+from massspecgym.data.datasets import MassSpecDataset
+class MassSpecDataModule(pl.LightningDataModule):
+    """
+    Data module containing a mass spectrometry dataset. This class is responsible for loading, splitting, and wrapping
+    the dataset into data loaders according to pre-defined train, validation, test folds.
+    """
+    def __init__(
+        self,
+        dataset: MassSpecDataset,
+        batch_size: int,
+        num_workers: int = 0,
+        persistent_workers: bool = True,
+        split_pth: Optional[Path] = None,
+        **kwargs
+    ):
+        """
+        Args:
+            split_pth (Optional[Path], optional): Path to a .tsv file with columns "identifier" and "fold",
+                corresponding to dataset item IDs, and "fold", containg "train", "val", "test"
+                values. Default is None, in which case the split from the `dataset` is used.
+        """
+        super().__init__(**kwargs)
+        self.dataset = dataset
+        self.split_pth = split_pth
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+        self.persistent_workers = persistent_workers if num_workers > 0 else False
+    def prepare_data(self):
+        if self.split_pth is None:
+            self.split = self.dataset.metadata[["identifier", "fold"]]
+        else:
+            # NOTE: custom split is not tested
+            self.split = pd.read_csv(self.split_pth, sep="\t")
+            if set(self.split.columns) != {"identifier", "fold"}:
+                raise ValueError('Split file must contain "id" and "fold" columns.')
+            self.split["identifier"] = self.split["identifier"].astype(str)
+            if set(self.dataset.metadata["identifier"]) != set(self.split["identifier"]):
+                raise ValueError(
+                    "Dataset item IDs must match the IDs in the split file."
+                )
+        self.split = self.split.set_index("identifier")["fold"]
+        if not set(self.split) <= {"train", "val", "test"}:
+            raise ValueError(
+                '"Folds" column must contain only "train", "val", or "test" values.'
+            )
+    def setup(self, stage=None):
+        split_mask = self.split.loc[self.dataset.metadata["identifier"]].values
+        if stage == "fit" or stage is None:
+            self.train_dataset = Subset(
+                self.dataset, np.where(split_mask == "train")[0]
+            )
+            self.val_dataset = Subset(self.dataset, np.where(split_mask == "val")[0])
+        if stage == "test":
+            self.test_dataset = Subset(self.dataset, np.where(split_mask == "test")[0])
+    def train_dataloader(self):
+        return DataLoader(
+            self.train_dataset,
+            batch_size=self.batch_size,
+            shuffle=True,
+            num_workers=self.num_workers,
+            persistent_workers=self.persistent_workers,
+            drop_last=False,
+            collate_fn=self.dataset.collate_fn,
+        )
+    def val_dataloader(self):
+        return DataLoader(
+            self.val_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
+            persistent_workers=self.persistent_workers,
+            drop_last=False,
+            collate_fn=self.dataset.collate_fn,
+        )
+    def test_dataloader(self):
+        return DataLoader(
+            self.test_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
+            persistent_workers=self.persistent_workers,
+            drop_last=False,
+            collate_fn=self.dataset.collate_fn,
+        )

massspecgym/data/datasets.py ADDED Viewed

	@@ -0,0 +1,225 @@

+import pandas as pd
+import json
+import typing as T
+import numpy as np
+import torch
+import matchms
+import massspecgym.utils as utils
+from pathlib import Path
+from rdkit import Chem
+from torch.utils.data.dataset import Dataset
+from torch.utils.data.dataloader import default_collate
+from matchms.importing import load_from_mgf
+from massspecgym.data.transforms import SpecTransform, MolTransform, MolToInChIKey
+class MassSpecDataset(Dataset):
+    """
+    Dataset containing mass spectra and their corresponding molecular structures. This class is
+    responsible for loading the data from disk and applying transformation steps to the spectra and
+    molecules.
+    """
+    def __init__(
+        self,
+        spec_transform: T.Optional[T.Union[SpecTransform, T.Dict[str, SpecTransform]]] = None,
+        mol_transform: T.Optional[T.Union[MolTransform, T.Dict[str, MolTransform]]] = None,
+        pth: T.Optional[Path] = None,
+        return_mol_freq: bool = True,
+        return_identifier: bool = True,
+        dtype: T.Type = torch.float32
+    ):
+        """
+        Args:
+            pth (Optional[Path], optional): Path to the .tsv or .mgf file containing the mass spectra.
+                Default is None, in which case the MassSpecGym dataset is downloaded from HuggingFace Hub.
+        """
+        self.pth = pth
+        self.spec_transform = spec_transform
+        self.mol_transform = mol_transform
+        self.return_mol_freq = return_mol_freq
+        if self.pth is None:
+            self.pth = utils.hugging_face_download("MassSpecGym.tsv")
+        if isinstance(self.pth, str):
+            self.pth = Path(self.pth)
+        if self.pth.suffix == ".tsv":
+            self.metadata = pd.read_csv(self.pth, sep="\t")
+            self.spectra = self.metadata.apply(
+                lambda row: matchms.Spectrum(
+                    mz=np.array([float(m) for m in row["mzs"].split(",")]),
+                    intensities=np.array(
+                        [float(i) for i in row["intensities"].split(",")]
+                    ),
+                    metadata={"precursor_mz": row["precursor_mz"]},
+                ),
+                axis=1,
+            )
+            self.metadata = self.metadata.drop(columns=["mzs", "intensities"])
+        elif self.pth.suffix == ".mgf":
+            self.spectra = list(load_from_mgf(str(self.pth)))
+            self.metadata = pd.DataFrame([s.metadata for s in self.spectra])
+        else:
+            raise ValueError(f"{self.pth.suffix} file format not supported.")
+        if self.return_mol_freq:
+            if "inchikey" not in self.metadata.columns:
+                self.metadata["inchikey"] = self.metadata["smiles"].apply(utils.smiles_to_inchi_key)
+            self.metadata["mol_freq"] = self.metadata.groupby("inchikey")["inchikey"].transform("count")
+        self.return_identifier = return_identifier
+        self.dtype = dtype
+    def __len__(self) -> int:
+        return len(self.spectra)
+    def __getitem__(
+        self, i: int, transform_spec: bool = True, transform_mol: bool = True
+    ) -> dict:
+        spec = self.spectra[i]
+        metadata = self.metadata.iloc[i]
+        mol = metadata["smiles"]
+        # Apply all transformations to the spectrum
+        item = {}
+        if transform_spec and self.spec_transform:
+            if isinstance(self.spec_transform, dict):
+                for key, transform in self.spec_transform.items():
+                    item[key] = transform(spec) if transform is not None else spec
+            else:
+                item["spec"] = self.spec_transform(spec)
+        else:
+            item["spec"] = spec
+        # Apply all transformations to the molecule
+        if transform_mol and self.mol_transform:
+            if isinstance(self.mol_transform, dict):
+                for key, transform in self.mol_transform.items():
+                    item[key] = transform(mol) if transform is not None else mol
+            else:
+                item["mol"] = self.mol_transform(mol)
+        else:
+            item["mol"] = mol
+        # Add other metadata to the item
+        # item.update({
+        #     k: metadata[k] for k in ["precursor_mz", "adduct"]
+        # })
+        if self.return_mol_freq:
+            item["mol_freq"] = metadata["mol_freq"]
+        if self.return_identifier:
+            item["identifier"] = metadata["identifier"]
+        # TODO: this should be refactored
+        for k, v in item.items():
+            if not isinstance(v, str):
+                try:
+                    item[k] = torch.as_tensor(v, dtype=self.dtype)
+                except:
+                    continue
+        return item
+    @staticmethod
+    def collate_fn(batch: T.Iterable[dict]) -> dict:
+        """
+        Custom collate function to handle the outputs of __getitem__.
+        """
+        return default_collate(batch)
+class RetrievalDataset(MassSpecDataset):
+    """
+    Dataset containing mass spectra and their corresponding molecular structures, with additional
+    candidates of molecules for retrieval based on spectral similarity.
+    """
+    def __init__(
+        self,
+        mol_label_transform: MolTransform = MolToInChIKey(),
+        candidates_pth: T.Optional[T.Union[Path, str]] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.candidates_pth = candidates_pth
+        self.mol_label_transform = mol_label_transform
+        # Download candidates from HuggigFace Hub if not a path to exisiting file is passed
+        if self.candidates_pth is None:
+            self.candidates_pth = utils.hugging_face_download(
+                "molecules/MassSpecGym_retrieval_candidates_mass.json"
+            )
+        elif isinstance(self.candidates_pth, str):
+            if Path(self.candidates_pth).is_file():
+                self.candidates_pth = Path(self.candidates_pth)
+            else:
+                self.candidates_pth = utils.hugging_face_download(candidates_pth)
+        # Read candidates_pth from json to dict: SMILES -> respective candidate SMILES
+        with open(self.candidates_pth, "r") as file:
+            self.candidates = json.load(file)
+    def __getitem__(self, i) -> dict:
+        item = super().__getitem__(i, transform_mol=False)
+        # Save the original SMILES representation of the query molecule (for evaluation)
+        item["smiles"] = item["mol"]
+        # Get candidates
+        if item["mol"] not in self.candidates:
+            raise ValueError(f'No candidates for the query molecule {item["mol"]}.')
+        item["candidates"] = self.candidates[item["mol"]]
+        # Save the original SMILES representations of the canidates (for evaluation)
+        item["candidates_smiles"] = item["candidates"]
+        # Create neg/pos label mask by matching the query molecule with the candidates
+        item_label = self.mol_label_transform(item["mol"])
+        item["labels"] = [
+            self.mol_label_transform(c) == item_label for c in item["candidates"]
+        ]
+        if not any(item["labels"]):
+            raise ValueError(
+                f'Query molecule {item["mol"]} not found in the candidates list.'
+            )
+        # Transform the query and candidate molecules
+        item["mol"] = self.mol_transform(item["mol"])
+        item["candidates"] = [self.mol_transform(c) for c in item["candidates"]]
+        if isinstance(item["mol"], np.ndarray):
+            item["mol"] = torch.as_tensor(item["mol"], dtype=self.dtype)
+            # item["candidates"] = [torch.as_tensor(c, dtype=self.dtype) for c in item["candidates"]]
+        return item
+    @staticmethod
+    def collate_fn(batch: T.Iterable[dict]) -> dict:
+        # Standard collate for everything except candidates and their labels (which may have different length per sample)
+        collated_batch = {}
+        for k in batch[0].keys():
+            if k not in ["candidates", "labels", "candidates_smiles"]:
+                collated_batch[k] = default_collate([item[k] for item in batch])
+        # Collate candidates and labels by concatenating and storing sizes of each list
+        collated_batch["candidates"] = torch.as_tensor(
+            np.concatenate([item["candidates"] for item in batch])
+        )
+        collated_batch["labels"] = torch.as_tensor(
+            sum([item["labels"] for item in batch], start=[])
+        )
+        collated_batch["batch_ptr"] = torch.as_tensor(
+            [len(item["candidates"]) for item in batch]
+        )
+        collated_batch["candidates_smiles"] = \
+            sum([item["candidates_smiles"] for item in batch], start=[])
+        return collated_batch
+# TODO: Datasets for unlabeled data.

massspecgym/data/transforms.py ADDED Viewed

	@@ -0,0 +1,208 @@

+import numpy as np
+import torch
+import matchms
+import matchms.filtering as ms_filters
+from rdkit.Chem import AllChem as Chem
+from typing import Optional
+from abc import ABC, abstractmethod
+import massspecgym.utils as utils
+from massspecgym.definitions import CHEM_ELEMS
+class SpecTransform(ABC):
+    """
+    Base class for spectrum transformations. Custom transformations should inherit from this class.
+    The transformation consists of two consecutive steps:
+        1. Apply a series of matchms filters to the input spectrum (method `matchms_transforms`).
+        2. Convert the matchms spectrum to a torch tensor (method `matchms_to_torch`).
+    """
+    @abstractmethod
+    def matchms_transforms(self, spec: matchms.Spectrum) -> matchms.Spectrum:
+        """
+        Apply a series of matchms filters to the input spectrum. Abstract method.
+        """
+    @abstractmethod
+    def matchms_to_torch(self, spec: matchms.Spectrum) -> torch.Tensor:
+        """
+        Convert a matchms spectrum to a torch tensor. Abstract method.
+        """
+    def __call__(self, spec: matchms.Spectrum) -> torch.Tensor:
+        """
+        Compose the matchms filters and the torch conversion: the output of `matchms_transforms`
+        is passed to `matchms_to_torch`.
+        """
+        return self.matchms_to_torch(self.matchms_transforms(spec))
+def default_matchms_transforms(
+    spec: matchms.Spectrum,
+    n_max_peaks: int = 60,
+    mz_from: float = 10,
+    mz_to: float = 1000,
+) -> matchms.Spectrum:
+    spec = ms_filters.select_by_mz(spec, mz_from=mz_from, mz_to=mz_to)
+    if n_max_peaks is not None:
+        spec = ms_filters.reduce_to_number_of_peaks(spec, n_max=n_max_peaks)
+    spec = ms_filters.normalize_intensities(spec)
+    return spec
+class SpecTokenizer(SpecTransform):
+    def __init__(
+        self,
+        n_peaks: Optional[int] = 60,
+        prec_mz_intensity: Optional[float] = 1.1,
+        matchms_kwargs: Optional[dict] = None
+    ) -> None:
+        self.n_peaks = n_peaks
+        self.prec_mz_intensity = prec_mz_intensity
+        self.matchms_kwargs = matchms_kwargs if matchms_kwargs is not None else {}
+    def matchms_transforms(self, spec: matchms.Spectrum) -> matchms.Spectrum:
+        return default_matchms_transforms(spec, n_max_peaks=self.n_peaks, **self.matchms_kwargs)
+    def matchms_to_torch(self, spec: matchms.Spectrum) -> torch.Tensor:
+        """
+        Stack arrays of mz and intensities into a matrix of shape (num_peaks, 2).
+        If the number of peaks is less than `n_peaks`, pad the matrix with zeros.
+        """
+        spec_t = np.vstack([spec.peaks.mz, spec.peaks.intensities]).T
+        if self.prec_mz_intensity is not None:
+            spec_t = np.vstack([[spec.metadata["precursor_mz"], self.prec_mz_intensity], spec_t])
+        if self.n_peaks is not None:
+            spec_t = utils.pad_spectrum(
+                spec_t,
+                self.n_peaks + 1 if self.prec_mz_intensity is not None else self.n_peaks
+            )
+        return torch.from_numpy(spec_t)
+class SpecBinner(SpecTransform):
+    def __init__(
+        self,
+        max_mz: float = 1005,
+        bin_width: float = 1,
+        to_rel_intensities: bool = True,
+    ) -> None:
+        self.max_mz = max_mz
+        self.bin_width = bin_width
+        self.to_rel_intensities = to_rel_intensities
+        if not (max_mz / bin_width).is_integer():
+            raise ValueError("`max_mz` must be divisible by `bin_width`.")
+    def matchms_transforms(self, spec: matchms.Spectrum) -> matchms.Spectrum:
+        return default_matchms_transforms(spec, mz_to=self.max_mz, n_max_peaks=None)
+    def matchms_to_torch(self, spec: matchms.Spectrum) -> torch.Tensor:
+        """
+        Bin the spectrum into a fixed number of bins.
+        """
+        binned_spec = self._bin_mass_spectrum(
+            mzs=spec.peaks.mz,
+            intensities=spec.peaks.intensities,
+            max_mz=self.max_mz,
+            bin_width=self.bin_width,
+            to_rel_intensities=self.to_rel_intensities,
+        )
+        return torch.from_numpy(binned_spec)
+    def _bin_mass_spectrum(
+        self, mzs, intensities, max_mz, bin_width, to_rel_intensities=True
+    ):
+        # Calculate the number of bins
+        num_bins = int(np.ceil(max_mz / bin_width))
+        # Calculate the bin indices for each mass
+        bin_indices = np.floor(mzs / bin_width).astype(int)
+        # Filter out mzs that exceed max_mz
+        valid_indices = bin_indices[mzs <= max_mz]
+        valid_intensities = intensities[mzs <= max_mz]
+        # Clip bin indices to ensure they are within the valid range
+        valid_indices = np.clip(valid_indices, 0, num_bins - 1)
+        # Initialize an array to store the binned intensities
+        binned_intensities = np.zeros(num_bins)
+        # Use np.add.at to sum intensities in the appropriate bins
+        np.add.at(binned_intensities, valid_indices, valid_intensities)
+        # Generate the bin edges for reference
+        # bin_edges = np.arange(0, max_mz + bin_width, bin_width)
+        # Normalize the intensities to relative intensities
+        if to_rel_intensities:
+            binned_intensities /= np.max(binned_intensities)
+        return binned_intensities  # , bin_edges
+class MolTransform(ABC):
+    """
+    Base class for molecule transformations. Subclasses implement `from_smiles`; calling an
+    instance with a SMILES string delegates to it.
+    """
+    @abstractmethod
+    def from_smiles(self, mol: str):
+        """
+        Convert a SMILES string to a tensor-like representation. Abstract method.
+        """
+    def __call__(self, mol: str):
+        # Calling the transform is a shorthand for `from_smiles`
+        return self.from_smiles(mol)
+class MolFingerprinter(MolTransform):
+    def __init__(self, type: str = "morgan", fp_size: int = 2048, radius: int = 2):
+        if type != "morgan":
+            raise NotImplementedError(
+                "Only Morgan fingerprints are implemented at the moment."
+            )
+        self.type = type
+        self.fp_size = fp_size
+        self.radius = radius
+    def from_smiles(self, mol: str):
+        mol = Chem.MolFromSmiles(mol)
+        return utils.morgan_fp(
+            mol, fp_size=self.fp_size, radius=self.radius, to_np=True
+        )
+class MolToInChIKey(MolTransform):
+    def __init__(self, twod: bool = True) -> None:
+        self.twod = twod
+    def from_smiles(self, mol: str) -> str:
+        mol = Chem.MolFromSmiles(mol)
+        return utils.mol_to_inchi_key(mol, twod=self.twod)
+class MolToFormulaVector(MolTransform):
+    def __init__(self):
+        self.element_index = {element: i for i, element in enumerate(CHEM_ELEMS)}
+    def from_smiles(self, smiles: str):
+        mol = Chem.MolFromSmiles(smiles)
+        if mol is None:
+            raise ValueError(f"Invalid SMILES string: {smiles}")
+        # Add explicit hydrogens to the molecule
+        mol = Chem.AddHs(mol)
+        # Initialize a vector of zeros for the 118 elements
+        formula_vector = np.zeros(118, dtype=np.int32)
+        # Iterate over atoms in the molecule and count occurrences of each element
+        for atom in mol.GetAtoms():
+            symbol = atom.GetSymbol()
+            if symbol in self.element_index:
+                index = self.element_index[symbol]
+                formula_vector[index] += 1
+            else:
+                raise ValueError(f"Element '{symbol}' not found in the list of 118 elements.")
+        return formula_vector
+    @staticmethod
+    def num_elements():
+        return len(CHEM_ELEMS)

massspecgym/definitions.py ADDED Viewed

	@@ -0,0 +1,27 @@

+"""Global variables used across the package."""
+import pathlib
+# Dirs
+MASSSPECGYM_ROOT_DIR = pathlib.Path(__file__).parent.absolute()
+MASSSPECGYM_REPO_DIR = MASSSPECGYM_ROOT_DIR.parent
+MASSSPECGYM_DATA_DIR = MASSSPECGYM_REPO_DIR / 'data'
+MASSSPECGYM_TEST_RESULTS_DIR = MASSSPECGYM_DATA_DIR / 'test_results'
+MASSSPECGYM_ASSETS_DIR = MASSSPECGYM_REPO_DIR / 'assets'
+# Special tokens
+PAD_TOKEN = "<pad>"
+SOS_TOKEN = "<s>"
+EOS_TOKEN = "</s>"
+UNK_TOKEN = "<unk>"
+# Chemistry
+# List of all 118 elements, ordered by atomic number (list index = atomic number - 1)
+CHEM_ELEMS = [
+    "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar",
+    "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr",
+    "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe",
+    "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu",
+    "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac",
+    "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh",
+    "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
+]

massspecgym/models/__init__.py ADDED Viewed

File without changes

massspecgym/models/base.py ADDED Viewed

	@@ -0,0 +1,180 @@

+import typing as T
+import collections
+from enum import Enum
+from abc import ABC, abstractmethod
+from pathlib import Path
+import torch
+import pytorch_lightning as pl
+from torchmetrics import Metric, SumMetric
+from massspecgym.utils import ReturnScalarBootStrapper
+class Stage(Enum):
+    TRAIN = 'train'
+    VAL = 'val'
+    TEST = 'test'
+    NONE = 'none'
+    def to_pref(self) -> str:
+        return f"{self.value}_" if self != Stage.NONE else ""
+class MassSpecGymModel(pl.LightningModule, ABC):
+    """
+    Abstract base Lightning module. Child classes implement `step` (the model-specific computation)
+    and `on_batch_end` (the task-specific evaluation); this class routes the Lightning
+    train/validation/test hooks to them and provides helpers for metric logging.
+    """
+    def __init__(
+        self,
+        lr: float = 1e-4,
+        weight_decay: float = 0.0,
+        log_only_loss_at_stages: T.Sequence[Stage | str] = (),
+        bootstrap_metrics: bool = True,
+        df_test_path: T.Optional[str | Path] = None,
+        *args,
+        **kwargs
+    ):
+        """
+        Args:
+            lr (float): Learning rate of the Adam optimizer.
+            weight_decay (float): Weight decay of the Adam optimizer.
+            log_only_loss_at_stages: Stages (or their string values) stored in
+                `self.log_only_loss_at_stages`; not used further within this class.
+            bootstrap_metrics (bool): Enables the bootstrapped "_std" metrics in `_update_metric`.
+            df_test_path: Optional path stored as `self.df_test_path`; not used further within
+                this class.
+        """
+        super().__init__()
+        self.save_hyperparameters()
+        # Setup metric logging
+        self.log_only_loss_at_stages = [
+            Stage(s) if isinstance(s, str) else s for s in log_only_loss_at_stages
+        ]
+        self.bootstrap_metrics = bootstrap_metrics
+        # Init dictionary to store dataframe columns where rows correspond to samples
+        # (for constructing test dataframe with predictions and metrics for each sample)
+        self.df_test_path = Path(df_test_path) if df_test_path is not None else None
+        self.df_test = collections.defaultdict(list)
+    @abstractmethod
+    def step(
+        self, batch: dict, stage: Stage = Stage.NONE
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Process a single batch at the given stage. Abstract method called by `training_step`,
+        `validation_step` and `test_step`.
+        """
+        raise NotImplementedError(
+            "Method `step` must be implemented in the model-specific child class."
+        )
+    def training_step(
+        self, batch: dict, batch_idx: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Delegates to `step` with the training stage
+        return self.step(batch, stage=Stage.TRAIN)
+    def validation_step(
+        self, batch: dict, batch_idx: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Delegates to `step` with the validation stage
+        return self.step(batch, stage=Stage.VAL)
+    def test_step(
+        self, batch: dict, batch_idx: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Delegates to `step` with the test stage
+        return self.step(batch, stage=Stage.TEST)
+    @abstractmethod
+    def on_batch_end(
+        self, outputs: T.Any, batch: dict, batch_idx: int, stage: Stage
+    ) -> None:
+        """
+        Method to be called at the end of each batch. This method should be implemented by a child,
+        task-dedicated class and contain the evaluation necessary for the task.
+        """
+        raise NotImplementedError(
+            "Method `on_batch_end` must be implemented in the task-specific child class."
+        )
+    def on_train_batch_end(self, *args, **kwargs):
+        # Forwards the hook arguments to `on_batch_end`, tagged with the training stage
+        return self.on_batch_end(*args, **kwargs, stage=Stage.TRAIN)
+    def on_validation_batch_end(self, *args, **kwargs):
+        # Forwards the hook arguments to `on_batch_end`, tagged with the validation stage
+        return self.on_batch_end(*args, **kwargs, stage=Stage.VAL)
+    def on_test_batch_end(self, *args, **kwargs):
+        # Forwards the hook arguments to `on_batch_end`, tagged with the test stage
+        return self.on_batch_end(*args, **kwargs, stage=Stage.TEST)
+    def configure_optimizers(self):
+        """Return an Adam optimizer configured with the `lr` and `weight_decay` hyperparameters."""
+        return torch.optim.Adam(
+            self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay
+        )
+    def get_checkpoint_monitors(self) -> list[dict]:
+        """
+        Return the list of monitored quantities. By default a single entry: the validation loss,
+        minimized, flagged for early stopping.
+        """
+        monitors = [
+            {"monitor": f"{Stage.VAL.to_pref()}loss", "mode": "min", "early_stopping": True}
+        ]
+        return monitors
+    def _update_metric(
+        self,
+        name: str,
+        metric_class: type[Metric],
+        update_args: T.Any,
+        batch_size: T.Optional[int] = None,
+        prog_bar: bool = False,
+        metric_kwargs: T.Optional[dict] = None,
+        log: bool = True,
+        log_n_samples: bool = False,
+        bootstrap: bool = False,
+        num_bootstraps: int = 100
+    ) -> None:
+        """
+        This method enables updating and logging metrics without instantiating them in advance in
+        the __init__ method. The metrics are aggregated over batches and logged at the end of the
+        epoch. If the metric does not exist yet, it is instantiated and added as an attribute to the
+        model.
+
+        Args:
+            name (str): Name under which the metric is stored on the model and logged.
+            metric_class: Class (or factory) called with `metric_kwargs` to create the metric.
+            update_args: Positional arguments the metric is called with.
+            batch_size (Optional[int]): Batch size passed to `self.log`.
+            prog_bar (bool): Passed to `self.log`.
+            metric_kwargs (Optional[dict]): Keyword arguments for `metric_class`.
+            log (bool): If False, the metric is updated but not logged.
+            log_n_samples (bool): If True, additionally accumulates `len(update_args[0])` in a
+                `SumMetric` named `name + "_n_samples"`.
+            bootstrap (bool): If True (and `self.bootstrap_metrics` is set), additionally updates a
+                bootstrapped metric named `name + "_std"`.
+            num_bootstraps (int): Number of bootstraps for the bootstrapped metric.
+        """
+        # Process arguments
+        bootstrap = bootstrap and self.bootstrap_metrics
+        # Log total number of samples (useful for debugging)
+        if log_n_samples:
+            self._update_metric(
+                name=name + "_n_samples",
+                metric_class=SumMetric,
+                update_args=(len(update_args[0]),),
+                batch_size=1,
+            )
+        # Init metric if it does not exist yet
+        if hasattr(self, name):
+            metric = getattr(self, name)
+        else:
+            if metric_kwargs is None:
+                metric_kwargs = dict()
+            metric = metric_class(**metric_kwargs)
+            metric = metric.to(self.device)
+            setattr(self, name, metric)
+        # Update
+        metric(*update_args)
+        # Log
+        if log:
+            self.log(
+                name,
+                metric,
+                prog_bar=prog_bar,
+                batch_size=batch_size,
+                on_step=False,
+                on_epoch=True,
+                add_dataloader_idx=False,
+                metric_attribute=name  # Suggested by a torchmetrics error
+            )
+        # Bootstrap
+        if bootstrap:
+            # Factory wrapping the metric into a bootstrapper; passed in place of a metric class
+            def _bootsrapped_metric_class(**metric_kwargs):
+                metric = metric_class(**metric_kwargs)
+                return ReturnScalarBootStrapper(metric, std=True, num_bootstraps=num_bootstraps)
+            self._update_metric(
+                name=name + "_std",
+                metric_class=_bootsrapped_metric_class,
+                update_args=update_args,
+                batch_size=batch_size,
+                metric_kwargs=metric_kwargs,
+            )
+    def _update_df_test(self, dct: dict) -> None:
+        """
+        Append per-sample values to the columns of `self.df_test`. Tensor values are converted to
+        Python lists first.
+        """
+        for col, vals in dct.items():
+            if isinstance(vals, torch.Tensor):
+                vals = vals.tolist()
+            self.df_test[col].extend(vals)

massspecgym/models/de_novo/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from .base import DeNovoMassSpecGymModel
+from .random import RandomDeNovo
+from .dummy import DummyDeNovo
+from .smiles_tranformer import SmilesTransformer
+__all__ = ["DeNovoMassSpecGymModel", "RandomDeNovo", "DummyDeNovo", "SmilesTransformer"]

massspecgym/models/de_novo/base.py ADDED Viewed

	@@ -0,0 +1,241 @@

+import typing as T
+from abc import ABC
+import torch
+import pandas as pd
+from rdkit import Chem
+from rdkit.DataStructs import TanimotoSimilarity
+from torchmetrics.aggregation import MeanMetric
+from massspecgym.models.base import MassSpecGymModel, Stage
+from massspecgym.utils import morgan_fp, mol_to_inchi_key, MyopicMCES
+class DeNovoMassSpecGymModel(MassSpecGymModel, ABC):
+    def __init__(
+        self,
+        top_ks: T.Iterable[int] = (1, 10),
+        myopic_mces_kwargs: T.Optional[T.Mapping] = None,
+        *args,
+        **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+        self.top_ks = top_ks
+        self.myopic_mces = MyopicMCES(**(myopic_mces_kwargs or {}))
+        self.mol_pred_kind: T.Literal["smiles", "rdkit"] = "smiles"
+        # caches of already computed results to avoid expensive re-computations
+        self.mces_cache = dict()
+        self.mol_2_morgan_fp = dict()
+    def on_batch_end(
+        self,
+        outputs: T.Any,
+        batch: dict,
+        batch_idx: int,
+        stage: Stage
+    ) -> None:
+        self.log(
+            f"{stage.to_pref()}loss",
+            outputs['loss'],
+            batch_size=batch['spec'].size(0),
+            sync_dist=True,
+            prog_bar=True,
+        )
+        if stage in self.log_only_loss_at_stages:
+            return
+        metric_vals = self.evaluate_de_novo_step(
+            outputs["mols_pred"],  # (bs, k) list of generated rdkit molecules or SMILES strings
+            batch["mol"],  # (bs) list of ground truth SMILES strings
+            stage=stage
+        )
+        if stage == Stage.TEST and self.df_test_path is not None:
+            self._update_df_test(metric_vals)
+    def evaluate_de_novo_step(
+        self,
+        mols_pred: list[list[T.Optional[Chem.Mol | str]]],
+        mol_true: list[str],
+        stage: Stage,
+    ) -> dict[str, torch.Tensor]:
+        """
+        # TODO: refactor to compute only for max(k) and then use the result to obtain the rest by
+        subsetting.
+        Main evaluation method for the models for de novo molecule generation from mass spectra.
+        Args:
+            mols_pred (list[list[Mol | str]]): (bs, k) list of generated rdkit molecules or SMILES
+                strings with possible Nones if no molecule was generated
+            mol_true (list[str]): (bs) list of ground-truth SMILES strings
+        """
+        # Initialize return dictionary to store metric values per sample
+        metric_vals = {}
+        # Get SMILES and RDKit molecule objects for all predictions
+        if self.mol_pred_kind == "smiles":
+            smiles_pred_valid, mols_pred_valid = [], []
+            for mols_pred_sample in mols_pred:
+                smiles_pred_valid_sample, mols_pred_valid_sample = [], []
+                for s in mols_pred_sample:
+                    m = Chem.MolFromSmiles(s) if s is not None else None
+                    # If SMILES cannot be converted to RDKit molecule, the molecule is set to None
+                    smiles_pred_valid_sample.append(s if m is not None else None)
+                    mols_pred_valid_sample.append(m)
+                smiles_pred_valid.append(smiles_pred_valid_sample)
+                mols_pred_valid.append(mols_pred_valid_sample)
+            smiles_pred, mols_pred = smiles_pred_valid, mols_pred_valid
+        elif self.mol_pred_kind == "rdkit":
+            smiles_pred = [
+                [Chem.MolToSmiles(m) if m is not None else None for m in ms]
+                for ms in mols_pred
+            ]
+        else:
+            raise ValueError(f"Invalid mol_pred_kind: {self.mol_pred_kind}")
+        # Auxiliary metric: number of valid molecules
+        self._update_metric(
+            stage.to_pref() + f"num_valid_mols",
+            MeanMetric,
+            ([sum([m is not None for m in ms]) for ms in mols_pred],),
+            batch_size=len(mols_pred),
+        )
+        # Get RDKit molecule objects for ground truth
+        smile_true = mol_true
+        mol_true = [Chem.MolFromSmiles(sm) for sm in mol_true]
+        def _get_morgan_fp_with_cache(mol):
+            """
+            A helper function to retrieve either cached Morgan Fingerprint value, or to compute and cache it
+            @param mol: RDKit molecule object
+            @return:
+            """
+            if mol not in self.mol_2_morgan_fp:
+                self.mol_2_morgan_fp[mol] = morgan_fp(mol, to_np=False)
+            return self.mol_2_morgan_fp[mol]
+        # Evaluate top-k metrics
+        for top_k in self.top_ks:
+            # Get top-k predicted molecules for each ground-truth sample
+            smiles_pred_top_k = [smiles_pred_sample[:top_k] for smiles_pred_sample in smiles_pred]
+            mols_pred_top_k = [mols_pred_sample[:top_k] for mols_pred_sample in mols_pred]
+            # 1. Evaluate minimum common edge subgraph:
+            # Calculate MCES distance between top-k predicted molecules and ground truth and
+            # report the minimum distance. The minimum distances for each sample in the batch are
+            # averaged across the epoch.
+            min_mces_dists = []
+            mces_thld = 100
+            # Iterate over batch
+            for preds, true in zip(smiles_pred_top_k, smile_true):
+                # Iterate over top-k predicted molecule samples
+                dists = []
+                for pred in preds:
+                    if pred is None:
+                        dists.append(mces_thld)
+                    else:
+                        if (true, pred) not in self.mces_cache:
+                            mce_val = self.myopic_mces(true, pred)
+                            self.mces_cache[(true, pred)] = mce_val
+                        dists.append(self.mces_cache[(true, pred)])
+                min_mces_dists.append(min(min(dists), mces_thld))
+            min_mces_dists = torch.tensor(min_mces_dists, device=self.device)
+            # Log
+            metric_name = stage.to_pref() + f"top_{top_k}_mces_dist"
+            self._update_metric(
+                metric_name,
+                MeanMetric,
+                (min_mces_dists,),
+                batch_size=len(min_mces_dists),
+                bootstrap=stage == Stage.TEST
+            )
+            metric_vals[metric_name] = min_mces_dists
+            # 2. Evaluate Tanimoto similarity:
+            # Calculate Tanimoto similarity between top-k predicted molecules and ground truth and
+            # report the maximum similarity. The maximum similarities for each sample in the batch
+            # are averaged across the epoch.
+            fps_pred_top_k = [
+                [_get_morgan_fp_with_cache(m) if m is not None else None for m in ms]
+                for ms in mols_pred_top_k
+            ]
+            fp_true = [_get_morgan_fp_with_cache(m) for m in mol_true]
+            max_tanimoto_sims = []
+            # Iterate over batch
+            for preds, true in zip(fps_pred_top_k, fp_true):
+                # Iterate over top-k predicted molecule samples
+                sims = [
+                    TanimotoSimilarity(true, pred)
+                    if pred is not None else 0
+                    for pred in preds
+                ]
+                max_tanimoto_sims.append(max(sims))
+            max_tanimoto_sims = torch.tensor(max_tanimoto_sims, device=self.device)
+            # Log
+            metric_name = stage.to_pref() + f"top_{top_k}_max_tanimoto_sim"
+            self._update_metric(
+                metric_name,
+                MeanMetric,
+                (max_tanimoto_sims,),
+                batch_size=len(max_tanimoto_sims),
+                bootstrap=stage == Stage.TEST
+            )
+            metric_vals[metric_name] = max_tanimoto_sims
+            # 3. Evaluate exact match (accuracy):
+            # Calculate if the ground truth molecule is in the top-k predicted molecules and report
+            # the average across the epoch.
+            in_top_k = [
+                mol_to_inchi_key(true) in [
+                    mol_to_inchi_key(pred)
+                    if pred is not None else None
+                    for pred in preds
+                ]
+                for true, preds in zip(mol_true, mols_pred_top_k)
+            ]
+            in_top_k = torch.tensor(in_top_k, device=self.device)
+            # Log
+            metric_name = stage.to_pref() + f"top_{top_k}_accuracy"
+            self._update_metric(
+                metric_name,
+                MeanMetric,
+                (in_top_k,),
+                batch_size=len(in_top_k),
+                bootstrap=stage == Stage.TEST
+            )
+            metric_vals[metric_name] = in_top_k
+        return metric_vals
+    def test_step(
+        self,
+        batch: dict,
+        batch_idx: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        outputs = super().test_step(batch, batch_idx)
+        # Get generated (i.e., predicted) SMILES
+        if self.df_test_path is not None:
+            self._update_df_test({
+                'identifier': batch['identifier'],
+                'mols_pred': outputs['mols_pred']
+            })
+        return outputs
+    def on_test_epoch_end(self):
+        # Save test data frame to disk
+        if self.df_test_path is not None:
+            df_test = pd.DataFrame(self.df_test)
+            self.df_test_path.parent.mkdir(parents=True, exist_ok=True)
+            df_test.to_pickle(self.df_test_path)

massspecgym/models/de_novo/dummy.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import random
+import torch
+from massspecgym.models.base import Stage
+from massspecgym.models.de_novo.base import DeNovoMassSpecGymModel
+class DummyDeNovo(DeNovoMassSpecGymModel):
+    def __init__(self, n_samples: int = 10, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.n_samples = n_samples
+        self.dummy_smiles = [
+            "O",                          # Water (H₂O)
+            "C",                          # Methane (CH₄)
+            "CCO",                        # Ethanol (C₂H₆O)
+            "C(C1C(C(C(C(O1)O)O)O)O)O",   # Glucose (C₆H₁₂O₆)
+            "CC(=O)C",                    # Acetone (C₃H₆O)
+            "CC(=O)Oc1ccccc1C(=O)O",      # Aspirin (C₉H₈O₄)
+            "CN1C=NC2=C1C(=O)N(C(=O)N2C)C", # Caffeine (C₈H₁₀N₄O₂)
+            "c1ccccc1",                   # Benzene (C₆H₆)
+            "CC(=O)O",                    # Acetic Acid (C₂H₄O₂)
+            "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O", # Ibuprofen (C₁₃H₁₈O₂)
+            None
+        ]
+        self.mol_pred_kind = "smiles"
+    def step(
+        self, batch: dict, stage: Stage = Stage.NONE
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        bs = batch['spec'].shape[0]
+        # Sample dummy molecules from the pre-defined list
+        mols_pred = [[random.choice(self.dummy_smiles) for _ in range(self.n_samples)] for _ in range(bs)]
+        # Random baseline, so we return a dummy loss
+        loss = torch.tensor(0.0, requires_grad=True)
+        # Return molecules in the dict
+        return dict(loss=loss, mols_pred=mols_pred)
+    def configure_optimizers(self):
+        # No optimizer needed for a random baseline
+        return None

massspecgym/models/de_novo/random.py ADDED Viewed

	@@ -0,0 +1,1750 @@

+from collections import deque, defaultdict
+from collections.abc import Generator
+from dataclasses import dataclass
+from random import choice, shuffle
+import chemparse
+import numpy as np
+import torch
+from massspecgym.models.base import Stage
+from massspecgym.models.de_novo.base import DeNovoMassSpecGymModel
+from rdkit import Chem
+from rdkit.Chem.MolStandardize import rdMolStandardize
+from rdkit.Chem.rdMolDescriptors import CalcMolFormula
+from rdkit.Chem.Descriptors import ExactMolWt
+from rdkit.Chem.rdchem import Mol, BondType
+from copy import deepcopy
+from collections import Counter
+import bisect
+from itertools import combinations
+# type aliases for code readability
+chem_element = str
+number_of_atoms = int
+@dataclass(frozen=True, order=True)
+class ValenceAndCharge:
+    """
+    A data class to store valence value with the corresponding charge.
+    Instances are immutable and hashable (frozen=True), so they can be used as dictionary keys and
+    set members; order=True makes them comparable field by field (valence first, then charge).
+    """
+    # valence value
+    valence: int
+    # charge associated with this valence value
+    charge: int
+@dataclass(frozen=True, order=True)
+class AtomWithValence:
+    """
+    A data class to store atom info including the computed valence.
+    Instances are immutable and hashable (frozen=True), which allows using them as dictionary keys.
+    """
+    # atom type as a string (`chem_element` is an alias of `str`)
+    atom_type: chem_element
+    # valence assigned to the atom together with the corresponding charge
+    atom_valence_and_charge: ValenceAndCharge
+@dataclass(frozen=True, order=True)
+class BondToNeighbouringAtom:
+    """
+    A data class to store info about the adjacent atom and the bond leading to it.
+    Instances are immutable and hashable (frozen=True).
+    """
+    # the atom (with its valence and charge) on the other side of the bond
+    adjacent_atom: AtomWithValence
+    # bond type stored as an integer
+    bond_type: int
+@dataclass
+class AtomNodeForRandomTraversal:
+    """
+    A mutable data class wrapping an atom (with its valence and charge) together with the
+    degree and charge that are still remaining for this node.
+    Both remaining values are always initialised from `atom_with_valence` in `__post_init__`,
+    so values passed for `_remaining_node_degree` / `_remaining_node_charge` to the constructor
+    are overwritten.
+    """
+    # the atom this node represents
+    atom_with_valence: AtomWithValence
+    # remaining degree of the node; set in __post_init__ (the None default is only a placeholder)
+    _remaining_node_degree: int = None
+    # remaining charge of the node; set in __post_init__ (the None default is only a placeholder)
+    _remaining_node_charge: int = None
+    def __post_init__(self):
+        """Setting up remaining node degree and charge for random traversal"""
+        # the remaining degree starts at the valence of the atom
+        self._remaining_node_degree = (
+            self.atom_with_valence.atom_valence_and_charge.valence
+        )
+        # the remaining charge starts at the charge of the atom
+        self._remaining_node_charge = (
+            self.atom_with_valence.atom_valence_and_charge.charge
+        )
+    @property
+    def remaining_node_degree(self):
+        """remaining_node_degree variable getter"""
+        return self._remaining_node_degree
+    @remaining_node_degree.setter
+    def remaining_node_degree(self, value: int):
+        """remaining_node_degree variable setter"""
+        self._remaining_node_degree = value
+    @property
+    def remaining_node_charge(self):
+        """remaining_node_charge variable getter"""
+        return self._remaining_node_charge
+    @remaining_node_charge.setter
+    def remaining_node_charge(self, value: int):
+        """remaining_node_charge variable setter"""
+        self._remaining_node_charge = value
+def create_rdkit_molecule_from_edge_list(
+    edge_list: list[tuple[int, int]], all_graph_nodes: list[AtomNodeForRandomTraversal]
+) -> Mol:
+    """
+    A helper function converting a randomly generated edge list into rdkit.Chem.rdchem.Mol object
+    @param edge_list: a list of edges, where each edge is specified by the index of its nodes;
+                      an edge listed several times between the same pair of nodes is a multiple bond
+    @param all_graph_nodes: a list of all atomic nodes in the molecular graph; nodes of type "+" or "-"
+                            are not atoms, an edge to such a node sets the formal charge of the atom
+                            on its other end
+    @return the assembled rdkit.Chem.rdchem.Mol object (as returned by RWMol.GetMol())
+    @raise NotImplementedError: if a pair of nodes is connected by more than six edges
+    """
+    # first we traverse all randomly generated edges and compute bond types between each pair of atoms
+    # (the pair is stored with the smaller index first, so the direction of an edge does not matter)
+    edge_2_bondtype = defaultdict(int)
+    for edge_node_i, edge_node_j in edge_list:
+        edge_2_bondtype[
+            (min(edge_node_i, edge_node_j), max(edge_node_i, edge_node_j))
+        ] += 1
+    # helper routine to get the rdkit enum bondtype
+    def _get_rdkit_bondtype(bondtype: int) -> BondType:
+        int_bondtype_2_enum = {
+            1: BondType.SINGLE,
+            2: BondType.DOUBLE,
+            3: BondType.TRIPLE,
+            4: BondType.QUADRUPLE,
+            5: BondType.QUINTUPLE,
+            6: BondType.HEXTUPLE,
+        }
+        try:
+            return int_bondtype_2_enum[bondtype]
+        except KeyError:
+            raise NotImplementedError(f"Bond type {bondtype} is not supported")
+    edge_list_rdkit = [
+        (node_i, node_j, _get_rdkit_bondtype(bondtype))
+        for (node_i, node_j), bondtype in edge_2_bondtype.items()
+    ]
+    # creating an empty editable molecule
+    mol = Chem.RWMol()
+    # adding the atoms to the molecule object
+    # as some all_graph nodes can represent charges, we have to remember mapping of molecular atom index to
+    # the corresponding atom index in all_graph_nodes
+    all_graph_atom_idx_2_mol_atom_idx = {}
+    for all_graph_atom_idx, atom in enumerate(all_graph_nodes):
+        # ignoring charge-related graph nodes
+        if atom.atom_with_valence.atom_type not in {"+", "-"}:
+            # the new atom is appended, so its index equals the current number of atoms
+            all_graph_atom_idx_2_mol_atom_idx[all_graph_atom_idx] = mol.GetNumAtoms()
+            next_atom = Chem.Atom(atom.atom_with_valence.atom_type)
+            next_atom.SetFormalCharge(
+                atom.atom_with_valence.atom_valence_and_charge.charge
+            )
+            mol.AddAtom(next_atom)
+    # adding bonds
+    for (edge_node_i, edge_node_j, bond_type) in edge_list_rdkit:
+        # checking if the edge represents a charge of connected atom
+        # (number of distinct charge symbols among the two end nodes, used as a truth value)
+        the_edge_represents_charge = len(
+            {
+                all_graph_nodes[node_i].atom_with_valence.atom_type
+                for node_i in [edge_node_i, edge_node_j]
+            }.intersection({"+", "-"})
+        )
+        if the_edge_represents_charge:
+            # setting a charge to the corresponding atom
+            # NOTE(review): `charge_value` and `atom_node_i` are not reset per edge. If both ends of
+            # an edge are charge nodes, `atom_node_i` is either unbound (first such edge) or still
+            # holds the atom of an earlier edge - confirm such edges cannot be generated.
+            # NOTE(review): the formal charge set here overwrites the one assigned when the atom
+            # was added, and the multiplicity of the edge is not taken into account - confirm intent.
+            for node_i in [edge_node_i, edge_node_j]:
+                if all_graph_nodes[node_i].atom_with_valence.atom_type in {"+", "-"}:
+                    charge_value = (
+                        1
+                        if all_graph_nodes[node_i].atom_with_valence.atom_type == "+"
+                        else -1
+                    )
+                else:
+                    atom_node_i = node_i
+            mol.GetAtomWithIdx(
+                all_graph_atom_idx_2_mol_atom_idx[atom_node_i]
+            ).SetFormalCharge(charge_value)
+        else:
+            mol.AddBond(
+                all_graph_atom_idx_2_mol_atom_idx[edge_node_i],
+                all_graph_atom_idx_2_mol_atom_idx[edge_node_j],
+                bond_type,
+            )
+    # returning the rdkit.Chem.rdchem.Mol object
+    return mol.GetMol()
+class RandomDeNovo(DeNovoMassSpecGymModel):
+    def __init__(
+        self,
+        formula_known: bool = True,
+        count_of_valid_valence_assignments: int = 10,
+        estimate_chem_element_stats: bool = False,
+        max_top_k: int = 10,
+        enforce_connectivity: bool = True,
+        cache_results: bool = True,
+        **kwargs
+    ):
+        """
+        @param formula_known: a boolean flag about the information available prior to generation
+                              If formula_known is True, we should generate molecules with the specified formula
+                              If formula_known is False, we should generate any molecule with the specified mass
+        @param count_of_valid_valence_assignments: an integer controlling process of selecting valence assignment
+                                                   to each atom in the generated molecule.
+                                                   `count_of_valid_valence_assignments` of assignment corresponding to
+                                                    the formula are generated, then one assignment is is picked at random.
+                                                    The default is set to 3 for the computational speed purposes.
+                                                    When setting to 1, the first feasible valence assignment will be used.
+        @param estimate_chem_element_stats: a boolean flag controlling if prior information about elements' valences
+                                            and bond type distributions is estimated from training data
+        @param max_top_k: a maximum number of candidates to generate. If the count of valid valence assignments do
+                          not allow generation of max_top_k, then less candidates are returned
+        @param enforce_connectivity: a boolean flag controlling connectivity of randomly generated molecules.
+                                     When it is set to True, first a random spanning tree is sampled
+        @param cache_results: a boolean flag controlling caching of already generated structures.
+                              When set to True, for each unique formula the set of random molecules is cached to avoid
+                              recomputation.
+        """
+        super(RandomDeNovo, self).__init__(**kwargs)
+        self.formula_known = formula_known
+        self.count_of_valid_valence_assignments = count_of_valid_valence_assignments
+        self.estimate_chem_element_stats = estimate_chem_element_stats
+        self.max_top_k = min(max(self.top_ks), max_top_k)
+        self.enforce_connectivity = enforce_connectivity
+        # prior chemical knownledge about element valences
+        self.element_2_valences = ELEMENT_VALENCES
+        # a dictionary structure to record molecular weights with corresponding formulas from training data
+        # during training steps, for each molecular weight we record all encountered formulas
+        # then on training end we compute proportions of the formulas and record it as a mapping
+        # mol_weight -> [[formula_1, formula_2], [proportion_of_formula_1, proportion_of_formula_2]]
+        self.mol_weight_2_formulas = defaultdict(list)
+        # a helper array to store sorted list of train molecular weights.
+        # It will be used for the O(logn) lookup of the closest mol weight
+        self.mol_weight_trn_values: list[float] = None
+        # a dictionary structure for statistics about bond type distributions
+        # the dictionary has the following mapping:
+        # chem_element ->
+        #   ValenceAndCharge ->
+        #     number of already bonded atoms ->
+        #       [already created BondToNeighbouringAtom] ->
+        #                                           AtomWithValence ->
+        #                                                          list of (bond_type, count) + total_count
+        self.element_2_bond_stats = None
+        # a cache with already precomputed sets of randomly generated molecules for the given formula
+        self.formula_2_random_smiles = {}
+        self.cache_results = cache_results
+    def generator_for_splits_of_chem_element_atoms_by_possible_valences(
+        self,
+        atom_type: chem_element,
+        possible_valences: list[ValenceAndCharge],
+        atom_count: int,
+        already_assigned_groups_of_atoms: dict[AtomWithValence, number_of_atoms],
+    ) -> Generator[dict[AtomWithValence, number_of_atoms]]:
+        """
+        A recursive generator function to iterate over all possible partitions of element atoms
+        into groups with different valid valences.
+        Each allowed valence value can have any number from atoms, from zero up to total `atom_count`
+        @param atom_type: chemical element
+        @param possible_valences: a list of allowed valences
+        @param atom_count: a total number of element atoms to split into valence groups
+        @param already_assigned_groups_of_atoms: partial results to pass into the subsequent recursive calls
+        @return A generator for lazy enumeration over all possible splits of `atom_count` atoms into subgroups
+                of valid valences specified in `possible valences` parameters.
+                Each return value is a dictionary, mapping atom with fixed valence to a total count of such instances
+                in the molecule.
+        @note In the future the method can be made into a function in a separate utils module,
+        for the simplicity of codebase organization and testing purposes it's kept as the method for now
+        """
+        # the check for a base case of the recursion
+        if atom_count == 0:
+            yield already_assigned_groups_of_atoms
+        elif len(possible_valences):
+            # taking the first valence value from the possible ones
+            next_valence = possible_valences[0]
+            # iterating over possible sizes for a group of atoms with `next_valence` value of the valence
+            for size_of_group in range(atom_count, -1, -1):
+                # recording the assigned size of the group
+                already_assigned_groups_of_atoms_next = (
+                    already_assigned_groups_of_atoms.copy()
+                )
+                atom_with_valence = AtomWithValence(
+                    atom_type=atom_type, atom_valence_and_charge=next_valence
+                )
+                already_assigned_groups_of_atoms_next[atom_with_valence] = size_of_group
+                yield from self.generator_for_splits_of_chem_element_atoms_by_possible_valences(
+                    atom_type=atom_type,
+                    possible_valences=possible_valences[1:],
+                    atom_count=atom_count - size_of_group,
+                    already_assigned_groups_of_atoms=already_assigned_groups_of_atoms_next,
+                )
+    def assigner_of_valences_to_all_atoms(
+        self,
+        unassigned_molecule_elements_with_counts: dict[chem_element, number_of_atoms],
+        already_assigned_atoms_with_valences: dict[AtomWithValence, number_of_atoms],
+        common_valences_only: bool = True,
+    ) -> Generator[dict[AtomWithValence, number_of_atoms]]:
+        """
+        A recursive function to iterate over all possible valid assignments of valences for each atom in the molecule
+        @param unassigned_molecule_elements_with_counts: a dictionary representation of a molecule,
+                                                         mapping each present element to a corresponding number of atoms.
+                                                         The function is recursive, in the subsequence calls
+                                                         the dictionary represents an yet-unprocessed submolecule
+        @param already_assigned_atoms_with_valences: partial results to pass into the subsequent recursive calls,
+                                                     stored as a dictionary, mapping atom with fixed valence
+                                                     to a total count of such atoms in the molecule
+        @param common_valences_only: a flag for using the common valence values for each element
+        @return A generator for lazy enumeration over all possible assignments of all molecule atoms into subgroups
+                defined by valences. Valence values are the valid ones for the corresponding chemical element.
+                Each return value is a dictionary, mapping atom of specified chemical element with a fixed valence
+                to a total count of such atoms in the molecule.
+        @note In the future the method can be made into a function in a separate utils module,
+        for the simplicity of codebase organization and testing purposes it's kept as the method for now
+        """
+        # the check for a base case of the recursion
+        if len(unassigned_molecule_elements_with_counts) == 0:
+            yield already_assigned_atoms_with_valences
+        else:
+            # processing the next chemical element in the molecule
+            chem_element_type, atom_count = list(
+                unassigned_molecule_elements_with_counts.items()
+            )[0]
+            # for the subsequence recursive calls the picked atom will be removed from the yet-to-be-processed
+            remaining_unassigned_atoms_with_counts = (
+                unassigned_molecule_elements_with_counts.copy()
+            )
+            del remaining_unassigned_atoms_with_counts[chem_element_type]
+            # generating splits of the element count into groups with possible valences
+            valences_common, valences_others = self.element_2_valences[
+                chem_element_type.capitalize()
+            ]
+            possible_element_valences = (
+                valences_common
+                if common_valences_only
+                else valences_common + valences_others
+            )
+            # we ignore "the direction" of ionic bonds, therefore we work with absolute values of valences
+            possible_element_valences = map(
+                lambda x: ValenceAndCharge(valence=np.abs(x.valence), charge=x.charge),
+                possible_element_valences,
+            )
+            # we require a connected molecule graph, so we ignore possible 0 values of valences
+            possible_element_valences = list(
+                set(filter(lambda x: x.valence > 0, possible_element_valences))
+            )
+            # creating a generator for lazy enumeration over all possible splits of element atoms
+            # into subgroups of possible valid valences
+            valence_split_generator = (
+                self.generator_for_splits_of_chem_element_atoms_by_possible_valences(
+                    atom_type=chem_element_type,
+                    possible_valences=possible_element_valences,
+                    atom_count=atom_count,
+                    already_assigned_groups_of_atoms=dict(),
+                )
+            )
+            # iterating over splits of the element count into groups with possible valences
+            for element_atoms_with_valence_2_count in valence_split_generator:
+                already_assigned_atoms_with_valences_new = (
+                    already_assigned_atoms_with_valences.copy()
+                )
+                already_assigned_atoms_with_valences_new.update(
+                    element_atoms_with_valence_2_count
+                )
+                yield from self.assigner_of_valences_to_all_atoms(
+                    unassigned_molecule_elements_with_counts=remaining_unassigned_atoms_with_counts,
+                    already_assigned_atoms_with_valences=already_assigned_atoms_with_valences_new,
+                    common_valences_only=common_valences_only,
+                )
+    def is_valence_assignment_feasible(
+        self, valence_assignment: dict[AtomWithValence, number_of_atoms]
+    ) -> bool:
+        """
+        A function for checking if the valence assignment to all molecule atoms can be feasible
+        @param valence_assignment: an assignment of all molecule atoms into subgroups of plausible valences
+        @note In the future the method can be made into a function in a separate utils module,
+        for the simplicity of codebase organization and testing purposes it's kept as the method for now
+        """
+        # considering a molecule as a graph with atom being nodes and chemical bonds being edges
+        # computing sum of all node degrees
+        sum_of_all_node_degrees = sum(
+            [
+                atom.atom_valence_and_charge.valence * count_of_atoms
+                for atom, count_of_atoms in valence_assignment.items()
+            ]
+        )
+        if sum_of_all_node_degrees % 2 == 1:
+            # the valence assignment is infeasible as in the graph the number of edges is half of the total degrees sum
+            # therefore the sum_of_all_node_degrees must be an even number
+            return False
+        total_number_of_bonds = sum_of_all_node_degrees / 2
+        # the total number of all atoms in the whole molecule
+        total_number_of_atoms_in_molecule = sum(valence_assignment.values())
+        if total_number_of_bonds < total_number_of_atoms_in_molecule - 1:
+            # the valence assignment is infeasible as the molecule graph cannot be connected
+            return False
+        # check that charges add up to zero
+        total_charge = 0
+        for atom, count_of_atoms in valence_assignment.items():
+            # we do not take virtual nodes for the charged molecules, we force the remaining submolecule to be neutral
+            if atom.atom_type not in {"+", "-"}:
+                total_charge += atom.atom_valence_and_charge.charge * count_of_atoms
+        if total_charge != 0:
+            return False
+        return True
+    def get_feasible_atom_valence_assignments(
+        self, chemical_formula: str
+    ) -> list[dict[AtomWithValence, number_of_atoms]]:
+        """
+        A function generating candidate assignments of valences to individual atoms in the molecule.
+        Candidates are returned in a random order.
+        @param chemical_formula: a string containing the chemical formula of the molecule
+        @note In the future the method can be made into a function in a separate utils module,
+        for the simplicity of codebase organization and testing purposes it's kept as the method for now
+        """
+        # parsing chemical formula into a dictionary of elements with corresponding counts
+        element_2_count = {
+            element: int(count)
+            for element, count in chemparse.parse_formula(chemical_formula).items()
+        }
+        # checking that all input elements are valid
+        for element in element_2_count.keys():
+            if element.capitalize() not in self.element_2_valences:
+                raise ValueError(
+                    f"Found an unknown element {element.capitalize()} in the formula {chemical_formula}"
+                )
+        # estimate the total number of all atoms in the whole molecule
+        # it will be used to check validity of the valence assignments
+        total_number_of_atoms_in_molecule = sum(element_2_count.values())
+        generated_candidate_valence_assignments = []
+        valence_assignment_generator = self.assigner_of_valences_to_all_atoms(
+            unassigned_molecule_elements_with_counts=element_2_count,
+            already_assigned_atoms_with_valences=dict(),
+            common_valences_only=True,
+        )
+        termination_assignment_value = {AtomWithValence("No more assignments", -1): -1}
+        next_valence_assignment = next(
+            valence_assignment_generator, termination_assignment_value
+        )
+        while (
+            len(generated_candidate_valence_assignments)
+            < self.count_of_valid_valence_assignments
+            and next_valence_assignment != termination_assignment_value
+        ):
+            if self.is_valence_assignment_feasible(next_valence_assignment):
+                generated_candidate_valence_assignments.append(next_valence_assignment)
+            next_valence_assignment = next(
+                valence_assignment_generator, termination_assignment_value
+            )
+        # if no valence assignment was found with common valences,
+        # then try generating assignments including not-common valences
+        if len(generated_candidate_valence_assignments) == 0:
+            valence_assignment_generator = self.assigner_of_valences_to_all_atoms(
+                unassigned_molecule_elements_with_counts=element_2_count,
+                already_assigned_atoms_with_valences=dict(),
+                common_valences_only=False,
+            )
+            next_valence_assignment = next(
+                valence_assignment_generator, termination_assignment_value
+            )
+            while (
+                len(generated_candidate_valence_assignments)
+                < self.count_of_valid_valence_assignments
+                and next_valence_assignment != termination_assignment_value
+            ):
+                if self.is_valence_assignment_feasible(next_valence_assignment):
+                    generated_candidate_valence_assignments.append(
+                        next_valence_assignment
+                    )
+                next_valence_assignment = next(
+                    valence_assignment_generator, termination_assignment_value
+                )
+        if len(generated_candidate_valence_assignments) == 0:
+            raise ValueError(
+                f"No valence assignments can be generated for the formula {chemical_formula}"
+            )
+        shuffle(generated_candidate_valence_assignments)
+        return generated_candidate_valence_assignments
+    def sample_second_edgenode_at_random(
+        self,
+        edge_start_node_i: int,
+        all_graph_nodes: list[AtomNodeForRandomTraversal],
+        open_nodes_for_sampling: dict[str, set[int]],
+        possible_candidates_type: str,
+        closed_set: set[int],
+        use_chem_element_stats: bool = False,
+        already_connected_neighbours: list[BondToNeighbouringAtom] = None,
+    ):
+        """
+        A function randomly sampling the second node for an edge
+        @param edge_start_node_i: index of the first edge node
+        @param all_graph_nodes: a list of all nodes in the molecule graph
+        @param open_nodes_for_sampling: dictionary with sets of node indices which
+                                                 can be considered for closing the edge.
+                                                 Each set is specified by the dictionary key:
+                                                     "coordinate_bond_negatively_charged_targets",
+                                                     "coordinate_bond_positively_charged_targets",
+                                                     "covalent_bond_targets"
+        @param possible_candidates_type: the `open_nodes_for_sampling` dictionary key
+        @param closed_set: closed set for traversal
+        @param use_chem_element_stats: a boolean flag setting up usage of per chem. elements statistics about its bonds
+        @param already_connected_neighbours: an adjacency list of already sampled neighbours
+        """
+        if not use_chem_element_stats:
+            edge_end_node_j = choice(
+                [
+                    candidate_node_j
+                    for candidate_node_j in open_nodes_for_sampling[
+                        possible_candidates_type
+                    ]
+                    if candidate_node_j not in closed_set
+                ]
+            )
+            bond_degree = 1
+        else:
+            # checking the current state of the atom and gathering the corresponding stats
+            number_of_already_sampled_neighbours = len(already_connected_neighbours)
+            # note that the graph is undirected, start-end node refers to the random traversal only
+            start_atom = all_graph_nodes[edge_start_node_i]
+            if self.element_2_bond_stats is None:
+                raise RuntimeError(
+                    "To use chem. element stats, the model has to be trained first,"
+                    "to record training molecular weights with corresponding formulas."
+                )
+            # the structure of `self.element_2_bond_stats` is
+            # chem_element ->
+            #   ValenceAndCharge ->
+            #     number of already bonded atoms ->
+            #       [already created BondToNeighbouringAtom] ->
+            #                                           AtomWithValence ->
+            #                                                          list of (bond_type, count)+ total_count
+            # if we don't have stats  -> fall back to sampling from all candidates
+            try:
+                element_stats = self.element_2_bond_stats[
+                    start_atom.atom_with_valence.atom_type
+                ][
+                    ValenceAndCharge(
+                        start_atom.atom_with_valence.atom_valence_and_charge.valence,
+                        start_atom.atom_with_valence.atom_valence_and_charge.charge,
+                    )
+                ][
+                    number_of_already_sampled_neighbours
+                ][
+                    tuple(sorted(already_connected_neighbours))
+                ]
+                full_candidates_list = []
+                neighb_with_stats_candidates_list = []
+                neighb_with_stats_bondcounts = []
+                neighb_with_stats_bondlists = []
+                # iterating over open nodes of the corresponding bond type
+                for candidate_node_j in open_nodes_for_sampling[
+                    possible_candidates_type
+                ]:
+                    if candidate_node_j not in closed_set:
+                        # remembering all candidates in case no statistic-based option is there
+                        full_candidates_list.append(candidate_node_j)
+                        # checking if the candidate is present in element-specific bond stats
+                        candidate_neighb_atom = all_graph_nodes[
+                            candidate_node_j
+                        ].atom_with_valence
+                        if candidate_neighb_atom in element_stats:
+                            neighb_with_stats_candidates_list.append(candidate_node_j)
+                            bondslist, total_bond_count = element_stats[
+                                candidate_neighb_atom
+                            ]
+                            neighb_with_stats_bondcounts.append(total_bond_count)
+                            neighb_with_stats_bondlists.append(bondslist)
+                # when no stats-based neighbour remain (e.g. hydrogens are not recorded in the stats)
+                if len(neighb_with_stats_candidates_list) == 0:
+                    edge_end_node_j = choice(full_candidates_list)
+                    bond_degree = 1
+                else:
+                    # sampling based on frequences in bond stats
+                    total_bondcount_sum = sum(neighb_with_stats_bondcounts)
+                    proportions = [
+                        val / total_bondcount_sum
+                        for val in neighb_with_stats_bondcounts
+                    ]
+                    edge_end_node_j = np.random.choice(
+                        neighb_with_stats_candidates_list, p=proportions
+                    )
+                    # getting i of the sampled neighbour to access its bond-stats
+                    neighb_i = neighb_with_stats_candidates_list.index(edge_end_node_j)
+                    # for the sampled end node, we sample the type of the bond based on the stats
+                    bondtypes_possible = []
+                    counts_of_possible_bondtypes = []
+                    total_possible_bondtype_count = 0
+                    # we leave only the bonds which current state of random generation allows
+                    # i.e., we cannot sample a bond violating the current remaining degree of `edge_start_node_i`
+                    start_node_remaining_degree = all_graph_nodes[
+                        edge_start_node_i
+                    ].remaining_node_degree
+                    for bondtype, count in neighb_with_stats_bondlists[neighb_i]:
+                        if bondtype <= start_node_remaining_degree:
+                            bondtypes_possible.append(bondtype)
+                            counts_of_possible_bondtypes.append(count)
+                            total_possible_bondtype_count += count
+                    # if no bonds can be closed for the sampled element, fall back to sampling from full candidates list
+                    if len(bondtypes_possible) == 0:
+                        edge_end_node_j = choice(full_candidates_list)
+                        bond_degree = 1
+                    else:
+                        bond_degree_proportions = [
+                            num / total_possible_bondtype_count
+                            for num in counts_of_possible_bondtypes
+                        ]
+                        bond_degree = np.random.choice(
+                            bondtypes_possible, p=bond_degree_proportions
+                        )
+                        already_connected_neighbours.append(
+                            BondToNeighbouringAtom(
+                                adjacent_atom=all_graph_nodes[
+                                    edge_end_node_j
+                                ].atom_with_valence,
+                                bond_type=bond_degree,
+                            )
+                        )
+            except:
+                edge_end_node_j = choice(
+                    [
+                        candidate_node_j
+                        for candidate_node_j in open_nodes_for_sampling[
+                            possible_candidates_type
+                        ]
+                        if candidate_node_j not in closed_set
+                    ]
+                )
+                bond_degree = 1
+        return edge_end_node_j, bond_degree, already_connected_neighbours
+    def sample_edge_at_random(
+        self,
+        all_graph_nodes: list[AtomNodeForRandomTraversal],
+        open_nodes_for_sampling: dict[str, set[int]],
+        edge_start_node_i: int = None,
+        closed_set: set[int] = None,
+        use_chem_element_stats: bool = False,
+        atom_2_already_connected_neighbours: list[list[BondToNeighbouringAtom]] = None,
+    ) -> tuple[tuple[int, int], list[AtomNodeForRandomTraversal], set[int]]:
+        """
+        Helper function to filter atoms suitable for generation of a random bond with `edge_start_node_i`
+        and sampling a random edge
+        @param all_graph_nodes: a list of all nodes in the molecule graph
+        @param edge_start_node_i: index of the first edge node
+        @param open_nodes_for_sampling: dictionary with sets of node indices which
+                                                 can be considered for closing the edge.
+                                                 Each set is specified by the dictionary key:
+                                                     "coordinate_bond_negatively_charged_targets",
+                                                     "coordinate_bond_positively_charged_targets",
+                                                     "covalent_bond_targets"
+        @param use_chem_element_stats: a boolean flag setting up usage of per chem. elements statistics about its bonds
+        @param closed_set: closed set for traversal
+        @param atom_2_already_connected_neighbours: a mapping from atom to its adjacency list of already sampled neighbours
+        @return: a sampled edge and updated structures `all_graph_nodes`, `open_nodes_for_sampling`
+        """
+        # sample the start node for the edge if it's not specified
+        if edge_start_node_i is None:
+            edge_start_node_i = choice(
+                sum(map(list, open_nodes_for_sampling.values()), [])
+            )
+        if closed_set is None:
+            closed_set = {edge_start_node_i}
+        # check if the start edge atom has the charge and therefore can form coordinate bond
+        can_form_coordinate_bond = (
+            all_graph_nodes[edge_start_node_i].remaining_node_charge != 0
+        )
+        # if possible, create coordinate bond at random
+        is_bond_coordinate = can_form_coordinate_bond and np.random.rand() < 0.5
+        if is_bond_coordinate:
+            start_node_charge_sign = np.sign(
+                all_graph_nodes[edge_start_node_i].remaining_node_charge
+            )
+            # if for the coordinate bond one atom is positively charged, then another must be charged negatively
+            if start_node_charge_sign > 0:
+                possible_candidates_type = "coordinate_bond_neg_charged_targets"
+            else:
+                possible_candidates_type = "coordinate_bond_pos_charged_targets"
+        else:
+            possible_candidates_type = "covalent_bond_targets"
+        (
+            edge_end_node_j,
+            node_degree_reduction,
+            atom_2_already_connected_neighbours[edge_start_node_i],
+        ) = self.sample_second_edgenode_at_random(
+            edge_start_node_i,
+            all_graph_nodes,
+            open_nodes_for_sampling,
+            possible_candidates_type,
+            closed_set,
+            use_chem_element_stats,
+            atom_2_already_connected_neighbours[edge_start_node_i],
+        )
+        # decrease the node degrees correspondingly
+        for node_of_a_new_edge_i in [edge_start_node_i, edge_end_node_j]:
+            all_graph_nodes[
+                node_of_a_new_edge_i
+            ].remaining_node_degree -= node_degree_reduction
+            # if all bonds are created for the particular atom, it is no more open for traversal
+            if all_graph_nodes[node_of_a_new_edge_i].remaining_node_degree <= 0:
+                for candidates_type in open_nodes_for_sampling.keys():
+                    if node_of_a_new_edge_i in open_nodes_for_sampling[candidates_type]:
+                        open_nodes_for_sampling[candidates_type].remove(
+                            node_of_a_new_edge_i
+                        )
+            # if the added bond was coordinate, modify the remaining charges correspondingly
+            elif is_bond_coordinate:
+                new_charge_abs_value = (
+                    np.abs(all_graph_nodes[node_of_a_new_edge_i].remaining_node_charge)
+                    - 1
+                )
+                # check if the node still can form coordinate bonds
+                if new_charge_abs_value == 0:
+                    for candidates_type in [
+                        "coordinate_bond_neg_charged_targets",
+                        "coordinate_bond_pos_charged_targets",
+                    ]:
+                        if (
+                            node_of_a_new_edge_i
+                            in open_nodes_for_sampling[candidates_type]
+                        ):
+                            open_nodes_for_sampling[candidates_type].remove(
+                                node_of_a_new_edge_i
+                            )
+                else:
+                    charge_sign = np.sign(
+                        all_graph_nodes[node_of_a_new_edge_i].remaining_node_charge
+                    )
+                    all_graph_nodes[node_of_a_new_edge_i].remaining_node_charge = (
+                        charge_sign * new_charge_abs_value
+                    )
+        return (
+            (edge_start_node_i, edge_end_node_j),
+            all_graph_nodes,
+            open_nodes_for_sampling,
+        )
+    def generate_random_molecule_graphs_via_traversal(
+        self,
+        chemical_formula: str,
+        max_number_of_retries_per_valence_assignment: int = 100,
+    ) -> list[Mol]:
+        """
+        A function generating random molecule graph(s).
+        The generation process ensures that each graph is connected.
+        If any of the `self.count_of_valid_valence_assignments` enables it,
+        the function returns graph(s) without self-loops.
+        @param chemical_formula: a string containing the chemical formula of the molecule
+        @param max_number_of_retries_per_valence_assignment: a max count of attempts to generate a random spanning tree
+                                                             for a given potentially feasible valence assignment
+        @note In the future the method can be made into a function in a separate utils module,
+        for the simplicity of codebase organization and testing purposes it's kept as the method for now
+        """
+        # check if for the input formula the random structures have been already generated
+        if self.cache_results and chemical_formula in self.formula_2_random_smiles:
+            return self.formula_2_random_smiles[chemical_formula]
+        # get candidate partitions of all molecule atoms into valences
+        candidate_valence_assignments = self.get_feasible_atom_valence_assignments(
+            chemical_formula
+        )
+        # iterate over each valence assignment to all atoms, the order is random
+        assert (
+            len(candidate_valence_assignments) > 0
+        ), f"No potentially feasible atom valence assignment for {chemical_formula}"
+        # number of iteration over feasible valence assignments
+        num_of_iterations_over_splits_into_valences = int(
+            np.ceil(self.max_top_k / len(candidate_valence_assignments))
+        )
+        generated_molecules = []
+        # we request to generate self.max_top_k molecule(s)
+        while len(generated_molecules) < self.max_top_k:
+            for _ in range(num_of_iterations_over_splits_into_valences):
+                for valence_assignment in candidate_valence_assignments:
+                    # first randomly create a spanning tree of the molecule graph, to ensure the connectivity of molecule.
+                    # The feasibility check `self.is_valence_assignment_feasible` inside the
+                    # `self.get_feasible_atom_valence_assignments` function should ensure the possibility to create the tree.
+                    spanning_tree_was_generated = False
+                    spanning_tree_generation_attempts = 0
+                    while (
+                        not spanning_tree_was_generated
+                        and spanning_tree_generation_attempts
+                        < max_number_of_retries_per_valence_assignment
+                    ):
+                        spanning_tree_generation_attempts += 1
+                        # we optimistically set the value of `spanning_tree_was_generated` to True,
+                        # If the current traversal do not lead to a spanning tree,
+                        # then `spanning_tree_was_generated` is set to False in the code below
+                        spanning_tree_was_generated = True
+                        # prepare node list for a random edges generation
+                        all_graph_nodes = []
+                        for (
+                            atom_with_valence,
+                            num_of_atoms_in_molecule,
+                        ) in valence_assignment.items():
+                            for _ in range(num_of_atoms_in_molecule):
+                                all_graph_nodes.append(
+                                    AtomNodeForRandomTraversal(
+                                        atom_with_valence=atom_with_valence
+                                    )
+                                )
+                        # a helper structure to record already sampled bonds
+                        # it is used only if we use estimated chem elements stats in the generation process
+                        atom_2_already_connected_neighbours = [
+                            [] for _ in range(len(all_graph_nodes))
+                        ]
+                        # recording sets of nodes available for random sampling of covalent and coordinate bonds
+                        coordinate_bond_neg_charged_targets = {
+                            node_i
+                            for node_i, node in enumerate(all_graph_nodes)
+                            if np.sign(node.remaining_node_charge) == -1
+                        }
+                        coordinate_bond_pos_charged_targets = {
+                            node_i
+                            for node_i, node in enumerate(all_graph_nodes)
+                            if np.sign(node.remaining_node_charge) == 1
+                        }
+                        covalent_bond_targets = {
+                            node_i
+                            for node_i, node in enumerate(all_graph_nodes)
+                            if node.remaining_node_charge == 0
+                            or node.remaining_node_degree
+                            > np.abs(node.remaining_node_charge)
+                        }
+                        open_nodes_for_sampling = {
+                            "coordinate_bond_neg_charged_targets": coordinate_bond_neg_charged_targets,
+                            "coordinate_bond_pos_charged_targets": coordinate_bond_pos_charged_targets,
+                            "covalent_bond_targets": covalent_bond_targets,
+                        }
+                        # the final edge list will be stored into the variable below.
+                        # An edge is defined by a pair of position indices in the `all_graph_nodes` list
+                        edge_list = []
+                        # the nodes already included into the spanning tree
+                        # the set is used for quick blacklisting, while the list is used for possible backtracking when
+                        (
+                            spanning_tree_visited_nodes_set,
+                            spanning_tree_traversal_list,
+                        ) = (
+                            set(),
+                            deque(),
+                        )
+                        # sample a random start of spanning tree generation
+                        edge_start_node_i = choice(list(range(len(all_graph_nodes))))
+                        spanning_tree_visited_nodes_set.add(edge_start_node_i)
+                        spanning_tree_traversal_list.append(edge_start_node_i)
+                        while self.enforce_connectivity and len(
+                            spanning_tree_visited_nodes_set
+                        ) < len(all_graph_nodes):
+                            # check if the start edge atom has the charge and therefore can form coordinate bond
+                            try:
+                                (
+                                    (edge_start_node_i, edge_end_node_i),
+                                    all_graph_nodes,
+                                    open_nodes_for_sampling,
+                                ) = self.sample_edge_at_random(
+                                    all_graph_nodes,
+                                    open_nodes_for_sampling,
+                                    edge_start_node_i=edge_start_node_i,
+                                    closed_set=spanning_tree_visited_nodes_set,
+                                    use_chem_element_stats=self.estimate_chem_element_stats,
+                                    atom_2_already_connected_neighbours=atom_2_already_connected_neighbours,
+                                )
+                            except IndexError:
+                                spanning_tree_was_generated = False
+                                break
+                            # note that the graph is undirected, start-end node refers to the random traversal only
+                            edge_list.append((edge_start_node_i, edge_end_node_i))
+                            # recording the node added to the random spanning tree
+                            spanning_tree_visited_nodes_set.add(edge_end_node_i)
+                            spanning_tree_traversal_list.append(edge_end_node_i)
+                            # finding a start node for the next sampled edge.
+                            # We have to ensure that such a node still has some degree not covered by sampling nodes.
+                            # For that, we might need to backtrack.
+                            candidate_for_start_node_i = edge_end_node_i
+                            try:
+                                while (
+                                    all_graph_nodes[
+                                        candidate_for_start_node_i
+                                    ].remaining_node_degree
+                                    == 0
+                                ):
+                                    spanning_tree_traversal_list.pop()
+                                    candidate_for_start_node_i = (
+                                        spanning_tree_traversal_list[-1]
+                                    )
+                            except IndexError:
+                                spanning_tree_was_generated = False
+                                break
+                            edge_start_node_i = candidate_for_start_node_i
+                    # after the spanning tree edges were sampled,
+                    # now we randomly connect nodes with remaining degrees yet uncovered by sampled bonds
+                    while sum(map(len, open_nodes_for_sampling.values())) >= 2:
+                        try:
+                            (
+                                (edge_start_node_i, edge_end_node_i),
+                                all_graph_nodes,
+                                open_nodes_for_sampling,
+                            ) = self.sample_edge_at_random(
+                                all_graph_nodes,
+                                open_nodes_for_sampling,
+                                use_chem_element_stats=self.estimate_chem_element_stats,
+                                atom_2_already_connected_neighbours=atom_2_already_connected_neighbours,
+                            )
+                        except IndexError:
+                            break
+                        edge_list.append((edge_start_node_i, edge_end_node_i))
+                    # if all nodes were covered by edges without self-loops, then we remember the generated molecule
+                    if sum(map(len, open_nodes_for_sampling.values())) == 0:
+                        generated_molecules.append(
+                            create_rdkit_molecule_from_edge_list(
+                                edge_list, all_graph_nodes
+                            )
+                        )
+                        if len(generated_molecules) == self.max_top_k:
+                            if self.cache_results:
+                                self.formula_2_random_smiles[
+                                    chemical_formula
+                                ] = generated_molecules
+                            return generated_molecules
+        if self.cache_results:
+            self.formula_2_random_smiles[chemical_formula] = generated_molecules
+        return generated_molecules
+    def training_step(
+        self, batch: dict, batch_idx: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Record training-set statistics for the random baseline; nothing is optimised here.
+
+        When `self.estimate_chem_element_stats` is set, every atom of every molecule in the batch updates
+        `self.element_2_bond_stats`, which at this stage is nested as
+        element symbol -> ValenceAndCharge -> number of already chosen neighbours ->
+        sorted tuple of those neighbours -> remaining neighbour -> count,
+        where a neighbour is the tuple (symbol, total valence, formal charge, bond order).
+        Independently of that flag, the formula of every molecule is appended to
+        `self.mol_weight_2_formulas` under the molecule's exact weight.
+
+        @param batch: dictionary whose "mol" entry is iterated as SMILES strings
+        @param batch_idx: not read by this method
+        @return: dictionary with a zero-valued dummy `loss` and the constant `mols_pred=["C"]`
+        """
+        # NOTE(review): the return annotation says "tuple of tensors", but a dict is returned below
+        # recording statistics about chemical elements
+        if self.estimate_chem_element_stats:
+            if self.element_2_bond_stats is None:
+                self.element_2_bond_stats = defaultdict(dict)
+            for mol_smiles in batch["mol"]:
+                molecule = Chem.MolFromSmiles(mol_smiles)
+                # in order to work with double and single bonds instead of aromatic
+                Chem.Kekulize(molecule, clearAromaticFlags=True)
+                # we add hydrogen atoms (which are omitted by default)
+                molecule = Chem.AddHs(molecule)
+                # NOTE(review): `formula` is not read anywhere inside this statistics loop
+                formula = CalcMolFormula(molecule)
+                for atom in molecule.GetAtoms():
+                    valence = atom.GetTotalValence()
+                    charge = atom.GetFormalCharge()
+                    valence_charge = ValenceAndCharge(valence, charge)
+                    chem_element_type = atom.GetSymbol()
+                    atom_bonds = atom.GetBonds()
+                    if (
+                        valence_charge
+                        not in self.element_2_bond_stats[chem_element_type]
+                    ):
+                        # for each value of atom's valence, number of neighbours we will count types of neighbouring atoms
+                        self.element_2_bond_stats[chem_element_type][
+                            valence_charge
+                        ] = dict()
+                    # a set: neighbours with identical (symbol, valence, charge, bond order) collapse into one entry
+                    all_atom_neighbours = set()
+                    for bond in atom_bonds:
+                        start_atom_idx = atom.GetIdx()
+                        # the atom at the other end of the bond, i.e. the one whose index differs from `atom`'s
+                        end_atom = [
+                            _atom
+                            for _atom in [bond.GetBeginAtom(), bond.GetEndAtom()]
+                            if _atom.GetIdx() != start_atom_idx
+                        ][0]
+                        all_atom_neighbours.add(
+                            (
+                                end_atom.GetSymbol(),
+                                end_atom.GetTotalValence(),
+                                end_atom.GetFormalCharge(),
+                                bond.GetBondTypeAsDouble(),
+                            )
+                        )
+                    # subsets are grouped by size r = 0 .. n-1; the full neighbour set is left out on purpose,
+                    # because nothing would remain to be bonded after it
+                    all_neighbour_subsets = [
+                        [set(subset) for subset in combinations(all_atom_neighbours, r)]
+                        for r in range(len(all_atom_neighbours))
+                    ]
+                    for neighbour_subsets_of_fixed_size in all_neighbour_subsets:
+                        # every group holds at least one subset (r is always smaller than the set size),
+                        # so taking element [0] is safe
+                        subset_size = len(neighbour_subsets_of_fixed_size[0])
+                        # for each number of already connected atoms we record the neighbours and then possible bonds yet to be closed
+                        if (
+                            subset_size
+                            not in self.element_2_bond_stats[chem_element_type][
+                                valence_charge
+                            ]
+                        ):
+                            self.element_2_bond_stats[chem_element_type][
+                                valence_charge
+                            ][subset_size] = dict()
+                        for neighbours in neighbour_subsets_of_fixed_size:
+                            # sorting gives one canonical, hashable key per subset
+                            neighbours_tuple = tuple(sorted(neighbours))
+                            if (
+                                neighbours_tuple
+                                not in self.element_2_bond_stats[chem_element_type][
+                                    valence_charge
+                                ][subset_size]
+                            ):
+                                self.element_2_bond_stats[chem_element_type][
+                                    valence_charge
+                                ][subset_size][neighbours_tuple] = defaultdict(int)
+                            # every neighbour outside the subset counts once as a "still to be bonded" option
+                            remaining_bonds = all_atom_neighbours.difference(neighbours)
+                            for remaining_bonded_neighbour in remaining_bonds:
+                                self.element_2_bond_stats[chem_element_type][
+                                    valence_charge
+                                ][subset_size][neighbours_tuple][
+                                    remaining_bonded_neighbour
+                                ] += 1
+        # recording molecular weight
+        for mol_smiles in batch["mol"]:
+            molecule = Chem.MolFromSmiles(mol_smiles)
+            formula = CalcMolFormula(molecule)
+            weight = ExactMolWt(molecule)
+            # presumably `self.mol_weight_2_formulas` is a defaultdict(list) at this point - verify in __init__
+            self.mol_weight_2_formulas[weight].append(formula)
+        # Random baseline, so we return a dummy loss
+        loss = torch.tensor(0.0, requires_grad=True)
+        return dict(loss=loss, mols_pred=["C"])
+    def on_train_end(self) -> None:
+        """
+        Convert the raw statistics gathered in `training_step` into the structures used for sampling.
+
+        `self.mol_weight_2_formulas` is rewritten from weight -> list of formulas into
+        weight -> [[formulas], [proportions]], and the sorted weights are stored in
+        `self.mol_weight_trn_values`. If `self.estimate_chem_element_stats` is set,
+        `self.element_2_bond_stats` is rebuilt with its raw neighbour tuples replaced by
+        `BondToNeighbouringAtom` / `AtomWithValence` objects and its counts grouped per neighbouring atom.
+        """
+        # for each molecular weight we compute proportions of recorded molecular formulas
+        molecular_weight_2_formula_counts = {
+            weight: Counter(formulas)
+            for weight, formulas in self.mol_weight_2_formulas.items()
+        }
+        weight_2_formula_proportions = {}
+        for weight, formula_2_count in molecular_weight_2_formula_counts.items():
+            total_count = sum(formula_2_count.values())
+            weight_2_formula_proportions[weight] = {
+                formula: count / total_count
+                for formula, count in formula_2_count.items()
+            }
+        # for subsequent sampling using numpy.random.choice function, we store the results in the format
+        # weight -> [[formula_1, formula_2], [proportion_of_formula_1, proportion_of_formula_2]]
+        self.mol_weight_2_formulas = {
+            weight: [
+                list(formula_2_proportions.keys()),
+                list(formula_2_proportions.values()),
+            ]
+            for weight, formula_2_proportions in weight_2_formula_proportions.items()
+        }
+        # storing weights in the sorted list for the logarithmic time look-up of the closest weight value
+        self.mol_weight_trn_values = sorted(self.mol_weight_2_formulas.keys())
+        # if chem element stats are used, then the corresponding data structure is reformatted in accordance with the
+        # description from docstring to the class __init__:
+        # chem_element ->
+        #   ValenceAndCharge ->
+        #     number of already bonded atoms ->
+        #       [already created BondToNeighbouringAtom] ->
+        #                                           AtomWithValence ->
+        #                                                          list of (bond_type, count) + total_count
+        if self.estimate_chem_element_stats:
+            element_2_bond_stats = defaultdict(dict)
+            for (
+                chem_element,
+                valence_charge_2_stats,
+            ) in self.element_2_bond_stats.items():
+                for valence_charge, num_bonds_2_stats in valence_charge_2_stats.items():
+                    element_2_bond_stats[chem_element][valence_charge] = dict()
+                    for num_bonds, bonds_2_stats in num_bonds_2_stats.items():
+                        element_2_bond_stats[chem_element][valence_charge][
+                            num_bonds
+                        ] = dict()
+                        for (
+                            bonds,
+                            neighb_atom_with_valence_2_stats,
+                        ) in bonds_2_stats.items():
+                            # each raw `bond` is a (symbol, valence, charge, bond type) tuple written by training_step
+                            # NOTE(review): sorting requires BondToNeighbouringAtom to be orderable - confirm its definition
+                            present_bonds_sorted = tuple(
+                                sorted(
+                                    [
+                                        BondToNeighbouringAtom(
+                                            adjacent_atom=AtomWithValence(
+                                                atom_type=bond[0],
+                                                atom_valence_and_charge=ValenceAndCharge(
+                                                    valence=bond[1], charge=bond[2]
+                                                ),
+                                            ),
+                                            bond_type=bond[3],
+                                        )
+                                        for bond in bonds
+                                    ]
+                                )
+                            )
+                            element_2_bond_stats[chem_element][valence_charge][
+                                num_bonds
+                            ][present_bonds_sorted] = defaultdict(list)
+                            # first pass: group the (bond type, count) pairs by the neighbouring atom they lead to
+                            for (
+                                neighb_atom_type,
+                                neighb_atom_valence,
+                                neighb_atom_charge,
+                                bondtype,
+                            ), count in neighb_atom_with_valence_2_stats.items():
+                                neighbouring_atom = AtomWithValence(
+                                    atom_type=neighb_atom_type,
+                                    atom_valence_and_charge=ValenceAndCharge(
+                                        valence=neighb_atom_valence,
+                                        charge=neighb_atom_charge,
+                                    ),
+                                )
+                                element_2_bond_stats[chem_element][valence_charge][
+                                    num_bonds
+                                ][present_bonds_sorted][neighbouring_atom].append(
+                                    (bondtype, count)
+                                )
+                            # computing total count of all bonds per neighbouring atom
+                            # (only values of already existing keys are replaced, so the dict keeps its size
+                            # while it is being iterated)
+                            for (
+                                neighbouring_atom,
+                                list_of_bondtype_counts,
+                            ) in element_2_bond_stats[chem_element][valence_charge][
+                                num_bonds
+                            ][
+                                present_bonds_sorted
+                            ].items():
+                                total_count_of_bonds = sum(
+                                    map(lambda x: x[1], list_of_bondtype_counts)
+                                )
+                                # each value becomes the pair (list of (bond type, count), total count)
+                                element_2_bond_stats[chem_element][valence_charge][
+                                    num_bonds
+                                ][present_bonds_sorted][neighbouring_atom] = (
+                                    list_of_bondtype_counts,
+                                    total_count_of_bonds,
+                                )
+            self.element_2_bond_stats = element_2_bond_stats
+    def sample_formula_with_the_closest_molecular_weight(
+        self, molecular_weight: float
+    ) -> str:
+        """
+        A method sampling chemical formula observed in training data with the closest weight to `molecular_weight`
+        @param molecular_weight: Molecular weight of a structure to be generated
+        """
+        if self.mol_weight_trn_values is None:
+            raise RuntimeError(
+                "For random denovo generation without known formula, the model has to be trained first,"
+                "to record training molecular weights with corresponding formulas."
+            )
+        # finding a place in the sorted array for insertion of the `molecular_weight`, while preserving sorted order
+        idx_of_closest_larger = bisect.bisect_left(
+            self.mol_weight_trn_values, molecular_weight
+        )
+        # check if the exact same molecular weight was observed in training data, otherwise select the closest weight
+        if molecular_weight == self.mol_weight_trn_values[idx_of_closest_larger]:
+            idx_of_closest = idx_of_closest_larger
+        elif idx_of_closest_larger > 0:
+            # determining the closest molecular weight out of both neighbours
+            idx_of_closest_smaller = idx_of_closest_larger - 1
+            weight_difference_with_smaller_neighbour = (
+                molecular_weight - self.mol_weight_trn_values[idx_of_closest_smaller]
+            )
+            weight_difference_with_larger_neighbour = (
+                self.mol_weight_trn_values[idx_of_closest_larger] - molecular_weight
+            )
+            if (
+                weight_difference_with_larger_neighbour
+                < weight_difference_with_smaller_neighbour
+            ):
+                idx_of_closest = idx_of_closest_larger
+            else:
+                idx_of_closest = idx_of_closest_smaller
+        else:
+            idx_of_closest = 0
+        # the value of the molecular weight observed in training labels, which is the closest to `molecular_weight`
+        closest_observed_molecular_weight = self.mol_weight_trn_values[idx_of_closest]
+        # getting chemical formulas observed for this molecular weight
+        # self.mol_weight_2_formulas is a dictionary containing the following mapping
+        #  weight -> [[formula_1, formula_2], [proportion_of_formula_1, proportion_of_formula_2]]
+        feasible_formulas, formula_proportions = self.mol_weight_2_formulas[
+            closest_observed_molecular_weight
+        ]
+        # if just one formula is known, it is returned directly
+        if len(feasible_formulas) == 1:
+            return feasible_formulas[0]
+        # otherwise we randomly sample in accordance with proportions
+        return np.random.choice(feasible_formulas, p=formula_proportions)
+    def step(
+        self, batch: dict, stage: Stage = Stage.NONE
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        mols = batch["mol"]  # List of SMILES of length batch_size
+        # If formula_known is True, we should generate molecules with the same formula as label (`mols` above)
+        # If formula_known is False, we should generate any molecule with the same mass as label
+        # obtaining molecule objects from SMILES
+        molecules = [Chem.MolFromSmiles(smiles) for smiles in mols]
+        # getting the formulas
+        if self.formula_known:
+            formulas = [CalcMolFormula(molecule) for molecule in molecules]
+        else:
+            molecular_weights = [ExactMolWt(molecule) for molecule in molecules]
+            formulas = [
+                self.sample_formula_with_the_closest_molecular_weight(mol_weight)
+                for mol_weight in molecular_weights
+            ]
+        # (bs, k) list of rdkit molecules
+        mols_pred = [
+            self.generate_random_molecule_graphs_via_traversal(formula)
+            for formula in formulas
+        ]
+        for predicted_mol_group in mols_pred:
+            for mol in predicted_mol_group:
+                Chem.RemoveHs(mol)
+        # list of predicted smiles
+        smiles_pred = [
+            [
+                Chem.MolToSmiles(mol_candidate)
+                for mol_candidate in candidates_per_input_mol
+            ]
+            for candidates_per_input_mol in mols_pred
+        ]
+        # Random baseline, so we return a dummy loss
+        loss = torch.tensor(0.0, requires_grad=True)
+        return dict(loss=loss, mols_pred=smiles_pred)
+    def configure_optimizers(self):
+        # No optimizer needed for a random baseline
+        return None
+# element valences taken from sources like https://sciencenotes.org/element-valency-pdf
+# ELEMENT_VALENCES maps an element symbol to a pair of lists of ValenceAndCharge entries:
+#   - the first list contains the typical valences,
+#   - the second list holds further valences (presumably the less common ones) and may be empty.
+# Each ValenceAndCharge is a valence value with the corresponding charge.
+# NOTE(review): the last two keys, "+" and "-", are not element symbols; they carry valence 1 with
+# charge +1 / -1 and presumably stand for explicit charge tokens of a formula - confirm against the parser.
+ELEMENT_VALENCES = {
+    "H": (
+        [ValenceAndCharge(valence=1, charge=0)],
+        [ValenceAndCharge(valence=0, charge=0), ValenceAndCharge(valence=1, charge=-1)],
+    ),
+    "He": ([ValenceAndCharge(valence=0, charge=0)], []),
+    "Li": (
+        [ValenceAndCharge(valence=1, charge=0)],
+        [ValenceAndCharge(valence=1, charge=-1)],
+    ),
+    "Be": ([ValenceAndCharge(valence=2, charge=0)], []),
+    "B": (
+        [ValenceAndCharge(valence=3, charge=0), ValenceAndCharge(valence=4, charge=-1)],
+        [ValenceAndCharge(valence=2, charge=0), ValenceAndCharge(valence=1, charge=0)],
+    ),
+    "C": (
+        [ValenceAndCharge(valence=4, charge=0)],
+        [
+            ValenceAndCharge(valence=3, charge=-1),
+            ValenceAndCharge(valence=2, charge=0),
+            ValenceAndCharge(valence=2, charge=-1),
+            ValenceAndCharge(valence=1, charge=0),
+            ValenceAndCharge(valence=1, charge=-1),
+        ],
+    ),
+    "N": (
+        [ValenceAndCharge(valence=3, charge=0), ValenceAndCharge(valence=4, charge=1)],
+        [
+            ValenceAndCharge(valence=2, charge=-1),
+            ValenceAndCharge(valence=5, charge=0),
+            ValenceAndCharge(valence=1, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+            ValenceAndCharge(valence=1, charge=-1),
+        ],
+    ),
+    "O": (
+        [ValenceAndCharge(valence=2, charge=0), ValenceAndCharge(valence=1, charge=-1)],
+        [ValenceAndCharge(valence=3, charge=1)],
+    ),
+    "F": ([ValenceAndCharge(valence=1, charge=0)], []),
+    "Ne": ([ValenceAndCharge(valence=0, charge=0)], []),
+    "Na": (
+        [ValenceAndCharge(valence=1, charge=0)],
+        [],
+    ),
+    "Mg": ([ValenceAndCharge(valence=2, charge=0)], []),
+    "Al": (
+        [ValenceAndCharge(valence=3, charge=0)],
+        [ValenceAndCharge(valence=1, charge=0)],
+    ),
+    "Si": (
+        [ValenceAndCharge(valence=4, charge=0)],
+        [],
+    ),
+    "P": (
+        [ValenceAndCharge(valence=5, charge=0)],
+        [
+            ValenceAndCharge(valence=4, charge=1),
+            ValenceAndCharge(valence=3, charge=0),
+            ValenceAndCharge(valence=2, charge=0),
+            ValenceAndCharge(valence=1, charge=0),
+        ],
+    ),
+    "S": (
+        [ValenceAndCharge(valence=2, charge=0), ValenceAndCharge(valence=6, charge=0)],
+        [
+            ValenceAndCharge(valence=4, charge=0),
+            ValenceAndCharge(valence=1, charge=-1),
+            ValenceAndCharge(valence=3, charge=1),
+        ],
+    ),
+    "Cl": (
+        [ValenceAndCharge(valence=1, charge=0)],
+        [],
+    ),
+    "Ar": ([ValenceAndCharge(valence=0, charge=0)], []),
+    "K": (
+        [ValenceAndCharge(valence=1, charge=0)],
+        [],
+    ),
+    "Ca": ([ValenceAndCharge(valence=2, charge=0)], []),
+    "Sc": (
+        [ValenceAndCharge(valence=3, charge=0)],
+        [ValenceAndCharge(valence=2, charge=0), ValenceAndCharge(valence=1, charge=0)],
+    ),
+    "Ti": (
+        [ValenceAndCharge(valence=4, charge=0)],
+        [
+            ValenceAndCharge(valence=3, charge=0),
+            ValenceAndCharge(valence=2, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+        ],
+    ),
+    "V": (
+        [
+            ValenceAndCharge(valence=5, charge=0),
+            ValenceAndCharge(valence=4, charge=0),
+            ValenceAndCharge(valence=3, charge=0),
+        ],
+        [
+            ValenceAndCharge(valence=2, charge=0),
+            ValenceAndCharge(valence=1, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+        ],
+    ),
+    "Cr": (
+        [
+            ValenceAndCharge(valence=6, charge=0),
+            ValenceAndCharge(valence=3, charge=0),
+            ValenceAndCharge(valence=2, charge=0),
+        ],
+        [
+            ValenceAndCharge(valence=5, charge=0),
+            ValenceAndCharge(valence=4, charge=0),
+            ValenceAndCharge(valence=1, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+        ],
+    ),
+    "Mn": (
+        [
+            ValenceAndCharge(valence=7, charge=0),
+            ValenceAndCharge(valence=4, charge=0),
+            ValenceAndCharge(valence=2, charge=0),
+        ],
+        [
+            ValenceAndCharge(valence=6, charge=0),
+            ValenceAndCharge(valence=5, charge=0),
+            ValenceAndCharge(valence=3, charge=0),
+            ValenceAndCharge(valence=1, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+        ],
+    ),
+    "Fe": (
+        [ValenceAndCharge(valence=2, charge=0), ValenceAndCharge(valence=3, charge=0)],
+        [
+            ValenceAndCharge(valence=6, charge=0),
+            ValenceAndCharge(valence=5, charge=0),
+            ValenceAndCharge(valence=4, charge=0),
+            ValenceAndCharge(valence=1, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+        ],
+    ),
+    "Co": (
+        [ValenceAndCharge(valence=2, charge=0), ValenceAndCharge(valence=3, charge=0)],
+        [
+            ValenceAndCharge(valence=5, charge=0),
+            ValenceAndCharge(valence=4, charge=0),
+            ValenceAndCharge(valence=1, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+        ],
+    ),
+    "Ni": (
+        [ValenceAndCharge(valence=2, charge=0)],
+        [
+            ValenceAndCharge(valence=6, charge=0),
+            ValenceAndCharge(valence=4, charge=0),
+            ValenceAndCharge(valence=3, charge=0),
+            ValenceAndCharge(valence=1, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+        ],
+    ),
+    "Cu": (
+        [ValenceAndCharge(valence=2, charge=0), ValenceAndCharge(valence=1, charge=0)],
+        [
+            ValenceAndCharge(valence=4, charge=0),
+            ValenceAndCharge(valence=3, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+        ],
+    ),
+    "Zn": (
+        [ValenceAndCharge(valence=2, charge=0)],
+        [ValenceAndCharge(valence=1, charge=0), ValenceAndCharge(valence=0, charge=0)],
+    ),
+    "Ga": (
+        [ValenceAndCharge(valence=3, charge=0)],
+        [ValenceAndCharge(valence=2, charge=0), ValenceAndCharge(valence=1, charge=0)],
+    ),
+    "Ge": (
+        [ValenceAndCharge(valence=4, charge=0)],
+        [
+            ValenceAndCharge(valence=3, charge=0),
+            ValenceAndCharge(valence=2, charge=0),
+            ValenceAndCharge(valence=1, charge=0),
+        ],
+    ),
+    "As": (
+        [ValenceAndCharge(valence=5, charge=0), ValenceAndCharge(valence=4, charge=1)],
+        [],
+    ),
+    "Se": (
+        [ValenceAndCharge(valence=2, charge=0)],
+        [],
+    ),
+    "Br": (
+        [ValenceAndCharge(valence=1, charge=0)],
+        [],
+    ),
+    "Kr": (
+        [ValenceAndCharge(valence=0, charge=0)],
+        [ValenceAndCharge(valence=2, charge=0)],
+    ),
+    "Rb": (
+        [ValenceAndCharge(valence=1, charge=0)],
+        [],
+    ),
+    "Sr": ([ValenceAndCharge(valence=2, charge=0)], []),
+    "Y": (
+        [ValenceAndCharge(valence=3, charge=0)],
+        [ValenceAndCharge(valence=2, charge=0)],
+    ),
+    "Zr": (
+        [ValenceAndCharge(valence=4, charge=0)],
+        [
+            ValenceAndCharge(valence=3, charge=0),
+            ValenceAndCharge(valence=2, charge=0),
+            ValenceAndCharge(valence=1, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+        ],
+    ),
+    "Nb": (
+        [ValenceAndCharge(valence=5, charge=0)],
+        [
+            ValenceAndCharge(valence=4, charge=0),
+            ValenceAndCharge(valence=3, charge=0),
+            ValenceAndCharge(valence=2, charge=0),
+            ValenceAndCharge(valence=1, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+        ],
+    ),
+    "Mo": (
+        [ValenceAndCharge(valence=6, charge=0), ValenceAndCharge(valence=4, charge=0)],
+        [
+            ValenceAndCharge(valence=5, charge=0),
+            ValenceAndCharge(valence=3, charge=0),
+            ValenceAndCharge(valence=2, charge=0),
+            ValenceAndCharge(valence=1, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+        ],
+    ),
+    "Tc": (
+        [ValenceAndCharge(valence=7, charge=0), ValenceAndCharge(valence=4, charge=0)],
+        [
+            ValenceAndCharge(valence=6, charge=0),
+            ValenceAndCharge(valence=5, charge=0),
+            ValenceAndCharge(valence=3, charge=0),
+            ValenceAndCharge(valence=2, charge=0),
+            ValenceAndCharge(valence=1, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+        ],
+    ),
+    "Ru": (
+        [ValenceAndCharge(valence=4, charge=0), ValenceAndCharge(valence=3, charge=0)],
+        [
+            ValenceAndCharge(valence=8, charge=0),
+            ValenceAndCharge(valence=7, charge=0),
+            ValenceAndCharge(valence=6, charge=0),
+            ValenceAndCharge(valence=5, charge=0),
+            ValenceAndCharge(valence=2, charge=0),
+            ValenceAndCharge(valence=1, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+        ],
+    ),
+    "Rh": (
+        [ValenceAndCharge(valence=3, charge=0)],
+        [
+            ValenceAndCharge(valence=6, charge=0),
+            ValenceAndCharge(valence=5, charge=0),
+            ValenceAndCharge(valence=4, charge=0),
+            ValenceAndCharge(valence=2, charge=0),
+            ValenceAndCharge(valence=1, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+        ],
+    ),
+    "Pd": (
+        [ValenceAndCharge(valence=4, charge=0), ValenceAndCharge(valence=2, charge=0)],
+        [ValenceAndCharge(valence=0, charge=0)],
+    ),
+    "Ag": (
+        [ValenceAndCharge(valence=1, charge=0)],
+        [
+            ValenceAndCharge(valence=3, charge=0),
+            ValenceAndCharge(valence=2, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+        ],
+    ),
+    "Cd": (
+        [ValenceAndCharge(valence=2, charge=0)],
+        [ValenceAndCharge(valence=1, charge=0)],
+    ),
+    "In": (
+        [ValenceAndCharge(valence=3, charge=0)],
+        [ValenceAndCharge(valence=2, charge=0), ValenceAndCharge(valence=1, charge=0)],
+    ),
+    "Sn": (
+        [ValenceAndCharge(valence=2, charge=0)],
+        [ValenceAndCharge(valence=4, charge=0)],
+    ),
+    "Sb": (
+        [ValenceAndCharge(valence=3, charge=0)],
+        [ValenceAndCharge(valence=5, charge=0), ValenceAndCharge(valence=3, charge=-1)],
+    ),
+    "Te": (
+        [ValenceAndCharge(valence=4, charge=0)],
+        [ValenceAndCharge(valence=2, charge=0), ValenceAndCharge(valence=6, charge=0)],
+    ),
+    # NOTE(review): (valence=1, charge=0) is listed in both lists for "I" - confirm the duplicate is intended
+    "I": (
+        [ValenceAndCharge(valence=1, charge=0)],
+        [
+            ValenceAndCharge(valence=1, charge=0),
+            ValenceAndCharge(valence=3, charge=0),
+            ValenceAndCharge(valence=7, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+        ],
+    ),
+    "Xe": (
+        [ValenceAndCharge(valence=0, charge=0)],
+        [
+            ValenceAndCharge(valence=2, charge=0),
+            ValenceAndCharge(valence=4, charge=0),
+            ValenceAndCharge(valence=6, charge=0),
+            ValenceAndCharge(valence=8, charge=0),
+        ],
+    ),
+    "Cs": (
+        [ValenceAndCharge(valence=1, charge=0)],
+        [],
+    ),
+    "Ba": ([ValenceAndCharge(valence=2, charge=0)], []),
+    "La": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "Ce": (
+        [ValenceAndCharge(valence=3, charge=0)],
+        [ValenceAndCharge(valence=4, charge=0)],
+    ),
+    "Pr": (
+        [ValenceAndCharge(valence=3, charge=0)],
+        [ValenceAndCharge(valence=4, charge=0)],
+    ),
+    "Nd": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "Pm": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "Sm": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "Eu": (
+        [ValenceAndCharge(valence=3, charge=0)],
+        [ValenceAndCharge(valence=2, charge=0)],
+    ),
+    "Gd": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "Tb": (
+        [ValenceAndCharge(valence=3, charge=0)],
+        [ValenceAndCharge(valence=4, charge=0)],
+    ),
+    "Dy": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "Ho": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "Er": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "Tm": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "Yb": (
+        [ValenceAndCharge(valence=3, charge=0)],
+        [ValenceAndCharge(valence=2, charge=0)],
+    ),
+    "Lu": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "Hf": ([ValenceAndCharge(valence=4, charge=0)], []),
+    "Ta": ([ValenceAndCharge(valence=5, charge=0)], []),
+    "W": (
+        [ValenceAndCharge(valence=6, charge=0), ValenceAndCharge(valence=4, charge=0)],
+        [
+            ValenceAndCharge(valence=5, charge=0),
+            ValenceAndCharge(valence=3, charge=0),
+            ValenceAndCharge(valence=2, charge=0),
+        ],
+    ),
+    "Re": (
+        [
+            ValenceAndCharge(valence=5, charge=0),
+            ValenceAndCharge(valence=4, charge=0),
+            ValenceAndCharge(valence=3, charge=0),
+        ],
+        [
+            ValenceAndCharge(valence=7, charge=0),
+            ValenceAndCharge(valence=6, charge=0),
+            ValenceAndCharge(valence=2, charge=0),
+            ValenceAndCharge(valence=1, charge=0),
+            ValenceAndCharge(valence=0, charge=0),
+        ],
+    ),
+    "Os": (
+        [ValenceAndCharge(valence=4, charge=0)],
+        [
+            ValenceAndCharge(valence=8, charge=0),
+            ValenceAndCharge(valence=6, charge=0),
+            ValenceAndCharge(valence=2, charge=0),
+        ],
+    ),
+    # NOTE(review): (valence=4, charge=0) is listed in both lists for "Ir" - confirm the duplicate is intended
+    "Ir": (
+        [ValenceAndCharge(valence=4, charge=0), ValenceAndCharge(valence=3, charge=0)],
+        [ValenceAndCharge(valence=6, charge=0), ValenceAndCharge(valence=4, charge=0)],
+    ),
+    "Pt": (
+        [ValenceAndCharge(valence=2, charge=0)],
+        [ValenceAndCharge(valence=4, charge=0)],
+    ),
+    "Au": (
+        [ValenceAndCharge(valence=3, charge=0)],
+        [ValenceAndCharge(valence=1, charge=0)],
+    ),
+    "Hg": (
+        [ValenceAndCharge(valence=2, charge=0)],
+        [ValenceAndCharge(valence=1, charge=0)],
+    ),
+    "Tl": (
+        [ValenceAndCharge(valence=3, charge=0)],
+        [ValenceAndCharge(valence=1, charge=0)],
+    ),
+    "Pb": (
+        [ValenceAndCharge(valence=4, charge=0)],
+        [ValenceAndCharge(valence=2, charge=0)],
+    ),
+    "Bi": (
+        [ValenceAndCharge(valence=3, charge=0), ValenceAndCharge(valence=1, charge=0)],
+        [ValenceAndCharge(valence=5, charge=0)],
+    ),
+    "Po": (
+        [ValenceAndCharge(valence=4, charge=0)],
+        [ValenceAndCharge(valence=2, charge=0)],
+    ),
+    "At": (
+        [ValenceAndCharge(valence=1, charge=0)],
+        [
+            ValenceAndCharge(valence=5, charge=0),
+            ValenceAndCharge(valence=3, charge=0),
+            ValenceAndCharge(valence=7, charge=0),
+        ],
+    ),
+    "Rn": (
+        [ValenceAndCharge(valence=0, charge=0)],
+        [ValenceAndCharge(valence=2, charge=0)],
+    ),
+    "Fr": ([ValenceAndCharge(valence=1, charge=0)], []),
+    "Ra": ([ValenceAndCharge(valence=2, charge=0)], []),
+    "Ac": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "Th": ([ValenceAndCharge(valence=4, charge=0)], []),
+    "Pa": (
+        [ValenceAndCharge(valence=5, charge=0)],
+        [ValenceAndCharge(valence=4, charge=0)],
+    ),
+    "U": (
+        [ValenceAndCharge(valence=6, charge=0)],
+        [
+            ValenceAndCharge(valence=5, charge=0),
+            ValenceAndCharge(valence=4, charge=0),
+            ValenceAndCharge(valence=3, charge=0),
+        ],
+    ),
+    "Np": (
+        [ValenceAndCharge(valence=7, charge=0)],
+        [
+            ValenceAndCharge(valence=6, charge=0),
+            ValenceAndCharge(valence=5, charge=0),
+            ValenceAndCharge(valence=4, charge=0),
+            ValenceAndCharge(valence=3, charge=0),
+        ],
+    ),
+    "Pu": (
+        [ValenceAndCharge(valence=7, charge=0), ValenceAndCharge(valence=4, charge=0)],
+        [
+            ValenceAndCharge(valence=6, charge=0),
+            ValenceAndCharge(valence=5, charge=0),
+            ValenceAndCharge(valence=3, charge=0),
+        ],
+    ),
+    "Am": (
+        [ValenceAndCharge(valence=3, charge=0)],
+        [ValenceAndCharge(valence=5, charge=0), ValenceAndCharge(valence=4, charge=0)],
+    ),
+    "Cm": (
+        [
+            ValenceAndCharge(valence=6, charge=0),
+            ValenceAndCharge(valence=5, charge=0),
+            ValenceAndCharge(valence=3, charge=0),
+        ],
+        [],
+    ),
+    "Bk": (
+        [ValenceAndCharge(valence=3, charge=0)],
+        [ValenceAndCharge(valence=4, charge=0)],
+    ),
+    "Cf": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "Es": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "Fm": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "Md": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "No": (
+        [ValenceAndCharge(valence=3, charge=0)],
+        [ValenceAndCharge(valence=2, charge=0)],
+    ),
+    "Lr": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "Rf": ([ValenceAndCharge(valence=4, charge=0)], []),
+    "Db": ([ValenceAndCharge(valence=5, charge=0)], []),
+    "Sg": ([ValenceAndCharge(valence=6, charge=0)], []),
+    "Bh": ([ValenceAndCharge(valence=7, charge=0)], []),
+    "Hs": ([ValenceAndCharge(valence=8, charge=0)], []),
+    "Mt": ([ValenceAndCharge(valence=8, charge=0)], []),
+    "Ds": ([ValenceAndCharge(valence=8, charge=0)], []),
+    "Rg": ([ValenceAndCharge(valence=8, charge=0)], []),
+    "Cn": ([ValenceAndCharge(valence=2, charge=0)], []),
+    "Nh": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "Fl": ([ValenceAndCharge(valence=4, charge=0)], []),
+    "Mc": ([ValenceAndCharge(valence=3, charge=0)], []),
+    "Lv": ([ValenceAndCharge(valence=4, charge=0)], []),
+    "Ts": ([ValenceAndCharge(valence=7, charge=0)], []),
+    "Og": ([ValenceAndCharge(valence=0, charge=0)], []),
+    "+": ([ValenceAndCharge(valence=1, charge=1)], []),
+    "-": ([ValenceAndCharge(valence=1, charge=-1)], []),
+}

massspecgym/models/de_novo/smiles_tranformer.py ADDED Viewed

	@@ -0,0 +1,200 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import typing as T
+from torch_geometric.nn import MLP
+from massspecgym.models.tokenizers import SpecialTokensBaseTokenizer
+from massspecgym.data.transforms import MolToFormulaVector
+from massspecgym.models.base import Stage
+from massspecgym.models.de_novo.base import DeNovoMassSpecGymModel
+from massspecgym.definitions import PAD_TOKEN, SOS_TOKEN, EOS_TOKEN
+class SmilesTransformer(DeNovoMassSpecGymModel):
+    def __init__(
+        self,
+        input_dim: int,
+        d_model: int,
+        nhead: int,
+        num_encoder_layers: int,
+        num_decoder_layers: int,
+        smiles_tokenizer: SpecialTokensBaseTokenizer,
+        start_token: str = SOS_TOKEN,
+        end_token: str = EOS_TOKEN,
+        pad_token: str = PAD_TOKEN,
+        dropout: float = 0.1,
+        max_smiles_len: int = 200,
+        k_predictions: int = 1,
+        temperature: T.Optional[float] = 1.0,
+        pre_norm: bool = False,
+        chemical_formula: bool = False,
+        *args,
+        **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+        self.smiles_tokenizer = smiles_tokenizer
+        self.vocab_size = smiles_tokenizer.get_vocab_size()
+        for token in [start_token, end_token, pad_token]:
+            assert token in smiles_tokenizer.get_vocab(), f"Token {token} not found in tokenizer vocabulary."
+        self.start_token_id = smiles_tokenizer.token_to_id(start_token)
+        self.end_token_id = smiles_tokenizer.token_to_id(end_token)
+        self.pad_token_id = smiles_tokenizer.token_to_id(pad_token)
+        self.d_model = d_model
+        self.max_smiles_len = max_smiles_len
+        self.k_predictions = k_predictions
+        self.temperature = temperature
+        if self.k_predictions == 1:  # TODO: this logic should be changed because sampling with k = 1 also makes sense
+            self.temperature = None
+        self.src_encoder = nn.Linear(input_dim, d_model)
+        self.tgt_embedding = nn.Embedding(self.vocab_size, d_model)
+        self.transformer = nn.Transformer(
+            d_model=d_model,
+            nhead=nhead,
+            num_encoder_layers=num_encoder_layers,
+            num_decoder_layers=num_decoder_layers,
+            dim_feedforward=4 * d_model,
+            dropout=dropout,
+            norm_first=pre_norm
+        )
+        self.tgt_decoder = nn.Linear(d_model, self.vocab_size)
+        self.chemical_formula = chemical_formula
+        if self.chemical_formula:
+            self.formula_mlp = MLP(
+                in_channels=MolToFormulaVector.num_elements(),
+                hidden_channels=MolToFormulaVector.num_elements(),
+                out_channels=d_model,
+                num_layers=1,
+                dropout=dropout,
+                norm=None
+            )
+        self.criterion = nn.CrossEntropyLoss()
+    def forward(self, batch):
+        spec = batch["spec"]  # (batch_size, seq_len, in_dim)
+        smiles = batch["mol"]  # List of SMILES of length batch_size
+        smiles = self.smiles_tokenizer.encode_batch(smiles)
+        smiles = [s.ids for s in smiles]
+        smiles = torch.tensor(smiles, device=spec.device)  # (batch_size, seq_len)
+        # Generating padding masks for variable-length sequences
+        src_key_padding_mask = self.generate_src_padding_mask(spec)
+        tgt_key_padding_mask = self.generate_tgt_padding_mask(smiles)
+        # Create target mask (causal mask)
+        tgt_seq_len = smiles.size(1)
+        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt_seq_len).to(smiles.device)
+        # Preapre inputs for transformer teacher forcing
+        src = spec.permute(1, 0, 2)  # (seq_len, batch_size, in_dim)
+        smiles = smiles.permute(1, 0)  # (seq_len, batch_size)
+        tgt = smiles[:-1, :]
+        tgt_mask = tgt_mask[:-1, :-1]
+        src_key_padding_mask = src_key_padding_mask
+        tgt_key_padding_mask = tgt_key_padding_mask[:, :-1]
+        # Input and output embeddings
+        src = self.src_encoder(src)  # (seq_len, batch_size, d_model)
+        if self.chemical_formula:
+            formula_emb = self.formula_mlp(batch["formula"])  # (batch_size, d_model)
+            src = src + formula_emb.unsqueeze(0)  # (seq_len, batch_size, d_model) + (1, batch_size, d_model)
+        src = src * (self.d_model**0.5)
+        tgt = self.tgt_embedding(tgt) * (self.d_model**0.5)  # (seq_len, batch_size, d_model)
+        # Transformer forward pass
+        memory = self.transformer.encoder(src, src_key_padding_mask=src_key_padding_mask)
+        output = self.transformer.decoder(tgt, memory, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_key_padding_mask)
+        # Logits to vocabulary
+        output = self.tgt_decoder(output)  # (seq_len, batch_size, vocab_size)
+        # Reshape before returning
+        smiles_pred = output.view(-1, self.vocab_size)
+        smiles = smiles[1:, :].contiguous().view(-1)
+        return smiles_pred, smiles
+    def step(self, batch: dict, stage: Stage = Stage.NONE) -> dict:
+        # Forward pass
+        smiles_pred, smiles = self.forward(batch)
+        # Compute loss
+        loss = self.criterion(smiles_pred, smiles)
+        # Generate SMILES strings
+        if stage in self.log_only_loss_at_stages:
+            mols_pred = None
+        else:
+            mols_pred = self.decode_smiles(batch)
+        return dict(loss=loss, mols_pred=mols_pred)
+    def generate_src_padding_mask(self, spec):
+        return spec.sum(-1) == 0
+    def generate_tgt_padding_mask(self, smiles):
+        return smiles == self.pad_token_id
+    def decode_smiles(self, batch):
+        decoded_smiles_str = []
+        for _ in range(self.k_predictions):
+            decoded_smiles = self.greedy_decode(
+                batch,
+                max_len=self.max_smiles_len,
+                temperature=self.temperature,
+            )
+            decoded_smiles = [seq.tolist() for seq in decoded_smiles]
+            decoded_smiles_str.append(self.smiles_tokenizer.decode_batch(decoded_smiles))
+        # Transpose from (k, batch_size) to (batch_size, k)
+        decoded_smiles_str = list(map(list, zip(*decoded_smiles_str)))
+        return decoded_smiles_str
+    def greedy_decode(self, batch, max_len, temperature):
+        with torch.inference_mode():
+            spec = batch["spec"]    # (batch_size, seq_len, in_dim)
+            src_key_padding_mask = self.generate_src_padding_mask(spec)
+            spec = spec.permute(1, 0, 2)  # (seq_len, batch_size, in_dim)
+            src = self.src_encoder(spec)  # (seq_len, batch_size, d_model)
+            if self.chemical_formula:
+                formula_emb = self.formula_mlp(batch["formula"])  # (batch_size, d_model)
+                src = src + formula_emb.unsqueeze(0)  # (seq_len, batch_size, d_model) + (1, batch_size, d_model)
+            src = src * (self.d_model**0.5)
+            memory = self.transformer.encoder(src, src_key_padding_mask=src_key_padding_mask)
+            batch_size = src.size(1)
+            out_tokens = torch.ones(1, batch_size).fill_(self.start_token_id).type(torch.long).to(spec.device)
+            for _ in range(max_len - 1):
+                tgt = self.tgt_embedding(out_tokens) * (self.d_model**0.5)
+                tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(0)).to(src.device)
+                out = self.transformer.decoder(tgt, memory, tgt_mask=tgt_mask)
+                out = self.tgt_decoder(out[-1, :])  # (batch_size, vocab_size)
+                # Select next token
+                if self.temperature is None:
+                    probs = F.softmax(out, dim=-1)
+                    next_token = torch.argmax(probs, dim=-1)  # (batch_size,)
+                else:
+                    probs = F.softmax(out / temperature, dim=-1)
+                    next_token = torch.multinomial(probs, num_samples=1).squeeze(1)  # (batch_size,)
+                next_token = next_token.unsqueeze(0)  # (1, batch_size)
+                out_tokens = torch.cat([out_tokens, next_token], dim=0)
+                if torch.all(next_token == self.end_token_id):
+                    break
+            out_tokens = out_tokens.permute(1, 0)  # (batch_size, seq_len)
+            return out_tokens

massspecgym/models/layers.py ADDED Viewed

	@@ -0,0 +1,101 @@

+# Reproduced from https://github.com/pluskal-lab/DreaMS/blob/main/dreams/models/layers/fourier_features.py
+import torch
+import torch.nn as nn
+from math import ceil
+class FourierFeatures(nn.Module):
+    """
+    A module for generating Fourier features for input data. This module maps input data
+    to a higher-dimensional space using sinusoidal functions, enhancing the representation
+    capabilities for various tasks.
+    Args:
+        strategy (str): Strategy for generating frequency components. Available options are
+            'random', 'voronov_et_al', and 'dreams'. Each option corresponds to a certain paper:
+            - 'random': https://doi.org/10.48550/arXiv.2006.10739.
+            - 'voronov_et_al': https://doi.org/10.48550/arXiv.2207.02980.
+            - 'dreams': https://doi.org/10.26434/chemrxiv-2023-kss3r-v2.
+        x_min (float, optional): The minimum value for generating frequencies. Defaults to 1e-4.
+        x_max (float, optional): The maximum value for generating frequencies. Defaults to 1000.
+        trainable (bool, optional): If True, the frequencies are treated as trainable parameters.
+            Defaults to False.
+        funcs (str, optional): Specifies the trigonometric functions to use. Options are 'both',
+            'sin', and 'cos'. Defaults to 'both'.
+        sigma (float, optional): Standard deviation used for random frequency initialization
+            when strategy is 'random'. Defaults to 10.
+        num_freqs (int, optional): Number of frequency components to generate. Defaults to 512.
+    """
+    def __init__(
+        self,
+        strategy='dreams',
+        x_min=1e-4,
+        x_max=1000,
+        trainable=False,
+        funcs="both",
+        sigma=10,
+        num_freqs=512,
+    ):
+        assert funcs in {"both", "sin", "cos"}, "funcs must be 'both', 'sin', or 'cos'"
+        assert 0 < x_min < 1, "x_min must be a positive fraction"
+        super().__init__()
+        self.funcs = funcs
+        self.strategy = strategy
+        self.trainable = trainable
+        self.num_freqs = num_freqs
+        if strategy == "random":
+            self.b = torch.randn(num_freqs) * sigma
+        elif self.strategy == "voronov_et_al":
+            self.b = torch.tensor(
+                [
+                    1 / (x_min * (x_max / x_min) ** (2 * i / (num_freqs - 2)))
+                    for i in range(1, num_freqs)
+                ],
+            )
+        elif self.strategy == "dreams":
+            self.b = torch.tensor(
+                [1 / (x_min * i) for i in range(2, ceil(1 / x_min), 2)]
+                + [1 / (1 * i) for i in range(2, ceil(x_max), 1)],
+            )
+        else:
+            raise ValueError(f"Unknown strategy: {strategy}")
+        self.b = self.b.unsqueeze(0)
+        self.b = nn.Parameter(self.b, requires_grad=self.trainable)
+        self.register_parameter("Fourier frequencies", self.b)
+    @property
+    def num_features(self):
+        """
+        Returns the number of features generated by the FourierFeatures module.
+        If both sine and cosine functions are used, the number of features is doubled.
+        Returns:
+            int: The number of features.
+        """
+        return self.b.shape[1] if self.funcs != "both" else 2 * self.b.shape[1]
+    def forward(self, x):
+        """
+        Applies the Fourier transformation to the input data.
+        Args:
+            x (torch.Tensor): Input tensor of shape (batch_size, input_dim) to transform.
+        Returns:
+            torch.Tensor: Fourier features.
+        """
+        x = 2 * torch.pi * x @ self.b
+        if self.funcs == "both":
+            x = torch.cat((torch.cos(x), torch.sin(x)), dim=-1)
+        elif self.funcs == "cos":
+            x = torch.cos(x)
+        elif self.funcs == "sin":
+            x = torch.sin(x)
+        return x

massspecgym/models/retrieval/__init__.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from .base import RetrievalMassSpecGymModel
+from .random import RandomRetrieval
+from .deepsets import DeepSetsRetrieval
+from .fingerprint_ffn import FingerprintFFNRetrieval
+from .from_dict import FromDictRetrieval
+# Names exported by the retrieval models package.
+__all__ = [
+    "RetrievalMassSpecGymModel",
+    "RandomRetrieval",
+    "DeepSetsRetrieval",
+    "FingerprintFFNRetrieval",
+    "FromDictRetrieval"
+]

massspecgym/models/retrieval/base.py ADDED Viewed

	@@ -0,0 +1,206 @@

+import typing as T
+from abc import ABC
+import pandas as pd
+import torch
+from torchmetrics import CosineSimilarity, MeanMetric
+from torchmetrics.functional.retrieval import retrieval_hit_rate
+from torch_geometric.utils import unbatch
+from massspecgym.models.base import MassSpecGymModel, Stage
+import massspecgym.utils as utils
+class RetrievalMassSpecGymModel(MassSpecGymModel, ABC):
+    def __init__(
+        self,
+        at_ks: T.Iterable[int] = (1, 5, 20),
+        myopic_mces_kwargs: T.Optional[T.Mapping] = None,
+        *args,
+        **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+        self.at_ks = at_ks
+        self.myopic_mces = utils.MyopicMCES(**(myopic_mces_kwargs or {}))
+    def on_batch_end(
+        self, outputs: T.Any, batch: dict, batch_idx: int, stage: Stage
+    ) -> None:
+        """
+        Compute evaluation metrics for the retrieval model based on the batch and corresponding
+        predictions.
+        """
+        self.log(
+            f"{stage.to_pref()}loss",
+            outputs['loss'],
+            batch_size=batch['spec'].size(0),
+            sync_dist=True,
+            prog_bar=True,
+        )
+        if stage in self.log_only_loss_at_stages:
+            return
+        metric_vals = {}
+        metric_vals |= self.evaluate_retrieval_step(
+            outputs["scores"],
+            batch["labels"],
+            batch["batch_ptr"],
+            stage=stage,
+        )
+        metric_vals |= self.evaluate_mces_at_1(
+            outputs["scores"],
+            batch["labels"],
+            batch["smiles"],
+            batch["candidates_smiles"],
+            batch["batch_ptr"],
+            stage=stage,
+        )
+        if stage == Stage.TEST and self.df_test_path is not None:
+            self._update_df_test(metric_vals)
+    def evaluate_retrieval_step(
+        self,
+        scores: torch.Tensor,
+        labels: torch.Tensor,
+        batch_ptr: torch.Tensor,
+        stage: Stage,
+    ) -> dict[str, torch.Tensor]:
+        """
+        Main evaluation method for the retrieval models. The retrieval step is evaluated by
+        computing the hit rate at different top-k values.
+        Args:
+            scores (torch.Tensor): Concatenated scores for all candidates for all samples in the
+                batch
+            labels (torch.Tensor): Concatenated True/False labels for all candidates for all samples
+                 in the batch
+            batch_ptr (torch.Tensor): Number of each sample's candidates in the concatenated tensors
+        """
+        # Initialize return dictionary to store metric values per sample
+        metric_vals = {}
+        # Evaluate hitrate at different top-k values
+        indexes = utils.batch_ptr_to_batch_idx(batch_ptr)
+        scores = unbatch(scores, indexes)
+        labels = unbatch(labels, indexes)
+        for at_k in self.at_ks:
+            hit_rates = []
+            for scores_sample, labels_sample in zip(scores, labels):
+                hit_rates.append(retrieval_hit_rate(scores_sample, labels_sample, top_k=at_k))
+            hit_rates = torch.tensor(hit_rates, device=batch_ptr.device)
+            metric_name = f"{stage.to_pref()}hit_rate@{at_k}"
+            self._update_metric(
+                metric_name,
+                MeanMetric,
+                (hit_rates,),
+                batch_size=batch_ptr.size(0),
+                bootstrap=stage == Stage.TEST
+            )
+            metric_vals[metric_name] = hit_rates
+        return metric_vals
+    def evaluate_mces_at_1(
+        self,
+        scores: torch.Tensor,
+        labels: torch.Tensor,
+        smiles: list[str],
+        candidates_smiles: list[str],
+        batch_ptr: torch.Tensor,
+        stage: Stage,
+    ) -> dict[str, torch.Tensor]:
+        """
+        TODO
+        """
+        if labels.sum() != len(smiles):
+            raise ValueError("MCES@1 evaluation currently supports exactly 1 positive candidate per sample.")
+        # Initialize return dictionary to store metric values per sample
+        metric_vals = {}
+        # Get top-1 predicted molecules for each ground-truth sample
+        smiles_pred_top_1 = []
+        batch_ptr = torch.cumsum(batch_ptr, dim=0)
+        for i, j in zip(torch.cat([torch.tensor([0], device=batch_ptr.device), batch_ptr]), batch_ptr):
+            scores_sample = scores[i:j]
+            top_1_idx = i + torch.argmax(scores_sample)
+            smiles_pred_top_1.append(candidates_smiles[top_1_idx])
+        # Calculate MCES distance between top-1 predicted molecules and ground truth
+        mces_dists = [
+            self.myopic_mces(sm, sm_pred)
+            for sm, sm_pred in zip(smiles, smiles_pred_top_1)
+        ]
+        mces_dists = torch.tensor(mces_dists, device=scores.device)
+        # Log
+        metric_name = f"{stage.to_pref()}mces@1"
+        self._update_metric(
+            metric_name,
+            MeanMetric,
+            (mces_dists,),
+            batch_size=len(mces_dists),
+            bootstrap=stage == Stage.TEST
+        )
+        metric_vals[metric_name] = mces_dists
+        return metric_vals
+    def evaluate_fingerprint_step(
+        self,
+        y_true: torch.Tensor,
+        y_pred: torch.Tensor,
+        stage: Stage,
+    ) -> None:
+        """
+        Utility evaluation method to assess the quality of predicted fingerprints. This method is
+        not a part of the necessary evaluation logic (not called in the `on_batch_end` method)
+        since retrieval models are not bound to predict fingerprints.
+        Args:
+            y_true (torch.Tensor): [batch_size, fingerprint_size] tensor of true fingerprints
+            y_pred (torch.Tensor): [batch_size, fingerprint_size] tensor of predicted fingerprints
+        """
+        # Cosine similarity between predicted and true fingerprints
+        self._update_metric(
+            f"{stage.to_pref()}fingerprint_cos_sim",
+            CosineSimilarity,
+            (y_pred, y_true),
+            batch_size=y_true.size(0),
+            metric_kwargs=dict(reduction="mean")
+        )
+    def test_step(
+        self,
+        batch: dict,
+        batch_idx: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        outputs = super().test_step(batch, batch_idx)
+        # Get sorted candidate SMILES based on the predicted scores for each sample
+        if self.df_test_path is not None:
+            indexes = utils.batch_ptr_to_batch_idx(batch['batch_ptr'])
+            scores = unbatch(outputs['scores'], indexes)
+            candidates_smiles = utils.unbatch_list(batch['candidates_smiles'], indexes)
+            sorted_candidate_smiles = []
+            for scores_sample, candidates_smiles_sample in zip(scores, candidates_smiles):
+                candidates_smiles_sample = [
+                    x for _, x in sorted(zip(scores_sample, candidates_smiles_sample), reverse=True)
+                ]
+                sorted_candidate_smiles.append(candidates_smiles_sample)
+            self._update_df_test({
+                'identifier': batch['identifier'],
+                'sorted_candidate_smiles': sorted_candidate_smiles
+            })
+        return outputs
+    def on_test_epoch_end(self):
+        # Save test data frame to disk
+        if self.df_test_path is not None:
+            df_test = pd.DataFrame(self.df_test)
+            self.df_test_path.parent.mkdir(parents=True, exist_ok=True)
+            df_test.to_pickle(self.df_test_path)

massspecgym/models/retrieval/deepsets.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import typing as T
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch_geometric.nn import MLP
+from massspecgym.models.base import Stage
+from massspecgym.models.retrieval.base import RetrievalMassSpecGymModel
+from massspecgym.models.layers import FourierFeatures
+from massspecgym.utils import CosSimLoss
+class DeepSetsRetrieval(RetrievalMassSpecGymModel):
+    def __init__(
+        self,
+        in_channels: int = 2,  # m/z and intensity of a peak
+        hidden_channels: int = 512,  # hidden layer size
+        out_channels: int = 4096,  # fingerprint size
+        num_layers_per_mlp: int = 2,
+        dropout: float = 0.0,
+        norm: T.Optional[str] = None,
+        fourier_features: bool = True,
+        fourier_features_mz_channels: T.Optional[int] = None,
+        fourier_features_kwargs: T.Optional[dict] = None,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.fourier_features = fourier_features
+        if fourier_features:
+            if fourier_features_kwargs is None:
+                fourier_features_kwargs = {}
+            self.ff = FourierFeatures(**fourier_features_kwargs)
+            if fourier_features_mz_channels is None:
+                fourier_features_mz_channels = int(0.8 * hidden_channels)
+            else:
+                assert fourier_features_mz_channels < hidden_channels
+            self.ff_proj_mz = nn.Linear(self.ff.num_features, fourier_features_mz_channels)
+            self.ff_proj_i = nn.Linear(1, hidden_channels - fourier_features_mz_channels)
+            in_channels = hidden_channels
+        self.phi = MLP(
+            in_channels=in_channels,
+            hidden_channels=hidden_channels,
+            out_channels=hidden_channels,
+            num_layers=num_layers_per_mlp,
+            dropout=dropout,
+            norm=norm
+        )
+        self.rho = MLP(
+            in_channels=hidden_channels,
+            hidden_channels=hidden_channels,
+            out_channels=out_channels,
+            num_layers=num_layers_per_mlp,
+            dropout=dropout,
+            norm=norm
+        )
+        self.loss_fn = CosSimLoss()
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.fourier_features:
+            x_mz = x[:, :, 0].unsqueeze(-1)
+            x_mz = self.ff(x_mz)
+            x_mz = self.ff_proj_mz(x_mz)
+            x_i = x[:, :, 1].unsqueeze(-1)
+            x_i = self.ff_proj_i(x_i)
+            x = torch.cat((x_mz, x_i), dim=-1)
+        x = self.phi(x)
+        x = x.sum(dim=-2)  # sum over peaks
+        x = self.rho(x)
+        x = F.sigmoid(x)  # predict proper fingerprint
+        return x
+    def step(
+        self, batch: dict, stage: Stage = Stage.NONE
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Unpack inputs
+        x = batch["spec"]
+        fp_true = batch["mol"]
+        cands = batch["candidates"]
+        batch_ptr = batch["batch_ptr"]
+        # Predict fingerprint
+        fp_pred = self.forward(x)
+        # Calculate loss
+        loss = self.loss_fn(fp_true, fp_pred)
+        # Evaluation performance on fingerprint prediction (optional)
+        self.evaluate_fingerprint_step(fp_true, fp_pred, stage=stage)
+        # Calculate final similarity scores between predicted fingerprints and corresponding
+        # candidate fingerprints for retrieval
+        fp_pred_repeated = fp_pred.repeat_interleave(batch_ptr, dim=0)
+        scores = nn.functional.cosine_similarity(fp_pred_repeated, cands)
+        return dict(loss=loss, scores=scores)

massspecgym/models/retrieval/fingerprint_ffn.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import typing as T
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch_geometric.nn import MLP
+from massspecgym.models.base import Stage
+from massspecgym.models.retrieval.base import RetrievalMassSpecGymModel
+from massspecgym.utils import CosSimLoss
+class FingerprintFFNRetrieval(RetrievalMassSpecGymModel):
+    def __init__(
+        self,
+        in_channels: int = 1000,  # number of bins
+        hidden_channels: int = 512,  # hidden layer size
+        out_channels: int = 4096,  # fingerprint size
+        num_layers: int = 2,
+        dropout: float = 0.0,
+        norm: T.Optional[str] = None,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.ffn = MLP(
+            in_channels=in_channels,
+            hidden_channels=hidden_channels,
+            out_channels=out_channels,
+            num_layers=num_layers,
+            dropout=dropout,
+            norm=norm
+        )
+        self.loss_fn = CosSimLoss()
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.ffn(x)
+        x = F.sigmoid(x)  # predict proper fingerprint
+        return x
+    def step(
+        self, batch: dict, stage: Stage = Stage.NONE
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Unpack inputs
+        x = batch["spec"]
+        fp_true = batch["mol"]
+        cands = batch["candidates"]
+        batch_ptr = batch["batch_ptr"]
+        # Predict fingerprint
+        fp_pred = self.forward(x)
+        # Calculate loss
+        loss = self.loss_fn(fp_true, fp_pred)
+        # Evaluation performance on fingerprint prediction (optional)
+        self.evaluate_fingerprint_step(fp_true, fp_pred, stage=stage)
+        # Calculate final similarity scores between predicted fingerprints and corresponding
+        # candidate fingerprints for retrieval
+        fp_pred_repeated = fp_pred.repeat_interleave(batch_ptr, dim=0)
+        scores = nn.functional.cosine_similarity(fp_pred_repeated, cands)
+        return dict(loss=loss, scores=scores)

massspecgym/models/retrieval/from_dict.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import pickle
+import typing as T
+from pathlib import Path
+import torch
+import torch.nn as nn
+from massspecgym.models.base import Stage
+from massspecgym.models.retrieval.base import RetrievalMassSpecGymModel
+class FromDictRetrieval(RetrievalMassSpecGymModel):
+    """
+    Read predictions from dictionary with MassSpecGym ids as keys. Currently, the class
+    only implements reading fingerprints from the dictionary.
+    """
+    def __init__(
+        self,
+        dct: T.Optional[dict[str, T.Any]] = None,
+        dct_path: T.Optional[T.Union[str, Path]] = None,  # pickled dict path
+        *args,
+        **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+        if dct is None and dct_path is None:
+            raise ValueError("Either dct or dct_path must be provided.")
+        if dct is not None and dct_path is not None:
+            raise ValueError("Only one of dct or dct_path must be provided.")
+        if dct_path is not None:
+            with open(dct_path, "rb") as file:
+                dct = pickle.load(file)
+        dct = {k: torch.tensor(v) for k, v in dct.items()}
+        self.dct = dct
+    def step(
+        self, batch: dict, stage: Stage = Stage.NONE
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Unpack inputs
+        ids = batch["identifier"]
+        fp_true = batch["mol"]
+        cands = batch["candidates"]
+        batch_ptr = batch["batch_ptr"]
+        # Read predicted fingerprints from dictionary
+        fp_pred = torch.stack([self.dct[id] for id in ids]).to(fp_true.device)
+        # Evaluation performance on fingerprint prediction (optional)
+        self.evaluate_fingerprint_step(fp_true, fp_pred, stage=stage)
+        # Calculate final similarity scores between predicted fingerprints and corresponding
+        # candidate fingerprints for retrieval
+        fp_pred_repeated = fp_pred.repeat_interleave(batch_ptr, dim=0)
+        scores = nn.functional.cosine_similarity(fp_pred_repeated, cands).to(fp_true.device)
+        # Random baseline, so we return a dummy loss
+        loss = torch.tensor(0.0, requires_grad=True, device=fp_true.device)
+        return dict(loss=loss, scores=scores)
+    def configure_optimizers(self):
+        # No training, so no optimizers
+        return None

massspecgym/models/retrieval/random.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import torch
+from massspecgym.models.base import Stage
+from massspecgym.models.retrieval.base import RetrievalMassSpecGymModel
+class RandomRetrieval(RetrievalMassSpecGymModel):
+    def step(
+        self, batch: dict, stage: Stage = Stage.NONE
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Generate random retrieval scores
+        scores = torch.rand(batch["candidates"].shape[0]).to(self.device)
+        # Random baseline, so we return a dummy loss
+        loss = torch.tensor(0.0, requires_grad=True)
+        return dict(loss=loss, scores=scores)
+    def configure_optimizers(self):
+        # No optimizer needed for a random baseline
+        return None

massspecgym/models/simulation/__init__.py ADDED Viewed

File without changes

massspecgym/models/simulation/base.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import typing as T
+from abc import ABC
+import torch
+import torch.nn as nn
+import pytorch_lightning as pl
+from torchmetrics import RetrievalHitRate, CosineSimilarity
+from massspecgym.models.base import MassSpecGymModel
+class SimulationMassSpecGymModel(MassSpecGymModel, ABC):
+    def on_batch_end(
+        self, outputs: T.Any, batch: dict, batch_idx: int, metric_pref: str = ""
+    ) -> None:
+        """
+        Compute evaluation metrics for the retrieval model based on the batch and corresponding predictions.
+        This method will be used in the `on_train_batch_end`, `on_validation_batch_end`, since `on_test_batch_end` is
+        overriden below.
+        """
+        self.evaluate_cos_similarity_step(
+            outputs["spec_pred"],
+            batch["spec"],
+            metric_pref=metric_pref,
+        )
+    def on_test_batch_end(
+        self, outputs: T.Any, batch: dict, batch_idx: int
+    ) -> None:
+        metric_pref = "_test"
+        self.evaluate_cos_similarity_step(
+            outputs["spec_pred"],
+            batch["spec"],
+            metric_pref=metric_pref
+        )
+        self.evaluate_hit_rate_step(
+            outputs["spec_pred"],
+            batch["spec"],
+            metric_pref=metric_pref
+        )
+    def evaluate_cos_similarity_step(
+        self,
+        specs_pred: torch.Tensor,
+        specs: torch.Tensor,
+        metric_pref: str = ""
+    ) -> None:
+        """
+        Evaulate cosine similarity.
+        """
+        raise NotImplementedError
+    def evaluate_hit_rate_step(
+        self,
+        specs_pred: torch.Tensor,
+        specs: torch.Tensor,
+        metric_pref: str = ""
+    ) -> None:
+        """
+        Evaulate Hit rate @ {1, 5, 20} (typically reported as Accuracy @ {1, 5, 20}).
+        """
+        raise NotImplementedError

massspecgym/models/tokenizers.py ADDED Viewed

	@@ -0,0 +1,156 @@

+import pandas as pd
+import typing as T
+import selfies as sf
+from tokenizers import ByteLevelBPETokenizer
+from tokenizers import Tokenizer, processors, models
+from tokenizers.implementations import BaseTokenizer, ByteLevelBPETokenizer
+import massspecgym.utils as utils
+from massspecgym.definitions import PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN
+class SpecialTokensBaseTokenizer(BaseTokenizer):
+    """Base tokenizer that registers PAD/SOS/EOS/UNK tokens and configures padding, truncation and SOS/EOS wrapping."""
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        max_len: T.Optional[int] = None,
+    ):
+        """
+        Initialize the base tokenizer with special tokens performing padding and truncation.
+        Args:
+            tokenizer (Tokenizer): Underlying `tokenizers.Tokenizer` to wrap.
+            max_len (int, optional): Length passed to padding and used as the truncation limit. If None,
+                no fixed padding length is requested and truncation is left disabled.
+        """
+        super().__init__(tokenizer)
+        # Save essential attributes
+        self.pad_token = PAD_TOKEN
+        self.sos_token = SOS_TOKEN
+        self.eos_token = EOS_TOKEN
+        self.unk_token = UNK_TOKEN
+        self.max_length = max_len
+        # Add special tokens
+        self.add_special_tokens([self.pad_token, self.sos_token, self.eos_token, self.unk_token])
+        # Get token IDs (only available once the special tokens are registered above)
+        self.pad_token_id = self.token_to_id(self.pad_token)
+        self.sos_token_id = self.token_to_id(self.sos_token)
+        self.eos_token_id = self.token_to_id(self.eos_token)
+        self.unk_token_id = self.token_to_id(self.unk_token)
+        # Enable padding
+        self.enable_padding(
+            direction="right",
+            pad_token=self.pad_token,
+            pad_id=self.pad_token_id,
+            length=max_len,
+        )
+        # Enable truncation only when a limit was given: truncation requires an integer length,
+        # so forwarding the default `None` would fail at construction time
+        if max_len is not None:
+            self.enable_truncation(max_len)
+        # Set post-processing to add SOS and EOS tokens
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"{self.sos_token} $A {self.eos_token}",
+            pair=f"{self.sos_token} $A {self.eos_token} {self.sos_token} $B {self.eos_token}",
+            special_tokens=[
+                (self.sos_token, self.sos_token_id),
+                (self.eos_token, self.eos_token_id),
+            ],
+        )
+class SelfiesTokenizer(SpecialTokensBaseTokenizer):
+    """Word-level tokenizer over SELFIES symbols that accepts and returns SMILES strings."""
+    def __init__(
+        self,
+        selfies_train: T.Optional[T.Union[str, T.List[str]]] = None,
+        **kwargs
+    ):
+        """
+        Initialize the SELFIES tokenizer with optional training data to build a vocabulary.
+        Args:
+            selfies_train (str or list of str): Either a list of SELFIES strings to build the vocabulary from,
+                or the string `semantic_robust_alphabet` to use the alphabet returned by
+                `selfies.get_semantic_robust_alphabet()`. If None (or otherwise empty), the MassSpecGym
+                training molecules are loaded and converted to SELFIES.
+            **kwargs: Forwarded to `SpecialTokensBaseTokenizer.__init__`.
+        """
+        if selfies_train == 'semantic_robust_alphabet':
+            symbols = sf.get_semantic_robust_alphabet()
+        elif selfies_train:
+            symbols = sf.get_alphabet_from_selfies(selfies_train)
+        else:
+            # No data supplied: fall back to the training molecules, converted to SELFIES
+            train_mols = utils.load_train_mols()
+            symbols = sf.get_alphabet_from_selfies(
+                [sf.encoder(mol, strict=False) for mol in train_mols]
+            )
+        # Sorting makes the symbol -> id assignment deterministic
+        alphabet = sorted(symbols)
+        vocab = dict(zip(alphabet, range(len(alphabet))))
+        vocab[UNK_TOKEN] = len(vocab)
+        word_level = models.WordLevel(vocab=vocab, unk_token=UNK_TOKEN)
+        super().__init__(Tokenizer(word_level), **kwargs)
+    def encode(self, text: str, add_special_tokens: bool = True) -> Tokenizer:
+        """
+        Encodes a SMILES string into a list of SELFIES token IDs.
+        NOTE(review): the annotated return type is `Tokenizer`, but the value is whatever the parent
+        `encode` returns (presumably a `tokenizers.Encoding`) -- confirm and adjust the annotation.
+        """
+        symbols = list(sf.split_selfies(sf.encoder(text, strict=False)))
+        return super().encode(
+            symbols, is_pretokenized=True, add_special_tokens=add_special_tokens
+        )
+    def decode(self, token_ids: T.List[int], skip_special_tokens: bool = True) -> str:
+        """Decodes a list of SELFIES token IDs back into a SMILES string."""
+        spaced = super().decode(token_ids, skip_special_tokens=skip_special_tokens)
+        return sf.decoder(self._decode_wordlevel_str_to_selfies(spaced))
+    def encode_batch(
+        self, texts: T.List[str], add_special_tokens: bool = True
+    ) -> T.List[Tokenizer]:
+        """Encodes a batch of SMILES strings into a list of SELFIES token IDs."""
+        symbol_lists = []
+        for text in texts:
+            selfies_string = sf.encoder(text, strict=False)
+            symbol_lists.append(list(sf.split_selfies(selfies_string)))
+        return super().encode_batch(
+            symbol_lists, is_pretokenized=True, add_special_tokens=add_special_tokens
+        )
+    def decode_batch(
+        self, token_ids_batch: T.List[T.List[int]], skip_special_tokens: bool = True
+    ) -> T.List[str]:
+        """Decodes a batch of SELFIES token IDs back into SMILES strings."""
+        spaced_strings = super().decode_batch(
+            token_ids_batch, skip_special_tokens=skip_special_tokens
+        )
+        smiles = []
+        for spaced in spaced_strings:
+            smiles.append(sf.decoder(self._decode_wordlevel_str_to_selfies(spaced)))
+        return smiles
+    def _decode_wordlevel_str_to_selfies(self, text: str) -> str:
+        """Converts a WordLevel string back to a SELFIES string by dropping the separating spaces."""
+        return "".join(text.split(" "))
+class SmilesBPETokenizer(SpecialTokensBaseTokenizer):
+    """Byte-level BPE tokenizer for SMILES strings, trained when it is constructed."""
+    def __init__(self, smiles_pth: T.Optional[str] = None, **kwargs):
+        """
+        Initialize the BPE tokenizer for SMILES strings, with optional training data.
+        Args:
+            smiles_pth (str): Path to a file containing SMILES strings to train the tokenizer on. If None
+                (or empty), the unlabeled molecules followed by the MassSpecGym training molecules are used.
+            **kwargs: Forwarded to `SpecialTokensBaseTokenizer.__init__`.
+        """
+        bpe = ByteLevelBPETokenizer()
+        if not smiles_pth:
+            # Default corpus: unlabeled molecules first, then the training fold
+            corpus = utils.load_unlabeled_mols("smiles").tolist() + utils.load_train_mols().tolist()
+            print(f"Training tokenizer on {len(corpus)} SMILES strings.")
+            bpe.train_from_iterator(corpus)
+        else:
+            bpe.train(smiles_pth)
+        super().__init__(bpe, **kwargs)

massspecgym/utils.py ADDED Viewed

	@@ -0,0 +1,484 @@

+import numpy as np
+# import seaborn as sns
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+import matplotlib.colors
+import matplotlib.cm as cm
+import matplotlib.colors as mcolors
+import matplotlib.ticker as ticker
+import pandas as pd
+import typing as T
+import pulp
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from itertools import groupby
+from pathlib import Path
+from myopic_mces.myopic_mces import MCES
+from rdkit.Chem import AllChem as Chem
+from rdkit.Chem import DataStructs, Draw
+from rdkit.Chem.Descriptors import ExactMolWt
+# from huggingface_hub import hf_hub_download
+# from standardizeUtils.standardizeUtils import (
+#     standardize_structure_with_pubchem,
+#     standardize_structure_list_with_pubchem,
+# )
+from torchmetrics.wrappers import BootStrapper
+from torchmetrics.metric import Metric
+def load_massspecgym(fold: T.Optional[str] = None) -> pd.DataFrame:
+    """
+    Load the MassSpecGym dataset as a data frame indexed by "identifier".
+    The "mzs" and "intensities" columns are parsed from comma-separated strings into numpy arrays.
+    Args:
+        fold (str, optional): Fold name to load. If None, the entire dataset is loaded.
+    """
+    table = pd.read_csv(hugging_face_download("MassSpecGym.tsv"), sep="\t").set_index("identifier")
+    for column in ("mzs", "intensities"):
+        table[column] = table[column].apply(parse_spec_array)
+    if fold is None:
+        return table
+    # Keep only the rows that belong to the requested fold
+    return table[table["fold"] == fold]
+def load_unlabeled_mols(col_name: str = "smiles") -> pd.Series:
+    """
+    Load one column of the table of unlabeled molecules.
+    Args:
+        col_name (str, optional): Name of the column to return. Should be one of ["smiles", "selfies"].
+    """
+    tsv_path = hugging_face_download(
+        "molecules/MassSpecGym_molecules_MCES2_disjoint_with_test_fold_4M.tsv"
+    )
+    molecules = pd.read_csv(tsv_path, sep="\t")
+    return molecules[col_name]
+def load_train_mols(col_name: str = "smiles") -> pd.Series:
+    """
+    Load one column of the "train" fold of MassSpecGym.
+    Args:
+        col_name (str, optional): Name of the column to return. Should be one of ["smiles", "selfies"].
+    """
+    train_fold = load_massspecgym("train")
+    return train_fold[col_name]
+def pad_spectrum(
+    spec: np.ndarray, max_n_peaks: int, pad_value: float = 0.0
+) -> np.ndarray:
+    """
+    Pad a spectrum to a fixed number of peaks by appending constant rows to its end.
+    Args:
+        spec (np.ndarray): Spectrum to pad represented as numpy array of shape (n_peaks, 2).
+        max_n_peaks (int): Number of rows (peaks) in the padded spectrum.
+        pad_value (float, optional): Value to use for padding.
+    Raises:
+        ValueError: If the spectrum already has more than `max_n_peaks` peaks.
+    """
+    n_peaks = spec.shape[0]
+    n_missing = max_n_peaks - n_peaks
+    if n_missing < 0:
+        raise ValueError(
+            f"Number of peaks in the spectrum ({n_peaks}) is greater than the maximum number of peaks."
+        )
+    # Rows are appended after the existing peaks; the second axis is left untouched
+    row_padding = (0, n_missing)
+    return np.pad(spec, (row_padding, (0, 0)), mode="constant", constant_values=pad_value)
+def morgan_fp(mol: Chem.Mol, fp_size=2048, radius=2, to_np=True):
+    """
+    Compute Morgan fingerprint for a molecule.
+    Args:
+        mol (Chem.Mol): RDKit molecule to fingerprint.
+        fp_size (int, optional): Size of the fingerprint (number of bits).
+        radius (int, optional): Radius of the fingerprint.
+        to_np (bool, optional): Convert the fingerprint to numpy array. If False, the RDKit bit vector
+            is returned as is.
+    """
+    bit_vect = Chem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=fp_size)
+    if not to_np:
+        return bit_vect
+    # The target array starts empty and is filled in place by the RDKit conversion
+    fp_array = np.zeros((0,), dtype=np.int32)
+    DataStructs.ConvertToNumpyArray(bit_vect, fp_array)
+    return fp_array
+def tanimoto_morgan_similarity(mol1: T.Union[Chem.Mol, str], mol2: T.Union[Chem.Mol, str]) -> float:
+    """
+    Compute Tanimoto similarity between two molecules using Morgan fingerprints.
+    Args:
+        mol1 (T.Union[Chem.Mol, str]): First molecule as RDKit molecule or SMILES string.
+        mol2 (T.Union[Chem.Mol, str]): Second molecule as RDKit molecule or SMILES string.
+    """
+    fingerprints = []
+    for mol in (mol1, mol2):
+        # SMILES inputs are parsed first; RDKit molecules are used directly
+        if isinstance(mol, str):
+            mol = Chem.MolFromSmiles(mol)
+        fingerprints.append(morgan_fp(mol, to_np=False))
+    fp_first, fp_second = fingerprints
+    return DataStructs.TanimotoSimilarity(fp_first, fp_second)
+def standardize_smiles(smiles: T.Union[str, T.List[str]]) -> T.Union[str, T.List[str]]:
+    """
+    Standardize SMILES representation of a molecule using PubChem standardization.
+    NOTE(review): the `standardizeUtils` import that provides the two helpers called here is commented
+    out at the top of this module, so both branches raise NameError unless those names are supplied
+    elsewhere.
+    Args:
+        smiles (str or list of str): A single SMILES string or a list of SMILES strings.
+    Raises:
+        ValueError: If `smiles` is neither a string nor a list.
+    """
+    if isinstance(smiles, str):
+        return standardize_structure_with_pubchem(smiles, 'smiles')
+    if isinstance(smiles, list):
+        return standardize_structure_list_with_pubchem(smiles, 'smiles')
+    raise ValueError("Input should be a SMILES string or a list of SMILES strings.")
+def mol_to_inchi_key(mol: Chem.Mol, twod: bool = True) -> str:
+    """
+    Convert a molecule to InChI Key representation.
+    Args:
+        mol (Chem.Mol): RDKit molecule object.
+        twod (bool, optional): Return 2D InChI Key, i.e. only the part before the first "-"
+            (the first 14 characters of the InChI Key).
+    """
+    full_key = Chem.MolToInchiKey(mol)
+    return full_key.split("-")[0] if twod else full_key
+def smiles_to_inchi_key(mol: str, twod: bool = True) -> str:
+    """
+    Convert a SMILES molecule to InChI Key representation.
+    Args:
+        mol (str): SMILES string.
+        twod (bool, optional): Return 2D InChI Key (first 14 characters of InChI Key).
+    """
+    return mol_to_inchi_key(Chem.MolFromSmiles(mol), twod)
+def hugging_face_download(file_name: str) -> str:
+    """
+    Download a file from the Hugging Face Hub and return its location on disk.
+    NOTE(review): the `hf_hub_download` import at the top of this module is commented out, so this call
+    raises NameError unless the name is supplied elsewhere.
+    Args:
+        file_name (str): Name of the file to download, relative to the "data/" folder of the dataset repo.
+    """
+    remote_path = "data/" + file_name
+    return hf_hub_download(
+        repo_id="roman-bushuiev/MassSpecGym", filename=remote_path, repo_type="dataset"
+    )
+def init_plotting(figsize=(6, 2), font_scale=1.0, style="whitegrid"):
+    """
+    Configure global matplotlib/seaborn defaults (figure size, style, context and color palette).
+    NOTE(review): this function uses `sns`, but the seaborn import at the top of this module is
+    commented out, so calling it raises NameError unless the name is supplied elsewhere.
+    Args:
+        figsize (tuple, optional): Default figure size passed to seaborn as "figure.figsize".
+        font_scale (float, optional): Font scale of the "paper" seaborn context.
+        style (str, optional): Seaborn style name.
+    """
+    # Set default figure size
+    plt.show()  # Does not work without this line for some reason
+    sns.set_theme(rc={"figure.figsize": figsize})
+    # Setting 'none' presumably keeps text as text in exported SVGs -- TODO confirm
+    mpl.rcParams['svg.fonttype'] = 'none'
+    # Set default style and font scale
+    sns.set_style(style)
+    sns.set_context("paper", font_scale=font_scale)
+    sns.set_palette(["#009473", "#D94F70", "#5A5B9F", "#F0C05A", "#7BC4C4", "#FF6F61"])
+def parse_spec_array(arr: str) -> np.ndarray:
+    """Parse a comma-separated string of numbers into a numpy array of floats."""
+    values = [float(token) for token in arr.split(",")]
+    return np.array(values)
+def spec_array_to_str(arr: np.ndarray) -> str:
+    """Serialize the elements of an array into a comma-separated string."""
+    return ",".join(str(value) for value in arr)
+def compute_mass(smiles: str) -> float:
+    """
+    Compute the exact molecular weight of a molecule given as a SMILES string.
+    Args:
+        smiles (str): SMILES string.
+    Raises:
+        ValueError: If RDKit cannot parse the SMILES string.
+    """
+    mol = Chem.MolFromSmiles(smiles)
+    if mol is not None:
+        return ExactMolWt(mol)
+    raise ValueError("Invalid SMILES string.")
+def plot_spectrum(spec, hue=None, xlim=None, ylim=None, mirror_spec=None, highl_idx=None,
+                  figsize=(6, 2), colors=None, save_pth=None):
+    """
+    Plot a spectrum as vertical stems, optionally with a mirrored second spectrum below the x-axis.
+    NOTE(review): this calls `init_plotting`, which relies on `sns` although the seaborn import at the
+    top of this module is commented out.
+    Args:
+        spec: 2D array holding m/z values and intensities; transposed internally if its first
+            dimension is not 2.
+        hue: Optional per-peak values used to color the peaks with the `cool` colormap (adds a colorbar).
+        xlim: Optional (min, max) limits of the x-axis. Defaults to (0, max m/z + 10).
+        ylim: Optional (min, max) limits of the y-axis.
+        mirror_spec: Optional second spectrum drawn downwards in `colors[2]`.
+        highl_idx: Not used in the body.
+        figsize: Figure size forwarded to `init_plotting`.
+        colors: Optional list of at least 3 colors; only indices 0 and 2 are read.
+            Defaults to ['blue', 'green', 'red'].
+        save_pth: Saving is not implemented; any non-None value raises after the figure is drawn.
+    Raises:
+        NotImplementedError: If `save_pth` is not None.
+    """
+    if colors is not None:
+        assert len(colors) >= 3
+    else:
+        colors = ['blue', 'green', 'red']
+    # Normalize input spectrum
+    def norm_spec(spec):
+        assert len(spec.shape) == 2
+        # Bring the array into (2, n_peaks) layout
+        if spec.shape[0] != 2:
+            spec = spec.T
+        mzs, ins = spec[0], spec[1]
+        # Rescale intensities so that the highest peak is 100
+        return mzs, ins / max(ins) * 100
+    mzs, ins = norm_spec(spec)
+    # Initialize plotting
+    init_plotting(figsize=figsize)
+    fig, ax = plt.subplots(1, 1)
+    # Setup color palette
+    if hue is not None:
+        norm = matplotlib.colors.Normalize(vmin=min(hue), vmax=max(hue), clip=True)
+        mapper = cm.ScalarMappable(norm=norm, cmap=cm.cool)
+        plt.colorbar(mapper, ax=ax)
+    # Plot spectrum
+    for i in range(len(mzs)):
+        if hue is not None:
+            color = mcolors.to_hex(mapper.to_rgba(hue[i]))
+        else:
+            color = colors[0]
+        # Each peak is a two-point line from the baseline to its intensity; the marker is drawn on the tip only
+        plt.plot([mzs[i], mzs[i]], [0, ins[i]], color=color, marker='o', markevery=(1, 2), mfc='white', zorder=2)
+    # Plot mirror spectrum
+    if mirror_spec is not None:
+        mzs_m, ins_m = norm_spec(mirror_spec)
+        # Show absolute values on the y-axis so the mirrored (negative) half reads as positive intensity
+        @ticker.FuncFormatter
+        def major_formatter(x, pos):
+            label = str(round(-x)) if x < 0 else str(round(x))
+            return label
+        for i in range(len(mzs_m)):
+            plt.plot([mzs_m[i], mzs_m[i]], [0, -ins_m[i]], color=colors[2], marker='o', markevery=(1, 2), mfc='white',
+                     zorder=1)
+        ax.yaxis.set_major_formatter(major_formatter)
+    # Setup axes
+    if xlim is not None:
+        plt.xlim(xlim[0], xlim[1])
+    else:
+        plt.xlim(0, max(mzs) + 10)
+    if ylim is not None:
+        plt.ylim(ylim[0], ylim[1])
+    plt.xlabel('m/z')
+    plt.ylabel('Intensity [%]')
+    # Saving is a stub: the figure above has already been drawn when this raises
+    if save_pth is not None:
+        raise NotImplementedError()
+def show_mols(mols, legends='new_indices', smiles_in=False, svg=False, sort_by_legend=False, max_mols=500,
+              legend_float_decimals=4, mols_per_row=6, save_pth: T.Optional[Path] = None):
+    """
+    Returns an image representing a grid of skeletal structures of the given molecules. Copy-pasted
+     from https://github.com/pluskal-lab/DreaMS/blob/main/dreams/utils/mols.py
+    :param mols: list of rdkit molecules (or SMILES strings if `smiles_in` is True)
+    :param legends: 'new_indices' (running indices), 'masses' (exact molecular weights), a callable applied
+     to each molecule, or a list of labels whose length must be equal to the length of mols
+    :param smiles_in: True - SMILES inputs, False - RDKit mols
+    :param svg: True - return svg image, False - return png image
+    :param sort_by_legend: True - sort molecules by legend values
+    :param max_mols: maximum number of molecules to show
+    :param legend_float_decimals: number of decimal places to show for float legends
+    :param mols_per_row: number of molecules per row to show
+    :param save_pth: path to write `img.data` to in text mode (presumably only meaningful for svg output)
+    """
+    if smiles_in:
+        mols = list(map(Chem.MolFromSmiles, mols))
+    # Resolve the legend specification into one value per molecule
+    if legends == 'new_indices':
+        legends = list(range(len(mols)))
+    elif legends == 'masses':
+        legends = list(map(ExactMolWt, mols))
+    elif callable(legends):
+        legends = list(map(legends, mols))
+    if sort_by_legend:
+        order = np.argsort(legends).tolist()
+        legends = [legends[k] for k in order]
+        mols = [mols[k] for k in order]
+    def _as_label(value):
+        # Floats get a fixed number of decimals, everything else its plain string form
+        if isinstance(value, float):
+            return f'{value:.{legend_float_decimals}f}'
+        return str(value)
+    legends = [_as_label(value) for value in legends]
+    img = Draw.MolsToGridImage(
+        mols,
+        maxMols=max_mols,
+        legends=legends,
+        molsPerRow=min(max_mols, mols_per_row),
+        useSVG=svg,
+        returnPNG=False,
+    )
+    if save_pth:
+        with open(save_pth, 'w') as out_file:
+            out_file.write(img.data)
+    return img
+class MyopicMCES():
+    """Callable wrapper around `myopic_mces.MCES` that returns the MCES distance between two SMILES strings."""
+    def __init__(
+        self,
+        ind: int = 0,  # dummy index
+        solver: T.Optional[str] = None,  # None selects the first available PuLP solver
+        threshold: int = 15,  # MCES threshold
+        always_stronger_bound: bool = True, # "False" makes computations a lot faster, but leads to overall higher MCES values
+        solver_options: dict = None
+    ):
+        """
+        Store the MCES configuration.
+        Args:
+            ind (int, optional): Dummy index. Stored for backward compatibility, not forwarded to `MCES`.
+            solver (str, optional): Name of the ILP solver. If None, the first solver reported by
+                `pulp.listSolvers(onlyAvailable=True)` is used. The lookup happens here rather than in the
+                signature so that importing this module neither queries PuLP nor fails when no solver
+                is installed.
+            threshold (int, optional): MCES threshold.
+            always_stronger_bound (bool, optional): Forwarded to `MCES`.
+            solver_options (dict, optional): Options for the solver. Defaults to `dict(msg=0)`.
+        Raises:
+            RuntimeError: If `solver` is None and PuLP reports no available solver.
+        """
+        if solver is None:
+            available_solvers = pulp.listSolvers(onlyAvailable=True)
+            if not available_solvers:
+                raise RuntimeError(
+                    "No PuLP solver is available; install one or pass `solver` explicitly."
+                )
+            solver = available_solvers[0]
+        self.ind = ind
+        self.solver = solver
+        self.threshold = threshold
+        self.always_stronger_bound = always_stronger_bound
+        if solver_options is None:
+            solver_options = dict(msg=0)  # make ILP solver silent
+        self.solver_options = solver_options
+    def __call__(self, smiles_1: str, smiles_2: str) -> float:
+        """
+        Compute the MCES distance between two molecules.
+        Args:
+            smiles_1 (str): SMILES string of the first molecule.
+            smiles_2 (str): SMILES string of the second molecule.
+        """
+        retval = MCES(
+            smiles_1,
+            smiles_2,
+            threshold=self.threshold,
+            always_stronger_bound=self.always_stronger_bound,
+            solver=self.solver,
+            solver_options=self.solver_options
+        )
+        # The distance is the second element of the value returned by `MCES`
+        return retval[1]
+class ReturnScalarBootStrapper(BootStrapper):
+    """`BootStrapper` variant whose `compute` returns one entry of the result instead of the whole dictionary."""
+    def __init__(
+        self,
+        base_metric: Metric,
+        num_bootstraps: int = 10,
+        mean: bool = False,
+        std: bool = False,
+        quantile: T.Optional[T.Union[float, torch.Tensor]] = None,
+        raw: bool = False,
+        sampling_strategy: str = "poisson",
+        **kwargs: T.Any
+    ) -> None:
+        """
+        Wrapper for BootStrapper that returns a scalar value in compute instead of a dictionary.
+        Exactly one of `mean`, `std`, `quantile` and `raw` has to be requested, and currently only `std`
+        is supported. All arguments are forwarded unchanged to `BootStrapper`.
+        Raises:
+            ValueError: If the number of requested statistics is not exactly one.
+            NotImplementedError: If the requested statistic is anything other than `std`.
+        """
+        n_requested = sum((mean, std, bool(quantile), raw))
+        if n_requested != 1:
+            raise ValueError("Exactly one of mean, std, quantile or raw should be True.")
+        if not std:
+            raise NotImplementedError("Currently only std is implemented.")
+        # Key of the parent's result dictionary that `compute` returns
+        self.compute_key = "std"
+        super().__init__(
+            base_metric=base_metric,
+            num_bootstraps=num_bootstraps,
+            mean=mean,
+            std=std,
+            quantile=quantile,
+            raw=raw,
+            sampling_strategy=sampling_strategy,
+            **kwargs
+        )
+    def compute(self):
+        """Return the `compute_key` entry of the dictionary computed by the parent class."""
+        results = super().compute()
+        return results[self.compute_key]
+def batch_ptr_to_batch_idx(batch_ptr: torch.Tensor) -> torch.Tensor:
+    """
+    Convert a tensor of batch pointers (number of items per sample) to a tensor of batch indexes.
+    For example [1, 3, 2] -> [0, 1, 1, 1, 2, 2]
+    Args:
+        batch_ptr (Tensor): Tensor of batch pointers.
+    """
+    n_samples = batch_ptr.shape[0]
+    sample_ids = torch.arange(n_samples, device=batch_ptr.device)
+    # Repeat the i-th sample id batch_ptr[i] times
+    return sample_ids.repeat_interleave(batch_ptr)
+def unbatch_list(batch_list: list, batch_idx: torch.Tensor) -> list:
+    """
+    Unbatch a flat list of items into one sub-list per sample, using the batch index of every item.
+    The number of returned sub-lists is `batch_idx[-1] + 1`, so `batch_idx` is expected to be sorted
+    in non-decreasing order. Items are distributed in a single pass instead of rescanning the whole
+    list once per sample.
+    Args:
+        batch_list (list): List of items to unbatch.
+        batch_idx (Tensor): Tensor of batch indexes, one per item of `batch_list`.
+    Returns:
+        list: List of lists; the i-th entry holds the items whose batch index equals i. An empty
+            `batch_idx` yields an empty list.
+    """
+    # Plain Python ints avoid one tensor comparison per (item, sample) pair
+    ids = batch_idx.tolist() if hasattr(batch_idx, "tolist") else list(batch_idx)
+    if not ids:
+        return []
+    n_samples = ids[-1] + 1
+    if n_samples <= 0:
+        return []
+    groups = [[] for _ in range(n_samples)]
+    for position, item in enumerate(batch_list):
+        sample = ids[position]
+        # Indexes outside [0, n_samples) match no sample and are skipped
+        if 0 <= sample < n_samples:
+            groups[sample].append(item)
+    return groups
+class CosSimLoss(nn.Module):
+    """Loss defined as one minus the mean cosine similarity between inputs and targets."""
+    def __init__(self):
+        super().__init__()
+    def forward(self, inputs, targets):
+        """Return `1 - mean(cosine_similarity(inputs, targets))`."""
+        similarity = F.cosine_similarity(inputs, targets)
+        return 1 - similarity.mean()
+def parse_sirius_ms(spectra_file: str) -> T.Tuple[dict, T.List[T.Tuple[str, np.ndarray]]]:
+    """
+    Parses spectra from the SIRIUS .ms file.
+    Copied from the code of Goldman et al.:
+    https://github.com/samgoldman97/mist/blob/4c23d34fc82425ad5474a53e10b4622dcdbca479/src/mist/utils/parse_utils.py#LL10C77-L10C77.
+    The file is read as alternating runs of header lines (starting with ">" or "#") and data lines.
+    The first header run is parsed as metadata; every later header run names a spectrum whose peaks
+    are taken from the data run that follows it.
+    :param spectra_file: path to the .ms file
+    :return T.Tuple[dict, T.List[T.Tuple[str, np.ndarray]]]: metadata and list of spectra tuples containing name and array
+    """
+    # Read through a context manager so the file handle is always closed
+    with open(spectra_file, "r") as handle:
+        lines = [line.strip() for line in handle]
+    metadata = {}
+    spectras = []
+    runs = groupby(lines, lambda line: line.startswith((">", "#")))
+    for group_num, (_, header_run) in enumerate(runs):
+        group_lines = list(header_run)
+        # A header run at the very end of the file has no data run after it; use an empty one
+        # instead of letting StopIteration escape to the caller
+        subject_lines = list(next(runs, (False, ()))[1])
+        # Get spectra
+        if group_num > 0:
+            spectra_header = group_lines[0].split(">")[1]
+            # Only the first two whitespace-separated columns of each data line are used
+            peak_data = [
+                [float(x) for x in peak.split()[:2]]
+                for peak in subject_lines
+                if peak.strip()
+            ]
+            # Check if spectra is empty
+            if len(peak_data):
+                spectras.append((spectra_header, np.vstack(peak_data)))
+        # Get meta data
+        else:
+            entries = {}
+            for entry in group_lines:
+                if " " not in entry:
+                    continue
+                if entry.startswith("#INSTRUMENT TYPE"):
+                    # This key contains a space itself, so it cannot be split on the first space
+                    key = "#INSTRUMENT TYPE"
+                    entries[key[1:]] = entry.split(key)[1].strip()
+                else:
+                    name, value = entry.split(" ", 1)
+                    name = name[1:]
+                    # Repeated keys are kept by appending apostrophes until the name is unique
+                    while name in entries:
+                        name = f"{name}'"
+                    entries[name] = value
+            metadata.update(entries)
+    metadata["_FILE_PATH"] = spectra_file
+    metadata["_FILE"] = Path(spectra_file).stem
+    return metadata, spectras