Spaces:

liuganghuggingface
/

Polymer-Design-With-GraphDiT

Runtime error

App Files Community

liuganghuggingface committed on Oct 7, 2024

Commit

7c67898

verified ·

1 Parent(s): 71df9ee

Update graph_decoder/diffusion_model.py

Browse files

Files changed (1) hide show

graph_decoder/diffusion_model.py +354 -354

graph_decoder/diffusion_model.py CHANGED Viewed

@@ -11,25 +11,6 @@ from .transformer import Transformer
 from .molecule_utils import graph_to_smiles, check_valid
 from .visualize_utils import MolecularVisualization
-class GraphDiT(nn.Module):
-    def __init__(
-        self,
-        model_config_path,
-        data_info_path,
-        model_dtype,
-    ):
-        super().__init__()
-    def init_model(self, model_dir):
-        pass
-    def disable_grads(self):
-        pass
-    def generate(self, properties, guide_scale, num_nodes, number_chain_steps):
-        return 0, 0
 # class GraphDiT(nn.Module):
 #     def __init__(
 #         self,
@@ -38,346 +19,365 @@ class GraphDiT(nn.Module):
 #         model_dtype,
 #     ):
 #         super().__init__()
-#         dm_cfg, data_info = utils.load_config(model_config_path, data_info_path)
-#         input_dims = data_info.input_dims
-#         output_dims = data_info.output_dims
-#         nodes_dist = data_info.nodes_dist
-#         active_index = data_info.active_index
-#         self.model_config = dm_cfg
-#         self.data_info = data_info
-#         self.T = dm_cfg.diffusion_steps
-#         self.Xdim = input_dims["X"]
-#         self.Edim = input_dims["E"]
-#         self.ydim = input_dims["y"]
-#         self.Xdim_output = output_dims["X"]
-#         self.Edim_output = output_dims["E"]
-#         self.ydim_output = output_dims["y"]
-#         self.node_dist = nodes_dist
-#         self.active_index = active_index
-#         self.max_n_nodes = data_info.max_n_nodes
-#         self.atom_decoder = data_info.atom_decoder
-#         self.hidden_size = dm_cfg.hidden_size
-#         self.mol_visualizer = MolecularVisualization(self.atom_decoder)
-#         self.denoiser = Transformer(
-#             max_n_nodes=self.max_n_nodes,
-#             hidden_size=dm_cfg.hidden_size,
-#             depth=dm_cfg.depth,
-#             num_heads=dm_cfg.num_heads,
-#             mlp_ratio=dm_cfg.mlp_ratio,
-#             drop_condition=dm_cfg.drop_condition,
-#             Xdim=self.Xdim,
-#             Edim=self.Edim,
-#             ydim=self.ydim,
-#         )
-#         self.model_dtype = model_dtype
-#         self.noise_schedule = utils.PredefinedNoiseScheduleDiscrete(
-#             dm_cfg.diffusion_noise_schedule, timesteps=dm_cfg.diffusion_steps
-#         )
-#         x_marginals = data_info.node_types.to(self.model_dtype) / torch.sum(
-#             data_info.node_types.to(self.model_dtype)
-#         )
-#         e_marginals = data_info.edge_types.to(self.model_dtype) / torch.sum(
-#             data_info.edge_types.to(self.model_dtype)
-#         )
-#         x_marginals = x_marginals / x_marginals.sum()
-#         e_marginals = e_marginals / e_marginals.sum()
-#         xe_conditions = data_info.transition_E.to(self.model_dtype)
-#         xe_conditions = xe_conditions[self.active_index][:, self.active_index]
-#         xe_conditions = xe_conditions.sum(dim=1)
-#         ex_conditions = xe_conditions.t()
-#         xe_conditions = xe_conditions / xe_conditions.sum(dim=-1, keepdim=True)
-#         ex_conditions = ex_conditions / ex_conditions.sum(dim=-1, keepdim=True)
-#         self.transition_model = utils.MarginalTransition(
-#             x_marginals=x_marginals,
-#             e_marginals=e_marginals,
-#             xe_conditions=xe_conditions,
-#             ex_conditions=ex_conditions,
-#             y_classes=self.ydim_output,
-#             n_nodes=self.max_n_nodes,
-#         )
-#         self.limit_dist = utils.PlaceHolder(X=x_marginals, E=e_marginals, y=None)
 #     def init_model(self, model_dir):
-#         model_file = os.path.join(model_dir, 'model.pt')
-#         if os.path.exists(model_file):
-#             self.denoiser.load_state_dict(torch.load(model_file, map_location='cpu', weights_only=True))
-#         else:
-#             raise FileNotFoundError(f"Model file not found: {model_file}")
 #     def disable_grads(self):
-#         self.denoiser.disable_grads()
-#     def forward(
-#         self, x, edge_index, edge_attr, graph_batch, properties, no_label_index
-#     ):
-#         raise ValueError('Not Implement')
-#     def _forward(self, noisy_data, unconditioned=False):
-#         noisy_x, noisy_e, properties = (
-#             noisy_data["X_t"].to(self.model_dtype),
-#             noisy_data["E_t"].to(self.model_dtype),
-#             noisy_data["y_t"].to(self.model_dtype).clone(),
-#         )
-#         node_mask, timestep = (
-#             noisy_data["node_mask"],
-#             noisy_data["t"],
-#         )
-#         pred = self.denoiser(
-#             noisy_x,
-#             noisy_e,
-#             node_mask,
-#             properties,
-#             timestep,
-#             unconditioned=unconditioned,
-#         )
-#         return pred
-#     def apply_noise(self, X, E, y, node_mask):
-#         """Sample noise and apply it to the data."""
-#         # Sample a timestep t.
-#         # When evaluating, the loss for t=0 is computed separately
-#         lowest_t = 0 if self.training else 1
-#         t_int = torch.randint(
-#             lowest_t, self.T + 1, size=(X.size(0), 1), device=X.device
-#         ).to(
-#             self.model_dtype
-#         )  # (bs, 1)
-#         s_int = t_int - 1
-#         t_float = t_int / self.T
-#         s_float = s_int / self.T
-#         # beta_t and alpha_s_bar are used for denoising/loss computation
-#         beta_t = self.noise_schedule(t_normalized=t_float)  # (bs, 1)
-#         alpha_s_bar = self.noise_schedule.get_alpha_bar(t_normalized=s_float)  # (bs, 1)
-#         alpha_t_bar = self.noise_schedule.get_alpha_bar(t_normalized=t_float)  # (bs, 1)
-#         Qtb = self.transition_model.get_Qt_bar(
-#             alpha_t_bar, X.device
-#         )  # (bs, dx_in, dx_out), (bs, de_in, de_out)
-#         bs, n, d = X.shape
-#         X_all = torch.cat([X, E.reshape(bs, n, -1)], dim=-1)
-#         prob_all = X_all @ Qtb.X
-#         probX = prob_all[:, :, : self.Xdim_output]
-#         probE = prob_all[:, :, self.Xdim_output :].reshape(bs, n, n, -1)
-#         sampled_t = utils.sample_discrete_features(
-#             probX=probX, probE=probE, node_mask=node_mask
-#         )
-#         X_t = F.one_hot(sampled_t.X, num_classes=self.Xdim_output)
-#         E_t = F.one_hot(sampled_t.E, num_classes=self.Edim_output)
-#         assert (X.shape == X_t.shape) and (E.shape == E_t.shape)
-#         y_t = y
-#         z_t = utils.PlaceHolder(X=X_t, E=E_t, y=y_t).type_as(X_t).mask(node_mask)
-#         noisy_data = {
-#             "t_int": t_int,
-#             "t": t_float,
-#             "beta_t": beta_t,
-#             "alpha_s_bar": alpha_s_bar,
-#             "alpha_t_bar": alpha_t_bar,
-#             "X_t": z_t.X,
-#             "E_t": z_t.E,
-#             "y_t": z_t.y,
-#             "node_mask": node_mask,
-#         }
-#         return noisy_data
-#     @torch.no_grad()
-#     def generate(
-#         self,
-#         properties,
-#         guide_scale=1.,
-#         num_nodes=None,
-#         number_chain_steps=50,
-#     ):
-#         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-#         properties = [float('nan') if x is None else x for x in properties]
-#         properties = torch.tensor(properties, dtype=torch.float).reshape(1, -1).to(device)
-#         batch_size = properties.size(0)
-#         assert batch_size == 1
-#         if num_nodes is None:
-#             num_nodes = self.node_dist.sample_n(batch_size, device)
-#         else:
-#             num_nodes = torch.LongTensor([num_nodes]).to(device)
-#         arange = (
-#             torch.arange(self.max_n_nodes, device=device)
-#             .unsqueeze(0)
-#             .expand(batch_size, -1)
-#         )
-#         node_mask = arange < num_nodes.unsqueeze(1)
-#         z_T = utils.sample_discrete_feature_noise(
-#             limit_dist=self.limit_dist, node_mask=node_mask
-#         )
-#         X, E = z_T.X, z_T.E
-#         assert (E == torch.transpose(E, 1, 2)).all()
-#         if number_chain_steps > 0:
-#             chain_X_size = torch.Size((number_chain_steps, X.size(1)))
-#             chain_E_size = torch.Size((number_chain_steps, E.size(1), E.size(2)))
-#             chain_X = torch.zeros(chain_X_size)
-#             chain_E = torch.zeros(chain_E_size)
-#         # Iteratively sample p(z_s | z_t) for t = 1, ..., T, with s = t - 1.
-#         y = properties
-#         for s_int in reversed(range(0, self.T)):
-#             s_array = s_int * torch.ones((batch_size, 1)).type_as(y)
-#             t_array = s_array + 1
-#             s_norm = s_array / self.T
-#             t_norm = t_array / self.T
-#             # Sample z_s
-#             sampled_s, discrete_sampled_s = self.sample_p_zs_given_zt(
-#                 s_norm, t_norm, X, E, y, node_mask, guide_scale, device
-#             )
-#             X, E, y = sampled_s.X, sampled_s.E, sampled_s.y
-#             if number_chain_steps > 0:
-#                 # Save the first keep_chain graphs
-#                 write_index = (s_int * number_chain_steps) // self.T
-#                 chain_X[write_index] = discrete_sampled_s.X[:1]
-#                 chain_E[write_index] = discrete_sampled_s.E[:1]
-#         # Sample
-#         sampled_s = sampled_s.mask(node_mask, collapse=True)
-#         X, E, y = sampled_s.X, sampled_s.E, sampled_s.y
-#         molecule_list = []
-#         n = num_nodes[0]
-#         atom_types = X[0, :n].cpu()
-#         edge_types = E[0, :n, :n].cpu()
-#         molecule_list.append([atom_types, edge_types])
-#         smiles = graph_to_smiles(molecule_list, self.atom_decoder)[0]
-#         # Visualize Chains
-#         if number_chain_steps > 0:
-#             final_X_chain = X[:1]
-#             final_E_chain = E[:1]
-#             chain_X[0] = final_X_chain                  # Overwrite last frame with the resulting X, E
-#             chain_E[0] = final_E_chain
-#             chain_X = utils.reverse_tensor(chain_X)
-#             chain_E = utils.reverse_tensor(chain_E)
-#             # Repeat last frame to see final sample better
-#             chain_X = torch.cat([chain_X, chain_X[-1:].repeat(10, 1)], dim=0)
-#             chain_E = torch.cat([chain_E, chain_E[-1:].repeat(10, 1, 1)], dim=0)
-#             mol_img_list = self.mol_visualizer.visualize_chain(chain_X.numpy(), chain_E.numpy())
-#         else:
-#             mol_img_list = []
-#         return smiles, mol_img_list
-#     def check_valid(self, smiles):
-#         return check_valid(smiles)
-#     def sample_p_zs_given_zt(
-#         self, s, t, X_t, E_t, properties, node_mask, guide_scale, device
-#     ):
-#         """Samples from zs ~ p(zs | zt). Only used during sampling.
-#         if last_step, return the graph prediction as well"""
-#         bs, n, _ = X_t.shape
-#         beta_t = self.noise_schedule(t_normalized=t)  # (bs, 1)
-#         alpha_s_bar = self.noise_schedule.get_alpha_bar(t_normalized=s)
-#         alpha_t_bar = self.noise_schedule.get_alpha_bar(t_normalized=t)
-#         # Neural net predictions
-#         noisy_data = {
-#             "X_t": X_t,
-#             "E_t": E_t,
-#             "y_t": properties,
-#             "t": t,
-#             "node_mask": node_mask,
-#         }
-#         def get_prob(noisy_data, unconditioned=False):
-#             pred = self._forward(noisy_data, unconditioned=unconditioned)
-#             # Normalize predictions
-#             pred_X = F.softmax(pred.X, dim=-1)  # bs, n, d0
-#             pred_E = F.softmax(pred.E, dim=-1)  # bs, n, n, d0
-#             # Retrieve transitions matrix
-#             Qtb = self.transition_model.get_Qt_bar(alpha_t_bar, device)
-#             Qsb = self.transition_model.get_Qt_bar(alpha_s_bar, device)
-#             Qt = self.transition_model.get_Qt(beta_t, device)
-#             Xt_all = torch.cat([X_t, E_t.reshape(bs, n, -1)], dim=-1)
-#             predX_all = torch.cat([pred_X, pred_E.reshape(bs, n, -1)], dim=-1)
-#             unnormalized_probX_all = utils.reverse_diffusion(
-#                 predX_0=predX_all, X_t=Xt_all, Qt=Qt.X, Qsb=Qsb.X, Qtb=Qtb.X
-#             )
-#             unnormalized_prob_X = unnormalized_probX_all[:, :, : self.Xdim_output]
-#             unnormalized_prob_E = unnormalized_probX_all[
-#                 :, :, self.Xdim_output :
-#             ].reshape(bs, n * n, -1)
-#             unnormalized_prob_X[torch.sum(unnormalized_prob_X, dim=-1) == 0] = 1e-5
-#             unnormalized_prob_E[torch.sum(unnormalized_prob_E, dim=-1) == 0] = 1e-5
-#             prob_X = unnormalized_prob_X / torch.sum(
-#                 unnormalized_prob_X, dim=-1, keepdim=True
-#             )  # bs, n, d_t-1
-#             prob_E = unnormalized_prob_E / torch.sum(
-#                 unnormalized_prob_E, dim=-1, keepdim=True
-#             )  # bs, n, d_t-1
-#             prob_E = prob_E.reshape(bs, n, n, pred_E.shape[-1])
-#             return prob_X, prob_E
-#         prob_X, prob_E = get_prob(noisy_data)
-#         ### Guidance
-#         if guide_scale != 1:
-#             uncon_prob_X, uncon_prob_E = get_prob(
-#                 noisy_data, unconditioned=True
-#             )
-#             prob_X = (
-#                 uncon_prob_X
-#                 * (prob_X / uncon_prob_X.clamp_min(1e-5)) ** guide_scale
-#             )
-#             prob_E = (
-#                 uncon_prob_E
-#                 * (prob_E / uncon_prob_E.clamp_min(1e-5)) ** guide_scale
-#             )
-#             prob_X = prob_X / prob_X.sum(dim=-1, keepdim=True).clamp_min(1e-5)
-#             prob_E = prob_E / prob_E.sum(dim=-1, keepdim=True).clamp_min(1e-5)
-#         # assert ((prob_X.sum(dim=-1) - 1).abs() < 1e-3).all()
-#         # assert ((prob_E.sum(dim=-1) - 1).abs() < 1e-3).all()
-#         sampled_s = utils.sample_discrete_features(
-#             prob_X, prob_E, node_mask=node_mask, step=s[0, 0].item()
-#         )
-#         X_s = F.one_hot(sampled_s.X, num_classes=self.Xdim_output).to(self.model_dtype)
-#         E_s = F.one_hot(sampled_s.E, num_classes=self.Edim_output).to(self.model_dtype)
-#         assert (E_s == torch.transpose(E_s, 1, 2)).all()
-#         assert (X_t.shape == X_s.shape) and (E_t.shape == E_s.shape)
-#         out_one_hot = utils.PlaceHolder(X=X_s, E=E_s, y=properties)
-#         out_discrete = utils.PlaceHolder(X=X_s, E=E_s, y=properties)
-#         return out_one_hot.mask(node_mask).type_as(properties), out_discrete.mask(
-#             node_mask, collapse=True
-#         ).type_as(properties)

 from .molecule_utils import graph_to_smiles, check_valid
 from .visualize_utils import MolecularVisualization
 # class GraphDiT(nn.Module):
 #     def __init__(
 #         self,
 #         model_dtype,
 #     ):
 #         super().__init__()
 #     def init_model(self, model_dir):
+#         pass
 #     def disable_grads(self):
+#         pass
+#     def generate(self, properties, guide_scale, num_nodes, number_chain_steps):
+#         return 0, 0
+class GraphDiT(nn.Module):
+    def __init__(
+        self,
+        model_config_path,
+        data_info_path,
+        model_dtype,
+    ):
+        super().__init__()
+        dm_cfg, data_info = utils.load_config(model_config_path, data_info_path)
+        input_dims = data_info.input_dims
+        output_dims = data_info.output_dims
+        nodes_dist = data_info.nodes_dist
+        active_index = data_info.active_index
+        self.model_config = dm_cfg
+        self.data_info = data_info
+        self.T = dm_cfg.diffusion_steps
+        self.Xdim = input_dims["X"]
+        self.Edim = input_dims["E"]
+        self.ydim = input_dims["y"]
+        self.Xdim_output = output_dims["X"]
+        self.Edim_output = output_dims["E"]
+        self.ydim_output = output_dims["y"]
+        self.node_dist = nodes_dist
+        self.active_index = active_index
+        self.max_n_nodes = data_info.max_n_nodes
+        self.atom_decoder = data_info.atom_decoder
+        self.hidden_size = dm_cfg.hidden_size
+        self.mol_visualizer = MolecularVisualization(self.atom_decoder)
+        self.denoiser = Transformer(
+            max_n_nodes=self.max_n_nodes,
+            hidden_size=dm_cfg.hidden_size,
+            depth=dm_cfg.depth,
+            num_heads=dm_cfg.num_heads,
+            mlp_ratio=dm_cfg.mlp_ratio,
+            drop_condition=dm_cfg.drop_condition,
+            Xdim=self.Xdim,
+            Edim=self.Edim,
+            ydim=self.ydim,
+        )
+        self.model_dtype = model_dtype
+        self.noise_schedule = utils.PredefinedNoiseScheduleDiscrete(
+            dm_cfg.diffusion_noise_schedule, timesteps=dm_cfg.diffusion_steps
+        )
+        x_marginals = data_info.node_types.to(self.model_dtype) / torch.sum(
+            data_info.node_types.to(self.model_dtype)
+        )
+        e_marginals = data_info.edge_types.to(self.model_dtype) / torch.sum(
+            data_info.edge_types.to(self.model_dtype)
+        )
+        x_marginals = x_marginals / x_marginals.sum()
+        e_marginals = e_marginals / e_marginals.sum()
+        xe_conditions = data_info.transition_E.to(self.model_dtype)
+        xe_conditions = xe_conditions[self.active_index][:, self.active_index]
+        xe_conditions = xe_conditions.sum(dim=1)
+        ex_conditions = xe_conditions.t()
+        xe_conditions = xe_conditions / xe_conditions.sum(dim=-1, keepdim=True)
+        ex_conditions = ex_conditions / ex_conditions.sum(dim=-1, keepdim=True)
+        self.transition_model = utils.MarginalTransition(
+            x_marginals=x_marginals,
+            e_marginals=e_marginals,
+            xe_conditions=xe_conditions,
+            ex_conditions=ex_conditions,
+            y_classes=self.ydim_output,
+            n_nodes=self.max_n_nodes,
+        )
+        self.limit_dist = utils.PlaceHolder(X=x_marginals, E=e_marginals, y=None)
+    def init_model(self, model_dir):
+        model_file = os.path.join(model_dir, 'model.pt')
+        if os.path.exists(model_file):
+            self.denoiser.load_state_dict(torch.load(model_file, map_location='cpu', weights_only=True))
+        else:
+            raise FileNotFoundError(f"Model file not found: {model_file}")
+    def disable_grads(self):
+        self.denoiser.disable_grads()
+    def forward(
+        self, x, edge_index, edge_attr, graph_batch, properties, no_label_index
+    ):
+        raise ValueError('Not Implement')
+    def _forward(self, noisy_data, unconditioned=False):
+        noisy_x, noisy_e, properties = (
+            noisy_data["X_t"].to(self.model_dtype),
+            noisy_data["E_t"].to(self.model_dtype),
+            noisy_data["y_t"].to(self.model_dtype).clone(),
+        )
+        node_mask, timestep = (
+            noisy_data["node_mask"],
+            noisy_data["t"],
+        )
+        pred = self.denoiser(
+            noisy_x,
+            noisy_e,
+            node_mask,
+            properties,
+            timestep,
+            unconditioned=unconditioned,
+        )
+        return pred
+    def apply_noise(self, X, E, y, node_mask):
+        """Sample noise and apply it to the data."""
+        # Sample a timestep t.
+        # When evaluating, the loss for t=0 is computed separately
+        lowest_t = 0 if self.training else 1
+        t_int = torch.randint(
+            lowest_t, self.T + 1, size=(X.size(0), 1), device=X.device
+        ).to(
+            self.model_dtype
+        )  # (bs, 1)
+        s_int = t_int - 1
+        t_float = t_int / self.T
+        s_float = s_int / self.T
+        # beta_t and alpha_s_bar are used for denoising/loss computation
+        beta_t = self.noise_schedule(t_normalized=t_float)  # (bs, 1)
+        alpha_s_bar = self.noise_schedule.get_alpha_bar(t_normalized=s_float)  # (bs, 1)
+        alpha_t_bar = self.noise_schedule.get_alpha_bar(t_normalized=t_float)  # (bs, 1)
+        Qtb = self.transition_model.get_Qt_bar(
+            alpha_t_bar, X.device
+        )  # (bs, dx_in, dx_out), (bs, de_in, de_out)
+        bs, n, d = X.shape
+        X_all = torch.cat([X, E.reshape(bs, n, -1)], dim=-1)
+        prob_all = X_all @ Qtb.X
+        probX = prob_all[:, :, : self.Xdim_output]
+        probE = prob_all[:, :, self.Xdim_output :].reshape(bs, n, n, -1)
+        sampled_t = utils.sample_discrete_features(
+            probX=probX, probE=probE, node_mask=node_mask
+        )
+        X_t = F.one_hot(sampled_t.X, num_classes=self.Xdim_output)
+        E_t = F.one_hot(sampled_t.E, num_classes=self.Edim_output)
+        assert (X.shape == X_t.shape) and (E.shape == E_t.shape)
+        y_t = y
+        z_t = utils.PlaceHolder(X=X_t, E=E_t, y=y_t).type_as(X_t).mask(node_mask)
+        noisy_data = {
+            "t_int": t_int,
+            "t": t_float,
+            "beta_t": beta_t,
+            "alpha_s_bar": alpha_s_bar,
+            "alpha_t_bar": alpha_t_bar,
+            "X_t": z_t.X,
+            "E_t": z_t.E,
+            "y_t": z_t.y,
+            "node_mask": node_mask,
+        }
+        return noisy_data
+    @torch.no_grad()
+    def generate(
+        self,
+        properties,
+        guide_scale=1.,
+        num_nodes=None,
+        number_chain_steps=50,
+    ):
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        properties = [float('nan') if x is None else x for x in properties]
+        properties = torch.tensor(properties, dtype=torch.float).reshape(1, -1).to(device)
+        batch_size = properties.size(0)
+        assert batch_size == 1
+        if num_nodes is None:
+            num_nodes = self.node_dist.sample_n(batch_size, device)
+        else:
+            num_nodes = torch.LongTensor([num_nodes]).to(device)
+        arange = (
+            torch.arange(self.max_n_nodes, device=device)
+            .unsqueeze(0)
+            .expand(batch_size, -1)
+        )
+        node_mask = arange < num_nodes.unsqueeze(1)
+        z_T = utils.sample_discrete_feature_noise(
+            limit_dist=self.limit_dist, node_mask=node_mask
+        )
+        X, E = z_T.X, z_T.E
+        assert (E == torch.transpose(E, 1, 2)).all()
+        if number_chain_steps > 0:
+            chain_X_size = torch.Size((number_chain_steps, X.size(1)))
+            chain_E_size = torch.Size((number_chain_steps, E.size(1), E.size(2)))
+            chain_X = torch.zeros(chain_X_size)
+            chain_E = torch.zeros(chain_E_size)
+        # Iteratively sample p(z_s | z_t) for t = 1, ..., T, with s = t - 1.
+        y = properties
+        for s_int in reversed(range(0, self.T)):
+            s_array = s_int * torch.ones((batch_size, 1)).type_as(y)
+            t_array = s_array + 1
+            s_norm = s_array / self.T
+            t_norm = t_array / self.T
+            # Sample z_s
+            sampled_s, discrete_sampled_s = self.sample_p_zs_given_zt(
+                s_norm, t_norm, X, E, y, node_mask, guide_scale, device
+            )
+            X, E, y = sampled_s.X, sampled_s.E, sampled_s.y
+            if number_chain_steps > 0:
+                # Save the first keep_chain graphs
+                write_index = (s_int * number_chain_steps) // self.T
+                chain_X[write_index] = discrete_sampled_s.X[:1]
+                chain_E[write_index] = discrete_sampled_s.E[:1]
+        # Sample
+        sampled_s = sampled_s.mask(node_mask, collapse=True)
+        X, E, y = sampled_s.X, sampled_s.E, sampled_s.y
+        molecule_list = []
+        n = num_nodes[0]
+        atom_types = X[0, :n].cpu()
+        edge_types = E[0, :n, :n].cpu()
+        molecule_list.append([atom_types, edge_types])
+        smiles = graph_to_smiles(molecule_list, self.atom_decoder)[0]
+        # Visualize Chains
+        if number_chain_steps > 0:
+            final_X_chain = X[:1]
+            final_E_chain = E[:1]
+            chain_X[0] = final_X_chain                  # Overwrite last frame with the resulting X, E
+            chain_E[0] = final_E_chain
+            chain_X = utils.reverse_tensor(chain_X)
+            chain_E = utils.reverse_tensor(chain_E)
+            # Repeat last frame to see final sample better
+            chain_X = torch.cat([chain_X, chain_X[-1:].repeat(10, 1)], dim=0)
+            chain_E = torch.cat([chain_E, chain_E[-1:].repeat(10, 1, 1)], dim=0)
+            mol_img_list = self.mol_visualizer.visualize_chain(chain_X.numpy(), chain_E.numpy())
+        else:
+            mol_img_list = []
+        return smiles, mol_img_list
+    def check_valid(self, smiles):
+        return check_valid(smiles)
+    def sample_p_zs_given_zt(
+        self, s, t, X_t, E_t, properties, node_mask, guide_scale, device
+    ):
+        """Samples from zs ~ p(zs | zt). Only used during sampling.
+        if last_step, return the graph prediction as well"""
+        bs, n, _ = X_t.shape
+        beta_t = self.noise_schedule(t_normalized=t)  # (bs, 1)
+        alpha_s_bar = self.noise_schedule.get_alpha_bar(t_normalized=s)
+        alpha_t_bar = self.noise_schedule.get_alpha_bar(t_normalized=t)
+        # Neural net predictions
+        noisy_data = {
+            "X_t": X_t,
+            "E_t": E_t,
+            "y_t": properties,
+            "t": t,
+            "node_mask": node_mask,
+        }
+        def get_prob(noisy_data, unconditioned=False):
+            pred = self._forward(noisy_data, unconditioned=unconditioned)
+            # Normalize predictions
+            pred_X = F.softmax(pred.X, dim=-1)  # bs, n, d0
+            pred_E = F.softmax(pred.E, dim=-1)  # bs, n, n, d0
+            # Retrieve transitions matrix
+            Qtb = self.transition_model.get_Qt_bar(alpha_t_bar, device)
+            Qsb = self.transition_model.get_Qt_bar(alpha_s_bar, device)
+            Qt = self.transition_model.get_Qt(beta_t, device)
+            Xt_all = torch.cat([X_t, E_t.reshape(bs, n, -1)], dim=-1)
+            predX_all = torch.cat([pred_X, pred_E.reshape(bs, n, -1)], dim=-1)
+            unnormalized_probX_all = utils.reverse_diffusion(
+                predX_0=predX_all, X_t=Xt_all, Qt=Qt.X, Qsb=Qsb.X, Qtb=Qtb.X
+            )
+            unnormalized_prob_X = unnormalized_probX_all[:, :, : self.Xdim_output]
+            unnormalized_prob_E = unnormalized_probX_all[
+                :, :, self.Xdim_output :
+            ].reshape(bs, n * n, -1)
+            unnormalized_prob_X[torch.sum(unnormalized_prob_X, dim=-1) == 0] = 1e-5
+            unnormalized_prob_E[torch.sum(unnormalized_prob_E, dim=-1) == 0] = 1e-5
+            prob_X = unnormalized_prob_X / torch.sum(
+                unnormalized_prob_X, dim=-1, keepdim=True
+            )  # bs, n, d_t-1
+            prob_E = unnormalized_prob_E / torch.sum(
+                unnormalized_prob_E, dim=-1, keepdim=True
+            )  # bs, n, d_t-1
+            prob_E = prob_E.reshape(bs, n, n, pred_E.shape[-1])
+            return prob_X, prob_E
+        prob_X, prob_E = get_prob(noisy_data)
+        ### Guidance
+        if guide_scale != 1:
+            uncon_prob_X, uncon_prob_E = get_prob(
+                noisy_data, unconditioned=True
+            )
+            prob_X = (
+                uncon_prob_X
+                * (prob_X / uncon_prob_X.clamp_min(1e-5)) ** guide_scale
+            )
+            prob_E = (
+                uncon_prob_E
+                * (prob_E / uncon_prob_E.clamp_min(1e-5)) ** guide_scale
+            )
+            prob_X = prob_X / prob_X.sum(dim=-1, keepdim=True).clamp_min(1e-5)
+            prob_E = prob_E / prob_E.sum(dim=-1, keepdim=True).clamp_min(1e-5)
+        # assert ((prob_X.sum(dim=-1) - 1).abs() < 1e-3).all()
+        # assert ((prob_E.sum(dim=-1) - 1).abs() < 1e-3).all()
+        sampled_s = utils.sample_discrete_features(
+            prob_X, prob_E, node_mask=node_mask, step=s[0, 0].item()
+        )
+        X_s = F.one_hot(sampled_s.X, num_classes=self.Xdim_output).to(self.model_dtype)
+        E_s = F.one_hot(sampled_s.E, num_classes=self.Edim_output).to(self.model_dtype)
+        assert (E_s == torch.transpose(E_s, 1, 2)).all()
+        assert (X_t.shape == X_s.shape) and (E_t.shape == E_s.shape)
+        out_one_hot = utils.PlaceHolder(X=X_s, E=E_s, y=properties)
+        out_discrete = utils.PlaceHolder(X=X_s, E=E_s, y=properties)
+        return out_one_hot.mask(node_mask).type_as(properties), out_discrete.mask(
+            node_mask, collapse=True
+        ).type_as(properties)