lhallee committed on
Commit
3d7bb18
·
verified ·
1 Parent(s): c1bc3cb

Upload modeling_e1.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. modeling_e1.py +3 -274
modeling_e1.py CHANGED
@@ -1,8 +1,6 @@
1
  import os
2
  os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
3
 
4
- import numpy as np
5
- import networkx as nx
6
  import torch
7
  import torch.nn as nn
8
  import torch.nn.functional as F
@@ -10,15 +8,15 @@ from torch.nn.utils.rnn import pad_sequence
10
 
11
  from einops import rearrange, repeat
12
  from enum import Enum
13
- from typing import Any, TypedDict, Callable, Optional, List
14
  from dataclasses import dataclass
15
  from tokenizers import Tokenizer
16
  from transformers import PretrainedConfig, PreTrainedModel
17
  from transformers.activations import ACT2FN
18
  from transformers.modeling_outputs import ModelOutput
19
  from transformers.utils import logging
20
- from tqdm.auto import tqdm
21
- from embedding_mixin import EmbeddingMixin, Pooler
22
 
23
 
24
  logger = logging.get_logger(__name__)
@@ -1356,275 +1354,6 @@ class DecoderLayer(nn.Module):
1356
  return hidden_states, self_attn_weights, present_key_value
1357
 
1358
 
1359
- ### Support for embedding datasets with low code
1360
- class _LegacyPooler:
1361
- def __init__(self, pooling_types: List[str]):
1362
- self.pooling_types = pooling_types
1363
- self.pooling_options = {
1364
- 'mean': self.mean_pooling,
1365
- 'max': self.max_pooling,
1366
- 'norm': self.norm_pooling,
1367
- 'median': self.median_pooling,
1368
- 'std': self.std_pooling,
1369
- 'var': self.var_pooling,
1370
- 'cls': self.cls_pooling,
1371
- 'parti': self._pool_parti,
1372
- }
1373
-
1374
- def _create_pooled_matrices_across_layers(self, attentions: torch.Tensor) -> torch.Tensor:
1375
- maxed_attentions = torch.max(attentions, dim=1)[0]
1376
- return maxed_attentions
1377
-
1378
- def _page_rank(self, attention_matrix, personalization=None, nstart=None, prune_type="top_k_outdegree"):
1379
- # Run PageRank on the attention matrix converted to a graph.
1380
- # Raises exceptions if the graph doesn't match the token sequence or has no edges.
1381
- # Returns the PageRank scores for each token node.
1382
- G = self._convert_to_graph(attention_matrix)
1383
- if G.number_of_nodes() != attention_matrix.shape[0]:
1384
- raise Exception(
1385
- f"The number of nodes in the graph should be equal to the number of tokens in sequence! You have {G.number_of_nodes()} nodes for {attention_matrix.shape[0]} tokens.")
1386
- if G.number_of_edges() == 0:
1387
- raise Exception(f"You don't seem to have any attention edges left in the graph.")
1388
-
1389
- return nx.pagerank(G, alpha=0.85, tol=1e-06, weight='weight', personalization=personalization, nstart=nstart, max_iter=100)
1390
-
1391
- def _convert_to_graph(self, matrix):
1392
- # Convert a matrix (e.g., attention scores) to a directed graph using networkx.
1393
- # Each element in the matrix represents a directed edge with a weight.
1394
- G = nx.from_numpy_array(matrix, create_using=nx.DiGraph)
1395
- return G
1396
-
1397
- def _calculate_importance_weights(self, dict_importance, attention_mask: Optional[torch.Tensor] = None):
1398
- # Remove keys where attention_mask is 0
1399
- if attention_mask is not None:
1400
- for k in list(dict_importance.keys()):
1401
- if attention_mask[k] == 0:
1402
- del dict_importance[k]
1403
-
1404
- #dict_importance[0] # remove cls
1405
- #dict_importance[-1] # remove eos
1406
- total = sum(dict_importance.values())
1407
- return np.array([v / total for _, v in dict_importance.items()])
1408
-
1409
- def _pool_parti(self, emb: torch.Tensor, attentions: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): # (b, L, d) -> (b, d)
1410
- maxed_attentions = self._create_pooled_matrices_across_layers(attentions).numpy()
1411
- # emb is (b, L, d), maxed_attentions is (b, L, L)
1412
- emb_pooled = []
1413
- for e, a, mask in zip(emb, maxed_attentions, attention_mask):
1414
- dict_importance = self._page_rank(a)
1415
- importance_weights = self._calculate_importance_weights(dict_importance, mask)
1416
- num_tokens = int(mask.sum().item())
1417
- emb_pooled.append(np.average(e[:num_tokens], weights=importance_weights, axis=0))
1418
- pooled = torch.tensor(np.array(emb_pooled))
1419
- return pooled
1420
-
1421
- def mean_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, **kwargs): # (b, L, d) -> (b, d)
1422
- if attention_mask is None:
1423
- return emb.mean(dim=1)
1424
- else:
1425
- attention_mask = attention_mask.unsqueeze(-1)
1426
- return (emb * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)
1427
-
1428
- def max_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, **kwargs): # (b, L, d) -> (b, d)
1429
- if attention_mask is None:
1430
- return emb.max(dim=1).values
1431
- else:
1432
- attention_mask = attention_mask.unsqueeze(-1)
1433
- return (emb * attention_mask).max(dim=1).values
1434
-
1435
- def norm_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, **kwargs): # (b, L, d) -> (b, d)
1436
- if attention_mask is None:
1437
- return emb.norm(dim=1, p=2)
1438
- else:
1439
- attention_mask = attention_mask.unsqueeze(-1)
1440
- return (emb * attention_mask).norm(dim=1, p=2)
1441
-
1442
- def median_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, **kwargs): # (b, L, d) -> (b, d)
1443
- if attention_mask is None:
1444
- return emb.median(dim=1).values
1445
- else:
1446
- attention_mask = attention_mask.unsqueeze(-1)
1447
- return (emb * attention_mask).median(dim=1).values
1448
-
1449
- def std_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, **kwargs): # (b, L, d) -> (b, d)
1450
- if attention_mask is None:
1451
- return emb.std(dim=1)
1452
- else:
1453
- # Compute variance correctly over non-masked positions, then take sqrt
1454
- var = self.var_pooling(emb, attention_mask, **kwargs)
1455
- return torch.sqrt(var)
1456
-
1457
- def var_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, **kwargs): # (b, L, d) -> (b, d)
1458
- if attention_mask is None:
1459
- return emb.var(dim=1)
1460
- else:
1461
- # Correctly compute variance over only non-masked positions
1462
- attention_mask = attention_mask.unsqueeze(-1) # (b, L, 1)
1463
- # Compute mean over non-masked positions
1464
- mean = (emb * attention_mask).sum(dim=1) / attention_mask.sum(dim=1) # (b, d)
1465
- mean = mean.unsqueeze(1) # (b, 1, d)
1466
- # Compute squared differences from mean, only over non-masked positions
1467
- squared_diff = (emb - mean) ** 2 # (b, L, d)
1468
- # Sum squared differences over non-masked positions and divide by count
1469
- var = (squared_diff * attention_mask).sum(dim=1) / attention_mask.sum(dim=1) # (b, d)
1470
- return var
1471
-
1472
- def cls_pooling(self, emb: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, **kwargs): # (b, L, d) -> (b, d)
1473
- return emb[:, 0, :]
1474
-
1475
- def __call__(
1476
- self,
1477
- emb: torch.Tensor,
1478
- attention_mask: Optional[torch.Tensor] = None,
1479
- attentions: Optional[torch.Tensor] = None
1480
- ): # [mean, max]
1481
- final_emb = []
1482
- for pooling_type in self.pooling_types:
1483
- final_emb.append(self.pooling_options[pooling_type](emb=emb, attention_mask=attention_mask, attentions=attentions)) # (b, d)
1484
- return torch.cat(final_emb, dim=-1) # (b, n_pooling_types * d)
1485
-
1486
-
1487
- class _LegacyEmbeddingMixin:
1488
- def _embed(self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
1489
- raise NotImplementedError
1490
-
1491
- @property
1492
- def device(self) -> torch.device:
1493
- """Get the device of the model."""
1494
- return next(self.parameters()).device
1495
-
1496
- def _read_sequences_from_db(self, db_path: str) -> set[str]:
1497
- """Read sequences from SQLite database."""
1498
- import sqlite3
1499
- sequences = []
1500
- with sqlite3.connect(db_path) as conn:
1501
- c = conn.cursor()
1502
- c.execute("SELECT sequence FROM embeddings")
1503
- while True:
1504
- row = c.fetchone()
1505
- if row is None:
1506
- break
1507
- sequences.append(row[0])
1508
- return set(sequences)
1509
-
1510
- def embed_dataset(
1511
- self,
1512
- sequences: List[str],
1513
- #tokenizer: PreTrainedTokenizerBase, # For E1, the tokenizing is handled by _embed
1514
- batch_size: int = 2,
1515
- max_len: int = 512,
1516
- truncate: bool = True,
1517
- full_embeddings: bool = False,
1518
- embed_dtype: torch.dtype = torch.float32,
1519
- pooling_types: List[str] = ['mean'],
1520
- sql: bool = False,
1521
- save: bool = True,
1522
- sql_db_path: str = 'embeddings.db',
1523
- save_path: str = 'embeddings.pth',
1524
- **kwargs,
1525
- ) -> Optional[dict[str, torch.Tensor]]:
1526
- """Embed a dataset of protein sequences.
1527
-
1528
- Args:
1529
- sequences: List of protein sequences
1530
- batch_size: Batch size for processing
1531
- max_len: Maximum sequence length
1532
- full_embeddings: Whether to return full residue-wise (True) embeddings or pooled (False)
1533
- pooling_type: Type of pooling ('mean' or 'cls')
1534
- sql: Whether to store embeddings in SQLite database - will be stored in float32
1535
- sql_db_path: Path to SQLite database
1536
-
1537
- Returns:
1538
- Dictionary mapping sequences to embeddings, or None if sql=True
1539
-
1540
- Note:
1541
- - If sql=True, embeddings can only be stored in float32
1542
- - sql is ideal if you need to stream a very large dataset for training in real-time
1543
- - save=True is ideal if you can store the entire embedding dictionary in RAM
1544
- - sql will be used if it is True and save is True or False
1545
- - If your sql database or .pth file is already present, they will be scanned first for already embedded sequences
1546
- - Sequences will be truncated to max_len and sorted by length in descending order for faster processing
1547
-
1548
- Example:
1549
- >>> embedder = EmbeddingMixin()
1550
- >>> embedding_dict = embedder.embed_dataset(
1551
- sequences=[
1552
- 'MALWMRLLPLLALLALWGPDPAAA', ... # list of protein sequences
1553
- ],
1554
- batch_size=2, # adjust for your GPU memory
1555
- max_len=512, # adjust for your needs
1556
- full_embeddings=False, # if True, no pooling is performed
1557
- embed_dtype=torch.float32, # cast to what dtype you want
1558
- pooling_type=['mean', 'cls'], # more than one pooling type will be concatenated together
1559
- sql=False, # if True, embeddings will be stored in SQLite database
1560
- sql_db_path='embeddings.db',
1561
- save=True, # if True, embeddings will be saved as a .pth file
1562
- save_path='embeddings.pth',
1563
- )
1564
- >>> # embedding_dict is a dictionary mapping sequences to their embeddings as tensors for .pth or numpy arrays for sql
1565
- """
1566
- sequences = list(set([seq[:max_len] if truncate else seq for seq in sequences]))
1567
- sequences = sorted(sequences, key=len, reverse=True)
1568
- hidden_size = self.config.hidden_size
1569
- pooler = Pooler(pooling_types) if not full_embeddings else None
1570
-
1571
- def get_embeddings(residue_embeddings: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
1572
- if full_embeddings or residue_embeddings.ndim == 2: # if already pooled or want residue-wise embeddings
1573
- return residue_embeddings
1574
- else:
1575
- return pooler(residue_embeddings, attention_mask)
1576
-
1577
- if sql:
1578
- import sqlite3
1579
- conn = sqlite3.connect(sql_db_path)
1580
- c = conn.cursor()
1581
- c.execute('CREATE TABLE IF NOT EXISTS embeddings (sequence text PRIMARY KEY, embedding blob)')
1582
- already_embedded = self._read_sequences_from_db(sql_db_path)
1583
- to_embed = [seq for seq in sequences if seq not in already_embedded]
1584
- print(f"Found {len(already_embedded)} already embedded sequences in {sql_db_path}")
1585
- print(f"Embedding {len(to_embed)} new sequences")
1586
- if len(to_embed) > 0:
1587
- with torch.no_grad():
1588
- for batch_start in tqdm(range(0, len(to_embed), batch_size), desc='Embedding batches'):
1589
- seqs = to_embed[batch_start:batch_start + batch_size]
1590
- input_ids, attention_mask = self._embed(seqs, return_attention_mask=True)
1591
- embeddings = get_embeddings(input_ids, attention_mask).float() # sql requires float32
1592
- for seq, emb, mask in zip(seqs, embeddings, attention_mask):
1593
- if full_embeddings:
1594
- emb = emb[mask.bool()].reshape(-1, hidden_size)
1595
- c.execute("INSERT OR REPLACE INTO embeddings VALUES (?, ?)", (seq, emb.cpu().numpy().tobytes()))
1596
- conn.commit()
1597
- conn.commit()
1598
- conn.close()
1599
- return None
1600
-
1601
- embeddings_dict = {}
1602
- if os.path.exists(save_path):
1603
- embeddings_dict = torch.load(save_path, map_location='cpu', weights_only=True)
1604
- to_embed = [seq for seq in sequences if seq not in embeddings_dict]
1605
- print(f"Found {len(embeddings_dict)} already embedded sequences in {save_path}")
1606
- print(f"Embedding {len(to_embed)} new sequences")
1607
- else:
1608
- to_embed = sequences
1609
- print(f"Embedding {len(to_embed)} new sequences")
1610
-
1611
- if len(to_embed) > 0:
1612
- with torch.no_grad():
1613
- for batch_start in tqdm(range(0, len(to_embed), batch_size), desc='Embedding batches'):
1614
- seqs = to_embed[batch_start:batch_start + batch_size]
1615
- last_hidden_state, attention_mask = self._embed(seqs, return_attention_mask=True)
1616
- embeddings = get_embeddings(last_hidden_state, attention_mask).to(embed_dtype)
1617
- for seq, emb, mask in zip(seqs, embeddings, attention_mask):
1618
- if full_embeddings:
1619
- emb = emb[mask.bool()].reshape(-1, hidden_size)
1620
- embeddings_dict[seq] = emb.cpu()
1621
-
1622
- if save:
1623
- torch.save(embeddings_dict, save_path)
1624
-
1625
- return embeddings_dict
1626
-
1627
-
1628
  class E1PreTrainedModel(PreTrainedModel):
1629
  config_class = E1Config
1630
  config: E1Config
 
1
  import os
2
  os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
3
 
 
 
4
  import torch
5
  import torch.nn as nn
6
  import torch.nn.functional as F
 
8
 
9
  from einops import rearrange, repeat
10
  from enum import Enum
11
+ from typing import Any, TypedDict, Callable, List
12
  from dataclasses import dataclass
13
  from tokenizers import Tokenizer
14
  from transformers import PretrainedConfig, PreTrainedModel
15
  from transformers.activations import ACT2FN
16
  from transformers.modeling_outputs import ModelOutput
17
  from transformers.utils import logging
18
+
19
+ from .embedding_mixin import EmbeddingMixin, Pooler
20
 
21
 
22
  logger = logging.get_logger(__name__)
 
1354
  return hidden_states, self_attn_weights, present_key_value
1355
 
1356
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1357
  class E1PreTrainedModel(PreTrainedModel):
1358
  config_class = E1Config
1359
  config: E1Config