Cheeky Sparrow commited on
Commit
426874e
·
1 Parent(s): 2c6a5a0
.gitattributes CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/484366__spacejoe__bird-3.wav filter=lfs diff=lfs merge=lfs -text
37
+ assets/Lazuli_Bunting_yell-YELLLAZB20160625SM303143.mp3 filter=lfs diff=lfs merge=lfs -text
38
+ assets/nri-battlesounds.mp3 filter=lfs diff=lfs merge=lfs -text
39
+ assets/nri-GreenTreeFrogEvergladesNP.mp3 filter=lfs diff=lfs merge=lfs -text
40
+ assets/nri-StreamMUWO.mp3 filter=lfs diff=lfs merge=lfs -text
41
+ assets/esp_favicon.png filter=lfs diff=lfs merge=lfs -text
42
+ assets/naturelm-audio-overiew.png filter=lfs diff=lfs merge=lfs -text
43
+ assets/nri-SensationJazz.mp3 filter=lfs diff=lfs merge=lfs -text
44
+ assets/yell-YELLAMRO20160506SM3.mp3 filter=lfs diff=lfs merge=lfs -text
45
+ assets/yell-YELLFLBCSACR20075171.mp3 filter=lfs diff=lfs merge=lfs -text
46
+ assets/yell-YELLWolfvCar20160111T22ms2.mp3 filter=lfs diff=lfs merge=lfs -text
NatureLM/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ # Copyright (2024) Earth Species Project
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .config import Config
16
+ from .models.NatureLM import NatureLM
17
+ from .utils import generate_sample_batches, prepare_sample_waveforms
18
+
19
+ __all__ = ["Config", "NatureLM", "generate_sample_batches", "prepare_sample_waveforms"]
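For orientation, a minimal usage sketch (not part of the commit) of the exports above. The config path follows the configs/inference.yml default referenced in NatureLM/infer.py later in this diff, and the checkpoint name is the one used there; both are assumptions outside this file:

import torch

from NatureLM import Config, NatureLM

# Load an inference config that satisfies the Config schema defined in NatureLM/config.py.
cfg = Config.from_sources("configs/inference.yml")

# Same call used by load_model_and_config() in NatureLM/infer.py below.
model = NatureLM.from_pretrained("EarthSpeciesProject/NatureLM-audio")
model = model.to("cuda" if torch.cuda.is_available() else "cpu").eval()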
NatureLM/augmentations.py ADDED
@@ -0,0 +1,349 @@
1
+ import logging
2
+ import random
3
+
4
+ import numpy as np
5
+ import torch as th
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+
9
+ from NatureLM.utils import mel_frequencies
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ class RevEcho(nn.Module):
15
+ """
16
+ Hacky reverb that runs on the GPU without slowing down training. This reverb adds a
17
+ succession of attenuated echoes of the input signal to itself. Intuitively, the delay
18
+ of the first echo will happen after roughly 2x the radius of the room and is
19
+ controlled by `first_delay`. Then RevEcho keeps adding echoes with the same delay and
20
+ further attenuation until the amplitude ratio between the last and first echo is
21
+ 1e-3. The attenuation factor and the number of echoes to add are controlled by RT60
22
+ (measured in seconds). RT60 is the average time to get to -60dB (n.b. volume is
23
+ measured over the squared amplitude so this matches the 1e-3 ratio).
24
+
25
+ At each call to RevEcho, `first_delay`, `initial` and `RT60` are sampled from their
26
+ range. Then, to prevent this reverb from being too regular, the delay time is
27
+ resampled uniformly within `first_delay +/- 10%`, as controlled by the `jitter`
28
+ parameter.
29
+
30
+ Finally, for a denser reverb, multiple trains of echoes are added with different
31
+ jitter noises.
32
+
33
+ Args:
34
+ - initial: amplitude of the first echo as a fraction of the input signal. For
35
+ each sample, actually sampled from `[0, initial]`. Larger values mean louder
36
+ reverb. Physically, this would depend on the absorption of the room walls.
37
+ - rt60: range of values to sample the RT60 in seconds, i.e. after RT60 seconds,
38
+ the echo amplitude is 1e-3 of the first echo. The default values follow the
39
+ recommendations of https://arxiv.org/ftp/arxiv/papers/2001/2001.08662.pdf,
40
+ Section 2.4. Physically this would also be related to the absorption of the
41
+ room walls and there is likely a relation between `RT60` and `initial`, which
42
+ we ignore here.
43
+ - first_delay: range of values to sample the first echo delay in seconds. The
44
+ default values are equivalent to sampling a room of 3 to 10 meters.
45
+ - repeat: how many trains of echoes with different jitters to add. Higher values
46
+ mean a denser reverb.
47
+ - jitter: jitter used to make each repetition of the reverb echo train slightly
48
+ different. For instance, a jitter of 0.1 means the delay between two echoes will
49
+ be in the range `first_delay +- 10%`, with the jittering noise being resampled
50
+ after each single echo.
51
+ - keep_clean: fraction of the reverb of the clean speech to add back to the
52
+ ground truth. 0 = dereverberation, 1 = no dereverberation.
53
+ - sample_rate: sample rate of the input signals.
54
+ """
55
+
56
+ def __init__(
57
+ self,
58
+ proba=0.5,
59
+ initial=0.3,
60
+ rt60=(0.3, 1.3),
61
+ first_delay=(0.01, 0.03),
62
+ repeat=3,
63
+ jitter=0.1,
64
+ keep_clean=0.1,
65
+ sample_rate=16000,
66
+ rng=None,
67
+ seed=42,
68
+ ):
69
+ super().__init__()
70
+
71
+ self.proba = proba
72
+ self.initial = initial
73
+ self.rt60 = rt60
74
+ self.first_delay = first_delay
75
+ self.repeat = repeat
76
+ self.jitter = jitter
77
+ self.keep_clean = keep_clean
78
+ self.sample_rate = sample_rate
79
+ self.seed = seed
80
+ self.rng = rng if rng is not None else random.Random(self.seed)
81
+
82
+ def _reverb(self, source, initial, first_delay, rt60):
83
+ """
84
+ Return the reverb for a single source.
85
+ """
86
+ length = source.shape[-1]
87
+ reverb = th.zeros_like(source)
88
+
89
+ for _ in range(self.repeat):
90
+ frac = 1 # what fraction of the first echo amplitude is still here
91
+ echo = initial * source
92
+ while frac > 1e-3:
93
+ # First jitter noise for the delay
94
+ jitter = 1 + self.jitter * self.rng.uniform(-1, 1)
95
+ delay = min(1 + int(jitter * first_delay * self.sample_rate), length)
96
+
97
+ # Delay the echo in time by padding with zero on the left
98
+ echo = F.pad(echo[:, :, :-delay], (delay, 0))
99
+ reverb += echo
100
+
101
+ # Second jitter noise for the attenuation
102
+ jitter = 1 + self.jitter * self.rng.uniform(-1, 1)
103
+ # we want, with `d` the per-echo attenuation, d ** (rt60 / first_delay) = 1e-3,
+ # i.e. log10(d) = -3 * first_delay / rt60, hence:
105
+ attenuation = 10 ** (-3 * jitter * first_delay / rt60)
106
+ echo *= attenuation
107
+ frac *= attenuation
108
+
109
+ return reverb
110
+
111
+ def forward(self, samples):
112
+ if self.rng.random() >= self.proba:
113
+ return samples
114
+
115
+ raw_wav = samples.get("raw_wav", None)
116
+
117
+ # add channel dimension if not exist
118
+ if raw_wav.dim() == 2:
119
+ raw_wav = raw_wav.unsqueeze(1)
120
+
121
+ # Sample characteristics for the reverb
122
+ initial = self.rng.random() * self.initial
123
+ first_delay = self.rng.uniform(*self.first_delay)
124
+ rt60 = self.rng.uniform(*self.rt60)
125
+
126
+ reverb_wav = self._reverb(raw_wav, initial, first_delay, rt60)
127
+ raw_wav += self.keep_clean * reverb_wav
128
+
129
+ # remove channel dimension
130
+ if raw_wav.dim() == 3 and raw_wav.shape[1] == 1:
131
+ raw_wav = raw_wav.squeeze(1)
132
+
133
+ samples["raw_wav"] = raw_wav
134
+ return samples
135
+
136
+
137
+ class BandMask(nn.Module):
138
+ """
139
+ Masks bands of frequencies. Similar to Park, Daniel S., et al.
140
+ "Specaugment: A simple data augmentation method for automatic speech recognition."
141
+ (https://arxiv.org/pdf/1904.08779.pdf) but over the waveform.
142
+ """
143
+
144
+ def __init__(self, maxwidth=0.2, bands=120, sample_rate=16_000, rng=None, seed=42):
145
+ """__init__.
146
+
147
+ :param maxwidth: the maximum width to remove
148
+ :param bands: number of bands
149
+ :param sample_rate: signal sample rate
150
+ """
151
+ super().__init__()
152
+ self.maxwidth = maxwidth
153
+ self.bands = bands
154
+ self.sample_rate = sample_rate
155
+ self.seed = seed
156
+ self.rng = rng if rng is not None else random.Random(self.seed)
157
+
158
+ def forward(self, samples):
159
+ raw_wav = samples.get("raw_wav", None)
160
+
161
+ # add channel dimension if not exist
162
+ if raw_wav.dim() == 2:
163
+ raw_wav = raw_wav.unsqueeze(1)
164
+
165
+ bands = self.bands
166
+ bandwidth = int(abs(self.maxwidth) * bands)
167
+ mels = mel_frequencies(bands, 40, self.sample_rate / 2) / self.sample_rate
168
+ low = self.rng.randrange(bands)
169
+ high = self.rng.randrange(low, min(bands, low + bandwidth))
170
+
171
+ filters = LowPassFilters([mels[low], mels[high]]).to(raw_wav.device)
172
+
173
+ low, midlow = filters(raw_wav)
174
+ # band pass filtering
175
+ out = raw_wav - midlow + low
176
+
177
+ # remove channel dimension
178
+ if out.dim() == 3 and out.shape[1] == 1:
179
+ out = out.squeeze(1)
180
+
181
+ samples["raw_wav"] = out
182
+ return samples
183
+
184
+
185
+ class Shift(nn.Module):
186
+ def __init__(self, shift=8192, same=False, rngth=None):
187
+ """
188
+ :param shift: randomly shift the signals by up to this many samples
+ :param same: use the same random shift for every signal in the batch
190
+ """
191
+ super().__init__()
192
+ self.shift = shift
193
+ self.same = same
194
+ self.rngth = rngth
195
+
196
+ def forward(self, samples):
197
+ raw_wav = samples.get("raw_wav", None)
198
+ batch, channels, length = raw_wav.shape
199
+ length = length - self.shift
200
+ if self.shift > 0:
201
+ offsets = th.randint(
202
+ self.shift, [1 if self.same else batch, 1, 1], device=raw_wav.device, generator=self.rngth
203
+ )
204
+ offsets = offsets.expand(-1, channels, -1)
205
+ indexes = th.arange(length, device=raw_wav.device)
206
209
+ raw_wav = raw_wav.gather(2, indexes + offsets)
210
+ samples["raw_wav"] = raw_wav
211
+ return samples
212
+
213
+
214
+ class TimeScale(nn.Module):
215
+ """Fast time scale."""
216
+
217
+ def __init__(self, scale=2.0, target=1, rngnp=None, seed=42):
218
+ """
219
+ :param scale: randomly scales up to this maximum factor
220
+ """
221
+ super().__init__()
222
+ self.scale = scale
223
+ self.target = target
224
+ self.seed = seed
225
+ self.rngnp = rngnp if rngnp is not None else np.random.default_rng(seed=self.seed)
226
+
227
+ def forward(self, samples):
228
+ try:
229
+ raw_wav = samples["raw_wav"]
230
+ except KeyError:
231
+ logger.error("Missing required key 'raw_wav' in samples dict")
232
+ raise
233
+
234
+ if "padding_mask" in samples:
235
+ masks = samples.get("padding_mask")
236
+ else:
237
+ masks = th.ones_like(raw_wav)
238
+
239
+ # add channel dimension if not exist
240
+ if raw_wav.dim() == 2:
241
+ raw_wav = raw_wav.unsqueeze(1)
242
+ masks = masks.unsqueeze(1)
243
+
244
+ # what to augment: noise, clean, or both
245
+ if self.target == -1:
246
+ targets = [i for i in range(raw_wav.shape[0])]
247
+ else:
248
+ targets = [self.target]
249
+
250
+ for t in targets:
251
+ signal = raw_wav[t]
252
+ scaling = np.power(self.scale, self.rngnp.uniform(-1, 1))
253
+ output_size = int(signal.shape[-1] * scaling)
254
+ ref = th.arange(output_size, device=signal.device, dtype=signal.dtype).div_(scaling)
255
+
256
+ ref1 = ref.clone().type(th.int64)
257
+ ref2 = th.min(ref1 + 1, th.full_like(ref1, signal.shape[-1] - 1, dtype=th.int64))
258
+ r = ref - ref1.type(ref.type())
259
+ scaled_signal = signal[..., ref1] * (1 - r) + signal[..., ref2] * r
260
+ scaled_masks = masks[t][..., ref1] * (1 - r) + masks[t][..., ref2] * r
261
+
262
+ # trim or zero pad to the original size
263
+ if scaled_signal.shape[-1] > signal.shape[-1]:
264
+ nframes_offset = (scaled_signal.shape[-1] - signal.shape[-1]) // 2
265
+ scaled_signal = scaled_signal[..., nframes_offset : nframes_offset + signal.shape[-1]]
266
+ scaled_masks = scaled_masks[..., nframes_offset : nframes_offset + signal.shape[-1]]
267
+ else:
268
+ nframes_diff = signal.shape[-1] - scaled_signal.shape[-1]
269
+ pad_left = int(self.rngnp.uniform() * nframes_diff)
270
+ pad_right = nframes_diff - pad_left
271
+ scaled_signal = F.pad(
272
+ input=scaled_signal, pad=(pad_left, pad_right, 0, 0, 0, 0), mode="constant", value=0
273
+ )
274
+ scaled_masks = F.pad(
275
+ input=scaled_masks, pad=(pad_left, pad_right, 0, 0, 0, 0), mode="constant", value=0
276
+ )
277
+ raw_wav[t] = scaled_signal
278
+ masks[t] = scaled_masks
279
+
280
+ # remove channel dimension
281
+ if raw_wav.dim() == 3 and raw_wav.shape[1] == 1:
282
+ raw_wav = raw_wav.squeeze(1)
283
+ masks = masks.squeeze(1)
284
+
285
+ samples["raw_wav"] = raw_wav
286
+ samples["padding_mask"] = masks
287
+
288
+ return samples
289
+
290
+
291
+ class Flip(nn.Module):
292
+ def __init__(self, p=0.0, rngth=None):
293
+ super(Flip, self).__init__()
294
+
295
+ self.p = p
296
+ self.rngth = rngth
297
+
298
+ def forward(self, samples):
299
+ raw_wav = samples["raw_wav"]
300
+ if raw_wav.dim() > 2:
301
+ flip_mask = th.rand(raw_wav.shape[0], device=raw_wav.device, generator=self.rngth) <= self.p
302
+ raw_wav[flip_mask] = raw_wav[flip_mask].flip(-1)
303
+ else:
304
+ if th.rand(1, generator=self.rngth) <= self.p:
305
+ raw_wav = raw_wav.flip(0)
306
+ samples["raw_wav"] = raw_wav
307
+ return samples
308
+
309
+
310
+ class LowPassFilters(th.nn.Module):
311
+ """
312
+ Bank of low pass filters.
313
+
314
+ Args:
315
+ cutoffs (list[float]): list of cutoff frequencies, in [0, 1] expressed as `f/f_s` where
316
+ f_s is the samplerate.
317
+ width (int | None): width of the filters (i.e. kernel_size=2 * width + 1).
318
+ Default to `2 / min(cutoffs)`. Longer filters will have better attenuation
319
+ but more side effects.
320
+ Shape:
321
+ - Input: `(*, T)`
322
+ - Output: `(F, *, T)` with `F` the length of `cutoffs`.
323
+ """
324
+
325
+ def __init__(self, cutoffs: list, width: int | None = None):
326
+ super().__init__()
327
+
328
+ self.cutoffs = cutoffs
329
+
330
+ if not width:
331
+ width = int(2 / min(cutoffs))
332
+ self.width = width
333
+
334
+ window = th.hamming_window(2 * width + 1, periodic=False)
335
+ t = np.arange(-width, width + 1, dtype=np.float32)
336
+ filters = []
337
+ for cutoff in cutoffs:
338
+ sinc = th.from_numpy(np.sinc(2 * cutoff * t))
339
+ filters.append(2 * cutoff * sinc * window)
340
+ self.register_buffer("filters", th.stack(filters).unsqueeze(1))
341
+
342
+ def forward(self, input):
343
+ *others, t = input.shape
344
+ input = input.view(-1, 1, t)
345
+ out = F.conv1d(input, self.filters, padding=self.width)
346
+ return out.permute(1, 0, 2).reshape(-1, *others, t)
347
+
348
+ def __repr__(self):
349
+ return "LowPassFilters(width={}, cutoffs={})".format(self.width, self.cutoffs)
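All of the augmentations above consume and return the same samples dict keyed by "raw_wav" (optionally with "padding_mask"), so they compose by simple chaining. A small sketch under that assumption, using toy 16 kHz mono waveforms shaped (batch, time):

import torch

from NatureLM.augmentations import BandMask, RevEcho, TimeScale

# Toy batch of four 1-second waveforms at 16 kHz.
samples = {"raw_wav": torch.randn(4, 16000)}

# Each module takes the samples dict and returns it with "raw_wav" augmented.
for aug in (RevEcho(proba=0.5), BandMask(maxwidth=0.2), TimeScale(scale=1.2, target=-1)):
    samples = aug(samples)

print(samples["raw_wav"].shape)  # still torch.Size([4, 16000])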
NatureLM/checkpoint_utils.py ADDED
@@ -0,0 +1,100 @@
1
+ """Module for training utilities.
2
+
3
+ This module contains utility functions for training models. For example, saving model checkpoints.
4
+ """
5
+
6
+ import logging
7
+ import os
8
+ import tempfile
9
+ from typing import Any, Union
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ def maybe_unwrap_dist_model(model: nn.Module, use_distributed: bool) -> nn.Module:
18
+ return model.module if use_distributed else model
19
+
20
+
21
+ def get_state_dict(model, drop_untrained_params: bool = True) -> dict[str, Any]:
22
+ """Get model state dict. Optionally drop untrained parameters to keep only those that require gradient.
23
+
24
+ Args:
25
+ model: Model to get state dict from
26
+ drop_untrained_params: Whether to drop untrained parameters
27
+
28
+ Returns:
29
+ dict: Model state dict
30
+ """
31
+ if not drop_untrained_params:
32
+ return model.state_dict()
33
+
34
+ param_grad_dict = {k: v.requires_grad for (k, v) in model.named_parameters()}
35
+ state_dict = model.state_dict()
36
+
37
+ for k in list(state_dict.keys()):
38
+ if k in param_grad_dict.keys() and not param_grad_dict[k]:
39
+ # delete parameters that do not require gradient
40
+ del state_dict[k]
41
+
42
+ return state_dict
43
+
44
+
45
+ def torch_save_to_bucket(save_obj: Any, save_path: Union[str, os.PathLike], compress: bool = True) -> None:
46
+ """Save an object to a GCS bucket by serializing to a temporary local file and uploading it.
47
+
48
+ Args:
49
+ save_obj: Object to save (usually model state dict or checkpoint)
50
+ save_path: Path to save in GCS bucket (must be gs:// path)
51
+ compress: Whether to use compression. Default: True
52
+ """
53
+ if not is_gcs_path(save_path):
54
+ raise ValueError("save_path must be a GCS path")
55
+
56
+ # save to a temporary local file and then upload to GCS
57
+ with tempfile.NamedTemporaryFile() as tmp:
58
+ torch.save(save_obj, tmp.name, _use_new_zipfile_serialization=compress)
59
+ try:
60
+ save_path.upload_from(tmp.name)
61
+ except Exception as e:
62
+ logger.error(f"Error saving to GCP bucket: {e}")
63
+ raise e
64
+
65
+
66
+ def save_model_checkpoint(
67
+ model: nn.Module,
68
+ save_path: Union[str, os.PathLike],
69
+ use_distributed: bool = False,
70
+ drop_untrained_params: bool = False,
71
+ **objects_to_save,
72
+ ) -> None:
73
+ """Save model checkpoint.
74
+
75
+ Args:
76
+ model (nn.Module): Model to save
77
+ save_path (str | os.PathLike): Destination path for the checkpoint (local path or gs:// URI).
+ use_distributed (bool): Whether the model is wrapped in DDP; if so, unwrap it before saving. Default: False.
+ drop_untrained_params (bool): Whether to drop parameters that do not require gradients. Default: False.
83
+ **objects_to_save: Additional objects to save, e.g. optimizer state dict, etc.
84
+ """
85
+ if not is_gcs_path(save_path) and not os.path.exists(os.path.dirname(save_path)):
86
+ raise FileNotFoundError(f"Directory {os.path.dirname(save_path)} does not exist.")
87
+
88
+ model_no_ddp = maybe_unwrap_dist_model(model, use_distributed)
89
+ state_dict = get_state_dict(model_no_ddp, drop_untrained_params)
90
+ save_obj = {
91
+ "model": state_dict,
92
+ **objects_to_save,
93
+ }
94
+
95
+ logger.info("Saving checkpoint to {}.".format(save_path))
96
+
97
+ if is_gcs_path(save_path):
98
+ torch_save_to_bucket(save_obj, save_path)
99
+ else:
100
+ torch.save(save_obj, save_path)
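A sketch of how save_model_checkpoint might be called from a training loop. The model, optimizer and path are illustrative, and the example assumes the is_gcs_path helper referenced (but not imported) in this module is available at runtime:

import torch

from NatureLM.checkpoint_utils import save_model_checkpoint

model = torch.nn.Linear(8, 2)                      # stand-in for the real model
optimizer = torch.optim.AdamW(model.parameters())  # extra state goes through **objects_to_save

save_model_checkpoint(
    model,
    save_path="checkpoints/checkpoint_0.pth",  # parent directory must already exist
    use_distributed=False,
    drop_untrained_params=False,
    optimizer=optimizer.state_dict(),
    epoch=0,
)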
NatureLM/config.py ADDED
@@ -0,0 +1,234 @@
1
+ # Copyright (2024) Earth Species Project
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ from pathlib import Path
17
+ from typing import Any, Literal
18
+
19
+ import yaml
20
+ from pydantic import BaseModel, field_validator
21
+ from pydantic.v1.utils import deep_update
22
+ from pydantic_settings import BaseSettings, CliSettingsSource, YamlConfigSettingsSource
23
+
24
+
25
+ class OptimizerConfig(BaseModel, extra="forbid", validate_assignment=True):
26
+ max_epoch: int
27
+ warmup_steps: int
28
+ warmup_start_lr: float = -1
29
+ init_lr: float
30
+ min_lr: float
31
+ weight_decay: float
32
+ beta2: float = 0.999
33
+ max_grad_norm: float | None = None
34
+ max_grad_value: float | None = None
35
+ device: str = "cuda"
36
+
37
+
38
+ class AugmentationsConfig(BaseModel, extra="forbid", validate_assignment=True):
39
+ use_augmentation: bool = False
40
+
41
+ noise_prob: float = 0
42
+ noise_dirs: list[Path] | None = None
43
+ low_snr: float = -5
44
+ high_snr: float = 20
45
+ time_scale_prob: float = 0
46
+ time_scale: float = 1.2
47
+ mixup_prob: float = 0
48
+ mixup_count: int = 3
49
+ mask_audio_prob: float = 0
50
+
51
+
52
+ class RunConfig(BaseModel, extra="forbid", validate_assignment=True):
53
+ wandb_enabled: bool = True
54
+ amp: bool = False
55
+ seed: int
56
+ output_dir: Path
57
+ evaluate: bool
58
+ log_freq: int
59
+ epoch_based: bool
60
+ iters_per_epoch: int
61
+ accum_grad_iters: int
62
+ batch_size_train: int
63
+ batch_size_eval: int
64
+ num_workers: int
65
+ custom_metrics: bool
66
+ decode_ratio: float
67
+
68
+ device: Literal["cuda", "cpu"] = "cuda"
69
+ use_distributed: bool = False
70
+
71
+ world_size: int = 1
72
+ rank: int = 0
73
+ gpu: int | None = None
74
+ dist_backend: Literal["nccl"] = "nccl"
75
+ dist_url: str = "env://"
76
+
77
+ optims: OptimizerConfig
78
+ augmentations: AugmentationsConfig
79
+
80
+
81
+ class DatasetsConfig(BaseModel, extra="forbid", validate_assignment=True):
82
+ train_ann_path: Path
83
+ valid_ann_path: Path
84
+ test_ann_path: Path
85
+ audio_max_length_seconds: int
86
+
87
+ @field_validator("train_ann_path", "valid_ann_path", "test_ann_path", mode="after")
88
+ @classmethod
89
+ def check_files(cls, path: Path) -> Path:
90
+ if not path.exists():
91
+ raise ValueError(f"File {path} does not exist")
92
+ if path.suffix.lower() != ".jsonl":
93
+ raise ValueError(f"File {path} must be a JSONL file")
94
+ return path
95
+
96
+
97
+ class BeatsConfig(BaseModel, extra="forbid", validate_assignment=True):
98
+ input_patch_size: int = -1
99
+ embed_dim: int = 512
100
+ conv_bias: bool = False
101
+
102
+ encoder_layers: int = 12
103
+ encoder_embed_dim: int = 768
104
+ encoder_ffn_embed_dim: int = 3072
105
+ encoder_attention_heads: int = 12
106
+ activation_fn: str = "gelu"
107
+
108
+ layer_wise_gradient_decay_ratio: float = 0.6
109
+ layer_norm_first: bool = False
110
+ deep_norm: bool = True
111
+
112
+ dropout: float = 0.0
113
+ attention_dropout: float = 0.0
114
+ activation_dropout: float = 0.0
115
+ encoder_layerdrop: float = 0.05
116
+ dropout_input: float = 0.0
117
+
118
+ conv_pos: int = 128
119
+ conv_pos_groups: int = 16
120
+
121
+ relative_position_embedding: bool = True
122
+ num_buckets: int = 320
123
+ max_distance: int = 800
124
+ gru_rel_pos: bool = True
125
+
126
+ finetuned_model: bool = True
127
+ predictor_dropout: float = 0.0
128
+ predictor_class: int = 527
129
+
130
+
131
+ class GenerateConfig(BaseModel, extra="forbid", validate_assignment=True):
132
+ max_new_tokens: int
133
+ num_beams: int
134
+ do_sample: bool
135
+ min_length: int
136
+ temperature: float
137
+ repetition_penalty: float
138
+ length_penalty: float
139
+
140
+
141
+ class ModelConfig(BaseModel, extra="forbid", validate_assignment=True):
142
+ llama_path: Path
143
+ beats_path: Path | None = None
144
+ beats_cfg: BeatsConfig
145
+ ckpt: Path | None = None
146
+ freeze_beats: bool = True
147
+ use_audio_Qformer: bool = True
148
+ max_pooling: bool = False
149
+ downsample_factor: int = 4
150
+ freeze_audio_QFormer: bool = False
151
+ window_level_Qformer: bool = True
152
+ num_audio_query_token: int = 1
153
+ second_per_window: float = 0.333333
154
+ second_stride: float = 0.333333
155
+ audio_llama_proj_model: Path | None = None
156
+ freeze_audio_llama_proj: bool = False
157
+ device: str = "cuda"
158
+ lora: bool = True
159
+ lora_rank: int = 8
160
+ lora_alpha: int = 32
161
+ lora_dropout: float = 0.1
162
+ flash_attn: Literal["eager", "flash_attention_2"] = "eager"
163
+ prompt_template: str = ""
164
+ max_txt_len: int = 128
165
+ end_sym: str = "</s>"
166
+
167
+ @field_validator("beats_path", "audio_llama_proj_model", "ckpt", mode="before")
168
+ @classmethod
169
+ def detect_gcs_path(cls, value: Any) -> Any:
170
+ """Pydantic's automatic type conversion cannot handle gs:// paths, so this hook
+ is where they would be detected and converted to GSPath objects _before_
+ validation. In this release it simply passes values through unchanged."""
173
+ return value
174
+
175
+ @field_validator("ckpt", "audio_llama_proj_model", mode="before")
176
+ @classmethod
177
+ def legacy_empty_str(cls, value: Any) -> Any:
178
+ """In some of our config files we use "" to indicate that we don't have
179
+ a checkpoint. We've now switched to using None for this in the Config model but
180
+ let's keep this validator for backwards compatibility so people don't have to
181
+ change their configs"""
182
+ if isinstance(value, str) and value == "":
183
+ return None
184
+ else:
185
+ return value
186
+
187
+ @classmethod
188
+ def from_yaml(cls, yaml_file: str | os.PathLike) -> "ModelConfig":
189
+ yaml_values = YamlConfigSettingsSource(cls, yaml_file=str(yaml_file))
190
+ return cls.model_validate(yaml_values())
191
+
192
+
193
+ class Config(BaseSettings, extra="forbid", validate_assignment=True):
194
+ model: ModelConfig
195
+ run: RunConfig | None = None
196
+ datasets: DatasetsConfig | None = None
197
+ generate: GenerateConfig | None = None
198
+
199
+ def pretty_print(self):
200
+ print(self.model_dump_json(indent=4))
201
+
202
+ @classmethod
203
+ def from_sources(cls, yaml_file: str | Path, cli_args: list[str] = []) -> "Config":
204
+ """Create a Config object from a YAML file and CLI arguments. If there are
205
+ any conflicts, the CLI arguments will take precedence over the YAML file."""
206
+
207
+ yaml_file = Path(yaml_file)
208
+ if not yaml_file.exists():
209
+ raise FileNotFoundError(f"Config file {yaml_file} does not exist")
210
+
211
+ yaml_values = YamlConfigSettingsSource(cls, yaml_file=yaml_file)
212
+ cli_values = CliSettingsSource(cls, cli_parse_args=["--" + opt for opt in cli_args])
213
+ final_values = deep_update(yaml_values(), cli_values())
214
+ return cls.model_validate(final_values)
215
+
216
+ def to_yaml(self, path: str | os.PathLike) -> None:
217
+ save_config_as_yaml(self, path)
218
+
219
+
220
+ def save_config_as_yaml(data: BaseModel, filepath: str | os.PathLike) -> None:
221
+ """
222
+ Pydantic supports serializing/exporting models to various formats (dict, json, etc)
223
+ but not to yaml. This function is a workaround for that limitation.
224
+ """
225
+
226
+ filepath = Path(filepath)
227
+
228
+ if filepath.exists():
229
+ raise FileExistsError(f"File {filepath} already exists")
230
+
231
+ # The mode="json" is required because otherwise yaml.safe_dump() can't deal with
232
+ # Path|GSPath objects
233
+ with filepath.open("w") as f:
234
+ yaml.safe_dump(data.model_dump(mode="json"), f, sort_keys=False)
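Config.from_sources merges the YAML file with command-line overrides, with the CLI taking precedence. A sketch of the intended call pattern; the YAML path is illustrative and the dotted override key assumes pydantic-settings' nested CLI syntax:

from NatureLM.config import Config

# from_sources() prepends "--" to each entry, so overrides are passed as bare "key=value" strings.
cfg = Config.from_sources(
    "configs/inference.yml",          # must satisfy the Config schema above
    cli_args=["model.lora_rank=16"],  # overrides model.lora_rank from the YAML
)
cfg.pretty_print()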
NatureLM/dataset.py ADDED
@@ -0,0 +1,550 @@
1
+ # Copyright (2024) Earth Species Project
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ """
17
+ Mixing examples.
18
+ Can mix:
19
+ - base: options-detection, add: open-ended:
20
+ Take all open-ended labels. Add them to the options. Add them to the labels.
21
+ - base: open-ended, add: open-ended
22
+ Concatenate labels
23
+ """
24
+
25
+ import glob
26
+ import json
27
+ import os
28
+ import random
29
+ from collections import defaultdict
30
+ from pathlib import Path
31
+ from typing import Literal
32
+
33
+ import numpy as np
34
+ import soundfile as sf
35
+ import torch
36
+ from torch.nn.utils.rnn import pad_sequence
37
+ from torch.utils.data import Dataset
38
+
39
+ from NatureLM.utils import snr_scale, time_scale
40
+
41
+
42
+ def write_example_to_file(base_filename, audio, sr=16000, suffix="_output", save_dir="debug_outputs"):
43
+ """
44
+ Writes the audio tensor to a file for debugging or inspection purposes.
45
+
46
+ Args:
47
+ base_filename (str): The base name of the original file.
48
+ audio (torch.Tensor or numpy.ndarray): The audio waveform to save.
49
+ sr (int): Sampling rate of the audio (default: 16000 Hz).
50
+ suffix (str): Optional suffix to append to the filename.
51
+ save_dir (str): Directory where the files will be saved.
52
+ """
53
+ if isinstance(audio, torch.Tensor):
54
+ audio = audio.numpy() # Convert to numpy if necessary
55
+
56
+ # Ensure the save directory exists
57
+ os.makedirs(save_dir, exist_ok=True)
58
+
59
+ # Create the output file path
60
+ filename = f"{os.path.splitext(base_filename)[0]}{suffix}.wav"
61
+ output_path = os.path.join(save_dir, filename)
62
+
63
+ try:
64
+ # Write the audio to the file
65
+ sf.write(output_path, audio, sr)
66
+ print(f"Saved audio to {output_path}")
67
+ except Exception as e:
68
+ print(f"Failed to write audio to file: {e}")
69
+
70
+
71
+ # Example usage in your code
72
+ # write_example_to_file(os.path.basename(ann["path"]), audio, suffix="_ts")
73
+
74
+
75
+ def collater(samples):
76
+ """Collate samples into a batch.
77
+
78
+ Samples is a list of dictionaries, each containing the following keys:
79
+ - raw_wav: a list of tensors containing the raw audio waveform
80
+ - text: a list of strings containing the text
81
+ - task: a list of strings containing the task
82
+ - id: a list of strings containing the id
83
+ - prompt: a list of strings containing the prompt
84
+ - index: a list of integers containing the index
85
+
86
+ The individual audio waveforms will be stacked along the batch dimension for easier
+ processing in the audio model. To keep track of which audio belongs to which sample, we add
88
+ the audio_chunk_sizes key to the batch dictionary.
89
+ """
90
+ flat_raw_wav = []
91
+ audio_chunk_sizes = []
92
+
93
+ for s in samples:
94
+ chunk_size = len(s["raw_wav"])
95
+ audio_chunk_sizes.append(chunk_size)
96
+ flat_raw_wav.extend(s["raw_wav"])
97
+ # raw_wav = [torch.from_numpy(a) for a in flat_raw_wav]
98
+ raw_wav = flat_raw_wav
99
+ raw_wav_length = torch.tensor([len(a) for a in raw_wav])
100
+ raw_wav = pad_sequence(raw_wav, batch_first=True, padding_value=0)
101
+ padding_mask = torch.arange(raw_wav.size(1)).unsqueeze(0) >= raw_wav_length.unsqueeze(1)
102
+
103
+ text = [s["text"] for s in samples]
104
+ prompt = [s["prompt"] for s in samples]
105
+ task = [s["task"] for s in samples]
106
+ id = [s["id"] for s in samples]
107
+ index = [s["index"] for s in samples]
108
+
109
+ return {
110
+ "raw_wav": raw_wav,
111
+ "padding_mask": padding_mask,
112
+ "text": text,
113
+ "task": task,
114
+ "id": id,
115
+ "prompt": prompt,
116
+ "index": index,
117
+ "audio_chunk_sizes": audio_chunk_sizes,
118
+ }
119
+
120
+
121
+ class NatureLMDataset(Dataset):
122
+ def __init__(
123
+ self,
124
+ ann_path: str | Path,
125
+ *,
126
+ max_length_seconds: int = 10,
127
+ cropping: Literal["random", "start"] | None = "random",
128
+ noise_prob: float = 0.0,
129
+ noise_dirs: list[str] | list[Path] | None = None,
130
+ low_snr: float = -5,
131
+ high_snr: float = 20,
132
+ time_scale_prob: float = 0.0,
133
+ time_scale: float = 1.2,
134
+ seed: int = 0,
135
+ mixup_prob: float = 0.0,
136
+ mixup_count: int = 3,
137
+ use_augmentation: bool = False,
138
+ mask_audio_prob: float = 0.0,
139
+ ):
140
+ super().__init__()
141
+
142
+ ann_path = Path(ann_path)
143
+
144
+ if not ann_path.exists():
145
+ raise FileNotFoundError(f"Dataset file {ann_path} not found")
146
+
147
+ try:
148
+ with open(ann_path, "r") as f:
149
+ data = json.load(f)
150
+ self.annotation = data["annotation"]
151
+ except (json.JSONDecodeError, KeyError):
152
+ with open(ann_path, "r") as f:
153
+ self.annotation = [json.loads(line) for line in f]
154
+
155
+ #### mixup related variables
156
+ ### hash table for tasks to sample the tasks faster
157
+ self.tasks = defaultdict(list)
158
+ for i, ann in enumerate(self.annotation):
159
+ if "task" in ann and "text" in ann and ann["text"] != "None" and "path" in ann:
160
+ self.tasks[ann["task"]].append(i)
161
+
162
+ self.mixup_tasks = {
163
+ task: []
164
+ for task in self.tasks.keys()
165
+ if task.endswith("simple-detection")
166
+ or task.endswith("multiple-detection") # Add more tasks after validating prompt mixing.
167
+ or task.endswith("sci-detection-random")
168
+ or task.endswith("common-detection-random")
169
+ }
170
+ for k in self.mixup_tasks.keys():
171
+ # whichever the base, only mix open-ended tasks.
172
+ if "sci-" in k:
173
+ self.mixup_tasks[k] = [
174
+ task
175
+ for task in self.mixup_tasks.keys()
176
+ if task.endswith("sci-simple-detection") or task.endswith("sci-multiple-detection")
177
+ ]
178
+ elif "common-" in k:
179
+ self.mixup_tasks[k] = [
180
+ task
181
+ for task in self.mixup_tasks.keys()
182
+ if task.endswith("common-simple-detection") or task.endswith("common-multiple-detection")
183
+ ]
184
+ else:
185
+ self.mixup_tasks[k] = [task for task in self.mixup_tasks.keys() if "common-" in task]
186
+
187
+ # print("num annotations", len(self.annotation))
188
+ # print("annotation 0", self.annotation[0])
189
+ # self.annotation = [a for a in self.annotation if "task" in a and "detection" not in a["task"]] # no detection... :(
190
+ self.max_length_seconds = max_length_seconds
191
+ self.cropping = cropping
192
+ self.use_augmentation = use_augmentation
193
+
194
+ ### noise augmentation
195
+ self.rng = random.Random(seed)
196
+ self.rngnp = np.random.default_rng(seed=seed)
197
+ self.noise_dirs = noise_dirs
198
+ self.noise_prob = noise_prob
199
+ self.noise_files = []
200
+ self.low_snr = low_snr
201
+ self.high_snr = high_snr
202
+ self.mask_audio_prob = mask_audio_prob
203
+ if noise_dirs is not None and len(self.noise_dirs) > 0 and self.use_augmentation:
204
+ for noise_dir in noise_dirs:
205
+ noise_from_dir = glob.glob(os.path.join(noise_dir, "*.wav"))
206
+ if len(noise_from_dir) < 3000:
207
+ noise_from_dir = noise_from_dir * 3
208
+ print("noise files from dir", noise_dir, len(noise_from_dir))
209
+ self.noise_files.extend(noise_from_dir)
210
+
211
+ ### mixup augmentation
212
+ self.mixup_prob = mixup_prob
213
+ self.mixup_count = mixup_count
214
+ # ### time scale augmentation
215
+ self.time_scale = time_scale
216
+ self.time_scale_prob = time_scale_prob
217
+ # tasks = set([annotation["task"] if "task" in annotation else "empty" for annotation in self.annotation])
218
+ print(":::all tasks:::", self.tasks.keys())
219
+ print("num examples", len(self.annotation))
220
+
221
+ def __len__(self):
222
+ return len(self.annotation)
223
+
224
+ def collater(self, samples):
225
+ return collater(samples)
226
+
227
+ def load_audio(self, audio_path, shift_allowed: bool, noise_allowed: bool):
228
+ audio, sr = sf.read(audio_path)
229
+ # assert sr == 16000
230
+ if sr != 16000:
231
+ print("other sr!", sr, audio_path)
232
+ if len(audio.shape) == 2: # stereo to mono
233
+ audio = audio.mean(axis=1)
234
+
235
+ ### time scale augmentation
236
+ if self.use_augmentation and self.rng.random() < self.time_scale_prob and self.time_scale > 0 and shift_allowed:
237
+ # print(f"{index} scaling audio")
238
+ # write_example_to_file(os.path.basename(ann["path"]), audio[: sr * self.max_length_seconds] )
239
+ audio = time_scale(torch.tensor(audio), scale=self.time_scale, rngnp=self.rngnp).numpy()
240
+ # write_example_to_file(os.path.basename(ann["path"]), audio[: sr * self.max_length_seconds] , suffix='_ts')
241
+
242
+ # Randomly crop a max_length_seconds window if audio is longer than 10 seconds
243
+ if len(audio) > sr * self.max_length_seconds and self.cropping == "random":
244
+ max_start = len(audio) - sr * self.max_length_seconds
245
+ start = random.randint(0, max_start)
246
+ audio = audio[start : start + sr * self.max_length_seconds]
247
+ else: # no random cropping
248
+ audio = audio[: sr * self.max_length_seconds] # Truncate audio to at most max_length_seconds
249
+
250
+ ### noise augmentation
251
+ audio = torch.tensor(audio)
252
+ ### noise augmentation
253
+ if (
254
+ self.use_augmentation
255
+ and self.rng.random() < self.noise_prob
256
+ and len(self.noise_files) > 0
257
+ and noise_allowed
258
+ ):
259
+ # write_example_to_file(os.path.basename(ann["path"]), audio)
260
+ # print(f"{index} adding noise")
261
+ noise_file = self.rng.choice(self.noise_files)
262
+ if not os.path.exists(noise_file):
263
+ print(f"Warning: noise file {noise_file} does not exist")
264
+ else:
265
+ noise_audio, noise_sr = sf.read(noise_file)
266
+ assert noise_sr == 16000
267
+ if len(noise_audio.shape) == 2:
268
+ noise_audio = noise_audio.mean(axis=1)
269
+
270
+ noise_audio = torch.tensor(noise_audio)
271
+
272
+ ### repeat or trim to the audio size
273
+ if len(audio) > len(noise_audio):
274
+ if len(noise_audio) == 0:
275
+ print(
276
+ "----- Warning: Noise audio length is zero. ---------- ",
277
+ noise_file,
278
+ )
279
+ # Option 1: Skip noise augmentation by setting noise_audio to zero
280
+ noise_audio = torch.zeros_like(audio)
281
+ else:
282
+ nrepeats = int(np.maximum(2, np.ceil(len(audio) / len(noise_audio))))
283
+ noise_audio = noise_audio.repeat(nrepeats)
284
+ ### Randomly crop the noise file if it is too long
285
+ if len(noise_audio) > len(audio):
286
+ max_start = len(noise_audio) - len(audio)
287
+ start = random.randint(0, max_start)
288
+ noise_audio = noise_audio[start : start + len(audio)]
289
+
290
+ ### remix with specified snr
291
+ snr = self.rngnp.uniform(self.low_snr, self.high_snr)
292
+ snr = torch.tensor([snr])
293
+ noise_audio = snr_scale(audio, noise_audio, snr)
294
+ audio = audio + noise_audio
295
+
296
+ # write_example_to_file(os.path.basename(audio_path), audio, suffix='_noise')
297
+ if len(audio) > self.max_length_seconds * sr:
298
+ print("long audio", len(audio), len(noise_audio))
299
+ audio = audio[: self.max_length_seconds * sr]
300
+
301
+ # pad all audios to max_len_seconds in _getitem_ to ensure no padding inconsistencies.
302
+ if len(audio) < sr * self.max_length_seconds:
303
+ pad_size = sr * self.max_length_seconds - len(audio)
304
+ audio = torch.nn.functional.pad(audio, (0, pad_size))
305
+
306
+ audio = torch.clamp(audio, -1.0, 1.0)
307
+
308
+ return audio
309
+
310
+ def _mix_labels(self, text, text_to_mix):
311
+ """
312
+ Given two comma-separated label strings (e.g., "gorilla, zebra"),
313
+ combine them without introducing duplicates. If either is "None",
314
+ return the other as-is (unless both are "None").
315
+ """
316
+ # If `text_to_mix` is explicitly "None", just return `text`.
317
+ if text_to_mix == "None":
318
+ return text
319
+
320
+ # If `text` is explicitly "None", just return `text_to_mix`.
321
+ if text == "None":
322
+ return text_to_mix
323
+
324
+ # Split both strings by comma, stripping whitespace
325
+ text_list = [item.strip() for item in text.split(",") if item.strip()]
326
+ text_to_mix_list = [item.strip() for item in text_to_mix.split(",") if item.strip()]
327
+
328
+ # Deduplicate: add only new items from text_to_mix_list
329
+ combined_set = set(text_list)
330
+ for item in text_to_mix_list:
331
+ if item not in combined_set:
332
+ text_list.append(item)
333
+ combined_set.add(item)
334
+
335
+ # If there's nothing left after deduplication, return "None".
336
+ if not text_list:
337
+ return "None"
338
+
339
+ # Rejoin them into a comma-separated string
340
+ return ", ".join(text_list)
341
+
342
+ def _mix_prompts(self, text, text_to_mix, prompt):
343
+ """
344
+ If the prompt is in the form:
345
+ "Which of these, if any, are present in the audio recording? option1, option2, ..."
346
+
347
+ 1. Parse out the question (before '?') and the list of prompt choices (after '?').
348
+ 2. Convert both `text` and `text_to_mix` into lists, checking for items not in the prompt.
349
+ 3. Append any missing answers to the prompt choices.
350
+ 4. Shuffle the choices.
351
+ 5. Reassemble and return the new prompt.
352
+
353
+ If the prompt does not follow the expected structure, it is returned unmodified.
354
+ """
355
+ # Split into two parts: question + choices
356
+ splitted = prompt.split("?")
357
+ if len(splitted) != 2:
358
+ # If we don't have exactly one question mark segment, just return the original prompt
359
+ return prompt
360
+
361
+ question = splitted[0].strip()
362
+ potential_choices_str = splitted[1].strip()
363
+
364
+ # Split the prompt choices
365
+ if not potential_choices_str:
366
+ prompt_choices = []
367
+ else:
368
+ prompt_choices = [c.strip() for c in potential_choices_str.split(",") if c.strip()]
369
+
370
+ # Parse `text`
371
+ text_list = [item.strip() for item in text.split(",") if item.strip()]
372
+
373
+ # Parse `text_to_mix`
374
+ text_to_mix_list = [item.strip() for item in text_to_mix.split(",") if item.strip()]
375
+
376
+ # Add any new items from text_list to the prompt
377
+ for item in text_list:
378
+ if item not in prompt_choices:
379
+ prompt_choices.append(item)
380
+
381
+ # Add any new items from text_to_mix_list to the prompt
382
+ for item in text_to_mix_list:
383
+ if item not in prompt_choices:
384
+ prompt_choices.append(item)
385
+
386
+ # Shuffle consistently with self.rng
387
+ self.rng.shuffle(prompt_choices)
388
+
389
+ # Reassemble
390
+ new_prompt = question + "? " + ", ".join(prompt_choices)
391
+ return new_prompt
392
+
393
+ def _apply_mixup(self, prompt, audio, text, task, filename=None):
394
+ # mixup_applied = False
395
+ if (
396
+ self.use_augmentation and self.rng.random() < self.mixup_prob and task in self.mixup_tasks
397
+ # and text != "None" # Allow complex 'None' examples.
398
+ ):
399
+ # write_example_to_file(os.path.basename(ann["path"]), audio)
400
+ # print(f"{index} mixing up")
401
+ mixup_indices = []
402
+ for pair_task in self.mixup_tasks[task]:
403
+ mixup_indices.extend(self.tasks[pair_task])
404
+ # mixup_indices = mixup_indices.remove(index)
405
+
406
+ if len(mixup_indices) == 0:
407
+ print("No mixup partner found")
408
+ else:
409
+ ### choose n_mixup random partners
410
+ n_mixup = self.rng.randint(1, self.mixup_count)
411
+ mixup_indices = self.rng.sample(mixup_indices, n_mixup)
412
+ # print(f"Mixing up with indices {mixup_indices}")
413
+ for mixup_index in mixup_indices:
414
+ mixup_ann = self.annotation[mixup_index]
415
+ mixup_audio, _ = sf.read(mixup_ann["path"])
416
+ if len(mixup_audio.shape) == 2:
417
+ mixup_audio = mixup_audio.mean(axis=1)
418
+ mixup_audio = mixup_audio[: len(audio)]
419
+ if len(mixup_audio) < len(audio):
420
+ pad_size = len(audio) - len(mixup_audio)
421
+ mixup_audio = np.pad(mixup_audio, (0, pad_size), mode="constant")
422
+ mixup_audio = torch.from_numpy(mixup_audio).float()
423
+ lam = np.clip(self.rngnp.beta(1.0, 1.0), 0.1, 0.8)
424
+
425
+ # Mix the raw_wav
426
+ audio = lam * audio + (1 - lam) * mixup_audio
427
+
428
+ ### Mix the prompts if the labels are given in prompts
429
+ if text in prompt:
430
+ prompt = self._mix_prompts(text, mixup_ann["text"], prompt)
431
+
432
+ ### Mix the labels
433
+ text = self._mix_labels(text, mixup_ann["text"])
434
+
435
+ # mixup_applied = True
436
+
437
+ # DEBUG: If mixup was actually applied, save the final audio
438
+ # if mixup_applied and filename is not None:
439
+ # # Just add a suffix to the original filename to indicate mixup
440
+ # base_filename = os.path.basename(filename)
441
+ # write_example_to_file(
442
+ # base_filename=base_filename,
443
+ # audio=audio,
444
+ # sr=16000,
445
+ # suffix="_mixup",
446
+ # save_dir="mixup_outputs"
447
+ # )
448
+ # print(f"mixup for {filename}::: prompt {prompt} label {text}")
449
+
450
+ return prompt, audio, text
451
+
452
+ def _load_noise(self, shift_allowed: bool):
453
+ noise_file = self.rng.choice(self.noise_files)
454
+ noise_audio, noise_sr = sf.read(noise_file)
455
+ assert noise_sr == 16000, f"Expected noise sample rate 16000, got {noise_sr}"
456
+ if len(noise_audio.shape) == 2:
457
+ noise_audio = noise_audio.mean(axis=1)
458
+
459
+ # Time scale augmentation if applicable
460
+ if self.use_augmentation and self.rng.random() < self.time_scale_prob and self.time_scale > 0 and shift_allowed:
461
+ noise_audio = time_scale(torch.tensor(noise_audio), scale=self.time_scale, rngnp=self.rngnp).numpy()
462
+
463
+ # Randomly crop or pad to match max_length_seconds
464
+ if len(noise_audio) > self.max_length_seconds * 16000 and self.cropping == "random":
465
+ max_start = len(noise_audio) - self.max_length_seconds * 16000
466
+ start = random.randint(0, max_start)
467
+ noise_audio = noise_audio[start : start + self.max_length_seconds * 16000]
468
+ else:
469
+ noise_audio = noise_audio[: self.max_length_seconds * 16000]
470
+
471
+ # Pad if needed
472
+ if len(noise_audio) < self.max_length_seconds * 16000:
473
+ pad_size = self.max_length_seconds * 16000 - len(noise_audio)
474
+ noise_audio = np.pad(noise_audio, (0, pad_size), mode="constant")
475
+
476
+ noise_audio = torch.tensor(noise_audio).float()
477
+ noise_audio = torch.clamp(noise_audio, -1.0, 1.0)
478
+ return noise_audio
479
+
480
+ def __getitem__(self, index):
481
+ ann = self.annotation[index]
482
+ # print("loading audio::", ann)
483
+ shift_allowed = "pitch" not in ann.get("task", "")
484
+ noise_allowed = (
485
+ "/A/" not in ann.get("path", "")
486
+ and "-qa" not in ann.get("task", "")
487
+ and "icl" not in ann.get("task", "")
488
+ and "caption" not in ann.get("task", "")
489
+ and "animal-instructions" not in ann.get("task", "")
490
+ )
491
+
492
+ task = ann.get("task", "asr")
493
+ text = ann["text"]
494
+ prompt = ann["prompt"]
495
+
496
+ replace_with_noise = (
497
+ self.use_augmentation
498
+ and task.endswith("detection")
499
+ and self.rng.random() < self.mask_audio_prob
500
+ and len(self.noise_files) > 0
501
+ )
502
+
503
+ if replace_with_noise:
504
+ # Replace audio with noise
505
+ audio = self._load_noise(shift_allowed)
506
+ audios = [audio]
507
+ text = "None"
508
+
509
+ else:
510
+ if "path" in ann and ann["path"] is not None:
511
+ audio = self.load_audio(ann["path"], shift_allowed, noise_allowed)
512
+ audios = [audio]
513
+ else:
514
+ audios = [self.load_audio(p, shift_allowed, noise_allowed) for p in ann["files"]]
515
+
516
+ if len(audios) == 1:
517
+ prompt, mixed_audio, text = self._apply_mixup(prompt, audios[0], text, task, filename=ann.get("path"))
518
+ audios = [mixed_audio]
519
+
520
+ return {
521
+ "raw_wav": audios,
522
+ "text": text,
523
+ "task": task,
524
+ "id": ann.get("path") or ";".join(ann["files"]),
525
+ "prompt": prompt,
526
+ "index": index, # track which element for eval output
527
+ "ann": ann, # Include annotation for mixup
528
+ }
529
+
530
+
531
+ if __name__ == "__main__":
532
+ dataset = NatureLMDataset(
533
+ ann_path="/home/ubuntu/foundation-model-storage/foundation-model-data/data/compiled-datasets/v1/s2_eval_valid.jsonl",
534
+ noise_dirs=["resource/audio_demo"],
535
+ max_length_seconds=10,
536
+ use_augmentation=True,
537
+ mixup_prob=1.0, # For demonstration, force mixup if possible
538
+ mixup_count=2, # Up to 2 mixup partners
539
+ mask_audio_prob=0.2,
540
+ seed=42,
541
+ noise_prob=0.5,
542
+ )
543
+
544
+ # Process just a few to see the saved mixups
545
+ for i in range(300):
546
+ sample = dataset[i]
547
+ # print("Final text:", sample["text"])
548
+ # print("Final prompt:", sample["prompt"])
549
+ # print("-" * 40)
550
+ print("Done! Look in 'debug_outputs' folder for saved mixup files.")
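In training code the dataset is typically paired with a standard DataLoader and its own collater, which stacks the per-sample audio lists and builds the padding mask. A sketch with an illustrative annotation path (a JSONL whose rows carry "path", "text", "prompt" and usually "task"):

from torch.utils.data import DataLoader

from NatureLM.dataset import NatureLMDataset

dataset = NatureLMDataset(ann_path="data/train.jsonl", max_length_seconds=10)

loader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=dataset.collater,  # yields raw_wav, padding_mask, prompt, text, ... per batch
)

batch = next(iter(loader))
print(batch["raw_wav"].shape, batch["padding_mask"].shape, len(batch["prompt"]))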
NatureLM/dist_utils.py ADDED
@@ -0,0 +1,109 @@
1
+ """
2
+ Adapted from salesforce@LAVIS. Below is the original copyright:
3
+ Copyright (c) 2022, salesforce.com, inc.
4
+ All rights reserved.
5
+ SPDX-License-Identifier: BSD-3-Clause
6
+ For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
7
+ """
8
+
9
+ import datetime
10
+ import functools
11
+ import os
12
+
13
+ import torch
14
+ import torch.distributed as dist
15
+
16
+
17
+ def setup_for_distributed(is_master):
18
+ """
19
+ This function disables printing when not in master process
20
+ """
21
+ import builtins as __builtin__
22
+
23
+ builtin_print = __builtin__.print
24
+
25
+ def print(*args, **kwargs):
26
+ force = kwargs.pop("force", False)
27
+ if is_master or force:
28
+ builtin_print(*args, **kwargs)
29
+
30
+ __builtin__.print = print
31
+
32
+
33
+ def is_dist_avail_and_initialized():
34
+ if not dist.is_available():
35
+ return False
36
+ if not dist.is_initialized():
37
+ return False
38
+ return True
39
+
40
+
41
+ def get_world_size():
42
+ if not is_dist_avail_and_initialized():
43
+ return 1
44
+ return dist.get_world_size()
45
+
46
+
47
+ def get_rank():
48
+ if not is_dist_avail_and_initialized():
49
+ return 0
50
+ return dist.get_rank()
51
+
52
+
53
+ def is_main_process():
54
+ return get_rank() == 0
55
+
56
+
57
+ def init_distributed_mode(args):
58
+ if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
59
+ args.rank = int(os.environ["RANK"])
60
+ args.world_size = int(os.environ["WORLD_SIZE"])
61
+ args.gpu = int(os.environ["LOCAL_RANK"])
62
+ elif "SLURM_PROCID" in os.environ:
63
+ args.rank = int(os.environ["SLURM_PROCID"])
64
+ args.gpu = args.rank % torch.cuda.device_count()
65
+ else:
66
+ print("Not using distributed mode")
67
+ args.use_distributed = False
68
+ return
69
+
70
+ args.use_distributed = True
71
+
72
+ torch.cuda.set_device(args.gpu)
73
+ print(
74
+ "| distributed init (rank {}, world {}): {}".format(args.rank, args.world_size, args.dist_url),
75
+ flush=True,
76
+ )
77
+ torch.distributed.init_process_group(
78
+ backend=args.dist_backend,
79
+ init_method=args.dist_url,
80
+ world_size=args.world_size,
81
+ rank=args.rank,
82
+ timeout=datetime.timedelta(days=365), # allow auto-downloading and de-compressing
83
+ )
84
+ torch.distributed.barrier()
85
+ setup_for_distributed(args.rank == 0)
86
+
87
+
88
+ def get_dist_info():
89
+ if torch.__version__ < "1.0":
90
+ initialized = dist._initialized
91
+ else:
92
+ initialized = dist.is_initialized()
93
+ if initialized:
94
+ rank = dist.get_rank()
95
+ world_size = dist.get_world_size()
96
+ else: # non-distributed training
97
+ rank = 0
98
+ world_size = 1
99
+ return rank, world_size
100
+
101
+
102
+ def main_process(func):
103
+ @functools.wraps(func)
104
+ def wrapper(*args, **kwargs):
105
+ rank, _ = get_dist_info()
106
+ if rank == 0:
107
+ return func(*args, **kwargs)
108
+
109
+ return wrapper
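A sketch of how these helpers are typically wired together: init_distributed_mode mutates an args-style object carrying the same fields as RunConfig in config.py, and main_process guards a function so it only runs on rank 0. The SimpleNamespace is an illustrative stand-in for that config object:

from types import SimpleNamespace

from NatureLM.dist_utils import init_distributed_mode, main_process

args = SimpleNamespace(dist_backend="nccl", dist_url="env://", use_distributed=False)
init_distributed_mode(args)  # falls back to single-process mode when RANK/WORLD_SIZE are unset

@main_process
def log_metrics(step: int, loss: float) -> None:
    # Runs only on rank 0, so logs are not duplicated across workers.
    print(f"step {step}: loss {loss:.4f}")

log_metrics(0, 1.234)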
NatureLM/infer.py ADDED
@@ -0,0 +1,315 @@
 
1
+ """Run NatureLM-audio over a set of audio file paths or a directory with audio files."""
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ import soundfile as sf
9
+ import torch
10
+
11
+ from NatureLM.config import Config
12
+ from NatureLM.models import NatureLM
13
+ from NatureLM.processors import NatureLMAudioProcessor
14
+ from NatureLM.utils import move_to_device
15
+
16
+ _MAX_LENGTH_SECONDS = 10
17
+ _MIN_CHUNK_LENGTH_SECONDS = 0.5
18
+ _SAMPLE_RATE = 16000 # Assuming the model uses a sample rate of 16kHz
19
+ _AUDIO_FILE_EXTENSIONS = [".wav", ".mp3", ".flac", ".ogg"] # Add other audio file formats as needed
20
+ _DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
21
+ __this_dir = Path(__file__).parent.parent
22
+ _DEFAULT_CONFIG_PATH = __this_dir / "configs" / "inference.yml"
23
+
24
+
25
+ def load_model_and_config(
26
+ cfg_path: str | Path = _DEFAULT_CONFIG_PATH, device: str = _DEVICE
27
+ ) -> tuple[NatureLM, Config]:
28
+ """Load the NatureLM model and configuration.
29
+ Returns:
30
+ tuple: The loaded model and configuration.
31
+ """
32
+ model = NatureLM.from_pretrained("EarthSpeciesProject/NatureLM-audio")
33
+ model = model.to(device).eval()
34
+ model.llama_tokenizer.pad_token_id = model.llama_tokenizer.eos_token_id
35
+ model.llama_model.generation_config.pad_token_id = model.llama_tokenizer.pad_token_id
36
+
37
+ cfg = Config.from_sources(cfg_path)
38
+ return model, cfg
39
+
40
+
41
+ def output_template(model_output: str, start_time: float, end_time: float) -> str:
42
+ """Format the output of the model."""
43
+ return f"#{start_time:.2f}s - {end_time:.2f}s#: {model_output}\n"
44
+
45
+
46
+ def sliding_window_inference(
47
+ audio: str | Path | np.ndarray,
48
+ query: str,
49
+ processor: NatureLMAudioProcessor,
50
+ model: NatureLM,
51
+ cfg: Config,
52
+ window_length_seconds: float = 10.0,
53
+ hop_length_seconds: float = 10.0,
54
+ input_sr: int = _SAMPLE_RATE,
55
+ device: str = _DEVICE,
56
+ ) -> str:
57
+ """Run inference on a long audio file using sliding window approach.
58
+
59
+ Args:
60
+ audio (str | Path | np.ndarray): Path to the audio file.
61
+ query (str): Query for the model.
62
+ processor (NatureLMAudioProcessor): Audio processor.
63
+ model (NatureLM): NatureLM model.
64
+ cfg (Config): Model configuration.
65
+ window_length_seconds (float): Length of the sliding window in seconds.
66
+ hop_length_seconds (float): Hop length for the sliding window in seconds.
67
+ input_sr (int): Sample rate of the audio file.
68
+
69
+ Returns:
70
+ str: The output of the model.
71
+
72
+ Raises:
73
+ ValueError: If the audio file is too short or if the audio file path is invalid.
74
+ """
75
+ if isinstance(audio, str) or isinstance(audio, Path):
76
+ audio_array, input_sr = sf.read(str(audio))
77
+ elif isinstance(audio, np.ndarray):
78
+ audio_array = audio
79
+ print(f"Using provided sample rate: {input_sr}")
80
+
81
+ audio_array = audio_array.squeeze()
82
+ if audio_array.ndim > 1:
83
+ axis_to_average = int(np.argmin(audio_array.shape))
84
+ audio_array = audio_array.mean(axis=axis_to_average)
85
+ audio_array = audio_array.squeeze()
86
+
87
+ # Do initial check that the audio is long enough
88
+ if audio_array.shape[-1] < int(_MIN_CHUNK_LENGTH_SECONDS * input_sr):
89
+ raise ValueError(f"Audio is too short. Minimum length is {_MIN_CHUNK_LENGTH_SECONDS} seconds.")
90
+
91
+ start = 0
92
+ stride = int(hop_length_seconds * input_sr)
93
+ window_length = int(window_length_seconds * input_sr)
94
+
95
+ output = ""
96
+ while True:
97
+ chunk = audio_array[start : start + window_length]
98
+ if chunk.shape[-1] < int(_MIN_CHUNK_LENGTH_SECONDS * input_sr):
99
+ break
100
+
101
+ # Resamples, pads, truncates and creates torch Tensor
102
+ audio_tensor, prompt_list = processor([chunk], [query], [input_sr])
103
+
104
+ input_to_model = {
105
+ "raw_wav": audio_tensor,
106
+ "prompt": prompt_list[0],
107
+ "audio_chunk_sizes": 1,
108
+ "padding_mask": torch.zeros_like(audio_tensor).to(torch.bool),
109
+ }
110
+ input_to_model = move_to_device(input_to_model, device)
111
+
112
+ # generate
113
+ prediction: str = model.generate(input_to_model, cfg.generate, prompt_list)[0]
114
+
115
+ # Post-process the prediction
116
+ prediction = output_template(prediction, start / input_sr, (start + window_length) / input_sr)
117
+ output += prediction
118
+
119
+ # Move the window
120
+ start += stride
121
+
122
+ if start + window_length > audio_array.shape[-1]:
123
+ break
124
+
125
+ return output
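A quick sketch (not part of the commit) of how the window and hop arithmetic above chunks a recording, assuming the default 10 s window, 10 s hop and 16 kHz audio; each chunk is processed independently and its output is prefixed with its time range by output_template:

import numpy as np

sr = 16000
window, hop = int(10.0 * sr), int(10.0 * sr)
audio = np.zeros(25 * sr)  # a hypothetical 25 s recording

starts = []
start = 0
while True:
    chunk = audio[start : start + window]
    if chunk.shape[-1] < int(0.5 * sr):  # _MIN_CHUNK_LENGTH_SECONDS
        break
    starts.append(start / sr)
    start += hop
    if start + window > audio.shape[-1]:
        break

print(starts)  # [0.0, 10.0]; the trailing 5 s is not emitted as a separate window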
126
+
127
+
128
+ class Pipeline:
129
+ """Pipeline for running NatureLM-audio inference on a list of audio files or audio arrays"""
130
+
131
+ def __init__(self, model: NatureLM = None, cfg_path: str | Path = _DEFAULT_CONFIG_PATH):
132
+ self.cfg_path = cfg_path
133
+
134
+ # Load model and config
135
+ if model is not None:
136
+ self.cfg = Config.from_sources(cfg_path)
137
+ self.model = model
138
+ else:
139
+ # Download model from hub
140
+ self.model, self.cfg = load_model_and_config(cfg_path)
141
+
142
+ self.processor = NatureLMAudioProcessor(sample_rate=_SAMPLE_RATE, max_length_seconds=_MAX_LENGTH_SECONDS)
143
+
144
+ def __call__(
145
+ self,
146
+ audios: list[str | Path | np.ndarray],
147
+ queries: str | list[str],
148
+ window_length_seconds: float = 10.0,
149
+ hop_length_seconds: float = 10.0,
150
+ input_sample_rate: int = _SAMPLE_RATE,
151
+ verbose: bool = False,
152
+ ) -> list[str]:
153
+ """Run inference on a list of audio file paths or a single audio file with a
154
+ single query or a list of queries. If a list of queries is provided, it must be
155
+ in the same order as the audio files; if a single query is provided, it is used
156
+ for all audio files.
157
+
158
+ Args:
159
+ audios (list[str | Path | np.ndarray]): List of audio file paths or a single audio file path or audio array(s)
160
+ queries (str | list[str]): Queries for the model.
161
+ window_length_seconds (float): Length of the sliding window in seconds. Defaults to 10.0.
162
+ hop_length_seconds (float): Hop length for the sliding window in seconds. Defaults to 10.0.
163
+ input_sample_rate (int): Sample rate of the audio. Defaults to 16000, which is the model's sample rate.
164
+ verbose (bool): If True, print the output of the model for each audio file.
165
+ Defaults to False.
166
+
167
+ Returns:
168
+ list[str]: The outputs of the model, one string per audio input.
169
+
170
+ Raises:
171
+ ValueError: If the number of audio files and queries do not match.
172
+
173
+ Example:
174
+ >>> pipeline = Pipeline()
175
+ >>> audios = ["assets/nri-GreenTreeFrogEvergladesNP.mp3"]
176
+ >>> queries = ["Which species is this? Provide the common name."]
177
+ >>> results = pipeline(audios, queries)
178
+ >>> print(results)
179
+ ['#0.00s - 10.00s#: Green Treefrog\n']
180
+ """
181
+ if isinstance(audios, str) or isinstance(audios, Path):
182
+ audios = [audios]
183
+
184
+ if isinstance(queries, str):
185
+ queries = [queries] * len(audios)
186
+
187
+ if len(audios) != len(queries):
188
+ raise ValueError("Number of audio files and queries must match.")
189
+
190
+ # Run inference
191
+ results = []
192
+ for audio, query in zip(audios, queries):
193
+ output = sliding_window_inference(
194
+ audio,
195
+ query,
196
+ self.processor,
197
+ self.model,
198
+ self.cfg,
199
+ window_length_seconds,
200
+ hop_length_seconds,
201
+ input_sr=input_sample_rate,
202
+ )
203
+ results.append(output)
204
+ if verbose:
205
+ print(f"Processed {audio}, model output:\n=======\n{output}\n=======")
206
+ return results
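For example, a single query can be broadcast over several recordings (a usage sketch; the file paths below are hypothetical):

pipeline = Pipeline()
audios = ["recordings/dawn_chorus.wav", "recordings/frog_pond.mp3"]  # hypothetical paths
results = pipeline(audios, "What are the common names for the species in this audio?")
for path, result in zip(audios, results):
    print(path, result, sep="\n")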
207
+
208
+
209
+ def parse_args() -> argparse.Namespace:
210
+ parser = argparse.ArgumentParser("Run NatureLM-audio inference")
211
+ parser.add_argument(
212
+ "-a", "--audio", type=str, required=True, help="Path to an audio file or a directory containing audio files"
213
+ )
214
+ parser.add_argument("-q", "--query", type=str, required=True, help="Query for the model")
215
+ parser.add_argument(
216
+ "--cfg-path",
217
+ type=str,
218
+ default="configs/inference.yml",
219
+ help="Path to the configuration file for the model",
220
+ )
221
+ parser.add_argument("--output_path", type=str, default="inference_output.jsonl", help="Output path for the results")
222
+ parser.add_argument(
223
+ "--window_length_seconds", type=float, default=10.0, help="Length of the sliding window in seconds"
224
+ )
225
+ parser.add_argument(
226
+ "--hop_length_seconds", type=float, default=10.0, help="Hop length for the sliding window in seconds"
227
+ )
228
+ args = parser.parse_args()
229
+
230
+ return args
231
+
232
+
233
+ def main(
234
+ cfg_path: str | Path,
235
+ audio_path: str | Path,
236
+ query: str,
237
+ output_path: str,
238
+ window_length_seconds: float,
239
+ hop_length_seconds: float,
240
+ ) -> None:
241
+ """Main function to run the NatureLM-audio inference script.
242
+ It takes the audio file (or directory) path, query, output path, window length,
243
+ and hop length, runs sliding-window inference on each audio file, and saves the
244
+ results to a JSON Lines file.
245
+
246
+ Args:
247
+ cfg_path (str | Path): Path to the configuration file.
248
+ audio_path (str | Path): Path to the audio file or directory.
249
+ query (str): Query for the model.
250
+ output_path (str): Path to save the output results.
251
+ window_length_seconds (float): Length of the sliding window in seconds.
252
+ hop_length_seconds (float): Hop length for the sliding window in seconds.
253
+
254
+ Raises:
255
+ ValueError: If the audio file path is invalid or if the query is empty.
256
+ ValueError: If no audio files are found.
257
+ ValueError: If the audio file extension is not supported.
258
+ """
259
+
260
+ # Prepare sample
261
+ audio_path = Path(audio_path)
262
+ if audio_path.is_dir():
263
+ audio_paths = []
264
+ print(f"Searching for audio files in {str(audio_path)} with extensions {', '.join(_AUDIO_FILE_EXTENSIONS)}")
265
+ for ext in _AUDIO_FILE_EXTENSIONS:
266
+ audio_paths.extend(list(audio_path.rglob(f"*{ext}")))
267
+
268
+ print(f"Found {len(audio_paths)} audio files in {str(audio_path)}")
269
+ else:
270
+ # check that the extension is valid
271
+ if not any(audio_path.suffix == ext for ext in _AUDIO_FILE_EXTENSIONS):
272
+ raise ValueError(
273
+ f"Invalid audio file extension. Supported extensions are: {', '.join(_AUDIO_FILE_EXTENSIONS)}"
274
+ )
275
+ audio_paths = [audio_path]
276
+
277
+ # check that query is not empty
278
+ if not query:
279
+ raise ValueError("Query cannot be empty")
280
+ if not audio_paths:
281
+ raise ValueError("No audio files found. Please check the path or file extensions.")
282
+
283
+ # Load model and config
284
+ model, cfg = load_model_and_config(cfg_path)
285
+
286
+ # Load audio processor
287
+ processor = NatureLMAudioProcessor(sample_rate=_SAMPLE_RATE, max_length_seconds=_MAX_LENGTH_SECONDS)
288
+
289
+ # Run inference
290
+ results = {"audio_path": [], "output": []}
291
+ for path in audio_paths:
292
+ output = sliding_window_inference(path, query, processor, model, cfg, window_length_seconds, hop_length_seconds)
293
+ results["audio_path"].append(str(path))
294
+ results["output"].append(output)
295
+ print(f"Processed {path}, model output:\n=======\n{output}\n=======\n")
296
+
297
+ # Save results as JSON Lines (one record per audio file)
298
+ output_path = Path(output_path)
299
+ output_path.parent.mkdir(parents=True, exist_ok=True)
300
+
301
+ df = pd.DataFrame(results)
302
+ df.to_json(output_path, orient="records", lines=True)
303
+ print(f"Results saved to {output_path}")
304
+
305
+
306
+ if __name__ == "__main__":
307
+ args = parse_args()
308
+ main(
309
+ cfg_path=args.cfg_path,
310
+ audio_path=args.audio,
311
+ query=args.query,
312
+ output_path=args.output_path,
313
+ window_length_seconds=args.window_length_seconds,
314
+ hop_length_seconds=args.hop_length_seconds,
315
+ )
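The same entry point can also be driven programmatically; a minimal sketch using one of the bundled assets and the defaults exposed by the argument parser:

main(
    cfg_path="configs/inference.yml",
    audio_path="assets/nri-GreenTreeFrogEvergladesNP.mp3",
    query="Which species is this? Provide the common name.",
    output_path="inference_output.jsonl",
    window_length_seconds=10.0,
    hop_length_seconds=10.0,
)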
NatureLM/logger.py ADDED
@@ -0,0 +1,190 @@
1
+ import datetime
2
+ import logging
3
+ import time
4
+ from collections import defaultdict, deque
5
+
6
+ import torch
7
+ import torch.distributed as dist
8
+ import wandb
9
+
10
+ from NatureLM.dist_utils import is_dist_avail_and_initialized, is_main_process
11
+
12
+
13
+ class SmoothedValue(object):
14
+ """Track a series of values and provide access to smoothed values over a
15
+ window or the global series average.
16
+ """
17
+
18
+ def __init__(self, window_size=20, fmt=None):
19
+ if fmt is None:
20
+ fmt = "{median:.4f} ({global_avg:.4f})"
21
+ self.deque = deque(maxlen=window_size)
22
+ self.total = 0.0
23
+ self.count = 0
24
+ self.fmt = fmt
25
+
26
+ def update(self, value, n=1):
27
+ self.deque.append(value)
28
+ self.count += n
29
+ self.total += value * n
30
+
31
+ def synchronize_between_processes(self):
32
+ """
33
+ Warning: does not synchronize the deque!
34
+ """
35
+ if not is_dist_avail_and_initialized():
36
+ return
37
+ t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
38
+ dist.barrier()
39
+ dist.all_reduce(t)
40
+ t = t.tolist()
41
+ self.count = int(t[0])
42
+ self.total = t[1]
43
+
44
+ @property
45
+ def median(self):
46
+ d = torch.tensor(list(self.deque))
47
+ return d.median().item()
48
+
49
+ @property
50
+ def avg(self):
51
+ d = torch.tensor(list(self.deque), dtype=torch.float32)
52
+ return d.mean().item()
53
+
54
+ @property
55
+ def global_avg(self):
56
+ return self.total / self.count
57
+
58
+ @property
59
+ def max(self):
60
+ return max(self.deque)
61
+
62
+ @property
63
+ def value(self):
64
+ return self.deque[-1]
65
+
66
+ def __str__(self):
67
+ return self.fmt.format(
68
+ median=self.median,
69
+ avg=self.avg,
70
+ global_avg=self.global_avg,
71
+ max=self.max,
72
+ value=self.value,
73
+ )
74
+
75
+
76
+ class MetricLogger(object):
77
+ def __init__(self, delimiter="\t"):
78
+ self.meters = defaultdict(SmoothedValue)
79
+ self.delimiter = delimiter
80
+
81
+ def update(self, **kwargs):
82
+ for k, v in kwargs.items():
83
+ if isinstance(v, torch.Tensor):
84
+ v = v.item()
85
+ assert isinstance(v, (float, int))
86
+ self.meters[k].update(v)
87
+
88
+ def __getattr__(self, attr):
89
+ if attr in self.meters:
90
+ return self.meters[attr]
91
+ if attr in self.__dict__:
92
+ return self.__dict__[attr]
93
+ raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr))
94
+
95
+ def __str__(self):
96
+ loss_str = []
97
+ for name, meter in self.meters.items():
98
+ loss_str.append("{}: {}".format(name, str(meter)))
99
+ return self.delimiter.join(loss_str)
100
+
101
+ def global_avg(self):
102
+ loss_str = []
103
+ for name, meter in self.meters.items():
104
+ loss_str.append("{}: {:.4f}".format(name, meter.global_avg))
105
+ return self.delimiter.join(loss_str)
106
+
107
+ def synchronize_between_processes(self):
108
+ for meter in self.meters.values():
109
+ meter.synchronize_between_processes()
110
+
111
+ def add_meter(self, name, meter):
112
+ self.meters[name] = meter
113
+
114
+ def log_every(self, iterable, print_freq, header=None, logger=None, start_step=None):
115
+ i = 0
116
+ if not header:
117
+ header = ""
118
+ start_time = time.time()
119
+ end = time.time()
120
+ iter_time = SmoothedValue(fmt="{avg:.4f}")
121
+ data_time = SmoothedValue(fmt="{avg:.4f}")
122
+ space_fmt = ":" + str(len(str(len(iterable)))) + "d"
123
+ log_msg = [
124
+ header,
125
+ "[{0" + space_fmt + "}/{1}]",
126
+ "eta: {eta}",
127
+ "{meters}",
128
+ "time: {time}",
129
+ "data: {data}",
130
+ ]
131
+ if torch.cuda.is_available():
132
+ log_msg.append("max mem: {memory:.0f}")
133
+ log_msg = self.delimiter.join(log_msg)
134
+ MB = 1024.0 * 1024.0
135
+ for obj in iterable:
136
+ data_time.update(time.time() - end)
137
+ yield obj
138
+ iter_time.update(time.time() - end)
139
+ if i % print_freq == 0 or i == len(iterable) - 1:
140
+ if is_main_process():
141
+ if logger is not None:
142
+ assert start_step is not None, "start_step is needed to compute global_step!"
143
+ for name, meter in self.meters.items():
144
+ logger.add_scalar("{}".format(name), meter.global_avg, global_step=start_step + i)
145
+ # Log to wandb
146
+ wandb.log({name: meter.global_avg for name, meter in self.meters.items()}, step=start_step + i)
147
+ eta_seconds = iter_time.global_avg * (len(iterable) - i)
148
+ eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
149
+ if torch.cuda.is_available():
150
+ print(
151
+ log_msg.format(
152
+ i,
153
+ len(iterable),
154
+ eta=eta_string,
155
+ meters=str(self),
156
+ time=str(iter_time),
157
+ data=str(data_time),
158
+ memory=torch.cuda.max_memory_allocated() / MB,
159
+ )
160
+ )
161
+ else:
162
+ print(
163
+ log_msg.format(
164
+ i,
165
+ len(iterable),
166
+ eta=eta_string,
167
+ meters=str(self),
168
+ time=str(iter_time),
169
+ data=str(data_time),
170
+ )
171
+ )
172
+ i += 1
173
+ end = time.time()
174
+ total_time = time.time() - start_time
175
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
176
+ print("{} Total time: {} ({:.4f} s / it)".format(header, total_time_str, total_time / len(iterable)))
177
+
178
+
179
+ class AttrDict(dict):
180
+ def __init__(self, *args, **kwargs):
181
+ super(AttrDict, self).__init__(*args, **kwargs)
182
+ self.__dict__ = self
183
+
184
+
185
+ def setup_logger():
186
+ logging.basicConfig(
187
+ level=logging.INFO if is_main_process() else logging.WARN,
188
+ format="%(asctime)s [%(levelname)s] %(message)s",
189
+ handlers=[logging.StreamHandler()],
190
+ )
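A small, self-contained sketch of how these utilities are typically wired into a training loop (the loop body below is a stand-in, not code from this commit):

import torch

from NatureLM.logger import MetricLogger, SmoothedValue

metric_logger = MetricLogger(delimiter="  ")
metric_logger.add_meter("lr", SmoothedValue(window_size=1, fmt="{value:.6f}"))

batches = range(200)  # stand-in for a dataloader
for step in metric_logger.log_every(batches, print_freq=50, header="Train:"):
    loss = torch.rand(1)  # stand-in for a real training loss
    metric_logger.update(loss=loss, lr=1e-4)

print("Averaged stats:", metric_logger.global_avg())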
NatureLM/models/NatureLM.py ADDED
@@ -0,0 +1,666 @@
1
+ # Copyright (2024) Earth Species Project
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import logging
16
+ import os
17
+ from pathlib import Path
18
+ from typing import Literal, Union
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+ from huggingface_hub import PyTorchModelHubMixin
24
+ from peft import LoraConfig, TaskType, get_peft_model
25
+ from torch.nn import CrossEntropyLoss
26
+ from torch.nn.utils.rnn import pad_sequence
27
+ from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList
28
+
29
+ from NatureLM.checkpoint_utils import save_model_checkpoint
30
+ from NatureLM.config import BeatsConfig, ModelConfig, save_config_as_yaml
31
+ from NatureLM.utils import universal_torch_load
32
+
33
+ from .beats.BEATs import BEATs, BEATsConfig
34
+ from .Qformer import BertConfig, BertLMHeadModel
35
+ from .utils import StoppingCriteriaSub
36
+
37
+ torch.backends.cuda.matmul.allow_tf32 = True
38
+
39
+ auth_token = os.getenv('llama')
40
+
41
+ class NatureLM(nn.Module, PyTorchModelHubMixin):
42
+ def __init__(
43
+ self,
44
+ *,
45
+ llama_path: Path,
46
+ beats_path: Path | os.PathLike | None = None,
47
+ beats_cfg: BeatsConfig,
48
+ freeze_beats: bool = True,
49
+ use_audio_Qformer: bool = True,
50
+ max_pooling: bool = False,
51
+ num_audio_query_token: int = 1,
52
+ freeze_audio_QFormer: bool = False,
53
+ window_level_Qformer: bool = True,
54
+ second_per_window: float = 0.333333,
55
+ second_stride: float = 0.333333,
56
+ downsample_factor: int = 4,
57
+ audio_llama_proj_model: Path | os.PathLike | None = None,
58
+ freeze_audio_llama_proj: bool = False,
59
+ lora: bool = True,
60
+ lora_rank: int = 8,
61
+ lora_alpha: int = 32,
62
+ lora_dropout: float = 0.1,
63
+ flash_attn: Literal["eager", "flash_attention_2"] = "eager",
64
+ prompt_template: str = "",
65
+ max_txt_len: int = 128,
66
+ end_sym: str = "</s>",
67
+ device: str = "cuda",
68
+ ):
69
+ super().__init__()
70
+
71
+ self.beats_path = beats_path
72
+ self.beats_cfg = beats_cfg
73
+ self.use_audio_Qformer = use_audio_Qformer
74
+ self.max_pooling = max_pooling
75
+ self.window_level_Qformer = window_level_Qformer
76
+ self.second_per_window = second_per_window
77
+ self.second_stride = second_stride
78
+ self.downsample_factor = downsample_factor
79
+ self.lora = lora
80
+ self.max_txt_len = max_txt_len
81
+ self.end_sym = end_sym
82
+ self.prompt_template = prompt_template
83
+ self.flash_attn = flash_attn
84
+
85
+ logging.info(f"Llama path: {llama_path}")
86
+ logging.info("Loading Llama Tokenizer")
87
+ self.llama_tokenizer = AutoTokenizer.from_pretrained(llama_path, use_fast=False, use_auth_token=auth_token)
88
+ self.llama_tokenizer.add_special_tokens({"pad_token": "[PAD]"})
89
+ self.llama_tokenizer.padding_side = "right"
90
+
91
+ logging.info("Loading Llama Model")
92
+ if device == "cpu":
93
+ self.llama_model = AutoModelForCausalLM.from_pretrained(
94
+ llama_path,
95
+ torch_dtype=torch.float32,
96
+ attn_implementation="eager",
97
+ device_map="cpu",
98
+ use_auth_token=auth_token
99
+ )
100
+ # An issue with tiny-llama is that pad_token_id was set to -1, but
101
+ # model.save_pretrained checks generation configs and does not allow -1 as
102
+ # pad_token_id
103
+ self.llama_model.generation_config.pad_token_id = self.llama_tokenizer.pad_token_id
104
+ else:
105
+ self.llama_model = AutoModelForCausalLM.from_pretrained(
106
+ llama_path,
107
+ torch_dtype=torch.bfloat16,
108
+ attn_implementation=flash_attn,
109
+ use_auth_token=auth_token
110
+ )
111
+
112
+ self.llama_model.resize_token_embeddings(len(self.llama_tokenizer))
113
+ if self.lora:
114
+ for param in self.llama_model.parameters():
115
+ param.requires_grad = False
116
+ logging.info("Loading LLaMA Done")
117
+ self.llama_embed_tokens = self.llama_model.model.embed_tokens
118
+
119
+ if self.lora:
120
+ logging.info("Setting up LoRA for llama model")
121
+ self.peft_config = LoraConfig(
122
+ task_type=TaskType.CAUSAL_LM,
123
+ inference_mode=False,
124
+ r=lora_rank,
125
+ lora_alpha=lora_alpha,
126
+ lora_dropout=lora_dropout,
127
+ target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
128
+ )
129
+ self.llama_model = get_peft_model(self.llama_model, self.peft_config)
130
+ self.llama_embed_tokens = self.llama_model.model.model.embed_tokens
131
+ self.llama_model.print_trainable_parameters()
132
+ logging.info("LoRA Training")
133
+
134
+ logging.info("Loading BEATs Model")
135
+ self.beats = BEATs(cfg=BEATsConfig(dict(self.beats_cfg)))
136
+
137
+ if self.beats_path:
138
+ beats_ckpt = universal_torch_load(self.beats_path, cache_mode="none", map_location="cpu")
139
+ self.beats.load_state_dict(beats_ckpt["model"])
140
+
141
+ self.ln_audio = nn.LayerNorm(self.beats.cfg.encoder_embed_dim)
142
+ if freeze_beats:
143
+ for param in self.beats.parameters():
144
+ param.requires_grad = False
145
+ self.beats.eval()
146
+ logging.info("freeze BEATs")
147
+
148
+ if self.use_audio_Qformer:
149
+ self.audio_Qformer, self.audio_query_tokens = self.init_audio_Qformer(
150
+ num_query_token=num_audio_query_token,
151
+ audio_width=self.beats.cfg.encoder_embed_dim,
152
+ )
153
+
154
+ self.audio_Qformer.bert.embeddings.word_embeddings = None
155
+ self.audio_Qformer.bert.embeddings.position_embeddings = None
156
+ for layer in self.audio_Qformer.bert.encoder.layer:
157
+ layer.output = None
158
+ layer.intermediate = None
159
+ self.audio_Qformer.cls = None
160
+ if freeze_audio_QFormer:
161
+ for param in self.audio_Qformer.parameters():
162
+ param.requires_grad = False
163
+ self.audio_Qformer.eval()
164
+ self.audio_query_tokens.requires_grad = False
165
+ logging.info("freeze audio QFormer")
166
+
167
+ logging.info("Loading audio LLAMA proj")
168
+ self.audio_llama_proj = nn.Linear(
169
+ self.audio_Qformer.config.hidden_size,
170
+ self.llama_model.config.hidden_size,
171
+ )
172
+ if audio_llama_proj_model:
173
+ logging.info(f"Loading audio LLAMA proj from {audio_llama_proj_model}")
174
+ # audio_llama_proj_weight = torch.load(audio_llama_proj_model, map_location="cpu")
175
+ audio_llama_proj_weight = universal_torch_load(
176
+ audio_llama_proj_model, cache_mode="use", map_location="cpu"
177
+ )
178
+ self.load_state_dict(audio_llama_proj_weight["model"], strict=False)
179
+
180
+ if freeze_audio_llama_proj:
181
+ for param in self.audio_llama_proj.parameters():
182
+ param.requires_grad = False
183
+ self.audio_llama_proj.eval()
184
+ logging.info("freeze audio LLAMA proj")
185
+
186
+ elif self.max_pooling:
187
+ # Only BEATs is initialised as the audio encoder in this class, so project
+ # its embedding dimension directly to the LLM hidden size.
+ hidden_size = self.beats.cfg.encoder_embed_dim
196
+ self.audio_llama_proj = nn.Linear(
197
+ hidden_size, self.llama_model.config.hidden_size
198
+ ) # Single embedding, just project to LLM.
199
+
200
+ elif getattr(self, "htsat", False):  # HTSAT is not initialised in this class; guard so the else branch stays reachable
201
+ self.audio_llama_proj = nn.Linear(
202
+ 512, self.llama_model.config.hidden_size
203
+ ) # Single embedding, just project to LLM.
204
+
205
+ else:
206
+ # feel free to add other aligners here
207
+ raise NotImplementedError("Have to use audio qformer")
208
+
209
+ self.config: ModelConfig = None # set this in from_config
210
+
211
+ @classmethod
212
+ def from_config(cls, config: ModelConfig):
213
+ model = cls(
214
+ llama_path=config.llama_path,
215
+ beats_path=config.beats_path,
216
+ freeze_beats=config.freeze_beats,
217
+ use_audio_Qformer=config.use_audio_Qformer,
218
+ max_pooling=config.max_pooling,
219
+ num_audio_query_token=config.num_audio_query_token,
220
+ freeze_audio_QFormer=config.freeze_audio_QFormer,
221
+ window_level_Qformer=config.window_level_Qformer,
222
+ second_per_window=config.second_per_window,
223
+ second_stride=config.second_stride,
224
+ downsample_factor=config.downsample_factor,
225
+ audio_llama_proj_model=config.audio_llama_proj_model,
226
+ freeze_audio_llama_proj=config.freeze_audio_llama_proj,
227
+ lora=config.lora,
228
+ lora_rank=config.lora_rank,
229
+ lora_alpha=config.lora_alpha,
230
+ lora_dropout=config.lora_dropout,
231
+ prompt_template=config.prompt_template,
232
+ max_txt_len=config.max_txt_len,
233
+ end_sym=config.end_sym,
234
+ flash_attn=config.flash_attn,
235
+ device=config.device,
236
+ )
237
+ model.config = config
238
+ ckpt_path = config.ckpt
239
+ if ckpt_path:
240
+ logging.info(f"⏳ Load NatureLM ckpt from: {ckpt_path}")
241
+ ckpt = universal_torch_load(ckpt_path, cache_mode="use", map_location="cpu")
242
+ model.load_state_dict(ckpt["model"], strict=False)
243
+ logging.info("✅ Finished loading from ckpt")
244
+
245
+ return model
246
+
247
+ def _save_to_local(
248
+ self,
249
+ output_dir: Union[str, os.PathLike],
250
+ use_distributed: bool = False,
251
+ drop_untrained_params: bool = False,
252
+ ) -> None:
253
+ output_dir = Path(output_dir)
254
+ output_dir.mkdir(parents=True, exist_ok=True)
255
+
256
+ # Save the config
257
+ config_path = output_dir / "model_config.yaml"
258
+ save_config_as_yaml(self.config, config_path)
259
+
260
+ # Save the model
261
+ model_path = output_dir / "model.pt"
262
+ save_model_checkpoint(
263
+ self,
264
+ model_path,
265
+ drop_untrained_params=drop_untrained_params,
266
+ use_distributed=use_distributed,
267
+ )
268
+
269
+ # Save the tokenizer and llama model
270
+ tokenizer_path = output_dir / "llama"
271
+ self.llama_tokenizer.save_pretrained(tokenizer_path)
272
+ self.llama_model.save_pretrained(tokenizer_path)
273
+
274
+ # Save the audio model
275
+ if self.beats_path:
276
+ beats_path = output_dir / "beats.pt"
277
+ save_model_checkpoint(
278
+ self.beats,
279
+ beats_path,
280
+ drop_untrained_params=drop_untrained_params,
281
+ cfg=self.beats_cfg,
282
+ )
283
+
284
+ # Save the audio projection
285
+ audio_llama_proj_path = output_dir / "audio_llama_proj.pt"
286
+ save_model_checkpoint(
287
+ self.audio_llama_proj,
288
+ audio_llama_proj_path,
289
+ drop_untrained_params=drop_untrained_params,
290
+ )
291
+
292
+ @staticmethod
293
+ def init_audio_Qformer(num_query_token, audio_width, num_hidden_layers=2):
294
+ encoder_config = BertConfig.from_pretrained("bert-base-uncased")
295
+ encoder_config.num_hidden_layers = num_hidden_layers
296
+ encoder_config.encoder_width = audio_width
297
+ # insert cross-attention layer every other block
298
+ encoder_config.add_cross_attention = True
299
+ encoder_config.cross_attention_freq = 1
300
+ encoder_config.query_length = num_query_token
301
+ Qformer = BertLMHeadModel(config=encoder_config)
302
+ query_tokens = nn.Parameter(torch.zeros(1, num_query_token, encoder_config.hidden_size))
303
+ query_tokens.data.normal_(mean=0.0, std=encoder_config.initializer_range)
304
+ return Qformer, query_tokens
305
+
306
+ @property
307
+ def device(self):
308
+ return next(self.parameters()).device
309
+
310
+ def _encode_auditory_feature(self, audio_embeds, audio_pad_mask):
311
+ if self.max_pooling:
312
+ # Max Pooling logic to reduce sequence length
313
+
314
+ # Apply 1D Max Pooling along the time dimension
315
+ audio_embeds = F.max_pool1d(
316
+ audio_embeds.transpose(1, 2),
317
+ kernel_size=self.downsample_factor,
318
+ stride=self.downsample_factor,
319
+ ).transpose(1, 2)
320
+ audio_embeds = self.audio_llama_proj(audio_embeds)
321
+
322
+ # print("audio pad mask is", audio_pad_mask)
323
+ audio_atts = ~audio_pad_mask
324
+ # Adjust the padding mask using max pooling
325
+ audio_atts = F.max_pool1d(
326
+ audio_atts.unsqueeze(1).float(),
327
+ kernel_size=self.downsample_factor,
328
+ stride=self.downsample_factor,
329
+ ).squeeze(1)
330
+ audio_atts = audio_atts > 0
331
+ # print(f"audio pad mask shape after pooling: {audio_atts.shape}")
332
+ # print("audio pad mask post", audio_atts)
333
+
334
+ elif self.use_audio_Qformer:
335
+ # Q-Former logic
336
+ audio_embeds = self.ln_audio(audio_embeds)
337
+
338
+ # Generate attention mask
339
+ audio_atts = torch.ones(audio_embeds.size()[:-1], dtype=torch.long).to(audio_embeds.device)
340
+
341
+ if self.window_level_Qformer:
342
+ B, T, C = audio_embeds.shape # batch, T, Channels
343
+ kernel = round(1500 * self.second_per_window / 30.0) # 160 ms patches; calculate kernel size
344
+ stride = round(1500 * self.second_stride / 30.0) # Calculate stride size
345
+ kernel = (1, kernel)
346
+ stride = (1, stride)
347
+
348
+ # Transpose and unfold audio embeddings to create overlapping windows
349
+ audio_embeds_tr = audio_embeds.transpose(1, 2).unsqueeze(2)
350
+ audio_embeds_overlap = F.unfold(
351
+ audio_embeds_tr,
352
+ kernel_size=kernel,
353
+ dilation=1,
354
+ padding=0,
355
+ stride=stride,
356
+ )
357
+ _, _, L = audio_embeds_overlap.shape
358
+ audio_embeds_overlap = audio_embeds_overlap.view(B, -1, kernel[1], L)
359
+ audio_embeds_overlap = torch.permute(
360
+ audio_embeds_overlap, [0, 3, 2, 1]
361
+ ) # (B, num_windows, kernel_size, C)
362
+ audio_embeds = audio_embeds_overlap.reshape(-1, kernel[1], C)
363
+ audio_atts = torch.ones(audio_embeds.size()[:-1], dtype=torch.long).to(audio_embeds.device)
364
+
365
+ # Q-Former mechanism
366
+ query_tokens = self.audio_query_tokens.expand(audio_embeds.shape[0], -1, -1)
367
+ query_output = self.audio_Qformer.bert(
368
+ query_embeds=query_tokens,
369
+ encoder_hidden_states=audio_embeds,
370
+ encoder_attention_mask=audio_atts,
371
+ return_dict=True,
372
+ )
373
+
374
+ audio_embeds = self.audio_llama_proj(query_output.last_hidden_state)
375
+
376
+ if self.window_level_Qformer:
377
+ audio_embeds = audio_embeds.view(B, -1, audio_embeds.size(2)).contiguous()
378
+
379
+ audio_atts = torch.ones(audio_embeds.size()[:-1], dtype=torch.long).to(audio_embeds.device)
380
+
381
+ elif getattr(self, "htsat", False):  # HTSAT is not initialised in this class; guard so the else branch stays reachable
382
+ # HTSAT processing
383
+ audio_embeds = self.ln_audio(audio_embeds)
384
+ audio_embeds = self.audio_llama_proj(audio_embeds).reshape(-1, 30, self.llama_model.config.hidden_size)
385
+ audio_atts = torch.ones(audio_embeds.size()[:-1], dtype=torch.long).to(audio_embeds.device)
386
+
387
+ else:
388
+ raise NotImplementedError("no audio qformer or max pooling")
389
+
390
+ return audio_embeds, audio_atts
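For orientation, a sketch of the window-level Q-Former geometry above, using the 1500-frames-per-30-s figure from the code itself and the constructor defaults (second_per_window = second_stride = 0.333333, num_audio_query_token = 1):

import math

frames_per_second = 1500 / 30.0         # ~50 BEATs frames per second of audio
kernel = round(1500 * 0.333333 / 30.0)  # 17 frames (~0.33 s) per window
stride = kernel                         # non-overlapping windows
T = int(10 * frames_per_second)         # ~500 frames for a 10 s clip
num_windows = math.floor((T - kernel) / stride) + 1
print(kernel, num_windows)              # 17 29 -> a 10 s clip becomes ~29 audio tokens (one query token per window)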
391
+
392
+ def encode_audio(self, raw_wav, audio_padding_mask=None):
393
+ with torch.autocast(self.device.type, dtype=torch.bfloat16):
394
+ audio_embeds, audio_pad_mask = self.beats(raw_wav, padding_mask=audio_padding_mask)
395
+ return self._encode_auditory_feature(audio_embeds=audio_embeds, audio_pad_mask=audio_pad_mask)
396
+
397
+ def prompt_wrap(self, audio_embeds, audio_atts, prompt: list[str]):
398
+ """Merge audio embeddings with embeddings of the tokens in the prompt.
399
+
400
+ Args:
401
+ audio_embeds (list): List of tensors of audio embeddings.
402
+ audio_atts (list): List of tensors of audio padding masks.
403
+ prompt (list): List of strings with the prompt for each sample. Each prompt
404
+ should contain the placeholder(s) "<AudioHere>" to indicate where the
405
+ audio embeddings should be inserted.
406
+
407
+ Returns:
408
+ tuple: A tuple containing the wrapped audio embeddings and padding masks.
409
+ """
410
+
411
+ def interleave_lists(longer: list, shorter: list) -> list:
412
+ """Interleave two lists where the first list is one element longer.
413
+
414
+ Args:
415
+ longer (list): The first list with length n.
416
+ shorter (list): The second list with length n-1.
417
+
418
+ Returns:
419
+ list: A new list with elements interleaved from longer and shorter.
420
+
421
+ Example:
422
+ >>> interleave_lists(['a1', 'a2', 'a3'], ['b1', 'b2'])
423
+ ['a1', 'b1', 'a2', 'b2', 'a3']
424
+ """
425
+ interleaved_list = []
426
+ for i in range(len(shorter)):
427
+ interleaved_list.append(longer[i])
428
+ interleaved_list.append(shorter[i])
429
+ interleaved_list.append(longer[-1]) # last element is from longer
430
+ return interleaved_list
431
+
432
+ device = audio_embeds[0].device
433
+
434
+ wrapped_embeds_list = []
435
+ wrapped_atts_list = []
436
+ batch_size = len(prompt)
437
+ for i in range(batch_size):
438
+ prompt_parts = prompt[i].split("<AudioHere>")
439
+ wrapped_embeds = []
440
+ wrapped_atts = []
441
+
442
+ for part in prompt_parts:
443
+ tokens = self.llama_tokenizer(part, return_tensors="pt", add_special_tokens=False).to(device)
444
+ part_embeds = self.llama_embed_tokens(tokens.input_ids).squeeze(0)
445
+ part_atts = tokens.attention_mask.squeeze(0)
446
+ wrapped_embeds.append(part_embeds)
447
+ wrapped_atts.append(part_atts)
448
+
449
+ # Process each element in the batch to remove padding
450
+ if self.max_pooling:
451
+ audio_embeds[i] = list(audio_embeds[i].unbind(0))
452
+ audio_atts[i] = list(audio_atts[i].unbind(0))
453
+ for j in range(len(audio_embeds[i])):
454
+ audio_embeds[i][j] = audio_embeds[i][j][audio_atts[i][j]]
455
+ audio_atts[i][j] = audio_atts[i][j][audio_atts[i][j]]
456
+
457
+ # Interleave wrapped_embeds and audio_embeds using interleave_lists
458
+ wrapped_embeds = interleave_lists(wrapped_embeds, audio_embeds[i])
459
+ wrapped_atts = interleave_lists(wrapped_atts, audio_atts[i])
460
+
461
+ wrapped_embeds = torch.cat(wrapped_embeds, dim=0)
462
+ wrapped_atts = torch.cat(wrapped_atts, dim=0)
463
+ wrapped_embeds_list.append(wrapped_embeds)
464
+ wrapped_atts_list.append(wrapped_atts)
465
+
466
+ wrapped_embeds = pad_sequence(wrapped_embeds_list, batch_first=True)
467
+ wrapped_atts = pad_sequence(wrapped_atts_list, batch_first=True)
468
+ return wrapped_embeds, wrapped_atts
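To make the interleaving concrete, a small sketch of how a prompt is split around the placeholder before the audio embeddings are slotted back in (the prompt text here is illustrative):

prompt = "Audio: <AudioHere>\nWhich species is this? Provide the common name."
parts = prompt.split("<AudioHere>")
print(parts)  # ['Audio: ', '\nWhich species is this? Provide the common name.']
# prompt_wrap embeds each text part with the LLM token embeddings, places the
# sample's audio embeddings between consecutive parts, then pads across the batch.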
469
+
470
+ def forward(self, samples, verbose=True):
471
+ # Prepare prompts
472
+ prompt = samples["prompt"]
473
+ prompt = [self.prompt_template.format(p) for p in prompt]
474
+
475
+ # Encode the raw audio with the audio encoder
476
+ raw_wav = samples.get("raw_wav", None)
477
+ audio_padding_mask = samples.get("padding_mask", None)
478
+
479
+ audio_embeds, audio_atts = self.encode_audio(raw_wav, audio_padding_mask)
480
+ audio_chunk_sizes = samples["audio_chunk_sizes"]
481
+ split_audio_embeds = list(torch.split(audio_embeds, audio_chunk_sizes, dim=0))
482
+ split_audio_atts = list(torch.split(audio_atts, audio_chunk_sizes, dim=0))
483
+
484
+ # Wrap audio_embeds with prompts
485
+ audio_embeds, audio_atts = self.prompt_wrap(split_audio_embeds, split_audio_atts, prompt)
486
+
487
+ # Prepare inputs for LLM
488
+ text = [t + self.end_sym for t in samples["text"]]
489
+ to_regress_tokens = self.llama_tokenizer(
490
+ text,
491
+ return_tensors="pt",
492
+ padding="longest",
493
+ truncation=True,
494
+ max_length=self.max_txt_len,
495
+ add_special_tokens=False,
496
+ ).to(audio_embeds.device)
497
+
498
+ to_regress_embeds = self.llama_embed_tokens(to_regress_tokens.input_ids)
499
+
500
+ # Prepare targets
501
+ targets = to_regress_tokens.input_ids.masked_fill(
502
+ to_regress_tokens.input_ids == self.llama_tokenizer.pad_token_id, -100
503
+ )
504
+
505
+ batch_size = audio_embeds.size(0)
506
+
507
+ # BOS token embeddings
508
+ bos_token_id = self.llama_tokenizer.bos_token_id
509
+ bos = torch.full((batch_size, 1), bos_token_id, dtype=torch.long, device=audio_embeds.device)
510
+ bos_embeds = self.llama_embed_tokens(bos)
511
+
512
+ # Prepare lists to collect per-sample embeddings, attention masks, and targets
513
+ inputs_embeds_list = []
514
+ attention_mask_list = []
515
+ targets_list = []
516
+
517
+ for i in range(batch_size):
518
+ # Extract non-padded audio embeddings and attention mask
519
+ audio_embed = audio_embeds[i][audio_atts[i].bool()]
520
+ audio_att = audio_atts[i][audio_atts[i].bool()]
521
+
522
+ # Extract non-padded text embeddings and attention mask
523
+ text_embed = to_regress_embeds[i][to_regress_tokens.attention_mask[i].bool()]
524
+ text_att = to_regress_tokens.attention_mask[i][to_regress_tokens.attention_mask[i].bool()]
525
+
526
+ # Extract corresponding targets for the text tokens
527
+ target = targets[i][to_regress_tokens.attention_mask[i].bool()]
528
+
529
+ # Concatenate embeddings: BOS token, audio embeddings, text embeddings
530
+ input_embeds = torch.cat([bos_embeds[i], audio_embed, text_embed], dim=0)
531
+
532
+ # Concatenate attention masks: BOS token mask, audio attention mask, text attention mask
533
+ att_mask = torch.cat(
534
+ [
535
+ torch.ones(1, device=audio_embeds.device, dtype=audio_att.dtype),
536
+ audio_att,
537
+ text_att,
538
+ ],
539
+ dim=0,
540
+ )
541
+
542
+ # Create targets: Ignore index (-100) for BOS and audio tokens, actual targets for text tokens
543
+ ignore_targets = torch.full(
544
+ (1 + audio_embed.size(0),),
545
+ -100,
546
+ device=audio_embeds.device,
547
+ dtype=targets.dtype,
548
+ )
549
+ sample_targets = torch.cat([ignore_targets, target], dim=0)
550
+
551
+ # Append to lists
552
+ inputs_embeds_list.append(input_embeds)
553
+ attention_mask_list.append(att_mask)
554
+ targets_list.append(sample_targets)
555
+
556
+ # Pad sequences to the maximum length in the batch
557
+ inputs_embeds_padded = pad_sequence(inputs_embeds_list, batch_first=True)
558
+ attention_mask_padded = pad_sequence(attention_mask_list, batch_first=True, padding_value=0)
559
+ targets_padded = pad_sequence(targets_list, batch_first=True, padding_value=-100)
560
+
561
+ # Now use the padded embeddings, attention masks, and targets in the model
562
+ with torch.autocast(self.device.type, dtype=torch.bfloat16):
563
+ outputs = self.llama_model(
564
+ inputs_embeds=inputs_embeds_padded,
565
+ attention_mask=attention_mask_padded,
566
+ return_dict=True,
567
+ labels=targets_padded,
568
+ )
569
+ loss = outputs.loss # Original batch loss
570
+
571
+ # Compute per-example loss
572
+ nvocab = self.llama_model.config.vocab_size
573
+ logits = outputs.logits
574
+
575
+ shift_logits = logits[..., :-1, :].contiguous()
576
+ shift_labels = targets_padded[..., 1:].contiguous()
577
+
578
+ # Compute loss per token
579
+ loss_fct_per_example = CrossEntropyLoss(reduction="none")
580
+ loss_per_token = loss_fct_per_example(
581
+ shift_logits.view(-1, nvocab), # Flatten to [batch_size * (seq_len-1), vocab_size]
582
+ shift_labels.view(-1), # Flatten to [batch_size * (seq_len-1)]
583
+ )
584
+ loss_per_token = loss_per_token.view(shift_labels.size()) # Reshape back to [batch_size, seq_len-1]
585
+
586
+ # Create mask
587
+ mask = shift_labels != -100 # [batch_size, seq_len-1]
588
+
589
+ # Apply mask to loss_per_token
590
+ loss_per_token = loss_per_token * mask.float()
591
+
592
+ # Compute per-example loss
593
+ loss_per_example = loss_per_token.sum(dim=1) / mask.sum(dim=1).clamp(min=1)
594
+
595
+ if verbose:
596
+ # Calculate predictions
597
+ predicted_tokens = shift_logits.argmax(dim=-1) # [batch_size, seq_len-1]
598
+
599
+ # Compute per-example correct counts
600
+ correct_per_sample = ((predicted_tokens == shift_labels) & mask).sum(dim=1).float() # [batch_size]
601
+ total_tokens_per_sample = mask.sum(dim=1).float() # [batch_size]
602
+
603
+ # Total correct and total tokens across the batch
604
+ correct = correct_per_sample.sum()
605
+ total = total_tokens_per_sample.sum()
606
+
607
+ return {
608
+ "loss": loss,
609
+ "correct": correct,
610
+ "total": total,
611
+ "per_example_loss": loss_per_example,
612
+ "correct_per_sample": correct_per_sample,
613
+ "total_per_sample": total_tokens_per_sample,
614
+ }
615
+
616
+ return {"loss": loss, "per_example_loss": loss_per_example}
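A self-contained sketch of the masked per-example loss computed above: positions labelled -100 (BOS, audio, and padding) contribute nothing to a sample's average (toy shapes, random logits):

import torch
from torch.nn import CrossEntropyLoss

logits = torch.randn(2, 5, 11)  # [batch, seq_len, vocab]
labels = torch.tensor([[-100, -100, 3, 4, 5],
                       [-100, 7, 8, -100, -100]])
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss_per_token = CrossEntropyLoss(reduction="none")(
    shift_logits.view(-1, 11), shift_labels.view(-1)
).view(shift_labels.size())
mask = shift_labels != -100
per_example_loss = (loss_per_token * mask.float()).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
print(per_example_loss.shape)  # torch.Size([2])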
617
+
618
+ @torch.inference_mode()
619
+ def generate(self, samples, generate_cfg, prompts):
620
+ batch_size = len(prompts)
621
+
622
+ raw_wav = samples["raw_wav"]
623
+ audio_padding_mask = samples.get("padding_mask", None)
624
+
625
+ audio_embeds, audio_atts = self.encode_audio(raw_wav, audio_padding_mask=audio_padding_mask)
626
+ split_audio_embeds = list(torch.split(audio_embeds, samples["audio_chunk_sizes"], dim=0))
627
+ split_audio_atts = list(torch.split(audio_atts, samples["audio_chunk_sizes"], dim=0))
628
+ audio_embeds, audio_atts = self.prompt_wrap(split_audio_embeds, split_audio_atts, prompts)
629
+ bos = (
630
+ torch.ones(
631
+ [batch_size, 1],
632
+ dtype=torch.int32,
633
+ device=audio_embeds.device,
634
+ )
635
+ * self.llama_tokenizer.bos_token_id
636
+ )
637
+ bos_embeds = self.llama_embed_tokens(bos)
638
+ atts_bos = audio_atts[:, :1]
639
+
640
+ embeds = torch.cat([bos_embeds, audio_embeds], dim=1)
641
+
642
+ attns = torch.cat([atts_bos, audio_atts], dim=1)
643
+
644
+ stop_words_ids = [torch.tensor([2]).to(audio_embeds.device)]
645
+ stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
646
+
647
+ with torch.autocast(self.device.type, dtype=torch.bfloat16):
648
+ outputs = self.llama_model.generate( # TODO: Wrap the llama_model with outlines https://outlines-dev.github.io/outlines/reference/models/transformers/
649
+ inputs_embeds=embeds.bfloat16(),
650
+ max_new_tokens=generate_cfg.max_new_tokens,
651
+ stopping_criteria=stopping_criteria,
652
+ num_beams=generate_cfg.num_beams,
653
+ do_sample=generate_cfg.do_sample,
654
+ min_length=generate_cfg.min_length,
655
+ temperature=generate_cfg.temperature,
656
+ # top_p=generate_cfg.get("top_p", 0.9),
657
+ repetition_penalty=generate_cfg.repetition_penalty,
658
+ length_penalty=generate_cfg.length_penalty,
659
+ attention_mask=attns.bfloat16(),
660
+ # prefix_allowed_tokens_fn=prefix_tokens_fn
661
+ # logits_processor=None
662
+ # constraints=[constraint] if constraint is not None else None
663
+ )
664
+ text = self.llama_tokenizer.batch_decode(outputs, skip_special_tokens=True)
665
+
666
+ return text
NatureLM/models/Qformer.py ADDED
@@ -0,0 +1,1091 @@
1
+ """
2
+ Adapted from salesforce@LAVIS. Below is the original copyright:
3
+ * Copyright (c) 2023, salesforce.com, inc.
4
+ * All rights reserved.
5
+ * SPDX-License-Identifier: BSD-3-Clause
6
+ * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
7
+ * By Junnan Li
8
+ * Based on huggingface code base
9
+ * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert
10
+ """
11
+
12
+ import math
13
+ from typing import Tuple
14
+
15
+ import torch
16
+ import torch.utils.checkpoint
17
+ from torch import Tensor, device, nn
18
+ from torch.nn import CrossEntropyLoss
19
+ from transformers.activations import ACT2FN
20
+ from transformers.modeling_outputs import (
21
+ BaseModelOutputWithPastAndCrossAttentions,
22
+ BaseModelOutputWithPoolingAndCrossAttentions,
23
+ CausalLMOutputWithCrossAttentions,
24
+ MaskedLMOutput,
25
+ )
26
+ from transformers.modeling_utils import (
27
+ PreTrainedModel,
28
+ apply_chunking_to_forward,
29
+ find_pruneable_heads_and_indices,
30
+ prune_linear_layer,
31
+ )
32
+ from transformers.models.bert.configuration_bert import BertConfig
33
+ from transformers.utils import logging
34
+
35
+ logger = logging.get_logger(__name__)
36
+
37
+
38
+ class BertEmbeddings(nn.Module):
39
+ """Construct the embeddings from word and position embeddings."""
40
+
41
+ def __init__(self, config):
42
+ super().__init__()
43
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
44
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
45
+
46
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
47
+ # any TensorFlow checkpoint file
48
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
49
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
50
+
51
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
52
+ self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
53
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
54
+
55
+ self.config = config
56
+
57
+ def forward(
58
+ self,
59
+ input_ids=None,
60
+ position_ids=None,
61
+ query_embeds=None,
62
+ past_key_values_length=0,
63
+ ):
64
+ if input_ids is not None:
65
+ seq_length = input_ids.size()[1]
66
+ else:
67
+ seq_length = 0
68
+
69
+ if position_ids is None:
70
+ position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()
71
+
72
+ if input_ids is not None:
73
+ embeddings = self.word_embeddings(input_ids)
74
+ if self.position_embedding_type == "absolute":
75
+ position_embeddings = self.position_embeddings(position_ids)
76
+ embeddings = embeddings + position_embeddings
77
+
78
+ if query_embeds is not None:
79
+ embeddings = torch.cat((query_embeds, embeddings), dim=1)
80
+ else:
81
+ embeddings = query_embeds
82
+
83
+ embeddings = self.LayerNorm(embeddings)
84
+ embeddings = self.dropout(embeddings)
85
+ return embeddings
86
+
87
+
88
+ class BertSelfAttention(nn.Module):
89
+ def __init__(self, config, is_cross_attention):
90
+ super().__init__()
91
+ self.config = config
92
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
93
+ raise ValueError(
94
+ "The hidden size (%d) is not a multiple of the number of attention "
95
+ "heads (%d)" % (config.hidden_size, config.num_attention_heads)
96
+ )
97
+
98
+ self.num_attention_heads = config.num_attention_heads
99
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
100
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
101
+
102
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
103
+ if is_cross_attention:
104
+ self.key = nn.Linear(config.encoder_width, self.all_head_size)
105
+ self.value = nn.Linear(config.encoder_width, self.all_head_size)
106
+ else:
107
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
108
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
109
+
110
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
111
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
112
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
113
+ self.max_position_embeddings = config.max_position_embeddings
114
+ self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
115
+ self.save_attention = False
116
+
117
+ def save_attn_gradients(self, attn_gradients):
118
+ self.attn_gradients = attn_gradients
119
+
120
+ def get_attn_gradients(self):
121
+ return self.attn_gradients
122
+
123
+ def save_attention_map(self, attention_map):
124
+ self.attention_map = attention_map
125
+
126
+ def get_attention_map(self):
127
+ return self.attention_map
128
+
129
+ def transpose_for_scores(self, x):
130
+ new_x_shape = x.size()[:-1] + (
131
+ self.num_attention_heads,
132
+ self.attention_head_size,
133
+ )
134
+ x = x.view(*new_x_shape)
135
+ return x.permute(0, 2, 1, 3)
136
+
137
+ def forward(
138
+ self,
139
+ hidden_states,
140
+ attention_mask=None,
141
+ head_mask=None,
142
+ encoder_hidden_states=None,
143
+ encoder_attention_mask=None,
144
+ past_key_value=None,
145
+ output_attentions=False,
146
+ ):
147
+ # If this is instantiated as a cross-attention module, the keys
148
+ # and values come from an encoder; the attention mask needs to be
149
+ # such that the encoder's padding tokens are not attended to.
150
+ is_cross_attention = encoder_hidden_states is not None
151
+
152
+ if is_cross_attention:
153
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
154
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
155
+ attention_mask = encoder_attention_mask
156
+ elif past_key_value is not None:
157
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
158
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
159
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
160
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
161
+ else:
162
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
163
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
164
+
165
+ mixed_query_layer = self.query(hidden_states)
166
+
167
+ query_layer = self.transpose_for_scores(mixed_query_layer)
168
+
169
+ past_key_value = (key_layer, value_layer)
170
+
171
+ # Take the dot product between "query" and "key" to get the raw attention scores.
172
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
173
+
174
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
175
+ seq_length = hidden_states.size()[1]
176
+ position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
177
+ position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
178
+ distance = position_ids_l - position_ids_r
179
+ positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
180
+ positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
181
+
182
+ if self.position_embedding_type == "relative_key":
183
+ relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
184
+ attention_scores = attention_scores + relative_position_scores
185
+ elif self.position_embedding_type == "relative_key_query":
186
+ relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
187
+ relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
188
+ attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
189
+
190
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
191
+ if attention_mask is not None:
192
+ # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
193
+ attention_scores = attention_scores + attention_mask
194
+
195
+ # Normalize the attention scores to probabilities.
196
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
197
+
198
+ if is_cross_attention and self.save_attention:
199
+ self.save_attention_map(attention_probs)
200
+ attention_probs.register_hook(self.save_attn_gradients)
201
+
202
+ # This is actually dropping out entire tokens to attend to, which might
203
+ # seem a bit unusual, but is taken from the original Transformer paper.
204
+ attention_probs_dropped = self.dropout(attention_probs)
205
+
206
+ # Mask heads if we want to
207
+ if head_mask is not None:
208
+ attention_probs_dropped = attention_probs_dropped * head_mask
209
+
210
+ context_layer = torch.matmul(attention_probs_dropped, value_layer)
211
+
212
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
213
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
214
+ context_layer = context_layer.view(*new_context_layer_shape)
215
+
216
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
217
+
218
+ outputs = outputs + (past_key_value,)
219
+ return outputs
220
+
221
+
222
+ class BertSelfOutput(nn.Module):
223
+ def __init__(self, config):
224
+ super().__init__()
225
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
226
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
227
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
228
+
229
+ def forward(self, hidden_states, input_tensor):
230
+ hidden_states = self.dense(hidden_states)
231
+ hidden_states = self.dropout(hidden_states)
232
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
233
+ return hidden_states
234
+
235
+
236
+ class BertAttention(nn.Module):
237
+ def __init__(self, config, is_cross_attention=False):
238
+ super().__init__()
239
+ self.self = BertSelfAttention(config, is_cross_attention)
240
+ self.output = BertSelfOutput(config)
241
+ self.pruned_heads = set()
242
+
243
+ def prune_heads(self, heads):
244
+ if len(heads) == 0:
245
+ return
246
+ heads, index = find_pruneable_heads_and_indices(
247
+ heads,
248
+ self.self.num_attention_heads,
249
+ self.self.attention_head_size,
250
+ self.pruned_heads,
251
+ )
252
+
253
+ # Prune linear layers
254
+ self.self.query = prune_linear_layer(self.self.query, index)
255
+ self.self.key = prune_linear_layer(self.self.key, index)
256
+ self.self.value = prune_linear_layer(self.self.value, index)
257
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
258
+
259
+ # Update hyper params and store pruned heads
260
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
261
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
262
+ self.pruned_heads = self.pruned_heads.union(heads)
263
+
264
+ def forward(
265
+ self,
266
+ hidden_states,
267
+ attention_mask=None,
268
+ head_mask=None,
269
+ encoder_hidden_states=None,
270
+ encoder_attention_mask=None,
271
+ past_key_value=None,
272
+ output_attentions=False,
273
+ ):
274
+ self_outputs = self.self(
275
+ hidden_states,
276
+ attention_mask,
277
+ head_mask,
278
+ encoder_hidden_states,
279
+ encoder_attention_mask,
280
+ past_key_value,
281
+ output_attentions,
282
+ )
283
+ attention_output = self.output(self_outputs[0], hidden_states)
284
+
285
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
286
+ return outputs
287
+
288
+
289
+ class BertIntermediate(nn.Module):
290
+ def __init__(self, config):
291
+ super().__init__()
292
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
293
+ if isinstance(config.hidden_act, str):
294
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
295
+ else:
296
+ self.intermediate_act_fn = config.hidden_act
297
+
298
+ def forward(self, hidden_states):
299
+ hidden_states = self.dense(hidden_states)
300
+ hidden_states = self.intermediate_act_fn(hidden_states)
301
+ return hidden_states
302
+
303
+
304
+ class BertOutput(nn.Module):
305
+ def __init__(self, config):
306
+ super().__init__()
307
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
308
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
309
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
310
+
311
+ def forward(self, hidden_states, input_tensor):
312
+ hidden_states = self.dense(hidden_states)
313
+ hidden_states = self.dropout(hidden_states)
314
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
315
+ return hidden_states
316
+
317
+
318
+ class BertLayer(nn.Module):
319
+ def __init__(self, config, layer_num):
320
+ super().__init__()
321
+ self.config = config
322
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
323
+ self.seq_len_dim = 1
324
+ self.attention = BertAttention(config)
325
+ self.layer_num = layer_num
326
+ if self.config.add_cross_attention and layer_num % self.config.cross_attention_freq == 0:
327
+ self.crossattention = BertAttention(config, is_cross_attention=self.config.add_cross_attention)
328
+ self.has_cross_attention = True
329
+ else:
330
+ self.has_cross_attention = False
331
+ self.intermediate = BertIntermediate(config)
332
+ self.output = BertOutput(config)
333
+
334
+ self.intermediate_query = BertIntermediate(config)
335
+ self.output_query = BertOutput(config)
336
+
337
+ def forward(
338
+ self,
339
+ hidden_states,
340
+ attention_mask=None,
341
+ head_mask=None,
342
+ encoder_hidden_states=None,
343
+ encoder_attention_mask=None,
344
+ past_key_value=None,
345
+ output_attentions=False,
346
+ query_length=0,
347
+ ):
348
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
349
+ self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
350
+ self_attention_outputs = self.attention(
351
+ hidden_states,
352
+ attention_mask,
353
+ head_mask,
354
+ output_attentions=output_attentions,
355
+ past_key_value=self_attn_past_key_value,
356
+ )
357
+ attention_output = self_attention_outputs[0]
358
+ outputs = self_attention_outputs[1:-1]
359
+
360
+ present_key_value = self_attention_outputs[-1]
361
+
362
+ if query_length > 0:
363
+ query_attention_output = attention_output[:, :query_length, :]
364
+
365
+ if self.has_cross_attention:
366
+ assert (
367
+ encoder_hidden_states is not None
368
+ ), "encoder_hidden_states must be given for cross-attention layers"
369
+ cross_attention_outputs = self.crossattention(
370
+ query_attention_output,
371
+ attention_mask,
372
+ head_mask,
373
+ encoder_hidden_states,
374
+ encoder_attention_mask,
375
+ output_attentions=output_attentions,
376
+ )
377
+ query_attention_output = cross_attention_outputs[0]
378
+ outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
379
+
380
+ layer_output = apply_chunking_to_forward(
381
+ self.feed_forward_chunk_query,
382
+ self.chunk_size_feed_forward,
383
+ self.seq_len_dim,
384
+ query_attention_output,
385
+ )
386
+ if attention_output.shape[1] > query_length:
387
+ layer_output_text = apply_chunking_to_forward(
388
+ self.feed_forward_chunk,
389
+ self.chunk_size_feed_forward,
390
+ self.seq_len_dim,
391
+ attention_output[:, query_length:, :],
392
+ )
393
+ layer_output = torch.cat([layer_output, layer_output_text], dim=1)
394
+ else:
395
+ layer_output = apply_chunking_to_forward(
396
+ self.feed_forward_chunk,
397
+ self.chunk_size_feed_forward,
398
+ self.seq_len_dim,
399
+ attention_output,
400
+ )
401
+ outputs = (layer_output,) + outputs
402
+
403
+ outputs = outputs + (present_key_value,)
404
+
405
+ return outputs
406
+
407
+ def feed_forward_chunk(self, attention_output):
408
+ intermediate_output = self.intermediate(attention_output)
409
+ layer_output = self.output(intermediate_output, attention_output)
410
+ return layer_output
411
+
412
+ def feed_forward_chunk_query(self, attention_output):
413
+ intermediate_output = self.intermediate_query(attention_output)
414
+ layer_output = self.output_query(intermediate_output, attention_output)
415
+ return layer_output
416
+
417
+
418
+ class BertEncoder(nn.Module):
419
+ def __init__(self, config):
420
+ super().__init__()
421
+ self.config = config
422
+ self.layer = nn.ModuleList([BertLayer(config, i) for i in range(config.num_hidden_layers)])
423
+
424
+ def forward(
425
+ self,
426
+ hidden_states,
427
+ attention_mask=None,
428
+ head_mask=None,
429
+ encoder_hidden_states=None,
430
+ encoder_attention_mask=None,
431
+ past_key_values=None,
432
+ use_cache=None,
433
+ output_attentions=False,
434
+ output_hidden_states=False,
435
+ return_dict=True,
436
+ query_length=0,
437
+ ):
438
+ all_hidden_states = () if output_hidden_states else None
439
+ all_self_attentions = () if output_attentions else None
440
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
441
+
442
+ next_decoder_cache = () if use_cache else None
443
+
444
+ for i in range(self.config.num_hidden_layers):
445
+ layer_module = self.layer[i]
446
+ if output_hidden_states:
447
+ all_hidden_states = all_hidden_states + (hidden_states,)
448
+
449
+ layer_head_mask = head_mask[i] if head_mask is not None else None
450
+ past_key_value = past_key_values[i] if past_key_values is not None else None
451
+
452
+ if getattr(self.config, "gradient_checkpointing", False) and self.training:
453
+ if use_cache:
454
+ logger.warn(
455
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
456
+ )
457
+ use_cache = False
458
+
459
+ def create_custom_forward(module):
460
+ def custom_forward(*inputs):
461
+ return module(*inputs, past_key_value, output_attentions, query_length)
462
+
463
+ return custom_forward
464
+
465
+ layer_outputs = torch.utils.checkpoint.checkpoint(
466
+ create_custom_forward(layer_module),
467
+ hidden_states,
468
+ attention_mask,
469
+ layer_head_mask,
470
+ encoder_hidden_states,
471
+ encoder_attention_mask,
472
+ )
473
+ else:
474
+ layer_outputs = layer_module(
475
+ hidden_states,
476
+ attention_mask,
477
+ layer_head_mask,
478
+ encoder_hidden_states,
479
+ encoder_attention_mask,
480
+ past_key_value,
481
+ output_attentions,
482
+ query_length,
483
+ )
484
+
485
+ hidden_states = layer_outputs[0]
486
+ if use_cache:
487
+ next_decoder_cache += (layer_outputs[-1],)
488
+ if output_attentions:
489
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
490
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
491
+
492
+ if output_hidden_states:
493
+ all_hidden_states = all_hidden_states + (hidden_states,)
494
+
495
+ if not return_dict:
496
+ return tuple(
497
+ v
498
+ for v in [
499
+ hidden_states,
500
+ next_decoder_cache,
501
+ all_hidden_states,
502
+ all_self_attentions,
503
+ all_cross_attentions,
504
+ ]
505
+ if v is not None
506
+ )
507
+ return BaseModelOutputWithPastAndCrossAttentions(
508
+ last_hidden_state=hidden_states,
509
+ past_key_values=next_decoder_cache,
510
+ hidden_states=all_hidden_states,
511
+ attentions=all_self_attentions,
512
+ cross_attentions=all_cross_attentions,
513
+ )
514
+
515
+
516
+ class BertPooler(nn.Module):
517
+ def __init__(self, config):
518
+ super().__init__()
519
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
520
+ self.activation = nn.Tanh()
521
+
522
+ def forward(self, hidden_states):
523
+ # We "pool" the model by simply taking the hidden state corresponding
524
+ # to the first token.
525
+ first_token_tensor = hidden_states[:, 0]
526
+ pooled_output = self.dense(first_token_tensor)
527
+ pooled_output = self.activation(pooled_output)
528
+ return pooled_output
529
+
530
+
531
+ class BertPredictionHeadTransform(nn.Module):
532
+ def __init__(self, config):
533
+ super().__init__()
534
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
535
+ if isinstance(config.hidden_act, str):
536
+ self.transform_act_fn = ACT2FN[config.hidden_act]
537
+ else:
538
+ self.transform_act_fn = config.hidden_act
539
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
540
+
541
+ def forward(self, hidden_states):
542
+ hidden_states = self.dense(hidden_states)
543
+ hidden_states = self.transform_act_fn(hidden_states)
544
+ hidden_states = self.LayerNorm(hidden_states)
545
+ return hidden_states
546
+
547
+
548
+ class BertLMPredictionHead(nn.Module):
549
+ def __init__(self, config):
550
+ super().__init__()
551
+ self.transform = BertPredictionHeadTransform(config)
552
+
553
+ # The output weights are the same as the input embeddings, but there is
554
+ # an output-only bias for each token.
555
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
556
+
557
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
558
+
559
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
560
+ self.decoder.bias = self.bias
561
+
562
+ def forward(self, hidden_states):
563
+ hidden_states = self.transform(hidden_states)
564
+ hidden_states = self.decoder(hidden_states)
565
+ return hidden_states
566
+
567
+
568
+ class BertOnlyMLMHead(nn.Module):
569
+ def __init__(self, config):
570
+ super().__init__()
571
+ self.predictions = BertLMPredictionHead(config)
572
+
573
+ def forward(self, sequence_output):
574
+ prediction_scores = self.predictions(sequence_output)
575
+ return prediction_scores
576
+
577
+
578
+ class BertPreTrainedModel(PreTrainedModel):
579
+ """
580
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
581
+ models.
582
+ """
583
+
584
+ config_class = BertConfig
585
+ base_model_prefix = "bert"
586
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
587
+
588
+ def _init_weights(self, module):
589
+ """Initialize the weights"""
590
+ if isinstance(module, (nn.Linear, nn.Embedding)):
591
+ # Slightly different from the TF version which uses truncated_normal for initialization
592
+ # cf https://github.com/pytorch/pytorch/pull/5617
593
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
594
+ elif isinstance(module, nn.LayerNorm):
595
+ module.bias.data.zero_()
596
+ module.weight.data.fill_(1.0)
597
+ if isinstance(module, nn.Linear) and module.bias is not None:
598
+ module.bias.data.zero_()
599
+
600
+
601
+ class BertModel(BertPreTrainedModel):
602
+ """
603
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
604
+ cross-attention is added between the self-attention layers, following the architecture described in `Attention is
605
+ all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
606
+ Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
607
+ argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
608
+ input to the forward pass.
609
+ """
610
+
611
+ def __init__(self, config, add_pooling_layer=False):
612
+ super().__init__(config)
613
+ self.config = config
614
+
615
+ self.embeddings = BertEmbeddings(config)
616
+
617
+ self.encoder = BertEncoder(config)
618
+
619
+ self.pooler = BertPooler(config) if add_pooling_layer else None
620
+
621
+ self.init_weights()
622
+
623
+ def get_input_embeddings(self):
624
+ return self.embeddings.word_embeddings
625
+
626
+ def set_input_embeddings(self, value):
627
+ self.embeddings.word_embeddings = value
628
+
629
+ def _prune_heads(self, heads_to_prune):
630
+ """
631
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
632
+ class PreTrainedModel
633
+ """
634
+ for layer, heads in heads_to_prune.items():
635
+ self.encoder.layer[layer].attention.prune_heads(heads)
636
+
637
+ def get_extended_attention_mask(
638
+ self,
639
+ attention_mask: Tensor,
640
+ input_shape: Tuple[int],
641
+ device: device,
642
+ is_decoder: bool,
643
+ has_query: bool = False,
644
+ ) -> Tensor:
645
+ """
646
+ Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
647
+
648
+ Arguments:
649
+ attention_mask (:obj:`torch.Tensor`):
650
+ Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
651
+ input_shape (:obj:`Tuple[int]`):
652
+ The shape of the input to the model.
653
+ device: (:obj:`torch.device`):
654
+ The device of the input to the model.
655
+
656
+ Returns:
657
+ :obj:`torch.Tensor` The extended attention mask, with the same dtype as :obj:`attention_mask.dtype`.
658
+ """
659
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
660
+ # ourselves in which case we just need to make it broadcastable to all heads.
661
+ if attention_mask.dim() == 3:
662
+ extended_attention_mask = attention_mask[:, None, :, :]
663
+ elif attention_mask.dim() == 2:
664
+ # Provided a padding mask of dimensions [batch_size, seq_length]
665
+ # - if the model is a decoder, apply a causal mask in addition to the padding mask
666
+ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
667
+ if is_decoder:
668
+ batch_size, seq_length = input_shape
669
+
670
+ seq_ids = torch.arange(seq_length, device=device)
671
+ causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
672
+
673
+ # add a prefix ones mask to the causal mask
674
+ # causal and attention masks must have same type with pytorch version < 1.3
675
+ causal_mask = causal_mask.to(attention_mask.dtype)
676
+
677
+ if causal_mask.shape[1] < attention_mask.shape[1]:
678
+ prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
679
+ if has_query: # UniLM style attention mask
680
+ causal_mask = torch.cat(
681
+ [
682
+ torch.zeros(
683
+ (batch_size, prefix_seq_len, seq_length),
684
+ device=device,
685
+ dtype=causal_mask.dtype,
686
+ ),
687
+ causal_mask,
688
+ ],
689
+ axis=1,
690
+ )
691
+ causal_mask = torch.cat(
692
+ [
693
+ torch.ones(
694
+ (batch_size, causal_mask.shape[1], prefix_seq_len),
695
+ device=device,
696
+ dtype=causal_mask.dtype,
697
+ ),
698
+ causal_mask,
699
+ ],
700
+ axis=-1,
701
+ )
702
+ extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
703
+ else:
704
+ extended_attention_mask = attention_mask[:, None, None, :]
705
+ else:
706
+ raise ValueError(
707
+ "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
708
+ input_shape, attention_mask.shape
709
+ )
710
+ )
711
+
712
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
713
+ # masked positions, this operation will create a tensor which is 0.0 for
714
+ # positions we want to attend and -10000.0 for masked positions.
715
+ # Since we are adding it to the raw scores before the softmax, this is
716
+ # effectively the same as removing these entirely.
717
+ extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
718
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
719
+ return extended_attention_mask
720
+
721
+ def forward(
722
+ self,
723
+ input_ids=None,
724
+ attention_mask=None,
725
+ position_ids=None,
726
+ head_mask=None,
727
+ query_embeds=None,
728
+ encoder_hidden_states=None,
729
+ encoder_attention_mask=None,
730
+ past_key_values=None,
731
+ use_cache=None,
732
+ output_attentions=None,
733
+ output_hidden_states=None,
734
+ return_dict=None,
735
+ is_decoder=False,
736
+ ):
737
+ r"""
738
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
739
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
740
+ the model is configured as a decoder.
741
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
742
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
743
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
744
+ - 1 for tokens that are **not masked**,
745
+ - 0 for tokens that are **masked**.
746
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
747
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
748
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
749
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
750
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
751
+ use_cache (:obj:`bool`, `optional`):
752
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
753
+ decoding (see :obj:`past_key_values`).
754
+ """
755
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
756
+ output_hidden_states = (
757
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
758
+ )
759
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
760
+
761
+ # use_cache = use_cache if use_cache is not None else self.config.use_cache
762
+
763
+ if input_ids is None:
764
+ assert query_embeds is not None, "You have to specify query_embeds when input_ids is None"
765
+
766
+ # past_key_values_length
767
+ past_key_values_length = (
768
+ past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0
769
+ )
770
+
771
+ query_length = query_embeds.shape[1] if query_embeds is not None else 0
772
+
773
+ embedding_output = self.embeddings(
774
+ input_ids=input_ids,
775
+ position_ids=position_ids,
776
+ query_embeds=query_embeds,
777
+ past_key_values_length=past_key_values_length,
778
+ )
779
+
780
+ input_shape = embedding_output.size()[:-1]
781
+ batch_size, seq_length = input_shape
782
+ device = embedding_output.device
783
+
784
+ if attention_mask is None:
785
+ attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
786
+
787
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
788
+ # ourselves in which case we just need to make it broadcastable to all heads.
789
+ if is_decoder:
790
+ extended_attention_mask = self.get_extended_attention_mask(
791
+ attention_mask,
792
+ input_ids.shape,
793
+ device,
794
+ is_decoder,
795
+ has_query=(query_embeds is not None),
796
+ )
797
+ else:
798
+ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device, is_decoder)
799
+
800
+ # If a 2D or 3D attention mask is provided for the cross-attention
801
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
802
+ if encoder_hidden_states is not None:
803
+ if isinstance(encoder_hidden_states, list):
804
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
805
+ else:
806
+ (
807
+ encoder_batch_size,
808
+ encoder_sequence_length,
809
+ _,
810
+ ) = encoder_hidden_states.size()
811
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
812
+
813
+ if isinstance(encoder_attention_mask, list):
814
+ encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
815
+ elif encoder_attention_mask is None:
816
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
817
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
818
+ else:
819
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
820
+ else:
821
+ encoder_extended_attention_mask = None
822
+
823
+ # Prepare head mask if needed
824
+ # 1.0 in head_mask indicate we keep the head
825
+ # attention_probs has shape bsz x n_heads x N x N
826
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
827
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
828
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
829
+
830
+ encoder_outputs = self.encoder(
831
+ embedding_output,
832
+ attention_mask=extended_attention_mask,
833
+ head_mask=head_mask,
834
+ encoder_hidden_states=encoder_hidden_states,
835
+ encoder_attention_mask=encoder_extended_attention_mask,
836
+ past_key_values=past_key_values,
837
+ use_cache=use_cache,
838
+ output_attentions=output_attentions,
839
+ output_hidden_states=output_hidden_states,
840
+ return_dict=return_dict,
841
+ query_length=query_length,
842
+ )
843
+ sequence_output = encoder_outputs[0]
844
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
845
+
846
+ if not return_dict:
847
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
848
+
849
+ return BaseModelOutputWithPoolingAndCrossAttentions(
850
+ last_hidden_state=sequence_output,
851
+ pooler_output=pooled_output,
852
+ past_key_values=encoder_outputs.past_key_values,
853
+ hidden_states=encoder_outputs.hidden_states,
854
+ attentions=encoder_outputs.attentions,
855
+ cross_attentions=encoder_outputs.cross_attentions,
856
+ )
857
+
858
+
859
+ class BertLMHeadModel(BertPreTrainedModel):
860
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
861
+ _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
862
+
863
+ def __init__(self, config):
864
+ super().__init__(config)
865
+
866
+ self.bert = BertModel(config, add_pooling_layer=False)
867
+ self.cls = BertOnlyMLMHead(config)
868
+
869
+ self.init_weights()
870
+
871
+ def get_output_embeddings(self):
872
+ return self.cls.predictions.decoder
873
+
874
+ def set_output_embeddings(self, new_embeddings):
875
+ self.cls.predictions.decoder = new_embeddings
876
+
877
+ def forward(
878
+ self,
879
+ input_ids=None,
880
+ attention_mask=None,
881
+ position_ids=None,
882
+ head_mask=None,
883
+ query_embeds=None,
884
+ encoder_hidden_states=None,
885
+ encoder_attention_mask=None,
886
+ labels=None,
887
+ past_key_values=None,
888
+ use_cache=True,
889
+ output_attentions=None,
890
+ output_hidden_states=None,
891
+ return_dict=None,
892
+ return_logits=False,
893
+ is_decoder=True,
894
+ reduction="mean",
895
+ ):
896
+ r"""
897
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
898
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
899
+ the model is configured as a decoder.
900
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
901
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
902
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
903
+ - 1 for tokens that are **not masked**,
904
+ - 0 for tokens that are **masked**.
905
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
906
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
907
+ ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
908
+ ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
909
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
910
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
911
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
912
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
913
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
914
+ use_cache (:obj:`bool`, `optional`):
915
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
916
+ decoding (see :obj:`past_key_values`).
917
+ Returns:
918
+ Example::
919
+ >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
920
+ >>> import torch
921
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
922
+ >>> config = BertConfig.from_pretrained("bert-base-cased")
923
+ >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
924
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
925
+ >>> outputs = model(**inputs)
926
+ >>> prediction_logits = outputs.logits
927
+ """
928
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
929
+ if labels is not None:
930
+ use_cache = False
931
+ if past_key_values is not None:
932
+ query_embeds = None
933
+
934
+ outputs = self.bert(
935
+ input_ids,
936
+ attention_mask=attention_mask,
937
+ position_ids=position_ids,
938
+ head_mask=head_mask,
939
+ query_embeds=query_embeds,
940
+ encoder_hidden_states=encoder_hidden_states,
941
+ encoder_attention_mask=encoder_attention_mask,
942
+ past_key_values=past_key_values,
943
+ use_cache=use_cache,
944
+ output_attentions=output_attentions,
945
+ output_hidden_states=output_hidden_states,
946
+ return_dict=return_dict,
947
+ is_decoder=is_decoder,
948
+ )
949
+
950
+ sequence_output = outputs[0]
951
+ if query_embeds is not None:
952
+ sequence_output = outputs[0][:, query_embeds.shape[1] :, :]
953
+
954
+ prediction_scores = self.cls(sequence_output)
955
+
956
+ if return_logits:
957
+ return prediction_scores[:, :-1, :].contiguous()
958
+
959
+ lm_loss = None
960
+ if labels is not None:
961
+ # we are doing next-token prediction; shift prediction scores and input ids by one
962
+ shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
963
+ labels = labels[:, 1:].contiguous()
964
+ loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
965
+ lm_loss = loss_fct(
966
+ shifted_prediction_scores.view(-1, self.config.vocab_size),
967
+ labels.view(-1),
968
+ )
969
+ if reduction == "none":
970
+ lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)
971
+
972
+ if not return_dict:
973
+ output = (prediction_scores,) + outputs[2:]
974
+ return ((lm_loss,) + output) if lm_loss is not None else output
975
+
976
+ return CausalLMOutputWithCrossAttentions(
977
+ loss=lm_loss,
978
+ logits=prediction_scores,
979
+ past_key_values=outputs.past_key_values,
980
+ hidden_states=outputs.hidden_states,
981
+ attentions=outputs.attentions,
982
+ cross_attentions=outputs.cross_attentions,
983
+ )
984
+
985
+ def prepare_inputs_for_generation(self, input_ids, query_embeds, past=None, attention_mask=None, **model_kwargs):
986
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
987
+ if attention_mask is None:
988
+ attention_mask = input_ids.new_ones(input_ids.shape)
989
+ query_mask = input_ids.new_ones(query_embeds.shape[:-1])
990
+ attention_mask = torch.cat([query_mask, attention_mask], dim=-1)
991
+
992
+ # cut decoder_input_ids if past is used
993
+ if past is not None:
994
+ input_ids = input_ids[:, -1:]
995
+
996
+ return {
997
+ "input_ids": input_ids,
998
+ "query_embeds": query_embeds,
999
+ "attention_mask": attention_mask,
1000
+ "past_key_values": past,
1001
+ "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
1002
+ "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
1003
+ "is_decoder": True,
1004
+ }
1005
+
1006
+ def _reorder_cache(self, past, beam_idx):
1007
+ reordered_past = ()
1008
+ for layer_past in past:
1009
+ reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
1010
+ return reordered_past
1011
+
1012
+
1013
+ class BertForMaskedLM(BertPreTrainedModel):
1014
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
1015
+ _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
1016
+
1017
+ def __init__(self, config):
1018
+ super().__init__(config)
1019
+
1020
+ self.bert = BertModel(config, add_pooling_layer=False)
1021
+ self.cls = BertOnlyMLMHead(config)
1022
+
1023
+ self.init_weights()
1024
+
1025
+ def get_output_embeddings(self):
1026
+ return self.cls.predictions.decoder
1027
+
1028
+ def set_output_embeddings(self, new_embeddings):
1029
+ self.cls.predictions.decoder = new_embeddings
1030
+
1031
+ def forward(
1032
+ self,
1033
+ input_ids=None,
1034
+ attention_mask=None,
1035
+ position_ids=None,
1036
+ head_mask=None,
1037
+ query_embeds=None,
1038
+ encoder_hidden_states=None,
1039
+ encoder_attention_mask=None,
1040
+ labels=None,
1041
+ output_attentions=None,
1042
+ output_hidden_states=None,
1043
+ return_dict=None,
1044
+ return_logits=False,
1045
+ is_decoder=False,
1046
+ ):
1047
+ r"""
1048
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
1049
+ Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
1050
+ config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
1051
+ (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
1052
+ """
1053
+
1054
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1055
+
1056
+ outputs = self.bert(
1057
+ input_ids,
1058
+ attention_mask=attention_mask,
1059
+ position_ids=position_ids,
1060
+ head_mask=head_mask,
1061
+ query_embeds=query_embeds,
1062
+ encoder_hidden_states=encoder_hidden_states,
1063
+ encoder_attention_mask=encoder_attention_mask,
1064
+ output_attentions=output_attentions,
1065
+ output_hidden_states=output_hidden_states,
1066
+ return_dict=return_dict,
1067
+ is_decoder=is_decoder,
1068
+ )
1069
+
1070
+ if query_embeds is not None:
1071
+ sequence_output = outputs[0][:, query_embeds.shape[1] :, :]
1072
+ prediction_scores = self.cls(sequence_output)
1073
+
1074
+ if return_logits:
1075
+ return prediction_scores
1076
+
1077
+ masked_lm_loss = None
1078
+ if labels is not None:
1079
+ loss_fct = CrossEntropyLoss() # -100 index = padding token
1080
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1081
+
1082
+ if not return_dict:
1083
+ output = (prediction_scores,) + outputs[2:]
1084
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1085
+
1086
+ return MaskedLMOutput(
1087
+ loss=masked_lm_loss,
1088
+ logits=prediction_scores,
1089
+ hidden_states=outputs.hidden_states,
1090
+ attentions=outputs.attentions,
1091
+ )
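For orientation, here is a minimal usage sketch for the Q-Former classes above: learned query tokens are fed through BertLMHeadModel.bert and cross-attend to external encoder features. Assumptions not shown in this diff: the file is importable as NatureLM.models.Qformer, BertConfig is the transformers config (unknown kwargs become attributes), and the custom attributes encoder_width, cross_attention_freq and query_length are otherwise set when loading a checkpoint; the shapes and random weights below are illustrative only.

import torch
from transformers import BertConfig
from NatureLM.models.Qformer import BertLMHeadModel  # assumed module path for this file

cfg = BertConfig(
    hidden_size=768,
    num_hidden_layers=2,        # kept small for the sketch
    num_attention_heads=12,
    add_cross_attention=True,   # adds a cross-attention block every cross_attention_freq layers
    cross_attention_freq=2,
    query_length=32,
)
cfg.encoder_width = 512         # feature width of the external (e.g. audio) encoder

qformer = BertLMHeadModel(cfg)
query_embeds = torch.randn(1, 32, cfg.hidden_size)       # learned query tokens (random here)
encoder_feats = torch.randn(1, 100, cfg.encoder_width)   # frame-level encoder output
encoder_mask = torch.ones(1, 100, dtype=torch.long)

out = qformer.bert(
    query_embeds=query_embeds,
    encoder_hidden_states=encoder_feats,
    encoder_attention_mask=encoder_mask,
    return_dict=True,
)
print(out.last_hidden_state.shape)  # (1, 32, 768): one output vector per query token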
NatureLM/models/__init__.py ADDED
@@ -0,0 +1,19 @@
1
+ # Copyright (2024) Earth Species Project
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .NatureLM import NatureLM
16
+
17
+
18
+ def load_model(config):
19
+ return NatureLM.from_config(config)
NatureLM/models/__pycache__/NatureLM.cpython-310.pyc ADDED
Binary file (15.3 kB).
 
NatureLM/models/__pycache__/Qformer.cpython-310.pyc ADDED
Binary file (30 kB).
 
NatureLM/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (329 Bytes).
 
NatureLM/models/__pycache__/utils.cpython-310.pyc ADDED
Binary file (926 Bytes).
 
NatureLM/models/aves.py ADDED
@@ -0,0 +1,59 @@
1
+ import json
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from torchaudio.models import wav2vec2_model
7
+
8
+
9
+ class AvesEmbedding(nn.Module):
10
+ def __init__(self, sr, large=False):
11
+ super().__init__()
12
+ device = "cuda" if torch.cuda.is_available() else "cpu"
13
+
14
+ # reference: https://pytorch.org/audio/stable/_modules/torchaudio/models/wav2vec2/utils/import_fairseq.html
15
+ if large:
16
+ config = self.load_config("configs/birdaves_bioxlarge.config")
17
+ else:
18
+ config = self.load_config("configs/birdaves_bioxbase.config")
19
+ self.model = wav2vec2_model(**config, aux_num_out=None)
20
+ state_dict = torch.hub.load_state_dict_from_url(
21
+ "https://storage.googleapis.com/esp-public-files/birdaves/birdaves-biox-base.torchaudio.pt",
22
+ map_location=device,
23
+ )
24
+ self.model.load_state_dict(state_dict)
25
+ self.model.feature_extractor.requires_grad_(True)
26
+
27
+ # bundle = torchaudio.pipelines.WAV2VEC2_BASE
28
+ # self.model = bundle.get_model()
29
+
30
+ self.sr = sr
31
+
32
+ def load_config(self, config_path):
33
+ with open(config_path, "r") as ff:
34
+ obj = json.load(ff)
35
+
36
+ return obj
37
+
38
+ def forward(self, sig, padding_mask):
39
+ # extract_feature in the torchaudio version will output all 12 layers' output, -1 to select the final one
40
+ # print("sig", sig)
41
+
42
+ out = self.model.extract_features(sig.float())[0][-1]
43
+ atts = ~padding_mask
44
+ atts = atts.unsqueeze(1).float()
45
+ atts = F.max_pool1d(atts, kernel_size=320, stride=320)
46
+ atts = atts > 0
47
+ padding_mask = ~atts
48
+
49
+ return out, padding_mask
50
+
51
+ def freeze(self):
52
+ for param in self.model.encoder.parameters():
53
+ param.requires_grad = False
54
+ self.model.feature_extractor.requires_grad_(False)
55
+
56
+ def unfreeze(self):
57
+ for param in self.model.encoder.parameters():
58
+ param.requires_grad = True
59
+ self.model.feature_extractor.requires_grad_(True)
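To make the mask handling in AvesEmbedding.forward above concrete, a small self-contained sketch of the padding-mask downsampling (assumption: 16 kHz input, so the 320-sample hop corresponds to one ~20 ms wav2vec2 frame); the values are illustrative.

import torch
import torch.nn.functional as F

padding_mask = torch.zeros(2, 16000, dtype=torch.bool)  # two 1 s clips at 16 kHz
padding_mask[1, 8000:] = True                           # second clip is padded after 0.5 s

atts = (~padding_mask).unsqueeze(1).float()             # 1.0 where there is real audio
atts = F.max_pool1d(atts, kernel_size=320, stride=320) > 0
frame_mask = ~atts.squeeze(1)                           # True where an output frame is pure padding
print(frame_mask.shape, frame_mask[1].sum().item())     # torch.Size([2, 50]) 25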
NatureLM/models/beats/BEATs.py ADDED
@@ -0,0 +1,181 @@
1
+ # --------------------------------------------------------
2
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
+ # Copyright (c) 2022 Microsoft
5
+ # Licensed under The MIT License [see LICENSE for details]
6
+ # Based on fairseq code bases
7
+ # https://github.com/pytorch/fairseq
8
+ # --------------------------------------------------------
9
+
10
+
11
+ import logging
12
+ from typing import Optional
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ import torchaudio.compliance.kaldi as ta_kaldi
17
+ from torch.nn import LayerNorm
18
+
19
+ from .backbone import TransformerEncoder
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ class BEATsConfig:
25
+ def __init__(self, cfg=None):
26
+ self.input_patch_size: int = -1 # patch size of the patch embedding
27
+ self.embed_dim: int = 512 # patch embedding dimension
28
+ self.conv_bias: bool = False # include bias in conv encoder
29
+
30
+ self.encoder_layers: int = 12 # num encoder layers in the transformer
31
+ self.encoder_embed_dim: int = 768 # encoder embedding dimension
32
+ self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN
33
+ self.encoder_attention_heads: int = 12 # num encoder attention heads
34
+ self.activation_fn: str = "gelu" # activation function to use
35
+
36
+ self.layer_wise_gradient_decay_ratio: float = 1.0 # ratio for layer-wise gradient decay
37
+ self.layer_norm_first: bool = False # apply layernorm first in the transformer
38
+ self.deep_norm: bool = False # apply deep_norm first in the transformer
39
+
40
+ # dropouts
41
+ self.dropout: float = 0.1 # dropout probability for the transformer
42
+ self.attention_dropout: float = 0.1 # dropout probability for attention weights
43
+ self.activation_dropout: float = 0.0 # dropout probability after activation in FFN
44
+ self.encoder_layerdrop: float = 0.0 # probability of dropping a transformer layer
45
+ self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr)
46
+
47
+ # positional embeddings
48
+ self.conv_pos: int = 128 # number of filters for convolutional positional embeddings
49
+ self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding
50
+
51
+ # relative position embedding
52
+ self.relative_position_embedding: bool = False # apply relative position embedding
53
+ self.num_buckets: int = 320 # number of buckets for relative position embedding
54
+ self.max_distance: int = 1280 # maximum distance for relative position embedding
55
+ self.gru_rel_pos: bool = False # apply gated relative position embedding
56
+
57
+ # label predictor
58
+ self.finetuned_model: bool = False # whether the model is a fine-tuned model.
59
+ self.predictor_dropout: float = 0.1 # dropout probability for the predictor
60
+ self.predictor_class: int = 527 # target class number for the predictor
61
+
62
+ if cfg is not None:
63
+ self.update(cfg)
64
+
65
+ def update(self, cfg: dict):
66
+ self.__dict__.update(cfg)
67
+
68
+ def to_dict(self):
69
+ return self.__dict__
70
+
71
+
72
+ class BEATs(nn.Module):
73
+ def __init__(
74
+ self,
75
+ cfg: BEATsConfig,
76
+ ) -> None:
77
+ super().__init__()
78
+ logger.info(f"BEATs Config: {cfg.__dict__}")
79
+
80
+ self.cfg = cfg
81
+
82
+ self.embed = cfg.embed_dim
83
+ self.post_extract_proj = (
84
+ nn.Linear(self.embed, cfg.encoder_embed_dim) if self.embed != cfg.encoder_embed_dim else None
85
+ )
86
+
87
+ self.input_patch_size = cfg.input_patch_size
88
+ self.patch_embedding = nn.Conv2d(
89
+ 1, self.embed, kernel_size=self.input_patch_size, stride=self.input_patch_size, bias=cfg.conv_bias
90
+ )
91
+
92
+ self.dropout_input = nn.Dropout(cfg.dropout_input)
93
+
94
+ assert not cfg.deep_norm or not cfg.layer_norm_first
95
+ self.encoder = TransformerEncoder(cfg)
96
+ self.layer_norm = LayerNorm(self.embed)
97
+
98
+ if cfg.finetuned_model:
99
+ self.predictor_dropout = nn.Dropout(cfg.predictor_dropout)
100
+ self.predictor = nn.Linear(cfg.encoder_embed_dim, cfg.predictor_class)
101
+ else:
102
+ self.predictor = None
103
+
104
+ def forward_padding_mask(
105
+ self,
106
+ features: torch.Tensor,
107
+ padding_mask: torch.Tensor,
108
+ ) -> torch.Tensor:
109
+ extra = padding_mask.size(1) % features.size(1)
110
+ if extra > 0:
111
+ padding_mask = padding_mask[:, :-extra]
112
+ padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1)
113
+ padding_mask = padding_mask.all(-1)
114
+ return padding_mask
115
+
116
+ def preprocess(
117
+ self,
118
+ source: torch.Tensor,
119
+ fbank_mean: float = 15.41663,
120
+ fbank_std: float = 6.55582,
121
+ ) -> torch.Tensor:
122
+ fbanks = []
123
+ for waveform in source:
124
+ waveform = waveform.unsqueeze(0) * 2**15
125
+ fbank = ta_kaldi.fbank(waveform, num_mel_bins=128, sample_frequency=16000, frame_length=25, frame_shift=10)
126
+ fbanks.append(fbank)
127
+ fbank = torch.stack(fbanks, dim=0)
128
+ fbank = (fbank - fbank_mean) / (2 * fbank_std)
129
+ return fbank
130
+
131
+ def extract_features(
132
+ self,
133
+ source: torch.Tensor,
134
+ padding_mask: Optional[torch.Tensor] = None,
135
+ fbank_mean: float = 15.41663,
136
+ fbank_std: float = 6.55582,
137
+ feature_only=False,
138
+ ):
139
+ fbank = self.preprocess(source, fbank_mean=fbank_mean, fbank_std=fbank_std).to(torch.float32)
140
+
141
+ if padding_mask is not None:
142
+ padding_mask = self.forward_padding_mask(fbank, padding_mask)
143
+
144
+ fbank = fbank.unsqueeze(1)
145
+ features = self.patch_embedding(fbank)
146
+ features = features.reshape(features.shape[0], features.shape[1], -1)
147
+ features = features.transpose(1, 2)
148
+ features = self.layer_norm(features)
149
+
150
+ if padding_mask is not None:
151
+ padding_mask = self.forward_padding_mask(features, padding_mask)
152
+
153
+ if self.post_extract_proj is not None:
154
+ features = self.post_extract_proj(features)
155
+
156
+ x = self.dropout_input(features)
157
+
158
+ x, layer_results = self.encoder(
159
+ x,
160
+ padding_mask=padding_mask,
161
+ )
162
+
163
+ if not feature_only and self.predictor is not None:
164
+ x = self.predictor_dropout(x)
165
+ logits = self.predictor(x)
166
+
167
+ if padding_mask is not None and padding_mask.any():
168
+ logits[padding_mask] = 0
169
+ logits = logits.sum(dim=1)
170
+ logits = logits / (~padding_mask).sum(dim=1).unsqueeze(-1).expand_as(logits)
171
+ else:
172
+ logits = logits.mean(dim=1)
173
+
174
+ lprobs = torch.sigmoid(logits)
175
+
176
+ return lprobs, padding_mask
177
+ else:
178
+ return x, padding_mask
179
+
180
+ def forward(self, source: torch.Tensor, padding_mask: Optional[torch.Tensor] = None):
181
+ return self.extract_features(source, padding_mask, feature_only=True)
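As a shape check for the feature path above, a minimal sketch with a randomly initialized BEATs (assumption: in practice the config and weights come from a released BEATs checkpoint; a tiny 2-layer encoder is used here only to keep the example light).

import torch
from NatureLM.models.beats.BEATs import BEATs, BEATsConfig

cfg = BEATsConfig({"input_patch_size": 16, "encoder_layers": 2})
model = BEATs(cfg).eval()

wav = torch.randn(1, 16000)  # 1 s of 16 kHz audio
with torch.no_grad():
    feats, _ = model.extract_features(wav, feature_only=True)
print(feats.shape)  # (1, 48, 768): (98 fbank frames // 16) * (128 mel bins // 16) patches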
NatureLM/models/beats/Tokenizers.py ADDED
@@ -0,0 +1,173 @@
1
+ # --------------------------------------------------------
2
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
+ # Copyright (c) 2022 Microsoft
5
+ # Licensed under The MIT License [see LICENSE for details]
6
+ # Based on fairseq code bases
7
+ # https://github.com/pytorch/fairseq
8
+ # --------------------------------------------------------
9
+
10
+
11
+ import logging
12
+ from typing import Optional
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ import torchaudio.compliance.kaldi as ta_kaldi
17
+ from torch.nn import LayerNorm
18
+
19
+ from .backbone import (
20
+ TransformerEncoder,
21
+ )
22
+ from .quantizer import (
23
+ NormEMAVectorQuantizer,
24
+ )
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ class TokenizersConfig:
30
+ def __init__(self, cfg=None):
31
+ self.input_patch_size: int = -1 # patch size of the patch embedding
32
+ self.embed_dim: int = 512 # patch embedding dimension
33
+ self.conv_bias: bool = False # include bias in conv encoder
34
+
35
+ self.encoder_layers: int = 12 # num encoder layers in the transformer
36
+ self.encoder_embed_dim: int = 768 # encoder embedding dimension
37
+ self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN
38
+ self.encoder_attention_heads: int = 12 # num encoder attention heads
39
+ self.activation_fn: str = "gelu" # activation function to use
40
+
41
+ self.layer_norm_first: bool = False # apply layernorm first in the transformer
42
+ self.deep_norm: bool = False # apply deep_norm first in the transformer
43
+
44
+ # dropouts
45
+ self.dropout: float = 0.1 # dropout probability for the transformer
46
+ self.attention_dropout: float = 0.1 # dropout probability for attention weights
47
+ self.activation_dropout: float = 0.0 # dropout probability after activation in FFN
48
+ self.encoder_layerdrop: float = 0.0 # probability of dropping a transformer layer
49
+ self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr)
50
+
51
+ # positional embeddings
52
+ self.conv_pos: int = 128 # number of filters for convolutional positional embeddings
53
+ self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding
54
+
55
+ # relative position embedding
56
+ self.relative_position_embedding: bool = False # apply relative position embedding
57
+ self.num_buckets: int = 320 # number of buckets for relative position embedding
58
+ self.max_distance: int = 1280 # maximum distance for relative position embedding
59
+ self.gru_rel_pos: bool = False # apply gated relative position embedding
60
+
61
+ # quantizer
62
+ self.quant_n: int = 1024 # codebook number in quantizer
63
+ self.quant_dim: int = 256 # codebook dimension in quantizer
64
+
65
+ if cfg is not None:
66
+ self.update(cfg)
67
+
68
+ def update(self, cfg: dict):
69
+ self.__dict__.update(cfg)
70
+
71
+
72
+ class Tokenizers(nn.Module):
73
+ def __init__(
74
+ self,
75
+ cfg: TokenizersConfig,
76
+ ) -> None:
77
+ super().__init__()
78
+ logger.info(f"Tokenizers Config: {cfg.__dict__}")
79
+
80
+ self.cfg = cfg
81
+
82
+ self.embed = cfg.embed_dim
83
+ self.post_extract_proj = (
84
+ nn.Linear(self.embed, cfg.encoder_embed_dim) if self.embed != cfg.encoder_embed_dim else None
85
+ )
86
+
87
+ self.input_patch_size = cfg.input_patch_size
88
+ self.patch_embedding = nn.Conv2d(
89
+ 1, self.embed, kernel_size=self.input_patch_size, stride=self.input_patch_size, bias=cfg.conv_bias
90
+ )
91
+
92
+ self.dropout_input = nn.Dropout(cfg.dropout_input)
93
+
94
+ assert not cfg.deep_norm or not cfg.layer_norm_first
95
+ self.encoder = TransformerEncoder(cfg)
96
+ self.layer_norm = LayerNorm(self.embed)
97
+
98
+ self.quantize = NormEMAVectorQuantizer(
99
+ n_embed=cfg.quant_n,
100
+ embedding_dim=cfg.quant_dim,
101
+ beta=1.0,
102
+ kmeans_init=True,
103
+ decay=0.99,
104
+ )
105
+ self.quant_n = cfg.quant_n
106
+ self.quantize_layer = nn.Sequential(
107
+ nn.Linear(cfg.encoder_embed_dim, cfg.encoder_embed_dim),
108
+ nn.Tanh(),
109
+ nn.Linear(cfg.encoder_embed_dim, cfg.quant_dim), # for quantize
110
+ )
111
+
112
+ def forward_padding_mask(
113
+ self,
114
+ features: torch.Tensor,
115
+ padding_mask: torch.Tensor,
116
+ ) -> torch.Tensor:
117
+ extra = padding_mask.size(1) % features.size(1)
118
+ if extra > 0:
119
+ padding_mask = padding_mask[:, :-extra]
120
+ padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1)
121
+ padding_mask = padding_mask.all(-1)
122
+ return padding_mask
123
+
124
+ def preprocess(
125
+ self,
126
+ source: torch.Tensor,
127
+ fbank_mean: float = 15.41663,
128
+ fbank_std: float = 6.55582,
129
+ ) -> torch.Tensor:
130
+ fbanks = []
131
+ for waveform in source:
132
+ waveform = waveform.unsqueeze(0) * 2**15
133
+ fbank = ta_kaldi.fbank(waveform, num_mel_bins=128, sample_frequency=16000, frame_length=25, frame_shift=10)
134
+ fbanks.append(fbank)
135
+ fbank = torch.stack(fbanks, dim=0)
136
+ fbank = (fbank - fbank_mean) / (2 * fbank_std)
137
+ return fbank
138
+
139
+ def extract_labels(
140
+ self,
141
+ source: torch.Tensor,
142
+ padding_mask: Optional[torch.Tensor] = None,
143
+ fbank_mean: float = 15.41663,
144
+ fbank_std: float = 6.55582,
145
+ ):
146
+ fbank = self.preprocess(source, fbank_mean=fbank_mean, fbank_std=fbank_std)
147
+
148
+ if padding_mask is not None:
149
+ padding_mask = self.forward_padding_mask(fbank, padding_mask)
150
+
151
+ fbank = fbank.unsqueeze(1)
152
+ features = self.patch_embedding(fbank)
153
+ features = features.reshape(features.shape[0], features.shape[1], -1)
154
+ features = features.transpose(1, 2)
155
+ features = self.layer_norm(features)
156
+
157
+ if padding_mask is not None:
158
+ padding_mask = self.forward_padding_mask(features, padding_mask)
159
+
160
+ if self.post_extract_proj is not None:
161
+ features = self.post_extract_proj(features)
162
+
163
+ x = self.dropout_input(features)
164
+
165
+ x, layer_results = self.encoder(
166
+ x,
167
+ padding_mask=padding_mask,
168
+ )
169
+
170
+ quantize_input = self.quantize_layer(x)
171
+ quantize_feature, embed_loss, embed_ind = self.quantize(quantize_input)
172
+
173
+ return embed_ind
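For orientation, a minimal sketch of the quantization bottleneck used by extract_labels above, shapes only; the actual codebook lookup happens inside NormEMAVectorQuantizer and is not reproduced here.

import torch
import torch.nn as nn

encoder_embed_dim, quant_dim, quant_n = 768, 256, 1024  # defaults from TokenizersConfig
quantize_layer = nn.Sequential(
    nn.Linear(encoder_embed_dim, encoder_embed_dim),
    nn.Tanh(),
    nn.Linear(encoder_embed_dim, quant_dim),
)

x = torch.randn(1, 48, encoder_embed_dim)  # encoder output, one vector per patch
quantize_input = quantize_layer(x)         # projected into the codebook space
print(quantize_input.shape)                # (1, 48, 256); extract_labels then maps each vector
                                           # to one of quant_n = 1024 code indices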
NatureLM/models/beats/__init__.py ADDED
File without changes
NatureLM/models/beats/__pycache__/BEATs.cpython-310.pyc ADDED
Binary file (4.48 kB).
 
NatureLM/models/beats/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (173 Bytes).
 
NatureLM/models/beats/__pycache__/backbone.cpython-310.pyc ADDED
Binary file (16.6 kB).
 
NatureLM/models/beats/__pycache__/modules.cpython-310.pyc ADDED
Binary file (6.14 kB).
 
NatureLM/models/beats/backbone.py ADDED
@@ -0,0 +1,741 @@
1
+ # --------------------------------------------------------
2
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
+ # Copyright (c) 2022 Microsoft
5
+ # Licensed under The MIT License [see LICENSE for details]
6
+ # Based on fairseq code bases
7
+ # https://github.com/pytorch/fairseq
8
+ # --------------------------------------------------------
9
+
10
+ import math
11
+ from typing import Dict, Optional, Tuple
12
+
13
+ import numpy as np
14
+ import torch
15
+ import torch.nn.functional as F
16
+ from torch import Tensor, nn
17
+ from torch.nn import LayerNorm, Parameter
18
+
19
+ from .modules import (
20
+ GLU_Linear,
21
+ GradMultiply,
22
+ SamePad,
23
+ get_activation_fn,
24
+ quant_noise,
25
+ )
26
+
27
+
28
+ class TransformerEncoder(nn.Module):
29
+ def __init__(self, args):
30
+ super().__init__()
31
+
32
+ self.dropout = args.dropout
33
+ self.embedding_dim = args.encoder_embed_dim
34
+
35
+ self.pos_conv = nn.Conv1d(
36
+ self.embedding_dim,
37
+ self.embedding_dim,
38
+ kernel_size=args.conv_pos,
39
+ padding=args.conv_pos // 2,
40
+ groups=args.conv_pos_groups,
41
+ )
42
+ dropout = 0
43
+ std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
44
+ nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
45
+ nn.init.constant_(self.pos_conv.bias, 0)
46
+
47
+ self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
48
+ self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU())
49
+
50
+ if hasattr(args, "relative_position_embedding"):
51
+ self.relative_position_embedding = args.relative_position_embedding
52
+ self.num_buckets = args.num_buckets
53
+ self.max_distance = args.max_distance
54
+ else:
55
+ self.relative_position_embedding = False
56
+ self.num_buckets = 0
57
+ self.max_distance = 0
58
+
59
+ self.layers = nn.ModuleList(
60
+ [
61
+ TransformerSentenceEncoderLayer(
62
+ embedding_dim=self.embedding_dim,
63
+ ffn_embedding_dim=args.encoder_ffn_embed_dim,
64
+ num_attention_heads=args.encoder_attention_heads,
65
+ dropout=self.dropout,
66
+ attention_dropout=args.attention_dropout,
67
+ activation_dropout=args.activation_dropout,
68
+ activation_fn=args.activation_fn,
69
+ layer_norm_first=args.layer_norm_first,
70
+ deep_norm=args.deep_norm,
71
+ has_relative_attention_bias=self.relative_position_embedding,
72
+ num_buckets=self.num_buckets,
73
+ max_distance=self.max_distance,
74
+ gru_rel_pos=args.gru_rel_pos,
75
+ encoder_layers=args.encoder_layers,
76
+ )
77
+ for i in range(args.encoder_layers)
78
+ ]
79
+ )
80
+ if self.relative_position_embedding:
81
+ for i in range(1, args.encoder_layers):
82
+ del self.layers[i].self_attn.relative_attention_bias
83
+ self.layers[i].self_attn.relative_attention_bias = self.layers[0].self_attn.relative_attention_bias
84
+
85
+ self.layer_norm_first = args.layer_norm_first
86
+ self.layer_norm = LayerNorm(self.embedding_dim)
87
+ self.layerdrop = args.encoder_layerdrop
88
+
89
+ self.apply(init_bert_params)
90
+
91
+ if args.deep_norm:
92
+ deep_norm_beta = math.pow(8 * args.encoder_layers, -1 / 4)
93
+ for i in range(args.encoder_layers):
94
+ nn.init.xavier_normal_(self.layers[i].self_attn.k_proj.weight, gain=1)
95
+ nn.init.xavier_normal_(self.layers[i].self_attn.v_proj.weight, gain=deep_norm_beta)
96
+ nn.init.xavier_normal_(self.layers[i].self_attn.q_proj.weight, gain=1)
97
+ nn.init.xavier_normal_(self.layers[i].self_attn.out_proj.weight, gain=deep_norm_beta)
98
+ nn.init.xavier_normal_(self.layers[i].fc1.weight, gain=deep_norm_beta)
99
+ nn.init.xavier_normal_(self.layers[i].fc2.weight, gain=deep_norm_beta)
100
+
101
+ self.layer_wise_gradient_decay_ratio = getattr(args, "layer_wise_gradient_decay_ratio", 1)
102
+
103
+ def forward(self, x, padding_mask=None, layer=None):
104
+ x, layer_results = self.extract_features(x, padding_mask, layer)
105
+
106
+ if self.layer_norm_first and layer is None:
107
+ x = self.layer_norm(x)
108
+
109
+ return x, layer_results
110
+
111
+ def extract_features(self, x, padding_mask=None, tgt_layer=None):
112
+ if padding_mask is not None:
113
+ x[padding_mask] = 0
114
+
115
+ x_conv = self.pos_conv(x.transpose(1, 2))
116
+ x_conv = x_conv.transpose(1, 2)
117
+ x = x + x_conv
118
+
119
+ if not self.layer_norm_first:
120
+ x = self.layer_norm(x)
121
+
122
+ x = F.dropout(x, p=self.dropout, training=self.training)
123
+
124
+ # B x T x C -> T x B x C
125
+ x = x.transpose(0, 1)
126
+
127
+ layer_results = []
128
+ z = None
129
+ if tgt_layer is not None:
130
+ layer_results.append((x, z))
131
+ r = None
132
+ pos_bias = None
133
+ for i, layer in enumerate(self.layers):
134
+ if self.layer_wise_gradient_decay_ratio != 1.0:
135
+ x = GradMultiply.apply(x, self.layer_wise_gradient_decay_ratio)
136
+ dropout_probability = np.random.random()
137
+ if not self.training or (dropout_probability > self.layerdrop):
138
+ x, z, pos_bias = layer(x, self_attn_padding_mask=padding_mask, need_weights=False, pos_bias=pos_bias)
139
+ if tgt_layer is not None:
140
+ layer_results.append((x, z))
141
+ if i == tgt_layer:
142
+ r = x
143
+ break
144
+
145
+ if r is not None:
146
+ x = r
147
+
148
+ # T x B x C -> B x T x C
149
+ x = x.transpose(0, 1)
150
+
151
+ return x, layer_results
152
+
153
+
154
+ class TransformerSentenceEncoderLayer(nn.Module):
155
+ def __init__(
156
+ self,
157
+ embedding_dim: float = 768,
158
+ ffn_embedding_dim: float = 3072,
159
+ num_attention_heads: float = 8,
160
+ dropout: float = 0.1,
161
+ attention_dropout: float = 0.1,
162
+ activation_dropout: float = 0.1,
163
+ activation_fn: str = "relu",
164
+ layer_norm_first: bool = False,
165
+ deep_norm: bool = False,
166
+ has_relative_attention_bias: bool = False,
167
+ num_buckets: int = 0,
168
+ max_distance: int = 0,
169
+ rescale_init: bool = False,
170
+ gru_rel_pos: bool = False,
171
+ encoder_layers: int = 0,
172
+ ) -> None:
173
+ super().__init__()
174
+ self.embedding_dim = embedding_dim
175
+ self.dropout = dropout
176
+ self.activation_dropout = activation_dropout
177
+
178
+ self.activation_name = activation_fn
179
+ self.activation_fn = get_activation_fn(activation_fn)
180
+ self.self_attn = MultiheadAttention(
181
+ self.embedding_dim,
182
+ num_attention_heads,
183
+ dropout=attention_dropout,
184
+ self_attention=True,
185
+ has_relative_attention_bias=has_relative_attention_bias,
186
+ num_buckets=num_buckets,
187
+ max_distance=max_distance,
188
+ rescale_init=rescale_init,
189
+ gru_rel_pos=gru_rel_pos,
190
+ )
191
+
192
+ self.dropout1 = nn.Dropout(dropout)
193
+ self.dropout2 = nn.Dropout(self.activation_dropout)
194
+ self.dropout3 = nn.Dropout(dropout)
195
+
196
+ self.layer_norm_first = layer_norm_first
197
+
198
+ self.self_attn_layer_norm = LayerNorm(self.embedding_dim)
199
+
200
+ if self.activation_name == "glu":
201
+ self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim, "swish")
202
+ else:
203
+ self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
204
+ self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
205
+
206
+ self.final_layer_norm = LayerNorm(self.embedding_dim)
207
+
208
+ self.deep_norm = deep_norm
209
+ if self.deep_norm:
210
+ self.deep_norm_alpha = math.pow(2 * encoder_layers, 1 / 4)
211
+ else:
212
+ self.deep_norm_alpha = 1
213
+
214
+ def forward(
215
+ self,
216
+ x: torch.Tensor,
217
+ self_attn_mask: torch.Tensor = None,
218
+ self_attn_padding_mask: torch.Tensor = None,
219
+ need_weights: bool = False,
220
+ pos_bias=None,
221
+ ):
222
+ residual = x
223
+
224
+ if self.layer_norm_first:
225
+ x = self.self_attn_layer_norm(x)
226
+ x, attn, pos_bias = self.self_attn(
227
+ query=x,
228
+ key=x,
229
+ value=x,
230
+ key_padding_mask=self_attn_padding_mask,
231
+ need_weights=False,
232
+ attn_mask=self_attn_mask,
233
+ position_bias=pos_bias,
234
+ )
235
+ x = self.dropout1(x)
236
+ x = residual + x
237
+
238
+ residual = x
239
+ x = self.final_layer_norm(x)
240
+ if self.activation_name == "glu":
241
+ x = self.fc1(x)
242
+ else:
243
+ x = self.activation_fn(self.fc1(x))
244
+ x = self.dropout2(x)
245
+ x = self.fc2(x)
246
+ x = self.dropout3(x)
247
+ x = residual + x
248
+ else:
249
+ x, attn, pos_bias = self.self_attn(
250
+ query=x,
251
+ key=x,
252
+ value=x,
253
+ key_padding_mask=self_attn_padding_mask,
254
+ need_weights=need_weights,
255
+ attn_mask=self_attn_mask,
256
+ position_bias=pos_bias,
257
+ )
258
+
259
+ x = self.dropout1(x)
260
+ x = residual * self.deep_norm_alpha + x
261
+
262
+ x = self.self_attn_layer_norm(x)
263
+
264
+ residual = x
265
+ if self.activation_name == "glu":
266
+ x = self.fc1(x)
267
+ else:
268
+ x = self.activation_fn(self.fc1(x))
269
+ x = self.dropout2(x)
270
+ x = self.fc2(x)
271
+ x = self.dropout3(x)
272
+ x = residual * self.deep_norm_alpha + x
273
+ x = self.final_layer_norm(x)
274
+
275
+ return x, attn, pos_bias
276
+
277
+
278
+ class MultiheadAttention(nn.Module):
279
+ """Multi-headed attention.
280
+
281
+ See "Attention Is All You Need" for more details.
282
+ """
283
+
284
+ def __init__(
285
+ self,
286
+ embed_dim,
287
+ num_heads,
288
+ kdim=None,
289
+ vdim=None,
290
+ dropout=0.0,
291
+ bias=True,
292
+ add_bias_kv=False,
293
+ add_zero_attn=False,
294
+ self_attention=False,
295
+ encoder_decoder_attention=False,
296
+ q_noise=0.0,
297
+ qn_block_size=8,
298
+ has_relative_attention_bias=False,
299
+ num_buckets=32,
300
+ max_distance=128,
301
+ gru_rel_pos=False,
302
+ rescale_init=False,
303
+ ):
304
+ super().__init__()
305
+ self.embed_dim = embed_dim
306
+ self.kdim = kdim if kdim is not None else embed_dim
307
+ self.vdim = vdim if vdim is not None else embed_dim
308
+ self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
309
+
310
+ self.num_heads = num_heads
311
+ self.dropout_module = nn.Dropout(dropout)
312
+
313
+ self.has_relative_attention_bias = has_relative_attention_bias
314
+ self.num_buckets = num_buckets
315
+ self.max_distance = max_distance
316
+ if self.has_relative_attention_bias:
317
+ self.relative_attention_bias = nn.Embedding(num_buckets, num_heads)
318
+
319
+ self.head_dim = embed_dim // num_heads
320
+ self.q_head_dim = self.head_dim
321
+ self.k_head_dim = self.head_dim
322
+ assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
323
+ self.scaling = self.head_dim**-0.5
324
+
325
+ self.self_attention = self_attention
326
+ self.encoder_decoder_attention = encoder_decoder_attention
327
+
328
+ assert not self.self_attention or self.qkv_same_dim, (
329
+ "Self-attention requires query, key and " "value to be of the same size"
330
+ )
331
+
332
+ k_bias = True
333
+ if rescale_init:
334
+ k_bias = False
335
+
336
+ k_embed_dim = embed_dim
337
+ q_embed_dim = embed_dim
338
+
339
+ self.k_proj = quant_noise(nn.Linear(self.kdim, k_embed_dim, bias=k_bias), q_noise, qn_block_size)
340
+ self.v_proj = quant_noise(nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size)
341
+ self.q_proj = quant_noise(nn.Linear(embed_dim, q_embed_dim, bias=bias), q_noise, qn_block_size)
342
+
343
+ self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size)
344
+
345
+ if add_bias_kv:
346
+ self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
347
+ self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
348
+ else:
349
+ self.bias_k = self.bias_v = None
350
+
351
+ self.add_zero_attn = add_zero_attn
352
+
353
+ self.gru_rel_pos = gru_rel_pos
354
+ if self.gru_rel_pos:
355
+ self.grep_linear = nn.Linear(self.q_head_dim, 8)
356
+ self.grep_a = nn.Parameter(torch.ones(1, num_heads, 1, 1))
357
+
358
+ self.reset_parameters()
359
+
360
+ def reset_parameters(self):
361
+ if self.qkv_same_dim:
362
+ # Empirically observed the convergence to be much better with
363
+ # the scaled initialization
364
+ nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
365
+ nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
366
+ nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
367
+ else:
368
+ nn.init.xavier_uniform_(self.k_proj.weight)
369
+ nn.init.xavier_uniform_(self.v_proj.weight)
370
+ nn.init.xavier_uniform_(self.q_proj.weight)
371
+
372
+ nn.init.xavier_uniform_(self.out_proj.weight)
373
+ if self.out_proj.bias is not None:
374
+ nn.init.constant_(self.out_proj.bias, 0.0)
375
+ if self.bias_k is not None:
376
+ nn.init.xavier_normal_(self.bias_k)
377
+ if self.bias_v is not None:
378
+ nn.init.xavier_normal_(self.bias_v)
379
+ if self.has_relative_attention_bias:
380
+ nn.init.xavier_normal_(self.relative_attention_bias.weight)
381
+
382
+ def _relative_positions_bucket(self, relative_positions, bidirectional=True):
383
+ num_buckets = self.num_buckets
384
+ max_distance = self.max_distance
385
+ relative_buckets = 0
386
+
387
+ if bidirectional:
388
+ num_buckets = num_buckets // 2
389
+ relative_buckets += (relative_positions > 0).to(torch.long) * num_buckets
390
+ relative_positions = torch.abs(relative_positions)
391
+ else:
392
+ relative_positions = -torch.min(relative_positions, torch.zeros_like(relative_positions))
393
+
394
+ max_exact = num_buckets // 2
395
+ is_small = relative_positions < max_exact
396
+
397
+ relative_postion_if_large = max_exact + (
398
+ torch.log(relative_positions.float() / max_exact)
399
+ / math.log(max_distance / max_exact)
400
+ * (num_buckets - max_exact)
401
+ ).to(torch.long)
402
+ relative_postion_if_large = torch.min(
403
+ relative_postion_if_large, torch.full_like(relative_postion_if_large, num_buckets - 1)
404
+ )
405
+
406
+ relative_buckets += torch.where(is_small, relative_positions, relative_postion_if_large)
407
+ return relative_buckets
408
+
409
+ def compute_bias(self, query_length, key_length):
410
+ context_position = torch.arange(query_length, dtype=torch.long)[:, None]
411
+ memory_position = torch.arange(key_length, dtype=torch.long)[None, :]
412
+ relative_position = memory_position - context_position
413
+ relative_position_bucket = self._relative_positions_bucket(relative_position, bidirectional=True)
414
+ relative_position_bucket = relative_position_bucket.to(self.relative_attention_bias.weight.device)
415
+ values = self.relative_attention_bias(relative_position_bucket)
416
+ values = values.permute([2, 0, 1])
417
+ return values
418
+
419
+ def forward(
420
+ self,
421
+ query,
422
+ key: Optional[Tensor],
423
+ value: Optional[Tensor],
424
+ key_padding_mask: Optional[Tensor] = None,
425
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
426
+ need_weights: bool = True,
427
+ static_kv: bool = False,
428
+ attn_mask: Optional[Tensor] = None,
429
+ before_softmax: bool = False,
430
+ need_head_weights: bool = False,
431
+ position_bias: Optional[Tensor] = None,
432
+ ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
433
+ """Input shape: Time x Batch x Channel
434
+
435
+ Args:
436
+ key_padding_mask (ByteTensor, optional): mask to exclude
437
+ keys that are pads, of shape `(batch, src_len)`, where
438
+ padding elements are indicated by 1s.
439
+ need_weights (bool, optional): return the attention weights,
440
+ averaged over heads (default: False).
441
+ attn_mask (ByteTensor, optional): typically used to
442
+ implement causal attention, where the mask prevents the
443
+ attention from looking forward in time (default: None).
444
+ before_softmax (bool, optional): return the raw attention
445
+ weights and values before the attention softmax.
446
+ need_head_weights (bool, optional): return the attention
447
+ weights for each head. Implies *need_weights*. Default:
448
+ return the average attention weights over all heads.
449
+ """
450
+ if need_head_weights:
451
+ need_weights = True
452
+
453
+ is_tpu = query.device.type == "xla"
454
+
455
+ tgt_len, bsz, embed_dim = query.size()
456
+ src_len = tgt_len
457
+ assert embed_dim == self.embed_dim
458
+ assert list(query.size()) == [tgt_len, bsz, embed_dim]
459
+ if key is not None:
460
+ src_len, key_bsz, _ = key.size()
461
+ if not torch.jit.is_scripting():
462
+ assert key_bsz == bsz
463
+ assert value is not None
464
+ assert value.shape[:2] == (src_len, bsz)
465
+
466
+ if self.has_relative_attention_bias and position_bias is None:
467
+ position_bias = self.compute_bias(tgt_len, src_len)
468
+ position_bias = position_bias.unsqueeze(0).repeat(bsz, 1, 1, 1).view(bsz * self.num_heads, tgt_len, src_len)
469
+
470
+ if incremental_state is not None:
471
+ saved_state = self._get_input_buffer(incremental_state)
472
+ if saved_state is not None and "prev_key" in saved_state:
473
+ # previous time steps are cached - no need to recompute
474
+ # key and value if they are static
475
+ if static_kv:
476
+ assert self.encoder_decoder_attention and not self.self_attention
477
+ key = value = None
478
+ else:
479
+ saved_state = None
480
+
481
+ if self.self_attention:
482
+ q = self.q_proj(query)
483
+ k = self.k_proj(query)
484
+ v = self.v_proj(query)
485
+ elif self.encoder_decoder_attention:
486
+ # encoder-decoder attention
487
+ q = self.q_proj(query)
488
+ if key is None:
489
+ assert value is None
490
+ k = v = None
491
+ else:
492
+ k = self.k_proj(key)
493
+ v = self.v_proj(key)
494
+
495
+ else:
496
+ assert key is not None and value is not None
497
+ q = self.q_proj(query)
498
+ k = self.k_proj(key)
499
+ v = self.v_proj(value)
500
+ q *= self.scaling
501
+ alpha = 32
502
+ q *= 1 / alpha
503
+
504
+ if self.bias_k is not None:
505
+ assert self.bias_v is not None
506
+ k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
507
+ v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
508
+ if attn_mask is not None:
509
+ attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
510
+ if key_padding_mask is not None:
511
+ key_padding_mask = torch.cat(
512
+ [
513
+ key_padding_mask,
514
+ key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
515
+ ],
516
+ dim=1,
517
+ )
518
+
519
+ q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.q_head_dim).transpose(0, 1)
520
+ if k is not None:
521
+ k = k.contiguous().view(-1, bsz * self.num_heads, self.k_head_dim).transpose(0, 1)
522
+ if v is not None:
523
+ v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
524
+
525
+ if saved_state is not None:
526
+ # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
527
+ if "prev_key" in saved_state:
528
+ _prev_key = saved_state["prev_key"]
529
+ assert _prev_key is not None
530
+ prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
531
+ if static_kv:
532
+ k = prev_key
533
+ else:
534
+ assert k is not None
535
+ k = torch.cat([prev_key, k], dim=1)
536
+ src_len = k.size(1)
537
+ if "prev_value" in saved_state:
538
+ _prev_value = saved_state["prev_value"]
539
+ assert _prev_value is not None
540
+ prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
541
+ if static_kv:
542
+ v = prev_value
543
+ else:
544
+ assert v is not None
545
+ v = torch.cat([prev_value, v], dim=1)
546
+ prev_key_padding_mask: Optional[Tensor] = None
547
+ if "prev_key_padding_mask" in saved_state:
548
+ prev_key_padding_mask = saved_state["prev_key_padding_mask"]
549
+ assert k is not None and v is not None
550
+ key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
551
+ key_padding_mask=key_padding_mask,
552
+ prev_key_padding_mask=prev_key_padding_mask,
553
+ batch_size=bsz,
554
+ src_len=k.size(1),
555
+ static_kv=static_kv,
556
+ )
557
+
558
+ saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
559
+ saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
560
+ saved_state["prev_key_padding_mask"] = key_padding_mask
561
+ # In this branch incremental_state is never None
562
+ assert incremental_state is not None
563
+ incremental_state = self._set_input_buffer(incremental_state, saved_state)
564
+ assert k is not None
565
+ assert k.size(1) == src_len
566
+
567
+ # This is part of a workaround to get around fork/join parallelism
568
+ # not supporting Optional types.
569
+ if key_padding_mask is not None and key_padding_mask.dim() == 0:
570
+ key_padding_mask = None
571
+
572
+ if key_padding_mask is not None:
573
+ assert key_padding_mask.size(0) == bsz
574
+ assert key_padding_mask.size(1) == src_len
575
+
576
+ if self.add_zero_attn:
577
+ assert v is not None
578
+ src_len += 1
579
+ k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
580
+ v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
581
+ if attn_mask is not None:
582
+ attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1)
583
+ if key_padding_mask is not None:
584
+ key_padding_mask = torch.cat(
585
+ [
586
+ key_padding_mask,
587
+ torch.zeros(key_padding_mask.size(0), 1).type_as(key_padding_mask),
588
+ ],
589
+ dim=1,
590
+ )
591
+
592
+ attn_weights = torch.bmm(q, k.transpose(1, 2))
593
+ attn_weights = (attn_weights - attn_weights.max(dim=-1, keepdim=True)[0]) * alpha
594
+ attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
595
+
596
+ assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
597
+
598
+ if attn_mask is not None:
599
+ attn_mask = attn_mask.unsqueeze(0)
600
+ attn_weights += attn_mask
601
+
602
+ if key_padding_mask is not None:
603
+ # don't attend to padding symbols
604
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
605
+ if not is_tpu:
606
+ attn_weights = attn_weights.masked_fill(
607
+ key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
608
+ float("-inf"),
609
+ )
610
+ else:
611
+ attn_weights = attn_weights.transpose(0, 2)
612
+ attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf"))
613
+ attn_weights = attn_weights.transpose(0, 2)
614
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
615
+
616
+ if before_softmax:
617
+ return attn_weights, v, position_bias
618
+
619
+ if position_bias is not None:
620
+ attn_mask_rel_pos = position_bias
621
+ if self.gru_rel_pos == 1:
622
+ query_layer = q.view(bsz, self.num_heads, tgt_len, self.q_head_dim) * alpha / self.scaling
623
+ _B, _H, _L, __ = query_layer.size()
624
+ gate_a, gate_b = torch.sigmoid(
625
+ self.grep_linear(query_layer).view(_B, _H, _L, 2, 4).sum(-1, keepdim=False)
626
+ ).chunk(2, dim=-1)
627
+ gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0
628
+ attn_mask_rel_pos = gate_a_1.view(bsz * self.num_heads, tgt_len, 1) * position_bias
629
+
630
+ attn_mask_rel_pos = attn_mask_rel_pos.view(attn_weights.size())
631
+
632
+ attn_weights = attn_weights + attn_mask_rel_pos
633
+
634
+ attn_weights_float = F.softmax(attn_weights, dim=-1)
635
+ attn_weights = attn_weights_float.type_as(attn_weights)
636
+ attn_probs = self.dropout_module(attn_weights)
637
+
638
+ assert v is not None
639
+ attn = torch.bmm(attn_probs, v)
640
+ assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
641
+ attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
642
+ attn = self.out_proj(attn)
643
+ attn_weights: Optional[Tensor] = None
644
+ if need_weights:
645
+ attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0)
646
+ if not need_head_weights:
647
+ # average attention weights over heads
648
+ attn_weights = attn_weights.mean(dim=0)
649
+
650
+ return attn, attn_weights, position_bias
651
+
652
+ @staticmethod
653
+ def _append_prev_key_padding_mask(
654
+ key_padding_mask: Optional[Tensor],
655
+ prev_key_padding_mask: Optional[Tensor],
656
+ batch_size: int,
657
+ src_len: int,
658
+ static_kv: bool,
659
+ ) -> Optional[Tensor]:
660
+ # saved key padding masks have shape (bsz, seq_len)
661
+ if prev_key_padding_mask is not None and static_kv:
662
+ new_key_padding_mask = prev_key_padding_mask
663
+ elif prev_key_padding_mask is not None and key_padding_mask is not None:
664
+ new_key_padding_mask = torch.cat([prev_key_padding_mask.float(), key_padding_mask.float()], dim=1)
665
+ # During incremental decoding, as the padding token enters and
666
+ # leaves the frame, there will be a time when prev or current
667
+ # is None
668
+ elif prev_key_padding_mask is not None:
669
+ if src_len > prev_key_padding_mask.size(1):
670
+ filler = torch.zeros(
671
+ (batch_size, src_len - prev_key_padding_mask.size(1)),
672
+ device=prev_key_padding_mask.device,
673
+ )
674
+ new_key_padding_mask = torch.cat([prev_key_padding_mask.float(), filler.float()], dim=1)
675
+ else:
676
+ new_key_padding_mask = prev_key_padding_mask.float()
677
+ elif key_padding_mask is not None:
678
+ if src_len > key_padding_mask.size(1):
679
+ filler = torch.zeros(
680
+ (batch_size, src_len - key_padding_mask.size(1)),
681
+ device=key_padding_mask.device,
682
+ )
683
+ new_key_padding_mask = torch.cat([filler.float(), key_padding_mask.float()], dim=1)
684
+ else:
685
+ new_key_padding_mask = key_padding_mask.float()
686
+ else:
687
+ new_key_padding_mask = prev_key_padding_mask
688
+ return new_key_padding_mask
689
+
690
+ def _get_input_buffer(
691
+ self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
692
+ ) -> Dict[str, Optional[Tensor]]:
693
+ result = self.get_incremental_state(incremental_state, "attn_state")
694
+ if result is not None:
695
+ return result
696
+ else:
697
+ empty_result: Dict[str, Optional[Tensor]] = {}
698
+ return empty_result
699
+
700
+ def _set_input_buffer(
701
+ self,
702
+ incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
703
+ buffer: Dict[str, Optional[Tensor]],
704
+ ):
705
+ return self.set_incremental_state(incremental_state, "attn_state", buffer)
706
+
707
+ def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int):
708
+ return attn_weights
709
+
710
+
711
+ def init_bert_params(module):
712
+ """
713
+ Initialize the weights specific to the BERT Model.
714
+ This overrides the default initializations depending on the specified arguments.
715
+ 1. If normal_init_linear_weights is set then weights of linear
716
+ layer will be initialized using the normal distribution and
717
+ bais will be set to the specified value.
718
+ 2. If normal_init_embed_weights is set then weights of embedding
719
+ layer will be initialized using the normal distribution.
720
+ 3. If normal_init_proj_weights is set then weights of
721
+ in_project_weight for MultiHeadAttention will be initialized using
722
+ the normal distribution (to be validated).
723
+ """
724
+
725
+ def normal_(data):
726
+ # with FSDP, module params will be on CUDA, so we cast them back to CPU
727
+ # so that the RNG is consistent with and without FSDP
728
+ data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device))
729
+
730
+ if isinstance(module, nn.Linear):
731
+ normal_(module.weight.data)
732
+ if module.bias is not None:
733
+ module.bias.data.zero_()
734
+ if isinstance(module, nn.Embedding):
735
+ normal_(module.weight.data)
736
+ if module.padding_idx is not None:
737
+ module.weight.data[module.padding_idx].zero_()
738
+ if isinstance(module, MultiheadAttention):
739
+ normal_(module.q_proj.weight.data)
740
+ normal_(module.k_proj.weight.data)
741
+ normal_(module.v_proj.weight.data)
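The relative attention bias above (_relative_positions_bucket and compute_bias in MultiheadAttention) follows the T5-style bucketing: nearby offsets get exact buckets, larger offsets share logarithmically spaced ones. The snippet below is a standalone sketch of that bucketing for the bidirectional case, not code from this diff, using the same defaults (num_buckets=32, max_distance=128) for illustration.

import math
import torch

def relative_position_bucket(relative_positions, num_buckets=32, max_distance=128):
    # bidirectional: half the buckets for positive offsets, half for negative
    num_buckets //= 2
    buckets = (relative_positions > 0).long() * num_buckets
    rel = relative_positions.abs()
    max_exact = num_buckets // 2
    is_small = rel < max_exact
    # larger offsets are mapped logarithmically into the remaining buckets
    large = max_exact + (
        torch.log(rel.float() / max_exact) / math.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    ).long()
    large = torch.minimum(large, torch.full_like(large, num_buckets - 1))
    return buckets + torch.where(is_small, rel, large)

positions = torch.arange(6)
rel = positions[None, :] - positions[:, None]   # (query, key) offsets
print(relative_position_bucket(rel))            # 6 x 6 bucket ids, fed to an nn.Embedding of size (num_buckets, num_heads)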
NatureLM/models/beats/modules.py ADDED
@@ -0,0 +1,201 @@
1
+ # --------------------------------------------------------
2
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
+ # Copyright (c) 2022 Microsoft
5
+ # Licensed under The MIT License [see LICENSE for details]
6
+ # Based on fairseq code bases
7
+ # https://github.com/pytorch/fairseq
8
+ # --------------------------------------------------------
9
+
10
+ import math
11
+ import warnings
12
+
13
+ import torch
14
+ import torch.nn.functional as F
15
+ from torch import nn
16
+
17
+
18
+ class GradMultiply(torch.autograd.Function):
19
+ @staticmethod
20
+ def forward(ctx, x, scale):
21
+ ctx.scale = scale
22
+ res = x.new(x)
23
+ return res
24
+
25
+ @staticmethod
26
+ def backward(ctx, grad):
27
+ return grad * ctx.scale, None
28
+
29
+
30
+ class SamePad(nn.Module):
31
+ def __init__(self, kernel_size, causal=False):
32
+ super().__init__()
33
+ if causal:
34
+ self.remove = kernel_size - 1
35
+ else:
36
+ self.remove = 1 if kernel_size % 2 == 0 else 0
37
+
38
+ def forward(self, x):
39
+ if self.remove > 0:
40
+ x = x[:, :, : -self.remove]
41
+ return x
42
+
43
+
44
+ class Swish(nn.Module):
45
+ def __init__(self):
46
+ super(Swish, self).__init__()
47
+ self.act = torch.nn.Sigmoid()
48
+
49
+ def forward(self, x):
50
+ return x * self.act(x)
51
+
52
+
53
+ class GLU_Linear(nn.Module):
54
+ def __init__(self, input_dim, output_dim, glu_type="sigmoid", bias_in_glu=True):
55
+ super(GLU_Linear, self).__init__()
56
+
57
+ self.glu_type = glu_type
58
+ self.output_dim = output_dim
59
+
60
+ if glu_type == "sigmoid":
61
+ self.glu_act = torch.nn.Sigmoid()
62
+ elif glu_type == "swish":
63
+ self.glu_act = Swish()
64
+ elif glu_type == "relu":
65
+ self.glu_act = torch.nn.ReLU()
66
+ elif glu_type == "gelu":
67
+ self.glu_act = torch.nn.GELU()
68
+
69
+ if bias_in_glu:
70
+ self.linear = nn.Linear(input_dim, output_dim * 2, True)
71
+ else:
72
+ self.linear = nn.Linear(input_dim, output_dim * 2, False)
73
+
74
+ def forward(self, x):
75
+ # to be consistent with GLU_Linear, we assume the input always has the #channel (#dim) in the last dimension of the tensor, so need to switch the dimension first for 1D-Conv case
76
+ x = self.linear(x)
77
+
78
+ if self.glu_type == "bilinear":
79
+ x = x[:, :, 0 : self.output_dim] * x[:, :, self.output_dim : self.output_dim * 2]
80
+ else:
81
+ x = x[:, :, 0 : self.output_dim] * self.glu_act(x[:, :, self.output_dim : self.output_dim * 2])
82
+
83
+ return x
84
+
85
+
86
+ def gelu_accurate(x):
87
+ if not hasattr(gelu_accurate, "_a"):
88
+ gelu_accurate._a = math.sqrt(2 / math.pi)
89
+ return 0.5 * x * (1 + torch.tanh(gelu_accurate._a * (x + 0.044715 * torch.pow(x, 3))))
90
+
91
+
92
+ def gelu(x: torch.Tensor) -> torch.Tensor:
93
+ return torch.nn.functional.gelu(x.float()).type_as(x)
94
+
95
+
96
+ def get_activation_fn(activation: str):
97
+ """Returns the activation function corresponding to `activation`"""
98
+
99
+ if activation == "relu":
100
+ return F.relu
101
+ elif activation == "gelu":
102
+ return gelu
103
+ elif activation == "gelu_fast":
104
+ warnings.warn("--activation-fn=gelu_fast has been renamed to gelu_accurate")
105
+ return gelu_accurate
106
+ elif activation == "gelu_accurate":
107
+ return gelu_accurate
108
+ elif activation == "tanh":
109
+ return torch.tanh
110
+ elif activation == "linear":
111
+ return lambda x: x
112
+ elif activation == "glu":
113
+ return lambda x: x
114
+ else:
115
+ raise RuntimeError("--activation-fn {} not supported".format(activation))
116
+
117
+
118
+ def quant_noise(module, p, block_size):
119
+ """
120
+ Wraps modules and applies quantization noise to the weights for
121
+ subsequent quantization with Iterative Product Quantization as
122
+ described in "Training with Quantization Noise for Extreme Model Compression"
123
+
124
+ Args:
125
+ - module: nn.Module
126
+ - p: amount of Quantization Noise
127
+ - block_size: size of the blocks for subsequent quantization with iPQ
128
+
129
+ Remarks:
130
+ - Module weights must have the right sizes wrt the block size
131
+ - Only Linear, Embedding and Conv2d modules are supported for the moment
132
+ - For more detail on how to quantize by blocks with convolutional weights,
133
+ see "And the Bit Goes Down: Revisiting the Quantization of Neural Networks"
134
+ - We implement the simplest form of noise here as stated in the paper
135
+ which consists in randomly dropping blocks
136
+ """
137
+
138
+ # if no quantization noise, don't register hook
139
+ if p <= 0:
140
+ return module
141
+
142
+ # supported modules
143
+ assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d))
144
+
145
+ # test whether module.weight has the right sizes wrt block_size
146
+ is_conv = module.weight.ndim == 4
147
+
148
+ # 2D matrix
149
+ if not is_conv:
150
+ assert module.weight.size(1) % block_size == 0, "Input features must be a multiple of block sizes"
151
+
152
+ # 4D matrix
153
+ else:
154
+ # 1x1 convolutions
155
+ if module.kernel_size == (1, 1):
156
+ assert module.in_channels % block_size == 0, "Input channels must be a multiple of block sizes"
157
+ # regular convolutions
158
+ else:
159
+ k = module.kernel_size[0] * module.kernel_size[1]
160
+ assert k % block_size == 0, "Kernel size must be a multiple of block size"
161
+
162
+ def _forward_pre_hook(mod, input):
163
+ # no noise for evaluation
164
+ if mod.training:
165
+ if not is_conv:
166
+ # gather weight and sizes
167
+ weight = mod.weight
168
+ in_features = weight.size(1)
169
+ out_features = weight.size(0)
170
+
171
+ # split weight matrix into blocks and randomly drop selected blocks
172
+ mask = torch.zeros(in_features // block_size * out_features, device=weight.device)
173
+ mask.bernoulli_(p)
174
+ mask = mask.repeat_interleave(block_size, -1).view(-1, in_features)
175
+
176
+ else:
177
+ # gather weight and sizes
178
+ weight = mod.weight
179
+ in_channels = mod.in_channels
180
+ out_channels = mod.out_channels
181
+
182
+ # split weight matrix into blocks and randomly drop selected blocks
183
+ if mod.kernel_size == (1, 1):
184
+ mask = torch.zeros(
185
+ int(in_channels // block_size * out_channels),
186
+ device=weight.device,
187
+ )
188
+ mask.bernoulli_(p)
189
+ mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels)
190
+ else:
191
+ mask = torch.zeros(weight.size(0), weight.size(1), device=weight.device)
192
+ mask.bernoulli_(p)
193
+ mask = mask.unsqueeze(2).unsqueeze(3).repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1])
194
+
195
+ # scale weights and apply mask
196
+ mask = mask.to(torch.bool) # x.bool() is not currently supported in TorchScript
197
+ s = 1 / (1 - p)
198
+ mod.weight.data = s * weight.masked_fill(mask, 0)
199
+
200
+ module.register_forward_pre_hook(_forward_pre_hook)
201
+ return module
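quant_noise above works by registering a forward pre-hook that, during training, zeroes randomly chosen weight blocks and rescales the remainder by 1/(1-p). A minimal usage sketch, assuming the module path introduced in this diff (the layer sizes are illustrative):

import torch
from torch import nn
from NatureLM.models.beats.modules import quant_noise

# in_features must be a multiple of block_size for a Linear layer
layer = quant_noise(nn.Linear(64, 32), p=0.25, block_size=8)
layer.train()
out = layer(torch.randn(4, 64))        # pre-hook drops weight blocks and rescales
layer.eval()
out_eval = layer(torch.randn(4, 64))   # hook is a no-op outside training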
NatureLM/models/beats/quantizer.py ADDED
@@ -0,0 +1,222 @@
1
+ # --------------------------------------------------------
2
+ # BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058)
3
+ # Github source: https://github.com/microsoft/unilm/tree/master/beats
4
+ # Copyright (c) 2022 Microsoft
5
+ # Licensed under The MIT License [see LICENSE for details]
6
+ # Based on VQGAN code bases
7
+ # https://github.com/CompVis/taming-transformers
8
+ # --------------------------------------------------------'
9
+
10
+ import torch
11
+ import torch.distributed as distributed
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+
15
+ try:
16
+ from einops import rearrange, repeat
17
+ except ImportError:
18
+ pass
19
+
20
+
21
+ def l2norm(t):
22
+ return F.normalize(t, p=2, dim=-1)
23
+
24
+
25
+ def ema_inplace(moving_avg, new, decay):
26
+ moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
27
+
28
+
29
+ def sample_vectors(samples, num):
30
+ num_samples, device = samples.shape[0], samples.device
31
+
32
+ if num_samples >= num:
33
+ indices = torch.randperm(num_samples, device=device)[:num]
34
+ else:
35
+ indices = torch.randint(0, num_samples, (num,), device=device)
36
+
37
+ return samples[indices]
38
+
39
+
40
+ def kmeans(samples, num_clusters, num_iters=10, use_cosine_sim=False):
41
+ dim, dtype, _ = samples.shape[-1], samples.dtype, samples.device
42
+
43
+ means = sample_vectors(samples, num_clusters)
44
+
45
+ for _ in range(num_iters):
46
+ if use_cosine_sim:
47
+ dists = samples @ means.t()
48
+ else:
49
+ diffs = rearrange(samples, "n d -> n () d") - rearrange(means, "c d -> () c d")
50
+ dists = -(diffs**2).sum(dim=-1)
51
+
52
+ buckets = dists.max(dim=-1).indices
53
+ bins = torch.bincount(buckets, minlength=num_clusters)
54
+ zero_mask = bins == 0
55
+ bins_min_clamped = bins.masked_fill(zero_mask, 1)
56
+
57
+ new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
58
+ new_means.scatter_add_(0, repeat(buckets, "n -> n d", d=dim), samples)
59
+ new_means = new_means / bins_min_clamped[..., None]
60
+
61
+ if use_cosine_sim:
62
+ new_means = l2norm(new_means)
63
+
64
+ means = torch.where(zero_mask[..., None], means, new_means)
65
+
66
+ return means, bins
67
+
68
+
69
+ class EmbeddingEMA(nn.Module):
70
+ def __init__(self, num_tokens, codebook_dim, decay=0.99, eps=1e-5, kmeans_init=True, codebook_init_path=""):
71
+ super().__init__()
72
+ self.num_tokens = num_tokens
73
+ self.codebook_dim = codebook_dim
74
+ self.decay = decay
75
+ self.eps = eps
76
+ if codebook_init_path == "":
77
+ if not kmeans_init:
78
+ weight = torch.randn(num_tokens, codebook_dim)
79
+ weight = l2norm(weight)
80
+ else:
81
+ weight = torch.zeros(num_tokens, codebook_dim)
82
+ self.register_buffer("initted", torch.Tensor([not kmeans_init]))
83
+ else:
84
+ print(f"load init codebook weight from {codebook_init_path}")
85
+ codebook_ckpt_weight = torch.load(codebook_init_path, map_location="cpu")
86
+ weight = codebook_ckpt_weight.clone()
87
+ self.register_buffer("initted", torch.Tensor([True]))
88
+
89
+ self.weight = nn.Parameter(weight, requires_grad=False)
90
+ self.cluster_size = nn.Parameter(torch.zeros(num_tokens), requires_grad=False)
91
+ self.embed_avg = nn.Parameter(weight.clone(), requires_grad=False)
92
+ # self.register_buffer('initted', torch.Tensor([not kmeans_init]))
93
+ self.update = True
94
+
95
+ @torch.jit.ignore
96
+ def init_embed_(self, data):
97
+ if self.initted:
98
+ return
99
+ print("Performing K-means init for codebook")
100
+ embed, cluster_size = kmeans(data, self.num_tokens, 10, use_cosine_sim=True)
101
+ self.weight.data.copy_(embed)
102
+ self.cluster_size.data.copy_(cluster_size)
103
+ self.initted.data.copy_(torch.Tensor([True]))
104
+
105
+ def forward(self, embed_id):
106
+ return F.embedding(embed_id, self.weight)
107
+
108
+ def cluster_size_ema_update(self, new_cluster_size):
109
+ self.cluster_size.data.mul_(self.decay).add_(new_cluster_size, alpha=1 - self.decay)
110
+
111
+ def embed_avg_ema_update(self, new_embed_avg):
112
+ self.embed_avg.data.mul_(self.decay).add_(new_embed_avg, alpha=1 - self.decay)
113
+
114
+ def weight_update(self, num_tokens):
115
+ n = self.cluster_size.sum()
116
+ smoothed_cluster_size = (self.cluster_size + self.eps) / (n + num_tokens * self.eps) * n
117
+ # normalize embedding average with smoothed cluster size
118
+ embed_normalized = self.embed_avg / smoothed_cluster_size.unsqueeze(1)
119
+ # embed_normalized = l2norm(self.embed_avg / smoothed_cluster_size.unsqueeze(1))
120
+ self.weight.data.copy_(embed_normalized)
121
+
122
+
123
+ def norm_ema_inplace(moving_avg, new, decay):
124
+ moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
125
+ moving_avg.data.copy_(l2norm(moving_avg.data))
126
+
127
+
128
+ class NormEMAVectorQuantizer(nn.Module):
129
+ def __init__(
130
+ self,
131
+ n_embed,
132
+ embedding_dim,
133
+ beta,
134
+ decay=0.99,
135
+ eps=1e-5,
136
+ statistic_code_usage=True,
137
+ kmeans_init=False,
138
+ codebook_init_path="",
139
+ ):
140
+ super().__init__()
141
+ self.codebook_dim = embedding_dim
142
+ self.num_tokens = n_embed
143
+ self.beta = beta
144
+ self.decay = decay
145
+
146
+ # learnable = True if orthogonal_reg_weight > 0 else False
147
+ self.embedding = EmbeddingEMA(self.num_tokens, self.codebook_dim, decay, eps, kmeans_init, codebook_init_path)
148
+
149
+ self.statistic_code_usage = statistic_code_usage
150
+ if statistic_code_usage:
151
+ self.register_buffer("cluster_size", torch.zeros(n_embed))
152
+ if distributed.is_available() and distributed.is_initialized():
153
+ print("ddp is enabled, so use all_reduce to sync the statistic_code_usage across gpus!")
154
+ self.all_reduce_fn = distributed.all_reduce
155
+ else:
156
+ self.all_reduce_fn = nn.Identity()
157
+
158
+ def reset_cluster_size(self, device):
159
+ if self.statistic_code_usage:
160
+ self.register_buffer("cluster_size", torch.zeros(self.num_tokens))
161
+ self.cluster_size = self.cluster_size.to(device)
162
+
163
+ def forward(self, z):
164
+ # reshape z -> (batch, height, width, channel) and flatten
165
+ # z, 'b c h w -> b h w c'
166
+ # z = rearrange(z, 'b c h w -> b h w c')
167
+ # z = z.transpose(1, 2)
168
+ z = l2norm(z)
169
+ z_flattened = z.reshape(-1, self.codebook_dim)
170
+
171
+ self.embedding.init_embed_(z_flattened)
172
+
173
+ d = (
174
+ z_flattened.pow(2).sum(dim=1, keepdim=True)
175
+ + self.embedding.weight.pow(2).sum(dim=1)
176
+ - 2 * torch.einsum("bd,nd->bn", z_flattened, self.embedding.weight)
177
+ ) # 'n d -> d n'
178
+
179
+ encoding_indices = torch.argmin(d, dim=1)
180
+
181
+ z_q = self.embedding(encoding_indices).view(z.shape)
182
+
183
+ encodings = F.one_hot(encoding_indices, self.num_tokens).type(z.dtype)
184
+
185
+ if not self.training:
186
+ with torch.no_grad():
187
+ cluster_size = encodings.sum(0)
188
+ self.all_reduce_fn(cluster_size)
189
+ ema_inplace(self.cluster_size, cluster_size, self.decay)
190
+
191
+ if self.training and self.embedding.update:
192
+ # EMA cluster size
193
+
194
+ bins = encodings.sum(0)
195
+ self.all_reduce_fn(bins)
196
+
197
+ # self.embedding.cluster_size_ema_update(bins)
198
+ ema_inplace(self.cluster_size, bins, self.decay)
199
+
200
+ zero_mask = bins == 0
201
+ bins = bins.masked_fill(zero_mask, 1.0)
202
+
203
+ embed_sum = z_flattened.t() @ encodings
204
+ self.all_reduce_fn(embed_sum)
205
+
206
+ embed_normalized = (embed_sum / bins.unsqueeze(0)).t()
207
+ embed_normalized = l2norm(embed_normalized)
208
+
209
+ embed_normalized = torch.where(zero_mask[..., None], self.embedding.weight, embed_normalized)
210
+ norm_ema_inplace(self.embedding.weight, embed_normalized, self.decay)
211
+
212
+ # compute loss for embedding
213
+ loss = self.beta * F.mse_loss(z_q.detach(), z)
214
+
215
+ # preserve gradients
216
+ z_q = z + (z_q - z).detach()
217
+
218
+ # reshape back to match original input shape
219
+ # z_q, 'b h w c -> b c h w'
220
+ # z_q = rearrange(z_q, 'b h w c -> b c h w')
221
+ # z_q = z_q.transpose(1, 2)
222
+ return z_q, loss, encoding_indices
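A minimal usage sketch for NormEMAVectorQuantizer above, assuming the module path introduced in this diff; the codebook size and feature shape are illustrative. The forward returns the l2-normalised quantised features (with a straight-through gradient), the commitment loss, and the code indices.

import torch
from NatureLM.models.beats.quantizer import NormEMAVectorQuantizer

vq = NormEMAVectorQuantizer(n_embed=1024, embedding_dim=256, beta=0.25)
features = torch.randn(8, 100, 256)      # (batch, time, dim) encoder outputs
z_q, commit_loss, codes = vq(features)
print(z_q.shape, codes.shape)            # torch.Size([8, 100, 256]) torch.Size([800])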
NatureLM/models/utils.py ADDED
@@ -0,0 +1,29 @@
1
+ # Copyright (2024) Earth Species Project
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ from transformers import StoppingCriteria
17
+
18
+
19
+ class StoppingCriteriaSub(StoppingCriteria):
20
+ def __init__(self, stops=[], encounters=1):
21
+ super().__init__()
22
+ self.stops = stops
23
+
24
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
25
+ for stop in self.stops:
26
+ if torch.all((stop == input_ids[0][-len(stop) :])).item():
27
+ return True
28
+
29
+ return False
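StoppingCriteriaSub halts generation once the last tokens match one of the given stop sequences. A hedged usage sketch with transformers' StoppingCriteriaList; the stop token id below is a placeholder and would normally come from the tokenizer, and the generate call is shown commented out since the model object is not part of this file.

import torch
from transformers import StoppingCriteriaList
from NatureLM.models.utils import StoppingCriteriaSub

stop_ids = [torch.tensor([128009])]   # illustrative end-of-turn id, e.g. tokenizer("<|eot_id|>").input_ids
stopping = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_ids)])
# outputs = model.generate(**inputs, stopping_criteria=stopping)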
NatureLM/optims.py ADDED
@@ -0,0 +1,154 @@
1
+ # This script is from https://github.com/salesforce/LAVIS/blob/main/lavis/common/optims.py
2
+
3
+ import logging
4
+ import math
5
+
6
+ import torch
7
+
8
+ from NatureLM.config import OptimizerConfig
9
+
10
+
11
+ class LinearWarmupStepLRScheduler:
12
+ def __init__(
13
+ self,
14
+ optimizer,
15
+ max_epoch,
16
+ min_lr,
17
+ init_lr,
18
+ decay_rate=1,
19
+ warmup_start_lr=-1,
20
+ warmup_steps=0,
21
+ **kwargs,
22
+ ):
23
+ self.optimizer = optimizer
24
+
25
+ self.max_epoch = max_epoch
26
+ self.min_lr = min_lr
27
+
28
+ self.decay_rate = decay_rate
29
+
30
+ self.init_lr = init_lr
31
+ self.warmup_steps = warmup_steps
32
+ self.warmup_start_lr = warmup_start_lr if warmup_start_lr >= 0 else init_lr
33
+
34
+ def step(self, cur_epoch, cur_step):
35
+ if cur_epoch == 0:
36
+ warmup_lr_schedule(
37
+ step=cur_step,
38
+ optimizer=self.optimizer,
39
+ max_step=self.warmup_steps,
40
+ init_lr=self.warmup_start_lr,
41
+ max_lr=self.init_lr,
42
+ )
43
+ else:
44
+ step_lr_schedule(
45
+ epoch=cur_epoch,
46
+ optimizer=self.optimizer,
47
+ init_lr=self.init_lr,
48
+ min_lr=self.min_lr,
49
+ decay_rate=self.decay_rate,
50
+ )
51
+
52
+
53
+ class LinearWarmupCosineLRScheduler:
54
+ def __init__(
55
+ self,
56
+ optimizer,
57
+ max_epoch,
58
+ iters_per_epoch,
59
+ min_lr,
60
+ init_lr,
61
+ warmup_steps=0,
62
+ warmup_start_lr=-1,
63
+ **kwargs,
64
+ ):
65
+ self.optimizer = optimizer
66
+
67
+ self.max_epoch = max_epoch
68
+ self.iters_per_epoch = iters_per_epoch
69
+ self.min_lr = min_lr
70
+
71
+ self.init_lr = init_lr
72
+ self.warmup_steps = warmup_steps
73
+ self.warmup_start_lr = warmup_start_lr if warmup_start_lr >= 0 else init_lr
74
+
75
+ def step(self, cur_epoch, cur_step):
76
+ total_cur_step = cur_epoch * self.iters_per_epoch + cur_step
77
+ if total_cur_step < self.warmup_steps:
78
+ warmup_lr_schedule(
79
+ step=cur_step,
80
+ optimizer=self.optimizer,
81
+ max_step=self.warmup_steps,
82
+ init_lr=self.warmup_start_lr,
83
+ max_lr=self.init_lr,
84
+ )
85
+ else:
86
+ cosine_lr_schedule(
87
+ epoch=total_cur_step,
88
+ optimizer=self.optimizer,
89
+ max_epoch=self.max_epoch * self.iters_per_epoch,
90
+ init_lr=self.init_lr,
91
+ min_lr=self.min_lr,
92
+ )
93
+
94
+
95
+ def cosine_lr_schedule(optimizer, epoch, max_epoch, init_lr, min_lr):
96
+ """Decay the learning rate"""
97
+ lr = (init_lr - min_lr) * 0.5 * (1.0 + math.cos(math.pi * epoch / max_epoch)) + min_lr
98
+ for param_group in optimizer.param_groups:
99
+ param_group["lr"] = lr
100
+
101
+
102
+ def warmup_lr_schedule(optimizer, step, max_step, init_lr, max_lr):
103
+ """Warmup the learning rate"""
104
+ lr = min(max_lr, init_lr + (max_lr - init_lr) * step / max(max_step, 1))
105
+ for param_group in optimizer.param_groups:
106
+ param_group["lr"] = lr
107
+
108
+
109
+ def step_lr_schedule(optimizer, epoch, init_lr, min_lr, decay_rate):
110
+ """Decay the learning rate"""
111
+ lr = max(min_lr, init_lr * (decay_rate**epoch))
112
+ for param_group in optimizer.param_groups:
113
+ param_group["lr"] = lr
114
+
115
+
116
+ def get_optimizer(model, config: OptimizerConfig):
117
+ num_parameters = 0
118
+ p_wd, p_non_wd = [], []
119
+ for n, p in model.named_parameters():
120
+ if not p.requires_grad:
121
+ continue # frozen weights
122
+ print(n)
123
+ if p.ndim < 2 or "bias" in n or "ln" in n or "bn" in n:
124
+ p_non_wd.append(p)
125
+ else:
126
+ p_wd.append(p)
127
+ num_parameters += p.data.nelement()
128
+ logging.info("number of trainable parameters: %d" % num_parameters)
129
+ optim_params = [
130
+ {
131
+ "params": p_wd,
132
+ "weight_decay": float(config.weight_decay),
133
+ },
134
+ {"params": p_non_wd, "weight_decay": 0},
135
+ ]
136
+ beta2 = config.beta2
137
+ if config.device == "cpu":
138
+ optimizer = torch.optim.AdamW(
139
+ optim_params,
140
+ lr=float(config.init_lr),
141
+ weight_decay=float(config.weight_decay),
142
+ betas=(0.9, beta2),
143
+ )
144
+ else:
145
+ import bitsandbytes as bnb
146
+
147
+ optimizer = bnb.optim.PagedAdamW8bit(
148
+ optim_params,
149
+ lr=float(config.init_lr),
150
+ weight_decay=float(config.weight_decay),
151
+ betas=(0.9, beta2),
152
+ )
153
+
154
+ return optimizer
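The scheduler above warms the learning rate up linearly for warmup_steps and then decays it with a cosine from init_lr to min_lr over max_epoch * iters_per_epoch steps. A quick sanity check with a dummy parameter group; the hyperparameter values are illustrative, not the project's defaults.

import torch
from NatureLM.optims import LinearWarmupCosineLRScheduler

opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=1e-4)
sched = LinearWarmupCosineLRScheduler(
    opt, max_epoch=10, iters_per_epoch=100, min_lr=1e-5, init_lr=1e-4,
    warmup_steps=50, warmup_start_lr=1e-6,
)
for step in (0, 25, 50, 500, 999):
    sched.step(cur_epoch=step // 100, cur_step=step % 100)
    print(step, opt.param_groups[0]["lr"])   # ramps up to 1e-4, then decays toward 1e-5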
NatureLM/processors.py ADDED
@@ -0,0 +1,278 @@
1
+ """Module contains the audio and text processor for NatureLM-audio inference and evaluation"""
2
+
3
+ import json
4
+ import os
5
+ from dataclasses import dataclass, field
6
+
7
+ import numpy as np
8
+ import resampy
9
+ import soundfile as sf
10
+ import torch
11
+
12
+
13
+ @dataclass
14
+ class NatureLMAudioProcessor:
15
+ """Preprocess samples to make them ready for NatureLM-audio inference.
16
+
17
+ Arguments
18
+ ---------
19
+ sample_rate : int
20
+ The sample rate of the NatureLM model
21
+ max_length_seconds : int
22
+ The maximum length of audio in seconds
23
+ audio_token_placeholder : str
24
+ The placeholder for the audio token in the instruction
25
+ prompt_template : str
26
+ The template for the prompt. The instruction or query from the user is inserted in the placeholder at {prompt}
27
+
28
+
29
+ Examples
30
+ --------
31
+ >>> processor = NatureLMAudioProcessor()
32
+ >>> audios = [np.random.rand(32000), np.random.rand(32000)]
33
+ >>> instructions = ["What is the weather today?", "What is the time now?"]
34
+ >>> input_sample_rates = [32000, 32000]
35
+ >>> audios, instructions = processor(audios, instructions, input_sample_rates)
36
+ >>> audios.shape == (2, 160000)
37
+ True
38
+ >>> "<Audio><AudioHere></Audio> " in instructions[0]
39
+ True
40
+ >>> "<|start_header_id|>user<|end_header_id|>" in instructions[0]
41
+ True
42
+ """
43
+
44
+ sample_rate: int = 16000
45
+ max_length_seconds: int = 10
46
+ audio_token_placeholder: str = "<Audio><AudioHere></Audio> "
47
+ prompt_template: str = "<|start_header_id|>user<|end_header_id|>\n\n{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
48
+
49
+ def prepare_audio(self, audio: list[float] | np.ndarray | os.PathLike, input_sr: int = None) -> torch.Tensor:
50
+ """Prepare an audio array or file path for inference"""
51
+ if isinstance(audio, str | os.PathLike):
52
+ audio, sr = sf.read(audio)
53
+ input_sr = sr
54
+ elif isinstance(audio, list):
55
+ audio = np.array(audio)
56
+
57
+ assert isinstance(audio, np.ndarray), "Audio not a numpy array"
58
+
59
+ # Convert stereo to mono
60
+ if len(audio.shape) == 2:
61
+ # treat the smaller axis as the channel dim and average over it (shapes like (2, T) or (T, 2))
62
+ axis_to_average = int(np.argmin(audio.shape))
63
+ audio = audio.mean(axis=axis_to_average)
64
+
65
+ # Resample
66
+ if input_sr is not None and input_sr != self.sample_rate:
67
+ # audio = torchaudio.functional.resample(
68
+ # torch.from_numpy(audio), orig_freq=input_sr, new_freq=self.sample_rate
69
+ # )
70
+ audio = resampy.resample(audio, input_sr, self.sample_rate)
71
+ audio = torch.from_numpy(audio.squeeze())
72
+ else:
73
+ audio = torch.from_numpy(audio)
74
+
75
+ # Truncate audio to at most max_length_seconds
76
+ audio = audio[: self.sample_rate * self.max_length_seconds]
77
+
78
+ # Pad to max_length_seconds if short
79
+ if len(audio) < self.sample_rate * self.max_length_seconds:
80
+ pad_size = self.sample_rate * self.max_length_seconds - len(audio)
81
+ audio = torch.nn.functional.pad(audio, (0, pad_size))
82
+
83
+ # Clamp
84
+ audio = torch.clamp(audio, -1.0, 1.0)
85
+
86
+ return audio.squeeze()
87
+
88
+ def prepare_instruction(self, instruction: str) -> str:
89
+ """Add the audio token placeholder to the instruction and format it
90
+ according to the llama tokenizer.
91
+ """
92
+ if self.audio_token_placeholder not in instruction:
93
+ instruction = self.audio_token_placeholder + instruction
94
+ instruction = self.prompt_template.format(prompt=instruction.strip())
95
+
96
+ return instruction
97
+
98
+ def __call__(
99
+ self,
100
+ audios: list[list[float] | np.ndarray] | list[str | os.PathLike],
101
+ instructions: list[str],
102
+ input_sample_rates: list[int],
103
+ ) -> tuple[torch.Tensor, list[str]]:
104
+ """Prepare audios and instructions for inference
105
+
106
+ Arguments
107
+ ---------
108
+ audios : list[list[float] | np.ndarray] | list[str | os.PathLike]
109
+ The audio samples or file paths
110
+ instructions : list[str]
111
+ The instructions or queries
112
+ input_sample_rates : list[int]
113
+ The sample rates of the input audio samples
114
+
115
+ Returns
116
+ -------
117
+ tuple[torch.Tensor, list[str]]
118
+ The prepared audios and instructions
119
+ """
120
+ audios = torch.stack(
121
+ [self.prepare_audio(audio, input_sr) for audio, input_sr in zip(audios, input_sample_rates)]
122
+ )
123
+ instructions = [self.prepare_instruction(instruction) for instruction in instructions]
124
+
125
+ return audios, instructions
126
+
127
+
128
+ @dataclass
129
+ class NatureLMAudioEvalProcessor(NatureLMAudioProcessor):
130
+ """Preprocess samples to make them ready for NatureLM-audio evaluation on BEANS-Zero dataset.
131
+ This requires a few additional parameters compared to the NatureLMAudioProcessor.
132
+
133
+ Arguments
134
+ ---------
135
+ sample_rate : int
136
+ The sample rate of the NatureLM model
137
+ max_length_seconds : int
138
+ The maximum length of audio in seconds
139
+ audio_token_placeholder : str
140
+ The placeholder for the audio token in the instruction
141
+ prompt_template : str
142
+ The template for the prompt. The instruction or query from the user is inserted in the placeholder at {prompt}
143
+
144
+ dataset_name : list[str]
145
+ The name of the dataset being processed
146
+ true_labels : list[str]
147
+ The true labels or expected outputs for the samples.
148
+ task: str
149
+ The task for the dataset. Can be 'detection', 'captioning', or 'classification'
150
+ threshold_too_many_detection_labels : int
151
+ The threshold for the number of labels in the dataset to switch to a detection prompt. Default is 8.
152
+
153
+
154
+ Examples
155
+ --------
156
+ >>> processor = NatureLMAudioEvalProcessor(task="detection", true_labels=["dog", "cat", "bird", "None", "mouse", "elephant", "lion", "tiger", "bear"])
157
+ >>> audios = [np.random.rand(32000), np.random.rand(32000)]
158
+ >>> instructions = ["What is the weather today?", "What is the time now?"]
159
+ >>> input_sample_rates = [32000, 32000]
160
+ >>> audios, instructions = processor(audios, instructions, input_sample_rates)
161
+ >>> audios.shape == (2, 160000)
162
+ True
163
+ >>> "<Audio><AudioHere></Audio> " in instructions[0]
164
+ True
165
+ >>> "<|start_header_id|>user<|end_header_id|>" in instructions[0]
166
+ True
167
+ >>> "What are the common names" in instructions[0]
168
+ True
169
+ """
170
+
171
+ dataset_name: str = "beans-zero"
172
+ true_labels: list[str] = field(default_factory=list)
173
+ task: str = "detection"
174
+
175
+ threshold_too_many_detection_labels: int = 8
176
+
177
+ def __post_init__(self):
178
+ self.detection_prompt: str = (
179
+ "<Audio><AudioHere></Audio> What are the common names for the species in the audio, if any?"
180
+ )
181
+
182
+ # find the unique labels in the dataset
183
+ self.dataset_labels = set(self.true_labels)
184
+ if self.task == "detection":
185
+ self.dataset_labels.add("None")
186
+ if self.task == "captioning":
187
+ self.dataset_labels = set()
188
+
189
+ def prepare_instruction(self, instruction: str) -> str:
190
+ """Add the audio token placeholder to the instruction and format it"""
191
+ if self.task == "detection" and len(self.dataset_labels) > self.threshold_too_many_detection_labels:
192
+ instruction = self.detection_prompt
193
+
194
+ if self.audio_token_placeholder not in instruction:
195
+ instruction = self.audio_token_placeholder + instruction
196
+
197
+ instruction = self.prompt_template.format(prompt=instruction.strip())
198
+
199
+ return instruction
200
+
201
+
202
+ class NatureLMInferenceDataset(torch.utils.data.Dataset):
203
+ """A pytorch dataset for batched inference with NatureLM-audio
204
+
205
+ TODO: currently, if the batch contains very different prompts the model doesn't work well.
206
+
207
+ Arguments
208
+ ---------
209
+ ds : datasets.Dataset
210
+ The huggingface dataset containing the samples
211
+
212
+ Examples
213
+ --------
214
+ TODO: Add examples
215
+ """
216
+
217
+ def __init__(self, ds, processor):
218
+ self.ds = ds
219
+ self.processor = processor
220
+
221
+ def __getitem__(self, idx):
222
+ sample = self.ds[idx]
223
+ input_sample_rate = json.loads(sample["metadata"])["sample_rate"]
224
+ audio_tensor = self.processor.prepare_audio(sample["audio"], input_sample_rate)
225
+
226
+ instruction = self.processor.prepare_instruction(sample["instruction"])
227
+ return {
228
+ "raw_wav": audio_tensor,
229
+ "text": "",
230
+ "task": sample["task"],
231
+ "audio_chunk_sizes": len(audio_tensor),
232
+ "index": idx,
233
+ "id": sample["id"],
234
+ "prompt": instruction,
235
+ "label": sample["output"],
236
+ }
237
+
238
+ def __len__(self):
239
+ return len(self.ds)
240
+
241
+
242
+ def collater(samples: list[dict]) -> dict:
243
+ """Collate samples into a batch.
244
+
245
+ Samples is a list of dictionaries, each containing the following keys:
246
+ - raw_wav: a list of tensors containing the raw audio waveform
247
+ - text: a list of strings containing the text
248
+ - task: a list of strings containing the task
249
+ - id: a list of strings containing the id
250
+ - prompt: a list of strings containing the prompt
251
+ - index: a list of integers containing the index
252
+ - audio_chunk_sizes: a list of integers containing the size of each audio chunk
253
+
254
+ The individual audio waveforms will be stacked along the batch dimension for easier
255
+ processing in the audio model. To keep track of which audio belongs to which sample, we add
256
+ the audio_chunk_sizes key to the batch dictionary.
257
+ """
258
+ raw_wav = torch.stack([s["raw_wav"] for s in samples])
259
+ padding_mask = torch.zeros_like(raw_wav).to(torch.bool)
260
+
261
+ text = [s["text"] for s in samples]
262
+ prompt = [s["prompt"] for s in samples]
263
+ task = [s["task"] for s in samples]
264
+ id = [s["id"] for s in samples]
265
+ index = [s["index"] for s in samples]
266
+ label = [s["label"] for s in samples]
267
+
268
+ return {
269
+ "raw_wav": raw_wav,
270
+ "padding_mask": padding_mask,
271
+ "text": text,
272
+ "task": task,
273
+ "id": id,
274
+ "prompt": prompt,
275
+ "index": index,
276
+ "audio_chunk_sizes": 1,
277
+ "label": label,
278
+ }
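A short sketch of how the processor above prepares a batch for inference; the audio arrays and prompts are placeholders. Inputs at other sample rates are resampled to 16 kHz and padded or truncated to max_length_seconds.

import numpy as np
from NatureLM.processors import NatureLMAudioProcessor

processor = NatureLMAudioProcessor(sample_rate=16000, max_length_seconds=10)
audios = [np.random.rand(32000), np.random.rand(48000)]
prompts = ["What species is vocalizing?", "Is there rain in the background?"]
batch_audio, batch_prompts = processor(audios, prompts, input_sample_rates=[32000, 48000])
print(batch_audio.shape)   # torch.Size([2, 160000]) -> 10 s at 16 kHz per sample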
NatureLM/runner.py ADDED
@@ -0,0 +1,515 @@
1
+ # This script is based on https://github.com/salesforce/LAVIS/blob/main/lavis/runners/runner_base.py
2
+
3
+ import datetime
4
+ import json
5
+ import logging
6
+ import os
7
+ import time
8
+ from collections import defaultdict
9
+ from pathlib import Path
10
+
11
+ import torch
12
+ import torch.distributed
13
+ import torch.distributed as dist
14
+ import wandb
15
+ from torch.nn.parallel import DistributedDataParallel as DDP
16
+ from torch.utils.tensorboard import SummaryWriter
17
+
18
+ from NatureLM.config import Config
19
+ from NatureLM.dist_utils import get_rank, get_world_size, is_dist_avail_and_initialized, is_main_process, main_process
20
+ from NatureLM.logger import MetricLogger, SmoothedValue
21
+ from NatureLM.optims import LinearWarmupCosineLRScheduler, get_optimizer
22
+ from NatureLM.task_metrics import get_task_metrics
23
+ from NatureLM.utils import get_dataloader, prepare_sample_dist
24
+
25
+
26
+ class Runner:
27
+ def __init__(self, cfg: Config, model, datasets, job_id):
28
+ self.config = cfg
29
+
30
+ # log
31
+ device = "cuda:0"
32
+ if is_main_process():
33
+ if self.config.run.wandb_enabled:
34
+ wandb.init(project="earthlm", config=self.config.model_dump())
35
+ else:
36
+ wandb.init(mode="disabled")
37
+
38
+ if "LOCAL_RANK" in os.environ:
39
+ device = int(os.environ["LOCAL_RANK"])
40
+ else:
41
+ device = self.config.run.device
42
+ print(f"device is {device} could have been {self.config.run.device}")
43
+ self.output_dir = Path(self.config.run.output_dir) / job_id
44
+ self.output_dir.mkdir(parents=True, exist_ok=True)
45
+ self.log_writter = SummaryWriter(self.output_dir)
46
+
47
+ # settings
48
+ self.device = torch.device(device)
49
+ self.use_distributed = self.config.run.use_distributed
50
+ self.start_epoch = 0
51
+ self.max_epoch = self.config.run.optims.max_epoch
52
+ self.evaluate_only = self.config.run.evaluate
53
+ self.cuda_enabled = self.device.type == "cuda"
54
+
55
+ # test prompt
56
+ self.prompt_template = self.config.model.prompt_template
57
+
58
+ # model
59
+ self._model = model
60
+ torch.nn.SyncBatchNorm.convert_sync_batchnorm(self._model)
61
+ self._model.to(self.device)
62
+ if self.use_distributed:
63
+ self.model = DDP(
64
+ self._model,
65
+ find_unused_parameters=True,
66
+ static_graph=False,
67
+ device_ids=[self.device],
68
+ )
69
+ else:
70
+ self.model = self._model
71
+
72
+ # dataloaders
73
+ self.train_loader = get_dataloader(
74
+ datasets["train"],
75
+ self.config.run,
76
+ is_train=True,
77
+ use_distributed=self.use_distributed,
78
+ )
79
+ self.valid_loader = get_dataloader(
80
+ datasets["valid"],
81
+ self.config.run,
82
+ is_train=False,
83
+ use_distributed=self.use_distributed,
84
+ )
85
+ self.test_loader = get_dataloader(
86
+ datasets["test"],
87
+ self.config.run,
88
+ is_train=False,
89
+ use_distributed=self.use_distributed,
90
+ )
91
+
92
+ # scaler
93
+ self.use_amp = self.config.run.amp
94
+ if self.use_amp:
95
+ self.scaler = torch.cuda.amp.GradScaler()
96
+ else:
97
+ self.scaler = None
98
+
99
+ # optimizer & scheduler
100
+ self.iters_per_epoch = (
101
+ len(self.train_loader) if self.config.run.epoch_based else self.config.run.iters_per_epoch
102
+ )
103
+ self.optimizer = get_optimizer(self.model, self.config.run.optims)
104
+ self.scheduler = LinearWarmupCosineLRScheduler(
105
+ self.optimizer,
106
+ max_epoch=self.max_epoch,
107
+ iters_per_epoch=self.iters_per_epoch,
108
+ min_lr=self.config.run.optims.min_lr,
109
+ init_lr=self.config.run.optims.init_lr,
110
+ warmup_steps=self.config.run.optims.warmup_steps,
111
+ warmup_start_lr=self.config.run.optims.warmup_start_lr,
112
+ )
113
+
114
+ #### augmentations
115
+ # self.rng = random.Random(self.config.run.seed)
116
+ # self.rngnp = np.random.default_rng(seed=self.config.run.seed)
117
+ # self.rngth = torch.Generator(device=args.device)
118
+ # self.rngth.manual_seed(self.config.run.seed)
119
+ # augments = []
120
+ # if self.config.run.augmentations.flip:
121
+ # augments.append(augmentations.Flip(self.config.run.augmentations.flip, rngth=self.rngth, seed=self.config.run.seed))
122
+ # if self.config.run.augmentations.bandmask:
123
+ # augments.append(augmentations.BandMask(self.config.run.augmentations.bandmask, sample_rate=args.sample_rate, rng=self.rng, seed=self.config.run.seed))
124
+ # if self.config.run.augmentations.revecho:
125
+ # augments.append(
126
+ # augmentations.RevEcho(proba=self.config.run.augmentations.revecho,rng=self.rng,seed=self.config.run.seed))
127
+ # self.augment = torch.nn.Sequential(*augments)
128
+
129
+ self.log_config()
130
+
131
+ def unwrap_dist_model(self, model):
132
+ if self.use_distributed:
133
+ return model.module
134
+ else:
135
+ return model
136
+
137
+ def train_epoch(self, epoch):
138
+ self.model.train()
139
+
140
+ metric_logger = MetricLogger(delimiter=" ")
141
+ metric_logger.add_meter("lr", SmoothedValue(window_size=1, fmt="{value:.6f}"))
142
+ metric_logger.add_meter("loss", SmoothedValue(window_size=1, fmt="{value:.4f}"))
143
+
144
+ logging.info("Start training epoch {}, {} iters per inner epoch.".format(epoch, self.iters_per_epoch))
145
+ header = "Train: data epoch: [{}]".format(epoch)
146
+
147
+ # Get gradient clipping parameters from config
148
+ clip_grad_norm = self.config.run.optims.max_grad_norm
149
+ clip_grad_value = self.config.run.optims.max_grad_value
150
+
151
+ for i in metric_logger.log_every(
152
+ range(self.iters_per_epoch),
153
+ self.config.run.log_freq,
154
+ header=header,
155
+ logger=self.log_writter,
156
+ start_step=epoch * self.iters_per_epoch,
157
+ ):
158
+ if i >= self.iters_per_epoch:
159
+ break
160
+
161
+ samples = next(self.train_loader)
162
+
163
+ samples = prepare_sample_dist(samples, self.device)
164
+
165
+ #### augmentation
166
+ # if False:
167
+ # samples = self.augment(samples)
168
+
169
+ self.scheduler.step(cur_epoch=epoch, cur_step=i)
170
+
171
+ with torch.autocast(self.device.type, enabled=self.use_amp, dtype=torch.bfloat16):
172
+ loss = self.model(samples)["loss"]
173
+ if torch.isnan(loss):
174
+ print("loss nan", samples)
175
+ # continue
176
+
177
+ if self.use_amp and self.scaler:
178
+ self.scaler.scale(loss).backward()
179
+ else:
180
+ loss.backward()
181
+
182
+ # Apply gradient clipping
183
+ if clip_grad_norm is not None:
184
+ if self.use_amp and self.scaler:
185
+ self.scaler.unscale_(self.optimizer)
186
+ torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=clip_grad_norm)
187
+ if clip_grad_value is not None:
188
+ if self.use_amp and self.scaler:
189
+ self.scaler.unscale_(self.optimizer)
190
+ torch.nn.utils.clip_grad_value_(self.model.parameters(), clip_value=clip_grad_value)
191
+
192
+ if (i + 1) % self.config.run.accum_grad_iters == 0:
193
+ if self.use_amp and self.scaler:
194
+ self.scaler.step(self.optimizer)
195
+ self.scaler.update()
196
+ else:
197
+ self.optimizer.step()
198
+ self.optimizer.zero_grad()
199
+
200
+ metric_logger.update(loss=loss.item())
201
+ metric_logger.update(lr=self.optimizer.param_groups[0]["lr"])
202
+
203
+ metric_logger.synchronize_between_processes()
204
+ logging.info("Averaged stats: " + str(metric_logger.global_avg()))
205
+ return {k: "{:.3f}".format(meter.global_avg) for k, meter in metric_logger.meters.items()}
206
+
207
+ @torch.no_grad()
208
+ def valid_epoch(self, epoch, split, decode=True, save_json=False, decode_ratio=1.0):
209
+ """
210
+ Decode = True will lead to calculation of custom metrics which are based on text.
211
+ decode_ratio controls the percentage of batches which will have custom metrics computed,
212
+ a speed trade-off due to the cost of the 'generate' method.
213
+ """
214
+ model = self.unwrap_dist_model(self.model)
215
+ model.eval()
216
+
217
+ dataloader = getattr(self, split + "_loader", None)
218
+ assert dataloader is not None, f"{split}_loader does not exist."
219
+
220
+ metric_logger = MetricLogger(delimiter=" ")
221
+ header = f"Eval: data epoch: [{epoch}]"
222
+
223
+ results_per_task = defaultdict(list) # Store results per task
224
+ overall_results = [] # Store all results for overall metrics
225
+
226
+ # Calculate N based on decode_ratio
227
+ if decode_ratio <= 0.0:
228
+ N = float("inf") # Effectively never run generate
229
+ elif decode_ratio >= 1.0:
230
+ N = 1 # Run generate every batch
231
+ else:
232
+ N = max(int(1 / decode_ratio), 1) # Ensure N is at least 1
233
+
234
+ batch_idx = 0
235
+
236
+ # Initialize overall metrics
237
+ overall_res = {
238
+ "loss": torch.tensor(0.0, device=self.device),
239
+ "correct": torch.tensor(0.0, device=self.device),
240
+ "total": torch.tensor(0.0, device=self.device),
241
+ }
242
+
243
+ # Initialize per-task metrics
244
+ per_task_res = defaultdict(
245
+ lambda: {
246
+ "loss": torch.tensor(0.0, device=self.device),
247
+ "correct": torch.tensor(0.0, device=self.device),
248
+ "total": torch.tensor(0.0, device=self.device),
249
+ "n_sample": 0,
250
+ "predicted_texts": [],
251
+ "gold_texts": [],
252
+ }
253
+ )
254
+
255
+ for samples in metric_logger.log_every(dataloader, self.config.run.log_freq, header=header):
256
+ samples = prepare_sample_dist(samples, self.device)
257
+
258
+ with torch.autocast(self.device.type, enabled=self.use_amp):
259
+ forward_result = model(samples, verbose=True)
260
+
261
+ # Extract batch-level loss and correct counts
262
+ batch_loss = forward_result.get("loss", torch.tensor(0.0, device=self.device))
263
+ batch_correct = forward_result.get("correct", torch.tensor(0.0, device=self.device))
264
+ batch_total = forward_result.get("total", torch.tensor(1.0, device=self.device))
265
+
266
+ batch_size = len(samples["id"])
267
+
268
+ # Update overall metrics with batch-level values
269
+ overall_res["loss"] += batch_loss.detach()
270
+ overall_res["correct"] += batch_correct.detach()
271
+ overall_res["total"] += batch_total.detach()
272
+
273
+ # Decide whether to run generate based on decode_ratio
274
+ if decode and (batch_idx % N == 0):
275
+ prompts = samples.get("prompt", None)
276
+ try:
277
+ generated_texts = model.generate(samples, self.config.generate, prompts=prompts)
278
+ except Exception as e:
279
+ print("error in generation", e)
280
+ generated_texts = [None] * batch_size
281
+ else:
282
+ generated_texts = [None] * batch_size # Placeholder if not decoding
283
+
284
+ # Process per-sample data for per-task metrics and result saving
285
+ for i in range(batch_size):
286
+ task = samples["task"][i]
287
+
288
+ # Collect per-task batch-level metrics
289
+ per_task_res[task]["loss"] += batch_loss.detach()
290
+ per_task_res[task]["correct"] += batch_correct.detach()
291
+ per_task_res[task]["total"] += batch_total.detach()
292
+ per_task_res[task]["n_sample"] += 1
293
+
294
+ res = {
295
+ "id": samples["id"][i],
296
+ "ground_truth": samples["text"][i], # Gold label from dataloader
297
+ "task": task,
298
+ "predicted_text": generated_texts[i],
299
+ }
300
+
301
+ if decode and generated_texts[i] is not None:
302
+ res["prompt"] = samples.get("prompt", [None])[i]
303
+
304
+ results_per_task[task].append(res)
305
+ overall_results.append(res)
306
+
307
+ # Collect texts for custom metrics
308
+ if generated_texts[i] is not None:
309
+ per_task_res[task]["predicted_texts"].append(generated_texts[i])
310
+ per_task_res[task]["gold_texts"].append(samples["text"][i])
311
+
312
+ batch_idx += 1 # Increment batch index
313
+
314
+ if save_json:
315
+ for task, task_results in results_per_task.items():
316
+ self.save_result(task_results, self.output_dir, f"eval_{split}_{task}_epoch_{epoch}")
317
+ # Optionally save overall results
318
+ self.save_result(overall_results, self.output_dir, f"eval_{split}_epoch_{epoch}")
319
+
320
+ # Synchronize metrics across processes if in distributed mode
321
+ if is_dist_avail_and_initialized():
322
+ for key in overall_res:
323
+ dist.all_reduce(overall_res[key])
324
+
325
+ overall_ret = {
326
+ "loss": (overall_res["loss"] / batch_idx).item(),
327
+ "agg_metrics": (overall_res["correct"] / overall_res["total"]).item(),
328
+ }
329
+
330
+ if is_main_process():
331
+ # Log overall metrics
332
+ wandb.log(
333
+ {
334
+ f"{split}_loss": overall_ret["loss"],
335
+ f"{split}_accuracy": overall_ret["agg_metrics"],
336
+ "epoch": epoch,
337
+ }
338
+ )
339
+
340
+ # Compute and log per-task metrics
341
+ for task, res in per_task_res.items():
342
+ if "caption-none" in task:
343
+ continue
344
+
345
+ if self.use_distributed:
346
+ print(f"Rank {dist.get_rank()}, task={task}, ")
347
+
348
+ print(
349
+ f"loss={res['loss'].shape, res['loss'].dtype}, "
350
+ f"correct={res['correct'].shape, res['correct'].dtype}, "
351
+ f"total={res['total'].shape, res['total'].dtype}, "
352
+ f"n_sample={res['n_sample']}"
353
+ )
354
+
355
+ # Synchronize metrics across processes if in distributed mode
356
+ if is_dist_avail_and_initialized():
357
+ dist.all_reduce(res["loss"])
358
+ dist.all_reduce(res["correct"])
359
+ dist.all_reduce(res["total"])
360
+ dist.all_reduce(torch.tensor(res["n_sample"], device=self.device))
361
+
362
+ ret = {
363
+ "loss": (res["loss"] / res["n_sample"]).item(),
364
+ "agg_metrics": (res["correct"] / res["total"]).item(),
365
+ }
366
+
367
+ if is_main_process():
368
+ # Log per-task metrics
369
+ wandb.log(
370
+ {
371
+ f"{split}_{task}_loss": ret["loss"],
372
+ f"{split}_{task}_accuracy": ret["agg_metrics"],
373
+ "epoch": epoch,
374
+ }
375
+ )
376
+
377
+ # Get and compute custom metrics for this task
378
+ metrics_list = get_task_metrics(task)
379
+ predicted_texts = res["predicted_texts"]
380
+ gold_texts = res["gold_texts"]
381
+ for metric in metrics_list:
382
+ if predicted_texts and gold_texts:
383
+ metric_value = metric.compute_metric(predicted_texts, gold_texts)
384
+ metric_name = metric.__class__.__name__
385
+ wandb.log(
386
+ {
387
+ f"{split}_{task}_{metric_name}": metric_value,
388
+ "epoch": epoch,
389
+ }
390
+ )
391
+ return overall_ret # Return overall metrics
392
+
393
+ def save_result(self, result, result_dir, filename):
394
+ result_file = os.path.join(result_dir, "%s_rank%d.json" % (filename, get_rank()))
395
+ final_result_file = os.path.join(result_dir, "%s.json" % filename)
396
+
397
+ try:
398
+ json.dump(result, open(result_file, "w"), ensure_ascii=False)
399
+ except Exception as e:
400
+ logging.warning(f"Error saving {result_file}. Error: {e}")
401
+ json.dump(result, open(result_file, "w", encoding="utf-8"), ensure_ascii=False)
402
+
403
+ # if is_dist_avail_and_initialized():
404
+ # dist.barrier()
405
+
406
+ if is_main_process():
407
+ logging.info("rank %d starts merging results." % get_rank())
408
+ result = []
409
+
410
+ for rank in range(get_world_size()):
411
+ result_file = os.path.join(result_dir, "%s_rank%d.json" % (filename, rank))
412
+ try:
413
+ res = json.load(open(result_file, "r"))
414
+ except Exception as e:
415
+ logging.warning(f"Error reading {result_file}. Error: {e}")
416
+ res = json.load(open(result_file, "r", encoding="utf-8"))
417
+ result += res
418
+
419
+ try:
420
+ json.dump(result, open(final_result_file, "w"), ensure_ascii=False)
421
+ except Exception as e:
422
+ logging.warning(f"Error saving {final_result_file}. Error: {e}")
423
+ json.dump(
424
+ result,
425
+ open(final_result_file, "w", encoding="utf-8"),
426
+ ensure_ascii=False,
427
+ )
428
+
429
+ print("result file saved to %s" % final_result_file)
430
+
431
+ def train(self):
432
+ start_time = time.time()
433
+ best_agg_metric = 0
434
+ best_epoch = 0
435
+
436
+ for cur_epoch in range(self.start_epoch, self.max_epoch):
437
+ if self.evaluate_only:
438
+ break
439
+
440
+ # training phase
441
+ logging.info("Training Phase")
442
+ train_stats = self.train_epoch(cur_epoch)
443
+ self.log_stats(train_stats, split_name="train")
444
+
445
+ # validating phase
446
+ logging.info("Validating Phase")
447
+ valid_log = self.valid_epoch(
448
+ cur_epoch,
449
+ "valid",
450
+ decode=self.config.run.custom_metrics,
451
+ save_json=False,
452
+ decode_ratio=self.config.run.decode_ratio,
453
+ )
454
+ if valid_log is not None:
455
+ if is_main_process():
456
+ agg_metrics = valid_log["agg_metrics"]
457
+ if agg_metrics > best_agg_metric:
458
+ best_agg_metric = agg_metrics
459
+ best_epoch = cur_epoch
460
+ self.save_checkpoint(cur_epoch, is_best=True)
461
+
462
+ valid_log.update({"best_epoch": best_epoch})
463
+ self.log_stats(valid_log, split_name="valid")
464
+ self.save_checkpoint(cur_epoch, is_best=False)
465
+
466
+ # if self.use_distributed:
467
+ # dist.barrier()
468
+
469
+ # testing phase
470
+ if self.evaluate_only:
471
+ self.valid_epoch("best", "test", decode=True, save_json=True)
472
+
473
+ total_time = time.time() - start_time
474
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
475
+ logging.info("Training time {}".format(total_time_str))
476
+
477
+ @main_process
478
+ def log_config(self):
479
+ with open(os.path.join(self.output_dir, "log.txt"), "a") as f:
480
+ f.write(json.dumps(self.config.model_dump_json(), indent=4) + "\n")
481
+
482
+ @main_process
483
+ def log_stats(self, stats, split_name):
484
+ if isinstance(stats, dict):
485
+ log_stats = {**{f"{split_name}_{k}": v for k, v in stats.items()}}
486
+ with open(os.path.join(self.output_dir, "log.txt"), "a") as f:
487
+ f.write(json.dumps(log_stats) + "\n")
488
+ elif isinstance(stats, list):
489
+ pass
490
+
491
+ @main_process
492
+ def save_checkpoint(self, cur_epoch, is_best=False):
493
+ """
494
+ Save the checkpoint at the current epoch.
495
+ """
496
+ model_no_ddp = self.unwrap_dist_model(self.model)
497
+ param_grad_dic = {k: v.requires_grad for (k, v) in model_no_ddp.named_parameters()}
498
+ state_dict = model_no_ddp.state_dict()
499
+ for k in list(state_dict.keys()):
500
+ if k in param_grad_dic.keys() and not param_grad_dic[k]:
501
+ # delete parameters that do not require gradient
502
+ del state_dict[k]
503
+ save_obj = {
504
+ "model": state_dict,
505
+ "optimizer": self.optimizer.state_dict(),
506
+ "config": dict(self.config),
507
+ "scaler": self.scaler.state_dict() if self.scaler else None,
508
+ "epoch": cur_epoch,
509
+ }
510
+ save_to = os.path.join(
511
+ self.output_dir,
512
+ "checkpoint_{}.pth".format("best" if is_best else cur_epoch),
513
+ )
514
+ logging.info("Saving checkpoint at epoch {} to {}.".format(cur_epoch, save_to))
515
+ torch.save(save_obj, save_to)
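For orientation, a minimal sketch of how this Runner is typically driven from a training entry point. The config loader and dataset builders below are illustrative placeholders, not the repository's actual train script; only Runner, Config, and now_as_str come from the code in this commit.

    # Hypothetical driver sketch (load_config/build_model/build_datasets are placeholders).
    from NatureLM.config import Config
    from NatureLM.utils import now_as_str

    def main():
        cfg: Config = load_config("configs/train.yml")   # placeholder: build a Config from YAML
        model = build_model(cfg.model)                   # placeholder: construct the NatureLM model
        datasets = build_datasets(cfg)                   # placeholder: {"train": ..., "valid": ..., "test": ...}
        runner = Runner(cfg, model, datasets, job_id=now_as_str())
        runner.train()  # per epoch: train_epoch -> valid_epoch -> checkpointing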
NatureLM/storage_utils.py ADDED
@@ -0,0 +1,26 @@
1
+ import logging
2
+ import os
3
+ from functools import lru_cache
4
+ from typing import Union
5
+
6
+ import cloudpathlib
7
+ from google.cloud.storage.client import Client
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ def is_gcs_path(path: Union[str, os.PathLike]) -> bool:
13
+ return str(path).startswith("gs://")
14
+
15
+
16
+ @lru_cache(maxsize=1)
17
+ def _get_client():
18
+ return cloudpathlib.GSClient(storage_client=Client())
19
+
20
+
21
+ try:
22
+ _gcp_storage_client = _get_client()
23
+ except Exception:
24
+ logger.warning("Failed to initialize GCS client. Training won't be able to use GSPath or R2Path without a client.")
25
+ _gcp_storage_client = None
26
+
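A small usage sketch for the helpers above; the checkpoint URI is a placeholder, and passing an explicit client to GSPath follows cloudpathlib's documented API.

    # Illustrative: open a path through the shared GCS client when it is a gs:// URI.
    import cloudpathlib

    def open_binary(path: str):
        if is_gcs_path(path) and _gcp_storage_client is not None:
            # Reuse the lazily created, cached client instead of building a new one per call.
            return cloudpathlib.GSPath(path, client=_gcp_storage_client).open("rb")
        return open(path, "rb")

    # e.g. open_binary("gs://my-bucket/checkpoint_best.pth")  # placeholder bucket name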
NatureLM/task_metric_utils.py ADDED
@@ -0,0 +1,283 @@
1
+ # Taken from DCASE 2021 Task 5 evaluation source code
2
+ # https://github.com/c4dm/dcase-few-shot-bioacoustic
3
+ # MIT License
4
+
5
+ import mir_eval
6
+ import numpy as np
7
+ import scipy
8
+
9
+
10
+ def fast_intersect(ref, est):
11
+ """Find all intersections between reference events and estimated events (fast).
12
+ Best-case complexity: O(N log N + M log M) where N=length(ref) and M=length(est)
13
+ Parameters
14
+ ----------
15
+ ref: np.ndarray [shape=(2, n)], real-valued
16
+ Array of reference events. Each column is an event.
17
+ The first row denotes onset times and the second row denotes offset times.
18
+ est: np.ndarray [shape=(2, m)], real-valued
19
+ Array of estimated events. Each column is an event.
20
+ The first row denotes onset times and the second row denotes offset times.
21
+ Returns
22
+ -------
23
+ matches: list of sets, length n, integer-valued
24
+ Property: matches[i] contains the set of all indices j such that
25
+ (ref[0, i]<=est[1, j]) AND (ref[1, i]>=est[0, j])
26
+ """
27
+ ref_on_argsort = np.argsort(ref[0, :])
28
+ ref_off_argsort = np.argsort(ref[1, :])
29
+
30
+ est_on_argsort = np.argsort(est[0, :])
31
+ est_off_argsort = np.argsort(est[1, :])
32
+
33
+ est_on_maxindex = est.shape[1]
34
+ est_off_minindex = 0
35
+ estref_matches = [set()] * ref.shape[1]
36
+ refest_matches = [set()] * ref.shape[1]
37
+ for ref_id in range(ref.shape[1]):
38
+ ref_onset = ref[0, ref_on_argsort[ref_id]]
39
+ est_off_sorted = est[1, est_off_argsort[est_off_minindex:]]
40
+ search_result = np.searchsorted(est_off_sorted, ref_onset, side="left")
41
+ est_off_minindex += search_result
42
+ refest_match = est_off_argsort[est_off_minindex:]
43
+ refest_matches[ref_on_argsort[ref_id]] = set(refest_match)
44
+
45
+ ref_offset = ref[1, ref_off_argsort[-1 - ref_id]]
46
+ est_on_sorted = est[0, est_on_argsort[: (1 + est_on_maxindex)]]
47
+ search_result = np.searchsorted(est_on_sorted, ref_offset, side="right")
48
+ est_on_maxindex = search_result - 1
49
+ estref_match = est_on_argsort[: (1 + est_on_maxindex)]
50
+ estref_matches[ref_off_argsort[-1 - ref_id]] = set(estref_match)
51
+
52
+ zip_iterator = zip(refest_matches, estref_matches)
53
+ matches = [x.intersection(y) for (x, y) in zip_iterator]
54
+ return matches
55
+
56
+
57
+ def iou(ref, est, method="fast"):
58
+ """Compute pairwise "intersection over union" (IOU) metric between reference
59
+ events and estimated events.
60
+ Let us denote by a_i and b_i the onset and offset of reference event i.
61
+ Let us denote by u_j and v_j the onset and offset of estimated event j.
62
+ The IOU between events i and j is defined as
63
+ (min(b_i, v_j)-max(a_i, u_j)) / (max(b_i, v_j)-min(a_i, u_j))
64
+ if the events are non-disjoint, and equal to zero otherwise.
65
+ Parameters
66
+ ----------
67
+ ref: np.ndarray [shape=(2, n)], real-valued
68
+ Array of reference events. Each column is an event.
69
+ The first row denotes onset times and the second row denotes offset times.
70
+ est: np.ndarray [shape=(2, m)], real-valued
71
+ Array of estimated events. Each column is an event.
72
+ The first row denotes onset times and the second row denotes offset times.
73
+ method: str, optional.
74
+ If "fast" (default), computes pairwise intersections via a custom
75
+ dynamic programming algorithm, see fast_intersect.
76
+ If "slow", computes pairwise intersections via bruteforce quadratic
77
+ search, see slow_intersect.
78
+ Returns
79
+ -------
80
+ S: scipy.sparse.dok.dok_matrix, real-valued
81
+ Sparse 2-D matrix. S[i,j] contains the IOU between ref[i] and est[j]
82
+ if these events are non-disjoint and zero otherwise.
83
+ """
84
+ n_refs = ref.shape[1]
85
+ n_ests = est.shape[1]
86
+ S = scipy.sparse.dok_matrix((n_refs, n_ests))
87
+
88
+ if method == "fast":
89
+ matches = fast_intersect(ref, est)
90
+ elif method == "slow":
91
+ matches = slow_intersect(ref, est)
92
+
93
+ for ref_id in range(n_refs):
94
+ matching_ests = matches[ref_id]
95
+ ref_on = ref[0, ref_id]
96
+ ref_off = ref[1, ref_id]
97
+
98
+ for matching_est_id in matching_ests:
99
+ est_on = est[0, matching_est_id]
100
+ est_off = est[1, matching_est_id]
101
+ intersection = min(ref_off, est_off) - max(ref_on, est_on)
102
+ union = max(ref_off, est_off) - min(ref_on, est_on)
103
+ intersection_over_union = intersection / union
104
+ S[ref_id, matching_est_id] = intersection_over_union
105
+
106
+ return S
107
+
108
+ def compute_intersection(ref, est, method="fast"):
109
+ """Compute pairwise intersection between reference
110
+ events and estimated events.
111
+ Let us denote by a_i and b_i the onset and offset of reference event i.
112
+ Let us denote by u_j and v_j the onset and offset of estimated event j.
113
+ The Intersection between events i and j is defined as
114
+ (min(b_i, v_j)-max(a_i, u_j))
115
+ if the events are non-disjoint, and equal to zero otherwise.
116
+ Parameters
117
+ ----------
118
+ ref: np.ndarray [shape=(2, n)], real-valued
119
+ Array of reference events. Each column is an event.
120
+ The first row denotes onset times and the second row denotes offset times.
121
+ est: np.ndarray [shape=(2, m)], real-valued
122
+ Array of estimated events. Each column is an event.
123
+ The first row denotes onset times and the second row denotes offset times.
124
+ method: str, optional.
125
+ If "fast" (default), computes pairwise intersections via a custom
126
+ dynamic programming algorithm, see fast_intersect.
127
+ If "slow", computes pairwise intersections via bruteforce quadratic
128
+ search, see slow_intersect.
129
+ Returns
130
+ -------
131
+ S: scipy.sparse.dok.dok_matrix, real-valued
132
+ Sparse 2-D matrix. S[i,j] contains the Intersection between ref[i] and est[j]
133
+ if these events are non-disjoint and zero otherwise.
134
+ """
135
+ n_refs = ref.shape[1]
136
+ n_ests = est.shape[1]
137
+ S = scipy.sparse.dok_matrix((n_refs, n_ests))
138
+
139
+ if method == "fast":
140
+ matches = fast_intersect(ref, est)
141
+ elif method == "slow":
142
+ matches = slow_intersect(ref, est)
143
+
144
+ for ref_id in range(n_refs):
145
+ matching_ests = matches[ref_id]
146
+ ref_on = ref[0, ref_id]
147
+ ref_off = ref[1, ref_id]
148
+
149
+ for matching_est_id in matching_ests:
150
+ est_on = est[0, matching_est_id]
151
+ est_off = est[1, matching_est_id]
152
+ intersection = min(ref_off, est_off) - max(ref_on, est_on)
153
+ # union = max(ref_off, est_off) - min(ref_on, est_on)
154
+ # intersection_over_union = intersection / union
155
+ S[ref_id, matching_est_id] = intersection #_over_union
156
+
157
+ return S
158
+
159
+
160
+ def match_events(ref, est, min_iou=0.0, method="fast"):
161
+ """
162
+ Compute a maximum matching between reference and estimated event times,
163
+ subject to a criterion of minimum intersection-over-union (IOU).
164
+ Given two lists of events ``ref`` (reference) and ``est`` (estimated),
165
+ we seek the largest set of correspondences ``(ref[i], est[j])`` such that
166
+ ``iou(ref[i], est[j]) <= min_iou``
167
+ and such that each ``ref[i]`` and ``est[j]`` is matched at most once.
168
+ This function is strongly inspired by mir_eval.onset.util.match_events.
169
+ It relies on mir_eval's implementation of the Hopcroft-Karp algorithm from
170
+ maximum bipartite graph matching. However, one important difference is that
171
+ mir_eval's distance function relies purely on onset times, whereas this function
172
+ considers both onset times and offset times to compute the IOU metric between
173
+ reference events and estimated events.
174
+ Parameters
175
+ ----------
176
+ ref: np.ndarray [shape=(2, n)], real-valued
177
+ Array of reference events. Each column is an event.
178
+ The first row denotes onset times and the second row denotes offset times.
179
+ est: np.ndarray [shape=(2, m)], real-valued
180
+ Array of estimated events. Each column is an event.
181
+ The first row denotes onset times and the second row denotes offset times.
182
+ min_iou: real number in [0, 1). Default: 0.
183
+ Threshold for minimum amount of intersection over union (IOU) to match
184
+ any two events. See the iou method for implementation details.
185
+ method: str, optional.
186
+ If "fast" (default), computes pairwise intersections via a custom
187
+ dynamic programming algorithm, see fast_intersect.
188
+ If "slow", computes pairwise intersections via bruteforce quadratic
189
+ search, see slow_intersect.
190
+ Returns
191
+ -------
192
+ matching : list of tuples
193
+ Every tuple corresponds to a match between one reference event and
194
+ one estimated event.
195
+ ``matching[i] == (i, j)`` where ``ref[i]`` matches ``est[j]``.
196
+ Note that all values i and j appear at most once in the list.
197
+ """
198
+
199
+ # Intersect reference events and estimated events
200
+ S = iou(ref, est, method=method)
201
+
202
+ # Threshold intersection-over-union (IOU) ratio
203
+ S_bool = scipy.sparse.dok_matrix(S > min_iou)
204
+ hits = S_bool.keys()
205
+
206
+ # Construct the bipartite graph
207
+ G = {}
208
+ for ref_i, est_i in hits:
209
+ if est_i not in G:
210
+ G[est_i] = []
211
+ G[est_i].append(ref_i)
212
+
213
+ # Apply Hopcroft-Karp algorithm (from mir_eval package)
214
+ # to obtain maximum bipartite graph matching
215
+ matching = sorted(mir_eval.util._bipartite_match(G).items())
216
+ return matching
217
+
218
+
219
+ def slow_intersect(ref, est):
220
+ """Find all intersections between reference events and estimated events (slow).
221
+ Best-case complexity: O(N*M) where N=ref.shape[1] and M=est.shape[1]
222
+ Parameters
223
+ ----------
224
+ ref: np.ndarray [shape=(2, n)], real-valued
225
+ Array of reference events. Each column is an event.
226
+ The first row denotes onset times and the second row denotes offset times.
227
+ est: np.ndarray [shape=(2, m)], real-valued
228
+ Array of estimated events. Each column is an event.
229
+ The first row denotes onset times and the second row denotes offset times.
230
+ Returns
231
+ -------
232
+ matches: list of sets, length n, integer-valued
233
+ Property: matches[i] contains the set of all indices j such that
234
+ (ref[0, i]<=est[1, j]) AND (ref[1, i]>=est[0, j])
235
+ """
236
+ matches = []
237
+ for i in range(ref.shape[1]):
238
+ matches.append(
239
+ set(
240
+ [
241
+ j
242
+ for j in range(est.shape[1])
243
+ if ((ref[0, i] <= est[1, j]) and (ref[1, i] >= est[0, j]))
244
+ ]
245
+ )
246
+ )
247
+ return
248
+
249
+
250
+ def frames_to_st_dict(x, sr=16000):
251
+ # x : Tensor of shape (batch, time) or (time,). Entries are 2 (POS), 1 (UNK), and 0 (NEG).
252
+ # returns a list of dicts {"Begin Time (s)" : [...], "End Time (s)" : [...], "Annotation" : [...]} if batch dim exists, or a single dict
253
+
254
+ if len(x.size()) == 2:
255
+ outs = []
256
+ for i in range(x.size(0)):
257
+ x_sub = x[i,:]
258
+ outs.append(_frames_to_st_dict_single(x_sub, sr=sr))
259
+ return outs
260
+ else:
261
+ return _frames_to_st_dict_single(x, sr=sr)
262
+
263
+ def _frames_to_st_dict_single(x, sr=16000):
264
+ d = {"Begin Time (s)" : [], "End Time (s)" : [], "Annotation" : []}
265
+
266
+ for label_i in [1,2]:
267
+
268
+ labels = x.numpy() == label_i # POS : 2, UNK : 1, NEG : 0
269
+
270
+ starts = np.where((~labels[:-1]) & (labels[1:]))[0] + 1
271
+ if labels[0]:
272
+ starts = np.insert(starts, 0, 0)
273
+
274
+ ends = np.where((labels[:-1]) & (~labels[1:]))[0] + 1
275
+ if labels[-1]:
276
+ ends = np.append(ends, len(labels))
277
+
278
+ for start, end in zip(starts, ends):
279
+ d["Begin Time (s)"].append(start/sr)
280
+ d["End Time (s)"].append(end/sr)
281
+ d["Annotation"].append("POS" if label_i == 2 else "UNK")
282
+
283
+ return d
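A tiny worked example of the event-matching utilities above, mainly to make the (2, n) onset/offset layout concrete; the numbers are arbitrary.

    # Columns are events; row 0 holds onsets, row 1 holds offsets (seconds).
    import numpy as np

    ref = np.array([[0.0, 5.0],
                    [1.0, 6.0]])   # reference events: (0-1 s) and (5-6 s)
    est = np.array([[0.1, 7.0],
                    [0.9, 8.0]])   # estimated events: (0.1-0.9 s) and (7-8 s)

    print(iou(ref, est).toarray())              # IOU(ref0, est0) = 0.8, everything else 0
    print(match_events(ref, est, min_iou=0.5))  # one matched pair: ref0 <-> est0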
NatureLM/task_metrics.py ADDED
@@ -0,0 +1,128 @@
1
+ import re
2
+ from abc import ABC, abstractmethod
3
+ from typing import List, Tuple
4
+
5
+ import numpy as np
6
+
7
+ from NatureLM.task_metric_utils import match_events
8
+
9
+ # Assume the following functions are imported from the reference implementations:
10
+ # - match_events
11
+ # - iou
12
+ # - fast_intersect
13
+ # - slow_intersect
14
+ # - compute_intersection
15
+
16
+
17
+ class Metric(ABC):
18
+ @abstractmethod
19
+ def compute_metric(self, predicted_texts: List[str], gold_texts: List[str]) -> float:
20
+ pass
21
+
22
+
23
+ class ExactAccuracy(Metric):
24
+ """Exact-match accuracy metric."""
25
+
26
+ def compute_metric(self, predicted_texts: List[str], gold_texts: List[str]) -> float:
27
+ predicted_texts = [pt.lower().strip() for pt in predicted_texts]
28
+ gold_texts = [gt.lower().strip() for gt in gold_texts]
29
+ correct = sum(p == g for p, g in zip(predicted_texts, gold_texts))
30
+ return correct / len(gold_texts) if gold_texts else 0.0
31
+
32
+
33
+ class FewShot(Metric):
34
+ """Few-shot learning metric based on event matching using IoU."""
35
+
36
+ def compute_metric(self, predicted_texts: List[str], gold_texts: List[str]) -> float:
37
+ # Initialize counts
38
+ total_TP = 0
39
+ total_FP = 0
40
+ total_FN = 0
41
+
42
+ for pred_text, gold_text in zip(predicted_texts, gold_texts):
43
+ # Extract events from texts
44
+ pred_events = parse_timestamps_from_text(pred_text)
45
+ gold_events = parse_timestamps_from_text(gold_text)
46
+
47
+ # Convert events to numpy arrays for match_events function
48
+ # Each event is (start_time, end_time), need to transpose to shape (2, n)
49
+ pred_array = np.array(pred_events).T if pred_events else np.empty((2, 0))
50
+ gold_array = np.array(gold_events).T if gold_events else np.empty((2, 0))
51
+
52
+ # Use match_events function from the reference implementation
53
+ matches = match_events(gold_array, pred_array, min_iou=0.5, method="fast")
54
+
55
+ TP = len(matches)
56
+ FP = len(pred_events) - TP
57
+ FN = len(gold_events) - TP
58
+
59
+ total_TP += TP
60
+ total_FP += FP
61
+ total_FN += FN
62
+
63
+ # Compute precision, recall, and F1 score
64
+ precision = total_TP / (total_TP + total_FP) if (total_TP + total_FP) > 0 else 0.0
65
+ recall = total_TP / (total_TP + total_FN) if (total_TP + total_FN) > 0 else 0.0
66
+ f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
67
+
68
+ return f1_score
69
+
70
+
71
+ class NoneAccuracy(Metric):
72
+ """Accuracy for cases where 'None' is the correct answer."""
73
+
74
+ def compute_metric(self, predicted_texts: List[str], gold_texts: List[str]) -> float:
75
+ # Normalize texts
76
+ predicted_texts = [pt.lower().strip() for pt in predicted_texts]
77
+ gold_texts = [gt.lower().strip() for gt in gold_texts]
78
+ # Filter indices where gold_text is 'none'
79
+ indices = [i for i, gt in enumerate(gold_texts) if gt == "none"]
80
+ if not indices:
81
+ return 0.0 # No 'None' cases in gold_texts
82
+ correct = sum(predicted_texts[i] == "none" for i in indices)
83
+ return correct / len(indices)
84
+
85
+
86
+ class MultipleSpeciesAccuracy(Metric):
87
+ """Accuracy for cases where the correct answer has at least one comma (multiple species)."""
88
+
89
+ def compute_metric(self, predicted_texts: List[str], gold_texts: List[str]) -> float:
90
+ # Normalize texts
91
+ predicted_texts = [pt.lower().strip() for pt in predicted_texts]
92
+ gold_texts = [gt.lower().strip() for gt in gold_texts]
93
+ # Filter indices where gold_text contains at least one comma
94
+ indices = [i for i, gt in enumerate(gold_texts) if "," in gt]
95
+ if not indices:
96
+ return 0.0 # No multiple-species cases in gold_texts
97
+ correct = sum(predicted_texts[i] == gold_texts[i] for i in indices)
98
+ return correct / len(indices)
99
+
100
+
101
+ def get_task_metrics(task: str) -> List[Metric]:
102
+ """Get a list of metric instances appropriate for the given task."""
103
+ all_metrics = []
104
+ metrics_dict = {}
105
+
106
+ if "classification" in task:
107
+ metrics_dict["ExactAccuracy"] = ExactAccuracy()
108
+ if "fewshot" in task:
109
+ metrics_dict["FewShot"] = FewShot()
110
+ if "detection" in task:
111
+ metrics_dict["ExactAccuracy"] = ExactAccuracy() # Ensures no duplicate
112
+ metrics_dict["NoneAccuracy"] = NoneAccuracy()
113
+ metrics_dict["MultipleSpeciesAccuracy"] = MultipleSpeciesAccuracy()
114
+
115
+ all_metrics = list(metrics_dict.values())
116
+ return all_metrics
117
+
118
+
119
+ def parse_timestamps_from_text(text: str) -> List[Tuple[float, float]]:
120
+ """
121
+ Function to parse timestamps from text.
122
+ Extracts timestamps in the format "start-end" where start and end are floats.
123
+ """
124
+ # Regular expression to extract timestamps in the format "start-end"
125
+ pattern = r"(\d+\.\d+)-(\d+\.\d+)"
126
+ matches = re.findall(pattern, text)
127
+ events = [(float(start), float(end)) for start, end in matches]
128
+ return events
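A brief illustration of how these metrics get selected and applied; the task string and prediction strings are invented, and any task name containing both "detection" and "fewshot" would pick the same metric set.

    # Made-up predictions/golds in the "start-end" timestamp format parsed above.
    preds = ["0.50-1.20, 3.00-3.80", "none"]
    golds = ["0.55-1.25, 3.10-3.90", "none"]

    for metric in get_task_metrics("fewshot-detection"):
        name = metric.__class__.__name__
        print(name, metric.compute_metric(preds, golds))
    # FewShot matches parsed events at IOU above 0.5 and reports F1;
    # NoneAccuracy only scores the samples whose gold answer is "none".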
NatureLM/utils.py ADDED
@@ -0,0 +1,382 @@
1
+ # Copyright (2024) Earth Species Project
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import logging
16
+ import os
17
+ import time
18
+ from datetime import datetime
19
+ from pathlib import Path
20
+ from typing import Any, Literal
21
+
22
+ import numpy as np
23
+ import resampy
24
+ import soundfile as sf
25
+ import torch
26
+ import torch.nn.functional as F
27
+ import torchaudio
28
+ from torch.utils.data import DataLoader, DistributedSampler
29
+
30
+ from NatureLM.dist_utils import get_rank, get_world_size
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ TARGET_SAMPLE_RATE = 16_000
36
+
37
+
38
+ def snr_scale(clean, noise, snr):
39
+ # Ensure both clean and noise have the same length
40
+ assert clean.shape == noise.shape, "Clean and noise must have the same shape."
41
+
42
+ # Compute power (mean squared amplitude)
43
+ power_signal = torch.mean(clean**2)
44
+ power_noise = torch.mean(noise**2)
45
+
46
+ # Prevent division by zero
47
+ epsilon = 1e-10
48
+ power_noise = torch.clamp(power_noise, min=epsilon)
49
+
50
+ # Calculate desired noise power based on SNR
51
+ desired_noise_power = power_signal / (10 ** (snr / 10))
52
+
53
+ # Scale noise to achieve the desired noise power
54
+ scale = torch.sqrt(desired_noise_power / power_noise)
55
+ scaled_noise = scale * noise
56
+
57
+ return scaled_noise
58
+
59
+
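As a quick sanity check of the scaling rule above: the target noise power is power_signal / 10^(snr/10), so the amplitude scale is the square root of that target divided by the current noise power.

    # e.g. for snr = 10 dB the scaled noise carries one tenth of the signal's power.
    clean = torch.randn(16000)
    noise = torch.randn(16000)
    mix = clean + snr_scale(clean, noise, snr=10.0)  # noise now sits ~10 dB below the clean signal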
60
+ def time_scale(signal, scale=2.0, rngnp=None, seed=42):
61
+ if rngnp is None:
62
+ rngnp = np.random.default_rng(seed=seed)
63
+ scaling = np.power(scale, rngnp.uniform(-1, 1))
64
+ output_size = int(signal.shape[-1] * scaling)
65
+ ref = torch.arange(output_size, device=signal.device, dtype=signal.dtype).div_(scaling)
66
+ ref1 = ref.clone().type(torch.int64)
67
+ ref2 = torch.min(ref1 + 1, torch.full_like(ref1, signal.shape[-1] - 1, dtype=torch.int64))
68
+ r = ref - ref1.type(ref.type())
69
+ scaled_signal = signal[..., ref1] * (1 - r) + signal[..., ref2] * r
70
+
71
+ ## trim or zero pad to torche original size
72
+ if scaled_signal.shape[-1] > signal.shape[-1]:
73
+ nframes_offset = (scaled_signal.shape[-1] - signal.shape[-1]) // 2
74
+ scaled_signal = scaled_signal[..., nframes_offset : nframes_offset + signal.shape[-1]]
75
+ else:
76
+ nframes_diff = signal.shape[-1] - scaled_signal.shape[-1]
77
+ pad_left = int(np.random.uniform() * nframes_diff)
78
+ pad_right = nframes_diff - pad_left
79
+ scaled_signal = F.pad(input=scaled_signal, pad=(pad_left, pad_right), mode="constant", value=0)
80
+ return scaled_signal
81
+
82
+
83
+ def mel_frequencies(n_mels, fmin, fmax):
84
+ def _hz_to_mel(f):
85
+ return 2595 * np.log10(1 + f / 700)
86
+
87
+ def _mel_to_hz(m):
88
+ return 700 * (10 ** (m / 2595) - 1)
89
+
90
+ low = _hz_to_mel(fmin)
91
+ high = _hz_to_mel(fmax)
92
+
93
+ mels = np.linspace(low, high, n_mels)
94
+
95
+ return _mel_to_hz(mels)
96
+
97
+
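For reference, the helper above uses the standard 2595 * log10(1 + f/700) mel mapping; a quick check of the endpoints and midpoint:

    # mel_frequencies(3, 0, 8000) places 3 points evenly in mel space and maps them back to Hz.
    # hz_to_mel(8000) is roughly 2840 mel, so the midpoint (1420 mel) maps back to about 1768 Hz,
    # i.e. the returned frequencies are denser at the low end, as expected for a mel scale.
    print(mel_frequencies(n_mels=3, fmin=0, fmax=8000))  # approx [0.0, 1767.7, 8000.0]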
98
+ def now_as_str() -> str:
99
+ return datetime.now().strftime("%Y%m%d%H%M")
100
+
101
+
102
+ def get_dataloader(dataset, config, is_train=True, use_distributed=True):
103
+ if use_distributed:
104
+ sampler = DistributedSampler(dataset, shuffle=is_train, num_replicas=get_world_size(), rank=get_rank())
105
+ else:
106
+ sampler = None
107
+
108
+ loader = DataLoader(
109
+ dataset,
110
+ batch_size=config.batch_size_train if is_train else config.batch_size_eval,
111
+ num_workers=config.num_workers,
112
+ pin_memory=False,
113
+ sampler=sampler,
114
+ shuffle=sampler is None and is_train,
115
+ collate_fn=dataset.collater,
116
+ drop_last=is_train,
117
+ )
118
+
119
+ if is_train:
120
+ loader = IterLoader(loader, use_distributed=use_distributed)
121
+
122
+ return loader
123
+
124
+
125
+ def apply_to_sample(f, sample):
126
+ if len(sample) == 0:
127
+ return {}
128
+
129
+ def _apply(x):
130
+ if torch.is_tensor(x):
131
+ return f(x)
132
+ elif isinstance(x, dict):
133
+ return {key: _apply(value) for key, value in x.items()}
134
+ elif isinstance(x, list):
135
+ return [_apply(x) for x in x]
136
+ else:
137
+ return x
138
+
139
+ return _apply(sample)
140
+
141
+
142
+ def move_to_device(sample, device):
143
+ def _move_to_device(tensor):
144
+ return tensor.to(device)
145
+
146
+ return apply_to_sample(_move_to_device, sample)
147
+
148
+
149
+ def prepare_sample(samples, cuda_enabled=True):
150
+ if cuda_enabled:
151
+ samples = move_to_device(samples, "cuda")
152
+
153
+ # TODO fp16 support
154
+
155
+ return samples
156
+
157
+
158
+ def prepare_sample_dist(samples, device):
159
+ samples = move_to_device(samples, device)
160
+
161
+ # TODO fp16 support
162
+
163
+ return samples
164
+
165
+
166
+ class IterLoader:
167
+ """
168
+ A wrapper to convert DataLoader as an infinite iterator.
169
+
170
+ Modified from:
171
+ https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/iter_based_runner.py
172
+ """
173
+
174
+ def __init__(self, dataloader: DataLoader, use_distributed: bool = False):
175
+ self._dataloader = dataloader
176
+ self.iter_loader = iter(self._dataloader)
177
+ self._use_distributed = use_distributed
178
+ self._epoch = 0
179
+
180
+ @property
181
+ def epoch(self) -> int:
182
+ return self._epoch
183
+
184
+ def __next__(self):
185
+ try:
186
+ data = next(self.iter_loader)
187
+ except StopIteration:
188
+ self._epoch += 1
189
+ if hasattr(self._dataloader.sampler, "set_epoch") and self._use_distributed:
190
+ self._dataloader.sampler.set_epoch(self._epoch)
191
+ time.sleep(2) # Prevent possible deadlock during epoch transition
192
+ self.iter_loader = iter(self._dataloader)
193
+ data = next(self.iter_loader)
194
+
195
+ return data
196
+
197
+ def __iter__(self):
198
+ return self
199
+
200
+ def __len__(self):
201
+ return len(self._dataloader)
202
+
203
+
204
+ def prepare_one_sample(wav_path: str, wav_processor=None, cuda_enabled=True) -> dict:
205
+ """Prepare a single sample for inference.
206
+
207
+ Args:
208
+ wav_path: Path to the audio file.
209
+ wav_processor: A function to process the audio file.
210
+ cuda_enabled: Whether to move the sample to the GPU.
211
+ """
212
+ audio, sr = sf.read(wav_path)
213
+ if len(audio.shape) == 2: # stereo to mono
214
+ audio = audio.mean(axis=1)
215
+ if len(audio) < sr: # pad audio to at least 1s
216
+ sil = np.zeros(sr - len(audio), dtype=float)
217
+ audio = np.concatenate((audio, sil), axis=0)
218
+ audio = audio[: sr * 10] # truncate audio to at most 10s
219
+
220
+ # spectrogram = wav_processor(audio, sampling_rate=sr, return_tensors="pt")["input_features"]
221
+ print("audio shape", audio.shape)
222
+
223
+ audio_t = torch.tensor(audio).unsqueeze(0)
224
+ audio_t = torchaudio.functional.resample(audio_t, sr, TARGET_SAMPLE_RATE)
225
+ print("audio shape after resample", audio_t.shape)
226
+
227
+ samples = {
228
+ "raw_wav": audio_t,
229
+ "padding_mask": torch.zeros(len(audio), dtype=torch.bool).unsqueeze(0),
230
+ "audio_chunk_sizes": [1],
231
+ }
232
+ if cuda_enabled:
233
+ samples = move_to_device(samples, "cuda")
234
+
235
+ return samples
236
+
237
+
238
+ def prepare_one_sample_waveform(audio, cuda_enabled=True, sr=16000):
239
+ print("shape", audio.shape)
240
+ if len(audio.shape) == 2: # stereo to mono
241
+ print("converting stereo to mono?")
242
+ audio = audio.mean(axis=1)
243
+ if len(audio) < sr: # pad audio to at least 1s
244
+ sil = np.zeros(sr - len(audio), dtype=float)
245
+ audio = np.concatenate((audio, sil), axis=0)
246
+ audio = audio[: sr * 10] # truncate audio to at most 10s
247
+
248
+ samples = {
249
+ "raw_wav": torch.tensor(audio).unsqueeze(0).type(torch.DoubleTensor),
250
+ "padding_mask": torch.zeros(len(audio), dtype=torch.bool).unsqueeze(0),
251
+ }
252
+ if cuda_enabled:
253
+ samples = move_to_device(samples, "cuda")
254
+
255
+ return samples
256
+
257
+
258
+ def prepare_sample_waveforms(audio_paths, cuda_enabled=True, sr=TARGET_SAMPLE_RATE, max_length_seconds=10):
259
+ batch_len = sr # minimum length of audio
260
+ audios = []
261
+ for audio_path in audio_paths:
262
+ audio, loaded_sr = sf.read(audio_path)
263
+ if len(audio.shape) == 2:
264
+ audio = audio[:, 0]
265
+ audio = audio[: loaded_sr * 10]
266
+ audio = resampy.resample(audio, loaded_sr, sr)
267
+ audio = torch.from_numpy(audio)
268
+
269
+ if len(audio) < sr * max_length_seconds:
270
+ pad_size = sr * max_length_seconds - len(audio)
271
+ audio = torch.nn.functional.pad(audio, (0, pad_size))
272
+ audio = torch.clamp(audio, -1.0, 1.0)
273
+ if len(audio) > batch_len:
274
+ batch_len = len(audio)
275
+ audios.append(audio)
276
+ padding_mask = torch.zeros((len(audios), batch_len), dtype=torch.bool)
277
+ for i in range(len(audios)):
278
+ if len(audios[i]) < batch_len:
279
+ pad_len = batch_len - len(audios[i])
280
+ sil = torch.zeros(pad_len, dtype=torch.float32)
281
+ padding_mask[i, len(audios[i]) :] = True # mark the padded positions before extending the waveform
282
+ audios[i] = torch.cat((audios[i], sil), dim=0)
283
+ audios = torch.stack(audios, dim=0)
284
+
285
+ samples = {
286
+ "raw_wav": audios,
287
+ "padding_mask": padding_mask,
288
+ "audio_chunk_sizes": [len(audio_paths)],
289
+ }
290
+ if cuda_enabled:
291
+ samples = move_to_device(samples, "cuda")
292
+
293
+ return samples
294
+
295
+
296
+ def generate_sample_batches(
297
+ audio_path,
298
+ cuda_enabled: bool = True,
299
+ sr: int = TARGET_SAMPLE_RATE,
300
+ chunk_len: int = 10,
301
+ hop_len: int = 5,
302
+ batch_size: int = 4,
303
+ ):
304
+ audio, loaded_sr = sf.read(audio_path)
305
+ if len(audio.shape) == 2: # stereo to mono
306
+ audio = audio.mean(axis=1)
307
+ audio = torchaudio.functional.resample(torch.from_numpy(audio), loaded_sr, sr)
308
+ hop_len = hop_len * sr
309
+ chunk_len = min(len(audio), chunk_len * sr)  # chunk length in samples; use the whole clip if it is shorter
310
+ chunks = []
311
+
312
+ for i in range(0, len(audio), hop_len):
313
+ chunk = audio[i : i + chunk_len]
314
+ if len(chunk) < chunk_len:
315
+ break
316
+ chunks.append(chunk)
317
+
318
+ for i in range(0, len(chunks), batch_size):
319
+ batch = chunks[i : i + batch_size]
320
+ padding_mask = torch.zeros((len(batch), chunk_len), dtype=torch.bool)  # chunk_len is already in samples
321
+ batch = torch.stack(batch, dim=0)
322
+ samples = {
323
+ "raw_wav": batch,
324
+ "padding_mask": padding_mask,
325
+ "audio_chunk_sizes": [1 for _ in range(len(batch))],
326
+ }
327
+ if cuda_enabled:
328
+ samples = move_to_device(samples, "cuda")
329
+ yield samples
330
+
331
+
332
+ def prepare_samples_for_detection(samples, prompt, label):
333
+ prompts = [prompt for i in range(len(samples["raw_wav"]))]
334
+ labels = [label for i in range(len(samples["raw_wav"]))]
335
+ task = ["detection" for i in range(len(samples["raw_wav"]))]
336
+ samples["prompt"] = prompts
337
+ samples["text"] = labels
338
+ samples["task"] = task
339
+ return samples
340
+
341
+
342
+ def universal_torch_load(
343
+ f: str | os.PathLike,
344
+ *,
345
+ cache_mode: Literal["none", "use", "force"] = "none",
346
+ **kwargs,
347
+ ) -> Any:
348
+ """
349
+ Wrapper function for torch.load that can handle GCS paths.
350
+
351
+ This function provides a convenient way to load PyTorch objects from both local and
352
+ Google Cloud Storage (GCS) paths. For GCS paths, it can optionally caches the
353
+ downloaded files locally to avoid repeated downloads.
354
+
355
+ The cache location is determined by:
356
+ 1. The ESP_CACHE_HOME environment variable if set
357
+ 2. Otherwise defaults to ~/.cache/esp/
358
+
359
+ Args:
360
+ f: File-like object, string or PathLike object.
361
+ Can be a local path or a GCS path (starting with 'gs://').
362
+ cache_mode (str, optional): Cache mode for GCS files. Options are:
363
+ "none": No caching (use bucket directly)
364
+ "use": Use cache if available, download if not
365
+ "force": Force redownload even if cache exists
366
+ Defaults to "none".
367
+ **kwargs: Additional keyword arguments passed to torch.load().
368
+
369
+ Returns:
370
+ The object loaded from the file using torch.load.
371
+
372
+ Raises:
373
+ IsADirectoryError: If the GCS path points to a directory instead of a file.
374
+ FileNotFoundError: If the local file does not exist.
375
+ """
376
+
377
+ f = Path(f)
378
+ if not f.exists():
379
+ raise FileNotFoundError(f"File does not exist: {f}")
380
+
381
+ with open(f, "rb") as opened_file:
382
+ return torch.load(opened_file, **kwargs)
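To make the expected tensor layout concrete, a short inference-side sketch; the audio path and prompt are placeholders, and the model.generate call mirrors how Runner.valid_epoch uses it rather than a verified public API.

    # Illustrative only -- assumes a loaded NatureLM `model` and a parsed `cfg` are in scope.
    paths = ["example_recording.wav"]  # placeholder file
    samples = prepare_sample_waveforms(paths, cuda_enabled=torch.cuda.is_available())
    # samples["raw_wav"]:      (batch, n_samples) waveforms resampled to 16 kHz and zero-padded
    # samples["padding_mask"]: (batch, n_samples) bool mask, True on padded positions

    samples = prepare_samples_for_detection(
        samples,
        prompt="Which species are audible in this recording?",
        label="",  # labels are only needed when scoring
    )
    # outputs = model.generate(samples, cfg.generate, prompts=samples["prompt"])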
README.md CHANGED
@@ -1,14 +1,34 @@
1
  ---
2
- title: NatureLM Audio
3
- emoji: 🔥
4
- colorFrom: yellow
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 5.40.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
- short_description: Description
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: NatureLM Audio Demo
3
+ emoji: 🎵
4
+ colorFrom: purple
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 5.38.2
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ short_description: Audio analysis with NatureLM model
12
  ---
13
 
14
+ # NatureLM Audio Demo
15
+
16
+ This is a demo of the NatureLM audio analysis model. The app provides three main features:
17
+
18
+ ## Features
19
+
20
+ 1. **Chat Interface**: Upload audio files and ask questions about them
21
+ 2. **Batch Processing**: Process multiple audio files with the same task
22
+ 3. **Long Recording Analysis**: Analyze long audio recordings by chunking them
23
+
24
+ ## Usage
25
+
26
+ - **First Use**: The model will load automatically when you first use it (this may take a few minutes)
27
+ - **Subsequent Uses**: The model stays loaded for faster responses
28
+ - **Demo Mode**: If the model fails to load, the app will run in demo mode
29
+
30
+ ## Model Loading
31
+
32
+ The app uses lazy loading to start quickly. The model is only loaded when you first interact with it, not during app initialization. This prevents timeout issues on HuggingFace Spaces.
33
+
34
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Space.yaml ADDED
@@ -0,0 +1,3 @@
1
+ sdk: gradio
2
+ python_version: 3.10
3
+ hardware: cpu
configs/inference.yml ADDED
@@ -0,0 +1,61 @@
1
+ model:
2
+ llama_path: "meta-llama/Meta-Llama-3.1-8B-Instruct"
3
+
4
+ freeze_beats: True
5
+ device: "cuda"
6
+ use_audio_Qformer: True
7
+ max_pooling: False
8
+ downsample_factor: 8
9
+ freeze_audio_QFormer: False
10
+ window_level_Qformer: True
11
+ num_audio_query_token: 1
12
+ second_per_window: 0.333333
13
+ second_stride: 0.333333
14
+
15
+ audio_llama_proj_model: ""
16
+ freeze_audio_llama_proj: False
17
+
18
+ lora: True
19
+ lora_rank: 32
20
+ lora_alpha: 32
21
+ lora_dropout: 0.1
22
+
23
+ prompt_template: "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
24
+ max_txt_len: 160
25
+ end_sym: <|end_of_text|>
26
+
27
+ beats_cfg:
28
+ input_patch_size: 16
29
+ embed_dim: 512
30
+ conv_bias: False
31
+ encoder_layers: 12
32
+ encoder_embed_dim: 768
33
+ encoder_ffn_embed_dim: 3072
34
+ encoder_attention_heads: 12
35
+ activation_fn: "gelu"
36
+ layer_wise_gradient_decay_ratio: 0.6
37
+ layer_norm_first: False
38
+ deep_norm: True
39
+ dropout: 0.0
40
+ attention_dropout: 0.0
41
+ activation_dropout: 0.0
42
+ encoder_layerdrop: 0.05
43
+ dropout_input: 0.0
44
+ conv_pos: 128
45
+ conv_pos_groups: 16
46
+ relative_position_embedding: True
47
+ num_buckets: 320
48
+ max_distance: 800
49
+ gru_rel_pos: True
50
+ finetuned_model: True
51
+ predictor_dropout: 0.0
52
+ predictor_class: 527
53
+
54
+ generate:
55
+ max_new_tokens: 300
56
+ num_beams: 2
57
+ do_sample: False
58
+ min_length: 1
59
+ temperature: 0.1
60
+ repetition_penalty: 1.0
61
+ length_penalty: 1.0
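For context, a hedged sketch of reading this file directly; in the codebase these values are normally wrapped in NatureLM.config.Config (a pydantic-style model, given the model_dump calls in the Runner) before reaching model.generate.

    # Illustrative: inspect the decoding parameters defined in configs/inference.yml.
    import yaml

    with open("configs/inference.yml") as fh:
        cfg = yaml.safe_load(fh)

    print(cfg["generate"]["num_beams"])        # 2
    print(cfg["generate"]["max_new_tokens"])   # 300
    print(cfg["model"]["lora_rank"])           # 32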
requirements.txt ADDED
@@ -0,0 +1,30 @@
1
+ torch>=2.2.2
2
+ torchaudio>=2.2.2
3
+ torchvision>=0.17.2
4
+ transformers[sentencepiece]>=4.44.2
5
+ datasets>=2.20.0
6
+ cloudpathlib[gs]>=0.20.0
7
+ einops>=0.8.0
8
+ gradio>=5.10.0
9
+ google-cloud-aiplatform>=1.76.0
10
+ Levenshtein>=0.25.1
11
+ librosa>=0.9.2
12
+ memoization>=0.4.0
13
+ mir-eval>=0.7
14
+ numpy>=1.26.4
15
+ pandas>=1.4.3
16
+ peft>=0.11.1
17
+ plumbum>=1.7.2
18
+ pydantic-settings>=2.7.1
19
+ pydantic>=2.7.4
20
+ pydub>=0.25.1
21
+ pyyaml>=6.0
22
+ resampy>=0.3.1
23
+ scipy>=1.14.0
24
+ soundfile>=0.12.1
25
+ tensorboard>=2.18.0
26
+ tensorboardX>=2.6.2.2
27
+ tqdm>=4.66.4
28
+ wandb>=0.17.3
29
+ click>=8.1.7
30
+ git+https://github.com/earthspecies/beans-zero.git