Spaces:

sohei1l
/

clip-tagger

Running

File size: 4,983 Bytes

e9a623f
e791eef
 
 
ec858b7
ee11310
 
938c18f
 
 
 
 
 
 
 
 
 
 
e791eef
 
 
 
ee11310
e791eef
 
938c18f
ec858b7
 
 
ee11310
ec858b7
 
ee11310
938c18f
e791eef
ee11310
 
e791eef
 
 
 
77b219c
ec858b7
ee11310
e791eef
 
 
 
77b219c
 
ec858b7
77b219c
 
 
 
 
e791eef
77b219c
 
e9a623f
938c18f
e791eef
ee11310
 
e791eef
938c18f
ee11310
e791eef
 
ee11310
 
 
 
 
 
 
 
e791eef
 
 
ee11310
77b219c
ee11310
ec858b7
 
 
 
77b219c
ee11310
ec858b7
77b219c
 
 
ec858b7
ee11310
 
 
 
 
 
e791eef
 
ec858b7
77b219c
 
e791eef
 
ee11310
 
ec858b7
ee11310
 
 
 
ec858b7
 
ee11310
 
ec858b7
ee11310
 
 
 
 
ec858b7
 
e791eef
938c18f
ec858b7
 
 
 
 
 
938c18f
ec858b7
 
ee11310
 
ec858b7
e791eef

import { pipeline } from '@xenova/transformers';

/**
 * Zero-shot audio tagger built on the Transformers.js
 * `zero-shot-audio-classification` pipeline with the
 * `Xenova/clap-htsat-unfused` model.
 *
 * Typical flow: `fileToAudioBuffer(file)` -> `processAudio(audioBuffer)`,
 * which returns up to five `{ label, confidence }` tags.
 */
class CLAPProcessor {
  /**
   * Sets up an unloaded processor. The model itself is fetched lazily by
   * `initialize()` (called automatically from `processAudio`).
   */
  constructor() {
    this.classifier = null;
    this.isLoaded = false;

    // Sampling rate requested when decoding files. The pipeline takes raw
    // samples "as is", so they must already be at the rate the model's
    // feature extractor is configured for (48 kHz for CLAP; see the model's
    // preprocessor config).
    this.targetSampleRate = 48000;

    // Text prompts scored against the audio by the zero-shot classifier.
    this.candidateLabels = [
      'speech', 'male voice', 'female voice', 'narration', 'reading aloud', 'conversation',
      'music', 'singing', 'instrumental music', 'classical music', 'rock music', 'pop music',
      'jazz', 'electronic music', 'acoustic music', 'background music',
      'guitar', 'piano', 'drums', 'violin', 'trumpet', 'saxophone', 'flute',
      'nature sounds', 'rain', 'wind', 'ocean waves', 'birds chirping', 'water running',
      'ambient sounds', 'room tone', 'background noise', 'white noise',
      'animal sounds', 'dog barking', 'cat meowing', 'birds singing',
      'mechanical sounds', 'car engine', 'traffic', 'airplane', 'train', 'motorcycle',
      'household sounds', 'door closing', 'footsteps', 'typing', 'telephone ringing', 'alarm clock',
      'human sounds', 'applause', 'laughter', 'crying', 'coughing', 'sneezing', 'breathing',
      'silence', 'quiet', 'noise', 'distortion', 'static'
    ];
  }

  /**
   * Loads the classification pipeline once; later calls are no-ops.
   *
   * @returns {Promise<void>}
   * @throws {Error} "CLAP loading failed: ..." with the original error
   *   attached as `cause` when the pipeline cannot be created.
   */
  async initialize() {
    if (this.isLoaded) return;

    try {
      console.log('Loading CLAP pipeline...');

      this.classifier = await pipeline(
        'zero-shot-audio-classification',
        'Xenova/clap-htsat-unfused'
      );

      this.isLoaded = true;
      console.log('CLAP pipeline ready!');
    } catch (error) {
      console.error('❌ CLAP initialization failed:', error);
      // Keep the underlying error reachable for callers that inspect `cause`.
      throw new Error(`CLAP loading failed: ${error.message}`, { cause: error });
    }
  }

  /**
   * Classifies an audio buffer against `candidateLabels`.
   *
   * This method is deliberately best-effort: any failure during conversion
   * or classification is logged and a fixed set of placeholder tags is
   * returned instead of throwing. (A failure in `initialize()` still
   * propagates, because it runs outside the try block.)
   *
   * @param {AudioBuffer} audioBuffer - Decoded audio (see `fileToAudioBuffer`).
   * @returns {Promise<Array<{label: string, confidence: number}>>} Up to five
   *   tags ordered by descending confidence, or the placeholder tags on error.
   */
  async processAudio(audioBuffer) {
    console.log('Processing audio...');

    if (!this.isLoaded) {
      await this.initialize();
    }

    try {
      // Convert AudioBuffer to raw mono samples (Float32Array)
      const rawAudio = this.convertAudioBuffer(audioBuffer);

      console.log('Running classification...');
      console.log('Audio data type:', rawAudio.constructor.name, 'Length:', rawAudio.length);
      console.log('Audio data sample:', rawAudio.slice(0, 5));
      console.log('Is Float32Array?', rawAudio instanceof Float32Array);
      console.log('Candidate labels:', this.candidateLabels.slice(0, 3));

      // The pipeline takes the raw samples and the candidate labels as
      // two separate positional arguments.
      const results = await this.classifier(rawAudio, this.candidateLabels);

      console.log('Classification results:', results);

      const formattedTags = this.formatResults(results);

      console.log('Final tags:', formattedTags);
      return formattedTags;
    } catch (error) {
      console.error('❌ Audio processing error:', error);

      // Placeholder tags so the caller always receives something to show.
      return [
        { label: 'audio', confidence: 0.9 },
        { label: 'sound', confidence: 0.8 },
        { label: 'unknown', confidence: 0.5 }
      ];
    }
  }

  /**
   * Down-mixes an AudioBuffer to a single channel.
   *
   * Mono input is copied; multi-channel input is averaged across ALL
   * channels sample by sample (for stereo this is `(left + right) / 2`).
   * No resampling is done here; a warning is logged when the buffer's rate
   * differs from `targetSampleRate`.
   *
   * @param {AudioBuffer} audioBuffer - Buffer exposing `duration`,
   *   `sampleRate`, `numberOfChannels` and `getChannelData(index)`.
   * @returns {Float32Array} A new array of mono samples.
   */
  convertAudioBuffer(audioBuffer) {
    const channelCount = audioBuffer.numberOfChannels;

    console.log('Converting audio buffer:', {
      duration: `${audioBuffer.duration.toFixed(2)}s`,
      sampleRate: audioBuffer.sampleRate,
      channels: channelCount
    });

    if (audioBuffer.sampleRate !== this.targetSampleRate) {
      console.warn(
        `Audio sample rate is ${audioBuffer.sampleRate} Hz but ${this.targetSampleRate} Hz is expected; results may be degraded`
      );
    }

    let rawAudio;
    if (channelCount === 1) {
      // Copy so the result never aliases the AudioBuffer's internal storage.
      rawAudio = new Float32Array(audioBuffer.getChannelData(0));
    } else {
      const channels = [];
      for (let c = 0; c < channelCount; c++) {
        channels.push(audioBuffer.getChannelData(c));
      }

      const frameCount = channels[0].length;
      rawAudio = new Float32Array(frameCount);
      for (let i = 0; i < frameCount; i++) {
        // Sum in a regular (double) number, then round once on store.
        let sum = 0;
        for (let c = 0; c < channelCount; c++) {
          sum += channels[c][i];
        }
        rawAudio[i] = sum / channelCount;
      }
    }

    console.log('Converted to Float32Array, length:', rawAudio.length);
    return rawAudio;
  }

  /**
   * Turns raw pipeline output into the top five tags.
   *
   * @param {Array<{label: string, score: number}>} results - Pipeline output.
   *   The array is not modified.
   * @returns {Array<{label: string, confidence: number}>} At most five tags,
   *   highest score first, with confidence clamped to [0, 1]. If `results`
   *   is not an array, two placeholder tags are returned.
   */
  formatResults(results) {
    if (!Array.isArray(results)) {
      console.warn('⚠️ Unexpected results format:', results);
      return [
        { label: 'audio', confidence: 0.9 },
        { label: 'sound', confidence: 0.8 }
      ];
    }

    // Sort a copy: Array.prototype.sort works in place and would otherwise
    // reorder the caller's array.
    return [...results]
      .sort((a, b) => b.score - a.score)
      .slice(0, 5)
      .map((result) => ({
        label: result.label,
        confidence: Math.max(0, Math.min(1, result.score))
      }));
  }

  /**
   * Decodes an audio file into an AudioBuffer using the Web Audio API.
   *
   * The temporary AudioContext is created at `targetSampleRate` so the
   * decoded samples come out at the rate the model expects, and it is
   * closed afterwards so repeated calls do not accumulate open contexts.
   *
   * @param {File|Blob} file - Object exposing `name`, `size` and
   *   `arrayBuffer()`.
   * @returns {Promise<AudioBuffer>} The decoded audio.
   * @throws {Error} "Audio decoding failed: ..." with the original error
   *   attached as `cause`.
   */
  async fileToAudioBuffer(file) {
    console.log('Decoding file:', file.name, `(${Math.round(file.size / 1024)}KB)`);

    let audioContext = null;
    try {
      const arrayBuffer = await file.arrayBuffer();
      const AudioContextCtor = window.AudioContext || window.webkitAudioContext;
      audioContext = new AudioContextCtor({ sampleRate: this.targetSampleRate });
      const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);

      console.log('File decoded successfully');
      return audioBuffer;
    } catch (error) {
      console.error('❌ File decoding failed:', error);
      throw new Error(`Audio decoding failed: ${error.message}`, { cause: error });
    } finally {
      // Release the context; the decoded AudioBuffer remains usable.
      // Closing is best-effort and must never mask the decode outcome.
      if (audioContext !== null && typeof audioContext.close === 'function') {
        try {
          Promise.resolve(audioContext.close()).catch((closeError) => {
            console.warn('AudioContext close failed:', closeError);
          });
        } catch (closeError) {
          console.warn('AudioContext close failed:', closeError);
        }
      }
    }
  }
}

export default CLAPProcessor;