File size: 4,983 Bytes
e9a623f
e791eef
 
 
ec858b7
ee11310
 
938c18f
 
 
 
 
 
 
 
 
 
 
e791eef
 
 
 
ee11310
e791eef
 
938c18f
ec858b7
 
 
ee11310
ec858b7
 
ee11310
938c18f
e791eef
ee11310
 
e791eef
 
 
 
77b219c
ec858b7
ee11310
e791eef
 
 
 
77b219c
 
ec858b7
77b219c
 
 
 
 
e791eef
77b219c
 
e9a623f
938c18f
e791eef
ee11310
 
e791eef
938c18f
ee11310
e791eef
 
ee11310
 
 
 
 
 
 
 
e791eef
 
 
ee11310
77b219c
ee11310
ec858b7
 
 
 
77b219c
ee11310
ec858b7
77b219c
 
 
ec858b7
ee11310
 
 
 
 
 
e791eef
 
ec858b7
77b219c
 
e791eef
 
ee11310
 
ec858b7
ee11310
 
 
 
ec858b7
 
ee11310
 
ec858b7
ee11310
 
 
 
 
ec858b7
 
e791eef
938c18f
ec858b7
 
 
 
 
 
938c18f
ec858b7
 
ee11310
 
ec858b7
e791eef
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import { pipeline } from '@xenova/transformers';

/**
 * Zero-shot audio tagger built on the transformers.js
 * `zero-shot-audio-classification` pipeline with the
 * `Xenova/clap-htsat-unfused` checkpoint.
 *
 * Typical flow: `fileToAudioBuffer(file)` -> `processAudio(audioBuffer)`,
 * which yields the five best-scoring entries of `candidateLabels` as
 * `{ label, confidence }` objects.
 */
class CLAPProcessor {
  constructor() {
    // Pipeline callable; populated by initialize().
    this.classifier = null;
    this.isLoaded = false;
    // In-flight load shared by overlapping initialize() calls.
    this.loadingPromise = null;
    // The pipeline treats a raw Float32Array as already being at the model's
    // sampling rate (it does no resampling); CLAP's feature extractor is
    // configured for 48 kHz.
    this.targetSampleRate = 48000;
    // Label vocabulary every clip is scored against.
    this.candidateLabels = [
      'speech', 'male voice', 'female voice', 'narration', 'reading aloud', 'conversation',
      'music', 'singing', 'instrumental music', 'classical music', 'rock music', 'pop music', 
      'jazz', 'electronic music', 'acoustic music', 'background music',
      'guitar', 'piano', 'drums', 'violin', 'trumpet', 'saxophone', 'flute',
      'nature sounds', 'rain', 'wind', 'ocean waves', 'birds chirping', 'water running',
      'ambient sounds', 'room tone', 'background noise', 'white noise',
      'animal sounds', 'dog barking', 'cat meowing', 'birds singing',
      'mechanical sounds', 'car engine', 'traffic', 'airplane', 'train', 'motorcycle',
      'household sounds', 'door closing', 'footsteps', 'typing', 'telephone ringing', 'alarm clock',
      'human sounds', 'applause', 'laughter', 'crying', 'coughing', 'sneezing', 'breathing',
      'silence', 'quiet', 'noise', 'distortion', 'static'
    ];
  }

  /**
   * Loads the classification pipeline once.
   *
   * Calls made while a load is already running wait on that same load
   * instead of starting another one. After a failed load the processor
   * stays unloaded and a later call retries.
   *
   * @returns {Promise<void>}
   * @throws {Error} "CLAP loading failed: ..." with the original error as `cause`.
   */
  async initialize() {
    if (this.isLoaded) return;

    if (this.loadingPromise === null) {
      this.loadingPromise = (async () => {
        try {
          console.log('Loading CLAP pipeline...');

          this.classifier = await pipeline(
            'zero-shot-audio-classification',
            'Xenova/clap-htsat-unfused'
          );

          this.isLoaded = true;
          console.log('CLAP pipeline ready!');
        } catch (error) {
          console.error('❌ CLAP initialization failed:', error);
          throw new Error(`CLAP loading failed: ${error.message}`, { cause: error });
        }
      })();
    }

    const pending = this.loadingPromise;
    try {
      await pending;
    } finally {
      // Drop the settled promise so that a failure can be retried; the
      // identity check avoids clobbering a newer load started by a retry.
      if (this.loadingPromise === pending) {
        this.loadingPromise = null;
      }
    }
  }

  /**
   * Classifies an audio buffer against `candidateLabels`.
   *
   * The pipeline is loaded on demand; a load failure rejects. Failures
   * during conversion or classification are logged and answered with
   * generic placeholder tags instead of rejecting (best-effort contract).
   *
   * @param {AudioBuffer} audioBuffer - Decoded audio (see fileToAudioBuffer).
   * @returns {Promise<Array<{label: string, confidence: number}>>} Up to five
   *   tags, highest confidence first, or the placeholder tags on failure.
   */
  async processAudio(audioBuffer) {
    console.log('Processing audio...');
    
    if (!this.isLoaded) {
      await this.initialize();
    }

    try {
      // Convert AudioBuffer to raw mono audio data (Float32Array)
      const rawAudio = this.convertAudioBuffer(audioBuffer);
      
      console.log('Running classification...');
      console.log('Audio data type:', rawAudio.constructor.name, 'Length:', rawAudio.length);
      console.log('Audio data sample:', rawAudio.slice(0, 5));
      console.log('Is Float32Array?', rawAudio instanceof Float32Array);
      console.log('Candidate labels:', this.candidateLabels.slice(0, 3));
      
      // Run the classification - pass raw Float32Array and candidate labels as separate params
      const results = await this.classifier(rawAudio, this.candidateLabels);
      
      console.log('Classification results:', results);
      
      // Format results
      const formattedTags = this.formatResults(results);
      
      console.log('Final tags:', formattedTags);
      return formattedTags;
      
    } catch (error) {
      console.error('❌ Audio processing error:', error);
      
      // Placeholder tags: these confidences are fixed values, not model output
      return [
        { label: 'audio', confidence: 0.9 },
        { label: 'sound', confidence: 0.8 },
        { label: 'unknown', confidence: 0.5 }
      ];
    }
  }

  /**
   * Flattens an audio buffer into a single mono Float32Array.
   *
   * One channel is copied as-is; two or more channels are averaged sample
   * by sample across all channels. No resampling is performed: a warning is
   * logged when the buffer's rate differs from `targetSampleRate`.
   *
   * @param {AudioBuffer} audioBuffer - Needs `duration`, `sampleRate`,
   *   `numberOfChannels` and `getChannelData(index)`.
   * @returns {Float32Array} Newly allocated mono samples.
   */
  convertAudioBuffer(audioBuffer) {
    const { duration, sampleRate, numberOfChannels } = audioBuffer;

    console.log('Converting audio buffer:', {
      duration: `${duration.toFixed(2)}s`,
      sampleRate,
      channels: numberOfChannels
    });

    if (sampleRate !== this.targetSampleRate) {
      console.warn(
        `⚠️ Audio sample rate is ${sampleRate}Hz but the model expects ` +
        `${this.targetSampleRate}Hz; classification quality may suffer`
      );
    }
    
    let rawAudio;
    if (numberOfChannels === 1) {
      // Mono audio - copy so the result never aliases the buffer's own storage
      rawAudio = new Float32Array(audioBuffer.getChannelData(0));
    } else {
      // Downmix every channel (not only the first two) by averaging
      const channels = Array.from(
        { length: numberOfChannels },
        (_, index) => audioBuffer.getChannelData(index)
      );
      const frameCount = channels[0].length;
      rawAudio = new Float32Array(frameCount);
      for (let i = 0; i < frameCount; i++) {
        // Accumulate in a JS number and round to float32 once, on store
        let sum = 0;
        for (const channel of channels) {
          sum += channel[i];
        }
        rawAudio[i] = sum / numberOfChannels;
      }
    }
    
    console.log('Converted to Float32Array, length:', rawAudio.length);
    return rawAudio;
  }

  /**
   * Turns raw pipeline output into the public tag format.
   *
   * @param {Array<{label: string, score: number}>} results - Pipeline output;
   *   the array is not modified.
   * @returns {Array<{label: string, confidence: number}>} The five
   *   highest-scoring entries, scores clamped to [0, 1]; two placeholder tags
   *   when `results` is not an array.
   */
  formatResults(results) {
    if (!Array.isArray(results)) {
      console.warn('⚠️ Unexpected results format:', results);
      return [
        { label: 'audio', confidence: 0.9 },
        { label: 'sound', confidence: 0.8 }
      ];
    }
    
    // Sort a copy (sort() works in place) by score and take top 5
    return [...results]
      .sort((a, b) => b.score - a.score)
      .slice(0, 5)
      .map((result) => ({
        label: result.label,
        confidence: Math.max(0, Math.min(1, result.score))
      }));
  }

  /**
   * Decodes an audio file into an AudioBuffer at `targetSampleRate`.
   *
   * A temporary AudioContext is created for the decode and always closed
   * afterwards, so repeated calls do not accumulate live contexts.
   *
   * @param {File|Blob} file - Needs `arrayBuffer()`; `name` and `size` are
   *   only used for logging.
   * @returns {Promise<AudioBuffer>} The decoded audio.
   * @throws {Error} "Audio decoding failed: ..." with the original error as `cause`.
   */
  async fileToAudioBuffer(file) {
    console.log('Decoding file:', file.name, `(${Math.round(file.size / 1024)}KB)`);
    
    let audioContext = null;
    try {
      const arrayBuffer = await file.arrayBuffer();
      const AudioContextClass = window.AudioContext || window.webkitAudioContext;
      // Ask the context for the model's rate so decodeAudioData resamples for us
      audioContext = new AudioContextClass({ sampleRate: this.targetSampleRate });
      const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
      
      console.log('File decoded successfully');
      return audioBuffer;
    } catch (error) {
      console.error('❌ File decoding failed:', error);
      throw new Error(`Audio decoding failed: ${error.message}`, { cause: error });
    } finally {
      if (audioContext !== null && typeof audioContext.close === 'function') {
        try {
          await audioContext.close();
        } catch (closeError) {
          // A failed close must not mask the decode result or the decode error
          console.warn('⚠️ Failed to close AudioContext:', closeError);
        }
      }
    }
  }
}

export default CLAPProcessor;