Spaces:
Running
Running
File size: 4,983 Bytes
e9a623f e791eef ec858b7 ee11310 938c18f e791eef ee11310 e791eef 938c18f ec858b7 ee11310 ec858b7 ee11310 938c18f e791eef ee11310 e791eef 77b219c ec858b7 ee11310 e791eef 77b219c ec858b7 77b219c e791eef 77b219c e9a623f 938c18f e791eef ee11310 e791eef 938c18f ee11310 e791eef ee11310 e791eef ee11310 77b219c ee11310 ec858b7 77b219c ee11310 ec858b7 77b219c ec858b7 ee11310 e791eef ec858b7 77b219c e791eef ee11310 ec858b7 ee11310 ec858b7 ee11310 ec858b7 ee11310 ec858b7 e791eef 938c18f ec858b7 938c18f ec858b7 ee11310 ec858b7 e791eef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import { pipeline } from '@xenova/transformers';
/**
 * Zero-shot audio tagger built on the transformers.js
 * `zero-shot-audio-classification` pipeline with the
 * `Xenova/clap-htsat-unfused` checkpoint.
 *
 * Typical flow: `fileToAudioBuffer(file)` -> `processAudio(audioBuffer)`,
 * which resolves to up to five `{ label, confidence }` tags.
 */
class CLAPProcessor {
  constructor() {
    // Pipeline instance; stays null until initialize() succeeds.
    this.classifier = null;
    this.isLoaded = false;

    // Sample rate requested when decoding files. The classifier receives raw
    // samples with no rate metadata, so decoding must already be at the rate
    // the model expects.
    // NOTE(review): 48 kHz is the rate in the CLAP feature-extractor config;
    // re-check this value if the model id in initialize() changes.
    this.targetSampleRate = 48000;

    // Candidate labels scored against every clip.
    this.candidateLabels = [
      'speech', 'male voice', 'female voice', 'narration', 'reading aloud', 'conversation',
      'music', 'singing', 'instrumental music', 'classical music', 'rock music', 'pop music',
      'jazz', 'electronic music', 'acoustic music', 'background music',
      'guitar', 'piano', 'drums', 'violin', 'trumpet', 'saxophone', 'flute',
      'nature sounds', 'rain', 'wind', 'ocean waves', 'birds chirping', 'water running',
      'ambient sounds', 'room tone', 'background noise', 'white noise',
      'animal sounds', 'dog barking', 'cat meowing', 'birds singing',
      'mechanical sounds', 'car engine', 'traffic', 'airplane', 'train', 'motorcycle',
      'household sounds', 'door closing', 'footsteps', 'typing', 'telephone ringing', 'alarm clock',
      'human sounds', 'applause', 'laughter', 'crying', 'coughing', 'sneezing', 'breathing',
      'silence', 'quiet', 'noise', 'distortion', 'static'
    ];
  }

  /**
   * Loads the classification pipeline once; later calls are no-ops.
   *
   * @returns {Promise<void>}
   * @throws {Error} "CLAP loading failed: ..." with the original error as `cause`.
   */
  async initialize() {
    if (this.isLoaded) return;

    try {
      console.log('Loading CLAP pipeline...');
      this.classifier = await pipeline(
        'zero-shot-audio-classification',
        'Xenova/clap-htsat-unfused'
      );
      this.isLoaded = true;
      console.log('CLAP pipeline ready!');
    } catch (error) {
      console.error('❌ CLAP initialization failed:', error);
      // Keep the original error reachable for callers that inspect `cause`.
      throw new Error(`CLAP loading failed: ${error.message}`, { cause: error });
    }
  }

  /**
   * Classifies an audio buffer against `candidateLabels`.
   *
   * Initialization errors propagate to the caller. Errors raised while
   * converting or classifying are logged and replaced by generic fallback
   * tags, so this method is best-effort once the pipeline is loaded.
   *
   * @param {AudioBuffer} audioBuffer - Decoded audio (see fileToAudioBuffer).
   * @returns {Promise<Array<{label: string, confidence: number}>>}
   */
  async processAudio(audioBuffer) {
    console.log('Processing audio...');
    if (!this.isLoaded) {
      await this.initialize();
    }

    try {
      // Convert AudioBuffer to raw mono audio data (Float32Array)
      const rawAudio = this.convertAudioBuffer(audioBuffer);
      console.log('Running classification...');
      console.log('Audio data type:', rawAudio.constructor.name, 'Length:', rawAudio.length);
      console.log('Audio data sample:', rawAudio.slice(0, 5));
      console.log('Is Float32Array?', rawAudio instanceof Float32Array);
      console.log('Candidate labels:', this.candidateLabels.slice(0, 3));

      // The pipeline takes the raw samples and the labels as separate arguments.
      const results = await this.classifier(rawAudio, this.candidateLabels);
      console.log('Classification results:', results);

      const formattedTags = this.formatResults(results);
      console.log('Final tags:', formattedTags);
      return formattedTags;
    } catch (error) {
      console.error('❌ Audio processing error:', error);
      // Deliberate best-effort: callers always receive a tag list.
      return [
        { label: 'audio', confidence: 0.9 },
        { label: 'sound', confidence: 0.8 },
        { label: 'unknown', confidence: 0.5 }
      ];
    }
  }

  /**
   * Downmixes an AudioBuffer-like object to a new mono Float32Array.
   *
   * Mono input is copied. Input with two or more channels is averaged across
   * ALL channels (not just the first two), so surround material is not
   * silently truncated.
   *
   * @param {AudioBuffer} audioBuffer - Must expose `duration`, `sampleRate`,
   *   `numberOfChannels` and `getChannelData(index)`.
   * @returns {Float32Array} Mono samples, one per frame of channel 0.
   * @throws {Error} If the buffer reports zero channels.
   */
  convertAudioBuffer(audioBuffer) {
    const { duration, sampleRate, numberOfChannels } = audioBuffer;
    console.log('Converting audio buffer:', {
      duration: duration.toFixed(2) + 's',
      sampleRate: sampleRate,
      channels: numberOfChannels
    });

    if (sampleRate !== this.targetSampleRate) {
      // No resampling happens here; surface the mismatch instead of hiding it.
      console.warn(
        `⚠️ Audio sample rate is ${sampleRate} Hz but ${this.targetSampleRate} Hz was expected; ` +
        'classification accuracy may suffer.'
      );
    }

    const channels = Array.from({ length: numberOfChannels }, (_, index) =>
      audioBuffer.getChannelData(index)
    );
    if (channels.length === 0) {
      throw new Error('Audio buffer has no channels');
    }

    let rawAudio;
    if (channels.length === 1) {
      // Copy so later processing never aliases the buffer's own storage.
      rawAudio = new Float32Array(channels[0]);
    } else {
      const frameCount = channels[0].length;
      rawAudio = new Float32Array(frameCount);
      for (let i = 0; i < frameCount; i++) {
        // Sum in a plain Number, then store once, so stereo input produces
        // the same values as (left + right) / 2.
        let sum = 0;
        for (const channel of channels) {
          sum += channel[i];
        }
        rawAudio[i] = sum / channels.length;
      }
    }

    console.log('Converted to Float32Array, length:', rawAudio.length);
    return rawAudio;
  }

  /**
   * Turns raw pipeline output into the top five tags.
   *
   * @param {Array<{label: string, score: number}>} results - Pipeline output.
   *   Anything that is not an array yields two generic fallback tags.
   * @returns {Array<{label: string, confidence: number}>} At most five tags,
   *   highest score first, with confidence clamped to [0, 1].
   */
  formatResults(results) {
    if (!Array.isArray(results)) {
      console.warn('⚠️ Unexpected results format:', results);
      return [
        { label: 'audio', confidence: 0.9 },
        { label: 'sound', confidence: 0.8 }
      ];
    }

    // Sort a copy: Array.prototype.sort mutates, and the input belongs to the caller.
    return [...results]
      .sort((a, b) => b.score - a.score)
      .slice(0, 5)
      .map((result) => ({
        label: result.label,
        confidence: Math.max(0, Math.min(1, result.score))
      }));
  }

  /**
   * Decodes a File/Blob-like object into an AudioBuffer via the Web Audio API.
   *
   * A temporary AudioContext is created at `targetSampleRate` (falling back
   * to the browser default if that rate is rejected) and is always closed
   * afterwards, because browsers cap the number of live contexts.
   *
   * @param {File} file - Must expose `name`, `size` and `arrayBuffer()`.
   * @returns {Promise<AudioBuffer>}
   * @throws {Error} "Audio decoding failed: ..." with the original error as `cause`.
   */
  async fileToAudioBuffer(file) {
    console.log('Decoding file:', file.name, `(${Math.round(file.size / 1024)}KB)`);

    let audioContext = null;
    try {
      const arrayBuffer = await file.arrayBuffer();
      const AudioContextClass = window.AudioContext || window.webkitAudioContext;
      try {
        // decodeAudioData resamples to the context's rate, so ask for the
        // model's rate up front.
        audioContext = new AudioContextClass({ sampleRate: this.targetSampleRate });
      } catch (rateError) {
        console.warn('⚠️ Could not create AudioContext at target sample rate:', rateError);
        audioContext = new AudioContextClass();
      }
      const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
      console.log('File decoded successfully');
      return audioBuffer;
    } catch (error) {
      console.error('❌ File decoding failed:', error);
      throw new Error(`Audio decoding failed: ${error.message}`, { cause: error });
    } finally {
      if (audioContext && typeof audioContext.close === 'function') {
        try {
          await audioContext.close();
        } catch (closeError) {
          // Cleanup failure must not mask the decode result or error.
          console.warn('⚠️ Failed to close AudioContext:', closeError);
        }
      }
    }
  }
}
// Default export: the CLAPProcessor class defined in this module.
export default CLAPProcessor;