<title>Multimodal Media Analyzer</title>

  

    * { boxsizing: borderbox; margin: 0; padding: 0; }

    body {

      fontfamily: systemui, sansserif;

      maxwidth: 820px;

      margin: 0 auto;

      padding: 1.5rem 1rem;

      background: #f1f5f9;

      color: #1e293b;

    }

 

    header { marginbottom: 1.5rem; }

    header h1 { fontsize: 1.5rem; }

    header p  { color: #64748b; font-size: 0.9rem; margin-top: 0.2rem; }

 

    /* Model status indicators */

    .modelstatusbar {

      display: flex;

      gap: 0.5rem;

      flexwrap: wrap;

      margintop: 0.75rem;

    }

    .modelbadge {

      fontsize: 0.78rem;

      padding: 0.2rem 0.6rem;

      borderradius: 12px;

      background: #fef3c7;

      color: #92400e;

    }

    .modelbadge.ready { background: #dcfce7; color: #15803d; }

 

    /* Tab bar */

    .tabs {

      display: flex;

      background: white;

      borderradius: 8px;

      padding: 0.25rem;

      gap: 0.25rem;

      marginbottom: 1.25rem;

      border: 1px solid #e2e8f0;

    }

    .tab {

      flex: 1;

      padding: 0.5rem;

      textalign: center;

      borderradius: 6px;

      cursor: pointer;

      fontsize: 0.9rem;

      color: #64748b;

      transition: all 0.15s;

    }

    .tab.active { background: #2563eb; color: white; font-weight: 600; }

 

    /* Input panels */

    .panel { display: none; }

    .panel.active { display: block; }

 

    .uploadarea {

      background: white;

      border: 2px dashed #cbd5e1;

      borderradius: 8px;

      padding: 2rem;

      textalign: center;

      cursor: pointer;

    }

    .uploadarea input { display: none; }

 

    #img-preview {

      margintop: 1rem;

      maxwidth: 100%;

      maxheight: 320px;

      borderradius: 8px;

      display: none;

      objectfit: cover;

    }

 

    .miccenter { textalign: center; padding: 1rem 0; }

    #rec-btn {

      width: 72px; height: 72px;

      borderradius: 50%; border: none;

      background: #dc2626; color: white;

      fontsize: 1.6rem; cursor: pointer;

      display: flex; alignitems: center; justifycontent: center;

      margin: 0 auto 0.5rem;

    }

    #rec-btn.recording { background: #374151; }

    #rec-btn:disabled  { background: #94a3b8; cursor: not-allowed; }

    #rec-timer { font-weight: 600; color: #374151; margin-bottom: 0.25rem; }

    #rec-hint  { font-size: 0.85rem; color: #64748b; }

    #wave-canvas { display: block; margin: 0.5rem auto; border-radius: 4px; }

 

    /* Results grid */

    .resultsgrid {

      display: grid;

      gridtemplatecolumns: repeat(autofit, minmax(220px, 1fr));

      gap: 1rem;

      margintop: 1.25rem;

    }

    .resultcard {

      background: white;

      border: 1px solid #e2e8f0;

      borderradius: 8px;

      padding: 1rem;

    }

    .resultcard h3 {

      fontsize: 0.75rem;

      texttransform: uppercase;

      letterspacing: 0.06em;

      color: #64748b;

      marginbottom: 0.6rem;

    }

    .labelitem {

      display: flex;

      justifycontent: spacebetween;

      alignitems: center;

      padding: 0.25rem 0;

      fontsize: 0.875rem;

      borderbottom: 1px solid #f1f5f9;

    }

    .labelscore {

      fontsize: 0.8rem;

      color: #64748b;

      background: #f1f5f9;

      padding: 0.1rem 0.4rem;

      borderradius: 4px;

    }

    .captionbody {

      fontsize: 0.95rem;

      lineheight: 1.5;

      fontstyle: italic;

      color: #334155;

    }

    .transcriptbody {

      fontsize: 0.95rem;

      lineheight: 1.6;

      color: #334155;

      whitespace: prewrap;

    }

    .placeholdertext { color: #94a3b8; font-style: italic; font-size: 0.9rem; }

    #global-status {

      fontsize: 0.85rem;

      color: #64748b;

      marginbottom: 1rem;

    }

 

    @media (maxwidth: 500px) {

      .resultsgrid { gridtemplatecolumns: 1fr; }

    }

  

  <header>

    <h1>Multimodal Media Analyzer</h1>

    <p>Image classification, captioning, and speech transcription all in your browser.</p>

    <div class=“model-status-bar”>

      <span class=“model-badge” id=“badge-cls”>Classifier: loading...</span>

      <span class=“model-badge” id=“badge-cap”>Captioner: loading...</span>

      <span class=“model-badge” id=“badge-asr”>Whisper: loading...</span>

    </div>

  </header>

 

  <div id=“global-status”>Loading models in parallel first run downloads ~400 MB total.</div>

 

  <div class=“tabs”>

    <div class=“tab active” datatab=“image”>🖼 Image Analysis</div>

    <div class=“tab” datatab=“speech”>🎙 Speech Transcription</div>

  </div>

 

  <! Image panel >

  <div class=“panel active” id=“panel-image”>

    <div class=“upload-area” id=“img-drop”>

      

      <p>Click or drag an image to analyze</p>

      <p style=“font-size:0.8rem;color:#94a3b8;margin-top:0.3rem”>

        JPG, PNG, WebP, GIF supported

      </p>

    </div>

    <img id=“img-preview” alt=“Preview” />

  </div>

 

  <! Speech panel >

  <div class=“panel” id=“panel-speech”>

    <div class=“mic-center”>

      <button id=“rec-btn” disabled>🎙</button>

      <div id=“rec-timer”>0:00</div>

      <div id=“rec-hint”>Waiting for Whisper model...</div>

    </div>

    

  </div>

 

  <! Results shown for both modes >

  <div class=“results-grid” id=“results-grid” style=“display:none”>

    <! Image results (shown in image mode) >

    <div class=“result-card” id=“card-cls” style=“display:none”>

      <h3>Classification</h3>

      <div id=“cls-content”>

        <p class=“placeholder-text”>No results yet.</p>

      </div>

    </div>

    <div class=“result-card” id=“card-cap” style=“display:none”>

      <h3>Caption</h3>

      <div id=“cap-content”>

        <p class=“placeholder-text”>No caption yet.</p>

      </div>

    </div>

    <! Speech results (shown in speech mode) >

    <div class=“result-card” id=“card-asr” style=“display:none”>

      <h3>Transcription</h3>

      <div id=“asr-content”>

        <p class=“placeholder-text”>Record audio to see the transcription.</p>

      </div>

    </div>

  </div>

 

  

    import { pipeline }

      from ‘https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2’;

 

    // ── Pipeline references ───────────────────────────────────────────────

    let classifier, captioner, transcriber;

    let readyCount = 0;

 

    // Update a model badge to “ready” state

    function markReady(badgeId, label) {

      const badge = document.getElementById(badgeId);

      badge.textContent = `${label}: ready`;

      badge.classList.add(‘ready’);

      readyCount++;

      if (readyCount === 3) {

        globalStatus.textContent =

          ‘All models ready. Upload an image or record audio.’;

        recBtn.disabled = false;

        recHint.textContent = ‘Click to start recording.’;

      }

    }

 

    // Load all three pipelines simultaneously

    Promise.all([

      pipeline(‘image-classification’, ‘Xenova/vit-base-patch16-224’, {

        dtype: ‘q8’,

        progress_callback: p => p.status === ‘done’ && markReady(‘badge-cls’, ‘Classifier’)

      }),

      pipeline(‘image-to-text’, ‘Xenova/vit-gpt2-image-captioning’, {

        dtype: ‘q8’,

        progress_callback: p => p.status === ‘done’ && markReady(‘badge-cap’, ‘Captioner’)

      }),

      pipeline(‘automatic-speech-recognition’, ‘Xenova/whisper-tiny.en’, {

        dtype: ‘q8’,

        progress_callback: p => p.status === ‘done’ && markReady(‘badge-asr’, ‘Whisper’)

      })

    ]).then(([cls, cap, asr]) => {

      classifier  = cls;

      captioner   = cap;

      transcriber = asr;

    }).catch(err => {

      globalStatus.textContent = `Error loading models: ${err.message}`;

    });

 

    // ── UI references ─────────────────────────────────────────────────────

    const globalStatus = document.getElementById(‘global-status’);

    const resultsGrid  = document.getElementById(‘results-grid’);

    const recBtn       = document.getElementById(‘rec-btn’);

    const recHint      = document.getElementById(‘rec-hint’);

    const recTimer     = document.getElementById(‘rec-timer’);

    const waveCanvas   = document.getElementById(‘wave-canvas’);

    const waveCtx      = waveCanvas.getContext(‘2d’);

 

    // ── Image analysis ────────────────────────────────────────────────────

    async function analyzeImage(dataUrl) {

      if (!classifier || !captioner) {

        globalStatus.textContent = ‘Models still loading. Please wait.’;

        return;

      }

 

      globalStatus.textContent = ‘Running classification and captioning…’;

 

      // Show image result cards, hide speech card

      document.getElementById(‘card-cls’).style.display = ‘block’;

      document.getElementById(‘card-cap’).style.display = ‘block’;

      document.getElementById(‘card-asr’).style.display = ‘none’;

      resultsGrid.style.display = ‘grid’;

 

      document.getElementById(‘cls-content’).innerHTML =

        

Classifying…

;

      document.getElementById(‘cap-content’).innerHTML =

        

Generating caption…

;

 

      try {

        // Run classification and captioning in parallel

        const [classResults, captionResults] = await Promise.all([

          classifier(dataUrl, { top_k: 4 }),

          captioner(dataUrl, { max_new_tokens: 60 })

        ]);

 

        // Render classification labels

        document.getElementById(‘cls-content’).innerHTML =

          classResults.map(({ label, score }) => `

            <div class=“label-item”>

              <span>${label}</span>

              <span class=“label-score”>${(score * 100).toFixed(1)}%</span>

            </div>`).join();

 

        // Render generated caption

        document.getElementById(‘cap-content’).innerHTML =

          `<p class=“caption-body”>“${captionResults[0]?.generated_text ?? ‘No caption.’}”</p>`;

 

        globalStatus.textContent = ‘Analysis complete.’;

      } catch (err) {

        globalStatus.textContent = `Error: ${err.message}`;

      }

    }

 

    // File upload handler for images

    const imgDrop  = document.getElementById(‘img-drop’);

    const imgInput = document.getElementById(‘img-input’);

    const imgPrev  = document.getElementById(‘img-preview’);

 

    function handleImageFile(file) {

      if (!file?.type.startsWith(‘image/’)) return;

      const reader = new FileReader();

      reader.onload = e => {

        imgPrev.src = e.target.result;

        imgPrev.style.display = ‘block’;

        analyzeImage(e.target.result);

      };

      reader.readAsDataURL(file);

    }

 

    imgDrop.addEventListener(‘click’, () => imgInput.click());

    imgInput.addEventListener(‘change’, e => handleImageFile(e.target.files[0]));

    imgDrop.addEventListener(‘dragover’, e => e.preventDefault());

    imgDrop.addEventListener(‘drop’, e => {

      e.preventDefault();

      handleImageFile(e.dataTransfer.files[0]);

    });

 

    // ── Audio decoding helper ─────────────────────────────────────────────

    async function decodeAudio(arrayBuffer) {

      const audioCtx    = new AudioContext({ sampleRate: 16000 });

      const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);

      return audioBuffer.getChannelData(0);  // Mono Float32Array at 16kHz

    }

 

    // ── Speech transcription ──────────────────────────────────────────────

    async function runTranscription(audioData) {

      // Show speech result card, hide image cards

      document.getElementById(‘card-cls’).style.display = ‘none’;

      document.getElementById(‘card-cap’).style.display = ‘none’;

      document.getElementById(‘card-asr’).style.display = ‘block’;

      resultsGrid.style.display = ‘grid’;

 

      document.getElementById(‘asr-content’).innerHTML =

        

Transcribing…

;

 

      globalStatus.textContent = ‘Running Whisper transcription…’;

 

      try {

        const result = await transcriber(audioData, {

          chunk_length_s: 30,

          stride_length_s: 5

        });

        document.getElementById(‘asr-content’).innerHTML =

          `<p class=“transcript-body”>${result.text.trim()}</p>`;

        globalStatus.textContent = ‘Transcription complete.’;

      } catch (err) {

        globalStatus.textContent = `Error: ${err.message}`;

      }

    }

 

    // ── Microphone recording ──────────────────────────────────────────────

    let mediaRecorder, audioChunks = [], timerInterval, analyserNode, animId;

    let secs = 0;

 

    function drawWave() {

      const buf = new Uint8Array(analyserNode.frequencyBinCount);

      analyserNode.getByteTimeDomainData(buf);

      waveCtx.clearRect(0, 0, waveCanvas.width, waveCanvas.height);

      waveCtx.beginPath();

      waveCtx.strokeStyle = ‘#2563eb’;

      waveCtx.lineWidth = 1.5;

      buf.forEach((v, i) => {

        const x = (i / buf.length) * waveCanvas.width;

        const y = (v / 128.0) * (waveCanvas.height / 2);

        i === 0 ? waveCtx.moveTo(x, y) : waveCtx.lineTo(x, y);

      });

      waveCtx.stroke();

      animId = requestAnimationFrame(drawWave);

    }

 

    recBtn.addEventListener(‘click’, async () => {

      if (mediaRecorder?.state === ‘recording’) {

        mediaRecorder.stop();

        recBtn.classList.remove(‘recording’);

        recBtn.textContent = ‘🎙’;

        clearInterval(timerInterval);

        cancelAnimationFrame(animId);

        waveCtx.clearRect(0, 0, waveCanvas.width, waveCanvas.height);

        recHint.textContent = ‘Processing…’;

      } else {

        try {

          const stream  = await navigator.mediaDevices.getUserMedia({ audio: true });

          const actx    = new AudioContext();

          analyserNode  = actx.createAnalyser();

          actx.createMediaStreamSource(stream).connect(analyserNode);

          analyserNode.fftSize = 256;

 

          mediaRecorder = new MediaRecorder(stream);

          audioChunks   = [];

          mediaRecorder.ondataavailable = e => e.data.size && audioChunks.push(e.data);

          mediaRecorder.onstop = async () => {

            const blob        = new Blob(audioChunks, { type: ‘audio/webm’ });

            const arrayBuffer = await blob.arrayBuffer();

            const audioData   = await decodeAudio(arrayBuffer);

            stream.getTracks().forEach(t => t.stop());

            await runTranscription(audioData);

            recHint.textContent = ‘Click to record again.’;

          };

 

          mediaRecorder.start();

          recBtn.classList.add(‘recording’);

          recBtn.textContent = ‘⏹’;

          secs = 0;

          recTimer.textContent = ‘0:00’;

          timerInterval = setInterval(() => {

            secs++;

            recTimer.textContent =

              `${Math.floor(secs / 60)}:${String(secs % 60).padStart(2, ‘0’)}`;

          }, 1000);

          recHint.textContent = ‘Recording… click to stop.’;

          drawWave();

        } catch (err) {

          recHint.textContent = `Mic error: ${err.message}`;

        }

      }

    });

 

    // ── Tab switching ─────────────────────────────────────────────────────

    document.querySelectorAll(‘.tab’).forEach(tab => {

      tab.addEventListener(‘click’, () => {

        document.querySelectorAll(‘.tab, .panel’).forEach(el =>

          el.classList.remove(‘active’));

        tab.classList.add(‘active’);

        document.getElementById(`panel${tab.dataset.tab}`).classList.add(‘active’);

      });

    });



Source link