faster-whisper — Skillopedia

Faster Whisper

Local speech-to-text using faster-whisper — a CTranslate2 reimplementation of OpenAI's Whisper that runs 4-6x faster with identical accuracy. With GPU acceleration, expect 20x realtime transcription (a 10-minute audio file in 30 seconds).

When to Use

Use this skill when you need to:
- Transcribe audio/video files — meetings, interviews, podcasts, lectures, YouTube videos
- Generate subtitles — SRT, VTT, ASS, LRC, or TTML broadcast-standard subtitles
- Identify speakers — diarization labels who said what (`--diarize`)
- Transcribe from URLs — YouTube links and direct audio URLs (auto-down…

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, re.I),\n re.compile(r'thank\\s+you\\s+for\\s+watching', re.I),\n re.compile(r'thank\\s+you\\s+for\\s+(listening|your\\s+attention)', re.I),\n re.compile(r'subtitles?\\s+by', re.I),\n re.compile(r'(transcribed|captioned)\\s+by', re.I),\n re.compile(r'^\\s*www\\.\\S+\\s*

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, re.I),\n re.compile(r'^\\s*[.!?,;:\\u2026]+\\s*

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

), # lone punctuation / ellipsis\n re.compile(r'^\\s*

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

), # empty\n]\n\n\ndef filter_hallucinations(segments):\n \"\"\"Remove segments matching common Whisper hallucination patterns.\"\"\"\n filtered = []\n prev_text = None\n for seg in segments:\n text = seg.get(\"text\", \"\").strip()\n if any(p.search(text) for p in HALLUCINATION_PATTERNS):\n continue\n if text == prev_text: # exact duplicate consecutive segment\n continue\n prev_text = text\n filtered.append(seg)\n return filtered\n\n\n# ---------------------------------------------------------------------------\n# Channel extraction\n# ---------------------------------------------------------------------------\n\ndef extract_channel(audio_path, channel, quiet=False):\n \"\"\"Extract a stereo channel from audio using ffmpeg.\n\n channel: 'left' (c0), 'right' (c1), or 'mix' (no-op, returns original).\n Returns (output_path, tmp_path_to_cleanup_or_None).\n \"\"\"\n if channel == \"mix\":\n return audio_path, None\n\n if not shutil.which(\"ffmpeg\"):\n if not quiet:\n print(\"⚠️ ffmpeg not found — cannot extract channel; using full mix\", file=sys.stderr)\n return audio_path, None\n\n pan = \"c0\" if channel == \"left\" else \"c1\"\n tmp_path = audio_path + f\".{channel}.wav\"\n cmd = [\n \"ffmpeg\", \"-y\", \"-i\", audio_path,\n \"-af\", f\"pan=mono|c0={pan}\",\n \"-ar\", \"16000\",\n tmp_path,\n ]\n if not quiet:\n print(f\"🎚️ Extracting {channel} channel...\", file=sys.stderr)\n try:\n subprocess.run(cmd, check=True, capture_output=True)\n return tmp_path, tmp_path\n except subprocess.CalledProcessError:\n if not quiet:\n print(\"⚠️ Channel extraction failed; using full mix\", file=sys.stderr)\n if os.path.exists(tmp_path):\n os.remove(tmp_path)\n return audio_path, None\n\n\n# ---------------------------------------------------------------------------\n# Filler word removal\n# ---------------------------------------------------------------------------\n\n_FILLER_PATTERNS = [\n # Single-word hesitation sounds (word boundary match)\n 
re.compile(r'\\b(um+|uh+|er+|ah+|hmm+|hm+)\\b', re.I),\n # Discourse markers (case-insensitive, word boundaries)\n re.compile(r'\\byou know\\b', re.I),\n re.compile(r'\\bI mean\\b', re.I),\n re.compile(r'\\byou see\\b', re.I),\n]\n\n# Single-word filler matcher (stripped of surrounding whitespace and punctuation)\n_FILLER_WORD_RE = re.compile(r'^(um+|uh+|er+|ah+|hmm+|hm+)

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, re.I)\n\n# Multi-word discourse markers as tuples of lowercased bare words\n_FILLER_BIGRAMS = [(\"you\", \"know\"), (\"i\", \"mean\"), (\"you\", \"see\")]\n\n\ndef _word_bare(w):\n \"\"\"Return the bare lowercased text of a word token (strip spaces + punctuation).\"\"\"\n return re.sub(r\"[^\\w']\", \"\", w[\"word\"].lower().strip())\n\n\ndef _filter_word_list(words):\n \"\"\"Remove filler words from a word list.\n\n Removes single-word hesitations and multi-word discourse markers.\n Returns a new list (does not mutate the input).\n \"\"\"\n if not words:\n return words\n\n # First pass: mark words to remove\n remove_idx = set()\n\n # Single-word fillers\n for i, w in enumerate(words):\n if _FILLER_WORD_RE.match(_word_bare(w)):\n remove_idx.add(i)\n\n # Multi-word bigram markers\n for i in range(len(words) - 1):\n if i in remove_idx or i + 1 in remove_idx:\n continue\n pair = (_word_bare(words[i]), _word_bare(words[i + 1]))\n if pair in _FILLER_BIGRAMS:\n remove_idx.add(i)\n remove_idx.add(i + 1)\n\n return [w for idx, w in enumerate(words) if idx not in remove_idx]\n\n\ndef remove_filler_words(segments):\n \"\"\"Strip hesitation fillers and discourse markers from segment text and word list.\n\n Modifies segment['text'] using regex substitution and also filters\n segment['words'] to remove matching word tokens. Drops segments that\n become empty after cleaning.\n \"\"\"\n cleaned = []\n for seg in segments:\n text = seg[\"text\"]\n for pat in _FILLER_PATTERNS:\n text = pat.sub(\"\", text)\n # Remove leading punctuation left behind after filler removal\n text = re.sub(r'^[\\s,.!?;:]+', '', text)\n # Fix up punctuation spacing: remove spaces before punctuation\n text = re.sub(r'\\s+([,.!?;:])', r'\\1', text)\n # Collapse consecutive identical punctuation (e.g. \",,\" → \",\")\n text = re.sub(r'([,.!?;:])\\1+', r'\\1', text)\n # Remove orphaned commas before terminal punctuation (e.g. 
\",?\" → \"?\")\n text = re.sub(r',([.!?])', r'\\1', text)\n # Collapse multiple spaces\n text = re.sub(r' +', ' ', text)\n text = text.strip()\n if not text:\n continue\n seg = dict(seg) # shallow copy to avoid mutating original\n seg[\"text\"] = text\n # Also clean the word list so word-split formatters (--max-words-per-line,\n # --max-chars-per-line) don't re-introduce filler words in SRT/VTT/ASS/TTML.\n if seg.get(\"words\"):\n seg[\"words\"] = _filter_word_list(seg[\"words\"])\n cleaned.append(seg)\n return cleaned\n\n\n# ---------------------------------------------------------------------------\n# Paragraph detection\n# ---------------------------------------------------------------------------\n\ndef detect_paragraphs(segments, min_gap=3.0, sentence_gap=1.5):\n \"\"\"Mark segment dicts with 'paragraph_start': True at paragraph boundaries.\n\n A new paragraph starts when:\n - The gap to the previous segment >= min_gap seconds, OR\n - The previous segment ends a sentence (terminal punct) AND\n the gap >= sentence_gap seconds.\n The first segment always gets paragraph_start = True.\n Uses _TERMINAL_PUNCT defined in the merge_sentences section below.\n \"\"\"\n if not segments:\n return segments\n segments[0][\"paragraph_start\"] = True\n for i in range(1, len(segments)):\n prev = segments[i - 1]\n curr = segments[i]\n gap = curr[\"start\"] - prev[\"end\"]\n prev_text = prev.get(\"text\", \"\").rstrip()\n ends_sentence = bool(_TERMINAL_PUNCT.search(prev_text))\n if gap >= min_gap or (ends_sentence and gap >= sentence_gap):\n curr[\"paragraph_start\"] = True\n return segments\n\n\n# ---------------------------------------------------------------------------\n# Character-based subtitle line splitting\n# ---------------------------------------------------------------------------\n\ndef split_words_by_chars(words, max_chars):\n \"\"\"Split a list of word dicts into chunks where each chunk's joined text\n fits within max_chars characters.\n\n Returns a list of word 
lists (chunks).\n \"\"\"\n if not words:\n return [words]\n chunks = []\n current = []\n current_len = 0\n for w in words:\n word_text = w[\"word\"]\n candidate_len = current_len + len(word_text)\n if current and candidate_len > max_chars:\n chunks.append(current)\n current = [w]\n current_len = len(word_text)\n else:\n current.append(w)\n current_len = candidate_len\n if current:\n chunks.append(current)\n return chunks\n\n\n# ---------------------------------------------------------------------------\n# Speaker name mapping\n# ---------------------------------------------------------------------------\n\ndef apply_speaker_names(segments, names_str):\n \"\"\"Replace SPEAKER_1, SPEAKER_2, … with real names from a comma-separated list.\"\"\"\n names = [n.strip() for n in names_str.split(\",\") if n.strip()]\n mapping = {}\n for seg in segments:\n raw = seg.get(\"speaker\", \"\")\n if raw and raw.startswith(\"SPEAKER_\"):\n if raw not in mapping:\n try:\n idx = int(raw.split(\"_\", 1)[1]) - 1\n mapping[raw] = names[idx] if 0 \u003c= idx \u003c len(names) else raw\n except (ValueError, IndexError):\n mapping[raw] = raw\n seg[\"speaker\"] = mapping[raw]\n if seg.get(\"words\"):\n for w in seg[\"words\"]:\n if w.get(\"speaker\") == raw:\n w[\"speaker\"] = mapping[raw]\n return segments\n\n\n# ---------------------------------------------------------------------------\n# Subtitle burn-in\n# ---------------------------------------------------------------------------\n\ndef burn_subtitles(video_path, srt_content, output_path, quiet=False):\n \"\"\"Burn SRT subtitles into a video file using ffmpeg.\"\"\"\n tmp_srt = None\n try:\n with tempfile.NamedTemporaryFile(\n mode=\"w\", suffix=\".srt\", delete=False, encoding=\"utf-8\"\n ) as f:\n f.write(srt_content)\n tmp_srt = f.name\n\n # Escape colons/backslashes in path for ffmpeg filtergraph\n escaped = tmp_srt.replace(\"\\\\\", \"/\").replace(\":\", \"\\\\:\")\n cmd = [\n \"ffmpeg\", \"-y\", \"-i\", video_path,\n \"-vf\", 
f\"subtitles={escaped}\",\n \"-c:a\", \"copy\",\n output_path,\n ]\n if not quiet:\n print(f\"🎬 Burning subtitles into {output_path}...\", file=sys.stderr)\n subprocess.run(cmd, check=True)\n else:\n subprocess.run(cmd, check=True, capture_output=True)\n if not quiet:\n print(f\"✅ Burned: {output_path}\", file=sys.stderr)\n except subprocess.CalledProcessError as e:\n print(f\"⚠️ Burn-in failed: {e}\", file=sys.stderr)\n finally:\n if tmp_srt and os.path.exists(tmp_srt):\n os.unlink(tmp_srt)\n\n\n# ---------------------------------------------------------------------------\n# URL download\n# ---------------------------------------------------------------------------\n\ndef download_url(url, quiet=False):\n \"\"\"Download audio from URL using yt-dlp. Returns (audio_path, tmpdir).\"\"\"\n ytdlp = shutil.which(\"yt-dlp\")\n if not ytdlp:\n pipx_path = Path.home() / \".local/share/pipx/venvs/yt-dlp/bin/yt-dlp\"\n if pipx_path.exists():\n ytdlp = str(pipx_path)\n else:\n print(\"Error: yt-dlp not found. 
Install with: pipx install yt-dlp\", file=sys.stderr)\n sys.exit(1)\n\n tmpdir = tempfile.mkdtemp(prefix=\"faster-whisper-\")\n out_tmpl = os.path.join(tmpdir, \"audio.%(ext)s\")\n\n cmd = [ytdlp, \"-x\", \"--audio-format\", \"mp3\", \"-o\", out_tmpl, \"--no-playlist\"]\n if quiet:\n cmd.append(\"-q\")\n cmd.append(url)\n\n if not quiet:\n print(\"⬇️ Downloading audio from URL...\", file=sys.stderr)\n\n try:\n subprocess.run(cmd, check=True, capture_output=quiet)\n except subprocess.CalledProcessError as e:\n print(f\"Error downloading URL: {e}\", file=sys.stderr)\n shutil.rmtree(tmpdir, ignore_errors=True)\n sys.exit(1)\n\n files = list(Path(tmpdir).glob(\"audio.*\"))\n if not files:\n print(\"Error: No audio file downloaded\", file=sys.stderr)\n shutil.rmtree(tmpdir, ignore_errors=True)\n sys.exit(1)\n\n return str(files[0]), tmpdir\n\n\n# ---------------------------------------------------------------------------\n# RSS / Podcast feed\n# ---------------------------------------------------------------------------\n\ndef fetch_rss_episodes(rss_url, latest=5, quiet=False):\n \"\"\"Parse a podcast RSS feed and return audio enclosure URLs.\n\n Returns list of (url, title) tuples, newest-first (standard RSS order).\n Uses only stdlib — no extra dependencies.\n \"\"\"\n import urllib.request\n import xml.etree.ElementTree as ET\n\n if not quiet:\n print(f\"📡 Fetching RSS feed: {rss_url}\", file=sys.stderr)\n\n try:\n req = urllib.request.Request(\n rss_url, headers={\"User-Agent\": \"faster-whisper-skill/1.0\"}\n )\n with urllib.request.urlopen(req, timeout=30) as resp:\n xml_data = resp.read()\n except Exception as e:\n print(f\"Error fetching RSS feed: {e}\", file=sys.stderr)\n sys.exit(1)\n\n try:\n root = ET.fromstring(xml_data)\n except ET.ParseError as e:\n print(f\"Error parsing RSS XML: {e}\", file=sys.stderr)\n sys.exit(1)\n\n items = root.findall(\".//item\")\n if not items:\n print(\"Error: No \u003citem> elements found in RSS feed\", file=sys.stderr)\n 
sys.exit(1)\n\n episodes = []\n for item in items:\n enclosure = item.find(\"enclosure\")\n if enclosure is None:\n continue\n url = (enclosure.get(\"url\") or \"\").strip()\n if not url:\n continue\n title_el = item.find(\"title\")\n title = (title_el.text or url).strip() if title_el is not None else url\n episodes.append((url, title))\n\n if not episodes:\n print(\"Error: No audio \u003cenclosure> elements found in RSS feed\", file=sys.stderr)\n sys.exit(1)\n\n total = len(episodes)\n take = min(latest, total) if latest else total\n if not quiet:\n print(f\" Found {total} episode(s) — processing {take}\", file=sys.stderr)\n\n return episodes[:take] if latest else episodes\n\n\n# ---------------------------------------------------------------------------\n# Audio preprocessing\n# ---------------------------------------------------------------------------\n\ndef preprocess_audio(audio_path, normalize=False, denoise=False, quiet=False):\n \"\"\"Preprocess audio with ffmpeg filters (normalize volume, reduce noise).\n\n Returns (processed_path, tmp_path_to_cleanup_or_None).\n \"\"\"\n if not normalize and not denoise:\n return audio_path, None\n\n filters = []\n if denoise:\n # High-pass to remove rumble + FFT-based noise reduction\n filters.append(\"highpass=f=200\")\n filters.append(\"afftdn=nf=-25\")\n if normalize:\n # EBU R128 loudness normalization\n filters.append(\"loudnorm=I=-16:TP=-1.5:LRA=11\")\n\n tmp_path = audio_path + \".preprocessed.wav\"\n filter_str = \",\".join(filters)\n cmd = [\n \"ffmpeg\", \"-y\", \"-i\", audio_path,\n \"-af\", filter_str,\n \"-ar\", \"16000\", \"-ac\", \"1\",\n tmp_path,\n ]\n\n if not quiet:\n labels = []\n if normalize:\n labels.append(\"normalizing\")\n if denoise:\n labels.append(\"denoising\")\n print(f\"🔧 Preprocessing: {' + '.join(labels)}...\", file=sys.stderr)\n\n try:\n subprocess.run(cmd, check=True, capture_output=True)\n return tmp_path, tmp_path\n except subprocess.CalledProcessError:\n if not quiet:\n print(\"⚠️ 
Preprocessing failed, using original audio\", file=sys.stderr)\n if os.path.exists(tmp_path):\n os.remove(tmp_path)\n return audio_path, None\n\n\n# ---------------------------------------------------------------------------\n# Word-level alignment (wav2vec2)\n# ---------------------------------------------------------------------------\n\n_align_cache = {} # reuse model across files in batch mode\n\n# Characters to strip before alignment (numbers, punctuation except apostrophe)\n_ALIGN_CLEAN = re.compile(r\"[^a-z'\\u00e0-\\u00ff]\") # keep letters, ', accented\n\n\ndef run_alignment(audio_path, segments, quiet=False):\n \"\"\"Refine word timestamps using wav2vec2 forced alignment (MMS model).\n\n Tokenises each word into character-level token groups, concatenates\n them, runs CTC forced alignment on the segment emission, then maps\n aligned spans back to words. Falls back per-segment on failure.\n \"\"\"\n global _align_cache\n\n try:\n import torch\n import torchaudio\n except ImportError:\n print(\n \"Error: torchaudio not installed (required for --precise).\\n\"\n \" Reinstall with: ./setup.sh\",\n file=sys.stderr,\n )\n sys.exit(1)\n\n if not quiet:\n print(\"🎯 Refining word timestamps (wav2vec2)...\", file=sys.stderr)\n\n # --- load / cache model ---------------------------------------------------\n if \"model\" not in _align_cache:\n bundle = torchaudio.pipelines.MMS_FA\n model = bundle.get_model()\n try:\n if torch.cuda.is_available():\n model = model.to(\"cuda\")\n _align_cache[\"device\"] = \"cuda\"\n else:\n _align_cache[\"device\"] = \"cpu\"\n except Exception:\n _align_cache[\"device\"] = \"cpu\"\n\n _align_cache[\"model\"] = model\n _align_cache[\"tokenizer\"] = bundle.get_tokenizer()\n _align_cache[\"aligner\"] = bundle.get_aligner()\n _align_cache[\"sample_rate\"] = bundle.sample_rate\n\n model = _align_cache[\"model\"]\n tokenizer = _align_cache[\"tokenizer\"]\n aligner = _align_cache[\"aligner\"]\n target_sr = _align_cache[\"sample_rate\"]\n 
device = _align_cache[\"device\"]\n\n # --- load audio -----------------------------------------------------------\n waveform, sr = torchaudio.load(audio_path)\n if waveform.shape[0] > 1:\n waveform = waveform.mean(dim=0, keepdim=True) # stereo → mono\n if sr != target_sr:\n waveform = torchaudio.functional.resample(waveform, sr, target_sr)\n sr = target_sr\n\n # --- emissions (one pass over full audio) ---------------------------------\n with torch.inference_mode():\n emission, _ = model(waveform.to(device))\n emission = emission[0].cpu() # (num_frames, num_classes)\n\n num_samples = waveform.shape[1]\n num_frames = emission.shape[0]\n frame_dur = (num_samples / num_frames) / sr # seconds per emission frame\n\n aligned_count = 0\n\n for seg in segments:\n words = seg.get(\"words\")\n if not words:\n continue\n\n # tokenise each word → list of token groups [[t], [t], ...]\n word_map = [] # (index-in-words, token_groups, group_count)\n all_groups = []\n for i, w in enumerate(words):\n raw = w[\"word\"].strip().lower()\n cleaned = _ALIGN_CLEAN.sub(\"\", raw)\n if not cleaned:\n continue\n try:\n groups = tokenizer(cleaned) # [[t1], [t2], ...] 
per char\n if groups:\n word_map.append((i, len(groups)))\n all_groups.extend(groups)\n except Exception:\n continue\n\n if not all_groups:\n continue\n\n # slice emission for this segment\n seg_start_frame = max(0, int(seg[\"start\"] / frame_dur))\n seg_end_frame = min(num_frames, int(seg[\"end\"] / frame_dur))\n seg_emission = emission[seg_start_frame:seg_end_frame]\n\n if seg_emission.shape[0] \u003c len(all_groups):\n continue\n\n try:\n # aligner expects List[List[int]], returns List[List[TokenSpan]]\n all_spans = aligner(seg_emission, all_groups)\n except Exception:\n continue\n\n if len(all_spans) != len(all_groups):\n continue\n\n # map spans back to words by group count\n grp_idx = 0\n for orig_idx, count in word_map:\n char_spans = all_spans[grp_idx : grp_idx + count]\n grp_idx += count\n\n # each char_spans[j] is [TokenSpan, ...] for one character\n first = char_spans[0] if char_spans else []\n last = char_spans[-1] if char_spans else []\n if not first or not last:\n continue\n\n start_t = round((seg_start_frame + first[0].start) * frame_dur, 3)\n end_t = round((seg_start_frame + last[-1].end) * frame_dur, 3)\n\n words[orig_idx][\"start\"] = start_t\n words[orig_idx][\"end\"] = end_t\n aligned_count += 1\n\n # tighten segment boundaries to aligned words\n valid = [w for w in words if w.get(\"start\") is not None]\n if valid:\n seg[\"start\"] = valid[0][\"start\"]\n seg[\"end\"] = valid[-1][\"end\"]\n\n if not quiet:\n print(f\" Refined {aligned_count} word timestamps\", file=sys.stderr)\n\n return segments\n\n\n# ---------------------------------------------------------------------------\n# Speaker diarization\n# ---------------------------------------------------------------------------\n\ndef run_diarization(audio_path, segments, quiet=False, min_speakers=None, max_speakers=None, hf_token=None):\n \"\"\"Assign speaker labels to segments using pyannote.audio.\"\"\"\n try:\n from pyannote.audio import Pipeline as PyannotePipeline\n except ImportError:\n 
print(\n \"Error: pyannote.audio not installed.\\n\"\n \" Install: ./setup.sh --diarize\\n\"\n \" Or: pip install pyannote.audio\",\n file=sys.stderr,\n )\n sys.exit(1)\n\n if not quiet:\n print(\"🔊 Running speaker diarization...\", file=sys.stderr)\n\n try:\n pretrained_kwargs = {}\n if hf_token:\n pretrained_kwargs[\"use_auth_token\"] = hf_token\n pipeline = PyannotePipeline.from_pretrained(\n \"pyannote/speaker-diarization-3.1\",\n **pretrained_kwargs,\n )\n except Exception as e:\n print(f\"Error loading diarization model: {e}\", file=sys.stderr)\n print(\n \" Ensure you have a HuggingFace token at ~/.cache/huggingface/token\\n\"\n \" and accepted: https://hf.co/pyannote/speaker-diarization-3.1\",\n file=sys.stderr,\n )\n sys.exit(1)\n\n # Move to GPU if available\n try:\n import torch\n if torch.cuda.is_available():\n pipeline.to(torch.device(\"cuda\"))\n except Exception:\n pass\n\n # pyannote works best with WAV; convert compressed formats to avoid\n # sample-count mismatches (known issue with MP3/OGG)\n diarize_path = audio_path\n tmp_wav = None\n if not audio_path.lower().endswith(\".wav\"):\n tmp_wav = audio_path + \".diarize.wav\"\n try:\n subprocess.run(\n [\"ffmpeg\", \"-y\", \"-i\", audio_path, \"-ar\", \"16000\", \"-ac\", \"1\", tmp_wav],\n check=True, capture_output=True,\n )\n diarize_path = tmp_wav\n except Exception:\n # Fall back to original file if conversion fails\n tmp_wav = None\n\n try:\n diarize_kwargs = {}\n if min_speakers is not None:\n diarize_kwargs[\"min_speakers\"] = min_speakers\n if max_speakers is not None:\n diarize_kwargs[\"max_speakers\"] = max_speakers\n diarize_result = pipeline(diarize_path, **diarize_kwargs)\n finally:\n if tmp_wav and os.path.exists(tmp_wav):\n os.remove(tmp_wav)\n\n # pyannote 4.x returns DiarizeOutput with .speaker_diarization attribute;\n # pyannote 3.x returns an Annotation directly\n if hasattr(diarize_result, \"speaker_diarization\"):\n annotation = diarize_result.speaker_diarization\n else:\n 
annotation = diarize_result\n\n # Build speaker timeline\n timeline = [\n {\"start\": turn.start, \"end\": turn.end, \"speaker\": speaker}\n for turn, _, speaker in annotation.itertracks(yield_label=True)\n ]\n\n def speaker_at(t):\n \"\"\"Find the speaker at a given timestamp by max overlap with a point.\"\"\"\n best, best_overlap = None, 0\n for tl in timeline:\n if tl[\"start\"] \u003c= t \u003c= tl[\"end\"]:\n overlap = min(tl[\"end\"], t + 0.01) - max(tl[\"start\"], t)\n if overlap > best_overlap:\n best_overlap = overlap\n best = tl[\"speaker\"]\n return best\n\n # Collect all words across segments for word-level speaker assignment\n all_words = []\n for seg in segments:\n if seg.get(\"words\"):\n all_words.extend(seg[\"words\"])\n\n if all_words:\n # Word-level diarization: assign speaker to each word, then regroup\n # into speaker-homogeneous segments\n for w in all_words:\n mid = (w[\"start\"] + w[\"end\"]) / 2\n w[\"speaker\"] = speaker_at(mid)\n\n # Group consecutive words by speaker into new segments\n new_segments = []\n current_speaker = None\n current_words = []\n\n def flush_group():\n if not current_words:\n return\n new_segments.append({\n \"start\": current_words[0][\"start\"],\n \"end\": current_words[-1][\"end\"],\n \"text\": \"\".join(w[\"word\"] for w in current_words),\n \"speaker\": current_speaker,\n \"words\": list(current_words),\n })\n\n for w in all_words:\n sp = w.get(\"speaker\")\n if sp != current_speaker and current_words:\n flush_group()\n current_words = []\n current_speaker = sp\n current_words.append(w)\n flush_group()\n\n segments = new_segments\n else:\n # No word-level data: fall back to segment-level assignment\n for seg in segments:\n mid = (seg[\"start\"] + seg[\"end\"]) / 2\n seg[\"speaker\"] = speaker_at(mid)\n\n # Rename to SPEAKER_1, SPEAKER_2, ... 
in order of appearance\n seen = {}\n for seg in segments:\n raw = seg.get(\"speaker\")\n if raw and raw not in seen:\n seen[raw] = f\"SPEAKER_{len(seen) + 1}\"\n if raw:\n seg[\"speaker\"] = seen[raw]\n\n if not quiet:\n print(f\" Found {len(seen)} speaker(s)\", file=sys.stderr)\n\n return segments, list(seen.values())\n\n\n# ---------------------------------------------------------------------------\n# Speaker audio export\n# ---------------------------------------------------------------------------\n\ndef export_speakers_audio(audio_path, segments, output_dir, quiet=False):\n \"\"\"Export each speaker's audio as a separate WAV file.\n\n Groups diarized segments by speaker and uses ffmpeg's *aselect* filter to\n extract and concatenate each speaker's turns into a single file.\n Requires ffmpeg and diarized segments (speaker field on each segment).\n \"\"\"\n if not shutil.which(\"ffmpeg\"):\n print(\"⚠️ --export-speakers requires ffmpeg in PATH\", file=sys.stderr)\n return\n\n # Group by speaker\n speaker_ranges = {}\n for seg in segments:\n sp = seg.get(\"speaker\")\n if not sp:\n continue\n speaker_ranges.setdefault(sp, []).append((seg[\"start\"], seg[\"end\"]))\n\n if not speaker_ranges:\n print(\n \"⚠️ No speaker-labeled segments found — diarization produced no speakers.\\n\"\n \" This usually means no speech was detected in the audio.\",\n file=sys.stderr,\n )\n return\n\n out_dir = Path(output_dir)\n out_dir.mkdir(parents=True, exist_ok=True)\n\n for speaker, ranges in sorted(speaker_ranges.items()):\n out_file = out_dir / f\"{speaker}.wav\"\n\n # Build aselect expression: 'between(t,S,E)+between(t,S,E)+...'\n select_expr = \"+\".join(\n f\"between(t,{start:.3f},{end:.3f})\" for start, end in ranges\n )\n\n cmd = [\n \"ffmpeg\", \"-y\", \"-i\", audio_path,\n \"-af\", f\"aselect='{select_expr}',asetpts=N/SR/TB\",\n str(out_file),\n ]\n\n total_dur = sum(e - s for s, e in ranges)\n if not quiet:\n print(\n f\"🎤 Exporting {speaker}: {len(ranges)} segment(s), 
\"\n f\"{format_duration(total_dur)}...\",\n file=sys.stderr,\n )\n\n try:\n subprocess.run(cmd, check=True, stderr=subprocess.DEVNULL if quiet else None)\n if not quiet:\n print(f\" 💾 {out_file}\", file=sys.stderr)\n except subprocess.CalledProcessError as e:\n print(f\"⚠️ Failed to export {speaker}: {e}\", file=sys.stderr)\n\n if not quiet:\n print(f\"✅ Speaker audio saved to: {out_dir}\", file=sys.stderr)\n\n\n# ---------------------------------------------------------------------------\n# Sentence merging\n# ---------------------------------------------------------------------------\n\n_TERMINAL_PUNCT = re.compile(r'[.!?…。！？][\"\\')\\]]*\\s*

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

)\n\n\ndef merge_sentences(segments):\n \"\"\"Merge consecutive short segments into sentence-boundary-aware chunks.\n\n A new chunk is started when:\n - The previous segment's text ends with terminal punctuation (. ! ? … etc.)\n - OR the gap between consecutive segments exceeds 2 seconds.\n \"\"\"\n MAX_GAP = 2.0 # seconds\n\n merged = []\n accum = []\n\n def flush():\n if not accum:\n return\n start = accum[0][\"start\"]\n end = accum[-1][\"end\"]\n text = \" \".join(s[\"text\"].strip() for s in accum).strip()\n words = []\n for s in accum:\n words.extend(s.get(\"words\", []))\n # Most common speaker in merged segments\n speakers = [s.get(\"speaker\") for s in accum if s.get(\"speaker\")]\n speaker = max(set(speakers), key=speakers.count) if speakers else None\n seg = {\"start\": start, \"end\": end, \"text\": text}\n if words:\n seg[\"words\"] = words\n if speaker:\n seg[\"speaker\"] = speaker\n merged.append(seg)\n\n for seg in segments:\n if accum:\n gap = seg[\"start\"] - accum[-1][\"end\"]\n if gap > MAX_GAP:\n flush()\n accum = []\n accum.append(seg)\n if _TERMINAL_PUNCT.search(seg[\"text\"]):\n flush()\n accum = []\n\n flush()\n return merged\n\n\n# ---------------------------------------------------------------------------\n# Chapter detection\n# ---------------------------------------------------------------------------\n\ndef detect_chapters(segments, min_gap=8.0):\n \"\"\"Detect chapter breaks from silence gaps between segments.\n\n A new chapter starts when the silence between two consecutive segments\n exceeds *min_gap* seconds. 
Returns a list of chapter dicts:\n {\"chapter\": N, \"start\": seconds, \"title\": \"Chapter N\"}\n \"\"\"\n if not segments:\n return []\n\n chapters = [{\"chapter\": 1, \"start\": segments[0][\"start\"], \"title\": \"Chapter 1\"}]\n chapter_num = 1\n\n for i in range(1, len(segments)):\n gap = segments[i][\"start\"] - segments[i - 1][\"end\"]\n if gap >= min_gap:\n chapter_num += 1\n chapters.append({\n \"chapter\": chapter_num,\n \"start\": segments[i][\"start\"],\n \"title\": f\"Chapter {chapter_num}\",\n })\n\n return chapters\n\n\ndef _fmt_chapter_ts(seconds):\n \"\"\"Format chapter timestamp: M:SS or H:MM:SS.\"\"\"\n h = int(seconds // 3600)\n m = int((seconds % 3600) // 60)\n s = int(seconds % 60)\n return f\"{h}:{m:02d}:{s:02d}\" if h else f\"{m}:{s:02d}\"\n\n\ndef format_chapters_output(chapters, fmt=\"youtube\"):\n \"\"\"Render chapter list.\n\n fmt=\"youtube\" → \"0:00 Chapter 1\\\\n5:30 Chapter 2\" (YouTube description format)\n fmt=\"text\" → \"Chapter 1: 00:00:00\\\\nChapter 2: 00:05:30\"\n fmt=\"json\" → JSON array\n \"\"\"\n if fmt == \"json\":\n return json.dumps(chapters, indent=2, ensure_ascii=False)\n\n if fmt == \"text\":\n lines = []\n for ch in chapters:\n h = int(ch[\"start\"] // 3600)\n m = int((ch[\"start\"] % 3600) // 60)\n s = int(ch[\"start\"] % 60)\n ts = f\"{h:02d}:{m:02d}:{s:02d}\"\n lines.append(f\"{ch['title']}: {ts}\")\n return \"\\n\".join(lines)\n\n # Default: YouTube-compatible \"M:SS Title\"\n return \"\\n\".join(\n f\"{_fmt_chapter_ts(ch['start'])} {ch['title']}\" for ch in chapters\n )\n\n\n# ---------------------------------------------------------------------------\n# Transcript search\n# ---------------------------------------------------------------------------\n\ndef search_transcript(segments, query, fuzzy=False):\n \"\"\"Search transcript segments for *query*.\n\n Returns a list of matching segment dicts (with start, end, text, speaker).\n Case-insensitive. 
With fuzzy=True, also matches partial/approximate terms\n by checking individual word tokens for similarity (ratio ≥ 0.6).\n \"\"\"\n import difflib\n\n query_lower = query.lower()\n matches = []\n\n for seg in segments:\n text = seg[\"text\"].strip()\n text_lower = text.lower()\n\n matched = query_lower in text_lower\n\n if not matched and fuzzy:\n # Check each word token in the segment for similarity to query.\n # This handles short queries (e.g. \"wrld\" matching \"world\") better\n # than comparing the full segment text via SequenceMatcher ratio.\n words_in_seg = re.findall(r\"[\\w']+\", text_lower)\n for word in words_in_seg:\n ratio = difflib.SequenceMatcher(None, query_lower, word).ratio()\n if ratio >= 0.6:\n matched = True\n break\n # Fallback: also try full-text ratio for multi-word query phrases\n if not matched and \" \" in query_lower:\n ratio = difflib.SequenceMatcher(None, query_lower, text_lower).ratio()\n if ratio >= 0.6:\n matched = True\n\n if matched:\n matches.append({\n \"start\": seg[\"start\"],\n \"end\": seg[\"end\"],\n \"text\": text,\n \"speaker\": seg.get(\"speaker\"),\n })\n\n return matches\n\n\ndef format_search_results(matches, query):\n \"\"\"Format search results for display.\"\"\"\n if not matches:\n return f'No matches found for: \"{query}\"'\n\n lines = [f'🔍 {len(matches)} match(es) for \"{query}\":']\n for m in matches:\n ts = _fmt_chapter_ts(m[\"start\"])\n speaker = f\"[{m['speaker']}] \" if m.get(\"speaker\") else \"\"\n lines.append(f\" [{ts}] {speaker}{m['text']}\")\n\n return \"\\n\".join(lines)\n\n\n# ---------------------------------------------------------------------------\n# Language map (per-file language override for batch mode)\n# ---------------------------------------------------------------------------\n\ndef parse_language_map(lang_map_str):\n \"\"\"Parse --language-map value into a {pattern: lang_code} dict.\n\n Two forms accepted:\n Inline: \"interview*.mp3=en,lecture.mp3=fr,keynote.wav=de\"\n JSON file: 
\"@/path/to/map.json\" (must be a dict of {pattern: lang})\n\n Patterns can be exact filenames, stems, or fnmatch glob patterns.\n \"\"\"\n if not lang_map_str:\n return {}\n\n if lang_map_str.startswith(\"@\"):\n json_path = lang_map_str[1:]\n with open(json_path, encoding=\"utf-8\") as f:\n return json.load(f)\n\n mapping = {}\n for part in lang_map_str.split(\",\"):\n part = part.strip()\n if \"=\" not in part:\n continue\n pattern, lang = part.rsplit(\"=\", 1)\n mapping[pattern.strip()] = lang.strip()\n return mapping\n\n\ndef resolve_file_language(audio_path, lang_map, fallback=None):\n \"\"\"Return the language code for *audio_path* using *lang_map*.\n\n Priority:\n 1. Exact filename match (e.g. \"interview.mp3\")\n 2. Exact stem match (e.g. \"interview\")\n 3. fnmatch glob match on filename (e.g. \"interview*.mp3\")\n 4. fnmatch glob match on stem (e.g. \"interview*\")\n 5. Fallback (global --language setting or None = auto-detect)\n \"\"\"\n if not lang_map:\n return fallback\n\n name = Path(audio_path).name\n stem = Path(audio_path).stem\n\n for pattern, lang in lang_map.items():\n if pattern in (name, stem):\n return lang\n\n for pattern, lang in lang_map.items():\n if fnmatch.fnmatch(name, pattern) or fnmatch.fnmatch(stem, pattern):\n return lang\n\n return fallback\n\n\n# ---------------------------------------------------------------------------\n# File resolution\n# ---------------------------------------------------------------------------\n\nAUDIO_EXTS = {\n \".mp3\", \".wav\", \".m4a\", \".flac\", \".ogg\", \".webm\",\n \".mp4\", \".mkv\", \".avi\", \".wma\", \".aac\",\n}\n\n\ndef resolve_inputs(inputs):\n \"\"\"Expand globs, directories, and URLs into a flat list of audio paths.\"\"\"\n files = []\n for inp in inputs:\n if is_url(inp):\n files.append(inp)\n continue\n expanded = sorted(glob.glob(inp, recursive=True)) or [inp]\n for p_str in expanded:\n p = Path(p_str)\n if p.is_dir():\n files.extend(\n str(f) for f in sorted(p.iterdir())\n if 
f.is_file() and f.suffix.lower() in AUDIO_EXTS\n )\n elif p.is_file():\n files.append(str(p))\n else:\n print(f\"Warning: not found: {inp}\", file=sys.stderr)\n return files\n\n\n# ---------------------------------------------------------------------------\n# Core transcription\n# ---------------------------------------------------------------------------\n\ndef transcribe_file(audio_path, pipeline, args):\n \"\"\"Transcribe a single audio file. Returns result dict.\"\"\"\n t0 = time.time()\n\n # --- Preprocessing (normalize / denoise) ---\n preprocess_tmp = None\n channel_tmp = None\n effective_path = str(audio_path)\n\n # --- Channel extraction (stereo → mono channel) ---\n channel = getattr(args, \"channel\", \"mix\")\n if channel != \"mix\":\n effective_path, channel_tmp = extract_channel(\n effective_path, channel, quiet=args.quiet\n )\n\n if args.normalize or args.denoise:\n effective_path, preprocess_tmp = preprocess_audio(\n effective_path, normalize=args.normalize, denoise=args.denoise,\n quiet=args.quiet,\n )\n\n need_words = (\n args.word_timestamps\n or args.min_confidence is not None\n or args.diarize # word-level needed for accurate speaker assignment\n ) and not args.stream # streaming skips post-processing\n\n kw = dict(\n language=args.language,\n task=\"translate\" if args.translate else \"transcribe\",\n beam_size=args.beam_size,\n word_timestamps=need_words,\n vad_filter=not args.no_vad,\n hotwords=args.hotwords,\n initial_prompt=args.initial_prompt,\n prefix=args.prefix,\n condition_on_previous_text=not args.no_condition_on_previous_text,\n multilingual=args.multilingual if args.multilingual else None,\n )\n\n # Optional parameters — only pass if explicitly set (avoids overriding defaults)\n if args.hallucination_silence_threshold is not None:\n kw[\"hallucination_silence_threshold\"] = args.hallucination_silence_threshold\n if args.compression_ratio_threshold is not None:\n kw[\"compression_ratio_threshold\"] = 
args.compression_ratio_threshold\n if args.log_prob_threshold is not None:\n kw[\"log_prob_threshold\"] = args.log_prob_threshold\n if args.max_new_tokens is not None:\n kw[\"max_new_tokens\"] = args.max_new_tokens\n if args.clip_timestamps is not None:\n # BatchedInferencePipeline expects List[dict] with \"start\"/\"end\" keys (seconds as floats).\n # Parse \"0,3\" → [{\"start\": 0.0, \"end\": 3.0}]\n # Parse \"0,30;60,90\" → [{\"start\": 0.0, \"end\": 30.0}, {\"start\": 60.0, \"end\": 90.0}]\n parsed_clips = []\n for clip_str in args.clip_timestamps.split(\";\"):\n parts = clip_str.strip().split(\",\")\n if len(parts) == 2:\n parsed_clips.append({\"start\": float(parts[0]), \"end\": float(parts[1])})\n else:\n raise ValueError(f\"Invalid clip range '{clip_str}'. Expected 'start,end' (seconds).\")\n kw[\"clip_timestamps\"] = parsed_clips\n if args.progress:\n kw[\"log_progress\"] = True\n\n if not args.no_batch:\n kw[\"batch_size\"] = args.batch_size\n\n # VAD tuning parameters\n vad_dict = {}\n vad_threshold = args.vad_threshold if args.vad_threshold is not None else args.vad_onset\n vad_neg_threshold = args.vad_neg_threshold if args.vad_neg_threshold is not None else args.vad_offset\n if vad_threshold is not None:\n vad_dict[\"threshold\"] = vad_threshold\n if vad_neg_threshold is not None:\n vad_dict[\"neg_threshold\"] = vad_neg_threshold\n if args.min_speech_duration is not None:\n vad_dict[\"min_speech_duration_ms\"] = args.min_speech_duration\n if args.max_speech_duration is not None:\n vad_dict[\"max_speech_duration_s\"] = args.max_speech_duration\n if args.min_silence_duration is not None:\n vad_dict[\"min_silence_duration_ms\"] = args.min_silence_duration\n if args.speech_pad is not None:\n vad_dict[\"speech_pad_ms\"] = args.speech_pad\n if vad_dict:\n kw[\"vad_parameters\"] = vad_dict\n\n # Temperature control\n if args.temperature is not None:\n temps = [float(t.strip()) for t in args.temperature.split(\",\")]\n kw[\"temperature\"] = temps[0] if 
len(temps) == 1 else temps\n\n # No-speech threshold\n if args.no_speech_threshold is not None:\n kw[\"no_speech_threshold\"] = args.no_speech_threshold\n\n # Beam search / sampling tuning\n if args.best_of is not None:\n kw[\"best_of\"] = args.best_of\n if args.patience is not None:\n kw[\"patience\"] = args.patience\n if args.repetition_penalty is not None:\n kw[\"repetition_penalty\"] = args.repetition_penalty\n if args.no_repeat_ngram_size is not None:\n kw[\"no_repeat_ngram_size\"] = args.no_repeat_ngram_size\n\n # --- Advanced inference params (Part 1 new flags) ---\n if args.no_timestamps:\n _ts_formats = {\"srt\", \"vtt\", \"tsv\", \"lrc\", \"ass\", \"ttml\"}\n conflicts = (\n args.word_timestamps\n or any(f in _ts_formats for f in getattr(args, \"_formats\", [args.format]))\n or args.diarize\n )\n if conflicts:\n print(\n \"⚠️ --no-timestamps ignored: incompatible with \"\n \"--word-timestamps / --format srt/vtt/tsv/lrc/ass/ttml / --diarize\",\n file=sys.stderr,\n )\n else:\n kw[\"without_timestamps\"] = True\n\n if args.chunk_length is not None:\n kw[\"chunk_length\"] = args.chunk_length\n\n if args.language_detection_threshold is not None:\n kw[\"language_detection_threshold\"] = args.language_detection_threshold\n\n if args.language_detection_segments is not None:\n kw[\"language_detection_segments\"] = args.language_detection_segments\n\n if args.length_penalty is not None:\n kw[\"length_penalty\"] = args.length_penalty\n\n if args.prompt_reset_on_temperature is not None:\n kw[\"prompt_reset_on_temperature\"] = args.prompt_reset_on_temperature\n\n if args.no_suppress_blank:\n kw[\"suppress_blank\"] = False\n\n if args.suppress_tokens is not None:\n try:\n ids = [int(x.strip()) for x in args.suppress_tokens.split(\",\") if x.strip()]\n kw[\"suppress_tokens\"] = [-1] + ids\n except ValueError:\n print(f\"⚠️ Invalid --suppress-tokens value: {args.suppress_tokens!r} — skipped\", file=sys.stderr)\n\n if args.max_initial_timestamp is not None:\n 
kw[\"max_initial_timestamp\"] = args.max_initial_timestamp\n\n if args.prepend_punctuations is not None:\n kw[\"prepend_punctuations\"] = args.prepend_punctuations\n\n if args.append_punctuations is not None:\n kw[\"append_punctuations\"] = args.append_punctuations\n\n segments_iter, info = pipeline.transcribe(effective_path, **kw)\n\n segments = []\n full_text = \"\"\n\n for seg in segments_iter:\n # Confidence filter (needs word-level probabilities)\n if args.min_confidence is not None and seg.words:\n avg = sum(w.probability for w in seg.words) / len(seg.words)\n if avg \u003c args.min_confidence:\n continue\n\n full_text += seg.text\n seg_data = {\"start\": seg.start, \"end\": seg.end, \"text\": seg.text}\n\n if need_words and seg.words:\n seg_data[\"words\"] = [\n {\n \"word\": w.word,\n \"start\": w.start,\n \"end\": w.end,\n \"probability\": w.probability,\n }\n for w in seg.words\n ]\n\n segments.append(seg_data)\n\n # Streaming: print segment immediately\n if args.stream:\n line = f\"[{format_ts_vtt(seg.start)} → {format_ts_vtt(seg.end)}] {seg.text.strip()}\"\n print(line, flush=True)\n\n # Refine word timestamps with wav2vec2 (before diarization so it benefits)\n # Auto-runs whenever word timestamps are computed (--precise, --diarize,\n # --word-timestamps, --min-confidence all trigger word-level output)\n if need_words and not args.stream:\n segments = run_alignment(effective_path, segments, quiet=args.quiet)\n\n # Diarize after transcription (and alignment if --precise)\n speakers = None\n if args.diarize and not args.stream:\n segments, speakers = run_diarization(\n effective_path, segments, quiet=args.quiet,\n min_speakers=args.min_speakers, max_speakers=args.max_speakers,\n hf_token=args.hf_token,\n )\n # Apply speaker name mapping if provided\n if getattr(args, \"speaker_names\", None):\n segments = apply_speaker_names(segments, args.speaker_names)\n\n # Filter hallucinations if requested\n if getattr(args, \"filter_hallucinations\", False):\n 
segments = filter_hallucinations(segments)\n\n # Cleanup preprocessing and channel extraction temp files\n if preprocess_tmp and os.path.exists(preprocess_tmp):\n os.remove(preprocess_tmp)\n if channel_tmp and os.path.exists(channel_tmp):\n os.remove(channel_tmp)\n\n elapsed = time.time() - t0\n dur = info.duration\n rt = round(dur / elapsed, 1) if elapsed > 0 else 0\n\n result = {\n \"file\": Path(audio_path).name,\n \"text\": full_text.strip(),\n \"language\": info.language,\n \"language_probability\": info.language_probability,\n \"duration\": dur,\n \"segments\": segments,\n \"stats\": {\n \"processing_time\": round(elapsed, 2),\n \"realtime_factor\": rt,\n },\n }\n if args.translate:\n result[\"task\"] = \"translate\"\n if speakers:\n result[\"speakers\"] = speakers\n\n if not args.quiet:\n task_label = \"translated\" if args.translate else \"transcribed\"\n print(\n f\"✅ {result['file']}: {format_duration(dur)} {task_label} in \"\n f\"{format_duration(elapsed)} ({rt}× realtime)\",\n file=sys.stderr,\n )\n\n return result\n\n\n# ---------------------------------------------------------------------------\n# Output\n# ---------------------------------------------------------------------------\n\nEXT_MAP = {\n \"text\": \".txt\", \"json\": \".json\", \"srt\": \".srt\",\n \"vtt\": \".vtt\", \"tsv\": \".tsv\", \"csv\": \".csv\", \"lrc\": \".lrc\",\n \"html\": \".html\", \"ass\": \".ass\", \"ttml\": \".ttml\",\n}\n\n\ndef format_result(result, fmt, max_words_per_line=None, max_chars_per_line=None):\n \"\"\"Render a result dict in the requested format.\"\"\"\n if fmt == \"json\":\n return json.dumps(result, indent=2, ensure_ascii=False)\n if fmt == \"srt\":\n return to_srt(result[\"segments\"], max_words_per_line=max_words_per_line,\n max_chars_per_line=max_chars_per_line)\n if fmt == \"vtt\":\n return to_vtt(result[\"segments\"], max_words_per_line=max_words_per_line,\n max_chars_per_line=max_chars_per_line)\n if fmt == \"tsv\":\n return 
to_tsv(result[\"segments\"])\n if fmt == \"csv\":\n return to_csv(result[\"segments\"])\n if fmt == \"lrc\":\n return to_lrc(result[\"segments\"])\n if fmt == \"html\":\n return to_html(result)\n if fmt == \"ass\":\n return to_ass(result[\"segments\"], max_words_per_line=max_words_per_line,\n max_chars_per_line=max_chars_per_line)\n if fmt == \"ttml\":\n return to_ttml(\n result[\"segments\"],\n language=result.get(\"language\", \"en\"),\n max_words_per_line=max_words_per_line,\n max_chars_per_line=max_chars_per_line,\n )\n return to_text(result[\"segments\"])\n\n\n# ---------------------------------------------------------------------------\n# CLI\n# ---------------------------------------------------------------------------\n\ndef main():\n # Pre-import onnxruntime silently to suppress the harmless WSL2 device-discovery warning.\n # onnxruntime writes directly to stderr fd when first imported (device_discovery.cc:211).\n # By importing it here with fd 2 redirected, we populate sys.modules so that later\n # lazy imports (faster_whisper's SileroVADModel) hit the cache instead of re-triggering.\n try:\n _old_stderr_fd = os.dup(2)\n try:\n with open(os.devnull, \"wb\") as _devnull:\n os.dup2(_devnull.fileno(), 2)\n import onnxruntime as _ort # noqa: F401\n finally:\n os.dup2(_old_stderr_fd, 2)\n os.close(_old_stderr_fd)\n except Exception:\n pass # If anything goes wrong, just continue — stderr stays intact\n\n # Early exit handlers — must run BEFORE argparse so they work without AUDIO positional arg\n _SCRIPT_DIR = Path(__file__).parent\n\n if \"--version\" in sys.argv:\n try:\n import importlib.metadata\n _fw_version = importlib.metadata.version(\"faster-whisper\")\n except Exception:\n _fw_version = getattr(sys.modules.get(\"faster_whisper\"), \"__version__\", \"unknown\")\n print(f\"faster-whisper {_fw_version}\")\n sys.exit(0)\n\n if \"--update\" in sys.argv:\n _venv_python = _SCRIPT_DIR.parent / \".venv\" / \"bin\" / \"python\"\n if shutil.which(\"uv\"):\n 
subprocess.run(\n [\"uv\", \"pip\", \"install\", \"--python\", str(_venv_python), \"--upgrade\", \"faster-whisper\"],\n check=True,\n )\n else:\n subprocess.run(\n [str(_venv_python), \"-m\", \"pip\", \"install\", \"--upgrade\", \"faster-whisper\"],\n check=True,\n )\n try:\n import importlib.metadata\n _fw_version = importlib.metadata.version(\"faster-whisper\")\n except Exception:\n _fw_version = \"unknown\"\n print(f\"✅ faster-whisper updated to {_fw_version}\")\n sys.exit(0)\n\n p = argparse.ArgumentParser(\n description=\"Transcribe audio with faster-whisper\",\n epilog=(\n \"examples:\\n\"\n \" %(prog)s audio.mp3\\n\"\n \" %(prog)s audio.mp3 --format srt -o subtitles.srt\\n\"\n \" %(prog)s https://youtube.com/watch?v=... --language en\\n\"\n \" %(prog)s *.mp3 --skip-existing -o ./transcripts/\\n\"\n \" %(prog)s meeting.wav --diarize --format vtt\\n\"\n \" %(prog)s lecture.mp3 --initial-prompt 'Kubernetes, gRPC'\\n\"\n ),\n formatter_class=argparse.RawDescriptionHelpFormatter,\n )\n\n # --- Positional ---\n p.add_argument(\n \"audio\", nargs=\"*\", metavar=\"AUDIO\",\n help=\"Audio file(s), directory, glob pattern, or URL. Optional when --rss is used.\",\n )\n\n # --- Model & language ---\n p.add_argument(\n \"-m\", \"--model\", default=\"distil-large-v3.5\",\n help=\"Whisper model (default: distil-large-v3.5)\",\n )\n p.add_argument(\n \"--revision\", default=None, metavar=\"REV\",\n help=\"Model revision (git branch/tag/commit hash) to pin a specific version\",\n )\n p.add_argument(\n \"-l\", \"--language\", default=None,\n help=\"Language code, e.g. en, es, fr (auto-detects if omitted)\",\n )\n p.add_argument(\n \"--language-map\", default=None, metavar=\"MAP\",\n help=\"Per-file language override for batch mode. Inline: 'interview*.mp3=en,lecture.wav=fr' \"\n \"or JSON file: '@/path/to/map.json'. Overrides --language for matched files; \"\n \"unmatched files fall back to --language (or auto-detect). 
\"\n \"Patterns support fnmatch globs on filename or stem.\",\n )\n p.add_argument(\n \"--initial-prompt\", default=None, metavar=\"TEXT\",\n help=\"Prompt to condition the model (terminology, formatting hints)\",\n )\n p.add_argument(\n \"--prefix\", default=None, metavar=\"TEXT\",\n help=\"Prefix to condition the first segment (e.g. known starting words)\",\n )\n p.add_argument(\n \"--hotwords\", default=None, metavar=\"WORDS\",\n help=\"Hotwords to boost recognition (space-separated)\",\n )\n p.add_argument(\n \"--translate\", action=\"store_true\",\n help=\"Translate to English instead of transcribing\",\n )\n p.add_argument(\n \"--multilingual\", action=\"store_true\",\n help=\"Enable multilingual/code-switching mode (helps smaller models)\",\n )\n p.add_argument(\n \"--hf-token\", default=None, metavar=\"TOKEN\",\n help=\"HuggingFace token for private models and diarization (overrides cached token)\",\n )\n p.add_argument(\n \"--model-dir\", default=None, metavar=\"PATH\",\n help=\"Custom directory for model cache (default: ~/.cache/huggingface/hub)\",\n )\n\n # --- Output format ---\n p.add_argument(\n \"-f\", \"--format\", default=\"text\",\n help=\"Output format (default: text). \"\n \"Accepts one or a comma-separated list of: \"\n \"text, json, srt, vtt, tsv, csv, lrc, html, ass, ttml. 
\"\n \"Example: --format srt,text\",\n )\n p.add_argument(\n \"--word-timestamps\", action=\"store_true\",\n help=\"Include word-level timestamps (auto-enabled for --diarize)\",\n )\n p.add_argument(\n \"--stream\", action=\"store_true\",\n help=\"Output segments as they are transcribed (streaming mode; disables diarize/alignment)\",\n )\n p.add_argument(\n \"--max-words-per-line\", type=int, default=None, metavar=\"N\",\n help=\"For SRT/VTT, split long segments into sub-cues with at most N words each \"\n \"(requires word-level timestamps; falls back to full segment if no word data)\",\n )\n p.add_argument(\n \"--max-chars-per-line\", type=int, default=None, metavar=\"N\",\n help=\"For SRT/VTT/ASS/TTML, split subtitle lines so each fits within N characters \"\n \"(requires word-level timestamps; takes priority over --max-words-per-line)\",\n )\n p.add_argument(\n \"--channel\", default=\"mix\", choices=[\"left\", \"right\", \"mix\"],\n help=\"Stereo channel to transcribe: left, right, or mix (default: mix). \"\n \"Requires ffmpeg.\",\n )\n p.add_argument(\n \"--clean-filler\", action=\"store_true\",\n help=\"Remove hesitation fillers (um, uh, er, ah, hmm) and discourse markers \"\n \"(you know, I mean, you see) from transcript text\",\n )\n p.add_argument(\n \"--detect-paragraphs\", action=\"store_true\",\n help=\"Insert paragraph breaks in text output based on silence gaps between segments\",\n )\n p.add_argument(\n \"--paragraph-gap\", type=float, default=3.0, metavar=\"SEC\",\n help=\"Minimum silence gap in seconds to start a new paragraph (default: 3.0). 
\"\n \"Used with --detect-paragraphs\",\n )\n p.add_argument(\n \"--merge-sentences\", action=\"store_true\",\n help=\"Merge consecutive segments into sentence-level chunks \"\n \"(useful for improving SRT/VTT readability)\",\n )\n p.add_argument(\n \"-o\", \"--output\", default=None, metavar=\"PATH\",\n help=\"Output file or directory (directory for batch mode)\",\n )\n p.add_argument(\n \"--output-template\", default=None, metavar=\"TEMPLATE\",\n help=\"Output filename template for batch mode. Supports: \"\n \"{stem} (input filename without ext), {lang} (detected language), \"\n \"{ext} (format extension), {model} (model name). \"\n \"Example: '{stem}_{lang}.{ext}' → 'interview_en.srt'\",\n )\n\n # --- Inference tuning ---\n p.add_argument(\n \"--beam-size\", type=int, default=5, metavar=\"N\",\n help=\"Beam search size (default: 5)\",\n )\n p.add_argument(\n \"--temperature\", default=None, metavar=\"T\",\n help=\"Sampling temperature or comma-separated fallback list (e.g. '0.0' or '0.0,0.2,0.4'); \"\n \"default uses faster-whisper's built-in schedule [0.0,0.2,0.4,0.6,0.8,1.0]\",\n )\n p.add_argument(\n \"--no-speech-threshold\", type=float, default=None, metavar=\"PROB\",\n help=\"Probability threshold below which segments are treated as silence/no-speech \"\n \"(default: 0.6)\",\n )\n p.add_argument(\n \"--batch-size\", type=int, default=8, metavar=\"N\",\n help=\"Batch size for batched inference (default: 8; reduce if OOM)\",\n )\n p.add_argument(\"--no-vad\", action=\"store_true\",\n help=\"Disable voice activity detection\")\n p.add_argument(\n \"--vad-threshold\", type=float, default=None, metavar=\"T\",\n help=\"VAD speech probability threshold (default: 0.5); higher = more conservative\",\n )\n p.add_argument(\n \"--vad-neg-threshold\", type=float, default=None, metavar=\"T\",\n help=\"VAD negative threshold for ending speech segments (default: auto)\",\n )\n p.add_argument(\n \"--vad-onset\", type=float, default=None, metavar=\"T\",\n help=\"Alias for 
--vad-threshold (legacy compatibility)\",\n )\n p.add_argument(\n \"--vad-offset\", type=float, default=None, metavar=\"T\",\n help=\"Alias for --vad-neg-threshold (legacy compatibility)\",\n )\n p.add_argument(\n \"--min-speech-duration\", type=int, default=None, metavar=\"MS\",\n help=\"Minimum speech segment duration in milliseconds (default: 0)\",\n )\n p.add_argument(\n \"--max-speech-duration\", type=float, default=None, metavar=\"SEC\",\n help=\"Maximum speech segment duration in seconds (default: unlimited)\",\n )\n p.add_argument(\n \"--min-silence-duration\", type=int, default=None, metavar=\"MS\",\n help=\"Minimum silence duration before splitting a segment in ms (default: 2000)\",\n )\n p.add_argument(\n \"--speech-pad\", type=int, default=None, metavar=\"MS\",\n help=\"Padding added around speech segments in milliseconds (default: 400)\",\n )\n p.add_argument(\"--no-batch\", action=\"store_true\",\n help=\"Disable batched inference (use standard WhisperModel)\")\n p.add_argument(\n \"--hallucination-silence-threshold\", type=float, default=None, metavar=\"SEC\",\n help=\"Skip silent sections where model hallucinates (e.g. 
1.0 sec)\",\n )\n p.add_argument(\n \"--no-condition-on-previous-text\", action=\"store_true\",\n help=\"Don't condition on previous text (reduces repetition/hallucination loops; auto-enabled for distil models)\",\n )\n p.add_argument(\n \"--condition-on-previous-text\", action=\"store_true\",\n help=\"Force-enable conditioning on previous text (overrides auto-disable for distil models)\",\n )\n p.add_argument(\n \"--compression-ratio-threshold\", type=float, default=None, metavar=\"RATIO\",\n help=\"Filter segments above this compression ratio (default: 2.4)\",\n )\n p.add_argument(\n \"--log-prob-threshold\", type=float, default=None, metavar=\"PROB\",\n help=\"Filter segments below this avg log probability (default: -1.0)\",\n )\n p.add_argument(\n \"--max-new-tokens\", type=int, default=None, metavar=\"N\",\n help=\"Maximum tokens per segment (prevents runaway generation)\",\n )\n p.add_argument(\n \"--clip-timestamps\", default=None, metavar=\"RANGE\",\n help=\"Transcribe specific time ranges: '30,60' or '0,30;60,90' (seconds)\",\n )\n p.add_argument(\n \"--progress\", action=\"store_true\",\n help=\"Show transcription progress bar\",\n )\n p.add_argument(\n \"--best-of\", type=int, default=None, metavar=\"N\",\n help=\"Number of candidates when sampling with non-zero temperature (default: 5)\",\n )\n p.add_argument(\n \"--patience\", type=float, default=None, metavar=\"F\",\n help=\"Beam search patience factor; higher allows more beam candidates (default: 1.0)\",\n )\n p.add_argument(\n \"--repetition-penalty\", type=float, default=None, metavar=\"F\",\n help=\"Penalty applied to previously generated tokens to reduce repetition (default: 1.0)\",\n )\n p.add_argument(\n \"--no-repeat-ngram-size\", type=int, default=None, metavar=\"N\",\n help=\"Prevent repetition of n-grams of this size (default: 0 = disabled)\",\n )\n\n # --- Advanced inference tuning ---\n p.add_argument(\n \"--no-timestamps\", action=\"store_true\",\n help=\"Output text segments without 
timing information (faster; \"\n \"incompatible with --word-timestamps, --format srt/vtt/tsv, --diarize)\",\n )\n p.add_argument(\n \"--chunk-length\", type=int, default=None, metavar=\"N\",\n help=\"Audio chunk length in seconds for batched inference (default: auto); \"\n \"ignored with --no-batch\",\n )\n p.add_argument(\n \"--language-detection-threshold\", type=float, default=None, metavar=\"T\",\n help=\"Confidence threshold for automatic language detection (default: 0.5)\",\n )\n p.add_argument(\n \"--language-detection-segments\", type=int, default=None, metavar=\"N\",\n help=\"Number of audio segments to sample for language detection \"\n \"(default: 1; increase for more accurate detection)\",\n )\n p.add_argument(\n \"--length-penalty\", type=float, default=None, metavar=\"F\",\n help=\"Length penalty for beam search; >1 favors longer outputs, \u003c1 favors shorter \"\n \"(default: 1.0)\",\n )\n p.add_argument(\n \"--prompt-reset-on-temperature\", type=float, default=None, metavar=\"T\",\n help=\"Reset initial prompt when temperature fallback reaches this threshold (default: 0.5)\",\n )\n p.add_argument(\n \"--no-suppress-blank\", action=\"store_true\",\n help=\"Disable blank token suppression (may improve transcription of soft speech)\",\n )\n p.add_argument(\n \"--suppress-tokens\", default=None, metavar=\"IDS\",\n help=\"Comma-separated token IDs to suppress in addition to the default -1 \"\n \"(e.g. 
'1234,5678')\",\n )\n p.add_argument(\n \"--max-initial-timestamp\", type=float, default=None, metavar=\"T\",\n help=\"Maximum timestamp allowed for the first transcribed segment in seconds \"\n \"(default: 1.0)\",\n )\n p.add_argument(\n \"--prepend-punctuations\", default=None, metavar=\"CHARS\",\n help=\"Punctuation characters to merge into the preceding word \"\n \"(default: \\\"'¿([{-\\\")\",\n )\n p.add_argument(\n \"--append-punctuations\", default=None, metavar=\"CHARS\",\n help=\"Punctuation characters to merge into the following word \"\n \"(default: \\\"'.。,，!！?？:：\\\")]}\\、\\\")\",\n )\n\n # --- Advanced features ---\n p.add_argument(\n \"--diarize\", action=\"store_true\",\n help=\"Speaker diarization (requires pyannote.audio; install via setup.sh --diarize)\",\n )\n p.add_argument(\n \"--min-speakers\", type=int, default=None, metavar=\"N\",\n help=\"Minimum number of speakers hint for diarization\",\n )\n p.add_argument(\n \"--max-speakers\", type=int, default=None, metavar=\"N\",\n help=\"Maximum number of speakers hint for diarization\",\n )\n p.add_argument(\n \"--min-confidence\", type=float, default=None, metavar=\"PROB\",\n help=\"Drop segments below this avg word confidence (0.0–1.0)\",\n )\n p.add_argument(\n \"--skip-existing\", action=\"store_true\",\n help=\"Skip files whose output already exists (batch mode)\",\n )\n p.add_argument(\n \"--detect-language-only\", action=\"store_true\",\n help=\"Detect the language of the audio and exit (no transcription). \"\n \"Output: 'Language: en (probability: 0.984)'. With --format json: JSON object.\",\n )\n p.add_argument(\n \"--stats-file\", default=None, metavar=\"PATH\",\n help=\"Write performance stats JSON sidecar after transcription. \"\n \"If a directory: writes {stem}.stats.json in that dir. 
\"\n \"In batch mode, one stats file per input.\",\n )\n p.add_argument(\n \"--burn-in\", default=None, metavar=\"OUTPUT\",\n help=\"Burn subtitles into the original video: transcribe, then ffmpeg-overlay SRT \"\n \"into the input file and save to OUTPUT (single-file mode only; requires ffmpeg)\",\n )\n p.add_argument(\n \"--speaker-names\", default=None, metavar=\"NAMES\",\n help=\"Comma-separated speaker names to replace SPEAKER_1, SPEAKER_2, etc. \"\n \"(e.g. 'Alice,Bob'). Requires --diarize\",\n )\n p.add_argument(\n \"--filter-hallucinations\", action=\"store_true\",\n help=\"Filter common Whisper hallucinations: music/applause markers, \"\n \"'Thank you for watching', duplicate consecutive segments, etc.\",\n )\n p.add_argument(\n \"--keep-temp\", action=\"store_true\",\n help=\"Keep temp files from URL downloads instead of deleting them \"\n \"(useful for re-processing downloaded audio without re-downloading)\",\n )\n p.add_argument(\n \"--parallel\", type=int, default=None, metavar=\"N\",\n help=\"Number of parallel workers for batch processing \"\n \"(default: sequential; mainly useful on CPU with many small files)\",\n )\n\n # --- Preprocessing ---\n p.add_argument(\n \"--normalize\", action=\"store_true\",\n help=\"Normalize audio volume before transcription (EBU R128 loudnorm)\",\n )\n p.add_argument(\n \"--denoise\", action=\"store_true\",\n help=\"Apply noise reduction before transcription (high-pass + FFT denoise)\",\n )\n\n # --- Device ---\n p.add_argument(\n \"--device\", default=\"auto\", choices=[\"auto\", \"cpu\", \"cuda\"],\n help=\"Compute device (default: auto)\",\n )\n p.add_argument(\n \"--compute-type\", default=\"auto\",\n choices=[\"auto\", \"int8\", \"int8_float16\", \"float16\", \"float32\"],\n help=\"Quantization (default: auto; int8_float16 = hybrid for GPU)\",\n )\n p.add_argument(\n \"--threads\", type=int, default=None, metavar=\"N\",\n help=\"Number of CPU threads for CTranslate2 inference (default: auto)\",\n )\n 
p.add_argument(\n \"-q\", \"--quiet\", action=\"store_true\",\n help=\"Suppress progress messages\",\n )\n p.add_argument(\n \"--log-level\", default=\"warning\",\n choices=[\"debug\", \"info\", \"warning\", \"error\"],\n help=\"Set faster_whisper library logging level (default: warning)\",\n )\n\n # --- Utility ---\n p.add_argument(\n \"--version\", action=\"store_true\",\n help=\"Show installed faster-whisper version and exit\",\n )\n p.add_argument(\n \"--update\", action=\"store_true\",\n help=\"Upgrade faster-whisper in the skill venv and exit\",\n )\n\n # --- RSS / Podcast ---\n p.add_argument(\n \"--rss\", default=None, metavar=\"URL\",\n help=\"Podcast RSS feed URL — extracts audio enclosures and transcribes them. \"\n \"AUDIO positional is optional when --rss is used.\",\n )\n p.add_argument(\n \"--rss-latest\", type=int, default=5, metavar=\"N\",\n help=\"Number of most-recent episodes to process from --rss feed \"\n \"(default: 5; use 0 for all episodes)\",\n )\n\n # --- Reliability ---\n p.add_argument(\n \"--retries\", type=int, default=0, metavar=\"N\",\n help=\"Retry failed files up to N times with exponential backoff \"\n \"(default: 0 = no retry; incompatible with --parallel)\",\n )\n\n # --- Transcript search ---\n p.add_argument(\n \"--search\", default=None, metavar=\"TERM\",\n help=\"Search the transcript for TERM and print matching segments with timestamps. 
\"\n \"Replaces the normal transcript output (use with -o to save search results to file).\",\n )\n p.add_argument(\n \"--search-fuzzy\", action=\"store_true\",\n help=\"Use fuzzy/approximate matching with --search (useful for typos or partial words)\",\n )\n\n # --- Chapter detection ---\n p.add_argument(\n \"--detect-chapters\", action=\"store_true\",\n help=\"Detect chapter/section breaks from silence gaps between segments and print chapter markers.\",\n )\n p.add_argument(\n \"--chapter-gap\", type=float, default=8.0, metavar=\"SEC\",\n help=\"Minimum silence gap in seconds to start a new chapter (default: 8.0)\",\n )\n p.add_argument(\n \"--chapters-file\", default=None, metavar=\"PATH\",\n help=\"Write chapter markers to this file (default: print to stdout alongside transcript). \"\n \"Format is controlled by --chapter-format.\",\n )\n p.add_argument(\n \"--chapter-format\", default=\"youtube\",\n choices=[\"youtube\", \"text\", \"json\"],\n help=\"Chapter output format: youtube (M:SS Title), text (Title: HH:MM:SS), json (default: youtube)\",\n )\n\n # --- Speaker audio export ---\n p.add_argument(\n \"--export-speakers\", default=None, metavar=\"DIR\",\n help=\"After diarization, export each speaker's audio turns to separate WAV files in DIR. 
\"\n \"Requires --diarize and ffmpeg.\",\n )\n\n # --- Backward compat (hidden) ---\n p.add_argument(\"-j\", \"--json\", action=\"store_true\", help=argparse.SUPPRESS)\n p.add_argument(\"--vad\", action=\"store_true\", help=argparse.SUPPRESS)\n p.add_argument(\"--precise\", action=\"store_true\", help=argparse.SUPPRESS)\n\n args = p.parse_args()\n if args.json:\n args.format = \"json\"\n if args.precise:\n args.word_timestamps = True\n\n # Parse --format as comma-separated list; validate each entry\n _VALID_FORMATS = {\"text\", \"json\", \"srt\", \"vtt\", \"tsv\", \"csv\", \"lrc\", \"html\", \"ass\", \"ttml\"}\n _raw_formats = [f.strip() for f in args.format.split(\",\") if f.strip()]\n _invalid = [f for f in _raw_formats if f not in _VALID_FORMATS]\n if _invalid:\n p.error(\n f\"Invalid format(s): {', '.join(_invalid)}. \"\n f\"Choose from: {', '.join(sorted(_VALID_FORMATS))}\"\n )\n args._formats = _raw_formats if _raw_formats else [\"text\"]\n args.format = args._formats[0] # backward compat\n\n # Multi-format + file path (not dir) is an error\n if len(args._formats) > 1 and args.output and Path(args.output).suffix:\n p.error(\n f\"Multiple formats ({', '.join(args._formats)}) require -o to be a directory, \"\n f\"not a file path. 
Use: -o /path/to/output/dir/\"\n )\n\n # Validate: need at least one audio source\n if not args.audio and not args.rss:\n p.error(\"AUDIO file(s) are required, or use --rss to specify a podcast feed\")\n\n # Apply HuggingFace token to environment early (model loading picks it up)\n if args.hf_token:\n os.environ[\"HF_TOKEN\"] = args.hf_token\n os.environ[\"HUGGING_FACE_HUB_TOKEN\"] = args.hf_token\n\n # Parse --language-map early so we can validate before loading the model\n lang_map = {}\n if getattr(args, \"language_map\", None):\n try:\n lang_map = parse_language_map(args.language_map)\n except Exception as e:\n print(f\"Error parsing --language-map: {e}\", file=sys.stderr)\n sys.exit(1)\n\n # Apply faster_whisper library logging level\n logging.basicConfig()\n logging.getLogger(\"faster_whisper\").setLevel(getattr(logging, args.log_level.upper()))\n\n # Handle \"turbo\" alias → large-v3-turbo\n if args.model.lower() == \"turbo\":\n args.model = \"large-v3-turbo\"\n\n # Auto-disable condition_on_previous_text for distil models (HuggingFace recommendation)\n # Prevents repetition loops inherent to distil model architecture.\n # Override with --condition-on-previous-text if you need the old behaviour.\n is_distil = args.model.lower().startswith(\"distil-\")\n if is_distil and not args.no_condition_on_previous_text and not args.condition_on_previous_text:\n args.no_condition_on_previous_text = True\n if not args.quiet:\n print(\n \"ℹ️ distil model detected: auto-disabling condition_on_previous_text \"\n \"(reduces repetition loops; pass --condition-on-previous-text to override)\",\n file=sys.stderr,\n )\n\n # Warn when --speaker-names is used without --diarize (has no effect)\n if getattr(args, \"speaker_names\", None) and not args.diarize:\n print(\"⚠️ --speaker-names has no effect without --diarize; ignoring\", file=sys.stderr)\n\n # Streaming mode disables post-processing that needs all segments\n if args.stream:\n if args.diarize:\n print(\"⚠️ --stream disables 
--diarize (needs all segments)\", file=sys.stderr)\n args.diarize = False\n if args.word_timestamps:\n print(\"⚠️ --stream disables word-level alignment (needs all segments)\", file=sys.stderr)\n\n # Conflict check: --chunk-length requires batched mode\n if args.chunk_length is not None and args.no_batch:\n print(\"⚠️ --chunk-length ignored with --no-batch (only valid for batched inference)\", file=sys.stderr)\n args.chunk_length = None\n\n # ---- Resolve inputs (including stdin '-') ----\n temp_dirs = []\n stdin_tmp = None\n raw_inputs = list(args.audio) # mutable copy\n\n # Handle --rss: fetch podcast episodes and prepend their URLs\n if args.rss:\n rss_episodes = fetch_rss_episodes(\n args.rss,\n latest=args.rss_latest if args.rss_latest != 0 else None,\n quiet=args.quiet,\n )\n if not args.quiet:\n for _, title in rss_episodes:\n print(f\" 📻 {title}\", file=sys.stderr)\n raw_inputs = [url for url, _ in rss_episodes] + raw_inputs\n\n # Check for stdin '-' usage\n if \"-\" in raw_inputs:\n if len(raw_inputs) > 1:\n print(\"Error: stdin '-' cannot be combined with other inputs in batch mode\", file=sys.stderr)\n sys.exit(1)\n if not args.quiet:\n print(\"📥 Reading audio from stdin...\", file=sys.stderr)\n stdin_data = sys.stdin.buffer.read()\n stdin_tmp = tempfile.NamedTemporaryFile(\n delete=False, suffix=\".audio\", prefix=\"fw-stdin-\"\n )\n stdin_tmp.write(stdin_data)\n stdin_tmp.flush()\n stdin_tmp.close()\n raw_inputs = [stdin_tmp.name]\n\n audio_files = []\n for inp in raw_inputs:\n if is_url(inp):\n path, td = download_url(inp, quiet=args.quiet)\n audio_files.append(path)\n temp_dirs.append(td)\n else:\n audio_files.extend(resolve_inputs([inp]))\n\n if not audio_files:\n print(\"Error: No audio files found\", file=sys.stderr)\n sys.exit(1)\n\n is_batch = len(audio_files) > 1\n\n # ---- Device setup ----\n device = args.device\n compute_type = args.compute_type\n cuda_ok, gpu_name = check_cuda_available()\n\n if device == \"auto\":\n device = \"cuda\" if 
cuda_ok else \"cpu\"\n if device == \"cpu\" and not args.quiet:\n print(\"⚠️ CUDA not available — using CPU (this will be slow!)\", file=sys.stderr)\n print(\" To enable GPU: pip install torch --index-url https://download.pytorch.org/whl/cu121\", file=sys.stderr)\n\n if compute_type == \"auto\":\n compute_type = \"float16\" if device == \"cuda\" else \"int8\"\n\n if cuda_ok and compute_type == \"float16\" and args.compute_type == \"auto\" and not args.quiet:\n import re as _re\n gpu_name = gpu_name or \"\"\n if _re.search(r\"RTX 30[0-9]{2}\", gpu_name, _re.IGNORECASE):\n print(f\"💡 Tip: For {gpu_name}, --compute-type int8_float16 saves ~1GB VRAM with minimal quality loss\", file=sys.stderr)\n\n use_batched = not args.no_batch\n\n if not args.quiet:\n mode = f\"batched (bs={args.batch_size})\" if use_batched else \"standard\"\n gpu_str = f\" on {gpu_name}\" if device == \"cuda\" and gpu_name else \"\"\n task_str = \" [translate→en]\" if args.translate else \"\"\n stream_str = \" [streaming]\" if args.stream else \"\"\n print(f\"🎙️ {args.model} ({device}/{compute_type}){gpu_str} [{mode}]{task_str}{stream_str}\", file=sys.stderr)\n if is_batch:\n print(f\"📁 {len(audio_files)} files queued\", file=sys.stderr)\n\n # ---- Load model ----\n try:\n model_kwargs = dict(device=device, compute_type=compute_type)\n if args.revision is not None:\n model_kwargs[\"revision\"] = args.revision\n if args.threads is not None:\n model_kwargs[\"cpu_threads\"] = args.threads\n if getattr(args, \"model_dir\", None):\n model_kwargs[\"download_root\"] = args.model_dir\n model = WhisperModel(args.model, **model_kwargs)\n pipe = BatchedInferencePipeline(model) if use_batched else model\n except Exception as e:\n print(f\"Error loading model: {e}\", file=sys.stderr)\n sys.exit(1)\n\n # ---- Detect language only (early exit) ----\n if args.detect_language_only:\n try:\n from faster_whisper.audio import decode_audio\n except ImportError:\n # Older versions may use different path\n try:\n from 
faster_whisper import decode_audio\n except ImportError:\n def decode_audio(path, sampling_rate=16000):\n import numpy as np\n import subprocess as _sp\n cmd = [\"ffmpeg\", \"-i\", path, \"-ar\", str(sampling_rate), \"-ac\", \"1\",\n \"-f\", \"f32le\", \"-\"]\n result = _sp.run(cmd, capture_output=True, check=True)\n return np.frombuffer(result.stdout, dtype=np.float32)\n\n exit_code = 0\n for audio_path in audio_files:\n try:\n audio_np = decode_audio(audio_path)\n lang, lang_prob, _ = model.detect_language(audio=audio_np)\n prob_val = float(lang_prob)\n if args.format == \"json\":\n print(json.dumps({\"language\": lang, \"language_probability\": round(prob_val, 4)}, ensure_ascii=False))\n else:\n print(f\"Language: {lang} (probability: {prob_val:.3f})\")\n except Exception as e:\n print(f\"Error detecting language for {audio_path}: {e}\", file=sys.stderr)\n exit_code = 1\n # Clean up any URL-downloaded temp directories before exiting\n for td in temp_dirs:\n shutil.rmtree(td, ignore_errors=True)\n if stdin_tmp and os.path.exists(stdin_tmp.name):\n os.unlink(stdin_tmp.name)\n sys.exit(exit_code)\n\n # ---- Transcribe ----\n results = []\n failed_files = []\n total_audio = 0\n wall_start = time.time()\n\n _skip_count = [0] # mutable counter for batch summary\n\n def _should_skip(audio_path):\n if args.skip_existing and args.output:\n out_dir = Path(args.output)\n if out_dir.is_dir():\n formats = getattr(args, \"_formats\", [args.format])\n # Skip only when ALL requested format outputs already exist\n all_exist = all(\n (out_dir / (Path(audio_path).stem + EXT_MAP.get(fmt, \".txt\"))).exists()\n for fmt in formats\n )\n if all_exist:\n if not args.quiet:\n print(f\"⏭️ Skip (exists): {Path(audio_path).name}\", file=sys.stderr)\n _skip_count[0] += 1\n return True\n return False\n\n if getattr(args, \"parallel\", None) and args.parallel > 1 and is_batch:\n if device == \"cuda\" and not args.quiet:\n print(\n f\"⚠️ --parallel on GPU: each call uses the full GPU; \"\n 
\"benefit is limited vs sequential batched mode\",\n file=sys.stderr,\n )\n if args.retries and not args.quiet:\n print(\"⚠️ --retries is not supported with --parallel (ignored)\", file=sys.stderr)\n pending = [af for af in audio_files if not _should_skip(af)]\n with ThreadPoolExecutor(max_workers=args.parallel) as executor:\n # Build per-file args copies with language-map overrides\n def _make_args(af):\n file_lang = resolve_file_language(af, lang_map, args.language)\n if file_lang != args.language:\n a = copy.copy(args)\n a.language = file_lang\n return a\n return args\n\n future_to_path = {\n executor.submit(transcribe_file, af, pipe, _make_args(af)): af\n for af in pending\n }\n for future in as_completed(future_to_path):\n af = future_to_path[future]\n name = Path(af).name\n try:\n r = future.result()\n r[\"_audio_path\"] = af\n results.append(r)\n total_audio += r[\"duration\"]\n except Exception as e:\n print(f\"❌ {name}: {e}\", file=sys.stderr)\n failed_files.append((af, str(e)))\n else:\n # ETA tracking for sequential batch mode\n pending_files = [af for af in audio_files if not _should_skip(af)]\n pending_total = len(pending_files)\n eta_wall_start = time.time()\n files_done = 0\n\n for audio_path in audio_files:\n name = Path(audio_path).name\n\n if _should_skip(audio_path):\n continue\n\n # Per-file language override via --language-map\n file_lang = resolve_file_language(audio_path, lang_map, args.language)\n if lang_map and file_lang != args.language and not args.quiet and is_batch:\n print(f\" 🌐 Language override: {file_lang}\", file=sys.stderr)\n\n # Build per-file args (only copy if language differs to avoid overhead)\n file_args = args\n if file_lang != args.language:\n file_args = copy.copy(args)\n file_args.language = file_lang\n\n if not args.quiet and is_batch:\n # ETA prefix before file name (files_done = completed so far)\n current_idx = files_done + 1 # 1-based index of current file\n if files_done > 0:\n elapsed_so_far = time.time() - 
eta_wall_start\n avg_per_file = elapsed_so_far / files_done\n remaining = pending_total - files_done\n eta_sec = avg_per_file * remaining\n eta_str = format_duration(eta_sec)\n print(\n f\"▶️ [{current_idx}/{pending_total}] {name} | ETA: {eta_str}\",\n file=sys.stderr,\n )\n else:\n print(f\"▶️ [{current_idx}/{pending_total}] {name}\", file=sys.stderr)\n\n success = False\n last_error = None\n max_attempts = args.retries + 1\n for attempt in range(max_attempts):\n try:\n r = transcribe_file(audio_path, pipe, file_args)\n # Store the original audio_path on result for stats/template use\n r[\"_audio_path\"] = audio_path\n results.append(r)\n total_audio += r[\"duration\"]\n files_done += 1\n success = True\n break\n except Exception as e:\n last_error = e\n if attempt \u003c args.retries:\n wait = 2 ** (attempt + 1)\n print(\n f\"⚠️ {name}: attempt {attempt + 1}/{max_attempts} failed: {e}. \"\n f\"Retrying in {wait}s...\",\n file=sys.stderr,\n )\n time.sleep(wait)\n\n if not success:\n print(\n f\"❌ {name}: failed after {max_attempts} attempt(s): {last_error}\",\n file=sys.stderr,\n )\n failed_files.append((audio_path, str(last_error)))\n files_done += 1 # count failed files too for accurate ETA\n if not is_batch:\n sys.exit(1)\n\n # Cleanup temp dirs and stdin temp file\n for td in temp_dirs:\n if getattr(args, \"keep_temp\", False):\n if not args.quiet:\n print(f\"📁 Temp files kept: {td}\", file=sys.stderr)\n else:\n shutil.rmtree(td, ignore_errors=True)\n if stdin_tmp and os.path.exists(stdin_tmp.name):\n os.unlink(stdin_tmp.name)\n\n if not results:\n if args.skip_existing:\n if not args.quiet:\n print(\"All files already transcribed (--skip-existing)\", file=sys.stderr)\n sys.exit(0)\n print(\"Error: No files transcribed\", file=sys.stderr)\n sys.exit(1)\n\n # ---- Write output ----\n for r in results:\n # Apply --merge-sentences post-processing before formatting\n if args.merge_sentences and r.get(\"segments\"):\n r[\"segments\"] = 
merge_sentences(r[\"segments\"])\n # Rebuild full text from merged segments\n r[\"text\"] = \" \".join(s[\"text\"].strip() for s in r[\"segments\"]).strip()\n\n # ---- Speaker audio export (requires diarization) ----\n if getattr(args, \"export_speakers\", None):\n if not args.diarize:\n if not args.quiet:\n print(\"⚠️ --export-speakers requires --diarize; skipping\", file=sys.stderr)\n else:\n audio_src = r.get(\"_audio_path\", r[\"file\"])\n export_speakers_audio(\n audio_src, r.get(\"segments\", []),\n args.export_speakers, quiet=args.quiet,\n )\n\n # ---- Streaming mode already printed segments to stdout ----\n if args.stream and not args.output:\n _write_stats(r, args)\n continue\n\n # ---- Apply paragraph detection ----\n if getattr(args, \"detect_paragraphs\", False) and r.get(\"segments\"):\n r[\"segments\"] = detect_paragraphs(\n r[\"segments\"],\n min_gap=getattr(args, \"paragraph_gap\", 3.0),\n )\n\n # ---- Apply filler word removal ----\n if getattr(args, \"clean_filler\", False) and r.get(\"segments\"):\n r[\"segments\"] = remove_filler_words(r[\"segments\"])\n r[\"text\"] = \" \".join(s[\"text\"].strip() for s in r[\"segments\"]).strip()\n\n # Determine output filename stem for template/stats\n audio_path = r.get(\"_audio_path\", r[\"file\"])\n stem = Path(audio_path).stem\n lang = r.get(\"language\", \"xx\")\n model_name = args.model\n\n # ---- Pre-compute chapters (must happen before output formatting for JSON embedding) ----\n # Stored in _computed_chapters so the display block below can reuse it without a second call.\n _computed_chapters = None\n if getattr(args, \"detect_chapters\", False) and r.get(\"segments\"):\n _computed_chapters = detect_chapters(r[\"segments\"], min_gap=args.chapter_gap)\n _formats_list = getattr(args, \"_formats\", [args.format])\n if \"json\" in _formats_list:\n r[\"chapters\"] = _computed_chapters # embed in JSON output\n\n # ---- Transcript search mode ----\n if getattr(args, \"search\", None):\n matches = 
search_transcript(\n r.get(\"segments\", []),\n args.search,\n fuzzy=getattr(args, \"search_fuzzy\", False),\n )\n search_output = format_search_results(matches, args.search)\n if args.output:\n out_path = Path(args.output)\n if out_path.is_dir() or (is_batch and not out_path.suffix):\n out_path.mkdir(parents=True, exist_ok=True)\n dest = out_path / (stem + \".txt\")\n else:\n dest = out_path\n dest.write_text(search_output, encoding=\"utf-8\")\n if not args.quiet:\n print(f\"💾 {dest}\", file=sys.stderr)\n else:\n if is_batch:\n print(f\"\\n=== {r['file']} ===\")\n print(search_output)\n else:\n # ---- Multi-format output loop ----\n formats = getattr(args, \"_formats\", [args.format])\n if len(formats) > 1 and not args.output:\n print(\n f\"⚠️ Multiple formats requested but no -o DIR specified; \"\n f\"showing only '{formats[0]}' on stdout. \"\n f\"Use -o \u003cdir> to write all formats.\",\n file=sys.stderr,\n )\n for fmt_idx, fmt in enumerate(formats):\n ext = EXT_MAP.get(fmt, \".txt\").lstrip(\".\")\n output = format_result(\n r, fmt,\n max_words_per_line=args.max_words_per_line,\n max_chars_per_line=getattr(args, \"max_chars_per_line\", None),\n )\n\n if args.output:\n out_path = Path(args.output)\n # Treat as directory when: it's already a dir, OR batch mode, OR multiple formats requested\n multi_fmt = len(formats) > 1\n if out_path.is_dir() or (is_batch and not out_path.suffix) or (multi_fmt and not out_path.suffix):\n out_path.mkdir(parents=True, exist_ok=True)\n # Apply output template if provided\n if args.output_template:\n filename = args.output_template.format(\n stem=stem, lang=lang, ext=ext, model=model_name,\n )\n dest = out_path / filename\n else:\n dest = out_path / (stem + EXT_MAP.get(fmt, \".txt\"))\n else:\n dest = out_path\n dest.write_text(output, encoding=\"utf-8\")\n if not args.quiet:\n print(f\"💾 {dest}\", file=sys.stderr)\n else:\n # Only print first format to stdout\n if fmt_idx == 0:\n if is_batch and fmt == \"text\":\n print(f\"\\n=== 
{r['file']} ===\")\n print(output)\n\n # ---- Chapter detection output ----\n if _computed_chapters is not None:\n chapters = _computed_chapters # reuse pre-computed result\n chapters_output = format_chapters_output(chapters, fmt=args.chapter_format)\n if not args.quiet:\n if not chapters or len(chapters) == 1:\n print(\n f\"ℹ️ Chapter detection: only 1 chapter found \"\n f\"(no silence gaps ≥ {args.chapter_gap}s)\",\n file=sys.stderr,\n )\n else:\n print(\n f\"📑 {len(chapters)} chapter(s) detected \"\n f\"(gap threshold: {args.chapter_gap}s):\",\n file=sys.stderr,\n )\n\n chapters_dest = getattr(args, \"chapters_file\", None)\n if chapters_dest:\n Path(chapters_dest).parent.mkdir(parents=True, exist_ok=True)\n Path(chapters_dest).write_text(chapters_output, encoding=\"utf-8\")\n if not args.quiet:\n print(f\"📑 Chapters saved: {chapters_dest}\", file=sys.stderr)\n else:\n # Print to stdout after transcript — clear header so agents can parse it separately\n print(f\"\\n=== CHAPTERS ({len(chapters)}) ===\\n{chapters_output}\")\n\n # Write stats sidecar\n _write_stats(r, args)\n\n # Subtitle burn-in (single file only)\n if getattr(args, \"burn_in\", None):\n if is_batch:\n if not args.quiet:\n print(\"⚠️ --burn-in is only supported for single-file mode; skipping\", file=sys.stderr)\n elif not r.get(\"segments\"):\n if not args.quiet:\n print(\"⚠️ --burn-in skipped: no speech segments detected\", file=sys.stderr)\n else:\n srt_content = to_srt(r[\"segments\"])\n src_path = r.get(\"_audio_path\", r[\"file\"])\n burn_subtitles(src_path, srt_content, args.burn_in, quiet=args.quiet)\n\n # Batch summary\n if is_batch and not args.quiet:\n wall = time.time() - wall_start\n rt = total_audio / wall if wall > 0 else 0\n skip_note = f\" ({_skip_count[0]} skipped)\" if _skip_count[0] else \"\"\n print(\n f\"\\n📊 Done: {len(results)} files{skip_note}, {format_duration(total_audio)} audio \"\n f\"in {format_duration(wall)} ({rt:.1f}× realtime)\",\n file=sys.stderr,\n )\n if 
failed_files:\n print(f\"❌ Failed: {len(failed_files)} file(s):\", file=sys.stderr)\n for path, err in failed_files:\n print(f\" • {Path(path).name}: {err}\", file=sys.stderr)\n\n\ndef _write_stats(r, args):\n \"\"\"Write a JSON stats sidecar file for result r, if --stats-file is set.\"\"\"\n if not getattr(args, \"stats_file\", None):\n return\n\n audio_path = r.get(\"_audio_path\", r[\"file\"])\n stem = Path(audio_path).stem\n stats_path = Path(args.stats_file)\n\n # Directory → write {stem}.stats.json inside it\n if stats_path.is_dir() or args.stats_file.endswith(os.sep):\n stats_path.mkdir(parents=True, exist_ok=True)\n dest = stats_path / f\"{stem}.stats.json\"\n else:\n dest = stats_path\n\n word_count = sum(len(s[\"text\"].split()) for s in r.get(\"segments\", []))\n elapsed = r[\"stats\"][\"processing_time\"]\n duration = r.get(\"duration\", 0)\n\n stats = {\n \"file\": r[\"file\"],\n \"language\": r.get(\"language\"),\n \"language_probability\": round(r.get(\"language_probability\", 0), 4),\n \"duration_seconds\": round(duration, 2),\n \"processing_time_seconds\": elapsed,\n \"realtime_factor\": r[\"stats\"].get(\"realtime_factor\", 0),\n \"segment_count\": len(r.get(\"segments\", [])),\n \"word_count\": word_count,\n \"model\": args.model,\n \"compute_type\": args.compute_type,\n \"device\": args.device,\n }\n\n try:\n dest.parent.mkdir(parents=True, exist_ok=True)\n dest.write_text(json.dumps(stats, indent=2, ensure_ascii=False), encoding=\"utf-8\")\n if not getattr(args, \"quiet\", False):\n print(f\"📈 Stats: {dest}\", file=sys.stderr)\n except Exception as e:\n print(f\"⚠️ Failed to write stats file {dest}: {e}\", file=sys.stderr)\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":114264,"content_sha256":"3c313601d0aa3586eb63e6e8dd45fdca65481151fdf247aee07db8da3dbef32a"},{"filename":"setup.sh","content":"#!/usr/bin/env bash\n# faster-whisper skill setup\n# Creates venv and installs 
dependencies (with GPU support where available)\n#\n# Usage:\n# ./setup.sh # Base install\n# ./setup.sh --diarize # Base install + speaker diarization (pyannote.audio)\n\nset -e\n\nSCRIPT_DIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\nVENV_DIR=\"$SCRIPT_DIR/.venv\"\nINSTALL_DIARIZE=false\n\n# Parse arguments\nfor arg in \"$@\"; do\n case \"$arg\" in\n --diarize) INSTALL_DIARIZE=true ;;\n --update)\n # Upgrade faster-whisper in existing venv without full reinstall\n if [ ! -d \"$VENV_DIR\" ]; then\n echo \"❌ No venv found at $VENV_DIR — run ./setup.sh first\"\n exit 1\n fi\n if command -v uv &> /dev/null; then\n uv pip install --python \"$VENV_DIR/bin/python\" --upgrade faster-whisper\n else\n \"$VENV_DIR/bin/pip\" install --upgrade faster-whisper\n fi\n echo \"✅ faster-whisper updated\"\n \"$VENV_DIR/bin/python\" -c \"import faster_whisper; print(f'Version: {faster_whisper.__version__}')\"\n exit 0\n ;;\n --check)\n # Quick system check: GPU, Python, ffmpeg, venv, faster-whisper, yt-dlp, pyannote\n echo \"🔍 faster-whisper skill system check\"\n echo \"\"\n\n # Python\n if command -v python3 &>/dev/null; then\n PY_VER=$(python3 -c 'import sys; print(f\"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}\")')\n echo \"✅ Python: $PY_VER\"\n else\n echo \"❌ Python: not found\"\n fi\n\n # ffmpeg\n if command -v ffmpeg &>/dev/null; then\n FF_VER=$(ffmpeg -version 2>&1 | head -1 | awk '{print $3}')\n echo \"✅ ffmpeg: $FF_VER\"\n else\n echo \"⚠️ ffmpeg: not found (needed for --normalize, --denoise, --burn-in)\"\n fi\n\n # GPU/CUDA\n NVIDIA_SMI_CHECK=\"\"\n if command -v nvidia-smi &>/dev/null; then\n NVIDIA_SMI_CHECK=\"nvidia-smi\"\n elif grep -qi microsoft /proc/version 2>/dev/null; then\n for wsl_smi in /usr/lib/wsl/lib/nvidia-smi /usr/lib/wsl/drivers/*/nvidia-smi; do\n [ -f \"$wsl_smi\" ] && NVIDIA_SMI_CHECK=\"$wsl_smi\" && break\n done\n fi\n if [ -n \"$NVIDIA_SMI_CHECK\" ]; then\n GPU_CHECK=$(\"$NVIDIA_SMI_CHECK\" --query-gpu=name 
--format=csv,noheader 2>/dev/null | head -1)\n DRV_CHECK=$(\"$NVIDIA_SMI_CHECK\" --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1)\n if [ -n \"$GPU_CHECK\" ]; then\n echo \"✅ GPU: $GPU_CHECK (driver $DRV_CHECK)\"\n else\n echo \"⚠️ GPU: nvidia-smi found but no GPU reported\"\n fi\n else\n echo \"⚠️ GPU: no NVIDIA GPU detected (CPU mode)\"\n fi\n\n # venv\n if [ -d \"$VENV_DIR\" ]; then\n echo \"✅ venv: $VENV_DIR\"\n # faster-whisper version\n if \"$VENV_DIR/bin/python\" -c \"import faster_whisper\" 2>/dev/null; then\n FW_VER=$(\"$VENV_DIR/bin/python\" -c \"import faster_whisper; print(faster_whisper.__version__)\" 2>/dev/null)\n echo \"✅ faster-whisper: $FW_VER\"\n else\n echo \"❌ faster-whisper: not installed (run ./setup.sh)\"\n fi\n # CUDA in venv\n CUDA_CHECK=$(\"$VENV_DIR/bin/python\" -c \"import torch; print(torch.cuda.is_available())\" 2>/dev/null || echo \"False\")\n if [ \"$CUDA_CHECK\" = \"True\" ]; then\n CUDA_DEV=$(\"$VENV_DIR/bin/python\" -c \"import torch; print(torch.cuda.get_device_name(0))\" 2>/dev/null)\n echo \"✅ CUDA in venv: available ($CUDA_DEV)\"\n else\n echo \"⚠️ CUDA in venv: not available (CPU mode; check PyTorch CUDA install)\"\n fi\n # pyannote (timeout 10s to avoid slow CUDA init hanging the check)\n PA_RESULT=$(timeout 10 \"$VENV_DIR/bin/python\" -c \"\nimport importlib.util, sys\nspec = importlib.util.find_spec('pyannote.audio')\nif spec is None:\n sys.exit(1)\n# Only read version from metadata, skip full import (avoids 30-60s CUDA load)\ntry:\n from importlib.metadata import version\n print(version('pyannote.audio'))\nexcept Exception:\n print('installed')\n\" 2>/dev/null)\n PA_EXIT=$?\n if [ $PA_EXIT -eq 0 ] && [ -n \"$PA_RESULT\" ]; then\n echo \"✅ pyannote.audio: $PA_RESULT (--diarize available)\"\n elif [ $PA_EXIT -eq 124 ]; then\n echo \"⚠️ pyannote.audio: check timed out (likely installed; run --diarize to verify)\"\n else\n echo \"ℹ️ pyannote.audio: not installed (--diarize unavailable; run ./setup.sh 
--diarize)\"\n fi\n else\n echo \"❌ venv: not found (run ./setup.sh)\"\n fi\n\n # yt-dlp\n YTDLP_CHECK=\"\"\n if command -v yt-dlp &>/dev/null; then\n YTDLP_CHECK=\"yt-dlp\"\n elif [ -f \"$HOME/.local/share/pipx/venvs/yt-dlp/bin/yt-dlp\" ]; then\n YTDLP_CHECK=\"$HOME/.local/share/pipx/venvs/yt-dlp/bin/yt-dlp\"\n fi\n if [ -n \"$YTDLP_CHECK\" ]; then\n YTDLP_VER=$(\"$YTDLP_CHECK\" --version 2>/dev/null)\n echo \"✅ yt-dlp: $YTDLP_VER (URL/YouTube input available)\"\n else\n echo \"ℹ️ yt-dlp: not installed (URL/YouTube input unavailable; pipx install yt-dlp)\"\n fi\n\n # HuggingFace token\n if [ -f \"$HOME/.cache/huggingface/token\" ]; then\n echo \"✅ HuggingFace token: present\"\n else\n echo \"ℹ️ HuggingFace token: not found (needed for --diarize; run huggingface-cli login)\"\n fi\n\n echo \"\"\n exit 0\n ;;\n --help|-h)\n echo \"Usage: ./setup.sh [--diarize] [--update] [--check]\"\n echo \"\"\n echo \"Options:\"\n echo \" --diarize Also install pyannote.audio for speaker diarization\"\n echo \" Requires HuggingFace token at ~/.cache/huggingface/token\"\n echo \" and model agreement at https://hf.co/pyannote/speaker-diarization-3.1\"\n echo \" --update Upgrade faster-whisper in the existing venv without full reinstall\"\n echo \" --check Verify system dependencies (GPU, Python, ffmpeg, venv, yt-dlp)\"\n exit 0\n ;;\n esac\ndone\n\necho \"🎙️ Setting up faster-whisper skill...\"\n\n# Detect OS\nOS=\"$(uname -s)\"\nARCH=\"$(uname -m)\"\n\ncase \"$OS\" in\n Linux*) OS_TYPE=\"linux\" ;;\n Darwin*) OS_TYPE=\"macos\" ;;\n *) OS_TYPE=\"unknown\" ;;\nesac\n\necho \"✓ Platform: $OS_TYPE ($ARCH)\"\n\n# Check for Python 3.10+\nif ! command -v python3 &> /dev/null; then\n echo \"❌ Python 3 not found. Please install Python 3.10 or later.\"\n exit 1\nfi\n\nPYTHON_VERSION=$(python3 -c 'import sys; print(f\"{sys.version_info.major}.{sys.version_info.minor}\")')\nPYTHON_MAJOR=$(echo \"$PYTHON_VERSION\" | cut -d. -f1)\nPYTHON_MINOR=$(echo \"$PYTHON_VERSION\" | cut -d. 
-f2)\n\nif [ \"$PYTHON_MAJOR\" -lt 3 ] || ([ \"$PYTHON_MAJOR\" -eq 3 ] && [ \"$PYTHON_MINOR\" -lt 10 ]); then\n echo \"❌ Python 3.10+ required (found $PYTHON_VERSION)\"\n exit 1\nfi\n\necho \"✓ Python $PYTHON_VERSION\"\n\n# Check for ffmpeg (required)\nif ! command -v ffmpeg &> /dev/null; then\n echo \"❌ ffmpeg not found (required for audio processing)\"\n echo \"\"\n echo \"Install ffmpeg:\"\n if [ \"$OS_TYPE\" = \"macos\" ]; then\n echo \" brew install ffmpeg\"\n else\n echo \" Ubuntu/Debian: sudo apt install ffmpeg\"\n echo \" Fedora: sudo dnf install ffmpeg\"\n echo \" Arch: sudo pacman -S ffmpeg\"\n fi\n echo \"\"\n exit 1\nfi\n\necho \"✓ ffmpeg found\"\n\n# Detect GPU/acceleration availability\nHAS_CUDA=false\nHAS_APPLE_SILICON=false\nGPU_NAME=\"\"\nNVIDIA_SMI=\"\"\n\nif [ \"$OS_TYPE\" = \"linux\" ]; then\n # Check for NVIDIA GPU (Linux/WSL)\n # Try nvidia-smi in PATH first\n if command -v nvidia-smi &> /dev/null; then\n NVIDIA_SMI=\"nvidia-smi\"\n else\n # WSL2: nvidia-smi is in /usr/lib/wsl/lib/ (not in PATH by default)\n if grep -qi microsoft /proc/version 2>/dev/null; then\n for wsl_smi in /usr/lib/wsl/lib/nvidia-smi /usr/lib/wsl/drivers/*/nvidia-smi; do\n if [ -f \"$wsl_smi\" ]; then\n NVIDIA_SMI=\"$wsl_smi\"\n echo \"✓ WSL2 detected\"\n break\n fi\n done\n fi\n fi\n \n # If we found nvidia-smi, get GPU info\n if [ -n \"$NVIDIA_SMI\" ]; then\n GPU_NAME=$($NVIDIA_SMI --query-gpu=name --format=csv,noheader 2>/dev/null | head -1)\n if [ -n \"$GPU_NAME\" ]; then\n HAS_CUDA=true\n fi\n fi\nelif [ \"$OS_TYPE\" = \"macos\" ]; then\n if [ \"$ARCH\" = \"arm64\" ]; then\n HAS_APPLE_SILICON=true\n GPU_NAME=\"Apple Silicon\"\n echo \"✓ Apple Silicon detected\"\n fi\nfi\n\nif [ \"$HAS_CUDA\" = true ]; then\n echo \"✓ GPU detected: $GPU_NAME\"\nfi\n\n# Create venv\nif [ -d \"$VENV_DIR\" ]; then\n echo \"✓ Virtual environment exists\"\nelse\n echo \"Creating virtual environment...\"\n if command -v uv &> /dev/null; then\n uv venv \"$VENV_DIR\" --python python3\n else\n 
python3 -m venv \"$VENV_DIR\"\n fi\n echo \"✓ Virtual environment created\"\nfi\n\n# Helper: install with uv or pip\npip_install() {\n if command -v uv &> /dev/null; then\n uv pip install --python \"$VENV_DIR/bin/python\" \"$@\"\n else\n \"$VENV_DIR/bin/pip\" install \"$@\"\n fi\n}\n\n# Install base dependencies\necho \"Installing faster-whisper...\"\nif ! command -v uv &> /dev/null; then\n \"$VENV_DIR/bin/pip\" install --upgrade pip\nfi\npip_install -r \"$SCRIPT_DIR/requirements.txt\"\n\n# Install PyTorch based on platform\nif [ \"$HAS_CUDA\" = true ]; then\n echo \"\"\n echo \"🚀 Installing PyTorch with CUDA support...\"\n echo \" This enables ~10-20x faster transcription on your GPU.\"\n echo \"\"\n if command -v uv &> /dev/null; then\n uv pip install --python \"$VENV_DIR/bin/python\" torch torchaudio --index-url https://download.pytorch.org/whl/cu121\n else\n \"$VENV_DIR/bin/pip\" install torch torchaudio --index-url https://download.pytorch.org/whl/cu121\n fi\n echo \"✓ PyTorch + torchaudio with CUDA installed\"\nelif [ \"$OS_TYPE\" = \"macos\" ]; then\n echo \"\"\n echo \"🍎 Installing PyTorch for macOS...\"\n pip_install torch torchaudio\n echo \"✓ PyTorch installed\"\n if [ \"$HAS_APPLE_SILICON\" = true ]; then\n echo \"ℹ️ Note: faster-whisper uses CPU on macOS (Apple Silicon is still fast!)\"\n fi\nelse\n echo \"\"\n echo \"ℹ️ No NVIDIA GPU detected. Using CPU mode.\"\n echo \" If you have a GPU, ensure CUDA drivers are installed.\"\nfi\n\n# Install diarization dependencies (optional)\nif [ \"$INSTALL_DIARIZE\" = true ]; then\n echo \"\"\n echo \"🔊 Installing speaker diarization (pyannote.audio)...\"\n pip_install pyannote.audio\n\n # Check for HuggingFace token\n HF_TOKEN_PATH=\"$HOME/.cache/huggingface/token\"\n if [ ! -f \"$HF_TOKEN_PATH\" ]; then\n echo \"\"\n echo \"⚠️ No HuggingFace token found at $HF_TOKEN_PATH\"\n echo \" Diarization requires:\"\n echo \" 1. A HuggingFace account and token (huggingface-cli login)\"\n echo \" 2. 
Accept model agreement: https://hf.co/pyannote/speaker-diarization-3.1\"\n echo \" 3. Accept model agreement: https://hf.co/pyannote/segmentation-3.0\"\n else\n echo \"✓ HuggingFace token found\"\n fi\n echo \"✓ pyannote.audio installed\"\nfi\n\n# Make scripts executable\nchmod +x \"$SCRIPT_DIR/scripts/\"*\n\necho \"\"\necho \"✅ Setup complete!\"\necho \"\"\nif [ \"$HAS_CUDA\" = true ]; then\n echo \"🚀 GPU acceleration enabled — expect ~20x realtime speed\"\nelif [ \"$HAS_APPLE_SILICON\" = true ]; then\n echo \"🍎 Apple Silicon — expect ~3-5x realtime speed on CPU\"\nelse\n echo \"💻 CPU mode — transcription will be slower but functional\"\nfi\nif [ \"$INSTALL_DIARIZE\" = true ]; then\n echo \"🔊 Speaker diarization enabled (--diarize flag)\"\nfi\necho \"\"\necho \"Usage:\"\necho \" $SCRIPT_DIR/scripts/transcribe audio.mp3\"\necho \" $SCRIPT_DIR/scripts/transcribe audio.mp3 --format srt -o subtitles.srt\"\necho \" $SCRIPT_DIR/scripts/transcribe audio.mp3 --diarize\"\necho \"\"\necho \"First run will download the model (~756MB for distil-large-v3.5).\"\n","content_type":"application/x-sh; charset=utf-8","language":"bash","size":12721,"content_sha256":"a678dfd560c09e8084e0fcc84c285d775d31365ce2dcbb51daf808dc48179870"},{"filename":"skill.json","content":"{\n \"name\": \"faster-whisper\",\n \"version\": \"1.5.1\",\n \"description\": \"Local speech-to-text using faster-whisper with 4-6x speed boost over OpenAI Whisper. 
GPU acceleration enables ~20x realtime transcription.\",\n \"author\": \"ThePlasmak\",\n \"tags\": [\n \"whisper\",\n \"transcription\",\n \"speech-to-text\",\n \"gpu\",\n \"cuda\",\n \"audio\",\n \"ml\"\n ],\n \"requires\": {\n \"bins\": [\n \"python3\"\n ],\n \"optionalBins\": [\n \"ffmpeg\",\n \"yt-dlp\"\n ],\n \"optionalPaths\": [\n \"~/.cache/huggingface/token\"\n ]\n },\n \"platforms\": [\n \"linux\",\n \"macos\",\n \"wsl2\"\n ]\n}\n","content_type":"application/json; charset=utf-8","language":"json","size":599,"content_sha256":"bffcbb78c3010933dd97bb6606b876108ddd2d8c69e426c7c4a8cf4efdc42974"}],"content_json":{"type":"doc","content":[{"type":"heading","attrs":{"level":1},"content":[{"text":"Faster Whisper","type":"text"}]},{"type":"paragraph","content":[{"text":"Local speech-to-text using faster-whisper — a CTranslate2 reimplementation of OpenAI's Whisper that runs ","type":"text"},{"text":"4-6x faster","type":"text","marks":[{"type":"strong"}]},{"text":" with identical accuracy. 
With GPU acceleration, expect ","type":"text"},{"text":"~20x realtime","type":"text","marks":[{"type":"strong"}]},{"text":" transcription (a 10-minute audio file in ~30 seconds).","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"When to Use","type":"text"}]},{"type":"paragraph","content":[{"text":"Use this skill when you need to:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Transcribe audio/video files","type":"text","marks":[{"type":"strong"}]},{"text":" — meetings, interviews, podcasts, lectures, YouTube videos","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Generate subtitles","type":"text","marks":[{"type":"strong"}]},{"text":" — SRT, VTT, ASS, LRC, or TTML broadcast-standard subtitles","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Identify speakers","type":"text","marks":[{"type":"strong"}]},{"text":" — diarization labels who said what (","type":"text"},{"text":"--diarize","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Transcribe from URLs","type":"text","marks":[{"type":"strong"}]},{"text":" — YouTube links and direct audio URLs (auto-downloads via yt-dlp)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Transcribe podcast feeds","type":"text","marks":[{"type":"strong"}]},{"text":" — ","type":"text"},{"text":"--rss \u003cfeed-url>","type":"text","marks":[{"type":"code_inline"}]},{"text":" fetches and transcribes episodes","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Batch process files","type":"text","marks":[{"type":"strong"}]},{"text":" — glob patterns, directories, skip-existing support; ETA shown 
automatically","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Convert speech to text locally","type":"text","marks":[{"type":"strong"}]},{"text":" — no API costs, works offline (after model download)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Translate to English","type":"text","marks":[{"type":"strong"}]},{"text":" — translate any language to English with ","type":"text"},{"text":"--translate","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Do multilingual transcription","type":"text","marks":[{"type":"strong"}]},{"text":" — supports 99+ languages with auto-detection","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Transcribe a batch of files in different languages","type":"text","marks":[{"type":"strong"}]},{"text":" — ","type":"text"},{"text":"--language-map","type":"text","marks":[{"type":"code_inline"}]},{"text":" assigns a different language per file","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Transcribe multilingual audio","type":"text","marks":[{"type":"strong"}]},{"text":" — ","type":"text"},{"text":"--multilingual","type":"text","marks":[{"type":"code_inline"}]},{"text":" for mixed-language audio","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Transcribe audio with specific terms","type":"text","marks":[{"type":"strong"}]},{"text":" — use ","type":"text"},{"text":"--initial-prompt","type":"text","marks":[{"type":"code_inline"}]},{"text":" for jargon-heavy content or any other terms to look out for","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Preprocess noisy audio (before transcription)","type":"text","marks":[{"type":"strong"}]},{"text":" — 
","type":"text"},{"text":"--normalize","type":"text","marks":[{"type":"code_inline"}]},{"text":" and ","type":"text"},{"text":"--denoise","type":"text","marks":[{"type":"code_inline"}]},{"text":" before transcription","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Stream output","type":"text","marks":[{"type":"strong"}]},{"text":" — ","type":"text"},{"text":"--stream","type":"text","marks":[{"type":"code_inline"}]},{"text":" shows segments as they're transcribed","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Clip time ranges","type":"text","marks":[{"type":"strong"}]},{"text":" — ","type":"text"},{"text":"--clip-timestamps","type":"text","marks":[{"type":"code_inline"}]},{"text":" to transcribe specific sections","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Search the transcript","type":"text","marks":[{"type":"strong"}]},{"text":" — ","type":"text"},{"text":"--search \"term\"","type":"text","marks":[{"type":"code_inline"}]},{"text":" finds all timestamps where a word/phrase appears","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Detect chapters","type":"text","marks":[{"type":"strong"}]},{"text":" — ","type":"text"},{"text":"--detect-chapters","type":"text","marks":[{"type":"code_inline"}]},{"text":" finds section breaks from silence gaps","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Export speaker audio","type":"text","marks":[{"type":"strong"}]},{"text":" — ","type":"text"},{"text":"--export-speakers DIR","type":"text","marks":[{"type":"code_inline"}]},{"text":" saves each speaker's turns as separate WAV files","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Spreadsheet output","type":"text","marks":[{"type":"strong"}]},{"text":" — ","type":"text"},{"text":"--format 
csv","type":"text","marks":[{"type":"code_inline"}]},{"text":" produces a properly-quoted CSV with timestamps","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Trigger phrases:","type":"text","marks":[{"type":"strong"}]},{"text":" \"transcribe this audio\", \"convert speech to text\", \"what did they say\", \"make a transcript\", \"audio to text\", \"subtitle this video\", \"who's speaking\", \"translate this audio\", \"translate to English\", \"find where X is mentioned\", \"search transcript for\", \"when did they say\", \"at what timestamp\", \"add chapters\", \"detect chapters\", \"find breaks in the audio\", \"table of contents for this recording\", \"TTML subtitles\", \"DFXP subtitles\", \"broadcast format subtitles\", \"Netflix format\", \"ASS subtitles\", \"aegisub format\", \"advanced substation alpha\", \"mpv subtitles\", \"LRC subtitles\", \"timed lyrics\", \"karaoke subtitles\", \"music player lyrics\", \"HTML transcript\", \"confidence-colored transcript\", \"color-coded transcript\", \"separate audio per speaker\", \"export speaker audio\", \"split by speaker\", \"transcript as CSV\", \"spreadsheet output\", \"transcribe podcast\", \"podcast RSS feed\", \"different languages in batch\", \"per-file language\", \"transcribe in multiple formats\", \"srt and txt at the same time\", \"output both srt and text\", \"remove filler words\", \"clean up ums and uhs\", \"strip hesitation sounds\", \"remove you know and I mean\", \"transcribe left channel\", \"transcribe right channel\", \"stereo channel\", \"left track only\", \"wrap subtitle lines\", \"character limit per line\", \"max chars per subtitle\", \"detect paragraphs\", \"paragraph breaks\", \"group into paragraphs\", \"add paragraph spacing\"","type":"text"}]},{"type":"paragraph","content":[{"text":"⚠️ Agent guidance — keep invocations minimal:","type":"text","marks":[{"type":"strong"}]}]},{"type":"paragraph","content":[{"text":"CORE RULE: default command 
(","type":"text","marks":[{"type":"em"}]},{"text":"./scripts/transcribe audio.mp3","type":"text","marks":[{"type":"code_inline"},{"type":"em"}]},{"text":") is the fastest path — add flags only when the user explicitly asks for that capability.","type":"text","marks":[{"type":"em"}]}]},{"type":"paragraph","content":[{"text":"Transcription:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--diarize","type":"text","marks":[{"type":"code_inline"}]},{"text":" if the user asks \"who said what\" / \"identify speakers\" / \"label speakers\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--format srt/vtt/ass/lrc/ttml","type":"text","marks":[{"type":"code_inline"}]},{"text":" if the user asks for subtitles/captions in that format","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--format csv","type":"text","marks":[{"type":"code_inline"}]},{"text":" if the user asks for CSV or spreadsheet output","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--word-timestamps","type":"text","marks":[{"type":"code_inline"}]},{"text":" if the user needs word-level timing","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--initial-prompt","type":"text","marks":[{"type":"code_inline"}]},{"text":" if there's domain-specific jargon to prime","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--translate","type":"text","marks":[{"type":"code_inline"}]},{"text":" if the user wants non-English audio translated to 
English","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--normalize","type":"text","marks":[{"type":"code_inline"}]},{"text":"/","type":"text"},{"text":"--denoise","type":"text","marks":[{"type":"code_inline"}]},{"text":" if the user mentions bad audio quality or noise","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--stream","type":"text","marks":[{"type":"code_inline"}]},{"text":" if the user wants live/progressive output for long files","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--clip-timestamps","type":"text","marks":[{"type":"code_inline"}]},{"text":" if the user wants a specific time range","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--temperature 0.0","type":"text","marks":[{"type":"code_inline"}]},{"text":" if the model is hallucinating on music/silence","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--vad-threshold","type":"text","marks":[{"type":"code_inline"}]},{"text":" if VAD is aggressively cutting speech or including noise","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--min-speakers","type":"text","marks":[{"type":"code_inline"}]},{"text":"/","type":"text"},{"text":"--max-speakers","type":"text","marks":[{"type":"code_inline"}]},{"text":" when you know the speaker count","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--hf-token","type":"text","marks":[{"type":"code_inline"}]},{"text":" if the token is not cached at 
","type":"text"},{"text":"~/.cache/huggingface/token","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--max-words-per-line","type":"text","marks":[{"type":"code_inline"}]},{"text":" for subtitle readability on long segments","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--filter-hallucinations","type":"text","marks":[{"type":"code_inline"}]},{"text":" if the transcript contains obvious artifacts (music markers, duplicates)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--merge-sentences","type":"text","marks":[{"type":"code_inline"}]},{"text":" if the user asks for sentence-level subtitle cues","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--clean-filler","type":"text","marks":[{"type":"code_inline"}]},{"text":" if the user asks to remove filler words (um, uh, you know, I mean, hesitation sounds)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--channel left|right","type":"text","marks":[{"type":"code_inline"}]},{"text":" if the user mentions stereo tracks, dual-channel recordings, or asks for a specific channel","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--max-chars-per-line N","type":"text","marks":[{"type":"code_inline"}]},{"text":" when the user specifies a character limit per subtitle line (e.g., \"Netflix format\", \"42 chars per line\"); takes priority over ","type":"text"},{"text":"--max-words-per-line","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add 
","type":"text"},{"text":"--detect-paragraphs","type":"text","marks":[{"type":"code_inline"}]},{"text":" if the user asks for paragraph breaks or structured text output; ","type":"text"},{"text":"--paragraph-gap","type":"text","marks":[{"type":"code_inline"}]},{"text":" (default 3.0s) only if they want a custom gap","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--speaker-names \"Alice,Bob\"","type":"text","marks":[{"type":"code_inline"}]},{"text":" when the user provides real names to replace SPEAKER_1/2 — always requires ","type":"text"},{"text":"--diarize","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--hotwords WORDS","type":"text","marks":[{"type":"code_inline"}]},{"text":" when the user names specific rare terms not well served by ","type":"text"},{"text":"--initial-prompt","type":"text","marks":[{"type":"code_inline"}]},{"text":"; prefer ","type":"text"},{"text":"--initial-prompt","type":"text","marks":[{"type":"code_inline"}]},{"text":" for general domain jargon","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--prefix TEXT","type":"text","marks":[{"type":"code_inline"}]},{"text":" when the user knows the exact words the audio starts with","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--detect-language-only","type":"text","marks":[{"type":"code_inline"}]},{"text":" when the user only wants to identify the language, not transcribe","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--stats-file PATH","type":"text","marks":[{"type":"code_inline"}]},{"text":" if the user asks for performance stats, RTF, or benchmark 
info","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--parallel N","type":"text","marks":[{"type":"code_inline"}]},{"text":" for large CPU batch jobs; GPU handles one file efficiently on its own — don't add for single files or small batches","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--retries N","type":"text","marks":[{"type":"code_inline"}]},{"text":" for unreliable inputs (URLs, network files) where transient failures are expected","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--burn-in OUTPUT","type":"text","marks":[{"type":"code_inline"}]},{"text":" when the user explicitly asks to embed/burn subtitles into the video; requires ffmpeg and a video file input","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--keep-temp","type":"text","marks":[{"type":"code_inline"}]},{"text":" when the user may re-process the same URL to avoid re-downloading","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--output-template","type":"text","marks":[{"type":"code_inline"}]},{"text":" when the user specifies a custom naming pattern in batch mode","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Multi-format output","type":"text","marks":[{"type":"strong"}]},{"text":" (","type":"text"},{"text":"--format srt,text","type":"text","marks":[{"type":"code_inline"}]},{"text":"): only when the user explicitly wants multiple formats in one pass; always pair with ","type":"text"},{"text":"-o \u003cdir>","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Any word-level feature auto-runs wav2vec2 alignment 
(~5-10s overhead)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--diarize","type":"text","marks":[{"type":"code_inline"}]},{"text":" adds ~20-30s on top of that","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Search:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--search \"term\"","type":"text","marks":[{"type":"code_inline"}]},{"text":" when the user asks to find/locate/search for a specific word or phrase in audio","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--search","type":"text","marks":[{"type":"code_inline"}]},{"text":" ","type":"text"},{"text":"replaces","type":"text","marks":[{"type":"strong"}]},{"text":" the normal transcript output — it prints only matching segments with timestamps","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Add ","type":"text"},{"text":"--search-fuzzy","type":"text","marks":[{"type":"code_inline"}]},{"text":" only when the user mentions approximate/partial matching or typos","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"To save search results to a file, use ","type":"text"},{"text":"-o results.txt","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"paragraph","content":[{"text":"Chapter detection:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--detect-chapters","type":"text","marks":[{"type":"code_inline"}]},{"text":" when the user asks for chapters, sections, a table of contents, or \"where does the topic change\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Default ","type":"text"},{"text":"--chapter-gap 
8","type":"text","marks":[{"type":"code_inline"}]},{"text":" (8-second silence = new chapter) works for most podcasts/lectures; tune down for dense content","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--chapter-format youtube","type":"text","marks":[{"type":"code_inline"}]},{"text":" (default) outputs YouTube-ready timestamps; use ","type":"text"},{"text":"json","type":"text","marks":[{"type":"code_inline"}]},{"text":" for programmatic use","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Always use ","type":"text","marks":[{"type":"strong"}]},{"text":"--chapters-file PATH","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" when combining chapters with a transcript output — avoids mixing chapter markers into the transcript text","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"If the user only wants chapters (not the transcript), discard the transcript with ","type":"text"},{"text":"-o /dev/null","type":"text","marks":[{"type":"code_inline"}]},{"text":" and use ","type":"text"},{"text":"--chapters-file","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Batch mode limitation:","type":"text","marks":[{"type":"strong"}]},{"text":" ","type":"text"},{"text":"--chapters-file","type":"text","marks":[{"type":"code_inline"}]},{"text":" takes a single path — in batch mode, each file's chapters overwrite the previous. 
For batch chapter detection, omit ","type":"text"},{"text":"--chapters-file","type":"text","marks":[{"type":"code_inline"}]},{"text":" (chapters print to stdout under ","type":"text"},{"text":"=== CHAPTERS (N) ===","type":"text","marks":[{"type":"code_inline"}]},{"text":") or use a separate run per file","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Speaker audio export:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--export-speakers DIR","type":"text","marks":[{"type":"code_inline"}]},{"text":" when the user explicitly asks to save each speaker's audio separately","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Always pair with ","type":"text"},{"text":"--diarize","type":"text","marks":[{"type":"code_inline"}]},{"text":" — it silently skips if no speaker labels are present","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Requires ffmpeg; outputs ","type":"text"},{"text":"SPEAKER_1.wav","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"SPEAKER_2.wav","type":"text","marks":[{"type":"code_inline"}]},{"text":", etc. 
(or real names if ","type":"text"},{"text":"--speaker-names","type":"text","marks":[{"type":"code_inline"}]},{"text":" is set)","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Language map:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--language-map","type":"text","marks":[{"type":"code_inline"}]},{"text":" in batch mode when the user has confirmed different languages across files","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Inline format: ","type":"text"},{"text":"\"interview*.mp3=en,lecture*.mp3=fr\"","type":"text","marks":[{"type":"code_inline"}]},{"text":" — fnmatch globs on filename","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"JSON file format: ","type":"text"},{"text":"@/path/to/map.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" where the file is ","type":"text"},{"text":"{\"pattern\": \"lang_code\"}","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"paragraph","content":[{"text":"RSS / Podcast:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Only add ","type":"text"},{"text":"--rss URL","type":"text","marks":[{"type":"code_inline"}]},{"text":" when the user provides a podcast RSS feed URL","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Default fetches 5 newest episodes; ","type":"text"},{"text":"--rss-latest 0","type":"text","marks":[{"type":"code_inline"}]},{"text":" for all; ","type":"text"},{"text":"--skip-existing","type":"text","marks":[{"type":"code_inline"}]},{"text":" to resume safely","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Always use 
","type":"text","marks":[{"type":"strong"}]},{"text":"-o \u003cdir>","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" with ","type":"text"},{"text":"--rss","type":"text","marks":[{"type":"code_inline"}]},{"text":" — without it, all episode transcripts print to stdout concatenated, which is hard to use; each episode gets its own file when ","type":"text"},{"text":"-o \u003cdir>","type":"text","marks":[{"type":"code_inline"}]},{"text":" is set","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Output format for agent relay:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Search results","type":"text","marks":[{"type":"strong"}]},{"text":" (","type":"text"},{"text":"--search","type":"text","marks":[{"type":"code_inline"}]},{"text":") → print directly to user; output is human-readable","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Chapter output","type":"text","marks":[{"type":"strong"}]},{"text":" → if no ","type":"text"},{"text":"--chapters-file","type":"text","marks":[{"type":"code_inline"}]},{"text":", chapters appear in stdout under ","type":"text"},{"text":"=== CHAPTERS (N) ===","type":"text","marks":[{"type":"code_inline"}]},{"text":" header after the transcript; with ","type":"text"},{"text":"--format json","type":"text","marks":[{"type":"code_inline"}]},{"text":", chapters are also embedded in the JSON under ","type":"text"},{"text":"\"chapters\"","type":"text","marks":[{"type":"code_inline"}]},{"text":" key","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Subtitle formats","type":"text","marks":[{"type":"strong"}]},{"text":" (SRT, VTT, ASS, LRC, TTML) → always write to ","type":"text"},{"text":"-o","type":"text","marks":[{"type":"code_inline"}]},{"text":" file; tell the user the output path, never paste raw subtitle 
content","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Data formats","type":"text","marks":[{"type":"strong"}]},{"text":" (CSV, HTML, TTML, JSON) → always write to ","type":"text"},{"text":"-o","type":"text","marks":[{"type":"code_inline"}]},{"text":" file; tell the user the output path, don't paste raw XML/CSV/HTML","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"ASS format","type":"text","marks":[{"type":"strong"}]},{"text":" → for Aegisub, VLC, mpv; write to file and tell user they can open it in Aegisub or play it in VLC/mpv","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"LRC format","type":"text","marks":[{"type":"strong"}]},{"text":" → timed lyrics for music players (Foobar2000, AIMP, VLC); write to file","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Multi-format","type":"text","marks":[{"type":"strong"}]},{"text":" (","type":"text"},{"text":"--format srt,text","type":"text","marks":[{"type":"code_inline"}]},{"text":") → requires ","type":"text"},{"text":"-o \u003cdir>","type":"text","marks":[{"type":"code_inline"}]},{"text":"; each format goes to a separate file; tell user all paths written","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"JSON format","type":"text","marks":[{"type":"strong"}]},{"text":" → useful for programmatic post-processing; not ideal to paste in full to user","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Text/transcript","type":"text","marks":[{"type":"strong"}]},{"text":" → safe to show directly to user for short files; summarise for long ones","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Stats output","type":"text","marks":[{"type":"strong"}]},{"text":" 
(","type":"text"},{"text":"--stats-file","type":"text","marks":[{"type":"code_inline"}]},{"text":") → summarise key fields (duration, processing time, RTF) for the user rather than pasting raw JSON","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Language detection","type":"text","marks":[{"type":"strong"}]},{"text":" (","type":"text"},{"text":"--detect-language-only","type":"text","marks":[{"type":"code_inline"}]},{"text":") → print the result directly; it's a single line","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"ETA","type":"text","marks":[{"type":"strong"}]},{"text":" is printed automatically to stderr for batch jobs; no action needed","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"When NOT to use:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Cloud-only environments without local compute","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Files \u003c10 seconds where API call latency doesn't matter","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"faster-whisper vs whisperx:","type":"text","marks":[{"type":"strong"}]},{"text":" This skill covers everything whisperx does — diarization (","type":"text"},{"text":"--diarize","type":"text","marks":[{"type":"code_inline"}]},{"text":"), word-level timestamps (","type":"text"},{"text":"--word-timestamps","type":"text","marks":[{"type":"code_inline"}]},{"text":"), SRT/VTT subtitles — so whisperx is not needed. 
Use whisperx only if you specifically need its pyannote pipeline or batch-GPU features not covered here.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Quick Reference","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Task","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Command","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Notes","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Basic transcription","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Batched inference, VAD on, distil-large-v3.5","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"SRT subtitles","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --format srt -o subs.srt","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Word timestamps 
auto-enabled","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"VTT subtitles","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --format vtt -o subs.vtt","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"WebVTT format","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Word timestamps","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --word-timestamps --format srt","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"wav2vec2 aligned (~10ms)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Speaker diarization","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --diarize","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Requires 
pyannote.audio","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Translate → English","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --translate","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Any language → English","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Stream output","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --stream","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Live segments as transcribed","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Clip time range","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --clip-timestamps \"30,60\"","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Only 
30s–60s","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Denoise + normalize","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --denoise --normalize","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Clean up noisy audio first","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Reduce hallucination","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --hallucination-silence-threshold 1.0","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Skip hallucinated silence","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"YouTube/URL","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe https://youtube.com/watch?v=...","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Auto-downloads via 
yt-dlp","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Batch process","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe *.mp3 -o ./transcripts/","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Output to directory","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Batch with skip","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe *.mp3 --skip-existing -o ./out/","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Resume interrupted batches","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Domain terms","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --initial-prompt 'Kubernetes gRPC'","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Boost rare 
terminology","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Hotwords boost","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --hotwords 'JIRA Kubernetes'","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Bias decoder toward specific words","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Prefix conditioning","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --prefix 'Good morning,'","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Seed the first segment with known opening words","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Pin model version","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --revision v1.2.0","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Reproducible transcription with a pinned 
revision","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Debug library logs","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --log-level debug","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Show faster_whisper internal logs","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Turbo model","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 -m turbo","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Alias for large-v3-turbo","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Faster English","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --model distil-medium.en -l en","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"English-only, 6.8x 
faster","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Maximum accuracy","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --model large-v3 --beam-size 10","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Full model","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"JSON output","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --format json -o out.json","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Programmatic access with stats","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Filter noise","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --min-confidence 0.6","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Drop low-confidence 
segments","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Hybrid quantization","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --compute-type int8_float16","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Save VRAM, minimal quality loss","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Reduce batch size","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --batch-size 4","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"If OOM on GPU","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"TSV output","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --format tsv -o out.tsv","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"OpenAI Whisper–compatible 
TSV","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Fix hallucinations","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --temperature 0.0 --no-speech-threshold 0.8","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Lock temperature + skip silence","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Tune VAD sensitivity","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --vad-threshold 0.6 --min-silence-duration 500","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Tighter speech detection","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Known speaker count","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe meeting.wav --diarize --min-speakers 2 --max-speakers 3","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Constrain 
diarization","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Subtitle word wrapping","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --format srt --word-timestamps --max-words-per-line 8","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Split long cues","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Private/gated model","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --hf-token hf_xxx","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Pass token directly","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Show version","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe --version","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Print faster-whisper 
version","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Upgrade in-place","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./setup.sh --update","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Upgrade without full reinstall","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"System check","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./setup.sh --check","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Verify GPU, Python, ffmpeg, venv, yt-dlp, pyannote","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Detect language only","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --detect-language-only","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Fast language ID, no 
transcription","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Detect language JSON","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --detect-language-only --format json","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Machine-readable language detection","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"LRC subtitles","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --format lrc -o lyrics.lrc","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Timed lyrics format for music players","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"ASS subtitles","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --format ass -o subtitles.ass","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Advanced SubStation Alpha (Aegisub, mpv, 
VLC)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Merge sentences","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --format srt --merge-sentences","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Join fragments into sentence chunks","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Stats sidecar","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --stats-file stats.json","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Write perf stats JSON after transcription","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Batch stats","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe *.mp3 --stats-file ./stats/","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"One stats file per input in 
dir","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Template naming","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 -o ./out/ --output-template \"{stem}_{lang}.{ext}\"","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Custom batch output filenames","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Stdin input","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"ffmpeg -i input.mp4 -f wav - | ./scripts/transcribe -","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Pipe audio directly from stdin","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Custom model dir","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --model-dir ~/my-models","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Custom HuggingFace cache 
dir","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Local model","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 -m ./my-model-ct2","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"CTranslate2 model dir","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"HTML transcript","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --format html -o out.html","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Confidence-colored","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Burn subtitles","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe video.mp4 --burn-in output.mp4","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Requires ffmpeg + video 
input","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Name speakers","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --diarize --speaker-names \"Alice,Bob\"","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Replaces SPEAKER_1/2","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Filter hallucinations","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --filter-hallucinations","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Removes artifacts","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Keep temp files","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe https://... 
--keep-temp","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"For URL re-processing","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Parallel batch","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe *.mp3 --parallel 4 -o ./out/","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"CPU multi-file","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"RTX 3070 recommended","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --compute-type int8_float16","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Saves ~1GB VRAM, minimal quality loss","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"CPU thread count","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --threads 
8","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Force CPU thread count (default: auto)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Podcast RSS (latest 5)","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe --rss https://feeds.example.com/podcast.xml","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Downloads & transcribes newest 5 episodes","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Podcast RSS (all episodes)","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe --rss https://... --rss-latest 0 -o ./episodes/","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"All episodes, one file each","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Podcast + SRT subtitles","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe --rss https://... 
--format srt -o ./subs/","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Subtitle the newest 5 episodes (add --rss-latest 0 for all)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Retry on failure","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe *.mp3 --retries 3 -o ./out/","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Retry up to 3× with backoff on error","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"CSV output","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --format csv -o out.csv","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Spreadsheet-ready with header row; properly quoted","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"CSV with speakers","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --diarize --format csv -o 
out.csv","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Adds speaker column","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Language map (inline)","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe *.mp3 --language-map \"interview*.mp3=en,lecture.wav=fr\"","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Per-file language in batch","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Language map (JSON)","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe *.mp3 --language-map @langs.json","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"JSON file: {\"pattern\": \"lang\"}","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Batch with ETA","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe *.mp3 -o 
./out/","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Automatic ETA shown for each file in batch","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"TTML subtitles","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --format ttml -o subtitles.ttml","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Broadcast-standard DFXP/TTML (Netflix, BBC, Amazon)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"TTML with speaker labels","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --diarize --format ttml -o subtitles.ttml","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Speaker-labeled TTML","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Search transcript","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --search 
\"keyword\"","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Find timestamps where keyword appears","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Search to file","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --search \"keyword\" -o results.txt","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Save search results","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Fuzzy search","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --search \"aproximate\" --search-fuzzy","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Approximate/partial matching","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Detect chapters","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 
--detect-chapters","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Auto-detect chapters from silence gaps","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Chapter gap tuning","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --detect-chapters --chapter-gap 5","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Chapters on gaps ≥5s (default: 8s)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Chapters to file","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --detect-chapters --chapters-file ch.txt","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Save YouTube-format chapter list","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Chapters JSON","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --detect-chapters --chapter-format 
json","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Machine-readable chapter list","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Export speaker audio","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --diarize --export-speakers ./speakers/","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Save each speaker's audio to separate WAV files","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Multi-format output","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --format srt,text -o ./out/","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Write SRT + TXT in one pass","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Remove filler words","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 
--clean-filler","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Strip um/uh/er/ah/hmm and discourse markers","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Left channel only","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --channel left","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Extract left stereo channel before transcribing","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Right channel only","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --channel right","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Extract right stereo channel","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Max chars per line","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --format srt --max-chars-per-line 
42","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Character-based subtitle wrapping","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Detect paragraphs","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --detect-paragraphs","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Insert paragraph breaks in text output","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Paragraph gap tuning","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"./scripts/transcribe audio.mp3 --detect-paragraphs --paragraph-gap 5.0","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Tune gap threshold (default 3.0s)","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Model Selection","type":"text"}]},{"type":"paragraph","content":[{"text":"Choose the right model for your needs:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"dot"},"content":[{"text":"digraph model_selection {\n rankdir=LR;\n node [shape=box, style=rounded];\n\n start [label=\"Start\", shape=doublecircle];\n need_accuracy [label=\"Need maximum\\naccuracy?\", 
shape=diamond];\n multilingual [label=\"Multilingual\\ncontent?\", shape=diamond];\n resource_constrained [label=\"Resource\\nconstraints?\", shape=diamond];\n\n large_v3 [label=\"large-v3\\nor\\nlarge-v3-turbo\", style=\"rounded,filled\", fillcolor=lightblue];\n large_turbo [label=\"large-v3-turbo\", style=\"rounded,filled\", fillcolor=lightblue];\n distil_large [label=\"distil-large-v3.5\\n(default)\", style=\"rounded,filled\", fillcolor=lightgreen];\n distil_medium [label=\"distil-medium.en\", style=\"rounded,filled\", fillcolor=lightyellow];\n distil_small [label=\"distil-small.en\", style=\"rounded,filled\", fillcolor=lightyellow];\n\n start -> need_accuracy;\n need_accuracy -> large_v3 [label=\"yes\"];\n need_accuracy -> multilingual [label=\"no\"];\n multilingual -> large_turbo [label=\"yes\"];\n multilingual -> resource_constrained [label=\"no (English)\"];\n resource_constrained -> distil_small [label=\"mobile/edge\"];\n resource_constrained -> distil_medium [label=\"some limits\"];\n resource_constrained -> distil_large [label=\"no\"];\n}","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Model Table","type":"text"}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"Standard Models (Full 
Whisper)","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Model","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Size","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Speed","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Accuracy","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Use Case","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"tiny","type":"text","marks":[{"type":"code_inline"}]},{"text":" / ","type":"text"},{"text":"tiny.en","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"39M","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Fastest","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Basic","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Quick 
drafts","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"base","type":"text","marks":[{"type":"code_inline"}]},{"text":" / ","type":"text"},{"text":"base.en","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"74M","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Very fast","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Good","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"General use","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"small","type":"text","marks":[{"type":"code_inline"}]},{"text":" / ","type":"text"},{"text":"small.en","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"244M","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Fast","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Better","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Most 
tasks","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"medium","type":"text","marks":[{"type":"code_inline"}]},{"text":" / ","type":"text"},{"text":"medium.en","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"769M","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Moderate","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"High","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Quality transcription","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"large-v1/v2/v3","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"1550M","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Slower","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Best","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Maximum 
accuracy","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"large-v3-turbo","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"809M","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Fast","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Excellent","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"High accuracy (slower than distil)","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"Distilled Models (~6x Faster, ~1% WER difference)","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Model","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Size","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Speed vs Standard","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Accuracy","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Use 
Case","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"distil-large-v3.5","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"756M","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"~6.3x faster","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"7.08% WER","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Default, best balance","type":"text","marks":[{"type":"strong"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"distil-large-v3","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"756M","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"~6.3x faster","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"7.53% WER","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Previous 
default","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"distil-large-v2","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"756M","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"~5.8x faster","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"10.1% WER","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Fallback","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"distil-medium.en","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"394M","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"~6.8x faster","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"11.1% WER","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"English-only, 
resource-constrained","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"distil-small.en","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"166M","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"~5.6x faster","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"12.1% WER","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Mobile/edge devices","type":"text"}]}]}]}]},{"type":"paragraph","content":[{"text":".en","type":"text","marks":[{"type":"code_inline"}]},{"text":" models are English-only and slightly faster/better for English content.","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"Note for distil models:","type":"text","marks":[{"type":"strong"}]},{"text":" HuggingFace recommends disabling ","type":"text"},{"text":"condition_on_previous_text","type":"text","marks":[{"type":"code_inline"}]},{"text":" for all distil models to prevent repetition loops. The script ","type":"text"},{"text":"auto-applies","type":"text","marks":[{"type":"strong"}]},{"text":" ","type":"text"},{"text":"--no-condition-on-previous-text","type":"text","marks":[{"type":"code_inline"}]},{"text":" whenever a ","type":"text"},{"text":"distil-*","type":"text","marks":[{"type":"code_inline"}]},{"text":" model is detected. 
Pass ","type":"text"},{"text":"--condition-on-previous-text","type":"text","marks":[{"type":"code_inline"}]},{"text":" to override if needed.","type":"text"}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Custom & Fine-tuned Models","type":"text"}]},{"type":"paragraph","content":[{"text":"WhisperModel accepts local CTranslate2 model directories and HuggingFace repo names — no code changes needed.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Load a local CTranslate2 model","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"./scripts/transcribe audio.mp3 --model /path/to/my-model-ct2","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Convert a HuggingFace model to CTranslate2","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"pip install ctranslate2\nct2-transformers-converter \\\n --model openai/whisper-large-v3 \\\n --output_dir whisper-large-v3-ct2 \\\n --copy_files tokenizer.json preprocessor_config.json \\\n --quantization float16\n./scripts/transcribe audio.mp3 --model ./whisper-large-v3-ct2","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Load a model by HuggingFace repo name (auto-downloads)","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"./scripts/transcribe audio.mp3 --model username/whisper-large-v3-ct2","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Custom model cache directory","type":"text"}]},{"type":"paragraph","content":[{"text":"By default, models are cached in ","type":"text"},{"text":"~/.cache/huggingface/","type":"text","marks":[{"type":"code_inline"}]},{"text":". 
Use ","type":"text"},{"text":"--model-dir","type":"text","marks":[{"type":"code_inline"}]},{"text":" to override:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"./scripts/transcribe audio.mp3 --model-dir ~/my-models","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Setup","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Linux / macOS / WSL2","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Base install (creates venv, installs deps, auto-detects GPU)\n./setup.sh\n\n# With speaker diarization support\n./setup.sh --diarize","type":"text"}]},{"type":"paragraph","content":[{"text":"Requirements:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Python 3.10+","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"ffmpeg is ","type":"text"},{"text":"not required","type":"text","marks":[{"type":"strong"}]},{"text":" for basic transcription — PyAV (bundled with faster-whisper) handles audio decoding. 
ffmpeg is only needed for ","type":"text"},{"text":"--burn-in","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--normalize","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--denoise","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--channel","type":"text","marks":[{"type":"code_inline"}]},{"text":", and ","type":"text"},{"text":"--export-speakers","type":"text","marks":[{"type":"code_inline"}]},{"text":".","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Optional: yt-dlp (for URL/YouTube input)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Optional: pyannote.audio (for ","type":"text"},{"text":"--diarize","type":"text","marks":[{"type":"code_inline"}]},{"text":", installed via ","type":"text"},{"text":"setup.sh --diarize","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Platform Support","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Platform","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Acceleration","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Speed","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Linux + NVIDIA GPU","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"CUDA","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"~20x realtime 
🚀","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"WSL2 + NVIDIA GPU","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"CUDA","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"~20x realtime 🚀","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"macOS Apple Silicon","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"CPU*","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"~3-5x realtime","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"macOS Intel","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"CPU","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"~1-2x realtime","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Linux (no 
GPU)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"CPU","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"~1x realtime","type":"text"}]}]}]}]},{"type":"paragraph","content":[{"text":"*faster-whisper uses CTranslate2 which is CPU-only on macOS, but Apple Silicon is fast enough for practical use.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"GPU Support (IMPORTANT!)","type":"text"}]},{"type":"paragraph","content":[{"text":"The setup script auto-detects your GPU and installs PyTorch with CUDA. ","type":"text"},{"text":"Always use GPU if available","type":"text","marks":[{"type":"strong"}]},{"text":" — CPU transcription is extremely slow.","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Hardware","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Speed","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"9-min video","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"RTX 3070 (GPU)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"~20x realtime","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"~27 
sec","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"CPU (int8)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"~0.3x realtime","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"~30 min","type":"text"}]}]}]}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"RTX 3070 tip","type":"text","marks":[{"type":"strong"}]},{"text":": Use ","type":"text"},{"text":"--compute-type int8_float16","type":"text","marks":[{"type":"code_inline"}]},{"text":" for hybrid quantization — saves ~1GB VRAM with minimal quality loss. Ideal for running diarization alongside transcription.","type":"text"}]}]},{"type":"paragraph","content":[{"text":"If setup didn't detect your GPU, manually install PyTorch with CUDA:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# For CUDA 12.x\nuv pip install --python .venv/bin/python torch --index-url https://download.pytorch.org/whl/cu121\n\n# For CUDA 11.x\nuv pip install --python .venv/bin/python torch --index-url https://download.pytorch.org/whl/cu118","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"WSL2 users","type":"text","marks":[{"type":"strong"}]},{"text":": Ensure you have the ","type":"text"},{"text":"NVIDIA CUDA drivers for WSL","type":"text","marks":[{"type":"link","attrs":{"href":"https://docs.nvidia.com/cuda/wsl-user-guide/","title":null}}]},{"text":" installed on Windows","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Usage","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Basic 
transcription\n./scripts/transcribe audio.mp3\n\n# SRT subtitles\n./scripts/transcribe audio.mp3 --format srt -o subtitles.srt\n\n# WebVTT subtitles\n./scripts/transcribe audio.mp3 --format vtt -o subtitles.vtt\n\n# Transcribe from YouTube URL\n./scripts/transcribe https://youtube.com/watch?v=dQw4w9WgXcQ --language en\n\n# Speaker diarization\n./scripts/transcribe meeting.wav --diarize\n\n# Diarized VTT subtitles\n./scripts/transcribe meeting.wav --diarize --format vtt -o meeting.vtt\n\n# Prime with domain terminology\n./scripts/transcribe lecture.mp3 --initial-prompt \"Kubernetes, gRPC, PostgreSQL, NGINX\"\n\n# Batch process a directory\n./scripts/transcribe ./recordings/ -o ./transcripts/\n\n# Batch with glob, skip already-done files\n./scripts/transcribe *.mp3 --skip-existing -o ./transcripts/\n\n# Filter low-confidence segments\n./scripts/transcribe noisy-audio.mp3 --min-confidence 0.6\n\n# JSON output with full metadata\n./scripts/transcribe audio.mp3 --format json -o result.json\n\n# Specify language (faster than auto-detect)\n./scripts/transcribe audio.mp3 --language en","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Options","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"Input:\n AUDIO Audio file(s), directory, glob pattern, or URL\n Accepts: mp3, wav, m4a, flac, ogg, webm, mp4, mkv, avi, wma, aac\n URLs auto-download via yt-dlp (YouTube, direct links, etc.)\n\nModel & Language:\n -m, --model NAME Whisper model (default: distil-large-v3.5; \"turbo\" = large-v3-turbo)\n --revision REV Model revision (git branch/tag/commit) to pin a specific version\n -l, --language CODE Language code, e.g. en, es, fr (auto-detects if omitted)\n --initial-prompt TEXT Prompt to condition the model (terminology, formatting style)\n --prefix TEXT Prefix to condition the first segment (e.g. 
known starting words)\n --hotwords WORDS Space-separated hotwords to boost recognition\n --translate Translate any language to English (instead of transcribing)\n --multilingual Enable multilingual/code-switching mode (helps smaller models)\n --hf-token TOKEN HuggingFace token for private/gated models and diarization\n --model-dir PATH Custom model cache directory (default: ~/.cache/huggingface/)\n\nOutput Format:\n -f, --format FMT text | json | srt | vtt | tsv | lrc | html | ass | ttml (default: text)\n Accepts comma-separated list: --format srt,text writes both in one pass\n Multi-format requires -o \u003cdir> when saving to files\n --word-timestamps Include word-level timestamps (wav2vec2 aligned automatically)\n --stream Output segments as they are transcribed (disables diarize/alignment)\n --max-words-per-line N For SRT/VTT, split segments into sub-cues of at most N words\n --max-chars-per-line N For SRT/VTT/ASS/TTML, split lines so each fits within N characters\n Takes priority over --max-words-per-line when both are set\n --clean-filler Remove hesitation fillers (um, uh, er, ah, hmm, hm) and discourse markers\n (you know, I mean, you see) from transcript text. Off by default.\n --detect-paragraphs Insert paragraph breaks (blank lines) in text output at natural boundaries.\n A new paragraph starts when: silence gap ≥ --paragraph-gap, OR the previous\n segment ends a sentence AND the gap ≥ 1.5s.\n --paragraph-gap SEC Minimum silence gap in seconds to start a new paragraph (default: 3.0).\n Used with --detect-paragraphs.\n --channel {left,right,mix}\n Stereo channel to transcribe: left (c0), right (c1), or mix (default: mix).\n Extracts the channel via ffmpeg before transcription. 
Requires ffmpeg.\n --merge-sentences Merge consecutive segments into sentence-level chunks\n (improves SRT/VTT readability; groups by terminal punctuation or >2s gap)\n -o, --output PATH Output file or directory (directory for batch mode)\n --output-template TEMPLATE\n Batch output filename template. Variables: {stem}, {lang}, {ext}, {model}\n Example: \"{stem}_{lang}.{ext}\" → \"interview_en.srt\"\n\nInference Tuning:\n --beam-size N Beam search size; higher = more accurate but slower (default: 5)\n --temperature T Sampling temperature or comma-separated fallback list, e.g.\n '0.0' or '0.0,0.2,0.4' (default: faster-whisper's schedule)\n --no-speech-threshold PROB\n Probability threshold to mark segments as silence (default: 0.6)\n --batch-size N Batched inference batch size (default: 8; reduce if OOM)\n --no-vad Disable voice activity detection (on by default)\n --vad-threshold T VAD speech probability threshold (default: 0.5)\n --vad-neg-threshold T VAD negative threshold for ending speech (default: auto)\n --vad-onset T Alias for --vad-threshold (legacy)\n --vad-offset T Alias for --vad-neg-threshold (legacy)\n --min-speech-duration MS Minimum speech segment duration in ms (default: 0)\n --max-speech-duration SEC Maximum speech segment duration in seconds (default: unlimited)\n --min-silence-duration MS Minimum silence before splitting a segment in ms (default: 2000)\n --speech-pad MS Padding around speech segments in ms (default: 400)\n --no-batch Disable batched inference (use standard WhisperModel)\n --hallucination-silence-threshold SEC\n Skip silent sections where model hallucinates (e.g. 
1.0)\n --no-condition-on-previous-text\n Don't condition on previous text (reduces repetition/hallucination loops;\n auto-enabled for distil models per HuggingFace recommendation)\n --condition-on-previous-text\n Force-enable conditioning on previous text (overrides auto-disable for distil models)\n --compression-ratio-threshold RATIO\n Filter segments above this compression ratio (default: 2.4)\n --log-prob-threshold PROB\n Filter segments below this avg log probability (default: -1.0)\n --max-new-tokens N Maximum tokens per segment (prevents runaway generation)\n --clip-timestamps RANGE\n Transcribe specific time ranges: '30,60' or '0,30;60,90' (seconds)\n --progress Show transcription progress bar\n --best-of N Candidates when sampling with non-zero temperature (default: 5)\n --patience F Beam search patience factor (default: 1.0)\n --repetition-penalty F Penalty for repeated tokens (default: 1.0)\n --no-repeat-ngram-size N Prevent n-gram repetitions of this size (default: 0 = off)\n\nAdvanced Inference:\n --no-timestamps Output text without timing info (faster; incompatible with\n --word-timestamps, --format srt/vtt/tsv, --diarize)\n --chunk-length N Audio chunk length in seconds for batched inference (default: auto)\n --language-detection-threshold T\n Confidence threshold for language auto-detection (default: 0.5)\n --language-detection-segments N\n Audio segments to sample for language detection (default: 1)\n --length-penalty F Beam search length penalty; >1 favors longer, \u003c1 favors shorter (default: 1.0)\n --prompt-reset-on-temperature T\n Reset initial prompt when temperature fallback hits threshold (default: 0.5)\n --no-suppress-blank Disable blank token suppression (may help soft/quiet speech)\n --suppress-tokens IDS Comma-separated token IDs to suppress in addition to default -1\n --max-initial-timestamp T\n Maximum timestamp for the first segment in seconds (default: 1.0)\n --prepend-punctuations CHARS\n Punctuation characters merged into 
following word (default: \"'¿([{-)\n --append-punctuations CHARS\n Punctuation characters merged into preceding word (default: \"'.。,，!！?？:：\")]}、\")\n\nPreprocessing:\n --normalize Normalize audio volume (EBU R128 loudnorm) before transcription\n --denoise Apply noise reduction (high-pass + FFT denoise) before transcription\n\nAdvanced:\n --diarize Speaker diarization (requires pyannote.audio)\n --min-speakers N Minimum number of speakers hint for diarization\n --max-speakers N Maximum number of speakers hint for diarization\n --speaker-names NAMES Comma-separated names to replace SPEAKER_1, SPEAKER_2 (e.g. 'Alice,Bob')\n Requires --diarize\n --min-confidence PROB Filter segments below this avg word confidence (0.0–1.0)\n --skip-existing Skip files whose output already exists (batch mode)\n --detect-language-only\n Detect language and exit (no transcription). Output: \"Language: en (probability: 0.984)\"\n With --format json: {\"language\": \"en\", \"language_probability\": 0.984}\n --stats-file PATH Write JSON stats sidecar after transcription (processing time, RTF, word count, etc.)\n Directory path → writes {stem}.stats.json inside; file path → exact path\n --burn-in OUTPUT Burn subtitles into the original video (single-file mode only; requires ffmpeg)\n --filter-hallucinations\n Filter common Whisper hallucinations: music/applause markers, duplicate segments,\n 'Thank you for watching', lone punctuation, etc.\n --keep-temp Keep temp files from URL downloads (useful for re-processing without re-downloading)\n --parallel N Number of parallel workers for batch processing (default: sequential)\n --retries N Retry failed files up to N times with exponential backoff (default: 0;\n incompatible with --parallel)\n\nBatch ETA:\n Automatically shown for sequential batch jobs (no flag needed). 
After each file completes,\n the next file's progress line includes: [current/total] filename | ETA: Xm Ys\n ETA is calculated from average time per file × remaining files.\n Shown to stderr (surfaced to users via OpenClaw/Clawdbot output).\n\nLanguage Map (per-file language override):\n --language-map MAP Per-file language override for batch mode. Two forms:\n Inline: \"interview*.mp3=en,lecture.wav=fr,keynote.wav=de\"\n JSON file: \"@/path/to/map.json\" (must be {pattern: lang} dict)\n Patterns support fnmatch globs on filename or stem.\n Priority: exact filename > exact stem > glob on filename > glob on stem > fallback.\n Files not matched fall back to --language (or auto-detect if not set).\n\nTranscript Search:\n --search TERM Search the transcript for TERM and print matching segments with timestamps.\n Replaces normal transcript output (use -o to save results to a file).\n Case-insensitive exact substring match by default.\n --search-fuzzy Enable fuzzy/approximate matching with --search (useful for typos, phonetic\n near-misses, or partial words; uses SequenceMatcher ratio ≥ 0.6)\n\nChapter Detection:\n --detect-chapters Auto-detect chapter/section breaks from silence gaps and print chapter markers.\n Output is printed after the transcript (or to --chapters-file).\n --chapter-gap SEC Minimum silence gap in seconds between consecutive segments to start a new\n chapter (default: 8.0). Tune down for dense speech, up for sparse content.\n --chapters-file PATH Write chapter markers to this file (default: stdout after transcript)\n --chapter-format FMT youtube | text | json — chapter output format:\n youtube: \"0:00 Chapter 1\" (YouTube description ready)\n text: \"Chapter 1: 00:00:00\"\n json: JSON array with chapter, start, title fields\n (default: youtube)\n\nSpeaker Audio Export:\n --export-speakers DIR After diarization, export each speaker's audio turns concatenated into\n separate WAV files saved in DIR. 
Requires --diarize and ffmpeg.\n Output: SPEAKER_1.wav, SPEAKER_2.wav, … (or real names if --speaker-names set)\n\nRSS / Podcast:\n --rss URL Podcast RSS feed URL — extracts audio enclosures and transcribes them.\n AUDIO positional is optional when --rss is used.\n --rss-latest N Number of most-recent episodes to process (default: 5; 0 = all episodes)\n\nDevice:\n --device DEV auto | cpu | cuda (default: auto)\n --compute-type TYPE auto | int8 | int8_float16 | float16 | float32 (default: auto)\n int8_float16 = hybrid mode for GPU (saves VRAM, minimal quality loss)\n --threads N CPU thread count for CTranslate2 (default: auto)\n -q, --quiet Suppress progress and status messages\n --log-level LEVEL Set faster_whisper library logging level: debug | info | warning | error\n (default: warning; use debug to see CTranslate2/VAD internals)\n\nUtility:\n --version Print installed faster-whisper version and exit\n --update Upgrade faster-whisper in the skill venv and exit","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Output Formats","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Text (default)","type":"text"}]},{"type":"paragraph","content":[{"text":"Plain transcript text. 
With ","type":"text"},{"text":"--diarize","type":"text","marks":[{"type":"code_inline"}]},{"text":", speaker labels are inserted:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"[SPEAKER_1]\n Hello, welcome to the meeting.\n[SPEAKER_2]\n Thanks for having me.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"JSON (","type":"text"},{"text":"--format json","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]},{"type":"paragraph","content":[{"text":"Full metadata including segments, timestamps, language detection, and performance stats:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"json"},"content":[{"text":"{\n \"file\": \"audio.mp3\",\n \"text\": \"Hello, welcome...\",\n \"language\": \"en\",\n \"language_probability\": 0.98,\n \"duration\": 600.5,\n \"segments\": [...],\n \"speakers\": [\"SPEAKER_1\", \"SPEAKER_2\"],\n \"stats\": {\n \"processing_time\": 28.3,\n \"realtime_factor\": 21.2\n }\n}","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"SRT (","type":"text"},{"text":"--format srt","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]},{"type":"paragraph","content":[{"text":"Standard subtitle format for video players:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"1\n00:00:00,000 --> 00:00:02,500\n[SPEAKER_1] Hello, welcome to the meeting.\n\n2\n00:00:02,800 --> 00:00:04,200\n[SPEAKER_2] Thanks for having me.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"VTT (","type":"text"},{"text":"--format vtt","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]},{"type":"paragraph","content":[{"text":"WebVTT format for web video players:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"WEBVTT\n\n1\n00:00:00.000 --> 00:00:02.500\n[SPEAKER_1] Hello, welcome 
to the meeting.\n\n2\n00:00:02.800 --> 00:00:04.200\n[SPEAKER_2] Thanks for having me.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"TSV (","type":"text"},{"text":"--format tsv","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]},{"type":"paragraph","content":[{"text":"Tab-separated values, OpenAI Whisper–compatible. Columns: ","type":"text"},{"text":"start_ms","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"end_ms","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"text","type":"text","marks":[{"type":"code_inline"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"0\t2500\tHello, welcome to the meeting.\n2800\t4200\tThanks for having me.","type":"text"}]},{"type":"paragraph","content":[{"text":"Useful for piping into other tools or spreadsheets. No header row.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"ASS/SSA (","type":"text"},{"text":"--format ass","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]},{"type":"paragraph","content":[{"text":"Advanced SubStation Alpha format — supported by Aegisub, VLC, mpv, MPC-HC, and most video editors. 
Offers richer styling than SRT (font, size, color, position) via the ","type":"text"},{"text":"[V4+ Styles]","type":"text","marks":[{"type":"code_inline"}]},{"text":" section:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"[Script Info]\nScriptType: v4.00+\n...\n\n[V4+ Styles]\nStyle: Default,Arial,20,&H00FFFFFF,...\n\n[Events]\nFormat: Layer, Start, End, Style, Name, ..., Text\nDialogue: 0,0:00:00.00,0:00:02.50,Default,,[SPEAKER_1] Hello, welcome.\nDialogue: 0,0:00:02.80,0:00:04.20,Default,,[SPEAKER_2] Thanks for having me.","type":"text"}]},{"type":"paragraph","content":[{"text":"Timestamps use ","type":"text"},{"text":"H:MM:SS.cc","type":"text","marks":[{"type":"code_inline"}]},{"text":" (centiseconds). Edit the ","type":"text"},{"text":"[V4+ Styles]","type":"text","marks":[{"type":"code_inline"}]},{"text":" block in Aegisub to customise font, color, and position without re-transcribing.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"LRC (","type":"text"},{"text":"--format lrc","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]},{"type":"paragraph","content":[{"text":"Timed lyrics format used by music players (e.g., Foobar2000, VLC, AIMP). 
Timestamps use ","type":"text"},{"text":"[mm:ss.xx]","type":"text","marks":[{"type":"code_inline"}]},{"text":" where ","type":"text"},{"text":"xx","type":"text","marks":[{"type":"code_inline"}]},{"text":" = centiseconds:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"[00:00.50]Hello, welcome to the meeting.\n[00:02.80]Thanks for having me.","type":"text"}]},{"type":"paragraph","content":[{"text":"With diarization, speaker labels are included:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"[00:00.50][SPEAKER_1] Hello, welcome to the meeting.\n[00:02.80][SPEAKER_2] Thanks for having me.","type":"text"}]},{"type":"paragraph","content":[{"text":"Default file extension: ","type":"text"},{"text":".lrc","type":"text","marks":[{"type":"code_inline"}]},{"text":". Useful for music transcription, karaoke, and any workflow requiring timed text with music-player compatibility.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Speaker Diarization","type":"text"}]},{"type":"paragraph","content":[{"text":"Identifies who spoke when using ","type":"text"},{"text":"pyannote.audio","type":"text","marks":[{"type":"link","attrs":{"href":"https://github.com/pyannote/pyannote-audio","title":null}}]},{"text":".","type":"text"}]},{"type":"paragraph","content":[{"text":"Setup:","type":"text","marks":[{"type":"strong"}]}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"./setup.sh --diarize","type":"text"}]},{"type":"paragraph","content":[{"text":"Requirements:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"HuggingFace token at ","type":"text"},{"text":"~/.cache/huggingface/token","type":"text","marks":[{"type":"code_inline"}]},{"text":" (","type":"text"},{"text":"huggingface-cli 
login","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Accepted model agreements:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"https://hf.co/pyannote/speaker-diarization-3.1","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"https://hf.co/pyannote/segmentation-3.0","type":"text"}]}]}]}]}]},{"type":"paragraph","content":[{"text":"Usage:","type":"text","marks":[{"type":"strong"}]}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Basic diarization (text output)\n./scripts/transcribe meeting.wav --diarize\n\n# Diarized subtitles\n./scripts/transcribe meeting.wav --diarize --format srt -o meeting.srt\n\n# Diarized JSON (includes speakers list)\n./scripts/transcribe meeting.wav --diarize --format json","type":"text"}]},{"type":"paragraph","content":[{"text":"Speakers are labeled ","type":"text"},{"text":"SPEAKER_1","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"SPEAKER_2","type":"text","marks":[{"type":"code_inline"}]},{"text":", etc. in order of first appearance. Diarization runs on GPU automatically if CUDA is available.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Precise Word Timestamps","type":"text"}]},{"type":"paragraph","content":[{"text":"Whenever word-level timestamps are computed (","type":"text"},{"text":"--word-timestamps","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--diarize","type":"text","marks":[{"type":"code_inline"}]},{"text":", or ","type":"text"},{"text":"--min-confidence","type":"text","marks":[{"type":"code_inline"}]},{"text":"), a wav2vec2 forced alignment pass automatically refines them from Whisper's ~100-200ms accuracy to ~10ms. 
No extra flag needed.","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Word timestamps with automatic wav2vec2 alignment\n./scripts/transcribe audio.mp3 --word-timestamps --format json\n\n# Diarization also gets precise alignment automatically\n./scripts/transcribe meeting.wav --diarize\n\n# Precise subtitles\n./scripts/transcribe audio.mp3 --word-timestamps --format srt -o subtitles.srt","type":"text"}]},{"type":"paragraph","content":[{"text":"Uses the MMS (Massively Multilingual Speech) model from torchaudio — supports 1000+ languages. The model is cached after first load, so batch processing stays fast.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"URL & YouTube Input","type":"text"}]},{"type":"paragraph","content":[{"text":"Pass any URL as input — audio is downloaded automatically via yt-dlp:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# YouTube video\n./scripts/transcribe https://youtube.com/watch?v=dQw4w9WgXcQ\n\n# Direct audio URL\n./scripts/transcribe https://example.com/podcast.mp3\n\n# With options\n./scripts/transcribe https://youtube.com/watch?v=... 
--language en --format srt -o subs.srt","type":"text"}]},{"type":"paragraph","content":[{"text":"Requires ","type":"text"},{"text":"yt-dlp","type":"text","marks":[{"type":"code_inline"}]},{"text":" (checks PATH and ","type":"text"},{"text":"~/.local/share/pipx/venvs/yt-dlp/bin/yt-dlp","type":"text","marks":[{"type":"code_inline"}]},{"text":").","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Batch Processing","type":"text"}]},{"type":"paragraph","content":[{"text":"Process multiple files at once with glob patterns, directories, or multiple paths:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# All MP3s in current directory\n./scripts/transcribe *.mp3\n\n# Entire directory (auto-filters audio files)\n./scripts/transcribe ./recordings/\n\n# Output to directory (one file per input)\n./scripts/transcribe *.mp3 -o ./transcripts/\n\n# Skip already-transcribed files (resume interrupted batch)\n./scripts/transcribe *.mp3 --skip-existing -o ./transcripts/\n\n# Mixed inputs\n./scripts/transcribe file1.mp3 file2.wav ./more-recordings/\n\n# Batch SRT subtitles\n./scripts/transcribe *.mp3 --format srt -o ./subtitles/","type":"text"}]},{"type":"paragraph","content":[{"text":"When outputting to a directory, files are named ","type":"text"},{"text":"{input-stem}.{ext}","type":"text","marks":[{"type":"code_inline"}]},{"text":" (e.g., ","type":"text"},{"text":"audio.mp3","type":"text","marks":[{"type":"code_inline"}]},{"text":" → ","type":"text"},{"text":"audio.srt","type":"text","marks":[{"type":"code_inline"}]},{"text":").","type":"text"}]},{"type":"paragraph","content":[{"text":"Batch mode prints a summary after all files complete:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"📊 Done: 12 files, 3h24m audio in 10m15s (19.9× 
realtime)","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Workflows","type":"text"}]},{"type":"paragraph","content":[{"text":"End-to-end pipelines for common use cases.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Podcast Transcription Pipeline","type":"text"}]},{"type":"paragraph","content":[{"text":"Fetch and transcribe the latest 5 episodes from any podcast RSS feed:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Transcribe latest 5 episodes → one .txt per episode\n./scripts/transcribe --rss https://feeds.megaphone.fm/mypodcast -o ./transcripts/\n\n# All episodes, as SRT subtitles\n./scripts/transcribe --rss https://... --rss-latest 0 --format srt -o ./subtitles/\n\n# Skip already-done episodes (safe to re-run)\n./scripts/transcribe --rss https://... --skip-existing -o ./transcripts/\n\n# With diarization (who said what) + retry on flaky network\n./scripts/transcribe --rss https://... 
--diarize --retries 2 -o ./transcripts/","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Meeting Notes Pipeline","type":"text"}]},{"type":"paragraph","content":[{"text":"Transcribe a meeting recording with speaker labels, then output clean text:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Diarize + name speakers (replace SPEAKER_1/2 with real names)\n./scripts/transcribe meeting.wav --diarize --speaker-names \"Alice,Bob\" -o meeting.txt\n\n# Diarized JSON for post-processing (summaries, action items)\n./scripts/transcribe meeting.wav --diarize --format json -o meeting.json\n\n# Stream live while it transcribes (long meetings)\n./scripts/transcribe meeting.wav --stream","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Video Subtitle Pipeline","type":"text"}]},{"type":"paragraph","content":[{"text":"Generate ready-to-use subtitles for a video file:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# SRT subtitles with sentence merging (better readability)\n./scripts/transcribe video.mp4 --format srt --merge-sentences -o subtitles.srt\n\n# Burn subtitles directly into the video\n./scripts/transcribe video.mp4 --format srt --burn-in video_subtitled.mp4\n\n# Word-level SRT (karaoke-style), capped at 8 words per cue\n./scripts/transcribe video.mp4 --format srt --word-timestamps --max-words-per-line 8 -o subs.srt","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"YouTube Batch Pipeline","type":"text"}]},{"type":"paragraph","content":[{"text":"Transcribe multiple YouTube videos at once:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# One-liner: transcribe a playlist video + output SRT\n./scripts/transcribe \"https://youtube.com/watch?v=abc123\" --format srt -o subs.srt\n\n# Batch from a text file of URLs (one per line)\ncat urls.txt | 
xargs ./scripts/transcribe -o ./transcripts/\n\n# Download audio first, then transcribe (for re-use without re-downloading)\n./scripts/transcribe https://youtube.com/watch?v=abc123 --keep-temp","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Noisy Audio Pipeline","type":"text"}]},{"type":"paragraph","content":[{"text":"Clean up poor-quality recordings before transcribing:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Denoise + normalize, then transcribe\n./scripts/transcribe interview.mp3 --denoise --normalize -o interview.txt\n\n# Noisy batch with aggressive hallucination filtering\n./scripts/transcribe *.mp3 --denoise --filter-hallucinations -o ./out/","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Batch Recovery Pipeline","type":"text"}]},{"type":"paragraph","content":[{"text":"Process a large folder with retries — safe to re-run after failures:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Retry each failed file up to 3 times, skip already-done\n./scripts/transcribe ./recordings/ --skip-existing --retries 3 -o ./transcripts/\n\n# Check what failed (printed in batch summary at the end)\n# Re-run the same command — skips successes, retries failures","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Server Mode (OpenAI-Compatible API)","type":"text"}]},{"type":"paragraph","content":[{"text":"speaches","type":"text","marks":[{"type":"link","attrs":{"href":"https://github.com/speaches-ai/speaches","title":null}}]},{"text":" runs faster-whisper as an OpenAI-compatible ","type":"text"},{"text":"/v1/audio/transcriptions","type":"text","marks":[{"type":"code_inline"}]},{"text":" endpoint — drop-in replacement for OpenAI Whisper API with streaming, Docker support, and live transcription.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Quick start 
(Docker)","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"docker run --gpus all -p 8000:8000 ghcr.io/speaches-ai/speaches:latest-cuda","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Test it","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Transcribe a file via the API (same format as OpenAI)\ncurl http://localhost:8000/v1/audio/transcriptions \\\n -F file=@audio.mp3 \\\n -F model=Systran/faster-whisper-large-v3","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Use with any OpenAI SDK","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"python"},"content":[{"text":"from openai import OpenAI\nclient = OpenAI(base_url=\"http://localhost:8000\", api_key=\"none\")\nwith open(\"audio.mp3\", \"rb\") as f:\n result = client.audio.transcriptions.create(model=\"Systran/faster-whisper-large-v3\", file=f)\nprint(result.text)","type":"text"}]},{"type":"paragraph","content":[{"text":"Useful when you want to expose transcription as a local API for other tools (Home Assistant, n8n, custom apps).","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Common Mistakes","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Mistake","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Problem","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Solution","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Using CPU 
when GPU available","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"10-20x slower transcription","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Check ","type":"text"},{"text":"nvidia-smi","type":"text","marks":[{"type":"code_inline"}]},{"text":"; verify CUDA installation","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Not specifying language","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Wastes time auto-detecting on known content","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Use ","type":"text"},{"text":"--language en","type":"text","marks":[{"type":"code_inline"}]},{"text":" when you know the language","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Using wrong model","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Unnecessary slowness or poor accuracy","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Default ","type":"text"},{"text":"distil-large-v3.5","type":"text","marks":[{"type":"code_inline"}]},{"text":" is excellent; only use ","type":"text"},{"text":"large-v3","type":"text","marks":[{"type":"code_inline"}]},{"text":" if accuracy 
issues","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Ignoring distilled models","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Missing 6x speedup with \u003c1% accuracy loss","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Try ","type":"text"},{"text":"distil-large-v3.5","type":"text","marks":[{"type":"code_inline"}]},{"text":" before reaching for standard models","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Forgetting ffmpeg","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Setup fails or audio can't be processed","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Setup script handles this; manual installs need ffmpeg separately","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Out of memory errors","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Model too large for available VRAM/RAM","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Use smaller model, ","type":"text"},{"text":"--compute-type 
int8","type":"text","marks":[{"type":"code_inline"}]},{"text":", or ","type":"text"},{"text":"--batch-size 4","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Over-engineering beam size","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Diminishing returns past beam-size 5-7","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Default 5 is fine; try 10 for critical transcripts","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--diarize without pyannote","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Import error at runtime","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Run ","type":"text"},{"text":"setup.sh --diarize","type":"text","marks":[{"type":"code_inline"}]},{"text":" first","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--diarize without HuggingFace token","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Model download fails","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Run 
","type":"text"},{"text":"huggingface-cli login","type":"text","marks":[{"type":"code_inline"}]},{"text":" and accept model agreements","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"URL input without yt-dlp","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Download fails","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Install: ","type":"text"},{"text":"pipx install yt-dlp","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--min-confidence too high","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Drops good segments with natural pauses","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Start at 0.5, adjust up; check JSON output for probabilities","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Using --word-timestamps for basic transcription","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Adds ~5-10s overhead for negligible benefit","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Only use when 
word-level precision matters","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Batch without -o directory","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"All output mixed in stdout","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Use ","type":"text"},{"text":"-o ./transcripts/","type":"text","marks":[{"type":"code_inline"}]},{"text":" to write one file per input","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Performance Notes","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"First run","type":"text","marks":[{"type":"strong"}]},{"text":": Downloads model to ","type":"text"},{"text":"~/.cache/huggingface/","type":"text","marks":[{"type":"code_inline"}]},{"text":" (one-time)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Batched inference","type":"text","marks":[{"type":"strong"}]},{"text":": Enabled by default via ","type":"text"},{"text":"BatchedInferencePipeline","type":"text","marks":[{"type":"code_inline"}]},{"text":" — ~3x faster than standard mode; VAD on by default","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"GPU","type":"text","marks":[{"type":"strong"}]},{"text":": Automatically uses CUDA if available","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Quantization","type":"text","marks":[{"type":"strong"}]},{"text":": INT8 used on CPU for ~4x speedup with minimal accuracy loss","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Performance 
stats","type":"text","marks":[{"type":"strong"}]},{"text":": Every transcription shows audio duration, processing time, and realtime factor","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Benchmark","type":"text","marks":[{"type":"strong"}]},{"text":" (RTX 3070, 21-min file): ","type":"text"},{"text":"~24s","type":"text","marks":[{"type":"strong"}]},{"text":" with batched inference (both distil-large-v3 and v3.5) vs ~69s without","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--precise overhead","type":"text","marks":[{"type":"strong"}]},{"text":": Adds ~5-10s for wav2vec2 model load + alignment (model cached for batch)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Diarization overhead","type":"text","marks":[{"type":"strong"}]},{"text":": Adds ~10-30s depending on audio length (runs on GPU if available)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Memory","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"distil-large-v3","type":"text","marks":[{"type":"code_inline"}]},{"text":": ~2GB RAM / ~1GB VRAM","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"large-v3-turbo","type":"text","marks":[{"type":"code_inline"}]},{"text":": ~4GB RAM / ~2GB VRAM","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"tiny/base","type":"text","marks":[{"type":"code_inline"}]},{"text":": \u003c1GB RAM","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Diarization: additional ~1-2GB VRAM","type":"text"}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"OOM","type":"text","marks":[{"type":"strong"}]},{"text":": Lower 
","type":"text"},{"text":"--batch-size","type":"text","marks":[{"type":"code_inline"}]},{"text":" (try 4) if you hit out-of-memory errors","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Pre-convert to WAV","type":"text","marks":[{"type":"strong"}]},{"text":" (optional): ","type":"text"},{"text":"ffmpeg -i input.mp3 -ar 16000 -ac 1 input.wav","type":"text","marks":[{"type":"code_inline"}]},{"text":" converts to 16kHz mono WAV before transcription. Benefit is minimal (~5%) for one-off use since PyAV decodes efficiently — most useful when re-processing the same file multiple times (research/experiments) or when a format causes PyAV decode issues. Note: ","type":"text"},{"text":"--normalize","type":"text","marks":[{"type":"code_inline"}]},{"text":" and ","type":"text"},{"text":"--denoise","type":"text","marks":[{"type":"code_inline"}]},{"text":" already perform this conversion automatically.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Silero VAD V6","type":"text","marks":[{"type":"strong"}]},{"text":": faster-whisper 1.2.1 upgraded to Silero VAD V6 (improved speech detection). Run ","type":"text"},{"text":"./setup.sh --update","type":"text","marks":[{"type":"code_inline"}]},{"text":" to get it.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Batched silence removal","type":"text","marks":[{"type":"strong"}]},{"text":": faster-whisper 1.2.0+ automatically removes silence in ","type":"text"},{"text":"BatchedInferencePipeline","type":"text","marks":[{"type":"code_inline"}]},{"text":" (used by default). 
Upgrade with ","type":"text"},{"text":"./setup.sh --update","type":"text","marks":[{"type":"code_inline"}]},{"text":" to get this if you installed before August 2024.","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Why faster-whisper?","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Speed","type":"text","marks":[{"type":"strong"}]},{"text":": ~4-6x faster than OpenAI's original Whisper","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Accuracy","type":"text","marks":[{"type":"strong"}]},{"text":": Identical (uses same model weights)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Efficiency","type":"text","marks":[{"type":"strong"}]},{"text":": Lower memory usage via quantization","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Production-ready","type":"text","marks":[{"type":"strong"}]},{"text":": Stable C++ backend (CTranslate2)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Distilled models","type":"text","marks":[{"type":"strong"}]},{"text":": ~6x faster with \u003c1% accuracy loss","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Subtitles","type":"text","marks":[{"type":"strong"}]},{"text":": Native SRT/VTT/HTML output","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Precise alignment","type":"text","marks":[{"type":"strong"}]},{"text":": Automatic wav2vec2 refinement (~10ms word boundaries)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Diarization","type":"text","marks":[{"type":"strong"}]},{"text":": Optional speaker identification via pyannote; ","type":"text"},{"text":"--speaker-names","type":"text","marks":[{"type":"code_inline"}]},{"text":" maps to real 
names","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"URLs","type":"text","marks":[{"type":"strong"}]},{"text":": Direct YouTube/URL input; ","type":"text"},{"text":"--keep-temp","type":"text","marks":[{"type":"code_inline"}]},{"text":" preserves downloads for re-use","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Custom models","type":"text","marks":[{"type":"strong"}]},{"text":": Load local CTranslate2 dirs or HuggingFace repos; ","type":"text"},{"text":"--model-dir","type":"text","marks":[{"type":"code_inline"}]},{"text":" controls cache","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Quality control","type":"text","marks":[{"type":"strong"}]},{"text":": ","type":"text"},{"text":"--filter-hallucinations","type":"text","marks":[{"type":"code_inline"}]},{"text":" strips music/applause markers and duplicates","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Parallel batch","type":"text","marks":[{"type":"strong"}]},{"text":": ","type":"text"},{"text":"--parallel N","type":"text","marks":[{"type":"code_inline"}]},{"text":" for multi-threaded batch processing","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Subtitle burn-in","type":"text","marks":[{"type":"strong"}]},{"text":": ","type":"text"},{"text":"--burn-in","type":"text","marks":[{"type":"code_inline"}]},{"text":" overlays subtitles directly into video via ffmpeg","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"v1.5.0 New Features","type":"text"}]},{"type":"paragraph","content":[{"text":"Multi-format output:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--format srt,text","type":"text","marks":[{"type":"code_inline"}]},{"text":" — write multiple formats in one pass 
(e.g. SRT + plain text simultaneously)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Comma-separated list accepted: ","type":"text"},{"text":"srt,vtt,json","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"srt,text","type":"text","marks":[{"type":"code_inline"}]},{"text":", etc.","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Requires ","type":"text"},{"text":"-o \u003cdir>","type":"text","marks":[{"type":"code_inline"}]},{"text":" when writing multiple formats; single format unchanged","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Filler word removal:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--clean-filler","type":"text","marks":[{"type":"code_inline"}]},{"text":" — strip hesitation sounds (um, uh, er, ah, hmm, hm) and discourse markers (you know, I mean, you see) from transcript text; off by default","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Conservative regex matching at word boundaries to avoid false positives","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Segments that become empty after cleaning are dropped automatically","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Stereo channel selection:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--channel left|right|mix","type":"text","marks":[{"type":"code_inline"}]},{"text":" — extract a specific stereo channel before transcribing (default: mix)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Useful for dual-track recordings (interviewer on left, interviewee on 
right)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Uses ffmpeg pan filter; falls back gracefully to full mix if ffmpeg not found","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Character-based subtitle wrapping:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--max-chars-per-line N","type":"text","marks":[{"type":"code_inline"}]},{"text":" — split subtitle cues so each line fits within N characters","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Works for SRT, VTT, ASS, and TTML formats; takes priority over ","type":"text"},{"text":"--max-words-per-line","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Requires word-level timestamps; falls back to full segment if no word data","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Paragraph detection:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--detect-paragraphs","type":"text","marks":[{"type":"code_inline"}]},{"text":" — insert ","type":"text"},{"text":"\\n\\n","type":"text","marks":[{"type":"code_inline"}]},{"text":" paragraph breaks in text output at natural boundaries","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--paragraph-gap SEC","type":"text","marks":[{"type":"code_inline"}]},{"text":" — minimum silence gap for a paragraph (default: 3.0s)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Also detects paragraph breaks when the previous segment ends a sentence and gap ≥ 1.5s","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Subtitle 
formats:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--format ass","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Advanced SubStation Alpha (Aegisub, VLC, mpv, MPC-HC)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--format lrc","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Timed lyrics format for music players","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--format html","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Confidence-colored HTML transcript (green/yellow/red per word)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--format ttml","type":"text","marks":[{"type":"code_inline"}]},{"text":" — W3C TTML 1.0 (DFXP) broadcast standard (Netflix, Amazon Prime, BBC)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--format csv","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Spreadsheet-ready CSV with header row; RFC 4180 quoting; ","type":"text"},{"text":"speaker","type":"text","marks":[{"type":"code_inline"}]},{"text":" column when diarized","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Transcript tools:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--search TERM","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Find all timestamps where a word/phrase appears; replaces normal output; ","type":"text"},{"text":"-o","type":"text","marks":[{"type":"code_inline"}]},{"text":" to save","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--search-fuzzy","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Approximate/partial matching with 
","type":"text"},{"text":"--search","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--detect-chapters","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Auto-detect chapter breaks from silence gaps; ","type":"text"},{"text":"--chapter-gap SEC","type":"text","marks":[{"type":"code_inline"}]},{"text":" (default 8s)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--chapters-file PATH","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Write chapters to file instead of stdout; ","type":"text"},{"text":"--chapter-format youtube|text|json","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--export-speakers DIR","type":"text","marks":[{"type":"code_inline"}]},{"text":" — After ","type":"text"},{"text":"--diarize","type":"text","marks":[{"type":"code_inline"}]},{"text":", save each speaker's turns as separate WAV files via ffmpeg","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Batch improvements:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"ETA","type":"text","marks":[{"type":"strong"}]},{"text":" — ","type":"text"},{"text":"[N/total] filename | ETA: Xm Ys","type":"text","marks":[{"type":"code_inline"}]},{"text":" shown before each file in sequential batch; no flag needed","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--language-map \"pat=lang,...\"","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Per-file language override; fnmatch glob patterns; ","type":"text"},{"text":"@file.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" form","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--retries 
N","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Retry failed files with exponential backoff; failed-file summary at end","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--rss URL","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Transcribe podcast RSS feeds; ","type":"text"},{"text":"--rss-latest N","type":"text","marks":[{"type":"code_inline"}]},{"text":" for episode count","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--skip-existing","type":"text","marks":[{"type":"code_inline"}]},{"text":" / ","type":"text"},{"text":"--parallel N","type":"text","marks":[{"type":"code_inline"}]},{"text":" / ","type":"text"},{"text":"--output-template","type":"text","marks":[{"type":"code_inline"}]},{"text":" / ","type":"text"},{"text":"--stats-file","type":"text","marks":[{"type":"code_inline"}]},{"text":" / ","type":"text"},{"text":"--merge-sentences","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"paragraph","content":[{"text":"Model & inference:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"distil-large-v3.5","type":"text","marks":[{"type":"code_inline"}]},{"text":" default (replaced distil-large-v3)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Auto-disables ","type":"text"},{"text":"condition_on_previous_text","type":"text","marks":[{"type":"code_inline"}]},{"text":" for distil models (prevents repetition loops)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--condition-on-previous-text","type":"text","marks":[{"type":"code_inline"}]},{"text":" to override; ","type":"text"},{"text":"--log-level","type":"text","marks":[{"type":"code_inline"}]},{"text":" for library debug 
output","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--model-dir PATH","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Custom HuggingFace cache dir; local CTranslate2 model support","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--no-timestamps","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--chunk-length","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--length-penalty","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--repetition-penalty","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--no-repeat-ngram-size","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--clip-timestamps","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--stream","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--progress","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--best-of","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--patience","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--max-new-tokens","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--hotwords","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--prefix","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--revision","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"--suppress-tokens","type":"text","marks":[{"type":"code_inline"}]},{"text":", 
","type":"text"},{"text":"--max-initial-timestamp","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"paragraph","content":[{"text":"Speaker & quality:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--speaker-names \"Alice,Bob\"","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Replace SPEAKER_1/2 with real names (requires ","type":"text"},{"text":"--diarize","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--filter-hallucinations","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Remove music/applause markers, duplicates, \"Thank you for watching\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--burn-in OUTPUT","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Burn subtitles into video via ffmpeg","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"--keep-temp","type":"text","marks":[{"type":"code_inline"}]},{"text":" — Preserve URL-downloaded audio for re-processing","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Setup:","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"setup.sh --check","type":"text","marks":[{"type":"code_inline"}]},{"text":" — System diagnostic: GPU, CUDA, Python, ffmpeg, pyannote, HuggingFace token (completes in ~12s)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"ffmpeg no longer required for basic transcription (PyAV handles decoding); ","type":"text"},{"text":"skill.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" updated to reflect this (","type":"text"},{"text":"ffmpeg","type":"text","marks":[{"type":"code_inline"}]},{"text":" is now 
","type":"text"},{"text":"optionalBins","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Troubleshooting","type":"text"}]},{"type":"paragraph","content":[{"text":"\"CUDA not available — using CPU\"","type":"text","marks":[{"type":"strong"}]},{"text":": Install PyTorch with CUDA (see GPU Support above) ","type":"text"},{"text":"Setup fails","type":"text","marks":[{"type":"strong"}]},{"text":": Make sure Python 3.10+ is installed ","type":"text"},{"text":"Out of memory","type":"text","marks":[{"type":"strong"}]},{"text":": Use smaller model, ","type":"text"},{"text":"--compute-type int8","type":"text","marks":[{"type":"code_inline"}]},{"text":", or ","type":"text"},{"text":"--batch-size 4","type":"text","marks":[{"type":"code_inline"}]},{"text":" ","type":"text"},{"text":"Slow on CPU","type":"text","marks":[{"type":"strong"}]},{"text":": Expected — use GPU for practical transcription ","type":"text"},{"text":"Model download fails","type":"text","marks":[{"type":"strong"}]},{"text":": Check ","type":"text"},{"text":"~/.cache/huggingface/","type":"text","marks":[{"type":"code_inline"}]},{"text":" permissions ","type":"text"},{"text":"Diarization model fails","type":"text","marks":[{"type":"strong"}]},{"text":": Ensure HuggingFace token exists and model agreements accepted; or pass token directly with ","type":"text"},{"text":"--hf-token hf_xxx","type":"text","marks":[{"type":"code_inline"}]},{"text":" ","type":"text"},{"text":"URL download fails","type":"text","marks":[{"type":"strong"}]},{"text":": Check yt-dlp is installed (","type":"text"},{"text":"pipx install yt-dlp","type":"text","marks":[{"type":"code_inline"}]},{"text":") ","type":"text"},{"text":"No audio files in batch","type":"text","marks":[{"type":"strong"}]},{"text":": Check file extensions match supported formats ","type":"text"},{"text":"Check installed version","type":"text","marks":[{"type":"strong"}]},{"text":": 
Run ","type":"text"},{"text":"./scripts/transcribe --version","type":"text","marks":[{"type":"code_inline"}]},{"text":" ","type":"text"},{"text":"Upgrade faster-whisper","type":"text","marks":[{"type":"strong"}]},{"text":": Run ","type":"text"},{"text":"./setup.sh --update","type":"text","marks":[{"type":"code_inline"}]},{"text":" (upgrades in-place, no full reinstall) ","type":"text"},{"text":"Hallucinations on silence/music","type":"text","marks":[{"type":"strong"}]},{"text":": Try ","type":"text"},{"text":"--temperature 0.0 --no-speech-threshold 0.8","type":"text","marks":[{"type":"code_inline"}]},{"text":" ","type":"text"},{"text":"VAD splits speech incorrectly","type":"text","marks":[{"type":"strong"}]},{"text":": Tune with ","type":"text"},{"text":"--vad-threshold 0.3","type":"text","marks":[{"type":"code_inline"}]},{"text":" (lower) or ","type":"text"},{"text":"--min-silence-duration 300","type":"text","marks":[{"type":"code_inline"}]},{"text":" ","type":"text"},{"text":"Improve speech detection","type":"text","marks":[{"type":"strong"}]},{"text":": Run ","type":"text"},{"text":"./setup.sh --update","type":"text","marks":[{"type":"code_inline"}]},{"text":" to upgrade faster-whisper to the latest version (includes Silero VAD V6).","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"References","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"faster-whisper GitHub","type":"text","marks":[{"type":"link","attrs":{"href":"https://github.com/SYSTRAN/faster-whisper","title":null}}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Distil-Whisper Paper","type":"text","marks":[{"type":"link","attrs":{"href":"https://arxiv.org/abs/2311.00430","title":null}}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"HuggingFace 
Models","type":"text","marks":[{"type":"link","attrs":{"href":"https://huggingface.co/collections/Systran/faster-whisper","title":null}}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"pyannote.audio","type":"text","marks":[{"type":"link","attrs":{"href":"https://github.com/pyannote/pyannote-audio","title":null}}]},{"text":" (diarization)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"yt-dlp","type":"text","marks":[{"type":"link","attrs":{"href":"https://github.com/yt-dlp/yt-dlp","title":null}}]},{"text":" (URL/YouTube download)","type":"text"}]}]}]},{"type":"hr","attrs":{"markup":"---"}}]},"metadata":{"date":"2026-06-05","name":"faster-whisper","tags":["audio","transcription","whisper","speech-to-text","ml","cuda","gpu","subtitles","diarization","podcast","chapters","search","csv","ttml","batch"],"author":"@skillopedia","source":{"stars":7,"repo_name":"faster-whisper","origin_url":"https://github.com/theplasmak/faster-whisper/blob/HEAD/SKILL.md","repo_owner":"theplasmak","body_sha256":"917d850c6fbb6d688b367ff6953dc73a0a5ab9ebcee7fd1f0cf1698d6cf8f62a","cluster_key":"af844ba06bd967a455d0d0a051048e06117ffabc583240cf01b744c45da944bb","clean_bundle":{"format":"clean-skill-bundle-v1","source":"theplasmak/faster-whisper/SKILL.md","attachments":[{"id":"17ed4f63-5992-57e9-a07c-5360f2bacac6","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/17ed4f63-5992-57e9-a07c-5360f2bacac6/attachment","path":".clawdhubignore","size":75,"sha256":"12408156788b212cdb58566a8b7ffb75dd3309969ba6e604baa30a26c0b05bc6","contentType":"text/plain; charset=utf-8"},{"id":"e488691b-e107-5024-974a-3204ce25bdee","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/e488691b-e107-5024-974a-3204ce25bdee/attachment","path":".gitignore","size":191,"sha256":"e8a7a8f7b548a900d3e97d47935c7c8b1baf1f604f102c810cb1a64992c2c39d","contentType":"text/plain; 
charset=utf-8"},{"id":"3f0e6cf7-cd26-512f-a5ad-d918664f0f78","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/3f0e6cf7-cd26-512f-a5ad-d918664f0f78/attachment.md","path":"CHANGELOG.md","size":10556,"sha256":"471cf3201ffb5fc914fea634ed73668c32fc442de30495bded042c2d9a595116","contentType":"text/markdown; charset=utf-8"},{"id":"3edea7de-ada8-5503-b8d7-0b17a75f7733","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/3edea7de-ada8-5503-b8d7-0b17a75f7733/attachment.md","path":"README.md","size":24638,"sha256":"e4e2585a19a1abd693078f26b51b95ab030af104af32241cfeab0294dddb7b29","contentType":"text/markdown; charset=utf-8"},{"id":"58ab7c5a-e61d-5b67-9780-02edf74e5135","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/58ab7c5a-e61d-5b67-9780-02edf74e5135/attachment.txt","path":"requirements.txt","size":117,"sha256":"db500dda68105d00efa2be011ba9beb7efa27a181d13cbb024eb671ecca7297d","contentType":"text/plain; charset=utf-8"},{"id":"3674319d-4d8b-5cb1-944b-d657d9c9d5d3","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/3674319d-4d8b-5cb1-944b-d657d9c9d5d3/attachment","path":"scripts/transcribe","size":881,"sha256":"d2aef8b922dc329a260df1c2b9c4df66ce90edc3772f7f74c19ed294b7c4cb89","contentType":"text/plain; charset=utf-8"},{"id":"075b98b5-bc68-5f55-8b7c-8a18a972c3c7","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/075b98b5-bc68-5f55-8b7c-8a18a972c3c7/attachment.cmd","path":"scripts/transcribe.cmd","size":921,"sha256":"37bca3cb69fc725ce7ec536db15b90cfcfbb2ad8a8dfe7051420ab0518b6aef3","contentType":"text/plain; charset=utf-8"},{"id":"8e158071-9694-5931-b0a2-427f8915e435","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/8e158071-9694-5931-b0a2-427f8915e435/attachment.ps1","path":"scripts/transcribe.ps1","size":2437,"sha256":"da6f60767714f09ec0d82afa8316c885c16adea5ecbbd0450ee7198c738aec9c","contentType":"text/plain; 
charset=utf-8"},{"id":"3d42bf93-e4d3-5539-8266-2f2c477be36f","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/3d42bf93-e4d3-5539-8266-2f2c477be36f/attachment.py","path":"scripts/transcribe.py","size":114264,"sha256":"3c313601d0aa3586eb63e6e8dd45fdca65481151fdf247aee07db8da3dbef32a","contentType":"text/x-python; charset=utf-8"},{"id":"4ac0ab17-55e1-5a21-a3ac-cfc9616d1786","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/4ac0ab17-55e1-5a21-a3ac-cfc9616d1786/attachment.ps1","path":"setup.ps1","size":11054,"sha256":"659317cb0d4b1c154ddca3ae69e4da51ec1d16dc67c6cd436b60bba3e36acf12","contentType":"text/plain; charset=utf-8"},{"id":"03bc2dd3-f374-58a2-ba62-869afa120f9e","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/03bc2dd3-f374-58a2-ba62-869afa120f9e/attachment.sh","path":"setup.sh","size":12721,"sha256":"a678dfd560c09e8084e0fcc84c285d775d31365ce2dcbb51daf808dc48179870","contentType":"application/x-sh; charset=utf-8"},{"id":"75f0a6b8-2eb7-5a8a-abbb-6fd378a42df1","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/75f0a6b8-2eb7-5a8a-abbb-6fd378a42df1/attachment.json","path":"skill.json","size":599,"sha256":"bffcbb78c3010933dd97bb6606b876108ddd2d8c69e426c7c4a8cf4efdc42974","contentType":"application/json; 
charset=utf-8"}],"bundle_sha256":"9a54cf5832a6d71aef915d33ef1f0d95d0f3e46006d9ac48b5107e9c15ce7cd2","attachment_count":12,"text_attachments":11,"attachment_storage":"skillopedia-attachments-v1","binary_attachments":1,"excluded_attachments":[]},"cluster_size":1,"skill_md_path":"SKILL.md","import_metadata":{"date":"2026-06-05","author":"@skillopedia","version":"v1","category":"data-analytics","category_label":"Data"},"exact_dupes_collapsed_into_this":0},"version":"v1","category":"data-analytics","homepage":"https://github.com/ThePlasmak/faster-whisper","metadata":{"openclaw":{"emoji":"🗣️","requires":{"bins":["python3"],"optionalBins":["ffmpeg","yt-dlp"],"optionalPaths":["~/.cache/huggingface/token"]}}},"platforms":["linux","macos","wsl2"],"import_tag":"clean-skills-v1","description":"Local speech-to-text using faster-whisper. 4-6x faster than OpenAI Whisper with identical accuracy; GPU acceleration enables ~20x realtime transcription. SRT/VTT/TTML/CSV subtitles, speaker diarization, URL/YouTube input, batch processing with ETA, transcript search, chapter detection, per-file language map."}},"renderedAt":1782982055670}

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.