抓取链接内容 & 转存知识库 🎬 视频/音频上传到乐享 :必须用 (走 OpenAPI )。 不要 用 MCP 的 或 ——它们产出 的条目,不触发 VOD 转码,视频无法播放。详见下方「YouTube 视频处理 → Step 2:上传到乐享知识库」章节。凭证存放于 (不进 git)。 概述 将文章 URL(免费/付费/登录墙)抓取为结构化 Markdown,并自动转存到乐享知识库,实现素材归档和可追溯。 最终产出物 1. — 完整文章 Markdown(含图片引用) 2. — 结构化元信息(原文链接、作者、发布时间、抓取时间等) 3. — 所有文章配图 4. 乐享知识库中的文档副本(按天维度归档) 乐享文档链接格式(⚠️ 必须遵守) 转存完成后, 必须 按以下格式输出可点击访问的链接: - : 或 返回的 - :固定值 (凡哥的企业 ID,不可省略,省略后链接无法访问) - 禁止 使用 格式——这是 MCP 内部调试链接,用户无法直接访问 文件命名规则(重要) - 必须使用原文标题命名 ,不要用 等通用名称 - 文件名格式: 、 - 示例: 、 - 如果标题中包含文件名不合法字符( 、 、 等),替换为 - 乐享知识库转存时也使用原文标题作为文档标题 工作流程 Step 1:素材收集 抓取方式决策树 根据 URL 类型选择抓取方式(按优先级排列): 1. claude.com…

, stripped):\n blocks.append({'type': 'hr'})\n i += 1\n continue\n\n h_match = re.match(r'^(#{1,3})\\s+(.*)', stripped)\n if h_match:\n level = len(h_match.group(1))\n text = h_match.group(2).strip()\n # 处理 \"### #\\n实际标题文字\" 被拆行的情况\n if text == '#' or not text:\n # 下一行可能是标题的实际文字\n if i + 1 \u003c len(lines) and lines[i + 1].strip() and not lines[i + 1].strip().startswith('#'):\n next_text = lines[i + 1].strip()\n if next_text and not re.match(r'^!\\[', next_text) and not re.match(r'^>\\s', next_text):\n blocks.append({'type': f'h{level}', 'text': next_text})\n i += 2\n continue\n # 单独的 \"### #\" 且下一行不是标题文字,跳过\n i += 1\n continue\n blocks.append({'type': f'h{level}', 'text': text})\n i += 1\n continue\n\n img_match = re.match(r'^!\\[([^\\]]*)\\]\\(([^)]+)\\)', stripped)\n if img_match:\n blocks.append({'type': 'image', 'alt': img_match.group(1), 'src': img_match.group(2)})\n i += 1\n continue\n\n if stripped.startswith('*') and stripped.endswith('*') and not stripped.startswith('**'):\n blocks.append({'type': 'caption', 'text': stripped.strip('*')})\n i += 1\n continue\n\n if stripped.startswith('>'):\n quote_lines = []\n while i \u003c len(lines) and lines[i].strip().startswith('>'):\n quote_lines.append(lines[i].strip().lstrip('>').strip())\n i += 1\n blocks.append({'type': 'blockquote', 'text': ' '.join(quote_lines)})\n continue\n\n ol_match = re.match(r'^(\\d+)\\.\\s+(.*)', stripped)\n if ol_match:\n blocks.append({'type': 'list_item', 'number': ol_match.group(1), 'text': ol_match.group(2)})\n i += 1\n continue\n\n if stripped.startswith('- ') or stripped.startswith('* '):\n blocks.append({'type': 'list_item', 'number': '•', 'text': stripped[2:]})\n i += 1\n continue\n\n # 导航链接行跳过\n if re.match(r'^\\[.+\\]\\(.+\\)\\s*/\\s*\\[.+\\]\\(.+\\)', stripped):\n i += 1\n continue\n\n # 普通段落\n para_lines = [stripped]\n i += 1\n while i \u003c len(lines) and lines[i].strip() and not lines[i].strip().startswith('#') \\\n and not lines[i].strip().startswith('![') and not 
lines[i].strip().startswith('>') \\\n and not re.match(r'^-{3,}

抓取链接内容 & 转存知识库 🎬 视频/音频上传到乐享 :必须用 (走 OpenAPI )。 不要 用 MCP 的 或 ——它们产出 的条目,不触发 VOD 转码,视频无法播放。详见下方「YouTube 视频处理 → Step 2:上传到乐享知识库」章节。凭证存放于 (不进 git)。 概述 将文章 URL(免费/付费/登录墙)抓取为结构化 Markdown,并自动转存到乐享知识库,实现素材归档和可追溯。 最终产出物 1. — 完整文章 Markdown(含图片引用) 2. — 结构化元信息(原文链接、作者、发布时间、抓取时间等) 3. — 所有文章配图 4. 乐享知识库中的文档副本(按天维度归档) 乐享文档链接格式(⚠️ 必须遵守) 转存完成后, 必须 按以下格式输出可点击访问的链接: - : 或 返回的 - :固定值 (凡哥的企业 ID,不可省略,省略后链接无法访问) - 禁止 使用 格式——这是 MCP 内部调试链接,用户无法直接访问 文件命名规则(重要) - 必须使用原文标题命名 ,不要用 等通用名称 - 文件名格式: 、 - 示例: 、 - 如果标题中包含文件名不合法字符( 、 、 等),替换为 - 乐享知识库转存时也使用原文标题作为文档标题 工作流程 Step 1:素材收集 抓取方式决策树 根据 URL 类型选择抓取方式(按优先级排列): 1. claude.com…

, lines[i].strip()) \\\n and not re.match(r'^\\d+\\.\\s+', lines[i].strip()):\n para_lines.append(lines[i].strip())\n i += 1\n blocks.append({'type': 'paragraph', 'text': ' '.join(para_lines)})\n\n return blocks\n\n\ndef clean_text(text: str) -> str:\n \"\"\"清理 Markdown 行内格式。\"\"\"\n text = re.sub(r'\\[([^\\]]+)\\]\\([^)]+\\)', r'\\1', text)\n text = re.sub(r'\\*\\*(.+?)\\*\\*', r'\\1', text)\n text = re.sub(r'\\*(.+?)\\*', r'\\1', text)\n text = re.sub(r'`(.+?)`', r'\\1', text)\n return text\n\n\ndef get_image_path(src: str, md_dir: str):\n \"\"\"解析图片路径。\"\"\"\n if src.startswith(('http://', 'https://')):\n return None\n abs_path = os.path.normpath(os.path.join(md_dir, src))\n return abs_path if os.path.isfile(abs_path) else None\n\n\nclass PdfWriter:\n \"\"\"PDF 写入器,处理分页和中文字体。\"\"\"\n\n def __init__(self):\n self.doc = pymupdf.open()\n self.font = pymupdf.Font(\"china-s\") # 内置简体中文字体\n self.font_bold = pymupdf.Font(\"china-ss\") # 内置简体中文粗体\n self.page = None\n self.y = MARGIN_TOP\n self._new_page()\n\n def _new_page(self):\n self.page = self.doc.new_page(width=PAGE_WIDTH, height=PAGE_HEIGHT)\n self.y = MARGIN_TOP\n\n def _ensure_space(self, needed: float):\n \"\"\"确保页面剩余空间足够,不够则换页。\"\"\"\n if self.y + needed > PAGE_HEIGHT - MARGIN_BOTTOM:\n self._new_page()\n\n def _text_width(self, text: str, font, fontsize: float) -> float:\n \"\"\"计算文本宽度。\"\"\"\n return font.text_length(text, fontsize=fontsize)\n\n def _wrap_text(self, text: str, font, fontsize: float, max_width: float) -> list:\n \"\"\"将文本按宽度限制分行。\"\"\"\n lines = []\n current = \"\"\n for ch in text:\n test = current + ch\n if self._text_width(test, font, fontsize) > max_width and current:\n lines.append(current)\n current = ch\n else:\n current = test\n if current:\n lines.append(current)\n return lines if lines else [\"\"]\n\n def write_text(self, text: str, fontsize: float, bold: bool = False,\n indent: float = 0, color: tuple = (0, 0, 0),\n max_width: float = None) -> float:\n \"\"\"写入文本,自动换行和分页。返回结束 y 
坐标。\"\"\"\n font = self.font_bold if bold else self.font\n if max_width is None:\n max_width = CONTENT_WIDTH - indent\n x = MARGIN_LEFT + indent\n line_height = fontsize * LINE_HEIGHT_FACTOR\n\n wrapped = self._wrap_text(text, font, fontsize, max_width)\n\n for line_text in wrapped:\n self._ensure_space(line_height)\n tw = pymupdf.TextWriter(self.page.rect)\n tw.append(pymupdf.Point(x, self.y + fontsize), line_text, font=font, fontsize=fontsize)\n tw.write_text(self.page, color=color)\n self.y += line_height\n\n return self.y\n\n def draw_hr(self):\n \"\"\"绘制水平分割线。\"\"\"\n self._ensure_space(15)\n self.y += 5\n self.page.draw_line(\n pymupdf.Point(MARGIN_LEFT, self.y),\n pymupdf.Point(PAGE_WIDTH - MARGIN_RIGHT, self.y),\n color=(0.7, 0.7, 0.7), width=0.5\n )\n self.y += 10\n\n def draw_blockquote_bar(self, start_y: float, end_y: float):\n \"\"\"绘制引用条。\"\"\"\n self.page.draw_line(\n pymupdf.Point(MARGIN_LEFT + 5, start_y),\n pymupdf.Point(MARGIN_LEFT + 5, end_y),\n color=(0.6, 0.6, 0.6), width=2\n )\n\n def insert_image(self, img_path: str):\n \"\"\"插入图片,自动处理缩放和分页。\"\"\"\n try:\n pix = pymupdf.Pixmap(img_path)\n img_w = pix.width\n img_h = pix.height\n pix = None\n\n max_h = PAGE_HEIGHT - MARGIN_TOP - MARGIN_BOTTOM - 30\n scale = min(CONTENT_WIDTH / img_w, max_h / img_h, 1.0)\n display_w = img_w * scale\n display_h = img_h * scale\n\n remaining = PAGE_HEIGHT - MARGIN_BOTTOM - self.y\n if remaining \u003c min(display_h, 100):\n self._new_page()\n\n x_offset = MARGIN_LEFT + (CONTENT_WIDTH - display_w) / 2\n rect = pymupdf.Rect(x_offset, self.y, x_offset + display_w, self.y + display_h)\n self.page.insert_image(rect, filename=img_path)\n self.y += display_h + 8\n except Exception as e:\n print(f\" ⚠️ 插入图片失败 ({os.path.basename(img_path)}): {e}\")\n # 回退:尝试直接用文件名插入\n try:\n img_rect = pymupdf.Rect(MARGIN_LEFT, self.y, PAGE_WIDTH - MARGIN_RIGHT, self.y + 200)\n self.page.insert_image(img_rect, filename=img_path)\n self.y += 208\n except Exception:\n pass\n\n def 
add_blank(self, height: float = None):\n if height is None:\n height = FONT_SIZE_BODY * 0.5\n self.y += height\n if self.y > PAGE_HEIGHT - MARGIN_BOTTOM:\n self._new_page()\n\n def save(self, output_path: str):\n self.doc.save(output_path)\n self.doc.close()\n size_kb = os.path.getsize(output_path) / 1024\n print(f\"✅ PDF 已生成: {output_path} ({size_kb:.0f}KB)\")\n\n\ndef md_to_pdf(md_path: str, output_path: str = None) -> str:\n \"\"\"将 Markdown 文件转换为 PDF。\"\"\"\n if not os.path.isfile(md_path):\n raise FileNotFoundError(f\"文件不存在: {md_path}\")\n\n md_dir = os.path.dirname(os.path.abspath(md_path))\n if output_path is None:\n output_path = os.path.splitext(md_path)[0] + '.pdf'\n\n with open(md_path, 'r', encoding='utf-8') as f:\n md_text = f.read()\n\n blocks = parse_markdown(md_text)\n writer = PdfWriter()\n seen_title = False\n\n # 检查 blocks 中是否有有效的 h1 标题\n has_h1 = any(b['type'] == 'h1' and b.get('text', '').strip() for b in blocks)\n\n # 如果没有 h1 标题,尝试从 article_meta.json 获取\n fallback_title = None\n if not has_h1:\n meta_path = os.path.join(md_dir, 'article_meta.json')\n if os.path.isfile(meta_path):\n try:\n import json\n with open(meta_path, 'r', encoding='utf-8') as f:\n meta = json.load(f)\n fallback_title = meta.get('title', '').strip()\n if fallback_title:\n print(f\" ℹ️ MD 中无有效标题,从 article_meta.json 补充: {fallback_title}\")\n except Exception:\n pass\n\n # 如果需要补充标题,在开头插入 h1 块\n if fallback_title:\n blocks.insert(0, {'type': 'h1', 'text': fallback_title})\n\n for block in blocks:\n btype = block['type']\n\n if btype == 'blank':\n writer.add_blank()\n continue\n\n if btype == 'hr':\n writer.draw_hr()\n continue\n\n if btype in ('h1', 'h2', 'h3'):\n text = clean_text(block['text'])\n sizes = {'h1': FONT_SIZE_H1, 'h2': FONT_SIZE_H2, 'h3': FONT_SIZE_H3}\n fsize = sizes[btype]\n\n if btype == 'h1':\n if seen_title:\n continue\n seen_title = True\n\n writer.add_blank(fsize * 0.5)\n writer._ensure_space(fsize * 2)\n writer.write_text(text, fsize, bold=True)\n 
writer.add_blank(fsize * 0.3)\n continue\n\n if btype == 'paragraph':\n text = clean_text(block['text'])\n if not text.strip():\n continue\n writer._ensure_space(FONT_SIZE_BODY * 2)\n writer.write_text(text, FONT_SIZE_BODY)\n writer.add_blank(FONT_SIZE_BODY * 0.4)\n continue\n\n if btype == 'blockquote':\n text = clean_text(block['text'])\n if not text.strip():\n continue\n writer._ensure_space(FONT_SIZE_BLOCKQUOTE * 2)\n start_y = writer.y\n writer.write_text(text, FONT_SIZE_BLOCKQUOTE, indent=15, color=(0.3, 0.3, 0.3))\n writer.draw_blockquote_bar(start_y, writer.y)\n writer.add_blank(FONT_SIZE_BLOCKQUOTE * 0.4)\n continue\n\n if btype == 'list_item':\n text = clean_text(block['text'])\n number = block.get('number', '•')\n prefix = f\"{number}. \" if number != '•' else \"• \"\n writer._ensure_space(FONT_SIZE_BODY * 2)\n\n indent = writer._text_width(prefix, writer.font, FONT_SIZE_BODY) + 4\n # 写前缀\n tw = pymupdf.TextWriter(writer.page.rect)\n tw.append(pymupdf.Point(MARGIN_LEFT, writer.y + FONT_SIZE_BODY),\n prefix, font=writer.font, fontsize=FONT_SIZE_BODY)\n tw.write_text(writer.page, color=(0, 0, 0))\n # 写正文(带缩进)\n writer.write_text(text, FONT_SIZE_BODY, indent=indent)\n writer.add_blank(FONT_SIZE_BODY * 0.3)\n continue\n\n if btype == 'image':\n img_path = get_image_path(block['src'], md_dir)\n if img_path:\n writer.insert_image(img_path)\n continue\n\n if btype == 'caption':\n text = clean_text(block['text'])\n writer._ensure_space(FONT_SIZE_CAPTION * 2)\n writer.write_text(text, FONT_SIZE_CAPTION, color=(0.4, 0.4, 0.4))\n writer.add_blank(FONT_SIZE_CAPTION * 0.4)\n continue\n\n writer.save(output_path)\n return output_path\n\n\ndef main():\n parser = argparse.ArgumentParser(description='将 Markdown 文件转换为 PDF(嵌入本地图片)')\n parser.add_argument('md_path', help='Markdown 文件路径')\n parser.add_argument('--output', '-o', help='输出 PDF 路径(默认同目录同名 .pdf)')\n args = parser.parse_args()\n\n try:\n output = md_to_pdf(args.md_path, args.output)\n print(f\"🎉 完成: {output}\")\n 
except Exception as e:\n print(f\"❌ 转换失败: {e}\")\n sys.exit(1)\n\n\nif __name__ == '__main__':\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":14347,"content_sha256":"13007afcb1e9ab07db5387ea1376c0396dcb544a8ed82411f2d6cf179087c356"},{"filename":"scripts/translate_gemini.py","content":"#!/usr/bin/env python3\n\"\"\"用 Gemini API 将英文 Markdown 翻译为中英对照格式\"\"\"\nimport requests, json, re, os, sys, time\n\nAPI_KEY = os.environ.get(\"GEMINI_API_KEY\", \"\")\nMODEL = \"gemini-2.5-flash\"\nURL = f\"https://generativelanguage.googleapis.com/v1beta/models/{MODEL}:generateContent?key={API_KEY}\"\n\ndef translate_chunk(text):\n \"\"\"翻译一段英文为中英对照\"\"\"\n prompt = \"\"\"你是专业翻译。将以下英文 Markdown 翻译为中英对照格式。规则:\n1. 每段英文原文后紧跟对应的中文翻译,之间用一个空行分隔\n2. 保留所有 Markdown 格式(标题、链接、引用、列表等)\n3. 标题也翻译:原文标题后加中文标题\n4. 翻译要自然流畅,专业术语保留英文并附中文注释\n5. 不加分隔线、不加国旗emoji\n6. 保留原文中的图片引用 ![](...)\n\n英文原文:\n\"\"\" + text\n\n payload = {\n \"contents\": [{\"parts\": [{\"text\": prompt}]}],\n \"generationConfig\": {\"temperature\": 0.3, \"maxOutputTokens\": 16000}\n }\n \n for attempt in range(3):\n try:\n resp = requests.post(URL, json=payload, timeout=120)\n if resp.status_code == 200:\n data = resp.json()\n return data[\"candidates\"][0][\"content\"][\"parts\"][0][\"text\"]\n elif resp.status_code == 429:\n print(f\" Rate limited, waiting 30s...\")\n time.sleep(30)\n else:\n print(f\" Error {resp.status_code}: {resp.text[:200]}\")\n time.sleep(5)\n except Exception as e:\n print(f\" Exception: {e}\")\n time.sleep(5)\n return None\n\ndef main():\n if len(sys.argv) \u003c 2:\n print(\"Usage: python3 translate_gemini.py \u003cinput_md> [output_md]\")\n sys.exit(1)\n \n src = sys.argv[1]\n dst = sys.argv[2] if len(sys.argv) > 2 else src.replace(\".md\", \"_translated.md\")\n \n with open(src, \"r\") as f:\n content = f.read()\n \n # 按段落分组,每组 ~4000 字符\n paragraphs = content.split(\"\\n\\n\")\n chunks = []\n current = \"\"\n for p in paragraphs:\n if len(current) + len(p) + 2 > 4000:\n 
chunks.append(current)\n current = p\n else:\n current = current + \"\\n\\n\" + p if current else p\n if current:\n chunks.append(current)\n \n print(f\"Split into {len(chunks)} chunks for translation\")\n \n translated_parts = []\n for i, chunk in enumerate(chunks):\n print(f\" [{i+1}/{len(chunks)}] Translating {len(chunk)} chars...\", end=\" \", flush=True)\n result = translate_chunk(chunk)\n if result:\n translated_parts.append(result)\n print(\"OK\")\n else:\n print(\"FAILED - keeping original\")\n translated_parts.append(chunk)\n time.sleep(2) # Rate limit\n \n output = \"\\n\\n\".join(translated_parts)\n \n with open(dst, \"w\") as f:\n f.write(output)\n \n print(f\"\\nDone! Translated file: {dst}\")\n print(f\"Original: {len(content)} chars -> Translated: {len(output)} chars\")\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":3111,"content_sha256":"1f84072010e83c1ee67986b7c93a04c03c514586ac532fd5258d07ede4210daf"},{"filename":"scripts/upload_video_via_openapi.py","content":"#!/usr/bin/env python3\n\"\"\"\n通过乐享 OpenAPI 上传视频/音频到知识库,产生可播放的 entry_type=video 条目。\n\n✅ 已验证可行(2026-05-03):视频上传后立即可播放,且挂在指定目录下。\n\n正确的 3 步流程(腾讯内部乐享 lexiang.tencent.com 文档):\n 1. POST /cgi-bin/v1/kb/files/upload-params → 获取 VOD 上传签名 + state\n body: {\"name\": \"xxx.mp4\", \"media_type\": \"video\"}\n 2. PUT \u003cbucket>.cos.\u003cregion>.myqcloud.com/\u003ckey> → 上传文件到 VOD COS\n 3. 
POST /cgi-bin/v1/kb/entries?space_id=xxx&state=xxx → 创建 entry_type=video 节点\n body: {\"data\": {\"attributes\": {\"name\": \"xxx.mp4\", \"entry_type\": \"video\"},\n \"relationships\": {\"parent_entry\": {...}}}}\n\n🚨 关键细节(踩坑后总结,不改):\n - 签名接口必须用 /cgi-bin/v1/kb/files/upload-params(**不是** /cgi-bin/v1/docs/cos-param)\n - media_type 有三种:video / audio / file(docs/cos-param 只支持 attachment/file)\n - 签名接口的 name 和创建 entry 的 name **必须带文件后缀**(否则报\"name需指定文件后缀\")\n - kb/entries 接口用 **x-staff-id**(小写,带连字符),不是 StaffID\n - parent_entry 指向目标 page/folder 的 entry_id,不写则挂到 space 根目录\n\n凭证存放(不进 git):\n ~/.lexiang/openapi.json (AppKey / AppSecret / StaffID)\n ~/.lexiang/token_cache.json (access_token 缓存,2h 有效)\n\n用法:\n python3 upload_video_via_openapi.py \u003c本地视频> \\\\\n --space-id \u003c知识库 space_id> \\\\\n --parent-entry-id \u003c父节点 entry_id> \\\\\n [--name \"视频标题.mp4\"] # 必须带扩展名;不填则用本地文件名\n [--media-type video|audio|file] # 默认 video\n\n返回:\n entry_id(乐享知识节点 id)\n entry_type(应为 video)\n\"\"\"\nfrom __future__ import annotations\n\nimport argparse\nimport json\nimport os\nimport sys\nimport time\nimport urllib.parse\nimport urllib.request\nfrom typing import Any\n\n\nCONFIG_PATH = os.path.expanduser(\"~/.lexiang/openapi.json\")\nTOKEN_CACHE_PATH = os.path.expanduser(\"~/.lexiang/token_cache.json\")\n\n\ndef load_config() -> dict:\n if not os.path.exists(CONFIG_PATH):\n raise FileNotFoundError(\n f\"凭证文件不存在: {CONFIG_PATH}\\n\"\n f\"需要字段: app_key, app_secret, staff_id, api_base\"\n )\n with open(CONFIG_PATH) as f:\n return json.load(f)\n\n\ndef http_json(\n url: str,\n *,\n method: str = \"POST\",\n headers: dict | None = None,\n json_body: Any = None,\n form_body: dict | None = None,\n timeout: int = 120,\n) -> dict:\n data = None\n hdrs = dict(headers or {})\n if json_body is not None:\n data = json.dumps(json_body).encode()\n hdrs.setdefault(\"Content-Type\", \"application/json; charset=utf-8\")\n elif form_body is not None:\n data = urllib.parse.urlencode(form_body).encode()\n 
hdrs.setdefault(\"Content-Type\", \"application/x-www-form-urlencoded\")\n\n req = urllib.request.Request(url, data=data, method=method, headers=hdrs)\n try:\n with urllib.request.urlopen(req, timeout=timeout) as resp:\n body = resp.read().decode()\n except urllib.error.HTTPError as e:\n err_body = e.read().decode()\n raise RuntimeError(f\"HTTP {e.code} {url}\\n响应: {err_body[:500]}\") from e\n try:\n return json.loads(body)\n except json.JSONDecodeError:\n return {\"_raw\": body}\n\n\ndef get_access_token(cfg: dict, *, force_refresh: bool = False) -> str:\n if not force_refresh and os.path.exists(TOKEN_CACHE_PATH):\n try:\n with open(TOKEN_CACHE_PATH) as f:\n cache = json.load(f)\n if cache.get(\"expires_at\", 0) > int(time.time()):\n return cache[\"access_token\"]\n except Exception:\n pass\n\n url = f\"{cfg['api_base']}/cgi-bin/token\"\n result = http_json(\n url,\n form_body={\n \"grant_type\": \"client_credentials\",\n \"app_key\": cfg[\"app_key\"],\n \"app_secret\": cfg[\"app_secret\"],\n },\n )\n if \"access_token\" not in result:\n raise RuntimeError(f\"换取 access_token 失败: {result}\")\n\n cache = {\n \"access_token\": result[\"access_token\"],\n \"expires_at\": int(time.time()) + result[\"expires_in\"] - 300,\n }\n os.makedirs(os.path.dirname(TOKEN_CACHE_PATH), exist_ok=True)\n with open(TOKEN_CACHE_PATH, \"w\") as f:\n json.dump(cache, f)\n os.chmod(TOKEN_CACHE_PATH, 0o600)\n return result[\"access_token\"]\n\n\ndef apply_upload_params(\n *,\n api_base: str,\n access_token: str,\n staff_id: str,\n filename: str,\n media_type: str,\n) -> dict:\n \"\"\"Step 1: 获取 kb/files 的上传签名。\"\"\"\n url = f\"{api_base}/cgi-bin/v1/kb/files/upload-params\"\n headers = {\n \"Authorization\": f\"Bearer {access_token}\",\n \"x-staff-id\": staff_id,\n }\n result = http_json(\n url,\n headers=headers,\n json_body={\"name\": filename, \"media_type\": media_type},\n )\n if \"object\" not in result or \"state\" not in result[\"object\"]:\n raise RuntimeError(f\"upload-params 失败: 
{result}\")\n return result\n\n\ndef put_to_cos(*, local_file: str, params: dict) -> None:\n \"\"\"Step 2: PUT 文件到腾讯云 COS。\"\"\"\n bucket = params[\"options\"][\"Bucket\"]\n region = params[\"options\"][\"Region\"]\n key = params[\"object\"][\"key\"]\n auth = params[\"object\"][\"auth\"]\n file_headers = params[\"object\"].get(\"headers\", {}) or {}\n\n url = f\"https://{bucket}.cos.{region}.myqcloud.com/{key}\"\n with open(local_file, \"rb\") as f:\n data = f.read()\n\n req_headers = {\n \"Authorization\": auth[\"Authorization\"],\n \"x-cos-security-token\": auth[\"XCosSecurityToken\"],\n }\n for k, v in file_headers.items():\n req_headers[k] = v\n\n print(f\" bucket: {bucket}\")\n print(f\" region: {region}\")\n print(f\" key: {key[:70]}...\")\n print(f\" size: {len(data) / 1024 / 1024:.1f} MB\")\n\n req = urllib.request.Request(url, data=data, method=\"PUT\", headers=req_headers)\n start = time.time()\n try:\n with urllib.request.urlopen(req, timeout=600) as resp:\n etag = resp.headers.get(\"ETag\", \"(none)\")\n print(f\" ✓ HTTP {resp.status}, ETag {etag}, 耗时 {time.time()-start:.1f}s\")\n except urllib.error.HTTPError as e:\n raise RuntimeError(\n f\"COS PUT 失败 HTTP {e.code}: {e.read().decode()[:500]}\"\n ) from e\n\n\ndef create_kb_entry(\n *,\n api_base: str,\n access_token: str,\n staff_id: str,\n space_id: str,\n state: str,\n name: str,\n entry_type: str,\n parent_entry_id: str | None = None,\n) -> dict:\n \"\"\"Step 3: 创建知识节点(entry_type=video/audio/file)。\"\"\"\n url = (\n f\"{api_base}/cgi-bin/v1/kb/entries\"\n f\"?space_id={urllib.parse.quote(space_id)}\"\n f\"&state={urllib.parse.quote(state)}\"\n )\n headers = {\n \"Authorization\": f\"Bearer {access_token}\",\n \"x-staff-id\": staff_id,\n }\n body: dict = {\n \"data\": {\n \"attributes\": {\"name\": name, \"entry_type\": entry_type}\n }\n }\n if parent_entry_id:\n body[\"data\"][\"relationships\"] = {\n \"parent_entry\": {\"data\": {\"type\": \"entry\", \"id\": parent_entry_id}}\n }\n result = 
http_json(url, headers=headers, json_body=body)\n if \"data\" not in result or not result[\"data\"].get(\"id\"):\n raise RuntimeError(f\"创建知识节点失败: {result}\")\n return result[\"data\"]\n\n\ndef upload_media(\n *,\n local_file: str,\n space_id: str,\n parent_entry_id: str | None,\n name: str | None = None,\n media_type: str = \"video\",\n) -> dict:\n \"\"\"端到端上传入口。\"\"\"\n if not os.path.exists(local_file):\n raise FileNotFoundError(f\"文件不存在: {local_file}\")\n\n cfg = load_config()\n filename = os.path.basename(local_file) # 带扩展名\n if not name:\n name = filename # 默认用文件名(带后缀)\n elif \".\" not in name:\n # 确保 name 有文件后缀(API 强制要求)\n ext = os.path.splitext(filename)[1]\n name = name + ext\n\n print(f\"文件: {local_file}\")\n print(f\" 上传名: {filename}\")\n print(f\" 条目名: {name}\")\n print(f\" media_type: {media_type}\")\n print(f\" space: {space_id}\")\n print(f\" parent: {parent_entry_id or '(space 根目录)'}\")\n print()\n\n print(\"[0/3] 获取 access_token ...\")\n access_token = get_access_token(cfg)\n print(f\" ✓ token: {access_token[:30]}...\")\n\n print(\"[1/3] kb/files/upload-params (获取 VOD 上传签名) ...\")\n params = apply_upload_params(\n api_base=cfg[\"api_base\"],\n access_token=access_token,\n staff_id=cfg[\"staff_id\"],\n filename=filename,\n media_type=media_type,\n )\n state = params[\"object\"][\"state\"]\n print(f\" ✓ state: {state}\")\n\n print(\"[2/3] PUT 文件到腾讯云 COS ...\")\n put_to_cos(local_file=local_file, params=params)\n\n entry_type = media_type # video/audio/file\n print(f\"[3/3] 创建 kb/entry (entry_type={entry_type}) ...\")\n entry = create_kb_entry(\n api_base=cfg[\"api_base\"],\n access_token=access_token,\n staff_id=cfg[\"staff_id\"],\n space_id=space_id,\n state=state,\n name=name,\n entry_type=entry_type,\n parent_entry_id=parent_entry_id,\n )\n entry_id = entry[\"id\"]\n actual_type = entry.get(\"attributes\", {}).get(\"entry_type\", \"?\")\n print(f\" ✓ entry_id: {entry_id}\")\n print(f\" ✓ entry_type: {actual_type}\")\n\n return {\n \"entry_id\": 
entry_id,\n \"entry_type\": actual_type,\n \"name\": entry.get(\"attributes\", {}).get(\"name\", name),\n }\n\n\ndef main() -> None:\n parser = argparse.ArgumentParser(\n description=\"通过乐享 OpenAPI 上传视频/音频到知识库(产生 entry_type=video 条目,可播放)\",\n formatter_class=argparse.RawDescriptionHelpFormatter,\n )\n parser.add_argument(\"local_file\", help=\"本地文件路径\")\n parser.add_argument(\"--space-id\", required=True, help=\"目标知识库 space_id\")\n parser.add_argument(\n \"--parent-entry-id\",\n default=None,\n help=\"父节点 entry_id;不填则挂到 space 根目录\",\n )\n parser.add_argument(\n \"--name\",\n default=None,\n help=\"条目名称(须带文件后缀;缺省用本地文件名)\",\n )\n parser.add_argument(\n \"--media-type\",\n default=\"video\",\n choices=[\"video\", \"audio\", \"file\"],\n help=\"媒体类型,决定 kb_entry 的 entry_type(默认 video)\",\n )\n parser.add_argument(\n \"--json-output\", action=\"store_true\", help=\"以 JSON 输出结果\"\n )\n args = parser.parse_args()\n\n try:\n result = upload_media(\n local_file=args.local_file,\n space_id=args.space_id,\n parent_entry_id=args.parent_entry_id,\n name=args.name,\n media_type=args.media_type,\n )\n except Exception as exc:\n sys.stderr.write(f\"\\n❌ 上传失败: {exc}\\n\")\n sys.exit(1)\n\n print()\n if args.json_output:\n print(json.dumps(result, ensure_ascii=False, indent=2))\n else:\n print(\"✅ 上传完成!\")\n print(f\" entry_id: {result['entry_id']}\")\n print(f\" entry_type: {result['entry_type']}\")\n print(f\" 链接: https://lexiangla.com/pages/{result['entry_id']}\")\n print()\n print(\" 说明:视频已上传且挂到目标目录。乐享会做 VOD 转码,\")\n print(\" 通常几秒到几分钟内可在 Web 端播放。\")\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":11701,"content_sha256":"2d97e753cae23294bade96c9417b30a9260d3915906fb67950fd61f23463bc1d"},{"filename":"scripts/yt_download_transcribe.py","content":"#!/usr/bin/env python3\n\"\"\"\nYouTube 视频下载 + 音频转录 + 中英对照翻译\n\n功能:\n1. 使用 yt-dlp 下载 YouTube 视频(最佳质量)\n2. 使用 OpenAI Whisper 将音频转录为文字\n3. 
如果是英文,使用 AI 翻译成中文(一段英文原文 + 一段中文翻译的对照格式)\n4. 输出 Markdown 格式的文字稿\n\n使用方式:\n python3 yt_download_transcribe.py \u003cYouTube URL> --output-dir \u003c输出目录>\n\n依赖:\n pip3 install yt-dlp openai-whisper openai\n brew install ffmpeg\n\"\"\"\nfrom __future__ import annotations\n\nimport argparse\nimport json\nimport os\nimport re\nimport subprocess\nimport sys\nimport time\nfrom pathlib import Path\n\n\ndef check_dependencies():\n \"\"\"检查必要的依赖是否已安装\"\"\"\n missing = []\n\n # 检查 yt-dlp\n try:\n subprocess.run([\"yt-dlp\", \"--version\"],\n capture_output=True, check=True)\n except (subprocess.CalledProcessError, FileNotFoundError):\n missing.append(\"yt-dlp (brew install yt-dlp)\")\n\n # 检查 whisper\n try:\n import whisper # noqa: F401\n except ImportError:\n missing.append(\"openai-whisper (pip3 install openai-whisper)\")\n\n # 检查 ffmpeg\n try:\n subprocess.run([\"ffmpeg\", \"-version\"], capture_output=True, check=True)\n except (subprocess.CalledProcessError, FileNotFoundError):\n missing.append(\"ffmpeg (brew install ffmpeg)\")\n\n if missing:\n print(\"❌ 缺少以下依赖:\")\n for dep in missing:\n print(f\" - {dep}\")\n sys.exit(1)\n\n\ndef get_video_info(url: str, cookies_from_browser: str | None = \"chrome\") -> dict:\n \"\"\"获取视频元信息(标题、时长、频道等)\"\"\"\n print(\"📋 获取视频信息...\")\n cmd = [\"yt-dlp\", \"--dump-json\", \"--no-download\"]\n if cookies_from_browser:\n cmd.extend([\"--cookies-from-browser\", cookies_from_browser])\n cmd.append(url)\n result = subprocess.run(cmd, capture_output=True, text=True)\n if result.returncode != 0:\n # 如果带 cookies 失败,尝试不带 cookies\n if cookies_from_browser:\n print(\" ⚠️ 使用浏览器 cookies 失败,尝试不带 cookies...\")\n cmd_no_cookies = [\"yt-dlp\", \"--dump-json\", \"--no-download\", url]\n result = subprocess.run(cmd_no_cookies, capture_output=True, text=True)\n if result.returncode != 0:\n print(f\"❌ 获取视频信息失败: {result.stderr}\")\n sys.exit(1)\n\n # 解析 JSON(跳过 stderr 警告行,只取最后一行 JSON)\n stdout_lines = result.stdout.strip().split('\\n')\n json_line = 
stdout_lines[-1] if stdout_lines else \"\"\n info = json.loads(json_line)\n return {\n \"title\": info.get(\"title\", \"Unknown\"),\n \"channel\": info.get(\"channel\", info.get(\"uploader\", \"Unknown\")),\n \"upload_date\": info.get(\"upload_date\", \"\"),\n \"duration\": info.get(\"duration\", 0),\n \"description\": info.get(\"description\", \"\"),\n \"url\": url,\n \"webpage_url\": info.get(\"webpage_url\", url),\n \"thumbnail\": info.get(\"thumbnail\", \"\"),\n \"view_count\": info.get(\"view_count\", 0),\n \"like_count\": info.get(\"like_count\", 0),\n }\n\n\ndef sanitize_filename(name: str) -> str:\n \"\"\"清理文件名中的非法字符\"\"\"\n # 替换文件名不合法字符\n name = re.sub(r'[/\\\\:*?\"\u003c>|]', '-', name)\n # 去除首尾空格和点\n name = name.strip(' .')\n # 限制长度\n if len(name) > 200:\n name = name[:200]\n return name\n\n\ndef download_video(url: str, output_dir: str, title: str,\n cookies_from_browser: str | None = \"chrome\") -> str:\n \"\"\"下载 YouTube 视频,返回视频文件路径\"\"\"\n safe_title = sanitize_filename(title)\n output_template = os.path.join(output_dir, f\"{safe_title}.%(ext)s\")\n\n print(f\"⬇️ 下载视频: {title}\")\n print(f\" 输出目录: {output_dir}\")\n\n # 优先使用 HLS(m3u8) 格式避免 YouTube DASH 403 错误\n # 95-1: 720p HLS, 94-1: 480p HLS, 93-1: 360p HLS\n # 回退到传统 DASH 格式\n cmd = [\n \"yt-dlp\",\n \"-f\", \"95-1/94-1/93-1/bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best\",\n \"--merge-output-format\", \"mp4\",\n \"-o\", output_template,\n \"--no-playlist\",\n \"--progress\",\n \"--newline\",\n ]\n if cookies_from_browser:\n cmd.extend([\"--cookies-from-browser\", cookies_from_browser])\n cmd.append(url)\n\n result = subprocess.run(\n cmd,\n capture_output=False, # 显示下载进度\n text=True\n )\n\n if result.returncode != 0:\n print(\"❌ 视频下载失败\")\n sys.exit(1)\n\n # 找到下载的视频文件\n video_file = os.path.join(output_dir, f\"{safe_title}.mp4\")\n if not os.path.exists(video_file):\n # 尝试查找其他可能的文件名\n for f in os.listdir(output_dir):\n if f.endswith(('.mp4', '.mkv', '.webm')) and safe_title[:20] in f:\n 
video_file = os.path.join(output_dir, f)\n break\n\n if not os.path.exists(video_file):\n print(\"❌ 找不到下载的视频文件\")\n sys.exit(1)\n\n file_size = os.path.getsize(video_file) / (1024 * 1024)\n print(f\" ✅ 下载完成: {os.path.basename(video_file)} ({file_size:.1f} MB)\")\n return video_file\n\n\ndef extract_audio(video_path: str) -> str:\n \"\"\"从视频中提取音频(WAV 格式,Whisper 推荐)\"\"\"\n audio_path = os.path.splitext(video_path)[0] + \".wav\"\n\n if os.path.exists(audio_path):\n print(f\" ⏭️ 音频文件已存在,跳过提取\")\n return audio_path\n\n print(\"🎵 提取音频...\")\n result = subprocess.run(\n [\n \"ffmpeg\", \"-i\", video_path,\n \"-vn\", # 不要视频\n \"-acodec\", \"pcm_s16le\", # WAV 格式\n \"-ar\", \"16000\", # 16kHz 采样率(Whisper 推荐)\n \"-ac\", \"1\", # 单声道\n \"-y\", # 覆盖已有文件\n audio_path\n ],\n capture_output=True, text=True\n )\n\n if result.returncode != 0:\n print(f\"❌ 音频提取失败: {result.stderr}\")\n sys.exit(1)\n\n print(f\" ✅ 音频提取完成: {os.path.basename(audio_path)}\")\n return audio_path\n\n\ndef transcribe_audio(audio_path: str, model_name: str = \"base\") -> dict:\n \"\"\"使用 Whisper 转录音频\"\"\"\n import whisper\n\n print(f\"🎙️ 使用 Whisper ({model_name}) 转录音频...\")\n print(f\" 这可能需要几分钟,请耐心等待...\")\n\n start_time = time.time()\n\n model = whisper.load_model(model_name)\n result = model.transcribe(\n audio_path,\n verbose=True, # 显示转录进度\n language=None, # 自动检测语言\n task=\"transcribe\",\n )\n\n elapsed = time.time() - start_time\n print(f\" ✅ 转录完成(耗时 {elapsed:.1f}s)\")\n print(f\" 检测到语言: {result.get('language', 'unknown')}\")\n\n return result\n\n\ndef format_timestamp(seconds: float) -> str:\n \"\"\"将秒数转为 HH:MM:SS 格式\"\"\"\n hours = int(seconds // 3600)\n minutes = int((seconds % 3600) // 60)\n secs = int(seconds % 60)\n if hours > 0:\n return f\"{hours:02d}:{minutes:02d}:{secs:02d}\"\n return f\"{minutes:02d}:{secs:02d}\"\n\n\ndef merge_segments_to_paragraphs(segments: list, max_gap: float = 2.0,\n max_duration: float = 60.0) -> list:\n \"\"\"\n 将 Whisper 的细粒度 segments 合并为段落。\n 合并规则:\n - 如果两个相邻 
segment 之间的间隔 \u003c max_gap 秒,合并\n - 如果合并后的段落时长超过 max_duration 秒,强制断开\n - 遇到句号/问号/感叹号结尾时,倾向断开\n \"\"\"\n if not segments:\n return []\n\n paragraphs = []\n current = {\n \"start\": segments[0][\"start\"],\n \"end\": segments[0][\"end\"],\n \"text\": segments[0][\"text\"].strip(),\n }\n\n for seg in segments[1:]:\n gap = seg[\"start\"] - current[\"end\"]\n duration = seg[\"end\"] - current[\"start\"]\n text = seg[\"text\"].strip()\n\n # 判断是否需要断开\n ends_with_punct = current[\"text\"].rstrip().endswith(('.', '?', '!', '。', '?', '!'))\n should_break = (\n gap > max_gap or\n duration > max_duration or\n (ends_with_punct and gap > 0.5)\n )\n\n if should_break:\n paragraphs.append(current)\n current = {\n \"start\": seg[\"start\"],\n \"end\": seg[\"end\"],\n \"text\": text,\n }\n else:\n current[\"end\"] = seg[\"end\"]\n current[\"text\"] += \" \" + text\n\n paragraphs.append(current)\n return paragraphs\n\n\ndef translate_paragraphs(paragraphs: list, source_lang: str) -> list:\n \"\"\"\n 使用 OpenAI API 将段落翻译为中文。\n 分批翻译以避免 token 限制。\n 返回翻译后的段落列表(与输入对应)。\n \"\"\"\n if source_lang == \"zh\" or source_lang == \"Chinese\":\n print(\" ℹ️ 源语言为中文,跳过翻译\")\n return [p[\"text\"] for p in paragraphs]\n\n print(f\"🌐 翻译段落({source_lang} → 中文)...\")\n\n # 检查 OpenAI API Key\n api_key = os.environ.get(\"OPENAI_API_KEY\")\n if not api_key:\n print(\" ⚠️ 未设置 OPENAI_API_KEY,跳过翻译\")\n return [None] * len(paragraphs)\n\n try:\n from openai import OpenAI\n client = OpenAI(api_key=api_key)\n except ImportError:\n print(\" ⚠️ 未安装 openai 库,跳过翻译\")\n return [None] * len(paragraphs)\n\n translations = []\n batch_size = 10 # 每批翻译 10 个段落\n\n for i in range(0, len(paragraphs), batch_size):\n batch = paragraphs[i:i + batch_size]\n batch_texts = []\n for j, p in enumerate(batch):\n batch_texts.append(f\"[{j + 1}] {p['text']}\")\n\n prompt = (\n \"将以下编号的段落翻译为中文。保持编号格式,每段翻译独立一行。\\n\"\n \"翻译要求:自然流畅的中文表达,专业术语保留英文并附中文注释。\\n\\n\"\n + \"\\n\".join(batch_texts)\n )\n\n try:\n response = 
client.chat.completions.create(\n model=\"gpt-4o-mini\",\n messages=[\n {\"role\": \"system\", \"content\": \"你是一个专业的翻译员,擅长将英文技术内容翻译为准确、自然的中文。\"},\n {\"role\": \"user\", \"content\": prompt}\n ],\n temperature=0.3,\n )\n reply = response.choices[0].message.content.strip()\n\n # 解析翻译结果\n batch_translations = []\n lines = reply.split('\\n')\n current_trans = \"\"\n for line in lines:\n # 匹配 [N] 开头的行\n match = re.match(r'^\\[(\\d+)\\]\\s*(.+)', line.strip())\n if match:\n if current_trans:\n batch_translations.append(current_trans.strip())\n current_trans = match.group(2)\n elif line.strip():\n current_trans += \" \" + line.strip()\n if current_trans:\n batch_translations.append(current_trans.strip())\n\n # 确保数量匹配\n while len(batch_translations) \u003c len(batch):\n batch_translations.append(None)\n\n translations.extend(batch_translations[:len(batch)])\n\n except Exception as e:\n print(f\" ⚠️ 翻译批次 {i // batch_size + 1} 失败: {e}\")\n translations.extend([None] * len(batch))\n\n # 显示进度\n done = min(i + batch_size, len(paragraphs))\n print(f\" 翻译进度: {done}/{len(paragraphs)} 段\")\n\n return translations\n\n\ndef generate_markdown(video_info: dict, paragraphs: list,\n translations: list | None,\n detected_language: str) -> str:\n \"\"\"\n 生成 Markdown 格式的文字稿。\n 如果有翻译,采用一段英文原文 + 一段中文翻译的对照格式。\n \"\"\"\n lines = []\n\n # 标题\n title = video_info[\"title\"]\n lines.append(f\"# {title}\")\n lines.append(\"\")\n\n # 元信息\n lines.append(f\"**频道**: {video_info['channel']}\")\n if video_info.get(\"upload_date\"):\n date_str = video_info[\"upload_date\"]\n if len(date_str) == 8:\n date_str = f\"{date_str[:4]}-{date_str[4:6]}-{date_str[6:8]}\"\n lines.append(f\"**发布日期**: {date_str}\")\n if video_info.get(\"duration\"):\n lines.append(f\"**时长**: {format_timestamp(video_info['duration'])}\")\n lines.append(f\"**原始链接**: {video_info['webpage_url']}\")\n lines.append(f\"**转录语言**: {detected_language}\")\n lines.append(\"\")\n lines.append(\"---\")\n lines.append(\"\")\n\n # 正文\n is_bilingual 
= translations and any(t is not None for t in translations)\n\n if is_bilingual:\n lines.append(\"## 文字稿(中英对照)\")\n lines.append(\"\")\n lines.append(\"> 以下内容采用「英文原文 + 中文翻译」对照排列。\")\n lines.append(\"\")\n\n for i, para in enumerate(paragraphs):\n timestamp = format_timestamp(para[\"start\"])\n # 英文原文\n lines.append(f\"**[{timestamp}]**\")\n lines.append(\"\")\n lines.append(para[\"text\"])\n lines.append(\"\")\n # 中文翻译\n if i \u003c len(translations) and translations[i]:\n lines.append(f\"🇨🇳 {translations[i]}\")\n lines.append(\"\")\n lines.append(\"---\")\n lines.append(\"\")\n else:\n lines.append(\"## 文字稿\")\n lines.append(\"\")\n\n for para in paragraphs:\n timestamp = format_timestamp(para[\"start\"])\n lines.append(f\"**[{timestamp}]** {para['text']}\")\n lines.append(\"\")\n\n return \"\\n\".join(lines)\n\n\ndef save_meta_json(video_info: dict, output_path: str,\n detected_language: str, paragraph_count: int,\n video_file: str | None = None):\n \"\"\"保存元信息 JSON\"\"\"\n meta = {\n \"url\": video_info[\"webpage_url\"],\n \"title\": video_info[\"title\"],\n \"channel\": video_info[\"channel\"],\n \"upload_date\": video_info.get(\"upload_date\", \"\"),\n \"duration\": video_info.get(\"duration\", 0),\n \"description\": video_info.get(\"description\", \"\")[:500],\n \"thumbnail\": video_info.get(\"thumbnail\", \"\"),\n \"view_count\": video_info.get(\"view_count\", 0),\n \"like_count\": video_info.get(\"like_count\", 0),\n \"detected_language\": detected_language,\n \"paragraph_count\": paragraph_count,\n \"video_file\": video_file,\n \"fetched_at\": time.strftime(\"%Y-%m-%dT%H:%M:%S\"),\n }\n\n with open(output_path, \"w\", encoding=\"utf-8\") as f:\n json.dump(meta, f, ensure_ascii=False, indent=2)\n\n\ndef main():\n parser = argparse.ArgumentParser(\n description=\"下载 YouTube 视频 + 转录 + 翻译\"\n )\n parser.add_argument(\"url\", help=\"YouTube 视频 URL\")\n parser.add_argument(\n \"--output-dir\", \"-o\", default=\".\",\n help=\"输出目录(默认当前目录)\"\n )\n 
parser.add_argument(\n \"--whisper-model\", \"-m\", default=\"base\",\n choices=[\"tiny\", \"base\", \"small\", \"medium\", \"large\"],\n help=\"Whisper 模型大小(默认 base,越大越准但越慢)\"\n )\n parser.add_argument(\n \"--skip-download\", action=\"store_true\",\n help=\"跳过视频下载(使用已下载的视频)\"\n )\n parser.add_argument(\n \"--skip-translate\", action=\"store_true\",\n help=\"跳过翻译步骤\"\n )\n parser.add_argument(\n \"--keep-audio\", action=\"store_true\",\n help=\"保留提取的音频文件(默认删除)\"\n )\n parser.add_argument(\n \"--cookies-from-browser\", default=\"chrome\",\n help=\"从哪个浏览器获取 cookies(默认 chrome,设为空字符串禁用)\"\n )\n\n args = parser.parse_args()\n\n # 处理 cookies 参数\n cookies_browser = args.cookies_from_browser if args.cookies_from_browser else None\n\n # 检查依赖\n check_dependencies()\n\n # 确保输出目录存在\n os.makedirs(args.output_dir, exist_ok=True)\n\n # Step 1: 获取视频信息\n video_info = get_video_info(args.url, cookies_browser)\n safe_title = sanitize_filename(video_info[\"title\"])\n print(f\" 标题: {video_info['title']}\")\n print(f\" 频道: {video_info['channel']}\")\n print(f\" 时长: {format_timestamp(video_info.get('duration', 0))}\")\n print()\n\n # Step 2: 下载视频\n if args.skip_download:\n # 查找已有的视频文件\n video_file = None\n for f in os.listdir(args.output_dir):\n if f.endswith(('.mp4', '.mkv', '.webm')) and safe_title[:20] in f:\n video_file = os.path.join(args.output_dir, f)\n break\n if not video_file:\n print(\"❌ 未找到已下载的视频文件\")\n sys.exit(1)\n print(f\"⏭️ 使用已下载的视频: {os.path.basename(video_file)}\")\n else:\n video_file = download_video(args.url, args.output_dir, video_info[\"title\"], cookies_browser)\n\n print()\n\n # Step 3: 提取音频\n audio_file = extract_audio(video_file)\n print()\n\n # Step 4: 转录\n result = transcribe_audio(audio_file, args.whisper_model)\n detected_language = result.get(\"language\", \"unknown\")\n segments = result.get(\"segments\", [])\n print()\n\n # Step 5: 合并为段落\n paragraphs = merge_segments_to_paragraphs(segments)\n print(f\"📝 合并为 {len(paragraphs)} 个段落\")\n print()\n\n # Step 
6: 翻译(如果不是中文)\n translations = None\n if not args.skip_translate and detected_language not in (\"zh\", \"Chinese\"):\n translations = translate_paragraphs(paragraphs, detected_language)\n print()\n\n # Step 7: 生成 Markdown\n md_content = generate_markdown(\n video_info, paragraphs, translations, detected_language\n )\n md_path = os.path.join(args.output_dir, f\"{safe_title}.md\")\n with open(md_path, \"w\", encoding=\"utf-8\") as f:\n f.write(md_content)\n print(f\"📄 文字稿已保存: {md_path}\")\n\n # Step 8: 保存元信息\n meta_path = os.path.join(args.output_dir, f\"{safe_title}_meta.json\")\n save_meta_json(\n video_info, meta_path, detected_language,\n len(paragraphs), video_file\n )\n print(f\"📋 元信息已保存: {meta_path}\")\n\n # 清理音频文件\n if not args.keep_audio and os.path.exists(audio_file):\n os.remove(audio_file)\n print(f\"🗑️ 已清理音频文件: {os.path.basename(audio_file)}\")\n\n print()\n print(\"=\" * 60)\n print(f\"✅ 处理完成!\")\n print(f\" 视频文件: {video_file}\")\n print(f\" 文字稿: {md_path}\")\n print(f\" 元信息: {meta_path}\")\n print(f\" 段落数: {len(paragraphs)}\")\n print(f\" 语言: {detected_language}\")\n if translations and any(t is not None for t in translations):\n print(f\" 翻译: ✅ 已翻译为中文(中英对照)\")\n print(\"=\" * 60)\n\n # 返回关键路径供后续脚本使用\n return {\n \"video_file\": video_file,\n \"md_path\": md_path,\n \"meta_path\": meta_path,\n \"title\": video_info[\"title\"],\n \"safe_title\": safe_title,\n }\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":19468,"content_sha256":"3c8a4bda4085d92b0fd3b14c43fc0da1627b65bfb94e5df9c649e20250e919a5"}],"content_json":{"type":"doc","content":[{"type":"heading","attrs":{"level":1},"content":[{"text":"抓取链接内容 & 转存知识库","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"🎬 视频/音频上传到乐享","type":"text","marks":[{"type":"strong"}]},{"text":":必须用 
","type":"text"},{"text":"scripts/upload_video_via_openapi.py","type":"text","marks":[{"type":"code_inline"}]},{"text":"(走 OpenAPI ","type":"text"},{"text":"/cgi-bin/v1/kb/files/upload-params","type":"text","marks":[{"type":"code_inline"}]},{"text":")。","type":"text"},{"text":"不要","type":"text","marks":[{"type":"strong"}]},{"text":"用 MCP 的 ","type":"text"},{"text":"file_apply_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":" 或 ","type":"text"},{"text":"docs/cos-param","type":"text","marks":[{"type":"code_inline"}]},{"text":"——它们产出 ","type":"text"},{"text":"entry_type=file","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的条目,不触发 VOD 转码,视频无法播放。详见下方「YouTube 视频处理 → Step 2:上传到乐享知识库」章节。凭证存放于 ","type":"text"},{"text":"~/.lexiang/openapi.json","type":"text","marks":[{"type":"code_inline"}]},{"text":"(不进 git)。","type":"text"}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"概述","type":"text"}]},{"type":"paragraph","content":[{"text":"将文章 URL(免费/付费/登录墙)抓取为结构化 Markdown,并自动转存到乐享知识库,实现素材归档和可追溯。","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"最终产出物","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003c项目子目录>/\u003c原文标题>.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 完整文章 Markdown(含图片引用)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003c项目子目录>/\u003c原文标题>_meta.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 结构化元信息(原文链接、作者、发布时间、抓取时间等)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003c项目子目录>/images/","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 所有文章配图","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"乐享知识库中的文档副本(按天维度归档)","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"乐享文档链接格式(⚠️ 
必须遵守)","type":"text"}]},{"type":"paragraph","content":[{"text":"转存完成后,","type":"text"},{"text":"必须","type":"text","marks":[{"type":"strong"}]},{"text":"按以下格式输出可点击访问的链接:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"https://lexiangla.com/pages/{entry_id}?company_from=e6c565d6d16811efac17768586f8a025","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"entry_id","type":"text","marks":[{"type":"code_inline"}]},{"text":":","type":"text"},{"text":"import_content","type":"text","marks":[{"type":"code_inline"}]},{"text":" 或 ","type":"text"},{"text":"entry_create_entry","type":"text","marks":[{"type":"code_inline"}]},{"text":" 返回的 ","type":"text"},{"text":"entry.id","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"company_from","type":"text","marks":[{"type":"code_inline"}]},{"text":":固定值 ","type":"text"},{"text":"e6c565d6d16811efac17768586f8a025","type":"text","marks":[{"type":"code_inline"}]},{"text":"(凡哥的企业 ID,不可省略,省略后链接无法访问)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"禁止","type":"text","marks":[{"type":"strong"}]},{"text":"使用 ","type":"text"},{"text":"mcp.lexiang-app.com/pages/...","type":"text","marks":[{"type":"code_inline"}]},{"text":" 格式——这是 MCP 内部调试链接,用户无法直接访问","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"文件命名规则(重要)","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"必须使用原文标题命名","type":"text","marks":[{"type":"strong"}]},{"text":",不要用 ","type":"text"},{"text":"article.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
等通用名称","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"文件名格式:","type":"text"},{"text":"\u003c原文标题>.md","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"\u003c原文标题>_meta.json","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"示例:","type":"text"},{"text":"How Notion uses Custom Agents.md","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"How Notion uses Custom Agents_meta.json","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果标题中包含文件名不合法字符(","type":"text"},{"text":"/","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"\\","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":":","type":"text","marks":[{"type":"code_inline"}]},{"text":"等),替换为 ","type":"text"},{"text":"-","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"乐享知识库转存时也使用原文标题作为文档标题","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"工作流程","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Step 1:素材收集","type":"text"}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"抓取方式决策树","type":"text"}]},{"type":"paragraph","content":[{"text":"根据 URL 类型选择抓取方式(按优先级排列):","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"claude.com / anthropic.com 
博客","type":"text","marks":[{"type":"strong"}]},{"text":"(","type":"text"},{"text":"claude.com/blog/*","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"anthropic.com/research/*","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"anthropic.com/news/*","type":"text","marks":[{"type":"code_inline"}]},{"text":")→ 直接用 ","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":"(已内置 Webflow SPA 支持,自动检测 ","type":"text"},{"text":".u-rich-text-blog","type":"text","marks":[{"type":"code_inline"}]},{"text":" / ","type":"text"},{"text":".w-richtext","type":"text","marks":[{"type":"code_inline"}]},{"text":" 容器并移除内嵌 ","type":"text"},{"text":"\u003cstyle>","type":"text","marks":[{"type":"code_inline"}]},{"text":" 标签)。用法与其他站点一致:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python3 scripts/fetch_article.py fetch \"\u003cURL>\" --output-dir \u003c项目子目录>","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"微信公众号文章","type":"text","marks":[{"type":"strong"}]},{"text":"(","type":"text"},{"text":"mp.weixin.qq.com","type":"text","marks":[{"type":"code_inline"}]},{"text":")→ ","type":"text"},{"text":"优先使用乐享 MCP ","type":"text","marks":[{"type":"strong"}]},{"text":"file_create_hyperlink","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":"(一步到位,后端自动抓取图文+OCR),详见下方「微信公众号文章处理」章节。降级方案:","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"YouTube 视频","type":"text","marks":[{"type":"strong"}]},{"text":" → 使用 ","type":"text"},{"text":"yt_download_transcribe.py","type":"text","marks":[{"type":"code_inline"}]},{"text":"(yt-dlp 下载 + Whisper 转录 + AI 翻译),详见下方「YouTube 
视频处理」章节","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"播客音频","type":"text","marks":[{"type":"strong"}]},{"text":"(小宇宙 ","type":"text"},{"text":"xiaoyuzhoufm.com","type":"text","marks":[{"type":"code_inline"}]},{"text":"、Apple Podcasts 等)→ yt-dlp 下载音频 + Whisper 转录,详见下方「播客音频处理」章节","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"PDF 文件或 PDF 直链","type":"text","marks":[{"type":"strong"}]},{"text":"(如 arXiv PDF、乐享知识库中已存储的 PDF、本地 PDF 路径)→ 详见下方「PDF 处理」章节。","type":"text"},{"text":"不要","type":"text","marks":[{"type":"strong"}]},{"text":"用 ","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":" 或 ","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 处理 PDF","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"微博","type":"text","marks":[{"type":"strong"}]},{"text":"(","type":"text"},{"text":"weibo.com","type":"text","marks":[{"type":"code_inline"}]},{"text":")→ ","type":"text"},{"text":"必须用 ","type":"text","marks":[{"type":"strong"}]},{"text":"fetch_article.py --cdp","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":"(微博强制登录,WebFetch/Playwright 均被拦截),详见下方「微博帖子抓取」章节","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"付费/登录墙文章","type":"text","marks":[{"type":"strong"}]},{"text":" → 用 ","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":"(Cookie 注入或 CDP 模式)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"免费图文文章","type":"text","marks":[{"type":"strong"}]},{"text":"(正文含图片/截图/图表)→ ","type":"text"},{"text":"必须","type":"text","marks":[{"type":"strong"}]},{"text":"用 
","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":"(","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":" 只能返回文本,无法提取和下载页面中的图片)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"免费纯文字文章","type":"text","marks":[{"type":"strong"}]},{"text":"(正文无配图)→ 可用 ","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":",内容不完整时切换 ","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"SPA 动态渲染网站","type":"text","marks":[{"type":"strong"}]},{"text":"(","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 抓取正文为空或极少)→ ","type":"text"},{"text":"Playwright 直接生成 PDF","type":"text","marks":[{"type":"strong"}]},{"text":",详见下方「SPA 网站 Playwright 直接出 PDF」章节","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"批量抓取帮助中心/文档站","type":"text","marks":[{"type":"strong"}]},{"text":"(如 readme.io、GitBook、Guru 等)→ Playwright 直接生成 PDF,详见下方「SPA 网站 Playwright 直接出 PDF」章节","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"文字观点","type":"text","marks":[{"type":"strong"}]},{"text":" → 直接整理","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"图片素材","type":"text","marks":[{"type":"strong"}]},{"text":" → 分析图片内容","type":"text"}]}]}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"⚠️ 关键原则","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":" 工具","type":"text"},{"text":"只能返回文本内容,无法提取和下载页面中的图片","type":"text","marks":[{"type":"strong"}]},{"text":"。任何包含图片、截图、图表的文章,都","type":"text"},{"text":"必须","type":"text","marks":[{"type":"strong"}]},{"text":"使用 
","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 抓取,否则图片信息会完全丢失。当不确定文章是否含图时,","type":"text"},{"text":"默认用 ","type":"text","marks":[{"type":"strong"}]},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":"。","type":"text"}]},{"type":"paragraph","content":[{"text":"⚠️ SPA 降级原则","type":"text","marks":[{"type":"strong"}]},{"text":":如果 ","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 抓取后正文内容极少(\u003c 200 字符),说明该网站是 SPA 动态渲染,通用内容提取器无法工作。此时应切换到 ","type":"text"},{"text":"Playwright 直接生成 PDF","type":"text","marks":[{"type":"strong"}]},{"text":" 方案。","type":"text"}]},{"type":"paragraph","content":[{"text":"⚠️ 图片处理必须贯穿全流程","type":"text","marks":[{"type":"strong"}]},{"text":":抓取阶段产出的 ","type":"text"},{"text":"images/","type":"text","marks":[{"type":"code_inline"}]},{"text":" 目录中的图片,在上传到乐享时","type":"text"},{"text":"不可遗漏","type":"text","marks":[{"type":"strong"}]},{"text":"。无论走哪条路径(","type":"text"},{"text":"md_to_page.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" / MCP connector 分块导入 / ","type":"text"},{"text":"entry_import_content","type":"text","marks":[{"type":"code_inline"}]},{"text":"),只要本地有图片文件,就必须上传到乐享文档中。具体图片上传流程见下方「步骤 4」的降级方案 A。","type":"text"}]}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"付费/登录墙文章获取","type":"text"}]},{"type":"paragraph","content":[{"text":"适用于","type":"text"},{"text":"所有需要登录态才能查看全文的网站","type":"text","marks":[{"type":"strong"}]},{"text":"(Substack 付费订阅、Medium 会员、知识星球、财新网、The Information 等),使用 ","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 脚本:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Cookie 注入模式(默认,适用于大部分站点)\npython scripts/fetch_article.py fetch \u003cURL> --output-dir \u003c项目子目录>\n\n# CDP 模式(适用于 Cloudflare 保护站点、需要 Google 账号登录的站点)\npython 
scripts/fetch_article.py fetch \u003cURL> --output-dir \u003c项目子目录> --cdp","type":"text"}]},{"type":"paragraph","content":[{"text":"两种浏览器模式","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"模式","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"参数","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"原理","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"适用场景","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Cookie 注入","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"(默认)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"从 Chrome Cookie DB 提取 cookies → 注入 Playwright 浏览器","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Medium 
等大部分站点","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"CDP","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--cdp","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"通过 Chrome DevTools Protocol 连接用户真实 Chrome(port 9222),复用完整登录态","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Substack(自动启用)","type":"text","marks":[{"type":"strong"}]},{"text":"、OpenAI、Cloudflare 保护站点、LinkedIn、Google 系网站等","type":"text"}]}]}]}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"自动升级到 CDP 模式的场景","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Substack 站点","type":"text","marks":[{"type":"strong"}]},{"text":"(所有 ","type":"text"},{"text":"*.substack.com","type":"text","marks":[{"type":"code_inline"}]},{"text":" 及已知自定义域名):自动使用 CDP 模式,并在抓取前","type":"text"},{"text":"校验登录态","type":"text","marks":[{"type":"strong"}]},{"text":"。未登录时会暂停提示用户在 Chrome 中登录,验证通过后才继续抓取。","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Cloudflare 保护站点","type":"text","marks":[{"type":"strong"}]},{"text":"(如 openai.com):自动切换 CDP 模式,等待 JS challenge 通过。","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"手动指定 ","type":"text"},{"text":"--cdp","type":"text","marks":[{"type":"code_inline"}]},{"text":" 参数。","type":"text"}]}]}]}]},{"type":"paragraph","content":[{"text":"CDP 
模式前置条件","type":"text","marks":[{"type":"strong"}]},{"text":":确保 Chrome 浏览器已开启 CDP 远程调试端口:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# 方式1(推荐):直接用带 CDP 的方式启动 Chrome\n/Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome --remote-debugging-port=9222 &\n\n# 方式2:如果 Chrome 已在运行,需要先关闭再以 CDP 模式重启\n# 脚本会自动尝试此操作,但可能需要用户手动确认","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"⚠️ CDP 独立 profile 的已知限制","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"paragraph","content":[{"text":"脚本会使用独立的 CDP profile 目录(","type":"text"},{"text":"~/.fetch_article/chrome_cdp_profile","type":"text","marks":[{"type":"code_inline"}]},{"text":"),虽然会自动复制 Cookies 文件,但","type":"text"},{"text":"以下登录态信息不会被同步","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"localStorage","type":"text","marks":[{"type":"code_inline"}]},{"text":"(Substack 等 SPA 站点的会话 token)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Service Worker","type":"text","marks":[{"type":"code_inline"}]},{"text":" 缓存","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"sessionStorage","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"paragraph","content":[{"text":"实际影响","type":"text","marks":[{"type":"strong"}]},{"text":":对于 Substack 等依赖 localStorage 的站点,仅靠 Cookies 复制可能无法完全还原登录态。脚本已通过 ","type":"text"},{"text":"Substack 登录态缓存","type":"text","marks":[{"type":"strong"}]},{"text":"机制(","type":"text"},{"text":"~/.substack/storage_state.json","type":"text","marks":[{"type":"code_inline"}]},{"text":")弥补此限制——首次登录后会保存完整的 Playwright storage state(含 Cookies + 
localStorage),后续抓取直接复用。","type":"text"}]},{"type":"paragraph","content":[{"text":"最佳实践","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"首次使用 Substack 前","type":"text","marks":[{"type":"strong"}]},{"text":",先运行 ","type":"text"},{"text":"python scripts/fetch_article.py login","type":"text","marks":[{"type":"code_inline"}]},{"text":" 完成登录并缓存","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果 CDP 模式下登录态校验失败,脚本会自动暂停并引导用户在弹出的 Chrome 窗口中登录","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"登录成功后会自动刷新缓存,后续抓取无需重复登录","type":"text"}]}]}]}]},{"type":"paragraph","content":[{"text":"工作原理","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"自动从 Chrome 浏览器的 Cookie 数据库提取目标域名的登录 cookies","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"将 cookies 注入 Playwright 浏览器上下文","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"加载页面,自动检测并等待 Cloudflare challenge 通过(如有)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"滚动加载懒加载内容、下载所有图片","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"自动格式转换","type":"text","marks":[{"type":"strong"}]},{"text":":检测下载图片的真实格式(WebP/SVG 伪装成 .png/.jpg 很常见),自动转为真正的 PNG 以确保 PDF 生成和文档嵌入兼容","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"将正文转换为 Markdown(","type":"text"},{"text":"article.md","type":"text","marks":[{"type":"code_inline"}]},{"text":"),图片保存到 ","type":"text"},{"text":"images/","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
子目录","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"内容提取时自动选择","type":"text"},{"text":"最长的内容容器","type":"text","marks":[{"type":"strong"}]},{"text":"(避免只抓到免费预览区域)","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"标题提取增强","type":"text","marks":[{"type":"strong"}]},{"text":"(多策略回退):","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"CSS 选择器优先级:","type":"text"},{"text":"h1.post-title","type":"text","marks":[{"type":"code_inline"}]},{"text":" > ","type":"text"},{"text":"article h1","type":"text","marks":[{"type":"code_inline"}]},{"text":" > ","type":"text"},{"text":"[class*=\"title\"] h1","type":"text","marks":[{"type":"code_inline"}]},{"text":" > ","type":"text"},{"text":"h1","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"回退到 ","type":"text"},{"text":"\u003cmeta property=\"og:title\">","type":"text","marks":[{"type":"code_inline"}]},{"text":" → ","type":"text"},{"text":"\u003cmeta name=\"title\">","type":"text","marks":[{"type":"code_inline"}]},{"text":" → ","type":"text"},{"text":"document.title","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"自动清理标题中的网站后缀(如 ","type":"text"},{"text":"\" - Cursor\"","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"\" | Substack\"","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"正文中与已提取标题相同的第一个 ","type":"text"},{"text":"\u003ch1>","type":"text","marks":[{"type":"code_inline"}]},{"text":" 会被自动去重,避免 MD 
中标题重复","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"作者提取增强","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"CSS 选择器 + ","type":"text"},{"text":"meta[name=\"author\"]","type":"text","marks":[{"type":"code_inline"}]},{"text":" + ","type":"text"},{"text":"[rel=\"author\"]","type":"text","marks":[{"type":"code_inline"}]},{"text":" + ","type":"text"},{"text":"meta[property=\"article:author\"]","type":"text","marks":[{"type":"code_inline"}]},{"text":" 多策略回退","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"PDF 文件/链接处理(2026-05-12 实战验证 ✅)","type":"text"}]},{"type":"paragraph","content":[{"text":"触发条件","type":"text","marks":[{"type":"strong"}]},{"text":":用户提供的是 PDF 直链(如 ","type":"text"},{"text":"arxiv.org/pdf/xxx","type":"text","marks":[{"type":"code_inline"}]},{"text":"、COS/CDN 直链)、乐享知识库中已存储的 PDF 条目链接,或本地 PDF 文件路径。","type":"text"}]},{"type":"paragraph","content":[{"text":"核心原则","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"PDF 的文字和图片必须分两条路处理,不能合并为一步","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"非中文 PDF(如英文论文)","type":"text"},{"text":"默认翻译为中英对照格式","type":"text","marks":[{"type":"strong"}]},{"text":"后再转存,不可跳过","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"最终产出为乐享","type":"text"},{"text":"在线文档(page 类型)","type":"text","marks":[{"type":"strong"}]},{"text":",支持后续编辑","type":"text"}]}]}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"Step A:获取 PDF 文件","type":"text"}]},{"type":"paragraph","content":[{"text":"情况1:乐享知识库中已有 PDF 条目","type":"text","marks":[{"type":"strong"}]},{"text":"(如 
","type":"text"},{"text":"lexiangla.com/pages/xxx","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"python"},"content":[{"text":"# 1. 从 URL 提取 entry_id\nentry = mcp.entry_describe_entry(entry_id=\"\u003centry_id>\")\nfile_id = entry.target_id # PDF 的文件ID\n\n# 2. 获取下载链接(有效期 3600s)\nresult = mcp.file_download_file(file_id=file_id, expire_seconds=3600)\ndownload_url = result.url\n\n# 3. 下载 PDF\ncurl -L -o paper.pdf \"\u003cdownload_url>\"","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"⚠️ 注意:","type":"text"},{"text":"entry_describe_ai_parse_content","type":"text","marks":[{"type":"code_inline"}]},{"text":" 对大型 PDF 会超过 80K 字符限制而报错,","type":"text"},{"text":"不要用它获取 PDF 内容","type":"text","marks":[{"type":"strong"}]},{"text":",改用下载方式。","type":"text"}]}]},{"type":"paragraph","content":[{"text":"情况2:arXiv 等直链 PDF","type":"text","marks":[{"type":"strong"}]}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"curl -L -o paper.pdf \"https://arxiv.org/pdf/2605.05538\"","type":"text"}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"Step B:提取文字(pymupdf)","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"python"},"content":[{"text":"import fitz\n\ndoc = fitz.open('paper.pdf')\ntext = ''\nfor page in doc:\n text += page.get_text()\nwith open('paper.txt', 'w') as f:\n f.write(text)\nprint(f\"Pages: {doc.page_count}, chars: {len(text)}\")","type":"text"}]},{"type":"paragraph","content":[{"text":"get_text()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 只提取纯文字——矢量图(流程图/柱状图)和表格结构会丢失,这是预期行为,图形通过 Step C 单独处理。","type":"text"}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"Step C:定位并精确裁剪图形","type":"text"}]},{"type":"paragraph","content":[{"text":"PDF 
中的图形分两类,提取方式不同:","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"类型","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"判断方法","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"提取方法","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"光栅图","type":"text","marks":[{"type":"strong"}]},{"text":"(嵌入的 PNG/JPEG)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"page.get_images()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 有结果","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"page.get_image_rects(xref)","type":"text","marks":[{"type":"code_inline"}]},{"text":" 获取坐标 → ","type":"text"},{"text":"clip=Rect","type":"text","marks":[{"type":"code_inline"}]},{"text":" 裁剪","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"矢量图","type":"text","marks":[{"type":"strong"}]},{"text":"(流程图/柱状图,PDF 绘图命令)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"page.get_images()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 无结果,但 ","type":"text"},{"text":"page.get_drawings()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
有数据","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"page.get_drawings()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 获取边界 → ","type":"text"},{"text":"clip=Rect","type":"text","marks":[{"type":"code_inline"}]},{"text":" 截图","type":"text"}]}]}]}]},{"type":"paragraph","content":[{"text":"定位图形坐标","type":"text","marks":[{"type":"strong"}]},{"text":"(先找出各 Figure 所在页面):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"python"},"content":[{"text":"doc = fitz.open('paper.pdf')\n\nfor page_idx in range(doc.page_count):\n page = doc[page_idx]\n \n # 找 Figure caption 位置(确定图形所在页)\n blocks = page.get_text(\"blocks\")\n for b in blocks:\n if 'Figure' in b[4]:\n print(f\"Page {page_idx+1}: [{b[0]:.0f},{b[1]:.0f},{b[2]:.0f},{b[3]:.0f}] {b[4][:60]}\")\n \n # 光栅图坐标\n for img in page.get_images(full=True):\n rects = page.get_image_rects(img[0])\n print(f\" Raster img: {rects}\")\n \n # 矢量图分布(按左栏/右栏区分,双栏论文左栏 x\u003c295,右栏 x>295)\n drawings = page.get_drawings()\n if drawings:\n xs0 = [d['rect'].x0 for d in drawings]\n xs1 = [d['rect'].x1 for d in drawings]\n ys0 = [d['rect'].y0 for d in drawings]\n ys1 = [d['rect'].y1 for d in drawings]\n print(f\" Vector drawings bbox: ({min(xs0):.0f},{min(ys0):.0f},{max(xs1):.0f},{max(ys1):.0f})\")","type":"text"}]},{"type":"paragraph","content":[{"text":"精确裁剪图形区域","type":"text","marks":[{"type":"strong"}]},{"text":"(3x 高清,仅裁剪图形本身+caption,不含论文正文):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"python"},"content":[{"text":"mat = fitz.Matrix(3.0, 3.0) # 3x 放大,确保清晰\n\n# 根据上面分析的坐标,裁剪每个 Figure\n# 双栏论文参考坐标:左栏 x: 58-290,右栏 x: 295-535\n# y 坐标从 caption 文字位置往上定图形起点\nclip = fitz.Rect(x0, y0, x1, y1) # 精确到图形边界,含 caption\npix = page.get_pixmap(matrix=mat, clip=clip)\npix.save(f'Figure{n}.png')","type":"text"}]},{"type":"paragraph","content":[{"text":"⚠️ 
关键注意事项","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"绝对不能用 ","type":"text","marks":[{"type":"strong"}]},{"text":"page.get_pixmap()","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" 不传 clip","type":"text","marks":[{"type":"strong"}]},{"text":"——会截整页(含论文正文),不是图形本身","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"get_image_rects()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 只对光栅图有效;矢量图只能通过 ","type":"text"},{"text":"get_drawings()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的 bbox + caption 坐标推断边界","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"验证截图效果:","type":"text"},{"text":"Read","type":"text","marks":[{"type":"code_inline"}]},{"text":" 工具可预览 PNG,检查是否干净(只含图形+caption)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果矢量图跨越双栏(","type":"text"},{"text":"x0 \u003c 295 且 x1 > 295","type":"text","marks":[{"type":"code_inline"}]},{"text":"),说明是通栏图,裁剪区域应包含完整宽度","type":"text"}]}]}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"Step D:语言检测与翻译","type":"text"}]},{"type":"paragraph","content":[{"text":"读取 ","type":"text"},{"text":"paper.txt","type":"text","marks":[{"type":"code_inline"}]},{"text":" 前 500 字符,统计中文字符占比:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"中文字符比例 ≥ 30% → 中文内容,","type":"text"},{"text":"跳过翻译","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"中文字符比例 \u003c 30% → 
非中文(如英文论文),","type":"text"},{"text":"必须翻译为中英对照格式","type":"text","marks":[{"type":"strong"}]}]}]}]},{"type":"paragraph","content":[{"text":"中英对照翻译格式","type":"text","marks":[{"type":"strong"}]},{"text":"(每段先英文原文,紧跟中文翻译,保留所有结构):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"markdown"},"content":[{"text":"## Introduction / 引言\n\nStandard RAG pipelines follow a static retrieve-then-generate paradigm...\n\n标准 RAG 流水线遵循静态的\"先检索再生成\"范式...\n\n### Tables(表格保留完整数据,加中文表头)\n\n### Figures(图形位置用文字描述,图片在 Step E 中插入)","type":"text"}]},{"type":"paragraph","content":[{"text":"翻译方式(按优先级):","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"translate_gemini.py","type":"text","marks":[{"type":"code_inline"}]},{"text":"(","type":"text"},{"text":"GEMINI_API_KEY","type":"text","marks":[{"type":"code_inline"}]},{"text":" 可用时)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"translate_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":"(","type":"text"},{"text":"OPENAI_API_KEY","type":"text","marks":[{"type":"code_inline"}]},{"text":" 可用时)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"AI 助手在对话中直接逐段翻译","type":"text"}]}]}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"Step E:导入乐享在线文档 + 插入图片","type":"text"}]},{"type":"paragraph","content":[{"text":"1. 
创建在线文档(import_content)","type":"text","marks":[{"type":"strong"}]}]},{"type":"code_block","attrs":{"wrap":false,"language":"python"},"content":[{"text":"result = mcp.entry_import_content(\n name=\"论文标题 中英对照翻译\",\n content=open('paper_translated.md').read(), # 中英对照版\n content_type=\"markdown\",\n parent_id=\"\u003c日期目录ID>\",\n space_id=\"\u003cSPACE_ID>\"\n)\npage_entry_id = result.entry.id\npage_root_block_id = result.entry.target_id # page 根 block,用于 move/insert","type":"text"}]},{"type":"paragraph","content":[{"text":"2. 上传各 Figure 图片并插入正确位置","type":"text","marks":[{"type":"strong"}]}]},{"type":"paragraph","content":[{"text":"每张图的完整流程(三步):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Step 1: 申请上传凭证\nsession_id, upload_url = mcp.block_apply_block_attachment_upload(\n entry_id=page_entry_id,\n name=\"FigureN.png\",\n size=str(file_size),\n mime_type=\"image/png\"\n)\n\n# Step 2: 上传图片(必须包含 Content-Length 和 Content-Type)\ncurl -X PUT \"\u003cupload_url>\" \\\n -H \"Content-Type: image/png\" \\\n -H \"Content-Length: \u003csize>\" \\\n --data-binary @FigureN.png\n\n# Step 3: 在文档正确位置插入 image block\n# - 先用 block_list_block_children 获取 block 列表,找到对应章节段落的 index\n# - caption 格式:英文原文 / 中文翻译(放在一行,用 / 分隔)\nmcp.block_create_block_descendant(\n entry_id=page_entry_id,\n parent_block_id=page_root_block_id,\n index=\"\u003c正确位置>\",\n descendant=[{\n \"block_type\": \"image\",\n \"image\": {\n \"session_id\": session_id,\n \"caption\": \"Figure N: English caption / 图N:中文翻译\",\n \"align\": \"center\"\n }\n }]\n)","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"⚠️ 重要","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"caption 英中放在一行","type":"text","marks":[{"type":"strong"}]},{"text":":用 
","type":"text"},{"text":"/","type":"text","marks":[{"type":"code_inline"}]},{"text":" 分隔,不要额外插入独立段落作为图注,否则显示时会有多余换行","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"图片位置要精确","type":"text","marks":[{"type":"strong"}]},{"text":":先获取 block 列表找到对应章节 block 的 index,不能全部 append 到末尾","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"update_block","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" 不支持修改 image block 的 caption","type":"text","marks":[{"type":"strong"}]},{"text":":如果 caption 写错了,只能 delete + recreate。重建时可用已有的 ","type":"text"},{"text":"file_id","type":"text","marks":[{"type":"code_inline"}]},{"text":"(通过 ","type":"text"},{"text":"block_describe_block","type":"text","marks":[{"type":"code_inline"}]},{"text":" 查询),","type":"text"},{"text":"不需要重新上传文件","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"每次插入图片后 index 会偏移","type":"text","marks":[{"type":"strong"}]},{"text":":如需在不同位置插入多张图,要按顺序从前往后插,并累计计算 index","type":"text"}]}]}]}]},{"type":"paragraph","content":[{"text":"3. 
图片位置确定方法","type":"text","marks":[{"type":"strong"}]}]},{"type":"paragraph","content":[{"text":"先获取文档当前全部 block:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"python"},"content":[{"text":"blocks = mcp.block_list_block_children(entry_id=page_entry_id, with_descendants=False)","type":"text"}]},{"type":"paragraph","content":[{"text":"找到对应章节的最后一个 p/h block 的 index,图片插入该 index+1 处。","type":"text"}]},{"type":"paragraph","content":[{"text":"已验证的工作流(arXiv PDF 2605.05538 实战,2026-05-12)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Figure 1/2 为光栅图(嵌入 PNG),用 ","type":"text"},{"text":"get_image_rects()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 定位","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Figure 3/4 为矢量图(柱状图),用 ","type":"text"},{"text":"get_drawings()","type":"text","marks":[{"type":"code_inline"}]},{"text":" bbox + caption 坐标确定裁剪区域","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Figure 5 为光栅图(截图),用 ","type":"text"},{"text":"get_image_rects()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 定位","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"所有图片 3x 渲染后 60-90KB,质量良好","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"微信公众号文章(mp.weixin.qq.com)专项优化","type":"text"}]},{"type":"paragraph","content":[{"text":"脚本对微信公众号文章有专门的检测和处理策略:","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"自动检测","type":"text","marks":[{"type":"strong"}]},{"text":":识别 ","type":"text"},{"text":"mp.weixin.qq.com","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
域名,自动启用微信模式","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"无需登录","type":"text","marks":[{"type":"strong"}]},{"text":":微信公众号文章是公开可读的,跳过登录检测和 Cookie 注入流程","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"专用内容选择器","type":"text","marks":[{"type":"strong"}]},{"text":":使用 ","type":"text"},{"text":"#js_content","type":"text","marks":[{"type":"code_inline"}]},{"text":" / ","type":"text"},{"text":".rich_media_content","type":"text","marks":[{"type":"code_inline"}]},{"text":" 精准定位正文区域(而非通用选择器可能匹配到页面其他内容)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"标题提取","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"#activity-name","type":"text","marks":[{"type":"code_inline"}]},{"text":" > ","type":"text"},{"text":"h1.rich_media_title","type":"text","marks":[{"type":"code_inline"}]},{"text":" > 通用 h1 > meta 标签回退","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"作者提取","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"#js_name","type":"text","marks":[{"type":"code_inline"}]},{"text":"(公众号名称)> ","type":"text"},{"text":".rich_media_meta_nickname","type":"text","marks":[{"type":"code_inline"}]},{"text":" > 通用选择器回退","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"日期提取","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"#publish_time","type":"text","marks":[{"type":"code_inline"}]},{"text":" > 通用 time/date 选择器回退","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"图片懒加载增强","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"微信图片使用 ","type":"text"},{"text":"data-src","type":"text","marks":[{"type":"code_inline"}]},{"text":" + 
IntersectionObserver 懒加载","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"滚动速度放慢(300px 步长、200ms 间隔)以确保触发所有 IntersectionObserver","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"强制将未触发的 ","type":"text"},{"text":"data-src","type":"text","marks":[{"type":"code_inline"}]},{"text":" 复制到 ","type":"text"},{"text":"src","type":"text","marks":[{"type":"code_inline"}]},{"text":"(兜底策略)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"图片下载时优先使用 ","type":"text"},{"text":"data-src","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的高清原图 URL","type":"text"}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"图片格式识别","type":"text","marks":[{"type":"strong"}]},{"text":":微信图片 URL 格式特殊(","type":"text"},{"text":"mmbiz.qpic.cn/...?wx_fmt=png","type":"text","marks":[{"type":"code_inline"}]},{"text":"),从 ","type":"text"},{"text":"wx_fmt","type":"text","marks":[{"type":"code_inline"}]},{"text":" 查询参数推断文件扩展名","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Referer 防盗链","type":"text","marks":[{"type":"strong"}]},{"text":":通过 Playwright 页面上下文的 ","type":"text"},{"text":"page.request.get()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 下载图片,自动携带正确的 Referer 头","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Substack 站点(如 www.lennysnewsletter.com)专项优化","type":"text","marks":[{"type":"strong"}]},{"text":": 脚本对 Substack 托管的站点(","type":"text"},{"text":"*.substack.com","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"lennysnewsletter.com","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
等)有专门的登录检测和","type":"text"},{"text":"登录态缓存","type":"text","marks":[{"type":"strong"}]},{"text":"机制:","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"登录态缓存","type":"text","marks":[{"type":"strong"}]},{"text":":登录成功后自动保存 Playwright ","type":"text"},{"text":"storage_state","type":"text","marks":[{"type":"code_inline"}]},{"text":" 到 ","type":"text"},{"text":"~/.substack/storage_state.json","type":"text","marks":[{"type":"code_inline"}]},{"text":",后续抓取直接复用,","type":"text"},{"text":"无需重复登录和邮箱验证","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"优先级","type":"text","marks":[{"type":"strong"}]},{"text":":缓存 ","type":"text"},{"text":"storage_state","type":"text","marks":[{"type":"code_inline"}]},{"text":" > Chrome cookies > 引导登录","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"自动检测登录状态","type":"text","marks":[{"type":"strong"}]},{"text":":加载页面后检查右上角是否有用户头像(已登录)还是 \"Sign in\" 按钮(未登录)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"已登录","type":"text","marks":[{"type":"strong"}]},{"text":" → 直接抓取全文,并刷新缓存延长有效期","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"缓存过期","type":"text","marks":[{"type":"strong"}]},{"text":" → 自动清理旧缓存,进入引导登录流程","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"未登录","type":"text","marks":[{"type":"strong"}]},{"text":" → 打开可见浏览器窗口引导登录,用户在终端输入 ","type":"text"},{"text":"y","type":"text","marks":[{"type":"code_inline"}]},{"text":" 确认后二次验证,通过后自动缓存","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"独立登录命令","type":"text","marks":[{"type":"strong"}]},{"text":"(推荐首次使用时先执行):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python scripts/fetch_article.py 
login","type":"text"}]},{"type":"paragraph","content":[{"text":"此命令单独完成 Substack 登录并缓存,不需要指定文章 URL。后续所有 Substack 文章抓取都会自动复用此登录态。","type":"text"}]},{"type":"paragraph","content":[{"text":"非 Substack 站点的登录确认机制","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"无 Chrome cookies 时自动切换到非无头模式,打开可见浏览器窗口","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"终端提示用户完成登录操作后","type":"text"},{"text":"按回车键","type":"text","marks":[{"type":"strong"}]},{"text":"继续","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"收到确认信号后重新加载页面并检测付费墙状态","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"付费墙检测","type":"text","marks":[{"type":"strong"}]},{"text":":脚本同时检测以下信号:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"DOM 元素:","type":"text"},{"text":"[data-testid=\"paywall\"]","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":".paywall","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"文本关键词:","type":"text"},{"text":"This post is for paid subscribers","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"Subscribe to read","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"Upgrade to paid","type":"text","marks":[{"type":"code_inline"}]},{"text":" 等","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"注意:不同网站的付费墙 DOM 结构和关键词不同,如遇新网站抓取不完整,需检查页面实际的付费墙标识并更新检测逻辑","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"判断内容是否完整的方法","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"先用 
","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":" 尝试获取,如果明显被截断(内容不完整、出现付费提示),则切换到 ","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"抓取完成后","type":"text"},{"text":"必须","type":"text","marks":[{"type":"strong"}]},{"text":"告知用户查看 ","type":"text"},{"text":"article.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" 确认内容完整性","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"关注文章末尾是否有作者署名/总结段落作为完整性标志","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果用户反馈内容不完整,检查:(1) 登录账号是否有付费权限 (2) 页面是否有懒加载内容未触发 (3) 内容选择器是否匹配到了免费预览区而非全文区","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"产出物","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003c项目子目录>/\u003c原文标题>.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 完整文章 Markdown(含图片引用)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003c项目子目录>/\u003c原文标题>_meta.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 结构化元信息(原文链接、作者、发布时间、抓取时间等)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003c项目子目录>/images/","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 所有文章配图","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"\u003c原文标题>_meta.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" 格式:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"json"},"content":[{"text":"{\n \"url\": \"原文链接\",\n \"title\": \"文章标题\",\n \"subtitle\": \"副标题\",\n \"author\": \"作者\",\n \"date\": \"发布时间\",\n \"content_length\": 12345,\n \"image_count\": 5,\n \"images\": [\"images/img_01_xxx.png\", ...],\n 
\"fetched_at\": \"2026-02-25T10:30:00\"\n}","type":"text"}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"X.com / Twitter 帖子抓取(必须用 CDP 模式)","type":"text"}]},{"type":"paragraph","content":[{"text":"X.com 是登录墙网站的典型代表","type":"text","marks":[{"type":"strong"}]},{"text":",","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":" 和普通 Cookie 注入模式都无法抓取,","type":"text"},{"text":"必须使用 CDP 模式","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# CDP 模式(必须)\npython scripts/fetch_article.py fetch \"https://x.com/\u003cusername>/status/\u003cid>\" --output-dir \u003c项目子目录> --cdp","type":"text"}]},{"type":"paragraph","content":[{"text":"CDP 模式工作原理","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"通过 Chrome DevTools Protocol (port 9222) 连接用户真实 Chrome 浏览器","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"复用浏览器中已登录的 X 账号会话","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"绕过自动化浏览器检测(X 会检测并阻止 Playwright/Selenium)","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"CDP 模式前置条件","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# 启动 Chrome 并开启 CDP 端口\n/Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome --remote-debugging-port=9222 &\n\n# 验证\ncurl -s http://localhost:9222/json/version","type":"text"}]},{"type":"paragraph","content":[{"text":"X.com 
抓取的特殊处理","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"帖子内容会转换为 Markdown 格式","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"图片(帖子中的媒体)会下载到 ","type":"text"},{"text":"images/","type":"text","marks":[{"type":"code_inline"}]},{"text":" 目录","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"帖子中的链接会转换为 Markdown 链接格式","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"转发数、点赞数等元信息会保留","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"产出物","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003c项目子目录>/\u003c原文标题>.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 帖子 Markdown","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003c项目子目录>/\u003c原文标题>_meta.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 元信息","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003c项目子目录>/images/","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 帖子中的媒体图片","type":"text"}]}]}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"微博帖子抓取(必须用 CDP 模式)","type":"text"}]},{"type":"paragraph","content":[{"text":"微博是强制登录墙网站","type":"text","marks":[{"type":"strong"}]},{"text":",所有端口(PC 端 ","type":"text"},{"text":"weibo.com","type":"text","marks":[{"type":"code_inline"}]},{"text":"、移动端 ","type":"text"},{"text":"m.weibo.cn","type":"text","marks":[{"type":"code_inline"}]},{"text":"、API 
","type":"text"},{"text":"m.weibo.cn/statuses/show","type":"text","marks":[{"type":"code_inline"}]},{"text":")均需要登录态,","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":" 和普通 Playwright 都会被重定向到 ","type":"text"},{"text":"Sina Visitor System","type":"text","marks":[{"type":"code_inline"}]},{"text":" 登录页。","type":"text"},{"text":"必须使用 CDP 模式","type":"text","marks":[{"type":"strong"}]},{"text":"。","type":"text"}]},{"type":"paragraph","content":[{"text":"抓取命令","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# CDP 模式(必须)— 连接本地已登录 Chrome\npython scripts/fetch_article.py fetch \"https://weibo.com/\u003cuid>/\u003cmid>\" --output-dir \u003c项目子目录> --cdp","type":"text"}]},{"type":"paragraph","content":[{"text":"完整流程","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"确保 Chrome 已开启 CDP 端口","type":"text","marks":[{"type":"strong"}]},{"text":"(port 9222)且已登录微博","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"运行 ","type":"text","marks":[{"type":"strong"}]},{"text":"fetch_article.py --cdp","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":":脚本会连接真实 Chrome,复用微博登录态","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"CDP 连接失败时的自动降级","type":"text","marks":[{"type":"strong"}]},{"text":":脚本会回退到 Cookie 注入模式(从 Chrome Cookie DB 提取 cookies),但微博 Cookie 注入通常也能工作","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"抓取后整理内容","type":"text","marks":[{"type":"strong"}]},{"text":":微博原始 HTML 结构较乱,抓取结果中可能包含导航、按钮等噪音文本,需要手动清理或用 AI 整理为结构化 
Markdown","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"转存乐享","type":"text","marks":[{"type":"strong"}]},{"text":":使用 ","type":"text"},{"text":"entry_import_content","type":"text","marks":[{"type":"code_inline"}]},{"text":" 创建页面(非 ","type":"text"},{"text":"file_create_hyperlink","type":"text","marks":[{"type":"code_inline"}]},{"text":",后者仅支持微信公众号链接)","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"微博抓取的特殊注意事项","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"问题","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"说明","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":" 失败","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"微博强制登录,web_fetch 会被重定向到 ","type":"text"},{"text":"passport.weibo.com/visitor/visitor","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Playwright 失败","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"微博检测 HeadlessChrome UA,即使使用 ","type":"text"},{"text":"--browser=chrome","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
也会被拦截","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"CDP 前置条件","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Chrome 必须已开启 ","type":"text"},{"text":"--remote-debugging-port=9222","type":"text","marks":[{"type":"code_inline"}]},{"text":" 且已登录微博","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"内容整理","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"微博页面标题通常是「微博正文 - 微博」,转存时应提取作者名和关键主题作为标题","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"图片处理","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"微博图片使用 ","type":"text"},{"text":"sinaimg.cn","type":"text","marks":[{"type":"code_inline"}]},{"text":" CDN,","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 可以下载,但部分图片可能需要 Referer","type":"text"}]}]}]}]},{"type":"paragraph","content":[{"text":"产出物","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003c项目子目录>/article.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 微博内容 Markdown(注意:微博标题通常是通用的,需手动重命名)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003c项目子目录>/article_meta.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 
元信息","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003c项目子目录>/images/","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 微博中的图片(如有)","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"转存乐享时的标题建议","type":"text","marks":[{"type":"strong"}]},{"text":": 微博原始标题是「微博正文 - 微博」,转存时应改为有意义的标题,格式建议:","type":"text"},{"text":"\u003c作者>:\u003c主题关键词>","type":"text","marks":[{"type":"code_inline"}]},{"text":",例如:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"唐杰THU:最近的一些想法(AI 技术趋势)","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"李飞飞:关于 Spatial AI 的思考","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"英文文章翻译为中英对照","type":"text"}]},{"type":"paragraph","content":[{"text":"对于英文文章(如 X 帖子、英文博客等),可以使用 OpenAI API 翻译为中英对照格式:","type":"text"}]},{"type":"paragraph","content":[{"text":"翻译脚本","type":"text","marks":[{"type":"strong"}]},{"text":" (","type":"text"},{"text":"scripts/translate_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":"):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python scripts/translate_article.py \u003c原文.md> \u003c输出.md> --model gpt-4o-mini","type":"text"}]},{"type":"paragraph","content":[{"text":"翻译格式","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"markdown"},"content":[{"text":"## 英文标题\n\n[英文原文段落]\n\n[中文翻译]\n\n## 第二节英文标题\n\n[英文原文...]\n\n[中文翻译...]","type":"text"}]},{"type":"paragraph","content":[{"text":"翻译工作流","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"先用 
","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 抓取原文","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"用 ","type":"text"},{"text":"translate_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 翻译为中英对照","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"将翻译后的 Markdown 上传到乐享知识库","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"依赖","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"OPENAI_API_KEY","type":"text","marks":[{"type":"code_inline"}]},{"text":" 环境变量","type":"text"}]}]}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"使用 ","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":" 获取的免费文章","type":"text"}]},{"type":"paragraph","content":[{"text":"对于通过 ","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":" 获取到完整内容的免费文章,","type":"text"},{"text":"同样需要保存原文","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"保存原文全文","type":"text","marks":[{"type":"strong"}]},{"text":":将 ","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":" 返回的内容直接保存为 Markdown,","type":"text"},{"text":"不做总结、不做摘要、不做改写","type":"text","marks":[{"type":"strong"}]},{"text":",保持原文的完整结构和措辞","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"文件名使用原文标题:","type":"text"},{"text":"\u003c项目子目录>/\u003c原文标题>.md","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"手动构建 
","type":"text"},{"text":"\u003c原文标题>_meta.json","type":"text","marks":[{"type":"code_inline"}]},{"text":",包含 URL、标题、作者、日期等元信息","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果文章包含图片,尽量下载保存到 ","type":"text"},{"text":"\u003c项目子目录>/images/","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"关键区分","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":" 工具可能会返回总结/摘要版本而非原文全文。如果返回的内容明显是总结(缺少原始段落、引用、细节),需要在 ","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":" 调用时明确要求\"返回完整原始全文内容,不要总结或缩写\"。保存到本地的","type":"text"},{"text":"必须是原文全文","type":"text","marks":[{"type":"strong"}]},{"text":",而不是经过 AI 总结的摘要。","type":"text"}]}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"YouTube 视频处理(yt-dlp + Whisper + 翻译 + 乐享)","type":"text"}]},{"type":"paragraph","content":[{"text":"当用户提供 YouTube 视频链接时","type":"text","marks":[{"type":"strong"}]},{"text":",使用 ","type":"text"},{"text":"yt_download_transcribe.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 脚本完成完整的下载-转录-翻译-归档工作流。","type":"text"}]},{"type":"paragraph","content":[{"text":"⚠️ 重要","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"不要","type":"text","marks":[{"type":"strong"}]},{"text":"使用 ","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":"(无法获取视频内容),","type":"text"},{"text":"不要","type":"text","marks":[{"type":"strong"}]},{"text":"使用 NotebookLM(已替换为本地 Whisper 方案,速度更快、无外部依赖)。","type":"text"}]},{"type":"paragraph","content":[{"text":"工作流概述","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"yt-dlp 
下载视频","type":"text","marks":[{"type":"strong"}]},{"text":" → 本地 ","type":"text"},{"text":".mp4","type":"text","marks":[{"type":"code_inline"}]},{"text":" 文件","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"ffmpeg 提取音频","type":"text","marks":[{"type":"strong"}]},{"text":" → WAV 格式(16kHz 单声道)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Whisper 转录","type":"text","marks":[{"type":"strong"}]},{"text":" → 带时间戳的文字稿","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"AI 翻译","type":"text","marks":[{"type":"strong"}]},{"text":"(如果是英文)→ 中英对照格式的 Markdown","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"上传乐享知识库","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"文字稿:","type":"text"},{"text":"以在线文档(page)格式上传","type":"text","marks":[{"type":"strong"}]},{"text":",支持后续按块维度编辑更新","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"视频文件:通过 OpenAPI 以视频(entry_type=video)类型上传,确保可在线播放(不要以 file 格式上传)","type":"text"}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"清理","type":"text","marks":[{"type":"strong"}]},{"text":":上传成功后删除本地视频文件","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Step 1:下载 + 转录 + 翻译","type":"text","marks":[{"type":"strong"}]}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"cd \u003c项目子目录>\n\n# 完整流程(下载 + 转录 + 翻译)\npython3 scripts/yt_download_transcribe.py \"\u003cYouTube URL>\" \\\n --output-dir . 
\\\n --whisper-model base\n\n# 常用参数:\n# --whisper-model tiny|base|small|medium|large 转录模型(越大越准但越慢)\n# --skip-download 跳过下载(用于重新转录已下载的视频)\n# --skip-translate 跳过翻译步骤\n# --keep-audio 保留提取的音频文件","type":"text"}]},{"type":"paragraph","content":[{"text":"产出物","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003c视频标题>.mp4","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 下载的视频文件","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003c视频标题>.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 文字稿 Markdown(英文视频为中英对照格式)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003c视频标题>_meta.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 视频元信息","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"文字稿格式","type":"text","marks":[{"type":"strong"}]},{"text":"(英文视频,中英对照):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"markdown"},"content":[{"text":"# 视频标题\n\n**频道**: xxx\n**发布日期**: 2026-03-10\n**时长**: 15:30\n**原始链接**: https://www.youtube.com/watch?v=xxx\n**转录语言**: en\n\n---\n\n## 文字稿(中英对照)\n\n> 以下内容采用「英文原文 + 中文翻译」对照排列。\n\n**[00:00]**\n\nThis is the original English text from the video...\n\n这是视频中的中文翻译文本...\n\n**[01:23]**\n\nNext paragraph of English text...\n\n下一段中文翻译...","type":"text"}]},{"type":"paragraph","content":[{"text":"Whisper 
模型选择建议","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"模型","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"速度","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"精度","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"适用场景","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"tiny","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"最快","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"较低","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"快速预览、非关键内容","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"base","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"快","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"中等","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan
":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"默认推荐","type":"text","marks":[{"type":"strong"}]},{"text":",适合大部分场景","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"small","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"中等","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"较高","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"口音较重、背景噪音较多","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"medium","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"慢","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"高","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"重要内容、需要高精度","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"large","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"最慢","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragr
aph","content":[{"text":"最高","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"专业内容、学术演讲","type":"text"}]}]}]}]},{"type":"paragraph","content":[{"text":"Step 2:上传到乐享知识库","type":"text","marks":[{"type":"strong"}]}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"通过 lexiang MCP 工具完成上传,流程与 Step 2(普通文章转存乐享)一致。","type":"text"},{"text":"前提是 lexiang MCP 已连接","type":"text","marks":[{"type":"strong"}]},{"text":"(参见 Step 2 的「乐享 MCP 工具的调用方式」章节)。","type":"text"}]}]},{"type":"paragraph","content":[{"text":"文字稿上传","type":"text","marks":[{"type":"strong"}]},{"text":"(在线文档 page 类型):","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"获取知识库根节点 → 检查/创建日期目录(同上述步骤 1-3)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"调用 ","type":"text"},{"text":"entry_import_content","type":"text","marks":[{"type":"code_inline"}]},{"text":"(参数:","type":"text"},{"text":"space_id","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"parent_id=\u003c日期目录ID>","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"name=\"\u003c视频标题>\"","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"content=\u003c文字稿Markdown内容>","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"content_type=\"markdown\"","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"⚠️ 此接口不支持 ","type":"text"},{"text":"after","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
参数,新建文档会追加到目录末尾,","type":"text"},{"text":"无法控制位置","type":"text","marks":[{"type":"strong"}]}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"在线文档支持后续在乐享中按块维度编辑更新(如修正翻译)","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"视频文件上传","type":"text","marks":[{"type":"strong"}]},{"text":"(🚨 推荐使用 OpenAPI 路径,MCP 的 ","type":"text"},{"text":"file_apply_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":" 产生不可播放的 file 条目):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# ✅ 推荐:通过 OpenAPI 上传,产生 entry_type=video,乐享会 VOD 转码,真能播放\npython3 scripts/upload_video_via_openapi.py \"\u003c视频路径>.mp4\" \\\n --space-id \u003cspace_id> \\\n --parent-entry-id \u003c父节点 entry_id> \\\n --media-type video","type":"text"}]},{"type":"paragraph","content":[{"text":"需要在 ","type":"text"},{"text":"~/.lexiang/openapi.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" 配置 AppKey/AppSecret/StaffID(","type":"text"},{"text":"不入 git","type":"text","marks":[{"type":"strong"}]},{"text":")。","type":"text"}]},{"type":"paragraph","content":[{"text":"OpenAPI 正确流程(脚本已封装):","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"POST /cgi-bin/v1/kb/files/upload-params","type":"text","marks":[{"type":"code_inline"}]},{"text":"(body: ","type":"text"},{"text":"{\"name\":\"xxx.mp4\",\"media_type\":\"video\"}","type":"text","marks":[{"type":"code_inline"}]},{"text":")→ 获取 VOD 上传签名 + state","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"PUT \u003cbucket>.cos.\u003cregion>.myqcloud.com/\u003ckey>","type":"text","marks":[{"type":"code_inline"}]},{"text":" → 上传到 VOD COS","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"POST 
/cgi-bin/v1/kb/entries?space_id=xxx&state=xxx","type":"text","marks":[{"type":"code_inline"}]},{"text":"(body: ","type":"text"},{"text":"entry_type=video, name=xxx.mp4","type":"text","marks":[{"type":"code_inline"}]},{"text":")→ 创建可播放视频节点","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"🚨 关键踩坑(2026-05-03 实战总结)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"❌ 不要用 MCP 的 ","type":"text"},{"text":"file_apply_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":"——产物是 ","type":"text"},{"text":"entry_type=file + extension=video","type":"text","marks":[{"type":"code_inline"}]},{"text":",不触发 VOD 转码,","type":"text"},{"text":"视频无法播放","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"❌ 不要用 ","type":"text"},{"text":"/cgi-bin/v1/docs/cos-param","type":"text","marks":[{"type":"code_inline"}]},{"text":" 签名接口——它只支持 ","type":"text"},{"text":"attachment/file","type":"text","marks":[{"type":"code_inline"}]},{"text":",签发的 state 不能用于创建 entry_type=video","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"✅ ","type":"text"},{"text":"必须用 ","type":"text","marks":[{"type":"strong"}]},{"text":"/cgi-bin/v1/kb/files/upload-params","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":"——支持 ","type":"text"},{"text":"media_type=video/audio/file","type":"text","marks":[{"type":"code_inline"}]},{"text":",签发的 state 可用于 ","type":"text"},{"text":"kb/entries","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"✅ ","type":"text"},{"text":"name","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
参数","type":"text"},{"text":"必须带文件后缀","type":"text","marks":[{"type":"strong"}]},{"text":"(","type":"text"},{"text":".mp4","type":"text","marks":[{"type":"code_inline"}]},{"text":" 等),否则报\"name 需指定文件后缀\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"✅ ","type":"text"},{"text":"kb/entries","type":"text","marks":[{"type":"code_inline"}]},{"text":" 接口用 ","type":"text"},{"text":"x-staff-id","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":"(小写带连字符),不是 ","type":"text"},{"text":"StaffID","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"paragraph","content":[{"text":"备用:MCP 三步流程(仅适用于非视频文件,如 PDF)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"file_apply_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":"(参数:","type":"text"},{"text":"parent_entry_id=\u003c日期目录ID>","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"name=\"\u003c文件名>.pdf\"","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"size=\u003c文件字节数>","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"mime_type=\"application/pdf\"","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"upload_type=\"PRE_SIGNED_URL\"","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"curl -X PUT \"\u003cupload_url>\" -H \"Content-Type: application/pdf\" --data-binary 
\"@\u003c文件路径>\"","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"file_commit_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":"(参数:","type":"text"},{"text":"session_id=\u003c上一步返回的session_id>","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"上传成功后","type":"text","marks":[{"type":"strong"}]},{"text":":自动删除本地视频文件(","type":"text"},{"text":"rm -f \u003c视频文件路径>","type":"text","marks":[{"type":"code_inline"}]},{"text":"),节省磁盘空间。","type":"text"}]},{"type":"paragraph","content":[{"text":"依赖","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"yt-dlp","type":"text","marks":[{"type":"code_inline"}]},{"text":"(","type":"text"},{"text":"推荐 ","type":"text","marks":[{"type":"strong"}]},{"text":"brew install yt-dlp","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":",不要用 ","type":"text"},{"text":"pip3 install","type":"text","marks":[{"type":"code_inline"}]},{"text":")— YouTube 视频下载。必须用 brew 安装以获取最新版本,pip 版本受限于系统 Python 版本(如 Python 3.9 无法安装 nightly 版),而 brew 版自带独立 Python 环境","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"openai-whisper","type":"text","marks":[{"type":"code_inline"}]},{"text":"(","type":"text"},{"text":"pip3 install openai-whisper","type":"text","marks":[{"type":"code_inline"}]},{"text":")— 音频转录","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"ffmpeg","type":"text","marks":[{"type":"code_inline"}]},{"text":"(","type":"text"},{"text":"brew install ffmpeg","type":"text","marks":[{"type":"code_inline"}]},{"text":")— 
音频提取","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"openai","type":"text","marks":[{"type":"code_inline"}]},{"text":"(","type":"text"},{"text":"pip3 install openai","type":"text","marks":[{"type":"code_inline"}]},{"text":")— 翻译(需要 ","type":"text"},{"text":"OPENAI_API_KEY","type":"text","marks":[{"type":"code_inline"}]},{"text":" 环境变量)。","type":"text"},{"text":"如果没有 API Key,可以跳过翻译步骤,由 AI 助手直接在对话中翻译后更新文档","type":"text","marks":[{"type":"strong"}]}]}]}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"播客音频处理(yt-dlp + Whisper + 乐享)","type":"text"}]},{"type":"paragraph","content":[{"text":"当用户提供播客链接时","type":"text","marks":[{"type":"strong"}]},{"text":"(小宇宙FM ","type":"text"},{"text":"xiaoyuzhoufm.com","type":"text","marks":[{"type":"code_inline"}]},{"text":"、Apple Podcasts 等),使用 yt-dlp 下载音频 + Whisper 转录的方式处理。","type":"text"}]},{"type":"paragraph","content":[{"text":"⚠️ 重要","type":"text","marks":[{"type":"strong"}]},{"text":":yt-dlp 的 generic extractor 可以从播客页面中自动提取音频 URL(m4a/mp3),","type":"text"},{"text":"不需要","type":"text","marks":[{"type":"strong"}]},{"text":" cookies,也","type":"text"},{"text":"不需要","type":"text","marks":[{"type":"strong"}]},{"text":"专门的播客 extractor。","type":"text"}]},{"type":"paragraph","content":[{"text":"工作流概述","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"yt-dlp 下载音频","type":"text","marks":[{"type":"strong"}]},{"text":" → 本地 ","type":"text"},{"text":".m4a","type":"text","marks":[{"type":"code_inline"}]},{"text":" 或 ","type":"text"},{"text":".mp3","type":"text","marks":[{"type":"code_inline"}]},{"text":" 文件(播客没有视频,直接是音频)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"ffmpeg 提取/转换音频","type":"text","marks":[{"type":"strong"}]},{"text":" → WAV 格式(16kHz 单声道,Whisper 
推荐)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Whisper 转录","type":"text","marks":[{"type":"strong"}]},{"text":" → 带时间戳的文字稿","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"繁简转换","type":"text","marks":[{"type":"strong"}]},{"text":"(如需要)→ Whisper base 模型对中文会输出繁体,需用 ","type":"text"},{"text":"opencc","type":"text","marks":[{"type":"code_inline"}]},{"text":" 转为简体","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"上传乐享知识库","type":"text","marks":[{"type":"strong"}]},{"text":"(文字稿走 lexiang MCP 工具,音频走 OpenAPI):","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"文字稿:","type":"text"},{"text":"entry_import_content","type":"text","marks":[{"type":"code_inline"}]},{"text":" 创建为在线文档(page)格式","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"音频文件:","type":"text"},{"text":"upload_video_via_openapi.py --media-type audio","type":"text","marks":[{"type":"code_inline"}]},{"text":"(走 OpenAPI VOD 路径,不要用 MCP 的 ","type":"text"},{"text":"file_apply_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":",否则无法在线播放)","type":"text"}]}]}]}]}]},{"type":"paragraph","content":[{"text":"Step 1:下载音频","type":"text","marks":[{"type":"strong"}]}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"cd \u003c项目子目录>\n\n# yt-dlp 直接下载播客音频(不需要 cookies)\nyt-dlp --no-playlist -o \"%(title)s.%(ext)s\" \"\u003c播客链接>\"","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"小宇宙链接格式","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"https://www.xiaoyuzhoufm.com/episode/\u003cepisode_id>","type":"text","marks":[{"type":"code_inline"}]},{"text":" yt-dlp 会通过 generic extractor 自动从页面中提取 
","type":"text"},{"text":"media.xyzcdn.net","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的音频直链。","type":"text"}]}]},{"type":"paragraph","content":[{"text":"Step 2:提取 WAV + Whisper 转录","type":"text","marks":[{"type":"strong"}]}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# 提取 WAV(16kHz 单声道)\nffmpeg -i \"\u003c音频文件>.m4a\" -vn -acodec pcm_s16le -ar 16000 -ac 1 -y \"\u003c音频文件>.wav\"\n\n# Whisper 转录(中文播客指定 language=zh)\npython3 -c \"\nimport whisper, json, time\nmodel = whisper.load_model('base')\nresult = model.transcribe('\u003c音频文件>.wav', language='zh', verbose=False)\nwith open('whisper_segments.json', 'w', encoding='utf-8') as f:\n json.dump(result['segments'], f, ensure_ascii=False, indent=2)\nprint(f'Done: {len(result[\\\"segments\\\"])} segments')\n\"","type":"text"}]},{"type":"paragraph","content":[{"text":"Step 3:合并段落 + 繁简转换 + 生成 Markdown","type":"text","marks":[{"type":"strong"}]}]},{"type":"paragraph","content":[{"text":"段落合并思路与 YouTube 视频相同,但中文播客使用更紧的参数(max_gap=1.0s, max_duration=15s,遇句末标点+gap>0.5s 断开),详见下方「段落合并策略」。","type":"text"}]},{"type":"paragraph","content":[{"text":"关键","type":"text","marks":[{"type":"strong"}]},{"text":":Whisper base 模型对中文普通话倾向输出繁体字,必须用 ","type":"text"},{"text":"opencc","type":"text","marks":[{"type":"code_inline"}]},{"text":" 进行繁简转换:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"pip3 install opencc-python-reimplemented","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"python"},"content":[{"text":"import opencc\nconverter = opencc.OpenCC(\"t2s\")\nsimplified_text = converter.convert(traditional_text)","type":"text"}]},{"type":"paragraph","content":[{"text":"文字稿 Markdown 格式","type":"text","marks":[{"type":"strong"}]},{"text":"(中文播客):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"markdown"},"content":[{"text":"# 播客标题\n\n> 播客:节目名 | 平台:小宇宙FM\n> 嘉宾:xxx | 主播:xxx\n> 发布日期:YYYY-MM-DD | 时长:xx分xx秒\n> 
原始链接:https://www.xiaoyuzhoufm.com/episode/xxx\n> 转录工具:Whisper base + OpenCC 繁简转换\n\n---\n\n## Part 1:章节标题\n\n**[00:00]** 第一段转录文本,由多个 Whisper segment 合并而成...\n\n**[01:23]** 第二段转录文本...\n\n## Part 2:章节标题\n\n**[15:30]** 第三段转录文本...","type":"text"}]},{"type":"paragraph","content":[{"text":"文字稿整理规范(🚨 必须遵守,避免格式混乱)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"核心原则","type":"text","marks":[{"type":"strong"}]},{"text":":Whisper 输出的 segments 是细碎的短句(通常每条1-5秒),必须","type":"text"},{"text":"先合并为自然段落","type":"text","marks":[{"type":"strong"}]},{"text":",再插入章节标题和时间戳。直接按 segment 粒度插入标题会导致同一个标题在每个短句前重复出现。","type":"text"}]}]},{"type":"paragraph","content":[{"text":"段落合并策略","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"🚨 关键 bug 修复(2026-05-08)","type":"text","marks":[{"type":"strong"}]},{"text":":Whisper base 对中文输出几乎没有句号等标点,因此\"句末标点断开\"条件基本不会触发。","type":"text"},{"text":"唯一有效的断开条件是 duration 和 gap","type":"text","marks":[{"type":"strong"}]},{"text":"。必须确保 duration 计算正确:","type":"text"},{"text":"duration = 当前 segment 的 end - 段落起始 cur_start","type":"text","marks":[{"type":"code_inline"}]},{"text":"。","type":"text"}]}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"相邻 segment 间隔 > 1.0s → ","type":"text"},{"text":"强制断开","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"累计时长 > 15s(","type":"text"},{"text":"seg.end - cur_start > 15","type":"text","marks":[{"type":"code_inline"}]},{"text":")→ ","type":"text"},{"text":"强制断开","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"遇到句号、问号等句末标点 + gap > 0.5s → 
断开为新段落","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"合并后的段落开头标注时间戳 ","type":"text"},{"text":"**[MM:SS]**","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"paragraph","content":[{"text":"⚠️ 参数选择依据","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"max_duration=15s","type":"text","marks":[{"type":"code_inline"}]},{"text":" 而非 30s/60s:因为中文 Whisper 没有标点输出,只能靠 duration 强制切割。15s 约 200 字/段,阅读体验较好","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"max_gap=1.0s","type":"text","marks":[{"type":"code_inline"}]},{"text":":对话中的自然停顿通常 > 1s","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"目标:48 分钟播客应产出 150-200 段(平均 90-100 字/段)","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"合并代码参考","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"python"},"content":[{"text":"paragraphs = []\ncur_text = \"\"\ncur_start = 0\ncur_end = 0\n\nfor seg in segments:\n    start, end, text = seg[\"start\"], seg[\"end\"], seg[\"text\"].strip()\n    if not text:\n        continue\n    if cur_text:\n        gap = start - cur_end\n        duration = end - cur_start  # ⚠️ 必须用 end 而非 cur_end\n        if duration > 15 or gap > 1.0:\n            paragraphs.append({\"start\": cur_start, \"end\": cur_end, \"text\": cur_text.strip()})\n            cur_text, cur_start, cur_end = text, start, end\n        else:\n            cur_text += text\n            cur_end = end\n    else:\n        cur_text, cur_start, cur_end = text, start, end\nif cur_text:\n    paragraphs.append({\"start\": cur_start, \"end\": cur_end, \"text\": cur_text.strip()})","type":"text"}]},{"type":"paragraph","content":[{"text":"章节标题插入策略(🚨 
关键,避免重复)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"从播客简介/shownotes 中提取章节时间线","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"将章节时间点转换为秒数,建立映射","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"每个标题只插入一次","type":"text","marks":[{"type":"strong"}]},{"text":":用 ","type":"text"},{"text":"inserted_headers = set()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 跟踪已插入的标题","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"在段落合并","type":"text"},{"text":"完成后","type":"text","marks":[{"type":"strong"}]},{"text":",根据段落起始时间匹配最近的章节标题","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"匹配条件:","type":"text"},{"text":"段落起始时间 >= 章节时间点","type":"text","marks":[{"type":"code_inline"}]},{"text":" 且 ","type":"text"},{"text":"该标题尚未插入","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"paragraph","content":[{"text":"常见错误(必须避免)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"❌ 在每个 Whisper segment 级别插入章节标题 → 同一标题重复几十次","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"❌ 用宽松时间容差匹配(如 ","type":"text"},{"text":"abs(start - ts) \u003c 5","type":"text","marks":[{"type":"code_inline"}]},{"text":")→ 多个 segment 命中同一标题","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"❌ 不跟踪已插入状态 → 标题被重复插入","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"✅ 先合并 segments 为段落,再在段落级别插入标题,每个标题只插入一次","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Step 
4:上传到乐享知识库","type":"text","marks":[{"type":"strong"}]}]},{"type":"paragraph","content":[{"text":"与 YouTube 视频处理相同的流程(目录和文字稿通过 lexiang MCP 工具完成,音频文件走 OpenAPI 上传脚本;MCP 部分的","type":"text"},{"text":"前提是 MCP 已连接","type":"text","marks":[{"type":"strong"}]},{"text":"):","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"获取知识库根节点 → 检查/创建日期目录","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"文字稿使用 ","type":"text"},{"text":"entry_import_content_to_entry","type":"text","marks":[{"type":"code_inline"}]},{"text":" 创建为","type":"text"},{"text":"在线文档(page 类型)","type":"text","marks":[{"type":"strong"}]},{"text":",","type":"text"},{"text":"不要","type":"text","marks":[{"type":"strong"}]},{"text":"直接上传 .md 文件(排版会乱,用户无法正常阅读)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"音频文件","type":"text"},{"text":"必须","type":"text","marks":[{"type":"strong"}]},{"text":"使用 ","type":"text"},{"text":"upload_video_via_openapi.py --media-type audio","type":"text","marks":[{"type":"code_inline"}]},{"text":"(走 OpenAPI VOD 路径),","type":"text"},{"text":"不要","type":"text","marks":[{"type":"strong"}]},{"text":"用 MCP 的 ","type":"text"},{"text":"file_apply_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":"(产生 entry_type=file,无法在线播放)","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"文字稿在线文档导入方法(🚨 分块导入,避免内容丢失)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"核心问题","type":"text","marks":[{"type":"strong"}]},{"text":":播客文字稿通常 15-25K chars,无法在单次 MCP 
工具调用中传入全部内容。","type":"text"},{"text":"必须分块导入","type":"text","marks":[{"type":"strong"}]},{"text":"。","type":"text"}]}]},{"type":"paragraph","content":[{"text":"分块导入流程","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"先用 ","type":"text"},{"text":"entry_create_entry","type":"text","marks":[{"type":"code_inline"}]},{"text":"(","type":"text"},{"text":"entry_type=\"page\"","type":"text","marks":[{"type":"code_inline"}]},{"text":")创建空白 page,获取 ","type":"text"},{"text":"entry_id","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"将 markdown 内容按行分块,每块 ≤ 4000 chars(确保没有超长单行)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"第一块","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"entry_import_content_to_entry","type":"text","marks":[{"type":"code_inline"}]},{"text":"(","type":"text"},{"text":"entry_id=\u003cpage_id>","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"force_write=true","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"content=\u003c第一块>","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"后续块","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"entry_import_content_to_entry","type":"text","marks":[{"type":"code_inline"}]},{"text":"(","type":"text"},{"text":"entry_id=\u003cpage_id>","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"force_write=false","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"content=\u003c后续块>","type":"text","marks":[{"type":"code_inline"}]},{"text":")— 
追加到末尾","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"验证:调用 ","type":"text"},{"text":"entry_describe_ai_parse_content","type":"text","marks":[{"type":"code_inline"}]},{"text":" 确认内容完整(检查最后一个时间戳是否接近播客总时长)","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"⚠️ 关键注意事项","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"每块内容必须是完整的 markdown 结构(不要在标题或段落中间切断)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果文字稿中有单行超过 4000 chars 的情况(说明合并策略有 bug),需要回到 Step 3 修复合并逻辑","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"48 分钟播客(~200 段 × ~100 字/段 = ~20K chars)通常需要分 5-6 块导入","type":"text"}]}]}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# ✅ 正确:通过 OpenAPI 上传音频(产生 entry_type=audio,触发 VOD 转码可播放)\npython3 scripts/upload_video_via_openapi.py \"\u003c音频文件>.m4a\" \\\n --space-id \u003cspace_id> \\\n --parent-entry-id \u003c日期目录 entry_id> \\\n --media-type audio","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"🚨 关键踩坑(2026-05-08 实战验证)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"❌ ","type":"text"},{"text":"file_apply_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":" + curl PUT + ","type":"text"},{"text":"file_commit_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":" → 产出 ","type":"text"},{"text":"entry_type=file","type":"text","marks":[{"type":"code_inline"}]},{"text":",音频","type":"text"},{"text":"无法播放","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"✅ ","type":"text"},{"text":"upload_video_via_openapi.py --media-type 
audio","type":"text","marks":[{"type":"code_inline"}]},{"text":" → 产出 ","type":"text"},{"text":"entry_type=audio","type":"text","marks":[{"type":"code_inline"}]},{"text":",乐享自动 VOD 转码,","type":"text"},{"text":"可在线播放","type":"text","marks":[{"type":"strong"}]}]}]}]}]},{"type":"paragraph","content":[{"text":"播客 vs YouTube 的关键区别","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"维度","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"YouTube 视频","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"播客音频","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"文件格式","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":".mp4","type":"text","marks":[{"type":"code_inline"}]},{"text":"(视频)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":".m4a","type":"text","marks":[{"type":"code_inline"}]},{"text":"/","type":"text"},{"text":".mp3","type":"text","marks":[{"type":"code_inline"}]},{"text":"(纯音频)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"文件大小","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"较大(HLS 720p 
~500MB)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"较小(~60MB/小时)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"下载方式","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"需要 HLS 格式避免 403","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"直接下载,无反爬","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"cookies","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"通常需要","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"不需要","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Whisper 语言","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"通常是英文(需翻译)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"通常是中文(需繁简转换)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"上传 
MIME","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"video/mp4","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"audio/mp4","type":"text","marks":[{"type":"code_inline"}]},{"text":" 或 ","type":"text"},{"text":"audio/mpeg","type":"text","marks":[{"type":"code_inline"}]}]}]}]}]},{"type":"paragraph","content":[{"text":"依赖","type":"text","marks":[{"type":"strong"}]},{"text":"(额外):","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"opencc-python-reimplemented","type":"text","marks":[{"type":"code_inline"}]},{"text":"(","type":"text"},{"text":"pip3 install opencc-python-reimplemented","type":"text","marks":[{"type":"code_inline"}]},{"text":")— 繁体转简体(Whisper base 模型中文输出为繁体时需要)","type":"text"}]}]}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"结构化分析","type":"text"}]},{"type":"paragraph","content":[{"text":"输出结构化分析:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"【文章主题】一句话概括\n【核心论点】3-5 个关键观点\n【关键数据】文章中的重要数据/图表\n【利益相关】作者/机构的立场与潜在倾向(如有)\n【原文出处】完整标题 + URL","type":"text"}]},{"type":"paragraph","content":[{"text":"规划图表:第 1 张为总览图,第 2-N 张各聚焦 1 个核心论点。向用户确认图表数量和主题划分。","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Step 2:原文保存到乐享知识库","type":"text"}]},{"type":"paragraph","content":[{"text":"在进入信息图生成流程之前,先将原文完整保存到乐享知识库","type":"text","marks":[{"type":"strong"}]},{"text":",确保素材归档和可追溯。","type":"text"}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"配置文件与初始化","type":"text"}]},{"type":"paragraph","content":[{"text":"本 skill 的目标知识库等信息通过配置文件管理,","type":"text"},{"text":"不在 SKILL.md 
中硬编码","type":"text","marks":[{"type":"strong"}]},{"text":"。","type":"text"}]},{"type":"paragraph","content":[{"text":"配置文件路径:","type":"text"},{"text":"config.json","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":"(位于 skill 根目录,即与本 SKILL.md 同级)","type":"text"}]},{"type":"heading","attrs":{"level":5},"content":[{"text":"对话式配置初始化(首次使用时自动触发)","type":"text"}]},{"type":"paragraph","content":[{"text":"当 ","type":"text"},{"text":"config.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" 中 ","type":"text"},{"text":"_initialized","type":"text","marks":[{"type":"code_inline"}]},{"text":" 为 ","type":"text"},{"text":"false","type":"text","marks":[{"type":"code_inline"}]},{"text":" 或 ","type":"text"},{"text":"lexiang.target_space.space_id","type":"text","marks":[{"type":"code_inline"}]},{"text":" 为空时,","type":"text"},{"text":"在执行任何乐享操作前","type":"text","marks":[{"type":"strong"}]},{"text":",必须先通过对话引导用户完成配置。","type":"text"}]},{"type":"paragraph","content":[{"text":"核心设计","type":"text","marks":[{"type":"strong"}]},{"text":":用户只需要粘贴一个乐享知识库链接,Agent 
自动完成所有配置。","type":"text"}]},{"type":"paragraph","content":[{"text":"链接格式","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"https://\u003cdomain>/spaces/\u003cspace_id>?company_from=\u003ccompany_from>","type":"text","marks":[{"type":"code_inline"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"示例:","type":"text"},{"text":"https://lexiangla.com/spaces/b6013f6492894a29abbd89d5f2e636c6?company_from=e6c565d6d16811efac17768586f8a025","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"从链接中可解析出三个关键信息:","type":"text"},{"text":"域名","type":"text","marks":[{"type":"strong"}]},{"text":"(","type":"text"},{"text":"lexiangla.com","type":"text","marks":[{"type":"code_inline"}]},{"text":")、","type":"text"},{"text":"space_id","type":"text","marks":[{"type":"strong"}]},{"text":"、","type":"text"},{"text":"company_from","type":"text","marks":[{"type":"strong"}]}]}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"paragraph","content":[{"text":"流程如下","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"paragraph","content":[{"text":"第一步:检测 MCP 连接","type":"text","marks":[{"type":"strong"}]}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"尝试调用任意一个 lexiang MCP 工具(如 ","type":"text"},{"text":"whoami","type":"text","marks":[{"type":"code_inline"}]},{"text":")检测 MCP 是否已连接","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果调用成功 → MCP 已连接,进入第二步","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果调用失败(MCP 未连接)→ 引导用户完成 MCP 鉴权:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"⚠️ 乐享 MCP 尚未连接。请先完成鉴权配置:\n\n1. 访问 https://lexiangla.com/mcp 登录后获取 COMPANY_FROM 和 LEXIANG_TOKEN\n2. 
按照你使用的 Agent 配置 MCP 连接:\n - CodeBuddy:在 MCP 管理面板中添加 lexiang server\n - OpenClaw:运行 claw install https://github.com/tencent-lexiang/lexiang-mcp-skill\n - 其他 Agent:在 MCP 配置文件中添加 lexiang server\n3. 完成后告诉我,我会继续配置流程。","type":"text"}]},{"type":"paragraph","content":[{"text":"不要继续后续步骤","type":"text","marks":[{"type":"strong"}]},{"text":",等待用户完成 MCP 连接后重试。","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"第二步:请求用户提供知识库链接","type":"text","marks":[{"type":"strong"}]}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"向用户发送引导消息:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"🔧 首次使用,需要配置目标知识库。\n\n请粘贴你想用来归档文章的乐享知识库链接,格式如:\nhttps://lexiangla.com/spaces/xxxxx?company_from=yyyyy\n\n💡 获取方式:在乐享中打开目标知识库,复制浏览器地址栏中的链接即可。","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"等待用户输入","type":"text","marks":[{"type":"strong"}]},{"text":",不要自行猜测或列举知识库","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"第三步:解析链接并验证","type":"text","marks":[{"type":"strong"}]}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"从用户提供的链接中用正则解析出三个字段:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"domain","type":"text","marks":[{"type":"strong"}]},{"text":":链接的域名部分(如 ","type":"text"},{"text":"lexiangla.com","type":"text","marks":[{"type":"code_inline"}]},{"text":"),用于生成后续访问链接","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"space_id","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"/spaces/","type":"text","marks":[{"type":"code_inline"}]},{"text":" 后面的路径段(如 
","type":"text"},{"text":"b6013f6492894a29abbd89d5f2e636c6","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"company_from","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"company_from=","type":"text","marks":[{"type":"code_inline"}]},{"text":" 参数值(如 ","type":"text"},{"text":"e6c565d6d16811efac17768586f8a025","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果链接格式不正确(缺少 ","type":"text"},{"text":"space_id","type":"text","marks":[{"type":"code_inline"}]},{"text":" 或 ","type":"text"},{"text":"company_from","type":"text","marks":[{"type":"code_inline"}]},{"text":")→ 提示用户重新粘贴正确的链接","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"调用 ","type":"text"},{"text":"space_describe_space","type":"text","marks":[{"type":"code_inline"}]},{"text":"(参数:","type":"text"},{"text":"space_id=\u003c解析出的 space_id>","type":"text","marks":[{"type":"code_inline"}]},{"text":")验证知识库是否存在","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果验证失败 → 提示用户检查链接是否正确或是否有该知识库的访问权限","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"第四步:写入配置并确认","type":"text","marks":[{"type":"strong"}]}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"将解析和验证得到的信息写入 ","type":"text"},{"text":"config.json","type":"text","marks":[{"type":"code_inline"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"lexiang.target_space.space_id","type":"text","marks":[{"type":"code_inline"}]},{"text":" = 解析出的 
space_id","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"lexiang.target_space.space_name","type":"text","marks":[{"type":"code_inline"}]},{"text":" = 从 ","type":"text"},{"text":"space_describe_space","type":"text","marks":[{"type":"code_inline"}]},{"text":" 返回值获取的知识库名称","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"lexiang.target_space.company_from","type":"text","marks":[{"type":"code_inline"}]},{"text":" = 解析出的 company_from","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"lexiang.access_domain.domain","type":"text","marks":[{"type":"code_inline"}]},{"text":" = 解析出的域名","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"lexiang.access_domain.page_url_template","type":"text","marks":[{"type":"code_inline"}]},{"text":" = ","type":"text"},{"text":"https://\u003cdomain>/pages/{entry_id}?company_from={company_from}","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"lexiang.access_domain.space_url_template","type":"text","marks":[{"type":"code_inline"}]},{"text":" = ","type":"text"},{"text":"https://\u003cdomain>/spaces/{space_id}?company_from={company_from}","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"_initialized","type":"text","marks":[{"type":"code_inline"}]},{"text":" = ","type":"text"},{"text":"true","type":"text","marks":[{"type":"code_inline"}]}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"向用户确认配置结果:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"✅ 配置完成!\n\n📚 目标知识库:\u003c知识库名称>\n🔗 
访问链接:https://\u003cdomain>/spaces/\u003cspace_id>?company_from=\u003ccompany_from>\n\n后续抓取的文章将自动归档到此知识库。如需更换,告诉我「重新配置知识库」即可。","type":"text"}]}]}]},{"type":"heading","attrs":{"level":5},"content":[{"text":"重新配置","type":"text"}]},{"type":"paragraph","content":[{"text":"当用户说「重新配置知识库」、「切换知识库」、「更换目标知识库」等类似意图时:","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"将 ","type":"text"},{"text":"config.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" 中 ","type":"text"},{"text":"_initialized","type":"text","marks":[{"type":"code_inline"}]},{"text":" 设为 ","type":"text"},{"text":"false","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"重新执行上述对话式初始化流程(从第一步开始)","type":"text"}]}]}]},{"type":"heading","attrs":{"level":5},"content":[{"text":"用户输入容错","type":"text"}]},{"type":"paragraph","content":[{"text":"用户可能不会粘贴完美的链接,需要处理以下情况:","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"用户输入","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"处理方式","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"完整链接 ","type":"text"},{"text":"https://lexiangla.com/spaces/xxx?company_from=yyy","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"直接解析 
✅","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"不带 company_from 的链接 ","type":"text"},{"text":"https://lexiangla.com/spaces/xxx","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"提示:「链接中缺少 company_from 参数。请在乐享中重新复制完整链接(地址栏中通常会包含 ?company_from=xxx),或者访问 https://lexiangla.com/mcp 获取你的 COMPANY_FROM 值告诉我。」","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"纯 space_id ","type":"text"},{"text":"b6013f6492894a29abbd89d5f2e636c6","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"提示:「请提供完整的知识库链接(包含 company_from 参数),我需要从链接中同时获取知识库 ID 和企业标识。」","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"页面链接 ","type":"text"},{"text":"https://lexiangla.com/pages/xxx","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"提示:「这是一个页面链接,请提供知识库链接(格式:https://lexiangla.com/spaces/xxx?company_from=yyy)。你可以在乐享中进入目标知识库首页,复制地址栏链接。」","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"返回的文档链接打不开/无权限","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"链接中缺少 
","type":"text"},{"text":"company_from","type":"text","marks":[{"type":"code_inline"}]},{"text":" 参数。页面链接必须带 ","type":"text"},{"text":"?company_from=xxx","type":"text","marks":[{"type":"code_inline"}]},{"text":",格式:","type":"text"},{"text":"https://lexiangla.com/pages/\u003centry_id>?company_from=\u003ccompany_from>","type":"text","marks":[{"type":"code_inline"}]}]}]}]}]},{"type":"heading","attrs":{"level":5},"content":[{"text":"配置结构","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"json"},"content":[{"text":"{\n \"_initialized\": false,\n \"lexiang\": {\n \"target_space\": {\n \"space_id\": \"\",\n \"space_name\": \"\",\n \"company_from\": \"\"\n },\n \"access_domain\": {\n \"domain\": \"lexiangla.com\",\n \"page_url_template\": \"https://lexiangla.com/pages/{entry_id}?company_from={company_from}\",\n \"space_url_template\": \"https://lexiangla.com/spaces/{space_id}?company_from={company_from}\"\n }\n }\n}","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"access_domain","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" 会从用户粘贴的链接中自动提取域名","type":"text","marks":[{"type":"strong"}]},{"text":",无需手动配置。适配自定义域名的乐享部署。","type":"text"}]}]},{"type":"paragraph","content":[{"text":"后续文档中所有 ","type":"text"},{"text":"\u003cSPACE_ID>","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"\u003cCOMPANY_FROM>","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"\u003cACCESS_DOMAIN>","type":"text","marks":[{"type":"code_inline"}]},{"text":" 等占位符,均指从 ","type":"text"},{"text":"config.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" 中读取的实际值。","type":"text"}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"乐享 MCP 工具的调用方式(重要 — 多 Agent 适配)","type":"text"}]},{"type":"paragraph","content":[{"text":"本 skill 需要服务多个 Agent 产品(OpenClaw、CodeBuddy、Claude Desktop 等)。不同 Agent 连接乐享 MCP 
的方式不同,但","type":"text"},{"text":"暴露的工具名称和参数完全一致","type":"text","marks":[{"type":"strong"}]},{"text":"(都是 lexiang MCP server 提供的标准工具)。","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"核心原则","type":"text","marks":[{"type":"strong"}]},{"text":":本 skill 只描述「调用哪个工具 + 传什么参数」,","type":"text"},{"text":"不规定具体的 MCP 调用语法","type":"text","marks":[{"type":"strong"}]},{"text":"。每个 Agent 按自己的方式调用即可。","type":"text"}]}]},{"type":"paragraph","content":[{"text":"各 Agent 产品的 MCP 连接方式","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Agent 产品","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"lexiang MCP 连接方式","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"工具调用方式","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"CodeBuddy","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"在 ","type":"text"},{"text":"~/.codebuddy/mcp.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" 中配置 lexiang server,通过 IDE 的 MCP 管理面板启用连接","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"直接调用 ","type":"text"},{"text":"space_describe_space","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"file_apply_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":" 等 
lexiang MCP 工具","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"OpenClaw","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"claw install https://github.com/tencent-lexiang/lexiang-mcp-skill","type":"text","marks":[{"type":"code_inline"}]},{"text":",加载 skill 时自动连接 MCP","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"同上,通过 skill 暴露的 MCP 工具调用","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Claude Desktop / 其他 MCP 兼容 Agent","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"在 Agent 的 MCP 配置文件中添加 lexiang server URL","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"同上","type":"text"}]}]}]}]},{"type":"paragraph","content":[{"text":"MCP 连接检测与降级","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"paragraph","content":[{"text":"在执行乐享操作前,","type":"text"},{"text":"必须先检测 lexiang MCP 是否已连接","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"读取 ","type":"text"},{"text":"config.json","type":"text","marks":[{"type":"code_inline"}]},{"text":",检查 ","type":"text"},{"text":"_initialized","type":"text","marks":[{"type":"code_inline"}]},{"text":" 和 
","type":"text"},{"text":"lexiang.target_space.space_id","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果未初始化 → 先触发对话式配置初始化(参见上方「对话式配置初始化」),初始化流程中会自动完成 MCP 连接检测","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果已初始化,尝试调用 ","type":"text"},{"text":"space_describe_space","type":"text","marks":[{"type":"code_inline"}]},{"text":"(参数:","type":"text"},{"text":"space_id=\u003cconfig 中的 space_id>","type":"text","marks":[{"type":"code_inline"}]},{"text":")验证 MCP 连接","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果调用成功 → MCP 已连接,继续后续流程","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果调用失败(MCP 未连接)→ ","type":"text"},{"text":"提示用户检查 MCP 连接","type":"text","marks":[{"type":"strong"}]},{"text":",给出对应 Agent 的操作指引:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"CodeBuddy:「请在 MCP 管理面板中确认 lexiang server 已启用并显示为已连接状态」","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"OpenClaw:「请确认已安装 lexiang skill(","type":"text"},{"text":"claw install https://github.com/tencent-lexiang/lexiang-mcp-skill","type":"text","marks":[{"type":"code_inline"}]},{"text":")」","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"其他 Agent:「请确认 MCP 配置中已添加 lexiang server」","type":"text"}]}]}]}]}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"⚠️ 禁止降级为 curl 调用 REST API","type":"text","marks":[{"type":"strong"}]},{"text":":即使 MCP 未连接,也","type":"text"},{"text":"不要","type":"text","marks":[{"type":"strong"}]},{"text":"自行编写 curl 调用乐享 REST API,因为:(1) 认证信息硬编码在 curl 中不安全;(2) 不同 Agent 的执行环境差异大,curl 方式不通用;(3) REST API 的 URL 格式和鉴权方式可能变化。应该引导用户修复 MCP 
连接。","type":"text"}]}]},{"type":"paragraph","content":[{"text":"认证配置","type":"text","marks":[{"type":"strong"}]},{"text":"(首次使用时需要):","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"访问 ","type":"text"},{"text":"https://lexiangla.com/mcp","type":"text","marks":[{"type":"link","attrs":{"href":"https://lexiangla.com/mcp","title":null}}]},{"text":" 登录后获取 ","type":"text"},{"text":"LEXIANG_TOKEN","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":"(访问令牌,格式:","type":"text"},{"text":"lxmcp_xxx","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"COMPANY_FROM","type":"text","marks":[{"type":"code_inline"}]},{"text":" 无需手动获取 — 会从用户粘贴的知识库链接中自动解析","type":"text"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"配置方式(二选一):","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"环境变量","type":"text","marks":[{"type":"strong"}]},{"text":"(推荐):","type":"text"},{"text":"export LEXIANG_TOKEN=\"lxmcp_xxx\"","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"直接修改 MCP 配置","type":"text","marks":[{"type":"strong"}]},{"text":":将 MCP server URL 中的 ","type":"text"},{"text":"${LEXIANG_TOKEN}","type":"text","marks":[{"type":"code_inline"}]},{"text":" 占位符替换为实际值","type":"text"}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"详细配置步骤参见:","type":"text"},{"text":"lexiang-mcp-skill setup.md","type":"text","marks":[{"type":"link","attrs":{"href":"https://github.com/tencent-lexiang/lexiang-mcp-skill/blob/main/setup.md","title":null}}]}]}]}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"目标知识库","type":"text"}]},{"type":"paragraph","content":[{"text":"从 
","type":"text"},{"text":"config.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的 ","type":"text"},{"text":"lexiang.target_space","type":"text","marks":[{"type":"code_inline"}]},{"text":" 中读取:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"知识库名称","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"config.lexiang.target_space.space_name","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"知识库访问链接","type":"text","marks":[{"type":"strong"}]},{"text":":按 ","type":"text"},{"text":"config.lexiang.access_domain.space_url_template","type":"text","marks":[{"type":"code_inline"}]},{"text":" 格式拼接","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Space ID","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"config.lexiang.target_space.space_id","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"⚠️ 访问链接域名","type":"text","marks":[{"type":"strong"}]},{"text":":用户可访问的乐享前端域名从 ","type":"text"},{"text":"config.lexiang.access_domain.domain","type":"text","marks":[{"type":"code_inline"}]},{"text":" 读取(默认为 ","type":"text"},{"text":"lexiangla.com","type":"text","marks":[{"type":"code_inline"}]},{"text":"),","type":"text"},{"text":"不是","type":"text","marks":[{"type":"strong"}]},{"text":" ","type":"text"},{"text":"mcp.lexiang-app.com","type":"text","marks":[{"type":"code_inline"}]},{"text":"(后者是 MCP API 服务端域名,浏览器无法直接访问)。所有展示给用户的链接必须按 ","type":"text"},{"text":"config.lexiang.access_domain.page_url_template","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
格式生成。","type":"text"}]}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"目录组织方式","type":"text"}]},{"type":"paragraph","content":[{"text":"按","type":"text"},{"text":"天维度","type":"text","marks":[{"type":"strong"}]},{"text":"组织目录:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"知识库根目录/\n 2026-02-25/\n 文章标题A (图文文章,在线文档 page 类型,图片内嵌)\n 文章标题B (纯文本文章,在线文档 page 类型)\n 2026-02-26/\n 文章标题C (在线文档 page 类型)","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"⚠️ 默认格式","type":"text","marks":[{"type":"strong"}]},{"text":":所有文章(无论是否含图片)","type":"text"},{"text":"统一使用在线文档(page)格式上传","type":"text","marks":[{"type":"strong"}]},{"text":"。在线文档支持在乐享中直接编辑、划词评论、全文检索,体验远优于 PDF。PDF 仅作为降级方案(","type":"text"},{"text":"md_to_page.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 失败时)或用户明确要求时使用。","type":"text"}]}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"操作流程","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"⚠️ 严格按步骤顺序执行,不得跳步!","type":"text","marks":[{"type":"strong"}]},{"text":" 必须完成步骤 0→1→2→3→3.5→3.8→4 的完整流程(步骤 3.7 为可选,仅在用户提供评价时执行)。尤其是","type":"text"},{"text":"步骤 2(创建日期目录)不可跳过","type":"text","marks":[{"type":"strong"}]},{"text":"——文档必须上传到当天日期命名的文件夹中,而不是直接上传到知识库根目录。如果跳过步骤 2 直接用 ","type":"text"},{"text":"root_entry_id","type":"text","marks":[{"type":"code_inline"}]},{"text":" 作为上传目标,文档将错误地出现在根目录下。","type":"text"}]}]},{"type":"paragraph","content":[{"text":"通过 lexiang MCP 工具,按以下步骤完成转存:","type":"text"}]},{"type":"paragraph","content":[{"text":"步骤 0:读取配置(含初始化检测)","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"读取 skill 目录下的 ","type":"text"},{"text":"config.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" 文件","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"检查 
","type":"text"},{"text":"_initialized","type":"text","marks":[{"type":"code_inline"}]},{"text":" 是否为 ","type":"text"},{"text":"true","type":"text","marks":[{"type":"code_inline"}]},{"text":" 且 ","type":"text"},{"text":"lexiang.target_space.space_id","type":"text","marks":[{"type":"code_inline"}]},{"text":" 非空","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果","type":"text"},{"text":"未初始化","type":"text","marks":[{"type":"strong"}]},{"text":"(","type":"text"},{"text":"_initialized","type":"text","marks":[{"type":"code_inline"}]},{"text":" 为 ","type":"text"},{"text":"false","type":"text","marks":[{"type":"code_inline"}]},{"text":" 或 ","type":"text"},{"text":"space_id","type":"text","marks":[{"type":"code_inline"}]},{"text":" 为空)→ ","type":"text"},{"text":"触发对话式配置初始化流程","type":"text","marks":[{"type":"strong"}]},{"text":"(参见上方「对话式配置初始化」),完成后再继续","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"提取 ","type":"text"},{"text":"lexiang.target_space.space_id","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"lexiang.access_domain.page_url_template","type":"text","marks":[{"type":"code_inline"}]},{"text":" 等配置项","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"步骤 1:获取知识库根节点","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"调用 ","type":"text"},{"text":"space_describe_space","type":"text","marks":[{"type":"code_inline"}]},{"text":"(参数:","type":"text"},{"text":"space_id=\u003cconfig 中的 SPACE_ID>","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"从返回结果中提取 ","type":"text"},{"text":"root_entry_id","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"paragraph","content":[{"text":"步骤 2:检查/创建当天日期目录(🚨 
必须先查再建,禁止直接创建)","type":"text","marks":[{"type":"strong"}]}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"🚨 这是本 skill 最常见的错误!","type":"text","marks":[{"type":"strong"}]},{"text":" 2026-05-11 实战中,Agent 未查询直接创建了同名目录。每次执行到此步骤,","type":"text"},{"text":"必须","type":"text","marks":[{"type":"strong"}]},{"text":"严格按照下方决策树执行,绝对禁止跳过查询直接调用创建工具。","type":"text"}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"paragraph","content":[{"text":"🚨 执行前必读:两种常见错误","type":"text","marks":[{"type":"strong"}]}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"#","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"错误做法 ❌","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"正确做法 ✅","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"1","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"直接调用 ","type":"text"},{"text":"mcp__lexiang__entry_create_entry","type":"text","marks":[{"type":"code_inline"}]},{"text":" 创建文件夹","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"先调用 ","type":"text"},{"text":"mcp__lexiang__entry_list_children","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
查询根目录","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"2","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"只匹配 ","type":"text"},{"text":"name==\"2026-05-11\"","type":"text","marks":[{"type":"code_inline"}]},{"text":",不检查 ","type":"text"},{"text":"entry_type","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"必须同时匹配 ","type":"text"},{"text":"name==\"2026-05-11\"","type":"text","marks":[{"type":"code_inline"}]},{"text":" ","type":"text"},{"text":"且","type":"text","marks":[{"type":"strong"}]},{"text":" ","type":"text"},{"text":"entry_type==\"folder\"","type":"text","marks":[{"type":"code_inline"}]}]}]}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"paragraph","content":[{"text":"决策树(必须逐条执行,不可跳步):","type":"text","marks":[{"type":"strong"}]}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"步骤 2a:查询根目录\n 工具:mcp__lexiang__entry_list_children\n 参数:{\"parent_id\": \"\u003croot_entry_id>\"}\n\n 遍历返回的 entries[] 数组:\n 查找是否有 entry_type==\"folder\" 且 name==\"当天日期\" 的条目\n 例如今天 2026-05-11 → 查找 name==\"2026-05-11\" 且 entry_type==\"folder\"\n\n 如果找到 → 记录其 id → 【跳到步骤 3,不创建】\n 如果没找到 → 【继续到步骤 2b】\n\n步骤 2b:确认不存在后,才能创建\n 调用:mcp__lexiang__entry_create_entry\n 参数:{\"entry_type\": \"folder\", \"parent_entry_id\": \"\u003croot_entry_id>\", \"name\": \"当天日期\"}\n\n 🚨 创建后必须置顶(否则新目录会出现在末尾):\n 1. 先调用 entry_list_children 获取父目录当前第一个条目的 entry_id\n 2. 
调用 entry_move_entry,使用 **before** 参数传入第一个条目的 entry_id,将新目录移到它之前(即置顶)\n 参数:{\"entry_id\": \"\u003c新建的entry_id>\", \"parent_id\": \"\u003croot_entry_id>\", \"before\": \"\u003c当前第一个条目的entry_id>\"}\n \n ⚠️ 注意事项:\n - after=\"\" 实测是移到末尾(非置顶),API 文档描述有误,禁止使用\n - 必须用 before=\u003c第一个条目ID> 才能真正置顶\n - 这一步不可省略!创建目录后如果不执行 move,新目录会默认出现在最底部","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"📌 关于分页的说明","type":"text","marks":[{"type":"strong"}]},{"text":":日期目录按创建时间倒序排列,当天的目录如果存在一定在第一页,无需处理分页。但如果你在处理非日期目录的场景(如查找某个不确定的条目),应注意 ","type":"text"},{"text":"next_page_token","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的存在。","type":"text"}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"paragraph","content":[{"text":"❌ 错误示例(禁止这样做):","type":"text","marks":[{"type":"strong"}]}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"# 错误:直接创建,不查询\n→ 调用 mcp__lexiang__entry_create_entry,参数 name=\"2026-05-11\"\n→ 结果:知识库中出现多个同名 \"2026-05-11\" 文件夹","type":"text"}]},{"type":"paragraph","content":[{"text":"✅ 正确示例(必须这样做):","type":"text","marks":[{"type":"strong"}]}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"# 正确:先查询第一页,找到就用,找不到才创建\n→ 调用 mcp__lexiang__entry_list_children,参数 {\"parent_id\": \"\u003croot_entry_id>\"}\n→ 遍历 entries,检查是否有 name==\"2026-05-11\" 且 entry_type==\"folder\"\n→ 找到 → 记录 id,跳过创建,直接进入步骤 3\n→ 没找到 → 调用 mcp__lexiang__entry_create_entry 创建","type":"text"}]},{"type":"paragraph","content":[{"text":"步骤 3:去重检查","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"调用 
","type":"text"},{"text":"entry_list_children","type":"text","marks":[{"type":"code_inline"}]},{"text":"(参数:","type":"text"},{"text":"parent_id=\u003c日期目录ID>","type":"text","marks":[{"type":"code_inline"}]},{"text":")查询该日期目录下已有的条目","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"按「名称 + 类型」检查是否已存在同名文档,如果已存在则跳过上传并告知用户","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"步骤 3.5:非中文文章翻译(🚨 强制检查,不可跳过)","type":"text","marks":[{"type":"strong"}]}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"⚠️ 重要","type":"text","marks":[{"type":"strong"}]},{"text":":无论文章是通过 ","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":"、PDF 下载还是其他方式获取,在上传到乐享之前","type":"text"},{"text":"都必须经过语言检测和翻译步骤","type":"text","marks":[{"type":"strong"}]},{"text":"。这是一个","type":"text"},{"text":"强制检查点","type":"text","marks":[{"type":"strong"}]},{"text":",不存在任何可以跳过的\"简化路径\"。","type":"text"}]},{"type":"paragraph","content":[{"text":"PDF 特别说明","type":"text","marks":[{"type":"strong"}]},{"text":":PDF 文件(包括英文学术论文)同样适用此规则。非中文 PDF 必须先翻译为中英对照格式后再转存乐享,详见上方「PDF 文件/链接处理」章节的 Step D。","type":"text"}]},{"type":"paragraph","content":[{"text":"常见遗漏场景","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"❌ 用 ","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":" 抓取后直接转 PDF 上传 → 英文原文未翻译","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"❌ 觉得文章\"看起来不长\"就跳过翻译 → 知识库中留下纯英文文档","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"❌ 翻译脚本不可用就放弃翻译 → 应该由 Agent 
直接在对话中翻译","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"❌ PDF 是论文就不翻译 → 英文论文同样必须翻译","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"正确做法","type":"text","marks":[{"type":"strong"}]},{"text":":每篇文章上传前,","type":"text"},{"text":"必须先执行语言检测","type":"text","marks":[{"type":"strong"}]},{"text":",非中文则翻译后再上传。","type":"text"}]}]},{"type":"paragraph","content":[{"text":"在上传到乐享之前,","type":"text"},{"text":"必须检测原文语言","type":"text","marks":[{"type":"strong"}]},{"text":"。如果原文不是中文,则需要先翻译为","type":"text"},{"text":"中英对照格式","type":"text","marks":[{"type":"strong"}]},{"text":"后再归档。","type":"text"}]},{"type":"paragraph","content":[{"text":"语言检测规则","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"读取 ","type":"text"},{"text":"\u003c原文标题>.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的前 500 个字符,统计中文字符(Unicode 范围 ","type":"text"},{"text":"\\u4e00-\\u9fff","type":"text","marks":[{"type":"code_inline"}]},{"text":")占比","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"中文字符占比 ","type":"text"},{"text":"≥ 30%","type":"text","marks":[{"type":"strong"}]},{"text":" → 判定为中文文章,","type":"text"},{"text":"跳过翻译","type":"text","marks":[{"type":"strong"}]},{"text":",直接进入步骤 4","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"中文字符占比 ","type":"text"},{"text":"\u003c 30%","type":"text","marks":[{"type":"strong"}]},{"text":" → 
判定为非中文文章,","type":"text"},{"text":"执行翻译","type":"text","marks":[{"type":"strong"}]}]}]}]},{"type":"paragraph","content":[{"text":"翻译排版格式(中英对照)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"按段落逐段翻译,每段原文紧跟对应中文翻译","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"段落之间不加分隔线 ","type":"text","marks":[{"type":"strong"}]},{"text":"---","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":",仅通过空行分隔","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"中文翻译段落开头不加国旗 emoji(🇨🇳)","type":"text","marks":[{"type":"strong"}]},{"text":",直接以中文开始","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"标题也需要翻译,保留原文标题 + 中文翻译标题","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"列表项、引用块等结构元素同样逐条翻译","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"保留原文中的图片引用","type":"text","marks":[{"type":"strong"}]},{"text":"(","type":"text"},{"text":"![](images/xxx.png)","type":"text","marks":[{"type":"code_inline"}]},{"text":"),图片引用放在对应段落的上方或下方,确保图文对应关系不丢失","type":"text"}]}]}]},{"type":"code_block","attrs":{"wrap":false,"language":"markdown"},"content":[{"text":"# Original English Title\n# 中文翻译标题\n\nOriginal first paragraph text...\n\n第一段的中文翻译...\n\n![](images/img_01_xxx.png)\n\nOriginal second paragraph text...\n\n第二段的中文翻译...","type":"text"}]},{"type":"paragraph","content":[{"text":"翻译方式(按优先级)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"translate_article.py 脚本","type":"text","marks":[{"type":"strong"}]},{"text":"(如果 
","type":"text"},{"text":"OPENAI_API_KEY","type":"text","marks":[{"type":"code_inline"}]},{"text":" 可用):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python3 scripts/translate_article.py \"\u003c原文标题>.md\" \"\u003c原文标题>_translated.md\" --model gpt-4o-mini","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"AI 助手直接翻译","type":"text","marks":[{"type":"strong"}]},{"text":"(如果无 API Key):由 Agent 在对话中逐段翻译全文,生成 ","type":"text"},{"text":"\u003c原文标题>_translated.md","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"paragraph","content":[{"text":"翻译完成后","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"本地保存两个文件:","type":"text"},{"text":"\u003c原文标题>.md","type":"text","marks":[{"type":"code_inline"}]},{"text":"(原文)和 ","type":"text"},{"text":"\u003c原文标题>_translated.md","type":"text","marks":[{"type":"code_inline"}]},{"text":"(中英对照版)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"归档到乐享知识库的必须是翻译后的中英对照版本","type":"text","marks":[{"type":"strong"}]},{"text":"(","type":"text"},{"text":"_translated.md","type":"text","marks":[{"type":"code_inline"}]},{"text":"),确保知识库中的内容对中文读者友好","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"乐享文档标题使用:","type":"text"},{"text":"\u003c原文标题中文翻译>(\u003c原文标题>)","type":"text","marks":[{"type":"code_inline"}]},{"text":",如:","type":"text"},{"text":"AI 原型精通阶梯(The AI Prototyping Mastery Ladder)","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"paragraph","content":[{"text":"步骤 3.7:评价信息处理(可选)","type":"text","marks":[{"type":"strong"}]}]},{"type":"paragraph","content":[{"text":"如果在转存前用户提供了对文章的评价(例如:\"这篇文章好在:1)... 
2)...\"),需要在上传时自动添加评价信息:","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"检测评价信息","type":"text","marks":[{"type":"strong"}]},{"text":":在对话中识别用户是否提供了评价内容(关键词:好在、评价、优点、建议等)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"保存评价内容","type":"text","marks":[{"type":"strong"}]},{"text":":将评价信息保存到临时文件(如 ","type":"text"},{"text":"/tmp/evaluation.txt","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"传入脚本参数","type":"text","marks":[{"type":"strong"}]},{"text":":调用 ","type":"text"},{"text":"md_to_page.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 时,添加 ","type":"text"},{"text":"--evaluation-file /tmp/evaluation.txt","type":"text","marks":[{"type":"code_inline"}]},{"text":" 参数","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"脚本自动处理","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"md_to_page.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 会自动在文档顶部插入评价信息(格式为 blockquote,乐享可能自动转换为 callout 组件)","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"对于非在线文档格式(如视频)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"由于 lexiang MCP 工具中","type":"text"},{"text":"没有创建评论的 API","type":"text","marks":[{"type":"strong"}]},{"text":"(只有查询评论的 ","type":"text"},{"text":"comment_list_comments","type":"text","marks":[{"type":"code_inline"}]},{"text":" 和 
","type":"text"},{"text":"comment_describe_comment","type":"text","marks":[{"type":"code_inline"}]},{"text":"),暂时无法自动添加评论到视频文件","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"建议","type":"text","marks":[{"type":"strong"}]},{"text":":转存完成后,手动在乐享中添加评论","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"示例:用户提供评价后的处理流程","type":"text","marks":[{"type":"strong"}]}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# 1. 将用户评价保存到临时文件\ncat > /tmp/evaluation.txt \u003c\u003c 'EOF'\n这篇文章好在:\n1)把智能体Agent做了分类,每个分类定义了对应的适用场景;\n2)列举了详实的案例说明;\n3)通过构建的复杂度、技术架构、实现时长、运行成本、衡量成功等几个维度来系统化地综合判断Agent落地的优先级\n4)未来关于Agent选型上,能够提供系统性的参考建议\nEOF\n\n# 2. 调用 md_to_page.py,传入评价文件\npython3 scripts/md_to_page.py \"\u003c原文标题>_translated.md\" \\\n --parent-id \u003c日期目录ID> --name \"\u003c文档标题>\" \\\n --evaluation-file /tmp/evaluation.txt \\\n --token \"$LEXIANG_TOKEN\" --company-from \"$COMPANY_FROM\"","type":"text"}]},{"type":"paragraph","content":[{"text":"评价信息格式说明","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"脚本会将评价信息格式化为 blockquote(以 ","type":"text"},{"text":">","type":"text","marks":[{"type":"code_inline"}]},{"text":" 开头的 Markdown 格式)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"在乐享在线文档中,blockquote 可能被自动渲染为 callout 组件(带有左侧竖线或背景色)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果需要真正的 callout 组件(特殊 block 类型),需要通过 ","type":"text"},{"text":"block_create_block_descendant","type":"text","marks":[{"type":"code_inline"}]},{"text":" API 创建,但需要先了解 callout 的 block 结构","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"步骤 3.8:页面内嵌视频检测与链接附加(⚠️ 不可跳过)","type":"text","marks":[{"type":"strong"}]}]},{"type":"paragraph","content":[{"text":"在生成 PDF 
或上传之前,","type":"text"},{"text":"必须检测页面中是否包含嵌入视频","type":"text","marks":[{"type":"strong"}]},{"text":"。嵌入视频(如 Wistia、YouTube、Vimeo、Loom 等 iframe 嵌入)在转为 PDF 时会完全丢失,因此需要将视频链接以文本形式附加到文档末尾,确保知识库读者能找到并观看原始视频。","type":"text"}]},{"type":"paragraph","content":[{"text":"检测范围","type":"text","marks":[{"type":"strong"}]},{"text":"(按优先级扫描):","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003ciframe>","type":"text","marks":[{"type":"code_inline"}]},{"text":" 嵌入 — 匹配 ","type":"text"},{"text":"src","type":"text","marks":[{"type":"code_inline"}]},{"text":" 中包含 ","type":"text"},{"text":"youtube","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"youtu.be","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"vimeo","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"loom","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"wistia","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"vidyard","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"player","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的 iframe","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003cvideo>","type":"text","marks":[{"type":"code_inline"}]},{"text":" 标签 — 提取 ","type":"text"},{"text":"src","type":"text","marks":[{"type":"code_inline"}]},{"text":" 或内部 ","type":"text"},{"text":"\u003csource>","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的 ","type":"text"},{"text":"src","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"\u003ca>","type":"text","marks":[{"type":"code_inline"}]},{"text":" 链接 — 匹配 
","type":"text"},{"text":"href","type":"text","marks":[{"type":"code_inline"}]},{"text":" 指向 ","type":"text"},{"text":"youtube.com/watch","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"youtu.be/","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"vimeo.com/","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"loom.com/share","type":"text","marks":[{"type":"code_inline"}]},{"text":" 等视频平台","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"平台特定容器 — 如 readme.io 的 ","type":"text"},{"text":"rdmd-embed","type":"text","marks":[{"type":"code_inline"}]},{"text":" 组件、","type":"text"},{"text":"[class*=\"video\"]","type":"text","marks":[{"type":"code_inline"}]},{"text":" 容器等","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"视频链接还原规则","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Wistia embed(","type":"text"},{"text":"fast.wistia.net/embed/iframe/\u003cid>","type":"text","marks":[{"type":"code_inline"}]},{"text":")→ 附加可观看链接 ","type":"text"},{"text":"https://fast.wistia.net/embed/iframe/\u003cid>","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"YouTube embed(","type":"text"},{"text":"youtube.com/embed/\u003cid>","type":"text","marks":[{"type":"code_inline"}]},{"text":")→ 还原为 ","type":"text"},{"text":"https://www.youtube.com/watch?v=\u003cid>","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Vimeo embed(","type":"text"},{"text":"player.vimeo.com/video/\u003cid>","type":"text","marks":[{"type":"code_inline"}]},{"text":")→ 还原为 
","type":"text"},{"text":"https://vimeo.com/\u003cid>","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Loom embed(","type":"text"},{"text":"loom.com/embed/\u003cid>","type":"text","marks":[{"type":"code_inline"}]},{"text":")→ 还原为 ","type":"text"},{"text":"https://www.loom.com/share/\u003cid>","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"其他视频 URL → 原样保留","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"附加格式","type":"text","marks":[{"type":"strong"}]},{"text":":在 Markdown 文档末尾(PDF 生成前)追加一个独立章节:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"markdown"},"content":[{"text":"\n---\n\n## 📹 页面内嵌视频\n\n本页面包含以下嵌入视频,PDF 中无法播放,请通过链接观看:\n\n1. [视频] https://fast.wistia.net/embed/iframe/xxxxx\n2. [视频] https://www.youtube.com/watch?v=yyyyy","type":"text"}]},{"type":"paragraph","content":[{"text":"如果使用 Playwright 直接生成 PDF(非 ","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 抓取),应在 ","type":"text"},{"text":"page.pdf()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 之前通过 ","type":"text"},{"text":"page.evaluate()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 在页面底部注入视频链接信息块。","type":"text"}]},{"type":"paragraph","content":[{"text":"步骤 4:上传到乐享(统一使用在线文档格式)","type":"text","marks":[{"type":"strong"}]}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"🚨 核心原则:所有文章默认使用在线文档(page)格式上传,不再默认转 PDF。","type":"text","marks":[{"type":"strong"}]},{"text":" 在线文档的优势:支持编辑、划词评论、全文检索、移动端阅读体验好。 PDF 仅在以下情况使用:(1) ","type":"text"},{"text":"md_to_page.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 和 ","type":"text"},{"text":"entry_import_content","type":"text","marks":[{"type":"code_inline"}]},{"text":" 都失败时的最终降级;(2) 用户明确要求 PDF 格式。","type":"text"}]}]},{"type":"paragraph","content":[{"text":"检查 
","type":"text"},{"text":"\u003c原文标题>.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" 文件同目录下是否存在 ","type":"text"},{"text":"images/","type":"text","marks":[{"type":"code_inline"}]},{"text":" 目录且包含图片文件:","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"🚨 图片判断 Checklist(必须逐条检查)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"images/","type":"text","marks":[{"type":"code_inline"}]},{"text":" 目录是否存在且有图片?","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Markdown 内容中是否有 ","type":"text"},{"text":"![](images/xxx)","type":"text","marks":[{"type":"code_inline"}]},{"text":" 本地引用?(仅检查目录不够!如果 images/ 有图片但 Markdown 中无引用,说明抓取阶段出了问题——正文提取失败导致图片引用丢失,需要重新抓取)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果 Markdown 中只有外链图片(","type":"text"},{"text":"![](https://...)","type":"text","marks":[{"type":"code_inline"}]},{"text":")而无本地引用,说明图片没有被下载到本地。","type":"text"},{"text":"entry_import_content","type":"text","marks":[{"type":"code_inline"}]},{"text":" 导入外链图片后,乐享中如果外链 CDN 有防盗链/过期,图片将不可见。此时应先下载图片到本地,再用 ","type":"text"},{"text":"md_to_page.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 导入。","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"判断结论","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"✅ images/ 有图片 + Markdown 有 ","type":"text"},{"text":"![](images/xxx)","type":"text","marks":[{"type":"code_inline"}]},{"text":" → ","type":"text"},{"text":"图文文章","type":"text","marks":[{"type":"strong"}]},{"text":",走图文路径","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"⚠️ images/ 有图片 + Markdown 无本地引用 → 
","type":"text"},{"text":"抓取异常","type":"text","marks":[{"type":"strong"}]},{"text":",需重新抓取或手动修复 Markdown","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"⚠️ images/ 无图片 + Markdown 有外链图片 → ","type":"text"},{"text":"外链图片","type":"text","marks":[{"type":"strong"}]},{"text":",建议下载后转本地引用","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"✅ images/ 无图片 + Markdown 无图片引用 → ","type":"text"},{"text":"纯文本文章","type":"text","marks":[{"type":"strong"}]},{"text":",走纯文本路径","type":"text"}]}]}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"有图片(图文文章)","type":"text","marks":[{"type":"strong"}]},{"text":" → 使用 ","type":"text"},{"text":"scripts/md_to_page.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 将 Markdown 图文导入为在线文档(图片内嵌到正文对应位置):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"python3 scripts/md_to_page.py \"\u003c原文标题>.md\" \\\n --parent-id \u003c日期目录ID> --name \"\u003c原文标题>\" \\\n --token \"$LEXIANG_TOKEN\" --company-from \"$COMPANY_FROM\"","type":"text"}]},{"type":"paragraph","content":[{"text":"脚本会自动:按图片位置拆分 markdown → 分段导入文字(直传原始 markdown,不做 base64 编码)→ 逐张上传图片到 COS → 在正确位置插入 image block。","type":"text"}]},{"type":"paragraph","content":[{"text":"降级方案 A(脚本无 token 时 — 通过 MCP connector 交替导入图文)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"paragraph","content":[{"text":"当 ","type":"text"},{"text":"md_to_page.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 因缺少 LEXIANG_TOKEN 无法运行时(如 mcp.json 中无 lexiang 配置,只有 connector 模式),改用以下流程。","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"🚨 核心原则:交替导入,严禁先全文后补图!","type":"text","marks":[{"type":"strong"}]}]},{"type":"paragraph","content":[{"text":"必须按「文字段→图片→文字段→图片→...」的顺序交替导入,这样每张图片自然落在正确位置。 
绝对不能先把全部文字一次性导入,事后再补图——这会导致所有图片堆积在文档末尾,破坏图文混排。","type":"text"}]}]},{"type":"paragraph","content":[{"text":"执行流程(照此逐步执行,不可跳步)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"步骤 A1:准备——按图片位置拆分 Markdown\n 读取 \u003c原文标题>.md 内容\n 按 ![xxx](images/yyy) 引用位置拆分为交替的 segments 数组:\n [(\"text\", \"第一段文字...\"), (\"image\", \"img_01.png\"), (\"text\", \"第二段文字...\"), (\"image\", \"img_02.png\"), ...]\n 如果文字段超过 15000 字符,按段落 \\n\\n 二次拆分为多个 ≤15000 的子块\n\n步骤 A2:创建空白 page\n entry_create_entry(entry_type=\"page\", parent_entry_id=\u003c日期目录ID>, name=\"\u003c原文标题>\")\n 记录 entry_id\n\n步骤 A3:交替导入(按 segments 数组顺序逐个处理)\n is_first = true\n for each segment in segments:\n if segment.type == \"text\":\n entry_import_content_to_entry(entry_id, content=segment.text, force_write=is_first)\n is_first = false\n \n elif segment.type == \"image\":\n img_path = images/\u003csegment.filename>\n if 文件不存在 or 文件 \u003c 1KB → 跳过\n \n // 三步上传图片\n 1. block_apply_block_attachment_upload(entry_id, name, size, mime_type)→ session_id + upload_url\n 2. curl -X PUT \"\u003cupload_url>\" --data-binary @\u003cimg_path> → 确认 HTTP 200\n 3. 
block_create_block_descendant(entry_id, index=\"-1\", descendant=[{block_type: \"image\", image: {session_id}}])\n \n // index=\"-1\" 在这里是正确的!因为是交替执行,当前末尾就是刚导入的文字段之后","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"⚠️ 为什么 index=-1 在交替导入中是正确的?","type":"text","marks":[{"type":"strong"}]},{"text":" 因为文字和图片严格交替执行:先导入文字(追加到末尾),再在末尾插入图片,图片自然位于刚导入的文字之后。 ","type":"text"},{"text":"只有「先全文导入完毕→事后补图」的场景下,index=-1 才会导致图片堆积在末尾。","type":"text","marks":[{"type":"strong"}]}]},{"type":"paragraph","content":[{"text":"⚠️ 小图片(\u003c50KB 的 icon/分隔线/SVG 装饰图)可跳过","type":"text","marks":[{"type":"strong"}]},{"text":",只上传有信息量的关键图片(图表/截图/概念图)。","type":"text"}]}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"⚠️ 得到 APP 文章特殊情况","type":"text","marks":[{"type":"strong"}]},{"text":":得到文章通常有 80-100+ 张图片,其中大部分是公式渲染图(3-10KB),真正有信息量的数据图表约 5-10 张(>50KB)。对得到文章,","type":"text"},{"text":"必须","type":"text","marks":[{"type":"strong"}]},{"text":"上传 >50KB 的关键图片(概念图、流程图、案例配图等),不需要逐张上传所有小图。","type":"text"}]}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"⚠️ 得到文章完整转存流程(2026-05-09 实战验证)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"fetch_article.py --cdp","type":"text","marks":[{"type":"code_inline"}]},{"text":" 抓取全文 → 本地 article.md + images/","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"提取纯文字版(去掉 ","type":"text"},{"text":"![](images/...)","type":"text","marks":[{"type":"code_inline"}]},{"text":" 引用 + 去掉得到APP UI噪声如\"展开\"、\"分享\"、点赞数、用户留言等)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"创建 page → 分块导入纯文字(每块 ≤4000 chars)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"筛选 >50KB 的关键图片(用 ","type":"text"},{"text":"find images/ 
-size +50k","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"排除 SVG 格式的 UI 图标(查看文件头是否为 ","type":"text"},{"text":"\u003c?xml","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"逐张上传关键图片并插入文档对应位置","type":"text"}]}]}]}]},{"type":"paragraph","content":[{"text":"降级方案 B(最终降级)","type":"text","marks":[{"type":"strong"}]},{"text":":如果以上都失败 → 调用 ","type":"text"},{"text":"scripts/md_to_pdf.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 转为 PDF,再通过三步上传流程上传:","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"file_apply_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":"(参数:","type":"text"},{"text":"parent_entry_id=\u003c日期目录ID>","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"name=\"\u003c原文标题>.pdf\"","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"size=\u003c文件字节数>","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"mime_type=\"application/pdf\"","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"upload_type=\"PRE_SIGNED_URL\"","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"使用 ","type":"text"},{"text":"curl -X PUT","type":"text","marks":[{"type":"code_inline"}]},{"text":" 将 PDF 文件上传到返回的 
","type":"text"},{"text":"upload_url","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"file_commit_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":"(参数:","type":"text"},{"text":"session_id=\u003c上一步返回的session_id>","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"🚨 绝对禁止","type":"text","marks":[{"type":"strong"}]},{"text":":不要用 ","type":"text"},{"text":"file_apply_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":" 直接上传 .md 文件!.md 上传后在乐享中会丢失所有图片信息,用户看到的只是含 ","type":"text"},{"text":"![](images/xxx)","type":"text","marks":[{"type":"code_inline"}]},{"text":" 引用的纯文本,毫无可读性。","type":"text"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"无图片(纯文本文章)","type":"text","marks":[{"type":"strong"}]},{"text":" → 使用 ","type":"text"},{"text":"entry_import_content","type":"text","marks":[{"type":"code_inline"}]},{"text":" 创建为","type":"text"},{"text":"在线文档(page 类型)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"参数:","type":"text"},{"text":"space_id=\u003cconfig 中的 SPACE_ID>","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"parent_id=\u003c日期目录ID>","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"name=\"\u003c原文标题>\"","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"content=\u003cMarkdown文件内容>","type":"text","marks":[{"type":"code_inline"}]},{"text":", 
","type":"text"},{"text":"content_type=\"markdown\"","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"在线文档支持在乐享中直接编辑","type":"text"}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"通过 ","type":"text","marks":[{"type":"strong"}]},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" 抓取的文章(无本地图片文件)","type":"text","marks":[{"type":"strong"}]},{"text":" → 直接使用 ","type":"text"},{"text":"entry_import_content","type":"text","marks":[{"type":"code_inline"}]},{"text":" 创建在线文档,Markdown 内容中的外链图片在乐享中可能无法显示,但文字内容完整可编辑、可检索。","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"步骤 5:输出结果","type":"text","marks":[{"type":"strong"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"按 ","type":"text"},{"text":"config.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" 中的 ","type":"text"},{"text":"lexiang.access_domain.page_url_template","type":"text","marks":[{"type":"code_inline"}]},{"text":" 格式拼接文档链接,告知用户","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"示例:","type":"text"},{"text":"https://lexiangla.com/pages/\u003centry_id>?company_from=\u003ccompany_from>","type":"text","marks":[{"type":"code_inline"}]},{"text":"(域名和 company_from 从配置读取,","type":"text"},{"text":"不要","type":"text","marks":[{"type":"strong"}]},{"text":"硬编码)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"⚠️ 链接必须包含 ","type":"text","marks":[{"type":"strong"}]},{"text":"company_from","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" 
参数","type":"text","marks":[{"type":"strong"}]},{"text":",否则用户打开页面会跳转到登录页或显示无权限","type":"text"}]}]}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"注意事项","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"配置初始化是前置条件","type":"text","marks":[{"type":"strong"}]},{"text":":首次使用时会自动通过对话引导完成知识库配置,无需手动编辑文件","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"MCP 连接是前置条件","type":"text","marks":[{"type":"strong"}]},{"text":":必须先确认 lexiang MCP 已连接才能执行操作。不同 Agent 的连接方式不同,参见上方「乐享 MCP 工具的调用方式」","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"访问链接域名","type":"text","marks":[{"type":"strong"}]},{"text":":展示给用户的链接一律按 ","type":"text"},{"text":"config.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" 中 ","type":"text"},{"text":"page_url_template","type":"text","marks":[{"type":"code_inline"}]},{"text":" 格式生成(含 ","type":"text"},{"text":"company_from","type":"text","marks":[{"type":"code_inline"}]},{"text":" 参数),","type":"text"},{"text":"不要","type":"text","marks":[{"type":"strong"}]},{"text":"使用 ","type":"text"},{"text":"mcp.lexiang-app.com","type":"text","marks":[{"type":"code_inline"}]},{"text":",","type":"text"},{"text":"不要","type":"text","marks":[{"type":"strong"}]},{"text":"省略 ","type":"text"},{"text":"company_from","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"上传前自动去重","type":"text","marks":[{"type":"strong"}]},{"text":":按「文档名称 + 文档类型」在目标日期目录下查重,避免重复上传","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"默认使用在线文档(page)格式","type":"text","marks":[{"type":"strong"}]},{"text":":所有文章(含图文)统一以在线文档格式上传,支持编辑、检索、评论。PDF 仅作为最终降级方案","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"纯文本文章直接用 
","type":"text"},{"text":"entry_import_content","type":"text","marks":[{"type":"code_inline"}]},{"text":",图文文章优先用 ","type":"text"},{"text":"md_to_page.py","type":"text","marks":[{"type":"code_inline"}]},{"text":"(图片内嵌),降级用 ","type":"text"},{"text":"entry_import_content","type":"text","marks":[{"type":"code_inline"}]},{"text":"(图片不内嵌但文字完整)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"PDF 转换依赖 ","type":"text"},{"text":"pymupdf","type":"text","marks":[{"type":"code_inline"}]},{"text":" 库(","type":"text"},{"text":"pip3 install pymupdf","type":"text","marks":[{"type":"code_inline"}]},{"text":"),仅在前两种方式都失败时使用","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果同一天多次处理不同文章,它们会归入同一个日期目录下","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"使用 ","type":"text"},{"text":"_mcp_fields","type":"text","marks":[{"type":"code_inline"}]},{"text":" 参数可以减少返回数据量,如 ","type":"text"},{"text":"_mcp_fields=[\"id\", \"root_entry_id\", \"name\"]","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"脚本文件","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"文件","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"用途","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"scripts/fetch_article.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"通用文章抓取脚本(Chrome cookies + 
Playwright)。支持付费墙/登录墙、微信公众号、得到、","type":"text"},{"text":"Webflow SPA(claude.com/anthropic.com)","type":"text","marks":[{"type":"strong"}]},{"text":"等多种站点,自动识别内容容器,输出 Markdown + 图片 + 元信息 JSON。Webflow 支持为内置自动检测,无需额外参数","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"scripts/md_to_pdf.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Markdown 转 PDF 脚本(使用 pymupdf,嵌入本地图片,正确渲染中文,支持标题回退和拆行标题修复)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"scripts/md_to_page.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"【推荐】","type":"text","marks":[{"type":"strong"}]},{"text":" Markdown 图文导入乐享在线文档脚本。按图片位置将 markdown 拆分为 text/image 交替段落,分段导入到乐享 page(文字用 entry_import_content_to_entry 直传原始 markdown,图片用 block_apply_block_attachment_upload + curl PUT + block_create_block_descendant 三步上传)。⚠️ 脚本通过 HTTP JSON-RPC 直连乐享 MCP API,content 字段","type":"text"},{"text":"不需要 base64 编码","type":"text","marks":[{"type":"strong"}]},{"text":"(直传原始 markdown 字符串)。支持任意长度文章,图片内嵌到正文对应位置,生成可编辑、可划词评论的在线文档。用法:","type":"text"},{"text":"python3 scripts/md_to_page.py \u003cmd_file> --entry-id \u003cID> --token \u003cTOKEN> --company-from \u003cCF>","type":"text","marks":[{"type":"code_inline"}]},{"text":" 或 ","type":"text"},{"text":"--parent-id \u003cPID> --name \"标题\"","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
创建新页面","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"scripts/yt_download_transcribe.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"YouTube 视频下载 + Whisper 转录 + AI 翻译脚本(yt-dlp 下载、ffmpeg 提取音频、Whisper 转录、OpenAI 翻译为中英对照 Markdown)。也可用于播客音频转录(跳过视频下载步骤)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"scripts/translate_gemini.py","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"使用 Gemini API 将英文 Markdown 翻译为中英对照格式。按 ~4K 字符分段翻译,每段间隔 2 秒避免限频。模型:","type":"text"},{"text":"gemini-2.5-flash","type":"text","marks":[{"type":"code_inline"}]},{"text":"。需要 ","type":"text"},{"text":"GEMINI_API_KEY","type":"text","marks":[{"type":"code_inline"}]},{"text":" 环境变量。用法:","type":"text"},{"text":"python3 scripts/translate_gemini.py","type":"text","marks":[{"type":"code_inline"}]},{"text":"(翻译后生成 ","type":"text"},{"text":"_translated.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" 文件)","type":"text"}]}]}]}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"注意","type":"text","marks":[{"type":"strong"}]},{"text":":乐享知识库操作不再通过独立脚本(","type":"text"},{"text":"save_to_lexiang.sh","type":"text","marks":[{"type":"code_inline"}]},{"text":"/","type":"text"},{"text":"upload_yt_to_lexiang.sh","type":"text","marks":[{"type":"code_inline"}]},{"text":")完成,而是由大模型通过 ","type":"text"},{"text":"lexiang MCP 工具","type":"text","marks":[{"type":"strong"}]},{"text":"直接执行。不同 Agent 产品(OpenClaw、CodeBuddy、Claude Desktop 等)各自管理 MCP 
连接,但调用的工具名称和参数完全一致。","type":"text"}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"经验总结","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"在线文档图文导入(md_to_page.py)","type":"text"}]},{"type":"paragraph","content":[{"text":"核心方案","type":"text","marks":[{"type":"strong"}]},{"text":":Python 脚本通过 HTTP JSON-RPC 直连乐享 MCP API,按图片位置将 markdown 拆分为 text/image 交替段落,逐段导入。","type":"text"}]},{"type":"paragraph","content":[{"text":"为什么不走 IDE 的 MCP 工具调用","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"IDE MCP 工具调用有参数长度限制,45K 字符的 markdown base64 编码后 62K,无法一次性传递","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Python 脚本直连 HTTP JSON-RPC 没有此限制,按 ~15K 字符分段传输即可","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"关键踩坑(⚠️ 重要)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"不要做 base64 编码","type":"text","marks":[{"type":"strong"}]},{"text":":通过 HTTP JSON-RPC 直连时,","type":"text"},{"text":"entry_import_content_to_entry","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的 content 字段直传原始 markdown 字符串。如果做了 base64 编码,乐享会把 base64 字符串当成纯文本存储,页面显示为乱码。只有通过 IDE MCP 协议调用时才需要 base64","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"图片需逐张插入","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"block_create_block_descendant","type":"text","marks":[{"type":"code_inline"}]},{"text":" 一次传多张图片的 block 会失败,必须一张一张来","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"文字分段追加","type":"text","marks":[{"type":"strong"}]},{"text":":第一段用 
","type":"text"},{"text":"force_write=true","type":"text","marks":[{"type":"code_inline"}]},{"text":" 覆盖,后续段用 ","type":"text"},{"text":"force_write=false","type":"text","marks":[{"type":"code_inline"}]},{"text":" 追加到末尾","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"图片位置要正确","type":"text","marks":[{"type":"strong"}]},{"text":":先按原文中 ","type":"text"},{"text":"![](images/xxx.jpg)","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的位置拆分 markdown,确保文字和图片按原文顺序交替插入","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"乐享文档名称","type":"text","marks":[{"type":"strong"}]},{"text":":要与文章原标题一致,创建时通过 ","type":"text"},{"text":"--name","type":"text","marks":[{"type":"code_inline"}]},{"text":" 指定,或创建后用 ","type":"text"},{"text":"entry_rename_entry","type":"text","marks":[{"type":"code_inline"}]},{"text":" 修改","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"翻译注意事项","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"所有英文文章默认必须翻译为中英对照格式再归档","type":"text","marks":[{"type":"strong"}]},{"text":",不可跳过","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"翻译脚本 ","type":"text"},{"text":"translate_gemini.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 使用 Gemini API(模型:","type":"text"},{"text":"gemini-2.5-flash","type":"text","marks":[{"type":"code_inline"}]},{"text":"),按 ~4K 字符分段翻译","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Gemini API ","type":"text"},{"text":"gemini-2.0-flash","type":"text","marks":[{"type":"code_inline"}]},{"text":" 已下线,务必使用 ","type":"text"},{"text":"gemini-2.5-flash","type":"text","marks":[{"type":"code_inline"}]},{"text":" 或更新的模型","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"翻译完成后用 
","type":"text"},{"text":"md_to_page.py --entry-id \u003cID>","type":"text","marks":[{"type":"code_inline"}]},{"text":" 覆盖更新在线文档","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果没有 Gemini API Key 也没有 OpenAI API Key,由 AI 助手在对话中翻译后写入文件","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"自测清单","type":"text","marks":[{"type":"strong"}]},{"text":"(发布前必须完成):","type":"text"}]},{"type":"checkbox_list","attrs":{"id":null},"content":[{"type":"checkbox_item","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"通过 ","type":"text"},{"text":"entry_describe_ai_parse_content","type":"text","marks":[{"type":"code_inline"}]},{"text":" 验证文字内容可读(非 base64 乱码)","type":"text"}]}]},{"type":"checkbox_item","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"通过 ","type":"text"},{"text":"block_list_block_children","type":"text","marks":[{"type":"code_inline"}]},{"text":" 验证图片 block 存在且有 file_id","type":"text"}]}]},{"type":"checkbox_item","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"验证文档名称与文章标题一致","type":"text"}]}]},{"type":"checkbox_item","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"验证中英对照格式(英文在前,中文翻译紧跟其后)","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"YouTube 视频下载与转录","type":"text"}]},{"type":"paragraph","content":[{"text":"核心方案","type":"text","marks":[{"type":"strong"}]},{"text":":yt-dlp 下载 → ffmpeg 提取音频 → Whisper 本地转录 → OpenAI API 翻译","type":"text"}]},{"type":"paragraph","content":[{"text":"为什么不用 NotebookLM / summarize.sh","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"NotebookLM 需要 Google 账号且有额度限制,部分视频可能因版权限制无法提取","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"summarize.sh 依赖外部 
API(Apify/YouTube 字幕 API),部分视频无字幕时无法工作","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Whisper 本地转录","type":"text"},{"text":"不依赖字幕","type":"text","marks":[{"type":"strong"}]},{"text":",直接从音频波形识别语音,覆盖率 100%","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"yt-dlp 版本与安装(关键!)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"必须使用 ","type":"text","marks":[{"type":"strong"}]},{"text":"brew install yt-dlp","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" 安装,不要用 ","type":"text"},{"text":"pip3 install yt-dlp","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"原因:pip 版本受限于系统 Python 版本(macOS 自带 Python 3.9),无法安装 yt-dlp 的 nightly 版本(需要 Python 3.10+)。而 YouTube 频繁更新反爬策略,旧版 yt-dlp 会遇到 HTTP 403 Forbidden 错误","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"brew 安装的 yt-dlp 自带独立 Python 环境,始终能获取最新版本","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"脚本中调用方式:直接用 ","type":"text"},{"text":"yt-dlp","type":"text","marks":[{"type":"code_inline"}]},{"text":" 命令,","type":"text"},{"text":"不要","type":"text","marks":[{"type":"strong"}]},{"text":"用 ","type":"text"},{"text":"python3 -m yt_dlp","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"paragraph","content":[{"text":"YouTube DASH 格式 403 错误(重要!)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"YouTube 正在强制使用 SABR(Streaming ABR)流媒体协议,传统 DASH 分片下载(","type":"text"},{"text":"bestvideo+bestaudio","type":"text","marks":[{"type":"code_inline"}]},{"text":")会触发 HTTP 403 
Forbidden","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"解决方案","type":"text","marks":[{"type":"strong"}]},{"text":":优先使用 HLS(m3u8)格式下载,不会被 SABR 拦截","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"脚本中的格式选择顺序:","type":"text"},{"text":"95-1/94-1/93-1/bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best","type":"text","marks":[{"type":"code_inline"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"95-1","type":"text","marks":[{"type":"code_inline"}]},{"text":": 720p HLS(推荐,画质和文件大小的最佳平衡)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"94-1","type":"text","marks":[{"type":"code_inline"}]},{"text":": 480p HLS","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"93-1","type":"text","marks":[{"type":"code_inline"}]},{"text":": 360p HLS","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"后面是传统 DASH 格式作为回退","type":"text"}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"HLS 格式下载的视频文件会比 DASH 大一些(720p HLS 约 500-600MB vs DASH 约 200-300MB)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"注意","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"--extractor-args \"youtube:player_client=android\"","type":"text","marks":[{"type":"code_inline"}]},{"text":" 不支持 cookies,不是可靠的 403 解决方案","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Whisper 转录最佳实践","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"音频预处理:16kHz 采样率、单声道 WAV(","type":"text"},{"text":"ffmpeg -ar 16000 -ac 1","type":"text","marks":[{"type":"code_inline"}]},{"text":"),减少文件大小且是 Whisper 
推荐格式","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"段落合并策略:相邻 segment 间隔 \u003c2s 且总时长 \u003c60s 则合并,句号/问号结尾时倾向断开","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"模型选择:默认用 ","type":"text"},{"text":"base","type":"text","marks":[{"type":"code_inline"}]},{"text":"(速度和精度的最佳平衡),重要内容用 ","type":"text"},{"text":"small","type":"text","marks":[{"type":"code_inline"}]},{"text":" 或 ","type":"text"},{"text":"medium","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"paragraph","content":[{"text":"翻译策略","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"使用 OpenAI ","type":"text"},{"text":"gpt-4o-mini","type":"text","marks":[{"type":"code_inline"}]},{"text":",分批翻译(每批 10 段),避免 token 超限","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"翻译 prompt 要求\"自然流畅的中文表达,专业术语保留英文并附中文注释\"","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"中英对照格式:每段先展示英文原文,紧跟中文翻译,段间用空行分隔(不加分隔线和国旗 emoji)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果没有 OPENAI_API_KEY","type":"text","marks":[{"type":"strong"}]},{"text":":脚本会跳过翻译步骤,输出纯英文文字稿。此时可以由 AI 助手在对话中直接翻译全文,然后用 ","type":"text"},{"text":"md_to_page.py --entry-id","type":"text","marks":[{"type":"code_inline"}]},{"text":" 更新乐享文档","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"上传乐享的关键决策","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"文字稿使用 ","type":"text"},{"text":"在线文档(page)格式","type":"text","marks":[{"type":"strong"}]},{"text":"而非文件上传,原因:支持在乐享中按块维度编辑更新,可以逐段修正翻译或补充注释","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"视频使用 
","type":"text"},{"text":"文件(file)格式","type":"text","marks":[{"type":"strong"}]},{"text":"上传,因为视频不需要在线编辑","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"上传成功后自动删除本地视频文件,避免占用磁盘空间","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"视频上传到乐享的正确方式(重要!)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"通过 lexiang MCP 工具完成,使用三步上传流程:","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"file_apply_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":":申请上传凭证(传入 ","type":"text"},{"text":"parent_entry_id","type":"text","marks":[{"type":"code_inline"}]},{"text":"=日期目录 ID、","type":"text"},{"text":"upload_type","type":"text","marks":[{"type":"code_inline"}]},{"text":"=PRE_SIGNED_URL、","type":"text"},{"text":"mime_type","type":"text","marks":[{"type":"code_inline"}]},{"text":"=video/mp4、","type":"text"},{"text":"size","type":"text","marks":[{"type":"code_inline"}]},{"text":"=文件字节数)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"curl -X PUT","type":"text","marks":[{"type":"code_inline"}]},{"text":" 上传文件到返回的 ","type":"text"},{"text":"upload_url","type":"text","marks":[{"type":"code_inline"}]},{"text":"(预签名 URL,直传 COS)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"file_commit_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":":确认上传完成(传入 ","type":"text"},{"text":"session_id","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"518MB 视频的 PUT 上传约需 30-60 
秒","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"⚠️ 注意:本小节与文首「视频/音频上传到乐享」的要求存在冲突——文首规定视频/音频必须走 OpenAPI 上传,MCP 文件上传产出的条目不触发 VOD 转码、视频无法播放。两者不一致时,以文首说明及「YouTube 视频处理 → Step 2:上传到乐享知识库」章节为准;上述 MCP 三步流程可用于 PDF 等无需转码的文件(见降级方案 B)。","type":"text","marks":[{"type":"strong"}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"播客音频转录","type":"text"}]},{"type":"paragraph","content":[{"text":"核心方案","type":"text","marks":[{"type":"strong"}]},{"text":":yt-dlp(generic extractor)下载音频 → ffmpeg 转 WAV → Whisper 转录 → opencc 繁简转换","type":"text"}]},{"type":"paragraph","content":[{"text":"yt-dlp 对小宇宙的支持","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"yt-dlp 没有小宇宙专用 extractor,但 ","type":"text"},{"text":"generic extractor 完全够用","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"小宇宙页面中嵌入了 ","type":"text"},{"text":"\u003caudio>","type":"text","marks":[{"type":"code_inline"}]},{"text":" 标签,音频直链在 ","type":"text"},{"text":"media.xyzcdn.net","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"下载不需要 cookies,直接用 ","type":"text"},{"text":"yt-dlp --no-playlist -o \"%(title)s.%(ext)s\" \u003cURL>","type":"text","marks":[{"type":"code_inline"}]},{"text":" 即可","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"下载速度约 7MB/s,63 分钟播客(59MB)仅需 8 秒","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Whisper 中文转录的繁体问题(重要!)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Whisper base 模型对中文普通话","type":"text"},{"text":"倾向输出繁体字","type":"text","marks":[{"type":"strong"}]},{"text":"(如「歡迎」→ 应为「欢迎」)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"这是 Whisper 的已知行为,因为训练数据中繁体中文比重较大","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"解决方案","type":"text","marks":[{"type":"strong"}]},{"text":":转录后用 
","type":"text"},{"text":"opencc-python-reimplemented","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的 ","type":"text"},{"text":"t2s","type":"text","marks":[{"type":"code_inline"}]},{"text":"(Traditional to Simplified)模式批量转换","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"安装:","type":"text"},{"text":"pip3 install opencc-python-reimplemented","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"用法:","type":"text"},{"text":"opencc.OpenCC(\"t2s\").convert(text)","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"paragraph","content":[{"text":"中文播客 vs 英文 YouTube 的流程差异","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"中文播客","type":"text"},{"text":"不需要翻译","type":"text","marks":[{"type":"strong"}]},{"text":",但","type":"text"},{"text":"需要繁简转换","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"播客音频是直接的 m4a/mp3 文件,","type":"text"},{"text":"不需要从视频中提取音频","type":"text","marks":[{"type":"strong"}]},{"text":"(但仍需 ffmpeg 转为 WAV 格式给 Whisper)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Whisper 转录时","type":"text"},{"text":"指定 ","type":"text","marks":[{"type":"strong"}]},{"text":"language='zh'","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" 可以提高中文识别准确率","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"上传乐享时 MIME 类型用 ","type":"text"},{"text":"audio/mp4","type":"text","marks":[{"type":"code_inline"}]},{"text":"(m4a)或 ","type":"text"},{"text":"audio/mpeg","type":"text","marks":[{"type":"code_inline"}]},{"text":"(mp3),不是 
","type":"text"},{"text":"video/mp4","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"paragraph","content":[{"text":"转录性能参考","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"63 分钟中文播客 → Whisper base 模型在 CPU 上转录耗时约 115 秒","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"产出 2496 个 segments,合并后 65 个段落","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"微信公众号图文抓取","type":"text"}]},{"type":"paragraph","content":[{"text":"核心问题","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":" 工具无法获取微信公众号文章的图片(懒加载 + 防盗链),","type":"text"},{"text":"必须","type":"text","marks":[{"type":"strong"}]},{"text":"使用 ","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":"。","type":"text"}]},{"type":"paragraph","content":[{"text":"技术原理","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"懒加载机制","type":"text","marks":[{"type":"strong"}]},{"text":":微信图片的真实 URL 存放在 ","type":"text"},{"text":"data-src","type":"text","marks":[{"type":"code_inline"}]},{"text":" 而非 ","type":"text"},{"text":"src","type":"text","marks":[{"type":"code_inline"}]},{"text":",依赖 ","type":"text"},{"text":"IntersectionObserver","type":"text","marks":[{"type":"code_inline"}]},{"text":" 在元素进入视口时才加载。Playwright 无头浏览器通过 ","type":"text"},{"text":"window.scrollBy(0, 300)","type":"text","marks":[{"type":"code_inline"}]},{"text":" 配合 ","type":"text"},{"text":"asyncio.sleep(0.2)","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
模拟慢速滚动,逐步触发所有图片的懒加载观察器","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"兜底策略","type":"text","marks":[{"type":"strong"}]},{"text":":滚动完成后,通过 ","type":"text"},{"text":"page.evaluate()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 遍历所有 ","type":"text"},{"text":"img[data-src]","type":"text","marks":[{"type":"code_inline"}]},{"text":",将未被触发的 ","type":"text"},{"text":"data-src","type":"text","marks":[{"type":"code_inline"}]},{"text":" 强制复制到 ","type":"text"},{"text":"src","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"高清图优先","type":"text","marks":[{"type":"strong"}]},{"text":":提取图片 URL 时优先使用 ","type":"text"},{"text":"data-src","type":"text","marks":[{"type":"code_inline"}]},{"text":"(高清原图),而非 ","type":"text"},{"text":"src","type":"text","marks":[{"type":"code_inline"}]},{"text":"(可能是低分辨率占位图)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"格式识别","type":"text","marks":[{"type":"strong"}]},{"text":":微信图片 URL 无常规扩展名(如 ","type":"text"},{"text":"mmbiz.qpic.cn/...?wx_fmt=png","type":"text","marks":[{"type":"code_inline"}]},{"text":"),需解析 ","type":"text"},{"text":"wx_fmt","type":"text","marks":[{"type":"code_inline"}]},{"text":" 查询参数推断文件格式","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"防盗链绕过","type":"text","marks":[{"type":"strong"}]},{"text":":通过 Playwright 页面上下文的 ","type":"text"},{"text":"page.request.get()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 下载图片,自动携带正确的 Referer 头","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"专用选择器","type":"text","marks":[{"type":"strong"}]},{"text":":微信文章有固定 DOM 
结构(","type":"text"},{"text":"#js_content","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"#activity-name","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"#js_name","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"#publish_time","type":"text","marks":[{"type":"code_inline"}]},{"text":"),使用专用选择器比通用选择器更精准可靠","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"关键决策","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"微信文章是公开可读的,跳过登录检测和 Cookie 注入流程","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"滚动参数(300px 步长、200ms 间隔)经实测可平衡速度与懒加载触发成功率","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Markdown 转换时 ","type":"text"},{"text":"imageMap","type":"text","marks":[{"type":"code_inline"}]},{"text":" 同时匹配 ","type":"text"},{"text":"src","type":"text","marks":[{"type":"code_inline"}]},{"text":" 和 ","type":"text"},{"text":"data-src","type":"text","marks":[{"type":"code_inline"}]},{"text":",确保无论 HTML 中引用哪个属性都能正确替换","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"验证标准","type":"text","marks":[{"type":"strong"}]},{"text":":抓取完成后检查 ","type":"text"},{"text":"article_meta.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" 中的 ","type":"text"},{"text":"image_count","type":"text","marks":[{"type":"code_inline"}]},{"text":" 字段,与原文图片数量比对,确认无遗漏。","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"新平台适配思路","type":"text"}]},{"type":"paragraph","content":[{"text":"适配新平台时,需依次识别和处理以下 4 个维度:","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"懒加载机制","type":"text","marks":[{"type":"strong"}]},{"text":" — 图片是否用 
","type":"text"},{"text":"data-src","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"data-lazy","type":"text","marks":[{"type":"code_inline"}]},{"text":" 等延迟加载?需要怎样的滚动策略触发?","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"专用 DOM 结构","type":"text","marks":[{"type":"strong"}]},{"text":" — 正文、标题、作者、日期的选择器是什么?","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"图片 URL 格式","type":"text","marks":[{"type":"strong"}]},{"text":" — 扩展名是否在路径中?是否需要从查询参数推断?","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"防盗链策略","type":"text","marks":[{"type":"strong"}]},{"text":" — 是否需要正确的 Referer?是否有其他鉴权机制?","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"微信公众号文章处理(mp.weixin.qq.com)","type":"text"}]},{"type":"paragraph","content":[{"text":"首选方案:乐享 MCP ","type":"text","marks":[{"type":"strong"}]},{"text":"file_create_hyperlink","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":"(2026-05-09 验证 ✅)","type":"text","marks":[{"type":"strong"}]}]},{"type":"paragraph","content":[{"text":"乐享后端原生支持微信公众号文章的抓取与解析,","type":"text"},{"text":"一步到位","type":"text","marks":[{"type":"strong"}]},{"text":",无需本地抓取和手动上传图片。","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"mcp__lexiang__file_create_hyperlink(\n url = \"https://mp.weixin.qq.com/s/...\",\n parent_entry_id = \"\u003c目标目录 entry_id>\",\n name = \"\u003c文章标题>\" // 可选,不传会自动从微信提取\n)","type":"text"}]},{"type":"paragraph","content":[{"text":"返回值","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"finished: true","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 
后端抓取完成","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"entry.id","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 新创建的知识条目 ID","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"entry_type: \"flink\"","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 外部链接类型","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"extension: \"wechat\"","type":"text","marks":[{"type":"code_inline"}]},{"text":" — 自动识别微信来源","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"后端自动完成的事情","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"抓取微信文章全文(正文 + 图片)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"图片保存到乐享 COS(","type":"text"},{"text":"/assets/xxx","type":"text","marks":[{"type":"code_inline"}]},{"text":" 格式)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"OCR 识别图片中的文字(用于全文检索和 AI 解析)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"自动提取标题、作者、发布时间等元信息","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"优势","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"一步完成,省去 fetch_article.py + 分块导入 + 逐张上传图片的复杂流程","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Token 消耗从 ~50K 降到 \u003c1K","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"图片质量由乐享后端保证,无需本地下载和上传","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"支持乐享的全文检索和 AI 
解析(RAG)","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"如需附加用户评价/评论","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"创建 hyperlink 后,可用 ","type":"text"},{"text":"entry_import_content_to_entry","type":"text","marks":[{"type":"code_inline"}]},{"text":"(force_write=false)追加评价内容","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"或用 ","type":"text"},{"text":"block_create_block_descendant","type":"text","marks":[{"type":"code_inline"}]},{"text":" 在文档末尾插入评价 block","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"降级方案(当 ","type":"text","marks":[{"type":"strong"}]},{"text":"file_create_hyperlink","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" 失败时)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果返回 ","type":"text"},{"text":"finished: false","type":"text","marks":[{"type":"code_inline"}]},{"text":" 或错误码,改用 ","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 本地抓取 + 降级方案 A 导入","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"某些被限制的微信文章(如已删除、需付费等)可能无法通过此接口抓取","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"注意事项","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"产出的 entry_type 是 ","type":"text"},{"text":"flink","type":"text","marks":[{"type":"code_inline"}]},{"text":"(外部链接),而非 ","type":"text"},{"text":"page","type":"text","marks":[{"type":"code_inline"}]},{"text":"(在线文档)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"flink 类型在乐享中以原始文章格式展示,支持全文检索和 AI 
解析","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"除上述接口失败的情况外,只有当用户明确要求以「在线文档/page」格式存储(需要后续编辑)时,才主动改用 fetch_article.py 降级方案","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"得到 APP 文章抓取(dedao.cn)","type":"text"}]},{"type":"paragraph","content":[{"text":"核心问题","type":"text","marks":[{"type":"strong"}]},{"text":":得到 APP(","type":"text"},{"text":"www.dedao.cn","type":"text","marks":[{"type":"code_inline"}]},{"text":")的文章内容是","type":"text"},{"text":"付费内容 + SPA 动态渲染","type":"text","marks":[{"type":"strong"}]},{"text":",","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":" 和 ","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的通用提取逻辑都无法直接获取正文。","type":"text"}]},{"type":"paragraph","content":[{"text":"技术原因","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"SPA 架构","type":"text","marks":[{"type":"strong"}]},{"text":":得到网页版是 React SPA,文章正文通过 JS 异步渲染,","type":"text"},{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":" 只能拿到空白壳页面","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"付费墙","type":"text","marks":[{"type":"strong"}]},{"text":":文章属于付费专栏内容,必须有已登录且已订阅的账号才能查看全文","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"DOM 结构特殊","type":"text","marks":[{"type":"strong"}]},{"text":":正文容器使用 ","type":"text"},{"text":".iget-articles","type":"text","marks":[{"type":"code_inline"}]},{"text":" 类名,不在 ","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
的默认选择器列表(","type":"text"},{"text":"article","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":".post-content","type":"text","marks":[{"type":"code_inline"}]},{"text":" 等)中。通用 ","type":"text"},{"text":"article","type":"text","marks":[{"type":"code_inline"}]},{"text":" 选择器只匹配到极少内容(~167 字符),而真正的正文在 ","type":"text"},{"text":".iget-articles","type":"text","marks":[{"type":"code_inline"}]},{"text":" 中有 6000+ 字符","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"内容区混杂","type":"text","marks":[{"type":"strong"}]},{"text":":正文容器中混入了标题重复、音频时长、\"划重点\"、用户评论等非正文内容,需要清理","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"抓取方案","type":"text","marks":[{"type":"strong"}]},{"text":":使用 ","type":"text"},{"text":"CDP 模式","type":"text","marks":[{"type":"strong"}]},{"text":"连接已登录得到的 Chrome 浏览器:","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# 前提:用户已在 Chrome 中登录得到 APP 且有文章阅读权限\npython scripts/fetch_article.py fetch \"https://www.dedao.cn/course/article?id=\u003cID>\" --output-dir \u003c目录> --cdp","type":"text"}]},{"type":"paragraph","content":[{"text":"已知限制","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的通用内容提取逻辑对得到 DOM 结构匹配不佳,","type":"text"},{"text":"抓取结果可能不完整","type":"text","marks":[{"type":"strong"}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"正确做法是通过 Playwright CDP 连接后,","type":"text"},{"text":"手动指定 ","type":"text","marks":[{"type":"strong"}]},{"text":".iget-articles","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" 选择器","type":"text","marks":[{"type":"strong"}]},{"text":"提取正文:","type":"text"}]}]}]},{"type":"code_block","attrs":{"wrap":false,"language":"python"},"content":[{"text":"# 通过 CDP 
连接后,用专用选择器提取得到文章正文\ncontent_el = await page.query_selector('.iget-articles')\nif content_el:\n text = await content_el.inner_text() # 完整正文","type":"text"}]},{"type":"paragraph","content":[{"text":"内容清理要点","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"去掉正文开头的标题重复、日期、音频时长等元信息(通常在 ","type":"text"},{"text":"凡哥杂谈,你好","type":"text","marks":[{"type":"code_inline"}]},{"text":" 或类似开场白之前)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"去掉正文末尾的\"划重点\"、\"添加到笔记\"、\"首次发布\"、\"用户留言\"等非正文内容","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果是多篇系列文章(如上/下篇),合并时用 ","type":"text"},{"text":"## 上篇","type":"text","marks":[{"type":"code_inline"}]},{"text":" / ","type":"text"},{"text":"## 下篇","type":"text","marks":[{"type":"code_inline"}]},{"text":" 分隔","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"作者信息需要手动确认(通用提取器可能抓错)","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"得到文章转存乐享完整流程(2026-05-09 实战验证 ✅)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"以下流程已在实际操作中验证通过,确保图文完整转存。","type":"text"}]}]},{"type":"ordered_list","attrs":{"order":1,"listStyle":"number"},"content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"抓取","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"},{"text":"python scripts/fetch_article.py fetch \"\u003cURL>\" --output-dir articles/dedao_\u003cID短码> --cdp","type":"text","marks":[{"type":"code_inline"}]}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"产出:","type":"text"},{"text":"article.md","type":"text","marks":[{"type":"code_inline"}]},{"text":" + 
","type":"text"},{"text":"images/","type":"text","marks":[{"type":"code_inline"}]},{"text":" 目录(通常 80-100+ 张图,大部分是小于 10KB 的公式/icon 图)","type":"text"}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"提取纯文字版","type":"text","marks":[{"type":"strong"}]},{"text":"(去除图片引用和得到 UI 噪声):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# 去除图片引用 ![](images/...)\n# 去除得到 APP 特有 UI 噪声:\n# - \"展开\"/\"收起\" 按钮文字\n# - 点赞数、评论数、分享按钮(如 \"25\"、\"8\"、\"218\"、\"分享\")\n# - \"关注\" 按钮\n# - 用户昵称 + 日期行(如 \"Christy\\n05-05\")\n# - \"划重点\" / \"添加到笔记\" / \"写笔记划线删除划线复制\" 等功能按钮\n# - \"首次发布: ...\" 行\n# - \"我的留言\" / \"用户留言\" / \"全部 精选 筛选\" 等区域标记\n# 保留正文 + 注释引用","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"创建在线文档 + 分块导入文字","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"entry_create_entry","type":"text","marks":[{"type":"code_inline"}]},{"text":"(entry_type=\"page\", parent_entry_id=日期目录, name=\"\u003c文章标题>(来源描述)\")","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"将纯文字版分块(≤4000 chars/块),第一块 force_write=true,后续 force_write=false 追加","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"验证导入结果(spot check 关键段落)","type":"text"}]}]}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"筛选并上传关键图片","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# 找出 >50KB 的关键图片\nfind images/ -size +50k -type f | sort\n\n# 排除 SVG/UI 图标(检查文件头)\nfile images/img_04_*.png # 如果是 SVG XML 则跳过\n\n# 查看图片内容(确认哪些有信息价值)\n# 典型有价值的:概念图、流程图、人物照片、数据图表\n# 典型无价值的:SVG 格式的得到 APP 
logo/icon","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"逐张上传图片到文档对应位置","type":"text","marks":[{"type":"strong"}]},{"text":"(每张图3步):","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"① block_apply_block_attachment_upload(entry_id, name, size, mime_type) → session_id + upload_url\n② curl -X PUT \"\u003cupload_url>\" -H \"Content-Type: \u003cmime>\" -H \"Content-Length: \u003csize>\" --data-binary @\u003cfile>\n③ block_create_block_descendant(entry_id, parent_block_id=page_block_id, index=\u003c位置>, descendant=[{block_type:\"image\", image:{session_id, caption, align:\"center\"}}])","type":"text"}]},{"type":"paragraph","content":[{"text":"图片位置确定","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"先用 ","type":"text"},{"text":"block_list_block_children","type":"text","marks":[{"type":"code_inline"}]},{"text":"(entry_id, with_descendants=false)获取所有一级 block","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"根据原文 article.md 中 ","type":"text"},{"text":"![](images/xxx)","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的位置,找到对应文字段落的 block_id","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"用 index 参数插入(注意:每插入一张图,后面的 block index 都会 +1)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果精确位置难以确定,也可以用 index=-1 追加到末尾(所有图集中放在文末也可接受)","type":"text"}]}]}]}]}]},{"type":"paragraph","content":[{"text":"适用场景","type":"text","marks":[{"type":"strong"}]},{"text":":得到 APP 专栏文章(","type":"text"},{"text":"www.dedao.cn/course/article?id=xxx","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]},{"type":"paragraph","content":[{"text":"TODO","type":"text","marks":[{"type":"strong"}]},{"text":":考虑在 
","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 中增加得到专用检测和选择器(类似微信公众号的 ","type":"text"},{"text":"_is_wechat_article","type":"text","marks":[{"type":"code_inline"}]},{"text":" 机制),自动使用 ","type":"text"},{"text":".iget-articles","type":"text","marks":[{"type":"code_inline"}]},{"text":" 提取正文。","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"SPA 网站 Playwright 直接出 PDF(正文隔离方案)","type":"text"}]},{"type":"paragraph","content":[{"text":"适用场景","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 抓取后正文为空或极少(\u003c 200 字符),说明网站是 SPA 动态渲染,通用 Markdown 提取器无法工作","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"批量抓取帮助中心/文档站(如 Guru help.getguru.com、readme.io 托管站、GitBook 等)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"已知案例:","type":"text"},{"text":"vcsmemo.com","type":"text","marks":[{"type":"code_inline"}]},{"text":"(Nuxt.js SPA)、","type":"text"},{"text":"help.getguru.com","type":"text","marks":[{"type":"code_inline"}]},{"text":"(readme.io)","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"核心方案","type":"text","marks":[{"type":"strong"}]},{"text":":用 Playwright 无头浏览器直接访问页面 → 等待 SPA 渲染完成 → 隔离正文区域 → ","type":"text"},{"text":"page.pdf()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 生成 PDF。","type":"text"}]},{"type":"paragraph","content":[{"text":"关键步骤","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"1. 
加载与等待","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"javascript"},"content":[{"text":"await page.goto(url, { waitUntil: \"networkidle\", timeout: 60000 });\nawait page.waitForTimeout(5000); // SPA 需要额外等待 JS 渲染","type":"text"}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"2. 滚动触发懒加载图片","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"javascript"},"content":[{"text":"await page.evaluate(async () => {\n const delay = (ms) => new Promise(r => setTimeout(r, ms));\n for (let i = 0; i \u003c document.body.scrollHeight; i += 300) {\n window.scrollBy(0, 300);\n await delay(200);\n }\n window.scrollTo(0, 0);\n});\nawait page.waitForTimeout(3000);","type":"text"}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"3. 正文隔离(⚠️ 最关键的一步)","type":"text"}]},{"type":"paragraph","content":[{"text":"问题","type":"text","marks":[{"type":"strong"}]},{"text":":直接 ","type":"text"},{"text":"page.pdf()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 会把整个页面打进 PDF,包括导航栏、侧边栏、相关推荐、页脚等非正文内容。","type":"text"},{"text":"必须在生成 PDF 前隔离正文区域","type":"text","marks":[{"type":"strong"}]},{"text":"。","type":"text"}]},{"type":"paragraph","content":[{"text":"正文隔离策略(三步法)","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"paragraph","content":[{"text":"Step A:定位正文容器","type":"text","marks":[{"type":"strong"}]},{"text":" — 找到包含文章核心段落的最小公共祖先节点","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"javascript"},"content":[{"text":"// 用文章中的关键句子定位正文 \u003cp> 标签\nconst articleParagraphs = [];\ndocument.querySelectorAll(\"p\").forEach(p => {\n if (p.textContent.includes(\"文章中的某段独特文字\")) {\n articleParagraphs.push(p);\n }\n});\n\n// 计算所有正文段落的最小公共祖先\nlet commonAncestor = articleParagraphs[0];\nfor (let i = 1; i \u003c articleParagraphs.length; i++) {\n // ... 
向上遍历 DOM 树找公共祖先\n}","type":"text"}]},{"type":"paragraph","content":[{"text":"Step B:替换 body","type":"text","marks":[{"type":"strong"}]},{"text":" — 将整个 ","type":"text"},{"text":"document.body","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的内容替换为正文容器的克隆","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"javascript"},"content":[{"text":"const articleContent = commonAncestor.cloneNode(true);\ndocument.body.innerHTML = \"\";\ndocument.body.appendChild(articleContent);","type":"text"}]},{"type":"paragraph","content":[{"text":"Step C:清理残余","type":"text","marks":[{"type":"strong"}]},{"text":" — 从正文容器内部移除混入的非正文元素","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"javascript"},"content":[{"text":"// 移除正文容器内可能混入的非内容元素\narticleContent.querySelectorAll(\n '[class*=\"related\"], [class*=\"sidebar\"], [class*=\"comment\"], ' +\n '[class*=\"share\"], [class*=\"subscribe\"], nav, header, footer'\n).forEach(el => el.remove());\n\n// 按文本内容移除(如\"相关文章\"、\"登录\"等中文导航项)\narticleContent.querySelectorAll(\"*\").forEach(el => {\n const t = el.textContent.trim();\n if (t === \"相关文章\" || t === \"登录\" || t.startsWith(\"Signal, not noise\")) {\n const wrapper = el.closest(\"section, div, aside\");\n wrapper ? wrapper.remove() : el.remove();\n }\n});","type":"text"}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"4. 样式优化","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"javascript"},"content":[{"text":"articleContent.style.maxWidth = \"750px\";\narticleContent.style.margin = \"0 auto\";\narticleContent.style.padding = \"30px 20px\";\narticleContent.style.fontSize = \"15px\";\narticleContent.style.lineHeight = \"1.8\";\n\narticleContent.querySelectorAll(\"img\").forEach(img => {\n img.style.maxWidth = \"100%\";\n img.style.height = \"auto\";\n});","type":"text"}]},{"type":"heading","attrs":{"level":4},"content":[{"text":"5. 
生成 PDF","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"javascript"},"content":[{"text":"await page.pdf({\n path: outputPath,\n format: \"A4\",\n printBackground: true,\n margin: { top: \"15mm\", bottom: \"15mm\", left: \"15mm\", right: \"15mm\" },\n});","type":"text"}]},{"type":"paragraph","content":[{"text":"常见需要移除的非正文元素","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"元素类型","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"典型选择器/文本","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"说明","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"左侧导航","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"nav","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"[class*=\"sidebar\"]","type":"text","marks":[{"type":"code_inline"}]},{"text":", 
包含\"首页/快讯/登录\"等文本","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"网站主导航","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"右侧推荐","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"[class*=\"related\"]","type":"text","marks":[{"type":"code_inline"}]},{"text":", 包含\"相关文章\"文本","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"相关文章推荐","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"顶部搜索","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"[class*=\"search\"]","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"header","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"搜索栏和网站 header","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"底部页脚","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"footer","type":"text","marks":[{"type":"code_inline"}]},{"text":", 
","type":"text"},{"text":"[class*=\"footer\"]","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"版权信息等","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"作者卡片","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"[class*=\"author-card\"]","type":"text","marks":[{"type":"code_inline"}]},{"text":", 包含头像+简介的独立区块","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"如果在正文外部","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"订阅入口","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"[class*=\"subscribe\"]","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"[class*=\"newsletter\"]","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"CTA 按钮","type":"text"}]}]}]}]},{"type":"paragraph","content":[{"text":"调试技巧","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"在 ","type":"text"},{"text":"page.pdf()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 之前先 ","type":"text"},{"text":"page.screenshot({ path: \"debug.png\", fullPage: true })","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
截图确认隔离效果","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"如果首次隔离不干净,根据截图调整选择器,迭代优化","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"已验证的 SPA 网站","type":"text","marks":[{"type":"strong"}]},{"text":":","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"网站","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"框架","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"正文定位方式","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"vcsmemo.com","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Nuxt.js","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"通过文章段落文本找公共祖先,class ","type":"text"},{"text":"left","type":"text","marks":[{"type":"code_inline"}]},{"text":" 内的 
","type":"text"},{"text":"section","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"help.getguru.com","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"readme.io","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"移除 ","type":"text"},{"text":".rm-Sidebar","type":"text","marks":[{"type":"code_inline"}]},{"text":" + ","type":"text"},{"text":"nav","type":"text","marks":[{"type":"code_inline"}]},{"text":" + ","type":"text"},{"text":"header","type":"text","marks":[{"type":"code_inline"}]},{"text":" + ","type":"text"},{"text":"footer","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"dedao.cn","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"React SPA","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"CDP 模式 + ","type":"text"},{"text":".iget-articles","type":"text","marks":[{"type":"code_inline"}]},{"text":" 专用选择器","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Python 兼容性","type":"text"}]},{"type":"paragraph","content":[{"text":"脚本使用 ","type":"text"},{"text":"from __future__ import annotations","type":"text","marks":[{"type":"code_inline"}]},{"text":" 以兼容 Python 3.9(","type":"text"},{"text":"str | None","type":"text","marks":[{"type":"code_inline"}]},{"text":" 联合类型语法在 3.9 
中不可用)。","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"常见问题","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"问题","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"原因","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"修复方法","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"YouTube 视频下载 HTTP 403 Forbidden","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"yt-dlp 版本过旧 + YouTube 强制 SABR 流媒体协议,传统 DASH 分片下载被拦截","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"① ","type":"text"},{"text":"brew install yt-dlp","type":"text","marks":[{"type":"code_inline"}]},{"text":" 升级到最新版(不要用 pip);② 脚本已配置优先使用 HLS(m3u8) 格式(","type":"text"},{"text":"95-1/94-1/93-1","type":"text","marks":[{"type":"code_inline"}]},{"text":"),自动回退","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"pip3 install --upgrade yt-dlp","type":"text","marks":[{"type":"code_inline"}]},{"text":" 无法安装最新版","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"macOS 自带 Python 3.9,yt-dlp nightly 版需要 Python 
3.10+","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"改用 ","type":"text"},{"text":"brew install yt-dlp","type":"text","marks":[{"type":"code_inline"}]},{"text":",brew 版自带独立 Python 环境","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"脚本中 ","type":"text"},{"text":"python3 -m yt_dlp","type":"text","marks":[{"type":"code_inline"}]},{"text":" 调用失败","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"pip 安装的旧版 yt-dlp 与 brew 安装的新版不一致","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"脚本已修改为直接调用 ","type":"text"},{"text":"yt-dlp","type":"text","marks":[{"type":"code_inline"}]},{"text":" 命令(brew 安装的版本)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"视频上传乐享报\"不支持的文件格式\"","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"旧版 COS API(","type":"text"},{"text":"/kb/files/upload-params","type":"text","marks":[{"type":"code_inline"}]},{"text":")不识别视频格式","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"通过 lexiang MCP 工具使用三步上传流程:","type":"text"},{"text":"file_apply_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":" → ","type":"text"},{"text":"curl PUT","type":"text","marks":[{"type":"code_inline"}]},{"text":" → 
","type":"text"},{"text":"file_commit_upload","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Whisper 转录速度极慢","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"模型太大或音频太长","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"换用 ","type":"text"},{"text":"tiny","type":"text","marks":[{"type":"code_inline"}]},{"text":" 或 ","type":"text"},{"text":"base","type":"text","marks":[{"type":"code_inline"}]},{"text":" 模型;对于长视频(>1h),考虑用 ","type":"text"},{"text":"--whisper-model tiny","type":"text","marks":[{"type":"code_inline"}]},{"text":" 先快速预览","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"翻译结果为空","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"未设置 ","type":"text"},{"text":"OPENAI_API_KEY","type":"text","marks":[{"type":"code_inline"}]},{"text":" 环境变量","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"export OPENAI_API_KEY=sk-xxx","type":"text","marks":[{"type":"code_inline"}]},{"text":";或使用 ","type":"text"},{"text":"--skip-translate","type":"text","marks":[{"type":"code_inline"}]},{"text":" 跳过翻译,由 AI 助手在对话中直接翻译全文后用 ","type":"text"},{"text":"md_to_page.py --entry-id","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
更新乐享文档","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"中英对照格式段落错位","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"AI 翻译返回的段落数与原文不匹配","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"脚本已有容错处理(缺少翻译的段落会跳过),可手动补充翻译","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"视频上传乐享超时","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"视频文件过大(>500MB)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"使用 MCP 的 ","type":"text"},{"text":"file_apply_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":" 预签名 URL 方式上传,518MB 文件约 30-60 秒即可完成","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Whisper 中文转录输出繁体字","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Whisper base 模型对中文普通话倾向输出繁体","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"用 ","type":"text"},{"text":"opencc-python-reimplemented","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的 ","type":"text"},{"text":"t2s","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
模式进行繁简转换:","type":"text"},{"text":"opencc.OpenCC(\"t2s\").convert(text)","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"小宇宙播客下载提示 generic extractor","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"yt-dlp 没有小宇宙专用 extractor","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"正常现象,generic extractor 能自动从页面提取音频直链(","type":"text"},{"text":"media.xyzcdn.net","type":"text","marks":[{"type":"code_inline"}]},{"text":"),下载完全正常","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"微信文章图片丢失","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"web_fetch","type":"text","marks":[{"type":"code_inline"}]},{"text":" 无法触发懒加载和绕过防盗链","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"首选","type":"text","marks":[{"type":"strong"}]},{"text":":使用 ","type":"text"},{"text":"file_create_hyperlink","type":"text","marks":[{"type":"code_inline"}]},{"text":" 直接导入(乐享后端自动处理图文)。","type":"text"},{"text":"降级","type":"text","marks":[{"type":"strong"}]},{"text":":使用 
","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":"(脚本自动检测微信域名并启用专用处理策略)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"乐享知识库操作失败","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"MCP 连接异常或 Token 过期","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"① 确认当前 Agent 的 lexiang MCP 已连接(CodeBuddy 检查 MCP 面板、OpenClaw 检查 skill 安装状态);② Token 过期时访问 https://lexiangla.com/mcp 获取新 Token 并更新 MCP 配置","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"文件上传到了知识库根目录而非日期目录","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"跳过了步骤 2(创建日期目录)和步骤 3(去重检查),直接以 ","type":"text"},{"text":"root_entry_id","type":"text","marks":[{"type":"code_inline"}]},{"text":" 作为 ","type":"text"},{"text":"parent_entry_id","type":"text","marks":[{"type":"code_inline"}]},{"text":" 上传","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"严格按照步骤 1→2→3→4 顺序执行,步骤 2 中先 ","type":"text"},{"text":"entry_list_children","type":"text","marks":[{"type":"code_inline"}]},{"text":" 检查日期目录是否存在,不存在则创建","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"展示给用户的乐享链接无法访问","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"使用了 MCP API 域名 
","type":"text"},{"text":"mcp.lexiang-app.com","type":"text","marks":[{"type":"code_inline"}]},{"text":" 或缺少 ","type":"text"},{"text":"company_from","type":"text","marks":[{"type":"code_inline"}]},{"text":" 参数","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"所有展示给用户的链接必须按 ","type":"text"},{"text":"config.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" 中 ","type":"text"},{"text":"page_url_template","type":"text","marks":[{"type":"code_inline"}]},{"text":" 格式生成:","type":"text"},{"text":"https://lexiangla.com/pages/\u003centry_id>?company_from=\u003ccompany_from>","type":"text","marks":[{"type":"code_inline"}]},{"text":"。","type":"text"},{"text":"company_from 不可省略","type":"text","marks":[{"type":"strong"}]},{"text":",否则用户无法访问","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"PDF 中缺少标题","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的 ","type":"text"},{"text":"processNode","type":"text","marks":[{"type":"code_inline"}]},{"text":" 将正文 ","type":"text"},{"text":"\u003ch1>","type":"text","marks":[{"type":"code_inline"}]},{"text":" 转为 ","type":"text"},{"text":"# 标题","type":"text","marks":[{"type":"code_inline"}]},{"text":",与手动拼接的元信息头标题重复;某些网站(如 Lenny's Newsletter)标题在 ","type":"text"},{"text":"articleEl","type":"text","marks":[{"type":"code_inline"}]},{"text":" 外部导致 MD 文件第一行 ","type":"text"},{"text":"# ","type":"text","marks":[{"type":"code_inline"}]},{"text":" 为空","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"已修复:(1) 
","type":"text"},{"text":"processNode","type":"text","marks":[{"type":"code_inline"}]},{"text":" 中自动去重正文中与已提取 title 相同的第一个 h1 (2) 标题提取增加 ","type":"text"},{"text":"og:title","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"meta[name=\"title\"]","type":"text","marks":[{"type":"code_inline"}]},{"text":"、","type":"text"},{"text":"document.title","type":"text","marks":[{"type":"code_inline"}]},{"text":" 多策略回退 (3) ","type":"text"},{"text":"md_to_pdf.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 增加标题回退——当 MD 中无有效 h1 时从 ","type":"text"},{"text":"article_meta.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" 补充","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"PDF 中缺少子标题","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"某些网站的 HTML 结构导致 ","type":"text"},{"text":"### # 从 Tab 到 Agents","type":"text","marks":[{"type":"code_inline"}]},{"text":" 被拆为两行:","type":"text"},{"text":"### #","type":"text","marks":[{"type":"code_inline"}]},{"text":" 和 ","type":"text"},{"text":"从 Tab 到 Agents","type":"text","marks":[{"type":"code_inline"}]},{"text":",","type":"text"},{"text":"parse_markdown","type":"text","marks":[{"type":"code_inline"}]},{"text":" 将 ","type":"text"},{"text":"#","type":"text","marks":[{"type":"code_inline"}]},{"text":" 视为无效标题丢弃","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"已修复:","type":"text"},{"text":"parse_markdown","type":"text","marks":[{"type":"code_inline"}]},{"text":" 增加拆行标题检测——当标题文字为 ","type":"text"},{"text":"#","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
或空时,检查下一行是否为实际标题文字并合并","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"md_to_page.py 导入后文字显示为 base64 乱码","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"脚本通过 HTTP JSON-RPC 直连乐享 MCP API 时,对 content 做了多余的 base64 编码。乐享 MCP 的 base64 要求仅针对 IDE 侧 MCP 协议","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"已修复:去掉 ","type":"text"},{"text":"import_content","type":"text","marks":[{"type":"code_inline"}]},{"text":" 函数中的 ","type":"text"},{"text":"base64.b64encode()","type":"text","marks":[{"type":"code_inline"}]},{"text":",直传原始 markdown。⚠️ 通过 HTTP JSON-RPC 直连时","type":"text"},{"text":"永远不要做 base64 编码","type":"text","marks":[{"type":"strong"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"md_to_page.py 批量插入图片 block 失败","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"block_create_block_descendant","type":"text","marks":[{"type":"code_inline"}]},{"text":" 一次传多张图片的 descendant 数组会超时或报错","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"改为逐张插入,每次只传一个 image block 的 descendant + children","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Gemini API 调用报 404 
模型不存在","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"gemini-2.0-flash","type":"text","marks":[{"type":"code_inline"}]},{"text":" 模型已下线","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"使用 ","type":"text"},{"text":"gemini-2.5-flash","type":"text","marks":[{"type":"code_inline"}]},{"text":" 替代。可通过 ","type":"text"},{"text":"curl \"https://generativelanguage.googleapis.com/v1beta/models?key=$GEMINI_API_KEY\"","type":"text","marks":[{"type":"code_inline"}]},{"text":" 查看当前可用模型","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"英文文章未翻译就归档","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"跳过了步骤 3.5 的语言检测和翻译","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"所有英文文章必须翻译为中英对照后再归档","type":"text","marks":[{"type":"strong"}]},{"text":",这是强制步骤不可跳过。使用 ","type":"text"},{"text":"translate_gemini.py","type":"text","marks":[{"type":"code_inline"}]},{"text":"(Gemini API)或 ","type":"text"},{"text":"translate_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":"(OpenAI API)翻译,翻译完用 ","type":"text"},{"text":"md_to_page.py --entry-id","type":"text","marks":[{"type":"code_inline"}]},{"text":" 覆盖更新","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"translate_gemini.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 报错 
FileNotFoundError","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"脚本硬编码了源文件路径,不读取命令行参数","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"已修复:改用 ","type":"text"},{"text":"sys.argv[1]","type":"text","marks":[{"type":"code_inline"}]},{"text":" 读取输入文件,","type":"text"},{"text":"sys.argv[2]","type":"text","marks":[{"type":"code_inline"}]},{"text":" 读取输出文件,默认输出 ","type":"text"},{"text":"_translated.md","type":"text","marks":[{"type":"code_inline"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"md_to_page.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 执行报错 IndentationError","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"添加 ","type":"text"},{"text":"--evaluation","type":"text","marks":[{"type":"code_inline"}]},{"text":"/","type":"text"},{"text":"--evaluation-file","type":"text","marks":[{"type":"code_inline"}]},{"text":" 参数时缩进不一致","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"已修复:参数定义须与上方 ","type":"text"},{"text":"--base-url","type":"text","marks":[{"type":"code_inline"}]},{"text":" 对齐;Python 严禁混用 tab 和空格","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 下载的图片在 ","type":"text"},{"text":"md_to_page.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 中提示 NOT 
FOUND","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"下载保存的文件名与写入 Markdown 的引用不一致(如 ","type":"text"},{"text":"img_06_1c1cfc4c.gif","type":"text","marks":[{"type":"code_inline"}]},{"text":" vs ","type":"text"},{"text":"img_06_1c1cfc42.gif","type":"text","marks":[{"type":"code_inline"}]},{"text":")","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的 ","type":"text"},{"text":"process_images","type":"text","marks":[{"type":"code_inline"}]},{"text":" 函数中,保存到 ","type":"text"},{"text":"images/","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的文件名与替换 Markdown ","type":"text"},{"text":"src","type":"text","marks":[{"type":"code_inline"}]},{"text":" 时的文件名必须完全一致;建议统一使用 ","type":"text"},{"text":"hash[:8]","type":"text","marks":[{"type":"code_inline"}]},{"text":" + 原始扩展名,并在替换后打印映射表方便排查","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"乐享 MCP 更新 token 后工具仍报 \"not found\"","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"mcp.json","type":"text","marks":[{"type":"code_inline"}]},{"text":" 配置已更新,但 MCP 服务未重新加载","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"必须重启 WorkBuddy","type":"text","marks":[{"type":"strong"}]},{"text":"(或禁用再重新启用 MCP 服务),新的 token 
才能生效","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"md_to_page.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 新增评价信息功能","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"需要在文档顶部插入用户评价(callout 组件)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"已添加 ","type":"text"},{"text":"--evaluation","type":"text","marks":[{"type":"code_inline"}]},{"text":"(短文本)和 ","type":"text"},{"text":"--evaluation-file","type":"text","marks":[{"type":"code_inline"}]},{"text":"(文件路径)两个参数;评价内容会以 blockquote 格式插入文档顶部,乐享会自动渲染为 callout 组件","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"播客文字稿章节标题重复出现几十次","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"在 Whisper segment 级别(1-5秒粒度)插入章节标题,且用宽松时间容差匹配 ","type":"text"},{"text":"abs(start - ts) \u003c 5","type":"text","marks":[{"type":"code_inline"}]},{"text":",导致多个 segment 都命中同一标题","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"必须先合并 segments 为段落(gap\u003c2s, duration\u003c60s),再在段落级别插入标题","type":"text","marks":[{"type":"strong"}]},{"text":"。用 ","type":"text"},{"text":"inserted_headers = set()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
跟踪已插入标题,每个标题只插入一次","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"日期目录重复创建","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"直接调用 ","type":"text"},{"text":"entry_create_entry","type":"text","marks":[{"type":"code_inline"}]},{"text":" 而不先查询目录是否已存在","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"必须先用 ","type":"text","marks":[{"type":"strong"}]},{"text":"entry_list_children","type":"text","marks":[{"type":"code_inline"},{"type":"strong"}]},{"text":" 查询根目录","type":"text","marks":[{"type":"strong"}]},{"text":",匹配到同名 folder 则复用其 ID,不存在才创建。已在步骤 2 中加强约束","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"得到文章转存后无图片","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"只导入了纯文字,未执行图片上传步骤","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"得到文章","type":"text"},{"text":"必须","type":"text","marks":[{"type":"strong"}]},{"text":"在文字导入后,逐张上传 >50KB 的关键图片(概念图/流程图/配图),流程:","type":"text"},{"text":"block_apply_block_attachment_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":" → ","type":"text"},{"text":"curl PUT","type":"text","marks":[{"type":"code_inline"}]},{"text":" → ","type":"text"},{"text":"block_create_block_descendant","type":"text","marks":[{"type":"code_inline"}]},{"text":"(image block)。详见\"得到 APP 
文章抓取\"章节","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"图片上传后显示不出来","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"block_create_block_descendant","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的 image block 未正确传入 session_id","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"image block 的 ","type":"text"},{"text":"session_id","type":"text","marks":[{"type":"code_inline"}]},{"text":" 必须来自同一个 ","type":"text"},{"text":"block_apply_block_attachment_upload","type":"text","marks":[{"type":"code_inline"}]},{"text":" 返回值,且 curl PUT 必须返回 HTTP 200 才表示文件上传成功","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 抓取 Webflow SPA 站点(如 claude.com)正文为空","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"旧版通用选择器列表中无 Webflow 容器,且内嵌 ","type":"text"},{"text":"\u003cstyle>","type":"text","marks":[{"type":"code_inline"}]},{"text":" 标签污染提取","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"已修复:","type":"text"},{"text":"fetch_article.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 内置 ","type":"text"},{"text":"_is_webflow_blog()","type":"text","marks":[{"type":"code_inline"}]},{"text":" 检测,自动使用 ","type":"text"},{"text":".u-rich-text-blog","type":"text","marks":[{"type":"code_inline"}]},{"text":" / 
","type":"text"},{"text":".w-richtext","type":"text","marks":[{"type":"code_inline"}]},{"text":" 选择器,并在提取前移除内嵌 ","type":"text"},{"text":"\u003cstyle>","type":"text","marks":[{"type":"code_inline"}]},{"text":" 标签。无需使用独立脚本","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"md_to_page.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 无法匹配某些图片引用","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"旧版 img_pattern 只匹配 ","type":"text"},{"text":"img_XX_HASH.ext","type":"text","marks":[{"type":"code_inline"}]},{"text":" 格式(fetch_article.py),不匹配其他命名格式","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"已修复:img_pattern 改为通用 ","type":"text"},{"text":"!\\[[^\\]]*\\]\\(images/([^)]+)\\)","type":"text","marks":[{"type":"code_inline"}]},{"text":",兼容所有图片命名格式","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"新建目录排到末尾而非顶部","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"md_to_page.py","type":"text","marks":[{"type":"code_inline"}]},{"text":" 和手动调用 ","type":"text"},{"text":"entry_move_entry","type":"text","marks":[{"type":"code_inline"}]},{"text":" 时使用 ","type":"text"},{"text":"after=\u003c第一个条目ID>","type":"text","marks":[{"type":"code_inline"}]},{"text":"(排第二)或 ","type":"text"},{"text":"after=\"\"","type":"text","marks":[{"type":"code_inline"}]},{"text":"(排末尾)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"已修复:统一使用 
","type":"text"},{"text":"before=\u003c第一个条目ID>","type":"text","marks":[{"type":"code_inline"}]},{"text":" 实现真正置顶。","type":"text"},{"text":"after=\"\"","type":"text","marks":[{"type":"code_inline"}]},{"text":" 的 API 文档描述不准确,实测是排末尾","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"images/ 有图片但乐享文档无图片","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"entry_import_content","type":"text","marks":[{"type":"code_inline"}]},{"text":" 导入 Markdown 时,本地 ","type":"text"},{"text":"![](images/xxx)","type":"text","marks":[{"type":"code_inline"}]},{"text":" 引用不会自动上传图片","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"entry_import_content","type":"text","marks":[{"type":"code_inline"}]},{"text":" 只处理文字,","type":"text"},{"text":"不会上传本地图片","type":"text","marks":[{"type":"strong"}]},{"text":"。有本地图片必须走 ","type":"text"},{"text":"md_to_page.py","type":"text","marks":[{"type":"code_inline"}]},{"text":"(自动处理图文)或降级方案 A(","type":"text"},{"text":"交替导入","type":"text","marks":[{"type":"strong"}]},{"text":"文字和图片,严禁先全文后补图)。步骤 4 开头的「图片判断 Checklist」是必查项","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"图片全部堆积在文档末尾","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"先一次性导入全部文字,事后用 ","type":"text"},{"text":"index=-1","type":"text","marks":[{"type":"code_inline"}]},{"text":" 
补图","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"必须交替导入","type":"text","marks":[{"type":"strong"}]},{"text":":按 ","type":"text"},{"text":"![](images/xxx)","type":"text","marks":[{"type":"code_inline"}]},{"text":" 位置拆分 Markdown 为 segments,先导入一段文字 → 上传图片(index=-1 追加到末尾,此时末尾就是正确位置)→ 导入下一段文字 → 上传下一张图片... 详见降级方案 A 的执行流程","type":"text"}]}]}]}]},{"type":"hr","attrs":{"markup":"---"}}]},"metadata":{"date":"2026-06-05","name":"fetch-archive-to-lexiang","author":"@skillopedia","source":{"stars":0,"repo_name":"fetch-archive-to-lexiang","origin_url":"https://github.com/ajaxhe/fetch-archive-to-lexiang/blob/HEAD/SKILL.md","repo_owner":"ajaxhe","body_sha256":"a00dba369c19790ad7278acff9c93c9bff044c6777af459c67fd1eadbe41a455","cluster_key":"a905ff53b3c94a6b92b1f4c072a7e69070a570250343471967889a21bac62b01","clean_bundle":{"format":"clean-skill-bundle-v1","source":"ajaxhe/fetch-archive-to-lexiang/SKILL.md","attachments":[{"id":"38b55512-81a5-5f91-b097-8a5948af0e95","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/38b55512-81a5-5f91-b097-8a5948af0e95/attachment","path":".gitignore","size":314,"sha256":"46d1fbd3332cde4182ab4cbac8f64dfb01d09355eba43b06102cb7d96fad7f4b","contentType":"text/plain; charset=utf-8"},{"id":"6d8b700d-ac16-58aa-b361-3c2b6eb6a4f9","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/6d8b700d-ac16-58aa-b361-3c2b6eb6a4f9/attachment.md","path":"README.md","size":7551,"sha256":"409d90e60174dec35e83e46f85eb5d4714d0e552c2f39a928408138f217e6f1e","contentType":"text/markdown; charset=utf-8"},{"id":"8f12094a-0e26-5f55-b874-eaff4987d8a4","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/8f12094a-0e26-5f55-b874-eaff4987d8a4/attachment.example","path":"config.json.example","size":869,"sha256":"2b74095d42b7ff31ca02c6fe912d34c157f206f4ba2939cf9309ce7c7fc60a42","contentType":"text/plain; 
charset=utf-8"},{"id":"6448b2e5-38e2-5bff-93a3-52a1e0a1f58f","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/6448b2e5-38e2-5bff-93a3-52a1e0a1f58f/attachment.py","path":"scripts/fetch_article.py","size":76016,"sha256":"44fad5ffebd6e2ec7a6b52af614b741f2873fc14f65f713af665f26c22a07e12","contentType":"text/x-python; charset=utf-8"},{"id":"d0fec989-2ed7-5e7f-b195-11d470e4a9f0","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/d0fec989-2ed7-5e7f-b195-11d470e4a9f0/attachment.py","path":"scripts/md_to_page.py","size":8770,"sha256":"fada7c1c42ec1fadd76f6763f2c391b5ce7e93b6a4f8107bdb7f20c51638a8b4","contentType":"text/x-python; charset=utf-8"},{"id":"8dddb433-470f-5979-8b7c-5790aa49f633","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/8dddb433-470f-5979-8b7c-5790aa49f633/attachment.py","path":"scripts/md_to_pdf.py","size":14347,"sha256":"13007afcb1e9ab07db5387ea1376c0396dcb544a8ed82411f2d6cf179087c356","contentType":"text/x-python; charset=utf-8"},{"id":"6a5709bf-b3af-578e-a4b1-4d08def2511d","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/6a5709bf-b3af-578e-a4b1-4d08def2511d/attachment.py","path":"scripts/translate_gemini.py","size":3111,"sha256":"1f84072010e83c1ee67986b7c93a04c03c514586ac532fd5258d07ede4210daf","contentType":"text/x-python; charset=utf-8"},{"id":"3305892a-b742-52b2-aea6-8ff9f14fd934","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/3305892a-b742-52b2-aea6-8ff9f14fd934/attachment.py","path":"scripts/upload_video_via_openapi.py","size":11701,"sha256":"2d97e753cae23294bade96c9417b30a9260d3915906fb67950fd61f23463bc1d","contentType":"text/x-python; charset=utf-8"},{"id":"78617a51-18f2-5901-ab2d-3d5f3e37d1c3","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/78617a51-18f2-5901-ab2d-3d5f3e37d1c3/attachment.py","path":"scripts/yt_download_transcribe.py","size":19468,"sha256":"3c8a4bda4085d92b0fd3b14c43fc0da1627b65bfb94e5df9c649e20250e919a5","contentType":"text/x-python; 
charset=utf-8"}],"bundle_sha256":"b701f3c86369f5092c5d46040613f360c681f0a6518386a52750f8422eabeffd","attachment_count":9,"text_attachments":7,"attachment_storage":"skillopedia-attachments-v1","binary_attachments":2,"excluded_attachments":[]},"cluster_size":1,"skill_md_path":"SKILL.md","import_metadata":{"date":"2026-06-05","author":"@skillopedia","version":"v1","category":"documents-office","category_label":"Documents"},"exact_dupes_collapsed_into_this":0},"version":"v1","category":"documents-office","import_tag":"clean-skills-v1","description":"通用文章抓取与归档工具。抓取任意 URL(免费/付费/登录墙)的文章全文,转换为结构化 Markdown,并可选转存到乐享知识库。支持 Substack、Medium、知识星球等付费平台的登录态管理。支持 YouTube 视频下载(yt-dlp)、播客音频下载(小宇宙FM等)、音频转录(Whisper)、翻译(中英对照格式),并将音视频和文字稿上传乐享知识库(文字稿使用在线文档格式,支持按块编辑)。支持 PDF 文件/链接:自动提取文本+精确裁剪图形,非中文内容默认翻译为中英对照后转存乐享。支持微博帖子抓取(CDP 模式绕过登录墙)。关键词触发:抓取文章、获取全文、付费文章、转存知识库、乐享、保存原文、fetch article、归档、YouTube、视频转录、字幕提取、视频下载、播客、podcast、小宇宙、xiaoyuzhou、PDF、论文、arxiv、微博、weibo。"}},"renderedAt":1782981553835}

抓取链接内容 & 转存知识库 🎬 视频/音频上传到乐享 :必须用 (走 OpenAPI )。 不要 用 MCP 的 或 ——它们产出 的条目,不触发 VOD 转码,视频无法播放。详见下方「YouTube 视频处理 → Step 2:上传到乐享知识库」章节。凭证存放于 (不进 git)。 概述 将文章 URL(免费/付费/登录墙)抓取为结构化 Markdown,并自动转存到乐享知识库,实现素材归档和可追溯。 最终产出物 1. — 完整文章 Markdown(含图片引用) 2. — 结构化元信息(原文链接、作者、发布时间、抓取时间等) 3. — 所有文章配图 4. 乐享知识库中的文档副本(按天维度归档) 乐享文档链接格式(⚠️ 必须遵守) 转存完成后, 必须 按以下格式输出可点击访问的链接: - : 或 返回的 - :固定值 (凡哥的企业 ID,不可省略,省略后链接无法访问) - 禁止 使用 格式——这是 MCP 内部调试链接,用户无法直接访问 文件命名规则(重要) - 必须使用原文标题命名 ,不要用 等通用名称 - 文件名格式: 、 - 示例: 、 - 如果标题中包含文件名不合法字符( 、 、 等),替换为 - 乐享知识库转存时也使用原文标题作为文档标题 工作流程 Step 1:素材收集 抓取方式决策树 根据 URL 类型选择抓取方式(按优先级排列): 1. claude.com…