x-tweet-fetcher — Skillopedia

X Tweet Fetcher Fetch tweets from X/Twitter without authentication. Supports tweet content, reply threads, user timelines, and Chinese platforms. Feature Overview | Feature | Command | Dependencies | |---------|---------|-------------| | Single tweet | | None (zero deps) | | Reply threads | | Camofox | | User timeline | | Camofox | | Chinese platforms | | Camofox (except WeChat) | | Google search | | Camofox | --- Basic Usage (Zero Dependencies) Fetch a Single Tweet What It Fetches | Content Type | Support | |-------------|---------| | Regular tweets | ✅ Full text + stats | | Long tweets (Twi…

, time_text) or re.search(r'^\\d+-\\d+\\s+\\d+:\\d+

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, time_text):\n current_article[\"time\"] = time_text\n # Extract post URL\n url_match = re.search(r'/url:\\s*(https?://[^$]+)', next_line)\n if url_match:\n current_article[\"time_url\"] = url_match.group(1).strip()\n continue\n\n # Verified text (认证信息): \" - text: 认证信息\"\n if current_article[\"author\"] and not current_article[\"verified_text\"]:\n if stripped.startswith(\" - text:\"):\n text = stripped[len(\" - text:\"):].strip()\n # Skip common UI text\n if text and text.lower() not in (\"转发\", \"评论\", \"赞\", \"收藏\", \"更多\", \"\"):\n # Check if it's verified info (contains \"已编辑\" or short)\n if \"已编辑\" in text or len(text) \u003c 60:\n current_article[\"verified_text\"] = text\n elif not current_article[\"content\"]:\n current_article[\"content\"] = text\n continue\n\n # Topics: \" - link \\\"#话题#\\\"\"\n if stripped.startswith(\" - link #\"):\n match = re.match(r'^- link \"(#[^#]+#)\"', stripped.lstrip())\n if match:\n current_article[\"topics\"].append(match.group(1))\n\n # Content: \" - text: 正文\" (after verified text)\n if current_article.get(\"verified_text\") and not current_article.get(\"content\"):\n if stripped.startswith(\" - text:\"):\n text = stripped[len(\" - text:\"):].strip()\n if text and len(text) > 5:\n current_article[\"content\"] = text\n continue\n\n # Stats: \" - text:  241  102\" (转发评论)\n if stripped.startswith(\" - text:\") and \"转发\" not in stripped:\n text = stripped[len(\" - text:\"):].strip()\n # Match pattern with numbers (possibly with emoji icons)\n nums = re.findall(r'(\\d+(?:\\.\\d+)?万?)', text)\n if len(nums) >= 2:\n try:\n current_article[\"shares\"] = parse_wan_number(nums[0])\n current_article[\"comments\"] = parse_wan_number(nums[1])\n except:\n pass\n\n # Likes: \" - button \\\"1793\\\" [eN]:\"\n if stripped.startswith(\" - button \"):\n match = re.match(r'^- button \"(\\d+(?:\\.\\d+)?万?)\"', stripped.lstrip())\n if match:\n current_article[\"likes\"] = parse_wan_number(match.group(1))\n\n # Add last article if still open\n if in_article and current_article is not None:\n articles.append(current_article)\n\n # Use first article\n if not articles:\n return {\n \"url\": url,\n \"platform\": \"weibo\",\n \"title\": \"\",\n \"author\": \"未知\",\n \"published_at\": \"\",\n \"fetched_at\": datetime.now(timezone(timedelta(hours=8))).isoformat(),\n \"content\": \"\",\n \"stats\": {\"likes\": 0, \"comments\": 0, \"shares\": 0, \"views\": 0},\n \"media\": [],\n \"comments\": [],\n \"availability\": \"partial\",\n \"unavailable_fields\": [\"comments\"],\n }\n\n first = articles[0]\n \n # Build content: verified text + main content\n full_content = \"\"\n if first.get(\"verified_text\"):\n full_content = first[\"verified_text\"]\n if first.get(\"content\"):\n if full_content:\n full_content += \"\\n\" + first[\"content\"]\n else:\n full_content = first[\"content\"]\n\n result = {\n \"url\": first.get(\"time_url\") or url,\n \"platform\": \"weibo\",\n \"title\": \"\",\n \"author\": first.get(\"author\", \"未知\"),\n \"author_handle\": first.get(\"author_url\", \"\"),\n \"published_at\": first.get(\"time\", \"\"),\n \"fetched_at\": datetime.now(timezone(timedelta(hours=8))).isoformat(),\n \"content\": full_content,\n \"stats\": {\n \"likes\": first.get(\"likes\", 0),\n \"comments\": first.get(\"comments\", 0),\n \"shares\": first.get(\"shares\", 0),\n \"views\": 0,\n },\n \"media\": [],\n \"topics\": first.get(\"topics\", []),\n \"comments\": [],\n \"availability\": \"partial\",\n \"unavailable_fields\": [\"comments\"],\n }\n\n return result\n\n def to_markdown(self, data: Dict[str, Any]) -> str:\n lines = [\n \"---\",\n f\"platform: {data['platform']}\",\n f\"url: {data['url']}\",\n f\"title: \\\"{data.get('title', '')}\\\"\",\n f\"author: \\\"{data.get('author', '')}\\\"\",\n f\"published_at: \\\"{data.get('published_at', '')}\\\"\",\n f\"fetched_at: \\\"{data.get('fetched_at', '')}\\\"\",\n \"stats:\",\n f\" likes: {data.get('stats', {}).get('likes', 0)}\",\n f\" comments: {data.get('stats', {}).get('comments', 0)}\",\n f\" shares: {data.get('stats', {}).get('shares', 0)}\",\n f\"availability: {data.get('availability', 'full')}\",\n \"---\",\n \"\",\n ]\n\n if data.get(\"title\"):\n lines.append(f\"# {data['title']}\\n\")\n\n if data.get(\"content\"):\n lines.append(data[\"content\"])\n\n return \"\\n\".join(lines)\n\n\n# ---------------------------------------------------------------------------\n# Bilibili parser\n# ---------------------------------------------------------------------------\n\nclass BilibiliParser(PlatformParser):\n \"\"\"Parser for Bilibili videos.\"\"\"\n\n name = \"bilibili\"\n\n def can_handle(self, url: str) -> bool:\n return bool(re.search(r'bilibili\\.com|b23\\.tv', url, re.IGNORECASE))\n\n def fetch(self, url: str, port: int = 9377) -> Dict[str, Any]:\n if not check_camofox(port):\n return {\"url\": url, \"platform\": \"bilibili\", \"error\": t(\"camofox_not_running\", port=port)}\n\n print(t(\"opening_via_camofox\", url=url), file=sys.stderr)\n\n session_key = f\"bilibili-{int(time.time())}\"\n snapshot = camofox_fetch_page(url, session_key, wait=8, port=port)\n\n if not snapshot:\n return {\"url\": url, \"platform\": \"bilibili\", \"error\": t(\"snapshot_failed\")}\n\n data = self._parse_snapshot(snapshot, url)\n return data\n\n def _parse_snapshot(self, snapshot: str, url: str) -> Dict[str, Any]:\n \"\"\"Parse Bilibili video page snapshot.\n \n Real snapshot format:\n - heading \"标题\" [level=1]\n - text: 1019.1万 (播放量)\n - text: 1.1万 2026-02-17 23:51:30 (弹幕+时间)\n - text: 未经作者授权... (简介)\n - text: 77.7万 (点赞)\n - text: 8.8万 (投币)\n - text: 19.8万 (收藏)\n - text: 19.1万 (转发)\n - link \"UP主名\" [eN]: (UP主，URL包含space.bilibili.com)\n - text: 关注 61.8万 (粉丝数)\n \"\"\"\n lines = snapshot.split(\"\\n\")\n\n title = \"\"\n author = \"\"\n description = \"\"\n published_at = \"\"\n views = 0\n bullets = 0\n likes = 0\n coins = 0\n favorites = 0\n shares = 0\n followers = 0\n\n heading_found = False\n stats_started = False\n stats_count = 0\n \n i = 0\n while i \u003c len(lines):\n line = lines[i]\n stripped = line.strip()\n\n # 1. 标题: heading \"...\" [level=1]\n if not title and stripped.startswith('- heading \"'):\n match = re.match(r'^- heading \"(.+)\" \\[level=(\\d+)\\]', stripped)\n if match:\n title = match.group(1)\n heading_found = True\n i += 1\n continue\n\n # 如果还没找到 heading，继续下一行\n if not heading_found:\n i += 1\n continue\n\n # 2. 播放量: heading后第一个包含\"万\"的text行（不包含日期）\n if heading_found and not views and stripped.startswith(\"- text:\"):\n text_content = stripped[len(\"- text:\"):].strip()\n # 播放量：纯数字+万，不包含日期时间\n if \"万\" in text_content and not re.search(r'\\d{4}-\\d{2}-\\d{2}', text_content):\n # 可能是播放量 (1019.1万)\n # 排除简介（未经作者授权）\n if not text_content.startswith(\"未经\"):\n views = parse_wan_number(text_content)\n i += 1\n continue\n\n # 3. 弹幕+发布时间: 包含日期格式 YYYY-MM-DD HH:MM:SS 的行\n if not published_at and stripped.startswith(\"- text:\"):\n text_content = stripped[len(\"- text:\"):].strip()\n # 匹配日期时间格式\n date_match = re.search(r'(\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2}:\\d{2})', text_content)\n if date_match:\n published_at = date_match.group(1)\n # 尝试提取弹幕数 (1.1万 2026-02-17 23:51:30)\n bullet_match = re.search(r'([\\d.]+万)', text_content)\n if bullet_match:\n bullets = parse_wan_number(bullet_match.group(1))\n i += 1\n continue\n\n # 4. 简介: text: 未经作者授权开头的行\n if not description and stripped.startswith(\"- text:\"):\n text_content = stripped[len(\"- text:\"):].strip()\n if text_content.startswith(\"未经作者授权\") or text_content.startswith(\"未经\"):\n description = text_content\n i += 1\n continue\n\n # 5. 互动数据: 在\"发送\"按钮后连续出现4个带\"万\"的text行 (点赞、投币、收藏、转发)\n # 格式: - text: 77.7万 \\n - img \\n - text: 8.8万 \\n - img ...\n # 所以我们需要找连续4个包含\"万\"且格式为 X.X万的 text 行\n if not stats_started and stripped.startswith(\"- text:\"):\n text_content = stripped[len(\"- text:\"):].strip()\n # 检查是否是互动数据格式 (如 77.7万)\n if re.match(r'^[\\d.]+万

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, text_content):\n # 这可能是第一个互动数据\n # 检查前后是否有img\n stats_started = True\n stats_count = 1\n likes = parse_wan_number(text_content)\n \n # 继续检查接下来的行\n j = i + 1\n while j \u003c len(lines) and stats_count \u003c 4:\n next_stripped = lines[j].strip()\n if next_stripped.startswith(\"- text:\"):\n next_text = next_stripped[len(\"- text:\"):].strip()\n if re.match(r'^[\\d.]+万

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, next_text):\n stats_count += 1\n if stats_count == 2:\n coins = parse_wan_number(next_text)\n elif stats_count == 3:\n favorites = parse_wan_number(next_text)\n elif stats_count == 4:\n shares = parse_wan_number(next_text)\n else:\n break\n elif next_stripped == \"- img\":\n # img 行，继续\n pass\n else:\n break\n j += 1\n \n # 跳过已检查的行\n i = j\n continue\n \n # 6. UP主: link \"UP主名\" [eN]: 且URL包含 space.bilibili.com\n if not author and stripped.startswith(\"- link \"):\n # 检查下一行是否是 space.bilibili.com URL\n if i + 1 \u003c len(lines):\n next_line = lines[i + 1].strip()\n if \"/url:\" in next_line and \"space.bilibili.com\" in next_line:\n match = re.match(r'^- link \"([^\"]+)\"\\s*(\\[e\\d+\\])?:?

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, stripped)\n if match:\n author = match.group(1)\n # 跳过检查 next_line\n i += 2\n continue\n\n # 7. 粉丝数: text: 关注 61.8万\n if not followers and stripped.startswith(\"- text:\"):\n text_content = stripped[len(\"- text:\"):].strip()\n # 匹配 \"关注数字万\"\n follow_match = re.match(r'^关注\\s+([\\d.]+万)

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, text_content)\n if follow_match:\n followers = parse_wan_number(follow_match.group(1))\n\n i += 1\n\n result = {\n \"url\": url,\n \"platform\": \"bilibili\",\n \"title\": title,\n \"author\": author or \"未知UP主\",\n \"published_at\": published_at,\n \"fetched_at\": datetime.now(timezone(timedelta(hours=8))).isoformat(),\n \"content\": description,\n \"stats\": {\n \"likes\": likes,\n \"comments\": 0, # Requires login\n \"shares\": shares,\n \"views\": views,\n \"bullets\": bullets,\n \"coins\": coins,\n \"favorites\": favorites,\n \"followers\": followers,\n },\n \"media\": [],\n \"comments\": [],\n \"availability\": \"partial\",\n \"unavailable_fields\": [\"comments\"],\n }\n\n return result\n\n def to_markdown(self, data: Dict[str, Any]) -> str:\n lines = [\n \"---\",\n f\"platform: {data['platform']}\",\n f\"url: {data['url']}\",\n f\"title: \\\"{data.get('title', '')}\\\"\",\n f\"author: \\\"{data.get('author', '')}\\\"\",\n f\"published_at: \\\"{data.get('published_at', '')}\\\"\",\n f\"fetched_at: \\\"{data.get('fetched_at', '')}\\\"\",\n \"stats:\",\n f\" likes: {data.get('stats', {}).get('likes', 0)}\",\n f\" views: {data.get('stats', {}).get('views', 0)}\",\n f\" comments: {data.get('stats', {}).get('comments', 0)}\",\n f\" shares: {data.get('stats', {}).get('shares', 0)}\",\n f\" bullets: {data.get('stats', {}).get('bullets', 0)}\",\n f\"availability: {data.get('availability', 'partial')}\",\n \"---\",\n \"\",\n ]\n\n if data.get(\"title\"):\n lines.append(f\"# {data['title']}\\n\")\n\n if data.get(\"tags\"):\n lines.append(f\"**标签**: {' '.join('#' + tag for tag in data['tags'])}\\n\")\n\n if data.get(\"content\"):\n lines.append(f\"## 简介\\n{data['content']}\")\n\n return \"\\n\".join(lines)\n\n\n# ---------------------------------------------------------------------------\n# CSDN parser\n# ---------------------------------------------------------------------------\n\nclass CSDNParser(PlatformParser):\n \"\"\"Parser for CSDN blog articles.\"\"\"\n\n name = \"csdn\"\n\n def can_handle(self, url: str) -> bool:\n return bool(re.search(r'blog\\.csdn\\.net|csdn\\.net', url, re.IGNORECASE))\n\n def fetch(self, url: str, port: int = 9377) -> Dict[str, Any]:\n if not check_camofox(port):\n return {\"url\": url, \"platform\": \"csdn\", \"error\": t(\"camofox_not_running\", port=port)}\n\n print(t(\"opening_via_camofox\", url=url), file=sys.stderr)\n\n session_key = f\"csdn-{int(time.time())}\"\n snapshot = camofox_fetch_page(url, session_key, wait=8, port=port)\n\n if not snapshot:\n return {\"url\": url, \"platform\": \"csdn\", \"error\": t(\"snapshot_failed\")}\n\n data = self._parse_snapshot(snapshot, url)\n return data\n\n def _parse_snapshot(self, snapshot: str, url: str) -> Dict[str, Any]:\n \"\"\"Parse CSDN page snapshot.\n \n Real snapshot format for download list page:\n - listitem with link containing file info and URL\n e.g., \"1.69MB 强化学习算法在大语言模型...zip 2026-02-19\"\n \n For article pages, typical format:\n - heading \"文章标题\" [level=1]\n - link \"作者名\" [eN]:\n - text: 发布时间\n - text: 阅读数, 点赞数, 评论数\n - text: 文章内容...\n \"\"\"\n lines = snapshot.split(\"\\n\")\n\n title = \"\"\n author = \"\"\n published_at = \"\"\n content = \"\"\n views = 0\n likes = 0\n comments_count = 0\n \n # Try to detect page type\n is_download_page = False\n downloads = []\n \n # Check if it's a download list (contains file sizes like \"1.69MB\", \"201KB\")\n if \"MB\" in snapshot or \"KB\" in snapshot:\n is_download_page = True\n\n if is_download_page:\n # Parse as download list\n for i, line in enumerate(lines):\n stripped = line.strip()\n \n # Download items: link with file info\n if stripped.startswith(\"- listitem:\"):\n # Check next few lines for link\n for j in range(1, 5):\n if i + j \u003c len(lines):\n next_line = lines[i + j].strip()\n if next_line.startswith(\"- link \"):\n # Check for URL\n if i + j + 1 \u003c len(lines):\n url_line = lines[i + j + 1].strip()\n if \"/url:\" in url_line:\n # Extract file info from link text\n match = re.match(r'^- link \"([^\"]+)\"\\s*(\\[e\\d+\\])?:?

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, next_line)\n if match:\n link_text = match.group(1)\n # Extract file size, name, date\n # Pattern: \"1.69MB 文件名 2026-02-19\"\n file_match = re.match(r'^([\\d.]+(?:MB|KB))\\s+(.+?)\\s+(\\d{4}-\\d{2}-\\d{2})

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, link_text)\n if file_match:\n size = file_match.group(1)\n filename = file_match.group(2)\n date = file_match.group(3)\n \n # Extract URL\n url_match = re.search(r'/url:\\s*(https?://[^$]+)', url_line)\n file_url = url_match.group(1).strip() if url_match else \"\"\n \n downloads.append({\n \"filename\": filename,\n \"size\": size,\n \"date\": date,\n \"url\": file_url,\n })\n break\n \n # If not download page, try to parse as article\n if not is_download_page:\n heading_found = False\n for i, line in enumerate(lines):\n stripped = line.strip()\n\n # 1. Title: heading level 1\n if not title and stripped.startswith('- heading \"'):\n match = re.match(r'^- heading \"(.+)\" \\[level=(\\d+)\\]', stripped)\n if match:\n title = match.group(1)\n heading_found = True\n continue\n\n # 2. Author\n if not author and stripped.startswith(\"- link \"):\n # Check if next line has profile URL\n if i + 1 \u003c len(lines):\n next_line = lines[i + 1].strip()\n if \"/url:\" in next_line:\n match = re.match(r'^- link \"([^\"]+)\"\\s*(\\[e\\d+\\])?:?

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, stripped)\n if match:\n author = match.group(1)\n continue\n\n # 3. Published time\n if not published_at and stripped.startswith(\"- text:\"):\n text = stripped[len(\"- text:\"):].strip()\n # Match date pattern\n date_match = re.match(r'^(\\d{4}-\\d{2}-\\d{2})

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, text)\n if date_match:\n published_at = date_match.group(1)\n continue\n\n # 4. Stats (阅读, 点赞, 评论)\n if stripped.startswith(\"- text:\"):\n text = stripped[len(\"- text:\"):].strip()\n # Match patterns like \"1000阅读\" or \"1000 阅读\"\n views_match = re.search(r'([\\d,]+)\\s*阅读', text)\n if views_match and views == 0:\n views = int(views_match.group(1).replace(\",\", \"\"))\n \n likes_match = re.search(r'([\\d,]+)\\s*点赞', text)\n if likes_match and likes == 0:\n likes = int(likes_match.group(1).replace(\",\", \"\"))\n \n comments_match = re.search(r'([\\d,]+)\\s*评论', text)\n if comments_match and comments_count == 0:\n comments_count = int(comments_match.group(1).replace(\",\", \"\"))\n\n # 5. Content\n if heading_found and stripped.startswith(\"- text:\") and len(stripped) > 20:\n text = stripped[len(\"- text:\"):].strip()\n # Skip UI elements\n if text and text.lower() not in (\"编辑\", \"删除\", \"收藏\", \"举报\", \"分享\", \"返回\", \"评论\"):\n content += \"\\n\" + text\n\n result = {\n \"url\": url,\n \"platform\": \"csdn\",\n \"title\": title,\n \"author\": author or \"未知作者\",\n \"published_at\": published_at,\n \"fetched_at\": datetime.now(timezone(timedelta(hours=8))).isoformat(),\n \"content\": content if not is_download_page else f\"共 {len(downloads)} 个下载资源\",\n \"stats\": {\n \"likes\": likes,\n \"comments\": comments_count,\n \"shares\": 0,\n \"views\": views,\n },\n \"media\": [],\n \"downloads\": downloads if is_download_page else [],\n \"comments\": [],\n \"availability\": \"partial\" if not comments_count else \"full\",\n \"unavailable_fields\": [\"comments\"] if not comments_count else [],\n }\n\n return result\n\n def to_markdown(self, data: Dict[str, Any]) -> str:\n lines = [\n \"---\",\n f\"platform: {data['platform']}\",\n f\"url: {data['url']}\",\n f\"title: \\\"{data.get('title', '')}\\\"\",\n f\"author: \\\"{data.get('author', '')}\\\"\",\n f\"published_at: \\\"{data.get('published_at', '')}\\\"\",\n f\"fetched_at: \\\"{data.get('fetched_at', '')}\\\"\",\n \"stats:\",\n f\" likes: {data.get('stats', {}).get('likes', 0)}\",\n f\" views: {data.get('stats', {}).get('views', 0)}\",\n f\" comments: {data.get('stats', {}).get('comments', 0)}\",\n f\"availability: {data.get('availability', 'partial')}\",\n \"---\",\n \"\",\n ]\n\n if data.get(\"title\"):\n lines.append(f\"# {data['title']}\\n\")\n\n if data.get(\"toc\"):\n lines.append(\"## 目录\\n\")\n for item in data[\"toc\"]:\n lines.append(f\"- {item}\")\n lines.append(\"\")\n\n if data.get(\"content\"):\n lines.append(data[\"content\"])\n\n return \"\\n\".join(lines)\n\n\n# ---------------------------------------------------------------------------\n# WeChat (微信公众号) parser — no Camofox needed, direct HTTP\n# ---------------------------------------------------------------------------\n\nclass WeixinParser(PlatformParser):\n \"\"\"Parser for WeChat Official Account articles (mp.weixin.qq.com).\"\"\"\n\n name = \"weixin\"\n\n def can_handle(self, url: str) -> bool:\n return bool(re.search(r'mp\\.weixin\\.qq\\.com', url, re.IGNORECASE))\n\n def fetch(self, url: str, port: int = 9377) -> Dict[str, Any]:\n \"\"\"Fetch WeChat article via direct HTTP (public pages, no login needed).\"\"\"\n print(f\"[fetch_china] 正在抓取微信公众号文章 {url} ...\", file=sys.stderr)\n\n try:\n req = urllib.request.Request(url, headers={\n \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36\",\n })\n with urllib.request.urlopen(req, timeout=15) as resp:\n html = resp.read().decode(\"utf-8\", errors=\"replace\")\n except Exception as e:\n # Fallback to Camofox if direct HTTP fails\n if check_camofox(port):\n print(\"[fetch_china] HTTP 失败，尝试 Camofox ...\", file=sys.stderr)\n snapshot = camofox_fetch_page(url, f\"weixin-{int(time.time())}\", wait=8, port=port)\n if snapshot:\n return self._parse_snapshot(snapshot, url)\n return {\"url\": url, \"platform\": \"weixin\", \"error\": f\"抓取失败: {e}\"}\n\n return self._parse_html(html, url)\n\n def _parse_html(self, html: str, url: str) -> Dict[str, Any]:\n \"\"\"Parse WeChat article from raw HTML.\"\"\"\n title = \"\"\n author = \"\"\n account = \"\"\n published_at = \"\"\n content = \"\"\n\n # Title: \u003cmeta property=\"og:title\" content=\"...\">\n m = re.search(r'\u003cmeta\\s+property=\"og:title\"\\s+content=\"([^\"]*)\"', html)\n if m:\n title = self._unescape_html(m.group(1))\n if not title:\n m = re.search(r'\u003ch1[^>]*class=\"rich_media_title\"[^>]*>(.*?)\u003c/h1>', html, re.DOTALL)\n if m:\n title = re.sub(r'\u003c[^>]+>', '', m.group(1)).strip()\n\n # Author: \u003cmeta name=\"author\" content=\"...\">\n m = re.search(r'\u003cmeta\\s+name=\"author\"\\s+content=\"([^\"]*)\"', html)\n if m:\n author = self._unescape_html(m.group(1))\n\n # Account name: var nickname = \"...\" or \u003ca id=\"js_name\">...\u003c/a>\n m = re.search(r'var\\s+nickname\\s*=\\s*[\"\\']([^\"\\']+)[\"\\']', html)\n if m:\n account = m.group(1)\n if not account:\n m = re.search(r'\u003ca[^>]*id=\"js_name\"[^>]*>(.*?)\u003c/a>', html, re.DOTALL)\n if m:\n account = re.sub(r'\u003c[^>]+>', '', m.group(1)).strip()\n\n # Published time: var ct = \"timestamp\"\n m = re.search(r'var\\s+ct\\s*=\\s*[\"\\'](\\d+)[\"\\']', html)\n if m:\n ts = int(m.group(1))\n dt = datetime.fromtimestamp(ts, tz=timezone(timedelta(hours=8)))\n published_at = dt.strftime(\"%Y-%m-%d %H:%M:%S\")\n\n # Content: \u003cdiv class=\"rich_media_content\" ...>...\u003c/div>\n m = re.search(\n r'\u003cdiv[^>]*class=\"rich_media_content[^\"]*\"[^>]*>(.*?)\u003c/div>\\s*(?:\u003cdiv|\u003cscript)',\n html, re.DOTALL\n )\n if m:\n raw = m.group(1)\n # Strip HTML tags but preserve paragraphs\n raw = re.sub(r'\u003cbr\\s*/?>', '\\n', raw)\n raw = re.sub(r'\u003c/p>', '\\n', raw)\n raw = re.sub(r'\u003c[^>]+>', '', raw)\n # Clean up whitespace\n raw = re.sub(r' ', ' ', raw)\n raw = self._unescape_html(raw)\n lines = [line.strip() for line in raw.split('\\n') if line.strip()]\n content = '\\n'.join(lines)\n\n # Extract images from og:image or content\n images = []\n for img_match in re.finditer(r'data-src=\"(https?://mmbiz[^\"]+)\"', html):\n img_url = img_match.group(1)\n if img_url not in images:\n images.append(img_url)\n\n result = {\n \"url\": url,\n \"platform\": \"weixin\",\n \"title\": title,\n \"author\": author or account or \"未知公众号\",\n \"account\": account,\n \"published_at\": published_at,\n \"fetched_at\": datetime.now(timezone(timedelta(hours=8))).isoformat(),\n \"content\": content,\n \"stats\": {\n \"likes\": 0,\n \"comments\": 0,\n \"shares\": 0,\n \"views\": 0,\n },\n \"media\": images,\n \"comments\": [],\n \"availability\": \"full\" if content else \"partial\",\n \"unavailable_fields\": [\"stats\"],\n }\n return result\n\n def _parse_snapshot(self, snapshot: str, url: str) -> Dict[str, Any]:\n \"\"\"Fallback: parse from Camofox snapshot.\"\"\"\n lines = snapshot.split(\"\\n\")\n title = \"\"\n content_parts = []\n\n for line in lines:\n stripped = line.strip()\n if not title and stripped.startswith(\"- heading \"):\n m = re.match(r'^- heading \"(.+?)\"\\s*\\[level=\\d\\]', stripped)\n if m:\n title = m.group(1)\n if stripped.startswith(\"- text:\"):\n text = stripped[len(\"- text:\"):].strip()\n if text and len(text) > 5:\n content_parts.append(text)\n\n return {\n \"url\": url,\n \"platform\": \"weixin\",\n \"title\": title,\n \"author\": \"未知公众号\",\n \"published_at\": \"\",\n \"fetched_at\": datetime.now(timezone(timedelta(hours=8))).isoformat(),\n \"content\": \"\\n\".join(content_parts),\n \"stats\": {\"likes\": 0, \"comments\": 0, \"shares\": 0, \"views\": 0},\n \"media\": [],\n \"comments\": [],\n \"availability\": \"partial\",\n \"unavailable_fields\": [\"stats\", \"author\"],\n }\n\n @staticmethod\n def _unescape_html(text: str) -> str:\n \"\"\"Unescape common HTML entities.\"\"\"\n text = text.replace(\"&\", \"&\")\n text = text.replace(\"<\", \"\u003c\")\n text = text.replace(\">\", \">\")\n text = text.replace(\""\", '\"')\n text = text.replace(\"'\", \"'\")\n text = text.replace(\" \", \" \")\n return text\n\n def to_markdown(self, data: Dict[str, Any]) -> str:\n lines = [\n \"---\",\n f\"platform: {data['platform']}\",\n f\"url: {data['url']}\",\n f\"title: \\\"{data.get('title', '')}\\\"\",\n f\"author: \\\"{data.get('author', '')}\\\"\",\n f\"account: \\\"{data.get('account', '')}\\\"\",\n f\"published_at: \\\"{data.get('published_at', '')}\\\"\",\n f\"fetched_at: \\\"{data.get('fetched_at', '')}\\\"\",\n f\"availability: {data.get('availability', 'full')}\",\n \"---\",\n \"\",\n ]\n if data.get(\"title\"):\n lines.append(f\"# {data['title']}\\n\")\n if data.get(\"content\"):\n lines.append(data[\"content\"])\n if data.get(\"media\"):\n lines.append(\"\\n\\n## 图片\\n\")\n for i, img in enumerate(data[\"media\"][:10], 1):\n lines.append(f\"![图片{i}]({img})\")\n return \"\\n\".join(lines)\n\n\n# ---------------------------------------------------------------------------\n# Douyin (抖音)\n# ---------------------------------------------------------------------------\n\nclass DouyinParser(PlatformParser):\n \"\"\"Parser for Douyin (抖音) videos — extracts AI chapter summaries.\"\"\"\n\n name = \"douyin\"\n\n def can_handle(self, url: str) -> bool:\n return bool(re.search(r'douyin\\.com|v\\.douyin\\.com', url, re.IGNORECASE))\n\n def _resolve_short_url(self, url: str) -> str:\n \"\"\"Resolve v.douyin.com short URLs to full douyin.com URLs.\"\"\"\n if 'v.douyin.com' not in url:\n return url\n try:\n req = urllib.request.Request(url, method='HEAD')\n req.add_header('User-Agent', 'Mozilla/5.0')\n resp = urllib.request.urlopen(req, timeout=10)\n return resp.url\n except Exception:\n return url\n\n def fetch(self, url: str, port: int = 9377) -> Dict[str, Any]:\n if not check_camofox(port):\n return {\"url\": url, \"platform\": \"douyin\", \"error\": t(\"camofox_not_running\", port=port)}\n\n print(t(\"opening_via_camofox\", url=url), file=sys.stderr)\n\n # Resolve short URL\n resolved = self._resolve_short_url(url)\n\n session_key = f\"douyin-{int(time.time())}\"\n snapshot = camofox_fetch_page(resolved, session_key, wait=12, port=port)\n\n if not snapshot:\n return {\"url\": url, \"platform\": \"douyin\", \"error\": t(\"snapshot_failed\")}\n\n data = self._parse_snapshot(snapshot, url)\n return data\n\n def _parse_snapshot(self, snapshot: str, url: str) -> Dict[str, Any]:\n \"\"\"Parse Douyin video page snapshot.\"\"\"\n lines = snapshot.split(\"\\n\")\n\n title = \"\"\n author = \"\"\n description = \"\"\n published_at = \"\"\n likes = 0\n comments = 0\n favorites = 0\n shares = 0\n chapters = []\n\n i = 0\n while i \u003c len(lines):\n line = lines[i].strip()\n\n # Title from heading\n m = re.search(r'heading \"(.+?)\"', line)\n if m and not title:\n title = m.group(1)\n\n # Author — typically a link to user profile\n if 'douyin.com/user/' in line:\n m2 = re.search(r'link \"(.+?)\"', line)\n if m2 and not author:\n author = m2.group(1)\n\n # Published time — e.g. \"2026-02-20 06:19\"\n m_time = re.search(r'(\\d{4}-\\d{2}-\\d{2}\\s+\\d{2}:\\d{2})', line)\n if m_time and not published_at:\n published_at = m_time.group(1)\n\n # Stats — look for patterns like \"22赞\" or just numbers near like/comment/share\n m_likes = re.search(r'[\"\"\"]?(\\d+(?:\\.\\d+)?万?)\\s*赞', line)\n if m_likes:\n likes = parse_wan_number(m_likes.group(1))\n\n m_comments = re.search(r'[\"\"\"]?(\\d+(?:\\.\\d+)?万?)\\s*评论', line)\n if m_comments:\n comments = parse_wan_number(m_comments.group(1))\n\n m_favs = re.search(r'[\"\"\"]?(\\d+(?:\\.\\d+)?万?)\\s*收藏', line)\n if m_favs:\n favorites = parse_wan_number(m_favs.group(1))\n\n m_shares = re.search(r'[\"\"\"]?(\\d+(?:\\.\\d+)?万?)\\s*分享', line)\n if m_shares:\n shares = parse_wan_number(m_shares.group(1))\n\n # Chapter summaries — look for timestamp patterns \"00:00\"\n m_chapter = re.search(r'^-?\\s*(?:text:?\\s*)?(\\d{1,2}:\\d{2})\\s+(.+)', line)\n if m_chapter:\n ts = m_chapter.group(1)\n chapter_title = m_chapter.group(2).strip()\n # Next line(s) may contain the summary\n summary_lines = []\n j = i + 1\n while j \u003c len(lines):\n next_line = lines[j].strip()\n if not next_line:\n j += 1\n continue\n # Stop if next chapter or non-paragraph content\n if re.search(r'^\\d{1,2}:\\d{2}\\s', next_line):\n break\n if re.match(r'^-?\\s*(?:text:?\\s*)?(\\d{1,2}:\\d{2})', next_line):\n break\n if next_line.startswith(('- img', '- link', '- heading', \"- button\")):\n break\n # Collect paragraph text\n clean = re.sub(r'^-?\\s*(?:paragraph:?\\s*|text:?\\s*)', '', next_line).strip()\n if clean:\n summary_lines.append(clean)\n j += 1\n chapters.append({\n \"timestamp\": ts,\n \"title\": chapter_title,\n \"summary\": \" \".join(summary_lines) if summary_lines else \"\",\n })\n\n # Description — long text blocks (not chapter content)\n if 'paragraph' in line.lower() or (line.startswith('- text:') and len(line) > 80):\n desc_text = re.sub(r'^-?\\s*(?:paragraph:?\\s*|text:?\\s*)', '', line).strip()\n if len(desc_text) > len(description):\n description = desc_text\n\n i += 1\n\n return {\n \"url\": url,\n \"platform\": \"douyin\",\n \"title\": title,\n \"author\": author,\n \"description\": description,\n \"published_at\": published_at,\n \"stats\": {\n \"likes\": likes,\n \"comments\": comments,\n \"favorites\": favorites,\n \"shares\": shares,\n },\n \"chapters\": chapters,\n }\n\n def to_markdown(self, data: Dict[str, Any]) -> str:\n parts = [f\"# {data.get('title', 'Douyin Video')}\\n\"]\n if data.get('author'):\n parts.append(f\"**作者**: {data['author']}\")\n if data.get('published_at'):\n parts.append(f\"**发布时间**: {data['published_at']}\")\n\n stats = data.get('stats', {})\n stats_parts = []\n if stats.get('likes'): stats_parts.append(f\"👍 {stats['likes']}\")\n if stats.get('comments'): stats_parts.append(f\"💬 {stats['comments']}\")\n if stats.get('favorites'): stats_parts.append(f\"⭐ {stats['favorites']}\")\n if stats.get('shares'): stats_parts.append(f\"🔄 {stats['shares']}\")\n if stats_parts:\n parts.append(\" | \".join(stats_parts))\n\n if data.get('description'):\n parts.append(f\"\\n## 描述\\n\\n{data['description']}\")\n\n chapters = data.get('chapters', [])\n if chapters:\n parts.append(\"\\n## 章节摘要\\n\")\n for ch in chapters:\n parts.append(f\"**{ch['timestamp']}** {ch['title']}\")\n if ch.get('summary'):\n parts.append(f\"> {ch['summary']}\\n\")\n\n parts.append(f\"\\n---\\n*来源: {data.get('url', '')}*\")\n return \"\\n\".join(parts)\n\n\n# ---------------------------------------------------------------------------\n# Xiaohongshu (小红书) parser\n# ---------------------------------------------------------------------------\n\nclass XiaohongshuParser(PlatformParser):\n \"\"\"Parser for Xiaohongshu (小红书) notes — extracts text, images, stats.\"\"\"\n\n name = \"xiaohongshu\"\n\n # Mobile API endpoint for note detail (no login required for public notes)\n _API_URL = \"https://edith.xiaohongshu.com/api/sns/web/v1/feed\"\n _SEARCH_API = \"https://edith.xiaohongshu.com/api/sns/web/v1/search/notes\"\n\n def can_handle(self, url: str) -> bool:\n return bool(re.search(r'xiaohongshu\\.com|xhslink\\.com', url, re.IGNORECASE))\n\n def _extract_note_id(self, url: str) -> Optional[str]:\n \"\"\"Extract note ID from various URL formats.\"\"\"\n # https://www.xiaohongshu.com/explore/67b8e3f5000000000b00d8e2\n # https://www.xiaohongshu.com/discovery/item/67b8e3f5000000000b00d8e2\n m = re.search(r'(?:explore|discovery/item|notes?)/([a-f0-9]{24})', url)\n if m:\n return m.group(1)\n # xhslink.com short URLs — resolve first\n if 'xhslink.com' in url:\n try:\n req = urllib.request.Request(url, method='HEAD')\n req.add_header('User-Agent', 'Mozilla/5.0')\n resp = urllib.request.urlopen(req, timeout=10)\n return self._extract_note_id(resp.url)\n except Exception:\n pass\n return None\n\n def _fetch_via_router(self, url: str) -> Optional[str]:\n \"\"\"Fetch page HTML via router's home IP (bypasses geo-block).\"\"\"\n import subprocess\n cmd_queue = \"/root/router-agent/cmd-queue\"\n cmd_output = \"/root/router-agent/cmd-output\"\n \n # Write curl command to router queue\n curl_cmd = (\n f'curl -sL \"{url}\" '\n f'-H \"User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) '\n f'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1\" '\n f'-H \"Accept: text/html\" '\n f'-H \"Accept-Language: zh-CN,zh;q=0.9\" '\n f'--max-time 15 2>/dev/null'\n )\n \n try:\n # Clear old output\n subprocess.run(['bash', '-c', f'> {cmd_output}'], timeout=3)\n # Queue command\n with open(cmd_queue, 'w') as f:\n f.write(curl_cmd)\n \n # Wait for router to execute (polls every minute)\n print(\"[xiaohongshu] 等待路由器执行抓取（最多90秒）...\", file=sys.stderr)\n for _ in range(18): # 18 * 5s = 90s\n time.sleep(5)\n try:\n with open(cmd_output, 'r') as f:\n content = f.read()\n if content and len(content) > 500:\n return content\n except FileNotFoundError:\n pass\n except Exception as e:\n print(f\"[xiaohongshu] 路由器抓取失败: {e}\", file=sys.stderr)\n return None\n\n def _parse_initial_state(self, html: str) -> Optional[Dict]:\n \"\"\"Extract __INITIAL_STATE__ JSON from SSR HTML.\"\"\"\n m = re.search(r'window\\.__INITIAL_STATE__\\s*=\\s*({.+?})\\s*\u003c/script>', html, re.DOTALL)\n if not m:\n # Try alternate pattern\n m = re.search(r'__INITIAL_STATE__\\s*=\\s*({.+?})(?:\\s*;|\\s*\u003c/)', html, re.DOTALL)\n if m:\n try:\n # XHS uses undefined in JSON, replace with null\n raw = m.group(1).replace('undefined', 'null')\n return json.loads(raw)\n except json.JSONDecodeError:\n pass\n return None\n\n def _parse_note_from_state(self, state: Dict, url: str) -> Dict[str, Any]:\n \"\"\"Parse note data from __INITIAL_STATE__.\"\"\"\n note_data = {}\n \n # Navigate the state tree to find note\n # Structure: noteDetailMap -> note_id -> note\n detail_map = state.get('note', {}).get('noteDetailMap', {})\n if not detail_map:\n detail_map = state.get('noteDetailMap', {})\n \n for note_id, wrapper in detail_map.items():\n note = wrapper.get('note', wrapper)\n \n title = note.get('title', '')\n desc = note.get('desc', '')\n \n # Author\n user = note.get('user', {})\n author = user.get('nickname', user.get('nick_name', ''))\n \n # Images\n image_list = note.get('imageList', note.get('image_list', []))\n images = []\n for img in image_list:\n img_url = img.get('urlDefault', img.get('url', img.get('url_default', '')))\n if img_url:\n images.append(img_url)\n \n # Stats\n interact = note.get('interactInfo', note.get('interact_info', {}))\n likes = parse_wan_number(str(interact.get('likedCount', interact.get('liked_count', 0))))\n collected = parse_wan_number(str(interact.get('collectedCount', interact.get('collected_count', 0))))\n comments_count = parse_wan_number(str(interact.get('commentCount', interact.get('comment_count', 0))))\n shared = parse_wan_number(str(interact.get('shareCount', interact.get('share_count', 0))))\n \n # Tags\n tag_list = note.get('tagList', note.get('tag_list', []))\n tags = [t_item.get('name', '') for t_item in tag_list if t_item.get('name')]\n \n # Time\n create_time = note.get('time', note.get('createTime', ''))\n if isinstance(create_time, (int, float)) and create_time > 1000000000:\n create_time = datetime.fromtimestamp(\n create_time / 1000 if create_time > 1e12 else create_time,\n tz=timezone(timedelta(hours=8))\n ).strftime('%Y-%m-%d %H:%M')\n \n # Type\n note_type = note.get('type', '') # 'normal' (image) or 'video'\n \n note_data = {\n \"url\": url,\n \"platform\": \"xiaohongshu\",\n \"note_id\": note_id,\n \"title\": title,\n \"author\": author,\n \"content\": desc,\n \"type\": \"video\" if note_type == 'video' else \"image\",\n \"images\": images,\n \"tags\": tags,\n \"published_at\": str(create_time),\n \"stats\": {\n \"likes\": likes,\n \"favorites\": collected,\n \"comments\": comments_count,\n \"shares\": shared,\n },\n }\n break # Take first note\n \n return note_data\n\n def _parse_snapshot(self, snapshot: str, url: str) -> Dict[str, Any]:\n \"\"\"Parse Camofox snapshot of XHS page (fallback).\"\"\"\n lines = snapshot.split(\"\\n\")\n \n title = \"\"\n author = \"\"\n content_lines = []\n likes = 0\n comments = 0\n favorites = 0\n shares = 0\n \n for line in lines:\n line = line.strip()\n \n # Title from heading\n m = re.search(r'heading \"(.+?)\"', line)\n if m and not title:\n title = m.group(1)\n \n # Author\n if 'user/profile' in line:\n m2 = re.search(r'link \"(.+?)\"', line)\n if m2 and not author:\n author = m2.group(1)\n \n # Content text\n if line.startswith('- text:') and len(line) > 20:\n text = line[8:].strip()\n if text and text not in ('发现', '发布', '通知', '关注', '收藏', '评论', '分享'):\n content_lines.append(text)\n \n # Stats\n m_likes = re.search(r'(\\d+(?:\\.\\d+)?万?)\\s*(?:赞|点赞)', line)\n if m_likes:\n likes = parse_wan_number(m_likes.group(1))\n m_fav = re.search(r'(\\d+(?:\\.\\d+)?万?)\\s*收藏', line)\n if m_fav:\n favorites = parse_wan_number(m_fav.group(1))\n m_comm = re.search(r'(\\d+(?:\\.\\d+)?万?)\\s*评论', line)\n if m_comm:\n comments = parse_wan_number(m_comm.group(1))\n \n return {\n \"url\": url,\n \"platform\": \"xiaohongshu\",\n \"title\": title,\n \"author\": author,\n \"content\": \"\\n\".join(content_lines),\n \"type\": \"unknown\",\n \"images\": [],\n \"tags\": [],\n \"published_at\": \"\",\n \"stats\": {\n \"likes\": likes,\n \"favorites\": favorites,\n \"comments\": comments,\n \"shares\": shares,\n },\n }\n\n def _fetch_via_proxy(self, url: str, proxy: str, cookies: str = None) -> Optional[str]:\n \"\"\"Fetch page HTML via user-provided proxy.\"\"\"\n try:\n proxy_handler = urllib.request.ProxyHandler({\n 'http': proxy, 'https': proxy,\n })\n opener = urllib.request.build_opener(proxy_handler)\n headers = {\n 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) '\n 'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1',\n 'Accept': 'text/html',\n 'Accept-Language': 'zh-CN,zh;q=0.9',\n }\n if cookies:\n headers['Cookie'] = cookies\n req = urllib.request.Request(url, headers=headers)\n with opener.open(req, timeout=15) as r:\n html = r.read().decode('utf-8', errors='ignore')\n if len(html) > 500:\n return html\n except Exception as e:\n print(f\"[xiaohongshu] 代理抓取失败: {e}\", file=sys.stderr)\n return None\n\n def _fetch_with_cookies(self, url: str, cookies: str) -> Optional[str]:\n \"\"\"Fetch page HTML with cookies (direct request, no proxy).\"\"\"\n try:\n req = urllib.request.Request(url, headers={\n 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) '\n 'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1',\n 'Accept': 'text/html',\n 'Accept-Language': 'zh-CN,zh;q=0.9',\n 'Cookie': cookies,\n })\n with urllib.request.urlopen(req, timeout=15) as r:\n html = r.read().decode('utf-8', errors='ignore')\n if len(html) > 500:\n return html\n except Exception as e:\n print(f\"[xiaohongshu] Cookie 抓取失败: {e}\", file=sys.stderr)\n return None\n\n def _load_cookies(self, cookies_arg: str) -> Optional[str]:\n \"\"\"Load cookies from string or file path (supports Cookie-Editor JSON export).\"\"\"\n if not cookies_arg:\n return None\n import os\n if os.path.isfile(cookies_arg):\n try:\n with open(cookies_arg, 'r') as f:\n content = f.read().strip()\n if content.startswith('['):\n data = json.loads(content)\n return '; '.join(f\"{c['name']}={c['value']}\" for c in data\n if '.xiaohongshu.com' in c.get('domain', ''))\n return content\n except Exception:\n pass\n return cookies_arg\n\n def fetch(self, url: str, port: int = 9377, proxy: str = None, cookies: str = None) -> Dict[str, Any]:\n note_id = self._extract_note_id(url)\n if not note_id:\n return {\"url\": url, \"platform\": \"xiaohongshu\", \"error\": \"无法从 URL 提取笔记 ID\"}\n \n # Normalize URL\n canonical = f\"https://www.xiaohongshu.com/explore/{note_id}\"\n \n # Load cookies\n cookie_str = self._load_cookies(cookies)\n \n # Method 0: Proxy + optional cookies (user-provided, fastest)\n if proxy:\n print(f\"[xiaohongshu] 尝试通过代理 {proxy[:30]}... 抓取\", file=sys.stderr)\n html = self._fetch_via_proxy(canonical, proxy, cookie_str)\n if html:\n state = self._parse_initial_state(html)\n if state:\n data = self._parse_note_from_state(state, url)\n if data and data.get('content'):\n return data\n \n # Method 0.5: Cookies without proxy (works if user has domestic IP)\n if cookie_str and not proxy:\n print(\"[xiaohongshu] 尝试通过 Cookies 直接抓取...\", file=sys.stderr)\n html = self._fetch_with_cookies(canonical, cookie_str)\n if html:\n state = self._parse_initial_state(html)\n if state:\n data = self._parse_note_from_state(state, url)\n if data and data.get('content'):\n return data\n \n # Method 1: Try router home IP (bypasses geo-block)\n print(\"[xiaohongshu] 尝试通过路由器家庭 IP 抓取...\", file=sys.stderr)\n html = self._fetch_via_router(canonical)\n if html:\n state = self._parse_initial_state(html)\n if state:\n data = self._parse_note_from_state(state, url)\n if data and data.get('content'):\n return data\n \n # Even without __INITIAL_STATE__, try meta tags\n title_m = re.search(r'\u003cmeta[^>]*name=\"og:title\"[^>]*content=\"([^\"]*)\"', html)\n desc_m = re.search(r'\u003cmeta[^>]*name=\"description\"[^>]*content=\"([^\"]*)\"', html)\n if desc_m and len(desc_m.group(1)) > 20:\n return {\n \"url\": url,\n \"platform\": \"xiaohongshu\",\n \"note_id\": note_id,\n \"title\": title_m.group(1) if title_m else \"\",\n \"author\": \"\",\n \"content\": desc_m.group(1),\n \"type\": \"unknown\",\n \"images\": [],\n \"tags\": [],\n \"published_at\": \"\",\n \"stats\": {},\n }\n \n # Method 2: Try Camofox browser\n if check_camofox(port):\n print(t(\"opening_via_camofox\", url=canonical), file=sys.stderr)\n snapshot = camofox_fetch_page(canonical, f\"xhs-{note_id[:8]}\", wait=10, port=port)\n if snapshot and len(snapshot) > 500:\n data = self._parse_snapshot(snapshot, url)\n if data.get('content') or data.get('title'):\n return data\n \n return {\n \"url\": url,\n \"platform\": \"xiaohongshu\",\n \"note_id\": note_id,\n \"error\": \"无法获取笔记内容。小红书需要国内 IP 或登录态。\\n\"\n \"建议: --proxy socks5://ip:port 或 --cookies 'cookie_string' 或 --cookies cookies.json\",\n }\n\n def to_markdown(self, data: Dict[str, Any]) -> str:\n parts = [f\"# {data.get('title', '小红书笔记')}\\n\"]\n if data.get('author'):\n parts.append(f\"**作者**: {data['author']}\")\n if data.get('published_at'):\n parts.append(f\"**发布时间**: {data['published_at']}\")\n if data.get('type'):\n parts.append(f\"**类型**: {data['type']}\")\n\n stats = data.get('stats', {})\n stats_parts = []\n if stats.get('likes'): stats_parts.append(f\"❤️ {stats['likes']}\")\n if stats.get('favorites'): stats_parts.append(f\"⭐ {stats['favorites']}\")\n if stats.get('comments'): stats_parts.append(f\"💬 {stats['comments']}\")\n if stats.get('shares'): stats_parts.append(f\"🔄 {stats['shares']}\")\n if stats_parts:\n parts.append(\" | \".join(stats_parts))\n\n if data.get('content'):\n parts.append(f\"\\n## 内容\\n\\n{data['content']}\")\n\n if data.get('tags'):\n parts.append(f\"\\n**标签**: {' '.join('#' + t_item for t_item in data['tags'])}\")\n\n images = data.get('images', [])\n if images:\n parts.append(f\"\\n## 图片 ({len(images)})\\n\")\n for i, img in enumerate(images, 1):\n parts.append(f\"![图片{i}]({img})\")\n\n parts.append(f\"\\n---\\n*来源: {data.get('url', '')}*\")\n return \"\\n\".join(parts)\n\n\n# ---------------------------------------------------------------------------\n# Parser registry\n# ---------------------------------------------------------------------------\n\nPARSERS = [\n WeiboParser(),\n BilibiliParser(),\n CSDNParser(),\n WeixinParser(),\n DouyinParser(),\n XiaohongshuParser(),\n]\n\n\ndef get_parser(url: str) -> Optional[PlatformParser]:\n \"\"\"Get appropriate parser for URL.\"\"\"\n for parser in PARSERS:\n if parser.can_handle(url):\n return parser\n return None\n\n\n# ---------------------------------------------------------------------------\n# Main fetch function\n# ---------------------------------------------------------------------------\n\ndef fetch(url: str, port: int = 9377, proxy: str = None, cookies: str = None) -> Dict[str, Any]:\n \"\"\"Fetch content from any supported platform.\"\"\"\n platform = identify_platform(url)\n if not platform:\n return {\"url\": url, \"error\": t(\"url_not_supported\")}\n\n parser = get_parser(url)\n if not parser:\n return {\"url\": url, \"error\": t(\"platform_unsupported\", platform=platform)}\n\n # Pass proxy/cookies to parsers that support them\n if isinstance(parser, XiaohongshuParser):\n return parser.fetch(url, port, proxy=proxy, cookies=cookies)\n return parser.fetch(url, port)\n\n\n# ---------------------------------------------------------------------------\n# CLI\n# ---------------------------------------------------------------------------\n\ndef main():\n global _lang\n\n parser = argparse.ArgumentParser(\n description=(\n \"Fetch posts from Chinese platforms (Weibo, Bilibili, CSDN, Xiaohongshu).\\n\"\n \" --url \u003cURL> Platform URL to fetch\\n\"\n \" --pretty Pretty print JSON\\n\"\n \" --text-only Human-readable output\\n\"\n \" --markdown Markdown output with YAML frontmatter\\n\"\n \" --port Camofox port (default: 9377)\\n\"\n \" --lang Language: zh (default) or en\"\n ),\n formatter_class=argparse.RawDescriptionHelpFormatter,\n )\n parser.add_argument(\"--url\", \"-u\", required=True, help=\"URL to fetch\")\n parser.add_argument(\"--pretty\", \"-p\", action=\"store_true\", help=\"Pretty print JSON\")\n parser.add_argument(\"--text-only\", \"-t\", action=\"store_true\", help=\"Human-readable output\")\n parser.add_argument(\"--markdown\", \"-m\", action=\"store_true\", help=\"Markdown output with YAML frontmatter\")\n parser.add_argument(\"--port\", type=int, default=9377, help=\"Camofox port (default: 9377)\")\n parser.add_argument(\"--proxy\", help=\"HTTP/SOCKS proxy URL (e.g. socks5://127.0.0.1:1080)\")\n parser.add_argument(\"--cookies\", help=\"Cookie string or path to cookies.json file\")\n parser.add_argument(\n \"--lang\", default=\"zh\", choices=[\"zh\", \"en\"],\n help=\"Output language: zh (default) or en\",\n )\n\n args = parser.parse_args()\n\n # Apply language setting\n _lang = args.lang\n\n indent = 2 if args.pretty else None\n\n # Fetch content\n result = fetch(args.url, port=args.port, proxy=getattr(args, 'proxy', None),\n cookies=getattr(args, 'cookies', None))\n\n # Output\n platform_parser = get_parser(args.url)\n if args.markdown:\n if platform_parser and \"error\" not in result:\n print(platform_parser.to_markdown(result))\n else:\n print(f\"# Error\\n{result.get('error', 'Unknown error')}\", file=sys.stderr)\n sys.exit(1)\n elif args.text_only:\n if platform_parser and \"error\" not in result:\n print(platform_parser.to_text(result))\n else:\n print(f\"Error: {result.get('error', 'Unknown error')}\", file=sys.stderr)\n sys.exit(1)\n else:\n print(json.dumps(result, ensure_ascii=False, indent=indent))\n\n if result.get(\"error\"):\n sys.exit(1)\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":70616,"content_sha256":"d0654a5a8e50c1857dbb410f25fbe8cd6fec90223d54bacb299946504dee45d6"},{"filename":"scripts/fetch_tweet.py","content":"#!/usr/bin/env python3\n\"\"\"\nX Tweet Fetcher - Fetch tweets from X/Twitter without login or API keys.\n\nModes:\n --url \u003cURL> Fetch single tweet via FxTwitter (zero deps)\n --url \u003cURL> --replies Fetch tweet + replies via Camofox + Nitter\n --user \u003cusername> Fetch user timeline via Camofox + Nitter\n --article \u003cURL_or_ID> Fetch X Article (long-form) full text via Camofox\n --monitor @username Monitor X mentions (incremental, cron-friendly)\n --list \u003clist_url_or_id> Fetch tweets from an X List via Camofox + Nitter\n\nNote on --article mode:\n X Articles (x.com/i/article/...) require X login to view the full content.\n Without login, Camofox will capture whatever is publicly visible (title +\n partial preview). This is an X platform limitation, not a tool limitation.\n\nNote on --monitor mode:\n Uses Google search via Camofox to find mentions. First run establishes a\n baseline (no output). Subsequent runs only report new mentions.\n Exit code: 0 = no new mentions, 1 = new mentions found (cron-friendly).\n\"\"\"\n\nimport json\nimport os\nimport re\nimport sys\nimport argparse\nimport time\nimport urllib.request\nimport urllib.error\nimport urllib.parse\nfrom pathlib import Path\nfrom typing import Optional, Dict, List, Any\n\n\n# ---------------------------------------------------------------------------\n# i18n — bilingual messages (zh default, en via --lang en)\n# ---------------------------------------------------------------------------\n\n_MESSAGES = {\n \"zh\": {\n # stderr progress\n \"opening_via_camofox\": \"[x-tweet-fetcher] 正在通过 Camofox 打开 {url} ...\",\n \"camofox_tab_error\": \"[Camofox] 打开标签页失败: {err}\",\n \"camofox_snapshot_error\": \"[Camofox] 获取快照失败: {err}\",\n # error field values (go into JSON output)\n \"err_camofox_not_running_user\": (\n \"Camofox 未在 localhost:{port} 运行。\"\n \"使用 --user 前请先启动 Camofox。\"\n \"参考: https://github.com/openclaw/camofox\"\n ),\n \"err_camofox_not_running_replies\": (\n \"Camofox 未在 localhost:{port} 运行。\"\n \"使用 --replies 前请先启动 Camofox。\"\n \"参考: https://github.com/openclaw/camofox\"\n ),\n \"err_snapshot_failed\": \"无法从 Camofox 获取页面快照\",\n \"err_mutually_exclusive\": \"错误：--user、--url、--article、--monitor 和 --list 不能同时使用\",\n \"err_no_input\": \"错误：请提供 --url 或 --user\",\n \"err_prefix\": \"错误：\",\n # warning field values\n \"warn_no_tweets\": (\n \"未解析到推文。Nitter 可能触发了频率限制，或该用户不存在，请稍后重试。\"\n ),\n \"warn_no_replies\": (\n \"未解析到评论。该推文可能没有回复，或 Nitter 触发了频率限制，请稍后重试。\"\n ),\n # text-only labels\n \"timeline_header\": \"@{user} — 最新 {count} 条推文\",\n \"replies_header\": \"{url} 的评论区\",\n \"media_label\": \"🖼 {n} 张图片\",\n \"media_label_with_urls\": \"🖼 {n} 张图片: {urls}\",\n # article/tweet text-only\n \"article_by\": \"作者 @{screen_name} | {created_at}\",\n \"article_stats\": \"点赞: {likes} | 转推: {retweets} | 浏览: {views}\",\n \"article_words\": \"字数: {word_count}\",\n \"tweet_stats\": \"\\n点赞: {likes} | 转推: {retweets} | 浏览: {views}\",\n # article mode\n \"opening_article_via_camofox\": \"[x-tweet-fetcher] 正在通过 Camofox 打开 X Article {url} ...\",\n \"err_camofox_not_running_article\": (\n \"Camofox 未在 localhost:{port} 运行。\"\n \"使用 --article 前请先启动 Camofox。\"\n \"参考: https://github.com/openclaw/camofox\"\n ),\n \"err_invalid_article\": \"无法解析 Article URL 或 ID: {input}\",\n \"article_header\": \"X Article: {title}\",\n \"article_content_label\": \"正文\",\n \"article_login_note\": (\n \"注意：X Article 需要登录才能查看完整内容。\"\n \"未登录时 Camofox 只能抓到公开部分（标题+摘要）。\"\n ),\n # FxTwitter network error\n \"err_network\": \"网络错误：重试后仍无法获取推文\",\n \"err_unexpected\": \"获取推文时发生意外错误\",\n # monitor mode\n \"monitor_baseline\": \"[monitor] 首次运行，建立基线 ({count} 条)，下次运行起报告增量。\",\n \"monitor_no_new\": \"[monitor] 无新 mentions（已知 {known} 条）。\",\n \"monitor_new_found\": \"[monitor] 发现 {count} 条新 mentions！\",\n \"monitor_searching\": \"[monitor] 搜索 mentions: {query}\",\n \"monitor_camofox_error\": (\n \"Camofox 未在 localhost:{port} 运行。\"\n \"使用 --monitor 前请先启动 Camofox。\"\n \"参考: https://github.com/openclaw/camofox\"\n ),\n \"monitor_header\": \"@{username} 的新 mentions ({count} 条)\",\n # list mode\n \"list_header\": \"X List {list_id} — 最新 {count} 条推文\",\n \"err_invalid_list\": \"无法解析 List URL 或 ID: {input}\",\n \"err_camofox_not_running_list\": (\n \"Camofox 未在 localhost:{port} 运行。\"\n \"使用 --list 前请先启动 Camofox。\"\n \"参考: https://github.com/openclaw/camofox\"\n ),\n },\n \"en\": {\n \"opening_via_camofox\": \"[x-tweet-fetcher] Opening {url} via Camofox...\",\n \"camofox_tab_error\": \"[Camofox] open tab error: {err}\",\n \"camofox_snapshot_error\": \"[Camofox] snapshot error: {err}\",\n \"err_camofox_not_running_user\": (\n \"Camofox is not running on localhost:{port}. \"\n \"Please start Camofox before using --user. \"\n \"See: https://github.com/openclaw/camofox\"\n ),\n \"err_camofox_not_running_replies\": (\n \"Camofox is not running on localhost:{port}. \"\n \"Please start Camofox before using --replies. \"\n \"See: https://github.com/openclaw/camofox\"\n ),\n \"err_snapshot_failed\": \"Failed to get page snapshot from Camofox\",\n \"err_mutually_exclusive\": \"Error: --user, --url, --article, --monitor, and --list are mutually exclusive\",\n \"err_no_input\": \"Error: provide --url or --user\",\n \"err_prefix\": \"Error: \",\n \"warn_no_tweets\": (\n \"No tweets parsed. Nitter may be rate-limited or the user doesn't exist. \"\n \"Try again later.\"\n ),\n \"warn_no_replies\": (\n \"No replies parsed. The tweet may have no replies, \"\n \"or Nitter may be rate-limited. Try again later.\"\n ),\n \"timeline_header\": \"@{user} — latest {count} tweets\",\n \"replies_header\": \"Replies to {url}\",\n \"media_label\": \"🖼 {n} media\",\n \"media_label_with_urls\": \"🖼 {n} image(s): {urls}\",\n \"article_by\": \"By @{screen_name} | {created_at}\",\n \"article_stats\": \"Likes: {likes} | Retweets: {retweets} | Views: {views}\",\n \"article_words\": \"Words: {word_count}\",\n \"tweet_stats\": \"\\nLikes: {likes} | Retweets: {retweets} | Views: {views}\",\n # article mode\n \"opening_article_via_camofox\": \"[x-tweet-fetcher] Opening X Article {url} via Camofox...\",\n \"err_camofox_not_running_article\": (\n \"Camofox is not running on localhost:{port}. \"\n \"Please start Camofox before using --article. \"\n \"See: https://github.com/openclaw/camofox\"\n ),\n \"err_invalid_article\": \"Cannot parse Article URL or ID: {input}\",\n \"article_header\": \"X Article: {title}\",\n \"article_content_label\": \"Content\",\n \"article_login_note\": (\n \"Note: X Articles require login to view full content. \"\n \"Without login, Camofox can only capture the public portion (title + preview).\"\n ),\n \"err_network\": \"Network error: Failed to fetch tweet after retry\",\n \"err_unexpected\": \"An unexpected error occurred while fetching the tweet\",\n # monitor mode\n \"monitor_baseline\": \"[monitor] First run: baseline established ({count} entries). Future runs will report incremental results.\",\n \"monitor_no_new\": \"[monitor] No new mentions (known: {known}).\",\n \"monitor_new_found\": \"[monitor] Found {count} new mention(s)!\",\n \"monitor_searching\": \"[monitor] Searching mentions: {query}\",\n \"monitor_camofox_error\": (\n \"Camofox is not running on localhost:{port}. \"\n \"Please start Camofox before using --monitor. \"\n \"See: https://github.com/openclaw/camofox\"\n ),\n \"monitor_header\": \"New mentions for @{username} ({count})\",\n # list mode\n \"list_header\": \"X List {list_id} — latest {count} tweets\",\n \"err_invalid_list\": \"Cannot parse List URL or ID: {input}\",\n \"err_camofox_not_running_list\": (\n \"Camofox is not running on localhost:{port}. \"\n \"Please start Camofox before using --list. \"\n \"See: https://github.com/openclaw/camofox\"\n ),\n },\n}\n\n# Module-level lang (set once in main(), read everywhere)\n_lang: str = \"zh\"\n\n\ndef t(key: str, **kwargs) -> str:\n \"\"\"Look up a message in the current language, formatting with kwargs.\"\"\"\n msg = _MESSAGES.get(_lang, _MESSAGES[\"zh\"]).get(key, key)\n return msg.format(**kwargs) if kwargs else msg\n\n\n# ---------------------------------------------------------------------------\n# Camofox helpers\n# ---------------------------------------------------------------------------\n\ndef check_camofox(port: int = 9377) -> bool:\n \"\"\"Return True if Camofox is reachable.\"\"\"\n try:\n req = urllib.request.Request(f\"http://localhost:{port}/tabs\", method=\"GET\")\n with urllib.request.urlopen(req, timeout=3) as resp:\n resp.read()\n return True\n except Exception:\n return False\n\n\ndef camofox_open_tab(url: str, session_key: str, port: int = 9377) -> Optional[str]:\n \"\"\"Open a new Camofox tab; return tabId or None.\"\"\"\n try:\n payload = json.dumps({\n \"userId\": \"x-tweet-fetcher\",\n \"sessionKey\": session_key,\n \"url\": url,\n }).encode()\n req = urllib.request.Request(\n f\"http://localhost:{port}/tabs\",\n data=payload,\n headers={\"Content-Type\": \"application/json\"},\n method=\"POST\",\n )\n with urllib.request.urlopen(req, timeout=10) as resp:\n data = json.loads(resp.read().decode())\n return data.get(\"tabId\")\n except Exception as e:\n print(t(\"camofox_tab_error\", err=e), file=sys.stderr)\n return None\n\n\ndef camofox_snapshot(tab_id: str, port: int = 9377) -> Optional[str]:\n \"\"\"Get Nitter page snapshot text from Camofox tab.\"\"\"\n try:\n url = f\"http://localhost:{port}/tabs/{tab_id}/snapshot?userId=x-tweet-fetcher\"\n with urllib.request.urlopen(url, timeout=15) as resp:\n data = json.loads(resp.read().decode())\n return data.get(\"snapshot\", \"\")\n except Exception as e:\n print(t(\"camofox_snapshot_error\", err=e), file=sys.stderr)\n return None\n\n\ndef camofox_close_tab(tab_id: str, port: int = 9377):\n try:\n req = urllib.request.Request(\n f\"http://localhost:{port}/tabs/{tab_id}\",\n method=\"DELETE\",\n )\n urllib.request.urlopen(req, timeout=5)\n except Exception:\n pass\n\n\ndef camofox_fetch_page(url: str, session_key: str, wait: float = 8, port: int = 9377) -> Optional[str]:\n \"\"\"Open URL in Camofox, wait, snapshot, close. Returns snapshot text.\"\"\"\n tab_id = camofox_open_tab(url, session_key, port)\n if not tab_id:\n return None\n time.sleep(wait)\n snapshot = camofox_snapshot(tab_id, port)\n camofox_close_tab(tab_id, port)\n return snapshot\n\n\n# ---------------------------------------------------------------------------\n# FxTwitter single-tweet fetch (zero deps)\n# ---------------------------------------------------------------------------\n\ndef parse_tweet_url(url: str) -> tuple:\n \"\"\"Extract username and tweet_id from X/Twitter URL.\"\"\"\n patterns = [\n r'(?:x\\.com|twitter\\.com)/([a-zA-Z0-9_]{1,15})/status/(\\d+)',\n ]\n for pattern in patterns:\n match = re.search(pattern, url)\n if match:\n username = match.group(1)\n tweet_id = match.group(2)\n if not re.match(r'^[a-zA-Z0-9_]{1,15}

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, username):\n raise ValueError(f\"Invalid username format: {username}\")\n if not tweet_id.isdigit():\n raise ValueError(f\"Invalid tweet ID format: {tweet_id}\")\n return username, tweet_id\n raise ValueError(f\"Cannot parse tweet URL: {url}\")\n\n\ndef extract_media(tweet_obj: Dict[str, Any]) -> Optional[Dict[str, Any]]:\n \"\"\"Extract media information (photos/videos) from tweet object.\"\"\"\n media_data = {}\n media = tweet_obj.get(\"media\", {})\n\n all_media = media.get(\"all\", [])\n if all_media and isinstance(all_media, list):\n photos = [item for item in all_media if item.get(\"type\") == \"photo\"]\n if photos:\n media_data[\"images\"] = []\n for photo in photos:\n image_info = {\"url\": photo.get(\"url\", \"\")}\n if photo.get(\"width\"):\n image_info[\"width\"] = photo.get(\"width\")\n if photo.get(\"height\"):\n image_info[\"height\"] = photo.get(\"height\")\n media_data[\"images\"].append(image_info)\n\n videos = media.get(\"videos\", [])\n if videos and isinstance(videos, list) and len(videos) > 0:\n media_data[\"videos\"] = []\n for video in videos:\n video_info = {}\n if video.get(\"url\"):\n video_info[\"url\"] = video.get(\"url\")\n if video.get(\"duration\"):\n video_info[\"duration\"] = video.get(\"duration\")\n if video.get(\"thumbnail_url\"):\n video_info[\"thumbnail\"] = video.get(\"thumbnail_url\")\n if video.get(\"variants\") and isinstance(video.get(\"variants\"), list):\n video_info[\"variants\"] = []\n for variant in video.get(\"variants\", []):\n variant_info = {}\n if variant.get(\"url\"):\n variant_info[\"url\"] = variant.get(\"url\")\n if variant.get(\"bitrate\"):\n variant_info[\"bitrate\"] = variant.get(\"bitrate\")\n if variant.get(\"content_type\"):\n variant_info[\"content_type\"] = variant.get(\"content_type\")\n if variant_info:\n video_info[\"variants\"].append(variant_info)\n if video_info:\n media_data[\"videos\"].append(video_info)\n\n return media_data if media_data else None\n\n\ndef fetch_tweet(url: str, timeout: int = 30) -> Dict[str, Any]:\n \"\"\"Fetch single tweet via FxTwitter API (zero deps).\"\"\"\n try:\n username, tweet_id = parse_tweet_url(url)\n except ValueError as e:\n return {\"url\": url, \"error\": str(e)}\n result = {\"url\": url, \"username\": username, \"tweet_id\": tweet_id}\n\n api_url = f\"https://api.fxtwitter.com/{username}/status/{tweet_id}\"\n\n max_attempts = 2\n for attempt in range(max_attempts):\n try:\n req = urllib.request.Request(api_url, headers={\"User-Agent\": \"Mozilla/5.0\"})\n with urllib.request.urlopen(req, timeout=timeout) as resp:\n data = json.loads(resp.read().decode())\n\n if data.get(\"code\") != 200:\n result[\"error\"] = f\"FxTwitter returned code {data.get('code')}: {data.get('message', 'Unknown')}\"\n return result\n\n tweet = data[\"tweet\"]\n tweet_data = {\n \"text\": tweet.get(\"text\", \"\"),\n \"author\": tweet.get(\"author\", {}).get(\"name\", \"\"),\n \"screen_name\": tweet.get(\"author\", {}).get(\"screen_name\", \"\"),\n \"likes\": tweet.get(\"likes\", 0),\n \"retweets\": tweet.get(\"retweets\", 0),\n \"bookmarks\": tweet.get(\"bookmarks\", 0),\n \"views\": tweet.get(\"views\", 0),\n \"replies_count\": tweet.get(\"replies\", 0),\n \"created_at\": tweet.get(\"created_at\", \"\"),\n \"is_note_tweet\": tweet.get(\"is_note_tweet\", False),\n \"lang\": tweet.get(\"lang\", \"\"),\n }\n\n media = extract_media(tweet)\n if media:\n tweet_data[\"media\"] = media\n\n if tweet.get(\"quote\"):\n qt = tweet[\"quote\"]\n tweet_data[\"quote\"] = {\n \"text\": qt.get(\"text\", \"\"),\n \"author\": qt.get(\"author\", {}).get(\"name\", \"\"),\n \"screen_name\": qt.get(\"author\", {}).get(\"screen_name\", \"\"),\n \"likes\": qt.get(\"likes\", 0),\n \"retweets\": qt.get(\"retweets\", 0),\n \"views\": qt.get(\"views\", 0),\n }\n quote_media = extract_media(qt)\n if quote_media:\n tweet_data[\"quote\"][\"media\"] = quote_media\n\n article = tweet.get(\"article\")\n if article:\n article_data = {\n \"title\": article.get(\"title\", \"\"),\n \"preview_text\": article.get(\"preview_text\", \"\"),\n \"created_at\": article.get(\"created_at\", \"\"),\n }\n content = article.get(\"content\", {})\n blocks = content.get(\"blocks\", [])\n if blocks:\n full_text = \"\\n\\n\".join(\n b.get(\"text\", \"\") for b in blocks if b.get(\"text\", \"\")\n )\n article_data[\"full_text\"] = full_text\n article_data[\"word_count\"] = len(full_text.split())\n article_data[\"char_count\"] = len(full_text)\n # 提取 article 内的图片\n article_images = []\n cover = article.get(\"cover_media\", {})\n if cover:\n cover_url = cover.get(\"media_info\", {}).get(\"original_img_url\")\n if cover_url:\n article_images.append({\"type\": \"cover\", \"url\": cover_url})\n for entity in article.get(\"media_entities\", []):\n img_url = entity.get(\"media_info\", {}).get(\"original_img_url\")\n if img_url:\n article_images.append({\"type\": \"image\", \"url\": img_url})\n if article_images:\n article_data[\"images\"] = article_images\n article_data[\"image_count\"] = len(article_images)\n\n tweet_data[\"article\"] = article_data\n tweet_data[\"is_article\"] = True\n else:\n tweet_data[\"is_article\"] = False\n\n result[\"tweet\"] = tweet_data\n return result\n\n except urllib.error.URLError:\n if attempt \u003c max_attempts - 1:\n time.sleep(1)\n continue\n else:\n result[\"error\"] = t(\"err_network\")\n return result\n except urllib.error.HTTPError as e:\n result[\"error\"] = f\"HTTP {e.code}: {e.reason}\"\n return result\n except Exception:\n result[\"error\"] = t(\"err_unexpected\")\n return result\n\n return result\n\n\n# ---------------------------------------------------------------------------\n# Nitter snapshot parsers\n# ---------------------------------------------------------------------------\n\ndef _parse_stats_from_text(raw: str) -> tuple:\n \"\"\"Parse stats numbers from Nitter text line like 'content 1 22 4,418'.\n\n Nitter renders stats as plain numbers separated by spaces (no icon chars on timeline).\n Returns (cleaned_text, replies, retweets, likes, views).\n \"\"\"\n # Pattern: text content followed by 2–4 space-separated numbers at end\n # e.g. \"我已经打通... 1 22 4,418\"\n # Numbers may have commas (thousands separator)\n stat_match = re.search(\n r\"^(.*?)\\s{2,}(\\d[\\d,]*)\\s{2,}(\\d[\\d,]*)\\s{2,}(\\d[\\d,]*)$\",\n raw.rstrip(),\n )\n if stat_match:\n text_part = stat_match.group(1).strip()\n nums = [int(stat_match.group(i).replace(\",\", \"\")) for i in (2, 3, 4)]\n # Nitter columns: replies | retweets | likes (views sometimes separate)\n return text_part, nums[0], nums[1], nums[2], 0\n\n # Only 2 trailing numbers\n stat_match2 = re.search(\n r\"^(.*?)\\s{2,}(\\d[\\d,]*)\\s{2,}(\\d[\\d,]*)$\",\n raw.rstrip(),\n )\n if stat_match2:\n text_part = stat_match2.group(1).strip()\n nums = [int(stat_match2.group(i).replace(\",\", \"\")) for i in (2, 3)]\n return text_part, nums[0], 0, nums[1], 0\n\n # Private-use unicode icon stats (from replies page or some Nitter versions)\n icon_match = re.search(\n r\"^(.*?)\\s*\\ue803\\s*(\\d+)\\s*\\ue80c\\s*\\ue801\\s*(\\d+)\\s*\\ue800\\s*(\\d+)\",\n raw,\n )\n if icon_match:\n return (\n icon_match.group(1).strip(),\n int(icon_match.group(2)),\n 0,\n int(icon_match.group(3)),\n int(icon_match.group(4)),\n )\n\n # No stats found — clean any icon chars and return raw text\n cleaned = re.sub(r\"\\s*[\\ue800-\\ue8ff]\\s*[\\d,]+\", \"\", raw).strip()\n return cleaned, 0, 0, 0, 0\n\n\ndef parse_timeline_snapshot(snapshot: str, limit: int = 20) -> List[Dict]:\n \"\"\"Parse Nitter user timeline page snapshot into tweet list.\n\n Nitter snapshot format (Camofox aria snapshot):\n Page starts with a TOC section (bare link anchors with no surrounding content),\n then the actual tweet cards follow. Each tweet card:\n\n - link [eN]: ← tweet permalink (url ends with /status/ID#m)\n - link [eN]: ← (optional) avatar/profile link\n - link \"AuthorName\": ← author display name\n - text: ... ← (optional blank)\n - link \"@handle\": ← author @handle\n - link \"10h\": ← timestamp (url also points to /status/ID#m)\n - link \"#hashtag\": ← optional hashtags / inline links\n - text: tweet content 1 5 1,234 ← text (+ optional trailing stats)\n - link [eN]: ← optional media (url has /pic/orig/media%2F...)\n - text: 1 7 541 ← optional separate stats-only line after media\n \"\"\"\n tweets = []\n lines = snapshot.split(\"\\n\")\n n = len(lines)\n\n # ── Step 1: collect all bare-link tweet anchors ────────────────────────\n # Format: \"- link [eN]:\" followed by \" - /url: /user/status/DIGITS#m\"\n all_anchors = [] # (line_index, status_path)\n for i in range(n - 1):\n line = lines[i].strip()\n if not re.match(r'^- link \\[e\\d+\\]:

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, line):\n continue\n url_line = lines[i + 1].strip()\n url_match = re.match(r'^- /url:\\s+(/\\w+/status/(\\d+)#m)

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, url_line)\n if url_match:\n all_anchors.append((i, url_match.group(1)))\n\n # ── Step 2: separate TOC anchors from content anchors ─────────────────\n # TOC anchors appear in the top section where consecutive anchors are packed\n # together (next line after the /url: is another anchor or a nav list).\n # Content anchors have author name / text within a window of ~5 lines.\n def _is_content_anchor(anchor_idx: int) -> bool:\n \"\"\"True if this anchor is followed by author/text (not another anchor).\"\"\"\n i, _ = all_anchors[anchor_idx]\n # Look at lines i+2 … i+8 for a named link or text\n for j in range(i + 2, min(n, i + 8)):\n stripped = lines[j].strip()\n if re.match(r'^- link \"[^\"]+\"\\s*(\\[e\\d+\\])?:?

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, stripped):\n return True # named link → content\n if stripped.startswith(\"- text:\"):\n return True # text line → content\n if re.match(r'^- link \\[e\\d+\\]:

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, stripped):\n return False # another bare link → still in TOC\n if stripped.startswith(\"- list:\"):\n return False # nav list → still in header area\n return False\n\n content_anchors = [\n a for idx, a in enumerate(all_anchors)\n if _is_content_anchor(idx)\n ]\n\n # ── Step 3: parse each content tweet block ─────────────────────────────\n for idx, (start_i, tweet_path) in enumerate(content_anchors):\n if len(tweets) >= limit:\n break\n\n end_i = content_anchors[idx + 1][0] if idx + 1 \u003c len(content_anchors) else n\n\n author_name = None\n author_handle = None\n time_ago = None\n text_parts: List[str] = []\n stats_set = False\n likes = 0\n retweets = 0\n replies_count = 0\n views = 0\n media_urls = []\n\n for j in range(start_i, min(end_i, start_i + 60)):\n line = lines[j].strip()\n\n # Author display name: - link \"Name\" [eN]: or - link \"Name\":\n if not author_name:\n m = re.match(r'^- link \"([^@#][^\"]*?)\"\\s*(\\[e\\d+\\])?:?

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, line)\n if m:\n name = m.group(1).strip()\n skip = (\n re.match(r'^\\d+[smhd]

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, name)\n or re.match(r'^[A-Z][a-z]{2} \\d+', name)\n or name.lower() in (\n \"nitter\", \"logo\", \"more replies\",\n \"tweets\", \"tweets & replies\", \"media\", \"search\",\n \"pinned tweet\", \"retweeted\",\n )\n or name == \"\"\n )\n if not skip:\n author_name = name\n\n # Author @handle\n if not author_handle:\n m = re.match(r'^- link \"@(\\w+)\"\\s*(\\[e\\d+\\])?:?

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, line)\n if m:\n author_handle = f\"@{m.group(1)}\"\n\n # Timestamp\n if not time_ago:\n m = re.match(r'^- link \"(\\d+[smhd])\"\\s*(\\[e\\d+\\])?:?

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, line)\n if m:\n time_ago = m.group(1)\n if not time_ago:\n m = re.match(r'^- link \"([A-Z][a-z]{2} \\d+(?:, \\d{4})?)\"\\s*(\\[e\\d+\\])?:?

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, line)\n if m:\n time_ago = m.group(1)\n\n # Text lines (may be multiple for multi-para tweets or embedded @mentions)\n if line.startswith(\"- text:\"):\n raw = line[len(\"- text:\"):].strip()\n if not raw:\n continue\n text_part, rc, rt, lk, vw = _parse_stats_from_text(raw)\n if lk or rc:\n # Stats found — capture only once\n if not stats_set:\n likes = lk\n retweets = rt\n replies_count = rc\n views = vw\n stats_set = True\n if text_part:\n # Skip label-like lines\n skip_labels = {\"pinned tweet\", \"retweeted\", \"\"}\n if text_part.strip().lower() not in skip_labels:\n text_parts.append(text_part.strip())\n\n # Media URL\n url_match = re.match(r'^- /url:\\s+(/pic/orig/(.+))

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, line)\n if url_match:\n encoded = url_match.group(2)\n decoded = urllib.parse.unquote(encoded)\n if decoded.startswith(\"media/\"):\n media_file = decoded[6:]\n media_url = f\"https://pbs.twimg.com/media/{media_file}\"\n if media_url not in media_urls:\n media_urls.append(media_url)\n\n tweet_text = \" \".join(text_parts).strip() if text_parts else None\n\n if tweet_text and author_handle:\n tweet_entry = {\n \"author\": author_handle,\n \"author_name\": author_name or author_handle,\n \"text\": tweet_text,\n \"time_ago\": time_ago or \"\",\n \"likes\": likes,\n \"retweets\": retweets,\n \"replies\": replies_count,\n \"views\": views,\n }\n if media_urls:\n tweet_entry[\"media\"] = media_urls\n\n # Deduplicate by (author, text)\n key = (author_handle, tweet_text[:80])\n if not any(\n (t[\"author\"], t[\"text\"][:80]) == key\n for t in tweets\n ):\n tweets.append(tweet_entry)\n\n return tweets\n\n\ndef parse_replies_snapshot(snapshot: str, original_author: str) -> List[Dict]:\n \"\"\"Parse replies from Nitter tweet page snapshot.\n\n Each reply block in Nitter looks like:\n - link [eN]: ← reply permalink (url /author/status/ID#m)\n - link \"AuthorName\": ← replier display name\n - link \"@handle\": ← replier handle\n - link \"12h\": ← time ago (OR \"Feb 15\" for older)\n - text: Replying to ← reply marker\n - link \"@original\": ← who they replied to\n - text: reply content ← actual text (may have stats at end)\n - link [eN]: ← optional media\n - text: 1 0 60 ← optional stats-only line\n \"\"\"\n replies = []\n lines = snapshot.split(\"\\n\")\n n = len(lines)\n\n i = 0\n while i \u003c len(lines):\n line = lines[i].strip()\n\n if line == \"- text: Replying to\":\n author_handle = None\n author_name = None\n reply_text = None\n reply_tweet_id = None # 新增：回复的 tweet ID（用于递归抓嵌套）\n time_ago = None\n likes = 0\n replies_count = 0\n views = 0\n media_urls = []\n links = [] # 新增：提取评论中的链接\n thread_replies = [] # 新增：嵌套回复\n stats_set = False\n\n # Scan backwards for author info (within ~15 lines)\n for j in range(i - 1, max(0, i - 15), -1):\n prev = lines[j].strip()\n\n # Extract reply tweet ID from permalink: /url: /author/status/12345#m\n if not reply_tweet_id:\n tid_m = re.match(r'^- /url:\\s+/\\w+/status/(\\d+)#m

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, prev)\n if tid_m:\n reply_tweet_id = tid_m.group(1)\n\n # @handle (not the original author)\n if not author_handle:\n m = re.match(r'^- link \"@(\\w+)\"\\s*(\\[e\\d+\\])?:?

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, prev)\n if m and m.group(1).lower() != original_author.lower():\n author_handle = f\"@{m.group(1)}\"\n\n # Display name (not time, not nav items)\n if not author_name:\n m = re.match(r'^- link \"([^@#][^\"]*?)\"\\s*(\\[e\\d+\\])?:?

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, prev)\n if m:\n name = m.group(1).strip()\n is_time = bool(\n re.match(r'^\\d+[smhd]

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, name)\n or re.match(r'^[A-Z][a-z]{2} \\d+', name)\n )\n is_skip = name.lower() in (\n \"nitter\", \"logo\", \"more replies\", \"\"\n )\n if not is_time and not is_skip:\n author_name = name\n\n # Timestamp (short: \"12h\") or date (\"Feb 15\")\n if not time_ago:\n m = re.match(r'^- link \"(\\d+[smhd])\"\\s*(\\[e\\d+\\])?:?

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, prev)\n if m:\n time_ago = m.group(1)\n if not time_ago:\n m = re.match(r'^- link \"([A-Z][a-z]{2} \\d+(?:, \\d{4})?)\"\\s*(\\[e\\d+\\])?:?

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, prev)\n if m:\n time_ago = m.group(1)\n\n if author_handle and author_name and time_ago:\n break\n\n # Scan forward for reply text and media (skip \"@original\" link line)\n for j in range(i + 1, min(n, i + 20)):\n fwd = lines[j].strip()\n\n # Skip the \"@original_author\" line right after \"Replying to\"\n if re.match(r'^- link \"@\\w+\"\\s*(\\[e\\d+\\])?:?

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, fwd):\n continue\n\n if fwd.startswith(\"- text:\"):\n raw = fwd[len(\"- text:\"):].strip()\n if not raw:\n continue\n\n text_part, rc, rt, lk, vw = _parse_stats_from_text(raw)\n\n # Capture stats once\n if (lk or rc or vw) and not stats_set:\n likes = lk\n replies_count = rc\n views = vw\n stats_set = True\n\n if text_part and not reply_text:\n skip_labels = {\"replying to\", \"\"}\n if text_part.strip().lower() not in skip_labels:\n reply_text = text_part.strip()\n\n # Media URL line\n url_match = re.match(r'^- /url:\\s+(/pic/orig/(.+))

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, fwd)\n if url_match:\n encoded = url_match.group(2)\n decoded = urllib.parse.unquote(encoded)\n if decoded.startswith(\"media/\"):\n media_file = decoded[6:]\n media_url = f\"https://pbs.twimg.com/media/{media_file}\"\n if media_url not in media_urls:\n media_urls.append(media_url)\n\n # Link URL line: extract from /url: lines following any link element\n link_url_match = re.match(r'^- /url:\\s+(.+)

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, fwd)\n if link_url_match:\n url_part = link_url_match.group(1).strip()\n # Skip media URLs (already handled above)\n if not url_part.startswith(\"/pic/\"):\n decoded_url = urllib.parse.unquote(url_part)\n # Filter out relative paths and keep valid URLs\n if decoded_url.startswith(\"http\"):\n if decoded_url not in links:\n links.append(decoded_url)\n\n # Named link where the link text itself is a URL:\n # e.g. - link \"https://github.com/some/repo\":\n named_link_match = re.match(r'^- link \"([^\"]+)\"\\s*(\\[e\\d+\\])?:?

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, fwd)\n if named_link_match:\n link_text = named_link_match.group(1).strip()\n if link_text.startswith(\"http\"):\n if link_text not in links:\n links.append(link_text)\n\n # Stop at next \"Replying to\" block - but collect nested replies first\n if fwd == \"- text: Replying to\":\n # Continue scanning for nested replies within this thread\n # Skip the @original line and continue parsing nested content\n nested_reply_text = None\n nested_time_ago = None\n nested_likes = 0\n nested_replies_count = 0\n nested_views = 0\n \n for k in range(j + 1, min(n, j + 15)):\n nested_line = lines[k].strip()\n \n # Skip @handle lines\n if re.match(r'^- link \"@\\w+\"\\s*(\\[e\\d+\\])?:?

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, nested_line):\n continue\n \n # Check for timestamp\n if not nested_time_ago:\n m = re.match(r'^- link \"(\\d+[smhd])\"\\s*(\\[e\\d+\\])?:?

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, nested_line)\n if m:\n nested_time_ago = m.group(1)\n \n # Parse nested reply text\n if nested_line.startswith(\"- text:\"):\n raw = nested_line[len(\"- text:\"):].strip()\n if raw:\n text_part, rc, rt, lk, vw = _parse_stats_from_text(raw)\n if text_part and not nested_reply_text:\n skip_labels = {\"replying to\", \"\"}\n if text_part.strip().lower() not in skip_labels:\n nested_reply_text = text_part.strip()\n nested_likes = lk\n nested_replies_count = rc\n nested_views = vw\n \n # Stop at next \"Replying to\" block\n if nested_line == \"- text: Replying to\":\n break\n \n if nested_reply_text:\n thread_replies.append({\n \"text\": nested_reply_text,\n \"time_ago\": nested_time_ago,\n \"likes\": nested_likes,\n \"replies\": nested_replies_count,\n \"views\": nested_views\n })\n \n # Now break for the main loop\n break\n\n if author_handle and reply_text:\n reply = {\n \"author\": author_handle,\n \"author_name\": author_name or author_handle,\n \"text\": reply_text,\n \"time_ago\": time_ago,\n \"likes\": likes,\n \"replies\": replies_count,\n \"views\": views,\n }\n if reply_tweet_id:\n reply[\"tweet_id\"] = reply_tweet_id\n if media_urls:\n reply[\"media\"] = media_urls\n if links:\n reply[\"links\"] = links\n if thread_replies:\n reply[\"thread_replies\"] = thread_replies\n\n # Deduplicate\n if not any(\n r[\"author\"] == author_handle and r[\"text\"] == reply_text\n for r in replies\n ):\n replies.append(reply)\n\n i += 1\n\n return replies\n\n\n# ---------------------------------------------------------------------------\n# High-level feature functions\n# ---------------------------------------------------------------------------\n\ndef extract_next_cursor(snapshot: str) -> Optional[str]:\n \"\"\"Extract the next-page cursor from a Nitter timeline snapshot.\n\n Nitter aria snapshot format for the \"Load more\" link:\n - link \"Load more\" [eN]:\n - /url: \"?cursor=XXXXXX\"\n\n Returns the raw cursor string (URL-decoded), or None if not found.\n \"\"\"\n lines = snapshot.split(\"\\n\")\n for i, line in enumerate(lines):\n if 'link \"Load more\"' in line:\n # Next line should be the /url: line\n for j in range(i + 1, min(len(lines), i + 4)):\n url_line = lines[j].strip()\n m = re.match(r'^- /url:\\s+\"?\\?cursor=([^\"&\\s]+)\"?', url_line)\n if m:\n return urllib.parse.unquote(m.group(1))\n return None\n\n\ndef fetch_user_timeline(\n username: str,\n limit: int = 20,\n camofox_port: int = 9377,\n nitter_instance: str = \"nitter.net\",\n) -> Dict[str, Any]:\n \"\"\"Fetch user timeline via Camofox + Nitter, with multi-page support.\n\n When limit > ~20 (one page), automatically follows Nitter's cursor-based\n pagination until enough tweets are collected or no more pages exist.\n \"\"\"\n result = {\"username\": username, \"limit\": limit}\n\n if not check_camofox(camofox_port):\n result[\"error\"] = t(\"err_camofox_not_running_user\", port=camofox_port)\n return result\n\n tweets: List[Dict] = []\n cursor: Optional[str] = None\n page = 1\n MAX_PAGES = 6 # safety cap — never fetch more than ~120 tweets\n\n while len(tweets) \u003c limit and page \u003c= MAX_PAGES:\n if cursor:\n encoded = urllib.parse.quote(cursor, safe=\"\")\n nitter_url = f\"https://{nitter_instance}/{username}?cursor={encoded}\"\n else:\n nitter_url = f\"https://{nitter_instance}/{username}\"\n\n print(\n f\"[x-tweet-fetcher] 翻页 {page}/{MAX_PAGES} — {nitter_url}\",\n file=sys.stderr,\n )\n\n snapshot = camofox_fetch_page(\n nitter_url,\n session_key=f\"timeline-{username}-p{page}\",\n wait=8,\n port=camofox_port,\n )\n\n if not snapshot:\n if page == 1:\n result[\"error\"] = t(\"err_snapshot_failed\")\n return result\n # Partial failure on later pages — stop gracefully\n print(f\"[x-tweet-fetcher] 第 {page} 页快照失败，停止翻页\", file=sys.stderr)\n break\n\n remaining = limit - len(tweets)\n new_tweets = parse_timeline_snapshot(snapshot, limit=remaining)\n\n # Deduplicate across pages by (author, text[:80])\n seen = {(tw[\"author\"], tw[\"text\"][:80]) for tw in tweets}\n for tw in new_tweets:\n key = (tw[\"author\"], tw[\"text\"][:80])\n if key not in seen:\n tweets.append(tw)\n seen.add(key)\n\n print(\n f\"[x-tweet-fetcher] 第 {page} 页: +{len(new_tweets)} 条，累计 {len(tweets)} 条\",\n file=sys.stderr,\n )\n\n if len(new_tweets) == 0:\n break # no tweets on this page — Nitter probably rate-limited\n\n # Extract cursor for next page\n cursor = extract_next_cursor(snapshot)\n if not cursor:\n break # no more pages\n\n page += 1\n if len(tweets) \u003c limit:\n time.sleep(2) # be polite between pages\n\n result[\"tweets\"] = tweets\n result[\"count\"] = len(tweets)\n result[\"pages_fetched\"] = page\n\n if len(tweets) == 0:\n result[\"warning\"] = t(\"warn_no_tweets\")\n\n return result\n\n\ndef extract_list_id(input_str: str) -> Optional[str]:\n \"\"\"Extract list ID from a URL or raw ID string.\n\n Accepts:\n - Pure numeric ID: \"123456789\"\n - List URL: \"https://x.com/i/lists/123456789\"\n - List URL (twitter.com): \"https://twitter.com/i/lists/123456789\"\n - List URL (no scheme): \"x.com/i/lists/123456789\"\n\n Returns the list ID string (digits only), or None if unparseable.\n \"\"\"\n input_str = input_str.strip()\n\n # Pure numeric ID\n if re.match(r'^\\d+

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, input_str):\n return input_str\n\n # URL containing /i/lists/\u003cid>\n m = re.search(r'/i/lists/(\\d+)', input_str)\n if m:\n return m.group(1)\n\n return None\n\n\ndef fetch_list_tweets(\n list_id: str,\n limit: int = 20,\n camofox_port: int = 9377,\n nitter_instance: str = \"nitter.net\",\n) -> Dict[str, Any]:\n \"\"\"Fetch tweets from an X List via Camofox + Nitter, with multi-page support.\n\n When limit > ~20 (one page), automatically follows Nitter's cursor-based\n pagination until enough tweets are collected or no more pages exist.\n \"\"\"\n result = {\"list_id\": list_id, \"limit\": limit}\n\n if not check_camofox(camofox_port):\n result[\"error\"] = t(\"err_camofox_not_running_list\", port=camofox_port)\n return result\n\n tweets: List[Dict] = []\n cursor: Optional[str] = None\n page = 1\n MAX_PAGES = 10 # safety cap — never fetch more than ~200 tweets\n\n while len(tweets) \u003c limit and page \u003c= MAX_PAGES:\n if cursor:\n encoded = urllib.parse.quote(cursor, safe=\"\")\n nitter_url = f\"https://{nitter_instance}/i/lists/{list_id}?cursor={encoded}\"\n else:\n nitter_url = f\"https://{nitter_instance}/i/lists/{list_id}\"\n\n print(\n f\"[x-tweet-fetcher] 翻页 {page}/{MAX_PAGES} — {nitter_url}\",\n file=sys.stderr,\n )\n\n snapshot = camofox_fetch_page(\n nitter_url,\n session_key=f\"list-{list_id}-p{page}\",\n wait=8,\n port=camofox_port,\n )\n\n if not snapshot:\n if page == 1:\n result[\"error\"] = t(\"err_snapshot_failed\")\n return result\n # Partial failure on later pages — stop gracefully\n print(f\"[x-tweet-fetcher] 第 {page} 页快照失败，停止翻页\", file=sys.stderr)\n break\n\n remaining = limit - len(tweets)\n new_tweets = parse_timeline_snapshot(snapshot, limit=remaining)\n\n # Deduplicate across pages by (author, text[:80])\n seen = {(tw[\"author\"], tw[\"text\"][:80]) for tw in tweets}\n for tw in new_tweets:\n key = (tw[\"author\"], tw[\"text\"][:80])\n if key not in seen:\n tweets.append(tw)\n seen.add(key)\n\n print(\n f\"[x-tweet-fetcher] 第 {page} 页: +{len(new_tweets)} 条，累计 {len(tweets)} 条\",\n file=sys.stderr,\n )\n\n if len(new_tweets) == 0:\n break # no tweets on this page — Nitter probably rate-limited\n\n # Extract cursor for next page\n cursor = extract_next_cursor(snapshot)\n if not cursor:\n break # no more pages\n\n page += 1\n if len(tweets) \u003c limit:\n time.sleep(2) # be polite between pages\n\n result[\"tweets\"] = tweets\n result[\"count\"] = len(tweets)\n result[\"pages_fetched\"] = page\n\n if len(tweets) == 0:\n result[\"warning\"] = t(\"warn_no_tweets\")\n\n return result\n\n\n\ndef fetch_tweet_replies(\n url: str,\n camofox_port: int = 9377,\n nitter_instance: str = \"nitter.net\",\n) -> Dict[str, Any]:\n \"\"\"Fetch tweet replies via Camofox + Nitter.\"\"\"\n try:\n username, tweet_id = parse_tweet_url(url)\n except ValueError as e:\n return {\"url\": url, \"error\": str(e)}\n\n result = {\"url\": url, \"username\": username, \"tweet_id\": tweet_id}\n\n if not check_camofox(camofox_port):\n result[\"error\"] = t(\"err_camofox_not_running_replies\", port=camofox_port)\n return result\n\n nitter_url = f\"https://{nitter_instance}/{username}/status/{tweet_id}\"\n print(t(\"opening_via_camofox\", url=nitter_url), file=sys.stderr)\n\n snapshot = camofox_fetch_page(\n nitter_url,\n session_key=f\"replies-{tweet_id}\",\n wait=8,\n port=camofox_port,\n )\n\n if not snapshot:\n result[\"error\"] = t(\"err_snapshot_failed\")\n return result\n\n replies = parse_replies_snapshot(snapshot, original_author=username)\n\n # ── 递归抓取嵌套回复（Issue #24 修复） ──\n # 对有 replies > 0 且有 tweet_id 的评论，访问其独立 status 页面\n # 获取嵌套回复内容（Nitter 评论区页面不展开嵌套回复）\n for reply in replies:\n if reply.get(\"replies\", 0) > 0 and reply.get(\"tweet_id\"):\n reply_author = reply[\"author\"].lstrip(\"@\")\n reply_tid = reply[\"tweet_id\"]\n nested_url = f\"https://{nitter_instance}/{reply_author}/status/{reply_tid}\"\n print(\n f\"[x-tweet-fetcher] 抓取嵌套回复: {reply_author}/status/{reply_tid}\",\n file=sys.stderr,\n )\n\n nested_snapshot = camofox_fetch_page(\n nested_url,\n session_key=f\"nested-{reply_tid}\",\n wait=8,\n port=camofox_port,\n )\n\n if nested_snapshot:\n nested_replies = parse_replies_snapshot(\n nested_snapshot, original_author=reply_author\n )\n if nested_replies:\n reply[\"thread_replies\"] = nested_replies\n\n result[\"replies\"] = replies\n result[\"reply_count\"] = len(replies)\n\n if len(replies) == 0:\n result[\"warning\"] = t(\"warn_no_replies\")\n\n return result\n\n\n# ---------------------------------------------------------------------------\n# X Article helpers\n# ---------------------------------------------------------------------------\n\ndef parse_article_id(input_str: str) -> Optional[str]:\n \"\"\"Extract article ID from a URL or raw ID string.\n\n Accepts:\n - Pure numeric ID: \"2011779830157557760\"\n - Article URL: \"https://x.com/i/article/2011779830157557760\"\n - Article URL (no scheme): \"x.com/i/article/2011779830157557760\"\n - Tweet URL whose text links to an article (pass the ID directly in that case)\n\n Returns the article ID string, or None if unparseable.\n \"\"\"\n input_str = input_str.strip()\n\n # Pure numeric ID\n if re.match(r'^\\d{10,25}

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, input_str):\n return input_str\n\n # URL containing /i/article/\u003cid>\n m = re.search(r'/i/article/(\\d{10,25})', input_str)\n if m:\n return m.group(1)\n\n return None\n\n\ndef parse_article_snapshot(snapshot: str) -> Dict[str, Any]:\n \"\"\"Parse an X Article page snapshot (Camofox aria snapshot) into structured data.\n\n X Article accessibility tree structure (observed):\n - heading \"Article title\" ← article title\n - text: @AuthorHandle ← author handle\n - text: Author Name ← author display name\n - text: \u003cdate> ← publish date\n - text: paragraph 1\n - text: paragraph 2\n ...\n\n Because X requires login for full content, the snapshot may only contain\n title + preview/teaser. We capture whatever is available.\n\n Returns a dict with keys:\n title, author, author_handle, paragraphs, content, word_count, char_count,\n is_partial (True when content is likely truncated due to login wall)\n \"\"\"\n lines = snapshot.split(\"\\n\")\n title: Optional[str] = None\n author_handle: Optional[str] = None\n author_name: Optional[str] = None\n paragraphs: List[str] = []\n\n # Patterns\n heading_re = re.compile(r'^-\\s+heading\\s+\"(.+)\"', re.IGNORECASE)\n text_re = re.compile(r'^-\\s+text:\\s+(.*)')\n link_re = re.compile(r'^-\\s+link\\s+\"([^\"]+)\"')\n handle_re = re.compile(r'^@(\\w+)

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

)\n\n # Strings to skip (navigation / boilerplate / empty)\n _SKIP_TEXTS = {\n \"\", \"x\", \"home\", \"explore\", \"notifications\", \"messages\", \"grok\",\n \"profile\", \"more\", \"post\", \"log in\", \"sign up\", \"sign in\",\n \"already have an account?\", \"don't have an account?\",\n \"subscribe\", \"get the app\", \"help\", \"settings\", \"privacy policy\",\n \"terms of service\", \"cookie policy\", \"accessibility\",\n \"ads info\", \"more options\", \"follow\", \"following\",\n }\n\n def _is_skip(text: str) -> bool:\n stripped = text.strip().lower()\n return stripped in _SKIP_TEXTS or len(stripped) \u003c 2\n\n i = 0\n while i \u003c len(lines):\n line = lines[i].strip()\n\n # ── Heading → title ────────────────────────────────────────────────\n m = heading_re.match(line)\n if m and not title:\n candidate = m.group(1).strip()\n if not _is_skip(candidate):\n title = candidate\n i += 1\n continue\n\n # ── text: lines ────────────────────────────────────────────────────\n m = text_re.match(line)\n if m:\n raw = m.group(1).strip()\n\n # Author @handle\n hm = handle_re.match(raw)\n if hm and not author_handle:\n author_handle = raw # keep with @\n i += 1\n continue\n\n # Skip boilerplate\n if _is_skip(raw):\n i += 1\n continue\n\n # Skip short date-like strings immediately after author info\n # (e.g. \"Feb 10, 2025\") — we don't extract date for now, just skip\n if re.match(r'^[A-Z][a-z]{2}\\s+\\d{1,2},?\\s+\\d{4}

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, raw):\n i += 1\n continue\n\n # Author display name heuristic: single line, no spaces (but allow\n # names like \"John Doe\"), appears early before paragraphs, not a sentence\n if not author_name and not paragraphs and len(raw.split()) \u003c= 4 and not raw.endswith(\".\"):\n author_name = raw\n i += 1\n continue\n\n # Everything else is paragraph content\n paragraphs.append(raw)\n i += 1\n continue\n\n # ── Named links can sometimes be author name or article sub-heading ─\n m = link_re.match(line)\n if m:\n text = m.group(1).strip()\n hm = handle_re.match(text)\n if hm and not author_handle:\n author_handle = text\n elif not _is_skip(text) and not author_name and not paragraphs:\n author_name = text\n i += 1\n continue\n\n i += 1\n\n content = \"\\n\\n\".join(paragraphs)\n word_count = len(content.split()) if content else 0\n char_count = len(content)\n\n # Heuristic: if content is very short (\u003c 100 chars), likely login wall\n is_partial = char_count \u003c 100\n\n return {\n \"title\": title or \"\",\n \"author\": author_name or \"\",\n \"author_handle\": author_handle or \"\",\n \"paragraphs\": paragraphs,\n \"content\": content,\n \"word_count\": word_count,\n \"char_count\": char_count,\n \"is_partial\": is_partial,\n }\n\n\ndef fetch_article(\n input_str: str,\n camofox_port: int = 9377,\n) -> Dict[str, Any]:\n \"\"\"Fetch an X Article via Camofox.\n\n ``input_str`` can be:\n - A full article URL: https://x.com/i/article/2011779830157557760\n - A bare article ID: 2011779830157557760\n\n Note: X Articles require login to read the full text. Without login,\n only publicly visible content (title + preview) is captured.\n Camofox must be running on the given port.\n\n Returns a dict with:\n article_id, url, title, author, author_handle, content,\n word_count, char_count, is_partial, warning (if partial)\n \"\"\"\n article_id = parse_article_id(input_str)\n if not article_id:\n return {\n \"input\": input_str,\n \"error\": t(\"err_invalid_article\", input=input_str),\n }\n\n article_url = f\"https://x.com/i/article/{article_id}\"\n result: Dict[str, Any] = {\n \"article_id\": article_id,\n \"url\": article_url,\n }\n\n if not check_camofox(camofox_port):\n result[\"error\"] = t(\"err_camofox_not_running_article\", port=camofox_port)\n return result\n\n print(t(\"opening_article_via_camofox\", url=article_url), file=sys.stderr)\n\n # X Articles are JS-heavy; use a longer wait (10 s)\n snapshot = camofox_fetch_page(\n article_url,\n session_key=f\"article-{article_id}\",\n wait=10,\n port=camofox_port,\n )\n\n if not snapshot:\n result[\"error\"] = t(\"err_snapshot_failed\")\n return result\n\n parsed = parse_article_snapshot(snapshot)\n\n result[\"title\"] = parsed[\"title\"]\n result[\"author\"] = parsed[\"author\"]\n result[\"author_handle\"] = parsed[\"author_handle\"]\n result[\"content\"] = parsed[\"content\"]\n result[\"word_count\"] = parsed[\"word_count\"]\n result[\"char_count\"] = parsed[\"char_count\"]\n result[\"is_partial\"] = parsed[\"is_partial\"]\n result[\"paragraphs\"] = parsed[\"paragraphs\"]\n\n if parsed[\"is_partial\"]:\n # Surface the login-wall note so callers / users understand the limitation\n result[\"warning\"] = t(\"article_login_note\")\n\n return result\n\n\n# ---------------------------------------------------------------------------\n# Mentions 监控（--monitor 模式）\n# ---------------------------------------------------------------------------\n\n# 缓存目录：~/.x-tweet-fetcher/\n_CACHE_DIR = Path.home() / \".x-tweet-fetcher\"\n# 单个用户缓存最大保留 URL 数量\n_CACHE_MAX = 500\n\n\ndef _get_cache_path(username: str) -> Path:\n \"\"\"返回指定用户的 mentions 缓存文件路径。\"\"\"\n # 去掉 @ 前缀，统一小写，避免大小写重复\n clean = username.lstrip(\"@\").lower()\n return _CACHE_DIR / f\"mentions-cache-{clean}.json\"\n\n\ndef _load_cache(username: str) -> dict:\n \"\"\"加载 mentions 缓存，返回 {'seen': [...url...], 'is_baseline': bool}。\"\"\"\n path = _get_cache_path(username)\n if path.exists():\n try:\n with open(path, \"r\", encoding=\"utf-8\") as f:\n data = json.load(f)\n # 兼容旧格式（纯列表）\n if isinstance(data, list):\n return {\"seen\": data, \"is_baseline\": False}\n return data\n except Exception:\n pass\n return {\"seen\": [], \"is_baseline\": True}\n\n\ndef _save_cache(username: str, cache: dict):\n \"\"\"保存 mentions 缓存到磁盘，超过上限时截断最旧条目。\"\"\"\n _CACHE_DIR.mkdir(parents=True, exist_ok=True)\n # 限制缓存大小，保留最新的 _CACHE_MAX 条\n if len(cache[\"seen\"]) > _CACHE_MAX:\n cache[\"seen\"] = cache[\"seen\"][-_CACHE_MAX:]\n path = _get_cache_path(username)\n with open(path, \"w\", encoding=\"utf-8\") as f:\n json.dump(cache, f, ensure_ascii=False, indent=2)\n\n\ndef _search_mentions(username: str, limit: int = 10, port: int = 9377) -> List[Dict]:\n \"\"\"\n 通过 Camofox + Google 搜索该用户的 mentions，返回去重后的搜索结果列表。\n\n 搜索策略：\n 1. site:x.com @username — 带 @ 的直接提及\n 2. site:x.com username — 不带 @ 的提及（更广）\n\n 每种策略最多取 limit 条，最终合并去重（以 URL 为 key）。\n \"\"\"\n # 避免循环 import：在函数内部 import\n try:\n import sys as _sys\n import os as _os\n # 将 scripts/ 目录加入路径，确保 camofox_client 可 import\n scripts_dir = _os.path.dirname(_os.path.abspath(__file__))\n if scripts_dir not in _sys.path:\n _sys.path.insert(0, scripts_dir)\n from camofox_client import camofox_search\n except ImportError:\n # fallback：直接用内置的 camofox_search（如果在同目录运行）\n from scripts.camofox_client import camofox_search\n\n clean = username.lstrip(\"@\")\n queries = [\n f\"site:x.com @{clean}\",\n f\"site:x.com {clean}\",\n ]\n\n seen_urls: set = set()\n results: List[Dict] = []\n\n for query in queries:\n print(t(\"monitor_searching\", query=query), file=sys.stderr)\n raw = camofox_search(query, num=limit, port=port)\n for item in raw:\n url = item.get(\"url\", \"\").strip()\n # 只保留 x.com 下的推文 URL（过滤搜索引擎导航链接）\n if url and url not in seen_urls and \"x.com\" in url:\n seen_urls.add(url)\n results.append({\n \"url\": url,\n \"title\": item.get(\"title\", \"\"),\n \"snippet\": item.get(\"snippet\", \"\"),\n })\n\n return results\n\n\ndef monitor_mentions(\n username: str,\n limit: int = 10,\n camofox_port: int = 9377,\n) -> Dict[str, Any]:\n \"\"\"\n 监控 X mentions 增量变化。\n\n 首次运行：建立基线，不报任何新内容（exit code 0）。\n 后续运行：与缓存对比，只报告新增 URL（exit code 1 = 有新内容）。\n\n 返回格式：\n {\n \"username\": \"...\",\n \"new_mentions\": [...], # 新增条目列表\n \"is_baseline\": True/False,\n \"known_count\": N,\n \"error\": \"...\" (可选)\n }\n \"\"\"\n result: Dict[str, Any] = {\n \"username\": username.lstrip(\"@\"),\n \"new_mentions\": [],\n \"is_baseline\": False,\n \"known_count\": 0,\n }\n\n # 检查 Camofox 是否运行\n if not check_camofox(camofox_port):\n result[\"error\"] = t(\"monitor_camofox_error\", port=camofox_port)\n return result\n\n # 加载本地缓存\n cache = _load_cache(username)\n seen_set = set(cache[\"seen\"])\n result[\"known_count\"] = len(seen_set)\n\n # 搜索 mentions\n all_results = _search_mentions(username, limit=limit, port=camofox_port)\n\n if cache[\"is_baseline\"]:\n # 首次运行：将所有搜索结果写入缓存作为基线，不报新内容\n new_urls = [r[\"url\"] for r in all_results]\n cache[\"seen\"] = list(seen_set | set(new_urls))\n cache[\"is_baseline\"] = False\n _save_cache(username, cache)\n result[\"is_baseline\"] = True\n result[\"known_count\"] = len(cache[\"seen\"])\n print(t(\"monitor_baseline\", count=len(cache[\"seen\"])), file=sys.stderr)\n else:\n # 后续运行：只报告不在缓存中的新条目\n new_mentions = [r for r in all_results if r[\"url\"] not in seen_set]\n\n # 将新 URL 加入缓存\n for r in new_mentions:\n cache[\"seen\"].append(r[\"url\"])\n _save_cache(username, cache)\n\n result[\"new_mentions\"] = new_mentions\n result[\"known_count\"] = len(cache[\"seen\"])\n\n if new_mentions:\n print(t(\"monitor_new_found\", count=len(new_mentions)), file=sys.stderr)\n else:\n print(t(\"monitor_no_new\", known=len(seen_set)), file=sys.stderr)\n\n return result\n\n\n# ---------------------------------------------------------------------------\n# CLI\n# ---------------------------------------------------------------------------\n\ndef main():\n global _lang\n\n parser = argparse.ArgumentParser(\n description=(\n \"Fetch tweets from X/Twitter.\\n\"\n \" --url \u003cURL> Single tweet via FxTwitter (zero deps)\\n\"\n \" --url \u003cURL> --replies Tweet replies via Camofox + Nitter\\n\"\n \" --user \u003cusername> User timeline via Camofox + Nitter\\n\"\n \" --article \u003cURL_or_ID> X Article full text via Camofox\\n\"\n \" --monitor @username Monitor X mentions (incremental, cron-friendly)\\n\"\n \" --list \u003clist_url_or_id> Fetch tweets from an X List via Camofox + Nitter\\n\"\n \"\\n\"\n \"Note: --article requires Camofox. X Articles also require X login\\n\"\n \"for full content; without login only public preview is captured.\\n\"\n \"Note: --monitor requires Camofox. First run builds a baseline (no output).\\n\"\n \"Subsequent runs report only new mentions. Exit code 1 = new content found.\"\n ),\n formatter_class=argparse.RawDescriptionHelpFormatter,\n )\n parser.add_argument(\"--url\", \"-u\", help=\"Tweet URL (x.com or twitter.com)\")\n parser.add_argument(\"--user\", help=\"X/Twitter username (without @)\")\n parser.add_argument(\"--article\", \"-a\", metavar=\"URL_or_ID\",\n help=\"X Article URL (https://x.com/i/article/ID) or bare article ID\")\n parser.add_argument(\"--monitor\", \"-m\", metavar=\"@USERNAME\",\n help=\"Monitor X mentions for a username (requires Camofox)\")\n parser.add_argument(\"--list\", \"-l\", metavar=\"LIST_URL_OR_ID\",\n help=\"Fetch tweets from an X List (URL or ID, requires Camofox)\")\n parser.add_argument(\"--limit\", type=int, default=50, help=\"Max tweets for --user / max results for --monitor (default: 50 for --user, 10 for --monitor)\")\n parser.add_argument(\"--replies\", \"-r\", action=\"store_true\", help=\"Fetch replies (requires Camofox)\")\n parser.add_argument(\"--pretty\", \"-p\", action=\"store_true\", help=\"Pretty print JSON\")\n parser.add_argument(\"--text-only\", \"-t\", action=\"store_true\", help=\"Human-readable output\")\n parser.add_argument(\"--timeout\", type=int, default=30, help=\"Request timeout in seconds (default: 30)\")\n parser.add_argument(\"--port\", type=int, default=9377, help=\"Camofox port (default: 9377)\")\n parser.add_argument(\"--nitter\", default=\"nitter.net\", help=\"Nitter instance (default: nitter.net)\")\n parser.add_argument(\n \"--lang\", default=\"zh\", choices=[\"zh\", \"en\"],\n help=\"Output language for tool messages: zh (default) or en\",\n )\n\n args = parser.parse_args()\n\n # Apply language setting globally before any t() calls\n _lang = args.lang\n\n # Count how many primary modes are requested\n _modes = [bool(args.url), bool(args.user), bool(args.article), bool(args.monitor), bool(args.list)]\n if sum(_modes) > 1:\n print(t(\"err_mutually_exclusive\"), file=sys.stderr)\n sys.exit(1)\n\n if not any(_modes):\n parser.print_help()\n sys.exit(1)\n\n indent = 2 if args.pretty else None\n\n # ── Mode 0: Mentions 监控 ─────────────────────────────────────────────\n if args.monitor:\n # --limit 对 --monitor 默认 10（搜索结果），若用户显式传 limit 则用用户的值\n monitor_limit = args.limit if args.limit != 50 else 10\n result = monitor_mentions(\n args.monitor,\n limit=monitor_limit,\n camofox_port=args.port,\n )\n\n if result.get(\"error\"):\n print(t(\"err_prefix\") + result[\"error\"], file=sys.stderr)\n sys.exit(2)\n\n if result.get(\"is_baseline\"):\n # 首次建基线，静默退出（exit 0）\n if not args.text_only:\n print(json.dumps(result, ensure_ascii=False, indent=indent))\n sys.exit(0)\n\n new_mentions = result.get(\"new_mentions\", [])\n\n if args.text_only:\n username_clean = result[\"username\"]\n if new_mentions:\n print(t(\"monitor_header\", username=username_clean, count=len(new_mentions)) + \"\\n\")\n for idx, m in enumerate(new_mentions, 1):\n print(f\"[{idx}] {m['title']}\")\n print(f\" {m['url']}\")\n if m.get(\"snippet\"):\n print(f\" {m['snippet'][:120]}\")\n print()\n # 无新内容时 text-only 模式不输出任何内容（方便 cron）\n else:\n print(json.dumps(result, ensure_ascii=False, indent=indent))\n\n # exit 1 = 有新 mentions（cron 友好），exit 0 = 无新内容\n sys.exit(1 if new_mentions else 0)\n\n # ── Mode 1: User timeline ─────────────────────────────────────────────\n if args.user:\n result = fetch_user_timeline(\n args.user,\n limit=args.limit,\n camofox_port=args.port,\n nitter_instance=args.nitter,\n )\n\n if args.text_only:\n if result.get(\"error\"):\n print(t(\"err_prefix\") + result[\"error\"], file=sys.stderr)\n sys.exit(1)\n tweets = result.get(\"tweets\", [])\n print(t(\"timeline_header\", user=args.user, count=len(tweets)) + \"\\n\")\n for idx, tw in enumerate(tweets, 1):\n print(f\"[{idx}] {tw['author_name']} ({tw['author']}) · {tw.get('time_ago', '')}\")\n print(f\" {tw['text']}\")\n stats = f\" ❤ {tw['likes']} 💬 {tw['replies']} 👁 {tw['views']}\"\n if tw.get(\"media\"):\n stats += \" \" + t(\"media_label\", n=len(tw[\"media\"]))\n print(stats)\n print()\n else:\n print(json.dumps(result, ensure_ascii=False, indent=indent))\n\n if result.get(\"error\"):\n sys.exit(1)\n return\n\n # ── Mode 2: X Article ────────────────────────────────────────────────\n if args.article:\n result = fetch_article(\n args.article,\n camofox_port=args.port,\n )\n\n if args.text_only:\n if result.get(\"error\"):\n print(t(\"err_prefix\") + result[\"error\"], file=sys.stderr)\n sys.exit(1)\n title = result.get(\"title\") or \"(no title)\"\n author = result.get(\"author\") or result.get(\"author_handle\") or \"\"\n content = result.get(\"content\", \"\")\n wc = result.get(\"word_count\", 0)\n print(t(\"article_header\", title=title))\n if author:\n print(f\"@{result.get('author_handle', '').lstrip('@') or author} {author}\")\n print(t(\"article_words\", word_count=wc))\n if result.get(\"warning\"):\n print(f\"⚠️ {result['warning']}\")\n print()\n print(content or \"(empty)\")\n else:\n print(json.dumps(result, ensure_ascii=False, indent=2 if args.pretty else None))\n\n if result.get(\"error\"):\n sys.exit(1)\n return\n\n # ── Mode 3: Tweet replies ─────────────────────────────────────────────\n if args.url and args.replies:\n result = fetch_tweet_replies(\n args.url,\n camofox_port=args.port,\n nitter_instance=args.nitter,\n )\n\n if args.text_only:\n if result.get(\"error\"):\n print(t(\"err_prefix\") + result[\"error\"], file=sys.stderr)\n sys.exit(1)\n replies = result.get(\"replies\", [])\n print(t(\"replies_header\", url=args.url) + \"\\n\")\n for idx, r in enumerate(replies, 1):\n print(f\"[{idx}] {r['author_name']} ({r['author']}) · {r.get('time_ago', '')}\")\n print(f\" {r['text']}\")\n stats = f\" ❤ {r['likes']} 💬 {r['replies']} 👁 {r['views']}\"\n if r.get(\"media\"):\n stats += \" \" + t(\"media_label_with_urls\", n=len(r[\"media\"]), urls=\", \".join(r[\"media\"]))\n print(stats)\n print()\n else:\n print(json.dumps(result, ensure_ascii=False, indent=indent))\n\n if result.get(\"error\"):\n sys.exit(1)\n return\n\n # ── Mode 4: X List tweets ─────────────────────────────────────────────\n if args.list:\n # Extract list_id from input\n list_id = extract_list_id(args.list)\n if not list_id:\n print(t(\"err_prefix\") + t(\"err_invalid_list\", input=args.list), file=sys.stderr)\n sys.exit(1)\n\n result = fetch_list_tweets(\n list_id,\n limit=args.limit,\n camofox_port=args.port,\n nitter_instance=args.nitter,\n )\n\n if args.text_only:\n if result.get(\"error\"):\n print(t(\"err_prefix\") + result[\"error\"], file=sys.stderr)\n sys.exit(1)\n tweets = result.get(\"tweets\", [])\n print(t(\"list_header\", list_id=list_id, count=len(tweets)) + \"\\n\")\n for idx, tw in enumerate(tweets, 1):\n print(f\"[{idx}] {tw['author_name']} ({tw['author']}) · {tw.get('time_ago', '')}\")\n print(f\" {tw['text']}\")\n stats = f\" ❤ {tw['likes']} 💬 {tw['replies']} 👁 {tw['views']}\"\n if tw.get(\"media\"):\n stats += \" \" + t(\"media_label\", n=len(tw[\"media\"]))\n print(stats)\n print()\n else:\n print(json.dumps(result, ensure_ascii=False, indent=indent))\n\n if result.get(\"error\"):\n sys.exit(1)\n return\n\n # ── Mode 4: Single tweet via FxTwitter (original, zero deps) ─────────\n result = fetch_tweet(args.url, timeout=args.timeout)\n\n if args.text_only:\n tweet = result.get(\"tweet\", {})\n if tweet.get(\"is_article\") and tweet.get(\"article\", {}).get(\"full_text\"):\n article = tweet[\"article\"]\n print(f\"# {article['title']}\\n\")\n print(t(\"article_by\", screen_name=tweet[\"screen_name\"], created_at=tweet.get(\"created_at\", \"\")))\n print(t(\"article_stats\", likes=tweet[\"likes\"], retweets=tweet[\"retweets\"], views=tweet[\"views\"]))\n print(t(\"article_words\", word_count=article[\"word_count\"]) + \"\\n\")\n print(article[\"full_text\"])\n elif tweet.get(\"text\"):\n print(f\"@{tweet['screen_name']}: {tweet['text']}\")\n print(t(\"tweet_stats\", likes=tweet[\"likes\"], retweets=tweet[\"retweets\"], views=tweet[\"views\"]))\n elif result.get(\"error\"):\n print(t(\"err_prefix\") + result[\"error\"], file=sys.stderr)\n sys.exit(1)\n else:\n print(json.dumps(result, ensure_ascii=False, indent=indent))\n\n if result.get(\"error\"):\n sys.exit(1)\n\n\nif __name__ == \"__main__\":\n # Version check (best-effort, no crash if unavailable)\n try:\n from scripts.version_check import check_for_update\n check_for_update(\"ythx-101/x-tweet-fetcher\")\n except Exception:\n pass\n\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":73002,"content_sha256":"fb38acf89e475aa6f23fac8c4ae21aaf726308f37b15679079c7f2550bb7d8ec"},{"filename":"scripts/sogou_wechat.py","content":"#!/usr/bin/env python3\n\"\"\"\nSogou WeChat Search - Search WeChat articles via Sogou.\nPart of x-tweet-fetcher.\n\nUsage:\n python3 sogou_wechat.py --keyword \"AI\" --limit 5\n python3 sogou_wechat.py --keyword \"人工智能\" --json\n\n # Resolve to real mp.weixin.qq.com URLs (via Google/DuckDuckGo)\n python3 sogou_wechat.py --keyword \"AI Agent\" --limit 3 --resolve --json\n\n # Use SSH proxy to avoid IP bans (set env vars)\n export SOGOU_SSH_HOST=user@host\n python3 sogou_wechat.py --keyword \"AI Agent\" --via-ssh\n\nWorkflow: Sogou search → get titles → Google/DDG find real WeChat URL → fetch_china.py reads full text\n\"\"\"\n\nimport requests\nfrom urllib.parse import quote\nimport re\nimport json\nimport argparse\nimport sys\nimport os\nimport html as html_lib\nimport subprocess\n\n\ndef sogou_wechat_search_via_router(keyword, max_results=10):\n \"\"\"Search Sogou WeChat via home router (cmd-queue/cmd-result pattern).\n \n Router polls VPS every minute, executes queued commands, pushes results back.\n Uses home IP — never gets banned by Sogou.\n \"\"\"\n import time\n queue_file = os.environ.get(\"ROUTER_CMD_QUEUE\", \"/root/router-agent/cmd-queue\")\n result_file = os.environ.get(\"ROUTER_CMD_RESULT\", \"/root/router-agent/cmd-result\")\n output_file = os.environ.get(\"ROUTER_CMD_OUTPUT\", \"/root/router-agent/cmd-output\")\n \n # Mark current result file position\n try:\n with open(result_file) as f:\n before = f.read()\n before_len = len(before)\n except FileNotFoundError:\n before_len = 0\n \n # Queue the curl command — router will fetch raw HTML\n encoded_kw = quote(keyword)\n cmd = f'curl -s \"https://weixin.sogou.com/weixin?type=2&query={encoded_kw}\" -H \"User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36\"'\n \n with open(queue_file, 'w') as f:\n f.write(cmd)\n \n print(f\"Command queued, waiting for router (up to 90s)...\", file=sys.stderr)\n \n # Wait for result (router polls every ~60s)\n for _ in range(18): # 18 * 5s = 90s max\n time.sleep(5)\n try:\n with open(result_file) as f:\n after = f.read()\n if len(after) > before_len:\n # New result arrived — read the output file\n try:\n with open(output_file) as f:\n html_text = f.read()\n if 'txt-box' in html_text:\n return _parse_sogou_html(html_text, max_results)\n except FileNotFoundError:\n pass\n except FileNotFoundError:\n pass\n \n print(\"Router timeout, falling back to direct\", file=sys.stderr)\n return sogou_wechat_search(keyword, max_results)\n\n\ndef _parse_sogou_html(text, max_results=10):\n \"\"\"Parse Sogou search result HTML into structured results.\"\"\"\n results = []\n blocks = re.findall(r'\u003cdiv class=\"txt-box\">(.*?)\u003c/div>\\s*\u003c/div>', text, re.DOTALL)\n for block in blocks[:max_results]:\n title_match = re.search(r'\u003ca[^>]*href=\"([^\"]*)\"[^>]*>(.*?)\u003c/a>', block, re.DOTALL)\n if not title_match:\n continue\n article_url = title_match.group(1).replace('&', '&')\n raw_title = title_match.group(2)\n title = re.sub(r'\u003c[^>]+>', '', raw_title).strip()\n title = html_lib.unescape(title)\n author_match = re.search(r'\u003ca[^>]*class=\"account\"[^>]*>(.*?)\u003c/a>', block, re.DOTALL)\n author = re.sub(r'\u003c[^>]+>', '', author_match.group(1)).strip() if author_match else ''\n snippet_match = re.search(r'\u003cp class=\"txt-info\">(.*?)\u003c/p>', block, re.DOTALL)\n snippet = re.sub(r'\u003c[^>]+>', '', snippet_match.group(1)).strip() if snippet_match else ''\n snippet = html_lib.unescape(snippet)\n date_match = re.search(r\"document\\.write\$timeConvert\\('(\\d+)'\$\\)\", block)\n if date_match:\n from datetime import datetime\n ts = int(date_match.group(1))\n date = datetime.fromtimestamp(ts).strftime('%Y-%m-%d')\n else:\n date = ''\n if article_url.startswith('/link'):\n article_url = 'https://weixin.sogou.com' + article_url\n results.append({'title': title, 'url': article_url, 'author': author, 'snippet': snippet, 'date': date})\n return results\n \"\"\"Search Sogou WeChat via SSH proxy to avoid IP bans.\n \n Requires: SOGOU_SSH_HOST env var or ssh_host param (e.g. user@host).\n \"\"\"\n host = ssh_host or os.environ.get(\"SOGOU_SSH_HOST\")\n if not host:\n print(\"SOGOU_SSH_HOST not set, falling back to direct\", file=sys.stderr)\n return sogou_wechat_search(keyword, max_results)\n\n script = f'''\nimport requests, re, json, html as html_lib\nfrom urllib.parse import quote\nheaders = {{\"User-Agent\": \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36\"}}\nurl = f\"https://weixin.sogou.com/weixin?type=2&query={{quote({repr(keyword)})}}\"\nr = requests.get(url, headers=headers, timeout=10)\nresults = []\nblocks = re.findall(r'\u003cdiv class=\"txt-box\">(.*?)\u003c/div>\\\\s*\u003c/div>', r.text, re.DOTALL)\nfor block in blocks[:{max_results}]:\n title_m = re.search(r'\u003ca[^>]*href=\"([^\"]*)\"[^>]*>(.*?)\u003c/a>', block, re.DOTALL)\n if not title_m: continue\n article_url = title_m.group(1).replace(\"&\", \"&\")\n title = re.sub(r'\u003c[^>]+>', '', title_m.group(2)).strip()\n title = html_lib.unescape(title)\n author_m = re.search(r'\u003ca[^>]*class=\"account\"[^>]*>(.*?)\u003c/a>', block, re.DOTALL)\n author = re.sub(r'\u003c[^>]+>', '', author_m.group(1)).strip() if author_m else ''\n snippet_m = re.search(r'\u003cp class=\"txt-info\">(.*?)\u003c/p>', block, re.DOTALL)\n snippet = re.sub(r'\u003c[^>]+>', '', snippet_m.group(1)).strip() if snippet_m else ''\n snippet = html_lib.unescape(snippet)\n from datetime import datetime\n date_m = re.search(r\"document\\\\.write\\\$timeConvert\\\\('(\\\\d+)'\\\$\\\\)\", block)\n date = datetime.fromtimestamp(int(date_m.group(1))).strftime('%Y-%m-%d') if date_m else ''\n if article_url.startswith('/link'): article_url = 'https://weixin.sogou.com' + article_url\n results.append({{\"title\": title, \"url\": article_url, \"author\": author, \"snippet\": snippet, \"date\": date}})\nprint(json.dumps(results, ensure_ascii=False))\n'''\n try:\n # Write script to temp file and scp to remote\n import tempfile\n with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:\n f.write(script)\n local_path = f.name\n \n remote_path = \"/tmp/_sogou_search.py\"\n subprocess.run([\"scp\", \"-o\", \"ConnectTimeout=5\", \"-q\", local_path, f\"{host}:{remote_path}\"],\n capture_output=True, timeout=10)\n os.unlink(local_path)\n \n result = subprocess.run(\n [\"ssh\", \"-o\", \"ConnectTimeout=5\", host, \"python3\", remote_path],\n capture_output=True, text=True, timeout=30\n )\n if result.returncode == 0 and result.stdout.strip():\n return json.loads(result.stdout.strip())\n else:\n print(f\"SSH search failed: {result.stderr[:100]}\", file=sys.stderr)\n return sogou_wechat_search(keyword, max_results)\n except Exception as e:\n print(f\"SSH error: {e}, falling back to direct\", file=sys.stderr)\n return sogou_wechat_search(keyword, max_results)\n\n\ndef sogou_wechat_search(keyword, max_results=10):\n \"\"\"搜索搜狗微信公众号文章\"\"\"\n \n headers = {\n 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'\n }\n \n url = f'https://weixin.sogou.com/weixin?type=2&query={quote(keyword)}'\n \n try:\n response = requests.get(url, headers=headers, timeout=10)\n response.raise_for_status()\n text = response.text\n \n results = []\n \n # 找到所有 txt-box 块\n blocks = re.findall(r'\u003cdiv class=\"txt-box\">(.*?)\u003c/div>\\s*\u003c/div>', text, re.DOTALL)\n \n for block in blocks[:max_results]:\n # 标题和链接\n title_match = re.search(r'\u003ca[^>]*href=\"([^\"]*)\"[^>]*>(.*?)\u003c/a>', block, re.DOTALL)\n if not title_match:\n continue\n \n article_url = title_match.group(1).replace('&', '&')\n # 清理标题中的 HTML 标签\n raw_title = title_match.group(2)\n title = re.sub(r'\u003c[^>]+>', '', raw_title).strip()\n title = html_lib.unescape(title)\n \n # 作者/公众号\n author_match = re.search(r'\u003ca[^>]*class=\"account\"[^>]*>(.*?)\u003c/a>', block, re.DOTALL)\n author = re.sub(r'\u003c[^>]+>', '', author_match.group(1)).strip() if author_match else ''\n \n # 摘要\n snippet_match = re.search(r'\u003cp class=\"txt-info\">(.*?)\u003c/p>', block, re.DOTALL)\n snippet = re.sub(r'\u003c[^>]+>', '', snippet_match.group(1)).strip() if snippet_match else ''\n snippet = html_lib.unescape(snippet)\n \n # 日期 (timestamp)\n date_match = re.search(r\"document\\.write\$timeConvert\\('(\\d+)'\$\\)\", block)\n if date_match:\n from datetime import datetime\n ts = int(date_match.group(1))\n date = datetime.fromtimestamp(ts).strftime('%Y-%m-%d')\n else:\n date = ''\n \n # 完整链接\n if article_url.startswith('/link'):\n article_url = 'https://weixin.sogou.com' + article_url\n \n results.append({\n 'title': title,\n 'url': article_url,\n 'author': author,\n 'snippet': snippet,\n 'date': date\n })\n \n return results\n \n except Exception as e:\n print(f\"搜索失败: {e}\", file=sys.stderr)\n return []\n\n\ndef resolve_sogou_link(sogou_url, port=9377):\n \"\"\"Resolve Sogou redirect link to real mp.weixin.qq.com URL via Camofox.\"\"\"\n try:\n from camofox_client import camofox_open_tab, camofox_snapshot, camofox_close_tab\n import time\n tab_id = camofox_open_tab(sogou_url, f\"resolve-{int(time.time())}\", port=port)\n if not tab_id:\n return sogou_url\n time.sleep(5)\n snapshot = camofox_snapshot(tab_id, port=port)\n camofox_close_tab(tab_id, port=port)\n if snapshot:\n # Look for mp.weixin.qq.com in the final page URL or content\n import re\n mp_match = re.search(r'(https?://mp\\.weixin\\.qq\\.com/s/[A-Za-z0-9_-]+)', snapshot)\n if mp_match:\n return mp_match.group(1)\n # Check for canonical URL\n canon = re.search(r'canonical.*?(https?://mp\\.weixin\\.qq\\.com[^\\s\"\u003c>]+)', snapshot)\n if canon:\n return canon.group(1)\n return sogou_url\n except Exception:\n return sogou_url\n\n\ndef resolve_via_google(title, port=9377):\n \"\"\"Resolve article title to real mp.weixin.qq.com URL via Google search.\"\"\"\n try:\n from camofox_client import camofox_search\n query = f'site:mp.weixin.qq.com \"{title}\"'\n results = camofox_search(query, num=3, port=port)\n for r in results:\n url = r.get('url', '')\n if 'mp.weixin.qq.com' in url:\n return url\n except Exception:\n pass\n # Fallback: try DuckDuckGo\n try:\n from duckduckgo_search import DDGS\n import warnings\n warnings.filterwarnings(\"ignore\")\n ddgs = DDGS()\n query = f'site:mp.weixin.qq.com {title}'\n results = ddgs.text(query, max_results=3)\n for r in results:\n url = r.get('href', '')\n if 'mp.weixin.qq.com' in url:\n return url\n except Exception:\n pass\n return None\n\n\ndef main():\n parser = argparse.ArgumentParser(description=\"Search WeChat articles via Sogou\")\n parser.add_argument(\"--keyword\", \"-k\", required=True, help=\"Search keyword\")\n parser.add_argument(\"--limit\", \"-l\", type=int, default=10, help=\"Max results\")\n parser.add_argument(\"--json\", \"-j\", action=\"store_true\", help=\"Output JSON\")\n parser.add_argument(\"--resolve\", \"-r\", action=\"store_true\", help=\"Resolve Sogou links to real WeChat URLs (requires Camofox)\")\n parser.add_argument(\"--via-ssh\", action=\"store_true\", help=\"Route search via SSH proxy (set SOGOU_SSH_HOST env var)\")\n parser.add_argument(\"--via-router\", action=\"store_true\", help=\"Route search via home router (cmd-queue pattern, 24/7)\")\n args = parser.parse_args()\n\n if args.via_router:\n results = sogou_wechat_search_via_router(args.keyword, args.limit)\n elif args.via_ssh:\n results = sogou_wechat_search_via_ssh(args.keyword, args.limit)\n else:\n results = sogou_wechat_search(args.keyword, args.limit)\n\n if args.resolve and results:\n print(\"Resolving to real WeChat URLs (Sogou → Google/DuckDuckGo → mp.weixin.qq.com)...\", file=sys.stderr)\n for r in results:\n real_url = resolve_via_google(r['title'])\n if real_url:\n r['url'] = real_url\n r['resolved'] = True\n else:\n # Fallback: try Camofox direct resolve\n resolved = resolve_sogou_link(r['url'])\n if resolved != r['url']:\n r['url'] = resolved\n r['resolved'] = True\n\n if args.json:\n print(json.dumps(results, ensure_ascii=False, indent=2))\n else:\n if not results:\n print(\"未找到结果\")\n for i, article in enumerate(results, 1):\n print(f\"{i}. {article['title']}\")\n if article['author']:\n print(f\" 公众号: {article['author']}\")\n if article['date']:\n print(f\" 日期: {article['date']}\")\n if article['snippet']:\n print(f\" 摘要: {article['snippet'][:80]}...\")\n print(f\" 链接: {article['url'][:80]}...\")\n print()\n\n sys.exit(0 if results else 1)\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":14114,"content_sha256":"c3ab28a15c7e9433a2d8ee68147b9beab0323ea46f79243a0664f2d52c93549f"},{"filename":"scripts/version_check.py","content":"\"\"\"\n版本检查模块 - 启动时检查 GitHub 是否有新版本\n零依赖，缓存结果（每天最多检查一次），失败静默\n输出到 stderr，仅在交互终端显示，后台线程不阻塞\n\"\"\"\n\nimport json\nimport os\nimport sys\nimport time\nimport threading\nimport urllib.request\nfrom pathlib import Path\n\n\ndef check_for_update(repo: str, version_file: str = None):\n \"\"\"\n 检查 GitHub 是否有新版本并提示用户。\n 在后台线程中运行，不阻塞主程序。\n 仅在交互终端（TTY）中显示提醒。\n \"\"\"\n # 非交互环境（管道、重定向）不显示\n if not sys.stderr.isatty():\n return\n \n thread = threading.Thread(\n target=_check_update_worker,\n args=(repo, version_file),\n daemon=True\n )\n thread.start()\n\n\ndef _check_update_worker(repo: str, version_file: str = None):\n try:\n # 读本地版本\n if version_file is None:\n version_file = str(Path(__file__).parent.parent / \"VERSION\")\n \n if not os.path.exists(version_file):\n return\n \n with open(version_file, 'r') as f:\n local_version = f.read().strip()\n \n if not local_version:\n return\n \n # 检查缓存（一天只查一次）\n cache_dir = Path.home() / \".cache\" / \"openclaw-updates\"\n cache_dir.mkdir(parents=True, exist_ok=True)\n cache_file = cache_dir / f\"{repo.replace('/', '_')}.json\"\n \n now = time.time()\n if cache_file.exists():\n try:\n with open(cache_file, 'r') as f:\n cache = json.load(f)\n if now - cache.get(\"checked_at\", 0) \u003c 86400:\n remote = cache.get(\"remote_version\", \"\")\n if remote and remote != local_version:\n _print_update_notice(local_version, remote, repo)\n return\n except (json.JSONDecodeError, KeyError):\n pass\n \n # 查 GitHub API\n url = f\"https://api.github.com/repos/{repo}/releases/latest\"\n req = urllib.request.Request(url, headers={\"Accept\": \"application/vnd.github.v3+json\"})\n \n with urllib.request.urlopen(req, timeout=5) as resp:\n data = json.loads(resp.read().decode())\n \n remote_version = data.get(\"tag_name\", \"\").lstrip(\"v\")\n \n # 写缓存\n with open(cache_file, 'w') as f:\n json.dump({\"checked_at\": now, \"remote_version\": remote_version}, f)\n \n if remote_version and remote_version != local_version:\n _print_update_notice(local_version, remote_version, repo)\n \n except Exception:\n pass\n\n\ndef _print_update_notice(local: str, remote: str, repo: str):\n \"\"\"打印升级提醒到 stderr（不干扰 stdout 的 JSON 输出）\"\"\"\n YELLOW = \"\\033[33m\"\n GREEN = \"\\033[32m\"\n RESET = \"\\033[0m\"\n BOLD = \"\\033[1m\"\n \n msg = (\n f\"\\n{YELLOW}{BOLD}⚠ 新版本可用!{RESET}\\n\"\n f\" 当前: v{local} → 最新: {GREEN}v{remote}{RESET}\\n\"\n f\" 运行 {BOLD}git pull{RESET} 更新\\n\"\n f\" 详情: https://github.com/{repo}/releases\\n\"\n )\n print(msg, file=sys.stderr)\n","content_type":"text/x-python; charset=utf-8","language":"python","size":3258,"content_sha256":"81ff024669eb3d42176e4f48987e491649c425aae1d9e7c35115195caa753f2f"},{"filename":"scripts/x_discover.py","content":"#!/usr/bin/env python3\n\"\"\"\nX Discover - Search and discover valuable tweets by keyword.\nPart of x-tweet-fetcher.\n\nUses DuckDuckGo search (no API key needed) to find tweets on X/Twitter.\n\nUsage:\n python3 x_discover.py --keywords \"AI Agent,automation\" --limit 5\n python3 x_discover.py --keywords \"openclaw\" --json\n python3 x_discover.py --keywords \"LLM tool\" --limit 10 --cache discover_cache.json\n\"\"\"\n\nimport json\nimport hashlib\nimport argparse\nimport sys\nfrom datetime import datetime\nfrom pathlib import Path\n\n\ndef search_web(query, max_results=5):\n \"\"\"Search via DuckDuckGo or Camofox Google (no API key needed)\"\"\"\n # Try DuckDuckGo first\n try:\n from duckduckgo_search import DDGS\n import warnings\n warnings.filterwarnings(\"ignore\")\n ddgs = DDGS()\n results = ddgs.text(query, max_results=max_results)\n if results:\n return [{\"title\": r.get(\"title\", \"\"), \"url\": r.get(\"href\", \"\"), \"snippet\": r.get(\"body\", \"\")} for r in results]\n except Exception:\n pass\n\n # Fallback: Camofox Google search\n try:\n from camofox_client import camofox_search\n results = camofox_search(query)\n if results:\n return results[:max_results]\n except Exception:\n pass\n\n print(f\"All search backends failed for: {query[:40]}...\", file=sys.stderr)\n return []\n\n\ndef url_hash(url):\n return hashlib.md5(url.encode()).hexdigest()[:12]\n\n\ndef load_cache(cache_file):\n if cache_file and Path(cache_file).exists():\n return json.loads(Path(cache_file).read_text())\n return {\"seen_urls\": []}\n\n\ndef save_cache(cache, cache_file):\n if cache_file:\n Path(cache_file).parent.mkdir(parents=True, exist_ok=True)\n Path(cache_file).write_text(json.dumps(cache, ensure_ascii=False, indent=2))\n\n\ndef discover_tweets(keywords, max_results=10, cache_file=None):\n \"\"\"\n Search for tweets matching keywords.\n \n Args:\n keywords: list of keyword strings\n max_results: max results per keyword\n cache_file: optional path to cache file (skip seen URLs)\n \n Returns:\n dict with total_new, finds list\n \"\"\"\n cache = load_cache(cache_file)\n all_finds = []\n\n for keyword in keywords:\n query = f\"site:x.com {keyword}\"\n results = search_web(query, max_results=max_results)\n\n for r in results:\n url = r.get('url', r.get('href', ''))\n if not url:\n continue\n\n h = url_hash(url)\n if h in cache[\"seen_urls\"]:\n continue\n\n cache[\"seen_urls\"].append(h)\n all_finds.append({\n \"url\": url,\n \"title\": r.get('title', ''),\n \"snippet\": r.get('body', r.get('snippet', '')),\n \"query\": keyword,\n \"found_at\": datetime.now().isoformat()\n })\n\n save_cache(cache, cache_file)\n\n return {\n \"timestamp\": datetime.now().isoformat(),\n \"total_new\": len(all_finds),\n \"finds\": all_finds\n }\n\n\ndef main():\n parser = argparse.ArgumentParser(description=\"Discover tweets by keyword search\")\n parser.add_argument(\"--keywords\", \"-k\", required=True, help=\"Comma-separated keywords\")\n parser.add_argument(\"--limit\", \"-l\", type=int, default=5, help=\"Max results per keyword\")\n parser.add_argument(\"--cache\", \"-c\", help=\"Cache file path (skip seen URLs)\")\n parser.add_argument(\"--json\", \"-j\", action=\"store_true\", help=\"Output JSON\")\n args = parser.parse_args()\n\n keywords = [k.strip() for k in args.keywords.split(\",\") if k.strip()]\n result = discover_tweets(keywords, max_results=args.limit, cache_file=args.cache)\n\n if args.json:\n print(json.dumps(result, ensure_ascii=False, indent=2))\n else:\n if result[\"total_new\"] == 0:\n print(\"No new discoveries.\")\n else:\n print(f\"Found {result['total_new']} new tweets:\\n\")\n for i, f in enumerate(result[\"finds\"], 1):\n print(f\"{i}. {f['title']}\")\n if f['snippet']:\n print(f\" {f['snippet'][:100]}...\")\n print(f\" {f['url']}\")\n print()\n\n sys.exit(0 if result[\"total_new\"] == 0 else 1)\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":4276,"content_sha256":"6c2d2a1ed6295207e9872cd86b86641e6871ba7a18477e9aa85b12d622e11371"},{"filename":"scripts/x_mentions_nitter.py","content":"#!/usr/bin/env python3\n\"\"\"\nx-mentions-nitter.py - 通过 Nitter 实时抓取 @YuLin807 的 mentions\n比 Google/Brave 搜索快得多（分钟级 vs 小时级）\n\n用法：\n python3 scripts/x-mentions-nitter.py\n 退出码 0 = 无新内容，1 = 有新内容\n\"\"\"\n\nimport sys\nimport os\nimport json\nimport re\nfrom datetime import datetime\n\nsys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'skills/our/x-tweet-fetcher/scripts'))\nfrom camofox_client import camofox_fetch_page\n\nUSERNAME = \"YuLin807\"\nCACHE_FILE = \"/tmp/x-mentions-nitter-cache.json\"\nRESULT_FILE = \"/tmp/x-mentions-nitter-latest.json\"\nNITTER_URL = f\"https://nitter.net/search?f=tweets&q=%40{USERNAME}\"\n\ndef parse_mentions(snapshot):\n \"\"\"从 Nitter 快照中解析 mentions\"\"\"\n mentions = []\n lines = snapshot.split('\\n')\n \n current = {}\n for line in lines:\n line = line.strip()\n \n # 匹配用户链接 @username\n m = re.search(r'link \"@(\\w+)\"', line)\n if m and m.group(1) != USERNAME:\n current['author'] = m.group(1)\n \n # 匹配时间链接（如 \"50m\", \"1h\", \"2h\", \"Feb 26\"）\n m = re.search(r'link \"(\\d+[mhd]|[A-Z][a-z]+ \\d+)\"', line)\n if m:\n current['time'] = m.group(1)\n \n # 匹配推文链接（/user/status/id#m）\n m = re.search(r'/url: /(\\w+)/status/(\\d+)#m', line)\n if m:\n current['url'] = f\"https://x.com/{m.group(1)}/status/{m.group(2)}\"\n current['tweet_id'] = m.group(2)\n \n # 匹配 \"Replying to\" 后面的文本内容\n if line.startswith('- text: ') and 'Replying to' not in line and current.get('author'):\n text = line[8:].strip()\n # 过滤掉纯数字行（点赞/转发计数）\n if text and not re.match(r'^[\\d\\s]+

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, text) and len(text) > 2:\n current['text'] = text\n if current.get('url'):\n mentions.append(dict(current))\n current = {}\n \n return mentions\n\n\ndef load_cache():\n \"\"\"加载已知 tweet IDs\"\"\"\n if os.path.exists(CACHE_FILE):\n with open(CACHE_FILE) as f:\n return set(json.load(f))\n return set()\n\n\ndef save_cache(ids):\n \"\"\"保存已知 tweet IDs\"\"\"\n with open(CACHE_FILE, 'w') as f:\n json.dump(list(ids)[-500:], f)\n\n\ndef main():\n print(f\"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] 🔍 Nitter mentions 检查...\")\n \n snapshot = camofox_fetch_page(NITTER_URL, \"nitter-mentions-check\", wait=8)\n if not snapshot:\n print(\"❌ Nitter 无响应\")\n sys.exit(0)\n \n mentions = parse_mentions(snapshot)\n print(f\"📊 解析到 {len(mentions)} 条 mentions\")\n \n # 对比缓存找新的\n cache = load_cache()\n new_mentions = [m for m in mentions if m.get('tweet_id') not in cache]\n \n # 更新缓存\n all_ids = cache | {m['tweet_id'] for m in mentions if 'tweet_id' in m}\n save_cache(all_ids)\n \n # 输出\n output = {\n \"timestamp\": datetime.now().isoformat(),\n \"total\": len(mentions),\n \"new_count\": len(new_mentions),\n \"new\": new_mentions[:10],\n }\n \n with open(RESULT_FILE, 'w') as f:\n json.dump(output, f, ensure_ascii=False, indent=2)\n \n print(json.dumps(output, ensure_ascii=False, indent=2))\n \n if new_mentions:\n print(f\"\\n⚠️ 发现 {len(new_mentions)} 条新 mentions！\")\n sys.exit(1)\n else:\n print(f\"\\n✅ 无新 mentions\")\n sys.exit(0)\n\n\nif __name__ == \"__main__\":\n main()\n","content_type":"text/x-python; charset=utf-8","language":"python","size":3560,"content_sha256":"cbee102fcc12ec4c9d1c657167b713498d69266e3f7ee93ccd64664d8899530f"},{"filename":"VERSION","content":"1.4.0\n","content_type":"text/plain; charset=utf-8","language":null,"size":6,"content_sha256":"78b591400c56b7b67b8cb3b2b8a8e65e9093897f02ce0878e6b5405c68620fa7"}],"content_json":{"type":"doc","content":[{"type":"heading","attrs":{"level":1},"content":[{"text":"X Tweet Fetcher","type":"text"}]},{"type":"paragraph","content":[{"text":"Fetch tweets from X/Twitter without authentication. Supports tweet content, reply threads, user timelines, and Chinese platforms.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Feature Overview","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Feature","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Command","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Dependencies","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Single tweet","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--url \u003ctweet_url>","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"None (zero deps)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Reply threads","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--url \u003ctweet_url> --replies","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Camofox","type":"text","marks":[{"type":"strong"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"User timeline","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"--user \u003cusername> --limit 300","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Camofox","type":"text","marks":[{"type":"strong"}]}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Chinese platforms","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"fetch_china.py --url \u003curl>","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Camofox","type":"text","marks":[{"type":"strong"}]},{"text":" (except WeChat)","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Google search","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"camofox_search(\"query\")","type":"text","marks":[{"type":"code_inline"}]}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Camofox","type":"text","marks":[{"type":"strong"}]}]}]}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Basic Usage (Zero Dependencies)","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Fetch a Single Tweet","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# JSON output\npython3 scripts/fetch_tweet.py --url \"https://x.com/user/status/123456\"\n\n# Text only (human readable)\npython3 scripts/fetch_tweet.py --url \"https://x.com/user/status/123456\" --text-only\n\n# Pretty JSON\npython3 scripts/fetch_tweet.py --url \"https://x.com/user/status/123456\" --pretty","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"What It Fetches","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Content Type","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Support","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Regular tweets","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"✅ Full text + stats","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Long tweets (Twitter Blue)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"✅ Full text","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"X Articles (long-form)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"✅ Complete article text","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Quoted tweets","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"✅ Included","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Stats (likes/RT/views)","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"✅ Included","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Media URLs","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"✅ Images + videos","type":"text"}]}]}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Advanced Features (Requires Camofox)","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"⚠️ The following features require ","type":"text"},{"text":"Camofox","type":"text","marks":[{"type":"strong"}]},{"text":" browser service running on ","type":"text"},{"text":"localhost:9377","type":"text","marks":[{"type":"code_inline"}]},{"text":". See ","type":"text"},{"text":"Camofox Setup","type":"text","marks":[{"type":"link","attrs":{"href":"#camofox-setup","title":null}}]},{"text":" below.","type":"text"}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Fetch Reply Threads","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Fetch tweet + all replies (including nested replies)\npython3 scripts/fetch_tweet.py --url \"https://x.com/user/status/123456\" --replies\n\n# Text-only mode with replies\npython3 scripts/fetch_tweet.py --url \"https://x.com/user/status/123456\" --replies --text-only","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Fetch User Timeline","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Fetch latest tweets from a user (supports pagination, MAX_PAGES=20)\npython3 scripts/fetch_tweet.py --user \u003cusername> --limit 300","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Fetch Chinese Platform Content","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Auto-detects platform from URL\npython3 scripts/fetch_china.py --url \"https://weibo.com/...\" # Weibo\npython3 scripts/fetch_china.py --url \"https://bilibili.com/...\" # Bilibili\npython3 scripts/fetch_china.py --url \"https://csdn.net/...\" # CSDN\npython3 scripts/fetch_china.py --url \"https://mp.weixin.qq.com/...\" # WeChat (no Camofox needed!)","type":"text"}]},{"type":"table","attrs":{"layout":null},"content":[{"type":"tr","content":[{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Platform","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Status","type":"text"}]}]},{"type":"th","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Notes","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"WeChat Articles","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"✅","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Uses web_fetch directly, no Camofox","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Weibo","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"✅","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Camofox renders JS","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Bilibili","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"✅","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Video info + stats","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"CSDN","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"✅","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Articles + code blocks","type":"text"}]}]}]},{"type":"tr","content":[{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Zhihu / Xiaohongshu","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"⚠️","type":"text"}]}]},{"type":"td","attrs":{"colspan":1,"rowspan":1,"colwidth":null,"alignment":""},"content":[{"type":"paragraph","content":[{"text":"Needs cookie import for login","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Google Search (Zero API Key)","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"python"},"content":[{"text":"# Python\nfrom scripts.camofox_client import camofox_search\nresults = camofox_search(\"your search query\")\n# Returns: [{\"title\": \"...\", \"url\": \"...\", \"snippet\": \"...\"}, ...]","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# CLI\npython3 scripts/camofox_client.py \"your search query\"","type":"text"}]},{"type":"paragraph","content":[{"text":"Uses Camofox browser to search Google directly. ","type":"text"},{"text":"No Brave API key needed, no cost.","type":"text","marks":[{"type":"strong"}]}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"Camofox Setup","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"What is Camofox?","type":"text"}]},{"type":"paragraph","content":[{"text":"Camofox is an anti-detection browser service based on ","type":"text"},{"text":"Camoufox","type":"text","marks":[{"type":"link","attrs":{"href":"https://camoufox.com","title":null}}]},{"text":" (a Firefox fork with C++ level fingerprint masking). It bypasses:","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Cloudflare bot detection","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Browser fingerprinting","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"JavaScript challenges","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Installation","type":"text"}]},{"type":"paragraph","content":[{"text":"Option 1: OpenClaw Plugin","type":"text","marks":[{"type":"strong"}]}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"openclaw plugins install @askjo/camofox-browser","type":"text"}]},{"type":"paragraph","content":[{"text":"Option 2: Manual Install","type":"text","marks":[{"type":"strong"}]}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"git clone https://github.com/jo-inc/camofox-browser\ncd camofox-browser\nnpm install && npm start","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Verify","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"curl http://localhost:9377/health\n# Should return: {\"status\":\"ok\"}","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"REST API","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"bash"},"content":[{"text":"# Create tab\nPOST http://localhost:9377/tabs\nBody: {\"userId\":\"test\", \"sessionKey\":\"test\", \"url\":\"https://example.com\"}\n\n# Get page snapshot\nGET http://localhost:9377/tabs/\u003cTAB_ID>/snapshot?userId=test\n\n# Close tab\nDELETE http://localhost:9377/tabs/\u003cTAB_ID>?userId=test","type":"text"}]},{"type":"hr","attrs":{"markup":"---"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"From Agent Code","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"python"},"content":[{"text":"from scripts.fetch_tweet import fetch_tweet\n\nresult = fetch_tweet(\"https://x.com/user/status/123456\")\ntweet = result[\"tweet\"]\n\n# Regular tweet\nprint(tweet[\"text\"])\nprint(f\"Likes: {tweet['likes']}, Views: {tweet['views']}\")\n\n# X Article (long-form)\nif tweet.get(\"is_article\"):\n print(tweet[\"article\"][\"title\"])\n print(tweet[\"article\"][\"full_text\"])\n\n# Links found in replies\nfor reply in result.get(\"replies\", []):\n for link in reply.get(\"links\", []):\n print(link)","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Output Format","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"json"},"content":[{"text":"{\n \"url\": \"https://x.com/user/status/123\",\n \"username\": \"user\",\n \"tweet_id\": \"123\",\n \"tweet\": {\n \"text\": \"Tweet content...\",\n \"author\": \"Display Name\",\n \"screen_name\": \"username\",\n \"likes\": 100,\n \"retweets\": 50,\n \"bookmarks\": 25,\n \"views\": 10000,\n \"replies_count\": 30,\n \"created_at\": \"Mon Jan 01 12:00:00 +0000 2026\",\n \"is_note_tweet\": false,\n \"is_article\": true,\n \"article\": {\n \"title\": \"Article Title\",\n \"full_text\": \"Complete article content...\",\n \"word_count\": 4847\n }\n },\n \"replies\": [\n {\n \"author\": \"@someone\",\n \"text\": \"Reply text...\",\n \"likes\": 5,\n \"links\": [\"https://github.com/...\"],\n \"thread_replies\": [{\"text\": \"Nested reply...\"}]\n }\n ]\n}","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"File Structure","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":""},"content":[{"text":"x-tweet-fetcher/\n├── SKILL.md # This file\n├── README.md # GitHub page with full docs\n├── scripts/\n│ ├── fetch_tweet.py # Main fetcher (tweet + replies + timeline)\n│ ├── fetch_china.py # Chinese platform fetcher\n│ ├── camofox_client.py # Camofox REST API client + camofox_search()\n│ └── x-profile-analyzer.py # User profile analysis (AI-powered)\n└── CHANGELOG.md","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Requirements","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Basic","type":"text","marks":[{"type":"strong"}]},{"text":": Python 3.7+, no external packages, no API keys","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Advanced","type":"text","marks":[{"type":"strong"}]},{"text":": Camofox running on localhost:9377","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Profile Analyzer","type":"text","marks":[{"type":"strong"}]},{"text":": MiniMax M2.5 API key (for AI analysis)","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"How It Works","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Basic tweets","type":"text","marks":[{"type":"strong"}]},{"text":": ","type":"text"},{"text":"FxTwitter","type":"text","marks":[{"type":"link","attrs":{"href":"https://github.com/FxEmbed/FxEmbed","title":null}}]},{"text":" public API","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Replies/timelines","type":"text","marks":[{"type":"strong"}]},{"text":": Camofox → Nitter (privacy-respecting X frontend)","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Chinese platforms","type":"text","marks":[{"type":"strong"}]},{"text":": Camofox renders JS → extracts content","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Google search","type":"text","marks":[{"type":"strong"}]},{"text":": Camofox opens Google → parses results","type":"text"}]}]}]},{"type":"hr","attrs":{"markup":"---"}}]},"metadata":{"date":"2026-06-05","name":"x-tweet-fetcher","author":"@skillopedia","source":{"stars":65,"repo_name":"claude-code-skills","origin_url":"https://github.com/aaaaqwq/claude-code-skills/blob/HEAD/skills/x-tweet-fetcher/SKILL.md","repo_owner":"aaaaqwq","body_sha256":"9a1b243102ca7807516f3b5922952272a4fe86eb8e9177722558e1d80e312db7","cluster_key":"73cf68ddbf2935cd43276e5400e20b97bb5d0f7476200c9ece41e5f1daa2e1b0","clean_bundle":{"format":"clean-skill-bundle-v1","source":"aaaaqwq/claude-code-skills/skills/x-tweet-fetcher/SKILL.md","attachments":[{"id":"fe40618f-8efa-51fa-81bc-2936eefe541d","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/fe40618f-8efa-51fa-81bc-2936eefe541d/attachment.md","path":"CHANGELOG.md","size":2783,"sha256":"a81e45ebcec6d66c0caceccd8f32b32b5561ba88b614eccaffb26064e8e06901","contentType":"text/markdown; charset=utf-8"},{"id":"95aaed01-98ad-5c0d-b26c-0996c443fa7d","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/95aaed01-98ad-5c0d-b26c-0996c443fa7d/attachment.md","path":"README.md","size":5990,"sha256":"c365bbb5f955a18ab4e940950d90960a79b71fe775ee8e59cc7dbd664045fec2","contentType":"text/markdown; charset=utf-8"},{"id":"cc064ade-c6ef-5433-9f05-7d6ad307317d","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/cc064ade-c6ef-5433-9f05-7d6ad307317d/attachment","path":"VERSION","size":6,"sha256":"78b591400c56b7b67b8cb3b2b8a8e65e9093897f02ce0878e6b5405c68620fa7","contentType":"text/plain; charset=utf-8"},{"id":"0eac65ed-aadb-563b-b153-e2f5e0126944","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/0eac65ed-aadb-563b-b153-e2f5e0126944/attachment.py","path":"scripts/__init__.py","size":46,"sha256":"278ae056c52296b6dc778a66c3b32ac9b76b10ddfb9ab91e4f3755a31845ebeb","contentType":"text/x-python; charset=utf-8"},{"id":"6bbb4e22-50e5-51d6-b32a-51dea2fe6859","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/6bbb4e22-50e5-51d6-b32a-51dea2fe6859/attachment.py","path":"scripts/camofox_client.py","size":8526,"sha256":"8dfc3e9dc7a66afe159acc33699100d9e7a0deb0a931ac3121ba03c3e83b8796","contentType":"text/x-python; charset=utf-8"},{"id":"a960dd1b-a0a5-5834-8944-7936bda521c6","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/a960dd1b-a0a5-5834-8944-7936bda521c6/attachment.py","path":"scripts/fetch_china.py","size":70616,"sha256":"d0654a5a8e50c1857dbb410f25fbe8cd6fec90223d54bacb299946504dee45d6","contentType":"text/x-python; charset=utf-8"},{"id":"f50a133e-f000-529e-a3c9-544d34fa2f83","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/f50a133e-f000-529e-a3c9-544d34fa2f83/attachment.py","path":"scripts/fetch_tweet.py","size":73002,"sha256":"fb38acf89e475aa6f23fac8c4ae21aaf726308f37b15679079c7f2550bb7d8ec","contentType":"text/x-python; charset=utf-8"},{"id":"8b415dba-2637-59d9-b4be-5fc13209471e","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/8b415dba-2637-59d9-b4be-5fc13209471e/attachment.py","path":"scripts/sogou_wechat.py","size":14114,"sha256":"c3ab28a15c7e9433a2d8ee68147b9beab0323ea46f79243a0664f2d52c93549f","contentType":"text/x-python; charset=utf-8"},{"id":"f9d45cf8-5ad3-5cbe-8d5a-738e1805a7c9","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/f9d45cf8-5ad3-5cbe-8d5a-738e1805a7c9/attachment.py","path":"scripts/version_check.py","size":3258,"sha256":"81ff024669eb3d42176e4f48987e491649c425aae1d9e7c35115195caa753f2f","contentType":"text/x-python; charset=utf-8"},{"id":"546a6092-3a20-59d5-8559-06d781e0af85","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/546a6092-3a20-59d5-8559-06d781e0af85/attachment.py","path":"scripts/x_discover.py","size":4276,"sha256":"6c2d2a1ed6295207e9872cd86b86641e6871ba7a18477e9aa85b12d622e11371","contentType":"text/x-python; charset=utf-8"},{"id":"28411a12-9eb4-569d-8ed5-4c15cbc4734f","key":"uploads/10433ee7-ad12-4ae0-b34e-97553e46c6c8/28411a12-9eb4-569d-8ed5-4c15cbc4734f/attachment.py","path":"scripts/x_mentions_nitter.py","size":3560,"sha256":"cbee102fcc12ec4c9d1c657167b713498d69266e3f7ee93ccd64664d8899530f","contentType":"text/x-python; charset=utf-8"}],"bundle_sha256":"206336a44aafe51c7d18f18ce123b070d5348cbfeec6d2754d6da6a317027615","attachment_count":11,"text_attachments":10,"attachment_storage":"skillopedia-attachments-v1","binary_attachments":1,"excluded_attachments":[]},"cluster_size":1,"skill_md_path":"skills/x-tweet-fetcher/SKILL.md","import_metadata":{"date":"2026-06-05","author":"@skillopedia","version":"v1","category":"web-development","category_label":"Web"},"exact_dupes_collapsed_into_this":0},"version":"v1","category":"web-development","import_tag":"clean-skills-v1","description":"Fetch tweets, replies, and user timelines from X/Twitter without login or API keys. Also supports Chinese platforms (Weibo, Bilibili, CSDN, WeChat). Includes camofox_search() for zero-cost Google search without API keys. Basic tweet fetching: zero dependencies. Replies/timelines/search: requires Camofox.\n"}},"renderedAt":1782981742070}

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.