web-scraper — Skillopedia

Web Scraper Skill Description Extract and process data from web pages with intelligent parsing capabilities. Trigger - `/scrape` command - User requests web data extraction - User needs to parse HTML Prompt You are a web scraping expert that extracts data efficiently and ethically. Puppeteer Scraper (TypeScript) Cheerio Parser (Node.js) Rate Limiting Tags `web-scraping`, `data-extraction`, `parsing`, `automation`, `html` Compatibility - Codex: ✅ - Claude Code: ✅ ---

, '') ?? '0'),\n rating: parseFloat(item.querySelector('.rating')?.getAttribute('data-rating') ?? '0'),\n url: item.querySelector('a')?.href ?? '',\n }));\n });\n \n await browser.close();\n return products;\n}\n```\n\n### Cheerio Parser (Node.js)\n\n```typescript\nimport axios from 'axios';\nimport * as cheerio from 'cheerio';\n\nasync function parseArticle(url: string) {\n const { data } = await axios.get(url, {\n headers: { 'User-Agent': 'Mozilla/5.0' }\n });\n \n const $ = cheerio.load(data);\n \n return {\n title: $('h1.article-title').text().trim(),\n author: $('span.author-name').text().trim(),\n date: $('time').attr('datetime'),\n content: $('article.content p').map((_, el) => $(el).text()).get().join('\\n\\n'),\n tags: $('a.tag').map((_, el) => $(el).text()).get(),\n };\n}\n```\n\n### Rate Limiting\n\n```typescript\nclass RateLimiter {\n private queue: (() => Promise\u003cvoid>)[] = [];\n private processing = false;\n \n constructor(private delayMs: number = 1000) {}\n \n async add\u003cT>(fn: () => Promise\u003cT>): Promise\u003cT> {\n return new Promise((resolve, reject) => {\n this.queue.push(async () => {\n try {\n resolve(await fn());\n } catch (e) {\n reject(e);\n }\n });\n this.process();\n });\n }\n \n private async process() {\n if (this.processing) return;\n this.processing = true;\n \n while (this.queue.length > 0) {\n const fn = this.queue.shift()!;\n await fn();\n await new Promise(r => setTimeout(r, this.delayMs));\n }\n \n this.processing = false;\n }\n}\n\n// Usage\nconst limiter = new RateLimiter(2000); // 2 seconds between requests\nconst results = await Promise.all(\n urls.map(url => limiter.add(() => scrapeProducts(url)))\n);\n```\n\n## Tags\n`web-scraping`, `data-extraction`, `parsing`, `automation`, `html`\n\n## Compatibility\n- Codex: ✅\n- Claude Code: ✅\n---","attachment_filenames":[],"attachments":[],"content_json":{"type":"doc","content":[{"type":"heading","attrs":{"level":1},"content":[{"text":"Web Scraper 
Skill","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Description","type":"text"}]},{"type":"paragraph","content":[{"text":"Extract and process data from web pages with intelligent parsing capabilities.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Trigger","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"/scrape","type":"text","marks":[{"type":"code_inline"}]},{"text":" command","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"User requests web data extraction","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"User needs to parse HTML","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Prompt","type":"text"}]},{"type":"paragraph","content":[{"text":"You are a web scraping expert that extracts data efficiently and ethically.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Puppeteer Scraper (TypeScript)","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"typescript"},"content":[{"text":"import puppeteer from 'puppeteer';\n\ninterface Product {\n name: string;\n price: number;\n rating: number;\n url: string;\n}\n\nasync function scrapeProducts(url: string): Promise\u003cProduct[]> {\n const browser = await puppeteer.launch({ headless: 'new' });\n const page = await browser.newPage();\n \n // Set user agent to avoid detection\n await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');\n \n await page.goto(url, { waitUntil: 'networkidle2' });\n \n // Wait for products to load\n await page.waitForSelector('.product-card');\n \n const products = await page.evaluate(() => {\n const items = document.querySelectorAll('.product-card');\n return Array.from(items).map(item => ({\n name: item.querySelector('.product-name')?.textContent?.trim() ?? 
'',\n price: parseFloat(item.querySelector('.price')?.textContent?.replace('

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.

, '') ?? '0'),\n rating: parseFloat(item.querySelector('.rating')?.getAttribute('data-rating') ?? '0'),\n url: item.querySelector('a')?.href ?? '',\n }));\n });\n \n await browser.close();\n return products;\n}","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Cheerio Parser (Node.js)","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"typescript"},"content":[{"text":"import axios from 'axios';\nimport * as cheerio from 'cheerio';\n\nasync function parseArticle(url: string) {\n const { data } = await axios.get(url, {\n headers: { 'User-Agent': 'Mozilla/5.0' }\n });\n \n const $ = cheerio.load(data);\n \n return {\n title: $('h1.article-title').text().trim(),\n author: $('span.author-name').text().trim(),\n date: $('time').attr('datetime'),\n content: $('article.content p').map((_, el) => $(el).text()).get().join('\\n\\n'),\n tags: $('a.tag').map((_, el) => $(el).text()).get(),\n };\n}","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Rate Limiting","type":"text"}]},{"type":"code_block","attrs":{"wrap":false,"language":"typescript"},"content":[{"text":"class RateLimiter {\n private queue: (() => Promise\u003cvoid>)[] = [];\n private processing = false;\n \n constructor(private delayMs: number = 1000) {}\n \n async add\u003cT>(fn: () => Promise\u003cT>): Promise\u003cT> {\n return new Promise((resolve, reject) => {\n this.queue.push(async () => {\n try {\n resolve(await fn());\n } catch (e) {\n reject(e);\n }\n });\n this.process();\n });\n }\n \n private async process() {\n if (this.processing) return;\n this.processing = true;\n \n while (this.queue.length > 0) {\n const fn = this.queue.shift()!;\n await fn();\n await new Promise(r => setTimeout(r, this.delayMs));\n }\n \n this.processing = false;\n }\n}\n\n// Usage\nconst limiter = new RateLimiter(2000); // 2 seconds between requests\nconst results = await Promise.all(\n urls.map(url => limiter.add(() => 
scrapeProducts(url)))\n);","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Tags","type":"text"}]},{"type":"paragraph","content":[{"text":"web-scraping","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"data-extraction","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"parsing","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"automation","type":"text","marks":[{"type":"code_inline"}]},{"text":", ","type":"text"},{"text":"html","type":"text","marks":[{"type":"code_inline"}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Compatibility","type":"text"}]},{"type":"bullet_list","content":[{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Codex: ✅","type":"text"}]}]},{"type":"list_item","content":[{"type":"paragraph","content":[{"text":"Claude Code: ✅","type":"text"}]}]}]},{"type":"hr","attrs":{"markup":"---"}}]},"metadata":{"date":"2026-06-05","name":"web-scraper","author":"@skillopedia","source":{"stars":80,"repo_name":"moyucode","origin_url":"https://github.com/aidotnet/moyucode/blob/HEAD/skills/community/web-scraper/SKILL.md","repo_owner":"aidotnet","body_sha256":"2bec79c63888d4478333a7efd1a412ef6fe7b4aaf1f64c78b4f6d1695cd3e9b7","cluster_key":"b3c0854a241888c3f581679279f63f1046043e66b6c716543419df6a29b2154c","clean_bundle":{"format":"clean-skill-bundle-v1","source":"aidotnet/moyucode/skills/community/web-scraper/SKILL.md","bundle_sha256":"cad452aaadea017c5a7cb652a9675cdcb2bfcd45c6139252f5f2e5c599be033d","attachment_count":0,"text_attachments":0,"binary_attachments":0},"cluster_size":1,"skill_md_path":"skills/community/web-scraper/SKILL.md","import_metadata":{"date":"2026-06-05","author":"@skillopedia","version":"v1","category":"browser-automation-scraping","category_label":"Browser"},"exact_dupes_collapsed_into_this":0},"version":"v1","category":"browser-automation-scraping","metadata":{"short-description
":"从网页提取数据"},"import_tag":"clean-skills-v1","description":"从网页提取和处理数据，使用CSS选择器、XPath智能解析，支持限速和错误处理。"}},"renderedAt":1782988125246}

Important: agents should read /llm.txt, /llms.txt, or /.well-known/skills.json to discover the public Skillopedia API.