Puppeteer 在处理动态网页和单页应用(SPA)时具有独特的优势:它可以执行 JavaScript、等待异步加载的内容、处理路由变化等。
1. 处理动态内容加载
等待元素出现:
const puppeteer = require('puppeteer');

/**
 * Opens https://example.com, waits for the `.dynamic-content` element to
 * become visible and logs its text content.
 *
 * The browser is closed in a `finally` block so it is also closed when
 * navigation or the selector wait throws.
 *
 * @returns {Promise<void>} Resolves after the content is logged and the
 *   browser is closed.
 */
async function scrapeDynamicContent() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com');

    // Wait for the dynamically loaded element to be visible.
    await page.waitForSelector('.dynamic-content', { visible: true });

    const content = await page.$eval('.dynamic-content', (el) => el.textContent);
    console.log(content);
  } finally {
    await browser.close();
  }
}

// Report failures instead of leaving the returned promise unhandled.
scrapeDynamicContent().catch((error) => {
  console.error('Failed to scrape dynamic content:', error);
  process.exitCode = 1;
});
等待特定条件:
// Block until the document contains at least one element matching `.item`.
const atLeastOneItemExists = () => document.querySelectorAll('.item').length > 0;
await page.waitForFunction(atLeastOneItemExists);
等待网络请求完成:
// Navigate with the 'networkidle2' wait condition so pending network
// requests can settle before the call resolves.
const targetUrl = 'https://example.com';
const navigationOptions = { waitUntil: 'networkidle2' };
await page.goto(targetUrl, navigationOptions);
2. 处理无限滚动
基本无限滚动:
/**
 * Scrapes the text of every `.item` element from an infinitely scrolling page.
 *
 * Scrolls to the bottom of the document repeatedly until the document height
 * stops growing, then reads all items once.
 *
 * Assumes loaded items stay in the DOM (no list virtualization).
 *
 * @returns {Promise<string[]>} Text content of each `.item`, in DOM order,
 *   without duplicates introduced by repeated reads.
 */
async function scrapeInfiniteScroll() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com/infinite-scroll');

    let previousHeight = 0;
    for (;;) {
      // Scroll to the very bottom. Scrolling by a single viewport can leave
      // the height unchanged before the end of the page is reached, which
      // would end the loop too early.
      await page.evaluate(() => {
        window.scrollTo(0, document.body.scrollHeight);
      });

      // Give newly requested content time to load. A plain timer is used
      // because it does not depend on `page.waitForTimeout` being available.
      await new Promise((resolve) => setTimeout(resolve, 1000));

      // An unchanged height means nothing new was appended.
      const currentHeight = await page.evaluate(() => document.body.scrollHeight);
      if (currentHeight === previousHeight) {
        break;
      }
      previousHeight = currentHeight;
    }

    // The selector matches every item on the page, so read once at the end;
    // reading on every pass would add the earlier items again.
    return await page.$$eval('.item', (elements) => elements.map((el) => el.textContent));
  } finally {
    await browser.close();
  }
}
优化的无限滚动:
/**
 * Scrapes the text of every `.item` element from an infinitely scrolling
 * page, stopping after 3 consecutive scroll passes that add no new items.
 *
 * Only the items appended since the previous pass are added to the result,
 * so the result length always equals the number of items seen so far. This
 * keeps the "no new items" comparison meaningful and avoids duplicates.
 *
 * Assumes new items are appended after existing ones.
 *
 * @returns {Promise<string[]>} Text content of each `.item`, in DOM order.
 */
async function scrapeInfiniteScrollOptimized() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com/infinite-scroll');

    const items = [];
    let noNewItemsCount = 0;

    // Stop after 3 consecutive passes without new content.
    while (noNewItemsCount < 3) {
      const itemCountBefore = items.length;

      // Scroll to the bottom.
      await page.evaluate(() => {
        window.scrollTo(0, document.body.scrollHeight);
      });

      // Wait for the loading indicator to disappear.
      try {
        await page.waitForSelector('.loading', { hidden: true, timeout: 3000 });
      } catch (error) {
        // Best effort: an indicator still visible after the timeout is not
        // fatal, but any other failure should not be hidden.
        if (error?.name !== 'TimeoutError') {
          throw error;
        }
      }

      const allItems = await page.$$eval('.item', (elements) =>
        elements.map((el) => el.textContent),
      );

      if (allItems.length <= itemCountBefore) {
        noNewItemsCount++;
        // Pause briefly so the retries are spread over time instead of
        // running back to back when no loading indicator is present.
        await new Promise((resolve) => setTimeout(resolve, 500));
      } else {
        noNewItemsCount = 0;
        // Append only the items that were not present on the previous pass.
        items.push(...allItems.slice(itemCountBefore));
      }
    }

    return items;
  } finally {
    await browser.close();
  }
}
3. 处理 SPA 路由
监听路由变化:
/**
 * Opens https://example.com, logs every frame navigation together with the
 * text of the `.content` element, and clicks through two navigation links.
 *
 * @returns {Promise<void>} Resolves after both links were clicked and the
 *   browser is closed.
 */
async function handleSPARoutes() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com');

    // Listen for route changes.
    page.on('framenavigated', async (frame) => {
      try {
        console.log('Navigated to:', frame.url());

        // Wait for the page content to load.
        await frame.waitForSelector('.content');
        const title = await frame.$eval('.content', (el) => el.textContent);
        console.log('Page title:', title);
      } catch (error) {
        // Nothing awaits the promise returned by this listener, so a
        // rejection here would otherwise surface as an unhandled rejection.
        console.error('Failed to read content after navigation:', error);
      }
    });

    // Click the navigation links, pausing after each so the listener can run.
    await page.click('#about-link');
    await new Promise((resolve) => setTimeout(resolve, 1000));

    await page.click('#contact-link');
    await new Promise((resolve) => setTimeout(resolve, 1000));
  } finally {
    await browser.close();
  }
}
等待特定路由:
/**
 * Polls the page until `window.location.pathname` equals `path`.
 *
 * Errors thrown by `page.evaluate` propagate to the caller, and the wait is
 * bounded by `timeout` so a route that never appears cannot hang forever.
 *
 * @param {object} page - Object exposing `evaluate()`, e.g. a Puppeteer page.
 * @param {string} path - Pathname to wait for, e.g. '/about'.
 * @param {object} [options]
 * @param {number} [options.timeout=30000] - Maximum time to wait, in ms.
 * @param {number} [options.interval=100] - Delay between checks, in ms.
 * @returns {Promise<void>} Resolves once the pathname matches.
 * @throws {Error} If the pathname does not match before the timeout elapses.
 */
async function waitForRoute(page, path, { timeout = 30000, interval = 100 } = {}) {
  const deadline = Date.now() + timeout;

  for (;;) {
    const currentPath = await page.evaluate(() => window.location.pathname);
    if (currentPath === path) {
      return;
    }
    if (Date.now() >= deadline) {
      throw new Error(
        `Timed out after ${timeout}ms waiting for route "${path}" (current: "${currentPath}")`,
      );
    }
    await new Promise((resolve) => setTimeout(resolve, interval));
  }
}

// Usage
await page.click('#about-link');
await waitForRoute(page, '/about');
4. 处理 AJAX 请求
等待特定 API 响应:
/**
 * Waits for the first response whose URL contains `urlPattern`.
 *
 * The listener is detached as soon as a matching response arrives, so
 * repeated calls do not accumulate listeners on the page.
 *
 * @param {object} page - Object exposing `on()`/`off()` for 'response' events.
 * @param {string} urlPattern - Substring to look for in the response URL.
 * @returns {Promise<object>} Resolves with the matching response.
 */
function waitForAPIResponse(page, urlPattern) {
  return new Promise((resolve) => {
    const onResponse = (response) => {
      if (!response.url().includes(urlPattern)) {
        return;
      }
      page.off('response', onResponse);
      resolve(response);
    };
    page.on('response', onResponse);
  });
}

// Usage: start listening before clicking so the response cannot be missed.
// `Promise.all` resolves with an array, so the response is destructured out.
const [apiResponse] = await Promise.all([
  waitForAPIResponse(page, '/api/data'),
  page.click('#load-data-button'),
]);
const data = await apiResponse.json();
console.log(data);
拦截和修改 API 请求:
// Intercept requests so matching ones can be modified before they are sent.
await page.setRequestInterception(true);

page.on('request', (interceptedRequest) => {
  const isDataApiCall = interceptedRequest.url().includes('/api/data');

  // Requests to other URLs continue unchanged.
  if (!isDataApiCall) {
    interceptedRequest.continue();
    return;
  }

  // Modify the request: keep its headers and add an Authorization header.
  const headers = {
    ...interceptedRequest.headers(),
    'Authorization': 'Bearer token',
  };
  interceptedRequest.continue({ headers });
});
5. 处理 WebSocket
监听 WebSocket 消息:
// Create a CDP session for the page and enable the Network domain.
const client = await page.target().createCDPSession();
await client.send('Network.enable');

// Log the payload of every WebSocket frame, labelled by direction.
const frameLogPrefixes = [
  ['Network.webSocketFrameReceived', 'WebSocket message:'],
  ['Network.webSocketFrameSent', 'WebSocket sent:'],
];
for (const [eventName, prefix] of frameLogPrefixes) {
  client.on(eventName, ({ response }) => {
    console.log(prefix, response.payloadData);
  });
}
6. 处理客户端渲染
等待客户端渲染完成:
/**
 * Waits for client-side rendering to finish by applying three checks in
 * sequence: a rendered element, an app-provided completion flag, and an idle
 * network.
 *
 * @param {object} page - Puppeteer page exposing `waitForSelector`,
 *   `waitForFunction` and `waitForNetworkIdle`.
 * @returns {Promise<void>} Resolves once all three waits have completed.
 */
async function waitForClientRendering(page) {
  // Method 1: wait for a specific element.
  await page.waitForSelector('.rendered-content');

  // Method 2: wait for the render-complete flag.
  await page.waitForFunction(() => window.__RENDER_COMPLETE__ === true);

  // Method 3: wait for the network to become idle. Checking that at least
  // one resource entry exists only shows that loading has started, not that
  // it has finished.
  await page.waitForNetworkIdle();
}
处理 React/Vue 应用:
/**
 * Scrapes the text of every `.item` element from a React application after
 * it has mounted, loaded its initial data and loaded more items.
 *
 * The browser is closed in a `finally` block so it is also closed when any
 * wait or click throws.
 *
 * @returns {Promise<string[]>} Text content of each `.item`, in DOM order.
 */
async function scrapeReactApp() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com/react-app');

    // Wait for the React application to mount.
    await page.waitForSelector('#root');

    // Wait for the initial data to finish loading.
    await page.waitForFunction(() => window.__INITIAL_STATE__?.loaded === true);

    // Interact with the application and wait for the new items to appear.
    await page.click('#load-more-button');
    await page.waitForSelector('.new-items');

    return await page.$$eval('.item', (elements) => elements.map((el) => el.textContent));
  } finally {
    await browser.close();
  }
}
7. 实际应用场景
场景 1:抓取社交媒体动态内容
/**
 * Scrapes posts from a user's page by scrolling until enough posts have been
 * collected or scrolling stops producing new ones.
 *
 * @param {string} username - Appended to `https://social-media.com/`.
 * @param {object} [options]
 * @param {number} [options.targetCount=50] - Stop once at least this many
 *   posts were collected. A batch is added whole, so the result may contain
 *   more than `targetCount` posts.
 * @param {number} [options.maxIdleScrolls=3] - Stop after this many
 *   consecutive scrolls that yield no new post, so a page with fewer than
 *   `targetCount` posts cannot loop forever.
 * @returns {Promise<Array<{id: string, content: string, likes: string, timestamp: string}>>}
 *   Collected posts; a field is `undefined` when its element is missing.
 */
async function scrapeSocialMediaPosts(username, { targetCount = 50, maxIdleScrolls = 3 } = {}) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(`https://social-media.com/${username}`);

    const posts = [];
    // Kept across passes so ids are not re-collected on every iteration.
    const seenIds = new Set();
    let idleScrolls = 0;

    // Scroll to load more posts.
    while (posts.length < targetCount && idleScrolls < maxIdleScrolls) {
      await page.evaluate(() => {
        window.scrollBy(0, window.innerHeight);
      });

      // Wait for new posts to load.
      await new Promise((resolve) => setTimeout(resolve, 2000));

      // Read every post currently in the DOM.
      const visiblePosts = await page.$$eval('.post', (elements) =>
        elements.map((post) => ({
          id: post.dataset.id,
          content: post.querySelector('.content')?.textContent,
          likes: post.querySelector('.likes')?.textContent,
          timestamp: post.querySelector('.timestamp')?.textContent,
        })),
      );

      // Keep only posts that were not collected on an earlier pass.
      const freshPosts = visiblePosts.filter((post) => !seenIds.has(post.id));
      for (const post of freshPosts) {
        seenIds.add(post.id);
      }
      posts.push(...freshPosts);

      idleScrolls = freshPosts.length === 0 ? idleScrolls + 1 : 0;
    }

    return posts;
  } finally {
    await browser.close();
  }
}
场景 2:抓取电商网站商品列表
/**
 * Scrapes product cards from a paginated category listing, following the
 * "next page" control until it is absent or disabled.
 *
 * The browser is closed in a `finally` block so it is also closed when a
 * wait or click throws.
 *
 * @param {string} categoryUrl - URL of the first listing page.
 * @returns {Promise<Array<{id: string, title: string, price: string, rating: string}>>}
 *   Products from all visited pages, in page order; a field is `undefined`
 *   when its element is missing.
 */
async function scrapeEcommerceProducts(categoryUrl) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(categoryUrl);

    const products = [];

    for (;;) {
      // Wait for the product cards to load.
      await page.waitForSelector('.product-card');

      // Collect the products on the current page.
      const pageProducts = await page.$$eval('.product-card', (cards) =>
        cards.map((card) => ({
          id: card.dataset.id,
          title: card.querySelector('.title')?.textContent,
          price: card.querySelector('.price')?.textContent,
          rating: card.querySelector('.rating')?.textContent,
        })),
      );
      products.push(...pageProducts);

      // No enabled "next page" control means this was the last page.
      const nextButton = await page.$('.next-page:not(.disabled)');
      if (!nextButton) {
        break;
      }

      // Go to the next page and give it time to start updating.
      // NOTE(review): a fixed delay can re-read the old cards on a slow
      // site; consider waiting for a page-specific change instead.
      await nextButton.click();
      await new Promise((resolve) => setTimeout(resolve, 1000));
    }

    return products;
  } finally {
    await browser.close();
  }
}
场景 3:抓取实时数据更新
/**
 * Records DOM child-list mutations on a page for a fixed period.
 *
 * A `MutationObserver` installed in the page appends one record per
 * `childList` mutation to `window.__DATA_UPDATES__`; the records are read
 * back once the observation period has elapsed.
 *
 * @param {string} url - Page to observe.
 * @param {number} [durationMs=30000] - How long to observe, in ms.
 * @returns {Promise<Array<{timestamp: number, addedNodes: number}>>} One
 *   record per observed mutation: the time it was seen and the number of
 *   nodes it added.
 */
async function scrapeRealTimeData(url, durationMs = 30000) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);

    // Watch for DOM changes inside the page.
    await page.evaluate(() => {
      const observer = new MutationObserver((mutations) => {
        for (const mutation of mutations) {
          if (mutation.type !== 'childList') {
            continue;
          }
          window.__DATA_UPDATES__ = window.__DATA_UPDATES__ || [];
          window.__DATA_UPDATES__.push({
            timestamp: Date.now(),
            addedNodes: mutation.addedNodes.length,
          });
        }
      });

      observer.observe(document.body, { childList: true, subtree: true });
    });

    // Let the observer collect updates for the requested period.
    await new Promise((resolve) => setTimeout(resolve, durationMs));

    // Read back what was collected; empty when nothing changed.
    return await page.evaluate(() => window.__DATA_UPDATES__ || []);
  } finally {
    await browser.close();
  }
}
8. 最佳实践
1. 使用适当的等待策略:
// Prefer waitForSelector when waiting for an element.
const elementSelector = '.element';
await page.waitForSelector(elementSelector);

// Use waitForFunction for more complex conditions.
const hasMoreThanTenItems = () => document.querySelectorAll('.item').length > 10;
await page.waitForFunction(hasMoreThanTenItems);

// Use waitForResponse for network requests.
const isDataApiResponse = (response) => response.url().includes('/api/data');
await page.waitForResponse(isDataApiResponse);
2. 避免硬编码等待时间:
// Bad practice: pause for a fixed amount of time.
const fixedDelayMs = 5000;
await page.waitForTimeout(fixedDelayMs);

// Good practice: wait for the element that signals the content is loaded.
const loadedContentSelector = '.loaded-content';
await page.waitForSelector(loadedContentSelector);
3. 处理加载失败:
// Wait up to 10 seconds for the content; on failure, log and fall back.
const contentSelector = '.content';
const contentWaitOptions = { timeout: 10000 };

try {
  await page.waitForSelector(contentSelector, contentWaitOptions);
} catch {
  console.log('Content failed to load, using fallback');
  // Apply the fallback strategy here.
}
4. 优化性能:
// Skip resources that are not needed: abort image, font and media requests
// and let every other request continue.
const blockedResourceTypes = new Set(['image', 'font', 'media']);

await page.setRequestInterception(true);

page.on('request', (interceptedRequest) => {
  if (blockedResourceTypes.has(interceptedRequest.resourceType())) {
    interceptedRequest.abort();
    return;
  }
  interceptedRequest.continue();
});
5. 处理反爬虫:
// Set a realistic user agent.
await page.setUserAgent('Mozilla/5.0 ...');

// Add a random delay between 1000 and 3000 ms. A plain timer is used so the
// snippet does not depend on `page.waitForTimeout` being available.
const randomDelay = () => Math.random() * 2000 + 1000;
await new Promise((resolve) => setTimeout(resolve, randomDelay()));

// Simulate human behavior by scrolling a random distance of up to 500 px.
await page.evaluate(() => {
  window.scrollBy(0, Math.random() * 500);
});