乐闻世界logo
搜索文章和话题

Puppeteer 在实际项目中有哪些应用场景?请举例说明网页爬虫、自动化测试等具体实现。

2月19日 19:48

Puppeteer 在实际项目中有广泛的应用场景,从网页爬虫到自动化测试,从数据采集到性能监控。以下是一些典型的实际应用案例。

1. 网页爬虫和数据采集

案例 1:电商商品价格监控

javascript
const puppeteer = require('puppeteer');

/**
 * Visits each product URL in turn and scrapes its title, price, availability
 * and rating from the rendered page.
 *
 * @param {string[]} productUrls - Product page URLs to visit sequentially.
 * @returns {Promise<object[]>} One record per URL: the URL, the scraped text
 *   fields (undefined when the element is absent) and an ISO timestamp.
 * @throws Propagates navigation and selector-timeout errors; the page and
 *   the browser are closed before the error reaches the caller.
 */
async function monitorProductPrices(productUrls) {
  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  const results = [];

  try {
    for (const url of productUrls) {
      const page = await browser.newPage();
      try {
        // Set a user agent so the request is less likely to be flagged as a crawler.
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');

        await page.goto(url, { waitUntil: 'networkidle2' });

        // Wait for the price element to be rendered.
        await page.waitForSelector('.price', { timeout: 5000 });

        const productData = await page.evaluate(() => ({
          title: document.querySelector('.product-title')?.textContent,
          price: document.querySelector('.price')?.textContent,
          availability: document.querySelector('.availability')?.textContent,
          rating: document.querySelector('.rating')?.textContent,
        }));

        results.push({
          url,
          ...productData,
          timestamp: new Date().toISOString(),
        });
      } finally {
        // Close the tab even when scraping this URL failed.
        await page.close();
      }
    }
  } finally {
    // Always shut the browser process down, otherwise a failure leaves it running.
    await browser.close();
  }

  return results;
}

// Usage example
const products = [
  'https://example.com/product/1',
  'https://example.com/product/2',
];

monitorProductPrices(products)
  .then((data) => {
    console.log(JSON.stringify(data, null, 2));
  })
  .catch((error) => {
    console.error('Price monitoring failed:', error);
    process.exitCode = 1;
  });

案例 2:社交媒体数据抓取

javascript
/**
 * Logs in, opens a user's profile, scrolls until no further posts load, and
 * returns the text fields of every `.post` element on the page.
 *
 * @param {string} username - Profile name appended to the site URL.
 * @param {number} [maxScrolls=Infinity] - Upper bound on scroll steps; the
 *   default keeps scrolling until a scroll step loads no new post.
 * @returns {Promise<object[]>} One `{ content, likes, comments, date }` record
 *   per post (a field is undefined when its element is absent).
 * @throws Propagates navigation/interaction errors after closing the browser.
 */
async function scrapeSocialMedia(username, maxScrolls = Infinity) {
  const browser = await puppeteer.launch({ headless: 'new' });

  try {
    const page = await browser.newPage();

    // Simulated login
    await page.goto('https://social-media.com/login');
    await page.type('#username', 'your_username');
    await page.type('#password', 'your_password');
    // Start waiting before clicking: a navigation that finishes before
    // waitForNavigation() is registered would otherwise never be observed.
    await Promise.all([
      page.waitForNavigation(),
      page.click('#login-button'),
    ]);

    // Open the user's page
    await page.goto(`https://social-media.com/${username}`);

    // Scroll to load more content. Progress is measured by the number of
    // posts growing; waiting for a selector that already matches resolves
    // immediately and would keep the loop running forever.
    for (let scrolls = 0; scrolls < maxScrolls; scrolls += 1) {
      const previousCount = await page.evaluate(
        () => document.querySelectorAll('.post').length,
      );

      await page.evaluate(() => {
        window.scrollBy(0, window.innerHeight);
      });

      try {
        await page.waitForFunction(
          (count) => document.querySelectorAll('.post').length > count,
          { timeout: 2000 },
          previousCount,
        );
      } catch (error) {
        // A timeout means nothing new was loaded; anything else is a real failure.
        if (error.name !== 'TimeoutError') {
          throw error;
        }
        break;
      }
    }

    // Collect the post data
    return await page.evaluate(() =>
      Array.from(document.querySelectorAll('.post')).map((post) => ({
        content: post.querySelector('.content')?.textContent,
        likes: post.querySelector('.likes')?.textContent,
        comments: post.querySelector('.comments')?.textContent,
        date: post.querySelector('.date')?.textContent,
      })),
    );
  } finally {
    await browser.close();
  }
}

2. 自动化测试

案例 3:E2E 测试

javascript
const { expect } = require('expect-puppeteer');

/**
 * Clicks an element that triggers a page load and waits for that load.
 * The wait is registered before the click so a fast navigation is not missed.
 */
async function clickAndWaitForNavigation(page, selector) {
  await Promise.all([
    page.waitForNavigation(),
    page.click(selector),
  ]);
}

/**
 * Runs a registration + login end-to-end scenario against example.com.
 *
 * Failures inside the scenario are logged and a screenshot is written to
 * `test-failure.png`; they are not rethrown. The browser is always closed.
 *
 * @returns {Promise<void>}
 */
async function runE2ETest() {
  const browser = await puppeteer.launch({
    headless: 'new',
    slowMo: 50, // Slow each operation down so the run is easier to observe
  });
  let page;

  try {
    page = await browser.newPage();

    // Exercise the user registration flow
    await page.goto('https://example.com/register');

    // Fill in the registration form
    await page.type('#username', 'testuser');
    await page.type('#email', 'test@example.com');
    await page.type('#password', 'password123');
    await page.type('#confirm-password', 'password123');

    // Submit the form
    await clickAndWaitForNavigation(page, '#register-button');

    // Verify that registration succeeded
    await expect(page).toMatch('Welcome, testuser!');

    // Exercise the login flow
    await clickAndWaitForNavigation(page, '#logout-button');

    await page.type('#login-email', 'test@example.com');
    await page.type('#login-password', 'password123');
    await clickAndWaitForNavigation(page, '#login-button');

    // Verify that login succeeded
    await expect(page).toMatch('Welcome back!');

    console.log('E2E test passed!');
  } catch (error) {
    console.error('E2E test failed:', error);

    // Save a screenshot of the failure. If the screenshot itself fails
    // (for example because no page was opened), report that separately
    // instead of throwing a second error out of the handler.
    if (page) {
      try {
        await page.screenshot({ path: 'test-failure.png' });
      } catch (screenshotError) {
        console.error('Could not save failure screenshot:', screenshotError);
      }
    }
  } finally {
    await browser.close();
  }
}

runE2ETest().catch((error) => {
  // Only launch/close failures reach this point; scenario failures are logged above.
  console.error('E2E test could not run:', error);
  process.exitCode = 1;
});

案例 4:视觉回归测试

javascript
const fs = require('fs');
const pixelmatch = require('pixelmatch');
const { PNG } = require('pngjs');

/**
 * Screenshots a page and compares it with a stored baseline image.
 *
 * When no baseline exists yet, the screenshot is saved as the baseline and
 * the check passes. Otherwise a diff image is written to `diff.png` and the
 * check fails when more than 0.5% of the pixels differ, or when the image
 * dimensions no longer match the baseline.
 *
 * @param {string} url - Page to capture.
 * @param {string} baselinePath - Location of the baseline PNG.
 * @returns {Promise<boolean>} true when the page matches (or a baseline was
 *   just created), false when a visual regression was detected.
 * @throws Propagates navigation and file-system errors after closing the browser.
 */
async function visualRegressionTest(url, baselinePath) {
  const browser = await puppeteer.launch();
  let screenshot;

  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle2' });

    // Capture the current page. Buffer.from() normalizes the result to a
    // Buffer, which is what the PNG reader below is given for the baseline too.
    screenshot = Buffer.from(await page.screenshot());
  } finally {
    await browser.close();
  }

  // Without a baseline image, store the current screenshot as the baseline
  if (!fs.existsSync(baselinePath)) {
    fs.writeFileSync(baselinePath, screenshot);
    console.log('Baseline image created');
    return true;
  }

  // Load the baseline image
  const baseline = PNG.sync.read(fs.readFileSync(baselinePath));
  const current = PNG.sync.read(screenshot);

  // The pixel comparison indexes both buffers with the baseline's dimensions,
  // so a size change is reported as a regression instead of being compared.
  if (baseline.width !== current.width || baseline.height !== current.height) {
    console.log(
      `Visual regression detected! Image size changed from ${baseline.width}x${baseline.height} to ${current.width}x${current.height}`,
    );
    return false;
  }

  // Compare the images
  const diff = new PNG({ width: baseline.width, height: baseline.height });
  const numDiffPixels = pixelmatch(
    baseline.data,
    current.data,
    diff.data,
    baseline.width,
    baseline.height,
    { threshold: 0.1 },
  );

  // Save the diff image
  fs.writeFileSync('diff.png', PNG.sync.write(diff));

  const totalPixels = baseline.width * baseline.height;
  const diffPercentage = (numDiffPixels / totalPixels) * 100;

  console.log(`Difference: ${diffPercentage.toFixed(2)}%`);

  // Fail the check when the difference exceeds the threshold
  if (diffPercentage > 0.5) {
    console.log('Visual regression detected!');
    return false;
  }

  console.log('Visual regression test passed!');
  return true;
}

visualRegressionTest('https://example.com', 'baseline.png').catch((error) => {
  console.error('Visual regression test could not run:', error);
  process.exitCode = 1;
});

3. PDF 生成和文档处理

案例 5:动态报表生成

javascript
/** Escapes a value for safe insertion into HTML text content. */
function escapeReportText(value) {
  const entities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  return String(value).replace(/[&<>"']/g, (character) => entities[character]);
}

/** Builds the sales report HTML document for the given rows. */
function buildReportHtml(data) {
  const rows = data
    .map(
      (item) => `
          <tr>
            <td>${escapeReportText(item.product)}</td>
            <td>${escapeReportText(item.quantity)}</td>
            <td>$${item.price.toFixed(2)}</td>
            <td>$${(item.quantity * item.price).toFixed(2)}</td>
          </tr>
        `,
    )
    .join('');
  const grandTotal = data.reduce((sum, item) => sum + item.quantity * item.price, 0);

  return `
    <!DOCTYPE html>
    <html>
    <head>
      <style>
        body { font-family: Arial, sans-serif; padding: 40px; }
        h1 { color: #333; }
        table { width: 100%; border-collapse: collapse; margin-top: 20px; }
        th, td { border: 1px solid #ddd; padding: 12px; text-align: left; }
        th { background-color: #f2f2f2; }
        .summary { margin-top: 30px; padding: 20px; background-color: #f9f9f9; }
      </style>
    </head>
    <body>
      <h1>销售报表</h1>
      <p>生成时间: ${new Date().toLocaleString()}</p>
      <table>
        <thead>
          <tr>
            <th>产品</th>
            <th>数量</th>
            <th>单价</th>
            <th>总价</th>
          </tr>
        </thead>
        <tbody>
          ${rows}
        </tbody>
      </table>
      <div class="summary">
        <h2>总计: $${grandTotal.toFixed(2)}</h2>
      </div>
    </body>
    </html>
  `;
}

/**
 * Renders a sales report as HTML and prints it to an A4 PDF.
 *
 * @param {{product: string, quantity: number, price: number}[]} data - Report
 *   rows; `price` must be a number because it is formatted with `toFixed`.
 * @param {string} outputPath - Destination path of the PDF file.
 * @returns {Promise<void>}
 * @throws Propagates rendering/PDF errors after closing the browser.
 */
async function generateReport(data, outputPath) {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    // Build the HTML report; product text is escaped so characters such as
    // "<" or "&" in the data cannot break or inject markup.
    await page.setContent(buildReportHtml(data));

    // Produce the PDF
    await page.pdf({
      path: outputPath,
      format: 'A4',
      printBackground: true,
      margin: {
        top: '20px',
        right: '20px',
        bottom: '20px',
        left: '20px',
      },
    });
  } finally {
    await browser.close();
  }

  console.log(`Report generated: ${outputPath}`);
}

// Usage example
const salesData = [
  { product: '产品 A', quantity: 10, price: 99.99 },
  { product: '产品 B', quantity: 5, price: 149.99 },
  { product: '产品 C', quantity: 8, price: 79.99 },
];

generateReport(salesData, 'sales-report.pdf').catch((error) => {
  console.error('Report generation failed:', error);
  process.exitCode = 1;
});

案例 6:发票批量生成

javascript
/** Escapes a value for safe insertion into HTML text content. */
function escapeInvoiceText(value) {
  const entities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  return String(value).replace(/[&<>"']/g, (character) => entities[character]);
}

/** Formats numeric amounts with two decimals; other values are shown escaped as given. */
function formatInvoiceAmount(value) {
  if (typeof value === 'number' && Number.isFinite(value)) {
    return value.toFixed(2);
  }
  return escapeInvoiceText(value);
}

/** Builds the HTML document for a single invoice. */
function buildInvoiceHtml(invoice) {
  const rows = invoice.items
    .map(
      (item) => `
          <tr>
            <td>${escapeInvoiceText(item.name)}</td>
            <td>${escapeInvoiceText(item.quantity)}</td>
            <td>$${formatInvoiceAmount(item.price)}</td>
            <td>$${formatInvoiceAmount(item.quantity * item.price)}</td>
          </tr>
        `,
    )
    .join('');

  return `
    <!DOCTYPE html>
    <html>
    <head>
      <style>
        body { font-family: Arial, sans-serif; padding: 40px; }
        .header { text-align: center; margin-bottom: 40px; }
        .invoice-info { margin-bottom: 30px; }
        table { width: 100%; border-collapse: collapse; }
        th, td { border: 1px solid #ddd; padding: 10px; text-align: left; }
        th { background-color: #f2f2f2; }
        .total { text-align: right; font-weight: bold; margin-top: 20px; }
      </style>
    </head>
    <body>
      <div class="header">
        <h1>发票</h1>
        <p>发票号: ${escapeInvoiceText(invoice.number)}</p>
      </div>
      <div class="invoice-info">
        <p>日期: ${escapeInvoiceText(invoice.date)}</p>
        <p>客户: ${escapeInvoiceText(invoice.customer)}</p>
      </div>
      <table>
        <thead>
          <tr>
            <th>项目</th>
            <th>数量</th>
            <th>单价</th>
            <th>总价</th>
          </tr>
        </thead>
        <tbody>
          ${rows}
        </tbody>
      </table>
      <div class="total">
        总计: $${formatInvoiceAmount(invoice.total)}
      </div>
    </body>
    </html>
  `;
}

/**
 * Renders every invoice to `invoices/invoice-<number>.pdf`, reusing one page.
 *
 * @param {object[]} invoices - Invoices with `number`, `date`, `customer`,
 *   `items` (`name`, `quantity`, `price`) and `total`. The displayed total is
 *   the supplied `total`, not a recomputed sum.
 * @returns {Promise<void>}
 * @throws Propagates rendering/PDF errors after closing the browser.
 */
async function generateInvoices(invoices) {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    for (const invoice of invoices) {
      // Text fields are escaped and amounts fixed to two decimals so values
      // like 0.1 * 3 print as 0.30 rather than 0.30000000000000004.
      await page.setContent(buildInvoiceHtml(invoice));

      // NOTE(review): assumes the invoices/ directory already exists — confirm
      // or create it before calling.
      await page.pdf({
        path: `invoices/invoice-${invoice.number}.pdf`,
        format: 'A4',
        printBackground: true,
      });

      console.log(`Generated invoice: ${invoice.number}`);
    }
  } finally {
    await browser.close();
  }
}

// Usage example
const invoices = [
  {
    number: 'INV-001',
    date: '2024-01-15',
    customer: '客户 A',
    items: [
      { name: '服务 A', quantity: 1, price: 500 },
      { name: '服务 B', quantity: 2, price: 300 },
    ],
    total: 1100,
  },
];

generateInvoices(invoices).catch((error) => {
  console.error('Invoice generation failed:', error);
  process.exitCode = 1;
});

4. 性能监控和分析

案例 7:页面性能分析

javascript
/**
 * Loads a page and collects basic performance figures.
 *
 * @param {string} url - Page to measure.
 * @returns {Promise<object>} `loadTime` (wall-clock ms until `goto` resolved),
 *   `domContentLoaded`, `firstPaint`, `firstContentfulPaint` (undefined when
 *   the page exposes no such paint entry) and `resources` (the metrics array
 *   returned by the `Performance.getMetrics` CDP call).
 * @throws Propagates navigation/CDP errors after closing the browser.
 */
async function analyzePagePerformance(url) {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();

    // Enable performance monitoring
    const client = await page.target().createCDPSession();
    await client.send('Performance.enable');
    await client.send('Network.enable');

    // Record the start time
    const startTime = Date.now();

    await page.goto(url, { waitUntil: 'networkidle2' });

    const loadTime = Date.now() - startTime;

    // Fetch the performance metrics
    const metrics = await client.send('Performance.getMetrics');

    // Read the key timings in a single round trip. Paint entries are looked up
    // by name: relying on their array position mislabels them whenever only
    // one of the two entries is present.
    const pageTimings = await page.evaluate(() => {
      const paintEntries = performance.getEntriesByType('paint');
      const paintStart = (name) =>
        paintEntries.find((entry) => entry.name === name)?.startTime;

      return {
        domContentLoaded:
          performance.timing.domContentLoadedEventEnd - performance.timing.navigationStart,
        firstPaint: paintStart('first-paint'),
        firstContentfulPaint: paintStart('first-contentful-paint'),
      };
    });

    const performanceData = {
      loadTime,
      domContentLoaded: pageTimings.domContentLoaded,
      firstPaint: pageTimings.firstPaint,
      firstContentfulPaint: pageTimings.firstContentfulPaint,
      resources: metrics.metrics,
    };

    // Print the performance report
    console.log('Performance Report:');
    console.log(`Load Time: ${performanceData.loadTime}ms`);
    console.log(`DOM Content Loaded: ${performanceData.domContentLoaded}ms`);
    console.log(`First Paint: ${performanceData.firstPaint}ms`);
    console.log(`First Contentful Paint: ${performanceData.firstContentfulPaint}ms`);

    return performanceData;
  } finally {
    await browser.close();
  }
}

analyzePagePerformance('https://example.com').catch((error) => {
  console.error('Performance analysis failed:', error);
  process.exitCode = 1;
});

5. SEO 工具

案例 8:SEO 检查工具

javascript
/**
 * Loads a page and runs a handful of on-page SEO checks.
 *
 * @param {string} url - Page to audit.
 * @returns {Promise<object>} `title`, `description`, `h1Count`, `imageCount`,
 *   `linkCount`, `anchorLinkCount` (links whose href starts with "#"),
 *   `issues` (missing required elements) and `warnings` (length/count problems).
 * @throws Propagates navigation errors after closing the browser.
 */
async function seoAudit(url) {
  const browser = await puppeteer.launch();
  let seoData;

  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle2' });

    seoData = await page.evaluate(() => {
      const issues = [];
      const warnings = [];

      // Check the title
      const title = document.querySelector('title');
      if (!title) {
        issues.push('Missing title tag');
      } else if (title.textContent.length > 60) {
        warnings.push('Title too long (> 60 characters)');
      }

      // Check the meta description
      const description = document.querySelector('meta[name="description"]');
      if (!description) {
        issues.push('Missing meta description');
      } else if (description.content.length > 160) {
        warnings.push('Meta description too long (> 160 characters)');
      }

      // Check H1 tags
      const h1Tags = document.querySelectorAll('h1');
      if (h1Tags.length === 0) {
        issues.push('Missing H1 tag');
      } else if (h1Tags.length > 1) {
        warnings.push('Multiple H1 tags found');
      }

      // Check image alt attributes. Only a missing attribute is counted: an
      // explicitly empty alt="" is deliberate markup, not an omission.
      const images = document.querySelectorAll('img');
      const missingAlt = Array.from(images).filter((img) => !img.hasAttribute('alt')).length;
      if (missingAlt > 0) {
        warnings.push(`${missingAlt} images missing alt attributes`);
      }

      // Check links. Hrefs starting with "#" are counted and reported under a
      // neutral name; the page alone cannot tell whether such a link is broken.
      const links = document.querySelectorAll('a[href]');
      const anchorLinkCount = Array.from(links).filter((link) =>
        link.getAttribute('href').startsWith('#'),
      ).length;

      return {
        title: title?.textContent,
        description: description?.content,
        h1Count: h1Tags.length,
        imageCount: images.length,
        linkCount: links.length,
        anchorLinkCount,
        issues,
        warnings,
      };
    });
  } finally {
    await browser.close();
  }

  console.log('SEO Audit Results:');
  console.log(JSON.stringify(seoData, null, 2));

  return seoData;
}

seoAudit('https://example.com').catch((error) => {
  console.error('SEO audit failed:', error);
  process.exitCode = 1;
});

6. 最佳实践总结

1. 错误处理:

javascript
// Error-handling pattern: log the failure, try to capture the page state,
// and always close the browser.
try {
  // Page operations go here
} catch (error) {
  console.error('Error:', error);
  // Save a screenshot of the failure. The screenshot is best-effort: if it
  // fails too, log that instead of throwing a second error from the handler.
  try {
    await page.screenshot({ path: 'error.png' });
  } catch (screenshotError) {
    console.error('Failed to save error screenshot:', screenshotError);
  }
} finally {
  await browser.close();
}

2. 资源管理:

javascript
// Release resources promptly: the page first, then the browser.
for (const resource of [page, browser]) {
  await resource.close();
}

3. 性能优化:

javascript
// Skip resource types the task does not need.
await page.setRequestInterception(true);

const blockedResourceTypes = ['image', 'font'];

page.on('request', (request) => {
  // Let everything through except the blocked types.
  if (!blockedResourceTypes.includes(request.resourceType())) {
    request.continue();
    return;
  }
  request.abort();
});

4. 反爬虫策略:

javascript
// Present a realistic user agent.
const userAgent = 'Mozilla/5.0 ...';
await page.setUserAgent(userAgent);

// Add a delay between actions.
const pause = (milliseconds) =>
  new Promise((resolve) => {
    setTimeout(resolve, milliseconds);
  });
await pause(1000);

// Route traffic through a proxy.
const proxyArgs = ['--proxy-server=http://proxy.example.com:8080'];
const browser = await puppeteer.launch({ args: proxyArgs });
标签:Puppeteer