crawl-cli

Web Crawling & Data Extraction

Safety Notice

This listing is imported from the skills.sh public index metadata. Review the upstream SKILL.md and repository scripts before running them.

Copy the following and send it to your AI assistant to install the skill:

Install the skill "crawl-cli" with this command: npx skills add erikpr1994/jarvis-code/erikpr1994-jarvis-code-crawl-cli

Overview

This skill covers web crawling and scraping patterns for data extraction using modern browser automation tools, with a focus on ethical scraping, rate limiting, error handling, and scalable extraction pipelines.

When to Use

  • Building data extraction pipelines

  • Automating browser interactions

  • Scraping websites for content or data

  • Monitoring web page changes

  • Generating screenshots or PDFs

Quick Reference

| Tool | Best For | Speed | JS Support |
|------------|----------------------------|-----------|------------|
| Playwright | E2E testing + scraping | Fast | Full |
| Puppeteer | Chrome-specific automation | Fast | Full |
| Cheerio | Static HTML parsing | Fastest | None |
| Crawlee | Large-scale crawling | Optimized | Both |
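
For fully static pages, skipping the browser entirely is the big win in the table above. A minimal sketch with Cheerio (the URL and selector are illustrative, and the code assumes Node 18+ for the built-in fetch):

// Static-HTML extraction with Cheerio: no browser, no JS execution.
import * as cheerio from 'cheerio';

async function scrapeStatic(url: string): Promise<string[]> {
  const response = await fetch(url);
  const html = await response.text();
  const $ = cheerio.load(html);
  // Collect all h2 heading texts as a simple example
  return $('h2')
    .map((_, el) => $(el).text().trim())
    .get();
}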

Playwright Setup (Recommended)

Installation

npm install playwright
npx playwright install chromium

Basic Scraper

// lib/scraper.ts
import { chromium, Browser } from 'playwright';

export class Scraper {
  private browser: Browser | null = null;

  async init(): Promise<void> {
    this.browser = await chromium.launch({ headless: true });
  }

  async scrape(url: string): Promise<string> {
    if (!this.browser) throw new Error('Browser not initialized');

    const context = await this.browser.newContext({
      userAgent: 'Mozilla/5.0 (compatible; MyBot/1.0; +https://example.com/bot)',
    });
    const page = await context.newPage();

    try {
      await page.goto(url, { waitUntil: 'networkidle' });
      const content = await page.content();
      return content;
    } finally {
      await context.close();
    }
  }

  async close(): Promise<void> {
    if (this.browser) {
      await this.browser.close();
    }
  }
}
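
A typical lifecycle for the class, as a sketch (the URL is illustrative):

// Initialize once, reuse across pages, always close.
const scraper = new Scraper();
await scraper.init();
try {
  const html = await scraper.scrape('https://example.com/page');
  console.log(`Fetched ${html.length} characters`);
} finally {
  await scraper.close();
}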

Data Extraction Pattern

// lib/extractor.ts
import { Page } from 'playwright';

interface ProductData {
  title: string;
  price: string;
  description: string;
  images: string[];
}

export async function extractProduct(page: Page): Promise<ProductData> {
  return await page.evaluate(() => {
    return {
      title: document.querySelector('h1.product-title')?.textContent?.trim() || '',
      price: document.querySelector('.price')?.textContent?.trim() || '',
      description: document.querySelector('.description')?.textContent?.trim() || '',
      images: Array.from(document.querySelectorAll('.product-image img'))
        .map(img => (img as HTMLImageElement).src),
    };
  });
}
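
Wired to a live page, the extractor might be used like this (a sketch; the product URL is hypothetical and the selectors above must match the target site's markup):

// Navigate to a product page, then run the extractor against it.
const browser = await chromium.launch({ headless: true });
const page = await browser.newPage();
await page.goto('https://example.com/products/1', { waitUntil: 'networkidle' });
const product = await extractProduct(page);
console.log(product.title, product.price);
await browser.close();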

Rate Limiting

Essential Rate Limiter

// lib/rate-limiter.ts
export class RateLimiter {
  private queue: Array<() => Promise<void>> = [];
  private processing = false;
  private lastRequest = 0;
  private readonly minDelay: number;

  constructor(requestsPerSecond: number = 1) {
    this.minDelay = 1000 / requestsPerSecond;
  }

  async schedule<T>(fn: () => Promise<T>): Promise<T> {
    return new Promise((resolve, reject) => {
      this.queue.push(async () => {
        try {
          const result = await fn();
          resolve(result);
        } catch (error) {
          reject(error);
        }
      });
      this.processQueue();
    });
  }

  private async processQueue(): Promise<void> {
    if (this.processing || this.queue.length === 0) return;

    this.processing = true;

    while (this.queue.length > 0) {
      const now = Date.now();
      const timeSinceLastRequest = now - this.lastRequest;

      if (timeSinceLastRequest < this.minDelay) {
        await this.sleep(this.minDelay - timeSinceLastRequest);
      }

      const task = this.queue.shift();
      if (task) {
        this.lastRequest = Date.now();
        await task();
      }
    }

    this.processing = false;
  }

  private sleep(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}

// Usage
const limiter = new RateLimiter(2); // 2 requests per second
await limiter.schedule(() => scraper.scrape(url));

Adaptive Rate Limiting

// lib/adaptive-rate-limiter.ts
export class AdaptiveRateLimiter {
  private delay: number;
  private readonly minDelay: number;
  private readonly maxDelay: number;
  private consecutiveErrors = 0;

  constructor(options: {
    initialDelay?: number;
    minDelay?: number;
    maxDelay?: number;
  } = {}) {
    this.delay = options.initialDelay || 1000;
    this.minDelay = options.minDelay || 500;
    this.maxDelay = options.maxDelay || 30000;
  }

  async wait(): Promise<void> {
    await new Promise(resolve => setTimeout(resolve, this.delay));
  }

  onSuccess(): void {
    this.consecutiveErrors = 0;
    // Gradually decrease delay on success
    this.delay = Math.max(this.minDelay, this.delay * 0.9);
  }

  onError(statusCode?: number): void {
    this.consecutiveErrors++;

    if (statusCode === 429) {
      // Rate limited - significant backoff
      this.delay = Math.min(this.maxDelay, this.delay * 3);
    } else {
      // Other error - moderate backoff
      this.delay = Math.min(this.maxDelay, this.delay * 1.5);
    }
  }

  shouldAbort(): boolean {
    return this.consecutiveErrors > 10;
  }
}
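
The class is driven from the fetch loop: wait before each request, then report the outcome so the delay adapts. A sketch, assuming a urls array and the Scraper from above:

const adaptive = new AdaptiveRateLimiter({ initialDelay: 1000 });

for (const url of urls) {
  if (adaptive.shouldAbort()) break; // too many consecutive failures

  await adaptive.wait();
  try {
    await scraper.scrape(url);
    adaptive.onSuccess(); // shrink the delay gradually
  } catch (error) {
    adaptive.onError((error as any).statusCode); // back off, hard on 429
  }
}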

Error Handling

Retry Strategy

// lib/retry.ts
interface RetryOptions {
  maxRetries: number;
  baseDelay: number;
  maxDelay: number;
  retryOn?: number[];
}

export async function withRetry<T>(
  fn: () => Promise<T>,
  options: RetryOptions
): Promise<T> {
  const { maxRetries, baseDelay, maxDelay, retryOn = [429, 500, 502, 503, 504] } = options;

  let lastError: Error | null = null;

  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error as Error;

      // Check if we should retry
      const statusCode = (error as any).statusCode;
      if (statusCode && !retryOn.includes(statusCode)) {
        throw error;
      }

      if (attempt < maxRetries) {
        // Exponential backoff with jitter
        const delay = Math.min(
          maxDelay,
          baseDelay * Math.pow(2, attempt) + Math.random() * 1000
        );
        await new Promise(resolve => setTimeout(resolve, delay));
      }
    }
  }

  throw lastError;
}

// Usage
const result = await withRetry(
  () => scraper.scrape(url),
  { maxRetries: 3, baseDelay: 1000, maxDelay: 30000 }
);

Error Classification

// lib/errors.ts
export class ScraperError extends Error {
  constructor(
    message: string,
    public readonly code: string,
    public readonly recoverable: boolean,
    public readonly url?: string
  ) {
    super(message);
    this.name = 'ScraperError';
  }
}

export const ErrorCodes = {
  BLOCKED: 'BLOCKED',
  RATE_LIMITED: 'RATE_LIMITED',
  NOT_FOUND: 'NOT_FOUND',
  TIMEOUT: 'TIMEOUT',
  PARSE_ERROR: 'PARSE_ERROR',
  NETWORK_ERROR: 'NETWORK_ERROR',
} as const;

export function classifyError(error: Error, statusCode?: number): ScraperError {
  if (statusCode === 403) {
    return new ScraperError('Access blocked', ErrorCodes.BLOCKED, false);
  }
  if (statusCode === 429) {
    return new ScraperError('Rate limited', ErrorCodes.RATE_LIMITED, true);
  }
  if (statusCode === 404) {
    return new ScraperError('Page not found', ErrorCodes.NOT_FOUND, false);
  }
  if (error.message.includes('timeout')) {
    return new ScraperError('Request timeout', ErrorCodes.TIMEOUT, true);
  }
  return new ScraperError(error.message, ErrorCodes.NETWORK_ERROR, true);
}
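
Classification pairs naturally with the retry helper: recoverable errors get retried, fatal ones surface immediately. A sketch, reusing withRetry and scraper from above:

try {
  await withRetry(
    () => scraper.scrape(url),
    { maxRetries: 3, baseDelay: 1000, maxDelay: 30000 }
  );
} catch (error) {
  const scraperError = classifyError(error as Error, (error as any).statusCode);
  if (!scraperError.recoverable) {
    console.error(`Giving up on ${url}: ${scraperError.code}`);
  }
}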

Complete Crawler Implementation

// lib/crawler.ts
import { chromium, Browser, Page } from 'playwright';
import { RateLimiter } from './rate-limiter';

interface CrawlerOptions {
  maxConcurrency: number;
  requestsPerSecond: number;
  maxDepth: number;
  respectRobotsTxt: boolean;
}

interface CrawlResult {
  url: string;
  status: 'success' | 'error';
  data?: any;
  error?: string;
  timestamp: Date;
}

export class Crawler {
  private browser: Browser | null = null;
  private visited = new Set<string>();
  private queue: Array<{ url: string; depth: number }> = [];
  private results: CrawlResult[] = [];
  private readonly options: CrawlerOptions;
  private rateLimiter: RateLimiter;

  constructor(options: Partial<CrawlerOptions> = {}) {
    this.options = {
      maxConcurrency: 3,
      requestsPerSecond: 1,
      maxDepth: 3,
      respectRobotsTxt: true,
      ...options,
    };
    this.rateLimiter = new RateLimiter(this.options.requestsPerSecond);
  }

  async crawl(
    startUrl: string,
    extractor: (page: Page) => Promise<any>
  ): Promise<CrawlResult[]> {
    this.browser = await chromium.launch({ headless: true });
    this.queue.push({ url: startUrl, depth: 0 });

    try {
      while (this.queue.length > 0) {
        const batch = this.queue.splice(0, this.options.maxConcurrency);
        await Promise.all(
          batch.map(item => this.processUrl(item.url, item.depth, extractor))
        );
      }
    } finally {
      await this.browser?.close();
    }

    return this.results;
  }

  private async processUrl(
    url: string,
    depth: number,
    extractor: (page: Page) => Promise<any>
  ): Promise<void> {
    if (this.visited.has(url)) return;
    this.visited.add(url);

    await this.rateLimiter.schedule(async () => {
      const context = await this.browser!.newContext();
      const page = await context.newPage();

      try {
        const response = await page.goto(url, {
          waitUntil: 'networkidle',
          timeout: 30000,
        });

        if (!response || !response.ok()) {
          this.results.push({
            url,
            status: 'error',
            error: `HTTP ${response?.status()}`,
            timestamp: new Date(),
          });
          return;
        }

        const data = await extractor(page);
        this.results.push({
          url,
          status: 'success',
          data,
          timestamp: new Date(),
        });

        // Discover new links if not at max depth
        if (depth < this.options.maxDepth) {
          const links = await this.extractLinks(page);
          links.forEach(link => {
            if (!this.visited.has(link)) {
              this.queue.push({ url: link, depth: depth + 1 });
            }
          });
        }
      } catch (error) {
        this.results.push({
          url,
          status: 'error',
          error: (error as Error).message,
          timestamp: new Date(),
        });
      } finally {
        await context.close();
      }
    });
  }

  private async extractLinks(page: Page): Promise<string[]> {
    return page.evaluate(() => {
      const baseUrl = window.location.origin;
      return Array.from(document.querySelectorAll('a[href]'))
        .map(a => (a as HTMLAnchorElement).href)
        .filter(href => href.startsWith(baseUrl));
    });
  }
}
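
Putting it together, a sketch (the start URL is illustrative; extractProduct is the extractor defined earlier):

const crawler = new Crawler({ maxDepth: 2, requestsPerSecond: 1 });
const results = await crawler.crawl('https://example.com', extractProduct);

const succeeded = results.filter(r => r.status === 'success');
console.log(`Crawled ${results.length} pages, ${succeeded.length} succeeded`);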

Ethical Scraping

robots.txt Parser

// lib/robots.ts
interface RobotsRule {
  userAgent: string;
  allow: string[];
  disallow: string[];
  crawlDelay?: number;
}

export async function parseRobotsTxt(baseUrl: string): Promise<RobotsRule[]> {
  try {
    const response = await fetch(`${baseUrl}/robots.txt`);
    if (!response.ok) return [];

    const text = await response.text();
    const rules: RobotsRule[] = [];
    let currentRule: RobotsRule | null = null;

    for (const line of text.split('\n')) {
      const trimmed = line.trim();
      if (trimmed.startsWith('#') || !trimmed) continue;

      const [key, ...valueParts] = trimmed.split(':');
      const value = valueParts.join(':').trim();

      switch (key.toLowerCase()) {
        case 'user-agent':
          if (currentRule) rules.push(currentRule);
          currentRule = { userAgent: value, allow: [], disallow: [] };
          break;
        case 'allow':
          currentRule?.allow.push(value);
          break;
        case 'disallow':
          currentRule?.disallow.push(value);
          break;
        case 'crawl-delay':
          if (currentRule) currentRule.crawlDelay = parseInt(value, 10);
          break;
      }
    }

    if (currentRule) rules.push(currentRule);
    return rules;
  } catch {
    return [];
  }
}

export function isAllowed(url: string, rules: RobotsRule[], userAgent: string): boolean {
  const path = new URL(url).pathname;
  const applicableRules = rules.filter(
    r => r.userAgent === '*' || r.userAgent.toLowerCase() === userAgent.toLowerCase()
  );

  for (const rule of applicableRules) {
    for (const disallow of rule.disallow) {
      if (path.startsWith(disallow)) {
        // Check if explicitly allowed
        for (const allow of rule.allow) {
          if (path.startsWith(allow)) return true;
        }
        return false;
      }
    }
  }

  return true;
}
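
A sketch of how the two helpers fit into a crawl setup (the bot name and URLs are illustrative):

const rules = await parseRobotsTxt('https://example.com');

if (!isAllowed('https://example.com/private/page', rules, 'MyBot')) {
  console.log('Skipping: disallowed by robots.txt');
}

// Honor an explicit crawl-delay if one is declared, e.g. by
// feeding it into the rate limiter: new RateLimiter(1 / crawlDelay)
const crawlDelay = rules.find(r => r.userAgent === '*')?.crawlDelay;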

Best Practices Checklist

Ethical Scraping Checklist

  • Check robots.txt before scraping
  • Implement rate limiting (1-2 requests/second max)
  • Use descriptive User-Agent with contact info
  • Handle rate limits gracefully (429 responses)
  • Cache responses to avoid redundant requests (see the caching sketch after this checklist)
  • Respect nofollow and noindex directives
  • Scrape during off-peak hours when possible
  • Don't scrape personal data without consent
  • Review website Terms of Service
  • Implement request timeouts
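
A minimal response cache, as a sketch (in-memory with a TTL; a real pipeline might persist to disk or Redis; the class name is illustrative):

// Hypothetical helper: cache scraped HTML keyed by URL, with expiry.
export class ResponseCache {
  private store = new Map<string, { body: string; expires: number }>();

  constructor(private ttlMs: number = 60 * 60 * 1000) {} // default: 1 hour

  get(url: string): string | null {
    const entry = this.store.get(url);
    if (!entry || Date.now() > entry.expires) {
      this.store.delete(url);
      return null;
    }
    return entry.body;
  }

  set(url: string, body: string): void {
    this.store.set(url, { body, expires: Date.now() + this.ttlMs });
  }
}

// Usage: consult the cache before scheduling a scrape
const cache = new ResponseCache();
const cached = cache.get(url);
const html = cached ?? (await scraper.scrape(url));
if (!cached) cache.set(url, html);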

Common Patterns

Screenshot Capture

async function captureScreenshot(
  url: string,
  options: { fullPage?: boolean; format?: 'png' | 'jpeg' } = {}
): Promise<Buffer> {
  const browser = await chromium.launch();
  const page = await browser.newPage();

  await page.setViewportSize({ width: 1280, height: 720 });
  await page.goto(url, { waitUntil: 'networkidle' });

  const screenshot = await page.screenshot({
    fullPage: options.fullPage ?? false,
    type: options.format ?? 'png',
  });

  await browser.close();
  return screenshot;
}

PDF Generation

async function generatePDF(url: string): Promise<Buffer> {
  const browser = await chromium.launch();
  const page = await browser.newPage();

  await page.goto(url, { waitUntil: 'networkidle' });

  const pdf = await page.pdf({
    format: 'A4',
    printBackground: true,
    margin: { top: '1cm', bottom: '1cm', left: '1cm', right: '1cm' },
  });

  await browser.close();
  return pdf;
}

Handling Dynamic Content

// Three alternative wait strategies; in practice pick the one that
// matches how the target page loads its content.
async function waitForContent(page: Page, selector: string): Promise<void> {
  // 1. Wait for a specific element to appear
  await page.waitForSelector(selector, { timeout: 10000 });

  // 2. Or wait for the network to go idle
  // await page.waitForLoadState('networkidle');

  // 3. Or wait for a specific API response
  // await page.waitForResponse(
  //   response => response.url().includes('/api/data')
  // );
}

Red Flags - STOP

Never:

  • Scrape without rate limiting

  • Ignore robots.txt

  • Scrape login-protected content without authorization

  • Store scraped personal data without consent

  • Overwhelm servers with concurrent requests

  • Bypass anti-bot measures for malicious purposes

Always:

  • Check Terms of Service

  • Implement exponential backoff

  • Use descriptive User-Agent

  • Cache results to reduce requests

  • Handle errors gracefully

  • Document your scraping activity

Integration

Related skills: api-design, database-patterns, testing-patterns
Tools: Playwright, Puppeteer, Cheerio, Crawlee

Source Transparency

This detail page is rendered from real SKILL.md content. Trust labels are metadata-based hints, not a safety guarantee.
