From 3eb40e0a4cb88639e8687c694b453f0f8ce05c49 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 16 Apr 2026 09:16:54 +0200 Subject: [PATCH 1/3] docs: add Firecrawl to ScrapeGraph v2 transition guide Comprehensive migration guide covering endpoint mapping, SDK migration (Python + JS), authentication, response format differences, monitoring, crawling, and a step-by-step migration checklist. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs.json | 1 + transition-from-firecrawl.mdx | 531 ++++++++++++++++++++++++++++++++++ 2 files changed, 532 insertions(+) create mode 100644 transition-from-firecrawl.mdx diff --git a/docs.json b/docs.json index 68da0db..ab7bf64 100644 --- a/docs.json +++ b/docs.json @@ -23,6 +23,7 @@ "introduction", "install", "transition-from-v1-to-v2", + "transition-from-firecrawl", { "group": "Use Cases", "pages": [ diff --git a/transition-from-firecrawl.mdx b/transition-from-firecrawl.mdx new file mode 100644 index 0000000..d0dc546 --- /dev/null +++ b/transition-from-firecrawl.mdx @@ -0,0 +1,531 @@ +--- +title: Transition from Firecrawl to ScrapeGraph v2 +description: A practical guide for migrating your scraping workflows from Firecrawl to ScrapeGraph v2 +--- + +## Why switch? + +ScrapeGraph v2 offers AI-powered scraping, extraction, search, crawling, and monitoring through a unified API at a competitive price. If you're coming from Firecrawl, this page maps every endpoint, SDK method, and concept to its ScrapeGraph equivalent so you can migrate quickly. 
+ +## Feature comparison at a glance + +| Capability | Firecrawl | ScrapeGraph v2 | +|---|---|---| +| Single-page scrape | `POST /v2/scrape` | `POST /api/v2/scrape` | +| Structured extraction | `POST /v2/extract` (LLM) | `POST /api/v2/extract` (LLM) | +| Web search | `POST /v2/search` | `POST /api/v2/search` | +| Crawl (multi-page) | `POST /v2/crawl` (async) | `POST /api/v2/crawl` (async) | +| URL discovery / sitemap | `POST /v2/map` | Crawl with patterns or REST sitemap endpoint | +| Batch scrape | `POST /v2/batch/scrape` | Loop over `scrape` or use `crawl` with URL list | +| Scheduled monitoring | Change tracking (format option) | `POST /api/v2/monitor` (first-class, cron-based) | +| Browser interaction | `/v2/scrape/{id}/interact` | `FetchConfig` with `mode: "js"`, `wait`, `scrolls` | +| Autonomous agent | `POST /v2/agent` | `extract` + `FetchConfig` (stealth, JS mode) | +| Schema generation | Manual JSON Schema / Pydantic / Zod | `generate_schema` endpoint (prompt-to-schema) | +| MCP server | Not available | Full MCP server (remote + local) | +| CLI | Not available | `just-scrape` CLI with all services | +| Credits / account | Via dashboard | `credits` endpoint + dashboard | +| Request history | Not available | `history` endpoint with filters | + +## Endpoint mapping + +### Scrape + +Firecrawl returns content in multiple formats via a `formats` array. ScrapeGraph v2 uses a similar approach with `formats` entries, each specifying a `type`. + + + +```bash Firecrawl (cURL) +curl -X POST https://api.firecrawl.dev/v2/scrape \ + -H "Authorization: Bearer fc-..." \ + -H "Content-Type: application/json" \ + -d '{ + "url": "https://example.com", + "formats": ["markdown", "html", "screenshot"] + }' +``` + +```bash ScrapeGraph v2 (cURL) +curl -X POST https://api.scrapegraphai.com/api/v2/scrape \ + -H "SGAI-APIKEY: sgai-..." 
\ + -H "Content-Type: application/json" \ + -d '{ + "url": "https://example.com", + "formats": [ + { "type": "markdown" }, + { "type": "html" }, + { "type": "screenshot" } + ] + }' +``` + + + + + +```python Firecrawl (Python) +from firecrawl import Firecrawl + +fc = Firecrawl(api_key="fc-...") +result = fc.scrape("https://example.com", formats=["markdown"]) +print(result["markdown"]) +``` + +```python ScrapeGraph v2 (Python) +from scrapegraph_py import ScrapeGraphAI, ScrapeRequest + +sgai = ScrapeGraphAI(api_key="sgai-...") +result = sgai.scrape(ScrapeRequest( + url="https://example.com", + formats=[{"type": "markdown"}], +)) +if result.status == "success": + print(result.data) +``` + + + + + +```javascript Firecrawl (Node.js) +import Firecrawl from "@mendable/firecrawl-js"; + +const fc = new Firecrawl({ apiKey: "fc-..." }); +const result = await fc.scrape("https://example.com", { + formats: ["markdown"], +}); +console.log(result.markdown); +``` + +```javascript ScrapeGraph v2 (Node.js) +import { scrape } from "scrapegraph-js"; + +const result = await scrape("sgai-...", { + url: "https://example.com", + formats: [{ type: "markdown" }], +}); +if (result.status === "success") { + console.log(result.data?.results.markdown?.data); +} +``` + + + +**Key differences:** + +| Firecrawl | ScrapeGraph v2 | Notes | +|---|---|---| +| `formats: ["markdown"]` | `formats: [{ type: "markdown" }]` | Object syntax allows per-format options (mode, quality, etc.) 
| +| `only_main_content: true` | `formats: [{ type: "markdown", mode: "reader" }]` | Reader mode strips nav/footer content | +| `actions: [...]` | `fetch_config: { wait: 2000, scrolls: 3 }` | Declarative fetch configuration | +| `location: { country: "US" }` | `fetch_config: { country: "us" }` | ISO code in fetch config | +| `proxy: "enhanced"` | `fetch_config: { stealth: true }` | Residential proxy + anti-bot | +| `maxAge` / cache control | Not applicable | ScrapeGraph always fetches fresh | + +### Extract (structured data) + +Both platforms use LLMs to extract structured data. Firecrawl embeds extraction in the scrape endpoint (via `json` format) or has a dedicated `/extract` endpoint. ScrapeGraph v2 has a dedicated `extract` endpoint. + + + +```python Firecrawl (Python) +result = fc.extract( + urls=["https://example.com"], + prompt="Extract the company name and founding year", + schema={ + "type": "object", + "properties": { + "company": {"type": "string"}, + "founded": {"type": "integer"} + } + }, +) +``` + +```python ScrapeGraph v2 (Python) +from scrapegraph_py import ExtractRequest + +result = sgai.extract(ExtractRequest( + url="https://example.com", + prompt="Extract the company name and founding year", + schema={ + "type": "object", + "properties": { + "company": {"type": "string"}, + "founded": {"type": "integer"} + } + }, +)) +if result.status == "success": + print(result.data) +``` + + + + + +```javascript Firecrawl (Node.js) +const result = await fc.extract({ + urls: ["https://example.com"], + prompt: "Extract the company name and founding year", + schema: { + type: "object", + properties: { + company: { type: "string" }, + founded: { type: "integer" }, + }, + }, +}); +``` + +```javascript ScrapeGraph v2 (Node.js) +import { extract } from "scrapegraph-js"; + +const result = await extract("sgai-...", { + url: "https://example.com", + prompt: "Extract the company name and founding year", + schema: { + type: "object", + properties: { + company: { type: 
"string" }, + founded: { type: "integer" }, + }, + }, +}); +if (result.status === "success") { + console.log(result.data?.json_data); +} +``` + + + +**Key differences:** + +| Firecrawl | ScrapeGraph v2 | Notes | +|---|---|---| +| `urls: [...]` (array, supports wildcards) | `url: "..."` (single URL) | One URL per request; batch via loop or crawl | +| `enableWebSearch: true` | Use `search` endpoint separately | Search is a separate service | +| `agent.model: "FIRE-1"` | `fetch_config: { mode: "js", stealth: true }` | JS rendering + anti-bot for hard pages | + +### Search + +Both offer web search with optional extraction from results. + + + +```python Firecrawl (Python) +result = fc.search( + "best web scraping APIs 2025", + limit=5, +) +``` + +```python ScrapeGraph v2 (Python) +from scrapegraph_py import SearchRequest + +result = sgai.search(SearchRequest( + query="best web scraping APIs 2025", + num_results=5, +)) +``` + + + + + +```javascript Firecrawl (Node.js) +const result = await fc.search("best web scraping APIs 2025", { + limit: 5, +}); +``` + +```javascript ScrapeGraph v2 (Node.js) +import { search } from "scrapegraph-js"; + +const result = await search("sgai-...", { + query: "best web scraping APIs 2025", + numResults: 5, +}); +``` + + + +**Key differences:** + +| Firecrawl | ScrapeGraph v2 | Notes | +|---|---|---| +| `limit` | `num_results` (Python) / `numResults` (JS) | Max 20 per request | +| `sources: ["web", "news", "images"]` | Web results only | Single source, focused results | +| `tbs: "qdr:d"` (time filter) | `time_range: "past_24_hours"` | Human-readable time range values | +| `scrapeOptions` on results | `prompt` + `schema` on request | Extraction built into the search call | +| `location: "US"` | `country: "us"` | ISO code parameter | + +### Crawl + +Both platforms support async multi-page crawling with job management. 
+ + + +```python Firecrawl (Python) +job = fc.start_crawl( + "https://example.com", + limit=100, + max_discovery_depth=3, + include_paths=["/blog/*"], +) +status = fc.get_crawl_status(job["id"]) +``` + +```python ScrapeGraph v2 (Python) +from scrapegraph_py import CrawlRequest + +job = sgai.crawl.start(CrawlRequest( + url="https://example.com", + max_pages=100, + max_depth=3, + include_patterns=["/blog/*"], +)) +if job.status == "success": + status = sgai.crawl.get(job.data.id) +``` + + + + + +```javascript Firecrawl (Node.js) +const job = await fc.startCrawl("https://example.com", { + limit: 100, + maxDiscoveryDepth: 3, + includePaths: ["/blog/*"], +}); +const status = await fc.getCrawlStatus(job.id); +``` + +```javascript ScrapeGraph v2 (Node.js) +import { crawl } from "scrapegraph-js"; + +const job = await crawl.start("sgai-...", { + url: "https://example.com", + maxPages: 100, + maxDepth: 3, + includePatterns: ["/blog/*"], +}); +if (job.status === "success") { + const status = await crawl.get("sgai-...", job.data?.id); +} +``` + + + +**Crawl job management mapping:** + +| Firecrawl | ScrapeGraph v2 | +|---|---| +| `start_crawl()` / `startCrawl()` | `crawl.start()` | +| `get_crawl_status(id)` / `getCrawlStatus(id)` | `crawl.get(id)` | +| `cancel_crawl(id)` / `cancelCrawl(id)` | `crawl.stop(id)` + `crawl.delete(id)` | +| — | `crawl.resume(id)` (pause/resume support) | + +**Key differences:** + +| Firecrawl | ScrapeGraph v2 | Notes | +|---|---|---| +| `limit` (max pages) | `max_pages` (1-1000) | ScrapeGraph caps at 1000 | +| `maxDiscoveryDepth` | `max_depth` | Same concept, different name | +| `includePaths` / `excludePaths` (regex) | `include_patterns` / `exclude_patterns` (glob) | Glob patterns instead of regex | +| `webhook` with events | `webhook_url` on monitors | Webhooks on monitors, not crawl jobs | +| WebSocket watcher | Poll via `crawl.get()` | Polling-based status checks | +| `sitemap: "include"` | Handled automatically or via separate sitemap service | 
— | + +### Map (URL discovery) + +Firecrawl's `/map` endpoint discovers URLs quickly from sitemaps and cached data. ScrapeGraph doesn't have a direct equivalent — use crawl with a shallow depth or the sitemap service. + +```python ScrapeGraph v2 — discover URLs with a shallow crawl +job = sgai.crawl.start(CrawlRequest( + url="https://example.com", + max_depth=1, + max_pages=500, + formats=[{"type": "links"}], +)) +``` + +### Monitoring / change tracking + +Firecrawl uses `changeTracking` as a format option on scrape/crawl. ScrapeGraph v2 has a **dedicated monitor service** with cron scheduling, webhooks, and activity history. + + + +```python Firecrawl (Python) +# Change tracking is a scrape format option +result = fc.scrape("https://example.com", formats=["markdown", "changeTracking"]) +``` + +```python ScrapeGraph v2 (Python) +from scrapegraph_py import MonitorCreateRequest + +# Create a scheduled monitor with webhook +monitor = sgai.monitor.create(MonitorCreateRequest( + url="https://example.com", + interval="0 */6 * * *", # every 6 hours + name="Example monitor", + formats=[{"type": "markdown"}], + webhook_url="https://your-app.com/webhook", +)) + +# Later: check what changed +activity = sgai.monitor.activity(monitor.data.cron_id) +for tick in activity.data.ticks: + if tick.changed: + print(f"Change detected at {tick.created_at}") +``` + + + +**Key differences:** + +| Firecrawl | ScrapeGraph v2 | Notes | +|---|---|---| +| `changeTracking` format option | Dedicated `monitor` service | First-class scheduled monitoring | +| Manual re-scrape to detect changes | Cron-based automatic scheduling | Set-and-forget with cron expressions | +| `git-diff` / `json` diff modes | Per-format diffs in activity ticks | Automatic diff computation | +| No webhook on changes | `webhook_url` fires on detected changes | Built-in webhook support | +| — | `monitor.pause()` / `resume()` / `delete()` | Full lifecycle management | + +## Authentication + +| | Firecrawl | ScrapeGraph v2 | 
+|---|---|---| +| Header | `Authorization: Bearer fc-...` | `SGAI-APIKEY: sgai-...` | +| Env var | `FIRECRAWL_API_KEY` | `SGAI_API_KEY` | +| Base URL | `https://api.firecrawl.dev/v2` | `https://api.scrapegraphai.com/api/v2` | + +## SDK installation + +| | Firecrawl | ScrapeGraph v2 | +|---|---|---| +| Python | `pip install firecrawl-py` | `pip install scrapegraph-py` | +| Node.js | `npm i @mendable/firecrawl-js` | `npm i scrapegraph-js` | +| CLI | — | `npm i -g just-scrape` | +| MCP server | — | `pip install scrapegraph-mcp` | + +## Response format + +Firecrawl returns data directly. ScrapeGraph v2 wraps all responses in an `ApiResult` envelope: + +```json +{ + "status": "success", + "data": { ... }, + "error": null, + "elapsed_ms": 1234 +} +``` + +Always check `result.status === "success"` (JS) or `result.status == "success"` (Python) before accessing `result.data`. + +## Environment variables + +| Firecrawl | ScrapeGraph v2 | Purpose | +|---|---|---| +| `FIRECRAWL_API_KEY` | `SGAI_API_KEY` | API key | +| — | `SGAI_API_URL` | Override base URL | +| — | `SGAI_TIMEOUT` | Request timeout (seconds, default 120) | +| — | `SGAI_DEBUG` | Enable debug logging (`"1"`) | + +## Features unique to ScrapeGraph v2 + +These ScrapeGraph features have no direct Firecrawl equivalent: + +- **MCP Server** — connect any MCP-compatible AI client (Claude Desktop, Cursor, etc.) 
to all ScrapeGraph services via a single endpoint +- **CLI (`just-scrape`)** — full-featured command-line tool for all services, with `--json` mode for scripting +- **Claude Code Skill** — install as an AI coding agent skill for Claude Code, Cursor, Copilot, Cline, and Windsurf +- **Schema generation** — `generate_schema` endpoint creates JSON schemas from natural language prompts +- **Request history** — `history` endpoint with service-level filtering and pagination +- **Monitor activity polling** — paginated tick history with per-format diffs +- **Toonify** — convert photos to cartoon-style images + +## Migration checklist + + + +### Update dependencies + +```bash +# Remove Firecrawl +pip uninstall firecrawl-py # Python +npm uninstall @mendable/firecrawl-js # Node.js + +# Install ScrapeGraph +pip install scrapegraph-py # Python +npm install scrapegraph-js # Node.js +``` + +### Update environment variables + +```bash +# Replace +# FIRECRAWL_API_KEY=fc-... + +# With +SGAI_API_KEY=sgai-... +``` + +Get your API key from the [dashboard](https://scrapegraphai.com/dashboard). + +### Update imports and client initialization + +```python +# Before +from firecrawl import Firecrawl +fc = Firecrawl(api_key="fc-...") + +# After +from scrapegraph_py import ScrapeGraphAI +sgai = ScrapeGraphAI(api_key="sgai-...") +``` + +```javascript +// Before +import Firecrawl from "@mendable/firecrawl-js"; +const fc = new Firecrawl({ apiKey: "fc-..." }); + +// After +import { scrape, extract, search, crawl, monitor } from "scrapegraph-js"; +``` + +### Replace method calls + +Use the tables above to update each call. 
The main patterns: + +- `fc.scrape()` -> `sgai.scrape(ScrapeRequest(...))` +- `fc.extract()` -> `sgai.extract(ExtractRequest(...))` +- `fc.search()` -> `sgai.search(SearchRequest(...))` +- `fc.start_crawl()` -> `sgai.crawl.start(CrawlRequest(...))` +- Change tracking -> `sgai.monitor.create(MonitorCreateRequest(...))` + +### Handle the `ApiResult` wrapper + +Wrap all response handling with a status check: + +```python +result = sgai.extract(ExtractRequest(...)) +if result.status == "success": + data = result.data +else: + print(f"Error: {result.error}") +``` + +### Test and verify + +Run your existing test suite and compare outputs. ScrapeGraph returns equivalent data structures — the main difference is the `ApiResult` envelope. + + + +## Full SDK documentation + +- [Python SDK](/sdks/python) +- [JavaScript SDK](/sdks/javascript) +- [CLI (just-scrape)](/services/cli/introduction) +- [MCP Server](/services/mcp-server/introduction) +- [API Reference](/api-reference/introduction) From 8a95e52090e8e490505aef5967a69434494966d5 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Thu, 16 Apr 2026 09:21:53 +0200 Subject: [PATCH 2/3] docs: simplify Firecrawl transition guide to essentials Remove verbose endpoint mapping code examples, key-differences tables, environment variables, response format, and features-unique sections. Keep compact comparison table, auth, SDK install, and migration checklist. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- transition-from-firecrawl.mdx | 415 +--------------------------------- 1 file changed, 2 insertions(+), 413 deletions(-) diff --git a/transition-from-firecrawl.mdx b/transition-from-firecrawl.mdx index d0dc546..5f82296 100644 --- a/transition-from-firecrawl.mdx +++ b/transition-from-firecrawl.mdx @@ -15,382 +15,7 @@ ScrapeGraph v2 offers AI-powered scraping, extraction, search, crawling, and mon | Structured extraction | `POST /v2/extract` (LLM) | `POST /api/v2/extract` (LLM) | | Web search | `POST /v2/search` | `POST /api/v2/search` | | Crawl (multi-page) | `POST /v2/crawl` (async) | `POST /api/v2/crawl` (async) | -| URL discovery / sitemap | `POST /v2/map` | Crawl with patterns or REST sitemap endpoint | -| Batch scrape | `POST /v2/batch/scrape` | Loop over `scrape` or use `crawl` with URL list | -| Scheduled monitoring | Change tracking (format option) | `POST /api/v2/monitor` (first-class, cron-based) | -| Browser interaction | `/v2/scrape/{id}/interact` | `FetchConfig` with `mode: "js"`, `wait`, `scrolls` | -| Autonomous agent | `POST /v2/agent` | `extract` + `FetchConfig` (stealth, JS mode) | -| Schema generation | Manual JSON Schema / Pydantic / Zod | `generate_schema` endpoint (prompt-to-schema) | -| MCP server | Not available | Full MCP server (remote + local) | -| CLI | Not available | `just-scrape` CLI with all services | -| Credits / account | Via dashboard | `credits` endpoint + dashboard | -| Request history | Not available | `history` endpoint with filters | - -## Endpoint mapping - -### Scrape - -Firecrawl returns content in multiple formats via a `formats` array. ScrapeGraph v2 uses a similar approach with `formats` entries, each specifying a `type`. - - - -```bash Firecrawl (cURL) -curl -X POST https://api.firecrawl.dev/v2/scrape \ - -H "Authorization: Bearer fc-..." 
\ - -H "Content-Type: application/json" \ - -d '{ - "url": "https://example.com", - "formats": ["markdown", "html", "screenshot"] - }' -``` - -```bash ScrapeGraph v2 (cURL) -curl -X POST https://api.scrapegraphai.com/api/v2/scrape \ - -H "SGAI-APIKEY: sgai-..." \ - -H "Content-Type: application/json" \ - -d '{ - "url": "https://example.com", - "formats": [ - { "type": "markdown" }, - { "type": "html" }, - { "type": "screenshot" } - ] - }' -``` - - - - - -```python Firecrawl (Python) -from firecrawl import Firecrawl - -fc = Firecrawl(api_key="fc-...") -result = fc.scrape("https://example.com", formats=["markdown"]) -print(result["markdown"]) -``` - -```python ScrapeGraph v2 (Python) -from scrapegraph_py import ScrapeGraphAI, ScrapeRequest - -sgai = ScrapeGraphAI(api_key="sgai-...") -result = sgai.scrape(ScrapeRequest( - url="https://example.com", - formats=[{"type": "markdown"}], -)) -if result.status == "success": - print(result.data) -``` - - - - - -```javascript Firecrawl (Node.js) -import Firecrawl from "@mendable/firecrawl-js"; - -const fc = new Firecrawl({ apiKey: "fc-..." }); -const result = await fc.scrape("https://example.com", { - formats: ["markdown"], -}); -console.log(result.markdown); -``` - -```javascript ScrapeGraph v2 (Node.js) -import { scrape } from "scrapegraph-js"; - -const result = await scrape("sgai-...", { - url: "https://example.com", - formats: [{ type: "markdown" }], -}); -if (result.status === "success") { - console.log(result.data?.results.markdown?.data); -} -``` - - - -**Key differences:** - -| Firecrawl | ScrapeGraph v2 | Notes | -|---|---|---| -| `formats: ["markdown"]` | `formats: [{ type: "markdown" }]` | Object syntax allows per-format options (mode, quality, etc.) 
| -| `only_main_content: true` | `formats: [{ type: "markdown", mode: "reader" }]` | Reader mode strips nav/footer content | -| `actions: [...]` | `fetch_config: { wait: 2000, scrolls: 3 }` | Declarative fetch configuration | -| `location: { country: "US" }` | `fetch_config: { country: "us" }` | ISO code in fetch config | -| `proxy: "enhanced"` | `fetch_config: { stealth: true }` | Residential proxy + anti-bot | -| `maxAge` / cache control | Not applicable | ScrapeGraph always fetches fresh | - -### Extract (structured data) - -Both platforms use LLMs to extract structured data. Firecrawl embeds extraction in the scrape endpoint (via `json` format) or has a dedicated `/extract` endpoint. ScrapeGraph v2 has a dedicated `extract` endpoint. - - - -```python Firecrawl (Python) -result = fc.extract( - urls=["https://example.com"], - prompt="Extract the company name and founding year", - schema={ - "type": "object", - "properties": { - "company": {"type": "string"}, - "founded": {"type": "integer"} - } - }, -) -``` - -```python ScrapeGraph v2 (Python) -from scrapegraph_py import ExtractRequest - -result = sgai.extract(ExtractRequest( - url="https://example.com", - prompt="Extract the company name and founding year", - schema={ - "type": "object", - "properties": { - "company": {"type": "string"}, - "founded": {"type": "integer"} - } - }, -)) -if result.status == "success": - print(result.data) -``` - - - - - -```javascript Firecrawl (Node.js) -const result = await fc.extract({ - urls: ["https://example.com"], - prompt: "Extract the company name and founding year", - schema: { - type: "object", - properties: { - company: { type: "string" }, - founded: { type: "integer" }, - }, - }, -}); -``` - -```javascript ScrapeGraph v2 (Node.js) -import { extract } from "scrapegraph-js"; - -const result = await extract("sgai-...", { - url: "https://example.com", - prompt: "Extract the company name and founding year", - schema: { - type: "object", - properties: { - company: { type: 
"string" }, - founded: { type: "integer" }, - }, - }, -}); -if (result.status === "success") { - console.log(result.data?.json_data); -} -``` - - - -**Key differences:** - -| Firecrawl | ScrapeGraph v2 | Notes | -|---|---|---| -| `urls: [...]` (array, supports wildcards) | `url: "..."` (single URL) | One URL per request; batch via loop or crawl | -| `enableWebSearch: true` | Use `search` endpoint separately | Search is a separate service | -| `agent.model: "FIRE-1"` | `fetch_config: { mode: "js", stealth: true }` | JS rendering + anti-bot for hard pages | - -### Search - -Both offer web search with optional extraction from results. - - - -```python Firecrawl (Python) -result = fc.search( - "best web scraping APIs 2025", - limit=5, -) -``` - -```python ScrapeGraph v2 (Python) -from scrapegraph_py import SearchRequest - -result = sgai.search(SearchRequest( - query="best web scraping APIs 2025", - num_results=5, -)) -``` - - - - - -```javascript Firecrawl (Node.js) -const result = await fc.search("best web scraping APIs 2025", { - limit: 5, -}); -``` - -```javascript ScrapeGraph v2 (Node.js) -import { search } from "scrapegraph-js"; - -const result = await search("sgai-...", { - query: "best web scraping APIs 2025", - numResults: 5, -}); -``` - - - -**Key differences:** - -| Firecrawl | ScrapeGraph v2 | Notes | -|---|---|---| -| `limit` | `num_results` (Python) / `numResults` (JS) | Max 20 per request | -| `sources: ["web", "news", "images"]` | Web results only | Single source, focused results | -| `tbs: "qdr:d"` (time filter) | `time_range: "past_24_hours"` | Human-readable time range values | -| `scrapeOptions` on results | `prompt` + `schema` on request | Extraction built into the search call | -| `location: "US"` | `country: "us"` | ISO code parameter | - -### Crawl - -Both platforms support async multi-page crawling with job management. 
- - - -```python Firecrawl (Python) -job = fc.start_crawl( - "https://example.com", - limit=100, - max_discovery_depth=3, - include_paths=["/blog/*"], -) -status = fc.get_crawl_status(job["id"]) -``` - -```python ScrapeGraph v2 (Python) -from scrapegraph_py import CrawlRequest - -job = sgai.crawl.start(CrawlRequest( - url="https://example.com", - max_pages=100, - max_depth=3, - include_patterns=["/blog/*"], -)) -if job.status == "success": - status = sgai.crawl.get(job.data.id) -``` - - - - - -```javascript Firecrawl (Node.js) -const job = await fc.startCrawl("https://example.com", { - limit: 100, - maxDiscoveryDepth: 3, - includePaths: ["/blog/*"], -}); -const status = await fc.getCrawlStatus(job.id); -``` - -```javascript ScrapeGraph v2 (Node.js) -import { crawl } from "scrapegraph-js"; - -const job = await crawl.start("sgai-...", { - url: "https://example.com", - maxPages: 100, - maxDepth: 3, - includePatterns: ["/blog/*"], -}); -if (job.status === "success") { - const status = await crawl.get("sgai-...", job.data?.id); -} -``` - - - -**Crawl job management mapping:** - -| Firecrawl | ScrapeGraph v2 | -|---|---| -| `start_crawl()` / `startCrawl()` | `crawl.start()` | -| `get_crawl_status(id)` / `getCrawlStatus(id)` | `crawl.get(id)` | -| `cancel_crawl(id)` / `cancelCrawl(id)` | `crawl.stop(id)` + `crawl.delete(id)` | -| — | `crawl.resume(id)` (pause/resume support) | - -**Key differences:** - -| Firecrawl | ScrapeGraph v2 | Notes | -|---|---|---| -| `limit` (max pages) | `max_pages` (1-1000) | ScrapeGraph caps at 1000 | -| `maxDiscoveryDepth` | `max_depth` | Same concept, different name | -| `includePaths` / `excludePaths` (regex) | `include_patterns` / `exclude_patterns` (glob) | Glob patterns instead of regex | -| `webhook` with events | `webhook_url` on monitors | Webhooks on monitors, not crawl jobs | -| WebSocket watcher | Poll via `crawl.get()` | Polling-based status checks | -| `sitemap: "include"` | Handled automatically or via separate sitemap service | 
— | - -### Map (URL discovery) - -Firecrawl's `/map` endpoint discovers URLs quickly from sitemaps and cached data. ScrapeGraph doesn't have a direct equivalent — use crawl with a shallow depth or the sitemap service. - -```python ScrapeGraph v2 — discover URLs with a shallow crawl -job = sgai.crawl.start(CrawlRequest( - url="https://example.com", - max_depth=1, - max_pages=500, - formats=[{"type": "links"}], -)) -``` - -### Monitoring / change tracking - -Firecrawl uses `changeTracking` as a format option on scrape/crawl. ScrapeGraph v2 has a **dedicated monitor service** with cron scheduling, webhooks, and activity history. - - - -```python Firecrawl (Python) -# Change tracking is a scrape format option -result = fc.scrape("https://example.com", formats=["markdown", "changeTracking"]) -``` - -```python ScrapeGraph v2 (Python) -from scrapegraph_py import MonitorCreateRequest - -# Create a scheduled monitor with webhook -monitor = sgai.monitor.create(MonitorCreateRequest( - url="https://example.com", - interval="0 */6 * * *", # every 6 hours - name="Example monitor", - formats=[{"type": "markdown"}], - webhook_url="https://your-app.com/webhook", -)) - -# Later: check what changed -activity = sgai.monitor.activity(monitor.data.cron_id) -for tick in activity.data.ticks: - if tick.changed: - print(f"Change detected at {tick.created_at}") -``` - - - -**Key differences:** - -| Firecrawl | ScrapeGraph v2 | Notes | -|---|---|---| -| `changeTracking` format option | Dedicated `monitor` service | First-class scheduled monitoring | -| Manual re-scrape to detect changes | Cron-based automatic scheduling | Set-and-forget with cron expressions | -| `git-diff` / `json` diff modes | Per-format diffs in activity ticks | Automatic diff computation | -| No webhook on changes | `webhook_url` fires on detected changes | Built-in webhook support | -| — | `monitor.pause()` / `resume()` / `delete()` | Full lifecycle management | +| Monitored changes | Change tracking (format option) | 
`POST /api/v2/monitor` (first-class, cron-based) | ## Authentication @@ -409,42 +34,6 @@ for tick in activity.data.ticks: | CLI | — | `npm i -g just-scrape` | | MCP server | — | `pip install scrapegraph-mcp` | -## Response format - -Firecrawl returns data directly. ScrapeGraph v2 wraps all responses in an `ApiResult` envelope: - -```json -{ - "status": "success", - "data": { ... }, - "error": null, - "elapsed_ms": 1234 -} -``` - -Always check `result.status === "success"` (JS) or `result.status == "success"` (Python) before accessing `result.data`. - -## Environment variables - -| Firecrawl | ScrapeGraph v2 | Purpose | -|---|---|---| -| `FIRECRAWL_API_KEY` | `SGAI_API_KEY` | API key | -| — | `SGAI_API_URL` | Override base URL | -| — | `SGAI_TIMEOUT` | Request timeout (seconds, default 120) | -| — | `SGAI_DEBUG` | Enable debug logging (`"1"`) | - -## Features unique to ScrapeGraph v2 - -These ScrapeGraph features have no direct Firecrawl equivalent: - -- **MCP Server** — connect any MCP-compatible AI client (Claude Desktop, Cursor, etc.) to all ScrapeGraph services via a single endpoint -- **CLI (`just-scrape`)** — full-featured command-line tool for all services, with `--json` mode for scripting -- **Claude Code Skill** — install as an AI coding agent skill for Claude Code, Cursor, Copilot, Cline, and Windsurf -- **Schema generation** — `generate_schema` endpoint creates JSON schemas from natural language prompts -- **Request history** — `history` endpoint with service-level filtering and pagination -- **Monitor activity polling** — paginated tick history with per-format diffs -- **Toonify** — convert photos to cartoon-style images - ## Migration checklist @@ -506,7 +95,7 @@ Use the endpoint mapping tables above to update each call. The main patterns: ### Handle the `ApiResult` wrapper -Wrap all response handling with a status check: +ScrapeGraph v2 wraps all responses in an `ApiResult` envelope. 
Always check status before accessing data: ```python result = sgai.extract(ExtractRequest(...)) From deb67812af5eb15175c023bde2e41d6354771921 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sun, 19 Apr 2026 10:23:34 +0200 Subject: [PATCH 3/3] Align Firecrawl transition guide with SDK idioms Match canonical SDK init patterns (env-var based), use ScrapeGraphAI class in JS example, and clarify that the ApiResult envelope is applied by the SDKs client-side, not by the raw HTTP API. Co-Authored-By: Claude Opus 4.7 (1M context) --- transition-from-firecrawl.mdx | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/transition-from-firecrawl.mdx b/transition-from-firecrawl.mdx index 5f82296..66957f0 100644 --- a/transition-from-firecrawl.mdx +++ b/transition-from-firecrawl.mdx @@ -71,7 +71,8 @@ fc = Firecrawl(api_key="fc-...") # After from scrapegraph_py import ScrapeGraphAI -sgai = ScrapeGraphAI(api_key="sgai-...") +# reads SGAI_API_KEY from env, or pass explicitly: ScrapeGraphAI(api_key="...") +sgai = ScrapeGraphAI() ``` ```javascript @@ -80,7 +81,9 @@ import Firecrawl from "@mendable/firecrawl-js"; const fc = new Firecrawl({ apiKey: "fc-..." }); // After -import { scrape, extract, search, crawl, monitor } from "scrapegraph-js"; +import { ScrapeGraphAI } from "scrapegraph-js"; +// reads SGAI_API_KEY from env, or pass explicitly: ScrapeGraphAI({ apiKey: "..." }) +const sgai = ScrapeGraphAI(); ``` ### Replace method calls @@ -95,19 +98,30 @@ Use the endpoint mapping tables above to update each call. The main patterns: ### Handle the `ApiResult` wrapper -ScrapeGraph v2 wraps all responses in an `ApiResult` envelope. Always check status before accessing data: +The ScrapeGraph Python and JS SDKs wrap every response in an `ApiResult` — no exceptions to catch. 
Check `status` before reading `data`: ```python -result = sgai.extract(ExtractRequest(...)) +result = sgai.extract(ExtractRequest(url="https://example.com", prompt="...")) if result.status == "success": data = result.data else: print(f"Error: {result.error}") ``` +```javascript +const result = await sgai.scrape({ url: "https://example.com", formats: [{ type: "markdown" }] }); +if (result.status === "success") { + console.log(result.data?.results.markdown?.data); +} else { + console.error(result.error); +} +``` + +Direct HTTP callers (curl, fetch) receive the unwrapped response body — the envelope is applied client-side by the SDKs. + ### Test and verify -Run your existing test suite and compare outputs. ScrapeGraph returns equivalent data structures — the main difference is the `ApiResult` envelope. +Run your existing test suite and compare outputs. ScrapeGraph returns equivalent data, but field paths differ (for example, `result.data?.results.markdown?.data` instead of Firecrawl's `result.markdown`), and the SDKs wrap every response in the `ApiResult` envelope.