diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py index b42c7b1f6b..29b34eb06f 100644 --- a/src/fetch/src/mcp_server_fetch/server.py +++ b/src/fetch/src/mcp_server_fetch/server.py @@ -3,6 +3,7 @@ import markdownify import readabilipy.simple_json +from bs4 import BeautifulSoup from mcp.shared.exceptions import McpError from mcp.server import Server from mcp.server.stdio import stdio_server @@ -27,22 +28,59 @@ def extract_content_from_html(html: str) -> str: """Extract and convert HTML content to Markdown format. + Uses Mozilla Readability via readabilipy as the primary extraction method. + Falls back to readabilipy without Readability (less aggressive filtering) + or direct markdownify conversion when Readability returns empty content, + which commonly happens with progressive SSR sites that deliver content in + hidden containers awaiting client-side hydration. + Args: html: Raw HTML content to process Returns: Simplified markdown version of the content """ + # Stage 1: Try Readability (best quality for standard pages) ret = readabilipy.simple_json.simple_json_from_html_string( html, use_readability=True ) - if not ret["content"]: - return "Page failed to be simplified from HTML" + content_html = ret.get("content", "") + if content_html: + content = markdownify.markdownify( + content_html, + heading_style=markdownify.ATX, + ) + if content.strip(): + return content + + # Stage 2: Try readabilipy without Readability JS (less aggressive, + # does not filter by CSS visibility) + ret = readabilipy.simple_json.simple_json_from_html_string( + html, use_readability=False + ) + content_html = ret.get("content", "") + if content_html: + content = markdownify.markdownify( + content_html, + heading_style=markdownify.ATX, + ) + if content.strip(): + return content + + # Stage 3: Convert full HTML directly with markdownify (last resort). + # Strip