Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 42 additions & 4 deletions src/fetch/src/mcp_server_fetch/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import markdownify
import readabilipy.simple_json
from bs4 import BeautifulSoup
from mcp.shared.exceptions import McpError
from mcp.server import Server
from mcp.server.stdio import stdio_server
Expand All @@ -27,22 +28,59 @@
def extract_content_from_html(html: str) -> str:
    """Extract and convert HTML content to Markdown format.

    Extraction runs in up to three stages and stops at the first stage that
    produces non-blank Markdown:

    1. readabilipy with Mozilla Readability (``use_readability=True``).
    2. readabilipy without Readability (``use_readability=False``), a less
       aggressive pass used when Readability returns empty content, which
       commonly happens with progressive SSR sites that deliver content in
       hidden containers awaiting client-side hydration.
    3. Direct markdownify conversion of the whole document after removing
       ``<script>`` and ``<style>`` elements.

    Args:
        html: Raw HTML content to process

    Returns:
        Simplified markdown version of the content, or the string
        ``<error>Page failed to be simplified from HTML</error>`` when every
        stage yields blank output.
    """
    # Stages 1 and 2 differ only in the use_readability flag, so they share
    # one loop; Readability is tried first and the plain pass second.
    for use_readability in (True, False):
        ret = readabilipy.simple_json.simple_json_from_html_string(
            html, use_readability=use_readability
        )
        content = _html_fragment_to_markdown(ret.get("content", ""))
        if content.strip():
            return content

    # Stage 3: convert the full HTML directly with markdownify (last resort).
    # Strip <script> and <style> first so their JS/CSS text does not end up
    # in the Markdown output.
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style"]):
        tag.decompose()
    content = _html_fragment_to_markdown(str(soup))
    if content.strip():
        return content

    return "<error>Page failed to be simplified from HTML</error>"


def _html_fragment_to_markdown(fragment) -> str:
    """Convert an HTML fragment to ATX-style Markdown; falsy input gives ""."""
    if not fragment:
        # Covers both a missing/None "content" value and an empty string.
        return ""
    return markdownify.markdownify(
        fragment,
        heading_style=markdownify.ATX,
    )


def get_robots_txt_url(url: str) -> str:
Expand Down
86 changes: 86 additions & 0 deletions src/fetch/tests/test_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,3 +324,89 @@ async def test_fetch_with_proxy(self):

# Verify AsyncClient was called with proxy
mock_client_class.assert_called_once_with(proxy="http://proxy.example.com:8080")



class TestExtractContentFallback:
    """Tests for the fallback extraction in extract_content_from_html.

    Every test patches ``readabilipy.simple_json.simple_json_from_html_string``
    so the stage that supplies the content is controlled by the test itself.
    """

    def test_readability_sufficient_content_no_fallback(self):
        """When Readability returns non-empty content, no fallback is triggered."""
        html = "<html><body>" + "<p>word </p>" * 200 + "</body></html>"
        readability_content = "<div>" + "<p>word </p>" * 200 + "</div>"

        with patch("readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
            mock_readability.return_value = {"content": readability_content}
            result = extract_content_from_html(html)
            # Should only be called once (Readability path succeeds)
            assert mock_readability.call_count == 1
            assert "word" in result

    def test_readability_strips_content_falls_back_to_no_readability(self):
        """When Readability returns empty content, falls back to non-Readability extraction."""
        # Simulate an SSR page where Readability strips all hidden containers, returning empty
        html = "<html><body>" + "<p>content </p>" * 500 + "</body></html>"

        def mock_simple_json(h, use_readability=True):
            if use_readability:
                # Readability stripped everything, returns empty string
                return {"content": ""}
            # Without Readability, returns full content
            return {"content": "<div>" + "<p>content </p>" * 500 + "</div>"}

        with patch("readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
            result = extract_content_from_html(html)
            assert "content" in result
            assert len(result.strip()) > 100

    def test_both_readability_modes_fail_falls_back_to_markdownify(self):
        """When both readabilipy modes return empty content, falls back to raw markdownify."""
        html = "<html><body>" + "<p>important data </p>" * 300 + "</body></html>"

        with patch("readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
            # Both modes return empty content
            mock_readability.return_value = {"content": ""}
            result = extract_content_from_html(html)
            # Should fall through to markdownify on raw HTML
            assert "important data" in result
            assert mock_readability.call_count == 2  # called for both modes

    def test_completely_empty_html_returns_error(self):
        """Completely empty HTML returns error message."""
        with patch("readabilipy.simple_json.simple_json_from_html_string") as mock_readability:
            mock_readability.return_value = {"content": ""}
            result = extract_content_from_html("")
            assert "<error>" in result

    def test_readability_none_content_triggers_fallback(self):
        """When Readability returns None content, fallback is triggered."""
        html = "<html><body>" + "<p>real content </p>" * 200 + "</body></html>"

        def mock_simple_json(h, use_readability=True):
            # Key the response on the requested mode rather than on call
            # order, so the mock states which stage fails.
            if use_readability:
                return {"content": None}  # Readability returns None
            return {"content": "<div>" + "<p>real content </p>" * 200 + "</div>"}

        with patch("readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json):
            result = extract_content_from_html(html)
            assert "real content" in result

    def test_small_readability_output_accepted(self):
        """Non-empty Readability output is accepted regardless of size ratio."""
        padding = "x" * 9000
        html = f"<html><body><div style=\"visibility:hidden\">{padding}</div><p>tiny</p></body></html>"

        def mock_simple_json(h, use_readability=True):
            if use_readability:
                return {"content": "<p>tiny</p>"}
            return {"content": f"<div>{padding}</div><p>tiny</p>"}

        with patch(
            "readabilipy.simple_json.simple_json_from_html_string",
            side_effect=mock_simple_json,
        ) as mock_readability:
            result = extract_content_from_html(html)
            # Readability returned non-empty content, so it should be used directly
            assert "tiny" in result
            # "tiny" also appears in the fallback output, so additionally
            # check that the fallback was never consulted.
            assert padding not in result
            assert mock_readability.call_count == 1
Loading