import gradio as gr
import requests
from bs4 import BeautifulSoup
from readability import Document
from urllib.parse import urljoin, urldefrag, urlparse
import re


def _http_get(url: str) -> requests.Response:
    """
    Make an HTTP GET request with headers and a timeout.
    Layman's terms: downloads the webpage safely and politely.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; NymboFetcher/1.0; +https://example.com)",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    return requests.get(url, headers=headers, timeout=15)
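# Illustrative usage sketch (placeholder URL), mirroring how extract_relevant()
# calls this helper further below:
#
#   resp = _http_get("https://example.com")
#   resp.raise_for_status()
#   print(resp.status_code, resp.headers.get("Content-Type"))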


def _normalize_whitespace(text: str) -> str:
    """
    Layman's terms: squash weird spacing and too many blank lines.
    """
    text = re.sub(r"[ \t\u00A0]+", " ", text)
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text.strip())
    return text.strip()
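# Illustrative sketch of the normalization above (values checked by hand):
#
#   >>> _normalize_whitespace("Hello   world\n\n\n\nBye")
#   'Hello world\n\nBye'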


def _truncate(text: str, max_chars: int) -> tuple[str, bool]:
    """
    Layman's terms: cut the text if it’s too long and tell the caller if we cut it.
    """
    if max_chars is None or max_chars <= 0 or len(text) <= max_chars:
        return text, False
    return text[:max_chars].rstrip() + " …", True
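# Illustrative sketch (values checked by hand): the boolean reports whether
# anything was dropped; a non-positive cap means "no limit".
#
#   >>> _truncate("abcdef", 4)
#   ('abcd …', True)
#   >>> _truncate("abc", 0)
#   ('abc', False)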


def _domain_of(url: str) -> str:
    """
    Layman's terms: show a friendly domain like example.com.
    """
    try:
        return urlparse(url).netloc or ""
    except Exception:
        return ""
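# Illustrative sketch: _domain_of("https://example.com/a?b=1") would return "example.com".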


def _extract_metadata(soup: BeautifulSoup, final_url: str) -> dict:
    """
    Layman's terms: grab useful fields like title, description, site name, and canonical link.
    """
    meta = {}

    # Title: prefer <title>, then Open Graph, then the Twitter card.
    title_candidates = [
        (soup.title.string if soup.title and soup.title.string else None),
        _og(soup, "og:title"),
        _meta(soup, "twitter:title"),
    ]
    meta["title"] = next((t.strip() for t in title_candidates if t and t.strip()), "")

    # Description: prefer the standard meta tag, then Open Graph, then the Twitter card.
    desc_candidates = [
        _meta(soup, "description"),
        _og(soup, "og:description"),
        _meta(soup, "twitter:description"),
    ]
    meta["description"] = next((d.strip() for d in desc_candidates if d and d.strip()), "")

    # Canonical URL, if the page declares one.
    link_canonical = soup.find("link", rel=lambda v: v and "canonical" in v)
    meta["canonical"] = (link_canonical.get("href") or "").strip() if link_canonical else ""

    # Site name from Open Graph, if present.
    meta["site_name"] = (_og(soup, "og:site_name") or "").strip()

    # Declared page language, e.g. <html lang="en">.
    html_tag = soup.find("html")
    meta["lang"] = (html_tag.get("lang") or "").strip() if html_tag else ""

    # Where the request actually ended up, plus its domain.
    meta["fetched_url"] = final_url
    meta["domain"] = _domain_of(final_url)

    return meta
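# Illustrative sketch (hypothetical values) of the dict shape this returns:
#
#   {
#       "title": "Example Domain",
#       "description": "A page used in examples.",
#       "canonical": "https://example.com/",
#       "site_name": "Example",
#       "lang": "en",
#       "fetched_url": "https://example.com/",
#       "domain": "example.com",
#   }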


def _meta(soup: BeautifulSoup, name: str) -> str | None:
    tag = soup.find("meta", attrs={"name": name})
    return tag.get("content") if tag and tag.has_attr("content") else None


def _og(soup: BeautifulSoup, prop: str) -> str | None:
    tag = soup.find("meta", attrs={"property": prop})
    return tag.get("content") if tag and tag.has_attr("content") else None
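# Illustrative sketch (made-up markup) of how the two meta helpers differ:
#
#   s = BeautifulSoup('<meta name="description" content="Hi">'
#                     '<meta property="og:title" content="Yo">', "lxml")
#   _meta(s, "description")  # -> "Hi"
#   _og(s, "og:title")       # -> "Yo"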


def _extract_main_text(html: str) -> tuple[str, BeautifulSoup]:
    """
    Layman's terms: use Readability to find the article body, then clean it to plain text.
    Returns (clean_text, soup_of_readable_html) for link scraping.
    """
    doc = Document(html)
    readable_html = doc.summary(html_partial=True)

    s = BeautifulSoup(readable_html, "lxml")

    # Remove elements that never contribute readable text.
    for sel in ["script", "style", "noscript", "iframe", "svg"]:
        for tag in s.select(sel):
            tag.decompose()

    # Collect text block by block so paragraphs stay separated.
    text_parts = []
    for p in s.find_all(["p", "li", "h2", "h3", "h4", "blockquote"]):
        chunk = p.get_text(" ", strip=True)
        if chunk:
            text_parts.append(chunk)

    clean_text = _normalize_whitespace("\n\n".join(text_parts))
    return clean_text, s
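# Illustrative sketch (made-up markup): the helper returns plain text plus the
# Readability-reduced soup, which _extract_links() reuses below.
#
#   html = "<html><body><article><p>First.</p><p>Second.</p></article></body></html>"
#   text, soup = _extract_main_text(html)
#   # text would be roughly "First.\n\nSecond.", Readability heuristics permitting.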


def _extract_links(readable_soup: BeautifulSoup, base_url: str, max_links: int) -> list[tuple[str, str]]:
    """
    Layman's terms: pull out clickable links from the article content only,
    turn them into absolute URLs, drop junk, dedupe, and cap the list.
    """
    # Treat a non-positive cap as "no links at all" (the caller passes 0 to disable links).
    if max_links <= 0:
        return []

    seen = set()
    links: list[tuple[str, str]] = []

    for a in readable_soup.find_all("a", href=True):
        href = a.get("href").strip()
        # Skip fragments, mailto:, and javascript: pseudo-links.
        if not href or href.startswith("#") or href.startswith("mailto:") or href.startswith("javascript:"):
            continue

        # Resolve relative URLs and drop any #fragment.
        absolute = urljoin(base_url, href)
        absolute, _ = urldefrag(absolute)

        if absolute in seen:
            continue
        seen.add(absolute)

        text = a.get_text(" ", strip=True)
        # Keep anchor text short so the Markdown list stays readable.
        if len(text) > 120:
            text = text[:117] + "…"

        links.append((text or absolute, absolute))

        if len(links) >= max_links:
            break

    return links
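# Illustrative sketch (made-up markup): relative links resolve against the base
# URL and fragment-only duplicates collapse.
#
#   s = BeautifulSoup('<p><a href="/a">A</a> <a href="/a#x">A again</a></p>', "lxml")
#   _extract_links(s, "https://example.com/post", max_links=10)
#   # -> [("A", "https://example.com/a")]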


def _format_markdown(meta: dict, body: str, body_truncated: bool, links: list[tuple[str, str]],
                     include_text: bool, include_metadata: bool, include_links: bool, verbosity: str) -> str:
    """
    Layman's terms: turn the pieces into a neat, compact Markdown string.
    """
    lines = []

    # Title, falling back to the domain if the page has no usable title.
    title = meta.get("title") or meta.get("domain") or "Untitled"
    lines.append(f"# {title}")

    # Optional metadata block.
    if include_metadata:
        md = []
        if meta.get("description"):
            md.append(f"- **Description:** {meta['description']}")
        if meta.get("site_name"):
            md.append(f"- **Site:** {meta['site_name']}")
        if meta.get("canonical"):
            md.append(f"- **Canonical:** {meta['canonical']}")
        if meta.get("lang"):
            md.append(f"- **Language:** {meta['lang']}")
        if meta.get("fetched_url"):
            md.append(f"- **Fetched From:** {meta['fetched_url']}")

        if md:
            lines.append("## Metadata")
            lines.extend(md)

    # Optional body text; the "Brief" preset trims it further.
    if include_text and body:
        if verbosity == "Brief":
            brief, was_more = _truncate(body, 800)
            lines.append("## Text")
            lines.append(brief)
            if was_more or body_truncated:
                lines.append("\n> (Trimmed for brevity)")
        else:
            lines.append("## Text")
            lines.append(body)
            if body_truncated:
                lines.append("\n> (Trimmed for brevity)")

    # Optional links list.
    if include_links and links:
        lines.append(f"## Links ({len(links)})")
        for text, url in links:
            lines.append(f"- [{text}]({url})")

    return "\n\n".join(lines).strip()
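# Illustrative sketch: every appended piece is joined with a blank line, so the
# output reads roughly like
#
#   # Example Domain
#
#   ## Metadata
#
#   - **Description:** A page used in examples.
#
#   ## Text
#
#   ...body text...
#
#   ## Links (1)
#
#   - [Docs](https://example.com/docs)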


def extract_relevant(
    url: str,
    verbosity: str = "Standard",
    include_metadata: bool = True,
    include_text: bool = True,
    include_links: bool = True,
    max_chars: int = 3000,
    max_links: int = 20
) -> str:
    """
    Given a URL, fetch the page, extract just the good stuff, and return a compact Markdown summary.
    """
    if not url or not url.strip():
        return "Please enter a valid URL."

    try:
        resp = _http_get(url)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"An error occurred: {e}"

    # Redirects may land somewhere else; keep the final URL for metadata and links.
    final_url = str(resp.url)

    # Only HTML pages are worth parsing here.
    ctype = resp.headers.get("Content-Type", "")
    if "html" not in ctype.lower():
        return f"Unsupported content type for extraction: {ctype or 'unknown'}"

    # Use the detected (apparent) encoding if requests didn't pick one.
    resp.encoding = resp.encoding or resp.apparent_encoding
    html = resp.text

    # Metadata comes from the full page; body text comes from the Readability view.
    full_soup = BeautifulSoup(html, "lxml")
    meta = _extract_metadata(full_soup, final_url)

    body_text, readable_soup = _extract_main_text(html)

    # If Readability found nothing, fall back to the whole page's text.
    if not body_text:
        fallback_text = full_soup.get_text(" ", strip=True)
        body_text = _normalize_whitespace(fallback_text)

    # Verbosity presets set an upper bound; the Max Characters slider can only tighten it.
    preset_caps = {"Brief": 1200, "Standard": 3000, "Full": 999999}
    target_cap = preset_caps.get(verbosity, 3000)
    cap = min(max_chars if max_chars > 0 else target_cap, target_cap)
    body_text, truncated = _truncate(body_text, cap) if include_text else ("", False)

    links = _extract_links(readable_soup, final_url, max_links=max_links if include_links else 0)

    md = _format_markdown(
        meta=meta,
        body=body_text,
        body_truncated=truncated,
        links=links,
        include_text=include_text,
        include_metadata=include_metadata,
        include_links=include_links,
        verbosity=verbosity
    )
    return md or "No content could be extracted."
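# Illustrative usage sketch (placeholder URL): the same function backs the UI
# below and the MCP tool, so it can also be called directly.
#
#   md = extract_relevant("https://example.com/article", verbosity="Brief", max_links=5)
#   print(md)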


with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    gr.Markdown("# Fetch MCP — Clean Extract")
    gr.Markdown(
        "Extract **title**, **metadata**, **clean text**, and **links** — without the noisy HTML. "
        "Use Verbosity and caps to keep it tight."
    )

    with gr.Row():
        url_in = gr.Textbox(label="URL", placeholder="https://example.com/article")
        fetch_btn = gr.Button("Fetch Clean Content")

    with gr.Accordion("Options", open=False):
        with gr.Row():
            verbosity = gr.Dropdown(
                label="Verbosity",
                choices=["Brief", "Standard", "Full"],
                value="Standard",
                info="Controls how much text you get back."
            )
            max_chars = gr.Slider(
                400, 12000, value=3000, step=100,
                label="Max Characters (body text)",
                info="Hard cap for body text. Lower = less verbose."
            )
            max_links = gr.Slider(
                0, 100, value=20, step=1,
                label="Max Links",
                info="Limit how many hyperlinks we include."
            )
        with gr.Row():
            include_metadata = gr.Checkbox(value=True, label="Include Metadata")
            include_text = gr.Checkbox(value=True, label="Include Main Text")
            include_links = gr.Checkbox(value=True, label="Include Links")

    out = gr.Markdown(label="Result")

    fetch_btn.click(
        fn=extract_relevant,
        inputs=[url_in, verbosity, include_metadata, include_text, include_links, max_chars, max_links],
        outputs=out
    )


if __name__ == "__main__":
    # mcp_server=True also exposes extract_relevant as an MCP tool alongside the web UI.
    demo.launch(mcp_server=True)