# FILE: ai-service/core/document_parser.py

import io

import fitz  # PyMuPDF library
import requests


def parse_pdf_from_url(pdf_url: str) -> str:
    """Download a PDF from a URL and return all of its text as one string.

    The file is fetched with a 30-second timeout, opened from memory with
    PyMuPDF, and the plain text of every page is concatenated in page order.
    Each page's text is followed by a blank-line separator ("\\n\\n").

    Args:
        pdf_url: Address of the PDF document to download.

    Returns:
        The extracted text of all pages, each followed by two newlines.

    Raises:
        ConnectionError: If the download fails (network error, timeout, or a
            non-success HTTP status). The underlying ``requests`` exception
            is chained as the cause.
        ValueError: If any other error occurs while fetching or parsing,
            e.g. the payload is not a valid PDF. The original exception is
            chained as the cause.
    """
    print(" - 📑 Downloading and parsing PDF from URL...")

    try:
        # Step 1: Download the PDF content from the URL.
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions.

        # Step 2: Open the PDF from memory. The context manager guarantees
        # the document is closed even if text extraction fails part-way.
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            # Step 3: Extract the text of every page, in order. Joining once
            # avoids repeated string re-allocation on large documents.
            full_text = "".join(
                page.get_text("text") + "\n\n" for page in doc
            )
    except requests.exceptions.RequestException as e:
        print(f" - ❌ FAILED to download PDF: {e}")
        raise ConnectionError(
            f"Could not download the file from the provided URL: {pdf_url}"
        ) from e
    except Exception as e:
        # Boundary handler: anything that is not a transport error is
        # reported to callers as an invalid-PDF problem.
        print(f" - ❌ FAILED to parse PDF: {e}")
        raise ValueError(
            "The provided file could not be parsed as a valid PDF."
        ) from e

    print(f" - ✅ PDF parsed successfully. Total characters: {len(full_text)}")
    return full_text