Spaces:

Aedelon
/

GAIA_Agent

Runtime error

App Files Files Community

GAIA_Agent / agents /video_analyzer_agent.py

Aedelon

agent enhancement (#3)

b8f6b7f verified 8 months ago

raw

history blame

12.8 kB

	from __future__ import annotations

	import logging
	import os
	import re
	import shutil
	from pathlib import Path
	from typing import Optional

	import cv2
	import yt_dlp
	from llama_index.core.agent.workflow import FunctionAgent
	from llama_index.core.base.llms.types import TextBlock, ImageBlock, ChatMessage
	from llama_index.core.tools import FunctionTool
	from llama_index.llms.google_genai import GoogleGenAI
	from tqdm import tqdm
	from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound

	# ---------------------------------------------------------------------------
	# Environment setup & logging
	# ---------------------------------------------------------------------------
	logger = logging.getLogger(__name__)


	# ---------------------------------------------------------------------------
	# Prompt loader
	# ---------------------------------------------------------------------------

	def load_prompt_from_file(filename: str = "../prompts/video_analyzer_prompt.txt") -> str:
	    """Return the video-analysis system prompt stored in *filename*.

	    *filename* is resolved relative to the directory containing this module.
	    Any failure to read the file is logged and a short built-in prompt is
	    returned instead, so this function never raises for I/O problems.
	    """
	    # Deliberately tiny fallback so that a missing prompt file costs few tokens.
	    fallback_prompt = (
	        "You are a video analyzer. Provide a factual, chronological "
	        "description of the video, identify key events, and summarise insights."
	    )

	    target = (Path(__file__).parent / filename).resolve()

	    try:
	        text = target.read_text(encoding="utf-8")
	        logger.info("Successfully loaded system prompt from %s", target)
	        return text
	    except FileNotFoundError:
	        logger.error(
	            "Prompt file %s not found. Using fallback prompt.", target
	        )
	    except Exception as err:  # pylint: disable=broad-except
	        logger.error(
	            "Error loading prompt file %s: %s", target, err, exc_info=True
	        )

	    return fallback_prompt


	def extract_frames(video_path, output_dir, fps=1/2):
	    """Sample frames from a video file and save them as JPEG images.

	    Args:
	        video_path: Path of the video file handed to ``cv2.VideoCapture``.
	        output_dir: Directory that receives the JPEG files; created if missing.
	        fps: Target sampling rate in frames per second. The default of ``1/2``
	            keeps roughly one frame every two seconds.

	    Returns:
	        A ``(frames, duration)`` pair. ``frames`` is a list of
	        ``(frame_path, timestamp_seconds)`` tuples for every frame that was
	        actually written to disk; ``duration`` is the video length in seconds
	        computed from the reported frame count and frame rate. If the video
	        cannot be opened or reports no usable frame rate, ``([], None)`` is
	        returned.
	    """
	    os.makedirs(output_dir, exist_ok=True)

	    cap = cv2.VideoCapture(video_path)
	    try:
	        if not cap.isOpened():
	            logger.error("Error: Could not open video %s", video_path)
	            return [], None

	        video_fps = cap.get(cv2.CAP_PROP_FPS)
	        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

	        # Some containers report 0 (or NaN) FPS; without a frame rate neither
	        # the duration nor the timestamps can be computed.
	        if not video_fps > 0:
	            logger.error(
	                "Error: Video %s reports an invalid frame rate (%s)",
	                video_path,
	                video_fps,
	            )
	            return [], None

	        duration = frame_count / video_fps

	        # Keep every `interval`-th frame; never less than every frame.
	        interval = max(1, int(video_fps / fps))

	        frames = []
	        frame_idx = 0

	        # Streams may report a non-positive frame count; let tqdm run without a total then.
	        total = frame_count if frame_count > 0 else None
	        with tqdm(total=total, desc="Extracting frames") as pbar:
	            while cap.isOpened():
	                ret, frame = cap.read()
	                if not ret:
	                    break

	                if frame_idx % interval == 0:
	                    timestamp = frame_idx / video_fps
	                    frame_path = os.path.join(output_dir, f"frame_{frame_idx:06d}.jpg")
	                    # imwrite signals failure through its return value, not an exception.
	                    if cv2.imwrite(frame_path, frame):
	                        frames.append((frame_path, timestamp))
	                    else:
	                        logger.warning("Could not write frame to %s", frame_path)

	                frame_idx += 1
	                pbar.update(1)

	        return frames, duration
	    finally:
	        # Release the capture handle on every exit path, including errors.
	        cap.release()


	def download_video_and_analyze(video_url: str) -> str:
	    """Download a video, sample frames from it and return Gemini's analysis.

	    The video is fetched with yt-dlp into ``downloaded_videos/``, frames are
	    sampled with ``extract_frames`` into ``frame_downloaded_videos/``, and the
	    system prompt plus all sampled frames are sent to the Gemini model named by
	    ``VIDEO_ANALYZER_LLM_MODEL`` (API key from ``GEMINI_API_KEY``).

	    Args:
	        video_url: URL of the video to download.

	    Returns:
	        The text content of the model's reply, or the message
	        ``"No frames extracted from <url>"`` when no frame could be read.

	    Note:
	        The working directories and file names are fixed, so concurrent calls
	        would interfere with each other.
	    """
	    llm_model_name = os.getenv("VIDEO_ANALYZER_LLM_MODEL", "models/gemini-1.5-pro")
	    gemini_api_key = os.getenv("GEMINI_API_KEY")

	    download_dir = Path("downloaded_videos")
	    temp_dir = "frame_downloaded_videos"

	    def _downloaded_files():
	        """List the temp_video.* files currently present in the download directory."""
	        return sorted(p for p in download_dir.glob("temp_video.*") if p.is_file())

	    def _purge_downloads():
	        """Delete every temp_video.* file, tolerating files that are already gone."""
	        for leftover in _downloaded_files():
	            try:
	                leftover.unlink()
	            except OSError as exc:
	                logger.warning("Could not remove %s: %s", leftover, exc)

	    ydl_opts = {
	        'format': 'best',
	        'outtmpl': os.path.join("downloaded_videos", 'temp_video.%(ext)s'),
	    }

	    # A leftover file from an interrupted run could otherwise be picked up
	    # in place of the video requested now.
	    _purge_downloads()

	    try:
	        with yt_dlp.YoutubeDL(ydl_opts) as ydl_download:
	            ydl_download.download([video_url])

	        logger.info("Processing video: %s", video_url)

	        # The container extension depends on the source ('%(ext)s'), so locate
	        # the file that was actually written instead of assuming ".mp4".
	        candidates = [
	            p for p in _downloaded_files() if p.suffix not in {".part", ".ytdl"}
	        ]
	        frames = []
	        if candidates:
	            frames, _duration = extract_frames(str(candidates[0]), temp_dir)
	        if not frames:
	            logger.info("No frames extracted from %s", video_url)
	            return f"No frames extracted from {video_url}"

	        # One text block carrying the prompt, followed by one image block per frame.
	        blocks = [TextBlock(text=load_prompt_from_file())]
	        for frame_path, _timestamp in tqdm(frames, desc="Collecting frames"):
	            blocks.append(ImageBlock(path=frame_path))

	        llm = GoogleGenAI(api_key=gemini_api_key, model=llm_model_name)
	        logger.info("Using LLM model: %s", llm_model_name)
	        response = llm.chat([ChatMessage(role="user", blocks=blocks)])

	        return response.message.content
	    finally:
	        # Remove temporary artefacts on every exit path (success, early return, error).
	        shutil.rmtree(temp_dir, ignore_errors=True)
	        _purge_downloads()


	# --- Helper function to extract YouTube Video ID ---
	def extract_video_id(url: str) -> Optional[str]:
	    """Extract the YouTube video ID from a URL or a bare video ID.

	    Recognised inputs:

	    * a bare 11-character video ID (``dQw4w9WgXcQ``);
	    * watch URLs, with the ``v`` parameter in any position
	      (``https://www.youtube.com/watch?t=10&v=ID``);
	    * short links (``https://youtu.be/ID``);
	    * ``/embed/ID``, ``/shorts/ID``, ``/live/ID`` and ``/v/ID`` paths.

	    The scheme and a leading ``www.`` are optional.

	    Args:
	        url: The URL or video ID to inspect.

	    Returns:
	        The video ID, or ``None`` when *url* is not a recognised YouTube
	        reference. Nothing is printed or logged here; callers report the
	        failure themselves.
	    """
	    from urllib.parse import parse_qs, urlparse

	    if not url:
	        return None

	    candidate = url.strip()

	    # A bare ID: YouTube IDs are 11 characters from the URL-safe base64 alphabet.
	    if re.fullmatch(r"[A-Za-z0-9_-]{11}", candidate):
	        return candidate

	    # urlparse only fills in the host when a scheme is present.
	    if "://" not in candidate:
	        candidate = "https://" + candidate

	    try:
	        parsed = urlparse(candidate)
	        host = (parsed.hostname or "").lower()
	    except ValueError:
	        # Malformed authority component (e.g. an unterminated IPv6 literal).
	        return None

	    if host.startswith("www."):
	        host = host[len("www."):]

	    if host == "youtu.be":
	        short_id = parsed.path.lstrip("/").split("/")[0]
	        return short_id or None

	    if host in {"youtube.com", "m.youtube.com", "music.youtube.com", "youtube-nocookie.com"}:
	        if parsed.path.rstrip("/") == "/watch":
	            # parse_qs drops the fragment and any other parameters around "v".
	            values = parse_qs(parsed.query).get("v")
	            return values[0] if values else None

	        path_match = re.match(r"^/(?:embed|shorts|live|v)/([^/?#]+)", parsed.path)
	        if path_match:
	            return path_match.group(1)

	    return None


	# --- YouTube Transcript Tool ---
	def get_youtube_transcript(video_url_or_id: str, languages: list[str] | str | None = None) -> str:
	    """Fetch the transcript text of a YouTube video.

	    Args:
	        video_url_or_id: A YouTube URL or video ID; the ID is obtained through
	            ``extract_video_id``.
	        languages: Preferred language codes in priority order, either as a
	            list (``["en", "es"]``) or as a single comma-separated string
	            (``"en"`` / ``"en,es"``). Defaults to ``["en"]``.

	    Returns:
	        The transcript snippets joined with single spaces. On failure a
	        human-readable message starting with ``"Error"`` is returned instead
	        of raising. When no transcript exists in the preferred languages, the
	        first transcript listed for the video (any language) is used.
	    """
	    if languages is None:
	        languages = ["en"]
	    elif isinstance(languages, str):
	        # A plain string would otherwise be iterated character by character
	        # by find_transcript ("en" -> "e", "n").
	        languages = [code.strip() for code in languages.split(",") if code.strip()] or ["en"]
	    else:
	        languages = list(languages)

	    logger.info("Attempting to fetch YouTube transcript for: %s", video_url_or_id)
	    video_id = extract_video_id(video_url_or_id)
	    if not video_id:
	        logger.error("Could not extract video ID from: %s", video_url_or_id)
	        return f"Error: Invalid YouTube URL or Video ID format: {video_url_or_id}"

	    try:
	        # List every transcript available for the video.
	        transcript_list = YouTubeTranscriptApi().list(video_id)

	        try:
	            transcript = transcript_list.find_transcript(languages)
	        except NoTranscriptFound as exc:
	            logger.warning(
	                "No transcript found for video ID %s in languages %s: %s",
	                video_id,
	                languages,
	                exc,
	            )
	            # Fall back to whatever transcript exists; the transcript list is
	            # iterable over its Transcript objects.
	            logger.info("Attempting to fetch any available transcript for %s", video_id)
	            transcript = next(iter(transcript_list), None)
	            if transcript is None:
	                logger.error("Could not find any transcript for video ID %s.", video_id)
	                return (
	                    f"Error: No transcript found for video ID {video_id} "
	                    f"in languages {languages} or any fallback language."
	                )

	        # Fetch the snippets and merge their text into one string.
	        transcript_data = transcript.fetch()
	        full_transcript = " ".join(snippet.text for snippet in transcript_data)
	        logger.info(
	            "Successfully fetched transcript for video ID %s in language %s.",
	            video_id,
	            transcript.language,
	        )
	        return full_transcript

	    except TranscriptsDisabled:
	        logger.warning("Transcripts are disabled for video ID: %s", video_id)
	        return f"Error: Transcripts are disabled for this video (ID: {video_id})."
	    except Exception as e:  # pylint: disable=broad-except
	        # Tool boundary: report the failure to the agent rather than raising.
	        logger.error(
	            "Unexpected error fetching transcript for video ID %s: %s", video_id, e, exc_info=True
	        )
	        return f"Error fetching transcript: {e}"


	# Tool wrapper exposing the frame-sampling + Gemini analysis pipeline to the agent.
	download_video_and_analyze_tool = FunctionTool.from_defaults(
	    fn=download_video_and_analyze,
	    name="download_video_and_analyze",
	    description=(
	        "Downloads a video (YouTube or direct URL), samples representative frames, "
	        "and feeds them to Gemini for multimodal analysis—returning a rich textual summary "
	        "of the visual content."
	    ),
	)

	# Tool wrapper exposing the YouTube transcript fetcher to the agent.
	youtube_transcript_tool = FunctionTool.from_defaults(
	    fn=get_youtube_transcript,
	    name="get_youtube_transcript",
	    description=(
	        "(YouTube) Fetches the transcript text for a given YouTube video URL or video ID. "
	        "Specify preferred languages (e.g., 'en', 'es'). Returns transcript or error."
	    ),
	)


	# ---------------------------------------------------------------------------
	# Agent factory
	# ---------------------------------------------------------------------------

	def initialize_video_analyzer_agent() -> FunctionAgent:
	    """Build the ``video_analyzer_agent`` and return it.

	    The model name is read from ``VIDEO_ANALYZER_LLM_MODEL`` (default
	    ``models/gemini-1.5-pro``) and the API key from ``GEMINI_API_KEY``.

	    Raises:
	        ValueError: If ``GEMINI_API_KEY`` is not set.
	        Exception: Any error raised while constructing the LLM or the agent is
	            logged and re-raised unchanged.
	    """
	    logger.info("Initialising VideoAnalyzerAgent …")

	    model_id = os.getenv("VIDEO_ANALYZER_LLM_MODEL", "models/gemini-1.5-pro")
	    api_key = os.getenv("GEMINI_API_KEY")

	    # Fail fast: nothing below can work without credentials.
	    if not api_key:
	        logger.error("GEMINI_API_KEY not found in environment variables.")
	        raise ValueError("GEMINI_API_KEY must be set")

	    agent_description = (
	        "VideoAnalyzerAgent inspects video files using Gemini's multimodal "
	        "video understanding capabilities, producing factual scene analysis, "
	        "temporal segmentation, and concise summaries as guided by the system "
	        "prompt."
	    )
	    # Names of the agents this one is allowed to hand work over to.
	    handoff_targets = [
	        "planner_agent",
	        "research_agent",
	        "reasoning_agent",
	        "code_agent",
	    ]

	    try:
	        gemini = GoogleGenAI(api_key=api_key, model=model_id)
	        logger.info("Using LLM model: %s", model_id)

	        video_agent = FunctionAgent(
	            name="video_analyzer_agent",
	            description=agent_description,
	            llm=gemini,
	            system_prompt=load_prompt_from_file(),
	            tools=[download_video_and_analyze_tool, youtube_transcript_tool],
	            can_handoff_to=handoff_targets,
	        )
	    except Exception as err:  # pylint: disable=broad-except
	        logger.error("Error during VideoAnalyzerAgent initialisation: %s", err, exc_info=True)
	        raise

	    logger.info("VideoAnalyzerAgent initialised successfully.")
	    return video_agent


	if __name__ == "__main__":
	    # Manual smoke test: builds the agent, runs one video analysis, then
	    # exercises the transcript tool. Both steps need network access.
	    logging.basicConfig(
	        level=logging.INFO,
	        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
	    )

	    logger.info("Running video_analyzer_agent.py directly for testing …")

	    if not os.getenv("GEMINI_API_KEY"):
	        print("Error: GEMINI_API_KEY environment variable not set. Cannot run test.")
	    else:
	        try:
	            test_agent = initialize_video_analyzer_agent()
	            # Report success right after construction so a later download
	            # failure is not mistaken for an initialisation failure.
	            print("Video Analyzer Agent initialised successfully for testing.")
	            summary = download_video_and_analyze("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
	            print("\n--- Gemini summary ---\n")
	            print(summary)
	        except Exception as exc:
	            print(f"Error during testing: {exc}")

	    # The transcript tool does not use the Gemini key, so it is tested regardless.
	    # youtube_transcript_api is imported unconditionally at the top of the
	    # module, so no availability flag is needed here.
	    try:
	        print("\nTesting YouTube transcript tool...")
	        # Example video: "Attention is All You Need" paper explanation
	        yt_url = "https://www.youtube.com/watch?v=TQQlZhbC5ps"
	        transcript = get_youtube_transcript(yt_url)
	        # get_youtube_transcript reports failures as strings beginning with "Error".
	        if not transcript.startswith("Error"):
	            print(f"Transcript fetched (first 500 chars):\n{transcript[:500]}...")
	        else:
	            print(f"YouTube Transcript Fetch Failed: {transcript}")
	    except Exception as e:
	        print(f"Error during testing: {e}")