# loggenix-moe-0.3B-A0.1B-demo / model_handler_ollama.py
# Runs the demo model via Ollama GGUF quants for faster inference.
import requests
import json
import re
import time
from typing import Dict, Any, Optional, List
# Ollama configuration
OLLAMA_BASE_URL = "http://localhost:11434" # Default Ollama URL
MODEL_NAME = "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.3-finetuned-tool-Q8_0-GGUF:Q8_0" # Replace with your actual model name in Ollama
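# Note: Ollama can pull GGUF quants directly from Hugging Face using the
# "hf.co/<user>/<repo>:<quant>" naming scheme used above, e.g.:
#   ollama pull hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.3-finetuned-tool-Q8_0-GGUF:Q8_0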
# Inference configurations
INFERENCE_CONFIGS = {
"Optimized for Speed": {
"num_predict": 512,
"temperature": 0.7,
"top_p": 0.9,
"top_k": 40,
"repeat_penalty": 1.1,
"description": "Fast responses with limited output length"
},
"Middle-ground": {
"num_predict": 2048,
"temperature": 0.7,
"top_p": 0.9,
"top_k": 40,
"repeat_penalty": 1.1,
"description": "Balanced performance and output quality"
},
"Full Capacity": {
"num_predict": 4096,
"temperature": 0.7,
"top_p": 0.9,
"top_k": 40,
"repeat_penalty": 1.1,
"description": "Maximum output length with dynamic allocation"
}
}
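# Presets are consumed by generate_response() below, e.g.:
#   config = INFERENCE_CONFIGS["Optimized for Speed"]
# and forwarded to Ollama as generation options.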
def get_inference_configs():
"""Get available inference configurations"""
return INFERENCE_CONFIGS
def check_ollama_connection():
"""Check if Ollama is running and accessible"""
try:
response = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=5)
return response.status_code == 200
except requests.RequestException:
return False
def list_ollama_models():
"""List available models in Ollama"""
try:
response = requests.get(f"{OLLAMA_BASE_URL}/api/tags", timeout=5)
if response.status_code == 200:
models = response.json().get("models", [])
return [model["name"] for model in models]
return []
except requests.RequestException:
return []
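# Example (illustrative; actual names depend on what you have pulled locally):
#   list_ollama_models()
#   -> ["hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.3-finetuned-tool-Q8_0-GGUF:Q8_0"]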
def load_model():
"""Check Ollama connection and model availability"""
if not check_ollama_connection():
raise ConnectionError(
"Cannot connect to Ollama. Please make sure Ollama is running.\n"
"Start Ollama with: ollama serve"
)
available_models = list_ollama_models()
if MODEL_NAME not in available_models:
print(f"Warning: Model '{MODEL_NAME}' not found in Ollama.")
print(f"Available models: {available_models}")
print(f"Pull your model with: ollama pull {MODEL_NAME}")
return False
print(f"Using Ollama model: {MODEL_NAME}")
return True
# ===== TOOL DEFINITIONS =====
def calculate_numbers(operation: str, num1: float, num2: float) -> Dict[str, Any]:
"""
Sample tool to perform basic mathematical operations on two numbers.
Args:
operation: The operation to perform ('add', 'subtract', 'multiply', 'divide')
num1: First number
num2: Second number
Returns:
Dictionary with result and operation details
"""
try:
num1, num2 = float(num1), float(num2)
if operation.lower() == 'add':
result = num1 + num2
elif operation.lower() == 'subtract':
result = num1 - num2
elif operation.lower() == 'multiply':
result = num1 * num2
elif operation.lower() == 'divide':
if num2 == 0:
return {"error": "Division by zero is not allowed"}
result = num1 / num2
else:
return {"error": f"Unknown operation: {operation}"}
return {
"result": result,
"operation": operation,
"operands": [num1, num2],
"formatted": f"{num1} {operation} {num2} = {result}"
}
except ValueError as e:
return {"error": f"Invalid number format: {str(e)}"}
except Exception as e:
return {"error": f"Calculation error: {str(e)}"}
# Tool registry
AVAILABLE_TOOLS = {
"calculate_numbers": {
"function": calculate_numbers,
"description": "Perform basic mathematical operations (add, subtract, multiply, divide) on two numbers",
"parameters": {
"operation": "The mathematical operation to perform",
"num1": "First number",
"num2": "Second number"
}
}
}
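# Sketch: how a second tool could be registered. `get_current_time` is a
# hypothetical example, not part of the original tool set; any function that
# returns a dict and is added to AVAILABLE_TOOLS in this shape becomes
# callable through the same [TOOL_CALL:...] mechanism.
#
# from datetime import datetime, timezone
#
# def get_current_time() -> Dict[str, Any]:
#     """Return the current UTC time as an ISO-8601 string."""
#     return {"result": datetime.now(timezone.utc).isoformat()}
#
# AVAILABLE_TOOLS["get_current_time"] = {
#     "function": get_current_time,
#     "description": "Get the current UTC time",
#     "parameters": {},
# }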
def execute_tool_call(tool_name: str, **kwargs) -> Dict[str, Any]:
"""Execute a tool call with given parameters"""
print(f"Executing tool: {tool_name} with parameters: {kwargs}")
if tool_name not in AVAILABLE_TOOLS:
return {"error": f"Unknown tool: {tool_name}"}
try:
tool_function = AVAILABLE_TOOLS[tool_name]["function"]
result = tool_function(**kwargs)
return {
"tool_name": tool_name,
"parameters": kwargs,
"result": result
}
except Exception as e:
print(f"Tool execution failed: {str(e)}")
return {
"tool_name": tool_name,
"parameters": kwargs,
"error": f"Tool execution error: {str(e)}"
}
def parse_tool_calls(text: str) -> list:
"""
Parse tool calls from model output.
Supports both formats:
- [TOOL_CALL:tool_name(param1=value1, param2=value2)]
- <tool_call>{"name": "tool_name", "parameters": {"param1": "value1", "param2": "value2"}}</tool_call>
"""
tool_calls = []
# Pattern for both formats
pattern = r'(\[TOOL_CALL:(\w+)\((.*?)\)\]|<tool_call>\s*{"name":\s*"(\w+)",\s*"parameters":\s*{([^}]*)}\s*}\s*</tool_call>)'
matches = re.findall(pattern, text)
print("Raw matches:", matches)
for match in matches:
full_match, old_tool_name, old_params, json_tool_name, json_params = match
# Determine which format was matched
if old_tool_name: # Old format: [TOOL_CALL:tool_name(params)]
tool_name = old_tool_name
params_str = old_params
original_call = f"[TOOL_CALL:{tool_name}({params_str})]"
try:
params = {}
if params_str.strip():
param_pairs = params_str.split(',')
for pair in param_pairs:
if '=' in pair:
key, value = pair.split('=', 1)
key = key.strip()
value = value.strip().strip('"\'') # Remove quotes
params[key] = value
tool_calls.append({
"tool_name": tool_name,
"parameters": params,
"original_call": original_call
})
except Exception as e:
print(f"Error parsing old format tool call '{tool_name}({params_str})': {e}")
continue
elif json_tool_name: # JSON format: <tool_call>...</tool_call>
tool_name = json_tool_name
params_str = json_params
original_call = full_match
try:
params = {}
if params_str.strip():
# Parse JSON-like parameters
param_pairs = params_str.split(',')
for pair in param_pairs:
if ':' in pair:
key, value = pair.split(':', 1)
key = key.strip().strip('"\'')
value = value.strip().strip('"\'')
params[key] = value
tool_calls.append({
"tool_name": tool_name,
"parameters": params,
"original_call": original_call
})
except Exception as e:
print(f"Error parsing JSON format tool call '{tool_name}': {e}")
continue
return tool_calls
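# Example (derived from the parser above):
#   parse_tool_calls('[TOOL_CALL:calculate_numbers(operation="add", num1="2", num2="3")]')
#   -> [{"tool_name": "calculate_numbers",
#        "parameters": {"operation": "add", "num1": "2", "num2": "3"},
#        "original_call": '[TOOL_CALL:calculate_numbers(operation="add", num1="2", num2="3")]'}]
# Caveat: both branches split the parameter string on commas, so values that
# themselves contain commas (or nested braces in the JSON form) will not parse
# correctly.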
def process_tool_calls(text: str) -> str:
"""Process tool calls in the generated text and replace with results"""
tool_calls = parse_tool_calls(text)
if not tool_calls:
return text
processed_text = text
for tool_call in tool_calls:
tool_name = tool_call["tool_name"]
parameters = tool_call["parameters"]
original_call = tool_call["original_call"]
try:
# Validate parameters before execution
if not isinstance(parameters, dict):
raise ValueError(f"Invalid parameters for tool {tool_name}: {parameters}")
# Execute tool
result = execute_tool_call(tool_name, **parameters)
            # Create replacement text, surfacing tool-level errors explicitly
            if "error" in result:
                replacement = f"[TOOL_ERROR: {result['error']}]"
            elif isinstance(result["result"], dict) and "error" in result["result"]:
                replacement = f"[TOOL_ERROR: {result['result']['error']}]"
            elif isinstance(result["result"], dict) and "formatted" in result["result"]:
                replacement = f"[TOOL_RESULT: {result['result']['formatted']}]"
            else:
                replacement = f"[TOOL_RESULT: {result['result']}]"
# Replace tool call with result
processed_text = processed_text.replace(original_call, replacement)
except Exception as e:
print(f"Error processing tool call '{tool_name}': {e}")
replacement = f"[TOOL_ERROR: Failed to process tool call: {str(e)}]"
processed_text = processed_text.replace(original_call, replacement)
return processed_text
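# Example (derived from the functions above): given model output
#   'The answer is [TOOL_CALL:calculate_numbers(operation="add", num1="2", num2="3")].'
# process_tool_calls() executes the call and returns
#   'The answer is [TOOL_RESULT: 2.0 add 3.0 = 5.0].'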
def call_ollama_api(messages: List[Dict], config: Dict, stream: bool = False) -> str:
"""
Make a request to Ollama API
Args:
messages: List of message dictionaries with 'role' and 'content'
config: Configuration dictionary with inference parameters
stream: Whether to stream the response
Returns:
Generated response text
"""
# Convert messages to prompt format expected by your model
# This might need adjustment based on your model's expected format
prompt = ""
for msg in messages:
if msg["role"] == "system":
prompt += f"System: {msg['content']}\n\n"
elif msg["role"] == "user":
prompt += f"User: {msg['content']}\n\n"
elif msg["role"] == "assistant":
prompt += f"Assistant: {msg['content']}\n\n"
prompt += "Assistant: "
payload = {
"model": MODEL_NAME,
"prompt": prompt,
"stream": stream,
"options": {
"num_predict": config.get("num_predict", 2048),
"temperature": config.get("temperature", 0.7),
"top_p": config.get("top_p", 0.9),
"top_k": config.get("top_k", 40),
"repeat_penalty": config.get("repeat_penalty", 1.1),
}
}
try:
if stream:
return stream_ollama_response(payload)
else:
response = requests.post(
f"{OLLAMA_BASE_URL}/api/generate",
json=payload,
                timeout=300  # 5-minute timeout
)
response.raise_for_status()
result = response.json()
return result.get("response", "")
except requests.RequestException as e:
raise ConnectionError(f"Failed to connect to Ollama: {str(e)}")
except json.JSONDecodeError as e:
raise ValueError(f"Invalid response from Ollama: {str(e)}")
def stream_ollama_response(payload: Dict) -> str:
"""Stream response from Ollama and return complete text"""
full_response = ""
try:
response = requests.post(
f"{OLLAMA_BASE_URL}/api/generate",
json=payload,
stream=True,
timeout=300
)
response.raise_for_status()
for line in response.iter_lines():
if line:
try:
chunk = json.loads(line.decode('utf-8'))
if 'response' in chunk:
token = chunk['response']
full_response += token
print(token, end='', flush=True) # Print tokens as they come
if chunk.get('done', False):
break
except json.JSONDecodeError:
continue
except requests.RequestException as e:
raise ConnectionError(f"Streaming failed: {str(e)}")
print() # New line after streaming
return full_response
def generate_response(system_prompt: str, user_input: str, config_name: str = "Middle-ground",
stream: bool = False) -> str:
"""
Generate response using Ollama API with the given system prompt and user input.
Args:
system_prompt: System instruction for the model
user_input: User's input message
config_name: Configuration preset to use
stream: Whether to stream the response
Returns:
Generated response text
"""
# Load/check model
if not load_model():
return "Error: Model not available in Ollama"
config = INFERENCE_CONFIGS[config_name]
# Prepare messages
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_input}
]
start_time = time.time()
try:
# Generate response using Ollama
generated_response = call_ollama_api(messages, config, stream=stream)
inference_time = time.time() - start_time
print(f"Inference time: {inference_time:.2f} seconds")
# Process any tool calls in the generated response
processed_response = process_tool_calls(generated_response)
return processed_response
except Exception as e:
print(f"Error generating response: {str(e)}")
return f"Error: {str(e)}"
# Example usage and testing functions
def test_connection():
"""Test Ollama connection and model availability"""
print("Testing Ollama connection...")
if not check_ollama_connection():
print("❌ Cannot connect to Ollama")
print("Make sure Ollama is running: ollama serve")
return False
print("βœ… Ollama is running")
models = list_ollama_models()
print(f"Available models: {models}")
if MODEL_NAME not in models:
print(f"❌ Model '{MODEL_NAME}' not found")
print(f"Pull the model with: ollama pull {MODEL_NAME}")
return False
print(f"βœ… Model '{MODEL_NAME}' is available")
return True
def example_usage():
"""Example of how to use the system"""
if not test_connection():
return
system_prompt = """You are a helpful AI assistant with access to tools. When you need to perform mathematical calculations, use the available tools by calling them in this format: [TOOL_CALL:calculate_numbers(operation="add", num1="10", num2="5")]
Available tools:
- calculate_numbers: Perform basic math operations (add, subtract, multiply, divide)
"""
user_input = "What is 125 + 675? Please calculate this for me."
print("Generating response...")
response = generate_response(system_prompt, user_input, "Middle-ground", stream=True)
print(f"\nFinal response: {response}")
if __name__ == "__main__":
# Update MODEL_NAME to match your model in Ollama
MODEL_NAME = "hf.co/kshitijthakkar/loggenix-moe-0.3B-A0.1B-e3-lr7e5-b16-4090-v6.3-finetuned-tool-Q8_0-GGUF:Q8_0" # Change this!
example_usage()