""" LLM implementation using Hugging Face Inference Endpoint with OpenAI compatibility. """ import requests import os import json import logging # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) # Endpoint configuration HF_API_KEY = os.environ.get("HF_API_KEY", "") ENDPOINT_URL = os.environ.get("ENDPOINT_URL", "https://cg01ow7izccjx1b2.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions") # Verify configuration if not HF_API_KEY: logger.warning("HF_API_KEY environment variable not set") if not ENDPOINT_URL: logger.warning("ENDPOINT_URL environment variable not set") # Memory store for conversation history conversation_memory = {} def run_llm(input_text, max_tokens=512, temperature=0.7): """ Process input text through HF Inference Endpoint. Args: input_text: User input to process max_tokens: Maximum tokens to generate temperature: Temperature for sampling (higher = more random) Returns: Generated response text """ headers = { "Authorization": f"Bearer {HF_API_KEY}", "Content-Type": "application/json" } # Format messages in OpenAI format messages = [ {"role": "system", "content": "You are a helpful AI assistant for a telecom service. Answer questions clearly and concisely."}, {"role": "user", "content": input_text} ] payload = { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "messages": messages, "max_tokens": max_tokens, "temperature": temperature } logger.info(f"Sending request to endpoint: {ENDPOINT_URL[:30]}...") try: response = requests.post(ENDPOINT_URL, headers=headers, json=payload) response.raise_for_status() result = response.json() response_text = result["choices"][0]["message"]["content"] return response_text except requests.exceptions.RequestException as e: error_msg = f"Error calling endpoint: {str(e)}" if hasattr(e, 'response') and e.response is not None: error_msg += f" - Status code: {e.response.status_code}, Response: {e.response.text}" logger.error(error_msg) return f"Error generating response: {str(e)}" def run_llm_with_memory(input_text, session_id="default", max_tokens=512, temperature=0.7): """ Process input with conversation memory. Args: input_text: User input to process session_id: Unique identifier for conversation max_tokens: Maximum tokens to generate temperature: Temperature for sampling Returns: Generated response text """ # Initialize memory if needed if session_id not in conversation_memory: conversation_memory[session_id] = [ {"role": "system", "content": "You are a helpful AI assistant for a telecom service. 
Answer questions clearly and concisely."} ] # Add current input to memory conversation_memory[session_id].append({"role": "user", "content": input_text}) # Prepare the full conversation history messages = conversation_memory[session_id].copy() # Keep only the last 10 messages to avoid context length issues if len(messages) > 10: # Always keep the system message messages = [messages[0]] + messages[-9:] headers = { "Authorization": f"Bearer {HF_API_KEY}", "Content-Type": "application/json" } payload = { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "messages": messages, "max_tokens": max_tokens, "temperature": temperature } logger.info(f"Sending memory-based request for session {session_id}") try: response = requests.post(ENDPOINT_URL, headers=headers, json=payload) response.raise_for_status() result = response.json() response_text = result["choices"][0]["message"]["content"] # Save response to memory conversation_memory[session_id].append({"role": "assistant", "content": response_text}) return response_text except requests.exceptions.RequestException as e: error_msg = f"Error calling endpoint: {str(e)}" if hasattr(e, 'response') and e.response is not None: error_msg += f" - Status code: {e.response.status_code}, Response: {e.response.text}" logger.error(error_msg) return f"Error generating response: {str(e)}" def clear_memory(session_id="default"): """ Clear conversation memory for a specific session. Args: session_id: Unique identifier for conversation """ if session_id in conversation_memory: conversation_memory[session_id] = [ {"role": "system", "content": "You are a helpful AI assistant for a telecom service. Answer questions clearly and concisely."} ] return True return False def get_memory_sessions(): """ Get list of active memory sessions. Returns: List of session IDs """ return list(conversation_memory.keys()) def get_model_info(): """ Get information about the connected model endpoint. Returns: Dictionary with endpoint information """ return { "endpoint_url": ENDPOINT_URL, "memory_sessions": len(conversation_memory), "model_type": "Meta-Llama-3.1-8B-Instruct (Inference Endpoint)" } def test_endpoint(): """ Test the endpoint connection. Returns: Status information """ try: response = run_llm("Hello, this is a test message. Please respond with a short greeting.") return { "status": "connected", "message": "Successfully connected to endpoint", "sample_response": response[:50] + "..." if len(response) > 50 else response } except Exception as e: return { "status": "error", "message": f"Failed to connect to endpoint: {str(e)}" }
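

# ---------------------------------------------------------------------------
# Minimal usage sketch, illustrative only. It assumes HF_API_KEY (and, if the
# default endpoint is not used, ENDPOINT_URL) are exported in the environment
# and that the Inference Endpoint is running and reachable. The session id and
# example prompts below are hypothetical.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Check connectivity before sending real traffic
    status = test_endpoint()
    print(f"Endpoint status: {status['status']} - {status['message']}")

    if status["status"] == "connected":
        # One-off, stateless query
        print(run_llm("What is the difference between 4G and 5G?"))

        # Multi-turn conversation tied to a session id
        session = "demo-session"
        print(run_llm_with_memory("My mobile data is very slow today.", session_id=session))
        print(run_llm_with_memory("What did I just tell you about?", session_id=session))

        # Inspect and clean up conversation state
        print(f"Active sessions: {get_memory_sessions()}")
        print(f"Model info: {get_model_info()}")
        clear_memory(session)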