"""
LLM implementation using Hugging Face Inference Endpoint with OpenAI compatibility.
"""
import requests
import os
import json
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Endpoint configuration
HF_API_KEY = os.environ.get("HF_API_KEY", "")
ENDPOINT_URL = os.environ.get("ENDPOINT_URL", "https://cg01ow7izccjx1b2.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions")
# Verify configuration
if not HF_API_KEY:
logger.warning("HF_API_KEY environment variable not set")
if not ENDPOINT_URL:
logger.warning("ENDPOINT_URL environment variable not set")
# Memory store for conversation history
conversation_memory = {}
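# Each session ID maps to an OpenAI-style message list; illustrative shape:
#   conversation_memory["user-42"] = [
#       {"role": "system", "content": "You are a helpful AI assistant..."},
#       {"role": "user", "content": "Hi"},
#       {"role": "assistant", "content": "Hello! How can I help?"},
#   ]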


def run_llm(input_text, max_tokens=512, temperature=0.7):
    """
    Process input text through the HF Inference Endpoint.

    Args:
        input_text: User input to process
        max_tokens: Maximum number of tokens to generate
        temperature: Sampling temperature (higher = more random)

    Returns:
        Generated response text
    """
    headers = {
        "Authorization": f"Bearer {HF_API_KEY}",
        "Content-Type": "application/json"
    }

    # Format messages in OpenAI format
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant for a telecom service. Answer questions clearly and concisely."},
        {"role": "user", "content": input_text}
    ]

    payload = {
        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature
    }

    logger.info(f"Sending request to endpoint: {ENDPOINT_URL[:30]}...")

    try:
        # timeout keeps a stalled endpoint from hanging the caller indefinitely
        response = requests.post(ENDPOINT_URL, headers=headers, json=payload, timeout=60)
        response.raise_for_status()
        result = response.json()
        response_text = result["choices"][0]["message"]["content"]
        return response_text
    except requests.exceptions.RequestException as e:
        error_msg = f"Error calling endpoint: {str(e)}"
        if hasattr(e, 'response') and e.response is not None:
            error_msg += f" - Status code: {e.response.status_code}, Response: {e.response.text}"
        logger.error(error_msg)
        return f"Error generating response: {str(e)}"


def run_llm_with_memory(input_text, session_id="default", max_tokens=512, temperature=0.7):
    """
    Process input with conversation memory.

    Args:
        input_text: User input to process
        session_id: Unique identifier for the conversation
        max_tokens: Maximum number of tokens to generate
        temperature: Sampling temperature

    Returns:
        Generated response text
    """
    # Initialize memory if needed
    if session_id not in conversation_memory:
        conversation_memory[session_id] = [
            {"role": "system", "content": "You are a helpful AI assistant for a telecom service. Answer questions clearly and concisely."}
        ]

    # Add current input to memory
    conversation_memory[session_id].append({"role": "user", "content": input_text})

    # Prepare the full conversation history
    messages = conversation_memory[session_id].copy()

    # Keep only the last 10 messages to avoid context-length issues
    if len(messages) > 10:
        # Always keep the system message, plus the 9 most recent turns
        messages = [messages[0]] + messages[-9:]

    headers = {
        "Authorization": f"Bearer {HF_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature
    }

    logger.info(f"Sending memory-based request for session {session_id}")

    try:
        response = requests.post(ENDPOINT_URL, headers=headers, json=payload, timeout=60)
        response.raise_for_status()
        result = response.json()
        response_text = result["choices"][0]["message"]["content"]

        # Save response to memory
        conversation_memory[session_id].append({"role": "assistant", "content": response_text})

        return response_text
    except requests.exceptions.RequestException as e:
        error_msg = f"Error calling endpoint: {str(e)}"
        if hasattr(e, 'response') and e.response is not None:
            error_msg += f" - Status code: {e.response.status_code}, Response: {e.response.text}"
        logger.error(error_msg)
        return f"Error generating response: {str(e)}"


def clear_memory(session_id="default"):
    """
    Clear conversation memory for a specific session.

    Args:
        session_id: Unique identifier for the conversation

    Returns:
        True if the session existed and was reset, False otherwise
    """
    if session_id in conversation_memory:
        conversation_memory[session_id] = [
            {"role": "system", "content": "You are a helpful AI assistant for a telecom service. Answer questions clearly and concisely."}
        ]
        return True
    return False


def get_memory_sessions():
    """
    Get the list of active memory sessions.

    Returns:
        List of session IDs
    """
    return list(conversation_memory.keys())


def get_model_info():
    """
    Get information about the connected model endpoint.

    Returns:
        Dictionary with endpoint information
    """
    return {
        "endpoint_url": ENDPOINT_URL,
        "memory_sessions": len(conversation_memory),
        "model_type": "Meta-Llama-3.1-8B-Instruct (Inference Endpoint)"
    }


def test_endpoint():
    """
    Test the endpoint connection.

    Returns:
        Status information
    """
    try:
        response = run_llm("Hello, this is a test message. Please respond with a short greeting.")
        # run_llm reports request failures as an error string instead of raising,
        # so inspect the returned text as well
        if response.startswith("Error generating response:"):
            return {
                "status": "error",
                "message": response
            }
        return {
            "status": "connected",
            "message": "Successfully connected to endpoint",
            "sample_response": response[:50] + "..." if len(response) > 50 else response
        }
    except Exception as e:
        return {
            "status": "error",
            "message": f"Failed to connect to endpoint: {str(e)}"
        }
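

# A minimal manual smoke test; a sketch that assumes HF_API_KEY is set in the
# environment and the endpoint above is running. Output depends on the live model.
if __name__ == "__main__":
    print(test_endpoint())
    print(run_llm_with_memory("Hi, I need help with my phone bill.", session_id="demo"))
    print(run_llm_with_memory("Can you summarize what I asked?", session_id="demo"))
    print("Active sessions:", get_memory_sessions())
    clear_memory("demo")
    print("Model info:", get_model_info())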