"""
LLM implementation using Hugging Face Inference Endpoint with OpenAI compatibility.
"""
import requests
import os
import json
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Endpoint configuration
HF_API_KEY = os.environ.get("HF_API_KEY", "")
ENDPOINT_URL = os.environ.get("ENDPOINT_URL", "https://cg01ow7izccjx1b2.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions")
# Verify configuration
if not HF_API_KEY:
logger.warning("HF_API_KEY environment variable not set")
if not ENDPOINT_URL:
logger.warning("ENDPOINT_URL environment variable not set")
# Memory store for conversation history
conversation_memory = {}
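# Each session ID maps to an OpenAI-style message list; illustrative shape:
#   conversation_memory["user-42"] = [
#       {"role": "system", "content": "You are a helpful AI assistant..."},
#       {"role": "user", "content": "Hi"},
#       {"role": "assistant", "content": "Hello! How can I help?"},
#   ]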


def run_llm(input_text, max_tokens=512, temperature=0.7):
    """
    Process input text through the HF Inference Endpoint.

    Args:
        input_text: User input to process
        max_tokens: Maximum number of tokens to generate
        temperature: Sampling temperature (higher = more random)

    Returns:
        Generated response text
    """
    headers = {
        "Authorization": f"Bearer {HF_API_KEY}",
        "Content-Type": "application/json"
    }

    # Format messages in OpenAI format
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant for a telecom service. Answer questions clearly and concisely."},
        {"role": "user", "content": input_text}
    ]

    payload = {
        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature
    }

    logger.info(f"Sending request to endpoint: {ENDPOINT_URL[:30]}...")

    try:
        # timeout keeps a stalled endpoint from hanging the caller indefinitely
        response = requests.post(ENDPOINT_URL, headers=headers, json=payload, timeout=60)
        response.raise_for_status()
        result = response.json()
        response_text = result["choices"][0]["message"]["content"]
        return response_text
    except requests.exceptions.RequestException as e:
        error_msg = f"Error calling endpoint: {str(e)}"
        if hasattr(e, 'response') and e.response is not None:
            error_msg += f" - Status code: {e.response.status_code}, Response: {e.response.text}"
        logger.error(error_msg)
        return f"Error generating response: {str(e)}"


def run_llm_with_memory(input_text, session_id="default", max_tokens=512, temperature=0.7):
    """
    Process input with conversation memory.

    Args:
        input_text: User input to process
        session_id: Unique identifier for the conversation
        max_tokens: Maximum number of tokens to generate
        temperature: Sampling temperature

    Returns:
        Generated response text
    """
    # Initialize memory if needed
    if session_id not in conversation_memory:
        conversation_memory[session_id] = [
            {"role": "system", "content": "You are a helpful AI assistant for a telecom service. Answer questions clearly and concisely."}
        ]

    # Add current input to memory
    conversation_memory[session_id].append({"role": "user", "content": input_text})

    # Prepare the full conversation history
    messages = conversation_memory[session_id].copy()

    # Keep only the last 10 messages to avoid context-length issues
    if len(messages) > 10:
        # Always keep the system message, plus the 9 most recent turns
        messages = [messages[0]] + messages[-9:]

    headers = {
        "Authorization": f"Bearer {HF_API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature
    }

    logger.info(f"Sending memory-based request for session {session_id}")

    try:
        response = requests.post(ENDPOINT_URL, headers=headers, json=payload, timeout=60)
        response.raise_for_status()
        result = response.json()
        response_text = result["choices"][0]["message"]["content"]

        # Save response to memory
        conversation_memory[session_id].append({"role": "assistant", "content": response_text})

        return response_text
    except requests.exceptions.RequestException as e:
        error_msg = f"Error calling endpoint: {str(e)}"
        if hasattr(e, 'response') and e.response is not None:
            error_msg += f" - Status code: {e.response.status_code}, Response: {e.response.text}"
        logger.error(error_msg)
        return f"Error generating response: {str(e)}"


def clear_memory(session_id="default"):
    """
    Clear conversation memory for a specific session.

    Args:
        session_id: Unique identifier for the conversation

    Returns:
        True if the session existed and was reset, False otherwise
    """
    if session_id in conversation_memory:
        conversation_memory[session_id] = [
            {"role": "system", "content": "You are a helpful AI assistant for a telecom service. Answer questions clearly and concisely."}
        ]
        return True
    return False


def get_memory_sessions():
    """
    Get the list of active memory sessions.

    Returns:
        List of session IDs
    """
    return list(conversation_memory.keys())


def get_model_info():
    """
    Get information about the connected model endpoint.

    Returns:
        Dictionary with endpoint information
    """
    return {
        "endpoint_url": ENDPOINT_URL,
        "memory_sessions": len(conversation_memory),
        "model_type": "Meta-Llama-3.1-8B-Instruct (Inference Endpoint)"
    }


def test_endpoint():
    """
    Test the endpoint connection.

    Returns:
        Status information
    """
    try:
        response = run_llm("Hello, this is a test message. Please respond with a short greeting.")
        # run_llm reports request failures as an error string instead of raising,
        # so inspect the returned text as well
        if response.startswith("Error generating response:"):
            return {
                "status": "error",
                "message": response
            }
        return {
            "status": "connected",
            "message": "Successfully connected to endpoint",
            "sample_response": response[:50] + "..." if len(response) > 50 else response
        }
    except Exception as e:
        return {
            "status": "error",
            "message": f"Failed to connect to endpoint: {str(e)}"
        }
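

# A minimal manual smoke test; a sketch that assumes HF_API_KEY is set in the
# environment and the endpoint above is running. Output depends on the live model.
if __name__ == "__main__":
    print(test_endpoint())
    print(run_llm_with_memory("Hi, I need help with my phone bill.", session_id="demo"))
    print(run_llm_with_memory("Can you summarize what I asked?", session_id="demo"))
    print("Active sessions:", get_memory_sessions())
    clear_memory("demo")
    print("Model info:", get_model_info())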