# NOTE(review): removed page-scrape artifact lines ("Spaces:" / "Running" / "Running")
# that were not part of the original source file.
import logging
import os

import gradio as gr
from dotenv import load_dotenv
from huggingface_hub import HfApi

from llm_interface import ERROR_503_DICT  # Import error dict
from llm_interface import parse_qwen_response, query_qwen_endpoint

# Updated prompt imports for new order
from prompts import format_privacy_prompt, format_summary_highlights_prompt

# Import helper functions from other modules
from utils import list_cached_spaces  # Added import
from utils import (
    check_report_exists,
    download_cached_reports,
    get_space_code_files,
    upload_reports_to_dataset,
)
# Configure logging for the whole app (module functions log through the root logger).
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Load environment variables from .env file.
# This is important to ensure API keys and endpoints are loaded before use.
load_dotenv()

# --- Constants ---
HF_TOKEN = os.getenv("HF_TOKEN")  # required for cache read/write and endpoint checks
ENDPOINT_NAME = "qwen2-5-coder-32b-instruct-pmf"  # dedicated inference endpoint name
DATASET_ID = "yjernite/spaces-privacy-reports"  # dataset used as the report cache
CACHE_INFO_MSG = "\n\n*(Report retrieved from cache)*"  # appended to cached reports
DEFAULT_SELECTION = "HuggingFaceTB/SmolVLM2"  # preselected dropdown entry, if cached

# Prepended to a report whenever the prompt had to be truncated to fit the
# model's context window. (Fixed mojibake: restored the ⚠️ warning emoji.)
TRUNCATION_WARNING = """**⚠️ Warning:** The input data (code and/or prior analysis) was too long for the AI model's context limit and had to be truncated. The analysis below may be incomplete or based on partial information.\n\n---\n\n"""

# Shown to the user when the inference endpoint returns a 503 (down / scaled to zero).
ERROR_503_USER_MESSAGE = """It appears that the analysis model endpoint is currently down or starting up.
You have a few options:
* **Wait & Retry:** Try clicking "Get Space Report" again in ~3-5 minutes. Endpoints often scale down to save resources and take a short time to wake up.
* **Select Cached Report:** Use the dropdown above to view a report for a Space that has already been analyzed.
* **Request Analysis:** If the error persists, please [open an issue or discussion](https://huggingface.co./spaces/yjernite/space-privacy/discussions) in the Space's Community tab requesting analysis for your target Space ID. We can run the job manually when the endpoint is available.
"""
def get_space_report_wrapper(
    selected_cached_space: str | None,
    new_space_id: str | None,
    progress=gr.Progress(track_tqdm=True),
):
    """
    Decide whether to fetch a cached report or run a live analysis.

    A freshly typed Space ID takes precedence over the dropdown selection.
    Yields 4-tuples of Gradio updates targeting, in order:
    (summary markdown, privacy markdown, summary accordion, privacy accordion).

    Args:
        selected_cached_space: Space ID chosen in the cached-reports dropdown, if any.
        new_space_id: Space ID typed into the textbox, if any.
        progress: Gradio progress tracker (injected by the framework).
    """
    target_space_id = None
    source = "new"  # Assume new input unless dropdown is chosen

    # Prioritize new_space_id if provided.
    if new_space_id and new_space_id.strip():
        target_space_id = new_space_id.strip()
        # "dropdown_match": user typed an ID that is also the dropdown selection.
        source = "dropdown_match" if target_space_id == selected_cached_space else "new"
    elif selected_cached_space:
        target_space_id = selected_cached_space
        source = "dropdown"

    if not target_space_id:
        # No input provided.
        # BUGFIX: this function is a generator (it contains `yield`), so a plain
        # `return <tuple>` would silently discard the updates — Gradio only
        # receives yielded values. Yield the error state, then stop.
        yield (
            gr.update(
                value="Please select an existing report or enter a new Space ID.",
                visible=True,
            ),
            gr.update(value="", visible=False),
            gr.update(visible=True, open=True),
            gr.update(visible=False),
        )
        return

    # Validate 'owner/name' format before doing any network work.
    if "/" not in target_space_id:
        # BUGFIX: same generator issue as above — must yield, not return, the update.
        yield (
            gr.update(
                value=f"Invalid Space ID format: '{target_space_id}'. Use 'owner/name'.",
                visible=True,
            ),
            gr.update(value="", visible=False),
            gr.update(visible=True, open=True),
            gr.update(visible=False),
        )
        return

    logging.info(f"Request received for: '{target_space_id}' (Source: {source})")

    # --- Cache Handling ---
    # If the user explicitly selected from the dropdown, try to fetch it directly.
    if source == "dropdown":
        progress(
            0.1, desc="Fetching cached report..."
        )  # Simple progress for cache fetch
        yield (
            gr.update(value="Fetching selected cached report...", visible=True),
            gr.update(value="", visible=True),
            gr.update(visible=True, open=True),
            gr.update(visible=True, open=False),
        )
        try:
            cached_reports = download_cached_reports(
                target_space_id, DATASET_ID, HF_TOKEN
            )
            summary_report = (
                cached_reports.get("summary", "Error: Cached summary not found.")
                + CACHE_INFO_MSG
            )
            privacy_report = (
                cached_reports.get("privacy", "Error: Cached privacy report not found.")
                + CACHE_INFO_MSG
            )
            logging.info(
                f"Successfully displayed cached reports for selected '{target_space_id}'."
            )
            progress(1.0, desc="Complete (from cache)")
            yield (
                gr.update(value=summary_report, visible=True),
                gr.update(value=privacy_report, visible=True),
                gr.update(visible=True, open=True),
                gr.update(visible=True, open=True),
            )
        except Exception as e:
            error_msg = f"Failed to download cached report for selected '{target_space_id}': {e}"
            logging.error(error_msg)
            progress(1.0, desc="Error")
            yield (
                gr.update(value=error_msg, visible=True),
                gr.update(value="", visible=False),
                gr.update(visible=True, open=True),
                gr.update(visible=False),
            )
    # --- Live Analysis or Check Cache for New Input ---
    # If it came from the textbox OR was a dropdown match: check cache, then run live.
    else:  # source == "new" or source == "dropdown_match"
        # Delegate to the full pipeline and re-emit every intermediate update.
        # (Replaces a manual loop that also re-yielded the final state a second
        # time — a redundant duplicate update.)
        yield from _run_live_analysis(target_space_id, progress)
def _run_live_analysis(space_id: str, progress=gr.Progress(track_tqdm=True)):
    """
    Performs the full analysis pipeline: cache check, code fetch, LLM calls, upload.

    Yields 4-tuples of Gradio updates targeting, in order:
    (summary markdown, privacy markdown, summary accordion, privacy accordion).
    (This contains the logic previously in analyze_space_privacy, minus initial
    input handling.)

    Args:
        space_id: Target Space in 'owner/name' form (already validated by caller).
        progress: Gradio progress tracker (injected by the framework).
    """
    steps = 8  # Steps for the full pipeline (used to scale the progress bar)
    # Whether either LLM prompt had to be truncated to fit the context window.
    privacy_truncated = False
    summary_truncated = False

    # --- Step 1: Check Cache --- (Check again for new/matched input)
    progress(1 / steps, desc="Step 1/8: Checking cache...")
    logging.info(f"Step 1/8: Checking cache for '{space_id}'...")
    yield (
        gr.update(value="Checking cache for existing reports...", visible=True),
        gr.update(value="", visible=True),
        gr.update(visible=True, open=True),
        gr.update(visible=True, open=False),
    )
    found_in_cache = False
    if HF_TOKEN:  # cache lookups require an authenticated token
        try:
            found_in_cache = check_report_exists(space_id, DATASET_ID, HF_TOKEN)
        except Exception as e:
            # Cache-check failure is non-fatal: fall through to live analysis.
            logging.warning(f"Cache check failed: {e}. Proceeding.")
            yield (
                gr.update(
                    value="Cache check failed, proceeding with live analysis...",
                    visible=True,
                ),
                gr.update(value="", visible=True),
                gr.update(visible=True, open=True),
                gr.update(visible=True, open=False),
            )
    if found_in_cache:
        logging.info(f"Cache hit for {space_id}. Downloading.")
        progress(2 / steps, desc="Step 2/8: Cache hit! Downloading reports...")
        yield (
            gr.update(value="Cache hit! Downloading reports...", visible=True),
            gr.update(value="", visible=True),
            gr.update(visible=True, open=True),
            gr.update(visible=True, open=False),
        )
        try:
            cached_reports = download_cached_reports(space_id, DATASET_ID, HF_TOKEN)
            summary_report = (
                cached_reports.get("summary", "Error: Cached summary not found.")
                + CACHE_INFO_MSG
            )
            privacy_report = (
                cached_reports.get("privacy", "Error: Cached privacy report not found.")
                + CACHE_INFO_MSG
            )
            logging.info(f"Successfully displayed cached reports for {space_id}.")
            progress(8 / steps, desc="Complete (from cache)")
            yield (
                gr.update(value=summary_report, visible=True),
                gr.update(value=privacy_report, visible=True),
                gr.update(visible=True, open=True),
                gr.update(visible=True, open=True),
            )
            return  # End generation here if cache successful
        except Exception as e:
            # Download failure is also non-fatal: run the live pipeline instead.
            logging.warning(f"Cache download failed for {space_id}: {e}. Proceeding.")
            yield (
                gr.update(
                    value="Cache download failed, proceeding with live analysis...",
                    visible=True,
                ),
                gr.update(value="", visible=True),
                gr.update(visible=True, open=True),
                gr.update(visible=True, open=False),
            )
    else:
        logging.info(f"Cache miss for {space_id}. Performing live analysis.")
        yield (
            gr.update(value="Cache miss. Fetching code...", visible=True),
            gr.update(value="", visible=True),
            gr.update(visible=True, open=True),
            gr.update(visible=True, open=False),
        )

    # --- Step 2: Check Endpoint Status ---
    progress(2 / steps, desc="Step 2/8: Checking endpoint status...")
    logging.info("Step 2/8: Checking endpoint status...")
    yield (
        gr.update(value="Checking whether model endpoint is active...", visible=True),
        gr.update(value="", visible=True),
        gr.update(visible=True, open=True),
        gr.update(visible=True, open=False),
    )
    # NOTE(review): endpoint_ready is set but never read after this section —
    # candidate for cleanup.
    endpoint_ready = False
    if HF_TOKEN:
        try:
            api = HfApi(token=HF_TOKEN)
            endpoint = api.get_inference_endpoint(name=ENDPOINT_NAME)
            status = endpoint.status
            logging.info(f"Endpoint '{ENDPOINT_NAME}' status: {status}")
            if status == 'running':
                endpoint_ready = True
            else:
                logging.warning(f"Endpoint '{ENDPOINT_NAME}' is not ready (Status: {status}).")
                if status == 'scaledToZero':
                    # Kick the endpoint awake so a retry in a few minutes succeeds.
                    logging.info(f"Endpoint '{ENDPOINT_NAME}' is scaled to zero. Attempting to resume...")
                    endpoint.resume()
                # Tell the user the service is unavailable and stop the pipeline.
                msg_503 = f"**Full Service Temporarily Unavailable**: but you can **browse existing reports** or **check back later!**\n\n The status of the Qwen2.5-Coder-32B-Instruct endpoint powering the analysis is currently: <span style='color:red'>**{status}**</span>\n\n" + ERROR_503_USER_MESSAGE
                yield (
                    gr.update(value=msg_503, visible=True),
                    gr.update(value="", visible=False),
                    gr.update(visible=True, open=True),
                    gr.update(visible=False)
                )
                return  # Stop analysis, user needs to retry
        except Exception as e:
            logging.error(f"Error checking endpoint status for {ENDPOINT_NAME}: {e}")
            yield (
                gr.update(value=f"Error checking analysis endpoint status: {e}", visible=True),
                gr.update(value="", visible=False),
                gr.update(visible=True, open=True),
                gr.update(visible=False)
            )
            return  # Stop analysis

    # --- Step 3: Fetch Code Files (if not cached) ---
    progress(3 / steps, desc="Step 3/8: Fetching code files...")
    logging.info("Step 3/8: Fetching code files...")
    code_files = get_space_code_files(space_id)
    if not code_files:
        error_msg = f"Could not retrieve code files for '{space_id}'. Check ID and ensure it's a public Space."
        logging.warning(error_msg)
        yield (
            gr.update(value=f"**Error:**\n{error_msg}", visible=True),
            gr.update(value="Analysis Canceled", visible=True),
            gr.update(visible=True, open=True),
            gr.update(visible=True, open=False),
        )
        return  # End generation on error

    # --- Step 4: Generate DETAILED Privacy Report (LLM Call 1) ---
    progress(
        4 / steps, desc="Step 4/8: Generating detailed privacy report (AI Call 1)..."
    )
    logging.info("Step 4/8: Generating detailed privacy analysis report...")
    yield (
        gr.update(value="Generating detailed privacy report...", visible=True),
        gr.update(value="Generating detailed privacy report via AI...", visible=True),
        gr.update(visible=True, open=True),
        gr.update(visible=True, open=True),
    )
    privacy_prompt_messages, privacy_truncated = format_privacy_prompt(
        space_id, code_files
    )
    # --- Check for 503 after query ---
    privacy_api_response = query_qwen_endpoint(privacy_prompt_messages, max_tokens=3072)
    if privacy_api_response == ERROR_503_DICT:
        logging.warning("LLM Call 1 failed with 503.")
        yield (
            gr.update(
                value=ERROR_503_USER_MESSAGE, visible=True
            ),  # Show 503 message in summary area
            gr.update(value="", visible=False),  # Clear privacy area
            gr.update(visible=True, open=True),  # Keep summary open
            gr.update(visible=False),  # Hide privacy accordion
        )
        return  # Stop analysis
    detailed_privacy_report = parse_qwen_response(privacy_api_response)
    # NOTE(review): error detection by substring match on "Error:" — assumes
    # parse_qwen_response embeds that marker on failure; confirm in llm_interface.
    if "Error:" in detailed_privacy_report:
        logging.error(
            f"Failed to generate detailed privacy report: {detailed_privacy_report}"
        )
        yield (
            gr.update(value="Analysis Halted due to Error", visible=True),
            gr.update(
                value=f"**Error Generating Detailed Privacy Report:**\n{detailed_privacy_report}",
                visible=True,
            ),
            gr.update(visible=True, open=True),
            gr.update(visible=True, open=True),
        )
        return  # End generation on error
    if privacy_truncated:
        # Warn the user that the analysis may be based on partial input.
        detailed_privacy_report = TRUNCATION_WARNING + detailed_privacy_report
    yield (
        gr.update(value="Extracting model info...", visible=True),
        gr.update(value=detailed_privacy_report, visible=True),
        gr.update(visible=True, open=True),
        gr.update(visible=True, open=True),
    )

    # --- Step 5: Fetch Model Descriptions ---
    progress(5 / steps, desc="Step 5/8: Fetching model descriptions...")
    logging.info("Step 5/8: Fetching model descriptions...")
    yield (
        gr.update(value="Fetching model descriptions...", visible=True),
        gr.update(),  # empty updates leave the other components unchanged
        gr.update(),
        gr.update(),
    )

    # --- Step 6: Generate Summary + Highlights Report (LLM Call 2) ---
    progress(6 / steps, desc="Step 6/8: Generating summary & highlights (AI Call 2)...")
    logging.info("Step 6/8: Generating summary and highlights report...")
    yield (
        gr.update(value="Generating summary & highlights via AI...", visible=True),
        gr.update(),
        gr.update(),
        gr.update(),
    )
    summary_highlights_prompt_messages, summary_truncated = (
        format_summary_highlights_prompt(space_id, code_files, detailed_privacy_report)
    )
    # --- Check for 503 after query ---
    summary_highlights_api_response = query_qwen_endpoint(
        summary_highlights_prompt_messages, max_tokens=2048
    )
    if summary_highlights_api_response == ERROR_503_DICT:
        logging.warning("LLM Call 2 failed with 503.")
        yield (
            gr.update(
                value=ERROR_503_USER_MESSAGE, visible=True
            ),  # Show 503 message in summary area
            gr.update(
                value=detailed_privacy_report, visible=True
            ),  # Keep previous report visible
            gr.update(visible=True, open=True),  # Keep summary open
            gr.update(visible=True, open=True),  # Keep privacy open
        )
        return  # Stop analysis
    summary_highlights_report = parse_qwen_response(summary_highlights_api_response)
    if "Error:" in summary_highlights_report:
        logging.error(
            f"Failed to generate summary/highlights report: {summary_highlights_report}"
        )
        yield (
            gr.update(
                value=f"**Error Generating Summary/Highlights:**\n{summary_highlights_report}",
                visible=True,
            ),
            gr.update(value=detailed_privacy_report, visible=True),
            gr.update(visible=True, open=True),
            gr.update(visible=True, open=True),
        )
        return  # End generation on error
    if summary_truncated:
        summary_highlights_report = TRUNCATION_WARNING + summary_highlights_report
    # Yield summary report before attempting upload (so the user sees results
    # even if the upload below fails or is slow).
    yield (
        gr.update(value=summary_highlights_report, visible=True),
        gr.update(value=detailed_privacy_report, visible=True),
        gr.update(visible=True, open=True),
        gr.update(visible=True, open=True),
    )

    # --- Step 7: Upload to Cache ---
    progress(7 / steps, desc="Step 7/8: Uploading results to cache...")
    logging.info("Step 7/8: Attempting to upload results to dataset cache...")
    try:
        # Only cache clean, freshly generated reports.
        if (
            HF_TOKEN
            and not found_in_cache
            and "Error:" not in detailed_privacy_report
            and "Error:" not in summary_highlights_report
        ):
            # Strip transient markers so the cached copy is clean.
            summary_to_save = summary_highlights_report.replace(
                TRUNCATION_WARNING, ""
            ).replace(CACHE_INFO_MSG, "")
            privacy_to_save = detailed_privacy_report.replace(
                TRUNCATION_WARNING, ""
            ).replace(CACHE_INFO_MSG, "")
            upload_reports_to_dataset(
                space_id=space_id,
                summary_report=summary_to_save,
                detailed_report=privacy_to_save,
                dataset_id=DATASET_ID,
                hf_token=HF_TOKEN,
            )
        elif not HF_TOKEN:
            logging.warning("Skipping cache upload as HF_TOKEN is not set.")
        elif found_in_cache:
            logging.info("Skipping cache upload as results were loaded from cache.")
    except Exception as e:
        # Upload problems must not hide the reports that were already generated.
        logging.error(f"Non-critical error during report upload: {e}")

    logging.info("Step 8/8: Analysis complete.")
    progress(8 / steps, desc="Step 8/8: Analysis Complete!")

    # --- Step 8: Yield Final Results --- (Ensure final state is correct)
    yield (
        gr.update(value=summary_highlights_report, visible=True),
        gr.update(value=detailed_privacy_report, visible=True),
        gr.update(visible=True, open=True),
        gr.update(visible=True, open=True),
    )
# --- Load Initial Data Function (for demo.load) ---
def load_cached_list():
    """Fetch the list of cached spaces and pick the default dropdown selection.

    Returns:
        A gr.update for the dropdown, setting its choices to the fetched list
        and its value to DEFAULT_SELECTION when that Space is in the list,
        otherwise None.
    """
    # CONSISTENCY FIX: the rest of the module logs via `logging`; this function
    # used bare print() calls.
    logging.info("Running demo.load: Fetching list of cached spaces...")
    # Use os.getenv here directly as HF_TOKEN might be loaded after initial import.
    token = os.getenv("HF_TOKEN")
    cached_list = list_cached_spaces(DATASET_ID, token)
    if not cached_list:
        logging.warning(
            "No cached spaces found or failed to fetch list during demo.load."
        )
    default_value = DEFAULT_SELECTION if DEFAULT_SELECTION in cached_list else None
    # Return an update object for the dropdown using gr.update().
    return gr.update(choices=cached_list, value=default_value)
# --- Gradio Interface Definition ---

# Use HTML/CSS for centering the title.
# NOTE(review): emoji reconstructed from mojibake in the original (UTF-8 bytes
# decoded as ISO-8859-7); 🤗 and 🕵️ are certain from the byte pattern and context.
TITLE = "<div style='text-align: center;'><h1>🤗 Space Privacy Analyzer 🕵️</h1></div>\n<div style='text-align: center;'><h4>Automatic code Data transfer review powered by <a href='https://huggingface.co./Qwen/Qwen2.5-Coder-32B-Instruct' target='_blank'>Qwen2.5-Coder-32B-Instruct</a></h4></div>"

# Markdown blurb shown in the collapsible description accordion.
# Blank lines restored so the numbered/bulleted lists render correctly.
DESCRIPTION = """
### Check the Privacy of a Hugging Face Space

[Hugging Face 🤗 Spaces](https://huggingface.co./spaces) offer a convenient way to build and share code demos online.
In most cases, the code for these demos is open source — which provides a unique opportunity to **examine how privacy is managed** in the demo.

This demo leverages a code analysis model ([Qwen2.5-Coder-32B-Instruct](https://huggingface.co./Qwen/Qwen2.5-Coder-32B-Instruct)) to help explore privacy questions in two steps:

1. Obtain and **parse the code** of a Space to identify data inputs, AI model use, API calls, and data transfer behavior.
2. Generate a summary of the Space's function and highlight **key privacy points**.

Use the dropdown menu below to explore the [reports generated for some popular Spaces](https://huggingface.co./datasets/yjernite/spaces-privacy-reports/tree/main), or enter a new Space ID to query your own 🔍

*Please note the following limitations:*
- *The model may miss important details in the code, especially when it leverages Docker files or external libraries.*
- *This app uses the base Qwen Coder model without specific adaptation to the task. We'd love to discuss how to improve this, if you want to participate [feel free to open a discussion!](https://huggingface.co./spaces/yjernite/space-privacy/discussions)*
"""
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(TITLE)  # Renders the centered HTML title

    with gr.Row():
        with gr.Column(scale=1):  # Left column for inputs
            # TYPO FIX: "Demo Desription" -> "Demo Description"; emoji restored
            # from mojibake (🤗; trailing book emoji is a best guess — confirm).
            description_accordion = gr.Accordion(
                "What Privacy Questions do 🤗 Spaces Raise? Demo Description 📖",
                open=False,
                visible=True,
            )
            with description_accordion:
                gr.Markdown(DESCRIPTION)
            cached_spaces_dropdown = gr.Dropdown(
                label="Select Existing Report",
                info="Select a Space whose report has been previously generated.",
                choices=[],  # Initialize empty, populated by demo.load below
                value=None,  # Initialize empty
            )
            space_id_input = gr.Textbox(
                label="Or Enter New Space ID",
                placeholder="owner/space-name",
                info="Enter a new Space ID to analyze (takes precedence over selection).",
            )
            analyze_button = gr.Button("Get Space Report", variant="primary", scale=1)

        with gr.Column(scale=1):  # Right column for outputs
            # Accordions are visible from the start (open by default for the
            # summary); report text streams into them during analysis.
            summary_accordion = gr.Accordion(
                "Summary & Privacy Highlights", open=True, visible=True
            )
            privacy_accordion = gr.Accordion(
                "Detailed Privacy Analysis Report", open=False, visible=True
            )
            with summary_accordion:
                summary_markdown = gr.Markdown(
                    "Enter or select a Space ID and click Get Report.",
                    show_copy_button=True,
                )
            with privacy_accordion:
                privacy_markdown = gr.Markdown(
                    "Detailed report will appear here.", show_copy_button=True
                )

    # --- Event Listeners ---
    # Load event to populate the dropdown when the UI loads for a user session.
    demo.load(fn=load_cached_list, inputs=None, outputs=cached_spaces_dropdown)

    # Button click event: the generator streams updates into the four outputs.
    analyze_button.click(
        fn=get_space_report_wrapper,
        inputs=[cached_spaces_dropdown, space_id_input],
        outputs=[
            summary_markdown,
            privacy_markdown,
            summary_accordion,
            privacy_accordion,
        ],
        show_progress="full",
    )
# --- Application Entry Point ---
if __name__ == "__main__":
    logging.info("Starting Gradio application...")
    # Launch the Gradio UI defined above (blocking call with default settings).
    demo.launch()