Spaces:

yjernite
/

space-privacy

Running

File size: 24,246 Bytes

import logging
import os

import gradio as gr
from dotenv import load_dotenv

from huggingface_hub import HfApi

from llm_interface import ERROR_503_DICT  # Import error dict
from llm_interface import parse_qwen_response, query_qwen_endpoint

# Updated prompt imports for new order
from prompts import format_privacy_prompt, format_summary_highlights_prompt

# Import helper functions from other modules
from utils import list_cached_spaces  # Added import
from utils import (
    check_report_exists,
    download_cached_reports,
    get_space_code_files,
    upload_reports_to_dataset,
)

# Configure logging
# Root logger emits INFO and above, formatted as "<time> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Load environment variables from .env file
# This is important to ensure API keys and endpoints are loaded before use
load_dotenv()

# --- Constants ---
# Hub token read once at import time; None when the variable is unset, in which
# case the cache check, endpoint status check and cache upload are skipped.
HF_TOKEN = os.getenv("HF_TOKEN")
# Name passed to HfApi.get_inference_endpoint() to look up the analysis endpoint.
ENDPOINT_NAME = "qwen2-5-coder-32b-instruct-pmf"
# Dataset repo id handed to the report cache helpers (check/download/upload/list).
DATASET_ID = "yjernite/spaces-privacy-reports"
# Suffix appended to reports that were served from the cache; it is stripped
# again before a report is uploaded.
CACHE_INFO_MSG = "\n\n*(Report retrieved from cache)*"
# Space pre-selected in the dropdown when it is present in the cached list.
DEFAULT_SELECTION = "HuggingFaceTB/SmolVLM2"

# Prefix added to a report when the prompt formatter reports that its input was
# truncated; it is stripped again before a report is uploaded.
TRUNCATION_WARNING = """**⚠️ Warning:** The input data (code and/or prior analysis) was too long for the AI model's context limit and had to be truncated. The analysis below may be incomplete or based on partial information.\n\n---\n\n"""

# Markdown shown in the summary pane when the endpoint is not running or an LLM
# call returns ERROR_503_DICT.
ERROR_503_USER_MESSAGE = """It appears that the analysis model endpoint is currently down or starting up. 

You have a few options:

*   **Wait & Retry:** Try clicking "Get Space Report" again in ~3-5 minutes. Endpoints often scale down to save resources and take a short time to wake up.
*   **Select Cached Report:** Use the dropdown above to view a report for a Space that has already been analyzed.
*   **Request Analysis:** If the error persists, please [open an issue or discussion](https://huggingface.co./spaces/yjernite/space-privacy/discussions) in the Space's Community tab requesting analysis for your target Space ID. We can run the job manually when the endpoint is available.
"""


def get_space_report_wrapper(
    selected_cached_space: str | None,
    new_space_id: str | None,
    progress=gr.Progress(track_tqdm=True),
):
    """
    Decide whether to serve a cached report or run the live analysis.

    A non-blank textbox value takes precedence over the dropdown selection.
    A pure dropdown selection is fetched straight from the report cache; any
    typed ID (even one equal to the dropdown value) goes through
    `_run_live_analysis`, which checks the cache itself before analysing.

    This function is a generator: every UI state, including the validation
    messages, must be *yielded*. A `return <value>` inside a generator is never
    delivered to the caller as an output.

    Args:
        selected_cached_space: Value of the "Select Existing Report" dropdown.
        new_space_id: Raw text of the "Or Enter New Space ID" textbox.
        progress: Gradio progress tracker used to report step completion.

    Yields:
        4-tuples of Gradio updates, in the order
        (summary markdown, privacy markdown, summary accordion, privacy accordion).
    """
    target_space_id = None
    source = "new"  # Assume new input unless dropdown is chosen

    # Prioritize new_space_id if provided
    typed_id = new_space_id.strip() if new_space_id else ""
    if typed_id:
        target_space_id = typed_id
        if target_space_id == selected_cached_space:
            source = "dropdown_match"  # User typed ID that exists in dropdown
    elif selected_cached_space:
        target_space_id = selected_cached_space
        source = "dropdown"

    if not target_space_id:
        # No input provided: yield (not return) so the message reaches the UI.
        yield (
            gr.update(
                value="Please select an existing report or enter a new Space ID.",
                visible=True,
            ),
            gr.update(value="", visible=False),
            gr.update(visible=True, open=True),
            gr.update(visible=False),
        )
        return

    # Validate format
    if "/" not in target_space_id:
        yield (
            gr.update(
                value=f"Invalid Space ID format: '{target_space_id}'. Use 'owner/name'.",
                visible=True,
            ),
            gr.update(value="", visible=False),
            gr.update(visible=True, open=True),
            gr.update(visible=False),
        )
        return

    logging.info(f"Request received for: '{target_space_id}' (Source: {source})")

    # --- Live Analysis or Check Cache for New Input ---
    # Textbox input (source "new" or "dropdown_match"): the live pipeline checks
    # the cache first and only then runs the analysis. Forward its updates as-is.
    if source != "dropdown":
        yield from _run_live_analysis(target_space_id, progress)
        return

    # --- Cache Handling ---
    # The user explicitly selected from the dropdown: fetch the report directly.
    progress(0.1, desc="Fetching cached report...")  # Simple progress for cache fetch
    yield (
        gr.update(value="Fetching selected cached report...", visible=True),
        gr.update(value="", visible=True),
        gr.update(visible=True, open=True),
        gr.update(visible=True, open=False),
    )
    try:
        cached_reports = download_cached_reports(
            target_space_id, DATASET_ID, HF_TOKEN
        )
        summary_report = (
            cached_reports.get("summary", "Error: Cached summary not found.")
            + CACHE_INFO_MSG
        )
        privacy_report = (
            cached_reports.get("privacy", "Error: Cached privacy report not found.")
            + CACHE_INFO_MSG
        )
    except Exception as e:
        error_msg = f"Failed to download cached report for selected '{target_space_id}': {e}"
        logging.error(error_msg)
        progress(1.0, desc="Error")
        yield (
            gr.update(value=error_msg, visible=True),
            gr.update(value="", visible=False),
            gr.update(visible=True, open=True),
            gr.update(visible=False),
        )
        return

    logging.info(
        f"Successfully displayed cached reports for selected '{target_space_id}'."
    )
    progress(1.0, desc="Complete (from cache)")
    yield (
        gr.update(value=summary_report, visible=True),
        gr.update(value=privacy_report, visible=True),
        gr.update(visible=True, open=True),
        gr.update(visible=True, open=True),
    )


def _progress_view(summary_text: str, privacy_text: str = ""):
    """Build the update tuple for an in-progress state: summary open, privacy pane shown but collapsed."""
    return (
        gr.update(value=summary_text, visible=True),
        gr.update(value=privacy_text, visible=True),
        gr.update(visible=True, open=True),
        gr.update(visible=True, open=False),
    )


def _failure_view(summary_text: str):
    """Build the update tuple for a blocking failure: message in the summary pane, privacy pane hidden."""
    return (
        gr.update(value=summary_text, visible=True),
        gr.update(value="", visible=False),
        gr.update(visible=True, open=True),
        gr.update(visible=False),
    )


def _results_view(summary_text: str, privacy_text: str):
    """Build the update tuple that shows both panes with both accordions open."""
    return (
        gr.update(value=summary_text, visible=True),
        gr.update(value=privacy_text, visible=True),
        gr.update(visible=True, open=True),
        gr.update(visible=True, open=True),
    )


def _summary_only_view(summary_text: str):
    """Build the update tuple that changes only the summary text and leaves the other outputs untouched."""
    return (
        gr.update(value=summary_text, visible=True),
        gr.update(),
        gr.update(),
        gr.update(),
    )


def _run_live_analysis(space_id: str, progress=gr.Progress(track_tqdm=True)):
    """
    Performs the full analysis pipeline: cache check, code fetch, LLM calls, upload.

    Steps, in order:
      1. Check whether a report for `space_id` already exists in the dataset
         cache (only when HF_TOKEN is set); on a hit, download and show it.
      2. Check the inference endpoint status (only when HF_TOKEN is set); stop
         with a 503-style message unless it is 'running', asking a
         'scaledToZero' endpoint to resume first.
      3. Fetch the Space's code files; stop if none are returned.
      4. LLM call 1: detailed privacy report.
      5. Status message only (no work is performed in this step).
      6. LLM call 2: summary and highlights, built from the code and the
         detailed report.
      7. Upload both reports to the dataset cache (best effort).
      8. Yield the final state.

    Args:
        space_id: Space identifier in 'owner/name' form (validated by the caller).
        progress: Gradio progress tracker used to report step completion.

    Yields:
        4-tuples of Gradio updates, in the order
        (summary markdown, privacy markdown, summary accordion, privacy accordion).
    """
    steps = 8  # Steps for the full pipeline

    # --- Step 1: Check Cache --- (Check again for new/matched input)
    progress(1 / steps, desc="Step 1/8: Checking cache...")
    logging.info(f"Step 1/8: Checking cache for '{space_id}'...")
    yield _progress_view("Checking cache for existing reports...")
    found_in_cache = False
    if HF_TOKEN:
        try:
            found_in_cache = check_report_exists(space_id, DATASET_ID, HF_TOKEN)
        except Exception as e:
            logging.warning(f"Cache check failed: {e}. Proceeding.")
            yield _progress_view(
                "Cache check failed, proceeding with live analysis..."
            )

    if found_in_cache:
        logging.info(f"Cache hit for {space_id}. Downloading.")
        progress(2 / steps, desc="Step 2/8: Cache hit! Downloading reports...")
        yield _progress_view("Cache hit! Downloading reports...")
        try:
            cached_reports = download_cached_reports(space_id, DATASET_ID, HF_TOKEN)
            summary_report = (
                cached_reports.get("summary", "Error: Cached summary not found.")
                + CACHE_INFO_MSG
            )
            privacy_report = (
                cached_reports.get("privacy", "Error: Cached privacy report not found.")
                + CACHE_INFO_MSG
            )
        except Exception as e:
            logging.warning(f"Cache download failed for {space_id}: {e}. Proceeding.")
            # The cached copy could not be read, so the results below are
            # generated live. Clear the flag so they are uploaded in step 7
            # instead of being skipped as "loaded from cache".
            found_in_cache = False
            yield _progress_view(
                "Cache download failed, proceeding with live analysis..."
            )
        else:
            logging.info(f"Successfully displayed cached reports for {space_id}.")
            progress(8 / steps, desc="Complete (from cache)")
            yield _results_view(summary_report, privacy_report)
            return  # End generation here if cache successful
    else:
        logging.info(f"Cache miss for {space_id}. Performing live analysis.")
        yield _progress_view("Cache miss. Fetching code...")

    # --- Step 2: Check Endpoint Status ---
    progress(2 / steps, desc="Step 2/8: Checking endpoint status...")
    logging.info("Step 2/8: Checking endpoint status...")
    yield _progress_view("Checking whether model endpoint is active...")

    # Without a token the status cannot be queried; the pipeline proceeds and
    # relies on the 503 checks after each LLM call instead.
    if HF_TOKEN:
        try:
            api = HfApi(token=HF_TOKEN)
            endpoint = api.get_inference_endpoint(name=ENDPOINT_NAME)
            status = endpoint.status

            logging.info(f"Endpoint '{ENDPOINT_NAME}' status: {status}")

            if status != 'running':
                logging.warning(f"Endpoint '{ENDPOINT_NAME}' is not ready (Status: {status}).")
                if status == 'scaledToZero':
                    # Wake the endpoint so that a retry a few minutes later can succeed.
                    logging.info(f"Endpoint '{ENDPOINT_NAME}' is scaled to zero. Attempting to resume...")
                    endpoint.resume()
                msg_503 = f"**Full Service Temporarily Unavailable**: but you can **browse existing reports** or **check back later!**\n\n The status of the Qwen2.5-Coder-32B-Instruct endpoint powering the analysis is currently: <span style='color:red'>**{status}**</span>\n\n" + ERROR_503_USER_MESSAGE
                yield _failure_view(msg_503)
                return  # Stop analysis, user needs to retry
        except Exception as e:
            logging.error(f"Error checking endpoint status for {ENDPOINT_NAME}: {e}")
            yield _failure_view(f"Error checking analysis endpoint status: {e}")
            return  # Stop analysis

    # --- Step 3: Fetch Code Files (if not cached) ---
    progress(3 / steps, desc="Step 3/8: Fetching code files...")
    logging.info("Step 3/8: Fetching code files...")
    code_files = get_space_code_files(space_id)
    if not code_files:
        error_msg = f"Could not retrieve code files for '{space_id}'. Check ID and ensure it's a public Space."
        logging.warning(error_msg)
        yield _progress_view(f"**Error:**\n{error_msg}", "Analysis Canceled")
        return  # End generation on error

    # --- Step 4: Generate DETAILED Privacy Report (LLM Call 1) ---
    progress(
        4 / steps, desc="Step 4/8: Generating detailed privacy report (AI Call 1)..."
    )
    logging.info("Step 4/8: Generating detailed privacy analysis report...")
    yield _results_view(
        "Generating detailed privacy report...",
        "Generating detailed privacy report via AI...",
    )
    privacy_prompt_messages, privacy_truncated = format_privacy_prompt(
        space_id, code_files
    )

    # --- Check for 503 after query ---
    privacy_api_response = query_qwen_endpoint(privacy_prompt_messages, max_tokens=3072)
    if privacy_api_response == ERROR_503_DICT:
        logging.warning("LLM Call 1 failed with 503.")
        # Show 503 message in summary area, clear and hide the privacy area
        yield _failure_view(ERROR_503_USER_MESSAGE)
        return  # Stop analysis

    detailed_privacy_report = parse_qwen_response(privacy_api_response)

    # A parsed response containing "Error:" is treated as a failed call.
    if "Error:" in detailed_privacy_report:
        logging.error(
            f"Failed to generate detailed privacy report: {detailed_privacy_report}"
        )
        yield _results_view(
            "Analysis Halted due to Error",
            f"**Error Generating Detailed Privacy Report:**\n{detailed_privacy_report}",
        )
        return  # End generation on error
    if privacy_truncated:
        detailed_privacy_report = TRUNCATION_WARNING + detailed_privacy_report

    yield _results_view("Extracting model info...", detailed_privacy_report)

    # --- Step 5: Fetch Model Descriptions ---
    # Only a status message is emitted here; no fetching happens in this step.
    progress(5 / steps, desc="Step 5/8: Fetching model descriptions...")
    logging.info("Step 5/8: Fetching model descriptions...")
    yield _summary_only_view("Fetching model descriptions...")

    # --- Step 6: Generate Summary + Highlights Report (LLM Call 2) ---
    progress(6 / steps, desc="Step 6/8: Generating summary & highlights (AI Call 2)...")
    logging.info("Step 6/8: Generating summary and highlights report...")
    yield _summary_only_view("Generating summary & highlights via AI...")
    summary_highlights_prompt_messages, summary_truncated = (
        format_summary_highlights_prompt(space_id, code_files, detailed_privacy_report)
    )

    # --- Check for 503 after query ---
    summary_highlights_api_response = query_qwen_endpoint(
        summary_highlights_prompt_messages, max_tokens=2048
    )
    if summary_highlights_api_response == ERROR_503_DICT:
        logging.warning("LLM Call 2 failed with 503.")
        # Show 503 message in summary area, keep the detailed report visible
        yield _results_view(ERROR_503_USER_MESSAGE, detailed_privacy_report)
        return  # Stop analysis

    summary_highlights_report = parse_qwen_response(summary_highlights_api_response)

    if "Error:" in summary_highlights_report:
        logging.error(
            f"Failed to generate summary/highlights report: {summary_highlights_report}"
        )
        yield _results_view(
            f"**Error Generating Summary/Highlights:**\n{summary_highlights_report}",
            detailed_privacy_report,
        )
        return  # End generation on error
    if summary_truncated:
        summary_highlights_report = TRUNCATION_WARNING + summary_highlights_report

    # Yield summary report before attempting upload
    yield _results_view(summary_highlights_report, detailed_privacy_report)

    # --- Step 7: Upload to Cache ---
    # Both reports passed their "Error:" checks above, so only the token gates
    # the upload. Upload failures are logged and never fail the request.
    progress(7 / steps, desc="Step 7/8: Uploading results to cache...")
    logging.info("Step 7/8: Attempting to upload results to dataset cache...")
    try:
        if HF_TOKEN:
            # Store the bare reports: strip the display-only banners first.
            summary_to_save = summary_highlights_report.replace(
                TRUNCATION_WARNING, ""
            ).replace(CACHE_INFO_MSG, "")
            privacy_to_save = detailed_privacy_report.replace(
                TRUNCATION_WARNING, ""
            ).replace(CACHE_INFO_MSG, "")
            upload_reports_to_dataset(
                space_id=space_id,
                summary_report=summary_to_save,
                detailed_report=privacy_to_save,
                dataset_id=DATASET_ID,
                hf_token=HF_TOKEN,
            )
        else:
            logging.warning("Skipping cache upload as HF_TOKEN is not set.")
    except Exception as e:
        logging.error(f"Non-critical error during report upload: {e}")

    logging.info("Step 8/8: Analysis complete.")
    progress(8 / steps, desc="Step 8/8: Analysis Complete!")

    # --- Step 8: Yield Final Results --- (Ensure final state is correct)
    yield _results_view(summary_highlights_report, detailed_privacy_report)


# --- Load Initial Data Function (for demo.load) ---
def load_cached_list():
    """
    Fetch the list of cached spaces and pick the dropdown's default selection.

    Returns:
        A Gradio update for the dropdown whose `choices` are the cached Space
        IDs and whose `value` is DEFAULT_SELECTION when that ID is among them,
        otherwise None.
    """
    logging.info("Running demo.load: Fetching list of cached spaces...")
    # Use os.getenv here directly as HF_TOKEN might be loaded after initial import
    token = os.getenv("HF_TOKEN")
    # Normalise a falsy result (e.g. None on failure) to an empty list so the
    # membership test below cannot raise.
    cached_list = list_cached_spaces(DATASET_ID, token) or []
    if not cached_list:
        logging.warning(
            "No cached spaces found or failed to fetch list during demo.load."
        )
    default_value = DEFAULT_SELECTION if DEFAULT_SELECTION in cached_list else None
    # Return an update object for the dropdown using gr.update()
    return gr.update(choices=cached_list, value=default_value)


# --- Gradio Interface Definition ---
# Use HTML/CSS for centering the title
# TITLE is raw HTML rendered through gr.Markdown at the top of the page.
TITLE = "<div style='text-align: center;'><h1>🤗 Space Privacy Analyzer 🕵️</h1></div>\n<div style='text-align: center;'><h4>Automatic code Data transfer review powered by <a href='https://huggingface.co./Qwen/Qwen2.5-Coder-32B-Instruct' target='_blank'>Qwen2.5-Coder-32B-Instruct</a></h4></div>"

# DESCRIPTION is the Markdown body of the collapsible description accordion in
# the left column: what the demo does, how to use it, and its limitations.
DESCRIPTION = """
### Check the Privacy of a Hugging Face Space

[Hugging Face 🤗 Spaces](https://huggingface.co./spaces) offer a convenient way to build and share code demos online.
In most cases, the code for these demos is open source &mdash; which provides a unique opportunity to **examine how privacy is managed** in the demo.

This demo leverages a code analysis model ([Qwen2.5-Coder-32B-Instruct](https://huggingface.co./Qwen/Qwen2.5-Coder-32B-Instruct)) to help explore privacy questions in two steps:
1. Obtain and **parse the code** of a Space to identify data inputs, AI model use, API calls, and data transfer behavior.
2. Generate a summary of the Space's function and highlight **key privacy points**.

Use the dropdown menu below to explore the [reports generated for some popular Spaces](https://huggingface.co./datasets/yjernite/spaces-privacy-reports/tree/main), or enter a new Space ID to query your own 👇

*Please note the following limitations:*
- *The model may miss important details in the code, especially when it leverages Docker files or external libraries.*
- *This app uses the base Qwen Coder model without specific adaptation to the task. We'd love to discuss how to improve this, if you want to participate [feel free to open a discussion!](https://huggingface.co./spaces/yjernite/space-privacy/discussions)*
"""

# Two-column layout: inputs (description, cached-report dropdown, Space ID
# textbox, button) on the left; the two report accordions on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(TITLE)  # This will now render the centered HTML

    with gr.Row():
        with gr.Column(scale=1):  # Left column for inputs
            description_accordion = gr.Accordion(
                "What Privacy Questions do 🤗 Spaces Raise? Demo Description 👇", open=False, visible=True
            )
            with description_accordion:
                gr.Markdown(DESCRIPTION)

            cached_spaces_dropdown = gr.Dropdown(
                label="Select Existing Report",
                info="Select a Space whose report has been previously generated.",
                choices=[],  # Initialize empty, will be populated by demo.load
                value=None,  # Initialize empty
            )

            space_id_input = gr.Textbox(
                label="Or Enter New Space ID",
                placeholder="owner/space-name",
                info="Enter a new Space ID to analyze (takes precedence over selection).",
            )

            analyze_button = gr.Button("Get Space Report", variant="primary", scale=1)

        with gr.Column(scale=1):  # Right column for outputs
            # Define both accordions first so the summary is placed above the
            # detailed report. Both are visible from the start; the summary is
            # open and the detailed report is collapsed.
            summary_accordion = gr.Accordion(
                "Summary & Privacy Highlights", open=True, visible=True
            )
            privacy_accordion = gr.Accordion(
                "Detailed Privacy Analysis Report", open=False, visible=True
            )
            with summary_accordion:
                summary_markdown = gr.Markdown(
                    'Enter or select a Space ID and click "Get Space Report".',
                    show_copy_button=True,
                )
            with privacy_accordion:
                privacy_markdown = gr.Markdown(
                    "Detailed report will appear here.", show_copy_button=True
                )

    # --- Event Listeners ---

    # Load event to populate the dropdown when the UI loads for a user session
    demo.load(fn=load_cached_list, inputs=None, outputs=cached_spaces_dropdown)

    # Button click event. The output order below must match the order of the
    # 4-tuples yielded by get_space_report_wrapper.
    analyze_button.click(
        fn=get_space_report_wrapper,
        inputs=[cached_spaces_dropdown, space_id_input],
        outputs=[
            summary_markdown,
            privacy_markdown,
            summary_accordion,
            privacy_accordion,
        ],
        show_progress="full",
    )

# --- Application Entry Point ---
# Launch the Gradio server only when this file is executed as a script, so that
# importing the module does not start it.

if __name__ == "__main__":
    logging.info("Starting Gradio application...")
    demo.launch()