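"""Gradio app for the space-privacy Space.

Generates privacy reports for public Hugging Face Spaces in two LLM passes
(a detailed privacy analysis, then a summary with highlights) using a
Qwen2.5-Coder-32B-Instruct inference endpoint, with results cached in a
Hugging Face dataset.
"""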
import logging
import os
import gradio as gr
from dotenv import load_dotenv
from huggingface_hub import HfApi
from llm_interface import ERROR_503_DICT, parse_qwen_response, query_qwen_endpoint
from prompts import format_privacy_prompt, format_summary_highlights_prompt
# Helper functions from other modules
from utils import (
    check_report_exists,
    download_cached_reports,
    get_space_code_files,
    list_cached_spaces,
    upload_reports_to_dataset,
)
# Configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
# Load environment variables from .env file
# This is important to ensure API keys and endpoints are loaded before use
load_dotenv()
# --- Constants ---
HF_TOKEN = os.getenv("HF_TOKEN")
ENDPOINT_NAME = "qwen2-5-coder-32b-instruct-pmf"
DATASET_ID = "yjernite/spaces-privacy-reports"
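# Appended to reports served from the dataset cache; stripped back out before
# fresh reports are uploaded (see the upload step in _run_live_analysis).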
CACHE_INFO_MSG = "\n\n*(Report retrieved from cache)*"
DEFAULT_SELECTION = "HuggingFaceTB/SmolVLM2"
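# Prepended to a report when prompt formatting had to truncate the model input
# (tracked by the privacy_truncated / summary_truncated flags below).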
TRUNCATION_WARNING = """**⚠️ Warning:** The input data (code and/or prior analysis) was too long for the AI model's context limit and had to be truncated. The analysis below may be incomplete or based on partial information.\n\n---\n\n"""
ERROR_503_USER_MESSAGE = """It appears that the analysis model endpoint is currently down or starting up.
You have a few options:
* **Wait & Retry:** Try clicking "Get Space Report" again in ~3-5 minutes. Endpoints often scale down to save resources and take a short time to wake up.
* **Select Cached Report:** Use the dropdown above to view a report for a Space that has already been analyzed.
* **Request Analysis:** If the error persists, please [open an issue or discussion](https://huggingface.co./spaces/yjernite/space-privacy/discussions) in the Space's Community tab requesting analysis for your target Space ID. We can run the job manually when the endpoint is available.
"""
def get_space_report_wrapper(
selected_cached_space: str | None,
new_space_id: str | None,
progress=gr.Progress(track_tqdm=True),
):
"""
Wrapper function to decide whether to fetch cache or run live analysis.
Handles the logic based on Dropdown and Textbox inputs.
Yields tuples of Gradio updates.
"""
target_space_id = None
source = "new" # Assume new input unless dropdown is chosen
# Prioritize new_space_id if provided
if new_space_id and new_space_id.strip():
target_space_id = new_space_id.strip()
if target_space_id == selected_cached_space:
source = "dropdown_match" # User typed ID that exists in dropdown
else:
source = "new"
elif selected_cached_space:
target_space_id = selected_cached_space
source = "dropdown"
    if not target_space_id:
        # No input provided. This function is a generator (it contains
        # `yield`), so early exits must yield the error state and then
        # return: a `return value` inside a generator only sets
        # StopIteration.value, which Gradio never displays.
        yield (
            gr.update(
                value="Please select an existing report or enter a new Space ID.",
                visible=True,
            ),
            gr.update(value="", visible=False),
            gr.update(visible=True, open=True),
            gr.update(visible=False),
        )
        return
    # Validate format
    if "/" not in target_space_id:
        yield (
            gr.update(
                value=f"Invalid Space ID format: '{target_space_id}'. Use 'owner/name'.",
                visible=True,
            ),
            gr.update(value="", visible=False),
            gr.update(visible=True, open=True),
            gr.update(visible=False),
        )
        return
logging.info(f"Request received for: '{target_space_id}' (Source: {source})")
# --- Cache Handling ---
# If the user explicitly selected from the dropdown, try to fetch it directly.
if source == "dropdown":
progress(
0.1, desc="Fetching cached report..."
) # Simple progress for cache fetch
yield (
gr.update(value="Fetching selected cached report...", visible=True),
gr.update(value="", visible=True),
gr.update(visible=True, open=True),
gr.update(visible=True, open=False),
)
try:
cached_reports = download_cached_reports(
target_space_id, DATASET_ID, HF_TOKEN
)
summary_report = (
cached_reports.get("summary", "Error: Cached summary not found.")
+ CACHE_INFO_MSG
)
privacy_report = (
cached_reports.get("privacy", "Error: Cached privacy report not found.")
+ CACHE_INFO_MSG
)
logging.info(
f"Successfully displayed cached reports for selected '{target_space_id}'."
)
progress(1.0, desc="Complete (from cache)")
yield (
gr.update(value=summary_report, visible=True),
gr.update(value=privacy_report, visible=True),
gr.update(visible=True, open=True),
gr.update(visible=True, open=True),
)
except Exception as e:
error_msg = f"Failed to download cached report for selected '{target_space_id}': {e}"
logging.error(error_msg)
progress(1.0, desc="Error")
yield (
gr.update(value=error_msg, visible=True),
gr.update(value="", visible=False),
gr.update(visible=True, open=True),
gr.update(visible=False),
)
# --- Live Analysis or Check Cache for New Input ---
# If it came from the textbox OR was a dropdown match, we first check cache, then run live.
else: # source == "new" or source == "dropdown_match"
        # Delegate to the live-analysis generator and forward each of its
        # intermediate UI updates.
        yield from _run_live_analysis(target_space_id, progress)
def _run_live_analysis(space_id: str, progress=gr.Progress(track_tqdm=True)):
"""
Performs the full analysis pipeline: cache check, code fetch, LLM calls, upload.
Yields tuples of Gradio updates.
(This contains the logic previously in analyze_space_privacy, minus initial input handling)
"""
steps = 8 # Steps for the full pipeline
privacy_truncated = False
summary_truncated = False
# --- Step 1: Check Cache --- (Check again for new/matched input)
progress(1 / steps, desc="Step 1/8: Checking cache...")
logging.info(f"Step 1/8: Checking cache for '{space_id}'...")
yield (
gr.update(value="Checking cache for existing reports...", visible=True),
gr.update(value="", visible=True),
gr.update(visible=True, open=True),
gr.update(visible=True, open=False),
)
found_in_cache = False
if HF_TOKEN:
try:
found_in_cache = check_report_exists(space_id, DATASET_ID, HF_TOKEN)
except Exception as e:
logging.warning(f"Cache check failed: {e}. Proceeding.")
yield (
gr.update(
value="Cache check failed, proceeding with live analysis...",
visible=True,
),
gr.update(value="", visible=True),
gr.update(visible=True, open=True),
gr.update(visible=True, open=False),
)
if found_in_cache:
logging.info(f"Cache hit for {space_id}. Downloading.")
progress(2 / steps, desc="Step 2/8: Cache hit! Downloading reports...")
yield (
gr.update(value="Cache hit! Downloading reports...", visible=True),
gr.update(value="", visible=True),
gr.update(visible=True, open=True),
gr.update(visible=True, open=False),
)
try:
cached_reports = download_cached_reports(space_id, DATASET_ID, HF_TOKEN)
summary_report = (
cached_reports.get("summary", "Error: Cached summary not found.")
+ CACHE_INFO_MSG
)
privacy_report = (
cached_reports.get("privacy", "Error: Cached privacy report not found.")
+ CACHE_INFO_MSG
)
logging.info(f"Successfully displayed cached reports for {space_id}.")
progress(8 / steps, desc="Complete (from cache)")
yield (
gr.update(value=summary_report, visible=True),
gr.update(value=privacy_report, visible=True),
gr.update(visible=True, open=True),
gr.update(visible=True, open=True),
)
return # End generation here if cache successful
except Exception as e:
logging.warning(f"Cache download failed for {space_id}: {e}. Proceeding.")
yield (
gr.update(
value="Cache download failed, proceeding with live analysis...",
visible=True,
),
gr.update(value="", visible=True),
gr.update(visible=True, open=True),
gr.update(visible=True, open=False),
)
else:
logging.info(f"Cache miss for {space_id}. Performing live analysis.")
yield (
gr.update(value="Cache miss. Fetching code...", visible=True),
gr.update(value="", visible=True),
gr.update(visible=True, open=True),
gr.update(visible=True, open=False),
)
# --- Step 2: Check Endpoint Status ---
progress(2 / steps, desc="Step 2/8: Checking endpoint status...")
logging.info("Step 2/8: Checking endpoint status...")
yield (
gr.update(value="Checking whether model endpoint is active...", visible=True),
gr.update(value="", visible=True),
gr.update(visible=True, open=True),
gr.update(visible=True, open=False),
)
if HF_TOKEN:
try:
api = HfApi(token=HF_TOKEN)
endpoint = api.get_inference_endpoint(name=ENDPOINT_NAME)
status = endpoint.status
logging.info(f"Endpoint '{ENDPOINT_NAME}' status: {status}")
            if status != "running":
                logging.warning(
                    f"Endpoint '{ENDPOINT_NAME}' is not ready (status: {status})."
                )
                if status == "scaledToZero":
                    logging.info(
                        f"Endpoint '{ENDPOINT_NAME}' is scaled to zero. Attempting to resume..."
                    )
                    # resume() only requests a restart; the endpoint takes a
                    # few minutes to come up, so we still show the 503 message
                    # and ask the user to retry.
                    endpoint.resume()
                msg_503 = (
                    "**Full Service Temporarily Unavailable**, but you can "
                    "**browse existing reports** or **check back later!**\n\n"
                    "The status of the Qwen2.5-Coder-32B-Instruct endpoint powering "
                    f"the analysis is currently: <span style='color:red'>**{status}**</span>\n\n"
                    + ERROR_503_USER_MESSAGE
                )
                yield (
                    gr.update(value=msg_503, visible=True),
                    gr.update(value="", visible=False),
                    gr.update(visible=True, open=True),
                    gr.update(visible=False),
                )
                return  # Stop analysis; the user needs to retry later
except Exception as e:
logging.error(f"Error checking endpoint status for {ENDPOINT_NAME}: {e}")
yield (
gr.update(value=f"Error checking analysis endpoint status: {e}", visible=True),
gr.update(value="", visible=False),
gr.update(visible=True, open=True),
gr.update(visible=False)
)
return # Stop analysis
# --- Step 3: Fetch Code Files (if not cached) ---
progress(3 / steps, desc="Step 3/8: Fetching code files...")
logging.info("Step 3/8: Fetching code files...")
code_files = get_space_code_files(space_id)
if not code_files:
error_msg = f"Could not retrieve code files for '{space_id}'. Check ID and ensure it's a public Space."
logging.warning(error_msg)
yield (
gr.update(value=f"**Error:**\n{error_msg}", visible=True),
gr.update(value="Analysis Canceled", visible=True),
gr.update(visible=True, open=True),
gr.update(visible=True, open=False),
)
return # End generation on error
# --- Step 4: Generate DETAILED Privacy Report (LLM Call 1) ---
progress(
4 / steps, desc="Step 4/8: Generating detailed privacy report (AI Call 1)..."
)
logging.info("Step 4/8: Generating detailed privacy analysis report...")
yield (
gr.update(value="Generating detailed privacy report...", visible=True),
gr.update(value="Generating detailed privacy report via AI...", visible=True),
gr.update(visible=True, open=True),
gr.update(visible=True, open=True),
)
privacy_prompt_messages, privacy_truncated = format_privacy_prompt(
space_id, code_files
)
    # Query the endpoint, then check for the 503 sentinel in the response
privacy_api_response = query_qwen_endpoint(privacy_prompt_messages, max_tokens=3072)
if privacy_api_response == ERROR_503_DICT:
logging.warning("LLM Call 1 failed with 503.")
yield (
gr.update(
value=ERROR_503_USER_MESSAGE, visible=True
), # Show 503 message in summary area
gr.update(value="", visible=False), # Clear privacy area
gr.update(visible=True, open=True), # Keep summary open
gr.update(visible=False), # Hide privacy accordion
)
return # Stop analysis
detailed_privacy_report = parse_qwen_response(privacy_api_response)
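    # parse_qwen_response signals failures with an "Error:" marker in its
    # output string, so a substring check is used as the failure signal.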
if "Error:" in detailed_privacy_report:
logging.error(
f"Failed to generate detailed privacy report: {detailed_privacy_report}"
)
yield (
gr.update(value="Analysis Halted due to Error", visible=True),
gr.update(
value=f"**Error Generating Detailed Privacy Report:**\n{detailed_privacy_report}",
visible=True,
),
gr.update(visible=True, open=True),
gr.update(visible=True, open=True),
)
return # End generation on error
if privacy_truncated:
detailed_privacy_report = TRUNCATION_WARNING + detailed_privacy_report
yield (
gr.update(value="Extracting model info...", visible=True),
gr.update(value=detailed_privacy_report, visible=True),
gr.update(visible=True, open=True),
gr.update(visible=True, open=True),
)
# --- Step 5: Fetch Model Descriptions ---
progress(5 / steps, desc="Step 5/8: Fetching model descriptions...")
logging.info("Step 5/8: Fetching model descriptions...")
yield (
gr.update(value="Fetching model descriptions...", visible=True),
gr.update(),
gr.update(),
gr.update(),
)
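    # NOTE: no fetch actually happens in this step; the yield above only
    # updates the UI status. Any model-description lookup presumably occurs
    # inside format_summary_highlights_prompt (an assumption from this file).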
# --- Step 6: Generate Summary + Highlights Report (LLM Call 2) ---
progress(6 / steps, desc="Step 6/8: Generating summary & highlights (AI Call 2)...")
logging.info("Step 6/8: Generating summary and highlights report...")
yield (
gr.update(value="Generating summary & highlights via AI...", visible=True),
gr.update(),
gr.update(),
gr.update(),
)
summary_highlights_prompt_messages, summary_truncated = (
format_summary_highlights_prompt(space_id, code_files, detailed_privacy_report)
)
    # Query the endpoint, then check for the 503 sentinel in the response
summary_highlights_api_response = query_qwen_endpoint(
summary_highlights_prompt_messages, max_tokens=2048
)
if summary_highlights_api_response == ERROR_503_DICT:
logging.warning("LLM Call 2 failed with 503.")
yield (
gr.update(
value=ERROR_503_USER_MESSAGE, visible=True
), # Show 503 message in summary area
gr.update(
value=detailed_privacy_report, visible=True
), # Keep previous report visible
gr.update(visible=True, open=True), # Keep summary open
gr.update(visible=True, open=True), # Keep privacy open
)
return # Stop analysis
summary_highlights_report = parse_qwen_response(summary_highlights_api_response)
if "Error:" in summary_highlights_report:
logging.error(
f"Failed to generate summary/highlights report: {summary_highlights_report}"
)
yield (
gr.update(
value=f"**Error Generating Summary/Highlights:**\n{summary_highlights_report}",
visible=True,
),
gr.update(value=detailed_privacy_report, visible=True),
gr.update(visible=True, open=True),
gr.update(visible=True, open=True),
)
return # End generation on error
if summary_truncated:
summary_highlights_report = TRUNCATION_WARNING + summary_highlights_report
# Yield summary report before attempting upload
yield (
gr.update(value=summary_highlights_report, visible=True),
gr.update(value=detailed_privacy_report, visible=True),
gr.update(visible=True, open=True),
gr.update(visible=True, open=True),
)
# --- Step 7: Upload to Cache ---
progress(7 / steps, desc="Step 7/8: Uploading results to cache...")
logging.info("Step 7/8: Attempting to upload results to dataset cache...")
try:
if (
HF_TOKEN
and not found_in_cache
and "Error:" not in detailed_privacy_report
and "Error:" not in summary_highlights_report
):
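            # Strip the UI-only markers (truncation warning, cache note) so
            # the cached report files stay clean.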
summary_to_save = summary_highlights_report.replace(
TRUNCATION_WARNING, ""
).replace(CACHE_INFO_MSG, "")
privacy_to_save = detailed_privacy_report.replace(
TRUNCATION_WARNING, ""
).replace(CACHE_INFO_MSG, "")
upload_reports_to_dataset(
space_id=space_id,
summary_report=summary_to_save,
detailed_report=privacy_to_save,
dataset_id=DATASET_ID,
hf_token=HF_TOKEN,
)
elif not HF_TOKEN:
logging.warning("Skipping cache upload as HF_TOKEN is not set.")
elif found_in_cache:
logging.info("Skipping cache upload as results were loaded from cache.")
except Exception as e:
logging.error(f"Non-critical error during report upload: {e}")
logging.info("Step 8/8: Analysis complete.")
progress(8 / steps, desc="Step 8/8: Analysis Complete!")
# --- Step 8: Yield Final Results --- (Ensure final state is correct)
yield (
gr.update(value=summary_highlights_report, visible=True),
gr.update(value=detailed_privacy_report, visible=True),
gr.update(visible=True, open=True),
gr.update(visible=True, open=True),
)
# --- Load Initial Data Function (for demo.load) ---
def load_cached_list():
"""Fetches the list of cached spaces and determines the default selection."""
print("Running demo.load: Fetching list of cached spaces...")
# Use os.getenv here directly as HF_TOKEN might be loaded after initial import
token = os.getenv("HF_TOKEN")
cached_list = list_cached_spaces(DATASET_ID, token)
default_value = DEFAULT_SELECTION if DEFAULT_SELECTION in cached_list else None
    if not cached_list:
        logging.warning(
            "No cached spaces found or failed to fetch list during demo.load."
        )
# Return an update object for the dropdown using gr.update()
return gr.update(choices=cached_list, value=default_value)
# --- Gradio Interface Definition ---
# Use HTML/CSS for centering the title
TITLE = "<div style='text-align: center;'><h1>🤗 Space Privacy Analyzer 🕵️</h1></div>\n<div style='text-align: center;'><h4>Automatic code and data transfer review powered by <a href='https://huggingface.co./Qwen/Qwen2.5-Coder-32B-Instruct' target='_blank'>Qwen2.5-Coder-32B-Instruct</a></h4></div>"
DESCRIPTION = """
### Check the Privacy of a Hugging Face Space
[Hugging Face 🤗 Spaces](https://huggingface.co./spaces) offer a convenient way to build and share code demos online.
In most cases, the code for these demos is open source &mdash; which provides a unique opportunity to **examine how privacy is managed** in the demo.
This demo leverages a code analysis model ([Qwen2.5-Coder-32B-Instruct](https://huggingface.co./Qwen/Qwen2.5-Coder-32B-Instruct)) to help explore privacy questions in two steps:
1. Obtain and **parse the code** of a Space to identify data inputs, AI model use, API calls, and data transfer behavior.
2. Generate a summary of the Space's function and highlight **key privacy points**.
Use the dropdown menu below to explore the [reports generated for some popular Spaces](https://huggingface.co./datasets/yjernite/spaces-privacy-reports/tree/main), or enter a new Space ID to query your own 👇
*Please note the following limitations:*
- *The model may miss important details in the code, especially when it leverages Docker files or external libraries.*
- *This app uses the base Qwen Coder model without task-specific adaptation. We'd love to discuss how to improve this; if you want to participate, [feel free to open a discussion!](https://huggingface.co./spaces/yjernite/space-privacy/discussions)*
"""
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(TITLE)  # Renders the centered HTML title
with gr.Row():
with gr.Column(scale=1): # Left column for inputs
            description_accordion = gr.Accordion(
                "What Privacy Questions do 🤗 Spaces Raise? Demo Description 👇",
                open=False,
                visible=True,
            )
with description_accordion:
gr.Markdown(DESCRIPTION)
cached_spaces_dropdown = gr.Dropdown(
label="Select Existing Report",
info="Select a Space whose report has been previously generated.",
choices=[], # Initialize empty, will be populated by demo.load
value=None, # Initialize empty
)
space_id_input = gr.Textbox(
label="Or Enter New Space ID",
placeholder="owner/space-name",
info="Enter a new Space ID to analyze (takes precedence over selection).",
)
analyze_button = gr.Button("Get Space Report", variant="primary", scale=1)
with gr.Column(scale=1): # Right column for outputs
# Define Accordions first, open by default, hidden initially
summary_accordion = gr.Accordion(
"Summary & Privacy Highlights", open=True, visible=True
)
privacy_accordion = gr.Accordion(
"Detailed Privacy Analysis Report", open=False, visible=True
)
with summary_accordion:
summary_markdown = gr.Markdown(
"Enter or select a Space ID and click Get Report.",
show_copy_button=True,
)
with privacy_accordion:
privacy_markdown = gr.Markdown(
"Detailed report will appear here.", show_copy_button=True
)
# --- Event Listeners ---
# Load event to populate the dropdown when the UI loads for a user session
demo.load(fn=load_cached_list, inputs=None, outputs=cached_spaces_dropdown)
# Button click event
analyze_button.click(
fn=get_space_report_wrapper,
inputs=[cached_spaces_dropdown, space_id_input],
outputs=[
summary_markdown,
privacy_markdown,
summary_accordion,
privacy_accordion,
],
show_progress="full",
)
# --- Application Entry Point ---
if __name__ == "__main__":
logging.info("Starting Gradio application...")
demo.launch()