import argparse
import base64
import glob
import json
import logging
import os
import re
import time
from datetime import datetime
from io import BytesIO
from typing import Dict, List, Optional, Union, Any, Tuple

import requests
from PIL import Image
from tqdm import tqdm

from llava.conversation import conv_templates


def process_image(image_path: str, target_size: int = 640) -> Image.Image:
    """Process and resize an image to match model requirements.

    Args:
        image_path: Path to the input image file
        target_size: Target size for both width and height in pixels

    Returns:
        PIL.Image: Processed and padded image with dimensions (target_size, target_size)
    """
    image = Image.open(image_path)
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Scale so the longer side fits within target_size, preserving aspect ratio.
    ratio = min(target_size / image.width, target_size / image.height)
    new_size = (int(image.width * ratio), int(image.height * ratio))

    image = image.resize(new_size, Image.LANCZOS)

    # Pad to a square canvas, centering the resized image on a black background.
    new_image = Image.new("RGB", (target_size, target_size), (0, 0, 0))
    offset = ((target_size - new_size[0]) // 2, (target_size - new_size[1]) // 2)
    new_image.paste(image, offset)

    return new_image


def validate_answer(response_text: str) -> Optional[str]:
    """Extract and validate a single-letter response from the model's output.

    Handles multiple response formats and edge cases.

    Args:
        response_text: The full text output from the model

    Returns:
        A single letter answer (A-F) or None if no valid answer found
    """
    if not response_text:
        return None

    cleaned = response_text.strip()

    # Patterns are ordered from the explicit phrasing requested in the system
    # prompt ("The SINGLE LETTER answer is: X") down to looser phrasings.
    extraction_patterns = [
        r"(?:THE\s*)?(?:SINGLE\s*)?LETTER\s*(?:ANSWER\s*)?(?:IS:?)\s*([A-F])\b",
        r"(?:correct\s+)?(?:answer|option)\s*(?:is\s*)?([A-F])\b",
        r"\b(?:answer|option)\s*([A-F])[):]\s*",
        r"(?:most\s+likely\s+)?(?:answer|option)\s*(?:is\s*)?([A-F])\b",
        r"suggest[s]?\s+(?:that\s+)?(?:the\s+)?(?:answer\s+)?(?:is\s*)?([A-F])\b",
        r"characteriz[e]?d?\s+by\s+([A-F])\b",
        r"indicat[e]?s?\s+([A-F])\b",
        r"Option\s*([A-F])\b",
        r"\b([A-F])\)\s*",
        r"^\s*([A-F])\s*$",
    ]

    for pattern in extraction_patterns:
        matches = re.findall(pattern, cleaned, re.IGNORECASE)
        for match in matches:
            if isinstance(match, tuple):
                match = match[0] if match[0] in "ABCDEF" else None
            if match and match.upper() in "ABCDEF":
                return match.upper()

    # Fallback: take the first standalone A-F letter anywhere in the response.
    context_matches = re.findall(r"\b([A-F])\b", cleaned.upper())
    context_letters = [m for m in context_matches if m in "ABCDEF"]
    if context_letters:
        return context_letters[0]

    return None


def load_benchmark_questions(case_id: str) -> List[str]:
    """Find all question files for a given case ID.

    Args:
        case_id: The ID of the medical case

    Returns:
        List of paths to question JSON files
    """
    benchmark_dir = "MedMAX/benchmark/questions"
    return glob.glob(f"{benchmark_dir}/{case_id}/{case_id}_*.json")


def count_total_questions() -> Tuple[int, int]:
    """Count total number of cases and questions in benchmark.

    Returns:
        Tuple containing (total_cases, total_questions)
    """
    total_cases = len(glob.glob("MedMAX/benchmark/questions/*"))
    total_questions = sum(
        len(glob.glob(f"MedMAX/benchmark/questions/{case_id}/*.json"))
        for case_id in os.listdir("MedMAX/benchmark/questions")
    )
    return total_cases, total_questions


def create_inference_request(
    question_data: Dict[str, Any],
    case_details: Dict[str, Any],
    case_id: str,
    question_id: str,
    worker_addr: str,
    model_name: str,
    raw_output: bool = False,
) -> Union[Tuple[Optional[str], Optional[float]], Dict[str, Any]]:
    """Create and send an inference request to the worker.

    Args:
        question_data: Dictionary containing question details and figures
        case_details: Dictionary containing case information and figures
        case_id: Identifier for the medical case
        question_id: Identifier for the specific question
        worker_addr: Address of the worker endpoint
        model_name: Name of the model to use
        raw_output: Whether to return raw model output

    Returns:
        If raw_output is False: Tuple of (validated_answer, duration)
        If raw_output is True: Dictionary with full inference details
        Returns ("skipped", 0.0) when no local images can be found for the question
    """
    system_prompt = """You are a medical imaging expert. Your answer MUST be a SINGLE LETTER (A/B/C/D/E/F), provided in this format: 'The SINGLE LETTER answer is: X'.
"""

    prompt = f"""Given the following medical case:
Please answer this multiple choice question:
{question_data['question']}
Base your answer only on the provided images and case information. Respond with your SINGLE LETTER answer: """
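
    # The "figures" field is inconsistent across question files: it may be a
    # JSON-encoded list, a plain string, or an actual list, so normalize it
    # into a list of "Figure N" / "Figure NA" style references first.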
    try:
        if isinstance(question_data["figures"], str):
            try:
                required_figures = json.loads(question_data["figures"])
            except json.JSONDecodeError:
                required_figures = [question_data["figures"]]
        elif isinstance(question_data["figures"], list):
            required_figures = question_data["figures"]
        else:
            required_figures = [str(question_data["figures"])]
    except Exception as e:
        print(f"Error parsing figures: {e}")
        required_figures = []

    required_figures = [
        fig if fig.startswith("Figure ") else f"Figure {fig}" for fig in required_figures
    ]
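
    # Resolve each figure reference (e.g. "Figure 2" or "Figure 2A") to the
    # matching subfigure entries in the case metadata and collect their local
    # image paths under MedMAX/data/.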
    image_paths = []
    for figure in required_figures:
        base_figure_num = "".join(filter(str.isdigit, figure))
        figure_letter = "".join(filter(str.isalpha, figure.split()[-1])) or None

        matching_figures = [
            case_figure
            for case_figure in case_details.get("figures", [])
            if case_figure["number"] == f"Figure {base_figure_num}"
        ]

        for case_figure in matching_figures:
            if figure_letter:
                subfigures = [
                    subfig
                    for subfig in case_figure.get("subfigures", [])
                    if subfig.get("number", "").lower().endswith(figure_letter.lower())
                    or subfig.get("label", "").lower() == figure_letter.lower()
                ]
            else:
                subfigures = case_figure.get("subfigures", [])

            for subfig in subfigures:
                if "local_path" in subfig:
                    image_paths.append("MedMAX/data/" + subfig["local_path"])

    if not image_paths:
        print(f"No local images found for case {case_id}, question {question_id}")
        return "skipped", 0.0
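
    # Build the LLaVA-Med conversation: preprocess the images, append the
    # question (with an <image> token) as the user turn, and pack the rendered
    # prompt plus the conversation's images into the worker payload. Note that
    # only the first processed image is attached to the message.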
    try:
        start_time = time.time()

        processed_images = [process_image(path) for path in image_paths]

        conv = conv_templates["mistral_instruct"].copy()

        if "<image>" not in prompt:
            text = prompt + "\n<image>"
        else:
            text = prompt

        message = (text, processed_images[0], "Default")
        conv.append_message(conv.roles[0], message)
        conv.append_message(conv.roles[1], None)

        prompt = conv.get_prompt()
        headers = {"User-Agent": "LLaVA-Med Client"}
        pload = {
            "model": model_name,
            "prompt": prompt,
            "max_new_tokens": 150,
            "temperature": 0.5,
            "stop": conv.sep2,
            "images": conv.get_images(),
            "top_p": 1,
            "frequency_penalty": 0.0,
            "presence_penalty": 0.0,
        }
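
        # Query the worker with simple retry handling: up to max_retries
        # attempts, sleeping retry_delay seconds between them. The worker
        # streams NUL-delimited JSON chunks; the text after the final
        # "[/INST]" marker of the last successful chunk is kept as the answer.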
        max_retries = 3
        retry_delay = 5
        response_text = None

        for attempt in range(max_retries):
            try:
                response = requests.post(
                    worker_addr + "/worker_generate_stream",
                    headers=headers,
                    json=pload,
                    stream=True,
                    timeout=30,
                )

                complete_output = ""
                for chunk in response.iter_lines(
                    chunk_size=8192, decode_unicode=False, delimiter=b"\0"
                ):
                    if chunk:
                        data = json.loads(chunk.decode("utf-8"))
                        if data["error_code"] == 0:
                            output = data["text"].split("[/INST]")[-1]
                            complete_output = output
                        else:
                            print(f"\nError: {data['text']} (error_code: {data['error_code']})")
                            if attempt < max_retries - 1:
                                time.sleep(retry_delay)
                                break
                            return None, None

                if complete_output:
                    response_text = complete_output
                    break

            except (requests.exceptions.RequestException, json.JSONDecodeError) as e:
                if attempt < max_retries - 1:
                    print(f"\nNetwork error: {str(e)}. Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                else:
                    print(f"\nFailed after {max_retries} attempts: {str(e)}")
                    return None, None

        duration = time.time() - start_time

        if raw_output:
            inference_details = {
                "raw_output": response_text,
                "validated_answer": validate_answer(response_text),
                "duration": duration,
                "prompt": prompt,
                "system_prompt": system_prompt,
                "image_paths": image_paths,
                "payload": pload,
            }
            return inference_details

        return validate_answer(response_text), duration

    except Exception as e:
        print(f"Error in inference request: {str(e)}")
        return None, None


def clean_payload(payload: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """Remove image-related and large data from the payload to keep the log lean.

    Args:
        payload: Original request payload dictionary

    Returns:
        Cleaned payload dictionary with large data removed
    """
    if not payload:
        return None

    cleaned_payload = payload.copy()

    if "images" in cleaned_payload:
        del cleaned_payload["images"]

    return cleaned_payload


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--controller-address", type=str, default="http://localhost:21001")
    parser.add_argument("--worker-address", type=str)
    parser.add_argument("--model-name", type=str, default="llava-med-v1.5-mistral-7b")
    parser.add_argument("--output-dir", type=str, default="benchmark_results")
    parser.add_argument(
        "--raw-output", action="store_true", help="Return raw model output without validation"
    )
    parser.add_argument(
        "--num-cases",
        type=int,
        help="Number of cases to process if looking at raw outputs",
        default=2,
    )
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    live_log_filename = os.path.join(args.output_dir, f"live_benchmark_log_{timestamp}.json")
    final_results_filename = os.path.join(args.output_dir, f"final_results_{timestamp}.json")
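
    # The live log is an incrementally written JSON array: "[\n" is written up
    # front, each entry is appended followed by ",\n", and the array is closed
    # with "\n]" once all cases have been processed.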
    with open(live_log_filename, "w") as live_log_file:
        live_log_file.write("[\n")

    logging.basicConfig(
        filename=os.path.join(args.output_dir, f"benchmark_{timestamp}.log"),
        level=logging.INFO,
        format="%(message)s",
    )
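
    # Resolve the worker address: use the --worker-address flag if given,
    # otherwise ask the controller which worker serves the requested model.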
    if args.worker_address:
        worker_addr = args.worker_address
    else:
        try:
            requests.post(args.controller_address + "/refresh_all_workers")
            ret = requests.post(args.controller_address + "/list_models")
            models = ret.json()["models"]
            ret = requests.post(
                args.controller_address + "/get_worker_address", json={"model": args.model_name}
            )
            worker_addr = ret.json()["address"]
            print(f"Worker address: {worker_addr}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to connect to controller: {e}")
            return

    if worker_addr == "":
        print("No available worker")
        return

    with open("MedMAX/data/updated_cases.json", "r") as file:
        data = json.load(file)

    total_cases, total_questions = count_total_questions()
    print(f"\nStarting benchmark with {args.model_name}")
    print(f"Found {total_cases} cases with {total_questions} total questions")

    results = {
        "model": args.model_name,
        "timestamp": datetime.now().isoformat(),
        "total_cases": total_cases,
        "total_questions": total_questions,
        "results": [],
    }

    cases_processed = 0
    questions_processed = 0
    correct_answers = 0
    skipped_questions = 0
    total_processed_entries = 0
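
    # Iterate over every case and its question files. raw_output=True is used
    # so the full model output, prompts, and image paths are logged for each
    # question, not just the validated letter.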
    for case_id, case_details in tqdm(data.items(), desc="Processing cases"):
        question_files = load_benchmark_questions(case_id)
        if not question_files:
            continue

        cases_processed += 1
        for question_file in tqdm(
            question_files, desc=f"Processing questions for case {case_id}", leave=False
        ):
            with open(question_file, "r") as file:
                question_data = json.load(file)
            question_id = os.path.basename(question_file).split(".")[0]

            questions_processed += 1

            inference_result = create_inference_request(
                question_data,
                case_details,
                case_id,
                question_id,
                worker_addr,
                args.model_name,
                raw_output=True,
            )

            if inference_result == ("skipped", 0.0):
                skipped_questions += 1
                print(f"\nCase {case_id}, Question {question_id}: Skipped (No images)")

                skipped_entry = {
                    "case_id": case_id,
                    "question_id": question_id,
                    "status": "skipped",
                    "reason": "No images found",
                }
                with open(live_log_filename, "a") as live_log_file:
                    json.dump(skipped_entry, live_log_file, indent=2)
                    live_log_file.write(",\n")

                continue

            # Inference errors return (None, None) rather than a dict; log them
            # and move on instead of crashing on the dict lookups below.
            if inference_result == (None, None):
                skipped_questions += 1
                print(f"\nCase {case_id}, Question {question_id}: Failed (inference error)")

                failed_entry = {
                    "case_id": case_id,
                    "question_id": question_id,
                    "status": "failed",
                    "reason": "Inference request failed",
                }
                with open(live_log_filename, "a") as live_log_file:
                    json.dump(failed_entry, live_log_file, indent=2)
                    live_log_file.write(",\n")

                continue

            answer = inference_result["validated_answer"]
            duration = inference_result["duration"]

            log_entry = {
                "case_id": case_id,
                "question_id": question_id,
                "question": question_data["question"],
                "correct_answer": question_data["answer"],
                "raw_output": inference_result["raw_output"],
                "validated_answer": answer,
                "model_answer": answer,
                "is_correct": answer == question_data["answer"] if answer else False,
                "duration": duration,
                "system_prompt": inference_result["system_prompt"],
                "input_prompt": inference_result["prompt"],
                "image_paths": inference_result["image_paths"],
                "payload": clean_payload(inference_result["payload"]),
            }

            with open(live_log_filename, "a") as live_log_file:
                json.dump(log_entry, live_log_file, indent=2)
                live_log_file.write(",\n")

            print(f"\nCase {case_id}, Question {question_id}")
            print(f"Model Answer: {answer}")
            print(f"Correct Answer: {question_data['answer']}")
            print(f"Time taken: {duration:.2f}s")

            if answer == question_data["answer"]:
                correct_answers += 1

            results["results"].append(log_entry)
            total_processed_entries += 1

            if args.raw_output and cases_processed == args.num_cases:
                break

        if args.raw_output and cases_processed == args.num_cases:
            break

    # Close the JSON array in the live log. "rb+" is required here: files
    # opened in append mode ignore seek() for writes, so the trailing ",\n"
    # left by the last entry would never be overwritten.
    with open(live_log_filename, "rb+") as live_log_file:
        live_log_file.seek(0, os.SEEK_END)
        end = live_log_file.tell()
        if end > 2:
            # Strip the trailing ",\n" left after the last entry.
            live_log_file.seek(end - 2)
            live_log_file.truncate()
        live_log_file.write(b"\n]")
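
    # Accuracy is computed over answered questions only, i.e. skipped and
    # failed questions are excluded from the denominator.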
    results["summary"] = {
        "cases_processed": cases_processed,
        "questions_processed": questions_processed,
        "total_processed_entries": total_processed_entries,
        "correct_answers": correct_answers,
        "skipped_questions": skipped_questions,
        "accuracy": (
            correct_answers / (questions_processed - skipped_questions)
            if (questions_processed - skipped_questions) > 0
            else 0
        ),
    }

    with open(final_results_filename, "w") as f:
        json.dump(results, f, indent=2)

    print(f"\nBenchmark Summary:")
    print(f"Total Cases Processed: {cases_processed}")
    print(f"Total Questions Processed: {questions_processed}")
    print(f"Total Processed Entries: {total_processed_entries}")
    print(f"Correct Answers: {correct_answers}")
    print(f"Skipped Questions: {skipped_questions}")
    print(f"Accuracy: {results['summary']['accuracy'] * 100:.2f}%")
    print(f"\nResults saved to {args.output_dir}")
    print(f"Live log: {live_log_filename}")
    print(f"Final results: {final_results_filename}")


if __name__ == "__main__":
    main()