from collections import defaultdict, deque

from src.execution_model import OverlappedOperation, Operation, Schedule, ScheduleConfig


def generate_1f1b_schedule(config: ScheduleConfig):
    schedule = Schedule(config)
    assert config.num_devices == config.num_stages, "num_devices must be equal to num_stages for 1F1B"

    for i in range(config.num_devices):
        fwd_batch_id = 0
        bwd_batch_id = 0
        cooldown_batches = warmup_batches = config.num_devices - i - 1
        steady_batches = config.num_batches - warmup_batches

        for _ in range(warmup_batches):
            schedule.device_queues[i].add_operation(
                schedule.get_op(fwd_batch_id, i, "forward")
            )
            fwd_batch_id += 1

        for _ in range(steady_batches):
            schedule.device_queues[i].add_operation(
                schedule.get_op(fwd_batch_id, i, "forward")
            )
            fwd_batch_id += 1
            schedule.device_queues[i].add_operation(
                schedule.get_op(bwd_batch_id, i, "backward")
            )
            bwd_batch_id += 1

        for _ in range(cooldown_batches):
            schedule.device_queues[i].add_operation(
                schedule.get_op(bwd_batch_id, i, "backward")
            )
            bwd_batch_id += 1

    return schedule
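
# Worked example (hypothetical config): with num_devices = num_stages = 4 and
# num_batches = 8, device 0 enqueues 3 warmup forwards, then 5 steady-state
# forward/backward pairs, then 3 cooldown backwards, while device 3 skips
# warmup/cooldown entirely and runs 8 back-to-back forward/backward pairs.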


def generate_zero_bubble_1p_schedule(config: ScheduleConfig):
    # config.split_backward must be True so the schedule exposes separate
    # backward_D (input-gradient) and backward_W (weight-gradient) operations.
    schedule = Schedule(config)
    total_batches = config.num_batches
    assert config.num_devices == config.num_stages, "num_devices must be equal to num_stages for ZB-1P"
    assert config.split_backward, "ZB-1P requires split_backward=True"

    for i in range(config.num_devices):
        fwd_batch_id = 0
        bwd_d_batch_id = 0
        bwd_w_batch_id = 0
        cooldown_batches = warmup_batches = config.num_devices - i - 1
        steady_batches = total_batches - warmup_batches

        for _ in range(warmup_batches):
            schedule.device_queues[i].add_operation(
                schedule.get_op(fwd_batch_id, i, "forward")
            )
            fwd_batch_id += 1

        for _ in range(steady_batches):
            schedule.device_queues[i].add_operation(
                schedule.get_op(fwd_batch_id, i, "forward")
            )
            schedule.device_queues[i].add_operation(
                schedule.get_op(bwd_d_batch_id, i, "backward_D")
            )
            # Defer backward_W until enough forwards are in flight; this deferral
            # is what fills the pipeline bubble in ZB-1P.
            if fwd_batch_id - bwd_w_batch_id >= config.num_devices - 1:
                schedule.device_queues[i].add_operation(
                    schedule.get_op(bwd_w_batch_id, i, "backward_W")
                )
                bwd_w_batch_id += 1
            bwd_d_batch_id += 1
            fwd_batch_id += 1

        for _ in range(cooldown_batches):
            schedule.device_queues[i].add_operation(
                schedule.get_op(bwd_d_batch_id, i, "backward_D")
            )
            schedule.device_queues[i].add_operation(
                schedule.get_op(bwd_w_batch_id, i, "backward_W")
            )
            bwd_w_batch_id += 1
            bwd_d_batch_id += 1

        # Drain any remaining weight-gradient work.
        while bwd_w_batch_id < total_batches:
            schedule.device_queues[i].add_operation(
                schedule.get_op(bwd_w_batch_id, i, "backward_W")
            )
            bwd_w_batch_id += 1

    return schedule
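
# Worked example (hypothetical config): with num_devices = 4 and num_batches = 8,
# device 0 runs 3 warmup forwards; in the steady loop each iteration enqueues one
# forward and one backward_D, but the matching backward_W for batch b is only
# enqueued once fwd_batch_id - b >= 3, so weight-gradient work lands in what
# would otherwise be idle bubble time, and the final while-loop drains the
# leftover backward_W operations.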


def generate_1f1b_overlap_schedule(config: ScheduleConfig):
    schedule = Schedule(config)
    assert config.num_devices == config.num_stages, "num_devices must be equal to num_stages for 1F1B"

    for i in range(config.num_devices):
        fwd_batch_id = 0
        bwd_batch_id = 0
        cooldown_batches = warmup_batches = 2 * (config.num_devices - i - 1) + 1
        steady_batches = config.num_batches - warmup_batches

        for _ in range(warmup_batches):
            schedule.device_queues[i].add_operation(
                schedule.get_op(fwd_batch_id, i, "forward")
            )
            fwd_batch_id += 1

        for _ in range(steady_batches):
            fwd_op = schedule.get_op(fwd_batch_id, i, "forward")
            bwd_op = schedule.get_op(bwd_batch_id, i, "backward")
            overlapped_op = OverlappedOperation([fwd_op, bwd_op])
            schedule.register_overlapped_operation(overlapped_op)
            schedule.device_queues[i].add_operation(overlapped_op)
            fwd_batch_id += 1
            bwd_batch_id += 1

        for _ in range(cooldown_batches):
            schedule.device_queues[i].add_operation(
                schedule.get_op(bwd_batch_id, i, "backward")
            )
            bwd_batch_id += 1

    return schedule
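
# Worked example (hypothetical config): with num_devices = 4 and num_batches = 12,
# device 0 uses 2 * (4 - 0 - 1) + 1 = 7 warmup forwards, 12 - 7 = 5 overlapped
# forward+backward steps, and 7 cooldown backwards; the deeper warmup (compared
# with plain 1F1B) ensures a forward is always available to pair with a backward.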


def _get_pp_rank_microbatches(
    num_microbatches,
    num_devices,
    device_id,
    num_stages_per_device,
    microbatch_group_size_per_vp_stage,
):
    """Get the number of warmup microbatches for a pipeline rank in PP scheduling."""
    total_num_microbatches = num_microbatches * num_stages_per_device
    if num_devices > 1:
        # Run (num_stages_per_device - 1) * microbatch_group_size_per_vp_stage on
        # all workers, followed by more microbatches depending on the device ID
        # (earlier devices need more warmup forward passes; later devices can
        # start 1F1B immediately).
        num_warmup_microbatches = (num_devices - device_id - 1) * 2
        num_warmup_microbatches += (num_stages_per_device - 1) * microbatch_group_size_per_vp_stage
    else:
        # forward_backward_no_pipelining
        num_warmup_microbatches = 1

    if num_warmup_microbatches >= total_num_microbatches:
        num_warmup_microbatches = total_num_microbatches

    return num_warmup_microbatches
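
# Worked example (hypothetical values): with num_devices = 2, device_id = 0,
# num_stages_per_device = 2, and microbatch_group_size_per_vp_stage = 2, this
# returns (2 - 0 - 1) * 2 + (2 - 1) * 2 = 4, capped at total_num_microbatches
# when there are too few microbatches to fill the warmup.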


def _get_schedule_table(num_microbatches, num_model_chunks, microbatch_group_size_per_vp_stage):
    """Get the schedule table for PP scheduling.

    Create a tunable schedule lookup table. The schedule lookup table uses the
    virtual_microbatch_id to find the corresponding microbatch_id and
    model_chunk_id. For example, the tunable schedule table for PP2 N3M5 with
    VP2 is constructed as below:

    virtual_microbatch_id | 0 1 2 3 4 5 6 7 8 9
    microbatch_id         | 0 1 2 0 1 2 3 4 3 4
    model_chunk_id        | 0 0 0 1 1 1 0 0 1 1
    """
    schedule_table = []
    for min_microbatch_id_in_group in range(
        0, num_microbatches, microbatch_group_size_per_vp_stage
    ):
        if min_microbatch_id_in_group + microbatch_group_size_per_vp_stage >= num_microbatches:
            # Construct schedule for the last microbatch group
            schedule_table.extend(
                [
                    (microbatch_id, model_chunk_id)
                    for model_chunk_id in range(num_model_chunks)
                    for microbatch_id in range(min_microbatch_id_in_group, num_microbatches)
                ]
            )
        else:
            # Construct schedule for other microbatch groups
            schedule_table.extend(
                [
                    (microbatch_id, model_chunk_id)
                    for model_chunk_id in range(num_model_chunks)
                    for microbatch_id in range(
                        min_microbatch_id_in_group,
                        min_microbatch_id_in_group + microbatch_group_size_per_vp_stage,
                    )
                ]
            )
    return schedule_table
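
# Sanity check, traced from the code on the docstring example (num_microbatches=5,
# num_model_chunks=2, microbatch_group_size_per_vp_stage=3):
#   _get_schedule_table(5, 2, 3)
#   -> [(0, 0), (1, 0), (2, 0), (0, 1), (1, 1), (2, 1),
#       (3, 0), (4, 0), (3, 1), (4, 1)]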


def _convert_schedule_table_to_order(num_warmup_microbatches, num_model_chunks, schedule_table):
    """Convert a tunable schedule lookup table to the te.make_graphed_callables()
    accepted order format. For example, the tunable schedule table for PP2 N3M5
    with VP2 is as below:

    virtual_microbatch_id | 0 1 2 3 4 5 6 7 8 9
    microbatch_id         | 0 1 2 0 1 2 3 4 3 4
    model_chunk_id        | 0 0 0 1 1 1 0 0 1 1

    Then the forward backward separated order is:

    forward  |  1  1  1  2  2  2  1  1  2  2
    backward | -2 -2 -2 -1 -1 -1 -2 -2 -1 -1

    If num_warmup_microbatches is 5, the output order is:
    1 1 1 2 2 2 -2 1 -2 1 -2 2 -1 2 -1 -1 -2 -2 -1 -1
    """
    _, model_chunk_id_table = zip(*schedule_table)
    forward_order = [chunk_id + 1 for chunk_id in model_chunk_id_table]
    backward_order = [chunk_id - num_model_chunks for chunk_id in model_chunk_id_table]
    order = forward_order[:num_warmup_microbatches]

    for i in range(num_warmup_microbatches, len(forward_order)):
        order.append(forward_order[i])
        order.append(backward_order[i - num_warmup_microbatches])

    if num_warmup_microbatches > 0:
        order.extend(backward_order[-num_warmup_microbatches:])

    return order
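
# Tracing the docstring example: the first 5 entries of forward_order form the
# warmup (1 1 1 2 2); each remaining forward is then interleaved with one
# backward, offset by the warmup depth (2 -2 1 -2 1 -2 2 -1 2 -1); and the
# last 5 backwards drain the pipeline (-1 -2 -2 -1 -1).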


# Some of this code is adapted from Megatron-LM.
def generate_1f1b_interleave_schedule(config: ScheduleConfig):
    schedule = Schedule(config)

    for device_id in range(config.num_devices):
        microbatch_group_size_per_vp_stage = config.num_devices
        num_warmup_microbatches = _get_pp_rank_microbatches(
            config.num_batches,
            config.num_devices,
            device_id,
            config.num_stages_per_device,
            microbatch_group_size_per_vp_stage,
        )

        schedule_table = _get_schedule_table(
            config.num_batches,
            config.num_stages_per_device,
            microbatch_group_size_per_vp_stage,
        )

        order = _convert_schedule_table_to_order(
            num_warmup_microbatches,
            num_model_chunks=config.num_stages_per_device,
            schedule_table=schedule_table,
        )

        # Track the next microbatch id per signed order item (positive entries
        # are forward chunks, negative entries are backward chunks).
        cur_stage_microbatch_id = {}
        for i in range(1, config.num_stages_per_device + 1):
            cur_stage_microbatch_id[i] = 0
            cur_stage_microbatch_id[-i] = 0

        for order_item in order:
            stage_id = schedule.device_queues[device_id].stages[abs(order_item) - 1]

            if order_item > 0:
                op_type = "forward"
            elif order_item < 0:
                op_type = "backward"
            else:
                raise ValueError(f"Invalid order item: {order_item}")

            micro_batch_id = cur_stage_microbatch_id[order_item]
            cur_stage_microbatch_id[order_item] += 1

            schedule.device_queues[device_id].add_operation(
                schedule.get_op(micro_batch_id, stage_id, op_type)
            )

    return schedule
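
# Sketch of intended use (ScheduleConfig constructor arguments are assumptions,
# not verified against src.execution_model):
#   config = ScheduleConfig(num_devices=2, num_stages=4, num_batches=8)
#   schedule = generate_1f1b_interleave_schedule(config)
# Each device then owns num_stages_per_device model chunks and walks `order`,
# issuing forwards for positive entries and backwards for negative ones.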


def generate_1f1b_interleave_overlap_schedule(config: ScheduleConfig):
    schedule = Schedule(config)

    for device_id in range(config.num_devices):
        microbatch_group_size_per_vp_stage = config.num_devices
        num_warmup_microbatches = _get_pp_rank_microbatches(
            config.num_batches,
            config.num_devices,
            device_id,
            config.num_stages_per_device,
            microbatch_group_size_per_vp_stage,
        )

        schedule_table = _get_schedule_table(
            config.num_batches,
            config.num_stages_per_device,
            microbatch_group_size_per_vp_stage,
        )

        # NOTE: Add one more warmup microbatch for overlapped operations!
        num_warmup_microbatches += 1

        order = _convert_schedule_table_to_order(
            num_warmup_microbatches,
            num_model_chunks=config.num_stages_per_device,
            schedule_table=schedule_table,
        )

        cur_stage_microbatch_id = {}
        for i in range(1, config.num_stages_per_device + 1):
            cur_stage_microbatch_id[i] = 0
            cur_stage_microbatch_id[-i] = 0

        i = 0
        num_overlapped_batches = len(order) - num_warmup_microbatches * 2
        while i < len(order):
            if i < num_warmup_microbatches:
                # Warmup: plain forwards only.
                order_item = order[i]
                assert order_item > 0
                op_type = "forward"

                micro_batch_id = cur_stage_microbatch_id[order_item]
                cur_stage_microbatch_id[order_item] += 1

                stage_id = schedule.device_queues[device_id].stages[abs(order_item) - 1]
                schedule.device_queues[device_id].add_operation(
                    schedule.get_op(micro_batch_id, stage_id, op_type)
                )
                i += 1
            elif i < num_warmup_microbatches + num_overlapped_batches - 1:
                # Steady state: fuse consecutive order items into one overlapped op.
                order_item_a = order[i]
                order_item_b = order[i + 1]

                op_type_a = "forward" if order_item_a > 0 else "backward"
                micro_batch_id_a = cur_stage_microbatch_id[order_item_a]
                cur_stage_microbatch_id[order_item_a] += 1

                op_type_b = "forward" if order_item_b > 0 else "backward"
                micro_batch_id_b = cur_stage_microbatch_id[order_item_b]
                cur_stage_microbatch_id[order_item_b] += 1

                stage_id_a = schedule.device_queues[device_id].stages[abs(order_item_a) - 1]
                stage_id_b = schedule.device_queues[device_id].stages[abs(order_item_b) - 1]

                op_a = schedule.get_op(micro_batch_id_a, stage_id_a, op_type_a)
                op_b = schedule.get_op(micro_batch_id_b, stage_id_b, op_type_b)
                overlapped_op = OverlappedOperation([op_a, op_b])
                schedule.register_overlapped_operation(overlapped_op)
                schedule.device_queues[device_id].add_operation(overlapped_op)
                i += 2
            else:
                # Cooldown: plain backwards only.
                assert i >= num_warmup_microbatches + num_overlapped_batches
                order_item = order[i]
                assert order_item < 0
                op_type = "backward"

                micro_batch_id = cur_stage_microbatch_id[order_item]
                cur_stage_microbatch_id[order_item] += 1

                stage_id = schedule.device_queues[device_id].stages[abs(order_item) - 1]
                schedule.device_queues[device_id].add_operation(
                    schedule.get_op(micro_batch_id, stage_id, op_type)
                )
                i += 1

    return schedule
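
# Worked example (hypothetical sizes): if len(order) == 20 and
# num_warmup_microbatches == 6 (after the extra +1 above), then
# num_overlapped_batches = 20 - 12 = 8, giving 6 plain warmup forwards,
# 4 overlapped forward/backward pairs covering order[6:14], and 6 plain
# cooldown backwards.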


def create_overlapped_ops(schedule, batch_id1, batch_id2, stage_id, type1, type2):
    """
    Helper function to create overlapped operations correctly.

    Looks up the underlying operations and registers the overlapped wrapper in
    the schedule to avoid device_id issues.
    """
    # Get the operations from the schedule
    op1 = schedule.ops[(batch_id1, stage_id, type1)]
    op2 = schedule.ops[(batch_id2, stage_id, type2)]

    # Create the overlapped operation
    overlapped_op = OverlappedOperation([op1, op2])

    # Register in the schedule to ensure proper tracking
    schedule.register_overlapped_operation(overlapped_op)

    return overlapped_op
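
# Example (hypothetical ids): pair batch 3's forward with batch 1's backward on
# stage 2, assuming both operations already exist in schedule.ops:
#   overlapped = create_overlapped_ops(schedule, 3, 1, 2, "forward", "backward")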


def generate_dualpipe_schedule(config: ScheduleConfig):
    """
    Implements the DualPipe scheduling strategy.

    DualPipe is a bidirectional pipeline parallelism algorithm that achieves full
    overlap of forward and backward computation-communication phases and reduces
    pipeline bubbles.

    The DualPipe strategy has the following characteristics:
    1. Requires placement_strategy="dualpipe" in ScheduleConfig (set automatically)
    2. Each device handles both a forward stage and a reverse stage
    3. Overlaps forward and backward operations to reduce bubble size
    4. Treats config.num_batches as the total number of microbatches; the
       original paper's M (microbatches per direction) corresponds to
       config.num_batches // 2
    5. Currently only supports split_backward=True

    Args:
        config: The scheduling configuration

    Returns:
        A Schedule object with the DualPipe scheduling
    """
    # Ensure placement strategy is set for Schedule initialization
    assert config.placement_strategy == "dualpipe", "DualPipe schedule currently only supports placement_strategy='dualpipe'"
    # Assertions based on DualPipe requirements
    assert config.num_stages % 2 == 0, "DualPipe requires an even number of stages (and devices)"
    assert config.num_devices == config.num_stages, "DualPipe requires num_devices == num_stages"
    assert config.num_batches % 2 == 0, "DualPipe requires an even number of microbatches (config.num_batches)"
    # Assertion based on the original implementation (num_chunks >= num_ranks * 2);
    # config.num_batches plays the role of num_chunks here, so each of a device's
    # two phases gets at least num_devices microbatches.
    assert config.num_batches >= config.num_devices * 2, "DualPipe requires config.num_batches >= 2 * config.num_devices"
    assert config.split_backward, "DualPipe schedule currently only supports split_backward=True"

    schedule = Schedule(config, init_ops=False)
    num_stages = config.num_stages
    num_devices = config.num_devices

    # Mirrors half_num_chunks in the original implementation: each of a device's
    # two phases processes config.num_batches // 2 microbatches.
    half_num_chunks = config.num_batches // 2
    num_half_ranks = num_devices // 2

    fwd_batch_ids = defaultdict(int)    # (device_id, phase) -> next forward batch_id
    bwd_d_batch_ids = defaultdict(int)  # (device_id, phase) -> next backward batch_id
    waited_weight_grad = [deque() for _ in range(num_devices)]  # device_id -> deque of (stage_id, batch_id)

    # Seed per-phase batch id counters: each device's two phases consume disjoint
    # halves of the microbatch ids (one phase starts at 0, the other at
    # config.num_batches // 2).
    for device_id in range(num_devices):
        is_in_second_half = device_id >= num_half_ranks
        if is_in_second_half:
            fwd_batch_ids[device_id, 1] = 0
            fwd_batch_ids[device_id, 0] = config.num_batches // 2
            bwd_d_batch_ids[device_id, 1] = 0
            bwd_d_batch_ids[device_id, 0] = config.num_batches // 2
        else:
            fwd_batch_ids[device_id, 0] = 0
            fwd_batch_ids[device_id, 1] = config.num_batches // 2
            bwd_d_batch_ids[device_id, 0] = 0
            bwd_d_batch_ids[device_id, 1] = config.num_batches // 2

    def get_stage_for_phase(device_id, phase, num_stages, is_in_second_half):
        stage_fwd_dir = device_id  # Stage handled when moving forward (0 to N-1)
        stage_rev_dir = num_stages - 1 - device_id  # Stage handled when moving backward (N-1 to 0)
        if not is_in_second_half:
            # First half: phase 0 -> fwd_dir, phase 1 -> rev_dir
            return stage_fwd_dir if phase == 0 else stage_rev_dir
        else:
            # Second half: phase 0 -> rev_dir, phase 1 -> fwd_dir
            return stage_rev_dir if phase == 0 else stage_fwd_dir

    def add_op_to_queue(device_id, stage_id, op_type, batch_id):
        # Create and register the Operation (the Schedule was built with
        # init_ops=False, so operations are created on demand here).
        op = Operation(batch_id, stage_id, op_type)
        schedule.register_operation(op)
        # Add to the device queue
        schedule.device_queues[device_id].add_operation(op)

    def _schedule_forward_chunk(device_id, phase, is_in_second_half):
        """Schedules a forward compute operation."""
        stage_id = get_stage_for_phase(device_id, phase, num_stages, is_in_second_half)
        batch_id = fwd_batch_ids[device_id, phase]
        add_op_to_queue(device_id, stage_id, "forward", batch_id)
        fwd_batch_ids[device_id, phase] += 1

    def _schedule_backward_chunk(device_id, phase, is_in_second_half):
        """Schedules a full backward (backward_D and backward_W together) compute operation."""
        stage_id = get_stage_for_phase(device_id, phase, num_stages, is_in_second_half)
        batch_id = bwd_d_batch_ids[device_id, phase]
        add_op_to_queue(device_id, stage_id, "backward", batch_id)
        bwd_d_batch_ids[device_id, phase] += 1

    def _schedule_backward_input_chunk(device_id, phase, is_in_second_half):
        """Schedules a backward_D compute operation and records the pending backward_W."""
        stage_id = get_stage_for_phase(device_id, phase, num_stages, is_in_second_half)
        batch_id = bwd_d_batch_ids[device_id, phase]
        add_op_to_queue(device_id, stage_id, "backward_D", batch_id)
        bwd_d_batch_ids[device_id, phase] += 1
        waited_weight_grad[device_id].append((stage_id, batch_id))

    def _schedule_backward_weight_chunk(device_id):
        """Schedules the oldest pending backward_W compute operation."""
        stage_id, batch_id = waited_weight_grad[device_id].popleft()
        add_op_to_queue(device_id, stage_id, "backward_W", batch_id)

    def _schedule_forward_backward_chunk(device_id, fwd_phase, bwd_phase, is_in_second_half):
        """Schedules an overlapped forward and backward compute operation."""
        fwd_stage_id = get_stage_for_phase(device_id, fwd_phase, num_stages, is_in_second_half)
        bwd_stage_id = get_stage_for_phase(device_id, bwd_phase, num_stages, is_in_second_half)

        fwd_batch_id = fwd_batch_ids[device_id, fwd_phase]
        fwd_op = Operation(fwd_batch_id, fwd_stage_id, "forward")
        schedule.register_operation(fwd_op)
        fwd_batch_ids[device_id, fwd_phase] += 1

        bwd_batch_id_d = bwd_d_batch_ids[device_id, bwd_phase]
        bwd_op = Operation(bwd_batch_id_d, bwd_stage_id, "backward")
        schedule.register_operation(bwd_op)
        bwd_d_batch_ids[device_id, bwd_phase] += 1

        # Create and register the overlapped operation
        overlapped_op = OverlappedOperation([fwd_op, bwd_op])
        schedule.register_overlapped_operation(overlapped_op)

        # Add the overlapped operation to the queue
        schedule.device_queues[device_id].add_operation(overlapped_op)

    # Process each device (rank in the original code)
    for device_id in range(num_devices):
        half_rank = min(device_id, num_devices - 1 - device_id)
        is_in_second_half = device_id >= num_half_ranks
        # Only used by the disabled branch in step 4 below.
        is_middle_rank = (device_id == num_half_ranks - 1) or (device_id == num_half_ranks)

        # Map the original DualPipe steps to operation additions.
        # Step 1: nF0
        step_1_count = (num_half_ranks - half_rank - 1) * 2
        for _ in range(step_1_count):
            _schedule_forward_chunk(device_id, 0, is_in_second_half)  # F0

        # Step 2: nF0F1
        step_2_count = half_rank + 1
        for _ in range(step_2_count):
            _schedule_forward_chunk(device_id, 0, is_in_second_half)  # F0
            _schedule_forward_chunk(device_id, 1, is_in_second_half)  # F1

        # Step 3: nB1W1F1
        step_3_count = num_half_ranks - half_rank - 1
        for _ in range(step_3_count):
            _schedule_backward_input_chunk(device_id, 1, is_in_second_half)  # B1_D
            _schedule_backward_weight_chunk(device_id)  # W1
            _schedule_forward_chunk(device_id, 1, is_in_second_half)  # F1

        # Step 4 (Main step): nF0B1F1B0
        step_4_count = half_num_chunks - num_devices + half_rank + 1
        for i in range(step_4_count):
            # if i == 0 and is_middle_rank:
            #     # Schedule F0, B1, W1 sequentially for middle ranks on the
            #     # first iteration instead of overlapping:
            #     _schedule_forward_chunk(device_id, 0, is_in_second_half)  # F0
            #     _schedule_backward_chunk(device_id, 1, is_in_second_half)  # B1
            #     _schedule_backward_weight_chunk(device_id)  # W1
            # else:
            # Overlap F0 and B1
            _schedule_forward_backward_chunk(device_id, 0, 1, is_in_second_half)  # F0+B1
            # Overlap F1 and B0
            _schedule_forward_backward_chunk(device_id, 1, 0, is_in_second_half)  # F1+B0

        # Step 5: nB1F1B0
        step_5_count = num_half_ranks - half_rank - 1
        for _ in range(step_5_count):
            _schedule_backward_chunk(device_id, 1, is_in_second_half)  # B1_D + B1_W
            _schedule_forward_backward_chunk(device_id, 1, 0, is_in_second_half)  # F1+B0

        # Step 6: nB1B0
        step_6_count = half_rank + 1
        enable_zb = False
        for i in range(step_6_count):
            if i == step_6_count // 2 and half_rank % 2 == 1:
                enable_zb = True
            if enable_zb:
                _schedule_backward_input_chunk(device_id, 1, is_in_second_half)
            else:
                _schedule_backward_chunk(device_id, 1, is_in_second_half)
            if i == step_6_count // 2 and half_rank % 2 == 0:
                enable_zb = True
            if enable_zb:
                _schedule_backward_input_chunk(device_id, 0, is_in_second_half)
            else:
                _schedule_backward_chunk(device_id, 0, is_in_second_half)

        # Step 7: nWB0
        step_7_count = num_half_ranks - half_rank - 1
        for _ in range(step_7_count):
            _schedule_backward_weight_chunk(device_id)  # W1 (uses gradients from an earlier B1_D)
            _schedule_backward_input_chunk(device_id, 0, is_in_second_half)  # B0_D

        # Step 8: nW
        step_8_count = half_rank + 1
        for _ in range(step_8_count):
            # W0 uses gradients from B0_D scheduled in steps 4, 5, 6.
            # W1 uses gradients from B1_D scheduled in steps 3, 4, 5, 6.
            # The last W0 gradients correspond to B0_D from step 6 or 7.
            _schedule_backward_weight_chunk(device_id)  # W0 (uses gradients from an earlier B0_D)

    return schedule
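
# Sketch of intended use (ScheduleConfig constructor arguments are assumptions,
# not verified against src.execution_model):
#   config = ScheduleConfig(num_devices=8, num_stages=8, num_batches=32,
#                           placement_strategy="dualpipe", split_backward=True)
#   schedule = generate_dualpipe_schedule(config)
# Each device then runs one forward-direction stage and one reverse-direction
# stage, with the main step overlapping forward and backward work.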