From 74becdf80d7d2d8db5fc93c4819f9e6be8b083fe Mon Sep 17 00:00:00 2001 From: anurag Date: Thu, 7 May 2026 16:09:23 -0600 Subject: [PATCH 1/2] sync internal Signed-off-by: anurag --- .gitignore | 1 + src/mldebug/aie_util.py | 10 +++++++--- src/mldebug/batch_runner.py | 30 +++++++++++++++++++++++++----- src/mldebug/client_debug.py | 13 +++++++++++++ src/mldebug/input_parser.py | 4 ++-- src/mldebug/interactive_prompt.py | 2 +- src/mldebug/layer_info.py | 16 ++++++++++------ src/mldebug/memory_dumper.py | 6 ++++++ src/mldebug/mldebug_cli.py | 7 +++++-- src/mldebug/work_dir.py | 2 +- 10 files changed, 71 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index eab9506..fd39681 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ src/cpp/build AIEDebugLibrary.dll *.lst *.Identifier +ext/tests diff --git a/src/mldebug/aie_util.py b/src/mldebug/aie_util.py index 5008c80..db62c85 100644 --- a/src/mldebug/aie_util.py +++ b/src/mldebug/aie_util.py @@ -243,12 +243,16 @@ def _filter_tiles(self, tile_type): def read_control_instr(self): """ - Read and return the value of the control instruction from the memory tile spare register. + Read and return the value of the SPARE_REG control instruction from all memory tiles. Returns: - int: Value from the SPARE_REG of memory tile (col=0, row=1). + dict[str, int]: Mapping of "MEM_TILE_{col}" to the SPARE_REG value for each memory tile. """ - return self.impl.read_register(0, 1, self.aie_iface.Memory_tile_registers["SPARE_REG"]) + spare_reg = self.aie_iface.Memory_tile_registers["SPARE_REG"] + return { + f"MEM_TILE_{c}": self.impl.read_register(c, r, spare_reg) + for c, r in self._filter_tiles(self.aie_iface.MEM_TILE_T) + } def initialize_stamp(self): """ diff --git a/src/mldebug/batch_runner.py b/src/mldebug/batch_runner.py index 06ca579..86da9e6 100644 --- a/src/mldebug/batch_runner.py +++ b/src/mldebug/batch_runner.py @@ -9,6 +9,8 @@ InteractiveController builds on this for interactive stepping. 
""" +import dataclasses +import json import pathlib import sys import time @@ -242,13 +244,14 @@ def schedule_layer_start(self, next_layer): be.continue_aie() # Poll stamps until breakpoint is hit - max_attempts = 1200 - while max_attempts > 0: - if all(be.poll_core_status() for be in bes_to_poll): - break + timeout = 10 + start_time = time.time() + while time.time() - start_time < timeout: if self.args.backend == "test": break - max_attempts -= 1 + time.sleep(0.1) + if all(be.poll_core_status() for be in bes_to_poll): + break # When combo events are used, it takes a few cycles to # hit the breakpoint, so pc might have moved @@ -298,6 +301,7 @@ def _process_err(self): self.status_handle.get(p + "/" + "aie_status_error.txt") else: self.status_handle.get("aie_status_error.txt") + self._write_run_summary("FAIL") sys.exit(1) def _process_end_breakpoint(self, layer, it, sid): @@ -341,6 +345,7 @@ def _process_start_breakpoint(self, layer, it, sid=0): if self.args.exit_at_layer and layer.layer_order >= self.args.exit_at_layer: LOGGER.log(f"[INFO] Exiting debugger at Layer: {layer.layer_order}") + self._write_run_summary("SUCCESS") sys.exit(0) if self.args.run_flags.layer_status and first_it: @@ -470,6 +475,7 @@ def execute_and_dump(self): self.impls[sid].continue_aie() LOGGER.log("\nFinished Execution") self._handle_fsp() + self._write_run_summary("SUCCESS") def _handle_fsp(self): """Handle end-of-run logic for VAIML Failsafe Partition mode.""" @@ -487,3 +493,17 @@ def _handle_fsp(self): "to load the next Failsafe Partition and wait for " "`waiting for user input`. Then press Enter here." 
) + + def _write_run_summary(self, status): + """ + Record run state to run_summary.json + """ + rsf = self.args.top_output_dir + "/run_summary.json" + flags_dict = dataclasses.asdict(self.args.run_flags) + summary = {"status": status, "run_flags": flags_dict} + + try: + with open(rsf, "w", encoding="utf-8") as fh: + json.dump(summary, fh, indent=2, default=str) + except (IOError, OSError) as e: + print(f"Unable to write run summary file. {e}") diff --git a/src/mldebug/client_debug.py b/src/mldebug/client_debug.py index 35afd6d..dc19ded 100644 --- a/src/mldebug/client_debug.py +++ b/src/mldebug/client_debug.py @@ -263,6 +263,19 @@ def read_all_core_pc(self): print(f"\n=== Stamp {sid} Core PC ===") impl.read_all_core_pc() + def read_control_instr(self): + """ + Read the SPARE_REG control instruction from all memory tiles across all stamps. + + Returns: + dict[str, int]: Merged mapping of "MEM_TILE_{col}" to SPARE_REG value, aggregated + from each per-stamp AIEUtil. Stamps own disjoint columns, so keys do not collide. 
+ """ + result = {} + for utl in self.aie_utls: + result.update(utl.read_control_instr()) + return result + # # START Advanced Mode Specific functionality # diff --git a/src/mldebug/input_parser.py b/src/mldebug/input_parser.py index 0f697b6..2880fff 100644 --- a/src/mldebug/input_parser.py +++ b/src/mldebug/input_parser.py @@ -36,7 +36,7 @@ class RunFlags: mock_hang: bool dump_temps: bool multistamp: bool - enable_tg: bool + disable_tg: bool @dataclass @@ -121,7 +121,7 @@ def get_flag(s, default=False): get_flag("mock_hang"), get_flag("dump_temps"), get_flag("multistamp"), - get_flag("enable_tg", default=True) + get_flag("disable_tg") ) diff --git a/src/mldebug/interactive_prompt.py b/src/mldebug/interactive_prompt.py index 4542d50..e5db2fc 100644 --- a/src/mldebug/interactive_prompt.py +++ b/src/mldebug/interactive_prompt.py @@ -111,7 +111,7 @@ def _build_shell_namespace(self): rreg = h.impl.read_register preg = h.impl.print_register wreg = h.impl.write_register - control_instr = h.aie_utls[0].read_control_instr + control_instr = h.read_control_instr add_brkpt = h.add_breakpoint status = h.status_handle.get uc_status = h.status_handle.get_uc_status diff --git a/src/mldebug/layer_info.py b/src/mldebug/layer_info.py index 54d1018..b4970bd 100644 --- a/src/mldebug/layer_info.py +++ b/src/mldebug/layer_info.py @@ -466,7 +466,7 @@ def __init__(self, args): self.mladf_report = None has_bi = args.buffer_info and Path(args.buffer_info).is_file() - use_mladf = args.mladf_report and Path(args.mladf_report).is_file() and args.run_flags.enable_tg + use_mladf = args.mladf_report and Path(args.mladf_report).is_file() and not args.run_flags.disable_tg data = None # 1. 
Parse the buffer info to get Layout if has_bi: @@ -877,14 +877,18 @@ def _initialize_layers_from_workdir(self, args): stamp.end_pc = f.final_lock_release_pc # Under right conditions, we don't even go through iterations - if args.run_flags.skip_iter and args.run_flags.enable_tg: + if args.run_flags.skip_iter: for idx, layer in enumerate(self.layers): if idx >= len(self.layers) - 1: layer.lcp.num_iter = 1 break next_layer_stamps = self.layers[idx+1].stamps - if (layer.stamps[0].name != next_layer_stamps[0].name - and len(layer.stamps) == len(next_layer_stamps) - and all(layer.stamps[i].elf_name == next_layer_stamps[i].elf_name for i in range(len(layer.stamps))) - ): + if args.run_flags.multistamp: + if (layer.stamps[0].name != next_layer_stamps[0].name + and len(layer.stamps) == len(next_layer_stamps) + and all(layer.stamps[i].elf_name == next_layer_stamps[i].elf_name for i in range(len(layer.stamps))) + ): + layer.lcp.num_iter = 1 + elif (layer.stamps[0].name != next_layer_stamps[0].name + and layer.stamps[0].elf_name == next_layer_stamps[0].elf_name ): layer.lcp.num_iter = 1 diff --git a/src/mldebug/memory_dumper.py b/src/mldebug/memory_dumper.py index 1dfe3d8..ff4f489 100644 --- a/src/mldebug/memory_dumper.py +++ b/src/mldebug/memory_dumper.py @@ -73,6 +73,12 @@ def get_output_path(self, buffer=None, col=None, row=None, layer_order=None, bat self._dir_cache.add(p) return p + def get_base_output_dir(self): + """ + Get the base output directory. Used by run summary + """ + return self.output_dir + def write_data_to_file(self, data, fname): """ Write an array of data to file in text or binary format. 
diff --git a/src/mldebug/mldebug_cli.py b/src/mldebug/mldebug_cli.py index e77293e..47b1df1 100644 --- a/src/mldebug/mldebug_cli.py +++ b/src/mldebug/mldebug_cli.py @@ -116,11 +116,14 @@ def debug(args, timestamp, subgraph_name=None, fsp="0", folder_name=None): print(f"Debugging New Failsafe Partition: {fsp}\n") output_dir = f"{folder_name}_{timestamp}/{subgraph_name}/{fsp}" args.subgraph_name = subgraph_name + args.top_output_dir = f"{folder_name}_{timestamp}" else: output_dir = f"output_{time.strftime('%m%d%H%M%S')}" + args.top_output_dir = output_dir if args.output_dir is not None: output_dir = args.output_dir + "/" + output_dir + args.top_output_dir = args.output_dir + "/" + args.top_output_dir launch_debug(args, output_dir) @@ -329,7 +332,7 @@ def app(): "skip_iter", "dump_temps", "multistamp", - "enable_tg" + "disable_tg" ], help="Specify one or more runtime flags:\n" "skip_dump : Do not dump memory\n" @@ -341,7 +344,7 @@ def app(): "skip_iter : Skip iterations in batch mode when possible\n" #"dump_temps : Write intermediate (.lst) files to disk\n" "multistamp : Enable N Stamp/Batch mode\n", - #"enable_tg : Enable Step to TG layers\n", + #"disable_tg : Disable Step to TG layers\n", # 'mock_hang' : Simulate hang at one of the layers in test mode metavar=" ", ) diff --git a/src/mldebug/work_dir.py b/src/mldebug/work_dir.py index 31b2811..6306fd1 100644 --- a/src/mldebug/work_dir.py +++ b/src/mldebug/work_dir.py @@ -169,7 +169,7 @@ def _get_lst(self, elf_path, elf_name): If self.dump_lst is True, writes the output listing to disk. 
""" lst_data = "" - exe = "llvm-objdump.elf" + exe = "llvm-objdump" archname = "aie2p" if is_windows(): exe = "llvm-objdump.exe" From e275564f200e8f9c1b695de63953ff8947042256 Mon Sep 17 00:00:00 2001 From: anurag Date: Thu, 7 May 2026 16:19:32 -0600 Subject: [PATCH 2/2] fix elf Signed-off-by: anurag --- src/mldebug/work_dir.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mldebug/work_dir.py b/src/mldebug/work_dir.py index 6306fd1..31b2811 100644 --- a/src/mldebug/work_dir.py +++ b/src/mldebug/work_dir.py @@ -169,7 +169,7 @@ def _get_lst(self, elf_path, elf_name): If self.dump_lst is True, writes the output listing to disk. """ lst_data = "" - exe = "llvm-objdump" + exe = "llvm-objdump.elf" archname = "aie2p" if is_windows(): exe = "llvm-objdump.exe"