diff --git a/CLAUDE.md b/CLAUDE.md index fc154ba..601c11d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -536,6 +536,7 @@ Before implementing: - No "flexibility" or "configurability" that wasn't requested. - No error handling for impossible scenarios. - If you write 200 lines and it could be 50, rewrite it. +- Try to keep docstrings short to medium length. Ask yourself: "Would a senior engineer say this is overcomplicated?" If yes, simplify. diff --git a/src/mldebug/backend/core_dump_impl.py b/src/mldebug/backend/core_dump_impl.py index f38e8ca..50e9268 100644 --- a/src/mldebug/backend/core_dump_impl.py +++ b/src/mldebug/backend/core_dump_impl.py @@ -8,7 +8,7 @@ import struct from pathlib import Path from mldebug.utils import print_tile_grid -from mldebug.arch import AIE_DEV_PHX, AIE_DEV_STX, AIE_DEV_TEL, AIE_DEV_NPU3, load_aie_arch +from mldebug.arch import AIE_DEV_PHX, AIE_DEV_STX, AIE_DEV_TEL, AIE_DEV_NPU3 from .backend_interface import BackendInterface try: @@ -62,7 +62,7 @@ class CoreDumpFallbackReader: Pure Python fallback implementation for reading core dump files. Replicates the C++ CoreDumpDataAccessBackend logic. """ - def __init__(self, core_dump_file, dev_name, no_header=False, args=None): + def __init__(self, core_dump_file, dev_name, no_header=False): """ Initialize the fallback reader @@ -70,20 +70,15 @@ def __init__(self, core_dump_file, dev_name, no_header=False, args=None): core_dump_file (str): Path to the binary core dump file dev_name (str): Device name (phx, stx, telluride, npu3) no_header (bool): If True, skip header parsing and treat data as starting at offset 0 - args: Used to update device and aie_iface. """ self.filename = core_dump_file self.dev_name = dev_name.lower() - self.args = args self.file_handle = None - # Without a header to parse, we have no way to recover from an unknown device name. - # With a header, _parse_header() will detect/override dev_name and metadata. - if no_header and self.dev_name not in DEVICE_CONFIGS: + if self.dev_name not in DEVICE_CONFIGS: raise ValueError(f"Unknown device: {dev_name}. Supported: {list(DEVICE_CONFIGS.keys())}") - # Provisional metadata; may be replaced by _parse_header() based on header hwGen. - self.metadata = DEVICE_CONFIGS.get(self.dev_name) + self.metadata = DEVICE_CONFIGS[self.dev_name] self.header_size = 256 # Default header size # Open the binary dump file @@ -119,9 +114,11 @@ def __del__(self): if self.file_handle: self.file_handle.close() - def _parse_header(self): + @staticmethod + def peek_device(filename): """ - Parse the core dump file header. + Read the core dump header, print its contents, and return the device name. + Header structure (from C++ coreDumpHeader): - char magicNumber[4]: "NPU" (4 bytes) - uint32_t versionNum: Version number (4 bytes) @@ -132,12 +129,57 @@ def _parse_header(self): - uint8_t memTileRows: Number of memory tile rows (1 byte) - uint8_t totalNumRows: Total number of rows (1 byte) - uint8_t totalNumCols: Total number of columns (1 byte) + + Returns the matching device name from DEVICE_CONFIGS, or None if the file + is missing/unreadable, lacks the "NPU" magic, or has an unknown hwGen. + """ + if not filename or not Path(filename).exists(): + return None + try: + with open(filename, "rb") as f: + magic = f.read(4) + if magic[:3] != b"NPU": + return None + header = f.read(14) + if len(header) != 14: + return None + except OSError: + return None + + version_num, header_size = struct.unpack(" 1024 * 1024: # Between 18 bytes and 1MB + if self.header_size < 18 or self.header_size > 1024 * 1024: raise ValueError(f"Invalid header size in core dump: {self.header_size} bytes (expected 18-1048576)") - # Read device metadata (6 bytes) - metadata_data = self.file_handle.read(6) - if len(metadata_data) != 6: - raise RuntimeError("Core dump file is corrupted: cannot read device metadata") - - hw_gen, core_row_start, mem_row_start, mem_tile_rows, total_rows, total_cols = ( - struct.unpack(" None: + def __init__(self, aie_overlay_tiles, ctx_id, pid, dev_name, core_dump_file=None, no_header=False) -> None: """ Initialize the Core Dump backend @@ -358,7 +344,6 @@ def __init__(self, aie_overlay_tiles, ctx_id, pid, dev_name, core_dump_file=None core_dump_file: Path to core dump file (required) no_header: If True, parse core dump assuming no header (data starts at offset 0). Forces use of the Python fallback reader. - args: Used for device management """ self.overlay_aie_core_tiles = aie_overlay_tiles self.pc_brkpts = [0, 0] @@ -379,11 +364,11 @@ def __init__(self, aie_overlay_tiles, ctx_id, pid, dev_name, core_dump_file=None try: self.binding = MlDebug(list(self.overlay_aie_core_tiles), ctx_id, pid, dev_name, "debuglibrary", core_dump_file) print("[INFO] Core Dump backend initialized with C++ DebugLibrary") - except ImportError: + except (ImportError, TypeError): self.use_fallback = True if self.use_fallback: - self.fallback_reader = CoreDumpFallbackReader(core_dump_file, dev_name, no_header=no_header, args=args) + self.fallback_reader = CoreDumpFallbackReader(core_dump_file, dev_name, no_header=no_header) print("[INFO] Core Dump backend is read-only. Write/control operations will be ignored.") diff --git a/src/mldebug/backend/factory.py b/src/mldebug/backend/factory.py index 4bda2e1..44def34 100644 --- a/src/mldebug/backend/factory.py +++ b/src/mldebug/backend/factory.py @@ -70,5 +70,4 @@ def create_backend(backend_type, config): return core_dump_mod.CoreDumpImpl( config.tiles, config.ctx_id, config.pid, config.device, core_dump_file=config.core_dump_file, no_header=config.no_header, - args=config.args, ) diff --git a/src/mldebug/input_parser.py b/src/mldebug/input_parser.py index 2880fff..e397f74 100644 --- a/src/mldebug/input_parser.py +++ b/src/mldebug/input_parser.py @@ -17,6 +17,7 @@ import re from mldebug.arch import load_aie_arch, AIE_DEV_PHX, AIE_DEV_STX, AIE_DEV_TEL +from mldebug.backend.core_dump_impl import CoreDumpFallbackReader from mldebug.utils import LOGGER, is_aarch64, is_windows @dataclass @@ -200,6 +201,15 @@ def set_device(args) -> None: endmsg = "\n" if not args.device: endmsg = " Use -d to specify a diferent device.\n" + # For core dumps, the device is baked into the file header. Detect it now + # so the overlay (built before the backend) uses the correct aie_iface. + if getattr(args, "core_dump", None) and not getattr(args, "no_header", False): + cd_dev = CoreDumpFallbackReader.peek_device(args.core_dump) + if cd_dev: + args.device = cd_dev + print(f"[INFO] Using AIE Device: {args.device} (detected from core dump header).") + return + # if on ARM, default is telluride else STX args.device = AIE_DEV_TEL if is_aarch64() else AIE_DEV_STX genstr = "XAIE_DEV_GEN_AIE2P"