From c68855d9a34b9151c7f36e28b7796fba65c63a06 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Mon, 4 May 2026 14:09:52 +0100 Subject: [PATCH] gh-149202: Fix frame pointer unwinding on s390x and ARM -fno-omit-frame-pointer is not enough to make every target walkable by the simple manual frame pointer unwinder. The helper used by test_frame_pointer_unwind used to assume the frame pointer named a two-word record where fp[0] was the previous frame pointer and fp[1] was the return address. That is only the generic layout used by some targets. This patch keeps that default, but moves the slots behind named offsets so architecture-specific layouts can describe where the backchain and return address really live. On s390x, GCC and Clang do not emit a usable backchain unless -mbackchain is also enabled. Without it, the unwinder stops at the current C frame and the test reports no Python frames. Once backchains are present, the helper must also stop at the current thread's known C stack bounds; otherwise it can follow the final backchain far enough to dereference an invalid frame and segfault. For Linux s390x backchain frames, the documented z/Architecture stack-frame layout saves r14, the return-address register, at byte offset 112 from the frame pointer, so read the return address from that named slot instead of fp[1]. The 112-byte offset comes from Linux's s390 debugging documentation: its Stack Frame Layout table shows z/Architecture backchain frames with the backchain at offset 0 and saved r14 of the caller function at offset 112: https://www.kernel.org/doc/html/v5.3/s390/debugging390.html#stack-frame-layout This helper remains scoped to Linux s390x backchain frames. GNU SFrame's s390x notes state that the s390x ELF ABI does not generally mandate where RA and FP are saved, or whether they are saved at all: https://sourceware.org/binutils/docs/sframe-spec.html#s390x On 32-bit ARM, GCC defaults to Thumb mode on common armhf toolchains. The Thumb prologue keeps the saved frame pointer and link register at offsets that depend on the generated frame, which breaks the fp[0]/fp[1] walk used by the helper. Use -marm when it is supported for frame-pointer builds, and teach the helper the GCC ARM-mode slots where the previous frame pointer is at fp[-1] and the saved LR return address is at fp[0]. --- Doc/howto/perf_profiling.rst | 4 +- Doc/using/configure.rst | 8 +- Doc/whatsnew/3.15.rst | 5 +- ...-04-05-16-10-00.gh-issue-149202.W8sQeR.rst | 5 +- Modules/_testinternalcapi.c | 149 ++++++++++++++++-- configure | 100 ++++++++++++ configure.ac | 10 ++ 7 files changed, 257 insertions(+), 24 deletions(-) diff --git a/Doc/howto/perf_profiling.rst b/Doc/howto/perf_profiling.rst index 653f28ddbabfa4..85051cdfc75f98 100644 --- a/Doc/howto/perf_profiling.rst +++ b/Doc/howto/perf_profiling.rst @@ -219,7 +219,9 @@ How to obtain the best results For best results, keep frame pointers enabled. On supported GCC-compatible toolchains, CPython builds itself with ``-fno-omit-frame-pointer`` and, when -available, ``-mno-omit-leaf-frame-pointer`` by default. These flags allow +available, ``-mno-omit-leaf-frame-pointer`` by default. On 32-bit ARM, +CPython also adds ``-marm`` when supported. On s390 platforms, CPython also +adds ``-mbackchain`` when supported. These flags allow profilers to unwind using only the frame pointer and not on DWARF debug information. This is because as the code that is interposed to allow ``perf`` support is dynamically generated it doesn't have any DWARF debugging information diff --git a/Doc/using/configure.rst b/Doc/using/configure.rst index 086f6bfa22ad4a..eb9a44da7bdcf4 100644 --- a/Doc/using/configure.rst +++ b/Doc/using/configure.rst @@ -784,9 +784,11 @@ also be used to improve performance. Disable frame pointers, which are enabled by default (see :pep:`831`). - By default, the build appends ``-fno-omit-frame-pointer`` (and - ``-mno-omit-leaf-frame-pointer`` when the compiler supports it) to - ``BASECFLAGS`` so profilers, debuggers, and system tracing tools + By default, the build appends ``-fno-omit-frame-pointer``, + ``-mno-omit-leaf-frame-pointer`` when the compiler supports it, + ``-marm`` on 32-bit ARM when supported, and ``-mbackchain`` on s390 + platforms when supported, to ``BASECFLAGS`` so + profilers, debuggers, and system tracing tools (``perf``, ``eBPF``, ``dtrace``, ``gdb``) can walk the C call stack without DWARF metadata. The flags propagate to third-party C extensions through :mod:`sysconfig`. On compilers that do not diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 586a1306d83c4c..5b846779847f2b 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -2305,8 +2305,9 @@ Build changes (:pep:`831`). Pass :option:`--without-frame-pointers` to opt out. Authors of C extensions and native libraries built with custom build systems should add ``-fno-omit-frame-pointer`` and - ``-mno-omit-leaf-frame-pointer`` to their own ``CFLAGS`` to keep the - unwind chain intact. + ``-mno-omit-leaf-frame-pointer`` to their own ``CFLAGS``, + ``-marm`` on 32-bit ARM, and ``-mbackchain`` on s390 platforms, + to keep the unwind chain intact. (Contributed by Pablo Galindo Salgado and Savannah Ostrowski in :gh:`149201`.) .. _whatsnew315-windows-tail-calling-interpreter: diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-04-05-16-10-00.gh-issue-149202.W8sQeR.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-05-16-10-00.gh-issue-149202.W8sQeR.rst index f82ca91f5ba000..9c8bd5deee832c 100644 --- a/Misc/NEWS.d/next/Core_and_Builtins/2026-04-05-16-10-00.gh-issue-149202.W8sQeR.rst +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-05-16-10-00.gh-issue-149202.W8sQeR.rst @@ -1,4 +1,5 @@ Enable frame pointers by default for GCC-compatible CPython builds, including -``-mno-omit-leaf-frame-pointer`` when the compiler supports it, so profilers -and debuggers can unwind native interpreter frames more reliably. Users can pass +``-mno-omit-leaf-frame-pointer``, ``-marm`` on 32-bit ARM, and ``-mbackchain`` +on s390 platforms when the compiler supports them, so profilers and debuggers +can unwind native interpreter frames more reliably. Users can pass ``--without-frame-pointers`` to opt out. diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index a07675bb66d8cc..1048248f8dd82a 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -59,6 +59,40 @@ static const uintptr_t min_frame_pointer_addr = 0x1000; +#ifdef __s390x__ +// Linux's s390 "Stack Frame Layout" table documents that z/Architecture +// backchain frames start with the backchain at offset 0 and store "saved r14 +// of caller function" at offset 112. The same document's register table +// identifies r14 as the return-address register, so this backchain unwinder +// reads the return address from fp + 112. +// https://www.kernel.org/doc/html/v5.3/s390/debugging390.html#stack-frame-layout +// +// This is only for Linux s390x backchain frames. The s390x ELF ABI does not +// generally mandate where RA and FP are saved, or whether they are saved at all. +// https://sourceware.org/binutils/docs/sframe-spec.html#s390x +# define S390X_FRAME_RETURN_ADDRESS_OFFSET 112 +#endif + +// The generic manual unwinder treats the frame pointer as a two-word record: +// fp[0] is the previous frame pointer and fp[1] is the return address. That is +// not true for every architecture, even with frame pointers enabled, so these +// offsets describe the actual slots used by each supported frame layout. +#if defined(__arm__) && !defined(__thumb__) && !defined(__clang__) +// GCC ARM mode keeps the caller's fp one word below fp and the saved LR at +// fp[0], so the return address is not in the generic fp[1] slot. +# define FRAME_POINTER_NEXT_OFFSET (-1) +# define FRAME_POINTER_RETURN_OFFSET 0 +#elif defined(__s390x__) +// s390x backchain frames keep the previous frame pointer at fp[0], but save the +// return-address register in the ABI register save area rather than fp[1]. +# define FRAME_POINTER_NEXT_OFFSET 0 +# define FRAME_POINTER_RETURN_OFFSET \ + (S390X_FRAME_RETURN_ADDRESS_OFFSET / (Py_ssize_t)sizeof(uintptr_t)) +#else +# define FRAME_POINTER_NEXT_OFFSET 0 +# define FRAME_POINTER_RETURN_OFFSET 1 +#endif + static PyObject * _get_current_module(void) @@ -325,16 +359,97 @@ get_jit_backend(PyObject *self, PyObject *Py_UNUSED(args)) #endif } +static int +stack_address_is_valid(uintptr_t addr, uintptr_t stack_min, uintptr_t stack_max) +{ + if (addr < min_frame_pointer_addr) { + return 0; + } + if (stack_min != 0 && (addr < stack_min || addr >= stack_max)) { + return 0; + } + return 1; +} + +static int +frame_pointer_slot_is_valid(uintptr_t *frame_pointer, Py_ssize_t offset, + uintptr_t stack_min, uintptr_t stack_max) +{ + uintptr_t fp_addr = (uintptr_t)frame_pointer; + uintptr_t slot_addr; + uintptr_t delta = (uintptr_t)Py_ABS(offset) * sizeof(uintptr_t); + if (offset < 0) { + if (fp_addr < delta) { + return 0; + } + slot_addr = fp_addr - delta; + } + else { + if (fp_addr > UINTPTR_MAX - delta) { + return 0; + } + slot_addr = fp_addr + delta; + } + if (!stack_address_is_valid(slot_addr, stack_min, stack_max)) { + return 0; + } + if (stack_max != 0) { + if (slot_addr > UINTPTR_MAX - sizeof(uintptr_t)) { + return 0; + } + if (slot_addr + sizeof(uintptr_t) > stack_max) { + return 0; + } + } + return 1; +} + +static int +next_frame_pointer_is_valid(uintptr_t *frame_pointer, uintptr_t *next_fp, + uintptr_t stack_min, uintptr_t stack_max) +{ + uintptr_t fp_addr = (uintptr_t)frame_pointer; + uintptr_t next_addr = (uintptr_t)next_fp; + if (!stack_address_is_valid(next_addr, stack_min, stack_max)) { + return 0; + } + if ((next_addr % sizeof(uintptr_t)) != 0) { + return 0; + } +#if _Py_STACK_GROWS_DOWN + return next_addr > fp_addr; +#else + return next_addr < fp_addr; +#endif +} + static PyObject * manual_unwind_from_fp(uintptr_t *frame_pointer) { Py_ssize_t max_depth = 200; - int stack_grows_down = _Py_STACK_GROWS_DOWN; + uintptr_t stack_min = 0; + uintptr_t stack_max = 0; + +#ifdef __s390x__ + Py_BUILD_ASSERT(S390X_FRAME_RETURN_ADDRESS_OFFSET % sizeof(uintptr_t) == 0); +#endif if (frame_pointer == NULL) { return PyList_New(0); } + PyThreadState *tstate = _PyThreadState_GET(); + if (tstate != NULL) { + _PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate; +#if _Py_STACK_GROWS_DOWN + stack_min = tstate_impl->c_stack_hard_limit; + stack_max = tstate_impl->c_stack_top; +#else + stack_min = tstate_impl->c_stack_top; + stack_max = tstate_impl->c_stack_hard_limit; +#endif + } + PyObject *result = PyList_New(0); if (result == NULL) { return NULL; @@ -348,7 +463,21 @@ manual_unwind_from_fp(uintptr_t *frame_pointer) if ((fp_addr % sizeof(uintptr_t)) != 0) { break; } - uintptr_t return_addr = frame_pointer[1]; + if (!stack_address_is_valid(fp_addr, stack_min, stack_max)) { + break; + } + if (!frame_pointer_slot_is_valid(frame_pointer, + FRAME_POINTER_NEXT_OFFSET, + stack_min, stack_max)) { + break; + } + if (!frame_pointer_slot_is_valid(frame_pointer, + FRAME_POINTER_RETURN_OFFSET, + stack_min, stack_max)) { + break; + } + uintptr_t *next_fp = (uintptr_t *)frame_pointer[FRAME_POINTER_NEXT_OFFSET]; + uintptr_t return_addr = frame_pointer[FRAME_POINTER_RETURN_OFFSET]; PyObject *addr_obj = PyLong_FromUnsignedLongLong(return_addr); if (addr_obj == NULL) { @@ -362,22 +491,10 @@ manual_unwind_from_fp(uintptr_t *frame_pointer) } Py_DECREF(addr_obj); - uintptr_t *next_fp = (uintptr_t *)frame_pointer[0]; - // Stop if the frame pointer is extremely low. - if ((uintptr_t)next_fp < min_frame_pointer_addr) { + if (!next_frame_pointer_is_valid(frame_pointer, next_fp, + stack_min, stack_max)) { break; } - uintptr_t next_addr = (uintptr_t)next_fp; - if (stack_grows_down) { - if (next_addr <= fp_addr) { - break; - } - } - else { - if (next_addr >= fp_addr) { - break; - } - } frame_pointer = next_fp; } diff --git a/configure b/configure index 734aa3a6a721d1..561d820a782f4c 100755 --- a/configure +++ b/configure @@ -10343,6 +10343,106 @@ else case e in #( esac fi + case $host_cpu in #( + arm|armv*) : + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -marm" >&5 +printf %s "checking whether C compiler accepts -marm... " >&6; } +if test ${ax_cv_check_cflags__Werror__marm+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) + ax_check_save_flags=$CFLAGS + CFLAGS="$CFLAGS -Werror -marm" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + ax_cv_check_cflags__Werror__marm=yes +else case e in #( + e) ax_cv_check_cflags__Werror__marm=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + CFLAGS=$ax_check_save_flags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags__Werror__marm" >&5 +printf "%s\n" "$ax_cv_check_cflags__Werror__marm" >&6; } +if test "x$ax_cv_check_cflags__Werror__marm" = xyes +then : + + frame_pointer_cflags="$frame_pointer_cflags -marm" + +else case e in #( + e) : ;; +esac +fi + + ;; #( + *) : + ;; +esac + case $host_cpu in #( + s390*) : + + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -mbackchain" >&5 +printf %s "checking whether C compiler accepts -mbackchain... " >&6; } +if test ${ax_cv_check_cflags__Werror__mbackchain+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) + ax_check_save_flags=$CFLAGS + CFLAGS="$CFLAGS -Werror -mbackchain" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + ax_cv_check_cflags__Werror__mbackchain=yes +else case e in #( + e) ax_cv_check_cflags__Werror__mbackchain=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + CFLAGS=$ax_check_save_flags ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags__Werror__mbackchain" >&5 +printf "%s\n" "$ax_cv_check_cflags__Werror__mbackchain" >&6; } +if test "x$ax_cv_check_cflags__Werror__mbackchain" = xyes +then : + + frame_pointer_cflags="$frame_pointer_cflags -mbackchain" + +else case e in #( + e) : ;; +esac +fi + + ;; #( + *) : + ;; +esac else case e in #( e) : ;; diff --git a/configure.ac b/configure.ac index c8cb1686d55c07..3fccdd2a24b326 100644 --- a/configure.ac +++ b/configure.ac @@ -2548,6 +2548,16 @@ AS_VAR_IF([ac_cv_gcc_compat], [yes], [ AX_CHECK_COMPILE_FLAG([-mno-omit-leaf-frame-pointer], [ frame_pointer_cflags="$frame_pointer_cflags -mno-omit-leaf-frame-pointer" ], [], [-Werror]) + AS_CASE([$host_cpu], [arm|armv*], [ + AX_CHECK_COMPILE_FLAG([-marm], [ + frame_pointer_cflags="$frame_pointer_cflags -marm" + ], [], [-Werror]) + ]) + AS_CASE([$host_cpu], [s390*], [ + AX_CHECK_COMPILE_FLAG([-mbackchain], [ + frame_pointer_cflags="$frame_pointer_cflags -mbackchain" + ], [], [-Werror]) + ]) ], [], [-Werror]) if test -n "$frame_pointer_cflags" && test "x$with_frame_pointers" != xno; then BASECFLAGS="$frame_pointer_cflags $BASECFLAGS"