Re: [PATCH v4 03/13] gpu: nova-core: gsp: replace BootUnloadGuard with local handlers
From: Eliot Courtney
Date: Tue Jun 30 2026 - 22:54:54 EST
On Mon Jun 29, 2026 at 11:09 PM JST, Alexandre Courbot wrote:
> When adding the GSP unload capability, we introduced `BootUnloadGuard`
> to automatically call `Gsp::unload` whenever an error occurred during
> the boot process, in order to try to reset the GSP to a valid state.
>
> This approach is not well-suited to the errors that may occur in HALs:
> by definition, an error occurring in the HAL means that the GSP is not
> booted; yet the first thing that `Gsp::unload` does is queue a shutdown
> message to the GSP, which will inevitably result in a timeout when done
> from a HAL.
>
> Furthermore, `BootUnloadGuard` is problematic because it holds
> additional references to the boot context, notably the `Falcon`s. These
> extra references stand in the way of making some of the `Falcon`'s
> methods mutable, since those methods would require exclusive access. As
> this behavior is only needed in one place, introducing dedicated types
> for it is distracting and unnecessary.
>
> Thus, remove `BootUnloadGuard` and adopt a two-level error handling
> strategy:
>
> - HALs are free to handle their errors as they see fit (most likely, by
> running their unload bundle if it is ready by the time of the error),
> - `Gsp::boot` uses a `ScopeGuard` that runs `Gsp::unload`, since the
> GSP should be up and running by the time `GspHal::boot` has returned.
>
> Signed-off-by: Alexandre Courbot <acourbot@xxxxxxxxxx>
> ---
> drivers/gpu/nova-core/gsp/boot.rs | 67 +++-------------------------------
> drivers/gpu/nova-core/gsp/hal.rs | 13 +++----
> drivers/gpu/nova-core/gsp/hal/gh100.rs | 20 ++++------
> drivers/gpu/nova-core/gsp/hal/tu102.rs | 23 +++++++-----
> 4 files changed, 33 insertions(+), 90 deletions(-)
>
> diff --git a/drivers/gpu/nova-core/gsp/boot.rs b/drivers/gpu/nova-core/gsp/boot.rs
> index ab0491b57944..536f2e341c01 100644
> --- a/drivers/gpu/nova-core/gsp/boot.rs
> +++ b/drivers/gpu/nova-core/gsp/boot.rs
> @@ -30,66 +30,6 @@
> },
> };
>
> -/// Arguments required to call [`Gsp::unload`](super::Gsp::unload).
> -///
> -/// Stored as their own type to avoid repeating a long and tedious list in [`BootUnloadGuard`].
> -pub(super) struct BootUnloadArgs<'a> {
> - gsp: &'a super::Gsp,
> - dev: &'a device::Device<device::Bound>,
> - bar: Bar0<'a>,
> - gsp_falcon: &'a Falcon<'a, Gsp>,
> - sec2_falcon: &'a Falcon<'a, Sec2>,
> - unload_bundle: Option<super::UnloadBundle>,
> -}
> -
> -/// Guard that calls [`Gsp::unload`](super::Gsp::unload) with a
> -/// [`UnloadBundle`](super::UnloadBundle) when dropped.
> -///
> -/// Used to ensure the `UnloadBundle` is run during failure paths.
> -pub(super) struct BootUnloadGuard<'a> {
> - guard: ScopeGuard<BootUnloadArgs<'a>, fn(BootUnloadArgs<'a>)>,
> -}
> -
> -impl<'a> BootUnloadGuard<'a> {
> - /// Wraps `unload_bundle` into a guard that executes it when dropped.
> - pub(super) fn new(
> - gsp: &'a super::Gsp,
> - dev: &'a device::Device<device::Bound>,
> - bar: Bar0<'a>,
> - gsp_falcon: &'a Falcon<'a, Gsp>,
> - sec2_falcon: &'a Falcon<'a, Sec2>,
> - unload_bundle: Option<super::UnloadBundle>,
> - ) -> Self {
> - Self {
> - guard: ScopeGuard::new_with_data(
> - BootUnloadArgs {
> - gsp,
> - dev,
> - bar,
> - gsp_falcon,
> - sec2_falcon,
> - unload_bundle,
> - },
> - |args| {
> - let _ = super::Gsp::unload(
> - args.gsp,
> - args.dev,
> - args.bar,
> - args.gsp_falcon,
> - args.sec2_falcon,
> - args.unload_bundle,
> - );
> - },
> - ),
> - }
> - }
> -
> - /// Disarms the guard and returns the [`UnloadBundle`](super::UnloadBundle) it contains.
> - pub(super) fn dismiss(self) -> Option<super::UnloadBundle> {
> - self.guard.dismiss().unload_bundle
> - }
> -}
> -
> impl super::Gsp {
> /// Attempt to boot the GSP.
> ///
> @@ -107,6 +47,7 @@ pub(crate) fn boot(
> let bar = ctx.bar;
> let chipset = ctx.chipset;
> let gsp_falcon = ctx.gsp_falcon;
> + let sec2_falcon = ctx.sec2_falcon;
> let dev = pdev.as_ref();
> let hal = super::hal::gsp_hal(chipset);
>
> @@ -118,7 +59,11 @@ pub(crate) fn boot(
> let wpr_meta = Coherent::init(dev, GFP_KERNEL, GspFwWprMeta::new(&gsp_fw, &fb_layout))?;
>
> // Perform the chipset-specific boot sequence, and retrieve the unload bundle.
> - let unload_guard = hal.boot(&self, &ctx, &fb_layout, &wpr_meta)?;
> + let unload_bundle = hal.boot(&self, &ctx, &fb_layout, &wpr_meta)?;
> +
> + let unload_guard = ScopeGuard::new_with_data(unload_bundle, |unload_bundle| {
> + let _ = self.unload(dev, bar, gsp_falcon, sec2_falcon, unload_bundle);
> + });
>
> gsp_falcon.write_os_version(gsp_fw.bootloader.app_version);
>
> diff --git a/drivers/gpu/nova-core/gsp/hal.rs b/drivers/gpu/nova-core/gsp/hal.rs
> index d3e47ef206de..851d1f24c137 100644
> --- a/drivers/gpu/nova-core/gsp/hal.rs
> +++ b/drivers/gpu/nova-core/gsp/hal.rs
> @@ -24,7 +24,6 @@
> Chipset, //
> },
> gsp::{
> - boot::BootUnloadGuard,
> Gsp,
> GspBootContext,
> GspFwWprMeta, //
> @@ -51,15 +50,15 @@ fn run(
> pub(super) trait GspHal: Send {
> /// Performs the GSP boot process, loading and running the required firmwares as needed.
> ///
> - /// Upon success, returns a guard that runs the GSP unload sequence if GSP boot does not
> - /// complete.
> - fn boot<'a>(
> + /// Upon success, returns the [`crate::gsp::UnloadBundle`] to use with [`Gsp::unload`], if one
> + /// could be created.
> + fn boot(
> &self,
> - gsp: &'a Gsp,
> - ctx: &GspBootContext<'a>,
> + gsp: &Gsp,
> + ctx: &GspBootContext<'_>,
> fb_layout: &FbLayout,
> wpr_meta: &Coherent<GspFwWprMeta>,
> - ) -> Result<BootUnloadGuard<'a>>;
> + ) -> Result<Option<crate::gsp::UnloadBundle>>;
>
> /// Performs HAL-specific post-GSP boot tasks.
> ///
> diff --git a/drivers/gpu/nova-core/gsp/hal/gh100.rs b/drivers/gpu/nova-core/gsp/hal/gh100.rs
> index 1d06405a32f6..5fe445d73599 100644
> --- a/drivers/gpu/nova-core/gsp/hal/gh100.rs
> +++ b/drivers/gpu/nova-core/gsp/hal/gh100.rs
> @@ -23,7 +23,6 @@
> Fsp, //
> },
> gsp::{
> - boot::BootUnloadGuard,
> hal::{
> GspHal,
> UnloadBundle, //
> @@ -143,27 +142,22 @@ impl GspHal for Gh100 {
> ///
> /// This path uses FSP to establish a chain of trust and boot GSP-FMC. FSP handles
> /// the GSP boot internally - no manual GSP reset/boot is needed.
> - fn boot<'a>(
> + fn boot(
> &self,
> - gsp: &'a Gsp,
> - ctx: &GspBootContext<'a>,
> + gsp: &Gsp,
> + ctx: &GspBootContext<'_>,
> fb_layout: &FbLayout,
> wpr_meta: &Coherent<GspFwWprMeta>,
> - ) -> Result<BootUnloadGuard<'a>> {
> + ) -> Result<Option<crate::gsp::UnloadBundle>> {
> let dev = ctx.dev();
> let bar = ctx.bar;
> let chipset = ctx.chipset;
> let gsp_falcon = ctx.gsp_falcon;
> - let sec2_falcon = ctx.sec2_falcon;
>
> let unload_bundle = crate::gsp::UnloadBundle(
> KBox::new(FspUnloadBundle, GFP_KERNEL)? as KBox<dyn UnloadBundle>
> );
>
> - // Wrap the unload bundle into a drop guard so it is automatically run upon failure.
> - let unload_guard =
> - BootUnloadGuard::new(gsp, dev, bar, gsp_falcon, sec2_falcon, Some(unload_bundle));
> -
> let mut fsp = Fsp::wait_secure_boot(dev, bar, chipset)?;
>
> let args = FmcBootArgs::new(
> @@ -174,11 +168,13 @@ fn boot<'a>(
> false,
> )?;
>
> - fsp.boot_fmc(dev, fb_layout, &args)?;
> + // Keep the result as we want to wait for lockdown release even in case of error, to make
> + // sure `args` is not accessed by the GSP anymore.
> + let res = fsp.boot_fmc(dev, fb_layout, &args);
In the error case this no longer runs the unload bundle (which waits for
GSP halt). But, we need to wait for GSP halt to ensure stuff is properly
torn down (e.g. in [1], not needing to wait on FSP queue is predicated
on GSP riscv actually halting, to avoid an issue on reprobe, IIUC).
Separately, are you sure that GSP lockdown will release in an error
case? I'm worried that we'll just timeout here, which while it probably
works in that it waits long enough, is semantically a bit weird.
What about using a ScopeGuard that runs the unload_bundle? That way we
will have waited for GSP halt. Later, we could add additional HAL
specific common code for trying to reset GSP in case it doesn't halt.
WDYT?
[1]: https://lore.kernel.org/all/DJAZIGJQ72A9.1688NC74RA4SY@xxxxxxxxxx/
>
> wait_for_gsp_lockdown_release(dev, gsp_falcon,
> args.boot_params_dma_handle())?;
>
> - Ok(unload_guard) + res.map(|()| Some(unload_bundle)) }
> }
>
> diff --git a/drivers/gpu/nova-core/gsp/hal/tu102.rs
> b/drivers/gpu/nova-core/gsp/hal/tu102.rs index
> 87ceb8878f01..f78e2489f5a6 100644 ---
> a/drivers/gpu/nova-core/gsp/hal/tu102.rs +++
> b/drivers/gpu/nova-core/gsp/hal/tu102.rs @@ -6,7 +6,8 @@ use kernel::{
> device, dma::Coherent, - io::Io, // + io::Io, +
> types::ScopeGuard, // };
>
> use crate::{ @@ -32,7 +33,6 @@ }, gpu::Chipset, gsp::{ -
> boot::BootUnloadGuard, hal::{ GspHal, UnloadBundle, // @@ -259,13
> +259,13 @@ fn run_fwsec_frts( struct Tu102;
>
> impl GspHal for Tu102 { - fn boot<'a>(
> + fn boot(
> &self,
> - gsp: &'a Gsp,
> - ctx: &GspBootContext<'a>,
> + gsp: &Gsp,
> + ctx: &GspBootContext<'_>,
> fb_layout: &FbLayout,
> wpr_meta: &Coherent<GspFwWprMeta>,
> - ) -> Result<BootUnloadGuard<'a>> {
> + ) -> Result<Option<crate::gsp::UnloadBundle>> {
> let dev = ctx.dev();
> let bar = ctx.bar;
> let chipset = ctx.chipset;
> @@ -290,9 +290,12 @@ fn boot<'a>(
> .ok()
> .map(crate::gsp::UnloadBundle);
>
> - // Wrap the unload bundle into a drop guard so it is automatically run upon failure.
> - let unload_guard =
> - BootUnloadGuard::new(gsp, dev, bar, gsp_falcon, sec2_falcon, unload_bundle);
> + // Run the unload bundle to try and recover the GSP if an error occurs.
> + let unload_guard = ScopeGuard::new_with_data(unload_bundle, |unload_bundle| {
> + if let Some(unload_bundle) = unload_bundle {
> + let _ = unload_bundle.0.run(dev, bar, gsp_falcon, sec2_falcon);
> + }
> + });
>
> // FWSEC-FRTS is not executed on chips where the FRTS region size is 0 (e.g. GA100).
> if !fb_layout.frts.is_empty() {
> @@ -319,7 +322,7 @@ fn boot<'a>(
> )?
> .run(dev, sec2_falcon, wpr_meta)?;
>
> - Ok(unload_guard)
> + Ok(unload_guard.dismiss())
> }
>
> fn post_boot(&self, gsp: &Gsp, ctx: &GspBootContext<'_>, gsp_fw: &GspFirmware) -> Result {