[PATCH v7 4/4] gpu: nova-core: gsp: run the unload bundle if Gsp::boot() fails

From: Alexandre Courbot

Date: Fri May 29 2026 - 03:36:01 EST


If `Gsp::boot` fails, the GSP can be left in a state where boot cannot
be attempted again unless it is reset first.

To avoid this, we want to run the unload bundle whenever `boot` fails to
try and clear the partially-initialized state.

Do this by wrapping the unload bundle into a drop guard up until `boot`
returns. After that, running the unload bundle becomes the
responsibility of the caller.

Signed-off-by: Alexandre Courbot <acourbot@xxxxxxxxxx>
---
drivers/gpu/nova-core/gsp/boot.rs | 67 ++++++++++++++++++++++++++++++++--
drivers/gpu/nova-core/gsp/hal.rs | 19 +++++-----
drivers/gpu/nova-core/gsp/hal/gh100.rs | 15 ++++----
drivers/gpu/nova-core/gsp/hal/tu102.rs | 31 ++++++++++------
4 files changed, 101 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/nova-core/gsp/boot.rs b/drivers/gpu/nova-core/gsp/boot.rs
index 8d6fcc35b653..1f83f63ceeb0 100644
--- a/drivers/gpu/nova-core/gsp/boot.rs
+++ b/drivers/gpu/nova-core/gsp/boot.rs
@@ -8,7 +8,8 @@
io::poll::read_poll_timeout,
pci,
prelude::*,
- time::Delta, //
+ time::Delta,
+ types::ScopeGuard, //
};

use crate::{
@@ -31,6 +32,66 @@
},
};

+/// Arguments required to call [`Gsp::unload`](super::Gsp::unload).
+///
+/// Stored as their own type to avoid repeating a long and tedious list in [`BootUnloadGuard`].
+pub(super) struct BootUnloadArgs<'a> {
+ gsp: &'a super::Gsp,
+ dev: &'a device::Device<device::Bound>,
+ bar: &'a Bar0,
+ gsp_falcon: &'a Falcon<Gsp>,
+ sec2_falcon: &'a Falcon<Sec2>,
+ unload_bundle: Option<super::UnloadBundle>,
+}
+
+/// Guard that calls [`Gsp::unload`](super::Gsp::unload) with a
+/// [`UnloadBundle`](super::UnloadBundle) when dropped.
+///
+/// Used to ensure the `UnloadBundle` is run during failure paths.
+pub(super) struct BootUnloadGuard<'a> {
+ guard: ScopeGuard<BootUnloadArgs<'a>, fn(BootUnloadArgs<'a>)>,
+}
+
+impl<'a> BootUnloadGuard<'a> {
+ /// Wraps `unload_bundle` into a guard that executes it when dropped.
+ pub(super) fn new(
+ gsp: &'a super::Gsp,
+ dev: &'a device::Device<device::Bound>,
+ bar: &'a Bar0,
+ gsp_falcon: &'a Falcon<Gsp>,
+ sec2_falcon: &'a Falcon<Sec2>,
+ unload_bundle: Option<super::UnloadBundle>,
+ ) -> Self {
+ Self {
+ guard: ScopeGuard::new_with_data(
+ BootUnloadArgs {
+ gsp,
+ dev,
+ bar,
+ gsp_falcon,
+ sec2_falcon,
+ unload_bundle,
+ },
+ |args| {
+ let _ = super::Gsp::unload(
+ args.gsp,
+ args.dev,
+ args.bar,
+ args.gsp_falcon,
+ args.sec2_falcon,
+ args.unload_bundle,
+ );
+ },
+ ),
+ }
+ }
+
+ /// Disarms the guard and returns the [`UnloadBundle`](super::UnloadBundle) it contains.
+ pub(super) fn dismiss(self) -> Option<super::UnloadBundle> {
+ self.guard.dismiss().unload_bundle
+ }
+}
+
impl super::Gsp {
/// Attempt to boot the GSP.
///
@@ -59,7 +120,7 @@ pub(crate) fn boot(
let wpr_meta = Coherent::init(dev, GFP_KERNEL, GspFwWprMeta::new(&gsp_fw, &fb_layout))?;

// Perform the chipset-specific boot sequence, and retrieve the unload bundle.
- let unload_bundle = hal.boot(
+ let unload_guard = hal.boot(
&self,
dev,
bar,
@@ -99,7 +160,7 @@ pub(crate) fn boot(
Err(e) => dev_warn!(pdev, "GPU name unavailable: {:?}\n", e),
}

- Ok(unload_bundle)
+ Ok(unload_guard.dismiss())
}

/// Shut down the GSP and wait until it is offline.
diff --git a/drivers/gpu/nova-core/gsp/hal.rs b/drivers/gpu/nova-core/gsp/hal.rs
index 501b852dcb29..88fc3e791114 100644
--- a/drivers/gpu/nova-core/gsp/hal.rs
+++ b/drivers/gpu/nova-core/gsp/hal.rs
@@ -25,6 +25,7 @@
Chipset, //
},
gsp::{
+ boot::BootUnloadGuard,
Gsp,
GspFwWprMeta, //
},
@@ -50,20 +51,20 @@ fn run(
pub(super) trait GspHal: Send {
/// Performs the GSP boot process, loading and running the required firmwares as needed.
///
- /// Upon success, returns the [`UnloadBundle`] to be run (if any) in order to properly reset the
- /// GSP after it has been stopped.
+ /// Upon success, returns a guard that runs the GSP unload sequence if GSP boot does not
+ /// complete.
#[allow(clippy::too_many_arguments)]
- fn boot(
+ fn boot<'a>(
&self,
- gsp: &Gsp,
- dev: &device::Device<device::Bound>,
- bar: &Bar0,
+ gsp: &'a Gsp,
+ dev: &'a device::Device<device::Bound>,
+ bar: &'a Bar0,
chipset: Chipset,
fb_layout: &FbLayout,
wpr_meta: &Coherent<GspFwWprMeta>,
- gsp_falcon: &Falcon<GspEngine>,
- sec2_falcon: &Falcon<Sec2>,
- ) -> Result<Option<crate::gsp::UnloadBundle>>;
+ gsp_falcon: &'a Falcon<GspEngine>,
+ sec2_falcon: &'a Falcon<Sec2>,
+ ) -> Result<BootUnloadGuard<'a>>;

/// Performs HAL-specific post-GSP boot tasks.
///
diff --git a/drivers/gpu/nova-core/gsp/hal/gh100.rs b/drivers/gpu/nova-core/gsp/hal/gh100.rs
index 0a8b7f763883..9a4bb22578b3 100644
--- a/drivers/gpu/nova-core/gsp/hal/gh100.rs
+++ b/drivers/gpu/nova-core/gsp/hal/gh100.rs
@@ -18,6 +18,7 @@
fb::FbLayout,
gpu::Chipset,
gsp::{
+ boot::BootUnloadGuard,
hal::GspHal,
Gsp,
GspFwWprMeta, //
@@ -31,17 +32,17 @@ impl GspHal for Gh100 {
///
/// This path uses FSP to establish a chain of trust and boot GSP-FMC. FSP handles
/// the GSP boot internally - no manual GSP reset/boot is needed.
- fn boot(
+ fn boot<'a>(
&self,
- _gsp: &Gsp,
- _dev: &device::Device<device::Bound>,
- _bar: &Bar0,
+ _gsp: &'a Gsp,
+ _dev: &'a device::Device<device::Bound>,
+ _bar: &'a Bar0,
_chipset: Chipset,
_fb_layout: &FbLayout,
_wpr_meta: &Coherent<GspFwWprMeta>,
- _gsp_falcon: &Falcon<GspEngine>,
- _sec2_falcon: &Falcon<Sec2>,
- ) -> Result<Option<crate::gsp::UnloadBundle>> {
+ _gsp_falcon: &'a Falcon<GspEngine>,
+ _sec2_falcon: &'a Falcon<Sec2>,
+ ) -> Result<BootUnloadGuard<'a>> {
Err(ENOTSUPP)
}
}
diff --git a/drivers/gpu/nova-core/gsp/hal/tu102.rs b/drivers/gpu/nova-core/gsp/hal/tu102.rs
index c4ab081f25c4..6a27e7e90279 100644
--- a/drivers/gpu/nova-core/gsp/hal/tu102.rs
+++ b/drivers/gpu/nova-core/gsp/hal/tu102.rs
@@ -32,6 +32,7 @@
},
gpu::Chipset,
gsp::{
+ boot::BootUnloadGuard,
hal::{
GspHal,
UnloadBundle, //
@@ -254,21 +255,23 @@ fn run_fwsec_frts(
struct Tu102;

impl GspHal for Tu102 {
- fn boot(
+ fn boot<'a>(
&self,
- gsp: &Gsp,
- dev: &device::Device<device::Bound>,
- bar: &Bar0,
+ gsp: &'a Gsp,
+ dev: &'a device::Device<device::Bound>,
+ bar: &'a Bar0,
chipset: Chipset,
fb_layout: &FbLayout,
wpr_meta: &Coherent<GspFwWprMeta>,
- gsp_falcon: &Falcon<GspEngine>,
- sec2_falcon: &Falcon<Sec2>,
- ) -> Result<Option<crate::gsp::UnloadBundle>> {
+ gsp_falcon: &'a Falcon<GspEngine>,
+ sec2_falcon: &'a Falcon<Sec2>,
+ ) -> Result<BootUnloadGuard<'a>> {
let bios = Vbios::new(dev, bar)?;

- // Try and prepare the unload bundle. If this fails, the GPU will need to be reset
- // before the driver can be probed again.
+ // Try and prepare the unload bundle.
+ //
+ // If the unload bundle creation fails, the GPU will need to be reset before the driver can
+ // be probed again.
let unload_bundle =
Sec2UnloadBundle::build(dev, bar, chipset, &bios, gsp_falcon, sec2_falcon)
.inspect_err(|e| {
@@ -279,8 +282,12 @@ fn boot(
"The GPU will need to be reset before the driver can bind again.\n"
);
})
- .map(crate::gsp::UnloadBundle)
- .ok();
+ .ok()
+ .map(crate::gsp::UnloadBundle);
+
+ // Wrap the unload bundle into a drop guard so it is automatically run upon failure.
+ let unload_guard =
+ BootUnloadGuard::new(gsp, dev, bar, gsp_falcon, sec2_falcon, unload_bundle);

// FWSEC-FRTS is not executed on chips where the FRTS region size is 0 (e.g. GA100).
if !fb_layout.frts.is_empty() {
@@ -311,7 +318,7 @@ fn boot(
)?
.run(dev, bar, sec2_falcon, wpr_meta)?;

- Ok(unload_bundle)
+ Ok(unload_guard)
}

fn post_boot(

--
2.54.0