[PATCH 09/13] gpu: nova-core: wait for FSP boot earlier

From: Eliot Courtney

Date: Mon Jun 15 2026 - 10:44:51 EST


For GPU architectures that use FSP Chain of Trust (CoT) boot, ensure
that FSP itself has booted before trying to use it. In particular,
accessing registers like `NV_USABLE_FB_SIZE_IN_MB` for
`FbHal::vidmem_size` must happen only after FSP has booted. Currently,
we wait for FSP boot too late, so move this wait to a new preboot phase.

Signed-off-by: Eliot Courtney <ecourtney@xxxxxxxxxx>
---
drivers/gpu/nova-core/fsp.rs | 40 +++++++++++++++-------------------
drivers/gpu/nova-core/fsp/hal.rs | 4 ++--
drivers/gpu/nova-core/fsp/hal/gb100.rs | 4 ++--
drivers/gpu/nova-core/fsp/hal/gb202.rs | 4 ++--
drivers/gpu/nova-core/fsp/hal/gh100.rs | 10 ++++-----
drivers/gpu/nova-core/gpu.rs | 7 +++---
drivers/gpu/nova-core/gpu/hal.rs | 6 +++--
drivers/gpu/nova-core/gpu/hal/gh100.rs | 10 ++++++---
drivers/gpu/nova-core/gpu/hal/tu102.rs | 3 ++-
drivers/gpu/nova-core/gsp/hal/gh100.rs | 2 +-
10 files changed, 46 insertions(+), 44 deletions(-)

diff --git a/drivers/gpu/nova-core/fsp.rs b/drivers/gpu/nova-core/fsp.rs
index 3f3211eae4d0..bf0baa5ac4ae 100644
--- a/drivers/gpu/nova-core/fsp.rs
+++ b/drivers/gpu/nova-core/fsp.rs
@@ -49,8 +49,7 @@
NvdmHeader,
NvdmType, //
},
- num,
- regs, //
+ num, //
};

mod hal;
@@ -229,41 +228,36 @@ pub(crate) fn boot_params(&self) -> &Coherent<GspFmcBootParams> {

/// FSP interface for Hopper/Blackwell GPUs.
///
-/// An `Fsp` is produced by [`Fsp::wait_secure_boot`], which only returns once FSP secure boot
-/// has completed. It owns the FSP falcon and the FMC firmware, which are used for the subsequent
-/// Chain of Trust boot.
+/// It owns the FSP falcon and the FMC firmware, which are used for the subsequent Chain of Trust
+/// boot.
pub(crate) struct Fsp {
falcon: Falcon<FspEngine>,
fsp_fw: FspFirmware,
}

impl Fsp {
- /// Waits for FSP secure boot completion, then returns the [`Fsp`] interface.
- ///
- /// Polls the thermal scratch register until FSP signals boot completion or the timeout
- /// elapses. Returning an [`Fsp`] only on success guarantees, at the API level, that the
- /// interface is not used before secure boot has completed.
- pub(crate) fn wait_secure_boot(
- dev: &device::Device<device::Bound>,
- bar: Bar0<'_>,
- chipset: Chipset,
- ) -> Result<Fsp> {
+ /// Waits for FSP secure boot completion. This must be called before trying to create the `Fsp`
+ /// interface or read any registers dependent on FSP boot completion.
+ pub(crate) fn wait_for_secure_boot(bar: Bar0<'_>, chipset: Chipset) -> Result {
/// FSP secure boot completion timeout in milliseconds.
const FSP_SECURE_BOOT_TIMEOUT_MS: i64 = 5000;

let hal = hal::fsp_hal(chipset).ok_or(ENOTSUPP)?;
- let falcon = Falcon::<FspEngine>::new(dev, chipset)?;
- let fsp_fw = FspFirmware::new(dev, chipset, FIRMWARE_VERSION)?;

read_poll_timeout(
- || Ok(hal.fsp_boot_status(bar)),
- |&status| status == regs::NV_THERM_I2CS_SCRATCH_FSP_BOOT_COMPLETE_STATUS_SUCCESS,
+ || Ok(hal.fsp_boot_done(bar)),
+ |&done| done,
Delta::from_millis(10),
Delta::from_millis(FSP_SECURE_BOOT_TIMEOUT_MS),
- )
- .inspect_err(|e| {
- dev_err!(dev, "FSP secure boot completion error: {:?}\n", e);
- })?;
+ )?;
+
+ Ok(())
+ }
+
+ /// Creates an FSP interface.
+ pub(crate) fn new(dev: &device::Device<device::Bound>, chipset: Chipset) -> Result<Self> {
+ let falcon = Falcon::<FspEngine>::new(dev, chipset)?;
+ let fsp_fw = FspFirmware::new(dev, chipset, FIRMWARE_VERSION)?;

Ok(Fsp { falcon, fsp_fw })
}
diff --git a/drivers/gpu/nova-core/fsp/hal.rs b/drivers/gpu/nova-core/fsp/hal.rs
index b6f2624bb13d..7c5a7e61835c 100644
--- a/drivers/gpu/nova-core/fsp/hal.rs
+++ b/drivers/gpu/nova-core/fsp/hal.rs
@@ -14,8 +14,8 @@
mod gh100;

pub(super) trait FspHal {
- /// Returns the secure boot status from the architecture-specific `NV_THERM_I2CS_SCRATCH` register.
- fn fsp_boot_status(&self, bar: Bar0<'_>) -> u32;
+ /// Returns whether FSP secure boot is done.
+ fn fsp_boot_done(&self, bar: Bar0<'_>) -> bool;

/// Returns the FSP Chain of Trust protocol version this chipset advertises.
fn cot_version(&self) -> u16;
diff --git a/drivers/gpu/nova-core/fsp/hal/gb100.rs b/drivers/gpu/nova-core/fsp/hal/gb100.rs
index 42f5ecfc6400..a95b2dde2a04 100644
--- a/drivers/gpu/nova-core/fsp/hal/gb100.rs
+++ b/drivers/gpu/nova-core/fsp/hal/gb100.rs
@@ -9,9 +9,9 @@
struct Gb100;

impl FspHal for Gb100 {
- fn fsp_boot_status(&self, bar: Bar0<'_>) -> u32 {
+ fn fsp_boot_done(&self, bar: Bar0<'_>) -> bool {
// GB10x shares Hopper's FSP secure boot status register.
- super::gh100::fsp_boot_status_gh100(bar)
+ super::gh100::fsp_boot_done_gh100(bar)
}

fn cot_version(&self) -> u16 {
diff --git a/drivers/gpu/nova-core/fsp/hal/gb202.rs b/drivers/gpu/nova-core/fsp/hal/gb202.rs
index 1091b169a645..a3010717c57d 100644
--- a/drivers/gpu/nova-core/fsp/hal/gb202.rs
+++ b/drivers/gpu/nova-core/fsp/hal/gb202.rs
@@ -12,10 +12,10 @@
struct Gb202;

impl FspHal for Gb202 {
- fn fsp_boot_status(&self, bar: Bar0<'_>) -> u32 {
+ fn fsp_boot_done(&self, bar: Bar0<'_>) -> bool {
bar.read(regs::gb202::NV_THERM_I2CS_SCRATCH_FSP_BOOT_COMPLETE)
.fsp_boot_complete()
- .into()
+ == regs::NV_THERM_I2CS_SCRATCH_FSP_BOOT_COMPLETE_STATUS_SUCCESS
}

fn cot_version(&self) -> u16 {
diff --git a/drivers/gpu/nova-core/fsp/hal/gh100.rs b/drivers/gpu/nova-core/fsp/hal/gh100.rs
index 291acaf2845a..a440b68205e2 100644
--- a/drivers/gpu/nova-core/fsp/hal/gh100.rs
+++ b/drivers/gpu/nova-core/fsp/hal/gh100.rs
@@ -11,16 +11,16 @@

struct Gh100;

-/// Reads the FSP secure boot status from the Hopper/GB10x thermal scratch register.
-pub(super) fn fsp_boot_status_gh100(bar: Bar0<'_>) -> u32 {
+/// Returns whether FSP secure boot is done on Hopper/GB10x.
+pub(super) fn fsp_boot_done_gh100(bar: Bar0<'_>) -> bool {
bar.read(regs::gh100::NV_THERM_I2CS_SCRATCH_FSP_BOOT_COMPLETE)
.fsp_boot_complete()
- .into()
+ == regs::NV_THERM_I2CS_SCRATCH_FSP_BOOT_COMPLETE_STATUS_SUCCESS
}

impl FspHal for Gh100 {
- fn fsp_boot_status(&self, bar: Bar0<'_>) -> u32 {
- fsp_boot_status_gh100(bar)
+ fn fsp_boot_done(&self, bar: Bar0<'_>) -> bool {
+ fsp_boot_done_gh100(bar)
}

fn cot_version(&self) -> u16 {
diff --git a/drivers/gpu/nova-core/gpu.rs b/drivers/gpu/nova-core/gpu.rs
index b3c91731db45..ca37892c3b38 100644
--- a/drivers/gpu/nova-core/gpu.rs
+++ b/drivers/gpu/nova-core/gpu.rs
@@ -295,7 +295,8 @@ pub(crate) fn new(
dev_info!(pdev,"NVIDIA ({})\n", spec);
})?,

- // We must wait for GFW_BOOT completion before doing any significant setup on the GPU.
+ // We must wait for some architecture-specific setup to complete before doing any
+ // significant setup on the GPU.
_: {
let hal = hal::gpu_hal(spec.chipset);
let dma_mask = hal.dma_mask();
@@ -304,8 +305,8 @@ pub(crate) fn new(
// still constructing it, so no concurrent DMA allocations can exist.
unsafe { pdev.dma_set_mask_and_coherent(dma_mask)? };

- hal.wait_gfw_boot_completion(bar)
- .inspect_err(|_| dev_err!(pdev, "GFW boot did not complete\n"))?;
+ hal.wait_preboot_completion(bar, spec.chipset)
+ .inspect_err(|_| dev_err!(pdev, "preboot firmware did not complete\n"))?;
},

sysmem_flush: SysmemFlush::register(pdev.as_ref(), bar, spec.chipset)?,
diff --git a/drivers/gpu/nova-core/gpu/hal.rs b/drivers/gpu/nova-core/gpu/hal.rs
index 3f25882d0e56..232f073ccc06 100644
--- a/drivers/gpu/nova-core/gpu/hal.rs
+++ b/drivers/gpu/nova-core/gpu/hal.rs
@@ -19,8 +19,10 @@
mod tu102;

pub(crate) trait GpuHal {
- /// Waits for GFW_BOOT completion if required by this hardware family.
- fn wait_gfw_boot_completion(&self, bar: Bar0<'_>) -> Result;
+ /// Waits for architecture-specific operations to complete before we can try to boot the GSP.
+ /// For example, may wait on GFW_BOOT completion or FSP secure boot completion, depending on the
+ /// architecture.
+ fn wait_preboot_completion(&self, bar: Bar0<'_>, chipset: Chipset) -> Result;

/// Returns the DMA mask for the current architecture.
fn dma_mask(&self) -> DmaMask;
diff --git a/drivers/gpu/nova-core/gpu/hal/gh100.rs b/drivers/gpu/nova-core/gpu/hal/gh100.rs
index e3f8ba0fab33..3aa18feec1f7 100644
--- a/drivers/gpu/nova-core/gpu/hal/gh100.rs
+++ b/drivers/gpu/nova-core/gpu/hal/gh100.rs
@@ -7,15 +7,19 @@
prelude::*, //
};

-use crate::driver::Bar0;
+use crate::{
+ driver::Bar0,
+ fsp::Fsp,
+ gpu::Chipset, //
+};

use super::GpuHal;

struct Gh100;

impl GpuHal for Gh100 {
- fn wait_gfw_boot_completion(&self, _bar: Bar0<'_>) -> Result {
- Ok(())
+ fn wait_preboot_completion(&self, bar: Bar0<'_>, chipset: Chipset) -> Result {
+ Fsp::wait_for_secure_boot(bar, chipset)
}

fn dma_mask(&self) -> DmaMask {
diff --git a/drivers/gpu/nova-core/gpu/hal/tu102.rs b/drivers/gpu/nova-core/gpu/hal/tu102.rs
index b0732e53edea..34b63a7c0ada 100644
--- a/drivers/gpu/nova-core/gpu/hal/tu102.rs
+++ b/drivers/gpu/nova-core/gpu/hal/tu102.rs
@@ -32,6 +32,7 @@

use crate::{
driver::Bar0,
+ gpu::Chipset,
regs, //
};

@@ -55,7 +56,7 @@ impl GpuHal for Tu102 {
/// This function waits for a signal indicating that core initialization is complete. Before
/// this signal is received, little can be done with the GPU. This signal is set by the FWSEC
/// running on the GSP in Heavy-secured mode.
- fn wait_gfw_boot_completion(&self, bar: Bar0<'_>) -> Result {
+ fn wait_preboot_completion(&self, bar: Bar0<'_>, _chipset: Chipset) -> Result {
// Before accessing the completion status in `NV_PGC6_AON_SECURE_SCRATCH_GROUP_05`, we must
// first check `NV_PGC6_AON_SECURE_SCRATCH_GROUP_05_PRIV_LEVEL_MASK`. This is because
// `NV_PGC6_AON_SECURE_SCRATCH_GROUP_05` becomes accessible only after the secure firmware
diff --git a/drivers/gpu/nova-core/gsp/hal/gh100.rs b/drivers/gpu/nova-core/gsp/hal/gh100.rs
index 31498ae7abd2..35554d92fda9 100644
--- a/drivers/gpu/nova-core/gsp/hal/gh100.rs
+++ b/drivers/gpu/nova-core/gsp/hal/gh100.rs
@@ -159,7 +159,7 @@ fn boot<'a>(
) -> Result<BootUnloadGuard<'a>> {
let args = FmcBootArgs::new(dev, chipset, wpr_meta, &gsp.libos, false)?;

- let mut fsp = Fsp::wait_secure_boot(dev, bar, chipset)?;
+ let mut fsp = Fsp::new(dev, chipset)?;

let unload_bundle = crate::gsp::UnloadBundle(
KBox::new(FspUnloadBundle, GFP_KERNEL)? as KBox<dyn UnloadBundle>

--
2.54.0