[PATCH v11 06/22] gpu: nova-core: Blackwell: use correct sysmem flush registers
From: John Hubbard
Date: Fri May 29 2026 - 23:14:49 EST
Blackwell GPUs moved the sysmem flush page registers away from the
Ampere/Ada location. GB10x routes the flush through a pair of HSHUB0
register sets (primary and egress) that must both be programmed to
the same address. GB20x routes it through FBHUB0.
Implement these paths in the GB10x and GB20x framebuffer HALs.
Signed-off-by: John Hubbard <jhubbard@xxxxxxxxxx>
---
drivers/gpu/nova-core/fb/hal/gb100.rs | 46 +++++++++++++++++++++++++--
drivers/gpu/nova-core/fb/hal/gb202.rs | 40 +++++++++++++++++++++--
drivers/gpu/nova-core/regs.rs | 37 +++++++++++++++++++++
3 files changed, 117 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/nova-core/fb/hal/gb100.rs b/drivers/gpu/nova-core/fb/hal/gb100.rs
index 8d63350abf8a..70f4c11b1e77 100644
--- a/drivers/gpu/nova-core/fb/hal/gb100.rs
+++ b/drivers/gpu/nova-core/fb/hal/gb100.rs
@@ -4,6 +4,8 @@
//! Blackwell GB10x framebuffer HAL.
use kernel::{
+ io::Io,
+ num::Bounded,
prelude::*,
ptr::{
const_align_up,
@@ -15,11 +17,45 @@
use crate::{
driver::Bar0,
fb::hal::FbHal,
- num::usize_into_u32, //
+ num::usize_into_u32,
+ regs, //
};
struct Gb100;
+fn read_sysmem_flush_page_gb100(bar: &Bar0) -> u64 {
+ let lo = u64::from(
+ bar.read(regs::NV_PFB_HSHUB0_PCIE_FLUSH_SYSMEM_ADDR_LO)
+ .adr(),
+ );
+ let hi = u64::from(
+ bar.read(regs::NV_PFB_HSHUB0_PCIE_FLUSH_SYSMEM_ADDR_HI)
+ .adr(),
+ );
+
+ lo | (hi << 32)
+}
+
+/// Write the sysmem flush page address through the GB10x HSHUB0 registers.
+///
+/// Both the primary and EG (egress) register pairs must be programmed to the same address,
+/// as required by hardware.
+fn write_sysmem_flush_page_gb100(bar: &Bar0, addr: Bounded<u64, 52>) {
+ // CAST: lower 32 bits. Hardware ignores bits 7:0.
+ let addr_lo = *addr as u32;
+ let addr_hi = addr.shr::<32, 20>().cast::<u32>();
+
+ // Write HI first. The hardware will trigger the flush on the LO write.
+
+ // Primary HSHUB pair.
+ bar.write_reg(regs::NV_PFB_HSHUB0_PCIE_FLUSH_SYSMEM_ADDR_HI::zeroed().with_adr(addr_hi));
+ bar.write_reg(regs::NV_PFB_HSHUB0_PCIE_FLUSH_SYSMEM_ADDR_LO::zeroed().with_adr(addr_lo));
+
+ // EG (egress) pair -- must match the primary pair.
+ bar.write_reg(regs::NV_PFB_HSHUB0_EG_PCIE_FLUSH_SYSMEM_ADDR_HI::zeroed().with_adr(addr_hi));
+ bar.write_reg(regs::NV_PFB_HSHUB0_EG_PCIE_FLUSH_SYSMEM_ADDR_LO::zeroed().with_adr(addr_lo));
+}
+
pub(super) const fn pmu_reserved_size_gb100() -> u32 {
usize_into_u32::<{ const_align_up(SZ_8M + SZ_16M + SZ_4K, Alignment::new::<SZ_128K>()).unwrap() }>(
)
@@ -27,11 +63,15 @@ pub(super) const fn pmu_reserved_size_gb100() -> u32 {
impl FbHal for Gb100 {
fn read_sysmem_flush_page(&self, bar: &Bar0) -> u64 {
- super::ga100::read_sysmem_flush_page_ga100(bar)
+ read_sysmem_flush_page_gb100(bar)
}
fn write_sysmem_flush_page(&self, bar: &Bar0, addr: u64) -> Result {
- super::ga100::write_sysmem_flush_page_ga100(bar, addr);
+ let addr: Bounded<u64, 52> = Bounded::<u64, 64>::from(addr)
+ .try_shrink::<52>()
+ .ok_or(EINVAL)?;
+
+ write_sysmem_flush_page_gb100(bar, addr);
Ok(())
}
diff --git a/drivers/gpu/nova-core/fb/hal/gb202.rs b/drivers/gpu/nova-core/fb/hal/gb202.rs
index 542c1d7429e9..5a6b815eec3d 100644
--- a/drivers/gpu/nova-core/fb/hal/gb202.rs
+++ b/drivers/gpu/nova-core/fb/hal/gb202.rs
@@ -4,24 +4,58 @@
//! Blackwell GB20x framebuffer HAL.
use kernel::{
+ io::Io,
+ num::Bounded,
prelude::*,
sizes::SizeConstants, //
};
use crate::{
driver::Bar0,
- fb::hal::FbHal, //
+ fb::hal::FbHal,
+ regs, //
};
struct Gb202;
+fn read_sysmem_flush_page_gb202(bar: &Bar0) -> u64 {
+ let lo = u64::from(
+ bar.read(regs::NV_PFB_FBHUB0_PCIE_FLUSH_SYSMEM_ADDR_LO)
+ .adr(),
+ );
+ let hi = u64::from(
+ bar.read(regs::NV_PFB_FBHUB0_PCIE_FLUSH_SYSMEM_ADDR_HI)
+ .adr(),
+ );
+
+ lo | (hi << 32)
+}
+
+/// Write the sysmem flush page address through the GB20x FBHUB0 registers.
+fn write_sysmem_flush_page_gb202(bar: &Bar0, addr: Bounded<u64, 52>) {
+ // Write HI first. The hardware will trigger the flush on the LO write.
+ bar.write_reg(
+ regs::NV_PFB_FBHUB0_PCIE_FLUSH_SYSMEM_ADDR_HI::zeroed()
+ .with_adr(addr.shr::<32, 20>().cast::<u32>()),
+ );
+ bar.write_reg(
+ regs::NV_PFB_FBHUB0_PCIE_FLUSH_SYSMEM_ADDR_LO::zeroed()
+ // CAST: lower 32 bits. Hardware ignores bits 7:0.
+ .with_adr(*addr as u32),
+ );
+}
+
impl FbHal for Gb202 {
fn read_sysmem_flush_page(&self, bar: &Bar0) -> u64 {
- super::ga100::read_sysmem_flush_page_ga100(bar)
+ read_sysmem_flush_page_gb202(bar)
}
fn write_sysmem_flush_page(&self, bar: &Bar0, addr: u64) -> Result {
- super::ga100::write_sysmem_flush_page_ga100(bar, addr);
+ let addr: Bounded<u64, 52> = Bounded::<u64, 64>::from(addr)
+ .try_shrink::<52>()
+ .ok_or(EINVAL)?;
+
+ write_sysmem_flush_page_gb202(bar, addr);
Ok(())
}
diff --git a/drivers/gpu/nova-core/regs.rs b/drivers/gpu/nova-core/regs.rs
index 356fbf364ea5..65be6ec71ed4 100644
--- a/drivers/gpu/nova-core/regs.rs
+++ b/drivers/gpu/nova-core/regs.rs
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
use kernel::{
io::{
@@ -145,6 +146,42 @@ fn fmt(&self, f: &mut kernel::fmt::Formatter<'_>) -> kernel::fmt::Result {
/// Bits 12..40 of the higher (exclusive) bound of the WPR2 region.
31:4 hi_val;
}
+
+ // Blackwell GB10x sysmem flush registers (HSHUB0).
+ //
+ // GB10x GPUs use two pairs of HSHUB registers for sysmembar: a primary pair and an EG
+ // (egress) pair. Both must be programmed to the same address. Hardware ignores bits 7:0
+ // of each LO register. HSHUB0 base is 0x00891000.
+
+ pub(crate) NV_PFB_HSHUB0_PCIE_FLUSH_SYSMEM_ADDR_LO(u32) @ 0x00891e50 {
+ 31:0 adr => u32;
+ }
+
+ pub(crate) NV_PFB_HSHUB0_PCIE_FLUSH_SYSMEM_ADDR_HI(u32) @ 0x00891e54 {
+ 19:0 adr;
+ }
+
+ pub(crate) NV_PFB_HSHUB0_EG_PCIE_FLUSH_SYSMEM_ADDR_LO(u32) @ 0x008916c0 {
+ 31:0 adr => u32;
+ }
+
+ pub(crate) NV_PFB_HSHUB0_EG_PCIE_FLUSH_SYSMEM_ADDR_HI(u32) @ 0x008916c4 {
+ 19:0 adr;
+ }
+
+ // Blackwell GB20x sysmem flush registers (FBHUB0).
+ //
+ // Unlike the older NV_PFB_NISO_FLUSH_SYSMEM_ADDR registers which encode the address with an
+ // 8-bit right-shift, these registers take the raw address split into lower/upper 32-bit halves.
+ // The hardware ignores bits 7:0 of the LO register.
+
+ pub(crate) NV_PFB_FBHUB0_PCIE_FLUSH_SYSMEM_ADDR_LO(u32) @ 0x008a1d58 {
+ 31:0 adr => u32;
+ }
+
+ pub(crate) NV_PFB_FBHUB0_PCIE_FLUSH_SYSMEM_ADDR_HI(u32) @ 0x008a1d5c {
+ 19:0 adr;
+ }
}
impl NV_PFB_PRI_MMU_LOCAL_MEMORY_RANGE {
--
2.54.0