Re: [PATCH v4 1/3] rust: block: introduce `kernel::block::mq` module

From: Benno Lossin
Date: Sun Jun 02 2024 - 16:08:45 EST


On 01.06.24 15:40, Andreas Hindborg wrote:
> +/// A generic block device.
> +///
> +/// # Invariants
> +///
> +/// - `gendisk` must always point to an initialized and valid `struct gendisk`.
> +pub struct GenDisk<T: Operations, S: GenDiskState = Added> {

I am curious, do you need the type state for this struct? AFAIU you are
only using it to configure the `GenDisk`, so could you also use a config
struct that is given to `GenDisk::new`. That way we can avoid the extra
traits and generic argument.

Since there are so many options, a builder config struct might be a good
idea.

> + tagset: Arc<TagSet<T>>,
> + gendisk: *mut bindings::gendisk,
> + _phantom: core::marker::PhantomData<S>,
> +}
> +
> +// SAFETY: `GenDisk` is an owned pointer to a `struct gendisk` and an `Arc` to a
> +// `TagSet` It is safe to send this to other threads as long as T is Send.
> +unsafe impl<T: Operations + Send, S: GenDiskState> Send for GenDisk<T, S> {}
> +
> +/// Disks in this state are allocated and initialized, but are not yet
> +/// accessible from the kernel VFS.
> +pub enum Initialized {}
> +
> +/// Disks in this state have been attached to the kernel VFS and may receive IO
> +/// requests.
> +pub enum Added {}
> +
> +mod seal {
> + pub trait Sealed {}
> +}
> +
> +/// Typestate representing states of a `GenDisk`.
> +///
> +/// This trait cannot be implemented by downstream crates.
> +pub trait GenDiskState: seal::Sealed {
> + /// Set to true if [`GenDisk`] should call `del_gendisk` on drop.
> + const DELETE_ON_DROP: bool;
> +}
> +
> +impl seal::Sealed for Initialized {}
> +impl GenDiskState for Initialized {
> + const DELETE_ON_DROP: bool = false;
> +}
> +impl seal::Sealed for Added {}
> +impl GenDiskState for Added {
> + const DELETE_ON_DROP: bool = true;
> +}
> +
> +impl<T: Operations> GenDisk<T, Initialized> {
> + /// Try to create a new `GenDisk`.
> + pub fn try_new(tagset: Arc<TagSet<T>>) -> Result<Self> {

Since there is no non-try `new` function, I think we should name this
function just `new`.

> + let lock_class_key = crate::sync::LockClassKey::new();
> +
> + // SAFETY: `tagset.raw_tag_set()` points to a valid and initialized tag set
> + let gendisk = from_err_ptr(unsafe {
> + bindings::__blk_mq_alloc_disk(
> + tagset.raw_tag_set(),
> + core::ptr::null_mut(), // TODO: We can pass queue limits right here
> + core::ptr::null_mut(),
> + lock_class_key.as_ptr(),
> + )
> + })?;
> +
> + const TABLE: bindings::block_device_operations = bindings::block_device_operations {
> + submit_bio: None,
> + open: None,
> + release: None,
> + ioctl: None,
> + compat_ioctl: None,
> + check_events: None,
> + unlock_native_capacity: None,
> + getgeo: None,
> + set_read_only: None,
> + swap_slot_free_notify: None,
> + report_zones: None,
> + devnode: None,
> + alternative_gpt_sector: None,
> + get_unique_id: None,
> + // TODO: Set to THIS_MODULE. Waiting for const_refs_to_static feature to
> + // be merged (unstable in rustc 1.78 which is staged for linux 6.10)
> + // https://github.com/rust-lang/rust/issues/119618
> + owner: core::ptr::null_mut(),
> + pr_ops: core::ptr::null_mut(),
> + free_disk: None,
> + poll_bio: None,
> + };
> +
> + // SAFETY: gendisk is a valid pointer as we initialized it above
> + unsafe { (*gendisk).fops = &TABLE };
> +
> + // INVARIANT: `gendisk` was initialized above.
> + // INVARIANT: `gendisk.queue.queue_data` is set to `data` in the call to

There is no `data` in the mentioned call above.

> + // `__blk_mq_alloc_disk` above.
> + Ok(GenDisk {
> + tagset,
> + gendisk,
> + _phantom: PhantomData,
> + })
> + }
> +

[...]

> +impl<T: Operations> OperationsVTable<T> {
> + /// This function is called by the C kernel. A pointer to this function is
> + /// installed in the `blk_mq_ops` vtable for the driver.
> + ///
> + /// # Safety
> + ///
> + /// - The caller of this function must ensure `bd` is valid
> + /// and initialized. The pointees must outlive this function.

Until when do the pointees have to be alive? "must outlive this
function" could also be the case if the pointees die immediately after
this function returns.

> + /// - This function must not be called with a `hctx` for which
> + /// `Self::exit_hctx_callback()` has been called.
> + /// - (*bd).rq must point to a valid `bindings:request` for which
> + /// `OperationsVTable<T>::init_request_callback` was called

Missing `.` at the end.

> + unsafe extern "C" fn queue_rq_callback(
> + _hctx: *mut bindings::blk_mq_hw_ctx,
> + bd: *const bindings::blk_mq_queue_data,
> + ) -> bindings::blk_status_t {
> + // SAFETY: `bd.rq` is valid as required by the safety requirement for
> + // this function.
> + let request = unsafe { &*(*bd).rq.cast::<Request<T>>() };
> +
> + // One refcount for the ARef, one for being in flight
> + request.wrapper_ref().refcount().store(2, Ordering::Relaxed);
> +
> + // SAFETY: We own a refcount that we took above. We pass that to `ARef`.
> + // By the safety requirements of this function, `request` is a valid
> + // `struct request` and the private data is properly initialized.
> + let rq = unsafe { Request::aref_from_raw((*bd).rq) };

I think that you need to require that the request is alive at least
until `blk_mq_end_request` is called for the request (since at that
point all `ARef`s will be gone).
Also if this is not guaranteed, the safety requirements of
`AlwaysRefCounted` are violated (since the object can just disappear
even if it has refcount > 0 [the refcount refers to the Rust refcount in
the `RequestDataWrapper`, not the one in C]).

> +
> + // SAFETY: We have exclusive access and we just set the refcount above.
> + unsafe { Request::start_unchecked(&rq) };
> +
> + let ret = T::queue_rq(
> + rq,
> + // SAFETY: `bd` is valid as required by the safety requirement for this function.
> + unsafe { (*bd).last },
> + );
> +
> + if let Err(e) = ret {
> + e.to_blk_status()
> + } else {
> + bindings::BLK_STS_OK as _
> + }
> + }
> +
> + /// This function is called by the C kernel. A pointer to this function is
> + /// installed in the `blk_mq_ops` vtable for the driver.
> + ///
> + /// # Safety
> + ///
> + /// This function may only be called by blk-mq C infrastructure.
> + unsafe extern "C" fn commit_rqs_callback(_hctx: *mut bindings::blk_mq_hw_ctx) {
> + T::commit_rqs()
> + }
> +
> + /// This function is called by the C kernel. It is not currently
> + /// implemented, and there is no way to exercise this code path.
> + ///
> + /// # Safety
> + ///
> + /// This function may only be called by blk-mq C infrastructure.
> + unsafe extern "C" fn complete_callback(_rq: *mut bindings::request) {}
> +
> + /// This function is called by the C kernel. A pointer to this function is
> + /// installed in the `blk_mq_ops` vtable for the driver.
> + ///
> + /// # Safety
> + ///
> + /// This function may only be called by blk-mq C infrastructure.
> + unsafe extern "C" fn poll_callback(
> + _hctx: *mut bindings::blk_mq_hw_ctx,
> + _iob: *mut bindings::io_comp_batch,
> + ) -> core::ffi::c_int {
> + T::poll().into()
> + }
> +
> + /// This function is called by the C kernel. A pointer to this function is
> + /// installed in the `blk_mq_ops` vtable for the driver.
> + ///
> + /// # Safety
> + ///
> + /// This function may only be called by blk-mq C infrastructure. This
> + /// function may only be called onece before `exit_hctx_callback` is called

Typo: "onece"

> + /// for the same context.
> + unsafe extern "C" fn init_hctx_callback(
> + _hctx: *mut bindings::blk_mq_hw_ctx,
> + _tagset_data: *mut core::ffi::c_void,
> + _hctx_idx: core::ffi::c_uint,
> + ) -> core::ffi::c_int {
> + from_result(|| Ok(0))
> + }
> +
> + /// This function is called by the C kernel. A pointer to this function is
> + /// installed in the `blk_mq_ops` vtable for the driver.
> + ///
> + /// # Safety
> + ///
> + /// This function may only be called by blk-mq C infrastructure.
> + unsafe extern "C" fn exit_hctx_callback(
> + _hctx: *mut bindings::blk_mq_hw_ctx,
> + _hctx_idx: core::ffi::c_uint,
> + ) {
> + }
> +
> + /// This function is called by the C kernel. A pointer to this function is
> + /// installed in the `blk_mq_ops` vtable for the driver.
> + ///
> + /// # Safety
> + ///
> + /// This function may only be called by blk-mq C infrastructure. `set` must
> + /// point to an initialized `TagSet<T>`.
> + unsafe extern "C" fn init_request_callback(
> + _set: *mut bindings::blk_mq_tag_set,
> + rq: *mut bindings::request,
> + _hctx_idx: core::ffi::c_uint,
> + _numa_node: core::ffi::c_uint,
> + ) -> core::ffi::c_int {
> + from_result(|| {
> + // SAFETY: The `blk_mq_tag_set` invariants guarantee that all
> + // requests are allocated with extra memory for the request data.

What guarantees that the right amount of memory has been allocated?
AFAIU that is guaranteed by the `TagSet` (but there is no invariant).

> + let pdu = unsafe { bindings::blk_mq_rq_to_pdu(rq) }.cast::<RequestDataWrapper>();
> +
> + // SAFETY: The refcount field is allocated but not initialized, this
> + // valid for write.
> + unsafe { RequestDataWrapper::refcount_ptr(pdu).write(AtomicU64::new(0)) };
> +
> + Ok(0)
> + })
> + }

[...]

> + /// Notify the block layer that a request is going to be processed now.
> + ///
> + /// The block layer uses this hook to do proper initializations such as
> + /// starting the timeout timer. It is a requirement that block device
> + /// drivers call this function when starting to process a request.
> + ///
> + /// # Safety
> + ///
> + /// The caller must have exclusive ownership of `self`, that is
> + /// `self.wrapper_ref().refcount() == 2`.
> + pub(crate) unsafe fn start_unchecked(this: &ARef<Self>) {
> + // SAFETY: By type invariant, `self.0` is a valid `struct request`. By
> + // existence of `&mut self` we have exclusive access.

We don't have a `&mut self`. But the safety requirements ask for a
unique `ARef`.

> + unsafe { bindings::blk_mq_start_request(this.0.get()) };
> + }
> +
> + fn try_set_end(this: ARef<Self>) -> Result<ARef<Self>, ARef<Self>> {
> + // We can race with `TagSet::tag_to_rq`
> + match this.wrapper_ref().refcount().compare_exchange(
> + 2,
> + 0,
> + Ordering::Relaxed,
> + Ordering::Relaxed,
> + ) {
> + Err(_old) => Err(this),
> + Ok(_) => Ok(this),
> + }
> + }
> +
> + /// Notify the block layer that the request has been completed without errors.
> + ///
> + /// This function will return `Err` if `this` is not the only `ARef`
> + /// referencing the request.
> + pub fn end_ok(this: ARef<Self>) -> Result<(), ARef<Self>> {
> + let this = Self::try_set_end(this)?;
> + let request_ptr = this.0.get();
> + core::mem::forget(this);
> +
> + // SAFETY: By type invariant, `self.0` is a valid `struct request`. By
> + // existence of `&mut self` we have exclusive access.

Same here, but in this case, the `ARef` is unique, since you called
`try_set_end`. You could make it a `# Guarantee` of `try_set_end`: "If
`Ok(aref)` is returned, then the `aref` is unique."

> + unsafe { bindings::blk_mq_end_request(request_ptr, bindings::BLK_STS_OK as _) };
> +
> + Ok(())
> + }
> +
> + /// Return a pointer to the `RequestDataWrapper` stored in the private area
> + /// of the request structure.
> + ///
> + /// # Safety
> + ///
> + /// - `this` must point to a valid allocation.
> + pub(crate) unsafe fn wrapper_ptr(this: *mut Self) -> NonNull<RequestDataWrapper> {
> + let request_ptr = this.cast::<bindings::request>();
> + let wrapper_ptr =
> + // SAFETY: By safety requirements for this function, `this` is a
> + // valid allocation.

Formatting: move the safety comment above the `let`.

---
Cheers,
Benno

> + unsafe { bindings::blk_mq_rq_to_pdu(request_ptr).cast::<RequestDataWrapper>() };
> + // SAFETY: By C api contract, wrapper_ptr points to a valid allocation
> + // and is not null.
> + unsafe { NonNull::new_unchecked(wrapper_ptr) }
> + }

[...]