Re: [PATCH v3 1/3] rust: block: introduce `kernel::block::mq` module

From: Benno Lossin
Date: Sat Jun 01 2024 - 05:44:08 EST


On 01.06.24 10:18, Andreas Hindborg wrote:
> From: Andreas Hindborg <a.hindborg@xxxxxxxxxxx>

[...]

> +impl<T: Operations> GenDisk<T, Initialized> {
> + /// Try to create a new `GenDisk`.
> + pub fn try_new(tagset: Arc<TagSet<T>>) -> Result<Self> {
> + let lock_class_key = crate::sync::LockClassKey::new();
> +
> + // SAFETY: `tagset.raw_tag_set()` points to a valid and initialized tag set
> + let gendisk = from_err_ptr(unsafe {
> + bindings::__blk_mq_alloc_disk(
> + tagset.raw_tag_set(),
> + core::ptr::null_mut(), // TODO: We can pass queue limits right here
> + core::ptr::null_mut(),
> + lock_class_key.as_ptr(),
> + )
> + })?;
> +
> + const TABLE: bindings::block_device_operations = bindings::block_device_operations {
> + submit_bio: None,
> + open: None,
> + release: None,
> + ioctl: None,
> + compat_ioctl: None,
> + check_events: None,
> + unlock_native_capacity: None,
> + getgeo: None,
> + set_read_only: None,
> + swap_slot_free_notify: None,
> + report_zones: None,
> + devnode: None,
> + alternative_gpt_sector: None,
> + get_unique_id: None,
> + // TODO: Set to THIS_MODULE. Waiting for const_refs_to_static feature to
> + // be merged (unstable in rustc 1.78 which is staged for linux 6.10)
> + // https://github.com/rust-lang/rust/issues/119618

AFAIK the 1.78 upgrade already is in rust-next (and also should appear
in v6.10-rc2, right?) do you have this on your radar?

> + owner: core::ptr::null_mut(),
> + pr_ops: core::ptr::null_mut(),
> + free_disk: None,
> + poll_bio: None,
> + };
> +
> + // SAFETY: gendisk is a valid pointer as we initialized it above
> + unsafe { (*gendisk).fops = &TABLE };
> +
> + // INVARIANT: `gendisk` was initialized above.
> + // INVARIANT: `gendisk.queue.queue_data` is set to `data` in the call to
> + // `__blk_mq_alloc_disk` above.
> + Ok(GenDisk {
> + tagset,
> + gendisk,
> + _phantom: PhantomData,
> + })
> + }

[...]

> + /// This function is called by the C kernel. A pointer to this function is
> + /// installed in the `blk_mq_ops` vtable for the driver.
> + ///
> + /// # Safety
> + ///
> + /// This function may only be called by blk-mq C infrastructure.
> + unsafe extern "C" fn commit_rqs_callback(_hctx: *mut bindings::blk_mq_hw_ctx) {
> + T::commit_rqs()
> + }
> +
> + /// This function is called by the C kernel. It is not currently
> + /// implemented, and there is no way to exercise this code path.

Is it also possible to completely remove it? ie use `None` in the
VTABLE, or will the C side error?

> + ///
> + /// # Safety
> + ///
> + /// This function may only be called by blk-mq C infrastructure.
> + unsafe extern "C" fn complete_callback(_rq: *mut bindings::request) {}

[...]

> +impl<'a> RawWriter<'a> {
> + /// Create a new `RawWriter` instance.
> + fn new(buffer: &'a mut [u8]) -> Result<RawWriter<'a>> {
> + *(buffer.last_mut().ok_or(EINVAL)?) = 0;
> +
> + // INVARIANT: We null terminated the buffer above.
> + Ok(Self { buffer, pos: 0 })
> + }
> +
> + pub(crate) fn from_array<const N: usize>(
> + a: &'a mut [core::ffi::c_char; N],
> + ) -> Result<RawWriter<'a>> {

You could change the return type to be `RawWriter<'a>` and check using
`build_assert!` that `N > 0`. Then you can also call `unwrap_unchecked`
on the result that you get below.

I don't know if we want that, but it is a possibility.

> + Self::new(
> + // SAFETY: the buffer of `a` is valid for read and write as `u8` for
> + // at least `N` bytes.
> + unsafe { core::slice::from_raw_parts_mut(a.as_mut_ptr().cast::<u8>(), N) },
> + )
> + }
> +}

[...]

> +/// Store the result of `op(target.load())` in target, returning new value of
> +/// taret.
> +fn atomic_relaxed_op_return(target: &AtomicU64, op: impl Fn(u64) -> u64) -> u64 {
> + let old = target.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| Some(op(x)));
> +
> + // SAFETY: Because the operation passed to `fetch_update` above always
> + // return `Some`, `old` will always be `Ok`.
> + let old = unsafe { old.unwrap_unchecked() };
> +
> + op(old)
> +}
> +
> +/// Store the result of `op(target.load)` in `target` if `target.load() !=
> +/// pred`, returning previous value of target

The function returns a bool, not a u64 (value). From the body I read
that you return whether the value was updated.

> +fn atomic_relaxed_op_unless(target: &AtomicU64, op: impl Fn(u64) -> u64, pred: u64) -> bool {
> + let x = target.load(Ordering::Relaxed);
> + loop {
> + if x == pred {
> + break;
> + }
> + if target
> + .compare_exchange_weak(x, op(x), Ordering::Relaxed, Ordering::Relaxed)
> + .is_ok()
> + {
> + break;
> + }

If this fails, you are not re-reading the value of `target`, so if
someone else just set it to `pred`, you will still continue to try to
set it from `x` to `op(x)`, but it might never have the value `x` again.
This would lead to a potentially infinite loop, right?

> + }

Do you think you can also implement this using `fetch_update`? I guess
this would do what you want, right?:

target.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |x| {
if x == pred {
None
} else {
Some(op(x))
}
}).is_ok()

> +
> + x == pred
> +}

[...]

> +impl<T: Operations> TagSet<T> {
> + /// Try to create a new tag set
> + pub fn try_new(
> + nr_hw_queues: u32,
> + num_tags: u32,
> + num_maps: u32,
> + ) -> impl PinInit<Self, error::Error> {
> + try_pin_init!( TagSet {
> + // INVARIANT: We initialize `inner` here and it is valid after the
> + // initializer has run.
> + inner <- unsafe {kernel::init::pin_init_from_closure(move |place: *mut Opaque<bindings::blk_mq_tag_set>| -> Result<()> {
> + let place = place.cast::<bindings::blk_mq_tag_set>();
> +
> + // SAFETY: pin_init_from_closure promises that `place` is writable, and
> + // zeroes is a valid bit pattern for this structure.
> + core::ptr::write_bytes(place, 0, 1);
> +
> + /// For a raw pointer to a struct, write a struct field without
> + /// creating a reference to the field
> + macro_rules! write_ptr_field {
> + ($target:ident, $field:ident, $value:expr) => {
> + ::core::ptr::write(::core::ptr::addr_of_mut!((*$target).$field), $value)
> + };
> + }
> +
> + // SAFETY: pin_init_from_closure promises that `place` is writable
> + write_ptr_field!(place, ops, OperationsVTable::<T>::build());
> + write_ptr_field!(place, nr_hw_queues , nr_hw_queues);
> + write_ptr_field!(place, timeout , 0); // 0 means default which is 30 * HZ in C
> + write_ptr_field!(place, numa_node , bindings::NUMA_NO_NODE);
> + write_ptr_field!(place, queue_depth , num_tags);
> + write_ptr_field!(place, cmd_size , core::mem::size_of::<RequestDataWrapper>().try_into()?);
> + write_ptr_field!(place, flags , bindings::BLK_MQ_F_SHOULD_MERGE);
> + write_ptr_field!(place, driver_data , core::ptr::null_mut::<core::ffi::c_void>());
> + write_ptr_field!(place, nr_maps , num_maps);

Did something not work with my suggestion?

---
Cheers,
Benno

> +
> + // SAFETY: Relevant fields of `place` are initialised above
> + let ret = bindings::blk_mq_alloc_tag_set(place);
> + if ret < 0 {
> + return Err(Error::from_errno(ret));
> + }
> +
> + Ok(())
> + })},
> + _p: PhantomData,
> + })
> + }

[...]