Re: [RFC PATCH 07/13] scsi: scsi_dh: ufshpb: Add ufshpb state machine

From: Bart Van Assche
Date: Fri May 15 2020 - 22:44:50 EST


On 2020-05-15 03:30, Avri Altman wrote:
> @@ -17,6 +18,13 @@
>
> #define UFSHPB_NAME "ufshpb"
>
> +#define UFSHPB_WRITE_BUFFER (0xfa)
> +#define WRITE_BUFFER_TIMEOUT (3 * HZ)
> +#define WRITE_BUFFER_RETRIES (3)
> +#define UFSHPB_READ_BUFFER (0xf9)
> +#define READ_BUFFER_TIMEOUT (3 * HZ)
> +#define READ_BUFFER_RETRIES (3)

Parentheses around expressions are normal but parentheses around
constants are unusual. I think the parentheses around constants can be
left out.

> +#define to_subregion() (container_of(work, struct ufshpb_subregion, hpb_work))

Could this have been defined as an inline function?

> @@ -76,6 +118,7 @@ struct ufshpb_subregion {
> * @writes - sum over subregions @writes
> * @region - region index
> * @active_subregions - actual active subregions
> + * @evicted - to indicated if this region is currently being evicted
> */
> struct ufshpb_region {
> struct ufshpb_subregion *subregion_tbl;
> @@ -85,6 +128,7 @@ struct ufshpb_region {
> unsigned int region;
>
> atomic_t active_subregions;
> + atomic_t evicted;
> };

Declaring a state variable as atomic_t is unusual. How are changes of
the @evicted member variable serialized?

> /**
> @@ -93,6 +137,7 @@ struct ufshpb_region {
> * @lh_map_ctx - list head of mapping context
> * @map_list_lock - to protect mapping context list operations
> * @region_tbl - regions/subregions table
> + * @pinned_map - to mark pinned regions
> * @sdev - scsi device for that lun
> * @regions_per_lun
> * @subregions_per_lun - lun size is not guaranteed to be region aligned
> @@ -105,6 +150,7 @@ struct ufshpb_dh_lun {
> struct list_head lh_map_ctx;
> spinlock_t map_list_lock;
> struct ufshpb_region *region_tbl;
> + unsigned long *pinned_map;
> struct scsi_device *sdev;
>
> unsigned int regions_per_lun;
> @@ -113,6 +159,10 @@ struct ufshpb_dh_lun {
> unsigned int max_active_regions;
>
> atomic_t active_regions;
> +
> + struct mutex eviction_lock;
> +
> + struct workqueue_struct *wq;
> };

Please document what the eviction_lock protects.

> +static inline void ufshpb_set_write_buf_cmd(unsigned char *cmd,
> + unsigned int region)
> +{
> + cmd[0] = UFSHPB_WRITE_BUFFER;
> + cmd[1] = 0x01;
> + put_unaligned_be16(region, &cmd[2]);
> +}

Please follow the example of the sd driver and use the verb "setup"
instead of "set" for functions that initialize a SCSI CDB.

> +static int ufshpb_submit_write_buf_cmd(struct scsi_device *sdev,
> + unsigned int region)
> +{
> + unsigned char cmd[10] = {};
> + struct scsi_sense_hdr sshdr = {};
> + u64 flags = REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
> + REQ_FAILFAST_DRIVER;
> + int timeout = WRITE_BUFFER_TIMEOUT;
> + int cmd_retries = WRITE_BUFFER_RETRIES;
> + int ret = 0;
> +
> + ufshpb_set_write_buf_cmd(cmd, region);
> +
> + ret = scsi_execute(sdev, cmd, DMA_NONE, NULL, 0, NULL, &sshdr,
> + timeout, cmd_retries, flags, 0, NULL);
> +
> + /* HPB spec does not define any error handling */
> + sdev_printk(KERN_INFO, sdev, "%s: WRITE_BUFFER %s result %d\n",
> + UFSHPB_NAME, ret ? "failed" : "succeeded", ret);
> +
> + return ret;
> +}

I don't think that unconditionally printing the result of the WRITE
BUFFER command is acceptable. How about only reporting failures?

> +static void ufshpb_set_read_buf_cmd(unsigned char *cmd, unsigned int region,
> + unsigned int subregion,
> + unsigned int alloc_len)
> +{
> + cmd[0] = UFSHPB_READ_BUFFER;
> + cmd[1] = 0x01;
> + put_unaligned_be16(region, &cmd[2]);
> + put_unaligned_be16(subregion, &cmd[4]);
> +
> + cmd[6] = alloc_len >> 16;
> + cmd[7] = (alloc_len >> 8) & 0xff;
> + cmd[8] = alloc_len & 0xff;
> + cmd[9] = 0x00;
> +}

Please use put_unaligned_be24() instead of open-coding it.

> +static int ufshpb_subregion_alloc_pages(struct ufshpb_dh_lun *hpb,
> + struct ufshpb_subregion *s)
> +{
> + struct ufshpb_map_ctx *mctx;
> +
> + spin_lock(&hpb->map_list_lock);
> + mctx = list_first_entry_or_null(&hpb->lh_map_ctx,
> + struct ufshpb_map_ctx, list);
> + if (!mctx) {
> + spin_unlock(&hpb->map_list_lock);
> + return -EINVAL;
> + }
> +
> + list_del_init(&mctx->list);
> + spin_unlock(&hpb->map_list_lock);
> +
> + s->mctx = mctx;
> + mctx->pages = (char *)__get_free_pages(GFP_KERNEL, order);
> + if (!mctx->pages)
> + return -ENOMEM;
> +
> + return 0;
> +}

Relying on higher-order page allocations is not acceptable because
memory gets fragmented easily, which makes such allocations likely to
fail. See also
https://elinux.org/images/a/a8/Controlling_Linux_Memory_Fragmentation_and_Higher_Order_Allocation_Failure-_Analysis%2C_Observations_and_Results.pdf.

> + hpb->pinned_map = kcalloc(BITS_TO_LONGS(hpb->regions_per_lun),
> + sizeof(unsigned long), GFP_KERNEL);

Is this perhaps an open-coded version of bitmap_alloc()? If so, please
use bitmap_alloc() instead.

> + snprintf(wq_name, ARRAY_SIZE(wq_name), "ufshpb_wq_%d", sdev->id);
> + wq = alloc_workqueue(wq_name, WQ_HIGHPRI, WQ_MAX_ACTIVE);
> + if (!wq) {
> + ret = -ENOMEM;
> + goto out_free;
> + }

What is the purpose of the ufshpb_wq_%d workqueues? Why allocate
dedicated workqueues instead of using one of the existing system
workqueues? If the scsi_execute() calls were changed to asynchronous
SCSI command submission, would these workqueues still be necessary?

Thanks,

Bart.