Re: [PATCH 1/3] lightnvm: pblk: rework write error recovery path

From: Hans Holmberg
Date: Mon Apr 23 2018 - 08:40:05 EST


On Fri, Apr 20, 2018 at 9:38 PM, Javier Gonzalez <javier@xxxxxxxxxxxx> wrote:
>> On 19 Apr 2018, at 09.39, Hans Holmberg <hans.ml.holmberg@xxxxxxxxxxxxx> wrote:
>>
>> From: Hans Holmberg <hans.holmberg@xxxxxxxxxxxx>
>>
>> The write error recovery path is incomplete, so rework
>> the write error recovery handling to do resubmits directly
>> from the write buffer.
>>
>> When a write error occurs, the remaining sectors in the chunk are
>> mapped out and invalidated and the request inserted in a resubmit list.
>>
>> The writer thread checks if there are any requests to resubmit,
>> scans and invalidates any lbas that have been overwritten by later
>> writes and resubmits the failed entries.
>>
>> Signed-off-by: Hans Holmberg <hans.holmberg@xxxxxxxxxxxx>
>> ---
>> drivers/lightnvm/pblk-init.c | 2 +
>> drivers/lightnvm/pblk-recovery.c | 91 ---------------
>> drivers/lightnvm/pblk-write.c | 241 ++++++++++++++++++++++++++++-----------
>> drivers/lightnvm/pblk.h | 8 +-
>> 4 files changed, 180 insertions(+), 162 deletions(-)
>>
>> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
>> index bfc488d..6f06727 100644
>> --- a/drivers/lightnvm/pblk-init.c
>> +++ b/drivers/lightnvm/pblk-init.c
>> @@ -426,6 +426,7 @@ static int pblk_core_init(struct pblk *pblk)
>> goto free_r_end_wq;
>>
>> INIT_LIST_HEAD(&pblk->compl_list);
>> + INIT_LIST_HEAD(&pblk->resubmit_list);
>>
>> return 0;
>>
>> @@ -1185,6 +1186,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
>> pblk->state = PBLK_STATE_RUNNING;
>> pblk->gc.gc_enabled = 0;
>>
>> + spin_lock_init(&pblk->resubmit_lock);
>> spin_lock_init(&pblk->trans_lock);
>> spin_lock_init(&pblk->lock);
>>
>> diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
>> index 9cb6d5d..5983428 100644
>> --- a/drivers/lightnvm/pblk-recovery.c
>> +++ b/drivers/lightnvm/pblk-recovery.c
>> @@ -16,97 +16,6 @@
>>
>> #include "pblk.h"
>>
>> -void pblk_submit_rec(struct work_struct *work)
>> -{
>> - struct pblk_rec_ctx *recovery =
>> - container_of(work, struct pblk_rec_ctx, ws_rec);
>> - struct pblk *pblk = recovery->pblk;
>> - struct nvm_rq *rqd = recovery->rqd;
>> - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
>> - struct bio *bio;
>> - unsigned int nr_rec_secs;
>> - unsigned int pgs_read;
>> - int ret;
>> -
>> - nr_rec_secs = bitmap_weight((unsigned long int *)&rqd->ppa_status,
>> - NVM_MAX_VLBA);
>> -
>> - bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
>> -
>> - bio->bi_iter.bi_sector = 0;
>> - bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
>> - rqd->bio = bio;
>> - rqd->nr_ppas = nr_rec_secs;
>> -
>> - pgs_read = pblk_rb_read_to_bio_list(&pblk->rwb, bio, &recovery->failed,
>> - nr_rec_secs);
>
> Please, remove functions that are not longer used. Doing a pass on the
> rest of the removed functions would be a good idea.

Yes, thanks.

>
>> - if (pgs_read != nr_rec_secs) {
>> - pr_err("pblk: could not read recovery entries\n");
>> - goto err;
>> - }
>> -
>> - if (pblk_setup_w_rec_rq(pblk, rqd, c_ctx)) {
>
> Same here

I'll clean it up.

>> -
>> -#ifdef CONFIG_NVM_DEBUG
>> - atomic_long_add(nr_rec_secs, &pblk->recov_writes);
>> -#endif
>
> Can you add this debug counter to the new path? I see you added other
> counters, if it is a rename, can you put it on a separate patch?

Thanks for catching the lost recov counter update, what other counters
are you referring to?

>
>> -
>> - ret = pblk_submit_io(pblk, rqd);
>> - if (ret) {
>> - pr_err("pblk: I/O submission failed: %d\n", ret);
>> - goto err;
>> - }
>> -
>> - mempool_free(recovery, pblk->rec_pool);
>> - return;
>> -
>> -err:
>> - bio_put(bio);
>> - pblk_free_rqd(pblk, rqd, PBLK_WRITE);
>> -}
>> -
>> -int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
>> - struct pblk_rec_ctx *recovery, u64 *comp_bits,
>> - unsigned int comp)
>> -{
>> - struct nvm_rq *rec_rqd;
>> - struct pblk_c_ctx *rec_ctx;
>> - int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded;
>> -
>> - rec_rqd = pblk_alloc_rqd(pblk, PBLK_WRITE);
>> - rec_ctx = nvm_rq_to_pdu(rec_rqd);
>> -
>> - /* Copy completion bitmap, but exclude the first X completed entries */
>> - bitmap_shift_right((unsigned long int *)&rec_rqd->ppa_status,
>> - (unsigned long int *)comp_bits,
>> - comp, NVM_MAX_VLBA);
>> -
>> - /* Save the context for the entries that need to be re-written and
>> - * update current context with the completed entries.
>> - */
>> - rec_ctx->sentry = pblk_rb_wrap_pos(&pblk->rwb, c_ctx->sentry + comp);
>> - if (comp >= c_ctx->nr_valid) {
>> - rec_ctx->nr_valid = 0;
>> - rec_ctx->nr_padded = nr_entries - comp;
>> -
>> - c_ctx->nr_padded = comp - c_ctx->nr_valid;
>> - } else {
>> - rec_ctx->nr_valid = c_ctx->nr_valid - comp;
>> - rec_ctx->nr_padded = c_ctx->nr_padded;
>> -
>> - c_ctx->nr_valid = comp;
>> - c_ctx->nr_padded = 0;
>> - }
>> -
>> - recovery->rqd = rec_rqd;
>> - recovery->pblk = pblk;
>> -
>> - return 0;
>> -}
>> -
>> int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf)
>> {
>> u32 crc;
>> diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
>> index 3e6f1eb..ab45157 100644
>> --- a/drivers/lightnvm/pblk-write.c
>> +++ b/drivers/lightnvm/pblk-write.c
>> @@ -103,68 +103,147 @@ static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd,
>> pblk_rb_sync_end(&pblk->rwb, &flags);
>> }
>>
>> -/* When a write fails, we are not sure whether the block has grown bad or a page
>> - * range is more susceptible to write errors. If a high number of pages fail, we
>> - * assume that the block is bad and we mark it accordingly. In all cases, we
>> - * remap and resubmit the failed entries as fast as possible; if a flush is
>> - * waiting on a completion, the whole stack would stall otherwise.
>> - */
>> -static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
>> +/* Map remaining sectors in chunk, starting from ppa */
>> +static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa)
>> {
>> - void *comp_bits = &rqd->ppa_status;
>> - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
>> - struct pblk_rec_ctx *recovery;
>> - struct ppa_addr *ppa_list = rqd->ppa_list;
>> - int nr_ppas = rqd->nr_ppas;
>> - unsigned int c_entries;
>> - int bit, ret;
>> + struct nvm_tgt_dev *dev = pblk->dev;
>> + struct nvm_geo *geo = &dev->geo;
>> + struct pblk_line *line;
>> + struct ppa_addr map_ppa = *ppa;
>> + u64 paddr;
>> + int done = 0;
>>
>> - if (unlikely(nr_ppas == 1))
>> - ppa_list = &rqd->ppa_addr;
>> + line = &pblk->lines[pblk_ppa_to_line(*ppa)];
>> + spin_lock(&line->lock);
>>
>> - recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
>> + while (!done) {
>> + paddr = pblk_dev_ppa_to_line_addr(pblk, map_ppa);
>>
>> - INIT_LIST_HEAD(&recovery->failed);
>> + if (!test_and_set_bit(paddr, line->map_bitmap))
>> + line->left_msecs--;
>>
>> - bit = -1;
>> - while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
>> - struct pblk_rb_entry *entry;
>> - struct ppa_addr ppa;
>> + if (!test_and_set_bit(paddr, line->invalid_bitmap))
>> + le32_add_cpu(line->vsc, -1);
>>
>> - /* Logic error */
>> - if (bit > c_ctx->nr_valid) {
>> - WARN_ONCE(1, "pblk: corrupted write request\n");
>> - mempool_free(recovery, pblk->rec_pool);
>> - goto out;
>> + if (geo->version == NVM_OCSSD_SPEC_12) {
>> + map_ppa.ppa++;
>> + if (map_ppa.g.pg == geo->num_pg)
>> + done = 1;
>> + } else {
>> + map_ppa.m.sec++;
>> + if (map_ppa.m.sec == geo->clba)
>> + done = 1;
>> }
>> + }
>>
>> - ppa = ppa_list[bit];
>> - entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa);
>> - if (!entry) {
>> - pr_err("pblk: could not scan entry on write failure\n");
>> - mempool_free(recovery, pblk->rec_pool);
>> - goto out;
>> - }
>> + spin_unlock(&line->lock);
>> +}
>> +
>> +static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry,
>> + unsigned int nr_entries)
>
> Can you align this?

Done.

>
>> +{
>> + struct pblk_rb *rb = &pblk->rwb;
>> + struct pblk_rb_entry *entry;
>> + struct pblk_line *line;
>> + struct pblk_w_ctx *w_ctx;
>> + struct ppa_addr ppa_l2p;
>> + int flags;
>> + unsigned int pos, i;
>> +
>> + spin_lock(&pblk->trans_lock);
>> + pos = sentry;
>> + for (i = 0; i < nr_entries; i++) {
>> + entry = &rb->entries[pos];
>> + w_ctx = &entry->w_ctx;
>> +
>> + /* Check if the lba has been overwritten */
>> + ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba);
>> + if (!pblk_ppa_comp(ppa_l2p, entry->cacheline))
>> + w_ctx->lba = ADDR_EMPTY;
>> +
>> + /* Mark up the entry as submittable again */
>> + flags = READ_ONCE(w_ctx->flags);
>> + flags |= PBLK_WRITTEN_DATA;
>> + /* Release flags on write context. Protect from writes */
>> + smp_store_release(&w_ctx->flags, flags);
>>
>> - /* The list is filled first and emptied afterwards. No need for
>> - * protecting it with a lock
>> + /* Decrese the reference count to the line as we will
>> + * re-map these entries
>> */
>> - list_add_tail(&entry->index, &recovery->failed);
>> + line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)];
>> + kref_put(&line->ref, pblk_line_put);
>> +
>> + pos = (pos + 1) & (rb->nr_entries - 1);
>> }
>> + spin_unlock(&pblk->trans_lock);
>> +}
>>
>> - c_entries = find_first_bit(comp_bits, nr_ppas);
>> - ret = pblk_recov_setup_rq(pblk, c_ctx, recovery, comp_bits, c_entries);
>> - if (ret) {
>> - pr_err("pblk: could not recover from write failure\n");
>> - mempool_free(recovery, pblk->rec_pool);
>> - goto out;
>> +static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx)
>> +{
>> + struct pblk_c_ctx *r_ctx;
>> +
>> + r_ctx = kzalloc(sizeof(struct pblk_c_ctx), GFP_KERNEL);
>> + if (!r_ctx) {
>> + pr_err("pblk: failed to allocate resubmit context");
>
> No need to print allocation failures - I know there are a few of these
> in the code, but they should be removed.

Yep, checkpatch warns on these. Done.

>
>> + return;
>> }
>>
>> + r_ctx->lun_bitmap = NULL;
>> + r_ctx->sentry = c_ctx->sentry;
>> + r_ctx->nr_valid = c_ctx->nr_valid;
>> + r_ctx->nr_padded = c_ctx->nr_padded;
>> +
>> + spin_lock(&pblk->resubmit_lock);
>> + list_add_tail(&r_ctx->list, &pblk->resubmit_list);
>> + spin_unlock(&pblk->resubmit_lock);
>> +}
>> +
>> +static void pblk_submit_rec(struct work_struct *work)
>> +{
>> + struct pblk_rec_ctx *recovery =
>> + container_of(work, struct pblk_rec_ctx, ws_rec);
>> + struct pblk *pblk = recovery->pblk;
>> + struct nvm_rq *rqd = recovery->rqd;
>> + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
>> + struct ppa_addr *ppa_list;
>> +
>> + pblk_log_write_err(pblk, rqd);
>> +
>> + if (rqd->nr_ppas == 1)
>> + ppa_list = &rqd->ppa_addr;
>> + else
>> + ppa_list = rqd->ppa_list;
>> +
>> + pblk_map_remaining(pblk, ppa_list);
>> + pblk_queue_resubmit(pblk, c_ctx);
>> +
>> + pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap);
>> + if (c_ctx->nr_padded)
>> + pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid,
>> + c_ctx->nr_padded);
>> + bio_put(rqd->bio);
>> + pblk_free_rqd(pblk, rqd, PBLK_WRITE);
>> + mempool_free(recovery, pblk->rec_pool);
>> +
>> + atomic_dec(&pblk->inflight_io);
>> +}
>> +
>> +
>> +static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
>> +{
>> + struct pblk_rec_ctx *recovery;
>> +
>> + recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
>> + if (!recovery) {
>> + pr_err("pblk: could not allocate recovery work\n");
>> + return;
>> + }
>> +
>> + recovery->pblk = pblk;
>> + recovery->rqd = rqd;
>> +
>> INIT_WORK(&recovery->ws_rec, pblk_submit_rec);
>> queue_work(pblk->close_wq, &recovery->ws_rec);
>> -
>> -out:
>> - pblk_complete_write(pblk, rqd, c_ctx);
>> }
>>
>> static void pblk_end_io_write(struct nvm_rq *rqd)
>> @@ -173,8 +252,8 @@ static void pblk_end_io_write(struct nvm_rq *rqd)
>> struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
>>
>> if (rqd->error) {
>> - pblk_log_write_err(pblk, rqd);
>> - return pblk_end_w_fail(pblk, rqd);
>> + pblk_end_w_fail(pblk, rqd);
>> + return;
>> }
>> #ifdef CONFIG_NVM_DEBUG
>> else
>> @@ -339,6 +418,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
>> bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
>> l_mg->emeta_alloc_type, GFP_KERNEL);
>> if (IS_ERR(bio)) {
>> + pr_err("pblk: failed to map emeta io");
>> ret = PTR_ERR(bio);
>> goto fail_free_rqd;
>> }
>> @@ -515,26 +595,54 @@ static int pblk_submit_write(struct pblk *pblk)
>> unsigned int secs_avail, secs_to_sync, secs_to_com;
>> unsigned int secs_to_flush;
>> unsigned long pos;
>> + unsigned int resubmit;
>>
>> - /* If there are no sectors in the cache, flushes (bios without data)
>> - * will be cleared on the cache threads
>> - */
>> - secs_avail = pblk_rb_read_count(&pblk->rwb);
>> - if (!secs_avail)
>> - return 1;
>> -
>> - secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
>> - if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
>> - return 1;
>> -
>> - secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
>> - if (secs_to_sync > pblk->max_write_pgs) {
>> - pr_err("pblk: bad buffer sync calculation\n");
>> - return 1;
>> - }
>> + spin_lock(&pblk->resubmit_lock);
>> + resubmit = !list_empty(&pblk->resubmit_list);
>> + spin_unlock(&pblk->resubmit_lock);
>> +
>> + /* Resubmit failed writes first */
>> + if (resubmit) {
>> + struct pblk_c_ctx *r_ctx;
>> +
>> + spin_lock(&pblk->resubmit_lock);
>> + r_ctx = list_first_entry(&pblk->resubmit_list,
>> + struct pblk_c_ctx, list);
>> + list_del(&r_ctx->list);
>> + spin_unlock(&pblk->resubmit_lock);
>>
>> - secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
>> - pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
>> + secs_avail = r_ctx->nr_valid;
>> + pos = r_ctx->sentry;
>> +
>> + pblk_prepare_resubmit(pblk, pos, secs_avail);
>> + secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
>> + secs_avail);
>> +
>> + kfree(r_ctx);
>> + } else {
>> + /* If there are no sectors in the cache,
>> + * flushes (bios without data) will be cleared on
>> + * the cache threads
>> + */
>> + secs_avail = pblk_rb_read_count(&pblk->rwb);
>> + if (!secs_avail)
>> + return 1;
>> +
>> + secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
>> + if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
>> + return 1;
>> +
>> + secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
>> + secs_to_flush);
>> + if (secs_to_sync > pblk->max_write_pgs) {
>> + pr_err("pblk: bad buffer sync calculation\n");
>> + return 1;
>> + }
>> +
>> + secs_to_com = (secs_to_sync > secs_avail) ?
>> + secs_avail : secs_to_sync;
>> + pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
>> + }
>>
>> bio = bio_alloc(GFP_KERNEL, secs_to_sync);
>>
>> @@ -553,6 +661,7 @@ static int pblk_submit_write(struct pblk *pblk)
>> if (pblk_submit_io_set(pblk, rqd))
>> goto fail_free_bio;
>>
>> +
>
> No need for the extra line

Removed.

>
>> #ifdef CONFIG_NVM_DEBUG
>> atomic_long_add(secs_to_sync, &pblk->sub_writes);
>> #endif
>> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
>> index 9838d03..cff6aea 100644
>> --- a/drivers/lightnvm/pblk.h
>> +++ b/drivers/lightnvm/pblk.h
>> @@ -128,7 +128,6 @@ struct pblk_pad_rq {
>> struct pblk_rec_ctx {
>> struct pblk *pblk;
>> struct nvm_rq *rqd;
>> - struct list_head failed;
>> struct work_struct ws_rec;
>> };
>>
>> @@ -664,6 +663,9 @@ struct pblk {
>>
>> struct list_head compl_list;
>>
>> + spinlock_t resubmit_lock; /* Resubmit list lock */
>> + struct list_head resubmit_list; /* Resubmit list for failed writes*/
>> +
>> mempool_t *page_bio_pool;
>> mempool_t *gen_ws_pool;
>> mempool_t *rec_pool;
>> @@ -849,13 +851,9 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
>> /*
>> * pblk recovery
>> */
>> -void pblk_submit_rec(struct work_struct *work);
>> struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
>> int pblk_recov_pad(struct pblk *pblk);
>> int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta);
>> -int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
>> - struct pblk_rec_ctx *recovery, u64 *comp_bits,
>> - unsigned int comp);
>>
>> /*
>> * pblk gc
>> --
>> 2.7.4
>
> Otherwise, it looks good to me.

Great, thanks for the review!