Re: [PATCH v2 1/3] lightnvm: pblk: rework write error recovery path
From: Javier Gonzalez
Date: Mon Apr 30 2018 - 05:14:09 EST
> On 24 Apr 2018, at 07.45, Hans Holmberg <hans.ml.holmberg@xxxxxxxxxxxxx> wrote:
>
> From: Hans Holmberg <hans.holmberg@xxxxxxxxxxxx>
>
> The write error recovery path is incomplete, so rework
> the write error recovery handling to do resubmits directly
> from the write buffer.
>
> When a write error occurs, the remaining sectors in the chunk are
> mapped out and invalidated, and the request is inserted in a resubmit list.
>
> The writer thread checks if there are any requests to resubmit,
> scans and invalidates any lbas that have been overwritten by later
> writes and resubmits the failed entries.
>
> Signed-off-by: Hans Holmberg <hans.holmberg@xxxxxxxxxxxx>
> ---
> drivers/lightnvm/pblk-init.c | 2 +
> drivers/lightnvm/pblk-rb.c | 39 ------
> drivers/lightnvm/pblk-recovery.c | 91 -------------
> drivers/lightnvm/pblk-write.c | 267 ++++++++++++++++++++++++++-------------
> drivers/lightnvm/pblk.h | 11 +-
> 5 files changed, 181 insertions(+), 229 deletions(-)
>
> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
> index bfc488d..6f06727 100644
> --- a/drivers/lightnvm/pblk-init.c
> +++ b/drivers/lightnvm/pblk-init.c
> @@ -426,6 +426,7 @@ static int pblk_core_init(struct pblk *pblk)
> goto free_r_end_wq;
>
> INIT_LIST_HEAD(&pblk->compl_list);
> + INIT_LIST_HEAD(&pblk->resubmit_list);
>
> return 0;
>
> @@ -1185,6 +1186,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
> pblk->state = PBLK_STATE_RUNNING;
> pblk->gc.gc_enabled = 0;
>
> + spin_lock_init(&pblk->resubmit_lock);
> spin_lock_init(&pblk->trans_lock);
> spin_lock_init(&pblk->lock);
>
> diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
> index 024a366..00cd1f2 100644
> --- a/drivers/lightnvm/pblk-rb.c
> +++ b/drivers/lightnvm/pblk-rb.c
> @@ -503,45 +503,6 @@ int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
> }
>
> /*
> - * The caller of this function must ensure that the backpointer will not
> - * overwrite the entries passed on the list.
> - */
> -unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
> - struct list_head *list,
> - unsigned int max)
> -{
> - struct pblk_rb_entry *entry, *tentry;
> - struct page *page;
> - unsigned int read = 0;
> - int ret;
> -
> - list_for_each_entry_safe(entry, tentry, list, index) {
> - if (read > max) {
> - pr_err("pblk: too many entries on list\n");
> - goto out;
> - }
> -
> - page = virt_to_page(entry->data);
> - if (!page) {
> - pr_err("pblk: could not allocate write bio page\n");
> - goto out;
> - }
> -
> - ret = bio_add_page(bio, page, rb->seg_size, 0);
> - if (ret != rb->seg_size) {
> - pr_err("pblk: could not add page to write bio\n");
> - goto out;
> - }
> -
> - list_del(&entry->index);
> - read++;
> - }
> -
> -out:
> - return read;
> -}
> -
> -/*
> * Read available entries on rb and add them to the given bio. To avoid a memory
> * copy, a page reference to the write buffer is used to be added to the bio.
> *
> diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
> index 9cb6d5d..5983428 100644
> --- a/drivers/lightnvm/pblk-recovery.c
> +++ b/drivers/lightnvm/pblk-recovery.c
> @@ -16,97 +16,6 @@
>
> #include "pblk.h"
>
> -void pblk_submit_rec(struct work_struct *work)
> -{
> - struct pblk_rec_ctx *recovery =
> - container_of(work, struct pblk_rec_ctx, ws_rec);
> - struct pblk *pblk = recovery->pblk;
> - struct nvm_rq *rqd = recovery->rqd;
> - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
> - struct bio *bio;
> - unsigned int nr_rec_secs;
> - unsigned int pgs_read;
> - int ret;
> -
> - nr_rec_secs = bitmap_weight((unsigned long int *)&rqd->ppa_status,
> - NVM_MAX_VLBA);
> -
> - bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
> -
> - bio->bi_iter.bi_sector = 0;
> - bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
> - rqd->bio = bio;
> - rqd->nr_ppas = nr_rec_secs;
> -
> - pgs_read = pblk_rb_read_to_bio_list(&pblk->rwb, bio, &recovery->failed,
> - nr_rec_secs);
> - if (pgs_read != nr_rec_secs) {
> - pr_err("pblk: could not read recovery entries\n");
> - goto err;
> - }
> -
> - if (pblk_setup_w_rec_rq(pblk, rqd, c_ctx)) {
> - pr_err("pblk: could not setup recovery request\n");
> - goto err;
> - }
> -
> -#ifdef CONFIG_NVM_DEBUG
> - atomic_long_add(nr_rec_secs, &pblk->recov_writes);
> -#endif
> -
> - ret = pblk_submit_io(pblk, rqd);
> - if (ret) {
> - pr_err("pblk: I/O submission failed: %d\n", ret);
> - goto err;
> - }
> -
> - mempool_free(recovery, pblk->rec_pool);
> - return;
> -
> -err:
> - bio_put(bio);
> - pblk_free_rqd(pblk, rqd, PBLK_WRITE);
> -}
> -
> -int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
> - struct pblk_rec_ctx *recovery, u64 *comp_bits,
> - unsigned int comp)
> -{
> - struct nvm_rq *rec_rqd;
> - struct pblk_c_ctx *rec_ctx;
> - int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded;
> -
> - rec_rqd = pblk_alloc_rqd(pblk, PBLK_WRITE);
> - rec_ctx = nvm_rq_to_pdu(rec_rqd);
> -
> - /* Copy completion bitmap, but exclude the first X completed entries */
> - bitmap_shift_right((unsigned long int *)&rec_rqd->ppa_status,
> - (unsigned long int *)comp_bits,
> - comp, NVM_MAX_VLBA);
> -
> - /* Save the context for the entries that need to be re-written and
> - * update current context with the completed entries.
> - */
> - rec_ctx->sentry = pblk_rb_wrap_pos(&pblk->rwb, c_ctx->sentry + comp);
> - if (comp >= c_ctx->nr_valid) {
> - rec_ctx->nr_valid = 0;
> - rec_ctx->nr_padded = nr_entries - comp;
> -
> - c_ctx->nr_padded = comp - c_ctx->nr_valid;
> - } else {
> - rec_ctx->nr_valid = c_ctx->nr_valid - comp;
> - rec_ctx->nr_padded = c_ctx->nr_padded;
> -
> - c_ctx->nr_valid = comp;
> - c_ctx->nr_padded = 0;
> - }
> -
> - recovery->rqd = rec_rqd;
> - recovery->pblk = pblk;
> -
> - return 0;
> -}
> -
> int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf)
> {
> u32 crc;
> diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
> index 3e6f1eb..f62e432f 100644
> --- a/drivers/lightnvm/pblk-write.c
> +++ b/drivers/lightnvm/pblk-write.c
> @@ -103,68 +103,149 @@ static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd,
> pblk_rb_sync_end(&pblk->rwb, &flags);
> }
>
> -/* When a write fails, we are not sure whether the block has grown bad or a page
> - * range is more susceptible to write errors. If a high number of pages fail, we
> - * assume that the block is bad and we mark it accordingly. In all cases, we
> - * remap and resubmit the failed entries as fast as possible; if a flush is
> - * waiting on a completion, the whole stack would stall otherwise.
> - */
> -static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
> +/* Map remaining sectors in chunk, starting from ppa */
> +static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa)
> {
> - void *comp_bits = &rqd->ppa_status;
> - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
> - struct pblk_rec_ctx *recovery;
> - struct ppa_addr *ppa_list = rqd->ppa_list;
> - int nr_ppas = rqd->nr_ppas;
> - unsigned int c_entries;
> - int bit, ret;
> + struct nvm_tgt_dev *dev = pblk->dev;
> + struct nvm_geo *geo = &dev->geo;
> + struct pblk_line *line;
> + struct ppa_addr map_ppa = *ppa;
> + u64 paddr;
> + int done = 0;
>
> - if (unlikely(nr_ppas == 1))
> - ppa_list = &rqd->ppa_addr;
> + line = &pblk->lines[pblk_ppa_to_line(*ppa)];
> + spin_lock(&line->lock);
>
> - recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
> + while (!done) {
> + paddr = pblk_dev_ppa_to_line_addr(pblk, map_ppa);
>
> - INIT_LIST_HEAD(&recovery->failed);
> + if (!test_and_set_bit(paddr, line->map_bitmap))
> + line->left_msecs--;
>
> - bit = -1;
> - while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
> - struct pblk_rb_entry *entry;
> - struct ppa_addr ppa;
> + if (!test_and_set_bit(paddr, line->invalid_bitmap))
> + le32_add_cpu(line->vsc, -1);
>
> - /* Logic error */
> - if (bit > c_ctx->nr_valid) {
> - WARN_ONCE(1, "pblk: corrupted write request\n");
> - mempool_free(recovery, pblk->rec_pool);
> - goto out;
> + if (geo->version == NVM_OCSSD_SPEC_12) {
> + map_ppa.ppa++;
> + if (map_ppa.g.pg == geo->num_pg)
> + done = 1;
> + } else {
> + map_ppa.m.sec++;
> + if (map_ppa.m.sec == geo->clba)
> + done = 1;
> }
> + }
>
> - ppa = ppa_list[bit];
> - entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa);
> - if (!entry) {
> - pr_err("pblk: could not scan entry on write failure\n");
> - mempool_free(recovery, pblk->rec_pool);
> - goto out;
> - }
> + spin_unlock(&line->lock);
> +}
> +
> +static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry,
> + unsigned int nr_entries)
> +{
> + struct pblk_rb *rb = &pblk->rwb;
> + struct pblk_rb_entry *entry;
> + struct pblk_line *line;
> + struct pblk_w_ctx *w_ctx;
> + struct ppa_addr ppa_l2p;
> + int flags;
> + unsigned int pos, i;
> +
> + spin_lock(&pblk->trans_lock);
> + pos = sentry;
> + for (i = 0; i < nr_entries; i++) {
> + entry = &rb->entries[pos];
> + w_ctx = &entry->w_ctx;
> +
> + /* Check if the lba has been overwritten */
> + ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba);
> + if (!pblk_ppa_comp(ppa_l2p, entry->cacheline))
> + w_ctx->lba = ADDR_EMPTY;
> +
> + /* Mark up the entry as submittable again */
> + flags = READ_ONCE(w_ctx->flags);
> + flags |= PBLK_WRITTEN_DATA;
> + /* Release flags on write context. Protect from writes */
> + smp_store_release(&w_ctx->flags, flags);
>
> - /* The list is filled first and emptied afterwards. No need for
> - * protecting it with a lock
> + /* Decrease the reference count to the line as we will
> + * re-map these entries
> */
> - list_add_tail(&entry->index, &recovery->failed);
> + line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)];
> + kref_put(&line->ref, pblk_line_put);
> +
> + pos = (pos + 1) & (rb->nr_entries - 1);
> }
> + spin_unlock(&pblk->trans_lock);
> +}
>
> - c_entries = find_first_bit(comp_bits, nr_ppas);
> - ret = pblk_recov_setup_rq(pblk, c_ctx, recovery, comp_bits, c_entries);
> - if (ret) {
> - pr_err("pblk: could not recover from write failure\n");
> - mempool_free(recovery, pblk->rec_pool);
> - goto out;
> +static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx)
> +{
> + struct pblk_c_ctx *r_ctx;
> +
> + r_ctx = kzalloc(sizeof(struct pblk_c_ctx), GFP_KERNEL);
> + if (!r_ctx)
> + return;
> +
> + r_ctx->lun_bitmap = NULL;
> + r_ctx->sentry = c_ctx->sentry;
> + r_ctx->nr_valid = c_ctx->nr_valid;
> + r_ctx->nr_padded = c_ctx->nr_padded;
> +
> + spin_lock(&pblk->resubmit_lock);
> + list_add_tail(&r_ctx->list, &pblk->resubmit_list);
> + spin_unlock(&pblk->resubmit_lock);
> +
> +#ifdef CONFIG_NVM_DEBUG
> + atomic_long_add(c_ctx->nr_valid, &pblk->recov_writes);
> +#endif
> +}
> +
> +static void pblk_submit_rec(struct work_struct *work)
> +{
> + struct pblk_rec_ctx *recovery =
> + container_of(work, struct pblk_rec_ctx, ws_rec);
> + struct pblk *pblk = recovery->pblk;
> + struct nvm_rq *rqd = recovery->rqd;
> + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
> + struct ppa_addr *ppa_list;
> +
> + pblk_log_write_err(pblk, rqd);
> +
> + if (rqd->nr_ppas == 1)
> + ppa_list = &rqd->ppa_addr;
> + else
> + ppa_list = rqd->ppa_list;
> +
> + pblk_map_remaining(pblk, ppa_list);
> + pblk_queue_resubmit(pblk, c_ctx);
> +
> + pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap);
> + if (c_ctx->nr_padded)
> + pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid,
> + c_ctx->nr_padded);
> + bio_put(rqd->bio);
> + pblk_free_rqd(pblk, rqd, PBLK_WRITE);
> + mempool_free(recovery, pblk->rec_pool);
> +
> + atomic_dec(&pblk->inflight_io);
> +}
> +
> +
> +static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
> +{
> + struct pblk_rec_ctx *recovery;
> +
> + recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
> + if (!recovery) {
> + pr_err("pblk: could not allocate recovery work\n");
> + return;
> }
>
> + recovery->pblk = pblk;
> + recovery->rqd = rqd;
> +
> INIT_WORK(&recovery->ws_rec, pblk_submit_rec);
> queue_work(pblk->close_wq, &recovery->ws_rec);
> -
> -out:
> - pblk_complete_write(pblk, rqd, c_ctx);
> }
>
> static void pblk_end_io_write(struct nvm_rq *rqd)
> @@ -173,8 +254,8 @@ static void pblk_end_io_write(struct nvm_rq *rqd)
> struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
>
> if (rqd->error) {
> - pblk_log_write_err(pblk, rqd);
> - return pblk_end_w_fail(pblk, rqd);
> + pblk_end_w_fail(pblk, rqd);
> + return;
> }
> #ifdef CONFIG_NVM_DEBUG
> else
> @@ -266,31 +347,6 @@ static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
> return 0;
> }
>
> -int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
> - struct pblk_c_ctx *c_ctx)
> -{
> - struct pblk_line_meta *lm = &pblk->lm;
> - unsigned long *lun_bitmap;
> - int ret;
> -
> - lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
> - if (!lun_bitmap)
> - return -ENOMEM;
> -
> - c_ctx->lun_bitmap = lun_bitmap;
> -
> - ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas, pblk_end_io_write);
> - if (ret)
> - return ret;
> -
> - pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0);
> -
> - rqd->ppa_status = (u64)0;
> - rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
> -
> - return ret;
> -}
> -
> static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
> unsigned int secs_to_flush)
> {
> @@ -339,6 +395,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
> bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
> l_mg->emeta_alloc_type, GFP_KERNEL);
> if (IS_ERR(bio)) {
> + pr_err("pblk: failed to map emeta io");
> ret = PTR_ERR(bio);
> goto fail_free_rqd;
> }
> @@ -515,26 +572,54 @@ static int pblk_submit_write(struct pblk *pblk)
> unsigned int secs_avail, secs_to_sync, secs_to_com;
> unsigned int secs_to_flush;
> unsigned long pos;
> + unsigned int resubmit;
>
> - /* If there are no sectors in the cache, flushes (bios without data)
> - * will be cleared on the cache threads
> - */
> - secs_avail = pblk_rb_read_count(&pblk->rwb);
> - if (!secs_avail)
> - return 1;
> -
> - secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
> - if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
> - return 1;
> -
> - secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
> - if (secs_to_sync > pblk->max_write_pgs) {
> - pr_err("pblk: bad buffer sync calculation\n");
> - return 1;
> - }
> + spin_lock(&pblk->resubmit_lock);
> + resubmit = !list_empty(&pblk->resubmit_list);
> + spin_unlock(&pblk->resubmit_lock);
> +
> + /* Resubmit failed writes first */
> + if (resubmit) {
> + struct pblk_c_ctx *r_ctx;
> +
> + spin_lock(&pblk->resubmit_lock);
> + r_ctx = list_first_entry(&pblk->resubmit_list,
> + struct pblk_c_ctx, list);
> + list_del(&r_ctx->list);
> + spin_unlock(&pblk->resubmit_lock);
> +
> + secs_avail = r_ctx->nr_valid;
> + pos = r_ctx->sentry;
> +
> + pblk_prepare_resubmit(pblk, pos, secs_avail);
> + secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
> + secs_avail);
>
> - secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
> - pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
> + kfree(r_ctx);
> + } else {
> + /* If there are no sectors in the cache,
> + * flushes (bios without data) will be cleared on
> + * the cache threads
> + */
> + secs_avail = pblk_rb_read_count(&pblk->rwb);
> + if (!secs_avail)
> + return 1;
> +
> + secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
> + if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
> + return 1;
> +
> + secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
> + secs_to_flush);
> + if (secs_to_sync > pblk->max_write_pgs) {
> + pr_err("pblk: bad buffer sync calculation\n");
> + return 1;
> + }
> +
> + secs_to_com = (secs_to_sync > secs_avail) ?
> + secs_avail : secs_to_sync;
> + pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
> + }
>
> bio = bio_alloc(GFP_KERNEL, secs_to_sync);
>
> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
> index 9838d03..f8434a3 100644
> --- a/drivers/lightnvm/pblk.h
> +++ b/drivers/lightnvm/pblk.h
> @@ -128,7 +128,6 @@ struct pblk_pad_rq {
> struct pblk_rec_ctx {
> struct pblk *pblk;
> struct nvm_rq *rqd;
> - struct list_head failed;
> struct work_struct ws_rec;
> };
>
> @@ -664,6 +663,9 @@ struct pblk {
>
> struct list_head compl_list;
>
> + spinlock_t resubmit_lock; /* Resubmit list lock */
> + struct list_head resubmit_list; /* Resubmit list for failed writes*/
> +
> mempool_t *page_bio_pool;
> mempool_t *gen_ws_pool;
> mempool_t *rec_pool;
> @@ -713,9 +715,6 @@ void pblk_rb_sync_l2p(struct pblk_rb *rb);
> unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
> unsigned int pos, unsigned int nr_entries,
> unsigned int count);
> -unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
> - struct list_head *list,
> - unsigned int max);
> int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
> struct ppa_addr ppa, int bio_iter, bool advanced_bio);
> unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries);
> @@ -849,13 +848,9 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
> /*
> * pblk recovery
> */
> -void pblk_submit_rec(struct work_struct *work);
> struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
> int pblk_recov_pad(struct pblk *pblk);
> int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta);
> -int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
> - struct pblk_rec_ctx *recovery, u64 *comp_bits,
> - unsigned int comp);
>
> /*
> * pblk gc
> --
> 2.7.4
LGTM
Reviewed-by: Javier González <javier@xxxxxxxxxxxx>
Attachment:
signature.asc
Description: Message signed with OpenPGP