[PATCH] lightnvm: physical block device (pblk) target

From: Javier González
Date: Fri Nov 18 2016 - 10:03:37 EST


This patch introduces a new LightNVM target implementing a full
host-based Flash Translation Layer (FTL): pblk. It differs from the
existing rrpc target in that rrpc is a hybrid approach, where
the L2P table is maintained both on the host and on the device.

pblk operates around a ring write buffer. Apart from buffering writes,
this buffer makes it possible to respect controller constraints on how
flash pages must be written. The write buffer is complemented by a write
context buffer, which stores metadata for each 4KB write. With regard to
the mapping strategy, we follow a late-map approach, where the actual
mapping to flash pages is done when the write buffer is written to the
media. Logical addresses are mapped to physical flash pages in a
round-robin fashion across the active LUNs on the target.
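
As a rough sketch of the round-robin strategy (the function below is
made up for illustration; the actual mapping code lives in pblk-map.c):

    /* Illustrative only: pick the LUN for the next flash page,
     * wrapping around over the active LUNs.
     */
    static unsigned int example_next_lun(unsigned int prev_lun,
                                         unsigned int nr_active_luns)
    {
            return (prev_lun + 1) % nr_active_luns;
    }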

Apart from the typical head and tail pointers, the write buffer
maintains a number of additional pointers (sketched below):
- Submission pointer: Keeps track of the last entry submitted to the
media.
- Sync pointer: Keeps track of entries successfully stored on the
flash. It acts as a backpointer that guarantees that I/Os are
completed in order. This is necessary to guarantee that flushes are
completed when they should be. It also delegates some responsibility
to the block layer, since bios are completed in order.
- Sync-point pointer: Guarantees that flushes are respected.
- L2P-update pointer: Guarantees that lookups to cache entries will
point to their cache line after L2P mapping takes place, for as long
as they remain in the cache.
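
Schematically, the pointer set looks as follows (field names are
illustrative only, not the actual ring buffer definitions):

    /* Illustrative sketch of the write buffer pointers */
    struct example_rb_pointers {
            unsigned int mem;        /* head: next entry to be filled */
            unsigned int tail;       /* oldest entry still in use */
            unsigned int subm;       /* last entry submitted to the media */
            unsigned int sync;       /* entries persisted on flash */
            unsigned int sync_point; /* entry that completes a pending flush */
            unsigned int l2p_update; /* last entry whose L2P entry was updated */
    };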

The size of the ring buffer is the closest power-of-2 to the number of
active LUNs in the target multiplied by the size of a flash block.
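
Assuming round-up semantics, the computation amounts to the following
(illustrative only; the actual helper in the code is
pblk_rb_calculate_size()):

    #include <linux/log2.h>

    /* Illustrative only: write buffer size in 4KB entries */
    static unsigned long example_rb_size(unsigned int nr_luns,
                                         unsigned int secs_per_blk)
    {
            return roundup_pow_of_two(nr_luns * secs_per_blk);
    }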

pblk implements basic FTL functionality:
- GC: A simple cost-based garbage collector.
- Write recovery: If a block becomes bad after writes have
successfully been committed to it, the block is garbage collected,
marked as bad and returned to the media manager.
- Scan recovery: The last page of each block is used to store
metadata that allows the L2P table to be reconstructed in case of a
crash. This is the current recovery strategy until we implement L2P
snapshotting.
- Rate limiter: When GC starts, a rate limiter guarantees that write
buffer entries are reserved for both user I/O and GC I/O (sketched
below).
- Sysfs integration: Information about the target and its state is
exposed through sysfs. This includes statistics and open, closed and
bad blocks.
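
As an illustration of the rate-limiter idea (the real accounting lives
in pblk-rl.c; the split below is arbitrary and only for illustration):

    /* Illustrative only: reserve part of the write buffer for GC I/O
     * once GC is running, so that neither user nor GC I/O can starve
     * the other.
     */
    static void example_rl_split(unsigned int rb_entries, int gc_active,
                                 unsigned int *user_max, unsigned int *gc_max)
    {
            if (!gc_active) {
                    *user_max = rb_entries;
                    *gc_max = 0;
                    return;
            }
            *gc_max = rb_entries / 2;
            *user_max = rb_entries - *gc_max;
    }
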
---
drivers/lightnvm/Kconfig | 16 +
drivers/lightnvm/Makefile | 5 +
drivers/lightnvm/pblk-cache.c | 179 ++++++++
drivers/lightnvm/pblk-core.c | 757 +++++++++++++++++++++++++++++++
drivers/lightnvm/pblk-gc.c | 620 ++++++++++++++++++++++++++
drivers/lightnvm/pblk-init.c | 769 ++++++++++++++++++++++++++++++++
drivers/lightnvm/pblk-map.c | 414 +++++++++++++++++
drivers/lightnvm/pblk-rb.c | 823 ++++++++++++++++++++++++++++++++++
drivers/lightnvm/pblk-read.c | 614 +++++++++++++++++++++++++
drivers/lightnvm/pblk-recovery.c | 792 ++++++++++++++++++++++++++++++++
drivers/lightnvm/pblk-rl.c | 262 +++++++++++
drivers/lightnvm/pblk-sysfs.c | 828 ++++++++++++++++++++++++++++++++++
drivers/lightnvm/pblk-write.c | 530 ++++++++++++++++++++++
drivers/lightnvm/pblk.h | 942 +++++++++++++++++++++++++++++++++++++++
14 files changed, 7551 insertions(+)
create mode 100644 drivers/lightnvm/pblk-cache.c
create mode 100644 drivers/lightnvm/pblk-core.c
create mode 100644 drivers/lightnvm/pblk-gc.c
create mode 100644 drivers/lightnvm/pblk-init.c
create mode 100644 drivers/lightnvm/pblk-map.c
create mode 100644 drivers/lightnvm/pblk-rb.c
create mode 100644 drivers/lightnvm/pblk-read.c
create mode 100644 drivers/lightnvm/pblk-recovery.c
create mode 100644 drivers/lightnvm/pblk-rl.c
create mode 100644 drivers/lightnvm/pblk-sysfs.c
create mode 100644 drivers/lightnvm/pblk-write.c
create mode 100644 drivers/lightnvm/pblk.h

diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
index 2f5d5f4..e2129ad 100644
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -42,4 +42,20 @@ config NVM_RRPC
host. The target is implemented using a linear mapping table and
cost-based garbage collection. It is optimized for 4K IO sizes.

+config NVM_PBLK
+ tristate "Physical Block Device Open-Channel SSD target"
+ ---help---
+ Allows an open-channel SSD to be exposed as a block device to the
+ host. The target assumes the device exposes raw flash that must be
+ explicitly managed by the host.
+
+config NVM_PBLK_NO_RECOV
+ bool "PBLK without L2P scan recovery"
+ depends on NVM_PBLK
+ ---help---
+ Disables scan recovery when instantiating a new pblk target. Scan
+ recovery happens when the L2P table cannot be recovered from snapshot or
+ the snapshot is corrupt. Scan recovery scans the blocks owned by the
+ target and reconstructs the L2P table.
+
endif # NVM
diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile
index a7a0a22..12f177e 100644
--- a/drivers/lightnvm/Makefile
+++ b/drivers/lightnvm/Makefile
@@ -5,3 +5,8 @@
obj-$(CONFIG_NVM) := core.o sysblk.o
obj-$(CONFIG_NVM_GENNVM) += gennvm.o
obj-$(CONFIG_NVM_RRPC) += rrpc.o
+obj-$(CONFIG_NVM_PBLK) += pblk.o
+pblk-y := pblk-init.o pblk-core.o pblk-rb.o \
+ pblk-write.o pblk-cache.o pblk-read.o \
+ pblk-gc.o pblk-recovery.o pblk-map.o \
+ pblk-rl.o pblk-sysfs.o
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
new file mode 100644
index 0000000..aabed04
--- /dev/null
+++ b/drivers/lightnvm/pblk-cache.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <jg@xxxxxxxxxxx>
+ * Matias Bjorling <m@xxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-cache.c - pblk's write cache
+ */
+
+#include "pblk.h"
+
+/*
+ * Copy data from the current bio to the write buffer. This is necessary to
+ * guarantee that (i) writes to the media are issued at the right granularity
+ * and (ii) that memory-specific constraints are respected (e.g., TLC memories
+ * need to write upper, medium and lower pages to guarantee that data has been
+ * persisted).
+ *
+ * This path is exclusively taken by user I/O.
+ *
+ * return: NVM_IO_DONE if the bio can be completed, NVM_IO_OK if completion is
+ * deferred (e.g., a flush that must wait for the data to hit the media), and
+ * NVM_IO_REQUEUE if the write buffer is full and the caller must retry.
+ */
+static int __pblk_write_to_cache(struct pblk *pblk, struct bio *bio,
+ unsigned long flags, unsigned int nr_entries)
+{
+ sector_t laddr = pblk_get_laddr(bio);
+ struct bio *ctx_bio = (bio->bi_opf & REQ_PREFLUSH) ? bio : NULL;
+ struct pblk_w_ctx w_ctx;
+ unsigned long bpos, pos;
+ unsigned int i;
+ int ret = (ctx_bio) ? NVM_IO_OK : NVM_IO_DONE;
+
+ /* Update the write buffer head (mem) with the entries that we can
+ * write. The write in itself cannot fail, so there is no need to
+ * rollback from here on.
+ */
+ if (!pblk_rb_may_write(&pblk->rwb, nr_entries, nr_entries, &bpos))
+ return NVM_IO_REQUEUE;
+
+ w_ctx.bio = ctx_bio;
+ w_ctx.flags = flags;
+ w_ctx.priv = NULL;
+ w_ctx.paddr = 0;
+ ppa_set_empty(&w_ctx.ppa.ppa);
+
+ for (i = 0; i < nr_entries; i++) {
+ void *data = bio_data(bio);
+ struct ppa_addr ppa;
+
+ w_ctx.lba = laddr + i;
+
+ pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + i);
+ pblk_rb_write_entry(&pblk->rwb, data, w_ctx, pos);
+ ppa = pblk_cacheline_to_ppa(pos);
+
+ pblk_update_map(pblk, w_ctx.lba, NULL, ppa);
+ bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_add(nr_entries, &pblk->inflight_writes);
+ atomic_add(nr_entries, &pblk->req_writes);
+#endif
+
+ return ret;
+}
+
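+/*
+ * Entry point for user writes. An empty PREFLUSH bio only sets a sync point in
+ * the write buffer and kicks the write thread; data bios are rate limited and
+ * copied into the write buffer.
+ */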
+int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
+{
+ int nr_secs = pblk_get_secs(bio);
+ int ret = NVM_IO_DONE;
+
+ if (bio->bi_opf & REQ_PREFLUSH) {
+#ifdef CONFIG_NVM_DEBUG
+ atomic_inc(&pblk->nr_flush);
+#endif
+ if (!bio_has_data(bio)) {
+ if (pblk_rb_sync_point_set(&pblk->rwb, bio))
+ ret = NVM_IO_OK;
+ pblk_write_kick(pblk);
+ goto out;
+ }
+ }
+
+ pblk_rl_user_in(pblk, nr_secs);
+
+retry:
+ ret = __pblk_write_to_cache(pblk, bio, flags, nr_secs);
+ if (ret == NVM_IO_REQUEUE) {
+ schedule();
+ goto retry;
+ }
+
+ spin_lock(&pblk->kick_lock);
+ pblk->write_cnt += nr_secs;
+ if (pblk->write_cnt > PBLK_KICK_SECTS) {
+ pblk->write_cnt -= PBLK_KICK_SECTS;
+ spin_unlock(&pblk->kick_lock);
+
+ pblk_write_kick(pblk);
+ } else
+ spin_unlock(&pblk->kick_lock);
+
+ if (bio->bi_opf & REQ_PREFLUSH)
+ pblk_write_kick(pblk);
+
+out:
+ return ret;
+}
+
+/*
+ * On GC the incoming lbas are not necessarily sequential. Also, some of the
+ * lbas might reside in the write cache.
+ */
+int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list,
+ struct pblk_kref_buf *ref_buf,
+ unsigned int nr_entries, unsigned int nr_rec_entries,
+ unsigned long flags, struct pblk_block *gc_rblk)
+{
+ struct pblk_w_ctx w_ctx;
+ unsigned long bpos, pos;
+ unsigned int valid_entries;
+ unsigned int i;
+
+retry:
+ /* Update the write buffer head (mem) with the entries that we can
+ * write. The write in itself cannot fail, so there is no need to
+ * rollback from here on.
+ */
+ if (!pblk_rb_may_write(&pblk->rwb, nr_entries, nr_rec_entries, &bpos)) {
+ schedule();
+ goto retry;
+ }
+
+ w_ctx.bio = NULL;
+ w_ctx.flags = flags | PBLK_IOTYPE_GC;
+ w_ctx.priv = ref_buf;
+ w_ctx.paddr = 0;
+ ppa_set_empty(&w_ctx.ppa.ppa);
+
+ for (i = 0, valid_entries = 0; i < nr_entries; i++) {
+ struct ppa_addr ppa;
+
+ if (lba_list[i] == ADDR_EMPTY)
+ continue;
+
+ w_ctx.lba = lba_list[i];
+
+#ifdef CONFIG_NVM_DEBUG
+ BUG_ON(!(flags & PBLK_IOTYPE_REF));
+#endif
+ kref_get(&ref_buf->ref);
+
+ pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries);
+ pblk_rb_write_entry(&pblk->rwb, data, w_ctx, pos);
+ ppa = pblk_cacheline_to_ppa(pos);
+
+ pblk_update_map_gc(pblk, w_ctx.lba, NULL, ppa, gc_rblk);
+
+ data += PBLK_EXPOSED_PAGE_SIZE;
+ valid_entries++;
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ BUG_ON(nr_rec_entries != valid_entries);
+ atomic_add(valid_entries, &pblk->inflight_writes);
+ atomic_add(valid_entries, &pblk->recov_gc_writes);
+#endif
+
+ return NVM_IO_OK;
+}
+
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
new file mode 100644
index 0000000..8111473
--- /dev/null
+++ b/drivers/lightnvm/pblk-core.c
@@ -0,0 +1,757 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <jg@xxxxxxxxxxx>
+ * Matias Bjorling <m@xxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-core.c - pblk's core functionality
+ *
+ * TODO:
+ * - Implement L2P snapshot on graceful tear down.
+ * - Separate mapping from actual stripping strategy to enable
+ * workload-specific optimizations
+ * - Implement support for new MLC & TLC chips
+ */
+
+#include "pblk.h"
+#include <linux/time.h>
+
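+/*
+ * Allocate a zeroed nvm_rq from the read or write request mempool, depending
+ * on the direction of the request.
+ */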
+struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw)
+{
+ mempool_t *pool;
+ struct nvm_rq *rqd;
+ int rq_size;
+
+ if (rw == WRITE) {
+ pool = pblk->w_rq_pool;
+ rq_size = pblk_w_rq_size;
+ } else {
+ pool = pblk->r_rq_pool;
+ rq_size = pblk_r_rq_size;
+ }
+
+ rqd = mempool_alloc(pool, GFP_KERNEL);
+ if (!rqd)
+ return ERR_PTR(-ENOMEM);
+
+ memset(rqd, 0, rq_size);
+ return rqd;
+}
+
+void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw)
+{
+ mempool_t *pool;
+
+ if (rw == WRITE)
+ pool = pblk->w_rq_pool;
+ else
+ pool = pblk->r_rq_pool;
+
+ mempool_free(rqd, pool);
+}
+
+void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd, int error)
+{
+ int offset = -1;
+ struct ppa_addr p;
+
+ if (rqd->nr_ppas == 1) {
+ p = dev_to_generic_addr(pblk->dev, rqd->ppa_addr);
+ print_ppa(&p, "rqd", error);
+ return;
+ }
+
+ while ((offset =
+ find_next_bit((void *)&rqd->ppa_status, rqd->nr_ppas,
+ offset + 1)) < rqd->nr_ppas) {
+ p = dev_to_generic_addr(pblk->dev, rqd->ppa_list[offset]);
+ print_ppa(&p, "rqd", error);
+ }
+
+ pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status);
+}
+
+void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
+ int nr_pages)
+{
+ struct bio_vec bv;
+ int i;
+
+ WARN_ON(off + nr_pages != bio->bi_vcnt);
+
+ bio_advance(bio, off * PBLK_EXPOSED_PAGE_SIZE);
+ for (i = off; i < nr_pages + off; i++) {
+ bv = bio->bi_io_vec[i];
+ mempool_free(bv.bv_page, pblk->page_pool);
+ }
+}
+
+/* This function must only be used on bios owned by pblk */
+int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
+ int nr_pages)
+{
+ struct request_queue *q = pblk->dev->q;
+ struct page *page;
+ int ret;
+ int i;
+
+ for (i = 0; i < nr_pages; i++) {
+ page = mempool_alloc(pblk->page_pool, flags);
+ if (!page)
+ goto err;
+
+ ret = bio_add_pc_page(q, bio, page,
+ PBLK_EXPOSED_PAGE_SIZE, 0);
+ if (ret != PBLK_EXPOSED_PAGE_SIZE) {
+ pr_err("pblk: could not add page to bio\n");
+ mempool_free(page, pblk->page_pool);
+ goto err;
+ }
+ }
+
+ return 0;
+err:
+ pblk_bio_free_pages(pblk, bio, 0, i);
+ return -1;
+}
+
+void pblk_end_sync_bio(struct bio *bio)
+{
+ struct completion *waiting = bio->bi_private;
+
+ complete(waiting);
+}
+
+void pblk_write_timer_fn(unsigned long data)
+{
+ struct pblk *pblk = (struct pblk *)data;
+
+ /* Kick user I/O rate limiter queue if waiting */
+ if (waitqueue_active(&pblk->wait))
+ wake_up_nr(&pblk->wait, 1);
+
+ /* kick the write thread every tick to flush outstanding data */
+ pblk_write_kick(pblk);
+
+ mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(1000));
+}
+
+void pblk_flush_writer(struct pblk *pblk)
+{
+ struct bio *bio;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ bio = bio_alloc(GFP_KERNEL, 1);
+ if (!bio)
+ return;
+
+ bio->bi_iter.bi_sector = 0; /* artificial bio */
+ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_PREFLUSH);
+ bio->bi_private = &wait;
+ bio->bi_end_io = pblk_end_sync_bio;
+
+ ret = pblk_write_to_cache(pblk, bio, 0);
+ if (ret == NVM_IO_OK)
+ wait_for_completion_io(&wait);
+ else if (ret != NVM_IO_DONE)
+ pr_err("pblk: tear down bio failed\n");
+
+ if (bio->bi_error)
+ pr_err("pblk: flush sync write failed (%u)\n", bio->bi_error);
+
+ bio_put(bio);
+}
+
+static void pblk_page_invalidate(struct pblk *pblk, struct pblk_addr *a)
+{
+ struct pblk_block *rblk = a->rblk;
+ u64 block_ppa;
+
+#ifdef CONFIG_NVM_DEBUG
+ BUG_ON(nvm_addr_in_cache(a->ppa));
+ BUG_ON(ppa_empty(a->ppa));
+#endif
+
+ block_ppa = pblk_gaddr_to_pg_offset(pblk->dev, a->ppa);
+
+ spin_lock(&rblk->lock);
+ WARN_ON(test_and_set_bit(block_ppa, rblk->invalid_bitmap));
+ rblk->nr_invalid_secs++;
+ spin_unlock(&rblk->lock);
+}
+
+static void pblk_invalidate_range(struct pblk *pblk, sector_t slba,
+ unsigned int nr_secs)
+{
+ sector_t i;
+
+ spin_lock(&pblk->trans_lock);
+ for (i = slba; i < slba + nr_secs; i++) {
+ struct pblk_addr *gp = &pblk->trans_map[i];
+
+ if (gp->rblk)
+ pblk_page_invalidate(pblk, gp);
+ ppa_set_empty(&gp->ppa);
+ gp->rblk = NULL;
+ }
+ spin_unlock(&pblk->trans_lock);
+}
+
+void pblk_discard(struct pblk *pblk, struct bio *bio)
+{
+ sector_t slba = bio->bi_iter.bi_sector / NR_PHY_IN_LOG;
+ sector_t nr_secs = bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE;
+
+ pblk_invalidate_range(pblk, slba, nr_secs);
+}
+
+struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba)
+{
+ struct pblk_addr *gp;
+ struct ppa_addr ppa;
+
+ spin_lock(&pblk->trans_lock);
+ gp = &pblk->trans_map[lba];
+ ppa = gp->ppa;
+ spin_unlock(&pblk->trans_lock);
+
+ return ppa;
+}
+
+static void pblk_init_rlpg(struct pblk *pblk, struct pblk_block *rblk,
+ struct pblk_blk_rec_lpg *rlpg)
+{
+ u64 *lbas = pblk_rlpg_to_llba(rlpg);
+ unsigned long *bitmaps;
+ int nr_entries = pblk->nr_blk_dsecs;
+
+ rblk->cur_sec = 0;
+ rblk->nr_invalid_secs = 0;
+ rblk->rlpg = rlpg;
+
+ bitmaps = (void *)(lbas + nr_entries);
+
+ rblk->sector_bitmap = bitmaps;
+ rblk->sync_bitmap = (rblk->sector_bitmap) + rlpg->bitmap_len;
+ rblk->invalid_bitmap = (rblk->sync_bitmap) + rlpg->bitmap_len;
+}
+
+struct pblk_blk_rec_lpg *pblk_alloc_blk_meta(struct pblk *pblk,
+ struct pblk_block *rblk,
+ u32 status)
+{
+ struct pblk_blk_rec_lpg *rlpg = NULL;
+ unsigned int rlpg_len, req_len, bitmap_len;
+
+ if (pblk_recov_calc_meta_len(pblk, &bitmap_len, &rlpg_len, &req_len))
+ goto out;
+
+ rlpg = mempool_alloc(pblk->blk_meta_pool, GFP_KERNEL);
+ if (!rlpg)
+ goto out;
+ memset(rlpg, 0, req_len);
+
+ rlpg->status = status;
+ rlpg->rlpg_len = rlpg_len;
+ rlpg->req_len = req_len;
+ rlpg->bitmap_len = bitmap_len;
+ rlpg->crc = 0;
+ rlpg->nr_lbas = 0;
+ rlpg->nr_padded = 0;
+
+ pblk_init_rlpg(pblk, rblk, rlpg);
+
+out:
+ return rlpg;
+}
+
+struct pblk_block *pblk_get_blk(struct pblk *pblk, struct pblk_lun *rlun)
+{
+ struct pblk_block *rblk;
+ struct pblk_blk_rec_lpg *rlpg;
+
+#ifdef CONFIG_NVM_DEBUG
+ lockdep_assert_held(&rlun->lock);
+#endif
+
+ if (list_empty(&rlun->free_list))
+ goto err;
+
+ /* Blocks are erased when put */
+ rblk = list_first_entry(&rlun->free_list, struct pblk_block, list);
+ rblk->state = NVM_BLK_ST_TGT;
+ pblk_rl_free_blks_dec(pblk, rlun);
+
+ list_move_tail(&rblk->list, &rlun->open_list);
+
+ rlpg = pblk_alloc_blk_meta(pblk, rblk, PBLK_BLK_ST_OPEN);
+ if (!rlpg)
+ goto fail_put_blk;
+
+ return rblk;
+
+fail_put_blk:
+ pblk_put_blk(pblk, rblk);
+err:
+ return NULL;
+}
+
+void pblk_set_lun_cur(struct pblk_lun *rlun, struct pblk_block *rblk)
+{
+#ifdef CONFIG_NVM_DEBUG
+ lockdep_assert_held(&rlun->lock);
+
+ if (rlun->cur) {
+ spin_lock(&rlun->cur->lock);
+ WARN_ON(!block_is_full(rlun->pblk, rlun->cur) &&
+ !block_is_bad(rblk));
+ spin_unlock(&rlun->cur->lock);
+ }
+#endif
+
+ rlun->cur = rblk;
+}
+
+void pblk_run_blk_ws(struct pblk *pblk, struct pblk_block *rblk,
+ void (*work)(struct work_struct *))
+{
+ struct pblk_block_ws *blk_ws;
+
+ blk_ws = mempool_alloc(pblk->blk_ws_pool, GFP_ATOMIC);
+ if (!blk_ws)
+ return;
+
+ blk_ws->pblk = pblk;
+ blk_ws->rblk = rblk;
+
+ INIT_WORK(&blk_ws->ws_blk, work);
+ queue_work(pblk->kgc_wq, &blk_ws->ws_blk);
+}
+
+void pblk_end_close_blk_bio(struct pblk *pblk, struct nvm_rq *rqd, int run_gc)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_ctx *ctx = pblk_set_ctx(pblk, rqd);
+ struct pblk_compl_close_ctx *c_ctx = ctx->c_ctx;
+
+ up(&c_ctx->rblk->rlun->wr_sem);
+
+ if (run_gc)
+ pblk_run_blk_ws(pblk, c_ctx->rblk, pblk_gc_queue);
+
+ nvm_free_rqd_ppalist(dev->parent, rqd);
+ bio_put(rqd->bio);
+ kfree(rqd);
+}
+
+static void pblk_end_w_pad(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_ctx *ctx)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_compl_ctx *c_ctx = ctx->c_ctx;
+
+#ifdef CONFIG_NVM_DEBUG
+ BUG_ON(c_ctx->nr_valid != 0);
+#endif
+
+ if (c_ctx->nr_padded > 1)
+ nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
+
+ bio_put(rqd->bio);
+ pblk_free_rqd(pblk, rqd, WRITE);
+}
+
+void pblk_end_io(struct nvm_rq *rqd)
+{
+ struct pblk *pblk = container_of(rqd->ins, struct pblk, instance);
+ uint8_t nr_secs = rqd->nr_ppas;
+
+ if (bio_data_dir(rqd->bio) == READ)
+ pblk_end_io_read(pblk, rqd, nr_secs);
+ else
+ pblk_end_io_write(pblk, rqd);
+}
+
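+/*
+ * Update the L2P entry for @laddr under the translation lock, invalidating the
+ * physical sector it previously pointed to (if any).
+ */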
+int pblk_update_map(struct pblk *pblk, sector_t laddr, struct pblk_block *rblk,
+ struct ppa_addr ppa)
+{
+ struct pblk_addr *gp;
+ int ret = 0;
+
+#ifdef CONFIG_NVM_DEBUG
+ BUG_ON(!rblk &&
+ pblk_rb_pos_oob(&pblk->rwb, nvm_addr_to_cacheline(ppa)));
+#endif
+
+ /* logic error: lba out-of-bounds */
+ BUG_ON(laddr >= pblk->rl.nr_secs);
+
+ spin_lock(&pblk->trans_lock);
+ gp = &pblk->trans_map[laddr];
+
+ if (gp->rblk)
+ pblk_page_invalidate(pblk, gp);
+
+ gp->ppa = ppa;
+ gp->rblk = rblk;
+
+ spin_unlock(&pblk->trans_lock);
+ return ret;
+}
+
+int pblk_update_map_gc(struct pblk *pblk, sector_t laddr,
+ struct pblk_block *rblk, struct ppa_addr ppa,
+ struct pblk_block *gc_rblk)
+{
+ struct pblk_addr *gp;
+ int ret = 0;
+
+ /* logic error: lba out-of-bounds */
+ BUG_ON(laddr >= pblk->rl.nr_secs);
+
+ spin_lock(&pblk->trans_lock);
+ gp = &pblk->trans_map[laddr];
+
+ /* Prevent updated entries from being overwritten by GC */
+ if (gp->rblk && gc_rblk->id != gp->rblk->id)
+ goto out;
+
+ gp->ppa = ppa;
+ gp->rblk = rblk;
+
+out:
+ spin_unlock(&pblk->trans_lock);
+ return ret;
+}
+
+static int pblk_setup_pad_rq(struct pblk *pblk, struct pblk_block *rblk,
+ struct nvm_rq *rqd, struct pblk_ctx *ctx)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_compl_ctx *c_ctx = ctx->c_ctx;
+ unsigned int valid_secs = c_ctx->nr_valid;
+ unsigned int padded_secs = c_ctx->nr_padded;
+ unsigned int nr_secs = valid_secs + padded_secs;
+ struct pblk_sec_meta *meta;
+ int min = pblk->min_write_pgs;
+ int i;
+ int ret = 0;
+
+ ret = pblk_write_alloc_rq(pblk, rqd, ctx, nr_secs);
+ if (ret)
+ goto out;
+
+ meta = rqd->meta_list;
+
+ if (unlikely(nr_secs == 1)) {
+ /*
+ * Single sector path - this path is highly improbable since
+ * controllers typically deal with multi-sector and multi-plane
+ * pages. This path is, however, useful for testing on QEMU
+ */
+#ifdef CONFIG_NVM_DEBUG
+ BUG_ON(geo->sec_per_pl != 1);
+ BUG_ON(padded_secs != 0);
+#endif
+
+ ret = pblk_map_page(pblk, rblk, c_ctx->sentry, &rqd->ppa_addr,
+ &meta[0], 1, 0);
+ if (ret) {
+ /* There are no more available pages to map the current
+ * request. The rate limiter has probably failed.
+ */
+ BUG_ON(1);
+ }
+
+ goto out;
+ }
+
+ for (i = 0; i < nr_secs; i += min) {
+ ret = pblk_map_page(pblk, rblk, c_ctx->sentry + i,
+ &rqd->ppa_list[i],
+ &meta[i], min, 0);
+
+ if (ret) {
+ /* There are no more available pages to map the current
+ * request. The rate limiter has probably failed.
+ */
+ BUG_ON(1);
+ }
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ if (pblk_boundary_checks(dev, rqd->ppa_list, rqd->nr_ppas))
+ WARN_ON(1);
+#endif
+
+out:
+ return ret;
+}
+
+static void pblk_pad_blk(struct pblk *pblk, struct pblk_block *rblk,
+ int nr_free_secs)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct bio *bio;
+ struct nvm_rq *rqd;
+ struct pblk_ctx *ctx;
+ struct pblk_compl_ctx *c_ctx;
+ void *pad_data;
+ unsigned int bio_len;
+ int nr_secs, err;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ pad_data = kzalloc(pblk->max_write_pgs * geo->sec_size, GFP_KERNEL);
+ if (!pad_data)
+ return;
+
+ do {
+ nr_secs = (nr_free_secs > pblk->max_write_pgs) ?
+ pblk->max_write_pgs : nr_free_secs;
+
+ rqd = pblk_alloc_rqd(pblk, WRITE);
+ if (IS_ERR(rqd)) {
+ pr_err("pblk: could not alloc write req.\n ");
+ goto free_pad_data;
+ }
+ ctx = pblk_set_ctx(pblk, rqd);
+ c_ctx = ctx->c_ctx;
+
+ bio_len = nr_secs * geo->sec_size;
+ bio = bio_map_kern(dev->q, pad_data, bio_len, GFP_KERNEL);
+ if (IS_ERR(bio)) {
+ pr_err("pblk: could not alloc tear down bio\n");
+ goto free_rqd;
+ }
+
+ bio->bi_iter.bi_sector = 0; /* artificial bio */
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ bio->bi_private = &wait;
+ bio->bi_end_io = pblk_end_sync_bio;
+ rqd->bio = bio;
+
+ ctx->flags = PBLK_IOTYPE_SYNC;
+ c_ctx->sentry = 0;
+ c_ctx->nr_valid = 0;
+ c_ctx->nr_padded = nr_secs;
+
+ if (pblk_setup_pad_rq(pblk, rblk, rqd, ctx)) {
+ pr_err("pblk: could not setup tear down req.\n");
+ goto free_bio;
+ }
+
+ err = nvm_submit_io(dev, rqd);
+ if (err) {
+ pr_err("pblk: I/O submission failed: %d\n", err);
+ goto free_bio;
+ }
+ wait_for_completion_io(&wait);
+ pblk_end_w_pad(pblk, rqd, ctx);
+
+ nr_free_secs -= nr_secs;
+ } while (nr_free_secs > 0);
+
+ kfree(pad_data);
+ return;
+
+free_bio:
+ bio_put(bio);
+free_rqd:
+ pblk_free_rqd(pblk, rqd, WRITE);
+free_pad_data:
+ kfree(pad_data);
+}
+
+static inline u64 pblk_nr_free_secs(struct pblk *pblk, struct pblk_block *rblk)
+{
+ u64 free_secs = pblk->nr_blk_dsecs;
+
+ spin_lock(&rblk->lock);
+ free_secs -= bitmap_weight(rblk->sector_bitmap, pblk->nr_blk_dsecs);
+ spin_unlock(&rblk->lock);
+
+ return free_secs;
+}
+
+static void pblk_free_blk_meta(struct pblk *pblk, struct pblk_block *rblk)
+{
+ /* All bitmaps are allocated together with the rlpg structure */
+ mempool_free(rblk->rlpg, pblk->blk_meta_pool);
+}
+
+unsigned long pblk_nr_free_blks(struct pblk *pblk)
+{
+ int i;
+ unsigned int avail = 0;
+ struct pblk_lun *rlun;
+
+ for (i = 0; i < pblk->nr_luns; i++) {
+ rlun = &pblk->luns[i];
+ spin_lock(&rlun->lock);
+ avail += rlun->nr_free_blocks;
+ spin_unlock(&rlun->lock);
+ }
+
+ return avail;
+}
+
+/*
+ * TODO: For now, we pad the whole block. In the future, pad only the pages
+ * needed to guarantee that future reads will succeed, and delegate bringing
+ * up the block for writing to the bring-up recovery. Basically, this means
+ * implementing l2p snapshot and in case of power failure, if a block belongs
+ * to a target and it is not closed, scan the OOB area for each page to
+ * recover the state of the block. There should only be NUM_LUNS active blocks
+ * at any moment in time.
+ */
+void pblk_pad_open_blks(struct pblk *pblk)
+{
+ struct pblk_lun *rlun;
+ struct pblk_block *rblk, *trblk;
+ unsigned int i, mod;
+ int nr_free_secs;
+ LIST_HEAD(open_list);
+
+ pblk_for_each_lun(pblk, rlun, i) {
+ spin_lock(&rlun->lock);
+ list_cut_position(&open_list, &rlun->open_list,
+ rlun->open_list.prev);
+ spin_unlock(&rlun->lock);
+
+ list_for_each_entry_safe(rblk, trblk, &open_list, list) {
+ nr_free_secs = pblk_nr_free_secs(pblk, rblk);
+ div_u64_rem(nr_free_secs, pblk->min_write_pgs, &mod);
+ if (mod) {
+ pr_err("pblk: corrupted block\n");
+ continue;
+ }
+
+ /* empty block - no need for padding */
+ if (nr_free_secs == pblk->nr_blk_dsecs) {
+ pblk_put_blk(pblk, rblk);
+ continue;
+ }
+
+ pr_debug("pblk: padding %d sectors in blk:%d\n",
+ nr_free_secs, rblk->id);
+
+ pblk_pad_blk(pblk, rblk, nr_free_secs);
+ }
+
+ spin_lock(&rlun->lock);
+ list_splice(&open_list, &rlun->open_list);
+ spin_unlock(&rlun->lock);
+ }
+
+ /* Wait until padding completes and blocks are closed */
+ pblk_for_each_lun(pblk, rlun, i) {
+retry:
+ spin_lock(&rlun->lock);
+ if (!list_empty(&rlun->open_list)) {
+ spin_unlock(&rlun->lock);
+ io_schedule();
+ goto retry;
+ }
+ spin_unlock(&rlun->lock);
+ }
+}
+
+void pblk_free_blks(struct pblk *pblk)
+{
+ struct pblk_lun *rlun;
+ struct pblk_block *rblk, *trblk;
+ unsigned int i;
+
+ pblk_for_each_lun(pblk, rlun, i) {
+ spin_lock(&rlun->lock);
+ list_for_each_entry_safe(rblk, trblk, &rlun->prio_list, prio) {
+ pblk_free_blk_meta(pblk, rblk);
+ list_del(&rblk->prio);
+ }
+ spin_unlock(&rlun->lock);
+ }
+}
+
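+/*
+ * Return a block to its LUN: target blocks go back to the free list, grown bad
+ * blocks to the bad block list. The per-block metadata is freed in both cases.
+ */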
+void pblk_put_blk(struct pblk *pblk, struct pblk_block *rblk)
+{
+ struct pblk_lun *rlun = rblk->rlun;
+
+ spin_lock(&rlun->lock);
+ if (rblk->state & NVM_BLK_ST_TGT) {
+ list_move_tail(&rblk->list, &rlun->free_list);
+ pblk_rl_free_blks_inc(pblk, rlun);
+ rblk->state = NVM_BLK_ST_FREE;
+ } else if (rblk->state & NVM_BLK_ST_BAD) {
+ list_move_tail(&rblk->list, &rlun->bb_list);
+ rblk->state = NVM_BLK_ST_BAD;
+ } else {
+ WARN_ON_ONCE(1);
+ pr_err("pblk: erroneous block type (%d-> %u)\n",
+ rblk->id, rblk->state);
+ list_move_tail(&rblk->list, &rlun->bb_list);
+ }
+ spin_unlock(&rlun->lock);
+
+ pblk_free_blk_meta(pblk, rblk);
+}
+
+/* TODO: No need to scan if LUNs are balanced */
+static struct pblk_lun *pblk_ppa_to_lun(struct pblk *pblk, struct ppa_addr p)
+{
+ struct pblk_lun *rlun = NULL;
+ int i;
+
+ for (i = 0; i < pblk->nr_luns; i++) {
+ if (pblk->luns[i].bppa.g.ch == p.g.ch &&
+ pblk->luns[i].bppa.g.lun == p.g.lun) {
+ rlun = &pblk->luns[i];
+ break;
+ }
+ }
+
+ return rlun;
+}
+
+void pblk_mark_bb(struct pblk *pblk, struct ppa_addr ppa)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_lun *rlun;
+ struct pblk_block *rblk;
+
+ rlun = pblk_ppa_to_lun(pblk, ppa);
+ rblk = &rlun->blocks[ppa.g.blk];
+ rblk->state = NVM_BLK_ST_BAD;
+
+ nvm_set_bb_tbl(dev->parent, &ppa, 1, NVM_BLK_T_GRWN_BAD);
+}
+
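+/* Erase a block before reuse; a failed erase marks the block as grown bad */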
+void pblk_erase_blk(struct pblk *pblk, struct pblk_block *rblk)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_lun *rlun = rblk->rlun;
+ int flags = pblk_set_progr_mode(pblk, ERASE);
+ struct ppa_addr ppa = pblk_blk_ppa_to_gaddr(pblk->dev, rblk, 0);
+ int error;
+
+ down(&rlun->wr_sem);
+ error = nvm_erase_blk(dev, &ppa, flags);
+ up(&rlun->wr_sem);
+
+ if (error) {
+ pblk_mark_bb(pblk, ppa);
+ inc_stat(pblk, &pblk->erase_failed, 0);
+ print_ppa(&ppa, "erase", 0);
+ }
+}
+
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
new file mode 100644
index 0000000..5ab208f
--- /dev/null
+++ b/drivers/lightnvm/pblk-gc.c
@@ -0,0 +1,620 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <jg@xxxxxxxxxxx>
+ * Matias Bjorling <m@xxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-gc.c - pblk's garbage collector
+ */
+
+#include "pblk.h"
+
+static void pblk_free_gc_rqd(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ uint8_t nr_secs = rqd->nr_ppas;
+
+ if (nr_secs > 1)
+ nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
+
+ if (rqd->meta_list)
+ nvm_dev_dma_free(dev->parent, rqd->meta_list,
+ rqd->dma_meta_list);
+
+ pblk_free_rqd(pblk, rqd, READ);
+}
+
+static void pblk_gc_setup_rq(struct pblk *pblk, struct pblk_block *rblk,
+ u64 *lba_list, unsigned int secs_to_gc, int off,
+ unsigned int *ignored)
+{
+ u64 lba;
+ int i;
+
+ /* Discard invalid addresses for current GC I/O */
+ for (i = 0; i < secs_to_gc; i++) {
+ lba = lba_list[i + off];
+
+ /* Omit padded entries on GC */
+ if (lba == ADDR_EMPTY) {
+ (*ignored)++;
+ continue;
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ BUG_ON(!(lba >= 0 && lba < pblk->rl.nr_secs));
+#endif
+ }
+}
+
+static int pblk_gc_read_victim_blk(struct pblk *pblk, u64 *lba_list,
+ void *data, unsigned int data_len,
+ unsigned int secs_to_gc,
+ unsigned int secs_in_disk, int off)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct request_queue *q = dev->q;
+ struct bio *bio;
+ struct nvm_rq *rqd;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ bio = bio_map_kern(q, data, data_len, GFP_KERNEL);
+ if (IS_ERR(bio)) {
+ pr_err("pblk: could not allocate GC bio\n");
+ goto fail;
+ }
+
+ bio->bi_iter.bi_sector = 0; /* artificial bio */
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio->bi_private = &wait;
+ bio->bi_end_io = pblk_end_sync_bio;
+
+ rqd = pblk_alloc_rqd(pblk, READ);
+ if (IS_ERR(rqd)) {
+ pr_err("pblk: could not allocate GC request\n");
+ goto fail_free_bio;
+ }
+
+ ret = pblk_submit_read_gc(pblk, bio, rqd, &lba_list[off],
+ secs_to_gc, secs_in_disk,
+ PBLK_IOTYPE_SYNC);
+ if (ret == NVM_IO_ERR) {
+ pr_err("pblk: GC read request failed: (%d)\n", ret);
+ goto fail_free_rqd;
+ }
+
+ wait_for_completion_io(&wait);
+ pblk_free_gc_rqd(pblk, rqd);
+
+ if (bio->bi_error) {
+ inc_stat(pblk, &pblk->read_failed_gc, 0);
+#ifdef CONFIG_NVM_DEBUG
+ pblk_print_failed_rqd(pblk, rqd, bio->bi_error);
+#endif
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_add(secs_to_gc, &pblk->sync_reads);
+ atomic_sub(secs_to_gc, &pblk->inflight_reads);
+#endif
+
+ bio_put(bio);
+
+ return NVM_IO_OK;
+
+fail_free_rqd:
+ pblk_free_gc_rqd(pblk, rqd);
+fail_free_bio:
+ bio_put(bio);
+fail:
+ return NVM_IO_ERR;
+}
+
+static void pblk_gc_kick(struct pblk *pblk)
+{
+ queue_work(pblk->krqd_wq, &pblk->ws_gc);
+}
+
+static int pblk_gc_write_to_buffer(struct pblk *pblk, u64 *lba_list,
+ void *data, struct pblk_kref_buf *ref_buf,
+ unsigned int data_len,
+ unsigned int secs_to_gc,
+ unsigned int secs_in_disk, int off,
+ struct pblk_block *gc_rblk)
+{
+ pblk_rl_gc_in(pblk, secs_to_gc);
+
+ pblk_write_gc_to_cache(pblk, data, &lba_list[off], ref_buf,
+ secs_to_gc, secs_in_disk,
+ PBLK_IOTYPE_REF, gc_rblk);
+
+ spin_lock(&pblk->kick_lock);
+ pblk->write_cnt += secs_to_gc;
+ if (pblk->write_cnt > PBLK_KICK_SECTS) {
+ pblk->write_cnt -= PBLK_KICK_SECTS;
+ spin_unlock(&pblk->kick_lock);
+
+ pblk_write_kick(pblk);
+ } else
+ spin_unlock(&pblk->kick_lock);
+
+ return NVM_IO_OK;
+}
+
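+/*
+ * Move the valid sectors of a GC victim block: read them from the device and
+ * re-inject them through the write buffer as GC I/O.
+ */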
+int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_block *rblk,
+ u64 *lba_list, unsigned int nr_entries)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_kref_buf *ref_buf;
+ void *data;
+ unsigned int data_len;
+ unsigned int alloc_entries, secs_to_gc, secs_in_disk;
+ unsigned int read_left, ignored;
+ int max = pblk->max_write_pgs;
+ int off;
+ int moved = 0;
+
+ if (nr_entries == 0)
+ return 0;
+
+ alloc_entries = (nr_entries > max) ? max : nr_entries;
+ data = kmalloc(alloc_entries * geo->sec_size, GFP_KERNEL);
+ if (!data)
+ goto out;
+
+ ref_buf = kmalloc(sizeof(struct pblk_kref_buf), GFP_KERNEL);
+ if (!ref_buf)
+ goto fail_free_data;
+
+ kref_init(&ref_buf->ref);
+ ref_buf->data = data;
+
+ off = 0;
+ read_left = nr_entries;
+ do {
+ secs_to_gc = (read_left > max) ? max : read_left;
+ ignored = 0;
+
+ pblk_gc_setup_rq(pblk, rblk, lba_list, secs_to_gc, off,
+ &ignored);
+
+ if (ignored == secs_to_gc)
+ goto next;
+
+ secs_in_disk = secs_to_gc - ignored;
+ data_len = secs_in_disk * geo->sec_size;
+
+ /* Read from GC victim block */
+ if (pblk_gc_read_victim_blk(pblk, lba_list, data, data_len,
+ secs_to_gc, secs_in_disk, off))
+ goto fail_free_krefbuf;
+
+ /* Write to buffer */
+ if (pblk_gc_write_to_buffer(pblk, lba_list, data, ref_buf,
+ data_len, secs_to_gc,
+ secs_in_disk, off, rblk))
+ goto fail_free_krefbuf;
+
+next:
+ read_left -= secs_to_gc;
+ off += secs_to_gc;
+ moved += secs_to_gc;
+ } while (read_left > 0);
+
+ kref_put(&ref_buf->ref, pblk_free_ref_mem);
+
+ return moved;
+
+fail_free_krefbuf:
+ kfree(ref_buf);
+fail_free_data:
+ kfree(data);
+out:
+ return moved;
+}
+
+void pblk_gc_queue(struct work_struct *work)
+{
+ struct pblk_block_ws *blk_ws = container_of(work, struct pblk_block_ws,
+ ws_blk);
+ struct pblk *pblk = blk_ws->pblk;
+ struct pblk_block *rblk = blk_ws->rblk;
+ struct pblk_lun *rlun = rblk->rlun;
+
+ spin_lock(&rlun->lock);
+ list_move_tail(&rblk->list, &rlun->closed_list);
+ list_add_tail(&rblk->prio, &rlun->prio_list);
+ spin_unlock(&rlun->lock);
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_sub(PBLK_RECOVERY_SECTORS, &pblk->inflight_meta);
+ atomic_add(PBLK_RECOVERY_SECTORS, &pblk->compl_meta);
+#endif
+
+ mempool_free(blk_ws, pblk->blk_ws_pool);
+ pr_debug("nvm: block '%d' is full, allow GC (sched)\n",
+ rblk->id);
+}
+
+/* The block with the highest number of invalid pages will be at the beginning
+ * of the list.
+ */
+static struct pblk_block *rblock_max_invalid(struct pblk_block *ra,
+ struct pblk_block *rb)
+{
+ if (ra->nr_invalid_secs == rb->nr_invalid_secs)
+ return ra;
+
+ return (ra->nr_invalid_secs < rb->nr_invalid_secs) ? rb : ra;
+}
+
+/* Linearly find the block with the highest number of invalid pages.
+ * Requires lun->lock to be held.
+ */
+static struct pblk_block *block_prio_find_max(struct pblk_lun *rlun)
+{
+ struct list_head *prio_list = &rlun->prio_list;
+ struct pblk_block *rblk, *max;
+
+ /* logic error */
+ BUG_ON(list_empty(prio_list));
+
+ max = list_first_entry(prio_list, struct pblk_block, prio);
+ list_for_each_entry(rblk, prio_list, prio)
+ max = rblock_max_invalid(max, rblk);
+
+ return max;
+}
+
+static void pblk_block_gc(struct work_struct *work)
+{
+ struct pblk_block_ws *blk_ws = container_of(work, struct pblk_block_ws,
+ ws_blk);
+ struct pblk *pblk = blk_ws->pblk;
+ struct pblk_block *rblk = blk_ws->rblk;
+ struct pblk_lun *rlun = rblk->rlun;
+ unsigned int page_size = pblk_recov_page_size(pblk);
+ void *recov_page;
+ u64 *lba_list;
+ u64 gc_lba_list[PBLK_MAX_REQ_ADDRS];
+ unsigned long *invalid_bitmap;
+ int moved, total_moved = 0;
+ int nr_invalid_secs;
+ int nr_valid_secs;
+ int bit;
+ int nr_ppas;
+
+ invalid_bitmap = kmalloc(BITS_TO_LONGS(pblk->nr_blk_dsecs) *
+ sizeof(unsigned long), GFP_KERNEL);
+ if (!invalid_bitmap)
+ return;
+
+ spin_lock(&rblk->lock);
+ nr_invalid_secs = rblk->nr_invalid_secs;
+ nr_valid_secs = pblk->nr_blk_dsecs - rblk->nr_invalid_secs;
+ bitmap_copy(invalid_bitmap, rblk->invalid_bitmap, pblk->nr_blk_dsecs);
+ spin_unlock(&rblk->lock);
+
+#ifdef CONFIG_NVM_DEBUG
+ BUG_ON(nr_valid_secs !=
+ pblk->nr_blk_dsecs - bitmap_weight(invalid_bitmap, pblk->nr_blk_dsecs));
+#endif
+
+ mempool_free(blk_ws, pblk->blk_ws_pool);
+ pr_debug("pblk: block '%d' being reclaimed\n", rblk->id);
+
+ recov_page = kzalloc(page_size, GFP_KERNEL);
+ if (!recov_page)
+ goto put_back;
+
+ if (pblk_recov_read(pblk, rblk, recov_page)) {
+ pr_err("pblk: could not recover last page. Blk:%d\n",
+ rblk->id);
+ goto free_recov_page;
+ }
+
+ lba_list = pblk_recov_get_lba_list(pblk, recov_page);
+ if (!lba_list) {
+ pr_err("pblk: Could not interpret recover page. Blk:%d\n",
+ rblk->id);
+ goto free_recov_page;
+ }
+
+ bit = -1;
+next_lba_list:
+ nr_ppas = 0;
+ do {
+ bit = find_next_zero_bit(invalid_bitmap,
+ pblk->nr_blk_dsecs, bit + 1);
+ if (bit >= pblk->nr_blk_dsecs)
+ goto prepare_ppas;
+
+ gc_lba_list[nr_ppas] = lba_list[bit];
+
+ nr_ppas++;
+ } while (nr_ppas < PBLK_MAX_REQ_ADDRS);
+
+prepare_ppas:
+ moved = pblk_gc_move_valid_secs(pblk, rblk, gc_lba_list, nr_ppas);
+ if (moved != nr_ppas) {
+ pr_err("pblk: could not GC all sectors:blk:%d, GC:%d/%d/%d\n",
+ rblk->id,
+ moved, nr_ppas,
+ nr_valid_secs);
+ goto put_back;
+ }
+
+ total_moved += moved;
+ if (total_moved < nr_valid_secs)
+ goto next_lba_list;
+
+#ifdef CONFIG_NVM_DEBUG
+ BUG_ON(pblk->nr_blk_dsecs -
+ bitmap_weight(invalid_bitmap, pblk->nr_blk_dsecs) !=
+ total_moved);
+#endif
+
+ /* Blocks are erased before being returned to the media manager */
+ pblk_erase_blk(pblk, rblk);
+ pblk_put_blk(pblk, rblk);
+
+ kfree(invalid_bitmap);
+ kfree(recov_page);
+ return;
+
+free_recov_page:
+ kfree(recov_page);
+put_back:
+ spin_lock(&rlun->lock);
+ list_add_tail(&rblk->prio, &rlun->prio_list);
+ spin_unlock(&rlun->lock);
+
+ kfree(invalid_bitmap);
+}
+
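+/*
+ * GC a LUN when its number of free blocks falls below the rate-limiter
+ * threshold (or when GC is forced). Victims are taken from the priority list,
+ * most invalid sectors first, and reclaimed asynchronously.
+ */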
+static void pblk_lun_gc(struct pblk *pblk, struct pblk_lun *rlun)
+{
+ struct pblk_gc *gc = &pblk->gc;
+ struct pblk_block_ws *blk_ws;
+ struct pblk_block *rblk, *trblk;
+ unsigned int nr_free_blocks, nr_blocks_need;
+ int run_gc;
+ LIST_HEAD(gc_list);
+
+ nr_blocks_need = pblk_rl_gc_thrs(pblk);
+
+ if (nr_blocks_need < pblk->nr_luns)
+ nr_blocks_need = pblk->nr_luns;
+
+ spin_lock(&rlun->lock);
+ nr_free_blocks = rlun->nr_free_blocks;
+
+ run_gc = (nr_blocks_need > nr_free_blocks || gc->gc_forced);
+ while (run_gc && !list_empty(&rlun->prio_list)) {
+ rblk = block_prio_find_max(rlun);
+ if (!rblk->nr_invalid_secs)
+ goto start_gc;
+
+ nr_free_blocks++;
+ list_move_tail(&rblk->prio, &gc_list);
+
+ run_gc = (nr_blocks_need > nr_free_blocks || gc->gc_forced);
+ }
+
+start_gc:
+ spin_unlock(&rlun->lock);
+
+ list_for_each_entry_safe(rblk, trblk, &gc_list, prio) {
+ blk_ws = mempool_alloc(pblk->blk_ws_pool, GFP_ATOMIC);
+ if (!blk_ws)
+ break;
+
+ list_del_init(&rblk->prio);
+
+ /* logic error */
+ BUG_ON(!block_is_full(pblk, rblk));
+
+ blk_ws->pblk = pblk;
+ blk_ws->rblk = rblk;
+
+ INIT_WORK(&blk_ws->ws_blk, pblk_block_gc);
+ queue_work(pblk->kgc_wq, &blk_ws->ws_blk);
+
+ nr_blocks_need--;
+ }
+
+ if (unlikely(!list_empty(&rlun->g_bb_list)))
+ pblk_recov_clean_g_bb_list(pblk, rlun);
+}
+
+void pblk_gc(struct work_struct *work)
+{
+ struct pblk *pblk = container_of(work, struct pblk, ws_gc);
+ struct pblk_lun *rlun;
+ int i;
+
+ pblk_for_each_lun(pblk, rlun, i)
+ pblk_lun_gc(pblk, rlun);
+}
+
+/*
+ * timed GC every interval.
+ */
+static void pblk_gc_timer(unsigned long data)
+{
+ struct pblk *pblk = (struct pblk *)data;
+
+ pblk_gc_kick(pblk);
+ mod_timer(&pblk->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS));
+}
+
+static void pblk_gc_start(struct pblk *pblk)
+{
+ setup_timer(&pblk->gc_timer, pblk_gc_timer, (unsigned long)pblk);
+ mod_timer(&pblk->gc_timer, jiffies + msecs_to_jiffies(5000));
+
+ pblk->gc.gc_active = 1;
+
+ pr_debug("pblk: gc running\n");
+}
+
+/*
+ * If flush_wq == 1 then no lock should be held by the caller since
+ * flush_workqueue can sleep
+ */
+static void pblk_gc_stop(struct pblk *pblk, int flush_wq)
+{
+ del_timer(&pblk->gc_timer);
+
+ if (flush_wq)
+ flush_workqueue(pblk->kgc_wq);
+
+ spin_lock(&pblk->gc.lock);
+ pblk->gc.gc_active = 0;
+ spin_unlock(&pblk->gc.lock);
+
+ pr_debug("pblk: gc paused\n");
+}
+
+int pblk_gc_status(struct pblk *pblk)
+{
+ struct pblk_gc *gc = &pblk->gc;
+ int ret;
+
+ spin_lock(&gc->lock);
+ ret = gc->gc_active;
+ spin_unlock(&gc->lock);
+
+ return ret;
+}
+
+static void __pblk_gc_should_start(struct pblk *pblk)
+{
+ struct pblk_gc *gc = &pblk->gc;
+
+#ifdef CONFIG_NVM_DEBUG
+ lockdep_assert_held(&gc->lock);
+#endif
+
+ if (gc->gc_enabled && !gc->gc_active)
+ pblk_gc_start(pblk);
+}
+
+void pblk_gc_should_start(struct pblk *pblk)
+{
+ struct pblk_gc *gc = &pblk->gc;
+
+ spin_lock(&gc->lock);
+ __pblk_gc_should_start(pblk);
+ spin_unlock(&gc->lock);
+}
+
+void pblk_gc_should_stop(struct pblk *pblk)
+{
+ struct pblk_gc *gc = &pblk->gc;
+
+ if (gc->gc_active && !gc->gc_forced)
+ pblk_gc_stop(pblk, 0);
+}
+
+int pblk_gc_init(struct pblk *pblk)
+{
+ pblk->krqd_wq = alloc_workqueue("pblk-lun", WQ_MEM_RECLAIM | WQ_UNBOUND,
+ pblk->nr_luns);
+ if (!pblk->krqd_wq)
+ return -ENOMEM;
+
+ pblk->kgc_wq = alloc_workqueue("pblk-bg", WQ_MEM_RECLAIM, 1);
+ if (!pblk->kgc_wq)
+ goto fail_destroy_krqd_wq;
+
+ pblk->gc.gc_active = 0;
+ pblk->gc.gc_forced = 0;
+ pblk->gc.gc_enabled = 1;
+
+ spin_lock_init(&pblk->gc.lock);
+
+ return 0;
+
+fail_destroy_krqd_wq:
+ destroy_workqueue(pblk->krqd_wq);
+ return -ENOMEM;
+}
+
+void pblk_gc_exit(struct pblk *pblk)
+{
+ pblk_gc_stop(pblk, 1);
+
+ if (pblk->krqd_wq)
+ destroy_workqueue(pblk->krqd_wq);
+
+ if (pblk->kgc_wq)
+ destroy_workqueue(pblk->kgc_wq);
+}
+
+void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
+ int *gc_active)
+{
+ struct pblk_gc *gc = &pblk->gc;
+
+ spin_lock(&gc->lock);
+ *gc_enabled = gc->gc_enabled;
+ *gc_active = gc->gc_active;
+ spin_unlock(&gc->lock);
+}
+
+int pblk_gc_sysfs_force(struct pblk *pblk, int value)
+{
+ struct pblk_gc *gc = &pblk->gc;
+ int rsv = 0;
+
+ if (value != 0 && value != 1)
+ return -EINVAL;
+
+ spin_lock(&gc->lock);
+ if (value == 1) {
+ gc->gc_enabled = 1;
+ rsv = 64;
+ }
+ pblk_rl_set_gc_rsc(pblk, rsv);
+ gc->gc_forced = value;
+ __pblk_gc_should_start(pblk);
+ spin_unlock(&gc->lock);
+
+ return 0;
+}
+
+int pblk_gc_sysfs_enable(struct pblk *pblk, int value)
+{
+ struct pblk_gc *gc = &pblk->gc;
+ int ret = 0;
+
+ if (value == 0) {
+ spin_lock(&gc->lock);
+ gc->gc_enabled = value;
+ spin_unlock(&gc->lock);
+ if (gc->gc_active)
+ pblk_gc_stop(pblk, 0);
+ } else if (value == 1) {
+ spin_lock(&gc->lock);
+ gc->gc_enabled = value;
+ if (!gc->gc_active)
+ pblk_gc_start(pblk);
+ spin_unlock(&gc->lock);
+ } else {
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
new file mode 100644
index 0000000..5de3d32
--- /dev/null
+++ b/drivers/lightnvm/pblk-init.c
@@ -0,0 +1,769 @@
+/*
+ * Copyright (C) 2015 IT University of Copenhagen (rrpc.c)
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <jg@xxxxxxxxxxx>
+ * Matias Bjorling <m@xxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a physical block-device target for Open-channel SSDs.
+ *
+ * pblk-init.c - pblk's initialization. Derived from rrpc.c
+ */
+
+#include "pblk.h"
+
+static struct kmem_cache *pblk_blk_ws_cache, *pblk_rec_cache, *pblk_r_rq_cache,
+ *pblk_w_rq_cache, *pblk_blk_meta_cache;
+static DECLARE_RWSEM(pblk_lock);
+
+static const struct block_device_operations pblk_fops = {
+ .owner = THIS_MODULE,
+};
+
+static int pblk_submit_io_checks(struct pblk *pblk, struct bio *bio)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ int bio_size = bio_sectors(bio) << 9;
+ int is_flush = (bio->bi_opf & REQ_PREFLUSH);
+
+ if ((bio_size < geo->sec_size) && (!is_flush))
+ return 1;
+
+ return 0;
+}
+
+static int pblk_submit_io(struct request_queue *q, struct pblk *pblk,
+ struct bio *bio, unsigned long flags)
+{
+ int ret;
+
+ if (pblk_submit_io_checks(pblk, bio))
+ return NVM_IO_ERR;
+
+ /* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap
+ * constraint. Writes can be of arbitrary size.
+ */
+ if (bio_data_dir(bio) == READ) {
+ blk_queue_split(q, &bio, q->bio_split);
+ ret = pblk_submit_read(pblk, bio, flags);
+ if (ret == NVM_IO_DONE && bio_flagged(bio, BIO_CLONED))
+ bio_put(bio);
+
+ return ret;
+ }
+
+ /* Prevent deadlock in the case of a modest LUN configuration and large
+ * user I/Os. Unless stalled, the rate limiter leaves at least 256KB
+ * available for user I/O.
+ */
+ if (unlikely(pblk_get_secs(bio) >= pblk_rl_sysfs_rate_show(pblk)))
+ blk_queue_split(q, &bio, q->bio_split);
+
+ ret = pblk_write_to_cache(pblk, bio, flags);
+ if (bio_flagged(bio, BIO_CLONED))
+ bio_put(bio);
+
+ return ret;
+}
+
+static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio)
+{
+ struct pblk *pblk = q->queuedata;
+ int err;
+
+ if (bio_op(bio) == REQ_OP_DISCARD) {
+ pblk_discard(pblk, bio);
+ if (!(bio->bi_opf & REQ_PREFLUSH))
+ return BLK_QC_T_NONE;
+ }
+
+ err = pblk_submit_io(q, pblk, bio, PBLK_IOTYPE_USER);
+ switch (err) {
+ case NVM_IO_OK:
+ return BLK_QC_T_NONE;
+ case NVM_IO_ERR:
+ bio_io_error(bio);
+ break;
+ case NVM_IO_DONE:
+ bio_endio(bio);
+ break;
+ case NVM_IO_REQUEUE:
+ spin_lock(&pblk->bio_lock);
+ bio_list_add(&pblk->requeue_bios, bio);
+ spin_unlock(&pblk->bio_lock);
+ queue_work(pblk->kgc_wq, &pblk->ws_requeue);
+ break;
+ }
+
+ return BLK_QC_T_NONE;
+}
+
+static void pblk_requeue(struct work_struct *work)
+{
+ struct pblk *pblk = container_of(work, struct pblk, ws_requeue);
+ struct bio_list bios;
+ struct bio *bio;
+
+ bio_list_init(&bios);
+
+ spin_lock(&pblk->bio_lock);
+ bio_list_merge(&bios, &pblk->requeue_bios);
+ bio_list_init(&pblk->requeue_bios);
+ spin_unlock(&pblk->bio_lock);
+
+ while ((bio = bio_list_pop(&bios)))
+ pblk_make_rq(pblk->disk->queue, bio);
+}
+
+static void pblk_l2p_free(struct pblk *pblk)
+{
+ vfree(pblk->trans_map);
+}
+
+static int pblk_l2p_init(struct pblk *pblk)
+{
+ sector_t i;
+
+ pblk->trans_map = vzalloc(sizeof(struct pblk_addr) * pblk->rl.nr_secs);
+ if (!pblk->trans_map)
+ return -ENOMEM;
+
+ for (i = 0; i < pblk->rl.nr_secs; i++) {
+ struct pblk_addr *p = &pblk->trans_map[i];
+
+ p->rblk = NULL;
+ ppa_set_empty(&p->ppa);
+ }
+
+ return 0;
+}
+
+static void pblk_rwb_free(struct pblk *pblk)
+{
+ pblk_rb_data_free(&pblk->rwb);
+ vfree(pblk_rb_entries_ref(&pblk->rwb));
+}
+
+static int pblk_rwb_init(struct pblk *pblk)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_rb_entry *entries;
+ unsigned long nr_entries;
+ unsigned int power_size, power_seg_sz;
+
+ nr_entries = pblk_rb_calculate_size(pblk->pgs_in_buffer);
+
+ entries = vzalloc(nr_entries * sizeof(struct pblk_rb_entry));
+ if (!entries)
+ return -ENOMEM;
+
+ power_size = get_count_order(nr_entries);
+ power_seg_sz = get_count_order(geo->sec_size);
+
+ return pblk_rb_init(&pblk->rwb, entries, power_size, power_seg_sz);
+}
+
+/* Minimum pages needed within a lun */
+#define PAGE_POOL_SIZE 16
+#define ADDR_POOL_SIZE 64
+
+static int pblk_core_init(struct pblk *pblk)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+
+ down_write(&pblk_lock);
+ if (!pblk_blk_ws_cache) {
+ pblk_blk_ws_cache = kmem_cache_create("pblk_blk_ws",
+ sizeof(struct pblk_block_ws), 0, 0, NULL);
+ if (!pblk_blk_ws_cache) {
+ up_write(&pblk_lock);
+ return -ENOMEM;
+ }
+
+ pblk_rec_cache = kmem_cache_create("pblk_rec",
+ sizeof(struct pblk_rec_ctx), 0, 0, NULL);
+ if (!pblk_rec_cache) {
+ kmem_cache_destroy(pblk_blk_ws_cache);
+ up_write(&pblk_lock);
+ return -ENOMEM;
+ }
+
+ pblk_r_rq_cache = kmem_cache_create("pblk_r_rq", pblk_r_rq_size,
+ 0, 0, NULL);
+ if (!pblk_r_rq_cache) {
+ kmem_cache_destroy(pblk_blk_ws_cache);
+ kmem_cache_destroy(pblk_rec_cache);
+ up_write(&pblk_lock);
+ return -ENOMEM;
+ }
+
+ pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size,
+ 0, 0, NULL);
+ if (!pblk_w_rq_cache) {
+ kmem_cache_destroy(pblk_blk_ws_cache);
+ kmem_cache_destroy(pblk_rec_cache);
+ kmem_cache_destroy(pblk_r_rq_cache);
+ up_write(&pblk_lock);
+ return -ENOMEM;
+ }
+
+ pblk_blk_meta_cache = kmem_cache_create("pblk_blk_m",
+ pblk->blk_meta.rlpg_page_len, 0, 0, NULL);
+ if (!pblk_blk_meta_cache) {
+ kmem_cache_destroy(pblk_blk_ws_cache);
+ kmem_cache_destroy(pblk_rec_cache);
+ kmem_cache_destroy(pblk_r_rq_cache);
+ kmem_cache_destroy(pblk_w_rq_cache);
+ up_write(&pblk_lock);
+ return -ENOMEM;
+ }
+ }
+ up_write(&pblk_lock);
+
+ pblk->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0);
+ if (!pblk->page_pool)
+ return -ENOMEM;
+
+ pblk->blk_ws_pool = mempool_create_slab_pool(geo->nr_luns,
+ pblk_blk_ws_cache);
+ if (!pblk->blk_ws_pool)
+ goto free_page_pool;
+
+ pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache);
+ if (!pblk->rec_pool)
+ goto free_blk_ws_pool;
+
+ pblk->r_rq_pool = mempool_create_slab_pool(64, pblk_r_rq_cache);
+ if (!pblk->r_rq_pool)
+ goto free_rec_pool;
+
+ pblk->w_rq_pool = mempool_create_slab_pool(64, pblk_w_rq_cache);
+ if (!pblk->w_rq_pool)
+ goto free_r_rq_pool;
+
+ pblk->blk_meta_pool = mempool_create_slab_pool(16, pblk_blk_meta_cache);
+ if (!pblk->blk_meta_pool)
+ goto free_w_rq_pool;
+
+ pblk->kw_wq = alloc_workqueue("pblk-writer",
+ WQ_MEM_RECLAIM | WQ_UNBOUND, pblk->nr_luns);
+ if (!pblk->kw_wq)
+ goto free_blk_meta_pool;
+
+ /* Init write buffer */
+ if (pblk_rwb_init(pblk))
+ goto free_kw_wq;
+
+ INIT_LIST_HEAD(&pblk->compl_list);
+ return 0;
+
+free_kw_wq:
+ destroy_workqueue(pblk->kw_wq);
+free_blk_meta_pool:
+ mempool_destroy(pblk->blk_meta_pool);
+free_w_rq_pool:
+ mempool_destroy(pblk->w_rq_pool);
+free_r_rq_pool:
+ mempool_destroy(pblk->r_rq_pool);
+free_rec_pool:
+ mempool_destroy(pblk->rec_pool);
+free_blk_ws_pool:
+ mempool_destroy(pblk->blk_ws_pool);
+free_page_pool:
+ mempool_destroy(pblk->page_pool);
+ return -ENOMEM;
+}
+
+static void pblk_core_free(struct pblk *pblk)
+{
+ if (pblk->kw_wq)
+ destroy_workqueue(pblk->kw_wq);
+
+ mempool_destroy(pblk->page_pool);
+ mempool_destroy(pblk->blk_ws_pool);
+ mempool_destroy(pblk->rec_pool);
+ mempool_destroy(pblk->r_rq_pool);
+ mempool_destroy(pblk->w_rq_pool);
+ mempool_destroy(pblk->blk_meta_pool);
+}
+
+static void pblk_luns_free(struct pblk *pblk)
+{
+ struct pblk_lun *rlun;
+ int i;
+
+ if (!pblk->luns)
+ return;
+
+ for (i = 0; i < pblk->nr_luns; i++) {
+ rlun = &pblk->luns[i];
+ vfree(rlun->blocks);
+ }
+
+ kfree(pblk->luns);
+}
+
+static int pblk_bb_discovery(struct nvm_tgt_dev *dev, struct pblk_lun *rlun)
+{
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_block *rblk;
+ struct ppa_addr ppa;
+ u8 *blks;
+ int nr_blks;
+ int i;
+ int ret;
+
+ nr_blks = geo->blks_per_lun * geo->plane_mode;
+ blks = kmalloc(nr_blks, GFP_KERNEL);
+ if (!blks)
+ return -ENOMEM;
+
+ ppa.ppa = 0;
+ ppa.g.ch = rlun->bppa.g.ch;
+ ppa.g.lun = rlun->bppa.g.lun;
+
+ ret = nvm_get_bb_tbl(dev->parent, ppa, blks);
+ if (ret) {
+ pr_err("pblk: could not get BB table\n");
+ goto out;
+ }
+
+ nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks);
+ if (nr_blks < 0) {
+ ret = nr_blks;
+ goto out;
+ }
+
+ rlun->nr_free_blocks = geo->blks_per_lun;
+ for (i = 0; i < nr_blks; i++) {
+ if (blks[i] == NVM_BLK_T_FREE)
+ continue;
+
+ rblk = &rlun->blocks[i];
+ list_move_tail(&rblk->list, &rlun->bb_list);
+ rblk->state = NVM_BLK_ST_BAD;
+ rlun->nr_free_blocks--;
+ }
+
+out:
+ kfree(blks);
+ return ret;
+}
+
+static void pblk_set_lun_ppa(struct pblk_lun *rlun, struct ppa_addr ppa)
+{
+ rlun->bppa.ppa = 0;
+ rlun->bppa.g.ch = ppa.g.ch;
+ rlun->bppa.g.lun = ppa.g.lun;
+}
+
+static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_lun *rlun;
+ int i, j, mod, ret = -EINVAL;
+ int max_write_ppas;
+
+ pblk->nr_luns = geo->nr_luns;
+
+ pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE);
+ max_write_ppas = pblk->min_write_pgs * pblk->nr_luns;
+ pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ?
+ max_write_ppas : nvm_max_phys_sects(dev);
+
+ /* TODO: Implement unbalanced LUN support */
+ if (geo->luns_per_chnl < 0) {
+ pr_err("pblk: unbalanced LUN config. not supported yet\n");
+ return -EINVAL;
+ }
+
+ if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
+ pr_err("pblk: device exposes too many sectors per write");
+ return -EINVAL;
+ }
+
+ pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg *
+ geo->nr_planes * pblk->nr_luns;
+
+ if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
+ pr_err("pblk: cannot support device max_phys_sect\n");
+ return -EINVAL;
+ }
+
+ div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod);
+ if (mod) {
+ pr_err("pblk: bad configuration of sectors/pages\n");
+ return -EINVAL;
+ }
+
+ pblk->luns = kcalloc(pblk->nr_luns, sizeof(struct pblk_lun),
+ GFP_KERNEL);
+ if (!pblk->luns)
+ return -ENOMEM;
+
+ pblk->rl.total_blocks = pblk->rl.nr_secs = 0;
+
+ /* 1:1 mapping */
+ for (i = 0; i < pblk->nr_luns; i++) {
+ /* Stripe across channels as much as we can */
+ int ch = i % geo->nr_chnls;
+ int lun_raw = i / geo->nr_chnls;
+ int lunid = lun_raw + ch * geo->luns_per_chnl;
+ struct ppa_addr ppa = luns[lunid];
+
+ rlun = &pblk->luns[i];
+ rlun->pblk = pblk;
+ rlun->id = i;
+ pblk_set_lun_ppa(rlun, ppa);
+ rlun->blocks = vzalloc(sizeof(struct pblk_block) *
+ geo->blks_per_lun);
+ if (!rlun->blocks) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ INIT_LIST_HEAD(&rlun->free_list);
+ INIT_LIST_HEAD(&rlun->bb_list);
+ INIT_LIST_HEAD(&rlun->g_bb_list);
+ INIT_LIST_HEAD(&rlun->prio_list);
+ INIT_LIST_HEAD(&rlun->open_list);
+ INIT_LIST_HEAD(&rlun->closed_list);
+
+ sema_init(&rlun->wr_sem, 1);
+
+ for (j = 0; j < geo->blks_per_lun; j++) {
+ struct pblk_block *rblk = &rlun->blocks[j];
+
+ rblk->id = j;
+ rblk->rlun = rlun;
+ rblk->state = NVM_BLK_T_FREE;
+ INIT_LIST_HEAD(&rblk->prio);
+ spin_lock_init(&rblk->lock);
+
+ list_add_tail(&rblk->list, &rlun->free_list);
+ }
+
+ if (pblk_bb_discovery(dev, rlun))
+ goto err;
+
+ spin_lock_init(&rlun->lock);
+
+ pblk->rl.total_blocks += geo->blks_per_lun;
+ pblk->rl.nr_secs += geo->sec_per_lun;
+ }
+
+ return 0;
+err:
+ return ret;
+}
+
+static int pblk_writer_init(struct pblk *pblk)
+{
+ setup_timer(&pblk->wtimer, pblk_write_timer_fn, (unsigned long)pblk);
+ mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
+
+ pblk->ts_writer = kthread_create(pblk_write_ts, pblk, "pblk-writer");
+ pblk_rl_init(pblk);
+
+ return 0;
+}
+
+static void pblk_writer_free(struct pblk *pblk)
+{
+ kthread_stop(pblk->ts_writer);
+ del_timer(&pblk->wtimer);
+}
+
+static void pblk_free(struct pblk *pblk)
+{
+ pblk_l2p_free(pblk);
+ pblk_core_free(pblk);
+ pblk_luns_free(pblk);
+ pblk_map_free(pblk);
+ pblk_writer_free(pblk);
+ pblk_rwb_free(pblk);
+ pblk_sysfs_exit(pblk);
+
+ kfree(pblk);
+}
+
+static void pblk_tear_down(struct pblk *pblk)
+{
+ pblk_flush_writer(pblk);
+ pblk_pad_open_blks(pblk);
+ pblk_rb_sync_l2p(&pblk->rwb);
+
+ if (pblk_rb_tear_down_check(&pblk->rwb)) {
+ pr_err("pblk: write buffer error on tear down\n");
+ return;
+ }
+
+ pblk_free_blks(pblk);
+
+ pr_debug("pblk: consistent tear down\n");
+
+ /* TODO: Save FTL snapshot for fast recovery */
+}
+
+static void pblk_exit(void *private)
+{
+ struct pblk *pblk = private;
+
+ down_write(&pblk_lock);
+ flush_workqueue(pblk->krqd_wq);
+ pblk_tear_down(pblk);
+ pblk_gc_exit(pblk);
+ pblk_free(pblk);
+ up_write(&pblk_lock);
+}
+
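+/*
+ * Reserve cur, gc and two emergency blocks per LUN and expose 90% of the
+ * remaining sectors as the target's capacity.
+ */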
+static sector_t pblk_capacity(void *private)
+{
+ struct pblk *pblk = private;
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ sector_t reserved, provisioned;
+
+ /* cur, gc, and two emergency blocks for each lun */
+ reserved = pblk->nr_luns * geo->sec_per_blk * 4;
+ provisioned = pblk->capacity - reserved;
+
+ if (reserved > pblk->rl.nr_secs) {
+ pr_err("pblk: not enough space available to expose storage.\n");
+ return 0;
+ }
+
+ sector_div(provisioned, 10);
+ return provisioned * 9 * NR_PHY_IN_LOG;
+}
+
+static int pblk_blocks_init(struct pblk *pblk)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_lun *rlun;
+ struct pblk_block *rblk;
+ int lun, blk;
+ int ret = 0;
+
+ /* TODO: Recover from l2p snapshot. Only perform scanning in
+ * case of failure
+ */
+
+ for (lun = 0; lun < pblk->nr_luns; lun++) {
+ rlun = &pblk->luns[lun];
+ for (blk = 0; blk < geo->blks_per_lun; blk++) {
+ rblk = &rlun->blocks[blk];
+
+ if (!rblk->state)
+ pblk->capacity += geo->sec_per_blk;
+
+#ifndef CONFIG_NVM_PBLK_NO_RECOV
+ ret = pblk_recov_scan_blk(pblk, rblk);
+ if (ret) {
+ pr_err("nvm: pblk: could not recover l2p\n");
+ return ret;
+ }
+#endif
+ }
+ }
+
+ return ret;
+}
+
+int pblk_luns_configure(struct pblk *pblk)
+{
+ struct pblk_lun *rlun;
+ struct pblk_block *rblk;
+ int i;
+
+ for (i = 0; i < pblk->nr_luns; i++) {
+ rlun = &pblk->luns[i];
+
+ /* Get first active block directly from mm pool */
+ spin_lock(&rlun->lock);
+ rblk = pblk_get_blk(pblk, rlun);
+ if (!rblk) {
+ spin_unlock(&rlun->lock);
+ goto err;
+ }
+
+ pblk_set_lun_cur(rlun, rblk);
+ spin_unlock(&rlun->lock);
+ }
+
+ return 0;
+err:
+ while (--i >= 0) {
+ rlun = &pblk->luns[i];
+
+ if (rlun->cur)
+ pblk_put_blk(pblk, rlun->cur);
+ }
+
+ return -ENOMEM;
+}
+
+static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk);
+
+/* physical block device target */
+static struct nvm_tgt_type tt_pblk = {
+ .name = "pblk",
+ .version = {1, 0, 0},
+
+ .make_rq = pblk_make_rq,
+ .capacity = pblk_capacity,
+ .end_io = pblk_end_io,
+
+ .init = pblk_init,
+ .exit = pblk_exit,
+
+ .sysfs_init = pblk_sysfs_init,
+};
+
+static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk)
+{
+ struct request_queue *bqueue = dev->q;
+ struct request_queue *tqueue = tdisk->queue;
+ struct pblk *pblk;
+ int ret;
+
+ if (dev->identity.dom & NVM_RSP_L2P) {
+ pr_err("pblk: device-side L2P table not supported. (%x)\n",
+ dev->identity.dom);
+ return ERR_PTR(-EINVAL);
+ }
+
+ pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
+ if (!pblk)
+ return ERR_PTR(-ENOMEM);
+
+ pblk->instance.tt = &tt_pblk;
+ pblk->dev = dev;
+ pblk->disk = tdisk;
+
+ bio_list_init(&pblk->requeue_bios);
+ spin_lock_init(&pblk->bio_lock);
+ spin_lock_init(&pblk->trans_lock);
+ spin_lock_init(&pblk->lock);
+ spin_lock_init(&pblk->kick_lock);
+ INIT_WORK(&pblk->ws_requeue, pblk_requeue);
+ INIT_WORK(&pblk->ws_gc, pblk_gc);
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_set(&pblk->inflight_writes, 0);
+ atomic_set(&pblk->padded_writes, 0);
+ atomic_set(&pblk->nr_flush, 0);
+ atomic_set(&pblk->req_writes, 0);
+ atomic_set(&pblk->sub_writes, 0);
+ atomic_set(&pblk->sync_writes, 0);
+ atomic_set(&pblk->compl_writes, 0);
+ atomic_set(&pblk->inflight_meta, 0);
+ atomic_set(&pblk->compl_meta, 0);
+ atomic_set(&pblk->inflight_reads, 0);
+ atomic_set(&pblk->sync_reads, 0);
+ atomic_set(&pblk->recov_writes, 0);
+ atomic_set(&pblk->recov_gc_writes, 0);
+ atomic_set(&pblk->requeued_writes, 0);
+#endif
+
+ init_waitqueue_head(&pblk->wait);
+
+ ret = pblk_luns_init(pblk, dev->luns);
+ if (ret) {
+ pr_err("pblk: could not initialize luns\n");
+ goto err;
+ }
+
+ ret = pblk_map_init(pblk);
+ if (ret) {
+ pr_err("pblk: could not initialize map\n");
+ goto err;
+ }
+
+ ret = pblk_recov_init(pblk);
+ if (ret) {
+ pr_err("pblk: could not initialize recovery\n");
+ goto err;
+ }
+
+ ret = pblk_core_init(pblk);
+ if (ret) {
+ pr_err("pblk: could not initialize core\n");
+ goto err;
+ }
+
+ ret = pblk_l2p_init(pblk);
+ if (ret) {
+ pr_err("pblk: could not initialize maps\n");
+ goto err;
+ }
+
+ ret = pblk_blocks_init(pblk);
+ if (ret) {
+ pr_err("pblk: could not initialize state for blocks\n");
+ goto err;
+ }
+
+ ret = pblk_writer_init(pblk);
+ if (ret) {
+ pr_err("pblk: could not initialize write thread\n");
+ goto err;
+ }
+
+ ret = pblk_luns_configure(pblk);
+ if (ret) {
+ pr_err("pblk: not enough blocks available in LUNs.\n");
+ goto err;
+ }
+
+ ret = pblk_gc_init(pblk);
+ if (ret) {
+ pr_err("pblk: could not initialize gc\n");
+ goto err;
+ }
+
+ /* inherit the size from the underlying device */
+ blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue));
+ blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue));
+
+ blk_queue_write_cache(tqueue, true, false);
+
+ pr_info("pblk init: luns:%u, %llu sectors, buffer entries:%lu\n",
+ pblk->nr_luns, (unsigned long long)pblk->rl.nr_secs,
+ pblk_rb_nr_entries(&pblk->rwb));
+
+ wake_up_process(pblk->ts_writer);
+ return pblk;
+err:
+ pblk_free(pblk);
+ return ERR_PTR(ret);
+}
+
+static int __init pblk_module_init(void)
+{
+ return nvm_register_tgt_type(&tt_pblk);
+}
+
+static void pblk_module_exit(void)
+{
+ nvm_unregister_tgt_type(&tt_pblk);
+}
+
+module_init(pblk_module_init);
+module_exit(pblk_module_exit);
+MODULE_AUTHOR("Javier Gonzalez <jg@xxxxxxxxxxx>");
+MODULE_AUTHOR("Matias Bjorling <m@xxxxxxxxxxx>");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Physical Block-Device Target for Open-Channel SSDs");
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
new file mode 100644
index 0000000..4c91e7a
--- /dev/null
+++ b/drivers/lightnvm/pblk-map.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <jg@xxxxxxxxxxx>
+ * Matias Bjorling <m@xxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-map.c - pblk's lba-ppa mapping strategy
+ *
+ * TODO:
+ * - Choose strategy:
+ * - Stripe across writable luns
+ * - Write to one block (one lun) at a time
+ * - Configure mapping parameters for relevant strategies (sysfs)
+ */
+
+#include "pblk.h"
+
+int __pblk_map_replace_lun(struct pblk *pblk, int lun_pos)
+{
+ int next_lun;
+
+ if (lun_pos > pblk->w_luns.nr_luns)
+ return 1;
+
+ if (unlikely(lun_pos < 0 || lun_pos >= pblk->w_luns.nr_luns)) {
+ pr_err("pblk: corrupt mapping\n");
+ return 0;
+ }
+
+ next_lun = ++pblk->w_luns.next_lun;
+ if (pblk->w_luns.next_lun == pblk->nr_luns)
+ next_lun = pblk->w_luns.next_lun = 0;
+
+ pblk->w_luns.luns[lun_pos] = &pblk->luns[next_lun];
+ return 1;
+}
+
+int pblk_map_replace_lun(struct pblk *pblk, int lun_pos)
+{
+ int ret = 1;
+
+ spin_lock(&pblk->w_luns.lock);
+ if (pblk->w_luns.nr_blocks == -1)
+ goto out;
+
+ if (++pblk->w_luns.lun_blocks[lun_pos] >= pblk->w_luns.nr_blocks) {
+ ret = __pblk_map_replace_lun(pblk, lun_pos);
+ pblk->w_luns.lun_blocks[lun_pos] = 0;
+ }
+
+out:
+ spin_unlock(&pblk->w_luns.lock);
+
+ return ret;
+}
+
+static struct pblk_lun *get_map_next_lun(struct pblk *pblk, int *lun_pos)
+{
+ struct pblk_lun *rlun;
+
+ spin_lock(&pblk->w_luns.lock);
+ *lun_pos = ++pblk->w_luns.next_w_lun;
+ if (pblk->w_luns.next_w_lun == pblk->w_luns.nr_luns)
+ *lun_pos = pblk->w_luns.next_w_lun = 0;
+
+ rlun = pblk->w_luns.luns[*lun_pos];
+ spin_unlock(&pblk->w_luns.lock);
+
+ return rlun;
+}
+
+static struct pblk_lun *pblk_map_get_lun_rr(struct pblk *pblk, int *lun_pos,
+ unsigned long *lun_bitmap,
+ int is_gc)
+{
+ struct pblk_lun *rlun;
+
+ do {
+ rlun = get_map_next_lun(pblk, lun_pos);
+ } while (test_bit(rlun->id, lun_bitmap));
+
+ return rlun;
+}
+
+/* rblk->lock must be taken */
+static inline u64 pblk_next_base_sec(struct pblk *pblk, struct pblk_block *rblk,
+ int nr_secs)
+{
+ u64 old = rblk->cur_sec;
+
+#ifdef CONFIG_NVM_DEBUG
+ int i;
+ int cur_sec = old;
+
+ for (i = 0; i < nr_secs; i++) {
+ WARN_ON(test_bit(cur_sec, rblk->sector_bitmap));
+ cur_sec++;
+ }
+#endif
+
+ /* logic error: lba out-of-bounds */
+ BUG_ON(rblk->cur_sec + nr_secs > pblk->nr_blk_dsecs);
+
+ bitmap_set(rblk->sector_bitmap, rblk->cur_sec, nr_secs);
+ rblk->cur_sec += nr_secs;
+
+ return old;
+}
+
+/* The ppa in pblk_addr comes with an offset format, not a global format */
+static void pblk_page_pad_invalidate(struct pblk *pblk, struct pblk_block *rblk,
+ struct ppa_addr a)
+{
+#ifdef CONFIG_NVM_DEBUG
+ lockdep_assert_held(&rblk->lock);
+#endif
+
+ WARN_ON(test_and_set_bit(a.ppa, rblk->invalid_bitmap));
+ rblk->nr_invalid_secs++;
+
+ pblk_rb_sync_init(&pblk->rwb, NULL);
+ WARN_ON(test_and_set_bit(a.ppa, rblk->sync_bitmap));
+ if (bitmap_full(rblk->sync_bitmap, pblk->nr_blk_dsecs))
+ pblk_run_blk_ws(pblk, rblk, pblk_close_blk);
+ pblk_rb_sync_end(&pblk->rwb, NULL);
+}
+
+static u64 pblk_alloc_page(struct pblk *pblk, struct pblk_block *rblk)
+{
+ u64 addr = ADDR_EMPTY;
+ int nr_secs = pblk->min_write_pgs;
+
+#ifdef CONFIG_NVM_DEBUG
+ lockdep_assert_held(&rblk->lock);
+#endif
+
+ if (block_is_full(pblk, rblk))
+ goto out;
+
+ addr = pblk_next_base_sec(pblk, rblk, nr_secs);
+
+out:
+ return addr;
+}
+
+int pblk_map_page(struct pblk *pblk, struct pblk_block *rblk,
+ unsigned int sentry, struct ppa_addr *ppa_list,
+ struct pblk_sec_meta *meta_list,
+ unsigned int nr_secs, unsigned int valid_secs)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_blk_rec_lpg *rlpg = rblk->rlpg;
+ struct pblk_w_ctx *w_ctx;
+ u64 *lba_list;
+ u64 paddr;
+ int i;
+
+ lba_list = pblk_rlpg_to_llba(rlpg);
+
+ spin_lock(&rblk->lock);
+ paddr = pblk_alloc_page(pblk, rblk);
+
+ if (paddr == ADDR_EMPTY) {
+ spin_unlock(&rblk->lock);
+ return 1;
+ }
+
+ for (i = 0; i < nr_secs; i++, paddr++) {
+ if (paddr == ADDR_EMPTY) {
+ /* We should always have available sectors for a full
+ * page write at this point. We get a new block for this
+ * LUN when the current block is full.
+ */
+ pr_err("pblk: corrupted l2p mapping, blk:%d,n:%d/%d\n",
+ rblk->id,
+ i, nr_secs);
+ spin_unlock(&rblk->lock);
+ return -EINVAL;
+ }
+
+ /* ppa to be sent to the device */
+ ppa_list[i] = pblk_blk_ppa_to_gaddr(dev, rblk, paddr);
+
+		/* Write context for target bio completion on write buffer.
+		 * Note that the write buffer is protected by the sync
+		 * backpointer, and that a single writer thread has access to
+		 * each specific entry at a time. Thus, it is safe to modify
+		 * the context for the entry we are setting up for submission
+		 * without taking any lock and/or memory barrier.
+		 */
+ if (i < valid_secs) {
+ w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i);
+ w_ctx->paddr = paddr;
+ w_ctx->ppa.ppa = ppa_list[i];
+ w_ctx->ppa.rblk = rblk;
+ meta_list[i].lba = w_ctx->lba;
+ lba_list[paddr] = w_ctx->lba;
+ rlpg->nr_lbas++;
+ } else {
+ meta_list[i].lba = ADDR_EMPTY;
+ lba_list[paddr] = ADDR_EMPTY;
+ pblk_page_pad_invalidate(pblk, rblk,
+ addr_to_ppa(paddr));
+ rlpg->nr_padded++;
+ }
+ }
+ spin_unlock(&rblk->lock);
+
+#ifdef CONFIG_NVM_DEBUG
+ if (pblk_boundary_checks(pblk->dev, ppa_list, nr_secs))
+ WARN_ON(1);
+#endif
+
+ return 0;
+}
+
+/* Simple round-robin logical to physical address translation.
+ *
+ * Retrieve the mapping using the active append point. Then update the append
+ * point for the next write to the disk. Mapping occurs at page granularity,
+ * i.e., if a page is 4 sectors, then each map entails 4 lba-ppa mappings -
+ * @nr_secs is the number of sectors in the page, taking the number of planes
+ * into consideration as well.
+ *
+ * TODO: We are missing the GC path
+ * TODO: Add support for MLC and TLC padding. For now only SLC is supported
+ */
+int pblk_map_rr_page(struct pblk *pblk, unsigned int sentry,
+ struct ppa_addr *ppa_list,
+ struct pblk_sec_meta *meta_list,
+ unsigned int nr_secs, unsigned int valid_secs,
+ unsigned long *lun_bitmap)
+{
+ struct pblk_block *rblk;
+ struct pblk_lun *rlun;
+ int lun_pos;
+ int ret = 0;
+
+try_lun:
+ rlun = pblk_map_get_lun_rr(pblk, &lun_pos, lun_bitmap,
+ pblk_gc_status(pblk));
+ spin_lock(&rlun->lock);
+
+try_cur:
+ rblk = rlun->cur;
+
+ /* Account for grown bad blocks */
+ if (unlikely(block_is_bad(rblk))) {
+ if (!pblk_replace_blk(pblk, rblk, rlun, lun_pos)) {
+ spin_unlock(&rlun->lock);
+ goto try_lun;
+ }
+ goto try_cur;
+ }
+
+ ret = pblk_map_page(pblk, rblk, sentry, ppa_list, meta_list,
+ nr_secs, valid_secs);
+ if (ret) {
+ if (!pblk_replace_blk(pblk, rblk, rlun, lun_pos)) {
+ spin_unlock(&rlun->lock);
+ goto try_lun;
+ }
+ goto try_cur;
+ }
+ spin_unlock(&rlun->lock);
+
+ if (down_interruptible(&rlun->wr_sem))
+ pr_err("pblk: lun semaphore failed\n");
+
+ return ret;
+}
+
+ssize_t pblk_map_set_active_luns(struct pblk *pblk, int nr_luns)
+{
+ struct pblk_lun **luns;
+ int *lun_blocks;
+ ssize_t ret = 0;
+ int old_nr_luns, cpy_luns;
+ int i;
+
+ spin_lock(&pblk->w_luns.lock);
+ if (nr_luns > pblk->nr_luns) {
+ pr_err("pblk: Not enough luns (%d > %d)\n",
+ nr_luns, pblk->nr_luns);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ old_nr_luns = pblk->w_luns.nr_luns;
+ pblk->w_luns.nr_luns = nr_luns;
+ pblk->w_luns.next_lun = (nr_luns == pblk->nr_luns) ? 0 : nr_luns + 1;
+
+ luns = kcalloc(nr_luns, sizeof(void *), GFP_ATOMIC);
+ if (!luns) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ lun_blocks = kcalloc(nr_luns, sizeof(int), GFP_ATOMIC);
+ if (!lun_blocks) {
+ kfree(luns);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ cpy_luns = (old_nr_luns > nr_luns) ? nr_luns : old_nr_luns;
+
+ for (i = 0; i < cpy_luns; i++) {
+ luns[i] = pblk->w_luns.luns[i];
+ lun_blocks[i] = pblk->w_luns.lun_blocks[i];
+ }
+
+ kfree(pblk->w_luns.luns);
+ kfree(pblk->w_luns.lun_blocks);
+
+ pblk->w_luns.luns = luns;
+ pblk->w_luns.lun_blocks = lun_blocks;
+
+ /* By default consume one block per active lun */
+ pblk->w_luns.nr_blocks = 1;
+
+ for (i = cpy_luns; i < nr_luns; i++) {
+ pblk->w_luns.lun_blocks[i] = 0;
+ if (!__pblk_map_replace_lun(pblk, i))
+ goto out;
+ }
+
+ pblk->w_luns.next_w_lun = -1;
+
+out:
+ spin_unlock(&pblk->w_luns.lock);
+ return ret;
+}
+
+int pblk_map_get_active_luns(struct pblk *pblk)
+{
+ int nr_luns;
+
+ spin_lock(&pblk->w_luns.lock);
+ nr_luns = pblk->w_luns.nr_luns;
+ spin_unlock(&pblk->w_luns.lock);
+
+ return nr_luns;
+}
+
+int pblk_map_set_consume_blocks(struct pblk *pblk, int value)
+{
+ spin_lock(&pblk->w_luns.lock);
+ pblk->w_luns.nr_blocks = value;
+ spin_unlock(&pblk->w_luns.lock);
+
+ return 0;
+}
+
+int pblk_map_get_consume_blocks(struct pblk *pblk)
+{
+ int nr_blocks;
+
+ spin_lock(&pblk->w_luns.lock);
+ nr_blocks = pblk->w_luns.nr_blocks;
+ spin_unlock(&pblk->w_luns.lock);
+
+ return nr_blocks;
+}
+
+int pblk_map_init(struct pblk *pblk)
+{
+ int i;
+
+ pblk->w_luns.nr_luns = pblk->nr_luns;
+
+ pblk->w_luns.next_lun = -1;
+ pblk->w_luns.next_w_lun = -1;
+
+ /* By default, all luns are active. No need to replace on alloc. */
+ pblk->w_luns.nr_blocks = -1;
+
+ pblk->w_luns.luns = kcalloc(pblk->w_luns.nr_luns, sizeof(void *),
+ GFP_KERNEL);
+ if (!pblk->w_luns.luns)
+ return -ENOMEM;
+
+ pblk->w_luns.lun_blocks = kcalloc(pblk->w_luns.nr_luns, sizeof(int),
+ GFP_KERNEL);
+ if (!pblk->w_luns.lun_blocks) {
+ kfree(pblk->w_luns.luns);
+ return -ENOMEM;
+ }
+
+ spin_lock_init(&pblk->w_luns.lock);
+
+ /* Set write luns in order to start with */
+ for (i = 0; i < pblk->w_luns.nr_luns; i++) {
+ pblk->w_luns.luns[i] = &pblk->luns[i];
+ pblk->w_luns.lun_blocks[i] = 0;
+ }
+
+ return 0;
+}
+
+void pblk_map_free(struct pblk *pblk)
+{
+ kfree(pblk->w_luns.luns);
+ kfree(pblk->w_luns.lun_blocks);
+}
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
new file mode 100644
index 0000000..6bac538
--- /dev/null
+++ b/drivers/lightnvm/pblk-rb.c
@@ -0,0 +1,823 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <jg@xxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ * pblk-rb.c - pblk's ring buffer
+ */
+
+#include <linux/circ_buf.h>
+
+#include "pblk.h"
+
+static DECLARE_RWSEM(pblk_rb_lock);
+
+void pblk_rb_data_free(struct pblk_rb *rb)
+{
+ struct pblk_rb_pages *p, *t;
+
+ down_write(&pblk_rb_lock);
+ list_for_each_entry_safe(p, t, &rb->pages, list) {
+ free_pages((unsigned long)page_address(p->pages), p->order);
+ list_del(&p->list);
+ kfree(p);
+ }
+ up_write(&pblk_rb_lock);
+}
+
+/*
+ * Initialize ring buffer. The data and metadata buffers must be previously
+ * allocated and their size must be a power of two
+ * (Documentation/circular-buffers.txt)
+ */
+int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
+ unsigned int power_size, unsigned int power_seg_sz)
+{
+ unsigned long init_entries = 0;
+ unsigned int alloc_order = power_size;
+ unsigned int max_order = MAX_ORDER - 1;
+ unsigned int order, iter;
+
+ down_write(&pblk_rb_lock);
+ rb->entries = rb_entry_base;
+ rb->seg_size = (1 << power_seg_sz);
+ rb->nr_entries = (1 << power_size);
+ rb->mem = rb->subm = rb->sync = rb->l2p_update = 0;
+ rb->sync_point = RB_EMPTY_ENTRY;
+
+ spin_lock_init(&rb->w_lock);
+ spin_lock_init(&rb->r_lock);
+ spin_lock_init(&rb->s_lock);
+
+ INIT_LIST_HEAD(&rb->pages);
+
+ if (alloc_order >= max_order) {
+ order = max_order;
+ iter = (1 << (alloc_order - max_order));
+ } else {
+ order = alloc_order;
+ iter = 1;
+ }
+
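+	/* For example, power_size = 12 with MAX_ORDER - 1 = 10 results in
+	 * four allocations of order 10, each one backing 1 << 10 entries
+	 * (which assumes that seg_size is at most PAGE_SIZE)
+	 */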
+ do {
+ struct pblk_rb_entry *entry;
+ struct pblk_rb_pages *page_set;
+ void *kaddr;
+ unsigned long set_size;
+ int i;
+
+ page_set = kmalloc(sizeof(struct pblk_rb_pages), GFP_KERNEL);
+ if (!page_set) {
+ up_write(&pblk_rb_lock);
+ return -ENOMEM;
+ }
+
+ page_set->order = order;
+ page_set->pages = alloc_pages(GFP_KERNEL, order);
+ if (!page_set->pages) {
+ kfree(page_set);
+ pblk_rb_data_free(rb);
+ up_write(&pblk_rb_lock);
+ return -ENOMEM;
+ }
+ kaddr = page_address(page_set->pages);
+
+ entry = &rb->entries[init_entries++];
+ entry->data = kaddr;
+ entry->w_ctx.flags |= PBLK_WRITABLE_ENTRY;
+
+ set_size = (1 << order);
+ for (i = 1; i < set_size; i++) {
+ entry = &rb->entries[init_entries++];
+ entry->data = kaddr + (i * rb->seg_size);
+ entry->w_ctx.flags |= PBLK_WRITABLE_ENTRY;
+ }
+
+ list_add_tail(&page_set->list, &rb->pages);
+ iter--;
+ } while (iter > 0);
+ up_write(&pblk_rb_lock);
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_set(&rb->inflight_sync_point, 0);
+#endif
+
+ return 0;
+}
+
+unsigned long pblk_rb_nr_entries(struct pblk_rb *rb)
+{
+ return rb->nr_entries;
+}
+
+/*
+ * pblk_rb_calculate_size -- calculate the size of the write buffer
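+ *
+ * The number of entries is rounded up to the closest power of two, with a
+ * minimum of 128 entries (1 << 7) so that a full 256KB I/O (64 4KB entries)
+ * always fits in the buffer.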
+ */
+unsigned long pblk_rb_calculate_size(unsigned long nr_entries)
+{
+ unsigned int power_size;
+
+ power_size = get_count_order(nr_entries);
+
+ /* Have a write buffer that can fit 256KB I/Os */
+ power_size = (power_size < 7) ? 7 : power_size;
+ return (1 << power_size);
+}
+
+void *pblk_rb_entries_ref(struct pblk_rb *rb)
+{
+ return rb->entries;
+}
+
+static void clean_wctx(struct pblk_w_ctx *w_ctx)
+{
+ w_ctx->flags = PBLK_WRITABLE_ENTRY;
+ ppa_set_empty(&w_ctx->ppa.ppa);
+ w_ctx->ppa.rblk = NULL;
+}
+
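+/*
+ * CIRC_CNT() returns the number of occupied entries and CIRC_SPACE() the
+ * number of free ones; both assume that the buffer size is a power of two.
+ * For example, with size = 8, head = 2 and tail = 6: count = (2 - 6) & 7 = 4
+ * and space = (6 - 2 - 1) & 7 = 3.
+ */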
+#define pblk_rb_ring_count(head, tail, size) CIRC_CNT(head, tail, size)
+#define pblk_rb_ring_space(rb, head, tail, size) \
+ (CIRC_SPACE(head, tail, size))
+
+/*
+ * Buffer space is calculated with respect to the back pointer, which signals
+ * the entries already synchronized to the media.
+ */
+unsigned long pblk_rb_space(struct pblk_rb *rb)
+{
+ unsigned long mem = READ_ONCE(rb->mem);
+ unsigned long sync = READ_ONCE(rb->sync);
+
+ return pblk_rb_ring_space(rb, mem, sync, rb->nr_entries);
+}
+
+/*
+ * Buffer count is calculated with respect to the submission pointer, which
+ * signals the entries that are ready to be sent to the media.
+ */
+unsigned long pblk_rb_count(struct pblk_rb *rb)
+{
+ unsigned long mem = READ_ONCE(rb->mem);
+ unsigned long subm = READ_ONCE(rb->subm);
+
+ return pblk_rb_ring_count(mem, subm, rb->nr_entries);
+}
+
+/*
+ * Returns how many entries are on the write buffer at the time of call and
+ * takes the submission lock. The lock is only taken if there are any entries on
+ * the buffer. This guarantees that at least the returned amount of entries
+ * will be on the buffer when reading from it.
+ */
+unsigned long pblk_rb_read_lock(struct pblk_rb *rb)
+{
+ unsigned long ret;
+
+ spin_lock(&rb->r_lock);
+
+ ret = pblk_rb_count(rb);
+ if (!ret)
+ spin_unlock(&rb->r_lock);
+ return ret;
+}
+
+unsigned long pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries)
+{
+ unsigned long subm;
+
+#ifdef CONFIG_NVM_DEBUG
+ lockdep_assert_held(&rb->r_lock);
+#endif
+
+ subm = READ_ONCE(rb->subm);
+ /* Commit read means updating submission pointer */
+ smp_store_release(&rb->subm,
+ (subm + nr_entries) & (rb->nr_entries - 1));
+ spin_unlock(&rb->r_lock);
+
+ return subm;
+}
+
+void pblk_rb_read_unlock(struct pblk_rb *rb)
+{
+#ifdef CONFIG_NVM_DEBUG
+ lockdep_assert_held(&rb->r_lock);
+#endif
+ spin_unlock(&rb->r_lock);
+}
+
+static void pblk_rb_requeue_entry(struct pblk_rb *rb,
+ struct pblk_rb_entry *entry)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct ppa_addr ppa;
+ unsigned long mem, sync;
+
+ /* Serialized in pblk_rb_write_init */
+ mem = READ_ONCE(rb->mem);
+ sync = READ_ONCE(rb->sync);
+
+ /* Maintain original bio, lba and flags */
+ pblk_ppa_set_empty(&entry->w_ctx.ppa);
+ entry->w_ctx.paddr = 0;
+
+ /* Move entry to the head of the write buffer and update l2p */
+ while (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) < 1)
+ ;
+ pblk_rb_write_entry(rb, entry->data, entry->w_ctx, mem);
+
+ ppa = pblk_cacheline_to_ppa(mem);
+ pblk_update_map(pblk, entry->w_ctx.lba, NULL, ppa);
+
+ /* Update memory pointer (head) */
+ smp_store_release(&rb->mem, (mem + 1) & (rb->nr_entries - 1));
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_inc(&pblk->inflight_writes);
+ atomic_inc(&pblk->requeued_writes);
+#endif
+}
+
+static void pblk_rb_update_map(struct pblk *pblk, struct pblk_w_ctx *w_ctx)
+{
+ struct pblk_block *rblk = w_ctx->ppa.rblk;
+ struct ppa_addr ppa = w_ctx->ppa.ppa;
+
+ pblk_update_map(pblk, w_ctx->lba, rblk, ppa);
+}
+
+static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned long *l2p_upd,
+ unsigned long to_update)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct pblk_rb_entry *entry;
+ struct pblk_w_ctx *w_ctx;
+ struct pblk_block *rblk;
+ unsigned long i;
+
+ for (i = 0; i < to_update; i++) {
+ entry = &rb->entries[*l2p_upd];
+ w_ctx = &entry->w_ctx;
+ rblk = w_ctx->ppa.rblk;
+
+		/* Grown bad block. For now, we requeue the entry to the write
+		 * buffer and make it take the normal path to get a new ppa
+		 * mapping. Since the requeued entry takes up a place in the
+		 * buffer, update an extra entry.
+		 */
+ if (unlikely(block_is_bad(rblk))) {
+ pblk_rb_requeue_entry(rb, entry);
+ goto next_unlock;
+ }
+
+ pblk_rb_update_map(pblk, w_ctx);
+next_unlock:
+ clean_wctx(w_ctx);
+ *l2p_upd = (*l2p_upd + 1) & (rb->nr_entries - 1);
+ }
+
+ return 0;
+}
+
+/*
+ * When we move the l2p_update pointer, we update the l2p table - lookups will
+ * point to the physical address instead of to the cacheline in the write buffer
+ * from this moment on.
+ */
+static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries,
+ unsigned long mem, unsigned long sync)
+{
+ unsigned long count;
+ int ret = 0;
+
+#ifdef CONFIG_NVM_DEBUG
+ lockdep_assert_held(&rb->w_lock);
+#endif
+
+ /* Update l2p as data is being overwritten */
+ if (pblk_rb_ring_space(rb, mem, rb->l2p_update, rb->nr_entries) >
+ nr_entries)
+ goto out;
+
+ count = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries);
+ ret = __pblk_rb_update_l2p(rb, &rb->l2p_update, count);
+
+out:
+ return ret;
+}
+
+/*
+ * Update the l2p entry for all sectors stored on the write buffer. This means
+ * that all future lookups to the l2p table will point to a device address, not
+ * to the cacheline in the write buffer.
+ */
+void pblk_rb_sync_l2p(struct pblk_rb *rb)
+{
+ unsigned long sync;
+ unsigned int to_update;
+
+ spin_lock(&rb->w_lock);
+
+ /* Protect from reads and writes */
+ sync = smp_load_acquire(&rb->sync);
+
+ to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries);
+ __pblk_rb_update_l2p(rb, &rb->l2p_update, to_update);
+
+ spin_unlock(&rb->w_lock);
+}
+
+/*
+ * Copy one data segment and its write context to the ring buffer entry at
+ * @ring_pos. The entry must have been reserved beforehand (e.g. via
+ * pblk_rb_may_write()); the copy busy-waits until the entry becomes writable.
+ * Typically, 4KB data chunks coming from a bio are copied to the ring buffer
+ * one entry at a time.
+ */
+void pblk_rb_write_entry(struct pblk_rb *rb, void *data,
+ struct pblk_w_ctx w_ctx,
+ unsigned int ring_pos)
+{
+ struct pblk_rb_entry *entry;
+ int flags;
+
+ entry = &rb->entries[ring_pos];
+try:
+ flags = READ_ONCE(entry->w_ctx.flags);
+ if (!(flags & PBLK_WRITABLE_ENTRY))
+ goto try;
+
+ memcpy(entry->data, data, rb->seg_size);
+
+ entry->w_ctx.bio = w_ctx.bio;
+ entry->w_ctx.lba = w_ctx.lba;
+ entry->w_ctx.ppa = w_ctx.ppa;
+ entry->w_ctx.paddr = w_ctx.paddr;
+ entry->w_ctx.priv = w_ctx.priv;
+ flags |= w_ctx.flags;
+
+ if (w_ctx.bio) {
+ /* Release pointer controlling flushes */
+ smp_store_release(&rb->sync_point, ring_pos);
+ }
+
+ flags &= ~PBLK_WRITABLE_ENTRY;
+ flags |= PBLK_WRITTEN_DATA;
+
+ /* Release flags on write context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+}
+
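+/*
+ * Reserve space on the write buffer: the space check and L2P update are done
+ * for @nr_upd entries, while the head (mem) is advanced by @nr_com entries.
+ * Returns 1 on success, with *pos set to the position where the caller can
+ * start writing; 0 means the entries could not be reserved.
+ */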
+int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_upd,
+ unsigned int nr_com, unsigned long *pos)
+{
+ unsigned long mem;
+ unsigned long sync;
+
+ spin_lock(&rb->w_lock);
+ sync = READ_ONCE(rb->sync);
+ mem = rb->mem;
+
+ if (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) < nr_upd) {
+ spin_unlock(&rb->w_lock);
+ return 0;
+ }
+
+ if (pblk_rb_update_l2p(rb, nr_upd, mem, sync)) {
+ spin_unlock(&rb->w_lock);
+ return 0;
+ }
+
+ /* Protect from read count */
+ smp_store_release(&rb->mem, (mem + nr_com) & (rb->nr_entries - 1));
+ spin_unlock(&rb->w_lock);
+
+ *pos = mem;
+ return 1;
+}
+
+/*
+ * The caller of this function must ensure that the backpointer will not
+ * overwrite the entries passed on the list.
+ */
+unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
+ struct pblk_ctx *ctx,
+ struct list_head *list,
+ unsigned int max)
+{
+ struct pblk_rb_entry *entry, *tentry;
+ struct page *page;
+ unsigned int read = 0;
+ int ret;
+
+ list_for_each_entry_safe(entry, tentry, list, index) {
+ if (read > max) {
+ pr_err("pblk: too many entries on list\n");
+ goto out;
+ }
+
+ page = virt_to_page(entry->data);
+ if (!page) {
+ pr_err("pblk: could not allocate write bio page\n");
+ goto out;
+ }
+
+ ret = bio_add_page(bio, page, rb->seg_size, 0);
+ if (ret != rb->seg_size) {
+ pr_err("pblk: could not add page to write bio\n");
+ goto out;
+ }
+
+ list_del(&entry->index);
+ read++;
+ }
+
+out:
+ return read;
+}
+
+/*
+ * Read available entries on rb and add them to the given bio. To avoid a memory
+ * copy, a page reference to the write buffer is used to be added to the bio.
+ *
+ * This function is used by the write thread to form the write bio that will
+ * persist data on the write buffer to the media.
+ */
+unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio,
+ struct pblk_ctx *ctx,
+ unsigned long pos,
+ unsigned int nr_entries,
+ unsigned int count,
+ unsigned long *sync_point)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct pblk_compl_ctx *c_ctx = ctx->c_ctx;
+ struct pblk_rb_entry *entry;
+ struct page *page;
+ unsigned int pad = 0, read = 0, to_read = nr_entries;
+ unsigned int user_io = 0, gc_io = 0;
+ unsigned int i;
+ int flags;
+ int ret;
+
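+	/* If fewer entries than requested are available, the rest of the
+	 * request is padded, e.g. nr_entries = 8 with count = 5 gives 3
+	 * padded sectors.
+	 */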
+ if (count < nr_entries) {
+ pad = nr_entries - count;
+ to_read = count;
+ }
+
+ c_ctx->sentry = pos;
+ c_ctx->nr_valid = to_read;
+ c_ctx->nr_padded = pad;
+
+ for (i = 0; i < to_read; i++) {
+ entry = &rb->entries[pos];
+
+ /* A write has been allowed into the buffer, but data is still
+ * being copied to it. It is ok to busy wait.
+ */
+try:
+ flags = READ_ONCE(entry->w_ctx.flags);
+ if (!(flags & PBLK_WRITTEN_DATA))
+ goto try;
+
+ if (flags & PBLK_IOTYPE_USER)
+ user_io++;
+ else if (flags & PBLK_IOTYPE_GC)
+ gc_io++;
+ else
+ WARN(1, "pblk: unknown IO type\n");
+
+ page = virt_to_page(entry->data);
+ if (!page) {
+ pr_err("pblk: could not allocate write bio page\n");
+ flags &= ~PBLK_WRITTEN_DATA;
+ flags |= PBLK_WRITABLE_ENTRY;
+ /* Release flags on context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+ goto out;
+ }
+
+ ret = bio_add_page(bio, page, rb->seg_size, 0);
+ if (ret != rb->seg_size) {
+ pr_err("pblk: could not add page to write bio\n");
+ flags &= ~PBLK_WRITTEN_DATA;
+ flags |= PBLK_WRITABLE_ENTRY;
+ /* Release flags on context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+ goto out;
+ }
+
+ if (entry->w_ctx.bio != NULL) {
+ *sync_point = pos;
+#ifdef CONFIG_NVM_DEBUG
+ atomic_dec(&rb->inflight_sync_point);
+#endif
+ }
+
+ flags &= ~PBLK_WRITTEN_DATA;
+ flags |= PBLK_WRITABLE_ENTRY;
+
+ /* Release flags on context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+
+ pos = (pos + 1) & (rb->nr_entries - 1);
+ }
+
+ read = to_read;
+
+ pblk_rl_out(pblk, user_io, gc_io);
+
+#ifdef CONFIG_NVM_DEBUG
+	atomic_add(pad, &pblk->padded_writes);
+#endif
+
+out:
+ return read;
+}
+
+void pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, u64 pos)
+{
+ struct pblk_rb_entry *entry;
+ void *data;
+
+ spin_lock(&rb->w_lock);
+
+#ifdef CONFIG_NVM_DEBUG
+ BUG_ON(pos >= rb->nr_entries);
+#endif
+ entry = &rb->entries[pos];
+
+ data = bio_data(bio);
+ memcpy(data, entry->data, rb->seg_size);
+
+ spin_unlock(&rb->w_lock);
+}
+
+struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned long pos)
+{
+ unsigned long entry = pos & (rb->nr_entries - 1);
+
+ return &rb->entries[entry].w_ctx;
+}
+
+unsigned long pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags)
+{
+ if (flags)
+ spin_lock_irqsave(&rb->s_lock, *flags);
+ else
+ spin_lock_irq(&rb->s_lock);
+
+ return rb->sync;
+}
+
+unsigned long pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
+{
+ struct pblk_rb_entry *entry;
+ struct pblk_w_ctx *w_ctx;
+ unsigned long sync;
+ unsigned long i;
+
+#ifdef CONFIG_NVM_DEBUG
+ lockdep_assert_held(&rb->s_lock);
+#endif
+
+ sync = READ_ONCE(rb->sync);
+
+ for (i = 0; i < nr_entries; i++) {
+ entry = &rb->entries[sync];
+ w_ctx = &entry->w_ctx;
+
+ if (w_ctx->flags & PBLK_IOTYPE_REF) {
+ struct pblk_kref_buf *ref_buf;
+
+ /* logic error */
+ BUG_ON(!w_ctx->priv);
+ ref_buf = w_ctx->priv;
+ if (kref_put(&ref_buf->ref, pblk_free_ref_mem))
+ w_ctx->priv = NULL;
+
+ w_ctx->flags &= ~PBLK_IOTYPE_REF;
+ }
+
+ sync = (sync + 1) & (rb->nr_entries - 1);
+ }
+
+ /* Protect from counts */
+ smp_store_release(&rb->sync, sync);
+
+ return sync;
+}
+
+void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags)
+{
+#ifdef CONFIG_NVM_DEBUG
+ lockdep_assert_held(&rb->s_lock);
+#endif
+
+ if (flags)
+ spin_unlock_irqrestore(&rb->s_lock, *flags);
+ else
+ spin_unlock_irq(&rb->s_lock);
+}
+
+int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio)
+{
+ struct pblk_rb_entry *entry;
+ unsigned long mem, subm, sync_point;
+ int ret = 0;
+
+ spin_lock(&rb->r_lock);
+
+ /* Protect from reads and writes */
+ mem = smp_load_acquire(&rb->mem);
+ /* Protect syncs */
+ sync_point = smp_load_acquire(&rb->sync_point);
+ subm = READ_ONCE(rb->subm);
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_inc(&rb->inflight_sync_point);
+#endif
+
+ if (mem == subm)
+ goto out;
+
+ sync_point = (mem == 0) ? (rb->nr_entries - 1) : (mem - 1);
+ entry = &rb->entries[sync_point];
+
+ if (entry->w_ctx.bio) {
+ pr_err("pblk: Duplicated sync point:%lu\n", sync_point);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ entry->w_ctx.bio = bio;
+
+ /* Protect syncs */
+ smp_store_release(&rb->sync_point, sync_point);
+
+ ret = 1;
+
+out:
+ spin_unlock(&rb->r_lock);
+ return ret;
+}
+
+void pblk_rb_sync_point_reset(struct pblk_rb *rb, unsigned long sp)
+{
+ unsigned long sync_point;
+
+ /* Protect syncs */
+ sync_point = smp_load_acquire(&rb->sync_point);
+
+ if (sync_point == sp) {
+ /* Protect syncs */
+ smp_store_release(&rb->sync_point, ADDR_EMPTY);
+ }
+}
+
+unsigned long pblk_rb_sync_point_count(struct pblk_rb *rb)
+{
+ unsigned long subm, sync_point, count;
+
+ /* Protect syncs */
+ sync_point = smp_load_acquire(&rb->sync_point);
+ if (sync_point == ADDR_EMPTY)
+ return 0;
+
+ subm = READ_ONCE(rb->subm);
+
+ /* The sync point itself counts as a sector to sync */
+ count = pblk_rb_ring_count(sync_point, subm, rb->nr_entries) + 1;
+
+ return count;
+}
+
+/*
+ * Scan from the current position of the sync pointer to find the entry that
+ * corresponds to the given ppa. This is necessary since write requests can be
+ * completed out of order. The assumption is that the ppa is close to the sync
+ * pointer, so the search will not take long.
+ *
+ * The caller of this function must guarantee that the sync pointer will not
+ * reach the entry while it is using the metadata associated with it. With this
+ * assumption in mind, there is no need to take the sync lock.
+ */
+struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
+ struct ppa_addr *ppa)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_rb_entry *entry;
+ struct pblk_w_ctx *w_ctx;
+ struct ppa_addr gppa;
+ unsigned long sync, subm, count;
+ unsigned long i;
+
+ sync = READ_ONCE(rb->sync);
+ subm = READ_ONCE(rb->subm);
+ count = pblk_rb_ring_count(subm, sync, rb->nr_entries);
+
+ for (i = 0; i < count; i++) {
+ entry = &rb->entries[sync];
+ w_ctx = &entry->w_ctx;
+
+ gppa = pblk_blk_ppa_to_gaddr(dev, w_ctx->ppa.rblk, w_ctx->paddr);
+
+ if (gppa.ppa == ppa->ppa)
+ return entry;
+
+ sync = (sync + 1) & (rb->nr_entries - 1);
+ }
+
+ return NULL;
+}
+
+int pblk_rb_tear_down_check(struct pblk_rb *rb)
+{
+ struct pblk_rb_entry *entry;
+ int i;
+ int ret = 0;
+
+ spin_lock(&rb->w_lock);
+ spin_lock(&rb->r_lock);
+ spin_lock_irq(&rb->s_lock);
+
+ if ((rb->mem == rb->subm) && (rb->subm == rb->sync) &&
+ (rb->sync == rb->l2p_update) &&
+ (rb->sync_point == RB_EMPTY_ENTRY)) {
+ goto out;
+ }
+
+	if (!rb->entries)
+ goto out;
+
+ for (i = 0; i < rb->nr_entries; i++) {
+ entry = &rb->entries[i];
+
+ if (entry->data)
+ goto out;
+ }
+
+ ret = 1;
+
+out:
+ spin_unlock(&rb->w_lock);
+ spin_unlock(&rb->r_lock);
+ spin_unlock_irq(&rb->s_lock);
+
+ return ret;
+}
+
+unsigned long pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned long pos)
+{
+ return (pos & (rb->nr_entries - 1));
+}
+
+int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos)
+{
+ return (pos >= rb->nr_entries);
+}
+
+#ifdef CONFIG_NVM_DEBUG
+ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
+{
+ ssize_t offset;
+
+ if (rb->sync_point != ADDR_EMPTY)
+ offset = scnprintf(buf, PAGE_SIZE,
+ "%lu\t%lu\t%lu\t%lu\t%lu\t%u\t%lu\n",
+ rb->nr_entries,
+ rb->mem,
+ rb->subm,
+ rb->sync,
+ rb->l2p_update,
+ atomic_read(&rb->inflight_sync_point),
+ rb->sync_point);
+ else
+ offset = scnprintf(buf, PAGE_SIZE,
+ "%lu\t%lu\t%lu\t%lu\t%lu\t%u\tNULL\n",
+ rb->nr_entries,
+ rb->mem,
+ rb->subm,
+ rb->sync,
+ rb->l2p_update,
+ atomic_read(&rb->inflight_sync_point));
+
+ return offset;
+}
+#endif
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
new file mode 100644
index 0000000..50661a0b
--- /dev/null
+++ b/drivers/lightnvm/pblk-read.c
@@ -0,0 +1,614 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <jg@xxxxxxxxxxx>
+ * Matias Bjorling <m@xxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ *
+ * pblk-read.c - pblk's read path
+ */
+
+#include "pblk.h"
+
+static void pblk_setup_seq_reads(struct pblk *pblk, struct ppa_addr *ppas,
+ sector_t bladdr, int nr_secs)
+{
+ struct pblk_addr *gp;
+ int i;
+
+ spin_lock(&pblk->trans_lock);
+ for (i = 0; i < nr_secs; i++) {
+ gp = &pblk->trans_map[bladdr + i];
+ ppas[i] = gp->ppa;
+ }
+ spin_unlock(&pblk->trans_lock);
+}
+
+static void pblk_setup_rand_reads(struct pblk *pblk, struct ppa_addr *ppas,
+ u64 *lba_list, int nr_secs)
+{
+ struct pblk_addr *gp;
+ sector_t lba;
+ int i;
+
+ spin_lock(&pblk->trans_lock);
+ for (i = 0; i < nr_secs; i++) {
+ lba = lba_list[i];
+ if (lba == ADDR_EMPTY)
+ continue;
+
+ gp = &pblk->trans_map[lba];
+ ppas[i] = gp->ppa;
+ }
+ spin_unlock(&pblk->trans_lock);
+}
+
+/*
+ * There is no guarantee that the value read from cache has not been updated. In
+ * order to guarantee that writes and reads are ordered, a flush must be issued.
+ */
+static void pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
+ struct ppa_addr ppa)
+{
+ pblk_rb_copy_to_bio(&pblk->rwb, bio, nvm_addr_to_cacheline(ppa));
+}
+
+static int pblk_try_read_from_cache(struct pblk *pblk, struct bio *bio,
+ struct ppa_addr ppa)
+{
+ /* The write thread commits the changes to the buffer once the l2p table
+ * has been updated. In this way, if the address read from the l2p table
+ * points to a cacheline, the lba lock guarantees that the entry is not
+ * going to be updated by new writes.
+ */
+ if (!nvm_addr_in_cache(ppa))
+ return 0;
+
+ pblk_read_from_cache(pblk, bio, ppa);
+ return 1;
+}
+
+static int pblk_read_ppalist_rq(struct pblk *pblk, struct bio *bio,
+ struct nvm_rq *rqd, unsigned long flags,
+ int nr_secs, unsigned long *read_bitmap)
+{
+ sector_t laddr = pblk_get_laddr(bio);
+ struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
+ int advanced_bio = 0;
+ int i, j = 0;
+
+ /* logic error: lba out-of-bounds */
+ BUG_ON(!(laddr >= 0 && laddr + nr_secs < pblk->rl.nr_secs));
+
+ pblk_setup_seq_reads(pblk, ppas, laddr, nr_secs);
+
+ for (i = 0; i < nr_secs; i++) {
+ struct ppa_addr *p = &ppas[i];
+
+ if (ppa_empty(*p)) {
+ WARN_ON(test_and_set_bit(i, read_bitmap));
+ continue;
+ }
+
+ /* Try to read from write buffer. Those addresses that cannot be
+ * read from the write buffer are sequentially added to the ppa
+ * list, which will later on be used to submit an I/O to the
+ * device to retrieve data.
+ */
+ if (nvm_addr_in_cache(*p)) {
+ WARN_ON(test_and_set_bit(i, read_bitmap));
+ if (unlikely(!advanced_bio)) {
+ /* This is at least a partially filled bio,
+ * advance it to copy data to the right place.
+ * We will deal with partial bios later on.
+ */
+ bio_advance(bio, i * PBLK_EXPOSED_PAGE_SIZE);
+ advanced_bio = 1;
+ }
+ pblk_read_from_cache(pblk, bio, *p);
+ } else {
+ /* Fill ppa_list with the sectors that cannot be
+ * read from cache
+ */
+ rqd->ppa_list[j] = *p;
+ j++;
+ }
+
+ if (advanced_bio)
+ bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_add(nr_secs, &pblk->inflight_reads);
+#endif
+
+ return NVM_IO_OK;
+}
+
+static int pblk_submit_read_io(struct pblk *pblk, struct bio *bio,
+ struct nvm_rq *rqd, unsigned long flags)
+{
+ int err;
+
+ rqd->flags = pblk_set_read_mode(pblk);
+
+ err = nvm_submit_io(pblk->dev, rqd);
+ if (err) {
+ pr_err("pblk: I/O submission failed: %d\n", err);
+ bio_put(bio);
+ return NVM_IO_ERR;
+ }
+
+ return NVM_IO_OK;
+}
+
+static int pblk_fill_partial_read_bio(struct pblk *pblk, struct bio *bio,
+ unsigned int bio_init_idx,
+ unsigned long *read_bitmap,
+ struct nvm_rq *rqd, uint8_t nr_secs)
+{
+ struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+ void *ppa_ptr = NULL;
+ dma_addr_t dma_ppa_list = 0;
+ int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
+ struct bio *new_bio;
+ struct bio_vec src_bv, dst_bv;
+ void *src_p, *dst_p;
+ int hole;
+ int i;
+ int ret;
+ uint16_t flags;
+ DECLARE_COMPLETION_ONSTACK(wait);
+#ifdef CONFIG_NVM_DEBUG
+ struct ppa_addr *ppa_list;
+#endif
+
+ new_bio = bio_alloc(GFP_KERNEL, nr_holes);
+ if (!new_bio) {
+ pr_err("pblk: could not alloc read bio\n");
+ return NVM_IO_ERR;
+ }
+
+ if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
+ goto err;
+
+ if (nr_holes != new_bio->bi_vcnt) {
+ pr_err("pblk: malformed bio\n");
+ goto err;
+ }
+
+ new_bio->bi_iter.bi_sector = 0; /* artificial bio */
+ bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
+ new_bio->bi_private = &wait;
+ new_bio->bi_end_io = pblk_end_sync_bio;
+
+ flags = r_ctx->flags;
+ r_ctx->flags |= PBLK_IOTYPE_SYNC;
+ rqd->bio = new_bio;
+ rqd->nr_ppas = nr_holes;
+
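+	/* If only one hole is left, the request is submitted as a single-ppa
+	 * command; save the ppa list here so that it can be restored once the
+	 * I/O completes.
+	 */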
+ if (unlikely(nr_secs > 1 && nr_holes == 1)) {
+ ppa_ptr = rqd->ppa_list;
+ dma_ppa_list = rqd->dma_ppa_list;
+ rqd->ppa_addr = rqd->ppa_list[0];
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
+ if (pblk_boundary_checks(pblk->dev, ppa_list, rqd->nr_ppas))
+ WARN_ON(1);
+#endif
+
+ ret = pblk_submit_read_io(pblk, new_bio, rqd, r_ctx->flags);
+ wait_for_completion_io(&wait);
+ if (ret) {
+ pr_err("pblk: read IO submission failed\n");
+ r_ctx->flags = 0;
+ goto err;
+ }
+
+ if (new_bio->bi_error) {
+ inc_stat(pblk, &pblk->read_failed, 0);
+#ifdef CONFIG_NVM_DEBUG
+ pblk_print_failed_rqd(pblk, rqd, new_bio->bi_error);
+#endif
+ }
+
+ if (unlikely(nr_secs > 1 && nr_holes == 1)) {
+ rqd->ppa_list = ppa_ptr;
+ rqd->dma_ppa_list = dma_ppa_list;
+ }
+
+ /* Fill the holes in the original bio */
+ i = 0;
+ hole = find_first_zero_bit(read_bitmap, nr_secs);
+ do {
+ src_bv = new_bio->bi_io_vec[i];
+ dst_bv = bio->bi_io_vec[bio_init_idx + hole];
+
+ src_p = kmap_atomic(src_bv.bv_page);
+ dst_p = kmap_atomic(dst_bv.bv_page);
+
+		memcpy(dst_p + dst_bv.bv_offset,
+			src_p + src_bv.bv_offset,
+			PBLK_EXPOSED_PAGE_SIZE);
+
+		kunmap_atomic(src_p);
+		kunmap_atomic(dst_p);
+
+		mempool_free(src_bv.bv_page, pblk->page_pool);
+
+ i++;
+ hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1);
+ } while (hole < nr_secs);
+
+ bio_put(new_bio);
+
+ /* Complete the original bio and associated request */
+ r_ctx->flags = flags;
+ rqd->bio = bio;
+ rqd->nr_ppas = nr_secs;
+
+ bio_endio(bio);
+ pblk_end_io(rqd);
+ return NVM_IO_OK;
+
+err:
+ /* Free allocated pages in new bio */
+	pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt);
+ pblk_end_io(rqd);
+ return NVM_IO_ERR;
+}
+
+static int __pblk_submit_read(struct pblk *pblk, struct nvm_rq *rqd,
+ struct bio *bio, unsigned long *read_bitmap,
+ unsigned int bio_init_idx, int flags, int nr_secs,
+ int clone_read)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ int ret = NVM_IO_OK;
+
+ /* All sectors are to be read from the device */
+ if (bitmap_empty(read_bitmap, nr_secs)) {
+ struct bio *int_bio = NULL;
+#ifdef CONFIG_NVM_DEBUG
+ struct ppa_addr *ppa_list;
+
+ ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
+ if (pblk_boundary_checks(pblk->dev, ppa_list, rqd->nr_ppas))
+ WARN_ON(1);
+#endif
+
+ if (clone_read) {
+ struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+
+ /* Clone read bio to deal with read errors internally */
+ int_bio = bio_clone_bioset(bio, GFP_KERNEL, fs_bio_set);
+ if (!int_bio) {
+ pr_err("pblk: could not clone read bio\n");
+ goto fail_ppa_free;
+ }
+
+ rqd->bio = int_bio;
+ r_ctx->orig_bio = bio;
+ }
+
+ ret = pblk_submit_read_io(pblk, int_bio, rqd, flags);
+ if (ret) {
+ pr_err("pblk: read IO submission failed\n");
+ if (int_bio)
+ bio_put(int_bio);
+ goto fail_ppa_free;
+ }
+
+ return NVM_IO_OK;
+ }
+
+ /* The read bio request could be partially filled by the write buffer,
+ * but there are some holes that need to be read from the drive.
+ */
+ ret = pblk_fill_partial_read_bio(pblk, bio, bio_init_idx, read_bitmap,
+ rqd, nr_secs);
+ if (ret) {
+ pr_err("pblk: failed to perform partial read\n");
+ goto fail_ppa_free;
+ }
+
+ return NVM_IO_OK;
+
+fail_ppa_free:
+ if ((nr_secs > 1) && (!(flags & PBLK_IOTYPE_GC)))
+ nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
+ return ret;
+}
+
+static int pblk_read_rq(struct pblk *pblk, struct bio *bio, struct nvm_rq *rqd,
+ sector_t laddr, unsigned long *read_bitmap,
+ unsigned long flags)
+{
+ struct pblk_addr *gp;
+ struct ppa_addr ppa;
+ int ret = NVM_IO_OK;
+
+ if (laddr == ADDR_EMPTY) {
+ WARN_ON(test_and_set_bit(0, read_bitmap));
+ ret = NVM_IO_DONE;
+ goto out;
+ }
+
+ /* logic error: lba out-of-bounds */
+ BUG_ON(!(laddr >= 0 && laddr < pblk->rl.nr_secs));
+
+ spin_lock(&pblk->trans_lock);
+ gp = &pblk->trans_map[laddr];
+ ppa = gp->ppa;
+ spin_unlock(&pblk->trans_lock);
+
+ if (ppa_empty(ppa)) {
+ WARN_ON(test_and_set_bit(0, read_bitmap));
+ return NVM_IO_DONE;
+ }
+
+ if (pblk_try_read_from_cache(pblk, bio, ppa)) {
+ WARN_ON(test_and_set_bit(0, read_bitmap));
+ return NVM_IO_DONE;
+ }
+
+ rqd->ppa_addr = ppa;
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_inc(&pblk->inflight_reads);
+#endif
+ return NVM_IO_OK;
+out:
+ return ret;
+}
+
+int pblk_submit_read(struct pblk *pblk, struct bio *bio, unsigned long flags)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ int nr_secs = pblk_get_secs(bio);
+ unsigned int bio_init_idx;
+ struct nvm_rq *rqd;
+ struct pblk_r_ctx *r_ctx;
+ unsigned long read_bitmap; /* Max 64 ppas per request */
+ int ret = NVM_IO_ERR;
+
+ if (nr_secs > PBLK_MAX_REQ_ADDRS)
+ return NVM_IO_ERR;
+
+ bitmap_zero(&read_bitmap, nr_secs);
+
+ rqd = pblk_alloc_rqd(pblk, READ);
+ if (IS_ERR(rqd)) {
+		pr_err_ratelimited("pblk: not able to alloc rqd\n");
+ bio_io_error(bio);
+ return NVM_IO_ERR;
+ }
+ r_ctx = nvm_rq_to_pdu(rqd);
+
+ /* Save the index for this bio's start. This is needed in case
+ * we need to fill a partial read.
+ */
+ bio_init_idx = pblk_get_bi_idx(bio);
+
+ if (nr_secs > 1) {
+ rqd->ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+ &rqd->dma_ppa_list);
+ if (!rqd->ppa_list) {
+ pr_err("pblk: not able to allocate ppa list\n");
+ goto fail_rqd_free;
+ }
+
+ pblk_read_ppalist_rq(pblk, bio, rqd, flags, nr_secs,
+ &read_bitmap);
+ } else {
+ sector_t laddr = pblk_get_laddr(bio);
+
+ ret = pblk_read_rq(pblk, bio, rqd, laddr, &read_bitmap, flags);
+ if (ret)
+ goto fail_rqd_free;
+ }
+
+ rqd->opcode = NVM_OP_PREAD;
+ rqd->bio = bio;
+ rqd->ins = &pblk->instance;
+ rqd->nr_ppas = nr_secs;
+ r_ctx->flags = flags;
+
+ bio_get(bio);
+ if (bitmap_full(&read_bitmap, nr_secs)) {
+ bio_endio(bio);
+ pblk_end_io(rqd);
+ return NVM_IO_OK;
+ }
+
+ return __pblk_submit_read(pblk, rqd, bio, &read_bitmap, bio_init_idx,
+ flags, nr_secs, 1);
+
+fail_rqd_free:
+ pblk_free_rqd(pblk, rqd, READ);
+ return ret;
+}
+
+static int read_ppalist_rq_gc(struct pblk *pblk, struct bio *bio,
+ struct nvm_rq *rqd, u64 *lba_list,
+ unsigned int nr_secs, unsigned long *read_bitmap,
+ unsigned long flags)
+{
+ struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
+ sector_t lba;
+ int advanced_bio = 0;
+ int valid_secs = 0;
+ int i, j = 0;
+
+ pblk_setup_rand_reads(pblk, ppas, lba_list, nr_secs);
+
+ for (i = 0; i < nr_secs; i++) {
+ struct ppa_addr *p = &ppas[i];
+
+ lba = lba_list[i];
+
+ if (lba == ADDR_EMPTY || ppa_empty(*p))
+ continue;
+
+ /* logic error: lba out-of-bounds */
+ BUG_ON(!(lba >= 0 && lba < pblk->rl.nr_secs));
+
+ /* Try to read from write buffer. Those addresses that cannot be
+ * read from the write buffer are sequentially added to the ppa
+ * list, which will later on be used to submit an I/O to the
+ * device to retrieve data.
+ */
+ if (nvm_addr_in_cache(*p)) {
+ WARN_ON(test_and_set_bit(valid_secs, read_bitmap));
+ if (unlikely(!advanced_bio)) {
+ /* This is at least a partially filled bio,
+ * advance it to copy data to the right place.
+ * We will deal with partial bios later on.
+ */
+ bio_advance(bio, valid_secs *
+ PBLK_EXPOSED_PAGE_SIZE);
+ advanced_bio = 1;
+ }
+ pblk_read_from_cache(pblk, bio, *p);
+ } else {
+ /* Fill ppa_list with the sectors that cannot be
+ * read from cache
+ */
+ rqd->ppa_list[j] = *p;
+ j++;
+ }
+
+ valid_secs++;
+
+ if (advanced_bio)
+ bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_add(nr_secs, &pblk->inflight_reads);
+#endif
+ return valid_secs;
+}
+
+int pblk_submit_read_gc(struct pblk *pblk, struct bio *bio,
+ struct nvm_rq *rqd, u64 *lba_list,
+ unsigned int nr_secs, unsigned int nr_rec_secs,
+ unsigned long flags)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+ unsigned int bio_init_idx;
+ unsigned long read_bitmap; /* Max 64 ppas per request */
+ unsigned int valid_secs = 1;
+ int ret;
+
+ if ((nr_rec_secs != bio->bi_vcnt) || (nr_rec_secs > PBLK_MAX_REQ_ADDRS))
+ return NVM_IO_ERR;
+
+ bitmap_zero(&read_bitmap, nr_secs);
+
+ /* Save the bvl_vec index for this bio's start. This is needed in case
+ * we need to fill a partial read.
+ */
+ bio_init_idx = pblk_get_bi_idx(bio);
+
+ if (nr_rec_secs > 1) {
+ rqd->ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+ &rqd->dma_ppa_list);
+ if (!rqd->ppa_list) {
+ pr_err("pblk: not able to allocate ppa list\n");
+ return NVM_IO_ERR;
+ }
+
+ valid_secs = read_ppalist_rq_gc(pblk, bio, rqd, lba_list,
+ nr_secs, &read_bitmap, flags);
+ } else {
+ sector_t laddr = lba_list[0];
+
+ ret = pblk_read_rq(pblk, bio, rqd, laddr, &read_bitmap, flags);
+ if (ret)
+ return ret;
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ BUG_ON(nr_rec_secs != valid_secs);
+#endif
+
+ rqd->opcode = NVM_OP_PREAD;
+ rqd->bio = bio;
+ rqd->ins = &pblk->instance;
+ rqd->nr_ppas = valid_secs;
+ r_ctx->flags = flags;
+
+ if (bitmap_full(&read_bitmap, valid_secs)) {
+ bio_endio(bio);
+ return NVM_IO_OK;
+ }
+
+ return __pblk_submit_read(pblk, rqd, bio, &read_bitmap, bio_init_idx,
+ flags, valid_secs, 0);
+}
+
+void pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, uint8_t nr_secs)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+ struct bio *bio = rqd->bio;
+ struct bio *orig_bio = r_ctx->orig_bio;
+
+ if (bio->bi_error) {
+ switch (bio->bi_error) {
+ case NVM_RSP_WARN_HIGHECC:
+ inc_stat(pblk, &pblk->read_high_ecc, 1);
+ break;
+ case NVM_RSP_ERR_FAILECC:
+ inc_stat(pblk, &pblk->read_failed, 1);
+ break;
+ case NVM_RSP_ERR_EMPTYPAGE:
+ inc_stat(pblk, &pblk->read_empty, 1);
+ break;
+ default:
+ pr_err("pblk: unknown read error:%d\n", bio->bi_error);
+ }
+#ifdef CONFIG_NVM_DEBUG
+ pblk_print_failed_rqd(pblk, rqd, bio->bi_error);
+#endif
+ }
+
+ if (r_ctx->flags & PBLK_IOTYPE_SYNC)
+ return;
+
+ if (nr_secs > 1)
+ nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
+
+ if (rqd->meta_list)
+ nvm_dev_dma_free(dev->parent, rqd->meta_list,
+ rqd->dma_meta_list);
+
+ bio_put(bio);
+ if (orig_bio) {
+#ifdef CONFIG_NVM_DEBUG
+ BUG_ON(orig_bio->bi_error);
+#endif
+ bio_endio(orig_bio);
+ bio_put(orig_bio);
+ }
+
+ pblk_free_rqd(pblk, rqd, READ);
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_add(nr_secs, &pblk->sync_reads);
+ atomic_sub(nr_secs, &pblk->inflight_reads);
+#endif
+}
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
new file mode 100644
index 0000000..3955344
--- /dev/null
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -0,0 +1,792 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial: Javier Gonzalez <jg@xxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-recovery.c - pblk's recovery path
+ */
+
+#include "pblk.h"
+
+/*
+ * Write Retry - These set of functions implement recovery mechanisms for a
+ * failed write.
+ */
+static void pblk_rec_valid_pgs(struct work_struct *work)
+{
+ struct pblk_block_ws *blk_ws = container_of(work, struct pblk_block_ws,
+ ws_blk);
+ struct pblk *pblk = blk_ws->pblk;
+ struct pblk_block *rblk = blk_ws->rblk;
+ struct pblk_blk_rec_lpg *rlpg = rblk->rlpg;
+ u64 *lba_list = pblk_rlpg_to_llba(rlpg);
+ unsigned int nr_entries;
+ int off_sync, off;
+ int try = 0;
+ int ret;
+
+ spin_lock(&rblk->lock);
+ nr_entries = bitmap_weight(rblk->sync_bitmap, pblk->nr_blk_dsecs);
+
+ /* Recovery for this block already in progress */
+ if (nr_entries == 0) {
+ spin_unlock(&rblk->lock);
+ goto out;
+ }
+
+retry_off:
+ off_sync = find_first_bit(rblk->sync_bitmap, pblk->nr_blk_dsecs);
+ off = find_first_bit(rblk->sector_bitmap, pblk->nr_blk_dsecs);
+
+ if (off_sync != off)
+ goto retry_off;
+
+ /* Clear mapped pages as they are set for recovery */
+ bitmap_clear(rblk->sync_bitmap, off, nr_entries);
+ bitmap_clear(rblk->sector_bitmap, off, nr_entries);
+ spin_unlock(&rblk->lock);
+
+retry_move:
+ ret = pblk_gc_move_valid_secs(pblk, rblk, &lba_list[off], nr_entries);
+ if (ret != nr_entries) {
+ pr_err("pblk: could not recover all sectors:blk:%d\n",
+ rblk->id);
+		if (try++ < PBLK_GC_TRIES) {
+ off += ret;
+ goto retry_move;
+ } else {
+ pr_err("pblk: recovery failed\n");
+ }
+ }
+
+ spin_lock(&rblk->rlun->lock);
+ list_move_tail(&rblk->list, &rblk->rlun->g_bb_list);
+ spin_unlock(&rblk->rlun->lock);
+
+ mempool_free(blk_ws, pblk->blk_ws_pool);
+ return;
+out:
+ mempool_free(blk_ws, pblk->blk_ws_pool);
+}
+
+static int pblk_setup_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_ctx *ctx, unsigned int nr_rec_secs)
+{
+ struct pblk_compl_ctx *c_ctx = ctx->c_ctx;
+ unsigned int valid_secs = c_ctx->nr_valid;
+ unsigned int padded_secs = c_ctx->nr_padded;
+ unsigned int nr_secs = valid_secs + padded_secs;
+ unsigned long lun_bitmap[PBLK_MAX_LUNS_BITMAP];
+ unsigned int setup_secs;
+ struct pblk_sec_meta *meta;
+ int min = pblk->min_write_pgs;
+ int i;
+ int ret = 0;
+#ifdef CONFIG_NVM_DEBUG
+ struct ppa_addr *ppa_list;
+#endif
+
+ bitmap_zero(lun_bitmap, pblk->nr_luns);
+
+ ret = pblk_write_alloc_rq(pblk, rqd, ctx, nr_rec_secs);
+ if (ret)
+ goto out;
+
+ meta = rqd->meta_list;
+
+ if (unlikely(nr_rec_secs == 1)) {
+		/*
+		 * Single sector path - this path is highly improbable since
+		 * controllers typically deal with multi-sector and multi-plane
+		 * pages. It is, however, useful for testing on QEMU.
+		 */
+#ifdef CONFIG_NVM_DEBUG
+ BUG_ON(nr_secs != 1);
+ BUG_ON(padded_secs != 0);
+#endif
+ ret = pblk_write_setup_s(pblk, rqd, ctx, meta, lun_bitmap);
+ goto out;
+ }
+
+ for (i = 0; i < nr_rec_secs; i += min) {
+ if (i + min > nr_rec_secs) {
+ setup_secs = nr_rec_secs % min;
+
+ if (c_ctx->nr_valid == 0) {
+ c_ctx->nr_padded -= min;
+ } else if (c_ctx->nr_valid >= min) {
+ c_ctx->nr_valid -= min;
+ } else {
+ c_ctx->nr_padded -= min - c_ctx->nr_valid;
+ c_ctx->nr_valid = 0;
+ }
+ }
+
+ setup_secs = (i + min > nr_rec_secs) ?
+ (nr_rec_secs % min) : min;
+ ret = pblk_write_setup_m(pblk, rqd, ctx, meta, setup_secs, i,
+ lun_bitmap);
+ }
+
+ rqd->ppa_status = (u64)0;
+ rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+
+#ifdef CONFIG_NVM_DEBUG
+ ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
+ if (pblk_boundary_checks(pblk->dev, rqd->ppa_list, rqd->nr_ppas))
+ WARN_ON(1);
+#endif
+out:
+ return ret;
+}
+
+/* pblk_submit_rec -- work queue handler to submit recovery requests
+ *
+ * When a write request fails, rqd->ppa_status signals which specific ppas could
+ * not be written to the media. All ppas previous to the failed writes are
+ * completed when the I/O finishes, as part of the end_io recovery. However,
+ * successful writes after the failed ppas are not completed, in order to
+ * maintain the consistency of the back pointer that guarantees sequentiality on
+ * the write buffer.
+ */
+void pblk_submit_rec(struct work_struct *work)
+{
+ struct pblk_rec_ctx *recovery =
+ container_of(work, struct pblk_rec_ctx, ws_rec);
+ struct pblk *pblk = recovery->pblk;
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_rq *rqd = recovery->rqd;
+ struct pblk_ctx *ctx = pblk_set_ctx(pblk, rqd);
+ int max_secs = nvm_max_phys_sects(dev);
+ struct bio *bio;
+ unsigned int nr_rec_secs;
+ unsigned int pgs_read;
+ int err;
+
+ nr_rec_secs =
+ bitmap_weight((unsigned long int *)&rqd->ppa_status, max_secs);
+
+ bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
+ if (!bio) {
+ pr_err("pblk: not able to create recovery bio\n");
+ return;
+ }
+ bio->bi_iter.bi_sector = 0; /* artificial bio */
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ rqd->bio = bio;
+
+ pgs_read = pblk_rb_read_to_bio_list(&pblk->rwb, bio, ctx,
+ &recovery->failed, nr_rec_secs);
+ if (pgs_read != nr_rec_secs) {
+ pr_err("pblk: could not read recovery entries\n");
+ goto fail;
+ }
+
+ if (pblk_setup_rec_rq(pblk, rqd, ctx, nr_rec_secs)) {
+ pr_err("pblk: could not setup recovery request\n");
+ goto fail;
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_add(nr_rec_secs, &pblk->recov_writes);
+#endif
+
+ err = nvm_submit_io(dev, rqd);
+ if (err) {
+ pr_err("pblk: I/O submission failed: %d\n", err);
+ goto fail;
+ }
+
+ mempool_free(recovery, pblk->rec_pool);
+ return;
+
+fail:
+	bio_put(bio);
+	pblk_free_rqd(pblk, rqd, WRITE);
+	mempool_free(recovery, pblk->rec_pool);
+}
+
+void pblk_run_recovery(struct pblk *pblk, struct pblk_block *rblk)
+{
+ struct pblk_block_ws *blk_ws;
+
+ blk_ws = mempool_alloc(pblk->blk_ws_pool, GFP_ATOMIC);
+ if (!blk_ws) {
+		pr_err("pblk: unable to queue block for recovery gc\n");
+ return;
+ }
+
+ pr_debug("Run recovery. Blk:%d\n", rblk->id);
+
+ blk_ws->pblk = pblk;
+ blk_ws->rblk = rblk;
+
+ /* Move data away from grown bad block */
+ INIT_WORK(&blk_ws->ws_blk, pblk_rec_valid_pgs);
+ queue_work(pblk->kgc_wq, &blk_ws->ws_blk);
+}
+
+int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_ctx *ctx,
+ struct pblk_rec_ctx *recovery, u64 *comp_bits,
+ unsigned int c_entries)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_compl_ctx *c_ctx = ctx->c_ctx;
+ int max_secs = nvm_max_phys_sects(dev);
+ struct nvm_rq *rec_rqd;
+ struct pblk_ctx *rec_ctx;
+ struct pblk_compl_ctx *rec_c_ctx;
+ int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded;
+
+ rec_rqd = pblk_alloc_rqd(pblk, WRITE);
+ if (IS_ERR(rec_rqd)) {
+ pr_err("pblk: could not create recovery req.\n");
+ return -ENOMEM;
+ }
+
+ rec_ctx = pblk_set_ctx(pblk, rec_rqd);
+ rec_c_ctx = rec_ctx->c_ctx;
+
+ /* Copy completion bitmap, but exclude the first X completed entries */
+ bitmap_shift_right((unsigned long int *)&rec_rqd->ppa_status,
+ (unsigned long int *)comp_bits,
+ c_entries, max_secs);
+
+ /* Save the context for the entries that need to be re-written and
+ * update current context with the completed entries.
+ */
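+	/*
+	 * Example (hypothetical numbers): for nr_valid = 8, nr_padded = 4 and
+	 * c_entries = 10, the current context keeps the 10 completed entries
+	 * (8 valid + 2 padded) and the recovery context takes the remaining
+	 * 2 padded entries.
+	 */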
+ rec_c_ctx->sentry = pblk_rb_wrap_pos(&pblk->rwb,
+ c_ctx->sentry + c_entries);
+ if (c_entries >= c_ctx->nr_valid) {
+ rec_c_ctx->nr_valid = 0;
+ rec_c_ctx->nr_padded = nr_entries - c_entries;
+
+ c_ctx->nr_padded = c_entries - c_ctx->nr_valid;
+ } else {
+ rec_c_ctx->nr_valid = c_ctx->nr_valid - c_entries;
+ rec_c_ctx->nr_padded = c_ctx->nr_padded;
+
+ c_ctx->nr_valid = c_entries;
+ c_ctx->nr_padded = 0;
+ }
+
+ rec_ctx->flags = ctx->flags;
+ recovery->rqd = rec_rqd;
+ recovery->pblk = pblk;
+
+ return 0;
+}
+
+struct nvm_rq *pblk_recov_setup(struct pblk *pblk, void *recov_page)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_r_ctx *r_ctx;
+ struct nvm_rq *rqd;
+ struct bio *bio;
+ unsigned int page_size = pblk_recov_page_size(pblk);
+
+ bio = bio_map_kern(dev->q, recov_page, page_size, GFP_KERNEL);
+ if (!bio) {
+ pr_err("pblk: could not allocate recovery bio\n");
+ return NULL;
+ }
+
+ rqd = pblk_alloc_rqd(pblk, READ);
+ if (IS_ERR(rqd)) {
+ pr_err("pblk: not able to create write req.\n");
+ bio_put(bio);
+ return NULL;
+ }
+
+ bio->bi_iter.bi_sector = 0;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio->bi_end_io = pblk_end_sync_bio;
+
+ rqd->opcode = NVM_OP_PREAD;
+ rqd->ins = &pblk->instance;
+ rqd->bio = bio;
+ rqd->meta_list = NULL;
+ rqd->flags = pblk_set_read_mode(pblk);
+
+ r_ctx = nvm_rq_to_pdu(rqd);
+ r_ctx->flags = PBLK_IOTYPE_SYNC;
+
+ return rqd;
+}
+
+int pblk_recov_read(struct pblk *pblk, struct pblk_block *rblk,
+ void *recov_page)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ unsigned int nr_rec_ppas = geo->sec_per_blk - pblk->nr_blk_dsecs;
+ struct ppa_addr ppa_addr[PBLK_RECOVERY_SECTORS];
+ struct nvm_rq *rqd;
+ struct bio *bio;
+ u64 rppa;
+ int i;
+ int ret = 0;
+ DECLARE_COMPLETION_ONSTACK(wait);
+#ifdef CONFIG_NVM_DEBUG
+ struct ppa_addr *ppa_list;
+#endif
+
+ rqd = pblk_recov_setup(pblk, recov_page);
+ if (!rqd)
+ return -1;
+
+ bio = rqd->bio;
+ bio->bi_private = &wait;
+
+ /* Last page in block contains mapped lba list if block is closed */
+ for (i = 0; i < nr_rec_ppas; i++) {
+ rppa = pblk->nr_blk_dsecs + i;
+ ppa_addr[i] = pblk_blk_ppa_to_gaddr(dev, rblk, rppa);
+ }
+
+ if (nvm_set_rqd_ppalist(dev->parent, rqd, ppa_addr, nr_rec_ppas, 0)) {
+ pr_err("pblk: not able to set rqd ppa list\n");
+ ret = -1;
+ goto free_rqd;
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
+ if (pblk_boundary_checks(dev, ppa_list, rqd->nr_ppas))
+ WARN_ON(1);
+#endif
+
+	if (nvm_submit_io(dev, rqd)) {
+		pr_err("pblk: I/O submission failed\n");
+		ret = -1;
+		goto free_ppa_list;
+	}
+ wait_for_completion_io(&wait);
+
+ if (bio->bi_error)
+ pr_debug("pblk: recovery sync read failed (%u)\n",
+ bio->bi_error);
+
+free_ppa_list:
+ nvm_free_rqd_ppalist(dev->parent, rqd);
+free_rqd:
+ pblk_free_rqd(pblk, rqd, READ);
+ bio_put(bio);
+
+ return ret;
+}
+
+static unsigned int calc_rlpg_len(unsigned int nr_entries,
+ unsigned int bitmap_len)
+{
+ return sizeof(struct pblk_blk_rec_lpg) +
+ (nr_entries * sizeof(u64)) +
+ (PBLK_RECOVERY_BITMAPS * (bitmap_len));
+}
+
+int pblk_recov_calc_meta_len(struct pblk *pblk, unsigned int *bitmap_len,
+ unsigned int *rlpg_len,
+ unsigned int *req_len)
+{
+ *bitmap_len = pblk->blk_meta.bitmap_len;
+ *req_len = pblk->blk_meta.rlpg_page_len;
+ *rlpg_len = calc_rlpg_len(pblk->nr_blk_dsecs, *bitmap_len);
+
+ if (*rlpg_len > *req_len) {
+ pr_err("pblk: metadata is too large for last page size (%d/%d)\n",
+ *rlpg_len, *req_len);
+ return 1;
+ }
+
+ return 0;
+}
+
+int pblk_recov_page_size(struct pblk *pblk)
+{
+ return pblk->blk_meta.rlpg_page_len;
+}
+
+u64 *pblk_recov_get_lba_list(struct pblk *pblk, struct pblk_blk_rec_lpg *rlpg)
+{
+ u32 rlpg_len, req_len, bitmap_len;
+ u32 crc = ~(u32)0;
+
+ if (pblk_recov_calc_meta_len(pblk, &bitmap_len, &rlpg_len, &req_len))
+ return NULL;
+
+ crc = cpu_to_le32(crc32_le(crc, (unsigned char *)rlpg + sizeof(crc),
+ rlpg_len - sizeof(crc)));
+
+ if (rlpg->crc != crc || rlpg->status != PBLK_BLK_ST_CLOSED)
+ return NULL;
+
+ return pblk_rlpg_to_llba(rlpg);
+}
+
+/* TODO: Fit lba in u32 when possible to fit metadata in one page */
+int pblk_recov_init(struct pblk *pblk)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ unsigned int nr_blk_dsecs;
+ unsigned int rlpg_len;
+ unsigned int bitmap_len, rlpg_page_len;
+ unsigned int nr_rec_ppas;
+ int i = 1;
+
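+	/*
+	 * Reserve enough plane-pages at the end of the block to hold the
+	 * recovery metadata. Example (hypothetical geometry): with
+	 * sec_per_pl = 16 and sec_size = 4096, the first iteration reserves
+	 * 16 sectors (64KB); if the rlpg still does not fit, another
+	 * plane-page is added.
+	 */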
+retry:
+ nr_rec_ppas = i * geo->sec_per_pl;
+ nr_blk_dsecs = geo->sec_per_blk - nr_rec_ppas;
+ rlpg_page_len = nr_rec_ppas * geo->sec_size;
+ bitmap_len = BITS_TO_LONGS(nr_blk_dsecs) * sizeof(unsigned long);
+ rlpg_len = calc_rlpg_len(nr_blk_dsecs, bitmap_len);
+
+ if (rlpg_len > rlpg_page_len) {
+ i++;
+ goto retry;
+ }
+
+ if (nr_rec_ppas > PBLK_RECOVERY_SECTORS) {
+ pr_err("pblk: Not enough recovery sectors for NAND config.\n");
+ return -EINVAL;
+ }
+
+ pblk->blk_meta.rlpg_page_len = rlpg_page_len;
+ pblk->blk_meta.bitmap_len = bitmap_len;
+ pblk->nr_blk_dsecs = nr_blk_dsecs;
+
+ return 0;
+}
+
+/*
+ * Bring up & tear down scanning - This set of functions implements "last page
+ * recovery", that is, saving the l2p mapping of each block on its last page so
+ * that the l2p table can be reconstructed by scanning the last page of each
+ * block. This mechanism triggers when the l2p snapshot fails.
+ *
+ * Read the last page of the block and update the l2p table if necessary.
+ */
+int pblk_recov_scan_blk(struct pblk *pblk, struct pblk_block *rblk)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_lun *rlun = rblk->rlun;
+ struct pblk_blk_rec_lpg *rlpg;
+ struct ppa_addr ppa;
+ u64 *lba_list;
+ int i;
+ int ret = 0;
+
+ rlpg = pblk_alloc_blk_meta(pblk, rblk, PBLK_BLK_ST_CLOSED);
+ if (!rlpg) {
+		pr_err("pblk: could not allocate block metadata\n");
+ ret = -1;
+ goto out;
+ }
+
+ ret = pblk_recov_read(pblk, rblk, rlpg);
+ if (ret) {
+ pr_err("pblk: could not recover last page. Blk:%d\n",
+ rblk->id);
+ goto free_rlpg;
+ }
+
+ lba_list = pblk_recov_get_lba_list(pblk, rlpg);
+ if (!lba_list)
+ goto free_rlpg;
+
+ rblk->nr_invalid_secs = rblk->rlpg->nr_invalid_secs;
+ rblk->cur_sec = rblk->rlpg->cur_sec;
+
+ rblk->state = rblk->rlpg->blk_state;
+
+ /* For now, padded blocks are always closed on teardown */
+ spin_lock(&rlun->lock);
+ list_add_tail(&rblk->list, &rlun->closed_list);
+ list_add_tail(&rblk->prio, &rlun->prio_list);
+ spin_unlock(&rlun->lock);
+
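+	/* Replay the lba -> ppa mappings stored in the last page to rebuild
+	 * the L2P table entries belonging to this block
+	 */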
+ for (i = 0; i < pblk->nr_blk_dsecs; i++) {
+ ppa = pblk_blk_ppa_to_gaddr(dev, rblk, i);
+ if (lba_list[i] != ADDR_EMPTY)
+ pblk_update_map(pblk, lba_list[i], rblk, ppa);
+
+#ifdef CONFIG_NVM_DEBUG
+ if (pblk_boundary_checks(dev, &ppa, 1))
+ WARN_ON(1);
+#endif
+ /* TODO: when not padding the whole block, mark as invalid */
+ }
+
+ return ret;
+
+free_rlpg:
+ mempool_free(rlpg, pblk->blk_meta_pool);
+out:
+ return ret;
+}
+
+void pblk_recov_clean_g_bb_list(struct pblk *pblk, struct pblk_lun *rlun)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_block *rblk, *trblk;
+ struct ppa_addr gen_ppa;
+ LIST_HEAD(g_bb_list);
+
+ spin_lock(&rlun->lock);
+ list_cut_position(&g_bb_list, &rlun->g_bb_list, rlun->g_bb_list.prev);
+ spin_unlock(&rlun->lock);
+
+ list_for_each_entry_safe(rblk, trblk, &g_bb_list, list) {
+ gen_ppa = pblk_blk_ppa_to_gaddr(dev, rblk, 0);
+ nvm_set_bb_tbl(dev->parent, &gen_ppa, 1, NVM_BLK_T_GRWN_BAD);
+
+ /* As sectors are recovered, the bitmap representing valid
+ * mapped pages is emptied
+ */
+ spin_lock(&rblk->lock);
+ if (bitmap_empty(rblk->sector_bitmap, pblk->nr_blk_dsecs))
+ pblk_put_blk(pblk, rblk);
+ spin_unlock(&rblk->lock);
+ }
+}
+
+struct nvm_rq *pblk_setup_close_rblk(struct pblk *pblk, struct pblk_block *rblk,
+ int io_type)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct bio *bio;
+ struct pblk_ctx *ctx;
+ struct nvm_rq *rqd;
+ struct pblk_compl_close_ctx *c_ctx;
+ int rqd_len;
+ u32 crc = ~(u32)0;
+
+#ifdef CONFIG_NVM_DEBUG
+ if (!block_is_bad(rblk))
+ BUG_ON(rblk->rlpg->nr_lbas + rblk->rlpg->nr_padded !=
+ pblk->nr_blk_dsecs);
+#endif
+
+ rblk->rlpg->status = PBLK_BLK_ST_CLOSED;
+ rblk->rlpg->nr_invalid_secs = rblk->nr_invalid_secs;
+ rblk->rlpg->cur_sec = rblk->cur_sec;
+ rblk->rlpg->blk_state = rblk->state;
+
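+	/* The CRC covers the rlpg contents after the crc field itself; it is
+	 * checked by pblk_recov_get_lba_list() during scan recovery
+	 */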
+ crc = crc32_le(crc, (unsigned char *)rblk->rlpg + sizeof(crc),
+ rblk->rlpg->rlpg_len - sizeof(crc));
+ rblk->rlpg->crc = cpu_to_le32(crc);
+
+ bio = bio_map_kern(dev->q, rblk->rlpg, rblk->rlpg->req_len, GFP_KERNEL);
+ if (!bio) {
+ pr_err("pblk: could not allocate recovery bio\n");
+ return NULL;
+ }
+
+ rqd_len = sizeof(struct nvm_rq) + sizeof(struct pblk_ctx) +
+ sizeof(struct pblk_compl_close_ctx);
+ rqd = kzalloc(rqd_len, GFP_KERNEL);
+ if (!rqd)
+ goto fail_alloc_rqd;
+
+ ctx = pblk_set_ctx(pblk, rqd);
+ ctx->flags = io_type;
+ c_ctx = ctx->c_ctx;
+ c_ctx->rblk = rblk;
+
+ bio_get(bio);
+ bio->bi_iter.bi_sector = 0;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+ rqd->bio = bio;
+ rqd->opcode = NVM_OP_PWRITE;
+ rqd->ins = &pblk->instance;
+ rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+ rqd->meta_list = NULL;
+
+ return rqd;
+
+fail_alloc_rqd:
+ bio_put(bio);
+ return NULL;
+}
+
+void __pblk_close_rblk(struct pblk *pblk, struct pblk_block *rblk,
+ struct nvm_rq *rqd)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct ppa_addr ppa_addr[PBLK_RECOVERY_SECTORS];
+ int nr_entries = pblk->nr_blk_dsecs;
+ unsigned int nr_rec_ppas = geo->sec_per_blk - nr_entries;
+ u64 paddr;
+ int i;
+#ifdef CONFIG_NVM_DEBUG
+ struct ppa_addr *ppa_list;
+#endif
+
+ /* address within a block for the last writable page */
+ for (i = 0; i < nr_rec_ppas; i++) {
+ paddr = nr_entries + i;
+ ppa_addr[i] = pblk_blk_ppa_to_gaddr(dev, rblk, paddr);
+ }
+
+ if (nvm_set_rqd_ppalist(dev->parent, rqd, ppa_addr, nr_rec_ppas, 0)) {
+ pr_err("pblk: not able to set rqd ppa list\n");
+ goto fail_set_rqd;
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
+ if (pblk_boundary_checks(dev, ppa_list, rqd->nr_ppas))
+ WARN_ON(1);
+
+ BUG_ON(rqd->nr_ppas != nr_rec_ppas);
+ atomic_add(rqd->nr_ppas, &pblk->inflight_meta);
+#endif
+
+ if (nvm_submit_io(dev, rqd)) {
+ pr_err("pblk: I/O submission failed\n");
+ goto fail_submit;
+ }
+
+ return;
+
+fail_submit:
+ nvm_free_rqd_ppalist(dev->parent, rqd);
+fail_set_rqd:
+ kfree(rqd);
+}
+
+/*
+ * The current block is out of the fast path; no more data can be written to it.
+ * Save the list of lbas stored in the block on its last page.
+ * This is used for GC and for recovery in case of FTL corruption after a crash.
+ */
+void pblk_close_rblk(struct pblk *pblk, struct pblk_block *rblk)
+{
+ struct nvm_rq *rqd;
+
+ if (down_interruptible(&rblk->rlun->wr_sem))
+ pr_err("pblk: lun semaphore failed\n");
+
+	rqd = pblk_setup_close_rblk(pblk, rblk, PBLK_IOTYPE_CLOSE_BLK);
+	if (!rqd) {
+		pr_err("pblk: not able to create write req.\n");
+		up(&rblk->rlun->wr_sem);
+		return;
+	}
+
+ __pblk_close_rblk(pblk, rblk, rqd);
+}
+
+void pblk_close_blk(struct work_struct *work)
+{
+ struct pblk_block_ws *blk_ws = container_of(work, struct pblk_block_ws,
+ ws_blk);
+ struct pblk *pblk = blk_ws->pblk;
+ struct pblk_block *rblk = blk_ws->rblk;
+
+ if (likely(!block_is_bad(rblk)))
+ pblk_close_rblk(pblk, rblk);
+
+ mempool_free(blk_ws, pblk->blk_ws_pool);
+}
+
+#ifdef CONFIG_NVM_DEBUG
+void pblk_recov_blk_meta_sysfs(struct pblk *pblk, u64 value)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ unsigned int nr_rec_ppas = geo->sec_per_blk - pblk->nr_blk_dsecs;
+ struct ppa_addr bppa;
+ struct ppa_addr ppas[PBLK_RECOVERY_SECTORS];
+ struct ppa_addr *ppa_list;
+ struct pblk_blk_rec_lpg *rlpg;
+ struct nvm_rq *rqd;
+ struct bio *bio;
+ u64 *lba_list;
+ int i;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ bppa.ppa = value;
+ print_ppa(&bppa, "RECOVERY", 0);
+
+ rlpg = mempool_alloc(pblk->blk_meta_pool, GFP_KERNEL);
+ if (!rlpg) {
+		pr_err("pblk: could not allocate block metadata\n");
+ return;
+ }
+ memset(rlpg, 0, pblk->blk_meta.rlpg_page_len);
+
+ rqd = pblk_recov_setup(pblk, rlpg);
+ if (!rqd) {
+ pr_err("pblk: could not recover last page for ppa:%llx\n",
+ bppa.ppa);
+ return;
+ }
+
+ bio = rqd->bio;
+ bio->bi_private = &wait;
+
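+	/* Debug helper: assumes the last page in the block is page 255 and
+	 * that each plane-page holds 4 sectors; adjust for other geometries
+	 */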
+ bppa.g.pg = 255;
+ for (i = 0; i < nr_rec_ppas; i++) {
+ struct ppa_addr ppa = bppa;
+
+ ppa.g.pl = i / 4;
+ ppa.g.sec = i % 4;
+
+ ppas[i] = ppa;
+ }
+
+ if (nvm_set_rqd_ppalist(dev->parent, rqd, ppas, nr_rec_ppas, 0)) {
+ pr_err("pblk: could not set rqd ppa list\n");
+ return;
+ }
+
+ for (i = 0; i < nr_rec_ppas; i++)
+ print_ppa(&rqd->ppa_list[i], "RECOVERY", i);
+
+ ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
+ if (pblk_boundary_checks(dev, ppa_list, rqd->nr_ppas)) {
+ pr_err("pblk: corrupt ppa list\n");
+ return;
+ }
+
+ if (nvm_submit_io(dev, rqd)) {
+ pr_err("pblk: I/O submission failed\n");
+ nvm_free_rqd_ppalist(dev->parent, rqd);
+ return;
+ }
+ wait_for_completion_io(&wait);
+
+ if (bio->bi_error) {
+ pr_err("pblk: recovery sync read failed (%u)\n",
+ bio->bi_error);
+ return;
+ }
+
+ lba_list = pblk_recov_get_lba_list(pblk, rlpg);
+ if (!lba_list) {
+ pr_err("pblk: cannot recover lba list\n");
+ return;
+ }
+
+ for (i = 0; i < pblk->nr_blk_dsecs; i++)
+ pr_debug("lba[%i]: %llu\n", i, lba_list[i]);
+
+ nvm_free_rqd_ppalist(dev->parent, rqd);
+ pblk_free_rqd(pblk, rqd, READ);
+ bio_put(bio);
+
+ mempool_free(rlpg, pblk->blk_meta_pool);
+}
+#endif
+
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c
new file mode 100644
index 0000000..80460b4
--- /dev/null
+++ b/drivers/lightnvm/pblk-rl.c
@@ -0,0 +1,262 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <jg@xxxxxxxxxxx>
+ * Matias Bjorling <m@xxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-rl.c - pblk's rate limiter for user I/O
+ */
+
+#include "pblk.h"
+
+static inline bool pblk_rl_rate(struct pblk_prov *rl, int *c, int inc, int max)
+{
+ spin_lock(&rl->lock);
+ if (*c + inc > max) {
+ spin_unlock(&rl->lock);
+ return false;
+ }
+
+ *c += inc;
+ spin_unlock(&rl->lock);
+
+ return true;
+}
+
+void pblk_rl_user_in(struct pblk *pblk, int nr_entries)
+{
+ struct pblk_prov *rl = &pblk->rl;
+ int max;
+ int *cnt;
+ DEFINE_WAIT(wait);
+
+ spin_lock(&rl->lock);
+ max = rl->rb_user_max;
+ cnt = &rl->rb_user_cnt;
+ spin_unlock(&rl->lock);
+
+ if (pblk_rl_rate(rl, cnt, nr_entries, max))
+ return;
+
+ do {
+ prepare_to_wait_exclusive(&pblk->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ spin_lock(&rl->lock);
+ max = rl->rb_user_max;
+ cnt = &rl->rb_user_cnt;
+ spin_unlock(&rl->lock);
+
+ if (pblk_rl_rate(rl, cnt, nr_entries, max))
+ break;
+
+ io_schedule();
+ } while (1);
+
+ finish_wait(&pblk->wait, &wait);
+}
+
+void pblk_rl_gc_in(struct pblk *pblk, int nr_entries)
+{
+ struct pblk_prov *rl = &pblk->rl;
+ int max;
+ int *cnt;
+ DEFINE_WAIT(wait);
+
+ spin_lock(&rl->lock);
+ max = rl->rb_gc_max;
+ cnt = &rl->rb_gc_cnt;
+ spin_unlock(&rl->lock);
+
+ if (pblk_rl_rate(rl, cnt, nr_entries, max))
+ return;
+
+ do {
+ prepare_to_wait_exclusive(&pblk->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ spin_lock(&rl->lock);
+ max = rl->rb_gc_max;
+ cnt = &rl->rb_gc_cnt;
+ spin_unlock(&rl->lock);
+
+ if (pblk_rl_rate(rl, cnt, nr_entries, max))
+ break;
+
+ io_schedule();
+ } while (1);
+
+ finish_wait(&pblk->wait, &wait);
+}
+
+void pblk_rl_out(struct pblk *pblk, int nr_user, int nr_gc)
+{
+ struct pblk_prov *rl = &pblk->rl;
+
+ spin_lock(&rl->lock);
+ rl->rb_user_cnt -= nr_user;
+ rl->rb_gc_cnt -= nr_gc;
+ WARN_ON(rl->rb_user_cnt < 0 || rl->rb_gc_cnt < 0);
+ spin_unlock(&rl->lock);
+
+ /* Kick user I/O rate limiter queue if waiting */
+ if (waitqueue_active(&pblk->wait))
+ wake_up_all(&pblk->wait);
+}
+
+enum {
+ PBLK_RL_START_GC = 1,
+ PBLK_RL_STOP_GC = 2,
+};
+
+/*
+ * We check for (i) the number of free blocks in the current LUN and (ii) the
+ * total number of free blocks in the pblk instance. This is to even out the
+ * number of free blocks on each LUN when GC kicks in.
+ *
+ * Only the total number of free blocks is used to configure the rate limiter.
+ *
+ * TODO: Simplify calculations
+ */
+static int pblk_rl_update_rates(struct pblk *pblk, struct pblk_lun *rlun)
+{
+ struct pblk_prov *rl = &pblk->rl;
+ unsigned long rwb_size = pblk_rb_nr_entries(&pblk->rwb);
+ unsigned int high = 1 << rl->high_pw;
+ unsigned int low = 1 << rl->low_pw;
+ int ret;
+
+#ifdef CONFIG_NVM_DEBUG
+ lockdep_assert_held(&rl->lock);
+#endif
+
+ if (rl->free_blocks >= high) {
+ rl->rb_user_max = rwb_size - rl->rb_gc_rsv;
+ rl->rb_gc_max = rl->rb_gc_rsv;
+ ret = PBLK_RL_STOP_GC;
+ } else if (rl->free_blocks > low && rl->free_blocks < high) {
+ int shift = rl->high_pw - rl->rb_windows_pw;
+ int user_windows = rl->free_blocks >> shift;
+ int user_max = user_windows << PBLK_MAX_REQ_ADDRS_PW;
+ int gc_max;
+
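+		/*
+		 * Scale the user-writable entries with the number of free
+		 * blocks. Example (assuming high_pw = 12 and
+		 * rb_windows_pw = 8): with 1024 free blocks, user_windows =
+		 * 1024 >> 4 = 64, so user I/O may take up to
+		 * 64 << PBLK_MAX_REQ_ADDRS_PW = 4096 buffer entries.
+		 */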
+ rl->rb_user_max = user_max;
+ gc_max = rwb_size - rl->rb_user_max;
+ rl->rb_gc_max = max(gc_max, rl->rb_gc_rsv);
+ ret = PBLK_RL_START_GC;
+ } else {
+ rl->rb_user_max = 0;
+ rl->rb_gc_max = rwb_size;
+ ret = PBLK_RL_START_GC;
+ }
+
+ if (rlun->nr_free_blocks < rl->low_lun)
+ ret = PBLK_RL_START_GC;
+
+ return ret;
+}
+
+void pblk_rl_set_gc_rsc(struct pblk *pblk, int rsv)
+{
+ spin_lock(&pblk->rl.lock);
+ pblk->rl.rb_gc_rsv = rsv;
+ spin_unlock(&pblk->rl.lock);
+}
+
+void pblk_rl_free_blks_inc(struct pblk *pblk, struct pblk_lun *rlun)
+{
+ int ret;
+
+#ifdef CONFIG_NVM_DEBUG
+ lockdep_assert_held(&rlun->lock);
+#endif
+
+ rlun->nr_free_blocks++;
+
+ spin_lock(&pblk->rl.lock);
+ pblk->rl.free_blocks++;
+ ret = pblk_rl_update_rates(pblk, rlun);
+ spin_unlock(&pblk->rl.lock);
+
+ if (ret == PBLK_RL_START_GC)
+ pblk_gc_should_start(pblk);
+ else
+ pblk_gc_should_stop(pblk);
+}
+
+void pblk_rl_free_blks_dec(struct pblk *pblk, struct pblk_lun *rlun)
+{
+ int ret;
+
+#ifdef CONFIG_NVM_DEBUG
+ lockdep_assert_held(&rlun->lock);
+#endif
+
+ rlun->nr_free_blocks--;
+
+ spin_lock(&pblk->rl.lock);
+ pblk->rl.free_blocks--;
+ ret = pblk_rl_update_rates(pblk, rlun);
+ spin_unlock(&pblk->rl.lock);
+
+ if (ret == PBLK_RL_START_GC)
+ pblk_gc_should_start(pblk);
+ else
+ pblk_gc_should_stop(pblk);
+}
+
+int pblk_rl_gc_thrs(struct pblk *pblk)
+{
+ return pblk->rl.high_lun + 1;
+}
+
+int pblk_rl_sysfs_rate_show(struct pblk *pblk)
+{
+ return pblk->rl.rb_user_max;
+}
+
+int pblk_rl_sysfs_rate_store(struct pblk *pblk, int value)
+{
+ pblk->rl.rb_user_max = value;
+
+ return 0;
+}
+
+/* TODO: Update values correctly on power up recovery */
+void pblk_rl_init(struct pblk *pblk)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_prov *rl = &pblk->rl;
+ unsigned int rb_windows;
+
+ rl->free_blocks = pblk_nr_free_blks(pblk);
+
+ rl->high_pw = get_count_order(rl->total_blocks / PBLK_USER_HIGH_THRS);
+ rl->low_pw = get_count_order(rl->total_blocks / PBLK_USER_LOW_THRS);
+ rl->high_lun = geo->blks_per_lun / PBLK_USER_HIGH_THRS;
+ rl->low_lun = geo->blks_per_lun / PBLK_USER_LOW_THRS;
+ if (rl->low_lun < 3)
+ rl->low_lun = 3;
+
+ /* This will always be a power-of-2 */
+ rb_windows = pblk_rb_nr_entries(&pblk->rwb) / PBLK_MAX_REQ_ADDRS;
+ rl->rb_windows_pw = get_count_order(rb_windows);
+
+ /* To start with, all buffer is available to user I/O writers */
+ rl->rb_user_max = pblk_rb_nr_entries(&pblk->rwb);
+ rl->rb_user_cnt = 0;
+ rl->rb_gc_max = 0;
+ rl->rb_gc_cnt = 0;
+
+ spin_lock_init(&rl->lock);
+}
+
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
new file mode 100644
index 0000000..dac0c11
--- /dev/null
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -0,0 +1,828 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <jg@xxxxxxxxxxx>
+ * Matias Bjorling <m@xxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a physical block-device target for Open-channel SSDs.
+ *
+ * pblk-sysfs.c - pblk's sysfs
+ */
+
+#include "pblk.h"
+
+static ssize_t pblk_sysfs_luns_active_show(struct pblk *pblk, char *page)
+{
+ return sprintf(page, "luns_active=%d\n",
+ pblk_map_get_active_luns(pblk));
+}
+
+static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
+{
+ struct pblk_lun *rlun;
+ ssize_t sz = 0;
+ int i;
+
+ spin_lock(&pblk->w_luns.lock);
+ for (i = 0; i < pblk->w_luns.nr_luns; i++) {
+ int active = 1;
+
+ rlun = pblk->w_luns.luns[i];
+ if (!down_trylock(&rlun->wr_sem)) {
+ active = 0;
+ up(&rlun->wr_sem);
+ }
+ sz += sprintf(page + sz, "POS:%d, CH:%d, LUN:%d - %d\n",
+ i,
+ rlun->bppa.g.ch,
+ rlun->bppa.g.lun,
+ active);
+ }
+ spin_unlock(&pblk->w_luns.lock);
+
+ return sz;
+}
+
+static ssize_t pblk_sysfs_consume_blocks_show(struct pblk *pblk, char *page)
+{
+ return sprintf(page, "consume_blocks=%d\n",
+ pblk_map_get_consume_blocks(pblk));
+}
+
+static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
+{
+ unsigned long free_blocks;
+ int rb_user_max, rb_user_cnt;
+ int rb_gc_max, rb_gc_rsv, rb_gc_cnt;
+
+ spin_lock(&pblk->rl.lock);
+ free_blocks = pblk->rl.free_blocks;
+ rb_user_max = pblk->rl.rb_user_max;
+ rb_user_cnt = pblk->rl.rb_user_cnt;
+ rb_gc_max = pblk->rl.rb_gc_max;
+ rb_gc_rsv = pblk->rl.rb_gc_rsv;
+ rb_gc_cnt = pblk->rl.rb_gc_cnt;
+ spin_unlock(&pblk->rl.lock);
+
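+	/* Output: user max/cnt, gc max/rsv/cnt (rb entries), followed by the
+	 * low and high rate-limiter thresholds (global/per-LUN) and the
+	 * current number of free blocks
+	 */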
+ return sprintf(page,
+ "u:%u/%u,gc:%u/%u/%u(%lu)(stop:<%u/%u,full:>%u/%u,free:%lu)\n",
+ rb_user_max,
+ rb_user_cnt,
+ rb_gc_max,
+ rb_gc_rsv,
+ rb_gc_cnt,
+ pblk_rb_nr_entries(&pblk->rwb),
+ 1 << pblk->rl.low_pw,
+ pblk->rl.low_lun,
+ 1 << pblk->rl.high_pw,
+ pblk->rl.high_lun,
+ free_blocks);
+}
+
+static ssize_t pblk_sysfs_gc_state_show(struct pblk *pblk, char *page)
+{
+ int gc_enabled, gc_active;
+
+ pblk_gc_sysfs_state_show(pblk, &gc_enabled, &gc_active);
+ return sprintf(page, "gc_enabled=%d, gc_active=%d\n",
+ gc_enabled, gc_active);
+}
+
+static ssize_t pblk_sysfs_stats(struct pblk *pblk, char *page)
+{
+ ssize_t offset;
+
+ spin_lock_irq(&pblk->lock);
+ offset = sprintf(page, "read_failed=%lu, read_high_ecc=%lu, read_empty=%lu, read_failed_gc=%lu, write_failed=%lu, erase_failed=%lu\n",
+ pblk->read_failed, pblk->read_high_ecc,
+ pblk->read_empty, pblk->read_failed_gc,
+ pblk->write_failed, pblk->erase_failed);
+ spin_unlock_irq(&pblk->lock);
+
+ return offset;
+}
+
+#ifdef CONFIG_NVM_DEBUG
+static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page)
+{
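+	/* Column order: inflight_writes, inflight_reads, req_writes, nr_flush,
+	 * padded_writes, sub_writes, sync_writes, compl_writes, inflight_meta,
+	 * compl_meta, recov_writes, recov_gc_writes, requeued_writes,
+	 * sync_reads
+	 */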
+ return sprintf(page, "%u\t%u\t%u\t%u\t%u\t%u\t%u\t%u\t%u\t%u\t%u\t%u\t%u\t%u\n",
+ atomic_read(&pblk->inflight_writes),
+ atomic_read(&pblk->inflight_reads),
+ atomic_read(&pblk->req_writes),
+ atomic_read(&pblk->nr_flush),
+ atomic_read(&pblk->padded_writes),
+ atomic_read(&pblk->sub_writes),
+ atomic_read(&pblk->sync_writes),
+ atomic_read(&pblk->compl_writes),
+ atomic_read(&pblk->inflight_meta),
+ atomic_read(&pblk->compl_meta),
+ atomic_read(&pblk->recov_writes),
+ atomic_read(&pblk->recov_gc_writes),
+ atomic_read(&pblk->requeued_writes),
+ atomic_read(&pblk->sync_reads));
+}
+
+static ssize_t pblk_sysfs_blocks(struct pblk *pblk, char *page)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_lun *rlun;
+ struct pblk_block *rblk;
+ unsigned int free, used_int, used_cnt, bad, total_lun;
+ int i;
+ ssize_t line, sz = 0;
+
+ pblk_for_each_lun(pblk, rlun, i) {
+ free = used_int = bad = 0;
+
+ spin_lock(&rlun->lock);
+ list_for_each_entry(rblk, &rlun->free_list, list)
+ free++;
+ list_for_each_entry(rblk, &rlun->bb_list, list)
+ bad++;
+
+ list_for_each_entry(rblk, &rlun->open_list, list)
+ used_int++;
+ list_for_each_entry(rblk, &rlun->closed_list, list)
+ used_int++;
+ list_for_each_entry(rblk, &rlun->g_bb_list, list)
+ used_int++;
+ spin_unlock(&rlun->lock);
+
+ used_cnt = geo->blks_per_lun - free - bad;
+ total_lun = used_int + free + bad;
+
+ if (used_cnt != used_int)
+ pr_err("pblk: used list corruption (i:%u,c:%u)\n",
+ used_int, used_cnt);
+
+ if (geo->blks_per_lun != total_lun)
+ pr_err("pblk: list corruption (t:%u,c:%u)\n",
+ geo->blks_per_lun, total_lun);
+
+ line = sprintf(page + sz,
+ "lun(%i %i):u=%u,f=%u,b=%u,t=%u,v=%u\n",
+ rlun->bppa.g.ch, rlun->bppa.g.lun,
+ used_int, free, bad, total_lun, rlun->nr_free_blocks);
+
+ sz += line;
+ if (sz + line > PAGE_SIZE) {
+ sz += sprintf(page + sz, "Cannot fit all LUNs\n");
+ break;
+ }
+ }
+
+ return sz;
+}
+
+static ssize_t pblk_sysfs_open_blks(struct pblk *pblk, char *page)
+{
+ struct pblk_lun *rlun;
+ struct pblk_block *rblk;
+ int i;
+ ssize_t sz = 0;
+
+ pblk_for_each_lun(pblk, rlun, i) {
+ sz += sprintf(page + sz, "LUN:%d\n", rlun->id);
+
+ spin_lock(&rlun->lock);
+ list_for_each_entry(rblk, &rlun->open_list, list) {
+ spin_lock(&rblk->lock);
+ sz += sprintf(page + sz,
+ "open:\tblk:%d\t%u\t%u\t%u\t%u\t%u\t%u\n",
+ rblk->id,
+ pblk->dev->geo.sec_per_blk,
+ pblk->nr_blk_dsecs,
+ bitmap_weight(rblk->sector_bitmap,
+ pblk->dev->geo.sec_per_blk),
+ bitmap_weight(rblk->sync_bitmap,
+ pblk->dev->geo.sec_per_blk),
+ bitmap_weight(rblk->invalid_bitmap,
+ pblk->dev->geo.sec_per_blk),
+ rblk->nr_invalid_secs);
+ spin_unlock(&rblk->lock);
+ }
+ spin_unlock(&rlun->lock);
+ }
+
+ return sz;
+}
+
+static ssize_t pblk_sysfs_bad_blks(struct pblk *pblk, char *page)
+{
+ struct pblk_lun *rlun;
+ struct pblk_block *rblk;
+ int i;
+ ssize_t line, sz = 0;
+
+ pblk_for_each_lun(pblk, rlun, i) {
+ int bad_blks = 0;
+
+ spin_lock(&rlun->lock);
+ list_for_each_entry(rblk, &rlun->g_bb_list, list)
+ bad_blks++;
+ spin_unlock(&rlun->lock);
+
+ line = sprintf(page + sz, "lun(%i %i):bad=%u\n",
+ rlun->bppa.g.ch,
+ rlun->bppa.g.lun,
+ bad_blks);
+
+ sz += line;
+ if (sz + line > PAGE_SIZE) {
+ sz += sprintf(page + sz, "Cannot fit all LUNs\n");
+ break;
+ }
+ }
+
+ return sz;
+}
+
+static ssize_t pblk_sysfs_gc_blks(struct pblk *pblk, char *page)
+{
+ struct pblk_lun *rlun;
+ struct pblk_block *rblk;
+ int i;
+ ssize_t line, sz = 0;
+
+ pblk_for_each_lun(pblk, rlun, i) {
+ int gc_blks = 0;
+
+ spin_lock(&rlun->lock);
+ list_for_each_entry(rblk, &rlun->prio_list, prio)
+ gc_blks++;
+ spin_unlock(&rlun->lock);
+
+ line = sprintf(page + sz, "lun(%i %i):gc=%u\n",
+ rlun->bppa.g.ch,
+ rlun->bppa.g.lun,
+ gc_blks);
+
+ sz += line;
+ if (sz + line > PAGE_SIZE) {
+ sz += sprintf(page + sz, "Cannot fit all LUNs\n");
+ break;
+ }
+ }
+
+ return sz;
+}
+
+static ssize_t pblk_sysfs_write_buffer(struct pblk *pblk, char *page)
+{
+ return pblk_rb_sysfs(&pblk->rwb, page);
+}
+#endif
+
+static ssize_t pblk_sysfs_luns_active_store(struct pblk *pblk, const char *page,
+ size_t len)
+{
+ size_t c_len;
+ int value;
+ int ret;
+
+ c_len = strcspn(page, "\n");
+ if (c_len >= len)
+ return -EINVAL;
+
+ if (kstrtouint(page, 0, &value))
+ return -EINVAL;
+
+ ret = pblk_map_set_active_luns(pblk, value);
+ if (ret)
+ return ret;
+
+ return len;
+}
+
+static ssize_t pblk_sysfs_consume_blocks_store(struct pblk *pblk,
+ const char *page, size_t len)
+{
+ size_t c_len;
+ int value;
+ int ret;
+
+ c_len = strcspn(page, "\n");
+ if (c_len >= len)
+ return -EINVAL;
+
+ if (kstrtouint(page, 0, &value))
+ return -EINVAL;
+
+ ret = pblk_map_set_consume_blocks(pblk, value);
+ if (ret)
+ return ret;
+
+ return len;
+}
+
+static ssize_t pblk_sysfs_rate_store(struct pblk *pblk, const char *page,
+ size_t len)
+{
+ size_t c_len;
+ int value;
+ int ret;
+
+ c_len = strcspn(page, "\n");
+ if (c_len >= len)
+ return -EINVAL;
+
+ if (kstrtouint(page, 0, &value))
+ return -EINVAL;
+
+ ret = pblk_rl_sysfs_rate_store(pblk, value);
+ if (ret)
+ return ret;
+
+ return len;
+}
+
+static ssize_t pblk_sysfs_gc_state_store(struct pblk *pblk,
+ const char *page, size_t len)
+{
+ size_t c_len;
+ int value;
+ int ret;
+
+ c_len = strcspn(page, "\n");
+ if (c_len >= len)
+ return -EINVAL;
+
+ if (kstrtouint(page, 0, &value))
+ return -EINVAL;
+
+ ret = pblk_gc_sysfs_enable(pblk, value);
+ if (ret)
+ return ret;
+
+ return len;
+}
+
+static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page,
+ size_t len)
+{
+ size_t c_len;
+ int value;
+ int ret;
+
+ c_len = strcspn(page, "\n");
+ if (c_len >= len)
+ return -EINVAL;
+
+ if (kstrtouint(page, 0, &value))
+ return -EINVAL;
+
+ ret = pblk_gc_sysfs_force(pblk, value);
+ if (ret)
+ return ret;
+
+ return len;
+}
+
+#ifdef CONFIG_NVM_DEBUG
+static ssize_t pblk_sysfs_l2p_map_print(struct pblk *pblk, const char *page,
+ ssize_t len)
+{
+ size_t c_len;
+ sector_t lba_init, lba_end;
+ struct ppa_addr ppa;
+ sector_t i;
+
+ c_len = strcspn(page, "\n");
+ if (c_len >= len)
+ return -EINVAL;
+
+ if (sscanf(page, "%lu-%lu", &lba_init, &lba_end) != 2)
+ return -EINVAL;
+
+ for (i = lba_init; i < lba_end; i++) {
+ ppa = pblk_get_lba_map(pblk, i);
+
+ if (ppa_empty(ppa)) {
+ pr_debug("pblk: lba:%lu - ppa: EMPTY ADDRESS\n", i);
+ } else {
+ if (ppa.c.is_cached) {
+ pr_debug("pblk: lba:%lu - ppa: cacheline:%llu\n",
+ i,
+ (u64)ppa.c.line);
+
+ continue;
+ }
+
+ pr_debug("pblk: lba:%lu - ppa: %llx: ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n",
+ i,
+ ppa.ppa,
+ ppa.g.ch,
+ ppa.g.lun,
+ ppa.g.blk,
+ ppa.g.pg,
+ ppa.g.pl,
+ ppa.g.sec);
+ }
+ }
+
+ return len;
+}
+
+static ssize_t pblk_sysfs_l2p_map_sanity(struct pblk *pblk, const char *page,
+ ssize_t len)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ size_t c_len;
+ struct pblk_addr *gp;
+ struct ppa_addr ppa;
+ void *read_sec;
+ struct nvm_rq *rqd;
+ struct pblk_r_ctx *r_ctx;
+ struct bio *bio;
+ sector_t lba_init, lba_end;
+ sector_t i;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ c_len = strcspn(page, "\n");
+ if (c_len >= len)
+ return -EINVAL;
+
+ if (sscanf(page, "%llx-%lu-%lu", &ppa.ppa, &lba_init, &lba_end) != 3)
+ return -EINVAL;
+
+ if (lba_end == 0) {
+ lba_init = 0;
+ lba_end = pblk->rl.nr_secs;
+ }
+
+ if (lba_end > pblk->rl.nr_secs) {
+ pr_err("pblk: Incorrect lba limit\n");
+ goto out;
+ }
+
+ spin_lock(&pblk->trans_lock);
+ for (i = lba_init; i < lba_end; i++) {
+ gp = &pblk->trans_map[i];
+
+ if (ppa.ppa == gp->ppa.ppa)
+ pr_debug("pblk: lba:%lu - ppa: %llx: ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n",
+ i,
+ gp->ppa.ppa,
+ gp->ppa.g.ch,
+ gp->ppa.g.lun,
+ gp->ppa.g.blk,
+ gp->ppa.g.pg,
+ gp->ppa.g.pl,
+ gp->ppa.g.sec);
+ }
+ spin_unlock(&pblk->trans_lock);
+
+ read_sec = kmalloc(geo->sec_size, GFP_KERNEL);
+ if (!read_sec)
+ goto out;
+
+ bio = bio_map_kern(dev->q, read_sec, geo->sec_size, GFP_KERNEL);
+ if (!bio) {
+ pr_err("pblk: could not allocate recovery bio\n");
+ goto out;
+ }
+
+ rqd = pblk_alloc_rqd(pblk, READ);
+ if (IS_ERR(rqd)) {
+ pr_err("pblk: not able to create write req.\n");
+ bio_put(bio);
+ goto out;
+ }
+
+ bio->bi_iter.bi_sector = 0;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio->bi_end_io = pblk_end_sync_bio;
+ bio->bi_private = &wait;
+
+ rqd->opcode = NVM_OP_PREAD;
+ rqd->ins = &pblk->instance;
+ rqd->bio = bio;
+ rqd->meta_list = NULL;
+ rqd->flags = NVM_IO_SNGL_ACCESS | NVM_IO_SUSPEND;
+
+ r_ctx = nvm_rq_to_pdu(rqd);
+ r_ctx->flags = PBLK_IOTYPE_SYNC;
+
+ if (nvm_set_rqd_ppalist(dev->parent, rqd, &ppa, 1, 0)) {
+ pr_err("pblk: could not set rqd ppa list\n");
+ goto out;
+ }
+
+ if (nvm_submit_io(dev, rqd)) {
+ pr_err("pblk: I/O submission failed\n");
+ nvm_free_rqd_ppalist(dev->parent, rqd);
+ goto out;
+ }
+
+ wait_for_completion_io(&wait);
+ if (bio->bi_error) {
+ struct ppa_addr p;
+
+ p = dev_to_generic_addr(pblk->dev, rqd->ppa_addr);
+ pr_err("pblk: read failed (%u)\n", bio->bi_error);
+ print_ppa(&p, "rqd", bio->bi_error);
+ goto out;
+ }
+
+out:
+ return len;
+}
+
+static ssize_t pblk_sysfs_block_meta(struct pblk *pblk, const char *page,
+ ssize_t len)
+{
+ size_t c_len;
+ u64 value;
+
+ c_len = strcspn(page, "\n");
+ if (c_len >= len)
+ return -EINVAL;
+
+ if (kstrtoull(page, 0, &value))
+ return -EINVAL;
+
+ pblk_recov_blk_meta_sysfs(pblk, value);
+ return len;
+}
+
+static ssize_t pblk_sysfs_cleanup(struct pblk *pblk, const char *page,
+ ssize_t len)
+{
+ struct pblk_lun *rlun;
+ struct pblk_block *rblk, *trblk;
+ size_t c_len;
+ int value;
+ sector_t i;
+ LIST_HEAD(cleanup_list);
+
+ c_len = strcspn(page, "\n");
+ if (c_len >= len)
+ return -EINVAL;
+
+ if (kstrtoint(page, 0, &value))
+ return -EINVAL;
+ if (value != 1)
+ return -EINVAL;
+
+ /* Cleanup L2P table */
+ spin_lock(&pblk->trans_lock);
+ for (i = 0; i < pblk->rl.nr_secs; i++) {
+ struct pblk_addr *p = &pblk->trans_map[i];
+
+ p->rblk = NULL;
+ ppa_set_empty(&p->ppa);
+ }
+ spin_unlock(&pblk->trans_lock);
+
+ pblk_for_each_lun(pblk, rlun, i) {
+ spin_lock(&rlun->lock);
+ list_for_each_entry_safe(rblk, trblk, &rlun->open_list, list)
+ list_move_tail(&rblk->list, &cleanup_list);
+ list_for_each_entry_safe(rblk, trblk, &rlun->closed_list, list)
+ list_move_tail(&rblk->list, &cleanup_list);
+ spin_unlock(&rlun->lock);
+
+ /* Blocks in closed_list are a superset of prio_list */
+ spin_lock(&rlun->lock);
+ list_for_each_entry_safe(rblk, trblk, &rlun->prio_list, prio)
+ list_del_init(&rblk->prio);
+ spin_unlock(&rlun->lock);
+
+ rlun->cur = NULL;
+ }
+
+ list_for_each_entry_safe(rblk, trblk, &cleanup_list, list) {
+ pblk_erase_blk(pblk, rblk);
+
+ spin_lock(&rblk->lock);
+ pblk_put_blk(pblk, rblk);
+ spin_unlock(&rblk->lock);
+ }
+
+ /* Reset write luns */
+ pblk_luns_configure(pblk);
+
+ return len;
+}
+#endif
+
+static struct attribute sys_luns_active = {
+ .name = "luns_active",
+ .mode = S_IRUGO | S_IWUSR,
+};
+
+static struct attribute sys_consume_blocks = {
+ .name = "consume_blocks",
+ .mode = S_IRUGO | S_IWUSR,
+};
+
+static struct attribute sys_write_luns = {
+ .name = "write_luns",
+ .mode = S_IRUGO,
+};
+
+static struct attribute sys_rate_limiter_attr = {
+ .name = "rate_limiter",
+ .mode = S_IRUGO,
+};
+
+static struct attribute sys_gc_state = {
+ .name = "gc_state",
+ .mode = S_IRUGO | S_IWUSR,
+};
+
+static struct attribute sys_gc_force = {
+ .name = "gc_force",
+ .mode = S_IWUSR,
+};
+
+static struct attribute sys_errors_attr = {
+ .name = "errors",
+ .mode = S_IRUGO,
+};
+
+#ifdef CONFIG_NVM_DEBUG
+static struct attribute sys_stats_debug_attr = {
+ .name = "stats",
+ .mode = S_IRUGO,
+};
+
+static struct attribute sys_blocks_attr = {
+ .name = "blocks",
+ .mode = S_IRUGO,
+};
+
+static struct attribute sys_open_blocks_attr = {
+ .name = "open_blks",
+ .mode = S_IRUGO,
+};
+
+static struct attribute sys_bad_blocks_attr = {
+ .name = "bad_blks",
+ .mode = S_IRUGO,
+};
+
+static struct attribute sys_gc_blocks_attr = {
+ .name = "gc_blks",
+ .mode = S_IRUGO,
+};
+
+static struct attribute sys_rb_attr = {
+ .name = "write_buffer",
+ .mode = S_IRUGO,
+};
+
+static struct attribute sys_blk_meta_attr = {
+ .name = "block_metadata",
+ .mode = S_IRUGO | S_IWUSR,
+};
+
+static struct attribute sys_l2p_map_attr = {
+ .name = "l2p_map",
+ .mode = S_IRUGO | S_IWUSR,
+};
+
+static struct attribute sys_l2p_sanity_attr = {
+ .name = "l2p_sanity",
+ .mode = S_IRUGO | S_IWUSR,
+};
+
+static struct attribute sys_cleanup = {
+ .name = "cleanup",
+ .mode = S_IWUSR,
+};
+#endif
+
+static struct attribute *pblk_attrs[] = {
+ &sys_luns_active,
+ &sys_consume_blocks,
+ &sys_write_luns,
+ &sys_rate_limiter_attr,
+ &sys_errors_attr,
+ &sys_gc_state,
+ &sys_gc_force,
+#ifdef CONFIG_NVM_DEBUG
+ &sys_stats_debug_attr,
+ &sys_blocks_attr,
+ &sys_open_blocks_attr,
+ &sys_bad_blocks_attr,
+ &sys_gc_blocks_attr,
+ &sys_rb_attr,
+ &sys_blk_meta_attr,
+ &sys_l2p_map_attr,
+ &sys_l2p_sanity_attr,
+ &sys_cleanup,
+#endif
+ NULL,
+};
+
+static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct pblk *pblk = container_of(kobj, struct pblk, kobj);
+
+ if (strcmp(attr->name, "luns_active") == 0)
+ return pblk_sysfs_luns_active_show(pblk, buf);
+ else if (strcmp(attr->name, "write_luns") == 0)
+ return pblk_sysfs_luns_show(pblk, buf);
+ else if (strcmp(attr->name, "consume_blocks") == 0)
+ return pblk_sysfs_consume_blocks_show(pblk, buf);
+ else if (strcmp(attr->name, "rate_limiter") == 0)
+ return pblk_sysfs_rate_limiter(pblk, buf);
+ else if (strcmp(attr->name, "gc_state") == 0)
+ return pblk_sysfs_gc_state_show(pblk, buf);
+ else if (strcmp(attr->name, "errors") == 0)
+ return pblk_sysfs_stats(pblk, buf);
+#ifdef CONFIG_NVM_DEBUG
+ else if (strcmp(attr->name, "stats") == 0)
+ return pblk_sysfs_stats_debug(pblk, buf);
+ else if (strcmp(attr->name, "blocks") == 0)
+ return pblk_sysfs_blocks(pblk, buf);
+ else if (strcmp(attr->name, "open_blks") == 0)
+ return pblk_sysfs_open_blks(pblk, buf);
+ else if (strcmp(attr->name, "bad_blks") == 0)
+ return pblk_sysfs_bad_blks(pblk, buf);
+ else if (strcmp(attr->name, "gc_blks") == 0)
+ return pblk_sysfs_gc_blks(pblk, buf);
+ else if (strcmp(attr->name, "write_buffer") == 0)
+ return pblk_sysfs_write_buffer(pblk, buf);
+#endif
+ return 0;
+}
+
+static ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct pblk *pblk = container_of(kobj, struct pblk, kobj);
+
+ if (strcmp(attr->name, "luns_active") == 0)
+ return pblk_sysfs_luns_active_store(pblk, buf, len);
+ else if (strcmp(attr->name, "consume_blocks") == 0)
+ return pblk_sysfs_consume_blocks_store(pblk, buf, len);
+ else if (strcmp(attr->name, "rate_limiter") == 0)
+ return pblk_sysfs_rate_store(pblk, buf, len);
+ else if (strcmp(attr->name, "gc_state") == 0)
+ return pblk_sysfs_gc_state_store(pblk, buf, len);
+ else if (strcmp(attr->name, "gc_force") == 0)
+ return pblk_sysfs_gc_force(pblk, buf, len);
+#ifdef CONFIG_NVM_DEBUG
+ else if (strcmp(attr->name, "l2p_map") == 0)
+ return pblk_sysfs_l2p_map_print(pblk, buf, len);
+ else if (strcmp(attr->name, "l2p_sanity") == 0)
+ return pblk_sysfs_l2p_map_sanity(pblk, buf, len);
+ else if (strcmp(attr->name, "block_metadata") == 0)
+ return pblk_sysfs_block_meta(pblk, buf, len);
+ else if (strcmp(attr->name, "cleanup") == 0)
+ return pblk_sysfs_cleanup(pblk, buf, len);
+#endif
+
+ return 0;
+}
+
+static const struct sysfs_ops pblk_sysfs_ops = {
+ .show = pblk_sysfs_show,
+ .store = pblk_sysfs_store,
+};
+
+static struct kobj_type pblk_ktype = {
+ .sysfs_ops = &pblk_sysfs_ops,
+ .default_attrs = pblk_attrs,
+};
+
+int pblk_sysfs_init(struct gendisk *tdisk)
+{
+ struct pblk *pblk = tdisk->private_data;
+ struct device *parent_dev = disk_to_dev(pblk->disk);
+ int ret;
+
+ ret = kobject_init_and_add(&pblk->kobj, &pblk_ktype,
+ kobject_get(&parent_dev->kobj),
+ "%s", "lightnvm");
+ if (ret) {
+ pr_err("pblk: could not register %s/lightnvm - name in use\n",
+ tdisk->disk_name);
+ return ret;
+ }
+
+ kobject_uevent(&pblk->kobj, KOBJ_ADD);
+ return 0;
+}
+
+void pblk_sysfs_exit(struct pblk *pblk)
+{
+ kobject_uevent(&pblk->kobj, KOBJ_REMOVE);
+ kobject_del(&pblk->kobj);
+ kobject_put(&pblk->kobj);
+}
+
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
new file mode 100644
index 0000000..957b92c
--- /dev/null
+++ b/drivers/lightnvm/pblk-write.c
@@ -0,0 +1,530 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <jg@xxxxxxxxxxx>
+ * Matias Bjorling <m@xxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-write.c - pblk's write path from write buffer to media
+ */
+
+#include "pblk.h"
+
+int pblk_replace_blk(struct pblk *pblk, struct pblk_block *rblk,
+ struct pblk_lun *rlun, int lun_pos)
+{
+ rblk = pblk_get_blk(pblk, rlun);
+ if (!rblk) {
+ pr_debug("pblk: could not get new block\n");
+ return 0;
+ }
+
+ pblk_set_lun_cur(rlun, rblk);
+ return pblk_map_replace_lun(pblk, lun_pos);
+}
+
+int pblk_write_setup_s(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_ctx *ctx, struct pblk_sec_meta *meta,
+ unsigned long *lun_bitmap)
+{
+	struct pblk_compl_ctx *c_ctx = ctx->c_ctx;
+
+	/* Single sector path - this path is highly improbable since
+	 * controllers typically deal with multi-sector and multi-plane
+	 * pages. It is, however, useful for testing on QEMU.
+	 */
+#ifdef CONFIG_NVM_DEBUG
+	BUG_ON(pblk->dev->geo.sec_per_pl != 1);
+#endif
+
+	return pblk_map_rr_page(pblk, c_ctx->sentry, &rqd->ppa_addr,
+				&meta[0], 1, 1, lun_bitmap);
+}
+
+int pblk_write_setup_m(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_ctx *ctx, struct pblk_sec_meta *meta,
+ unsigned int valid_secs, int off,
+ unsigned long *lun_bitmap)
+{
+ struct pblk_compl_ctx *c_ctx = ctx->c_ctx;
+ int min = pblk->min_write_pgs;
+
+ return pblk_map_rr_page(pblk, c_ctx->sentry + off,
+ &rqd->ppa_list[off],
+ &meta[off], min, valid_secs,
+ lun_bitmap);
+}
+
+int pblk_write_alloc_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_ctx *ctx, unsigned int nr_secs)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+
+ /* Setup write request */
+ rqd->opcode = NVM_OP_PWRITE;
+ rqd->ins = &pblk->instance;
+ rqd->nr_ppas = nr_secs;
+ rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+
+ rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+ &rqd->dma_meta_list);
+ if (!rqd->meta_list)
+ return -ENOMEM;
+
+ if (unlikely(nr_secs == 1))
+ return 0;
+
+ /* TODO: Reuse same dma region for ppa_list and metadata */
+ rqd->ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+ &rqd->dma_ppa_list);
+ if (!rqd->ppa_list) {
+ nvm_dev_dma_free(dev->parent, rqd->meta_list,
+ rqd->dma_meta_list);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_ctx *ctx)
+{
+ struct pblk_compl_ctx *c_ctx = ctx->c_ctx;
+ unsigned int valid_secs = c_ctx->nr_valid;
+ unsigned int padded_secs = c_ctx->nr_padded;
+ unsigned int nr_secs = valid_secs + padded_secs;
+ unsigned long lun_bitmap[PBLK_MAX_LUNS_BITMAP];
+ struct pblk_sec_meta *meta;
+ unsigned int setup_secs;
+ int min = pblk->min_write_pgs;
+ int i;
+ int ret = 0;
+#ifdef CONFIG_NVM_DEBUG
+ struct ppa_addr *ppa_list;
+#endif
+
+ bitmap_zero(lun_bitmap, pblk->nr_luns);
+
+ ret = pblk_write_alloc_rq(pblk, rqd, ctx, nr_secs);
+ if (ret)
+ goto out;
+
+ meta = rqd->meta_list;
+
+ if (unlikely(nr_secs == 1)) {
+ /* Logic error */
+ BUG_ON(padded_secs != 0);
+ ret = pblk_write_setup_s(pblk, rqd, ctx, meta, lun_bitmap);
+ goto out;
+ }
+
+ for (i = 0; i < nr_secs; i += min) {
+ setup_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
+ ret = pblk_write_setup_m(pblk, rqd, ctx, meta, setup_secs, i,
+ lun_bitmap);
+ if (ret)
+ goto out;
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
+ if (pblk_boundary_checks(pblk->dev, ppa_list, rqd->nr_ppas))
+ WARN_ON(1);
+#endif
+
+out:
+ return ret;
+}
+
+static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned long secs_avail,
+ unsigned long secs_to_flush)
+{
+ int max = pblk->max_write_pgs;
+ int min = pblk->min_write_pgs;
+ int secs_to_sync = 0;
+
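+	/*
+	 * Example (assuming min = 4, max = 64): with 23 sectors available and
+	 * a flush pending for 10 of them, secs_to_sync starts at 8 and grows
+	 * in steps of min up to 20, the largest multiple of min that fits in
+	 * the available sectors.
+	 */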
+ if ((secs_avail >= max) || (secs_to_flush >= max)) {
+ secs_to_sync = max;
+ } else if (secs_avail >= min) {
+ if (secs_to_flush) {
+ secs_to_sync = min * (secs_to_flush / min);
+ while (1) {
+ int inc = secs_to_sync + min;
+
+ if (inc <= secs_avail && inc <= max)
+ secs_to_sync += min;
+ else
+ break;
+ }
+ } else
+ secs_to_sync = min * (secs_avail / min);
+ } else {
+ if (secs_to_flush)
+ secs_to_sync = min;
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ BUG_ON(!secs_to_sync && secs_to_flush);
+#endif
+
+ return secs_to_sync;
+}
+
+int pblk_submit_write(struct pblk *pblk)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct bio *bio;
+ struct nvm_rq *rqd;
+ struct pblk_ctx *ctx;
+ struct pblk_compl_ctx *c_ctx;
+ unsigned int pgs_read;
+	unsigned int secs_avail, secs_to_com;
+	int secs_to_sync;
+ unsigned int secs_to_flush = 0;
+ unsigned long sync_point;
+ unsigned long count;
+ unsigned long pos;
+ int err;
+
+ /* Pre-check if we should start writing before doing allocations */
+ secs_to_flush = pblk_rb_sync_point_count(&pblk->rwb);
+ count = pblk_rb_count(&pblk->rwb);
+ if (!secs_to_flush && count < pblk->max_write_pgs)
+ return 1;
+
+ rqd = pblk_alloc_rqd(pblk, WRITE);
+ if (IS_ERR(rqd)) {
+ pr_err("pblk: not able to create write req.\n");
+ return 1;
+ }
+ ctx = pblk_set_ctx(pblk, rqd);
+ c_ctx = ctx->c_ctx;
+
+ bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs);
+ if (!bio) {
+ pr_err("pblk: not able to create write bio\n");
+ goto fail_free_rqd;
+ }
+
+ /* Count available entries on rb, and lock reader */
+ secs_avail = pblk_rb_read_lock(&pblk->rwb);
+ if (!secs_avail)
+ goto fail_put_bio;
+
+ secs_to_flush = pblk_rb_sync_point_count(&pblk->rwb);
+ secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
+ if (secs_to_sync < 0) {
+ pr_err("pblk: bad buffer sync calculation\n");
+ pblk_rb_read_unlock(&pblk->rwb);
+ goto fail_put_bio;
+ }
+
+	secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
+	if (!secs_to_com)
+		goto fail_put_bio;
+
+	pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
+
+ pgs_read = pblk_rb_read_to_bio(&pblk->rwb, bio, ctx, pos, secs_to_sync,
+ secs_avail, &sync_point);
+ if (!pgs_read)
+ goto fail_put_bio;
+
+ if (secs_to_flush <= secs_to_sync)
+ pblk_rb_sync_point_reset(&pblk->rwb, sync_point);
+
+ if (c_ctx->nr_padded)
+ if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, c_ctx->nr_padded))
+ goto fail_put_bio;
+
+ bio->bi_iter.bi_sector = 0; /* artificial bio */
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ rqd->bio = bio;
+
+ /* Assign lbas to ppas and populate request structure */
+ err = pblk_setup_w_rq(pblk, rqd, ctx);
+ if (err) {
+ pr_err("pblk: could not setup write request\n");
+ goto fail_free_bio;
+ }
+
+ err = nvm_submit_io(dev, rqd);
+ if (err) {
+ pr_err("pblk: I/O submission failed: %d\n", err);
+ goto fail_free_bio;
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_add(secs_to_sync, &pblk->sub_writes);
+#endif
+ return 0;
+fail_free_bio:
+ if (c_ctx->nr_padded)
+ pblk_bio_free_pages(pblk, bio, secs_to_sync, c_ctx->nr_padded);
+fail_put_bio:
+ bio_put(bio);
+fail_free_rqd:
+ pblk_free_rqd(pblk, rqd, WRITE);
+
+ return 1;
+}
+
+int pblk_write_ts(void *data)
+{
+ struct pblk *pblk = data;
+
+ while (!kthread_should_stop()) {
+ if (!pblk_submit_write(pblk))
+ continue;
+ set_current_state(TASK_INTERRUPTIBLE);
+ io_schedule();
+ }
+
+ return 0;
+}
+
+static void pblk_sync_buffer(struct pblk *pblk, struct pblk_block *rblk,
+ u64 block_ppa, int flags)
+{
+ WARN_ON(test_and_set_bit(block_ppa, rblk->sync_bitmap));
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_inc(&pblk->sync_writes);
+#endif
+
+ /* If last page completed, then this is not a grown bad block */
+ if (bitmap_full(rblk->sync_bitmap, pblk->nr_blk_dsecs))
+ pblk_run_blk_ws(pblk, rblk, pblk_close_blk);
+}
+
+static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_ctx *ctx)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_compl_ctx *c_ctx = ctx->c_ctx;
+ struct bio *original_bio;
+ int nr_entries = c_ctx->nr_valid;
+ unsigned long ret;
+ int i;
+ int cur_lun = -1;
+
+ for (i = 0; i < nr_entries; i++) {
+ struct pblk_block *rblk;
+ struct pblk_w_ctx *w_ctx;
+
+ w_ctx = pblk_rb_w_ctx(&pblk->rwb, c_ctx->sentry + i);
+ rblk = w_ctx->ppa.rblk;
+
+		pblk_sync_buffer(pblk, rblk, w_ctx->paddr, w_ctx->flags);
+ original_bio = w_ctx->bio;
+ if (original_bio) {
+ bio_endio(original_bio);
+ w_ctx->bio = NULL;
+ }
+
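+		/* Release the LUN write semaphore when crossing into a
+		 * different LUN
+		 */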
+ if (rblk->rlun->id != cur_lun) {
+ up(&rblk->rlun->wr_sem);
+ cur_lun = rblk->rlun->id;
+ }
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_add(nr_entries, &pblk->compl_writes);
+#endif
+
+ ret = pblk_rb_sync_advance(&pblk->rwb, nr_entries);
+
+ if (nr_entries > 1)
+ nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
+
+ if (rqd->meta_list)
+ nvm_dev_dma_free(dev->parent, rqd->meta_list,
+ rqd->dma_meta_list);
+
+ bio_put(rqd->bio);
+ pblk_free_rqd(pblk, rqd, WRITE);
+
+ return ret;
+}
+
+static unsigned long pblk_end_queued_w_bio(struct pblk *pblk,
+ struct nvm_rq *rqd,
+ struct pblk_ctx *ctx)
+{
+ list_del(&ctx->list);
+ return pblk_end_w_bio(pblk, rqd, ctx);
+}
+
+static void pblk_compl_queue(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_ctx *ctx)
+{
+ struct pblk_compl_ctx *c_ctx = ctx->c_ctx;
+ struct pblk_ctx *c, *r;
+ unsigned long flags;
+ unsigned long pos;
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_sub(c_ctx->nr_valid, &pblk->inflight_writes);
+#endif
+
+ /* Kick user I/O rate limiter queue if waiting */
+ if (waitqueue_active(&pblk->wait))
+ wake_up_all(&pblk->wait);
+
+ pos = pblk_rb_sync_init(&pblk->rwb, &flags);
+
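+	/* Only the request whose first entry matches the current sync position
+	 * can be completed; requests completing out of order are queued on
+	 * compl_list until their turn comes.
+	 */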
+ if (c_ctx->sentry == pos) {
+ pos = pblk_end_w_bio(pblk, rqd, ctx);
+
+retry:
+ list_for_each_entry_safe(c, r, &pblk->compl_list, list) {
+ rqd = nvm_rq_from_pdu(c);
+ c_ctx = c->c_ctx;
+ if (c_ctx->sentry == pos) {
+ pos = pblk_end_queued_w_bio(pblk, rqd, c);
+ goto retry;
+ }
+ }
+ } else {
+ list_add_tail(&ctx->list, &pblk->compl_list);
+ }
+
+ pblk_rb_sync_end(&pblk->rwb, &flags);
+}
+
+/*
+ * When a write fails we assume for now that the flash block has grown bad.
+ * Thus, we start a recovery mechanism to (in general terms):
+ * - Take block out of the active open block list
+ * - Complete the successful writes on the request
+ * - Remap failed writes to a new request
+ * - Move written data on grown bad block(s) to new block(s)
+ * - Mark grown bad block(s) as bad and return to media manager
+ *
+ * This function assumes that ppas in rqd are in generic mode. That is,
+ * nvm_addr_to_generic_mode(dev, rqd) has been called.
+ *
+ * TODO: Depending on the type of memory, try write retry
+ */
+static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ void *comp_bits = &rqd->ppa_status;
+ struct pblk_ctx *ctx = pblk_set_ctx(pblk, rqd);
+ struct pblk_compl_ctx *c_ctx = ctx->c_ctx;
+ struct pblk_rb_entry *entry;
+ struct pblk_w_ctx *w_ctx;
+ struct pblk_rec_ctx *recovery;
+ struct ppa_addr ppa, prev_ppa;
+ unsigned int c_entries;
+ int nr_ppas = rqd->nr_ppas;
+ int bit;
+ int ret;
+
+	/* The last page of a block contains recovery metadata. If a block
+	 * becomes bad when writing this page, there is no need to recover what
+	 * is being written; this metadata is generated on a per-block basis.
+	 * The block is on its way to being closed. Mark it as bad and trigger
+	 * recovery.
+	 */
+ if (ctx->flags & PBLK_IOTYPE_CLOSE_BLK) {
+		struct pblk_compl_close_ctx *cl_ctx = ctx->c_ctx;
+
+		pblk_run_recovery(pblk, cl_ctx->rblk);
+ pblk_end_close_blk_bio(pblk, rqd, 0);
+ return;
+ }
+
+ /* look up blocks and mark them as bad
+ * TODO: RECOVERY HERE TOO
+ */
+ if (nr_ppas == 1)
+ return;
+
+ recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
+ if (!recovery) {
+ pr_err("pblk: could not allocate recovery context\n");
+ return;
+ }
+ INIT_LIST_HEAD(&recovery->failed);
+
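+	/* ppas before the first failed one completed successfully; only the
+	 * entries from c_entries onwards need to be re-mapped and re-submitted
+	 */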
+ c_entries = find_first_bit(comp_bits, nr_ppas);
+
+ /* Replace all grown bad blocks on RR mapping scheme, mark them as bad
+ * and return them to the media manager.
+ */
+ ppa_set_empty(&prev_ppa);
+ bit = -1;
+ while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
+ if (bit > c_ctx->nr_valid)
+ goto out;
+
+ ppa = rqd->ppa_list[bit];
+
+ entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa);
+ if (!entry) {
+ pr_err("pblk: could not scan entry on write failure\n");
+ continue;
+ }
+ w_ctx = &entry->w_ctx;
+
+ /* The list is filled first and emptied afterwards. No need for
+ * protecting it with a lock
+ */
+ list_add_tail(&entry->index, &recovery->failed);
+
+ if (ppa_cmp_blk(ppa, prev_ppa))
+ continue;
+
+ pblk_mark_bb(pblk, ppa);
+
+ prev_ppa.ppa = ppa.ppa;
+ pblk_run_recovery(pblk, w_ctx->ppa.rblk);
+ }
+
+out:
+ ret = pblk_recov_setup_rq(pblk, ctx, recovery, comp_bits, c_entries);
+ if (ret)
+ pr_err("pblk: could not recover from write failure\n");
+
+ INIT_WORK(&recovery->ws_rec, pblk_submit_rec);
+ queue_work(pblk->kw_wq, &recovery->ws_rec);
+
+ pblk_compl_queue(pblk, rqd, ctx);
+}
+
+void pblk_end_io_write(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_ctx *ctx;
+
+ if (rqd->error) {
+ inc_stat(pblk, &pblk->write_failed, 1);
+#ifdef CONFIG_NVM_DEBUG
+ pblk_print_failed_rqd(pblk, rqd, rqd->error);
+#endif
+ nvm_addr_to_generic_mode(dev, rqd);
+ return pblk_end_w_fail(pblk, rqd);
+ }
+
+ ctx = pblk_set_ctx(pblk, rqd);
+
+ if (ctx->flags & PBLK_IOTYPE_SYNC)
+ return;
+
+ if (ctx->flags & PBLK_IOTYPE_CLOSE_BLK)
+ return pblk_end_close_blk_bio(pblk, rqd, 1);
+
+ pblk_compl_queue(pblk, rqd, ctx);
+}
+
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
new file mode 100644
index 0000000..7ceefe9
--- /dev/null
+++ b/drivers/lightnvm/pblk.h
@@ -0,0 +1,942 @@
+/*
+ * Copyright (C) 2015 IT University of Copenhagen (rrpc.h)
+ * Initial release: Matias Bjorling <m@xxxxxxxxxxx>
+ * Write buffering: Javier Gonzalez <jg@xxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a Physical Block-device target for Open-channel SSDs.
+ *
+ * Derived from rrpc.h
+ */
+
+#ifndef PBLK_H_
+#define PBLK_H_
+
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/bio.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+#include <linux/crc32.h>
+
+#include <linux/lightnvm.h>
+
+/* Run only GC if less than 1/X blocks are free */
+#define GC_LIMIT_INVERSE 5
+#define GC_TIME_MSECS 5000
+
+#define PBLK_SECTOR (512)
+#define PBLK_EXPOSED_PAGE_SIZE (4096)
+#define PBLK_MAX_REQ_ADDRS (64)
+#define PBLK_MAX_REQ_ADDRS_PW (6)
+
+/* Max 512 LUNs per device */
+#define PBLK_MAX_LUNS_BITMAP (4)
+
+#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR)
+
+#define pblk_for_each_lun(pblk, rlun, i) \
+ for ((i) = 0, rlun = &(pblk)->luns[0]; \
+ (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)])
+
+#define ERASE 2 /* READ = 0, WRITE = 1 */
+
+enum {
+ /* IO Types */
+ PBLK_IOTYPE_USER = 1,
+ PBLK_IOTYPE_GC = 2,
+ PBLK_IOTYPE_SYNC = 4,
+ PBLK_IOTYPE_CLOSE_BLK = 8,
+ PBLK_IOTYPE_REF = 16,
+
+ /* Write buffer flags */
+ PBLK_WRITTEN_DATA = 128,
+ PBLK_WRITABLE_ENTRY = 256,
+};
+
+enum {
+ PBLK_BLK_ST_OPEN = 0x1,
+ PBLK_BLK_ST_CLOSED = 0x2,
+};
+
+struct pblk_sec_meta {
+ u64 lba;
+ u64 reserved;
+};
+
+/* Buffer allocated after counter */
+struct pblk_kref_buf {
+ struct kref ref;
+ void *data;
+};
+
+/* Logical to physical mapping */
+struct pblk_addr {
+ struct ppa_addr ppa; /* cacheline OR physical address */
+ struct pblk_block *rblk; /* reference to pblk block for lookup */
+};
+
+/* Completion context */
+struct pblk_compl_ctx {
+ unsigned int sentry;
+ unsigned int nr_valid;
+ unsigned int nr_padded;
+};
+
+struct pblk_compl_close_ctx {
+ struct pblk_block *rblk; /* reference to pblk block for lookup */
+};
+
+struct pblk_ctx {
+ struct list_head list; /* Head for out-of-order completion */
+ void *c_ctx; /* Completion context */
+ int flags; /* Context flags */
+};
+
+/* Read context */
+struct pblk_r_ctx {
+ int flags; /* Read context flags */
+ struct bio *orig_bio;
+};
+
+/* Recovery context */
+struct pblk_rec_ctx {
+ struct pblk *pblk;
+ struct nvm_rq *rqd;
+ struct list_head failed;
+ struct work_struct ws_rec;
+};
+
+/* Write context */
+struct pblk_w_ctx {
+ struct bio *bio; /* Original bio - used for completing in
+ * REQ_FUA, REQ_FLUSH case
+ */
+ void *priv; /* Private pointer */
+	sector_t lba;		 /* Logical addr. associated with entry */
+ u64 paddr; /* pblk block physical address */
+	struct pblk_addr ppa;	/* Physical addr. associated with entry */
+ int flags; /* Write context flags */
+};
+
+struct pblk_rb_entry {
+ void *data; /* Pointer to data on this entry */
+ struct pblk_w_ctx w_ctx; /* Context for this entry */
+ struct list_head index; /* List head to enable indexes */
+};
+
+#define RB_EMPTY_ENTRY (~0ULL)
+
+struct pblk_rb_pages {
+ struct page *pages;
+ int order;
+ struct list_head list;
+};
+
+struct pblk_rb {
+ struct pblk_rb_entry *entries; /* Ring buffer entries */
+ unsigned long mem; /* Write offset - points to next
+ * writable entry in memory
+ */
+ unsigned long subm; /* Read offset - points to last entry
+ * that has been submitted to the media
+ * to be persisted
+ */
+ unsigned long sync; /* Synced - backpointer that signals
+ * the last submitted entry that has
+ * been successfully persisted to media
+ */
+ unsigned long sync_point; /* Sync point - last entry that must be
+ * flushed to the media. Used with
+ * REQ_FLUSH and REQ_FUA
+ */
+ unsigned long l2p_update; /* l2p update point - next entry for
+ * which l2p mapping will be updated to
+ * contain a device ppa address (instead
+					 * of a cacheline)
+ */
+ unsigned long nr_entries; /* Number of entries in write buffer -
+ * must be a power of two
+ */
+ unsigned int seg_size; /* Size of the data segments being
+ * stored on each entry. Typically this
+ * will be 4KB
+ */
+
+ struct list_head pages; /* List of data pages */
+
+ spinlock_t w_lock; /* Write lock */
+ spinlock_t r_lock; /* Read lock */
+ spinlock_t s_lock; /* Sync lock */
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_t inflight_sync_point; /* Not served REQ_FLUSH | REQ_FUA */
+#endif
+};
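+
+/*
+ * Illustrative sketch, not used by the driver: since nr_entries must be a
+ * power of two, ring positions can wrap with a mask instead of a modulo.
+ * The driver's own helper for this is pblk_rb_wrap_pos(); this example only
+ * shows the invariant that the mem, subm, sync and l2p_update pointers rely
+ * on. e.g. with nr_entries = 8, position 9 wraps back to entry 1.
+ */
+static inline unsigned long __pblk_rb_wrap_example(struct pblk_rb *rb,
+						   unsigned long pos)
+{
+	return pos & (rb->nr_entries - 1);
+}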
+
+#define PBLK_RECOVERY_SECTORS 16
+#define PBLK_RECOVERY_BITMAPS 3 /* sector_bitmap, sync_bitmap, invalid_bitmap */
+
+/*
+ * Recovery metadata stored in the last page of the block. A list of lbas (u64)
+ * is allocated together with this structure to allow block recovery and GC.
+ * After this structure, we store the following block bitmaps on the last page:
+ * sector_bitmap, sync_bitmap and invalid_bitmap, in this order (see the layout
+ * sketch after the structure definition below).
+ */
+struct pblk_blk_rec_lpg {
+ u32 crc;
+ u32 status;
+ u32 blk_state;
+ u32 rlpg_len;
+ u32 req_len;
+ u32 nr_lbas;
+ u32 nr_padded;
+ u32 cur_sec;
+ u32 nr_invalid_secs;
+ u32 bitmap_len;
+};
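+
+/*
+ * Layout sketch for the recovery page (an assumption based on the comment
+ * above and on pblk_rlpg_to_llba() below; the exact lengths come from
+ * pblk_recov_calc_meta_len()):
+ *
+ *	| pblk_blk_rec_lpg | u64 lbas[nr_lbas] | sector | sync | invalid |
+ *
+ * The lba list therefore starts right after the structure itself:
+ */
+static inline u64 *__pblk_rlpg_lbas_example(struct pblk_blk_rec_lpg *rlpg)
+{
+	return (u64 *)(rlpg + 1);
+}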
+
+struct pblk_blk_rec_lenghts {
+ unsigned int bitmap_len;
+ unsigned int rlpg_page_len;
+};
+
+struct pblk_block {
+ int id; /* id inside of LUN */
+ struct pblk_lun *rlun;
+ struct list_head prio;
+ struct list_head list;
+
+ struct pblk_blk_rec_lpg *rlpg;
+
+ unsigned long *sector_bitmap; /* Bitmap for free (0) / used sectors
+ * (1) in the block
+ */
+ unsigned long *sync_bitmap; /* Bitmap representing physical
+ * addresses that have been synced to
+ * the media
+ */
+ unsigned long *invalid_bitmap; /* Bitmap for invalid sector entries */
+ unsigned long cur_sec;
+ /* number of secs that are invalid, wrt host page size */
+ unsigned int nr_invalid_secs;
+
+ int state;
+
+ spinlock_t lock;
+};
+
+struct pblk_lun {
+ struct pblk *pblk;
+
+ int id;
+ struct ppa_addr bppa;
+
+ struct pblk_block *cur;
+ struct pblk_block *blocks; /* Reference to block allocation */
+
+ /* In-use blocks - pblk block */
+ struct list_head prio_list; /* Blocks that may be GC'ed */
+ struct list_head open_list; /* In-use open blocks. These are blocks
+ * that can be both written to and read
+ * from
+ */
+ struct list_head closed_list; /* In-use closed blocks. These are
+ * blocks that can _only_ be read from
+ * and that have not been reclaimed by
+ * GC
+ */
+ struct list_head g_bb_list; /* Grown bad blocks waiting to be
+					 * disposed
+ */
+
+ /* lun block lists */
+	struct list_head free_list;	/* Unused blocks, i.e., released
+ * and ready for use
+ */
+ struct list_head bb_list; /* Bad blocks. Mutually exclusive with
+ * free_list and used blocks
+ * (open_list + closed_list + g_bb_list)
+ */
+ unsigned int nr_free_blocks; /* Number of unused blocks */
+
+ struct semaphore wr_sem;
+
+ spinlock_t lock;
+};
+
+struct pblk_gc {
+ int gc_active;
+ int gc_enabled;
+ int gc_forced;
+
+ spinlock_t lock;
+};
+
+struct pblk_prov {
+ unsigned int high_pw; /* Upper threshold for rate limiter (free run -
+				 * user I/O rate limiter). Given as a power-of-2
+ */
+ unsigned int high_lun; /* Upper threshold for per-LUN rate limiter.
+ * Given as absolute value
+ */
+ unsigned int low_pw; /* Lower threshold for rate limiter (user I/O
+ * rate limiter - stall). Given as a power-of-2
+ */
+ unsigned int low_lun; /* Lower threshold for per-LUN rate limiter.
+ * Given as absolute value
+ */
+
+#define PBLK_USER_LOW_THRS 50 /* full stop at 2 percent of available
+ * blocks
+ */
+#define PBLK_USER_HIGH_THRS 4 /* begin write limit at 25 percent
+					 * of available blocks
+ */
+
+ int rb_windows_pw; /* Number of rate windows in the write buffer
+ * given as a power-of-2. This guarantees that
+ * when user I/O is being rate limited, there
+				 * will be enough space reserved for GC to
+				 * place its payload. A window is of
+				 * pblk->max_write_pgs size, which in NVMe is
+				 * 64, i.e., 256KB.
+ */
+ int rb_user_max; /* Max buffer entries available for user I/O */
+ int rb_user_cnt; /* User I/O buffer counter */
+ int rb_gc_max; /* Max buffer entries available for GC I/O */
+ int rb_gc_rsv; /* Reserved buffer entries for GC I/O */
+ int rb_gc_cnt; /* GC I/O buffer counter */
+
+ unsigned long long nr_secs;
+ unsigned long total_blocks;
+ unsigned long free_blocks;
+
+ spinlock_t lock;
+};
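+
+/*
+ * Worked example for the thresholds above (one reading of the constants;
+ * the actual accounting lives in pblk-rl.c): with total_blocks = 10000,
+ * user I/O starts being limited when fewer than
+ * 10000 / PBLK_USER_HIGH_THRS = 2500 blocks (25%) are free, and is fully
+ * stopped when fewer than 10000 / PBLK_USER_LOW_THRS = 200 blocks (2%)
+ * are free.
+ */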
+
+struct pblk_prov_queue {
+ struct list_head list;
+ spinlock_t lock;
+ int nr_elems;
+ int qd;
+};
+
+/* Write strategy */
+struct pblk_w_luns {
+ int nr_luns; /* Number of writable luns */
+ int nr_blocks; /* Number of blocks to be consumed per lun. -1
+ * signals that the lun must not change and
+ * consume only blocks from the set luns. Active
+				 * luns can then be set through sysfs
+ */
+
+ struct pblk_lun **luns; /* Pointers to writable luns */
+ int *lun_blocks; /* Consumed blocks per lun */
+
+	int next_w_lun;		/* Whenever a sector is written, this is updated
+ * to point to the next write lun
+ */
+ int next_lun; /* Next non-writable lun to become writable */
+
+ spinlock_t lock;
+};
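+
+/*
+ * Illustrative sketch, not used by the driver: advancing the round-robin
+ * write lun pointer under w_luns.lock could look like the helper below.
+ * The driver's own round-robin mapping presumably lives in pblk-map.c
+ * (pblk_map_rr_page()).
+ */
+static inline int __pblk_next_w_lun_example(struct pblk_w_luns *w_luns)
+{
+	return (w_luns->next_w_lun + 1) % w_luns->nr_luns;
+}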
+
+#define NVM_MEM_PAGE_WRITE (8)
+
+struct pblk {
+	/* instance must be kept at the top to resolve pblk in unprep */
+ struct nvm_tgt_instance instance;
+
+ struct nvm_tgt_dev *dev;
+ struct gendisk *disk;
+
+ struct kobject kobj;
+
+ int nr_luns;
+ struct pblk_lun *luns;
+
+ struct pblk_w_luns w_luns;
+
+ struct pblk_rb rwb;
+
+	int min_write_pgs; /* Minimum number of pages required by the controller */
+	int max_write_pgs; /* Maximum number of pages supported by the controller */
+
+	int pgs_in_buffer; /* Number of pages that need to be held in buffer to
+ * guarantee successful reads
+ */
+
+ unsigned int nr_blk_dsecs; /* Number of data sectors in block */
+ struct pblk_blk_rec_lenghts blk_meta;
+
+ /* capacity of devices when bad blocks are subtracted */
+ sector_t capacity;
+
+ /* pblk provisioning values. Used by rate limiter */
+ struct pblk_prov rl;
+
+ /* counter for pblk_write_kick */
+#define PBLK_KICK_SECTS 16
+ int write_cnt;
+ spinlock_t kick_lock;
+
+#ifdef CONFIG_NVM_DEBUG
+	/* All debug counters apply to 4KB sector I/Os */
+ atomic_t inflight_writes; /* Inflight writes (user and gc) */
+ atomic_t padded_writes; /* Sectors padded due to flush/fua */
+ atomic_t nr_flush; /* Number of flush/fua I/O */
+ atomic_t req_writes; /* Sectors stored on write buffer */
+ atomic_t sub_writes; /* Sectors submitted from buffer */
+ atomic_t sync_writes; /* Sectors synced to media */
+ atomic_t compl_writes; /* Sectors completed in write bio */
+ atomic_t inflight_meta; /* Inflight metadata sectors */
+ atomic_t compl_meta; /* Completed metadata sectors */
+ atomic_t inflight_reads; /* Inflight sector read requests */
+ atomic_t sync_reads; /* Completed sector read requests */
+ atomic_t recov_writes; /* Sectors submitted from recovery */
+ atomic_t recov_gc_writes; /* Sectors submitted from recovery GC */
+ atomic_t requeued_writes; /* Sectors requeued in cache */
+#endif
+
+ spinlock_t lock;
+ unsigned long read_failed;
+ unsigned long read_empty;
+ unsigned long read_high_ecc;
+ unsigned long read_failed_gc;
+ unsigned long write_failed;
+ unsigned long erase_failed;
+
+ spinlock_t bio_lock;
+ spinlock_t trans_lock;
+ struct bio_list requeue_bios;
+ struct work_struct ws_requeue;
+ struct work_struct ws_gc;
+ struct task_struct *ts_writer;
+
+ /* Simple translation map of logical addresses to physical addresses.
+	 * The logical addresses are known by the host system, while the physical
+ * addresses are used when writing to the disk block device.
+ */
+ struct pblk_addr *trans_map;
+
+ struct list_head compl_list;
+
+ mempool_t *page_pool;
+ mempool_t *blk_ws_pool;
+ mempool_t *rec_pool;
+ mempool_t *r_rq_pool;
+ mempool_t *w_rq_pool;
+ mempool_t *blk_meta_pool;
+
+ struct timer_list gc_timer;
+ struct workqueue_struct *krqd_wq;
+ struct workqueue_struct *kgc_wq;
+ struct workqueue_struct *kw_wq;
+
+ wait_queue_head_t wait;
+ struct timer_list wtimer;
+
+ struct pblk_gc gc;
+};
+
+struct pblk_block_ws {
+ struct pblk *pblk;
+ struct pblk_block *rblk;
+ struct work_struct ws_blk;
+};
+
+#define pblk_r_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_r_ctx))
+#define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_ctx) + \
+ sizeof(struct pblk_compl_ctx))
+
+/*
+ * pblk ring buffer operations
+ */
+int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
+ unsigned int power_size, unsigned int power_seg_sz);
+unsigned long pblk_rb_calculate_size(unsigned long nr_entries);
+unsigned long pblk_rb_nr_entries(struct pblk_rb *rb);
+void *pblk_rb_entries_ref(struct pblk_rb *rb);
+
+int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_up,
+ unsigned int nr_com, unsigned long *pos);
+void pblk_rb_write_entry(struct pblk_rb *rb, void *data,
+ struct pblk_w_ctx w_ctx, unsigned int pos);
+struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned long pos);
+
+void pblk_rb_sync_l2p(struct pblk_rb *rb);
+
+unsigned long pblk_rb_read_lock(struct pblk_rb *rb);
+unsigned int pblk_rb_read(struct pblk_rb *rb, void *buf,
+ struct pblk_ctx *ctx,
+ unsigned int nr_entries);
+unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio,
+ struct pblk_ctx *ctx,
+ unsigned long pos,
+ unsigned int nr_entries,
+ unsigned int count,
+ unsigned long *sp);
+unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
+ struct pblk_ctx *ctx,
+ struct list_head *list,
+ unsigned int max);
+void pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, u64 pos);
+unsigned long pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries);
+void pblk_rb_read_unlock(struct pblk_rb *rb);
+
+unsigned long pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags);
+unsigned long pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries);
+struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
+ struct ppa_addr *ppa);
+void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags);
+
+int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio);
+unsigned long pblk_rb_sync_point_count(struct pblk_rb *rb);
+void pblk_rb_sync_point_reset(struct pblk_rb *rb, unsigned long sp);
+
+unsigned long pblk_rb_space(struct pblk_rb *rb);
+unsigned long pblk_rb_count(struct pblk_rb *rb);
+unsigned long pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned long pos);
+
+int pblk_rb_tear_down_check(struct pblk_rb *rb);
+int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos);
+
+void pblk_rb_data_free(struct pblk_rb *rb);
+
+#ifdef CONFIG_NVM_DEBUG
+ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf);
+#endif
+
+/*
+ * pblk core
+ */
+struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw);
+void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw);
+void pblk_flush_writer(struct pblk *pblk);
+struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba);
+void pblk_discard(struct pblk *pblk, struct bio *bio);
+struct pblk_blk_rec_lpg *pblk_alloc_blk_meta(struct pblk *pblk,
+ struct pblk_block *rblk,
+ u32 status);
+void pblk_put_blk(struct pblk *pblk, struct pblk_block *rblk);
+void pblk_erase_blk(struct pblk *pblk, struct pblk_block *rblk);
+void pblk_mark_bb(struct pblk *pblk, struct ppa_addr ppa);
+void pblk_end_io(struct nvm_rq *rqd);
+void pblk_end_sync_bio(struct bio *bio);
+void pblk_free_blks(struct pblk *pblk);
+void pblk_pad_open_blks(struct pblk *pblk);
+struct pblk_block *pblk_get_blk(struct pblk *pblk, struct pblk_lun *rlun);
+int pblk_replace_blk(struct pblk *pblk, struct pblk_block *rblk,
+ struct pblk_lun *rlun, int lun_pos);
+void pblk_end_close_blk_bio(struct pblk *pblk, struct nvm_rq *rqd, int run_gc);
+void pblk_set_lun_cur(struct pblk_lun *rlun, struct pblk_block *rblk);
+void pblk_run_blk_ws(struct pblk *pblk, struct pblk_block *rblk,
+ void (*work)(struct work_struct *));
+int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
+ int nr_pages);
+void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
+ int nr_pages);
+int pblk_update_map(struct pblk *pblk, sector_t laddr, struct pblk_block *rblk,
+ struct ppa_addr ppa);
+int pblk_update_map_gc(struct pblk *pblk, sector_t laddr,
+ struct pblk_block *rblk, struct ppa_addr ppa,
+ struct pblk_block *gc_rblk);
+unsigned long pblk_nr_free_blks(struct pblk *pblk);
+
+#ifdef CONFIG_NVM_DEBUG
+void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd, int error);
+int pblk_luns_configure(struct pblk *pblk);
+#endif
+
+/*
+ * pblk user I/O write path
+ */
+int pblk_write_to_cache(struct pblk *pblk, struct bio *bio,
+ unsigned long flags);
+int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list,
+ struct pblk_kref_buf *ref_buf,
+ unsigned int nr_entries, unsigned int nr_rec_entries,
+ unsigned long flags, struct pblk_block *gc_rblk);
+
+/*
+ * pblk map
+ */
+int pblk_map_init(struct pblk *pblk);
+void pblk_map_free(struct pblk *pblk);
+int pblk_map_page(struct pblk *pblk, struct pblk_block *rblk,
+ unsigned int sentry, struct ppa_addr *ppa_list,
+ struct pblk_sec_meta *meta_list,
+ unsigned int nr_secs, unsigned int valid_secs);
+int pblk_map_rr_page(struct pblk *pblk, unsigned int sentry,
+ struct ppa_addr *ppa_list,
+ struct pblk_sec_meta *meta_list,
+ unsigned int nr_secs, unsigned int valid_secs,
+ unsigned long *lun_bitmap);
+int pblk_map_replace_lun(struct pblk *pblk, int lun_pos);
+ssize_t pblk_map_set_active_luns(struct pblk *pblk, int nr_luns);
+ssize_t pblk_map_set_offset_active_luns(struct pblk *pblk, int offset);
+int pblk_map_get_active_luns(struct pblk *pblk);
+int pblk_map_set_consume_blocks(struct pblk *pblk, int value);
+int pblk_map_get_consume_blocks(struct pblk *pblk);
+
+/*
+ * pblk write thread
+ */
+int pblk_write_ts(void *data);
+void pblk_write_timer_fn(unsigned long data);
+int pblk_write_setup_m(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_ctx *ctx, struct pblk_sec_meta *meta,
+ unsigned int valid_secs, int off,
+ unsigned long *lun_bitmap);
+int pblk_write_setup_s(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_ctx *ctx, struct pblk_sec_meta *meta,
+ unsigned long *lun_bitmap);
+int pblk_write_alloc_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_ctx *ctx, unsigned int nr_secs);
+void pblk_end_io_write(struct pblk *pblk, struct nvm_rq *rqd);
+
+/*
+ * pblk read path
+ */
+int pblk_submit_read(struct pblk *pblk, struct bio *bio, unsigned long flags);
+int pblk_submit_read_gc(struct pblk *pblk, struct bio *bio,
+ struct nvm_rq *rqd, u64 *lba_list,
+ unsigned int nr_secs, unsigned int nr_rec_secs,
+ unsigned long flags);
+void pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, uint8_t nr_secs);
+
+/*
+ * pblk recovery
+ */
+void pblk_submit_rec(struct work_struct *work);
+int pblk_recov_page_size(struct pblk *pblk);
+void pblk_run_recovery(struct pblk *pblk, struct pblk_block *rblk);
+int pblk_recov_init(struct pblk *pblk);
+int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_ctx *ctx,
+ struct pblk_rec_ctx *recovery, u64 *comp_bits,
+ unsigned int c_entries);
+int pblk_recov_read(struct pblk *pblk, struct pblk_block *rblk,
+ void *recov_page);
+struct nvm_rq *pblk_recov_setup(struct pblk *pblk, void *recov_page);
+u64 *pblk_recov_get_lba_list(struct pblk *pblk, struct pblk_blk_rec_lpg *rlpg);
+int pblk_recov_scan_blk(struct pblk *pblk, struct pblk_block *rblk);
+void pblk_recov_clean_g_bb_list(struct pblk *pblk, struct pblk_lun *rlun);
+void pblk_close_blk(struct work_struct *work);
+int pblk_recov_calc_meta_len(struct pblk *pblk, unsigned int *bitmap_len,
+ unsigned int *rlpg_len,
+ unsigned int *req_len);
+
+#ifdef CONFIG_NVM_DEBUG
+void pblk_recov_blk_meta_sysfs(struct pblk *pblk, u64 value);
+#endif
+
+/*
+ * pblk gc
+ */
+#define PBLK_GC_TRIES 3
+
+int pblk_gc_init(struct pblk *pblk);
+void pblk_gc_exit(struct pblk *pblk);
+void pblk_gc_should_start(struct pblk *pblk);
+void pblk_gc_should_stop(struct pblk *pblk);
+int pblk_gc_status(struct pblk *pblk);
+void pblk_gc_queue(struct work_struct *work);
+void pblk_gc(struct work_struct *work);
+int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_block *rblk,
+ u64 *lba_list, unsigned int nr_entries);
+void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
+ int *gc_active);
+int pblk_gc_sysfs_force(struct pblk *pblk, int value);
+int pblk_gc_sysfs_enable(struct pblk *pblk, int value);
+
+/*
+ * pblk rate limiter
+ */
+void pblk_rl_init(struct pblk *pblk);
+int pblk_rl_gc_thrs(struct pblk *pblk);
+void pblk_rl_user_in(struct pblk *pblk, int nr_entries);
+void pblk_rl_gc_in(struct pblk *pblk, int nr_entries);
+void pblk_rl_out(struct pblk *pblk, int nr_user, int nr_gc);
+void pblk_rl_set_gc_rsc(struct pblk *pblk, int rsv);
+int pblk_rl_sysfs_rate_show(struct pblk *pblk);
+int pblk_rl_sysfs_rate_store(struct pblk *pblk, int value);
+void pblk_rl_free_blks_inc(struct pblk *pblk, struct pblk_lun *rlun);
+void pblk_rl_free_blks_dec(struct pblk *pblk, struct pblk_lun *rlun);
+
+/*
+ * pblk sysfs
+ */
+int pblk_sysfs_init(struct gendisk *tdisk);
+void pblk_sysfs_exit(struct pblk *pblk);
+
+static inline int nvm_addr_in_cache(struct ppa_addr gp)
+{
+ if (gp.ppa != ADDR_EMPTY && gp.c.is_cached)
+ return 1;
+ return 0;
+}
+
+static inline u64 nvm_addr_to_cacheline(struct ppa_addr gp)
+{
+#ifdef CONFIG_NVM_DEBUG
+ BUG_ON(gp.ppa == ADDR_EMPTY);
+#endif
+ return gp.c.line;
+}
+
+static inline void pblk_write_kick(struct pblk *pblk)
+{
+ wake_up_process(pblk->ts_writer);
+}
+
+static inline void *pblk_rlpg_to_llba(struct pblk_blk_rec_lpg *lpg)
+{
+ return lpg + 1;
+}
+
+static inline struct pblk_ctx *pblk_set_ctx(struct pblk *pblk,
+ struct nvm_rq *rqd)
+{
+ struct pblk_ctx *c;
+
+ c = nvm_rq_to_pdu(rqd);
+ c->c_ctx = (void *)(c + 1);
+
+ return c;
+}
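+
+/*
+ * Request layout assumed by pblk_set_ctx() and by the pblk_*_rq_size macros
+ * above (illustration only):
+ *
+ *	| struct nvm_rq | struct pblk_ctx | completion context |
+ *	                  ^ nvm_rq_to_pdu()  ^ ctx->c_ctx = ctx + 1
+ *
+ * e.g. a write request allocated with pblk_w_rq_size carries a
+ * struct pblk_compl_ctx as its completion context.
+ */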
+
+static inline void pblk_memcpy_addr(struct pblk_addr *to,
+ struct pblk_addr *from)
+{
+ to->ppa = from->ppa;
+ to->rblk = from->rblk;
+}
+
+static inline void pblk_ppa_set_empty(struct pblk_addr *ppa)
+{
+ ppa_set_empty(&ppa->ppa);
+ ppa->rblk = NULL;
+}
+
+static inline void pblk_free_ref_mem(struct kref *ref)
+{
+ struct pblk_kref_buf *ref_buf;
+ void *data;
+
+ ref_buf = container_of(ref, struct pblk_kref_buf, ref);
+ data = ref_buf->data;
+
+ kfree(data);
+ kfree(ref_buf);
+}
+
+/* Calculate the page offset within a block from a generic address */
+static inline u64 pblk_gaddr_to_pg_offset(struct nvm_tgt_dev *dev,
+ struct ppa_addr p)
+{
+ struct nvm_geo *geo = &dev->geo;
+
+ return (u64) (p.g.pg * geo->sec_per_pl) +
+ (p.g.pl * geo->sec_per_pg) + p.g.sec;
+}
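+
+/*
+ * Worked example (assuming a geometry with sec_per_pg = 4 and nr_planes = 2,
+ * so sec_per_pl = 8): a ppa with pg = 3, pl = 1, sec = 2 maps to
+ * 3 * 8 + 1 * 4 + 2 = 30, i.e. the 31st sector of the block.
+ */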
+
+static inline struct ppa_addr pblk_cacheline_to_ppa(u64 addr)
+{
+ struct ppa_addr p;
+
+ p.c.line = (u64)addr;
+ p.c.is_cached = 1;
+
+ return p;
+}
+
+static inline struct ppa_addr pblk_dev_addr_to_ppa(u64 addr)
+{
+ struct ppa_addr gp;
+
+ gp.ppa = (u64)addr;
+ gp.c.is_cached = 0;
+
+ return gp;
+}
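+
+/*
+ * Usage sketch (illustration only): an L2P entry holds either a cacheline
+ * or a device address, distinguished by the is_cached bit:
+ *
+ *	struct ppa_addr gp = pblk_get_lba_map(pblk, lba);
+ *
+ *	if (nvm_addr_in_cache(gp))
+ *		pos = nvm_addr_to_cacheline(gp);   -- serve from write buffer
+ *	else
+ *		...                                -- read gp from the media
+ */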
+
+static inline struct ppa_addr addr_to_ppa(u64 paddr)
+{
+ struct ppa_addr ppa;
+
+ ppa.ppa = paddr;
+ return ppa;
+}
+
+static inline u64 ppa_to_addr(struct ppa_addr ppa)
+{
+ return ppa.ppa;
+}
+
+static inline int pblk_set_progr_mode(struct pblk *pblk, int type)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ int flags;
+
+ switch (geo->plane_mode) {
+ case NVM_PLANE_QUAD:
+ flags = NVM_IO_QUAD_ACCESS;
+ break;
+ case NVM_PLANE_DOUBLE:
+ flags = NVM_IO_DUAL_ACCESS;
+ break;
+ case NVM_PLANE_SINGLE:
+ flags = NVM_IO_SNGL_ACCESS;
+ break;
+ default:
+ pr_err("pblk: invalid plane configuration\n");
+ return -EINVAL;
+ }
+
+ if (type == WRITE)
+ flags |= NVM_IO_SCRAMBLE_ENABLE;
+
+ return flags;
+}
+
+static inline int pblk_set_read_mode(struct pblk *pblk)
+{
+ return NVM_IO_SNGL_ACCESS | NVM_IO_SUSPEND | NVM_IO_SCRAMBLE_ENABLE;
+}
+
+static inline struct ppa_addr pblk_blk_ppa_to_gaddr(struct nvm_tgt_dev *dev,
+ struct pblk_block *rblk,
+ u64 page_addr)
+{
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_lun *rlun = rblk->rlun;
+ struct ppa_addr p;
+ int secs, pgs, pls;
+
+ /* Set base address for LUN and block */
+ p = rlun->bppa;
+ p.g.blk = rblk->id;
+
+ /* Calculate page, plane and sector */
+ div_u64_rem(page_addr, geo->sec_per_pg, &secs);
+ p.g.sec = secs;
+
+ sector_div(page_addr, geo->sec_per_pg);
+ div_u64_rem(page_addr, geo->nr_planes, &pls);
+ p.g.pl = pls;
+
+ sector_div(page_addr, geo->nr_planes);
+ div_u64_rem(page_addr, geo->pgs_per_blk, &pgs);
+ p.g.pg = pgs;
+
+ return p;
+}
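+
+/*
+ * Worked example (same assumed geometry as above: sec_per_pg = 4,
+ * nr_planes = 2): page_addr = 30 decodes to sec = 30 % 4 = 2, then
+ * 30 / 4 = 7 gives pl = 7 % 2 = 1 and pg = 7 / 2 = 3, i.e. the inverse of
+ * pblk_gaddr_to_pg_offset().
+ */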
+
+static inline int pblk_boundary_checks(struct nvm_tgt_dev *tgt_dev,
+ struct ppa_addr *ppas, int nr_ppas)
+{
+ struct nvm_geo *geo = &tgt_dev->geo;
+ struct ppa_addr *ppa;
+ int i;
+
+ for (i = 0; i < nr_ppas; i++) {
+ ppa = &ppas[i];
+
+ if (ppa->g.ch < geo->nr_chnls &&
+ ppa->g.lun < geo->nr_luns &&
+ ppa->g.pl < geo->nr_planes &&
+ ppa->g.blk < geo->blks_per_lun &&
+ ppa->g.pg < geo->pgs_per_blk &&
+ ppa->g.sec < geo->sec_per_pg)
+ continue;
+
+#ifdef CONFIG_NVM_DEBUG
+ if (ppa->c.is_cached)
+ pr_err("nvm: ppa oob(cacheline:%llu)\n",
+ (u64)ppa->c.line);
+ else
+			pr_err("nvm: ppa oob(ch:%u,lun:%u,pl:%u,blk:%u,pg:%u,sec:%u)\n",
+ ppa->g.ch, ppa->g.lun, ppa->g.pl,
+ ppa->g.blk, ppa->g.pg, ppa->g.sec);
+#endif
+ return 1;
+ }
+ return 0;
+}
+
+static inline void print_ppa(struct ppa_addr *p, char *msg, int error)
+{
+ if (p->c.is_cached) {
+ pr_err("ppa: (%s: %x) cache line: %llu\n",
+ msg, error, (u64)p->c.line);
+ } else {
+ pr_err("ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n",
+ msg, error,
+ p->g.ch, p->g.lun, p->g.blk,
+ p->g.pg, p->g.pl, p->g.sec);
+ }
+}
+
+static inline unsigned int pblk_get_bi_idx(struct bio *bio)
+{
+ return bio->bi_iter.bi_idx;
+}
+
+static inline sector_t pblk_get_laddr(struct bio *bio)
+{
+ return bio->bi_iter.bi_sector / NR_PHY_IN_LOG;
+}
+
+static inline unsigned int pblk_get_secs(struct bio *bio)
+{
+ return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE;
+}
+
+static inline sector_t pblk_get_sector(sector_t laddr)
+{
+ return laddr * NR_PHY_IN_LOG;
+}
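+
+/*
+ * Worked example: NR_PHY_IN_LOG = 4096 / 512 = 8, so a bio starting at
+ * bi_sector = 4096 maps to laddr 512, a 16KB bio spans pblk_get_secs() = 4
+ * exposed sectors, and pblk_get_sector(512) maps back to sector 4096.
+ */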
+
+static inline int block_is_bad(struct pblk_block *rblk)
+{
+ return (rblk->state == NVM_BLK_ST_BAD);
+}
+
+static inline int block_is_full(struct pblk *pblk, struct pblk_block *rblk)
+{
+#ifdef CONFIG_NVM_DEBUG
+ if (!block_is_bad(rblk))
+ BUG_ON(!bitmap_full(rblk->sector_bitmap, pblk->nr_blk_dsecs) &&
+ rblk->cur_sec >= pblk->nr_blk_dsecs);
+#endif
+
+ return (rblk->cur_sec >= pblk->nr_blk_dsecs);
+}
+
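+/*
+ * Callers that may run in interrupt/completion context (e.g.
+ * pblk_end_io_write() in pblk-write.c, which passes interr = 1) get the
+ * irq-saving lock variant; process-context callers pass interr = 0 and take
+ * the plain irq-disabling lock.
+ */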
+static inline void inc_stat(struct pblk *pblk, unsigned long *stat, int interr)
+{
+ if (interr) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&pblk->lock, flags);
+ (*stat)++;
+ spin_unlock_irqrestore(&pblk->lock, flags);
+ } else {
+ spin_lock_irq(&pblk->lock);
+ (*stat)++;
+ spin_unlock_irq(&pblk->lock);
+ }
+}
+#endif /* PBLK_H_ */
--
2.7.4