Re: [PATCH v4 02/12] pstore/zone: Introduce common layer to manage storage zones
From: WeiXiong Liao
Date: Fri May 08 2020 - 23:09:19 EST
hi Kees Cook,
On 2020/5/8 PM 2:39, Kees Cook wrote:
> From: WeiXiong Liao <liaoweixiong@xxxxxxxxxxxxxxxxx>
>
> Implement a common set of APIs needed to support pstore storage zones,
> based on how ramoops is designed. This will be used by pstore/blk with
> the intention of migrating pstore/ram in the future.
>
> Signed-off-by: WeiXiong Liao <liaoweixiong@xxxxxxxxxxxxxxxxx>
> Link: https://lore.kernel.org/r/1585126506-18635-2-git-send-email-liaoweixiong@xxxxxxxxxxxxxxxxx
> Co-developed-by: Kees Cook <keescook@xxxxxxxxxxxx>
> Signed-off-by: Kees Cook <keescook@xxxxxxxxxxxx>
> ---
> fs/pstore/Kconfig | 7 +
> fs/pstore/Makefile | 3 +
> fs/pstore/zone.c | 973 ++++++++++++++++++++++++++++++++++++
> include/linux/pstore_zone.h | 44 ++
> 4 files changed, 1027 insertions(+)
> create mode 100644 fs/pstore/zone.c
> create mode 100644 include/linux/pstore_zone.h
>
> diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
> index 8f0369aad22a..98d2457bdd9f 100644
> --- a/fs/pstore/Kconfig
> +++ b/fs/pstore/Kconfig
> @@ -153,3 +153,10 @@ config PSTORE_RAM
> "ramoops.ko".
>
> For more information, see Documentation/admin-guide/ramoops.rst.
> +
> +config PSTORE_ZONE
> + tristate
> + depends on PSTORE
> + help
> + The common layer for pstore/blk (and pstore/ram in the future)
> + to manage storage in zones.
> diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
> index 967b5891f325..58a967cbe4af 100644
> --- a/fs/pstore/Makefile
> +++ b/fs/pstore/Makefile
> @@ -12,3 +12,6 @@ pstore-$(CONFIG_PSTORE_PMSG) += pmsg.o
>
> ramoops-objs += ram.o ram_core.o
> obj-$(CONFIG_PSTORE_RAM) += ramoops.o
> +
> +pstore_zone-objs += zone.o
> +obj-$(CONFIG_PSTORE_ZONE) += pstore_zone.o
> diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c
> new file mode 100644
> index 000000000000..6c25c443c8e2
> --- /dev/null
> +++ b/fs/pstore/zone.c
> @@ -0,0 +1,973 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#define MODNAME "pstore-zone"
> +#define pr_fmt(fmt) MODNAME ": " fmt
> +
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/pstore.h>
> +#include <linux/mount.h>
> +#include <linux/printk.h>
> +#include <linux/fs.h>
> +#include <linux/pstore_zone.h>
> +#include <linux/kdev_t.h>
> +#include <linux/device.h>
> +#include <linux/namei.h>
> +#include <linux/fcntl.h>
> +#include <linux/uio.h>
> +#include <linux/writeback.h>
> +
> +/**
> + * struct psz_head - header of zone to flush to storage
> + *
> + * @sig: signature to indicate header (PSZ_SIG xor PSZONE-type value)
> + * @datalen: length of data in @data
> + * @data: zone data.
> + */
> +struct psz_buffer {
> +#define PSZ_SIG (0x43474244) /* DBGC */
> + uint32_t sig;
> + atomic_t datalen;
> + uint8_t data[];
> +};
> +
> +/**
> + * struct psz_oops_header - sub header of oops zones to flush to storage
> + *
> + * @magic: magic num for oops header
> + * @time: oops/panic trigger time
> + * @compressed: whether conpressed
> + * @counter: oops/panic counter
> + * @reason: identify oops or panic
> + * @data: pointer to log data
> + *
> + * It's a sub-header of oops zone, trailing after &psz_buffer.
> + */
> +struct psz_oops_header {
> +#define OOPS_HEADER_MAGIC 0x4dfc3ae5 /* Just a ramdom number */
> + uint32_t magic;
> + struct timespec64 time;
> + bool compressed;
> + uint32_t counter;
> + enum kmsg_dump_reason reason;
> + uint8_t data[];
> +};
> +
> +/**
> + * struct pstore_zone - zone information
> + *
> + * @off: zone offset of storage
> + * @type: front-end type for this zone
> + * @name: front-end name for this zone
> + * @buffer: pointer to data buffer managed by this zone
> + * @oldbuf: pointer to old data buffer.
> + * @buffer_size: bytes in @buffer->data
> + * @should_recover: whether this zone should recover from storage
> + * @dirty: whether the data in @buffer dirty
> + *
> + * zone structure in memory.
> + */
> +struct pstore_zone {
> + loff_t off;
> + const char *name;
> + enum pstore_type_id type;
> +
> + struct psz_buffer *buffer;
> + struct psz_buffer *oldbuf;
> + size_t buffer_size;
> + bool should_recover;
> + atomic_t dirty;
> +};
> +
> +/**
> + * struct psz_context - all about running state of pstore/zone
> + *
> + * @opszs: oops/panic storage zones
> + * @oops_max_cnt: max count of @opszs
> + * @oops_read_cnt: counter to read oops zone
> + * @oops_write_cnt: counter to write
> + * @oops_counter: counter to oops
> + * @panic_counter: counter to panic
> + * @recovered: whether finish recovering data from storage
> + * @on_panic: whether occur panic
> + * @pstore_zone_info_lock: lock to @pstore_zone_info
> + * @pstore_zone_info: information from back-end
> + * @pstore: structure for pstore
> + */
> +struct psz_context {
> + struct pstore_zone **opszs;
> + unsigned int oops_max_cnt;
> + unsigned int oops_read_cnt;
> + unsigned int oops_write_cnt;
> + /*
> + * the counter should be recovered when recover.
> + * It records the oops/panic times after burning rather than booting.
> + */
> + unsigned int oops_counter;
> + unsigned int panic_counter;
> + atomic_t recovered;
> + atomic_t on_panic;
> +
> + /*
> + * pstore_zone_info_lock just protects "pstore_zone_info" during calls to
> + * register_pstore_zone/unregister_pstore_zone
> + */
> + struct mutex pstore_zone_info_lock;
> + struct pstore_zone_info *pstore_zone_info;
> + struct pstore_info pstore;
> +};
> +static struct psz_context psz_cxt;
> +
> +/**
> + * enum psz_flush_mode - flush mode for psz_zone_write()
> + *
> + * @FLUSH_NONE: do not flush to storage but update data on memory
> + * @FLUSH_PART: just flush part of data including meta data to storage
> + * @FLUSH_META: just flush meta data of zone to storage
> + * @FLUSH_ALL: flush all of zone
> + */
> +enum psz_flush_mode {
> + FLUSH_NONE = 0,
> + FLUSH_PART,
> + FLUSH_META,
> + FLUSH_ALL,
> +};
> +
> +static inline int buffer_datalen(struct pstore_zone *zone)
> +{
> + return atomic_read(&zone->buffer->datalen);
> +}
> +
> +static inline bool is_on_panic(void)
> +{
> + struct psz_context *cxt = &psz_cxt;
> +
> + return atomic_read(&cxt->on_panic);
> +}
> +
> +static ssize_t psz_zone_read(struct pstore_zone *zone, char *buf,
> + size_t len, unsigned long off)
> +{
> + if (!buf || !zone->buffer)
> + return -EINVAL;
> + if (off > zone->buffer_size)
> + return -EINVAL;
> + len = min_t(size_t, len, zone->buffer_size - off);
> + memcpy(buf, zone->buffer->data + off, len);
> + return len;
> +}
> +
> +static int psz_zone_write(struct pstore_zone *zone,
> + enum psz_flush_mode flush_mode, const char *buf,
> + size_t len, unsigned long off)
> +{
> + struct pstore_zone_info *info = psz_cxt.pstore_zone_info;
> + ssize_t wcnt = 0;
> + ssize_t (*writeop)(const char *buf, size_t bytes, loff_t pos);
> + size_t wlen;
> +
> + if (off > zone->buffer_size)
> + return -EINVAL;
> +
> + wlen = min_t(size_t, len, zone->buffer_size - off);
> + if (buf && wlen) {
> + memcpy(zone->buffer->data + off, buf, wlen);
> + atomic_set(&zone->buffer->datalen, wlen + off);
> + }
> +
> + /* avoid to damage old records */
> + if (!is_on_panic() && !atomic_read(&psz_cxt.recovered))
> + goto dirty;
> +
> + writeop = is_on_panic() ? info->panic_write : info->write;
> + if (!writeop)
> + goto dirty;
> +
> + switch (flush_mode) {
> + case FLUSH_NONE:
> + if (unlikely(buf && wlen))
> + goto dirty;
> + return 0;
> + case FLUSH_PART:
> + wcnt = writeop((const char *)zone->buffer->data + off, wlen,
> + zone->off + sizeof(*zone->buffer) + off);
> + if (wcnt != wlen)
> + goto dirty;
> + fallthrough;
> + case FLUSH_META:
> + wlen = sizeof(struct psz_buffer);
> + wcnt = writeop((const char *)zone->buffer, wlen, zone->off);
> + if (wcnt != wlen)
> + goto dirty;
> + break;
> + case FLUSH_ALL:
> + wlen = zone->buffer_size + sizeof(*zone->buffer);
> + wcnt = writeop((const char *)zone->buffer, wlen, zone->off);
> + if (wcnt != wlen)
> + goto dirty;
> + break;
> + }
> +
> + return 0;
> +dirty:
> + atomic_set(&zone->dirty, true);
> + return -EBUSY;
> +}
> +
> +static int psz_flush_dirty_zone(struct pstore_zone *zone)
> +{
> + int ret;
> +
> + if (!zone)
> + return -EINVAL;
> +
> + if (!atomic_read(&zone->dirty))
> + return 0;
> +
> + if (!atomic_read(&psz_cxt.recovered))
> + return -EBUSY;
> +
> + ret = psz_zone_write(zone, FLUSH_ALL, NULL, 0, 0);
> + if (!ret)
> + atomic_set(&zone->dirty, false);
> + return ret;
> +}
To avoid multi writers call flush_dirty_zone(), I prefer to
use atomic_xchg() as follow:
static int psz_flush_dirty_zone(struct pstore_zone *zone)
{
int ret;
if (unlikely(!zone))
return -EINVAL;
if (unlikely(!atomic_read(&psz_cxt.recovered)))
return -EBUSY;
if (!atomic_xchg(&zone->dirty, false))
return 0;
ret = psz_zone_write(zone, FLUSH_ALL, NULL, 0, 0);
if (ret)
atomic_set(&zone->dirty, true);
return ret;
}
> +
> +static int psz_flush_dirty_zones(struct pstore_zone **zones, unsigned int cnt)
> +{
> + int i, ret;
> + struct pstore_zone *zone;
> +
> + if (!zones)
> + return -EINVAL;
> +
> + for (i = 0; i < cnt; i++) {
> + zone = zones[i];
> + if (!zone)
> + return -EINVAL;
> + ret = psz_flush_dirty_zone(zone);
> + if (ret)
> + return ret;
> + }
> + return 0;
> +}
> +
> +static int psz_move_zone(struct pstore_zone *old, struct pstore_zone *new)
> +{
> + const char *data = (const char *)old->buffer->data;
> + int ret;
> +
> + ret = psz_zone_write(new, FLUSH_ALL, data, buffer_datalen(old), 0);
> + if (ret) {
> + atomic_set(&new->buffer->datalen, 0);
> + atomic_set(&new->dirty, false);
> + return ret;
> + }
> + atomic_set(&old->buffer->datalen, 0);
> + return 0;
> +}
> +
> +static int psz_recover_oops_data(struct psz_context *cxt)
> +{
> + struct pstore_zone_info *info = cxt->pstore_zone_info;
> + struct pstore_zone *zone = NULL;
> + struct psz_buffer *buf;
> + unsigned long i;
> + ssize_t rcnt;
> +
> + if (!info->read)
> + return -EINVAL;
> +
> + for (i = 0; i < cxt->oops_max_cnt; i++) {
> + zone = cxt->opszs[i];
> + if (unlikely(!zone))
> + return -EINVAL;
> + if (atomic_read(&zone->dirty)) {
> + unsigned int wcnt = cxt->oops_write_cnt;
> + struct pstore_zone *new = cxt->opszs[wcnt];
> + int ret;
> +
> + ret = psz_move_zone(zone, new);
> + if (ret) {
> + pr_err("move zone from %lu to %d failed\n",
> + i, wcnt);
> + return ret;
> + }
> + cxt->oops_write_cnt = (wcnt + 1) % cxt->oops_max_cnt;
> + }
> + if (!zone->should_recover)
> + continue;
> + buf = zone->buffer;
> + rcnt = info->read((char *)buf, zone->buffer_size + sizeof(*buf),
> + zone->off);
> + if (rcnt != zone->buffer_size + sizeof(*buf))
> + return (int)rcnt < 0 ? (int)rcnt : -EIO;
> + }
> + return 0;
> +}
> +
> +static int psz_recover_oops_meta(struct psz_context *cxt)
> +{
> + struct pstore_zone_info *info = cxt->pstore_zone_info;
> + struct pstore_zone *zone;
> + size_t rcnt, len;
> + struct psz_buffer *buf;
> + struct psz_oops_header *hdr;
> + struct timespec64 time = {0};
> + unsigned long i;
> + /*
> + * Recover may on panic, we can't allocate any memory by kmalloc.
> + * So, we use local array instead.
> + */
> + char buffer_header[sizeof(*buf) + sizeof(*hdr)] = {0};
> +
> + if (!info->read)
> + return -EINVAL;
> +
> + len = sizeof(*buf) + sizeof(*hdr);
> + buf = (struct psz_buffer *)buffer_header;
> + for (i = 0; i < cxt->oops_max_cnt; i++) {
> + zone = cxt->opszs[i];
> + if (unlikely(!zone))
> + return -EINVAL;
> +
> + rcnt = info->read((char *)buf, len, zone->off);
> + if (rcnt != len) {
> + pr_err("read %s with id %lu failed\n", zone->name, i);
> + return (int)rcnt < 0 ? (int)rcnt : -EIO;
> + }
> +
> + if (buf->sig != zone->buffer->sig) {
> + pr_debug("no valid data in oops zone %lu\n", i);
> + continue;
> + }
> +
> + if (zone->buffer_size < atomic_read(&buf->datalen)) {
> + pr_info("found overtop zone: %s: id %lu, off %lld, size %zu\n",
> + zone->name, i, zone->off,
> + zone->buffer_size);
> + continue;
> + }
> +
> + hdr = (struct psz_oops_header *)buf->data;
> + if (hdr->magic != OOPS_HEADER_MAGIC) {
> + pr_info("found invalid zone: %s: id %lu, off %lld, size %zu\n",
> + zone->name, i, zone->off,
> + zone->buffer_size);
> + continue;
> + }
> +
> + /*
> + * we get the newest zone, and the next one must be the oldest
> + * or unused zone, because we do write one by one like a circle.
> + */
> + if (hdr->time.tv_sec >= time.tv_sec) {
> + time.tv_sec = hdr->time.tv_sec;
> + cxt->oops_write_cnt = (i + 1) % cxt->oops_max_cnt;
> + }
> +
> + if (hdr->reason == KMSG_DUMP_OOPS)
> + cxt->oops_counter =
> + max(cxt->oops_counter, hdr->counter);
> + else
> + cxt->panic_counter =
> + max(cxt->panic_counter, hdr->counter);
> +
> + if (!atomic_read(&buf->datalen)) {
> + pr_debug("found erased zone: %s: id %lu, off %lld, size %zu, datalen %d\n",
> + zone->name, i, zone->off,
> + zone->buffer_size,
> + atomic_read(&buf->datalen));
> + continue;
> + }
> +
> + if (!is_on_panic())
> + zone->should_recover = true;
> + pr_debug("found nice zone: %s: id %lu, off %lld, size %zu, datalen %d\n",
> + zone->name, i, zone->off,
> + zone->buffer_size, atomic_read(&buf->datalen));
> + }
> +
> + return 0;
> +}
> +
> +static int psz_recover_oops(struct psz_context *cxt)
> +{
> + int ret;
> +
> + if (!cxt->opszs)
> + return 0;
> +
> + ret = psz_recover_oops_meta(cxt);
> + if (ret)
> + goto recover_fail;
> +
> + ret = psz_recover_oops_data(cxt);
> + if (ret)
> + goto recover_fail;
> +
> + return 0;
> +recover_fail:
> + pr_debug("recover oops failed\n");
> + return ret;
> +}
> +
> +/**
> + * psz_recovery() - recover data from storage
> + * @cxt: the context of pstore/zone
> + *
> + * recovery means reading data back from storage after rebooting
> + *
> + * Return: 0 on success, others on failure.
> + */
> +static inline int psz_recovery(struct psz_context *cxt)
> +{
> + int ret = -EBUSY;
> +
> + if (atomic_read(&cxt->recovered))
> + return 0;
> +
> + ret = psz_recover_oops(cxt);
> + if (ret)
> + goto recover_fail;
> +
> + pr_debug("recover end!\n");
> + atomic_set(&cxt->recovered, 1);
> + return 0;
> +
> +recover_fail:
> + pr_err("recover failed\n");
> + return ret;
> +}
> +
> +static int psz_pstore_open(struct pstore_info *psi)
> +{
> + struct psz_context *cxt = psi->data;
> +
> + cxt->oops_read_cnt = 0;
> + return 0;
> +}
> +
> +static inline bool psz_ok(struct pstore_zone *zone)
> +{
> + if (zone && zone->buffer && buffer_datalen(zone))
> + return true;
> + return false;
> +}
> +
> +static inline int psz_oops_erase(struct psz_context *cxt,
> + struct pstore_zone *zone, struct pstore_record *record)
> +{
> + struct psz_buffer *buffer = zone->buffer;
> + struct psz_oops_header *hdr =
> + (struct psz_oops_header *)buffer->data;
> +
> + if (unlikely(!psz_ok(zone)))
> + return 0;
> + /* this zone is already updated, no need to erase */
> + if (record->count != hdr->counter)
> + return 0;
> +
> + atomic_set(&zone->buffer->datalen, 0);
> + return psz_zone_write(zone, FLUSH_META, NULL, 0, 0);
> +}
> +
> +static int psz_pstore_erase(struct pstore_record *record)
> +{
> + struct psz_context *cxt = record->psi->data;
> +
> + switch (record->type) {
> + case PSTORE_TYPE_DMESG:
> + if (record->id >= cxt->oops_max_cnt)
> + return -EINVAL;
> + return psz_oops_erase(cxt, cxt->opszs[record->id], record);
> + default:
> + return -EINVAL;
> + }
> +}
> +
> +static void psz_write_kmsg_hdr(struct pstore_zone *zone,
> + struct pstore_record *record)
> +{
> + struct psz_context *cxt = record->psi->data;
> + struct psz_buffer *buffer = zone->buffer;
> + struct psz_oops_header *hdr =
> + (struct psz_oops_header *)buffer->data;
> +
> + hdr->magic = OOPS_HEADER_MAGIC;
> + hdr->compressed = record->compressed;
> + hdr->time.tv_sec = record->time.tv_sec;
> + hdr->time.tv_nsec = record->time.tv_nsec;
> + hdr->reason = record->reason;
> + if (hdr->reason == KMSG_DUMP_OOPS)
> + hdr->counter = ++cxt->oops_counter;
> + else
> + hdr->counter = ++cxt->panic_counter;
> +}
> +
> +static inline int notrace psz_oops_write_record(struct psz_context *cxt,
> + struct pstore_record *record)
> +{
> + size_t size, hlen;
> + struct pstore_zone *zone;
> + unsigned int zonenum;
> +
> + zonenum = cxt->oops_write_cnt;
> + zone = cxt->opszs[zonenum];
> + if (unlikely(!zone))
> + return -ENOSPC;
> + cxt->oops_write_cnt = (zonenum + 1) % cxt->oops_max_cnt;
> +
> + pr_debug("write %s to zone id %d\n", zone->name, zonenum);
> + psz_write_kmsg_hdr(zone, record);
> + hlen = sizeof(struct psz_oops_header);
> + size = min_t(size_t, record->size, zone->buffer_size - hlen);
> + return psz_zone_write(zone, FLUSH_ALL, record->buf, size, hlen);
> +}
> +
> +static int notrace psz_oops_write(struct psz_context *cxt,
> + struct pstore_record *record)
> +{
> + int ret;
> +
> + /*
> + * Explicitly only take the first part of any new crash.
> + * If our buffer is larger than kmsg_bytes, this can never happen,
> + * and if our buffer is smaller than kmsg_bytes, we don't want the
> + * report split across multiple records.
> + */
> + if (record->part != 1)
> + return -ENOSPC;
> +
> + if (!cxt->opszs)
> + return -ENOSPC;
> +
> + ret = psz_oops_write_record(cxt, record);
> + if (!ret) {
> + pr_debug("try to flush other dirty oops zones\n");
> + psz_flush_dirty_zones(cxt->opszs, cxt->oops_max_cnt);
> + }
> +
> + /* always return 0 as we had handled it on buffer */
> + return 0;
> +}
> +
> +static int notrace psz_pstore_write(struct pstore_record *record)
> +{
> + struct psz_context *cxt = record->psi->data;
> +
> + if (record->type == PSTORE_TYPE_DMESG &&
> + record->reason == KMSG_DUMP_PANIC)
> + atomic_set(&cxt->on_panic, 1);
> +
> + switch (record->type) {
> + case PSTORE_TYPE_DMESG:
> + return psz_oops_write(cxt, record);
> + default:
> + return -EINVAL;
> + }
> +}
> +
> +static struct pstore_zone *psz_read_next_zone(struct psz_context *cxt)
> +{
> + struct pstore_zone *zone = NULL;
> +
> + while (cxt->oops_read_cnt < cxt->oops_max_cnt) {
> + zone = cxt->opszs[cxt->oops_read_cnt++];
> + if (psz_ok(zone))
> + return zone;
> + }
> +
> + return NULL;
> +}
> +
> +static int psz_read_oops_hdr(struct pstore_zone *zone,
> + struct pstore_record *record)
> +{
> + struct psz_buffer *buffer = zone->buffer;
> + struct psz_oops_header *hdr =
> + (struct psz_oops_header *)buffer->data;
> +
> + if (hdr->magic != OOPS_HEADER_MAGIC)
> + return -EINVAL;
> + record->compressed = hdr->compressed;
> + record->time.tv_sec = hdr->time.tv_sec;
> + record->time.tv_nsec = hdr->time.tv_nsec;
> + record->reason = hdr->reason;
> + record->count = hdr->counter;
> + return 0;
> +}
> +
> +static ssize_t psz_oops_read(struct pstore_zone *zone,
> + struct pstore_record *record)
> +{
> + ssize_t size, hlen = 0;
> +
> + size = buffer_datalen(zone);
> + /* Clear and skip this oops record if it has no valid header */
> + if (psz_read_oops_hdr(zone, record)) {
> + atomic_set(&zone->buffer->datalen, 0);
> + atomic_set(&zone->dirty, 0);
> + return -ENOMSG;
> + }
> + size -= sizeof(struct psz_oops_header);
> +
> + if (!record->compressed) {
> + char *buf = kasprintf(GFP_KERNEL, "%s: Total %d times\n",
> + kmsg_dump_reason_str(record->reason),
> + record->count);
> + hlen = strlen(buf);
> + record->buf = krealloc(buf, hlen + size, GFP_KERNEL);
> + if (!record->buf) {
> + kfree(buf);
> + return -ENOMEM;
> + }
> + } else {
> + record->buf = kmalloc(size, GFP_KERNEL);
> + if (!record->buf)
> + return -ENOMEM;
> + }
> +
> + size = psz_zone_read(zone, record->buf + hlen, size,
> + sizeof(struct psz_oops_header) < 0);
Here should be:
sizeof(struct psz_oops_header));
That's the reason why all the compressed files were failing to
decompress.
> + if (unlikely(size < 0)) {
> + kfree(record->buf);
> + return -ENOMSG;
> + }
> +
> + return size + hlen;
> +}
> +
> +static ssize_t psz_pstore_read(struct pstore_record *record)
> +{
> + struct psz_context *cxt = record->psi->data;
> + ssize_t (*readop)(struct pstore_zone *zone,
> + struct pstore_record *record);
> + struct pstore_zone *zone;
> + ssize_t ret;
> +
> + /* before read, we must recover from storage */
> + ret = psz_recovery(cxt);
> + if (ret)
> + return ret;
> +
> +next_zone:
> + zone = psz_read_next_zone(cxt);
> + if (!zone)
> + return 0;
> +
> + record->type = zone->type;
> + switch (record->type) {
> + case PSTORE_TYPE_DMESG:
> + readop = psz_oops_read;
> + record->id = cxt->oops_read_cnt - 1;
> + break;
> + default:
> + goto next_zone;
> + }
> +
> + ret = readop(zone, record);
> + if (ret == -ENOMSG)
> + goto next_zone;
> + return ret;
> +}
> +
> +static struct psz_context psz_cxt = {
> + .pstore_zone_info_lock = __MUTEX_INITIALIZER(psz_cxt.pstore_zone_info_lock),
> + .recovered = ATOMIC_INIT(0),
> + .on_panic = ATOMIC_INIT(0),
> + .pstore = {
> + .owner = THIS_MODULE,
> + .name = MODNAME,
> + .open = psz_pstore_open,
> + .read = psz_pstore_read,
> + .write = psz_pstore_write,
> + .erase = psz_pstore_erase,
> + },
> +};
> +
> +static struct pstore_zone *psz_init_zone(enum pstore_type_id type,
> + loff_t *off, size_t size)
> +{
> + struct pstore_zone_info *info = psz_cxt.pstore_zone_info;
> + struct pstore_zone *zone;
> + const char *name = pstore_type_to_name(type);
> +
> + if (!size)
> + return NULL;
> +
> + if (*off + size > info->total_size) {
> + pr_err("no room for %s (0x%zx@0x%llx over 0x%lx)\n",
> + name, size, *off, info->total_size);
> + return ERR_PTR(-ENOMEM);
> + }
> +
> + zone = kzalloc(sizeof(struct pstore_zone), GFP_KERNEL);
> + if (!zone)
> + return ERR_PTR(-ENOMEM);
> +
> + zone->buffer = kmalloc(size, GFP_KERNEL);
> + if (!zone->buffer) {
> + kfree(zone);
> + return ERR_PTR(-ENOMEM);
> + }
> + memset(zone->buffer, 0xFF, size);
> + zone->off = *off;
> + zone->name = name;
> + zone->type = type;
> + zone->buffer_size = size - sizeof(struct psz_buffer);
> + zone->buffer->sig = type ^ PSZ_SIG;
> + atomic_set(&zone->dirty, 0);
> + atomic_set(&zone->buffer->datalen, 0);
> +
> + *off += size;
> +
> + pr_debug("pszone %s: off 0x%llx, %zu header, %zu data\n", zone->name,
> + zone->off, sizeof(*zone->buffer), zone->buffer_size);
> + return zone;
> +}
> +
> +static struct pstore_zone **psz_init_zones(enum pstore_type_id type,
> + loff_t *off, size_t total_size, ssize_t record_size,
> + unsigned int *cnt)
> +{
> + struct pstore_zone_info *info = psz_cxt.pstore_zone_info;
> + struct pstore_zone **zones, *zone;
> + const char *name = pstore_type_to_name(type);
> + int c, i;
> +
> + if (!total_size || !record_size)
> + return NULL;
> +
> + if (*off + total_size > info->total_size) {
> + pr_err("no room for zones %s (0x%zx@0x%llx over 0x%lx)\n",
> + name, total_size, *off, info->total_size);
> + return ERR_PTR(-ENOMEM);
> + }
> +
> + c = total_size / record_size;
> + zones = kcalloc(c, sizeof(*zones), GFP_KERNEL);
> + if (!zones) {
> + pr_err("allocate for zones %s failed\n", name);
> + return ERR_PTR(-ENOMEM);
> + }
> + memset(zones, 0, c * sizeof(*zones));
> +
> + for (i = 0; i < c; i++) {
> + zone = psz_init_zone(type, off, record_size);
> + if (!zone || IS_ERR(zone)) {
> + pr_err("initialize zones %s failed\n", name);
> + while (--i >= 0) {
> + kfree(zones[i]->buffer);
> + kfree(zones[i]);
> + }
> + kfree(zones);
> + return (void *)zone;
> + }
> + zones[i] = zone;
> + }
> +
> + *cnt = c;
> + return zones;
> +}
> +
> +static void psz_free_zone(struct pstore_zone **pszone)
> +{
> + struct pstore_zone *zone = *pszone;
> +
> + if (!zone)
> + return;
> +
> + kfree(zone->buffer);
> + kfree(zone);
> + *pszone = NULL;
> +}
> +
> +static void psz_free_zones(struct pstore_zone ***pszones, unsigned int *cnt)
> +{
> + struct pstore_zone **zones = *pszones;
> +
> + if (!zones)
> + return;
> +
> + while (*cnt > 0) {
> + psz_free_zone(&zones[*cnt]);
> + (*cnt)--;
> + }
> + kfree(zones);
> + *pszones = NULL;
> +}
> +
> +static void psz_free_all_zones(struct psz_context *cxt)
> +{
> + if (cxt->opszs)
> + psz_free_zones(&cxt->opszs, &cxt->oops_max_cnt);
> +}
> +
> +static int psz_alloc_zones(struct psz_context *cxt)
> +{
> + struct pstore_zone_info *info = cxt->pstore_zone_info;
> + loff_t off = 0;
> + int err;
> + size_t size;
> +
> + size = info->total_size;
> + cxt->opszs = psz_init_zones(PSTORE_TYPE_DMESG, &off, size,
> + info->kmsg_size, &cxt->oops_max_cnt);
> + if (IS_ERR(cxt->opszs)) {
> + err = PTR_ERR(cxt->opszs);
> + goto fail_out;
> + }
> +
> + return 0;
> +fail_out:
> + return err;
> +}
> +
> +/**
> + * register_pstore_zone() - register to pstore/zone
> + *
> + * @info: back-end driver information. See &struct pstore_zone_info.
> + *
> + * Only one back-end at one time.
> + *
> + * Return: 0 on success, others on failure.
> + */
> +int register_pstore_zone(struct pstore_zone_info *info)
> +{
> + int err = -EINVAL;
> + struct psz_context *cxt = &psz_cxt;
> +
> + if (!info->total_size) {
> + pr_warn("the total size must be non-zero\n");
> + return -EINVAL;
> + }
> +
> + if (!info->kmsg_size) {
> + pr_warn("at least one of the records be non-zero\n");
> + return -EINVAL;
> + }
> +
> + if (!info->name || !info->name[0])
> + return -EINVAL;
> +
> + if (info->total_size < 4096) {
> + pr_err("total size must be greater than 4096 bytes\n");
> + return -EINVAL;
> + }
> +
> +#define check_size(name, size) { \
> + if (info->name > 0 && info->name < (size)) { \
> + pr_err(#name " must be over %d\n", (size)); \
> + return -EINVAL; \
> + } \
> + if (info->name & (size - 1)) { \
> + pr_err(#name " must be a multiple of %d\n", \
> + (size)); \
> + return -EINVAL; \
> + } \
> + }
> +
> + check_size(total_size, 4096);
> + check_size(kmsg_size, SECTOR_SIZE);
> +
> +#undef check_size
> +
> + /*
> + * the @read and @write must be applied.
> + * if no @read, pstore may mount failed.
> + * if no @write, pstore do not support to remove record file.
> + */
> + if (!info->read || !info->write) {
> + pr_err("no valid general read/write interface\n");
> + return -EINVAL;
> + }
> +
> + mutex_lock(&cxt->pstore_zone_info_lock);
> + if (cxt->pstore_zone_info) {
> + pr_warn("'%s' already loaded: ignoring '%s'\n",
> + cxt->pstore_zone_info->name, info->name);
> + mutex_unlock(&cxt->pstore_zone_info_lock);
> + return -EBUSY;
> + }
> + cxt->pstore_zone_info = info;
> + mutex_unlock(&cxt->pstore_zone_info_lock);
> +
> + pr_debug("register %s with properties:\n", info->name);
> + pr_debug("\ttotal size : %ld Bytes\n", info->total_size);
> + pr_debug("\toops size : %ld Bytes\n", info->kmsg_size);
> +
> + err = psz_alloc_zones(cxt);
> + if (err) {
> + pr_err("alloc zones failed\n");
> + goto fail_out;
> + }
> +
> + if (info->kmsg_size) {
> + cxt->pstore.bufsize = cxt->opszs[0]->buffer_size -
> + sizeof(struct psz_oops_header);
> + cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL);
> + if (!cxt->pstore.buf) {
> + err = -ENOMEM;
> + goto free_all_zones;
> + }
> + }
> + cxt->pstore.data = cxt;
> +
> + pr_info("registered %s as backend for", info->name);
> + cxt->pstore.max_reason = info->max_reason;
> + if (info->kmsg_size) {
> + cxt->pstore.flags |= PSTORE_FLAGS_DMESG;
> + pr_cont(" kmsg(%s",
> + kmsg_dump_reason_str(cxt->pstore.max_reason));
> + if (cxt->pstore_zone_info->panic_write)
> + pr_cont(",panic_write");
> + pr_cont(")");
> + }
> + pr_cont("\n");
> +
> + err = pstore_register(&cxt->pstore);
> + if (err) {
> + pr_err("registering with pstore failed\n");
> + goto free_pstore_buf;
> + }
> +
> + return 0;
> +
> +free_pstore_buf:
> + kfree(cxt->pstore.buf);
> +free_all_zones:
> + psz_free_all_zones(cxt);
> +fail_out:
> + mutex_lock(&psz_cxt.pstore_zone_info_lock);
> + psz_cxt.pstore_zone_info = NULL;
> + mutex_unlock(&psz_cxt.pstore_zone_info_lock);
> + return err;
> +}
> +EXPORT_SYMBOL_GPL(register_pstore_zone);
> +
> +/**
> + * unregister_pstore_zone() - unregister to pstore/zone
> + *
> + * @info: back-end driver information. See struct pstore_zone_info.
> + */
> +void unregister_pstore_zone(struct pstore_zone_info *info)
> +{
> + struct psz_context *cxt = &psz_cxt;
> +
> + pstore_unregister(&cxt->pstore);
> + kfree(cxt->pstore.buf);
> + cxt->pstore.bufsize = 0;
> +
> + mutex_lock(&cxt->pstore_zone_info_lock);
> + cxt->pstore_zone_info = NULL;
> + mutex_unlock(&cxt->pstore_zone_info_lock);
> +
> + psz_free_all_zones(cxt);
> +}
> +EXPORT_SYMBOL_GPL(unregister_pstore_zone);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("WeiXiong Liao <liaoweixiong@xxxxxxxxxxxxxxxxx>");
> +MODULE_DESCRIPTION("Storage Manager for pstore/blk");
> diff --git a/include/linux/pstore_zone.h b/include/linux/pstore_zone.h
> new file mode 100644
> index 000000000000..a6a79ff1351b
> --- /dev/null
> +++ b/include/linux/pstore_zone.h
> @@ -0,0 +1,44 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef __PSTORE_ZONE_H_
> +#define __PSTORE_ZONE_H_
> +
> +#include <linux/types.h>
> +
> +typedef ssize_t (*psz_read_op)(char *, size_t, loff_t);
> +typedef ssize_t (*psz_write_op)(const char *, size_t, loff_t);
> +/**
> + * struct pstore_zone_info - pstore/zone back-end driver structure
> + *
> + * @owner: Module which is responsible for this back-end driver.
> + * @name: Name of the back-end driver.
> + * @total_size: The total size in bytes pstore/zone can use. It must be greater
> + * than 4096 and be multiple of 4096.
> + * @kmsg_size: The size of oops/panic zone. Zero means disabled, otherwise,
> + * it must be multiple of SECTOR_SIZE(512 Bytes).
> + * @max_reason: Maximum kmsg dump reason to store.
> + * @read: The general read operation. Both of the function parameters
> + * @size and @offset are relative value to storage.
> + * On success, the number of bytes should be returned, others
> + * means error.
> + * @write: The same as @read.
> + * @panic_write:The write operation only used for panic case. It's optional
> + * if you do not care panic log. The parameters and return value
> + * are the same as @read.
> + */
> +struct pstore_zone_info {
> + struct module *owner;
> + const char *name;
> +
> + unsigned long total_size;
> + unsigned long kmsg_size;
> + int max_reason;
> + psz_read_op read;
> + psz_write_op write;
> + psz_write_op panic_write;
> +};
> +
> +extern int register_pstore_zone(struct pstore_zone_info *info);
> +extern void unregister_pstore_zone(struct pstore_zone_info *info);
> +
> +#endif
>
I will try to send v5 as soon as possable.
--
WeiXiong Liao