Re: [PATCH v7 3/4] remoteproc: qcom: Add capability to collect minidumps

From: Bjorn Andersson
Date: Wed Nov 18 2020 - 11:50:19 EST


On Tue 03 Nov 03:19 CST 2020, Siddharth Gupta wrote:

> This patch adds support for collecting minidump in the event of remoteproc
> crash. Parse the minidump table based on remoteproc's unique minidump-id,
> read all memory regions from the remoteproc's minidump table entry and
> expose the memory to userspace. The remoteproc platform driver can choose
> to collect a full/mini dump by specifying the coredump op.
>
> Co-developed-by: Rishabh Bhatnagar <rishabhb@xxxxxxxxxxxxxx>
> Signed-off-by: Rishabh Bhatnagar <rishabhb@xxxxxxxxxxxxxx>
> Co-developed-by: Gurbir Arora <gurbaror@xxxxxxxxxxxxxx>
> Signed-off-by: Gurbir Arora <gurbaror@xxxxxxxxxxxxxx>
> Signed-off-by: Siddharth Gupta <sidgup@xxxxxxxxxxxxxx>
> ---
> drivers/remoteproc/qcom_minidump.h | 64 +++++++++++++++++++++++
> drivers/remoteproc/qcom_q6v5_pas.c | 104 ++++++++++++++++++++++++++++++++++++-
> 2 files changed, 166 insertions(+), 2 deletions(-)
> create mode 100644 drivers/remoteproc/qcom_minidump.h
>
> diff --git a/drivers/remoteproc/qcom_minidump.h b/drivers/remoteproc/qcom_minidump.h

This only needs to live in a header file if it's going to be accessed
from more than one C file.

> new file mode 100644
> index 0000000..5857d06
> --- /dev/null
> +++ b/drivers/remoteproc/qcom_minidump.h
> @@ -0,0 +1,64 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2020, The Linux Foundation. All rights reserved.
> + */
> +
> +#ifndef __QCOM_MINIDUMP_H
> +#define __QCOM_MINIDUMP_H
> +
> +#define MAX_NUM_OF_SS 10
> +#define MAX_REGION_NAME_LENGTH 16
> +#define SBL_MINIDUMP_SMEM_ID 602
> +#define MD_REGION_VALID ('V' << 24 | 'A' << 16 | 'L' << 8 | 'I' << 0)
> +#define MD_SS_ENCR_DONE ('D' << 24 | 'O' << 16 | 'N' << 8 | 'E' << 0)
> +#define MD_SS_ENABLED ('E' << 24 | 'N' << 16 | 'B' << 8 | 'L' << 0)
> +
> +/**
> + * struct minidump_region - Minidump region
> + * @name : Name of the region to be dumped
> + * @seq_num: : Use to differentiate regions with same name.
> + * @valid : This entry to be dumped (if set to 1)
> + * @address : Physical address of region to be dumped
> + * @size : Size of the region
> + */
> +struct minidump_region {
> + char name[MAX_REGION_NAME_LENGTH];
> + __le32 seq_num;
> + __le32 valid;
> + __le64 address;
> + __le64 size;
> +};
> +
> +/**
> + * struct minidump_subsystem_toc: Subsystem's SMEM Table of content
> + * @status : Subsystem toc init status
> + * @enabled : if set to 1, this region would be copied during coredump
> + * @encryption_status: Encryption status for this subsystem
> + * @encryption_required : Decides to encrypt the subsystem regions or not
> + * @ss_region_count : Number of regions added in this subsystem toc
> + * @md_ss_smem_regions_baseptr : regions base pointer of the subsystem
> + */
> +struct minidump_subsystem_toc {
> + __le32 status;
> + __le32 enabled;
> + __le32 encryption_status;
> + __le32 encryption_required;
> + __le32 ss_region_count;

Please drop the "ss_" prefix.

> + __le64 md_ss_smem_regions_baseptr;

Please drop the "md_ss_smem_" prefix.

> +};
> +
> +/**
> + * struct minidump_global_toc: Global Table of Content
> + * @md_toc_init : Global Minidump init status
> + * @md_revision : Minidump revision
> + * @md_enable_status : Minidump enable status
> + * @md_ss_toc : Array of subsystems toc
> + */
> +struct minidump_global_toc {
> + __le32 status;
> + __le32 md_revision;
> + __le32 enabled;
> + struct minidump_subsystem_toc md_ss_toc[MAX_NUM_OF_SS];

How about "subsystems" and how about dropping the "_toc" suffix on the
type?

> +};
> +
> +#endif
> diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c
> index 3837f23..349f725 100644
> --- a/drivers/remoteproc/qcom_q6v5_pas.c
> +++ b/drivers/remoteproc/qcom_q6v5_pas.c
> @@ -28,11 +28,13 @@
> #include "qcom_pil_info.h"
> #include "qcom_q6v5.h"
> #include "remoteproc_internal.h"
> +#include "qcom_minidump.h"
>
> struct adsp_data {
> int crash_reason_smem;
> const char *firmware_name;
> int pas_id;
> + unsigned int minidump_id;
> bool has_aggre2_clk;
> bool auto_boot;
>
> @@ -63,6 +65,7 @@ struct qcom_adsp {
> int proxy_pd_count;
>
> int pas_id;
> + unsigned int minidump_id;
> int crash_reason_smem;
> bool has_aggre2_clk;
> const char *info_name;
> @@ -116,6 +119,88 @@ static void adsp_pds_disable(struct qcom_adsp *adsp, struct device **pds,
> }
> }
>
> +static void adsp_minidump_cleanup(struct rproc *rproc)
> +{
> + struct rproc_dump_segment *entry, *tmp;
> +
> + list_for_each_entry_safe(entry, tmp, &rproc->dump_segments, node) {
> + list_del(&entry->node);
> + kfree(entry->priv);
> + kfree(entry);
> + }
> +}
> +
> +static void adsp_add_minidump_segments(struct rproc *rproc,
> + struct minidump_subsystem_toc *minidump_ss)
> +{
> + struct minidump_region __iomem *ptr;
> + struct minidump_region region;
> + int seg_cnt, i;
> + dma_addr_t da;
> + size_t size;
> + char *name;
> +
> + if (!list_empty(&rproc->dump_segments)) {

if (WARN_ON(!list_empty(&rproc->dump_segments)))

Because this would only happen if we have a bug somewhere that leaves
items lingering on the dump_segments list.

> + dev_err(&rproc->dev, "dump segment list already populated\n");
> + return;
> + }
> +
> + seg_cnt = le32_to_cpu(minidump_ss->ss_region_count);
> + ptr = ioremap((unsigned long)le64_to_cpu(minidump_ss->md_ss_smem_regions_baseptr),
> + seg_cnt * sizeof(struct minidump_region));
> +
> + if (!ptr)
> + return;
> +
> + for (i = 0; i < seg_cnt; i++) {
> + memcpy_fromio(&region, ptr + i, sizeof(region));
> + if (region.valid == MD_REGION_VALID) {
> + name = kmalloc(MAX_REGION_NAME_LENGTH, GFP_KERNEL);
> + strlcpy(name, region.name, MAX_REGION_NAME_LENGTH);

Please use kstrdup() and don't forget to check for (and handle)
allocation failures.

> + da = le64_to_cpu(region.address);
> + size = le32_to_cpu(region.size);
> + rproc_coredump_add_custom_segment(rproc, da, size, NULL, name);
> + }
> + }
> +
> + iounmap(ptr);
> +}
> +
> +static void adsp_dump(struct rproc *rproc)

Here I think it makes sense to spell the name out as adsp_minidump().

That said, the only thing I see specific to this driver here is the
use of adsp->minidump_id, so how about moving all this to qcom_common.c
and just calling qcom_minidump(rproc, adsp->minidump_id); from here?

That way we can easily integrate it in the other remoteprocs as needed
later.

> +{
> + struct qcom_adsp *adsp = rproc->priv;
> + struct minidump_subsystem_toc *minidump_ss;
> + struct minidump_global_toc *minidump_toc;

How about just naming this "toc" and minidump_ss just "minidump"?

> +
> + /* Get Global minidump ToC*/
> + minidump_toc = qcom_smem_get(QCOM_SMEM_HOST_ANY, SBL_MINIDUMP_SMEM_ID, NULL);
> +
> + /* check if global table pointer exists and init is set */
> + if (IS_ERR(minidump_toc) || !minidump_toc->status) {
> + dev_err(&rproc->dev, "SMEM is not initialized.\n");

"Minidump TOC not found in SMEM\n"

> + return;
> + }
> +
> + /* Get subsystem table of contents using the minidump id */
> + minidump_ss = &minidump_toc->md_ss_toc[adsp->minidump_id];
> +
> + /**
> + * Collect minidump if SS ToC is valid and segment table
> + * is initialized in memory and encryption status is set.
> + */
> + if (minidump_ss->md_ss_smem_regions_baseptr == 0 ||
> + le32_to_cpu(minidump_ss->status) != 1 ||
> + le32_to_cpu(minidump_ss->enabled) != MD_SS_ENABLED ||
> + le32_to_cpu(minidump_ss->encryption_status) != MD_SS_ENCR_DONE) {
> + dev_err(&rproc->dev, "Minidump not ready!! Aborting\n");

"Minidump not ready, skipping\n"

> + return;
> + }
> +
> + adsp_add_minidump_segments(rproc, minidump_ss);
> + rproc_minidump(rproc);
> + adsp_minidump_cleanup(rproc);
> +}
> +
> static int adsp_load(struct rproc *rproc, const struct firmware *fw)
> {
> struct qcom_adsp *adsp = (struct qcom_adsp *)rproc->priv;
> @@ -258,6 +343,15 @@ static const struct rproc_ops adsp_ops = {
> .panic = adsp_panic,
> };
>
> +static const struct rproc_ops adsp_minidump_ops = {
> + .start = adsp_start,
> + .stop = adsp_stop,
> + .da_to_va = adsp_da_to_va,
> + .load = adsp_load,
> + .panic = adsp_panic,
> + .coredump = adsp_dump,
> +};
> +
> static int adsp_init_clock(struct qcom_adsp *adsp)
> {
> int ret;
> @@ -398,8 +492,13 @@ static int adsp_probe(struct platform_device *pdev)
> if (ret < 0 && ret != -EINVAL)
> return ret;
>
> - rproc = rproc_alloc(&pdev->dev, pdev->name, &adsp_ops,
> - fw_name, sizeof(*adsp));
> + if (desc->minidump_id)

Please use a local variable to reference adsp_minidump_ops vs adsp_ops,
instead of making the whole rproc_alloc() call conditional.

Regards,
Bjorn

> + rproc = rproc_alloc(&pdev->dev, pdev->name, &adsp_minidump_ops, fw_name,
> + sizeof(*adsp));
> + else
> + rproc = rproc_alloc(&pdev->dev, pdev->name, &adsp_ops, fw_name,
> + sizeof(*adsp));
> +
> if (!rproc) {
> dev_err(&pdev->dev, "unable to allocate remoteproc\n");
> return -ENOMEM;
> @@ -411,6 +510,7 @@ static int adsp_probe(struct platform_device *pdev)
> adsp = (struct qcom_adsp *)rproc->priv;
> adsp->dev = &pdev->dev;
> adsp->rproc = rproc;
> + adsp->minidump_id = desc->minidump_id;
> adsp->pas_id = desc->pas_id;
> adsp->has_aggre2_clk = desc->has_aggre2_clk;
> adsp->info_name = desc->sysmon_name;
> --
> Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project
>