Re: [PATCH v6 5/6] iommu/arm-smmu-v3: Add in-kernel support for NVIDIA Tegra241 (Grace) CMDQV

From: Jason Gunthorpe
Date: Tue Apr 30 2024 - 12:37:44 EST


On Mon, Apr 29, 2024 at 09:43:48PM -0700, Nicolin Chen wrote:

> static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu)
> {
> + if (smmu->tegra241_cmdqv)
> + return tegra241_cmdqv_get_cmdq(smmu);

Since it is compile time optional it would make some sense to optimize
this (in all the places) too:

if (arm_smmu_has_smmu_tegra241_cmdqv(smmu))
[..]

static inline bool arm_smmu_has_smmu_tegra241_cmdqv(struct arm_smmu_device *smmu)
{
return IS_ENABLED(CONFIG_TEGRA241_CMDQV) && smmu->tegra241_cmdqv;
}

> @@ -3105,12 +3108,10 @@ static struct iommu_ops arm_smmu_ops = {
> };
>
> /* Probing and initialisation functions */
> -static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
> - struct arm_smmu_queue *q,
> - void __iomem *page,
> - unsigned long prod_off,
> - unsigned long cons_off,
> - size_t dwords, const char *name)
> +int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
> + struct arm_smmu_queue *q, void __iomem *page,
> + unsigned long prod_off, unsigned long cons_off,
> + size_t dwords, const char *name)
> {
> size_t qsz;

This hunk and the .h file part should be moved to the prior patch that
is de-exporting things.

> +/* MMIO helpers */
> +#define cmdqv_readl(reg) \
> + readl(cmdqv->base + TEGRA241_CMDQV_##reg)
> +#define cmdqv_readl_relaxed(reg) \
> + readl_relaxed(cmdqv->base + TEGRA241_CMDQV_##reg)
> +#define cmdqv_writel(val, reg) \
> + writel((val), cmdqv->base + TEGRA241_CMDQV_##reg)
> +#define cmdqv_writel_relaxed(val, reg) \
> + writel_relaxed((val), cmdqv->base + TEGRA241_CMDQV_##reg)

Please don't hide access to a stack variable in a macro, and I'm not
keen on the ##reg scheme either - it makes it much harder to search
for things.

Really this all seems like alot of overkill to make a little bit of
shorthand. It is not so wordy just to type it out:

readl(vintf->base + TEGRA241_VINTF_CONFIG)

> +/* Logging helpers */
> +#define cmdqv_warn(fmt, ...) \
> + dev_warn(cmdqv->dev, "CMDQV: " fmt, ##__VA_ARGS__)
> +#define cmdqv_err(fmt, ...) \
> + dev_err(cmdqv->dev, "CMDQV: " fmt, ##__VA_ARGS__)
> +#define cmdqv_info(fmt, ...) \
> + dev_info(cmdqv->dev, "CMDQV: " fmt, ##__VA_ARGS__)
> +#define cmdqv_dbg(fmt, ...) \
> + dev_dbg(cmdqv->dev, "CMDQV: " fmt, ##__VA_ARGS__)

Really not sure these are necessary, same remark about the stack
variable.

Also cmdqv->dev is the wrong thing to print, this is part of the smmu driver,
it should print cmdqv->smmu->dev for consistency

> +#define vintf_warn(fmt, ...) \
> + dev_warn(vintf->cmdqv->dev, "VINTF%u: " fmt, vintf->idx, ##__VA_ARGS__)
> +#define vintf_err(fmt, ...) \
> + dev_err(vintf->cmdqv->dev, "VINTF%u: " fmt, vintf->idx, ##__VA_ARGS__)
> +#define vintf_info(fmt, ...) \
> + dev_info(vintf->cmdqv->dev, "VINTF%u: " fmt, vintf->idx, ##__VA_ARGS__)
> +#define vintf_dbg(fmt, ...) \
> + dev_dbg(vintf->cmdqv->dev, "VINTF%u: " fmt, vintf->idx, ##__VA_ARGS__)
> +
> +#define vcmdq_warn(fmt, ...) \
> + ({ \
> + struct tegra241_vintf *vintf = vcmdq->vintf; \
> + if (vintf) \
> + vintf_warn("VCMDQ%u/LVCMDQ%u: " fmt, \
> + vcmdq->idx, vcmdq->lidx, \
> + ##__VA_ARGS__); \
> + else \
> + dev_warn(vcmdq->cmdqv->dev, "VCMDQ%u: " fmt, \
> + vcmdq->idx, ##__VA_ARGS__); \
> + })
> +#define vcmdq_err(fmt, ...) \
> + ({ \
> + struct tegra241_vintf *vintf = vcmdq->vintf; \
> + if (vintf) \
> + vintf_err("VCMDQ%u/LVCMDQ%u: " fmt, \
> + vcmdq->idx, vcmdq->lidx, \
> + ##__VA_ARGS__); \
> + else \
> + dev_err(vcmdq->cmdqv->dev, "VCMDQ%u: " fmt, \
> + vcmdq->idx, ##__VA_ARGS__); \
> + })
> +#define vcmdq_info(fmt, ...) \
> + ({ \
> + struct tegra241_vintf *vintf = vcmdq->vintf; \
> + if (vintf) \
> + vintf_info("VCMDQ%u/LVCMDQ%u: " fmt, \
> + vcmdq->idx, vcmdq->lidx, \
> + ##__VA_ARGS__); \
> + else \
> + dev_info(vcmdq->cmdqv->dev, "VCMDQ%u: " fmt, \
> + vcmdq->idx, ##__VA_ARGS__); \
> + })
> +#define vcmdq_dbg(fmt, ...) \
> + ({ \
> + struct tegra241_vintf *vintf = vcmdq->vintf; \
> + if (vintf) \
> + vintf_dbg("VCMDQ%u/LVCMDQ%u: " fmt, \
> + vcmdq->idx, vcmdq->lidx, \
> + ##__VA_ARGS__); \
> + else \
> + dev_dbg(vcmdq->cmdqv->dev, "VCMDQ%u: " fmt, \
> + vcmdq->idx, ##__VA_ARGS__); \
> + })

Some of these are barely used, is it worth all these macros??

> +
> +/* Configuring and polling helpers */
> +#define tegra241_cmdqv_write_config(_owner, _OWNER, _regval) \
> + ({ \
> + bool _en = (_regval) & _OWNER##_EN; \
> + u32 _status; \
> + int _ret; \
> + writel((_regval), _owner->base + TEGRA241_##_OWNER##_CONFIG); \
> + _ret = readl_poll_timeout( \
> + _owner->base + TEGRA241_##_OWNER##_STATUS, _status, \
> + _en ? (_regval) & _OWNER##_ENABLED : \
> + !((_regval) & _OWNER##_ENABLED), \
> + 1, ARM_SMMU_POLL_TIMEOUT_US); \
> + if (_ret) \
> + _owner##_err("failed to %sable, STATUS = 0x%08X\n", \
> + _en ? "en" : "dis", _status); \
> + atomic_set(&_owner->status, _status); \
> + _ret; \
> + })

I feel like this could be an actual inline function without the macro
wrapper with a little fiddling.

> +
> +#define cmdqv_write_config(_regval) \
> + tegra241_cmdqv_write_config(cmdqv, CMDQV, _regval)
> +#define vintf_write_config(_regval) \
> + tegra241_cmdqv_write_config(vintf, VINTF, _regval)
> +#define vcmdq_write_config(_regval) \
> + tegra241_cmdqv_write_config(vcmdq, VCMDQ, _regval)

More hidden access to stack values

> +/**
> + * struct tegra241_cmdqv - CMDQ-V for SMMUv3
> + * @smmu: SMMUv3 pointer
> + * @dev: Device pointer

This should probably be clarified as the device pointer to the ACPI
companion device

> +static void tegra241_cmdqv_handle_vintf0_error(struct tegra241_cmdqv *cmdqv)
> +{
> + struct tegra241_vintf *vintf = cmdqv->vintfs[0];
> + int i;
> +
> + /* Cache status to bypass VCMDQs until error is recovered */
> + atomic_set(&vintf->status, vintf_readl(STATUS));
> +
> + for (i = 0; i < 4; i++) {
> + u32 lvcmdq_err_map = vintf_readl_relaxed(CMDQ_ERR_MAP(i));
> +
> + while (lvcmdq_err_map) {
> + int lidx = ffs(lvcmdq_err_map) - 1;
> + struct tegra241_vcmdq *vcmdq = vintf->vcmdqs[lidx];
> + u32 gerrorn, gerror;
> +
> + lvcmdq_err_map &= ~BIT(lidx);
> +
> + __arm_smmu_cmdq_skip_err(cmdqv->smmu, &vcmdq->cmdq.q);
> +
> + gerrorn = vcmdq_page0_readl_relaxed(GERRORN);
> + gerror = vcmdq_page0_readl_relaxed(GERROR);
> +
> + vcmdq_page0_writel(gerror, GERRORN);
> + }
> + }
> +
> + /* Now error status should be clean, cache it again */
> + atomic_set(&vintf->status, vintf_readl(STATUS));
> +}
> +
> +static irqreturn_t tegra241_cmdqv_isr(int irq, void *devid)
> +{
> + struct tegra241_cmdqv *cmdqv = (struct tegra241_cmdqv *)devid;
> + u32 vintf_errs[2];
> + u32 vcmdq_errs[4];
> +
> + vintf_errs[0] = cmdqv_readl_relaxed(VINTF_ERR_MAP);
> + vintf_errs[1] = cmdqv_readl_relaxed(VINTF_ERR_MAP + 0x4);
> +
> + vcmdq_errs[0] = cmdqv_readl_relaxed(VCMDQ_ERR_MAP(0));
> + vcmdq_errs[1] = cmdqv_readl_relaxed(VCMDQ_ERR_MAP(1));
> + vcmdq_errs[2] = cmdqv_readl_relaxed(VCMDQ_ERR_MAP(2));
> + vcmdq_errs[3] = cmdqv_readl_relaxed(VCMDQ_ERR_MAP(3));
> +
> + cmdqv_warn("unexpected cmdqv error reported\n");
> + cmdqv_warn(" vintf_map: 0x%08X%08X\n", vintf_errs[1], vintf_errs[0]);
> + cmdqv_warn(" vcmdq_map: 0x%08X%08X%08X%08X\n",
> + vcmdq_errs[3], vcmdq_errs[2], vcmdq_errs[1], vcmdq_errs[0]);

Put warnings in one print only, spreading them like this just
increases the risk of tearing.. It doesn't need to be all pretty.

> +struct arm_smmu_cmdq *tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
> +{
> + struct tegra241_cmdqv *cmdqv = smmu->tegra241_cmdqv;
> + struct tegra241_vintf *vintf = cmdqv->vintfs[0];
> + struct tegra241_vcmdq *vcmdq;
> + u16 lidx;
> +
> + if (bypass_vcmdq)

READ_ONCE

> + return &smmu->cmdq;
> +
> + /* Use SMMU CMDQ if vintfs[0] is uninitialized */
> + if (!FIELD_GET(VINTF_ENABLED, atomic_read(&vintf->status)))
> + return &smmu->cmdq;
> +
> + /* Use SMMU CMDQ if vintfs[0] has error status */
> + if (FIELD_GET(VINTF_STATUS, atomic_read(&vintf->status)))
> + return &smmu->cmdq;

Why atomic_read? The unlocked interaction with
tegra241_cmdqv_handle_vintf0_error() doesn't seem especially sane IMHO

> +static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq)
> +{
> + u32 gerrorn, gerror;
> +
> + if (vcmdq_write_config(0)) {
> + vcmdq_err("GERRORN=0x%X\n", vcmdq_page0_readl_relaxed(GERRORN));
> + vcmdq_err("GERROR=0x%X\n", vcmdq_page0_readl_relaxed(GERROR));
> + vcmdq_err("CONS=0x%X\n", vcmdq_page0_readl_relaxed(CONS));

Less prints, include a unique message about why this is being
printed..

> + }
> + vcmdq_page0_writel_relaxed(0, PROD);
> + vcmdq_page0_writel_relaxed(0, CONS);
> + vcmdq_page1_writeq_relaxed(0, BASE);
> + vcmdq_page1_writeq_relaxed(0, CONS_INDX_BASE);
> +
> + gerrorn = vcmdq_page0_readl_relaxed(GERRORN);
> + gerror = vcmdq_page0_readl_relaxed(GERROR);
> + if (gerror != gerrorn) {
> + vcmdq_info("Uncleared error detected, resetting\n");
> + vcmdq_page0_writel(gerror, GERRORN);
> + }
> +
> + vcmdq_dbg("deinited\n");
> +}
> +
> +static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq)
> +{
> + int ret;
> +
> + /* Configure and enable the vcmdq */
> + tegra241_vcmdq_hw_deinit(vcmdq);
> +
> + vcmdq_page1_writeq_relaxed(vcmdq->cmdq.q.q_base, BASE);
> +
> + ret = vcmdq_write_config(VCMDQ_EN);
> + if (ret) {
> + vcmdq_err("GERRORN=0x%X\n", vcmdq_page0_readl_relaxed(GERRORN));
> + vcmdq_err("GERROR=0x%X\n", vcmdq_page0_readl_relaxed(GERROR));
> + vcmdq_err("CONS=0x%X\n", vcmdq_page0_readl_relaxed(CONS));
> + return ret;

Same print?

> +static void tegra241_vcmdq_free_smmu_cmdq(struct tegra241_vcmdq *vcmdq)
> +{
> + struct tegra241_cmdqv *cmdqv = vcmdq->cmdqv;
> + struct arm_smmu_queue *q = &vcmdq->cmdq.q;
> + size_t nents = 1 << q->llq.max_n_shift;
> +
> + dmam_free_coherent(cmdqv->smmu->dev, (nents * CMDQ_ENT_DWORDS) << 3,
> + q->base, q->base_dma);

If we are calling dmam_free, do we really need devm at all?

> +static struct tegra241_vcmdq *
> +tegra241_vintf_lvcmdq_alloc(struct tegra241_vintf *vintf, u16 lidx)
> +{
> + struct tegra241_cmdqv *cmdqv = vintf->cmdqv;
> + struct tegra241_vcmdq *vcmdq;
> + int ret;
> +
> + vcmdq = devm_kzalloc(cmdqv->dev, sizeof(*vcmdq), GFP_KERNEL);
> + if (!vcmdq)
> + return ERR_PTR(-ENOMEM);
> +
> + ret = tegra241_vintf_lvcmdq_init(vintf, lidx, vcmdq);
> + if (ret)
> + goto free_vcmdq;
> +
> + /* Setup struct arm_smmu_cmdq data members */
> + ret = tegra241_vcmdq_alloc_smmu_cmdq(vcmdq);
> + if (ret)
> + goto deinit_lvcmdq;
> +
> + ret = tegra241_vcmdq_hw_init(vcmdq);
> + if (ret)
> + goto free_queue;
> +
> + vcmdq_dbg("allocated\n");
> + return vcmdq;
> +free_queue:
> + tegra241_vcmdq_free_smmu_cmdq(vcmdq);
> +deinit_lvcmdq:
> + tegra241_vintf_lvcmdq_deinit(vcmdq);
> +free_vcmdq:
> + devm_kfree(cmdqv->dev, vcmdq);
> + return ERR_PTR(ret);
> +}
> +
> +static void tegra241_vintf_lvcmdq_free(struct tegra241_vcmdq *vcmdq)
> +{
> + tegra241_vcmdq_hw_deinit(vcmdq);
> + tegra241_vcmdq_free_smmu_cmdq(vcmdq);
> + tegra241_vintf_lvcmdq_deinit(vcmdq);
> + devm_kfree(vcmdq->cmdqv->dev, vcmdq);

Ditto for devm_kfree.

> +struct tegra241_cmdqv *
> +tegra241_cmdqv_acpi_probe(struct arm_smmu_device *smmu, int id)

id is a u32.

It might be clearer to just pass in the struct
acpi_iort_node *?

> +{
> + struct tegra241_cmdqv *cmdqv;
> +
> + cmdqv = tegra241_cmdqv_find_resource(smmu, id);
> + if (!cmdqv)
> + return NULL;
> +
> + if (tegra241_cmdqv_probe(cmdqv)) {
> + if (cmdqv->irq > 0)
> + devm_free_irq(smmu->dev, cmdqv->irq, cmdqv);
> + devm_iounmap(smmu->dev, cmdqv->base);
> + devm_kfree(smmu->dev, cmdqv);
> + return NULL;

Oh. Please don't use devm at all in this code then, it is not attached
to a probed driver with the proper scope, devm isn't going to work in
sensible way.

Jason