Re: [RFC PATCH] IOMMU: SMMUv2: Support for Extended Stream ID (16 bit)
From: Robin Murphy
Date: Wed Jan 11 2017 - 09:15:20 EST
On 10/01/17 11:57, Aleksey Makarov wrote:
> Enable the Extended Stream ID feature when available.
>
> This patch on top of series "[PATCH v7 00/19] KVM PCIe/MSI passthrough
> on ARM/ARM64 and IOVA reserved regions" by Eric Auger allows
> to passthrough an external PCIe network card on a ThunderX server
> successfully.
>
> Without this patch that card caused a warning like
>
> pci 0006:90:00.0: stream ID 0x9000 out of range for SMMU (0x7fff)
>
> during boot.
>
> Signed-off-by: Aleksey Makarov <aleksey.makarov@xxxxxxxxxx>
> ---
> drivers/iommu/arm-smmu.c | 53 +++++++++++++++++++++++++++++++++---------------
> 1 file changed, 37 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> index 13d26009b8e0..d160c12828f4 100644
> --- a/drivers/iommu/arm-smmu.c
> +++ b/drivers/iommu/arm-smmu.c
> @@ -24,6 +24,7 @@
> * - v7/v8 long-descriptor format
> * - Non-secure access to the SMMU
> * - Context fault reporting
> + * - Extended Stream ID (16 bit)
> */
>
> #define pr_fmt(fmt) "arm-smmu: " fmt
> @@ -87,6 +88,7 @@
> #define sCR0_CLIENTPD (1 << 0)
> #define sCR0_GFRE (1 << 1)
> #define sCR0_GFIE (1 << 2)
> +#define sCR0_EXIDENABLE (1 << 3)
> #define sCR0_GCFGFRE (1 << 4)
> #define sCR0_GCFGFIE (1 << 5)
> #define sCR0_USFCFG (1 << 10)
> @@ -126,6 +128,7 @@
> #define ID0_NUMIRPT_MASK 0xff
> #define ID0_NUMSIDB_SHIFT 9
> #define ID0_NUMSIDB_MASK 0xf
> +#define ID0_EXIDS (1 << 8)
> #define ID0_NUMSMRG_SHIFT 0
> #define ID0_NUMSMRG_MASK 0xff
>
> @@ -169,6 +172,7 @@
> #define ARM_SMMU_GR0_S2CR(n) (0xc00 + ((n) << 2))
> #define S2CR_CBNDX_SHIFT 0
> #define S2CR_CBNDX_MASK 0xff
> +#define S2CR_EXIDVALID (1 << 10)
> #define S2CR_TYPE_SHIFT 16
> #define S2CR_TYPE_MASK 0x3
> enum arm_smmu_s2cr_type {
> @@ -354,6 +358,7 @@ struct arm_smmu_device {
> #define ARM_SMMU_FEAT_FMT_AARCH64_64K (1 << 9)
> #define ARM_SMMU_FEAT_FMT_AARCH32_L (1 << 10)
> #define ARM_SMMU_FEAT_FMT_AARCH32_S (1 << 11)
> +#define ARM_SMMU_FEAT_EXIDS (1 << 12)
> u32 features;
>
> #define ARM_SMMU_OPT_SECURE_CFG_ACCESS (1 << 0)
> @@ -1051,7 +1056,7 @@ static void arm_smmu_write_smr(struct arm_smmu_device *smmu, int idx)
> struct arm_smmu_smr *smr = smmu->smrs + idx;
> u32 reg = smr->id << SMR_ID_SHIFT | smr->mask << SMR_MASK_SHIFT;
>
> - if (smr->valid)
> + if (!(smmu->features & ARM_SMMU_FEAT_EXIDS) && smr->valid)
> reg |= SMR_VALID;
> writel_relaxed(reg, ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_SMR(idx));
> }
> @@ -1063,6 +1068,9 @@ static void arm_smmu_write_s2cr(struct arm_smmu_device *smmu, int idx)
> (s2cr->cbndx & S2CR_CBNDX_MASK) << S2CR_CBNDX_SHIFT |
> (s2cr->privcfg & S2CR_PRIVCFG_MASK) << S2CR_PRIVCFG_SHIFT;
>
> + if (smmu->features & ARM_SMMU_FEAT_EXIDS && smmu->smrs &&
> + smmu->smrs[idx].valid)
> + reg |= S2CR_EXIDVALID;
> writel_relaxed(reg, ARM_SMMU_GR0(smmu) + ARM_SMMU_GR0_S2CR(idx));
> }
>
> @@ -1674,6 +1682,9 @@ static void arm_smmu_device_reset(struct arm_smmu_device *smmu)
> if (smmu->features & ARM_SMMU_FEAT_VMID16)
> reg |= sCR0_VMID16EN;
>
> + if (smmu->features & ARM_SMMU_FEAT_EXIDS)
> + reg |= sCR0_EXIDENABLE;
> +
> /* Push the button */
> __arm_smmu_tlb_sync(smmu);
> writel(reg, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0);
> @@ -1761,7 +1772,12 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
> "\t(IDR0.CTTW overridden by FW configuration)\n");
>
> /* Max. number of entries we have for stream matching/indexing */
> - size = 1 << ((id >> ID0_NUMSIDB_SHIFT) & ID0_NUMSIDB_MASK);
> + if (smmu->version == ARM_SMMU_V2 && id & ID0_EXIDS) {
> + smmu->features |= ARM_SMMU_FEAT_EXIDS;
> + size = (1 << 16);
Unnecessary parentheses.
> + } else {
> + size = 1 << ((id >> ID0_NUMSIDB_SHIFT) & ID0_NUMSIDB_MASK);
> + }
Given what the architecture says about the relationship between EXIDS
and NUMSIDB, I suppose an even shorter version could be:
if (smmu->version == ARM_SMMU_V2 && id & ID0_EXIDS)
size *= 2;
but I'm not sure that's actually any nicer to read.
> smmu->streamid_mask = size - 1;
> if (id & ID0_SMS) {
> u32 smr;
> @@ -1774,20 +1790,25 @@ static int arm_smmu_device_cfg_probe(struct arm_smmu_device *smmu)
> return -ENODEV;
> }
>
> - /*
> - * SMR.ID bits may not be preserved if the corresponding MASK
> - * bits are set, so check each one separately. We can reject
> - * masters later if they try to claim IDs outside these masks.
> - */
> - smr = smmu->streamid_mask << SMR_ID_SHIFT;
> - writel_relaxed(smr, gr0_base + ARM_SMMU_GR0_SMR(0));
> - smr = readl_relaxed(gr0_base + ARM_SMMU_GR0_SMR(0));
> - smmu->streamid_mask = smr >> SMR_ID_SHIFT;
> -
> - smr = smmu->streamid_mask << SMR_MASK_SHIFT;
> - writel_relaxed(smr, gr0_base + ARM_SMMU_GR0_SMR(0));
> - smr = readl_relaxed(gr0_base + ARM_SMMU_GR0_SMR(0));
> - smmu->smr_mask_mask = smr >> SMR_MASK_SHIFT;
> + if (smmu->features & ARM_SMMU_FEAT_EXIDS) {
> + smmu->smr_mask_mask = smmu->streamid_mask;
> + } else {
> + /*
> + * SMR.ID bits may not be preserved if the corresponding
> + * MASK bits are set, so check each one separately.
> + * We can reject masters later if they try to claim IDs
> + * outside these masks.
> + */
> + smr = smmu->streamid_mask << SMR_ID_SHIFT;
> + writel_relaxed(smr, gr0_base + ARM_SMMU_GR0_SMR(0));
> + smr = readl_relaxed(gr0_base + ARM_SMMU_GR0_SMR(0));
> + smmu->streamid_mask = smr >> SMR_ID_SHIFT;
> +
> + smr = smmu->streamid_mask << SMR_MASK_SHIFT;
> + writel_relaxed(smr, gr0_base + ARM_SMMU_GR0_SMR(0));
> + smr = readl_relaxed(gr0_base + ARM_SMMU_GR0_SMR(0));
> + smmu->smr_mask_mask = smr >> SMR_MASK_SHIFT;
> + }
This hunk is quite possibly wrong. I don't see any guarantee in the
architecture that all EXMASK/EXID bits *must* be implemented, and even
so there's still no harm in the driver determining that experimentally.
It looks like we need a bit of refactoring such that we move the probing
of SMR fields to after counting and allocating the SME structures, then
in the EXIDS case we can explicitly clear the SMEs and poke EXIDENABLE
inbetween.
Robin.
>
> /* Zero-initialised to mark as invalid */
> smmu->smrs = devm_kcalloc(smmu->dev, size, sizeof(*smmu->smrs),
>