In addition, I found that the function arm_smmu_cmdq_build_cmd() can also be
optimized slightly: three redundant instructions can be eliminated.
Case 1):
/*
 * Test variant 1: zero the command buffer with memset(), then OR the
 * FIELD_PREP(CMDQ_0_OP, ent->opcode) value into cmd[0].
 *
 * @cmd: buffer to fill; its first (1 << CMDQ_ENT_SZ_SHIFT) bytes are zeroed.
 * @ent: command descriptor; only ent->opcode is read here.
 */
void arm_smmu_cmdq_build_cmd_tst1(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
{
memset(cmd, 0, 1 << CMDQ_ENT_SZ_SHIFT);
/*
 * '|=' is a read-modify-write of the word that was just zeroed; in the
 * listing that follows it shows up as the ldr/orr/str sequence.
 */
cmd[0] |= FIELD_PREP(CMDQ_0_OP, ent->opcode);
}
0000000000004608 <arm_smmu_cmdq_build_cmd_tst1>:
4608: a9007c1f stp xzr, xzr, [x0]
460c: 39400022 ldrb w2, [x1]
4610: f9400001 ldr x1, [x0]
4614: aa020021 orr x1, x1, x2
4618: f9000001 str x1, [x0]
461c: d65f03c0 ret
Case 2):
/*
 * Test variant 2: no memset(). Store the FIELD_PREP(CMDQ_0_OP, ent->opcode)
 * value to cmd[0] with a plain assignment, then zero the remaining words
 * cmd[1] .. cmd[CMDQ_ENT_DWORDS - 1] in a loop.
 *
 * @cmd: buffer to fill; CMDQ_ENT_DWORDS u64 words are written.
 * @ent: command descriptor; only ent->opcode is read here.
 */
void arm_smmu_cmdq_build_cmd_tst2(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
{
int i;
/* cmd[0] is written exactly once and never read back. */
cmd[0] = FIELD_PREP(CMDQ_0_OP, ent->opcode);
/* Start at index 1 so the word written above is not cleared again. */
for (i = 1; i < CMDQ_ENT_DWORDS; i++)
cmd[i] = 0;
}
0000000000004620 <arm_smmu_cmdq_build_cmd_tst2>:
4620: 39400021 ldrb w1, [x1]
4624: a9007c01 stp x1, xzr, [x0]
4628: d65f03c0 ret
462c: d503201f nop
Case 3):
/*
 * Test variant 3: zero the command buffer with memset(), then overwrite
 * cmd[0] with the FIELD_PREP(CMDQ_0_OP, ent->opcode) value using a plain
 * assignment (not '|=').
 *
 * @cmd: buffer to fill; its first (1 << CMDQ_ENT_SZ_SHIFT) bytes are zeroed.
 * @ent: command descriptor; only ent->opcode is read here.
 */
void arm_smmu_cmdq_build_cmd_tst3(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
{
memset(cmd, 0, 1 << CMDQ_ENT_SZ_SHIFT);
/*
 * cmd[0] is stored twice (zero, then the value) but is not read back; the
 * listing that follows has no ldr/orr of cmd[0].
 */
cmd[0] = FIELD_PREP(CMDQ_0_OP, ent->opcode);
}
0000000000004630 <arm_smmu_cmdq_build_cmd_tst3>:
4630: a9007c1f stp xzr, xzr, [x0]
4634: 39400021 ldrb w1, [x1]
4638: f9000001 str x1, [x0]
463c: d65f03c0 ret