Re: [PATCH] KVM: x86: Add support for cmpxchg16b emulation

From: Sairaj Kodilkar

Date: Wed Mar 11 2026 - 07:19:29 EST




On 3/6/2026 9:08 PM, Sean Christopherson wrote:
On Fri, Mar 06, 2026, Sairaj Kodilkar wrote:
AMD and Intel both provides support for 128 bit cmpxchg operands using
cmpxchg8b/cmpxchg16b instructions (opcode 0FC7). However, kvm does not
support emulating cmpxchg16b (i.e when destination memory is 128 bit and
REX.W = 1) which causes emulation failure when QEMU guest performs a
cmpxchg16b on a memory region setup as a IO.

Hence extend cmpxchg8b to perform cmpxchg16b when the destination memory
is 128 bit.

Signed-off-by: Sairaj Kodilkar <sarunkod@xxxxxxx>
---
Background:

The AMD IOMMU driver writes 256-bit device table entries with two
128-bit cmpxchg operations. For guests using hardware-accelerated
vIOMMU (still in progress), QEMU traps device table accesses to set up
nested page tables. Without 128-bit cmpxchg emulation, KVM cannot
handle these traps and DTE access emulation fails.
Please put this paragraph in the changelog proper, the "why" matters greatly here.

Sure.

QEMU implementation that traps DTE accesses:
https://github.com/AMDESE/qemu-iommu/blob/wip/for_iommufd_hw_queue-v8_amd_viommu_20260106/hw/i386/amd_viommu.c#L517
---
arch/x86/kvm/emulate.c | 48 +++++++++++++++++++++++++++++++-------
arch/x86/kvm/kvm_emulate.h | 1 +
2 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c8e292e9a24d..e1a08cd3274b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2188,17 +2188,18 @@ static int em_call_near_abs(struct x86_emulate_ctxt *ctxt)
return rc;
}
-static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
+static int __handle_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
{
- u64 old = ctxt->dst.orig_val64;
+ u64 old64 = ctxt->dst.orig_val64;
- if (ctxt->dst.bytes == 16)
+ /* Use of the REX.W prefix promotes operation to 128 bits */
+ if (ctxt->rex_bits & REX_W)
Uh, yeah, and the caller specifically pivoted on this size. I'm a-ok with a
sanity check, but it should be exactly that, e.g.

if (WARN_ON_ONCE(8 + (ctxt->rex_bits & REX_W) * 8 != ctxt->dst.bytes))
return X86EMUL_UNHANDLEABLE;

Sure I'll update it.

return X86EMUL_UNHANDLEABLE;
- if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
- ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
- *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
- *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
+ if (((u32) (old64 >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
+ ((u32) (old64 >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old64 >> 0);
+ *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old64 >> 32);
ctxt->eflags &= ~X86_EFLAGS_ZF;
} else {
ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
@@ -2209,6 +2210,37 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int __handle_cmpxchg16b(struct x86_emulate_ctxt *ctxt)
+{
+ __uint128_t old128 = ctxt->dst.val128;
There is zero chance you properly tested this patch. Please write comprehensive
testcases in KUT's emulator64.c before posting the next version.

In writeback(), the OP_MEM case quite clearly operates on "unsigned long" values
*and* consumes orig_val in the locked case.

case OP_MEM:
if (ctxt->lock_prefix)
return segmented_cmpxchg(ctxt,
op->addr.mem,
&op->orig_val,
&op->val,
op->bytes);
else
return segmented_write(ctxt,
op->addr.mem,
&op->val,
op->bytes);

And then emulator_cmpxchg_emulated() should be taught to do cmpxchg16b itself:

/* guests cmpxchg8b have to be emulated atomically */
if (bytes > 8 || (bytes & (bytes - 1)))
goto emul_write;

That might be a big lift, e.g. to get a 16-byte uaccess version. If it's
unreasonably difficult, we should reject emulation of LOCK CMPXCHG16B.

I think its not a too much of the work, I have a prototype ready which
implements cmpxchg16b access in uaccess.h. Tested with a KUT as well
seems to be working fine. Will post it once I cleanup the series.

And this code would also need to be updated to copy the full 128-bit value.

/* Copy full 64-bit value for CMPXCHG8B. */
ctxt->dst.orig_val64 = ctxt->dst.val64;

Maybe something like this? Or maybe just copy 128 bits unconditionally (I'm not
sure if that could read uninitiatlized data or not).

/* Copy the full 64/128-bit value for CMPXCHG8B/16B. */
if (IS_ENABLED(CONFIG_X86_64) && ctxt->dst.bytes == 16)
ctxt->dst.orig_val128 = ctxt->dst.val128;
else
ctxt->dst.orig_val64 = ctxt->dst.val64;

Yeah thanks, I completely missed that cmpxchg with lock prefix
consumes original value. I'll update the code.

+
+ /* Use of the REX.W prefix promotes operation to 128 bits */
+ if (!(ctxt->rex_bits & REX_W))
+ return X86EMUL_UNHANDLEABLE;
+
+ if (((u64) (old128 >> 0) != (u64) reg_read(ctxt, VCPU_REGS_RAX)) ||
+ ((u64) (old128 >> 64) != (u64) reg_read(ctxt, VCPU_REGS_RDX))) {
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u64) (old128 >> 0);
+ *reg_write(ctxt, VCPU_REGS_RDX) = (u64) (old128 >> 64);
+ ctxt->eflags &= ~X86_EFLAGS_ZF;
+ } else {
+ ctxt->dst.val128 =
+ ((__uint128_t) reg_read(ctxt, VCPU_REGS_RCX) << 64) |
+ (u64) reg_read(ctxt, VCPU_REGS_RBX);
+
+ ctxt->eflags |= X86_EFLAGS_ZF;
IMO we should use a macro to handle 8b vs. 16b. The code is going to be heinous
no matter what, and so to me, duplicating everything makes it twice as ugly,
whereas a macro makes it like 10% more ugly.

+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int em_cmpxchgxb(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->dst.bytes == 16)
+ return __handle_cmpxchg16b(ctxt);
+
+ return __handle_cmpxchg8b(ctxt);
+}
+
static int em_ret(struct x86_emulate_ctxt *ctxt)
{
int rc;
@@ -4097,7 +4129,7 @@ static const struct gprefix pfx_0f_c7_7 = {
static const struct group_dual group9 = { {
- N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
+ N, I(DstMem64 | Lock | PageTable, em_cmpxchgxb), N, N, N, N, N, N,
Eh, I vote to keep it em_cmpxchg8b. _If_ we want to capture that its size is
variable, maybe em_cmpxchg8b_16b? em_cmpxchgxb just looks like a typo.

Yeah ok, I was not sure how to represent variable size hence used the x.
Will rename it.


Sans correct writeback() handling, something like so:

---
arch/x86/kvm/emulate.c | 44 ++++++++++++++++++++++++--------------
arch/x86/kvm/kvm_emulate.h | 2 ++
2 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 6145dac4a605..3ec4555571fa 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2201,24 +2201,33 @@ static int em_call_near_abs(struct x86_emulate_ctxt *ctxt)
return rc;
}
+#define em_cmpxchg8b_16b(__c, rbits, mbits) \
+do { \
+ u##mbits old = __c->dst.orig_val##mbits; \
+ \
+ BUILD_BUG_ON(rbits * 2 != mbits); \
+ \
+ if (((u##rbits)(old >> 0) != (u##rbits)reg_read(__c, VCPU_REGS_RAX)) || \
+ ((u##rbits)(old >> rbits) != (u##rbits)reg_read(__c, VCPU_REGS_RDX))) { \
+ *reg_write(__c, VCPU_REGS_RAX) = (u##rbits)(old >> 0); \
+ *reg_write(__c, VCPU_REGS_RDX) = (u##rbits)(old >> rbits); \
+ __c->eflags &= ~X86_EFLAGS_ZF; \
+ } else { \
+ __c->dst.val##mbits = ((u##mbits)reg_read(__c, VCPU_REGS_RCX) << rbits) | \
+ (u##rbits)reg_read(__c, VCPU_REGS_RBX); \
+ __c->eflags |= X86_EFLAGS_ZF; \
+ } \
+} while (0)
+
static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
{
- u64 old = ctxt->dst.orig_val64;
-
- if (ctxt->dst.bytes == 16)
+ if (WARN_ON_ONCE(8 + (ctxt->rex_bits & REX_W) * 8 != ctxt->dst.bytes))
return X86EMUL_UNHANDLEABLE;
- if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
- ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
- *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
- *reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
- ctxt->eflags &= ~X86_EFLAGS_ZF;
- } else {
- ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
- (u32) reg_read(ctxt, VCPU_REGS_RBX);
-
- ctxt->eflags |= X86_EFLAGS_ZF;
- }
+ if (!(ctxt->rex_bits & REX_W))
+ em_cmpxchg8b_16b(ctxt, 32, 64);
+ else
+ em_cmpxchg8b_16b(ctxt, 64, 128);
return X86EMUL_CONTINUE;
}
@@ -5418,8 +5427,11 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts)
goto done;
}
}
- /* Copy full 64-bit value for CMPXCHG8B. */
- ctxt->dst.orig_val64 = ctxt->dst.val64;
+ /* Copy the full 64/128-bit value for CMPXCHG8B/16B. */
+ if (IS_ENABLED(CONFIG_X86_64) && ctxt->dst.bytes == 16)
+ ctxt->dst.orig_val128 = ctxt->dst.val128;
+ else
+ ctxt->dst.orig_val64 = ctxt->dst.val64;
special_insn:
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index fb3dab4b5a53..0e9968319343 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -255,6 +255,7 @@ struct operand {
union {
unsigned long orig_val;
u64 orig_val64;
+ u64 orig_val128;
};
union {
unsigned long *reg;
@@ -268,6 +269,7 @@ struct operand {
union {
unsigned long val;
u64 val64;
+ __uint128_t val128;
char valptr[sizeof(avx256_t)];
sse128_t vec_val;
avx256_t vec_val2;

base-commit: 5128b972fb2801ad9aca54d990a75611ab5283a9
--

Thanks
Sairaj