Re: [PATCH v4 3/4] KVM: x86: On emulation failure, convey the exit reason, etc. to userspace
From: Sean Christopherson
Date: Thu Sep 02 2021 - 14:39:42 EST
On Fri, Aug 13, 2021, David Edmondson wrote:
> -static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
> +static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
> + u8 ndata, u8 *insn_bytes, u8 insn_size)
> {
> - struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
> - u32 insn_size = ctxt->fetch.end - ctxt->fetch.data;
> struct kvm_run *run = vcpu->run;
> + u8 ndata_start;
> + u64 info[5];
> +
> + /*
> + * Zero the whole array used to retrieve the exit info, casting to u32
> + * for select entries will leave some chunks uninitialized.
> + */
> + memset(&info, 0, sizeof(info));
> +
> + static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1],
> + &info[2], (u32 *)&info[3],
> + (u32 *)&info[4]);
>
> run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
> run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
> - run->emulation_failure.ndata = 0;
> +
> + /*
> + * There's currently space for 13 entries, but 5 are used for the exit
> + * reason and info. Restrict to 4 to reduce the maintenance burden
> + * when expanding kvm_run.emulation_failure in the future.
> + */
> + if (WARN_ON_ONCE(ndata > 4))
> + ndata = 4;
> +
> + /* Always include the flags as a 'data' entry. */
> + ndata_start = 1;
> run->emulation_failure.flags = 0;
>
> if (insn_size) {
> - run->emulation_failure.ndata = 3;
> + ndata_start += (sizeof(run->emulation_failure.insn_size) +
> + sizeof(run->emulation_failure.insn_bytes)) /
> + sizeof(u64);
Hrm, I like the intent, but the end result ends up being rather convoluted and
unnecessarily scary, e.g. this would do the wrong thing if the combined size of
the fields is not a multiple of 8. That's obviously is not true, but relying on
insn_size/insn_bytes being carefully selected while simultaneously obscuring that
dependency is a bit mean. What about a compile-time assertion with a more reader
friendly literal for bumping the count?
BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) +
sizeof(run->emulation_failure.insn_bytes) != 16));
ndata_start += 2;
> run->emulation_failure.flags |=
> KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
> run->emulation_failure.insn_size = insn_size;
> memset(run->emulation_failure.insn_bytes, 0x90,
> sizeof(run->emulation_failure.insn_bytes));
> - memcpy(run->emulation_failure.insn_bytes,
> - ctxt->fetch.data, insn_size);
> + memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
> }
> +
> + memcpy(&run->internal.data[ndata_start], info, sizeof(info));
Oof, coming back to this code after some time away, "ndata_start" is confusing.
I believe past me thought that it would help convey that "info" is lumped into
the arbitrary data, but for me at least it just ends up making the interaction
with @data and @ndata more confusing. Sorry for the bad suggestion :-/
What about info_start? IMO, that makes the memcpy more readable. Another option
would be to have the name describe the number of "ABI enries", but I can't come
up with a variable name that's remotely readable.
memcpy(&run->internal.data[info_start], info, sizeof(info));
memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data,
ndata * sizeof(data[0]));
> + memcpy(&run->internal.data[ndata_start + ARRAY_SIZE(info)], data,
> + ndata * sizeof(u64));
Not that it really matters, but it's probably better to use sizeof(data[0]) or
sizeof(*data). E.g. if we do screw up the param in the future, we only botch the
output formatting, as opposed to dumping kernel stack data to userspace.
> +
> + run->emulation_failure.ndata = ndata_start + ARRAY_SIZE(info) + ndata;
> }
>
> +static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu)
> +{
> + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
> +
> + prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
> + ctxt->fetch.end - ctxt->fetch.data);
> +}
> +
> +void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
> + u8 ndata)
> +{
> + prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0);
> +}
> +EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit);
> +
> +void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
> +{
> + __kvm_prepare_emulation_failure_exit(vcpu, NULL, 0);
> +}
> +EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);
> +
> static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
> {
> struct kvm *kvm = vcpu->kvm;
> @@ -7502,16 +7551,14 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
>
> if (kvm->arch.exit_on_emulation_error ||
> (emulation_type & EMULTYPE_SKIP)) {
> - prepare_emulation_failure_exit(vcpu);
> + prepare_emulation_ctxt_failure_exit(vcpu);
> return 0;
> }
>
> kvm_queue_exception(vcpu, UD_VECTOR);
>
> if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
> - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
> - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
> - vcpu->run->internal.ndata = 0;
> + prepare_emulation_ctxt_failure_exit(vcpu);
> return 0;
> }
>
> @@ -12104,9 +12151,7 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
> * doesn't seem to be a real use-case behind such requests, just return
> * KVM_EXIT_INTERNAL_ERROR for now.
> */
> - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
> - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
> - vcpu->run->internal.ndata = 0;
> + kvm_prepare_emulation_failure_exit(vcpu);
>
> return 0;
> }
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 6c79c1ce3703..e86cc2de7b5c 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -397,6 +397,12 @@ struct kvm_run {
> * "ndata" is correct, that new fields are enumerated in "flags",
> * and that each flag enumerates fields that are 64-bit aligned
> * and sized (so that ndata+internal.data[] is valid/accurate).
> + *
> + * Space beyond the defined fields may be used to
Please run these out to 80 chars. Even 80 is a soft limit, it's ok to run over
a bit if the end result is (subjectively) prettier.
> + * store arbitrary debug information relating to the
> + * emulation failure. It is accounted for in "ndata"
> + * but otherwise unspecified and is not represented in
Explicitly state the format is unspecified?
> + * "flags".
And also explicitly stating the debug info isn't ABI, e.g.
* Space beyond the defined fields may be used to store arbitrary
* debug information relating to the emulation failure. It is
* accounted for in "ndata" but the format is unspecified and
* is not represented in "flags". Any such info is _not_ ABI!
> */
> struct {
> __u32 suberror;
> @@ -408,6 +414,7 @@ struct kvm_run {
> __u8 insn_bytes[15];
> };
> };
> + /* Arbitrary debug data may follow. */
> } emulation_failure;
> /* KVM_EXIT_OSI */
> struct {
> --
> 2.30.2
>