Re: [PATCH v4 3/4] KVM: x86: On emulation failure, convey the exit reason, etc. to userspace

From: David Edmondson
Date: Mon Sep 13 2021 - 09:23:50 EST


Sean Christopherson <seanjc@xxxxxxxxxx> writes:

> On Fri, Aug 13, 2021, David Edmondson wrote:
>> -static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
>> +static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
>> + u8 ndata, u8 *insn_bytes, u8 insn_size)
>> {
>> - struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
>> - u32 insn_size = ctxt->fetch.end - ctxt->fetch.data;
>> struct kvm_run *run = vcpu->run;
>> + u8 ndata_start;
>> + u64 info[5];
>> +
>> + /*
>> + * Zero the whole array used to retrieve the exit info, casting to u32
>> + * for select entries will leave some chunks uninitialized.
>> + */
>> + memset(&info, 0, sizeof(info));
>> +
>> + static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1],
>> + &info[2], (u32 *)&info[3],
>> + (u32 *)&info[4]);
>>
>> run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
>> run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
>> - run->emulation_failure.ndata = 0;
>> +
>> + /*
>> + * There's currently space for 13 entries, but 5 are used for the exit
>> + * reason and info. Restrict to 4 to reduce the maintenance burden
>> + * when expanding kvm_run.emulation_failure in the future.
>> + */
>> + if (WARN_ON_ONCE(ndata > 4))
>> + ndata = 4;
>> +
>> + /* Always include the flags as a 'data' entry. */
>> + ndata_start = 1;
>> run->emulation_failure.flags = 0;
>>
>> if (insn_size) {
>> - run->emulation_failure.ndata = 3;
>> + ndata_start += (sizeof(run->emulation_failure.insn_size) +
>> + sizeof(run->emulation_failure.insn_bytes)) /
>> + sizeof(u64);
>
> Hrm, I like the intent, but the end result ends up being rather convoluted and
> unnecessarily scary, e.g. this would do the wrong thing if the combined size of
> the fields is not a multiple of 8. That's obviously is not true, but relying on
> insn_size/insn_bytes being carefully selected while simultaneously obscuring that
> dependency is a bit mean. What about a compile-time assertion with a more reader
> friendly literal for bumping the count?
>
> BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) +
> sizeof(run->emulation_failure.insn_bytes) != 16));
> ndata_start += 2;

Okay.

>> run->emulation_failure.flags |=
>> KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
>> run->emulation_failure.insn_size = insn_size;
>> memset(run->emulation_failure.insn_bytes, 0x90,
>> sizeof(run->emulation_failure.insn_bytes));
>> - memcpy(run->emulation_failure.insn_bytes,
>> - ctxt->fetch.data, insn_size);
>> + memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
>> }
>> +
>> + memcpy(&run->internal.data[ndata_start], info, sizeof(info));
>
> Oof, coming back to this code after some time away, "ndata_start" is confusing.
> I believe past me thought that it would help convey that "info" is lumped into
> the arbitrary data, but for me at least it just ends up making the interaction
> with @data and @ndata more confusing. Sorry for the bad suggestion :-/
>
> What about info_start? IMO, that makes the memcpy more readable. Another option
> would be to have the name describe the number of "ABI enries", but I can't come
> up with a variable name that's remotely readable.
>
> memcpy(&run->internal.data[info_start], info, sizeof(info));
> memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data,
> ndata * sizeof(data[0]));

Okay.

>> + memcpy(&run->internal.data[ndata_start + ARRAY_SIZE(info)], data,
>> + ndata * sizeof(u64));
>
> Not that it really matters, but it's probably better to use sizeof(data[0]) or
> sizeof(*data). E.g. if we do screw up the param in the future, we only botch the
> output formatting, as opposed to dumping kernel stack data to userspace.

Agreed.

>> +
>> + run->emulation_failure.ndata = ndata_start + ARRAY_SIZE(info) + ndata;
>> }
>>
>> +static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu)
>> +{
>> + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
>> +
>> + prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
>> + ctxt->fetch.end - ctxt->fetch.data);
>> +}
>> +
>> +void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
>> + u8 ndata)
>> +{
>> + prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0);
>> +}
>> +EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit);
>> +
>> +void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
>> +{
>> + __kvm_prepare_emulation_failure_exit(vcpu, NULL, 0);
>> +}
>> +EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);
>> +
>> static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
>> {
>> struct kvm *kvm = vcpu->kvm;
>> @@ -7502,16 +7551,14 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
>>
>> if (kvm->arch.exit_on_emulation_error ||
>> (emulation_type & EMULTYPE_SKIP)) {
>> - prepare_emulation_failure_exit(vcpu);
>> + prepare_emulation_ctxt_failure_exit(vcpu);
>> return 0;
>> }
>>
>> kvm_queue_exception(vcpu, UD_VECTOR);
>>
>> if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
>> - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
>> - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
>> - vcpu->run->internal.ndata = 0;
>> + prepare_emulation_ctxt_failure_exit(vcpu);
>> return 0;
>> }
>>
>> @@ -12104,9 +12151,7 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
>> * doesn't seem to be a real use-case behind such requests, just return
>> * KVM_EXIT_INTERNAL_ERROR for now.
>> */
>> - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
>> - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
>> - vcpu->run->internal.ndata = 0;
>> + kvm_prepare_emulation_failure_exit(vcpu);
>>
>> return 0;
>> }
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 6c79c1ce3703..e86cc2de7b5c 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -397,6 +397,12 @@ struct kvm_run {
>> * "ndata" is correct, that new fields are enumerated in "flags",
>> * and that each flag enumerates fields that are 64-bit aligned
>> * and sized (so that ndata+internal.data[] is valid/accurate).
>> + *
>> + * Space beyond the defined fields may be used to
>
> Please run these out to 80 chars. Even 80 is a soft limit, it's ok to run over
> a bit if the end result is (subjectively) prettier.
>
>> + * store arbitrary debug information relating to the
>> + * emulation failure. It is accounted for in "ndata"
>> + * but otherwise unspecified and is not represented in
>
> Explicitly state the format is unspecified?
>
>> + * "flags".
>
> And also explicitly stating the debug info isn't ABI, e.g.
>
> * Space beyond the defined fields may be used to store arbitrary
> * debug information relating to the emulation failure. It is
> * accounted for in "ndata" but the format is unspecified and
> * is not represented in "flags". Any such info is _not_ ABI!

Okay.

>> */
>> struct {
>> __u32 suberror;
>> @@ -408,6 +414,7 @@ struct kvm_run {
>> __u8 insn_bytes[15];
>> };
>> };
>> + /* Arbitrary debug data may follow. */
>> } emulation_failure;
>> /* KVM_EXIT_OSI */
>> struct {
>> --
>> 2.30.2
>>