Re: [RFC PATCH 06/18] virt/mshv: create, initialize, finalize, delete partition hypercalls

From: Nuno Das Neves
Date: Thu Mar 04 2021 - 18:49:22 EST


On 2/8/2021 11:42 AM, Michael Kelley wrote:
> From: Nuno Das Neves <nunodasneves@xxxxxxxxxxxxxxxxxxx> Sent: Friday, November 20, 2020 4:30 PM
>>
>> Add hypercalls for fully setting up and mostly tearing down a guest
>> partition.
>> The teardown operation will generate an error as the deposited
>> memory has not been withdrawn.
>> This is fixed in the next patch.
>>
>> Co-developed-by: Lillian Grassin-Drake <ligrassi@xxxxxxxxxxxxx>
>> Signed-off-by: Lillian Grassin-Drake <ligrassi@xxxxxxxxxxxxx>
>> Signed-off-by: Nuno Das Neves <nunodasneves@xxxxxxxxxxxxxxxxxxx>
>> ---
>> include/asm-generic/hyperv-tlfs.h | 52 +++++++-
>> include/uapi/asm-generic/hyperv-tlfs.h | 1 +
>> include/uapi/linux/mshv.h | 1 +
>> virt/mshv/mshv_main.c | 169 ++++++++++++++++++++++++-
>> 4 files changed, 220 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
>> index 2ff580780ce4..ab6ae6c164f5 100644
>> --- a/include/asm-generic/hyperv-tlfs.h
>> +++ b/include/asm-generic/hyperv-tlfs.h
>> @@ -142,6 +142,10 @@ struct ms_hyperv_tsc_page {
>> #define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013
>> #define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014
>> #define HVCALL_SEND_IPI_EX 0x0015
>> +#define HVCALL_CREATE_PARTITION 0x0040
>> +#define HVCALL_INITIALIZE_PARTITION 0x0041
>> +#define HVCALL_FINALIZE_PARTITION 0x0042
>> +#define HVCALL_DELETE_PARTITION 0x0043
>> #define HVCALL_GET_PARTITION_ID 0x0046
>> #define HVCALL_DEPOSIT_MEMORY 0x0048
>> #define HVCALL_CREATE_VP 0x004e
>> @@ -451,7 +455,7 @@ struct hv_get_partition_id {
>> struct hv_deposit_memory {
>> u64 partition_id;
>> u64 gpa_page_list[];
>> -} __packed;
>> +};
>
> Why remove __packed?
>

I change it to match the hyperv structures exactly.
I will re-add __packed here and explicitly lay out all the
structures as you have suggested.

>>
>> struct hv_proximity_domain_flags {
>> u32 proximity_preferred : 1;
>> @@ -767,4 +771,50 @@ struct hv_input_unmap_device_interrupt {
>> #define HV_SOURCE_SHADOW_NONE 0x0
>> #define HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE 0x1
>>
>> +#define HV_MAKE_COMPATIBILITY_VERSION(major_, minor_) \
>> + ((u32)((major_) << 8 | (minor_)))
>> +
>> +enum hv_compatibility_version {
>> + HV_COMPATIBILITY_19_H1 = HV_MAKE_COMPATIBILITY_VERSION(0X6, 0X5),
>> + HV_COMPATIBILITY_MANGANESE = HV_MAKE_COMPATIBILITY_VERSION(0X6, 0X7),
>
> Avoid use of "Manganese", which is an internal code name. I'd suggest calling it
> 20_H1 instead, which at least has some broader meaning.
>
>> + HV_COMPATIBILITY_PRERELEASE = HV_MAKE_COMPATIBILITY_VERSION(0XFE, 0X0),
>> + HV_COMPATIBILITY_EXPERIMENT = HV_MAKE_COMPATIBILITY_VERSION(0XFF, 0X0),
>> +};
>> +
>> +union hv_partition_isolation_properties {
>> + u64 as_uint64;
>> + struct {
>> + u64 isolation_type: 5;
>> + u64 rsvd_z: 7;
>> + u64 shared_gpa_boundary_page_number: 52;
>> + };
>> +};
>
> Add __packed.
>

Will do.

>> +
>> +/* Non-userspace-visible partition creation flags */
>> +#define HV_PARTITION_CREATION_FLAG_EXO_PARTITION BIT(8)
>> +
>> +struct hv_create_partition_in {
>> + u64 flags;
>> + union hv_proximity_domain_info proximity_domain_info;
>> + enum hv_compatibility_version compatibility_version;
>
> An "enum" is a 32 bit value in gcc and I would presume that
> Hyper-V is expecting a 64 bit value. In general, using an enum in a data
> structure with exact layout requirements is problematic because the "C"
> language doesn't specify how big an enum is. In such cases, it's better
> to use an integer field with an explicit size (like u64) and #defines for
> the possible values.
>

Will do.

>> + struct hv_partition_creation_properties partition_creation_properties;
>> + union hv_partition_isolation_properties isolation_properties;
>> +};
>> +
>> +struct hv_create_partition_out {
>> + u64 partition_id;
>> +};
>> +
>> +struct hv_initialize_partition {
>> + u64 partition_id;
>> +};
>> +
>> +struct hv_finalize_partition {
>> + u64 partition_id;
>> +};
>> +
>> +struct hv_delete_partition {
>> + u64 partition_id;
>> +};
>
> All of the above should have __packed for consistency with the other
> Hyper-V data structures.
>

Will do.

>> +
>> #endif
>> diff --git a/include/uapi/asm-generic/hyperv-tlfs.h b/include/uapi/asm-generic/hyperv-
>> tlfs.h
>> index 140cc0b4f98f..7a858226a9c5 100644
>> --- a/include/uapi/asm-generic/hyperv-tlfs.h
>> +++ b/include/uapi/asm-generic/hyperv-tlfs.h
>> @@ -6,6 +6,7 @@
>> #define BIT(X) (1ULL << (X))
>> #endif
>>
>> +/* Userspace-visible partition creation flags */
>
> Could this comment be included in the earlier patch with the #defines so
> that you avoid the trivial change here?
>

Yep, will do.

>> #define HV_PARTITION_CREATION_FLAG_SMT_ENABLED_GUEST BIT(0)
>> #define HV_PARTITION_CREATION_FLAG_GPA_LARGE_PAGES_DISABLED BIT(3)
>> #define HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED BIT(4)
>> diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h
>> index 3788f8bc5caa..4f8da9a6fde2 100644
>> --- a/include/uapi/linux/mshv.h
>> +++ b/include/uapi/linux/mshv.h
>> @@ -9,6 +9,7 @@
>>
>> #include <linux/types.h>
>> #include <asm/hyperv-tlfs.h>
>> +#include <asm-generic/hyperv-tlfs.h>
>
> Similarly, consider adding this #include in the earlier patch so that
> this trivial change isn't needed here.
>

Will do.

>>
>> #define MSHV_VERSION 0x0
>>
>> diff --git a/virt/mshv/mshv_main.c b/virt/mshv/mshv_main.c
>> index 4dcbe4907430..c4130a6508e5 100644
>> --- a/virt/mshv/mshv_main.c
>> +++ b/virt/mshv/mshv_main.c
>> @@ -15,6 +15,7 @@
>> #include <linux/file.h>
>> #include <linux/anon_inodes.h>
>> #include <linux/mshv.h>
>> +#include <asm/mshyperv.h>
>>
>> MODULE_AUTHOR("Microsoft");
>> MODULE_LICENSE("GPL");
>> @@ -31,7 +32,6 @@ static struct mshv mshv = {};
>> static void mshv_partition_put(struct mshv_partition *partition);
>> static int mshv_partition_release(struct inode *inode, struct file *filp);
>> static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
>> -
>
> Spurious whitespace change?
>

Yes - removed it.

>> static int mshv_dev_open(struct inode *inode, struct file *filp);
>> static int mshv_dev_release(struct inode *inode, struct file *filp);
>> static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
>> @@ -57,6 +57,143 @@ static struct miscdevice mshv_dev = {
>> .mode = 600,
>> };
>>
>> +#define HV_INIT_PARTITION_DEPOSIT_PAGES 208
>
> A comment about how this value is determined would be useful.
> I'm assuming it was determined empirically.
>

Correct - I'll add a comment.

>> +
>> +static int
>> +hv_call_create_partition(
>> + u64 flags,
>> + struct hv_partition_creation_properties creation_properties,
>> + u64 *partition_id)
>> +{
>> + struct hv_create_partition_in *input;
>> + struct hv_create_partition_out *output;
>> + int status;
>> + int ret;
>> + unsigned long irq_flags;
>> + int i;
>> +
>> + do {
>> + local_irq_save(irq_flags);
>> + input = (struct hv_create_partition_in *)(*this_cpu_ptr(
>> + hyperv_pcpu_input_arg));
>> + output = (struct hv_create_partition_out *)(*this_cpu_ptr(
>> + hyperv_pcpu_output_arg));
>> +
>> + input->flags = flags;
>> + input->proximity_domain_info.as_uint64 = 0;
>> + input->compatibility_version = HV_COMPATIBILITY_MANGANESE;
>> + for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURE_BANKS; ++i)
>> + input->partition_creation_properties
>> + .disabled_processor_features.as_uint64[i] = 0;
>> + input->partition_creation_properties
>> + .disabled_processor_xsave_features.as_uint64 = 0;
>> + input->isolation_properties.as_uint64 = 0;
>> +
>> + status = hv_do_hypercall(HVCALL_CREATE_PARTITION,
>> + input, output);
>
> hv_do_hypercall returns a u64, which should then be masked with
> HV_HYPERCALL_RESULT_MASK before checking the result.
>

Yes, I'll fix this everywhere.

>> + if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
>> + if (status == HV_STATUS_SUCCESS)
>> + *partition_id = output->partition_id;
>> + else
>> + pr_err("%s: %s\n",
>> + __func__, hv_status_to_string(status));
>> + local_irq_restore(irq_flags);
>> + ret = -hv_status_to_errno(status);
>> + break;
>> + }
>> + local_irq_restore(irq_flags);
>> + ret = hv_call_deposit_pages(NUMA_NO_NODE,
>> + hv_current_partition_id, 1);
>> + } while (!ret);
>> +
>> + return ret;
>> +}
>> +
>> +static int
>> +hv_call_initialize_partition(u64 partition_id)
>> +{
>> + struct hv_initialize_partition *input;
>> + int status;
>> + int ret;
>> + unsigned long flags;
>> +
>> + ret = hv_call_deposit_pages(
>> + NUMA_NO_NODE,
>> + partition_id,
>> + HV_INIT_PARTITION_DEPOSIT_PAGES);
>> + if (ret)
>> + return ret;
>> +
>> + do {
>> + local_irq_save(flags);
>> + input = (struct hv_initialize_partition *)(*this_cpu_ptr(
>> + hyperv_pcpu_input_arg));
>> + input->partition_id = partition_id;
>> +
>> + status = hv_do_hypercall(
>> + HVCALL_INITIALIZE_PARTITION,
>> + input, NULL);
>
> FWIW, since the input is a single 64 bit value, and there's no output,
> this could use hv_do_fast_hypercall8() instead, and avoid
> needing to use the input arg page and the irq save/restore. Would have
> to check that the particular hypercall supports the "fast" version.
>

Good idea! I tested it and confirmed it works.

>> + local_irq_restore(flags);
>> +
>> + if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
>
> Same comment about status being u64 and masking.
>

Will do.

>> + if (status != HV_STATUS_SUCCESS)
>> + pr_err("%s: %s\n",
>> + __func__, hv_status_to_string(status));
>> + ret = -hv_status_to_errno(status);
>> + break;
>> + }
>> + ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1);
>> + } while (!ret);
>> +
>> + return ret;
>> +}
>> +
>> +static int
>> +hv_call_finalize_partition(u64 partition_id)
>> +{
>> + struct hv_finalize_partition *input;
>> + int status;
>> + unsigned long flags;
>> +
>> + local_irq_save(flags);
>> + input = (struct hv_finalize_partition *)(*this_cpu_ptr(
>> + hyperv_pcpu_input_arg));
>> +
>> + input->partition_id = partition_id;
>> + status = hv_do_hypercall(
>> + HVCALL_FINALIZE_PARTITION,
>> + input, NULL);
>> + local_irq_restore(flags);
>
>
> Same comment about hv_do_fast_hypercall8() and about status
> being a u64 and masking.
>

Will do.

>> +
>> + if (status != HV_STATUS_SUCCESS)
>> + pr_err("%s: %s\n", __func__, hv_status_to_string(status));
>> +
>> + return -hv_status_to_errno(status);
>> +}
>> +
>> +static int
>> +hv_call_delete_partition(u64 partition_id)
>> +{
>> + struct hv_delete_partition *input;
>> + int status;
>> + unsigned long flags;
>> +
>> + local_irq_save(flags);
>> + input = (struct hv_delete_partition *)(*this_cpu_ptr(
>> + hyperv_pcpu_input_arg));
>> +
>> + input->partition_id = partition_id;
>> + status = hv_do_hypercall(
>> + HVCALL_DELETE_PARTITION,
>> + input, NULL);
>> + local_irq_restore(flags);
>
> Same comments about hv_do_fast_hypercall8(), and
> the status and masking.
>

Will do.

>> +
>> + if (status != HV_STATUS_SUCCESS)
>> + pr_err("%s: %s\n", __func__, hv_status_to_string(status));
>> +
>> + return -hv_status_to_errno(status);
>> +}
>> +
>> static long
>> mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
>> {
>> @@ -86,6 +223,17 @@ destroy_partition(struct mshv_partition *partition)
>>
>> spin_unlock_irqrestore(&mshv.partitions.lock, flags);
>>
>> + /*
>> + * There are no remaining references to the partition or vps,
>> + * so the remaining cleanup can be lockless
>> + */
>> +
>> + /* Deallocates and unmaps everything including vcpus, GPA mappings etc */
>> + hv_call_finalize_partition(partition->id);
>> + /* TODO: Withdraw and free all pages we deposited */
>> +
>> + hv_call_delete_partition(partition->id);
>> +
>> kfree(partition);
>> }
>>
>> @@ -146,6 +294,9 @@ mshv_ioctl_create_partition(void __user *user_arg)
>> if (copy_from_user(&args, user_arg, sizeof(args)))
>> return -EFAULT;
>>
>> + /* Only support EXO partitions */
>> + args.flags |= HV_PARTITION_CREATION_FLAG_EXO_PARTITION;
>> +
>> partition = kzalloc(sizeof(*partition), GFP_KERNEL);
>> if (!partition)
>> return -ENOMEM;
>> @@ -156,11 +307,21 @@ mshv_ioctl_create_partition(void __user *user_arg)
>> goto free_partition;
>> }
>>
>> + ret = hv_call_create_partition(args.flags,
>> + args.partition_creation_properties,
>> + &partition->id);
>> + if (ret)
>> + goto put_fd;
>> +
>> + ret = hv_call_initialize_partition(partition->id);
>> + if (ret)
>> + goto delete_partition;
>> +
>> file = anon_inode_getfile("mshv_partition", &mshv_partition_fops,
>> partition, O_RDWR);
>> if (IS_ERR(file)) {
>> ret = PTR_ERR(file);
>> - goto put_fd;
>> + goto finalize_partition;
>> }
>> refcount_set(&partition->ref_count, 1);
>>
>> @@ -174,6 +335,10 @@ mshv_ioctl_create_partition(void __user *user_arg)
>>
>> release_file:
>> file->f_op->release(file->f_inode, file);
>> +finalize_partition:
>> + hv_call_finalize_partition(partition->id);
>> +delete_partition:
>> + hv_call_delete_partition(partition->id);
>> put_fd:
>> put_unused_fd(fd);
>> free_partition:
>> --
>> 2.25.1