Re: [PATCH v5 15/22] x86/virt/tdx: Allocate and set up PAMTs for TDMRs

From: Kai Huang
Date: Mon Jun 27 2022 - 06:31:48 EST


On Fri, 2022-06-24 at 13:13 -0700, Dave Hansen wrote:
> > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> > index 4988a91d5283..ec496e96d120 100644
> > --- a/arch/x86/Kconfig
> > +++ b/arch/x86/Kconfig
> > @@ -1973,6 +1973,7 @@ config INTEL_TDX_HOST
> >   depends on CPU_SUP_INTEL
> >   depends on X86_64
> >   depends on KVM_INTEL
> > + depends on CONTIG_ALLOC
> >   select ARCH_HAS_CC_PLATFORM
> >   select ARCH_KEEP_MEMBLOCK
> >   help
> > diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> > index fd9f449b5395..36260dd7e69f 100644
> > --- a/arch/x86/virt/vmx/tdx/tdx.c
> > +++ b/arch/x86/virt/vmx/tdx/tdx.c
> > @@ -558,6 +558,196 @@ static int create_tdmrs(struct tdmr_info *tdmr_array,
> > int *tdmr_num)
> >   return 0;
> >  }
> >  
> > +/* Page sizes supported by TDX */
> > +enum tdx_page_sz {
> > + TDX_PG_4K,
> > + TDX_PG_2M,
> > + TDX_PG_1G,
> > + TDX_PG_MAX,
> > +};
>
> Are these the same constants as the magic numbers in Kirill's
> try_accept_one()?

try_accept_one() uses 'enum pg_level' PG_LEVEL_{4K,2M,1G} directly. Those
constants could be used here too, but 'enum pg_level' has more entries than we need:

enum pg_level {
PG_LEVEL_NONE,
PG_LEVEL_4K,
PG_LEVEL_2M,
PG_LEVEL_1G,
PG_LEVEL_512G,
PG_LEVEL_NUM
};

It has PG_LEVEL_NONE, so PG_LEVEL_4K starts at 1.

Below in tdmr_set_up_pamt(), I have two local arrays to store the base/size for
all TDX supported page sizes:

unsigned long pamt_base[TDX_PG_MAX];
unsigned long pamt_size[TDX_PG_MAX];

And a loop to calculate the size of PAMT for each page size:

for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz);
...
}

And later a similar loop to get the PAMT base of each page size too.

I can change them to:

/*
* TDX only supports 4K, 2M and 1G pages, but doesn't
* support the 512G page size.
*/
#define TDX_PG_LEVEL_MAX PG_LEVEL_512G

unsigned long pamt_base[TDX_PG_LEVEL_MAX];
unsigned long pamt_size[TDX_PG_LEVEL_MAX];

And change the loop to:

for (pgsz = PG_LEVEL_4K; pgsz < TDX_PG_LEVEL_MAX; pgsz++) {
pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz);
...
}

This would waste one 'unsigned long' in each of the pamt_base and pamt_size
arrays, as entry 0 isn't used in either of them. Alternatively, we could
explicitly subtract 1 from the array index:

for (pgsz = PG_LEVEL_4K; pgsz < TDX_PG_LEVEL_MAX; pgsz++) {
pamt_size[pgsz - 1] = tdmr_get_pamt_sz(tdmr, pgsz);
...
}

What's your opinion?

> > +/*
> > + * Calculate PAMT size given a TDMR and a page size. The returned
> > + * PAMT size is always aligned up to 4K page boundary.
> > + */
> > +static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr,
> > + enum tdx_page_sz pgsz)
> > +{
> > + unsigned long pamt_sz;
> > + int pamt_entry_nr;
>
> 'nr_pamt_entries', please.

OK.

>
> > + switch (pgsz) {
> > + case TDX_PG_4K:
> > + pamt_entry_nr = tdmr->size >> PAGE_SHIFT;
> > + break;
> > + case TDX_PG_2M:
> > + pamt_entry_nr = tdmr->size >> PMD_SHIFT;
> > + break;
> > + case TDX_PG_1G:
> > + pamt_entry_nr = tdmr->size >> PUD_SHIFT;
> > + break;
> > + default:
> > + WARN_ON_ONCE(1);
> > + return 0;
> > + }
> > +
> > + pamt_sz = pamt_entry_nr * tdx_sysinfo.pamt_entry_size;
> > + /* TDX requires PAMT size must be 4K aligned */
> > + pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);
> > +
> > + return pamt_sz;
> > +}
> > +
> > +/*
> > + * Pick a NUMA node on which to allocate this TDMR's metadata.
> > + *
> > + * This is imprecise since TDMRs are 1G aligned and NUMA nodes might
> > + * not be. If the TDMR covers more than one node, just use the _first_
> > + * one. This can lead to small areas of off-node metadata for some
> > + * memory.
> > + */
> > +static int tdmr_get_nid(struct tdmr_info *tdmr)
> > +{
> > + unsigned long start_pfn, end_pfn;
> > + int i, nid;
> > +
> > + /* Find the first memory region covered by the TDMR */
> > + memblock_for_each_tdx_mem_pfn_range(i, &start_pfn, &end_pfn, &nid)
> > {
> > + if (end_pfn > (tdmr_start(tdmr) >> PAGE_SHIFT))
> > + return nid;
> > + }
> > +
> > + /*
> > + * No memory region found for this TDMR. It cannot happen since
> > + * when one TDMR is created, it must cover at least one (or
> > + * partial) memory region.
> > + */
> > + WARN_ON_ONCE(1);
> > + return 0;
> > +}
>
> You should really describe what you are doing. At first glance "return
> 0;" looks like "declare success". How about something like this?
>
> /*
> * Fall back to allocating the TDMR from node 0 when no memblock
> * can be found. This should never happen since TDMRs originate
> * from the memblocks.
> */
>
> Does that miss any of the points you were trying to make?

No. Your comment looks better; I will use yours. Thanks.

>
> > +static int tdmr_set_up_pamt(struct tdmr_info *tdmr)
> > +{
> > + unsigned long pamt_base[TDX_PG_MAX];
> > + unsigned long pamt_size[TDX_PG_MAX];
> > + unsigned long tdmr_pamt_base;
> > + unsigned long tdmr_pamt_size;
> > + enum tdx_page_sz pgsz;
> > + struct page *pamt;
> > + int nid;
> > +
> > + nid = tdmr_get_nid(tdmr);
> > +
> > + /*
> > + * Calculate the PAMT size for each TDX supported page size
> > + * and the total PAMT size.
> > + */
> > + tdmr_pamt_size = 0;
> > + for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
> > + pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz);
> > + tdmr_pamt_size += pamt_size[pgsz];
> > + }
> > +
> > + /*
> > + * Allocate one chunk of physically contiguous memory for all
> > + * PAMTs. This helps minimize the PAMT's use of reserved areas
> > + * in overlapped TDMRs.
> > + */
> > + pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
> > + nid, &node_online_map);
> > + if (!pamt)
> > + return -ENOMEM;
>
> I'm not sure it's worth mentioning, but this doesn't really need to be
> GFP_KERNEL. __GFP_HIGHMEM would actually be just fine. But,
> considering that this is 64-bit only, that's just a technicality.



>
> > + /* Calculate PAMT base and size for all supported page sizes. */
>
> That comment isn't doing much good. If you say anything here it should be:
>
> /*
> * Break the contiguous allocation back up into
> * the individual PAMTs for each page size:
> */
>
> Also, this is *not* "calculating size". That's done above.

Thanks, I will use this comment.

>
> > + tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
> > + for (pgsz = TDX_PG_4K; pgsz < TDX_PG_MAX; pgsz++) {
> > + pamt_base[pgsz] = tdmr_pamt_base;
> > + tdmr_pamt_base += pamt_size[pgsz];
> > + }
> > +
> > + tdmr->pamt_4k_base = pamt_base[TDX_PG_4K];
> > + tdmr->pamt_4k_size = pamt_size[TDX_PG_4K];
> > + tdmr->pamt_2m_base = pamt_base[TDX_PG_2M];
> > + tdmr->pamt_2m_size = pamt_size[TDX_PG_2M];
> > + tdmr->pamt_1g_base = pamt_base[TDX_PG_1G];
> > + tdmr->pamt_1g_size = pamt_size[TDX_PG_1G];
> > +
> > + return 0;
> > +}
> >
> > +static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_pfn,
> > + unsigned long *pamt_npages)
> > +{
> > + unsigned long pamt_base, pamt_sz;
> > +
> > + /*
> > + * The PAMT was allocated in one contiguous unit. The 4K PAMT
> > + * should always point to the beginning of that allocation.
> > + */
> > + pamt_base = tdmr->pamt_4k_base;
> > + pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr-
> > >pamt_1g_size;
> > +
> > + *pamt_pfn = pamt_base >> PAGE_SHIFT;
> > + *pamt_npages = pamt_sz >> PAGE_SHIFT;
> > +}
> > +
> > +static void tdmr_free_pamt(struct tdmr_info *tdmr)
> > +{
> > + unsigned long pamt_pfn, pamt_npages;
> > +
> > + tdmr_get_pamt(tdmr, &pamt_pfn, &pamt_npages);
> > +
> > + /* Do nothing if PAMT hasn't been allocated for this TDMR */
> > + if (!pamt_npages)
> > + return;
> > +
> > + if (WARN_ON_ONCE(!pamt_pfn))
> > + return;
> > +
> > + free_contig_range(pamt_pfn, pamt_npages);
> > +}
> > +
> > +static void tdmrs_free_pamt_all(struct tdmr_info *tdmr_array, int tdmr_num)
> > +{
> > + int i;
> > +
> > + for (i = 0; i < tdmr_num; i++)
> > + tdmr_free_pamt(tdmr_array_entry(tdmr_array, i));
> > +}
> > +
> > +/* Allocate and set up PAMTs for all TDMRs */
> > +static int tdmrs_set_up_pamt_all(struct tdmr_info *tdmr_array, int
> > tdmr_num)
> > +{
> > + int i, ret = 0;
> > +
> > + for (i = 0; i < tdmr_num; i++) {
> > + ret = tdmr_set_up_pamt(tdmr_array_entry(tdmr_array, i));
> > + if (ret)
> > + goto err;
> > + }
> > +
> > + return 0;
> > +err:
> > + tdmrs_free_pamt_all(tdmr_array, tdmr_num);
> > + return ret;
> > +}
> > +
> > +static unsigned long tdmrs_get_pamt_pages(struct tdmr_info *tdmr_array,
> > + int tdmr_num)
>
> "get" is for refcounting. tdmrs_count_pamt_pages() would be preferable.

Will use 'count'. Thanks.

>
> > +{
> > + unsigned long pamt_npages = 0;
> > + int i;
> > +
> > + for (i = 0; i < tdmr_num; i++) {
> > + unsigned long pfn, npages;
> > +
> > + tdmr_get_pamt(tdmr_array_entry(tdmr_array, i), &pfn,
> > &npages);
> > + pamt_npages += npages;
> > + }
> > +
> > + return pamt_npages;
> > +}
> > +
> >  /*
> >   * Construct an array of TDMRs to cover all memory regions in memblock.
> >   * This makes sure all pages managed by the page allocator are TDX
> > @@ -572,8 +762,13 @@ static int construct_tdmrs_memeblock(struct tdmr_info
> > *tdmr_array,
> >   if (ret)
> >   goto err;
> >  
> > + ret = tdmrs_set_up_pamt_all(tdmr_array, *tdmr_num);
> > + if (ret)
> > + goto err;
> > +
> >   /* Return -EINVAL until constructing TDMRs is done */
> >   ret = -EINVAL;
> > + tdmrs_free_pamt_all(tdmr_array, *tdmr_num);
> >  err:
> >   return ret;
> >  }
> > @@ -644,6 +839,11 @@ static int init_tdx_module(void)
> >   * process are done.
> >   */
> >   ret = -EINVAL;
> > + if (ret)
> > + tdmrs_free_pamt_all(tdmr_array, tdmr_num);
> > + else
> > + pr_info("%lu pages allocated for PAMT.\n",
> > + tdmrs_get_pamt_pages(tdmr_array,
> > tdmr_num));
> >  out_free_tdmrs:
> >   /*
> >   * The array of TDMRs is freed no matter the initialization is
>
> The rest looks OK.

Thanks.

--
Thanks,
-Kai