Re: [PATCH v6 2/6] ksm: support unsharing zero pages placed by KSM

From: xu xin
Date: Sat Mar 11 2023 - 00:04:00 EST


I think your suggestions as follows are valuable. I will make adjustments according to the actual
situation.

Since patch series v6 have been merged into next branch, I think submitting a new patch is better
to reduce maintainers' workload.

>> Cc: Claudio Imbrenda <imbrenda@xxxxxxxxxxxxx>
>> Cc: Xuexin Jiang <jiang.xuexin@xxxxxxxxxx>
>> Reviewed-by: Xiaokai Ran <ran.xiaokai@xxxxxxxxxx>
>> Reviewed-by: Yang Yang <yang.yang29@xxxxxxxxxx>
>> ---
>> mm/ksm.c | 141 +++++++++++++++++++++++++++++++++++++++++++++++++--------------
>> 1 file changed, 111 insertions(+), 30 deletions(-)
>>
>> diff --git a/mm/ksm.c b/mm/ksm.c
>> index 905a79d213da..ab04b44679c8 100644

>> --- a/mm/ksm.c
>> +++ b/mm/ksm.c
>> @@ -214,6 +214,7 @@ struct ksm_rmap_item {
>> #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
>> #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
>> #define STABLE_FLAG 0x200 /* is listed from the stable tree */
>> +#define ZERO_PAGE_FLAG 0x400 /* is zero page placed by KSM */
>>
>> /* The stable and unstable tree heads */
>> static struct rb_root one_stable_tree[1] = { RB_ROOT };
>
> @@ -420,6 +421,11 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
>> return atomic_read(&mm->mm_users) == 0;
>> }
>>
>> +enum break_ksm_pmd_entry_return_flag {
>
>what about 0 ? I think it would look cleaner if every possible value
>was explicitly listed here
You're right. I'll fix it in a new patch.

>>> + HAVE_KSM_PAGE = 1,
>> + HAVE_ZERO_PAGE
>> +};
>> +
>> static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
>> struct mm_walk *walk)
>> {
>> @@ -427,6 +433,7 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
>> spinlock_t *ptl;
>> pte_t *pte;
>> int ret;
>> + bool is_zero_page = false;
>
>this ^ should probably be moved further up in the list of variables
>(i.e. reverse christmas tree)
>
Good. Fix it in a new patch.

>>
>> if (pmd_leaf(*pmd) || !pmd_present(*pmd))
>> return 0;
>> @@ -434,6 +441,8 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
>> pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
>> if (pte_present(*pte)) {
>> page = vm_normal_page(walk->vma, addr, *pte);
>> + if (!page)
>> + is_zero_page = is_zero_pfn(pte_pfn(*pte));
>> } else if (!pte_none(*pte)) {
>> swp_entry_t entry = pte_to_swp_entry(*pte);
>>
>> @@ -444,7 +453,14 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
>> if (is_migration_entry(entry))
>> page = pfn_swap_entry_to_page(entry);
>> }
>> - ret = page && PageKsm(page);
>> +
>> + if (page && PageKsm(page))
>> + ret = HAVE_KSM_PAGE;
>> + else if (is_zero_page)
>> + ret = HAVE_ZERO_PAGE;
>> + else
>> + ret = 0;
>
>and here it would be a great place to use the enum instead of
>hardcoding 0
>
Good. Fix it in a new patch.

>> +
>> pte_unmap_unlock(pte, ptl);
>> return ret;
>> }
>> @@ -466,19 +482,22 @@ static const struct mm_walk_ops break_ksm_ops = {
>> * of the process that owns 'vma'. We also do not want to enforce
>> * protection keys here anyway.
>
>please add a (short) explanation of when and why the new
>unshare_zero_page flag should be used.
>
>> */
>> -static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
>> +static int break_ksm(struct vm_area_struct *vma, unsigned long addr,
>> + bool unshare_zero_page)
>> {
>> vm_fault_t ret = 0;
>>
>> do {
>> - int ksm_page;
>> + int walk_result;
>>
>> cond_resched();
>> - ksm_page = walk_page_range_vma(vma, addr, addr + 1,
>> + walk_result = walk_page_range_vma(vma, addr, addr + 1,
>> &break_ksm_ops, NULL);
>> - if (WARN_ON_ONCE(ksm_page < 0))
>> - return ksm_page;
>> - if (!ksm_page)
>> + if (WARN_ON_ONCE(walk_result < 0))
>> + return walk_result;
>> + if (!walk_result)
>
>if (walk_result == ...)
>
Fine.

>> + return 0;
>> + if (walk_result == HAVE_ZERO_PAGE && !unshare_zero_page)
>> return 0;
>> ret = handle_mm_fault(vma, addr,
>> FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
>> @@ -539,7 +558,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item)
>> mmap_read_lock(mm);
>> vma = find_mergeable_vma(mm, addr);
>> if (vma)
>> - break_ksm(vma, addr);
>> + break_ksm(vma, addr, false);
>> mmap_read_unlock(mm);
>> }
>>
>> @@ -764,6 +783,30 @@ static struct page *get_ksm_page(struct ksm_stable_node *stable_node,
>> return NULL;
>> }
>>
>> +/*
>> + * Cleaning the rmap_item's ZERO_PAGE_FLAG
>
>this is not what you are doing, though. in case new flags are added,
>this function will cause problems.
>
>> + * This function will be called when unshare or writing on zero pages.
>> + */
>> +static inline void clean_rmap_item_zero_flag(struct ksm_rmap_item *rmap_item)
>> +{
>> + if (rmap_item->address & ZERO_PAGE_FLAG)
>> + rmap_item->address &= PAGE_MASK;
>
> ... &= ~ZERO_PAGE_FLAG;
>
>this way you only clear the flag, and you won't interfere with
>potential new flags that might be introduced in the future. (I
>really hope we won't need new flags in the future because the code is
>already complex enough as it is, but you never know)

How about 'rmap_item->address &= (PAGE_MASK | ~ZERO_PAGE_FLAG);' ?

>
>> +}
>> +
>> +/* Only called when rmap_item is going to be freed */
>> +static inline void unshare_zero_pages(struct ksm_rmap_item *rmap_item)
>> +{
>> + struct vm_area_struct *vma;
>> +
>> + if (rmap_item->address & ZERO_PAGE_FLAG) {
>> + vma = vma_lookup(rmap_item->mm, rmap_item->address);
>> + if (vma && !ksm_test_exit(rmap_item->mm))
>> + break_ksm(vma, rmap_item->address, true);
>> + }
>> + /* Put at last. */
>> + clean_rmap_item_zero_flag(rmap_item);
>> +}
>> +
>> /*
>> * Removing rmap_item from stable or unstable tree.
>> * This function will clean the information from the stable/unstable tree.
>> @@ -824,6 +867,7 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
>> struct ksm_rmap_item *rmap_item = *rmap_list;
>> *rmap_list = rmap_item->rmap_list;
>> remove_rmap_item_from_tree(rmap_item);
>> + unshare_zero_pages(rmap_item);
>> free_rmap_item(rmap_item);
>> }
>> }
>> @@ -853,7 +897,7 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
>> if (signal_pending(current))
>> err = -ERESTARTSYS;
>> else
>> - err = break_ksm(vma, addr);
>> + err = break_ksm(vma, addr, false);
>> }
>> return err;
>> }
>> @@ -2044,6 +2088,39 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item,
>> rmap_item->mm->ksm_merging_pages++;
>> }
>>
>> +static int try_to_merge_with_kernel_zero_page(struct ksm_rmap_item *rmap_item,
>> + struct page *page)
>
>this line is less than 100 columns, please don't break it ^
>

Fine. Fix in a new patch.
>> +{
>> + struct mm_struct *mm = rmap_item->mm;
>> + int err = 0;
>> +
>> + /*
>> + * It should not take ZERO_PAGE_FLAG because on one hand,
>> + * get_next_rmap_item don't return zero pages' rmap_item.
>> + * On the other hand, even if zero page was writen as
>> + * anonymous page, rmap_item has been cleaned after
>> + * stable_tree_search
>> + */
>> + if (!WARN_ON_ONCE(rmap_item->address & ZERO_PAGE_FLAG)) {
>> + struct vm_area_struct *vma;
>> +
>> + mmap_read_lock(mm);
>> + vma = find_mergeable_vma(mm, rmap_item->address);
>> + if (vma) {
>> + err = try_to_merge_one_page(vma, page,
>> + ZERO_PAGE(rmap_item->address));
>
>this line is also less than 100 columns, please don't break it ^
>
>> + if (!err)
>> + rmap_item->address |= ZERO_PAGE_FLAG;
>> + } else {
>> + /* If the vma is out of date, we do not need to continue. */
>> + err = 0;
>> + }
>> + mmap_read_unlock(mm);
>> + }
>> +
>> + return err;
>> +}
>> +
>> /*
>> * cmp_and_merge_page - first see if page can be merged into the stable tree;
>> * if not, compare checksum to previous and if it's the same, see if page can
>> @@ -2055,7 +2132,6 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item,
>> */
>> static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item)
>> {
>> - struct mm_struct *mm = rmap_item->mm;
>> struct ksm_rmap_item *tree_rmap_item;
>> struct page *tree_page = NULL;
>> struct ksm_stable_node *stable_node;
>> @@ -2092,6 +2168,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
>> }
>>
>> remove_rmap_item_from_tree(rmap_item);
>> + clean_rmap_item_zero_flag(rmap_item);
>>
>> if (kpage) {
>> if (PTR_ERR(kpage) == -EBUSY)
>> @@ -2128,29 +2205,16 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite
>> * Same checksum as an empty page. We attempt to merge it with the
>> * appropriate zero page if the user enabled this via sysfs.
>> */
>> - if (ksm_use_zero_pages && (checksum == zero_checksum)) {
>
>(like here, see comment below)
>
>> - struct vm_area_struct *vma;
>> -
>> - mmap_read_lock(mm);
>> - vma = find_mergeable_vma(mm, rmap_item->address);
>> - if (vma) {
>> - err = try_to_merge_one_page(vma, page,
>> - ZERO_PAGE(rmap_item->address));
>> - } else {
>> + if (ksm_use_zero_pages) {
>> + if (checksum == zero_checksum)
>
>if I see correctly, you end up with three ifs nested? why not just one
>if with all the conditions?
>
Yes, I thought three 'if' would be clearer in terms of structures. But let's not modify
this here for now, because I'm going to submit a patch about using static_key instead of
ksm_use_zero_pages.

>> /*
>> - * If the vma is out of date, we do not need to
>> - * continue.
>> + * In case of failure, the page was not really empty, so we
>> + * need to continue. Otherwise we're done.
>> */
>> - err = 0;
>> - }
>> - mmap_read_unlock(mm);
>> - /*
>> - * In case of failure, the page was not really empty, so we
>> - * need to continue. Otherwise we're done.
>> - */
>> - if (!err)
>> - return;
>> + if (!try_to_merge_with_kernel_zero_page(rmap_item, page))
>> + return;
>> }
>> +
>> tree_rmap_item =
>> unstable_tree_search_insert(rmap_item, page, &tree_page);
>> if (tree_rmap_item) {
>> @@ -2230,6 +2294,7 @@ static struct ksm_rmap_item *try_to_get_old_rmap_item(unsigned long addr,
>> * is ineligible or discarded, e.g. MADV_DONTNEED.
>> */
>> remove_rmap_item_from_tree(rmap_item);
>> + unshare_zero_pages(rmap_item);
>> free_rmap_item(rmap_item);
>> }
>>
>> @@ -2352,6 +2417,22 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>> }
>> if (is_zone_device_page(*page))
>> goto next_page;
>> + if (is_zero_pfn(page_to_pfn(*page))) {
>> + /*
>> + * To monitor ksm zero pages which becomes non-anonymous,
>> + * we have to save each rmap_item of zero pages by
>> + * try_to_get_old_rmap_item() walking on
>> + * ksm_scan.rmap_list, otherwise their rmap_items will be
>> + * freed by the next turn of get_next_rmap_item(). The
>> + * function get_next_rmap_item() will free all "skipped"
>> + * rmap_items because it thinks its areas as UNMERGEABLE.
>> + */
>> + rmap_item = try_to_get_old_rmap_item(ksm_scan.address,
>> + ksm_scan.rmap_list);
>> + if (rmap_item && (rmap_item->address & ZERO_PAGE_FLAG))
>> + ksm_scan.rmap_list = &rmap_item->rmap_list;
>> + goto next_page;
>> + }
>> if (PageAnon(*page)) {
>> flush_anon_page(vma, *page, ksm_scan.address);
>> flush_dcache_page(*page);
>