Re: [PATCH v12 22/29] HMM: mm add helper to update page table when migrating memory v3.

From: Aneesh Kumar K.V
Date: Mon Mar 21 2016 - 10:25:44 EST


Jérôme Glisse <jglisse@xxxxxxxxxx> writes:

> +
> + /* Try to fail early on. */
> + if (unlikely(anon_vma_prepare(vma)))
> + return -ENOMEM;
> +

What is this about?

> +retry:
> + lru_add_drain();
> + tlb_gather_mmu(&tlb, mm, range.start, range.end);
> + update_hiwater_rss(mm);
> + mmu_notifier_invalidate_range_start_excluding(mm, &range,
> + mmu_notifier_exclude);
> + tlb_start_vma(&tlb, vma);
> + for (addr = range.start, i = 0; addr < end && !ret;) {
> + unsigned long cstart, next, npages = 0;
> + spinlock_t *ptl;
> + pgd_t *pgdp;
> + pud_t *pudp;
> + pmd_t *pmdp;
> + pte_t *ptep;
> +
> + /*
> + * Pretty much the exact same logic as __handle_mm_fault(),
> + * exception being the handling of huge pmd.
> + */
> + pgdp = pgd_offset(mm, addr);
> + pudp = pud_alloc(mm, pgdp, addr);
> + if (!pudp) {
> + ret = -ENOMEM;
> + break;
> + }
> + pmdp = pmd_alloc(mm, pudp, addr);
> + if (!pmdp) {
> + ret = -ENOMEM;
> + break;
> + }
> + if (unlikely(pte_alloc(mm, pmdp, addr))) {
> + ret = -ENOMEM;
> + break;
> + }
> +
> + /*
> + * If a huge pmd materialized under us just retry later. Use
> + * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
> + * didn't become pmd_trans_huge under us and then back to pmd_none, as
> + * a result of MADV_DONTNEED running immediately after a huge pmd fault
> + * in a different thread of this mm, in turn leading to a misleading
> + * pmd_trans_huge() retval. All we have to ensure is that it is a
> + * regular pmd that we can walk with pte_offset_map() and we can do that
> + * through an atomic read in C, which is what pmd_trans_unstable()
> + * provides.
> + */
> + if (unlikely(pmd_trans_unstable(pmdp) || pmd_devmap(*pmdp))) {
> + ret = -EAGAIN;
> + break;
> + }
> +
> + /*
> + * If an huge pmd materialized from under us split it and break
> + * out of the loop to retry.
> + */
> + if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) {
> + split_huge_pmd(vma, addr, pmdp);
> + ret = -EAGAIN;
> + break;
> + }
> +
> + /*
> + * A regular pmd is established and it can't morph into a huge pmd
> + * from under us anymore at this point because we hold the mmap_sem
> + * read mode and khugepaged takes it in write mode. So now it's
> + * safe to run pte_offset_map().
> + */
> + ptep = pte_offset_map(pmdp, addr);
> +
> + /*
> + * A regular pmd is established and it can't morph into a huge
> + * pmd from under us anymore at this point because we hold the
> + * mmap_sem read mode and khugepaged takes it in write mode. So
> + * now it's safe to run pte_offset_map().
> + */
> + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);


Why is pte_offset_map() immediately followed by pte_offset_map_lock()?

> + for (i = (addr - start) >> PAGE_SHIFT, cstart = addr,
> + next = min((addr + PMD_SIZE) & PMD_MASK, end);
> + addr < next; addr += PAGE_SIZE, ptep++, i++) {
> + save_pte[i] = ptep_get_and_clear(mm, addr, ptep);
> + tlb_remove_tlb_entry(&tlb, ptep, addr);
> + set_pte_at(mm, addr, ptep, hmm_entry);
> +
> + if (pte_present(save_pte[i]))
> + continue;
> +
> + if (!pte_none(save_pte[i])) {
> + set_pte_at(mm, addr, ptep, save_pte[i]);
> + ret = -ENOENT;
> + ptep++;
> + break;
> + }

What is special about pte_none here? Why break out of the loop? I guess we
are checking for a swap pte? If so, why not use is_swap_pte()? Is it because
we have already checked pte_present()?

> + /*
> + * TODO: This mm_forbids_zeropage() really does not
> + * apply to us. First it seems only S390 have it set,
> + * second we are not even using the zero page entry
> + * to populate the CPU page table, thought on error
> + * we might use the save_pte entry to set the CPU
> + * page table entry.
> + *
> + * Live with that oddity for now.
> + */
> + if (mm_forbids_zeropage(mm)) {
> + pte_clear(mm, addr, &save_pte[i]);
> + npages++;
> + continue;
> + }
> + save_pte[i] = pte_mkspecial(pfn_pte(my_zero_pfn(addr),
> + vma->vm_page_prot));
> + }
> + pte_unmap_unlock(ptep - 1, ptl);
> +
> + /*
> + * So we must allocate pages before checking for error, which
> + * here indicate that one entry is a swap entry. We need to
> + * allocate first because otherwise there is no easy way to
> + * know on retry or in error code path wether the CPU page
> + * table locked HMM entry is ours or from some other thread.
> + */
> +
> + if (!npages)
> + continue;
> +
> + for (next = addr, addr = cstart,
> + i = (addr - start) >> PAGE_SHIFT;
> + addr < next; addr += PAGE_SIZE, i++) {
> + struct mem_cgroup *memcg;
> + struct page *page;
> +
> + if (pte_present(save_pte[i]) || !pte_none(save_pte[i]))
> + continue;
> +
> + page = alloc_zeroed_user_highpage_movable(vma, addr);
> + if (!page) {
> + ret = -ENOMEM;
> + break;
> + }
> + __SetPageUptodate(page);
> + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL,
> + &memcg, false)) {
> + page_cache_release(page);
> + ret = -ENOMEM;
> + break;
> + }
> + save_pte[i] = mk_pte(page, vma->vm_page_prot);
> + if (vma->vm_flags & VM_WRITE)
> + save_pte[i] = pte_mkwrite(save_pte[i]);

I guess this also needs to go?

> + inc_mm_counter_fast(mm, MM_ANONPAGES);
> + /*
> + * Because we set the page table entry to the special
> + * HMM locked entry we know no other process might do
> + * anything with it and thus we can safely account the
> + * page without holding any lock at this point.
> + */
> + page_add_new_anon_rmap(page, vma, addr, false);
> + mem_cgroup_commit_charge(page, memcg, false, false);
> + /*
> + * Add to active list so we know vmscan will not waste
> + * its time with that page while we are still using it.
> + */
> + lru_cache_add_active_or_unevictable(page, vma);
> + }
> + }
> + tlb_end_vma(&tlb, vma);
> + mmu_notifier_invalidate_range_end_excluding(mm, &range,
> + mmu_notifier_exclude);
> + tlb_finish_mmu(&tlb, range.start, range.end);
> +
> + if (backoff && *backoff) {
> + /* Stick to the range we updated. */
> + ret = -EAGAIN;
> + end = addr;
> + goto out;
> + }
> +
> + /* Check if something is missing or something went wrong. */
> + if (ret == -ENOENT) {
> + int flags = FAULT_FLAG_ALLOW_RETRY;
> +
> + do {
> + /*
> + * Using __handle_mm_fault() as current->mm != mm ie we
> + * might have been call from a kernel thread on behalf
> + * of a driver and all accounting handle_mm_fault() is
> + * pointless in our case.
> + */
> + ret = __handle_mm_fault(mm, vma, addr, flags);
> + flags |= FAULT_FLAG_TRIED;
> + } while ((ret & VM_FAULT_RETRY));
> + if ((ret & VM_FAULT_ERROR)) {
> + /* Stick to the range we updated. */
> + end = addr;
> + ret = -EFAULT;
> + goto out;
> + }
> + range.start = addr;
> + goto retry;
> + }
> + if (ret == -EAGAIN) {
> + range.start = addr;
> + goto retry;
> + }
> + if (ret)
> + /* Stick to the range we updated. */
> + end = addr;
> +
> + /*
> + * At this point no one else can take a reference on the page from this
> + * process CPU page table. So we can safely check wether we can migrate
> + * or not the page.
> + */
> +
> +out:
> + for (addr = start, i = 0; addr < end;) {
> + unsigned long next;
> + spinlock_t *ptl;
> + pgd_t *pgdp;
> + pud_t *pudp;
> + pmd_t *pmdp;
> + pte_t *ptep;
> +
> + /*
> + * We know for certain that we did set special swap entry for
> + * the range and HMM entry are mark as locked so it means that
> + * no one beside us can modify them which apply that all level
> + * of the CPU page table are valid.
> + */
> + pgdp = pgd_offset(mm, addr);
> + pudp = pud_offset(pgdp, addr);
> + VM_BUG_ON(!pudp);
> + pmdp = pmd_offset(pudp, addr);
> + VM_BUG_ON(!pmdp || pmd_bad(*pmdp) || pmd_none(*pmdp) ||
> + pmd_trans_huge(*pmdp));
> +
> + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> + for (next = min((addr + PMD_SIZE) & PMD_MASK, end),
> + i = (addr - start) >> PAGE_SHIFT; addr < next;
> + addr += PAGE_SIZE, ptep++, i++) {
> + struct page *page;
> + swp_entry_t entry;
> + int swapped;
> +
> + entry = pte_to_swp_entry(save_pte[i]);
> + if (is_hmm_entry(entry)) {
> + /*
> + * Logic here is pretty involve. If save_pte is
> + * an HMM special swap entry then it means that
> + * we failed to swap in that page so error must
> + * be set.
> + *
> + * If that's not the case than it means we are
> + * seriously screw.
> + */
> + VM_BUG_ON(!ret);
> + continue;
> + }
> +
> + /*
> + * This can not happen, no one else can replace our
> + * special entry and as range end is re-ajusted on
> + * error.
> + */
> + entry = pte_to_swp_entry(*ptep);
> + VM_BUG_ON(!is_hmm_entry_locked(entry));
> +
> + /* On error or backoff restore all the saved pte. */
> + if (ret)
> + goto restore;
> +
> + page = vm_normal_page(vma, addr, save_pte[i]);
> + /* The zero page is fine to migrate. */
> + if (!page)
> + continue;
> +
> + /*
> + * Check that only CPU mapping hold a reference on the
> + * page. To make thing simpler we just refuse bail out
> + * if page_mapcount() != page_count() (also accounting
> + * for swap cache).
> + *
> + * There is a small window here where wp_page_copy()
> + * might have decremented mapcount but have not yet
> + * decremented the page count. This is not an issue as
> + * we backoff in that case.
> + */
> + swapped = PageSwapCache(page);
> + if (page_mapcount(page) + swapped == page_count(page))
> + continue;
> +
> +restore:
> + /* Ok we have to restore that page. */
> + set_pte_at(mm, addr, ptep, save_pte[i]);
> + /*
> + * No need to invalidate - it was non-present
> + * before.
> + */
> + update_mmu_cache(vma, addr, ptep);
> + pte_clear(mm, addr, &save_pte[i]);
> + }
> + pte_unmap_unlock(ptep - 1, ptl);
> + }
> + return ret;
> +}
> +EXPORT_SYMBOL(mm_hmm_migrate);

-aneesh