Re: [PATCH v3 12/15] habanalabs: add virtual memory and MMU modules

From: Mike Rapoport
Date: Wed Feb 06 2019 - 10:15:03 EST


On Mon, Feb 04, 2019 at 10:32:51PM +0200, Oded Gabbay wrote:
> From: Omer Shpigelman <oshpigelman@xxxxxxxxx>
>
> This patch adds the Virtual Memory and MMU modules.
>
> Goya has an internal MMU which provides process isolation on the internal
> DDR. The internal MMU also performs translations for transactions that go
> from Goya to the Host.
>
> The driver is responsible for allocating and freeing memory on the DDR
> upon user request. It also provides an interface to map and unmap DDR and
> Host memory to the device address space.
>
> Signed-off-by: Omer Shpigelman <oshpigelman@xxxxxxxxx>
> Signed-off-by: Oded Gabbay <oded.gabbay@xxxxxxxxx>
> ---
> Changes in v3:
> - Remove use of three exclamation marks
> - Add optimization - support mapping for huge page host memory
> - Minor bug fixes
>
> drivers/misc/habanalabs/Makefile | 2 +-
> drivers/misc/habanalabs/context.c | 19 +-
> drivers/misc/habanalabs/device.c | 20 +-
> drivers/misc/habanalabs/goya/goya.c | 393 +++++
> drivers/misc/habanalabs/habanalabs.h | 188 +-
> drivers/misc/habanalabs/habanalabs_drv.c | 2 +-
> drivers/misc/habanalabs/habanalabs_ioctl.c | 3 +-
> .../include/hw_ip/mmu/mmu_general.h | 45 +
> .../habanalabs/include/hw_ip/mmu/mmu_v1_0.h | 15 +
> drivers/misc/habanalabs/memory.c | 1514 +++++++++++++++++
> drivers/misc/habanalabs/mmu.c | 604 +++++++
> include/uapi/misc/habanalabs.h | 122 +-
> 12 files changed, 2919 insertions(+), 8 deletions(-)
> create mode 100644 drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h
> create mode 100644 drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h
> create mode 100644 drivers/misc/habanalabs/mmu.c
>
> diff --git a/drivers/misc/habanalabs/Makefile b/drivers/misc/habanalabs/Makefile
> index 5167699701b9..2698bb6a2355 100644
> --- a/drivers/misc/habanalabs/Makefile
> +++ b/drivers/misc/habanalabs/Makefile
> @@ -6,7 +6,7 @@ obj-m := habanalabs.o
>
> habanalabs-y := habanalabs_drv.o device.o context.o asid.o habanalabs_ioctl.o \
> command_buffer.o hw_queue.o irq.o sysfs.o hwmon.o memory.o \
> - command_submission.o
> + command_submission.o mmu.o
>
> include $(src)/goya/Makefile
> habanalabs-y += $(HL_GOYA_FILES)

[ ... ]

> diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
> index 6536b40c63e0..cf0be4f254e0 100644
> --- a/drivers/misc/habanalabs/memory.c
> +++ b/drivers/misc/habanalabs/memory.c
> @@ -5,12 +5,1198 @@
> * All Rights Reserved.
> */
>
> +#include <uapi/misc/habanalabs.h>
> #include "habanalabs.h"
> +#include "include/hw_ip/mmu/mmu_general.h"
>
> #include <linux/sched.h>
> #include <linux/uaccess.h>
> #include <linux/genalloc.h>
>
> +#define PGS_IN_HPAGE (HPAGE_SIZE >> PAGE_SHIFT)
> +#define HL_MMU_DEBUG 0

[ ... ]

> +/*
> + * init_phys_pg_pack_from_userptr - initialize physical page pack from host
> + * memory
> + *
> + * @ctx : current context
> + * @userptr : userptr to initialize from
> + * @pphys_pg_pack : res pointer
> + *
> + * This function does the following:
> + * - Pin the physical pages related to the given virtual block
> + * - Create a physical page pack from the physical pages related to the given
> + * virtual block
> + */
> +static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
> + struct hl_userptr *userptr,
> + struct hl_vm_phys_pg_pack **pphys_pg_pack)
> +{
> + struct hl_vm_phys_pg_pack *phys_pg_pack;
> + struct scatterlist *sg;
> + dma_addr_t dma_addr;
> + u64 page_mask;
> + u32 npages, total_npages, page_size = PAGE_SIZE;
> + bool first = true, is_2mb_opt = true;

Now that you've mentioned 2M pages, I've started to wonder: what would
happen if PAGE_SIZE != 4K and HPAGE_SIZE != 2M?

There are architectures that may have different sizes for both...
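
For illustration, one way to avoid baking the host's page geometry into the
driver would be to spell out the sizes the device MMU actually supports (an
untested sketch; the HL_DEV_* names are hypothetical, not from the patch):

	#include <linux/sizes.h>

	/* page sizes the device MMU supports, independent of the host */
	#define HL_DEV_PAGE_SIZE	SZ_4K
	#define HL_DEV_HPAGE_SIZE	SZ_2M

	/* host pages per device huge page */
	#define PGS_IN_DEV_HPAGE	(HL_DEV_HPAGE_SIZE / PAGE_SIZE)

	/* e.g. in hl_mmu_init(): fail the build on hosts whose pages are
	 * too large to be covered by the device huge page size
	 */
	BUILD_BUG_ON(PAGE_SIZE > HL_DEV_HPAGE_SIZE);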

> + int rc, i, j;
> +
> + phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
> + if (!phys_pg_pack)
> + return -ENOMEM;
> +
> + phys_pg_pack->vm_type = userptr->vm_type;
> + phys_pg_pack->created_from_userptr = true;
> + phys_pg_pack->asid = ctx->asid;
> + atomic_set(&phys_pg_pack->mapping_cnt, 1);
> +
> + /* Only if all dma_addrs are aligned to 2MB and their
> + * sizes are at least 2MB, we can use huge page mapping.
> + * We limit the 2MB optimization to this condition,
> + * since later on we acquire the related VA range as one
> + * consecutive block.
> + */
> + total_npages = 0;
> + for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) {
> + npages = get_sg_info(sg, &dma_addr);
> +
> + total_npages += npages;
> +
> + if (first) {
> + first = false;
> + dma_addr &= HPAGE_MASK;
> + }
> +
> + if ((npages % PGS_IN_HPAGE) || (dma_addr & (HPAGE_SIZE - 1)))
> + is_2mb_opt = false;
> + }
> +
> + if (is_2mb_opt) {
> + page_size = HPAGE_SIZE;
> + total_npages /= PGS_IN_HPAGE;
> + }
> +
> + page_mask = ~(((u64) page_size) - 1);
> +
> + phys_pg_pack->pages = kcalloc(total_npages, sizeof(u64), GFP_KERNEL);
> + if (!phys_pg_pack->pages) {
> + rc = -ENOMEM;
> + goto page_pack_arr_mem_err;
> + }
> +
> + phys_pg_pack->npages = total_npages;
> + phys_pg_pack->page_size = page_size;
> + phys_pg_pack->total_size = total_npages * page_size;
> +
> + j = 0;
> + first = true;
> + for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) {
> + npages = get_sg_info(sg, &dma_addr);
> +
> + /* align down to physical page size and save the offset */
> + if (first) {
> + first = false;
> + phys_pg_pack->offset = dma_addr & (page_size - 1);
> + dma_addr &= page_mask;
> + }
> +
> + while (npages) {
> + phys_pg_pack->pages[j++] = dma_addr;
> + dma_addr += page_size;
> +
> + if (is_2mb_opt)
> + npages -= PGS_IN_HPAGE;
> + else
> + npages--;
> + }
> + }
> +
> + *pphys_pg_pack = phys_pg_pack;
> +
> + return 0;
> +
> +page_pack_arr_mem_err:
> + kfree(phys_pg_pack);
> +
> + return rc;
> +}
> +

[ ... ]

> diff --git a/drivers/misc/habanalabs/mmu.c b/drivers/misc/habanalabs/mmu.c
> new file mode 100644
> index 000000000000..054b72e48a49
> --- /dev/null
> +++ b/drivers/misc/habanalabs/mmu.c
> @@ -0,0 +1,604 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/*
> + * Copyright 2016-2018 HabanaLabs, Ltd.
> + * All Rights Reserved.
> + */
> +
> +#include "habanalabs.h"
> +#include "include/hw_ip/mmu/mmu_general.h"
> +
> +#include <linux/genalloc.h>
> +
> +#define HOP_NOT_FREED 0
> +#define HOP_FREED 1
> +
> +static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 addr)
> +{
> + struct pgt_info *pgt_info = NULL;
> +
> + hash_for_each_possible(ctx->mmu_hash, pgt_info, node,
> + (unsigned long) addr)
> + if (addr == pgt_info->addr)
> + break;
> +
> + return pgt_info;
> +}
> +
> +static void free_hop(struct hl_ctx *ctx, u64 hop_addr)
> +{
> + struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
> +
> + gen_pool_free(pgt_info->ctx->hdev->mmu_pgt_pool, pgt_info->addr,
> + ctx->hdev->asic_prop.mmu_hop_table_size);
> + hash_del(&pgt_info->node);
> +
> + kfree(pgt_info);
> +}
> +
> +static u64 alloc_hop(struct hl_ctx *ctx)
> +{
> + struct hl_device *hdev = ctx->hdev;
> + struct pgt_info *pgt_info;
> + u64 addr;
> +
> + pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL);
> + if (!pgt_info)
> + return ULLONG_MAX;
> +
> + addr = (u64) gen_pool_alloc(hdev->mmu_pgt_pool,
> + hdev->asic_prop.mmu_hop_table_size);
> + if (!addr) {
> + dev_err(hdev->dev, "failed to allocate page\n");
> + kfree(pgt_info);
> + return ULLONG_MAX;
> + }
> +
> + pgt_info->addr = addr;
> + pgt_info->ctx = ctx;
> + pgt_info->num_of_ptes = 0;
> + hash_add(ctx->mmu_hash, &pgt_info->node, addr);
> +
> + return addr;
> +}
> +
> +static inline void clear_pte(struct hl_device *hdev, u64 pte_addr)
> +{
> + /* clear the last and present bits */
> + hdev->asic_funcs->write_pte(hdev, pte_addr, 0);
> +}
> +
> +static inline void inc_num_of_ptes(struct hl_ctx *ctx, u64 hop_addr)
> +{
> + get_pgt_info(ctx, hop_addr)->num_of_ptes++;
> +}
> +
> +/*
> + * dec_num_of_ptes - decrement the num of ptes and free the hop if possible
> + *
> + * @ctx: pointer to the context structure
> + * @hop_addr: addr of the hop
> + *
> + * This function returns HOP_FREED if the hop was freed or HOP_NOT_FREED if the
> + * num of ptes was decremented without freeing the hop.
> + */
> +static inline int dec_num_of_ptes(struct hl_ctx *ctx, u64 hop_addr)

I think it's worth emphasizing that this function may free the hop. Maybe
even use {get,put}_pte instead of {inc,dec}_num_of_ptes.

> +{
> + struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
> +
> + pgt_info->num_of_ptes--;
> +
> + if (pgt_info->num_of_ptes == 0) {
> + free_hop(ctx, hop_addr);
> + return HOP_FREED;
> + }
> +
> + return HOP_NOT_FREED;

Why not simply return pgt_info->num_of_ptes?
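
Returning the count would also make the put side read naturally, e.g. (an
untested sketch, using the {get,put}_pte naming suggested above):

	static inline void get_pte(struct hl_ctx *ctx, u64 hop_addr)
	{
		get_pgt_info(ctx, hop_addr)->num_of_ptes++;
	}

	/* returns the number of ptes left in the hop; 0 means it was freed */
	static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr)
	{
		struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
		int num_of_ptes_left;

		pgt_info->num_of_ptes--;

		/* save the count before free_hop() frees pgt_info */
		num_of_ptes_left = pgt_info->num_of_ptes;
		if (!num_of_ptes_left)
			free_hop(ctx, hop_addr);

		return num_of_ptes_left;
	}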

> +}
> +
> +static inline u64 get_hop0_addr(struct hl_ctx *ctx)
> +{
> + return ctx->hdev->asic_prop.mmu_pgt_addr +
> + (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
> +}
> +
> +static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
> + u64 virt_addr)
> +{
> + return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
> + ((virt_addr & HOP0_MASK) >> HOP0_SHIFT);
> +}
> +
> +static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
> + u64 virt_addr)
> +{
> + return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
> + ((virt_addr & HOP1_MASK) >> HOP1_SHIFT);
> +}
> +
> +static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
> + u64 virt_addr)
> +{
> + return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
> + ((virt_addr & HOP2_MASK) >> HOP2_SHIFT);
> +}
> +
> +static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
> + u64 virt_addr)
> +{
> + return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
> + ((virt_addr & HOP3_MASK) >> HOP3_SHIFT);
> +}
> +
> +static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
> + u64 virt_addr)
> +{
> + return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
> + ((virt_addr & HOP4_MASK) >> HOP4_SHIFT);
> +}

These are almost clones of each other. I'd factor out

	static inline u64 get_hopN_pte(struct hl_ctx *ctx, u64 hop_addr,
					u64 vaddr, u64 mask, u64 shift);

and alias it for each level.
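
For instance (untested, following the signature above):

	static inline u64 get_hopN_pte(struct hl_ctx *ctx, u64 hop_addr,
					u64 vaddr, u64 mask, u64 shift)
	{
		return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
				((vaddr & mask) >> shift);
	}

	static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
					u64 virt_addr)
	{
		return get_hopN_pte(ctx, hop_addr, virt_addr, HOP0_MASK,
					HOP0_SHIFT);
	}

	/* ... and likewise for get_hop{1,2,3,4}_pte_addr() */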

> +static inline u64 get_next_hop_addr(u64 curr_pte)
> +{
> + if (curr_pte & PAGE_PRESENT_MASK)
> + return curr_pte & PHYS_ADDR_MASK;
> + else
> + return ULLONG_MAX;
> +}
> +
> +/*
> + * hl_mmu_init - init the mmu module
> + *
> + * @hdev: pointer to the habanalabs device structure
> + *
> + * This function does the following:
> + * - Allocate max_asid zeroed hop0 pgts so no mapping is available
> + * - Enable mmu in hw
> + * - Invalidate the mmu cache
> + * - Create a pool of pages for pgts
> + * - Returns 0 on success
> + *
> + * This function depends on DMA QMAN to be working!
> + */
> +int hl_mmu_init(struct hl_device *hdev)
> +{
> + struct asic_fixed_properties *prop = &hdev->asic_prop;
> + int rc;
> +
> + if (!hdev->mmu_enable)
> + return 0;
> +
> + /* MMU HW init was already done in device hw_init() */
> +
> + mutex_init(&hdev->mmu_cache_lock);
> +
> + hdev->mmu_pgt_pool =
> + gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);
> +
> + if (!hdev->mmu_pgt_pool) {
> + dev_err(hdev->dev, "Failed to create page gen pool\n");
> + rc = -ENOMEM;
> + goto err_pool_create;
> + }
> +
> + rc = gen_pool_add(hdev->mmu_pgt_pool, prop->mmu_pgt_addr +
> + prop->mmu_hop0_tables_total_size,
> + prop->mmu_pgt_size - prop->mmu_hop0_tables_total_size,
> + -1);
> + if (rc) {
> + dev_err(hdev->dev, "Failed to add memory to page gen pool\n");
> + goto err_pool_add;
> + }
> +
> + return 0;
> +
> +err_pool_add:
> + gen_pool_destroy(hdev->mmu_pgt_pool);
> +err_pool_create:
> + mutex_destroy(&hdev->mmu_cache_lock);
> +
> + return rc;
> +}
> +
> +/*
> + * hl_mmu_fini - release the mmu module.
> + *
> + * @hdev: pointer to the habanalabs device structure
> + *
> + * This function does the following:
> + * - Disable mmu in hw
> + * - free the pgts pool
> + *
> + * All ctxs should be freed before calling this func
> + */
> +void hl_mmu_fini(struct hl_device *hdev)
> +{
> + if (!hdev->mmu_enable)
> + return;
> +
> + gen_pool_destroy(hdev->mmu_pgt_pool);
> +
> + mutex_destroy(&hdev->mmu_cache_lock);
> +
> + /* MMU HW fini will be done in device hw_fini() */
> +}
> +
> +/*
> + * hl_mmu_ctx_init - init a ctx for using the mmu module
> + *
> + * @ctx: pointer to the context structure
> + *
> + * This function does the following:
> + * - Init a mutex to protect the concurrent mapping flow
> + * - Init a hash to hold all pgts related to this ctx
> + */
> +void hl_mmu_ctx_init(struct hl_ctx *ctx)
> +{
> + if (!ctx->hdev->mmu_enable)
> + return;
> +
> + mutex_init(&ctx->mmu_lock);
> + hash_init(ctx->mmu_hash);
> +}
> +
> +/*
> + * hl_mmu_ctx_fini - disable a ctx from using the mmu module
> + *
> + * @ctx: pointer to the context structure
> + *
> + * This function does the following:
> + * - Free any pgts which were not freed yet
> + * - Free the mutex
> + */
> +void hl_mmu_ctx_fini(struct hl_ctx *ctx)
> +{
> + struct pgt_info *pgt_info;
> + struct hlist_node *tmp;
> + int i;
> +
> + if (!ctx->hdev->mmu_enable)
> + return;
> +
> + if (!hash_empty(ctx->mmu_hash))
> + dev_err(ctx->hdev->dev,
> + "ctx is freed while it has pgts in use\n");
> +
> + hash_for_each_safe(ctx->mmu_hash, i, tmp, pgt_info, node) {
> + dev_err(ctx->hdev->dev,
> + "pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n",
> + pgt_info->addr, ctx->asid, pgt_info->num_of_ptes);
> + free_hop(ctx, pgt_info->addr);
> + }
> +
> + mutex_destroy(&ctx->mmu_lock);
> +}
> +
> +/*
> + * hl_mmu_map - maps a virtual addr to physical addr
> + *
> + * @ctx: pointer to the context structure
> + * @virt_addr: virt addr to map from
> + * @phys_addr: phys addr to map to
> + * @page_size: physical page size
> + *
> + * This function does the following:
> + * - Check that the virt addr is not mapped
> + * - Allocate pgts as necessary in order to map the virt addr to the phys
> + * - Returns 0 on success, -EINVAL if addr is already mapped, or -ENOMEM.
> + *
> + * Because this function changes the page tables in the device and because it
> + * changes the MMU hash, it must be protected by a lock.
> + * However, because it maps only a single page, the lock should be implemented
> + * in a higher level in order to protect the entire mapping of the memory area
> + */
> +int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size)
> +{
> + struct hl_device *hdev = ctx->hdev;
> + u64 hop0_addr = 0, hop0_pte_addr = 0,
> + hop1_addr = 0, hop1_pte_addr = 0,
> + hop2_addr = 0, hop2_pte_addr = 0,
> + hop3_addr = 0, hop3_pte_addr = 0,
> + hop4_addr = 0, hop4_pte_addr = 0,
> + curr_pte = 0;
> + int hop1_new = 0, hop2_new = 0, hop3_new = 0, hop4_new = 0,
> + rc = -ENOMEM;
> + bool is_huge;
> +
> + if (!hdev->mmu_enable)
> + return 0;
> +
> + /*
> + * This mapping function can map a 4KB/2MB page. For 2MB page there are
> + * only 3 hops rather than 4. Currently the DRAM allocation uses 2MB
> + * pages only but user memory could have been allocated with one of the
> + * two page sizes. Since this code is common to all three cases, we
> + * need this huge page check.
> + */
> + is_huge = page_size == PAGE_SIZE_2MB;
> +
> + hop0_addr = get_hop0_addr(ctx);
> +
> + hop0_pte_addr = get_hop0_pte_addr(ctx, hop0_addr, virt_addr);
> +
> + curr_pte = hdev->asic_funcs->read_pte(hdev, hop0_pte_addr);
> +
> + hop1_addr = get_next_hop_addr(curr_pte);
> +
> + if (hop1_addr == ULLONG_MAX) {
> + hop1_addr = alloc_hop(ctx);
> + if (hop1_addr == ULLONG_MAX)
> + goto err;
> + else
> + hop1_new = 1;
> + }

The allocation can be folded into get_alloc_next_hop_addr() along with
get_next_hop_addr().
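
Something like (untested; the hopN_new flags in the callers would become
bool):

	static u64 get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte,
						bool *is_new_hop)
	{
		u64 hop_addr = get_next_hop_addr(curr_pte);

		if (hop_addr == ULLONG_MAX) {
			hop_addr = alloc_hop(ctx);
			*is_new_hop = (hop_addr != ULLONG_MAX);
		}

		return hop_addr;
	}

so each level collapses to

	hop1_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop1_new);
	if (hop1_addr == ULLONG_MAX)
		goto err;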

> +
> + hop1_pte_addr = get_hop1_pte_addr(ctx, hop1_addr, virt_addr);
> +
> + curr_pte = hdev->asic_funcs->read_pte(hdev, hop1_pte_addr);
> +
> + hop2_addr = get_next_hop_addr(curr_pte);
> +
> + if (hop2_addr == ULLONG_MAX) {
> + hop2_addr = alloc_hop(ctx);
> + if (hop2_addr == ULLONG_MAX)
> + goto err;
> + else
> + hop2_new = 1;
> + }
> +
> + hop2_pte_addr = get_hop2_pte_addr(ctx, hop2_addr, virt_addr);
> +
> + curr_pte = hdev->asic_funcs->read_pte(hdev, hop2_pte_addr);
> +
> + hop3_addr = get_next_hop_addr(curr_pte);
> +
> + if (hop3_addr == ULLONG_MAX) {
> + hop3_addr = alloc_hop(ctx);
> + if (hop3_addr == ULLONG_MAX)
> + goto err;
> + else
> + hop3_new = 1;
> + }
> +
> + hop3_pte_addr = get_hop3_pte_addr(ctx, hop3_addr, virt_addr);
> +
> + curr_pte = hdev->asic_funcs->read_pte(hdev, hop3_pte_addr);
> +
> + if (!is_huge) {
> + hop4_addr = get_next_hop_addr(curr_pte);
> +
> + if (hop4_addr == ULLONG_MAX) {
> + hop4_addr = alloc_hop(ctx);
> + if (hop4_addr == ULLONG_MAX)
> + goto err;
> + else
> + hop4_new = 1;
> + }
> +
> + hop4_pte_addr = get_hop4_pte_addr(ctx, hop4_addr, virt_addr);
> +
> + curr_pte = hdev->asic_funcs->read_pte(hdev, hop4_pte_addr);
> + }
> +
> + if (curr_pte & PAGE_PRESENT_MASK) {
> + dev_err(hdev->dev,
> + "mapping already exists for virt_addr 0x%llx\n",
> + virt_addr);
> +
> + dev_dbg(hdev->dev, "hop0 pte: 0x%llx (0x%llx)\n",
> + hdev->asic_funcs->read_pte(hdev, hop0_pte_addr),
> + hop0_pte_addr);
> + dev_dbg(hdev->dev, "hop1 pte: 0x%llx (0x%llx)\n",
> + hdev->asic_funcs->read_pte(hdev, hop1_pte_addr),
> + hop1_pte_addr);
> + dev_dbg(hdev->dev, "hop2 pte: 0x%llx (0x%llx)\n",
> + hdev->asic_funcs->read_pte(hdev, hop2_pte_addr),
> + hop2_pte_addr);
> + dev_dbg(hdev->dev, "hop3 pte: 0x%llx (0x%llx)\n",
> + hdev->asic_funcs->read_pte(hdev, hop3_pte_addr),
> + hop3_pte_addr);
> +
> + if (!is_huge)
> + dev_dbg(hdev->dev, "hop4 pte: 0x%llx (0x%llx)\n",
> + hdev->asic_funcs->read_pte(hdev,
> + hop4_pte_addr),
> + hop4_pte_addr);
> +
> + rc = -EINVAL;
> + goto err;
> + }
> +
> + curr_pte = (phys_addr & PTE_PHYS_ADDR_MASK) | LAST_MASK
> + | PAGE_PRESENT_MASK;
> +
> + hdev->asic_funcs->write_pte(hdev,
> + is_huge ? hop3_pte_addr : hop4_pte_addr,
> + curr_pte);
> +
> + if (hop1_new) {
> + curr_pte = (hop1_addr & PTE_PHYS_ADDR_MASK) |
> + PAGE_PRESENT_MASK;
> + ctx->hdev->asic_funcs->write_pte(ctx->hdev, hop0_pte_addr,
> + curr_pte);
> + }
> + if (hop2_new) {
> + curr_pte = (hop2_addr & PTE_PHYS_ADDR_MASK) |
> + PAGE_PRESENT_MASK;
> + ctx->hdev->asic_funcs->write_pte(ctx->hdev, hop1_pte_addr,
> + curr_pte);
> + inc_num_of_ptes(ctx, hop1_addr);
> + }
> + if (hop3_new) {
> + curr_pte = (hop3_addr & PTE_PHYS_ADDR_MASK) |
> + PAGE_PRESENT_MASK;
> + ctx->hdev->asic_funcs->write_pte(ctx->hdev, hop2_pte_addr,
> + curr_pte);
> + inc_num_of_ptes(ctx, hop2_addr);
> + }
> +
> + if (!is_huge) {
> + if (hop4_new) {
> + curr_pte = (hop4_addr & PTE_PHYS_ADDR_MASK) |
> + PAGE_PRESENT_MASK;
> + ctx->hdev->asic_funcs->write_pte(ctx->hdev,
> + hop3_pte_addr, curr_pte);
> + inc_num_of_ptes(ctx, hop3_addr);
> + }
> +
> + inc_num_of_ptes(ctx, hop4_addr);
> + } else
> + inc_num_of_ptes(ctx, hop3_addr);

Wouldn't hop3 be incremented twice for 2M pages?

> +
> + /* flush all writes from all cores to reach PCI */
> + mb();
> +
> + hdev->asic_funcs->read_pte(hdev,
> + is_huge ? hop3_pte_addr : hop4_pte_addr);
> +
> + return 0;
> +
> +err:
> + if (hop4_new)
> + free_hop(ctx, hop4_addr);
> + if (hop3_new)
> + free_hop(ctx, hop3_addr);
> + if (hop2_new)
> + free_hop(ctx, hop2_addr);
> + if (hop1_new)
> + free_hop(ctx, hop1_addr);
> +
> + return rc;
> +}
> +
> +/*
> + * hl_mmu_unmap - unmaps a virtual addr
> + *
> + * @ctx: pointer to the context structure
> + * @virt_addr: virt addr to unmap
> + *
> + * This function does the following:
> + * - Check that the virt addr is mapped
> + * - Unmap the virt addr and free pgts if possible
> + * - Returns 0 on success, -EINVAL if the given addr is not mapped
> + *
> + * Because this function changes the page tables in the device and because it
> + * changes the MMU hash, it must be protected by a lock.
> + * However, because it unmaps only a single page, the lock should be implemented
> + * in a higher level in order to protect the entire mapping of the memory area
> + */
> +int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr)
> +{
> + struct hl_device *hdev = ctx->hdev;
> + u64 hop0_addr = 0, hop0_pte_addr = 0,
> + hop1_addr = 0, hop1_pte_addr = 0,
> + hop2_addr = 0, hop2_pte_addr = 0,
> + hop3_addr = 0, hop3_pte_addr = 0,
> + hop4_addr = 0, hop4_pte_addr = 0,
> + curr_pte;
> + int clear_hop3 = 1;
> +
> + if (!hdev->mmu_enable)
> + return 0;
> +
> + hop0_addr = get_hop0_addr(ctx);
> +
> + hop0_pte_addr = get_hop0_pte_addr(ctx, hop0_addr, virt_addr);
> +
> + curr_pte = hdev->asic_funcs->read_pte(hdev, hop0_pte_addr);
> +
> + hop1_addr = get_next_hop_addr(curr_pte);
> +
> + if (hop1_addr == ULLONG_MAX)
> + goto not_mapped;
> +
> + hop1_pte_addr = get_hop1_pte_addr(ctx, hop1_addr, virt_addr);
> +
> + curr_pte = hdev->asic_funcs->read_pte(hdev, hop1_pte_addr);
> +
> + hop2_addr = get_next_hop_addr(curr_pte);
> +
> + if (hop2_addr == ULLONG_MAX)
> + goto not_mapped;
> +
> + hop2_pte_addr = get_hop2_pte_addr(ctx, hop2_addr, virt_addr);
> +
> + curr_pte = hdev->asic_funcs->read_pte(hdev, hop2_pte_addr);
> +
> + hop3_addr = get_next_hop_addr(curr_pte);
> +
> + if (hop3_addr == ULLONG_MAX)
> + goto not_mapped;
> +
> + hop3_pte_addr = get_hop3_pte_addr(ctx, hop3_addr, virt_addr);
> +
> + curr_pte = hdev->asic_funcs->read_pte(hdev, hop3_pte_addr);
> +
> + if (!(curr_pte & LAST_MASK)) {
> + hop4_addr = get_next_hop_addr(curr_pte);
> +
> + if (hop4_addr == ULLONG_MAX)
> + goto not_mapped;
> +
> + hop4_pte_addr = get_hop4_pte_addr(ctx, hop4_addr, virt_addr);
> +
> + curr_pte = hdev->asic_funcs->read_pte(hdev, hop4_pte_addr);
> +
> + clear_hop3 = 0;
> + }
> +
> + if (!(curr_pte & PAGE_PRESENT_MASK))
> + goto not_mapped;
> +
> + clear_pte(hdev, hop4_addr ? hop4_pte_addr : hop3_pte_addr);
> +
> + if (hop4_addr && dec_num_of_ptes(ctx, hop4_addr) == HOP_FREED)
> + clear_hop3 = 1;
> +
> + if (clear_hop3) {
> + clear_pte(hdev, hop3_pte_addr);
> + if (dec_num_of_ptes(ctx, hop3_addr) == HOP_FREED) {
> + clear_pte(hdev, hop2_pte_addr);
> + if (dec_num_of_ptes(ctx, hop2_addr) == HOP_FREED) {
> + clear_pte(hdev, hop1_pte_addr);
> + if (dec_num_of_ptes(ctx, hop1_addr) ==
> + HOP_FREED)
> + clear_pte(hdev, hop0_pte_addr);
> + }
> + }
> + }

Consider inverting the logic and doing something like

	if (dec_num_of_ptes(...) != HOP_FREED)
		goto flush;
	clear_pte(...);
	if (...)
		goto flush;
	...
flush:
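
Spelled out for hl_mmu_unmap(), the cascade flattens to (an untested
sketch, meant to be equivalent to the nested version above):

	clear_pte(hdev, hop4_addr ? hop4_pte_addr : hop3_pte_addr);

	if (hop4_addr && dec_num_of_ptes(ctx, hop4_addr) == HOP_NOT_FREED)
		goto flush;

	clear_pte(hdev, hop3_pte_addr);
	if (dec_num_of_ptes(ctx, hop3_addr) == HOP_NOT_FREED)
		goto flush;

	clear_pte(hdev, hop2_pte_addr);
	if (dec_num_of_ptes(ctx, hop2_addr) == HOP_NOT_FREED)
		goto flush;

	clear_pte(hdev, hop1_pte_addr);
	if (dec_num_of_ptes(ctx, hop1_addr) == HOP_FREED)
		clear_pte(hdev, hop0_pte_addr);

flush:
	/* flush all writes from all cores to reach PCI, as above */
	mb();

And if dec_num_of_ptes() returned the remaining count instead, each test
would simply be "if (put_pte(...)) goto flush;".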

> +
> + /* flush all writes from all cores to reach PCI */
> + mb();
> +
> + hdev->asic_funcs->read_pte(hdev,
> + hop4_addr ? hop4_pte_addr : hop3_pte_addr);
> +
> + return 0;
> +
> +not_mapped:
> + dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n",
> + virt_addr);
> +
> + return -EINVAL;
> +}

[ ... ]

> --
> 2.17.1
>

--
Sincerely yours,
Mike.