[PATCH] habanalabs: Manipulate DMA addresses in ASIC functions

From: Oded Gabbay
Date: Wed May 01 2019 - 09:24:20 EST


From: Tomer Tayar <ttayar@xxxxxxxxx>

Routing device accesses to the host memory requires the usage of a base
offset, which is canceled by the iATU just before leaving the device.
The value of the base offset might differ between ASIC types.
This address manipulation is currently performed throughout the
driver code, and one must be aware of it whenever providing a host
memory address to the device.
This patch removes this manipulation from the driver common code, and
moves it to the ASIC specific functions that are responsible for
host memory allocation/mapping.

Signed-off-by: Tomer Tayar <ttayar@xxxxxxxxx>
Signed-off-by: Oded Gabbay <oded.gabbay@xxxxxxxxx>
---
drivers/misc/habanalabs/firmware_if.c | 7 +--
drivers/misc/habanalabs/goya/goya.c | 90 +++++++++++++++++----------
drivers/misc/habanalabs/habanalabs.h | 10 ++-
drivers/misc/habanalabs/hw_queue.c | 6 +-
drivers/misc/habanalabs/memory.c | 4 --
drivers/misc/habanalabs/pci.c | 11 ++--
6 files changed, 72 insertions(+), 56 deletions(-)

diff --git a/drivers/misc/habanalabs/firmware_if.c b/drivers/misc/habanalabs/firmware_if.c
index 1acf82650b20..eda5d7fcb79f 100644
--- a/drivers/misc/habanalabs/firmware_if.c
+++ b/drivers/misc/habanalabs/firmware_if.c
@@ -249,8 +249,7 @@ int hl_fw_armcp_info_get(struct hl_device *hdev)

pkt.ctl = cpu_to_le32(ARMCP_PACKET_INFO_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
- pkt.addr = cpu_to_le64(armcp_info_dma_addr +
- prop->host_phys_base_address);
+ pkt.addr = cpu_to_le64(armcp_info_dma_addr);
pkt.data_max_size = cpu_to_le32(sizeof(struct armcp_info));

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
@@ -281,7 +280,6 @@ int hl_fw_armcp_info_get(struct hl_device *hdev)

int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
{
- struct asic_fixed_properties *prop = &hdev->asic_prop;
struct armcp_packet pkt = {};
void *eeprom_info_cpu_addr;
dma_addr_t eeprom_info_dma_addr;
@@ -301,8 +299,7 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)

pkt.ctl = cpu_to_le32(ARMCP_PACKET_EEPROM_DATA_GET <<
ARMCP_PKT_CTL_OPCODE_SHIFT);
- pkt.addr = cpu_to_le64(eeprom_info_dma_addr +
- prop->host_phys_base_address);
+ pkt.addr = cpu_to_le64(eeprom_info_dma_addr);
pkt.data_max_size = cpu_to_le32(max_size);

rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 8e18c80a22e7..31dc3b872f9e 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -345,7 +345,6 @@ void goya_get_fixed_properties(struct hl_device *hdev)
prop->mmu_hop0_tables_total_size = HOP0_TABLES_TOTAL_SIZE;
prop->dram_page_size = PAGE_SIZE_2MB;

- prop->host_phys_base_address = HOST_PHYS_BASE;
prop->va_space_host_start_address = VA_HOST_SPACE_START;
prop->va_space_host_end_address = VA_HOST_SPACE_END;
prop->va_space_dram_start_address = VA_DDR_SPACE_START;
@@ -422,7 +421,7 @@ static u64 goya_set_ddr_bar_base(struct hl_device *hdev, u64 addr)
static int goya_init_iatu(struct hl_device *hdev)
{
return hl_pci_init_iatu(hdev, SRAM_BASE_ADDR, DRAM_PHYS_BASE,
- HOST_PHYS_SIZE);
+ HOST_PHYS_BASE, HOST_PHYS_SIZE);
}

/*
@@ -804,7 +803,6 @@ void goya_init_dma_qmans(struct hl_device *hdev)
{
struct goya_device *goya = hdev->asic_specific;
struct hl_hw_queue *q;
- dma_addr_t bus_address;
int i;

if (goya->hw_cap_initialized & HW_CAP_DMA)
@@ -813,10 +811,7 @@ void goya_init_dma_qmans(struct hl_device *hdev)
q = &hdev->kernel_queues[0];

for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++, q++) {
- bus_address = q->bus_address +
- hdev->asic_prop.host_phys_base_address;
-
- goya_init_dma_qman(hdev, i, bus_address);
+ goya_init_dma_qman(hdev, i, q->bus_address);
goya_init_dma_ch(hdev, i);
}

@@ -957,7 +952,6 @@ int goya_init_cpu_queues(struct hl_device *hdev)
{
struct goya_device *goya = hdev->asic_specific;
struct hl_eq *eq;
- dma_addr_t bus_address;
u32 status;
struct hl_hw_queue *cpu_pq = &hdev->kernel_queues[GOYA_QUEUE_ID_CPU_PQ];
int err;
@@ -970,19 +964,18 @@ int goya_init_cpu_queues(struct hl_device *hdev)

eq = &hdev->event_queue;

- bus_address = cpu_pq->bus_address +
- hdev->asic_prop.host_phys_base_address;
- WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_0, lower_32_bits(bus_address));
- WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_1, upper_32_bits(bus_address));
+ WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_0,
+ lower_32_bits(cpu_pq->bus_address));
+ WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_1,
+ upper_32_bits(cpu_pq->bus_address));

- bus_address = eq->bus_address + hdev->asic_prop.host_phys_base_address;
- WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_2, lower_32_bits(bus_address));
- WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_3, upper_32_bits(bus_address));
+ WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_2, lower_32_bits(eq->bus_address));
+ WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_3, upper_32_bits(eq->bus_address));

- bus_address = hdev->cpu_accessible_dma_address +
- hdev->asic_prop.host_phys_base_address;
- WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_8, lower_32_bits(bus_address));
- WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_9, upper_32_bits(bus_address));
+ WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_8,
+ lower_32_bits(hdev->cpu_accessible_dma_address));
+ WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_9,
+ upper_32_bits(hdev->cpu_accessible_dma_address));

WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_5, HL_QUEUE_SIZE_IN_BYTES);
WREG32(mmPSOC_GLOBAL_CONF_SCRATCHPAD_4, HL_EQ_SIZE_IN_BYTES);
@@ -2731,13 +2724,23 @@ void goya_flush_pq_write(struct hl_device *hdev, u64 *pq, u64 exp_val)
static void *goya_dma_alloc_coherent(struct hl_device *hdev, size_t size,
dma_addr_t *dma_handle, gfp_t flags)
{
- return dma_alloc_coherent(&hdev->pdev->dev, size, dma_handle, flags);
+ void *kernel_addr = dma_alloc_coherent(&hdev->pdev->dev, size,
+ dma_handle, flags);
+
+ /* Shift to the device's base physical address of host memory */
+ if (kernel_addr)
+ *dma_handle += HOST_PHYS_BASE;
+
+ return kernel_addr;
}

static void goya_dma_free_coherent(struct hl_device *hdev, size_t size,
void *cpu_addr, dma_addr_t dma_handle)
{
- dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, dma_handle);
+ /* Cancel the device's base physical address of host memory */
+ dma_addr_t fixed_dma_handle = dma_handle - HOST_PHYS_BASE;
+
+ dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, fixed_dma_handle);
}

void *goya_get_int_queue_base(struct hl_device *hdev, u32 queue_id,
@@ -2848,8 +2851,7 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job)
(1 << GOYA_PKT_CTL_MB_SHIFT);
fence_pkt->ctl = cpu_to_le32(tmp);
fence_pkt->value = cpu_to_le32(GOYA_QMAN0_FENCE_VAL);
- fence_pkt->addr = cpu_to_le64(fence_dma_addr +
- hdev->asic_prop.host_phys_base_address);
+ fence_pkt->addr = cpu_to_le64(fence_dma_addr);

rc = hl_hw_queue_send_cb_no_cmpl(hdev, GOYA_QUEUE_ID_DMA_0,
job->job_cb_size, cb->bus_address);
@@ -2928,8 +2930,7 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
(1 << GOYA_PKT_CTL_MB_SHIFT);
fence_pkt->ctl = cpu_to_le32(tmp);
fence_pkt->value = cpu_to_le32(fence_val);
- fence_pkt->addr = cpu_to_le64(fence_dma_addr +
- hdev->asic_prop.host_phys_base_address);
+ fence_pkt->addr = cpu_to_le64(fence_dma_addr);

rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id,
sizeof(struct packet_msg_prot),
@@ -3001,16 +3002,27 @@ int goya_test_queues(struct hl_device *hdev)
static void *goya_dma_pool_zalloc(struct hl_device *hdev, size_t size,
gfp_t mem_flags, dma_addr_t *dma_handle)
{
+ void *kernel_addr;
+
if (size > GOYA_DMA_POOL_BLK_SIZE)
return NULL;

- return dma_pool_zalloc(hdev->dma_pool, mem_flags, dma_handle);
+ kernel_addr = dma_pool_zalloc(hdev->dma_pool, mem_flags, dma_handle);
+
+ /* Shift to the device's base physical address of host memory */
+ if (kernel_addr)
+ *dma_handle += HOST_PHYS_BASE;
+
+ return kernel_addr;
}

static void goya_dma_pool_free(struct hl_device *hdev, void *vaddr,
dma_addr_t dma_addr)
{
- dma_pool_free(hdev->dma_pool, vaddr, dma_addr);
+ /* Cancel the device's base physical address of host memory */
+ dma_addr_t fixed_dma_addr = dma_addr - HOST_PHYS_BASE;
+
+ dma_pool_free(hdev->dma_pool, vaddr, fixed_dma_addr);
}

void *goya_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
@@ -3025,19 +3037,33 @@ void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
hl_fw_cpu_accessible_dma_pool_free(hdev, size, vaddr);
}

-static int goya_dma_map_sg(struct hl_device *hdev, struct scatterlist *sg,
+static int goya_dma_map_sg(struct hl_device *hdev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir)
{
- if (!dma_map_sg(&hdev->pdev->dev, sg, nents, dir))
+ struct scatterlist *sg;
+ int i;
+
+ if (!dma_map_sg(&hdev->pdev->dev, sgl, nents, dir))
return -ENOMEM;

+ /* Shift to the device's base physical address of host memory */
+ for_each_sg(sgl, sg, nents, i)
+ sg->dma_address += HOST_PHYS_BASE;
+
return 0;
}

-static void goya_dma_unmap_sg(struct hl_device *hdev, struct scatterlist *sg,
+static void goya_dma_unmap_sg(struct hl_device *hdev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir)
{
- dma_unmap_sg(&hdev->pdev->dev, sg, nents, dir);
+ struct scatterlist *sg;
+ int i;
+
+ /* Cancel the device's base physical address of host memory */
+ for_each_sg(sgl, sg, nents, i)
+ sg->dma_address -= HOST_PHYS_BASE;
+
+ dma_unmap_sg(&hdev->pdev->dev, sgl, nents, dir);
}

u32 goya_get_dma_desc_list_size(struct hl_device *hdev, struct sg_table *sgt)
@@ -3589,8 +3615,6 @@ static int goya_patch_dma_packet(struct hl_device *hdev,
new_dma_pkt->ctl = cpu_to_le32(ctl);
new_dma_pkt->tsize = cpu_to_le32((u32) len);

- dma_addr += hdev->asic_prop.host_phys_base_address;
-
if (dir == DMA_TO_DEVICE) {
new_dma_pkt->src_addr = cpu_to_le64(dma_addr);
new_dma_pkt->dst_addr = cpu_to_le64(device_memory_addr);
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index b64594be6dbd..f08f71982585 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -135,8 +135,6 @@ enum hl_device_hw_state {
* @dram_user_base_address: DRAM physical start address for user access.
* @dram_size: DRAM total size.
* @dram_pci_bar_size: size of PCI bar towards DRAM.
- * @host_phys_base_address: base physical address of host memory for
- * transactions that the device generates.
* @max_power_default: max power of the device after reset
* @va_space_host_start_address: base address of virtual memory range for
* mapping host memory.
@@ -184,7 +182,6 @@ struct asic_fixed_properties {
u64 dram_user_base_address;
u64 dram_size;
u64 dram_pci_bar_size;
- u64 host_phys_base_address;
u64 max_power_default;
u64 va_space_host_start_address;
u64 va_space_host_end_address;
@@ -537,11 +534,11 @@ struct hl_asic_funcs {
void (*cpu_accessible_dma_pool_free)(struct hl_device *hdev,
size_t size, void *vaddr);
void (*hl_dma_unmap_sg)(struct hl_device *hdev,
- struct scatterlist *sg, int nents,
+ struct scatterlist *sgl, int nents,
enum dma_data_direction dir);
int (*cs_parser)(struct hl_device *hdev, struct hl_cs_parser *parser);
int (*asic_dma_map_sg)(struct hl_device *hdev,
- struct scatterlist *sg, int nents,
+ struct scatterlist *sgl, int nents,
enum dma_data_direction dir);
u32 (*get_dma_desc_list_size)(struct hl_device *hdev,
struct sg_table *sgt);
@@ -1450,7 +1447,8 @@ int hl_pci_iatu_write(struct hl_device *hdev, u32 addr, u32 data);
int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar,
u64 addr);
int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address,
- u64 dram_base_address, u64 host_phys_size);
+ u64 dram_base_address, u64 host_phys_base_address,
+ u64 host_phys_size);
int hl_pci_init(struct hl_device *hdev, u8 dma_mask);
void hl_pci_fini(struct hl_device *hdev);
int hl_pci_set_dma_mask(struct hl_device *hdev, u8 dma_mask);
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c
index 6cdaa117fc40..2894d8975933 100644
--- a/drivers/misc/habanalabs/hw_queue.c
+++ b/drivers/misc/habanalabs/hw_queue.c
@@ -82,7 +82,7 @@ static void ext_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
bd += hl_pi_2_offset(q->pi);
bd->ctl = __cpu_to_le32(ctl);
bd->len = __cpu_to_le32(len);
- bd->ptr = __cpu_to_le64(ptr + hdev->asic_prop.host_phys_base_address);
+ bd->ptr = __cpu_to_le64(ptr);

q->pi = hl_queue_inc_ptr(q->pi);
hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
@@ -263,9 +263,7 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
* checked in hl_queue_sanity_checks
*/
cq = &hdev->completion_queue[q->hw_queue_id];
- cq_addr = cq->bus_address +
- hdev->asic_prop.host_phys_base_address;
- cq_addr += cq->pi * sizeof(struct hl_cq_entry);
+ cq_addr = cq->bus_address + cq->pi * sizeof(struct hl_cq_entry);

hdev->asic_funcs->add_end_of_cb_packets(cb->kernel_address, len,
cq_addr,
diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
index 43ef3ad8438a..d67d24c13efd 100644
--- a/drivers/misc/habanalabs/memory.c
+++ b/drivers/misc/habanalabs/memory.c
@@ -759,10 +759,6 @@ static int map_phys_page_pack(struct hl_ctx *ctx, u64 vaddr,
for (i = 0 ; i < phys_pg_pack->npages ; i++) {
paddr = phys_pg_pack->pages[i];

- /* For accessing the host we need to turn on bit 39 */
- if (phys_pg_pack->created_from_userptr)
- paddr += hdev->asic_prop.host_phys_base_address;
-
rc = hl_mmu_map(ctx, next_vaddr, paddr, page_size);
if (rc) {
dev_err(hdev->dev,
diff --git a/drivers/misc/habanalabs/pci.c b/drivers/misc/habanalabs/pci.c
index 5278f086d65d..0e78a04d63f4 100644
--- a/drivers/misc/habanalabs/pci.c
+++ b/drivers/misc/habanalabs/pci.c
@@ -236,6 +236,8 @@ int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar,
* @hdev: Pointer to hl_device structure.
* @sram_base_address: SRAM base address.
* @dram_base_address: DRAM base address.
+ * @host_phys_base_address: Base physical address of host memory for device
+ * transactions.
* @host_phys_size: Size of host memory for device transactions.
*
* This is needed in case the firmware doesn't initialize the iATU.
@@ -243,7 +245,8 @@ int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar,
* Return: 0 on success, negative value for failure.
*/
int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address,
- u64 dram_base_address, u64 host_phys_size)
+ u64 dram_base_address, u64 host_phys_base_address,
+ u64 host_phys_size)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
u64 host_phys_end_addr;
@@ -265,11 +268,11 @@ int hl_pci_init_iatu(struct hl_device *hdev, u64 sram_base_address,


/* Outbound Region 0 - Point to Host */
- host_phys_end_addr = prop->host_phys_base_address + host_phys_size - 1;
+ host_phys_end_addr = host_phys_base_address + host_phys_size - 1;
rc |= hl_pci_iatu_write(hdev, 0x008,
- lower_32_bits(prop->host_phys_base_address));
+ lower_32_bits(host_phys_base_address));
rc |= hl_pci_iatu_write(hdev, 0x00C,
- upper_32_bits(prop->host_phys_base_address));
+ upper_32_bits(host_phys_base_address));
rc |= hl_pci_iatu_write(hdev, 0x010, lower_32_bits(host_phys_end_addr));
rc |= hl_pci_iatu_write(hdev, 0x014, 0);
rc |= hl_pci_iatu_write(hdev, 0x018, 0);
--
2.17.1