[PATCH 2/7] habanalabs: Use pending CS amount per ASIC

From: Oded Gabbay
Date: Tue Jun 16 2020 - 02:13:46 EST


From: Ofir Bitton <obitton@xxxxxxxxx>

Training schemes requires much more concurrent command submissions than
inference does. In addition, training command submissions can be completed
in a non serialized manner. Hence, we add support in which each ASIC will
be able to configure the amount of concurrent pending command submissions,
rather than use a predefined amount. This change will enhance performance
by allowing the user to add more concurrent work without waiting for the
previous work to be completed.

Signed-off-by: Ofir Bitton <obitton@xxxxxxxxx>
Reviewed-by: Oded Gabbay <oded.gabbay@xxxxxxxxx>
Signed-off-by: Oded Gabbay <oded.gabbay@xxxxxxxxx>
---
drivers/misc/habanalabs/command_submission.c | 6 ++++--
drivers/misc/habanalabs/context.c | 14 +++++++++++---
drivers/misc/habanalabs/gaudi/gaudi.c | 2 ++
drivers/misc/habanalabs/gaudi/gaudiP.h | 6 ++++++
drivers/misc/habanalabs/goya/goya.c | 2 ++
drivers/misc/habanalabs/goya/goyaP.h | 6 ++++++
drivers/misc/habanalabs/habanalabs.h | 9 +++++----
drivers/misc/habanalabs/hw_queue.c | 2 +-
8 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index f82974a916c3..e156803f4a99 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -405,7 +405,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
spin_lock(&ctx->cs_lock);

cs_cmpl->cs_seq = ctx->cs_sequence;
- other = ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)];
+ other = ctx->cs_pending[cs_cmpl->cs_seq &
+ (hdev->asic_prop.max_pending_cs - 1)];
if ((other) && (!dma_fence_is_signaled(other))) {
spin_unlock(&ctx->cs_lock);
dev_dbg(hdev->dev,
@@ -419,7 +420,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,

cs->sequence = cs_cmpl->cs_seq;

- ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)] =
+ ctx->cs_pending[cs_cmpl->cs_seq &
+ (hdev->asic_prop.max_pending_cs - 1)] =
&cs_cmpl->base_fence;
ctx->cs_sequence++;

diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/context.c
index ec92b3506b1f..1b96fefa4a65 100644
--- a/drivers/misc/habanalabs/context.c
+++ b/drivers/misc/habanalabs/context.c
@@ -22,9 +22,11 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
* to this function unless the ref count is 0
*/

- for (i = 0 ; i < HL_MAX_PENDING_CS ; i++)
+ for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++)
dma_fence_put(ctx->cs_pending[i]);

+ kfree(ctx->cs_pending);
+
if (ctx->asid != HL_KERNEL_ASID_ID) {
/* The engines are stopped as there is no executing CS, but the
* Coresight might be still working by accessing addresses
@@ -126,6 +128,11 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
spin_lock_init(&ctx->cs_lock);
atomic_set(&ctx->thread_ctx_switch_token, 1);
ctx->thread_ctx_switch_wait_token = 0;
+ ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs,
+ sizeof(struct dma_fence *),
+ GFP_KERNEL);
+ if (!ctx->cs_pending)
+ return -ENOMEM;

if (is_kernel_ctx) {
ctx->asid = HL_KERNEL_ASID_ID; /* Kernel driver gets ASID 0 */
@@ -170,6 +177,7 @@ int hl_ctx_put(struct hl_ctx *ctx)

struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
{
+ struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
struct dma_fence *fence;

spin_lock(&ctx->cs_lock);
@@ -179,13 +187,13 @@ struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
return ERR_PTR(-EINVAL);
}

- if (seq + HL_MAX_PENDING_CS < ctx->cs_sequence) {
+ if (seq + asic_prop->max_pending_cs < ctx->cs_sequence) {
spin_unlock(&ctx->cs_lock);
return NULL;
}

fence = dma_fence_get(
- ctx->cs_pending[seq & (HL_MAX_PENDING_CS - 1)]);
+ ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]);
spin_unlock(&ctx->cs_lock);

return fence;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 4d69727bb53b..35e9080f6976 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -429,6 +429,8 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
strncpy(prop->armcp_info.card_name, GAUDI_DEFAULT_CARD_NAME,
CARD_NAME_MAX_LEN);

+ prop->max_pending_cs = GAUDI_MAX_PENDING_CS;
+
return 0;
}

diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index a46530d375fa..76c3f840e05a 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -57,6 +57,12 @@

#define GAUDI_DEFAULT_CARD_NAME "HL2000"

+#define GAUDI_MAX_PENDING_CS 1024
+
+#if !IS_MAX_PENDING_CS_VALID(GAUDI_MAX_PENDING_CS)
+#error "GAUDI_MAX_PENDING_CS must be power of 2 and greater than 1"
+#endif
+
#define PCI_DMA_NUMBER_OF_CHNLS 3
#define HBM_DMA_NUMBER_OF_CHNLS 5
#define DMA_NUMBER_OF_CHNLS (PCI_DMA_NUMBER_OF_CHNLS + \
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 0d2952bb58df..e872099a3f7a 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -426,6 +426,8 @@ void goya_get_fixed_properties(struct hl_device *hdev)

strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME,
CARD_NAME_MAX_LEN);
+
+ prop->max_pending_cs = GOYA_MAX_PENDING_CS;
}

/*
diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h
index d36f8d90c9c9..9d8a1761252d 100644
--- a/drivers/misc/habanalabs/goya/goyaP.h
+++ b/drivers/misc/habanalabs/goya/goyaP.h
@@ -57,6 +57,12 @@

#define GOYA_DEFAULT_CARD_NAME "HL1000"

+#define GOYA_MAX_PENDING_CS 64
+
+#if !IS_MAX_PENDING_CS_VALID(GOYA_MAX_PENDING_CS)
+#error "GOYA_MAX_PENDING_CS must be power of 2 and greater than 1"
+#endif
+
/* DRAM Memory Map */

#define CPU_FW_IMAGE_SIZE 0x10000000 /* 256MB */
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 1ecdcf8b763a..64d9b2dd3e19 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -42,9 +42,6 @@

#define HL_MAX_QUEUES 128

-/* MUST BE POWER OF 2 and larger than 1 */
-#define HL_MAX_PENDING_CS 64
-
#define HL_IDLE_BUSY_TS_ARR_SIZE 4096

/* Memory */
@@ -61,6 +58,9 @@

#define HL_MAX_SOB_VAL (1 << 15)

+#define IS_POWER_OF_2(n) (n != 0 && ((n & (n - 1)) == 0))
+#define IS_MAX_PENDING_CS_VALID(n) (IS_POWER_OF_2(n) && (n > 1))
+
/**
* struct pgt_info - MMU hop page info.
* @node: hash linked-list node for the pgts shadow hash of pgts.
@@ -285,6 +285,7 @@ struct asic_fixed_properties {
u32 high_pll;
u32 cb_pool_cb_cnt;
u32 cb_pool_cb_size;
+ u32 max_pending_cs;
u8 tpc_enabled_mask;
u8 completion_queues_count;
};
@@ -782,7 +783,7 @@ struct hl_ctx {
struct hl_fpriv *hpriv;
struct hl_device *hdev;
struct kref refcount;
- struct dma_fence *cs_pending[HL_MAX_PENDING_CS];
+ struct dma_fence **cs_pending;
struct hl_va_range *host_va_range;
struct hl_va_range *host_huge_va_range;
struct hl_va_range *dram_va_range;
diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c
index f4434b39ef1b..29b96d24edc2 100644
--- a/drivers/misc/habanalabs/hw_queue.c
+++ b/drivers/misc/habanalabs/hw_queue.c
@@ -376,7 +376,7 @@ static void hw_queue_schedule_job(struct hl_cs_job *job)
* write address offset in the SM block (QMAN LBW message).
* The write address offset is calculated as "COMP_OFFSET << 2".
*/
- offset = job->cs->sequence & (HL_MAX_PENDING_CS - 1);
+ offset = job->cs->sequence & (hdev->asic_prop.max_pending_cs - 1);
ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) |
((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK);

--
2.17.1