[PATCH 04/10] gpu: host1x: Lock classes during job submission
From: Mikko Perttunen
Date: Sun Nov 05 2017 - 06:03:08 EST
Host1x has a feature called MLOCKs which allow a certain class
(~HW unit) to be locked (in the mutex sense) and unlocked during
command execution, preventing other channels from accessing the
class while it is locked. This is necessary to prevent concurrent
jobs from messing up class state.
This has not been necessary so far since due to our channel allocation
model, there has only been a single hardware channel submitting
commands to each class. Future patches, however, change the channel
allocation model to allow hardware-scheduled concurrency, and as such
we need to start locking.
This patch implements locking on all platforms from Tegra20 to
Tegra186.
Signed-off-by: Mikko Perttunen <mperttunen@xxxxxxxxxx>
---
drivers/gpu/host1x/cdma.c | 1 +
drivers/gpu/host1x/cdma.h | 1 +
drivers/gpu/host1x/hw/cdma_hw.c | 122 +++++++++++++++++++++++++
drivers/gpu/host1x/hw/channel_hw.c | 71 ++++++++++----
drivers/gpu/host1x/hw/host1x01_hardware.h | 10 ++
drivers/gpu/host1x/hw/host1x02_hardware.h | 10 ++
drivers/gpu/host1x/hw/host1x04_hardware.h | 10 ++
drivers/gpu/host1x/hw/host1x05_hardware.h | 10 ++
drivers/gpu/host1x/hw/host1x06_hardware.h | 10 ++
drivers/gpu/host1x/hw/hw_host1x01_sync.h | 6 ++
drivers/gpu/host1x/hw/hw_host1x02_sync.h | 6 ++
drivers/gpu/host1x/hw/hw_host1x04_sync.h | 6 ++
drivers/gpu/host1x/hw/hw_host1x05_sync.h | 6 ++
drivers/gpu/host1x/hw/hw_host1x06_hypervisor.h | 5 +
14 files changed, 257 insertions(+), 17 deletions(-)
diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
index 28541b280739..f787cfe69c11 100644
--- a/drivers/gpu/host1x/cdma.c
+++ b/drivers/gpu/host1x/cdma.c
@@ -232,6 +232,7 @@ static void cdma_start_timer_locked(struct host1x_cdma *cdma,
}
cdma->timeout.client = job->client;
+ cdma->timeout.class = job->class;
cdma->timeout.syncpt = host1x_syncpt_get(host, job->syncpt_id);
cdma->timeout.syncpt_val = job->syncpt_end;
cdma->timeout.start_ktime = ktime_get();
diff --git a/drivers/gpu/host1x/cdma.h b/drivers/gpu/host1x/cdma.h
index 286d49386be9..e72660fc83c9 100644
--- a/drivers/gpu/host1x/cdma.h
+++ b/drivers/gpu/host1x/cdma.h
@@ -59,6 +59,7 @@ struct buffer_timeout {
ktime_t start_ktime; /* starting time */
/* context timeout information */
int client;
+ u32 class;
};
enum cdma_event {
diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
index ce320534cbed..4d5970d863d5 100644
--- a/drivers/gpu/host1x/hw/cdma_hw.c
+++ b/drivers/gpu/host1x/hw/cdma_hw.c
@@ -16,6 +16,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include <linux/iopoll.h>
#include <linux/slab.h>
#include <linux/scatterlist.h>
#include <linux/dma-mapping.h>
@@ -243,6 +244,125 @@ static void cdma_resume(struct host1x_cdma *cdma, u32 getptr)
cdma_timeout_restart(cdma, getptr);
}
+static int mlock_id_for_class(unsigned int class)
+{
+#if HOST1X_HW >= 6
+ switch (class)
+ {
+ case HOST1X_CLASS_HOST1X:
+ return 0;
+ case HOST1X_CLASS_VIC:
+ return 17;
+ default:
+ return -EINVAL;
+ }
+#else
+ switch (class)
+ {
+ case HOST1X_CLASS_HOST1X:
+ return 0;
+ case HOST1X_CLASS_GR2D:
+ return 1;
+ case HOST1X_CLASS_GR2D_SB:
+ return 2;
+ case HOST1X_CLASS_VIC:
+ return 3;
+ case HOST1X_CLASS_GR3D:
+ return 4;
+ default:
+ return -EINVAL;
+ }
+#endif
+}
+
+static void timeout_release_mlock(struct host1x_cdma *cdma)
+{
+#if HOST1X_HW >= 6
+ struct host1x_channel *ch = cdma_to_channel(cdma);
+ struct host1x *host = cdma_to_host1x(cdma);
+ u32 pb_pos, pb_temp[3], val;
+ int err, mlock_id;
+
+ if (!host->hv_regs)
+ return;
+
+ mlock_id = mlock_id_for_class(cdma->timeout.class);
+ if (WARN(mlock_id < 0, "Invalid class ID"))
+ return;
+
+ val = host1x_hypervisor_readl(host, HOST1X_HV_MLOCK(mlock_id));
+ if (!HOST1X_HV_MLOCK_LOCKED_V(val) ||
+ HOST1X_HV_MLOCK_CH_V(val) != ch->id)
+ {
+ /* Channel is not holding the MLOCK, nothing to release. */
+ return;
+ }
+
+ /*
+ * On Tegra186, there is no register to unlock an MLOCK (don't ask me
+ * why). As such, we have to execute a release_mlock instruction to
+ * do it. We do this by backing up the first three opcodes of the
+ * pushbuffer and replacing them with our own short sequence to do
+ * the unlocking. We set the .pos field to 12, which causes DMAEND
+ * to be set accordingly such that only the three opcodes we set
+ * here are executed before CDMA stops. Finally we restore the value
+ * of pos and pushbuffer contents.
+ */
+
+ pb_pos = cdma->push_buffer.pos;
+ memcpy(pb_temp, cdma->push_buffer.mapped, ARRAY_SIZE(pb_temp) * 4);
+
+ {
+ u32 *pb = cdma->push_buffer.mapped;
+ pb[0] = host1x_opcode_acquire_mlock(cdma->timeout.class);
+ pb[1] = host1x_opcode_setclass(cdma->timeout.class, 0, 0);
+ pb[2] = host1x_opcode_release_mlock(cdma->timeout.class);
+ }
+
+ /* Flush writecombine buffer */
+ wmb();
+
+ cdma->push_buffer.pos = ARRAY_SIZE(pb_temp) * 4;
+
+ cdma_resume(cdma, 0);
+
+ /* Wait until the release_mlock opcode has been executed */
+ err = readl_relaxed_poll_timeout(
+ host->hv_regs + HOST1X_HV_MLOCK(mlock_id), val,
+ !HOST1X_HV_MLOCK_LOCKED_V(val) ||
+ HOST1X_HV_MLOCK_CH_V(val) != ch->id,
+ 10, 10000);
+ WARN(err, "Failed to unlock mlock %u\n", mlock_id);
+
+ cdma_freeze(cdma);
+
+ /* Restore original pushbuffer state */
+ cdma->push_buffer.pos = pb_pos;
+ memcpy(cdma->push_buffer.mapped, pb_temp, ARRAY_SIZE(pb_temp) * 4);
+ wmb();
+#else
+ struct host1x_channel *ch = cdma_to_channel(cdma);
+ struct host1x *host = cdma_to_host1x(cdma);
+ int mlock_id;
+ u32 val;
+
+ mlock_id = mlock_id_for_class(cdma->timeout.class);
+ if (WARN(mlock_id < 0, "Invalid class ID"))
+ return;
+
+ val = host1x_sync_readl(host, HOST1X_SYNC_MLOCK_OWNER(mlock_id));
+ if (!HOST1X_SYNC_MLOCK_OWNER_CH_OWNS_V(val) ||
+ HOST1X_SYNC_MLOCK_OWNER_CHID_V(val) != ch->id)
+ {
+ /* Channel is not holding the MLOCK, nothing to release. */
+ return;
+ }
+
+ /* Unlock MLOCK */
+ host1x_sync_writel(host, 0x0, HOST1X_SYNC_MLOCK(mlock_id));
+#endif
+}
+
/*
* If this timeout fires, it indicates the current sync_queue entry has
* exceeded its TTL and the userctx should be timed out and remaining
@@ -293,6 +413,8 @@ static void cdma_timeout_handler(struct work_struct *work)
/* stop HW, resetting channel/module */
host1x_hw_cdma_freeze(host1x, cdma);
+ timeout_release_mlock(cdma);
+
host1x_cdma_update_sync_queue(cdma, ch->dev);
mutex_unlock(&cdma->lock);
}
diff --git a/drivers/gpu/host1x/hw/channel_hw.c b/drivers/gpu/host1x/hw/channel_hw.c
index 246b78c41281..f80fb8be38e6 100644
--- a/drivers/gpu/host1x/hw/channel_hw.c
+++ b/drivers/gpu/host1x/hw/channel_hw.c
@@ -89,6 +89,34 @@ static inline void synchronize_syncpt_base(struct host1x_job *job)
HOST1X_UCLASS_LOAD_SYNCPT_BASE_VALUE_F(value));
}
+static void channel_submit_serialize(struct host1x_channel *ch,
+ struct host1x_syncpt *sp)
+{
+#if HOST1X_HW >= 6
+ /*
+ * On T186, due to a hw issue, we need to issue an mlock acquire
+ * for HOST1X class. It is still interpreted as a no-op in
+ * hardware.
+ */
+ host1x_cdma_push(&ch->cdma,
+ host1x_opcode_acquire_mlock(HOST1X_CLASS_HOST1X),
+ host1x_opcode_setclass(HOST1X_CLASS_HOST1X, 0, 0));
+ host1x_cdma_push(&ch->cdma,
+ host1x_opcode_nonincr(host1x_uclass_wait_syncpt_r(), 1),
+ host1x_class_host_wait_syncpt(host1x_syncpt_id(sp),
+ host1x_syncpt_read_max(sp)));
+ host1x_cdma_push(&ch->cdma,
+ host1x_opcode_release_mlock(HOST1X_CLASS_HOST1X),
+ HOST1X_OPCODE_NOP);
+#else
+ host1x_cdma_push(&ch->cdma,
+ host1x_opcode_setclass(HOST1X_CLASS_HOST1X,
+ host1x_uclass_wait_syncpt_r(), 1),
+ host1x_class_host_wait_syncpt(host1x_syncpt_id(sp),
+ host1x_syncpt_read_max(sp)));
+#endif
+}
+
static int channel_submit(struct host1x_job *job)
{
struct host1x_channel *ch = job->channel;
@@ -96,6 +124,7 @@ static int channel_submit(struct host1x_job *job)
u32 user_syncpt_incrs = job->syncpt_incrs;
u32 prev_max = 0;
u32 syncval;
+ u32 mlock;
int err;
struct host1x_waitlist *completed_waiter = NULL;
struct host1x *host = dev_get_drvdata(ch->dev->parent);
@@ -128,17 +157,12 @@ static int channel_submit(struct host1x_job *job)
goto error;
}
- if (job->serialize) {
- /*
- * Force serialization by inserting a host wait for the
- * previous job to finish before this one can commence.
- */
- host1x_cdma_push(&ch->cdma,
- host1x_opcode_setclass(HOST1X_CLASS_HOST1X,
- host1x_uclass_wait_syncpt_r(), 1),
- host1x_class_host_wait_syncpt(job->syncpt_id,
- host1x_syncpt_read_max(sp)));
- }
+ /*
+ * Force serialization by inserting a host wait for the
+ * previous job to finish before this one can commence.
+ */
+ if (job->serialize)
+ channel_submit_serialize(ch, sp);
/* Synchronize base register to allow using it for relative waiting */
if (sp->base)
@@ -149,16 +173,29 @@ static int channel_submit(struct host1x_job *job)
/* assign syncpoint to channel */
host1x_hw_syncpt_assign_channel(host, sp, ch);
- job->syncpt_end = syncval;
-
- /* add a setclass for modules that require it */
- if (job->class)
+ /* acquire MLOCK and set channel class to specified class */
+ mlock = HOST1X_HW >= 6 ? job->class : mlock_id_for_class(job->class);
+ if (job->class) {
host1x_cdma_push(&ch->cdma,
- host1x_opcode_setclass(job->class, 0, 0),
- HOST1X_OPCODE_NOP);
+ host1x_opcode_acquire_mlock(mlock),
+ host1x_opcode_setclass(job->class, 0, 0));
+ }
submit_gathers(job);
+ if (job->class) {
+ /*
+ * Push additional increment to catch jobs that crash before
+ * finishing their gather, not reaching the unlock opcode.
+ */
+ syncval = host1x_syncpt_incr_max(sp, 1);
+ host1x_cdma_push(&ch->cdma,
+ host1x_opcode_imm_incr_syncpt(0, job->syncpt_id),
+ host1x_opcode_release_mlock(mlock));
+ }
+
+ job->syncpt_end = syncval;
+
/* end CDMA submit & stash pinned hMems into sync queue */
host1x_cdma_end(&ch->cdma, job);
diff --git a/drivers/gpu/host1x/hw/host1x01_hardware.h b/drivers/gpu/host1x/hw/host1x01_hardware.h
index 5f0fb866efa8..6a2c9f905acc 100644
--- a/drivers/gpu/host1x/hw/host1x01_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x01_hardware.h
@@ -138,6 +138,16 @@ static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
}
+static inline u32 host1x_opcode_acquire_mlock(unsigned id)
+{
+ return (14 << 28) | id;
+}
+
+static inline u32 host1x_opcode_release_mlock(unsigned id)
+{
+ return (14 << 28) | (1 << 24) | id;
+}
+
#define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
#endif
diff --git a/drivers/gpu/host1x/hw/host1x02_hardware.h b/drivers/gpu/host1x/hw/host1x02_hardware.h
index 154901860bc6..c524c6c8d82f 100644
--- a/drivers/gpu/host1x/hw/host1x02_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x02_hardware.h
@@ -137,6 +137,16 @@ static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
}
+static inline u32 host1x_opcode_acquire_mlock(unsigned id)
+{
+ return (14 << 28) | id;
+}
+
+static inline u32 host1x_opcode_release_mlock(unsigned id)
+{
+ return (14 << 28) | (1 << 24) | id;
+}
+
#define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
#endif
diff --git a/drivers/gpu/host1x/hw/host1x04_hardware.h b/drivers/gpu/host1x/hw/host1x04_hardware.h
index de1a38175328..8dedd77b7a1b 100644
--- a/drivers/gpu/host1x/hw/host1x04_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x04_hardware.h
@@ -137,6 +137,16 @@ static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
}
+static inline u32 host1x_opcode_acquire_mlock(unsigned id)
+{
+ return (14 << 28) | id;
+}
+
+static inline u32 host1x_opcode_release_mlock(unsigned id)
+{
+ return (14 << 28) | (1 << 24) | id;
+}
+
#define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
#endif
diff --git a/drivers/gpu/host1x/hw/host1x05_hardware.h b/drivers/gpu/host1x/hw/host1x05_hardware.h
index 2937ebb6be11..6aafcb5e97e6 100644
--- a/drivers/gpu/host1x/hw/host1x05_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x05_hardware.h
@@ -137,6 +137,16 @@ static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
}
+static inline u32 host1x_opcode_acquire_mlock(unsigned id)
+{
+ return (14 << 28) | id;
+}
+
+static inline u32 host1x_opcode_release_mlock(unsigned id)
+{
+ return (14 << 28) | (1 << 24) | id;
+}
+
#define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
#endif
diff --git a/drivers/gpu/host1x/hw/host1x06_hardware.h b/drivers/gpu/host1x/hw/host1x06_hardware.h
index 3039c92ea605..60147d26ad9b 100644
--- a/drivers/gpu/host1x/hw/host1x06_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x06_hardware.h
@@ -137,6 +137,16 @@ static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
}
+static inline u32 host1x_opcode_acquire_mlock(unsigned id)
+{
+ return (14 << 28) | id;
+}
+
+static inline u32 host1x_opcode_release_mlock(unsigned id)
+{
+ return (14 << 28) | (1 << 24) | id;
+}
+
#define HOST1X_OPCODE_NOP host1x_opcode_nonincr(0, 0)
#endif
diff --git a/drivers/gpu/host1x/hw/hw_host1x01_sync.h b/drivers/gpu/host1x/hw/hw_host1x01_sync.h
index 31238c285d46..4630fec2237a 100644
--- a/drivers/gpu/host1x/hw/hw_host1x01_sync.h
+++ b/drivers/gpu/host1x/hw/hw_host1x01_sync.h
@@ -125,6 +125,12 @@ static inline u32 host1x_sync_ip_busy_timeout_r(void)
}
#define HOST1X_SYNC_IP_BUSY_TIMEOUT \
host1x_sync_ip_busy_timeout_r()
+static inline u32 host1x_sync_mlock_r(unsigned int id)
+{
+ return 0x2c0 + id * REGISTER_STRIDE;
+}
+#define HOST1X_SYNC_MLOCK(id) \
+ host1x_sync_mlock_r(id)
static inline u32 host1x_sync_mlock_owner_r(unsigned int id)
{
return 0x340 + id * REGISTER_STRIDE;
diff --git a/drivers/gpu/host1x/hw/hw_host1x02_sync.h b/drivers/gpu/host1x/hw/hw_host1x02_sync.h
index 540c7b65995f..dec8e812a114 100644
--- a/drivers/gpu/host1x/hw/hw_host1x02_sync.h
+++ b/drivers/gpu/host1x/hw/hw_host1x02_sync.h
@@ -125,6 +125,12 @@ static inline u32 host1x_sync_ip_busy_timeout_r(void)
}
#define HOST1X_SYNC_IP_BUSY_TIMEOUT \
host1x_sync_ip_busy_timeout_r()
+static inline u32 host1x_sync_mlock_r(unsigned int id)
+{
+ return 0x2c0 + id * REGISTER_STRIDE;
+}
+#define HOST1X_SYNC_MLOCK(id) \
+ host1x_sync_mlock_r(id)
static inline u32 host1x_sync_mlock_owner_r(unsigned int id)
{
return 0x340 + id * REGISTER_STRIDE;
diff --git a/drivers/gpu/host1x/hw/hw_host1x04_sync.h b/drivers/gpu/host1x/hw/hw_host1x04_sync.h
index 3d6c8ec65934..9a951a4db07f 100644
--- a/drivers/gpu/host1x/hw/hw_host1x04_sync.h
+++ b/drivers/gpu/host1x/hw/hw_host1x04_sync.h
@@ -125,6 +125,12 @@ static inline u32 host1x_sync_ip_busy_timeout_r(void)
}
#define HOST1X_SYNC_IP_BUSY_TIMEOUT \
host1x_sync_ip_busy_timeout_r()
+static inline u32 host1x_sync_mlock_r(unsigned int id)
+{
+ return 0x2c0 + id * REGISTER_STRIDE;
+}
+#define HOST1X_SYNC_MLOCK(id) \
+ host1x_sync_mlock_r(id)
static inline u32 host1x_sync_mlock_owner_r(unsigned int id)
{
return 0x340 + id * REGISTER_STRIDE;
diff --git a/drivers/gpu/host1x/hw/hw_host1x05_sync.h b/drivers/gpu/host1x/hw/hw_host1x05_sync.h
index ca10eee5045c..e29e2e19a60b 100644
--- a/drivers/gpu/host1x/hw/hw_host1x05_sync.h
+++ b/drivers/gpu/host1x/hw/hw_host1x05_sync.h
@@ -125,6 +125,12 @@ static inline u32 host1x_sync_ip_busy_timeout_r(void)
}
#define HOST1X_SYNC_IP_BUSY_TIMEOUT \
host1x_sync_ip_busy_timeout_r()
+static inline u32 host1x_sync_mlock_r(unsigned int id)
+{
+ return 0x2c0 + id * REGISTER_STRIDE;
+}
+#define HOST1X_SYNC_MLOCK(id) \
+ host1x_sync_mlock_r(id)
static inline u32 host1x_sync_mlock_owner_r(unsigned int id)
{
return 0x340 + id * REGISTER_STRIDE;
diff --git a/drivers/gpu/host1x/hw/hw_host1x06_hypervisor.h b/drivers/gpu/host1x/hw/hw_host1x06_hypervisor.h
index c05dab8a178b..be73530c3585 100644
--- a/drivers/gpu/host1x/hw/hw_host1x06_hypervisor.h
+++ b/drivers/gpu/host1x/hw/hw_host1x06_hypervisor.h
@@ -18,6 +18,11 @@
#define HOST1X_HV_SYNCPT_PROT_EN 0x1ac4
#define HOST1X_HV_SYNCPT_PROT_EN_CH_EN BIT(1)
#define HOST1X_HV_CH_KERNEL_FILTER_GBUFFER(x) (0x2020 + (x * 4))
+#define HOST1X_HV_MLOCK(x) (0x2030 + (x * 4))
+#define HOST1X_HV_MLOCK_CH(x) (((x) & 0x3f) << 2)
+#define HOST1X_HV_MLOCK_CH_V(x) (((x) >> 2) & 0x3f)
+#define HOST1X_HV_MLOCK_LOCKED BIT(0)
+#define HOST1X_HV_MLOCK_LOCKED_V(x) ((x) & 0x1)
#define HOST1X_HV_CMDFIFO_PEEK_CTRL 0x233c
#define HOST1X_HV_CMDFIFO_PEEK_CTRL_ADDR(x) (x)
#define HOST1X_HV_CMDFIFO_PEEK_CTRL_CHANNEL(x) ((x) << 16)
--
2.14.2