[PATCH v5 2/3] drm/vc4: Add a load tracker to prevent HVS underflow errors

From: Paul Kocialkowski
Date: Wed Feb 20 2019 - 10:52:14 EST


From: Boris Brezillon <boris.brezillon@xxxxxxxxxxx>

The HVS block is supposed to fill the pixelvalve FIFOs fast enough to
meet the requested framerate. The problem is, the HVS and memory bus
bandwidths are limited, and if we don't take these limitations into
account we might end up with HVS underflow errors.

This patch is trying to model the per-plane HVS and memory bus bandwidth
consumption and take a decision at atomic_check() time whether the
estimated load will fit in the HVS and membus budget.

Note that we take an extra margin on the memory bus consumption to let
the system run smoothly when other blocks are doing heavy use of the
memory bus. Same goes for the HVS limit, except the margin is smaller in
this case, since the HVS is not used by external components.

Signed-off-by: Boris Brezillon <boris.brezillon@xxxxxxxxxxx>
Signed-off-by: Paul Kocialkowski <paul.kocialkowski@xxxxxxxxxxx>
---
drivers/gpu/drm/vc4/vc4_drv.c | 1 +
drivers/gpu/drm/vc4/vc4_drv.h | 11 ++++
drivers/gpu/drm/vc4/vc4_kms.c | 103 +++++++++++++++++++++++++++++++-
drivers/gpu/drm/vc4/vc4_plane.c | 57 ++++++++++++++++++
4 files changed, 171 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index 52576dee809e..3227706700f9 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -311,6 +311,7 @@ static void vc4_drm_unbind(struct device *dev)

drm_mode_config_cleanup(drm);

+ drm_atomic_private_obj_fini(&vc4->load_tracker);
drm_atomic_private_obj_fini(&vc4->ctm_manager);

drm_dev_put(drm);
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 6f86af1b3212..bbab084911f3 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -207,6 +207,7 @@ struct vc4_dev {

struct drm_modeset_lock ctm_state_lock;
struct drm_private_obj ctm_manager;
+ struct drm_private_obj load_tracker;
};

static inline struct vc4_dev *
@@ -378,6 +379,16 @@ struct vc4_plane_state {
* when async update is not possible.
*/
bool dlist_initialized;
+
+ /* Load of this plane on the HVS block. The load is expressed in HVS
+ * cycles/sec.
+ */
+ u64 hvs_load;
+
+ /* Memory bandwidth needed for this plane. This is expressed in
+ * bytes/sec.
+ */
+ u64 membus_load;
};

static inline struct vc4_plane_state *
diff --git a/drivers/gpu/drm/vc4/vc4_kms.c b/drivers/gpu/drm/vc4/vc4_kms.c
index b971258dcfd4..6e20604573de 100644
--- a/drivers/gpu/drm/vc4/vc4_kms.c
+++ b/drivers/gpu/drm/vc4/vc4_kms.c
@@ -34,6 +34,18 @@ static struct vc4_ctm_state *to_vc4_ctm_state(struct drm_private_state *priv)
return container_of(priv, struct vc4_ctm_state, base);
}

+struct vc4_load_tracker_state {
+ struct drm_private_state base;
+ u64 hvs_load;
+ u64 membus_load;
+};
+
+static struct vc4_load_tracker_state *
+to_vc4_load_tracker_state(struct drm_private_state *priv)
+{
+ return container_of(priv, struct vc4_load_tracker_state, base);
+}
+
static struct vc4_ctm_state *vc4_get_ctm_state(struct drm_atomic_state *state,
struct drm_private_obj *manager)
{
@@ -395,6 +407,81 @@ vc4_ctm_atomic_check(struct drm_device *dev, struct drm_atomic_state *state)
return 0;
}

+static int vc4_load_tracker_atomic_check(struct drm_atomic_state *state)
+{
+ struct drm_plane_state *old_plane_state, *new_plane_state;
+ struct vc4_dev *vc4 = to_vc4_dev(state->dev);
+ struct vc4_load_tracker_state *load_state;
+ struct drm_private_state *priv_state;
+ struct drm_plane *plane;
+ int i;
+
+ priv_state = drm_atomic_get_private_obj_state(state,
+ &vc4->load_tracker);
+ if (IS_ERR(priv_state))
+ return PTR_ERR(priv_state);
+
+ load_state = to_vc4_load_tracker_state(priv_state);
+ for_each_oldnew_plane_in_state(state, plane, old_plane_state,
+ new_plane_state, i) {
+ struct vc4_plane_state *vc4_plane_state;
+
+ if (old_plane_state->fb && old_plane_state->crtc) {
+ vc4_plane_state = to_vc4_plane_state(old_plane_state);
+ load_state->membus_load -= vc4_plane_state->membus_load;
+ load_state->hvs_load -= vc4_plane_state->hvs_load;
+ }
+
+ if (new_plane_state->fb && new_plane_state->crtc) {
+ vc4_plane_state = to_vc4_plane_state(new_plane_state);
+ load_state->membus_load += vc4_plane_state->membus_load;
+ load_state->hvs_load += vc4_plane_state->hvs_load;
+ }
+ }
+
+ /* The absolute limit is 2Gbyte/sec, but let's take a margin to let
+ * the system work when other blocks are accessing the memory.
+ */
+ if (load_state->membus_load > SZ_1G + SZ_512M)
+ return -ENOSPC;
+
+ /* HVS clock is supposed to run @ 250Mhz, let's take a margin and
+ * consider the maximum number of cycles is 240M.
+ */
+ if (load_state->hvs_load > 240000000ULL)
+ return -ENOSPC;
+
+ return 0;
+}
+
+static struct drm_private_state *
+vc4_load_tracker_duplicate_state(struct drm_private_obj *obj)
+{
+ struct vc4_load_tracker_state *state;
+
+ state = kmemdup(obj->state, sizeof(*state), GFP_KERNEL);
+ if (!state)
+ return NULL;
+
+ __drm_atomic_helper_private_obj_duplicate_state(obj, &state->base);
+
+ return &state->base;
+}
+
+static void vc4_load_tracker_destroy_state(struct drm_private_obj *obj,
+ struct drm_private_state *state)
+{
+ struct vc4_load_tracker_state *load_state;
+
+ load_state = to_vc4_load_tracker_state(state);
+ kfree(load_state);
+}
+
+static const struct drm_private_state_funcs vc4_load_tracker_state_funcs = {
+ .atomic_duplicate_state = vc4_load_tracker_duplicate_state,
+ .atomic_destroy_state = vc4_load_tracker_destroy_state,
+};
+
static int
vc4_atomic_check(struct drm_device *dev, struct drm_atomic_state *state)
{
@@ -404,7 +491,11 @@ vc4_atomic_check(struct drm_device *dev, struct drm_atomic_state *state)
if (ret < 0)
return ret;

- return drm_atomic_helper_check(dev, state);
+ ret = drm_atomic_helper_check(dev, state);
+ if (ret)
+ return ret;
+
+ return vc4_load_tracker_atomic_check(state);
}

static const struct drm_mode_config_funcs vc4_mode_funcs = {
@@ -417,6 +508,7 @@ int vc4_kms_load(struct drm_device *dev)
{
struct vc4_dev *vc4 = to_vc4_dev(dev);
struct vc4_ctm_state *ctm_state;
+ struct vc4_load_tracker_state *load_state;
int ret;

sema_init(&vc4->async_modeset, 1);
@@ -446,6 +538,15 @@ int vc4_kms_load(struct drm_device *dev)
drm_atomic_private_obj_init(dev, &vc4->ctm_manager, &ctm_state->base,
&vc4_ctm_state_funcs);

+ load_state = kzalloc(sizeof(*load_state), GFP_KERNEL);
+ if (!load_state) {
+ drm_atomic_private_obj_fini(&vc4->ctm_manager);
+ return -ENOMEM;
+ }
+
+ drm_atomic_private_obj_init(dev, &vc4->load_tracker, &load_state->base,
+ &vc4_load_tracker_state_funcs);
+
drm_mode_config_reset(dev);

drm_kms_helper_poll_init(dev);
diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
index 1babfeca0c92..4d918d3e4858 100644
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -488,6 +488,61 @@ static void vc4_write_scaling_parameters(struct drm_plane_state *state,
}
}

+static void vc4_plane_calc_load(struct drm_plane_state *state)
+{
+ unsigned int hvs_load_shift, vrefresh, i;
+ struct drm_framebuffer *fb = state->fb;
+ struct vc4_plane_state *vc4_state;
+ struct drm_crtc_state *crtc_state;
+ unsigned int vscale_factor;
+
+ vc4_state = to_vc4_plane_state(state);
+ crtc_state = drm_atomic_get_existing_crtc_state(state->state,
+ state->crtc);
+ vrefresh = drm_mode_vrefresh(&crtc_state->adjusted_mode);
+
+ /* The HVS is able to process 2 pixels/cycle when scaling the source,
+ * 4 pixels/cycle otherwise.
+ * Alpha blending step seems to be pipelined and it's always operating
+ * at 4 pixels/cycle, so the limiting aspect here seems to be the
+ * scaler block.
+ * HVS load is expressed in clk-cycles/sec (AKA Hz).
+ */
+ if (vc4_state->x_scaling[0] != VC4_SCALING_NONE ||
+ vc4_state->x_scaling[1] != VC4_SCALING_NONE ||
+ vc4_state->y_scaling[0] != VC4_SCALING_NONE ||
+ vc4_state->y_scaling[1] != VC4_SCALING_NONE)
+ hvs_load_shift = 1;
+ else
+ hvs_load_shift = 2;
+
+ vc4_state->membus_load = 0;
+ vc4_state->hvs_load = 0;
+ for (i = 0; i < fb->format->num_planes; i++) {
+ /* Even if the bandwidth/plane required for a single frame is
+ *
+ * vc4_state->src_w[i] * vc4_state->src_h[i] * cpp * vrefresh
+ *
+ * when downscaling, we have to read more pixels per line in
+ * the time frame reserved for a single line, so the bandwidth
+ * demand can be punctually higher. To account for that, we
+ * calculate the down-scaling factor and multiply the plane
+ * load by this number. We're likely over-estimating the read
+ * demand, but that's better than under-estimating it.
+ */
+ vscale_factor = DIV_ROUND_UP(vc4_state->src_h[i],
+ vc4_state->crtc_h);
+ vc4_state->membus_load += vc4_state->src_w[i] *
+ vc4_state->src_h[i] * vscale_factor *
+ fb->format->cpp[i];
+ vc4_state->hvs_load += vc4_state->crtc_h * vc4_state->crtc_w;
+ }
+
+ vc4_state->hvs_load *= vrefresh;
+ vc4_state->hvs_load >>= hvs_load_shift;
+ vc4_state->membus_load *= vrefresh;
+}
+
static int vc4_plane_allocate_lbm(struct drm_plane_state *state)
{
struct vc4_dev *vc4 = to_vc4_dev(state->plane->dev);
@@ -875,6 +930,8 @@ static int vc4_plane_mode_set(struct drm_plane *plane,
*/
vc4_state->dlist_initialized = 1;

+ vc4_plane_calc_load(state);
+
return 0;
}

--
2.20.1