Re: [PATCH v2 27/27] drm/dp_mst: Add topology ref history tracking for debugging

From: Sean Paul
Date: Fri Sep 27 2019 - 10:51:12 EST


On Tue, Sep 03, 2019 at 04:46:05PM -0400, Lyude Paul wrote:
> For very subtle mistakes with topology refs, it can be rather difficult
> to trace them down with the debugging info that we already have. I had
> one such issue recently while trying to implement suspend/resume
> reprobing for MST, and ended up coming up with this.
>
> Inspired by Chris Wilson's wakeref tracking for i915, this adds a very
> similar feature to the DP MST helpers, which allows for partial tracking
> of topology refs for both ports and branch devices. This is a lot less
> advanced then wakeref tracking: we merely keep a count of all of the
> spots where a topology ref has been grabbed or dropped, then dump out
> that history in chronological order when a port or branch device's
> topology refcount reaches 0. So far, I've found this incredibly useful
> for debugging topology refcount errors.
>
> Since this has the potential to be somewhat slow and loud, we add an
> expert kernel config option to enable or disable this feature,
> CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS.
>

Looks very useful indeed!

My only nit is that we could probably grow the list a little more aggressively
(or start it off at some size > 1) and avoid a bunch of reallocs. That said,
I'm not sure how often it's reallocated so it might not be an issue. Either
way,

Reviewed-by: Sean Paul <sean@xxxxxxxxxx>


> Changes since v1:
> * Don't forget to destroy topology_ref_history_lock
>
> Cc: Juston Li <juston.li@xxxxxxxxx>
> Cc: Imre Deak <imre.deak@xxxxxxxxx>
> Cc: Ville Syrjälä <ville.syrjala@xxxxxxxxxxxxxxx>
> Cc: Harry Wentland <hwentlan@xxxxxxx>
> Cc: Daniel Vetter <daniel.vetter@xxxxxxxx>
> Signed-off-by: Lyude Paul <lyude@xxxxxxxxxx>
> ---
> drivers/gpu/drm/Kconfig | 14 ++
> drivers/gpu/drm/drm_dp_mst_topology.c | 233 +++++++++++++++++++++++++-
> include/drm/drm_dp_mst_helper.h | 45 +++++
> 3 files changed, 288 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
> index e67c194c2aca..44fc2c2a6e2c 100644
> --- a/drivers/gpu/drm/Kconfig
> +++ b/drivers/gpu/drm/Kconfig
> @@ -93,6 +93,20 @@ config DRM_KMS_FB_HELPER
> help
> FBDEV helpers for KMS drivers.
>
> +config DRM_DEBUG_DP_MST_TOPOLOGY_REFS
> + bool "Enable refcount backtrace history in the DP MST helpers"
> + select STACKDEPOT
> + depends on DRM_KMS_HELPER
> + depends on DEBUG_KERNEL
> + depends on EXPERT
> + help
> + Enables debug tracing for topology refs in DRM's DP MST helpers. A
> + history of each topology reference/dereference will be printed to the
> + kernel log once a port or branch device's topology refcount reaches 0.
> +
> + This has the potential to use a lot of memory and print some very
> + large kernel messages. If in doubt, say "N".
> +
> config DRM_FBDEV_EMULATION
> bool "Enable legacy fbdev support for your modesetting driver"
> depends on DRM
> diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c b/drivers/gpu/drm/drm_dp_mst_topology.c
> index 5b5c0b3b3c0e..18f9a02927d9 100644
> --- a/drivers/gpu/drm/drm_dp_mst_topology.c
> +++ b/drivers/gpu/drm/drm_dp_mst_topology.c
> @@ -28,6 +28,13 @@
> #include <linux/sched.h>
> #include <linux/seq_file.h>
>
> +#if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS)
> +#include <linux/stackdepot.h>
> +#include <linux/sort.h>
> +#include <linux/timekeeping.h>
> +#include <linux/math64.h>
> +#endif
> +
> #include <drm/drm_atomic.h>
> #include <drm/drm_atomic_helper.h>
> #include <drm/drm_dp_mst_helper.h>
> @@ -1405,12 +1412,189 @@ drm_dp_mst_put_port_malloc(struct drm_dp_mst_port *port)
> }
> EXPORT_SYMBOL(drm_dp_mst_put_port_malloc);
>
> +#if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS)
> +
> +#define STACK_DEPTH 8
> +
> +static noinline void
> +__topology_ref_save(struct drm_dp_mst_topology_mgr *mgr,
> + struct drm_dp_mst_topology_ref_history *history,
> + enum drm_dp_mst_topology_ref_type type)
> +{
> + struct drm_dp_mst_topology_ref_entry *entry = NULL;
> + depot_stack_handle_t backtrace;
> + ulong stack_entries[STACK_DEPTH];
> + uint n;
> + int i;
> +
> + n = stack_trace_save(stack_entries, ARRAY_SIZE(stack_entries), 1);
> + backtrace = stack_depot_save(stack_entries, n, GFP_KERNEL);
> + if (!backtrace)
> + goto fail_alloc;
> +
> + /* Try to find an existing entry for this backtrace */
> + for (i = 0; i < history->len; i++) {
> + if (history->entries[i].backtrace == backtrace) {
> + entry = &history->entries[i];
> + break;
> + }
> + }
> +
> + /* Otherwise add one */
> + if (!entry) {
> + struct drm_dp_mst_topology_ref_entry *new;
> + int new_len = history->len + 1;
> +
> + new = krealloc(history->entries, sizeof(*new) * new_len,
> + GFP_KERNEL);
> + if (!new)
> + goto fail_alloc;
> +
> + entry = &new[history->len];
> + history->len = new_len;
> + history->entries = new;
> +
> + entry->backtrace = backtrace;
> + entry->type = type;
> + entry->count = 0;
> + }
> + entry->count++;
> + entry->ts_nsec = ktime_get_ns();
> +
> + return;
> +fail_alloc:
> + DRM_WARN_ONCE("Failed to allocate memory for topology refcount backtrace\n");
> +}
> +
> +static int
> +topology_ref_history_cmp(const void *a, const void *b)
> +{
> + const struct drm_dp_mst_topology_ref_entry *entry_a = a, *entry_b = b;
> +
> + if (entry_a->ts_nsec > entry_b->ts_nsec)
> + return 1;
> + else if (entry_a->ts_nsec < entry_b->ts_nsec)
> + return -1;
> + else
> + return 0;
> +}
> +
> +static inline const char *
> +topology_ref_type_to_str(enum drm_dp_mst_topology_ref_type type)
> +{
> + if (type == DRM_DP_MST_TOPOLOGY_REF_GET)
> + return "get";
> + else
> + return "put";
> +}
> +
> +static void
> +__dump_topology_ref_history(struct drm_dp_mst_topology_ref_history *history,
> + void *ptr, const char *type_str)
> +{
> + struct drm_printer p = drm_debug_printer(DBG_PREFIX);
> + char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
> + int i;
> +
> + if (!buf)
> + return;
> +
> + if (!history->len)
> + goto out;
> +
> + /* First, sort the list so that it goes from oldest to newest
> + * reference entry
> + */
> + sort(history->entries, history->len, sizeof(*history->entries),
> + topology_ref_history_cmp, NULL);
> +
> + drm_printf(&p,
> + "%s (%p/%px) topology count reached 0, dumping history:\n",
> + type_str, ptr, ptr);
> +
> + for (i = 0; i < history->len; i++) {
> + const struct drm_dp_mst_topology_ref_entry *entry =
> + &history->entries[i];
> + ulong *entries;
> + uint nr_entries;
> + u64 ts_nsec = entry->ts_nsec;
> + u64 rem_nsec = do_div(ts_nsec, 1000000000);
> +
> + nr_entries = stack_depot_fetch(entry->backtrace, &entries);
> + stack_trace_snprint(buf, PAGE_SIZE, entries, nr_entries, 4);
> +
> + drm_printf(&p, " %d %ss (last at %5llu.%06llu):\n%s",
> + entry->count,
> + topology_ref_type_to_str(entry->type),
> + ts_nsec, rem_nsec / 1000, buf);
> + }
> +
> + /* Now free the history, since this is the only time we expose it */
> + kfree(history->entries);
> +out:
> + kfree(buf);
> +}
> +
> +static __always_inline void
> +drm_dp_mst_dump_mstb_topology_history(struct drm_dp_mst_branch *mstb)
> +{
> + __dump_topology_ref_history(&mstb->topology_ref_history, mstb,
> + "MSTB");
> +}
> +
> +static __always_inline void
> +drm_dp_mst_dump_port_topology_history(struct drm_dp_mst_port *port)
> +{
> + __dump_topology_ref_history(&port->topology_ref_history, port,
> + "Port");
> +}
> +
> +static __always_inline void
> +save_mstb_topology_ref(struct drm_dp_mst_branch *mstb,
> + enum drm_dp_mst_topology_ref_type type)
> +{
> + __topology_ref_save(mstb->mgr, &mstb->topology_ref_history, type);
> +}
> +
> +static __always_inline void
> +save_port_topology_ref(struct drm_dp_mst_port *port,
> + enum drm_dp_mst_topology_ref_type type)
> +{
> + __topology_ref_save(port->mgr, &port->topology_ref_history, type);
> +}
> +
> +static inline void
> +topology_ref_history_lock(struct drm_dp_mst_topology_mgr *mgr)
> +{
> + mutex_lock(&mgr->topology_ref_history_lock);
> +}
> +
> +static inline void
> +topology_ref_history_unlock(struct drm_dp_mst_topology_mgr *mgr)
> +{
> + mutex_unlock(&mgr->topology_ref_history_lock);
> +}
> +#else
> +static inline void
> +topology_ref_history_lock(struct drm_dp_mst_topology_mgr *mgr) {}
> +static inline void
> +topology_ref_history_unlock(struct drm_dp_mst_topology_mgr *mgr) {}
> +static inline void
> +drm_dp_mst_dump_mstb_topology_history(struct drm_dp_mst_branch *mstb) {}
> +static inline void
> +drm_dp_mst_dump_port_topology_history(struct drm_dp_mst_port *port) {}
> +#define save_mstb_topology_ref(mstb, type)
> +#define save_port_topology_ref(port, type)
> +#endif
> +
> static void drm_dp_destroy_mst_branch_device(struct kref *kref)
> {
> struct drm_dp_mst_branch *mstb =
> container_of(kref, struct drm_dp_mst_branch, topology_kref);
> struct drm_dp_mst_topology_mgr *mgr = mstb->mgr;
>
> + drm_dp_mst_dump_mstb_topology_history(mstb);
> +
> INIT_LIST_HEAD(&mstb->destroy_next);
>
> /*
> @@ -1448,11 +1632,18 @@ static void drm_dp_destroy_mst_branch_device(struct kref *kref)
> static int __must_check
> drm_dp_mst_topology_try_get_mstb(struct drm_dp_mst_branch *mstb)
> {
> - int ret = kref_get_unless_zero(&mstb->topology_kref);
> + int ret;
>
> - if (ret)
> + topology_ref_history_lock(mstb->mgr);
> + ret = kref_get_unless_zero(&mstb->topology_kref);
> +
> + if (ret) {
> DRM_DEBUG("mstb %p/%px (%d)\n",
> mstb, mstb, kref_read(&mstb->topology_kref));
> + save_mstb_topology_ref(mstb, DRM_DP_MST_TOPOLOGY_REF_GET);
> + }
> +
> + topology_ref_history_unlock(mstb->mgr);
>
> return ret;
> }
> @@ -1473,10 +1664,15 @@ drm_dp_mst_topology_try_get_mstb(struct drm_dp_mst_branch *mstb)
> */
> static void drm_dp_mst_topology_get_mstb(struct drm_dp_mst_branch *mstb)
> {
> + topology_ref_history_lock(mstb->mgr);
> +
> + save_mstb_topology_ref(mstb, DRM_DP_MST_TOPOLOGY_REF_GET);
> WARN_ON(kref_read(&mstb->topology_kref) == 0);
> kref_get(&mstb->topology_kref);
> DRM_DEBUG("mstb %p/%px (%d)\n",
> mstb, mstb, kref_read(&mstb->topology_kref));
> +
> + topology_ref_history_unlock(mstb->mgr);
> }
>
> /**
> @@ -1494,9 +1690,14 @@ static void drm_dp_mst_topology_get_mstb(struct drm_dp_mst_branch *mstb)
> static void
> drm_dp_mst_topology_put_mstb(struct drm_dp_mst_branch *mstb)
> {
> + topology_ref_history_lock(mstb->mgr);
> +
> DRM_DEBUG("mstb %p/%px (%d)\n",
> mstb, mstb, kref_read(&mstb->topology_kref) - 1);
> + save_mstb_topology_ref(mstb, DRM_DP_MST_TOPOLOGY_REF_PUT);
> kref_put(&mstb->topology_kref, drm_dp_destroy_mst_branch_device);
> +
> + topology_ref_history_unlock(mstb->mgr);
> }
>
> static void drm_dp_destroy_port(struct kref *kref)
> @@ -1505,6 +1706,8 @@ static void drm_dp_destroy_port(struct kref *kref)
> container_of(kref, struct drm_dp_mst_port, topology_kref);
> struct drm_dp_mst_topology_mgr *mgr = port->mgr;
>
> + drm_dp_mst_dump_port_topology_history(port);
> +
> /* There's nothing that needs locking to destroy an input port yet */
> if (port->input) {
> drm_dp_mst_put_port_malloc(port);
> @@ -1548,12 +1751,18 @@ static void drm_dp_destroy_port(struct kref *kref)
> static int __must_check
> drm_dp_mst_topology_try_get_port(struct drm_dp_mst_port *port)
> {
> - int ret = kref_get_unless_zero(&port->topology_kref);
> + int ret;
> +
> + topology_ref_history_lock(port->mgr);
> + ret = kref_get_unless_zero(&port->topology_kref);
>
> - if (ret)
> + if (ret) {
> DRM_DEBUG("port %p/%px (%d)\n",
> port, port, kref_read(&port->topology_kref));
> + save_port_topology_ref(port, DRM_DP_MST_TOPOLOGY_REF_GET);
> + }
>
> + topology_ref_history_unlock(port->mgr);
> return ret;
> }
>
> @@ -1572,10 +1781,15 @@ drm_dp_mst_topology_try_get_port(struct drm_dp_mst_port *port)
> */
> static void drm_dp_mst_topology_get_port(struct drm_dp_mst_port *port)
> {
> + topology_ref_history_lock(port->mgr);
> +
> WARN_ON(kref_read(&port->topology_kref) == 0);
> kref_get(&port->topology_kref);
> DRM_DEBUG("port %p/%px (%d)\n",
> port, port, kref_read(&port->topology_kref));
> + save_port_topology_ref(port, DRM_DP_MST_TOPOLOGY_REF_GET);
> +
> + topology_ref_history_unlock(port->mgr);
> }
>
> /**
> @@ -1591,9 +1805,14 @@ static void drm_dp_mst_topology_get_port(struct drm_dp_mst_port *port)
> */
> static void drm_dp_mst_topology_put_port(struct drm_dp_mst_port *port)
> {
> + topology_ref_history_lock(port->mgr);
> +
> DRM_DEBUG("port %p/%px (%d)\n",
> port, port, kref_read(&port->topology_kref) - 1);
> + save_port_topology_ref(port, DRM_DP_MST_TOPOLOGY_REF_PUT);
> kref_put(&port->topology_kref, drm_dp_destroy_port);
> +
> + topology_ref_history_unlock(port->mgr);
> }
>
> static struct drm_dp_mst_branch *
> @@ -4548,6 +4767,9 @@ int drm_dp_mst_topology_mgr_init(struct drm_dp_mst_topology_mgr *mgr,
> mutex_init(&mgr->payload_lock);
> mutex_init(&mgr->delayed_destroy_lock);
> mutex_init(&mgr->up_req_lock);
> +#if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS)
> + mutex_init(&mgr->topology_ref_history_lock);
> +#endif
> INIT_LIST_HEAD(&mgr->tx_msg_downq);
> INIT_LIST_HEAD(&mgr->destroy_port_list);
> INIT_LIST_HEAD(&mgr->destroy_branch_device_list);
> @@ -4613,6 +4835,9 @@ void drm_dp_mst_topology_mgr_destroy(struct drm_dp_mst_topology_mgr *mgr)
> mutex_destroy(&mgr->qlock);
> mutex_destroy(&mgr->lock);
> mutex_destroy(&mgr->up_req_lock);
> +#if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS)
> + mutex_destroy(&mgr->topology_ref_history_lock);
> +#endif
> }
> EXPORT_SYMBOL(drm_dp_mst_topology_mgr_destroy);
>
> diff --git a/include/drm/drm_dp_mst_helper.h b/include/drm/drm_dp_mst_helper.h
> index 1bdee5ee6dcd..75b8fba6f399 100644
> --- a/include/drm/drm_dp_mst_helper.h
> +++ b/include/drm/drm_dp_mst_helper.h
> @@ -26,6 +26,26 @@
> #include <drm/drm_dp_helper.h>
> #include <drm/drm_atomic.h>
>
> +#if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS)
> +#include <linux/stackdepot.h>
> +#include <linux/timekeeping.h>
> +
> +enum drm_dp_mst_topology_ref_type {
> + DRM_DP_MST_TOPOLOGY_REF_GET,
> + DRM_DP_MST_TOPOLOGY_REF_PUT,
> +};
> +
> +struct drm_dp_mst_topology_ref_history {
> + struct drm_dp_mst_topology_ref_entry {
> + enum drm_dp_mst_topology_ref_type type;
> + int count;
> + ktime_t ts_nsec;
> + depot_stack_handle_t backtrace;
> + } *entries;
> + int len;
> +};
> +#endif /* IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS) */
> +
> struct drm_dp_mst_branch;
>
> /**
> @@ -92,6 +112,14 @@ struct drm_dp_mst_port {
> */
> struct kref malloc_kref;
>
> +#if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS)
> + /**
> + * @topology_ref_history: A history of each topology
> + * reference/dereference. See CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS.
> + */
> + struct drm_dp_mst_topology_ref_history topology_ref_history;
> +#endif
> +
> u8 port_num;
> bool input;
> bool mcs;
> @@ -162,6 +190,14 @@ struct drm_dp_mst_branch {
> */
> struct kref malloc_kref;
>
> +#if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS)
> + /**
> + * @topology_ref_history: A history of each topology
> + * reference/dereference. See CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS.
> + */
> + struct drm_dp_mst_topology_ref_history topology_ref_history;
> +#endif
> +
> /**
> * @destroy_next: linked-list entry used by
> * drm_dp_delayed_destroy_work()
> @@ -630,6 +666,15 @@ struct drm_dp_mst_topology_mgr {
> * transmissions.
> */
> struct work_struct up_req_work;
> +
> +#if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS)
> + /**
> + * @topology_ref_history_lock: protects
> + * &drm_dp_mst_port.topology_ref_history and
> + * &drm_dp_mst_branch.topology_ref_history.
> + */
> + struct mutex topology_ref_history_lock;
> +#endif
> };
>
> int drm_dp_mst_topology_mgr_init(struct drm_dp_mst_topology_mgr *mgr,
> --
> 2.21.0
>

--
Sean Paul, Software Engineer, Google / Chromium OS