From 5e68c6e971d98fc9d4beaf69c5ca58f39f8db1a7 Mon Sep 17 00:00:00 2001 From: Konsta Holtta Date: Wed, 4 Jan 2017 20:59:01 +0200 Subject: gpu: nvgpu: add support for refcount tracking If enabled, track actions (gets and puts) on channel reference counters. Dump the most recent actions to syslog when gk20a_wait_until_counter_is_N gets stuck when closing a channel. GK20A_CHANNEL_REFCOUNT_TRACKING specifies the size of the action history. Default is to disable completely, as this has some runtime overhead. Bug 1826754 Change-Id: I880b0efe8881044d02ae224c243a51cb6c2db8c1 Signed-off-by: Konsta Holtta Reviewed-on: http://git-master/r/1262424 Reviewed-by: mobile promotions Tested-by: mobile promotions --- drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 80 ++++++++++++++++++++++++++++++++- drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 48 +++++++++++++++++++- 2 files changed, 126 insertions(+), 2 deletions(-) (limited to 'drivers/gpu') diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index 5e6ddb2e..a731e29c 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c @@ -1,7 +1,7 @@ /* * GK20A Graphics channel * - * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -62,6 +62,7 @@ struct channel_priv { static struct channel_gk20a *allocate_channel(struct fifo_gk20a *f); static void free_channel(struct fifo_gk20a *f, struct channel_gk20a *c); +static void gk20a_channel_dump_ref_actions(struct channel_gk20a *c); static void free_priv_cmdbuf(struct channel_gk20a *c, struct priv_cmd_entry *e); @@ -886,6 +887,8 @@ static void gk20a_wait_until_counter_is_N( "%s: channel %d, still waiting, %s left: %d, waiting for: %d", caller, ch->hw_chid, counter_name, atomic_read(counter), wait_value); + + gk20a_channel_dump_ref_actions(ch); } } @@ -1054,6 +1057,11 @@ unbind: if (channel_gk20a_is_prealloc_enabled(ch)) channel_gk20a_free_prealloc_resources(ch); +#if GK20A_CHANNEL_REFCOUNT_TRACKING + memset(ch->ref_actions, 0, sizeof(ch->ref_actions)); + ch->ref_actions_put = 0; +#endif + /* make sure we catch accesses of unopened channels in case * there's non-refcounted channel pointers hanging around */ ch->g = NULL; @@ -1063,6 +1071,71 @@ unbind: free_channel(f, ch); } +static void gk20a_channel_dump_ref_actions(struct channel_gk20a *ch) +{ +#if GK20A_CHANNEL_REFCOUNT_TRACKING + size_t i, get; + unsigned long now = jiffies; + unsigned long prev_jiffies = 0; + struct device *dev = dev_from_gk20a(ch->g); + + spin_lock(&ch->ref_actions_lock); + + dev_info(dev, "ch %d: refs %d. Actions, most recent last:\n", + ch->hw_chid, atomic_read(&ch->ref_count)); + + /* start at the oldest possible entry. put is next insertion point */ + get = ch->ref_actions_put; + + /* + * If the buffer is not full, this will first loop to the oldest entry, + * skipping not-yet-initialized entries. There is no ref_actions_get. 
+ */ + for (i = 0; i < GK20A_CHANNEL_REFCOUNT_TRACKING; i++) { + struct channel_gk20a_ref_action *act = &ch->ref_actions[get]; + + if (act->trace.nr_entries) { + dev_info(dev, "%s ref %zu steps ago (age %d ms, diff %d ms)\n", + act->type == channel_gk20a_ref_action_get + ? "GET" : "PUT", + GK20A_CHANNEL_REFCOUNT_TRACKING - 1 - i, + jiffies_to_msecs(now - act->jiffies), + jiffies_to_msecs(act->jiffies - prev_jiffies)); + + print_stack_trace(&act->trace, 0); + prev_jiffies = act->jiffies; + } + + get = (get + 1) % GK20A_CHANNEL_REFCOUNT_TRACKING; + } + + spin_unlock(&ch->ref_actions_lock); +#endif +} + +static void gk20a_channel_save_ref_source(struct channel_gk20a *ch, + enum channel_gk20a_ref_action_type type) +{ +#if GK20A_CHANNEL_REFCOUNT_TRACKING + struct channel_gk20a_ref_action *act; + + spin_lock(&ch->ref_actions_lock); + + act = &ch->ref_actions[ch->ref_actions_put]; + act->type = type; + act->trace.max_entries = GK20A_CHANNEL_REFCOUNT_TRACKING_STACKLEN; + act->trace.nr_entries = 0; + act->trace.skip = 3; /* onwards from the caller of this */ + act->trace.entries = act->trace_entries; + save_stack_trace(&act->trace); + act->jiffies = jiffies; + ch->ref_actions_put = (ch->ref_actions_put + 1) % + GK20A_CHANNEL_REFCOUNT_TRACKING; + + spin_unlock(&ch->ref_actions_lock); +#endif +} + /* Try to get a reference to the channel. Return nonzero on success. If fails, * the channel is dead or being freed elsewhere and you must not touch it. 
* @@ -1082,6 +1155,7 @@ struct channel_gk20a *_gk20a_channel_get(struct channel_gk20a *ch, spin_lock(&ch->ref_obtain_lock); if (likely(ch->referenceable)) { + gk20a_channel_save_ref_source(ch, channel_gk20a_ref_action_get); atomic_inc(&ch->ref_count); ret = ch; } else @@ -1097,6 +1171,7 @@ struct channel_gk20a *_gk20a_channel_get(struct channel_gk20a *ch, void _gk20a_channel_put(struct channel_gk20a *ch, const char *caller) { + gk20a_channel_save_ref_source(ch, channel_gk20a_ref_action_put); trace_gk20a_channel_put(ch->hw_chid, caller); atomic_dec(&ch->ref_count); wake_up_all(&ch->ref_count_dec_wq); @@ -2861,6 +2936,9 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid) atomic_set(&c->ref_count, 0); c->referenceable = false; init_waitqueue_head(&c->ref_count_dec_wq); +#if GK20A_CHANNEL_REFCOUNT_TRACKING + spin_lock_init(&c->ref_actions_lock); +#endif mutex_init(&c->ioctl_lock); mutex_init(&c->error_notifier_mutex); spin_lock_init(&c->joblist.dynamic.lock); diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index 697d1603..44a989da 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h @@ -1,7 +1,7 @@ /* * GK20A graphics channel * - * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -115,6 +116,40 @@ struct channel_gk20a_clean_up { struct delayed_work wq; }; +/* + * Track refcount actions, saving their stack traces. This number specifies how + * many most recent actions are stored in a buffer. Set to 0 to disable. 128 + * should be enough to track moderately hard problems from the start. 
+ */ +#define GK20A_CHANNEL_REFCOUNT_TRACKING 0 +/* Stack depth for the saved actions. */ +#define GK20A_CHANNEL_REFCOUNT_TRACKING_STACKLEN 8 + +/* + * Because the puts and gets are not linked together explicitly (although they + * should always come in pairs), it's not possible to tell which ref holder to + * delete from the list when doing a put. So, just store some number of most + * recent gets and puts in a ring buffer, to obtain a history. + * + * These are zeroed when a channel is closed, so a new one starts fresh. + */ + +enum channel_gk20a_ref_action_type { + channel_gk20a_ref_action_get, + channel_gk20a_ref_action_put +}; + +struct channel_gk20a_ref_action { + enum channel_gk20a_ref_action_type type; + unsigned long jiffies; + /* + * Many of these traces will be similar. Simpler to just capture + * duplicates than to have a separate database for the entries. + */ + struct stack_trace trace; + unsigned long trace_entries[GK20A_CHANNEL_REFCOUNT_TRACKING_STACKLEN]; +}; + /* this is the priv element of struct nvhost_channel */ struct channel_gk20a { struct gk20a *g; /* set only when channel is active */ @@ -125,6 +160,17 @@ struct channel_gk20a { bool referenceable; atomic_t ref_count; wait_queue_head_t ref_count_dec_wq; +#if GK20A_CHANNEL_REFCOUNT_TRACKING + /* + * Ring buffer for most recent refcount gets and puts. Protected by + * ref_actions_lock when getting or putting refs (i.e., adding + * entries), and when reading entries. + */ + struct channel_gk20a_ref_action ref_actions[ + GK20A_CHANNEL_REFCOUNT_TRACKING]; + size_t ref_actions_put; /* index of next write */ + spinlock_t ref_actions_lock; +#endif struct gk20a_semaphore_int *hw_sema; -- cgit v1.2.2