diff options
author | Konsta Holtta <kholtta@nvidia.com> | 2017-02-01 03:28:38 -0500 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2017-03-02 20:51:03 -0500 |
commit | f1072a28be09dc7f5433b5e1013a76d8a87c2505 (patch) | |
tree | 68d1a5b5123834859f8ae8c4481b886b49364811 /drivers/gpu/nvgpu/gk20a/channel_gk20a.h | |
parent | 0c155313e75a82a409d3438cc982ee30bb453d16 (diff) |
gpu: nvgpu: add worker for watchdog and job cleanup
Implement a worker thread to replace the delayed works in channel
watchdog and job cleanups. Watchdog runs by polling the channel states
periodically, and job cleanup is performed on channels that are appended
on a work queue consumed by the worker thread. Handling both of these
two in the same thread makes it impossible for them to cause a deadlock,
as has previously happened.
The watchdog takes references to channels during checking and possibly
recovering channels. Jobs in the cleanup queue have an additional
reference taken which is released after the channel is processed. The
worker is woken up from periodic sleep when channels are added to the
queue.
Currently, the queue is only used for job cleanups, but it is extendable
for other per-channel works too. The worker can also process other
periodic actions dependent on channels.
Neither the semantics of timeout handling nor those of job cleanup are yet
significantly changed - this patch only serializes them into one
background thread.
Each job that needs cleanup is tracked and holds a reference to its
channel and a power reference, and timeouts can only be processed on
channels that are tracked. As a result, the thread is always idle by the
time the system is about to be suspended, so there is currently no need
to explicitly suspend or stop it.
Bug 1848834
Bug 1851689
Bug 1814773
Bug 200270332
Jira NVGPU-21
Change-Id: I355101802f50841ea9bd8042a017f91c931d2dc7
Signed-off-by: Konsta Holtta <kholtta@nvidia.com>
Reviewed-on: http://git-master/r/1297183
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/channel_gk20a.h')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 33 |
1 files changed, 21 insertions, 12 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index 14ee9f69..d9913cd7 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <uapi/linux/nvgpu.h> | 27 | #include <uapi/linux/nvgpu.h> |
28 | 28 | ||
29 | #include <nvgpu/lock.h> | 29 | #include <nvgpu/lock.h> |
30 | #include <nvgpu/timers.h> | ||
30 | 31 | ||
31 | struct gk20a; | 32 | struct gk20a; |
32 | struct gr_gk20a; | 33 | struct gr_gk20a; |
@@ -87,12 +88,19 @@ struct channel_gk20a_joblist { | |||
87 | struct list_head jobs; | 88 | struct list_head jobs; |
88 | struct nvgpu_spinlock lock; | 89 | struct nvgpu_spinlock lock; |
89 | } dynamic; | 90 | } dynamic; |
91 | |||
92 | /* | ||
93 | * Synchronize abort cleanup (when closing a channel) and job cleanup | ||
94 | * (asynchronously from worker) - protect from concurrent access when | ||
95 | * job resources are being freed. | ||
96 | */ | ||
97 | struct nvgpu_mutex cleanup_lock; | ||
90 | }; | 98 | }; |
91 | 99 | ||
92 | struct channel_gk20a_timeout { | 100 | struct channel_gk20a_timeout { |
93 | struct delayed_work wq; | ||
94 | struct nvgpu_raw_spinlock lock; | 101 | struct nvgpu_raw_spinlock lock; |
95 | bool initialized; | 102 | struct nvgpu_timeout timer; |
103 | bool running; | ||
96 | u32 gp_get; | 104 | u32 gp_get; |
97 | }; | 105 | }; |
98 | 106 | ||
@@ -110,12 +118,6 @@ struct gk20a_event_id_data { | |||
110 | struct list_head event_id_node; | 118 | struct list_head event_id_node; |
111 | }; | 119 | }; |
112 | 120 | ||
113 | struct channel_gk20a_clean_up { | ||
114 | struct nvgpu_mutex lock; | ||
115 | bool scheduled; | ||
116 | struct delayed_work wq; | ||
117 | }; | ||
118 | |||
119 | /* | 121 | /* |
120 | * Track refcount actions, saving their stack traces. This number specifies how | 122 | * Track refcount actions, saving their stack traces. This number specifies how |
121 | * many most recent actions are stored in a buffer. Set to 0 to disable. 128 | 123 | * many most recent actions are stored in a buffer. Set to 0 to disable. 128 |
@@ -214,7 +216,8 @@ struct channel_gk20a { | |||
214 | u32 timeout_gpfifo_get; | 216 | u32 timeout_gpfifo_get; |
215 | 217 | ||
216 | struct channel_gk20a_timeout timeout; | 218 | struct channel_gk20a_timeout timeout; |
217 | struct channel_gk20a_clean_up clean_up; | 219 | /* for job cleanup handling in the background worker */ |
220 | struct list_head worker_item; | ||
218 | 221 | ||
219 | #if defined(CONFIG_GK20A_CYCLE_STATS) | 222 | #if defined(CONFIG_GK20A_CYCLE_STATS) |
220 | struct { | 223 | struct { |
@@ -250,8 +253,11 @@ struct channel_gk20a { | |||
250 | u64 virt_ctx; | 253 | u64 virt_ctx; |
251 | #endif | 254 | #endif |
252 | 255 | ||
253 | /* signal channel owner via a callback, if set, in gk20a_channel_update | 256 | /* |
254 | * via schedule_work */ | 257 | * Signal channel owner via a callback, if set, in job cleanup with |
258 | * schedule_work. Means that something finished on the channel (perhaps | ||
259 | * more than one job). | ||
260 | */ | ||
255 | void (*update_fn)(struct channel_gk20a *, void *); | 261 | void (*update_fn)(struct channel_gk20a *, void *); |
256 | void *update_fn_data; | 262 | void *update_fn_data; |
257 | struct nvgpu_spinlock update_fn_lock; /* make access to the two above atomic */ | 263 | struct nvgpu_spinlock update_fn_lock; /* make access to the two above atomic */ |
@@ -293,6 +299,9 @@ int gk20a_disable_channel_tsg(struct gk20a *g, struct channel_gk20a *ch); | |||
293 | int gk20a_channel_suspend(struct gk20a *g); | 299 | int gk20a_channel_suspend(struct gk20a *g); |
294 | int gk20a_channel_resume(struct gk20a *g); | 300 | int gk20a_channel_resume(struct gk20a *g); |
295 | 301 | ||
302 | int nvgpu_channel_worker_init(struct gk20a *g); | ||
303 | void nvgpu_channel_worker_deinit(struct gk20a *g); | ||
304 | |||
296 | /* Channel file operations */ | 305 | /* Channel file operations */ |
297 | int gk20a_channel_open(struct inode *inode, struct file *filp); | 306 | int gk20a_channel_open(struct inode *inode, struct file *filp); |
298 | int gk20a_channel_open_ioctl(struct gk20a *g, | 307 | int gk20a_channel_open_ioctl(struct gk20a *g, |
@@ -302,7 +311,7 @@ long gk20a_channel_ioctl(struct file *filp, | |||
302 | unsigned long arg); | 311 | unsigned long arg); |
303 | int gk20a_channel_release(struct inode *inode, struct file *filp); | 312 | int gk20a_channel_release(struct inode *inode, struct file *filp); |
304 | struct channel_gk20a *gk20a_get_channel_from_file(int fd); | 313 | struct channel_gk20a *gk20a_get_channel_from_file(int fd); |
305 | void gk20a_channel_update(struct channel_gk20a *c, int nr_completed); | 314 | void gk20a_channel_update(struct channel_gk20a *c); |
306 | 315 | ||
307 | void gk20a_init_channel(struct gpu_ops *gops); | 316 | void gk20a_init_channel(struct gpu_ops *gops); |
308 | 317 | ||