about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Felix Kuehling <Felix.Kuehling@amd.com> 2018-03-23 15:30:36 -0400
committer: Oded Gabbay <oded.gabbay@gmail.com> 2018-03-23 15:30:36 -0400
commit: 1679ae8f8f4148766423066aeb3dbb0a985a373a (patch)
tree: 8d766de21c5a0a25618ec8d5fc64fdd284b01292
parent: 810955ba712fc5c517b5e999fd69bfd20251effb (diff)
drm/amdkfd: Use ordered workqueue to restore processes
Restoring multiple processes concurrently can lead to live-locks where each process prevents the other from validating all its BOs.

v2: fix duplicate check of same variable

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_module.c 6
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_priv.h 2
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_process.c 30
3 files changed, 32 insertions, 6 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index b0acb0603883..e0c07d24d251 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -133,7 +133,9 @@ static int __init kfd_module_init(void)
133 if (err < 0) 133 if (err < 0)
134 goto err_topology; 134 goto err_topology;
135 135
136 kfd_process_create_wq(); 136 err = kfd_process_create_wq();
137 if (err < 0)
138 goto err_create_wq;
137 139
138 kfd_debugfs_init(); 140 kfd_debugfs_init();
139 141
@@ -143,6 +145,8 @@ static int __init kfd_module_init(void)
143 145
144 return 0; 146 return 0;
145 147
148err_create_wq:
149 kfd_topology_shutdown();
146err_topology: 150err_topology:
147 kfd_chardev_exit(); 151 kfd_chardev_exit();
148err_ioctl: 152err_ioctl:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index db27f9f13696..96a9cc0f02c9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -674,7 +674,7 @@ struct amdkfd_ioctl_desc {
674 const char *name; 674 const char *name;
675}; 675};
676 676
677void kfd_process_create_wq(void); 677int kfd_process_create_wq(void);
678void kfd_process_destroy_wq(void); 678void kfd_process_destroy_wq(void);
679struct kfd_process *kfd_create_process(struct file *filep); 679struct kfd_process *kfd_create_process(struct file *filep);
680struct kfd_process *kfd_get_process(const struct task_struct *); 680struct kfd_process *kfd_get_process(const struct task_struct *);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 45ef2d03a975..1711ad0642f7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -48,8 +48,17 @@ static DEFINE_MUTEX(kfd_processes_mutex);
48 48
49DEFINE_SRCU(kfd_processes_srcu); 49DEFINE_SRCU(kfd_processes_srcu);
50 50
51/* For process termination handling */
51static struct workqueue_struct *kfd_process_wq; 52static struct workqueue_struct *kfd_process_wq;
52 53
54/* Ordered, single-threaded workqueue for restoring evicted
55 * processes. Restoring multiple processes concurrently under memory
56 * pressure can lead to processes blocking each other from validating
57 * their BOs and result in a live-lock situation where processes
58 * remain evicted indefinitely.
59 */
60static struct workqueue_struct *kfd_restore_wq;
61
53static struct kfd_process *find_process(const struct task_struct *thread); 62static struct kfd_process *find_process(const struct task_struct *thread);
54static void kfd_process_ref_release(struct kref *ref); 63static void kfd_process_ref_release(struct kref *ref);
55static struct kfd_process *create_process(const struct task_struct *thread, 64static struct kfd_process *create_process(const struct task_struct *thread,
@@ -59,10 +68,19 @@ static void evict_process_worker(struct work_struct *work);
59static void restore_process_worker(struct work_struct *work); 68static void restore_process_worker(struct work_struct *work);
60 69
61 70
62void kfd_process_create_wq(void) 71int kfd_process_create_wq(void)
63{ 72{
64 if (!kfd_process_wq) 73 if (!kfd_process_wq)
65 kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0); 74 kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0);
75 if (!kfd_restore_wq)
76 kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0);
77
78 if (!kfd_process_wq || !kfd_restore_wq) {
79 kfd_process_destroy_wq();
80 return -ENOMEM;
81 }
82
83 return 0;
66} 84}
67 85
68void kfd_process_destroy_wq(void) 86void kfd_process_destroy_wq(void)
@@ -71,6 +89,10 @@ void kfd_process_destroy_wq(void)
71 destroy_workqueue(kfd_process_wq); 89 destroy_workqueue(kfd_process_wq);
72 kfd_process_wq = NULL; 90 kfd_process_wq = NULL;
73 } 91 }
92 if (kfd_restore_wq) {
93 destroy_workqueue(kfd_restore_wq);
94 kfd_restore_wq = NULL;
95 }
74} 96}
75 97
76static void kfd_process_free_gpuvm(struct kgd_mem *mem, 98static void kfd_process_free_gpuvm(struct kgd_mem *mem,
@@ -869,7 +891,7 @@ static void evict_process_worker(struct work_struct *work)
869 dma_fence_signal(p->ef); 891 dma_fence_signal(p->ef);
870 dma_fence_put(p->ef); 892 dma_fence_put(p->ef);
871 p->ef = NULL; 893 p->ef = NULL;
872 schedule_delayed_work(&p->restore_work, 894 queue_delayed_work(kfd_restore_wq, &p->restore_work,
873 msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)); 895 msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
874 896
875 pr_debug("Finished evicting pasid %d\n", p->pasid); 897 pr_debug("Finished evicting pasid %d\n", p->pasid);
@@ -918,7 +940,7 @@ static void restore_process_worker(struct work_struct *work)
918 if (ret) { 940 if (ret) {
919 pr_debug("Failed to restore BOs of pasid %d, retry after %d ms\n", 941 pr_debug("Failed to restore BOs of pasid %d, retry after %d ms\n",
920 p->pasid, PROCESS_BACK_OFF_TIME_MS); 942 p->pasid, PROCESS_BACK_OFF_TIME_MS);
921 ret = schedule_delayed_work(&p->restore_work, 943 ret = queue_delayed_work(kfd_restore_wq, &p->restore_work,
922 msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS)); 944 msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
923 WARN(!ret, "reschedule restore work failed\n"); 945 WARN(!ret, "reschedule restore work failed\n");
924 return; 946 return;
@@ -957,7 +979,7 @@ int kfd_resume_all_processes(void)
957 int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu); 979 int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu);
958 980
959 hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { 981 hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
960 if (!schedule_delayed_work(&p->restore_work, 0)) { 982 if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) {
961 pr_err("Restore process %d failed during resume\n", 983 pr_err("Restore process %d failed during resume\n",
962 p->pasid); 984 p->pasid);
963 ret = -EFAULT; 985 ret = -EFAULT;