diff options
author | Felix Kuehling <Felix.Kuehling@amd.com> | 2018-03-23 15:30:36 -0400 |
---|---|---|
committer | Oded Gabbay <oded.gabbay@gmail.com> | 2018-03-23 15:30:36 -0400 |
commit | 1679ae8f8f4148766423066aeb3dbb0a985a373a (patch) | |
tree | 8d766de21c5a0a25618ec8d5fc64fdd284b01292 | |
parent | 810955ba712fc5c517b5e999fd69bfd20251effb (diff) |
drm/amdkfd: Use ordered workqueue to restore processes
Restoring multiple processes concurrently can lead to live-locks
where each process prevents the others from validating all of their BOs (buffer objects).
v2: fix duplicate check of same variable
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_module.c | 6 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_process.c | 30 |
3 files changed, 32 insertions, 6 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index b0acb0603883..e0c07d24d251 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c | |||
@@ -133,7 +133,9 @@ static int __init kfd_module_init(void) | |||
133 | if (err < 0) | 133 | if (err < 0) |
134 | goto err_topology; | 134 | goto err_topology; |
135 | 135 | ||
136 | kfd_process_create_wq(); | 136 | err = kfd_process_create_wq(); |
137 | if (err < 0) | ||
138 | goto err_create_wq; | ||
137 | 139 | ||
138 | kfd_debugfs_init(); | 140 | kfd_debugfs_init(); |
139 | 141 | ||
@@ -143,6 +145,8 @@ static int __init kfd_module_init(void) | |||
143 | 145 | ||
144 | return 0; | 146 | return 0; |
145 | 147 | ||
148 | err_create_wq: | ||
149 | kfd_topology_shutdown(); | ||
146 | err_topology: | 150 | err_topology: |
147 | kfd_chardev_exit(); | 151 | kfd_chardev_exit(); |
148 | err_ioctl: | 152 | err_ioctl: |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index db27f9f13696..96a9cc0f02c9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h | |||
@@ -674,7 +674,7 @@ struct amdkfd_ioctl_desc { | |||
674 | const char *name; | 674 | const char *name; |
675 | }; | 675 | }; |
676 | 676 | ||
677 | void kfd_process_create_wq(void); | 677 | int kfd_process_create_wq(void); |
678 | void kfd_process_destroy_wq(void); | 678 | void kfd_process_destroy_wq(void); |
679 | struct kfd_process *kfd_create_process(struct file *filep); | 679 | struct kfd_process *kfd_create_process(struct file *filep); |
680 | struct kfd_process *kfd_get_process(const struct task_struct *); | 680 | struct kfd_process *kfd_get_process(const struct task_struct *); |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 45ef2d03a975..1711ad0642f7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c | |||
@@ -48,8 +48,17 @@ static DEFINE_MUTEX(kfd_processes_mutex); | |||
48 | 48 | ||
49 | DEFINE_SRCU(kfd_processes_srcu); | 49 | DEFINE_SRCU(kfd_processes_srcu); |
50 | 50 | ||
51 | /* For process termination handling */ | ||
51 | static struct workqueue_struct *kfd_process_wq; | 52 | static struct workqueue_struct *kfd_process_wq; |
52 | 53 | ||
54 | /* Ordered, single-threaded workqueue for restoring evicted | ||
55 | * processes. Restoring multiple processes concurrently under memory | ||
56 | * pressure can lead to processes blocking each other from validating | ||
57 | * their BOs and result in a live-lock situation where processes | ||
58 | * remain evicted indefinitely. | ||
59 | */ | ||
60 | static struct workqueue_struct *kfd_restore_wq; | ||
61 | |||
53 | static struct kfd_process *find_process(const struct task_struct *thread); | 62 | static struct kfd_process *find_process(const struct task_struct *thread); |
54 | static void kfd_process_ref_release(struct kref *ref); | 63 | static void kfd_process_ref_release(struct kref *ref); |
55 | static struct kfd_process *create_process(const struct task_struct *thread, | 64 | static struct kfd_process *create_process(const struct task_struct *thread, |
@@ -59,10 +68,19 @@ static void evict_process_worker(struct work_struct *work); | |||
59 | static void restore_process_worker(struct work_struct *work); | 68 | static void restore_process_worker(struct work_struct *work); |
60 | 69 | ||
61 | 70 | ||
62 | void kfd_process_create_wq(void) | 71 | int kfd_process_create_wq(void) |
63 | { | 72 | { |
64 | if (!kfd_process_wq) | 73 | if (!kfd_process_wq) |
65 | kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0); | 74 | kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0); |
75 | if (!kfd_restore_wq) | ||
76 | kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0); | ||
77 | |||
78 | if (!kfd_process_wq || !kfd_restore_wq) { | ||
79 | kfd_process_destroy_wq(); | ||
80 | return -ENOMEM; | ||
81 | } | ||
82 | |||
83 | return 0; | ||
66 | } | 84 | } |
67 | 85 | ||
68 | void kfd_process_destroy_wq(void) | 86 | void kfd_process_destroy_wq(void) |
@@ -71,6 +89,10 @@ void kfd_process_destroy_wq(void) | |||
71 | destroy_workqueue(kfd_process_wq); | 89 | destroy_workqueue(kfd_process_wq); |
72 | kfd_process_wq = NULL; | 90 | kfd_process_wq = NULL; |
73 | } | 91 | } |
92 | if (kfd_restore_wq) { | ||
93 | destroy_workqueue(kfd_restore_wq); | ||
94 | kfd_restore_wq = NULL; | ||
95 | } | ||
74 | } | 96 | } |
75 | 97 | ||
76 | static void kfd_process_free_gpuvm(struct kgd_mem *mem, | 98 | static void kfd_process_free_gpuvm(struct kgd_mem *mem, |
@@ -869,7 +891,7 @@ static void evict_process_worker(struct work_struct *work) | |||
869 | dma_fence_signal(p->ef); | 891 | dma_fence_signal(p->ef); |
870 | dma_fence_put(p->ef); | 892 | dma_fence_put(p->ef); |
871 | p->ef = NULL; | 893 | p->ef = NULL; |
872 | schedule_delayed_work(&p->restore_work, | 894 | queue_delayed_work(kfd_restore_wq, &p->restore_work, |
873 | msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)); | 895 | msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)); |
874 | 896 | ||
875 | pr_debug("Finished evicting pasid %d\n", p->pasid); | 897 | pr_debug("Finished evicting pasid %d\n", p->pasid); |
@@ -918,7 +940,7 @@ static void restore_process_worker(struct work_struct *work) | |||
918 | if (ret) { | 940 | if (ret) { |
919 | pr_debug("Failed to restore BOs of pasid %d, retry after %d ms\n", | 941 | pr_debug("Failed to restore BOs of pasid %d, retry after %d ms\n", |
920 | p->pasid, PROCESS_BACK_OFF_TIME_MS); | 942 | p->pasid, PROCESS_BACK_OFF_TIME_MS); |
921 | ret = schedule_delayed_work(&p->restore_work, | 943 | ret = queue_delayed_work(kfd_restore_wq, &p->restore_work, |
922 | msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS)); | 944 | msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS)); |
923 | WARN(!ret, "reschedule restore work failed\n"); | 945 | WARN(!ret, "reschedule restore work failed\n"); |
924 | return; | 946 | return; |
@@ -957,7 +979,7 @@ int kfd_resume_all_processes(void) | |||
957 | int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu); | 979 | int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu); |
958 | 980 | ||
959 | hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { | 981 | hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { |
960 | if (!schedule_delayed_work(&p->restore_work, 0)) { | 982 | if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) { |
961 | pr_err("Restore process %d failed during resume\n", | 983 | pr_err("Restore process %d failed during resume\n", |
962 | p->pasid); | 984 | p->pasid); |
963 | ret = -EFAULT; | 985 | ret = -EFAULT; |