aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorshaoyunl <Shaoyun.Liu@amd.com>2018-07-11 22:32:50 -0400
committerOded Gabbay <oded.gabbay@gmail.com>2018-07-11 22:32:50 -0400
commit2640c3facbd6e21e63c95f19588cc24913a263cd (patch)
tree83f2a6b4c5dad57c915747af31466d6fdc759956
parentb97dfa27ef3ad3eddd2cb97a3b6a140d7037827a (diff)
drm/amdkfd: Handle VM faults in KFD
1. Pre-GFX9, the amdgpu ISR saves the VM-fault status and address per VMID. amdkfd needs to get the information from amdgpu through the new get_vm_fault_info interface. On GFX9 and later, all the required information is in the IH ring. 2. amdkfd unmaps all queues from the faulting process and creates a new run-list without the guilty process. 3. amdkfd notifies the runtime of the VM fault trap via EVENT_TYPE_MEMORY. Signed-off-by: shaoyun liu <shaoyun.liu@amd.com> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com> Acked-by: Christian König <christian.koenig@amd.com> Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c25
-rw-r--r--drivers/gpu/drm/amd/amdkfd/cik_int.h2
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c17
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_events.c37
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c18
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_priv.h4
-rw-r--r--include/uapi/linux/kfd_ioctl.h2
7 files changed, 98 insertions, 7 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 49df6c791cfc..cc33870e7edb 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -48,18 +48,19 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
48 return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || 48 return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE ||
49 ihre->source_id == CIK_INTSRC_SDMA_TRAP || 49 ihre->source_id == CIK_INTSRC_SDMA_TRAP ||
50 ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || 50 ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG ||
51 ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE; 51 ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE ||
52 ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
53 ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT;
52} 54}
53 55
54static void cik_event_interrupt_wq(struct kfd_dev *dev, 56static void cik_event_interrupt_wq(struct kfd_dev *dev,
55 const uint32_t *ih_ring_entry) 57 const uint32_t *ih_ring_entry)
56{ 58{
57 unsigned int pasid;
58 const struct cik_ih_ring_entry *ihre = 59 const struct cik_ih_ring_entry *ihre =
59 (const struct cik_ih_ring_entry *)ih_ring_entry; 60 (const struct cik_ih_ring_entry *)ih_ring_entry;
60 uint32_t context_id = ihre->data & 0xfffffff; 61 uint32_t context_id = ihre->data & 0xfffffff;
61 62 unsigned int vmid = (ihre->ring_id & 0x0000ff00) >> 8;
62 pasid = (ihre->ring_id & 0xffff0000) >> 16; 63 unsigned int pasid = (ihre->ring_id & 0xffff0000) >> 16;
63 64
64 if (pasid == 0) 65 if (pasid == 0)
65 return; 66 return;
@@ -72,6 +73,22 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
72 kfd_signal_event_interrupt(pasid, context_id & 0xff, 8); 73 kfd_signal_event_interrupt(pasid, context_id & 0xff, 8);
73 else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) 74 else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE)
74 kfd_signal_hw_exception_event(pasid); 75 kfd_signal_hw_exception_event(pasid);
76 else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
77 ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
78 struct kfd_vm_fault_info info;
79
80 kfd_process_vm_fault(dev->dqm, pasid);
81
82 memset(&info, 0, sizeof(info));
83 dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info);
84 if (!info.page_addr && !info.status)
85 return;
86
87 if (info.vmid == vmid)
88 kfd_signal_vm_fault_event(dev, pasid, &info);
89 else
90 kfd_signal_vm_fault_event(dev, pasid, NULL);
91 }
75} 92}
76 93
77const struct kfd_event_interrupt_class event_interrupt_class_cik = { 94const struct kfd_event_interrupt_class event_interrupt_class_cik = {
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h
index 109298b9d507..a2079a04a673 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_int.h
+++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h
@@ -37,6 +37,8 @@ struct cik_ih_ring_entry {
37#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 37#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6
38#define CIK_INTSRC_SDMA_TRAP 0xE0 38#define CIK_INTSRC_SDMA_TRAP 0xE0
39#define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF 39#define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF
40#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92
41#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93
40 42
41#endif 43#endif
42 44
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index f2f81d26db0c..44fc2038770e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1684,6 +1684,23 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
1684 kfree(dqm); 1684 kfree(dqm);
1685} 1685}
1686 1686
/*
 * kfd_process_vm_fault - evict the queues of the process that owns @pasid
 * on the device managed by @dqm.
 *
 * Looks up the process by PASID, fetches its per-device data for dqm->dev
 * and, when that data exists, calls dqm->ops.evict_process_queues() on
 * &pdd->qpd.
 *
 * Return: -EINVAL when no process is found for @pasid; otherwise the value
 * returned by evict_process_queues(), or 0 when the process has no
 * per-device data for this device.
 */
1687int kfd_process_vm_fault(struct device_queue_manager *dqm,
1688 unsigned int pasid)
1689{
1690 struct kfd_process_device *pdd;
1691 struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
1692 int ret = 0;
1693
1694 if (!p)
1695 return -EINVAL;
1696 pdd = kfd_get_process_device_data(dqm->dev, p);
/* A missing pdd is not treated as an error: ret keeps its initial 0. */
1697 if (pdd)
1698 ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
/* Presumably drops the reference taken by the PASID lookup above; confirm. */
1699 kfd_unref_process(p);
1700
1701 return ret;
1702}
1703
1687#if defined(CONFIG_DEBUG_FS) 1704#if defined(CONFIG_DEBUG_FS)
1688 1705
1689static void seq_reg_dump(struct seq_file *m, 1706static void seq_reg_dump(struct seq_file *m,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 3d5a8332e8c0..b58a0e665ebc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -963,3 +963,40 @@ void kfd_signal_hw_exception_event(unsigned int pasid)
963 mutex_unlock(&p->event_mutex); 963 mutex_unlock(&p->event_mutex);
964 kfd_unref_process(p); 964 kfd_unref_process(p);
965} 965}
966
/*
 * kfd_signal_vm_fault_event - report a VM fault to the process owning @pasid.
 *
 * Builds a kfd_hsa_memory_exception_data record (gpu_id taken from @dev,
 * fault details taken from @info) and, while holding p->event_mutex, copies
 * it into every event of type KFD_EVENT_TYPE_MEMORY visited in p->event_idr
 * starting at KFD_FIRST_NONSIGNAL_EVENT_ID, calling set_event() on each.
 *
 * @info may be NULL. In that case the record carries no address or
 * protection details and failure.imprecise is left at 1.
 *
 * Returns nothing; silently does nothing when no process matches @pasid.
 */
967void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
968 struct kfd_vm_fault_info *info)
969{
970 struct kfd_event *ev;
971 uint32_t id;
972 struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
973 struct kfd_hsa_memory_exception_data memory_exception_data;
974
975 if (!p)
976 return; /* Presumably process exited. */
977 memset(&memory_exception_data, 0, sizeof(memory_exception_data));
978 memory_exception_data.gpu_id = dev->id;
/* Default to "imprecise"; cleared below only when fault details exist. */
979 memory_exception_data.failure.imprecise = 1;
980 /* Set failure reason */
981 if (info) {
/* page_addr is shifted by PAGE_SHIFT to form the reported address. */
982 memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
/*
 * NOTE(review): the flags are derived from prot_valid, prot_exec and
 * prot_write; info->prot_read is never consulted in this function.
 * Confirm that callers populate prot_exec and that this mapping is
 * the intended one.
 */
983 memory_exception_data.failure.NotPresent =
984 info->prot_valid ? 1 : 0;
985 memory_exception_data.failure.NoExecute =
986 info->prot_exec ? 1 : 0;
987 memory_exception_data.failure.ReadOnly =
988 info->prot_write ? 1 : 0;
989 memory_exception_data.failure.imprecise = 0;
990 }
991 mutex_lock(&p->event_mutex);
992
/*
 * Iteration begins at KFD_FIRST_NONSIGNAL_EVENT_ID; presumably this
 * skips the ID range reserved for signal events. TODO confirm.
 */
993 id = KFD_FIRST_NONSIGNAL_EVENT_ID;
994 idr_for_each_entry_continue(&p->event_idr, ev, id)
995 if (ev->type == KFD_EVENT_TYPE_MEMORY) {
996 ev->memory_exception_data = memory_exception_data;
997 set_event(ev);
998 }
999
1000 mutex_unlock(&p->event_mutex);
1001 kfd_unref_process(p);
1002}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 37029baa3346..d6b64e692760 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -57,7 +57,9 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
57 return source_id == SOC15_INTSRC_CP_END_OF_PIPE || 57 return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
58 source_id == SOC15_INTSRC_SDMA_TRAP || 58 source_id == SOC15_INTSRC_SDMA_TRAP ||
59 source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || 59 source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
60 source_id == SOC15_INTSRC_CP_BAD_OPCODE; 60 source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
61 client_id == SOC15_IH_CLIENTID_VMC ||
62 client_id == SOC15_IH_CLIENTID_UTCL2;
61} 63}
62 64
63static void event_interrupt_wq_v9(struct kfd_dev *dev, 65static void event_interrupt_wq_v9(struct kfd_dev *dev,
@@ -82,7 +84,19 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
82 kfd_signal_hw_exception_event(pasid); 84 kfd_signal_hw_exception_event(pasid);
83 else if (client_id == SOC15_IH_CLIENTID_VMC || 85 else if (client_id == SOC15_IH_CLIENTID_VMC ||
84 client_id == SOC15_IH_CLIENTID_UTCL2) { 86 client_id == SOC15_IH_CLIENTID_UTCL2) {
85 /* TODO */ 87 struct kfd_vm_fault_info info = {0};
88 uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
89
90 info.vmid = vmid;
91 info.mc_id = client_id;
92 info.page_addr = ih_ring_entry[4] |
93 (uint64_t)(ih_ring_entry[5] & 0xf) << 32;
94 info.prot_valid = ring_id & 0x08;
95 info.prot_read = ring_id & 0x10;
96 info.prot_write = ring_id & 0x20;
97
98 kfd_process_vm_fault(dev->dqm, pasid);
99 kfd_signal_vm_fault_event(dev, pasid, &info);
86 } 100 }
87} 101}
88 102
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 5e3990bb4c4b..91a3368421b1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -838,6 +838,7 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm);
838struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, 838struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
839 enum kfd_queue_type type); 839 enum kfd_queue_type type);
840void kernel_queue_uninit(struct kernel_queue *kq); 840void kernel_queue_uninit(struct kernel_queue *kq);
841int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
841 842
842/* Process Queue Manager */ 843/* Process Queue Manager */
843struct process_queue_node { 844struct process_queue_node {
@@ -964,6 +965,9 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
964 uint64_t *event_page_offset, uint32_t *event_slot_index); 965 uint64_t *event_page_offset, uint32_t *event_slot_index);
965int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); 966int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
966 967
968void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
969 struct kfd_vm_fault_info *info);
970
967void kfd_flush_tlb(struct kfd_process_device *pdd); 971void kfd_flush_tlb(struct kfd_process_device *pdd);
968 972
969int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); 973int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index b4f5073dbac2..46a54ab1e728 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -219,7 +219,7 @@ struct kfd_memory_exception_failure {
219 __u32 NotPresent; /* Page not present or supervisor privilege */ 219 __u32 NotPresent; /* Page not present or supervisor privilege */
220 __u32 ReadOnly; /* Write access to a read-only page */ 220 __u32 ReadOnly; /* Write access to a read-only page */
221 __u32 NoExecute; /* Execute access to a page marked NX */ 221 __u32 NoExecute; /* Execute access to a page marked NX */
222 __u32 pad; 222 __u32 imprecise; /* Can't determine the exact fault address */
223}; 223};
224 224
225/* memory exception data*/ 225/* memory exception data*/