diff options
author | shaoyunl <Shaoyun.Liu@amd.com> | 2018-07-11 22:32:50 -0400 |
---|---|---|
committer | Oded Gabbay <oded.gabbay@gmail.com> | 2018-07-11 22:32:50 -0400 |
commit | 2640c3facbd6e21e63c95f19588cc24913a263cd (patch) | |
tree | 83f2a6b4c5dad57c915747af31466d6fdc759956 | |
parent | b97dfa27ef3ad3eddd2cb97a3b6a140d7037827a (diff) |
drm/amdkfd: Handle VM faults in KFD
1. Pre-GFX9, the amdgpu ISR saves the vm-fault status and address
per-vmid. amdkfd needs to get the information from amdgpu through the
new get_vm_fault_info interface. On GFX9 and later, all the required
information is in the IH ring.
2. amdkfd unmaps all queues from the faulting process and creates a new
run-list without the guilty process
3. amdkfd notifies the runtime of the vm fault trap via EVENT_TYPE_MEMORY
Signed-off-by: shaoyun liu <shaoyun.liu@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 25 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/cik_int.h | 2 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 17 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_events.c | 37 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 18 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 | ||||
-rw-r--r-- | include/uapi/linux/kfd_ioctl.h | 2 |
7 files changed, 98 insertions, 7 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 49df6c791cfc..cc33870e7edb 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | |||
@@ -48,18 +48,19 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev, | |||
48 | return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || | 48 | return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || |
49 | ihre->source_id == CIK_INTSRC_SDMA_TRAP || | 49 | ihre->source_id == CIK_INTSRC_SDMA_TRAP || |
50 | ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || | 50 | ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || |
51 | ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE; | 51 | ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE || |
52 | ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || | ||
53 | ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT; | ||
52 | } | 54 | } |
53 | 55 | ||
54 | static void cik_event_interrupt_wq(struct kfd_dev *dev, | 56 | static void cik_event_interrupt_wq(struct kfd_dev *dev, |
55 | const uint32_t *ih_ring_entry) | 57 | const uint32_t *ih_ring_entry) |
56 | { | 58 | { |
57 | unsigned int pasid; | ||
58 | const struct cik_ih_ring_entry *ihre = | 59 | const struct cik_ih_ring_entry *ihre = |
59 | (const struct cik_ih_ring_entry *)ih_ring_entry; | 60 | (const struct cik_ih_ring_entry *)ih_ring_entry; |
60 | uint32_t context_id = ihre->data & 0xfffffff; | 61 | uint32_t context_id = ihre->data & 0xfffffff; |
61 | 62 | unsigned int vmid = (ihre->ring_id & 0x0000ff00) >> 8; | |
62 | pasid = (ihre->ring_id & 0xffff0000) >> 16; | 63 | unsigned int pasid = (ihre->ring_id & 0xffff0000) >> 16; |
63 | 64 | ||
64 | if (pasid == 0) | 65 | if (pasid == 0) |
65 | return; | 66 | return; |
@@ -72,6 +73,22 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, | |||
72 | kfd_signal_event_interrupt(pasid, context_id & 0xff, 8); | 73 | kfd_signal_event_interrupt(pasid, context_id & 0xff, 8); |
73 | else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) | 74 | else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) |
74 | kfd_signal_hw_exception_event(pasid); | 75 | kfd_signal_hw_exception_event(pasid); |
76 | else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || | ||
77 | ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { | ||
78 | struct kfd_vm_fault_info info; | ||
79 | |||
80 | kfd_process_vm_fault(dev->dqm, pasid); | ||
81 | |||
82 | memset(&info, 0, sizeof(info)); | ||
83 | dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info); | ||
84 | if (!info.page_addr && !info.status) | ||
85 | return; | ||
86 | |||
87 | if (info.vmid == vmid) | ||
88 | kfd_signal_vm_fault_event(dev, pasid, &info); | ||
89 | else | ||
90 | kfd_signal_vm_fault_event(dev, pasid, NULL); | ||
91 | } | ||
75 | } | 92 | } |
76 | 93 | ||
77 | const struct kfd_event_interrupt_class event_interrupt_class_cik = { | 94 | const struct kfd_event_interrupt_class event_interrupt_class_cik = { |
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h index 109298b9d507..a2079a04a673 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_int.h +++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h | |||
@@ -37,6 +37,8 @@ struct cik_ih_ring_entry { | |||
37 | #define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 | 37 | #define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 |
38 | #define CIK_INTSRC_SDMA_TRAP 0xE0 | 38 | #define CIK_INTSRC_SDMA_TRAP 0xE0 |
39 | #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF | 39 | #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF |
40 | #define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92 | ||
41 | #define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93 | ||
40 | 42 | ||
41 | #endif | 43 | #endif |
42 | 44 | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index f2f81d26db0c..44fc2038770e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | |||
@@ -1684,6 +1684,23 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) | |||
1684 | kfree(dqm); | 1684 | kfree(dqm); |
1685 | } | 1685 | } |
1686 | 1686 | ||
1687 | int kfd_process_vm_fault(struct device_queue_manager *dqm, | ||
1688 | unsigned int pasid) | ||
1689 | { | ||
1690 | struct kfd_process_device *pdd; | ||
1691 | struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); | ||
1692 | int ret = 0; | ||
1693 | |||
1694 | if (!p) | ||
1695 | return -EINVAL; | ||
1696 | pdd = kfd_get_process_device_data(dqm->dev, p); | ||
1697 | if (pdd) | ||
1698 | ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); | ||
1699 | kfd_unref_process(p); | ||
1700 | |||
1701 | return ret; | ||
1702 | } | ||
1703 | |||
1687 | #if defined(CONFIG_DEBUG_FS) | 1704 | #if defined(CONFIG_DEBUG_FS) |
1688 | 1705 | ||
1689 | static void seq_reg_dump(struct seq_file *m, | 1706 | static void seq_reg_dump(struct seq_file *m, |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 3d5a8332e8c0..b58a0e665ebc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c | |||
@@ -963,3 +963,40 @@ void kfd_signal_hw_exception_event(unsigned int pasid) | |||
963 | mutex_unlock(&p->event_mutex); | 963 | mutex_unlock(&p->event_mutex); |
964 | kfd_unref_process(p); | 964 | kfd_unref_process(p); |
965 | } | 965 | } |
966 | |||
967 | void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, | ||
968 | struct kfd_vm_fault_info *info) | ||
969 | { | ||
970 | struct kfd_event *ev; | ||
971 | uint32_t id; | ||
972 | struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); | ||
973 | struct kfd_hsa_memory_exception_data memory_exception_data; | ||
974 | |||
975 | if (!p) | ||
976 | return; /* Presumably process exited. */ | ||
977 | memset(&memory_exception_data, 0, sizeof(memory_exception_data)); | ||
978 | memory_exception_data.gpu_id = dev->id; | ||
979 | memory_exception_data.failure.imprecise = 1; | ||
980 | /* Set failure reason */ | ||
981 | if (info) { | ||
982 | memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; | ||
983 | memory_exception_data.failure.NotPresent = | ||
984 | info->prot_valid ? 1 : 0; | ||
985 | memory_exception_data.failure.NoExecute = | ||
986 | info->prot_exec ? 1 : 0; | ||
987 | memory_exception_data.failure.ReadOnly = | ||
988 | info->prot_write ? 1 : 0; | ||
989 | memory_exception_data.failure.imprecise = 0; | ||
990 | } | ||
991 | mutex_lock(&p->event_mutex); | ||
992 | |||
993 | id = KFD_FIRST_NONSIGNAL_EVENT_ID; | ||
994 | idr_for_each_entry_continue(&p->event_idr, ev, id) | ||
995 | if (ev->type == KFD_EVENT_TYPE_MEMORY) { | ||
996 | ev->memory_exception_data = memory_exception_data; | ||
997 | set_event(ev); | ||
998 | } | ||
999 | |||
1000 | mutex_unlock(&p->event_mutex); | ||
1001 | kfd_unref_process(p); | ||
1002 | } | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 37029baa3346..d6b64e692760 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | |||
@@ -57,7 +57,9 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev, | |||
57 | return source_id == SOC15_INTSRC_CP_END_OF_PIPE || | 57 | return source_id == SOC15_INTSRC_CP_END_OF_PIPE || |
58 | source_id == SOC15_INTSRC_SDMA_TRAP || | 58 | source_id == SOC15_INTSRC_SDMA_TRAP || |
59 | source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || | 59 | source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || |
60 | source_id == SOC15_INTSRC_CP_BAD_OPCODE; | 60 | source_id == SOC15_INTSRC_CP_BAD_OPCODE || |
61 | client_id == SOC15_IH_CLIENTID_VMC || | ||
62 | client_id == SOC15_IH_CLIENTID_UTCL2; | ||
61 | } | 63 | } |
62 | 64 | ||
63 | static void event_interrupt_wq_v9(struct kfd_dev *dev, | 65 | static void event_interrupt_wq_v9(struct kfd_dev *dev, |
@@ -82,7 +84,19 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, | |||
82 | kfd_signal_hw_exception_event(pasid); | 84 | kfd_signal_hw_exception_event(pasid); |
83 | else if (client_id == SOC15_IH_CLIENTID_VMC || | 85 | else if (client_id == SOC15_IH_CLIENTID_VMC || |
84 | client_id == SOC15_IH_CLIENTID_UTCL2) { | 86 | client_id == SOC15_IH_CLIENTID_UTCL2) { |
85 | /* TODO */ | 87 | struct kfd_vm_fault_info info = {0}; |
88 | uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); | ||
89 | |||
90 | info.vmid = vmid; | ||
91 | info.mc_id = client_id; | ||
92 | info.page_addr = ih_ring_entry[4] | | ||
93 | (uint64_t)(ih_ring_entry[5] & 0xf) << 32; | ||
94 | info.prot_valid = ring_id & 0x08; | ||
95 | info.prot_read = ring_id & 0x10; | ||
96 | info.prot_write = ring_id & 0x20; | ||
97 | |||
98 | kfd_process_vm_fault(dev->dqm, pasid); | ||
99 | kfd_signal_vm_fault_event(dev, pasid, &info); | ||
86 | } | 100 | } |
87 | } | 101 | } |
88 | 102 | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 5e3990bb4c4b..91a3368421b1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h | |||
@@ -838,6 +838,7 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm); | |||
838 | struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, | 838 | struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, |
839 | enum kfd_queue_type type); | 839 | enum kfd_queue_type type); |
840 | void kernel_queue_uninit(struct kernel_queue *kq); | 840 | void kernel_queue_uninit(struct kernel_queue *kq); |
841 | int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid); | ||
841 | 842 | ||
842 | /* Process Queue Manager */ | 843 | /* Process Queue Manager */ |
843 | struct process_queue_node { | 844 | struct process_queue_node { |
@@ -964,6 +965,9 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, | |||
964 | uint64_t *event_page_offset, uint32_t *event_slot_index); | 965 | uint64_t *event_page_offset, uint32_t *event_slot_index); |
965 | int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); | 966 | int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); |
966 | 967 | ||
968 | void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, | ||
969 | struct kfd_vm_fault_info *info); | ||
970 | |||
967 | void kfd_flush_tlb(struct kfd_process_device *pdd); | 971 | void kfd_flush_tlb(struct kfd_process_device *pdd); |
968 | 972 | ||
969 | int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); | 973 | int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); |
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index b4f5073dbac2..46a54ab1e728 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h | |||
@@ -219,7 +219,7 @@ struct kfd_memory_exception_failure { | |||
219 | __u32 NotPresent; /* Page not present or supervisor privilege */ | 219 | __u32 NotPresent; /* Page not present or supervisor privilege */ |
220 | __u32 ReadOnly; /* Write access to a read-only page */ | 220 | __u32 ReadOnly; /* Write access to a read-only page */ |
221 | __u32 NoExecute; /* Execute access to a page marked NX */ | 221 | __u32 NoExecute; /* Execute access to a page marked NX */ |
222 | __u32 pad; | 222 | __u32 imprecise; /* Can't determine the exact fault address */ |
223 | }; | 223 | }; |
224 | 224 | ||
225 | /* memory exception data*/ | 225 | /* memory exception data*/ |