diff options
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 4 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 2 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_device.c | 11 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_events.c | 18 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 | ||||
-rw-r--r-- | include/uapi/linux/kfd_ioctl.h | 12 |
9 files changed, 51 insertions, 2 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index fe1d7368c1e6..acf8ae0cee9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -640,4 +640,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) | |||
640 | void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) | 640 | void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) |
641 | { | 641 | { |
642 | } | 642 | } |
643 | |||
644 | void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd) | ||
645 | { | ||
646 | } | ||
643 | #endif | 647 | #endif |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 0e1711a75b68..e6a503760b62 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -229,5 +229,6 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm); | |||
229 | int kgd2kfd_resume_mm(struct mm_struct *mm); | 229 | int kgd2kfd_resume_mm(struct mm_struct *mm); |
230 | int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, | 230 | int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, |
231 | struct dma_fence *fence); | 231 | struct dma_fence *fence); |
232 | void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd); | ||
232 | 233 | ||
233 | #endif /* AMDGPU_AMDKFD_H_INCLUDED */ | 234 | #endif /* AMDGPU_AMDKFD_H_INCLUDED */ |
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 88c45f990f05..6bb71f6ee18e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4805,6 +4805,7 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev, | |||
4805 | struct amdgpu_iv_entry *entry) | 4805 | struct amdgpu_iv_entry *entry) |
4806 | { | 4806 | { |
4807 | /* TODO ue will trigger an interrupt. */ | 4807 | /* TODO ue will trigger an interrupt. */ |
4808 | kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); | ||
4808 | amdgpu_ras_reset_gpu(adev, 0); | 4809 | amdgpu_ras_reset_gpu(adev, 0); |
4809 | return AMDGPU_RAS_UE; | 4810 | return AMDGPU_RAS_UE; |
4810 | } | 4811 | } |
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 2daa5ea1c2ea..0252345a1f08 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -354,6 +354,7 @@ static int gmc_v9_0_ecc_interrupt_state(struct amdgpu_device *adev, | |||
354 | static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev, | 354 | static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev, |
355 | struct amdgpu_iv_entry *entry) | 355 | struct amdgpu_iv_entry *entry) |
356 | { | 356 | { |
357 | kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); | ||
357 | amdgpu_ras_reset_gpu(adev, 0); | 358 | amdgpu_ras_reset_gpu(adev, 0); |
358 | return AMDGPU_RAS_UE; | 359 | return AMDGPU_RAS_UE; |
359 | } | 360 | } |
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 058b9daec514..f7a6fafd70ae 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1851,6 +1851,8 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev, | |||
1851 | return 0; | 1851 | return 0; |
1852 | } | 1852 | } |
1853 | 1853 | ||
1854 | kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); | ||
1855 | |||
1854 | amdgpu_ras_reset_gpu(adev, 0); | 1856 | amdgpu_ras_reset_gpu(adev, 0); |
1855 | 1857 | ||
1856 | return AMDGPU_RAS_UE; | 1858 | return AMDGPU_RAS_UE; |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 8be9677c0c07..b3cdbf79f47b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -466,6 +466,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, | |||
466 | memset(&kfd->doorbell_available_index, 0, | 466 | memset(&kfd->doorbell_available_index, 0, |
467 | sizeof(kfd->doorbell_available_index)); | 467 | sizeof(kfd->doorbell_available_index)); |
468 | 468 | ||
469 | atomic_set(&kfd->sram_ecc_flag, 0); | ||
470 | |||
469 | return kfd; | 471 | return kfd; |
470 | } | 472 | } |
471 | 473 | ||
@@ -661,6 +663,9 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) | |||
661 | return ret; | 663 | return ret; |
662 | count = atomic_dec_return(&kfd_locked); | 664 | count = atomic_dec_return(&kfd_locked); |
663 | WARN_ONCE(count != 0, "KFD reset ref. error"); | 665 | WARN_ONCE(count != 0, "KFD reset ref. error"); |
666 | |||
667 | atomic_set(&kfd->sram_ecc_flag, 0); | ||
668 | |||
664 | return 0; | 669 | return 0; |
665 | } | 670 | } |
666 | 671 | ||
@@ -1024,6 +1029,12 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj) | |||
1024 | return 0; | 1029 | return 0; |
1025 | } | 1030 | } |
1026 | 1031 | ||
1032 | void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd) | ||
1033 | { | ||
1034 | if (kfd) | ||
1035 | atomic_inc(&kfd->sram_ecc_flag); | ||
1036 | } | ||
1037 | |||
1027 | #if defined(CONFIG_DEBUG_FS) | 1038 | #if defined(CONFIG_DEBUG_FS) |
1028 | 1039 | ||
1029 | /* This function will send a package to HIQ to hang the HWS | 1040 | /* This function will send a package to HIQ to hang the HWS |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index e9f0e0a1b41c..6e1d41c5bf86 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1011,25 +1011,41 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, | |||
1011 | void kfd_signal_reset_event(struct kfd_dev *dev) | 1011 | void kfd_signal_reset_event(struct kfd_dev *dev) |
1012 | { | 1012 | { |
1013 | struct kfd_hsa_hw_exception_data hw_exception_data; | 1013 | struct kfd_hsa_hw_exception_data hw_exception_data; |
1014 | struct kfd_hsa_memory_exception_data memory_exception_data; | ||
1014 | struct kfd_process *p; | 1015 | struct kfd_process *p; |
1015 | struct kfd_event *ev; | 1016 | struct kfd_event *ev; |
1016 | unsigned int temp; | 1017 | unsigned int temp; |
1017 | uint32_t id, idx; | 1018 | uint32_t id, idx; |
1019 | int reset_cause = atomic_read(&dev->sram_ecc_flag) ? | ||
1020 | KFD_HW_EXCEPTION_ECC : | ||
1021 | KFD_HW_EXCEPTION_GPU_HANG; | ||
1018 | 1022 | ||
1019 | /* Whole gpu reset caused by GPU hang and memory is lost */ | 1023 | /* Whole gpu reset caused by GPU hang and memory is lost */ |
1020 | memset(&hw_exception_data, 0, sizeof(hw_exception_data)); | 1024 | memset(&hw_exception_data, 0, sizeof(hw_exception_data)); |
1021 | hw_exception_data.gpu_id = dev->id; | 1025 | hw_exception_data.gpu_id = dev->id; |
1022 | hw_exception_data.memory_lost = 1; | 1026 | hw_exception_data.memory_lost = 1; |
1027 | hw_exception_data.reset_cause = reset_cause; | ||
1028 | |||
1029 | memset(&memory_exception_data, 0, sizeof(memory_exception_data)); | ||
1030 | memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC; | ||
1031 | memory_exception_data.gpu_id = dev->id; | ||
1032 | memory_exception_data.failure.imprecise = true; | ||
1023 | 1033 | ||
1024 | idx = srcu_read_lock(&kfd_processes_srcu); | 1034 | idx = srcu_read_lock(&kfd_processes_srcu); |
1025 | hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { | 1035 | hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { |
1026 | mutex_lock(&p->event_mutex); | 1036 | mutex_lock(&p->event_mutex); |
1027 | id = KFD_FIRST_NONSIGNAL_EVENT_ID; | 1037 | id = KFD_FIRST_NONSIGNAL_EVENT_ID; |
1028 | idr_for_each_entry_continue(&p->event_idr, ev, id) | 1038 | idr_for_each_entry_continue(&p->event_idr, ev, id) { |
1029 | if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) { | 1039 | if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) { |
1030 | ev->hw_exception_data = hw_exception_data; | 1040 | ev->hw_exception_data = hw_exception_data; |
1031 | set_event(ev); | 1041 | set_event(ev); |
1032 | } | 1042 | } |
1043 | if (ev->type == KFD_EVENT_TYPE_MEMORY && | ||
1044 | reset_cause == KFD_HW_EXCEPTION_ECC) { | ||
1045 | ev->memory_exception_data = memory_exception_data; | ||
1046 | set_event(ev); | ||
1047 | } | ||
1048 | } | ||
1033 | mutex_unlock(&p->event_mutex); | 1049 | mutex_unlock(&p->event_mutex); |
1034 | } | 1050 | } |
1035 | srcu_read_unlock(&kfd_processes_srcu, idx); | 1051 | srcu_read_unlock(&kfd_processes_srcu, idx); |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 0eeee3c6d6dc..9e0230965675 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -276,6 +276,9 @@ struct kfd_dev { | |||
276 | uint64_t hive_id; | 276 | uint64_t hive_id; |
277 | 277 | ||
278 | bool pci_atomic_requested; | 278 | bool pci_atomic_requested; |
279 | |||
280 | /* SRAM ECC flag */ | ||
281 | atomic_t sram_ecc_flag; | ||
279 | }; | 282 | }; |
280 | 283 | ||
281 | enum kfd_mempool { | 284 | enum kfd_mempool { |
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index e622fd1fbd46..dc067ed0b72d 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -211,6 +211,11 @@ struct kfd_ioctl_dbg_wave_control_args { | |||
211 | #define KFD_HW_EXCEPTION_GPU_HANG 0 | 211 | #define KFD_HW_EXCEPTION_GPU_HANG 0 |
212 | #define KFD_HW_EXCEPTION_ECC 1 | 212 | #define KFD_HW_EXCEPTION_ECC 1 |
213 | 213 | ||
214 | /* For kfd_hsa_memory_exception_data.ErrorType */ | ||
215 | #define KFD_MEM_ERR_NO_RAS 0 | ||
216 | #define KFD_MEM_ERR_SRAM_ECC 1 | ||
217 | #define KFD_MEM_ERR_POISON_CONSUMED 2 | ||
218 | #define KFD_MEM_ERR_GPU_HANG 3 | ||
214 | 219 | ||
215 | struct kfd_ioctl_create_event_args { | 220 | struct kfd_ioctl_create_event_args { |
216 | __u64 event_page_offset; /* from KFD */ | 221 | __u64 event_page_offset; /* from KFD */ |
@@ -250,7 +255,12 @@ struct kfd_hsa_memory_exception_data { | |||
250 | struct kfd_memory_exception_failure failure; | 255 | struct kfd_memory_exception_failure failure; |
251 | __u64 va; | 256 | __u64 va; |
252 | __u32 gpu_id; | 257 | __u32 gpu_id; |
253 | __u32 pad; | 258 | __u32 ErrorType; /* 0 = no RAS error, |
259 | * 1 = ECC_SRAM, | ||
260 | * 2 = Link_SYNFLOOD (poison), | ||
261 | * 3 = GPU hang (not attributable to a specific cause), | ||
262 | * other values reserved | ||
263 | */ | ||
254 | }; | 264 | }; |
255 | 265 | ||
256 | /* hw exception data */ | 266 | /* hw exception data */ |