about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 4
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 1
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 1
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 2
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 11
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 18
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3
-rw-r--r-- include/uapi/linux/kfd_ioctl.h | 12
9 files changed, 51 insertions, 2 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index fe1d7368c1e6..acf8ae0cee9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -640,4 +640,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
640void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) 640void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
641{ 641{
642} 642}
643
644void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
645{
646}
643#endif 647#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 0e1711a75b68..e6a503760b62 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -229,5 +229,6 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm);
229int kgd2kfd_resume_mm(struct mm_struct *mm); 229int kgd2kfd_resume_mm(struct mm_struct *mm);
230int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, 230int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
231 struct dma_fence *fence); 231 struct dma_fence *fence);
232void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
232 233
233#endif /* AMDGPU_AMDKFD_H_INCLUDED */ 234#endif /* AMDGPU_AMDKFD_H_INCLUDED */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 88c45f990f05..6bb71f6ee18e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4805,6 +4805,7 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
4805 struct amdgpu_iv_entry *entry) 4805 struct amdgpu_iv_entry *entry)
4806{ 4806{
4807 /* TODO ue will trigger an interrupt. */ 4807 /* TODO ue will trigger an interrupt. */
4808 kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
4808 amdgpu_ras_reset_gpu(adev, 0); 4809 amdgpu_ras_reset_gpu(adev, 0);
4809 return AMDGPU_RAS_UE; 4810 return AMDGPU_RAS_UE;
4810} 4811}
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 2daa5ea1c2ea..0252345a1f08 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -354,6 +354,7 @@ static int gmc_v9_0_ecc_interrupt_state(struct amdgpu_device *adev,
354static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev, 354static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
355 struct amdgpu_iv_entry *entry) 355 struct amdgpu_iv_entry *entry)
356{ 356{
357 kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
357 amdgpu_ras_reset_gpu(adev, 0); 358 amdgpu_ras_reset_gpu(adev, 0);
358 return AMDGPU_RAS_UE; 359 return AMDGPU_RAS_UE;
359} 360}
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 058b9daec514..f7a6fafd70ae 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1851,6 +1851,8 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
1851 return 0; 1851 return 0;
1852 } 1852 }
1853 1853
1854 kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
1855
1854 amdgpu_ras_reset_gpu(adev, 0); 1856 amdgpu_ras_reset_gpu(adev, 0);
1855 1857
1856 return AMDGPU_RAS_UE; 1858 return AMDGPU_RAS_UE;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 8be9677c0c07..b3cdbf79f47b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -466,6 +466,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
466 memset(&kfd->doorbell_available_index, 0, 466 memset(&kfd->doorbell_available_index, 0,
467 sizeof(kfd->doorbell_available_index)); 467 sizeof(kfd->doorbell_available_index));
468 468
469 atomic_set(&kfd->sram_ecc_flag, 0);
470
469 return kfd; 471 return kfd;
470} 472}
471 473
@@ -661,6 +663,9 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
661 return ret; 663 return ret;
662 count = atomic_dec_return(&kfd_locked); 664 count = atomic_dec_return(&kfd_locked);
663 WARN_ONCE(count != 0, "KFD reset ref. error"); 665 WARN_ONCE(count != 0, "KFD reset ref. error");
666
667 atomic_set(&kfd->sram_ecc_flag, 0);
668
664 return 0; 669 return 0;
665} 670}
666 671
@@ -1024,6 +1029,12 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj)
1024 return 0; 1029 return 0;
1025} 1030}
1026 1031
1032void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
1033{
1034 if (kfd)
1035 atomic_inc(&kfd->sram_ecc_flag);
1036}
1037
1027#if defined(CONFIG_DEBUG_FS) 1038#if defined(CONFIG_DEBUG_FS)
1028 1039
1029/* This function will send a package to HIQ to hang the HWS 1040/* This function will send a package to HIQ to hang the HWS
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index e9f0e0a1b41c..6e1d41c5bf86 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1011,25 +1011,41 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
1011void kfd_signal_reset_event(struct kfd_dev *dev) 1011void kfd_signal_reset_event(struct kfd_dev *dev)
1012{ 1012{
1013 struct kfd_hsa_hw_exception_data hw_exception_data; 1013 struct kfd_hsa_hw_exception_data hw_exception_data;
1014 struct kfd_hsa_memory_exception_data memory_exception_data;
1014 struct kfd_process *p; 1015 struct kfd_process *p;
1015 struct kfd_event *ev; 1016 struct kfd_event *ev;
1016 unsigned int temp; 1017 unsigned int temp;
1017 uint32_t id, idx; 1018 uint32_t id, idx;
1019 int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
1020 KFD_HW_EXCEPTION_ECC :
1021 KFD_HW_EXCEPTION_GPU_HANG;
1018 1022
1019 /* Whole gpu reset caused by GPU hang and memory is lost */ 1023 /* Whole gpu reset caused by GPU hang and memory is lost */
1020 memset(&hw_exception_data, 0, sizeof(hw_exception_data)); 1024 memset(&hw_exception_data, 0, sizeof(hw_exception_data));
1021 hw_exception_data.gpu_id = dev->id; 1025 hw_exception_data.gpu_id = dev->id;
1022 hw_exception_data.memory_lost = 1; 1026 hw_exception_data.memory_lost = 1;
1027 hw_exception_data.reset_cause = reset_cause;
1028
1029 memset(&memory_exception_data, 0, sizeof(memory_exception_data));
1030 memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
1031 memory_exception_data.gpu_id = dev->id;
1032 memory_exception_data.failure.imprecise = true;
1023 1033
1024 idx = srcu_read_lock(&kfd_processes_srcu); 1034 idx = srcu_read_lock(&kfd_processes_srcu);
1025 hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { 1035 hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
1026 mutex_lock(&p->event_mutex); 1036 mutex_lock(&p->event_mutex);
1027 id = KFD_FIRST_NONSIGNAL_EVENT_ID; 1037 id = KFD_FIRST_NONSIGNAL_EVENT_ID;
1028 idr_for_each_entry_continue(&p->event_idr, ev, id) 1038 idr_for_each_entry_continue(&p->event_idr, ev, id) {
1029 if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) { 1039 if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
1030 ev->hw_exception_data = hw_exception_data; 1040 ev->hw_exception_data = hw_exception_data;
1031 set_event(ev); 1041 set_event(ev);
1032 } 1042 }
1043 if (ev->type == KFD_EVENT_TYPE_MEMORY &&
1044 reset_cause == KFD_HW_EXCEPTION_ECC) {
1045 ev->memory_exception_data = memory_exception_data;
1046 set_event(ev);
1047 }
1048 }
1033 mutex_unlock(&p->event_mutex); 1049 mutex_unlock(&p->event_mutex);
1034 } 1050 }
1035 srcu_read_unlock(&kfd_processes_srcu, idx); 1051 srcu_read_unlock(&kfd_processes_srcu, idx);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 0eeee3c6d6dc..9e0230965675 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -276,6 +276,9 @@ struct kfd_dev {
276 uint64_t hive_id; 276 uint64_t hive_id;
277 277
278 bool pci_atomic_requested; 278 bool pci_atomic_requested;
279
280 /* SRAM ECC flag */
281 atomic_t sram_ecc_flag;
279}; 282};
280 283
281enum kfd_mempool { 284enum kfd_mempool {
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index e622fd1fbd46..dc067ed0b72d 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -211,6 +211,11 @@ struct kfd_ioctl_dbg_wave_control_args {
211#define KFD_HW_EXCEPTION_GPU_HANG 0 211#define KFD_HW_EXCEPTION_GPU_HANG 0
212#define KFD_HW_EXCEPTION_ECC 1 212#define KFD_HW_EXCEPTION_ECC 1
213 213
214/* For kfd_hsa_memory_exception_data.ErrorType */
215#define KFD_MEM_ERR_NO_RAS 0
216#define KFD_MEM_ERR_SRAM_ECC 1
217#define KFD_MEM_ERR_POISON_CONSUMED 2
218#define KFD_MEM_ERR_GPU_HANG 3
214 219
215struct kfd_ioctl_create_event_args { 220struct kfd_ioctl_create_event_args {
216 __u64 event_page_offset; /* from KFD */ 221 __u64 event_page_offset; /* from KFD */
@@ -250,7 +255,12 @@ struct kfd_hsa_memory_exception_data {
250 struct kfd_memory_exception_failure failure; 255 struct kfd_memory_exception_failure failure;
251 __u64 va; 256 __u64 va;
252 __u32 gpu_id; 257 __u32 gpu_id;
253 __u32 pad; 258 __u32 ErrorType; /* 0 = no RAS error,
259 * 1 = ECC_SRAM,
260 * 2 = Link_SYNFLOOD (poison),
261 * 3 = GPU hang (not attributable to a specific cause),
262 * other values reserved
263 */
254}; 264};
255 265
256/* hw exception data */ 266/* hw exception data */