author:    Monk Liu <Monk.Liu@amd.com>               2017-01-23 01:22:08 -0500
committer: Alex Deucher <alexander.deucher@amd.com>  2017-03-29 23:52:45 -0400
commit:    a90ad3c2afe5bebdd5d00aaec87fec6823545c59
tree:      65b7bfb4bc1152ba0aacdf0e7971cde1cea2e138
parent:    596c67d076a5ae0f2571cab9245ee76f6a6cf922
drm/amdgpu: implement SRIOV gpu_reset (v2)
Implement SRIOV gpu_reset for future use.
It will be called from:
1) job timeout
2) privileged access or instruction error interrupt
3) the hypervisor detecting a VF hang
v2: agd: rebase on upstream
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
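
For context, a minimal sketch (not part of this patch) of how a caller such as a
job-timeout handler might route into the new path, mirroring the amdgpu_gpu_reset()
change below; the function name my_handle_hang() is hypothetical:

    /* Hypothetical caller: assumes adev was obtained by the surrounding code. */
    static int my_handle_hang(struct amdgpu_device *adev)
    {
            /* A guest-detected hang (e.g. a job timeout) is a "voluntary" reset. */
            if (amdgpu_sriov_vf(adev))
                    return amdgpu_sriov_gpu_reset(adev, true);

            /* Bare-metal path, unchanged by this patch. */
            return amdgpu_gpu_reset(adev);
    }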
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 156
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |   1
2 files changed, 156 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7d5ae500fe02..66cdd89982c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1607,6 +1607,53 @@ int amdgpu_suspend(struct amdgpu_device *adev)
         return 0;
 }
 
+static int amdgpu_sriov_resume_early(struct amdgpu_device *adev)
+{
+        int i, r;
+
+        for (i = 0; i < adev->num_ip_blocks; i++) {
+                if (!adev->ip_blocks[i].status.valid)
+                        continue;
+
+                if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+                    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+                    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
+                        r = adev->ip_blocks[i].version->funcs->resume(adev);
+
+                if (r) {
+                        DRM_ERROR("resume of IP block <%s> failed %d\n",
+                                  adev->ip_blocks[i].version->funcs->name, r);
+                        return r;
+                }
+        }
+
+        return 0;
+}
+
+static int amdgpu_sriov_resume_late(struct amdgpu_device *adev)
+{
+        int i, r;
+
+        for (i = 0; i < adev->num_ip_blocks; i++) {
+                if (!adev->ip_blocks[i].status.valid)
+                        continue;
+
+                if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+                    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+                    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)
+                        continue;
+
+                r = adev->ip_blocks[i].version->funcs->resume(adev);
+                if (r) {
+                        DRM_ERROR("resume of IP block <%s> failed %d\n",
+                                  adev->ip_blocks[i].version->funcs->name, r);
+                        return r;
+                }
+        }
+
+        return 0;
+}
+
 static int amdgpu_resume(struct amdgpu_device *adev)
 {
         int i, r;
@@ -2286,6 +2333,113 @@ err:
 }
 
 /**
+ * amdgpu_sriov_gpu_reset - reset the asic
+ *
+ * @adev: amdgpu device pointer
+ * @voluntary: whether this reset was requested by the guest
+ *             (true means by the guest, false means by the hypervisor)
+ *
+ * Attempt to reset the GPU if it has hung (all asics),
+ * for the SRIOV case.
+ * Returns 0 for success or an error on failure.
+ */
+int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary)
+{
+        int i, r = 0;
+        int resched;
+        struct amdgpu_bo *bo, *tmp;
+        struct amdgpu_ring *ring;
+        struct dma_fence *fence = NULL, *next = NULL;
+
+        atomic_inc(&adev->gpu_reset_counter);
+
+        /* block TTM */
+        resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
+
+        /* block scheduler */
+        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+                ring = adev->rings[i];
+
+                if (!ring || !ring->sched.thread)
+                        continue;
+
+                kthread_park(ring->sched.thread);
+                amd_sched_hw_job_reset(&ring->sched);
+        }
+
+        /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
+        amdgpu_fence_driver_force_completion(adev);
+
+        /* request to take full control of the GPU before re-initialization */
+        if (voluntary)
+                amdgpu_virt_reset_gpu(adev);
+        else
+                amdgpu_virt_request_full_gpu(adev, true);
+
+        /* resume IP blocks prior to the SMC */
+        amdgpu_sriov_resume_early(adev);
+
+        /* we need to recover the GART prior to running SMC/CP/SDMA resume */
+        amdgpu_ttm_recover_gart(adev);
+
+        /* now we are okay to resume SMC/CP/SDMA */
+        amdgpu_sriov_resume_late(adev);
+
+        amdgpu_irq_gpu_reset_resume_helper(adev);
+
+        if (amdgpu_ib_ring_tests(adev))
+                dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
+
+        /* release full control of the GPU after the ib test */
+        amdgpu_virt_release_full_gpu(adev, true);
+
+        DRM_INFO("recover vram bo from shadow\n");
+
+        ring = adev->mman.buffer_funcs_ring;
+        mutex_lock(&adev->shadow_list_lock);
+        list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
+                amdgpu_recover_vram_from_shadow(adev, ring, bo, &next);
+                if (fence) {
+                        r = dma_fence_wait(fence, false);
+                        if (r) {
+                                WARN(r, "recovery from shadow isn't completed\n");
+                                break;
+                        }
+                }
+
+                dma_fence_put(fence);
+                fence = next;
+        }
+        mutex_unlock(&adev->shadow_list_lock);
+
+        if (fence) {
+                r = dma_fence_wait(fence, false);
+                if (r)
+                        WARN(r, "recovery from shadow isn't completed\n");
+        }
+        dma_fence_put(fence);
+
+        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+                struct amdgpu_ring *ring = adev->rings[i];
+                if (!ring || !ring->sched.thread)
+                        continue;
+
+                amd_sched_job_recovery(&ring->sched);
+                kthread_unpark(ring->sched.thread);
+        }
+
+        drm_helper_resume_force_mode(adev->ddev);
+        ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
+        if (r) {
+                /* bad news, how to tell it to userspace? */
+                dev_info(adev->dev, "GPU reset failed\n");
+        }
+
+        return r;
+}
+
+/**
  * amdgpu_gpu_reset - reset the asic
  *
  * @adev: amdgpu device pointer
@@ -2300,7 +2454,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
         bool need_full_reset;
 
         if (amdgpu_sriov_vf(adev))
-                return 0;
+                return amdgpu_sriov_gpu_reset(adev, true);
 
         if (!amdgpu_check_soft_reset(adev)) {
                 DRM_INFO("No hardware hang detected. Did some blocks stall?\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 675e12c42532..73d24df2efa1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -89,5 +89,6 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
 int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
+int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary);
 
 #endif
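
Note on the voluntary flag: it only changes the handshake with the host. On a
guest-requested ("voluntary") reset the VF issues the reset request itself via
amdgpu_virt_reset_gpu(); on a hypervisor-initiated reset it merely requests full
GPU access with amdgpu_virt_request_full_gpu(), since the host has already reset
the VF. A hypothetical notification handler (not part of this patch) might look
like:

    /*
     * Hypothetical sketch: handler for a "VF hang detected" notification from
     * the hypervisor (e.g. a mailbox interrupt). Only amdgpu_sriov_gpu_reset()
     * comes from this patch; voluntary = false because the host initiated it.
     */
    static void my_vf_hang_notify(struct amdgpu_device *adev)
    {
            amdgpu_sriov_gpu_reset(adev, false);
    }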