drm/amdgpu: Track pending retry faults in IH and VM (v2)

IH tracks pending retry faults in a hash table for fast lookup in interrupt context. Each VM has a short FIFO of pending VM faults for processing in a bottom half. The IH prescreening stage adds retry faults and filters out repeated retry interrupts to minimize the impact of interrupt storms. It's the VM's responsibility to remove pending faults once they are handled. For now this is only done when the VM is destroyed. v2: - Made the hash table smaller and the FIFO longer. I never want the FIFO to fill up, because that would make prescreen take longer. 128 pending page faults should be enough to keep migrations busy. Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com> Acked-by: Christian König <christian.koenig@amd.com> (v1) Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
author: Felix Kuehling <Felix.Kuehling@amd.com> 2017-08-26 02:43:06 -0400
committer: Alex Deucher <alexander.deucher@amd.com> 2017-09-26 14:53:20 -0400
commit: a2f14820e3493145c25095873d4a510a1b25efdc (patch)
tree: 801651223be96004fc4f39ef658c3bd282311ca2 /drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
parent: 5d86b2c391965cbcb295e8fa795276977b2a416e (diff)
1 files changed, 76 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
index c834a40cfad6..f5f27e4f0f7f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
@@ -196,3 +196,79 @@ restart_ih:
        return IRQ_HANDLED;
 }
+/**
+ * amdgpu_ih_add_fault - Add a page fault record
+ *
+ * @adev: amdgpu device pointer
+ * @key: 64-bit encoding of PASID and address
+ *
+ * This should be called when a retry page fault interrupt is
+ * received. If this is a new page fault, it will be added to a hash
+ * table. The return value indicates whether this is a new fault, or
+ * a fault that was already known and is already being handled.
+ *
+ * If there are too many pending page faults, this will fail. Retry
+ * interrupts should be ignored in this case until there is enough
+ * free space. The fill limit is checked before the hash lookup, so
+ * while the table is at its limit even a key that is already recorded
+ * gets -ENOSPC rather than 1.
+ *
+ * The table and its entry count are only touched while holding
+ * adev->irq.ih.faults->lock, taken with spin_lock_irqsave().
+ *
+ * Returns 0 if the fault was added, 1 if the fault was already known,
+ * -ENOSPC if there are too many pending faults or if the fault table
+ * has not been allocated (the latter also warns once). Any negative
+ * value from chash_table_copy_in() is returned unchanged after a
+ * one-time warning.
+ */
+int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key)
+{
+        unsigned long flags;
+        int r = -ENOSPC;
+        if (WARN_ON_ONCE(!adev->irq.ih.faults))
+                /* Should be allocated in <IP>_ih_sw_init on GPUs that
+                 * support retry faults and require retry filtering.
+                 * Without a table nothing can be recorded, so report
+                 * -ENOSPC to the caller.
+                 */
+                return r;
+        spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
+        /* Only let the hash table fill up to 50% for best performance.
+         * At the limit we bail out with r still holding its initial
+         * -ENOSPC.
+         */
+        if (adev->irq.ih.faults->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
+                goto unlock_out;
+        r = chash_table_copy_in(&adev->irq.ih.faults->hash, key, NULL);
+        if (!r) /* 0: a new entry was inserted, so account for it */
+                adev->irq.ih.faults->count++;
+        /* chash_table_copy_in should never fail unless we're losing count */
+        WARN_ON_ONCE(r < 0);
+unlock_out:
+        spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
+        return r;
+}
+/**
+ * amdgpu_ih_clear_fault - Remove a page fault record
+ *
+ * @adev: amdgpu device pointer
+ * @key: 64-bit encoding of PASID and address
+ *
+ * This should be called when a page fault has been handled. Any
+ * future interrupt with this key will be processed as a new
+ * page fault.
+ *
+ * Does nothing if the fault table has not been allocated. If
+ * chash_table_remove() returns a negative value (presumably because
+ * the key was not in the table - confirm against chash_table_remove),
+ * a one-time warning is issued and the pending count is left
+ * unchanged. Otherwise the count is decremented, with a further
+ * one-time warning if it drops below zero.
+ *
+ * The table and its entry count are only touched while holding
+ * adev->irq.ih.faults->lock, taken with spin_lock_irqsave().
+ */
+void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key)
+{
+        unsigned long flags;
+        int r;
+        if (!adev->irq.ih.faults) /* no fault table: nothing to clear */
+                return;
+        spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
+        r = chash_table_remove(&adev->irq.ih.faults->hash, key, NULL);
+        if (!WARN_ON_ONCE(r < 0)) { /* only adjust the count when removal did not fail */
+                adev->irq.ih.faults->count--;
+                WARN_ON_ONCE(adev->irq.ih.faults->count < 0); /* count must never go negative */
+        }
+        spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
+}
author	Felix Kuehling <Felix.Kuehling@amd.com>	2017-08-26 02:43:06 -0400
committer	Alex Deucher <alexander.deucher@amd.com>	2017-09-26 14:53:20 -0400
commit	a2f14820e3493145c25095873d4a510a1b25efdc (patch)
tree	801651223be96004fc4f39ef658c3bd282311ca2 /drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
parent	5d86b2c391965cbcb295e8fa795276977b2a416e (diff)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c index c834a40cfad6..f5f27e4f0f7f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
@@ -196,3 +196,79 @@ restart_ih:
196		196
197	return IRQ_HANDLED;	197	return IRQ_HANDLED;
198	}	198	}
		199
		200	/**
		201	* amdgpu_ih_add_fault - Add a page fault record
		202	*
		203	* @adev: amdgpu device pointer
		204	* @key: 64-bit encoding of PASID and address
		205	*
		206	* This should be called when a retry page fault interrupt is
		207	* received. If this is a new page fault, it will be added to a hash
		208	* table. The return value indicates whether this is a new fault, or
		209	* a fault that was already known and is already being handled.
		210	*
		211	* If there are too many pending page faults, this will fail. Retry
		212	* interrupts should be ignored in this case until there is enough
		213	* free space. The fill limit is checked before the hash lookup, so a key that is already recorded also gets -ENOSPC while the table is at its limit.
		214	*
		215	* Returns 0 if the fault was added, 1 if the fault was already known,
		216	* -ENOSPC if there are too many pending faults or the fault table has not been allocated; a negative value from chash_table_copy_in() is returned unchanged after a one-time warning.
		217	*/
		218	int amdgpu_ih_add_fault(struct amdgpu_device *adev, u64 key)
		219	{
		220	unsigned long flags;
		221	int r = -ENOSPC;
		222
		223	if (WARN_ON_ONCE(!adev->irq.ih.faults))
		224	/* Should be allocated in <IP>_ih_sw_init on GPUs that
		225	* support retry faults and require retry filtering.
		226	*/
		227	return r;
		228
		229	spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
		230
		231	/* Only let the hash table fill up to 50% for best performance; at the limit r still holds its initial -ENOSPC */
		232	if (adev->irq.ih.faults->count >= (1 << (AMDGPU_PAGEFAULT_HASH_BITS-1)))
		233	goto unlock_out;
		234
		235	r = chash_table_copy_in(&adev->irq.ih.faults->hash, key, NULL);
		236	if (!r) /* 0: a new entry was inserted, so account for it */
		237	adev->irq.ih.faults->count++;
		238
		239	/* chash_table_copy_in should never fail unless we're losing count */
		240	WARN_ON_ONCE(r < 0);
		241
		242	unlock_out:
		243	spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
		244	return r;
		245	}
		246
		247	/**
		248	* amdgpu_ih_clear_fault - Remove a page fault record
		249	*
		250	* @adev: amdgpu device pointer
		251	* @key: 64-bit encoding of PASID and address
		252	*
		253	* This should be called when a page fault has been handled. Any
		254	* future interrupt with this key will be processed as a new
		255	* page fault. Does nothing if the fault table has not been allocated; if chash_table_remove() returns a negative value, a one-time warning is issued and the pending count is left unchanged.
		256	*/
		257	void amdgpu_ih_clear_fault(struct amdgpu_device *adev, u64 key)
		258	{
		259	unsigned long flags;
		260	int r;
		261
		262	if (!adev->irq.ih.faults) /* no fault table: nothing to clear */
		263	return;
		264
		265	spin_lock_irqsave(&adev->irq.ih.faults->lock, flags);
		266
		267	r = chash_table_remove(&adev->irq.ih.faults->hash, key, NULL);
		268	if (!WARN_ON_ONCE(r < 0)) { /* only adjust the count when removal did not fail */
		269	adev->irq.ih.faults->count--;
		270	WARN_ON_ONCE(adev->irq.ih.faults->count < 0); /* count must never go negative */
		271	}
		272
		273	spin_unlock_irqrestore(&adev->irq.ih.faults->lock, flags);
		274	}