author	Alistair Popple <alistair@popple.id.au>	2018-03-02 00:18:45 -0500
committer	Michael Ellerman <mpe@ellerman.id.au>	2018-03-13 00:50:29 -0400
commit	2b74e2a9b39df40a2b489af2d24079617c61ee0e (patch)
tree	f537d51744808b3968f5e3458fac548229c6f10d
parent	c554ac91ce2213faa91c51c45423770218cccce3 (diff)
powerpc/powernv/npu: Fix deadlock in mmio_invalidate()
When sending TLB invalidates to the NPU we need to send extra flushes due
to a hardware issue. The original implementation would lock all the ATSD
MMIO registers sequentially before unlocking and relocking each of them
sequentially to do the extra flush.

This introduced a deadlock: one thread can hold an ATSD register while
waiting for another register to be freed, while a second thread holds that
register and is waiting for the one held by the first thread.

For example, with two threads and two ATSD registers:

	Thread A	Thread B
	------------------------
	Acquire 1
	Acquire 2
	Release 1	Acquire 1
	Wait 1		Wait 2

Both threads end up stuck waiting to acquire a register, resulting in an
RCU stall warning or soft lockup.

This patch solves the deadlock by refactoring the code so that registers
are not released between flushes, and so that all registers are either
acquired or released together, always in the same order.

Fixes: bbd5ff50afff ("powerpc/powernv/npu-dma: Add explicit flush when sending an ATSD")
Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
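The core of the fix is a lock-ordering discipline: each invalidation acquires the register it needs on every NPU in a fixed index order, keeps them all held across the initial invalidate and both extra flushes, and only then releases them, again in order. The standalone userspace sketch below illustrates that discipline with C11 atomics and pthreads; it is not the kernel code, and names such as NREGS, acquire_reg(), release_reg() and invalidate() are made up for illustration.

/*
 * Sketch (not kernel code) of the locking discipline the patch adopts.
 * NREGS and the helpers below stand in for the per-NPU ATSD register
 * bitmap and get_mmio_atsd_reg()/put_mmio_atsd_reg().
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NREGS 2	/* one ATSD register per NPU in this toy model */

static atomic_flag reg_busy[NREGS] = { ATOMIC_FLAG_INIT, ATOMIC_FLAG_INIT };

/* Spin until register i is free, then claim it (get_mmio_atsd_reg analogue). */
static void acquire_reg(int i)
{
	while (atomic_flag_test_and_set_explicit(&reg_busy[i],
						 memory_order_acquire))
		;	/* the kernel would cpu_relax() here */
}

/* Free register i again (put_mmio_atsd_reg analogue). */
static void release_reg(int i)
{
	atomic_flag_clear_explicit(&reg_busy[i], memory_order_release);
}

static void *invalidate(void *arg)
{
	const char *name = arg;
	int i;

	/* Acquire every register in index order... */
	for (i = 0; i < NREGS; i++)
		acquire_reg(i);

	/* ...keep them all held across the invalidate and the extra flushes... */
	printf("%s: invalidate + extra flushes with all registers held\n", name);

	/* ...and release them together, again in index order. */
	for (i = 0; i < NREGS; i++)
		release_reg(i);

	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, invalidate, (void *)"thread A");
	pthread_create(&b, NULL, invalidate, (void *)"thread B");
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

Because both threads acquire in the same order and never drop a register between flushes, the interleaving shown in the table above can no longer leave each thread waiting on a register the other holds.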
-rw-r--r--	arch/powerpc/platforms/powernv/npu-dma.c	229
1 file changed, 141 insertions(+), 88 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 0a253b64ac5f..77d6061fd0ce 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -410,6 +410,11 @@ struct npu_context {
 	void *priv;
 };
 
+struct mmio_atsd_reg {
+	struct npu *npu;
+	int reg;
+};
+
 /*
  * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
  * if none are available.
@@ -419,7 +424,7 @@ static int get_mmio_atsd_reg(struct npu *npu)
 	int i;
 
 	for (i = 0; i < npu->mmio_atsd_count; i++) {
-		if (!test_and_set_bit(i, &npu->mmio_atsd_usage))
+		if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
 			return i;
 	}
 
@@ -428,86 +433,90 @@ static int get_mmio_atsd_reg(struct npu *npu)
 
 static void put_mmio_atsd_reg(struct npu *npu, int reg)
 {
-	clear_bit(reg, &npu->mmio_atsd_usage);
+	clear_bit_unlock(reg, &npu->mmio_atsd_usage);
 }
 
 /* MMIO ATSD register offsets */
 #define XTS_ATSD_AVA		1
 #define XTS_ATSD_STAT		2
 
-static int mmio_launch_invalidate(struct npu *npu, unsigned long launch,
-				unsigned long va)
+static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg,
+				unsigned long launch, unsigned long va)
 {
-	int mmio_atsd_reg;
-
-	do {
-		mmio_atsd_reg = get_mmio_atsd_reg(npu);
-		cpu_relax();
-	} while (mmio_atsd_reg < 0);
+	struct npu *npu = mmio_atsd_reg->npu;
+	int reg = mmio_atsd_reg->reg;
 
 	__raw_writeq(cpu_to_be64(va),
-		npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA);
+		npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA);
 	eieio();
-	__raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]);
-
-	return mmio_atsd_reg;
+	__raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[reg]);
 }
 
-static int mmio_invalidate_pid(struct npu *npu, unsigned long pid, bool flush)
+static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
+				unsigned long pid, bool flush)
 {
+	int i;
 	unsigned long launch;
 
-	/* IS set to invalidate matching PID */
-	launch = PPC_BIT(12);
+	for (i = 0; i <= max_npu2_index; i++) {
+		if (mmio_atsd_reg[i].reg < 0)
+			continue;
+
+		/* IS set to invalidate matching PID */
+		launch = PPC_BIT(12);
 
-	/* PRS set to process-scoped */
-	launch |= PPC_BIT(13);
+		/* PRS set to process-scoped */
+		launch |= PPC_BIT(13);
 
-	/* AP */
-	launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+		/* AP */
+		launch |= (u64)
+			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
 
-	/* PID */
-	launch |= pid << PPC_BITLSHIFT(38);
+		/* PID */
+		launch |= pid << PPC_BITLSHIFT(38);
 
-	/* No flush */
-	launch |= !flush << PPC_BITLSHIFT(39);
+		/* No flush */
+		launch |= !flush << PPC_BITLSHIFT(39);
 
-	/* Invalidating the entire process doesn't use a va */
-	return mmio_launch_invalidate(npu, launch, 0);
+		/* Invalidating the entire process doesn't use a va */
+		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0);
+	}
 }
 
-static int mmio_invalidate_va(struct npu *npu, unsigned long va,
-			unsigned long pid, bool flush)
+static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
+			unsigned long va, unsigned long pid, bool flush)
 {
+	int i;
 	unsigned long launch;
 
-	/* IS set to invalidate target VA */
-	launch = 0;
+	for (i = 0; i <= max_npu2_index; i++) {
+		if (mmio_atsd_reg[i].reg < 0)
+			continue;
+
+		/* IS set to invalidate target VA */
+		launch = 0;
 
-	/* PRS set to process scoped */
-	launch |= PPC_BIT(13);
+		/* PRS set to process scoped */
+		launch |= PPC_BIT(13);
 
-	/* AP */
-	launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+		/* AP */
+		launch |= (u64)
+			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
 
-	/* PID */
-	launch |= pid << PPC_BITLSHIFT(38);
+		/* PID */
+		launch |= pid << PPC_BITLSHIFT(38);
 
-	/* No flush */
-	launch |= !flush << PPC_BITLSHIFT(39);
+		/* No flush */
+		launch |= !flush << PPC_BITLSHIFT(39);
 
-	return mmio_launch_invalidate(npu, launch, va);
+		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va);
+	}
 }
 
 #define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
 
-struct mmio_atsd_reg {
-	struct npu *npu;
-	int reg;
-};
-
 static void mmio_invalidate_wait(
-	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], bool flush)
+	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
 {
 	struct npu *npu;
 	int i, reg;
@@ -522,16 +531,67 @@ static void mmio_invalidate_wait(
 		reg = mmio_atsd_reg[i].reg;
 		while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
 			cpu_relax();
+	}
+}
+
+/*
+ * Acquires all the address translation shootdown (ATSD) registers required to
+ * launch an ATSD on all links this npu_context is active on.
+ */
+static void acquire_atsd_reg(struct npu_context *npu_context,
+			struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
+{
+	int i, j;
+	struct npu *npu;
+	struct pci_dev *npdev;
+	struct pnv_phb *nphb;
 
-		put_mmio_atsd_reg(npu, reg);
+	for (i = 0; i <= max_npu2_index; i++) {
+		mmio_atsd_reg[i].reg = -1;
+		for (j = 0; j < NV_MAX_LINKS; j++) {
+			/*
+			 * There are no ordering requirements with respect to
+			 * the setup of struct npu_context, but to ensure
+			 * consistent behaviour we need to ensure npdev[][] is
+			 * only read once.
+			 */
+			npdev = READ_ONCE(npu_context->npdev[i][j]);
+			if (!npdev)
+				continue;
 
+			nphb = pci_bus_to_host(npdev->bus)->private_data;
+			npu = &nphb->npu;
+			mmio_atsd_reg[i].npu = npu;
+			mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
+			while (mmio_atsd_reg[i].reg < 0) {
+				mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
+				cpu_relax();
+			}
+			break;
+		}
+	}
+}
+
+/*
+ * Release previously acquired ATSD registers. To avoid deadlocks the registers
+ * must be released in the same order they were acquired above in
+ * acquire_atsd_reg.
+ */
+static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
+{
+	int i;
+
+	for (i = 0; i <= max_npu2_index; i++) {
 		/*
-		 * The GPU requires two flush ATSDs to ensure all entries have
-		 * been flushed. We use PID 0 as it will never be used for a
-		 * process on the GPU.
+		 * We can't rely on npu_context->npdev[][] being the same here
+		 * as when acquire_atsd_reg() was called, hence we use the
+		 * values stored in mmio_atsd_reg during the acquire phase
+		 * rather than re-reading npdev[][].
 		 */
-		if (flush)
-			mmio_invalidate_pid(npu, 0, true);
+		if (mmio_atsd_reg[i].reg < 0)
+			continue;
+
+		put_mmio_atsd_reg(mmio_atsd_reg[i].npu, mmio_atsd_reg[i].reg);
 	}
 }
 
@@ -542,10 +602,6 @@ static void mmio_invalidate_wait(
 static void mmio_invalidate(struct npu_context *npu_context, int va,
 			unsigned long address, bool flush)
 {
-	int i, j;
-	struct npu *npu;
-	struct pnv_phb *nphb;
-	struct pci_dev *npdev;
 	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
 	unsigned long pid = npu_context->mm->context.id;
 
@@ -561,37 +617,25 @@ static void mmio_invalidate(struct npu_context *npu_context, int va,
 	 * Loop over all the NPUs this process is active on and launch
 	 * an invalidate.
 	 */
-	for (i = 0; i <= max_npu2_index; i++) {
-		mmio_atsd_reg[i].reg = -1;
-		for (j = 0; j < NV_MAX_LINKS; j++) {
-			npdev = npu_context->npdev[i][j];
-			if (!npdev)
-				continue;
-
-			nphb = pci_bus_to_host(npdev->bus)->private_data;
-			npu = &nphb->npu;
-			mmio_atsd_reg[i].npu = npu;
-
-			if (va)
-				mmio_atsd_reg[i].reg =
-					mmio_invalidate_va(npu, address, pid,
-							flush);
-			else
-				mmio_atsd_reg[i].reg =
-					mmio_invalidate_pid(npu, pid, flush);
-
-			/*
-			 * The NPU hardware forwards the shootdown to all GPUs
-			 * so we only have to launch one shootdown per NPU.
-			 */
-			break;
-		}
+	acquire_atsd_reg(npu_context, mmio_atsd_reg);
+	if (va)
+		mmio_invalidate_va(mmio_atsd_reg, address, pid, flush);
+	else
+		mmio_invalidate_pid(mmio_atsd_reg, pid, flush);
+
+	mmio_invalidate_wait(mmio_atsd_reg);
+	if (flush) {
+		/*
+		 * The GPU requires two flush ATSDs to ensure all entries have
+		 * been flushed. We use PID 0 as it will never be used for a
+		 * process on the GPU.
+		 */
+		mmio_invalidate_pid(mmio_atsd_reg, 0, true);
+		mmio_invalidate_wait(mmio_atsd_reg);
+		mmio_invalidate_pid(mmio_atsd_reg, 0, true);
+		mmio_invalidate_wait(mmio_atsd_reg);
 	}
-
-	mmio_invalidate_wait(mmio_atsd_reg, flush);
-	if (flush)
-		/* Wait for the flush to complete */
-		mmio_invalidate_wait(mmio_atsd_reg, false);
+	release_atsd_reg(mmio_atsd_reg);
 }
 
 static void pnv_npu2_mn_release(struct mmu_notifier *mn,
@@ -726,7 +770,16 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
 							&nvlink_index)))
 		return ERR_PTR(-ENODEV);
-	npu_context->npdev[npu->index][nvlink_index] = npdev;
+
+	/*
+	 * npdev is a pci_dev pointer setup by the PCI code. We assign it to
+	 * npdev[][] to indicate to the mmu notifiers that an invalidation
+	 * should also be sent over this nvlink. The notifiers don't use any
+	 * other fields in npu_context, so we just need to ensure that when they
+	 * deference npu_context->npdev[][] it is either a valid pointer or
+	 * NULL.
+	 */
+	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);
 
 	if (!nphb->npu.nmmu_flush) {
 		/*
@@ -778,7 +831,7 @@ void pnv_npu2_destroy_context(struct npu_context *npu_context,
 	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
 							&nvlink_index)))
 		return;
-	npu_context->npdev[npu->index][nvlink_index] = NULL;
+	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
 	opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
 			PCI_DEVID(gpdev->bus->number, gpdev->devfn));
 	kref_put(&npu_context->kref, pnv_npu2_release_context);