author	Alistair Popple <alistair@popple.id.au>	2018-03-02 00:18:45 -0500
committer	Michael Ellerman <mpe@ellerman.id.au>	2018-03-13 00:50:29 -0400
commit	2b74e2a9b39df40a2b489af2d24079617c61ee0e (patch)
tree	f537d51744808b3968f5e3458fac548229c6f10d
parent	c554ac91ce2213faa91c51c45423770218cccce3 (diff)
powerpc/powernv/npu: Fix deadlock in mmio_invalidate()
When sending TLB invalidates to the NPU we need to send extra flushes due
to a hardware issue. The original implementation would lock all the ATSD
MMIO registers sequentially before unlocking and relocking each of them
sequentially to do the extra flush.

This introduced a deadlock: one thread can hold an ATSD register while
waiting for another register to be freed, while a second thread holds that
register and is waiting for the one held by the first thread.

For example, with two threads and two ATSD registers:

	Thread A	Thread B
	------------------------
	Acquire 1
	Acquire 2
	Release 1	Acquire 1
	Wait 1		Wait 2

Both threads end up stuck waiting to acquire a register, resulting in an
RCU stall warning or soft lockup.

This patch solves the deadlock by refactoring the code so that registers
are not released between flushes, and so that all registers are either
acquired or released together, always in the same order.

Fixes: bbd5ff50afff ("powerpc/powernv/npu-dma: Add explicit flush when sending an ATSD")
Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
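The core of the fix is a lock-ordering discipline: each invalidation acquires the register it needs on every NPU in a fixed index order, keeps them all held across the initial invalidate and both extra flushes, and only then releases them, again in order. The standalone userspace sketch below illustrates that discipline with C11 atomics and pthreads; it is not the kernel code, and names such as NREGS, acquire_reg(), release_reg() and invalidate() are made up for illustration.

/*
 * Sketch (not kernel code) of the locking discipline the patch adopts.
 * NREGS and the helpers below stand in for the per-NPU ATSD register
 * bitmap and get_mmio_atsd_reg()/put_mmio_atsd_reg().
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NREGS 2	/* one ATSD register per NPU in this toy model */

static atomic_flag reg_busy[NREGS] = { ATOMIC_FLAG_INIT, ATOMIC_FLAG_INIT };

/* Spin until register i is free, then claim it (get_mmio_atsd_reg analogue). */
static void acquire_reg(int i)
{
	while (atomic_flag_test_and_set_explicit(&reg_busy[i],
						 memory_order_acquire))
		;	/* the kernel would cpu_relax() here */
}

/* Free register i again (put_mmio_atsd_reg analogue). */
static void release_reg(int i)
{
	atomic_flag_clear_explicit(&reg_busy[i], memory_order_release);
}

static void *invalidate(void *arg)
{
	const char *name = arg;
	int i;

	/* Acquire every register in index order... */
	for (i = 0; i < NREGS; i++)
		acquire_reg(i);

	/* ...keep them all held across the invalidate and the extra flushes... */
	printf("%s: invalidate + extra flushes with all registers held\n", name);

	/* ...and release them together, again in index order. */
	for (i = 0; i < NREGS; i++)
		release_reg(i);

	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, invalidate, (void *)"thread A");
	pthread_create(&b, NULL, invalidate, (void *)"thread B");
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

Because both threads acquire in the same order and never drop a register between flushes, the interleaving shown in the table above can no longer leave each thread waiting on a register the other holds.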
-rw-r--r--	arch/powerpc/platforms/powernv/npu-dma.c	229
1 file changed, 141 insertions(+), 88 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 0a253b64ac5f..77d6061fd0ce 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -410,6 +410,11 @@ struct npu_context {
 	void *priv;
 };
 
+struct mmio_atsd_reg {
+	struct npu *npu;
+	int reg;
+};
+
 /*
  * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
  * if none are available.
@@ -419,7 +424,7 @@ static int get_mmio_atsd_reg(struct npu *npu)
 	int i;
 
 	for (i = 0; i < npu->mmio_atsd_count; i++) {
-		if (!test_and_set_bit(i, &npu->mmio_atsd_usage))
+		if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
 			return i;
 	}
 
@@ -428,86 +433,90 @@ static int get_mmio_atsd_reg(struct npu *npu)
 
 static void put_mmio_atsd_reg(struct npu *npu, int reg)
 {
-	clear_bit(reg, &npu->mmio_atsd_usage);
+	clear_bit_unlock(reg, &npu->mmio_atsd_usage);
 }
 
 /* MMIO ATSD register offsets */
 #define XTS_ATSD_AVA		1
 #define XTS_ATSD_STAT		2
 
-static int mmio_launch_invalidate(struct npu *npu, unsigned long launch,
-				unsigned long va)
+static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg,
+				unsigned long launch, unsigned long va)
 {
-	int mmio_atsd_reg;
-
-	do {
-		mmio_atsd_reg = get_mmio_atsd_reg(npu);
-		cpu_relax();
-	} while (mmio_atsd_reg < 0);
+	struct npu *npu = mmio_atsd_reg->npu;
+	int reg = mmio_atsd_reg->reg;
 
 	__raw_writeq(cpu_to_be64(va),
-		npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA);
+		npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA);
 	eieio();
-	__raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]);
-
-	return mmio_atsd_reg;
+	__raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[reg]);
 }
 
-static int mmio_invalidate_pid(struct npu *npu, unsigned long pid, bool flush)
+static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
+				unsigned long pid, bool flush)
 {
+	int i;
 	unsigned long launch;
 
-	/* IS set to invalidate matching PID */
-	launch = PPC_BIT(12);
+	for (i = 0; i <= max_npu2_index; i++) {
+		if (mmio_atsd_reg[i].reg < 0)
+			continue;
+
+		/* IS set to invalidate matching PID */
+		launch = PPC_BIT(12);
 
-	/* PRS set to process-scoped */
-	launch |= PPC_BIT(13);
+		/* PRS set to process-scoped */
+		launch |= PPC_BIT(13);
 
-	/* AP */
-	launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+		/* AP */
+		launch |= (u64)
+			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
 
-	/* PID */
-	launch |= pid << PPC_BITLSHIFT(38);
+		/* PID */
+		launch |= pid << PPC_BITLSHIFT(38);
 
-	/* No flush */
-	launch |= !flush << PPC_BITLSHIFT(39);
+		/* No flush */
+		launch |= !flush << PPC_BITLSHIFT(39);
 
-	/* Invalidating the entire process doesn't use a va */
-	return mmio_launch_invalidate(npu, launch, 0);
+		/* Invalidating the entire process doesn't use a va */
+		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0);
+	}
 }
 
-static int mmio_invalidate_va(struct npu *npu, unsigned long va,
-			unsigned long pid, bool flush)
+static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
+			unsigned long va, unsigned long pid, bool flush)
 {
+	int i;
 	unsigned long launch;
 
-	/* IS set to invalidate target VA */
-	launch = 0;
+	for (i = 0; i <= max_npu2_index; i++) {
+		if (mmio_atsd_reg[i].reg < 0)
+			continue;
+
+		/* IS set to invalidate target VA */
+		launch = 0;
 
-	/* PRS set to process scoped */
-	launch |= PPC_BIT(13);
+		/* PRS set to process scoped */
+		launch |= PPC_BIT(13);
 
-	/* AP */
-	launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+		/* AP */
+		launch |= (u64)
+			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
 
-	/* PID */
-	launch |= pid << PPC_BITLSHIFT(38);
+		/* PID */
+		launch |= pid << PPC_BITLSHIFT(38);
 
-	/* No flush */
-	launch |= !flush << PPC_BITLSHIFT(39);
+		/* No flush */
+		launch |= !flush << PPC_BITLSHIFT(39);
 
-	return mmio_launch_invalidate(npu, launch, va);
+		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va);
+	}
 }
 
 #define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
 
-struct mmio_atsd_reg {
-	struct npu *npu;
-	int reg;
-};
-
 static void mmio_invalidate_wait(
-	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], bool flush)
+	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
 {
 	struct npu *npu;
 	int i, reg;
@@ -522,16 +531,67 @@ static void mmio_invalidate_wait(
 		reg = mmio_atsd_reg[i].reg;
 		while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
 			cpu_relax();
+	}
+}
+
+/*
+ * Acquires all the address translation shootdown (ATSD) registers required to
+ * launch an ATSD on all links this npu_context is active on.
+ */
+static void acquire_atsd_reg(struct npu_context *npu_context,
+			struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
+{
+	int i, j;
+	struct npu *npu;
+	struct pci_dev *npdev;
+	struct pnv_phb *nphb;
 
-		put_mmio_atsd_reg(npu, reg);
+	for (i = 0; i <= max_npu2_index; i++) {
+		mmio_atsd_reg[i].reg = -1;
+		for (j = 0; j < NV_MAX_LINKS; j++) {
+			/*
+			 * There are no ordering requirements with respect to
+			 * the setup of struct npu_context, but to ensure
+			 * consistent behaviour we need to ensure npdev[][] is
+			 * only read once.
+			 */
+			npdev = READ_ONCE(npu_context->npdev[i][j]);
+			if (!npdev)
+				continue;
 
+			nphb = pci_bus_to_host(npdev->bus)->private_data;
+			npu = &nphb->npu;
+			mmio_atsd_reg[i].npu = npu;
+			mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
+			while (mmio_atsd_reg[i].reg < 0) {
+				mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
+				cpu_relax();
+			}
+			break;
+		}
+	}
+}
+
+/*
+ * Release previously acquired ATSD registers. To avoid deadlocks the registers
+ * must be released in the same order they were acquired above in
+ * acquire_atsd_reg.
+ */
+static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
+{
+	int i;
+
+	for (i = 0; i <= max_npu2_index; i++) {
 		/*
-		 * The GPU requires two flush ATSDs to ensure all entries have
-		 * been flushed. We use PID 0 as it will never be used for a
-		 * process on the GPU.
+		 * We can't rely on npu_context->npdev[][] being the same here
+		 * as when acquire_atsd_reg() was called, hence we use the
+		 * values stored in mmio_atsd_reg during the acquire phase
+		 * rather than re-reading npdev[][].
 		 */
-		if (flush)
-			mmio_invalidate_pid(npu, 0, true);
+		if (mmio_atsd_reg[i].reg < 0)
+			continue;
+
+		put_mmio_atsd_reg(mmio_atsd_reg[i].npu, mmio_atsd_reg[i].reg);
 	}
 }
 
@@ -542,10 +602,6 @@ static void mmio_invalidate_wait(
 static void mmio_invalidate(struct npu_context *npu_context, int va,
 			unsigned long address, bool flush)
 {
-	int i, j;
-	struct npu *npu;
-	struct pnv_phb *nphb;
-	struct pci_dev *npdev;
 	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
 	unsigned long pid = npu_context->mm->context.id;
 
@@ -561,37 +617,25 @@ static void mmio_invalidate(struct npu_context *npu_context, int va,
 	 * Loop over all the NPUs this process is active on and launch
 	 * an invalidate.
 	 */
-	for (i = 0; i <= max_npu2_index; i++) {
-		mmio_atsd_reg[i].reg = -1;
-		for (j = 0; j < NV_MAX_LINKS; j++) {
-			npdev = npu_context->npdev[i][j];
-			if (!npdev)
-				continue;
-
-			nphb = pci_bus_to_host(npdev->bus)->private_data;
-			npu = &nphb->npu;
-			mmio_atsd_reg[i].npu = npu;
-
-			if (va)
-				mmio_atsd_reg[i].reg =
-					mmio_invalidate_va(npu, address, pid,
-							flush);
-			else
-				mmio_atsd_reg[i].reg =
-					mmio_invalidate_pid(npu, pid, flush);
-
-			/*
-			 * The NPU hardware forwards the shootdown to all GPUs
-			 * so we only have to launch one shootdown per NPU.
-			 */
-			break;
-		}
+	acquire_atsd_reg(npu_context, mmio_atsd_reg);
+	if (va)
+		mmio_invalidate_va(mmio_atsd_reg, address, pid, flush);
+	else
+		mmio_invalidate_pid(mmio_atsd_reg, pid, flush);
+
+	mmio_invalidate_wait(mmio_atsd_reg);
+	if (flush) {
+		/*
+		 * The GPU requires two flush ATSDs to ensure all entries have
+		 * been flushed. We use PID 0 as it will never be used for a
+		 * process on the GPU.
+		 */
+		mmio_invalidate_pid(mmio_atsd_reg, 0, true);
+		mmio_invalidate_wait(mmio_atsd_reg);
+		mmio_invalidate_pid(mmio_atsd_reg, 0, true);
+		mmio_invalidate_wait(mmio_atsd_reg);
 	}
-
-	mmio_invalidate_wait(mmio_atsd_reg, flush);
-	if (flush)
-		/* Wait for the flush to complete */
-		mmio_invalidate_wait(mmio_atsd_reg, false);
+	release_atsd_reg(mmio_atsd_reg);
 }
 
 static void pnv_npu2_mn_release(struct mmu_notifier *mn,
@@ -726,7 +770,16 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
 							&nvlink_index)))
 		return ERR_PTR(-ENODEV);
-	npu_context->npdev[npu->index][nvlink_index] = npdev;
+
+	/*
+	 * npdev is a pci_dev pointer setup by the PCI code. We assign it to
+	 * npdev[][] to indicate to the mmu notifiers that an invalidation
+	 * should also be sent over this nvlink. The notifiers don't use any
+	 * other fields in npu_context, so we just need to ensure that when they
+	 * deference npu_context->npdev[][] it is either a valid pointer or
+	 * NULL.
+	 */
+	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);
 
 	if (!nphb->npu.nmmu_flush) {
 		/*
@@ -778,7 +831,7 @@ void pnv_npu2_destroy_context(struct npu_context *npu_context,
 	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
 							&nvlink_index)))
 		return;
-	npu_context->npdev[npu->index][nvlink_index] = NULL;
+	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
 	opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
 			PCI_DEVID(gpdev->bus->number, gpdev->devfn));
 	kref_put(&npu_context->kref, pnv_npu2_release_context);