diff options
author | Joshua Bakita <bakitajoshua@gmail.com> | 2024-04-08 13:33:28 -0400 |
---|---|---|
committer | Joshua Bakita <jbakita@cs.unc.edu> | 2024-04-08 14:02:07 -0400 |
commit | ac60151ea0a4a1f3882fde3c486af870029b7977 (patch) | |
tree | 4a32ead49a987030cb4222c687b23bc1f23d4d34 /nvdebug.h | |
parent | 3aab3c220f3f0bcc3d3d58d0daf6fd6acf1819e2 (diff) |
Rework LCE<->PCE and GRCE->LCE configuration printing API archive/saman63-wip
Rather than up to dozens of individual files exposing part of each
copy engine's configuration, have one file which exposes a unified
view of the full topology. Example new output on RTX 2080 Ti:
$ cat /proc/gpu0/copy_topology
GRCE0 -> LCE04
GRCE1 -> LCE03
LCE02 -> PCE02
LCE03 -> PCE03
LCE04 -> PCE01
Old output:
$ tail -n 1 /proc/gpu0/lce_for_pce*
==> /proc/gpu0/lce_for_pce0 <==
0xf
==> /proc/gpu0/lce_for_pce1 <==
0x4
==> /proc/gpu0/lce_for_pce2 <==
0x2
==> /proc/gpu0/lce_for_pce3 <==
0x3
$ tail -n 1 /proc/gpu0/shared_lce_for_grce*
==> /proc/gpu0/shared_lce_for_grce0 <==
0x4
==> /proc/gpu0/shared_lce_for_grce1 <==
0x3
Specifically:
- Add `copy_topology` API
- Remove `shared_lce_for_grce#` and `lce_for_pce#` APIs
- Move logic from `nvdebug_entry.c` to `copy_topology_procfs.c`
- Do not print PCE or Shared LCE configuration if flagged absent
- Refer to LCE0 and LCE1 as GRCE0 and GRCE1
- Print by LCE ID, which is more helpful when attempting to trace
how a given copy runlist maps to a physical copy engine.
- Document two errata with CE registers
Tested working on Pascal Integrated, Pascal, Volta Integrated,
Volta, Turing, and Ampere Integrated on Linux 4.9 through 5.10.
Diffstat (limited to 'nvdebug.h')
-rw-r--r-- | nvdebug.h | 18 |
1 files changed, 11 insertions, 7 deletions
@@ -1,4 +1,4 @@ | |||
1 | /* Copyright 2021 Joshua Bakita | 1 | /* Copyright 2024 Joshua Bakita |
2 | * SPDX-License-Identifier: MIT | 2 | * SPDX-License-Identifier: MIT |
3 | * | 3 | * |
4 | * File outline: | 4 | * File outline: |
@@ -688,17 +688,20 @@ typedef union { | |||
688 | 688 | ||
689 | SCAL_NUM_CES : Number of externally accessible copy engines | 689 | SCAL_NUM_CES : Number of externally accessible copy engines |
690 | 690 | ||
691 | Errata: Incorrectly reports "3" on Jetson TX1 and TX2. Should report "1" to be | ||
692 | consistent with PTOP data. | ||
693 | |||
691 | Support: Kepler through (at least) Blackwell | 694 | Support: Kepler through (at least) Blackwell |
692 | Also see dev_ce.ref.txt of NVIDIA's open-gpu-doc for info. | 695 | Also see dev_ce.ref.txt of NVIDIA's open-gpu-doc for info. |
693 | */ | 696 | */ |
694 | #define NV_PTOP_SCAL_NUM_CES 0x00022444 | 697 | #define NV_PTOP_SCAL_NUM_CES 0x00022444 |
695 | // Defined number of GRCEs for a GPU | 698 | // Defined max number of GRCEs for a GPU (TX2 has only one) |
696 | # define NV_GRCE_NUM 2 | 699 | # define NV_GRCE_MAX 2 |
697 | // Defined GRCE->CE mapping offsets from nvgpu | 700 | // Defined GRCE->CE mapping offsets from nvgpu |
698 | #define NV_GRCE_FOR_CE_GP100(i) (0x00104034+(i)*4) | 701 | #define NV_GRCE_FOR_CE_GP100(i) (0x00104034+(i)*4) |
699 | #define NV_GRCE_FOR_CE_GA100(i) (0x001041c0+(i)*4) | 702 | #define NV_GRCE_FOR_CE_GA100(i) (0x001041c0+(i)*4) |
700 | // Defined LCE->PCE mapping offset from nvgpu (same as ce_pce2lce_config_r(i) in nvgpu) | 703 | // Defined LCE->PCE mapping offset from nvgpu (same as ce_pce2lce_config_r(i) in nvgpu) |
701 | #define NV_LCE_FOR_PCE_GP100(i) (0x0010402c+(i)/2) | 704 | #define NV_LCE_FOR_PCE_GP100 0x0010402c |
702 | #define NV_LCE_FOR_PCE_GV100(i) (0x00104040+(i)*4) | 705 | #define NV_LCE_FOR_PCE_GV100(i) (0x00104040+(i)*4) |
703 | #define NV_LCE_FOR_PCE_GA100(i) (0x00104100+(i)*4) | 706 | #define NV_LCE_FOR_PCE_GA100(i) (0x00104100+(i)*4) |
704 | // Struct for use with nvdebug_reg_range_read() | 707 | // Struct for use with nvdebug_reg_range_read() |
@@ -717,13 +720,14 @@ union reg_range { | |||
717 | 720 | ||
718 | CE_PCE_MAP : A bitmask, where a set bit indicates that the PCE for that index | 721 | CE_PCE_MAP : A bitmask, where a set bit indicates that the PCE for that index |
719 | is enabled (not floorswept) on this GPU. Count the number of set | 722 | is enabled (not floorswept) on this GPU. Count the number of set |
720 | bits to get the number of PCEs. | 723 | bits to get the number of PCEs. Note that this may be bogus if |
724 | the GPU has not been used since reset. | ||
721 | 725 | ||
722 | Support: Kepler through (at least) Blackwell | 726 | Support: Pascal through (at least) Blackwell |
723 | Also see dev_ce.ref.txt of NVIDIA's open-gpu-doc for info. | 727 | Also see dev_ce.ref.txt of NVIDIA's open-gpu-doc for info. |
724 | */ | 728 | */ |
725 | #define NV_CE_PCE_MAP 0x00104028 | 729 | #define NV_CE_PCE_MAP 0x00104028 |
726 | #define MAP_SIZE 32 | 730 | #define NV_CE_PCE_MAP_SIZE 32 |
727 | 731 | ||
728 | 732 | ||
729 | /* Location of the 1Kb instance block with page tables for BAR1 and BAR2. | 733 | /* Location of the 1Kb instance block with page tables for BAR1 and BAR2. |