diff options
-rw-r--r-- | bus.c | 73 | ||||
-rw-r--r-- | nvdebug.h | 67 |
2 files changed, 111 insertions, 29 deletions
@@ -24,10 +24,11 @@ | |||
24 | /* Obtain the PRAMIN offset at which `addr` can be accessed | 24 | /* Obtain the PRAMIN offset at which `addr` can be accessed |
25 | @param addr Address to find | 25 | @param addr Address to find |
26 | @param target Which address space to use (VRAM, SYS_MEM, PEER(?)) | 26 | @param target Which address space to use (VRAM, SYS_MEM, PEER(?)) |
27 | @return positive offset or -EINVAL on invalid arguments | 27 | @return positive offset, -EINVAL on invalid arguments, or -EOPNOTSUPP on |
28 | an unsupported platform. | ||
28 | 29 | ||
29 | Note: Will move the PRAMIN window to accommodate the request. Only guarantees | 30 | Note: Will move the PRAMIN window to accommodate the request. Only guarantees |
30 | that the surrounding 64KiB window will be accessible. | 31 | that the surrounding 64-KiB-aligned window will be accessible. |
31 | Note: Moving the PRAMIN window will cause problems if it races with driver | 32 | Note: Moving the PRAMIN window will cause problems if it races with driver |
32 | code that tries to do the same, or expects the window not to move. | 33 | code that tries to do the same, or expects the window not to move. |
33 | Bugs: Untested on PEER. | 34 | Bugs: Untested on PEER. |
@@ -36,6 +37,7 @@ int addr_to_pramin_mut(struct nvdebug_state *g, | |||
36 | uint64_t addr, enum INST_TARGET target) { | 37 | uint64_t addr, enum INST_TARGET target) { |
37 | bar0_window_t window; | 38 | bar0_window_t window; |
38 | uint64_t pramin_base; | 39 | uint64_t pramin_base; |
40 | uint32_t window_reg; | ||
39 | // For us, accuracy and robustness are more important than speed | 41 | // For us, accuracy and robustness are more important than speed |
40 | // Check that the address is valid (49 bits are addressable on-GPU, but | 42 | // Check that the address is valid (49 bits are addressable on-GPU, but |
41 | // PRAMIN only supports up to 40 bits). | 43 | // PRAMIN only supports up to 40 bits). |
@@ -44,21 +46,38 @@ int addr_to_pramin_mut(struct nvdebug_state *g, | |||
44 | addr, __func__); | 46 | addr, __func__); |
45 | return -EINVAL; | 47 | return -EINVAL; |
46 | } | 48 | } |
47 | window.raw = nvdebug_readl(g, NV_PBUS_BAR0_WINDOW); | 49 | // Register relocated on Hopper and Blackwell+ |
48 | if (window.target != target) | 50 | if ((g->chip_id >= NV_CHIP_ID_HOPPER && g->chip_id < NV_CHIP_ID_ADA) || g->chip_id >= NV_CHIP_ID_BLACKWELL) |
49 | goto relocate; | 51 | window_reg = NV_XAL_EP_BAR0_WINDOW_BASE; |
52 | else | ||
53 | window_reg = NV_PBUS_BAR0_WINDOW; | ||
54 | if ((window.raw = nvdebug_readl(g, window_reg)) == -1) { | ||
55 | printk(KERN_ERR "[nvdebug] PRAMIN window configuration inaccessible; " | ||
56 | "failing %s\n", __func__); | ||
57 | return -EOPNOTSUPP; | ||
58 | } | ||
59 | if (window.target != target) { | ||
60 | // On Hopper and Blackwell+, the window always points at VID_MEM | ||
61 | if ((g->chip_id >= NV_CHIP_ID_HOPPER && g->chip_id < NV_CHIP_ID_ADA) || g->chip_id >= NV_CHIP_ID_BLACKWELL) | ||
62 | return -EOPNOTSUPP; | ||
63 | else | ||
64 | goto relocate; | ||
65 | } | ||
50 | pramin_base = ((uint64_t)window.base) << 16; | 66 | pramin_base = ((uint64_t)window.base) << 16; |
51 | if (addr < pramin_base || addr > pramin_base + NV_PRAMIN_LEN) | 67 | if (addr < pramin_base || addr > pramin_base + NV_PRAMIN_LEN) |
52 | goto relocate; | 68 | goto relocate; |
53 | return addr - pramin_base; // Guaranteed to be < 1MiB, so safe for int | 69 | return addr - pramin_base; // Guaranteed to be < 1MiB, so safe for int |
54 | relocate: | 70 | relocate: |
55 | printk(KERN_INFO "[nvdebug] Moving PRAMIN win from base %llx (%s) to %llx (%s) to accomodate %#018llx\n", pramin_base, target_to_text(window.target), (addr >> 16) << 16, target_to_text(target), addr); | 71 | printk(KERN_INFO "[nvdebug] [SIDE EFFECT] Moving PRAMIN window from base " |
72 | "%llx (%s) to %llx (%s) to accomodate %#018llx\n", | ||
73 | ((uint64_t)window.base) << 16, target_to_text(window.target), | ||
74 | (addr >> 16) << 16, target_to_text(target), addr); | ||
56 | // Move PRAMIN window to a 64KiB-aligned address | 75 | // Move PRAMIN window to a 64KiB-aligned address |
57 | window.base = (u32)(addr >> 16); // Safe, due to above range check | 76 | window.base = (u32)(addr >> 16); // Safe, due to above range check |
58 | window.target = target; | 77 | window.target = target; |
59 | nvdebug_writel(g, NV_PBUS_BAR0_WINDOW, window.raw); | 78 | nvdebug_writel(g, window_reg, window.raw); |
60 | // Wait for the window to move by re-reading (as done in nvgpu driver) | 79 | // Wait for the window to move by re-reading (as done in nvgpu driver) |
61 | (void) nvdebug_readl(g, NV_PBUS_BAR0_WINDOW); | 80 | (void) nvdebug_readl(g, window_reg); |
62 | return (int)(addr & 0xffffull); | 81 | return (int)(addr & 0xffffull); |
63 | } | 82 | } |
64 | 83 | ||
@@ -72,7 +91,9 @@ relocate: | |||
72 | */ | 91 | */ |
73 | int get_bar2_pdb(struct nvdebug_state *g, page_dir_config_t* pd) { | 92 | int get_bar2_pdb(struct nvdebug_state *g, page_dir_config_t* pd) { |
74 | int ret; | 93 | int ret; |
75 | bar_config_block_t bar2_block; | 94 | uint64_t bar2_ptr; |
95 | enum INST_TARGET bar2_target; | ||
96 | bool bar2_is_virtual; | ||
76 | 97 | ||
77 | if (!pd) | 98 | if (!pd) |
78 | return -EINVAL; | 99 | return -EINVAL; |
@@ -85,17 +106,37 @@ int get_bar2_pdb(struct nvdebug_state *g, page_dir_config_t* pd) { | |||
85 | // hierarchy used to translate BAR2 offsets to VRAM or SYS_MEM addresses. | 106 | // hierarchy used to translate BAR2 offsets to VRAM or SYS_MEM addresses. |
86 | 107 | ||
87 | // Determine location of BAR2 instance block | 108 | // Determine location of BAR2 instance block |
88 | if ((bar2_block.raw = nvdebug_readl(g, NV_PBUS_BAR2_BLOCK)) == -1) { | 109 | if ((g->chip_id >= NV_CHIP_ID_HOPPER && g->chip_id < NV_CHIP_ID_ADA) || g->chip_id >= NV_CHIP_ID_BLACKWELL) { |
89 | printk(KERN_ERR "[nvdebug] Unable to read BAR2/3 configuration! BAR2/3 inaccessible.\n"); | 110 | // Register layout updated on Hopper and Blackwell+ to support 52-bit |
90 | return -EOPNOTSUPP; | 111 | // instance block pointers (vs. 40 bits before) |
112 | bar_config_block_gh100_t bar2_block; | ||
113 | if ((bar2_block.raw = nvdebug_readq(g, NV_VIRTUAL_FUNCTION_PRIV_FUNC_BAR2_BLOCK)) == -1) { | ||
114 | printk(KERN_ERR "[nvdebug] Unable to read BAR2/3 configuration! BAR2/3 inaccessible.\n"); | ||
115 | return -EOPNOTSUPP; | ||
116 | } | ||
117 | bar2_ptr = (uint64_t)bar2_block.ptr << 12; | ||
118 | bar2_target = bar2_block.target; | ||
119 | bar2_is_virtual = bar2_block.is_virtual; | ||
120 | } else { | ||
121 | bar_config_block_t bar2_block; | ||
122 | if ((bar2_block.raw = nvdebug_readl(g, NV_PBUS_BAR2_BLOCK)) == -1) { | ||
123 | printk(KERN_ERR "[nvdebug] Unable to read BAR2/3 configuration! BAR2/3 inaccessible.\n"); | ||
124 | return -EOPNOTSUPP; | ||
125 | } | ||
126 | bar2_ptr = (uint64_t)bar2_block.ptr << 12; | ||
127 | bar2_target = bar2_block.target; | ||
128 | bar2_is_virtual = bar2_block.is_virtual; | ||
91 | } | 129 | } |
92 | printk(KERN_INFO "[nvdebug] BAR2 inst block @ %llx in %s's %s address space.\n", ((u64)bar2_block.ptr) << 12, target_to_text(bar2_block.target), bar2_block.is_virtual ? "virtual" : "physical"); | 130 | printk(KERN_INFO "[nvdebug] BAR2 inst block @ %llx in %s's %s address space.\n", bar2_ptr, target_to_text(bar2_target), bar2_is_virtual ? "virtual" : "physical"); |
93 | // Setup PRAMIN to point at the BAR2 instance block | 131 | // Setup PRAMIN to point at the BAR2 instance block |
94 | if ((ret = addr_to_pramin_mut(g, (uint64_t)bar2_block.ptr << 12, bar2_block.target)) < 0) { | 132 | // TODO: This won't work if the instance block is in SYS_MEM on Hopper or |
95 | printk(KERN_ERR "[nvdebug] Invalid BAR2/3 Instance Block configuration! BAR2/3 inaccessible.\n"); | 133 | // Blackwell+. Going through the I/O MMU appears to be fairly |
134 | // reliable, so I need to switch to using that logic whenever | ||
135 | // SYS_MEM may be accessed. | ||
136 | if ((ret = addr_to_pramin_mut(g, bar2_ptr, bar2_target)) < 0) { | ||
137 | printk(KERN_ERR "[nvdebug] Unable to access BAR2/3 Instance Block configuration via PRAMIN! BAR2/3 inaccessible.\n"); | ||
96 | return ret; | 138 | return ret; |
97 | } | 139 | } |
98 | printk(KERN_INFO "[nvdebug] BAR2 inst block at off %x in PRAMIN\n", ret); | ||
99 | // Pull the page directory base configuration from the instance block | 140 | // Pull the page directory base configuration from the instance block |
100 | if ((pd->raw = nvdebug_readq(g, NV_PRAMIN + ret + NV_PRAMIN_PDB_CONFIG_OFF)) == -1) { | 141 | if ((pd->raw = nvdebug_readq(g, NV_PRAMIN + ret + NV_PRAMIN_PDB_CONFIG_OFF)) == -1) { |
101 | printk(KERN_ERR "[nvdebug] Unable to read BAR2/3 PDB configuration! BAR2/3 inaccessible.\n"); | 142 | printk(KERN_ERR "[nvdebug] Unable to read BAR2/3 PDB configuration! BAR2/3 inaccessible.\n"); |
@@ -484,6 +484,7 @@ typedef union { | |||
484 | #define NV_CHIP_ID_AMPERE 0x170 | 484 | #define NV_CHIP_ID_AMPERE 0x170 |
485 | #define NV_CHIP_ID_HOPPER 0x180 | 485 | #define NV_CHIP_ID_HOPPER 0x180 |
486 | #define NV_CHIP_ID_ADA 0x190 | 486 | #define NV_CHIP_ID_ADA 0x190 |
487 | #define NV_CHIP_ID_BLACKWELL 0x1A0 | ||
487 | 488 | ||
488 | inline static const char* ARCH2NAME(uint32_t arch) { | 489 | inline static const char* ARCH2NAME(uint32_t arch) { |
489 | switch (arch) { | 490 | switch (arch) { |
@@ -521,14 +522,18 @@ inline static const char* ARCH2NAME(uint32_t arch) { | |||
521 | return "Turing"; | 522 | return "Turing"; |
522 | case 0x17: | 523 | case 0x17: |
523 | return "Ampere"; | 524 | return "Ampere"; |
524 | case 0x18: | 525 | case 0x18: // Despite the Chip ID, Hopper functionally succeeds Ada |
525 | return "Hopper"; | 526 | return "Hopper"; |
526 | case 0x19: | 527 | case 0x19: |
527 | return "Ada Lovelace"; | 528 | return "Ada Lovelace"; |
528 | case 0x20: | 529 | case 0x1A: |
529 | return "Blackwell (?)"; | 530 | return "Blackwell"; |
531 | case 0x1B: | ||
532 | return "Rubin (?)"; | ||
533 | case 0x1F: // NVIDIA-internal simulator | ||
534 | return "AMODEL"; | ||
530 | default: | 535 | default: |
531 | if (arch < 0x19) | 536 | if (arch < 0x1A) |
532 | return "[unknown historical architecture]"; | 537 | return "[unknown historical architecture]"; |
533 | else | 538 | else |
534 | return "[future]"; | 539 | return "[future]"; |
@@ -881,25 +886,57 @@ union reg_range { | |||
881 | #define NV_CE_PCE_MAP_SIZE 32 | 886 | #define NV_CE_PCE_MAP_SIZE 32 |
882 | 887 | ||
883 | 888 | ||
884 | /* Location of the 1Kb instance block with page tables for BAR1 and BAR2. | 889 | /* Location of the 1Kb instance block with page tables for the BAR1/2 regions. |
885 | Support: Fermi+ (?), Pascal | 890 | |
891 | On the H100, the "BAR1 block" describes what is actually BAR2, and the | ||
892 | "BAR2 block" describes BAR4. | ||
893 | |||
894 | PTR : Upper 28 bits of the 40-bit (4KiB-aligned) address at which the | ||
895 | instance block configuration for the listed BAR region is located. | ||
896 | |||
897 | "Hopper+ uses 64-bit BARs, so GPU BAR2 should be at BAR4/5 and GPU BAR1 is at | ||
898 | BAR2/3" (open-gpu-kernel-modules) | ||
886 | */ | 899 | */ |
900 | // Support: Fermi through Ampere, Ada | ||
887 | #define NV_PBUS_BAR1_BLOCK 0x00001704 | 901 | #define NV_PBUS_BAR1_BLOCK 0x00001704 |
888 | #define NV_PBUS_BAR2_BLOCK 0x00001714 | 902 | #define NV_PBUS_BAR2_BLOCK 0x00001714 |
889 | typedef union { | 903 | typedef union { |
890 | struct { | 904 | struct { |
891 | uint32_t ptr:28; | 905 | uint32_t ptr:28; |
892 | enum INST_TARGET target:2; | 906 | enum INST_TARGET target:2; |
893 | uint32_t padding0:1; | 907 | uint32_t :1; |
894 | bool is_virtual:1; | 908 | bool is_virtual:1; |
895 | } __attribute__((packed)); | 909 | } __attribute__((packed)); |
896 | uint32_t raw; | 910 | uint32_t raw; |
897 | struct { | 911 | struct { |
898 | uint32_t map:30; | 912 | uint32_t map:30; |
899 | uint32_t padding1:2; | 913 | uint32_t :2; |
900 | } __attribute__((packed)); | 914 | } __attribute__((packed)); |
901 | } bar_config_block_t; | 915 | } bar_config_block_t; |
902 | 916 | ||
917 | // Support: Hopper, Blackwell+ | ||
918 | // This is a "VREG" (virtual register?) in the documentation, meaning that it | ||
919 | // needs the VREG base added first. | ||
920 | #define NV_VIRTUAL_FUNCTION_FULL_PHYS_OFFSET 0x00B80000 | ||
921 | #define NV_VIRTUAL_FUNCTION_PRIV_FUNC_BAR2_BLOCK (NV_VIRTUAL_FUNCTION_FULL_PHYS_OFFSET+0x00000F70) | ||
922 | typedef union { | ||
923 | struct { | ||
924 | bool is_pending:1; | ||
925 | bool is_outstanding:1; | ||
926 | uint32_t :7; | ||
927 | bool is_virtual:1; | ||
928 | enum INST_TARGET target:2; | ||
929 | uint64_t ptr:40; | ||
930 | uint32_t :12; | ||
931 | } __attribute__((packed)); | ||
932 | uint64_t raw; | ||
933 | struct { | ||
934 | uint32_t :10; | ||
935 | uint32_t map:22; | ||
936 | uint32_t :32; | ||
937 | } __attribute__((packed)); | ||
938 | } bar_config_block_gh100_t; | ||
939 | |||
903 | /* BAR0 PRAMIN (Private RAM Instance) window configuration | 940 | /* BAR0 PRAMIN (Private RAM Instance) window configuration |
904 | One of the oldest ways to access video memory on NVIDIA GPUs is by using | 941 | One of the oldest ways to access video memory on NVIDIA GPUs is by using |
905 | a configurable 1MB window into VRAM which is mapped into BAR0 (register) | 942 | a configurable 1MB window into VRAM which is mapped into BAR0 (register) |
@@ -914,21 +951,25 @@ typedef union { | |||
914 | TARGET : Which address space BASE points into | 951 | TARGET : Which address space BASE points into |
915 | 952 | ||
916 | Note: This seems to be set to 0x0bff00000 - 0x0c0000000 at least sometimes | 953 | Note: This seems to be set to 0x0bff00000 - 0x0c0000000 at least sometimes |
917 | |||
918 | Support: Tesla 2.0, Fermi, Kepler, Maxwell, Pascal, Turing, Ampere | ||
919 | */ | 954 | */ |
955 | // Support: Tesla 2.0, Fermi, Kepler, Maxwell, Pascal, Turing, Ampere, Ada | ||
920 | #define NV_PBUS_BAR0_WINDOW 0x00001700 | 956 | #define NV_PBUS_BAR0_WINDOW 0x00001700 |
921 | #define NV_PRAMIN 0x00700000 // Goes until 0x00800000 (1MB window) | 957 | // On Hopper and Blackwell+, TARGET must always be 0 (VIDMEM) |
922 | #define NV_PRAMIN_LEN 0x00100000 | 958 | // Support: Hopper, Blackwell+ |
959 | #define NV_XAL_EP_BAR0_WINDOW_BASE 0x0010fd40 | ||
923 | typedef union { | 960 | typedef union { |
924 | struct { | 961 | struct { |
925 | uint32_t base:24; | 962 | uint32_t base:24; |
926 | enum INST_TARGET target:2; | 963 | enum INST_TARGET target:2; |
927 | uint32_t padding0:6; | 964 | uint32_t :6; |
928 | } __attribute__((packed)); | 965 | } __attribute__((packed)); |
929 | uint32_t raw; | 966 | uint32_t raw; |
930 | } bar0_window_t; | 967 | } bar0_window_t; |
931 | 968 | ||
969 | // Support: Tesla 2.0 through (at least) Blackwell | ||
970 | #define NV_PRAMIN 0x00700000 // Goes until 0x00800000 (1MB window) | ||
971 | #define NV_PRAMIN_LEN 0x00100000 | ||
972 | |||
932 | // Support: Tesla 2.0, Fermi, Kepler, Maxwell, Pascal, Turing, Ampere | 973 | // Support: Tesla 2.0, Fermi, Kepler, Maxwell, Pascal, Turing, Ampere |
933 | #define NV_PRAMIN_PDB_CONFIG_OFF 0x200 | 974 | #define NV_PRAMIN_PDB_CONFIG_OFF 0x200 |
934 | typedef union { | 975 | typedef union { |