aboutsummaryrefslogtreecommitdiffstats
path: root/nvdebug.h
diff options
context:
space:
mode:
Diffstat (limited to 'nvdebug.h')
-rw-r--r--nvdebug.h719
1 files changed, 673 insertions, 46 deletions
diff --git a/nvdebug.h b/nvdebug.h
index 9ac71da..1882756 100644
--- a/nvdebug.h
+++ b/nvdebug.h
@@ -5,14 +5,18 @@
5// TODO(jbakita): Don't depend on these. 5// TODO(jbakita): Don't depend on these.
6#include <nvgpu/gk20a.h> // For struct gk20a 6#include <nvgpu/gk20a.h> // For struct gk20a
7#include <os/linux/os_linux.h> // For struct nvgpu_os_linux 7#include <os/linux/os_linux.h> // For struct nvgpu_os_linux
8#include <linux/proc_fs.h> // For PDE_DATA() macro
8 9
9/* Runlist Channel 10/* Runlist Channel
10 A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue 11 A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue
11 of GPU commands. These commands are typically queued from userspace. 12 of GPU commands. These commands are typically queued from userspace.
12 13
13 `INST_PTR` points to a GPU Instance Block which contains pointers to the GPU 14 Prior to Volta, channels could also exist independent of a TSG. These are
14 virtual address space for this context. All channels in a TSG point to the 15 called "bare channels" in the Jetson nvgpu driver.
15 same GPU Instance Block (?). 16
17 `INST_PTR` points to a GPU Instance Block which contains FIFO states, virtual
18 address space configuration for this context, and a pointer to the page
19 tables. All channels in a TSG point to the same GPU Instance Block (?).
16 20
17 "RUNQUEUE_SELECTOR determines to which runqueue the channel belongs, and 21 "RUNQUEUE_SELECTOR determines to which runqueue the channel belongs, and
18 thereby which PBDMA will run the channel. Increasing values select 22 thereby which PBDMA will run the channel. Increasing values select
@@ -30,7 +34,13 @@
30 ENTRY_TYPE (T) : type of this entry: ENTRY_TYPE_CHAN 34 ENTRY_TYPE (T) : type of this entry: ENTRY_TYPE_CHAN
31 CHID (ID) : identifier of the channel to run (overlays ENTRY_ID) 35 CHID (ID) : identifier of the channel to run (overlays ENTRY_ID)
32 RUNQUEUE_SELECTOR (Q) : selects which PBDMA should run this channel if 36 RUNQUEUE_SELECTOR (Q) : selects which PBDMA should run this channel if
33 more than one PBDMA is supported by the runlist 37 more than one PBDMA is supported by the runlist,
38 additionally, "A value of 0 targets the first FE
39 pipe, which can process all FE driven engines:
40 Graphics, Compute, Inline2Memory, and TwoD. A value
41 of 1 targets the second FE pipe, which can only
42 process Compute work. Note that GRCE work is allowed
43 on either runqueue.)"
34 44
35 INST_PTR_LO : lower 20 bits of the 4k-aligned instance block pointer 45 INST_PTR_LO : lower 20 bits of the 4k-aligned instance block pointer
36 INST_PTR_HI : upper 32 bit of instance block pointer 46 INST_PTR_HI : upper 32 bit of instance block pointer
@@ -39,6 +49,9 @@
39 USERD_PTR_LO : upper 24 bits of the low 32 bits, of the 512-byte-aligned USERD pointer 49 USERD_PTR_LO : upper 24 bits of the low 32 bits, of the 512-byte-aligned USERD pointer
40 USERD_PTR_HI : upper 32 bits of USERD pointer 50 USERD_PTR_HI : upper 32 bits of USERD pointer
41 USERD_TARGET (TGU) : aperture of the USERD data structure 51 USERD_TARGET (TGU) : aperture of the USERD data structure
52
53 Channels were around since at least Fermi, but were rearranged with Volta to
54 add a USERD pointer, a longer INST pointer, and a runqueue selector flag.
42*/ 55*/
43enum ENTRY_TYPE {ENTRY_TYPE_CHAN = 0, ENTRY_TYPE_TSG = 1}; 56enum ENTRY_TYPE {ENTRY_TYPE_CHAN = 0, ENTRY_TYPE_TSG = 1};
44enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3}; 57enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3};
@@ -52,11 +65,12 @@ static inline char* target_to_text(enum INST_TARGET t) {
52 return "SYS_MEM_NONCOHERENT"; 65 return "SYS_MEM_NONCOHERENT";
53 default: 66 default:
54 printk(KERN_WARNING "[nvdebug] Invalid aperture!\n"); 67 printk(KERN_WARNING "[nvdebug] Invalid aperture!\n");
55 return NULL; 68 return "INVALID";
56 } 69 }
57} 70}
58 71
59struct runlist_chan { 72// Support: Volta, Ampere, Turing
73struct gv100_runlist_chan {
60// 0:63 74// 0:63
61 enum ENTRY_TYPE entry_type:1; 75 enum ENTRY_TYPE entry_type:1;
62 uint32_t runqueue_selector:1; 76 uint32_t runqueue_selector:1;
@@ -71,6 +85,20 @@ struct runlist_chan {
71 uint32_t inst_ptr_hi:32; 85 uint32_t inst_ptr_hi:32;
72} __attribute__((packed)); 86} __attribute__((packed));
73 87
88// Support: Fermi, Kepler*, Maxwell, Pascal
89// *In Kepler, inst fields may be unpopulated?
90struct gm107_runlist_chan {
91 uint32_t chid:12;
92 uint32_t padding0:1;
93 enum ENTRY_TYPE entry_type:1;
94 uint32_t padding1:18;
95 uint32_t inst_ptr_lo:20;
96 enum INST_TARGET inst_target:2; // Totally guessing on this
97 uint32_t padding2:10;
98} __attribute__((packed));
99
100#define gk110_runlist_chan gm107_runlist_chan
101
74/* Runlist TSG (TimeSlice Group) 102/* Runlist TSG (TimeSlice Group)
75 The runlist is composed of timeslice groups (TSG). Each TSG corresponds 103 The runlist is composed of timeslice groups (TSG). Each TSG corresponds
76 to a single virtual address space on the GPU and contains `TSG_LENGTH` 104 to a single virtual address space on the GPU and contains `TSG_LENGTH`
@@ -85,8 +113,15 @@ struct runlist_chan {
85 TIMESLICE_TIMEOUT : timeout amount for the TSG's timeslice 113 TIMESLICE_TIMEOUT : timeout amount for the TSG's timeslice
86 TSG_LENGTH : number of channels that are part of this timeslice group 114 TSG_LENGTH : number of channels that are part of this timeslice group
87 TSGID : identifier of the Timeslice group (overlays ENTRY_ID) 115 TSGID : identifier of the Timeslice group (overlays ENTRY_ID)
116
117 TSGs appear to have been introduced with Kepler and stayed the same until
118 they were rearranged at the time of channel rearrangement to support longer
119 GPU instance addresses with Volta.
88*/ 120*/
89struct entry_tsg { 121
122// Support: Volta, Ampere*, Turing*
123// *These treat the top 8 bits of TSGID as GFID (unused)
124struct gv100_runlist_tsg {
90// 0:63 125// 0:63
91 enum ENTRY_TYPE entry_type:1; 126 enum ENTRY_TYPE entry_type:1;
92 uint64_t padding:15; 127 uint64_t padding:15;
@@ -101,14 +136,28 @@ struct entry_tsg {
101} __attribute__((packed)); 136} __attribute__((packed));
102#define MAX_TSGID (1 << 12) 137#define MAX_TSGID (1 << 12)
103 138
139// Support: Kepler (v2?), Maxwell, Pascal
140// Same fields as Volta except tsg_length is 6 bits rather than 8
141// Last 32 bits appear to contain an undocumented inst ptr
142struct gk110_runlist_tsg {
143 uint32_t tsgid:12;
144 uint32_t padding0:1;
145 enum ENTRY_TYPE entry_type:1;
146 uint32_t timeslice_scale:4;
147 uint32_t timeslice_timeout:8;
148 uint32_t tsg_length:6;
149 uint32_t padding1:32;
150} __attribute__((packed));
151
152
104enum PREEMPT_TYPE {PREEMPT_TYPE_CHANNEL = 0, PREEMPT_TYPE_TSG = 1}; 153enum PREEMPT_TYPE {PREEMPT_TYPE_CHANNEL = 0, PREEMPT_TYPE_TSG = 1};
105 154
106/* Preempt a TSG or Channel by ID 155/* Preempt a TSG or Channel by ID
107 ID/CHID : Id of TSG or channel to preempt 156 ID/CHID : Id of TSG or channel to preempt
108 IS_PENDING : ???? 157 IS_PENDING : Is a context switch pending?
109 TYPE : PREEMPT_TYPE_CHANNEL or PREEMPT_TYPE_TSG 158 TYPE : PREEMPT_TYPE_CHANNEL or PREEMPT_TYPE_TSG
110 159
111 Support: Kepler, Maxwell, Pascal, Volta 160 Support: Kepler, Maxwell, Pascal, Volta, Turing
112*/ 161*/
113#define NV_PFIFO_PREEMPT 0x00002634 162#define NV_PFIFO_PREEMPT 0x00002634
114typedef union { 163typedef union {
@@ -195,26 +244,36 @@ typedef union {
195 */ 244 */
196 245
197// Note: This is different with Turing 246// Note: This is different with Turing
198// Support: Kepler, Maxwell, Pascal, Volta 247// Support: Fermi, Kepler, Maxwell, Pascal, Volta
199#define NV_PFIFO_RUNLIST_BASE 0x00002270 248#define NV_PFIFO_RUNLIST_BASE 0x00002270
249#define NV_PFIFO_ENG_RUNLIST_BASE(i) (0x00002280+(i)*8)
200typedef union { 250typedef union {
201 struct { 251 struct {
202 uint32_t ptr:28; 252 uint32_t ptr:28;
203 uint32_t type:2; 253 enum INST_TARGET target:2;
204 uint32_t padding:2; 254 uint32_t padding:2;
205 } __attribute__((packed)); 255 } __attribute__((packed));
206 uint32_t raw; 256 uint32_t raw;
207} runlist_base_t; 257} runlist_base_t;
208 258
209// Support: Kepler, Maxwell, Pascal, Volta 259// Support: Kepler, Maxwell, Pascal, Volta
260// Works on Fermi, but id is one bit longer and is b11111
210#define NV_PFIFO_RUNLIST 0x00002274 261#define NV_PFIFO_RUNLIST 0x00002274
262#define NV_PFIFO_ENG_RUNLIST(i) (0x00002284+(i)*8)
211typedef union { 263typedef union {
264 // RUNLIST fields
212 struct { 265 struct {
213 uint32_t len:16; 266 uint32_t len:16;
214 uint32_t padding:4; 267 uint32_t padding:4;
215 uint32_t id:4; 268 uint32_t id:4; // Runlist ID (each engine may have a separate runlist)
216 uint32_t padding2:8; 269 uint32_t padding2:8;
217 } __attribute__((packed)); 270 } __attribute__((packed));
271 // ENG_RUNLIST fields that differ
272 struct {
273 uint32_t padding3:20;
274 bool is_pending:1; // Is runlist not yet committed?
275 uint32_t padding4:11;
276 } __attribute__((packed));
218 uint32_t raw; 277 uint32_t raw;
219} runlist_info_t; 278} runlist_info_t;
220 279
@@ -301,63 +360,631 @@ typedef union {
301 uint32_t raw; 360 uint32_t raw;
302} runlist_disable_t; 361} runlist_disable_t;
303 362
363/* Read GPU descriptors from the Master Controller (MC)
364
365 MINOR_REVISION : Legacy (only used with Celvin in Nouveau)
366 MAJOR_REVISION : Legacy (only used with Celvin in Nouveau)
367 IMPLEMENTATION : Which implementation of the GPU architecture
368 ARCHITECTURE : Which GPU architecture
369
370 CHIP_ID = IMPLEMENTATION + ARCHITECTURE << 4
371 CHIP_ID : Unique ID of all chips since Kelvin
372
373 Support: Kelvin, Rankine, Curie, Tesla, Fermi, Kepler, Maxwell, Pascal,
374 Volta, Turing, Ampere
375*/
376#define NV_MC_BOOT_0 0x00000000
377#define NV_CHIP_ID_GP106 0x136 // Discrete GeForce GTX 1060
378#define NV_CHIP_ID_GV11B 0x15B // Jetson Xavier embedded GPU
379#define NV_CHIP_ID_KEPLER 0x0E0
380#define NV_CHIP_ID_VOLTA 0x140
381
/* Map a GPU architecture number to a human-readable family name.
 *
 * arch: value of the `architecture` field of NV_MC_BOOT_0 (see mc_boot_0_t).
 *
 * Returns a pointer to a string literal; never NULL. Numbers that are not
 * listed yield "[unknown historical architecture]" when below 0x19 and
 * "[future]" otherwise.
 */
inline static const char* ARCH2NAME(uint32_t arch) {
	switch (arch) {
	case 0x01:
		return "Celsius";
	case 0x02:
		return "Kelvin";
	case 0x03:
		return "Rankine";
	case 0x04:
	case 0x06: // 0x06 is (nForce 6XX integrated only)
		return "Curie";
	// 0x07 is unused/skipped
	case 0x05: // First Tesla card was released before the nForce 6XX
	case 0x08:
	case 0x09:
	case 0x0A:
		return "Tesla";
	// 0x0B is unused/skipped
	case 0x0C:
	case 0x0D:
		return "Fermi";
	case 0x0E:
	case 0x0F:
	case 0x11:
		return "Kepler";
	case 0x12:
		return "Maxwell";
	case 0x13:
		return "Pascal";
	case 0x14:
	case 0x15: // Volta integrated
		return "Volta";
	case 0x16:
		return "Turing";
	case 0x17:
		return "Ampere";
	case 0x18:
	case 0x19:
		return "Hopper (?) or Lovelace (?)";
	default:
		// Gaps in the numbering below the newest known value are
		// reported as historical; anything above it as future.
		if (arch < 0x19)
			return "[unknown historical architecture]";
		else
			return "[future]";
	}
}
428
429typedef union {
430 // Fields as defined in the NVIDIA reference
431 struct {
432 uint32_t minor_revision:4;
433 uint32_t major_revision:4;
434 uint32_t reserved:4;
435 uint32_t padding0:8;
436 uint32_t implementation:4;
437 uint32_t architecture:5;
438 uint32_t padding1:3;
439 } __attribute__((packed));
440 uint32_t raw;
441 // Arch << 4 + impl is also often used
442 struct {
443 uint32_t padding2:20;
444 uint32_t chip_id:9;
445 uint32_t padding3:3;
446 } __attribute__((packed));
447} mc_boot_0_t;
448
449enum DEVICE_INFO_TYPE {INFO_TYPE_NOT_VALID = 0, INFO_TYPE_DATA = 1, INFO_TYPE_ENUM = 2, INFO_TYPE_ENGINE_TYPE = 3};
450enum ENGINE_TYPES {
451 ENGINE_GRAPHICS = 0, // GRAPHICS [/compute]
452 ENGINE_COPY0 = 1, // [raw/physical] COPY #0
453 ENGINE_COPY1 = 2, // [raw/physical] COPY #1
454 ENGINE_COPY2 = 3, // [raw/physical] COPY #2
455
456 ENGINE_MSPDEC = 8, // Picture DECoder
457 ENGINE_MSPPP = 9, // [Video] Post Processing
458 ENGINE_MSVLD = 10, // [Video] Variable Length Decoder
459 ENGINE_MSENC = 11, // [Video] ENCoding
460 ENGINE_VIC = 12, // Video Image Compositor
461 ENGINE_SEC = 13, // SEquenCer [?]
462 ENGINE_NVENC0 = 14, // Nvidia Video ENCoder #0
463 ENGINE_NVENC1 = 15, // Nvidia Video ENCoder #1
464 ENGINE_NVDEC = 16, // Nvidia Video DECoder
465
466 ENGINE_IOCTRL = 18, // I/O ConTRoLler [of NVLINK at least]
467 ENGINE_LCE = 19, // Logical Copy Engine
468 ENGINE_GSP = 20, // Gpu System Processor
469 ENGINE_NVJPG = 21, // NVidia JPeG [Decoder] (Ampere+)
470};
471#define ENGINE_TYPES_LEN 22
472static const char* const ENGINE_TYPES_NAMES[ENGINE_TYPES_LEN] = {
473 "Graphics/Compute",
474 "COPY0",
475 "COPY1",
476 "COPY2",
477 "Unknown Engine ID#4",
478 "Unknown Engine ID#5",
479 "Unknown Engine ID#6",
480 "Unknown Engine ID#7",
481 "MSPDEC: Picture Decoder",
482 "MSPPP: Post Processing",
483 "MSVLD: Variable Length Decoder",
484 "MSENC: Encoder",
485 "VIC: Video Image Compositor",
486 "SEC: Sequencer",
487 "NVENC0: NVIDIA Video Encoder #0",
488 "NVENC1: NVIDIA Video Encoder #1",
489 "NVDEC: NVIDIA Video Decoder",
490 "Unknown Engine ID#17",
491 "IOCTRL: I/O Controller",
492 "LCE: Logical Copy Engine",
493 "GSP: GPU System Processor",
494 "NVJPG: NVIDIA JPEG Decoder",
495};
496
497/* GPU engine information and control register offsets
498 Each engine is described by one or more entries (terminated by an entry with
499 the `has_next_entry` flag unset) in the fixed-size PTOP_DEVICE_INFO table. A
500 typical device, such as the graphics/compute engine and any copy engines, are
501 described by three entries, one of each type.
502
503 The PTOP_DEVICE_INFO table is sparsely populated (entries of type
504 INFO_TYPE_NOT_VALID may be intermingled with valid entries), so any traversal
505 code should check all NV_PTOP_DEVICE_INFO__SIZE_1 entries and not terminate
506 upon reaching the first entry of INFO_TYPE_NOT_VALID.
507
508 INFO_TYPE : Is this a DATA, ENUM, or ENGINE_TYPE table entry?
509 HAS_NEXT_ENTRY : Does the following entry refer to the same engine?
510
511 == INFO_TYPE_DATA fields ==
512 PRI_BASE : BAR0 base = (PRI_BASE << 12) aka 4k aligned.
513 INST_ID : "Note that some instanced [engines] (such as logical copy
514 engines aka LCE) share a PRI_BASE across all [engines] of
515 the same engine type; such [engines] require an additional
516 offset: instanced base = BAR0 base + stride * INST_ID.
517 FAULT_ID_IS_VALID : Does this engine have its own bind point and fault ID
518 with the MMU?
519 FAULT_ID : "The MMU fault id used by this [engine]. These IDs
520 correspond to the NV_PFAULT_MMU_ENG_ID define list."
521
522 == INFO_TYPE_ENUM fields ==
523 ENGINE_IS_VALID : Is this engine a host engine?
524 ENGINE_ENUM : "[T]he host engine ID for the current [engine] if it is
525 a host engine, meaning Host can send methods to the
526 engine. This id is used to index into any register array
527 whose __SIZE_1 is equal to NV_HOST_NUM_ENGINES. A given
528 ENGINE_ENUM can be present for at most one device in the
529 table. Devices corresponding to all ENGINE_ENUM ids 0
530 through NV_HOST_NUM_ENGINES - 1 must be present in the
531 device info table."
532 RUNLIST_IS_VALID : Is this engine a host engine with a runlist?
533 RUNLIST_ENUM : "[T]he Host runlist ID on which methods for the current
534 [engine] should be submitted... The runlist id is used to
535 index into any register array whose __SIZE_1 is equal to
536 NV_HOST_NUM_RUNLISTS. [Engines] corresponding to all
537 RUNLIST_ENUM ids 0 through NV_HOST_NUM_RUNLISTS - 1 must
538 be present in the device info table."
539 INTR_IS_VALID : Does this device have an interrupt?
540 INTR_ENUM : Interrupt ID for use with "the NV_PMC_INTR_*_DEVICE
541 register bitfields."
542 RESET_IS_VALID : Does this engine have a reset ID?
543 RESET_ENUM : Reset ID for use indexing the "NV_PMC_ENABLE_DEVICE(i)
544 and NV_PMC_ELPG_ENABLE_DEVICE(i) register bitfields."
545
546 == INFO_TYPE_ENGINE_TYPE fields ==
547 ENGINE_TYPE : What type of engine is this? (see ENGINE_TYPES_NAMES)
548
549 Support: Kepler, Maxwell, Pascal, Volta, Ampere
550 See dev_top.ref.txt of NVIDIA's open-gpu-doc for more info.
551*/
552#define NV_PTOP_DEVICE_INFO(i) (0x00022700+(i)*4)
553#define NV_PTOP_DEVICE_INFO__SIZE_1 64
554typedef union {
555 // DATA type fields
556 struct {
557 enum DEVICE_INFO_TYPE info_type:2;
558 bool fault_id_is_valid:1;
559 uint32_t fault_id:7;
560 uint32_t padding0:2;
561 uint32_t pri_base:12;
562 uint32_t padding1:2;
563 uint32_t inst_id:4;
564 uint32_t is_not_enum2:1;
565 bool has_next_entry:1;
566 } __attribute__((packed));
567 // ENUM type fields
568 struct {
569 uint32_t padding2:2;
570 bool reset_is_valid:1;
571 bool intr_is_valid:1;
572 bool runlist_is_valid:1;
573 bool engine_is_valid:1;
574 uint32_t padding3:3;
575 uint32_t reset_enum:5;
576 uint32_t padding4:1;
577 uint32_t intr_enum:5;
578 uint32_t padding5:1;
579 uint32_t runlist_enum:4;
580 uint32_t padding6:1;
581 uint32_t engine_enum:4;
582 uint32_t padding7:2;
583 } __attribute__((packed));
584 // ENGINE_TYPE type fields
585 struct {
586 uint32_t padding8:2;
587 enum ENGINE_TYPES engine_type:29;
588 uint32_t padding9:1;
589 } __attribute__((packed));
590 uint32_t raw;
591} ptop_device_info_t;
592
593#define NV_PTOP_SCAL_NUM_GPCS 0x00022430
594#define NV_PTOP_SCAL_NUM_TPC_PER_GPC 0x00022434
595#define NV_PTOP_SCAL_NUM_CES 0x00022444
596// PCE_MAP is Volta+ only
597#define NV_CE_PCE_MAP 0x00104028
598
599// GPC and TPC masks
600// Support: Maxwell+
601#define NV_FUSE_GPC 0x00021c1c
602#define NV_FUSE_TPC_FOR_GPC(i) (0x00021c38+(i)*4)
603
604/* Location of the 1Kb instance block with page tables for BAR1 and BAR2.
605 Support: Fermi+ (?), Pascal
606*/
607#define NV_PBUS_BAR1_BLOCK 0x00001704
608#define NV_PBUS_BAR2_BLOCK 0x00001714
609typedef union {
610 struct {
611 uint32_t ptr:28;
612 enum INST_TARGET target:2;
613 uint32_t padding0:1;
614 bool is_virtual:1;
615 } __attribute__((packed));
616 uint32_t raw;
617 struct {
618 uint32_t map:30;
619 uint32_t padding1:2;
620 } __attribute__((packed));
621} bar_config_block_t;
622
623/* BAR0 PRAMIN (Private RAM Instance) window configuration
624
625 BASE : Base of window >> 16 in [TARGET] virtual address space
626 TARGET : Which address space BASE points into
627
628 Note: This seems to be set to 0x0bff00000 - 0x0c0000000 at least sometimes
629
630 Support: Tesla 2.0, Fermi, Kepler, Maxwell, Pascal, Turing, Ampere
631*/
632#define NV_PBUS_BAR0_WINDOW 0x00001700
633#define NV_PRAMIN 0x00700000 // Goes until 0x00800000 (1MB window)
634#define NV_PRAMIN_LEN 0x00100000
635typedef union {
636 struct {
637 uint32_t base:24;
638 enum INST_TARGET target:2;
639 uint32_t padding0:6;
640 } __attribute__((packed));
641 uint32_t raw;
642} bar0_window_t;
643
644// Support: Tesla 2.0, Fermi, Kepler, Maxwell, Pascal, Turing, Ampere
645#define NV_PRAMIN_PDB_CONFIG_OFF 0x200
646typedef union {
647 struct {
648 uint32_t target:2;
649 uint32_t vol:1;
650 uint32_t padding0:1;
651 uint32_t fault_replay_tex:1;
652 uint32_t fault_replay_gcc:1;
653 uint32_t padding1:4;
654 bool is_ver2:1;
655 bool is_64k_big_page:1; // 128Kb otherwise
656 uint32_t page_dir_lo:20;
657 uint32_t page_dir_hi:32;
658 } __attribute__((packed));
659 uint64_t raw;
660} page_dir_config_t;
661
662/* Page directory entry
663
664 Note: Format changed with Pascal (how?)
665
666 Support: Pascal, Volta, Turing, Ampere
667*/
668// FIXME: PDE/PTEs are actually 64 bits =S
669// Important: Aperture keys are different with PDEs
670enum PD_TARGET {
671 PD_AND_TARGET_INVALID = 0, // b000
672 PD_AND_TARGET_VID_MEM = 2, // b010
673 PD_AND_TARGET_SYS_MEM_COHERENT = 4, // b100
674 PD_AND_TARGET_SYS_MEM_NONCOHERENT = 6, // b110
675 PTE_AND_TARGET_VID_MEM = 1, // b001
676 PTE_AND_TARGET_PEER = 3, // b011
677 PTE_AND_TARGET_SYS_MEM_COHERENT = 5, // b101
678 PTE_AND_TARGET_SYS_MEM_NONCOHERENT = 7, // b111
679};
680static inline char* pd_target_to_text(enum PD_TARGET t) {
681 switch (t) {
682 case PD_AND_TARGET_INVALID:
683 return "INVALID";
684 case PD_AND_TARGET_VID_MEM:
685 case PTE_AND_TARGET_VID_MEM:
686 return "VID_MEM";
687 case PTE_AND_TARGET_PEER:
688 return "PEER";
689 case PD_AND_TARGET_SYS_MEM_COHERENT:
690 case PTE_AND_TARGET_SYS_MEM_COHERENT:
691 return "SYS_MEM_COHERENT";
692 case PD_AND_TARGET_SYS_MEM_NONCOHERENT:
693 case PTE_AND_TARGET_SYS_MEM_NONCOHERENT:
694 return "SYS_MEM_NONCOHERENT";
695 default:
696 printk(KERN_WARNING "[nvdebug] Invalid aperture!\n");
697 return NULL;
698 }
699}
700
701// PDE/PTE V2 type
702// Note: As the meaning of target (bits 2:1) changes depending on if the entry
703// is a PTE or not, this combines them into a single target field to
704// simplify comparisons.
705// Support: Pascal, Turing, Ampere
706typedef union {
707 // Page Directory Entry (PDE)
708 struct {
709 bool is_pte:1;
710 uint32_t __target:2;
711 bool is_volatile:1;
712 uint32_t padding1:4;
713 uint32_t addr:24;
714 } __attribute__((packed));
715 // Page Table Entry (PTE)
716 struct {
717 enum PD_TARGET target:3;
718 uint32_t __is_volatile:1;
719 bool is_encrypted:1;
720 bool is_privileged:1;
721 bool is_readonly:1;
722 bool atomics_disabled:1;
723 uint32_t __addr:24;
724 } __attribute__((packed));
725 uint32_t raw;
726} page_dir_entry_t;
727
728// PDE/PTE V1 types
729// Support: Fermi, Kepler, Maxwell
730enum V1_PD_TARGET {
731 PD_TARGET_INVALID = 0,
732 PD_TARGET_VID_MEM = 1,
733 PD_TARGET_SYS_MEM_COHERENT = 2,
734 PD_TARGET_SYS_MEM_NONCOHERENT = 3,
735};
736// Page Directory Entry (PDE)
737typedef union {
738// Large page fields
739 struct {
740// 0:32
741 enum V1_PD_TARGET target:2;
742 uint32_t padding0:2;
743 uint64_t addr:28; // May be wider?
744// 32:63
745 uint32_t padding2:3;
746 uint32_t is_volatile:1; // Might have counted wrong?
747 uint32_t padding3:28;
748 } __attribute__((packed));
749// Small page fields
750 struct {
751// 0:32
752 uint32_t padding00:32;
753// 32:63
754 enum V1_PD_TARGET alt_target:2;
755 uint32_t alt_is_volatile:1; // Might have counted wrong?
756 uint32_t padding03:1;
757 uint64_t alt_addr:28;
758 } __attribute__((packed));
759 uint64_t raw;
760} page_dir_entry_v1_t;
761// Page Table Entry (PTE)
762// Reconstructed from info in Jetson nvgpu driver
763typedef union {
764 struct {
765// 0:32
766 bool is_present:1;
767 bool is_privileged:1;
768 bool is_readonly:1;
769 uint32_t padding0:1;
770 uint64_t addr:28;
771// 32:63
772 bool is_volatile:1;
773 enum INST_TARGET:2;
774 uint32_t padding1:1;
775 uint32_t kind:8;
776 uint32_t comptag:17;
777 uint32_t padding2:1;
778 bool is_read_disabled:1;
779 bool is_write_disabled:1;
780 } __attribute__((packed));
781 uint64_t raw;
782} page_tbl_entry_v1_t;
783//enum V0_PDE_TYPE {NOT_PRESENT = 0, PAGE_64K = 1, PAGE_16K = 2, PAGE_4K = 3};
784//enum V0_PDE_SIZE {PDE_SZ_128K = 0, PDE_SZ_32K = 1, PDE_SZ_16K = 2, PDE_SZ_8K = 3};
785//static const int V0_PDE_SIZE2NUM[4] = {128*1024, 32*1024, 16*1024, 8*1024};
786/* PDE V0 (nv50/Tesla)
787typedef union {
788 struct {
789 enum V1_PDE_TYPE type:2;
790 enum INST_TARGET target:2;
791 uint32_t padding0:1;
792 enum V1_PDE_SIZE sublevel_size:2;
793 uint32_t padding1:5;
794 uint32_t addr:28;
795 uint32_t padding2:24;
796 } __attribute__((packed));
797 uint64_t raw;
798} page_dir_entry_v1_t;*/
799/* PTE V0 (nv50)
800typedef union {
801 struct {
802 bool is_present:1;
803 uint32_t padding3:2;
804 bool is_readonly:1;
805 enum INST_TARGET target:2;
806 bool is_privileged:1;
807 uint32_t contig_blk_sz:3;
808 uint32_t padding4:2;
809 uint32_t addr:28;
810 uint32_t storage_type:7; // ???
811 uint32_t compression_mode:2; // ???
812 uint32_t compression_tag:12; // ???
813 bool is_long_partition_cycle:1; // ???
814 bool is_encrypted:1;
815 uint32_t padding5:1;
816 } __attribute__((packed));
817 uint64_t raw;
818} page_tbl_entry_v1_t;*/
819
304// TODO(jbakita): Maybe put the above GPU types in a different file. 820// TODO(jbakita): Maybe put the above GPU types in a different file.
305 821
306#define for_chan_in_tsg(chan, tsg) \ 822#define NV_PCI_VENDOR 0x10de
307 for (chan = (struct runlist_chan*)(tsg + 1); \ 823struct nvdebug_state {
308 (void*)chan < (void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length; \ 824 // Pointer to the mapped base address of the GPU control registers (obtained
309 chan++) 825 // via ioremap() originally). For embedded GPUs, we extract this from their
826 // struct nvgpu_os_linux. For discrete GPUs, we create our own mapping of
827 // BAR0 with pci_iomap(). Access via nvgpu_readl/writel functions.
828 void __iomem *regs;
829 // Depending on the architecture, BAR2 or BAR3 are used to access PRAMIN
830 union {
831 void __iomem *bar2;
832 void __iomem *bar3;
833 };
834 int chip_id;
835 // Additional state from the built-in driver. Only set iff
836 // chip_id == NV_CHIP_ID_GV11B
837 struct gk20a *g;
838 // Pointer to PCI device needed for pci_iounmap
839 struct pci_dev *pcid;
840};
841
842/*const struct runlist_funcs {
843 u8 size;
844 enum ENTRY_TYPE (*entry_type)(struct nvdebug_state *, void *);
845 uint32_t (*chid)(struct nvdebug_state *, void *);
846 uint32_t (*inst_ptr_lo)(struct nvdebug_state *, void *);
847 enum INST_TARGET (*inst_target)(struct nvdebug_state *, void *):
848 uint32_t (*tsgid)(struct nvdebug_state *, void *);
849 uint32_t (*timeslice_scale)(struct nvdebug_state *, void *);
850 uint32_t (*timeslice_timeout)(struct nvdebug_state *, void *);
851 uint32_t (*tsg_length)(struct nvdebug_state *, void *);
852};*/
853
// This disgusting macro is a crutch to work around the fact that runlists were
// different prior to Volta.
//
// Expands to a static function named `prop` that returns field `prop` of a
// raw runlist entry, decoded with the layout for the GPU in `g`:
//   _ENTRY_TYPE : `chan` or `tsg`; pasted onto gv100_runlist_/gk110_runlist_
//   type        : return type of the generated accessor
//   prop        : field to read, and the name of the generated function
// Chips at or above NV_CHIP_ID_VOLTA use the gv100 layout, the same threshold
// NV_RL_ENTRY_SIZE() uses, so entry stride and entry decoding always agree.
// Chips above NV_CHIP_ID_KEPLER use the gk110 layout. Anything else logs a
// warning and returns 0.
#define VERSIONED_RL_ACCESSOR(_ENTRY_TYPE, type, prop) \
	__attribute__((unused)) \
	static type (prop)(const struct nvdebug_state *g, const void *raw) { \
		if (g->chip_id >= NV_CHIP_ID_VOLTA) { \
			const struct gv100_runlist_ ## _ENTRY_TYPE *entry = (const struct gv100_runlist_ ## _ENTRY_TYPE*)raw; \
			return entry->prop; \
		} else if (g->chip_id > NV_CHIP_ID_KEPLER) { \
			const struct gk110_runlist_ ## _ENTRY_TYPE *entry = (const struct gk110_runlist_ ## _ENTRY_TYPE*)raw; \
			return entry->prop; \
		} else { \
			printk(KERN_WARNING "[nvdebug] " #prop " unavailable on GPU ID %x, which is older than Kepler.\n", g->chip_id); \
			return (type)0; \
		} \
	}
870
871VERSIONED_RL_ACCESSOR(chan, uint32_t, chid);
872VERSIONED_RL_ACCESSOR(chan, uint32_t, inst_ptr_lo);
873VERSIONED_RL_ACCESSOR(chan, enum INST_TARGET, inst_target);
874VERSIONED_RL_ACCESSOR(tsg, uint32_t, tsgid);
875VERSIONED_RL_ACCESSOR(tsg, enum ENTRY_TYPE, entry_type);
876VERSIONED_RL_ACCESSOR(tsg, uint32_t, timeslice_scale);
877VERSIONED_RL_ACCESSOR(tsg, uint32_t, timeslice_timeout);
878VERSIONED_RL_ACCESSOR(tsg, uint32_t, tsg_length);
310 879
311#define next_tsg(tsg) \ 880
312 (void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length 881#define NV_RL_ENTRY_SIZE(g) \
882 ((g)->chip_id >= NV_CHIP_ID_VOLTA ? sizeof(struct gv100_runlist_tsg) : sizeof(struct gk110_runlist_tsg))
883
// Iterate `chan` over the channel entries that follow TSG entry `tsg`.
//   g    : struct nvdebug_state* for the GPU (selects the entry size)
//   chan : pointer lvalue; receives each channel entry in turn
//   tsg  : pointer to the TSG entry; its tsg_length() bounds the loop
// Every macro argument is parenthesised so that expressions such as
// `rl + 1` are cast as a whole rather than having the cast bind to `rl`.
#define for_chan_in_tsg(g, chan, tsg) \
	for ((chan) = (typeof(chan))(((u8*)(tsg)) + NV_RL_ENTRY_SIZE(g)); \
	     (u8*)(chan) < ((u8*)(tsg)) + (1 + tsg_length(g, tsg)) * NV_RL_ENTRY_SIZE(g); \
	     (chan) = (typeof(chan))(((u8*)(chan)) + NV_RL_ENTRY_SIZE(g)))
888
889#define next_tsg(g, tsg) \
890 (typeof(tsg))((u8*)(tsg) + NV_RL_ENTRY_SIZE(g) * (tsg_length(g, tsg) + 1))
313 891
314struct runlist_iter { 892struct runlist_iter {
315 struct entry_tsg *curr_tsg; 893 // Pointer to either a TSG or channel entry (they're the same size)
894 void *curr_entry;
895 // This should be set to tsg_length when a TSG is reached, and
896 // decremented as each subsequent channel is printed. This allows us to
897 // track which channels are and are not part of the TSG.
898 int channels_left_in_tsg;
899 // Total runlist length, etc
316 runlist_info_t rl_info; 900 runlist_info_t rl_info;
317}; 901};
318 902
903#define NVDEBUG_MAX_DEVICES 8
904extern struct nvdebug_state g_nvdebug_state[NVDEBUG_MAX_DEVICES];
905
319// Defined in runlist.c 906// Defined in runlist.c
320struct gk20a* get_live_gk20a(void); 907int get_runlist_iter(struct nvdebug_state *g, int rl_id, struct runlist_iter *rl_iter);
321int get_runlist_iter(struct runlist_iter *rl_iter); 908int preempt_tsg(struct nvdebug_state *g, uint32_t tsg_id);
322int preempt_tsg(uint32_t tsg_id); 909
910// Defined in mmu.c
911uint32_t vram2PRAMIN(struct nvdebug_state *g, uint64_t addr);
912void __iomem *phy2PRAMIN(struct nvdebug_state* g, uint64_t phy);
913uint64_t search_page_directory(
914 struct nvdebug_state *g,
915 void __iomem *pde_offset,
916 void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
917 uint64_t addr_to_find);
918uint64_t search_v1_page_directory(
919 struct nvdebug_state *g,
920 void __iomem *pde_offset,
921 void __iomem *(*off2addr)(struct nvdebug_state*, uint64_t),
922 uint64_t addr_to_find);
923
323 924
324static inline struct gk20a *get_gk20a(struct device *dev) { 925static inline struct gk20a *get_gk20a(struct device *dev) {
325 // XXX: Only works because gk20a* is the first member of gk20a_platform 926 // XXX: Only works because gk20a* is the first member of gk20a_platform
326 return *((struct gk20a**)dev_get_drvdata(dev)); 927 return *((struct gk20a**)dev_get_drvdata(dev));
327} 928}
328 929
329// Functionally identical to nvgpu_readl() 930// We use the data field of the proc_dir_entry ("PDE" in this function) to store
931// our index into the g_nvdebug_state array
932static inline int seq2gpuidx(struct seq_file *s) {
933 const struct file *f = s->file;
934 return (uintptr_t)PDE_DATA(file_inode(f));
935}
936static inline int file2gpuidx(const struct file *f) {
937 return (uintptr_t)PDE_DATA(file_inode(f));
938}
939static inline int file2parentgpuidx(const struct file *f) {
940 // Should be safe to call on ProcFS entries, as our parent should (?)
941 // still exist if we're called. If not, there are worse races in this
942 // module.
943 return (uintptr_t)PDE_DATA(file_dentry(f)->d_parent->d_inode);
944}
945
946#define gk20a_regs(gk20a) (container_of(gk20a, struct nvgpu_os_linux, g)->regs)
947
948// Similar to nvgpu_readl()
330// (except we don't try to resolve situations where regs is NULL) 949// (except we don't try to resolve situations where regs is NULL)
331static inline u32 nvdebug_readl(struct gk20a* g, u32 r) { 950static inline u32 nvdebug_readl(struct nvdebug_state *s, u32 r) {
332 struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g); 951 if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
333 if (unlikely(!g_os->regs)) { 952 printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n");
334 printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n"); 953 return -1;
335 return -1; 954 }
336 } 955 return readl(s->regs + r);
337 return readl(g_os->regs + r);
338} 956}
339 957
340// quadword version of nvdebug_readl() 958// quadword version of nvdebug_readl()
341static inline u64 nvdebug_readq(struct gk20a* g, u32 r) { 959static inline u64 nvdebug_readq(struct nvdebug_state *s, u32 r) {
342 struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g); 960 u64 ret;
343 u64 ret; 961 if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
344 if (unlikely(!g_os->regs)) { 962 printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n");
345 printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n"); 963 return -1;
346 return -1; 964 }
347 }
348 // readq seems to always return the uppermost 32 bits as 0, so workaround with readl 965 // readq seems to always return the uppermost 32 bits as 0, so workaround with readl
349 ret = readl(g_os->regs + r); 966 ret = readl(s->regs + r);
350 ret |= ((u64)readl(g_os->regs + r + 4)) << 32; 967 ret |= ((u64)readl(s->regs + r + 4)) << 32;
351 return ret; 968 return ret;
352} 969}
353 970
354// Functionally identical to nvgpu_writel() 971// Similar to nvgpu_writel()
355static inline void nvdebug_writel(struct gk20a* g, u32 r, u32 v) { 972static inline void nvdebug_writel(struct nvdebug_state *s, u32 r, u32 v) {
356 struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g); 973 if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
357 if (unlikely(!g_os->regs)) { 974 printk(KERN_ERR "[nvdebug] Attempted nvgpu_writel on non-existent registers!\n");
975 return;
976 }
977 writel_relaxed(v, s->regs + r);
978 wmb();
979}
980
981// quadword version of nvdebug_writel()
982// XXX: This probably doesn't work XXX: Untested
983static inline void nvdebug_writeq(struct nvdebug_state *s, u32 r, u64 v) {
984 if (unlikely(!s->regs || (s->g && !gk20a_regs(s->g)))) {
358 printk(KERN_ERR "[nvdebug] Attempted nvgpu_writel on non-existent registers!\n"); 985 printk(KERN_ERR "[nvdebug] Attempted nvgpu_writel on non-existent registers!\n");
359 return; 986 return;
360 } 987 }
361 writel_relaxed(v, g_os->regs + r); 988 writeq_relaxed(v, s->regs + r);
362 wmb(); 989 wmb();
363} 990}