/* Copyright 2021 Joshua Bakita
 * SPDX-License-Identifier: MIT
 */

// TODO(jbakita): Don't depend on these.
#include <nvgpu/gk20a.h>       // For struct gk20a
#include <os/linux/os_linux.h> // For struct nvgpu_os_linux

/* Runlist Channel
  A timeslice group (TSG) is composed of channels. Each channel is a FIFO queue
  of GPU commands. These commands are typically queued from userspace.

  `INST_PTR` points to a GPU Instance Block which contains pointers to the GPU
  virtual address space for this context. All channels in a TSG point to the
  same GPU Instance Block (?).

  "RUNQUEUE_SELECTOR determines to which runqueue the channel belongs, and
  thereby which PBDMA will run the channel. Increasing values select
  increasingly numbered PBDMA IDs serving the runlist. If the selector value
  exceeds the number of PBDMAs on the runlist, the hardware will silently
  reassign the channel to run on the first PBDMA as though RUNQUEUE_SELECTOR
  had been set to 0. (In current hardware, this is used by SCG on the graphics
  runlist only to determine which FE pipe should service a given channel. A
  value of 0 targets the first FE pipe, which can process all FE driven
  engines: Graphics, Compute, Inline2Memory, and TwoD. A value of 1 targets the
  second FE pipe, which can only process Compute work. Note that GRCE work is
  allowed on either runqueue." (NVIDIA)
  Note that it appears runqueue 1 is the default for CUDA work on the Jetson
  Xavier.

  ENTRY_TYPE (T)        : type of this entry: ENTRY_TYPE_CHAN
  CHID (ID)             : identifier of the channel to run (overlays ENTRY_ID)
  RUNQUEUE_SELECTOR (Q) : selects which PBDMA should run this channel if more
                          than one PBDMA is supported by the runlist

  INST_PTR_LO           : lower 20 bits of the 4k-aligned instance block pointer
  INST_PTR_HI           : upper 32 bits of the instance block pointer
  INST_TARGET (TGI)     : aperture of the instance block

  USERD_PTR_LO          : upper 24 bits of the lower 32 bits of the
                          512-byte-aligned USERD pointer
  USERD_PTR_HI          : upper 32 bits of the USERD pointer
  USERD_TARGET (TGU)    : aperture of the USERD data structure
*/
enum ENTRY_TYPE {ENTRY_TYPE_CHAN = 0, ENTRY_TYPE_TSG = 1};
enum INST_TARGET {TARGET_VID_MEM = 0, TARGET_SYS_MEM_COHERENT = 2, TARGET_SYS_MEM_NONCOHERENT = 3};
static inline char* target_to_text(enum INST_TARGET t) {
	switch (t) {
	case TARGET_VID_MEM:
		return "VID_MEM";
	case TARGET_SYS_MEM_COHERENT:
		return "SYS_MEM_COHERENT";
	case TARGET_SYS_MEM_NONCOHERENT:
		return "SYS_MEM_NONCOHERENT";
	default:
		printk(KERN_WARNING "[nvdebug] Invalid aperture!\n");
		return NULL;
	}
}

struct runlist_chan {
// 0:63
	enum ENTRY_TYPE entry_type:1;
	uint32_t runqueue_selector:1;
	uint32_t padding:2;
	enum INST_TARGET inst_target:2;
	uint32_t padding2:2;
	uint32_t userd_ptr_lo:24;
	uint32_t userd_ptr_hi:32;
// 64:127
	uint32_t chid:12;
	uint32_t inst_ptr_lo:20;
	uint32_t inst_ptr_hi:32;
} __attribute__((packed));

/* Runlist TSG (TimeSlice Group)
  The runlist is composed of timeslice groups (TSG). Each TSG corresponds to a
  single virtual address space on the GPU and contains `TSG_LENGTH` channels.
  These channels and virtual address space are accessible to the GPU host unit
  for use until the timeslice expires or a TSG switch is forcibly initiated via
  a write to `NV_PFIFO_PREEMPT`.

  timeslice = (TSG_TIMESLICE_TIMEOUT << TSG_TIMESLICE_SCALE) * 1024 nanoseconds

  ENTRY_TYPE (T)    : type of this entry: ENTRY_TYPE_TSG
  TIMESLICE_SCALE   : scale factor for the TSG's timeslice
  TIMESLICE_TIMEOUT : timeout amount for the TSG's timeslice
  TSG_LENGTH        : number of channels that are part of this timeslice group
  TSGID             : identifier of the Timeslice group (overlays ENTRY_ID)
*/
struct entry_tsg {
// 0:63
	enum ENTRY_TYPE entry_type:1;
	uint64_t padding:15;
	uint32_t timeslice_scale:4;
	uint64_t padding2:4;
	uint32_t timeslice_timeout:8;
	uint32_t tsg_length:8;
	uint32_t padding3:24;
// 64:127
	uint32_t tsgid:12;
	uint64_t padding4:52;
} __attribute__((packed));
#define MAX_TSGID (1 << 12)
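/* Sketch helpers (not from NVIDIA documentation): how the fields above appear
  to combine, assuming the bitfield layouts of struct runlist_chan and
  struct entry_tsg are correct for the target GPU. Verify against a known-good
  runlist before relying on them.
*/

// Instance block address of a channel entry. INST_PTR is 4k-aligned, so the
// low 12 bits are implied zeros and INST_PTR_LO holds address bits 12:31.
static inline uint64_t chan_inst_ptr(struct runlist_chan *chan) {
	return ((uint64_t)chan->inst_ptr_hi << 32) | ((uint64_t)chan->inst_ptr_lo << 12);
}

// USERD address of a channel entry. USERD is 512-byte-aligned and
// USERD_PTR_LO holds address bits 8:31.
static inline uint64_t chan_userd_ptr(struct runlist_chan *chan) {
	return ((uint64_t)chan->userd_ptr_hi << 32) | ((uint64_t)chan->userd_ptr_lo << 8);
}

// The timeslice formula documented above struct entry_tsg, in nanoseconds.
static inline uint64_t tsg_timeslice_ns(struct entry_tsg *tsg) {
	return ((uint64_t)tsg->timeslice_timeout << tsg->timeslice_scale) * 1024;
}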
enum PREEMPT_TYPE {PREEMPT_TYPE_CHANNEL = 0, PREEMPT_TYPE_TSG = 1};

/* Preempt a TSG or Channel by ID
  ID/CHID    : Id of TSG or channel to preempt
  IS_PENDING : ????
  TYPE       : PREEMPT_TYPE_CHANNEL or PREEMPT_TYPE_TSG

  Support: Kepler, Maxwell, Pascal, Volta
*/
#define NV_PFIFO_PREEMPT 0x00002634
typedef union {
	struct {
		uint32_t id:12;
		uint32_t padding:8;
		bool is_pending:1;
		uint32_t padding2:3;
		enum PREEMPT_TYPE type:2;
		uint32_t padding3:6;
	} __attribute__((packed));
	uint32_t raw;
} pfifo_preempt_t;
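/* Example use (a sketch, not verified on hardware): preempting TSG `tsgid`
  with the nvdebug_writel() helper defined at the end of this header. Compare
  preempt_tsg(), declared below and defined in runlist.c.
    pfifo_preempt_t preempt;
    preempt.raw = 0;
    preempt.id = tsgid;
    preempt.type = PREEMPT_TYPE_TSG;
    nvdebug_writel(g, NV_PFIFO_PREEMPT, preempt.raw);
*/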
/* "Initiate a preempt of the engine by writing the bit associated with its
  runlist to NV_PFIFO_RUNLIST_PREEMPT... Do not poll NV_PFIFO_RUNLIST_PREEMPT
  for the preempt to complete."

  Useful for preempting multiple runlists at once.

  Appears to trigger an interrupt or some other side-effect on the Jetson
  Xavier, as the built-in nvgpu driver seems to be disturbed by writing to
  this.

  To select the runlist dynamically, use the BIT(nr) kernel macro.
  Example:
    runlist_preempt_t rl_preempt;
    rl_preempt.raw = nvdebug_readl(g, NV_PFIFO_RUNLIST_PREEMPT);
    rl_preempt.raw |= BIT(nr);
    nvdebug_writel(g, NV_PFIFO_RUNLIST_PREEMPT, rl_preempt.raw);

  Support: Volta
*/
#define NV_PFIFO_RUNLIST_PREEMPT 0x00002638
typedef union {
	struct {
		bool runlist_0:1;
		bool runlist_1:1;
		bool runlist_2:1;
		bool runlist_3:1;
		bool runlist_4:1;
		bool runlist_5:1;
		bool runlist_6:1;
		bool runlist_7:1;
		bool runlist_8:1;
		bool runlist_9:1;
		bool runlist_10:1;
		bool runlist_11:1;
		bool runlist_12:1;
		bool runlist_13:1;
		uint32_t padding:18;
	} __attribute__((packed));
	uint32_t raw;
} runlist_preempt_t;

/* Additional information on preempting from NVIDIA's driver (commit b1d0d8ece)
 * "From h/w team
 * Engine save can be blocked by eng stalling interrupts.
 * FIFO interrupts shouldn’t block an engine save from
 * finishing, but could block FIFO from reporting preempt done.
 * No immediate reason to reset the engine if FIFO interrupt is
 * pending.
 * The hub, priv_ring, and ltc interrupts could block context
 * switch (or memory), but doesn’t necessarily have to.
 * For Hub interrupts they just report access counters and page
 * faults. Neither of these necessarily block context switch
 * or preemption, but they could.
 * For example a page fault for graphics would prevent graphics
 * from saving out. An access counter interrupt is a
 * notification and has no effect.
 * SW should handle page faults though for preempt to complete.
 * PRI interrupt (due to a failed PRI transaction) will result
 * in ctxsw failure reported to HOST.
 * LTC interrupts are generally ECC related and if so,
 * certainly don’t block preemption/ctxsw but they could.
 * Bus interrupts shouldn’t have anything to do with preemption
 * state as they are part of the Host EXT pipe, though they may
 * exhibit a symptom that indicates that GPU is in a bad state.
 * To be completely fair, when an engine is preempting SW
 * really should just handle other interrupts as they come in.
 * It’s generally bad to just poll and wait on a preempt
 * to complete since there are many things in the GPU which may
 * cause a system to hang/stop responding."
 */

// Note: This is different on Turing
// Support: Kepler, Maxwell, Pascal, Volta
#define NV_PFIFO_RUNLIST_BASE 0x00002270
typedef union {
	struct {
		uint32_t ptr:28;
		uint32_t type:2;
		uint32_t padding:2;
	} __attribute__((packed));
	uint32_t raw;
} runlist_base_t;

// Support: Kepler, Maxwell, Pascal, Volta
#define NV_PFIFO_RUNLIST 0x00002274
typedef union {
	struct {
		uint32_t len:16;
		uint32_t padding:4;
		uint32_t id:4;
		uint32_t padding2:8;
	} __attribute__((packed));
	uint32_t raw;
} runlist_info_t;

enum CHANNEL_STATUS {
	CHANNEL_STATUS_IDLE = 0,
	CHANNEL_STATUS_PENDING = 1,
	CHANNEL_STATUS_PENDING_CTX_RELOAD = 2,
	CHANNEL_STATUS_PENDING_ACQUIRE = 3,
	CHANNEL_STATUS_PENDING_ACQ_CTX_RELOAD = 4,
	CHANNEL_STATUS_ON_PBDMA = 5,
	CHANNEL_STATUS_ON_PBDMA_AND_ENG = 6,
	CHANNEL_STATUS_ON_ENG = 7,
	CHANNEL_STATUS_ON_ENG_PENDING_ACQUIRE = 8,
	CHANNEL_STATUS_ON_ENG_PENDING = 9,
	CHANNEL_STATUS_ON_PBDMA_CTX_RELOAD = 10,
	CHANNEL_STATUS_ON_PBDMA_AND_ENG_CTX_RELOAD = 11,
	CHANNEL_STATUS_ON_ENG_CTX_RELOAD = 12,
	CHANNEL_STATUS_ON_ENG_PENDING_CTX_RELOAD = 13,
	CHANNEL_STATUS_ON_ENG_PENDING_ACQ_CTX_RELOAD = 14,
};

#define NV_PCCSR_CHANNEL_INST(i) (0x00800000+(i)*8)
// There are a total of 512 possible channels
#define MAX_CHID 512
typedef union {
	struct {
// 0:31
		uint32_t inst_ptr:28;
		enum INST_TARGET inst_target:2;
		uint32_t padding0:1;
		bool inst_bind:1;
// 32:63
		bool enable:1;
		bool next:1;
		uint32_t padding:6;
		bool force_ctx_reload:1;
		uint32_t padding2:1;
		bool enable_set:1;
		bool enable_clear:1;
		uint32_t padding3:10;
		bool pbdma_faulted:1;
		bool eng_faulted:1;
		enum CHANNEL_STATUS status:4;
		bool busy:1;
		uint32_t padding4:3;
	} __attribute__((packed));
	uint64_t raw;
} channel_ctrl_t;

/* Control word for runlist enable/disable.

  RUNLIST_N : Is runlist n disabled? (1 == disabled, 0 == enabled)

  To select the runlist dynamically, use the BIT(nr) kernel macro.
  Disabling example:
    runlist_disable_t rl_disable;
    rl_disable.raw = nvdebug_readl(g, NV_PFIFO_SCHED_DISABLE);
    rl_disable.raw |= BIT(nr);
    nvdebug_writel(g, NV_PFIFO_SCHED_DISABLE, rl_disable.raw);
  Enabling example:
    runlist_disable_t rl_disable;
    rl_disable.raw = nvdebug_readl(g, NV_PFIFO_SCHED_DISABLE);
    rl_disable.raw &= ~BIT(nr);
    nvdebug_writel(g, NV_PFIFO_SCHED_DISABLE, rl_disable.raw);

  Support: Fermi, Kepler, Maxwell, Pascal, Volta, Turing
*/
#define NV_PFIFO_SCHED_DISABLE 0x00002630
typedef union {
	struct {
		bool runlist_0:1;
		bool runlist_1:1;
		bool runlist_2:1;
		bool runlist_3:1;
		bool runlist_4:1;
		bool runlist_5:1;
		bool runlist_6:1;
		bool runlist_7:1;
		bool runlist_8:1;
		bool runlist_9:1;
		bool runlist_10:1;
		uint32_t padding:21;
	} __attribute__((packed));
	uint32_t raw;
} runlist_disable_t;

// TODO(jbakita): Maybe put the above GPU types in a different file.

#define for_chan_in_tsg(chan, tsg) \
	for (chan = (struct runlist_chan*)(tsg + 1); \
	     (void*)chan < (void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length; \
	     chan++)

#define next_tsg(tsg) \
	(void*)(tsg + 1) + sizeof(struct runlist_chan) * tsg->tsg_length

struct runlist_iter {
	struct entry_tsg *curr_tsg;
	runlist_info_t rl_info;
};

// Defined in runlist.c
struct gk20a* get_live_gk20a(void);
int get_runlist_iter(struct runlist_iter *rl_iter);
int preempt_tsg(uint32_t tsg_id);
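/* Sketch (not verified): walking a runlist with the iterator and macros above.
  Assumes rl_info.len counts every runlist entry (each TSG entry plus its
  channel entries) and that curr_tsg points to the first TSG entry in
  already-mapped memory.
*/
static inline void print_runlist_sketch(struct runlist_iter *rl_iter) {
	struct entry_tsg *tsg = rl_iter->curr_tsg;
	struct runlist_chan *chan;
	int entries_left = rl_iter->rl_info.len;
	while (entries_left > 0) {
		printk(KERN_INFO "[nvdebug] TSG %d (%d channels)\n",
		       tsg->tsgid, tsg->tsg_length);
		for_chan_in_tsg(chan, tsg)
			printk(KERN_INFO "[nvdebug]   Channel %d (inst. in %s)\n",
			       chan->chid, target_to_text(chan->inst_target));
		// A TSG entry is followed immediately by its channel entries
		entries_left -= 1 + tsg->tsg_length;
		tsg = next_tsg(tsg);
	}
}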
static inline struct gk20a *get_gk20a(struct device *dev) {
	// XXX: Only works because gk20a* is the first member of gk20a_platform
	return *((struct gk20a**)dev_get_drvdata(dev));
}

// Functionally identical to nvgpu_readl()
// (except we don't try to resolve situations where regs is NULL)
static inline u32 nvdebug_readl(struct gk20a* g, u32 r) {
	struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
	if (unlikely(!g_os->regs)) {
		printk(KERN_ERR "[nvdebug] Attempted nvgpu_readl on non-existent registers!\n");
		return -1;
	}
	return readl(g_os->regs + r);
}

// quadword version of nvdebug_readl()
static inline u64 nvdebug_readq(struct gk20a* g, u32 r) {
	struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
	u64 ret;
	if (unlikely(!g_os->regs)) {
		printk(KERN_ERR "[nvdebug] Attempted nvdebug_readq on non-existent registers!\n");
		return -1;
	}
	// readq seems to always return the uppermost 32 bits as 0, so workaround with readl
	ret = readl(g_os->regs + r);
	ret |= ((u64)readl(g_os->regs + r + 4)) << 32;
	return ret;
}

// Functionally identical to nvgpu_writel()
static inline void nvdebug_writel(struct gk20a* g, u32 r, u32 v) {
	struct nvgpu_os_linux* g_os = container_of(g, struct nvgpu_os_linux, g);
	if (unlikely(!g_os->regs)) {
		printk(KERN_ERR "[nvdebug] Attempted nvgpu_writel on non-existent registers!\n");
		return;
	}
	writel_relaxed(v, g_os->regs + r);
	wmb();
}
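/* Sketch: reading the PCCSR control/status word for channel `chid` via
  nvdebug_readq(), assuming the channel_ctrl_t layout above matches the
  hardware.
*/
static inline channel_ctrl_t nvdebug_read_channel_ctrl(struct gk20a *g, uint32_t chid) {
	channel_ctrl_t chan;
	chan.raw = nvdebug_readq(g, NV_PCCSR_CHANNEL_INST(chid));
	return chan;
}
// Example: nvdebug_read_channel_ctrl(g, 42).status == CHANNEL_STATUS_ON_ENG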