From 520ff00e870eadc98a50f58ecd514ced53a8612f Mon Sep 17 00:00:00 2001
From: Sami Kiminki
Date: Thu, 19 Mar 2015 21:28:34 +0200
Subject: gpu: nvgpu: Implement compbits mapping

Implement NVGPU_AS_IOCTL_GET_BUFFER_COMPBITS_INFO for requesting info
on compbits-mappable buffers; and NVGPU_AS_IOCTL_MAP_BUFFER_COMPBITS,
which enables mapping compbits to the GPU address space of said
buffers. This, in turn, enables moving the swizzling of comptags from
the GPU layout to the CDEH/CDEV formats into userspace.

Compbits mapping is conservative and may map more than is strictly
needed. This is for two reasons: 1) the mapping must be done at small
page alignment (4kB), and 2) GPU comptags are swizzled all around the
aggregate cache line, which means that the whole cache line must be
visible even if only some comptag lines within it are required. The
cache line size is not necessarily a multiple of the small page size.

Bug 200077571

Change-Id: I5ae88fe6b616e5ea37d3bff0dff46c07e9c9267e
Signed-off-by: Sami Kiminki
Reviewed-on: http://git-master/r/719710
Reviewed-by: Terje Bergstrom
Tested-by: Terje Bergstrom
---
 drivers/gpu/nvgpu/gk20a/as_gk20a.c |  35 +++++++-
 drivers/gpu/nvgpu/gk20a/gk20a.c    |   9 +-
 drivers/gpu/nvgpu/gk20a/mm_gk20a.c | 168 ++++++++++++++++++++++++++++++++++++-
 drivers/gpu/nvgpu/gk20a/mm_gk20a.h |  19 +++++
 include/uapi/linux/nvgpu.h         |  99 +++++++++++++++++++++-
 5 files changed, 323 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/nvgpu/gk20a/as_gk20a.c b/drivers/gpu/nvgpu/gk20a/as_gk20a.c
index 038fa4c8..63569008 100644
--- a/drivers/gpu/nvgpu/gk20a/as_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/as_gk20a.c
@@ -1,7 +1,7 @@
 /*
  * GK20A Address Spaces
  *
- * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
@@ -225,6 +225,31 @@ static int gk20a_as_ioctl_get_va_regions(
 	return 0;
 }
 
+static int gk20a_as_ioctl_get_buffer_compbits_info(
+		struct gk20a_as_share *as_share,
+		struct nvgpu_as_get_buffer_compbits_info_args *args)
+{
+	gk20a_dbg_fn("");
+	return gk20a_vm_get_compbits_info(as_share->vm,
+					  args->mapping_gva,
+					  &args->compbits_win_size,
+					  &args->compbits_win_ctagline,
+					  &args->mapping_ctagline,
+					  &args->flags);
+}
+
+static int gk20a_as_ioctl_map_buffer_compbits(
+		struct gk20a_as_share *as_share,
+		struct nvgpu_as_map_buffer_compbits_args *args)
+{
+	gk20a_dbg_fn("");
+	return gk20a_vm_map_compbits(as_share->vm,
+				     args->mapping_gva,
+				     &args->compbits_win_gva,
+				     &args->mapping_iova,
+				     args->flags);
+}
+
 int gk20a_as_dev_open(struct inode *inode, struct file *filp)
 {
 	struct gk20a_as_share *as_share;
@@ -334,6 +359,14 @@ long gk20a_as_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		err = gk20a_as_ioctl_get_va_regions(as_share,
 			(struct nvgpu_as_get_va_regions_args *)buf);
 		break;
+	case NVGPU_AS_IOCTL_GET_BUFFER_COMPBITS_INFO:
+		err = gk20a_as_ioctl_get_buffer_compbits_info(as_share,
+			(struct nvgpu_as_get_buffer_compbits_info_args *)buf);
+		break;
+	case NVGPU_AS_IOCTL_MAP_BUFFER_COMPBITS:
+		err = gk20a_as_ioctl_map_buffer_compbits(as_share,
+			(struct nvgpu_as_map_buffer_compbits_args *)buf);
+		break;
 	default:
 		dev_dbg(dev_from_gk20a(g), "unrecognized as ioctl: 0x%x", cmd);
 		err = -ENOTTY;
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index 470729b7..d3114ecd 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -2016,8 +2016,13 @@ int gk20a_init_gpu_characteristics(struct gk20a *g)
 	gpu->max_ltc_per_fbp = g->ops.gr.get_max_ltc_per_fbp(g);
 	gpu->max_lts_per_ltc = g->ops.gr.get_max_lts_per_ltc(g);
 	g->ops.gr.get_rop_l2_en_mask(g);
-
-	gpu->reserved = 0;
+	gpu->gr_compbit_store_base_hw = g->gr.compbit_store.base_hw;
+	gpu->gr_gobs_per_comptagline_per_slice =
+		g->gr.gobs_per_comptagline_per_slice;
+	gpu->num_ltc = g->ltc_count;
+	gpu->lts_per_ltc = g->gr.slices_per_ltc;
+	gpu->cbc_cache_line_size = g->gr.cacheline_size;
+	gpu->cbc_comptags_per_line = g->gr.comptags_per_cacheline;
 
 	return 0;
 }
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index 5d1ff563..d896d783 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -221,7 +221,9 @@ static int gk20a_alloc_comptags(struct gk20a *g,
 				struct device *dev,
 				struct dma_buf *dmabuf,
 				struct gk20a_allocator *allocator,
-				u32 lines, bool user_mappable)
+				u32 lines, bool user_mappable,
+				u64 *ctag_map_win_size,
+				u32 *ctag_map_win_ctagline)
 {
 	struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
 	u32 offset = 0;
@@ -313,6 +315,8 @@ static int gk20a_alloc_comptags(struct gk20a *g,
 				first_unneeded_cacheline *
 				g->gr.comptags_per_cacheline;
 
+			u64 win_size;
+
 			if (needed_ctaglines < ctaglines_to_allocate) {
 				/* free alignment lines */
 				int tmp=
@@ -326,6 +330,14 @@ static int gk20a_alloc_comptags(struct gk20a *g,
 
 				ctaglines_to_allocate = needed_ctaglines;
 			}
+
+			*ctag_map_win_ctagline = offset;
+			win_size =
+				DIV_ROUND_UP(lines,
+					     g->gr.comptags_per_cacheline) *
+				aggregate_cacheline_sz;
+
+			*ctag_map_win_size = round_up(win_size, small_pgsz);
 		}
 
 		priv->comptags.offset = offset;
@@ -1374,6 +1386,8 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
 	bool clear_ctags = false;
 	struct scatterlist *sgl;
 	u64 buf_addr;
+	u64 ctag_map_win_size = 0;
+	u32 ctag_map_win_ctagline = 0;
 
 	mutex_lock(&vm->update_gmmu_lock);
 
@@ -1501,7 +1515,9 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
 
 		/* allocate compression resources if needed */
 		err = gk20a_alloc_comptags(g, d, dmabuf, ctag_allocator,
-					   bfr.ctag_lines, user_mappable);
+					   bfr.ctag_lines, user_mappable,
+					   &ctag_map_win_size,
+					   &ctag_map_win_ctagline);
 		if (err) {
 			/* ok to fall back here if we ran out */
 			/* TBD: we can partially alloc ctags as well... */
@@ -1588,6 +1604,8 @@ u64 gk20a_vm_map(struct vm_gk20a *vm,
 	mapped_buffer->ctag_lines = bfr.ctag_lines;
 	mapped_buffer->ctag_allocated_lines = bfr.ctag_allocated_lines;
 	mapped_buffer->ctags_mappable = bfr.ctag_user_mappable;
+	mapped_buffer->ctag_map_win_size = ctag_map_win_size;
+	mapped_buffer->ctag_map_win_ctagline = ctag_map_win_ctagline;
 	mapped_buffer->vm = vm;
 	mapped_buffer->flags = flags;
 	mapped_buffer->kind = kind;
@@ -1640,6 +1658,140 @@ clean_up:
 	return 0;
 }
 
+int gk20a_vm_get_compbits_info(struct vm_gk20a *vm,
+			       u64 mapping_gva,
+			       u64 *compbits_win_size,
+			       u32 *compbits_win_ctagline,
+			       u32 *mapping_ctagline,
+			       u32 *flags)
+{
+	struct mapped_buffer_node *mapped_buffer;
+	struct device *d = dev_from_vm(vm);
+
+	mutex_lock(&vm->update_gmmu_lock);
+
+	mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, mapping_gva);
+
+	if (!mapped_buffer || !mapped_buffer->user_mapped)
+	{
+		mutex_unlock(&vm->update_gmmu_lock);
+		gk20a_err(d, "%s: bad offset 0x%llx", __func__, mapping_gva);
+		return -EFAULT;
+	}
+
+	*compbits_win_size = 0;
+	*compbits_win_ctagline = 0;
+	*mapping_ctagline = 0;
+	*flags = 0;
+
+	if (mapped_buffer->ctag_offset)
+		*flags |= NVGPU_AS_GET_BUFFER_COMPBITS_INFO_FLAGS_HAS_COMPBITS;
+
+	if (mapped_buffer->ctags_mappable)
+	{
+		*flags |= NVGPU_AS_GET_BUFFER_COMPBITS_INFO_FLAGS_MAPPABLE;
+		*compbits_win_size = mapped_buffer->ctag_map_win_size;
+		*compbits_win_ctagline = mapped_buffer->ctag_map_win_ctagline;
+		*mapping_ctagline = mapped_buffer->ctag_offset;
+	}
+
+	mutex_unlock(&vm->update_gmmu_lock);
+	return 0;
+}
+
+
+int gk20a_vm_map_compbits(struct vm_gk20a *vm,
+			  u64 mapping_gva,
+			  u64 *compbits_win_gva,
+			  u64 *mapping_iova,
+			  u32 flags)
+{
+	struct mapped_buffer_node *mapped_buffer;
+	struct gk20a *g = gk20a_from_vm(vm);
+	struct device *d = dev_from_vm(vm);
+
+	if (flags & NVGPU_AS_MAP_BUFFER_COMPBITS_FLAGS_FIXED_OFFSET) {
+		/* This will be implemented later */
+		gk20a_err(d,
+			  "%s: fixed-offset compbits mapping not yet supported",
+			  __func__);
+		return -EFAULT;
+	}
+
+	mutex_lock(&vm->update_gmmu_lock);
+
+	mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, mapping_gva);
+
+	if (!mapped_buffer || !mapped_buffer->user_mapped) {
+		mutex_unlock(&vm->update_gmmu_lock);
+		gk20a_err(d, "%s: bad offset 0x%llx", __func__, mapping_gva);
+		return -EFAULT;
+	}
+
+	if (!mapped_buffer->ctags_mappable) {
+		mutex_unlock(&vm->update_gmmu_lock);
+		gk20a_err(d, "%s: comptags not mappable, offset 0x%llx", __func__, mapping_gva);
+		return -EFAULT;
+	}
+
+	if (!mapped_buffer->ctag_map_win_addr) {
+		const u32 small_pgsz_index = 0; /* small pages, 4K */
+		const u32 aggregate_cacheline_sz =
+			g->gr.cacheline_size * g->gr.slices_per_ltc *
+			g->ltc_count;
+
+		/* first aggregate cacheline to map */
+		u32 cacheline_start; /* inclusive */
+
+		/* offset of the start cacheline (will be page aligned) */
+		u64 cacheline_offset_start;
+
+		if (!mapped_buffer->ctag_map_win_size) {
+			mutex_unlock(&vm->update_gmmu_lock);
+			gk20a_err(d,
+				  "%s: mapping 0x%llx does not have "
"mappable comptags", + __func__, mapping_gva); + return -EFAULT; + } + + cacheline_start = mapped_buffer->ctag_offset / + g->gr.comptags_per_cacheline; + cacheline_offset_start = + cacheline_start * aggregate_cacheline_sz; + + mapped_buffer->ctag_map_win_addr = + g->ops.mm.gmmu_map( + vm, + 0, + g->gr.compbit_store.mem.sgt, + cacheline_offset_start, /* sg offset */ + mapped_buffer->ctag_map_win_size, /* size */ + small_pgsz_index, + 0, /* kind */ + 0, /* ctag_offset */ + NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE, + gk20a_mem_flag_read_only, + false, + false); + + if (!mapped_buffer->ctag_map_win_addr) { + mutex_unlock(&vm->update_gmmu_lock); + gk20a_err(d, + "%s: failed to map comptags for mapping 0x%llx", + __func__, mapping_gva); + return -ENOMEM; + } + } + + *mapping_iova = gk20a_mm_iova_addr(g, mapped_buffer->sgt->sgl, 0); + *compbits_win_gva = mapped_buffer->ctag_map_win_addr; + + mutex_unlock(&vm->update_gmmu_lock); + + return 0; +} + u64 gk20a_gmmu_map(struct vm_gk20a *vm, struct sg_table **sgt, u64 size, @@ -2276,6 +2428,18 @@ void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer) struct vm_gk20a *vm = mapped_buffer->vm; struct gk20a *g = vm->mm->g; + if (mapped_buffer->ctag_map_win_addr) { + /* unmap compbits */ + + g->ops.mm.gmmu_unmap(vm, + mapped_buffer->ctag_map_win_addr, + mapped_buffer->ctag_map_win_size, + 0, /* page size 4k */ + true, /* va allocated */ + gk20a_mem_flag_none, + false); /* not sparse */ + } + g->ops.mm.gmmu_unmap(vm, mapped_buffer->addr, mapped_buffer->size, diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h index 0ff11d09..e07b95fe 100644 --- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h @@ -186,7 +186,13 @@ struct mapped_buffer_node { u32 ctag_offset; u32 ctag_lines; u32 ctag_allocated_lines; + + /* For comptag mapping, these are the mapping window parameters */ bool ctags_mappable; + u64 ctag_map_win_addr; /* non-zero if mapped */ + u64 ctag_map_win_size; /* non-zero if ctags_mappable */ + u32 ctag_map_win_ctagline; /* ctagline at win start, set if + * ctags_mappable */ u32 flags; u32 kind; @@ -504,6 +510,19 @@ u64 gk20a_vm_map(struct vm_gk20a *vm, u64 buffer_offset, u64 mapping_size); +int gk20a_vm_get_compbits_info(struct vm_gk20a *vm, + u64 mapping_gva, + u64 *compbits_win_size, + u32 *compbits_win_ctagline, + u32 *mapping_ctagline, + u32 *flags); + +int gk20a_vm_map_compbits(struct vm_gk20a *vm, + u64 mapping_gva, + u64 *compbits_win_gva, + u64 *mapping_iova, + u32 flags); + /* unmap handle from kernel */ void gk20a_vm_unmap(struct vm_gk20a *vm, u64 offset); diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h index 9527ab82..e5bb0d07 100644 --- a/include/uapi/linux/nvgpu.h +++ b/include/uapi/linux/nvgpu.h @@ -166,7 +166,13 @@ struct nvgpu_gpu_characteristics { __u8 chipname[8]; - + __u64 gr_compbit_store_base_hw; + __u32 gr_gobs_per_comptagline_per_slice; + __u32 num_ltc; + __u32 lts_per_ltc; + __u32 cbc_cache_line_size; + __u32 cbc_comptags_per_line; + __u32 reserved2; /* Notes: - This struct can be safely appended with new fields. However, always @@ -895,6 +901,91 @@ struct nvgpu_as_map_buffer_ex_args { * the buffer is returned in this field. */ }; +/* + * Get info about buffer compbits. Requires that buffer is mapped with + * NVGPU_AS_MAP_BUFFER_FLAGS_MAPPABLE_COMPBITS. + * + * The compbits for a mappable buffer are organized in a mappable + * window to the compbits store. 
+ * for more than one buffer, the buffer comptag line index may differ
+ * from the window comptag line index.
+ */
+struct nvgpu_as_get_buffer_compbits_info_args {
+
+	/* in: address of an existing buffer mapping */
+	__u64 mapping_gva;
+
+	/* out: size of compbits mapping window (bytes) */
+	__u64 compbits_win_size;
+
+	/* out: comptag line index of the window start */
+	__u32 compbits_win_ctagline;
+
+	/* out: comptag line index of the buffer mapping */
+	__u32 mapping_ctagline;
+
+/* Buffer uses compbits */
+#define NVGPU_AS_GET_BUFFER_COMPBITS_INFO_FLAGS_HAS_COMPBITS	(1 << 0)
+
+/* Buffer compbits are mappable */
+#define NVGPU_AS_GET_BUFFER_COMPBITS_INFO_FLAGS_MAPPABLE	(1 << 1)
+
+/* Buffer IOVA addresses are discontiguous */
+#define NVGPU_AS_GET_BUFFER_COMPBITS_INFO_FLAGS_DISCONTIG_IOVA	(1 << 2)
+
+	/* out */
+	__u32 flags;
+
+	__u32 reserved1;
+};
+
+/*
+ * Map compbits of a mapped buffer to the GPU address space. The
+ * compbits mapping is automatically unmapped when the buffer is
+ * unmapped.
+ *
+ * The compbits mapping always uses small pages, it is read-only, and
+ * is GPU cacheable. The mapping is a window to the compbits
+ * store. The window may not be exactly the size of the cache lines
+ * for the buffer mapping.
+ */
+struct nvgpu_as_map_buffer_compbits_args {
+
+	/* in: address of an existing buffer mapping */
+	__u64 mapping_gva;
+
+	/* in: gva to the mapped compbits store window when
+	 * FIXED_OFFSET is set. Otherwise, ignored and should be 0.
+	 *
+	 * For FIXED_OFFSET mapping:
+	 * - If compbits are already mapped, compbits_win_gva
+	 *   must match the previously mapped gva.
+	 * - The user must have allocated enough GVA space for the
+	 *   mapping window (see compbits_win_size in
+	 *   nvgpu_as_get_buffer_compbits_info_args)
+	 *
+	 * out: gva to the mapped compbits store window */
+	__u64 compbits_win_gva;
+
+	/* in: reserved, must be 0
+	   out: physical or IOMMU address for mapping */
+	union {
+		/* contiguous iova addresses */
+		__u64 mapping_iova;
+
+		/* buffer to receive discontiguous iova addresses (reserved) */
+		__u64 mapping_iova_buf_addr;
+	};
+
+	/* in: Buffer size (in bytes) for discontiguous iova
+	 * addresses. Reserved, must be 0. */
+	__u64 mapping_iova_buf_size;
+
+#define NVGPU_AS_MAP_BUFFER_COMPBITS_FLAGS_FIXED_OFFSET (1 << 0)
+	__u32 flags;
+	__u32 reserved1;
+};
+
 /*
  * Unmapping a buffer:
  *
@@ -938,9 +1029,13 @@ struct nvgpu_as_get_va_regions_args {
 	_IOWR(NVGPU_AS_IOCTL_MAGIC, 7, struct nvgpu_as_map_buffer_ex_args)
 #define NVGPU_AS_IOCTL_GET_VA_REGIONS \
 	_IOWR(NVGPU_AS_IOCTL_MAGIC, 8, struct nvgpu_as_get_va_regions_args)
+#define NVGPU_AS_IOCTL_GET_BUFFER_COMPBITS_INFO \
+	_IOWR(NVGPU_AS_IOCTL_MAGIC, 9, struct nvgpu_as_get_buffer_compbits_info_args)
+#define NVGPU_AS_IOCTL_MAP_BUFFER_COMPBITS \
+	_IOWR(NVGPU_AS_IOCTL_MAGIC, 10, struct nvgpu_as_map_buffer_compbits_args)
 
 #define NVGPU_AS_IOCTL_LAST		\
-	_IOC_NR(NVGPU_AS_IOCTL_GET_VA_REGIONS)
+	_IOC_NR(NVGPU_AS_IOCTL_MAP_BUFFER_COMPBITS)
 #define NVGPU_AS_IOCTL_MAX_ARG_SIZE	\
 	sizeof(struct nvgpu_as_map_buffer_ex_args)
 
-- 
cgit v1.2.2
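
A note on the conservative window sizing described in the commit message:
userspace can reproduce the computation done in gk20a_alloc_comptags()
from the new nvgpu_gpu_characteristics fields exported by this patch. The
sketch below is illustrative only, not part of the patch; the helper name
and the 4 kB small-page constant are assumptions.

/* Sketch only: mirrors the kernel's compbits window sizing. The four
 * trailing parameters correspond to the new nvgpu_gpu_characteristics
 * fields cbc_comptags_per_line, cbc_cache_line_size, lts_per_ltc and
 * num_ltc. */
#include <stdint.h>

#define SMALL_PAGE_SIZE 4096ull /* assumption: 4 kB small pages */

static uint64_t compbits_win_size(uint32_t ctag_lines,
				  uint32_t comptags_per_line,
				  uint32_t cache_line_size,
				  uint32_t lts_per_ltc,
				  uint32_t num_ltc)
{
	/* one comptag cache line aggregated over all L2 slices */
	uint64_t aggregate_cacheline_sz =
		(uint64_t)cache_line_size * lts_per_ltc * num_ltc;
	/* DIV_ROUND_UP(ctag_lines, comptags_per_line): whole cache
	 * lines must be visible because of the swizzling */
	uint64_t cachelines =
		((uint64_t)ctag_lines + comptags_per_line - 1) /
		comptags_per_line;
	uint64_t win_size = cachelines * aggregate_cacheline_sz;

	/* round_up(win_size, small page size) for 4 kB map alignment */
	return (win_size + SMALL_PAGE_SIZE - 1) & ~(SMALL_PAGE_SIZE - 1);
}

This is why the mapping may cover more than strictly needed: the window
grows to whole aggregate cache lines and then again to whole small pages.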
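For the two new ioctls themselves, the expected userspace flow is to
query first and map second. The following sketch is likewise not part of
the patch: the function name is hypothetical, error handling is minimal,
and it assumes as_fd is an open GPU address-space fd and mapping_gva an
existing mapping created with NVGPU_AS_MAP_BUFFER_FLAGS_MAPPABLE_COMPBITS.

/* Sketch only: query compbits info, then map the compbits window. */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>

static int map_compbits_window(int as_fd, __u64 mapping_gva,
			       __u64 *win_gva, __u64 *iova)
{
	struct nvgpu_as_get_buffer_compbits_info_args info;
	struct nvgpu_as_map_buffer_compbits_args map;

	memset(&info, 0, sizeof(info));
	info.mapping_gva = mapping_gva;
	if (ioctl(as_fd, NVGPU_AS_IOCTL_GET_BUFFER_COMPBITS_INFO, &info) != 0)
		return -1;
	if (!(info.flags & NVGPU_AS_GET_BUFFER_COMPBITS_INFO_FLAGS_MAPPABLE))
		return -1; /* buffer has no mappable compbits */

	memset(&map, 0, sizeof(map));
	map.mapping_gva = mapping_gva;
	/* FIXED_OFFSET is left clear: the kernel picks the window GVA */
	if (ioctl(as_fd, NVGPU_AS_IOCTL_MAP_BUFFER_COMPBITS, &map) != 0)
		return -1;

	*win_gva = map.compbits_win_gva; /* read-only window into the store */
	*iova = map.mapping_iova;        /* contiguous IOVA of the buffer */
	return 0;
}

Note the lazy design on the kernel side: gk20a_vm_map_compbits() creates
the window mapping only on the first request, and gk20a_vm_unmap_locked()
tears it down automatically when the buffer itself is unmapped, so no
separate unmap ioctl is needed.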