From 13231006671a1da11cfaf7a67e69430199820788 Mon Sep 17 00:00:00 2001
From: Konsta Holtta
Date: Fri, 15 Jul 2016 15:52:52 +0300
Subject: gpu: nvgpu: add vidmem allocation ioctl

Add NVGPU_GPU_IOCTL_ALLOC_VIDMEM to the ctrl fd for letting userspace
allocate on-board GPU memory (aka vidmem). The allocations are returned
as dmabuf fds. Also, report the amount of local video memory in the gpu
characteristics.

Jira DNVGPU-19
Jira DNVGPU-38
Change-Id: I28e361d31bb630b96d06bb1c86d022d91c7592bc
Signed-off-by: Konsta Holtta
Reviewed-on: http://git-master/r/1181152
GVS: Gerrit_Virtual_Submit
Reviewed-by: Vijayakumar Subbu
---
 drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c |  48 ++++++++++++
 drivers/gpu/nvgpu/gk20a/gk20a.c      |   1 +
 drivers/gpu/nvgpu/gk20a/mm_gk20a.c   | 148 +++++++++++++++++++++++++++++++++++
 drivers/gpu/nvgpu/gk20a/mm_gk20a.h   |   2 +
 include/uapi/linux/nvgpu.h           |  73 ++++++++++++++++-
 5 files changed, 271 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
index 3e34b6b8..6b832670 100644
--- a/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
@@ -704,6 +704,49 @@ clean_up:
 	return err;
 }
 
+static int nvgpu_gpu_alloc_vidmem(struct gk20a *g,
+		struct nvgpu_gpu_alloc_vidmem_args *args)
+{
+	u32 align = args->in.alignment ? args->in.alignment : SZ_4K;
+	int fd;
+
+	gk20a_dbg_fn("");
+
+	/* not yet supported */
+	if (WARN_ON(args->in.flags & NVGPU_GPU_ALLOC_VIDMEM_FLAG_CPU_MASK))
+		return -EINVAL;
+
+	/* not yet supported */
+	if (WARN_ON(args->in.flags & NVGPU_GPU_ALLOC_VIDMEM_FLAG_VPR))
+		return -EINVAL;
+
+	if (args->in.size & (SZ_4K - 1))
+		return -EINVAL;
+
+	if (!args->in.size)
+		return -EINVAL;
+
+	if (align & (align - 1))
+		return -EINVAL;
+
+	if (align > roundup_pow_of_two(args->in.size)) {
+		/* log this special case, buddy allocator detail */
+		gk20a_warn(dev_from_gk20a(g),
+			"alignment larger than buffer size rounded up to power of 2 is not supported");
+		return -EINVAL;
+	}
+
+	fd = gk20a_vidmem_buf_alloc(g, args->in.size);
+	if (fd < 0)
+		return fd;
+
+	args->out.dmabuf_fd = fd;
+
+	gk20a_dbg_fn("done, fd=%d", fd);
+
+	return 0;
+}
+
 long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct device *dev = filp->private_data;
@@ -951,6 +994,11 @@ long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg
 			(struct nvgpu_gpu_get_engine_info_args *)buf);
 		break;
 
+	case NVGPU_GPU_IOCTL_ALLOC_VIDMEM:
+		err = nvgpu_gpu_alloc_vidmem(g,
+			(struct nvgpu_gpu_alloc_vidmem_args *)buf);
+		break;
+
 	default:
 		dev_dbg(dev_from_gk20a(g), "unrecognized gpu ioctl cmd: 0x%x", cmd);
 		err = -ENOTTY;
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
index 04f82033..bb8cb33f 100644
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -2148,6 +2148,7 @@ int gk20a_init_gpu_characteristics(struct gk20a *g)
 	gpu->default_compute_preempt_mode =
 		g->gr.preemption_mode_rec.default_compute_preempt_mode;
 
+	gpu->local_video_memory_size = g->mm.vidmem.size;
 
 	return 0;
 }
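For illustration, a minimal userspace sketch of driving the new ioctl (not part of this patch). The control-node path is an assumption and varies across platforms; the ioctl number and argument struct are the ones added to include/uapi/linux/nvgpu.h below.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nvgpu.h>

/* Allocate 1 MiB of vidmem; returns a dmabuf fd, or -1 on failure. */
static int alloc_vidmem_1m(void)
{
	struct nvgpu_gpu_alloc_vidmem_args args;
	int ctrl, err;

	/* hypothetical device node; the actual path is platform-specific */
	ctrl = open("/dev/nvhost-ctrl-gpu", O_RDWR);
	if (ctrl < 0)
		return -1;

	memset(&args, 0, sizeof(args));
	args.in.size = 1024 * 1024;	/* must be a multiple of 4K */
	args.in.alignment = 0;		/* 0 means small page size */
	args.in.flags = 0;		/* no CPU mapping, no VPR (unsupported yet) */

	err = ioctl(ctrl, NVGPU_GPU_IOCTL_ALLOC_VIDMEM, &args);
	close(ctrl);

	return err ? -1 : args.out.dmabuf_fd;
}

Like any dmabuf fd, the result can be shared across processes or mapped for device use; closing the last reference frees the vidmem through gk20a_vidbuf_release().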
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
index bf3d990c..2dcc4363 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -411,6 +411,14 @@ struct gk20a_dmabuf_priv {
 	u64 buffer_id;
 };
 
+struct gk20a_vidmem_buf {
+	struct gk20a *g;
+	struct mem_desc mem;
+	struct dma_buf *dmabuf;
+	void *dmabuf_priv;
+	void (*dmabuf_priv_delete)(void *);
+};
+
 static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm);
 
 static int gk20a_comptaglines_alloc(struct gk20a_comptag_allocator *allocator,
@@ -1833,6 +1841,146 @@ static u64 gk20a_vm_map_duplicate_locked(struct vm_gk20a *vm,
 	return mapped_buffer->addr;
 }
 
+#if defined(CONFIG_GK20A_VIDMEM)
+static struct sg_table *gk20a_vidbuf_map_dma_buf(
+	struct dma_buf_attachment *attach, enum dma_data_direction dir)
+{
+	struct gk20a_vidmem_buf *buf = attach->dmabuf->priv;
+
+	return buf->mem.sgt;
+}
+
+static void gk20a_vidbuf_unmap_dma_buf(struct dma_buf_attachment *attach,
+				       struct sg_table *sgt,
+				       enum dma_data_direction dir)
+{
+}
+
+static void gk20a_vidbuf_release(struct dma_buf *dmabuf)
+{
+	struct gk20a_vidmem_buf *buf = dmabuf->priv;
+
+	gk20a_dbg_fn("");
+
+	if (buf->dmabuf_priv)
+		buf->dmabuf_priv_delete(buf->dmabuf_priv);
+
+	gk20a_gmmu_free(buf->g, &buf->mem);
+	kfree(buf);
+}
+
+static void *gk20a_vidbuf_kmap(struct dma_buf *dmabuf, unsigned long page_num)
+{
+	WARN_ON("Not supported");
+	return NULL;
+}
+
+static void *gk20a_vidbuf_kmap_atomic(struct dma_buf *dmabuf,
+				      unsigned long page_num)
+{
+	WARN_ON("Not supported");
+	return NULL;
+}
+
+static int gk20a_vidbuf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma)
+{
+	return -EINVAL;
+}
+
+static int gk20a_vidbuf_set_private(struct dma_buf *dmabuf,
+		struct device *dev, void *priv, void (*delete)(void *priv))
+{
+	struct gk20a_vidmem_buf *buf = dmabuf->priv;
+
+	buf->dmabuf_priv = priv;
+	buf->dmabuf_priv_delete = delete;
+
+	return 0;
+}
+
+static void *gk20a_vidbuf_get_private(struct dma_buf *dmabuf,
+		struct device *dev)
+{
+	struct gk20a_vidmem_buf *buf = dmabuf->priv;
+
+	return buf->dmabuf_priv;
+}
+
+static const struct dma_buf_ops gk20a_vidbuf_ops = {
+	.map_dma_buf = gk20a_vidbuf_map_dma_buf,
+	.unmap_dma_buf = gk20a_vidbuf_unmap_dma_buf,
+	.release = gk20a_vidbuf_release,
+	.kmap_atomic = gk20a_vidbuf_kmap_atomic,
+	.kmap = gk20a_vidbuf_kmap,
+	.mmap = gk20a_vidbuf_mmap,
+	.set_drvdata = gk20a_vidbuf_set_private,
+	.get_drvdata = gk20a_vidbuf_get_private,
+};
+
+static struct dma_buf *gk20a_vidbuf_export(struct gk20a_vidmem_buf *buf)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)
+	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
+
+	exp_info.priv = buf;
+	exp_info.ops = &gk20a_vidbuf_ops;
+	exp_info.size = buf->mem.size;
+	exp_info.flags = O_RDWR;
+
+	return dma_buf_export(&exp_info);
+#else
+	return dma_buf_export(buf, &gk20a_vidbuf_ops, buf->mem.size,
+			O_RDWR, NULL);
+#endif
+}
+#endif
+
+int gk20a_vidmem_buf_alloc(struct gk20a *g, size_t bytes)
+{
+#if defined(CONFIG_GK20A_VIDMEM)
+	struct gk20a_vidmem_buf *buf;
+	int err, fd;
+
+	gk20a_dbg_fn("");
+
+	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	buf->g = g;
+
+	err = gk20a_gmmu_alloc_vid(g, bytes, &buf->mem);
+	if (err)
+		goto err_kfree;
+
+	buf->dmabuf = gk20a_vidbuf_export(buf);
+	if (IS_ERR(buf->dmabuf)) {
+		err = PTR_ERR(buf->dmabuf);
+		goto err_bfree;
+	}
+
+	fd = get_unused_fd_flags(O_RDWR);
+	if (fd < 0) {
+		/* ->release frees what we have done */
+		dma_buf_put(buf->dmabuf);
+		return fd;
+	}
+
+	/* fclose() on this drops one ref, freeing the dma buf */
+	fd_install(fd, buf->dmabuf->file);
+
+	return fd;
+
+err_bfree:
+	gk20a_gmmu_free(g, &buf->mem);
+err_kfree:
+	kfree(buf);
+	return err;
+#else
+	return -ENOSYS;
+#endif
+}
+
 u64 gk20a_vm_map(struct vm_gk20a *vm,
 		struct dma_buf *dmabuf,
 		u64 offset_align,
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
index d7503948..5f0ce657 100644
--- a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -718,6 +718,8 @@ void gk20a_vm_mapping_batch_finish_locked(
 	struct vm_gk20a *vm, struct vm_gk20a_mapping_batch *batch);
 
+int gk20a_vidmem_buf_alloc(struct gk20a *g, size_t bytes);
+
 /* Note: batch may be NULL if map op is not part of a batch */
 int gk20a_vm_map_buffer(struct vm_gk20a *vm,
 			int dmabuf_fd,
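The exported buffer is an ordinary dma-buf, so another in-kernel driver can import it through the standard attach/map path; the map step is what lands in gk20a_vidbuf_map_dma_buf() above. A hypothetical consumer sketch (not part of this patch):

#include <linux/dma-buf.h>
#include <linux/err.h>

/* Import a vidmem dmabuf fd and map it for DMA; the caller must later
 * unmap/detach *out_attach and drop the dma_buf reference. */
static struct sg_table *import_vidmem(struct device *dev, int fd,
				      struct dma_buf_attachment **out_attach)
{
	struct dma_buf *dmabuf;
	struct dma_buf_attachment *attach;
	struct sg_table *sgt;

	dmabuf = dma_buf_get(fd);		/* takes a file reference */
	if (IS_ERR(dmabuf))
		return ERR_CAST(dmabuf);

	attach = dma_buf_attach(dmabuf, dev);
	if (IS_ERR(attach)) {
		dma_buf_put(dmabuf);
		return ERR_CAST(attach);
	}

	/* calls gk20a_vidbuf_map_dma_buf(), i.e. returns buf->mem.sgt */
	sgt = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL);
	if (IS_ERR(sgt)) {
		dma_buf_detach(dmabuf, attach);
		dma_buf_put(dmabuf);
		return ERR_CAST(sgt);
	}

	*out_attach = attach;
	return sgt;
}

Note that gk20a_vidbuf_map_dma_buf() hands the same sg_table to every attachment rather than duplicating it, so consumers must not free it; it lives until gk20a_vidbuf_release().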
diff --git a/include/uapi/linux/nvgpu.h b/include/uapi/linux/nvgpu.h
index aa950dfa..66ea05b3 100644
--- a/include/uapi/linux/nvgpu.h
+++ b/include/uapi/linux/nvgpu.h
@@ -196,6 +196,8 @@ struct nvgpu_gpu_characteristics {
 	__u32 default_graphics_preempt_mode; /* NVGPU_GRAPHICS_PREEMPTION_MODE_* */
 	__u32 default_compute_preempt_mode; /* NVGPU_COMPUTE_PREEMPTION_MODE_* */
 
+	__u64 local_video_memory_size; /* in bytes, non-zero only for dGPUs */
+
 	/* Notes:
 	   - This struct can be safely appended with new fields. However, always
 	     keep the structure size multiple of 8 and make sure that the binary
@@ -434,6 +436,72 @@ struct nvgpu_gpu_get_engine_info_args {
 	__u64 engine_info_buf_addr;
 };
 
+#define NVGPU_GPU_ALLOC_VIDMEM_FLAG_CONTIGUOUS		(1U << 0)
+
+/* CPU access and coherency flags (3 bits). Use CPU access with care,
+ * BAR resources are scarce. */
+#define NVGPU_GPU_ALLOC_VIDMEM_FLAG_CPU_NOT_MAPPABLE	(0U << 1)
+#define NVGPU_GPU_ALLOC_VIDMEM_FLAG_CPU_WRITE_COMBINE	(1U << 1)
+#define NVGPU_GPU_ALLOC_VIDMEM_FLAG_CPU_CACHED		(2U << 1)
+#define NVGPU_GPU_ALLOC_VIDMEM_FLAG_CPU_MASK		(7U << 1)
+
+#define NVGPU_GPU_ALLOC_VIDMEM_FLAG_VPR			(1U << 4)
+
+/* Allocation of device-specific local video memory. Returns dmabuf fd
+ * on success. */
+struct nvgpu_gpu_alloc_vidmem_args {
+	union {
+		struct {
+			/* Size for allocation. Must be a multiple of
+			 * small page size. */
+			__u64 size;
+
+			/* NVGPU_GPU_ALLOC_VIDMEM_FLAG_* */
+			__u32 flags;
+
+			/* Informational mem tag for resource usage
+			 * tracking. */
+			__u16 memtag;
+
+			__u16 reserved0;
+
+			/* GPU-visible physical memory alignment in
+			 * bytes.
+			 *
+			 * Alignment must be a power of two. Minimum
+			 * alignment is the small page size, which 0
+			 * also denotes.
+			 *
+			 * For contiguous and non-contiguous
+			 * allocations, the start address of the
+			 * physical memory allocation will be aligned
+			 * by this value.
+			 *
+			 * For non-contiguous allocations, memory is
+			 * internally allocated in round_up(size /
+			 * alignment) contiguous blocks. The start
+			 * address of each block is aligned by the
+			 * alignment value. If the size is not a
+			 * multiple of alignment (which is ok), the
+			 * last allocation block size is (size %
+			 * alignment).
+			 *
+			 * By specifying the big page size here and
+			 * allocation size that is a multiple of big
+			 * pages, it will be guaranteed that the
+			 * allocated buffer is big page size mappable.
+			 */
+			__u32 alignment;
+
+			__u32 reserved1[3];
+		} in;
+
+		struct {
+			__s32 dmabuf_fd;
+		} out;
+	};
+};
+
 #define NVGPU_GPU_IOCTL_ZCULL_GET_CTX_SIZE \
 	_IOR(NVGPU_GPU_IOCTL_MAGIC, 1, struct nvgpu_gpu_zcull_get_ctx_size_args)
 #define NVGPU_GPU_IOCTL_ZCULL_GET_INFO \
@@ -489,8 +557,11 @@ struct nvgpu_gpu_get_engine_info_args {
 #define NVGPU_GPU_IOCTL_GET_ENGINE_INFO \
 	_IOWR(NVGPU_GPU_IOCTL_MAGIC, 26, \
 			struct nvgpu_gpu_get_engine_info_args)
+#define NVGPU_GPU_IOCTL_ALLOC_VIDMEM \
+	_IOWR(NVGPU_GPU_IOCTL_MAGIC, 27, \
+			struct nvgpu_gpu_alloc_vidmem_args)
 #define NVGPU_GPU_IOCTL_LAST \
-	_IOC_NR(NVGPU_GPU_IOCTL_GET_ENGINE_INFO)
+	_IOC_NR(NVGPU_GPU_IOCTL_ALLOC_VIDMEM)
 
 #define NVGPU_GPU_IOCTL_MAX_ARG_SIZE \
 	sizeof(struct nvgpu_gpu_get_cpu_time_correlation_info_args)
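To make the alignment semantics above concrete, a worked example (illustration only, not part of this patch): a non-contiguous allocation of 200 KiB with 64 KiB alignment is carved into round_up(200 KiB / 64 KiB) = 4 blocks, each starting on a 64 KiB boundary, with the last block holding the 200 KiB % 64 KiB = 8 KiB remainder.

#include <stdio.h>

int main(void)
{
	unsigned long size = 200 * 1024;	/* 4K multiple, not a 64K multiple */
	unsigned long align = 64 * 1024;	/* power of two */
	unsigned long blocks = (size + align - 1) / align;	/* round_up */
	unsigned long last = size % align ? size % align : align;

	/* prints "4 blocks, last block 8192 bytes" */
	printf("%lu blocks, last block %lu bytes\n", blocks, last);
	return 0;
}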