author     Alexandre Courbot <acourbot@nvidia.com>    2015-02-20 04:23:04 -0500
committer  Ben Skeggs <bskeggs@redhat.com>            2015-04-14 03:00:45 -0400
commit     a7f6da6e758cd99fcae918b63549273893983189 (patch)
tree       2aa072dd29a1f6272749eff16f462c35a9e6cb37 /drivers/gpu
parent     58fd9375c2c5344e8ab6ef9971635bc59cd39658 (diff)
drm/nouveau/instmem/gk20a: add IOMMU support
Let GK20A's instmem take advantage of the IOMMU if it is present. Having an
IOMMU means that instmem is no longer allocated using the DMA API, but instead
obtained through page_alloc and made contiguous to the GPU by IOMMU mappings.

Signed-off-by: Alexandre Courbot <acourbot@nvidia.com>
Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
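Editor's note: the core idea behind the IOMMU path in this patch is to allocate
ordinary system pages and map them back to back into an IOMMU domain, so the GPU
sees one contiguous range even though the pages are scattered in physical memory.
Below is a minimal sketch of that pattern using the generic kernel IOMMU API; the
helper name map_pages_contiguous() and its simplified error handling are
illustrative assumptions, not code from the patch, which additionally reserves the
GPU address range from an allocator and tags it with bit 34 (see the diff).

#include <linux/gfp.h>
#include <linux/io.h>
#include <linux/iommu.h>
#include <linux/mm.h>

/*
 * Illustrative sketch only (not part of the patch): make 'npages'
 * discontiguous system pages appear contiguous to the device by mapping
 * them back to back into an IOMMU domain starting at 'iova'.
 */
static int map_pages_contiguous(struct iommu_domain *domain,
                                unsigned long iova, struct page **pages,
                                unsigned int npages)
{
        unsigned int i;
        int ret;

        for (i = 0; i < npages; i++) {
                pages[i] = alloc_page(GFP_KERNEL);
                if (!pages[i]) {
                        ret = -ENOMEM;
                        goto err;
                }

                ret = iommu_map(domain, iova + i * PAGE_SIZE,
                                page_to_phys(pages[i]), PAGE_SIZE,
                                IOMMU_READ | IOMMU_WRITE);
                if (ret) {
                        __free_page(pages[i]);
                        goto err;
                }
        }
        return 0;

err:
        /* unwind: unmap and free everything set up so far */
        while (i--) {
                iommu_unmap(domain, iova + i * PAGE_SIZE, PAGE_SIZE);
                __free_page(pages[i]);
        }
        return ret;
}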
Diffstat (limited to 'drivers/gpu')
-rw-r--r--  drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c | 291
1 file changed, 252 insertions(+), 39 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c
index 175ac187d382..fcba72eb74a3 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c
@@ -20,12 +20,32 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
+/*
+ * GK20A does not have dedicated video memory, and to accurately represent this
+ * fact Nouveau will not create a RAM device for it. Therefore its instmem
+ * implementation must be done directly on top of system memory, while providing
+ * coherent read and write operations.
+ *
+ * Instmem can be allocated through two means:
+ * 1) If an IOMMU mapping has been probed, the IOMMU API is used to make memory
+ *    pages contiguous to the GPU. This is the preferred way.
+ * 2) If no IOMMU mapping is probed, the DMA API is used to allocate physically
+ *    contiguous memory.
+ *
+ * In both cases CPU reads and writes are performed using PRAMIN (i.e. using the
+ * GPU path) to ensure these operations are coherent for the GPU. This allows us
+ * to use more "relaxed" allocation parameters when using the DMA API, since we
+ * never need a kernel mapping.
+ */
+
 #include <subdev/fb.h>
 #include <core/mm.h>
 #include <core/device.h>
 
 #ifdef __KERNEL__
 #include <linux/dma-attrs.h>
+#include <linux/iommu.h>
+#include <nouveau_platform.h>
 #endif
 
 #include "priv.h"
@@ -36,18 +56,53 @@ struct gk20a_instobj_priv {
         struct nvkm_mem *mem;
         /* Pointed by mem */
         struct nvkm_mem _mem;
+};
+
+/*
+ * Used for objects allocated using the DMA API
+ */
+struct gk20a_instobj_dma {
+        struct gk20a_instobj_priv base;
+
         void *cpuaddr;
         dma_addr_t handle;
         struct nvkm_mm_node r;
 };
 
+/*
+ * Used for objects flattened using the IOMMU API
+ */
+struct gk20a_instobj_iommu {
+        struct gk20a_instobj_priv base;
+
+        /* array of base.mem->size pages */
+        struct page *pages[];
+};
+
 struct gk20a_instmem_priv {
         struct nvkm_instmem base;
         spinlock_t lock;
         u64 addr;
+
+        /* Only used if IOMMU is present */
+        struct mutex *mm_mutex;
+        struct nvkm_mm *mm;
+        struct iommu_domain *domain;
+        unsigned long iommu_pgshift;
+
+        /* Only used by DMA API */
         struct dma_attrs attrs;
 };
 
+/*
+ * Use PRAMIN to read/write data and avoid coherency issues.
+ * PRAMIN uses the GPU path and ensures data will always be coherent.
+ *
+ * A dynamic mapping based solution would be desirable in the future, but
+ * the issue remains of how to maintain coherency efficiently. On ARM it is
+ * not easy (if possible at all?) to create uncached temporary mappings.
+ */
+
 static u32
 gk20a_instobj_rd32(struct nvkm_object *object, u64 offset)
 {
@@ -87,50 +142,79 @@ gk20a_instobj_wr32(struct nvkm_object *object, u64 offset, u32 data)
 }
 
 static void
-gk20a_instobj_dtor(struct nvkm_object *object)
+gk20a_instobj_dtor_dma(struct gk20a_instobj_priv *_node)
 {
-        struct gk20a_instobj_priv *node = (void *)object;
+        struct gk20a_instobj_dma *node = (void *)_node;
         struct gk20a_instmem_priv *priv = (void *)nvkm_instmem(node);
         struct device *dev = nv_device_base(nv_device(priv));
 
         if (unlikely(!node->handle))
                 return;
 
-        dma_free_attrs(dev, node->mem->size << PAGE_SHIFT, node->cpuaddr,
+        dma_free_attrs(dev, _node->mem->size << PAGE_SHIFT, node->cpuaddr,
                        node->handle, &priv->attrs);
+}
+
+static void
+gk20a_instobj_dtor_iommu(struct gk20a_instobj_priv *_node)
+{
+        struct gk20a_instobj_iommu *node = (void *)_node;
+        struct gk20a_instmem_priv *priv = (void *)nvkm_instmem(node);
+        struct nvkm_mm_node *r;
+        int i;
+
+        if (unlikely(list_empty(&_node->mem->regions)))
+                return;
+
+        r = list_first_entry(&_node->mem->regions, struct nvkm_mm_node,
+                             rl_entry);
+
+        /* clear bit 34 to unmap pages */
+        r->offset &= ~BIT(34 - priv->iommu_pgshift);
+
+        /* Unmap pages from GPU address space and free them */
+        for (i = 0; i < _node->mem->size; i++) {
+                iommu_unmap(priv->domain,
+                            (r->offset + i) << priv->iommu_pgshift, PAGE_SIZE);
+                __free_page(node->pages[i]);
+        }
+
+        /* Release area from GPU address space */
+        mutex_lock(priv->mm_mutex);
+        nvkm_mm_free(priv->mm, &r);
+        mutex_unlock(priv->mm_mutex);
+}
+
+static void
+gk20a_instobj_dtor(struct nvkm_object *object)
+{
+        struct gk20a_instobj_priv *node = (void *)object;
+        struct gk20a_instmem_priv *priv = (void *)nvkm_instmem(node);
+
+        if (priv->domain)
+                gk20a_instobj_dtor_iommu(node);
+        else
+                gk20a_instobj_dtor_dma(node);
 
         nvkm_instobj_destroy(&node->base);
 }
 
 static int
-gk20a_instobj_ctor(struct nvkm_object *parent, struct nvkm_object *engine,
-                   struct nvkm_oclass *oclass, void *data, u32 _size,
-                   struct nvkm_object **pobject)
+gk20a_instobj_ctor_dma(struct nvkm_object *parent, struct nvkm_object *engine,
+                       struct nvkm_oclass *oclass, u32 npages, u32 align,
+                       struct gk20a_instobj_priv **_node)
 {
-        struct nvkm_instobj_args *args = data;
+        struct gk20a_instobj_dma *node;
         struct gk20a_instmem_priv *priv = (void *)nvkm_instmem(parent);
-        struct device *dev = nv_device_base(nv_device(priv));
-        struct gk20a_instobj_priv *node;
-        u32 size, align;
-        u32 npages;
+        struct device *dev = nv_device_base(nv_device(parent));
         int ret;
 
-        nv_debug(parent, "%s: size: %x align: %x\n", __func__,
-                 args->size, args->align);
-
-        size = max((args->size + 4095) & ~4095, (u32)4096);
-        align = max((args->align + 4095) & ~4095, (u32)4096);
-
-        npages = size >> PAGE_SHIFT;
-
         ret = nvkm_instobj_create_(parent, engine, oclass, sizeof(*node),
                                    (void **)&node);
-        *pobject = nv_object(node);
+        *_node = &node->base;
         if (ret)
                 return ret;
 
-        node->mem = &node->_mem;
-
         node->cpuaddr = dma_alloc_attrs(dev, npages << PAGE_SHIFT,
                                         &node->handle, GFP_KERNEL,
                                         &priv->attrs);
@@ -144,16 +228,132 @@ gk20a_instobj_ctor(struct nvkm_object *parent, struct nvkm_object *engine,
         nv_warn(priv, "memory not aligned as requested: %pad (0x%x)\n",
                 &node->handle, align);
 
-        node->mem->offset = node->handle;
+        /* present memory for being mapped using small pages */
+        node->r.type = 12;
+        node->r.offset = node->handle >> 12;
+        node->r.length = (npages << PAGE_SHIFT) >> 12;
+
+        node->base._mem.offset = node->handle;
+
+        INIT_LIST_HEAD(&node->base._mem.regions);
+        list_add_tail(&node->r.rl_entry, &node->base._mem.regions);
+
+        return 0;
+}
+
+static int
+gk20a_instobj_ctor_iommu(struct nvkm_object *parent, struct nvkm_object *engine,
+                         struct nvkm_oclass *oclass, u32 npages, u32 align,
+                         struct gk20a_instobj_priv **_node)
+{
+        struct gk20a_instobj_iommu *node;
+        struct gk20a_instmem_priv *priv = (void *)nvkm_instmem(parent);
+        struct nvkm_mm_node *r;
+        int ret;
+        int i;
+
+        ret = nvkm_instobj_create_(parent, engine, oclass,
+                                   sizeof(*node) + sizeof(node->pages[0]) * npages,
+                                   (void **)&node);
+        *_node = &node->base;
+        if (ret)
+                return ret;
+
+        /* Allocate backing memory */
+        for (i = 0; i < npages; i++) {
+                struct page *p = alloc_page(GFP_KERNEL);
+
+                if (p == NULL) {
+                        ret = -ENOMEM;
+                        goto free_pages;
+                }
+                node->pages[i] = p;
+        }
+
+        mutex_lock(priv->mm_mutex);
+        /* Reserve area from GPU address space */
+        ret = nvkm_mm_head(priv->mm, 0, 1, npages, npages,
+                           align >> priv->iommu_pgshift, &r);
+        mutex_unlock(priv->mm_mutex);
+        if (ret) {
+                nv_error(priv, "virtual space is full!\n");
+                goto free_pages;
+        }
+
+        /* Map into GPU address space */
+        for (i = 0; i < npages; i++) {
+                struct page *p = node->pages[i];
+                u32 offset = (r->offset + i) << priv->iommu_pgshift;
+
+                ret = iommu_map(priv->domain, offset, page_to_phys(p),
+                                PAGE_SIZE, IOMMU_READ | IOMMU_WRITE);
+                if (ret < 0) {
+                        nv_error(priv, "IOMMU mapping failure: %d\n", ret);
+
+                        while (i-- > 0) {
+                                offset -= PAGE_SIZE;
+                                iommu_unmap(priv->domain, offset, PAGE_SIZE);
+                        }
+                        goto release_area;
+                }
+        }
+
+        /* Bit 34 tells that an address is to be resolved through the IOMMU */
+        r->offset |= BIT(34 - priv->iommu_pgshift);
+
+        node->base._mem.offset = ((u64)r->offset) << priv->iommu_pgshift;
+
+        INIT_LIST_HEAD(&node->base._mem.regions);
+        list_add_tail(&r->rl_entry, &node->base._mem.regions);
+
+        return 0;
+
+release_area:
+        mutex_lock(priv->mm_mutex);
+        nvkm_mm_free(priv->mm, &r);
+        mutex_unlock(priv->mm_mutex);
+
+free_pages:
+        for (i = 0; i < npages && node->pages[i] != NULL; i++)
+                __free_page(node->pages[i]);
+
+        return ret;
+}
+
+static int
+gk20a_instobj_ctor(struct nvkm_object *parent, struct nvkm_object *engine,
+                   struct nvkm_oclass *oclass, void *data, u32 _size,
+                   struct nvkm_object **pobject)
+{
+        struct nvkm_instobj_args *args = data;
+        struct gk20a_instmem_priv *priv = (void *)nvkm_instmem(parent);
+        struct gk20a_instobj_priv *node;
+        u32 size, align;
+        int ret;
+
+        nv_debug(parent, "%s (%s): size: %x align: %x\n", __func__,
+                 priv->domain ? "IOMMU" : "DMA", args->size, args->align);
+
+        /* Round size and align to page bounds */
+        size = max(roundup(args->size, PAGE_SIZE), PAGE_SIZE);
+        align = max(roundup(args->align, PAGE_SIZE), PAGE_SIZE);
+
+        if (priv->domain)
+                ret = gk20a_instobj_ctor_iommu(parent, engine, oclass,
+                                               size >> PAGE_SHIFT, align, &node);
+        else
+                ret = gk20a_instobj_ctor_dma(parent, engine, oclass,
+                                             size >> PAGE_SHIFT, align, &node);
+        *pobject = nv_object(node);
+        if (ret)
+                return ret;
+
+        node->mem = &node->_mem;
+
+        /* present memory for being mapped using small pages */
         node->mem->size = size >> 12;
         node->mem->memtype = 0;
         node->mem->page_shift = 12;
-        INIT_LIST_HEAD(&node->mem->regions);
-
-        node->r.type = 12;
-        node->r.offset = node->handle >> 12;
-        node->r.length = npages;
-        list_add_tail(&node->r.rl_entry, &node->mem->regions);
 
         node->base.addr = node->mem->offset;
         node->base.size = size;
@@ -192,6 +392,7 @@ gk20a_instmem_ctor(struct nvkm_object *parent, struct nvkm_object *engine,
                    struct nvkm_object **pobject)
 {
         struct gk20a_instmem_priv *priv;
+        struct nouveau_platform_device *plat;
         int ret;
 
         ret = nvkm_instmem_create(parent, engine, oclass, &priv);
@@ -201,15 +402,27 @@ gk20a_instmem_ctor(struct nvkm_object *parent, struct nvkm_object *engine,
 
         spin_lock_init(&priv->lock);
 
-        init_dma_attrs(&priv->attrs);
-        /*
-         * We will access instmem through PRAMIN and thus do not need a
-         * consistent CPU pointer or kernel mapping
-         */
-        dma_set_attr(DMA_ATTR_NON_CONSISTENT, &priv->attrs);
-        dma_set_attr(DMA_ATTR_WEAK_ORDERING, &priv->attrs);
-        dma_set_attr(DMA_ATTR_WRITE_COMBINE, &priv->attrs);
-        dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &priv->attrs);
+        plat = nv_device_to_platform(nv_device(parent));
+        if (plat->gpu->iommu.domain) {
+                priv->domain = plat->gpu->iommu.domain;
+                priv->mm = plat->gpu->iommu.mm;
+                priv->iommu_pgshift = plat->gpu->iommu.pgshift;
+                priv->mm_mutex = &plat->gpu->iommu.mutex;
+
+                nv_info(priv, "using IOMMU\n");
+        } else {
+                init_dma_attrs(&priv->attrs);
+                /*
+                 * We will access instmem through PRAMIN and thus do not need a
+                 * consistent CPU pointer or kernel mapping
+                 */
+                dma_set_attr(DMA_ATTR_NON_CONSISTENT, &priv->attrs);
+                dma_set_attr(DMA_ATTR_WEAK_ORDERING, &priv->attrs);
+                dma_set_attr(DMA_ATTR_WRITE_COMBINE, &priv->attrs);
+                dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &priv->attrs);
+
+                nv_info(priv, "using DMA API\n");
+        }
 
         return 0;
 }
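
Editor's note: one detail worth calling out from the hunks above is the bit 34
convention. Region offsets handed to the rest of instmem are expressed in
IOMMU-page units, and the bit corresponding to GPU-address bit 34 is set to mark
them as addresses to be resolved through the IOMMU (and cleared again before
unmapping). The helpers below merely restate that encoding; gk20a_iommu_tag()
and gk20a_iommu_untag() are hypothetical names that do not exist in the driver,
which simply ORs/ANDs the bit in place on r->offset.

#include <linux/bitops.h>
#include <linux/types.h>

/*
 * Hypothetical helpers (not in the driver) restating the encoding used in
 * the patch: 'offset' is a GPU address expressed in units of one IOMMU
 * page (1 << iommu_pgshift), so setting bit (34 - iommu_pgshift) of the
 * offset sets bit 34 of the resulting byte address, which marks it as an
 * address to be resolved through the IOMMU.
 */
static inline u64 gk20a_iommu_tag(u64 offset, unsigned long iommu_pgshift)
{
        return offset | BIT_ULL(34 - iommu_pgshift);
}

static inline u64 gk20a_iommu_untag(u64 offset, unsigned long iommu_pgshift)
{
        return offset & ~BIT_ULL(34 - iommu_pgshift);
}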