ARM: 8505/1: dma-mapping: Optimize allocation

The __iommu_alloc_buffer() is expected to be called to allocate pretty sizeable buffers. Upon simple tests of video I saw it trying to allocate 4,194,304 bytes. The function tries to allocate large chunks in order to optimize IOMMU TLB usage. The current function is very, very slow. One problem is the way it keeps trying and trying to allocate big chunks. Imagine a very fragmented memory that has 4M free but no contiguous pages at all. Further imagine allocating 4M (1024 pages). We'll do the following memory allocations: - For page 1: - Try to allocate order 10 (no retry) - Try to allocate order 9 (no retry) - ... - Try to allocate order 0 (with retry, but not needed) - For page 2: - Try to allocate order 9 (no retry) - Try to allocate order 8 (no retry) - ... - Try to allocate order 0 (with retry, but not needed) - ... - ... Total number of alloc() calls for this case is: sum(int(math.log(i, 2)) + 1 for i in range(1, 1025)) => 9228 The above is obviously the worst case, but given how slow alloc can be we really want to try to avoid even somewhat bad cases. I timed the old code with a device under memory pressure and it wasn't hard to see it take more than 120 seconds to allocate 4 megs of memory! (NOTE: testing was done on kernel 3.14, so possibly mainline would behave differently). A second problem is that allocating big chunks under memory pressure when we don't need them is just not a great idea anyway unless we really need them. We can make do pretty well with smaller chunks so it's probably wise to leave bigger chunks for other users once memory pressure is on. Let's adjust the allocation like this: 1. If a big chunk fails, stop trying too hard and bump down to lower order allocations. 2. Don't try useless orders. The whole point of big chunks is to optimize the TLB and it can really only make use of 2M, 1M, 64K and 4K sizes. We'll still tend to eat up a bunch of big chunks, but that might be the right answer for some users. 
A future patch could possibly add a new DMA_ATTR that would let the caller decide that TLB optimization isn't important and that we should use smaller chunks. Presumably this would be a sane strategy for some callers. Signed-off-by: Douglas Anderson <dianders@chromium.org> Acked-by: Marek Szyprowski <m.szyprowski@samsung.com> Reviewed-by: Robin Murphy <robin.murphy@arm.com> Reviewed-by: Tomasz Figa <tfiga@chromium.org> Tested-by: Javier Martinez Canillas <javier@osg.samsung.com> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
author: Doug Anderson <armlinux@m.disordat.com> 2016-01-29 17:06:08 -0500
committer: Russell King <rmk+kernel@arm.linux.org.uk> 2016-02-11 10:33:37 -0500
commit: 33298ef6d8ddef57aaa1d11ed53fc08bef2f95aa (patch)
tree: 7132e479a85ce134050798c9750027e663bc9684
parent: 73e592f3bc2cdc68df9dbc92e681b61f9bc6c2bf (diff)
1 files changed, 20 insertions, 14 deletions
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 0eca3812527e..bc9cebfa0891 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -1122,6 +1122,9 @@ static inline void __free_iova(struct dma_iommu_mapping *mapping,
        spin_unlock_irqrestore(&mapping->lock, flags);
 }
+/* We'll try 2M, 1M, 64K, and finally 4K; array must end with 0! */
+static const int iommu_order_array[] = { 9, 8, 4, 0 };
 static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
                                          gfp_t gfp, struct dma_attrs *attrs)
 {
@@ -1129,6 +1132,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
        int count = size >> PAGE_SHIFT;
        int array_size = count * sizeof(struct page *);
        int i = 0;
+        int order_idx = 0;
        if (array_size <= PAGE_SIZE)
                pages = kzalloc(array_size, GFP_KERNEL);
@@ -1162,22 +1166,24 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
        while (count) {
                int j, order;
-                for (order = __fls(count); order > 0; --order) {
+                order = iommu_order_array[order_idx];
-                        /*
-                         * We do not want OOM killer to be invoked as long
+                /* Drop down when we get small */
-                         * as we can fall back to single pages, so we force
+                if (__fls(count) < order) {
-                         * __GFP_NORETRY for orders higher than zero.
+                        order_idx++;
-                         */
+                        continue;
-                        pages[i] = alloc_pages(gfp | __GFP_NORETRY, order);
-                        if (pages[i])
-                                break;
                }
-                if (!pages[i]) {
+                if (order) {
-                        /*
+                        /* See if it's easy to allocate a high-order chunk */
-                         * Fall back to single page allocation.
+                        pages[i] = alloc_pages(gfp | __GFP_NORETRY, order);
-                         * Might invoke OOM killer as last resort.
-                         */
+                        /* Go down a notch at first sign of pressure */
+                        if (!pages[i]) {
+                                order_idx++;
+                                continue;
+                        }
+                } else {
                        pages[i] = alloc_pages(gfp, 0);
                        if (!pages[i])
                                goto error;
author	Doug Anderson <armlinux@m.disordat.com>	2016-01-29 17:06:08 -0500
committer	Russell King <rmk+kernel@arm.linux.org.uk>	2016-02-11 10:33:37 -0500
commit	33298ef6d8ddef57aaa1d11ed53fc08bef2f95aa (patch)
tree	7132e479a85ce134050798c9750027e663bc9684
parent	73e592f3bc2cdc68df9dbc92e681b61f9bc6c2bf (diff)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 0eca3812527e..bc9cebfa0891 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c
@@ -1122,6 +1122,9 @@ static inline void __free_iova(struct dma_iommu_mapping *mapping,
1122	spin_unlock_irqrestore(&mapping->lock, flags);	1122	spin_unlock_irqrestore(&mapping->lock, flags);
1123	}	1123	}
1124		1124
		1125	/* We'll try 2M, 1M, 64K, and finally 4K; array must end with 0! */
		1126	static const int iommu_order_array[] = { 9, 8, 4, 0 };
		1127
1125	static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,	1128	static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
1126	gfp_t gfp, struct dma_attrs *attrs)	1129	gfp_t gfp, struct dma_attrs *attrs)
1127	{	1130	{
@@ -1129,6 +1132,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
1129	int count = size >> PAGE_SHIFT;	1132	int count = size >> PAGE_SHIFT;
1130	int array_size = count * sizeof(struct page *);	1133	int array_size = count * sizeof(struct page *);
1131	int i = 0;	1134	int i = 0;
		1135	int order_idx = 0;
1132		1136
1133	if (array_size <= PAGE_SIZE)	1137	if (array_size <= PAGE_SIZE)
1134	pages = kzalloc(array_size, GFP_KERNEL);	1138	pages = kzalloc(array_size, GFP_KERNEL);
@@ -1162,22 +1166,24 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
1162	while (count) {	1166	while (count) {
1163	int j, order;	1167	int j, order;
1164		1168
1165	for (order = __fls(count); order > 0; --order) {	1169	order = iommu_order_array[order_idx];
1166	/*	1170
1167	* We do not want OOM killer to be invoked as long	1171	/* Drop down when we get small */
1168	* as we can fall back to single pages, so we force	1172	if (__fls(count) < order) {
1169	* __GFP_NORETRY for orders higher than zero.	1173	order_idx++;
1170	*/	1174	continue;
1171	pages[i] = alloc_pages(gfp | __GFP_NORETRY, order);
1172	if (pages[i])
1173	break;
1174	}	1175	}
1175		1176
1176	if (!pages[i]) {	1177	if (order) {
1177	/*	1178	/* See if it's easy to allocate a high-order chunk */
1178	* Fall back to single page allocation.	1179	pages[i] = alloc_pages(gfp | __GFP_NORETRY, order);
1179	* Might invoke OOM killer as last resort.	1180
1180	*/	1181	/* Go down a notch at first sign of pressure */
		1182	if (!pages[i]) {
		1183	order_idx++;
		1184	continue;
		1185	}
		1186	} else {
1181	pages[i] = alloc_pages(gfp, 0);	1187	pages[i] = alloc_pages(gfp, 0);
1182	if (!pages[i])	1188	if (!pages[i])
1183	goto error;	1189	goto error;