author		Linus Torvalds <torvalds@linux-foundation.org>	2014-08-07 00:14:42 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-08-07 00:14:42 -0400
commit		33caee39925b887a99a2400dc5c980097c3573f9 (patch)
tree		8e68ad97e1fee88c4a3f31453041f8d139f2027e /mm
parent		6456a0438b984186a0c9c8ecc9fe3d97b7ac3613 (diff)
parent		f84223087402c45179be5e7060c5736c17a7b271 (diff)
Merge branch 'akpm' (patchbomb from Andrew Morton)
Merge incoming from Andrew Morton:
- Various misc things.
- arch/sh updates.
- Part of ocfs2. Review is slow.
- Slab updates.
- Most of -mm.
- printk updates.
- lib/ updates.
- checkpatch updates.
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (226 commits)
checkpatch: update $declaration_macros, add uninitialized_var
checkpatch: warn on missing spaces in broken up quoted strings
checkpatch: fix false positives for --strict "space after cast" test
checkpatch: fix false positive MISSING_BREAK warnings with --file
checkpatch: add test for native c90 types in unusual order
checkpatch: add signed generic types
checkpatch: add short int to c variable types
checkpatch: add for_each tests to indentation and brace tests
checkpatch: fix brace style misuses of else and while
checkpatch: add --fix option for a couple OPEN_BRACE misuses
checkpatch: use the correct indentation for which()
checkpatch: add fix_insert_line and fix_delete_line helpers
checkpatch: add ability to insert and delete lines to patch/file
checkpatch: add an index variable for fixed lines
checkpatch: warn on break after goto or return with same tab indentation
checkpatch: emit a warning on file add/move/delete
checkpatch: add test for commit id formatting style in commit log
checkpatch: emit fewer kmalloc_array/kcalloc conversion warnings
checkpatch: improve "no space after cast" test
checkpatch: allow multiple const * types
...
Diffstat (limited to 'mm')
 mm/Kconfig           |  54
 mm/Makefile          |   2
 mm/cma.c             | 335
 mm/filemap.c         |  27
 mm/gup.c             |  18
 mm/highmem.c         |  86
 mm/huge_memory.c     |  38
 mm/hugetlb.c         | 129
 mm/hwpoison-inject.c |   3
 mm/internal.h        |   2
 mm/madvise.c         |   3
 mm/memcontrol.c      | 416
 mm/memory-failure.c  |  10
 mm/memory.c          |  70
 mm/memory_hotplug.c  |  45
 mm/mlock.c           |   9
 mm/mmap.c            |   5
 mm/mmu_notifier.c    |  40
 mm/oom_kill.c        |  34
 mm/page-writeback.c  |   5
 mm/page_alloc.c      | 159
 mm/readahead.c       |   3
 mm/shmem.c           |  39
 mm/slab.c            | 514
 mm/slab.h            |  24
 mm/slab_common.c     | 101
 mm/slub.c            | 221
 mm/swap.c            |  18
 mm/util.c            | 102
 mm/vmalloc.c         |  30
 mm/vmscan.c          | 274
 mm/vmstat.c          |   9
 mm/zbud.c            |  98
 mm/zpool.c           | 364
 mm/zsmalloc.c        |  86
 mm/zswap.c           |  75
 36 files changed, 2141 insertions, 1307 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 3e9977a9d657..886db2158538 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -508,21 +508,34 @@ config CMA_DEBUG
           processing calls such as dma_alloc_from_contiguous().
           This option does not affect warning and error messages.
 
-config ZBUD
-        tristate
-        default n
+config CMA_AREAS
+        int "Maximum count of the CMA areas"
+        depends on CMA
+        default 7
         help
-          A special purpose allocator for storing compressed pages.
-          It is designed to store up to two compressed pages per physical
-          page. While this design limits storage density, it has simple and
-          deterministic reclaim properties that make it preferable to a higher
-          density approach when reclaim will be used.
+          CMA allows to create CMA areas for particular purpose, mainly,
+          used as device private area. This parameter sets the maximum
+          number of CMA area in the system.
+
+          If unsure, leave the default value "7".
+
+config MEM_SOFT_DIRTY
+        bool "Track memory changes"
+        depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
+        select PROC_PAGE_MONITOR
+        help
+          This option enables memory changes tracking by introducing a
+          soft-dirty bit on pte-s. This bit it set when someone writes
+          into a page just as regular dirty bit, but unlike the latter
+          it can be cleared by hands.
+
+          See Documentation/vm/soft-dirty.txt for more details.
 
 config ZSWAP
         bool "Compressed cache for swap pages (EXPERIMENTAL)"
         depends on FRONTSWAP && CRYPTO=y
         select CRYPTO_LZO
-        select ZBUD
+        select ZPOOL
         default n
         help
           A lightweight compressed cache for swap pages. It takes
@@ -538,17 +551,22 @@ config ZSWAP
           they have not be fully explored on the large set of potential
           configurations and workloads that exist.
 
-config MEM_SOFT_DIRTY
-        bool "Track memory changes"
-        depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
-        select PROC_PAGE_MONITOR
+config ZPOOL
+        tristate "Common API for compressed memory storage"
+        default n
         help
-          This option enables memory changes tracking by introducing a
-          soft-dirty bit on pte-s. This bit it set when someone writes
-          into a page just as regular dirty bit, but unlike the latter
-          it can be cleared by hands.
+          Compressed memory storage API. This allows using either zbud or
+          zsmalloc.
 
-          See Documentation/vm/soft-dirty.txt for more details.
+config ZBUD
+        tristate "Low density storage for compressed pages"
+        default n
+        help
+          A special purpose allocator for storing compressed pages.
+          It is designed to store up to two compressed pages per physical
+          page. While this design limits storage density, it has simple and
+          deterministic reclaim properties that make it preferable to a higher
+          density approach when reclaim will be used.
 
 config ZSMALLOC
         tristate "Memory allocator for compressed pages"
diff --git a/mm/Makefile b/mm/Makefile
index 4064f3ec145e..632ae77e6070 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -59,6 +59,8 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
+obj-$(CONFIG_ZPOOL) += zpool.o
 obj-$(CONFIG_ZBUD) += zbud.o
 obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
 obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
+obj-$(CONFIG_CMA) += cma.o
diff --git a/mm/cma.c b/mm/cma.c
new file mode 100644
index 000000000000..c17751c0dcaf
--- /dev/null
+++ b/mm/cma.c
@@ -0,0 +1,335 @@
+/*
+ * Contiguous Memory Allocator
+ *
+ * Copyright (c) 2010-2011 by Samsung Electronics.
+ * Copyright IBM Corporation, 2013
+ * Copyright LG Electronics Inc., 2014
+ * Written by:
+ *	Marek Szyprowski <m.szyprowski@samsung.com>
+ *	Michal Nazarewicz <mina86@mina86.com>
+ *	Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *	Joonsoo Kim <iamjoonsoo.kim@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License or (at your optional) any later version of the license.
+ */
+
+#define pr_fmt(fmt) "cma: " fmt
+
+#ifdef CONFIG_CMA_DEBUG
+#ifndef DEBUG
+# define DEBUG
+#endif
+#endif
+
+#include <linux/memblock.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/log2.h>
+#include <linux/cma.h>
+
+struct cma {
+        unsigned long base_pfn;
+        unsigned long count;
+        unsigned long *bitmap;
+        unsigned int order_per_bit; /* Order of pages represented by one bit */
+        struct mutex lock;
+};
+
+static struct cma cma_areas[MAX_CMA_AREAS];
+static unsigned cma_area_count;
+static DEFINE_MUTEX(cma_mutex);
+
+phys_addr_t cma_get_base(struct cma *cma)
+{
+        return PFN_PHYS(cma->base_pfn);
+}
+
+unsigned long cma_get_size(struct cma *cma)
+{
+        return cma->count << PAGE_SHIFT;
+}
+
+static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
+{
+        return (1UL << (align_order >> cma->order_per_bit)) - 1;
+}
+
+static unsigned long cma_bitmap_maxno(struct cma *cma)
+{
+        return cma->count >> cma->order_per_bit;
+}
+
+static unsigned long cma_bitmap_pages_to_bits(struct cma *cma,
+                                              unsigned long pages)
+{
+        return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit;
+}
+
+static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count)
+{
+        unsigned long bitmap_no, bitmap_count;
+
+        bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit;
+        bitmap_count = cma_bitmap_pages_to_bits(cma, count);
+
+        mutex_lock(&cma->lock);
+        bitmap_clear(cma->bitmap, bitmap_no, bitmap_count);
+        mutex_unlock(&cma->lock);
+}
+
+static int __init cma_activate_area(struct cma *cma)
+{
+        int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long);
+        unsigned long base_pfn = cma->base_pfn, pfn = base_pfn;
+        unsigned i = cma->count >> pageblock_order;
+        struct zone *zone;
+
+        cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+
+        if (!cma->bitmap)
+                return -ENOMEM;
+
+        WARN_ON_ONCE(!pfn_valid(pfn));
+        zone = page_zone(pfn_to_page(pfn));
+
+        do {
+                unsigned j;
+
+                base_pfn = pfn;
+                for (j = pageblock_nr_pages; j; --j, pfn++) {
+                        WARN_ON_ONCE(!pfn_valid(pfn));
+                        /*
+                         * alloc_contig_range requires the pfn range
+                         * specified to be in the same zone. Make this
+                         * simple by forcing the entire CMA resv range
+                         * to be in the same zone.
+                         */
+                        if (page_zone(pfn_to_page(pfn)) != zone)
+                                goto err;
+                }
+                init_cma_reserved_pageblock(pfn_to_page(base_pfn));
+        } while (--i);
+
+        mutex_init(&cma->lock);
+        return 0;
+
+err:
+        kfree(cma->bitmap);
+        return -EINVAL;
+}
+
+static int __init cma_init_reserved_areas(void)
+{
+        int i;
+
+        for (i = 0; i < cma_area_count; i++) {
+                int ret = cma_activate_area(&cma_areas[i]);
+
+                if (ret)
+                        return ret;
+        }
+
+        return 0;
+}
+core_initcall(cma_init_reserved_areas);
+
+/**
+ * cma_declare_contiguous() - reserve custom contiguous area
+ * @base: Base address of the reserved area optional, use 0 for any
+ * @size: Size of the reserved area (in bytes),
+ * @limit: End address of the reserved memory (optional, 0 for any).
+ * @alignment: Alignment for the CMA area, should be power of 2 or zero
+ * @order_per_bit: Order of pages represented by one bit on bitmap.
+ * @fixed: hint about where to place the reserved area
+ * @res_cma: Pointer to store the created cma region.
+ *
+ * This function reserves memory from early allocator. It should be
+ * called by arch specific code once the early allocator (memblock or bootmem)
+ * has been activated and all other subsystems have already allocated/reserved
+ * memory. This function allows to create custom reserved areas.
+ *
+ * If @fixed is true, reserve contiguous area at exactly @base. If false,
+ * reserve in range from @base to @limit.
+ */
+int __init cma_declare_contiguous(phys_addr_t base,
+                        phys_addr_t size, phys_addr_t limit,
+                        phys_addr_t alignment, unsigned int order_per_bit,
+                        bool fixed, struct cma **res_cma)
+{
+        struct cma *cma;
+        int ret = 0;
+
+        pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n",
+                __func__, (unsigned long)size, (unsigned long)base,
+                (unsigned long)limit, (unsigned long)alignment);
+
+        if (cma_area_count == ARRAY_SIZE(cma_areas)) {
+                pr_err("Not enough slots for CMA reserved regions!\n");
+                return -ENOSPC;
+        }
+
+        if (!size)
+                return -EINVAL;
+
+        if (alignment && !is_power_of_2(alignment))
+                return -EINVAL;
+
+        /*
+         * Sanitise input arguments.
+         * Pages both ends in CMA area could be merged into adjacent unmovable
+         * migratetype page by page allocator's buddy algorithm. In the case,
+         * you couldn't get a contiguous memory, which is not what we want.
+         */
+        alignment = max(alignment,
+                (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order));
+        base = ALIGN(base, alignment);
+        size = ALIGN(size, alignment);
+        limit &= ~(alignment - 1);
+
+        /* size should be aligned with order_per_bit */
+        if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
+                return -EINVAL;
+
+        /* Reserve memory */
+        if (base && fixed) {
+                if (memblock_is_region_reserved(base, size) ||
+                    memblock_reserve(base, size) < 0) {
+                        ret = -EBUSY;
+                        goto err;
+                }
+        } else {
+                phys_addr_t addr = memblock_alloc_range(size, alignment, base,
+                                                        limit);
+                if (!addr) {
+                        ret = -ENOMEM;
+                        goto err;
+                } else {
+                        base = addr;
+                }
+        }
+
+        /*
+         * Each reserved area must be initialised later, when more kernel
+         * subsystems (like slab allocator) are available.
+         */
+        cma = &cma_areas[cma_area_count];
+        cma->base_pfn = PFN_DOWN(base);
+        cma->count = size >> PAGE_SHIFT;
+        cma->order_per_bit = order_per_bit;
+        *res_cma = cma;
+        cma_area_count++;
+
+        pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M,
+                (unsigned long)base);
+        return 0;
+
+err:
+        pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
+        return ret;
+}
+
+/**
+ * cma_alloc() - allocate pages from contiguous area
+ * @cma: Contiguous memory region for which the allocation is performed.
+ * @count: Requested number of pages.
+ * @align: Requested alignment of pages (in PAGE_SIZE order).
+ *
+ * This function allocates part of contiguous memory on specific
+ * contiguous memory area.
+ */
+struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
+{
+        unsigned long mask, pfn, start = 0;
+        unsigned long bitmap_maxno, bitmap_no, bitmap_count;
+        struct page *page = NULL;
+        int ret;
+
+        if (!cma || !cma->count)
+                return NULL;
+
+        pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma,
+                 count, align);
+
+        if (!count)
+                return NULL;
+
+        mask = cma_bitmap_aligned_mask(cma, align);
+        bitmap_maxno = cma_bitmap_maxno(cma);
+        bitmap_count = cma_bitmap_pages_to_bits(cma, count);
+
+        for (;;) {
+                mutex_lock(&cma->lock);
+                bitmap_no = bitmap_find_next_zero_area(cma->bitmap,
+                                bitmap_maxno, start, bitmap_count, mask);
+                if (bitmap_no >= bitmap_maxno) {
+                        mutex_unlock(&cma->lock);
+                        break;
+                }
+                bitmap_set(cma->bitmap, bitmap_no, bitmap_count);
+                /*
+                 * It's safe to drop the lock here. We've marked this region for
+                 * our exclusive use. If the migration fails we will take the
+                 * lock again and unmark it.
+                 */
+                mutex_unlock(&cma->lock);
+
+                pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
+                mutex_lock(&cma_mutex);
+                ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
+                mutex_unlock(&cma_mutex);
+                if (ret == 0) {
+                        page = pfn_to_page(pfn);
+                        break;
+                }
+
+                cma_clear_bitmap(cma, pfn, count);
+                if (ret != -EBUSY)
+                        break;
+
+                pr_debug("%s(): memory range at %p is busy, retrying\n",
+                         __func__, pfn_to_page(pfn));
+                /* try again with a bit different memory target */
+                start = bitmap_no + mask + 1;
+        }
+
+        pr_debug("%s(): returned %p\n", __func__, page);
+        return page;
+}
+
+/**
+ * cma_release() - release allocated pages
+ * @cma: Contiguous memory region for which the allocation is performed.
+ * @pages: Allocated pages.
+ * @count: Number of allocated pages.
+ *
+ * This function releases memory allocated by alloc_cma().
+ * It returns false when provided pages do not belong to contiguous area and
+ * true otherwise.
+ */
+bool cma_release(struct cma *cma, struct page *pages, int count)
+{
+        unsigned long pfn;
+
+        if (!cma || !pages)
+                return false;
+
+        pr_debug("%s(page %p)\n", __func__, (void *)pages);
+
+        pfn = page_to_pfn(pages);
+
+        if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
+                return false;
+
+        VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
+
+        free_contig_range(pfn, count);
+        cma_clear_bitmap(cma, pfn, count);
+
+        return true;
+}
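A short usage sketch may help map the kernel-doc above onto the call sequence an architecture or driver would use. This is not code from the tree: the "camera" names, sizes and init hook are invented for illustration; only the three entry points defined in mm/cma.c above (cma_declare_contiguous(), cma_alloc(), cma_release()) are real.

        /* sketch: board code reserving a CMA area early, a driver using it later */
        #include <linux/cma.h>
        #include <linux/sizes.h>

        static struct cma *camera_cma;  /* hypothetical example region */

        /* called from arch setup code, after memblock is up, before buddy init */
        static void __init camera_reserve_mem(void)
        {
                /* 64 MiB, any base, no limit, default alignment, order_per_bit 0 */
                if (cma_declare_contiguous(0, SZ_64M, 0, 0, 0, false, &camera_cma))
                        pr_warn("camera: CMA reservation failed\n");
        }

        /* driver side: a 1 MiB (256-page) buffer aligned to 1 MiB (order 8) */
        static struct page *camera_alloc_frame(void)
        {
                return cma_alloc(camera_cma, 256, 8);
        }

        static void camera_free_frame(struct page *pages)
        {
                cma_release(camera_cma, pages, 256);
        }

Note that cma_declare_contiguous() must run while memblock is still available; cma_init_reserved_areas() then activates the areas from a core_initcall(), as shown above.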
diff --git a/mm/filemap.c b/mm/filemap.c
index 65d44fd88c78..af19a6b079f5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -808,6 +808,17 @@ int __lock_page_killable(struct page *page) | |||
808 | } | 808 | } |
809 | EXPORT_SYMBOL_GPL(__lock_page_killable); | 809 | EXPORT_SYMBOL_GPL(__lock_page_killable); |
810 | 810 | ||
811 | /* | ||
812 | * Return values: | ||
813 | * 1 - page is locked; mmap_sem is still held. | ||
814 | * 0 - page is not locked. | ||
815 | * mmap_sem has been released (up_read()), unless flags had both | ||
816 | * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in | ||
817 | * which case mmap_sem is still held. | ||
818 | * | ||
819 | * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 | ||
820 | * with the page locked and the mmap_sem unperturbed. | ||
821 | */ | ||
811 | int __lock_page_or_retry(struct page *page, struct mm_struct *mm, | 822 | int __lock_page_or_retry(struct page *page, struct mm_struct *mm, |
812 | unsigned int flags) | 823 | unsigned int flags) |
813 | { | 824 | { |
@@ -1091,9 +1102,9 @@ no_page: | |||
1091 | if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) | 1102 | if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) |
1092 | fgp_flags |= FGP_LOCK; | 1103 | fgp_flags |= FGP_LOCK; |
1093 | 1104 | ||
1094 | /* Init accessed so avoit atomic mark_page_accessed later */ | 1105 | /* Init accessed so avoid atomic mark_page_accessed later */ |
1095 | if (fgp_flags & FGP_ACCESSED) | 1106 | if (fgp_flags & FGP_ACCESSED) |
1096 | init_page_accessed(page); | 1107 | __SetPageReferenced(page); |
1097 | 1108 | ||
1098 | err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); | 1109 | err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); |
1099 | if (unlikely(err)) { | 1110 | if (unlikely(err)) { |
@@ -1827,6 +1838,18 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, | |||
1827 | * The goto's are kind of ugly, but this streamlines the normal case of having | 1838 | * The goto's are kind of ugly, but this streamlines the normal case of having |
1828 | * it in the page cache, and handles the special cases reasonably without | 1839 | * it in the page cache, and handles the special cases reasonably without |
1829 | * having a lot of duplicated code. | 1840 | * having a lot of duplicated code. |
1841 | * | ||
1842 | * vma->vm_mm->mmap_sem must be held on entry. | ||
1843 | * | ||
1844 | * If our return value has VM_FAULT_RETRY set, it's because | ||
1845 | * lock_page_or_retry() returned 0. | ||
1846 | * The mmap_sem has usually been released in this case. | ||
1847 | * See __lock_page_or_retry() for the exception. | ||
1848 | * | ||
1849 | * If our return value does not have VM_FAULT_RETRY set, the mmap_sem | ||
1850 | * has not been released. | ||
1851 | * | ||
1852 | * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. | ||
1830 | */ | 1853 | */ |
1831 | int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 1854 | int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
1832 | { | 1855 | { |
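Since the new comments document a calling convention rather than a behaviour change, a hedged sketch of a typical caller may make them concrete (illustrative only, not from the tree; lock_page_or_retry() is the pagemap.h wrapper around __lock_page_or_retry()):

        /* sketch: how a fault path consumes the documented return values */
        if (!lock_page_or_retry(page, mm, flags)) {
                /*
                 * 0: page not locked. Unless both FAULT_FLAG_ALLOW_RETRY and
                 * FAULT_FLAG_RETRY_NOWAIT were set, mmap_sem has already been
                 * released for us, so just report the retry.
                 */
                return VM_FAULT_RETRY;
        }
        /* 1: page is locked and mmap_sem is still held */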
diff --git a/mm/gup.c b/mm/gup.c
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -258,6 +258,11 @@ unmap:
258 | return ret; | 258 | return ret; |
259 | } | 259 | } |
260 | 260 | ||
261 | /* | ||
262 | * mmap_sem must be held on entry. If @nonblocking != NULL and | ||
263 | * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released. | ||
264 | * If it is, *@nonblocking will be set to 0 and -EBUSY returned. | ||
265 | */ | ||
261 | static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, | 266 | static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, |
262 | unsigned long address, unsigned int *flags, int *nonblocking) | 267 | unsigned long address, unsigned int *flags, int *nonblocking) |
263 | { | 268 | { |
@@ -373,7 +378,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) | |||
373 | * with a put_page() call when it is finished with. vmas will only | 378 | * with a put_page() call when it is finished with. vmas will only |
374 | * remain valid while mmap_sem is held. | 379 | * remain valid while mmap_sem is held. |
375 | * | 380 | * |
376 | * Must be called with mmap_sem held for read or write. | 381 | * Must be called with mmap_sem held. It may be released. See below. |
377 | * | 382 | * |
378 | * __get_user_pages walks a process's page tables and takes a reference to | 383 | * __get_user_pages walks a process's page tables and takes a reference to |
379 | * each struct page that each user address corresponds to at a given | 384 | * each struct page that each user address corresponds to at a given |
@@ -396,7 +401,14 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) | |||
396 | * | 401 | * |
397 | * If @nonblocking != NULL, __get_user_pages will not wait for disk IO | 402 | * If @nonblocking != NULL, __get_user_pages will not wait for disk IO |
398 | * or mmap_sem contention, and if waiting is needed to pin all pages, | 403 | * or mmap_sem contention, and if waiting is needed to pin all pages, |
399 | * *@nonblocking will be set to 0. | 404 | * *@nonblocking will be set to 0. Further, if @gup_flags does not |
405 | * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in | ||
406 | * this case. | ||
407 | * | ||
408 | * A caller using such a combination of @nonblocking and @gup_flags | ||
409 | * must therefore hold the mmap_sem for reading only, and recognize | ||
410 | * when it's been released. Otherwise, it must be held for either | ||
411 | * reading or writing and will not be released. | ||
400 | * | 412 | * |
401 | * In most cases, get_user_pages or get_user_pages_fast should be used | 413 | * In most cases, get_user_pages or get_user_pages_fast should be used |
402 | * instead of __get_user_pages. __get_user_pages should be used only if | 414 | * instead of __get_user_pages. __get_user_pages should be used only if |
@@ -528,7 +540,7 @@ EXPORT_SYMBOL(__get_user_pages); | |||
528 | * such architectures, gup() will not be enough to make a subsequent access | 540 | * such architectures, gup() will not be enough to make a subsequent access |
529 | * succeed. | 541 | * succeed. |
530 | * | 542 | * |
531 | * This should be called with the mm_sem held for read. | 543 | * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault(). |
532 | */ | 544 | */ |
533 | int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | 545 | int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, |
534 | unsigned long address, unsigned int fault_flags) | 546 | unsigned long address, unsigned int fault_flags) |
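To illustrate the @nonblocking protocol documented above, here is a hedged caller-side sketch. The variable names are assumptions and the prototype should be checked against mm/internal.h; the point is only the "did the lock survive?" check:

        /* sketch: caller passing a non-NULL nonblocking pointer, no FOLL_NOWAIT */
        int locked = 1;
        long nr;

        down_read(&mm->mmap_sem);
        nr = __get_user_pages(tsk, mm, start, nr_pages, gup_flags,
                              pages, NULL, &locked);
        if (locked)
                up_read(&mm->mmap_sem);
        /* else: __get_user_pages() already dropped mmap_sem via up_read() */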
diff --git a/mm/highmem.c b/mm/highmem.c
index b32b70cdaed6..123bcd3ed4f2 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -44,6 +44,66 @@ DEFINE_PER_CPU(int, __kmap_atomic_idx); | |||
44 | */ | 44 | */ |
45 | #ifdef CONFIG_HIGHMEM | 45 | #ifdef CONFIG_HIGHMEM |
46 | 46 | ||
47 | /* | ||
48 | * Architecture with aliasing data cache may define the following family of | ||
49 | * helper functions in its asm/highmem.h to control cache color of virtual | ||
50 | * addresses where physical memory pages are mapped by kmap. | ||
51 | */ | ||
52 | #ifndef get_pkmap_color | ||
53 | |||
54 | /* | ||
55 | * Determine color of virtual address where the page should be mapped. | ||
56 | */ | ||
57 | static inline unsigned int get_pkmap_color(struct page *page) | ||
58 | { | ||
59 | return 0; | ||
60 | } | ||
61 | #define get_pkmap_color get_pkmap_color | ||
62 | |||
63 | /* | ||
64 | * Get next index for mapping inside PKMAP region for page with given color. | ||
65 | */ | ||
66 | static inline unsigned int get_next_pkmap_nr(unsigned int color) | ||
67 | { | ||
68 | static unsigned int last_pkmap_nr; | ||
69 | |||
70 | last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; | ||
71 | return last_pkmap_nr; | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * Determine if page index inside PKMAP region (pkmap_nr) of given color | ||
76 | * has wrapped around PKMAP region end. When this happens an attempt to | ||
77 | * flush all unused PKMAP slots is made. | ||
78 | */ | ||
79 | static inline int no_more_pkmaps(unsigned int pkmap_nr, unsigned int color) | ||
80 | { | ||
81 | return pkmap_nr == 0; | ||
82 | } | ||
83 | |||
84 | /* | ||
85 | * Get the number of PKMAP entries of the given color. If no free slot is | ||
86 | * found after checking that many entries, kmap will sleep waiting for | ||
87 | * someone to call kunmap and free PKMAP slot. | ||
88 | */ | ||
89 | static inline int get_pkmap_entries_count(unsigned int color) | ||
90 | { | ||
91 | return LAST_PKMAP; | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * Get head of a wait queue for PKMAP entries of the given color. | ||
96 | * Wait queues for different mapping colors should be independent to avoid | ||
97 | * unnecessary wakeups caused by freeing of slots of other colors. | ||
98 | */ | ||
99 | static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) | ||
100 | { | ||
101 | static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); | ||
102 | |||
103 | return &pkmap_map_wait; | ||
104 | } | ||
105 | #endif | ||
106 | |||
47 | unsigned long totalhigh_pages __read_mostly; | 107 | unsigned long totalhigh_pages __read_mostly; |
48 | EXPORT_SYMBOL(totalhigh_pages); | 108 | EXPORT_SYMBOL(totalhigh_pages); |
49 | 109 | ||
@@ -68,13 +128,10 @@ unsigned int nr_free_highpages (void) | |||
68 | } | 128 | } |
69 | 129 | ||
70 | static int pkmap_count[LAST_PKMAP]; | 130 | static int pkmap_count[LAST_PKMAP]; |
71 | static unsigned int last_pkmap_nr; | ||
72 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); | 131 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); |
73 | 132 | ||
74 | pte_t * pkmap_page_table; | 133 | pte_t * pkmap_page_table; |
75 | 134 | ||
76 | static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); | ||
77 | |||
78 | /* | 135 | /* |
79 | * Most architectures have no use for kmap_high_get(), so let's abstract | 136 | * Most architectures have no use for kmap_high_get(), so let's abstract |
80 | * the disabling of IRQ out of the locking in that case to save on a | 137 | * the disabling of IRQ out of the locking in that case to save on a |
@@ -161,15 +218,17 @@ static inline unsigned long map_new_virtual(struct page *page) | |||
161 | { | 218 | { |
162 | unsigned long vaddr; | 219 | unsigned long vaddr; |
163 | int count; | 220 | int count; |
221 | unsigned int last_pkmap_nr; | ||
222 | unsigned int color = get_pkmap_color(page); | ||
164 | 223 | ||
165 | start: | 224 | start: |
166 | count = LAST_PKMAP; | 225 | count = get_pkmap_entries_count(color); |
167 | /* Find an empty entry */ | 226 | /* Find an empty entry */ |
168 | for (;;) { | 227 | for (;;) { |
169 | last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; | 228 | last_pkmap_nr = get_next_pkmap_nr(color); |
170 | if (!last_pkmap_nr) { | 229 | if (no_more_pkmaps(last_pkmap_nr, color)) { |
171 | flush_all_zero_pkmaps(); | 230 | flush_all_zero_pkmaps(); |
172 | count = LAST_PKMAP; | 231 | count = get_pkmap_entries_count(color); |
173 | } | 232 | } |
174 | if (!pkmap_count[last_pkmap_nr]) | 233 | if (!pkmap_count[last_pkmap_nr]) |
175 | break; /* Found a usable entry */ | 234 | break; /* Found a usable entry */ |
@@ -181,12 +240,14 @@ start: | |||
181 | */ | 240 | */ |
182 | { | 241 | { |
183 | DECLARE_WAITQUEUE(wait, current); | 242 | DECLARE_WAITQUEUE(wait, current); |
243 | wait_queue_head_t *pkmap_map_wait = | ||
244 | get_pkmap_wait_queue_head(color); | ||
184 | 245 | ||
185 | __set_current_state(TASK_UNINTERRUPTIBLE); | 246 | __set_current_state(TASK_UNINTERRUPTIBLE); |
186 | add_wait_queue(&pkmap_map_wait, &wait); | 247 | add_wait_queue(pkmap_map_wait, &wait); |
187 | unlock_kmap(); | 248 | unlock_kmap(); |
188 | schedule(); | 249 | schedule(); |
189 | remove_wait_queue(&pkmap_map_wait, &wait); | 250 | remove_wait_queue(pkmap_map_wait, &wait); |
190 | lock_kmap(); | 251 | lock_kmap(); |
191 | 252 | ||
192 | /* Somebody else might have mapped it while we slept */ | 253 | /* Somebody else might have mapped it while we slept */ |
@@ -274,6 +335,8 @@ void kunmap_high(struct page *page) | |||
274 | unsigned long nr; | 335 | unsigned long nr; |
275 | unsigned long flags; | 336 | unsigned long flags; |
276 | int need_wakeup; | 337 | int need_wakeup; |
338 | unsigned int color = get_pkmap_color(page); | ||
339 | wait_queue_head_t *pkmap_map_wait; | ||
277 | 340 | ||
278 | lock_kmap_any(flags); | 341 | lock_kmap_any(flags); |
279 | vaddr = (unsigned long)page_address(page); | 342 | vaddr = (unsigned long)page_address(page); |
@@ -299,13 +362,14 @@ void kunmap_high(struct page *page) | |||
299 | * no need for the wait-queue-head's lock. Simply | 362 | * no need for the wait-queue-head's lock. Simply |
300 | * test if the queue is empty. | 363 | * test if the queue is empty. |
301 | */ | 364 | */ |
302 | need_wakeup = waitqueue_active(&pkmap_map_wait); | 365 | pkmap_map_wait = get_pkmap_wait_queue_head(color); |
366 | need_wakeup = waitqueue_active(pkmap_map_wait); | ||
303 | } | 367 | } |
304 | unlock_kmap_any(flags); | 368 | unlock_kmap_any(flags); |
305 | 369 | ||
306 | /* do wake-up, if needed, race-free outside of the spin lock */ | 370 | /* do wake-up, if needed, race-free outside of the spin lock */ |
307 | if (need_wakeup) | 371 | if (need_wakeup) |
308 | wake_up(&pkmap_map_wait); | 372 | wake_up(pkmap_map_wait); |
309 | } | 373 | } |
310 | 374 | ||
311 | EXPORT_SYMBOL(kunmap_high); | 375 | EXPORT_SYMBOL(kunmap_high); |
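The comments added above define a hook family that an architecture with an aliasing (VIPT) data cache may override from its asm/highmem.h; mm/highmem.c itself only supplies the trivial, colour-blind defaults. As a purely hypothetical illustration of the contract (PKMAP_COLORS and the colouring rule are invented for this sketch):

        /* sketch: hypothetical asm/highmem.h override for an aliasing cache */
        #define PKMAP_COLORS    4       /* assumed number of cache colours */

        static inline unsigned int get_pkmap_color(struct page *page)
        {
                /* colour the mapping by the page's physical frame number */
                return page_to_pfn(page) & (PKMAP_COLORS - 1);
        }
        #define get_pkmap_color get_pkmap_color

An architecture doing this would normally also override get_next_pkmap_nr(), no_more_pkmaps(), get_pkmap_entries_count() and get_pkmap_wait_queue_head() consistently, so that each colour gets its own index cursor and wait queue.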
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 33514d88fef9..3630d577e987 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -827,7 +827,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
827 | count_vm_event(THP_FAULT_FALLBACK); | 827 | count_vm_event(THP_FAULT_FALLBACK); |
828 | return VM_FAULT_FALLBACK; | 828 | return VM_FAULT_FALLBACK; |
829 | } | 829 | } |
830 | if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) { | 830 | if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_TRANSHUGE))) { |
831 | put_page(page); | 831 | put_page(page); |
832 | count_vm_event(THP_FAULT_FALLBACK); | 832 | count_vm_event(THP_FAULT_FALLBACK); |
833 | return VM_FAULT_FALLBACK; | 833 | return VM_FAULT_FALLBACK; |
@@ -1132,7 +1132,7 @@ alloc: | |||
1132 | goto out; | 1132 | goto out; |
1133 | } | 1133 | } |
1134 | 1134 | ||
1135 | if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) { | 1135 | if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE))) { |
1136 | put_page(new_page); | 1136 | put_page(new_page); |
1137 | if (page) { | 1137 | if (page) { |
1138 | split_huge_page(page); | 1138 | split_huge_page(page); |
@@ -1681,7 +1681,7 @@ static void __split_huge_page_refcount(struct page *page, | |||
1681 | &page_tail->_count); | 1681 | &page_tail->_count); |
1682 | 1682 | ||
1683 | /* after clearing PageTail the gup refcount can be released */ | 1683 | /* after clearing PageTail the gup refcount can be released */ |
1684 | smp_mb(); | 1684 | smp_mb__after_atomic(); |
1685 | 1685 | ||
1686 | /* | 1686 | /* |
1687 | * retain hwpoison flag of the poisoned tail page: | 1687 | * retain hwpoison flag of the poisoned tail page: |
@@ -1775,6 +1775,8 @@ static int __split_huge_page_map(struct page *page, | |||
1775 | if (pmd) { | 1775 | if (pmd) { |
1776 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); | 1776 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); |
1777 | pmd_populate(mm, &_pmd, pgtable); | 1777 | pmd_populate(mm, &_pmd, pgtable); |
1778 | if (pmd_write(*pmd)) | ||
1779 | BUG_ON(page_mapcount(page) != 1); | ||
1778 | 1780 | ||
1779 | haddr = address; | 1781 | haddr = address; |
1780 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 1782 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
@@ -1784,8 +1786,6 @@ static int __split_huge_page_map(struct page *page, | |||
1784 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1786 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1785 | if (!pmd_write(*pmd)) | 1787 | if (!pmd_write(*pmd)) |
1786 | entry = pte_wrprotect(entry); | 1788 | entry = pte_wrprotect(entry); |
1787 | else | ||
1788 | BUG_ON(page_mapcount(page) != 1); | ||
1789 | if (!pmd_young(*pmd)) | 1789 | if (!pmd_young(*pmd)) |
1790 | entry = pte_mkold(entry); | 1790 | entry = pte_mkold(entry); |
1791 | if (pmd_numa(*pmd)) | 1791 | if (pmd_numa(*pmd)) |
@@ -2233,6 +2233,30 @@ static void khugepaged_alloc_sleep(void) | |||
2233 | 2233 | ||
2234 | static int khugepaged_node_load[MAX_NUMNODES]; | 2234 | static int khugepaged_node_load[MAX_NUMNODES]; |
2235 | 2235 | ||
2236 | static bool khugepaged_scan_abort(int nid) | ||
2237 | { | ||
2238 | int i; | ||
2239 | |||
2240 | /* | ||
2241 | * If zone_reclaim_mode is disabled, then no extra effort is made to | ||
2242 | * allocate memory locally. | ||
2243 | */ | ||
2244 | if (!zone_reclaim_mode) | ||
2245 | return false; | ||
2246 | |||
2247 | /* If there is a count for this node already, it must be acceptable */ | ||
2248 | if (khugepaged_node_load[nid]) | ||
2249 | return false; | ||
2250 | |||
2251 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
2252 | if (!khugepaged_node_load[i]) | ||
2253 | continue; | ||
2254 | if (node_distance(nid, i) > RECLAIM_DISTANCE) | ||
2255 | return true; | ||
2256 | } | ||
2257 | return false; | ||
2258 | } | ||
2259 | |||
2236 | #ifdef CONFIG_NUMA | 2260 | #ifdef CONFIG_NUMA |
2237 | static int khugepaged_find_target_node(void) | 2261 | static int khugepaged_find_target_node(void) |
2238 | { | 2262 | { |
@@ -2399,7 +2423,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2399 | if (!new_page) | 2423 | if (!new_page) |
2400 | return; | 2424 | return; |
2401 | 2425 | ||
2402 | if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) | 2426 | if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE))) |
2403 | return; | 2427 | return; |
2404 | 2428 | ||
2405 | /* | 2429 | /* |
@@ -2545,6 +2569,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2545 | * hit record. | 2569 | * hit record. |
2546 | */ | 2570 | */ |
2547 | node = page_to_nid(page); | 2571 | node = page_to_nid(page); |
2572 | if (khugepaged_scan_abort(node)) | ||
2573 | goto out_unmap; | ||
2548 | khugepaged_node_load[node]++; | 2574 | khugepaged_node_load[node]++; |
2549 | VM_BUG_ON_PAGE(PageCompound(page), page); | 2575 | VM_BUG_ON_PAGE(PageCompound(page), page); |
2550 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | 2576 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) |
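A small worked example of the khugepaged_scan_abort() rule introduced above, with invented SLIT distances (RECLAIM_DISTANCE defaults to 30):

        /*
         * Worked example (numbers are assumptions, not from the tree):
         * zone_reclaim_mode is enabled and the current scan has only counted
         * pages on node 0, i.e. khugepaged_node_load[0] > 0.
         *
         *   node_distance(1, 0) == 21  ->  21 > 30 is false, keep scanning;
         *   node_distance(3, 0) == 40  ->  40 > 30 is true, abort: collapsing
         *                                  would place the huge page far from
         *                                  where the small pages live.
         */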
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7a0a73d2fcff..eeceeeb09019 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -35,7 +35,6 @@ | |||
35 | #include <linux/node.h> | 35 | #include <linux/node.h> |
36 | #include "internal.h" | 36 | #include "internal.h" |
37 | 37 | ||
38 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | ||
39 | unsigned long hugepages_treat_as_movable; | 38 | unsigned long hugepages_treat_as_movable; |
40 | 39 | ||
41 | int hugetlb_max_hstate __read_mostly; | 40 | int hugetlb_max_hstate __read_mostly; |
@@ -1089,6 +1088,9 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
1089 | unsigned long pfn; | 1088 | unsigned long pfn; |
1090 | struct hstate *h; | 1089 | struct hstate *h; |
1091 | 1090 | ||
1091 | if (!hugepages_supported()) | ||
1092 | return; | ||
1093 | |||
1092 | /* Set scan step to minimum hugepage size */ | 1094 | /* Set scan step to minimum hugepage size */ |
1093 | for_each_hstate(h) | 1095 | for_each_hstate(h) |
1094 | if (order > huge_page_order(h)) | 1096 | if (order > huge_page_order(h)) |
@@ -1734,21 +1736,13 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, | |||
1734 | return sprintf(buf, "%lu\n", nr_huge_pages); | 1736 | return sprintf(buf, "%lu\n", nr_huge_pages); |
1735 | } | 1737 | } |
1736 | 1738 | ||
1737 | static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | 1739 | static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, |
1738 | struct kobject *kobj, struct kobj_attribute *attr, | 1740 | struct hstate *h, int nid, |
1739 | const char *buf, size_t len) | 1741 | unsigned long count, size_t len) |
1740 | { | 1742 | { |
1741 | int err; | 1743 | int err; |
1742 | int nid; | ||
1743 | unsigned long count; | ||
1744 | struct hstate *h; | ||
1745 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); | 1744 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); |
1746 | 1745 | ||
1747 | err = kstrtoul(buf, 10, &count); | ||
1748 | if (err) | ||
1749 | goto out; | ||
1750 | |||
1751 | h = kobj_to_hstate(kobj, &nid); | ||
1752 | if (hstate_is_gigantic(h) && !gigantic_page_supported()) { | 1746 | if (hstate_is_gigantic(h) && !gigantic_page_supported()) { |
1753 | err = -EINVAL; | 1747 | err = -EINVAL; |
1754 | goto out; | 1748 | goto out; |
@@ -1784,6 +1778,23 @@ out: | |||
1784 | return err; | 1778 | return err; |
1785 | } | 1779 | } |
1786 | 1780 | ||
1781 | static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | ||
1782 | struct kobject *kobj, const char *buf, | ||
1783 | size_t len) | ||
1784 | { | ||
1785 | struct hstate *h; | ||
1786 | unsigned long count; | ||
1787 | int nid; | ||
1788 | int err; | ||
1789 | |||
1790 | err = kstrtoul(buf, 10, &count); | ||
1791 | if (err) | ||
1792 | return err; | ||
1793 | |||
1794 | h = kobj_to_hstate(kobj, &nid); | ||
1795 | return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); | ||
1796 | } | ||
1797 | |||
1787 | static ssize_t nr_hugepages_show(struct kobject *kobj, | 1798 | static ssize_t nr_hugepages_show(struct kobject *kobj, |
1788 | struct kobj_attribute *attr, char *buf) | 1799 | struct kobj_attribute *attr, char *buf) |
1789 | { | 1800 | { |
@@ -1793,7 +1804,7 @@ static ssize_t nr_hugepages_show(struct kobject *kobj, | |||
1793 | static ssize_t nr_hugepages_store(struct kobject *kobj, | 1804 | static ssize_t nr_hugepages_store(struct kobject *kobj, |
1794 | struct kobj_attribute *attr, const char *buf, size_t len) | 1805 | struct kobj_attribute *attr, const char *buf, size_t len) |
1795 | { | 1806 | { |
1796 | return nr_hugepages_store_common(false, kobj, attr, buf, len); | 1807 | return nr_hugepages_store_common(false, kobj, buf, len); |
1797 | } | 1808 | } |
1798 | HSTATE_ATTR(nr_hugepages); | 1809 | HSTATE_ATTR(nr_hugepages); |
1799 | 1810 | ||
@@ -1812,7 +1823,7 @@ static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, | |||
1812 | static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, | 1823 | static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, |
1813 | struct kobj_attribute *attr, const char *buf, size_t len) | 1824 | struct kobj_attribute *attr, const char *buf, size_t len) |
1814 | { | 1825 | { |
1815 | return nr_hugepages_store_common(true, kobj, attr, buf, len); | 1826 | return nr_hugepages_store_common(true, kobj, buf, len); |
1816 | } | 1827 | } |
1817 | HSTATE_ATTR(nr_hugepages_mempolicy); | 1828 | HSTATE_ATTR(nr_hugepages_mempolicy); |
1818 | #endif | 1829 | #endif |
@@ -2248,36 +2259,21 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | |||
2248 | void __user *buffer, size_t *length, loff_t *ppos) | 2259 | void __user *buffer, size_t *length, loff_t *ppos) |
2249 | { | 2260 | { |
2250 | struct hstate *h = &default_hstate; | 2261 | struct hstate *h = &default_hstate; |
2251 | unsigned long tmp; | 2262 | unsigned long tmp = h->max_huge_pages; |
2252 | int ret; | 2263 | int ret; |
2253 | 2264 | ||
2254 | if (!hugepages_supported()) | 2265 | if (!hugepages_supported()) |
2255 | return -ENOTSUPP; | 2266 | return -ENOTSUPP; |
2256 | 2267 | ||
2257 | tmp = h->max_huge_pages; | ||
2258 | |||
2259 | if (write && hstate_is_gigantic(h) && !gigantic_page_supported()) | ||
2260 | return -EINVAL; | ||
2261 | |||
2262 | table->data = &tmp; | 2268 | table->data = &tmp; |
2263 | table->maxlen = sizeof(unsigned long); | 2269 | table->maxlen = sizeof(unsigned long); |
2264 | ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); | 2270 | ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); |
2265 | if (ret) | 2271 | if (ret) |
2266 | goto out; | 2272 | goto out; |
2267 | 2273 | ||
2268 | if (write) { | 2274 | if (write) |
2269 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, | 2275 | ret = __nr_hugepages_store_common(obey_mempolicy, h, |
2270 | GFP_KERNEL | __GFP_NORETRY); | 2276 | NUMA_NO_NODE, tmp, *length); |
2271 | if (!(obey_mempolicy && | ||
2272 | init_nodemask_of_mempolicy(nodes_allowed))) { | ||
2273 | NODEMASK_FREE(nodes_allowed); | ||
2274 | nodes_allowed = &node_states[N_MEMORY]; | ||
2275 | } | ||
2276 | h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); | ||
2277 | |||
2278 | if (nodes_allowed != &node_states[N_MEMORY]) | ||
2279 | NODEMASK_FREE(nodes_allowed); | ||
2280 | } | ||
2281 | out: | 2277 | out: |
2282 | return ret; | 2278 | return ret; |
2283 | } | 2279 | } |
@@ -2754,8 +2750,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
2754 | * from other VMAs and let the children be SIGKILLed if they are faulting the | 2750 | * from other VMAs and let the children be SIGKILLed if they are faulting the |
2755 | * same region. | 2751 | * same region. |
2756 | */ | 2752 | */ |
2757 | static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | 2753 | static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, |
2758 | struct page *page, unsigned long address) | 2754 | struct page *page, unsigned long address) |
2759 | { | 2755 | { |
2760 | struct hstate *h = hstate_vma(vma); | 2756 | struct hstate *h = hstate_vma(vma); |
2761 | struct vm_area_struct *iter_vma; | 2757 | struct vm_area_struct *iter_vma; |
@@ -2794,8 +2790,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2794 | address + huge_page_size(h), page); | 2790 | address + huge_page_size(h), page); |
2795 | } | 2791 | } |
2796 | mutex_unlock(&mapping->i_mmap_mutex); | 2792 | mutex_unlock(&mapping->i_mmap_mutex); |
2797 | |||
2798 | return 1; | ||
2799 | } | 2793 | } |
2800 | 2794 | ||
2801 | /* | 2795 | /* |
@@ -2810,7 +2804,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2810 | { | 2804 | { |
2811 | struct hstate *h = hstate_vma(vma); | 2805 | struct hstate *h = hstate_vma(vma); |
2812 | struct page *old_page, *new_page; | 2806 | struct page *old_page, *new_page; |
2813 | int outside_reserve = 0; | 2807 | int ret = 0, outside_reserve = 0; |
2814 | unsigned long mmun_start; /* For mmu_notifiers */ | 2808 | unsigned long mmun_start; /* For mmu_notifiers */ |
2815 | unsigned long mmun_end; /* For mmu_notifiers */ | 2809 | unsigned long mmun_end; /* For mmu_notifiers */ |
2816 | 2810 | ||
@@ -2840,14 +2834,14 @@ retry_avoidcopy: | |||
2840 | 2834 | ||
2841 | page_cache_get(old_page); | 2835 | page_cache_get(old_page); |
2842 | 2836 | ||
2843 | /* Drop page table lock as buddy allocator may be called */ | 2837 | /* |
2838 | * Drop page table lock as buddy allocator may be called. It will | ||
2839 | * be acquired again before returning to the caller, as expected. | ||
2840 | */ | ||
2844 | spin_unlock(ptl); | 2841 | spin_unlock(ptl); |
2845 | new_page = alloc_huge_page(vma, address, outside_reserve); | 2842 | new_page = alloc_huge_page(vma, address, outside_reserve); |
2846 | 2843 | ||
2847 | if (IS_ERR(new_page)) { | 2844 | if (IS_ERR(new_page)) { |
2848 | long err = PTR_ERR(new_page); | ||
2849 | page_cache_release(old_page); | ||
2850 | |||
2851 | /* | 2845 | /* |
2852 | * If a process owning a MAP_PRIVATE mapping fails to COW, | 2846 | * If a process owning a MAP_PRIVATE mapping fails to COW, |
2853 | * it is due to references held by a child and an insufficient | 2847 | * it is due to references held by a child and an insufficient |
@@ -2856,29 +2850,25 @@ retry_avoidcopy: | |||
2856 | * may get SIGKILLed if it later faults. | 2850 | * may get SIGKILLed if it later faults. |
2857 | */ | 2851 | */ |
2858 | if (outside_reserve) { | 2852 | if (outside_reserve) { |
2853 | page_cache_release(old_page); | ||
2859 | BUG_ON(huge_pte_none(pte)); | 2854 | BUG_ON(huge_pte_none(pte)); |
2860 | if (unmap_ref_private(mm, vma, old_page, address)) { | 2855 | unmap_ref_private(mm, vma, old_page, address); |
2861 | BUG_ON(huge_pte_none(pte)); | 2856 | BUG_ON(huge_pte_none(pte)); |
2862 | spin_lock(ptl); | 2857 | spin_lock(ptl); |
2863 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2858 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
2864 | if (likely(ptep && | 2859 | if (likely(ptep && |
2865 | pte_same(huge_ptep_get(ptep), pte))) | 2860 | pte_same(huge_ptep_get(ptep), pte))) |
2866 | goto retry_avoidcopy; | 2861 | goto retry_avoidcopy; |
2867 | /* | 2862 | /* |
2868 | * race occurs while re-acquiring page table | 2863 | * race occurs while re-acquiring page table |
2869 | * lock, and our job is done. | 2864 | * lock, and our job is done. |
2870 | */ | 2865 | */ |
2871 | return 0; | 2866 | return 0; |
2872 | } | ||
2873 | WARN_ON_ONCE(1); | ||
2874 | } | 2867 | } |
2875 | 2868 | ||
2876 | /* Caller expects lock to be held */ | 2869 | ret = (PTR_ERR(new_page) == -ENOMEM) ? |
2877 | spin_lock(ptl); | 2870 | VM_FAULT_OOM : VM_FAULT_SIGBUS; |
2878 | if (err == -ENOMEM) | 2871 | goto out_release_old; |
2879 | return VM_FAULT_OOM; | ||
2880 | else | ||
2881 | return VM_FAULT_SIGBUS; | ||
2882 | } | 2872 | } |
2883 | 2873 | ||
2884 | /* | 2874 | /* |
@@ -2886,11 +2876,8 @@ retry_avoidcopy: | |||
2886 | * anon_vma prepared. | 2876 | * anon_vma prepared. |
2887 | */ | 2877 | */ |
2888 | if (unlikely(anon_vma_prepare(vma))) { | 2878 | if (unlikely(anon_vma_prepare(vma))) { |
2889 | page_cache_release(new_page); | 2879 | ret = VM_FAULT_OOM; |
2890 | page_cache_release(old_page); | 2880 | goto out_release_all; |
2891 | /* Caller expects lock to be held */ | ||
2892 | spin_lock(ptl); | ||
2893 | return VM_FAULT_OOM; | ||
2894 | } | 2881 | } |
2895 | 2882 | ||
2896 | copy_user_huge_page(new_page, old_page, address, vma, | 2883 | copy_user_huge_page(new_page, old_page, address, vma, |
@@ -2900,6 +2887,7 @@ retry_avoidcopy: | |||
2900 | mmun_start = address & huge_page_mask(h); | 2887 | mmun_start = address & huge_page_mask(h); |
2901 | mmun_end = mmun_start + huge_page_size(h); | 2888 | mmun_end = mmun_start + huge_page_size(h); |
2902 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2889 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2890 | |||
2903 | /* | 2891 | /* |
2904 | * Retake the page table lock to check for racing updates | 2892 | * Retake the page table lock to check for racing updates |
2905 | * before the page tables are altered | 2893 | * before the page tables are altered |
@@ -2920,12 +2908,13 @@ retry_avoidcopy: | |||
2920 | } | 2908 | } |
2921 | spin_unlock(ptl); | 2909 | spin_unlock(ptl); |
2922 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2910 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2911 | out_release_all: | ||
2923 | page_cache_release(new_page); | 2912 | page_cache_release(new_page); |
2913 | out_release_old: | ||
2924 | page_cache_release(old_page); | 2914 | page_cache_release(old_page); |
2925 | 2915 | ||
2926 | /* Caller expects lock to be held */ | 2916 | spin_lock(ptl); /* Caller expects lock to be held */ |
2927 | spin_lock(ptl); | 2917 | return ret; |
2928 | return 0; | ||
2929 | } | 2918 | } |
2930 | 2919 | ||
2931 | /* Return the pagecache page at a given address within a VMA */ | 2920 | /* Return the pagecache page at a given address within a VMA */ |
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 95487c71cad5..329caf56df22 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -72,8 +72,7 @@ DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); | |||
72 | 72 | ||
73 | static void pfn_inject_exit(void) | 73 | static void pfn_inject_exit(void) |
74 | { | 74 | { |
75 | if (hwpoison_dir) | 75 | debugfs_remove_recursive(hwpoison_dir); |
76 | debugfs_remove_recursive(hwpoison_dir); | ||
77 | } | 76 | } |
78 | 77 | ||
79 | static int pfn_inject_init(void) | 78 | static int pfn_inject_init(void) |
diff --git a/mm/internal.h b/mm/internal.h
index 7f22a11fcc66..a1b651b11c5f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -247,7 +247,7 @@ static inline void mlock_migrate_page(struct page *new, struct page *old) { } | |||
247 | static inline struct page *mem_map_offset(struct page *base, int offset) | 247 | static inline struct page *mem_map_offset(struct page *base, int offset) |
248 | { | 248 | { |
249 | if (unlikely(offset >= MAX_ORDER_NR_PAGES)) | 249 | if (unlikely(offset >= MAX_ORDER_NR_PAGES)) |
250 | return pfn_to_page(page_to_pfn(base) + offset); | 250 | return nth_page(base, offset); |
251 | return base + offset; | 251 | return base + offset; |
252 | } | 252 | } |
253 | 253 | ||
diff --git a/mm/madvise.c b/mm/madvise.c
index a402f8fdc68e..0938b30da4ab 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -292,9 +292,6 @@ static long madvise_dontneed(struct vm_area_struct *vma, | |||
292 | /* | 292 | /* |
293 | * Application wants to free up the pages and associated backing store. | 293 | * Application wants to free up the pages and associated backing store. |
294 | * This is effectively punching a hole into the middle of a file. | 294 | * This is effectively punching a hole into the middle of a file. |
295 | * | ||
296 | * NOTE: Currently, only shmfs/tmpfs is supported for this operation. | ||
297 | * Other filesystems return -ENOSYS. | ||
298 | */ | 295 | */ |
299 | static long madvise_remove(struct vm_area_struct *vma, | 296 | static long madvise_remove(struct vm_area_struct *vma, |
300 | struct vm_area_struct **prev, | 297 | struct vm_area_struct **prev, |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f009a14918d2..90dc501eaf3f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2551,55 +2551,72 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, | |||
2551 | return NOTIFY_OK; | 2551 | return NOTIFY_OK; |
2552 | } | 2552 | } |
2553 | 2553 | ||
2554 | 2554 | /** | |
2555 | /* See mem_cgroup_try_charge() for details */ | 2555 | * mem_cgroup_try_charge - try charging a memcg |
2556 | enum { | 2556 | * @memcg: memcg to charge |
2557 | CHARGE_OK, /* success */ | 2557 | * @nr_pages: number of pages to charge |
2558 | CHARGE_RETRY, /* need to retry but retry is not bad */ | 2558 | * |
2559 | CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ | 2559 | * Returns 0 if @memcg was charged successfully, -EINTR if the charge |
2560 | CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ | 2560 | * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. |
2561 | }; | 2561 | */ |
2562 | 2562 | static int mem_cgroup_try_charge(struct mem_cgroup *memcg, | |
2563 | static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | 2563 | gfp_t gfp_mask, |
2564 | unsigned int nr_pages, unsigned int min_pages, | 2564 | unsigned int nr_pages) |
2565 | bool invoke_oom) | ||
2566 | { | 2565 | { |
2567 | unsigned long csize = nr_pages * PAGE_SIZE; | 2566 | unsigned int batch = max(CHARGE_BATCH, nr_pages); |
2567 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
2568 | struct mem_cgroup *mem_over_limit; | 2568 | struct mem_cgroup *mem_over_limit; |
2569 | struct res_counter *fail_res; | 2569 | struct res_counter *fail_res; |
2570 | unsigned long nr_reclaimed; | ||
2570 | unsigned long flags = 0; | 2571 | unsigned long flags = 0; |
2571 | int ret; | 2572 | unsigned long long size; |
2573 | int ret = 0; | ||
2572 | 2574 | ||
2573 | ret = res_counter_charge(&memcg->res, csize, &fail_res); | 2575 | retry: |
2576 | if (consume_stock(memcg, nr_pages)) | ||
2577 | goto done; | ||
2574 | 2578 | ||
2575 | if (likely(!ret)) { | 2579 | size = batch * PAGE_SIZE; |
2580 | if (!res_counter_charge(&memcg->res, size, &fail_res)) { | ||
2576 | if (!do_swap_account) | 2581 | if (!do_swap_account) |
2577 | return CHARGE_OK; | 2582 | goto done_restock; |
2578 | ret = res_counter_charge(&memcg->memsw, csize, &fail_res); | 2583 | if (!res_counter_charge(&memcg->memsw, size, &fail_res)) |
2579 | if (likely(!ret)) | 2584 | goto done_restock; |
2580 | return CHARGE_OK; | 2585 | res_counter_uncharge(&memcg->res, size); |
2581 | |||
2582 | res_counter_uncharge(&memcg->res, csize); | ||
2583 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); | 2586 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); |
2584 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | 2587 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
2585 | } else | 2588 | } else |
2586 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); | 2589 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); |
2590 | |||
2591 | if (batch > nr_pages) { | ||
2592 | batch = nr_pages; | ||
2593 | goto retry; | ||
2594 | } | ||
2595 | |||
2587 | /* | 2596 | /* |
2588 | * Never reclaim on behalf of optional batching, retry with a | 2597 | * Unlike in global OOM situations, memcg is not in a physical |
2589 | * single page instead. | 2598 | * memory shortage. Allow dying and OOM-killed tasks to |
2599 | * bypass the last charges so that they can exit quickly and | ||
2600 | * free their memory. | ||
2590 | */ | 2601 | */ |
2591 | if (nr_pages > min_pages) | 2602 | if (unlikely(test_thread_flag(TIF_MEMDIE) || |
2592 | return CHARGE_RETRY; | 2603 | fatal_signal_pending(current) || |
2604 | current->flags & PF_EXITING)) | ||
2605 | goto bypass; | ||
2606 | |||
2607 | if (unlikely(task_in_memcg_oom(current))) | ||
2608 | goto nomem; | ||
2593 | 2609 | ||
2594 | if (!(gfp_mask & __GFP_WAIT)) | 2610 | if (!(gfp_mask & __GFP_WAIT)) |
2595 | return CHARGE_WOULDBLOCK; | 2611 | goto nomem; |
2596 | 2612 | ||
2597 | if (gfp_mask & __GFP_NORETRY) | 2613 | nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); |
2598 | return CHARGE_NOMEM; | ||
2599 | 2614 | ||
2600 | ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); | ||
2601 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) | 2615 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) |
2602 | return CHARGE_RETRY; | 2616 | goto retry; |
2617 | |||
2618 | if (gfp_mask & __GFP_NORETRY) | ||
2619 | goto nomem; | ||
2603 | /* | 2620 | /* |
2604 | * Even though the limit is exceeded at this point, reclaim | 2621 | * Even though the limit is exceeded at this point, reclaim |
2605 | * may have been able to free some pages. Retry the charge | 2622 | * may have been able to free some pages. Retry the charge |
@@ -2609,96 +2626,38 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2609 | * unlikely to succeed so close to the limit, and we fall back | 2626 | * unlikely to succeed so close to the limit, and we fall back |
2610 | * to regular pages anyway in case of failure. | 2627 | * to regular pages anyway in case of failure. |
2611 | */ | 2628 | */ |
2612 | if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) | 2629 | if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) |
2613 | return CHARGE_RETRY; | 2630 | goto retry; |
2614 | |||
2615 | /* | 2631 | /* |
2616 | * At task move, charge accounts can be doubly counted. So, it's | 2632 | * At task move, charge accounts can be doubly counted. So, it's |
2617 | * better to wait until the end of task_move if something is going on. | 2633 | * better to wait until the end of task_move if something is going on. |
2618 | */ | 2634 | */ |
2619 | if (mem_cgroup_wait_acct_move(mem_over_limit)) | 2635 | if (mem_cgroup_wait_acct_move(mem_over_limit)) |
2620 | return CHARGE_RETRY; | 2636 | goto retry; |
2621 | |||
2622 | if (invoke_oom) | ||
2623 | mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); | ||
2624 | |||
2625 | return CHARGE_NOMEM; | ||
2626 | } | ||
2627 | |||
2628 | /** | ||
2629 | * mem_cgroup_try_charge - try charging a memcg | ||
2630 | * @memcg: memcg to charge | ||
2631 | * @nr_pages: number of pages to charge | ||
2632 | * @oom: trigger OOM if reclaim fails | ||
2633 | * | ||
2634 | * Returns 0 if @memcg was charged successfully, -EINTR if the charge | ||
2635 | * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. | ||
2636 | */ | ||
2637 | static int mem_cgroup_try_charge(struct mem_cgroup *memcg, | ||
2638 | gfp_t gfp_mask, | ||
2639 | unsigned int nr_pages, | ||
2640 | bool oom) | ||
2641 | { | ||
2642 | unsigned int batch = max(CHARGE_BATCH, nr_pages); | ||
2643 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
2644 | int ret; | ||
2645 | |||
2646 | if (mem_cgroup_is_root(memcg)) | ||
2647 | goto done; | ||
2648 | /* | ||
2649 | * Unlike in global OOM situations, memcg is not in a physical | ||
2650 | * memory shortage. Allow dying and OOM-killed tasks to | ||
2651 | * bypass the last charges so that they can exit quickly and | ||
2652 | * free their memory. | ||
2653 | */ | ||
2654 | if (unlikely(test_thread_flag(TIF_MEMDIE) || | ||
2655 | fatal_signal_pending(current) || | ||
2656 | current->flags & PF_EXITING)) | ||
2657 | goto bypass; | ||
2658 | 2637 | ||
2659 | if (unlikely(task_in_memcg_oom(current))) | 2638 | if (nr_retries--) |
2660 | goto nomem; | 2639 | goto retry; |
2661 | 2640 | ||
2662 | if (gfp_mask & __GFP_NOFAIL) | 2641 | if (gfp_mask & __GFP_NOFAIL) |
2663 | oom = false; | 2642 | goto bypass; |
2664 | again: | ||
2665 | if (consume_stock(memcg, nr_pages)) | ||
2666 | goto done; | ||
2667 | |||
2668 | do { | ||
2669 | bool invoke_oom = oom && !nr_oom_retries; | ||
2670 | |||
2671 | /* If killed, bypass charge */ | ||
2672 | if (fatal_signal_pending(current)) | ||
2673 | goto bypass; | ||
2674 | 2643 | ||
2675 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, | 2644 | if (fatal_signal_pending(current)) |
2676 | nr_pages, invoke_oom); | 2645 | goto bypass; |
2677 | switch (ret) { | ||
2678 | case CHARGE_OK: | ||
2679 | break; | ||
2680 | case CHARGE_RETRY: /* not in OOM situation but retry */ | ||
2681 | batch = nr_pages; | ||
2682 | goto again; | ||
2683 | case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ | ||
2684 | goto nomem; | ||
2685 | case CHARGE_NOMEM: /* OOM routine works */ | ||
2686 | if (!oom || invoke_oom) | ||
2687 | goto nomem; | ||
2688 | nr_oom_retries--; | ||
2689 | break; | ||
2690 | } | ||
2691 | } while (ret != CHARGE_OK); | ||
2692 | 2646 | ||
2693 | if (batch > nr_pages) | 2647 | mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); |
2694 | refill_stock(memcg, batch - nr_pages); | ||
2695 | done: | ||
2696 | return 0; | ||
2697 | nomem: | 2648 | nomem: |
2698 | if (!(gfp_mask & __GFP_NOFAIL)) | 2649 | if (!(gfp_mask & __GFP_NOFAIL)) |
2699 | return -ENOMEM; | 2650 | return -ENOMEM; |
2700 | bypass: | 2651 | bypass: |
2701 | return -EINTR; | 2652 | memcg = root_mem_cgroup; |
2653 | ret = -EINTR; | ||
2654 | goto retry; | ||
2655 | |||
2656 | done_restock: | ||
2657 | if (batch > nr_pages) | ||
2658 | refill_stock(memcg, batch - nr_pages); | ||
2659 | done: | ||
2660 | return ret; | ||
2702 | } | 2661 | } |
2703 | 2662 | ||
2704 | /** | 2663 | /** |
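Taken together, the two memcontrol.c hunks above fold the old CHARGE_OK/CHARGE_RETRY/CHARGE_NOMEM/CHARGE_WOULDBLOCK state machine into a single retry loop: consume from the per-cpu stock, charge a whole batch, shrink the batch to the exact request before reclaiming, bypass the charge entirely for exiting or OOM-killed tasks, and refill the stock with any surplus on success. The stand-alone C below is only a rough model of that control flow under simplified assumptions: a plain integer counter stands in for res_counter, a fixed percentage stands in for reclaim, and every toy_* name is invented here rather than taken from the kernel.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_BATCH   32          /* stand-in for CHARGE_BATCH */
#define TOY_RETRIES 5           /* stand-in for MEM_CGROUP_RECLAIM_RETRIES */

struct toy_counter { long usage, limit; };

/* all-or-nothing charge against a single limit */
static bool toy_charge(struct toy_counter *c, long pages)
{
        if (c->usage + pages > c->limit)
                return false;
        c->usage += pages;
        return true;
}

/* pretend reclaim frees about 10% of current usage */
static long toy_reclaim(struct toy_counter *c)
{
        long freed = c->usage / 10;

        c->usage -= freed;
        return freed;
}

static int toy_try_charge(struct toy_counter *c, long nr_pages, bool exiting)
{
        long batch = nr_pages > TOY_BATCH ? nr_pages : TOY_BATCH;
        int retries = TOY_RETRIES;

retry:
        if (toy_charge(c, batch))
                return 0;               /* done_restock: surplus would refill the stock */
        if (batch > nr_pages) {
                batch = nr_pages;       /* never reclaim on behalf of optional batching */
                goto retry;
        }
        if (exiting)
                return -EINTR;          /* bypass: let dying tasks exit quickly */
        if (toy_reclaim(c) > 0 && retries-- > 0)
                goto retry;
        return -ENOMEM;                 /* nomem */
}

int main(void)
{
        struct toy_counter c = { .usage = 90, .limit = 100 };

        printf("charge 4 pages:           %d\n", toy_try_charge(&c, 4, false));
        printf("charge 64 pages:          %d\n", toy_try_charge(&c, 64, false));
        printf("charge 64 pages, exiting: %d\n", toy_try_charge(&c, 64, true));
        return 0;
}

The shrink-the-batch-before-reclaiming step is what preserves the old "never reclaim on behalf of optional batching" behaviour without needing a separate CHARGE_RETRY state.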
@@ -2712,15 +2671,14 @@ bypass: | |||
2712 | */ | 2671 | */ |
2713 | static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, | 2672 | static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, |
2714 | gfp_t gfp_mask, | 2673 | gfp_t gfp_mask, |
2715 | unsigned int nr_pages, | 2674 | unsigned int nr_pages) |
2716 | bool oom) | ||
2717 | 2675 | ||
2718 | { | 2676 | { |
2719 | struct mem_cgroup *memcg; | 2677 | struct mem_cgroup *memcg; |
2720 | int ret; | 2678 | int ret; |
2721 | 2679 | ||
2722 | memcg = get_mem_cgroup_from_mm(mm); | 2680 | memcg = get_mem_cgroup_from_mm(mm); |
2723 | ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); | 2681 | ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages); |
2724 | css_put(&memcg->css); | 2682 | css_put(&memcg->css); |
2725 | if (ret == -EINTR) | 2683 | if (ret == -EINTR) |
2726 | memcg = root_mem_cgroup; | 2684 | memcg = root_mem_cgroup; |
@@ -2738,13 +2696,11 @@ static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, | |||
2738 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, | 2696 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, |
2739 | unsigned int nr_pages) | 2697 | unsigned int nr_pages) |
2740 | { | 2698 | { |
2741 | if (!mem_cgroup_is_root(memcg)) { | 2699 | unsigned long bytes = nr_pages * PAGE_SIZE; |
2742 | unsigned long bytes = nr_pages * PAGE_SIZE; | ||
2743 | 2700 | ||
2744 | res_counter_uncharge(&memcg->res, bytes); | 2701 | res_counter_uncharge(&memcg->res, bytes); |
2745 | if (do_swap_account) | 2702 | if (do_swap_account) |
2746 | res_counter_uncharge(&memcg->memsw, bytes); | 2703 | res_counter_uncharge(&memcg->memsw, bytes); |
2747 | } | ||
2748 | } | 2704 | } |
2749 | 2705 | ||
2750 | /* | 2706 | /* |
@@ -2756,9 +2712,6 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, | |||
2756 | { | 2712 | { |
2757 | unsigned long bytes = nr_pages * PAGE_SIZE; | 2713 | unsigned long bytes = nr_pages * PAGE_SIZE; |
2758 | 2714 | ||
2759 | if (mem_cgroup_is_root(memcg)) | ||
2760 | return; | ||
2761 | |||
2762 | res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); | 2715 | res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); |
2763 | if (do_swap_account) | 2716 | if (do_swap_account) |
2764 | res_counter_uncharge_until(&memcg->memsw, | 2717 | res_counter_uncharge_until(&memcg->memsw, |
@@ -2842,14 +2795,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2842 | } | 2795 | } |
2843 | 2796 | ||
2844 | pc->mem_cgroup = memcg; | 2797 | pc->mem_cgroup = memcg; |
2845 | /* | ||
2846 | * We access a page_cgroup asynchronously without lock_page_cgroup(). | ||
2847 | * Especially when a page_cgroup is taken from a page, pc->mem_cgroup | ||
2848 | * is accessed after testing USED bit. To make pc->mem_cgroup visible | ||
2849 | * before USED bit, we need memory barrier here. | ||
2850 | * See mem_cgroup_add_lru_list(), etc. | ||
2851 | */ | ||
2852 | smp_wmb(); | ||
2853 | SetPageCgroupUsed(pc); | 2798 | SetPageCgroupUsed(pc); |
2854 | 2799 | ||
2855 | if (lrucare) { | 2800 | if (lrucare) { |
@@ -2937,8 +2882,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) | |||
2937 | if (ret) | 2882 | if (ret) |
2938 | return ret; | 2883 | return ret; |
2939 | 2884 | ||
2940 | ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, | 2885 | ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT); |
2941 | oom_gfp_allowed(gfp)); | ||
2942 | if (ret == -EINTR) { | 2886 | if (ret == -EINTR) { |
2943 | /* | 2887 | /* |
2944 | * mem_cgroup_try_charge() chosed to bypass to root due to | 2888 | * mem_cgroup_try_charge() chosed to bypass to root due to |
@@ -3463,12 +3407,13 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, | |||
3463 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); | 3407 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); |
3464 | return; | 3408 | return; |
3465 | } | 3409 | } |
3466 | 3410 | /* | |
3411 | * The page is freshly allocated and not visible to any | ||
3412 | * outside callers yet. Set up pc non-atomically. | ||
3413 | */ | ||
3467 | pc = lookup_page_cgroup(page); | 3414 | pc = lookup_page_cgroup(page); |
3468 | lock_page_cgroup(pc); | ||
3469 | pc->mem_cgroup = memcg; | 3415 | pc->mem_cgroup = memcg; |
3470 | SetPageCgroupUsed(pc); | 3416 | pc->flags = PCG_USED; |
3471 | unlock_page_cgroup(pc); | ||
3472 | } | 3417 | } |
3473 | 3418 | ||
3474 | void __memcg_kmem_uncharge_pages(struct page *page, int order) | 3419 | void __memcg_kmem_uncharge_pages(struct page *page, int order) |
@@ -3478,19 +3423,11 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) | |||
3478 | 3423 | ||
3479 | 3424 | ||
3480 | pc = lookup_page_cgroup(page); | 3425 | pc = lookup_page_cgroup(page); |
3481 | /* | ||
3482 | * Fast unlocked return. Theoretically might have changed, have to | ||
3483 | * check again after locking. | ||
3484 | */ | ||
3485 | if (!PageCgroupUsed(pc)) | 3426 | if (!PageCgroupUsed(pc)) |
3486 | return; | 3427 | return; |
3487 | 3428 | ||
3488 | lock_page_cgroup(pc); | 3429 | memcg = pc->mem_cgroup; |
3489 | if (PageCgroupUsed(pc)) { | 3430 | pc->flags = 0; |
3490 | memcg = pc->mem_cgroup; | ||
3491 | ClearPageCgroupUsed(pc); | ||
3492 | } | ||
3493 | unlock_page_cgroup(pc); | ||
3494 | 3431 | ||
3495 | /* | 3432 | /* |
3496 | * We trust that only if there is a memcg associated with the page, it | 3433 | * We trust that only if there is a memcg associated with the page, it |
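The two kmem hunks above drop lock_page_cgroup()/smp_wmb() because the commit path now runs while the page is freshly allocated and invisible to other CPUs, and the uncharge path runs once the page is exclusively owned again. The snippet below is a generic userspace illustration of that "initialize before publishing, tear down after unpublishing" pattern; the toy_pc table is a made-up stand-in for lookup_page_cgroup(), not kernel code.

#include <stddef.h>
#include <stdio.h>

struct toy_pc { void *owner; unsigned long flags; };
#define TOY_PCG_USED 1UL

static struct toy_pc table[16];         /* pretend page_cgroup array */

static void toy_commit(unsigned idx, void *owner)
{
        struct toy_pc *pc = &table[idx];

        /* the page is freshly allocated: nobody else can see pc yet */
        pc->owner = owner;
        pc->flags = TOY_PCG_USED;
}

static void *toy_uncharge(unsigned idx)
{
        struct toy_pc *pc = &table[idx];
        void *owner;

        if (!(pc->flags & TOY_PCG_USED))
                return NULL;
        owner = pc->owner;              /* the page is exclusively owned again */
        pc->flags = 0;
        return owner;
}

int main(void)
{
        int memcg_dummy;

        toy_commit(3, &memcg_dummy);
        printf("uncharged owner: %p\n", toy_uncharge(3));
        printf("second uncharge: %p\n", toy_uncharge(3));
        return 0;
}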
@@ -3531,7 +3468,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
3531 | for (i = 1; i < HPAGE_PMD_NR; i++) { | 3468 | for (i = 1; i < HPAGE_PMD_NR; i++) { |
3532 | pc = head_pc + i; | 3469 | pc = head_pc + i; |
3533 | pc->mem_cgroup = memcg; | 3470 | pc->mem_cgroup = memcg; |
3534 | smp_wmb();/* see __commit_charge() */ | ||
3535 | pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; | 3471 | pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; |
3536 | } | 3472 | } |
3537 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], | 3473 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], |
@@ -3687,7 +3623,6 @@ int mem_cgroup_charge_anon(struct page *page, | |||
3687 | { | 3623 | { |
3688 | unsigned int nr_pages = 1; | 3624 | unsigned int nr_pages = 1; |
3689 | struct mem_cgroup *memcg; | 3625 | struct mem_cgroup *memcg; |
3690 | bool oom = true; | ||
3691 | 3626 | ||
3692 | if (mem_cgroup_disabled()) | 3627 | if (mem_cgroup_disabled()) |
3693 | return 0; | 3628 | return 0; |
@@ -3699,14 +3634,9 @@ int mem_cgroup_charge_anon(struct page *page, | |||
3699 | if (PageTransHuge(page)) { | 3634 | if (PageTransHuge(page)) { |
3700 | nr_pages <<= compound_order(page); | 3635 | nr_pages <<= compound_order(page); |
3701 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | 3636 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
3702 | /* | ||
3703 | * Never OOM-kill a process for a huge page. The | ||
3704 | * fault handler will fall back to regular pages. | ||
3705 | */ | ||
3706 | oom = false; | ||
3707 | } | 3637 | } |
3708 | 3638 | ||
3709 | memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); | 3639 | memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages); |
3710 | if (!memcg) | 3640 | if (!memcg) |
3711 | return -ENOMEM; | 3641 | return -ENOMEM; |
3712 | __mem_cgroup_commit_charge(memcg, page, nr_pages, | 3642 | __mem_cgroup_commit_charge(memcg, page, nr_pages, |
@@ -3743,7 +3673,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
3743 | memcg = try_get_mem_cgroup_from_page(page); | 3673 | memcg = try_get_mem_cgroup_from_page(page); |
3744 | if (!memcg) | 3674 | if (!memcg) |
3745 | memcg = get_mem_cgroup_from_mm(mm); | 3675 | memcg = get_mem_cgroup_from_mm(mm); |
3746 | ret = mem_cgroup_try_charge(memcg, mask, 1, true); | 3676 | ret = mem_cgroup_try_charge(memcg, mask, 1); |
3747 | css_put(&memcg->css); | 3677 | css_put(&memcg->css); |
3748 | if (ret == -EINTR) | 3678 | if (ret == -EINTR) |
3749 | memcg = root_mem_cgroup; | 3679 | memcg = root_mem_cgroup; |
@@ -3770,7 +3700,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, | |||
3770 | if (!PageSwapCache(page)) { | 3700 | if (!PageSwapCache(page)) { |
3771 | struct mem_cgroup *memcg; | 3701 | struct mem_cgroup *memcg; |
3772 | 3702 | ||
3773 | memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); | 3703 | memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1); |
3774 | if (!memcg) | 3704 | if (!memcg) |
3775 | return -ENOMEM; | 3705 | return -ENOMEM; |
3776 | *memcgp = memcg; | 3706 | *memcgp = memcg; |
@@ -3839,7 +3769,7 @@ int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, | |||
3839 | return 0; | 3769 | return 0; |
3840 | } | 3770 | } |
3841 | 3771 | ||
3842 | memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); | 3772 | memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1); |
3843 | if (!memcg) | 3773 | if (!memcg) |
3844 | return -ENOMEM; | 3774 | return -ENOMEM; |
3845 | __mem_cgroup_commit_charge(memcg, page, 1, type, false); | 3775 | __mem_cgroup_commit_charge(memcg, page, 1, type, false); |
@@ -3993,7 +3923,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, | |||
3993 | * replacement page, so leave it alone when phasing out the | 3923 | * replacement page, so leave it alone when phasing out the |
3994 | * page that is unused after the migration. | 3924 | * page that is unused after the migration. |
3995 | */ | 3925 | */ |
3996 | if (!end_migration && !mem_cgroup_is_root(memcg)) | 3926 | if (!end_migration) |
3997 | mem_cgroup_do_uncharge(memcg, nr_pages, ctype); | 3927 | mem_cgroup_do_uncharge(memcg, nr_pages, ctype); |
3998 | 3928 | ||
3999 | return memcg; | 3929 | return memcg; |
@@ -4126,8 +4056,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
4126 | * We uncharge this because swap is freed. This memcg can | 4056 | * We uncharge this because swap is freed. This memcg can |
4127 | * be obsolete one. We avoid calling css_tryget_online(). | 4057 | * be obsolete one. We avoid calling css_tryget_online(). |
4128 | */ | 4058 | */ |
4129 | if (!mem_cgroup_is_root(memcg)) | 4059 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); |
4130 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | ||
4131 | mem_cgroup_swap_statistics(memcg, false); | 4060 | mem_cgroup_swap_statistics(memcg, false); |
4132 | css_put(&memcg->css); | 4061 | css_put(&memcg->css); |
4133 | } | 4062 | } |
@@ -4817,78 +4746,24 @@ out: | |||
4817 | return retval; | 4746 | return retval; |
4818 | } | 4747 | } |
4819 | 4748 | ||
4820 | |||
4821 | static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, | ||
4822 | enum mem_cgroup_stat_index idx) | ||
4823 | { | ||
4824 | struct mem_cgroup *iter; | ||
4825 | long val = 0; | ||
4826 | |||
4827 | /* Per-cpu values can be negative, use a signed accumulator */ | ||
4828 | for_each_mem_cgroup_tree(iter, memcg) | ||
4829 | val += mem_cgroup_read_stat(iter, idx); | ||
4830 | |||
4831 | if (val < 0) /* race ? */ | ||
4832 | val = 0; | ||
4833 | return val; | ||
4834 | } | ||
4835 | |||
4836 | static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | ||
4837 | { | ||
4838 | u64 val; | ||
4839 | |||
4840 | if (!mem_cgroup_is_root(memcg)) { | ||
4841 | if (!swap) | ||
4842 | return res_counter_read_u64(&memcg->res, RES_USAGE); | ||
4843 | else | ||
4844 | return res_counter_read_u64(&memcg->memsw, RES_USAGE); | ||
4845 | } | ||
4846 | |||
4847 | /* | ||
4848 | * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS | ||
4849 | * as well as in MEM_CGROUP_STAT_RSS_HUGE. | ||
4850 | */ | ||
4851 | val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); | ||
4852 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); | ||
4853 | |||
4854 | if (swap) | ||
4855 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); | ||
4856 | |||
4857 | return val << PAGE_SHIFT; | ||
4858 | } | ||
4859 | |||
4860 | static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, | 4749 | static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, |
4861 | struct cftype *cft) | 4750 | struct cftype *cft) |
4862 | { | 4751 | { |
4863 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 4752 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
4864 | u64 val; | 4753 | enum res_type type = MEMFILE_TYPE(cft->private); |
4865 | int name; | 4754 | int name = MEMFILE_ATTR(cft->private); |
4866 | enum res_type type; | ||
4867 | |||
4868 | type = MEMFILE_TYPE(cft->private); | ||
4869 | name = MEMFILE_ATTR(cft->private); | ||
4870 | 4755 | ||
4871 | switch (type) { | 4756 | switch (type) { |
4872 | case _MEM: | 4757 | case _MEM: |
4873 | if (name == RES_USAGE) | 4758 | return res_counter_read_u64(&memcg->res, name); |
4874 | val = mem_cgroup_usage(memcg, false); | ||
4875 | else | ||
4876 | val = res_counter_read_u64(&memcg->res, name); | ||
4877 | break; | ||
4878 | case _MEMSWAP: | 4759 | case _MEMSWAP: |
4879 | if (name == RES_USAGE) | 4760 | return res_counter_read_u64(&memcg->memsw, name); |
4880 | val = mem_cgroup_usage(memcg, true); | ||
4881 | else | ||
4882 | val = res_counter_read_u64(&memcg->memsw, name); | ||
4883 | break; | ||
4884 | case _KMEM: | 4761 | case _KMEM: |
4885 | val = res_counter_read_u64(&memcg->kmem, name); | 4762 | return res_counter_read_u64(&memcg->kmem, name); |
4886 | break; | 4763 | break; |
4887 | default: | 4764 | default: |
4888 | BUG(); | 4765 | BUG(); |
4889 | } | 4766 | } |
4890 | |||
4891 | return val; | ||
4892 | } | 4767 | } |
4893 | 4768 | ||
4894 | #ifdef CONFIG_MEMCG_KMEM | 4769 | #ifdef CONFIG_MEMCG_KMEM |
@@ -5350,7 +5225,10 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |||
5350 | if (!t) | 5225 | if (!t) |
5351 | goto unlock; | 5226 | goto unlock; |
5352 | 5227 | ||
5353 | usage = mem_cgroup_usage(memcg, swap); | 5228 | if (!swap) |
5229 | usage = res_counter_read_u64(&memcg->res, RES_USAGE); | ||
5230 | else | ||
5231 | usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | ||
5354 | 5232 | ||
5355 | /* | 5233 | /* |
5356 | * current_threshold points to threshold just below or equal to usage. | 5234 | * current_threshold points to threshold just below or equal to usage. |
@@ -5446,15 +5324,15 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | |||
5446 | 5324 | ||
5447 | mutex_lock(&memcg->thresholds_lock); | 5325 | mutex_lock(&memcg->thresholds_lock); |
5448 | 5326 | ||
5449 | if (type == _MEM) | 5327 | if (type == _MEM) { |
5450 | thresholds = &memcg->thresholds; | 5328 | thresholds = &memcg->thresholds; |
5451 | else if (type == _MEMSWAP) | 5329 | usage = res_counter_read_u64(&memcg->res, RES_USAGE); |
5330 | } else if (type == _MEMSWAP) { | ||
5452 | thresholds = &memcg->memsw_thresholds; | 5331 | thresholds = &memcg->memsw_thresholds; |
5453 | else | 5332 | usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
5333 | } else | ||
5454 | BUG(); | 5334 | BUG(); |
5455 | 5335 | ||
5456 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | ||
5457 | |||
5458 | /* Check if a threshold crossed before adding a new one */ | 5336 | /* Check if a threshold crossed before adding a new one */ |
5459 | if (thresholds->primary) | 5337 | if (thresholds->primary) |
5460 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | 5338 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
@@ -5534,18 +5412,19 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | |||
5534 | int i, j, size; | 5412 | int i, j, size; |
5535 | 5413 | ||
5536 | mutex_lock(&memcg->thresholds_lock); | 5414 | mutex_lock(&memcg->thresholds_lock); |
5537 | if (type == _MEM) | 5415 | |
5416 | if (type == _MEM) { | ||
5538 | thresholds = &memcg->thresholds; | 5417 | thresholds = &memcg->thresholds; |
5539 | else if (type == _MEMSWAP) | 5418 | usage = res_counter_read_u64(&memcg->res, RES_USAGE); |
5419 | } else if (type == _MEMSWAP) { | ||
5540 | thresholds = &memcg->memsw_thresholds; | 5420 | thresholds = &memcg->memsw_thresholds; |
5541 | else | 5421 | usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
5422 | } else | ||
5542 | BUG(); | 5423 | BUG(); |
5543 | 5424 | ||
5544 | if (!thresholds->primary) | 5425 | if (!thresholds->primary) |
5545 | goto unlock; | 5426 | goto unlock; |
5546 | 5427 | ||
5547 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | ||
5548 | |||
5549 | /* Check if a threshold crossed before removing */ | 5428 | /* Check if a threshold crossed before removing */ |
5550 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | 5429 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
5551 | 5430 | ||
@@ -6299,9 +6178,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
6299 | * core guarantees its existence. | 6178 | * core guarantees its existence. |
6300 | */ | 6179 | */ |
6301 | } else { | 6180 | } else { |
6302 | res_counter_init(&memcg->res, NULL); | 6181 | res_counter_init(&memcg->res, &root_mem_cgroup->res); |
6303 | res_counter_init(&memcg->memsw, NULL); | 6182 | res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); |
6304 | res_counter_init(&memcg->kmem, NULL); | 6183 | res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); |
6305 | /* | 6184 | /* |
6306 | * Deeper hierachy with use_hierarchy == false doesn't make | 6185 | * Deeper hierachy with use_hierarchy == false doesn't make |
6307 | * much sense so let cgroup subsystem know about this | 6186 | * much sense so let cgroup subsystem know about this |
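The css_online hunk just above parents every top-level group's res_counters to root_mem_cgroup instead of NULL. That is what allows the earlier hunks to drop the mem_cgroup_is_root() special cases: root usage is now maintained by ordinary hierarchical charging, so mem_cgroup_usage()'s per-cpu summing can go away and RES_USAGE can be read straight from the counter. A toy parented counter, with invented toy_* names and no locking, shows why the root total stays correct:

#include <stdbool.h>
#include <stdio.h>

struct toy_counter {
        long usage, limit;
        struct toy_counter *parent;
};

static bool toy_charge(struct toy_counter *c, long pages)
{
        struct toy_counter *i, *fail = NULL;

        for (i = c; i; i = i->parent) {
                if (i->usage + pages > i->limit) {
                        fail = i;
                        break;
                }
                i->usage += pages;
        }
        if (!fail)
                return true;
        /* roll back the ancestors we already charged */
        for (i = c; i != fail; i = i->parent)
                i->usage -= pages;
        return false;
}

static void toy_uncharge(struct toy_counter *c, long pages)
{
        for (; c; c = c->parent)
                c->usage -= pages;
}

int main(void)
{
        struct toy_counter root  = { .limit = 1L << 30 };
        struct toy_counter child = { .limit = 128, .parent = &root };

        toy_charge(&child, 100);
        printf("root usage without summing stats: %ld\n", root.usage);
        toy_uncharge(&child, 100);
        printf("after uncharge:                   %ld\n", root.usage);
        return 0;
}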
@@ -6435,55 +6314,39 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) | |||
6435 | 6314 | ||
6436 | #ifdef CONFIG_MMU | 6315 | #ifdef CONFIG_MMU |
6437 | /* Handlers for move charge at task migration. */ | 6316 | /* Handlers for move charge at task migration. */ |
6438 | #define PRECHARGE_COUNT_AT_ONCE 256 | ||
6439 | static int mem_cgroup_do_precharge(unsigned long count) | 6317 | static int mem_cgroup_do_precharge(unsigned long count) |
6440 | { | 6318 | { |
6441 | int ret = 0; | 6319 | int ret; |
6442 | int batch_count = PRECHARGE_COUNT_AT_ONCE; | ||
6443 | struct mem_cgroup *memcg = mc.to; | ||
6444 | 6320 | ||
6445 | if (mem_cgroup_is_root(memcg)) { | 6321 | /* Try a single bulk charge without reclaim first */ |
6322 | ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); | ||
6323 | if (!ret) { | ||
6446 | mc.precharge += count; | 6324 | mc.precharge += count; |
6447 | /* we don't need css_get for root */ | ||
6448 | return ret; | 6325 | return ret; |
6449 | } | 6326 | } |
6450 | /* try to charge at once */ | 6327 | if (ret == -EINTR) { |
6451 | if (count > 1) { | 6328 | __mem_cgroup_cancel_charge(root_mem_cgroup, count); |
6452 | struct res_counter *dummy; | ||
6453 | /* | ||
6454 | * "memcg" cannot be under rmdir() because we've already checked | ||
6455 | * by cgroup_lock_live_cgroup() that it is not removed and we | ||
6456 | * are still under the same cgroup_mutex. So we can postpone | ||
6457 | * css_get(). | ||
6458 | */ | ||
6459 | if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) | ||
6460 | goto one_by_one; | ||
6461 | if (do_swap_account && res_counter_charge(&memcg->memsw, | ||
6462 | PAGE_SIZE * count, &dummy)) { | ||
6463 | res_counter_uncharge(&memcg->res, PAGE_SIZE * count); | ||
6464 | goto one_by_one; | ||
6465 | } | ||
6466 | mc.precharge += count; | ||
6467 | return ret; | 6329 | return ret; |
6468 | } | 6330 | } |
6469 | one_by_one: | 6331 | |
6470 | /* fall back to one by one charge */ | 6332 | /* Try charges one by one with reclaim */ |
6471 | while (count--) { | 6333 | while (count--) { |
6472 | if (signal_pending(current)) { | 6334 | ret = mem_cgroup_try_charge(mc.to, |
6473 | ret = -EINTR; | 6335 | GFP_KERNEL & ~__GFP_NORETRY, 1); |
6474 | break; | 6336 | /* |
6475 | } | 6337 | * In case of failure, any residual charges against |
6476 | if (!batch_count--) { | 6338 | * mc.to will be dropped by mem_cgroup_clear_mc() |
6477 | batch_count = PRECHARGE_COUNT_AT_ONCE; | 6339 | * later on. However, cancel any charges that are |
6478 | cond_resched(); | 6340 | * bypassed to root right away or they'll be lost. |
6479 | } | 6341 | */ |
6480 | ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); | 6342 | if (ret == -EINTR) |
6343 | __mem_cgroup_cancel_charge(root_mem_cgroup, 1); | ||
6481 | if (ret) | 6344 | if (ret) |
6482 | /* mem_cgroup_clear_mc() will do uncharge later */ | ||
6483 | return ret; | 6345 | return ret; |
6484 | mc.precharge++; | 6346 | mc.precharge++; |
6347 | cond_resched(); | ||
6485 | } | 6348 | } |
6486 | return ret; | 6349 | return 0; |
6487 | } | 6350 | } |
6488 | 6351 | ||
6489 | /** | 6352 | /** |
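The rewritten mem_cgroup_do_precharge() above first attempts one cheap, non-reclaiming bulk charge (GFP_KERNEL & ~__GFP_WAIT) and only then falls back to charging page by page with reclaim allowed. The following stand-alone sketch models that bulk-then-single strategy with invented toy_* types; the "slack" counter is a crude stand-in for what reclaim can free.

#include <stdio.h>

struct toy_pool { long avail; };

/* cheap path: all-or-nothing, never frees anything up */
static int toy_bulk_take(struct toy_pool *p, long n)
{
        if (p->avail < n)
                return -1;
        p->avail -= n;
        return 0;
}

/* slow path: may "reclaim" one unit of slack per attempt */
static int toy_single_take(struct toy_pool *p, long *slack)
{
        if (!p->avail && *slack) {
                p->avail += 1;
                *slack -= 1;
        }
        if (!p->avail)
                return -1;
        p->avail -= 1;
        return 0;
}

static long toy_precharge(struct toy_pool *p, long count, long slack)
{
        long got = 0;

        if (!toy_bulk_take(p, count))
                return count;           /* bulk attempt covered everything */
        while (count--) {
                if (toy_single_take(p, &slack))
                        break;
                got++;
        }
        return got;
}

int main(void)
{
        struct toy_pool p = { .avail = 3 };

        printf("precharged %ld of 6\n", toy_precharge(&p, 6, 2));
        return 0;
}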
@@ -6760,21 +6623,18 @@ static void __mem_cgroup_clear_mc(void) | |||
6760 | /* we must fixup refcnts and charges */ | 6623 | /* we must fixup refcnts and charges */ |
6761 | if (mc.moved_swap) { | 6624 | if (mc.moved_swap) { |
6762 | /* uncharge swap account from the old cgroup */ | 6625 | /* uncharge swap account from the old cgroup */ |
6763 | if (!mem_cgroup_is_root(mc.from)) | 6626 | res_counter_uncharge(&mc.from->memsw, |
6764 | res_counter_uncharge(&mc.from->memsw, | 6627 | PAGE_SIZE * mc.moved_swap); |
6765 | PAGE_SIZE * mc.moved_swap); | ||
6766 | 6628 | ||
6767 | for (i = 0; i < mc.moved_swap; i++) | 6629 | for (i = 0; i < mc.moved_swap; i++) |
6768 | css_put(&mc.from->css); | 6630 | css_put(&mc.from->css); |
6769 | 6631 | ||
6770 | if (!mem_cgroup_is_root(mc.to)) { | 6632 | /* |
6771 | /* | 6633 | * we charged both to->res and to->memsw, so we should |
6772 | * we charged both to->res and to->memsw, so we should | 6634 | * uncharge to->res. |
6773 | * uncharge to->res. | 6635 | */ |
6774 | */ | 6636 | res_counter_uncharge(&mc.to->res, |
6775 | res_counter_uncharge(&mc.to->res, | 6637 | PAGE_SIZE * mc.moved_swap); |
6776 | PAGE_SIZE * mc.moved_swap); | ||
6777 | } | ||
6778 | /* we've already done css_get(mc.to) */ | 6638 | /* we've already done css_get(mc.to) */ |
6779 | mc.moved_swap = 0; | 6639 | mc.moved_swap = 0; |
6780 | } | 6640 | } |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a013bc94ebbe..44c6bd201d3a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -1173,6 +1173,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1173 | lock_page(hpage); | 1173 | lock_page(hpage); |
1174 | 1174 | ||
1175 | /* | 1175 | /* |
1176 | * The page could have changed compound pages during the locking. | ||
1177 | * If this happens just bail out. | ||
1178 | */ | ||
1179 | if (compound_head(p) != hpage) { | ||
1180 | action_result(pfn, "different compound page after locking", IGNORED); | ||
1181 | res = -EBUSY; | ||
1182 | goto out; | ||
1183 | } | ||
1184 | |||
1185 | /* | ||
1176 | * We use page flags to determine what action should be taken, but | 1186 | * We use page flags to determine what action should be taken, but |
1177 | * the flags can be modified by the error containment action. One | 1187 | * the flags can be modified by the error containment action. One |
1178 | * example is an mlocked page, where PG_mlocked is cleared by | 1188 | * example is an mlocked page, where PG_mlocked is cleared by |
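The memory-failure.c hunk adds a check-after-lock: compound_head() was sampled before lock_page(), so it has to be re-validated once the lock is held, and the handler bails out with -EBUSY if the page was reshaped in between. The userspace fragment below illustrates the same pattern; pthread_mutex_* is real libc API, while toy_resource and its generation field are invented for the example.

#include <pthread.h>
#include <stdio.h>

struct toy_resource {
        pthread_mutex_t lock;
        int generation;         /* bumped whenever the object is reshaped */
};

static int handle_error(struct toy_resource *r, int seen_generation)
{
        pthread_mutex_lock(&r->lock);
        if (r->generation != seen_generation) {
                /* object changed between sampling and locking: bail out */
                pthread_mutex_unlock(&r->lock);
                return -1;
        }
        /* ... safe to act on the object here ... */
        pthread_mutex_unlock(&r->lock);
        return 0;
}

int main(void)
{
        struct toy_resource r = { PTHREAD_MUTEX_INITIALIZER, 0 };
        int seen = r.generation;

        r.generation++;         /* simulate a concurrent change */
        printf("recheck result: %d\n", handle_error(&r, seen));
        return 0;
}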
diff --git a/mm/memory.c b/mm/memory.c index 8b44f765b645..5c55270729f7 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -884,7 +884,7 @@ out_set_pte: | |||
884 | return 0; | 884 | return 0; |
885 | } | 885 | } |
886 | 886 | ||
887 | int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 887 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
888 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, | 888 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, |
889 | unsigned long addr, unsigned long end) | 889 | unsigned long addr, unsigned long end) |
890 | { | 890 | { |
@@ -2399,7 +2399,10 @@ EXPORT_SYMBOL(unmap_mapping_range); | |||
2399 | /* | 2399 | /* |
2400 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 2400 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
2401 | * but allow concurrent faults), and pte mapped but not yet locked. | 2401 | * but allow concurrent faults), and pte mapped but not yet locked. |
2402 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2402 | * We return with pte unmapped and unlocked. |
2403 | * | ||
2404 | * We return with the mmap_sem locked or unlocked in the same cases | ||
2405 | * as does filemap_fault(). | ||
2403 | */ | 2406 | */ |
2404 | static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2407 | static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, |
2405 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2408 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
@@ -2688,6 +2691,11 @@ oom: | |||
2688 | return VM_FAULT_OOM; | 2691 | return VM_FAULT_OOM; |
2689 | } | 2692 | } |
2690 | 2693 | ||
2694 | /* | ||
2695 | * The mmap_sem must have been held on entry, and may have been | ||
2696 | * released depending on flags and vma->vm_ops->fault() return value. | ||
2697 | * See filemap_fault() and __lock_page_retry(). | ||
2698 | */ | ||
2691 | static int __do_fault(struct vm_area_struct *vma, unsigned long address, | 2699 | static int __do_fault(struct vm_area_struct *vma, unsigned long address, |
2692 | pgoff_t pgoff, unsigned int flags, struct page **page) | 2700 | pgoff_t pgoff, unsigned int flags, struct page **page) |
2693 | { | 2701 | { |
@@ -2744,7 +2752,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, | |||
2744 | if (write) | 2752 | if (write) |
2745 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2753 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2746 | else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) | 2754 | else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) |
2747 | pte_mksoft_dirty(entry); | 2755 | entry = pte_mksoft_dirty(entry); |
2748 | if (anon) { | 2756 | if (anon) { |
2749 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 2757 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2750 | page_add_new_anon_rmap(page, vma, address); | 2758 | page_add_new_anon_rmap(page, vma, address); |
@@ -2758,17 +2766,8 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, | |||
2758 | update_mmu_cache(vma, address, pte); | 2766 | update_mmu_cache(vma, address, pte); |
2759 | } | 2767 | } |
2760 | 2768 | ||
2761 | static unsigned long fault_around_bytes = rounddown_pow_of_two(65536); | 2769 | static unsigned long fault_around_bytes __read_mostly = |
2762 | 2770 | rounddown_pow_of_two(65536); | |
2763 | static inline unsigned long fault_around_pages(void) | ||
2764 | { | ||
2765 | return fault_around_bytes >> PAGE_SHIFT; | ||
2766 | } | ||
2767 | |||
2768 | static inline unsigned long fault_around_mask(void) | ||
2769 | { | ||
2770 | return ~(fault_around_bytes - 1) & PAGE_MASK; | ||
2771 | } | ||
2772 | 2771 | ||
2773 | #ifdef CONFIG_DEBUG_FS | 2772 | #ifdef CONFIG_DEBUG_FS |
2774 | static int fault_around_bytes_get(void *data, u64 *val) | 2773 | static int fault_around_bytes_get(void *data, u64 *val) |
@@ -2834,12 +2833,15 @@ late_initcall(fault_around_debugfs); | |||
2834 | static void do_fault_around(struct vm_area_struct *vma, unsigned long address, | 2833 | static void do_fault_around(struct vm_area_struct *vma, unsigned long address, |
2835 | pte_t *pte, pgoff_t pgoff, unsigned int flags) | 2834 | pte_t *pte, pgoff_t pgoff, unsigned int flags) |
2836 | { | 2835 | { |
2837 | unsigned long start_addr; | 2836 | unsigned long start_addr, nr_pages, mask; |
2838 | pgoff_t max_pgoff; | 2837 | pgoff_t max_pgoff; |
2839 | struct vm_fault vmf; | 2838 | struct vm_fault vmf; |
2840 | int off; | 2839 | int off; |
2841 | 2840 | ||
2842 | start_addr = max(address & fault_around_mask(), vma->vm_start); | 2841 | nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; |
2842 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; | ||
2843 | |||
2844 | start_addr = max(address & mask, vma->vm_start); | ||
2843 | off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); | 2845 | off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); |
2844 | pte -= off; | 2846 | pte -= off; |
2845 | pgoff -= off; | 2847 | pgoff -= off; |
@@ -2851,7 +2853,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, | |||
2851 | max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + | 2853 | max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + |
2852 | PTRS_PER_PTE - 1; | 2854 | PTRS_PER_PTE - 1; |
2853 | max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, | 2855 | max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, |
2854 | pgoff + fault_around_pages() - 1); | 2856 | pgoff + nr_pages - 1); |
2855 | 2857 | ||
2856 | /* Check if it makes any sense to call ->map_pages */ | 2858 | /* Check if it makes any sense to call ->map_pages */ |
2857 | while (!pte_none(*pte)) { | 2859 | while (!pte_none(*pte)) { |
@@ -2886,7 +2888,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2886 | * something). | 2888 | * something). |
2887 | */ | 2889 | */ |
2888 | if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && | 2890 | if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && |
2889 | fault_around_pages() > 1) { | 2891 | fault_around_bytes >> PAGE_SHIFT > 1) { |
2890 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2892 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
2891 | do_fault_around(vma, address, pte, pgoff, flags); | 2893 | do_fault_around(vma, address, pte, pgoff, flags); |
2892 | if (!pte_same(*pte, orig_pte)) | 2894 | if (!pte_same(*pte, orig_pte)) |
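The fault-around rework reads fault_around_bytes once with ACCESS_ONCE, then derives both the page count and the alignment mask from that single snapshot, replacing the old fault_around_pages()/fault_around_mask() helper pair that could observe two different values. The short program below just reproduces that arithmetic in userspace, assuming 4KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12UL
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long fault_around_bytes = 65536;       /* one snapshot */
        unsigned long address = 0x7f0000012345UL;

        unsigned long nr_pages = fault_around_bytes >> PAGE_SHIFT;
        unsigned long mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
        unsigned long start_addr = address & mask;

        printf("nr_pages   = %lu\n", nr_pages);         /* 16 */
        printf("start_addr = %#lx\n", start_addr);      /* 64KiB-aligned */
        return 0;
}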
@@ -3016,6 +3018,12 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3016 | return ret; | 3018 | return ret; |
3017 | } | 3019 | } |
3018 | 3020 | ||
3021 | /* | ||
3022 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | ||
3023 | * but allow concurrent faults). | ||
3024 | * The mmap_sem may have been released depending on flags and our | ||
3025 | * return value. See filemap_fault() and __lock_page_or_retry(). | ||
3026 | */ | ||
3019 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3027 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
3020 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 3028 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
3021 | unsigned int flags, pte_t orig_pte) | 3029 | unsigned int flags, pte_t orig_pte) |
@@ -3040,7 +3048,9 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3040 | * | 3048 | * |
3041 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 3049 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
3042 | * but allow concurrent faults), and pte mapped but not yet locked. | 3050 | * but allow concurrent faults), and pte mapped but not yet locked. |
3043 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 3051 | * We return with pte unmapped and unlocked. |
3052 | * The mmap_sem may have been released depending on flags and our | ||
3053 | * return value. See filemap_fault() and __lock_page_or_retry(). | ||
3044 | */ | 3054 | */ |
3045 | static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3055 | static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
3046 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 3056 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
@@ -3172,7 +3182,10 @@ out: | |||
3172 | * | 3182 | * |
3173 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 3183 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
3174 | * but allow concurrent faults), and pte mapped but not yet locked. | 3184 | * but allow concurrent faults), and pte mapped but not yet locked. |
3175 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 3185 | * We return with pte unmapped and unlocked. |
3186 | * | ||
3187 | * The mmap_sem may have been released depending on flags and our | ||
3188 | * return value. See filemap_fault() and __lock_page_or_retry(). | ||
3176 | */ | 3189 | */ |
3177 | static int handle_pte_fault(struct mm_struct *mm, | 3190 | static int handle_pte_fault(struct mm_struct *mm, |
3178 | struct vm_area_struct *vma, unsigned long address, | 3191 | struct vm_area_struct *vma, unsigned long address, |
@@ -3181,7 +3194,7 @@ static int handle_pte_fault(struct mm_struct *mm, | |||
3181 | pte_t entry; | 3194 | pte_t entry; |
3182 | spinlock_t *ptl; | 3195 | spinlock_t *ptl; |
3183 | 3196 | ||
3184 | entry = *pte; | 3197 | entry = ACCESS_ONCE(*pte); |
3185 | if (!pte_present(entry)) { | 3198 | if (!pte_present(entry)) { |
3186 | if (pte_none(entry)) { | 3199 | if (pte_none(entry)) { |
3187 | if (vma->vm_ops) { | 3200 | if (vma->vm_ops) { |
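handle_pte_fault() now snapshots the pte with ACCESS_ONCE so that all subsequent tests are made against one consistent value rather than a live location that can change underneath them. The same idea in portable C: a volatile cast forcing a single read, which is essentially what the kernel macro boils down to. The shared_pte variable below is purely illustrative.

#include <stdio.h>

#define READ_ONCE_UL(x) (*(volatile unsigned long *)&(x))

static unsigned long shared_pte;        /* stand-in for a live pte */

static void inspect(void)
{
        unsigned long entry = READ_ONCE_UL(shared_pte); /* single read */

        if (!(entry & 1))
                printf("not present: %#lx\n", entry);
        else
                printf("present:     %#lx\n", entry);
        /* every test above used the same 'entry' snapshot */
}

int main(void)
{
        shared_pte = 0x1000;
        inspect();
        shared_pte = 0x1001;
        inspect();
        return 0;
}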
@@ -3232,6 +3245,9 @@ unlock: | |||
3232 | 3245 | ||
3233 | /* | 3246 | /* |
3234 | * By the time we get here, we already hold the mm semaphore | 3247 | * By the time we get here, we already hold the mm semaphore |
3248 | * | ||
3249 | * The mmap_sem may have been released depending on flags and our | ||
3250 | * return value. See filemap_fault() and __lock_page_or_retry(). | ||
3235 | */ | 3251 | */ |
3236 | static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3252 | static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
3237 | unsigned long address, unsigned int flags) | 3253 | unsigned long address, unsigned int flags) |
@@ -3313,6 +3329,12 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3313 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | 3329 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); |
3314 | } | 3330 | } |
3315 | 3331 | ||
3332 | /* | ||
3333 | * By the time we get here, we already hold the mm semaphore | ||
3334 | * | ||
3335 | * The mmap_sem may have been released depending on flags and our | ||
3336 | * return value. See filemap_fault() and __lock_page_or_retry(). | ||
3337 | */ | ||
3316 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3338 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
3317 | unsigned long address, unsigned int flags) | 3339 | unsigned long address, unsigned int flags) |
3318 | { | 3340 | { |
@@ -3591,11 +3613,13 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, | |||
3591 | ret = get_user_pages(tsk, mm, addr, 1, | 3613 | ret = get_user_pages(tsk, mm, addr, 1, |
3592 | write, 1, &page, &vma); | 3614 | write, 1, &page, &vma); |
3593 | if (ret <= 0) { | 3615 | if (ret <= 0) { |
3616 | #ifndef CONFIG_HAVE_IOREMAP_PROT | ||
3617 | break; | ||
3618 | #else | ||
3594 | /* | 3619 | /* |
3595 | * Check if this is a VM_IO | VM_PFNMAP VMA, which | 3620 | * Check if this is a VM_IO | VM_PFNMAP VMA, which |
3596 | * we can access using slightly different code. | 3621 | * we can access using slightly different code. |
3597 | */ | 3622 | */ |
3598 | #ifdef CONFIG_HAVE_IOREMAP_PROT | ||
3599 | vma = find_vma(mm, addr); | 3623 | vma = find_vma(mm, addr); |
3600 | if (!vma || vma->vm_start > addr) | 3624 | if (!vma || vma->vm_start > addr) |
3601 | break; | 3625 | break; |
@@ -3603,9 +3627,9 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, | |||
3603 | ret = vma->vm_ops->access(vma, addr, buf, | 3627 | ret = vma->vm_ops->access(vma, addr, buf, |
3604 | len, write); | 3628 | len, write); |
3605 | if (ret <= 0) | 3629 | if (ret <= 0) |
3606 | #endif | ||
3607 | break; | 3630 | break; |
3608 | bytes = ret; | 3631 | bytes = ret; |
3632 | #endif | ||
3609 | } else { | 3633 | } else { |
3610 | bytes = len; | 3634 | bytes = len; |
3611 | offset = addr & (PAGE_SIZE-1); | 3635 | offset = addr & (PAGE_SIZE-1); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 469bbf505f85..2ff8c2325e96 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -284,8 +284,8 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) | |||
284 | } | 284 | } |
285 | #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ | 285 | #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ |
286 | 286 | ||
287 | static void grow_zone_span(struct zone *zone, unsigned long start_pfn, | 287 | static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, |
288 | unsigned long end_pfn) | 288 | unsigned long end_pfn) |
289 | { | 289 | { |
290 | unsigned long old_zone_end_pfn; | 290 | unsigned long old_zone_end_pfn; |
291 | 291 | ||
@@ -427,8 +427,8 @@ out_fail: | |||
427 | return -1; | 427 | return -1; |
428 | } | 428 | } |
429 | 429 | ||
430 | static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, | 430 | static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, |
431 | unsigned long end_pfn) | 431 | unsigned long end_pfn) |
432 | { | 432 | { |
433 | unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); | 433 | unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); |
434 | 434 | ||
@@ -977,15 +977,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
977 | zone = page_zone(pfn_to_page(pfn)); | 977 | zone = page_zone(pfn_to_page(pfn)); |
978 | 978 | ||
979 | ret = -EINVAL; | 979 | ret = -EINVAL; |
980 | if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && | 980 | if ((zone_idx(zone) > ZONE_NORMAL || |
981 | online_type == MMOP_ONLINE_MOVABLE) && | ||
981 | !can_online_high_movable(zone)) | 982 | !can_online_high_movable(zone)) |
982 | goto out; | 983 | goto out; |
983 | 984 | ||
984 | if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { | 985 | if (online_type == MMOP_ONLINE_KERNEL && |
986 | zone_idx(zone) == ZONE_MOVABLE) { | ||
985 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) | 987 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) |
986 | goto out; | 988 | goto out; |
987 | } | 989 | } |
988 | if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { | 990 | if (online_type == MMOP_ONLINE_MOVABLE && |
991 | zone_idx(zone) == ZONE_MOVABLE - 1) { | ||
989 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) | 992 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) |
990 | goto out; | 993 | goto out; |
991 | } | 994 | } |
@@ -1156,6 +1159,34 @@ static int check_hotplug_memory_range(u64 start, u64 size) | |||
1156 | return 0; | 1159 | return 0; |
1157 | } | 1160 | } |
1158 | 1161 | ||
1162 | /* | ||
1163 | * If movable zone has already been setup, newly added memory should be check. | ||
1164 | * If its address is higher than movable zone, it should be added as movable. | ||
1165 | * Without this check, movable zone may overlap with other zone. | ||
1166 | */ | ||
1167 | static int should_add_memory_movable(int nid, u64 start, u64 size) | ||
1168 | { | ||
1169 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
1170 | pg_data_t *pgdat = NODE_DATA(nid); | ||
1171 | struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; | ||
1172 | |||
1173 | if (zone_is_empty(movable_zone)) | ||
1174 | return 0; | ||
1175 | |||
1176 | if (movable_zone->zone_start_pfn <= start_pfn) | ||
1177 | return 1; | ||
1178 | |||
1179 | return 0; | ||
1180 | } | ||
1181 | |||
1182 | int zone_for_memory(int nid, u64 start, u64 size, int zone_default) | ||
1183 | { | ||
1184 | if (should_add_memory_movable(nid, start, size)) | ||
1185 | return ZONE_MOVABLE; | ||
1186 | |||
1187 | return zone_default; | ||
1188 | } | ||
1189 | |||
1159 | /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ | 1190 | /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ |
1160 | int __ref add_memory(int nid, u64 start, u64 size) | 1191 | int __ref add_memory(int nid, u64 start, u64 size) |
1161 | { | 1192 | { |
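zone_for_memory() above only overrides the caller's default zone when a movable zone is already populated and the hot-added range starts at or above it, so ZONE_MOVABLE never ends up overlapping a lower zone. A stand-alone sketch of that decision with the pgdat lookup and zone indices simplified away (toy_* names are invented):

#include <stdbool.h>
#include <stdio.h>

enum toy_zone { TOY_ZONE_NORMAL, TOY_ZONE_MOVABLE };

struct toy_movable_zone {
        bool empty;
        unsigned long start_pfn;
};

static enum toy_zone toy_zone_for_memory(const struct toy_movable_zone *mz,
                                         unsigned long start_pfn,
                                         enum toy_zone zone_default)
{
        if (!mz->empty && mz->start_pfn <= start_pfn)
                return TOY_ZONE_MOVABLE;        /* keep movable on top */
        return zone_default;
}

int main(void)
{
        struct toy_movable_zone mz = { .empty = false, .start_pfn = 0x100000 };

        printf("below movable: %d\n",
               toy_zone_for_memory(&mz, 0x080000, TOY_ZONE_NORMAL));
        printf("above movable: %d\n",
               toy_zone_for_memory(&mz, 0x180000, TOY_ZONE_NORMAL));
        return 0;
}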
diff --git a/mm/mlock.c b/mm/mlock.c index b1eb53634005..ce84cb0b83ef 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -210,12 +210,19 @@ out: | |||
210 | * @vma: target vma | 210 | * @vma: target vma |
211 | * @start: start address | 211 | * @start: start address |
212 | * @end: end address | 212 | * @end: end address |
213 | * @nonblocking: | ||
213 | * | 214 | * |
214 | * This takes care of making the pages present too. | 215 | * This takes care of making the pages present too. |
215 | * | 216 | * |
216 | * return 0 on success, negative error code on error. | 217 | * return 0 on success, negative error code on error. |
217 | * | 218 | * |
218 | * vma->vm_mm->mmap_sem must be held for at least read. | 219 | * vma->vm_mm->mmap_sem must be held. |
220 | * | ||
221 | * If @nonblocking is NULL, it may be held for read or write and will | ||
222 | * be unperturbed. | ||
223 | * | ||
224 | * If @nonblocking is non-NULL, it must held for read only and may be | ||
225 | * released. If it's released, *@nonblocking will be set to 0. | ||
219 | */ | 226 | */ |
220 | long __mlock_vma_pages_range(struct vm_area_struct *vma, | 227 | long __mlock_vma_pages_range(struct vm_area_struct *vma, |
221 | unsigned long start, unsigned long end, int *nonblocking) | 228 | unsigned long start, unsigned long end, int *nonblocking) |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/mempolicy.h> | 31 | #include <linux/mempolicy.h> |
32 | #include <linux/rmap.h> | 32 | #include <linux/rmap.h> |
33 | #include <linux/mmu_notifier.h> | 33 | #include <linux/mmu_notifier.h> |
34 | #include <linux/mmdebug.h> | ||
34 | #include <linux/perf_event.h> | 35 | #include <linux/perf_event.h> |
35 | #include <linux/audit.h> | 36 | #include <linux/audit.h> |
36 | #include <linux/khugepaged.h> | 37 | #include <linux/khugepaged.h> |
@@ -134,6 +135,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
134 | { | 135 | { |
135 | unsigned long free, allowed, reserve; | 136 | unsigned long free, allowed, reserve; |
136 | 137 | ||
138 | VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < | ||
139 | -(s64)vm_committed_as_batch * num_online_cpus(), | ||
140 | "memory commitment underflow"); | ||
141 | |||
137 | vm_acct_memory(pages); | 142 | vm_acct_memory(pages); |
138 | 143 | ||
139 | /* | 144 | /* |
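The new VM_WARN_ONCE in __vm_enough_memory() treats a vm_committed_as reading below -(batch * num_online_cpus()) as more negative than per-cpu batching error can explain, and therefore a sign of an accounting bug. A userspace analogue of such a warn-once lower-bound check; committed, batch and ncpus are invented stand-ins:

#include <stdio.h>

static long committed;                  /* stand-in for vm_committed_as */
static const long batch = 32;
static const long ncpus = 4;

static void check_commit_sanity(void)
{
        static int warned;

        if (!warned && committed < -batch * ncpus) {
                fprintf(stderr, "memory commitment underflow (%ld)\n",
                        committed);
                warned = 1;             /* only report the first occurrence */
        }
}

int main(void)
{
        committed = -100;               /* within per-cpu slop: fine */
        check_commit_sanity();
        committed = -1000;              /* beyond -batch*ncpus: warn once */
        check_commit_sanity();
        check_commit_sanity();
        return 0;
}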
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 41cefdf0aadd..950813b1eb36 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -23,6 +23,25 @@ | |||
23 | static struct srcu_struct srcu; | 23 | static struct srcu_struct srcu; |
24 | 24 | ||
25 | /* | 25 | /* |
26 | * This function allows mmu_notifier::release callback to delay a call to | ||
27 | * a function that will free appropriate resources. The function must be | ||
28 | * quick and must not block. | ||
29 | */ | ||
30 | void mmu_notifier_call_srcu(struct rcu_head *rcu, | ||
31 | void (*func)(struct rcu_head *rcu)) | ||
32 | { | ||
33 | call_srcu(&srcu, rcu, func); | ||
34 | } | ||
35 | EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu); | ||
36 | |||
37 | void mmu_notifier_synchronize(void) | ||
38 | { | ||
39 | /* Wait for any running method to finish. */ | ||
40 | srcu_barrier(&srcu); | ||
41 | } | ||
42 | EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); | ||
43 | |||
44 | /* | ||
26 | * This function can't run concurrently against mmu_notifier_register | 45 | * This function can't run concurrently against mmu_notifier_register |
27 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap | 46 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap |
28 | * runs with mm_users == 0. Other tasks may still invoke mmu notifiers | 47 | * runs with mm_users == 0. Other tasks may still invoke mmu notifiers |
@@ -53,7 +72,6 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
53 | */ | 72 | */ |
54 | if (mn->ops->release) | 73 | if (mn->ops->release) |
55 | mn->ops->release(mn, mm); | 74 | mn->ops->release(mn, mm); |
56 | srcu_read_unlock(&srcu, id); | ||
57 | 75 | ||
58 | spin_lock(&mm->mmu_notifier_mm->lock); | 76 | spin_lock(&mm->mmu_notifier_mm->lock); |
59 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | 77 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { |
@@ -69,6 +87,7 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
69 | hlist_del_init_rcu(&mn->hlist); | 87 | hlist_del_init_rcu(&mn->hlist); |
70 | } | 88 | } |
71 | spin_unlock(&mm->mmu_notifier_mm->lock); | 89 | spin_unlock(&mm->mmu_notifier_mm->lock); |
90 | srcu_read_unlock(&srcu, id); | ||
72 | 91 | ||
73 | /* | 92 | /* |
74 | * synchronize_srcu here prevents mmu_notifier_release from returning to | 93 | * synchronize_srcu here prevents mmu_notifier_release from returning to |
@@ -325,6 +344,25 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
325 | } | 344 | } |
326 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); | 345 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); |
327 | 346 | ||
347 | /* | ||
348 | * Same as mmu_notifier_unregister but no callback and no srcu synchronization. | ||
349 | */ | ||
350 | void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, | ||
351 | struct mm_struct *mm) | ||
352 | { | ||
353 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
354 | /* | ||
355 | * Can not use list_del_rcu() since __mmu_notifier_release | ||
356 | * can delete it before we hold the lock. | ||
357 | */ | ||
358 | hlist_del_init_rcu(&mn->hlist); | ||
359 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
360 | |||
361 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | ||
362 | mmdrop(mm); | ||
363 | } | ||
364 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); | ||
365 | |||
328 | static int __init mmu_notifier_init(void) | 366 | static int __init mmu_notifier_init(void) |
329 | { | 367 | { |
330 | return init_srcu_struct(&srcu); | 368 | return init_srcu_struct(&srcu); |
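The mmu_notifier additions give users a way to defer frees until SRCU readers finish (mmu_notifier_call_srcu), to wait for in-flight callbacks (mmu_notifier_synchronize), and to unregister without invoking ->release. The fragment below is only a very rough model of the last of these: unlink under the registry lock, skip the callback, drop the registration's reference. The SRCU-deferred-free part is not modelled and every toy_* name is made up.

#include <pthread.h>
#include <stdio.h>

struct toy_notifier {
        struct toy_notifier *next;
        void (*release)(struct toy_notifier *);
};

struct toy_registry {
        pthread_mutex_t lock;
        struct toy_notifier *head;
        int refcount;
};

static void toy_unregister_no_release(struct toy_registry *reg,
                                      struct toy_notifier *n)
{
        struct toy_notifier **p;

        pthread_mutex_lock(&reg->lock);
        for (p = &reg->head; *p; p = &(*p)->next)
                if (*p == n) {
                        *p = n->next;   /* unlink; ->release is not called */
                        break;
                }
        pthread_mutex_unlock(&reg->lock);
        reg->refcount--;                /* drop the registration's reference */
}

int main(void)
{
        struct toy_registry reg = { PTHREAD_MUTEX_INITIALIZER, NULL, 1 };
        struct toy_notifier n = { NULL, NULL };

        reg.head = &n;
        reg.refcount++;                 /* registration takes a reference */
        toy_unregister_no_release(&reg, &n);
        printf("head=%p refcount=%d\n", (void *)reg.head, reg.refcount);
        return 0;
}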
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3291e82d4352..1e11df8fa7ec 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -258,8 +258,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | |||
258 | unsigned long totalpages, const nodemask_t *nodemask, | 258 | unsigned long totalpages, const nodemask_t *nodemask, |
259 | bool force_kill) | 259 | bool force_kill) |
260 | { | 260 | { |
261 | if (task->exit_state) | ||
262 | return OOM_SCAN_CONTINUE; | ||
263 | if (oom_unkillable_task(task, NULL, nodemask)) | 261 | if (oom_unkillable_task(task, NULL, nodemask)) |
264 | return OOM_SCAN_CONTINUE; | 262 | return OOM_SCAN_CONTINUE; |
265 | 263 | ||
@@ -559,28 +557,25 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); | |||
559 | * if a parallel OOM killing is already taking place that includes a zone in | 557 | * if a parallel OOM killing is already taking place that includes a zone in |
560 | * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. | 558 | * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. |
561 | */ | 559 | */ |
562 | int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) | 560 | bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) |
563 | { | 561 | { |
564 | struct zoneref *z; | 562 | struct zoneref *z; |
565 | struct zone *zone; | 563 | struct zone *zone; |
566 | int ret = 1; | 564 | bool ret = true; |
567 | 565 | ||
568 | spin_lock(&zone_scan_lock); | 566 | spin_lock(&zone_scan_lock); |
569 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { | 567 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) |
570 | if (zone_is_oom_locked(zone)) { | 568 | if (zone_is_oom_locked(zone)) { |
571 | ret = 0; | 569 | ret = false; |
572 | goto out; | 570 | goto out; |
573 | } | 571 | } |
574 | } | ||
575 | 572 | ||
576 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { | 573 | /* |
577 | /* | 574 | * Lock each zone in the zonelist under zone_scan_lock so a parallel |
578 | * Lock each zone in the zonelist under zone_scan_lock so a | 575 | * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. |
579 | * parallel invocation of try_set_zonelist_oom() doesn't succeed | 576 | */ |
580 | * when it shouldn't. | 577 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) |
581 | */ | ||
582 | zone_set_flag(zone, ZONE_OOM_LOCKED); | 578 | zone_set_flag(zone, ZONE_OOM_LOCKED); |
583 | } | ||
584 | 579 | ||
585 | out: | 580 | out: |
586 | spin_unlock(&zone_scan_lock); | 581 | spin_unlock(&zone_scan_lock); |
@@ -592,15 +587,14 @@ out: | |||
592 | * allocation attempts with zonelists containing them may now recall the OOM | 587 | * allocation attempts with zonelists containing them may now recall the OOM |
593 | * killer, if necessary. | 588 | * killer, if necessary. |
594 | */ | 589 | */ |
595 | void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) | 590 | void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) |
596 | { | 591 | { |
597 | struct zoneref *z; | 592 | struct zoneref *z; |
598 | struct zone *zone; | 593 | struct zone *zone; |
599 | 594 | ||
600 | spin_lock(&zone_scan_lock); | 595 | spin_lock(&zone_scan_lock); |
601 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { | 596 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) |
602 | zone_clear_flag(zone, ZONE_OOM_LOCKED); | 597 | zone_clear_flag(zone, ZONE_OOM_LOCKED); |
603 | } | ||
604 | spin_unlock(&zone_scan_lock); | 598 | spin_unlock(&zone_scan_lock); |
605 | } | 599 | } |
606 | 600 | ||
@@ -694,9 +688,9 @@ void pagefault_out_of_memory(void) | |||
694 | if (mem_cgroup_oom_synchronize(true)) | 688 | if (mem_cgroup_oom_synchronize(true)) |
695 | return; | 689 | return; |
696 | 690 | ||
697 | zonelist = node_zonelist(first_online_node, GFP_KERNEL); | 691 | zonelist = node_zonelist(first_memory_node, GFP_KERNEL); |
698 | if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { | 692 | if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { |
699 | out_of_memory(NULL, 0, 0, NULL, false); | 693 | out_of_memory(NULL, 0, 0, NULL, false); |
700 | clear_zonelist_oom(zonelist, GFP_KERNEL); | 694 | oom_zonelist_unlock(zonelist, GFP_KERNEL); |
701 | } | 695 | } |
702 | } | 696 | } |
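oom_zonelist_trylock() is an all-or-nothing trylock over a set of zones, serialized by zone_scan_lock: if any zone is already flagged, fail without marking anything; otherwise flag them all. A compact userspace model of that pattern (toy names, a pthread mutex in place of the spinlock):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NZONES 3

static pthread_mutex_t scan_lock = PTHREAD_MUTEX_INITIALIZER;
static bool zone_oom_locked[NZONES];

static bool toy_zonelist_trylock(const int *zones, int n)
{
        bool ret = true;
        int i;

        pthread_mutex_lock(&scan_lock);
        for (i = 0; i < n; i++)
                if (zone_oom_locked[zones[i]]) {
                        ret = false;    /* someone else is OOM-killing here */
                        goto out;
                }
        for (i = 0; i < n; i++)
                zone_oom_locked[zones[i]] = true;
out:
        pthread_mutex_unlock(&scan_lock);
        return ret;
}

static void toy_zonelist_unlock(const int *zones, int n)
{
        int i;

        pthread_mutex_lock(&scan_lock);
        for (i = 0; i < n; i++)
                zone_oom_locked[zones[i]] = false;
        pthread_mutex_unlock(&scan_lock);
}

int main(void)
{
        int list[] = { 0, 2 };

        printf("first try:    %d\n", toy_zonelist_trylock(list, 2));
        printf("second try:   %d\n", toy_zonelist_trylock(list, 2));
        toy_zonelist_unlock(list, 2);
        printf("after unlock: %d\n", toy_zonelist_trylock(list, 2));
        return 0;
}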
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e0c943014eb7..91d73ef1744d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -261,14 +261,11 @@ static unsigned long global_dirtyable_memory(void) | |||
261 | */ | 261 | */ |
262 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | 262 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) |
263 | { | 263 | { |
264 | const unsigned long available_memory = global_dirtyable_memory(); | ||
264 | unsigned long background; | 265 | unsigned long background; |
265 | unsigned long dirty; | 266 | unsigned long dirty; |
266 | unsigned long uninitialized_var(available_memory); | ||
267 | struct task_struct *tsk; | 267 | struct task_struct *tsk; |
268 | 268 | ||
269 | if (!vm_dirty_bytes || !dirty_background_bytes) | ||
270 | available_memory = global_dirtyable_memory(); | ||
271 | |||
272 | if (vm_dirty_bytes) | 269 | if (vm_dirty_bytes) |
273 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); | 270 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); |
274 | else | 271 | else |
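The page-writeback hunk removes the uninitialized_var() dance by computing available_memory unconditionally; the occasionally redundant global_dirtyable_memory() call is a small price for code the compiler can see is always initialized. A simplified userspace model of the resulting flow (the constants and the stubbed dirtyable-memory helper are assumptions for the sketch, not kernel API):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    static unsigned long vm_dirty_bytes;       /* 0 => use vm_dirty_ratio */
    static unsigned long vm_dirty_ratio = 20;  /* percent */

    static unsigned long global_dirtyable_memory(void)
    {
        return 1024 * 1024;                    /* stand-in page count */
    }

    static unsigned long dirty_limit_pages(void)
    {
        /* computed up front; no conditional initialization needed */
        const unsigned long available_memory = global_dirtyable_memory();

        if (vm_dirty_bytes)
            return DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
        return available_memory * vm_dirty_ratio / 100;
    }

    int main(void)
    {
        printf("dirty limit: %lu pages\n", dirty_limit_pages());
        return 0;
    }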
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef44ad736ca1..18cee0d4c8a2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -680,9 +680,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
680 | int migratetype = 0; | 680 | int migratetype = 0; |
681 | int batch_free = 0; | 681 | int batch_free = 0; |
682 | int to_free = count; | 682 | int to_free = count; |
683 | unsigned long nr_scanned; | ||
683 | 684 | ||
684 | spin_lock(&zone->lock); | 685 | spin_lock(&zone->lock); |
685 | zone->pages_scanned = 0; | 686 | nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); |
687 | if (nr_scanned) | ||
688 | __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); | ||
686 | 689 | ||
687 | while (to_free) { | 690 | while (to_free) { |
688 | struct page *page; | 691 | struct page *page; |
@@ -731,8 +734,11 @@ static void free_one_page(struct zone *zone, | |||
731 | unsigned int order, | 734 | unsigned int order, |
732 | int migratetype) | 735 | int migratetype) |
733 | { | 736 | { |
737 | unsigned long nr_scanned; | ||
734 | spin_lock(&zone->lock); | 738 | spin_lock(&zone->lock); |
735 | zone->pages_scanned = 0; | 739 | nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); |
740 | if (nr_scanned) | ||
741 | __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); | ||
736 | 742 | ||
737 | __free_one_page(page, pfn, zone, order, migratetype); | 743 | __free_one_page(page, pfn, zone, order, migratetype); |
738 | if (unlikely(!is_migrate_isolate(migratetype))) | 744 | if (unlikely(!is_migrate_isolate(migratetype))) |
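Both bulk-free paths now clear the scanned-page count through the vmstat machinery instead of zeroing the private zone->pages_scanned field: read the current NR_PAGES_SCANNED value and subtract it, so the count lives in the ordinary zone statistics. A standalone model of that read-and-subtract reset (plain longs stand in for the per-zone vmstat counters):

    /* toy per-zone counter, updated only by signed deltas as vmstat is */
    struct zone_stats {
        long pages_scanned;
    };

    static long zone_page_state(const struct zone_stats *z)
    {
        return z->pages_scanned;
    }

    static void mod_zone_page_state(struct zone_stats *z, long delta)
    {
        z->pages_scanned += delta;
    }

    /* reset by subtracting the observed value rather than assigning zero */
    static void reset_pages_scanned(struct zone_stats *z)
    {
        long nr_scanned = zone_page_state(z);

        if (nr_scanned)
            mod_zone_page_state(z, -nr_scanned);
    }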
@@ -1257,15 +1263,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
1257 | void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | 1263 | void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) |
1258 | { | 1264 | { |
1259 | unsigned long flags; | 1265 | unsigned long flags; |
1260 | int to_drain; | 1266 | int to_drain, batch; |
1261 | unsigned long batch; | ||
1262 | 1267 | ||
1263 | local_irq_save(flags); | 1268 | local_irq_save(flags); |
1264 | batch = ACCESS_ONCE(pcp->batch); | 1269 | batch = ACCESS_ONCE(pcp->batch); |
1265 | if (pcp->count >= batch) | 1270 | to_drain = min(pcp->count, batch); |
1266 | to_drain = batch; | ||
1267 | else | ||
1268 | to_drain = pcp->count; | ||
1269 | if (to_drain > 0) { | 1271 | if (to_drain > 0) { |
1270 | free_pcppages_bulk(zone, to_drain, pcp); | 1272 | free_pcppages_bulk(zone, to_drain, pcp); |
1271 | pcp->count -= to_drain; | 1273 | pcp->count -= to_drain; |
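The drain_zone_pages() change is a pure simplification: the open-coded branch that picked the smaller of pcp->count and the batch size collapses into a min(). The same transformation in plain C (the kernel's min() additionally type-checks and avoids double evaluation, which this toy macro does not):

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    /* before: branchy */
    static int to_drain_old(int count, int batch)
    {
        int to_drain;

        if (count >= batch)
            to_drain = batch;
        else
            to_drain = count;
        return to_drain;
    }

    /* after: one expression, same result */
    static int to_drain_new(int count, int batch)
    {
        return MIN(count, batch);
    }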
@@ -1610,6 +1612,9 @@ again: | |||
1610 | } | 1612 | } |
1611 | 1613 | ||
1612 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); | 1614 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); |
1615 | if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 && | ||
1616 | !zone_is_fair_depleted(zone)) | ||
1617 | zone_set_flag(zone, ZONE_FAIR_DEPLETED); | ||
1613 | 1618 | ||
1614 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1619 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
1615 | zone_statistics(preferred_zone, zone, gfp_flags); | 1620 | zone_statistics(preferred_zone, zone, gfp_flags); |
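Latching ZONE_FAIR_DEPLETED as soon as NR_ALLOC_BATCH reaches zero lets the fair-allocation pass below test a zone flag instead of reading the NR_ALLOC_BATCH counter for every candidate zone. A rough sketch of caching "batch exhausted" in a flag (struct and helper names are invented):

    #include <stdbool.h>

    struct toy_zone {
        long alloc_batch;       /* models NR_ALLOC_BATCH */
        bool fair_depleted;     /* models ZONE_FAIR_DEPLETED */
    };

    /* allocation fast path: charge the batch, latch the flag when it empties */
    static void charge_alloc_batch(struct toy_zone *z, long pages)
    {
        z->alloc_batch -= pages;
        if (z->alloc_batch <= 0 && !z->fair_depleted)
            z->fair_depleted = true;
    }

    /* fair pass: a flag test instead of re-deriving the counter each time */
    static bool skip_for_fairness(const struct toy_zone *z)
    {
        return z->fair_depleted;
    }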
@@ -1712,7 +1717,6 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, | |||
1712 | { | 1717 | { |
1713 | /* free_pages may go negative - that's OK */ | 1718 | long min = mark; |
1714 | long min = mark; | 1719 | long min = mark; |
1715 | long lowmem_reserve = z->lowmem_reserve[classzone_idx]; | ||
1716 | int o; | 1720 | int o; |
1717 | long free_cma = 0; | 1721 | long free_cma = 0; |
1718 | 1722 | ||
@@ -1727,7 +1731,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, | |||
1727 | free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); | 1731 | free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); |
1728 | #endif | 1732 | #endif |
1729 | 1733 | ||
1730 | if (free_pages - free_cma <= min + lowmem_reserve) | 1734 | if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) |
1731 | return false; | 1735 | return false; |
1732 | for (o = 0; o < order; o++) { | 1736 | for (o = 0; o < order; o++) { |
1733 | /* At the next order, this order's pages become unavailable */ | 1737 | /* At the next order, this order's pages become unavailable */ |
@@ -1922,6 +1926,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | |||
1922 | 1926 | ||
1923 | #endif /* CONFIG_NUMA */ | 1927 | #endif /* CONFIG_NUMA */ |
1924 | 1928 | ||
1929 | static void reset_alloc_batches(struct zone *preferred_zone) | ||
1930 | { | ||
1931 | struct zone *zone = preferred_zone->zone_pgdat->node_zones; | ||
1932 | |||
1933 | do { | ||
1934 | mod_zone_page_state(zone, NR_ALLOC_BATCH, | ||
1935 | high_wmark_pages(zone) - low_wmark_pages(zone) - | ||
1936 | atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); | ||
1937 | zone_clear_flag(zone, ZONE_FAIR_DEPLETED); | ||
1938 | } while (zone++ != preferred_zone); | ||
1939 | } | ||
1940 | |||
1925 | /* | 1941 | /* |
1926 | * get_page_from_freelist goes through the zonelist trying to allocate | 1942 | * get_page_from_freelist goes through the zonelist trying to allocate |
1927 | * a page. | 1943 | * a page. |
@@ -1939,8 +1955,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | |||
1939 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | 1955 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ |
1940 | bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && | 1956 | bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && |
1941 | (gfp_mask & __GFP_WRITE); | 1957 | (gfp_mask & __GFP_WRITE); |
1958 | int nr_fair_skipped = 0; | ||
1959 | bool zonelist_rescan; | ||
1942 | 1960 | ||
1943 | zonelist_scan: | 1961 | zonelist_scan: |
1962 | zonelist_rescan = false; | ||
1963 | |||
1944 | /* | 1964 | /* |
1945 | * Scan zonelist, looking for a zone with enough free. | 1965 | * Scan zonelist, looking for a zone with enough free. |
1946 | * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. | 1966 | * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. |
@@ -1964,9 +1984,11 @@ zonelist_scan: | |||
1964 | */ | 1984 | */ |
1965 | if (alloc_flags & ALLOC_FAIR) { | 1985 | if (alloc_flags & ALLOC_FAIR) { |
1966 | if (!zone_local(preferred_zone, zone)) | 1986 | if (!zone_local(preferred_zone, zone)) |
1987 | break; | ||
1988 | if (zone_is_fair_depleted(zone)) { | ||
1989 | nr_fair_skipped++; | ||
1967 | continue; | 1990 | continue; |
1968 | if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) | 1991 | } |
1969 | continue; | ||
1970 | } | 1992 | } |
1971 | /* | 1993 | /* |
1972 | * When allocating a page cache page for writing, we | 1994 | * When allocating a page cache page for writing, we |
@@ -2072,13 +2094,7 @@ this_zone_full: | |||
2072 | zlc_mark_zone_full(zonelist, z); | 2094 | zlc_mark_zone_full(zonelist, z); |
2073 | } | 2095 | } |
2074 | 2096 | ||
2075 | if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { | 2097 | if (page) { |
2076 | /* Disable zlc cache for second zonelist scan */ | ||
2077 | zlc_active = 0; | ||
2078 | goto zonelist_scan; | ||
2079 | } | ||
2080 | |||
2081 | if (page) | ||
2082 | /* | 2098 | /* |
2083 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was | 2099 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was |
2084 | * necessary to allocate the page. The expectation is | 2100 | * necessary to allocate the page. The expectation is |
@@ -2087,8 +2103,37 @@ this_zone_full: | |||
2087 | * for !PFMEMALLOC purposes. | 2103 | * for !PFMEMALLOC purposes. |
2088 | */ | 2104 | */ |
2089 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); | 2105 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); |
2106 | return page; | ||
2107 | } | ||
2090 | 2108 | ||
2091 | return page; | 2109 | /* |
2110 | * The first pass makes sure allocations are spread fairly within the | ||
2111 | * local node. However, the local node might have free pages left | ||
2112 | * after the fairness batches are exhausted, and remote zones haven't | ||
2113 | * even been considered yet. Try once more without fairness, and | ||
2114 | * include remote zones now, before entering the slowpath and waking | ||
2115 | * kswapd: prefer spilling to a remote zone over swapping locally. | ||
2116 | */ | ||
2117 | if (alloc_flags & ALLOC_FAIR) { | ||
2118 | alloc_flags &= ~ALLOC_FAIR; | ||
2119 | if (nr_fair_skipped) { | ||
2120 | zonelist_rescan = true; | ||
2121 | reset_alloc_batches(preferred_zone); | ||
2122 | } | ||
2123 | if (nr_online_nodes > 1) | ||
2124 | zonelist_rescan = true; | ||
2125 | } | ||
2126 | |||
2127 | if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { | ||
2128 | /* Disable zlc cache for second zonelist scan */ | ||
2129 | zlc_active = 0; | ||
2130 | zonelist_rescan = true; | ||
2131 | } | ||
2132 | |||
2133 | if (zonelist_rescan) | ||
2134 | goto zonelist_scan; | ||
2135 | |||
2136 | return NULL; | ||
2092 | } | 2137 | } |
2093 | 2138 | ||
2094 | /* | 2139 | /* |
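This restructured tail of get_page_from_freelist() is the core of the fairness rework: instead of returning to the caller for a retry, the function loops internally. The fair pass stops at the first non-local zone and counts the local zones it skipped as depleted; only if some were skipped does it reset the batches, and it rescans without ALLOC_FAIR (also when other nodes exist, or when the NUMA zlc cache had been active). A condensed standalone model of that control flow, with an invented toy zone type and helpers:

    #include <stdbool.h>
    #include <stddef.h>

    struct toy_zone {
        bool local;             /* belongs to the preferred node */
        bool fair_depleted;     /* fairness batch used up */
        long free_pages;
    };

    static void *try_alloc(struct toy_zone *z)
    {
        if (z->free_pages > 0) {
            z->free_pages--;
            return z;
        }
        return NULL;
    }

    static void reset_alloc_batches(struct toy_zone *zones, size_t n)
    {
        for (size_t i = 0; i < n; i++)
            if (zones[i].local)
                zones[i].fair_depleted = false;
    }

    static void *get_page(struct toy_zone *zones, size_t n, bool alloc_fair)
    {
        int nr_fair_skipped;
        bool rescan;

    scan:
        rescan = false;
        nr_fair_skipped = 0;

        for (size_t i = 0; i < n; i++) {
            struct toy_zone *z = &zones[i];

            if (alloc_fair) {
                if (!z->local)
                    break;              /* fair pass is local-only */
                if (z->fair_depleted) {
                    nr_fair_skipped++;
                    continue;
                }
            }
            if (try_alloc(z))
                return z;
        }

        if (alloc_fair) {
            alloc_fair = false;         /* drop fairness for pass two */
            if (nr_fair_skipped) {
                reset_alloc_batches(zones, n);
                rescan = true;
            }
            if (n > 1)                  /* stand-in for nr_online_nodes > 1 */
                rescan = true;
        }
        if (rescan)
            goto scan;
        return NULL;
    }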
@@ -2201,8 +2246,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
2201 | { | 2246 | { |
2202 | struct page *page; | 2247 | struct page *page; |
2203 | 2248 | ||
2204 | /* Acquire the OOM killer lock for the zones in zonelist */ | 2249 | /* Acquire the per-zone oom lock for each zone */ |
2205 | if (!try_set_zonelist_oom(zonelist, gfp_mask)) { | 2250 | if (!oom_zonelist_trylock(zonelist, gfp_mask)) { |
2206 | schedule_timeout_uninterruptible(1); | 2251 | schedule_timeout_uninterruptible(1); |
2207 | return NULL; | 2252 | return NULL; |
2208 | } | 2253 | } |
@@ -2240,7 +2285,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
2240 | out_of_memory(zonelist, gfp_mask, order, nodemask, false); | 2285 | out_of_memory(zonelist, gfp_mask, order, nodemask, false); |
2241 | 2286 | ||
2242 | out: | 2287 | out: |
2243 | clear_zonelist_oom(zonelist, gfp_mask); | 2288 | oom_zonelist_unlock(zonelist, gfp_mask); |
2244 | return page; | 2289 | return page; |
2245 | } | 2290 | } |
2246 | 2291 | ||
@@ -2409,28 +2454,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | |||
2409 | return page; | 2454 | return page; |
2410 | } | 2455 | } |
2411 | 2456 | ||
2412 | static void reset_alloc_batches(struct zonelist *zonelist, | ||
2413 | enum zone_type high_zoneidx, | ||
2414 | struct zone *preferred_zone) | ||
2415 | { | ||
2416 | struct zoneref *z; | ||
2417 | struct zone *zone; | ||
2418 | |||
2419 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | ||
2420 | /* | ||
2421 | * Only reset the batches of zones that were actually | ||
2422 | * considered in the fairness pass, we don't want to | ||
2423 | * trash fairness information for zones that are not | ||
2424 | * actually part of this zonelist's round-robin cycle. | ||
2425 | */ | ||
2426 | if (!zone_local(preferred_zone, zone)) | ||
2427 | continue; | ||
2428 | mod_zone_page_state(zone, NR_ALLOC_BATCH, | ||
2429 | high_wmark_pages(zone) - low_wmark_pages(zone) - | ||
2430 | atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); | ||
2431 | } | ||
2432 | } | ||
2433 | |||
2434 | static void wake_all_kswapds(unsigned int order, | 2457 | static void wake_all_kswapds(unsigned int order, |
2435 | struct zonelist *zonelist, | 2458 | struct zonelist *zonelist, |
2436 | enum zone_type high_zoneidx, | 2459 | enum zone_type high_zoneidx, |
@@ -2616,14 +2639,6 @@ rebalance: | |||
2616 | goto got_pg; | 2639 | goto got_pg; |
2617 | 2640 | ||
2618 | /* | 2641 | /* |
2619 | * It can become very expensive to allocate transparent hugepages at | ||
2620 | * fault, so use asynchronous memory compaction for THP unless it is | ||
2621 | * khugepaged trying to collapse. | ||
2622 | */ | ||
2623 | if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD)) | ||
2624 | migration_mode = MIGRATE_SYNC_LIGHT; | ||
2625 | |||
2626 | /* | ||
2627 | * If compaction is deferred for high-order allocations, it is because | 2642 | * If compaction is deferred for high-order allocations, it is because |
2628 | * sync compaction recently failed. If this is the case and the caller | 2643 | * sync compaction recently failed. If this is the case and the caller |
2629 | * requested a movable allocation that does not heavily disrupt the | 2644 | * requested a movable allocation that does not heavily disrupt the |
@@ -2633,6 +2648,15 @@ rebalance: | |||
2633 | (gfp_mask & __GFP_NO_KSWAPD)) | 2648 | (gfp_mask & __GFP_NO_KSWAPD)) |
2634 | goto nopage; | 2649 | goto nopage; |
2635 | 2650 | ||
2651 | /* | ||
2652 | * It can become very expensive to allocate transparent hugepages at | ||
2653 | * fault, so use asynchronous memory compaction for THP unless it is | ||
2654 | * khugepaged trying to collapse. | ||
2655 | */ | ||
2656 | if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE || | ||
2657 | (current->flags & PF_KTHREAD)) | ||
2658 | migration_mode = MIGRATE_SYNC_LIGHT; | ||
2659 | |||
2636 | /* Try direct reclaim and then allocating */ | 2660 | /* Try direct reclaim and then allocating */ |
2637 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2661 | page = __alloc_pages_direct_reclaim(gfp_mask, order, |
2638 | zonelist, high_zoneidx, | 2662 | zonelist, high_zoneidx, |
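The THP heuristic both moves below the deferred-compaction bail-out and changes its test: rather than keying off __GFP_NO_KSWAPD alone, it now asks whether the full GFP_TRANSHUGE mask is present, presumably because __GFP_NO_KSWAPD by itself does not uniquely identify a THP fault. Testing a composite mask needs the compare-after-AND idiom, as this small self-contained example shows (the flag values are made up):

    #include <assert.h>

    #define GFP_X         0x01u
    #define GFP_Y         0x02u
    #define GFP_Z         0x04u
    #define GFP_COMPOSITE (GFP_X | GFP_Y | GFP_Z)   /* models GFP_TRANSHUGE */

    static int is_composite(unsigned int gfp)
    {
        /* every bit of the composite mask must be present */
        return (gfp & GFP_COMPOSITE) == GFP_COMPOSITE;
    }

    int main(void)
    {
        assert(is_composite(GFP_COMPOSITE));
        assert(is_composite(GFP_COMPOSITE | 0x80u));  /* extra bits are fine */
        assert(!is_composite(GFP_X));                 /* one shared bit is not */
        /* a bare (gfp & GFP_COMPOSITE) test would have accepted GFP_X */
        return 0;
    }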
@@ -2766,29 +2790,12 @@ retry_cpuset: | |||
2766 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | 2790 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) |
2767 | alloc_flags |= ALLOC_CMA; | 2791 | alloc_flags |= ALLOC_CMA; |
2768 | #endif | 2792 | #endif |
2769 | retry: | ||
2770 | /* First allocation attempt */ | 2793 | /* First allocation attempt */ |
2771 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2794 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
2772 | zonelist, high_zoneidx, alloc_flags, | 2795 | zonelist, high_zoneidx, alloc_flags, |
2773 | preferred_zone, classzone_idx, migratetype); | 2796 | preferred_zone, classzone_idx, migratetype); |
2774 | if (unlikely(!page)) { | 2797 | if (unlikely(!page)) { |
2775 | /* | 2798 | /* |
2776 | * The first pass makes sure allocations are spread | ||
2777 | * fairly within the local node. However, the local | ||
2778 | * node might have free pages left after the fairness | ||
2779 | * batches are exhausted, and remote zones haven't | ||
2780 | * even been considered yet. Try once more without | ||
2781 | * fairness, and include remote zones now, before | ||
2782 | * entering the slowpath and waking kswapd: prefer | ||
2783 | * spilling to a remote zone over swapping locally. | ||
2784 | */ | ||
2785 | if (alloc_flags & ALLOC_FAIR) { | ||
2786 | reset_alloc_batches(zonelist, high_zoneidx, | ||
2787 | preferred_zone); | ||
2788 | alloc_flags &= ~ALLOC_FAIR; | ||
2789 | goto retry; | ||
2790 | } | ||
2791 | /* | ||
2792 | * Runtime PM, block IO and its error handling path | 2799 | * Runtime PM, block IO and its error handling path |
2793 | * can deadlock because I/O on the device might not | 2800 | * can deadlock because I/O on the device might not |
2794 | * complete. | 2801 | * complete. |
@@ -2962,7 +2969,7 @@ EXPORT_SYMBOL(alloc_pages_exact); | |||
2962 | * Note this is not alloc_pages_exact_node() which allocates on a specific node, | 2969 | * Note this is not alloc_pages_exact_node() which allocates on a specific node, |
2963 | * but is not exact. | 2970 | * but is not exact. |
2964 | */ | 2971 | */ |
2965 | void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) | 2972 | void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) |
2966 | { | 2973 | { |
2967 | unsigned order = get_order(size); | 2974 | unsigned order = get_order(size); |
2968 | struct page *p = alloc_pages_node(nid, gfp_mask, order); | 2975 | struct page *p = alloc_pages_node(nid, gfp_mask, order); |
@@ -2970,7 +2977,6 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) | |||
2970 | return NULL; | 2977 | return NULL; |
2971 | return make_alloc_exact((unsigned long)page_address(p), order, size); | 2978 | return make_alloc_exact((unsigned long)page_address(p), order, size); |
2972 | } | 2979 | } |
2973 | EXPORT_SYMBOL(alloc_pages_exact_nid); | ||
2974 | 2980 | ||
2975 | /** | 2981 | /** |
2976 | * free_pages_exact - release memory allocated via alloc_pages_exact() | 2982 | * free_pages_exact - release memory allocated via alloc_pages_exact() |
@@ -3052,7 +3058,7 @@ static inline void show_node(struct zone *zone) | |||
3052 | void si_meminfo(struct sysinfo *val) | 3058 | void si_meminfo(struct sysinfo *val) |
3053 | { | 3059 | { |
3054 | val->totalram = totalram_pages; | 3060 | val->totalram = totalram_pages; |
3055 | val->sharedram = 0; | 3061 | val->sharedram = global_page_state(NR_SHMEM); |
3056 | val->freeram = global_page_state(NR_FREE_PAGES); | 3062 | val->freeram = global_page_state(NR_FREE_PAGES); |
3057 | val->bufferram = nr_blockdev_pages(); | 3063 | val->bufferram = nr_blockdev_pages(); |
3058 | val->totalhigh = totalhigh_pages; | 3064 | val->totalhigh = totalhigh_pages; |
@@ -3072,6 +3078,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
3072 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) | 3078 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) |
3073 | managed_pages += pgdat->node_zones[zone_type].managed_pages; | 3079 | managed_pages += pgdat->node_zones[zone_type].managed_pages; |
3074 | val->totalram = managed_pages; | 3080 | val->totalram = managed_pages; |
3081 | val->sharedram = node_page_state(nid, NR_SHMEM); | ||
3075 | val->freeram = node_page_state(nid, NR_FREE_PAGES); | 3082 | val->freeram = node_page_state(nid, NR_FREE_PAGES); |
3076 | #ifdef CONFIG_HIGHMEM | 3083 | #ifdef CONFIG_HIGHMEM |
3077 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; | 3084 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; |
@@ -3253,12 +3260,12 @@ void show_free_areas(unsigned int filter) | |||
3253 | K(zone_page_state(zone, NR_BOUNCE)), | 3260 | K(zone_page_state(zone, NR_BOUNCE)), |
3254 | K(zone_page_state(zone, NR_FREE_CMA_PAGES)), | 3261 | K(zone_page_state(zone, NR_FREE_CMA_PAGES)), |
3255 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | 3262 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), |
3256 | zone->pages_scanned, | 3263 | K(zone_page_state(zone, NR_PAGES_SCANNED)), |
3257 | (!zone_reclaimable(zone) ? "yes" : "no") | 3264 | (!zone_reclaimable(zone) ? "yes" : "no") |
3258 | ); | 3265 | ); |
3259 | printk("lowmem_reserve[]:"); | 3266 | printk("lowmem_reserve[]:"); |
3260 | for (i = 0; i < MAX_NR_ZONES; i++) | 3267 | for (i = 0; i < MAX_NR_ZONES; i++) |
3261 | printk(" %lu", zone->lowmem_reserve[i]); | 3268 | printk(" %ld", zone->lowmem_reserve[i]); |
3262 | printk("\n"); | 3269 | printk("\n"); |
3263 | } | 3270 | } |
3264 | 3271 | ||
@@ -5579,7 +5586,7 @@ static void calculate_totalreserve_pages(void) | |||
5579 | for_each_online_pgdat(pgdat) { | 5586 | for_each_online_pgdat(pgdat) { |
5580 | for (i = 0; i < MAX_NR_ZONES; i++) { | 5587 | for (i = 0; i < MAX_NR_ZONES; i++) { |
5581 | struct zone *zone = pgdat->node_zones + i; | 5588 | struct zone *zone = pgdat->node_zones + i; |
5582 | unsigned long max = 0; | 5589 | long max = 0; |
5583 | 5590 | ||
5584 | /* Find valid and maximum lowmem_reserve in the zone */ | 5591 | /* Find valid and maximum lowmem_reserve in the zone */ |
5585 | for (j = i; j < MAX_NR_ZONES; j++) { | 5592 | for (j = i; j < MAX_NR_ZONES; j++) { |
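The %ld format for lowmem_reserve[] and the switch of max from unsigned long to long in calculate_totalreserve_pages() are consistent with the reserve array now holding signed longs; keeping the old unsigned types next to a signed array would set up the classic mixed-sign comparison trap, whether or not negative reserves ever occur in practice. A small illustration of that trap:

    #include <stdio.h>

    int main(void)
    {
        long reserve = -1;              /* signed, as lowmem_reserve[] values now are */
        unsigned long old_max = 0;      /* the old type of 'max' */

        /* mixed compare: 'reserve' converts to ULONG_MAX and "wins" */
        if (reserve > old_max)
            puts("-1 > 0 ?!  (why 'max' follows the array and becomes long)");

        /* and %lu would misprint it, hence the switch to %ld */
        printf("%%lu: %lu   %%ld: %ld\n", (unsigned long)reserve, reserve);
        return 0;
    }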
diff --git a/mm/readahead.c b/mm/readahead.c index 0ca36a7770b1..17b9172ec37f 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -326,7 +326,6 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, | |||
326 | * - thrashing threshold in memory tight systems | 326 | * - thrashing threshold in memory tight systems |
327 | */ | 327 | */ |
328 | static pgoff_t count_history_pages(struct address_space *mapping, | 328 | static pgoff_t count_history_pages(struct address_space *mapping, |
329 | struct file_ra_state *ra, | ||
330 | pgoff_t offset, unsigned long max) | 329 | pgoff_t offset, unsigned long max) |
331 | { | 330 | { |
332 | pgoff_t head; | 331 | pgoff_t head; |
@@ -349,7 +348,7 @@ static int try_context_readahead(struct address_space *mapping, | |||
349 | { | 348 | { |
350 | pgoff_t size; | 349 | pgoff_t size; |
351 | 350 | ||
352 | size = count_history_pages(mapping, ra, offset, max); | 351 | size = count_history_pages(mapping, offset, max); |
353 | 352 | ||
354 | /* | 353 | /* |
355 | * not enough history pages: | 354 | * not enough history pages: |
diff --git a/mm/shmem.c b/mm/shmem.c index af68b15a8fc1..302d1cf7ad07 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -149,6 +149,19 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) | |||
149 | vm_unacct_memory(VM_ACCT(size)); | 149 | vm_unacct_memory(VM_ACCT(size)); |
150 | } | 150 | } |
151 | 151 | ||
152 | static inline int shmem_reacct_size(unsigned long flags, | ||
153 | loff_t oldsize, loff_t newsize) | ||
154 | { | ||
155 | if (!(flags & VM_NORESERVE)) { | ||
156 | if (VM_ACCT(newsize) > VM_ACCT(oldsize)) | ||
157 | return security_vm_enough_memory_mm(current->mm, | ||
158 | VM_ACCT(newsize) - VM_ACCT(oldsize)); | ||
159 | else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) | ||
160 | vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); | ||
161 | } | ||
162 | return 0; | ||
163 | } | ||
164 | |||
152 | /* | 165 | /* |
153 | * ... whereas tmpfs objects are accounted incrementally as | 166 | * ... whereas tmpfs objects are accounted incrementally as |
154 | * pages are allocated, in order to allow huge sparse files. | 167 | * pages are allocated, in order to allow huge sparse files. |
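shmem_reacct_size() makes resizing a !VM_NORESERVE object adjust the accounting by the delta only: growing reserves just the additional pages (and may fail), shrinking returns the difference, so the total charged always tracks VM_ACCT() of the current size. A standalone model of delta accounting against a simple quota (charge() and uncharge() are invented stand-ins for security_vm_enough_memory_mm() and vm_unacct_memory()):

    #include <stdio.h>

    static long quota_left = 1000;      /* stand-in for the global commit limit */

    static int charge(long pages)       /* models security_vm_enough_memory_mm() */
    {
        if (pages > quota_left)
            return -1;                  /* -ENOMEM in the kernel */
        quota_left -= pages;
        return 0;
    }

    static void uncharge(long pages)    /* models vm_unacct_memory() */
    {
        quota_left += pages;
    }

    /* charge or release only the difference between old and new size */
    static int reacct_size(long oldpages, long newpages)
    {
        if (newpages > oldpages)
            return charge(newpages - oldpages);
        if (newpages < oldpages)
            uncharge(oldpages - newpages);
        return 0;
    }

    int main(void)
    {
        reacct_size(0, 400);            /* create: charge 400 */
        reacct_size(400, 100);          /* shrink: release 300 */
        printf("quota left: %ld\n", quota_left);   /* 900 */
        return 0;
    }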
@@ -280,7 +293,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, | |||
280 | */ | 293 | */ |
281 | static int shmem_add_to_page_cache(struct page *page, | 294 | static int shmem_add_to_page_cache(struct page *page, |
282 | struct address_space *mapping, | 295 | struct address_space *mapping, |
283 | pgoff_t index, gfp_t gfp, void *expected) | 296 | pgoff_t index, void *expected) |
284 | { | 297 | { |
285 | int error; | 298 | int error; |
286 | 299 | ||
@@ -549,6 +562,10 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
549 | loff_t newsize = attr->ia_size; | 562 | loff_t newsize = attr->ia_size; |
550 | 563 | ||
551 | if (newsize != oldsize) { | 564 | if (newsize != oldsize) { |
565 | error = shmem_reacct_size(SHMEM_I(inode)->flags, | ||
566 | oldsize, newsize); | ||
567 | if (error) | ||
568 | return error; | ||
552 | i_size_write(inode, newsize); | 569 | i_size_write(inode, newsize); |
553 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 570 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
554 | } | 571 | } |
@@ -649,7 +666,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, | |||
649 | */ | 666 | */ |
650 | if (!error) | 667 | if (!error) |
651 | error = shmem_add_to_page_cache(*pagep, mapping, index, | 668 | error = shmem_add_to_page_cache(*pagep, mapping, index, |
652 | GFP_NOWAIT, radswap); | 669 | radswap); |
653 | if (error != -ENOMEM) { | 670 | if (error != -ENOMEM) { |
654 | /* | 671 | /* |
655 | * Truncation and eviction use free_swap_and_cache(), which | 672 | * Truncation and eviction use free_swap_and_cache(), which |
@@ -1095,7 +1112,7 @@ repeat: | |||
1095 | gfp & GFP_RECLAIM_MASK); | 1112 | gfp & GFP_RECLAIM_MASK); |
1096 | if (!error) { | 1113 | if (!error) { |
1097 | error = shmem_add_to_page_cache(page, mapping, index, | 1114 | error = shmem_add_to_page_cache(page, mapping, index, |
1098 | gfp, swp_to_radix_entry(swap)); | 1115 | swp_to_radix_entry(swap)); |
1099 | /* | 1116 | /* |
1100 | * We already confirmed swap under page lock, and make | 1117 | * We already confirmed swap under page lock, and make |
1101 | * no memory allocation here, so usually no possibility | 1118 | * no memory allocation here, so usually no possibility |
@@ -1149,7 +1166,7 @@ repeat: | |||
1149 | __SetPageSwapBacked(page); | 1166 | __SetPageSwapBacked(page); |
1150 | __set_page_locked(page); | 1167 | __set_page_locked(page); |
1151 | if (sgp == SGP_WRITE) | 1168 | if (sgp == SGP_WRITE) |
1152 | init_page_accessed(page); | 1169 | __SetPageReferenced(page); |
1153 | 1170 | ||
1154 | error = mem_cgroup_charge_file(page, current->mm, | 1171 | error = mem_cgroup_charge_file(page, current->mm, |
1155 | gfp & GFP_RECLAIM_MASK); | 1172 | gfp & GFP_RECLAIM_MASK); |
@@ -1158,7 +1175,7 @@ repeat: | |||
1158 | error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); | 1175 | error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); |
1159 | if (!error) { | 1176 | if (!error) { |
1160 | error = shmem_add_to_page_cache(page, mapping, index, | 1177 | error = shmem_add_to_page_cache(page, mapping, index, |
1161 | gfp, NULL); | 1178 | NULL); |
1162 | radix_tree_preload_end(); | 1179 | radix_tree_preload_end(); |
1163 | } | 1180 | } |
1164 | if (error) { | 1181 | if (error) { |
@@ -2932,16 +2949,16 @@ static struct file *__shmem_file_setup(const char *name, loff_t size, | |||
2932 | this.len = strlen(name); | 2949 | this.len = strlen(name); |
2933 | this.hash = 0; /* will go */ | 2950 | this.hash = 0; /* will go */ |
2934 | sb = shm_mnt->mnt_sb; | 2951 | sb = shm_mnt->mnt_sb; |
2952 | path.mnt = mntget(shm_mnt); | ||
2935 | path.dentry = d_alloc_pseudo(sb, &this); | 2953 | path.dentry = d_alloc_pseudo(sb, &this); |
2936 | if (!path.dentry) | 2954 | if (!path.dentry) |
2937 | goto put_memory; | 2955 | goto put_memory; |
2938 | d_set_d_op(path.dentry, &anon_ops); | 2956 | d_set_d_op(path.dentry, &anon_ops); |
2939 | path.mnt = mntget(shm_mnt); | ||
2940 | 2957 | ||
2941 | res = ERR_PTR(-ENOSPC); | 2958 | res = ERR_PTR(-ENOSPC); |
2942 | inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); | 2959 | inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); |
2943 | if (!inode) | 2960 | if (!inode) |
2944 | goto put_dentry; | 2961 | goto put_memory; |
2945 | 2962 | ||
2946 | inode->i_flags |= i_flags; | 2963 | inode->i_flags |= i_flags; |
2947 | d_instantiate(path.dentry, inode); | 2964 | d_instantiate(path.dentry, inode); |
@@ -2949,19 +2966,19 @@ static struct file *__shmem_file_setup(const char *name, loff_t size, | |||
2949 | clear_nlink(inode); /* It is unlinked */ | 2966 | clear_nlink(inode); /* It is unlinked */ |
2950 | res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); | 2967 | res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); |
2951 | if (IS_ERR(res)) | 2968 | if (IS_ERR(res)) |
2952 | goto put_dentry; | 2969 | goto put_path; |
2953 | 2970 | ||
2954 | res = alloc_file(&path, FMODE_WRITE | FMODE_READ, | 2971 | res = alloc_file(&path, FMODE_WRITE | FMODE_READ, |
2955 | &shmem_file_operations); | 2972 | &shmem_file_operations); |
2956 | if (IS_ERR(res)) | 2973 | if (IS_ERR(res)) |
2957 | goto put_dentry; | 2974 | goto put_path; |
2958 | 2975 | ||
2959 | return res; | 2976 | return res; |
2960 | 2977 | ||
2961 | put_dentry: | ||
2962 | path_put(&path); | ||
2963 | put_memory: | 2978 | put_memory: |
2964 | shmem_unacct_size(flags, size); | 2979 | shmem_unacct_size(flags, size); |
2980 | put_path: | ||
2981 | path_put(&path); | ||
2965 | return res; | 2982 | return res; |
2966 | } | 2983 | } |
2967 | 2984 | ||
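In __shmem_file_setup() the mount reference is now taken before the dentry, and the error labels are reshuffled: an inode allocation failure falls through shmem_unacct_size() into the new put_path label, while failures after d_instantiate() jump to put_path alone, presumably because dropping the instantiated dentry already tears down the inode together with its size accounting, so unaccounting again would be wrong. The underlying idiom, acquire in one order and release via labels in the reverse order, jumping to the label that matches how far setup got, in a minimal standalone sketch:

    #include <stdio.h>
    #include <stdlib.h>

    static void *acquire(const char *what)
    {
        printf("acquire %s\n", what);
        return malloc(1);
    }

    static void release(void *p, const char *what)
    {
        printf("release %s\n", what);
        free(p);
    }

    static int setup(int steps_that_succeed)
    {
        void *mnt, *dentry, *inode;

        mnt = acquire("mnt");                        /* taken first now */
        dentry = steps_that_succeed > 1 ? acquire("dentry") : NULL;
        if (!dentry)
            goto put_mnt;
        inode = steps_that_succeed > 2 ? acquire("inode") : NULL;
        if (!inode)
            goto put_dentry;
        return 0;                                    /* success: caller owns all three */

    put_dentry:                                      /* unwind in reverse order */
        release(dentry, "dentry");
    put_mnt:
        release(mnt, "mnt");
        return -1;
    }

    int main(void)
    {
        return setup(2);                             /* simulate the inode step failing */
    }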
@@ -191,7 +191,6 @@ struct array_cache { | |||
191 | unsigned int limit; | 191 | unsigned int limit; |
192 | unsigned int batchcount; | 192 | unsigned int batchcount; |
193 | unsigned int touched; | 193 | unsigned int touched; |
194 | spinlock_t lock; | ||
195 | void *entry[]; /* | 194 | void *entry[]; /* |
196 | * Must have this definition in here for the proper | 195 | * Must have this definition in here for the proper |
197 | * alignment of array_cache. Also simplifies accessing | 196 | * alignment of array_cache. Also simplifies accessing |
@@ -203,6 +202,11 @@ struct array_cache { | |||
203 | */ | 202 | */ |
204 | }; | 203 | }; |
205 | 204 | ||
205 | struct alien_cache { | ||
206 | spinlock_t lock; | ||
207 | struct array_cache ac; | ||
208 | }; | ||
209 | |||
206 | #define SLAB_OBJ_PFMEMALLOC 1 | 210 | #define SLAB_OBJ_PFMEMALLOC 1 |
207 | static inline bool is_obj_pfmemalloc(void *objp) | 211 | static inline bool is_obj_pfmemalloc(void *objp) |
208 | { | 212 | { |
@@ -242,7 +246,8 @@ static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; | |||
242 | static int drain_freelist(struct kmem_cache *cache, | 246 | static int drain_freelist(struct kmem_cache *cache, |
243 | struct kmem_cache_node *n, int tofree); | 247 | struct kmem_cache_node *n, int tofree); |
244 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, | 248 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, |
245 | int node); | 249 | int node, struct list_head *list); |
250 | static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); | ||
246 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); | 251 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); |
247 | static void cache_reap(struct work_struct *unused); | 252 | static void cache_reap(struct work_struct *unused); |
248 | 253 | ||
@@ -267,7 +272,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) | |||
267 | #define MAKE_LIST(cachep, listp, slab, nodeid) \ | 272 | #define MAKE_LIST(cachep, listp, slab, nodeid) \ |
268 | do { \ | 273 | do { \ |
269 | INIT_LIST_HEAD(listp); \ | 274 | INIT_LIST_HEAD(listp); \ |
270 | list_splice(&(cachep->node[nodeid]->slab), listp); \ | 275 | list_splice(&get_node(cachep, nodeid)->slab, listp); \ |
271 | } while (0) | 276 | } while (0) |
272 | 277 | ||
273 | #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ | 278 | #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ |
@@ -465,143 +470,6 @@ static struct kmem_cache kmem_cache_boot = { | |||
465 | .name = "kmem_cache", | 470 | .name = "kmem_cache", |
466 | }; | 471 | }; |
467 | 472 | ||
468 | #define BAD_ALIEN_MAGIC 0x01020304ul | ||
469 | |||
470 | #ifdef CONFIG_LOCKDEP | ||
471 | |||
472 | /* | ||
473 | * Slab sometimes uses the kmalloc slabs to store the slab headers | ||
474 | * for other slabs "off slab". | ||
475 | * The locking for this is tricky in that it nests within the locks | ||
476 | * of all other slabs in a few places; to deal with this special | ||
477 | * locking we put on-slab caches into a separate lock-class. | ||
478 | * | ||
479 | * We set lock class for alien array caches which are up during init. | ||
480 | * The lock annotation will be lost if all cpus of a node goes down and | ||
481 | * then comes back up during hotplug | ||
482 | */ | ||
483 | static struct lock_class_key on_slab_l3_key; | ||
484 | static struct lock_class_key on_slab_alc_key; | ||
485 | |||
486 | static struct lock_class_key debugobj_l3_key; | ||
487 | static struct lock_class_key debugobj_alc_key; | ||
488 | |||
489 | static void slab_set_lock_classes(struct kmem_cache *cachep, | ||
490 | struct lock_class_key *l3_key, struct lock_class_key *alc_key, | ||
491 | int q) | ||
492 | { | ||
493 | struct array_cache **alc; | ||
494 | struct kmem_cache_node *n; | ||
495 | int r; | ||
496 | |||
497 | n = cachep->node[q]; | ||
498 | if (!n) | ||
499 | return; | ||
500 | |||
501 | lockdep_set_class(&n->list_lock, l3_key); | ||
502 | alc = n->alien; | ||
503 | /* | ||
504 | * FIXME: This check for BAD_ALIEN_MAGIC | ||
505 | * should go away when common slab code is taught to | ||
506 | * work even without alien caches. | ||
507 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC | ||
508 | * for alloc_alien_cache, | ||
509 | */ | ||
510 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) | ||
511 | return; | ||
512 | for_each_node(r) { | ||
513 | if (alc[r]) | ||
514 | lockdep_set_class(&alc[r]->lock, alc_key); | ||
515 | } | ||
516 | } | ||
517 | |||
518 | static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) | ||
519 | { | ||
520 | slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node); | ||
521 | } | ||
522 | |||
523 | static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) | ||
524 | { | ||
525 | int node; | ||
526 | |||
527 | for_each_online_node(node) | ||
528 | slab_set_debugobj_lock_classes_node(cachep, node); | ||
529 | } | ||
530 | |||
531 | static void init_node_lock_keys(int q) | ||
532 | { | ||
533 | int i; | ||
534 | |||
535 | if (slab_state < UP) | ||
536 | return; | ||
537 | |||
538 | for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) { | ||
539 | struct kmem_cache_node *n; | ||
540 | struct kmem_cache *cache = kmalloc_caches[i]; | ||
541 | |||
542 | if (!cache) | ||
543 | continue; | ||
544 | |||
545 | n = cache->node[q]; | ||
546 | if (!n || OFF_SLAB(cache)) | ||
547 | continue; | ||
548 | |||
549 | slab_set_lock_classes(cache, &on_slab_l3_key, | ||
550 | &on_slab_alc_key, q); | ||
551 | } | ||
552 | } | ||
553 | |||
554 | static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) | ||
555 | { | ||
556 | if (!cachep->node[q]) | ||
557 | return; | ||
558 | |||
559 | slab_set_lock_classes(cachep, &on_slab_l3_key, | ||
560 | &on_slab_alc_key, q); | ||
561 | } | ||
562 | |||
563 | static inline void on_slab_lock_classes(struct kmem_cache *cachep) | ||
564 | { | ||
565 | int node; | ||
566 | |||
567 | VM_BUG_ON(OFF_SLAB(cachep)); | ||
568 | for_each_node(node) | ||
569 | on_slab_lock_classes_node(cachep, node); | ||
570 | } | ||
571 | |||
572 | static inline void init_lock_keys(void) | ||
573 | { | ||
574 | int node; | ||
575 | |||
576 | for_each_node(node) | ||
577 | init_node_lock_keys(node); | ||
578 | } | ||
579 | #else | ||
580 | static void init_node_lock_keys(int q) | ||
581 | { | ||
582 | } | ||
583 | |||
584 | static inline void init_lock_keys(void) | ||
585 | { | ||
586 | } | ||
587 | |||
588 | static inline void on_slab_lock_classes(struct kmem_cache *cachep) | ||
589 | { | ||
590 | } | ||
591 | |||
592 | static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) | ||
593 | { | ||
594 | } | ||
595 | |||
596 | static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) | ||
597 | { | ||
598 | } | ||
599 | |||
600 | static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) | ||
601 | { | ||
602 | } | ||
603 | #endif | ||
604 | |||
605 | static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); | 473 | static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); |
606 | 474 | ||
607 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | 475 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) |
@@ -792,13 +660,8 @@ static void start_cpu_timer(int cpu) | |||
792 | } | 660 | } |
793 | } | 661 | } |
794 | 662 | ||
795 | static struct array_cache *alloc_arraycache(int node, int entries, | 663 | static void init_arraycache(struct array_cache *ac, int limit, int batch) |
796 | int batchcount, gfp_t gfp) | ||
797 | { | 664 | { |
798 | int memsize = sizeof(void *) * entries + sizeof(struct array_cache); | ||
799 | struct array_cache *nc = NULL; | ||
800 | |||
801 | nc = kmalloc_node(memsize, gfp, node); | ||
802 | /* | 665 | /* |
803 | * The array_cache structures contain pointers to free object. | 666 | * The array_cache structures contain pointers to free object. |
804 | * However, when such objects are allocated or transferred to another | 667 | * However, when such objects are allocated or transferred to another |
@@ -806,15 +669,24 @@ static struct array_cache *alloc_arraycache(int node, int entries, | |||
806 | * valid references during a kmemleak scan. Therefore, kmemleak must | 669 | * valid references during a kmemleak scan. Therefore, kmemleak must |
807 | * not scan such objects. | 670 | * not scan such objects. |
808 | */ | 671 | */ |
809 | kmemleak_no_scan(nc); | 672 | kmemleak_no_scan(ac); |
810 | if (nc) { | 673 | if (ac) { |
811 | nc->avail = 0; | 674 | ac->avail = 0; |
812 | nc->limit = entries; | 675 | ac->limit = limit; |
813 | nc->batchcount = batchcount; | 676 | ac->batchcount = batch; |
814 | nc->touched = 0; | 677 | ac->touched = 0; |
815 | spin_lock_init(&nc->lock); | ||
816 | } | 678 | } |
817 | return nc; | 679 | } |
680 | |||
681 | static struct array_cache *alloc_arraycache(int node, int entries, | ||
682 | int batchcount, gfp_t gfp) | ||
683 | { | ||
684 | size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache); | ||
685 | struct array_cache *ac = NULL; | ||
686 | |||
687 | ac = kmalloc_node(memsize, gfp, node); | ||
688 | init_arraycache(ac, entries, batchcount); | ||
689 | return ac; | ||
818 | } | 690 | } |
819 | 691 | ||
820 | static inline bool is_slab_pfmemalloc(struct page *page) | 692 | static inline bool is_slab_pfmemalloc(struct page *page) |
@@ -826,7 +698,7 @@ static inline bool is_slab_pfmemalloc(struct page *page) | |||
826 | static void recheck_pfmemalloc_active(struct kmem_cache *cachep, | 698 | static void recheck_pfmemalloc_active(struct kmem_cache *cachep, |
827 | struct array_cache *ac) | 699 | struct array_cache *ac) |
828 | { | 700 | { |
829 | struct kmem_cache_node *n = cachep->node[numa_mem_id()]; | 701 | struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); |
830 | struct page *page; | 702 | struct page *page; |
831 | unsigned long flags; | 703 | unsigned long flags; |
832 | 704 | ||
@@ -881,7 +753,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, | |||
881 | * If there are empty slabs on the slabs_free list and we are | 753 | * If there are empty slabs on the slabs_free list and we are |
882 | * being forced to refill the cache, mark this one !pfmemalloc. | 754 | * being forced to refill the cache, mark this one !pfmemalloc. |
883 | */ | 755 | */ |
884 | n = cachep->node[numa_mem_id()]; | 756 | n = get_node(cachep, numa_mem_id()); |
885 | if (!list_empty(&n->slabs_free) && force_refill) { | 757 | if (!list_empty(&n->slabs_free) && force_refill) { |
886 | struct page *page = virt_to_head_page(objp); | 758 | struct page *page = virt_to_head_page(objp); |
887 | ClearPageSlabPfmemalloc(page); | 759 | ClearPageSlabPfmemalloc(page); |
@@ -961,12 +833,13 @@ static int transfer_objects(struct array_cache *to, | |||
961 | #define drain_alien_cache(cachep, alien) do { } while (0) | 833 | #define drain_alien_cache(cachep, alien) do { } while (0) |
962 | #define reap_alien(cachep, n) do { } while (0) | 834 | #define reap_alien(cachep, n) do { } while (0) |
963 | 835 | ||
964 | static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) | 836 | static inline struct alien_cache **alloc_alien_cache(int node, |
837 | int limit, gfp_t gfp) | ||
965 | { | 838 | { |
966 | return (struct array_cache **)BAD_ALIEN_MAGIC; | 839 | return NULL; |
967 | } | 840 | } |
968 | 841 | ||
969 | static inline void free_alien_cache(struct array_cache **ac_ptr) | 842 | static inline void free_alien_cache(struct alien_cache **ac_ptr) |
970 | { | 843 | { |
971 | } | 844 | } |
972 | 845 | ||
@@ -992,46 +865,60 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, | |||
992 | static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); | 865 | static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); |
993 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); | 866 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); |
994 | 867 | ||
995 | static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) | 868 | static struct alien_cache *__alloc_alien_cache(int node, int entries, |
869 | int batch, gfp_t gfp) | ||
870 | { | ||
871 | size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache); | ||
872 | struct alien_cache *alc = NULL; | ||
873 | |||
874 | alc = kmalloc_node(memsize, gfp, node); | ||
875 | init_arraycache(&alc->ac, entries, batch); | ||
876 | spin_lock_init(&alc->lock); | ||
877 | return alc; | ||
878 | } | ||
879 | |||
880 | static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) | ||
996 | { | 881 | { |
997 | struct array_cache **ac_ptr; | 882 | struct alien_cache **alc_ptr; |
998 | int memsize = sizeof(void *) * nr_node_ids; | 883 | size_t memsize = sizeof(void *) * nr_node_ids; |
999 | int i; | 884 | int i; |
1000 | 885 | ||
1001 | if (limit > 1) | 886 | if (limit > 1) |
1002 | limit = 12; | 887 | limit = 12; |
1003 | ac_ptr = kzalloc_node(memsize, gfp, node); | 888 | alc_ptr = kzalloc_node(memsize, gfp, node); |
1004 | if (ac_ptr) { | 889 | if (!alc_ptr) |
1005 | for_each_node(i) { | 890 | return NULL; |
1006 | if (i == node || !node_online(i)) | 891 | |
1007 | continue; | 892 | for_each_node(i) { |
1008 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); | 893 | if (i == node || !node_online(i)) |
1009 | if (!ac_ptr[i]) { | 894 | continue; |
1010 | for (i--; i >= 0; i--) | 895 | alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp); |
1011 | kfree(ac_ptr[i]); | 896 | if (!alc_ptr[i]) { |
1012 | kfree(ac_ptr); | 897 | for (i--; i >= 0; i--) |
1013 | return NULL; | 898 | kfree(alc_ptr[i]); |
1014 | } | 899 | kfree(alc_ptr); |
900 | return NULL; | ||
1015 | } | 901 | } |
1016 | } | 902 | } |
1017 | return ac_ptr; | 903 | return alc_ptr; |
1018 | } | 904 | } |
1019 | 905 | ||
1020 | static void free_alien_cache(struct array_cache **ac_ptr) | 906 | static void free_alien_cache(struct alien_cache **alc_ptr) |
1021 | { | 907 | { |
1022 | int i; | 908 | int i; |
1023 | 909 | ||
1024 | if (!ac_ptr) | 910 | if (!alc_ptr) |
1025 | return; | 911 | return; |
1026 | for_each_node(i) | 912 | for_each_node(i) |
1027 | kfree(ac_ptr[i]); | 913 | kfree(alc_ptr[i]); |
1028 | kfree(ac_ptr); | 914 | kfree(alc_ptr); |
1029 | } | 915 | } |
1030 | 916 | ||
1031 | static void __drain_alien_cache(struct kmem_cache *cachep, | 917 | static void __drain_alien_cache(struct kmem_cache *cachep, |
1032 | struct array_cache *ac, int node) | 918 | struct array_cache *ac, int node, |
919 | struct list_head *list) | ||
1033 | { | 920 | { |
1034 | struct kmem_cache_node *n = cachep->node[node]; | 921 | struct kmem_cache_node *n = get_node(cachep, node); |
1035 | 922 | ||
1036 | if (ac->avail) { | 923 | if (ac->avail) { |
1037 | spin_lock(&n->list_lock); | 924 | spin_lock(&n->list_lock); |
@@ -1043,7 +930,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, | |||
1043 | if (n->shared) | 930 | if (n->shared) |
1044 | transfer_objects(n->shared, ac, ac->limit); | 931 | transfer_objects(n->shared, ac, ac->limit); |
1045 | 932 | ||
1046 | free_block(cachep, ac->entry, ac->avail, node); | 933 | free_block(cachep, ac->entry, ac->avail, node, list); |
1047 | ac->avail = 0; | 934 | ac->avail = 0; |
1048 | spin_unlock(&n->list_lock); | 935 | spin_unlock(&n->list_lock); |
1049 | } | 936 | } |
@@ -1057,28 +944,40 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) | |||
1057 | int node = __this_cpu_read(slab_reap_node); | 944 | int node = __this_cpu_read(slab_reap_node); |
1058 | 945 | ||
1059 | if (n->alien) { | 946 | if (n->alien) { |
1060 | struct array_cache *ac = n->alien[node]; | 947 | struct alien_cache *alc = n->alien[node]; |
948 | struct array_cache *ac; | ||
949 | |||
950 | if (alc) { | ||
951 | ac = &alc->ac; | ||
952 | if (ac->avail && spin_trylock_irq(&alc->lock)) { | ||
953 | LIST_HEAD(list); | ||
1061 | 954 | ||
1062 | if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { | 955 | __drain_alien_cache(cachep, ac, node, &list); |
1063 | __drain_alien_cache(cachep, ac, node); | 956 | spin_unlock_irq(&alc->lock); |
1064 | spin_unlock_irq(&ac->lock); | 957 | slabs_destroy(cachep, &list); |
958 | } | ||
1065 | } | 959 | } |
1066 | } | 960 | } |
1067 | } | 961 | } |
1068 | 962 | ||
1069 | static void drain_alien_cache(struct kmem_cache *cachep, | 963 | static void drain_alien_cache(struct kmem_cache *cachep, |
1070 | struct array_cache **alien) | 964 | struct alien_cache **alien) |
1071 | { | 965 | { |
1072 | int i = 0; | 966 | int i = 0; |
967 | struct alien_cache *alc; | ||
1073 | struct array_cache *ac; | 968 | struct array_cache *ac; |
1074 | unsigned long flags; | 969 | unsigned long flags; |
1075 | 970 | ||
1076 | for_each_online_node(i) { | 971 | for_each_online_node(i) { |
1077 | ac = alien[i]; | 972 | alc = alien[i]; |
1078 | if (ac) { | 973 | if (alc) { |
1079 | spin_lock_irqsave(&ac->lock, flags); | 974 | LIST_HEAD(list); |
1080 | __drain_alien_cache(cachep, ac, i); | 975 | |
1081 | spin_unlock_irqrestore(&ac->lock, flags); | 976 | ac = &alc->ac; |
977 | spin_lock_irqsave(&alc->lock, flags); | ||
978 | __drain_alien_cache(cachep, ac, i, &list); | ||
979 | spin_unlock_irqrestore(&alc->lock, flags); | ||
980 | slabs_destroy(cachep, &list); | ||
1082 | } | 981 | } |
1083 | } | 982 | } |
1084 | } | 983 | } |
@@ -1087,8 +986,10 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
1087 | { | 986 | { |
1088 | int nodeid = page_to_nid(virt_to_page(objp)); | 987 | int nodeid = page_to_nid(virt_to_page(objp)); |
1089 | struct kmem_cache_node *n; | 988 | struct kmem_cache_node *n; |
1090 | struct array_cache *alien = NULL; | 989 | struct alien_cache *alien = NULL; |
990 | struct array_cache *ac; | ||
1091 | int node; | 991 | int node; |
992 | LIST_HEAD(list); | ||
1092 | 993 | ||
1093 | node = numa_mem_id(); | 994 | node = numa_mem_id(); |
1094 | 995 | ||
@@ -1099,21 +1000,25 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
1099 | if (likely(nodeid == node)) | 1000 | if (likely(nodeid == node)) |
1100 | return 0; | 1001 | return 0; |
1101 | 1002 | ||
1102 | n = cachep->node[node]; | 1003 | n = get_node(cachep, node); |
1103 | STATS_INC_NODEFREES(cachep); | 1004 | STATS_INC_NODEFREES(cachep); |
1104 | if (n->alien && n->alien[nodeid]) { | 1005 | if (n->alien && n->alien[nodeid]) { |
1105 | alien = n->alien[nodeid]; | 1006 | alien = n->alien[nodeid]; |
1007 | ac = &alien->ac; | ||
1106 | spin_lock(&alien->lock); | 1008 | spin_lock(&alien->lock); |
1107 | if (unlikely(alien->avail == alien->limit)) { | 1009 | if (unlikely(ac->avail == ac->limit)) { |
1108 | STATS_INC_ACOVERFLOW(cachep); | 1010 | STATS_INC_ACOVERFLOW(cachep); |
1109 | __drain_alien_cache(cachep, alien, nodeid); | 1011 | __drain_alien_cache(cachep, ac, nodeid, &list); |
1110 | } | 1012 | } |
1111 | ac_put_obj(cachep, alien, objp); | 1013 | ac_put_obj(cachep, ac, objp); |
1112 | spin_unlock(&alien->lock); | 1014 | spin_unlock(&alien->lock); |
1015 | slabs_destroy(cachep, &list); | ||
1113 | } else { | 1016 | } else { |
1114 | spin_lock(&(cachep->node[nodeid])->list_lock); | 1017 | n = get_node(cachep, nodeid); |
1115 | free_block(cachep, &objp, 1, nodeid); | 1018 | spin_lock(&n->list_lock); |
1116 | spin_unlock(&(cachep->node[nodeid])->list_lock); | 1019 | free_block(cachep, &objp, 1, nodeid, &list); |
1020 | spin_unlock(&n->list_lock); | ||
1021 | slabs_destroy(cachep, &list); | ||
1117 | } | 1022 | } |
1118 | return 1; | 1023 | return 1; |
1119 | } | 1024 | } |
@@ -1132,7 +1037,7 @@ static int init_cache_node_node(int node) | |||
1132 | { | 1037 | { |
1133 | struct kmem_cache *cachep; | 1038 | struct kmem_cache *cachep; |
1134 | struct kmem_cache_node *n; | 1039 | struct kmem_cache_node *n; |
1135 | const int memsize = sizeof(struct kmem_cache_node); | 1040 | const size_t memsize = sizeof(struct kmem_cache_node); |
1136 | 1041 | ||
1137 | list_for_each_entry(cachep, &slab_caches, list) { | 1042 | list_for_each_entry(cachep, &slab_caches, list) { |
1138 | /* | 1043 | /* |
@@ -1140,7 +1045,8 @@ static int init_cache_node_node(int node) | |||
1140 | * begin anything. Make sure some other cpu on this | 1045 | * begin anything. Make sure some other cpu on this |
1141 | * node has not already allocated this | 1046 | * node has not already allocated this |
1142 | */ | 1047 | */ |
1143 | if (!cachep->node[node]) { | 1048 | n = get_node(cachep, node); |
1049 | if (!n) { | ||
1144 | n = kmalloc_node(memsize, GFP_KERNEL, node); | 1050 | n = kmalloc_node(memsize, GFP_KERNEL, node); |
1145 | if (!n) | 1051 | if (!n) |
1146 | return -ENOMEM; | 1052 | return -ENOMEM; |
@@ -1156,11 +1062,11 @@ static int init_cache_node_node(int node) | |||
1156 | cachep->node[node] = n; | 1062 | cachep->node[node] = n; |
1157 | } | 1063 | } |
1158 | 1064 | ||
1159 | spin_lock_irq(&cachep->node[node]->list_lock); | 1065 | spin_lock_irq(&n->list_lock); |
1160 | cachep->node[node]->free_limit = | 1066 | n->free_limit = |
1161 | (1 + nr_cpus_node(node)) * | 1067 | (1 + nr_cpus_node(node)) * |
1162 | cachep->batchcount + cachep->num; | 1068 | cachep->batchcount + cachep->num; |
1163 | spin_unlock_irq(&cachep->node[node]->list_lock); | 1069 | spin_unlock_irq(&n->list_lock); |
1164 | } | 1070 | } |
1165 | return 0; | 1071 | return 0; |
1166 | } | 1072 | } |
@@ -1181,12 +1087,13 @@ static void cpuup_canceled(long cpu) | |||
1181 | list_for_each_entry(cachep, &slab_caches, list) { | 1087 | list_for_each_entry(cachep, &slab_caches, list) { |
1182 | struct array_cache *nc; | 1088 | struct array_cache *nc; |
1183 | struct array_cache *shared; | 1089 | struct array_cache *shared; |
1184 | struct array_cache **alien; | 1090 | struct alien_cache **alien; |
1091 | LIST_HEAD(list); | ||
1185 | 1092 | ||
1186 | /* cpu is dead; no one can alloc from it. */ | 1093 | /* cpu is dead; no one can alloc from it. */ |
1187 | nc = cachep->array[cpu]; | 1094 | nc = cachep->array[cpu]; |
1188 | cachep->array[cpu] = NULL; | 1095 | cachep->array[cpu] = NULL; |
1189 | n = cachep->node[node]; | 1096 | n = get_node(cachep, node); |
1190 | 1097 | ||
1191 | if (!n) | 1098 | if (!n) |
1192 | goto free_array_cache; | 1099 | goto free_array_cache; |
@@ -1196,7 +1103,7 @@ static void cpuup_canceled(long cpu) | |||
1196 | /* Free limit for this kmem_cache_node */ | 1103 | /* Free limit for this kmem_cache_node */ |
1197 | n->free_limit -= cachep->batchcount; | 1104 | n->free_limit -= cachep->batchcount; |
1198 | if (nc) | 1105 | if (nc) |
1199 | free_block(cachep, nc->entry, nc->avail, node); | 1106 | free_block(cachep, nc->entry, nc->avail, node, &list); |
1200 | 1107 | ||
1201 | if (!cpumask_empty(mask)) { | 1108 | if (!cpumask_empty(mask)) { |
1202 | spin_unlock_irq(&n->list_lock); | 1109 | spin_unlock_irq(&n->list_lock); |
@@ -1206,7 +1113,7 @@ static void cpuup_canceled(long cpu) | |||
1206 | shared = n->shared; | 1113 | shared = n->shared; |
1207 | if (shared) { | 1114 | if (shared) { |
1208 | free_block(cachep, shared->entry, | 1115 | free_block(cachep, shared->entry, |
1209 | shared->avail, node); | 1116 | shared->avail, node, &list); |
1210 | n->shared = NULL; | 1117 | n->shared = NULL; |
1211 | } | 1118 | } |
1212 | 1119 | ||
@@ -1221,6 +1128,7 @@ static void cpuup_canceled(long cpu) | |||
1221 | free_alien_cache(alien); | 1128 | free_alien_cache(alien); |
1222 | } | 1129 | } |
1223 | free_array_cache: | 1130 | free_array_cache: |
1131 | slabs_destroy(cachep, &list); | ||
1224 | kfree(nc); | 1132 | kfree(nc); |
1225 | } | 1133 | } |
1226 | /* | 1134 | /* |
@@ -1229,7 +1137,7 @@ free_array_cache: | |||
1229 | * shrink each nodelist to its limit. | 1137 | * shrink each nodelist to its limit. |
1230 | */ | 1138 | */ |
1231 | list_for_each_entry(cachep, &slab_caches, list) { | 1139 | list_for_each_entry(cachep, &slab_caches, list) { |
1232 | n = cachep->node[node]; | 1140 | n = get_node(cachep, node); |
1233 | if (!n) | 1141 | if (!n) |
1234 | continue; | 1142 | continue; |
1235 | drain_freelist(cachep, n, slabs_tofree(cachep, n)); | 1143 | drain_freelist(cachep, n, slabs_tofree(cachep, n)); |
@@ -1260,7 +1168,7 @@ static int cpuup_prepare(long cpu) | |||
1260 | list_for_each_entry(cachep, &slab_caches, list) { | 1168 | list_for_each_entry(cachep, &slab_caches, list) { |
1261 | struct array_cache *nc; | 1169 | struct array_cache *nc; |
1262 | struct array_cache *shared = NULL; | 1170 | struct array_cache *shared = NULL; |
1263 | struct array_cache **alien = NULL; | 1171 | struct alien_cache **alien = NULL; |
1264 | 1172 | ||
1265 | nc = alloc_arraycache(node, cachep->limit, | 1173 | nc = alloc_arraycache(node, cachep->limit, |
1266 | cachep->batchcount, GFP_KERNEL); | 1174 | cachep->batchcount, GFP_KERNEL); |
@@ -1284,7 +1192,7 @@ static int cpuup_prepare(long cpu) | |||
1284 | } | 1192 | } |
1285 | } | 1193 | } |
1286 | cachep->array[cpu] = nc; | 1194 | cachep->array[cpu] = nc; |
1287 | n = cachep->node[node]; | 1195 | n = get_node(cachep, node); |
1288 | BUG_ON(!n); | 1196 | BUG_ON(!n); |
1289 | 1197 | ||
1290 | spin_lock_irq(&n->list_lock); | 1198 | spin_lock_irq(&n->list_lock); |
@@ -1305,13 +1213,7 @@ static int cpuup_prepare(long cpu) | |||
1305 | spin_unlock_irq(&n->list_lock); | 1213 | spin_unlock_irq(&n->list_lock); |
1306 | kfree(shared); | 1214 | kfree(shared); |
1307 | free_alien_cache(alien); | 1215 | free_alien_cache(alien); |
1308 | if (cachep->flags & SLAB_DEBUG_OBJECTS) | ||
1309 | slab_set_debugobj_lock_classes_node(cachep, node); | ||
1310 | else if (!OFF_SLAB(cachep) && | ||
1311 | !(cachep->flags & SLAB_DESTROY_BY_RCU)) | ||
1312 | on_slab_lock_classes_node(cachep, node); | ||
1313 | } | 1216 | } |
1314 | init_node_lock_keys(node); | ||
1315 | 1217 | ||
1316 | return 0; | 1218 | return 0; |
1317 | bad: | 1219 | bad: |
@@ -1395,7 +1297,7 @@ static int __meminit drain_cache_node_node(int node) | |||
1395 | list_for_each_entry(cachep, &slab_caches, list) { | 1297 | list_for_each_entry(cachep, &slab_caches, list) { |
1396 | struct kmem_cache_node *n; | 1298 | struct kmem_cache_node *n; |
1397 | 1299 | ||
1398 | n = cachep->node[node]; | 1300 | n = get_node(cachep, node); |
1399 | if (!n) | 1301 | if (!n) |
1400 | continue; | 1302 | continue; |
1401 | 1303 | ||
@@ -1575,10 +1477,6 @@ void __init kmem_cache_init(void) | |||
1575 | 1477 | ||
1576 | memcpy(ptr, cpu_cache_get(kmem_cache), | 1478 | memcpy(ptr, cpu_cache_get(kmem_cache), |
1577 | sizeof(struct arraycache_init)); | 1479 | sizeof(struct arraycache_init)); |
1578 | /* | ||
1579 | * Do not assume that spinlocks can be initialized via memcpy: | ||
1580 | */ | ||
1581 | spin_lock_init(&ptr->lock); | ||
1582 | 1480 | ||
1583 | kmem_cache->array[smp_processor_id()] = ptr; | 1481 | kmem_cache->array[smp_processor_id()] = ptr; |
1584 | 1482 | ||
@@ -1588,10 +1486,6 @@ void __init kmem_cache_init(void) | |||
1588 | != &initarray_generic.cache); | 1486 | != &initarray_generic.cache); |
1589 | memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), | 1487 | memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), |
1590 | sizeof(struct arraycache_init)); | 1488 | sizeof(struct arraycache_init)); |
1591 | /* | ||
1592 | * Do not assume that spinlocks can be initialized via memcpy: | ||
1593 | */ | ||
1594 | spin_lock_init(&ptr->lock); | ||
1595 | 1489 | ||
1596 | kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; | 1490 | kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; |
1597 | } | 1491 | } |
@@ -1628,9 +1522,6 @@ void __init kmem_cache_init_late(void) | |||
1628 | BUG(); | 1522 | BUG(); |
1629 | mutex_unlock(&slab_mutex); | 1523 | mutex_unlock(&slab_mutex); |
1630 | 1524 | ||
1631 | /* Annotate slab for lockdep -- annotate the malloc caches */ | ||
1632 | init_lock_keys(); | ||
1633 | |||
1634 | /* Done! */ | 1525 | /* Done! */ |
1635 | slab_state = FULL; | 1526 | slab_state = FULL; |
1636 | 1527 | ||
@@ -1690,14 +1581,10 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) | |||
1690 | printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", | 1581 | printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", |
1691 | cachep->name, cachep->size, cachep->gfporder); | 1582 | cachep->name, cachep->size, cachep->gfporder); |
1692 | 1583 | ||
1693 | for_each_online_node(node) { | 1584 | for_each_kmem_cache_node(cachep, node, n) { |
1694 | unsigned long active_objs = 0, num_objs = 0, free_objects = 0; | 1585 | unsigned long active_objs = 0, num_objs = 0, free_objects = 0; |
1695 | unsigned long active_slabs = 0, num_slabs = 0; | 1586 | unsigned long active_slabs = 0, num_slabs = 0; |
1696 | 1587 | ||
1697 | n = cachep->node[node]; | ||
1698 | if (!n) | ||
1699 | continue; | ||
1700 | |||
1701 | spin_lock_irqsave(&n->list_lock, flags); | 1588 | spin_lock_irqsave(&n->list_lock, flags); |
1702 | list_for_each_entry(page, &n->slabs_full, lru) { | 1589 | list_for_each_entry(page, &n->slabs_full, lru) { |
1703 | active_objs += cachep->num; | 1590 | active_objs += cachep->num; |
@@ -1724,7 +1611,8 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) | |||
1724 | } | 1611 | } |
1725 | 1612 | ||
1726 | /* | 1613 | /* |
1727 | * Interface to system's page allocator. No need to hold the cache-lock. | 1614 | * Interface to system's page allocator. No need to hold the |
1615 | * kmem_cache_node ->list_lock. | ||
1728 | * | 1616 | * |
1729 | * If we requested dmaable memory, we will get it. Even if we | 1617 | * If we requested dmaable memory, we will get it. Even if we |
1730 | * did not request dmaable memory, we might get it, but that | 1618 | * did not request dmaable memory, we might get it, but that |
@@ -2026,9 +1914,9 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, | |||
2026 | * @cachep: cache pointer being destroyed | 1914 | * @cachep: cache pointer being destroyed |
2027 | * @page: page pointer being destroyed | 1915 | * @page: page pointer being destroyed |
2028 | * | 1916 | * |
2029 | * Destroy all the objs in a slab, and release the mem back to the system. | 1917 | * Destroy all the objs in a slab page, and release the mem back to the system. |
2030 | * Before calling the slab must have been unlinked from the cache. The | 1918 | * Before calling the slab page must have been unlinked from the cache. The |
2031 | * cache-lock is not held/needed. | 1919 | * kmem_cache_node ->list_lock is not held/needed. |
2032 | */ | 1920 | */ |
2033 | static void slab_destroy(struct kmem_cache *cachep, struct page *page) | 1921 | static void slab_destroy(struct kmem_cache *cachep, struct page *page) |
2034 | { | 1922 | { |
@@ -2060,6 +1948,16 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) | |||
2060 | kmem_cache_free(cachep->freelist_cache, freelist); | 1948 | kmem_cache_free(cachep->freelist_cache, freelist); |
2061 | } | 1949 | } |
2062 | 1950 | ||
1951 | static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) | ||
1952 | { | ||
1953 | struct page *page, *n; | ||
1954 | |||
1955 | list_for_each_entry_safe(page, n, list, lru) { | ||
1956 | list_del(&page->lru); | ||
1957 | slab_destroy(cachep, page); | ||
1958 | } | ||
1959 | } | ||
1960 | |||
2063 | /** | 1961 | /** |
2064 | * calculate_slab_order - calculate size (page order) of slabs | 1962 | * calculate_slab_order - calculate size (page order) of slabs |
2065 | * @cachep: pointer to the cache that is being created | 1963 | * @cachep: pointer to the cache that is being created |
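The hunk above introduces slabs_destroy(), which pairs with the reworked free_block(): fully free slab pages are detached onto a caller-supplied list while the node's list_lock is held, and only returned to the page allocator after the lock is dropped, shortening lock hold times. A minimal sketch of the caller pattern, taken from the do_drain()/cache_flusharray() hunks further down (the surrounding cache/node setup is assumed):

	struct kmem_cache_node *n = get_node(cachep, node);
	struct array_cache *ac = cpu_cache_get(cachep);
	LIST_HEAD(list);			/* collects detached, fully free pages */

	spin_lock(&n->list_lock);
	/* free_block() now queues empty pages on &list instead of calling
	 * slab_destroy() with the lock still held */
	free_block(cachep, ac->entry, ac->avail, node, &list);
	spin_unlock(&n->list_lock);

	slabs_destroy(cachep, &list);		/* safe: lock no longer held */
	ac->avail = 0;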
@@ -2405,17 +2303,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2405 | return err; | 2303 | return err; |
2406 | } | 2304 | } |
2407 | 2305 | ||
2408 | if (flags & SLAB_DEBUG_OBJECTS) { | ||
2409 | /* | ||
2410 | * Would deadlock through slab_destroy()->call_rcu()-> | ||
2411 | * debug_object_activate()->kmem_cache_alloc(). | ||
2412 | */ | ||
2413 | WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); | ||
2414 | |||
2415 | slab_set_debugobj_lock_classes(cachep); | ||
2416 | } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) | ||
2417 | on_slab_lock_classes(cachep); | ||
2418 | |||
2419 | return 0; | 2306 | return 0; |
2420 | } | 2307 | } |
2421 | 2308 | ||
@@ -2434,7 +2321,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) | |||
2434 | { | 2321 | { |
2435 | #ifdef CONFIG_SMP | 2322 | #ifdef CONFIG_SMP |
2436 | check_irq_off(); | 2323 | check_irq_off(); |
2437 | assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock); | 2324 | assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); |
2438 | #endif | 2325 | #endif |
2439 | } | 2326 | } |
2440 | 2327 | ||
@@ -2442,7 +2329,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) | |||
2442 | { | 2329 | { |
2443 | #ifdef CONFIG_SMP | 2330 | #ifdef CONFIG_SMP |
2444 | check_irq_off(); | 2331 | check_irq_off(); |
2445 | assert_spin_locked(&cachep->node[node]->list_lock); | 2332 | assert_spin_locked(&get_node(cachep, node)->list_lock); |
2446 | #endif | 2333 | #endif |
2447 | } | 2334 | } |
2448 | 2335 | ||
@@ -2462,12 +2349,16 @@ static void do_drain(void *arg) | |||
2462 | struct kmem_cache *cachep = arg; | 2349 | struct kmem_cache *cachep = arg; |
2463 | struct array_cache *ac; | 2350 | struct array_cache *ac; |
2464 | int node = numa_mem_id(); | 2351 | int node = numa_mem_id(); |
2352 | struct kmem_cache_node *n; | ||
2353 | LIST_HEAD(list); | ||
2465 | 2354 | ||
2466 | check_irq_off(); | 2355 | check_irq_off(); |
2467 | ac = cpu_cache_get(cachep); | 2356 | ac = cpu_cache_get(cachep); |
2468 | spin_lock(&cachep->node[node]->list_lock); | 2357 | n = get_node(cachep, node); |
2469 | free_block(cachep, ac->entry, ac->avail, node); | 2358 | spin_lock(&n->list_lock); |
2470 | spin_unlock(&cachep->node[node]->list_lock); | 2359 | free_block(cachep, ac->entry, ac->avail, node, &list); |
2360 | spin_unlock(&n->list_lock); | ||
2361 | slabs_destroy(cachep, &list); | ||
2471 | ac->avail = 0; | 2362 | ac->avail = 0; |
2472 | } | 2363 | } |
2473 | 2364 | ||
@@ -2478,17 +2369,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep) | |||
2478 | 2369 | ||
2479 | on_each_cpu(do_drain, cachep, 1); | 2370 | on_each_cpu(do_drain, cachep, 1); |
2480 | check_irq_on(); | 2371 | check_irq_on(); |
2481 | for_each_online_node(node) { | 2372 | for_each_kmem_cache_node(cachep, node, n) |
2482 | n = cachep->node[node]; | 2373 | if (n->alien) |
2483 | if (n && n->alien) | ||
2484 | drain_alien_cache(cachep, n->alien); | 2374 | drain_alien_cache(cachep, n->alien); |
2485 | } | ||
2486 | 2375 | ||
2487 | for_each_online_node(node) { | 2376 | for_each_kmem_cache_node(cachep, node, n) |
2488 | n = cachep->node[node]; | 2377 | drain_array(cachep, n, n->shared, 1, node); |
2489 | if (n) | ||
2490 | drain_array(cachep, n, n->shared, 1, node); | ||
2491 | } | ||
2492 | } | 2378 | } |
2493 | 2379 | ||
2494 | /* | 2380 | /* |
@@ -2534,17 +2420,14 @@ out: | |||
2534 | 2420 | ||
2535 | int __kmem_cache_shrink(struct kmem_cache *cachep) | 2421 | int __kmem_cache_shrink(struct kmem_cache *cachep) |
2536 | { | 2422 | { |
2537 | int ret = 0, i = 0; | 2423 | int ret = 0; |
2424 | int node; | ||
2538 | struct kmem_cache_node *n; | 2425 | struct kmem_cache_node *n; |
2539 | 2426 | ||
2540 | drain_cpu_caches(cachep); | 2427 | drain_cpu_caches(cachep); |
2541 | 2428 | ||
2542 | check_irq_on(); | 2429 | check_irq_on(); |
2543 | for_each_online_node(i) { | 2430 | for_each_kmem_cache_node(cachep, node, n) { |
2544 | n = cachep->node[i]; | ||
2545 | if (!n) | ||
2546 | continue; | ||
2547 | |||
2548 | drain_freelist(cachep, n, slabs_tofree(cachep, n)); | 2431 | drain_freelist(cachep, n, slabs_tofree(cachep, n)); |
2549 | 2432 | ||
2550 | ret += !list_empty(&n->slabs_full) || | 2433 | ret += !list_empty(&n->slabs_full) || |
@@ -2566,13 +2449,11 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) | |||
2566 | kfree(cachep->array[i]); | 2449 | kfree(cachep->array[i]); |
2567 | 2450 | ||
2568 | /* NUMA: free the node structures */ | 2451 | /* NUMA: free the node structures */ |
2569 | for_each_online_node(i) { | 2452 | for_each_kmem_cache_node(cachep, i, n) { |
2570 | n = cachep->node[i]; | 2453 | kfree(n->shared); |
2571 | if (n) { | 2454 | free_alien_cache(n->alien); |
2572 | kfree(n->shared); | 2455 | kfree(n); |
2573 | free_alien_cache(n->alien); | 2456 | cachep->node[i] = NULL; |
2574 | kfree(n); | ||
2575 | } | ||
2576 | } | 2457 | } |
2577 | return 0; | 2458 | return 0; |
2578 | } | 2459 | } |
@@ -2751,7 +2632,7 @@ static int cache_grow(struct kmem_cache *cachep, | |||
2751 | 2632 | ||
2752 | /* Take the node list lock to change the colour_next on this node */ | 2633 | /* Take the node list lock to change the colour_next on this node */ |
2753 | check_irq_off(); | 2634 | check_irq_off(); |
2754 | n = cachep->node[nodeid]; | 2635 | n = get_node(cachep, nodeid); |
2755 | spin_lock(&n->list_lock); | 2636 | spin_lock(&n->list_lock); |
2756 | 2637 | ||
2757 | /* Get colour for the slab, and cal the next value. */ | 2638 | /* Get colour for the slab, and cal the next value. */ |
@@ -2920,7 +2801,7 @@ retry: | |||
2920 | */ | 2801 | */ |
2921 | batchcount = BATCHREFILL_LIMIT; | 2802 | batchcount = BATCHREFILL_LIMIT; |
2922 | } | 2803 | } |
2923 | n = cachep->node[node]; | 2804 | n = get_node(cachep, node); |
2924 | 2805 | ||
2925 | BUG_ON(ac->avail > 0 || !n); | 2806 | BUG_ON(ac->avail > 0 || !n); |
2926 | spin_lock(&n->list_lock); | 2807 | spin_lock(&n->list_lock); |
@@ -3060,7 +2941,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3060 | 2941 | ||
3061 | static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) | 2942 | static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) |
3062 | { | 2943 | { |
3063 | if (cachep == kmem_cache) | 2944 | if (unlikely(cachep == kmem_cache)) |
3064 | return false; | 2945 | return false; |
3065 | 2946 | ||
3066 | return should_failslab(cachep->object_size, flags, cachep->flags); | 2947 | return should_failslab(cachep->object_size, flags, cachep->flags); |
@@ -3169,8 +3050,8 @@ retry: | |||
3169 | nid = zone_to_nid(zone); | 3050 | nid = zone_to_nid(zone); |
3170 | 3051 | ||
3171 | if (cpuset_zone_allowed_hardwall(zone, flags) && | 3052 | if (cpuset_zone_allowed_hardwall(zone, flags) && |
3172 | cache->node[nid] && | 3053 | get_node(cache, nid) && |
3173 | cache->node[nid]->free_objects) { | 3054 | get_node(cache, nid)->free_objects) { |
3174 | obj = ____cache_alloc_node(cache, | 3055 | obj = ____cache_alloc_node(cache, |
3175 | flags | GFP_THISNODE, nid); | 3056 | flags | GFP_THISNODE, nid); |
3176 | if (obj) | 3057 | if (obj) |
@@ -3233,7 +3114,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, | |||
3233 | int x; | 3114 | int x; |
3234 | 3115 | ||
3235 | VM_BUG_ON(nodeid > num_online_nodes()); | 3116 | VM_BUG_ON(nodeid > num_online_nodes()); |
3236 | n = cachep->node[nodeid]; | 3117 | n = get_node(cachep, nodeid); |
3237 | BUG_ON(!n); | 3118 | BUG_ON(!n); |
3238 | 3119 | ||
3239 | retry: | 3120 | retry: |
@@ -3304,7 +3185,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3304 | if (nodeid == NUMA_NO_NODE) | 3185 | if (nodeid == NUMA_NO_NODE) |
3305 | nodeid = slab_node; | 3186 | nodeid = slab_node; |
3306 | 3187 | ||
3307 | if (unlikely(!cachep->node[nodeid])) { | 3188 | if (unlikely(!get_node(cachep, nodeid))) { |
3308 | /* Node not bootstrapped yet */ | 3189 | /* Node not bootstrapped yet */ |
3309 | ptr = fallback_alloc(cachep, flags); | 3190 | ptr = fallback_alloc(cachep, flags); |
3310 | goto out; | 3191 | goto out; |
@@ -3405,12 +3286,13 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) | |||
3405 | 3286 | ||
3406 | /* | 3287 | /* |
3407 | * Caller needs to acquire correct kmem_cache_node's list_lock | 3288 | * Caller needs to acquire correct kmem_cache_node's list_lock |
3289 | * @list: List of detached free slabs should be freed by caller | ||
3408 | */ | 3290 | */ |
3409 | static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, | 3291 | static void free_block(struct kmem_cache *cachep, void **objpp, |
3410 | int node) | 3292 | int nr_objects, int node, struct list_head *list) |
3411 | { | 3293 | { |
3412 | int i; | 3294 | int i; |
3413 | struct kmem_cache_node *n; | 3295 | struct kmem_cache_node *n = get_node(cachep, node); |
3414 | 3296 | ||
3415 | for (i = 0; i < nr_objects; i++) { | 3297 | for (i = 0; i < nr_objects; i++) { |
3416 | void *objp; | 3298 | void *objp; |
@@ -3420,7 +3302,6 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, | |||
3420 | objp = objpp[i]; | 3302 | objp = objpp[i]; |
3421 | 3303 | ||
3422 | page = virt_to_head_page(objp); | 3304 | page = virt_to_head_page(objp); |
3423 | n = cachep->node[node]; | ||
3424 | list_del(&page->lru); | 3305 | list_del(&page->lru); |
3425 | check_spinlock_acquired_node(cachep, node); | 3306 | check_spinlock_acquired_node(cachep, node); |
3426 | slab_put_obj(cachep, page, objp, node); | 3307 | slab_put_obj(cachep, page, objp, node); |
@@ -3431,13 +3312,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, | |||
3431 | if (page->active == 0) { | 3312 | if (page->active == 0) { |
3432 | if (n->free_objects > n->free_limit) { | 3313 | if (n->free_objects > n->free_limit) { |
3433 | n->free_objects -= cachep->num; | 3314 | n->free_objects -= cachep->num; |
3434 | /* No need to drop any previously held | 3315 | list_add_tail(&page->lru, list); |
3435 | * lock here, even if we have a off-slab slab | ||
3436 | * descriptor it is guaranteed to come from | ||
3437 | * a different cache, refer to comments before | ||
3438 | * alloc_slabmgmt. | ||
3439 | */ | ||
3440 | slab_destroy(cachep, page); | ||
3441 | } else { | 3316 | } else { |
3442 | list_add(&page->lru, &n->slabs_free); | 3317 | list_add(&page->lru, &n->slabs_free); |
3443 | } | 3318 | } |
@@ -3456,13 +3331,14 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) | |||
3456 | int batchcount; | 3331 | int batchcount; |
3457 | struct kmem_cache_node *n; | 3332 | struct kmem_cache_node *n; |
3458 | int node = numa_mem_id(); | 3333 | int node = numa_mem_id(); |
3334 | LIST_HEAD(list); | ||
3459 | 3335 | ||
3460 | batchcount = ac->batchcount; | 3336 | batchcount = ac->batchcount; |
3461 | #if DEBUG | 3337 | #if DEBUG |
3462 | BUG_ON(!batchcount || batchcount > ac->avail); | 3338 | BUG_ON(!batchcount || batchcount > ac->avail); |
3463 | #endif | 3339 | #endif |
3464 | check_irq_off(); | 3340 | check_irq_off(); |
3465 | n = cachep->node[node]; | 3341 | n = get_node(cachep, node); |
3466 | spin_lock(&n->list_lock); | 3342 | spin_lock(&n->list_lock); |
3467 | if (n->shared) { | 3343 | if (n->shared) { |
3468 | struct array_cache *shared_array = n->shared; | 3344 | struct array_cache *shared_array = n->shared; |
@@ -3477,7 +3353,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) | |||
3477 | } | 3353 | } |
3478 | } | 3354 | } |
3479 | 3355 | ||
3480 | free_block(cachep, ac->entry, batchcount, node); | 3356 | free_block(cachep, ac->entry, batchcount, node, &list); |
3481 | free_done: | 3357 | free_done: |
3482 | #if STATS | 3358 | #if STATS |
3483 | { | 3359 | { |
@@ -3498,6 +3374,7 @@ free_done: | |||
3498 | } | 3374 | } |
3499 | #endif | 3375 | #endif |
3500 | spin_unlock(&n->list_lock); | 3376 | spin_unlock(&n->list_lock); |
3377 | slabs_destroy(cachep, &list); | ||
3501 | ac->avail -= batchcount; | 3378 | ac->avail -= batchcount; |
3502 | memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); | 3379 | memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); |
3503 | } | 3380 | } |
@@ -3754,7 +3631,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) | |||
3754 | int node; | 3631 | int node; |
3755 | struct kmem_cache_node *n; | 3632 | struct kmem_cache_node *n; |
3756 | struct array_cache *new_shared; | 3633 | struct array_cache *new_shared; |
3757 | struct array_cache **new_alien = NULL; | 3634 | struct alien_cache **new_alien = NULL; |
3758 | 3635 | ||
3759 | for_each_online_node(node) { | 3636 | for_each_online_node(node) { |
3760 | 3637 | ||
@@ -3775,15 +3652,16 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) | |||
3775 | } | 3652 | } |
3776 | } | 3653 | } |
3777 | 3654 | ||
3778 | n = cachep->node[node]; | 3655 | n = get_node(cachep, node); |
3779 | if (n) { | 3656 | if (n) { |
3780 | struct array_cache *shared = n->shared; | 3657 | struct array_cache *shared = n->shared; |
3658 | LIST_HEAD(list); | ||
3781 | 3659 | ||
3782 | spin_lock_irq(&n->list_lock); | 3660 | spin_lock_irq(&n->list_lock); |
3783 | 3661 | ||
3784 | if (shared) | 3662 | if (shared) |
3785 | free_block(cachep, shared->entry, | 3663 | free_block(cachep, shared->entry, |
3786 | shared->avail, node); | 3664 | shared->avail, node, &list); |
3787 | 3665 | ||
3788 | n->shared = new_shared; | 3666 | n->shared = new_shared; |
3789 | if (!n->alien) { | 3667 | if (!n->alien) { |
@@ -3793,6 +3671,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) | |||
3793 | n->free_limit = (1 + nr_cpus_node(node)) * | 3671 | n->free_limit = (1 + nr_cpus_node(node)) * |
3794 | cachep->batchcount + cachep->num; | 3672 | cachep->batchcount + cachep->num; |
3795 | spin_unlock_irq(&n->list_lock); | 3673 | spin_unlock_irq(&n->list_lock); |
3674 | slabs_destroy(cachep, &list); | ||
3796 | kfree(shared); | 3675 | kfree(shared); |
3797 | free_alien_cache(new_alien); | 3676 | free_alien_cache(new_alien); |
3798 | continue; | 3677 | continue; |
@@ -3820,9 +3699,8 @@ fail: | |||
3820 | /* Cache is not active yet. Roll back what we did */ | 3699 | /* Cache is not active yet. Roll back what we did */ |
3821 | node--; | 3700 | node--; |
3822 | while (node >= 0) { | 3701 | while (node >= 0) { |
3823 | if (cachep->node[node]) { | 3702 | n = get_node(cachep, node); |
3824 | n = cachep->node[node]; | 3703 | if (n) { |
3825 | |||
3826 | kfree(n->shared); | 3704 | kfree(n->shared); |
3827 | free_alien_cache(n->alien); | 3705 | free_alien_cache(n->alien); |
3828 | kfree(n); | 3706 | kfree(n); |
@@ -3883,12 +3761,20 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
3883 | cachep->shared = shared; | 3761 | cachep->shared = shared; |
3884 | 3762 | ||
3885 | for_each_online_cpu(i) { | 3763 | for_each_online_cpu(i) { |
3764 | LIST_HEAD(list); | ||
3886 | struct array_cache *ccold = new->new[i]; | 3765 | struct array_cache *ccold = new->new[i]; |
3766 | int node; | ||
3767 | struct kmem_cache_node *n; | ||
3768 | |||
3887 | if (!ccold) | 3769 | if (!ccold) |
3888 | continue; | 3770 | continue; |
3889 | spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); | 3771 | |
3890 | free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); | 3772 | node = cpu_to_mem(i); |
3891 | spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); | 3773 | n = get_node(cachep, node); |
3774 | spin_lock_irq(&n->list_lock); | ||
3775 | free_block(cachep, ccold->entry, ccold->avail, node, &list); | ||
3776 | spin_unlock_irq(&n->list_lock); | ||
3777 | slabs_destroy(cachep, &list); | ||
3892 | kfree(ccold); | 3778 | kfree(ccold); |
3893 | } | 3779 | } |
3894 | kfree(new); | 3780 | kfree(new); |
@@ -3996,6 +3882,7 @@ skip_setup: | |||
3996 | static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, | 3882 | static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, |
3997 | struct array_cache *ac, int force, int node) | 3883 | struct array_cache *ac, int force, int node) |
3998 | { | 3884 | { |
3885 | LIST_HEAD(list); | ||
3999 | int tofree; | 3886 | int tofree; |
4000 | 3887 | ||
4001 | if (!ac || !ac->avail) | 3888 | if (!ac || !ac->avail) |
@@ -4008,12 +3895,13 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, | |||
4008 | tofree = force ? ac->avail : (ac->limit + 4) / 5; | 3895 | tofree = force ? ac->avail : (ac->limit + 4) / 5; |
4009 | if (tofree > ac->avail) | 3896 | if (tofree > ac->avail) |
4010 | tofree = (ac->avail + 1) / 2; | 3897 | tofree = (ac->avail + 1) / 2; |
4011 | free_block(cachep, ac->entry, tofree, node); | 3898 | free_block(cachep, ac->entry, tofree, node, &list); |
4012 | ac->avail -= tofree; | 3899 | ac->avail -= tofree; |
4013 | memmove(ac->entry, &(ac->entry[tofree]), | 3900 | memmove(ac->entry, &(ac->entry[tofree]), |
4014 | sizeof(void *) * ac->avail); | 3901 | sizeof(void *) * ac->avail); |
4015 | } | 3902 | } |
4016 | spin_unlock_irq(&n->list_lock); | 3903 | spin_unlock_irq(&n->list_lock); |
3904 | slabs_destroy(cachep, &list); | ||
4017 | } | 3905 | } |
4018 | } | 3906 | } |
4019 | 3907 | ||
@@ -4048,7 +3936,7 @@ static void cache_reap(struct work_struct *w) | |||
4048 | * have established with reasonable certainty that | 3936 | * have established with reasonable certainty that |
4049 | * we can do some work if the lock was obtained. | 3937 | * we can do some work if the lock was obtained. |
4050 | */ | 3938 | */ |
4051 | n = searchp->node[node]; | 3939 | n = get_node(searchp, node); |
4052 | 3940 | ||
4053 | reap_alien(searchp, n); | 3941 | reap_alien(searchp, n); |
4054 | 3942 | ||
@@ -4100,10 +3988,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) | |||
4100 | 3988 | ||
4101 | active_objs = 0; | 3989 | active_objs = 0; |
4102 | num_slabs = 0; | 3990 | num_slabs = 0; |
4103 | for_each_online_node(node) { | 3991 | for_each_kmem_cache_node(cachep, node, n) { |
4104 | n = cachep->node[node]; | ||
4105 | if (!n) | ||
4106 | continue; | ||
4107 | 3992 | ||
4108 | check_irq_on(); | 3993 | check_irq_on(); |
4109 | spin_lock_irq(&n->list_lock); | 3994 | spin_lock_irq(&n->list_lock); |
@@ -4328,10 +4213,7 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4328 | 4213 | ||
4329 | x[1] = 0; | 4214 | x[1] = 0; |
4330 | 4215 | ||
4331 | for_each_online_node(node) { | 4216 | for_each_kmem_cache_node(cachep, node, n) { |
4332 | n = cachep->node[node]; | ||
4333 | if (!n) | ||
4334 | continue; | ||
4335 | 4217 | ||
4336 | check_irq_on(); | 4218 | check_irq_on(); |
4337 | spin_lock_irq(&n->list_lock); | 4219 | spin_lock_irq(&n->list_lock); |
@@ -256,13 +256,12 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) | |||
256 | return cachep; | 256 | return cachep; |
257 | 257 | ||
258 | pr_err("%s: Wrong slab cache. %s but object is from %s\n", | 258 | pr_err("%s: Wrong slab cache. %s but object is from %s\n", |
259 | __FUNCTION__, cachep->name, s->name); | 259 | __func__, cachep->name, s->name); |
260 | WARN_ON_ONCE(1); | 260 | WARN_ON_ONCE(1); |
261 | return s; | 261 | return s; |
262 | } | 262 | } |
263 | #endif | ||
264 | |||
265 | 263 | ||
264 | #ifndef CONFIG_SLOB | ||
266 | /* | 265 | /* |
267 | * The slab lists for all objects. | 266 | * The slab lists for all objects. |
268 | */ | 267 | */ |
@@ -277,7 +276,7 @@ struct kmem_cache_node { | |||
277 | unsigned int free_limit; | 276 | unsigned int free_limit; |
278 | unsigned int colour_next; /* Per-node cache coloring */ | 277 | unsigned int colour_next; /* Per-node cache coloring */ |
279 | struct array_cache *shared; /* shared per node */ | 278 | struct array_cache *shared; /* shared per node */ |
280 | struct array_cache **alien; /* on other nodes */ | 279 | struct alien_cache **alien; /* on other nodes */ |
281 | unsigned long next_reap; /* updated without locking */ | 280 | unsigned long next_reap; /* updated without locking */ |
282 | int free_touched; /* updated without locking */ | 281 | int free_touched; /* updated without locking */ |
283 | #endif | 282 | #endif |
@@ -294,5 +293,22 @@ struct kmem_cache_node { | |||
294 | 293 | ||
295 | }; | 294 | }; |
296 | 295 | ||
296 | static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | ||
297 | { | ||
298 | return s->node[node]; | ||
299 | } | ||
300 | |||
301 | /* | ||
302 | * Iterator over all nodes. The body will be executed for each node that has | ||
303 | * a kmem_cache_node structure allocated (which is true for all online nodes) | ||
304 | */ | ||
305 | #define for_each_kmem_cache_node(__s, __node, __n) \ | ||
306 | for (__node = 0; __n = get_node(__s, __node), __node < nr_node_ids; __node++) \ | ||
307 | if (__n) | ||
308 | |||
309 | #endif | ||
310 | |||
297 | void *slab_next(struct seq_file *m, void *p, loff_t *pos); | 311 | void *slab_next(struct seq_file *m, void *p, loff_t *pos); |
298 | void slab_stop(struct seq_file *m, void *p); | 312 | void slab_stop(struct seq_file *m, void *p); |
313 | |||
314 | #endif /* MM_SLAB_H */ | ||
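With get_node() and for_each_kmem_cache_node() now defined in mm/slab.h, SLAB and SLUB share one node iterator that skips nodes without an allocated kmem_cache_node. That is what lets the later hunks drop their open-coded NULL checks; roughly (loop bodies illustrative):

	int node;
	struct kmem_cache_node *n;

	/* old pattern, repeated throughout slab.c and slub.c
	 * (some sites used for_each_node_state(node, N_NORMAL_MEMORY)) */
	for_each_online_node(node) {
		n = get_node(cachep, node);
		if (!n)
			continue;
		/* ... work on n ... */
	}

	/* new pattern: the NULL test is folded into the iterator */
	for_each_kmem_cache_node(cachep, node, n) {
		/* ... work on n ... */
	}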
diff --git a/mm/slab_common.c b/mm/slab_common.c index d31c4bacc6a2..d319502b2403 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -19,6 +19,8 @@ | |||
19 | #include <asm/tlbflush.h> | 19 | #include <asm/tlbflush.h> |
20 | #include <asm/page.h> | 20 | #include <asm/page.h> |
21 | #include <linux/memcontrol.h> | 21 | #include <linux/memcontrol.h> |
22 | |||
23 | #define CREATE_TRACE_POINTS | ||
22 | #include <trace/events/kmem.h> | 24 | #include <trace/events/kmem.h> |
23 | 25 | ||
24 | #include "slab.h" | 26 | #include "slab.h" |
@@ -787,3 +789,102 @@ static int __init slab_proc_init(void) | |||
787 | } | 789 | } |
788 | module_init(slab_proc_init); | 790 | module_init(slab_proc_init); |
789 | #endif /* CONFIG_SLABINFO */ | 791 | #endif /* CONFIG_SLABINFO */ |
792 | |||
793 | static __always_inline void *__do_krealloc(const void *p, size_t new_size, | ||
794 | gfp_t flags) | ||
795 | { | ||
796 | void *ret; | ||
797 | size_t ks = 0; | ||
798 | |||
799 | if (p) | ||
800 | ks = ksize(p); | ||
801 | |||
802 | if (ks >= new_size) | ||
803 | return (void *)p; | ||
804 | |||
805 | ret = kmalloc_track_caller(new_size, flags); | ||
806 | if (ret && p) | ||
807 | memcpy(ret, p, ks); | ||
808 | |||
809 | return ret; | ||
810 | } | ||
811 | |||
812 | /** | ||
813 | * __krealloc - like krealloc() but don't free @p. | ||
814 | * @p: object to reallocate memory for. | ||
815 | * @new_size: how many bytes of memory are required. | ||
816 | * @flags: the type of memory to allocate. | ||
817 | * | ||
818 | * This function is like krealloc() except it never frees the originally | ||
819 | * allocated buffer. Use this if you don't want to free the buffer immediately | ||
820 | * like, for example, with RCU. | ||
821 | */ | ||
822 | void *__krealloc(const void *p, size_t new_size, gfp_t flags) | ||
823 | { | ||
824 | if (unlikely(!new_size)) | ||
825 | return ZERO_SIZE_PTR; | ||
826 | |||
827 | return __do_krealloc(p, new_size, flags); | ||
828 | |||
829 | } | ||
830 | EXPORT_SYMBOL(__krealloc); | ||
831 | |||
832 | /** | ||
833 | * krealloc - reallocate memory. The contents will remain unchanged. | ||
834 | * @p: object to reallocate memory for. | ||
835 | * @new_size: how many bytes of memory are required. | ||
836 | * @flags: the type of memory to allocate. | ||
837 | * | ||
838 | * The contents of the object pointed to are preserved up to the | ||
839 | * lesser of the new and old sizes. If @p is %NULL, krealloc() | ||
840 | * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a | ||
841 | * %NULL pointer, the object pointed to is freed. | ||
842 | */ | ||
843 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | ||
844 | { | ||
845 | void *ret; | ||
846 | |||
847 | if (unlikely(!new_size)) { | ||
848 | kfree(p); | ||
849 | return ZERO_SIZE_PTR; | ||
850 | } | ||
851 | |||
852 | ret = __do_krealloc(p, new_size, flags); | ||
853 | if (ret && p != ret) | ||
854 | kfree(p); | ||
855 | |||
856 | return ret; | ||
857 | } | ||
858 | EXPORT_SYMBOL(krealloc); | ||
859 | |||
860 | /** | ||
861 | * kzfree - like kfree but zero memory | ||
862 | * @p: object to free memory of | ||
863 | * | ||
864 | * The memory of the object @p points to is zeroed before freed. | ||
865 | * If @p is %NULL, kzfree() does nothing. | ||
866 | * | ||
867 | * Note: this function zeroes the whole allocated buffer which can be a good | ||
868 | * deal bigger than the requested buffer size passed to kmalloc(). So be | ||
869 | * careful when using this function in performance sensitive code. | ||
870 | */ | ||
871 | void kzfree(const void *p) | ||
872 | { | ||
873 | size_t ks; | ||
874 | void *mem = (void *)p; | ||
875 | |||
876 | if (unlikely(ZERO_OR_NULL_PTR(mem))) | ||
877 | return; | ||
878 | ks = ksize(mem); | ||
879 | memset(mem, 0, ks); | ||
880 | kfree(mem); | ||
881 | } | ||
882 | EXPORT_SYMBOL(kzfree); | ||
883 | |||
884 | /* Tracepoints definitions. */ | ||
885 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); | ||
886 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); | ||
887 | EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); | ||
888 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); | ||
889 | EXPORT_TRACEPOINT_SYMBOL(kfree); | ||
890 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); | ||
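This hunk moves __do_krealloc(), __krealloc(), krealloc() and kzfree() (plus the kmem tracepoint definitions) here from mm/util.c; the mm/util.c hunk below removes the old copies, so only the location changes. A small, hypothetical usage sketch of the two exported entry points, assuming <linux/slab.h>; the function name and sizes are illustrative only:

	#include <linux/slab.h>

	static int grow_and_scrub(void)
	{
		char *buf = kmalloc(64, GFP_KERNEL);
		char *bigger;

		if (!buf)
			return -ENOMEM;

		/* contents of the first 64 bytes are preserved; on failure
		 * the original allocation is left untouched */
		bigger = krealloc(buf, 128, GFP_KERNEL);
		if (!bigger) {
			kfree(buf);
			return -ENOMEM;
		}
		buf = bigger;

		/* ... fill buf with something sensitive ... */

		kzfree(buf);	/* zeroes the whole allocation, then frees it */
		return 0;
	}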
@@ -233,11 +233,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) | |||
233 | * Core slab cache functions | 233 | * Core slab cache functions |
234 | *******************************************************************/ | 234 | *******************************************************************/ |
235 | 235 | ||
236 | static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | ||
237 | { | ||
238 | return s->node[node]; | ||
239 | } | ||
240 | |||
241 | /* Verify that a pointer has an address that is valid within a slab page */ | 236 | /* Verify that a pointer has an address that is valid within a slab page */ |
242 | static inline int check_valid_pointer(struct kmem_cache *s, | 237 | static inline int check_valid_pointer(struct kmem_cache *s, |
243 | struct page *page, const void *object) | 238 | struct page *page, const void *object) |
@@ -288,6 +283,10 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) | |||
288 | for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ | 283 | for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ |
289 | __p += (__s)->size) | 284 | __p += (__s)->size) |
290 | 285 | ||
286 | #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ | ||
287 | for (__p = (__addr), __idx = 1; __idx <= __objects;\ | ||
288 | __p += (__s)->size, __idx++) | ||
289 | |||
291 | /* Determine object index from a given position */ | 290 | /* Determine object index from a given position */ |
292 | static inline int slab_index(void *p, struct kmem_cache *s, void *addr) | 291 | static inline int slab_index(void *p, struct kmem_cache *s, void *addr) |
293 | { | 292 | { |
@@ -382,9 +381,9 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page | |||
382 | defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) | 381 | defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) |
383 | if (s->flags & __CMPXCHG_DOUBLE) { | 382 | if (s->flags & __CMPXCHG_DOUBLE) { |
384 | if (cmpxchg_double(&page->freelist, &page->counters, | 383 | if (cmpxchg_double(&page->freelist, &page->counters, |
385 | freelist_old, counters_old, | 384 | freelist_old, counters_old, |
386 | freelist_new, counters_new)) | 385 | freelist_new, counters_new)) |
387 | return 1; | 386 | return 1; |
388 | } else | 387 | } else |
389 | #endif | 388 | #endif |
390 | { | 389 | { |
@@ -418,9 +417,9 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | |||
418 | defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) | 417 | defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) |
419 | if (s->flags & __CMPXCHG_DOUBLE) { | 418 | if (s->flags & __CMPXCHG_DOUBLE) { |
420 | if (cmpxchg_double(&page->freelist, &page->counters, | 419 | if (cmpxchg_double(&page->freelist, &page->counters, |
421 | freelist_old, counters_old, | 420 | freelist_old, counters_old, |
422 | freelist_new, counters_new)) | 421 | freelist_new, counters_new)) |
423 | return 1; | 422 | return 1; |
424 | } else | 423 | } else |
425 | #endif | 424 | #endif |
426 | { | 425 | { |
@@ -945,60 +944,6 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, | |||
945 | } | 944 | } |
946 | 945 | ||
947 | /* | 946 | /* |
948 | * Hooks for other subsystems that check memory allocations. In a typical | ||
949 | * production configuration these hooks all should produce no code at all. | ||
950 | */ | ||
951 | static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) | ||
952 | { | ||
953 | kmemleak_alloc(ptr, size, 1, flags); | ||
954 | } | ||
955 | |||
956 | static inline void kfree_hook(const void *x) | ||
957 | { | ||
958 | kmemleak_free(x); | ||
959 | } | ||
960 | |||
961 | static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) | ||
962 | { | ||
963 | flags &= gfp_allowed_mask; | ||
964 | lockdep_trace_alloc(flags); | ||
965 | might_sleep_if(flags & __GFP_WAIT); | ||
966 | |||
967 | return should_failslab(s->object_size, flags, s->flags); | ||
968 | } | ||
969 | |||
970 | static inline void slab_post_alloc_hook(struct kmem_cache *s, | ||
971 | gfp_t flags, void *object) | ||
972 | { | ||
973 | flags &= gfp_allowed_mask; | ||
974 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); | ||
975 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); | ||
976 | } | ||
977 | |||
978 | static inline void slab_free_hook(struct kmem_cache *s, void *x) | ||
979 | { | ||
980 | kmemleak_free_recursive(x, s->flags); | ||
981 | |||
982 | /* | ||
983 | * Trouble is that we may no longer disable interrupts in the fast path | ||
984 | * So in order to make the debug calls that expect irqs to be | ||
985 | * disabled we need to disable interrupts temporarily. | ||
986 | */ | ||
987 | #if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) | ||
988 | { | ||
989 | unsigned long flags; | ||
990 | |||
991 | local_irq_save(flags); | ||
992 | kmemcheck_slab_free(s, x, s->object_size); | ||
993 | debug_check_no_locks_freed(x, s->object_size); | ||
994 | local_irq_restore(flags); | ||
995 | } | ||
996 | #endif | ||
997 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | ||
998 | debug_check_no_obj_freed(x, s->object_size); | ||
999 | } | ||
1000 | |||
1001 | /* | ||
1002 | * Tracking of fully allocated slabs for debugging purposes. | 947 | * Tracking of fully allocated slabs for debugging purposes. |
1003 | */ | 948 | */ |
1004 | static void add_full(struct kmem_cache *s, | 949 | static void add_full(struct kmem_cache *s, |
@@ -1282,6 +1227,12 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, | |||
1282 | static inline void dec_slabs_node(struct kmem_cache *s, int node, | 1227 | static inline void dec_slabs_node(struct kmem_cache *s, int node, |
1283 | int objects) {} | 1228 | int objects) {} |
1284 | 1229 | ||
1230 | #endif /* CONFIG_SLUB_DEBUG */ | ||
1231 | |||
1232 | /* | ||
1233 | * Hooks for other subsystems that check memory allocations. In a typical | ||
1234 | * production configuration these hooks all should produce no code at all. | ||
1235 | */ | ||
1285 | static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) | 1236 | static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) |
1286 | { | 1237 | { |
1287 | kmemleak_alloc(ptr, size, 1, flags); | 1238 | kmemleak_alloc(ptr, size, 1, flags); |
@@ -1293,21 +1244,44 @@ static inline void kfree_hook(const void *x) | |||
1293 | } | 1244 | } |
1294 | 1245 | ||
1295 | static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) | 1246 | static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) |
1296 | { return 0; } | 1247 | { |
1248 | flags &= gfp_allowed_mask; | ||
1249 | lockdep_trace_alloc(flags); | ||
1250 | might_sleep_if(flags & __GFP_WAIT); | ||
1251 | |||
1252 | return should_failslab(s->object_size, flags, s->flags); | ||
1253 | } | ||
1297 | 1254 | ||
1298 | static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, | 1255 | static inline void slab_post_alloc_hook(struct kmem_cache *s, |
1299 | void *object) | 1256 | gfp_t flags, void *object) |
1300 | { | 1257 | { |
1301 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, | 1258 | flags &= gfp_allowed_mask; |
1302 | flags & gfp_allowed_mask); | 1259 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); |
1260 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); | ||
1303 | } | 1261 | } |
1304 | 1262 | ||
1305 | static inline void slab_free_hook(struct kmem_cache *s, void *x) | 1263 | static inline void slab_free_hook(struct kmem_cache *s, void *x) |
1306 | { | 1264 | { |
1307 | kmemleak_free_recursive(x, s->flags); | 1265 | kmemleak_free_recursive(x, s->flags); |
1308 | } | ||
1309 | 1266 | ||
1310 | #endif /* CONFIG_SLUB_DEBUG */ | 1267 | /* |
1268 | * Trouble is that we may no longer disable interrupts in the fast path | ||
1269 | * So in order to make the debug calls that expect irqs to be | ||
1270 | * disabled we need to disable interrupts temporarily. | ||
1271 | */ | ||
1272 | #if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) | ||
1273 | { | ||
1274 | unsigned long flags; | ||
1275 | |||
1276 | local_irq_save(flags); | ||
1277 | kmemcheck_slab_free(s, x, s->object_size); | ||
1278 | debug_check_no_locks_freed(x, s->object_size); | ||
1279 | local_irq_restore(flags); | ||
1280 | } | ||
1281 | #endif | ||
1282 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | ||
1283 | debug_check_no_obj_freed(x, s->object_size); | ||
1284 | } | ||
1311 | 1285 | ||
1312 | /* | 1286 | /* |
1313 | * Slab allocation and freeing | 1287 | * Slab allocation and freeing |
@@ -1409,9 +1383,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1409 | { | 1383 | { |
1410 | struct page *page; | 1384 | struct page *page; |
1411 | void *start; | 1385 | void *start; |
1412 | void *last; | ||
1413 | void *p; | 1386 | void *p; |
1414 | int order; | 1387 | int order; |
1388 | int idx; | ||
1415 | 1389 | ||
1416 | BUG_ON(flags & GFP_SLAB_BUG_MASK); | 1390 | BUG_ON(flags & GFP_SLAB_BUG_MASK); |
1417 | 1391 | ||
@@ -1432,14 +1406,13 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1432 | if (unlikely(s->flags & SLAB_POISON)) | 1406 | if (unlikely(s->flags & SLAB_POISON)) |
1433 | memset(start, POISON_INUSE, PAGE_SIZE << order); | 1407 | memset(start, POISON_INUSE, PAGE_SIZE << order); |
1434 | 1408 | ||
1435 | last = start; | 1409 | for_each_object_idx(p, idx, s, start, page->objects) { |
1436 | for_each_object(p, s, start, page->objects) { | 1410 | setup_object(s, page, p); |
1437 | setup_object(s, page, last); | 1411 | if (likely(idx < page->objects)) |
1438 | set_freepointer(s, last, p); | 1412 | set_freepointer(s, p, p + s->size); |
1439 | last = p; | 1413 | else |
1414 | set_freepointer(s, p, NULL); | ||
1440 | } | 1415 | } |
1441 | setup_object(s, page, last); | ||
1442 | set_freepointer(s, last, NULL); | ||
1443 | 1416 | ||
1444 | page->freelist = start; | 1417 | page->freelist = start; |
1445 | page->inuse = page->objects; | 1418 | page->inuse = page->objects; |
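new_slab() now chains the initial freelist with the new for_each_object_idx() iterator: each object's free pointer is set to the following object, and the last one, detected via the 1-based index, terminates the list with NULL, replacing the old last-pointer bookkeeping. The macro expands to roughly the loop below, so a slab holding three objects of size s at start ends up with the list start -> start + s -> start + 2*s -> NULL and page->freelist = start:

	/* for_each_object_idx(p, idx, s, start, page->objects) is essentially: */
	for (p = start, idx = 1; idx <= page->objects; p += s->size, idx++) {
		setup_object(s, page, p);
		if (idx < page->objects)
			set_freepointer(s, p, p + s->size);	/* point at the next object */
		else
			set_freepointer(s, p, NULL);		/* last object ends the list */
	}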
@@ -2162,6 +2135,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | |||
2162 | static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, | 2135 | static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, |
2163 | DEFAULT_RATELIMIT_BURST); | 2136 | DEFAULT_RATELIMIT_BURST); |
2164 | int node; | 2137 | int node; |
2138 | struct kmem_cache_node *n; | ||
2165 | 2139 | ||
2166 | if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) | 2140 | if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) |
2167 | return; | 2141 | return; |
@@ -2176,15 +2150,11 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | |||
2176 | pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", | 2150 | pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", |
2177 | s->name); | 2151 | s->name); |
2178 | 2152 | ||
2179 | for_each_online_node(node) { | 2153 | for_each_kmem_cache_node(s, node, n) { |
2180 | struct kmem_cache_node *n = get_node(s, node); | ||
2181 | unsigned long nr_slabs; | 2154 | unsigned long nr_slabs; |
2182 | unsigned long nr_objs; | 2155 | unsigned long nr_objs; |
2183 | unsigned long nr_free; | 2156 | unsigned long nr_free; |
2184 | 2157 | ||
2185 | if (!n) | ||
2186 | continue; | ||
2187 | |||
2188 | nr_free = count_partial(n, count_free); | 2158 | nr_free = count_partial(n, count_free); |
2189 | nr_slabs = node_nr_slabs(n); | 2159 | nr_slabs = node_nr_slabs(n); |
2190 | nr_objs = node_nr_objs(n); | 2160 | nr_objs = node_nr_objs(n); |
@@ -2928,13 +2898,10 @@ static void early_kmem_cache_node_alloc(int node) | |||
2928 | static void free_kmem_cache_nodes(struct kmem_cache *s) | 2898 | static void free_kmem_cache_nodes(struct kmem_cache *s) |
2929 | { | 2899 | { |
2930 | int node; | 2900 | int node; |
2901 | struct kmem_cache_node *n; | ||
2931 | 2902 | ||
2932 | for_each_node_state(node, N_NORMAL_MEMORY) { | 2903 | for_each_kmem_cache_node(s, node, n) { |
2933 | struct kmem_cache_node *n = s->node[node]; | 2904 | kmem_cache_free(kmem_cache_node, n); |
2934 | |||
2935 | if (n) | ||
2936 | kmem_cache_free(kmem_cache_node, n); | ||
2937 | |||
2938 | s->node[node] = NULL; | 2905 | s->node[node] = NULL; |
2939 | } | 2906 | } |
2940 | } | 2907 | } |
@@ -3222,12 +3189,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) | |||
3222 | static inline int kmem_cache_close(struct kmem_cache *s) | 3189 | static inline int kmem_cache_close(struct kmem_cache *s) |
3223 | { | 3190 | { |
3224 | int node; | 3191 | int node; |
3192 | struct kmem_cache_node *n; | ||
3225 | 3193 | ||
3226 | flush_all(s); | 3194 | flush_all(s); |
3227 | /* Attempt to free all objects */ | 3195 | /* Attempt to free all objects */ |
3228 | for_each_node_state(node, N_NORMAL_MEMORY) { | 3196 | for_each_kmem_cache_node(s, node, n) { |
3229 | struct kmem_cache_node *n = get_node(s, node); | ||
3230 | |||
3231 | free_partial(s, n); | 3197 | free_partial(s, n); |
3232 | if (n->nr_partial || slabs_node(s, node)) | 3198 | if (n->nr_partial || slabs_node(s, node)) |
3233 | return 1; | 3199 | return 1; |
@@ -3412,9 +3378,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) | |||
3412 | return -ENOMEM; | 3378 | return -ENOMEM; |
3413 | 3379 | ||
3414 | flush_all(s); | 3380 | flush_all(s); |
3415 | for_each_node_state(node, N_NORMAL_MEMORY) { | 3381 | for_each_kmem_cache_node(s, node, n) { |
3416 | n = get_node(s, node); | ||
3417 | |||
3418 | if (!n->nr_partial) | 3382 | if (!n->nr_partial) |
3419 | continue; | 3383 | continue; |
3420 | 3384 | ||
@@ -3586,6 +3550,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) | |||
3586 | { | 3550 | { |
3587 | int node; | 3551 | int node; |
3588 | struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); | 3552 | struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); |
3553 | struct kmem_cache_node *n; | ||
3589 | 3554 | ||
3590 | memcpy(s, static_cache, kmem_cache->object_size); | 3555 | memcpy(s, static_cache, kmem_cache->object_size); |
3591 | 3556 | ||
@@ -3595,19 +3560,16 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) | |||
3595 | * IPIs around. | 3560 | * IPIs around. |
3596 | */ | 3561 | */ |
3597 | __flush_cpu_slab(s, smp_processor_id()); | 3562 | __flush_cpu_slab(s, smp_processor_id()); |
3598 | for_each_node_state(node, N_NORMAL_MEMORY) { | 3563 | for_each_kmem_cache_node(s, node, n) { |
3599 | struct kmem_cache_node *n = get_node(s, node); | ||
3600 | struct page *p; | 3564 | struct page *p; |
3601 | 3565 | ||
3602 | if (n) { | 3566 | list_for_each_entry(p, &n->partial, lru) |
3603 | list_for_each_entry(p, &n->partial, lru) | 3567 | p->slab_cache = s; |
3604 | p->slab_cache = s; | ||
3605 | 3568 | ||
3606 | #ifdef CONFIG_SLUB_DEBUG | 3569 | #ifdef CONFIG_SLUB_DEBUG |
3607 | list_for_each_entry(p, &n->full, lru) | 3570 | list_for_each_entry(p, &n->full, lru) |
3608 | p->slab_cache = s; | 3571 | p->slab_cache = s; |
3609 | #endif | 3572 | #endif |
3610 | } | ||
3611 | } | 3573 | } |
3612 | list_add(&s->list, &slab_caches); | 3574 | list_add(&s->list, &slab_caches); |
3613 | return s; | 3575 | return s; |
@@ -3960,16 +3922,14 @@ static long validate_slab_cache(struct kmem_cache *s) | |||
3960 | unsigned long count = 0; | 3922 | unsigned long count = 0; |
3961 | unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * | 3923 | unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * |
3962 | sizeof(unsigned long), GFP_KERNEL); | 3924 | sizeof(unsigned long), GFP_KERNEL); |
3925 | struct kmem_cache_node *n; | ||
3963 | 3926 | ||
3964 | if (!map) | 3927 | if (!map) |
3965 | return -ENOMEM; | 3928 | return -ENOMEM; |
3966 | 3929 | ||
3967 | flush_all(s); | 3930 | flush_all(s); |
3968 | for_each_node_state(node, N_NORMAL_MEMORY) { | 3931 | for_each_kmem_cache_node(s, node, n) |
3969 | struct kmem_cache_node *n = get_node(s, node); | ||
3970 | |||
3971 | count += validate_slab_node(s, n, map); | 3932 | count += validate_slab_node(s, n, map); |
3972 | } | ||
3973 | kfree(map); | 3933 | kfree(map); |
3974 | return count; | 3934 | return count; |
3975 | } | 3935 | } |
@@ -4123,6 +4083,7 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
4123 | int node; | 4083 | int node; |
4124 | unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * | 4084 | unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * |
4125 | sizeof(unsigned long), GFP_KERNEL); | 4085 | sizeof(unsigned long), GFP_KERNEL); |
4086 | struct kmem_cache_node *n; | ||
4126 | 4087 | ||
4127 | if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), | 4088 | if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), |
4128 | GFP_TEMPORARY)) { | 4089 | GFP_TEMPORARY)) { |
@@ -4132,8 +4093,7 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
4132 | /* Push back cpu slabs */ | 4093 | /* Push back cpu slabs */ |
4133 | flush_all(s); | 4094 | flush_all(s); |
4134 | 4095 | ||
4135 | for_each_node_state(node, N_NORMAL_MEMORY) { | 4096 | for_each_kmem_cache_node(s, node, n) { |
4136 | struct kmem_cache_node *n = get_node(s, node); | ||
4137 | unsigned long flags; | 4097 | unsigned long flags; |
4138 | struct page *page; | 4098 | struct page *page; |
4139 | 4099 | ||
@@ -4205,7 +4165,7 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
4205 | #endif | 4165 | #endif |
4206 | 4166 | ||
4207 | #ifdef SLUB_RESILIENCY_TEST | 4167 | #ifdef SLUB_RESILIENCY_TEST |
4208 | static void resiliency_test(void) | 4168 | static void __init resiliency_test(void) |
4209 | { | 4169 | { |
4210 | u8 *p; | 4170 | u8 *p; |
4211 | 4171 | ||
@@ -4332,8 +4292,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4332 | get_online_mems(); | 4292 | get_online_mems(); |
4333 | #ifdef CONFIG_SLUB_DEBUG | 4293 | #ifdef CONFIG_SLUB_DEBUG |
4334 | if (flags & SO_ALL) { | 4294 | if (flags & SO_ALL) { |
4335 | for_each_node_state(node, N_NORMAL_MEMORY) { | 4295 | struct kmem_cache_node *n; |
4336 | struct kmem_cache_node *n = get_node(s, node); | 4296 | |
4297 | for_each_kmem_cache_node(s, node, n) { | ||
4337 | 4298 | ||
4338 | if (flags & SO_TOTAL) | 4299 | if (flags & SO_TOTAL) |
4339 | x = atomic_long_read(&n->total_objects); | 4300 | x = atomic_long_read(&n->total_objects); |
@@ -4349,9 +4310,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4349 | } else | 4310 | } else |
4350 | #endif | 4311 | #endif |
4351 | if (flags & SO_PARTIAL) { | 4312 | if (flags & SO_PARTIAL) { |
4352 | for_each_node_state(node, N_NORMAL_MEMORY) { | 4313 | struct kmem_cache_node *n; |
4353 | struct kmem_cache_node *n = get_node(s, node); | ||
4354 | 4314 | ||
4315 | for_each_kmem_cache_node(s, node, n) { | ||
4355 | if (flags & SO_TOTAL) | 4316 | if (flags & SO_TOTAL) |
4356 | x = count_partial(n, count_total); | 4317 | x = count_partial(n, count_total); |
4357 | else if (flags & SO_OBJECTS) | 4318 | else if (flags & SO_OBJECTS) |
@@ -4364,7 +4325,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4364 | } | 4325 | } |
4365 | x = sprintf(buf, "%lu", total); | 4326 | x = sprintf(buf, "%lu", total); |
4366 | #ifdef CONFIG_NUMA | 4327 | #ifdef CONFIG_NUMA |
4367 | for_each_node_state(node, N_NORMAL_MEMORY) | 4328 | for (node = 0; node < nr_node_ids; node++) |
4368 | if (nodes[node]) | 4329 | if (nodes[node]) |
4369 | x += sprintf(buf + x, " N%d=%lu", | 4330 | x += sprintf(buf + x, " N%d=%lu", |
4370 | node, nodes[node]); | 4331 | node, nodes[node]); |
@@ -4378,16 +4339,12 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4378 | static int any_slab_objects(struct kmem_cache *s) | 4339 | static int any_slab_objects(struct kmem_cache *s) |
4379 | { | 4340 | { |
4380 | int node; | 4341 | int node; |
4342 | struct kmem_cache_node *n; | ||
4381 | 4343 | ||
4382 | for_each_online_node(node) { | 4344 | for_each_kmem_cache_node(s, node, n) |
4383 | struct kmem_cache_node *n = get_node(s, node); | ||
4384 | |||
4385 | if (!n) | ||
4386 | continue; | ||
4387 | |||
4388 | if (atomic_long_read(&n->total_objects)) | 4345 | if (atomic_long_read(&n->total_objects)) |
4389 | return 1; | 4346 | return 1; |
4390 | } | 4347 | |
4391 | return 0; | 4348 | return 0; |
4392 | } | 4349 | } |
4393 | #endif | 4350 | #endif |
@@ -4509,7 +4466,7 @@ SLAB_ATTR_RO(ctor); | |||
4509 | 4466 | ||
4510 | static ssize_t aliases_show(struct kmem_cache *s, char *buf) | 4467 | static ssize_t aliases_show(struct kmem_cache *s, char *buf) |
4511 | { | 4468 | { |
4512 | return sprintf(buf, "%d\n", s->refcount - 1); | 4469 | return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1); |
4513 | } | 4470 | } |
4514 | SLAB_ATTR_RO(aliases); | 4471 | SLAB_ATTR_RO(aliases); |
4515 | 4472 | ||
@@ -5171,12 +5128,6 @@ static char *create_unique_id(struct kmem_cache *s) | |||
5171 | *p++ = '-'; | 5128 | *p++ = '-'; |
5172 | p += sprintf(p, "%07d", s->size); | 5129 | p += sprintf(p, "%07d", s->size); |
5173 | 5130 | ||
5174 | #ifdef CONFIG_MEMCG_KMEM | ||
5175 | if (!is_root_cache(s)) | ||
5176 | p += sprintf(p, "-%08d", | ||
5177 | memcg_cache_id(s->memcg_params->memcg)); | ||
5178 | #endif | ||
5179 | |||
5180 | BUG_ON(p > name + ID_STR_LENGTH - 1); | 5131 | BUG_ON(p > name + ID_STR_LENGTH - 1); |
5181 | return name; | 5132 | return name; |
5182 | } | 5133 | } |
@@ -5342,13 +5293,9 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) | |||
5342 | unsigned long nr_objs = 0; | 5293 | unsigned long nr_objs = 0; |
5343 | unsigned long nr_free = 0; | 5294 | unsigned long nr_free = 0; |
5344 | int node; | 5295 | int node; |
5296 | struct kmem_cache_node *n; | ||
5345 | 5297 | ||
5346 | for_each_online_node(node) { | 5298 | for_each_kmem_cache_node(s, node, n) { |
5347 | struct kmem_cache_node *n = get_node(s, node); | ||
5348 | |||
5349 | if (!n) | ||
5350 | continue; | ||
5351 | |||
5352 | nr_slabs += node_nr_slabs(n); | 5299 | nr_slabs += node_nr_slabs(n); |
5353 | nr_objs += node_nr_objs(n); | 5300 | nr_objs += node_nr_objs(n); |
5354 | nr_free += count_partial(n, count_free); | 5301 | nr_free += count_partial(n, count_free); |
@@ -501,7 +501,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, | |||
501 | SetPageActive(page); | 501 | SetPageActive(page); |
502 | lru += LRU_ACTIVE; | 502 | lru += LRU_ACTIVE; |
503 | add_page_to_lru_list(page, lruvec, lru); | 503 | add_page_to_lru_list(page, lruvec, lru); |
504 | trace_mm_lru_activate(page, page_to_pfn(page)); | 504 | trace_mm_lru_activate(page); |
505 | 505 | ||
506 | __count_vm_event(PGACTIVATE); | 506 | __count_vm_event(PGACTIVATE); |
507 | update_page_reclaim_stat(lruvec, file, 1); | 507 | update_page_reclaim_stat(lruvec, file, 1); |
@@ -589,6 +589,9 @@ static void __lru_cache_activate_page(struct page *page) | |||
589 | * inactive,unreferenced -> inactive,referenced | 589 | * inactive,unreferenced -> inactive,referenced |
590 | * inactive,referenced -> active,unreferenced | 590 | * inactive,referenced -> active,unreferenced |
591 | * active,unreferenced -> active,referenced | 591 | * active,unreferenced -> active,referenced |
592 | * | ||
593 | * When a newly allocated page is not yet visible, so safe for non-atomic ops, | ||
594 | * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). | ||
592 | */ | 595 | */ |
593 | void mark_page_accessed(struct page *page) | 596 | void mark_page_accessed(struct page *page) |
594 | { | 597 | { |
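The comment added above is what justifies removing init_page_accessed() in the next hunk: while a freshly allocated page is still invisible to other contexts, the non-atomic flag op is safe, so callers simply open-code it. A sketch of the substitution, with the page-cache calls shown only as illustrative context:

	struct page *page = page_cache_alloc(mapping);

	if (page) {
		/* page not yet published anywhere, so the non-atomic op is
		 * fine; this is what init_page_accessed() used to do */
		__SetPageReferenced(page);
		/* ... add_to_page_cache_lru(page, mapping, index, gfp) ... */
	}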
@@ -614,17 +617,6 @@ void mark_page_accessed(struct page *page) | |||
614 | } | 617 | } |
615 | EXPORT_SYMBOL(mark_page_accessed); | 618 | EXPORT_SYMBOL(mark_page_accessed); |
616 | 619 | ||
617 | /* | ||
618 | * Used to mark_page_accessed(page) that is not visible yet and when it is | ||
619 | * still safe to use non-atomic ops | ||
620 | */ | ||
621 | void init_page_accessed(struct page *page) | ||
622 | { | ||
623 | if (!PageReferenced(page)) | ||
624 | __SetPageReferenced(page); | ||
625 | } | ||
626 | EXPORT_SYMBOL(init_page_accessed); | ||
627 | |||
628 | static void __lru_cache_add(struct page *page) | 620 | static void __lru_cache_add(struct page *page) |
629 | { | 621 | { |
630 | struct pagevec *pvec = &get_cpu_var(lru_add_pvec); | 622 | struct pagevec *pvec = &get_cpu_var(lru_add_pvec); |
@@ -996,7 +988,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, | |||
996 | SetPageLRU(page); | 988 | SetPageLRU(page); |
997 | add_page_to_lru_list(page, lruvec, lru); | 989 | add_page_to_lru_list(page, lruvec, lru); |
998 | update_page_reclaim_stat(lruvec, file, active); | 990 | update_page_reclaim_stat(lruvec, file, active); |
999 | trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); | 991 | trace_mm_lru_insertion(page, lru); |
1000 | } | 992 | } |
1001 | 993 | ||
1002 | /* | 994 | /* |
@@ -16,9 +16,6 @@ | |||
16 | 16 | ||
17 | #include "internal.h" | 17 | #include "internal.h" |
18 | 18 | ||
19 | #define CREATE_TRACE_POINTS | ||
20 | #include <trace/events/kmem.h> | ||
21 | |||
22 | /** | 19 | /** |
23 | * kstrdup - allocate space for and copy an existing string | 20 | * kstrdup - allocate space for and copy an existing string |
24 | * @s: the string to duplicate | 21 | * @s: the string to duplicate |
@@ -112,97 +109,6 @@ void *memdup_user(const void __user *src, size_t len) | |||
112 | } | 109 | } |
113 | EXPORT_SYMBOL(memdup_user); | 110 | EXPORT_SYMBOL(memdup_user); |
114 | 111 | ||
115 | static __always_inline void *__do_krealloc(const void *p, size_t new_size, | ||
116 | gfp_t flags) | ||
117 | { | ||
118 | void *ret; | ||
119 | size_t ks = 0; | ||
120 | |||
121 | if (p) | ||
122 | ks = ksize(p); | ||
123 | |||
124 | if (ks >= new_size) | ||
125 | return (void *)p; | ||
126 | |||
127 | ret = kmalloc_track_caller(new_size, flags); | ||
128 | if (ret && p) | ||
129 | memcpy(ret, p, ks); | ||
130 | |||
131 | return ret; | ||
132 | } | ||
133 | |||
134 | /** | ||
135 | * __krealloc - like krealloc() but don't free @p. | ||
136 | * @p: object to reallocate memory for. | ||
137 | * @new_size: how many bytes of memory are required. | ||
138 | * @flags: the type of memory to allocate. | ||
139 | * | ||
140 | * This function is like krealloc() except it never frees the originally | ||
141 | * allocated buffer. Use this if you don't want to free the buffer immediately | ||
142 | * like, for example, with RCU. | ||
143 | */ | ||
144 | void *__krealloc(const void *p, size_t new_size, gfp_t flags) | ||
145 | { | ||
146 | if (unlikely(!new_size)) | ||
147 | return ZERO_SIZE_PTR; | ||
148 | |||
149 | return __do_krealloc(p, new_size, flags); | ||
150 | |||
151 | } | ||
152 | EXPORT_SYMBOL(__krealloc); | ||
153 | |||
154 | /** | ||
155 | * krealloc - reallocate memory. The contents will remain unchanged. | ||
156 | * @p: object to reallocate memory for. | ||
157 | * @new_size: how many bytes of memory are required. | ||
158 | * @flags: the type of memory to allocate. | ||
159 | * | ||
160 | * The contents of the object pointed to are preserved up to the | ||
161 | * lesser of the new and old sizes. If @p is %NULL, krealloc() | ||
162 | * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a | ||
163 | * %NULL pointer, the object pointed to is freed. | ||
164 | */ | ||
165 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | ||
166 | { | ||
167 | void *ret; | ||
168 | |||
169 | if (unlikely(!new_size)) { | ||
170 | kfree(p); | ||
171 | return ZERO_SIZE_PTR; | ||
172 | } | ||
173 | |||
174 | ret = __do_krealloc(p, new_size, flags); | ||
175 | if (ret && p != ret) | ||
176 | kfree(p); | ||
177 | |||
178 | return ret; | ||
179 | } | ||
180 | EXPORT_SYMBOL(krealloc); | ||
181 | |||
182 | /** | ||
183 | * kzfree - like kfree but zero memory | ||
184 | * @p: object to free memory of | ||
185 | * | ||
186 | * The memory of the object @p points to is zeroed before freed. | ||
187 | * If @p is %NULL, kzfree() does nothing. | ||
188 | * | ||
189 | * Note: this function zeroes the whole allocated buffer which can be a good | ||
190 | * deal bigger than the requested buffer size passed to kmalloc(). So be | ||
191 | * careful when using this function in performance sensitive code. | ||
192 | */ | ||
193 | void kzfree(const void *p) | ||
194 | { | ||
195 | size_t ks; | ||
196 | void *mem = (void *)p; | ||
197 | |||
198 | if (unlikely(ZERO_OR_NULL_PTR(mem))) | ||
199 | return; | ||
200 | ks = ksize(mem); | ||
201 | memset(mem, 0, ks); | ||
202 | kfree(mem); | ||
203 | } | ||
204 | EXPORT_SYMBOL(kzfree); | ||
205 | |||
206 | /* | 112 | /* |
207 | * strndup_user - duplicate an existing string from user space | 113 | * strndup_user - duplicate an existing string from user space |
208 | * @s: The string to duplicate | 114 | * @s: The string to duplicate |
@@ -504,11 +410,3 @@ out_mm: | |||
504 | out: | 410 | out: |
505 | return res; | 411 | return res; |
506 | } | 412 | } |
507 | |||
508 | /* Tracepoints definitions. */ | ||
509 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); | ||
510 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); | ||
511 | EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); | ||
512 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); | ||
513 | EXPORT_TRACEPOINT_SYMBOL(kfree); | ||
514 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); | ||
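The hunk above drops krealloc(), __krealloc() and kzfree() from mm/util.c; judging by the diffstat they move into the slab code (mm/slab_common.c) elsewhere in this series rather than going away, so their behaviour is unchanged. As a reminder of the documented semantics, here is a minimal caller sketch; the struct and the scratch_* helpers are illustrative only and not part of the patch.

#include <linux/slab.h>
#include <linux/types.h>
#include <linux/errno.h>

struct scratch {
        u8 *buf;
        size_t len;
};

static int scratch_resize(struct scratch *s, size_t new_len)
{
        u8 *p;

        /*
         * With s->buf == NULL this behaves like kmalloc(); otherwise the
         * old contents are preserved up to min(old size, new_len).
         */
        p = krealloc(s->buf, new_len, GFP_KERNEL);
        if (!p)
                return -ENOMEM;         /* the old buffer is left intact */

        s->buf = p;
        s->len = new_len;
        return 0;
}

static void scratch_drop(struct scratch *s)
{
        /*
         * kzfree() zeroes the whole underlying allocation before freeing
         * it, which may be larger than the size originally requested.
         */
        kzfree(s->buf);
        s->buf = NULL;
        s->len = 0;
}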
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f64632b67196..2b0aa5486092 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -1270,19 +1270,15 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) | |||
1270 | } | 1270 | } |
1271 | EXPORT_SYMBOL_GPL(unmap_kernel_range); | 1271 | EXPORT_SYMBOL_GPL(unmap_kernel_range); |
1272 | 1272 | ||
1273 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | 1273 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) |
1274 | { | 1274 | { |
1275 | unsigned long addr = (unsigned long)area->addr; | 1275 | unsigned long addr = (unsigned long)area->addr; |
1276 | unsigned long end = addr + get_vm_area_size(area); | 1276 | unsigned long end = addr + get_vm_area_size(area); |
1277 | int err; | 1277 | int err; |
1278 | 1278 | ||
1279 | err = vmap_page_range(addr, end, prot, *pages); | 1279 | err = vmap_page_range(addr, end, prot, pages); |
1280 | if (err > 0) { | ||
1281 | *pages += err; | ||
1282 | err = 0; | ||
1283 | } | ||
1284 | 1280 | ||
1285 | return err; | 1281 | return err > 0 ? 0 : err; |
1286 | } | 1282 | } |
1287 | EXPORT_SYMBOL_GPL(map_vm_area); | 1283 | EXPORT_SYMBOL_GPL(map_vm_area); |
1288 | 1284 | ||
@@ -1548,7 +1544,7 @@ void *vmap(struct page **pages, unsigned int count, | |||
1548 | if (!area) | 1544 | if (!area) |
1549 | return NULL; | 1545 | return NULL; |
1550 | 1546 | ||
1551 | if (map_vm_area(area, prot, &pages)) { | 1547 | if (map_vm_area(area, prot, pages)) { |
1552 | vunmap(area->addr); | 1548 | vunmap(area->addr); |
1553 | return NULL; | 1549 | return NULL; |
1554 | } | 1550 | } |
@@ -1566,7 +1562,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1566 | const int order = 0; | 1562 | const int order = 0; |
1567 | struct page **pages; | 1563 | struct page **pages; |
1568 | unsigned int nr_pages, array_size, i; | 1564 | unsigned int nr_pages, array_size, i; |
1569 | gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; | 1565 | const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; |
1566 | const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; | ||
1570 | 1567 | ||
1571 | nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; | 1568 | nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; |
1572 | array_size = (nr_pages * sizeof(struct page *)); | 1569 | array_size = (nr_pages * sizeof(struct page *)); |
@@ -1589,12 +1586,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1589 | 1586 | ||
1590 | for (i = 0; i < area->nr_pages; i++) { | 1587 | for (i = 0; i < area->nr_pages; i++) { |
1591 | struct page *page; | 1588 | struct page *page; |
1592 | gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; | ||
1593 | 1589 | ||
1594 | if (node == NUMA_NO_NODE) | 1590 | if (node == NUMA_NO_NODE) |
1595 | page = alloc_page(tmp_mask); | 1591 | page = alloc_page(alloc_mask); |
1596 | else | 1592 | else |
1597 | page = alloc_pages_node(node, tmp_mask, order); | 1593 | page = alloc_pages_node(node, alloc_mask, order); |
1598 | 1594 | ||
1599 | if (unlikely(!page)) { | 1595 | if (unlikely(!page)) { |
1600 | /* Successfully allocated i pages, free them in __vunmap() */ | 1596 | /* Successfully allocated i pages, free them in __vunmap() */ |
@@ -1602,9 +1598,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1602 | goto fail; | 1598 | goto fail; |
1603 | } | 1599 | } |
1604 | area->pages[i] = page; | 1600 | area->pages[i] = page; |
1601 | if (gfp_mask & __GFP_WAIT) | ||
1602 | cond_resched(); | ||
1605 | } | 1603 | } |
1606 | 1604 | ||
1607 | if (map_vm_area(area, prot, &pages)) | 1605 | if (map_vm_area(area, prot, pages)) |
1608 | goto fail; | 1606 | goto fail; |
1609 | return area->addr; | 1607 | return area->addr; |
1610 | 1608 | ||
@@ -2690,14 +2688,14 @@ void get_vmalloc_info(struct vmalloc_info *vmi) | |||
2690 | 2688 | ||
2691 | prev_end = VMALLOC_START; | 2689 | prev_end = VMALLOC_START; |
2692 | 2690 | ||
2693 | spin_lock(&vmap_area_lock); | 2691 | rcu_read_lock(); |
2694 | 2692 | ||
2695 | if (list_empty(&vmap_area_list)) { | 2693 | if (list_empty(&vmap_area_list)) { |
2696 | vmi->largest_chunk = VMALLOC_TOTAL; | 2694 | vmi->largest_chunk = VMALLOC_TOTAL; |
2697 | goto out; | 2695 | goto out; |
2698 | } | 2696 | } |
2699 | 2697 | ||
2700 | list_for_each_entry(va, &vmap_area_list, list) { | 2698 | list_for_each_entry_rcu(va, &vmap_area_list, list) { |
2701 | unsigned long addr = va->va_start; | 2699 | unsigned long addr = va->va_start; |
2702 | 2700 | ||
2703 | /* | 2701 | /* |
@@ -2724,7 +2722,7 @@ void get_vmalloc_info(struct vmalloc_info *vmi) | |||
2724 | vmi->largest_chunk = VMALLOC_END - prev_end; | 2722 | vmi->largest_chunk = VMALLOC_END - prev_end; |
2725 | 2723 | ||
2726 | out: | 2724 | out: |
2727 | spin_unlock(&vmap_area_lock); | 2725 | rcu_read_unlock(); |
2728 | } | 2726 | } |
2729 | #endif | 2727 | #endif |
2730 | 2728 | ||
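The map_vm_area() change above simplifies the calling convention: the function now takes a plain struct page ** and no longer advances the caller's array pointer, returning 0 on success or a negative errno. A minimal caller sketch under the new signature, essentially mirroring the updated vmap() hunk; the identifiers are illustrative only.

#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *map_my_pages(struct page **pages, unsigned int count, pgprot_t prot)
{
        struct vm_struct *area;

        area = get_vm_area((unsigned long)count << PAGE_SHIFT, VM_MAP);
        if (!area)
                return NULL;

        /*
         * No more &pages and no cursor to re-check: the helper consumes
         * the array as-is and returns 0 or a negative errno.
         */
        if (map_vm_area(area, prot, pages)) {
                vunmap(area->addr);
                return NULL;
        }

        return area->addr;
}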
diff --git a/mm/vmscan.c b/mm/vmscan.c index 0f16ffe8eb67..d2f65c856350 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -59,35 +59,20 @@ | |||
59 | #include <trace/events/vmscan.h> | 59 | #include <trace/events/vmscan.h> |
60 | 60 | ||
61 | struct scan_control { | 61 | struct scan_control { |
62 | /* Incremented by the number of inactive pages that were scanned */ | ||
63 | unsigned long nr_scanned; | ||
64 | |||
65 | /* Number of pages freed so far during a call to shrink_zones() */ | ||
66 | unsigned long nr_reclaimed; | ||
67 | |||
68 | /* How many pages shrink_list() should reclaim */ | 62 | /* How many pages shrink_list() should reclaim */ |
69 | unsigned long nr_to_reclaim; | 63 | unsigned long nr_to_reclaim; |
70 | 64 | ||
71 | unsigned long hibernation_mode; | ||
72 | |||
73 | /* This context's GFP mask */ | 65 | /* This context's GFP mask */ |
74 | gfp_t gfp_mask; | 66 | gfp_t gfp_mask; |
75 | 67 | ||
76 | int may_writepage; | 68 | /* Allocation order */ |
77 | |||
78 | /* Can mapped pages be reclaimed? */ | ||
79 | int may_unmap; | ||
80 | |||
81 | /* Can pages be swapped as part of reclaim? */ | ||
82 | int may_swap; | ||
83 | |||
84 | int order; | 69 | int order; |
85 | 70 | ||
86 | /* Scan (total_size >> priority) pages at once */ | 71 | /* |
87 | int priority; | 72 | * Nodemask of nodes allowed by the caller. If NULL, all nodes |
88 | 73 | * are scanned. | |
89 | /* anon vs. file LRUs scanning "ratio" */ | 74 | */ |
90 | int swappiness; | 75 | nodemask_t *nodemask; |
91 | 76 | ||
92 | /* | 77 | /* |
93 | * The memory cgroup that hit its limit and as a result is the | 78 | * The memory cgroup that hit its limit and as a result is the |
@@ -95,11 +80,27 @@ struct scan_control { | |||
95 | */ | 80 | */ |
96 | struct mem_cgroup *target_mem_cgroup; | 81 | struct mem_cgroup *target_mem_cgroup; |
97 | 82 | ||
98 | /* | 83 | /* Scan (total_size >> priority) pages at once */ |
99 | * Nodemask of nodes allowed by the caller. If NULL, all nodes | 84 | int priority; |
100 | * are scanned. | 85 | |
101 | */ | 86 | unsigned int may_writepage:1; |
102 | nodemask_t *nodemask; | 87 | |
88 | /* Can mapped pages be reclaimed? */ | ||
89 | unsigned int may_unmap:1; | ||
90 | |||
91 | /* Can pages be swapped as part of reclaim? */ | ||
92 | unsigned int may_swap:1; | ||
93 | |||
94 | unsigned int hibernation_mode:1; | ||
95 | |||
96 | /* One of the zones is ready for compaction */ | ||
97 | unsigned int compaction_ready:1; | ||
98 | |||
99 | /* Incremented by the number of inactive pages that were scanned */ | ||
100 | unsigned long nr_scanned; | ||
101 | |||
102 | /* Number of pages freed so far during a call to shrink_zones() */ | ||
103 | unsigned long nr_reclaimed; | ||
103 | }; | 104 | }; |
104 | 105 | ||
105 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 106 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
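The scan_control rework above turns the boolean knobs (may_writepage, may_unmap, may_swap, hibernation_mode, plus the new compaction_ready) into single-bit bitfields and groups the tunables ahead of the running counters, shrinking the structure that reclaim keeps on the stack. A standalone userspace sketch of the packing effect follows; the struct names are illustrative and this is not kernel code.

#include <stdio.h>

struct knobs_as_ints {
        int may_writepage;
        int may_unmap;
        int may_swap;
        int hibernation_mode;
};

struct knobs_as_bits {
        unsigned int may_writepage:1;
        unsigned int may_unmap:1;
        unsigned int may_swap:1;
        unsigned int hibernation_mode:1;
};

int main(void)
{
        /* Typically prints "16 vs 4": four ints vs four bits in one word. */
        printf("%zu vs %zu\n",
               sizeof(struct knobs_as_ints), sizeof(struct knobs_as_bits));
        return 0;
}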
@@ -136,7 +137,11 @@ struct scan_control { | |||
136 | * From 0 .. 100. Higher means more swappy. | 137 | * From 0 .. 100. Higher means more swappy. |
137 | */ | 138 | */ |
138 | int vm_swappiness = 60; | 139 | int vm_swappiness = 60; |
139 | unsigned long vm_total_pages; /* The total number of pages which the VM controls */ | 140 | /* |
141 | * The total number of pages which are beyond the high watermark within all | ||
142 | * zones. | ||
143 | */ | ||
144 | unsigned long vm_total_pages; | ||
140 | 145 | ||
141 | static LIST_HEAD(shrinker_list); | 146 | static LIST_HEAD(shrinker_list); |
142 | static DECLARE_RWSEM(shrinker_rwsem); | 147 | static DECLARE_RWSEM(shrinker_rwsem); |
@@ -169,7 +174,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) | |||
169 | 174 | ||
170 | bool zone_reclaimable(struct zone *zone) | 175 | bool zone_reclaimable(struct zone *zone) |
171 | { | 176 | { |
172 | return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; | 177 | return zone_page_state(zone, NR_PAGES_SCANNED) < |
178 | zone_reclaimable_pages(zone) * 6; | ||
173 | } | 179 | } |
174 | 180 | ||
175 | static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) | 181 | static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) |
@@ -1503,7 +1509,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1503 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); | 1509 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); |
1504 | 1510 | ||
1505 | if (global_reclaim(sc)) { | 1511 | if (global_reclaim(sc)) { |
1506 | zone->pages_scanned += nr_scanned; | 1512 | __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); |
1507 | if (current_is_kswapd()) | 1513 | if (current_is_kswapd()) |
1508 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); | 1514 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); |
1509 | else | 1515 | else |
@@ -1693,7 +1699,7 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1693 | nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, | 1699 | nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, |
1694 | &nr_scanned, sc, isolate_mode, lru); | 1700 | &nr_scanned, sc, isolate_mode, lru); |
1695 | if (global_reclaim(sc)) | 1701 | if (global_reclaim(sc)) |
1696 | zone->pages_scanned += nr_scanned; | 1702 | __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); |
1697 | 1703 | ||
1698 | reclaim_stat->recent_scanned[file] += nr_taken; | 1704 | reclaim_stat->recent_scanned[file] += nr_taken; |
1699 | 1705 | ||
@@ -1750,7 +1756,7 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1750 | * Count referenced pages from currently used mappings as rotated, | 1756 | * Count referenced pages from currently used mappings as rotated, |
1751 | * even though only some of them are actually re-activated. This | 1757 | * even though only some of them are actually re-activated. This |
1752 | * helps balance scan pressure between file and anonymous pages in | 1758 | * helps balance scan pressure between file and anonymous pages in |
1753 | * get_scan_ratio. | 1759 | * get_scan_count. |
1754 | */ | 1760 | */ |
1755 | reclaim_stat->recent_rotated[file] += nr_rotated; | 1761 | reclaim_stat->recent_rotated[file] += nr_rotated; |
1756 | 1762 | ||
@@ -1865,8 +1871,8 @@ enum scan_balance { | |||
1865 | * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan | 1871 | * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan |
1866 | * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan | 1872 | * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan |
1867 | */ | 1873 | */ |
1868 | static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | 1874 | static void get_scan_count(struct lruvec *lruvec, int swappiness, |
1869 | unsigned long *nr) | 1875 | struct scan_control *sc, unsigned long *nr) |
1870 | { | 1876 | { |
1871 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; | 1877 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
1872 | u64 fraction[2]; | 1878 | u64 fraction[2]; |
@@ -1909,7 +1915,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1909 | * using the memory controller's swap limit feature would be | 1915 | * using the memory controller's swap limit feature would be |
1910 | * too expensive. | 1916 | * too expensive. |
1911 | */ | 1917 | */ |
1912 | if (!global_reclaim(sc) && !sc->swappiness) { | 1918 | if (!global_reclaim(sc) && !swappiness) { |
1913 | scan_balance = SCAN_FILE; | 1919 | scan_balance = SCAN_FILE; |
1914 | goto out; | 1920 | goto out; |
1915 | } | 1921 | } |
@@ -1919,16 +1925,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1919 | * system is close to OOM, scan both anon and file equally | 1925 | * system is close to OOM, scan both anon and file equally |
1920 | * (unless the swappiness setting disagrees with swapping). | 1926 | * (unless the swappiness setting disagrees with swapping). |
1921 | */ | 1927 | */ |
1922 | if (!sc->priority && sc->swappiness) { | 1928 | if (!sc->priority && swappiness) { |
1923 | scan_balance = SCAN_EQUAL; | 1929 | scan_balance = SCAN_EQUAL; |
1924 | goto out; | 1930 | goto out; |
1925 | } | 1931 | } |
1926 | 1932 | ||
1927 | anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + | ||
1928 | get_lru_size(lruvec, LRU_INACTIVE_ANON); | ||
1929 | file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + | ||
1930 | get_lru_size(lruvec, LRU_INACTIVE_FILE); | ||
1931 | |||
1932 | /* | 1933 | /* |
1933 | * Prevent the reclaimer from falling into the cache trap: as | 1934 | * Prevent the reclaimer from falling into the cache trap: as |
1934 | * cache pages start out inactive, every cache fault will tip | 1935 | * cache pages start out inactive, every cache fault will tip |
@@ -1939,9 +1940,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1939 | * anon pages. Try to detect this based on file LRU size. | 1940 | * anon pages. Try to detect this based on file LRU size. |
1940 | */ | 1941 | */ |
1941 | if (global_reclaim(sc)) { | 1942 | if (global_reclaim(sc)) { |
1942 | unsigned long free = zone_page_state(zone, NR_FREE_PAGES); | 1943 | unsigned long zonefile; |
1944 | unsigned long zonefree; | ||
1943 | 1945 | ||
1944 | if (unlikely(file + free <= high_wmark_pages(zone))) { | 1946 | zonefree = zone_page_state(zone, NR_FREE_PAGES); |
1947 | zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + | ||
1948 | zone_page_state(zone, NR_INACTIVE_FILE); | ||
1949 | |||
1950 | if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { | ||
1945 | scan_balance = SCAN_ANON; | 1951 | scan_balance = SCAN_ANON; |
1946 | goto out; | 1952 | goto out; |
1947 | } | 1953 | } |
@@ -1962,7 +1968,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1962 | * With swappiness at 100, anonymous and file have the same priority. | 1968 | * With swappiness at 100, anonymous and file have the same priority. |
1963 | * This scanning priority is essentially the inverse of IO cost. | 1969 | * This scanning priority is essentially the inverse of IO cost. |
1964 | */ | 1970 | */ |
1965 | anon_prio = sc->swappiness; | 1971 | anon_prio = swappiness; |
1966 | file_prio = 200 - anon_prio; | 1972 | file_prio = 200 - anon_prio; |
1967 | 1973 | ||
1968 | /* | 1974 | /* |
@@ -1976,6 +1982,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1976 | * | 1982 | * |
1977 | * anon in [0], file in [1] | 1983 | * anon in [0], file in [1] |
1978 | */ | 1984 | */ |
1985 | |||
1986 | anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + | ||
1987 | get_lru_size(lruvec, LRU_INACTIVE_ANON); | ||
1988 | file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + | ||
1989 | get_lru_size(lruvec, LRU_INACTIVE_FILE); | ||
1990 | |||
1979 | spin_lock_irq(&zone->lru_lock); | 1991 | spin_lock_irq(&zone->lru_lock); |
1980 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { | 1992 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { |
1981 | reclaim_stat->recent_scanned[0] /= 2; | 1993 | reclaim_stat->recent_scanned[0] /= 2; |
@@ -2052,7 +2064,8 @@ out: | |||
2052 | /* | 2064 | /* |
2053 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 2065 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
2054 | */ | 2066 | */ |
2055 | static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | 2067 | static void shrink_lruvec(struct lruvec *lruvec, int swappiness, |
2068 | struct scan_control *sc) | ||
2056 | { | 2069 | { |
2057 | unsigned long nr[NR_LRU_LISTS]; | 2070 | unsigned long nr[NR_LRU_LISTS]; |
2058 | unsigned long targets[NR_LRU_LISTS]; | 2071 | unsigned long targets[NR_LRU_LISTS]; |
@@ -2063,7 +2076,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | |||
2063 | struct blk_plug plug; | 2076 | struct blk_plug plug; |
2064 | bool scan_adjusted; | 2077 | bool scan_adjusted; |
2065 | 2078 | ||
2066 | get_scan_count(lruvec, sc, nr); | 2079 | get_scan_count(lruvec, swappiness, sc, nr); |
2067 | 2080 | ||
2068 | /* Record the original scan target for proportional adjustments later */ | 2081 | /* Record the original scan target for proportional adjustments later */ |
2069 | memcpy(targets, nr, sizeof(nr)); | 2082 | memcpy(targets, nr, sizeof(nr)); |
@@ -2241,9 +2254,10 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
2241 | } | 2254 | } |
2242 | } | 2255 | } |
2243 | 2256 | ||
2244 | static void shrink_zone(struct zone *zone, struct scan_control *sc) | 2257 | static bool shrink_zone(struct zone *zone, struct scan_control *sc) |
2245 | { | 2258 | { |
2246 | unsigned long nr_reclaimed, nr_scanned; | 2259 | unsigned long nr_reclaimed, nr_scanned; |
2260 | bool reclaimable = false; | ||
2247 | 2261 | ||
2248 | do { | 2262 | do { |
2249 | struct mem_cgroup *root = sc->target_mem_cgroup; | 2263 | struct mem_cgroup *root = sc->target_mem_cgroup; |
@@ -2259,11 +2273,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) | |||
2259 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | 2273 | memcg = mem_cgroup_iter(root, NULL, &reclaim); |
2260 | do { | 2274 | do { |
2261 | struct lruvec *lruvec; | 2275 | struct lruvec *lruvec; |
2276 | int swappiness; | ||
2262 | 2277 | ||
2263 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2278 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2279 | swappiness = mem_cgroup_swappiness(memcg); | ||
2264 | 2280 | ||
2265 | sc->swappiness = mem_cgroup_swappiness(memcg); | 2281 | shrink_lruvec(lruvec, swappiness, sc); |
2266 | shrink_lruvec(lruvec, sc); | ||
2267 | 2282 | ||
2268 | /* | 2283 | /* |
2269 | * Direct reclaim and kswapd have to scan all memory | 2284 | * Direct reclaim and kswapd have to scan all memory |
@@ -2287,20 +2302,21 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) | |||
2287 | sc->nr_scanned - nr_scanned, | 2302 | sc->nr_scanned - nr_scanned, |
2288 | sc->nr_reclaimed - nr_reclaimed); | 2303 | sc->nr_reclaimed - nr_reclaimed); |
2289 | 2304 | ||
2305 | if (sc->nr_reclaimed - nr_reclaimed) | ||
2306 | reclaimable = true; | ||
2307 | |||
2290 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, | 2308 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, |
2291 | sc->nr_scanned - nr_scanned, sc)); | 2309 | sc->nr_scanned - nr_scanned, sc)); |
2310 | |||
2311 | return reclaimable; | ||
2292 | } | 2312 | } |
2293 | 2313 | ||
2294 | /* Returns true if compaction should go ahead for a high-order request */ | 2314 | /* Returns true if compaction should go ahead for a high-order request */ |
2295 | static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | 2315 | static inline bool compaction_ready(struct zone *zone, int order) |
2296 | { | 2316 | { |
2297 | unsigned long balance_gap, watermark; | 2317 | unsigned long balance_gap, watermark; |
2298 | bool watermark_ok; | 2318 | bool watermark_ok; |
2299 | 2319 | ||
2300 | /* Do not consider compaction for orders reclaim is meant to satisfy */ | ||
2301 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) | ||
2302 | return false; | ||
2303 | |||
2304 | /* | 2320 | /* |
2305 | * Compaction takes time to run and there are potentially other | 2321 | * Compaction takes time to run and there are potentially other |
2306 | * callers using the pages just freed. Continue reclaiming until | 2322 | * callers using the pages just freed. Continue reclaiming until |
@@ -2309,18 +2325,18 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | |||
2309 | */ | 2325 | */ |
2310 | balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( | 2326 | balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( |
2311 | zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); | 2327 | zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); |
2312 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); | 2328 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); |
2313 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); | 2329 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); |
2314 | 2330 | ||
2315 | /* | 2331 | /* |
2316 | * If compaction is deferred, reclaim up to a point where | 2332 | * If compaction is deferred, reclaim up to a point where |
2317 | * compaction will have a chance of success when re-enabled | 2333 | * compaction will have a chance of success when re-enabled |
2318 | */ | 2334 | */ |
2319 | if (compaction_deferred(zone, sc->order)) | 2335 | if (compaction_deferred(zone, order)) |
2320 | return watermark_ok; | 2336 | return watermark_ok; |
2321 | 2337 | ||
2322 | /* If compaction is not ready to start, keep reclaiming */ | 2338 | /* If compaction is not ready to start, keep reclaiming */ |
2323 | if (!compaction_suitable(zone, sc->order)) | 2339 | if (!compaction_suitable(zone, order)) |
2324 | return false; | 2340 | return false; |
2325 | 2341 | ||
2326 | return watermark_ok; | 2342 | return watermark_ok; |
@@ -2342,10 +2358,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | |||
2342 | * If a zone is deemed to be full of pinned pages then just give it a light | 2358 | * If a zone is deemed to be full of pinned pages then just give it a light |
2343 | * scan then give up on it. | 2359 | * scan then give up on it. |
2344 | * | 2360 | * |
2345 | * This function returns true if a zone is being reclaimed for a costly | 2361 | * Returns true if a zone was reclaimable. |
2346 | * high-order allocation and compaction is ready to begin. This indicates to | ||
2347 | * the caller that it should consider retrying the allocation instead of | ||
2348 | * further reclaim. | ||
2349 | */ | 2362 | */ |
2350 | static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | 2363 | static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) |
2351 | { | 2364 | { |
@@ -2354,13 +2367,13 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2354 | unsigned long nr_soft_reclaimed; | 2367 | unsigned long nr_soft_reclaimed; |
2355 | unsigned long nr_soft_scanned; | 2368 | unsigned long nr_soft_scanned; |
2356 | unsigned long lru_pages = 0; | 2369 | unsigned long lru_pages = 0; |
2357 | bool aborted_reclaim = false; | ||
2358 | struct reclaim_state *reclaim_state = current->reclaim_state; | 2370 | struct reclaim_state *reclaim_state = current->reclaim_state; |
2359 | gfp_t orig_mask; | 2371 | gfp_t orig_mask; |
2360 | struct shrink_control shrink = { | 2372 | struct shrink_control shrink = { |
2361 | .gfp_mask = sc->gfp_mask, | 2373 | .gfp_mask = sc->gfp_mask, |
2362 | }; | 2374 | }; |
2363 | enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); | 2375 | enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); |
2376 | bool reclaimable = false; | ||
2364 | 2377 | ||
2365 | /* | 2378 | /* |
2366 | * If the number of buffer_heads in the machine exceeds the maximum | 2379 | * If the number of buffer_heads in the machine exceeds the maximum |
@@ -2391,22 +2404,24 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2391 | if (sc->priority != DEF_PRIORITY && | 2404 | if (sc->priority != DEF_PRIORITY && |
2392 | !zone_reclaimable(zone)) | 2405 | !zone_reclaimable(zone)) |
2393 | continue; /* Let kswapd poll it */ | 2406 | continue; /* Let kswapd poll it */ |
2394 | if (IS_ENABLED(CONFIG_COMPACTION)) { | 2407 | |
2395 | /* | 2408 | /* |
2396 | * If we already have plenty of memory free for | 2409 | * If we already have plenty of memory free for |
2397 | * compaction in this zone, don't free any more. | 2410 | * compaction in this zone, don't free any more. |
2398 | * Even though compaction is invoked for any | 2411 | * Even though compaction is invoked for any |
2399 | * non-zero order, only frequent costly order | 2412 | * non-zero order, only frequent costly order |
2400 | * reclamation is disruptive enough to become a | 2413 | * reclamation is disruptive enough to become a |
2401 | * noticeable problem, like transparent huge | 2414 | * noticeable problem, like transparent huge |
2402 | * page allocations. | 2415 | * page allocations. |
2403 | */ | 2416 | */ |
2404 | if ((zonelist_zone_idx(z) <= requested_highidx) | 2417 | if (IS_ENABLED(CONFIG_COMPACTION) && |
2405 | && compaction_ready(zone, sc)) { | 2418 | sc->order > PAGE_ALLOC_COSTLY_ORDER && |
2406 | aborted_reclaim = true; | 2419 | zonelist_zone_idx(z) <= requested_highidx && |
2407 | continue; | 2420 | compaction_ready(zone, sc->order)) { |
2408 | } | 2421 | sc->compaction_ready = true; |
2422 | continue; | ||
2409 | } | 2423 | } |
2424 | |||
2410 | /* | 2425 | /* |
2411 | * This steals pages from memory cgroups over softlimit | 2426 | * This steals pages from memory cgroups over softlimit |
2412 | * and returns the number of reclaimed pages and | 2427 | * and returns the number of reclaimed pages and |
@@ -2419,10 +2434,17 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2419 | &nr_soft_scanned); | 2434 | &nr_soft_scanned); |
2420 | sc->nr_reclaimed += nr_soft_reclaimed; | 2435 | sc->nr_reclaimed += nr_soft_reclaimed; |
2421 | sc->nr_scanned += nr_soft_scanned; | 2436 | sc->nr_scanned += nr_soft_scanned; |
2437 | if (nr_soft_reclaimed) | ||
2438 | reclaimable = true; | ||
2422 | /* need some check to avoid more shrink_zone() */ | 2439 | /* need some check to avoid more shrink_zone() */ |
2423 | } | 2440 | } |
2424 | 2441 | ||
2425 | shrink_zone(zone, sc); | 2442 | if (shrink_zone(zone, sc)) |
2443 | reclaimable = true; | ||
2444 | |||
2445 | if (global_reclaim(sc) && | ||
2446 | !reclaimable && zone_reclaimable(zone)) | ||
2447 | reclaimable = true; | ||
2426 | } | 2448 | } |
2427 | 2449 | ||
2428 | /* | 2450 | /* |
@@ -2445,27 +2467,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2445 | */ | 2467 | */ |
2446 | sc->gfp_mask = orig_mask; | 2468 | sc->gfp_mask = orig_mask; |
2447 | 2469 | ||
2448 | return aborted_reclaim; | 2470 | return reclaimable; |
2449 | } | ||
2450 | |||
2451 | /* All zones in zonelist are unreclaimable? */ | ||
2452 | static bool all_unreclaimable(struct zonelist *zonelist, | ||
2453 | struct scan_control *sc) | ||
2454 | { | ||
2455 | struct zoneref *z; | ||
2456 | struct zone *zone; | ||
2457 | |||
2458 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | ||
2459 | gfp_zone(sc->gfp_mask), sc->nodemask) { | ||
2460 | if (!populated_zone(zone)) | ||
2461 | continue; | ||
2462 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | ||
2463 | continue; | ||
2464 | if (zone_reclaimable(zone)) | ||
2465 | return false; | ||
2466 | } | ||
2467 | |||
2468 | return true; | ||
2469 | } | 2471 | } |
2470 | 2472 | ||
2471 | /* | 2473 | /* |
@@ -2489,7 +2491,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2489 | { | 2491 | { |
2490 | unsigned long total_scanned = 0; | 2492 | unsigned long total_scanned = 0; |
2491 | unsigned long writeback_threshold; | 2493 | unsigned long writeback_threshold; |
2492 | bool aborted_reclaim; | 2494 | bool zones_reclaimable; |
2493 | 2495 | ||
2494 | delayacct_freepages_start(); | 2496 | delayacct_freepages_start(); |
2495 | 2497 | ||
@@ -2500,11 +2502,14 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2500 | vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, | 2502 | vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, |
2501 | sc->priority); | 2503 | sc->priority); |
2502 | sc->nr_scanned = 0; | 2504 | sc->nr_scanned = 0; |
2503 | aborted_reclaim = shrink_zones(zonelist, sc); | 2505 | zones_reclaimable = shrink_zones(zonelist, sc); |
2504 | 2506 | ||
2505 | total_scanned += sc->nr_scanned; | 2507 | total_scanned += sc->nr_scanned; |
2506 | if (sc->nr_reclaimed >= sc->nr_to_reclaim) | 2508 | if (sc->nr_reclaimed >= sc->nr_to_reclaim) |
2507 | goto out; | 2509 | break; |
2510 | |||
2511 | if (sc->compaction_ready) | ||
2512 | break; | ||
2508 | 2513 | ||
2509 | /* | 2514 | /* |
2510 | * If we're getting trouble reclaiming, start doing | 2515 | * If we're getting trouble reclaiming, start doing |
@@ -2526,28 +2531,19 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2526 | WB_REASON_TRY_TO_FREE_PAGES); | 2531 | WB_REASON_TRY_TO_FREE_PAGES); |
2527 | sc->may_writepage = 1; | 2532 | sc->may_writepage = 1; |
2528 | } | 2533 | } |
2529 | } while (--sc->priority >= 0 && !aborted_reclaim); | 2534 | } while (--sc->priority >= 0); |
2530 | 2535 | ||
2531 | out: | ||
2532 | delayacct_freepages_end(); | 2536 | delayacct_freepages_end(); |
2533 | 2537 | ||
2534 | if (sc->nr_reclaimed) | 2538 | if (sc->nr_reclaimed) |
2535 | return sc->nr_reclaimed; | 2539 | return sc->nr_reclaimed; |
2536 | 2540 | ||
2537 | /* | ||
2538 | * As hibernation is going on, kswapd is freezed so that it can't mark | ||
2539 | * the zone into all_unreclaimable. Thus bypassing all_unreclaimable | ||
2540 | * check. | ||
2541 | */ | ||
2542 | if (oom_killer_disabled) | ||
2543 | return 0; | ||
2544 | |||
2545 | /* Aborted reclaim to try compaction? don't OOM, then */ | 2541 | /* Aborted reclaim to try compaction? don't OOM, then */ |
2546 | if (aborted_reclaim) | 2542 | if (sc->compaction_ready) |
2547 | return 1; | 2543 | return 1; |
2548 | 2544 | ||
2549 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 2545 | /* Any of the zones still reclaimable? Don't OOM. */ |
2550 | if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) | 2546 | if (zones_reclaimable) |
2551 | return 1; | 2547 | return 1; |
2552 | 2548 | ||
2553 | return 0; | 2549 | return 0; |
@@ -2684,15 +2680,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2684 | { | 2680 | { |
2685 | unsigned long nr_reclaimed; | 2681 | unsigned long nr_reclaimed; |
2686 | struct scan_control sc = { | 2682 | struct scan_control sc = { |
2683 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | ||
2687 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), | 2684 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), |
2685 | .order = order, | ||
2686 | .nodemask = nodemask, | ||
2687 | .priority = DEF_PRIORITY, | ||
2688 | .may_writepage = !laptop_mode, | 2688 | .may_writepage = !laptop_mode, |
2689 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | ||
2690 | .may_unmap = 1, | 2689 | .may_unmap = 1, |
2691 | .may_swap = 1, | 2690 | .may_swap = 1, |
2692 | .order = order, | ||
2693 | .priority = DEF_PRIORITY, | ||
2694 | .target_mem_cgroup = NULL, | ||
2695 | .nodemask = nodemask, | ||
2696 | }; | 2691 | }; |
2697 | 2692 | ||
2698 | /* | 2693 | /* |
@@ -2722,17 +2717,14 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | |||
2722 | unsigned long *nr_scanned) | 2717 | unsigned long *nr_scanned) |
2723 | { | 2718 | { |
2724 | struct scan_control sc = { | 2719 | struct scan_control sc = { |
2725 | .nr_scanned = 0, | ||
2726 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2720 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2721 | .target_mem_cgroup = memcg, | ||
2727 | .may_writepage = !laptop_mode, | 2722 | .may_writepage = !laptop_mode, |
2728 | .may_unmap = 1, | 2723 | .may_unmap = 1, |
2729 | .may_swap = !noswap, | 2724 | .may_swap = !noswap, |
2730 | .order = 0, | ||
2731 | .priority = 0, | ||
2732 | .swappiness = mem_cgroup_swappiness(memcg), | ||
2733 | .target_mem_cgroup = memcg, | ||
2734 | }; | 2725 | }; |
2735 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2726 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2727 | int swappiness = mem_cgroup_swappiness(memcg); | ||
2736 | 2728 | ||
2737 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2729 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2738 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 2730 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
@@ -2748,7 +2740,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | |||
2748 | * will pick up pages from other mem cgroup's as well. We hack | 2740 | * will pick up pages from other mem cgroup's as well. We hack |
2749 | * the priority and make it zero. | 2741 | * the priority and make it zero. |
2750 | */ | 2742 | */ |
2751 | shrink_lruvec(lruvec, &sc); | 2743 | shrink_lruvec(lruvec, swappiness, &sc); |
2752 | 2744 | ||
2753 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); | 2745 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); |
2754 | 2746 | ||
@@ -2764,16 +2756,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |||
2764 | unsigned long nr_reclaimed; | 2756 | unsigned long nr_reclaimed; |
2765 | int nid; | 2757 | int nid; |
2766 | struct scan_control sc = { | 2758 | struct scan_control sc = { |
2767 | .may_writepage = !laptop_mode, | ||
2768 | .may_unmap = 1, | ||
2769 | .may_swap = !noswap, | ||
2770 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2759 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2771 | .order = 0, | ||
2772 | .priority = DEF_PRIORITY, | ||
2773 | .target_mem_cgroup = memcg, | ||
2774 | .nodemask = NULL, /* we don't care the placement */ | ||
2775 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2760 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2776 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), | 2761 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), |
2762 | .target_mem_cgroup = memcg, | ||
2763 | .priority = DEF_PRIORITY, | ||
2764 | .may_writepage = !laptop_mode, | ||
2765 | .may_unmap = 1, | ||
2766 | .may_swap = !noswap, | ||
2777 | }; | 2767 | }; |
2778 | 2768 | ||
2779 | /* | 2769 | /* |
@@ -3031,12 +3021,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
3031 | unsigned long nr_soft_scanned; | 3021 | unsigned long nr_soft_scanned; |
3032 | struct scan_control sc = { | 3022 | struct scan_control sc = { |
3033 | .gfp_mask = GFP_KERNEL, | 3023 | .gfp_mask = GFP_KERNEL, |
3024 | .order = order, | ||
3034 | .priority = DEF_PRIORITY, | 3025 | .priority = DEF_PRIORITY, |
3026 | .may_writepage = !laptop_mode, | ||
3035 | .may_unmap = 1, | 3027 | .may_unmap = 1, |
3036 | .may_swap = 1, | 3028 | .may_swap = 1, |
3037 | .may_writepage = !laptop_mode, | ||
3038 | .order = order, | ||
3039 | .target_mem_cgroup = NULL, | ||
3040 | }; | 3029 | }; |
3041 | count_vm_event(PAGEOUTRUN); | 3030 | count_vm_event(PAGEOUTRUN); |
3042 | 3031 | ||
@@ -3417,14 +3406,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
3417 | { | 3406 | { |
3418 | struct reclaim_state reclaim_state; | 3407 | struct reclaim_state reclaim_state; |
3419 | struct scan_control sc = { | 3408 | struct scan_control sc = { |
3409 | .nr_to_reclaim = nr_to_reclaim, | ||
3420 | .gfp_mask = GFP_HIGHUSER_MOVABLE, | 3410 | .gfp_mask = GFP_HIGHUSER_MOVABLE, |
3421 | .may_swap = 1, | 3411 | .priority = DEF_PRIORITY, |
3422 | .may_unmap = 1, | ||
3423 | .may_writepage = 1, | 3412 | .may_writepage = 1, |
3424 | .nr_to_reclaim = nr_to_reclaim, | 3413 | .may_unmap = 1, |
3414 | .may_swap = 1, | ||
3425 | .hibernation_mode = 1, | 3415 | .hibernation_mode = 1, |
3426 | .order = 0, | ||
3427 | .priority = DEF_PRIORITY, | ||
3428 | }; | 3416 | }; |
3429 | struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); | 3417 | struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); |
3430 | struct task_struct *p = current; | 3418 | struct task_struct *p = current; |
@@ -3604,13 +3592,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3604 | struct task_struct *p = current; | 3592 | struct task_struct *p = current; |
3605 | struct reclaim_state reclaim_state; | 3593 | struct reclaim_state reclaim_state; |
3606 | struct scan_control sc = { | 3594 | struct scan_control sc = { |
3607 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | ||
3608 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | ||
3609 | .may_swap = 1, | ||
3610 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), | 3595 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), |
3611 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), | 3596 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), |
3612 | .order = order, | 3597 | .order = order, |
3613 | .priority = ZONE_RECLAIM_PRIORITY, | 3598 | .priority = ZONE_RECLAIM_PRIORITY, |
3599 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | ||
3600 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | ||
3601 | .may_swap = 1, | ||
3614 | }; | 3602 | }; |
3615 | struct shrink_control shrink = { | 3603 | struct shrink_control shrink = { |
3616 | .gfp_mask = sc.gfp_mask, | 3604 | .gfp_mask = sc.gfp_mask, |
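With all_unreclaimable() removed and shrink_zones() now reporting whether any zone made or could still make progress, the return policy at the end of do_try_to_free_pages() reduces to the following self-contained restatement; it is illustrative only, the real code operates on struct scan_control.

#include <linux/types.h>

static unsigned long reclaim_outcome(unsigned long nr_reclaimed,
                                     bool compaction_ready,
                                     bool zones_reclaimable)
{
        if (nr_reclaimed)
                return nr_reclaimed;    /* forward progress was made */
        if (compaction_ready)
                return 1;               /* stopped early for compaction: don't OOM */
        if (zones_reclaimable)
                return 1;               /* some zone is still reclaimable: don't OOM */
        return 0;                       /* genuinely stuck: the OOM path may proceed */
}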
diff --git a/mm/vmstat.c b/mm/vmstat.c index b37bd49bfd55..e9ab104b956f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -200,7 +200,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, | |||
200 | continue; | 200 | continue; |
201 | 201 | ||
202 | threshold = (*calculate_pressure)(zone); | 202 | threshold = (*calculate_pressure)(zone); |
203 | for_each_possible_cpu(cpu) | 203 | for_each_online_cpu(cpu) |
204 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold | 204 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold |
205 | = threshold; | 205 | = threshold; |
206 | } | 206 | } |
@@ -763,6 +763,7 @@ const char * const vmstat_text[] = { | |||
763 | "nr_shmem", | 763 | "nr_shmem", |
764 | "nr_dirtied", | 764 | "nr_dirtied", |
765 | "nr_written", | 765 | "nr_written", |
766 | "nr_pages_scanned", | ||
766 | 767 | ||
767 | #ifdef CONFIG_NUMA | 768 | #ifdef CONFIG_NUMA |
768 | "numa_hit", | 769 | "numa_hit", |
@@ -1067,7 +1068,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
1067 | min_wmark_pages(zone), | 1068 | min_wmark_pages(zone), |
1068 | low_wmark_pages(zone), | 1069 | low_wmark_pages(zone), |
1069 | high_wmark_pages(zone), | 1070 | high_wmark_pages(zone), |
1070 | zone->pages_scanned, | 1071 | zone_page_state(zone, NR_PAGES_SCANNED), |
1071 | zone->spanned_pages, | 1072 | zone->spanned_pages, |
1072 | zone->present_pages, | 1073 | zone->present_pages, |
1073 | zone->managed_pages); | 1074 | zone->managed_pages); |
@@ -1077,10 +1078,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
1077 | zone_page_state(zone, i)); | 1078 | zone_page_state(zone, i)); |
1078 | 1079 | ||
1079 | seq_printf(m, | 1080 | seq_printf(m, |
1080 | "\n protection: (%lu", | 1081 | "\n protection: (%ld", |
1081 | zone->lowmem_reserve[0]); | 1082 | zone->lowmem_reserve[0]); |
1082 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) | 1083 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) |
1083 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); | 1084 | seq_printf(m, ", %ld", zone->lowmem_reserve[i]); |
1084 | seq_printf(m, | 1085 | seq_printf(m, |
1085 | ")" | 1086 | ")" |
1086 | "\n pagesets"); | 1087 | "\n pagesets"); |
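The zoneinfo hunk above switches the lowmem_reserve printout from %lu to %ld, presumably because the field is treated as a signed long elsewhere in this series. A small userspace aside shows why the specifier matters for a negative value; it is illustrative only.

#include <stdio.h>

int main(void)
{
        long reserve = -1;

        printf("%%ld -> %ld\n", reserve);                        /* -1 */
        printf("%%lu -> %lu\n", (unsigned long)reserve);         /* 18446744073709551615 on 64-bit */
        return 0;
}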
diff --git a/mm/zbud.c b/mm/zbud.c --- a/mm/zbud.c +++ b/mm/zbud.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/slab.h> | 51 | #include <linux/slab.h> |
52 | #include <linux/spinlock.h> | 52 | #include <linux/spinlock.h> |
53 | #include <linux/zbud.h> | 53 | #include <linux/zbud.h> |
54 | #include <linux/zpool.h> | ||
54 | 55 | ||
55 | /***************** | 56 | /***************** |
56 | * Structures | 57 | * Structures |
@@ -113,6 +114,90 @@ struct zbud_header { | |||
113 | }; | 114 | }; |
114 | 115 | ||
115 | /***************** | 116 | /***************** |
117 | * zpool | ||
118 | ****************/ | ||
119 | |||
120 | #ifdef CONFIG_ZPOOL | ||
121 | |||
122 | static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) | ||
123 | { | ||
124 | return zpool_evict(pool, handle); | ||
125 | } | ||
126 | |||
127 | static struct zbud_ops zbud_zpool_ops = { | ||
128 | .evict = zbud_zpool_evict | ||
129 | }; | ||
130 | |||
131 | static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) | ||
132 | { | ||
133 | return zbud_create_pool(gfp, &zbud_zpool_ops); | ||
134 | } | ||
135 | |||
136 | static void zbud_zpool_destroy(void *pool) | ||
137 | { | ||
138 | zbud_destroy_pool(pool); | ||
139 | } | ||
140 | |||
141 | static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp, | ||
142 | unsigned long *handle) | ||
143 | { | ||
144 | return zbud_alloc(pool, size, gfp, handle); | ||
145 | } | ||
146 | static void zbud_zpool_free(void *pool, unsigned long handle) | ||
147 | { | ||
148 | zbud_free(pool, handle); | ||
149 | } | ||
150 | |||
151 | static int zbud_zpool_shrink(void *pool, unsigned int pages, | ||
152 | unsigned int *reclaimed) | ||
153 | { | ||
154 | unsigned int total = 0; | ||
155 | int ret = -EINVAL; | ||
156 | |||
157 | while (total < pages) { | ||
158 | ret = zbud_reclaim_page(pool, 8); | ||
159 | if (ret < 0) | ||
160 | break; | ||
161 | total++; | ||
162 | } | ||
163 | |||
164 | if (reclaimed) | ||
165 | *reclaimed = total; | ||
166 | |||
167 | return ret; | ||
168 | } | ||
169 | |||
170 | static void *zbud_zpool_map(void *pool, unsigned long handle, | ||
171 | enum zpool_mapmode mm) | ||
172 | { | ||
173 | return zbud_map(pool, handle); | ||
174 | } | ||
175 | static void zbud_zpool_unmap(void *pool, unsigned long handle) | ||
176 | { | ||
177 | zbud_unmap(pool, handle); | ||
178 | } | ||
179 | |||
180 | static u64 zbud_zpool_total_size(void *pool) | ||
181 | { | ||
182 | return zbud_get_pool_size(pool) * PAGE_SIZE; | ||
183 | } | ||
184 | |||
185 | static struct zpool_driver zbud_zpool_driver = { | ||
186 | .type = "zbud", | ||
187 | .owner = THIS_MODULE, | ||
188 | .create = zbud_zpool_create, | ||
189 | .destroy = zbud_zpool_destroy, | ||
190 | .malloc = zbud_zpool_malloc, | ||
191 | .free = zbud_zpool_free, | ||
192 | .shrink = zbud_zpool_shrink, | ||
193 | .map = zbud_zpool_map, | ||
194 | .unmap = zbud_zpool_unmap, | ||
195 | .total_size = zbud_zpool_total_size, | ||
196 | }; | ||
197 | |||
198 | #endif /* CONFIG_ZPOOL */ | ||
199 | |||
200 | /***************** | ||
116 | * Helpers | 201 | * Helpers |
117 | *****************/ | 202 | *****************/ |
118 | /* Just to make the code easier to read */ | 203 | /* Just to make the code easier to read */ |
@@ -122,7 +207,7 @@ enum buddy { | |||
122 | }; | 207 | }; |
123 | 208 | ||
124 | /* Converts an allocation size in bytes to size in zbud chunks */ | 209 | /* Converts an allocation size in bytes to size in zbud chunks */ |
125 | static int size_to_chunks(int size) | 210 | static int size_to_chunks(size_t size) |
126 | { | 211 | { |
127 | return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; | 212 | return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; |
128 | } | 213 | } |
@@ -247,7 +332,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) | |||
247 | * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate | 332 | * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate |
248 | * a new page. | 333 | * a new page. |
249 | */ | 334 | */ |
250 | int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, | 335 | int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, |
251 | unsigned long *handle) | 336 | unsigned long *handle) |
252 | { | 337 | { |
253 | int chunks, i, freechunks; | 338 | int chunks, i, freechunks; |
@@ -511,11 +596,20 @@ static int __init init_zbud(void) | |||
511 | /* Make sure the zbud header will fit in one chunk */ | 596 | /* Make sure the zbud header will fit in one chunk */ |
512 | BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); | 597 | BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); |
513 | pr_info("loaded\n"); | 598 | pr_info("loaded\n"); |
599 | |||
600 | #ifdef CONFIG_ZPOOL | ||
601 | zpool_register_driver(&zbud_zpool_driver); | ||
602 | #endif | ||
603 | |||
514 | return 0; | 604 | return 0; |
515 | } | 605 | } |
516 | 606 | ||
517 | static void __exit exit_zbud(void) | 607 | static void __exit exit_zbud(void) |
518 | { | 608 | { |
609 | #ifdef CONFIG_ZPOOL | ||
610 | zpool_unregister_driver(&zbud_zpool_driver); | ||
611 | #endif | ||
612 | |||
519 | pr_info("unloaded\n"); | 613 | pr_info("unloaded\n"); |
520 | } | 614 | } |
521 | 615 | ||
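With zbud_zpool_driver registered in init_zbud() above, a zbud pool becomes reachable through the generic zpool API by its type string. A hypothetical client sketch follows; the demo_* names and the no-op evict handler are illustrative, and a real user such as zswap would implement writeback in the evict callback.

#include <linux/zpool.h>
#include <linux/gfp.h>
#include <linux/errno.h>

/* No-op evict handler; a real user would write the object back here. */
static int demo_evict(struct zpool *pool, unsigned long handle)
{
        return -EINVAL;
}

static struct zpool_ops demo_ops = {
        .evict = demo_evict,
};

static struct zpool *get_zbud_backed_pool(void)
{
        /* "zbud" matches the .type string of zbud_zpool_driver above. */
        return zpool_create_pool("zbud", GFP_KERNEL, &demo_ops);
}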
diff --git a/mm/zpool.c b/mm/zpool.c new file mode 100644 index 000000000000..e40612a1df00 --- /dev/null +++ b/mm/zpool.c | |||
@@ -0,0 +1,364 @@ | |||
1 | /* | ||
2 | * zpool memory storage api | ||
3 | * | ||
4 | * Copyright (C) 2014 Dan Streetman | ||
5 | * | ||
6 | * This is a common frontend for memory storage pool implementations. | ||
7 | * Typically, this is used to store compressed memory. | ||
8 | */ | ||
9 | |||
10 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
11 | |||
12 | #include <linux/list.h> | ||
13 | #include <linux/types.h> | ||
14 | #include <linux/mm.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/spinlock.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/zpool.h> | ||
19 | |||
20 | struct zpool { | ||
21 | char *type; | ||
22 | |||
23 | struct zpool_driver *driver; | ||
24 | void *pool; | ||
25 | struct zpool_ops *ops; | ||
26 | |||
27 | struct list_head list; | ||
28 | }; | ||
29 | |||
30 | static LIST_HEAD(drivers_head); | ||
31 | static DEFINE_SPINLOCK(drivers_lock); | ||
32 | |||
33 | static LIST_HEAD(pools_head); | ||
34 | static DEFINE_SPINLOCK(pools_lock); | ||
35 | |||
36 | /** | ||
37 | * zpool_register_driver() - register a zpool implementation. | ||
38 | * @driver: driver to register | ||
39 | */ | ||
40 | void zpool_register_driver(struct zpool_driver *driver) | ||
41 | { | ||
42 | spin_lock(&drivers_lock); | ||
43 | atomic_set(&driver->refcount, 0); | ||
44 | list_add(&driver->list, &drivers_head); | ||
45 | spin_unlock(&drivers_lock); | ||
46 | } | ||
47 | EXPORT_SYMBOL(zpool_register_driver); | ||
48 | |||
49 | /** | ||
50 | * zpool_unregister_driver() - unregister a zpool implementation. | ||
51 | * @driver: driver to unregister. | ||
52 | * | ||
53 | * Module usage counting is used to prevent using a driver | ||
54 | * while/after unloading, so if this is called from module | ||
55 | * exit function, this should never fail; if called from | ||
56 | * other than the module exit function, and this returns | ||
57 | * failure, the driver is in use and must remain available. | ||
58 | */ | ||
59 | int zpool_unregister_driver(struct zpool_driver *driver) | ||
60 | { | ||
61 | int ret = 0, refcount; | ||
62 | |||
63 | spin_lock(&drivers_lock); | ||
64 | refcount = atomic_read(&driver->refcount); | ||
65 | WARN_ON(refcount < 0); | ||
66 | if (refcount > 0) | ||
67 | ret = -EBUSY; | ||
68 | else | ||
69 | list_del(&driver->list); | ||
70 | spin_unlock(&drivers_lock); | ||
71 | |||
72 | return ret; | ||
73 | } | ||
74 | EXPORT_SYMBOL(zpool_unregister_driver); | ||
75 | |||
76 | /** | ||
77 | * zpool_evict() - evict callback from a zpool implementation. | ||
78 | * @pool: pool to evict from. | ||
79 | * @handle: handle to evict. | ||
80 | * | ||
81 | * This can be used by zpool implementations to call the | ||
82 | * evict callback from the user's zpool_ops struct. | ||
83 | */ | ||
84 | int zpool_evict(void *pool, unsigned long handle) | ||
85 | { | ||
86 | struct zpool *zpool; | ||
87 | |||
88 | spin_lock(&pools_lock); | ||
89 | list_for_each_entry(zpool, &pools_head, list) { | ||
90 | if (zpool->pool == pool) { | ||
91 | spin_unlock(&pools_lock); | ||
92 | if (!zpool->ops || !zpool->ops->evict) | ||
93 | return -EINVAL; | ||
94 | return zpool->ops->evict(zpool, handle); | ||
95 | } | ||
96 | } | ||
97 | spin_unlock(&pools_lock); | ||
98 | |||
99 | return -ENOENT; | ||
100 | } | ||
101 | EXPORT_SYMBOL(zpool_evict); | ||
102 | |||
103 | static struct zpool_driver *zpool_get_driver(char *type) | ||
104 | { | ||
105 | struct zpool_driver *driver; | ||
106 | |||
107 | spin_lock(&drivers_lock); | ||
108 | list_for_each_entry(driver, &drivers_head, list) { | ||
109 | if (!strcmp(driver->type, type)) { | ||
110 | bool got = try_module_get(driver->owner); | ||
111 | |||
112 | if (got) | ||
113 | atomic_inc(&driver->refcount); | ||
114 | spin_unlock(&drivers_lock); | ||
115 | return got ? driver : NULL; | ||
116 | } | ||
117 | } | ||
118 | |||
119 | spin_unlock(&drivers_lock); | ||
120 | return NULL; | ||
121 | } | ||
122 | |||
123 | static void zpool_put_driver(struct zpool_driver *driver) | ||
124 | { | ||
125 | atomic_dec(&driver->refcount); | ||
126 | module_put(driver->owner); | ||
127 | } | ||
128 | |||
129 | /** | ||
130 | * zpool_create_pool() - Create a new zpool | ||
131 | * @type The type of the zpool to create (e.g. zbud, zsmalloc) | ||
132 | * @gfp The GFP flags to use when allocating the pool. | ||
133 | * @ops The optional ops callback. | ||
134 | * | ||
135 | * This creates a new zpool of the specified type. The gfp flags will be | ||
136 | * used when allocating memory, if the implementation supports it. If the | ||
137 | * ops param is NULL, then the created zpool will not be shrinkable. | ||
138 | * | ||
139 | * Implementations must guarantee this to be thread-safe. | ||
140 | * | ||
141 | * Returns: New zpool on success, NULL on failure. | ||
142 | */ | ||
143 | struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) | ||
144 | { | ||
145 | struct zpool_driver *driver; | ||
146 | struct zpool *zpool; | ||
147 | |||
148 | pr_info("creating pool type %s\n", type); | ||
149 | |||
150 | driver = zpool_get_driver(type); | ||
151 | |||
152 | if (!driver) { | ||
153 | request_module(type); | ||
154 | driver = zpool_get_driver(type); | ||
155 | } | ||
156 | |||
157 | if (!driver) { | ||
158 | pr_err("no driver for type %s\n", type); | ||
159 | return NULL; | ||
160 | } | ||
161 | |||
162 | zpool = kmalloc(sizeof(*zpool), gfp); | ||
163 | if (!zpool) { | ||
164 | pr_err("couldn't create zpool - out of memory\n"); | ||
165 | zpool_put_driver(driver); | ||
166 | return NULL; | ||
167 | } | ||
168 | |||
169 | zpool->type = driver->type; | ||
170 | zpool->driver = driver; | ||
171 | zpool->pool = driver->create(gfp, ops); | ||
172 | zpool->ops = ops; | ||
173 | |||
174 | if (!zpool->pool) { | ||
175 | pr_err("couldn't create %s pool\n", type); | ||
176 | zpool_put_driver(driver); | ||
177 | kfree(zpool); | ||
178 | return NULL; | ||
179 | } | ||
180 | |||
181 | pr_info("created %s pool\n", type); | ||
182 | |||
183 | spin_lock(&pools_lock); | ||
184 | list_add(&zpool->list, &pools_head); | ||
185 | spin_unlock(&pools_lock); | ||
186 | |||
187 | return zpool; | ||
188 | } | ||
189 | |||
190 | /** | ||
191 | * zpool_destroy_pool() - Destroy a zpool | ||
192 | * @pool The zpool to destroy. | ||
193 | * | ||
194 | * Implementations must guarantee this to be thread-safe, | ||
195 | * however only when destroying different pools. The same | ||
196 | * pool should only be destroyed once, and should not be used | ||
197 | * after it is destroyed. | ||
198 | * | ||
199 | * This destroys an existing zpool. The zpool should not be in use. | ||
200 | */ | ||
201 | void zpool_destroy_pool(struct zpool *zpool) | ||
202 | { | ||
203 | pr_info("destroying pool type %s\n", zpool->type); | ||
204 | |||
205 | spin_lock(&pools_lock); | ||
206 | list_del(&zpool->list); | ||
207 | spin_unlock(&pools_lock); | ||
208 | zpool->driver->destroy(zpool->pool); | ||
209 | zpool_put_driver(zpool->driver); | ||
210 | kfree(zpool); | ||
211 | } | ||
212 | |||
213 | /** | ||
214 | * zpool_get_type() - Get the type of the zpool | ||
215 | * @pool The zpool to check | ||
216 | * | ||
217 | * This returns the type of the pool. | ||
218 | * | ||
219 | * Implementations must guarantee this to be thread-safe. | ||
220 | * | ||
221 | * Returns: The type of zpool. | ||
222 | */ | ||
223 | char *zpool_get_type(struct zpool *zpool) | ||
224 | { | ||
225 | return zpool->type; | ||
226 | } | ||
227 | |||
228 | /** | ||
229 | * zpool_malloc() - Allocate memory | ||
230 | * @pool The zpool to allocate from. | ||
231 | * @size The amount of memory to allocate. | ||
232 | * @gfp The GFP flags to use when allocating memory. | ||
233 | * @handle Pointer to the handle to set | ||
234 | * | ||
235 | * This allocates the requested amount of memory from the pool. | ||
236 | * The gfp flags will be used when allocating memory, if the | ||
237 | * implementation supports it. The provided @handle will be | ||
238 | * set to the allocated object handle. | ||
239 | * | ||
240 | * Implementations must guarantee this to be thread-safe. | ||
241 | * | ||
242 | * Returns: 0 on success, negative value on error. | ||
243 | */ | ||
244 | int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, | ||
245 | unsigned long *handle) | ||
246 | { | ||
247 | return zpool->driver->malloc(zpool->pool, size, gfp, handle); | ||
248 | } | ||
249 | |||
250 | /** | ||
251 | * zpool_free() - Free previously allocated memory | ||
252 | * @pool The zpool that allocated the memory. | ||
253 | * @handle The handle to the memory to free. | ||
254 | * | ||
255 | * This frees previously allocated memory. This does not guarantee | ||
256 | * that the pool will actually free memory, only that the memory | ||
257 | * in the pool will become available for use by the pool. | ||
258 | * | ||
259 | * Implementations must guarantee this to be thread-safe, | ||
260 | * however only when freeing different handles. The same | ||
261 | * handle should only be freed once, and should not be used | ||
262 | * after freeing. | ||
263 | */ | ||
264 | void zpool_free(struct zpool *zpool, unsigned long handle) | ||
265 | { | ||
266 | zpool->driver->free(zpool->pool, handle); | ||
267 | } | ||
268 | |||
269 | /** | ||
270 | * zpool_shrink() - Shrink the pool size | ||
271 | * @pool The zpool to shrink. | ||
272 | * @pages The number of pages to shrink the pool. | ||
273 | * @reclaimed The number of pages successfully evicted. | ||
274 | * | ||
275 | * This attempts to shrink the actual memory size of the pool | ||
276 | * by evicting currently used handle(s). If the pool was | ||
277 | * created with no zpool_ops, or the evict call fails for any | ||
278 | * of the handles, this will fail. If non-NULL, the @reclaimed | ||
279 | * parameter will be set to the number of pages reclaimed, | ||
280 | * which may be more than the number of pages requested. | ||
281 | * | ||
282 | * Implementations must guarantee this to be thread-safe. | ||
283 | * | ||
284 | * Returns: 0 on success, negative value on error/failure. | ||
285 | */ | ||
286 | int zpool_shrink(struct zpool *zpool, unsigned int pages, | ||
287 | unsigned int *reclaimed) | ||
288 | { | ||
289 | return zpool->driver->shrink(zpool->pool, pages, reclaimed); | ||
290 | } | ||
291 | |||
292 | /** | ||
293 | * zpool_map_handle() - Map a previously allocated handle into memory | ||
294 | * @pool The zpool that the handle was allocated from | ||
295 | * @handle The handle to map | ||
296 | * @mm How the memory should be mapped | ||
297 | * | ||
298 | * This maps a previously allocated handle into memory. The @mm | ||
299 | * param indicates to the implementation how the memory will be | ||
300 | * used, i.e. read-only, write-only, read-write. If the | ||
301 | * implementation does not support it, the memory will be treated | ||
302 | * as read-write. | ||
303 | * | ||
304 | * This may hold locks, disable interrupts, and/or preemption, | ||
305 | * and zpool_unmap_handle() must be called to undo those | ||
306 | * actions. The code that uses the mapped handle should complete | ||
307 | * its operations on the mapped handle memory quickly and unmap | ||
308 | * as soon as possible. As the implementation may use per-cpu | ||
309 | * data, multiple handles should not be mapped concurrently on | ||
310 | * any cpu. | ||
311 | * | ||
312 | * Returns: A pointer to the handle's mapped memory area. | ||
313 | */ | ||
314 | void *zpool_map_handle(struct zpool *zpool, unsigned long handle, | ||
315 | enum zpool_mapmode mapmode) | ||
316 | { | ||
317 | return zpool->driver->map(zpool->pool, handle, mapmode); | ||
318 | } | ||
319 | |||
320 | /** | ||
321 | * zpool_unmap_handle() - Unmap a previously mapped handle | ||
322 | * @zpool: The zpool that the handle was allocated from. | ||
323 | * @handle: The handle to unmap. | ||
324 | * | ||
325 | * This unmaps a previously mapped handle. Any locks or other | ||
326 | * actions that the implementation took in zpool_map_handle() | ||
327 | * will be undone here. The memory area returned from | ||
328 | * zpool_map_handle() should no longer be used after this. | ||
329 | */ | ||
330 | void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) | ||
331 | { | ||
332 | zpool->driver->unmap(zpool->pool, handle); | ||
333 | } | ||
334 | |||
335 | /** | ||
336 | * zpool_get_total_size() - The total size of the pool | ||
337 | * @zpool: The zpool to check. | ||
338 | * | ||
339 | * This returns the total size in bytes of the pool. | ||
340 | * | ||
341 | * Returns: Total size of the zpool in bytes. | ||
342 | */ | ||
343 | u64 zpool_get_total_size(struct zpool *zpool) | ||
344 | { | ||
345 | return zpool->driver->total_size(zpool->pool); | ||
346 | } | ||
347 | |||
348 | static int __init init_zpool(void) | ||
349 | { | ||
350 | pr_info("loaded\n"); | ||
351 | return 0; | ||
352 | } | ||
353 | |||
354 | static void __exit exit_zpool(void) | ||
355 | { | ||
356 | pr_info("unloaded\n"); | ||
357 | } | ||
358 | |||
359 | module_init(init_zpool); | ||
360 | module_exit(exit_zpool); | ||
361 | |||
362 | MODULE_LICENSE("GPL"); | ||
363 | MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); | ||
364 | MODULE_DESCRIPTION("Common API for compressed memory storage"); | ||
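Taken together, the functions above give a client an allocate/map/copy/unmap/free lifecycle that is independent of the backend. A rough sketch of typical usage, assuming a zswap-like caller (the example_store() name and its arguments are illustrative only; error handling is trimmed):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/printk.h>
#include <linux/string.h>
#include <linux/zpool.h>

static int example_store(const void *data, size_t len)
{
	struct zpool *pool;
	unsigned long handle;
	char *buf;

	pool = zpool_create_pool("zbud", __GFP_NORETRY | __GFP_NOWARN, NULL);
	if (!pool)
		return -ENOMEM;

	if (zpool_malloc(pool, len, __GFP_NORETRY | __GFP_NOWARN, &handle)) {
		zpool_destroy_pool(pool);
		return -ENOMEM;
	}

	/* map briefly, copy, and unmap promptly, per zpool_map_handle() above */
	buf = zpool_map_handle(pool, handle, ZPOOL_MM_RW);
	memcpy(buf, data, len);
	zpool_unmap_handle(pool, handle);

	pr_info("pool now holds %llu bytes\n",
		(unsigned long long)zpool_get_total_size(pool));

	zpool_free(pool, handle);
	zpool_destroy_pool(pool);
	return 0;
}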
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index fe78189624cf..4e2fc83cb394 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
@@ -92,6 +92,7 @@ | |||
92 | #include <linux/spinlock.h> | 92 | #include <linux/spinlock.h> |
93 | #include <linux/types.h> | 93 | #include <linux/types.h> |
94 | #include <linux/zsmalloc.h> | 94 | #include <linux/zsmalloc.h> |
95 | #include <linux/zpool.h> | ||
95 | 96 | ||
96 | /* | 97 | /* |
97 | * This must be power of 2 and greater than or equal to sizeof(link_free). | 98 | * This must be power of 2 and greater than or equal to sizeof(link_free). |
@@ -240,6 +241,81 @@ struct mapping_area { | |||
240 | enum zs_mapmode vm_mm; /* mapping mode */ | 241 | enum zs_mapmode vm_mm; /* mapping mode */ |
241 | }; | 242 | }; |
242 | 243 | ||
244 | /* zpool driver */ | ||
245 | |||
246 | #ifdef CONFIG_ZPOOL | ||
247 | |||
248 | static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) | ||
249 | { | ||
250 | return zs_create_pool(gfp); | ||
251 | } | ||
252 | |||
253 | static void zs_zpool_destroy(void *pool) | ||
254 | { | ||
255 | zs_destroy_pool(pool); | ||
256 | } | ||
257 | |||
258 | static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, | ||
259 | unsigned long *handle) | ||
260 | { | ||
261 | *handle = zs_malloc(pool, size); | ||
262 | return *handle ? 0 : -1; | ||
263 | } | ||
264 | static void zs_zpool_free(void *pool, unsigned long handle) | ||
265 | { | ||
266 | zs_free(pool, handle); | ||
267 | } | ||
268 | |||
269 | static int zs_zpool_shrink(void *pool, unsigned int pages, | ||
270 | unsigned int *reclaimed) | ||
271 | { | ||
272 | return -EINVAL; | ||
273 | } | ||
274 | |||
275 | static void *zs_zpool_map(void *pool, unsigned long handle, | ||
276 | enum zpool_mapmode mm) | ||
277 | { | ||
278 | enum zs_mapmode zs_mm; | ||
279 | |||
280 | switch (mm) { | ||
281 | case ZPOOL_MM_RO: | ||
282 | zs_mm = ZS_MM_RO; | ||
283 | break; | ||
284 | case ZPOOL_MM_WO: | ||
285 | zs_mm = ZS_MM_WO; | ||
286 | break; | ||
287 | case ZPOOL_MM_RW: /* fallthru */ | ||
288 | default: | ||
289 | zs_mm = ZS_MM_RW; | ||
290 | break; | ||
291 | } | ||
292 | |||
293 | return zs_map_object(pool, handle, zs_mm); | ||
294 | } | ||
295 | static void zs_zpool_unmap(void *pool, unsigned long handle) | ||
296 | { | ||
297 | zs_unmap_object(pool, handle); | ||
298 | } | ||
299 | |||
300 | static u64 zs_zpool_total_size(void *pool) | ||
301 | { | ||
302 | return zs_get_total_size_bytes(pool); | ||
303 | } | ||
304 | |||
305 | static struct zpool_driver zs_zpool_driver = { | ||
306 | .type = "zsmalloc", | ||
307 | .owner = THIS_MODULE, | ||
308 | .create = zs_zpool_create, | ||
309 | .destroy = zs_zpool_destroy, | ||
310 | .malloc = zs_zpool_malloc, | ||
311 | .free = zs_zpool_free, | ||
312 | .shrink = zs_zpool_shrink, | ||
313 | .map = zs_zpool_map, | ||
314 | .unmap = zs_zpool_unmap, | ||
315 | .total_size = zs_zpool_total_size, | ||
316 | }; | ||
317 | |||
318 | #endif /* CONFIG_ZPOOL */ | ||
243 | 319 | ||
244 | /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ | 320 | /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ |
245 | static DEFINE_PER_CPU(struct mapping_area, zs_map_area); | 321 | static DEFINE_PER_CPU(struct mapping_area, zs_map_area); |
@@ -690,7 +766,7 @@ static inline void __zs_cpu_down(struct mapping_area *area) | |||
690 | static inline void *__zs_map_object(struct mapping_area *area, | 766 | static inline void *__zs_map_object(struct mapping_area *area, |
691 | struct page *pages[2], int off, int size) | 767 | struct page *pages[2], int off, int size) |
692 | { | 768 | { |
693 | BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages)); | 769 | BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); |
694 | area->vm_addr = area->vm->addr; | 770 | area->vm_addr = area->vm->addr; |
695 | return area->vm_addr + off; | 771 | return area->vm_addr + off; |
696 | } | 772 | } |
@@ -814,6 +890,10 @@ static void zs_exit(void) | |||
814 | { | 890 | { |
815 | int cpu; | 891 | int cpu; |
816 | 892 | ||
893 | #ifdef CONFIG_ZPOOL | ||
894 | zpool_unregister_driver(&zs_zpool_driver); | ||
895 | #endif | ||
896 | |||
817 | cpu_notifier_register_begin(); | 897 | cpu_notifier_register_begin(); |
818 | 898 | ||
819 | for_each_online_cpu(cpu) | 899 | for_each_online_cpu(cpu) |
@@ -840,6 +920,10 @@ static int zs_init(void) | |||
840 | 920 | ||
841 | cpu_notifier_register_done(); | 921 | cpu_notifier_register_done(); |
842 | 922 | ||
923 | #ifdef CONFIG_ZPOOL | ||
924 | zpool_register_driver(&zs_zpool_driver); | ||
925 | #endif | ||
926 | |||
843 | return 0; | 927 | return 0; |
844 | fail: | 928 | fail: |
845 | zs_exit(); | 929 | zs_exit(); |
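The zs_zpool_* block above is effectively the template any allocator follows to become a zpool backend: fill in a struct zpool_driver and register it at module init. A condensed, hypothetical sketch of the same pattern (every stub_* name is a placeholder, not real kernel code; the callback signatures mirror the zs_zpool_* functions above):

#include <linux/module.h>
#include <linux/zpool.h>

static void *stub_create(gfp_t gfp, struct zpool_ops *ops)
{
	return (void *)1;	/* dummy token; a real driver returns its pool */
}
static void stub_destroy(void *pool) { }
static int stub_malloc(void *pool, size_t size, gfp_t gfp, unsigned long *handle)
{
	return -ENOMEM;
}
static void stub_free(void *pool, unsigned long handle) { }
static int stub_shrink(void *pool, unsigned int pages, unsigned int *reclaimed)
{
	return -EINVAL;		/* no eviction support, like zsmalloc */
}
static void *stub_map(void *pool, unsigned long handle, enum zpool_mapmode mm)
{
	return NULL;
}
static void stub_unmap(void *pool, unsigned long handle) { }
static u64 stub_total_size(void *pool) { return 0; }

static struct zpool_driver stub_zpool_driver = {
	.type		= "stub",
	.owner		= THIS_MODULE,
	.create		= stub_create,
	.destroy	= stub_destroy,
	.malloc		= stub_malloc,
	.free		= stub_free,
	.shrink		= stub_shrink,
	.map		= stub_map,
	.unmap		= stub_unmap,
	.total_size	= stub_total_size,
};

static int __init stub_init(void)
{
	zpool_register_driver(&stub_zpool_driver);
	return 0;
}
module_init(stub_init);

static void __exit stub_exit(void)
{
	zpool_unregister_driver(&stub_zpool_driver);
}
module_exit(stub_exit);

MODULE_LICENSE("GPL");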
diff --git a/mm/zswap.c b/mm/zswap.c index 008388fe7b0f..032c21eeab2b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
@@ -34,7 +34,7 @@ | |||
34 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
35 | #include <linux/crypto.h> | 35 | #include <linux/crypto.h> |
36 | #include <linux/mempool.h> | 36 | #include <linux/mempool.h> |
37 | #include <linux/zbud.h> | 37 | #include <linux/zpool.h> |
38 | 38 | ||
39 | #include <linux/mm_types.h> | 39 | #include <linux/mm_types.h> |
40 | #include <linux/page-flags.h> | 40 | #include <linux/page-flags.h> |
@@ -45,8 +45,8 @@ | |||
45 | /********************************* | 45 | /********************************* |
46 | * statistics | 46 | * statistics |
47 | **********************************/ | 47 | **********************************/ |
48 | /* Number of memory pages used by the compressed pool */ | 48 | /* Total bytes used by the compressed storage */ |
49 | static u64 zswap_pool_pages; | 49 | static u64 zswap_pool_total_size; |
50 | /* The number of compressed pages currently stored in zswap */ | 50 | /* The number of compressed pages currently stored in zswap */ |
51 | static atomic_t zswap_stored_pages = ATOMIC_INIT(0); | 51 | static atomic_t zswap_stored_pages = ATOMIC_INIT(0); |
52 | 52 | ||
@@ -89,8 +89,13 @@ static unsigned int zswap_max_pool_percent = 20; | |||
89 | module_param_named(max_pool_percent, | 89 | module_param_named(max_pool_percent, |
90 | zswap_max_pool_percent, uint, 0644); | 90 | zswap_max_pool_percent, uint, 0644); |
91 | 91 | ||
92 | /* zbud_pool is shared by all of zswap backend */ | 92 | /* Compressed storage to use */ |
93 | static struct zbud_pool *zswap_pool; | 93 | #define ZSWAP_ZPOOL_DEFAULT "zbud" |
94 | static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; | ||
95 | module_param_named(zpool, zswap_zpool_type, charp, 0444); | ||
96 | |||
97 | /* zpool is shared by all of zswap backend */ | ||
98 | static struct zpool *zswap_pool; | ||
94 | 99 | ||
95 | /********************************* | 100 | /********************************* |
96 | * compression functions | 101 | * compression functions |
@@ -168,7 +173,7 @@ static void zswap_comp_exit(void) | |||
168 | * be held while changing the refcount. Since the lock must | 173 | * be held while changing the refcount. Since the lock must |
169 | * be held, there is no reason to also make refcount atomic. | 174 | * be held, there is no reason to also make refcount atomic. |
170 | * offset - the swap offset for the entry. Index into the red-black tree. | 175 | * offset - the swap offset for the entry. Index into the red-black tree. |
171 | * handle - zbud allocation handle that stores the compressed page data | 176 | * handle - zpool allocation handle that stores the compressed page data |
172 | * length - the length in bytes of the compressed page data. Needed during | 177 | * length - the length in bytes of the compressed page data. Needed during |
173 | * decompression | 178 | * decompression |
174 | */ | 179 | */ |
@@ -284,15 +289,15 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) | |||
284 | } | 289 | } |
285 | 290 | ||
286 | /* | 291 | /* |
287 | * Carries out the common pattern of freeing an entry's zbud allocation, | 292 | * Carries out the common pattern of freeing an entry's zpool allocation, |
288 | * freeing the entry itself, and decrementing the number of stored pages. | 293 | * freeing the entry itself, and decrementing the number of stored pages. |
289 | */ | 294 | */ |
290 | static void zswap_free_entry(struct zswap_entry *entry) | 295 | static void zswap_free_entry(struct zswap_entry *entry) |
291 | { | 296 | { |
292 | zbud_free(zswap_pool, entry->handle); | 297 | zpool_free(zswap_pool, entry->handle); |
293 | zswap_entry_cache_free(entry); | 298 | zswap_entry_cache_free(entry); |
294 | atomic_dec(&zswap_stored_pages); | 299 | atomic_dec(&zswap_stored_pages); |
295 | zswap_pool_pages = zbud_get_pool_size(zswap_pool); | 300 | zswap_pool_total_size = zpool_get_total_size(zswap_pool); |
296 | } | 301 | } |
297 | 302 | ||
298 | /* caller must hold the tree lock */ | 303 | /* caller must hold the tree lock */ |
@@ -409,7 +414,7 @@ cleanup: | |||
409 | static bool zswap_is_full(void) | 414 | static bool zswap_is_full(void) |
410 | { | 415 | { |
411 | return totalram_pages * zswap_max_pool_percent / 100 < | 416 | return totalram_pages * zswap_max_pool_percent / 100 < |
412 | zswap_pool_pages; | 417 | DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); |
413 | } | 418 | } |
414 | 419 | ||
415 | /********************************* | 420 | /********************************* |
@@ -525,7 +530,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, | |||
525 | * the swap cache, the compressed version stored by zswap can be | 530 | * the swap cache, the compressed version stored by zswap can be |
526 | * freed. | 531 | * freed. |
527 | */ | 532 | */ |
528 | static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) | 533 | static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) |
529 | { | 534 | { |
530 | struct zswap_header *zhdr; | 535 | struct zswap_header *zhdr; |
531 | swp_entry_t swpentry; | 536 | swp_entry_t swpentry; |
@@ -541,9 +546,9 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) | |||
541 | }; | 546 | }; |
542 | 547 | ||
543 | /* extract swpentry from data */ | 548 | /* extract swpentry from data */ |
544 | zhdr = zbud_map(pool, handle); | 549 | zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); |
545 | swpentry = zhdr->swpentry; /* here */ | 550 | swpentry = zhdr->swpentry; /* here */ |
546 | zbud_unmap(pool, handle); | 551 | zpool_unmap_handle(pool, handle); |
547 | tree = zswap_trees[swp_type(swpentry)]; | 552 | tree = zswap_trees[swp_type(swpentry)]; |
548 | offset = swp_offset(swpentry); | 553 | offset = swp_offset(swpentry); |
549 | 554 | ||
@@ -573,13 +578,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) | |||
573 | case ZSWAP_SWAPCACHE_NEW: /* page is locked */ | 578 | case ZSWAP_SWAPCACHE_NEW: /* page is locked */ |
574 | /* decompress */ | 579 | /* decompress */ |
575 | dlen = PAGE_SIZE; | 580 | dlen = PAGE_SIZE; |
576 | src = (u8 *)zbud_map(zswap_pool, entry->handle) + | 581 | src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, |
577 | sizeof(struct zswap_header); | 582 | ZPOOL_MM_RO) + sizeof(struct zswap_header); |
578 | dst = kmap_atomic(page); | 583 | dst = kmap_atomic(page); |
579 | ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, | 584 | ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, |
580 | entry->length, dst, &dlen); | 585 | entry->length, dst, &dlen); |
581 | kunmap_atomic(dst); | 586 | kunmap_atomic(dst); |
582 | zbud_unmap(zswap_pool, entry->handle); | 587 | zpool_unmap_handle(zswap_pool, entry->handle); |
583 | BUG_ON(ret); | 588 | BUG_ON(ret); |
584 | BUG_ON(dlen != PAGE_SIZE); | 589 | BUG_ON(dlen != PAGE_SIZE); |
585 | 590 | ||
@@ -652,7 +657,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
652 | /* reclaim space if needed */ | 657 | /* reclaim space if needed */ |
653 | if (zswap_is_full()) { | 658 | if (zswap_is_full()) { |
654 | zswap_pool_limit_hit++; | 659 | zswap_pool_limit_hit++; |
655 | if (zbud_reclaim_page(zswap_pool, 8)) { | 660 | if (zpool_shrink(zswap_pool, 1, NULL)) { |
656 | zswap_reject_reclaim_fail++; | 661 | zswap_reject_reclaim_fail++; |
657 | ret = -ENOMEM; | 662 | ret = -ENOMEM; |
658 | goto reject; | 663 | goto reject; |
@@ -679,7 +684,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
679 | 684 | ||
680 | /* store */ | 685 | /* store */ |
681 | len = dlen + sizeof(struct zswap_header); | 686 | len = dlen + sizeof(struct zswap_header); |
682 | ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, | 687 | ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, |
683 | &handle); | 688 | &handle); |
684 | if (ret == -ENOSPC) { | 689 | if (ret == -ENOSPC) { |
685 | zswap_reject_compress_poor++; | 690 | zswap_reject_compress_poor++; |
@@ -689,11 +694,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
689 | zswap_reject_alloc_fail++; | 694 | zswap_reject_alloc_fail++; |
690 | goto freepage; | 695 | goto freepage; |
691 | } | 696 | } |
692 | zhdr = zbud_map(zswap_pool, handle); | 697 | zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW); |
693 | zhdr->swpentry = swp_entry(type, offset); | 698 | zhdr->swpentry = swp_entry(type, offset); |
694 | buf = (u8 *)(zhdr + 1); | 699 | buf = (u8 *)(zhdr + 1); |
695 | memcpy(buf, dst, dlen); | 700 | memcpy(buf, dst, dlen); |
696 | zbud_unmap(zswap_pool, handle); | 701 | zpool_unmap_handle(zswap_pool, handle); |
697 | put_cpu_var(zswap_dstmem); | 702 | put_cpu_var(zswap_dstmem); |
698 | 703 | ||
699 | /* populate entry */ | 704 | /* populate entry */ |
@@ -716,7 +721,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
716 | 721 | ||
717 | /* update stats */ | 722 | /* update stats */ |
718 | atomic_inc(&zswap_stored_pages); | 723 | atomic_inc(&zswap_stored_pages); |
719 | zswap_pool_pages = zbud_get_pool_size(zswap_pool); | 724 | zswap_pool_total_size = zpool_get_total_size(zswap_pool); |
720 | 725 | ||
721 | return 0; | 726 | return 0; |
722 | 727 | ||
@@ -752,13 +757,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, | |||
752 | 757 | ||
753 | /* decompress */ | 758 | /* decompress */ |
754 | dlen = PAGE_SIZE; | 759 | dlen = PAGE_SIZE; |
755 | src = (u8 *)zbud_map(zswap_pool, entry->handle) + | 760 | src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, |
756 | sizeof(struct zswap_header); | 761 | ZPOOL_MM_RO) + sizeof(struct zswap_header); |
757 | dst = kmap_atomic(page); | 762 | dst = kmap_atomic(page); |
758 | ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, | 763 | ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, |
759 | dst, &dlen); | 764 | dst, &dlen); |
760 | kunmap_atomic(dst); | 765 | kunmap_atomic(dst); |
761 | zbud_unmap(zswap_pool, entry->handle); | 766 | zpool_unmap_handle(zswap_pool, entry->handle); |
762 | BUG_ON(ret); | 767 | BUG_ON(ret); |
763 | 768 | ||
764 | spin_lock(&tree->lock); | 769 | spin_lock(&tree->lock); |
@@ -811,7 +816,7 @@ static void zswap_frontswap_invalidate_area(unsigned type) | |||
811 | zswap_trees[type] = NULL; | 816 | zswap_trees[type] = NULL; |
812 | } | 817 | } |
813 | 818 | ||
814 | static struct zbud_ops zswap_zbud_ops = { | 819 | static struct zpool_ops zswap_zpool_ops = { |
815 | .evict = zswap_writeback_entry | 820 | .evict = zswap_writeback_entry |
816 | }; | 821 | }; |
817 | 822 | ||
@@ -869,8 +874,8 @@ static int __init zswap_debugfs_init(void) | |||
869 | zswap_debugfs_root, &zswap_written_back_pages); | 874 | zswap_debugfs_root, &zswap_written_back_pages); |
870 | debugfs_create_u64("duplicate_entry", S_IRUGO, | 875 | debugfs_create_u64("duplicate_entry", S_IRUGO, |
871 | zswap_debugfs_root, &zswap_duplicate_entry); | 876 | zswap_debugfs_root, &zswap_duplicate_entry); |
872 | debugfs_create_u64("pool_pages", S_IRUGO, | 877 | debugfs_create_u64("pool_total_size", S_IRUGO, |
873 | zswap_debugfs_root, &zswap_pool_pages); | 878 | zswap_debugfs_root, &zswap_pool_total_size); |
874 | debugfs_create_atomic_t("stored_pages", S_IRUGO, | 879 | debugfs_create_atomic_t("stored_pages", S_IRUGO, |
875 | zswap_debugfs_root, &zswap_stored_pages); | 880 | zswap_debugfs_root, &zswap_stored_pages); |
876 | 881 | ||
@@ -895,16 +900,26 @@ static void __exit zswap_debugfs_exit(void) { } | |||
895 | **********************************/ | 900 | **********************************/ |
896 | static int __init init_zswap(void) | 901 | static int __init init_zswap(void) |
897 | { | 902 | { |
903 | gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; | ||
904 | |||
898 | if (!zswap_enabled) | 905 | if (!zswap_enabled) |
899 | return 0; | 906 | return 0; |
900 | 907 | ||
901 | pr_info("loading zswap\n"); | 908 | pr_info("loading zswap\n"); |
902 | 909 | ||
903 | zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); | 910 | zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops); |
911 | if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { | ||
912 | pr_info("%s zpool not available\n", zswap_zpool_type); | ||
913 | zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; | ||
914 | zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, | ||
915 | &zswap_zpool_ops); | ||
916 | } | ||
904 | if (!zswap_pool) { | 917 | if (!zswap_pool) { |
905 | pr_err("zbud pool creation failed\n"); | 918 | pr_err("%s zpool not available\n", zswap_zpool_type); |
919 | pr_err("zpool creation failed\n"); | ||
906 | goto error; | 920 | goto error; |
907 | } | 921 | } |
922 | pr_info("using %s pool\n", zswap_zpool_type); | ||
908 | 923 | ||
909 | if (zswap_entry_cache_create()) { | 924 | if (zswap_entry_cache_create()) { |
910 | pr_err("entry cache creation failed\n"); | 925 | pr_err("entry cache creation failed\n"); |
@@ -928,7 +943,7 @@ pcpufail: | |||
928 | compfail: | 943 | compfail: |
929 | zswap_entry_cache_destory(); | 944 | zswap_entry_cache_destory(); |
930 | cachefail: | 945 | cachefail: |
931 | zbud_destroy_pool(zswap_pool); | 946 | zpool_destroy_pool(zswap_pool); |
932 | error: | 947 | error: |
933 | return -ENOMEM; | 948 | return -ENOMEM; |
934 | } | 949 | } |
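Two user-visible details of the zswap conversion are easy to miss. First, accounting is now in bytes, so zswap_is_full() divides back into pages before comparing against the percentage cap; a worked illustration of the same arithmetic (numbers chosen only as an example):

/* Illustrative mirror of zswap_is_full() above, with concrete numbers:
 * with 1048576 total pages (4 GiB of RAM at 4 KiB pages) and the default
 * 20% cap, the limit is 1048576 * 20 / 100 = 209715 pages (~819 MiB);
 * 900 MiB of stored data rounds up to 230400 pages, so the pool is full.
 */
static bool example_is_full(u64 pool_total_size)
{
	return totalram_pages * zswap_max_pool_percent / 100 <
	       DIV_ROUND_UP(pool_total_size, PAGE_SIZE);
}

Second, because the new zpool parameter is read-only (0444), the backend is selected at boot, e.g. zswap.zpool=zsmalloc on the kernel command line for a built-in zswap (assuming the usual module-parameter naming); if that pool cannot be created, init_zswap() falls back to the "zbud" default, as the hunk above shows.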