author     Linus Torvalds <torvalds@linux-foundation.org>    2014-08-07 00:14:42 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2014-08-07 00:14:42 -0400
commit     33caee39925b887a99a2400dc5c980097c3573f9 (patch)
tree       8e68ad97e1fee88c4a3f31453041f8d139f2027e /mm
parent     6456a0438b984186a0c9c8ecc9fe3d97b7ac3613 (diff)
parent     f84223087402c45179be5e7060c5736c17a7b271 (diff)
Merge branch 'akpm' (patchbomb from Andrew Morton)
Merge incoming from Andrew Morton:
 - Various misc things.
 - arch/sh updates.
 - Part of ocfs2. Review is slow.
 - Slab updates.
 - Most of -mm.
 - printk updates.
 - lib/ updates.
 - checkpatch updates.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (226 commits)
  checkpatch: update $declaration_macros, add uninitialized_var
  checkpatch: warn on missing spaces in broken up quoted
  checkpatch: fix false positives for --strict "space after cast" test
  checkpatch: fix false positive MISSING_BREAK warnings with --file
  checkpatch: add test for native c90 types in unusual order
  checkpatch: add signed generic types
  checkpatch: add short int to c variable types
  checkpatch: add for_each tests to indentation and brace tests
  checkpatch: fix brace style misuses of else and while
  checkpatch: add --fix option for a couple OPEN_BRACE misuses
  checkpatch: use the correct indentation for which()
  checkpatch: add fix_insert_line and fix_delete_line helpers
  checkpatch: add ability to insert and delete lines to patch/file
  checkpatch: add an index variable for fixed lines
  checkpatch: warn on break after goto or return with same tab indentation
  checkpatch: emit a warning on file add/move/delete
  checkpatch: add test for commit id formatting style in commit log
  checkpatch: emit fewer kmalloc_array/kcalloc conversion warnings
  checkpatch: improve "no space after cast" test
  checkpatch: allow multiple const * types
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |  54
-rw-r--r--  mm/Makefile           |   2
-rw-r--r--  mm/cma.c              | 335
-rw-r--r--  mm/filemap.c          |  27
-rw-r--r--  mm/gup.c              |  18
-rw-r--r--  mm/highmem.c          |  86
-rw-r--r--  mm/huge_memory.c      |  38
-rw-r--r--  mm/hugetlb.c          | 129
-rw-r--r--  mm/hwpoison-inject.c  |   3
-rw-r--r--  mm/internal.h         |   2
-rw-r--r--  mm/madvise.c          |   3
-rw-r--r--  mm/memcontrol.c       | 416
-rw-r--r--  mm/memory-failure.c   |  10
-rw-r--r--  mm/memory.c           |  70
-rw-r--r--  mm/memory_hotplug.c   |  45
-rw-r--r--  mm/mlock.c            |   9
-rw-r--r--  mm/mmap.c             |   5
-rw-r--r--  mm/mmu_notifier.c     |  40
-rw-r--r--  mm/oom_kill.c         |  34
-rw-r--r--  mm/page-writeback.c   |   5
-rw-r--r--  mm/page_alloc.c       | 159
-rw-r--r--  mm/readahead.c        |   3
-rw-r--r--  mm/shmem.c            |  39
-rw-r--r--  mm/slab.c             | 514
-rw-r--r--  mm/slab.h             |  24
-rw-r--r--  mm/slab_common.c      | 101
-rw-r--r--  mm/slub.c             | 221
-rw-r--r--  mm/swap.c             |  18
-rw-r--r--  mm/util.c             | 102
-rw-r--r--  mm/vmalloc.c          |  30
-rw-r--r--  mm/vmscan.c           | 274
-rw-r--r--  mm/vmstat.c           |   9
-rw-r--r--  mm/zbud.c             |  98
-rw-r--r--  mm/zpool.c            | 364
-rw-r--r--  mm/zsmalloc.c         |  86
-rw-r--r--  mm/zswap.c            |  75
36 files changed, 2141 insertions, 1307 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 3e9977a9d657..886db2158538 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -508,21 +508,34 @@ config CMA_DEBUG
508 processing calls such as dma_alloc_from_contiguous(). 508 processing calls such as dma_alloc_from_contiguous().
509 This option does not affect warning and error messages. 509 This option does not affect warning and error messages.
510 510
511config ZBUD 511config CMA_AREAS
512 tristate 512 int "Maximum count of the CMA areas"
513 default n 513 depends on CMA
514 default 7
514 help 515 help
515 A special purpose allocator for storing compressed pages. 516 CMA allows to create CMA areas for particular purpose, mainly,
516 It is designed to store up to two compressed pages per physical 517 used as device private area. This parameter sets the maximum
517 page. While this design limits storage density, it has simple and 518 number of CMA area in the system.
518 deterministic reclaim properties that make it preferable to a higher 519
519 density approach when reclaim will be used. 520 If unsure, leave the default value "7".
521
522config MEM_SOFT_DIRTY
523 bool "Track memory changes"
524 depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
525 select PROC_PAGE_MONITOR
526 help
527 This option enables memory changes tracking by introducing a
528 soft-dirty bit on pte-s. This bit it set when someone writes
529 into a page just as regular dirty bit, but unlike the latter
530 it can be cleared by hands.
531
532 See Documentation/vm/soft-dirty.txt for more details.
520 533
521config ZSWAP 534config ZSWAP
522 bool "Compressed cache for swap pages (EXPERIMENTAL)" 535 bool "Compressed cache for swap pages (EXPERIMENTAL)"
523 depends on FRONTSWAP && CRYPTO=y 536 depends on FRONTSWAP && CRYPTO=y
524 select CRYPTO_LZO 537 select CRYPTO_LZO
525 select ZBUD 538 select ZPOOL
526 default n 539 default n
527 help 540 help
528 A lightweight compressed cache for swap pages. It takes 541 A lightweight compressed cache for swap pages. It takes
@@ -538,17 +551,22 @@ config ZSWAP
538 they have not be fully explored on the large set of potential 551 they have not be fully explored on the large set of potential
539 configurations and workloads that exist. 552 configurations and workloads that exist.
540 553
541config MEM_SOFT_DIRTY 554config ZPOOL
542 bool "Track memory changes" 555 tristate "Common API for compressed memory storage"
543 depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS 556 default n
544 select PROC_PAGE_MONITOR
545 help 557 help
546 This option enables memory changes tracking by introducing a 558 Compressed memory storage API. This allows using either zbud or
547 soft-dirty bit on pte-s. This bit it set when someone writes 559 zsmalloc.
548 into a page just as regular dirty bit, but unlike the latter
549 it can be cleared by hands.
550 560
551 See Documentation/vm/soft-dirty.txt for more details. 561config ZBUD
562 tristate "Low density storage for compressed pages"
563 default n
564 help
565 A special purpose allocator for storing compressed pages.
566 It is designed to store up to two compressed pages per physical
567 page. While this design limits storage density, it has simple and
568 deterministic reclaim properties that make it preferable to a higher
569 density approach when reclaim will be used.
552 570
553config ZSMALLOC 571config ZSMALLOC
554 tristate "Memory allocator for compressed pages" 572 tristate "Memory allocator for compressed pages"
diff --git a/mm/Makefile b/mm/Makefile
index 4064f3ec145e..632ae77e6070 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -59,6 +59,8 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
59obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 59obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
60obj-$(CONFIG_CLEANCACHE) += cleancache.o 60obj-$(CONFIG_CLEANCACHE) += cleancache.o
61obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o 61obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
62obj-$(CONFIG_ZPOOL) += zpool.o
62obj-$(CONFIG_ZBUD) += zbud.o 63obj-$(CONFIG_ZBUD) += zbud.o
63obj-$(CONFIG_ZSMALLOC) += zsmalloc.o 64obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
64obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o 65obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
66obj-$(CONFIG_CMA) += cma.o
diff --git a/mm/cma.c b/mm/cma.c
new file mode 100644
index 000000000000..c17751c0dcaf
--- /dev/null
+++ b/mm/cma.c
@@ -0,0 +1,335 @@
1/*
2 * Contiguous Memory Allocator
3 *
4 * Copyright (c) 2010-2011 by Samsung Electronics.
5 * Copyright IBM Corporation, 2013
6 * Copyright LG Electronics Inc., 2014
7 * Written by:
8 * Marek Szyprowski <m.szyprowski@samsung.com>
9 * Michal Nazarewicz <mina86@mina86.com>
10 * Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
11 * Joonsoo Kim <iamjoonsoo.kim@lge.com>
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License as
15 * published by the Free Software Foundation; either version 2 of the
16 * License or (at your optional) any later version of the license.
17 */
18
19#define pr_fmt(fmt) "cma: " fmt
20
21#ifdef CONFIG_CMA_DEBUG
22#ifndef DEBUG
23# define DEBUG
24#endif
25#endif
26
27#include <linux/memblock.h>
28#include <linux/err.h>
29#include <linux/mm.h>
30#include <linux/mutex.h>
31#include <linux/sizes.h>
32#include <linux/slab.h>
33#include <linux/log2.h>
34#include <linux/cma.h>
35
36struct cma {
37 unsigned long base_pfn;
38 unsigned long count;
39 unsigned long *bitmap;
40 unsigned int order_per_bit; /* Order of pages represented by one bit */
41 struct mutex lock;
42};
43
44static struct cma cma_areas[MAX_CMA_AREAS];
45static unsigned cma_area_count;
46static DEFINE_MUTEX(cma_mutex);
47
48phys_addr_t cma_get_base(struct cma *cma)
49{
50 return PFN_PHYS(cma->base_pfn);
51}
52
53unsigned long cma_get_size(struct cma *cma)
54{
55 return cma->count << PAGE_SHIFT;
56}
57
58static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
59{
60 return (1UL << (align_order >> cma->order_per_bit)) - 1;
61}
62
63static unsigned long cma_bitmap_maxno(struct cma *cma)
64{
65 return cma->count >> cma->order_per_bit;
66}
67
68static unsigned long cma_bitmap_pages_to_bits(struct cma *cma,
69 unsigned long pages)
70{
71 return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit;
72}
73
74static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count)
75{
76 unsigned long bitmap_no, bitmap_count;
77
78 bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit;
79 bitmap_count = cma_bitmap_pages_to_bits(cma, count);
80
81 mutex_lock(&cma->lock);
82 bitmap_clear(cma->bitmap, bitmap_no, bitmap_count);
83 mutex_unlock(&cma->lock);
84}
85
86static int __init cma_activate_area(struct cma *cma)
87{
88 int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long);
89 unsigned long base_pfn = cma->base_pfn, pfn = base_pfn;
90 unsigned i = cma->count >> pageblock_order;
91 struct zone *zone;
92
93 cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
94
95 if (!cma->bitmap)
96 return -ENOMEM;
97
98 WARN_ON_ONCE(!pfn_valid(pfn));
99 zone = page_zone(pfn_to_page(pfn));
100
101 do {
102 unsigned j;
103
104 base_pfn = pfn;
105 for (j = pageblock_nr_pages; j; --j, pfn++) {
106 WARN_ON_ONCE(!pfn_valid(pfn));
107 /*
108 * alloc_contig_range requires the pfn range
109 * specified to be in the same zone. Make this
110 * simple by forcing the entire CMA resv range
111 * to be in the same zone.
112 */
113 if (page_zone(pfn_to_page(pfn)) != zone)
114 goto err;
115 }
116 init_cma_reserved_pageblock(pfn_to_page(base_pfn));
117 } while (--i);
118
119 mutex_init(&cma->lock);
120 return 0;
121
122err:
123 kfree(cma->bitmap);
124 return -EINVAL;
125}
126
127static int __init cma_init_reserved_areas(void)
128{
129 int i;
130
131 for (i = 0; i < cma_area_count; i++) {
132 int ret = cma_activate_area(&cma_areas[i]);
133
134 if (ret)
135 return ret;
136 }
137
138 return 0;
139}
140core_initcall(cma_init_reserved_areas);
141
142/**
143 * cma_declare_contiguous() - reserve custom contiguous area
144 * @base: Base address of the reserved area optional, use 0 for any
145 * @size: Size of the reserved area (in bytes),
146 * @limit: End address of the reserved memory (optional, 0 for any).
147 * @alignment: Alignment for the CMA area, should be power of 2 or zero
148 * @order_per_bit: Order of pages represented by one bit on bitmap.
149 * @fixed: hint about where to place the reserved area
150 * @res_cma: Pointer to store the created cma region.
151 *
152 * This function reserves memory from early allocator. It should be
153 * called by arch specific code once the early allocator (memblock or bootmem)
154 * has been activated and all other subsystems have already allocated/reserved
155 * memory. This function allows to create custom reserved areas.
156 *
157 * If @fixed is true, reserve contiguous area at exactly @base. If false,
158 * reserve in range from @base to @limit.
159 */
160int __init cma_declare_contiguous(phys_addr_t base,
161 phys_addr_t size, phys_addr_t limit,
162 phys_addr_t alignment, unsigned int order_per_bit,
163 bool fixed, struct cma **res_cma)
164{
165 struct cma *cma;
166 int ret = 0;
167
168 pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n",
169 __func__, (unsigned long)size, (unsigned long)base,
170 (unsigned long)limit, (unsigned long)alignment);
171
172 if (cma_area_count == ARRAY_SIZE(cma_areas)) {
173 pr_err("Not enough slots for CMA reserved regions!\n");
174 return -ENOSPC;
175 }
176
177 if (!size)
178 return -EINVAL;
179
180 if (alignment && !is_power_of_2(alignment))
181 return -EINVAL;
182
183 /*
184 * Sanitise input arguments.
185 * Pages both ends in CMA area could be merged into adjacent unmovable
186 * migratetype page by page allocator's buddy algorithm. In the case,
187 * you couldn't get a contiguous memory, which is not what we want.
188 */
189 alignment = max(alignment,
190 (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order));
191 base = ALIGN(base, alignment);
192 size = ALIGN(size, alignment);
193 limit &= ~(alignment - 1);
194
195 /* size should be aligned with order_per_bit */
196 if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
197 return -EINVAL;
198
199 /* Reserve memory */
200 if (base && fixed) {
201 if (memblock_is_region_reserved(base, size) ||
202 memblock_reserve(base, size) < 0) {
203 ret = -EBUSY;
204 goto err;
205 }
206 } else {
207 phys_addr_t addr = memblock_alloc_range(size, alignment, base,
208 limit);
209 if (!addr) {
210 ret = -ENOMEM;
211 goto err;
212 } else {
213 base = addr;
214 }
215 }
216
217 /*
218 * Each reserved area must be initialised later, when more kernel
219 * subsystems (like slab allocator) are available.
220 */
221 cma = &cma_areas[cma_area_count];
222 cma->base_pfn = PFN_DOWN(base);
223 cma->count = size >> PAGE_SHIFT;
224 cma->order_per_bit = order_per_bit;
225 *res_cma = cma;
226 cma_area_count++;
227
228 pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M,
229 (unsigned long)base);
230 return 0;
231
232err:
233 pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
234 return ret;
235}
236
237/**
238 * cma_alloc() - allocate pages from contiguous area
239 * @cma: Contiguous memory region for which the allocation is performed.
240 * @count: Requested number of pages.
241 * @align: Requested alignment of pages (in PAGE_SIZE order).
242 *
243 * This function allocates part of contiguous memory on specific
244 * contiguous memory area.
245 */
246struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
247{
248 unsigned long mask, pfn, start = 0;
249 unsigned long bitmap_maxno, bitmap_no, bitmap_count;
250 struct page *page = NULL;
251 int ret;
252
253 if (!cma || !cma->count)
254 return NULL;
255
256 pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma,
257 count, align);
258
259 if (!count)
260 return NULL;
261
262 mask = cma_bitmap_aligned_mask(cma, align);
263 bitmap_maxno = cma_bitmap_maxno(cma);
264 bitmap_count = cma_bitmap_pages_to_bits(cma, count);
265
266 for (;;) {
267 mutex_lock(&cma->lock);
268 bitmap_no = bitmap_find_next_zero_area(cma->bitmap,
269 bitmap_maxno, start, bitmap_count, mask);
270 if (bitmap_no >= bitmap_maxno) {
271 mutex_unlock(&cma->lock);
272 break;
273 }
274 bitmap_set(cma->bitmap, bitmap_no, bitmap_count);
275 /*
276 * It's safe to drop the lock here. We've marked this region for
277 * our exclusive use. If the migration fails we will take the
278 * lock again and unmark it.
279 */
280 mutex_unlock(&cma->lock);
281
282 pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
283 mutex_lock(&cma_mutex);
284 ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
285 mutex_unlock(&cma_mutex);
286 if (ret == 0) {
287 page = pfn_to_page(pfn);
288 break;
289 }
290
291 cma_clear_bitmap(cma, pfn, count);
292 if (ret != -EBUSY)
293 break;
294
295 pr_debug("%s(): memory range at %p is busy, retrying\n",
296 __func__, pfn_to_page(pfn));
297 /* try again with a bit different memory target */
298 start = bitmap_no + mask + 1;
299 }
300
301 pr_debug("%s(): returned %p\n", __func__, page);
302 return page;
303}
304
305/**
306 * cma_release() - release allocated pages
307 * @cma: Contiguous memory region for which the allocation is performed.
308 * @pages: Allocated pages.
309 * @count: Number of allocated pages.
310 *
311 * This function releases memory allocated by alloc_cma().
312 * It returns false when provided pages do not belong to contiguous area and
313 * true otherwise.
314 */
315bool cma_release(struct cma *cma, struct page *pages, int count)
316{
317 unsigned long pfn;
318
319 if (!cma || !pages)
320 return false;
321
322 pr_debug("%s(page %p)\n", __func__, (void *)pages);
323
324 pfn = page_to_pfn(pages);
325
326 if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
327 return false;
328
329 VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
330
331 free_contig_range(pfn, count);
332 cma_clear_bitmap(cma, pfn, count);
333
334 return true;
335}
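
The new file boils down to a small three-call API: cma_declare_contiguous() reserves a region from the early allocator at boot, cma_alloc() hands out pages from it, and cma_release() returns them. As a rough illustration (not part of this patch), platform code could drive it as sketched below; my_platform_*, my_cma and the 16 MiB size are invented for the example:

/*
 * Illustrative only: a minimal sketch of how arch/platform code might use
 * the cma.c interface added above.  Names and sizes are made up.
 */
#include <linux/init.h>
#include <linux/cma.h>
#include <linux/sizes.h>
#include <linux/printk.h>

static struct cma *my_cma;

/* Called early, while memblock is still the active allocator. */
void __init my_platform_reserve(void)
{
	/* 16 MiB anywhere (base 0, limit 0), 1 page per bitmap bit, not fixed. */
	if (cma_declare_contiguous(0, SZ_16M, 0, 0, 0, false, &my_cma))
		pr_warn("my_platform: CMA reservation failed\n");
}

/* Usable after cma_init_reserved_areas() has run via core_initcall(). */
static struct page *my_platform_get_buffer(void)
{
	/* 64 pages, aligned to an order-6 (64-page) boundary. */
	return cma_alloc(my_cma, 64, 6);
}

static void my_platform_put_buffer(struct page *pages)
{
	cma_release(my_cma, pages, 64);
}
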
diff --git a/mm/filemap.c b/mm/filemap.c
index 65d44fd88c78..af19a6b079f5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -808,6 +808,17 @@ int __lock_page_killable(struct page *page)
808} 808}
809EXPORT_SYMBOL_GPL(__lock_page_killable); 809EXPORT_SYMBOL_GPL(__lock_page_killable);
810 810
811/*
812 * Return values:
813 * 1 - page is locked; mmap_sem is still held.
814 * 0 - page is not locked.
815 * mmap_sem has been released (up_read()), unless flags had both
816 * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
817 * which case mmap_sem is still held.
818 *
819 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
820 * with the page locked and the mmap_sem unperturbed.
821 */
811int __lock_page_or_retry(struct page *page, struct mm_struct *mm, 822int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
812 unsigned int flags) 823 unsigned int flags)
813{ 824{
@@ -1091,9 +1102,9 @@ no_page:
1091 if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) 1102 if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
1092 fgp_flags |= FGP_LOCK; 1103 fgp_flags |= FGP_LOCK;
1093 1104
1094 /* Init accessed so avoit atomic mark_page_accessed later */ 1105 /* Init accessed so avoid atomic mark_page_accessed later */
1095 if (fgp_flags & FGP_ACCESSED) 1106 if (fgp_flags & FGP_ACCESSED)
1096 init_page_accessed(page); 1107 __SetPageReferenced(page);
1097 1108
1098 err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); 1109 err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
1099 if (unlikely(err)) { 1110 if (unlikely(err)) {
@@ -1827,6 +1838,18 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
1827 * The goto's are kind of ugly, but this streamlines the normal case of having 1838 * The goto's are kind of ugly, but this streamlines the normal case of having
1828 * it in the page cache, and handles the special cases reasonably without 1839 * it in the page cache, and handles the special cases reasonably without
1829 * having a lot of duplicated code. 1840 * having a lot of duplicated code.
1841 *
1842 * vma->vm_mm->mmap_sem must be held on entry.
1843 *
1844 * If our return value has VM_FAULT_RETRY set, it's because
1845 * lock_page_or_retry() returned 0.
1846 * The mmap_sem has usually been released in this case.
1847 * See __lock_page_or_retry() for the exception.
1848 *
1849 * If our return value does not have VM_FAULT_RETRY set, the mmap_sem
1850 * has not been released.
1851 *
1852 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
1830 */ 1853 */
1831int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1854int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1832{ 1855{
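
The comments added above pin down a subtle calling convention: a zero return from __lock_page_or_retry(), and hence a VM_FAULT_RETRY result from filemap_fault()/handle_mm_fault(), usually means mmap_sem was already dropped on the caller's behalf. A simplified, hypothetical fault path written against that contract might look like this (everything except the documented lock behaviour is invented for the example):

/*
 * Illustrative only: the mmap_sem contract described above, seen from a
 * stripped-down architecture page-fault handler.
 */
static void my_arch_do_page_fault(struct mm_struct *mm, unsigned long address,
				  struct vm_area_struct *vma)
{
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY;
	int fault;

retry:
	down_read(&mm->mmap_sem);
	fault = handle_mm_fault(mm, vma, address, flags);

	if (fault & VM_FAULT_RETRY) {
		/*
		 * mmap_sem was released for us (FAULT_FLAG_RETRY_NOWAIT was
		 * not set), so no up_read() here; retry once without
		 * ALLOW_RETRY.
		 */
		flags &= ~FAULT_FLAG_ALLOW_RETRY;
		goto retry;
	}

	/* No RETRY bit: mmap_sem is still held and must be dropped here. */
	up_read(&mm->mmap_sem);
}
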
diff --git a/mm/gup.c b/mm/gup.c
index cc5a9e7adea7..91d044b1600d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -258,6 +258,11 @@ unmap:
258 return ret; 258 return ret;
259} 259}
260 260
261/*
262 * mmap_sem must be held on entry. If @nonblocking != NULL and
263 * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
264 * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
265 */
261static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, 266static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
262 unsigned long address, unsigned int *flags, int *nonblocking) 267 unsigned long address, unsigned int *flags, int *nonblocking)
263{ 268{
@@ -373,7 +378,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
373 * with a put_page() call when it is finished with. vmas will only 378 * with a put_page() call when it is finished with. vmas will only
374 * remain valid while mmap_sem is held. 379 * remain valid while mmap_sem is held.
375 * 380 *
376 * Must be called with mmap_sem held for read or write. 381 * Must be called with mmap_sem held. It may be released. See below.
377 * 382 *
378 * __get_user_pages walks a process's page tables and takes a reference to 383 * __get_user_pages walks a process's page tables and takes a reference to
379 * each struct page that each user address corresponds to at a given 384 * each struct page that each user address corresponds to at a given
@@ -396,7 +401,14 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
396 * 401 *
397 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO 402 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
398 * or mmap_sem contention, and if waiting is needed to pin all pages, 403 * or mmap_sem contention, and if waiting is needed to pin all pages,
399 * *@nonblocking will be set to 0. 404 * *@nonblocking will be set to 0. Further, if @gup_flags does not
405 * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
406 * this case.
407 *
408 * A caller using such a combination of @nonblocking and @gup_flags
409 * must therefore hold the mmap_sem for reading only, and recognize
410 * when it's been released. Otherwise, it must be held for either
411 * reading or writing and will not be released.
400 * 412 *
401 * In most cases, get_user_pages or get_user_pages_fast should be used 413 * In most cases, get_user_pages or get_user_pages_fast should be used
402 * instead of __get_user_pages. __get_user_pages should be used only if 414 * instead of __get_user_pages. __get_user_pages should be used only if
@@ -528,7 +540,7 @@ EXPORT_SYMBOL(__get_user_pages);
528 * such architectures, gup() will not be enough to make a subsequent access 540 * such architectures, gup() will not be enough to make a subsequent access
529 * succeed. 541 * succeed.
530 * 542 *
531 * This should be called with the mm_sem held for read. 543 * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault().
532 */ 544 */
533int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, 545int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
534 unsigned long address, unsigned int fault_flags) 546 unsigned long address, unsigned int fault_flags)
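
The reworked comments make the @nonblocking rule explicit: if the caller passes a non-NULL @nonblocking without FOLL_NOWAIT, __get_user_pages() may drop mmap_sem itself and report that by clearing the flag. A hedged sketch of the required caller pattern (my_pin_pages() and its arguments are invented):

/*
 * Illustrative only: caller pattern for @nonblocking without FOLL_NOWAIT.
 */
#include <linux/mm.h>
#include <linux/sched.h>

static long my_pin_pages(unsigned long start, unsigned long nr_pages,
			 struct page **pages)
{
	struct mm_struct *mm = current->mm;
	int locked = 1;
	long ret;

	down_read(&mm->mmap_sem);
	ret = __get_user_pages(current, mm, start, nr_pages,
			       FOLL_TOUCH, pages, NULL, &locked);
	/*
	 * __get_user_pages() may have done the up_read() for us and cleared
	 * "locked"; only unlock if we still own the semaphore.
	 */
	if (locked)
		up_read(&mm->mmap_sem);
	return ret;
}
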
diff --git a/mm/highmem.c b/mm/highmem.c
index b32b70cdaed6..123bcd3ed4f2 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -44,6 +44,66 @@ DEFINE_PER_CPU(int, __kmap_atomic_idx);
44 */ 44 */
45#ifdef CONFIG_HIGHMEM 45#ifdef CONFIG_HIGHMEM
46 46
47/*
48 * Architecture with aliasing data cache may define the following family of
49 * helper functions in its asm/highmem.h to control cache color of virtual
50 * addresses where physical memory pages are mapped by kmap.
51 */
52#ifndef get_pkmap_color
53
54/*
55 * Determine color of virtual address where the page should be mapped.
56 */
57static inline unsigned int get_pkmap_color(struct page *page)
58{
59 return 0;
60}
61#define get_pkmap_color get_pkmap_color
62
63/*
64 * Get next index for mapping inside PKMAP region for page with given color.
65 */
66static inline unsigned int get_next_pkmap_nr(unsigned int color)
67{
68 static unsigned int last_pkmap_nr;
69
70 last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
71 return last_pkmap_nr;
72}
73
74/*
75 * Determine if page index inside PKMAP region (pkmap_nr) of given color
76 * has wrapped around PKMAP region end. When this happens an attempt to
77 * flush all unused PKMAP slots is made.
78 */
79static inline int no_more_pkmaps(unsigned int pkmap_nr, unsigned int color)
80{
81 return pkmap_nr == 0;
82}
83
84/*
85 * Get the number of PKMAP entries of the given color. If no free slot is
86 * found after checking that many entries, kmap will sleep waiting for
87 * someone to call kunmap and free PKMAP slot.
88 */
89static inline int get_pkmap_entries_count(unsigned int color)
90{
91 return LAST_PKMAP;
92}
93
94/*
95 * Get head of a wait queue for PKMAP entries of the given color.
96 * Wait queues for different mapping colors should be independent to avoid
97 * unnecessary wakeups caused by freeing of slots of other colors.
98 */
99static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
100{
101 static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
102
103 return &pkmap_map_wait;
104}
105#endif
106
47unsigned long totalhigh_pages __read_mostly; 107unsigned long totalhigh_pages __read_mostly;
48EXPORT_SYMBOL(totalhigh_pages); 108EXPORT_SYMBOL(totalhigh_pages);
49 109
@@ -68,13 +128,10 @@ unsigned int nr_free_highpages (void)
68} 128}
69 129
70static int pkmap_count[LAST_PKMAP]; 130static int pkmap_count[LAST_PKMAP];
71static unsigned int last_pkmap_nr;
72static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); 131static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
73 132
74pte_t * pkmap_page_table; 133pte_t * pkmap_page_table;
75 134
76static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
77
78/* 135/*
79 * Most architectures have no use for kmap_high_get(), so let's abstract 136 * Most architectures have no use for kmap_high_get(), so let's abstract
80 * the disabling of IRQ out of the locking in that case to save on a 137 * the disabling of IRQ out of the locking in that case to save on a
@@ -161,15 +218,17 @@ static inline unsigned long map_new_virtual(struct page *page)
161{ 218{
162 unsigned long vaddr; 219 unsigned long vaddr;
163 int count; 220 int count;
221 unsigned int last_pkmap_nr;
222 unsigned int color = get_pkmap_color(page);
164 223
165start: 224start:
166 count = LAST_PKMAP; 225 count = get_pkmap_entries_count(color);
167 /* Find an empty entry */ 226 /* Find an empty entry */
168 for (;;) { 227 for (;;) {
169 last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; 228 last_pkmap_nr = get_next_pkmap_nr(color);
170 if (!last_pkmap_nr) { 229 if (no_more_pkmaps(last_pkmap_nr, color)) {
171 flush_all_zero_pkmaps(); 230 flush_all_zero_pkmaps();
172 count = LAST_PKMAP; 231 count = get_pkmap_entries_count(color);
173 } 232 }
174 if (!pkmap_count[last_pkmap_nr]) 233 if (!pkmap_count[last_pkmap_nr])
175 break; /* Found a usable entry */ 234 break; /* Found a usable entry */
@@ -181,12 +240,14 @@ start:
181 */ 240 */
182 { 241 {
183 DECLARE_WAITQUEUE(wait, current); 242 DECLARE_WAITQUEUE(wait, current);
243 wait_queue_head_t *pkmap_map_wait =
244 get_pkmap_wait_queue_head(color);
184 245
185 __set_current_state(TASK_UNINTERRUPTIBLE); 246 __set_current_state(TASK_UNINTERRUPTIBLE);
186 add_wait_queue(&pkmap_map_wait, &wait); 247 add_wait_queue(pkmap_map_wait, &wait);
187 unlock_kmap(); 248 unlock_kmap();
188 schedule(); 249 schedule();
189 remove_wait_queue(&pkmap_map_wait, &wait); 250 remove_wait_queue(pkmap_map_wait, &wait);
190 lock_kmap(); 251 lock_kmap();
191 252
192 /* Somebody else might have mapped it while we slept */ 253 /* Somebody else might have mapped it while we slept */
@@ -274,6 +335,8 @@ void kunmap_high(struct page *page)
274 unsigned long nr; 335 unsigned long nr;
275 unsigned long flags; 336 unsigned long flags;
276 int need_wakeup; 337 int need_wakeup;
338 unsigned int color = get_pkmap_color(page);
339 wait_queue_head_t *pkmap_map_wait;
277 340
278 lock_kmap_any(flags); 341 lock_kmap_any(flags);
279 vaddr = (unsigned long)page_address(page); 342 vaddr = (unsigned long)page_address(page);
@@ -299,13 +362,14 @@ void kunmap_high(struct page *page)
299 * no need for the wait-queue-head's lock. Simply 362 * no need for the wait-queue-head's lock. Simply
300 * test if the queue is empty. 363 * test if the queue is empty.
301 */ 364 */
302 need_wakeup = waitqueue_active(&pkmap_map_wait); 365 pkmap_map_wait = get_pkmap_wait_queue_head(color);
366 need_wakeup = waitqueue_active(pkmap_map_wait);
303 } 367 }
304 unlock_kmap_any(flags); 368 unlock_kmap_any(flags);
305 369
306 /* do wake-up, if needed, race-free outside of the spin lock */ 370 /* do wake-up, if needed, race-free outside of the spin lock */
307 if (need_wakeup) 371 if (need_wakeup)
308 wake_up(&pkmap_map_wait); 372 wake_up(pkmap_map_wait);
309} 373}
310 374
311EXPORT_SYMBOL(kunmap_high); 375EXPORT_SYMBOL(kunmap_high);
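
The new hooks let an architecture with an aliasing data cache colour its PKMAP slots. Roughly, such a port would define the helper family in its asm/highmem.h; the fragment below is a speculative sketch, not taken from any real port, and the 4-way colouring is invented:

/*
 * Illustrative only: a possible asm/highmem.h override.  Because the generic
 * fallbacks above sit under a single "#ifndef get_pkmap_color", a port that
 * defines this must also supply colour-aware get_next_pkmap_nr(),
 * no_more_pkmaps(), get_pkmap_entries_count() and
 * get_pkmap_wait_queue_head().
 */
#define PKMAP_COLORS		4
#define LAST_PKMAP_PER_COLOR	(LAST_PKMAP / PKMAP_COLORS)

static inline unsigned int get_pkmap_color(struct page *page)
{
	/* Colour follows the physical page so kmap aliases line up. */
	return page_to_pfn(page) % PKMAP_COLORS;
}
#define get_pkmap_color get_pkmap_color
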
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 33514d88fef9..3630d577e987 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -827,7 +827,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
827 count_vm_event(THP_FAULT_FALLBACK); 827 count_vm_event(THP_FAULT_FALLBACK);
828 return VM_FAULT_FALLBACK; 828 return VM_FAULT_FALLBACK;
829 } 829 }
830 if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) { 830 if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_TRANSHUGE))) {
831 put_page(page); 831 put_page(page);
832 count_vm_event(THP_FAULT_FALLBACK); 832 count_vm_event(THP_FAULT_FALLBACK);
833 return VM_FAULT_FALLBACK; 833 return VM_FAULT_FALLBACK;
@@ -1132,7 +1132,7 @@ alloc:
1132 goto out; 1132 goto out;
1133 } 1133 }
1134 1134
1135 if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) { 1135 if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE))) {
1136 put_page(new_page); 1136 put_page(new_page);
1137 if (page) { 1137 if (page) {
1138 split_huge_page(page); 1138 split_huge_page(page);
@@ -1681,7 +1681,7 @@ static void __split_huge_page_refcount(struct page *page,
1681 &page_tail->_count); 1681 &page_tail->_count);
1682 1682
1683 /* after clearing PageTail the gup refcount can be released */ 1683 /* after clearing PageTail the gup refcount can be released */
1684 smp_mb(); 1684 smp_mb__after_atomic();
1685 1685
1686 /* 1686 /*
1687 * retain hwpoison flag of the poisoned tail page: 1687 * retain hwpoison flag of the poisoned tail page:
@@ -1775,6 +1775,8 @@ static int __split_huge_page_map(struct page *page,
1775 if (pmd) { 1775 if (pmd) {
1776 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 1776 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1777 pmd_populate(mm, &_pmd, pgtable); 1777 pmd_populate(mm, &_pmd, pgtable);
1778 if (pmd_write(*pmd))
1779 BUG_ON(page_mapcount(page) != 1);
1778 1780
1779 haddr = address; 1781 haddr = address;
1780 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 1782 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1784,8 +1786,6 @@ static int __split_huge_page_map(struct page *page,
1784 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1786 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1785 if (!pmd_write(*pmd)) 1787 if (!pmd_write(*pmd))
1786 entry = pte_wrprotect(entry); 1788 entry = pte_wrprotect(entry);
1787 else
1788 BUG_ON(page_mapcount(page) != 1);
1789 if (!pmd_young(*pmd)) 1789 if (!pmd_young(*pmd))
1790 entry = pte_mkold(entry); 1790 entry = pte_mkold(entry);
1791 if (pmd_numa(*pmd)) 1791 if (pmd_numa(*pmd))
@@ -2233,6 +2233,30 @@ static void khugepaged_alloc_sleep(void)
2233 2233
2234static int khugepaged_node_load[MAX_NUMNODES]; 2234static int khugepaged_node_load[MAX_NUMNODES];
2235 2235
2236static bool khugepaged_scan_abort(int nid)
2237{
2238 int i;
2239
2240 /*
2241 * If zone_reclaim_mode is disabled, then no extra effort is made to
2242 * allocate memory locally.
2243 */
2244 if (!zone_reclaim_mode)
2245 return false;
2246
2247 /* If there is a count for this node already, it must be acceptable */
2248 if (khugepaged_node_load[nid])
2249 return false;
2250
2251 for (i = 0; i < MAX_NUMNODES; i++) {
2252 if (!khugepaged_node_load[i])
2253 continue;
2254 if (node_distance(nid, i) > RECLAIM_DISTANCE)
2255 return true;
2256 }
2257 return false;
2258}
2259
2236#ifdef CONFIG_NUMA 2260#ifdef CONFIG_NUMA
2237static int khugepaged_find_target_node(void) 2261static int khugepaged_find_target_node(void)
2238{ 2262{
@@ -2399,7 +2423,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2399 if (!new_page) 2423 if (!new_page)
2400 return; 2424 return;
2401 2425
2402 if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) 2426 if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE)))
2403 return; 2427 return;
2404 2428
2405 /* 2429 /*
@@ -2545,6 +2569,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2545 * hit record. 2569 * hit record.
2546 */ 2570 */
2547 node = page_to_nid(page); 2571 node = page_to_nid(page);
2572 if (khugepaged_scan_abort(node))
2573 goto out_unmap;
2548 khugepaged_node_load[node]++; 2574 khugepaged_node_load[node]++;
2549 VM_BUG_ON_PAGE(PageCompound(page), page); 2575 VM_BUG_ON_PAGE(PageCompound(page), page);
2550 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2576 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7a0a73d2fcff..eeceeeb09019 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -35,7 +35,6 @@
35#include <linux/node.h> 35#include <linux/node.h>
36#include "internal.h" 36#include "internal.h"
37 37
38const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
39unsigned long hugepages_treat_as_movable; 38unsigned long hugepages_treat_as_movable;
40 39
41int hugetlb_max_hstate __read_mostly; 40int hugetlb_max_hstate __read_mostly;
@@ -1089,6 +1088,9 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
1089 unsigned long pfn; 1088 unsigned long pfn;
1090 struct hstate *h; 1089 struct hstate *h;
1091 1090
1091 if (!hugepages_supported())
1092 return;
1093
1092 /* Set scan step to minimum hugepage size */ 1094 /* Set scan step to minimum hugepage size */
1093 for_each_hstate(h) 1095 for_each_hstate(h)
1094 if (order > huge_page_order(h)) 1096 if (order > huge_page_order(h))
@@ -1734,21 +1736,13 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1734 return sprintf(buf, "%lu\n", nr_huge_pages); 1736 return sprintf(buf, "%lu\n", nr_huge_pages);
1735} 1737}
1736 1738
1737static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 1739static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
1738 struct kobject *kobj, struct kobj_attribute *attr, 1740 struct hstate *h, int nid,
1739 const char *buf, size_t len) 1741 unsigned long count, size_t len)
1740{ 1742{
1741 int err; 1743 int err;
1742 int nid;
1743 unsigned long count;
1744 struct hstate *h;
1745 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); 1744 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
1746 1745
1747 err = kstrtoul(buf, 10, &count);
1748 if (err)
1749 goto out;
1750
1751 h = kobj_to_hstate(kobj, &nid);
1752 if (hstate_is_gigantic(h) && !gigantic_page_supported()) { 1746 if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
1753 err = -EINVAL; 1747 err = -EINVAL;
1754 goto out; 1748 goto out;
@@ -1784,6 +1778,23 @@ out:
1784 return err; 1778 return err;
1785} 1779}
1786 1780
1781static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1782 struct kobject *kobj, const char *buf,
1783 size_t len)
1784{
1785 struct hstate *h;
1786 unsigned long count;
1787 int nid;
1788 int err;
1789
1790 err = kstrtoul(buf, 10, &count);
1791 if (err)
1792 return err;
1793
1794 h = kobj_to_hstate(kobj, &nid);
1795 return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
1796}
1797
1787static ssize_t nr_hugepages_show(struct kobject *kobj, 1798static ssize_t nr_hugepages_show(struct kobject *kobj,
1788 struct kobj_attribute *attr, char *buf) 1799 struct kobj_attribute *attr, char *buf)
1789{ 1800{
@@ -1793,7 +1804,7 @@ static ssize_t nr_hugepages_show(struct kobject *kobj,
1793static ssize_t nr_hugepages_store(struct kobject *kobj, 1804static ssize_t nr_hugepages_store(struct kobject *kobj,
1794 struct kobj_attribute *attr, const char *buf, size_t len) 1805 struct kobj_attribute *attr, const char *buf, size_t len)
1795{ 1806{
1796 return nr_hugepages_store_common(false, kobj, attr, buf, len); 1807 return nr_hugepages_store_common(false, kobj, buf, len);
1797} 1808}
1798HSTATE_ATTR(nr_hugepages); 1809HSTATE_ATTR(nr_hugepages);
1799 1810
@@ -1812,7 +1823,7 @@ static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
1812static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, 1823static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
1813 struct kobj_attribute *attr, const char *buf, size_t len) 1824 struct kobj_attribute *attr, const char *buf, size_t len)
1814{ 1825{
1815 return nr_hugepages_store_common(true, kobj, attr, buf, len); 1826 return nr_hugepages_store_common(true, kobj, buf, len);
1816} 1827}
1817HSTATE_ATTR(nr_hugepages_mempolicy); 1828HSTATE_ATTR(nr_hugepages_mempolicy);
1818#endif 1829#endif
@@ -2248,36 +2259,21 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
2248 void __user *buffer, size_t *length, loff_t *ppos) 2259 void __user *buffer, size_t *length, loff_t *ppos)
2249{ 2260{
2250 struct hstate *h = &default_hstate; 2261 struct hstate *h = &default_hstate;
2251 unsigned long tmp; 2262 unsigned long tmp = h->max_huge_pages;
2252 int ret; 2263 int ret;
2253 2264
2254 if (!hugepages_supported()) 2265 if (!hugepages_supported())
2255 return -ENOTSUPP; 2266 return -ENOTSUPP;
2256 2267
2257 tmp = h->max_huge_pages;
2258
2259 if (write && hstate_is_gigantic(h) && !gigantic_page_supported())
2260 return -EINVAL;
2261
2262 table->data = &tmp; 2268 table->data = &tmp;
2263 table->maxlen = sizeof(unsigned long); 2269 table->maxlen = sizeof(unsigned long);
2264 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); 2270 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
2265 if (ret) 2271 if (ret)
2266 goto out; 2272 goto out;
2267 2273
2268 if (write) { 2274 if (write)
2269 NODEMASK_ALLOC(nodemask_t, nodes_allowed, 2275 ret = __nr_hugepages_store_common(obey_mempolicy, h,
2270 GFP_KERNEL | __GFP_NORETRY); 2276 NUMA_NO_NODE, tmp, *length);
2271 if (!(obey_mempolicy &&
2272 init_nodemask_of_mempolicy(nodes_allowed))) {
2273 NODEMASK_FREE(nodes_allowed);
2274 nodes_allowed = &node_states[N_MEMORY];
2275 }
2276 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
2277
2278 if (nodes_allowed != &node_states[N_MEMORY])
2279 NODEMASK_FREE(nodes_allowed);
2280 }
2281out: 2277out:
2282 return ret; 2278 return ret;
2283} 2279}
@@ -2754,8 +2750,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2754 * from other VMAs and let the children be SIGKILLed if they are faulting the 2750 * from other VMAs and let the children be SIGKILLed if they are faulting the
2755 * same region. 2751 * same region.
2756 */ 2752 */
2757static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, 2753static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2758 struct page *page, unsigned long address) 2754 struct page *page, unsigned long address)
2759{ 2755{
2760 struct hstate *h = hstate_vma(vma); 2756 struct hstate *h = hstate_vma(vma);
2761 struct vm_area_struct *iter_vma; 2757 struct vm_area_struct *iter_vma;
@@ -2794,8 +2790,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2794 address + huge_page_size(h), page); 2790 address + huge_page_size(h), page);
2795 } 2791 }
2796 mutex_unlock(&mapping->i_mmap_mutex); 2792 mutex_unlock(&mapping->i_mmap_mutex);
2797
2798 return 1;
2799} 2793}
2800 2794
2801/* 2795/*
@@ -2810,7 +2804,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2810{ 2804{
2811 struct hstate *h = hstate_vma(vma); 2805 struct hstate *h = hstate_vma(vma);
2812 struct page *old_page, *new_page; 2806 struct page *old_page, *new_page;
2813 int outside_reserve = 0; 2807 int ret = 0, outside_reserve = 0;
2814 unsigned long mmun_start; /* For mmu_notifiers */ 2808 unsigned long mmun_start; /* For mmu_notifiers */
2815 unsigned long mmun_end; /* For mmu_notifiers */ 2809 unsigned long mmun_end; /* For mmu_notifiers */
2816 2810
@@ -2840,14 +2834,14 @@ retry_avoidcopy:
2840 2834
2841 page_cache_get(old_page); 2835 page_cache_get(old_page);
2842 2836
2843 /* Drop page table lock as buddy allocator may be called */ 2837 /*
2838 * Drop page table lock as buddy allocator may be called. It will
2839 * be acquired again before returning to the caller, as expected.
2840 */
2844 spin_unlock(ptl); 2841 spin_unlock(ptl);
2845 new_page = alloc_huge_page(vma, address, outside_reserve); 2842 new_page = alloc_huge_page(vma, address, outside_reserve);
2846 2843
2847 if (IS_ERR(new_page)) { 2844 if (IS_ERR(new_page)) {
2848 long err = PTR_ERR(new_page);
2849 page_cache_release(old_page);
2850
2851 /* 2845 /*
2852 * If a process owning a MAP_PRIVATE mapping fails to COW, 2846 * If a process owning a MAP_PRIVATE mapping fails to COW,
2853 * it is due to references held by a child and an insufficient 2847 * it is due to references held by a child and an insufficient
@@ -2856,29 +2850,25 @@ retry_avoidcopy:
2856 * may get SIGKILLed if it later faults. 2850 * may get SIGKILLed if it later faults.
2857 */ 2851 */
2858 if (outside_reserve) { 2852 if (outside_reserve) {
2853 page_cache_release(old_page);
2859 BUG_ON(huge_pte_none(pte)); 2854 BUG_ON(huge_pte_none(pte));
2860 if (unmap_ref_private(mm, vma, old_page, address)) { 2855 unmap_ref_private(mm, vma, old_page, address);
2861 BUG_ON(huge_pte_none(pte)); 2856 BUG_ON(huge_pte_none(pte));
2862 spin_lock(ptl); 2857 spin_lock(ptl);
2863 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2858 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2864 if (likely(ptep && 2859 if (likely(ptep &&
2865 pte_same(huge_ptep_get(ptep), pte))) 2860 pte_same(huge_ptep_get(ptep), pte)))
2866 goto retry_avoidcopy; 2861 goto retry_avoidcopy;
2867 /* 2862 /*
2868 * race occurs while re-acquiring page table 2863 * race occurs while re-acquiring page table
2869 * lock, and our job is done. 2864 * lock, and our job is done.
2870 */ 2865 */
2871 return 0; 2866 return 0;
2872 }
2873 WARN_ON_ONCE(1);
2874 } 2867 }
2875 2868
2876 /* Caller expects lock to be held */ 2869 ret = (PTR_ERR(new_page) == -ENOMEM) ?
2877 spin_lock(ptl); 2870 VM_FAULT_OOM : VM_FAULT_SIGBUS;
2878 if (err == -ENOMEM) 2871 goto out_release_old;
2879 return VM_FAULT_OOM;
2880 else
2881 return VM_FAULT_SIGBUS;
2882 } 2872 }
2883 2873
2884 /* 2874 /*
@@ -2886,11 +2876,8 @@ retry_avoidcopy:
2886 * anon_vma prepared. 2876 * anon_vma prepared.
2887 */ 2877 */
2888 if (unlikely(anon_vma_prepare(vma))) { 2878 if (unlikely(anon_vma_prepare(vma))) {
2889 page_cache_release(new_page); 2879 ret = VM_FAULT_OOM;
2890 page_cache_release(old_page); 2880 goto out_release_all;
2891 /* Caller expects lock to be held */
2892 spin_lock(ptl);
2893 return VM_FAULT_OOM;
2894 } 2881 }
2895 2882
2896 copy_user_huge_page(new_page, old_page, address, vma, 2883 copy_user_huge_page(new_page, old_page, address, vma,
@@ -2900,6 +2887,7 @@ retry_avoidcopy:
2900 mmun_start = address & huge_page_mask(h); 2887 mmun_start = address & huge_page_mask(h);
2901 mmun_end = mmun_start + huge_page_size(h); 2888 mmun_end = mmun_start + huge_page_size(h);
2902 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2889 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2890
2903 /* 2891 /*
2904 * Retake the page table lock to check for racing updates 2892 * Retake the page table lock to check for racing updates
2905 * before the page tables are altered 2893 * before the page tables are altered
@@ -2920,12 +2908,13 @@ retry_avoidcopy:
2920 } 2908 }
2921 spin_unlock(ptl); 2909 spin_unlock(ptl);
2922 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2910 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2911out_release_all:
2923 page_cache_release(new_page); 2912 page_cache_release(new_page);
2913out_release_old:
2924 page_cache_release(old_page); 2914 page_cache_release(old_page);
2925 2915
2926 /* Caller expects lock to be held */ 2916 spin_lock(ptl); /* Caller expects lock to be held */
2927 spin_lock(ptl); 2917 return ret;
2928 return 0;
2929} 2918}
2930 2919
2931/* Return the pagecache page at a given address within a VMA */ 2920/* Return the pagecache page at a given address within a VMA */
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 95487c71cad5..329caf56df22 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -72,8 +72,7 @@ DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
72 72
73static void pfn_inject_exit(void) 73static void pfn_inject_exit(void)
74{ 74{
75 if (hwpoison_dir) 75 debugfs_remove_recursive(hwpoison_dir);
76 debugfs_remove_recursive(hwpoison_dir);
77} 76}
78 77
79static int pfn_inject_init(void) 78static int pfn_inject_init(void)
diff --git a/mm/internal.h b/mm/internal.h
index 7f22a11fcc66..a1b651b11c5f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -247,7 +247,7 @@ static inline void mlock_migrate_page(struct page *new, struct page *old) { }
247static inline struct page *mem_map_offset(struct page *base, int offset) 247static inline struct page *mem_map_offset(struct page *base, int offset)
248{ 248{
249 if (unlikely(offset >= MAX_ORDER_NR_PAGES)) 249 if (unlikely(offset >= MAX_ORDER_NR_PAGES))
250 return pfn_to_page(page_to_pfn(base) + offset); 250 return nth_page(base, offset);
251 return base + offset; 251 return base + offset;
252} 252}
253 253
diff --git a/mm/madvise.c b/mm/madvise.c
index a402f8fdc68e..0938b30da4ab 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -292,9 +292,6 @@ static long madvise_dontneed(struct vm_area_struct *vma,
292/* 292/*
293 * Application wants to free up the pages and associated backing store. 293 * Application wants to free up the pages and associated backing store.
294 * This is effectively punching a hole into the middle of a file. 294 * This is effectively punching a hole into the middle of a file.
295 *
296 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
297 * Other filesystems return -ENOSYS.
298 */ 295 */
299static long madvise_remove(struct vm_area_struct *vma, 296static long madvise_remove(struct vm_area_struct *vma,
300 struct vm_area_struct **prev, 297 struct vm_area_struct **prev,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f009a14918d2..90dc501eaf3f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2551,55 +2551,72 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2551 return NOTIFY_OK; 2551 return NOTIFY_OK;
2552} 2552}
2553 2553
2554 2554/**
2555/* See mem_cgroup_try_charge() for details */ 2555 * mem_cgroup_try_charge - try charging a memcg
2556enum { 2556 * @memcg: memcg to charge
2557 CHARGE_OK, /* success */ 2557 * @nr_pages: number of pages to charge
2558 CHARGE_RETRY, /* need to retry but retry is not bad */ 2558 *
2559 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 2559 * Returns 0 if @memcg was charged successfully, -EINTR if the charge
2560 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ 2560 * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
2561}; 2561 */
2562 2562static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
2563static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2563 gfp_t gfp_mask,
2564 unsigned int nr_pages, unsigned int min_pages, 2564 unsigned int nr_pages)
2565 bool invoke_oom)
2566{ 2565{
2567 unsigned long csize = nr_pages * PAGE_SIZE; 2566 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2567 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2568 struct mem_cgroup *mem_over_limit; 2568 struct mem_cgroup *mem_over_limit;
2569 struct res_counter *fail_res; 2569 struct res_counter *fail_res;
2570 unsigned long nr_reclaimed;
2570 unsigned long flags = 0; 2571 unsigned long flags = 0;
2571 int ret; 2572 unsigned long long size;
2573 int ret = 0;
2572 2574
2573 ret = res_counter_charge(&memcg->res, csize, &fail_res); 2575retry:
2576 if (consume_stock(memcg, nr_pages))
2577 goto done;
2574 2578
2575 if (likely(!ret)) { 2579 size = batch * PAGE_SIZE;
2580 if (!res_counter_charge(&memcg->res, size, &fail_res)) {
2576 if (!do_swap_account) 2581 if (!do_swap_account)
2577 return CHARGE_OK; 2582 goto done_restock;
2578 ret = res_counter_charge(&memcg->memsw, csize, &fail_res); 2583 if (!res_counter_charge(&memcg->memsw, size, &fail_res))
2579 if (likely(!ret)) 2584 goto done_restock;
2580 return CHARGE_OK; 2585 res_counter_uncharge(&memcg->res, size);
2581
2582 res_counter_uncharge(&memcg->res, csize);
2583 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2586 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2584 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2587 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2585 } else 2588 } else
2586 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2589 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2590
2591 if (batch > nr_pages) {
2592 batch = nr_pages;
2593 goto retry;
2594 }
2595
2587 /* 2596 /*
2588 * Never reclaim on behalf of optional batching, retry with a 2597 * Unlike in global OOM situations, memcg is not in a physical
2589 * single page instead. 2598 * memory shortage. Allow dying and OOM-killed tasks to
2599 * bypass the last charges so that they can exit quickly and
2600 * free their memory.
2590 */ 2601 */
2591 if (nr_pages > min_pages) 2602 if (unlikely(test_thread_flag(TIF_MEMDIE) ||
2592 return CHARGE_RETRY; 2603 fatal_signal_pending(current) ||
2604 current->flags & PF_EXITING))
2605 goto bypass;
2606
2607 if (unlikely(task_in_memcg_oom(current)))
2608 goto nomem;
2593 2609
2594 if (!(gfp_mask & __GFP_WAIT)) 2610 if (!(gfp_mask & __GFP_WAIT))
2595 return CHARGE_WOULDBLOCK; 2611 goto nomem;
2596 2612
2597 if (gfp_mask & __GFP_NORETRY) 2613 nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2598 return CHARGE_NOMEM;
2599 2614
2600 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2601 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2615 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2602 return CHARGE_RETRY; 2616 goto retry;
2617
2618 if (gfp_mask & __GFP_NORETRY)
2619 goto nomem;
2603 /* 2620 /*
2604 * Even though the limit is exceeded at this point, reclaim 2621 * Even though the limit is exceeded at this point, reclaim
2605 * may have been able to free some pages. Retry the charge 2622 * may have been able to free some pages. Retry the charge
@@ -2609,96 +2626,38 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2609 * unlikely to succeed so close to the limit, and we fall back 2626 * unlikely to succeed so close to the limit, and we fall back
2610 * to regular pages anyway in case of failure. 2627 * to regular pages anyway in case of failure.
2611 */ 2628 */
2612 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) 2629 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2613 return CHARGE_RETRY; 2630 goto retry;
2614
2615 /* 2631 /*
2616 * At task move, charge accounts can be doubly counted. So, it's 2632 * At task move, charge accounts can be doubly counted. So, it's
2617 * better to wait until the end of task_move if something is going on. 2633 * better to wait until the end of task_move if something is going on.
2618 */ 2634 */
2619 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2635 if (mem_cgroup_wait_acct_move(mem_over_limit))
2620 return CHARGE_RETRY; 2636 goto retry;
2621
2622 if (invoke_oom)
2623 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
2624
2625 return CHARGE_NOMEM;
2626}
2627
2628/**
2629 * mem_cgroup_try_charge - try charging a memcg
2630 * @memcg: memcg to charge
2631 * @nr_pages: number of pages to charge
2632 * @oom: trigger OOM if reclaim fails
2633 *
2634 * Returns 0 if @memcg was charged successfully, -EINTR if the charge
2635 * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
2636 */
2637static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
2638 gfp_t gfp_mask,
2639 unsigned int nr_pages,
2640 bool oom)
2641{
2642 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2643 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2644 int ret;
2645
2646 if (mem_cgroup_is_root(memcg))
2647 goto done;
2648 /*
2649 * Unlike in global OOM situations, memcg is not in a physical
2650 * memory shortage. Allow dying and OOM-killed tasks to
2651 * bypass the last charges so that they can exit quickly and
2652 * free their memory.
2653 */
2654 if (unlikely(test_thread_flag(TIF_MEMDIE) ||
2655 fatal_signal_pending(current) ||
2656 current->flags & PF_EXITING))
2657 goto bypass;
2658 2637
2659 if (unlikely(task_in_memcg_oom(current))) 2638 if (nr_retries--)
2660 goto nomem; 2639 goto retry;
2661 2640
2662 if (gfp_mask & __GFP_NOFAIL) 2641 if (gfp_mask & __GFP_NOFAIL)
2663 oom = false; 2642 goto bypass;
2664again:
2665 if (consume_stock(memcg, nr_pages))
2666 goto done;
2667
2668 do {
2669 bool invoke_oom = oom && !nr_oom_retries;
2670
2671 /* If killed, bypass charge */
2672 if (fatal_signal_pending(current))
2673 goto bypass;
2674 2643
2675 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, 2644 if (fatal_signal_pending(current))
2676 nr_pages, invoke_oom); 2645 goto bypass;
2677 switch (ret) {
2678 case CHARGE_OK:
2679 break;
2680 case CHARGE_RETRY: /* not in OOM situation but retry */
2681 batch = nr_pages;
2682 goto again;
2683 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2684 goto nomem;
2685 case CHARGE_NOMEM: /* OOM routine works */
2686 if (!oom || invoke_oom)
2687 goto nomem;
2688 nr_oom_retries--;
2689 break;
2690 }
2691 } while (ret != CHARGE_OK);
2692 2646
2693 if (batch > nr_pages) 2647 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
2694 refill_stock(memcg, batch - nr_pages);
2695done:
2696 return 0;
2697nomem: 2648nomem:
2698 if (!(gfp_mask & __GFP_NOFAIL)) 2649 if (!(gfp_mask & __GFP_NOFAIL))
2699 return -ENOMEM; 2650 return -ENOMEM;
2700bypass: 2651bypass:
2701 return -EINTR; 2652 memcg = root_mem_cgroup;
2653 ret = -EINTR;
2654 goto retry;
2655
2656done_restock:
2657 if (batch > nr_pages)
2658 refill_stock(memcg, batch - nr_pages);
2659done:
2660 return ret;
2702} 2661}
2703 2662
2704/** 2663/**
@@ -2712,15 +2671,14 @@ bypass:
2712 */ 2671 */
2713static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, 2672static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
2714 gfp_t gfp_mask, 2673 gfp_t gfp_mask,
2715 unsigned int nr_pages, 2674 unsigned int nr_pages)
2716 bool oom)
2717 2675
2718{ 2676{
2719 struct mem_cgroup *memcg; 2677 struct mem_cgroup *memcg;
2720 int ret; 2678 int ret;
2721 2679
2722 memcg = get_mem_cgroup_from_mm(mm); 2680 memcg = get_mem_cgroup_from_mm(mm);
2723 ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); 2681 ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages);
2724 css_put(&memcg->css); 2682 css_put(&memcg->css);
2725 if (ret == -EINTR) 2683 if (ret == -EINTR)
2726 memcg = root_mem_cgroup; 2684 memcg = root_mem_cgroup;
@@ -2738,13 +2696,11 @@ static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
2738static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, 2696static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2739 unsigned int nr_pages) 2697 unsigned int nr_pages)
2740{ 2698{
2741 if (!mem_cgroup_is_root(memcg)) { 2699 unsigned long bytes = nr_pages * PAGE_SIZE;
2742 unsigned long bytes = nr_pages * PAGE_SIZE;
2743 2700
2744 res_counter_uncharge(&memcg->res, bytes); 2701 res_counter_uncharge(&memcg->res, bytes);
2745 if (do_swap_account) 2702 if (do_swap_account)
2746 res_counter_uncharge(&memcg->memsw, bytes); 2703 res_counter_uncharge(&memcg->memsw, bytes);
2747 }
2748} 2704}
2749 2705
2750/* 2706/*
@@ -2756,9 +2712,6 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2756{ 2712{
2757 unsigned long bytes = nr_pages * PAGE_SIZE; 2713 unsigned long bytes = nr_pages * PAGE_SIZE;
2758 2714
2759 if (mem_cgroup_is_root(memcg))
2760 return;
2761
2762 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); 2715 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2763 if (do_swap_account) 2716 if (do_swap_account)
2764 res_counter_uncharge_until(&memcg->memsw, 2717 res_counter_uncharge_until(&memcg->memsw,
@@ -2842,14 +2795,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2842 } 2795 }
2843 2796
2844 pc->mem_cgroup = memcg; 2797 pc->mem_cgroup = memcg;
2845 /*
2846 * We access a page_cgroup asynchronously without lock_page_cgroup().
2847 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
2848 * is accessed after testing USED bit. To make pc->mem_cgroup visible
2849 * before USED bit, we need memory barrier here.
2850 * See mem_cgroup_add_lru_list(), etc.
2851 */
2852 smp_wmb();
2853 SetPageCgroupUsed(pc); 2798 SetPageCgroupUsed(pc);
2854 2799
2855 if (lrucare) { 2800 if (lrucare) {
@@ -2937,8 +2882,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2937 if (ret) 2882 if (ret)
2938 return ret; 2883 return ret;
2939 2884
2940 ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, 2885 ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT);
2941 oom_gfp_allowed(gfp));
2942 if (ret == -EINTR) { 2886 if (ret == -EINTR) {
2943 /* 2887 /*
2944 * mem_cgroup_try_charge() chosed to bypass to root due to 2888 * mem_cgroup_try_charge() chosed to bypass to root due to
@@ -3463,12 +3407,13 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3463 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3407 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3464 return; 3408 return;
3465 } 3409 }
3466 3410 /*
3411 * The page is freshly allocated and not visible to any
3412 * outside callers yet. Set up pc non-atomically.
3413 */
3467 pc = lookup_page_cgroup(page); 3414 pc = lookup_page_cgroup(page);
3468 lock_page_cgroup(pc);
3469 pc->mem_cgroup = memcg; 3415 pc->mem_cgroup = memcg;
3470 SetPageCgroupUsed(pc); 3416 pc->flags = PCG_USED;
3471 unlock_page_cgroup(pc);
3472} 3417}
3473 3418
3474void __memcg_kmem_uncharge_pages(struct page *page, int order) 3419void __memcg_kmem_uncharge_pages(struct page *page, int order)
@@ -3478,19 +3423,11 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
3478 3423
3479 3424
3480 pc = lookup_page_cgroup(page); 3425 pc = lookup_page_cgroup(page);
3481 /*
3482 * Fast unlocked return. Theoretically might have changed, have to
3483 * check again after locking.
3484 */
3485 if (!PageCgroupUsed(pc)) 3426 if (!PageCgroupUsed(pc))
3486 return; 3427 return;
3487 3428
3488 lock_page_cgroup(pc); 3429 memcg = pc->mem_cgroup;
3489 if (PageCgroupUsed(pc)) { 3430 pc->flags = 0;
3490 memcg = pc->mem_cgroup;
3491 ClearPageCgroupUsed(pc);
3492 }
3493 unlock_page_cgroup(pc);
3494 3431
3495 /* 3432 /*
3496 * We trust that only if there is a memcg associated with the page, it 3433 * We trust that only if there is a memcg associated with the page, it
@@ -3531,7 +3468,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
3531 for (i = 1; i < HPAGE_PMD_NR; i++) { 3468 for (i = 1; i < HPAGE_PMD_NR; i++) {
3532 pc = head_pc + i; 3469 pc = head_pc + i;
3533 pc->mem_cgroup = memcg; 3470 pc->mem_cgroup = memcg;
3534 smp_wmb();/* see __commit_charge() */
3535 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 3471 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
3536 } 3472 }
3537 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 3473 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
@@ -3687,7 +3623,6 @@ int mem_cgroup_charge_anon(struct page *page,
3687{ 3623{
3688 unsigned int nr_pages = 1; 3624 unsigned int nr_pages = 1;
3689 struct mem_cgroup *memcg; 3625 struct mem_cgroup *memcg;
3690 bool oom = true;
3691 3626
3692 if (mem_cgroup_disabled()) 3627 if (mem_cgroup_disabled())
3693 return 0; 3628 return 0;
@@ -3699,14 +3634,9 @@ int mem_cgroup_charge_anon(struct page *page,
3699 if (PageTransHuge(page)) { 3634 if (PageTransHuge(page)) {
3700 nr_pages <<= compound_order(page); 3635 nr_pages <<= compound_order(page);
3701 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 3636 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3702 /*
3703 * Never OOM-kill a process for a huge page. The
3704 * fault handler will fall back to regular pages.
3705 */
3706 oom = false;
3707 } 3637 }
3708 3638
3709 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); 3639 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages);
3710 if (!memcg) 3640 if (!memcg)
3711 return -ENOMEM; 3641 return -ENOMEM;
3712 __mem_cgroup_commit_charge(memcg, page, nr_pages, 3642 __mem_cgroup_commit_charge(memcg, page, nr_pages,
@@ -3743,7 +3673,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
3743 memcg = try_get_mem_cgroup_from_page(page); 3673 memcg = try_get_mem_cgroup_from_page(page);
3744 if (!memcg) 3674 if (!memcg)
3745 memcg = get_mem_cgroup_from_mm(mm); 3675 memcg = get_mem_cgroup_from_mm(mm);
3746 ret = mem_cgroup_try_charge(memcg, mask, 1, true); 3676 ret = mem_cgroup_try_charge(memcg, mask, 1);
3747 css_put(&memcg->css); 3677 css_put(&memcg->css);
3748 if (ret == -EINTR) 3678 if (ret == -EINTR)
3749 memcg = root_mem_cgroup; 3679 memcg = root_mem_cgroup;
@@ -3770,7 +3700,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
3770 if (!PageSwapCache(page)) { 3700 if (!PageSwapCache(page)) {
3771 struct mem_cgroup *memcg; 3701 struct mem_cgroup *memcg;
3772 3702
3773 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); 3703 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1);
3774 if (!memcg) 3704 if (!memcg)
3775 return -ENOMEM; 3705 return -ENOMEM;
3776 *memcgp = memcg; 3706 *memcgp = memcg;
@@ -3839,7 +3769,7 @@ int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
3839 return 0; 3769 return 0;
3840 } 3770 }
3841 3771
3842 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); 3772 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1);
3843 if (!memcg) 3773 if (!memcg)
3844 return -ENOMEM; 3774 return -ENOMEM;
3845 __mem_cgroup_commit_charge(memcg, page, 1, type, false); 3775 __mem_cgroup_commit_charge(memcg, page, 1, type, false);
@@ -3993,7 +3923,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
3993 * replacement page, so leave it alone when phasing out the 3923 * replacement page, so leave it alone when phasing out the
3994 * page that is unused after the migration. 3924 * page that is unused after the migration.
3995 */ 3925 */
3996 if (!end_migration && !mem_cgroup_is_root(memcg)) 3926 if (!end_migration)
3997 mem_cgroup_do_uncharge(memcg, nr_pages, ctype); 3927 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
3998 3928
3999 return memcg; 3929 return memcg;
@@ -4126,8 +4056,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
4126 * We uncharge this because swap is freed. This memcg can 4056 * We uncharge this because swap is freed. This memcg can
4127 * be obsolete one. We avoid calling css_tryget_online(). 4057 * be obsolete one. We avoid calling css_tryget_online().
4128 */ 4058 */
4129 if (!mem_cgroup_is_root(memcg)) 4059 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
4130 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
4131 mem_cgroup_swap_statistics(memcg, false); 4060 mem_cgroup_swap_statistics(memcg, false);
4132 css_put(&memcg->css); 4061 css_put(&memcg->css);
4133 } 4062 }
@@ -4817,78 +4746,24 @@ out:
4817 return retval; 4746 return retval;
4818} 4747}
4819 4748
4820
4821static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
4822 enum mem_cgroup_stat_index idx)
4823{
4824 struct mem_cgroup *iter;
4825 long val = 0;
4826
4827 /* Per-cpu values can be negative, use a signed accumulator */
4828 for_each_mem_cgroup_tree(iter, memcg)
4829 val += mem_cgroup_read_stat(iter, idx);
4830
4831 if (val < 0) /* race ? */
4832 val = 0;
4833 return val;
4834}
4835
4836static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
4837{
4838 u64 val;
4839
4840 if (!mem_cgroup_is_root(memcg)) {
4841 if (!swap)
4842 return res_counter_read_u64(&memcg->res, RES_USAGE);
4843 else
4844 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
4845 }
4846
4847 /*
4848 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
4849 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
4850 */
4851 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
4852 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
4853
4854 if (swap)
4855 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
4856
4857 return val << PAGE_SHIFT;
4858}
4859
4860static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 4749static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
4861 struct cftype *cft) 4750 struct cftype *cft)
4862{ 4751{
4863 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4752 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4864 u64 val; 4753 enum res_type type = MEMFILE_TYPE(cft->private);
4865 int name; 4754 int name = MEMFILE_ATTR(cft->private);
4866 enum res_type type;
4867
4868 type = MEMFILE_TYPE(cft->private);
4869 name = MEMFILE_ATTR(cft->private);
4870 4755
4871 switch (type) { 4756 switch (type) {
4872 case _MEM: 4757 case _MEM:
4873 if (name == RES_USAGE) 4758 return res_counter_read_u64(&memcg->res, name);
4874 val = mem_cgroup_usage(memcg, false);
4875 else
4876 val = res_counter_read_u64(&memcg->res, name);
4877 break;
4878 case _MEMSWAP: 4759 case _MEMSWAP:
4879 if (name == RES_USAGE) 4760 return res_counter_read_u64(&memcg->memsw, name);
4880 val = mem_cgroup_usage(memcg, true);
4881 else
4882 val = res_counter_read_u64(&memcg->memsw, name);
4883 break;
4884 case _KMEM: 4761 case _KMEM:
4885 val = res_counter_read_u64(&memcg->kmem, name); 4762 return res_counter_read_u64(&memcg->kmem, name);
4886 break; 4763 break;
4887 default: 4764 default:
4888 BUG(); 4765 BUG();
4889 } 4766 }
4890
4891 return val;
4892} 4767}
4893 4768
4894#ifdef CONFIG_MEMCG_KMEM 4769#ifdef CONFIG_MEMCG_KMEM
@@ -5350,7 +5225,10 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
5350 if (!t) 5225 if (!t)
5351 goto unlock; 5226 goto unlock;
5352 5227
5353 usage = mem_cgroup_usage(memcg, swap); 5228 if (!swap)
5229 usage = res_counter_read_u64(&memcg->res, RES_USAGE);
5230 else
5231 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
5354 5232
5355 /* 5233 /*
5356 * current_threshold points to threshold just below or equal to usage. 5234 * current_threshold points to threshold just below or equal to usage.
@@ -5446,15 +5324,15 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5446 5324
5447 mutex_lock(&memcg->thresholds_lock); 5325 mutex_lock(&memcg->thresholds_lock);
5448 5326
5449 if (type == _MEM) 5327 if (type == _MEM) {
5450 thresholds = &memcg->thresholds; 5328 thresholds = &memcg->thresholds;
5451 else if (type == _MEMSWAP) 5329 usage = res_counter_read_u64(&memcg->res, RES_USAGE);
5330 } else if (type == _MEMSWAP) {
5452 thresholds = &memcg->memsw_thresholds; 5331 thresholds = &memcg->memsw_thresholds;
5453 else 5332 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
5333 } else
5454 BUG(); 5334 BUG();
5455 5335
5456 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5457
5458 /* Check if a threshold crossed before adding a new one */ 5336 /* Check if a threshold crossed before adding a new one */
5459 if (thresholds->primary) 5337 if (thresholds->primary)
5460 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 5338 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
@@ -5534,18 +5412,19 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5534 int i, j, size; 5412 int i, j, size;
5535 5413
5536 mutex_lock(&memcg->thresholds_lock); 5414 mutex_lock(&memcg->thresholds_lock);
5537 if (type == _MEM) 5415
5416 if (type == _MEM) {
5538 thresholds = &memcg->thresholds; 5417 thresholds = &memcg->thresholds;
5539 else if (type == _MEMSWAP) 5418 usage = res_counter_read_u64(&memcg->res, RES_USAGE);
5419 } else if (type == _MEMSWAP) {
5540 thresholds = &memcg->memsw_thresholds; 5420 thresholds = &memcg->memsw_thresholds;
5541 else 5421 usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
5422 } else
5542 BUG(); 5423 BUG();
5543 5424
5544 if (!thresholds->primary) 5425 if (!thresholds->primary)
5545 goto unlock; 5426 goto unlock;
5546 5427
5547 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5548
5549 /* Check if a threshold crossed before removing */ 5428 /* Check if a threshold crossed before removing */
5550 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 5429 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5551 5430
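
With mem_cgroup_usage() gone, the threshold code above reads usage straight from the matching res_counter and then walks the sorted threshold array; as the retained comment says, current_threshold points at the threshold just below or equal to usage. A compact stand-alone model of that walk follows; the struct and fire() are illustrative only, not the eventfd-based kernel machinery:

#include <stdio.h>

struct thresholds {
    const unsigned long *entries;   /* sorted ascending */
    int count;
    int current;                    /* -1 if usage is below every entry */
};

static void fire(unsigned long threshold)
{
    printf("crossed threshold %lu\n", threshold);
}

static void check_thresholds(struct thresholds *t, unsigned long usage)
{
    int i;

    /* usage fell below some thresholds: notify while walking left */
    for (i = t->current; i >= 0 && t->entries[i] > usage; i--)
        fire(t->entries[i]);
    i++;
    /* usage rose above some thresholds: notify while walking right */
    for (; i < t->count && t->entries[i] <= usage; i++)
        fire(t->entries[i]);

    t->current = i - 1;             /* largest entry <= usage */
}

int main(void)
{
    const unsigned long entries[] = { 100, 200, 400 };
    struct thresholds t = { entries, 3, -1 };

    check_thresholds(&t, 250);      /* fires 100 and 200 on the way up */
    check_thresholds(&t, 150);      /* fires 200 on the way down */
    return 0;
}
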
@@ -6299,9 +6178,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
6299 * core guarantees its existence. 6178 * core guarantees its existence.
6300 */ 6179 */
6301 } else { 6180 } else {
6302 res_counter_init(&memcg->res, NULL); 6181 res_counter_init(&memcg->res, &root_mem_cgroup->res);
6303 res_counter_init(&memcg->memsw, NULL); 6182 res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw);
6304 res_counter_init(&memcg->kmem, NULL); 6183 res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
6305 /* 6184 /*
6306 * Deeper hierachy with use_hierarchy == false doesn't make 6185 * Deeper hierachy with use_hierarchy == false doesn't make
6307 * much sense so let cgroup subsystem know about this 6186 * much sense so let cgroup subsystem know about this
@@ -6435,55 +6314,39 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
6435 6314
6436#ifdef CONFIG_MMU 6315#ifdef CONFIG_MMU
6437/* Handlers for move charge at task migration. */ 6316/* Handlers for move charge at task migration. */
6438#define PRECHARGE_COUNT_AT_ONCE 256
6439static int mem_cgroup_do_precharge(unsigned long count) 6317static int mem_cgroup_do_precharge(unsigned long count)
6440{ 6318{
6441 int ret = 0; 6319 int ret;
6442 int batch_count = PRECHARGE_COUNT_AT_ONCE;
6443 struct mem_cgroup *memcg = mc.to;
6444 6320
6445 if (mem_cgroup_is_root(memcg)) { 6321 /* Try a single bulk charge without reclaim first */
6322 ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
6323 if (!ret) {
6446 mc.precharge += count; 6324 mc.precharge += count;
6447 /* we don't need css_get for root */
6448 return ret; 6325 return ret;
6449 } 6326 }
6450 /* try to charge at once */ 6327 if (ret == -EINTR) {
6451 if (count > 1) { 6328 __mem_cgroup_cancel_charge(root_mem_cgroup, count);
6452 struct res_counter *dummy;
6453 /*
6454 * "memcg" cannot be under rmdir() because we've already checked
6455 * by cgroup_lock_live_cgroup() that it is not removed and we
6456 * are still under the same cgroup_mutex. So we can postpone
6457 * css_get().
6458 */
6459 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
6460 goto one_by_one;
6461 if (do_swap_account && res_counter_charge(&memcg->memsw,
6462 PAGE_SIZE * count, &dummy)) {
6463 res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
6464 goto one_by_one;
6465 }
6466 mc.precharge += count;
6467 return ret; 6329 return ret;
6468 } 6330 }
6469one_by_one: 6331
6470 /* fall back to one by one charge */ 6332 /* Try charges one by one with reclaim */
6471 while (count--) { 6333 while (count--) {
6472 if (signal_pending(current)) { 6334 ret = mem_cgroup_try_charge(mc.to,
6473 ret = -EINTR; 6335 GFP_KERNEL & ~__GFP_NORETRY, 1);
6474 break; 6336 /*
6475 } 6337 * In case of failure, any residual charges against
6476 if (!batch_count--) { 6338 * mc.to will be dropped by mem_cgroup_clear_mc()
6477 batch_count = PRECHARGE_COUNT_AT_ONCE; 6339 * later on. However, cancel any charges that are
6478 cond_resched(); 6340 * bypassed to root right away or they'll be lost.
6479 } 6341 */
6480 ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); 6342 if (ret == -EINTR)
6343 __mem_cgroup_cancel_charge(root_mem_cgroup, 1);
6481 if (ret) 6344 if (ret)
6482 /* mem_cgroup_clear_mc() will do uncharge later */
6483 return ret; 6345 return ret;
6484 mc.precharge++; 6346 mc.precharge++;
6347 cond_resched();
6485 } 6348 }
6486 return ret; 6349 return 0;
6487} 6350}
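
The reworked mem_cgroup_do_precharge() above first attempts one bulk charge with reclaim disabled (GFP_KERNEL & ~__GFP_WAIT) and only falls back to charging page by page, with reclaim allowed, when the bulk attempt fails; charges that were bypassed to root (-EINTR) are cancelled right away. The user-space sketch below captures the bulk-then-incremental shape of that strategy and leaves out the root-bypass handling; reserve_fast(), reserve_slow() and precharge() are hypothetical callbacks, not kernel functions:

#include <stdbool.h>

typedef bool (*reserve_fn)(void *pool, long nr);

static long precharge(void *pool, long count,
                      reserve_fn reserve_fast, reserve_fn reserve_slow)
{
    long done = 0;

    /* Try a single cheap bulk reservation first. */
    if (reserve_fast(pool, count))
        return count;

    /* Fall back to one-by-one reservations via the expensive path. */
    while (done < count) {
        if (!reserve_slow(pool, 1))
            return -1;          /* caller unwinds the partial reservation */
        done++;
    }
    return done;
}
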
6488 6351
6489/** 6352/**
@@ -6760,21 +6623,18 @@ static void __mem_cgroup_clear_mc(void)
6760 /* we must fixup refcnts and charges */ 6623 /* we must fixup refcnts and charges */
6761 if (mc.moved_swap) { 6624 if (mc.moved_swap) {
6762 /* uncharge swap account from the old cgroup */ 6625 /* uncharge swap account from the old cgroup */
6763 if (!mem_cgroup_is_root(mc.from)) 6626 res_counter_uncharge(&mc.from->memsw,
6764 res_counter_uncharge(&mc.from->memsw, 6627 PAGE_SIZE * mc.moved_swap);
6765 PAGE_SIZE * mc.moved_swap);
6766 6628
6767 for (i = 0; i < mc.moved_swap; i++) 6629 for (i = 0; i < mc.moved_swap; i++)
6768 css_put(&mc.from->css); 6630 css_put(&mc.from->css);
6769 6631
6770 if (!mem_cgroup_is_root(mc.to)) { 6632 /*
6771 /* 6633 * we charged both to->res and to->memsw, so we should
6772 * we charged both to->res and to->memsw, so we should 6634 * uncharge to->res.
6773 * uncharge to->res. 6635 */
6774 */ 6636 res_counter_uncharge(&mc.to->res,
6775 res_counter_uncharge(&mc.to->res, 6637 PAGE_SIZE * mc.moved_swap);
6776 PAGE_SIZE * mc.moved_swap);
6777 }
6778 /* we've already done css_get(mc.to) */ 6638 /* we've already done css_get(mc.to) */
6779 mc.moved_swap = 0; 6639 mc.moved_swap = 0;
6780 } 6640 }
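
A recurring theme in this memcontrol.c diff is that root_mem_cgroup now has working res_counters: the css_online hunk makes every memcg's counters children of root's, which is what lets the earlier hunks drop mem_cgroup_usage()/mem_cgroup_recursive_stat() and the scattered !mem_cgroup_is_root() special cases. A toy hierarchical counter with charges propagating to every ancestor, as a plain C sketch without locking; struct res_counter here is only an illustrative stand-in:

#include <stdbool.h>

struct res_counter {
    unsigned long usage;
    unsigned long limit;
    struct res_counter *parent;
};

static bool res_charge(struct res_counter *rc, unsigned long val)
{
    struct res_counter *c;

    for (c = rc; c; c = c->parent) {
        if (c->usage + val > c->limit)
            goto undo;
        c->usage += val;
    }
    return true;

undo:
    /* roll back the levels that were already charged */
    for (struct res_counter *u = rc; u != c; u = u->parent)
        u->usage -= val;
    return false;
}

static void res_uncharge(struct res_counter *rc, unsigned long val)
{
    for (struct res_counter *c = rc; c; c = c->parent)
        c->usage -= val;
}

Because every charge also lands in the root counter, reading usage at the root needs no tree walk, which mirrors why mem_cgroup_read_u64() above can simply return res_counter_read_u64().
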
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a013bc94ebbe..44c6bd201d3a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1173,6 +1173,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1173 lock_page(hpage); 1173 lock_page(hpage);
1174 1174
1175 /* 1175 /*
1176 * The page could have changed compound pages during the locking.
1177 * If this happens just bail out.
1178 */
1179 if (compound_head(p) != hpage) {
1180 action_result(pfn, "different compound page after locking", IGNORED);
1181 res = -EBUSY;
1182 goto out;
1183 }
1184
1185 /*
1176 * We use page flags to determine what action should be taken, but 1186 * We use page flags to determine what action should be taken, but
1177 * the flags can be modified by the error containment action. One 1187 * the flags can be modified by the error containment action. One
1178 * example is an mlocked page, where PG_mlocked is cleared by 1188 * example is an mlocked page, where PG_mlocked is cleared by
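
The memory-failure hunk above re-checks compound_head(p) after taking the page lock, because the compound page can change while the caller sleeps in lock_page(); if it did, the handler bails out with -EBUSY rather than acting on stale state. The same check-again-under-the-lock pattern in a generic, user-space form; struct object and its owner field are made up for the sketch:

#include <pthread.h>
#include <errno.h>

struct object {
    pthread_mutex_t lock;
    void *owner;            /* may change until we hold ->lock */
};

static int operate_on(struct object *obj, void *expected_owner)
{
    int ret = 0;

    pthread_mutex_lock(&obj->lock);

    /* The snapshot taken before locking may be stale: bail out
     * instead of acting on an object that changed underneath us. */
    if (obj->owner != expected_owner) {
        ret = -EBUSY;
        goto out;
    }

    /* ... safe to act on obj here ... */
out:
    pthread_mutex_unlock(&obj->lock);
    return ret;
}
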
diff --git a/mm/memory.c b/mm/memory.c
index 8b44f765b645..5c55270729f7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -884,7 +884,7 @@ out_set_pte:
884 return 0; 884 return 0;
885} 885}
886 886
887int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 887static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
888 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 888 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
889 unsigned long addr, unsigned long end) 889 unsigned long addr, unsigned long end)
890{ 890{
@@ -2399,7 +2399,10 @@ EXPORT_SYMBOL(unmap_mapping_range);
2399/* 2399/*
2400 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2400 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2401 * but allow concurrent faults), and pte mapped but not yet locked. 2401 * but allow concurrent faults), and pte mapped but not yet locked.
2402 * We return with mmap_sem still held, but pte unmapped and unlocked. 2402 * We return with pte unmapped and unlocked.
2403 *
2404 * We return with the mmap_sem locked or unlocked in the same cases
2405 * as does filemap_fault().
2403 */ 2406 */
2404static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, 2407static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2405 unsigned long address, pte_t *page_table, pmd_t *pmd, 2408 unsigned long address, pte_t *page_table, pmd_t *pmd,
@@ -2688,6 +2691,11 @@ oom:
2688 return VM_FAULT_OOM; 2691 return VM_FAULT_OOM;
2689} 2692}
2690 2693
2694/*
2695 * The mmap_sem must have been held on entry, and may have been
2696 * released depending on flags and vma->vm_ops->fault() return value.
2697 * See filemap_fault() and __lock_page_retry().
2698 */
2691static int __do_fault(struct vm_area_struct *vma, unsigned long address, 2699static int __do_fault(struct vm_area_struct *vma, unsigned long address,
2692 pgoff_t pgoff, unsigned int flags, struct page **page) 2700 pgoff_t pgoff, unsigned int flags, struct page **page)
2693{ 2701{
@@ -2744,7 +2752,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
2744 if (write) 2752 if (write)
2745 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2753 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2746 else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) 2754 else if (pte_file(*pte) && pte_file_soft_dirty(*pte))
2747 pte_mksoft_dirty(entry); 2755 entry = pte_mksoft_dirty(entry);
2748 if (anon) { 2756 if (anon) {
2749 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 2757 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2750 page_add_new_anon_rmap(page, vma, address); 2758 page_add_new_anon_rmap(page, vma, address);
@@ -2758,17 +2766,8 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
2758 update_mmu_cache(vma, address, pte); 2766 update_mmu_cache(vma, address, pte);
2759} 2767}
2760 2768
2761static unsigned long fault_around_bytes = rounddown_pow_of_two(65536); 2769static unsigned long fault_around_bytes __read_mostly =
2762 2770 rounddown_pow_of_two(65536);
2763static inline unsigned long fault_around_pages(void)
2764{
2765 return fault_around_bytes >> PAGE_SHIFT;
2766}
2767
2768static inline unsigned long fault_around_mask(void)
2769{
2770 return ~(fault_around_bytes - 1) & PAGE_MASK;
2771}
2772 2771
2773#ifdef CONFIG_DEBUG_FS 2772#ifdef CONFIG_DEBUG_FS
2774static int fault_around_bytes_get(void *data, u64 *val) 2773static int fault_around_bytes_get(void *data, u64 *val)
@@ -2834,12 +2833,15 @@ late_initcall(fault_around_debugfs);
2834static void do_fault_around(struct vm_area_struct *vma, unsigned long address, 2833static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
2835 pte_t *pte, pgoff_t pgoff, unsigned int flags) 2834 pte_t *pte, pgoff_t pgoff, unsigned int flags)
2836{ 2835{
2837 unsigned long start_addr; 2836 unsigned long start_addr, nr_pages, mask;
2838 pgoff_t max_pgoff; 2837 pgoff_t max_pgoff;
2839 struct vm_fault vmf; 2838 struct vm_fault vmf;
2840 int off; 2839 int off;
2841 2840
2842 start_addr = max(address & fault_around_mask(), vma->vm_start); 2841 nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT;
2842 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
2843
2844 start_addr = max(address & mask, vma->vm_start);
2843 off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); 2845 off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
2844 pte -= off; 2846 pte -= off;
2845 pgoff -= off; 2847 pgoff -= off;
@@ -2851,7 +2853,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
2851 max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + 2853 max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
2852 PTRS_PER_PTE - 1; 2854 PTRS_PER_PTE - 1;
2853 max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, 2855 max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
2854 pgoff + fault_around_pages() - 1); 2856 pgoff + nr_pages - 1);
2855 2857
2856 /* Check if it makes any sense to call ->map_pages */ 2858 /* Check if it makes any sense to call ->map_pages */
2857 while (!pte_none(*pte)) { 2859 while (!pte_none(*pte)) {
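
After the change above, do_fault_around() derives everything from one snapshot of fault_around_bytes: the number of pages to map ahead and the alignment mask that picks the start of the window, clamped to the VMA. The arithmetic in isolation, as a runnable example; the address and vm_start values are arbitrary, and only the 65536-byte figure comes from the hunk:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
    unsigned long fault_around_bytes = 65536;   /* rounddown_pow_of_two() result */
    unsigned long address = 0x7f0012345678UL;   /* example faulting address */
    unsigned long vm_start = 0x7f0012300000UL;  /* example vma->vm_start */

    unsigned long nr_pages = fault_around_bytes >> PAGE_SHIFT;
    unsigned long mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
    unsigned long start_addr = address & mask;

    if (start_addr < vm_start)                  /* max(address & mask, vm_start) */
        start_addr = vm_start;

    printf("fault around %lu pages starting at %#lx\n", nr_pages, start_addr);
    return 0;
}
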
@@ -2886,7 +2888,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2886 * something). 2888 * something).
2887 */ 2889 */
2888 if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && 2890 if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) &&
2889 fault_around_pages() > 1) { 2891 fault_around_bytes >> PAGE_SHIFT > 1) {
2890 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2892 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2891 do_fault_around(vma, address, pte, pgoff, flags); 2893 do_fault_around(vma, address, pte, pgoff, flags);
2892 if (!pte_same(*pte, orig_pte)) 2894 if (!pte_same(*pte, orig_pte))
@@ -3016,6 +3018,12 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3016 return ret; 3018 return ret;
3017} 3019}
3018 3020
3021/*
3022 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3023 * but allow concurrent faults).
3024 * The mmap_sem may have been released depending on flags and our
3025 * return value. See filemap_fault() and __lock_page_or_retry().
3026 */
3019static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3027static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3020 unsigned long address, pte_t *page_table, pmd_t *pmd, 3028 unsigned long address, pte_t *page_table, pmd_t *pmd,
3021 unsigned int flags, pte_t orig_pte) 3029 unsigned int flags, pte_t orig_pte)
@@ -3040,7 +3048,9 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3040 * 3048 *
3041 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3049 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3042 * but allow concurrent faults), and pte mapped but not yet locked. 3050 * but allow concurrent faults), and pte mapped but not yet locked.
3043 * We return with mmap_sem still held, but pte unmapped and unlocked. 3051 * We return with pte unmapped and unlocked.
3052 * The mmap_sem may have been released depending on flags and our
3053 * return value. See filemap_fault() and __lock_page_or_retry().
3044 */ 3054 */
3045static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3055static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3046 unsigned long address, pte_t *page_table, pmd_t *pmd, 3056 unsigned long address, pte_t *page_table, pmd_t *pmd,
@@ -3172,7 +3182,10 @@ out:
3172 * 3182 *
3173 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3183 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3174 * but allow concurrent faults), and pte mapped but not yet locked. 3184 * but allow concurrent faults), and pte mapped but not yet locked.
3175 * We return with mmap_sem still held, but pte unmapped and unlocked. 3185 * We return with pte unmapped and unlocked.
3186 *
3187 * The mmap_sem may have been released depending on flags and our
3188 * return value. See filemap_fault() and __lock_page_or_retry().
3176 */ 3189 */
3177static int handle_pte_fault(struct mm_struct *mm, 3190static int handle_pte_fault(struct mm_struct *mm,
3178 struct vm_area_struct *vma, unsigned long address, 3191 struct vm_area_struct *vma, unsigned long address,
@@ -3181,7 +3194,7 @@ static int handle_pte_fault(struct mm_struct *mm,
3181 pte_t entry; 3194 pte_t entry;
3182 spinlock_t *ptl; 3195 spinlock_t *ptl;
3183 3196
3184 entry = *pte; 3197 entry = ACCESS_ONCE(*pte);
3185 if (!pte_present(entry)) { 3198 if (!pte_present(entry)) {
3186 if (pte_none(entry)) { 3199 if (pte_none(entry)) {
3187 if (vma->vm_ops) { 3200 if (vma->vm_ops) {
@@ -3232,6 +3245,9 @@ unlock:
3232 3245
3233/* 3246/*
3234 * By the time we get here, we already hold the mm semaphore 3247 * By the time we get here, we already hold the mm semaphore
3248 *
3249 * The mmap_sem may have been released depending on flags and our
3250 * return value. See filemap_fault() and __lock_page_or_retry().
3235 */ 3251 */
3236static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3252static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3237 unsigned long address, unsigned int flags) 3253 unsigned long address, unsigned int flags)
@@ -3313,6 +3329,12 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3313 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3329 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3314} 3330}
3315 3331
3332/*
3333 * By the time we get here, we already hold the mm semaphore
3334 *
3335 * The mmap_sem may have been released depending on flags and our
3336 * return value. See filemap_fault() and __lock_page_or_retry().
3337 */
3316int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3338int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3317 unsigned long address, unsigned int flags) 3339 unsigned long address, unsigned int flags)
3318{ 3340{
@@ -3591,11 +3613,13 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3591 ret = get_user_pages(tsk, mm, addr, 1, 3613 ret = get_user_pages(tsk, mm, addr, 1,
3592 write, 1, &page, &vma); 3614 write, 1, &page, &vma);
3593 if (ret <= 0) { 3615 if (ret <= 0) {
3616#ifndef CONFIG_HAVE_IOREMAP_PROT
3617 break;
3618#else
3594 /* 3619 /*
3595 * Check if this is a VM_IO | VM_PFNMAP VMA, which 3620 * Check if this is a VM_IO | VM_PFNMAP VMA, which
3596 * we can access using slightly different code. 3621 * we can access using slightly different code.
3597 */ 3622 */
3598#ifdef CONFIG_HAVE_IOREMAP_PROT
3599 vma = find_vma(mm, addr); 3623 vma = find_vma(mm, addr);
3600 if (!vma || vma->vm_start > addr) 3624 if (!vma || vma->vm_start > addr)
3601 break; 3625 break;
@@ -3603,9 +3627,9 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3603 ret = vma->vm_ops->access(vma, addr, buf, 3627 ret = vma->vm_ops->access(vma, addr, buf,
3604 len, write); 3628 len, write);
3605 if (ret <= 0) 3629 if (ret <= 0)
3606#endif
3607 break; 3630 break;
3608 bytes = ret; 3631 bytes = ret;
3632#endif
3609 } else { 3633 } else {
3610 bytes = len; 3634 bytes = len;
3611 offset = addr & (PAGE_SIZE-1); 3635 offset = addr & (PAGE_SIZE-1);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 469bbf505f85..2ff8c2325e96 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -284,8 +284,8 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
284} 284}
285#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ 285#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
286 286
287static void grow_zone_span(struct zone *zone, unsigned long start_pfn, 287static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
288 unsigned long end_pfn) 288 unsigned long end_pfn)
289{ 289{
290 unsigned long old_zone_end_pfn; 290 unsigned long old_zone_end_pfn;
291 291
@@ -427,8 +427,8 @@ out_fail:
427 return -1; 427 return -1;
428} 428}
429 429
430static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 430static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
431 unsigned long end_pfn) 431 unsigned long end_pfn)
432{ 432{
433 unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); 433 unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
434 434
@@ -977,15 +977,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
977 zone = page_zone(pfn_to_page(pfn)); 977 zone = page_zone(pfn_to_page(pfn));
978 978
979 ret = -EINVAL; 979 ret = -EINVAL;
980 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && 980 if ((zone_idx(zone) > ZONE_NORMAL ||
981 online_type == MMOP_ONLINE_MOVABLE) &&
981 !can_online_high_movable(zone)) 982 !can_online_high_movable(zone))
982 goto out; 983 goto out;
983 984
984 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { 985 if (online_type == MMOP_ONLINE_KERNEL &&
986 zone_idx(zone) == ZONE_MOVABLE) {
985 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) 987 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
986 goto out; 988 goto out;
987 } 989 }
988 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { 990 if (online_type == MMOP_ONLINE_MOVABLE &&
991 zone_idx(zone) == ZONE_MOVABLE - 1) {
989 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) 992 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
990 goto out; 993 goto out;
991 } 994 }
@@ -1156,6 +1159,34 @@ static int check_hotplug_memory_range(u64 start, u64 size)
1156 return 0; 1159 return 0;
1157} 1160}
1158 1161
1162/*
1163 * If movable zone has already been setup, newly added memory should be checked.
1164 * If its address is higher than movable zone, it should be added as movable.
1165 * Without this check, movable zone may overlap with other zone.
1166 */
1167static int should_add_memory_movable(int nid, u64 start, u64 size)
1168{
1169 unsigned long start_pfn = start >> PAGE_SHIFT;
1170 pg_data_t *pgdat = NODE_DATA(nid);
1171 struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE;
1172
1173 if (zone_is_empty(movable_zone))
1174 return 0;
1175
1176 if (movable_zone->zone_start_pfn <= start_pfn)
1177 return 1;
1178
1179 return 0;
1180}
1181
1182int zone_for_memory(int nid, u64 start, u64 size, int zone_default)
1183{
1184 if (should_add_memory_movable(nid, start, size))
1185 return ZONE_MOVABLE;
1186
1187 return zone_default;
1188}
1189
1159/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1190/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
1160int __ref add_memory(int nid, u64 start, u64 size) 1191int __ref add_memory(int nid, u64 start, u64 size)
1161{ 1192{
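
zone_for_memory() above encodes a simple rule: once a movable zone exists, hot-added memory whose start pfn is at or above the movable zone's start must itself go to ZONE_MOVABLE, or the zones would overlap. A stripped-down rendition of that decision; struct zone_span and the ZONE_*_ID constants are stand-ins for the kernel's zone bookkeeping:

struct zone_span {
    unsigned long start_pfn;
    unsigned long nr_pages;     /* zero means the zone is empty */
};

enum { ZONE_NORMAL_ID, ZONE_MOVABLE_ID };

static int zone_for_hotadd(const struct zone_span *movable,
                           unsigned long start_pfn, int zone_default)
{
    if (movable->nr_pages == 0)             /* no movable zone configured yet */
        return zone_default;
    if (movable->start_pfn <= start_pfn)    /* new range lies inside or above it */
        return ZONE_MOVABLE_ID;
    return zone_default;
}

For any start_pfn at or past movable->start_pfn, zone_for_hotadd() returns ZONE_MOVABLE_ID; otherwise the caller's default zone is kept, matching the fall-through in the hunk.
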
diff --git a/mm/mlock.c b/mm/mlock.c
index b1eb53634005..ce84cb0b83ef 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -210,12 +210,19 @@ out:
210 * @vma: target vma 210 * @vma: target vma
211 * @start: start address 211 * @start: start address
212 * @end: end address 212 * @end: end address
213 * @nonblocking:
213 * 214 *
214 * This takes care of making the pages present too. 215 * This takes care of making the pages present too.
215 * 216 *
216 * return 0 on success, negative error code on error. 217 * return 0 on success, negative error code on error.
217 * 218 *
218 * vma->vm_mm->mmap_sem must be held for at least read. 219 * vma->vm_mm->mmap_sem must be held.
220 *
221 * If @nonblocking is NULL, it may be held for read or write and will
222 * be unperturbed.
223 *
224 * If @nonblocking is non-NULL, it must be held for read only and may be
225 * released. If it's released, *@nonblocking will be set to 0.
219 */ 226 */
220long __mlock_vma_pages_range(struct vm_area_struct *vma, 227long __mlock_vma_pages_range(struct vm_area_struct *vma,
221 unsigned long start, unsigned long end, int *nonblocking) 228 unsigned long start, unsigned long end, int *nonblocking)
diff --git a/mm/mmap.c b/mm/mmap.c
index 129b847d30cc..64c9d736155c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -31,6 +31,7 @@
31#include <linux/mempolicy.h> 31#include <linux/mempolicy.h>
32#include <linux/rmap.h> 32#include <linux/rmap.h>
33#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
34#include <linux/mmdebug.h>
34#include <linux/perf_event.h> 35#include <linux/perf_event.h>
35#include <linux/audit.h> 36#include <linux/audit.h>
36#include <linux/khugepaged.h> 37#include <linux/khugepaged.h>
@@ -134,6 +135,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
134{ 135{
135 unsigned long free, allowed, reserve; 136 unsigned long free, allowed, reserve;
136 137
138 VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
139 -(s64)vm_committed_as_batch * num_online_cpus(),
140 "memory commitment underflow");
141
137 vm_acct_memory(pages); 142 vm_acct_memory(pages);
138 143
139 /* 144 /*
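
The VM_WARN_ONCE added to __vm_enough_memory() leans on the error bound of a per-cpu counter: an approximate percpu_counter_read() can trail the true value by at most the batch size per online CPU, so a reading below -batch * nr_cpus signals real underflow of the committed-memory accounting rather than sampling noise. A quick numeric model of that bound, with all values chosen only for illustration:

#include <stdio.h>

int main(void)
{
    long batch = 32;            /* per-cpu batch (illustrative value) */
    long nr_cpus = 8;
    long approx_read = -300;    /* value an approximate read might return */

    long worst_case_skew = -batch * nr_cpus;    /* -256 here */

    if (approx_read < worst_case_skew)
        printf("memory commitment underflow (read %ld < bound %ld)\n",
               approx_read, worst_case_skew);
    return 0;
}
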
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 41cefdf0aadd..950813b1eb36 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -23,6 +23,25 @@
23static struct srcu_struct srcu; 23static struct srcu_struct srcu;
24 24
25/* 25/*
26 * This function allows mmu_notifier::release callback to delay a call to
27 * a function that will free appropriate resources. The function must be
28 * quick and must not block.
29 */
30void mmu_notifier_call_srcu(struct rcu_head *rcu,
31 void (*func)(struct rcu_head *rcu))
32{
33 call_srcu(&srcu, rcu, func);
34}
35EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu);
36
37void mmu_notifier_synchronize(void)
38{
39 /* Wait for any running method to finish. */
40 srcu_barrier(&srcu);
41}
42EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
43
44/*
26 * This function can't run concurrently against mmu_notifier_register 45 * This function can't run concurrently against mmu_notifier_register
27 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap 46 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
28 * runs with mm_users == 0. Other tasks may still invoke mmu notifiers 47 * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
@@ -53,7 +72,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
53 */ 72 */
54 if (mn->ops->release) 73 if (mn->ops->release)
55 mn->ops->release(mn, mm); 74 mn->ops->release(mn, mm);
56 srcu_read_unlock(&srcu, id);
57 75
58 spin_lock(&mm->mmu_notifier_mm->lock); 76 spin_lock(&mm->mmu_notifier_mm->lock);
59 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 77 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -69,6 +87,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
69 hlist_del_init_rcu(&mn->hlist); 87 hlist_del_init_rcu(&mn->hlist);
70 } 88 }
71 spin_unlock(&mm->mmu_notifier_mm->lock); 89 spin_unlock(&mm->mmu_notifier_mm->lock);
90 srcu_read_unlock(&srcu, id);
72 91
73 /* 92 /*
74 * synchronize_srcu here prevents mmu_notifier_release from returning to 93 * synchronize_srcu here prevents mmu_notifier_release from returning to
@@ -325,6 +344,25 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
325} 344}
326EXPORT_SYMBOL_GPL(mmu_notifier_unregister); 345EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
327 346
347/*
348 * Same as mmu_notifier_unregister but no callback and no srcu synchronization.
349 */
350void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
351 struct mm_struct *mm)
352{
353 spin_lock(&mm->mmu_notifier_mm->lock);
354 /*
355 * Can not use list_del_rcu() since __mmu_notifier_release
356 * can delete it before we hold the lock.
357 */
358 hlist_del_init_rcu(&mn->hlist);
359 spin_unlock(&mm->mmu_notifier_mm->lock);
360
361 BUG_ON(atomic_read(&mm->mm_count) <= 0);
362 mmdrop(mm);
363}
364EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release);
365
328static int __init mmu_notifier_init(void) 366static int __init mmu_notifier_init(void)
329{ 367{
330 return init_srcu_struct(&srcu); 368 return init_srcu_struct(&srcu);
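
mmu_notifier_unregister_no_release() above is the quiet teardown: unlink the notifier under the registration lock, skip the ->release() callback and the synchronization of the full unregister path, and drop the mm reference pinned at registration time. A user-space approximation of that shape, with a mutex-protected list and a reference count standing in for mm->mm_count; all types below are invented and no grace period is modelled:

#include <pthread.h>
#include <stdlib.h>

struct notifier {
    struct notifier *next;
};

struct registry {
    pthread_mutex_t lock;
    struct notifier *head;
    long refs;                  /* models mm->mm_count */
};

/* Toy: assumes nobody else can look the registry up once the last
 * reference is being dropped. */
static void registry_put(struct registry *reg)
{
    long left;

    pthread_mutex_lock(&reg->lock);
    left = --reg->refs;
    pthread_mutex_unlock(&reg->lock);
    if (left == 0)
        free(reg);
}

static void notifier_unregister_no_release(struct registry *reg,
                                           struct notifier *n)
{
    struct notifier **pp;

    pthread_mutex_lock(&reg->lock);
    for (pp = &reg->head; *pp; pp = &(*pp)->next) {
        if (*pp == n) {
            *pp = n->next;      /* unlink only; no callback is invoked */
            break;
        }
    }
    pthread_mutex_unlock(&reg->lock);

    registry_put(reg);          /* drop the reference taken at register time */
}
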
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3291e82d4352..1e11df8fa7ec 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -258,8 +258,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
258 unsigned long totalpages, const nodemask_t *nodemask, 258 unsigned long totalpages, const nodemask_t *nodemask,
259 bool force_kill) 259 bool force_kill)
260{ 260{
261 if (task->exit_state)
262 return OOM_SCAN_CONTINUE;
263 if (oom_unkillable_task(task, NULL, nodemask)) 261 if (oom_unkillable_task(task, NULL, nodemask))
264 return OOM_SCAN_CONTINUE; 262 return OOM_SCAN_CONTINUE;
265 263
@@ -559,28 +557,25 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
559 * if a parallel OOM killing is already taking place that includes a zone in 557 * if a parallel OOM killing is already taking place that includes a zone in
560 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. 558 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
561 */ 559 */
562int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) 560bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
563{ 561{
564 struct zoneref *z; 562 struct zoneref *z;
565 struct zone *zone; 563 struct zone *zone;
566 int ret = 1; 564 bool ret = true;
567 565
568 spin_lock(&zone_scan_lock); 566 spin_lock(&zone_scan_lock);
569 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 567 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
570 if (zone_is_oom_locked(zone)) { 568 if (zone_is_oom_locked(zone)) {
571 ret = 0; 569 ret = false;
572 goto out; 570 goto out;
573 } 571 }
574 }
575 572
576 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 573 /*
577 /* 574 * Lock each zone in the zonelist under zone_scan_lock so a parallel
578 * Lock each zone in the zonelist under zone_scan_lock so a 575 * call to oom_zonelist_trylock() doesn't succeed when it shouldn't.
579 * parallel invocation of try_set_zonelist_oom() doesn't succeed 576 */
580 * when it shouldn't. 577 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
581 */
582 zone_set_flag(zone, ZONE_OOM_LOCKED); 578 zone_set_flag(zone, ZONE_OOM_LOCKED);
583 }
584 579
585out: 580out:
586 spin_unlock(&zone_scan_lock); 581 spin_unlock(&zone_scan_lock);
@@ -592,15 +587,14 @@ out:
592 * allocation attempts with zonelists containing them may now recall the OOM 587 * allocation attempts with zonelists containing them may now recall the OOM
593 * killer, if necessary. 588 * killer, if necessary.
594 */ 589 */
595void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) 590void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
596{ 591{
597 struct zoneref *z; 592 struct zoneref *z;
598 struct zone *zone; 593 struct zone *zone;
599 594
600 spin_lock(&zone_scan_lock); 595 spin_lock(&zone_scan_lock);
601 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 596 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
602 zone_clear_flag(zone, ZONE_OOM_LOCKED); 597 zone_clear_flag(zone, ZONE_OOM_LOCKED);
603 }
604 spin_unlock(&zone_scan_lock); 598 spin_unlock(&zone_scan_lock);
605} 599}
606 600
@@ -694,9 +688,9 @@ void pagefault_out_of_memory(void)
694 if (mem_cgroup_oom_synchronize(true)) 688 if (mem_cgroup_oom_synchronize(true))
695 return; 689 return;
696 690
697 zonelist = node_zonelist(first_online_node, GFP_KERNEL); 691 zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
698 if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { 692 if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
699 out_of_memory(NULL, 0, 0, NULL, false); 693 out_of_memory(NULL, 0, 0, NULL, false);
700 clear_zonelist_oom(zonelist, GFP_KERNEL); 694 oom_zonelist_unlock(zonelist, GFP_KERNEL);
701 } 695 }
702} 696}
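
oom_zonelist_trylock() above is an all-or-nothing trylock: under one spinlock it first verifies that no zone in the zonelist is already OOM-locked and only then marks them all, so two concurrent OOM kills can never claim overlapping zones. The same shape with a pthread mutex and a boolean per zone; struct zone_set is a stand-in for iterating a zonelist:

#include <pthread.h>
#include <stdbool.h>

struct zone_set {
    pthread_mutex_t scan_lock;  /* models zone_scan_lock */
    bool oom_locked[4];         /* models ZONE_OOM_LOCKED per zone */
    int nr;
};

static bool zone_set_oom_trylock(struct zone_set *zs)
{
    bool ret = true;
    int i;

    pthread_mutex_lock(&zs->scan_lock);
    for (i = 0; i < zs->nr; i++) {
        if (zs->oom_locked[i]) {
            ret = false;        /* someone else is already handling OOM here */
            goto out;
        }
    }
    for (i = 0; i < zs->nr; i++)
        zs->oom_locked[i] = true;
out:
    pthread_mutex_unlock(&zs->scan_lock);
    return ret;
}

static void zone_set_oom_unlock(struct zone_set *zs)
{
    int i;

    pthread_mutex_lock(&zs->scan_lock);
    for (i = 0; i < zs->nr; i++)
        zs->oom_locked[i] = false;
    pthread_mutex_unlock(&zs->scan_lock);
}
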
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e0c943014eb7..91d73ef1744d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -261,14 +261,11 @@ static unsigned long global_dirtyable_memory(void)
261 */ 261 */
262void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) 262void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
263{ 263{
264 const unsigned long available_memory = global_dirtyable_memory();
264 unsigned long background; 265 unsigned long background;
265 unsigned long dirty; 266 unsigned long dirty;
266 unsigned long uninitialized_var(available_memory);
267 struct task_struct *tsk; 267 struct task_struct *tsk;
268 268
269 if (!vm_dirty_bytes || !dirty_background_bytes)
270 available_memory = global_dirtyable_memory();
271
272 if (vm_dirty_bytes) 269 if (vm_dirty_bytes)
273 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); 270 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
274 else 271 else
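
With the uninitialized_var() dance gone, global_dirty_limits() above computes the dirtyable-memory figure unconditionally and then derives each limit either from an absolute byte count or from a percentage of that figure. A sketch of the arithmetic; the ratio branch shown is the conventional fallback but is not visible in the hunk, so treat it as an assumption:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    unsigned long available_memory = 2UL << 20;  /* dirtyable pages, example */
    unsigned long vm_dirty_bytes = 0;            /* 0 means "use the ratio" */
    unsigned long vm_dirty_ratio = 20;           /* percent, example */
    unsigned long dirty;

    if (vm_dirty_bytes)
        dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
    else
        dirty = vm_dirty_ratio * available_memory / 100;

    printf("dirty limit: %lu pages\n", dirty);
    return 0;
}
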
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef44ad736ca1..18cee0d4c8a2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -680,9 +680,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
680 int migratetype = 0; 680 int migratetype = 0;
681 int batch_free = 0; 681 int batch_free = 0;
682 int to_free = count; 682 int to_free = count;
683 unsigned long nr_scanned;
683 684
684 spin_lock(&zone->lock); 685 spin_lock(&zone->lock);
685 zone->pages_scanned = 0; 686 nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
687 if (nr_scanned)
688 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
686 689
687 while (to_free) { 690 while (to_free) {
688 struct page *page; 691 struct page *page;
@@ -731,8 +734,11 @@ static void free_one_page(struct zone *zone,
731 unsigned int order, 734 unsigned int order,
732 int migratetype) 735 int migratetype)
733{ 736{
737 unsigned long nr_scanned;
734 spin_lock(&zone->lock); 738 spin_lock(&zone->lock);
735 zone->pages_scanned = 0; 739 nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
740 if (nr_scanned)
741 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
736 742
737 __free_one_page(page, pfn, zone, order, migratetype); 743 __free_one_page(page, pfn, zone, order, migratetype);
738 if (unlikely(!is_migrate_isolate(migratetype))) 744 if (unlikely(!is_migrate_isolate(migratetype)))
@@ -1257,15 +1263,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1257void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) 1263void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1258{ 1264{
1259 unsigned long flags; 1265 unsigned long flags;
1260 int to_drain; 1266 int to_drain, batch;
1261 unsigned long batch;
1262 1267
1263 local_irq_save(flags); 1268 local_irq_save(flags);
1264 batch = ACCESS_ONCE(pcp->batch); 1269 batch = ACCESS_ONCE(pcp->batch);
1265 if (pcp->count >= batch) 1270 to_drain = min(pcp->count, batch);
1266 to_drain = batch;
1267 else
1268 to_drain = pcp->count;
1269 if (to_drain > 0) { 1271 if (to_drain > 0) {
1270 free_pcppages_bulk(zone, to_drain, pcp); 1272 free_pcppages_bulk(zone, to_drain, pcp);
1271 pcp->count -= to_drain; 1273 pcp->count -= to_drain;
@@ -1610,6 +1612,9 @@ again:
1610 } 1612 }
1611 1613
1612 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1614 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1615 if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
1616 !zone_is_fair_depleted(zone))
1617 zone_set_flag(zone, ZONE_FAIR_DEPLETED);
1613 1618
1614 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1619 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1615 zone_statistics(preferred_zone, zone, gfp_flags); 1620 zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1712,7 +1717,6 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
1712{ 1717{
1713 /* free_pages my go negative - that's OK */ 1718 /* free_pages my go negative - that's OK */
1714 long min = mark; 1719 long min = mark;
1715 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1716 int o; 1720 int o;
1717 long free_cma = 0; 1721 long free_cma = 0;
1718 1722
@@ -1727,7 +1731,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
1727 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); 1731 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
1728#endif 1732#endif
1729 1733
1730 if (free_pages - free_cma <= min + lowmem_reserve) 1734 if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
1731 return false; 1735 return false;
1732 for (o = 0; o < order; o++) { 1736 for (o = 0; o < order; o++) {
1733 /* At the next order, this order's pages become unavailable */ 1737 /* At the next order, this order's pages become unavailable */
@@ -1922,6 +1926,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1922 1926
1923#endif /* CONFIG_NUMA */ 1927#endif /* CONFIG_NUMA */
1924 1928
1929static void reset_alloc_batches(struct zone *preferred_zone)
1930{
1931 struct zone *zone = preferred_zone->zone_pgdat->node_zones;
1932
1933 do {
1934 mod_zone_page_state(zone, NR_ALLOC_BATCH,
1935 high_wmark_pages(zone) - low_wmark_pages(zone) -
1936 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
1937 zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
1938 } while (zone++ != preferred_zone);
1939}
1940
1925/* 1941/*
1926 * get_page_from_freelist goes through the zonelist trying to allocate 1942 * get_page_from_freelist goes through the zonelist trying to allocate
1927 * a page. 1943 * a page.
@@ -1939,8 +1955,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1939 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1955 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1940 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && 1956 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
1941 (gfp_mask & __GFP_WRITE); 1957 (gfp_mask & __GFP_WRITE);
1958 int nr_fair_skipped = 0;
1959 bool zonelist_rescan;
1942 1960
1943zonelist_scan: 1961zonelist_scan:
1962 zonelist_rescan = false;
1963
1944 /* 1964 /*
1945 * Scan zonelist, looking for a zone with enough free. 1965 * Scan zonelist, looking for a zone with enough free.
1946 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 1966 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
@@ -1964,9 +1984,11 @@ zonelist_scan:
1964 */ 1984 */
1965 if (alloc_flags & ALLOC_FAIR) { 1985 if (alloc_flags & ALLOC_FAIR) {
1966 if (!zone_local(preferred_zone, zone)) 1986 if (!zone_local(preferred_zone, zone))
1987 break;
1988 if (zone_is_fair_depleted(zone)) {
1989 nr_fair_skipped++;
1967 continue; 1990 continue;
1968 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) 1991 }
1969 continue;
1970 } 1992 }
1971 /* 1993 /*
1972 * When allocating a page cache page for writing, we 1994 * When allocating a page cache page for writing, we
@@ -2072,13 +2094,7 @@ this_zone_full:
2072 zlc_mark_zone_full(zonelist, z); 2094 zlc_mark_zone_full(zonelist, z);
2073 } 2095 }
2074 2096
2075 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { 2097 if (page) {
2076 /* Disable zlc cache for second zonelist scan */
2077 zlc_active = 0;
2078 goto zonelist_scan;
2079 }
2080
2081 if (page)
2082 /* 2098 /*
2083 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was 2099 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2084 * necessary to allocate the page. The expectation is 2100 * necessary to allocate the page. The expectation is
@@ -2087,8 +2103,37 @@ this_zone_full:
2087 * for !PFMEMALLOC purposes. 2103 * for !PFMEMALLOC purposes.
2088 */ 2104 */
2089 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 2105 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2106 return page;
2107 }
2090 2108
2091 return page; 2109 /*
2110 * The first pass makes sure allocations are spread fairly within the
2111 * local node. However, the local node might have free pages left
2112 * after the fairness batches are exhausted, and remote zones haven't
2113 * even been considered yet. Try once more without fairness, and
2114 * include remote zones now, before entering the slowpath and waking
2115 * kswapd: prefer spilling to a remote zone over swapping locally.
2116 */
2117 if (alloc_flags & ALLOC_FAIR) {
2118 alloc_flags &= ~ALLOC_FAIR;
2119 if (nr_fair_skipped) {
2120 zonelist_rescan = true;
2121 reset_alloc_batches(preferred_zone);
2122 }
2123 if (nr_online_nodes > 1)
2124 zonelist_rescan = true;
2125 }
2126
2127 if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
2128 /* Disable zlc cache for second zonelist scan */
2129 zlc_active = 0;
2130 zonelist_rescan = true;
2131 }
2132
2133 if (zonelist_rescan)
2134 goto zonelist_scan;
2135
2136 return NULL;
2092} 2137}
2093 2138
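
The restructured get_page_from_freelist() above folds the fairness retry into the function itself: the first pass stays on the local node and skips zones whose allocation batch is depleted; if that pass comes up empty, the batches are reset and a second pass runs with ALLOC_FAIR cleared, now considering remote zones, before the slow path wakes kswapd. A compact model of that two-pass loop; struct fz, alloc_from() and alloc_fair() are invented for the sketch:

#include <stdbool.h>
#include <stddef.h>

struct fz {
    bool local;             /* zone on the preferred (local) node? */
    long alloc_batch;       /* models NR_ALLOC_BATCH */
    long free_pages;
};

static void *alloc_from(struct fz *z)
{
    if (z->free_pages > 0) {
        z->free_pages--;
        return z;           /* any non-NULL token will do for the sketch */
    }
    return NULL;
}

static void *alloc_fair(struct fz *zones, size_t nr, long batch_reset)
{
    bool fair = true;
    size_t i;
    void *page;

rescan:
    for (i = 0; i < nr; i++) {
        struct fz *z = &zones[i];

        if (fair) {
            if (!z->local)
                break;              /* pass 1: local node only */
            if (z->alloc_batch <= 0)
                continue;           /* fair share used up, skip */
        }
        page = alloc_from(z);
        if (page) {
            if (fair)
                z->alloc_batch--;
            return page;
        }
    }

    if (fair) {
        /* Pass 2: drop the fairness constraint, replenish the batches
         * and include remote zones before giving up. */
        fair = false;
        for (i = 0; i < nr; i++)
            zones[i].alloc_batch = batch_reset;
        goto rescan;
    }
    return NULL;
}
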
2094/* 2139/*
@@ -2201,8 +2246,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2201{ 2246{
2202 struct page *page; 2247 struct page *page;
2203 2248
2204 /* Acquire the OOM killer lock for the zones in zonelist */ 2249 /* Acquire the per-zone oom lock for each zone */
2205 if (!try_set_zonelist_oom(zonelist, gfp_mask)) { 2250 if (!oom_zonelist_trylock(zonelist, gfp_mask)) {
2206 schedule_timeout_uninterruptible(1); 2251 schedule_timeout_uninterruptible(1);
2207 return NULL; 2252 return NULL;
2208 } 2253 }
@@ -2240,7 +2285,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2240 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2285 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
2241 2286
2242out: 2287out:
2243 clear_zonelist_oom(zonelist, gfp_mask); 2288 oom_zonelist_unlock(zonelist, gfp_mask);
2244 return page; 2289 return page;
2245} 2290}
2246 2291
@@ -2409,28 +2454,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2409 return page; 2454 return page;
2410} 2455}
2411 2456
2412static void reset_alloc_batches(struct zonelist *zonelist,
2413 enum zone_type high_zoneidx,
2414 struct zone *preferred_zone)
2415{
2416 struct zoneref *z;
2417 struct zone *zone;
2418
2419 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2420 /*
2421 * Only reset the batches of zones that were actually
2422 * considered in the fairness pass, we don't want to
2423 * trash fairness information for zones that are not
2424 * actually part of this zonelist's round-robin cycle.
2425 */
2426 if (!zone_local(preferred_zone, zone))
2427 continue;
2428 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2429 high_wmark_pages(zone) - low_wmark_pages(zone) -
2430 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
2431 }
2432}
2433
2434static void wake_all_kswapds(unsigned int order, 2457static void wake_all_kswapds(unsigned int order,
2435 struct zonelist *zonelist, 2458 struct zonelist *zonelist,
2436 enum zone_type high_zoneidx, 2459 enum zone_type high_zoneidx,
@@ -2616,14 +2639,6 @@ rebalance:
2616 goto got_pg; 2639 goto got_pg;
2617 2640
2618 /* 2641 /*
2619 * It can become very expensive to allocate transparent hugepages at
2620 * fault, so use asynchronous memory compaction for THP unless it is
2621 * khugepaged trying to collapse.
2622 */
2623 if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
2624 migration_mode = MIGRATE_SYNC_LIGHT;
2625
2626 /*
2627 * If compaction is deferred for high-order allocations, it is because 2642 * If compaction is deferred for high-order allocations, it is because
2628 * sync compaction recently failed. In this is the case and the caller 2643 * sync compaction recently failed. In this is the case and the caller
2629 * requested a movable allocation that does not heavily disrupt the 2644 * requested a movable allocation that does not heavily disrupt the
@@ -2633,6 +2648,15 @@ rebalance:
2633 (gfp_mask & __GFP_NO_KSWAPD)) 2648 (gfp_mask & __GFP_NO_KSWAPD))
2634 goto nopage; 2649 goto nopage;
2635 2650
2651 /*
2652 * It can become very expensive to allocate transparent hugepages at
2653 * fault, so use asynchronous memory compaction for THP unless it is
2654 * khugepaged trying to collapse.
2655 */
2656 if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
2657 (current->flags & PF_KTHREAD))
2658 migration_mode = MIGRATE_SYNC_LIGHT;
2659
2636 /* Try direct reclaim and then allocating */ 2660 /* Try direct reclaim and then allocating */
2637 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2661 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2638 zonelist, high_zoneidx, 2662 zonelist, high_zoneidx,
@@ -2766,29 +2790,12 @@ retry_cpuset:
2766 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2790 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2767 alloc_flags |= ALLOC_CMA; 2791 alloc_flags |= ALLOC_CMA;
2768#endif 2792#endif
2769retry:
2770 /* First allocation attempt */ 2793 /* First allocation attempt */
2771 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2794 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2772 zonelist, high_zoneidx, alloc_flags, 2795 zonelist, high_zoneidx, alloc_flags,
2773 preferred_zone, classzone_idx, migratetype); 2796 preferred_zone, classzone_idx, migratetype);
2774 if (unlikely(!page)) { 2797 if (unlikely(!page)) {
2775 /* 2798 /*
2776 * The first pass makes sure allocations are spread
2777 * fairly within the local node. However, the local
2778 * node might have free pages left after the fairness
2779 * batches are exhausted, and remote zones haven't
2780 * even been considered yet. Try once more without
2781 * fairness, and include remote zones now, before
2782 * entering the slowpath and waking kswapd: prefer
2783 * spilling to a remote zone over swapping locally.
2784 */
2785 if (alloc_flags & ALLOC_FAIR) {
2786 reset_alloc_batches(zonelist, high_zoneidx,
2787 preferred_zone);
2788 alloc_flags &= ~ALLOC_FAIR;
2789 goto retry;
2790 }
2791 /*
2792 * Runtime PM, block IO and its error handling path 2799 * Runtime PM, block IO and its error handling path
2793 * can deadlock because I/O on the device might not 2800 * can deadlock because I/O on the device might not
2794 * complete. 2801 * complete.
@@ -2962,7 +2969,7 @@ EXPORT_SYMBOL(alloc_pages_exact);
2962 * Note this is not alloc_pages_exact_node() which allocates on a specific node, 2969 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2963 * but is not exact. 2970 * but is not exact.
2964 */ 2971 */
2965void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) 2972void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2966{ 2973{
2967 unsigned order = get_order(size); 2974 unsigned order = get_order(size);
2968 struct page *p = alloc_pages_node(nid, gfp_mask, order); 2975 struct page *p = alloc_pages_node(nid, gfp_mask, order);
@@ -2970,7 +2977,6 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2970 return NULL; 2977 return NULL;
2971 return make_alloc_exact((unsigned long)page_address(p), order, size); 2978 return make_alloc_exact((unsigned long)page_address(p), order, size);
2972} 2979}
2973EXPORT_SYMBOL(alloc_pages_exact_nid);
2974 2980
2975/** 2981/**
2976 * free_pages_exact - release memory allocated via alloc_pages_exact() 2982 * free_pages_exact - release memory allocated via alloc_pages_exact()
@@ -3052,7 +3058,7 @@ static inline void show_node(struct zone *zone)
3052void si_meminfo(struct sysinfo *val) 3058void si_meminfo(struct sysinfo *val)
3053{ 3059{
3054 val->totalram = totalram_pages; 3060 val->totalram = totalram_pages;
3055 val->sharedram = 0; 3061 val->sharedram = global_page_state(NR_SHMEM);
3056 val->freeram = global_page_state(NR_FREE_PAGES); 3062 val->freeram = global_page_state(NR_FREE_PAGES);
3057 val->bufferram = nr_blockdev_pages(); 3063 val->bufferram = nr_blockdev_pages();
3058 val->totalhigh = totalhigh_pages; 3064 val->totalhigh = totalhigh_pages;
@@ -3072,6 +3078,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
3072 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) 3078 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
3073 managed_pages += pgdat->node_zones[zone_type].managed_pages; 3079 managed_pages += pgdat->node_zones[zone_type].managed_pages;
3074 val->totalram = managed_pages; 3080 val->totalram = managed_pages;
3081 val->sharedram = node_page_state(nid, NR_SHMEM);
3075 val->freeram = node_page_state(nid, NR_FREE_PAGES); 3082 val->freeram = node_page_state(nid, NR_FREE_PAGES);
3076#ifdef CONFIG_HIGHMEM 3083#ifdef CONFIG_HIGHMEM
3077 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 3084 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
@@ -3253,12 +3260,12 @@ void show_free_areas(unsigned int filter)
3253 K(zone_page_state(zone, NR_BOUNCE)), 3260 K(zone_page_state(zone, NR_BOUNCE)),
3254 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3261 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3255 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3262 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3256 zone->pages_scanned, 3263 K(zone_page_state(zone, NR_PAGES_SCANNED)),
3257 (!zone_reclaimable(zone) ? "yes" : "no") 3264 (!zone_reclaimable(zone) ? "yes" : "no")
3258 ); 3265 );
3259 printk("lowmem_reserve[]:"); 3266 printk("lowmem_reserve[]:");
3260 for (i = 0; i < MAX_NR_ZONES; i++) 3267 for (i = 0; i < MAX_NR_ZONES; i++)
3261 printk(" %lu", zone->lowmem_reserve[i]); 3268 printk(" %ld", zone->lowmem_reserve[i]);
3262 printk("\n"); 3269 printk("\n");
3263 } 3270 }
3264 3271
@@ -5579,7 +5586,7 @@ static void calculate_totalreserve_pages(void)
5579 for_each_online_pgdat(pgdat) { 5586 for_each_online_pgdat(pgdat) {
5580 for (i = 0; i < MAX_NR_ZONES; i++) { 5587 for (i = 0; i < MAX_NR_ZONES; i++) {
5581 struct zone *zone = pgdat->node_zones + i; 5588 struct zone *zone = pgdat->node_zones + i;
5582 unsigned long max = 0; 5589 long max = 0;
5583 5590
5584 /* Find valid and maximum lowmem_reserve in the zone */ 5591 /* Find valid and maximum lowmem_reserve in the zone */
5585 for (j = i; j < MAX_NR_ZONES; j++) { 5592 for (j = i; j < MAX_NR_ZONES; j++) {
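Editor's note on the page_alloc.c hunks: the __alloc_pages_slowpath change keys the compaction mode off the gfp mask and the caller, so THP page faults keep asynchronous compaction while khugepaged (a kernel thread) and non-THP allocations may fall back to MIGRATE_SYNC_LIGHT. The sketch below merely restates that test; the helper name is made up and is not part of the patch.

#include <linux/gfp.h>
#include <linux/sched.h>

/*
 * Hypothetical restatement of the check added in __alloc_pages_slowpath:
 * stay with asynchronous compaction only for a THP allocation coming from
 * a user page fault; khugepaged runs with PF_KTHREAD set and is allowed
 * to use MIGRATE_SYNC_LIGHT.
 */
static bool thp_fault_wants_async_compaction(gfp_t gfp_mask)
{
	if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE)
		return false;		/* not a THP allocation at all */
	if (current->flags & PF_KTHREAD)
		return false;		/* khugepaged collapse path */
	return true;			/* THP fault: keep compaction async */
}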
diff --git a/mm/readahead.c b/mm/readahead.c
index 0ca36a7770b1..17b9172ec37f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -326,7 +326,6 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
326 * - thrashing threshold in memory tight systems 326 * - thrashing threshold in memory tight systems
327 */ 327 */
328static pgoff_t count_history_pages(struct address_space *mapping, 328static pgoff_t count_history_pages(struct address_space *mapping,
329 struct file_ra_state *ra,
330 pgoff_t offset, unsigned long max) 329 pgoff_t offset, unsigned long max)
331{ 330{
332 pgoff_t head; 331 pgoff_t head;
@@ -349,7 +348,7 @@ static int try_context_readahead(struct address_space *mapping,
349{ 348{
350 pgoff_t size; 349 pgoff_t size;
351 350
352 size = count_history_pages(mapping, ra, offset, max); 351 size = count_history_pages(mapping, offset, max);
353 352
354 /* 353 /*
355 * not enough history pages: 354 * not enough history pages:
diff --git a/mm/shmem.c b/mm/shmem.c
index af68b15a8fc1..302d1cf7ad07 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -149,6 +149,19 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size)
149 vm_unacct_memory(VM_ACCT(size)); 149 vm_unacct_memory(VM_ACCT(size));
150} 150}
151 151
152static inline int shmem_reacct_size(unsigned long flags,
153 loff_t oldsize, loff_t newsize)
154{
155 if (!(flags & VM_NORESERVE)) {
156 if (VM_ACCT(newsize) > VM_ACCT(oldsize))
157 return security_vm_enough_memory_mm(current->mm,
158 VM_ACCT(newsize) - VM_ACCT(oldsize));
159 else if (VM_ACCT(newsize) < VM_ACCT(oldsize))
160 vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize));
161 }
162 return 0;
163}
164
152/* 165/*
153 * ... whereas tmpfs objects are accounted incrementally as 166 * ... whereas tmpfs objects are accounted incrementally as
154 * pages are allocated, in order to allow huge sparse files. 167 * pages are allocated, in order to allow huge sparse files.
@@ -280,7 +293,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
280 */ 293 */
281static int shmem_add_to_page_cache(struct page *page, 294static int shmem_add_to_page_cache(struct page *page,
282 struct address_space *mapping, 295 struct address_space *mapping,
283 pgoff_t index, gfp_t gfp, void *expected) 296 pgoff_t index, void *expected)
284{ 297{
285 int error; 298 int error;
286 299
@@ -549,6 +562,10 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
549 loff_t newsize = attr->ia_size; 562 loff_t newsize = attr->ia_size;
550 563
551 if (newsize != oldsize) { 564 if (newsize != oldsize) {
565 error = shmem_reacct_size(SHMEM_I(inode)->flags,
566 oldsize, newsize);
567 if (error)
568 return error;
552 i_size_write(inode, newsize); 569 i_size_write(inode, newsize);
553 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 570 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
554 } 571 }
@@ -649,7 +666,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
649 */ 666 */
650 if (!error) 667 if (!error)
651 error = shmem_add_to_page_cache(*pagep, mapping, index, 668 error = shmem_add_to_page_cache(*pagep, mapping, index,
652 GFP_NOWAIT, radswap); 669 radswap);
653 if (error != -ENOMEM) { 670 if (error != -ENOMEM) {
654 /* 671 /*
655 * Truncation and eviction use free_swap_and_cache(), which 672 * Truncation and eviction use free_swap_and_cache(), which
@@ -1095,7 +1112,7 @@ repeat:
1095 gfp & GFP_RECLAIM_MASK); 1112 gfp & GFP_RECLAIM_MASK);
1096 if (!error) { 1113 if (!error) {
1097 error = shmem_add_to_page_cache(page, mapping, index, 1114 error = shmem_add_to_page_cache(page, mapping, index,
1098 gfp, swp_to_radix_entry(swap)); 1115 swp_to_radix_entry(swap));
1099 /* 1116 /*
1100 * We already confirmed swap under page lock, and make 1117 * We already confirmed swap under page lock, and make
1101 * no memory allocation here, so usually no possibility 1118 * no memory allocation here, so usually no possibility
@@ -1149,7 +1166,7 @@ repeat:
1149 __SetPageSwapBacked(page); 1166 __SetPageSwapBacked(page);
1150 __set_page_locked(page); 1167 __set_page_locked(page);
1151 if (sgp == SGP_WRITE) 1168 if (sgp == SGP_WRITE)
1152 init_page_accessed(page); 1169 __SetPageReferenced(page);
1153 1170
1154 error = mem_cgroup_charge_file(page, current->mm, 1171 error = mem_cgroup_charge_file(page, current->mm,
1155 gfp & GFP_RECLAIM_MASK); 1172 gfp & GFP_RECLAIM_MASK);
@@ -1158,7 +1175,7 @@ repeat:
1158 error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); 1175 error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
1159 if (!error) { 1176 if (!error) {
1160 error = shmem_add_to_page_cache(page, mapping, index, 1177 error = shmem_add_to_page_cache(page, mapping, index,
1161 gfp, NULL); 1178 NULL);
1162 radix_tree_preload_end(); 1179 radix_tree_preload_end();
1163 } 1180 }
1164 if (error) { 1181 if (error) {
@@ -2932,16 +2949,16 @@ static struct file *__shmem_file_setup(const char *name, loff_t size,
2932 this.len = strlen(name); 2949 this.len = strlen(name);
2933 this.hash = 0; /* will go */ 2950 this.hash = 0; /* will go */
2934 sb = shm_mnt->mnt_sb; 2951 sb = shm_mnt->mnt_sb;
2952 path.mnt = mntget(shm_mnt);
2935 path.dentry = d_alloc_pseudo(sb, &this); 2953 path.dentry = d_alloc_pseudo(sb, &this);
2936 if (!path.dentry) 2954 if (!path.dentry)
2937 goto put_memory; 2955 goto put_memory;
2938 d_set_d_op(path.dentry, &anon_ops); 2956 d_set_d_op(path.dentry, &anon_ops);
2939 path.mnt = mntget(shm_mnt);
2940 2957
2941 res = ERR_PTR(-ENOSPC); 2958 res = ERR_PTR(-ENOSPC);
2942 inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); 2959 inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
2943 if (!inode) 2960 if (!inode)
2944 goto put_dentry; 2961 goto put_memory;
2945 2962
2946 inode->i_flags |= i_flags; 2963 inode->i_flags |= i_flags;
2947 d_instantiate(path.dentry, inode); 2964 d_instantiate(path.dentry, inode);
@@ -2949,19 +2966,19 @@ static struct file *__shmem_file_setup(const char *name, loff_t size,
2949 clear_nlink(inode); /* It is unlinked */ 2966 clear_nlink(inode); /* It is unlinked */
2950 res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); 2967 res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
2951 if (IS_ERR(res)) 2968 if (IS_ERR(res))
2952 goto put_dentry; 2969 goto put_path;
2953 2970
2954 res = alloc_file(&path, FMODE_WRITE | FMODE_READ, 2971 res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
2955 &shmem_file_operations); 2972 &shmem_file_operations);
2956 if (IS_ERR(res)) 2973 if (IS_ERR(res))
2957 goto put_dentry; 2974 goto put_path;
2958 2975
2959 return res; 2976 return res;
2960 2977
2961put_dentry:
2962 path_put(&path);
2963put_memory: 2978put_memory:
2964 shmem_unacct_size(flags, size); 2979 shmem_unacct_size(flags, size);
2980put_path:
2981 path_put(&path);
2965 return res; 2982 return res;
2966} 2983}
2967 2984
diff --git a/mm/slab.c b/mm/slab.c
index 3070b929a1bf..2e60bf3dedbb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -191,7 +191,6 @@ struct array_cache {
191 unsigned int limit; 191 unsigned int limit;
192 unsigned int batchcount; 192 unsigned int batchcount;
193 unsigned int touched; 193 unsigned int touched;
194 spinlock_t lock;
195 void *entry[]; /* 194 void *entry[]; /*
196 * Must have this definition in here for the proper 195 * Must have this definition in here for the proper
197 * alignment of array_cache. Also simplifies accessing 196 * alignment of array_cache. Also simplifies accessing
@@ -203,6 +202,11 @@ struct array_cache {
203 */ 202 */
204}; 203};
205 204
205struct alien_cache {
206 spinlock_t lock;
207 struct array_cache ac;
208};
209
206#define SLAB_OBJ_PFMEMALLOC 1 210#define SLAB_OBJ_PFMEMALLOC 1
207static inline bool is_obj_pfmemalloc(void *objp) 211static inline bool is_obj_pfmemalloc(void *objp)
208{ 212{
@@ -242,7 +246,8 @@ static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
242static int drain_freelist(struct kmem_cache *cache, 246static int drain_freelist(struct kmem_cache *cache,
243 struct kmem_cache_node *n, int tofree); 247 struct kmem_cache_node *n, int tofree);
244static void free_block(struct kmem_cache *cachep, void **objpp, int len, 248static void free_block(struct kmem_cache *cachep, void **objpp, int len,
245 int node); 249 int node, struct list_head *list);
250static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list);
246static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); 251static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
247static void cache_reap(struct work_struct *unused); 252static void cache_reap(struct work_struct *unused);
248 253
@@ -267,7 +272,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
267#define MAKE_LIST(cachep, listp, slab, nodeid) \ 272#define MAKE_LIST(cachep, listp, slab, nodeid) \
268 do { \ 273 do { \
269 INIT_LIST_HEAD(listp); \ 274 INIT_LIST_HEAD(listp); \
270 list_splice(&(cachep->node[nodeid]->slab), listp); \ 275 list_splice(&get_node(cachep, nodeid)->slab, listp); \
271 } while (0) 276 } while (0)
272 277
273#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ 278#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
@@ -465,143 +470,6 @@ static struct kmem_cache kmem_cache_boot = {
465 .name = "kmem_cache", 470 .name = "kmem_cache",
466}; 471};
467 472
468#define BAD_ALIEN_MAGIC 0x01020304ul
469
470#ifdef CONFIG_LOCKDEP
471
472/*
473 * Slab sometimes uses the kmalloc slabs to store the slab headers
474 * for other slabs "off slab".
475 * The locking for this is tricky in that it nests within the locks
476 * of all other slabs in a few places; to deal with this special
477 * locking we put on-slab caches into a separate lock-class.
478 *
479 * We set lock class for alien array caches which are up during init.
480 * The lock annotation will be lost if all cpus of a node goes down and
481 * then comes back up during hotplug
482 */
483static struct lock_class_key on_slab_l3_key;
484static struct lock_class_key on_slab_alc_key;
485
486static struct lock_class_key debugobj_l3_key;
487static struct lock_class_key debugobj_alc_key;
488
489static void slab_set_lock_classes(struct kmem_cache *cachep,
490 struct lock_class_key *l3_key, struct lock_class_key *alc_key,
491 int q)
492{
493 struct array_cache **alc;
494 struct kmem_cache_node *n;
495 int r;
496
497 n = cachep->node[q];
498 if (!n)
499 return;
500
501 lockdep_set_class(&n->list_lock, l3_key);
502 alc = n->alien;
503 /*
504 * FIXME: This check for BAD_ALIEN_MAGIC
505 * should go away when common slab code is taught to
506 * work even without alien caches.
507 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
508 * for alloc_alien_cache,
509 */
510 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
511 return;
512 for_each_node(r) {
513 if (alc[r])
514 lockdep_set_class(&alc[r]->lock, alc_key);
515 }
516}
517
518static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
519{
520 slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
521}
522
523static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
524{
525 int node;
526
527 for_each_online_node(node)
528 slab_set_debugobj_lock_classes_node(cachep, node);
529}
530
531static void init_node_lock_keys(int q)
532{
533 int i;
534
535 if (slab_state < UP)
536 return;
537
538 for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) {
539 struct kmem_cache_node *n;
540 struct kmem_cache *cache = kmalloc_caches[i];
541
542 if (!cache)
543 continue;
544
545 n = cache->node[q];
546 if (!n || OFF_SLAB(cache))
547 continue;
548
549 slab_set_lock_classes(cache, &on_slab_l3_key,
550 &on_slab_alc_key, q);
551 }
552}
553
554static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q)
555{
556 if (!cachep->node[q])
557 return;
558
559 slab_set_lock_classes(cachep, &on_slab_l3_key,
560 &on_slab_alc_key, q);
561}
562
563static inline void on_slab_lock_classes(struct kmem_cache *cachep)
564{
565 int node;
566
567 VM_BUG_ON(OFF_SLAB(cachep));
568 for_each_node(node)
569 on_slab_lock_classes_node(cachep, node);
570}
571
572static inline void init_lock_keys(void)
573{
574 int node;
575
576 for_each_node(node)
577 init_node_lock_keys(node);
578}
579#else
580static void init_node_lock_keys(int q)
581{
582}
583
584static inline void init_lock_keys(void)
585{
586}
587
588static inline void on_slab_lock_classes(struct kmem_cache *cachep)
589{
590}
591
592static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node)
593{
594}
595
596static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
597{
598}
599
600static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
601{
602}
603#endif
604
605static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); 473static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
606 474
607static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 475static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
@@ -792,13 +660,8 @@ static void start_cpu_timer(int cpu)
792 } 660 }
793} 661}
794 662
795static struct array_cache *alloc_arraycache(int node, int entries, 663static void init_arraycache(struct array_cache *ac, int limit, int batch)
796 int batchcount, gfp_t gfp)
797{ 664{
798 int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
799 struct array_cache *nc = NULL;
800
801 nc = kmalloc_node(memsize, gfp, node);
802 /* 665 /*
803 * The array_cache structures contain pointers to free object. 666 * The array_cache structures contain pointers to free object.
804 * However, when such objects are allocated or transferred to another 667 * However, when such objects are allocated or transferred to another
@@ -806,15 +669,24 @@ static struct array_cache *alloc_arraycache(int node, int entries,
806 * valid references during a kmemleak scan. Therefore, kmemleak must 669 * valid references during a kmemleak scan. Therefore, kmemleak must
807 * not scan such objects. 670 * not scan such objects.
808 */ 671 */
809 kmemleak_no_scan(nc); 672 kmemleak_no_scan(ac);
810 if (nc) { 673 if (ac) {
811 nc->avail = 0; 674 ac->avail = 0;
812 nc->limit = entries; 675 ac->limit = limit;
813 nc->batchcount = batchcount; 676 ac->batchcount = batch;
814 nc->touched = 0; 677 ac->touched = 0;
815 spin_lock_init(&nc->lock);
816 } 678 }
817 return nc; 679}
680
681static struct array_cache *alloc_arraycache(int node, int entries,
682 int batchcount, gfp_t gfp)
683{
684 size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache);
685 struct array_cache *ac = NULL;
686
687 ac = kmalloc_node(memsize, gfp, node);
688 init_arraycache(ac, entries, batchcount);
689 return ac;
818} 690}
819 691
820static inline bool is_slab_pfmemalloc(struct page *page) 692static inline bool is_slab_pfmemalloc(struct page *page)
@@ -826,7 +698,7 @@ static inline bool is_slab_pfmemalloc(struct page *page)
826static void recheck_pfmemalloc_active(struct kmem_cache *cachep, 698static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
827 struct array_cache *ac) 699 struct array_cache *ac)
828{ 700{
829 struct kmem_cache_node *n = cachep->node[numa_mem_id()]; 701 struct kmem_cache_node *n = get_node(cachep, numa_mem_id());
830 struct page *page; 702 struct page *page;
831 unsigned long flags; 703 unsigned long flags;
832 704
@@ -881,7 +753,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
881 * If there are empty slabs on the slabs_free list and we are 753 * If there are empty slabs on the slabs_free list and we are
882 * being forced to refill the cache, mark this one !pfmemalloc. 754 * being forced to refill the cache, mark this one !pfmemalloc.
883 */ 755 */
884 n = cachep->node[numa_mem_id()]; 756 n = get_node(cachep, numa_mem_id());
885 if (!list_empty(&n->slabs_free) && force_refill) { 757 if (!list_empty(&n->slabs_free) && force_refill) {
886 struct page *page = virt_to_head_page(objp); 758 struct page *page = virt_to_head_page(objp);
887 ClearPageSlabPfmemalloc(page); 759 ClearPageSlabPfmemalloc(page);
@@ -961,12 +833,13 @@ static int transfer_objects(struct array_cache *to,
961#define drain_alien_cache(cachep, alien) do { } while (0) 833#define drain_alien_cache(cachep, alien) do { } while (0)
962#define reap_alien(cachep, n) do { } while (0) 834#define reap_alien(cachep, n) do { } while (0)
963 835
964static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) 836static inline struct alien_cache **alloc_alien_cache(int node,
837 int limit, gfp_t gfp)
965{ 838{
966 return (struct array_cache **)BAD_ALIEN_MAGIC; 839 return NULL;
967} 840}
968 841
969static inline void free_alien_cache(struct array_cache **ac_ptr) 842static inline void free_alien_cache(struct alien_cache **ac_ptr)
970{ 843{
971} 844}
972 845
@@ -992,46 +865,60 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
992static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 865static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
993static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 866static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
994 867
995static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) 868static struct alien_cache *__alloc_alien_cache(int node, int entries,
869 int batch, gfp_t gfp)
870{
871 size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache);
872 struct alien_cache *alc = NULL;
873
874 alc = kmalloc_node(memsize, gfp, node);
875 init_arraycache(&alc->ac, entries, batch);
876 spin_lock_init(&alc->lock);
877 return alc;
878}
879
880static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
996{ 881{
997 struct array_cache **ac_ptr; 882 struct alien_cache **alc_ptr;
998 int memsize = sizeof(void *) * nr_node_ids; 883 size_t memsize = sizeof(void *) * nr_node_ids;
999 int i; 884 int i;
1000 885
1001 if (limit > 1) 886 if (limit > 1)
1002 limit = 12; 887 limit = 12;
1003 ac_ptr = kzalloc_node(memsize, gfp, node); 888 alc_ptr = kzalloc_node(memsize, gfp, node);
1004 if (ac_ptr) { 889 if (!alc_ptr)
1005 for_each_node(i) { 890 return NULL;
1006 if (i == node || !node_online(i)) 891
1007 continue; 892 for_each_node(i) {
1008 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); 893 if (i == node || !node_online(i))
1009 if (!ac_ptr[i]) { 894 continue;
1010 for (i--; i >= 0; i--) 895 alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp);
1011 kfree(ac_ptr[i]); 896 if (!alc_ptr[i]) {
1012 kfree(ac_ptr); 897 for (i--; i >= 0; i--)
1013 return NULL; 898 kfree(alc_ptr[i]);
1014 } 899 kfree(alc_ptr);
900 return NULL;
1015 } 901 }
1016 } 902 }
1017 return ac_ptr; 903 return alc_ptr;
1018} 904}
1019 905
1020static void free_alien_cache(struct array_cache **ac_ptr) 906static void free_alien_cache(struct alien_cache **alc_ptr)
1021{ 907{
1022 int i; 908 int i;
1023 909
1024 if (!ac_ptr) 910 if (!alc_ptr)
1025 return; 911 return;
1026 for_each_node(i) 912 for_each_node(i)
1027 kfree(ac_ptr[i]); 913 kfree(alc_ptr[i]);
1028 kfree(ac_ptr); 914 kfree(alc_ptr);
1029} 915}
1030 916
1031static void __drain_alien_cache(struct kmem_cache *cachep, 917static void __drain_alien_cache(struct kmem_cache *cachep,
1032 struct array_cache *ac, int node) 918 struct array_cache *ac, int node,
919 struct list_head *list)
1033{ 920{
1034 struct kmem_cache_node *n = cachep->node[node]; 921 struct kmem_cache_node *n = get_node(cachep, node);
1035 922
1036 if (ac->avail) { 923 if (ac->avail) {
1037 spin_lock(&n->list_lock); 924 spin_lock(&n->list_lock);
@@ -1043,7 +930,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
1043 if (n->shared) 930 if (n->shared)
1044 transfer_objects(n->shared, ac, ac->limit); 931 transfer_objects(n->shared, ac, ac->limit);
1045 932
1046 free_block(cachep, ac->entry, ac->avail, node); 933 free_block(cachep, ac->entry, ac->avail, node, list);
1047 ac->avail = 0; 934 ac->avail = 0;
1048 spin_unlock(&n->list_lock); 935 spin_unlock(&n->list_lock);
1049 } 936 }
@@ -1057,28 +944,40 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n)
1057 int node = __this_cpu_read(slab_reap_node); 944 int node = __this_cpu_read(slab_reap_node);
1058 945
1059 if (n->alien) { 946 if (n->alien) {
1060 struct array_cache *ac = n->alien[node]; 947 struct alien_cache *alc = n->alien[node];
948 struct array_cache *ac;
949
950 if (alc) {
951 ac = &alc->ac;
952 if (ac->avail && spin_trylock_irq(&alc->lock)) {
953 LIST_HEAD(list);
1061 954
1062 if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { 955 __drain_alien_cache(cachep, ac, node, &list);
1063 __drain_alien_cache(cachep, ac, node); 956 spin_unlock_irq(&alc->lock);
1064 spin_unlock_irq(&ac->lock); 957 slabs_destroy(cachep, &list);
958 }
1065 } 959 }
1066 } 960 }
1067} 961}
1068 962
1069static void drain_alien_cache(struct kmem_cache *cachep, 963static void drain_alien_cache(struct kmem_cache *cachep,
1070 struct array_cache **alien) 964 struct alien_cache **alien)
1071{ 965{
1072 int i = 0; 966 int i = 0;
967 struct alien_cache *alc;
1073 struct array_cache *ac; 968 struct array_cache *ac;
1074 unsigned long flags; 969 unsigned long flags;
1075 970
1076 for_each_online_node(i) { 971 for_each_online_node(i) {
1077 ac = alien[i]; 972 alc = alien[i];
1078 if (ac) { 973 if (alc) {
1079 spin_lock_irqsave(&ac->lock, flags); 974 LIST_HEAD(list);
1080 __drain_alien_cache(cachep, ac, i); 975
1081 spin_unlock_irqrestore(&ac->lock, flags); 976 ac = &alc->ac;
977 spin_lock_irqsave(&alc->lock, flags);
978 __drain_alien_cache(cachep, ac, i, &list);
979 spin_unlock_irqrestore(&alc->lock, flags);
980 slabs_destroy(cachep, &list);
1082 } 981 }
1083 } 982 }
1084} 983}
@@ -1087,8 +986,10 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1087{ 986{
1088 int nodeid = page_to_nid(virt_to_page(objp)); 987 int nodeid = page_to_nid(virt_to_page(objp));
1089 struct kmem_cache_node *n; 988 struct kmem_cache_node *n;
1090 struct array_cache *alien = NULL; 989 struct alien_cache *alien = NULL;
990 struct array_cache *ac;
1091 int node; 991 int node;
992 LIST_HEAD(list);
1092 993
1093 node = numa_mem_id(); 994 node = numa_mem_id();
1094 995
@@ -1099,21 +1000,25 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1099 if (likely(nodeid == node)) 1000 if (likely(nodeid == node))
1100 return 0; 1001 return 0;
1101 1002
1102 n = cachep->node[node]; 1003 n = get_node(cachep, node);
1103 STATS_INC_NODEFREES(cachep); 1004 STATS_INC_NODEFREES(cachep);
1104 if (n->alien && n->alien[nodeid]) { 1005 if (n->alien && n->alien[nodeid]) {
1105 alien = n->alien[nodeid]; 1006 alien = n->alien[nodeid];
1007 ac = &alien->ac;
1106 spin_lock(&alien->lock); 1008 spin_lock(&alien->lock);
1107 if (unlikely(alien->avail == alien->limit)) { 1009 if (unlikely(ac->avail == ac->limit)) {
1108 STATS_INC_ACOVERFLOW(cachep); 1010 STATS_INC_ACOVERFLOW(cachep);
1109 __drain_alien_cache(cachep, alien, nodeid); 1011 __drain_alien_cache(cachep, ac, nodeid, &list);
1110 } 1012 }
1111 ac_put_obj(cachep, alien, objp); 1013 ac_put_obj(cachep, ac, objp);
1112 spin_unlock(&alien->lock); 1014 spin_unlock(&alien->lock);
1015 slabs_destroy(cachep, &list);
1113 } else { 1016 } else {
1114 spin_lock(&(cachep->node[nodeid])->list_lock); 1017 n = get_node(cachep, nodeid);
1115 free_block(cachep, &objp, 1, nodeid); 1018 spin_lock(&n->list_lock);
1116 spin_unlock(&(cachep->node[nodeid])->list_lock); 1019 free_block(cachep, &objp, 1, nodeid, &list);
1020 spin_unlock(&n->list_lock);
1021 slabs_destroy(cachep, &list);
1117 } 1022 }
1118 return 1; 1023 return 1;
1119} 1024}
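Editor's note on the alien-cache hunks above: the spinlock has moved from struct array_cache into the new struct alien_cache, so remote-node frees take alc->lock around the embedded array_cache and defer the actual page frees until after the lock is dropped. The sketch below condenses the steps cache_free_alien() performs; demo_free_to_alien() is a made-up wrapper, and the helpers it calls are file-local to mm/slab.c.

/*
 * Sketch of the new remote-free path, under the assumptions stated above:
 * alc->lock protects the embedded array_cache, and any slabs detached by
 * __drain_alien_cache() are destroyed only after the lock is released.
 */
static void demo_free_to_alien(struct kmem_cache *cachep,
			       struct alien_cache *alc, void *objp, int nodeid)
{
	struct array_cache *ac = &alc->ac;
	LIST_HEAD(list);

	spin_lock(&alc->lock);			/* lock now lives in alien_cache */
	if (unlikely(ac->avail == ac->limit))
		__drain_alien_cache(cachep, ac, nodeid, &list);
	ac_put_obj(cachep, ac, objp);
	spin_unlock(&alc->lock);

	slabs_destroy(cachep, &list);		/* no spinlock held here */
}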
@@ -1132,7 +1037,7 @@ static int init_cache_node_node(int node)
1132{ 1037{
1133 struct kmem_cache *cachep; 1038 struct kmem_cache *cachep;
1134 struct kmem_cache_node *n; 1039 struct kmem_cache_node *n;
1135 const int memsize = sizeof(struct kmem_cache_node); 1040 const size_t memsize = sizeof(struct kmem_cache_node);
1136 1041
1137 list_for_each_entry(cachep, &slab_caches, list) { 1042 list_for_each_entry(cachep, &slab_caches, list) {
1138 /* 1043 /*
@@ -1140,7 +1045,8 @@ static int init_cache_node_node(int node)
1140 * begin anything. Make sure some other cpu on this 1045 * begin anything. Make sure some other cpu on this
1141 * node has not already allocated this 1046 * node has not already allocated this
1142 */ 1047 */
1143 if (!cachep->node[node]) { 1048 n = get_node(cachep, node);
1049 if (!n) {
1144 n = kmalloc_node(memsize, GFP_KERNEL, node); 1050 n = kmalloc_node(memsize, GFP_KERNEL, node);
1145 if (!n) 1051 if (!n)
1146 return -ENOMEM; 1052 return -ENOMEM;
@@ -1156,11 +1062,11 @@ static int init_cache_node_node(int node)
1156 cachep->node[node] = n; 1062 cachep->node[node] = n;
1157 } 1063 }
1158 1064
1159 spin_lock_irq(&cachep->node[node]->list_lock); 1065 spin_lock_irq(&n->list_lock);
1160 cachep->node[node]->free_limit = 1066 n->free_limit =
1161 (1 + nr_cpus_node(node)) * 1067 (1 + nr_cpus_node(node)) *
1162 cachep->batchcount + cachep->num; 1068 cachep->batchcount + cachep->num;
1163 spin_unlock_irq(&cachep->node[node]->list_lock); 1069 spin_unlock_irq(&n->list_lock);
1164 } 1070 }
1165 return 0; 1071 return 0;
1166} 1072}
@@ -1181,12 +1087,13 @@ static void cpuup_canceled(long cpu)
1181 list_for_each_entry(cachep, &slab_caches, list) { 1087 list_for_each_entry(cachep, &slab_caches, list) {
1182 struct array_cache *nc; 1088 struct array_cache *nc;
1183 struct array_cache *shared; 1089 struct array_cache *shared;
1184 struct array_cache **alien; 1090 struct alien_cache **alien;
1091 LIST_HEAD(list);
1185 1092
1186 /* cpu is dead; no one can alloc from it. */ 1093 /* cpu is dead; no one can alloc from it. */
1187 nc = cachep->array[cpu]; 1094 nc = cachep->array[cpu];
1188 cachep->array[cpu] = NULL; 1095 cachep->array[cpu] = NULL;
1189 n = cachep->node[node]; 1096 n = get_node(cachep, node);
1190 1097
1191 if (!n) 1098 if (!n)
1192 goto free_array_cache; 1099 goto free_array_cache;
@@ -1196,7 +1103,7 @@ static void cpuup_canceled(long cpu)
1196 /* Free limit for this kmem_cache_node */ 1103 /* Free limit for this kmem_cache_node */
1197 n->free_limit -= cachep->batchcount; 1104 n->free_limit -= cachep->batchcount;
1198 if (nc) 1105 if (nc)
1199 free_block(cachep, nc->entry, nc->avail, node); 1106 free_block(cachep, nc->entry, nc->avail, node, &list);
1200 1107
1201 if (!cpumask_empty(mask)) { 1108 if (!cpumask_empty(mask)) {
1202 spin_unlock_irq(&n->list_lock); 1109 spin_unlock_irq(&n->list_lock);
@@ -1206,7 +1113,7 @@ static void cpuup_canceled(long cpu)
1206 shared = n->shared; 1113 shared = n->shared;
1207 if (shared) { 1114 if (shared) {
1208 free_block(cachep, shared->entry, 1115 free_block(cachep, shared->entry,
1209 shared->avail, node); 1116 shared->avail, node, &list);
1210 n->shared = NULL; 1117 n->shared = NULL;
1211 } 1118 }
1212 1119
@@ -1221,6 +1128,7 @@ static void cpuup_canceled(long cpu)
1221 free_alien_cache(alien); 1128 free_alien_cache(alien);
1222 } 1129 }
1223free_array_cache: 1130free_array_cache:
1131 slabs_destroy(cachep, &list);
1224 kfree(nc); 1132 kfree(nc);
1225 } 1133 }
1226 /* 1134 /*
@@ -1229,7 +1137,7 @@ free_array_cache:
1229 * shrink each nodelist to its limit. 1137 * shrink each nodelist to its limit.
1230 */ 1138 */
1231 list_for_each_entry(cachep, &slab_caches, list) { 1139 list_for_each_entry(cachep, &slab_caches, list) {
1232 n = cachep->node[node]; 1140 n = get_node(cachep, node);
1233 if (!n) 1141 if (!n)
1234 continue; 1142 continue;
1235 drain_freelist(cachep, n, slabs_tofree(cachep, n)); 1143 drain_freelist(cachep, n, slabs_tofree(cachep, n));
@@ -1260,7 +1168,7 @@ static int cpuup_prepare(long cpu)
1260 list_for_each_entry(cachep, &slab_caches, list) { 1168 list_for_each_entry(cachep, &slab_caches, list) {
1261 struct array_cache *nc; 1169 struct array_cache *nc;
1262 struct array_cache *shared = NULL; 1170 struct array_cache *shared = NULL;
1263 struct array_cache **alien = NULL; 1171 struct alien_cache **alien = NULL;
1264 1172
1265 nc = alloc_arraycache(node, cachep->limit, 1173 nc = alloc_arraycache(node, cachep->limit,
1266 cachep->batchcount, GFP_KERNEL); 1174 cachep->batchcount, GFP_KERNEL);
@@ -1284,7 +1192,7 @@ static int cpuup_prepare(long cpu)
1284 } 1192 }
1285 } 1193 }
1286 cachep->array[cpu] = nc; 1194 cachep->array[cpu] = nc;
1287 n = cachep->node[node]; 1195 n = get_node(cachep, node);
1288 BUG_ON(!n); 1196 BUG_ON(!n);
1289 1197
1290 spin_lock_irq(&n->list_lock); 1198 spin_lock_irq(&n->list_lock);
@@ -1305,13 +1213,7 @@ static int cpuup_prepare(long cpu)
1305 spin_unlock_irq(&n->list_lock); 1213 spin_unlock_irq(&n->list_lock);
1306 kfree(shared); 1214 kfree(shared);
1307 free_alien_cache(alien); 1215 free_alien_cache(alien);
1308 if (cachep->flags & SLAB_DEBUG_OBJECTS)
1309 slab_set_debugobj_lock_classes_node(cachep, node);
1310 else if (!OFF_SLAB(cachep) &&
1311 !(cachep->flags & SLAB_DESTROY_BY_RCU))
1312 on_slab_lock_classes_node(cachep, node);
1313 } 1216 }
1314 init_node_lock_keys(node);
1315 1217
1316 return 0; 1218 return 0;
1317bad: 1219bad:
@@ -1395,7 +1297,7 @@ static int __meminit drain_cache_node_node(int node)
1395 list_for_each_entry(cachep, &slab_caches, list) { 1297 list_for_each_entry(cachep, &slab_caches, list) {
1396 struct kmem_cache_node *n; 1298 struct kmem_cache_node *n;
1397 1299
1398 n = cachep->node[node]; 1300 n = get_node(cachep, node);
1399 if (!n) 1301 if (!n)
1400 continue; 1302 continue;
1401 1303
@@ -1575,10 +1477,6 @@ void __init kmem_cache_init(void)
1575 1477
1576 memcpy(ptr, cpu_cache_get(kmem_cache), 1478 memcpy(ptr, cpu_cache_get(kmem_cache),
1577 sizeof(struct arraycache_init)); 1479 sizeof(struct arraycache_init));
1578 /*
1579 * Do not assume that spinlocks can be initialized via memcpy:
1580 */
1581 spin_lock_init(&ptr->lock);
1582 1480
1583 kmem_cache->array[smp_processor_id()] = ptr; 1481 kmem_cache->array[smp_processor_id()] = ptr;
1584 1482
@@ -1588,10 +1486,6 @@ void __init kmem_cache_init(void)
1588 != &initarray_generic.cache); 1486 != &initarray_generic.cache);
1589 memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), 1487 memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),
1590 sizeof(struct arraycache_init)); 1488 sizeof(struct arraycache_init));
1591 /*
1592 * Do not assume that spinlocks can be initialized via memcpy:
1593 */
1594 spin_lock_init(&ptr->lock);
1595 1489
1596 kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; 1490 kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;
1597 } 1491 }
@@ -1628,9 +1522,6 @@ void __init kmem_cache_init_late(void)
1628 BUG(); 1522 BUG();
1629 mutex_unlock(&slab_mutex); 1523 mutex_unlock(&slab_mutex);
1630 1524
1631 /* Annotate slab for lockdep -- annotate the malloc caches */
1632 init_lock_keys();
1633
1634 /* Done! */ 1525 /* Done! */
1635 slab_state = FULL; 1526 slab_state = FULL;
1636 1527
@@ -1690,14 +1581,10 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1690 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", 1581 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
1691 cachep->name, cachep->size, cachep->gfporder); 1582 cachep->name, cachep->size, cachep->gfporder);
1692 1583
1693 for_each_online_node(node) { 1584 for_each_kmem_cache_node(cachep, node, n) {
1694 unsigned long active_objs = 0, num_objs = 0, free_objects = 0; 1585 unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
1695 unsigned long active_slabs = 0, num_slabs = 0; 1586 unsigned long active_slabs = 0, num_slabs = 0;
1696 1587
1697 n = cachep->node[node];
1698 if (!n)
1699 continue;
1700
1701 spin_lock_irqsave(&n->list_lock, flags); 1588 spin_lock_irqsave(&n->list_lock, flags);
1702 list_for_each_entry(page, &n->slabs_full, lru) { 1589 list_for_each_entry(page, &n->slabs_full, lru) {
1703 active_objs += cachep->num; 1590 active_objs += cachep->num;
@@ -1724,7 +1611,8 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1724} 1611}
1725 1612
1726/* 1613/*
1727 * Interface to system's page allocator. No need to hold the cache-lock. 1614 * Interface to system's page allocator. No need to hold the
1615 * kmem_cache_node ->list_lock.
1728 * 1616 *
1729 * If we requested dmaable memory, we will get it. Even if we 1617 * If we requested dmaable memory, we will get it. Even if we
1730 * did not request dmaable memory, we might get it, but that 1618 * did not request dmaable memory, we might get it, but that
@@ -2026,9 +1914,9 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
2026 * @cachep: cache pointer being destroyed 1914 * @cachep: cache pointer being destroyed
2027 * @page: page pointer being destroyed 1915 * @page: page pointer being destroyed
2028 * 1916 *
2029 * Destroy all the objs in a slab, and release the mem back to the system. 1917 * Destroy all the objs in a slab page, and release the mem back to the system.
2030 * Before calling the slab must have been unlinked from the cache. The 1918 * Before calling the slab page must have been unlinked from the cache. The
2031 * cache-lock is not held/needed. 1919 * kmem_cache_node ->list_lock is not held/needed.
2032 */ 1920 */
2033static void slab_destroy(struct kmem_cache *cachep, struct page *page) 1921static void slab_destroy(struct kmem_cache *cachep, struct page *page)
2034{ 1922{
@@ -2060,6 +1948,16 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page)
2060 kmem_cache_free(cachep->freelist_cache, freelist); 1948 kmem_cache_free(cachep->freelist_cache, freelist);
2061} 1949}
2062 1950
1951static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
1952{
1953 struct page *page, *n;
1954
1955 list_for_each_entry_safe(page, n, list, lru) {
1956 list_del(&page->lru);
1957 slab_destroy(cachep, page);
1958 }
1959}
1960
2063/** 1961/**
2064 * calculate_slab_order - calculate size (page order) of slabs 1962 * calculate_slab_order - calculate size (page order) of slabs
2065 * @cachep: pointer to the cache that is being created 1963 * @cachep: pointer to the cache that is being created
@@ -2405,17 +2303,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
2405 return err; 2303 return err;
2406 } 2304 }
2407 2305
2408 if (flags & SLAB_DEBUG_OBJECTS) {
2409 /*
2410 * Would deadlock through slab_destroy()->call_rcu()->
2411 * debug_object_activate()->kmem_cache_alloc().
2412 */
2413 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
2414
2415 slab_set_debugobj_lock_classes(cachep);
2416 } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
2417 on_slab_lock_classes(cachep);
2418
2419 return 0; 2306 return 0;
2420} 2307}
2421 2308
@@ -2434,7 +2321,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep)
2434{ 2321{
2435#ifdef CONFIG_SMP 2322#ifdef CONFIG_SMP
2436 check_irq_off(); 2323 check_irq_off();
2437 assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock); 2324 assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
2438#endif 2325#endif
2439} 2326}
2440 2327
@@ -2442,7 +2329,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2442{ 2329{
2443#ifdef CONFIG_SMP 2330#ifdef CONFIG_SMP
2444 check_irq_off(); 2331 check_irq_off();
2445 assert_spin_locked(&cachep->node[node]->list_lock); 2332 assert_spin_locked(&get_node(cachep, node)->list_lock);
2446#endif 2333#endif
2447} 2334}
2448 2335
@@ -2462,12 +2349,16 @@ static void do_drain(void *arg)
2462 struct kmem_cache *cachep = arg; 2349 struct kmem_cache *cachep = arg;
2463 struct array_cache *ac; 2350 struct array_cache *ac;
2464 int node = numa_mem_id(); 2351 int node = numa_mem_id();
2352 struct kmem_cache_node *n;
2353 LIST_HEAD(list);
2465 2354
2466 check_irq_off(); 2355 check_irq_off();
2467 ac = cpu_cache_get(cachep); 2356 ac = cpu_cache_get(cachep);
2468 spin_lock(&cachep->node[node]->list_lock); 2357 n = get_node(cachep, node);
2469 free_block(cachep, ac->entry, ac->avail, node); 2358 spin_lock(&n->list_lock);
2470 spin_unlock(&cachep->node[node]->list_lock); 2359 free_block(cachep, ac->entry, ac->avail, node, &list);
2360 spin_unlock(&n->list_lock);
2361 slabs_destroy(cachep, &list);
2471 ac->avail = 0; 2362 ac->avail = 0;
2472} 2363}
2473 2364
@@ -2478,17 +2369,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
2478 2369
2479 on_each_cpu(do_drain, cachep, 1); 2370 on_each_cpu(do_drain, cachep, 1);
2480 check_irq_on(); 2371 check_irq_on();
2481 for_each_online_node(node) { 2372 for_each_kmem_cache_node(cachep, node, n)
2482 n = cachep->node[node]; 2373 if (n->alien)
2483 if (n && n->alien)
2484 drain_alien_cache(cachep, n->alien); 2374 drain_alien_cache(cachep, n->alien);
2485 }
2486 2375
2487 for_each_online_node(node) { 2376 for_each_kmem_cache_node(cachep, node, n)
2488 n = cachep->node[node]; 2377 drain_array(cachep, n, n->shared, 1, node);
2489 if (n)
2490 drain_array(cachep, n, n->shared, 1, node);
2491 }
2492} 2378}
2493 2379
2494/* 2380/*
@@ -2534,17 +2420,14 @@ out:
2534 2420
2535int __kmem_cache_shrink(struct kmem_cache *cachep) 2421int __kmem_cache_shrink(struct kmem_cache *cachep)
2536{ 2422{
2537 int ret = 0, i = 0; 2423 int ret = 0;
2424 int node;
2538 struct kmem_cache_node *n; 2425 struct kmem_cache_node *n;
2539 2426
2540 drain_cpu_caches(cachep); 2427 drain_cpu_caches(cachep);
2541 2428
2542 check_irq_on(); 2429 check_irq_on();
2543 for_each_online_node(i) { 2430 for_each_kmem_cache_node(cachep, node, n) {
2544 n = cachep->node[i];
2545 if (!n)
2546 continue;
2547
2548 drain_freelist(cachep, n, slabs_tofree(cachep, n)); 2431 drain_freelist(cachep, n, slabs_tofree(cachep, n));
2549 2432
2550 ret += !list_empty(&n->slabs_full) || 2433 ret += !list_empty(&n->slabs_full) ||
@@ -2566,13 +2449,11 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
2566 kfree(cachep->array[i]); 2449 kfree(cachep->array[i]);
2567 2450
2568 /* NUMA: free the node structures */ 2451 /* NUMA: free the node structures */
2569 for_each_online_node(i) { 2452 for_each_kmem_cache_node(cachep, i, n) {
2570 n = cachep->node[i]; 2453 kfree(n->shared);
2571 if (n) { 2454 free_alien_cache(n->alien);
2572 kfree(n->shared); 2455 kfree(n);
2573 free_alien_cache(n->alien); 2456 cachep->node[i] = NULL;
2574 kfree(n);
2575 }
2576 } 2457 }
2577 return 0; 2458 return 0;
2578} 2459}
@@ -2751,7 +2632,7 @@ static int cache_grow(struct kmem_cache *cachep,
2751 2632
2752 /* Take the node list lock to change the colour_next on this node */ 2633 /* Take the node list lock to change the colour_next on this node */
2753 check_irq_off(); 2634 check_irq_off();
2754 n = cachep->node[nodeid]; 2635 n = get_node(cachep, nodeid);
2755 spin_lock(&n->list_lock); 2636 spin_lock(&n->list_lock);
2756 2637
2757 /* Get colour for the slab, and cal the next value. */ 2638 /* Get colour for the slab, and cal the next value. */
@@ -2920,7 +2801,7 @@ retry:
2920 */ 2801 */
2921 batchcount = BATCHREFILL_LIMIT; 2802 batchcount = BATCHREFILL_LIMIT;
2922 } 2803 }
2923 n = cachep->node[node]; 2804 n = get_node(cachep, node);
2924 2805
2925 BUG_ON(ac->avail > 0 || !n); 2806 BUG_ON(ac->avail > 0 || !n);
2926 spin_lock(&n->list_lock); 2807 spin_lock(&n->list_lock);
@@ -3060,7 +2941,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3060 2941
3061static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) 2942static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3062{ 2943{
3063 if (cachep == kmem_cache) 2944 if (unlikely(cachep == kmem_cache))
3064 return false; 2945 return false;
3065 2946
3066 return should_failslab(cachep->object_size, flags, cachep->flags); 2947 return should_failslab(cachep->object_size, flags, cachep->flags);
@@ -3169,8 +3050,8 @@ retry:
3169 nid = zone_to_nid(zone); 3050 nid = zone_to_nid(zone);
3170 3051
3171 if (cpuset_zone_allowed_hardwall(zone, flags) && 3052 if (cpuset_zone_allowed_hardwall(zone, flags) &&
3172 cache->node[nid] && 3053 get_node(cache, nid) &&
3173 cache->node[nid]->free_objects) { 3054 get_node(cache, nid)->free_objects) {
3174 obj = ____cache_alloc_node(cache, 3055 obj = ____cache_alloc_node(cache,
3175 flags | GFP_THISNODE, nid); 3056 flags | GFP_THISNODE, nid);
3176 if (obj) 3057 if (obj)
@@ -3233,7 +3114,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3233 int x; 3114 int x;
3234 3115
3235 VM_BUG_ON(nodeid > num_online_nodes()); 3116 VM_BUG_ON(nodeid > num_online_nodes());
3236 n = cachep->node[nodeid]; 3117 n = get_node(cachep, nodeid);
3237 BUG_ON(!n); 3118 BUG_ON(!n);
3238 3119
3239retry: 3120retry:
@@ -3304,7 +3185,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3304 if (nodeid == NUMA_NO_NODE) 3185 if (nodeid == NUMA_NO_NODE)
3305 nodeid = slab_node; 3186 nodeid = slab_node;
3306 3187
3307 if (unlikely(!cachep->node[nodeid])) { 3188 if (unlikely(!get_node(cachep, nodeid))) {
3308 /* Node not bootstrapped yet */ 3189 /* Node not bootstrapped yet */
3309 ptr = fallback_alloc(cachep, flags); 3190 ptr = fallback_alloc(cachep, flags);
3310 goto out; 3191 goto out;
@@ -3405,12 +3286,13 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
3405 3286
3406/* 3287/*
3407 * Caller needs to acquire correct kmem_cache_node's list_lock 3288 * Caller needs to acquire correct kmem_cache_node's list_lock
3289 * @list: List of detached free slabs should be freed by caller
3408 */ 3290 */
3409static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, 3291static void free_block(struct kmem_cache *cachep, void **objpp,
3410 int node) 3292 int nr_objects, int node, struct list_head *list)
3411{ 3293{
3412 int i; 3294 int i;
3413 struct kmem_cache_node *n; 3295 struct kmem_cache_node *n = get_node(cachep, node);
3414 3296
3415 for (i = 0; i < nr_objects; i++) { 3297 for (i = 0; i < nr_objects; i++) {
3416 void *objp; 3298 void *objp;
@@ -3420,7 +3302,6 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3420 objp = objpp[i]; 3302 objp = objpp[i];
3421 3303
3422 page = virt_to_head_page(objp); 3304 page = virt_to_head_page(objp);
3423 n = cachep->node[node];
3424 list_del(&page->lru); 3305 list_del(&page->lru);
3425 check_spinlock_acquired_node(cachep, node); 3306 check_spinlock_acquired_node(cachep, node);
3426 slab_put_obj(cachep, page, objp, node); 3307 slab_put_obj(cachep, page, objp, node);
@@ -3431,13 +3312,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3431 if (page->active == 0) { 3312 if (page->active == 0) {
3432 if (n->free_objects > n->free_limit) { 3313 if (n->free_objects > n->free_limit) {
3433 n->free_objects -= cachep->num; 3314 n->free_objects -= cachep->num;
3434 /* No need to drop any previously held 3315 list_add_tail(&page->lru, list);
3435 * lock here, even if we have a off-slab slab
3436 * descriptor it is guaranteed to come from
3437 * a different cache, refer to comments before
3438 * alloc_slabmgmt.
3439 */
3440 slab_destroy(cachep, page);
3441 } else { 3316 } else {
3442 list_add(&page->lru, &n->slabs_free); 3317 list_add(&page->lru, &n->slabs_free);
3443 } 3318 }
@@ -3456,13 +3331,14 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3456 int batchcount; 3331 int batchcount;
3457 struct kmem_cache_node *n; 3332 struct kmem_cache_node *n;
3458 int node = numa_mem_id(); 3333 int node = numa_mem_id();
3334 LIST_HEAD(list);
3459 3335
3460 batchcount = ac->batchcount; 3336 batchcount = ac->batchcount;
3461#if DEBUG 3337#if DEBUG
3462 BUG_ON(!batchcount || batchcount > ac->avail); 3338 BUG_ON(!batchcount || batchcount > ac->avail);
3463#endif 3339#endif
3464 check_irq_off(); 3340 check_irq_off();
3465 n = cachep->node[node]; 3341 n = get_node(cachep, node);
3466 spin_lock(&n->list_lock); 3342 spin_lock(&n->list_lock);
3467 if (n->shared) { 3343 if (n->shared) {
3468 struct array_cache *shared_array = n->shared; 3344 struct array_cache *shared_array = n->shared;
@@ -3477,7 +3353,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3477 } 3353 }
3478 } 3354 }
3479 3355
3480 free_block(cachep, ac->entry, batchcount, node); 3356 free_block(cachep, ac->entry, batchcount, node, &list);
3481free_done: 3357free_done:
3482#if STATS 3358#if STATS
3483 { 3359 {
@@ -3498,6 +3374,7 @@ free_done:
3498 } 3374 }
3499#endif 3375#endif
3500 spin_unlock(&n->list_lock); 3376 spin_unlock(&n->list_lock);
3377 slabs_destroy(cachep, &list);
3501 ac->avail -= batchcount; 3378 ac->avail -= batchcount;
3502 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); 3379 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3503} 3380}
@@ -3754,7 +3631,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
3754 int node; 3631 int node;
3755 struct kmem_cache_node *n; 3632 struct kmem_cache_node *n;
3756 struct array_cache *new_shared; 3633 struct array_cache *new_shared;
3757 struct array_cache **new_alien = NULL; 3634 struct alien_cache **new_alien = NULL;
3758 3635
3759 for_each_online_node(node) { 3636 for_each_online_node(node) {
3760 3637
@@ -3775,15 +3652,16 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
3775 } 3652 }
3776 } 3653 }
3777 3654
3778 n = cachep->node[node]; 3655 n = get_node(cachep, node);
3779 if (n) { 3656 if (n) {
3780 struct array_cache *shared = n->shared; 3657 struct array_cache *shared = n->shared;
3658 LIST_HEAD(list);
3781 3659
3782 spin_lock_irq(&n->list_lock); 3660 spin_lock_irq(&n->list_lock);
3783 3661
3784 if (shared) 3662 if (shared)
3785 free_block(cachep, shared->entry, 3663 free_block(cachep, shared->entry,
3786 shared->avail, node); 3664 shared->avail, node, &list);
3787 3665
3788 n->shared = new_shared; 3666 n->shared = new_shared;
3789 if (!n->alien) { 3667 if (!n->alien) {
@@ -3793,6 +3671,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
3793 n->free_limit = (1 + nr_cpus_node(node)) * 3671 n->free_limit = (1 + nr_cpus_node(node)) *
3794 cachep->batchcount + cachep->num; 3672 cachep->batchcount + cachep->num;
3795 spin_unlock_irq(&n->list_lock); 3673 spin_unlock_irq(&n->list_lock);
3674 slabs_destroy(cachep, &list);
3796 kfree(shared); 3675 kfree(shared);
3797 free_alien_cache(new_alien); 3676 free_alien_cache(new_alien);
3798 continue; 3677 continue;
@@ -3820,9 +3699,8 @@ fail:
3820 /* Cache is not active yet. Roll back what we did */ 3699 /* Cache is not active yet. Roll back what we did */
3821 node--; 3700 node--;
3822 while (node >= 0) { 3701 while (node >= 0) {
3823 if (cachep->node[node]) { 3702 n = get_node(cachep, node);
3824 n = cachep->node[node]; 3703 if (n) {
3825
3826 kfree(n->shared); 3704 kfree(n->shared);
3827 free_alien_cache(n->alien); 3705 free_alien_cache(n->alien);
3828 kfree(n); 3706 kfree(n);
@@ -3883,12 +3761,20 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
3883 cachep->shared = shared; 3761 cachep->shared = shared;
3884 3762
3885 for_each_online_cpu(i) { 3763 for_each_online_cpu(i) {
3764 LIST_HEAD(list);
3886 struct array_cache *ccold = new->new[i]; 3765 struct array_cache *ccold = new->new[i];
3766 int node;
3767 struct kmem_cache_node *n;
3768
3887 if (!ccold) 3769 if (!ccold)
3888 continue; 3770 continue;
3889 spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); 3771
3890 free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); 3772 node = cpu_to_mem(i);
3891 spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); 3773 n = get_node(cachep, node);
3774 spin_lock_irq(&n->list_lock);
3775 free_block(cachep, ccold->entry, ccold->avail, node, &list);
3776 spin_unlock_irq(&n->list_lock);
3777 slabs_destroy(cachep, &list);
3892 kfree(ccold); 3778 kfree(ccold);
3893 } 3779 }
3894 kfree(new); 3780 kfree(new);
@@ -3996,6 +3882,7 @@ skip_setup:
3996static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, 3882static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
3997 struct array_cache *ac, int force, int node) 3883 struct array_cache *ac, int force, int node)
3998{ 3884{
3885 LIST_HEAD(list);
3999 int tofree; 3886 int tofree;
4000 3887
4001 if (!ac || !ac->avail) 3888 if (!ac || !ac->avail)
@@ -4008,12 +3895,13 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
4008 tofree = force ? ac->avail : (ac->limit + 4) / 5; 3895 tofree = force ? ac->avail : (ac->limit + 4) / 5;
4009 if (tofree > ac->avail) 3896 if (tofree > ac->avail)
4010 tofree = (ac->avail + 1) / 2; 3897 tofree = (ac->avail + 1) / 2;
4011 free_block(cachep, ac->entry, tofree, node); 3898 free_block(cachep, ac->entry, tofree, node, &list);
4012 ac->avail -= tofree; 3899 ac->avail -= tofree;
4013 memmove(ac->entry, &(ac->entry[tofree]), 3900 memmove(ac->entry, &(ac->entry[tofree]),
4014 sizeof(void *) * ac->avail); 3901 sizeof(void *) * ac->avail);
4015 } 3902 }
4016 spin_unlock_irq(&n->list_lock); 3903 spin_unlock_irq(&n->list_lock);
3904 slabs_destroy(cachep, &list);
4017 } 3905 }
4018} 3906}
4019 3907
@@ -4048,7 +3936,7 @@ static void cache_reap(struct work_struct *w)
4048 * have established with reasonable certainty that 3936 * have established with reasonable certainty that
4049 * we can do some work if the lock was obtained. 3937 * we can do some work if the lock was obtained.
4050 */ 3938 */
4051 n = searchp->node[node]; 3939 n = get_node(searchp, node);
4052 3940
4053 reap_alien(searchp, n); 3941 reap_alien(searchp, n);
4054 3942
@@ -4100,10 +3988,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
4100 3988
4101 active_objs = 0; 3989 active_objs = 0;
4102 num_slabs = 0; 3990 num_slabs = 0;
4103 for_each_online_node(node) { 3991 for_each_kmem_cache_node(cachep, node, n) {
4104 n = cachep->node[node];
4105 if (!n)
4106 continue;
4107 3992
4108 check_irq_on(); 3993 check_irq_on();
4109 spin_lock_irq(&n->list_lock); 3994 spin_lock_irq(&n->list_lock);
@@ -4328,10 +4213,7 @@ static int leaks_show(struct seq_file *m, void *p)
4328 4213
4329 x[1] = 0; 4214 x[1] = 0;
4330 4215
4331 for_each_online_node(node) { 4216 for_each_kmem_cache_node(cachep, node, n) {
4332 n = cachep->node[node];
4333 if (!n)
4334 continue;
4335 4217
4336 check_irq_on(); 4218 check_irq_on();
4337 spin_lock_irq(&n->list_lock); 4219 spin_lock_irq(&n->list_lock);
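Editor's note on the slab.c diff as a whole: the recurring shape of these hunks is to stop calling slab_destroy() while a node's list_lock is held. free_block() now detaches fully free slab pages onto a caller-supplied list, and the new slabs_destroy() releases them once the lock is dropped. The sketch below condenses that pattern, modelled on do_drain() above; demo_flush() is an illustrative name, and free_block()/slabs_destroy()/get_node() are internals of mm/slab.c and mm/slab.h.

#include <linux/list.h>
#include <linux/spinlock.h>

/*
 * Deferred-destroy pattern introduced by this series: return objects
 * under n->list_lock, collect empty slab pages on a local list, and
 * free the pages only after the lock is released.
 */
static void demo_flush(struct kmem_cache *cachep, struct array_cache *ac,
		       int node)
{
	struct kmem_cache_node *n = get_node(cachep, node);
	LIST_HEAD(list);

	spin_lock(&n->list_lock);
	free_block(cachep, ac->entry, ac->avail, node, &list);
	ac->avail = 0;
	spin_unlock(&n->list_lock);

	slabs_destroy(cachep, &list);	/* page frees happen lock-free */
}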
diff --git a/mm/slab.h b/mm/slab.h
index 961a3fb1f5a2..0e0fdd365840 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -256,13 +256,12 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
256 return cachep; 256 return cachep;
257 257
258 pr_err("%s: Wrong slab cache. %s but object is from %s\n", 258 pr_err("%s: Wrong slab cache. %s but object is from %s\n",
259 __FUNCTION__, cachep->name, s->name); 259 __func__, cachep->name, s->name);
260 WARN_ON_ONCE(1); 260 WARN_ON_ONCE(1);
261 return s; 261 return s;
262} 262}
263#endif
264
265 263
264#ifndef CONFIG_SLOB
266/* 265/*
267 * The slab lists for all objects. 266 * The slab lists for all objects.
268 */ 267 */
@@ -277,7 +276,7 @@ struct kmem_cache_node {
277 unsigned int free_limit; 276 unsigned int free_limit;
278 unsigned int colour_next; /* Per-node cache coloring */ 277 unsigned int colour_next; /* Per-node cache coloring */
279 struct array_cache *shared; /* shared per node */ 278 struct array_cache *shared; /* shared per node */
280 struct array_cache **alien; /* on other nodes */ 279 struct alien_cache **alien; /* on other nodes */
281 unsigned long next_reap; /* updated without locking */ 280 unsigned long next_reap; /* updated without locking */
282 int free_touched; /* updated without locking */ 281 int free_touched; /* updated without locking */
283#endif 282#endif
@@ -294,5 +293,22 @@ struct kmem_cache_node {
294 293
295}; 294};
296 295
296static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
297{
298 return s->node[node];
299}
300
301/*
302 * Iterator over all nodes. The body will be executed for each node that has
303 * a kmem_cache_node structure allocated (which is true for all online nodes)
304 */
305#define for_each_kmem_cache_node(__s, __node, __n) \
306 for (__node = 0; __n = get_node(__s, __node), __node < nr_node_ids; __node++) \
307 if (__n)
308
309#endif
310
297void *slab_next(struct seq_file *m, void *p, loff_t *pos); 311void *slab_next(struct seq_file *m, void *p, loff_t *pos);
298void slab_stop(struct seq_file *m, void *p); 312void slab_stop(struct seq_file *m, void *p);
313
314#endif /* MM_SLAB_H */
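Editor's note on the slab.h hunks: get_node() and for_each_kmem_cache_node() replace the open-coded cachep->node[node] lookups plus NULL checks seen throughout the slab.c diff; the macro already skips nodes without a kmem_cache_node. The walker below is illustrative only (count_free_objects() is a made-up name) and assumes the SLAB flavour of struct kmem_cache_node, which carries free_objects and list_lock.

#include <linux/spinlock.h>

/*
 * Illustrative per-node walk using the new helpers; no NULL check is
 * needed because the iterator only visits allocated nodes.
 */
static unsigned long count_free_objects(struct kmem_cache *cachep)
{
	struct kmem_cache_node *n;
	unsigned long free_objects = 0;
	int node;

	for_each_kmem_cache_node(cachep, node, n) {
		spin_lock_irq(&n->list_lock);
		free_objects += n->free_objects;
		spin_unlock_irq(&n->list_lock);
	}
	return free_objects;
}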
diff --git a/mm/slab_common.c b/mm/slab_common.c
index d31c4bacc6a2..d319502b2403 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -19,6 +19,8 @@
19#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
20#include <asm/page.h> 20#include <asm/page.h>
21#include <linux/memcontrol.h> 21#include <linux/memcontrol.h>
22
23#define CREATE_TRACE_POINTS
22#include <trace/events/kmem.h> 24#include <trace/events/kmem.h>
23 25
24#include "slab.h" 26#include "slab.h"
@@ -787,3 +789,102 @@ static int __init slab_proc_init(void)
787} 789}
788module_init(slab_proc_init); 790module_init(slab_proc_init);
789#endif /* CONFIG_SLABINFO */ 791#endif /* CONFIG_SLABINFO */
792
793static __always_inline void *__do_krealloc(const void *p, size_t new_size,
794 gfp_t flags)
795{
796 void *ret;
797 size_t ks = 0;
798
799 if (p)
800 ks = ksize(p);
801
802 if (ks >= new_size)
803 return (void *)p;
804
805 ret = kmalloc_track_caller(new_size, flags);
806 if (ret && p)
807 memcpy(ret, p, ks);
808
809 return ret;
810}
811
812/**
813 * __krealloc - like krealloc() but don't free @p.
814 * @p: object to reallocate memory for.
815 * @new_size: how many bytes of memory are required.
816 * @flags: the type of memory to allocate.
817 *
818 * This function is like krealloc() except it never frees the originally
819 * allocated buffer. Use this if you don't want to free the buffer immediately
820 * like, for example, with RCU.
821 */
822void *__krealloc(const void *p, size_t new_size, gfp_t flags)
823{
824 if (unlikely(!new_size))
825 return ZERO_SIZE_PTR;
826
827 return __do_krealloc(p, new_size, flags);
828
829}
830EXPORT_SYMBOL(__krealloc);
831
832/**
833 * krealloc - reallocate memory. The contents will remain unchanged.
834 * @p: object to reallocate memory for.
835 * @new_size: how many bytes of memory are required.
836 * @flags: the type of memory to allocate.
837 *
838 * The contents of the object pointed to are preserved up to the
839 * lesser of the new and old sizes. If @p is %NULL, krealloc()
840 * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
841 * %NULL pointer, the object pointed to is freed.
842 */
843void *krealloc(const void *p, size_t new_size, gfp_t flags)
844{
845 void *ret;
846
847 if (unlikely(!new_size)) {
848 kfree(p);
849 return ZERO_SIZE_PTR;
850 }
851
852 ret = __do_krealloc(p, new_size, flags);
853 if (ret && p != ret)
854 kfree(p);
855
856 return ret;
857}
858EXPORT_SYMBOL(krealloc);
859
860/**
861 * kzfree - like kfree but zero memory
862 * @p: object to free memory of
863 *
864 * The memory of the object @p points to is zeroed before freed.
865 * If @p is %NULL, kzfree() does nothing.
866 *
867 * Note: this function zeroes the whole allocated buffer which can be a good
868 * deal bigger than the requested buffer size passed to kmalloc(). So be
869 * careful when using this function in performance sensitive code.
870 */
871void kzfree(const void *p)
872{
873 size_t ks;
874 void *mem = (void *)p;
875
876 if (unlikely(ZERO_OR_NULL_PTR(mem)))
877 return;
878 ks = ksize(mem);
879 memset(mem, 0, ks);
880 kfree(mem);
881}
882EXPORT_SYMBOL(kzfree);
883
884/* Tracepoints definitions. */
885EXPORT_TRACEPOINT_SYMBOL(kmalloc);
886EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
887EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
888EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
889EXPORT_TRACEPOINT_SYMBOL(kfree);
890EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
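
The krealloc()/__krealloc()/kzfree() implementations above are carried over verbatim from mm/util.c (their removal appears in the mm/util.c hunks below), and the kmem tracepoint definitions move with them, which is why CREATE_TRACE_POINTS is now defined here before including trace/events/kmem.h. As a reminder of the documented semantics, a small caller sketch (the helper name and error handling are illustrative, not part of the patch):

	/* Grow a kmalloc()ed buffer; on failure the old buffer stays valid. */
	static int grow_buffer(char **bufp, size_t new_len)
	{
		char *tmp = krealloc(*bufp, new_len, GFP_KERNEL);

		if (!tmp)
			return -ENOMEM;	/* *bufp was not freed */
		*bufp = tmp;		/* old buffer freed by krealloc() if it moved */
		return 0;
	}

Per the kernel-doc above, a NULL *bufp makes krealloc() behave like kmalloc(), and new_len == 0 frees the buffer and returns ZERO_SIZE_PTR rather than NULL, so the -ENOMEM branch is only taken on genuine allocation failure.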
diff --git a/mm/slub.c b/mm/slub.c
index 73004808537e..3e8afcc07a76 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -233,11 +233,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
233 * Core slab cache functions 233 * Core slab cache functions
234 *******************************************************************/ 234 *******************************************************************/
235 235
236static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
237{
238 return s->node[node];
239}
240
241/* Verify that a pointer has an address that is valid within a slab page */ 236/* Verify that a pointer has an address that is valid within a slab page */
242static inline int check_valid_pointer(struct kmem_cache *s, 237static inline int check_valid_pointer(struct kmem_cache *s,
243 struct page *page, const void *object) 238 struct page *page, const void *object)
@@ -288,6 +283,10 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
288 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ 283 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
289 __p += (__s)->size) 284 __p += (__s)->size)
290 285
286#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
287 for (__p = (__addr), __idx = 1; __idx <= __objects;\
288 __p += (__s)->size, __idx++)
289
291/* Determine object index from a given position */ 290/* Determine object index from a given position */
292static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 291static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
293{ 292{
@@ -382,9 +381,9 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
382 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 381 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
383 if (s->flags & __CMPXCHG_DOUBLE) { 382 if (s->flags & __CMPXCHG_DOUBLE) {
384 if (cmpxchg_double(&page->freelist, &page->counters, 383 if (cmpxchg_double(&page->freelist, &page->counters,
385 freelist_old, counters_old, 384 freelist_old, counters_old,
386 freelist_new, counters_new)) 385 freelist_new, counters_new))
387 return 1; 386 return 1;
388 } else 387 } else
389#endif 388#endif
390 { 389 {
@@ -418,9 +417,9 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
418 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 417 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
419 if (s->flags & __CMPXCHG_DOUBLE) { 418 if (s->flags & __CMPXCHG_DOUBLE) {
420 if (cmpxchg_double(&page->freelist, &page->counters, 419 if (cmpxchg_double(&page->freelist, &page->counters,
421 freelist_old, counters_old, 420 freelist_old, counters_old,
422 freelist_new, counters_new)) 421 freelist_new, counters_new))
423 return 1; 422 return 1;
424 } else 423 } else
425#endif 424#endif
426 { 425 {
@@ -945,60 +944,6 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
945} 944}
946 945
947/* 946/*
948 * Hooks for other subsystems that check memory allocations. In a typical
949 * production configuration these hooks all should produce no code at all.
950 */
951static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
952{
953 kmemleak_alloc(ptr, size, 1, flags);
954}
955
956static inline void kfree_hook(const void *x)
957{
958 kmemleak_free(x);
959}
960
961static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
962{
963 flags &= gfp_allowed_mask;
964 lockdep_trace_alloc(flags);
965 might_sleep_if(flags & __GFP_WAIT);
966
967 return should_failslab(s->object_size, flags, s->flags);
968}
969
970static inline void slab_post_alloc_hook(struct kmem_cache *s,
971 gfp_t flags, void *object)
972{
973 flags &= gfp_allowed_mask;
974 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
975 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
976}
977
978static inline void slab_free_hook(struct kmem_cache *s, void *x)
979{
980 kmemleak_free_recursive(x, s->flags);
981
982 /*
983 * Trouble is that we may no longer disable interrupts in the fast path
984 * So in order to make the debug calls that expect irqs to be
985 * disabled we need to disable interrupts temporarily.
986 */
987#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
988 {
989 unsigned long flags;
990
991 local_irq_save(flags);
992 kmemcheck_slab_free(s, x, s->object_size);
993 debug_check_no_locks_freed(x, s->object_size);
994 local_irq_restore(flags);
995 }
996#endif
997 if (!(s->flags & SLAB_DEBUG_OBJECTS))
998 debug_check_no_obj_freed(x, s->object_size);
999}
1000
1001/*
1002 * Tracking of fully allocated slabs for debugging purposes. 947 * Tracking of fully allocated slabs for debugging purposes.
1003 */ 948 */
1004static void add_full(struct kmem_cache *s, 949static void add_full(struct kmem_cache *s,
@@ -1282,6 +1227,12 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
1282static inline void dec_slabs_node(struct kmem_cache *s, int node, 1227static inline void dec_slabs_node(struct kmem_cache *s, int node,
1283 int objects) {} 1228 int objects) {}
1284 1229
1230#endif /* CONFIG_SLUB_DEBUG */
1231
1232/*
1233 * Hooks for other subsystems that check memory allocations. In a typical
1234 * production configuration these hooks all should produce no code at all.
1235 */
1285static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) 1236static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
1286{ 1237{
1287 kmemleak_alloc(ptr, size, 1, flags); 1238 kmemleak_alloc(ptr, size, 1, flags);
@@ -1293,21 +1244,44 @@ static inline void kfree_hook(const void *x)
1293} 1244}
1294 1245
1295static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) 1246static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
1296 { return 0; } 1247{
1248 flags &= gfp_allowed_mask;
1249 lockdep_trace_alloc(flags);
1250 might_sleep_if(flags & __GFP_WAIT);
1251
1252 return should_failslab(s->object_size, flags, s->flags);
1253}
1297 1254
1298static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, 1255static inline void slab_post_alloc_hook(struct kmem_cache *s,
1299 void *object) 1256 gfp_t flags, void *object)
1300{ 1257{
1301 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, 1258 flags &= gfp_allowed_mask;
1302 flags & gfp_allowed_mask); 1259 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
1260 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
1303} 1261}
1304 1262
1305static inline void slab_free_hook(struct kmem_cache *s, void *x) 1263static inline void slab_free_hook(struct kmem_cache *s, void *x)
1306{ 1264{
1307 kmemleak_free_recursive(x, s->flags); 1265 kmemleak_free_recursive(x, s->flags);
1308}
1309 1266
1310#endif /* CONFIG_SLUB_DEBUG */ 1267 /*
1268 * Trouble is that we may no longer disable interrupts in the fast path
1269 * So in order to make the debug calls that expect irqs to be
1270 * disabled we need to disable interrupts temporarily.
1271 */
1272#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
1273 {
1274 unsigned long flags;
1275
1276 local_irq_save(flags);
1277 kmemcheck_slab_free(s, x, s->object_size);
1278 debug_check_no_locks_freed(x, s->object_size);
1279 local_irq_restore(flags);
1280 }
1281#endif
1282 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1283 debug_check_no_obj_freed(x, s->object_size);
1284}
1311 1285
1312/* 1286/*
1313 * Slab allocation and freeing 1287 * Slab allocation and freeing
@@ -1409,9 +1383,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1409{ 1383{
1410 struct page *page; 1384 struct page *page;
1411 void *start; 1385 void *start;
1412 void *last;
1413 void *p; 1386 void *p;
1414 int order; 1387 int order;
1388 int idx;
1415 1389
1416 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1390 BUG_ON(flags & GFP_SLAB_BUG_MASK);
1417 1391
@@ -1432,14 +1406,13 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1432 if (unlikely(s->flags & SLAB_POISON)) 1406 if (unlikely(s->flags & SLAB_POISON))
1433 memset(start, POISON_INUSE, PAGE_SIZE << order); 1407 memset(start, POISON_INUSE, PAGE_SIZE << order);
1434 1408
1435 last = start; 1409 for_each_object_idx(p, idx, s, start, page->objects) {
1436 for_each_object(p, s, start, page->objects) { 1410 setup_object(s, page, p);
1437 setup_object(s, page, last); 1411 if (likely(idx < page->objects))
1438 set_freepointer(s, last, p); 1412 set_freepointer(s, p, p + s->size);
1439 last = p; 1413 else
1414 set_freepointer(s, p, NULL);
1440 } 1415 }
1441 setup_object(s, page, last);
1442 set_freepointer(s, last, NULL);
1443 1416
1444 page->freelist = start; 1417 page->freelist = start;
1445 page->inuse = page->objects; 1418 page->inuse = page->objects;
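
The rewritten loop in new_slab() above builds the initial freelist in one pass: for_each_object_idx() supplies both the object pointer and a 1-based index, so every object except the last gets a free pointer to p + s->size and the final one gets NULL, removing the old "last" bookkeeping. A stand-alone user-space analogue of the resulting layout (object size and count are made up; in the kernel the free pointer lives at s->offset inside the object, not at offset 0):

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		const size_t size = 64, objects = 4;
		char *start = calloc(objects, size);
		size_t idx;
		char *p;

		if (!start)
			return 1;

		/* Chain each object to the next; terminate the list with NULL. */
		for (p = start, idx = 1; idx <= objects; p += size, idx++)
			*(void **)p = (idx < objects) ? (void *)(p + size) : NULL;

		for (p = start; p; p = *(void **)p)
			printf("free object at offset %td\n", p - start);

		free(start);
		return 0;
	}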
@@ -2162,6 +2135,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2162 static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, 2135 static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
2163 DEFAULT_RATELIMIT_BURST); 2136 DEFAULT_RATELIMIT_BURST);
2164 int node; 2137 int node;
2138 struct kmem_cache_node *n;
2165 2139
2166 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) 2140 if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
2167 return; 2141 return;
@@ -2176,15 +2150,11 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2176 pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", 2150 pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n",
2177 s->name); 2151 s->name);
2178 2152
2179 for_each_online_node(node) { 2153 for_each_kmem_cache_node(s, node, n) {
2180 struct kmem_cache_node *n = get_node(s, node);
2181 unsigned long nr_slabs; 2154 unsigned long nr_slabs;
2182 unsigned long nr_objs; 2155 unsigned long nr_objs;
2183 unsigned long nr_free; 2156 unsigned long nr_free;
2184 2157
2185 if (!n)
2186 continue;
2187
2188 nr_free = count_partial(n, count_free); 2158 nr_free = count_partial(n, count_free);
2189 nr_slabs = node_nr_slabs(n); 2159 nr_slabs = node_nr_slabs(n);
2190 nr_objs = node_nr_objs(n); 2160 nr_objs = node_nr_objs(n);
@@ -2928,13 +2898,10 @@ static void early_kmem_cache_node_alloc(int node)
2928static void free_kmem_cache_nodes(struct kmem_cache *s) 2898static void free_kmem_cache_nodes(struct kmem_cache *s)
2929{ 2899{
2930 int node; 2900 int node;
2901 struct kmem_cache_node *n;
2931 2902
2932 for_each_node_state(node, N_NORMAL_MEMORY) { 2903 for_each_kmem_cache_node(s, node, n) {
2933 struct kmem_cache_node *n = s->node[node]; 2904 kmem_cache_free(kmem_cache_node, n);
2934
2935 if (n)
2936 kmem_cache_free(kmem_cache_node, n);
2937
2938 s->node[node] = NULL; 2905 s->node[node] = NULL;
2939 } 2906 }
2940} 2907}
@@ -3222,12 +3189,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
3222static inline int kmem_cache_close(struct kmem_cache *s) 3189static inline int kmem_cache_close(struct kmem_cache *s)
3223{ 3190{
3224 int node; 3191 int node;
3192 struct kmem_cache_node *n;
3225 3193
3226 flush_all(s); 3194 flush_all(s);
3227 /* Attempt to free all objects */ 3195 /* Attempt to free all objects */
3228 for_each_node_state(node, N_NORMAL_MEMORY) { 3196 for_each_kmem_cache_node(s, node, n) {
3229 struct kmem_cache_node *n = get_node(s, node);
3230
3231 free_partial(s, n); 3197 free_partial(s, n);
3232 if (n->nr_partial || slabs_node(s, node)) 3198 if (n->nr_partial || slabs_node(s, node))
3233 return 1; 3199 return 1;
@@ -3412,9 +3378,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
3412 return -ENOMEM; 3378 return -ENOMEM;
3413 3379
3414 flush_all(s); 3380 flush_all(s);
3415 for_each_node_state(node, N_NORMAL_MEMORY) { 3381 for_each_kmem_cache_node(s, node, n) {
3416 n = get_node(s, node);
3417
3418 if (!n->nr_partial) 3382 if (!n->nr_partial)
3419 continue; 3383 continue;
3420 3384
@@ -3586,6 +3550,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
3586{ 3550{
3587 int node; 3551 int node;
3588 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 3552 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
3553 struct kmem_cache_node *n;
3589 3554
3590 memcpy(s, static_cache, kmem_cache->object_size); 3555 memcpy(s, static_cache, kmem_cache->object_size);
3591 3556
@@ -3595,19 +3560,16 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
3595 * IPIs around. 3560 * IPIs around.
3596 */ 3561 */
3597 __flush_cpu_slab(s, smp_processor_id()); 3562 __flush_cpu_slab(s, smp_processor_id());
3598 for_each_node_state(node, N_NORMAL_MEMORY) { 3563 for_each_kmem_cache_node(s, node, n) {
3599 struct kmem_cache_node *n = get_node(s, node);
3600 struct page *p; 3564 struct page *p;
3601 3565
3602 if (n) { 3566 list_for_each_entry(p, &n->partial, lru)
3603 list_for_each_entry(p, &n->partial, lru) 3567 p->slab_cache = s;
3604 p->slab_cache = s;
3605 3568
3606#ifdef CONFIG_SLUB_DEBUG 3569#ifdef CONFIG_SLUB_DEBUG
3607 list_for_each_entry(p, &n->full, lru) 3570 list_for_each_entry(p, &n->full, lru)
3608 p->slab_cache = s; 3571 p->slab_cache = s;
3609#endif 3572#endif
3610 }
3611 } 3573 }
3612 list_add(&s->list, &slab_caches); 3574 list_add(&s->list, &slab_caches);
3613 return s; 3575 return s;
@@ -3960,16 +3922,14 @@ static long validate_slab_cache(struct kmem_cache *s)
3960 unsigned long count = 0; 3922 unsigned long count = 0;
3961 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 3923 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
3962 sizeof(unsigned long), GFP_KERNEL); 3924 sizeof(unsigned long), GFP_KERNEL);
3925 struct kmem_cache_node *n;
3963 3926
3964 if (!map) 3927 if (!map)
3965 return -ENOMEM; 3928 return -ENOMEM;
3966 3929
3967 flush_all(s); 3930 flush_all(s);
3968 for_each_node_state(node, N_NORMAL_MEMORY) { 3931 for_each_kmem_cache_node(s, node, n)
3969 struct kmem_cache_node *n = get_node(s, node);
3970
3971 count += validate_slab_node(s, n, map); 3932 count += validate_slab_node(s, n, map);
3972 }
3973 kfree(map); 3933 kfree(map);
3974 return count; 3934 return count;
3975} 3935}
@@ -4123,6 +4083,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
4123 int node; 4083 int node;
4124 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * 4084 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
4125 sizeof(unsigned long), GFP_KERNEL); 4085 sizeof(unsigned long), GFP_KERNEL);
4086 struct kmem_cache_node *n;
4126 4087
4127 if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 4088 if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
4128 GFP_TEMPORARY)) { 4089 GFP_TEMPORARY)) {
@@ -4132,8 +4093,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
4132 /* Push back cpu slabs */ 4093 /* Push back cpu slabs */
4133 flush_all(s); 4094 flush_all(s);
4134 4095
4135 for_each_node_state(node, N_NORMAL_MEMORY) { 4096 for_each_kmem_cache_node(s, node, n) {
4136 struct kmem_cache_node *n = get_node(s, node);
4137 unsigned long flags; 4097 unsigned long flags;
4138 struct page *page; 4098 struct page *page;
4139 4099
@@ -4205,7 +4165,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
4205#endif 4165#endif
4206 4166
4207#ifdef SLUB_RESILIENCY_TEST 4167#ifdef SLUB_RESILIENCY_TEST
4208static void resiliency_test(void) 4168static void __init resiliency_test(void)
4209{ 4169{
4210 u8 *p; 4170 u8 *p;
4211 4171
@@ -4332,8 +4292,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4332 get_online_mems(); 4292 get_online_mems();
4333#ifdef CONFIG_SLUB_DEBUG 4293#ifdef CONFIG_SLUB_DEBUG
4334 if (flags & SO_ALL) { 4294 if (flags & SO_ALL) {
4335 for_each_node_state(node, N_NORMAL_MEMORY) { 4295 struct kmem_cache_node *n;
4336 struct kmem_cache_node *n = get_node(s, node); 4296
4297 for_each_kmem_cache_node(s, node, n) {
4337 4298
4338 if (flags & SO_TOTAL) 4299 if (flags & SO_TOTAL)
4339 x = atomic_long_read(&n->total_objects); 4300 x = atomic_long_read(&n->total_objects);
@@ -4349,9 +4310,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4349 } else 4310 } else
4350#endif 4311#endif
4351 if (flags & SO_PARTIAL) { 4312 if (flags & SO_PARTIAL) {
4352 for_each_node_state(node, N_NORMAL_MEMORY) { 4313 struct kmem_cache_node *n;
4353 struct kmem_cache_node *n = get_node(s, node);
4354 4314
4315 for_each_kmem_cache_node(s, node, n) {
4355 if (flags & SO_TOTAL) 4316 if (flags & SO_TOTAL)
4356 x = count_partial(n, count_total); 4317 x = count_partial(n, count_total);
4357 else if (flags & SO_OBJECTS) 4318 else if (flags & SO_OBJECTS)
@@ -4364,7 +4325,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4364 } 4325 }
4365 x = sprintf(buf, "%lu", total); 4326 x = sprintf(buf, "%lu", total);
4366#ifdef CONFIG_NUMA 4327#ifdef CONFIG_NUMA
4367 for_each_node_state(node, N_NORMAL_MEMORY) 4328 for (node = 0; node < nr_node_ids; node++)
4368 if (nodes[node]) 4329 if (nodes[node])
4369 x += sprintf(buf + x, " N%d=%lu", 4330 x += sprintf(buf + x, " N%d=%lu",
4370 node, nodes[node]); 4331 node, nodes[node]);
@@ -4378,16 +4339,12 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4378static int any_slab_objects(struct kmem_cache *s) 4339static int any_slab_objects(struct kmem_cache *s)
4379{ 4340{
4380 int node; 4341 int node;
4342 struct kmem_cache_node *n;
4381 4343
4382 for_each_online_node(node) { 4344 for_each_kmem_cache_node(s, node, n)
4383 struct kmem_cache_node *n = get_node(s, node);
4384
4385 if (!n)
4386 continue;
4387
4388 if (atomic_long_read(&n->total_objects)) 4345 if (atomic_long_read(&n->total_objects))
4389 return 1; 4346 return 1;
4390 } 4347
4391 return 0; 4348 return 0;
4392} 4349}
4393#endif 4350#endif
@@ -4509,7 +4466,7 @@ SLAB_ATTR_RO(ctor);
4509 4466
4510static ssize_t aliases_show(struct kmem_cache *s, char *buf) 4467static ssize_t aliases_show(struct kmem_cache *s, char *buf)
4511{ 4468{
4512 return sprintf(buf, "%d\n", s->refcount - 1); 4469 return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
4513} 4470}
4514SLAB_ATTR_RO(aliases); 4471SLAB_ATTR_RO(aliases);
4515 4472
@@ -5171,12 +5128,6 @@ static char *create_unique_id(struct kmem_cache *s)
5171 *p++ = '-'; 5128 *p++ = '-';
5172 p += sprintf(p, "%07d", s->size); 5129 p += sprintf(p, "%07d", s->size);
5173 5130
5174#ifdef CONFIG_MEMCG_KMEM
5175 if (!is_root_cache(s))
5176 p += sprintf(p, "-%08d",
5177 memcg_cache_id(s->memcg_params->memcg));
5178#endif
5179
5180 BUG_ON(p > name + ID_STR_LENGTH - 1); 5131 BUG_ON(p > name + ID_STR_LENGTH - 1);
5181 return name; 5132 return name;
5182} 5133}
@@ -5342,13 +5293,9 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5342 unsigned long nr_objs = 0; 5293 unsigned long nr_objs = 0;
5343 unsigned long nr_free = 0; 5294 unsigned long nr_free = 0;
5344 int node; 5295 int node;
5296 struct kmem_cache_node *n;
5345 5297
5346 for_each_online_node(node) { 5298 for_each_kmem_cache_node(s, node, n) {
5347 struct kmem_cache_node *n = get_node(s, node);
5348
5349 if (!n)
5350 continue;
5351
5352 nr_slabs += node_nr_slabs(n); 5299 nr_slabs += node_nr_slabs(n);
5353 nr_objs += node_nr_objs(n); 5300 nr_objs += node_nr_objs(n);
5354 nr_free += count_partial(n, count_free); 5301 nr_free += count_partial(n, count_free);
diff --git a/mm/swap.c b/mm/swap.c
index 9e8e3472248b..c789d01c9ec3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -501,7 +501,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
501 SetPageActive(page); 501 SetPageActive(page);
502 lru += LRU_ACTIVE; 502 lru += LRU_ACTIVE;
503 add_page_to_lru_list(page, lruvec, lru); 503 add_page_to_lru_list(page, lruvec, lru);
504 trace_mm_lru_activate(page, page_to_pfn(page)); 504 trace_mm_lru_activate(page);
505 505
506 __count_vm_event(PGACTIVATE); 506 __count_vm_event(PGACTIVATE);
507 update_page_reclaim_stat(lruvec, file, 1); 507 update_page_reclaim_stat(lruvec, file, 1);
@@ -589,6 +589,9 @@ static void __lru_cache_activate_page(struct page *page)
589 * inactive,unreferenced -> inactive,referenced 589 * inactive,unreferenced -> inactive,referenced
590 * inactive,referenced -> active,unreferenced 590 * inactive,referenced -> active,unreferenced
591 * active,unreferenced -> active,referenced 591 * active,unreferenced -> active,referenced
592 *
593 * When a newly allocated page is not yet visible, so safe for non-atomic ops,
594 * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
592 */ 595 */
593void mark_page_accessed(struct page *page) 596void mark_page_accessed(struct page *page)
594{ 597{
@@ -614,17 +617,6 @@ void mark_page_accessed(struct page *page)
614} 617}
615EXPORT_SYMBOL(mark_page_accessed); 618EXPORT_SYMBOL(mark_page_accessed);
616 619
617/*
618 * Used to mark_page_accessed(page) that is not visible yet and when it is
619 * still safe to use non-atomic ops
620 */
621void init_page_accessed(struct page *page)
622{
623 if (!PageReferenced(page))
624 __SetPageReferenced(page);
625}
626EXPORT_SYMBOL(init_page_accessed);
627
628static void __lru_cache_add(struct page *page) 620static void __lru_cache_add(struct page *page)
629{ 621{
630 struct pagevec *pvec = &get_cpu_var(lru_add_pvec); 622 struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
@@ -996,7 +988,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
996 SetPageLRU(page); 988 SetPageLRU(page);
997 add_page_to_lru_list(page, lruvec, lru); 989 add_page_to_lru_list(page, lruvec, lru);
998 update_page_reclaim_stat(lruvec, file, active); 990 update_page_reclaim_stat(lruvec, file, active);
999 trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); 991 trace_mm_lru_insertion(page, lru);
1000} 992}
1001 993
1002/* 994/*
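
The mm/swap.c changes above drop the pfn and flags arguments from the mm_lru_activate and mm_lru_insertion tracepoints (the trace events can derive them from the page, presumably so the work is only done when the tracepoint actually fires) and retire init_page_accessed(): the comment added to mark_page_accessed() tells callers that a page not yet visible to any other context may simply use the non-atomic setter. A sketch of what such a caller does now (the helper name is invented for illustration):

	/* For a freshly allocated page that nothing else can see yet,
	 * non-atomic bit ops are safe, so the removed init_page_accessed()
	 * reduces to: */
	static void mark_new_page_referenced(struct page *page)
	{
		if (!PageReferenced(page))
			__SetPageReferenced(page);
	}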
diff --git a/mm/util.c b/mm/util.c
index d5ea733c5082..7b6608df2ee8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -16,9 +16,6 @@
16 16
17#include "internal.h" 17#include "internal.h"
18 18
19#define CREATE_TRACE_POINTS
20#include <trace/events/kmem.h>
21
22/** 19/**
23 * kstrdup - allocate space for and copy an existing string 20 * kstrdup - allocate space for and copy an existing string
24 * @s: the string to duplicate 21 * @s: the string to duplicate
@@ -112,97 +109,6 @@ void *memdup_user(const void __user *src, size_t len)
112} 109}
113EXPORT_SYMBOL(memdup_user); 110EXPORT_SYMBOL(memdup_user);
114 111
115static __always_inline void *__do_krealloc(const void *p, size_t new_size,
116 gfp_t flags)
117{
118 void *ret;
119 size_t ks = 0;
120
121 if (p)
122 ks = ksize(p);
123
124 if (ks >= new_size)
125 return (void *)p;
126
127 ret = kmalloc_track_caller(new_size, flags);
128 if (ret && p)
129 memcpy(ret, p, ks);
130
131 return ret;
132}
133
134/**
135 * __krealloc - like krealloc() but don't free @p.
136 * @p: object to reallocate memory for.
137 * @new_size: how many bytes of memory are required.
138 * @flags: the type of memory to allocate.
139 *
140 * This function is like krealloc() except it never frees the originally
141 * allocated buffer. Use this if you don't want to free the buffer immediately
142 * like, for example, with RCU.
143 */
144void *__krealloc(const void *p, size_t new_size, gfp_t flags)
145{
146 if (unlikely(!new_size))
147 return ZERO_SIZE_PTR;
148
149 return __do_krealloc(p, new_size, flags);
150
151}
152EXPORT_SYMBOL(__krealloc);
153
154/**
155 * krealloc - reallocate memory. The contents will remain unchanged.
156 * @p: object to reallocate memory for.
157 * @new_size: how many bytes of memory are required.
158 * @flags: the type of memory to allocate.
159 *
160 * The contents of the object pointed to are preserved up to the
161 * lesser of the new and old sizes. If @p is %NULL, krealloc()
162 * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
163 * %NULL pointer, the object pointed to is freed.
164 */
165void *krealloc(const void *p, size_t new_size, gfp_t flags)
166{
167 void *ret;
168
169 if (unlikely(!new_size)) {
170 kfree(p);
171 return ZERO_SIZE_PTR;
172 }
173
174 ret = __do_krealloc(p, new_size, flags);
175 if (ret && p != ret)
176 kfree(p);
177
178 return ret;
179}
180EXPORT_SYMBOL(krealloc);
181
182/**
183 * kzfree - like kfree but zero memory
184 * @p: object to free memory of
185 *
186 * The memory of the object @p points to is zeroed before freed.
187 * If @p is %NULL, kzfree() does nothing.
188 *
189 * Note: this function zeroes the whole allocated buffer which can be a good
190 * deal bigger than the requested buffer size passed to kmalloc(). So be
191 * careful when using this function in performance sensitive code.
192 */
193void kzfree(const void *p)
194{
195 size_t ks;
196 void *mem = (void *)p;
197
198 if (unlikely(ZERO_OR_NULL_PTR(mem)))
199 return;
200 ks = ksize(mem);
201 memset(mem, 0, ks);
202 kfree(mem);
203}
204EXPORT_SYMBOL(kzfree);
205
206/* 112/*
207 * strndup_user - duplicate an existing string from user space 113 * strndup_user - duplicate an existing string from user space
208 * @s: The string to duplicate 114 * @s: The string to duplicate
@@ -504,11 +410,3 @@ out_mm:
504out: 410out:
505 return res; 411 return res;
506} 412}
507
508/* Tracepoints definitions. */
509EXPORT_TRACEPOINT_SYMBOL(kmalloc);
510EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
511EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
512EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
513EXPORT_TRACEPOINT_SYMBOL(kfree);
514EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f64632b67196..2b0aa5486092 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1270,19 +1270,15 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
1270} 1270}
1271EXPORT_SYMBOL_GPL(unmap_kernel_range); 1271EXPORT_SYMBOL_GPL(unmap_kernel_range);
1272 1272
1273int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) 1273int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
1274{ 1274{
1275 unsigned long addr = (unsigned long)area->addr; 1275 unsigned long addr = (unsigned long)area->addr;
1276 unsigned long end = addr + get_vm_area_size(area); 1276 unsigned long end = addr + get_vm_area_size(area);
1277 int err; 1277 int err;
1278 1278
1279 err = vmap_page_range(addr, end, prot, *pages); 1279 err = vmap_page_range(addr, end, prot, pages);
1280 if (err > 0) {
1281 *pages += err;
1282 err = 0;
1283 }
1284 1280
1285 return err; 1281 return err > 0 ? 0 : err;
1286} 1282}
1287EXPORT_SYMBOL_GPL(map_vm_area); 1283EXPORT_SYMBOL_GPL(map_vm_area);
1288 1284
@@ -1548,7 +1544,7 @@ void *vmap(struct page **pages, unsigned int count,
1548 if (!area) 1544 if (!area)
1549 return NULL; 1545 return NULL;
1550 1546
1551 if (map_vm_area(area, prot, &pages)) { 1547 if (map_vm_area(area, prot, pages)) {
1552 vunmap(area->addr); 1548 vunmap(area->addr);
1553 return NULL; 1549 return NULL;
1554 } 1550 }
@@ -1566,7 +1562,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1566 const int order = 0; 1562 const int order = 0;
1567 struct page **pages; 1563 struct page **pages;
1568 unsigned int nr_pages, array_size, i; 1564 unsigned int nr_pages, array_size, i;
1569 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 1565 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
1566 const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
1570 1567
1571 nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; 1568 nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
1572 array_size = (nr_pages * sizeof(struct page *)); 1569 array_size = (nr_pages * sizeof(struct page *));
@@ -1589,12 +1586,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1589 1586
1590 for (i = 0; i < area->nr_pages; i++) { 1587 for (i = 0; i < area->nr_pages; i++) {
1591 struct page *page; 1588 struct page *page;
1592 gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
1593 1589
1594 if (node == NUMA_NO_NODE) 1590 if (node == NUMA_NO_NODE)
1595 page = alloc_page(tmp_mask); 1591 page = alloc_page(alloc_mask);
1596 else 1592 else
1597 page = alloc_pages_node(node, tmp_mask, order); 1593 page = alloc_pages_node(node, alloc_mask, order);
1598 1594
1599 if (unlikely(!page)) { 1595 if (unlikely(!page)) {
1600 /* Successfully allocated i pages, free them in __vunmap() */ 1596 /* Successfully allocated i pages, free them in __vunmap() */
@@ -1602,9 +1598,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1602 goto fail; 1598 goto fail;
1603 } 1599 }
1604 area->pages[i] = page; 1600 area->pages[i] = page;
1601 if (gfp_mask & __GFP_WAIT)
1602 cond_resched();
1605 } 1603 }
1606 1604
1607 if (map_vm_area(area, prot, &pages)) 1605 if (map_vm_area(area, prot, pages))
1608 goto fail; 1606 goto fail;
1609 return area->addr; 1607 return area->addr;
1610 1608
@@ -2690,14 +2688,14 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
2690 2688
2691 prev_end = VMALLOC_START; 2689 prev_end = VMALLOC_START;
2692 2690
2693 spin_lock(&vmap_area_lock); 2691 rcu_read_lock();
2694 2692
2695 if (list_empty(&vmap_area_list)) { 2693 if (list_empty(&vmap_area_list)) {
2696 vmi->largest_chunk = VMALLOC_TOTAL; 2694 vmi->largest_chunk = VMALLOC_TOTAL;
2697 goto out; 2695 goto out;
2698 } 2696 }
2699 2697
2700 list_for_each_entry(va, &vmap_area_list, list) { 2698 list_for_each_entry_rcu(va, &vmap_area_list, list) {
2701 unsigned long addr = va->va_start; 2699 unsigned long addr = va->va_start;
2702 2700
2703 /* 2701 /*
@@ -2724,7 +2722,7 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
2724 vmi->largest_chunk = VMALLOC_END - prev_end; 2722 vmi->largest_chunk = VMALLOC_END - prev_end;
2725 2723
2726out: 2724out:
2727 spin_unlock(&vmap_area_lock); 2725 rcu_read_unlock();
2728} 2726}
2729#endif 2727#endif
2730 2728
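
Three independent cleanups land in mm/vmalloc.c above: map_vm_area() now takes the page array directly (callers no longer rely on the old behaviour of advancing the struct page *** argument), __vmalloc_area_node() hoists the __GFP_NOWARN mask out of the allocation loop and adds a cond_resched() for __GFP_WAIT allocations, and get_vmalloc_info() walks vmap_area_list under RCU instead of taking vmap_area_lock. A sketch of the new map_vm_area() calling convention, modelled on the vmap() hunk above (the wrapper name is illustrative):

	static void *map_pages_example(struct page **pages, unsigned int count,
				       pgprot_t prot)
	{
		struct vm_struct *area = get_vm_area(count * PAGE_SIZE, VM_MAP);

		if (!area)
			return NULL;

		if (map_vm_area(area, prot, pages)) {	/* previously: &pages */
			vunmap(area->addr);
			return NULL;
		}
		return area->addr;
	}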
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0f16ffe8eb67..d2f65c856350 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -59,35 +59,20 @@
59#include <trace/events/vmscan.h> 59#include <trace/events/vmscan.h>
60 60
61struct scan_control { 61struct scan_control {
62 /* Incremented by the number of inactive pages that were scanned */
63 unsigned long nr_scanned;
64
65 /* Number of pages freed so far during a call to shrink_zones() */
66 unsigned long nr_reclaimed;
67
68 /* How many pages shrink_list() should reclaim */ 62 /* How many pages shrink_list() should reclaim */
69 unsigned long nr_to_reclaim; 63 unsigned long nr_to_reclaim;
70 64
71 unsigned long hibernation_mode;
72
73 /* This context's GFP mask */ 65 /* This context's GFP mask */
74 gfp_t gfp_mask; 66 gfp_t gfp_mask;
75 67
76 int may_writepage; 68 /* Allocation order */
77
78 /* Can mapped pages be reclaimed? */
79 int may_unmap;
80
81 /* Can pages be swapped as part of reclaim? */
82 int may_swap;
83
84 int order; 69 int order;
85 70
86 /* Scan (total_size >> priority) pages at once */ 71 /*
87 int priority; 72 * Nodemask of nodes allowed by the caller. If NULL, all nodes
88 73 * are scanned.
89 /* anon vs. file LRUs scanning "ratio" */ 74 */
90 int swappiness; 75 nodemask_t *nodemask;
91 76
92 /* 77 /*
93 * The memory cgroup that hit its limit and as a result is the 78 * The memory cgroup that hit its limit and as a result is the
@@ -95,11 +80,27 @@ struct scan_control {
95 */ 80 */
96 struct mem_cgroup *target_mem_cgroup; 81 struct mem_cgroup *target_mem_cgroup;
97 82
98 /* 83 /* Scan (total_size >> priority) pages at once */
99 * Nodemask of nodes allowed by the caller. If NULL, all nodes 84 int priority;
100 * are scanned. 85
101 */ 86 unsigned int may_writepage:1;
102 nodemask_t *nodemask; 87
88 /* Can mapped pages be reclaimed? */
89 unsigned int may_unmap:1;
90
91 /* Can pages be swapped as part of reclaim? */
92 unsigned int may_swap:1;
93
94 unsigned int hibernation_mode:1;
95
96 /* One of the zones is ready for compaction */
97 unsigned int compaction_ready:1;
98
99 /* Incremented by the number of inactive pages that were scanned */
100 unsigned long nr_scanned;
101
102 /* Number of pages freed so far during a call to shrink_zones() */
103 unsigned long nr_reclaimed;
103}; 104};
104 105
 105#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 106#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
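
The scan_control reshuffle above groups the caller-set tunables at the top, turns the boolean knobs into one-bit bitfields, adds a compaction_ready flag, and drops swappiness from the struct entirely (it becomes a parameter of shrink_lruvec()/get_scan_count(), see below). Since every caller uses designated initializers, any field left unnamed is zero-initialized, which is why the explicit .nr_scanned = 0, .order = 0, .target_mem_cgroup = NULL and .nodemask = NULL lines vanish from the initializers later in this file. A representative initializer under the new layout (the values are illustrative):

	struct scan_control sc = {
		.nr_to_reclaim	= SWAP_CLUSTER_MAX,
		.gfp_mask	= GFP_KERNEL,
		.order		= 0,
		.priority	= DEF_PRIORITY,
		.may_writepage	= 1,
		.may_unmap	= 1,
		.may_swap	= 1,
		/* nr_scanned, nr_reclaimed, compaction_ready, target_mem_cgroup
		 * and nodemask are implicitly zero/NULL */
	};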
@@ -136,7 +137,11 @@ struct scan_control {
136 * From 0 .. 100. Higher means more swappy. 137 * From 0 .. 100. Higher means more swappy.
137 */ 138 */
138int vm_swappiness = 60; 139int vm_swappiness = 60;
139unsigned long vm_total_pages; /* The total number of pages which the VM controls */ 140/*
141 * The total number of pages which are beyond the high watermark within all
142 * zones.
143 */
144unsigned long vm_total_pages;
140 145
141static LIST_HEAD(shrinker_list); 146static LIST_HEAD(shrinker_list);
142static DECLARE_RWSEM(shrinker_rwsem); 147static DECLARE_RWSEM(shrinker_rwsem);
@@ -169,7 +174,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
169 174
170bool zone_reclaimable(struct zone *zone) 175bool zone_reclaimable(struct zone *zone)
171{ 176{
172 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; 177 return zone_page_state(zone, NR_PAGES_SCANNED) <
178 zone_reclaimable_pages(zone) * 6;
173} 179}
174 180
175static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) 181static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
@@ -1503,7 +1509,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1503 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); 1509 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1504 1510
1505 if (global_reclaim(sc)) { 1511 if (global_reclaim(sc)) {
1506 zone->pages_scanned += nr_scanned; 1512 __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
1507 if (current_is_kswapd()) 1513 if (current_is_kswapd())
1508 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); 1514 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
1509 else 1515 else
@@ -1693,7 +1699,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
1693 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, 1699 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
1694 &nr_scanned, sc, isolate_mode, lru); 1700 &nr_scanned, sc, isolate_mode, lru);
1695 if (global_reclaim(sc)) 1701 if (global_reclaim(sc))
1696 zone->pages_scanned += nr_scanned; 1702 __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
1697 1703
1698 reclaim_stat->recent_scanned[file] += nr_taken; 1704 reclaim_stat->recent_scanned[file] += nr_taken;
1699 1705
@@ -1750,7 +1756,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
1750 * Count referenced pages from currently used mappings as rotated, 1756 * Count referenced pages from currently used mappings as rotated,
1751 * even though only some of them are actually re-activated. This 1757 * even though only some of them are actually re-activated. This
1752 * helps balance scan pressure between file and anonymous pages in 1758 * helps balance scan pressure between file and anonymous pages in
1753 * get_scan_ratio. 1759 * get_scan_count.
1754 */ 1760 */
1755 reclaim_stat->recent_rotated[file] += nr_rotated; 1761 reclaim_stat->recent_rotated[file] += nr_rotated;
1756 1762
@@ -1865,8 +1871,8 @@ enum scan_balance {
1865 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan 1871 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
1866 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan 1872 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
1867 */ 1873 */
1868static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, 1874static void get_scan_count(struct lruvec *lruvec, int swappiness,
1869 unsigned long *nr) 1875 struct scan_control *sc, unsigned long *nr)
1870{ 1876{
1871 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; 1877 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1872 u64 fraction[2]; 1878 u64 fraction[2];
@@ -1909,7 +1915,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1909 * using the memory controller's swap limit feature would be 1915 * using the memory controller's swap limit feature would be
1910 * too expensive. 1916 * too expensive.
1911 */ 1917 */
1912 if (!global_reclaim(sc) && !sc->swappiness) { 1918 if (!global_reclaim(sc) && !swappiness) {
1913 scan_balance = SCAN_FILE; 1919 scan_balance = SCAN_FILE;
1914 goto out; 1920 goto out;
1915 } 1921 }
@@ -1919,16 +1925,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1919 * system is close to OOM, scan both anon and file equally 1925 * system is close to OOM, scan both anon and file equally
1920 * (unless the swappiness setting disagrees with swapping). 1926 * (unless the swappiness setting disagrees with swapping).
1921 */ 1927 */
1922 if (!sc->priority && sc->swappiness) { 1928 if (!sc->priority && swappiness) {
1923 scan_balance = SCAN_EQUAL; 1929 scan_balance = SCAN_EQUAL;
1924 goto out; 1930 goto out;
1925 } 1931 }
1926 1932
1927 anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
1928 get_lru_size(lruvec, LRU_INACTIVE_ANON);
1929 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1930 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1931
1932 /* 1933 /*
1933 * Prevent the reclaimer from falling into the cache trap: as 1934 * Prevent the reclaimer from falling into the cache trap: as
1934 * cache pages start out inactive, every cache fault will tip 1935 * cache pages start out inactive, every cache fault will tip
@@ -1939,9 +1940,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1939 * anon pages. Try to detect this based on file LRU size. 1940 * anon pages. Try to detect this based on file LRU size.
1940 */ 1941 */
1941 if (global_reclaim(sc)) { 1942 if (global_reclaim(sc)) {
1942 unsigned long free = zone_page_state(zone, NR_FREE_PAGES); 1943 unsigned long zonefile;
1944 unsigned long zonefree;
1943 1945
1944 if (unlikely(file + free <= high_wmark_pages(zone))) { 1946 zonefree = zone_page_state(zone, NR_FREE_PAGES);
1947 zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
1948 zone_page_state(zone, NR_INACTIVE_FILE);
1949
1950 if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
1945 scan_balance = SCAN_ANON; 1951 scan_balance = SCAN_ANON;
1946 goto out; 1952 goto out;
1947 } 1953 }
@@ -1962,7 +1968,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1962 * With swappiness at 100, anonymous and file have the same priority. 1968 * With swappiness at 100, anonymous and file have the same priority.
1963 * This scanning priority is essentially the inverse of IO cost. 1969 * This scanning priority is essentially the inverse of IO cost.
1964 */ 1970 */
1965 anon_prio = sc->swappiness; 1971 anon_prio = swappiness;
1966 file_prio = 200 - anon_prio; 1972 file_prio = 200 - anon_prio;
1967 1973
1968 /* 1974 /*
@@ -1976,6 +1982,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1976 * 1982 *
1977 * anon in [0], file in [1] 1983 * anon in [0], file in [1]
1978 */ 1984 */
1985
1986 anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
1987 get_lru_size(lruvec, LRU_INACTIVE_ANON);
1988 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1989 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1990
1979 spin_lock_irq(&zone->lru_lock); 1991 spin_lock_irq(&zone->lru_lock);
1980 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1992 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1981 reclaim_stat->recent_scanned[0] /= 2; 1993 reclaim_stat->recent_scanned[0] /= 2;
@@ -2052,7 +2064,8 @@ out:
2052/* 2064/*
2053 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 2065 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
2054 */ 2066 */
2055static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 2067static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
2068 struct scan_control *sc)
2056{ 2069{
2057 unsigned long nr[NR_LRU_LISTS]; 2070 unsigned long nr[NR_LRU_LISTS];
2058 unsigned long targets[NR_LRU_LISTS]; 2071 unsigned long targets[NR_LRU_LISTS];
@@ -2063,7 +2076,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2063 struct blk_plug plug; 2076 struct blk_plug plug;
2064 bool scan_adjusted; 2077 bool scan_adjusted;
2065 2078
2066 get_scan_count(lruvec, sc, nr); 2079 get_scan_count(lruvec, swappiness, sc, nr);
2067 2080
2068 /* Record the original scan target for proportional adjustments later */ 2081 /* Record the original scan target for proportional adjustments later */
2069 memcpy(targets, nr, sizeof(nr)); 2082 memcpy(targets, nr, sizeof(nr));
@@ -2241,9 +2254,10 @@ static inline bool should_continue_reclaim(struct zone *zone,
2241 } 2254 }
2242} 2255}
2243 2256
2244static void shrink_zone(struct zone *zone, struct scan_control *sc) 2257static bool shrink_zone(struct zone *zone, struct scan_control *sc)
2245{ 2258{
2246 unsigned long nr_reclaimed, nr_scanned; 2259 unsigned long nr_reclaimed, nr_scanned;
2260 bool reclaimable = false;
2247 2261
2248 do { 2262 do {
2249 struct mem_cgroup *root = sc->target_mem_cgroup; 2263 struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2259,11 +2273,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
2259 memcg = mem_cgroup_iter(root, NULL, &reclaim); 2273 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2260 do { 2274 do {
2261 struct lruvec *lruvec; 2275 struct lruvec *lruvec;
2276 int swappiness;
2262 2277
2263 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2278 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2279 swappiness = mem_cgroup_swappiness(memcg);
2264 2280
2265 sc->swappiness = mem_cgroup_swappiness(memcg); 2281 shrink_lruvec(lruvec, swappiness, sc);
2266 shrink_lruvec(lruvec, sc);
2267 2282
2268 /* 2283 /*
2269 * Direct reclaim and kswapd have to scan all memory 2284 * Direct reclaim and kswapd have to scan all memory
@@ -2287,20 +2302,21 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
2287 sc->nr_scanned - nr_scanned, 2302 sc->nr_scanned - nr_scanned,
2288 sc->nr_reclaimed - nr_reclaimed); 2303 sc->nr_reclaimed - nr_reclaimed);
2289 2304
2305 if (sc->nr_reclaimed - nr_reclaimed)
2306 reclaimable = true;
2307
2290 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, 2308 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
2291 sc->nr_scanned - nr_scanned, sc)); 2309 sc->nr_scanned - nr_scanned, sc));
2310
2311 return reclaimable;
2292} 2312}
2293 2313
2294/* Returns true if compaction should go ahead for a high-order request */ 2314/* Returns true if compaction should go ahead for a high-order request */
2295static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) 2315static inline bool compaction_ready(struct zone *zone, int order)
2296{ 2316{
2297 unsigned long balance_gap, watermark; 2317 unsigned long balance_gap, watermark;
2298 bool watermark_ok; 2318 bool watermark_ok;
2299 2319
2300 /* Do not consider compaction for orders reclaim is meant to satisfy */
2301 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
2302 return false;
2303
2304 /* 2320 /*
2305 * Compaction takes time to run and there are potentially other 2321 * Compaction takes time to run and there are potentially other
2306 * callers using the pages just freed. Continue reclaiming until 2322 * callers using the pages just freed. Continue reclaiming until
@@ -2309,18 +2325,18 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2309 */ 2325 */
2310 balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( 2326 balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
2311 zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); 2327 zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
2312 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); 2328 watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
2313 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); 2329 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
2314 2330
2315 /* 2331 /*
2316 * If compaction is deferred, reclaim up to a point where 2332 * If compaction is deferred, reclaim up to a point where
2317 * compaction will have a chance of success when re-enabled 2333 * compaction will have a chance of success when re-enabled
2318 */ 2334 */
2319 if (compaction_deferred(zone, sc->order)) 2335 if (compaction_deferred(zone, order))
2320 return watermark_ok; 2336 return watermark_ok;
2321 2337
2322 /* If compaction is not ready to start, keep reclaiming */ 2338 /* If compaction is not ready to start, keep reclaiming */
2323 if (!compaction_suitable(zone, sc->order)) 2339 if (!compaction_suitable(zone, order))
2324 return false; 2340 return false;
2325 2341
2326 return watermark_ok; 2342 return watermark_ok;
@@ -2342,10 +2358,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2342 * If a zone is deemed to be full of pinned pages then just give it a light 2358 * If a zone is deemed to be full of pinned pages then just give it a light
2343 * scan then give up on it. 2359 * scan then give up on it.
2344 * 2360 *
2345 * This function returns true if a zone is being reclaimed for a costly 2361 * Returns true if a zone was reclaimable.
2346 * high-order allocation and compaction is ready to begin. This indicates to
2347 * the caller that it should consider retrying the allocation instead of
2348 * further reclaim.
2349 */ 2362 */
2350static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) 2363static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2351{ 2364{
@@ -2354,13 +2367,13 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2354 unsigned long nr_soft_reclaimed; 2367 unsigned long nr_soft_reclaimed;
2355 unsigned long nr_soft_scanned; 2368 unsigned long nr_soft_scanned;
2356 unsigned long lru_pages = 0; 2369 unsigned long lru_pages = 0;
2357 bool aborted_reclaim = false;
2358 struct reclaim_state *reclaim_state = current->reclaim_state; 2370 struct reclaim_state *reclaim_state = current->reclaim_state;
2359 gfp_t orig_mask; 2371 gfp_t orig_mask;
2360 struct shrink_control shrink = { 2372 struct shrink_control shrink = {
2361 .gfp_mask = sc->gfp_mask, 2373 .gfp_mask = sc->gfp_mask,
2362 }; 2374 };
2363 enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); 2375 enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
2376 bool reclaimable = false;
2364 2377
2365 /* 2378 /*
2366 * If the number of buffer_heads in the machine exceeds the maximum 2379 * If the number of buffer_heads in the machine exceeds the maximum
@@ -2391,22 +2404,24 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2391 if (sc->priority != DEF_PRIORITY && 2404 if (sc->priority != DEF_PRIORITY &&
2392 !zone_reclaimable(zone)) 2405 !zone_reclaimable(zone))
2393 continue; /* Let kswapd poll it */ 2406 continue; /* Let kswapd poll it */
2394 if (IS_ENABLED(CONFIG_COMPACTION)) { 2407
2395 /* 2408 /*
2396 * If we already have plenty of memory free for 2409 * If we already have plenty of memory free for
2397 * compaction in this zone, don't free any more. 2410 * compaction in this zone, don't free any more.
2398 * Even though compaction is invoked for any 2411 * Even though compaction is invoked for any
2399 * non-zero order, only frequent costly order 2412 * non-zero order, only frequent costly order
2400 * reclamation is disruptive enough to become a 2413 * reclamation is disruptive enough to become a
2401 * noticeable problem, like transparent huge 2414 * noticeable problem, like transparent huge
2402 * page allocations. 2415 * page allocations.
2403 */ 2416 */
2404 if ((zonelist_zone_idx(z) <= requested_highidx) 2417 if (IS_ENABLED(CONFIG_COMPACTION) &&
2405 && compaction_ready(zone, sc)) { 2418 sc->order > PAGE_ALLOC_COSTLY_ORDER &&
2406 aborted_reclaim = true; 2419 zonelist_zone_idx(z) <= requested_highidx &&
2407 continue; 2420 compaction_ready(zone, sc->order)) {
2408 } 2421 sc->compaction_ready = true;
2422 continue;
2409 } 2423 }
2424
2410 /* 2425 /*
2411 * This steals pages from memory cgroups over softlimit 2426 * This steals pages from memory cgroups over softlimit
2412 * and returns the number of reclaimed pages and 2427 * and returns the number of reclaimed pages and
@@ -2419,10 +2434,17 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2419 &nr_soft_scanned); 2434 &nr_soft_scanned);
2420 sc->nr_reclaimed += nr_soft_reclaimed; 2435 sc->nr_reclaimed += nr_soft_reclaimed;
2421 sc->nr_scanned += nr_soft_scanned; 2436 sc->nr_scanned += nr_soft_scanned;
2437 if (nr_soft_reclaimed)
2438 reclaimable = true;
2422 /* need some check for avoid more shrink_zone() */ 2439 /* need some check for avoid more shrink_zone() */
2423 } 2440 }
2424 2441
2425 shrink_zone(zone, sc); 2442 if (shrink_zone(zone, sc))
2443 reclaimable = true;
2444
2445 if (global_reclaim(sc) &&
2446 !reclaimable && zone_reclaimable(zone))
2447 reclaimable = true;
2426 } 2448 }
2427 2449
2428 /* 2450 /*
@@ -2445,27 +2467,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2445 */ 2467 */
2446 sc->gfp_mask = orig_mask; 2468 sc->gfp_mask = orig_mask;
2447 2469
2448 return aborted_reclaim; 2470 return reclaimable;
2449}
2450
2451/* All zones in zonelist are unreclaimable? */
2452static bool all_unreclaimable(struct zonelist *zonelist,
2453 struct scan_control *sc)
2454{
2455 struct zoneref *z;
2456 struct zone *zone;
2457
2458 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2459 gfp_zone(sc->gfp_mask), sc->nodemask) {
2460 if (!populated_zone(zone))
2461 continue;
2462 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2463 continue;
2464 if (zone_reclaimable(zone))
2465 return false;
2466 }
2467
2468 return true;
2469} 2471}
2470 2472
2471/* 2473/*
@@ -2489,7 +2491,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2489{ 2491{
2490 unsigned long total_scanned = 0; 2492 unsigned long total_scanned = 0;
2491 unsigned long writeback_threshold; 2493 unsigned long writeback_threshold;
2492 bool aborted_reclaim; 2494 bool zones_reclaimable;
2493 2495
2494 delayacct_freepages_start(); 2496 delayacct_freepages_start();
2495 2497
@@ -2500,11 +2502,14 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2500 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, 2502 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
2501 sc->priority); 2503 sc->priority);
2502 sc->nr_scanned = 0; 2504 sc->nr_scanned = 0;
2503 aborted_reclaim = shrink_zones(zonelist, sc); 2505 zones_reclaimable = shrink_zones(zonelist, sc);
2504 2506
2505 total_scanned += sc->nr_scanned; 2507 total_scanned += sc->nr_scanned;
2506 if (sc->nr_reclaimed >= sc->nr_to_reclaim) 2508 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
2507 goto out; 2509 break;
2510
2511 if (sc->compaction_ready)
2512 break;
2508 2513
2509 /* 2514 /*
2510 * If we're getting trouble reclaiming, start doing 2515 * If we're getting trouble reclaiming, start doing
@@ -2526,28 +2531,19 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2526 WB_REASON_TRY_TO_FREE_PAGES); 2531 WB_REASON_TRY_TO_FREE_PAGES);
2527 sc->may_writepage = 1; 2532 sc->may_writepage = 1;
2528 } 2533 }
2529 } while (--sc->priority >= 0 && !aborted_reclaim); 2534 } while (--sc->priority >= 0);
2530 2535
2531out:
2532 delayacct_freepages_end(); 2536 delayacct_freepages_end();
2533 2537
2534 if (sc->nr_reclaimed) 2538 if (sc->nr_reclaimed)
2535 return sc->nr_reclaimed; 2539 return sc->nr_reclaimed;
2536 2540
2537 /*
2538 * As hibernation is going on, kswapd is freezed so that it can't mark
2539 * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
2540 * check.
2541 */
2542 if (oom_killer_disabled)
2543 return 0;
2544
2545 /* Aborted reclaim to try compaction? don't OOM, then */ 2541 /* Aborted reclaim to try compaction? don't OOM, then */
2546 if (aborted_reclaim) 2542 if (sc->compaction_ready)
2547 return 1; 2543 return 1;
2548 2544
2549 /* top priority shrink_zones still had more to do? don't OOM, then */ 2545 /* Any of the zones still reclaimable? Don't OOM. */
2550 if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) 2546 if (zones_reclaimable)
2551 return 1; 2547 return 1;
2552 2548
2553 return 0; 2549 return 0;
@@ -2684,15 +2680,14 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2684{ 2680{
2685 unsigned long nr_reclaimed; 2681 unsigned long nr_reclaimed;
2686 struct scan_control sc = { 2682 struct scan_control sc = {
2683 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2687 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), 2684 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
2685 .order = order,
2686 .nodemask = nodemask,
2687 .priority = DEF_PRIORITY,
2688 .may_writepage = !laptop_mode, 2688 .may_writepage = !laptop_mode,
2689 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2690 .may_unmap = 1, 2689 .may_unmap = 1,
2691 .may_swap = 1, 2690 .may_swap = 1,
2692 .order = order,
2693 .priority = DEF_PRIORITY,
2694 .target_mem_cgroup = NULL,
2695 .nodemask = nodemask,
2696 }; 2691 };
2697 2692
2698 /* 2693 /*
@@ -2722,17 +2717,14 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2722 unsigned long *nr_scanned) 2717 unsigned long *nr_scanned)
2723{ 2718{
2724 struct scan_control sc = { 2719 struct scan_control sc = {
2725 .nr_scanned = 0,
2726 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2720 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2721 .target_mem_cgroup = memcg,
2727 .may_writepage = !laptop_mode, 2722 .may_writepage = !laptop_mode,
2728 .may_unmap = 1, 2723 .may_unmap = 1,
2729 .may_swap = !noswap, 2724 .may_swap = !noswap,
2730 .order = 0,
2731 .priority = 0,
2732 .swappiness = mem_cgroup_swappiness(memcg),
2733 .target_mem_cgroup = memcg,
2734 }; 2725 };
2735 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2726 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2727 int swappiness = mem_cgroup_swappiness(memcg);
2736 2728
2737 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2729 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2738 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2730 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2748,7 +2740,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2748 * will pick up pages from other mem cgroup's as well. We hack 2740 * will pick up pages from other mem cgroup's as well. We hack
2749 * the priority and make it zero. 2741 * the priority and make it zero.
2750 */ 2742 */
2751 shrink_lruvec(lruvec, &sc); 2743 shrink_lruvec(lruvec, swappiness, &sc);
2752 2744
2753 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2745 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2754 2746
@@ -2764,16 +2756,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2764 unsigned long nr_reclaimed; 2756 unsigned long nr_reclaimed;
2765 int nid; 2757 int nid;
2766 struct scan_control sc = { 2758 struct scan_control sc = {
2767 .may_writepage = !laptop_mode,
2768 .may_unmap = 1,
2769 .may_swap = !noswap,
2770 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2759 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2771 .order = 0,
2772 .priority = DEF_PRIORITY,
2773 .target_mem_cgroup = memcg,
2774 .nodemask = NULL, /* we don't care the placement */
2775 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2760 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2776 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 2761 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2762 .target_mem_cgroup = memcg,
2763 .priority = DEF_PRIORITY,
2764 .may_writepage = !laptop_mode,
2765 .may_unmap = 1,
2766 .may_swap = !noswap,
2777 }; 2767 };
2778 2768
2779 /* 2769 /*
@@ -3031,12 +3021,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3031 unsigned long nr_soft_scanned; 3021 unsigned long nr_soft_scanned;
3032 struct scan_control sc = { 3022 struct scan_control sc = {
3033 .gfp_mask = GFP_KERNEL, 3023 .gfp_mask = GFP_KERNEL,
3024 .order = order,
3034 .priority = DEF_PRIORITY, 3025 .priority = DEF_PRIORITY,
3026 .may_writepage = !laptop_mode,
3035 .may_unmap = 1, 3027 .may_unmap = 1,
3036 .may_swap = 1, 3028 .may_swap = 1,
3037 .may_writepage = !laptop_mode,
3038 .order = order,
3039 .target_mem_cgroup = NULL,
3040 }; 3029 };
3041 count_vm_event(PAGEOUTRUN); 3030 count_vm_event(PAGEOUTRUN);
3042 3031
@@ -3417,14 +3406,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3417{ 3406{
3418 struct reclaim_state reclaim_state; 3407 struct reclaim_state reclaim_state;
3419 struct scan_control sc = { 3408 struct scan_control sc = {
3409 .nr_to_reclaim = nr_to_reclaim,
3420 .gfp_mask = GFP_HIGHUSER_MOVABLE, 3410 .gfp_mask = GFP_HIGHUSER_MOVABLE,
3421 .may_swap = 1, 3411 .priority = DEF_PRIORITY,
3422 .may_unmap = 1,
3423 .may_writepage = 1, 3412 .may_writepage = 1,
3424 .nr_to_reclaim = nr_to_reclaim, 3413 .may_unmap = 1,
3414 .may_swap = 1,
3425 .hibernation_mode = 1, 3415 .hibernation_mode = 1,
3426 .order = 0,
3427 .priority = DEF_PRIORITY,
3428 }; 3416 };
3429 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 3417 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3430 struct task_struct *p = current; 3418 struct task_struct *p = current;
@@ -3604,13 +3592,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3604 struct task_struct *p = current; 3592 struct task_struct *p = current;
3605 struct reclaim_state reclaim_state; 3593 struct reclaim_state reclaim_state;
3606 struct scan_control sc = { 3594 struct scan_control sc = {
3607 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3608 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3609 .may_swap = 1,
3610 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), 3595 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3611 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), 3596 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
3612 .order = order, 3597 .order = order,
3613 .priority = ZONE_RECLAIM_PRIORITY, 3598 .priority = ZONE_RECLAIM_PRIORITY,
3599 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3600 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3601 .may_swap = 1,
3614 }; 3602 };
3615 struct shrink_control shrink = { 3603 struct shrink_control shrink = {
3616 .gfp_mask = sc.gfp_mask, 3604 .gfp_mask = sc.gfp_mask,
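The vmscan.c hunks above are largely a consolidation of scan_control initializers: fields that were explicitly set to their default value (.nr_scanned = 0, .order = 0, .target_mem_cgroup = NULL and so on) are simply dropped, because C designated initializers zero every member that is not named. A minimal, hypothetical illustration of that rule (the struct and values below are made up for illustration, not kernel code):

#include <stdio.h>

/* Stand-in for a control structure such as scan_control. */
struct ctl {
	unsigned long nr_to_reclaim;
	int order;
	int priority;
	void *target;
	unsigned int may_swap:1;
};

int main(void)
{
	/*
	 * Members not named here (order, target) are guaranteed to be
	 * zero-initialized, so ".order = 0" or ".target = NULL" entries
	 * would be redundant, which is why the patch can delete them.
	 */
	struct ctl c = {
		.nr_to_reclaim = 32,
		.priority = 12,
		.may_swap = 1,
	};

	printf("order=%d target=%p\n", c.order, c.target);
	return 0;
}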
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b37bd49bfd55..e9ab104b956f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -200,7 +200,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
200 continue; 200 continue;
201 201
202 threshold = (*calculate_pressure)(zone); 202 threshold = (*calculate_pressure)(zone);
203 for_each_possible_cpu(cpu) 203 for_each_online_cpu(cpu)
204 per_cpu_ptr(zone->pageset, cpu)->stat_threshold 204 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
205 = threshold; 205 = threshold;
206 } 206 }
@@ -763,6 +763,7 @@ const char * const vmstat_text[] = {
763 "nr_shmem", 763 "nr_shmem",
764 "nr_dirtied", 764 "nr_dirtied",
765 "nr_written", 765 "nr_written",
766 "nr_pages_scanned",
766 767
767#ifdef CONFIG_NUMA 768#ifdef CONFIG_NUMA
768 "numa_hit", 769 "numa_hit",
@@ -1067,7 +1068,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1067 min_wmark_pages(zone), 1068 min_wmark_pages(zone),
1068 low_wmark_pages(zone), 1069 low_wmark_pages(zone),
1069 high_wmark_pages(zone), 1070 high_wmark_pages(zone),
1070 zone->pages_scanned, 1071 zone_page_state(zone, NR_PAGES_SCANNED),
1071 zone->spanned_pages, 1072 zone->spanned_pages,
1072 zone->present_pages, 1073 zone->present_pages,
1073 zone->managed_pages); 1074 zone->managed_pages);
@@ -1077,10 +1078,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1077 zone_page_state(zone, i)); 1078 zone_page_state(zone, i));
1078 1079
1079 seq_printf(m, 1080 seq_printf(m,
1080 "\n protection: (%lu", 1081 "\n protection: (%ld",
1081 zone->lowmem_reserve[0]); 1082 zone->lowmem_reserve[0]);
1082 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) 1083 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
1083 seq_printf(m, ", %lu", zone->lowmem_reserve[i]); 1084 seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
1084 seq_printf(m, 1085 seq_printf(m,
1085 ")" 1086 ")"
1086 "\n pagesets"); 1087 "\n pagesets");
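One detail in the zoneinfo hunk above: the "protection:" line switches its format from %lu to %ld, which suggests zone->lowmem_reserve[] holds signed long values after this series; printed as unsigned, a negative entry would show up as an enormous bogus number. A throwaway user-space illustration, not kernel code:

#include <stdio.h>

int main(void)
{
	long reserve = -1;	/* stand-in for a signed lowmem_reserve[] entry */

	/* What the old %lu format would effectively display on 64-bit. */
	printf("as unsigned: %lu\n", (unsigned long)reserve);	/* 18446744073709551615 */
	/* What the new %ld format displays. */
	printf("as signed:   %ld\n", reserve);			/* -1 */
	return 0;
}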
diff --git a/mm/zbud.c b/mm/zbud.c
index 01df13a7e2e1..a05790b1915e 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -51,6 +51,7 @@
51#include <linux/slab.h> 51#include <linux/slab.h>
52#include <linux/spinlock.h> 52#include <linux/spinlock.h>
53#include <linux/zbud.h> 53#include <linux/zbud.h>
54#include <linux/zpool.h>
54 55
55/***************** 56/*****************
56 * Structures 57 * Structures
@@ -113,6 +114,90 @@ struct zbud_header {
113}; 114};
114 115
115/***************** 116/*****************
117 * zpool
118 ****************/
119
120#ifdef CONFIG_ZPOOL
121
122static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
123{
124 return zpool_evict(pool, handle);
125}
126
127static struct zbud_ops zbud_zpool_ops = {
128 .evict = zbud_zpool_evict
129};
130
131static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
132{
133 return zbud_create_pool(gfp, &zbud_zpool_ops);
134}
135
136static void zbud_zpool_destroy(void *pool)
137{
138 zbud_destroy_pool(pool);
139}
140
141static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp,
142 unsigned long *handle)
143{
144 return zbud_alloc(pool, size, gfp, handle);
145}
146static void zbud_zpool_free(void *pool, unsigned long handle)
147{
148 zbud_free(pool, handle);
149}
150
151static int zbud_zpool_shrink(void *pool, unsigned int pages,
152 unsigned int *reclaimed)
153{
154 unsigned int total = 0;
155 int ret = -EINVAL;
156
157 while (total < pages) {
158 ret = zbud_reclaim_page(pool, 8);
159 if (ret < 0)
160 break;
161 total++;
162 }
163
164 if (reclaimed)
165 *reclaimed = total;
166
167 return ret;
168}
169
170static void *zbud_zpool_map(void *pool, unsigned long handle,
171 enum zpool_mapmode mm)
172{
173 return zbud_map(pool, handle);
174}
175static void zbud_zpool_unmap(void *pool, unsigned long handle)
176{
177 zbud_unmap(pool, handle);
178}
179
180static u64 zbud_zpool_total_size(void *pool)
181{
182 return zbud_get_pool_size(pool) * PAGE_SIZE;
183}
184
185static struct zpool_driver zbud_zpool_driver = {
186 .type = "zbud",
187 .owner = THIS_MODULE,
188 .create = zbud_zpool_create,
189 .destroy = zbud_zpool_destroy,
190 .malloc = zbud_zpool_malloc,
191 .free = zbud_zpool_free,
192 .shrink = zbud_zpool_shrink,
193 .map = zbud_zpool_map,
194 .unmap = zbud_zpool_unmap,
195 .total_size = zbud_zpool_total_size,
196};
197
198#endif /* CONFIG_ZPOOL */
199
200/*****************
116 * Helpers 201 * Helpers
117*****************/ 202*****************/
118/* Just to make the code easier to read */ 203/* Just to make the code easier to read */
@@ -122,7 +207,7 @@ enum buddy {
122}; 207};
123 208
124/* Converts an allocation size in bytes to size in zbud chunks */ 209/* Converts an allocation size in bytes to size in zbud chunks */
125static int size_to_chunks(int size) 210static int size_to_chunks(size_t size)
126{ 211{
127 return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; 212 return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
128} 213}
@@ -247,7 +332,7 @@ void zbud_destroy_pool(struct zbud_pool *pool)
247 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate 332 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
248 * a new page. 333 * a new page.
249 */ 334 */
250int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, 335int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
251 unsigned long *handle) 336 unsigned long *handle)
252{ 337{
253 int chunks, i, freechunks; 338 int chunks, i, freechunks;
@@ -511,11 +596,20 @@ static int __init init_zbud(void)
511 /* Make sure the zbud header will fit in one chunk */ 596 /* Make sure the zbud header will fit in one chunk */
512 BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); 597 BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
513 pr_info("loaded\n"); 598 pr_info("loaded\n");
599
600#ifdef CONFIG_ZPOOL
601 zpool_register_driver(&zbud_zpool_driver);
602#endif
603
514 return 0; 604 return 0;
515} 605}
516 606
517static void __exit exit_zbud(void) 607static void __exit exit_zbud(void)
518{ 608{
609#ifdef CONFIG_ZPOOL
610 zpool_unregister_driver(&zbud_zpool_driver);
611#endif
612
519 pr_info("unloaded\n"); 613 pr_info("unloaded\n");
520} 614}
521 615
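The zbud glue above is mostly a thin adapter, but the eviction path is worth tracing: zpool_shrink() lands in zbud_zpool_shrink(), each zbud_reclaim_page() call (retried up to 8 times per page) fires zbud's evict op, and zbud_zpool_evict() forwards that through zpool_evict() to whatever evict callback the zpool user registered. A hedged sketch of the user-facing half of that contract; my_evict, my_ops and the callback body are hypothetical, the real callback in this series being zswap_writeback_entry further below:

#include <linux/zpool.h>

/*
 * Hypothetical evict callback supplied by a zpool user: write the object
 * behind @handle back to its real backing store, release it with
 * zpool_free(), and return 0 so the backend may keep reclaiming.
 */
static int my_evict(struct zpool *pool, unsigned long handle)
{
	/* ... write back the data stored at @handle ... */
	zpool_free(pool, handle);
	return 0;
}

/* Passed as the ops argument to zpool_create_pool(). */
static struct zpool_ops my_ops = {
	.evict = my_evict,
};

/* The user later triggers reclaim with, e.g., zpool_shrink(pool, 1, NULL). */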
diff --git a/mm/zpool.c b/mm/zpool.c
new file mode 100644
index 000000000000..e40612a1df00
--- /dev/null
+++ b/mm/zpool.c
@@ -0,0 +1,364 @@
1/*
2 * zpool memory storage api
3 *
4 * Copyright (C) 2014 Dan Streetman
5 *
6 * This is a common frontend for memory storage pool implementations.
7 * Typically, this is used to store compressed memory.
8 */
9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
12#include <linux/list.h>
13#include <linux/types.h>
14#include <linux/mm.h>
15#include <linux/slab.h>
16#include <linux/spinlock.h>
17#include <linux/module.h>
18#include <linux/zpool.h>
19
20struct zpool {
21 char *type;
22
23 struct zpool_driver *driver;
24 void *pool;
25 struct zpool_ops *ops;
26
27 struct list_head list;
28};
29
30static LIST_HEAD(drivers_head);
31static DEFINE_SPINLOCK(drivers_lock);
32
33static LIST_HEAD(pools_head);
34static DEFINE_SPINLOCK(pools_lock);
35
36/**
37 * zpool_register_driver() - register a zpool implementation.
38 * @driver: driver to register
39 */
40void zpool_register_driver(struct zpool_driver *driver)
41{
42 spin_lock(&drivers_lock);
43 atomic_set(&driver->refcount, 0);
44 list_add(&driver->list, &drivers_head);
45 spin_unlock(&drivers_lock);
46}
47EXPORT_SYMBOL(zpool_register_driver);
48
49/**
50 * zpool_unregister_driver() - unregister a zpool implementation.
51 * @driver: driver to unregister.
52 *
 53 * Module usage counting is used to prevent a driver from being
 54 * used while/after it is unloaded, so if this is called from the
 55 * module exit function it should never fail; if it is called from
 56 * anywhere other than the module exit function and returns
 57 * failure, the driver is still in use and must remain available.
58 */
59int zpool_unregister_driver(struct zpool_driver *driver)
60{
61 int ret = 0, refcount;
62
63 spin_lock(&drivers_lock);
64 refcount = atomic_read(&driver->refcount);
65 WARN_ON(refcount < 0);
66 if (refcount > 0)
67 ret = -EBUSY;
68 else
69 list_del(&driver->list);
70 spin_unlock(&drivers_lock);
71
72 return ret;
73}
74EXPORT_SYMBOL(zpool_unregister_driver);
75
76/**
77 * zpool_evict() - evict callback from a zpool implementation.
78 * @pool: pool to evict from.
79 * @handle: handle to evict.
80 *
81 * This can be used by zpool implementations to call the
 82 * evict callback from the user's zpool_ops struct.
83 */
84int zpool_evict(void *pool, unsigned long handle)
85{
86 struct zpool *zpool;
87
88 spin_lock(&pools_lock);
89 list_for_each_entry(zpool, &pools_head, list) {
90 if (zpool->pool == pool) {
91 spin_unlock(&pools_lock);
92 if (!zpool->ops || !zpool->ops->evict)
93 return -EINVAL;
94 return zpool->ops->evict(zpool, handle);
95 }
96 }
97 spin_unlock(&pools_lock);
98
99 return -ENOENT;
100}
101EXPORT_SYMBOL(zpool_evict);
102
103static struct zpool_driver *zpool_get_driver(char *type)
104{
105 struct zpool_driver *driver;
106
107 spin_lock(&drivers_lock);
108 list_for_each_entry(driver, &drivers_head, list) {
109 if (!strcmp(driver->type, type)) {
110 bool got = try_module_get(driver->owner);
111
112 if (got)
113 atomic_inc(&driver->refcount);
114 spin_unlock(&drivers_lock);
115 return got ? driver : NULL;
116 }
117 }
118
119 spin_unlock(&drivers_lock);
120 return NULL;
121}
122
123static void zpool_put_driver(struct zpool_driver *driver)
124{
125 atomic_dec(&driver->refcount);
126 module_put(driver->owner);
127}
128
129/**
130 * zpool_create_pool() - Create a new zpool
131 * @type The type of the zpool to create (e.g. zbud, zsmalloc)
132 * @gfp The GFP flags to use when allocating the pool.
133 * @ops The optional ops callback.
134 *
135 * This creates a new zpool of the specified type. The gfp flags will be
136 * used when allocating memory, if the implementation supports it. If the
137 * ops param is NULL, then the created zpool will not be shrinkable.
138 *
139 * Implementations must guarantee this to be thread-safe.
140 *
141 * Returns: New zpool on success, NULL on failure.
142 */
143struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
144{
145 struct zpool_driver *driver;
146 struct zpool *zpool;
147
148 pr_info("creating pool type %s\n", type);
149
150 driver = zpool_get_driver(type);
151
152 if (!driver) {
153 request_module(type);
154 driver = zpool_get_driver(type);
155 }
156
157 if (!driver) {
158 pr_err("no driver for type %s\n", type);
159 return NULL;
160 }
161
162 zpool = kmalloc(sizeof(*zpool), gfp);
163 if (!zpool) {
164 pr_err("couldn't create zpool - out of memory\n");
165 zpool_put_driver(driver);
166 return NULL;
167 }
168
169 zpool->type = driver->type;
170 zpool->driver = driver;
171 zpool->pool = driver->create(gfp, ops);
172 zpool->ops = ops;
173
174 if (!zpool->pool) {
175 pr_err("couldn't create %s pool\n", type);
176 zpool_put_driver(driver);
177 kfree(zpool);
178 return NULL;
179 }
180
181 pr_info("created %s pool\n", type);
182
183 spin_lock(&pools_lock);
184 list_add(&zpool->list, &pools_head);
185 spin_unlock(&pools_lock);
186
187 return zpool;
188}
189
190/**
191 * zpool_destroy_pool() - Destroy a zpool
192 * @pool The zpool to destroy.
193 *
194 * Implementations must guarantee this to be thread-safe,
195 * however only when destroying different pools. The same
196 * pool should only be destroyed once, and should not be used
197 * after it is destroyed.
198 *
199 * This destroys an existing zpool. The zpool should not be in use.
200 */
201void zpool_destroy_pool(struct zpool *zpool)
202{
203 pr_info("destroying pool type %s\n", zpool->type);
204
205 spin_lock(&pools_lock);
206 list_del(&zpool->list);
207 spin_unlock(&pools_lock);
208 zpool->driver->destroy(zpool->pool);
209 zpool_put_driver(zpool->driver);
210 kfree(zpool);
211}
212
213/**
214 * zpool_get_type() - Get the type of the zpool
215 * @pool The zpool to check
216 *
217 * This returns the type of the pool.
218 *
219 * Implementations must guarantee this to be thread-safe.
220 *
221 * Returns: The type of zpool.
222 */
223char *zpool_get_type(struct zpool *zpool)
224{
225 return zpool->type;
226}
227
228/**
229 * zpool_malloc() - Allocate memory
230 * @pool The zpool to allocate from.
231 * @size The amount of memory to allocate.
232 * @gfp The GFP flags to use when allocating memory.
233 * @handle Pointer to the handle to set
234 *
235 * This allocates the requested amount of memory from the pool.
236 * The gfp flags will be used when allocating memory, if the
237 * implementation supports it. The provided @handle will be
238 * set to the allocated object handle.
239 *
240 * Implementations must guarantee this to be thread-safe.
241 *
242 * Returns: 0 on success, negative value on error.
243 */
244int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp,
245 unsigned long *handle)
246{
247 return zpool->driver->malloc(zpool->pool, size, gfp, handle);
248}
249
250/**
251 * zpool_free() - Free previously allocated memory
252 * @pool The zpool that allocated the memory.
253 * @handle The handle to the memory to free.
254 *
255 * This frees previously allocated memory. This does not guarantee
 256 * that the pool will actually release memory back to the system,
 257 * only that this memory becomes available for reuse within the pool.
258 *
259 * Implementations must guarantee this to be thread-safe,
260 * however only when freeing different handles. The same
261 * handle should only be freed once, and should not be used
262 * after freeing.
263 */
264void zpool_free(struct zpool *zpool, unsigned long handle)
265{
266 zpool->driver->free(zpool->pool, handle);
267}
268
269/**
270 * zpool_shrink() - Shrink the pool size
271 * @pool The zpool to shrink.
272 * @pages The number of pages to shrink the pool.
273 * @reclaimed The number of pages successfully evicted.
274 *
275 * This attempts to shrink the actual memory size of the pool
276 * by evicting currently used handle(s). If the pool was
277 * created with no zpool_ops, or the evict call fails for any
278 * of the handles, this will fail. If non-NULL, the @reclaimed
279 * parameter will be set to the number of pages reclaimed,
280 * which may be more than the number of pages requested.
281 *
282 * Implementations must guarantee this to be thread-safe.
283 *
284 * Returns: 0 on success, negative value on error/failure.
285 */
286int zpool_shrink(struct zpool *zpool, unsigned int pages,
287 unsigned int *reclaimed)
288{
289 return zpool->driver->shrink(zpool->pool, pages, reclaimed);
290}
291
292/**
293 * zpool_map_handle() - Map a previously allocated handle into memory
294 * @pool The zpool that the handle was allocated from
295 * @handle The handle to map
296 * @mm How the memory should be mapped
297 *
298 * This maps a previously allocated handle into memory. The @mm
299 * param indicates to the implementation how the memory will be
300 * used, i.e. read-only, write-only, read-write. If the
301 * implementation does not support it, the memory will be treated
302 * as read-write.
303 *
304 * This may hold locks, disable interrupts, and/or preemption,
305 * and the zpool_unmap_handle() must be called to undo those
306 * actions. The code that uses the mapped handle should complete
 307 * its operations on the mapped handle memory quickly and unmap
308 * as soon as possible. As the implementation may use per-cpu
309 * data, multiple handles should not be mapped concurrently on
310 * any cpu.
311 *
312 * Returns: A pointer to the handle's mapped memory area.
313 */
314void *zpool_map_handle(struct zpool *zpool, unsigned long handle,
315 enum zpool_mapmode mapmode)
316{
317 return zpool->driver->map(zpool->pool, handle, mapmode);
318}
319
320/**
321 * zpool_unmap_handle() - Unmap a previously mapped handle
322 * @pool The zpool that the handle was allocated from
323 * @handle The handle to unmap
324 *
325 * This unmaps a previously mapped handle. Any locks or other
326 * actions that the implementation took in zpool_map_handle()
327 * will be undone here. The memory area returned from
328 * zpool_map_handle() should no longer be used after this.
329 */
330void zpool_unmap_handle(struct zpool *zpool, unsigned long handle)
331{
332 zpool->driver->unmap(zpool->pool, handle);
333}
334
335/**
336 * zpool_get_total_size() - The total size of the pool
337 * @pool The zpool to check
338 *
339 * This returns the total size in bytes of the pool.
340 *
341 * Returns: Total size of the zpool in bytes.
342 */
343u64 zpool_get_total_size(struct zpool *zpool)
344{
345 return zpool->driver->total_size(zpool->pool);
346}
347
348static int __init init_zpool(void)
349{
350 pr_info("loaded\n");
351 return 0;
352}
353
354static void __exit exit_zpool(void)
355{
356 pr_info("unloaded\n");
357}
358
359module_init(init_zpool);
360module_exit(exit_zpool);
361
362MODULE_LICENSE("GPL");
363MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
364MODULE_DESCRIPTION("Common API for compressed memory storage");
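Taken together, the kernel-doc above describes a short create/malloc/map/unmap/free/destroy lifecycle for a zpool user. A minimal sketch, assuming the "zbud" backend is available and with error handling trimmed (illustrative only, not code from this patch):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/printk.h>
#include <linux/string.h>
#include <linux/zpool.h>

static int zpool_demo(void)
{
	struct zpool *pool;
	unsigned long handle;
	char *buf;

	/* A NULL ops pointer means the pool cannot be shrunk via zpool_shrink(). */
	pool = zpool_create_pool("zbud", GFP_KERNEL, NULL);
	if (!pool)
		return -ENOMEM;

	if (zpool_malloc(pool, 64, GFP_KERNEL, &handle)) {
		zpool_destroy_pool(pool);
		return -ENOMEM;
	}

	/* Map write-only, fill it, and unmap promptly; the mapping may
	 * disable preemption or use per-cpu state in some backends. */
	buf = zpool_map_handle(pool, handle, ZPOOL_MM_WO);
	memset(buf, 0xaa, 64);
	zpool_unmap_handle(pool, handle);

	pr_info("%s pool now holds %llu bytes\n", zpool_get_type(pool),
		(unsigned long long)zpool_get_total_size(pool));

	zpool_free(pool, handle);
	zpool_destroy_pool(pool);
	return 0;
}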
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index fe78189624cf..4e2fc83cb394 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -92,6 +92,7 @@
92#include <linux/spinlock.h> 92#include <linux/spinlock.h>
93#include <linux/types.h> 93#include <linux/types.h>
94#include <linux/zsmalloc.h> 94#include <linux/zsmalloc.h>
95#include <linux/zpool.h>
95 96
96/* 97/*
97 * This must be power of 2 and greater than of equal to sizeof(link_free). 98 * This must be power of 2 and greater than of equal to sizeof(link_free).
@@ -240,6 +241,81 @@ struct mapping_area {
240 enum zs_mapmode vm_mm; /* mapping mode */ 241 enum zs_mapmode vm_mm; /* mapping mode */
241}; 242};
242 243
244/* zpool driver */
245
246#ifdef CONFIG_ZPOOL
247
248static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
249{
250 return zs_create_pool(gfp);
251}
252
253static void zs_zpool_destroy(void *pool)
254{
255 zs_destroy_pool(pool);
256}
257
258static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
259 unsigned long *handle)
260{
261 *handle = zs_malloc(pool, size);
262 return *handle ? 0 : -1;
263}
264static void zs_zpool_free(void *pool, unsigned long handle)
265{
266 zs_free(pool, handle);
267}
268
269static int zs_zpool_shrink(void *pool, unsigned int pages,
270 unsigned int *reclaimed)
271{
272 return -EINVAL;
273}
274
275static void *zs_zpool_map(void *pool, unsigned long handle,
276 enum zpool_mapmode mm)
277{
278 enum zs_mapmode zs_mm;
279
280 switch (mm) {
281 case ZPOOL_MM_RO:
282 zs_mm = ZS_MM_RO;
283 break;
284 case ZPOOL_MM_WO:
285 zs_mm = ZS_MM_WO;
286 break;
287 case ZPOOL_MM_RW: /* fallthru */
288 default:
289 zs_mm = ZS_MM_RW;
290 break;
291 }
292
293 return zs_map_object(pool, handle, zs_mm);
294}
295static void zs_zpool_unmap(void *pool, unsigned long handle)
296{
297 zs_unmap_object(pool, handle);
298}
299
300static u64 zs_zpool_total_size(void *pool)
301{
302 return zs_get_total_size_bytes(pool);
303}
304
305static struct zpool_driver zs_zpool_driver = {
306 .type = "zsmalloc",
307 .owner = THIS_MODULE,
308 .create = zs_zpool_create,
309 .destroy = zs_zpool_destroy,
310 .malloc = zs_zpool_malloc,
311 .free = zs_zpool_free,
312 .shrink = zs_zpool_shrink,
313 .map = zs_zpool_map,
314 .unmap = zs_zpool_unmap,
315 .total_size = zs_zpool_total_size,
316};
317
318#endif /* CONFIG_ZPOOL */
243 319
244/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ 320/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
245static DEFINE_PER_CPU(struct mapping_area, zs_map_area); 321static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
@@ -690,7 +766,7 @@ static inline void __zs_cpu_down(struct mapping_area *area)
690static inline void *__zs_map_object(struct mapping_area *area, 766static inline void *__zs_map_object(struct mapping_area *area,
691 struct page *pages[2], int off, int size) 767 struct page *pages[2], int off, int size)
692{ 768{
693 BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages)); 769 BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
694 area->vm_addr = area->vm->addr; 770 area->vm_addr = area->vm->addr;
695 return area->vm_addr + off; 771 return area->vm_addr + off;
696} 772}
@@ -814,6 +890,10 @@ static void zs_exit(void)
814{ 890{
815 int cpu; 891 int cpu;
816 892
893#ifdef CONFIG_ZPOOL
894 zpool_unregister_driver(&zs_zpool_driver);
895#endif
896
817 cpu_notifier_register_begin(); 897 cpu_notifier_register_begin();
818 898
819 for_each_online_cpu(cpu) 899 for_each_online_cpu(cpu)
@@ -840,6 +920,10 @@ static int zs_init(void)
840 920
841 cpu_notifier_register_done(); 921 cpu_notifier_register_done();
842 922
923#ifdef CONFIG_ZPOOL
924 zpool_register_driver(&zs_zpool_driver);
925#endif
926
843 return 0; 927 return 0;
844fail: 928fail:
845 zs_exit(); 929 zs_exit();
diff --git a/mm/zswap.c b/mm/zswap.c
index 008388fe7b0f..032c21eeab2b 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -34,7 +34,7 @@
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/crypto.h> 35#include <linux/crypto.h>
36#include <linux/mempool.h> 36#include <linux/mempool.h>
37#include <linux/zbud.h> 37#include <linux/zpool.h>
38 38
39#include <linux/mm_types.h> 39#include <linux/mm_types.h>
40#include <linux/page-flags.h> 40#include <linux/page-flags.h>
@@ -45,8 +45,8 @@
45/********************************* 45/*********************************
46* statistics 46* statistics
47**********************************/ 47**********************************/
48/* Number of memory pages used by the compressed pool */ 48/* Total bytes used by the compressed storage */
49static u64 zswap_pool_pages; 49static u64 zswap_pool_total_size;
50/* The number of compressed pages currently stored in zswap */ 50/* The number of compressed pages currently stored in zswap */
51static atomic_t zswap_stored_pages = ATOMIC_INIT(0); 51static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
52 52
@@ -89,8 +89,13 @@ static unsigned int zswap_max_pool_percent = 20;
89module_param_named(max_pool_percent, 89module_param_named(max_pool_percent,
90 zswap_max_pool_percent, uint, 0644); 90 zswap_max_pool_percent, uint, 0644);
91 91
92/* zbud_pool is shared by all of zswap backend */ 92/* Compressed storage to use */
93static struct zbud_pool *zswap_pool; 93#define ZSWAP_ZPOOL_DEFAULT "zbud"
94static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
95module_param_named(zpool, zswap_zpool_type, charp, 0444);
96
97/* zpool is shared by all of zswap backend */
98static struct zpool *zswap_pool;
94 99
95/********************************* 100/*********************************
96* compression functions 101* compression functions
@@ -168,7 +173,7 @@ static void zswap_comp_exit(void)
168 * be held while changing the refcount. Since the lock must 173 * be held while changing the refcount. Since the lock must
169 * be held, there is no reason to also make refcount atomic. 174 * be held, there is no reason to also make refcount atomic.
170 * offset - the swap offset for the entry. Index into the red-black tree. 175 * offset - the swap offset for the entry. Index into the red-black tree.
171 * handle - zbud allocation handle that stores the compressed page data 176 * handle - zpool allocation handle that stores the compressed page data
172 * length - the length in bytes of the compressed page data. Needed during 177 * length - the length in bytes of the compressed page data. Needed during
173 * decompression 178 * decompression
174 */ 179 */
@@ -284,15 +289,15 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
284} 289}
285 290
286/* 291/*
 287 * Carries out the common pattern of freeing an entry's zbud allocation, 292 * Carries out the common pattern of freeing an entry's zpool allocation,
288 * freeing the entry itself, and decrementing the number of stored pages. 293 * freeing the entry itself, and decrementing the number of stored pages.
289 */ 294 */
290static void zswap_free_entry(struct zswap_entry *entry) 295static void zswap_free_entry(struct zswap_entry *entry)
291{ 296{
292 zbud_free(zswap_pool, entry->handle); 297 zpool_free(zswap_pool, entry->handle);
293 zswap_entry_cache_free(entry); 298 zswap_entry_cache_free(entry);
294 atomic_dec(&zswap_stored_pages); 299 atomic_dec(&zswap_stored_pages);
295 zswap_pool_pages = zbud_get_pool_size(zswap_pool); 300 zswap_pool_total_size = zpool_get_total_size(zswap_pool);
296} 301}
297 302
298/* caller must hold the tree lock */ 303/* caller must hold the tree lock */
@@ -409,7 +414,7 @@ cleanup:
409static bool zswap_is_full(void) 414static bool zswap_is_full(void)
410{ 415{
411 return totalram_pages * zswap_max_pool_percent / 100 < 416 return totalram_pages * zswap_max_pool_percent / 100 <
412 zswap_pool_pages; 417 DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
413} 418}
414 419
415/********************************* 420/*********************************
@@ -525,7 +530,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
525 * the swap cache, the compressed version stored by zswap can be 530 * the swap cache, the compressed version stored by zswap can be
526 * freed. 531 * freed.
527 */ 532 */
528static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) 533static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
529{ 534{
530 struct zswap_header *zhdr; 535 struct zswap_header *zhdr;
531 swp_entry_t swpentry; 536 swp_entry_t swpentry;
@@ -541,9 +546,9 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
541 }; 546 };
542 547
543 /* extract swpentry from data */ 548 /* extract swpentry from data */
544 zhdr = zbud_map(pool, handle); 549 zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
545 swpentry = zhdr->swpentry; /* here */ 550 swpentry = zhdr->swpentry; /* here */
546 zbud_unmap(pool, handle); 551 zpool_unmap_handle(pool, handle);
547 tree = zswap_trees[swp_type(swpentry)]; 552 tree = zswap_trees[swp_type(swpentry)];
548 offset = swp_offset(swpentry); 553 offset = swp_offset(swpentry);
549 554
@@ -573,13 +578,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
573 case ZSWAP_SWAPCACHE_NEW: /* page is locked */ 578 case ZSWAP_SWAPCACHE_NEW: /* page is locked */
574 /* decompress */ 579 /* decompress */
575 dlen = PAGE_SIZE; 580 dlen = PAGE_SIZE;
576 src = (u8 *)zbud_map(zswap_pool, entry->handle) + 581 src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
577 sizeof(struct zswap_header); 582 ZPOOL_MM_RO) + sizeof(struct zswap_header);
578 dst = kmap_atomic(page); 583 dst = kmap_atomic(page);
579 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, 584 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
580 entry->length, dst, &dlen); 585 entry->length, dst, &dlen);
581 kunmap_atomic(dst); 586 kunmap_atomic(dst);
582 zbud_unmap(zswap_pool, entry->handle); 587 zpool_unmap_handle(zswap_pool, entry->handle);
583 BUG_ON(ret); 588 BUG_ON(ret);
584 BUG_ON(dlen != PAGE_SIZE); 589 BUG_ON(dlen != PAGE_SIZE);
585 590
@@ -652,7 +657,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
652 /* reclaim space if needed */ 657 /* reclaim space if needed */
653 if (zswap_is_full()) { 658 if (zswap_is_full()) {
654 zswap_pool_limit_hit++; 659 zswap_pool_limit_hit++;
655 if (zbud_reclaim_page(zswap_pool, 8)) { 660 if (zpool_shrink(zswap_pool, 1, NULL)) {
656 zswap_reject_reclaim_fail++; 661 zswap_reject_reclaim_fail++;
657 ret = -ENOMEM; 662 ret = -ENOMEM;
658 goto reject; 663 goto reject;
@@ -679,7 +684,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
679 684
680 /* store */ 685 /* store */
681 len = dlen + sizeof(struct zswap_header); 686 len = dlen + sizeof(struct zswap_header);
682 ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, 687 ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
683 &handle); 688 &handle);
684 if (ret == -ENOSPC) { 689 if (ret == -ENOSPC) {
685 zswap_reject_compress_poor++; 690 zswap_reject_compress_poor++;
@@ -689,11 +694,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
689 zswap_reject_alloc_fail++; 694 zswap_reject_alloc_fail++;
690 goto freepage; 695 goto freepage;
691 } 696 }
692 zhdr = zbud_map(zswap_pool, handle); 697 zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW);
693 zhdr->swpentry = swp_entry(type, offset); 698 zhdr->swpentry = swp_entry(type, offset);
694 buf = (u8 *)(zhdr + 1); 699 buf = (u8 *)(zhdr + 1);
695 memcpy(buf, dst, dlen); 700 memcpy(buf, dst, dlen);
696 zbud_unmap(zswap_pool, handle); 701 zpool_unmap_handle(zswap_pool, handle);
697 put_cpu_var(zswap_dstmem); 702 put_cpu_var(zswap_dstmem);
698 703
699 /* populate entry */ 704 /* populate entry */
@@ -716,7 +721,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
716 721
717 /* update stats */ 722 /* update stats */
718 atomic_inc(&zswap_stored_pages); 723 atomic_inc(&zswap_stored_pages);
719 zswap_pool_pages = zbud_get_pool_size(zswap_pool); 724 zswap_pool_total_size = zpool_get_total_size(zswap_pool);
720 725
721 return 0; 726 return 0;
722 727
@@ -752,13 +757,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
752 757
753 /* decompress */ 758 /* decompress */
754 dlen = PAGE_SIZE; 759 dlen = PAGE_SIZE;
755 src = (u8 *)zbud_map(zswap_pool, entry->handle) + 760 src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
756 sizeof(struct zswap_header); 761 ZPOOL_MM_RO) + sizeof(struct zswap_header);
757 dst = kmap_atomic(page); 762 dst = kmap_atomic(page);
758 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, 763 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
759 dst, &dlen); 764 dst, &dlen);
760 kunmap_atomic(dst); 765 kunmap_atomic(dst);
761 zbud_unmap(zswap_pool, entry->handle); 766 zpool_unmap_handle(zswap_pool, entry->handle);
762 BUG_ON(ret); 767 BUG_ON(ret);
763 768
764 spin_lock(&tree->lock); 769 spin_lock(&tree->lock);
@@ -811,7 +816,7 @@ static void zswap_frontswap_invalidate_area(unsigned type)
811 zswap_trees[type] = NULL; 816 zswap_trees[type] = NULL;
812} 817}
813 818
814static struct zbud_ops zswap_zbud_ops = { 819static struct zpool_ops zswap_zpool_ops = {
815 .evict = zswap_writeback_entry 820 .evict = zswap_writeback_entry
816}; 821};
817 822
@@ -869,8 +874,8 @@ static int __init zswap_debugfs_init(void)
869 zswap_debugfs_root, &zswap_written_back_pages); 874 zswap_debugfs_root, &zswap_written_back_pages);
870 debugfs_create_u64("duplicate_entry", S_IRUGO, 875 debugfs_create_u64("duplicate_entry", S_IRUGO,
871 zswap_debugfs_root, &zswap_duplicate_entry); 876 zswap_debugfs_root, &zswap_duplicate_entry);
872 debugfs_create_u64("pool_pages", S_IRUGO, 877 debugfs_create_u64("pool_total_size", S_IRUGO,
873 zswap_debugfs_root, &zswap_pool_pages); 878 zswap_debugfs_root, &zswap_pool_total_size);
874 debugfs_create_atomic_t("stored_pages", S_IRUGO, 879 debugfs_create_atomic_t("stored_pages", S_IRUGO,
875 zswap_debugfs_root, &zswap_stored_pages); 880 zswap_debugfs_root, &zswap_stored_pages);
876 881
@@ -895,16 +900,26 @@ static void __exit zswap_debugfs_exit(void) { }
895**********************************/ 900**********************************/
896static int __init init_zswap(void) 901static int __init init_zswap(void)
897{ 902{
903 gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
904
898 if (!zswap_enabled) 905 if (!zswap_enabled)
899 return 0; 906 return 0;
900 907
901 pr_info("loading zswap\n"); 908 pr_info("loading zswap\n");
902 909
903 zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); 910 zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops);
911 if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
912 pr_info("%s zpool not available\n", zswap_zpool_type);
913 zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
914 zswap_pool = zpool_create_pool(zswap_zpool_type, gfp,
915 &zswap_zpool_ops);
916 }
904 if (!zswap_pool) { 917 if (!zswap_pool) {
905 pr_err("zbud pool creation failed\n"); 918 pr_err("%s zpool not available\n", zswap_zpool_type);
919 pr_err("zpool creation failed\n");
906 goto error; 920 goto error;
907 } 921 }
922 pr_info("using %s pool\n", zswap_zpool_type);
908 923
909 if (zswap_entry_cache_create()) { 924 if (zswap_entry_cache_create()) {
910 pr_err("entry cache creation failed\n"); 925 pr_err("entry cache creation failed\n");
@@ -928,7 +943,7 @@ pcpufail:
928compfail: 943compfail:
929 zswap_entry_cache_destory(); 944 zswap_entry_cache_destory();
930cachefail: 945cachefail:
931 zbud_destroy_pool(zswap_pool); 946 zpool_destroy_pool(zswap_pool);
932error: 947error:
933 return -ENOMEM; 948 return -ENOMEM;
934} 949}
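Note that the new zswap "zpool" parameter is registered with mode 0444, so the backend cannot be switched at runtime; it has to be chosen before zswap initializes, typically on the kernel command line, e.g. zswap.enabled=1 zswap.zpool=zsmalloc (assuming zsmalloc support is built in; zswap.enabled is zswap's pre-existing enable switch). The selected type can be read back from /sys/module/zswap/parameters/zpool, and if the requested pool cannot be created, init_zswap() above falls back to the "zbud" default.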