author     Linus Torvalds <torvalds@linux-foundation.org>   2014-04-07 19:38:06 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2014-04-07 19:38:06 -0400
commit     26c12d93348f0bda0756aff83f4867d9ae58a5a6
tree       65221f6837c66a9260c5c973e5fb908b10e0d504
parent     dc5ed40686a4da95881c35d913b60f867755cbe2
parent     fdc5813fbbd484a54c88477f91a78934cda8bb32
Merge branch 'akpm' (incoming from Andrew)
Merge second patch-bomb from Andrew Morton:
 - the rest of MM
 - zram updates
 - zswap updates
 - exit
 - procfs
 - exec
 - wait
 - crash dump
 - lib/idr
 - rapidio
 - adfs, affs, bfs, ufs
 - cris
 - Kconfig things
 - initramfs
 - small amount of IPC material
 - percpu enhancements
 - early ioremap support
 - various other misc things

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (156 commits)
  MAINTAINERS: update Intel C600 SAS driver maintainers
  fs/ufs: remove unused ufs_super_block_third pointer
  fs/ufs: remove unused ufs_super_block_second pointer
  fs/ufs: remove unused ufs_super_block_first pointer
  fs/ufs/super.c: add __init to init_inodecache()
  doc/kernel-parameters.txt: add early_ioremap_debug
  arm64: add early_ioremap support
  arm64: initialize pgprot info earlier in boot
  x86: use generic early_ioremap
  mm: create generic early_ioremap() support
  x86/mm: sparse warning fix for early_memremap
  lglock: map to spinlock when !CONFIG_SMP
  percpu: add preemption checks to __this_cpu ops
  vmstat: use raw_cpu_ops to avoid false positives on preemption checks
  slub: use raw_cpu_inc for incrementing statistics
  net: replace __this_cpu_inc in route.c with raw_cpu_inc
  modules: use raw_cpu_write for initialization of per cpu refcount.
  mm: use raw_cpu ops for determining current NUMA node
  percpu: add raw_cpu_ops
  slub: fix leak of 'name' in sysfs_slab_add
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |   4
-rw-r--r--  mm/Makefile           |   3
-rw-r--r--  mm/compaction.c       |  84
-rw-r--r--  mm/early_ioremap.c    | 245
-rw-r--r--  mm/filemap.c          |  86
-rw-r--r--  mm/huge_memory.c      |  21
-rw-r--r--  mm/hugetlb.c          |  14
-rw-r--r--  mm/internal.h         |  16
-rw-r--r--  mm/memblock.c         |  28
-rw-r--r--  mm/memcontrol.c       | 453
-rw-r--r--  mm/memory.c           | 147
-rw-r--r--  mm/mempolicy.c        |  46
-rw-r--r--  mm/mempool.c          |   4
-rw-r--r--  mm/mlock.c            |   2
-rw-r--r--  mm/mmap.c             |  55
-rw-r--r--  mm/mprotect.c         |  56
-rw-r--r--  mm/nommu.c            |  49
-rw-r--r--  mm/page-writeback.c   |   4
-rw-r--r--  mm/page_alloc.c       | 118
-rw-r--r--  mm/readahead.c        |  21
-rw-r--r--  mm/rmap.c             |  14
-rw-r--r--  mm/shmem.c            |   7
-rw-r--r--  mm/slab.c             |   8
-rw-r--r--  mm/slab.h             |  21
-rw-r--r--  mm/slab_common.c      | 250
-rw-r--r--  mm/slub.c             |  87
-rw-r--r--  mm/sparse.c           |   4
-rw-r--r--  mm/util.c             |   5
-rw-r--r--  mm/vmacache.c         | 112
-rw-r--r--  mm/vmalloc.c          |  10
-rw-r--r--  mm/vmscan.c           |  12
-rw-r--r--  mm/zswap.c            |  78
32 files changed, 1342 insertions, 722 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 2888024e0b0a..ebe5880c29d6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -216,6 +216,7 @@ config PAGEFLAGS_EXTENDED
216# 216#
217config SPLIT_PTLOCK_CPUS 217config SPLIT_PTLOCK_CPUS
218 int 218 int
219 default "999999" if !MMU
219 default "999999" if ARM && !CPU_CACHE_VIPT 220 default "999999" if ARM && !CPU_CACHE_VIPT
220 default "999999" if PARISC && !PA20 221 default "999999" if PARISC && !PA20
221 default "4" 222 default "4"
@@ -577,3 +578,6 @@ config PGTABLE_MAPPING
577 578
578 You can check speed with zsmalloc benchmark: 579 You can check speed with zsmalloc benchmark:
579 https://github.com/spartacus06/zsmapbench 580 https://github.com/spartacus06/zsmapbench
581
582config GENERIC_EARLY_IOREMAP
583 bool
diff --git a/mm/Makefile b/mm/Makefile
index cdd741519ee0..9e5aaf92197d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
16 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
17 util.o mmzone.o vmstat.o backing-dev.o \ 17 util.o mmzone.o vmstat.o backing-dev.o \
18 mm_init.o mmu_context.o percpu.o slab_common.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
19 compaction.o balloon_compaction.o \ 19 compaction.o balloon_compaction.o vmacache.o \
20 interval_tree.o list_lru.o workingset.o $(mmu-y) 20 interval_tree.o list_lru.o workingset.o $(mmu-y)
21 21
22obj-y += init-mm.o 22obj-y += init-mm.o
@@ -61,3 +61,4 @@ obj-$(CONFIG_CLEANCACHE) += cleancache.o
61obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o 61obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
62obj-$(CONFIG_ZBUD) += zbud.o 62obj-$(CONFIG_ZBUD) += zbud.o
63obj-$(CONFIG_ZSMALLOC) += zsmalloc.o 63obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
64obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
diff --git a/mm/compaction.c b/mm/compaction.c
index b6ab77160068..37f976287068 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -217,21 +217,12 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
217/* Returns true if the page is within a block suitable for migration to */ 217/* Returns true if the page is within a block suitable for migration to */
218static bool suitable_migration_target(struct page *page) 218static bool suitable_migration_target(struct page *page)
219{ 219{
220 int migratetype = get_pageblock_migratetype(page); 220 /* If the page is a large free page, then disallow migration */
221
222 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
223 if (migratetype == MIGRATE_RESERVE)
224 return false;
225
226 if (is_migrate_isolate(migratetype))
227 return false;
228
229 /* If the page is a large free page, then allow migration */
230 if (PageBuddy(page) && page_order(page) >= pageblock_order) 221 if (PageBuddy(page) && page_order(page) >= pageblock_order)
231 return true; 222 return false;
232 223
233 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ 224 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
234 if (migrate_async_suitable(migratetype)) 225 if (migrate_async_suitable(get_pageblock_migratetype(page)))
235 return true; 226 return true;
236 227
237 /* Otherwise skip the block */ 228 /* Otherwise skip the block */
@@ -253,6 +244,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
253 struct page *cursor, *valid_page = NULL; 244 struct page *cursor, *valid_page = NULL;
254 unsigned long flags; 245 unsigned long flags;
255 bool locked = false; 246 bool locked = false;
247 bool checked_pageblock = false;
256 248
257 cursor = pfn_to_page(blockpfn); 249 cursor = pfn_to_page(blockpfn);
258 250
@@ -284,8 +276,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
284 break; 276 break;
285 277
286 /* Recheck this is a suitable migration target under lock */ 278 /* Recheck this is a suitable migration target under lock */
287 if (!strict && !suitable_migration_target(page)) 279 if (!strict && !checked_pageblock) {
288 break; 280 /*
281 * We need to check suitability of pageblock only once
282 * and this isolate_freepages_block() is called with
283 * pageblock range, so just check once is sufficient.
284 */
285 checked_pageblock = true;
286 if (!suitable_migration_target(page))
287 break;
288 }
289 289
290 /* Recheck this is a buddy page under lock */ 290 /* Recheck this is a buddy page under lock */
291 if (!PageBuddy(page)) 291 if (!PageBuddy(page))
@@ -460,12 +460,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
460 unsigned long last_pageblock_nr = 0, pageblock_nr; 460 unsigned long last_pageblock_nr = 0, pageblock_nr;
461 unsigned long nr_scanned = 0, nr_isolated = 0; 461 unsigned long nr_scanned = 0, nr_isolated = 0;
462 struct list_head *migratelist = &cc->migratepages; 462 struct list_head *migratelist = &cc->migratepages;
463 isolate_mode_t mode = 0;
464 struct lruvec *lruvec; 463 struct lruvec *lruvec;
465 unsigned long flags; 464 unsigned long flags;
466 bool locked = false; 465 bool locked = false;
467 struct page *page = NULL, *valid_page = NULL; 466 struct page *page = NULL, *valid_page = NULL;
468 bool skipped_async_unsuitable = false; 467 bool skipped_async_unsuitable = false;
468 const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
469 (unevictable ? ISOLATE_UNEVICTABLE : 0);
469 470
470 /* 471 /*
471 * Ensure that there are not too many pages isolated from the LRU 472 * Ensure that there are not too many pages isolated from the LRU
@@ -487,7 +488,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
487 cond_resched(); 488 cond_resched();
488 for (; low_pfn < end_pfn; low_pfn++) { 489 for (; low_pfn < end_pfn; low_pfn++) {
489 /* give a chance to irqs before checking need_resched() */ 490 /* give a chance to irqs before checking need_resched() */
490 if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { 491 if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
491 if (should_release_lock(&zone->lru_lock)) { 492 if (should_release_lock(&zone->lru_lock)) {
492 spin_unlock_irqrestore(&zone->lru_lock, flags); 493 spin_unlock_irqrestore(&zone->lru_lock, flags);
493 locked = false; 494 locked = false;
@@ -526,8 +527,25 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
526 527
527 /* If isolation recently failed, do not retry */ 528 /* If isolation recently failed, do not retry */
528 pageblock_nr = low_pfn >> pageblock_order; 529 pageblock_nr = low_pfn >> pageblock_order;
529 if (!isolation_suitable(cc, page)) 530 if (last_pageblock_nr != pageblock_nr) {
530 goto next_pageblock; 531 int mt;
532
533 last_pageblock_nr = pageblock_nr;
534 if (!isolation_suitable(cc, page))
535 goto next_pageblock;
536
537 /*
538 * For async migration, also only scan in MOVABLE
539 * blocks. Async migration is optimistic to see if
540 * the minimum amount of work satisfies the allocation
541 */
542 mt = get_pageblock_migratetype(page);
543 if (!cc->sync && !migrate_async_suitable(mt)) {
544 cc->finished_update_migrate = true;
545 skipped_async_unsuitable = true;
546 goto next_pageblock;
547 }
548 }
531 549
532 /* 550 /*
533 * Skip if free. page_order cannot be used without zone->lock 551 * Skip if free. page_order cannot be used without zone->lock
@@ -537,18 +555,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
537 continue; 555 continue;
538 556
539 /* 557 /*
540 * For async migration, also only scan in MOVABLE blocks. Async
541 * migration is optimistic to see if the minimum amount of work
542 * satisfies the allocation
543 */
544 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
545 !migrate_async_suitable(get_pageblock_migratetype(page))) {
546 cc->finished_update_migrate = true;
547 skipped_async_unsuitable = true;
548 goto next_pageblock;
549 }
550
551 /*
552 * Check may be lockless but that's ok as we recheck later. 558 * Check may be lockless but that's ok as we recheck later.
553 * It's possible to migrate LRU pages and balloon pages 559 * It's possible to migrate LRU pages and balloon pages
554 * Skip any other type of page 560 * Skip any other type of page
@@ -557,11 +563,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
557 if (unlikely(balloon_page_movable(page))) { 563 if (unlikely(balloon_page_movable(page))) {
558 if (locked && balloon_page_isolate(page)) { 564 if (locked && balloon_page_isolate(page)) {
559 /* Successfully isolated */ 565 /* Successfully isolated */
560 cc->finished_update_migrate = true; 566 goto isolate_success;
561 list_add(&page->lru, migratelist);
562 cc->nr_migratepages++;
563 nr_isolated++;
564 goto check_compact_cluster;
565 } 567 }
566 } 568 }
567 continue; 569 continue;
@@ -607,12 +609,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
607 continue; 609 continue;
608 } 610 }
609 611
610 if (!cc->sync)
611 mode |= ISOLATE_ASYNC_MIGRATE;
612
613 if (unevictable)
614 mode |= ISOLATE_UNEVICTABLE;
615
616 lruvec = mem_cgroup_page_lruvec(page, zone); 612 lruvec = mem_cgroup_page_lruvec(page, zone);
617 613
618 /* Try isolate the page */ 614 /* Try isolate the page */
@@ -622,13 +618,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
622 VM_BUG_ON_PAGE(PageTransCompound(page), page); 618 VM_BUG_ON_PAGE(PageTransCompound(page), page);
623 619
624 /* Successfully isolated */ 620 /* Successfully isolated */
625 cc->finished_update_migrate = true;
626 del_page_from_lru_list(page, lruvec, page_lru(page)); 621 del_page_from_lru_list(page, lruvec, page_lru(page));
622
623isolate_success:
624 cc->finished_update_migrate = true;
627 list_add(&page->lru, migratelist); 625 list_add(&page->lru, migratelist);
628 cc->nr_migratepages++; 626 cc->nr_migratepages++;
629 nr_isolated++; 627 nr_isolated++;
630 628
631check_compact_cluster:
632 /* Avoid isolating too much */ 629 /* Avoid isolating too much */
633 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { 630 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
634 ++low_pfn; 631 ++low_pfn;
@@ -639,7 +636,6 @@ check_compact_cluster:
639 636
640next_pageblock: 637next_pageblock:
641 low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; 638 low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
642 last_pageblock_nr = pageblock_nr;
643 } 639 }
644 640
645 acct_isolated(zone, locked, cc); 641 acct_isolated(zone, locked, cc);
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
new file mode 100644
index 000000000000..e10ccd299d66
--- /dev/null
+++ b/mm/early_ioremap.c
@@ -0,0 +1,245 @@
1/*
2 * Provide common bits of early_ioremap() support for architectures needing
3 * temporary mappings during boot before ioremap() is available.
4 *
5 * This is mostly a direct copy of the x86 early_ioremap implementation.
6 *
7 * (C) Copyright 1995 1996, 2014 Linus Torvalds
8 *
9 */
10#include <linux/kernel.h>
11#include <linux/init.h>
12#include <linux/io.h>
13#include <linux/module.h>
14#include <linux/slab.h>
15#include <linux/mm.h>
16#include <linux/vmalloc.h>
17#include <asm/fixmap.h>
18
19#ifdef CONFIG_MMU
20static int early_ioremap_debug __initdata;
21
22static int __init early_ioremap_debug_setup(char *str)
23{
24 early_ioremap_debug = 1;
25
26 return 0;
27}
28early_param("early_ioremap_debug", early_ioremap_debug_setup);
29
30static int after_paging_init __initdata;
31
32void __init __weak early_ioremap_shutdown(void)
33{
34}
35
36void __init early_ioremap_reset(void)
37{
38 early_ioremap_shutdown();
39 after_paging_init = 1;
40}
41
42/*
43 * Generally, ioremap() is available after paging_init() has been called.
44 * Architectures wanting to allow early_ioremap after paging_init() can
45 * define __late_set_fixmap and __late_clear_fixmap to do the right thing.
46 */
47#ifndef __late_set_fixmap
48static inline void __init __late_set_fixmap(enum fixed_addresses idx,
49 phys_addr_t phys, pgprot_t prot)
50{
51 BUG();
52}
53#endif
54
55#ifndef __late_clear_fixmap
56static inline void __init __late_clear_fixmap(enum fixed_addresses idx)
57{
58 BUG();
59}
60#endif
61
62static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
63static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
64static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
65
66void __init early_ioremap_setup(void)
67{
68 int i;
69
70 for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
71 if (WARN_ON(prev_map[i]))
72 break;
73
74 for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
75 slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
76}
77
78static int __init check_early_ioremap_leak(void)
79{
80 int count = 0;
81 int i;
82
83 for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
84 if (prev_map[i])
85 count++;
86
87 if (WARN(count, KERN_WARNING
88 "Debug warning: early ioremap leak of %d areas detected.\n"
89 "please boot with early_ioremap_debug and report the dmesg.\n",
90 count))
91 return 1;
92 return 0;
93}
94late_initcall(check_early_ioremap_leak);
95
96static void __init __iomem *
97__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
98{
99 unsigned long offset;
100 resource_size_t last_addr;
101 unsigned int nrpages;
102 enum fixed_addresses idx;
103 int i, slot;
104
105 WARN_ON(system_state != SYSTEM_BOOTING);
106
107 slot = -1;
108 for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
109 if (!prev_map[i]) {
110 slot = i;
111 break;
112 }
113 }
114
115 if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n",
116 __func__, (u64)phys_addr, size))
117 return NULL;
118
119 /* Don't allow wraparound or zero size */
120 last_addr = phys_addr + size - 1;
121 if (WARN_ON(!size || last_addr < phys_addr))
122 return NULL;
123
124 prev_size[slot] = size;
125 /*
126 * Mappings have to be page-aligned
127 */
128 offset = phys_addr & ~PAGE_MASK;
129 phys_addr &= PAGE_MASK;
130 size = PAGE_ALIGN(last_addr + 1) - phys_addr;
131
132 /*
133 * Mappings have to fit in the FIX_BTMAP area.
134 */
135 nrpages = size >> PAGE_SHIFT;
136 if (WARN_ON(nrpages > NR_FIX_BTMAPS))
137 return NULL;
138
139 /*
140 * Ok, go for it..
141 */
142 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
143 while (nrpages > 0) {
144 if (after_paging_init)
145 __late_set_fixmap(idx, phys_addr, prot);
146 else
147 __early_set_fixmap(idx, phys_addr, prot);
148 phys_addr += PAGE_SIZE;
149 --idx;
150 --nrpages;
151 }
152 WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n",
153 __func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]);
154
155 prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
156 return prev_map[slot];
157}
158
159void __init early_iounmap(void __iomem *addr, unsigned long size)
160{
161 unsigned long virt_addr;
162 unsigned long offset;
163 unsigned int nrpages;
164 enum fixed_addresses idx;
165 int i, slot;
166
167 slot = -1;
168 for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
169 if (prev_map[i] == addr) {
170 slot = i;
171 break;
172 }
173 }
174
175 if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n",
176 addr, size))
177 return;
178
179 if (WARN(prev_size[slot] != size,
180 "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
181 addr, size, slot, prev_size[slot]))
182 return;
183
184 WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n",
185 addr, size, slot);
186
187 virt_addr = (unsigned long)addr;
188 if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
189 return;
190
191 offset = virt_addr & ~PAGE_MASK;
192 nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
193
194 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
195 while (nrpages > 0) {
196 if (after_paging_init)
197 __late_clear_fixmap(idx);
198 else
199 __early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR);
200 --idx;
201 --nrpages;
202 }
203 prev_map[slot] = NULL;
204}
205
206/* Remap an IO device */
207void __init __iomem *
208early_ioremap(resource_size_t phys_addr, unsigned long size)
209{
210 return __early_ioremap(phys_addr, size, FIXMAP_PAGE_IO);
211}
212
213/* Remap memory */
214void __init *
215early_memremap(resource_size_t phys_addr, unsigned long size)
216{
217 return (__force void *)__early_ioremap(phys_addr, size,
218 FIXMAP_PAGE_NORMAL);
219}
220#else /* CONFIG_MMU */
221
222void __init __iomem *
223early_ioremap(resource_size_t phys_addr, unsigned long size)
224{
225 return (__force void __iomem *)phys_addr;
226}
227
228/* Remap memory */
229void __init *
230early_memremap(resource_size_t phys_addr, unsigned long size)
231{
232 return (void *)phys_addr;
233}
234
235void __init early_iounmap(void __iomem *addr, unsigned long size)
236{
237}
238
239#endif /* CONFIG_MMU */
240
241
242void __init early_memunmap(void *addr, unsigned long size)
243{
244 early_iounmap((__force void __iomem *)addr, size);
245}
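
The header comment of the new file above explains that it provides temporary mappings during boot, before ioremap() is usable. As a purely illustrative sketch (not part of this patch), boot code could use the interface roughly as below; the firmware-table struct, its physical address and the probe function name are invented placeholders:

        /* Hypothetical firmware table layout and address, used only to show the API. */
        struct fw_table_header {
                u32 revision;
                u32 length;
        };

        #define FW_TABLE_PHYS   0x000f0000UL    /* made-up physical address */

        static int __init probe_fw_table(void)
        {
                struct fw_table_header *hdr;

                /* Borrow a fixmap slot; only valid while system_state == SYSTEM_BOOTING. */
                hdr = early_memremap(FW_TABLE_PHYS, sizeof(*hdr));
                if (!hdr)
                        return -ENOMEM;

                pr_info("fw table rev %u, len %u\n", hdr->revision, hdr->length);

                /* The slot must be released with the same size that was mapped. */
                early_memunmap(hdr, sizeof(*hdr));
                return 0;
        }
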
diff --git a/mm/filemap.c b/mm/filemap.c
index 21781f1fe52b..27ebc0c9571b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
33#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 33#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
34#include <linux/memcontrol.h> 34#include <linux/memcontrol.h>
35#include <linux/cleancache.h> 35#include <linux/cleancache.h>
36#include <linux/rmap.h>
36#include "internal.h" 37#include "internal.h"
37 38
38#define CREATE_TRACE_POINTS 39#define CREATE_TRACE_POINTS
@@ -562,7 +563,7 @@ static int __add_to_page_cache_locked(struct page *page,
562 VM_BUG_ON_PAGE(!PageLocked(page), page); 563 VM_BUG_ON_PAGE(!PageLocked(page), page);
563 VM_BUG_ON_PAGE(PageSwapBacked(page), page); 564 VM_BUG_ON_PAGE(PageSwapBacked(page), page);
564 565
565 error = mem_cgroup_cache_charge(page, current->mm, 566 error = mem_cgroup_charge_file(page, current->mm,
566 gfp_mask & GFP_RECLAIM_MASK); 567 gfp_mask & GFP_RECLAIM_MASK);
567 if (error) 568 if (error)
568 return error; 569 return error;
@@ -1952,11 +1953,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1952 struct inode *inode = mapping->host; 1953 struct inode *inode = mapping->host;
1953 pgoff_t offset = vmf->pgoff; 1954 pgoff_t offset = vmf->pgoff;
1954 struct page *page; 1955 struct page *page;
1955 pgoff_t size; 1956 loff_t size;
1956 int ret = 0; 1957 int ret = 0;
1957 1958
1958 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1959 size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
1959 if (offset >= size) 1960 if (offset >= size >> PAGE_CACHE_SHIFT)
1960 return VM_FAULT_SIGBUS; 1961 return VM_FAULT_SIGBUS;
1961 1962
1962 /* 1963 /*
@@ -2005,8 +2006,8 @@ retry_find:
2005 * Found the page and have a reference on it. 2006 * Found the page and have a reference on it.
2006 * We must recheck i_size under page lock. 2007 * We must recheck i_size under page lock.
2007 */ 2008 */
2008 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 2009 size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
2009 if (unlikely(offset >= size)) { 2010 if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) {
2010 unlock_page(page); 2011 unlock_page(page);
2011 page_cache_release(page); 2012 page_cache_release(page);
2012 return VM_FAULT_SIGBUS; 2013 return VM_FAULT_SIGBUS;
@@ -2064,6 +2065,78 @@ page_not_uptodate:
2064} 2065}
2065EXPORT_SYMBOL(filemap_fault); 2066EXPORT_SYMBOL(filemap_fault);
2066 2067
2068void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
2069{
2070 struct radix_tree_iter iter;
2071 void **slot;
2072 struct file *file = vma->vm_file;
2073 struct address_space *mapping = file->f_mapping;
2074 loff_t size;
2075 struct page *page;
2076 unsigned long address = (unsigned long) vmf->virtual_address;
2077 unsigned long addr;
2078 pte_t *pte;
2079
2080 rcu_read_lock();
2081 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) {
2082 if (iter.index > vmf->max_pgoff)
2083 break;
2084repeat:
2085 page = radix_tree_deref_slot(slot);
2086 if (unlikely(!page))
2087 goto next;
2088 if (radix_tree_exception(page)) {
2089 if (radix_tree_deref_retry(page))
2090 break;
2091 else
2092 goto next;
2093 }
2094
2095 if (!page_cache_get_speculative(page))
2096 goto repeat;
2097
2098 /* Has the page moved? */
2099 if (unlikely(page != *slot)) {
2100 page_cache_release(page);
2101 goto repeat;
2102 }
2103
2104 if (!PageUptodate(page) ||
2105 PageReadahead(page) ||
2106 PageHWPoison(page))
2107 goto skip;
2108 if (!trylock_page(page))
2109 goto skip;
2110
2111 if (page->mapping != mapping || !PageUptodate(page))
2112 goto unlock;
2113
2114 size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE);
2115 if (page->index >= size >> PAGE_CACHE_SHIFT)
2116 goto unlock;
2117
2118 pte = vmf->pte + page->index - vmf->pgoff;
2119 if (!pte_none(*pte))
2120 goto unlock;
2121
2122 if (file->f_ra.mmap_miss > 0)
2123 file->f_ra.mmap_miss--;
2124 addr = address + (page->index - vmf->pgoff) * PAGE_SIZE;
2125 do_set_pte(vma, addr, page, pte, false, false);
2126 unlock_page(page);
2127 goto next;
2128unlock:
2129 unlock_page(page);
2130skip:
2131 page_cache_release(page);
2132next:
2133 if (iter.index == vmf->max_pgoff)
2134 break;
2135 }
2136 rcu_read_unlock();
2137}
2138EXPORT_SYMBOL(filemap_map_pages);
2139
2067int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 2140int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
2068{ 2141{
2069 struct page *page = vmf->page; 2142 struct page *page = vmf->page;
@@ -2093,6 +2166,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
2093 2166
2094const struct vm_operations_struct generic_file_vm_ops = { 2167const struct vm_operations_struct generic_file_vm_ops = {
2095 .fault = filemap_fault, 2168 .fault = filemap_fault,
2169 .map_pages = filemap_map_pages,
2096 .page_mkwrite = filemap_page_mkwrite, 2170 .page_mkwrite = filemap_page_mkwrite,
2097 .remap_pages = generic_file_remap_pages, 2171 .remap_pages = generic_file_remap_pages,
2098}; 2172};
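
The hunk above wires the new filemap_map_pages() helper into generic_file_vm_ops as ->map_pages. For illustration only, a filesystem that already relies on the generic page-cache fault path could adopt the same hook as sketched below; "myfs" is a placeholder name, not something introduced by this series:

        static const struct vm_operations_struct myfs_file_vm_ops = {
                .fault          = filemap_fault,
                .map_pages      = filemap_map_pages,    /* new faultaround hook */
                .page_mkwrite   = filemap_page_mkwrite,
                .remap_pages    = generic_file_remap_pages,
        };

        static int myfs_file_mmap(struct file *file, struct vm_area_struct *vma)
        {
                file_accessed(file);
                vma->vm_ops = &myfs_file_vm_ops;
                return 0;
        }
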
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6ac89e9f82ef..64635f5278ff 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -827,7 +827,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
827 count_vm_event(THP_FAULT_FALLBACK); 827 count_vm_event(THP_FAULT_FALLBACK);
828 return VM_FAULT_FALLBACK; 828 return VM_FAULT_FALLBACK;
829 } 829 }
830 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { 830 if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) {
831 put_page(page); 831 put_page(page);
832 count_vm_event(THP_FAULT_FALLBACK); 832 count_vm_event(THP_FAULT_FALLBACK);
833 return VM_FAULT_FALLBACK; 833 return VM_FAULT_FALLBACK;
@@ -968,7 +968,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
968 __GFP_OTHER_NODE, 968 __GFP_OTHER_NODE,
969 vma, address, page_to_nid(page)); 969 vma, address, page_to_nid(page));
970 if (unlikely(!pages[i] || 970 if (unlikely(!pages[i] ||
971 mem_cgroup_newpage_charge(pages[i], mm, 971 mem_cgroup_charge_anon(pages[i], mm,
972 GFP_KERNEL))) { 972 GFP_KERNEL))) {
973 if (pages[i]) 973 if (pages[i])
974 put_page(pages[i]); 974 put_page(pages[i]);
@@ -1101,7 +1101,7 @@ alloc:
1101 goto out; 1101 goto out;
1102 } 1102 }
1103 1103
1104 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1104 if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) {
1105 put_page(new_page); 1105 put_page(new_page);
1106 if (page) { 1106 if (page) {
1107 split_huge_page(page); 1107 split_huge_page(page);
@@ -1891,17 +1891,22 @@ out:
1891int hugepage_madvise(struct vm_area_struct *vma, 1891int hugepage_madvise(struct vm_area_struct *vma,
1892 unsigned long *vm_flags, int advice) 1892 unsigned long *vm_flags, int advice)
1893{ 1893{
1894 struct mm_struct *mm = vma->vm_mm;
1895
1896 switch (advice) { 1894 switch (advice) {
1897 case MADV_HUGEPAGE: 1895 case MADV_HUGEPAGE:
1896#ifdef CONFIG_S390
1897 /*
1898 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
1899 * can't handle this properly after s390_enable_sie, so we simply
1900 * ignore the madvise to prevent qemu from causing a SIGSEGV.
1901 */
1902 if (mm_has_pgste(vma->vm_mm))
1903 return 0;
1904#endif
1898 /* 1905 /*
1899 * Be somewhat over-protective like KSM for now! 1906 * Be somewhat over-protective like KSM for now!
1900 */ 1907 */
1901 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) 1908 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
1902 return -EINVAL; 1909 return -EINVAL;
1903 if (mm->def_flags & VM_NOHUGEPAGE)
1904 return -EINVAL;
1905 *vm_flags &= ~VM_NOHUGEPAGE; 1910 *vm_flags &= ~VM_NOHUGEPAGE;
1906 *vm_flags |= VM_HUGEPAGE; 1911 *vm_flags |= VM_HUGEPAGE;
1907 /* 1912 /*
@@ -2354,7 +2359,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2354 if (!new_page) 2359 if (!new_page)
2355 return; 2360 return;
2356 2361
2357 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) 2362 if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)))
2358 return; 2363 return;
2359 2364
2360 /* 2365 /*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7c02b9dadfb0..dd30f22b35e0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,6 +13,7 @@
13#include <linux/nodemask.h> 13#include <linux/nodemask.h>
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/mempolicy.h> 15#include <linux/mempolicy.h>
16#include <linux/compiler.h>
16#include <linux/cpuset.h> 17#include <linux/cpuset.h>
17#include <linux/mutex.h> 18#include <linux/mutex.h>
18#include <linux/bootmem.h> 19#include <linux/bootmem.h>
@@ -1535,6 +1536,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
1535 while (min_count < persistent_huge_pages(h)) { 1536 while (min_count < persistent_huge_pages(h)) {
1536 if (!free_pool_huge_page(h, nodes_allowed, 0)) 1537 if (!free_pool_huge_page(h, nodes_allowed, 0))
1537 break; 1538 break;
1539 cond_resched_lock(&hugetlb_lock);
1538 } 1540 }
1539 while (count < persistent_huge_pages(h)) { 1541 while (count < persistent_huge_pages(h)) {
1540 if (!adjust_pool_surplus(h, nodes_allowed, 1)) 1542 if (!adjust_pool_surplus(h, nodes_allowed, 1))
@@ -2690,7 +2692,8 @@ retry_avoidcopy:
2690 BUG_ON(huge_pte_none(pte)); 2692 BUG_ON(huge_pte_none(pte));
2691 spin_lock(ptl); 2693 spin_lock(ptl);
2692 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2694 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2693 if (likely(pte_same(huge_ptep_get(ptep), pte))) 2695 if (likely(ptep &&
2696 pte_same(huge_ptep_get(ptep), pte)))
2694 goto retry_avoidcopy; 2697 goto retry_avoidcopy;
2695 /* 2698 /*
2696 * race occurs while re-acquiring page table 2699 * race occurs while re-acquiring page table
@@ -2734,7 +2737,7 @@ retry_avoidcopy:
2734 */ 2737 */
2735 spin_lock(ptl); 2738 spin_lock(ptl);
2736 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2739 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2737 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2740 if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
2738 ClearPagePrivate(new_page); 2741 ClearPagePrivate(new_page);
2739 2742
2740 /* Break COW */ 2743 /* Break COW */
@@ -2896,8 +2899,7 @@ retry:
2896 if (anon_rmap) { 2899 if (anon_rmap) {
2897 ClearPagePrivate(page); 2900 ClearPagePrivate(page);
2898 hugepage_add_new_anon_rmap(page, vma, address); 2901 hugepage_add_new_anon_rmap(page, vma, address);
2899 } 2902 } else
2900 else
2901 page_dup_rmap(page); 2903 page_dup_rmap(page);
2902 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 2904 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
2903 && (vma->vm_flags & VM_SHARED))); 2905 && (vma->vm_flags & VM_SHARED)));
@@ -3185,6 +3187,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3185 BUG_ON(address >= end); 3187 BUG_ON(address >= end);
3186 flush_cache_range(vma, address, end); 3188 flush_cache_range(vma, address, end);
3187 3189
3190 mmu_notifier_invalidate_range_start(mm, start, end);
3188 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); 3191 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
3189 for (; address < end; address += huge_page_size(h)) { 3192 for (; address < end; address += huge_page_size(h)) {
3190 spinlock_t *ptl; 3193 spinlock_t *ptl;
@@ -3214,6 +3217,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3214 */ 3217 */
3215 flush_tlb_range(vma, start, end); 3218 flush_tlb_range(vma, start, end);
3216 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 3219 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
3220 mmu_notifier_invalidate_range_end(mm, start, end);
3217 3221
3218 return pages << h->order; 3222 return pages << h->order;
3219} 3223}
@@ -3518,7 +3522,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
3518#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */ 3522#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
3519 3523
3520/* Can be overriden by architectures */ 3524/* Can be overriden by architectures */
3521__attribute__((weak)) struct page * 3525struct page * __weak
3522follow_huge_pud(struct mm_struct *mm, unsigned long address, 3526follow_huge_pud(struct mm_struct *mm, unsigned long address,
3523 pud_t *pud, int write) 3527 pud_t *pud, int write)
3524{ 3528{
diff --git a/mm/internal.h b/mm/internal.h
index 29e1e761f9eb..07b67361a40a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,6 +11,7 @@
11#ifndef __MM_INTERNAL_H 11#ifndef __MM_INTERNAL_H
12#define __MM_INTERNAL_H 12#define __MM_INTERNAL_H
13 13
14#include <linux/fs.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15 16
16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 17void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
@@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v)
21 atomic_set(&page->_count, v); 22 atomic_set(&page->_count, v);
22} 23}
23 24
25extern int __do_page_cache_readahead(struct address_space *mapping,
26 struct file *filp, pgoff_t offset, unsigned long nr_to_read,
27 unsigned long lookahead_size);
28
29/*
30 * Submit IO for the read-ahead request in file_ra_state.
31 */
32static inline unsigned long ra_submit(struct file_ra_state *ra,
33 struct address_space *mapping, struct file *filp)
34{
35 return __do_page_cache_readahead(mapping, filp,
36 ra->start, ra->size, ra->async_size);
37}
38
24/* 39/*
25 * Turn a non-refcounted page (->_count == 0) into refcounted with 40 * Turn a non-refcounted page (->_count == 0) into refcounted with
26 * a count of one. 41 * a count of one.
@@ -370,5 +385,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
370#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 385#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
371#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 386#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
372#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ 387#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
388#define ALLOC_FAIR 0x100 /* fair zone allocation */
373 389
374#endif /* __MM_INTERNAL_H */ 390#endif /* __MM_INTERNAL_H */
diff --git a/mm/memblock.c b/mm/memblock.c
index 7fe5354e7552..e9d6ca9a01a9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1253,7 +1253,7 @@ phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
1253 pages += end_pfn - start_pfn; 1253 pages += end_pfn - start_pfn;
1254 } 1254 }
1255 1255
1256 return (phys_addr_t)pages << PAGE_SHIFT; 1256 return PFN_PHYS(pages);
1257} 1257}
1258 1258
1259/* lowest address */ 1259/* lowest address */
@@ -1271,16 +1271,14 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void)
1271 1271
1272void __init memblock_enforce_memory_limit(phys_addr_t limit) 1272void __init memblock_enforce_memory_limit(phys_addr_t limit)
1273{ 1273{
1274 unsigned long i;
1275 phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; 1274 phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
1275 struct memblock_region *r;
1276 1276
1277 if (!limit) 1277 if (!limit)
1278 return; 1278 return;
1279 1279
1280 /* find out max address */ 1280 /* find out max address */
1281 for (i = 0; i < memblock.memory.cnt; i++) { 1281 for_each_memblock(memory, r) {
1282 struct memblock_region *r = &memblock.memory.regions[i];
1283
1284 if (limit <= r->size) { 1282 if (limit <= r->size) {
1285 max_addr = r->base + limit; 1283 max_addr = r->base + limit;
1286 break; 1284 break;
@@ -1326,7 +1324,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
1326 unsigned long *start_pfn, unsigned long *end_pfn) 1324 unsigned long *start_pfn, unsigned long *end_pfn)
1327{ 1325{
1328 struct memblock_type *type = &memblock.memory; 1326 struct memblock_type *type = &memblock.memory;
1329 int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT); 1327 int mid = memblock_search(type, PFN_PHYS(pfn));
1330 1328
1331 if (mid == -1) 1329 if (mid == -1)
1332 return -1; 1330 return -1;
@@ -1379,13 +1377,12 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si
1379 1377
1380void __init_memblock memblock_trim_memory(phys_addr_t align) 1378void __init_memblock memblock_trim_memory(phys_addr_t align)
1381{ 1379{
1382 int i;
1383 phys_addr_t start, end, orig_start, orig_end; 1380 phys_addr_t start, end, orig_start, orig_end;
1384 struct memblock_type *mem = &memblock.memory; 1381 struct memblock_region *r;
1385 1382
1386 for (i = 0; i < mem->cnt; i++) { 1383 for_each_memblock(memory, r) {
1387 orig_start = mem->regions[i].base; 1384 orig_start = r->base;
1388 orig_end = mem->regions[i].base + mem->regions[i].size; 1385 orig_end = r->base + r->size;
1389 start = round_up(orig_start, align); 1386 start = round_up(orig_start, align);
1390 end = round_down(orig_end, align); 1387 end = round_down(orig_end, align);
1391 1388
@@ -1393,11 +1390,12 @@ void __init_memblock memblock_trim_memory(phys_addr_t align)
1393 continue; 1390 continue;
1394 1391
1395 if (start < end) { 1392 if (start < end) {
1396 mem->regions[i].base = start; 1393 r->base = start;
1397 mem->regions[i].size = end - start; 1394 r->size = end - start;
1398 } else { 1395 } else {
1399 memblock_remove_region(mem, i); 1396 memblock_remove_region(&memblock.memory,
1400 i--; 1397 r - memblock.memory.regions);
1398 r--;
1401 } 1399 }
1402 } 1400 }
1403} 1401}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dcc8153a1681..29501f040568 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -921,8 +921,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
921 struct page *page, 921 struct page *page,
922 bool anon, int nr_pages) 922 bool anon, int nr_pages)
923{ 923{
924 preempt_disable();
925
926 /* 924 /*
927 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is 925 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
928 * counted as CACHE even if it's on ANON LRU. 926 * counted as CACHE even if it's on ANON LRU.
@@ -947,8 +945,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
947 } 945 }
948 946
949 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 947 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
950
951 preempt_enable();
952} 948}
953 949
954unsigned long 950unsigned long
@@ -1075,22 +1071,15 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1075 return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); 1071 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
1076} 1072}
1077 1073
1078struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 1074static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
1079{ 1075{
1080 struct mem_cgroup *memcg = NULL; 1076 struct mem_cgroup *memcg = NULL;
1081 1077
1082 if (!mm)
1083 return NULL;
1084 /*
1085 * Because we have no locks, mm->owner's may be being moved to other
1086 * cgroup. We use css_tryget() here even if this looks
1087 * pessimistic (rather than adding locks here).
1088 */
1089 rcu_read_lock(); 1078 rcu_read_lock();
1090 do { 1079 do {
1091 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1080 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1092 if (unlikely(!memcg)) 1081 if (unlikely(!memcg))
1093 break; 1082 memcg = root_mem_cgroup;
1094 } while (!css_tryget(&memcg->css)); 1083 } while (!css_tryget(&memcg->css));
1095 rcu_read_unlock(); 1084 rcu_read_unlock();
1096 return memcg; 1085 return memcg;
@@ -1486,7 +1475,7 @@ bool task_in_mem_cgroup(struct task_struct *task,
1486 1475
1487 p = find_lock_task_mm(task); 1476 p = find_lock_task_mm(task);
1488 if (p) { 1477 if (p) {
1489 curr = try_get_mem_cgroup_from_mm(p->mm); 1478 curr = get_mem_cgroup_from_mm(p->mm);
1490 task_unlock(p); 1479 task_unlock(p);
1491 } else { 1480 } else {
1492 /* 1481 /*
@@ -1500,8 +1489,6 @@ bool task_in_mem_cgroup(struct task_struct *task,
1500 css_get(&curr->css); 1489 css_get(&curr->css);
1501 rcu_read_unlock(); 1490 rcu_read_unlock();
1502 } 1491 }
1503 if (!curr)
1504 return false;
1505 /* 1492 /*
1506 * We should check use_hierarchy of "memcg" not "curr". Because checking 1493 * We should check use_hierarchy of "memcg" not "curr". Because checking
1507 * use_hierarchy of "curr" here make this function true if hierarchy is 1494 * use_hierarchy of "curr" here make this function true if hierarchy is
@@ -2588,7 +2575,7 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2588} 2575}
2589 2576
2590 2577
2591/* See __mem_cgroup_try_charge() for details */ 2578/* See mem_cgroup_try_charge() for details */
2592enum { 2579enum {
2593 CHARGE_OK, /* success */ 2580 CHARGE_OK, /* success */
2594 CHARGE_RETRY, /* need to retry but retry is not bad */ 2581 CHARGE_RETRY, /* need to retry but retry is not bad */
@@ -2661,45 +2648,34 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2661 return CHARGE_NOMEM; 2648 return CHARGE_NOMEM;
2662} 2649}
2663 2650
2664/* 2651/**
2665 * __mem_cgroup_try_charge() does 2652 * mem_cgroup_try_charge - try charging a memcg
2666 * 1. detect memcg to be charged against from passed *mm and *ptr, 2653 * @memcg: memcg to charge
2667 * 2. update res_counter 2654 * @nr_pages: number of pages to charge
2668 * 3. call memory reclaim if necessary. 2655 * @oom: trigger OOM if reclaim fails
2669 *
2670 * In some special case, if the task is fatal, fatal_signal_pending() or
2671 * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
2672 * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
2673 * as possible without any hazards. 2: all pages should have a valid
2674 * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
2675 * pointer, that is treated as a charge to root_mem_cgroup.
2676 *
2677 * So __mem_cgroup_try_charge() will return
2678 * 0 ... on success, filling *ptr with a valid memcg pointer.
2679 * -ENOMEM ... charge failure because of resource limits.
2680 * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup.
2681 * 2656 *
2682 * Unlike the exported interface, an "oom" parameter is added. if oom==true, 2657 * Returns 0 if @memcg was charged successfully, -EINTR if the charge
2683 * the oom-killer can be invoked. 2658 * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
2684 */ 2659 */
2685static int __mem_cgroup_try_charge(struct mm_struct *mm, 2660static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
2686 gfp_t gfp_mask, 2661 gfp_t gfp_mask,
2687 unsigned int nr_pages, 2662 unsigned int nr_pages,
2688 struct mem_cgroup **ptr, 2663 bool oom)
2689 bool oom)
2690{ 2664{
2691 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2665 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2692 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2666 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2693 struct mem_cgroup *memcg = NULL;
2694 int ret; 2667 int ret;
2695 2668
2669 if (mem_cgroup_is_root(memcg))
2670 goto done;
2696 /* 2671 /*
2697 * Unlike gloval-vm's OOM-kill, we're not in memory shortage 2672 * Unlike in global OOM situations, memcg is not in a physical
2698 * in system level. So, allow to go ahead dying process in addition to 2673 * memory shortage. Allow dying and OOM-killed tasks to
2699 * MEMDIE process. 2674 * bypass the last charges so that they can exit quickly and
2675 * free their memory.
2700 */ 2676 */
2701 if (unlikely(test_thread_flag(TIF_MEMDIE) 2677 if (unlikely(test_thread_flag(TIF_MEMDIE) ||
2702 || fatal_signal_pending(current))) 2678 fatal_signal_pending(current)))
2703 goto bypass; 2679 goto bypass;
2704 2680
2705 if (unlikely(task_in_memcg_oom(current))) 2681 if (unlikely(task_in_memcg_oom(current)))
@@ -2707,73 +2683,16 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2707 2683
2708 if (gfp_mask & __GFP_NOFAIL) 2684 if (gfp_mask & __GFP_NOFAIL)
2709 oom = false; 2685 oom = false;
2710
2711 /*
2712 * We always charge the cgroup the mm_struct belongs to.
2713 * The mm_struct's mem_cgroup changes on task migration if the
2714 * thread group leader migrates. It's possible that mm is not
2715 * set, if so charge the root memcg (happens for pagecache usage).
2716 */
2717 if (!*ptr && !mm)
2718 *ptr = root_mem_cgroup;
2719again: 2686again:
2720 if (*ptr) { /* css should be a valid one */ 2687 if (consume_stock(memcg, nr_pages))
2721 memcg = *ptr; 2688 goto done;
2722 if (mem_cgroup_is_root(memcg))
2723 goto done;
2724 if (consume_stock(memcg, nr_pages))
2725 goto done;
2726 css_get(&memcg->css);
2727 } else {
2728 struct task_struct *p;
2729
2730 rcu_read_lock();
2731 p = rcu_dereference(mm->owner);
2732 /*
2733 * Because we don't have task_lock(), "p" can exit.
2734 * In that case, "memcg" can point to root or p can be NULL with
2735 * race with swapoff. Then, we have small risk of mis-accouning.
2736 * But such kind of mis-account by race always happens because
2737 * we don't have cgroup_mutex(). It's overkill and we allo that
2738 * small race, here.
2739 * (*) swapoff at el will charge against mm-struct not against
2740 * task-struct. So, mm->owner can be NULL.
2741 */
2742 memcg = mem_cgroup_from_task(p);
2743 if (!memcg)
2744 memcg = root_mem_cgroup;
2745 if (mem_cgroup_is_root(memcg)) {
2746 rcu_read_unlock();
2747 goto done;
2748 }
2749 if (consume_stock(memcg, nr_pages)) {
2750 /*
2751 * It seems dagerous to access memcg without css_get().
2752 * But considering how consume_stok works, it's not
2753 * necessary. If consume_stock success, some charges
2754 * from this memcg are cached on this cpu. So, we
2755 * don't need to call css_get()/css_tryget() before
2756 * calling consume_stock().
2757 */
2758 rcu_read_unlock();
2759 goto done;
2760 }
2761 /* after here, we may be blocked. we need to get refcnt */
2762 if (!css_tryget(&memcg->css)) {
2763 rcu_read_unlock();
2764 goto again;
2765 }
2766 rcu_read_unlock();
2767 }
2768 2689
2769 do { 2690 do {
2770 bool invoke_oom = oom && !nr_oom_retries; 2691 bool invoke_oom = oom && !nr_oom_retries;
2771 2692
2772 /* If killed, bypass charge */ 2693 /* If killed, bypass charge */
2773 if (fatal_signal_pending(current)) { 2694 if (fatal_signal_pending(current))
2774 css_put(&memcg->css);
2775 goto bypass; 2695 goto bypass;
2776 }
2777 2696
2778 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, 2697 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
2779 nr_pages, invoke_oom); 2698 nr_pages, invoke_oom);
@@ -2782,17 +2701,12 @@ again:
2782 break; 2701 break;
2783 case CHARGE_RETRY: /* not in OOM situation but retry */ 2702 case CHARGE_RETRY: /* not in OOM situation but retry */
2784 batch = nr_pages; 2703 batch = nr_pages;
2785 css_put(&memcg->css);
2786 memcg = NULL;
2787 goto again; 2704 goto again;
2788 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2705 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2789 css_put(&memcg->css);
2790 goto nomem; 2706 goto nomem;
2791 case CHARGE_NOMEM: /* OOM routine works */ 2707 case CHARGE_NOMEM: /* OOM routine works */
2792 if (!oom || invoke_oom) { 2708 if (!oom || invoke_oom)
2793 css_put(&memcg->css);
2794 goto nomem; 2709 goto nomem;
2795 }
2796 nr_oom_retries--; 2710 nr_oom_retries--;
2797 break; 2711 break;
2798 } 2712 }
@@ -2800,20 +2714,44 @@ again:
2800 2714
2801 if (batch > nr_pages) 2715 if (batch > nr_pages)
2802 refill_stock(memcg, batch - nr_pages); 2716 refill_stock(memcg, batch - nr_pages);
2803 css_put(&memcg->css);
2804done: 2717done:
2805 *ptr = memcg;
2806 return 0; 2718 return 0;
2807nomem: 2719nomem:
2808 if (!(gfp_mask & __GFP_NOFAIL)) { 2720 if (!(gfp_mask & __GFP_NOFAIL))
2809 *ptr = NULL;
2810 return -ENOMEM; 2721 return -ENOMEM;
2811 }
2812bypass: 2722bypass:
2813 *ptr = root_mem_cgroup;
2814 return -EINTR; 2723 return -EINTR;
2815} 2724}
2816 2725
2726/**
2727 * mem_cgroup_try_charge_mm - try charging a mm
2728 * @mm: mm_struct to charge
2729 * @nr_pages: number of pages to charge
2730 * @oom: trigger OOM if reclaim fails
2731 *
2732 * Returns the charged mem_cgroup associated with the given mm_struct or
2733 * NULL the charge failed.
2734 */
2735static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
2736 gfp_t gfp_mask,
2737 unsigned int nr_pages,
2738 bool oom)
2739
2740{
2741 struct mem_cgroup *memcg;
2742 int ret;
2743
2744 memcg = get_mem_cgroup_from_mm(mm);
2745 ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
2746 css_put(&memcg->css);
2747 if (ret == -EINTR)
2748 memcg = root_mem_cgroup;
2749 else if (ret)
2750 memcg = NULL;
2751
2752 return memcg;
2753}
2754
2817/* 2755/*
2818 * Somemtimes we have to undo a charge we got by try_charge(). 2756 * Somemtimes we have to undo a charge we got by try_charge().
2819 * This function is for that and do uncharge, put css's refcnt. 2757 * This function is for that and do uncharge, put css's refcnt.
@@ -3009,20 +2947,17 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
3009static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) 2947static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
3010{ 2948{
3011 struct res_counter *fail_res; 2949 struct res_counter *fail_res;
3012 struct mem_cgroup *_memcg;
3013 int ret = 0; 2950 int ret = 0;
3014 2951
3015 ret = res_counter_charge(&memcg->kmem, size, &fail_res); 2952 ret = res_counter_charge(&memcg->kmem, size, &fail_res);
3016 if (ret) 2953 if (ret)
3017 return ret; 2954 return ret;
3018 2955
3019 _memcg = memcg; 2956 ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT,
3020 ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, 2957 oom_gfp_allowed(gfp));
3021 &_memcg, oom_gfp_allowed(gfp));
3022
3023 if (ret == -EINTR) { 2958 if (ret == -EINTR) {
3024 /* 2959 /*
3025 * __mem_cgroup_try_charge() chosed to bypass to root due to 2960 * mem_cgroup_try_charge() chosed to bypass to root due to
3026 * OOM kill or fatal signal. Since our only options are to 2961 * OOM kill or fatal signal. Since our only options are to
3027 * either fail the allocation or charge it to this cgroup, do 2962 * either fail the allocation or charge it to this cgroup, do
3028 * it as a temporary condition. But we can't fail. From a 2963 * it as a temporary condition. But we can't fail. From a
@@ -3032,7 +2967,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
3032 * 2967 *
3033 * This condition will only trigger if the task entered 2968 * This condition will only trigger if the task entered
3034 * memcg_charge_kmem in a sane state, but was OOM-killed during 2969 * memcg_charge_kmem in a sane state, but was OOM-killed during
3035 * __mem_cgroup_try_charge() above. Tasks that were already 2970 * mem_cgroup_try_charge() above. Tasks that were already
3036 * dying when the allocation triggers should have been already 2971 * dying when the allocation triggers should have been already
3037 * directed to the root cgroup in memcontrol.h 2972 * directed to the root cgroup in memcontrol.h
3038 */ 2973 */
@@ -3159,6 +3094,29 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3159 return 0; 3094 return 0;
3160} 3095}
3161 3096
3097char *memcg_create_cache_name(struct mem_cgroup *memcg,
3098 struct kmem_cache *root_cache)
3099{
3100 static char *buf = NULL;
3101
3102 /*
3103 * We need a mutex here to protect the shared buffer. Since this is
3104 * expected to be called only on cache creation, we can employ the
3105 * slab_mutex for that purpose.
3106 */
3107 lockdep_assert_held(&slab_mutex);
3108
3109 if (!buf) {
3110 buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
3111 if (!buf)
3112 return NULL;
3113 }
3114
3115 cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1);
3116 return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
3117 memcg_cache_id(memcg), buf);
3118}
3119
3162int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, 3120int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
3163 struct kmem_cache *root_cache) 3121 struct kmem_cache *root_cache)
3164{ 3122{
@@ -3182,6 +3140,7 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
3182 s->memcg_params->root_cache = root_cache; 3140 s->memcg_params->root_cache = root_cache;
3183 INIT_WORK(&s->memcg_params->destroy, 3141 INIT_WORK(&s->memcg_params->destroy,
3184 kmem_cache_destroy_work_func); 3142 kmem_cache_destroy_work_func);
3143 css_get(&memcg->css);
3185 } else 3144 } else
3186 s->memcg_params->is_root_cache = true; 3145 s->memcg_params->is_root_cache = true;
3187 3146
@@ -3190,6 +3149,10 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
3190 3149
3191void memcg_free_cache_params(struct kmem_cache *s) 3150void memcg_free_cache_params(struct kmem_cache *s)
3192{ 3151{
3152 if (!s->memcg_params)
3153 return;
3154 if (!s->memcg_params->is_root_cache)
3155 css_put(&s->memcg_params->memcg->css);
3193 kfree(s->memcg_params); 3156 kfree(s->memcg_params);
3194} 3157}
3195 3158
@@ -3212,9 +3175,6 @@ void memcg_register_cache(struct kmem_cache *s)
3212 memcg = s->memcg_params->memcg; 3175 memcg = s->memcg_params->memcg;
3213 id = memcg_cache_id(memcg); 3176 id = memcg_cache_id(memcg);
3214 3177
3215 css_get(&memcg->css);
3216
3217
3218 /* 3178 /*
3219 * Since readers won't lock (see cache_from_memcg_idx()), we need a 3179 * Since readers won't lock (see cache_from_memcg_idx()), we need a
3220 * barrier here to ensure nobody will see the kmem_cache partially 3180 * barrier here to ensure nobody will see the kmem_cache partially
@@ -3263,10 +3223,8 @@ void memcg_unregister_cache(struct kmem_cache *s)
3263 * after removing it from the memcg_slab_caches list, otherwise we can 3223 * after removing it from the memcg_slab_caches list, otherwise we can
3264 * fail to convert memcg_params_to_cache() while traversing the list. 3224 * fail to convert memcg_params_to_cache() while traversing the list.
3265 */ 3225 */
3266 VM_BUG_ON(!root->memcg_params->memcg_caches[id]); 3226 VM_BUG_ON(root->memcg_params->memcg_caches[id] != s);
3267 root->memcg_params->memcg_caches[id] = NULL; 3227 root->memcg_params->memcg_caches[id] = NULL;
3268
3269 css_put(&memcg->css);
3270} 3228}
3271 3229
3272/* 3230/*
@@ -3363,55 +3321,10 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3363 schedule_work(&cachep->memcg_params->destroy); 3321 schedule_work(&cachep->memcg_params->destroy);
3364} 3322}
3365 3323
3366static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, 3324int __kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3367 struct kmem_cache *s)
3368{
3369 struct kmem_cache *new = NULL;
3370 static char *tmp_path = NULL, *tmp_name = NULL;
3371 static DEFINE_MUTEX(mutex); /* protects tmp_name */
3372
3373 BUG_ON(!memcg_can_account_kmem(memcg));
3374
3375 mutex_lock(&mutex);
3376 /*
3377 * kmem_cache_create_memcg duplicates the given name and
3378 * cgroup_name for this name requires RCU context.
3379 * This static temporary buffer is used to prevent from
3380 * pointless shortliving allocation.
3381 */
3382 if (!tmp_path || !tmp_name) {
3383 if (!tmp_path)
3384 tmp_path = kmalloc(PATH_MAX, GFP_KERNEL);
3385 if (!tmp_name)
3386 tmp_name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
3387 if (!tmp_path || !tmp_name)
3388 goto out;
3389 }
3390
3391 cgroup_name(memcg->css.cgroup, tmp_name, NAME_MAX + 1);
3392 snprintf(tmp_path, PATH_MAX, "%s(%d:%s)", s->name,
3393 memcg_cache_id(memcg), tmp_name);
3394
3395 new = kmem_cache_create_memcg(memcg, tmp_path, s->object_size, s->align,
3396 (s->flags & ~SLAB_PANIC), s->ctor, s);
3397 if (new)
3398 new->allocflags |= __GFP_KMEMCG;
3399 else
3400 new = s;
3401out:
3402 mutex_unlock(&mutex);
3403 return new;
3404}
3405
3406void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3407{ 3325{
3408 struct kmem_cache *c; 3326 struct kmem_cache *c;
3409 int i; 3327 int i, failed = 0;
3410
3411 if (!s->memcg_params)
3412 return;
3413 if (!s->memcg_params->is_root_cache)
3414 return;
3415 3328
3416 /* 3329 /*
3417 * If the cache is being destroyed, we trust that there is no one else 3330 * If the cache is being destroyed, we trust that there is no one else
@@ -3445,16 +3358,14 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3445 c->memcg_params->dead = false; 3358 c->memcg_params->dead = false;
3446 cancel_work_sync(&c->memcg_params->destroy); 3359 cancel_work_sync(&c->memcg_params->destroy);
3447 kmem_cache_destroy(c); 3360 kmem_cache_destroy(c);
3361
3362 if (cache_from_memcg_idx(s, i))
3363 failed++;
3448 } 3364 }
3449 mutex_unlock(&activate_kmem_mutex); 3365 mutex_unlock(&activate_kmem_mutex);
3366 return failed;
3450} 3367}
3451 3368
3452struct create_work {
3453 struct mem_cgroup *memcg;
3454 struct kmem_cache *cachep;
3455 struct work_struct work;
3456};
3457
3458static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) 3369static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3459{ 3370{
3460 struct kmem_cache *cachep; 3371 struct kmem_cache *cachep;
@@ -3472,13 +3383,20 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3472 mutex_unlock(&memcg->slab_caches_mutex); 3383 mutex_unlock(&memcg->slab_caches_mutex);
3473} 3384}
3474 3385
3386struct create_work {
3387 struct mem_cgroup *memcg;
3388 struct kmem_cache *cachep;
3389 struct work_struct work;
3390};
3391
3475static void memcg_create_cache_work_func(struct work_struct *w) 3392static void memcg_create_cache_work_func(struct work_struct *w)
3476{ 3393{
3477 struct create_work *cw; 3394 struct create_work *cw = container_of(w, struct create_work, work);
3395 struct mem_cgroup *memcg = cw->memcg;
3396 struct kmem_cache *cachep = cw->cachep;
3478 3397
3479 cw = container_of(w, struct create_work, work); 3398 kmem_cache_create_memcg(memcg, cachep);
3480 memcg_create_kmem_cache(cw->memcg, cw->cachep); 3399 css_put(&memcg->css);
3481 css_put(&cw->memcg->css);
3482 kfree(cw); 3400 kfree(cw);
3483} 3401}
3484 3402
@@ -3637,15 +3555,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3637 if (!current->mm || current->memcg_kmem_skip_account) 3555 if (!current->mm || current->memcg_kmem_skip_account)
3638 return true; 3556 return true;
3639 3557
3640 memcg = try_get_mem_cgroup_from_mm(current->mm); 3558 memcg = get_mem_cgroup_from_mm(current->mm);
3641
3642 /*
3643 * very rare case described in mem_cgroup_from_task. Unfortunately there
3644 * isn't much we can do without complicating this too much, and it would
3645 * be gfp-dependent anyway. Just let it go
3646 */
3647 if (unlikely(!memcg))
3648 return true;
3649 3559
3650 if (!memcg_can_account_kmem(memcg)) { 3560 if (!memcg_can_account_kmem(memcg)) {
3651 css_put(&memcg->css); 3561 css_put(&memcg->css);
@@ -3748,19 +3658,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
3748} 3658}
3749#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3659#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3750 3660
3751static inline
3752void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
3753 struct mem_cgroup *to,
3754 unsigned int nr_pages,
3755 enum mem_cgroup_stat_index idx)
3756{
3757 /* Update stat data for mem_cgroup */
3758 preempt_disable();
3759 __this_cpu_sub(from->stat->count[idx], nr_pages);
3760 __this_cpu_add(to->stat->count[idx], nr_pages);
3761 preempt_enable();
3762}
3763
3764/** 3661/**
3765 * mem_cgroup_move_account - move account of the page 3662 * mem_cgroup_move_account - move account of the page
3766 * @page: the page 3663 * @page: the page
@@ -3806,13 +3703,19 @@ static int mem_cgroup_move_account(struct page *page,
3806 3703
3807 move_lock_mem_cgroup(from, &flags); 3704 move_lock_mem_cgroup(from, &flags);
3808 3705
3809 if (!anon && page_mapped(page)) 3706 if (!anon && page_mapped(page)) {
3810 mem_cgroup_move_account_page_stat(from, to, nr_pages, 3707 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
3811 MEM_CGROUP_STAT_FILE_MAPPED); 3708 nr_pages);
3709 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
3710 nr_pages);
3711 }
3812 3712
3813 if (PageWriteback(page)) 3713 if (PageWriteback(page)) {
3814 mem_cgroup_move_account_page_stat(from, to, nr_pages, 3714 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
3815 MEM_CGROUP_STAT_WRITEBACK); 3715 nr_pages);
3716 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
3717 nr_pages);
3718 }
3816 3719
3817 mem_cgroup_charge_statistics(from, page, anon, -nr_pages); 3720 mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
3818 3721
@@ -3898,19 +3801,19 @@ out:
3898 return ret; 3801 return ret;
3899} 3802}
3900 3803
3901/* 3804int mem_cgroup_charge_anon(struct page *page,
3902 * Charge the memory controller for page usage. 3805 struct mm_struct *mm, gfp_t gfp_mask)
3903 * Return
3904 * 0 if the charge was successful
3905 * < 0 if the cgroup is over its limit
3906 */
3907static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
3908 gfp_t gfp_mask, enum charge_type ctype)
3909{ 3806{
3910 struct mem_cgroup *memcg = NULL;
3911 unsigned int nr_pages = 1; 3807 unsigned int nr_pages = 1;
3808 struct mem_cgroup *memcg;
3912 bool oom = true; 3809 bool oom = true;
3913 int ret; 3810
3811 if (mem_cgroup_disabled())
3812 return 0;
3813
3814 VM_BUG_ON_PAGE(page_mapped(page), page);
3815 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
3816 VM_BUG_ON(!mm);
3914 3817
3915 if (PageTransHuge(page)) { 3818 if (PageTransHuge(page)) {
3916 nr_pages <<= compound_order(page); 3819 nr_pages <<= compound_order(page);
@@ -3922,25 +3825,14 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
3922 oom = false; 3825 oom = false;
3923 } 3826 }
3924 3827
3925 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); 3828 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom);
3926 if (ret == -ENOMEM) 3829 if (!memcg)
3927 return ret; 3830 return -ENOMEM;
3928 __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); 3831 __mem_cgroup_commit_charge(memcg, page, nr_pages,
3832 MEM_CGROUP_CHARGE_TYPE_ANON, false);
3929 return 0; 3833 return 0;
3930} 3834}
3931 3835
3932int mem_cgroup_newpage_charge(struct page *page,
3933 struct mm_struct *mm, gfp_t gfp_mask)
3934{
3935 if (mem_cgroup_disabled())
3936 return 0;
3937 VM_BUG_ON_PAGE(page_mapped(page), page);
3938 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
3939 VM_BUG_ON(!mm);
3940 return mem_cgroup_charge_common(page, mm, gfp_mask,
3941 MEM_CGROUP_CHARGE_TYPE_ANON);
3942}
3943
3944/* 3836/*
3945 * While swap-in, try_charge -> commit or cancel, the page is locked. 3837 * While swap-in, try_charge -> commit or cancel, the page is locked.
3946 * And when try_charge() successfully returns, one refcnt to memcg without 3838 * And when try_charge() successfully returns, one refcnt to memcg without
@@ -3952,7 +3844,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
3952 gfp_t mask, 3844 gfp_t mask,
3953 struct mem_cgroup **memcgp) 3845 struct mem_cgroup **memcgp)
3954{ 3846{
3955 struct mem_cgroup *memcg; 3847 struct mem_cgroup *memcg = NULL;
3956 struct page_cgroup *pc; 3848 struct page_cgroup *pc;
3957 int ret; 3849 int ret;
3958 3850
@@ -3965,31 +3857,29 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
3965 * in turn serializes uncharging. 3857 * in turn serializes uncharging.
3966 */ 3858 */
3967 if (PageCgroupUsed(pc)) 3859 if (PageCgroupUsed(pc))
3968 return 0; 3860 goto out;
3969 if (!do_swap_account) 3861 if (do_swap_account)
3970 goto charge_cur_mm; 3862 memcg = try_get_mem_cgroup_from_page(page);
3971 memcg = try_get_mem_cgroup_from_page(page);
3972 if (!memcg) 3863 if (!memcg)
3973 goto charge_cur_mm; 3864 memcg = get_mem_cgroup_from_mm(mm);
3974 *memcgp = memcg; 3865 ret = mem_cgroup_try_charge(memcg, mask, 1, true);
3975 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
3976 css_put(&memcg->css); 3866 css_put(&memcg->css);
3977 if (ret == -EINTR) 3867 if (ret == -EINTR)
3978 ret = 0; 3868 memcg = root_mem_cgroup;
3979 return ret; 3869 else if (ret)
3980charge_cur_mm: 3870 return ret;
3981 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 3871out:
3982 if (ret == -EINTR) 3872 *memcgp = memcg;
3983 ret = 0; 3873 return 0;
3984 return ret;
3985} 3874}
3986 3875
3987int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, 3876int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
3988 gfp_t gfp_mask, struct mem_cgroup **memcgp) 3877 gfp_t gfp_mask, struct mem_cgroup **memcgp)
3989{ 3878{
3990 *memcgp = NULL; 3879 if (mem_cgroup_disabled()) {
3991 if (mem_cgroup_disabled()) 3880 *memcgp = NULL;
3992 return 0; 3881 return 0;
3882 }
3993 /* 3883 /*
3994 * A racing thread's fault, or swapoff, may have already 3884 * A racing thread's fault, or swapoff, may have already
3995 * updated the pte, and even removed page from swap cache: in 3885 * updated the pte, and even removed page from swap cache: in
@@ -3997,12 +3887,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
3997 * there's also a KSM case which does need to charge the page. 3887 * there's also a KSM case which does need to charge the page.
3998 */ 3888 */
3999 if (!PageSwapCache(page)) { 3889 if (!PageSwapCache(page)) {
4000 int ret; 3890 struct mem_cgroup *memcg;
4001 3891
4002 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); 3892 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
4003 if (ret == -EINTR) 3893 if (!memcg)
4004 ret = 0; 3894 return -ENOMEM;
4005 return ret; 3895 *memcgp = memcg;
3896 return 0;
4006 } 3897 }
4007 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); 3898 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
4008} 3899}
@@ -4046,11 +3937,11 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
4046 MEM_CGROUP_CHARGE_TYPE_ANON); 3937 MEM_CGROUP_CHARGE_TYPE_ANON);
4047} 3938}
4048 3939
4049int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 3940int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
4050 gfp_t gfp_mask) 3941 gfp_t gfp_mask)
4051{ 3942{
4052 struct mem_cgroup *memcg = NULL;
4053 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 3943 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3944 struct mem_cgroup *memcg;
4054 int ret; 3945 int ret;
4055 3946
4056 if (mem_cgroup_disabled()) 3947 if (mem_cgroup_disabled())
@@ -4058,15 +3949,28 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
4058 if (PageCompound(page)) 3949 if (PageCompound(page))
4059 return 0; 3950 return 0;
4060 3951
4061 if (!PageSwapCache(page)) 3952 if (PageSwapCache(page)) { /* shmem */
4062 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
4063 else { /* page is swapcache/shmem */
4064 ret = __mem_cgroup_try_charge_swapin(mm, page, 3953 ret = __mem_cgroup_try_charge_swapin(mm, page,
4065 gfp_mask, &memcg); 3954 gfp_mask, &memcg);
4066 if (!ret) 3955 if (ret)
4067 __mem_cgroup_commit_charge_swapin(page, memcg, type); 3956 return ret;
3957 __mem_cgroup_commit_charge_swapin(page, memcg, type);
3958 return 0;
4068 } 3959 }
4069 return ret; 3960
3961 /*
3962 * Page cache insertions can happen without an actual mm
3963 * context, e.g. during disk probing on boot.
3964 */
3965 if (unlikely(!mm))
3966 memcg = root_mem_cgroup;
3967 else {
3968 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
3969 if (!memcg)
3970 return -ENOMEM;
3971 }
3972 __mem_cgroup_commit_charge(memcg, page, 1, type, false);
3973 return 0;
4070} 3974}
4071 3975
4072static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, 3976static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
@@ -6678,8 +6582,7 @@ one_by_one:
6678 batch_count = PRECHARGE_COUNT_AT_ONCE; 6582 batch_count = PRECHARGE_COUNT_AT_ONCE;
6679 cond_resched(); 6583 cond_resched();
6680 } 6584 }
6681 ret = __mem_cgroup_try_charge(NULL, 6585 ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false);
6682 GFP_KERNEL, 1, &memcg, false);
6683 if (ret) 6586 if (ret)
6684 /* mem_cgroup_clear_mc() will do uncharge later */ 6587 /* mem_cgroup_clear_mc() will do uncharge later */
6685 return ret; 6588 return ret;
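
The memcontrol.c hunks above fold mem_cgroup_newpage_charge() and mem_cgroup_cache_charge() into mem_cgroup_charge_anon() and mem_cgroup_charge_file(), and drop the NULL-memcg special cases now that get_mem_cgroup_from_mm() always returns a reference. A minimal caller-side sketch of the new entry points follows; the two wrapper functions are hypothetical illustrations, only the mem_cgroup_charge_*() calls and their error convention come from the patch.

#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/gfp.h>

/* anonymous fault path: charge before the page is mapped */
static int charge_new_anon_page(struct page *page, struct mm_struct *mm)
{
        if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL))
                return -ENOMEM;         /* caller frees the page and bails out */
        return 0;
}

/* page-cache path: mm may be NULL, e.g. during boot-time disk probing,
 * in which case the charge falls back to the root cgroup internally */
static int charge_page_cache_page(struct page *page, struct mm_struct *mm,
                                  gfp_t gfp)
{
        return mem_cgroup_charge_file(page, mm, gfp);
}
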
diff --git a/mm/memory.c b/mm/memory.c
index 82c1e4cf00d1..d0f0bef3be48 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -60,6 +60,7 @@
60#include <linux/migrate.h> 60#include <linux/migrate.h>
61#include <linux/string.h> 61#include <linux/string.h>
62#include <linux/dma-debug.h> 62#include <linux/dma-debug.h>
63#include <linux/debugfs.h>
63 64
64#include <asm/io.h> 65#include <asm/io.h>
65#include <asm/pgalloc.h> 66#include <asm/pgalloc.h>
@@ -1320,9 +1321,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1320 * It is undesirable to test vma->vm_file as it 1321 * It is undesirable to test vma->vm_file as it
1321 * should be non-null for valid hugetlb area. 1322 * should be non-null for valid hugetlb area.
1322 * However, vm_file will be NULL in the error 1323 * However, vm_file will be NULL in the error
1323 * cleanup path of do_mmap_pgoff. When 1324 * cleanup path of mmap_region. When
1324 * hugetlbfs ->mmap method fails, 1325 * hugetlbfs ->mmap method fails,
1325 * do_mmap_pgoff() nullifies vma->vm_file 1326 * mmap_region() nullifies vma->vm_file
1326 * before calling this function to clean up. 1327 * before calling this function to clean up.
1327 * Since no pte has actually been setup, it is 1328 * Since no pte has actually been setup, it is
1328 * safe to do nothing in this case. 1329 * safe to do nothing in this case.
@@ -2781,7 +2782,7 @@ reuse:
2781 */ 2782 */
2782 if (!page_mkwrite) { 2783 if (!page_mkwrite) {
2783 wait_on_page_locked(dirty_page); 2784 wait_on_page_locked(dirty_page);
2784 set_page_dirty_balance(dirty_page, page_mkwrite); 2785 set_page_dirty_balance(dirty_page);
2785 /* file_update_time outside page_lock */ 2786 /* file_update_time outside page_lock */
2786 if (vma->vm_file) 2787 if (vma->vm_file)
2787 file_update_time(vma->vm_file); 2788 file_update_time(vma->vm_file);
@@ -2827,7 +2828,7 @@ gotten:
2827 } 2828 }
2828 __SetPageUptodate(new_page); 2829 __SetPageUptodate(new_page);
2829 2830
2830 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2831 if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
2831 goto oom_free_new; 2832 goto oom_free_new;
2832 2833
2833 mmun_start = address & PAGE_MASK; 2834 mmun_start = address & PAGE_MASK;
@@ -3280,7 +3281,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
3280 */ 3281 */
3281 __SetPageUptodate(page); 3282 __SetPageUptodate(page);
3282 3283
3283 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) 3284 if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL))
3284 goto oom_free_page; 3285 goto oom_free_page;
3285 3286
3286 entry = mk_pte(page, vma->vm_page_prot); 3287 entry = mk_pte(page, vma->vm_page_prot);
@@ -3342,7 +3343,22 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
3342 return ret; 3343 return ret;
3343} 3344}
3344 3345
3345static void do_set_pte(struct vm_area_struct *vma, unsigned long address, 3346/**
3347 * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
3348 *
3349 * @vma: virtual memory area
3350 * @address: user virtual address
3351 * @page: page to map
3352 * @pte: pointer to target page table entry
 3353 * @write: true if the new entry is writable
 3354 * @anon: true if it's an anonymous page
3355 *
3356 * Caller must hold page table lock relevant for @pte.
3357 *
 3358 * Target users are the page fault handler itself and implementations of
3359 * vm_ops->map_pages.
3360 */
3361void do_set_pte(struct vm_area_struct *vma, unsigned long address,
3346 struct page *page, pte_t *pte, bool write, bool anon) 3362 struct page *page, pte_t *pte, bool write, bool anon)
3347{ 3363{
3348 pte_t entry; 3364 pte_t entry;
@@ -3366,6 +3382,105 @@ static void do_set_pte(struct vm_area_struct *vma, unsigned long address,
3366 update_mmu_cache(vma, address, pte); 3382 update_mmu_cache(vma, address, pte);
3367} 3383}
3368 3384
3385#define FAULT_AROUND_ORDER 4
3386
3387#ifdef CONFIG_DEBUG_FS
3388static unsigned int fault_around_order = FAULT_AROUND_ORDER;
3389
3390static int fault_around_order_get(void *data, u64 *val)
3391{
3392 *val = fault_around_order;
3393 return 0;
3394}
3395
3396static int fault_around_order_set(void *data, u64 val)
3397{
3398 BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE);
3399 if (1UL << val > PTRS_PER_PTE)
3400 return -EINVAL;
3401 fault_around_order = val;
3402 return 0;
3403}
3404DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops,
3405 fault_around_order_get, fault_around_order_set, "%llu\n");
3406
3407static int __init fault_around_debugfs(void)
3408{
3409 void *ret;
3410
3411 ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL,
3412 &fault_around_order_fops);
3413 if (!ret)
3414 pr_warn("Failed to create fault_around_order in debugfs");
3415 return 0;
3416}
3417late_initcall(fault_around_debugfs);
3418
3419static inline unsigned long fault_around_pages(void)
3420{
3421 return 1UL << fault_around_order;
3422}
3423
3424static inline unsigned long fault_around_mask(void)
3425{
3426 return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1);
3427}
3428#else
3429static inline unsigned long fault_around_pages(void)
3430{
3431 unsigned long nr_pages;
3432
3433 nr_pages = 1UL << FAULT_AROUND_ORDER;
3434 BUILD_BUG_ON(nr_pages > PTRS_PER_PTE);
3435 return nr_pages;
3436}
3437
3438static inline unsigned long fault_around_mask(void)
3439{
3440 return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1);
3441}
3442#endif
3443
3444static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
3445 pte_t *pte, pgoff_t pgoff, unsigned int flags)
3446{
3447 unsigned long start_addr;
3448 pgoff_t max_pgoff;
3449 struct vm_fault vmf;
3450 int off;
3451
3452 start_addr = max(address & fault_around_mask(), vma->vm_start);
3453 off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
3454 pte -= off;
3455 pgoff -= off;
3456
3457 /*
3458 * max_pgoff is either end of page table or end of vma
 3459 * or fault_around_pages() from pgoff, depending on what is nearest.
3460 */
3461 max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3462 PTRS_PER_PTE - 1;
3463 max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
3464 pgoff + fault_around_pages() - 1);
3465
3466 /* Check if it makes any sense to call ->map_pages */
3467 while (!pte_none(*pte)) {
3468 if (++pgoff > max_pgoff)
3469 return;
3470 start_addr += PAGE_SIZE;
3471 if (start_addr >= vma->vm_end)
3472 return;
3473 pte++;
3474 }
3475
3476 vmf.virtual_address = (void __user *) start_addr;
3477 vmf.pte = pte;
3478 vmf.pgoff = pgoff;
3479 vmf.max_pgoff = max_pgoff;
3480 vmf.flags = flags;
3481 vma->vm_ops->map_pages(vma, &vmf);
3482}
3483
3369static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3484static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3370 unsigned long address, pmd_t *pmd, 3485 unsigned long address, pmd_t *pmd,
3371 pgoff_t pgoff, unsigned int flags, pte_t orig_pte) 3486 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
@@ -3373,7 +3488,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3373 struct page *fault_page; 3488 struct page *fault_page;
3374 spinlock_t *ptl; 3489 spinlock_t *ptl;
3375 pte_t *pte; 3490 pte_t *pte;
3376 int ret; 3491 int ret = 0;
3492
3493 /*
3494 * Let's call ->map_pages() first and use ->fault() as fallback
 3495 * if the page at the offset is not ready to be mapped (cold page cache or
3496 * something).
3497 */
3498 if (vma->vm_ops->map_pages) {
3499 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
3500 do_fault_around(vma, address, pte, pgoff, flags);
3501 if (!pte_same(*pte, orig_pte))
3502 goto unlock_out;
3503 pte_unmap_unlock(pte, ptl);
3504 }
3377 3505
3378 ret = __do_fault(vma, address, pgoff, flags, &fault_page); 3506 ret = __do_fault(vma, address, pgoff, flags, &fault_page);
3379 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3507 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -3387,8 +3515,9 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3387 return ret; 3515 return ret;
3388 } 3516 }
3389 do_set_pte(vma, address, fault_page, pte, false, false); 3517 do_set_pte(vma, address, fault_page, pte, false, false);
3390 pte_unmap_unlock(pte, ptl);
3391 unlock_page(fault_page); 3518 unlock_page(fault_page);
3519unlock_out:
3520 pte_unmap_unlock(pte, ptl);
3392 return ret; 3521 return ret;
3393} 3522}
3394 3523
@@ -3408,7 +3537,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3408 if (!new_page) 3537 if (!new_page)
3409 return VM_FAULT_OOM; 3538 return VM_FAULT_OOM;
3410 3539
3411 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) { 3540 if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) {
3412 page_cache_release(new_page); 3541 page_cache_release(new_page);
3413 return VM_FAULT_OOM; 3542 return VM_FAULT_OOM;
3414 } 3543 }
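
The do_fault_around() code added above derives its prefault window purely from FAULT_AROUND_ORDER, PAGE_SHIFT and the VMA/page-table bounds. Below is a standalone user-space sketch of that arithmetic, assuming 4K pages and the default order of 4 (both are assumptions here; in the kernel they come from the architecture and the debugfs knob added above).

#include <stdio.h>

#define PAGE_SHIFT          12                  /* assumption: 4K pages */
#define PAGE_SIZE           (1UL << PAGE_SHIFT)
#define FAULT_AROUND_ORDER  4                   /* default set by the patch */

static unsigned long fault_around_pages(void)
{
        return 1UL << FAULT_AROUND_ORDER;       /* 16 pages */
}

static unsigned long fault_around_mask(void)
{
        return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1);  /* 64K-aligned */
}

int main(void)
{
        unsigned long vm_start = 0x400000UL;    /* hypothetical VMA start */
        unsigned long address  = 0x412345UL;    /* hypothetical faulting address */
        unsigned long start    = address & fault_around_mask();

        if (start < vm_start)                   /* clamp to the VMA, as the kernel does */
                start = vm_start;

        /* the kernel additionally clamps to the end of the VMA and of the
         * page table; that part is omitted here for brevity */
        printf("window: %#lx .. %#lx (%lu pages)\n",
               start, start + fault_around_pages() * PAGE_SIZE,
               fault_around_pages());
        return 0;
}

For this worked instance the window is 0x410000..0x420000, i.e. 16 pages around the fault, and do_read_fault() asks ->map_pages() to populate whatever part of that range is already in the page cache before falling back to ->fault().
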
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e3ab02822799..78e1472933ea 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -795,36 +795,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
795 return err; 795 return err;
796} 796}
797 797
798/*
799 * Update task->flags PF_MEMPOLICY bit: set iff non-default
800 * mempolicy. Allows more rapid checking of this (combined perhaps
801 * with other PF_* flag bits) on memory allocation hot code paths.
802 *
803 * If called from outside this file, the task 'p' should -only- be
804 * a newly forked child not yet visible on the task list, because
805 * manipulating the task flags of a visible task is not safe.
806 *
807 * The above limitation is why this routine has the funny name
808 * mpol_fix_fork_child_flag().
809 *
810 * It is also safe to call this with a task pointer of current,
811 * which the static wrapper mpol_set_task_struct_flag() does,
812 * for use within this file.
813 */
814
815void mpol_fix_fork_child_flag(struct task_struct *p)
816{
817 if (p->mempolicy)
818 p->flags |= PF_MEMPOLICY;
819 else
820 p->flags &= ~PF_MEMPOLICY;
821}
822
823static void mpol_set_task_struct_flag(void)
824{
825 mpol_fix_fork_child_flag(current);
826}
827
828/* Set the process memory policy */ 798/* Set the process memory policy */
829static long do_set_mempolicy(unsigned short mode, unsigned short flags, 799static long do_set_mempolicy(unsigned short mode, unsigned short flags,
830 nodemask_t *nodes) 800 nodemask_t *nodes)
@@ -861,7 +831,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
861 } 831 }
862 old = current->mempolicy; 832 old = current->mempolicy;
863 current->mempolicy = new; 833 current->mempolicy = new;
864 mpol_set_task_struct_flag();
865 if (new && new->mode == MPOL_INTERLEAVE && 834 if (new && new->mode == MPOL_INTERLEAVE &&
866 nodes_weight(new->v.nodes)) 835 nodes_weight(new->v.nodes))
867 current->il_next = first_node(new->v.nodes); 836 current->il_next = first_node(new->v.nodes);
@@ -1782,21 +1751,18 @@ static unsigned interleave_nodes(struct mempolicy *policy)
1782/* 1751/*
1783 * Depending on the memory policy provide a node from which to allocate the 1752 * Depending on the memory policy provide a node from which to allocate the
1784 * next slab entry. 1753 * next slab entry.
1785 * @policy must be protected by freeing by the caller. If @policy is
1786 * the current task's mempolicy, this protection is implicit, as only the
1787 * task can change it's policy. The system default policy requires no
1788 * such protection.
1789 */ 1754 */
1790unsigned slab_node(void) 1755unsigned int mempolicy_slab_node(void)
1791{ 1756{
1792 struct mempolicy *policy; 1757 struct mempolicy *policy;
1758 int node = numa_mem_id();
1793 1759
1794 if (in_interrupt()) 1760 if (in_interrupt())
1795 return numa_node_id(); 1761 return node;
1796 1762
1797 policy = current->mempolicy; 1763 policy = current->mempolicy;
1798 if (!policy || policy->flags & MPOL_F_LOCAL) 1764 if (!policy || policy->flags & MPOL_F_LOCAL)
1799 return numa_node_id(); 1765 return node;
1800 1766
1801 switch (policy->mode) { 1767 switch (policy->mode) {
1802 case MPOL_PREFERRED: 1768 case MPOL_PREFERRED:
@@ -1816,11 +1782,11 @@ unsigned slab_node(void)
1816 struct zonelist *zonelist; 1782 struct zonelist *zonelist;
1817 struct zone *zone; 1783 struct zone *zone;
1818 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); 1784 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1819 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; 1785 zonelist = &NODE_DATA(node)->node_zonelists[0];
1820 (void)first_zones_zonelist(zonelist, highest_zoneidx, 1786 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1821 &policy->v.nodes, 1787 &policy->v.nodes,
1822 &zone); 1788 &zone);
1823 return zone ? zone->node : numa_node_id(); 1789 return zone ? zone->node : node;
1824 } 1790 }
1825 1791
1826 default: 1792 default:
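
With PF_MEMPOLICY gone and slab_node() renamed to mempolicy_slab_node(), slab allocators are expected to test current->mempolicy directly, as the mm/slab.c hunk at the end of this series does. A condensed sketch of that node-selection pattern, assuming kernel context; pick_alloc_node() itself is a hypothetical wrapper, not a function from the patch.

#include <linux/mempolicy.h>
#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/topology.h>

static int pick_alloc_node(void)
{
        if (in_interrupt())
                return numa_mem_id();           /* no task context to consult */
        if (current->mempolicy)
                return mempolicy_slab_node();   /* honour the task's policy */
        return numa_mem_id();                   /* default: nearest node with memory */
}
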
diff --git a/mm/mempool.c b/mm/mempool.c
index 659aa42bad16..905434f18c97 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -304,9 +304,9 @@ void mempool_free(void *element, mempool_t *pool)
304 * ensures that there will be frees which return elements to the 304 * ensures that there will be frees which return elements to the
305 * pool waking up the waiters. 305 * pool waking up the waiters.
306 */ 306 */
307 if (pool->curr_nr < pool->min_nr) { 307 if (unlikely(pool->curr_nr < pool->min_nr)) {
308 spin_lock_irqsave(&pool->lock, flags); 308 spin_lock_irqsave(&pool->lock, flags);
309 if (pool->curr_nr < pool->min_nr) { 309 if (likely(pool->curr_nr < pool->min_nr)) {
310 add_element(pool, element); 310 add_element(pool, element);
311 spin_unlock_irqrestore(&pool->lock, flags); 311 spin_unlock_irqrestore(&pool->lock, flags);
312 wake_up(&pool->wait); 312 wake_up(&pool->wait);
diff --git a/mm/mlock.c b/mm/mlock.c
index 4e1a68162285..b1eb53634005 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -79,6 +79,7 @@ void clear_page_mlock(struct page *page)
79 */ 79 */
80void mlock_vma_page(struct page *page) 80void mlock_vma_page(struct page *page)
81{ 81{
82 /* Serialize with page migration */
82 BUG_ON(!PageLocked(page)); 83 BUG_ON(!PageLocked(page));
83 84
84 if (!TestSetPageMlocked(page)) { 85 if (!TestSetPageMlocked(page)) {
@@ -174,6 +175,7 @@ unsigned int munlock_vma_page(struct page *page)
174 unsigned int nr_pages; 175 unsigned int nr_pages;
175 struct zone *zone = page_zone(page); 176 struct zone *zone = page_zone(page);
176 177
178 /* For try_to_munlock() and to serialize with page migration */
177 BUG_ON(!PageLocked(page)); 179 BUG_ON(!PageLocked(page));
178 180
179 /* 181 /*
diff --git a/mm/mmap.c b/mm/mmap.c
index 46433e137abc..b1202cf81f4b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -10,6 +10,7 @@
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/backing-dev.h> 11#include <linux/backing-dev.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/vmacache.h>
13#include <linux/shm.h> 14#include <linux/shm.h>
14#include <linux/mman.h> 15#include <linux/mman.h>
15#include <linux/pagemap.h> 16#include <linux/pagemap.h>
@@ -681,8 +682,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
681 prev->vm_next = next = vma->vm_next; 682 prev->vm_next = next = vma->vm_next;
682 if (next) 683 if (next)
683 next->vm_prev = prev; 684 next->vm_prev = prev;
684 if (mm->mmap_cache == vma) 685
685 mm->mmap_cache = prev; 686 /* Kill the cache */
687 vmacache_invalidate(mm);
686} 688}
687 689
688/* 690/*
@@ -1989,34 +1991,33 @@ EXPORT_SYMBOL(get_unmapped_area);
1989/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ 1991/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
1990struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) 1992struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1991{ 1993{
1992 struct vm_area_struct *vma = NULL; 1994 struct rb_node *rb_node;
1995 struct vm_area_struct *vma;
1993 1996
1994 /* Check the cache first. */ 1997 /* Check the cache first. */
1995 /* (Cache hit rate is typically around 35%.) */ 1998 vma = vmacache_find(mm, addr);
1996 vma = ACCESS_ONCE(mm->mmap_cache); 1999 if (likely(vma))
1997 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { 2000 return vma;
1998 struct rb_node *rb_node;
1999 2001
2000 rb_node = mm->mm_rb.rb_node; 2002 rb_node = mm->mm_rb.rb_node;
2001 vma = NULL; 2003 vma = NULL;
2002 2004
2003 while (rb_node) { 2005 while (rb_node) {
2004 struct vm_area_struct *vma_tmp; 2006 struct vm_area_struct *tmp;
2005 2007
2006 vma_tmp = rb_entry(rb_node, 2008 tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
2007 struct vm_area_struct, vm_rb); 2009
2008 2010 if (tmp->vm_end > addr) {
2009 if (vma_tmp->vm_end > addr) { 2011 vma = tmp;
2010 vma = vma_tmp; 2012 if (tmp->vm_start <= addr)
2011 if (vma_tmp->vm_start <= addr) 2013 break;
2012 break; 2014 rb_node = rb_node->rb_left;
2013 rb_node = rb_node->rb_left; 2015 } else
2014 } else 2016 rb_node = rb_node->rb_right;
2015 rb_node = rb_node->rb_right;
2016 }
2017 if (vma)
2018 mm->mmap_cache = vma;
2019 } 2017 }
2018
2019 if (vma)
2020 vmacache_update(addr, vma);
2020 return vma; 2021 return vma;
2021} 2022}
2022 2023
@@ -2388,7 +2389,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2388 } else 2389 } else
2389 mm->highest_vm_end = prev ? prev->vm_end : 0; 2390 mm->highest_vm_end = prev ? prev->vm_end : 0;
2390 tail_vma->vm_next = NULL; 2391 tail_vma->vm_next = NULL;
2391 mm->mmap_cache = NULL; /* Kill the cache. */ 2392
2393 /* Kill the cache */
2394 vmacache_invalidate(mm);
2392} 2395}
2393 2396
2394/* 2397/*
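
The find_vma() rewrite above is one instance of the new per-thread VMA cache pattern: probe the cache, fall back to the authoritative structure, refresh the cache on a hit, and invalidate it whenever the VMA list changes. A minimal sketch of that pattern, assuming the <linux/vmacache.h> helpers introduced by this series; slow_lookup() is a hypothetical stand-in for the rbtree walk (or the list walk on nommu).

#include <linux/mm.h>
#include <linux/vmacache.h>

/* hypothetical: the rbtree/list walk that find_vma() falls back to */
static struct vm_area_struct *slow_lookup(struct mm_struct *mm,
                                          unsigned long addr);

static struct vm_area_struct *lookup_vma(struct mm_struct *mm,
                                         unsigned long addr)
{
        struct vm_area_struct *vma;

        vma = vmacache_find(mm, addr);          /* cheap per-thread cache probe */
        if (vma)
                return vma;

        vma = slow_lookup(mm, addr);
        if (vma)
                vmacache_update(addr, vma);     /* remember it for next time */
        return vma;
}

/* on unmap/unlink paths the whole cache is dropped via vmacache_invalidate(mm) */
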
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 769a67a15803..c43d557941f8 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -36,6 +36,34 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
36} 36}
37#endif 37#endif
38 38
39/*
40 * For a prot_numa update we only hold mmap_sem for read so there is a
41 * potential race with faulting where a pmd was temporarily none. This
42 * function checks for a transhuge pmd under the appropriate lock. It
43 * returns a pte if it was successfully locked or NULL if it raced with
44 * a transhuge insertion.
45 */
46static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
47 unsigned long addr, int prot_numa, spinlock_t **ptl)
48{
49 pte_t *pte;
50 spinlock_t *pmdl;
51
52 /* !prot_numa is protected by mmap_sem held for write */
53 if (!prot_numa)
54 return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
55
56 pmdl = pmd_lock(vma->vm_mm, pmd);
57 if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
58 spin_unlock(pmdl);
59 return NULL;
60 }
61
62 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
63 spin_unlock(pmdl);
64 return pte;
65}
66
39static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 67static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
40 unsigned long addr, unsigned long end, pgprot_t newprot, 68 unsigned long addr, unsigned long end, pgprot_t newprot,
41 int dirty_accountable, int prot_numa) 69 int dirty_accountable, int prot_numa)
@@ -45,7 +73,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
45 spinlock_t *ptl; 73 spinlock_t *ptl;
46 unsigned long pages = 0; 74 unsigned long pages = 0;
47 75
48 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 76 pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
77 if (!pte)
78 return 0;
79
49 arch_enter_lazy_mmu_mode(); 80 arch_enter_lazy_mmu_mode();
50 do { 81 do {
51 oldpte = *pte; 82 oldpte = *pte;
@@ -109,15 +140,26 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
109 pgprot_t newprot, int dirty_accountable, int prot_numa) 140 pgprot_t newprot, int dirty_accountable, int prot_numa)
110{ 141{
111 pmd_t *pmd; 142 pmd_t *pmd;
143 struct mm_struct *mm = vma->vm_mm;
112 unsigned long next; 144 unsigned long next;
113 unsigned long pages = 0; 145 unsigned long pages = 0;
114 unsigned long nr_huge_updates = 0; 146 unsigned long nr_huge_updates = 0;
147 unsigned long mni_start = 0;
115 148
116 pmd = pmd_offset(pud, addr); 149 pmd = pmd_offset(pud, addr);
117 do { 150 do {
118 unsigned long this_pages; 151 unsigned long this_pages;
119 152
120 next = pmd_addr_end(addr, end); 153 next = pmd_addr_end(addr, end);
154 if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd))
155 continue;
156
157 /* invoke the mmu notifier if the pmd is populated */
158 if (!mni_start) {
159 mni_start = addr;
160 mmu_notifier_invalidate_range_start(mm, mni_start, end);
161 }
162
121 if (pmd_trans_huge(*pmd)) { 163 if (pmd_trans_huge(*pmd)) {
122 if (next - addr != HPAGE_PMD_SIZE) 164 if (next - addr != HPAGE_PMD_SIZE)
123 split_huge_page_pmd(vma, addr, pmd); 165 split_huge_page_pmd(vma, addr, pmd);
@@ -130,18 +172,21 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
130 pages += HPAGE_PMD_NR; 172 pages += HPAGE_PMD_NR;
131 nr_huge_updates++; 173 nr_huge_updates++;
132 } 174 }
175
176 /* huge pmd was handled */
133 continue; 177 continue;
134 } 178 }
135 } 179 }
136 /* fall through */ 180 /* fall through, the trans huge pmd just split */
137 } 181 }
138 if (pmd_none_or_clear_bad(pmd))
139 continue;
140 this_pages = change_pte_range(vma, pmd, addr, next, newprot, 182 this_pages = change_pte_range(vma, pmd, addr, next, newprot,
141 dirty_accountable, prot_numa); 183 dirty_accountable, prot_numa);
142 pages += this_pages; 184 pages += this_pages;
143 } while (pmd++, addr = next, addr != end); 185 } while (pmd++, addr = next, addr != end);
144 186
187 if (mni_start)
188 mmu_notifier_invalidate_range_end(mm, mni_start, end);
189
145 if (nr_huge_updates) 190 if (nr_huge_updates)
146 count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); 191 count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
147 return pages; 192 return pages;
@@ -201,15 +246,12 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
201 unsigned long end, pgprot_t newprot, 246 unsigned long end, pgprot_t newprot,
202 int dirty_accountable, int prot_numa) 247 int dirty_accountable, int prot_numa)
203{ 248{
204 struct mm_struct *mm = vma->vm_mm;
205 unsigned long pages; 249 unsigned long pages;
206 250
207 mmu_notifier_invalidate_range_start(mm, start, end);
208 if (is_vm_hugetlb_page(vma)) 251 if (is_vm_hugetlb_page(vma))
209 pages = hugetlb_change_protection(vma, start, end, newprot); 252 pages = hugetlb_change_protection(vma, start, end, newprot);
210 else 253 else
211 pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); 254 pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
212 mmu_notifier_invalidate_range_end(mm, start, end);
213 255
214 return pages; 256 return pages;
215} 257}
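
The change_pmd_range() hunk above defers the MMU-notifier start call until the walk actually meets a populated pmd, so walks over empty ranges fire no notifiers at all. A reduced sketch of that "lazy start, paired end" pattern follows; huge-pmd splitting and the pte-level work are omitted, and process_one_pmd() is a hypothetical placeholder.

#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <asm/pgtable.h>

/* hypothetical: whatever per-pmd work the real walker performs */
static void process_one_pmd(pmd_t *pmd, unsigned long addr, unsigned long next);

static void walk_range(struct mm_struct *mm, pmd_t *pmd,
                       unsigned long addr, unsigned long end)
{
        unsigned long next, mni_start = 0;

        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;               /* nothing mapped here */

                if (!mni_start) {               /* first populated pmd: notify once */
                        mni_start = addr;
                        mmu_notifier_invalidate_range_start(mm, mni_start, end);
                }
                process_one_pmd(pmd, addr, next);
        } while (pmd++, addr = next, addr != end);

        if (mni_start)                          /* pair up only if we started */
                mmu_notifier_invalidate_range_end(mm, mni_start, end);
}

Note that continue inside the do/while still advances the walk, because the increments live in the while() condition, the same trick the kernel loop relies on.
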
diff --git a/mm/nommu.c b/mm/nommu.c
index a554e5a451cd..85f8d6698d48 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/export.h> 16#include <linux/export.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/vmacache.h>
18#include <linux/mman.h> 19#include <linux/mman.h>
19#include <linux/swap.h> 20#include <linux/swap.h>
20#include <linux/file.h> 21#include <linux/file.h>
@@ -24,6 +25,7 @@
24#include <linux/vmalloc.h> 25#include <linux/vmalloc.h>
25#include <linux/blkdev.h> 26#include <linux/blkdev.h>
26#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/compiler.h>
27#include <linux/mount.h> 29#include <linux/mount.h>
28#include <linux/personality.h> 30#include <linux/personality.h>
29#include <linux/security.h> 31#include <linux/security.h>
@@ -296,7 +298,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
296 count = -(unsigned long) addr; 298 count = -(unsigned long) addr;
297 299
298 memcpy(addr, buf, count); 300 memcpy(addr, buf, count);
299 return(count); 301 return count;
300} 302}
301 303
302/* 304/*
@@ -459,7 +461,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases);
459 * Implement a stub for vmalloc_sync_all() if the architecture chose not to 461 * Implement a stub for vmalloc_sync_all() if the architecture chose not to
460 * have one. 462 * have one.
461 */ 463 */
462void __attribute__((weak)) vmalloc_sync_all(void) 464void __weak vmalloc_sync_all(void)
463{ 465{
464} 466}
465 467
@@ -768,16 +770,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
768 */ 770 */
769static void delete_vma_from_mm(struct vm_area_struct *vma) 771static void delete_vma_from_mm(struct vm_area_struct *vma)
770{ 772{
773 int i;
771 struct address_space *mapping; 774 struct address_space *mapping;
772 struct mm_struct *mm = vma->vm_mm; 775 struct mm_struct *mm = vma->vm_mm;
776 struct task_struct *curr = current;
773 777
774 kenter("%p", vma); 778 kenter("%p", vma);
775 779
776 protect_vma(vma, 0); 780 protect_vma(vma, 0);
777 781
778 mm->map_count--; 782 mm->map_count--;
779 if (mm->mmap_cache == vma) 783 for (i = 0; i < VMACACHE_SIZE; i++) {
780 mm->mmap_cache = NULL; 784 /* if the vma is cached, invalidate the entire cache */
785 if (curr->vmacache[i] == vma) {
786 vmacache_invalidate(curr->mm);
787 break;
788 }
789 }
781 790
782 /* remove the VMA from the mapping */ 791 /* remove the VMA from the mapping */
783 if (vma->vm_file) { 792 if (vma->vm_file) {
@@ -825,8 +834,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
825 struct vm_area_struct *vma; 834 struct vm_area_struct *vma;
826 835
827 /* check the cache first */ 836 /* check the cache first */
828 vma = ACCESS_ONCE(mm->mmap_cache); 837 vma = vmacache_find(mm, addr);
829 if (vma && vma->vm_start <= addr && vma->vm_end > addr) 838 if (likely(vma))
830 return vma; 839 return vma;
831 840
832 /* trawl the list (there may be multiple mappings in which addr 841 /* trawl the list (there may be multiple mappings in which addr
@@ -835,7 +844,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
835 if (vma->vm_start > addr) 844 if (vma->vm_start > addr)
836 return NULL; 845 return NULL;
837 if (vma->vm_end > addr) { 846 if (vma->vm_end > addr) {
838 mm->mmap_cache = vma; 847 vmacache_update(addr, vma);
839 return vma; 848 return vma;
840 } 849 }
841 } 850 }
@@ -874,8 +883,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
874 unsigned long end = addr + len; 883 unsigned long end = addr + len;
875 884
876 /* check the cache first */ 885 /* check the cache first */
877 vma = mm->mmap_cache; 886 vma = vmacache_find_exact(mm, addr, end);
878 if (vma && vma->vm_start == addr && vma->vm_end == end) 887 if (vma)
879 return vma; 888 return vma;
880 889
881 /* trawl the list (there may be multiple mappings in which addr 890 /* trawl the list (there may be multiple mappings in which addr
@@ -886,7 +895,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
886 if (vma->vm_start > addr) 895 if (vma->vm_start > addr)
887 return NULL; 896 return NULL;
888 if (vma->vm_end == end) { 897 if (vma->vm_end == end) {
889 mm->mmap_cache = vma; 898 vmacache_update(addr, vma);
890 return vma; 899 return vma;
891 } 900 }
892 } 901 }
@@ -1003,8 +1012,7 @@ static int validate_mmap_request(struct file *file,
1003 1012
1004 /* we mustn't privatise shared mappings */ 1013 /* we mustn't privatise shared mappings */
1005 capabilities &= ~BDI_CAP_MAP_COPY; 1014 capabilities &= ~BDI_CAP_MAP_COPY;
1006 } 1015 } else {
1007 else {
1008 /* we're going to read the file into private memory we 1016 /* we're going to read the file into private memory we
1009 * allocate */ 1017 * allocate */
1010 if (!(capabilities & BDI_CAP_MAP_COPY)) 1018 if (!(capabilities & BDI_CAP_MAP_COPY))
@@ -1035,23 +1043,20 @@ static int validate_mmap_request(struct file *file,
1035 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { 1043 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
1036 if (prot & PROT_EXEC) 1044 if (prot & PROT_EXEC)
1037 return -EPERM; 1045 return -EPERM;
1038 } 1046 } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
1039 else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
1040 /* handle implication of PROT_EXEC by PROT_READ */ 1047 /* handle implication of PROT_EXEC by PROT_READ */
1041 if (current->personality & READ_IMPLIES_EXEC) { 1048 if (current->personality & READ_IMPLIES_EXEC) {
1042 if (capabilities & BDI_CAP_EXEC_MAP) 1049 if (capabilities & BDI_CAP_EXEC_MAP)
1043 prot |= PROT_EXEC; 1050 prot |= PROT_EXEC;
1044 } 1051 }
1045 } 1052 } else if ((prot & PROT_READ) &&
1046 else if ((prot & PROT_READ) &&
1047 (prot & PROT_EXEC) && 1053 (prot & PROT_EXEC) &&
1048 !(capabilities & BDI_CAP_EXEC_MAP) 1054 !(capabilities & BDI_CAP_EXEC_MAP)
1049 ) { 1055 ) {
1050 /* backing file is not executable, try to copy */ 1056 /* backing file is not executable, try to copy */
1051 capabilities &= ~BDI_CAP_MAP_DIRECT; 1057 capabilities &= ~BDI_CAP_MAP_DIRECT;
1052 } 1058 }
1053 } 1059 } else {
1054 else {
1055 /* anonymous mappings are always memory backed and can be 1060 /* anonymous mappings are always memory backed and can be
1056 * privately mapped 1061 * privately mapped
1057 */ 1062 */
@@ -1659,7 +1664,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1659 /* find the first potentially overlapping VMA */ 1664 /* find the first potentially overlapping VMA */
1660 vma = find_vma(mm, start); 1665 vma = find_vma(mm, start);
1661 if (!vma) { 1666 if (!vma) {
1662 static int limit = 0; 1667 static int limit;
1663 if (limit < 5) { 1668 if (limit < 5) {
1664 printk(KERN_WARNING 1669 printk(KERN_WARNING
1665 "munmap of memory not mmapped by process %d" 1670 "munmap of memory not mmapped by process %d"
@@ -1985,6 +1990,12 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1985} 1990}
1986EXPORT_SYMBOL(filemap_fault); 1991EXPORT_SYMBOL(filemap_fault);
1987 1992
1993void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
1994{
1995 BUG();
1996}
1997EXPORT_SYMBOL(filemap_map_pages);
1998
1988int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, 1999int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
1989 unsigned long size, pgoff_t pgoff) 2000 unsigned long size, pgoff_t pgoff)
1990{ 2001{
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7106cb1aca8e..ef413492a149 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1562,9 +1562,9 @@ pause:
1562 bdi_start_background_writeback(bdi); 1562 bdi_start_background_writeback(bdi);
1563} 1563}
1564 1564
1565void set_page_dirty_balance(struct page *page, int page_mkwrite) 1565void set_page_dirty_balance(struct page *page)
1566{ 1566{
1567 if (set_page_dirty(page) || page_mkwrite) { 1567 if (set_page_dirty(page)) {
1568 struct address_space *mapping = page_mapping(page); 1568 struct address_space *mapping = page_mapping(page);
1569 1569
1570 if (mapping) 1570 if (mapping)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 979378deccbf..5dba2933c9c0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -295,7 +295,8 @@ static inline int bad_range(struct zone *zone, struct page *page)
295} 295}
296#endif 296#endif
297 297
298static void bad_page(struct page *page, char *reason, unsigned long bad_flags) 298static void bad_page(struct page *page, const char *reason,
299 unsigned long bad_flags)
299{ 300{
300 static unsigned long resume; 301 static unsigned long resume;
301 static unsigned long nr_shown; 302 static unsigned long nr_shown;
@@ -623,7 +624,7 @@ out:
623 624
624static inline int free_pages_check(struct page *page) 625static inline int free_pages_check(struct page *page)
625{ 626{
626 char *bad_reason = NULL; 627 const char *bad_reason = NULL;
627 unsigned long bad_flags = 0; 628 unsigned long bad_flags = 0;
628 629
629 if (unlikely(page_mapcount(page))) 630 if (unlikely(page_mapcount(page)))
@@ -859,7 +860,7 @@ static inline void expand(struct zone *zone, struct page *page,
859 */ 860 */
860static inline int check_new_page(struct page *page) 861static inline int check_new_page(struct page *page)
861{ 862{
862 char *bad_reason = NULL; 863 const char *bad_reason = NULL;
863 unsigned long bad_flags = 0; 864 unsigned long bad_flags = 0;
864 865
865 if (unlikely(page_mapcount(page))) 866 if (unlikely(page_mapcount(page)))
@@ -1238,15 +1239,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1238 } 1239 }
1239 local_irq_restore(flags); 1240 local_irq_restore(flags);
1240} 1241}
1241static bool gfp_thisnode_allocation(gfp_t gfp_mask)
1242{
1243 return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
1244}
1245#else
1246static bool gfp_thisnode_allocation(gfp_t gfp_mask)
1247{
1248 return false;
1249}
1250#endif 1242#endif
1251 1243
1252/* 1244/*
@@ -1583,12 +1575,7 @@ again:
1583 get_pageblock_migratetype(page)); 1575 get_pageblock_migratetype(page));
1584 } 1576 }
1585 1577
1586 /* 1578 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1587 * NOTE: GFP_THISNODE allocations do not partake in the kswapd
1588 * aging protocol, so they can't be fair.
1589 */
1590 if (!gfp_thisnode_allocation(gfp_flags))
1591 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1592 1579
1593 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1580 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1594 zone_statistics(preferred_zone, zone, gfp_flags); 1581 zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1870,7 +1857,7 @@ static void __paginginit init_zone_allows_reclaim(int nid)
1870{ 1857{
1871 int i; 1858 int i;
1872 1859
1873 for_each_online_node(i) 1860 for_each_node_state(i, N_MEMORY)
1874 if (node_distance(nid, i) <= RECLAIM_DISTANCE) 1861 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1875 node_set(i, NODE_DATA(nid)->reclaim_nodes); 1862 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1876 else 1863 else
@@ -1954,23 +1941,12 @@ zonelist_scan:
1954 * zone size to ensure fair page aging. The zone a 1941 * zone size to ensure fair page aging. The zone a
1955 * page was allocated in should have no effect on the 1942 * page was allocated in should have no effect on the
1956 * time the page has in memory before being reclaimed. 1943 * time the page has in memory before being reclaimed.
1957 *
1958 * Try to stay in local zones in the fastpath. If
1959 * that fails, the slowpath is entered, which will do
1960 * another pass starting with the local zones, but
1961 * ultimately fall back to remote zones that do not
1962 * partake in the fairness round-robin cycle of this
1963 * zonelist.
1964 *
1965 * NOTE: GFP_THISNODE allocations do not partake in
1966 * the kswapd aging protocol, so they can't be fair.
1967 */ 1944 */
1968 if ((alloc_flags & ALLOC_WMARK_LOW) && 1945 if (alloc_flags & ALLOC_FAIR) {
1969 !gfp_thisnode_allocation(gfp_mask)) {
1970 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1971 continue;
1972 if (!zone_local(preferred_zone, zone)) 1946 if (!zone_local(preferred_zone, zone))
1973 continue; 1947 continue;
1948 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1949 continue;
1974 } 1950 }
1975 /* 1951 /*
1976 * When allocating a page cache page for writing, we 1952 * When allocating a page cache page for writing, we
@@ -2408,32 +2384,40 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2408 return page; 2384 return page;
2409} 2385}
2410 2386
2411static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, 2387static void reset_alloc_batches(struct zonelist *zonelist,
2412 struct zonelist *zonelist, 2388 enum zone_type high_zoneidx,
2413 enum zone_type high_zoneidx, 2389 struct zone *preferred_zone)
2414 struct zone *preferred_zone)
2415{ 2390{
2416 struct zoneref *z; 2391 struct zoneref *z;
2417 struct zone *zone; 2392 struct zone *zone;
2418 2393
2419 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 2394 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2420 if (!(gfp_mask & __GFP_NO_KSWAPD))
2421 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2422 /* 2395 /*
2423 * Only reset the batches of zones that were actually 2396 * Only reset the batches of zones that were actually
2424 * considered in the fast path, we don't want to 2397 * considered in the fairness pass, we don't want to
2425 * thrash fairness information for zones that are not 2398 * trash fairness information for zones that are not
2426 * actually part of this zonelist's round-robin cycle. 2399 * actually part of this zonelist's round-robin cycle.
2427 */ 2400 */
2428 if (!zone_local(preferred_zone, zone)) 2401 if (!zone_local(preferred_zone, zone))
2429 continue; 2402 continue;
2430 mod_zone_page_state(zone, NR_ALLOC_BATCH, 2403 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2431 high_wmark_pages(zone) - 2404 high_wmark_pages(zone) - low_wmark_pages(zone) -
2432 low_wmark_pages(zone) - 2405 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
2433 zone_page_state(zone, NR_ALLOC_BATCH));
2434 } 2406 }
2435} 2407}
2436 2408
2409static void wake_all_kswapds(unsigned int order,
2410 struct zonelist *zonelist,
2411 enum zone_type high_zoneidx,
2412 struct zone *preferred_zone)
2413{
2414 struct zoneref *z;
2415 struct zone *zone;
2416
2417 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
2418 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2419}
2420
2437static inline int 2421static inline int
2438gfp_to_alloc_flags(gfp_t gfp_mask) 2422gfp_to_alloc_flags(gfp_t gfp_mask)
2439{ 2423{
@@ -2522,12 +2506,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2522 * allowed per node queues are empty and that nodes are 2506 * allowed per node queues are empty and that nodes are
2523 * over allocated. 2507 * over allocated.
2524 */ 2508 */
2525 if (gfp_thisnode_allocation(gfp_mask)) 2509 if (IS_ENABLED(CONFIG_NUMA) &&
2510 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2526 goto nopage; 2511 goto nopage;
2527 2512
2528restart: 2513restart:
2529 prepare_slowpath(gfp_mask, order, zonelist, 2514 if (!(gfp_mask & __GFP_NO_KSWAPD))
2530 high_zoneidx, preferred_zone); 2515 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
2531 2516
2532 /* 2517 /*
2533 * OK, we're below the kswapd watermark and have kicked background 2518 * OK, we're below the kswapd watermark and have kicked background
@@ -2711,7 +2696,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2711 struct page *page = NULL; 2696 struct page *page = NULL;
2712 int migratetype = allocflags_to_migratetype(gfp_mask); 2697 int migratetype = allocflags_to_migratetype(gfp_mask);
2713 unsigned int cpuset_mems_cookie; 2698 unsigned int cpuset_mems_cookie;
2714 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; 2699 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2715 struct mem_cgroup *memcg = NULL; 2700 struct mem_cgroup *memcg = NULL;
2716 2701
2717 gfp_mask &= gfp_allowed_mask; 2702 gfp_mask &= gfp_allowed_mask;
@@ -2752,12 +2737,29 @@ retry_cpuset:
2752 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2737 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2753 alloc_flags |= ALLOC_CMA; 2738 alloc_flags |= ALLOC_CMA;
2754#endif 2739#endif
2740retry:
2755 /* First allocation attempt */ 2741 /* First allocation attempt */
2756 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2742 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2757 zonelist, high_zoneidx, alloc_flags, 2743 zonelist, high_zoneidx, alloc_flags,
2758 preferred_zone, migratetype); 2744 preferred_zone, migratetype);
2759 if (unlikely(!page)) { 2745 if (unlikely(!page)) {
2760 /* 2746 /*
2747 * The first pass makes sure allocations are spread
2748 * fairly within the local node. However, the local
2749 * node might have free pages left after the fairness
2750 * batches are exhausted, and remote zones haven't
2751 * even been considered yet. Try once more without
2752 * fairness, and include remote zones now, before
2753 * entering the slowpath and waking kswapd: prefer
2754 * spilling to a remote zone over swapping locally.
2755 */
2756 if (alloc_flags & ALLOC_FAIR) {
2757 reset_alloc_batches(zonelist, high_zoneidx,
2758 preferred_zone);
2759 alloc_flags &= ~ALLOC_FAIR;
2760 goto retry;
2761 }
2762 /*
2761 * Runtime PM, block IO and its error handling path 2763 * Runtime PM, block IO and its error handling path
2762 * can deadlock because I/O on the device might not 2764 * can deadlock because I/O on the device might not
2763 * complete. 2765 * complete.
@@ -4919,7 +4921,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4919 4921
4920 pgdat->node_id = nid; 4922 pgdat->node_id = nid;
4921 pgdat->node_start_pfn = node_start_pfn; 4923 pgdat->node_start_pfn = node_start_pfn;
4922 init_zone_allows_reclaim(nid); 4924 if (node_state(nid, N_MEMORY))
4925 init_zone_allows_reclaim(nid);
4923#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4926#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4924 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 4927 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4925#endif 4928#endif
@@ -5070,7 +5073,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
5070 nodemask_t saved_node_state = node_states[N_MEMORY]; 5073 nodemask_t saved_node_state = node_states[N_MEMORY];
5071 unsigned long totalpages = early_calculate_totalpages(); 5074 unsigned long totalpages = early_calculate_totalpages();
5072 int usable_nodes = nodes_weight(node_states[N_MEMORY]); 5075 int usable_nodes = nodes_weight(node_states[N_MEMORY]);
5073 struct memblock_type *type = &memblock.memory; 5076 struct memblock_region *r;
5074 5077
5075 /* Need to find movable_zone earlier when movable_node is specified. */ 5078 /* Need to find movable_zone earlier when movable_node is specified. */
5076 find_usable_zone_for_movable(); 5079 find_usable_zone_for_movable();
@@ -5080,13 +5083,13 @@ static void __init find_zone_movable_pfns_for_nodes(void)
5080 * options. 5083 * options.
5081 */ 5084 */
5082 if (movable_node_is_enabled()) { 5085 if (movable_node_is_enabled()) {
5083 for (i = 0; i < type->cnt; i++) { 5086 for_each_memblock(memory, r) {
5084 if (!memblock_is_hotpluggable(&type->regions[i])) 5087 if (!memblock_is_hotpluggable(r))
5085 continue; 5088 continue;
5086 5089
5087 nid = type->regions[i].nid; 5090 nid = r->nid;
5088 5091
5089 usable_startpfn = PFN_DOWN(type->regions[i].base); 5092 usable_startpfn = PFN_DOWN(r->base);
5090 zone_movable_pfn[nid] = zone_movable_pfn[nid] ? 5093 zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
5091 min(usable_startpfn, zone_movable_pfn[nid]) : 5094 min(usable_startpfn, zone_movable_pfn[nid]) :
5092 usable_startpfn; 5095 usable_startpfn;
@@ -6544,7 +6547,8 @@ static void dump_page_flags(unsigned long flags)
6544 printk(")\n"); 6547 printk(")\n");
6545} 6548}
6546 6549
6547void dump_page_badflags(struct page *page, char *reason, unsigned long badflags) 6550void dump_page_badflags(struct page *page, const char *reason,
6551 unsigned long badflags)
6548{ 6552{
6549 printk(KERN_ALERT 6553 printk(KERN_ALERT
6550 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 6554 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
@@ -6560,8 +6564,8 @@ void dump_page_badflags(struct page *page, char *reason, unsigned long badflags)
6560 mem_cgroup_print_bad_page(page); 6564 mem_cgroup_print_bad_page(page);
6561} 6565}
6562 6566
6563void dump_page(struct page *page, char *reason) 6567void dump_page(struct page *page, const char *reason)
6564{ 6568{
6565 dump_page_badflags(page, reason, 0); 6569 dump_page_badflags(page, reason, 0);
6566} 6570}
6567EXPORT_SYMBOL_GPL(dump_page); 6571EXPORT_SYMBOL(dump_page);
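
The ALLOC_FAIR changes above turn the fast path into two passes: a fair pass confined to local zones with allocation batches enforced, then, if that comes up empty, a retry without fairness that may spill to remote zones before the slowpath wakes kswapd. A condensed control-flow sketch, assuming the mm-internal ALLOC_* flags; try_freelists(), reset_batches() and slowpath() are hypothetical stand-ins for get_page_from_freelist(), reset_alloc_batches() and __alloc_pages_slowpath().

/* assumes mm/internal.h for the ALLOC_* flags */
static struct page *try_freelists(gfp_t gfp, unsigned int order, int flags);
static void reset_batches(void);
static struct page *slowpath(gfp_t gfp, unsigned int order, int flags);

static struct page *alloc_fastpath(gfp_t gfp_mask, unsigned int order)
{
        int alloc_flags = ALLOC_WMARK_LOW | ALLOC_CPUSET | ALLOC_FAIR;
        struct page *page;

retry:
        page = try_freelists(gfp_mask, order, alloc_flags);
        if (page)
                return page;

        if (alloc_flags & ALLOC_FAIR) {
                /* local fairness batches exhausted: reset them, drop
                 * ALLOC_FAIR and retry, now considering remote zones */
                reset_batches();
                alloc_flags &= ~ALLOC_FAIR;
                goto retry;
        }

        return slowpath(gfp_mask, order, alloc_flags);  /* wakes kswapd, reclaims */
}
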
diff --git a/mm/readahead.c b/mm/readahead.c
index 29c5e1af5a0c..0ca36a7770b1 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -8,9 +8,7 @@
8 */ 8 */
9 9
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/fs.h>
12#include <linux/gfp.h> 11#include <linux/gfp.h>
13#include <linux/mm.h>
14#include <linux/export.h> 12#include <linux/export.h>
15#include <linux/blkdev.h> 13#include <linux/blkdev.h>
16#include <linux/backing-dev.h> 14#include <linux/backing-dev.h>
@@ -20,6 +18,8 @@
20#include <linux/syscalls.h> 18#include <linux/syscalls.h>
21#include <linux/file.h> 19#include <linux/file.h>
22 20
21#include "internal.h"
22
23/* 23/*
24 * Initialise a struct file's readahead state. Assumes that the caller has 24 * Initialise a struct file's readahead state. Assumes that the caller has
25 * memset *ra to zero. 25 * memset *ra to zero.
@@ -149,8 +149,7 @@ out:
149 * 149 *
150 * Returns the number of pages requested, or the maximum amount of I/O allowed. 150 * Returns the number of pages requested, or the maximum amount of I/O allowed.
151 */ 151 */
152static int 152int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
153__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
154 pgoff_t offset, unsigned long nr_to_read, 153 pgoff_t offset, unsigned long nr_to_read,
155 unsigned long lookahead_size) 154 unsigned long lookahead_size)
156{ 155{
@@ -244,20 +243,6 @@ unsigned long max_sane_readahead(unsigned long nr)
244} 243}
245 244
246/* 245/*
247 * Submit IO for the read-ahead request in file_ra_state.
248 */
249unsigned long ra_submit(struct file_ra_state *ra,
250 struct address_space *mapping, struct file *filp)
251{
252 int actual;
253
254 actual = __do_page_cache_readahead(mapping, filp,
255 ra->start, ra->size, ra->async_size);
256
257 return actual;
258}
259
260/*
261 * Set the initial window size, round to next power of 2 and square 246 * Set the initial window size, round to next power of 2 and square
262 * for small size, x 4 for medium, and x 2 for large 247 * for small size, x 4 for medium, and x 2 for large
263 * for 128k (32 page) max ra 248 * for 128k (32 page) max ra
diff --git a/mm/rmap.c b/mm/rmap.c
index 11cf322f8133..9c3e77396d1a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1332,9 +1332,19 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1332 BUG_ON(!page || PageAnon(page)); 1332 BUG_ON(!page || PageAnon(page));
1333 1333
1334 if (locked_vma) { 1334 if (locked_vma) {
1335 mlock_vma_page(page); /* no-op if already mlocked */ 1335 if (page == check_page) {
1336 if (page == check_page) 1336 /* we know we have check_page locked */
1337 mlock_vma_page(page);
1337 ret = SWAP_MLOCK; 1338 ret = SWAP_MLOCK;
1339 } else if (trylock_page(page)) {
1340 /*
1341 * If we can lock the page, perform mlock.
1342 * Otherwise leave the page alone, it will be
1343 * eventually encountered again later.
1344 */
1345 mlock_vma_page(page);
1346 unlock_page(page);
1347 }
1338 continue; /* don't unmap */ 1348 continue; /* don't unmap */
1339 } 1349 }
1340 1350
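The try_to_unmap_cluster() hunk above only mlocks a page whose lock it actually holds: check_page is known to be locked by the caller, any other page is touched only if trylock_page() succeeds, and otherwise it is skipped and picked up on a later pass. A userspace sketch of that trylock-or-skip pattern, with pthread mutexes standing in for page locks (build with cc -pthread):

#include <pthread.h>
#include <stdio.h>

struct fake_page {
    pthread_mutex_t lock;
    int mlocked;
};

/* Caller must hold page->lock (models mlock_vma_page()). */
static void mark_mlocked(struct fake_page *page)
{
    page->mlocked = 1;
}

/* Only operate on pages we can lock; otherwise leave them for later. */
static void scan_one(struct fake_page *page, struct fake_page *check_page)
{
    if (page == check_page) {
        /* we know the caller already locked check_page */
        mark_mlocked(page);
    } else if (pthread_mutex_trylock(&page->lock) == 0) {
        mark_mlocked(page);
        pthread_mutex_unlock(&page->lock);
    }
    /* trylock failed: skip, the page will be encountered again later */
}

int main(void)
{
    struct fake_page a = { PTHREAD_MUTEX_INITIALIZER, 0 };
    struct fake_page b = { PTHREAD_MUTEX_INITIALIZER, 0 };

    pthread_mutex_lock(&a.lock);        /* caller holds check_page's lock */
    scan_one(&a, &a);
    scan_one(&b, &a);
    pthread_mutex_unlock(&a.lock);
    printf("a.mlocked=%d b.mlocked=%d\n", a.mlocked, b.mlocked);
    return 0;
}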
diff --git a/mm/shmem.c b/mm/shmem.c
index a3ba988ec946..70273f8df586 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -683,7 +683,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
683 * the shmem_swaplist_mutex which might hold up shmem_writepage(). 683 * the shmem_swaplist_mutex which might hold up shmem_writepage().
684 * Charged back to the user (not to caller) when swap account is used. 684 * Charged back to the user (not to caller) when swap account is used.
685 */ 685 */
686 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 686 error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL);
687 if (error) 687 if (error)
688 goto out; 688 goto out;
689 /* No radix_tree_preload: swap entry keeps a place for page in tree */ 689 /* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -1080,7 +1080,7 @@ repeat:
1080 goto failed; 1080 goto failed;
1081 } 1081 }
1082 1082
1083 error = mem_cgroup_cache_charge(page, current->mm, 1083 error = mem_cgroup_charge_file(page, current->mm,
1084 gfp & GFP_RECLAIM_MASK); 1084 gfp & GFP_RECLAIM_MASK);
1085 if (!error) { 1085 if (!error) {
1086 error = shmem_add_to_page_cache(page, mapping, index, 1086 error = shmem_add_to_page_cache(page, mapping, index,
@@ -1134,7 +1134,7 @@ repeat:
1134 1134
1135 SetPageSwapBacked(page); 1135 SetPageSwapBacked(page);
1136 __set_page_locked(page); 1136 __set_page_locked(page);
1137 error = mem_cgroup_cache_charge(page, current->mm, 1137 error = mem_cgroup_charge_file(page, current->mm,
1138 gfp & GFP_RECLAIM_MASK); 1138 gfp & GFP_RECLAIM_MASK);
1139 if (error) 1139 if (error)
1140 goto decused; 1140 goto decused;
@@ -2723,6 +2723,7 @@ static const struct super_operations shmem_ops = {
2723 2723
2724static const struct vm_operations_struct shmem_vm_ops = { 2724static const struct vm_operations_struct shmem_vm_ops = {
2725 .fault = shmem_fault, 2725 .fault = shmem_fault,
2726 .map_pages = filemap_map_pages,
2726#ifdef CONFIG_NUMA 2727#ifdef CONFIG_NUMA
2727 .set_policy = shmem_set_policy, 2728 .set_policy = shmem_set_policy,
2728 .get_policy = shmem_get_policy, 2729 .get_policy = shmem_get_policy,
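Adding .map_pages = filemap_map_pages to shmem_vm_ops lets tmpfs reuse the generic fault-around handler, since shmem pages already live in the page cache. A toy userspace model of the pattern, a per-backend ops table that plugs a shared generic callback into the optional batch path (the names here are illustrative, not the kernel types):

#include <stdio.h>

struct vm_ops {
    int  (*fault)(unsigned long addr);                      /* required */
    void (*map_pages)(unsigned long start, unsigned long end); /* optional */
};

/* Shared generic handler, usable by any backend whose pages sit in a
 * common cache (models filemap_map_pages()). */
static void generic_map_pages(unsigned long start, unsigned long end)
{
    printf("map-ahead %#lx-%#lx without a full fault per page\n", start, end);
}

static int shmem_like_fault(unsigned long addr)
{
    printf("hard fault at %#lx\n", addr);
    return 0;
}

static const struct vm_ops shmem_like_ops = {
    .fault     = shmem_like_fault,
    .map_pages = generic_map_pages,     /* the one-line change in the diff */
};

int main(void)
{
    if (shmem_like_ops.map_pages)
        shmem_like_ops.map_pages(0x1000, 0x5000);
    return shmem_like_ops.fault(0x3000);
}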
diff --git a/mm/slab.c b/mm/slab.c
index 9153c802e2fe..3db4cb06e32e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3027,7 +3027,7 @@ out:
3027 3027
3028#ifdef CONFIG_NUMA 3028#ifdef CONFIG_NUMA
3029/* 3029/*
3030 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. 3030 * Try allocating on another node if PF_SPREAD_SLAB or a mempolicy is set.
3031 * 3031 *
3032 * If we are in_interrupt, then process context, including cpusets and 3032 * If we are in_interrupt, then process context, including cpusets and
3033 * mempolicy, may not apply and should not be used for allocation policy. 3033 * mempolicy, may not apply and should not be used for allocation policy.
@@ -3042,7 +3042,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3042 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3042 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3043 nid_alloc = cpuset_slab_spread_node(); 3043 nid_alloc = cpuset_slab_spread_node();
3044 else if (current->mempolicy) 3044 else if (current->mempolicy)
3045 nid_alloc = slab_node(); 3045 nid_alloc = mempolicy_slab_node();
3046 if (nid_alloc != nid_here) 3046 if (nid_alloc != nid_here)
3047 return ____cache_alloc_node(cachep, flags, nid_alloc); 3047 return ____cache_alloc_node(cachep, flags, nid_alloc);
3048 return NULL; 3048 return NULL;
@@ -3074,7 +3074,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3074 3074
3075retry_cpuset: 3075retry_cpuset:
3076 cpuset_mems_cookie = read_mems_allowed_begin(); 3076 cpuset_mems_cookie = read_mems_allowed_begin();
3077 zonelist = node_zonelist(slab_node(), flags); 3077 zonelist = node_zonelist(mempolicy_slab_node(), flags);
3078 3078
3079retry: 3079retry:
3080 /* 3080 /*
@@ -3259,7 +3259,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
3259{ 3259{
3260 void *objp; 3260 void *objp;
3261 3261
3262 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { 3262 if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) {
3263 objp = alternate_node_alloc(cache, flags); 3263 objp = alternate_node_alloc(cache, flags);
3264 if (objp) 3264 if (objp)
3265 goto out; 3265 goto out;
diff --git a/mm/slab.h b/mm/slab.h
index 8184a7cde272..3045316b7c9d 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -55,12 +55,12 @@ extern void create_boot_cache(struct kmem_cache *, const char *name,
55struct mem_cgroup; 55struct mem_cgroup;
56#ifdef CONFIG_SLUB 56#ifdef CONFIG_SLUB
57struct kmem_cache * 57struct kmem_cache *
58__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, 58__kmem_cache_alias(const char *name, size_t size, size_t align,
59 size_t align, unsigned long flags, void (*ctor)(void *)); 59 unsigned long flags, void (*ctor)(void *));
60#else 60#else
61static inline struct kmem_cache * 61static inline struct kmem_cache *
62__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, 62__kmem_cache_alias(const char *name, size_t size, size_t align,
63 size_t align, unsigned long flags, void (*ctor)(void *)) 63 unsigned long flags, void (*ctor)(void *))
64{ return NULL; } 64{ return NULL; }
65#endif 65#endif
66 66
@@ -119,13 +119,6 @@ static inline bool is_root_cache(struct kmem_cache *s)
119 return !s->memcg_params || s->memcg_params->is_root_cache; 119 return !s->memcg_params || s->memcg_params->is_root_cache;
120} 120}
121 121
122static inline bool cache_match_memcg(struct kmem_cache *cachep,
123 struct mem_cgroup *memcg)
124{
125 return (is_root_cache(cachep) && !memcg) ||
126 (cachep->memcg_params->memcg == memcg);
127}
128
129static inline void memcg_bind_pages(struct kmem_cache *s, int order) 122static inline void memcg_bind_pages(struct kmem_cache *s, int order)
130{ 123{
131 if (!is_root_cache(s)) 124 if (!is_root_cache(s))
@@ -204,12 +197,6 @@ static inline bool is_root_cache(struct kmem_cache *s)
204 return true; 197 return true;
205} 198}
206 199
207static inline bool cache_match_memcg(struct kmem_cache *cachep,
208 struct mem_cgroup *memcg)
209{
210 return true;
211}
212
213static inline void memcg_bind_pages(struct kmem_cache *s, int order) 200static inline void memcg_bind_pages(struct kmem_cache *s, int order)
214{ 201{
215} 202}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1ec3c619ba04..f3cfccf76dda 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -29,8 +29,7 @@ DEFINE_MUTEX(slab_mutex);
29struct kmem_cache *kmem_cache; 29struct kmem_cache *kmem_cache;
30 30
31#ifdef CONFIG_DEBUG_VM 31#ifdef CONFIG_DEBUG_VM
32static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, 32static int kmem_cache_sanity_check(const char *name, size_t size)
33 size_t size)
34{ 33{
35 struct kmem_cache *s = NULL; 34 struct kmem_cache *s = NULL;
36 35
@@ -57,13 +56,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
57 } 56 }
58 57
59#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON) 58#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
60 /* 59 if (!strcmp(s->name, name)) {
61 * For simplicity, we won't check this in the list of memcg
62 * caches. We have control over memcg naming, and if there
63 * aren't duplicates in the global list, there won't be any
64 * duplicates in the memcg lists as well.
65 */
66 if (!memcg && !strcmp(s->name, name)) {
67 pr_err("%s (%s): Cache name already exists.\n", 60 pr_err("%s (%s): Cache name already exists.\n",
68 __func__, name); 61 __func__, name);
69 dump_stack(); 62 dump_stack();
@@ -77,8 +70,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
77 return 0; 70 return 0;
78} 71}
79#else 72#else
80static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, 73static inline int kmem_cache_sanity_check(const char *name, size_t size)
81 const char *name, size_t size)
82{ 74{
83 return 0; 75 return 0;
84} 76}
@@ -139,6 +131,46 @@ unsigned long calculate_alignment(unsigned long flags,
139 return ALIGN(align, sizeof(void *)); 131 return ALIGN(align, sizeof(void *));
140} 132}
141 133
134static struct kmem_cache *
135do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
136 unsigned long flags, void (*ctor)(void *),
137 struct mem_cgroup *memcg, struct kmem_cache *root_cache)
138{
139 struct kmem_cache *s;
140 int err;
141
142 err = -ENOMEM;
143 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
144 if (!s)
145 goto out;
146
147 s->name = name;
148 s->object_size = object_size;
149 s->size = size;
150 s->align = align;
151 s->ctor = ctor;
152
153 err = memcg_alloc_cache_params(memcg, s, root_cache);
154 if (err)
155 goto out_free_cache;
156
157 err = __kmem_cache_create(s, flags);
158 if (err)
159 goto out_free_cache;
160
161 s->refcount = 1;
162 list_add(&s->list, &slab_caches);
163 memcg_register_cache(s);
164out:
165 if (err)
166 return ERR_PTR(err);
167 return s;
168
169out_free_cache:
170 memcg_free_cache_params(s);
171 kfree(s);
172 goto out;
173}
142 174
143/* 175/*
144 * kmem_cache_create - Create a cache. 176 * kmem_cache_create - Create a cache.
@@ -164,34 +196,21 @@ unsigned long calculate_alignment(unsigned long flags,
164 * cacheline. This can be beneficial if you're counting cycles as closely 196 * cacheline. This can be beneficial if you're counting cycles as closely
165 * as davem. 197 * as davem.
166 */ 198 */
167
168struct kmem_cache * 199struct kmem_cache *
169kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, 200kmem_cache_create(const char *name, size_t size, size_t align,
170 size_t align, unsigned long flags, void (*ctor)(void *), 201 unsigned long flags, void (*ctor)(void *))
171 struct kmem_cache *parent_cache)
172{ 202{
173 struct kmem_cache *s = NULL; 203 struct kmem_cache *s;
204 char *cache_name;
174 int err; 205 int err;
175 206
176 get_online_cpus(); 207 get_online_cpus();
177 mutex_lock(&slab_mutex); 208 mutex_lock(&slab_mutex);
178 209
179 err = kmem_cache_sanity_check(memcg, name, size); 210 err = kmem_cache_sanity_check(name, size);
180 if (err) 211 if (err)
181 goto out_unlock; 212 goto out_unlock;
182 213
183 if (memcg) {
184 /*
185 * Since per-memcg caches are created asynchronously on first
186 * allocation (see memcg_kmem_get_cache()), several threads can
187 * try to create the same cache, but only one of them may
188 * succeed. Therefore if we get here and see the cache has
189 * already been created, we silently return NULL.
190 */
191 if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg)))
192 goto out_unlock;
193 }
194
195 /* 214 /*
196 * Some allocators will constraint the set of valid flags to a subset 215 * Some allocators will constraint the set of valid flags to a subset
197 * of all flags. We expect them to define CACHE_CREATE_MASK in this 216 * of all flags. We expect them to define CACHE_CREATE_MASK in this
@@ -200,50 +219,29 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
200 */ 219 */
201 flags &= CACHE_CREATE_MASK; 220 flags &= CACHE_CREATE_MASK;
202 221
203 s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); 222 s = __kmem_cache_alias(name, size, align, flags, ctor);
204 if (s) 223 if (s)
205 goto out_unlock; 224 goto out_unlock;
206 225
207 err = -ENOMEM; 226 cache_name = kstrdup(name, GFP_KERNEL);
208 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); 227 if (!cache_name) {
209 if (!s) 228 err = -ENOMEM;
210 goto out_unlock; 229 goto out_unlock;
230 }
211 231
212 s->object_size = s->size = size; 232 s = do_kmem_cache_create(cache_name, size, size,
213 s->align = calculate_alignment(flags, align, size); 233 calculate_alignment(flags, align, size),
214 s->ctor = ctor; 234 flags, ctor, NULL, NULL);
215 235 if (IS_ERR(s)) {
216 s->name = kstrdup(name, GFP_KERNEL); 236 err = PTR_ERR(s);
217 if (!s->name) 237 kfree(cache_name);
218 goto out_free_cache; 238 }
219
220 err = memcg_alloc_cache_params(memcg, s, parent_cache);
221 if (err)
222 goto out_free_cache;
223
224 err = __kmem_cache_create(s, flags);
225 if (err)
226 goto out_free_cache;
227
228 s->refcount = 1;
229 list_add(&s->list, &slab_caches);
230 memcg_register_cache(s);
231 239
232out_unlock: 240out_unlock:
233 mutex_unlock(&slab_mutex); 241 mutex_unlock(&slab_mutex);
234 put_online_cpus(); 242 put_online_cpus();
235 243
236 if (err) { 244 if (err) {
237 /*
238 * There is no point in flooding logs with warnings or
239 * especially crashing the system if we fail to create a cache
240 * for a memcg. In this case we will be accounting the memcg
241 * allocation to the root cgroup until we succeed to create its
242 * own cache, but it isn't that critical.
243 */
244 if (!memcg)
245 return NULL;
246
247 if (flags & SLAB_PANIC) 245 if (flags & SLAB_PANIC)
248 panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", 246 panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
249 name, err); 247 name, err);
@@ -255,52 +253,112 @@ out_unlock:
255 return NULL; 253 return NULL;
256 } 254 }
257 return s; 255 return s;
256}
257EXPORT_SYMBOL(kmem_cache_create);
258 258
259out_free_cache: 259#ifdef CONFIG_MEMCG_KMEM
260 memcg_free_cache_params(s); 260/*
261 kfree(s->name); 261 * kmem_cache_create_memcg - Create a cache for a memory cgroup.
262 kmem_cache_free(kmem_cache, s); 262 * @memcg: The memory cgroup the new cache is for.
263 goto out_unlock; 263 * @root_cache: The parent of the new cache.
264 *
265 * This function attempts to create a kmem cache that will serve allocation
266 * requests going from @memcg to @root_cache. The new cache inherits properties
267 * from its parent.
268 */
269void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache)
270{
271 struct kmem_cache *s;
272 char *cache_name;
273
274 get_online_cpus();
275 mutex_lock(&slab_mutex);
276
277 /*
278 * Since per-memcg caches are created asynchronously on first
279 * allocation (see memcg_kmem_get_cache()), several threads can try to
280 * create the same cache, but only one of them may succeed.
281 */
282 if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg)))
283 goto out_unlock;
284
285 cache_name = memcg_create_cache_name(memcg, root_cache);
286 if (!cache_name)
287 goto out_unlock;
288
289 s = do_kmem_cache_create(cache_name, root_cache->object_size,
290 root_cache->size, root_cache->align,
291 root_cache->flags, root_cache->ctor,
292 memcg, root_cache);
293 if (IS_ERR(s)) {
294 kfree(cache_name);
295 goto out_unlock;
296 }
297
298 s->allocflags |= __GFP_KMEMCG;
299
300out_unlock:
301 mutex_unlock(&slab_mutex);
302 put_online_cpus();
264} 303}
265 304
266struct kmem_cache * 305static int kmem_cache_destroy_memcg_children(struct kmem_cache *s)
267kmem_cache_create(const char *name, size_t size, size_t align,
268 unsigned long flags, void (*ctor)(void *))
269{ 306{
270 return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); 307 int rc;
308
309 if (!s->memcg_params ||
310 !s->memcg_params->is_root_cache)
311 return 0;
312
313 mutex_unlock(&slab_mutex);
314 rc = __kmem_cache_destroy_memcg_children(s);
315 mutex_lock(&slab_mutex);
316
317 return rc;
271} 318}
272EXPORT_SYMBOL(kmem_cache_create); 319#else
320static int kmem_cache_destroy_memcg_children(struct kmem_cache *s)
321{
322 return 0;
323}
324#endif /* CONFIG_MEMCG_KMEM */
273 325
274void kmem_cache_destroy(struct kmem_cache *s) 326void kmem_cache_destroy(struct kmem_cache *s)
275{ 327{
276 /* Destroy all the children caches if we aren't a memcg cache */
277 kmem_cache_destroy_memcg_children(s);
278
279 get_online_cpus(); 328 get_online_cpus();
280 mutex_lock(&slab_mutex); 329 mutex_lock(&slab_mutex);
330
281 s->refcount--; 331 s->refcount--;
282 if (!s->refcount) { 332 if (s->refcount)
283 list_del(&s->list); 333 goto out_unlock;
284 334
285 if (!__kmem_cache_shutdown(s)) { 335 if (kmem_cache_destroy_memcg_children(s) != 0)
286 memcg_unregister_cache(s); 336 goto out_unlock;
287 mutex_unlock(&slab_mutex); 337
288 if (s->flags & SLAB_DESTROY_BY_RCU) 338 list_del(&s->list);
289 rcu_barrier(); 339 memcg_unregister_cache(s);
290 340
291 memcg_free_cache_params(s); 341 if (__kmem_cache_shutdown(s) != 0) {
292 kfree(s->name); 342 list_add(&s->list, &slab_caches);
293 kmem_cache_free(kmem_cache, s); 343 memcg_register_cache(s);
294 } else { 344 printk(KERN_ERR "kmem_cache_destroy %s: "
295 list_add(&s->list, &slab_caches); 345 "Slab cache still has objects\n", s->name);
296 mutex_unlock(&slab_mutex); 346 dump_stack();
297 printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", 347 goto out_unlock;
298 s->name);
299 dump_stack();
300 }
301 } else {
302 mutex_unlock(&slab_mutex);
303 } 348 }
349
350 mutex_unlock(&slab_mutex);
351 if (s->flags & SLAB_DESTROY_BY_RCU)
352 rcu_barrier();
353
354 memcg_free_cache_params(s);
355 kfree(s->name);
356 kmem_cache_free(kmem_cache, s);
357 goto out_put_cpus;
358
359out_unlock:
360 mutex_unlock(&slab_mutex);
361out_put_cpus:
304 put_online_cpus(); 362 put_online_cpus();
305} 363}
306EXPORT_SYMBOL(kmem_cache_destroy); 364EXPORT_SYMBOL(kmem_cache_destroy);
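The slab_common.c rework above folds the duplicated creation paths into do_kmem_cache_create(), which returns either a cache or an ERR_PTR() and unwinds with gotos, while kmem_cache_destroy() grows a matching goto-based unlock path. A compact userspace model of that error-pointer convention (simplified reimplementations of ERR_PTR/IS_ERR/PTR_ERR, not the kernel macros):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-ins for the kernel's ERR_PTR()/IS_ERR()/PTR_ERR(). */
#define MAX_ERRNO 4095
static inline void *ERR_PTR(long err)      { return (void *)err; }
static inline long  PTR_ERR(const void *p) { return (long)p; }
static inline int   IS_ERR(const void *p)
{
    return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

struct cache { char *name; size_t size; };

/* One helper for every creation path: allocate, set up, register; on any
 * failure free what was allocated and hand back an error pointer. */
static struct cache *do_cache_create(char *name, size_t size)
{
    struct cache *s;
    int err = -ENOMEM;

    s = calloc(1, sizeof(*s));
    if (!s)
        goto out;
    s->name = name;
    s->size = size;
    if (size == 0) {            /* stand-in for __kmem_cache_create() failing */
        err = -EINVAL;
        goto out_free_cache;
    }
    return s;

out_free_cache:
    free(s);
out:
    return ERR_PTR(err);
}

int main(void)
{
    char *name = strdup("demo");
    struct cache *s = do_cache_create(name, 0);

    if (IS_ERR(s)) {
        printf("create failed: %ld\n", PTR_ERR(s));
        free(name);             /* caller keeps ownership of the name on failure */
        return 1;
    }
    printf("created %s (%zu bytes)\n", s->name, s->size);
    free(s->name);
    free(s);
    return 0;
}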
diff --git a/mm/slub.c b/mm/slub.c
index fe6d7be22ef0..f620bbf4054a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -224,7 +224,11 @@ static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
224static inline void stat(const struct kmem_cache *s, enum stat_item si) 224static inline void stat(const struct kmem_cache *s, enum stat_item si)
225{ 225{
226#ifdef CONFIG_SLUB_STATS 226#ifdef CONFIG_SLUB_STATS
227 __this_cpu_inc(s->cpu_slab->stat[si]); 227 /*
228 * The rmw is racy on a preemptible kernel but this is acceptable, so
229 * avoid this_cpu_add()'s irq-disable overhead.
230 */
231 raw_cpu_inc(s->cpu_slab->stat[si]);
228#endif 232#endif
229} 233}
230 234
@@ -1685,7 +1689,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1685 1689
1686 do { 1690 do {
1687 cpuset_mems_cookie = read_mems_allowed_begin(); 1691 cpuset_mems_cookie = read_mems_allowed_begin();
1688 zonelist = node_zonelist(slab_node(), flags); 1692 zonelist = node_zonelist(mempolicy_slab_node(), flags);
1689 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1693 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1690 struct kmem_cache_node *n; 1694 struct kmem_cache_node *n;
1691 1695
@@ -3685,6 +3689,9 @@ static int slab_unmergeable(struct kmem_cache *s)
3685 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) 3689 if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
3686 return 1; 3690 return 1;
3687 3691
3692 if (!is_root_cache(s))
3693 return 1;
3694
3688 if (s->ctor) 3695 if (s->ctor)
3689 return 1; 3696 return 1;
3690 3697
@@ -3697,9 +3704,8 @@ static int slab_unmergeable(struct kmem_cache *s)
3697 return 0; 3704 return 0;
3698} 3705}
3699 3706
3700static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, 3707static struct kmem_cache *find_mergeable(size_t size, size_t align,
3701 size_t align, unsigned long flags, const char *name, 3708 unsigned long flags, const char *name, void (*ctor)(void *))
3702 void (*ctor)(void *))
3703{ 3709{
3704 struct kmem_cache *s; 3710 struct kmem_cache *s;
3705 3711
@@ -3722,7 +3728,7 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
3722 continue; 3728 continue;
3723 3729
3724 if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) 3730 if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
3725 continue; 3731 continue;
3726 /* 3732 /*
3727 * Check if alignment is compatible. 3733 * Check if alignment is compatible.
3728 * Courtesy of Adrian Drzewiecki 3734 * Courtesy of Adrian Drzewiecki
@@ -3733,23 +3739,24 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
3733 if (s->size - size >= sizeof(void *)) 3739 if (s->size - size >= sizeof(void *))
3734 continue; 3740 continue;
3735 3741
3736 if (!cache_match_memcg(s, memcg))
3737 continue;
3738
3739 return s; 3742 return s;
3740 } 3743 }
3741 return NULL; 3744 return NULL;
3742} 3745}
3743 3746
3744struct kmem_cache * 3747struct kmem_cache *
3745__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, 3748__kmem_cache_alias(const char *name, size_t size, size_t align,
3746 size_t align, unsigned long flags, void (*ctor)(void *)) 3749 unsigned long flags, void (*ctor)(void *))
3747{ 3750{
3748 struct kmem_cache *s; 3751 struct kmem_cache *s;
3749 3752
3750 s = find_mergeable(memcg, size, align, flags, name, ctor); 3753 s = find_mergeable(size, align, flags, name, ctor);
3751 if (s) { 3754 if (s) {
3755 int i;
3756 struct kmem_cache *c;
3757
3752 s->refcount++; 3758 s->refcount++;
3759
3753 /* 3760 /*
3754 * Adjust the object sizes so that we clear 3761 * Adjust the object sizes so that we clear
3755 * the complete object on kzalloc. 3762 * the complete object on kzalloc.
@@ -3757,6 +3764,15 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
3757 s->object_size = max(s->object_size, (int)size); 3764 s->object_size = max(s->object_size, (int)size);
3758 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3765 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3759 3766
3767 for_each_memcg_cache_index(i) {
3768 c = cache_from_memcg_idx(s, i);
3769 if (!c)
3770 continue;
3771 c->object_size = s->object_size;
3772 c->inuse = max_t(int, c->inuse,
3773 ALIGN(size, sizeof(void *)));
3774 }
3775
3760 if (sysfs_slab_alias(s, name)) { 3776 if (sysfs_slab_alias(s, name)) {
3761 s->refcount--; 3777 s->refcount--;
3762 s = NULL; 3778 s = NULL;
@@ -5126,6 +5142,15 @@ static const struct kset_uevent_ops slab_uevent_ops = {
5126 5142
5127static struct kset *slab_kset; 5143static struct kset *slab_kset;
5128 5144
5145static inline struct kset *cache_kset(struct kmem_cache *s)
5146{
5147#ifdef CONFIG_MEMCG_KMEM
5148 if (!is_root_cache(s))
5149 return s->memcg_params->root_cache->memcg_kset;
5150#endif
5151 return slab_kset;
5152}
5153
5129#define ID_STR_LENGTH 64 5154#define ID_STR_LENGTH 64
5130 5155
5131/* Create a unique string id for a slab cache: 5156/* Create a unique string id for a slab cache:
@@ -5191,26 +5216,39 @@ static int sysfs_slab_add(struct kmem_cache *s)
5191 name = create_unique_id(s); 5216 name = create_unique_id(s);
5192 } 5217 }
5193 5218
5194 s->kobj.kset = slab_kset; 5219 s->kobj.kset = cache_kset(s);
5195 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); 5220 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
5196 if (err) { 5221 if (err)
5197 kobject_put(&s->kobj); 5222 goto out_put_kobj;
5198 return err;
5199 }
5200 5223
5201 err = sysfs_create_group(&s->kobj, &slab_attr_group); 5224 err = sysfs_create_group(&s->kobj, &slab_attr_group);
5202 if (err) { 5225 if (err)
5203 kobject_del(&s->kobj); 5226 goto out_del_kobj;
5204 kobject_put(&s->kobj); 5227
5205 return err; 5228#ifdef CONFIG_MEMCG_KMEM
5229 if (is_root_cache(s)) {
5230 s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
5231 if (!s->memcg_kset) {
5232 err = -ENOMEM;
5233 goto out_del_kobj;
5234 }
5206 } 5235 }
5236#endif
5237
5207 kobject_uevent(&s->kobj, KOBJ_ADD); 5238 kobject_uevent(&s->kobj, KOBJ_ADD);
5208 if (!unmergeable) { 5239 if (!unmergeable) {
5209 /* Setup first alias */ 5240 /* Setup first alias */
5210 sysfs_slab_alias(s, s->name); 5241 sysfs_slab_alias(s, s->name);
5211 kfree(name);
5212 } 5242 }
5213 return 0; 5243out:
5244 if (!unmergeable)
5245 kfree(name);
5246 return err;
5247out_del_kobj:
5248 kobject_del(&s->kobj);
5249out_put_kobj:
5250 kobject_put(&s->kobj);
5251 goto out;
5214} 5252}
5215 5253
5216static void sysfs_slab_remove(struct kmem_cache *s) 5254static void sysfs_slab_remove(struct kmem_cache *s)
@@ -5222,6 +5260,9 @@ static void sysfs_slab_remove(struct kmem_cache *s)
5222 */ 5260 */
5223 return; 5261 return;
5224 5262
5263#ifdef CONFIG_MEMCG_KMEM
5264 kset_unregister(s->memcg_kset);
5265#endif
5225 kobject_uevent(&s->kobj, KOBJ_REMOVE); 5266 kobject_uevent(&s->kobj, KOBJ_REMOVE);
5226 kobject_del(&s->kobj); 5267 kobject_del(&s->kobj);
5227 kobject_put(&s->kobj); 5268 kobject_put(&s->kobj);
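The stat() change at the top of the slub.c diff switches to raw_cpu_inc(): the read-modify-write can race if the task is preempted and migrated mid-update, and that is tolerated because these are only statistics, so the irq-disable cost of this_cpu_inc() on some architectures is not worth paying. A userspace sketch of the same trade-off, per-"cpu" counters bumped with a plain increment and summed approximately at read time (the race itself is only described in comments, not reproduced):

#include <stdio.h>

#define NR_CPUS 4
enum stat_item { ALLOC_FASTPATH, FREE_FASTPATH, NR_STATS };

/* One counter array per "cpu", like the kmem_cache_cpu stat[] arrays. */
static unsigned long stats[NR_CPUS][NR_STATS];

/* Pretend scheduler state: which cpu the current task runs on. */
static int current_cpu;

/* Models raw_cpu_inc(): a plain, non-atomic read-modify-write on the
 * current cpu's slot.  In the kernel, preemption between computing the
 * per-cpu address and storing back can lose an increment; that is
 * acceptable for statistics and avoids any locking or irq-disabling. */
static void stat_inc(enum stat_item si)
{
    stats[current_cpu][si]++;
}

static unsigned long stat_sum(enum stat_item si)
{
    unsigned long sum = 0;

    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        sum += stats[cpu][si];
    return sum;
}

int main(void)
{
    for (int i = 0; i < 1000; i++) {
        current_cpu = i % NR_CPUS;      /* "migrations" between increments */
        stat_inc(ALLOC_FASTPATH);
    }
    printf("ALLOC_FASTPATH ~ %lu (approximate is fine)\n",
           stat_sum(ALLOC_FASTPATH));
    return 0;
}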
diff --git a/mm/sparse.c b/mm/sparse.c
index 38cad8fd7397..d1b48b691ac8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -5,10 +5,12 @@
5#include <linux/slab.h> 5#include <linux/slab.h>
6#include <linux/mmzone.h> 6#include <linux/mmzone.h>
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/compiler.h>
8#include <linux/highmem.h> 9#include <linux/highmem.h>
9#include <linux/export.h> 10#include <linux/export.h>
10#include <linux/spinlock.h> 11#include <linux/spinlock.h>
11#include <linux/vmalloc.h> 12#include <linux/vmalloc.h>
13
12#include "internal.h" 14#include "internal.h"
13#include <asm/dma.h> 15#include <asm/dma.h>
14#include <asm/pgalloc.h> 16#include <asm/pgalloc.h>
@@ -461,7 +463,7 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
461} 463}
462#endif 464#endif
463 465
464void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) 466void __weak __meminit vmemmap_populate_print_last(void)
465{ 467{
466} 468}
467 469
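The __attribute__((weak)) to __weak conversions in this and the following hunks rely on the same linker behaviour either way: the empty stub is a weak symbol that a stronger per-architecture definition silently replaces at link time. A small sketch of the mechanism using the plain GCC/Clang attribute (the kernel's __weak macro is not available in userspace):

/* default.c -- build with: cc -o demo default.c [override.c] */
#include <stdio.h>

/* Weak default: a no-op fallback, exactly like the kernel's weak
 * vmemmap_populate_print_last() / vmalloc_sync_all() stubs. */
__attribute__((weak)) void populate_print_last(void)
{
    printf("default (weak) implementation\n");
}

int main(void)
{
    populate_print_last();
    return 0;
}

/* override.c -- if linked in, this strong definition wins:
 *
 *    #include <stdio.h>
 *    void populate_print_last(void)
 *    {
 *        printf("architecture-specific implementation\n");
 *    }
 */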
diff --git a/mm/util.c b/mm/util.c
index a24aa22f2473..d7813e6d4cc7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,6 +1,7 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/slab.h> 2#include <linux/slab.h>
3#include <linux/string.h> 3#include <linux/string.h>
4#include <linux/compiler.h>
4#include <linux/export.h> 5#include <linux/export.h>
5#include <linux/err.h> 6#include <linux/err.h>
6#include <linux/sched.h> 7#include <linux/sched.h>
@@ -307,7 +308,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
307 * If the architecture not support this function, simply return with no 308 * If the architecture not support this function, simply return with no
308 * page pinned 309 * page pinned
309 */ 310 */
310int __attribute__((weak)) __get_user_pages_fast(unsigned long start, 311int __weak __get_user_pages_fast(unsigned long start,
311 int nr_pages, int write, struct page **pages) 312 int nr_pages, int write, struct page **pages)
312{ 313{
313 return 0; 314 return 0;
@@ -338,7 +339,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
338 * callers need to carefully consider what to use. On many architectures, 339 * callers need to carefully consider what to use. On many architectures,
339 * get_user_pages_fast simply falls back to get_user_pages. 340 * get_user_pages_fast simply falls back to get_user_pages.
340 */ 341 */
341int __attribute__((weak)) get_user_pages_fast(unsigned long start, 342int __weak get_user_pages_fast(unsigned long start,
342 int nr_pages, int write, struct page **pages) 343 int nr_pages, int write, struct page **pages)
343{ 344{
344 struct mm_struct *mm = current->mm; 345 struct mm_struct *mm = current->mm;
diff --git a/mm/vmacache.c b/mm/vmacache.c
new file mode 100644
index 000000000000..d4224b397c0e
--- /dev/null
+++ b/mm/vmacache.c
@@ -0,0 +1,112 @@
1/*
2 * Copyright (C) 2014 Davidlohr Bueso.
3 */
4#include <linux/sched.h>
5#include <linux/mm.h>
6#include <linux/vmacache.h>
7
8/*
9 * Flush vma caches for threads that share a given mm.
10 *
11 * The operation is safe because the caller holds the mmap_sem
12 * exclusively and other threads accessing the vma cache will
13 * have mmap_sem held at least for read, so no extra locking
14 * is required to maintain the vma cache.
15 */
16void vmacache_flush_all(struct mm_struct *mm)
17{
18 struct task_struct *g, *p;
19
20 rcu_read_lock();
21 for_each_process_thread(g, p) {
22 /*
23 * Only flush the vmacache pointers as the
24 * mm seqnum is already set and curr's will
25 * be set upon invalidation when the next
26 * lookup is done.
27 */
28 if (mm == p->mm)
29 vmacache_flush(p);
30 }
31 rcu_read_unlock();
32}
33
34/*
35 * This task may be accessing a foreign mm via (for example)
36 * get_user_pages()->find_vma(). The vmacache is task-local and this
37 * task's vmacache pertains to a different mm (ie, its own). There is
38 * nothing we can do here.
39 *
40 * Also handle the case where a kernel thread has adopted this mm via use_mm().
41 * That kernel thread's vmacache is not applicable to this mm.
42 */
43static bool vmacache_valid_mm(struct mm_struct *mm)
44{
45 return current->mm == mm && !(current->flags & PF_KTHREAD);
46}
47
48void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
49{
50 if (vmacache_valid_mm(newvma->vm_mm))
51 current->vmacache[VMACACHE_HASH(addr)] = newvma;
52}
53
54static bool vmacache_valid(struct mm_struct *mm)
55{
56 struct task_struct *curr;
57
58 if (!vmacache_valid_mm(mm))
59 return false;
60
61 curr = current;
62 if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
63 /*
64 * First attempt will always be invalid, initialize
65 * the new cache for this task here.
66 */
67 curr->vmacache_seqnum = mm->vmacache_seqnum;
68 vmacache_flush(curr);
69 return false;
70 }
71 return true;
72}
73
74struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
75{
76 int i;
77
78 if (!vmacache_valid(mm))
79 return NULL;
80
81 for (i = 0; i < VMACACHE_SIZE; i++) {
82 struct vm_area_struct *vma = current->vmacache[i];
83
84 if (vma && vma->vm_start <= addr && vma->vm_end > addr) {
85 BUG_ON(vma->vm_mm != mm);
86 return vma;
87 }
88 }
89
90 return NULL;
91}
92
93#ifndef CONFIG_MMU
94struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
95 unsigned long start,
96 unsigned long end)
97{
98 int i;
99
100 if (!vmacache_valid(mm))
101 return NULL;
102
103 for (i = 0; i < VMACACHE_SIZE; i++) {
104 struct vm_area_struct *vma = current->vmacache[i];
105
106 if (vma && vma->vm_start == start && vma->vm_end == end)
107 return vma;
108 }
109
110 return NULL;
111}
112#endif
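The new mm/vmacache.c replaces the single per-mm mmap_cache with a small per-task, direct-mapped cache indexed by a hash of the faulting address and invalidated lazily via a sequence number. A userspace model of the lookup/update/invalidate scheme shown above (an array of ranges stands in for the VMA rbtree; the size and hash mirror VMACACHE_SIZE/VMACACHE_HASH but are assumptions here, since the header is not part of this diff):

#include <stdio.h>
#include <stddef.h>

#define PAGE_SHIFT    12
#define CACHE_SIZE    4U                        /* models VMACACHE_SIZE */
#define CACHE_HASH(a) (((a) >> PAGE_SHIFT) & (CACHE_SIZE - 1))

struct range { unsigned long start, end; };     /* stand-in for vm_area_struct */

struct addr_space {                             /* stand-in for mm_struct */
    struct range ranges[8];
    size_t nr;
    unsigned long seqnum;                       /* bumped whenever the set changes */
};

struct task_cache {                             /* the per-task cache fields */
    struct range *slot[CACHE_SIZE];
    unsigned long seqnum;
};

static struct range *slow_lookup(struct addr_space *as, unsigned long addr)
{
    for (size_t i = 0; i < as->nr; i++)
        if (as->ranges[i].start <= addr && addr < as->ranges[i].end)
            return &as->ranges[i];
    return NULL;
}

static struct range *cached_lookup(struct task_cache *tc, struct addr_space *as,
                                   unsigned long addr)
{
    struct range *r;

    if (tc->seqnum != as->seqnum) {             /* lazy flush, as in vmacache_valid() */
        tc->seqnum = as->seqnum;
        for (unsigned int i = 0; i < CACHE_SIZE; i++)
            tc->slot[i] = NULL;
    } else {
        for (unsigned int i = 0; i < CACHE_SIZE; i++) {     /* vmacache_find() */
            r = tc->slot[i];
            if (r && r->start <= addr && addr < r->end)
                return r;                       /* cache hit */
        }
    }
    r = slow_lookup(as, addr);                  /* rbtree walk in the kernel */
    if (r)
        tc->slot[CACHE_HASH(addr)] = r;         /* vmacache_update() */
    return r;
}

int main(void)
{
    struct addr_space as = {
        .ranges = { { 0x1000, 0x5000 }, { 0x9000, 0xa000 } },
        .nr = 2, .seqnum = 1,
    };
    struct task_cache tc = { .seqnum = 0 };

    printf("miss:  %p\n", (void *)cached_lookup(&tc, &as, 0x2000));
    printf("hit:   %p\n", (void *)cached_lookup(&tc, &as, 0x2abc));
    as.seqnum++;                                /* e.g. after an munmap() */
    printf("flush: %p\n", (void *)cached_lookup(&tc, &as, 0x2000));
    return 0;
}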
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0fdf96803c5b..bf233b283319 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -27,7 +27,9 @@
27#include <linux/pfn.h> 27#include <linux/pfn.h>
28#include <linux/kmemleak.h> 28#include <linux/kmemleak.h>
29#include <linux/atomic.h> 29#include <linux/atomic.h>
30#include <linux/compiler.h>
30#include <linux/llist.h> 31#include <linux/llist.h>
32
31#include <asm/uaccess.h> 33#include <asm/uaccess.h>
32#include <asm/tlbflush.h> 34#include <asm/tlbflush.h>
33#include <asm/shmparam.h> 35#include <asm/shmparam.h>
@@ -1083,6 +1085,12 @@ EXPORT_SYMBOL(vm_unmap_ram);
1083 * @node: prefer to allocate data structures on this node 1085 * @node: prefer to allocate data structures on this node
1084 * @prot: memory protection to use. PAGE_KERNEL for regular RAM 1086 * @prot: memory protection to use. PAGE_KERNEL for regular RAM
1085 * 1087 *
1088 * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
1089 * faster than vmap so it's good. But if you mix long-life and short-life
1090 * objects with vm_map_ram(), it could consume lots of address space through
1091 * fragmentation (especially on a 32bit machine). You could see failures in
1092 * the end. Please use this function for short-lived objects.
1093 *
1086 * Returns: a pointer to the address that has been mapped, or %NULL on failure 1094 * Returns: a pointer to the address that has been mapped, or %NULL on failure
1087 */ 1095 */
1088void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) 1096void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
@@ -2181,7 +2189,7 @@ EXPORT_SYMBOL(remap_vmalloc_range);
2181 * Implement a stub for vmalloc_sync_all() if the architecture chose not to 2189 * Implement a stub for vmalloc_sync_all() if the architecture chose not to
2182 * have one. 2190 * have one.
2183 */ 2191 */
2184void __attribute__((weak)) vmalloc_sync_all(void) 2192void __weak vmalloc_sync_all(void)
2185{ 2193{
2186} 2194}
2187 2195
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1f56a80a7c41..06879ead7380 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2314,15 +2314,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2314 unsigned long lru_pages = 0; 2314 unsigned long lru_pages = 0;
2315 bool aborted_reclaim = false; 2315 bool aborted_reclaim = false;
2316 struct reclaim_state *reclaim_state = current->reclaim_state; 2316 struct reclaim_state *reclaim_state = current->reclaim_state;
2317 gfp_t orig_mask;
2317 struct shrink_control shrink = { 2318 struct shrink_control shrink = {
2318 .gfp_mask = sc->gfp_mask, 2319 .gfp_mask = sc->gfp_mask,
2319 }; 2320 };
2321 enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
2320 2322
2321 /* 2323 /*
2322 * If the number of buffer_heads in the machine exceeds the maximum 2324 * If the number of buffer_heads in the machine exceeds the maximum
2323 * allowed level, force direct reclaim to scan the highmem zone as 2325 * allowed level, force direct reclaim to scan the highmem zone as
2324 * highmem pages could be pinning lowmem pages storing buffer_heads 2326 * highmem pages could be pinning lowmem pages storing buffer_heads
2325 */ 2327 */
2328 orig_mask = sc->gfp_mask;
2326 if (buffer_heads_over_limit) 2329 if (buffer_heads_over_limit)
2327 sc->gfp_mask |= __GFP_HIGHMEM; 2330 sc->gfp_mask |= __GFP_HIGHMEM;
2328 2331
@@ -2356,7 +2359,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2356 * noticeable problem, like transparent huge 2359 * noticeable problem, like transparent huge
2357 * page allocations. 2360 * page allocations.
2358 */ 2361 */
2359 if (compaction_ready(zone, sc)) { 2362 if ((zonelist_zone_idx(z) <= requested_highidx)
2363 && compaction_ready(zone, sc)) {
2360 aborted_reclaim = true; 2364 aborted_reclaim = true;
2361 continue; 2365 continue;
2362 } 2366 }
@@ -2393,6 +2397,12 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2393 } 2397 }
2394 } 2398 }
2395 2399
2400 /*
2401 * Restore to original mask to avoid the impact on the caller if we
2402 * promoted it to __GFP_HIGHMEM.
2403 */
2404 sc->gfp_mask = orig_mask;
2405
2396 return aborted_reclaim; 2406 return aborted_reclaim;
2397} 2407}
2398 2408
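The orig_mask save/restore above keeps the temporary __GFP_HIGHMEM promotion from leaking back to the caller through the shared scan_control. The shape of the fix, as a small sketch with made-up flag values:

#include <stdio.h>

#define FLAG_HIGHMEM 0x1u               /* illustrative, not the real gfp bits */

struct scan_ctl { unsigned int gfp_mask; };

static int buffer_heads_over_limit = 1;

static void shrink_zones_like(struct scan_ctl *sc)
{
    unsigned int orig_mask = sc->gfp_mask;      /* remember the caller's mask */

    if (buffer_heads_over_limit)
        sc->gfp_mask |= FLAG_HIGHMEM;           /* widen scope for this pass only */

    /* ... walk the zonelist using sc->gfp_mask ... */

    sc->gfp_mask = orig_mask;                   /* don't leak the promotion */
}

int main(void)
{
    struct scan_ctl sc = { .gfp_mask = 0x4u };

    shrink_zones_like(&sc);
    printf("caller sees mask %#x (unchanged)\n", sc.gfp_mask);
    return 0;
}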
diff --git a/mm/zswap.c b/mm/zswap.c
index d7337fbf6605..aeaef0fb5624 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -89,6 +89,9 @@ static unsigned int zswap_max_pool_percent = 20;
89module_param_named(max_pool_percent, 89module_param_named(max_pool_percent,
90 zswap_max_pool_percent, uint, 0644); 90 zswap_max_pool_percent, uint, 0644);
91 91
92/* zbud_pool is shared by all of zswap backend */
93static struct zbud_pool *zswap_pool;
94
92/********************************* 95/*********************************
93* compression functions 96* compression functions
94**********************************/ 97**********************************/
@@ -160,14 +163,14 @@ static void zswap_comp_exit(void)
160 * rbnode - links the entry into red-black tree for the appropriate swap type 163 * rbnode - links the entry into red-black tree for the appropriate swap type
161 * refcount - the number of outstanding reference to the entry. This is needed 164 * refcount - the number of outstanding reference to the entry. This is needed
162 * to protect against premature freeing of the entry by code 165 * to protect against premature freeing of the entry by code
163 * concurent calls to load, invalidate, and writeback. The lock 166 * concurrent calls to load, invalidate, and writeback. The lock
164 * for the zswap_tree structure that contains the entry must 167 * for the zswap_tree structure that contains the entry must
165 * be held while changing the refcount. Since the lock must 168 * be held while changing the refcount. Since the lock must
166 * be held, there is no reason to also make refcount atomic. 169 * be held, there is no reason to also make refcount atomic.
167 * offset - the swap offset for the entry. Index into the red-black tree. 170 * offset - the swap offset for the entry. Index into the red-black tree.
168 * handle - zsmalloc allocation handle that stores the compressed page data 171 * handle - zbud allocation handle that stores the compressed page data
169 * length - the length in bytes of the compressed page data. Needed during 172 * length - the length in bytes of the compressed page data. Needed during
170 * decompression 173 * decompression
171 */ 174 */
172struct zswap_entry { 175struct zswap_entry {
173 struct rb_node rbnode; 176 struct rb_node rbnode;
@@ -189,7 +192,6 @@ struct zswap_header {
189struct zswap_tree { 192struct zswap_tree {
190 struct rb_root rbroot; 193 struct rb_root rbroot;
191 spinlock_t lock; 194 spinlock_t lock;
192 struct zbud_pool *pool;
193}; 195};
194 196
195static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; 197static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
@@ -202,7 +204,7 @@ static struct kmem_cache *zswap_entry_cache;
202static int zswap_entry_cache_create(void) 204static int zswap_entry_cache_create(void)
203{ 205{
204 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); 206 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
205 return (zswap_entry_cache == NULL); 207 return zswap_entry_cache == NULL;
206} 208}
207 209
208static void zswap_entry_cache_destory(void) 210static void zswap_entry_cache_destory(void)
@@ -282,16 +284,15 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
282} 284}
283 285
284/* 286/*
285 * Carries out the common pattern of freeing and entry's zsmalloc allocation, 287 * Carries out the common pattern of freeing and entry's zbud allocation,
286 * freeing the entry itself, and decrementing the number of stored pages. 288 * freeing the entry itself, and decrementing the number of stored pages.
287 */ 289 */
288static void zswap_free_entry(struct zswap_tree *tree, 290static void zswap_free_entry(struct zswap_entry *entry)
289 struct zswap_entry *entry)
290{ 291{
291 zbud_free(tree->pool, entry->handle); 292 zbud_free(zswap_pool, entry->handle);
292 zswap_entry_cache_free(entry); 293 zswap_entry_cache_free(entry);
293 atomic_dec(&zswap_stored_pages); 294 atomic_dec(&zswap_stored_pages);
294 zswap_pool_pages = zbud_get_pool_size(tree->pool); 295 zswap_pool_pages = zbud_get_pool_size(zswap_pool);
295} 296}
296 297
297/* caller must hold the tree lock */ 298/* caller must hold the tree lock */
@@ -311,7 +312,7 @@ static void zswap_entry_put(struct zswap_tree *tree,
311 BUG_ON(refcount < 0); 312 BUG_ON(refcount < 0);
312 if (refcount == 0) { 313 if (refcount == 0) {
313 zswap_rb_erase(&tree->rbroot, entry); 314 zswap_rb_erase(&tree->rbroot, entry);
314 zswap_free_entry(tree, entry); 315 zswap_free_entry(entry);
315 } 316 }
316} 317}
317 318
@@ -407,8 +408,8 @@ cleanup:
407**********************************/ 408**********************************/
408static bool zswap_is_full(void) 409static bool zswap_is_full(void)
409{ 410{
410 return (totalram_pages * zswap_max_pool_percent / 100 < 411 return totalram_pages * zswap_max_pool_percent / 100 <
411 zswap_pool_pages); 412 zswap_pool_pages;
412} 413}
413 414
414/********************************* 415/*********************************
@@ -545,7 +546,6 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
545 zbud_unmap(pool, handle); 546 zbud_unmap(pool, handle);
546 tree = zswap_trees[swp_type(swpentry)]; 547 tree = zswap_trees[swp_type(swpentry)];
547 offset = swp_offset(swpentry); 548 offset = swp_offset(swpentry);
548 BUG_ON(pool != tree->pool);
549 549
550 /* find and ref zswap entry */ 550 /* find and ref zswap entry */
551 spin_lock(&tree->lock); 551 spin_lock(&tree->lock);
@@ -573,13 +573,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
573 case ZSWAP_SWAPCACHE_NEW: /* page is locked */ 573 case ZSWAP_SWAPCACHE_NEW: /* page is locked */
574 /* decompress */ 574 /* decompress */
575 dlen = PAGE_SIZE; 575 dlen = PAGE_SIZE;
576 src = (u8 *)zbud_map(tree->pool, entry->handle) + 576 src = (u8 *)zbud_map(zswap_pool, entry->handle) +
577 sizeof(struct zswap_header); 577 sizeof(struct zswap_header);
578 dst = kmap_atomic(page); 578 dst = kmap_atomic(page);
579 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, 579 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
580 entry->length, dst, &dlen); 580 entry->length, dst, &dlen);
581 kunmap_atomic(dst); 581 kunmap_atomic(dst);
582 zbud_unmap(tree->pool, entry->handle); 582 zbud_unmap(zswap_pool, entry->handle);
583 BUG_ON(ret); 583 BUG_ON(ret);
584 BUG_ON(dlen != PAGE_SIZE); 584 BUG_ON(dlen != PAGE_SIZE);
585 585
@@ -652,7 +652,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
652 /* reclaim space if needed */ 652 /* reclaim space if needed */
653 if (zswap_is_full()) { 653 if (zswap_is_full()) {
654 zswap_pool_limit_hit++; 654 zswap_pool_limit_hit++;
655 if (zbud_reclaim_page(tree->pool, 8)) { 655 if (zbud_reclaim_page(zswap_pool, 8)) {
656 zswap_reject_reclaim_fail++; 656 zswap_reject_reclaim_fail++;
657 ret = -ENOMEM; 657 ret = -ENOMEM;
658 goto reject; 658 goto reject;
@@ -679,7 +679,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
679 679
680 /* store */ 680 /* store */
681 len = dlen + sizeof(struct zswap_header); 681 len = dlen + sizeof(struct zswap_header);
682 ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN, 682 ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
683 &handle); 683 &handle);
684 if (ret == -ENOSPC) { 684 if (ret == -ENOSPC) {
685 zswap_reject_compress_poor++; 685 zswap_reject_compress_poor++;
@@ -689,11 +689,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
689 zswap_reject_alloc_fail++; 689 zswap_reject_alloc_fail++;
690 goto freepage; 690 goto freepage;
691 } 691 }
692 zhdr = zbud_map(tree->pool, handle); 692 zhdr = zbud_map(zswap_pool, handle);
693 zhdr->swpentry = swp_entry(type, offset); 693 zhdr->swpentry = swp_entry(type, offset);
694 buf = (u8 *)(zhdr + 1); 694 buf = (u8 *)(zhdr + 1);
695 memcpy(buf, dst, dlen); 695 memcpy(buf, dst, dlen);
696 zbud_unmap(tree->pool, handle); 696 zbud_unmap(zswap_pool, handle);
697 put_cpu_var(zswap_dstmem); 697 put_cpu_var(zswap_dstmem);
698 698
699 /* populate entry */ 699 /* populate entry */
@@ -716,7 +716,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
716 716
717 /* update stats */ 717 /* update stats */
718 atomic_inc(&zswap_stored_pages); 718 atomic_inc(&zswap_stored_pages);
719 zswap_pool_pages = zbud_get_pool_size(tree->pool); 719 zswap_pool_pages = zbud_get_pool_size(zswap_pool);
720 720
721 return 0; 721 return 0;
722 722
@@ -752,13 +752,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
752 752
753 /* decompress */ 753 /* decompress */
754 dlen = PAGE_SIZE; 754 dlen = PAGE_SIZE;
755 src = (u8 *)zbud_map(tree->pool, entry->handle) + 755 src = (u8 *)zbud_map(zswap_pool, entry->handle) +
756 sizeof(struct zswap_header); 756 sizeof(struct zswap_header);
757 dst = kmap_atomic(page); 757 dst = kmap_atomic(page);
758 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, 758 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
759 dst, &dlen); 759 dst, &dlen);
760 kunmap_atomic(dst); 760 kunmap_atomic(dst);
761 zbud_unmap(tree->pool, entry->handle); 761 zbud_unmap(zswap_pool, entry->handle);
762 BUG_ON(ret); 762 BUG_ON(ret);
763 763
764 spin_lock(&tree->lock); 764 spin_lock(&tree->lock);
@@ -804,11 +804,9 @@ static void zswap_frontswap_invalidate_area(unsigned type)
804 /* walk the tree and free everything */ 804 /* walk the tree and free everything */
805 spin_lock(&tree->lock); 805 spin_lock(&tree->lock);
806 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) 806 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
807 zswap_free_entry(tree, entry); 807 zswap_free_entry(entry);
808 tree->rbroot = RB_ROOT; 808 tree->rbroot = RB_ROOT;
809 spin_unlock(&tree->lock); 809 spin_unlock(&tree->lock);
810
811 zbud_destroy_pool(tree->pool);
812 kfree(tree); 810 kfree(tree);
813 zswap_trees[type] = NULL; 811 zswap_trees[type] = NULL;
814} 812}
@@ -822,20 +820,14 @@ static void zswap_frontswap_init(unsigned type)
822 struct zswap_tree *tree; 820 struct zswap_tree *tree;
823 821
824 tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); 822 tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
825 if (!tree) 823 if (!tree) {
826 goto err; 824 pr_err("alloc failed, zswap disabled for swap type %d\n", type);
827 tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); 825 return;
828 if (!tree->pool) 826 }
829 goto freetree; 827
830 tree->rbroot = RB_ROOT; 828 tree->rbroot = RB_ROOT;
831 spin_lock_init(&tree->lock); 829 spin_lock_init(&tree->lock);
832 zswap_trees[type] = tree; 830 zswap_trees[type] = tree;
833 return;
834
835freetree:
836 kfree(tree);
837err:
838 pr_err("alloc failed, zswap disabled for swap type %d\n", type);
839} 831}
840 832
841static struct frontswap_ops zswap_frontswap_ops = { 833static struct frontswap_ops zswap_frontswap_ops = {
@@ -907,9 +899,16 @@ static int __init init_zswap(void)
907 return 0; 899 return 0;
908 900
909 pr_info("loading zswap\n"); 901 pr_info("loading zswap\n");
902
903 zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
904 if (!zswap_pool) {
905 pr_err("zbud pool creation failed\n");
906 goto error;
907 }
908
910 if (zswap_entry_cache_create()) { 909 if (zswap_entry_cache_create()) {
911 pr_err("entry cache creation failed\n"); 910 pr_err("entry cache creation failed\n");
912 goto error; 911 goto cachefail;
913 } 912 }
914 if (zswap_comp_init()) { 913 if (zswap_comp_init()) {
915 pr_err("compressor initialization failed\n"); 914 pr_err("compressor initialization failed\n");
@@ -919,6 +918,7 @@ static int __init init_zswap(void)
919 pr_err("per-cpu initialization failed\n"); 918 pr_err("per-cpu initialization failed\n");
920 goto pcpufail; 919 goto pcpufail;
921 } 920 }
921
922 frontswap_register_ops(&zswap_frontswap_ops); 922 frontswap_register_ops(&zswap_frontswap_ops);
923 if (zswap_debugfs_init()) 923 if (zswap_debugfs_init())
924 pr_warn("debugfs initialization failed\n"); 924 pr_warn("debugfs initialization failed\n");
@@ -927,6 +927,8 @@ pcpufail:
927 zswap_comp_exit(); 927 zswap_comp_exit();
928compfail: 928compfail:
929 zswap_entry_cache_destory(); 929 zswap_entry_cache_destory();
930cachefail:
931 zbud_destroy_pool(zswap_pool);
930error: 932error:
931 return -ENOMEM; 933 return -ENOMEM;
932} 934}
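The zswap rework above moves the zbud pool out of the per-swap-type tree into a single pool created in init_zswap(), which in turn gains another unwind label (cachefail). A userspace sketch of that goto-ladder initialisation pattern, tearing resources down in reverse order of setup (the resource names are placeholders):

#include <stdio.h>
#include <stdlib.h>

static void *pool, *entry_cache, *compressor;

static void *create(const char *what, int fail)
{
    if (fail) {
        fprintf(stderr, "%s creation failed\n", what);
        return NULL;
    }
    printf("%s ready\n", what);
    return malloc(1);
}

/* Each later failure unwinds everything set up before it, newest first. */
static int init_like_zswap(void)
{
    pool = create("pool", 0);
    if (!pool)
        goto error;
    entry_cache = create("entry cache", 0);
    if (!entry_cache)
        goto cachefail;
    compressor = create("compressor", 1);       /* force a failure for the demo */
    if (!compressor)
        goto compfail;
    return 0;

compfail:
    free(entry_cache);
cachefail:
    free(pool);
error:
    return -1;
}

int main(void)
{
    return init_like_zswap() ? 1 : 0;
}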