about summary refs log tree commit diff stats
path: root/mm
diff options
context:
space:
mode:
author Dmitry Torokhov <dmitry.torokhov@gmail.com> 2011-05-24 03:06:26 -0400
committer Dmitry Torokhov <dmitry.torokhov@gmail.com> 2011-05-24 03:06:26 -0400
commit b73077eb03f510a84b102fb97640e595a958403c (patch)
tree 8b639000418e2756bf6baece4e00e07d2534bccc /mm
parent 28350e330cfab46b60a1dbf763b678d859f9f3d9 (diff)
parent 9d2e173644bb5c42ff1b280fbdda3f195a7cf1f7 (diff)
Merge branch 'next' into for-linus
Diffstat (limited to 'mm')
-rw-r--r-- mm/Kconfig.debug 25
-rw-r--r-- mm/Makefile 8
-rw-r--r-- mm/backing-dev.c 18
-rw-r--r-- mm/bootmem.c 188
-rw-r--r-- mm/compaction.c 65
-rw-r--r-- mm/filemap.c 211
-rw-r--r-- mm/huge_memory.c 137
-rw-r--r-- mm/hugetlb.c 16
-rw-r--r-- mm/hwpoison-inject.c 2
-rw-r--r-- mm/internal.h 7
-rw-r--r-- mm/kmemleak.c 6
-rw-r--r-- mm/ksm.c 25
-rw-r--r-- mm/memblock.c 243
-rw-r--r-- mm/memcontrol.c 692
-rw-r--r-- mm/memory-failure.c 140
-rw-r--r-- mm/memory.c 209
-rw-r--r-- mm/memory_hotplug.c 4
-rw-r--r-- mm/mempolicy.c 21
-rw-r--r-- mm/migrate.c 71
-rw-r--r-- mm/mlock.c 24
-rw-r--r-- mm/mmap.c 15
-rw-r--r-- mm/mremap.c 15
-rw-r--r-- mm/nobootmem.c 427
-rw-r--r-- mm/nommu.c 58
-rw-r--r-- mm/oom_kill.c 89
-rw-r--r-- mm/page-writeback.c 25
-rw-r--r-- mm/page_alloc.c 175
-rw-r--r-- mm/page_cgroup.c 140
-rw-r--r-- mm/page_io.c 2
-rw-r--r-- mm/pagewalk.c 24
-rw-r--r-- mm/percpu.c 13
-rw-r--r-- mm/readahead.c 18
-rw-r--r-- mm/rmap.c 139
-rw-r--r-- mm/shmem.c 26
-rw-r--r-- mm/slab.c 61
-rw-r--r-- mm/slob.c 6
-rw-r--r-- mm/slub.c 376
-rw-r--r-- mm/sparse.c 2
-rw-r--r-- mm/swap.c 189
-rw-r--r-- mm/swap_state.c 5
-rw-r--r-- mm/swapfile.c 411
-rw-r--r-- mm/truncate.c 24
-rw-r--r-- mm/util.c 2
-rw-r--r-- mm/vmalloc.c 158
-rw-r--r-- mm/vmscan.c 102
-rw-r--r-- mm/vmstat.c 27
46 files changed, 2914 insertions, 1727 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index af7cfb43d2f0..8b1a477162dc 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,27 +1,24 @@
1config DEBUG_PAGEALLOC 1config DEBUG_PAGEALLOC
2 bool "Debug page memory allocations" 2 bool "Debug page memory allocations"
3 depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC 3 depends on DEBUG_KERNEL
4 depends on !HIBERNATION || !PPC && !SPARC 4 depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
5 depends on !KMEMCHECK 5 depends on !KMEMCHECK
6 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
6 ---help--- 7 ---help---
7 Unmap pages from the kernel linear mapping after free_pages(). 8 Unmap pages from the kernel linear mapping after free_pages().
8 This results in a large slowdown, but helps to find certain types 9 This results in a large slowdown, but helps to find certain types
9 of memory corruption. 10 of memory corruption.
10 11
12 For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
13 fill the pages with poison patterns after free_pages() and verify
14 the patterns before alloc_pages(). Additionally,
15 this option cannot be enabled in combination with hibernation as
16 that would result in incorrect warnings of memory corruption after
17 a resume because free pages are not saved to the suspend image.
18
11config WANT_PAGE_DEBUG_FLAGS 19config WANT_PAGE_DEBUG_FLAGS
12 bool 20 bool
13 21
14config PAGE_POISONING 22config PAGE_POISONING
15 bool "Debug page memory allocations" 23 bool
16 depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC
17 depends on !HIBERNATION
18 select DEBUG_PAGEALLOC
19 select WANT_PAGE_DEBUG_FLAGS 24 select WANT_PAGE_DEBUG_FLAGS
20 ---help---
21 Fill the pages with poison patterns after free_pages() and verify
22 the patterns before alloc_pages(). This results in a large slowdown,
23 but helps to find certain types of memory corruption.
24
25 This option cannot be enabled in combination with hibernation as
26 that would result in incorrect warnings of memory corruption after
27 a resume because free pages are not saved to the suspend image.
diff --git a/mm/Makefile b/mm/Makefile
index 2b1b575ae712..42a8326c3e3d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -7,7 +7,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o pgtable-generic.o 8 vmalloc.o pagewalk.o pgtable-generic.o
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o \ 11 maccess.o page_alloc.o page-writeback.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
@@ -15,6 +15,12 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
15 $(mmu-y) 15 $(mmu-y)
16obj-y += init-mm.o 16obj-y += init-mm.o
17 17
18ifdef CONFIG_NO_BOOTMEM
19 obj-y += nobootmem.o
20else
21 obj-y += bootmem.o
22endif
23
18obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 24obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
19 25
20obj-$(CONFIG_BOUNCE) += bounce.o 26obj-$(CONFIG_BOUNCE) += bounce.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 027100d30227..befc87531e4f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,17 +14,11 @@
14 14
15static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); 15static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
16 16
17void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
18{
19}
20EXPORT_SYMBOL(default_unplug_io_fn);
21
22struct backing_dev_info default_backing_dev_info = { 17struct backing_dev_info default_backing_dev_info = {
23 .name = "default", 18 .name = "default",
24 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, 19 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
25 .state = 0, 20 .state = 0,
26 .capabilities = BDI_CAP_MAP_COPY, 21 .capabilities = BDI_CAP_MAP_COPY,
27 .unplug_io_fn = default_unplug_io_fn,
28}; 22};
29EXPORT_SYMBOL_GPL(default_backing_dev_info); 23EXPORT_SYMBOL_GPL(default_backing_dev_info);
30 24
@@ -73,14 +67,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
73 struct inode *inode; 67 struct inode *inode;
74 68
75 nr_wb = nr_dirty = nr_io = nr_more_io = 0; 69 nr_wb = nr_dirty = nr_io = nr_more_io = 0;
76 spin_lock(&inode_lock); 70 spin_lock(&inode_wb_list_lock);
77 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
78 nr_dirty++; 72 nr_dirty++;
79 list_for_each_entry(inode, &wb->b_io, i_wb_list) 73 list_for_each_entry(inode, &wb->b_io, i_wb_list)
80 nr_io++; 74 nr_io++;
81 list_for_each_entry(inode, &wb->b_more_io, i_wb_list) 75 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
82 nr_more_io++; 76 nr_more_io++;
83 spin_unlock(&inode_lock); 77 spin_unlock(&inode_wb_list_lock);
84 78
85 global_dirty_limits(&background_thresh, &dirty_thresh); 79 global_dirty_limits(&background_thresh, &dirty_thresh);
86 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 80 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
@@ -604,7 +598,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
604 spin_lock(&sb_lock); 598 spin_lock(&sb_lock);
605 list_for_each_entry(sb, &super_blocks, s_list) { 599 list_for_each_entry(sb, &super_blocks, s_list) {
606 if (sb->s_bdi == bdi) 600 if (sb->s_bdi == bdi)
607 sb->s_bdi = NULL; 601 sb->s_bdi = &default_backing_dev_info;
608 } 602 }
609 spin_unlock(&sb_lock); 603 spin_unlock(&sb_lock);
610} 604}
@@ -682,11 +676,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
682 if (bdi_has_dirty_io(bdi)) { 676 if (bdi_has_dirty_io(bdi)) {
683 struct bdi_writeback *dst = &default_backing_dev_info.wb; 677 struct bdi_writeback *dst = &default_backing_dev_info.wb;
684 678
685 spin_lock(&inode_lock); 679 spin_lock(&inode_wb_list_lock);
686 list_splice(&bdi->wb.b_dirty, &dst->b_dirty); 680 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
687 list_splice(&bdi->wb.b_io, &dst->b_io); 681 list_splice(&bdi->wb.b_io, &dst->b_io);
688 list_splice(&bdi->wb.b_more_io, &dst->b_more_io); 682 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
689 spin_unlock(&inode_lock); 683 spin_unlock(&inode_wb_list_lock);
690 } 684 }
691 685
692 bdi_unregister(bdi); 686 bdi_unregister(bdi);
@@ -793,7 +787,7 @@ EXPORT_SYMBOL(congestion_wait);
793 * jiffies for either a BDI to exit congestion of the given @sync queue 787 * jiffies for either a BDI to exit congestion of the given @sync queue
794 * or a write to complete. 788 * or a write to complete.
795 * 789 *
796 * In the absense of zone congestion, cond_resched() is called to yield 790 * In the absence of zone congestion, cond_resched() is called to yield
797 * the processor if necessary but otherwise does not sleep. 791 * the processor if necessary but otherwise does not sleep.
798 * 792 *
799 * The return value is 0 if the sleep is for the full timeout. Otherwise, 793 * The return value is 0 if the sleep is for the full timeout. Otherwise,
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 13b0caa9793c..01d5a4b3dd0c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -23,19 +23,17 @@
23 23
24#include "internal.h" 24#include "internal.h"
25 25
26#ifndef CONFIG_NEED_MULTIPLE_NODES
27struct pglist_data __refdata contig_page_data = {
28 .bdata = &bootmem_node_data[0]
29};
30EXPORT_SYMBOL(contig_page_data);
31#endif
32
26unsigned long max_low_pfn; 33unsigned long max_low_pfn;
27unsigned long min_low_pfn; 34unsigned long min_low_pfn;
28unsigned long max_pfn; 35unsigned long max_pfn;
29 36
30#ifdef CONFIG_CRASH_DUMP
31/*
32 * If we have booted due to a crash, max_pfn will be a very low value. We need
33 * to know the amount of memory that the previous kernel used.
34 */
35unsigned long saved_max_pfn;
36#endif
37
38#ifndef CONFIG_NO_BOOTMEM
39bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; 37bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
40 38
41static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); 39static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -146,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
146 min_low_pfn = start; 144 min_low_pfn = start;
147 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); 145 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
148} 146}
149#endif 147
150/* 148/*
151 * free_bootmem_late - free bootmem pages directly to page allocator 149 * free_bootmem_late - free bootmem pages directly to page allocator
152 * @addr: starting address of the range 150 * @addr: starting address of the range
@@ -171,53 +169,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
171 } 169 }
172} 170}
173 171
174#ifdef CONFIG_NO_BOOTMEM
175static void __init __free_pages_memory(unsigned long start, unsigned long end)
176{
177 int i;
178 unsigned long start_aligned, end_aligned;
179 int order = ilog2(BITS_PER_LONG);
180
181 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
182 end_aligned = end & ~(BITS_PER_LONG - 1);
183
184 if (end_aligned <= start_aligned) {
185 for (i = start; i < end; i++)
186 __free_pages_bootmem(pfn_to_page(i), 0);
187
188 return;
189 }
190
191 for (i = start; i < start_aligned; i++)
192 __free_pages_bootmem(pfn_to_page(i), 0);
193
194 for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
195 __free_pages_bootmem(pfn_to_page(i), order);
196
197 for (i = end_aligned; i < end; i++)
198 __free_pages_bootmem(pfn_to_page(i), 0);
199}
200
201unsigned long __init free_all_memory_core_early(int nodeid)
202{
203 int i;
204 u64 start, end;
205 unsigned long count = 0;
206 struct range *range = NULL;
207 int nr_range;
208
209 nr_range = get_free_all_memory_range(&range, nodeid);
210
211 for (i = 0; i < nr_range; i++) {
212 start = range[i].start;
213 end = range[i].end;
214 count += end - start;
215 __free_pages_memory(start, end);
216 }
217
218 return count;
219}
220#else
221static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) 172static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
222{ 173{
223 int aligned; 174 int aligned;
@@ -278,7 +229,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
278 229
279 return count; 230 return count;
280} 231}
281#endif
282 232
283/** 233/**
284 * free_all_bootmem_node - release a node's free pages to the buddy allocator 234 * free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -289,12 +239,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
289unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 239unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
290{ 240{
291 register_page_bootmem_info_node(pgdat); 241 register_page_bootmem_info_node(pgdat);
292#ifdef CONFIG_NO_BOOTMEM
293 /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
294 return 0;
295#else
296 return free_all_bootmem_core(pgdat->bdata); 242 return free_all_bootmem_core(pgdat->bdata);
297#endif
298} 243}
299 244
300/** 245/**
@@ -304,16 +249,6 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
304 */ 249 */
305unsigned long __init free_all_bootmem(void) 250unsigned long __init free_all_bootmem(void)
306{ 251{
307#ifdef CONFIG_NO_BOOTMEM
308 /*
309 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
310 * because in some case like Node0 doesnt have RAM installed
311 * low ram will be on Node1
312 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
313 * will be used instead of only Node0 related
314 */
315 return free_all_memory_core_early(MAX_NUMNODES);
316#else
317 unsigned long total_pages = 0; 252 unsigned long total_pages = 0;
318 bootmem_data_t *bdata; 253 bootmem_data_t *bdata;
319 254
@@ -321,10 +256,8 @@ unsigned long __init free_all_bootmem(void)
321 total_pages += free_all_bootmem_core(bdata); 256 total_pages += free_all_bootmem_core(bdata);
322 257
323 return total_pages; 258 return total_pages;
324#endif
325} 259}
326 260
327#ifndef CONFIG_NO_BOOTMEM
328static void __init __free(bootmem_data_t *bdata, 261static void __init __free(bootmem_data_t *bdata,
329 unsigned long sidx, unsigned long eidx) 262 unsigned long sidx, unsigned long eidx)
330{ 263{
@@ -419,7 +352,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
419 } 352 }
420 BUG(); 353 BUG();
421} 354}
422#endif
423 355
424/** 356/**
425 * free_bootmem_node - mark a page range as usable 357 * free_bootmem_node - mark a page range as usable
@@ -434,10 +366,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
434void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 366void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
435 unsigned long size) 367 unsigned long size)
436{ 368{
437#ifdef CONFIG_NO_BOOTMEM
438 kmemleak_free_part(__va(physaddr), size);
439 memblock_x86_free_range(physaddr, physaddr + size);
440#else
441 unsigned long start, end; 369 unsigned long start, end;
442 370
443 kmemleak_free_part(__va(physaddr), size); 371 kmemleak_free_part(__va(physaddr), size);
@@ -446,7 +374,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
446 end = PFN_DOWN(physaddr + size); 374 end = PFN_DOWN(physaddr + size);
447 375
448 mark_bootmem_node(pgdat->bdata, start, end, 0, 0); 376 mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
449#endif
450} 377}
451 378
452/** 379/**
@@ -460,10 +387,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
460 */ 387 */
461void __init free_bootmem(unsigned long addr, unsigned long size) 388void __init free_bootmem(unsigned long addr, unsigned long size)
462{ 389{
463#ifdef CONFIG_NO_BOOTMEM
464 kmemleak_free_part(__va(addr), size);
465 memblock_x86_free_range(addr, addr + size);
466#else
467 unsigned long start, end; 390 unsigned long start, end;
468 391
469 kmemleak_free_part(__va(addr), size); 392 kmemleak_free_part(__va(addr), size);
@@ -472,7 +395,6 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
472 end = PFN_DOWN(addr + size); 395 end = PFN_DOWN(addr + size);
473 396
474 mark_bootmem(start, end, 0, 0); 397 mark_bootmem(start, end, 0, 0);
475#endif
476} 398}
477 399
478/** 400/**
@@ -489,17 +411,12 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
489int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 411int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
490 unsigned long size, int flags) 412 unsigned long size, int flags)
491{ 413{
492#ifdef CONFIG_NO_BOOTMEM
493 panic("no bootmem");
494 return 0;
495#else
496 unsigned long start, end; 414 unsigned long start, end;
497 415
498 start = PFN_DOWN(physaddr); 416 start = PFN_DOWN(physaddr);
499 end = PFN_UP(physaddr + size); 417 end = PFN_UP(physaddr + size);
500 418
501 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); 419 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
502#endif
503} 420}
504 421
505/** 422/**
@@ -515,20 +432,14 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
515int __init reserve_bootmem(unsigned long addr, unsigned long size, 432int __init reserve_bootmem(unsigned long addr, unsigned long size,
516 int flags) 433 int flags)
517{ 434{
518#ifdef CONFIG_NO_BOOTMEM
519 panic("no bootmem");
520 return 0;
521#else
522 unsigned long start, end; 435 unsigned long start, end;
523 436
524 start = PFN_DOWN(addr); 437 start = PFN_DOWN(addr);
525 end = PFN_UP(addr + size); 438 end = PFN_UP(addr + size);
526 439
527 return mark_bootmem(start, end, 1, flags); 440 return mark_bootmem(start, end, 1, flags);
528#endif
529} 441}
530 442
531#ifndef CONFIG_NO_BOOTMEM
532int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, 443int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
533 int flags) 444 int flags)
534{ 445{
@@ -685,33 +596,12 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
685#endif 596#endif
686 return NULL; 597 return NULL;
687} 598}
688#endif
689 599
690static void * __init ___alloc_bootmem_nopanic(unsigned long size, 600static void * __init ___alloc_bootmem_nopanic(unsigned long size,
691 unsigned long align, 601 unsigned long align,
692 unsigned long goal, 602 unsigned long goal,
693 unsigned long limit) 603 unsigned long limit)
694{ 604{
695#ifdef CONFIG_NO_BOOTMEM
696 void *ptr;
697
698 if (WARN_ON_ONCE(slab_is_available()))
699 return kzalloc(size, GFP_NOWAIT);
700
701restart:
702
703 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
704
705 if (ptr)
706 return ptr;
707
708 if (goal != 0) {
709 goal = 0;
710 goto restart;
711 }
712
713 return NULL;
714#else
715 bootmem_data_t *bdata; 605 bootmem_data_t *bdata;
716 void *region; 606 void *region;
717 607
@@ -737,7 +627,6 @@ restart:
737 } 627 }
738 628
739 return NULL; 629 return NULL;
740#endif
741} 630}
742 631
743/** 632/**
@@ -758,10 +647,6 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
758{ 647{
759 unsigned long limit = 0; 648 unsigned long limit = 0;
760 649
761#ifdef CONFIG_NO_BOOTMEM
762 limit = -1UL;
763#endif
764
765 return ___alloc_bootmem_nopanic(size, align, goal, limit); 650 return ___alloc_bootmem_nopanic(size, align, goal, limit);
766} 651}
767 652
@@ -798,14 +683,9 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
798{ 683{
799 unsigned long limit = 0; 684 unsigned long limit = 0;
800 685
801#ifdef CONFIG_NO_BOOTMEM
802 limit = -1UL;
803#endif
804
805 return ___alloc_bootmem(size, align, goal, limit); 686 return ___alloc_bootmem(size, align, goal, limit);
806} 687}
807 688
808#ifndef CONFIG_NO_BOOTMEM
809static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, 689static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
810 unsigned long size, unsigned long align, 690 unsigned long size, unsigned long align,
811 unsigned long goal, unsigned long limit) 691 unsigned long goal, unsigned long limit)
@@ -822,7 +702,6 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
822 702
823 return ___alloc_bootmem(size, align, goal, limit); 703 return ___alloc_bootmem(size, align, goal, limit);
824} 704}
825#endif
826 705
827/** 706/**
828 * __alloc_bootmem_node - allocate boot memory from a specific node 707 * __alloc_bootmem_node - allocate boot memory from a specific node
@@ -842,24 +721,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
842void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 721void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
843 unsigned long align, unsigned long goal) 722 unsigned long align, unsigned long goal)
844{ 723{
845 void *ptr;
846
847 if (WARN_ON_ONCE(slab_is_available())) 724 if (WARN_ON_ONCE(slab_is_available()))
848 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 725 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
849 726
850#ifdef CONFIG_NO_BOOTMEM 727 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
851 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
852 goal, -1ULL);
853 if (ptr)
854 return ptr;
855
856 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
857 goal, -1ULL);
858#else
859 ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
860#endif
861
862 return ptr;
863} 728}
864 729
865void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, 730void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -880,13 +745,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
880 unsigned long new_goal; 745 unsigned long new_goal;
881 746
882 new_goal = MAX_DMA32_PFN << PAGE_SHIFT; 747 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
883#ifdef CONFIG_NO_BOOTMEM
884 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
885 new_goal, -1ULL);
886#else
887 ptr = alloc_bootmem_core(pgdat->bdata, size, align, 748 ptr = alloc_bootmem_core(pgdat->bdata, size, align,
888 new_goal, 0); 749 new_goal, 0);
889#endif
890 if (ptr) 750 if (ptr)
891 return ptr; 751 return ptr;
892 } 752 }
@@ -907,16 +767,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
907void * __init alloc_bootmem_section(unsigned long size, 767void * __init alloc_bootmem_section(unsigned long size,
908 unsigned long section_nr) 768 unsigned long section_nr)
909{ 769{
910#ifdef CONFIG_NO_BOOTMEM
911 unsigned long pfn, goal, limit;
912
913 pfn = section_nr_to_pfn(section_nr);
914 goal = pfn << PAGE_SHIFT;
915 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
916
917 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
918 SMP_CACHE_BYTES, goal, limit);
919#else
920 bootmem_data_t *bdata; 770 bootmem_data_t *bdata;
921 unsigned long pfn, goal, limit; 771 unsigned long pfn, goal, limit;
922 772
@@ -926,7 +776,6 @@ void * __init alloc_bootmem_section(unsigned long size,
926 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; 776 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
927 777
928 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); 778 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
929#endif
930} 779}
931#endif 780#endif
932 781
@@ -938,16 +787,11 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
938 if (WARN_ON_ONCE(slab_is_available())) 787 if (WARN_ON_ONCE(slab_is_available()))
939 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 788 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
940 789
941#ifdef CONFIG_NO_BOOTMEM
942 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
943 goal, -1ULL);
944#else
945 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); 790 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
946 if (ptr) 791 if (ptr)
947 return ptr; 792 return ptr;
948 793
949 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 794 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
950#endif
951 if (ptr) 795 if (ptr)
952 return ptr; 796 return ptr;
953 797
@@ -995,21 +839,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
995void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 839void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
996 unsigned long align, unsigned long goal) 840 unsigned long align, unsigned long goal)
997{ 841{
998 void *ptr;
999
1000 if (WARN_ON_ONCE(slab_is_available())) 842 if (WARN_ON_ONCE(slab_is_available()))
1001 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 843 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
1002 844
1003#ifdef CONFIG_NO_BOOTMEM 845 return ___alloc_bootmem_node(pgdat->bdata, size, align,
1004 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
1005 goal, ARCH_LOW_ADDRESS_LIMIT); 846 goal, ARCH_LOW_ADDRESS_LIMIT);
1006 if (ptr)
1007 return ptr;
1008 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
1009 goal, ARCH_LOW_ADDRESS_LIMIT);
1010#else
1011 ptr = ___alloc_bootmem_node(pgdat->bdata, size, align,
1012 goal, ARCH_LOW_ADDRESS_LIMIT);
1013#endif
1014 return ptr;
1015} 847}
diff --git a/mm/compaction.c b/mm/compaction.c
index 8be430b812de..021a2960ef9e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -42,8 +42,6 @@ struct compact_control {
42 unsigned int order; /* order a direct compactor needs */ 42 unsigned int order; /* order a direct compactor needs */
43 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 43 int migratetype; /* MOVABLE, RECLAIMABLE etc */
44 struct zone *zone; 44 struct zone *zone;
45
46 int compact_mode;
47}; 45};
48 46
49static unsigned long release_freepages(struct list_head *freelist) 47static unsigned long release_freepages(struct list_head *freelist)
@@ -155,7 +153,6 @@ static void isolate_freepages(struct zone *zone,
155 * pages on cc->migratepages. We stop searching if the migrate 153 * pages on cc->migratepages. We stop searching if the migrate
156 * and free page scanners meet or enough free pages are isolated. 154 * and free page scanners meet or enough free pages are isolated.
157 */ 155 */
158 spin_lock_irqsave(&zone->lock, flags);
159 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; 156 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
160 pfn -= pageblock_nr_pages) { 157 pfn -= pageblock_nr_pages) {
161 unsigned long isolated; 158 unsigned long isolated;
@@ -178,9 +175,19 @@ static void isolate_freepages(struct zone *zone,
178 if (!suitable_migration_target(page)) 175 if (!suitable_migration_target(page))
179 continue; 176 continue;
180 177
181 /* Found a block suitable for isolating free pages from */ 178 /*
182 isolated = isolate_freepages_block(zone, pfn, freelist); 179 * Found a block suitable for isolating free pages from. Now
183 nr_freepages += isolated; 180 * we disabled interrupts, double check things are ok and
181 * isolate the pages. This is to minimise the time IRQs
182 * are disabled
183 */
184 isolated = 0;
185 spin_lock_irqsave(&zone->lock, flags);
186 if (suitable_migration_target(page)) {
187 isolated = isolate_freepages_block(zone, pfn, freelist);
188 nr_freepages += isolated;
189 }
190 spin_unlock_irqrestore(&zone->lock, flags);
184 191
185 /* 192 /*
186 * Record the highest PFN we isolated pages from. When next 193 * Record the highest PFN we isolated pages from. When next
@@ -190,7 +197,6 @@ static void isolate_freepages(struct zone *zone,
190 if (isolated) 197 if (isolated)
191 high_pfn = max(high_pfn, pfn); 198 high_pfn = max(high_pfn, pfn);
192 } 199 }
193 spin_unlock_irqrestore(&zone->lock, flags);
194 200
195 /* split_free_page does not map the pages */ 201 /* split_free_page does not map the pages */
196 list_for_each_entry(page, freelist, lru) { 202 list_for_each_entry(page, freelist, lru) {
@@ -271,9 +277,27 @@ static unsigned long isolate_migratepages(struct zone *zone,
271 } 277 }
272 278
273 /* Time to isolate some pages for migration */ 279 /* Time to isolate some pages for migration */
280 cond_resched();
274 spin_lock_irq(&zone->lru_lock); 281 spin_lock_irq(&zone->lru_lock);
275 for (; low_pfn < end_pfn; low_pfn++) { 282 for (; low_pfn < end_pfn; low_pfn++) {
276 struct page *page; 283 struct page *page;
284 bool locked = true;
285
286 /* give a chance to irqs before checking need_resched() */
287 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
288 spin_unlock_irq(&zone->lru_lock);
289 locked = false;
290 }
291 if (need_resched() || spin_is_contended(&zone->lru_lock)) {
292 if (locked)
293 spin_unlock_irq(&zone->lru_lock);
294 cond_resched();
295 spin_lock_irq(&zone->lru_lock);
296 if (fatal_signal_pending(current))
297 break;
298 } else if (!locked)
299 spin_lock_irq(&zone->lru_lock);
300
277 if (!pfn_valid_within(low_pfn)) 301 if (!pfn_valid_within(low_pfn))
278 continue; 302 continue;
279 nr_scanned++; 303 nr_scanned++;
@@ -397,10 +421,7 @@ static int compact_finished(struct zone *zone,
397 return COMPACT_COMPLETE; 421 return COMPACT_COMPLETE;
398 422
399 /* Compaction run is not finished if the watermark is not met */ 423 /* Compaction run is not finished if the watermark is not met */
400 if (cc->compact_mode != COMPACT_MODE_KSWAPD) 424 watermark = low_wmark_pages(zone);
401 watermark = low_wmark_pages(zone);
402 else
403 watermark = high_wmark_pages(zone);
404 watermark += (1 << cc->order); 425 watermark += (1 << cc->order);
405 426
406 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) 427 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
@@ -413,15 +434,6 @@ static int compact_finished(struct zone *zone,
413 if (cc->order == -1) 434 if (cc->order == -1)
414 return COMPACT_CONTINUE; 435 return COMPACT_CONTINUE;
415 436
416 /*
417 * Generating only one page of the right order is not enough
418 * for kswapd, we must continue until we're above the high
419 * watermark as a pool for high order GFP_ATOMIC allocations
420 * too.
421 */
422 if (cc->compact_mode == COMPACT_MODE_KSWAPD)
423 return COMPACT_CONTINUE;
424
425 /* Direct compactor: Is a suitable page free? */ 437 /* Direct compactor: Is a suitable page free? */
426 for (order = cc->order; order < MAX_ORDER; order++) { 438 for (order = cc->order; order < MAX_ORDER; order++) {
427 /* Job done if page is free of the right migratetype */ 439 /* Job done if page is free of the right migratetype */
@@ -508,12 +520,13 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
508 520
509 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { 521 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
510 unsigned long nr_migrate, nr_remaining; 522 unsigned long nr_migrate, nr_remaining;
523 int err;
511 524
512 if (!isolate_migratepages(zone, cc)) 525 if (!isolate_migratepages(zone, cc))
513 continue; 526 continue;
514 527
515 nr_migrate = cc->nr_migratepages; 528 nr_migrate = cc->nr_migratepages;
516 migrate_pages(&cc->migratepages, compaction_alloc, 529 err = migrate_pages(&cc->migratepages, compaction_alloc,
517 (unsigned long)cc, false, 530 (unsigned long)cc, false,
518 cc->sync); 531 cc->sync);
519 update_nr_listpages(cc); 532 update_nr_listpages(cc);
@@ -527,7 +540,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
527 nr_remaining); 540 nr_remaining);
528 541
529 /* Release LRU pages not migrated */ 542 /* Release LRU pages not migrated */
530 if (!list_empty(&cc->migratepages)) { 543 if (err) {
531 putback_lru_pages(&cc->migratepages); 544 putback_lru_pages(&cc->migratepages);
532 cc->nr_migratepages = 0; 545 cc->nr_migratepages = 0;
533 } 546 }
@@ -543,8 +556,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
543 556
544unsigned long compact_zone_order(struct zone *zone, 557unsigned long compact_zone_order(struct zone *zone,
545 int order, gfp_t gfp_mask, 558 int order, gfp_t gfp_mask,
546 bool sync, 559 bool sync)
547 int compact_mode)
548{ 560{
549 struct compact_control cc = { 561 struct compact_control cc = {
550 .nr_freepages = 0, 562 .nr_freepages = 0,
@@ -553,7 +565,6 @@ unsigned long compact_zone_order(struct zone *zone,
553 .migratetype = allocflags_to_migratetype(gfp_mask), 565 .migratetype = allocflags_to_migratetype(gfp_mask),
554 .zone = zone, 566 .zone = zone,
555 .sync = sync, 567 .sync = sync,
556 .compact_mode = compact_mode,
557 }; 568 };
558 INIT_LIST_HEAD(&cc.freepages); 569 INIT_LIST_HEAD(&cc.freepages);
559 INIT_LIST_HEAD(&cc.migratepages); 570 INIT_LIST_HEAD(&cc.migratepages);
@@ -599,8 +610,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
599 nodemask) { 610 nodemask) {
600 int status; 611 int status;
601 612
602 status = compact_zone_order(zone, order, gfp_mask, sync, 613 status = compact_zone_order(zone, order, gfp_mask, sync);
603 COMPACT_MODE_DIRECT_RECLAIM);
604 rc = max(status, rc); 614 rc = max(status, rc);
605 615
606 /* If a normal allocation would succeed, stop compacting */ 616 /* If a normal allocation would succeed, stop compacting */
@@ -631,7 +641,6 @@ static int compact_node(int nid)
631 .nr_freepages = 0, 641 .nr_freepages = 0,
632 .nr_migratepages = 0, 642 .nr_migratepages = 0,
633 .order = -1, 643 .order = -1,
634 .compact_mode = COMPACT_MODE_DIRECT_RECLAIM,
635 }; 644 };
636 645
637 zone = &pgdat->node_zones[zoneid]; 646 zone = &pgdat->node_zones[zoneid];
diff --git a/mm/filemap.c b/mm/filemap.c
index 83a45d35468b..c641edf553a9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -80,8 +80,8 @@
80 * ->i_mutex 80 * ->i_mutex
81 * ->i_alloc_sem (various) 81 * ->i_alloc_sem (various)
82 * 82 *
83 * ->inode_lock 83 * inode_wb_list_lock
84 * ->sb_lock (fs/fs-writeback.c) 84 * sb_lock (fs/fs-writeback.c)
85 * ->mapping->tree_lock (__sync_single_inode) 85 * ->mapping->tree_lock (__sync_single_inode)
86 * 86 *
87 * ->i_mmap_lock 87 * ->i_mmap_lock
@@ -98,8 +98,10 @@
98 * ->zone.lru_lock (check_pte_range->isolate_lru_page) 98 * ->zone.lru_lock (check_pte_range->isolate_lru_page)
99 * ->private_lock (page_remove_rmap->set_page_dirty) 99 * ->private_lock (page_remove_rmap->set_page_dirty)
100 * ->tree_lock (page_remove_rmap->set_page_dirty) 100 * ->tree_lock (page_remove_rmap->set_page_dirty)
101 * ->inode_lock (page_remove_rmap->set_page_dirty) 101 * inode_wb_list_lock (page_remove_rmap->set_page_dirty)
102 * ->inode_lock (zap_pte_range->set_page_dirty) 102 * ->inode->i_lock (page_remove_rmap->set_page_dirty)
103 * inode_wb_list_lock (zap_pte_range->set_page_dirty)
104 * ->inode->i_lock (zap_pte_range->set_page_dirty)
103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 105 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
104 * 106 *
105 * (code doesn't rely on that order, so you could switch it around) 107 * (code doesn't rely on that order, so you could switch it around)
@@ -108,11 +110,11 @@
108 */ 110 */
109 111
110/* 112/*
111 * Remove a page from the page cache and free it. Caller has to make 113 * Delete a page from the page cache and free it. Caller has to make
112 * sure the page is locked and that nobody else uses it - or that usage 114 * sure the page is locked and that nobody else uses it - or that usage
113 * is safe. The caller must hold the mapping's tree_lock. 115 * is safe. The caller must hold the mapping's tree_lock.
114 */ 116 */
115void __remove_from_page_cache(struct page *page) 117void __delete_from_page_cache(struct page *page)
116{ 118{
117 struct address_space *mapping = page->mapping; 119 struct address_space *mapping = page->mapping;
118 120
@@ -137,7 +139,15 @@ void __remove_from_page_cache(struct page *page)
137 } 139 }
138} 140}
139 141
140void remove_from_page_cache(struct page *page) 142/**
143 * delete_from_page_cache - delete page from page cache
144 * @page: the page which the kernel is trying to remove from page cache
145 *
146 * This must be called only on pages that have been verified to be in the page
147 * cache and locked. It will never put the page into the free list, the caller
148 * has a reference on the page.
149 */
150void delete_from_page_cache(struct page *page)
141{ 151{
142 struct address_space *mapping = page->mapping; 152 struct address_space *mapping = page->mapping;
143 void (*freepage)(struct page *); 153 void (*freepage)(struct page *);
@@ -146,54 +156,25 @@ void remove_from_page_cache(struct page *page)
146 156
147 freepage = mapping->a_ops->freepage; 157 freepage = mapping->a_ops->freepage;
148 spin_lock_irq(&mapping->tree_lock); 158 spin_lock_irq(&mapping->tree_lock);
149 __remove_from_page_cache(page); 159 __delete_from_page_cache(page);
150 spin_unlock_irq(&mapping->tree_lock); 160 spin_unlock_irq(&mapping->tree_lock);
151 mem_cgroup_uncharge_cache_page(page); 161 mem_cgroup_uncharge_cache_page(page);
152 162
153 if (freepage) 163 if (freepage)
154 freepage(page); 164 freepage(page);
165 page_cache_release(page);
155} 166}
156EXPORT_SYMBOL(remove_from_page_cache); 167EXPORT_SYMBOL(delete_from_page_cache);
157 168
158static int sync_page(void *word) 169static int sleep_on_page(void *word)
159{ 170{
160 struct address_space *mapping;
161 struct page *page;
162
163 page = container_of((unsigned long *)word, struct page, flags);
164
165 /*
166 * page_mapping() is being called without PG_locked held.
167 * Some knowledge of the state and use of the page is used to
168 * reduce the requirements down to a memory barrier.
169 * The danger here is of a stale page_mapping() return value
170 * indicating a struct address_space different from the one it's
171 * associated with when it is associated with one.
172 * After smp_mb(), it's either the correct page_mapping() for
173 * the page, or an old page_mapping() and the page's own
174 * page_mapping() has gone NULL.
175 * The ->sync_page() address_space operation must tolerate
176 * page_mapping() going NULL. By an amazing coincidence,
177 * this comes about because none of the users of the page
178 * in the ->sync_page() methods make essential use of the
179 * page_mapping(), merely passing the page down to the backing
180 * device's unplug functions when it's non-NULL, which in turn
181 * ignore it for all cases but swap, where only page_private(page) is
182 * of interest. When page_mapping() does go NULL, the entire
183 * call stack gracefully ignores the page and returns.
184 * -- wli
185 */
186 smp_mb();
187 mapping = page_mapping(page);
188 if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
189 mapping->a_ops->sync_page(page);
190 io_schedule(); 171 io_schedule();
191 return 0; 172 return 0;
192} 173}
193 174
194static int sync_page_killable(void *word) 175static int sleep_on_page_killable(void *word)
195{ 176{
196 sync_page(word); 177 sleep_on_page(word);
197 return fatal_signal_pending(current) ? -EINTR : 0; 178 return fatal_signal_pending(current) ? -EINTR : 0;
198} 179}
199 180
@@ -387,6 +368,76 @@ int filemap_write_and_wait_range(struct address_space *mapping,
387EXPORT_SYMBOL(filemap_write_and_wait_range); 368EXPORT_SYMBOL(filemap_write_and_wait_range);
388 369
389/** 370/**
371 * replace_page_cache_page - replace a pagecache page with a new one
372 * @old: page to be replaced
373 * @new: page to replace with
374 * @gfp_mask: allocation mode
375 *
376 * This function replaces a page in the pagecache with a new one. On
377 * success it acquires the pagecache reference for the new page and
378 * drops it for the old page. Both the old and new pages must be
379 * locked. This function does not add the new page to the LRU, the
380 * caller must do that.
381 *
382 * The remove + add is atomic. The only way this function can fail is
383 * memory allocation failure.
384 */
385int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
386{
387 int error;
388 struct mem_cgroup *memcg = NULL;
389
390 VM_BUG_ON(!PageLocked(old));
391 VM_BUG_ON(!PageLocked(new));
392 VM_BUG_ON(new->mapping);
393
394 /*
395 * This is not page migration, but prepare_migration and
396 * end_migration does enough work for charge replacement.
397 *
398 * In the longer term we probably want a specialized function
399 * for moving the charge from old to new in a more efficient
400 * manner.
401 */
402 error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
403 if (error)
404 return error;
405
406 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
407 if (!error) {
408 struct address_space *mapping = old->mapping;
409 void (*freepage)(struct page *);
410
411 pgoff_t offset = old->index;
412 freepage = mapping->a_ops->freepage;
413
414 page_cache_get(new);
415 new->mapping = mapping;
416 new->index = offset;
417
418 spin_lock_irq(&mapping->tree_lock);
419 __delete_from_page_cache(old);
420 error = radix_tree_insert(&mapping->page_tree, offset, new);
421 BUG_ON(error);
422 mapping->nrpages++;
423 __inc_zone_page_state(new, NR_FILE_PAGES);
424 if (PageSwapBacked(new))
425 __inc_zone_page_state(new, NR_SHMEM);
426 spin_unlock_irq(&mapping->tree_lock);
427 radix_tree_preload_end();
428 if (freepage)
429 freepage(old);
430 page_cache_release(old);
431 mem_cgroup_end_migration(memcg, old, new, true);
432 } else {
433 mem_cgroup_end_migration(memcg, old, new, false);
434 }
435
436 return error;
437}
438EXPORT_SYMBOL_GPL(replace_page_cache_page);
439
440/**
390 * add_to_page_cache_locked - add a locked page to the pagecache 441 * add_to_page_cache_locked - add a locked page to the pagecache
391 * @page: page to add 442 * @page: page to add
392 * @mapping: the page's address_space 443 * @mapping: the page's address_space
@@ -479,12 +530,6 @@ struct page *__page_cache_alloc(gfp_t gfp)
479EXPORT_SYMBOL(__page_cache_alloc); 530EXPORT_SYMBOL(__page_cache_alloc);
480#endif 531#endif
481 532
482static int __sleep_on_page_lock(void *word)
483{
484 io_schedule();
485 return 0;
486}
487
488/* 533/*
489 * In order to wait for pages to become available there must be 534 * In order to wait for pages to become available there must be
490 * waitqueues associated with pages. By using a hash table of 535 * waitqueues associated with pages. By using a hash table of
@@ -512,7 +557,7 @@ void wait_on_page_bit(struct page *page, int bit_nr)
512 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); 557 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
513 558
514 if (test_bit(bit_nr, &page->flags)) 559 if (test_bit(bit_nr, &page->flags))
515 __wait_on_bit(page_waitqueue(page), &wait, sync_page, 560 __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
516 TASK_UNINTERRUPTIBLE); 561 TASK_UNINTERRUPTIBLE);
517} 562}
518EXPORT_SYMBOL(wait_on_page_bit); 563EXPORT_SYMBOL(wait_on_page_bit);
@@ -576,17 +621,12 @@ EXPORT_SYMBOL(end_page_writeback);
576/** 621/**
577 * __lock_page - get a lock on the page, assuming we need to sleep to get it 622 * __lock_page - get a lock on the page, assuming we need to sleep to get it
578 * @page: the page to lock 623 * @page: the page to lock
579 *
580 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
581 * random driver's requestfn sets TASK_RUNNING, we could busywait. However
582 * chances are that on the second loop, the block layer's plug list is empty,
583 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
584 */ 624 */
585void __lock_page(struct page *page) 625void __lock_page(struct page *page)
586{ 626{
587 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 627 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
588 628
589 __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, 629 __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
590 TASK_UNINTERRUPTIBLE); 630 TASK_UNINTERRUPTIBLE);
591} 631}
592EXPORT_SYMBOL(__lock_page); 632EXPORT_SYMBOL(__lock_page);
@@ -596,24 +636,10 @@ int __lock_page_killable(struct page *page)
596 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 636 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
597 637
598 return __wait_on_bit_lock(page_waitqueue(page), &wait, 638 return __wait_on_bit_lock(page_waitqueue(page), &wait,
599 sync_page_killable, TASK_KILLABLE); 639 sleep_on_page_killable, TASK_KILLABLE);
600} 640}
601EXPORT_SYMBOL_GPL(__lock_page_killable); 641EXPORT_SYMBOL_GPL(__lock_page_killable);
602 642
603/**
604 * __lock_page_nosync - get a lock on the page, without calling sync_page()
605 * @page: the page to lock
606 *
607 * Variant of lock_page that does not require the caller to hold a reference
608 * on the page's mapping.
609 */
610void __lock_page_nosync(struct page *page)
611{
612 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
613 __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
614 TASK_UNINTERRUPTIBLE);
615}
616
617int __lock_page_or_retry(struct page *page, struct mm_struct *mm, 643int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
618 unsigned int flags) 644 unsigned int flags)
619{ 645{
@@ -621,8 +647,10 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
621 __lock_page(page); 647 __lock_page(page);
622 return 1; 648 return 1;
623 } else { 649 } else {
624 up_read(&mm->mmap_sem); 650 if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
625 wait_on_page_locked(page); 651 up_read(&mm->mmap_sem);
652 wait_on_page_locked(page);
653 }
626 return 0; 654 return 0;
627 } 655 }
628} 656}
@@ -782,9 +810,13 @@ repeat:
782 page = radix_tree_deref_slot((void **)pages[i]); 810 page = radix_tree_deref_slot((void **)pages[i]);
783 if (unlikely(!page)) 811 if (unlikely(!page))
784 continue; 812 continue;
813
814 /*
815 * This can only trigger when the entry at index 0 moves out
816 * of or back to the root: none yet gotten, safe to restart.
817 */
785 if (radix_tree_deref_retry(page)) { 818 if (radix_tree_deref_retry(page)) {
786 if (ret) 819 WARN_ON(start | i);
787 start = pages[ret-1]->index;
788 goto restart; 820 goto restart;
789 } 821 }
790 822
@@ -800,6 +832,13 @@ repeat:
800 pages[ret] = page; 832 pages[ret] = page;
801 ret++; 833 ret++;
802 } 834 }
835
836 /*
837 * If all entries were removed before we could secure them,
838 * try again, because callers stop trying once 0 is returned.
839 */
840 if (unlikely(!ret && nr_found))
841 goto restart;
803 rcu_read_unlock(); 842 rcu_read_unlock();
804 return ret; 843 return ret;
805} 844}
@@ -834,6 +873,11 @@ repeat:
834 page = radix_tree_deref_slot((void **)pages[i]); 873 page = radix_tree_deref_slot((void **)pages[i]);
835 if (unlikely(!page)) 874 if (unlikely(!page))
836 continue; 875 continue;
876
877 /*
878 * This can only trigger when the entry at index 0 moves out
879 * of or back to the root: none yet gotten, safe to restart.
880 */
837 if (radix_tree_deref_retry(page)) 881 if (radix_tree_deref_retry(page))
838 goto restart; 882 goto restart;
839 883
@@ -894,6 +938,11 @@ repeat:
894 page = radix_tree_deref_slot((void **)pages[i]); 938 page = radix_tree_deref_slot((void **)pages[i]);
895 if (unlikely(!page)) 939 if (unlikely(!page))
896 continue; 940 continue;
941
942 /*
943 * This can only trigger when the entry at index 0 moves out
944 * of or back to the root: none yet gotten, safe to restart.
945 */
897 if (radix_tree_deref_retry(page)) 946 if (radix_tree_deref_retry(page))
898 goto restart; 947 goto restart;
899 948
@@ -909,6 +958,13 @@ repeat:
909 pages[ret] = page; 958 pages[ret] = page;
910 ret++; 959 ret++;
911 } 960 }
961
962 /*
963 * If all entries were removed before we could secure them,
964 * try again, because callers stop trying once 0 is returned.
965 */
966 if (unlikely(!ret && nr_found))
967 goto restart;
912 rcu_read_unlock(); 968 rcu_read_unlock();
913 969
914 if (ret) 970 if (ret)
@@ -1298,12 +1354,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1298 unsigned long seg = 0; 1354 unsigned long seg = 0;
1299 size_t count; 1355 size_t count;
1300 loff_t *ppos = &iocb->ki_pos; 1356 loff_t *ppos = &iocb->ki_pos;
1357 struct blk_plug plug;
1301 1358
1302 count = 0; 1359 count = 0;
1303 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); 1360 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1304 if (retval) 1361 if (retval)
1305 return retval; 1362 return retval;
1306 1363
1364 blk_start_plug(&plug);
1365
1307 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 1366 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1308 if (filp->f_flags & O_DIRECT) { 1367 if (filp->f_flags & O_DIRECT) {
1309 loff_t size; 1368 loff_t size;
@@ -1376,6 +1435,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1376 break; 1435 break;
1377 } 1436 }
1378out: 1437out:
1438 blk_finish_plug(&plug);
1379 return retval; 1439 return retval;
1380} 1440}
1381EXPORT_SYMBOL(generic_file_aio_read); 1441EXPORT_SYMBOL(generic_file_aio_read);
@@ -2487,11 +2547,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2487{ 2547{
2488 struct file *file = iocb->ki_filp; 2548 struct file *file = iocb->ki_filp;
2489 struct inode *inode = file->f_mapping->host; 2549 struct inode *inode = file->f_mapping->host;
2550 struct blk_plug plug;
2490 ssize_t ret; 2551 ssize_t ret;
2491 2552
2492 BUG_ON(iocb->ki_pos != pos); 2553 BUG_ON(iocb->ki_pos != pos);
2493 2554
2494 mutex_lock(&inode->i_mutex); 2555 mutex_lock(&inode->i_mutex);
2556 blk_start_plug(&plug);
2495 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 2557 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2496 mutex_unlock(&inode->i_mutex); 2558 mutex_unlock(&inode->i_mutex);
2497 2559
@@ -2502,6 +2564,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2502 if (err < 0 && ret > 0) 2564 if (err < 0 && ret > 0)
2503 ret = err; 2565 ret = err;
2504 } 2566 }
2567 blk_finish_plug(&plug);
2505 return ret; 2568 return ret;
2506} 2569}
2507EXPORT_SYMBOL(generic_file_aio_write); 2570EXPORT_SYMBOL(generic_file_aio_write);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e187454d82f6..470dcda10add 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -244,24 +244,28 @@ static ssize_t single_flag_show(struct kobject *kobj,
244 struct kobj_attribute *attr, char *buf, 244 struct kobj_attribute *attr, char *buf,
245 enum transparent_hugepage_flag flag) 245 enum transparent_hugepage_flag flag)
246{ 246{
247 if (test_bit(flag, &transparent_hugepage_flags)) 247 return sprintf(buf, "%d\n",
248 return sprintf(buf, "[yes] no\n"); 248 !!test_bit(flag, &transparent_hugepage_flags));
249 else
250 return sprintf(buf, "yes [no]\n");
251} 249}
250
252static ssize_t single_flag_store(struct kobject *kobj, 251static ssize_t single_flag_store(struct kobject *kobj,
253 struct kobj_attribute *attr, 252 struct kobj_attribute *attr,
254 const char *buf, size_t count, 253 const char *buf, size_t count,
255 enum transparent_hugepage_flag flag) 254 enum transparent_hugepage_flag flag)
256{ 255{
257 if (!memcmp("yes", buf, 256 unsigned long value;
258 min(sizeof("yes")-1, count))) { 257 int ret;
258
259 ret = kstrtoul(buf, 10, &value);
260 if (ret < 0)
261 return ret;
262 if (value > 1)
263 return -EINVAL;
264
265 if (value)
259 set_bit(flag, &transparent_hugepage_flags); 266 set_bit(flag, &transparent_hugepage_flags);
260 } else if (!memcmp("no", buf, 267 else
261 min(sizeof("no")-1, count))) {
262 clear_bit(flag, &transparent_hugepage_flags); 268 clear_bit(flag, &transparent_hugepage_flags);
263 } else
264 return -EINVAL;
265 269
266 return count; 270 return count;
267} 271}
@@ -643,23 +647,24 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
643 return ret; 647 return ret;
644} 648}
645 649
646static inline gfp_t alloc_hugepage_gfpmask(int defrag) 650static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
647{ 651{
648 return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT); 652 return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
649} 653}
650 654
651static inline struct page *alloc_hugepage_vma(int defrag, 655static inline struct page *alloc_hugepage_vma(int defrag,
652 struct vm_area_struct *vma, 656 struct vm_area_struct *vma,
653 unsigned long haddr) 657 unsigned long haddr, int nd,
658 gfp_t extra_gfp)
654{ 659{
655 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), 660 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
656 HPAGE_PMD_ORDER, vma, haddr); 661 HPAGE_PMD_ORDER, vma, haddr, nd);
657} 662}
658 663
659#ifndef CONFIG_NUMA 664#ifndef CONFIG_NUMA
660static inline struct page *alloc_hugepage(int defrag) 665static inline struct page *alloc_hugepage(int defrag)
661{ 666{
662 return alloc_pages(alloc_hugepage_gfpmask(defrag), 667 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
663 HPAGE_PMD_ORDER); 668 HPAGE_PMD_ORDER);
664} 669}
665#endif 670#endif
@@ -678,9 +683,12 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
678 if (unlikely(khugepaged_enter(vma))) 683 if (unlikely(khugepaged_enter(vma)))
679 return VM_FAULT_OOM; 684 return VM_FAULT_OOM;
680 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 685 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
681 vma, haddr); 686 vma, haddr, numa_node_id(), 0);
682 if (unlikely(!page)) 687 if (unlikely(!page)) {
688 count_vm_event(THP_FAULT_FALLBACK);
683 goto out; 689 goto out;
690 }
691 count_vm_event(THP_FAULT_ALLOC);
684 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { 692 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
685 put_page(page); 693 put_page(page);
686 goto out; 694 goto out;
@@ -799,8 +807,9 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
799 } 807 }
800 808
801 for (i = 0; i < HPAGE_PMD_NR; i++) { 809 for (i = 0; i < HPAGE_PMD_NR; i++) {
802 pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, 810 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
803 vma, address); 811 __GFP_OTHER_NODE,
812 vma, address, page_to_nid(page));
804 if (unlikely(!pages[i] || 813 if (unlikely(!pages[i] ||
805 mem_cgroup_newpage_charge(pages[i], mm, 814 mem_cgroup_newpage_charge(pages[i], mm,
806 GFP_KERNEL))) { 815 GFP_KERNEL))) {
@@ -902,16 +911,18 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
902 if (transparent_hugepage_enabled(vma) && 911 if (transparent_hugepage_enabled(vma) &&
903 !transparent_hugepage_debug_cow()) 912 !transparent_hugepage_debug_cow())
904 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 913 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
905 vma, haddr); 914 vma, haddr, numa_node_id(), 0);
906 else 915 else
907 new_page = NULL; 916 new_page = NULL;
908 917
909 if (unlikely(!new_page)) { 918 if (unlikely(!new_page)) {
919 count_vm_event(THP_FAULT_FALLBACK);
910 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 920 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
911 pmd, orig_pmd, page, haddr); 921 pmd, orig_pmd, page, haddr);
912 put_page(page); 922 put_page(page);
913 goto out; 923 goto out;
914 } 924 }
925 count_vm_event(THP_FAULT_ALLOC);
915 926
916 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 927 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
917 put_page(new_page); 928 put_page(new_page);
@@ -1162,7 +1173,12 @@ static void __split_huge_page_refcount(struct page *page)
1162 /* after clearing PageTail the gup refcount can be released */ 1173 /* after clearing PageTail the gup refcount can be released */
1163 smp_mb(); 1174 smp_mb();
1164 1175
1165 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 1176 /*
1177 * retain hwpoison flag of the poisoned tail page:
1178 * fix for the unsuitable process killed on Guest Machine(KVM)
1179 * by the memory-failure.
1180 */
1181 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
1166 page_tail->flags |= (page->flags & 1182 page_tail->flags |= (page->flags &
1167 ((1L << PG_referenced) | 1183 ((1L << PG_referenced) |
1168 (1L << PG_swapbacked) | 1184 (1L << PG_swapbacked) |
@@ -1383,6 +1399,7 @@ int split_huge_page(struct page *page)
1383 1399
1384 BUG_ON(!PageSwapBacked(page)); 1400 BUG_ON(!PageSwapBacked(page));
1385 __split_huge_page(page, anon_vma); 1401 __split_huge_page(page, anon_vma);
1402 count_vm_event(THP_SPLIT);
1386 1403
1387 BUG_ON(PageCompound(page)); 1404 BUG_ON(PageCompound(page));
1388out_unlock: 1405out_unlock:
@@ -1740,7 +1757,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1740static void collapse_huge_page(struct mm_struct *mm, 1757static void collapse_huge_page(struct mm_struct *mm,
1741 unsigned long address, 1758 unsigned long address,
1742 struct page **hpage, 1759 struct page **hpage,
1743 struct vm_area_struct *vma) 1760 struct vm_area_struct *vma,
1761 int node)
1744{ 1762{
1745 pgd_t *pgd; 1763 pgd_t *pgd;
1746 pud_t *pud; 1764 pud_t *pud;
@@ -1756,6 +1774,10 @@ static void collapse_huge_page(struct mm_struct *mm,
1756#ifndef CONFIG_NUMA 1774#ifndef CONFIG_NUMA
1757 VM_BUG_ON(!*hpage); 1775 VM_BUG_ON(!*hpage);
1758 new_page = *hpage; 1776 new_page = *hpage;
1777 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1778 up_read(&mm->mmap_sem);
1779 return;
1780 }
1759#else 1781#else
1760 VM_BUG_ON(*hpage); 1782 VM_BUG_ON(*hpage);
1761 /* 1783 /*
@@ -1768,18 +1790,21 @@ static void collapse_huge_page(struct mm_struct *mm,
1768 * mmap_sem in read mode is good idea also to allow greater 1790 * mmap_sem in read mode is good idea also to allow greater
1769 * scalability. 1791 * scalability.
1770 */ 1792 */
1771 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); 1793 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
1794 node, __GFP_OTHER_NODE);
1772 if (unlikely(!new_page)) { 1795 if (unlikely(!new_page)) {
1773 up_read(&mm->mmap_sem); 1796 up_read(&mm->mmap_sem);
1797 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1774 *hpage = ERR_PTR(-ENOMEM); 1798 *hpage = ERR_PTR(-ENOMEM);
1775 return; 1799 return;
1776 } 1800 }
1777#endif 1801 count_vm_event(THP_COLLAPSE_ALLOC);
1778 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1802 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1779 up_read(&mm->mmap_sem); 1803 up_read(&mm->mmap_sem);
1780 put_page(new_page); 1804 put_page(new_page);
1781 return; 1805 return;
1782 } 1806 }
1807#endif
1783 1808
1784 /* after allocating the hugepage upgrade to mmap_sem write mode */ 1809 /* after allocating the hugepage upgrade to mmap_sem write mode */
1785 up_read(&mm->mmap_sem); 1810 up_read(&mm->mmap_sem);
@@ -1806,6 +1831,8 @@ static void collapse_huge_page(struct mm_struct *mm,
1806 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ 1831 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
1807 if (!vma->anon_vma || vma->vm_ops || vma->vm_file) 1832 if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
1808 goto out; 1833 goto out;
1834 if (is_vma_temporary_stack(vma))
1835 goto out;
1809 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); 1836 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1810 1837
1811 pgd = pgd_offset(mm, address); 1838 pgd = pgd_offset(mm, address);
@@ -1847,7 +1874,6 @@ static void collapse_huge_page(struct mm_struct *mm,
1847 set_pmd_at(mm, address, pmd, _pmd); 1874 set_pmd_at(mm, address, pmd, _pmd);
1848 spin_unlock(&mm->page_table_lock); 1875 spin_unlock(&mm->page_table_lock);
1849 anon_vma_unlock(vma->anon_vma); 1876 anon_vma_unlock(vma->anon_vma);
1850 mem_cgroup_uncharge_page(new_page);
1851 goto out; 1877 goto out;
1852 } 1878 }
1853 1879
@@ -1893,6 +1919,7 @@ out_up_write:
1893 return; 1919 return;
1894 1920
1895out: 1921out:
1922 mem_cgroup_uncharge_page(new_page);
1896#ifdef CONFIG_NUMA 1923#ifdef CONFIG_NUMA
1897 put_page(new_page); 1924 put_page(new_page);
1898#endif 1925#endif
@@ -1912,6 +1939,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
1912 struct page *page; 1939 struct page *page;
1913 unsigned long _address; 1940 unsigned long _address;
1914 spinlock_t *ptl; 1941 spinlock_t *ptl;
1942 int node = -1;
1915 1943
1916 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1944 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1917 1945
@@ -1942,6 +1970,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
1942 page = vm_normal_page(vma, _address, pteval); 1970 page = vm_normal_page(vma, _address, pteval);
1943 if (unlikely(!page)) 1971 if (unlikely(!page))
1944 goto out_unmap; 1972 goto out_unmap;
1973 /*
1974 * Chose the node of the first page. This could
1975 * be more sophisticated and look at more pages,
1976 * but isn't for now.
1977 */
1978 if (node == -1)
1979 node = page_to_nid(page);
1945 VM_BUG_ON(PageCompound(page)); 1980 VM_BUG_ON(PageCompound(page));
1946 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 1981 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
1947 goto out_unmap; 1982 goto out_unmap;
@@ -1958,7 +1993,7 @@ out_unmap:
1958 pte_unmap_unlock(pte, ptl); 1993 pte_unmap_unlock(pte, ptl);
1959 if (ret) 1994 if (ret)
1960 /* collapse_huge_page will return with the mmap_sem released */ 1995 /* collapse_huge_page will return with the mmap_sem released */
1961 collapse_huge_page(mm, address, hpage, vma); 1996 collapse_huge_page(mm, address, hpage, vma, node);
1962out: 1997out:
1963 return ret; 1998 return ret;
1964} 1999}
@@ -2027,32 +2062,27 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2027 if ((!(vma->vm_flags & VM_HUGEPAGE) && 2062 if ((!(vma->vm_flags & VM_HUGEPAGE) &&
2028 !khugepaged_always()) || 2063 !khugepaged_always()) ||
2029 (vma->vm_flags & VM_NOHUGEPAGE)) { 2064 (vma->vm_flags & VM_NOHUGEPAGE)) {
2065 skip:
2030 progress++; 2066 progress++;
2031 continue; 2067 continue;
2032 } 2068 }
2033
2034 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ 2069 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
2035 if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { 2070 if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
2036 khugepaged_scan.address = vma->vm_end; 2071 goto skip;
2037 progress++; 2072 if (is_vma_temporary_stack(vma))
2038 continue; 2073 goto skip;
2039 } 2074
2040 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); 2075 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
2041 2076
2042 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2077 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2043 hend = vma->vm_end & HPAGE_PMD_MASK; 2078 hend = vma->vm_end & HPAGE_PMD_MASK;
2044 if (hstart >= hend) { 2079 if (hstart >= hend)
2045 progress++; 2080 goto skip;
2046 continue; 2081 if (khugepaged_scan.address > hend)
2047 } 2082 goto skip;
2048 if (khugepaged_scan.address < hstart) 2083 if (khugepaged_scan.address < hstart)
2049 khugepaged_scan.address = hstart; 2084 khugepaged_scan.address = hstart;
2050 if (khugepaged_scan.address > hend) { 2085 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2051 khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
2052 progress++;
2053 continue;
2054 }
2055 BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2056 2086
2057 while (khugepaged_scan.address < hend) { 2087 while (khugepaged_scan.address < hend) {
2058 int ret; 2088 int ret;
@@ -2081,7 +2111,7 @@ breakouterloop:
2081breakouterloop_mmap_sem: 2111breakouterloop_mmap_sem:
2082 2112
2083 spin_lock(&khugepaged_mm_lock); 2113 spin_lock(&khugepaged_mm_lock);
2084 BUG_ON(khugepaged_scan.mm_slot != mm_slot); 2114 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2085 /* 2115 /*
2086 * Release the current mm_slot if this mm is about to die, or 2116 * Release the current mm_slot if this mm is about to die, or
2087 * if we scanned all vmas of this mm. 2117 * if we scanned all vmas of this mm.
@@ -2133,8 +2163,11 @@ static void khugepaged_do_scan(struct page **hpage)
2133#ifndef CONFIG_NUMA 2163#ifndef CONFIG_NUMA
2134 if (!*hpage) { 2164 if (!*hpage) {
2135 *hpage = alloc_hugepage(khugepaged_defrag()); 2165 *hpage = alloc_hugepage(khugepaged_defrag());
2136 if (unlikely(!*hpage)) 2166 if (unlikely(!*hpage)) {
2167 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2137 break; 2168 break;
2169 }
2170 count_vm_event(THP_COLLAPSE_ALLOC);
2138 } 2171 }
2139#else 2172#else
2140 if (IS_ERR(*hpage)) 2173 if (IS_ERR(*hpage))
@@ -2174,8 +2207,11 @@ static struct page *khugepaged_alloc_hugepage(void)
2174 2207
2175 do { 2208 do {
2176 hpage = alloc_hugepage(khugepaged_defrag()); 2209 hpage = alloc_hugepage(khugepaged_defrag());
2177 if (!hpage) 2210 if (!hpage) {
2211 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2178 khugepaged_alloc_sleep(); 2212 khugepaged_alloc_sleep();
2213 } else
2214 count_vm_event(THP_COLLAPSE_ALLOC);
2179 } while (unlikely(!hpage) && 2215 } while (unlikely(!hpage) &&
2180 likely(khugepaged_enabled())); 2216 likely(khugepaged_enabled()));
2181 return hpage; 2217 return hpage;
@@ -2192,8 +2228,11 @@ static void khugepaged_loop(void)
2192 while (likely(khugepaged_enabled())) { 2228 while (likely(khugepaged_enabled())) {
2193#ifndef CONFIG_NUMA 2229#ifndef CONFIG_NUMA
2194 hpage = khugepaged_alloc_hugepage(); 2230 hpage = khugepaged_alloc_hugepage();
2195 if (unlikely(!hpage)) 2231 if (unlikely(!hpage)) {
2232 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2196 break; 2233 break;
2234 }
2235 count_vm_event(THP_COLLAPSE_ALLOC);
2197#else 2236#else
2198 if (IS_ERR(hpage)) { 2237 if (IS_ERR(hpage)) {
2199 khugepaged_alloc_sleep(); 2238 khugepaged_alloc_sleep();
@@ -2236,9 +2275,9 @@ static int khugepaged(void *none)
2236 2275
2237 for (;;) { 2276 for (;;) {
2238 mutex_unlock(&khugepaged_mutex); 2277 mutex_unlock(&khugepaged_mutex);
2239 BUG_ON(khugepaged_thread != current); 2278 VM_BUG_ON(khugepaged_thread != current);
2240 khugepaged_loop(); 2279 khugepaged_loop();
2241 BUG_ON(khugepaged_thread != current); 2280 VM_BUG_ON(khugepaged_thread != current);
2242 2281
2243 mutex_lock(&khugepaged_mutex); 2282 mutex_lock(&khugepaged_mutex);
2244 if (!khugepaged_enabled()) 2283 if (!khugepaged_enabled())
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bb0b7c128015..8ee3bd8ec5b5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -146,7 +146,7 @@ static long region_chg(struct list_head *head, long f, long t)
146 if (rg->from > t) 146 if (rg->from > t)
147 return chg; 147 return chg;
148 148
149 /* We overlap with this area, if it extends futher than 149 /* We overlap with this area, if it extends further than
150 * us then we must extend ourselves. Account for its 150 * us then we must extend ourselves. Account for its
151 * existing reservation. */ 151 * existing reservation. */
152 if (rg->to > t) { 152 if (rg->to > t) {
@@ -842,7 +842,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
842} 842}
843 843
844/* 844/*
845 * Increase the hugetlb pool such that it can accomodate a reservation 845 * Increase the hugetlb pool such that it can accommodate a reservation
846 * of size 'delta'. 846 * of size 'delta'.
847 */ 847 */
848static int gather_surplus_pages(struct hstate *h, int delta) 848static int gather_surplus_pages(struct hstate *h, int delta)
@@ -890,7 +890,7 @@ retry:
890 890
891 /* 891 /*
892 * The surplus_list now contains _at_least_ the number of extra pages 892 * The surplus_list now contains _at_least_ the number of extra pages
893 * needed to accomodate the reservation. Add the appropriate number 893 * needed to accommodate the reservation. Add the appropriate number
894 * of pages to the hugetlb pool and free the extras back to the buddy 894 * of pages to the hugetlb pool and free the extras back to the buddy
895 * allocator. Commit the entire reservation here to prevent another 895 * allocator. Commit the entire reservation here to prevent another
896 * process from stealing the pages as they are added to the pool but 896 * process from stealing the pages as they are added to the pool but
@@ -1872,8 +1872,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1872 unsigned long tmp; 1872 unsigned long tmp;
1873 int ret; 1873 int ret;
1874 1874
1875 if (!write) 1875 tmp = h->max_huge_pages;
1876 tmp = h->max_huge_pages;
1877 1876
1878 if (write && h->order >= MAX_ORDER) 1877 if (write && h->order >= MAX_ORDER)
1879 return -EINVAL; 1878 return -EINVAL;
@@ -1938,8 +1937,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1938 unsigned long tmp; 1937 unsigned long tmp;
1939 int ret; 1938 int ret;
1940 1939
1941 if (!write) 1940 tmp = h->nr_overcommit_huge_pages;
1942 tmp = h->nr_overcommit_huge_pages;
1943 1941
1944 if (write && h->order >= MAX_ORDER) 1942 if (write && h->order >= MAX_ORDER)
1945 return -EINVAL; 1943 return -EINVAL;
@@ -2045,7 +2043,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2045 * This new VMA should share its siblings reservation map if present. 2043 * This new VMA should share its siblings reservation map if present.
2046 * The VMA will only ever have a valid reservation map pointer where 2044 * The VMA will only ever have a valid reservation map pointer where
2047 * it is being copied for another still existing VMA. As that VMA 2045 * it is being copied for another still existing VMA. As that VMA
2048 * has a reference to the reservation map it cannot dissappear until 2046 * has a reference to the reservation map it cannot disappear until
2049 * after this open call completes. It is therefore safe to take a 2047 * after this open call completes. It is therefore safe to take a
2050 * new reference here without additional locking. 2048 * new reference here without additional locking.
2051 */ 2049 */
@@ -2492,7 +2490,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2492 /* 2490 /*
2493 * Currently, we are forced to kill the process in the event the 2491 * Currently, we are forced to kill the process in the event the
2494 * original mapper has unmapped pages from the child due to a failed 2492 * original mapper has unmapped pages from the child due to a failed
2495 * COW. Warn that such a situation has occured as it may not be obvious 2493 * COW. Warn that such a situation has occurred as it may not be obvious
2496 */ 2494 */
2497 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 2495 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
2498 printk(KERN_WARNING 2496 printk(KERN_WARNING
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 0948f1072d6b..c7fc7fd00e32 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -1,4 +1,4 @@
1/* Inject a hwpoison memory failure on a arbitary pfn */ 1/* Inject a hwpoison memory failure on a arbitrary pfn */
2#include <linux/module.h> 2#include <linux/module.h>
3#include <linux/debugfs.h> 3#include <linux/debugfs.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
diff --git a/mm/internal.h b/mm/internal.h
index 69488205723d..9d0ced8e505e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -162,7 +162,7 @@ static inline struct page *mem_map_offset(struct page *base, int offset)
162} 162}
163 163
164/* 164/*
165 * Iterator over all subpages withing the maximally aligned gigantic 165 * Iterator over all subpages within the maximally aligned gigantic
166 * page 'base'. Handle any discontiguity in the mem_map. 166 * page 'base'. Handle any discontiguity in the mem_map.
167 */ 167 */
168static inline struct page *mem_map_next(struct page *iter, 168static inline struct page *mem_map_next(struct page *iter,
@@ -245,11 +245,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
245} 245}
246#endif /* CONFIG_SPARSEMEM */ 246#endif /* CONFIG_SPARSEMEM */
247 247
248int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
249 unsigned long start, int len, unsigned int foll_flags,
250 struct page **pages, struct vm_area_struct **vmas,
251 int *nonblocking);
252
253#define ZONE_RECLAIM_NOSCAN -2 248#define ZONE_RECLAIM_NOSCAN -2
254#define ZONE_RECLAIM_FULL -1 249#define ZONE_RECLAIM_FULL -1
255#define ZONE_RECLAIM_SOME 0 250#define ZONE_RECLAIM_SOME 0
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 84225f3b7190..c1d5867543e4 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -265,7 +265,7 @@ static void kmemleak_disable(void);
265} while (0) 265} while (0)
266 266
267/* 267/*
268 * Macro invoked when a serious kmemleak condition occured and cannot be 268 * Macro invoked when a serious kmemleak condition occurred and cannot be
269 * recovered from. Kmemleak will be disabled and further allocation/freeing 269 * recovered from. Kmemleak will be disabled and further allocation/freeing
270 * tracing no longer available. 270 * tracing no longer available.
271 */ 271 */
@@ -1006,7 +1006,7 @@ static bool update_checksum(struct kmemleak_object *object)
1006 1006
1007/* 1007/*
1008 * Memory scanning is a long process and it needs to be interruptable. This 1008 * Memory scanning is a long process and it needs to be interruptable. This
1009 * function checks whether such interrupt condition occured. 1009 * function checks whether such interrupt condition occurred.
1010 */ 1010 */
1011static int scan_should_stop(void) 1011static int scan_should_stop(void)
1012{ 1012{
@@ -1733,7 +1733,7 @@ static int __init kmemleak_late_init(void)
1733 1733
1734 if (atomic_read(&kmemleak_error)) { 1734 if (atomic_read(&kmemleak_error)) {
1735 /* 1735 /*
1736 * Some error occured and kmemleak was disabled. There is a 1736 * Some error occurred and kmemleak was disabled. There is a
1737 * small chance that kmemleak_disable() was called immediately 1737 * small chance that kmemleak_disable() was called immediately
1738 * after setting kmemleak_initialized and we may end up with 1738 * after setting kmemleak_initialized and we may end up with
1739 * two clean-up threads but serialized by scan_mutex. 1739 * two clean-up threads but serialized by scan_mutex.
diff --git a/mm/ksm.c b/mm/ksm.c
index c2b2a94f9d67..942dfc73a2ff 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -301,20 +301,6 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
301 return rmap_item->address & STABLE_FLAG; 301 return rmap_item->address & STABLE_FLAG;
302} 302}
303 303
304static void hold_anon_vma(struct rmap_item *rmap_item,
305 struct anon_vma *anon_vma)
306{
307 rmap_item->anon_vma = anon_vma;
308 get_anon_vma(anon_vma);
309}
310
311static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
312{
313 struct anon_vma *anon_vma = rmap_item->anon_vma;
314
315 drop_anon_vma(anon_vma);
316}
317
318/* 304/*
319 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's 305 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
320 * page tables after it has passed through ksm_exit() - which, if necessary, 306 * page tables after it has passed through ksm_exit() - which, if necessary,
@@ -397,7 +383,7 @@ static void break_cow(struct rmap_item *rmap_item)
397 * It is not an accident that whenever we want to break COW 383 * It is not an accident that whenever we want to break COW
398 * to undo, we also need to drop a reference to the anon_vma. 384 * to undo, we also need to drop a reference to the anon_vma.
399 */ 385 */
400 ksm_drop_anon_vma(rmap_item); 386 put_anon_vma(rmap_item->anon_vma);
401 387
402 down_read(&mm->mmap_sem); 388 down_read(&mm->mmap_sem);
403 if (ksm_test_exit(mm)) 389 if (ksm_test_exit(mm))
@@ -466,7 +452,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
466 ksm_pages_sharing--; 452 ksm_pages_sharing--;
467 else 453 else
468 ksm_pages_shared--; 454 ksm_pages_shared--;
469 ksm_drop_anon_vma(rmap_item); 455 put_anon_vma(rmap_item->anon_vma);
470 rmap_item->address &= PAGE_MASK; 456 rmap_item->address &= PAGE_MASK;
471 cond_resched(); 457 cond_resched();
472 } 458 }
@@ -554,7 +540,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
554 else 540 else
555 ksm_pages_shared--; 541 ksm_pages_shared--;
556 542
557 ksm_drop_anon_vma(rmap_item); 543 put_anon_vma(rmap_item->anon_vma);
558 rmap_item->address &= PAGE_MASK; 544 rmap_item->address &= PAGE_MASK;
559 545
560 } else if (rmap_item->address & UNSTABLE_FLAG) { 546 } else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -734,7 +720,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
734 swapped = PageSwapCache(page); 720 swapped = PageSwapCache(page);
735 flush_cache_page(vma, addr, page_to_pfn(page)); 721 flush_cache_page(vma, addr, page_to_pfn(page));
736 /* 722 /*
737 * Ok this is tricky, when get_user_pages_fast() run it doesnt 723 * Ok this is tricky, when get_user_pages_fast() run it doesn't
738 * take any lock, therefore the check that we are going to make 724 * take any lock, therefore the check that we are going to make
739 * with the pagecount against the mapcount is racy and 725 * with the pagecount against the mapcount is racy and
740 * O_DIRECT can happen right after the check. 726 * O_DIRECT can happen right after the check.
@@ -949,7 +935,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
949 goto out; 935 goto out;
950 936
951 /* Must get reference to anon_vma while still holding mmap_sem */ 937 /* Must get reference to anon_vma while still holding mmap_sem */
952 hold_anon_vma(rmap_item, vma->anon_vma); 938 rmap_item->anon_vma = vma->anon_vma;
939 get_anon_vma(vma->anon_vma);
953out: 940out:
954 up_read(&mm->mmap_sem); 941 up_read(&mm->mmap_sem);
955 return err; 942 return err;
diff --git a/mm/memblock.c b/mm/memblock.c
index bdba245d8afd..a0562d1a6ad4 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -58,28 +58,6 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
58 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); 58 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
59} 59}
60 60
61static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1,
62 phys_addr_t base2, phys_addr_t size2)
63{
64 if (base2 == base1 + size1)
65 return 1;
66 else if (base1 == base2 + size2)
67 return -1;
68
69 return 0;
70}
71
72static long __init_memblock memblock_regions_adjacent(struct memblock_type *type,
73 unsigned long r1, unsigned long r2)
74{
75 phys_addr_t base1 = type->regions[r1].base;
76 phys_addr_t size1 = type->regions[r1].size;
77 phys_addr_t base2 = type->regions[r2].base;
78 phys_addr_t size2 = type->regions[r2].size;
79
80 return memblock_addrs_adjacent(base1, size1, base2, size2);
81}
82
83long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) 61long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
84{ 62{
85 unsigned long i; 63 unsigned long i;
@@ -137,8 +115,6 @@ static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
137 115
138 BUG_ON(0 == size); 116 BUG_ON(0 == size);
139 117
140 size = memblock_align_up(size, align);
141
142 /* Pump up max_addr */ 118 /* Pump up max_addr */
143 if (end == MEMBLOCK_ALLOC_ACCESSIBLE) 119 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
144 end = memblock.current_limit; 120 end = memblock.current_limit;
@@ -208,14 +184,13 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
208 type->regions[i].size = type->regions[i + 1].size; 184 type->regions[i].size = type->regions[i + 1].size;
209 } 185 }
210 type->cnt--; 186 type->cnt--;
211}
212 187
213/* Assumption: base addr of region 1 < base addr of region 2 */ 188 /* Special case for empty arrays */
214static void __init_memblock memblock_coalesce_regions(struct memblock_type *type, 189 if (type->cnt == 0) {
215 unsigned long r1, unsigned long r2) 190 type->cnt = 1;
216{ 191 type->regions[0].base = 0;
217 type->regions[r1].size += type->regions[r2].size; 192 type->regions[0].size = 0;
218 memblock_remove_region(type, r2); 193 }
219} 194}
220 195
221/* Defined below but needed now */ 196/* Defined below but needed now */
@@ -278,7 +253,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
278 return 0; 253 return 0;
279 254
280 /* Add the new reserved region now. Should not fail ! */ 255 /* Add the new reserved region now. Should not fail ! */
281 BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0); 256 BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size));
282 257
283 /* If the array wasn't our static init one, then free it. We only do 258 /* If the array wasn't our static init one, then free it. We only do
284 * that before SLAB is available as later on, we don't know whether 259 * that before SLAB is available as later on, we don't know whether
@@ -298,58 +273,99 @@ extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1
298 return 1; 273 return 1;
299} 274}
300 275
301static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) 276static long __init_memblock memblock_add_region(struct memblock_type *type,
277 phys_addr_t base, phys_addr_t size)
302{ 278{
303 unsigned long coalesced = 0; 279 phys_addr_t end = base + size;
304 long adjacent, i; 280 int i, slot = -1;
305
306 if ((type->cnt == 1) && (type->regions[0].size == 0)) {
307 type->regions[0].base = base;
308 type->regions[0].size = size;
309 return 0;
310 }
311 281
312 /* First try and coalesce this MEMBLOCK with another. */ 282 /* First try and coalesce this MEMBLOCK with others */
313 for (i = 0; i < type->cnt; i++) { 283 for (i = 0; i < type->cnt; i++) {
314 phys_addr_t rgnbase = type->regions[i].base; 284 struct memblock_region *rgn = &type->regions[i];
315 phys_addr_t rgnsize = type->regions[i].size; 285 phys_addr_t rend = rgn->base + rgn->size;
316 286
317 if ((rgnbase == base) && (rgnsize == size)) 287 /* Exit if there's no possible hits */
318 /* Already have this region, so we're done */ 288 if (rgn->base > end || rgn->size == 0)
289 break;
290
291 /* Check if we are fully enclosed within an existing
292 * block
293 */
294 if (rgn->base <= base && rend >= end)
319 return 0; 295 return 0;
320 296
321 adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize); 297 /* Check if we overlap or are adjacent with the bottom
322 /* Check if arch allows coalescing */ 298 * of a block.
323 if (adjacent != 0 && type == &memblock.memory && 299 */
324 !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize)) 300 if (base < rgn->base && end >= rgn->base) {
325 break; 301 /* If we can't coalesce, create a new block */
326 if (adjacent > 0) { 302 if (!memblock_memory_can_coalesce(base, size,
327 type->regions[i].base -= size; 303 rgn->base,
328 type->regions[i].size += size; 304 rgn->size)) {
329 coalesced++; 305 /* Overlap & can't coalesce are mutually
330 break; 306 * exclusive, if you do that, be prepared
331 } else if (adjacent < 0) { 307 * for trouble
332 type->regions[i].size += size; 308 */
333 coalesced++; 309 WARN_ON(end != rgn->base);
334 break; 310 goto new_block;
311 }
312 /* We extend the bottom of the block down to our
313 * base
314 */
315 rgn->base = base;
316 rgn->size = rend - base;
317
318 /* Return if we have nothing else to allocate
319 * (fully coalesced)
320 */
321 if (rend >= end)
322 return 0;
323
324 /* We continue processing from the end of the
325 * coalesced block.
326 */
327 base = rend;
328 size = end - base;
329 }
330
331 /* Now check if we overlap or are adjacent with the
332 * top of a block
333 */
334 if (base <= rend && end >= rend) {
335 /* If we can't coalesce, create a new block */
336 if (!memblock_memory_can_coalesce(rgn->base,
337 rgn->size,
338 base, size)) {
339 /* Overlap & can't coalesce are mutually
340 * exclusive, if you do that, be prepared
341 * for trouble
342 */
343 WARN_ON(rend != base);
344 goto new_block;
345 }
346 /* We adjust our base down to enclose the
347 * original block and destroy it. It will be
348 * part of our new allocation. Since we've
349 * freed an entry, we know we won't fail
350 * to allocate one later, so we won't risk
351 * losing the original block allocation.
352 */
353 size += (base - rgn->base);
354 base = rgn->base;
355 memblock_remove_region(type, i--);
335 } 356 }
336 } 357 }
337 358
338 /* If we plugged a hole, we may want to also coalesce with the 359 /* If the array is empty, special case, replace the fake
339 * next region 360 * filler region and return
340 */ 361 */
341 if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) && 362 if ((type->cnt == 1) && (type->regions[0].size == 0)) {
342 ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base, 363 type->regions[0].base = base;
343 type->regions[i].size, 364 type->regions[0].size = size;
344 type->regions[i+1].base, 365 return 0;
345 type->regions[i+1].size)))) {
346 memblock_coalesce_regions(type, i, i+1);
347 coalesced++;
348 } 366 }
349 367
350 if (coalesced) 368 new_block:
351 return coalesced;
352
353 /* If we are out of space, we fail. It's too late to resize the array 369 /* If we are out of space, we fail. It's too late to resize the array
354 * but then this shouldn't have happened in the first place. 370 * but then this shouldn't have happened in the first place.
355 */ 371 */
@@ -364,13 +380,14 @@ static long __init_memblock memblock_add_region(struct memblock_type *type, phys
364 } else { 380 } else {
365 type->regions[i+1].base = base; 381 type->regions[i+1].base = base;
366 type->regions[i+1].size = size; 382 type->regions[i+1].size = size;
383 slot = i + 1;
367 break; 384 break;
368 } 385 }
369 } 386 }
370
371 if (base < type->regions[0].base) { 387 if (base < type->regions[0].base) {
372 type->regions[0].base = base; 388 type->regions[0].base = base;
373 type->regions[0].size = size; 389 type->regions[0].size = size;
390 slot = 0;
374 } 391 }
375 type->cnt++; 392 type->cnt++;
376 393
@@ -378,7 +395,8 @@ static long __init_memblock memblock_add_region(struct memblock_type *type, phys
378 * our allocation and return an error 395 * our allocation and return an error
379 */ 396 */
380 if (type->cnt == type->max && memblock_double_array(type)) { 397 if (type->cnt == type->max && memblock_double_array(type)) {
381 type->cnt--; 398 BUG_ON(slot < 0);
399 memblock_remove_region(type, slot);
382 return -1; 400 return -1;
383 } 401 }
384 402
@@ -391,52 +409,55 @@ long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
391 409
392} 410}
393 411
394static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size) 412static long __init_memblock __memblock_remove(struct memblock_type *type,
413 phys_addr_t base, phys_addr_t size)
395{ 414{
396 phys_addr_t rgnbegin, rgnend;
397 phys_addr_t end = base + size; 415 phys_addr_t end = base + size;
398 int i; 416 int i;
399 417
400 rgnbegin = rgnend = 0; /* supress gcc warnings */ 418 /* Walk through the array for collisions */
401 419 for (i = 0; i < type->cnt; i++) {
402 /* Find the region where (base, size) belongs to */ 420 struct memblock_region *rgn = &type->regions[i];
403 for (i=0; i < type->cnt; i++) { 421 phys_addr_t rend = rgn->base + rgn->size;
404 rgnbegin = type->regions[i].base;
405 rgnend = rgnbegin + type->regions[i].size;
406 422
407 if ((rgnbegin <= base) && (end <= rgnend)) 423 /* Nothing more to do, exit */
424 if (rgn->base > end || rgn->size == 0)
408 break; 425 break;
409 }
410 426
411 /* Didn't find the region */ 427 /* If we fully enclose the block, drop it */
412 if (i == type->cnt) 428 if (base <= rgn->base && end >= rend) {
413 return -1; 429 memblock_remove_region(type, i--);
430 continue;
431 }
414 432
415 /* Check to see if we are removing entire region */ 433 /* If we are fully enclosed within a block
416 if ((rgnbegin == base) && (rgnend == end)) { 434 * then we need to split it and we are done
417 memblock_remove_region(type, i); 435 */
418 return 0; 436 if (base > rgn->base && end < rend) {
419 } 437 rgn->size = base - rgn->base;
438 if (!memblock_add_region(type, end, rend - end))
439 return 0;
440 /* Failure to split is bad, we at least
441 * restore the block before erroring
442 */
443 rgn->size = rend - rgn->base;
444 WARN_ON(1);
445 return -1;
446 }
420 447
421 /* Check to see if region is matching at the front */ 448 /* Check if we need to trim the bottom of a block */
422 if (rgnbegin == base) { 449 if (rgn->base < end && rend > end) {
423 type->regions[i].base = end; 450 rgn->size -= end - rgn->base;
424 type->regions[i].size -= size; 451 rgn->base = end;
425 return 0; 452 break;
426 } 453 }
427 454
428 /* Check to see if the region is matching at the end */ 455 /* And check if we need to trim the top of a block */
429 if (rgnend == end) { 456 if (base < rend)
430 type->regions[i].size -= size; 457 rgn->size -= rend - base;
431 return 0;
432 }
433 458
434 /* 459 }
435 * We need to split the entry - adjust the current one to the 460 return 0;
436 * beginging of the hole and add the region after hole.
437 */
438 type->regions[i].size = base - type->regions[i].base;
439 return memblock_add_region(type, end, rgnend - end);
440} 461}
441 462
442long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) 463long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
@@ -469,7 +490,7 @@ phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, ph
469 490
470 found = memblock_find_base(size, align, 0, max_addr); 491 found = memblock_find_base(size, align, 0, max_addr);
471 if (found != MEMBLOCK_ERROR && 492 if (found != MEMBLOCK_ERROR &&
472 memblock_add_region(&memblock.reserved, found, size) >= 0) 493 !memblock_add_region(&memblock.reserved, found, size))
473 return found; 494 return found;
474 495
475 return 0; 496 return 0;
@@ -550,7 +571,7 @@ static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp,
550 if (this_nid == nid) { 571 if (this_nid == nid) {
551 phys_addr_t ret = memblock_find_region(start, this_end, size, align); 572 phys_addr_t ret = memblock_find_region(start, this_end, size, align);
552 if (ret != MEMBLOCK_ERROR && 573 if (ret != MEMBLOCK_ERROR &&
553 memblock_add_region(&memblock.reserved, ret, size) >= 0) 574 !memblock_add_region(&memblock.reserved, ret, size))
554 return ret; 575 return ret;
555 } 576 }
556 start = this_end; 577 start = this_end;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3878cfe399dc..010f9166fa6e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -73,15 +73,6 @@ static int really_do_swap_account __initdata = 0;
73#define do_swap_account (0) 73#define do_swap_account (0)
74#endif 74#endif
75 75
76/*
77 * Per memcg event counter is incremented at every pagein/pageout. This counter
78 * is used for trigger some periodic events. This is straightforward and better
79 * than using jiffies etc. to handle periodic memcg event.
80 *
81 * These values will be used as !((event) & ((1 <<(thresh)) - 1))
82 */
83#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
84#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
85 76
86/* 77/*
87 * Statistics for memory cgroup. 78 * Statistics for memory cgroup.
@@ -93,19 +84,36 @@ enum mem_cgroup_stat_index {
93 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 84 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
94 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 85 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
95 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 86 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
96 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
97 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
98 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 87 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
99 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ 88 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
100 /* incremented at every pagein/pageout */
101 MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
102 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ 89 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
103
104 MEM_CGROUP_STAT_NSTATS, 90 MEM_CGROUP_STAT_NSTATS,
105}; 91};
106 92
93enum mem_cgroup_events_index {
94 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
95 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
96 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
97 MEM_CGROUP_EVENTS_NSTATS,
98};
99/*
100 * Per memcg event counter is incremented at every pagein/pageout. With THP,
101 * it will be incremented by the number of pages. This counter is used to
102 * trigger some periodic events. This is straightforward and better
103 * than using jiffies etc. to handle periodic memcg event.
104 */
105enum mem_cgroup_events_target {
106 MEM_CGROUP_TARGET_THRESH,
107 MEM_CGROUP_TARGET_SOFTLIMIT,
108 MEM_CGROUP_NTARGETS,
109};
110#define THRESHOLDS_EVENTS_TARGET (128)
111#define SOFTLIMIT_EVENTS_TARGET (1024)
112
107struct mem_cgroup_stat_cpu { 113struct mem_cgroup_stat_cpu {
108 s64 count[MEM_CGROUP_STAT_NSTATS]; 114 long count[MEM_CGROUP_STAT_NSTATS];
115 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
116 unsigned long targets[MEM_CGROUP_NTARGETS];
109}; 117};
110 118
111/* 119/*
@@ -218,12 +226,6 @@ struct mem_cgroup {
218 * per zone LRU lists. 226 * per zone LRU lists.
219 */ 227 */
220 struct mem_cgroup_lru_info info; 228 struct mem_cgroup_lru_info info;
221
222 /*
223 protect against reclaim related member.
224 */
225 spinlock_t reclaim_param_lock;
226
227 /* 229 /*
228 * While reclaiming in a hierarchy, we cache the last child we 230 * While reclaiming in a hierarchy, we cache the last child we
229 * reclaimed from. 231 * reclaimed from.
@@ -327,13 +329,6 @@ enum charge_type {
327 NR_CHARGE_TYPE, 329 NR_CHARGE_TYPE,
328}; 330};
329 331
330/* only for here (for easy reading.) */
331#define PCGF_CACHE (1UL << PCG_CACHE)
332#define PCGF_USED (1UL << PCG_USED)
333#define PCGF_LOCK (1UL << PCG_LOCK)
334/* Not used, but added here for completeness */
335#define PCGF_ACCT (1UL << PCG_ACCT)
336
337/* for encoding cft->private value on file */ 332/* for encoding cft->private value on file */
338#define _MEM (0) 333#define _MEM (0)
339#define _MEMSWAP (1) 334#define _MEMSWAP (1)
@@ -371,14 +366,10 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
371} 366}
372 367
373static struct mem_cgroup_per_zone * 368static struct mem_cgroup_per_zone *
374page_cgroup_zoneinfo(struct page_cgroup *pc) 369page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
375{ 370{
376 struct mem_cgroup *mem = pc->mem_cgroup; 371 int nid = page_to_nid(page);
377 int nid = page_cgroup_nid(pc); 372 int zid = page_zonenum(page);
378 int zid = page_cgroup_zid(pc);
379
380 if (!mem)
381 return NULL;
382 373
383 return mem_cgroup_zoneinfo(mem, nid, zid); 374 return mem_cgroup_zoneinfo(mem, nid, zid);
384} 375}
@@ -504,11 +495,6 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
504 } 495 }
505} 496}
506 497
507static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
508{
509 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
510}
511
512static struct mem_cgroup_per_zone * 498static struct mem_cgroup_per_zone *
513__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 499__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
514{ 500{
@@ -565,11 +551,11 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
565 * common workload, threshold and synchronization as vmstat[] should be 551 * common workload, threshold and synchronization as vmstat[] should be
566 * implemented. 552 * implemented.
567 */ 553 */
568static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, 554static long mem_cgroup_read_stat(struct mem_cgroup *mem,
569 enum mem_cgroup_stat_index idx) 555 enum mem_cgroup_stat_index idx)
570{ 556{
557 long val = 0;
571 int cpu; 558 int cpu;
572 s64 val = 0;
573 559
574 get_online_cpus(); 560 get_online_cpus();
575 for_each_online_cpu(cpu) 561 for_each_online_cpu(cpu)
@@ -583,9 +569,9 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
583 return val; 569 return val;
584} 570}
585 571
586static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) 572static long mem_cgroup_local_usage(struct mem_cgroup *mem)
587{ 573{
588 s64 ret; 574 long ret;
589 575
590 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 576 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
591 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 577 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
@@ -599,6 +585,22 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
599 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 585 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
600} 586}
601 587
588static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
589 enum mem_cgroup_events_index idx)
590{
591 unsigned long val = 0;
592 int cpu;
593
594 for_each_online_cpu(cpu)
595 val += per_cpu(mem->stat->events[idx], cpu);
596#ifdef CONFIG_HOTPLUG_CPU
597 spin_lock(&mem->pcp_counter_lock);
598 val += mem->nocpu_base.events[idx];
599 spin_unlock(&mem->pcp_counter_lock);
600#endif
601 return val;
602}
603
602static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 604static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
603 bool file, int nr_pages) 605 bool file, int nr_pages)
604{ 606{
@@ -611,11 +613,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
611 613
612 /* pagein of a big page is an event. So, ignore page size */ 614 /* pagein of a big page is an event. So, ignore page size */
613 if (nr_pages > 0) 615 if (nr_pages > 0)
614 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); 616 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
615 else 617 else {
616 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); 618 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
619 nr_pages = -nr_pages; /* for event */
620 }
617 621
618 __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); 622 __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
619 623
620 preempt_enable(); 624 preempt_enable();
621} 625}
@@ -635,13 +639,34 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
635 return total; 639 return total;
636} 640}
637 641
638static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) 642static bool __memcg_event_check(struct mem_cgroup *mem, int target)
639{ 643{
640 s64 val; 644 unsigned long val, next;
645
646 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
647 next = this_cpu_read(mem->stat->targets[target]);
648 /* from time_after() in jiffies.h */
649 return ((long)next - (long)val < 0);
650}
651
652static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
653{
654 unsigned long val, next;
655
656 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
641 657
642 val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); 658 switch (target) {
659 case MEM_CGROUP_TARGET_THRESH:
660 next = val + THRESHOLDS_EVENTS_TARGET;
661 break;
662 case MEM_CGROUP_TARGET_SOFTLIMIT:
663 next = val + SOFTLIMIT_EVENTS_TARGET;
664 break;
665 default:
666 return;
667 }
643 668
644 return !(val & ((1 << event_mask_shift) - 1)); 669 this_cpu_write(mem->stat->targets[target], next);
645} 670}
646 671
647/* 672/*
@@ -651,10 +676,15 @@ static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
651static void memcg_check_events(struct mem_cgroup *mem, struct page *page) 676static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
652{ 677{
653 /* threshold event is triggered in finer grain than soft limit */ 678 /* threshold event is triggered in finer grain than soft limit */
654 if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { 679 if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
655 mem_cgroup_threshold(mem); 680 mem_cgroup_threshold(mem);
656 if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) 681 __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
682 if (unlikely(__memcg_event_check(mem,
683 MEM_CGROUP_TARGET_SOFTLIMIT))){
657 mem_cgroup_update_tree(mem, page); 684 mem_cgroup_update_tree(mem, page);
685 __mem_cgroup_target_update(mem,
686 MEM_CGROUP_TARGET_SOFTLIMIT);
687 }
658 } 688 }
659} 689}
660 690
@@ -813,7 +843,7 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
813 * We don't check PCG_USED bit. It's cleared when the "page" is finally 843 * We don't check PCG_USED bit. It's cleared when the "page" is finally
814 * removed from global LRU. 844 * removed from global LRU.
815 */ 845 */
816 mz = page_cgroup_zoneinfo(pc); 846 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
817 /* huge page split is done under lru_lock. so, we have no races. */ 847 /* huge page split is done under lru_lock. so, we have no races. */
818 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); 848 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
819 if (mem_cgroup_is_root(pc->mem_cgroup)) 849 if (mem_cgroup_is_root(pc->mem_cgroup))
@@ -827,6 +857,32 @@ void mem_cgroup_del_lru(struct page *page)
827 mem_cgroup_del_lru_list(page, page_lru(page)); 857 mem_cgroup_del_lru_list(page, page_lru(page));
828} 858}
829 859
860/*
861 * Writeback is about to end against a page which has been marked for immediate
862 * reclaim. If it still appears to be reclaimable, move it to the tail of the
863 * inactive list.
864 */
865void mem_cgroup_rotate_reclaimable_page(struct page *page)
866{
867 struct mem_cgroup_per_zone *mz;
868 struct page_cgroup *pc;
869 enum lru_list lru = page_lru(page);
870
871 if (mem_cgroup_disabled())
872 return;
873
874 pc = lookup_page_cgroup(page);
875 /* unused or root page is not rotated. */
876 if (!PageCgroupUsed(pc))
877 return;
878 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
879 smp_rmb();
880 if (mem_cgroup_is_root(pc->mem_cgroup))
881 return;
882 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
883 list_move_tail(&pc->lru, &mz->lists[lru]);
884}
885
830void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 886void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
831{ 887{
832 struct mem_cgroup_per_zone *mz; 888 struct mem_cgroup_per_zone *mz;
@@ -843,7 +899,7 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
843 smp_rmb(); 899 smp_rmb();
844 if (mem_cgroup_is_root(pc->mem_cgroup)) 900 if (mem_cgroup_is_root(pc->mem_cgroup))
845 return; 901 return;
846 mz = page_cgroup_zoneinfo(pc); 902 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
847 list_move(&pc->lru, &mz->lists[lru]); 903 list_move(&pc->lru, &mz->lists[lru]);
848} 904}
849 905
@@ -860,7 +916,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
860 return; 916 return;
861 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 917 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
862 smp_rmb(); 918 smp_rmb();
863 mz = page_cgroup_zoneinfo(pc); 919 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
864 /* huge page split is done under lru_lock. so, we have no races. */ 920 /* huge page split is done under lru_lock. so, we have no races. */
865 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); 921 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
866 SetPageCgroupAcctLRU(pc); 922 SetPageCgroupAcctLRU(pc);
@@ -870,18 +926,28 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
870} 926}
871 927
872/* 928/*
873 * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to 929 * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
874 * lru because the page may.be reused after it's fully uncharged (because of 930 * while it's linked to lru because the page may be reused after it's fully
875 * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge 931 * uncharged. To handle that, unlink page_cgroup from LRU when charge it again.
876 * it again. This function is only used to charge SwapCache. It's done under 932 * It's done under lock_page and expected that zone->lru_lock isnever held.
877 * lock_page and expected that zone->lru_lock is never held.
878 */ 933 */
879static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) 934static void mem_cgroup_lru_del_before_commit(struct page *page)
880{ 935{
881 unsigned long flags; 936 unsigned long flags;
882 struct zone *zone = page_zone(page); 937 struct zone *zone = page_zone(page);
883 struct page_cgroup *pc = lookup_page_cgroup(page); 938 struct page_cgroup *pc = lookup_page_cgroup(page);
884 939
940 /*
941 * Doing this check without taking ->lru_lock seems wrong but this
942 * is safe. Because if page_cgroup's USED bit is unset, the page
943 * will not be added to any memcg's LRU. If page_cgroup's USED bit is
944 * set, the commit after this will fail, anyway.
945 * This all charge/uncharge is done under some mutual execustion.
946 * So, we don't need to taking care of changes in USED bit.
947 */
948 if (likely(!PageLRU(page)))
949 return;
950
885 spin_lock_irqsave(&zone->lru_lock, flags); 951 spin_lock_irqsave(&zone->lru_lock, flags);
886 /* 952 /*
887 * Forget old LRU when this page_cgroup is *not* used. This Used bit 953 * Forget old LRU when this page_cgroup is *not* used. This Used bit
@@ -892,12 +958,15 @@ static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
892 spin_unlock_irqrestore(&zone->lru_lock, flags); 958 spin_unlock_irqrestore(&zone->lru_lock, flags);
893} 959}
894 960
895static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) 961static void mem_cgroup_lru_add_after_commit(struct page *page)
896{ 962{
897 unsigned long flags; 963 unsigned long flags;
898 struct zone *zone = page_zone(page); 964 struct zone *zone = page_zone(page);
899 struct page_cgroup *pc = lookup_page_cgroup(page); 965 struct page_cgroup *pc = lookup_page_cgroup(page);
900 966
967 /* taking care of that the page is added to LRU while we commit it */
968 if (likely(!PageLRU(page)))
969 return;
901 spin_lock_irqsave(&zone->lru_lock, flags); 970 spin_lock_irqsave(&zone->lru_lock, flags);
902 /* link when the page is linked to LRU but page_cgroup isn't */ 971 /* link when the page is linked to LRU but page_cgroup isn't */
903 if (PageLRU(page) && !PageCgroupAcctLRU(pc)) 972 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
@@ -1030,10 +1099,7 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1030 return NULL; 1099 return NULL;
1031 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1100 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1032 smp_rmb(); 1101 smp_rmb();
1033 mz = page_cgroup_zoneinfo(pc); 1102 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1034 if (!mz)
1035 return NULL;
1036
1037 return &mz->reclaim_stat; 1103 return &mz->reclaim_stat;
1038} 1104}
1039 1105
@@ -1065,9 +1131,11 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1065 if (scan >= nr_to_scan) 1131 if (scan >= nr_to_scan)
1066 break; 1132 break;
1067 1133
1068 page = pc->page;
1069 if (unlikely(!PageCgroupUsed(pc))) 1134 if (unlikely(!PageCgroupUsed(pc)))
1070 continue; 1135 continue;
1136
1137 page = lookup_cgroup_page(pc);
1138
1071 if (unlikely(!PageLRU(page))) 1139 if (unlikely(!PageLRU(page)))
1072 continue; 1140 continue;
1073 1141
@@ -1099,32 +1167,32 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1099#define mem_cgroup_from_res_counter(counter, member) \ 1167#define mem_cgroup_from_res_counter(counter, member) \
1100 container_of(counter, struct mem_cgroup, member) 1168 container_of(counter, struct mem_cgroup, member)
1101 1169
1102static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 1170/**
1171 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1172 * @mem: the memory cgroup
1173 *
1174 * Returns the maximum amount of memory @mem can be charged with, in
1175 * pages.
1176 */
1177static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
1103{ 1178{
1104 if (do_swap_account) { 1179 unsigned long long margin;
1105 if (res_counter_check_under_limit(&mem->res) && 1180
1106 res_counter_check_under_limit(&mem->memsw)) 1181 margin = res_counter_margin(&mem->res);
1107 return true; 1182 if (do_swap_account)
1108 } else 1183 margin = min(margin, res_counter_margin(&mem->memsw));
1109 if (res_counter_check_under_limit(&mem->res)) 1184 return margin >> PAGE_SHIFT;
1110 return true;
1111 return false;
1112} 1185}
1113 1186
1114static unsigned int get_swappiness(struct mem_cgroup *memcg) 1187static unsigned int get_swappiness(struct mem_cgroup *memcg)
1115{ 1188{
1116 struct cgroup *cgrp = memcg->css.cgroup; 1189 struct cgroup *cgrp = memcg->css.cgroup;
1117 unsigned int swappiness;
1118 1190
1119 /* root ? */ 1191 /* root ? */
1120 if (cgrp->parent == NULL) 1192 if (cgrp->parent == NULL)
1121 return vm_swappiness; 1193 return vm_swappiness;
1122 1194
1123 spin_lock(&memcg->reclaim_param_lock); 1195 return memcg->swappiness;
1124 swappiness = memcg->swappiness;
1125 spin_unlock(&memcg->reclaim_param_lock);
1126
1127 return swappiness;
1128} 1196}
1129 1197
1130static void mem_cgroup_start_move(struct mem_cgroup *mem) 1198static void mem_cgroup_start_move(struct mem_cgroup *mem)
@@ -1340,13 +1408,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1340 1408
1341 rcu_read_unlock(); 1409 rcu_read_unlock();
1342 /* Updates scanning parameter */ 1410 /* Updates scanning parameter */
1343 spin_lock(&root_mem->reclaim_param_lock);
1344 if (!css) { 1411 if (!css) {
1345 /* this means start scan from ID:1 */ 1412 /* this means start scan from ID:1 */
1346 root_mem->last_scanned_child = 0; 1413 root_mem->last_scanned_child = 0;
1347 } else 1414 } else
1348 root_mem->last_scanned_child = found; 1415 root_mem->last_scanned_child = found;
1349 spin_unlock(&root_mem->reclaim_param_lock);
1350 } 1416 }
1351 1417
1352 return ret; 1418 return ret;
@@ -1375,7 +1441,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1375 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; 1441 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1376 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; 1442 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1377 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; 1443 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1378 unsigned long excess = mem_cgroup_get_excess(root_mem); 1444 unsigned long excess;
1445
1446 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1379 1447
1380 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1448 /* If memsw_is_minimum==1, swap-out is of-no-use. */
1381 if (root_mem->memsw_is_minimum) 1449 if (root_mem->memsw_is_minimum)
@@ -1398,7 +1466,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1398 break; 1466 break;
1399 } 1467 }
1400 /* 1468 /*
1401 * We want to do more targetted reclaim. 1469 * We want to do more targeted reclaim.
1402 * excess >> 2 is not to excessive so as to 1470 * excess >> 2 is not to excessive so as to
1403 * reclaim too much, nor too less that we keep 1471 * reclaim too much, nor too less that we keep
1404 * coming back to reclaim from this cgroup 1472 * coming back to reclaim from this cgroup
@@ -1432,9 +1500,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1432 return ret; 1500 return ret;
1433 total += ret; 1501 total += ret;
1434 if (check_soft) { 1502 if (check_soft) {
1435 if (res_counter_check_under_soft_limit(&root_mem->res)) 1503 if (!res_counter_soft_limit_excess(&root_mem->res))
1436 return total; 1504 return total;
1437 } else if (mem_cgroup_check_under_limit(root_mem)) 1505 } else if (mem_cgroup_margin(root_mem))
1438 return 1 + total; 1506 return 1 + total;
1439 } 1507 }
1440 return total; 1508 return total;
@@ -1642,17 +1710,17 @@ EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1642 * size of first charge trial. "32" comes from vmscan.c's magic value. 1710 * size of first charge trial. "32" comes from vmscan.c's magic value.
1643 * TODO: maybe necessary to use big numbers in big irons. 1711 * TODO: maybe necessary to use big numbers in big irons.
1644 */ 1712 */
1645#define CHARGE_SIZE (32 * PAGE_SIZE) 1713#define CHARGE_BATCH 32U
1646struct memcg_stock_pcp { 1714struct memcg_stock_pcp {
1647 struct mem_cgroup *cached; /* this never be root cgroup */ 1715 struct mem_cgroup *cached; /* this never be root cgroup */
1648 int charge; 1716 unsigned int nr_pages;
1649 struct work_struct work; 1717 struct work_struct work;
1650}; 1718};
1651static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1719static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1652static atomic_t memcg_drain_count; 1720static atomic_t memcg_drain_count;
1653 1721
1654/* 1722/*
1655 * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed 1723 * Try to consume stocked charge on this cpu. If success, one page is consumed
1656 * from local stock and true is returned. If the stock is 0 or charges from a 1724 * from local stock and true is returned. If the stock is 0 or charges from a
1657 * cgroup which is not current target, returns false. This stock will be 1725 * cgroup which is not current target, returns false. This stock will be
1658 * refilled. 1726 * refilled.
@@ -1663,8 +1731,8 @@ static bool consume_stock(struct mem_cgroup *mem)
1663 bool ret = true; 1731 bool ret = true;
1664 1732
1665 stock = &get_cpu_var(memcg_stock); 1733 stock = &get_cpu_var(memcg_stock);
1666 if (mem == stock->cached && stock->charge) 1734 if (mem == stock->cached && stock->nr_pages)
1667 stock->charge -= PAGE_SIZE; 1735 stock->nr_pages--;
1668 else /* need to call res_counter_charge */ 1736 else /* need to call res_counter_charge */
1669 ret = false; 1737 ret = false;
1670 put_cpu_var(memcg_stock); 1738 put_cpu_var(memcg_stock);
@@ -1678,13 +1746,15 @@ static void drain_stock(struct memcg_stock_pcp *stock)
1678{ 1746{
1679 struct mem_cgroup *old = stock->cached; 1747 struct mem_cgroup *old = stock->cached;
1680 1748
1681 if (stock->charge) { 1749 if (stock->nr_pages) {
1682 res_counter_uncharge(&old->res, stock->charge); 1750 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
1751
1752 res_counter_uncharge(&old->res, bytes);
1683 if (do_swap_account) 1753 if (do_swap_account)
1684 res_counter_uncharge(&old->memsw, stock->charge); 1754 res_counter_uncharge(&old->memsw, bytes);
1755 stock->nr_pages = 0;
1685 } 1756 }
1686 stock->cached = NULL; 1757 stock->cached = NULL;
1687 stock->charge = 0;
1688} 1758}
1689 1759
1690/* 1760/*
@@ -1701,7 +1771,7 @@ static void drain_local_stock(struct work_struct *dummy)
1701 * Cache charges(val) which is from res_counter, to local per_cpu area. 1771 * Cache charges(val) which is from res_counter, to local per_cpu area.
1702 * This will be consumed by consume_stock() function, later. 1772 * This will be consumed by consume_stock() function, later.
1703 */ 1773 */
1704static void refill_stock(struct mem_cgroup *mem, int val) 1774static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
1705{ 1775{
1706 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 1776 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1707 1777
@@ -1709,7 +1779,7 @@ static void refill_stock(struct mem_cgroup *mem, int val)
1709 drain_stock(stock); 1779 drain_stock(stock);
1710 stock->cached = mem; 1780 stock->cached = mem;
1711 } 1781 }
1712 stock->charge += val; 1782 stock->nr_pages += nr_pages;
1713 put_cpu_var(memcg_stock); 1783 put_cpu_var(memcg_stock);
1714} 1784}
1715 1785
@@ -1761,11 +1831,17 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
1761 1831
1762 spin_lock(&mem->pcp_counter_lock); 1832 spin_lock(&mem->pcp_counter_lock);
1763 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 1833 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
1764 s64 x = per_cpu(mem->stat->count[i], cpu); 1834 long x = per_cpu(mem->stat->count[i], cpu);
1765 1835
1766 per_cpu(mem->stat->count[i], cpu) = 0; 1836 per_cpu(mem->stat->count[i], cpu) = 0;
1767 mem->nocpu_base.count[i] += x; 1837 mem->nocpu_base.count[i] += x;
1768 } 1838 }
1839 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
1840 unsigned long x = per_cpu(mem->stat->events[i], cpu);
1841
1842 per_cpu(mem->stat->events[i], cpu) = 0;
1843 mem->nocpu_base.events[i] += x;
1844 }
1769 /* need to clear ON_MOVE value, works as a kind of lock. */ 1845 /* need to clear ON_MOVE value, works as a kind of lock. */
1770 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; 1846 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
1771 spin_unlock(&mem->pcp_counter_lock); 1847 spin_unlock(&mem->pcp_counter_lock);
@@ -1815,9 +1891,10 @@ enum {
1815 CHARGE_OOM_DIE, /* the current is killed because of OOM */ 1891 CHARGE_OOM_DIE, /* the current is killed because of OOM */
1816}; 1892};
1817 1893
1818static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, 1894static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1819 int csize, bool oom_check) 1895 unsigned int nr_pages, bool oom_check)
1820{ 1896{
1897 unsigned long csize = nr_pages * PAGE_SIZE;
1821 struct mem_cgroup *mem_over_limit; 1898 struct mem_cgroup *mem_over_limit;
1822 struct res_counter *fail_res; 1899 struct res_counter *fail_res;
1823 unsigned long flags = 0; 1900 unsigned long flags = 0;
@@ -1837,23 +1914,33 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1837 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 1914 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1838 } else 1915 } else
1839 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 1916 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
1840 1917 /*
1841 if (csize > PAGE_SIZE) /* change csize and retry */ 1918 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
1919 * of regular pages (CHARGE_BATCH), or a single regular page (1).
1920 *
1921 * Never reclaim on behalf of optional batching, retry with a
1922 * single page instead.
1923 */
1924 if (nr_pages == CHARGE_BATCH)
1842 return CHARGE_RETRY; 1925 return CHARGE_RETRY;
1843 1926
1844 if (!(gfp_mask & __GFP_WAIT)) 1927 if (!(gfp_mask & __GFP_WAIT))
1845 return CHARGE_WOULDBLOCK; 1928 return CHARGE_WOULDBLOCK;
1846 1929
1847 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 1930 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1848 gfp_mask, flags); 1931 gfp_mask, flags);
1932 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
1933 return CHARGE_RETRY;
1849 /* 1934 /*
1850 * try_to_free_mem_cgroup_pages() might not give us a full 1935 * Even though the limit is exceeded at this point, reclaim
1851 * picture of reclaim. Some pages are reclaimed and might be 1936 * may have been able to free some pages. Retry the charge
1852 * moved to swap cache or just unmapped from the cgroup. 1937 * before killing the task.
1853 * Check the limit again to see if the reclaim reduced the 1938 *
1854 * current usage of the cgroup before giving up 1939 * Only for regular pages, though: huge pages are rather
1940 * unlikely to succeed so close to the limit, and we fall back
1941 * to regular pages anyway in case of failure.
1855 */ 1942 */
1856 if (ret || mem_cgroup_check_under_limit(mem_over_limit)) 1943 if (nr_pages == 1 && ret)
1857 return CHARGE_RETRY; 1944 return CHARGE_RETRY;
1858 1945
1859 /* 1946 /*
@@ -1879,13 +1966,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1879 */ 1966 */
1880static int __mem_cgroup_try_charge(struct mm_struct *mm, 1967static int __mem_cgroup_try_charge(struct mm_struct *mm,
1881 gfp_t gfp_mask, 1968 gfp_t gfp_mask,
1882 struct mem_cgroup **memcg, bool oom, 1969 unsigned int nr_pages,
1883 int page_size) 1970 struct mem_cgroup **memcg,
1971 bool oom)
1884{ 1972{
1973 unsigned int batch = max(CHARGE_BATCH, nr_pages);
1885 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 1974 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1886 struct mem_cgroup *mem = NULL; 1975 struct mem_cgroup *mem = NULL;
1887 int ret; 1976 int ret;
1888 int csize = max(CHARGE_SIZE, (unsigned long) page_size);
1889 1977
1890 /* 1978 /*
1891 * Unlike gloval-vm's OOM-kill, we're not in memory shortage 1979 * Unlike gloval-vm's OOM-kill, we're not in memory shortage
@@ -1910,7 +1998,7 @@ again:
1910 VM_BUG_ON(css_is_removed(&mem->css)); 1998 VM_BUG_ON(css_is_removed(&mem->css));
1911 if (mem_cgroup_is_root(mem)) 1999 if (mem_cgroup_is_root(mem))
1912 goto done; 2000 goto done;
1913 if (page_size == PAGE_SIZE && consume_stock(mem)) 2001 if (nr_pages == 1 && consume_stock(mem))
1914 goto done; 2002 goto done;
1915 css_get(&mem->css); 2003 css_get(&mem->css);
1916 } else { 2004 } else {
@@ -1933,7 +2021,7 @@ again:
1933 rcu_read_unlock(); 2021 rcu_read_unlock();
1934 goto done; 2022 goto done;
1935 } 2023 }
1936 if (page_size == PAGE_SIZE && consume_stock(mem)) { 2024 if (nr_pages == 1 && consume_stock(mem)) {
1937 /* 2025 /*
1938 * It seems dagerous to access memcg without css_get(). 2026 * It seems dagerous to access memcg without css_get().
1939 * But considering how consume_stok works, it's not 2027 * But considering how consume_stok works, it's not
@@ -1968,13 +2056,12 @@ again:
1968 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2056 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1969 } 2057 }
1970 2058
1971 ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check); 2059 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check);
1972
1973 switch (ret) { 2060 switch (ret) {
1974 case CHARGE_OK: 2061 case CHARGE_OK:
1975 break; 2062 break;
1976 case CHARGE_RETRY: /* not in OOM situation but retry */ 2063 case CHARGE_RETRY: /* not in OOM situation but retry */
1977 csize = page_size; 2064 batch = nr_pages;
1978 css_put(&mem->css); 2065 css_put(&mem->css);
1979 mem = NULL; 2066 mem = NULL;
1980 goto again; 2067 goto again;
@@ -1995,8 +2082,8 @@ again:
1995 } 2082 }
1996 } while (ret != CHARGE_OK); 2083 } while (ret != CHARGE_OK);
1997 2084
1998 if (csize > page_size) 2085 if (batch > nr_pages)
1999 refill_stock(mem, csize - page_size); 2086 refill_stock(mem, batch - nr_pages);
2000 css_put(&mem->css); 2087 css_put(&mem->css);
2001done: 2088done:
2002 *memcg = mem; 2089 *memcg = mem;
@@ -2015,21 +2102,17 @@ bypass:
2015 * gotten by try_charge(). 2102 * gotten by try_charge().
2016 */ 2103 */
2017static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2104static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2018 unsigned long count) 2105 unsigned int nr_pages)
2019{ 2106{
2020 if (!mem_cgroup_is_root(mem)) { 2107 if (!mem_cgroup_is_root(mem)) {
2021 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 2108 unsigned long bytes = nr_pages * PAGE_SIZE;
2109
2110 res_counter_uncharge(&mem->res, bytes);
2022 if (do_swap_account) 2111 if (do_swap_account)
2023 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); 2112 res_counter_uncharge(&mem->memsw, bytes);
2024 } 2113 }
2025} 2114}
2026 2115
2027static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2028 int page_size)
2029{
2030 __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
2031}
2032
2033/* 2116/*
2034 * A helper function to get mem_cgroup from ID. must be called under 2117 * A helper function to get mem_cgroup from ID. must be called under
2035 * rcu_read_lock(). The caller must check css_is_removed() or some if 2118 * rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -2078,20 +2161,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2078} 2161}
2079 2162
2080static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2163static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2164 struct page *page,
2165 unsigned int nr_pages,
2081 struct page_cgroup *pc, 2166 struct page_cgroup *pc,
2082 enum charge_type ctype, 2167 enum charge_type ctype)
2083 int page_size)
2084{ 2168{
2085 int nr_pages = page_size >> PAGE_SHIFT;
2086
2087 /* try_charge() can return NULL to *memcg, taking care of it. */
2088 if (!mem)
2089 return;
2090
2091 lock_page_cgroup(pc); 2169 lock_page_cgroup(pc);
2092 if (unlikely(PageCgroupUsed(pc))) { 2170 if (unlikely(PageCgroupUsed(pc))) {
2093 unlock_page_cgroup(pc); 2171 unlock_page_cgroup(pc);
2094 mem_cgroup_cancel_charge(mem, page_size); 2172 __mem_cgroup_cancel_charge(mem, nr_pages);
2095 return; 2173 return;
2096 } 2174 }
2097 /* 2175 /*
@@ -2128,7 +2206,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2128 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2206 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
2129 * if they exceeds softlimit. 2207 * if they exceeds softlimit.
2130 */ 2208 */
2131 memcg_check_events(mem, pc->page); 2209 memcg_check_events(mem, page);
2132} 2210}
2133 2211
2134#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2212#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2165,7 +2243,7 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2165 * We hold lru_lock, then, reduce counter directly. 2243 * We hold lru_lock, then, reduce counter directly.
2166 */ 2244 */
2167 lru = page_lru(head); 2245 lru = page_lru(head);
2168 mz = page_cgroup_zoneinfo(head_pc); 2246 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
2169 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 2247 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
2170 } 2248 }
2171 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 2249 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
@@ -2174,7 +2252,9 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2174#endif 2252#endif
2175 2253
2176/** 2254/**
2177 * __mem_cgroup_move_account - move account of the page 2255 * mem_cgroup_move_account - move account of the page
2256 * @page: the page
2257 * @nr_pages: number of regular pages (>1 for huge pages)
2178 * @pc: page_cgroup of the page. 2258 * @pc: page_cgroup of the page.
2179 * @from: mem_cgroup which the page is moved from. 2259 * @from: mem_cgroup which the page is moved from.
2180 * @to: mem_cgroup which the page is moved to. @from != @to. 2260 * @to: mem_cgroup which the page is moved to. @from != @to.
@@ -2182,25 +2262,42 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2182 * 2262 *
2183 * The caller must confirm following. 2263 * The caller must confirm following.
2184 * - page is not on LRU (isolate_page() is useful.) 2264 * - page is not on LRU (isolate_page() is useful.)
2185 * - the pc is locked, used, and ->mem_cgroup points to @from. 2265 * - compound_lock is held when nr_pages > 1
2186 * 2266 *
2187 * This function doesn't do "charge" nor css_get to new cgroup. It should be 2267 * This function doesn't do "charge" nor css_get to new cgroup. It should be
2188 * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is 2268 * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is
2189 * true, this function does "uncharge" from old cgroup, but it doesn't if 2269 * true, this function does "uncharge" from old cgroup, but it doesn't if
2190 * @uncharge is false, so a caller should do "uncharge". 2270 * @uncharge is false, so a caller should do "uncharge".
2191 */ 2271 */
2192 2272static int mem_cgroup_move_account(struct page *page,
2193static void __mem_cgroup_move_account(struct page_cgroup *pc, 2273 unsigned int nr_pages,
2194 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge, 2274 struct page_cgroup *pc,
2195 int charge_size) 2275 struct mem_cgroup *from,
2276 struct mem_cgroup *to,
2277 bool uncharge)
2196{ 2278{
2197 int nr_pages = charge_size >> PAGE_SHIFT; 2279 unsigned long flags;
2280 int ret;
2198 2281
2199 VM_BUG_ON(from == to); 2282 VM_BUG_ON(from == to);
2200 VM_BUG_ON(PageLRU(pc->page)); 2283 VM_BUG_ON(PageLRU(page));
2201 VM_BUG_ON(!page_is_cgroup_locked(pc)); 2284 /*
2202 VM_BUG_ON(!PageCgroupUsed(pc)); 2285 * The page is isolated from LRU. So, collapse function
2203 VM_BUG_ON(pc->mem_cgroup != from); 2286 * will not handle this page. But page splitting can happen.
2287 * Do this check under compound_page_lock(). The caller should
2288 * hold it.
2289 */
2290 ret = -EBUSY;
2291 if (nr_pages > 1 && !PageTransHuge(page))
2292 goto out;
2293
2294 lock_page_cgroup(pc);
2295
2296 ret = -EINVAL;
2297 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2298 goto unlock;
2299
2300 move_lock_page_cgroup(pc, &flags);
2204 2301
2205 if (PageCgroupFileMapped(pc)) { 2302 if (PageCgroupFileMapped(pc)) {
2206 /* Update mapped_file data for mem_cgroup */ 2303 /* Update mapped_file data for mem_cgroup */
@@ -2212,7 +2309,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
2212 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); 2309 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
2213 if (uncharge) 2310 if (uncharge)
2214 /* This is not "cancel", but cancel_charge does all we need. */ 2311 /* This is not "cancel", but cancel_charge does all we need. */
2215 mem_cgroup_cancel_charge(from, charge_size); 2312 __mem_cgroup_cancel_charge(from, nr_pages);
2216 2313
2217 /* caller should have done css_get */ 2314 /* caller should have done css_get */
2218 pc->mem_cgroup = to; 2315 pc->mem_cgroup = to;
@@ -2221,43 +2318,19 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
2221 * We charges against "to" which may not have any tasks. Then, "to" 2318 * We charges against "to" which may not have any tasks. Then, "to"
2222 * can be under rmdir(). But in current implementation, caller of 2319 * can be under rmdir(). But in current implementation, caller of
2223 * this function is just force_empty() and move charge, so it's 2320 * this function is just force_empty() and move charge, so it's
2224 * garanteed that "to" is never removed. So, we don't check rmdir 2321 * guaranteed that "to" is never removed. So, we don't check rmdir
2225 * status here. 2322 * status here.
2226 */ 2323 */
2227} 2324 move_unlock_page_cgroup(pc, &flags);
2228 2325 ret = 0;
2229/* 2326unlock:
2230 * check whether the @pc is valid for moving account and call
2231 * __mem_cgroup_move_account()
2232 */
2233static int mem_cgroup_move_account(struct page_cgroup *pc,
2234 struct mem_cgroup *from, struct mem_cgroup *to,
2235 bool uncharge, int charge_size)
2236{
2237 int ret = -EINVAL;
2238 unsigned long flags;
2239 /*
2240 * The page is isolated from LRU. So, collapse function
2241 * will not handle this page. But page splitting can happen.
2242 * Do this check under compound_page_lock(). The caller should
2243 * hold it.
2244 */
2245 if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
2246 return -EBUSY;
2247
2248 lock_page_cgroup(pc);
2249 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
2250 move_lock_page_cgroup(pc, &flags);
2251 __mem_cgroup_move_account(pc, from, to, uncharge, charge_size);
2252 move_unlock_page_cgroup(pc, &flags);
2253 ret = 0;
2254 }
2255 unlock_page_cgroup(pc); 2327 unlock_page_cgroup(pc);
2256 /* 2328 /*
2257 * check events 2329 * check events
2258 */ 2330 */
2259 memcg_check_events(to, pc->page); 2331 memcg_check_events(to, page);
2260 memcg_check_events(from, pc->page); 2332 memcg_check_events(from, page);
2333out:
2261 return ret; 2334 return ret;
2262} 2335}
2263 2336
@@ -2265,16 +2338,16 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
2265 * move charges to its parent. 2338 * move charges to its parent.
2266 */ 2339 */
2267 2340
2268static int mem_cgroup_move_parent(struct page_cgroup *pc, 2341static int mem_cgroup_move_parent(struct page *page,
2342 struct page_cgroup *pc,
2269 struct mem_cgroup *child, 2343 struct mem_cgroup *child,
2270 gfp_t gfp_mask) 2344 gfp_t gfp_mask)
2271{ 2345{
2272 struct page *page = pc->page;
2273 struct cgroup *cg = child->css.cgroup; 2346 struct cgroup *cg = child->css.cgroup;
2274 struct cgroup *pcg = cg->parent; 2347 struct cgroup *pcg = cg->parent;
2275 struct mem_cgroup *parent; 2348 struct mem_cgroup *parent;
2276 int page_size = PAGE_SIZE; 2349 unsigned int nr_pages;
2277 unsigned long flags; 2350 unsigned long uninitialized_var(flags);
2278 int ret; 2351 int ret;
2279 2352
2280 /* Is ROOT ? */ 2353 /* Is ROOT ? */
@@ -2287,23 +2360,21 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
2287 if (isolate_lru_page(page)) 2360 if (isolate_lru_page(page))
2288 goto put; 2361 goto put;
2289 2362
2290 if (PageTransHuge(page)) 2363 nr_pages = hpage_nr_pages(page);
2291 page_size = HPAGE_SIZE;
2292 2364
2293 parent = mem_cgroup_from_cont(pcg); 2365 parent = mem_cgroup_from_cont(pcg);
2294 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 2366 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
2295 &parent, false, page_size);
2296 if (ret || !parent) 2367 if (ret || !parent)
2297 goto put_back; 2368 goto put_back;
2298 2369
2299 if (page_size > PAGE_SIZE) 2370 if (nr_pages > 1)
2300 flags = compound_lock_irqsave(page); 2371 flags = compound_lock_irqsave(page);
2301 2372
2302 ret = mem_cgroup_move_account(pc, child, parent, true, page_size); 2373 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
2303 if (ret) 2374 if (ret)
2304 mem_cgroup_cancel_charge(parent, page_size); 2375 __mem_cgroup_cancel_charge(parent, nr_pages);
2305 2376
2306 if (page_size > PAGE_SIZE) 2377 if (nr_pages > 1)
2307 compound_unlock_irqrestore(page, flags); 2378 compound_unlock_irqrestore(page, flags);
2308put_back: 2379put_back:
2309 putback_lru_page(page); 2380 putback_lru_page(page);
@@ -2323,26 +2394,29 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2323 gfp_t gfp_mask, enum charge_type ctype) 2394 gfp_t gfp_mask, enum charge_type ctype)
2324{ 2395{
2325 struct mem_cgroup *mem = NULL; 2396 struct mem_cgroup *mem = NULL;
2397 unsigned int nr_pages = 1;
2326 struct page_cgroup *pc; 2398 struct page_cgroup *pc;
2399 bool oom = true;
2327 int ret; 2400 int ret;
2328 int page_size = PAGE_SIZE;
2329 2401
2330 if (PageTransHuge(page)) { 2402 if (PageTransHuge(page)) {
2331 page_size <<= compound_order(page); 2403 nr_pages <<= compound_order(page);
2332 VM_BUG_ON(!PageTransHuge(page)); 2404 VM_BUG_ON(!PageTransHuge(page));
2405 /*
2406 * Never OOM-kill a process for a huge page. The
2407 * fault handler will fall back to regular pages.
2408 */
2409 oom = false;
2333 } 2410 }
2334 2411
2335 pc = lookup_page_cgroup(page); 2412 pc = lookup_page_cgroup(page);
2336 /* can happen at boot */ 2413 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2337 if (unlikely(!pc))
2338 return 0;
2339 prefetchw(pc);
2340 2414
2341 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); 2415 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
2342 if (ret || !mem) 2416 if (ret || !mem)
2343 return ret; 2417 return ret;
2344 2418
2345 __mem_cgroup_commit_charge(mem, pc, ctype, page_size); 2419 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
2346 return 0; 2420 return 0;
2347} 2421}
2348 2422
@@ -2370,9 +2444,26 @@ static void
2370__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2444__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2371 enum charge_type ctype); 2445 enum charge_type ctype);
2372 2446
2447static void
2448__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
2449 enum charge_type ctype)
2450{
2451 struct page_cgroup *pc = lookup_page_cgroup(page);
2452 /*
2453 * In some case, SwapCache, FUSE(splice_buf->radixtree), the page
2454 * is already on LRU. It means the page may on some other page_cgroup's
2455 * LRU. Take care of it.
2456 */
2457 mem_cgroup_lru_del_before_commit(page);
2458 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
2459 mem_cgroup_lru_add_after_commit(page);
2460 return;
2461}
2462
2373int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2463int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2374 gfp_t gfp_mask) 2464 gfp_t gfp_mask)
2375{ 2465{
2466 struct mem_cgroup *mem = NULL;
2376 int ret; 2467 int ret;
2377 2468
2378 if (mem_cgroup_disabled()) 2469 if (mem_cgroup_disabled())
@@ -2407,14 +2498,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2407 if (unlikely(!mm)) 2498 if (unlikely(!mm))
2408 mm = &init_mm; 2499 mm = &init_mm;
2409 2500
2410 if (page_is_file_cache(page)) 2501 if (page_is_file_cache(page)) {
2411 return mem_cgroup_charge_common(page, mm, gfp_mask, 2502 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
2412 MEM_CGROUP_CHARGE_TYPE_CACHE); 2503 if (ret || !mem)
2504 return ret;
2413 2505
2506 /*
2507 * FUSE reuses pages without going through the final
2508 * put that would remove them from the LRU list, make
2509 * sure that they get relinked properly.
2510 */
2511 __mem_cgroup_commit_charge_lrucare(page, mem,
2512 MEM_CGROUP_CHARGE_TYPE_CACHE);
2513 return ret;
2514 }
2414 /* shmem */ 2515 /* shmem */
2415 if (PageSwapCache(page)) { 2516 if (PageSwapCache(page)) {
2416 struct mem_cgroup *mem = NULL;
2417
2418 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2517 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2419 if (!ret) 2518 if (!ret)
2420 __mem_cgroup_commit_charge_swapin(page, mem, 2519 __mem_cgroup_commit_charge_swapin(page, mem,
@@ -2439,6 +2538,8 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2439 struct mem_cgroup *mem; 2538 struct mem_cgroup *mem;
2440 int ret; 2539 int ret;
2441 2540
2541 *ptr = NULL;
2542
2442 if (mem_cgroup_disabled()) 2543 if (mem_cgroup_disabled())
2443 return 0; 2544 return 0;
2444 2545
@@ -2456,30 +2557,26 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2456 if (!mem) 2557 if (!mem)
2457 goto charge_cur_mm; 2558 goto charge_cur_mm;
2458 *ptr = mem; 2559 *ptr = mem;
2459 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); 2560 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
2460 css_put(&mem->css); 2561 css_put(&mem->css);
2461 return ret; 2562 return ret;
2462charge_cur_mm: 2563charge_cur_mm:
2463 if (unlikely(!mm)) 2564 if (unlikely(!mm))
2464 mm = &init_mm; 2565 mm = &init_mm;
2465 return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); 2566 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
2466} 2567}
2467 2568
2468static void 2569static void
2469__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2570__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2470 enum charge_type ctype) 2571 enum charge_type ctype)
2471{ 2572{
2472 struct page_cgroup *pc;
2473
2474 if (mem_cgroup_disabled()) 2573 if (mem_cgroup_disabled())
2475 return; 2574 return;
2476 if (!ptr) 2575 if (!ptr)
2477 return; 2576 return;
2478 cgroup_exclude_rmdir(&ptr->css); 2577 cgroup_exclude_rmdir(&ptr->css);
2479 pc = lookup_page_cgroup(page); 2578
2480 mem_cgroup_lru_del_before_commit_swapcache(page); 2579 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype);
2481 __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
2482 mem_cgroup_lru_add_after_commit_swapcache(page);
2483 /* 2580 /*
2484 * Now swap is on-memory. This means this page may be 2581 * Now swap is on-memory. This means this page may be
2485 * counted both as mem and swap....double count. 2582 * counted both as mem and swap....double count.
@@ -2527,15 +2624,16 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
2527 return; 2624 return;
2528 if (!mem) 2625 if (!mem)
2529 return; 2626 return;
2530 mem_cgroup_cancel_charge(mem, PAGE_SIZE); 2627 __mem_cgroup_cancel_charge(mem, 1);
2531} 2628}
2532 2629
2533static void 2630static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
2534__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, 2631 unsigned int nr_pages,
2535 int page_size) 2632 const enum charge_type ctype)
2536{ 2633{
2537 struct memcg_batch_info *batch = NULL; 2634 struct memcg_batch_info *batch = NULL;
2538 bool uncharge_memsw = true; 2635 bool uncharge_memsw = true;
2636
2539 /* If swapout, usage of swap doesn't decrease */ 2637 /* If swapout, usage of swap doesn't decrease */
2540 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2638 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2541 uncharge_memsw = false; 2639 uncharge_memsw = false;
@@ -2550,7 +2648,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
2550 batch->memcg = mem; 2648 batch->memcg = mem;
2551 /* 2649 /*
2552 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 2650 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
2553 * In those cases, all pages freed continously can be expected to be in 2651 * In those cases, all pages freed continuously can be expected to be in
2554 * the same cgroup and we have chance to coalesce uncharges. 2652 * the same cgroup and we have chance to coalesce uncharges.
2555 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) 2653 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
2556 * because we want to do uncharge as soon as possible. 2654 * because we want to do uncharge as soon as possible.
@@ -2559,7 +2657,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
2559 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2657 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2560 goto direct_uncharge; 2658 goto direct_uncharge;
2561 2659
2562 if (page_size != PAGE_SIZE) 2660 if (nr_pages > 1)
2563 goto direct_uncharge; 2661 goto direct_uncharge;
2564 2662
2565 /* 2663 /*
@@ -2570,14 +2668,14 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
2570 if (batch->memcg != mem) 2668 if (batch->memcg != mem)
2571 goto direct_uncharge; 2669 goto direct_uncharge;
2572 /* remember freed charge and uncharge it later */ 2670 /* remember freed charge and uncharge it later */
2573 batch->bytes += PAGE_SIZE; 2671 batch->nr_pages++;
2574 if (uncharge_memsw) 2672 if (uncharge_memsw)
2575 batch->memsw_bytes += PAGE_SIZE; 2673 batch->memsw_nr_pages++;
2576 return; 2674 return;
2577direct_uncharge: 2675direct_uncharge:
2578 res_counter_uncharge(&mem->res, page_size); 2676 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE);
2579 if (uncharge_memsw) 2677 if (uncharge_memsw)
2580 res_counter_uncharge(&mem->memsw, page_size); 2678 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE);
2581 if (unlikely(batch->memcg != mem)) 2679 if (unlikely(batch->memcg != mem))
2582 memcg_oom_recover(mem); 2680 memcg_oom_recover(mem);
2583 return; 2681 return;
@@ -2589,10 +2687,9 @@ direct_uncharge:
2589static struct mem_cgroup * 2687static struct mem_cgroup *
2590__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2688__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2591{ 2689{
2592 int count;
2593 struct page_cgroup *pc;
2594 struct mem_cgroup *mem = NULL; 2690 struct mem_cgroup *mem = NULL;
2595 int page_size = PAGE_SIZE; 2691 unsigned int nr_pages = 1;
2692 struct page_cgroup *pc;
2596 2693
2597 if (mem_cgroup_disabled()) 2694 if (mem_cgroup_disabled())
2598 return NULL; 2695 return NULL;
@@ -2601,11 +2698,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2601 return NULL; 2698 return NULL;
2602 2699
2603 if (PageTransHuge(page)) { 2700 if (PageTransHuge(page)) {
2604 page_size <<= compound_order(page); 2701 nr_pages <<= compound_order(page);
2605 VM_BUG_ON(!PageTransHuge(page)); 2702 VM_BUG_ON(!PageTransHuge(page));
2606 } 2703 }
2607
2608 count = page_size >> PAGE_SHIFT;
2609 /* 2704 /*
2610 * Check if our page_cgroup is valid 2705 * Check if our page_cgroup is valid
2611 */ 2706 */
@@ -2638,7 +2733,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2638 break; 2733 break;
2639 } 2734 }
2640 2735
2641 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count); 2736 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages);
2642 2737
2643 ClearPageCgroupUsed(pc); 2738 ClearPageCgroupUsed(pc);
2644 /* 2739 /*
@@ -2659,7 +2754,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2659 mem_cgroup_get(mem); 2754 mem_cgroup_get(mem);
2660 } 2755 }
2661 if (!mem_cgroup_is_root(mem)) 2756 if (!mem_cgroup_is_root(mem))
2662 __do_uncharge(mem, ctype, page_size); 2757 mem_cgroup_do_uncharge(mem, nr_pages, ctype);
2663 2758
2664 return mem; 2759 return mem;
2665 2760
@@ -2699,8 +2794,8 @@ void mem_cgroup_uncharge_start(void)
2699 /* We can do nest. */ 2794 /* We can do nest. */
2700 if (current->memcg_batch.do_batch == 1) { 2795 if (current->memcg_batch.do_batch == 1) {
2701 current->memcg_batch.memcg = NULL; 2796 current->memcg_batch.memcg = NULL;
2702 current->memcg_batch.bytes = 0; 2797 current->memcg_batch.nr_pages = 0;
2703 current->memcg_batch.memsw_bytes = 0; 2798 current->memcg_batch.memsw_nr_pages = 0;
2704 } 2799 }
2705} 2800}
2706 2801
@@ -2721,10 +2816,12 @@ void mem_cgroup_uncharge_end(void)
2721 * This "batch->memcg" is valid without any css_get/put etc... 2816 * This "batch->memcg" is valid without any css_get/put etc...
2722 * bacause we hide charges behind us. 2817 * bacause we hide charges behind us.
2723 */ 2818 */
2724 if (batch->bytes) 2819 if (batch->nr_pages)
2725 res_counter_uncharge(&batch->memcg->res, batch->bytes); 2820 res_counter_uncharge(&batch->memcg->res,
2726 if (batch->memsw_bytes) 2821 batch->nr_pages * PAGE_SIZE);
2727 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); 2822 if (batch->memsw_nr_pages)
2823 res_counter_uncharge(&batch->memcg->memsw,
2824 batch->memsw_nr_pages * PAGE_SIZE);
2728 memcg_oom_recover(batch->memcg); 2825 memcg_oom_recover(batch->memcg);
2729 /* forget this pointer (for sanity check) */ 2826 /* forget this pointer (for sanity check) */
2730 batch->memcg = NULL; 2827 batch->memcg = NULL;
@@ -2847,13 +2944,15 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2847 * page belongs to. 2944 * page belongs to.
2848 */ 2945 */
2849int mem_cgroup_prepare_migration(struct page *page, 2946int mem_cgroup_prepare_migration(struct page *page,
2850 struct page *newpage, struct mem_cgroup **ptr) 2947 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
2851{ 2948{
2852 struct page_cgroup *pc;
2853 struct mem_cgroup *mem = NULL; 2949 struct mem_cgroup *mem = NULL;
2950 struct page_cgroup *pc;
2854 enum charge_type ctype; 2951 enum charge_type ctype;
2855 int ret = 0; 2952 int ret = 0;
2856 2953
2954 *ptr = NULL;
2955
2857 VM_BUG_ON(PageTransHuge(page)); 2956 VM_BUG_ON(PageTransHuge(page));
2858 if (mem_cgroup_disabled()) 2957 if (mem_cgroup_disabled())
2859 return 0; 2958 return 0;
@@ -2904,7 +3003,7 @@ int mem_cgroup_prepare_migration(struct page *page,
2904 return 0; 3003 return 0;
2905 3004
2906 *ptr = mem; 3005 *ptr = mem;
2907 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE); 3006 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
2908 css_put(&mem->css);/* drop extra refcnt */ 3007 css_put(&mem->css);/* drop extra refcnt */
2909 if (ret || *ptr == NULL) { 3008 if (ret || *ptr == NULL) {
2910 if (PageAnon(page)) { 3009 if (PageAnon(page)) {
@@ -2931,7 +3030,7 @@ int mem_cgroup_prepare_migration(struct page *page,
2931 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3030 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2932 else 3031 else
2933 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3032 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2934 __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); 3033 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
2935 return ret; 3034 return ret;
2936} 3035}
2937 3036
@@ -2996,7 +3095,7 @@ int mem_cgroup_shmem_charge_fallback(struct page *page,
2996 struct mm_struct *mm, 3095 struct mm_struct *mm,
2997 gfp_t gfp_mask) 3096 gfp_t gfp_mask)
2998{ 3097{
2999 struct mem_cgroup *mem = NULL; 3098 struct mem_cgroup *mem;
3000 int ret; 3099 int ret;
3001 3100
3002 if (mem_cgroup_disabled()) 3101 if (mem_cgroup_disabled())
@@ -3009,6 +3108,52 @@ int mem_cgroup_shmem_charge_fallback(struct page *page,
3009 return ret; 3108 return ret;
3010} 3109}
3011 3110
3111#ifdef CONFIG_DEBUG_VM
3112static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3113{
3114 struct page_cgroup *pc;
3115
3116 pc = lookup_page_cgroup(page);
3117 if (likely(pc) && PageCgroupUsed(pc))
3118 return pc;
3119 return NULL;
3120}
3121
3122bool mem_cgroup_bad_page_check(struct page *page)
3123{
3124 if (mem_cgroup_disabled())
3125 return false;
3126
3127 return lookup_page_cgroup_used(page) != NULL;
3128}
3129
3130void mem_cgroup_print_bad_page(struct page *page)
3131{
3132 struct page_cgroup *pc;
3133
3134 pc = lookup_page_cgroup_used(page);
3135 if (pc) {
3136 int ret = -1;
3137 char *path;
3138
3139 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
3140 pc, pc->flags, pc->mem_cgroup);
3141
3142 path = kmalloc(PATH_MAX, GFP_KERNEL);
3143 if (path) {
3144 rcu_read_lock();
3145 ret = cgroup_path(pc->mem_cgroup->css.cgroup,
3146 path, PATH_MAX);
3147 rcu_read_unlock();
3148 }
3149
3150 printk(KERN_CONT "(%s)\n",
3151 (ret < 0) ? "cannot get the path" : path);
3152 kfree(path);
3153 }
3154}
3155#endif
3156
3012static DEFINE_MUTEX(set_limit_mutex); 3157static DEFINE_MUTEX(set_limit_mutex);
3013 3158
3014static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3159static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
@@ -3252,6 +3397,8 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3252 loop += 256; 3397 loop += 256;
3253 busy = NULL; 3398 busy = NULL;
3254 while (loop--) { 3399 while (loop--) {
3400 struct page *page;
3401
3255 ret = 0; 3402 ret = 0;
3256 spin_lock_irqsave(&zone->lru_lock, flags); 3403 spin_lock_irqsave(&zone->lru_lock, flags);
3257 if (list_empty(list)) { 3404 if (list_empty(list)) {
@@ -3267,7 +3414,9 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3267 } 3414 }
3268 spin_unlock_irqrestore(&zone->lru_lock, flags); 3415 spin_unlock_irqrestore(&zone->lru_lock, flags);
3269 3416
3270 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); 3417 page = lookup_cgroup_page(pc);
3418
3419 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
3271 if (ret == -ENOMEM) 3420 if (ret == -ENOMEM)
3272 break; 3421 break;
3273 3422
@@ -3415,13 +3564,13 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3415} 3564}
3416 3565
3417 3566
3418static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 3567static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
3419 enum mem_cgroup_stat_index idx) 3568 enum mem_cgroup_stat_index idx)
3420{ 3569{
3421 struct mem_cgroup *iter; 3570 struct mem_cgroup *iter;
3422 s64 val = 0; 3571 long val = 0;
3423 3572
3424 /* each per cpu's value can be minus.Then, use s64 */ 3573 /* Per-cpu values can be negative, use a signed accumulator */
3425 for_each_mem_cgroup_tree(iter, mem) 3574 for_each_mem_cgroup_tree(iter, mem)
3426 val += mem_cgroup_read_stat(iter, idx); 3575 val += mem_cgroup_read_stat(iter, idx);
3427 3576
@@ -3441,12 +3590,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3441 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3590 return res_counter_read_u64(&mem->memsw, RES_USAGE);
3442 } 3591 }
3443 3592
3444 val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE); 3593 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
3445 val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS); 3594 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);
3446 3595
3447 if (swap) 3596 if (swap)
3448 val += mem_cgroup_get_recursive_idx_stat(mem, 3597 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3449 MEM_CGROUP_STAT_SWAPOUT);
3450 3598
3451 return val << PAGE_SHIFT; 3599 return val << PAGE_SHIFT;
3452} 3600}
@@ -3666,9 +3814,9 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3666 s->stat[MCS_RSS] += val * PAGE_SIZE; 3814 s->stat[MCS_RSS] += val * PAGE_SIZE;
3667 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 3815 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
3668 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 3816 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
3669 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); 3817 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
3670 s->stat[MCS_PGPGIN] += val; 3818 s->stat[MCS_PGPGIN] += val;
3671 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); 3819 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
3672 s->stat[MCS_PGPGOUT] += val; 3820 s->stat[MCS_PGPGOUT] += val;
3673 if (do_swap_account) { 3821 if (do_swap_account) {
3674 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3822 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
@@ -3792,9 +3940,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3792 return -EINVAL; 3940 return -EINVAL;
3793 } 3941 }
3794 3942
3795 spin_lock(&memcg->reclaim_param_lock);
3796 memcg->swappiness = val; 3943 memcg->swappiness = val;
3797 spin_unlock(&memcg->reclaim_param_lock);
3798 3944
3799 cgroup_unlock(); 3945 cgroup_unlock();
3800 3946
@@ -4450,7 +4596,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4450 res_counter_init(&mem->memsw, NULL); 4596 res_counter_init(&mem->memsw, NULL);
4451 } 4597 }
4452 mem->last_scanned_child = 0; 4598 mem->last_scanned_child = 0;
4453 spin_lock_init(&mem->reclaim_param_lock);
4454 INIT_LIST_HEAD(&mem->oom_notify); 4599 INIT_LIST_HEAD(&mem->oom_notify);
4455 4600
4456 if (parent) 4601 if (parent)
@@ -4538,8 +4683,7 @@ one_by_one:
4538 batch_count = PRECHARGE_COUNT_AT_ONCE; 4683 batch_count = PRECHARGE_COUNT_AT_ONCE;
4539 cond_resched(); 4684 cond_resched();
4540 } 4685 }
4541 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, 4686 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false);
4542 PAGE_SIZE);
4543 if (ret || !mem) 4687 if (ret || !mem)
4544 /* mem_cgroup_clear_mc() will do uncharge later */ 4688 /* mem_cgroup_clear_mc() will do uncharge later */
4545 return -ENOMEM; 4689 return -ENOMEM;
@@ -4701,7 +4845,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4701 pte_t *pte; 4845 pte_t *pte;
4702 spinlock_t *ptl; 4846 spinlock_t *ptl;
4703 4847
4704 VM_BUG_ON(pmd_trans_huge(*pmd)); 4848 split_huge_page_pmd(walk->mm, pmd);
4849
4705 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4850 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4706 for (; addr != end; pte++, addr += PAGE_SIZE) 4851 for (; addr != end; pte++, addr += PAGE_SIZE)
4707 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 4852 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
@@ -4863,8 +5008,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4863 pte_t *pte; 5008 pte_t *pte;
4864 spinlock_t *ptl; 5009 spinlock_t *ptl;
4865 5010
5011 split_huge_page_pmd(walk->mm, pmd);
4866retry: 5012retry:
4867 VM_BUG_ON(pmd_trans_huge(*pmd));
4868 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5013 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4869 for (; addr != end; addr += PAGE_SIZE) { 5014 for (; addr != end; addr += PAGE_SIZE) {
4870 pte_t ptent = *(pte++); 5015 pte_t ptent = *(pte++);
@@ -4884,8 +5029,8 @@ retry:
4884 if (isolate_lru_page(page)) 5029 if (isolate_lru_page(page))
4885 goto put; 5030 goto put;
4886 pc = lookup_page_cgroup(page); 5031 pc = lookup_page_cgroup(page);
4887 if (!mem_cgroup_move_account(pc, 5032 if (!mem_cgroup_move_account(page, 1, pc,
4888 mc.from, mc.to, false, PAGE_SIZE)) { 5033 mc.from, mc.to, false)) {
4889 mc.precharge--; 5034 mc.precharge--;
4890 /* we uncharge from mc.from later. */ 5035 /* we uncharge from mc.from later. */
4891 mc.moved_charge++; 5036 mc.moved_charge++;
@@ -5024,9 +5169,9 @@ struct cgroup_subsys mem_cgroup_subsys = {
5024static int __init enable_swap_account(char *s) 5169static int __init enable_swap_account(char *s)
5025{ 5170{
5026 /* consider enabled if no parameter or 1 is given */ 5171 /* consider enabled if no parameter or 1 is given */
5027 if (!s || !strcmp(s, "1")) 5172 if (!(*s) || !strcmp(s, "=1"))
5028 really_do_swap_account = 1; 5173 really_do_swap_account = 1;
5029 else if (!strcmp(s, "0")) 5174 else if (!strcmp(s, "=0"))
5030 really_do_swap_account = 0; 5175 really_do_swap_account = 0;
5031 return 1; 5176 return 1;
5032} 5177}
@@ -5034,7 +5179,8 @@ __setup("swapaccount", enable_swap_account);
5034 5179
5035static int __init disable_swap_account(char *s) 5180static int __init disable_swap_account(char *s)
5036{ 5181{
5037 enable_swap_account("0"); 5182 printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
5183 enable_swap_account("=0");
5038 return 1; 5184 return 1;
5039} 5185}
5040__setup("noswapaccount", disable_swap_account); 5186__setup("noswapaccount", disable_swap_account);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 548fbd70f026..2b9a5eef39e0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -208,7 +208,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
208 * Don't use force here, it's convenient if the signal 208 * Don't use force here, it's convenient if the signal
209 * can be temporarily blocked. 209 * can be temporarily blocked.
210 * This could cause a loop when the user sets SIGBUS 210 * This could cause a loop when the user sets SIGBUS
211 * to SIG_IGN, but hopefully noone will do that? 211 * to SIG_IGN, but hopefully no one will do that?
212 */ 212 */
213 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ 213 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
214 if (ret < 0) 214 if (ret < 0)
@@ -233,8 +233,8 @@ void shake_page(struct page *p, int access)
233 } 233 }
234 234
235 /* 235 /*
236 * Only all shrink_slab here (which would also 236 * Only call shrink_slab here (which would also shrink other caches) if
237 * shrink other caches) if access is not potentially fatal. 237 * access is not potentially fatal.
238 */ 238 */
239 if (access) { 239 if (access) {
240 int nr; 240 int nr;
@@ -386,8 +386,6 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
386 struct task_struct *tsk; 386 struct task_struct *tsk;
387 struct anon_vma *av; 387 struct anon_vma *av;
388 388
389 if (!PageHuge(page) && unlikely(split_huge_page(page)))
390 return;
391 read_lock(&tasklist_lock); 389 read_lock(&tasklist_lock);
392 av = page_lock_anon_vma(page); 390 av = page_lock_anon_vma(page);
393 if (av == NULL) /* Not actually mapped anymore */ 391 if (av == NULL) /* Not actually mapped anymore */
@@ -636,7 +634,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
636 * when the page is reread or dropped. If an 634 * when the page is reread or dropped. If an
637 * application assumes it will always get error on 635 * application assumes it will always get error on
638 * fsync, but does other operations on the fd before 636 * fsync, but does other operations on the fd before
639 * and the page is dropped inbetween then the error 637 * and the page is dropped between then the error
640 * will not be properly reported. 638 * will not be properly reported.
641 * 639 *
642 * This can already happen even without hwpoisoned 640 * This can already happen even without hwpoisoned
@@ -730,7 +728,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
730 * The table matches them in order and calls the right handler. 728 * The table matches them in order and calls the right handler.
731 * 729 *
732 * This is quite tricky because we can access page at any time 730 * This is quite tricky because we can access page at any time
733 * in its live cycle, so all accesses have to be extremly careful. 731 * in its live cycle, so all accesses have to be extremely careful.
734 * 732 *
735 * This is not complete. More states could be added. 733 * This is not complete. More states could be added.
736 * For any missing state don't attempt recovery. 734 * For any missing state don't attempt recovery.
@@ -856,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
856 int ret; 854 int ret;
857 int kill = 1; 855 int kill = 1;
858 struct page *hpage = compound_head(p); 856 struct page *hpage = compound_head(p);
857 struct page *ppage;
859 858
860 if (PageReserved(p) || PageSlab(p)) 859 if (PageReserved(p) || PageSlab(p))
861 return SWAP_SUCCESS; 860 return SWAP_SUCCESS;
@@ -897,6 +896,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
897 } 896 }
898 897
899 /* 898 /*
899 * ppage: poisoned page
900 * if p is regular page(4k page)
901 * ppage == real poisoned page;
902 * else p is hugetlb or THP, ppage == head page.
903 */
904 ppage = hpage;
905
906 if (PageTransHuge(hpage)) {
907 /*
908 * Verify that this isn't a hugetlbfs head page, the check for
909 * PageAnon is just for avoid tripping a split_huge_page
910 * internal debug check, as split_huge_page refuses to deal with
911 * anything that isn't an anon page. PageAnon can't go away fro
912 * under us because we hold a refcount on the hpage, without a
913 * refcount on the hpage. split_huge_page can't be safely called
914 * in the first place, having a refcount on the tail isn't
915 * enough * to be safe.
916 */
917 if (!PageHuge(hpage) && PageAnon(hpage)) {
918 if (unlikely(split_huge_page(hpage))) {
919 /*
920 * FIXME: if splitting THP is failed, it is
921 * better to stop the following operation rather
922 * than causing panic by unmapping. System might
923 * survive if the page is freed later.
924 */
925 printk(KERN_INFO
926 "MCE %#lx: failed to split THP\n", pfn);
927
928 BUG_ON(!PageHWPoison(p));
929 return SWAP_FAIL;
930 }
931 /* THP is split, so ppage should be the real poisoned page. */
932 ppage = p;
933 }
934 }
935
936 /*
900 * First collect all the processes that have the page 937 * First collect all the processes that have the page
901 * mapped in dirty form. This has to be done before try_to_unmap, 938 * mapped in dirty form. This has to be done before try_to_unmap,
902 * because ttu takes the rmap data structures down. 939 * because ttu takes the rmap data structures down.
@@ -905,12 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
905 * there's nothing that can be done. 942 * there's nothing that can be done.
906 */ 943 */
907 if (kill) 944 if (kill)
908 collect_procs(hpage, &tokill); 945 collect_procs(ppage, &tokill);
946
947 if (hpage != ppage)
948 lock_page(ppage);
909 949
910 ret = try_to_unmap(hpage, ttu); 950 ret = try_to_unmap(ppage, ttu);
911 if (ret != SWAP_SUCCESS) 951 if (ret != SWAP_SUCCESS)
912 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 952 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
913 pfn, page_mapcount(hpage)); 953 pfn, page_mapcount(ppage));
954
955 if (hpage != ppage)
956 unlock_page(ppage);
914 957
915 /* 958 /*
916 * Now that the dirty bit has been propagated to the 959 * Now that the dirty bit has been propagated to the
@@ -921,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
921 * use a more force-full uncatchable kill to prevent 964 * use a more force-full uncatchable kill to prevent
922 * any accesses to the poisoned memory. 965 * any accesses to the poisoned memory.
923 */ 966 */
924 kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, 967 kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
925 ret != SWAP_SUCCESS, p, pfn); 968 ret != SWAP_SUCCESS, p, pfn);
926 969
927 return ret; 970 return ret;
@@ -995,7 +1038,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
995 * Check "just unpoisoned", "filter hit", and 1038 * Check "just unpoisoned", "filter hit", and
996 * "race with other subpage." 1039 * "race with other subpage."
997 */ 1040 */
998 lock_page_nosync(hpage); 1041 lock_page(hpage);
999 if (!PageHWPoison(hpage) 1042 if (!PageHWPoison(hpage)
1000 || (hwpoison_filter(p) && TestClearPageHWPoison(p)) 1043 || (hwpoison_filter(p) && TestClearPageHWPoison(p))
1001 || (p != hpage && TestSetPageHWPoison(hpage))) { 1044 || (p != hpage && TestSetPageHWPoison(hpage))) {
@@ -1022,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1022 * The check (unnecessarily) ignores LRU pages being isolated and 1065 * The check (unnecessarily) ignores LRU pages being isolated and
1023 * walked by the page reclaim code, however that's not a big loss. 1066 * walked by the page reclaim code, however that's not a big loss.
1024 */ 1067 */
1025 if (!PageLRU(p) && !PageHuge(p)) 1068 if (!PageHuge(p) && !PageTransCompound(p)) {
1026 shake_page(p, 0); 1069 if (!PageLRU(p))
1027 if (!PageLRU(p) && !PageHuge(p)) { 1070 shake_page(p, 0);
1028 /* 1071 if (!PageLRU(p)) {
1029 * shake_page could have turned it free. 1072 /*
1030 */ 1073 * shake_page could have turned it free.
1031 if (is_free_buddy_page(p)) { 1074 */
1032 action_result(pfn, "free buddy, 2nd try", DELAYED); 1075 if (is_free_buddy_page(p)) {
1033 return 0; 1076 action_result(pfn, "free buddy, 2nd try",
1077 DELAYED);
1078 return 0;
1079 }
1080 action_result(pfn, "non LRU", IGNORED);
1081 put_page(p);
1082 return -EBUSY;
1034 } 1083 }
1035 action_result(pfn, "non LRU", IGNORED);
1036 put_page(p);
1037 return -EBUSY;
1038 } 1084 }
1039 1085
1040 /* 1086 /*
@@ -1042,7 +1088,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1042 * It's very difficult to mess with pages currently under IO 1088 * It's very difficult to mess with pages currently under IO
1043 * and in many cases impossible, so we just avoid it here. 1089 * and in many cases impossible, so we just avoid it here.
1044 */ 1090 */
1045 lock_page_nosync(hpage); 1091 lock_page(hpage);
1046 1092
1047 /* 1093 /*
1048 * unpoison always clear PG_hwpoison inside page lock 1094 * unpoison always clear PG_hwpoison inside page lock
@@ -1064,7 +1110,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1064 * For error on the tail page, we should set PG_hwpoison 1110 * For error on the tail page, we should set PG_hwpoison
1065 * on the head page to show that the hugepage is hwpoisoned 1111 * on the head page to show that the hugepage is hwpoisoned
1066 */ 1112 */
1067 if (PageTail(p) && TestSetPageHWPoison(hpage)) { 1113 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1068 action_result(pfn, "hugepage already hardware poisoned", 1114 action_result(pfn, "hugepage already hardware poisoned",
1069 IGNORED); 1115 IGNORED);
1070 unlock_page(hpage); 1116 unlock_page(hpage);
@@ -1084,7 +1130,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1084 1130
1085 /* 1131 /*
1086 * Now take care of user space mappings. 1132 * Now take care of user space mappings.
1087 * Abort on fail: __remove_from_page_cache() assumes unmapped page. 1133 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1088 */ 1134 */
1089 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { 1135 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
1090 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); 1136 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
@@ -1185,7 +1231,7 @@ int unpoison_memory(unsigned long pfn)
1185 return 0; 1231 return 0;
1186 } 1232 }
1187 1233
1188 lock_page_nosync(page); 1234 lock_page(page);
1189 /* 1235 /*
1190 * This test is racy because PG_hwpoison is set outside of page lock. 1236 * This test is racy because PG_hwpoison is set outside of page lock.
1191 * That's acceptable because that won't trigger kernel panic. Instead, 1237 * That's acceptable because that won't trigger kernel panic. Instead,
@@ -1295,7 +1341,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
1295 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, 1341 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
1296 true); 1342 true);
1297 if (ret) { 1343 if (ret) {
1298 putback_lru_pages(&pagelist); 1344 struct page *page1, *page2;
1345 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1346 put_page(page1);
1347
1299 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1348 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1300 pfn, ret, page->flags); 1349 pfn, ret, page->flags);
1301 if (ret > 0) 1350 if (ret > 0)
@@ -1419,6 +1468,7 @@ int soft_offline_page(struct page *page, int flags)
1419 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1468 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1420 0, true); 1469 0, true);
1421 if (ret) { 1470 if (ret) {
1471 putback_lru_pages(&pagelist);
1422 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1472 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1423 pfn, ret, page->flags); 1473 pfn, ret, page->flags);
1424 if (ret > 0) 1474 if (ret > 0)
@@ -1437,35 +1487,3 @@ done:
1437 /* keep elevated page count for bad page */ 1487 /* keep elevated page count for bad page */
1438 return ret; 1488 return ret;
1439} 1489}
1440
1441/*
1442 * The caller must hold current->mm->mmap_sem in read mode.
1443 */
1444int is_hwpoison_address(unsigned long addr)
1445{
1446 pgd_t *pgdp;
1447 pud_t pud, *pudp;
1448 pmd_t pmd, *pmdp;
1449 pte_t pte, *ptep;
1450 swp_entry_t entry;
1451
1452 pgdp = pgd_offset(current->mm, addr);
1453 if (!pgd_present(*pgdp))
1454 return 0;
1455 pudp = pud_offset(pgdp, addr);
1456 pud = *pudp;
1457 if (!pud_present(pud) || pud_large(pud))
1458 return 0;
1459 pmdp = pmd_offset(pudp, addr);
1460 pmd = *pmdp;
1461 if (!pmd_present(pmd) || pmd_large(pmd))
1462 return 0;
1463 ptep = pte_offset_map(pmdp, addr);
1464 pte = *ptep;
1465 pte_unmap(ptep);
1466 if (!is_swap_pte(pte))
1467 return 0;
1468 entry = pte_to_swp_entry(pte);
1469 return is_hwpoison_entry(entry);
1470}
1471EXPORT_SYMBOL_GPL(is_hwpoison_address);
diff --git a/mm/memory.c b/mm/memory.c
index 31250faff390..ce22a250926f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1410,6 +1410,62 @@ no_page_table:
1410 return page; 1410 return page;
1411} 1411}
1412 1412
1413static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
1414{
1415 return (vma->vm_flags & VM_GROWSDOWN) &&
1416 (vma->vm_start == addr) &&
1417 !vma_stack_continue(vma->vm_prev, addr);
1418}
1419
1420/**
1421 * __get_user_pages() - pin user pages in memory
1422 * @tsk: task_struct of target task
1423 * @mm: mm_struct of target mm
1424 * @start: starting user address
1425 * @nr_pages: number of pages from start to pin
1426 * @gup_flags: flags modifying pin behaviour
1427 * @pages: array that receives pointers to the pages pinned.
1428 * Should be at least nr_pages long. Or NULL, if caller
1429 * only intends to ensure the pages are faulted in.
1430 * @vmas: array of pointers to vmas corresponding to each page.
1431 * Or NULL if the caller does not require them.
1432 * @nonblocking: whether waiting for disk IO or mmap_sem contention
1433 *
1434 * Returns number of pages pinned. This may be fewer than the number
1435 * requested. If nr_pages is 0 or negative, returns 0. If no pages
1436 * were pinned, returns -errno. Each page returned must be released
1437 * with a put_page() call when it is finished with. vmas will only
1438 * remain valid while mmap_sem is held.
1439 *
1440 * Must be called with mmap_sem held for read or write.
1441 *
1442 * __get_user_pages walks a process's page tables and takes a reference to
1443 * each struct page that each user address corresponds to at a given
1444 * instant. That is, it takes the page that would be accessed if a user
1445 * thread accesses the given user virtual address at that instant.
1446 *
1447 * This does not guarantee that the page exists in the user mappings when
1448 * __get_user_pages returns, and there may even be a completely different
1449 * page there in some cases (eg. if mmapped pagecache has been invalidated
1450 * and subsequently re faulted). However it does guarantee that the page
1451 * won't be freed completely. And mostly callers simply care that the page
1452 * contains data that was valid *at some point in time*. Typically, an IO
1453 * or similar operation cannot guarantee anything stronger anyway because
1454 * locks can't be held over the syscall boundary.
1455 *
1456 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
1457 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
1458 * appropriate) must be called after the page is finished with, and
1459 * before put_page is called.
1460 *
1461 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
1462 * or mmap_sem contention, and if waiting is needed to pin all pages,
1463 * *@nonblocking will be set to 0.
1464 *
1465 * In most cases, get_user_pages or get_user_pages_fast should be used
1466 * instead of __get_user_pages. __get_user_pages should be used only if
1467 * you need some special @gup_flags.
1468 */
1413int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1469int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1414 unsigned long start, int nr_pages, unsigned int gup_flags, 1470 unsigned long start, int nr_pages, unsigned int gup_flags,
1415 struct page **pages, struct vm_area_struct **vmas, 1471 struct page **pages, struct vm_area_struct **vmas,
@@ -1437,9 +1493,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1437 struct vm_area_struct *vma; 1493 struct vm_area_struct *vma;
1438 1494
1439 vma = find_extend_vma(mm, start); 1495 vma = find_extend_vma(mm, start);
1440 if (!vma && in_gate_area(tsk, start)) { 1496 if (!vma && in_gate_area(mm, start)) {
1441 unsigned long pg = start & PAGE_MASK; 1497 unsigned long pg = start & PAGE_MASK;
1442 struct vm_area_struct *gate_vma = get_gate_vma(tsk);
1443 pgd_t *pgd; 1498 pgd_t *pgd;
1444 pud_t *pud; 1499 pud_t *pud;
1445 pmd_t *pmd; 1500 pmd_t *pmd;
@@ -1464,10 +1519,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1464 pte_unmap(pte); 1519 pte_unmap(pte);
1465 return i ? : -EFAULT; 1520 return i ? : -EFAULT;
1466 } 1521 }
1522 vma = get_gate_vma(mm);
1467 if (pages) { 1523 if (pages) {
1468 struct page *page; 1524 struct page *page;
1469 1525
1470 page = vm_normal_page(gate_vma, start, *pte); 1526 page = vm_normal_page(vma, start, *pte);
1471 if (!page) { 1527 if (!page) {
1472 if (!(gup_flags & FOLL_DUMP) && 1528 if (!(gup_flags & FOLL_DUMP) &&
1473 is_zero_pfn(pte_pfn(*pte))) 1529 is_zero_pfn(pte_pfn(*pte)))
@@ -1481,12 +1537,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1481 get_page(page); 1537 get_page(page);
1482 } 1538 }
1483 pte_unmap(pte); 1539 pte_unmap(pte);
1484 if (vmas) 1540 goto next_page;
1485 vmas[i] = gate_vma;
1486 i++;
1487 start += PAGE_SIZE;
1488 nr_pages--;
1489 continue;
1490 } 1541 }
1491 1542
1492 if (!vma || 1543 if (!vma ||
@@ -1500,6 +1551,13 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1500 continue; 1551 continue;
1501 } 1552 }
1502 1553
1554 /*
1555 * If we don't actually want the page itself,
1556 * and it's the stack guard page, just skip it.
1557 */
1558 if (!pages && stack_guard_page(vma, start))
1559 goto next_page;
1560
1503 do { 1561 do {
1504 struct page *page; 1562 struct page *page;
1505 unsigned int foll_flags = gup_flags; 1563 unsigned int foll_flags = gup_flags;
@@ -1520,6 +1578,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1520 fault_flags |= FAULT_FLAG_WRITE; 1578 fault_flags |= FAULT_FLAG_WRITE;
1521 if (nonblocking) 1579 if (nonblocking)
1522 fault_flags |= FAULT_FLAG_ALLOW_RETRY; 1580 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1581 if (foll_flags & FOLL_NOWAIT)
1582 fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
1523 1583
1524 ret = handle_mm_fault(mm, vma, start, 1584 ret = handle_mm_fault(mm, vma, start,
1525 fault_flags); 1585 fault_flags);
@@ -1527,19 +1587,30 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1527 if (ret & VM_FAULT_ERROR) { 1587 if (ret & VM_FAULT_ERROR) {
1528 if (ret & VM_FAULT_OOM) 1588 if (ret & VM_FAULT_OOM)
1529 return i ? i : -ENOMEM; 1589 return i ? i : -ENOMEM;
1530 if (ret & 1590 if (ret & (VM_FAULT_HWPOISON |
1531 (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE| 1591 VM_FAULT_HWPOISON_LARGE)) {
1532 VM_FAULT_SIGBUS)) 1592 if (i)
1593 return i;
1594 else if (gup_flags & FOLL_HWPOISON)
1595 return -EHWPOISON;
1596 else
1597 return -EFAULT;
1598 }
1599 if (ret & VM_FAULT_SIGBUS)
1533 return i ? i : -EFAULT; 1600 return i ? i : -EFAULT;
1534 BUG(); 1601 BUG();
1535 } 1602 }
1536 if (ret & VM_FAULT_MAJOR) 1603
1537 tsk->maj_flt++; 1604 if (tsk) {
1538 else 1605 if (ret & VM_FAULT_MAJOR)
1539 tsk->min_flt++; 1606 tsk->maj_flt++;
1607 else
1608 tsk->min_flt++;
1609 }
1540 1610
1541 if (ret & VM_FAULT_RETRY) { 1611 if (ret & VM_FAULT_RETRY) {
1542 *nonblocking = 0; 1612 if (nonblocking)
1613 *nonblocking = 0;
1543 return i; 1614 return i;
1544 } 1615 }
1545 1616
@@ -1569,6 +1640,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1569 flush_anon_page(vma, page, start); 1640 flush_anon_page(vma, page, start);
1570 flush_dcache_page(page); 1641 flush_dcache_page(page);
1571 } 1642 }
1643next_page:
1572 if (vmas) 1644 if (vmas)
1573 vmas[i] = vma; 1645 vmas[i] = vma;
1574 i++; 1646 i++;
@@ -1578,10 +1650,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1578 } while (nr_pages); 1650 } while (nr_pages);
1579 return i; 1651 return i;
1580} 1652}
1653EXPORT_SYMBOL(__get_user_pages);
1581 1654
1582/** 1655/**
1583 * get_user_pages() - pin user pages in memory 1656 * get_user_pages() - pin user pages in memory
1584 * @tsk: task_struct of target task 1657 * @tsk: the task_struct to use for page fault accounting, or
1658 * NULL if faults are not to be recorded.
1585 * @mm: mm_struct of target mm 1659 * @mm: mm_struct of target mm
1586 * @start: starting user address 1660 * @start: starting user address
1587 * @nr_pages: number of pages from start to pin 1661 * @nr_pages: number of pages from start to pin
@@ -2115,10 +2189,10 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);
2115 * handle_pte_fault chooses page fault handler according to an entry 2189 * handle_pte_fault chooses page fault handler according to an entry
2116 * which was read non-atomically. Before making any commitment, on 2190 * which was read non-atomically. Before making any commitment, on
2117 * those architectures or configurations (e.g. i386 with PAE) which 2191 * those architectures or configurations (e.g. i386 with PAE) which
2118 * might give a mix of unmatched parts, do_swap_page and do_file_page 2192 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
2119 * must check under lock before unmapping the pte and proceeding 2193 * must check under lock before unmapping the pte and proceeding
2120 * (but do_wp_page is only called after already making such a check; 2194 * (but do_wp_page is only called after already making such a check;
2121 * and do_anonymous_page and do_no_page can safely check later on). 2195 * and do_anonymous_page can safely check later on).
2122 */ 2196 */
2123static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, 2197static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2124 pte_t *page_table, pte_t orig_pte) 2198 pte_t *page_table, pte_t orig_pte)
@@ -2219,7 +2293,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2219 &ptl); 2293 &ptl);
2220 if (!pte_same(*page_table, orig_pte)) { 2294 if (!pte_same(*page_table, orig_pte)) {
2221 unlock_page(old_page); 2295 unlock_page(old_page);
2222 page_cache_release(old_page);
2223 goto unlock; 2296 goto unlock;
2224 } 2297 }
2225 page_cache_release(old_page); 2298 page_cache_release(old_page);
@@ -2289,7 +2362,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2289 &ptl); 2362 &ptl);
2290 if (!pte_same(*page_table, orig_pte)) { 2363 if (!pte_same(*page_table, orig_pte)) {
2291 unlock_page(old_page); 2364 unlock_page(old_page);
2292 page_cache_release(old_page);
2293 goto unlock; 2365 goto unlock;
2294 } 2366 }
2295 2367
@@ -2316,7 +2388,7 @@ reuse:
2316 * bit after it clear all dirty ptes, but before a racing 2388 * bit after it clear all dirty ptes, but before a racing
2317 * do_wp_page installs a dirty pte. 2389 * do_wp_page installs a dirty pte.
2318 * 2390 *
2319 * do_no_page is protected similarly. 2391 * __do_fault is protected similarly.
2320 */ 2392 */
2321 if (!page_mkwrite) { 2393 if (!page_mkwrite) {
2322 wait_on_page_locked(dirty_page); 2394 wait_on_page_locked(dirty_page);
@@ -2367,16 +2439,6 @@ gotten:
2367 } 2439 }
2368 __SetPageUptodate(new_page); 2440 __SetPageUptodate(new_page);
2369 2441
2370 /*
2371 * Don't let another task, with possibly unlocked vma,
2372 * keep the mlocked page.
2373 */
2374 if ((vma->vm_flags & VM_LOCKED) && old_page) {
2375 lock_page(old_page); /* for LRU manipulation */
2376 clear_page_mlock(old_page);
2377 unlock_page(old_page);
2378 }
2379
2380 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2442 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2381 goto oom_free_new; 2443 goto oom_free_new;
2382 2444
@@ -2444,10 +2506,20 @@ gotten:
2444 2506
2445 if (new_page) 2507 if (new_page)
2446 page_cache_release(new_page); 2508 page_cache_release(new_page);
2447 if (old_page)
2448 page_cache_release(old_page);
2449unlock: 2509unlock:
2450 pte_unmap_unlock(page_table, ptl); 2510 pte_unmap_unlock(page_table, ptl);
2511 if (old_page) {
2512 /*
2513 * Don't let another task, with possibly unlocked vma,
2514 * keep the mlocked page.
2515 */
2516 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
2517 lock_page(old_page); /* LRU manipulation */
2518 munlock_vma_page(old_page);
2519 unlock_page(old_page);
2520 }
2521 page_cache_release(old_page);
2522 }
2451 return ret; 2523 return ret;
2452oom_free_new: 2524oom_free_new:
2453 page_cache_release(new_page); 2525 page_cache_release(new_page);
@@ -2650,6 +2722,7 @@ void unmap_mapping_range(struct address_space *mapping,
2650 details.last_index = ULONG_MAX; 2722 details.last_index = ULONG_MAX;
2651 details.i_mmap_lock = &mapping->i_mmap_lock; 2723 details.i_mmap_lock = &mapping->i_mmap_lock;
2652 2724
2725 mutex_lock(&mapping->unmap_mutex);
2653 spin_lock(&mapping->i_mmap_lock); 2726 spin_lock(&mapping->i_mmap_lock);
2654 2727
2655 /* Protect against endless unmapping loops */ 2728 /* Protect against endless unmapping loops */
@@ -2666,6 +2739,7 @@ void unmap_mapping_range(struct address_space *mapping,
2666 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2739 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2667 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2740 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2668 spin_unlock(&mapping->i_mmap_lock); 2741 spin_unlock(&mapping->i_mmap_lock);
2742 mutex_unlock(&mapping->unmap_mutex);
2669} 2743}
2670EXPORT_SYMBOL(unmap_mapping_range); 2744EXPORT_SYMBOL(unmap_mapping_range);
2671 2745
@@ -2707,7 +2781,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2707 swp_entry_t entry; 2781 swp_entry_t entry;
2708 pte_t pte; 2782 pte_t pte;
2709 int locked; 2783 int locked;
2710 struct mem_cgroup *ptr = NULL; 2784 struct mem_cgroup *ptr;
2711 int exclusive = 0; 2785 int exclusive = 0;
2712 int ret = 0; 2786 int ret = 0;
2713 2787
@@ -3053,12 +3127,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3053 goto out; 3127 goto out;
3054 } 3128 }
3055 charged = 1; 3129 charged = 1;
3056 /*
3057 * Don't let another task, with possibly unlocked vma,
3058 * keep the mlocked page.
3059 */
3060 if (vma->vm_flags & VM_LOCKED)
3061 clear_page_mlock(vmf.page);
3062 copy_user_highpage(page, vmf.page, address, vma); 3130 copy_user_highpage(page, vmf.page, address, vma);
3063 __SetPageUptodate(page); 3131 __SetPageUptodate(page);
3064 } else { 3132 } else {
@@ -3445,7 +3513,7 @@ static int __init gate_vma_init(void)
3445__initcall(gate_vma_init); 3513__initcall(gate_vma_init);
3446#endif 3514#endif
3447 3515
3448struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 3516struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
3449{ 3517{
3450#ifdef AT_SYSINFO_EHDR 3518#ifdef AT_SYSINFO_EHDR
3451 return &gate_vma; 3519 return &gate_vma;
@@ -3454,7 +3522,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
3454#endif 3522#endif
3455} 3523}
3456 3524
3457int in_gate_area_no_task(unsigned long addr) 3525int in_gate_area_no_mm(unsigned long addr)
3458{ 3526{
3459#ifdef AT_SYSINFO_EHDR 3527#ifdef AT_SYSINFO_EHDR
3460 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) 3528 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
@@ -3595,20 +3663,15 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3595#endif 3663#endif
3596 3664
3597/* 3665/*
3598 * Access another process' address space. 3666 * Access another process' address space as given in mm. If non-NULL, use the
3599 * Source/target buffer must be kernel space, 3667 * given task for page fault accounting.
3600 * Do not walk the page table directly, use get_user_pages
3601 */ 3668 */
3602int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) 3669static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3670 unsigned long addr, void *buf, int len, int write)
3603{ 3671{
3604 struct mm_struct *mm;
3605 struct vm_area_struct *vma; 3672 struct vm_area_struct *vma;
3606 void *old_buf = buf; 3673 void *old_buf = buf;
3607 3674
3608 mm = get_task_mm(tsk);
3609 if (!mm)
3610 return 0;
3611
3612 down_read(&mm->mmap_sem); 3675 down_read(&mm->mmap_sem);
3613 /* ignore errors, just check how much was successfully transferred */ 3676 /* ignore errors, just check how much was successfully transferred */
3614 while (len) { 3677 while (len) {
@@ -3625,7 +3688,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
3625 */ 3688 */
3626#ifdef CONFIG_HAVE_IOREMAP_PROT 3689#ifdef CONFIG_HAVE_IOREMAP_PROT
3627 vma = find_vma(mm, addr); 3690 vma = find_vma(mm, addr);
3628 if (!vma) 3691 if (!vma || vma->vm_start > addr)
3629 break; 3692 break;
3630 if (vma->vm_ops && vma->vm_ops->access) 3693 if (vma->vm_ops && vma->vm_ops->access)
3631 ret = vma->vm_ops->access(vma, addr, buf, 3694 ret = vma->vm_ops->access(vma, addr, buf,
@@ -3657,11 +3720,47 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
3657 addr += bytes; 3720 addr += bytes;
3658 } 3721 }
3659 up_read(&mm->mmap_sem); 3722 up_read(&mm->mmap_sem);
3660 mmput(mm);
3661 3723
3662 return buf - old_buf; 3724 return buf - old_buf;
3663} 3725}
3664 3726
3727/**
3728 * access_remote_vm - access another process' address space
3729 * @mm: the mm_struct of the target address space
3730 * @addr: start address to access
3731 * @buf: source or destination buffer
3732 * @len: number of bytes to transfer
3733 * @write: whether the access is a write
3734 *
3735 * The caller must hold a reference on @mm.
3736 */
3737int access_remote_vm(struct mm_struct *mm, unsigned long addr,
3738 void *buf, int len, int write)
3739{
3740 return __access_remote_vm(NULL, mm, addr, buf, len, write);
3741}
3742
3743/*
3744 * Access another process' address space.
3745 * Source/target buffer must be kernel space,
3746 * Do not walk the page table directly, use get_user_pages
3747 */
3748int access_process_vm(struct task_struct *tsk, unsigned long addr,
3749 void *buf, int len, int write)
3750{
3751 struct mm_struct *mm;
3752 int ret;
3753
3754 mm = get_task_mm(tsk);
3755 if (!mm)
3756 return 0;
3757
3758 ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
3759 mmput(mm);
3760
3761 return ret;
3762}
3763
3665/* 3764/*
3666 * Print the name of a VMA. 3765 * Print the name of a VMA.
3667 */ 3766 */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 321fc7455df7..9ca1d604f7cd 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -375,7 +375,7 @@ void online_page(struct page *page)
375#endif 375#endif
376 376
377#ifdef CONFIG_FLATMEM 377#ifdef CONFIG_FLATMEM
378 max_mapnr = max(page_to_pfn(page), max_mapnr); 378 max_mapnr = max(pfn, max_mapnr);
379#endif 379#endif
380 380
381 ClearPageReserved(page); 381 ClearPageReserved(page);
@@ -724,7 +724,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
724 pfn); 724 pfn);
725 dump_page(page); 725 dump_page(page);
726#endif 726#endif
727 /* Becasue we don't have big zone->lock. we should 727 /* Because we don't have big zone->lock. we should
728 check this again here. */ 728 check this again here. */
729 if (page_count(page)) { 729 if (page_count(page)) {
730 not_managed++; 730 not_managed++;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 368fc9d23610..959a8b8c7350 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -993,7 +993,7 @@ int do_migrate_pages(struct mm_struct *mm,
993 * most recent <s, d> pair that moved (s != d). If we find a pair 993 * most recent <s, d> pair that moved (s != d). If we find a pair
994 * that not only moved, but what's better, moved to an empty slot 994 * that not only moved, but what's better, moved to an empty slot
995 * (d is not set in tmp), then we break out then, with that pair. 995 * (d is not set in tmp), then we break out then, with that pair.
996 * Otherwise when we finish scannng from_tmp, we at least have the 996 * Otherwise when we finish scanning from_tmp, we at least have the
997 * most recent <s, d> pair that moved. If we get all the way through 997 * most recent <s, d> pair that moved. If we get all the way through
998 * the scan of tmp without finding any node that moved, much less 998 * the scan of tmp without finding any node that moved, much less
999 * moved to an empty node, then there is nothing left worth migrating. 999 * moved to an empty node, then there is nothing left worth migrating.
@@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1524} 1524}
1525 1525
1526/* Return a zonelist indicated by gfp for node representing a mempolicy */ 1526/* Return a zonelist indicated by gfp for node representing a mempolicy */
1527static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) 1527static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1528 int nd)
1528{ 1529{
1529 int nd = numa_node_id();
1530
1531 switch (policy->mode) { 1530 switch (policy->mode) {
1532 case MPOL_PREFERRED: 1531 case MPOL_PREFERRED:
1533 if (!(policy->flags & MPOL_F_LOCAL)) 1532 if (!(policy->flags & MPOL_F_LOCAL))
@@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1679 zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1678 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1680 huge_page_shift(hstate_vma(vma))), gfp_flags); 1679 huge_page_shift(hstate_vma(vma))), gfp_flags);
1681 } else { 1680 } else {
1682 zl = policy_zonelist(gfp_flags, *mpol); 1681 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1683 if ((*mpol)->mode == MPOL_BIND) 1682 if ((*mpol)->mode == MPOL_BIND)
1684 *nodemask = &(*mpol)->v.nodes; 1683 *nodemask = &(*mpol)->v.nodes;
1685 } 1684 }
@@ -1820,7 +1819,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1820 */ 1819 */
1821struct page * 1820struct page *
1822alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 1821alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1823 unsigned long addr) 1822 unsigned long addr, int node)
1824{ 1823{
1825 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1824 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1826 struct zonelist *zl; 1825 struct zonelist *zl;
@@ -1830,13 +1829,13 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1830 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1829 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1831 unsigned nid; 1830 unsigned nid;
1832 1831
1833 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); 1832 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1834 mpol_cond_put(pol); 1833 mpol_cond_put(pol);
1835 page = alloc_page_interleave(gfp, order, nid); 1834 page = alloc_page_interleave(gfp, order, nid);
1836 put_mems_allowed(); 1835 put_mems_allowed();
1837 return page; 1836 return page;
1838 } 1837 }
1839 zl = policy_zonelist(gfp, pol); 1838 zl = policy_zonelist(gfp, pol, node);
1840 if (unlikely(mpol_needs_cond_ref(pol))) { 1839 if (unlikely(mpol_needs_cond_ref(pol))) {
1841 /* 1840 /*
1842 * slow path: ref counted shared policy 1841 * slow path: ref counted shared policy
@@ -1892,7 +1891,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1892 page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); 1891 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1893 else 1892 else
1894 page = __alloc_pages_nodemask(gfp, order, 1893 page = __alloc_pages_nodemask(gfp, order,
1895 policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); 1894 policy_zonelist(gfp, pol, numa_node_id()),
1895 policy_nodemask(gfp, pol));
1896 put_mems_allowed(); 1896 put_mems_allowed();
1897 return page; 1897 return page;
1898} 1898}
@@ -1979,8 +1979,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1979 case MPOL_INTERLEAVE: 1979 case MPOL_INTERLEAVE:
1980 return nodes_equal(a->v.nodes, b->v.nodes); 1980 return nodes_equal(a->v.nodes, b->v.nodes);
1981 case MPOL_PREFERRED: 1981 case MPOL_PREFERRED:
1982 return a->v.preferred_node == b->v.preferred_node && 1982 return a->v.preferred_node == b->v.preferred_node;
1983 a->flags == b->flags;
1984 default: 1983 default:
1985 BUG(); 1984 BUG();
1986 return 0; 1985 return 0;
diff --git a/mm/migrate.c b/mm/migrate.c
index 9f29a3b7aac2..34132f8e9109 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -375,7 +375,7 @@ void migrate_page_copy(struct page *newpage, struct page *page)
375 * redo the accounting that clear_page_dirty_for_io undid, 375 * redo the accounting that clear_page_dirty_for_io undid,
376 * but we can't use set_page_dirty because that function 376 * but we can't use set_page_dirty because that function
377 * is actually a signal that all of the page has become dirty. 377 * is actually a signal that all of the page has become dirty.
378 * Wheras only part of our page may be dirty. 378 * Whereas only part of our page may be dirty.
379 */ 379 */
380 __set_page_dirty_nobuffers(newpage); 380 __set_page_dirty_nobuffers(newpage);
381 } 381 }
@@ -564,7 +564,7 @@ static int fallback_migrate_page(struct address_space *mapping,
564 * == 0 - success 564 * == 0 - success
565 */ 565 */
566static int move_to_new_page(struct page *newpage, struct page *page, 566static int move_to_new_page(struct page *newpage, struct page *page,
567 int remap_swapcache) 567 int remap_swapcache, bool sync)
568{ 568{
569 struct address_space *mapping; 569 struct address_space *mapping;
570 int rc; 570 int rc;
@@ -586,18 +586,28 @@ static int move_to_new_page(struct page *newpage, struct page *page,
586 mapping = page_mapping(page); 586 mapping = page_mapping(page);
587 if (!mapping) 587 if (!mapping)
588 rc = migrate_page(mapping, newpage, page); 588 rc = migrate_page(mapping, newpage, page);
589 else if (mapping->a_ops->migratepage) 589 else {
590 /* 590 /*
591 * Most pages have a mapping and most filesystems 591 * Do not writeback pages if !sync and migratepage is
592 * should provide a migration function. Anonymous 592 * not pointing to migrate_page() which is nonblocking
593 * pages are part of swap space which also has its 593 * (swapcache/tmpfs uses migratepage = migrate_page).
594 * own migration function. This is the most common
595 * path for page migration.
596 */ 594 */
597 rc = mapping->a_ops->migratepage(mapping, 595 if (PageDirty(page) && !sync &&
598 newpage, page); 596 mapping->a_ops->migratepage != migrate_page)
599 else 597 rc = -EBUSY;
600 rc = fallback_migrate_page(mapping, newpage, page); 598 else if (mapping->a_ops->migratepage)
599 /*
600 * Most pages have a mapping and most filesystems
601 * should provide a migration function. Anonymous
602 * pages are part of swap space which also has its
603 * own migration function. This is the most common
604 * path for page migration.
605 */
606 rc = mapping->a_ops->migratepage(mapping,
607 newpage, page);
608 else
609 rc = fallback_migrate_page(mapping, newpage, page);
610 }
601 611
602 if (rc) { 612 if (rc) {
603 newpage->mapping = NULL; 613 newpage->mapping = NULL;
@@ -623,7 +633,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
623 struct page *newpage = get_new_page(page, private, &result); 633 struct page *newpage = get_new_page(page, private, &result);
624 int remap_swapcache = 1; 634 int remap_swapcache = 1;
625 int charge = 0; 635 int charge = 0;
626 struct mem_cgroup *mem = NULL; 636 struct mem_cgroup *mem;
627 struct anon_vma *anon_vma = NULL; 637 struct anon_vma *anon_vma = NULL;
628 638
629 if (!newpage) 639 if (!newpage)
@@ -641,7 +651,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
641 rc = -EAGAIN; 651 rc = -EAGAIN;
642 652
643 if (!trylock_page(page)) { 653 if (!trylock_page(page)) {
644 if (!force) 654 if (!force || !sync)
645 goto move_newpage; 655 goto move_newpage;
646 656
647 /* 657 /*
@@ -678,7 +688,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
678 } 688 }
679 689
680 /* charge against new page */ 690 /* charge against new page */
681 charge = mem_cgroup_prepare_migration(page, newpage, &mem); 691 charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
682 if (charge == -ENOMEM) { 692 if (charge == -ENOMEM) {
683 rc = -ENOMEM; 693 rc = -ENOMEM;
684 goto unlock; 694 goto unlock;
@@ -686,7 +696,15 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
686 BUG_ON(charge); 696 BUG_ON(charge);
687 697
688 if (PageWriteback(page)) { 698 if (PageWriteback(page)) {
689 if (!force || !sync) 699 /*
700 * For !sync, there is no point retrying as the retry loop
701 * is expected to be too short for PageWriteback to be cleared
702 */
703 if (!sync) {
704 rc = -EBUSY;
705 goto uncharge;
706 }
707 if (!force)
690 goto uncharge; 708 goto uncharge;
691 wait_on_page_writeback(page); 709 wait_on_page_writeback(page);
692 } 710 }
@@ -757,14 +775,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
757 775
758skip_unmap: 776skip_unmap:
759 if (!page_mapped(page)) 777 if (!page_mapped(page))
760 rc = move_to_new_page(newpage, page, remap_swapcache); 778 rc = move_to_new_page(newpage, page, remap_swapcache, sync);
761 779
762 if (rc && remap_swapcache) 780 if (rc && remap_swapcache)
763 remove_migration_ptes(page, page); 781 remove_migration_ptes(page, page);
764 782
765 /* Drop an anon_vma reference if we took one */ 783 /* Drop an anon_vma reference if we took one */
766 if (anon_vma) 784 if (anon_vma)
767 drop_anon_vma(anon_vma); 785 put_anon_vma(anon_vma);
768 786
769uncharge: 787uncharge:
770 if (!charge) 788 if (!charge)
@@ -772,6 +790,7 @@ uncharge:
772unlock: 790unlock:
773 unlock_page(page); 791 unlock_page(page);
774 792
793move_newpage:
775 if (rc != -EAGAIN) { 794 if (rc != -EAGAIN) {
776 /* 795 /*
777 * A page that has been migrated has all references 796 * A page that has been migrated has all references
@@ -785,8 +804,6 @@ unlock:
785 putback_lru_page(page); 804 putback_lru_page(page);
786 } 805 }
787 806
788move_newpage:
789
790 /* 807 /*
791 * Move the new page to the LRU. If migration was not successful 808 * Move the new page to the LRU. If migration was not successful
792 * then this will free the page. 809 * then this will free the page.
@@ -851,13 +868,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
851 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 868 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
852 869
853 if (!page_mapped(hpage)) 870 if (!page_mapped(hpage))
854 rc = move_to_new_page(new_hpage, hpage, 1); 871 rc = move_to_new_page(new_hpage, hpage, 1, sync);
855 872
856 if (rc) 873 if (rc)
857 remove_migration_ptes(hpage, hpage); 874 remove_migration_ptes(hpage, hpage);
858 875
859 if (anon_vma) 876 if (anon_vma)
860 drop_anon_vma(anon_vma); 877 put_anon_vma(anon_vma);
861out: 878out:
862 unlock_page(hpage); 879 unlock_page(hpage);
863 880
@@ -981,10 +998,6 @@ int migrate_huge_pages(struct list_head *from,
981 } 998 }
982 rc = 0; 999 rc = 0;
983out: 1000out:
984
985 list_for_each_entry_safe(page, page2, from, lru)
986 put_page(page);
987
988 if (rc) 1001 if (rc)
989 return rc; 1002 return rc;
990 1003
@@ -1292,14 +1305,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1292 return -EPERM; 1305 return -EPERM;
1293 1306
1294 /* Find the mm_struct */ 1307 /* Find the mm_struct */
1295 read_lock(&tasklist_lock); 1308 rcu_read_lock();
1296 task = pid ? find_task_by_vpid(pid) : current; 1309 task = pid ? find_task_by_vpid(pid) : current;
1297 if (!task) { 1310 if (!task) {
1298 read_unlock(&tasklist_lock); 1311 rcu_read_unlock();
1299 return -ESRCH; 1312 return -ESRCH;
1300 } 1313 }
1301 mm = get_task_mm(task); 1314 mm = get_task_mm(task);
1302 read_unlock(&tasklist_lock); 1315 rcu_read_unlock();
1303 1316
1304 if (!mm) 1317 if (!mm)
1305 return -EINVAL; 1318 return -EINVAL;
diff --git a/mm/mlock.c b/mm/mlock.c
index 13e81ee8be9d..6b55e3efe0df 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -135,13 +135,6 @@ void munlock_vma_page(struct page *page)
135 } 135 }
136} 136}
137 137
138static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
139{
140 return (vma->vm_flags & VM_GROWSDOWN) &&
141 (vma->vm_start == addr) &&
142 !vma_stack_continue(vma->vm_prev, addr);
143}
144
145/** 138/**
146 * __mlock_vma_pages_range() - mlock a range of pages in the vma. 139 * __mlock_vma_pages_range() - mlock a range of pages in the vma.
147 * @vma: target vma 140 * @vma: target vma
@@ -178,15 +171,16 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
178 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) 171 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
179 gup_flags |= FOLL_WRITE; 172 gup_flags |= FOLL_WRITE;
180 173
174 /*
175 * We want mlock to succeed for regions that have any permissions
176 * other than PROT_NONE.
177 */
178 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
179 gup_flags |= FOLL_FORCE;
180
181 if (vma->vm_flags & VM_LOCKED) 181 if (vma->vm_flags & VM_LOCKED)
182 gup_flags |= FOLL_MLOCK; 182 gup_flags |= FOLL_MLOCK;
183 183
184 /* We don't try to access the guard page of a stack vma */
185 if (stack_guard_page(vma, start)) {
186 addr += PAGE_SIZE;
187 nr_pages--;
188 }
189
190 return __get_user_pages(current, mm, addr, nr_pages, gup_flags, 184 return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
191 NULL, NULL, nonblocking); 185 NULL, NULL, nonblocking);
192} 186}
@@ -230,7 +224,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
230 224
231 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || 225 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
232 is_vm_hugetlb_page(vma) || 226 is_vm_hugetlb_page(vma) ||
233 vma == get_gate_vma(current))) { 227 vma == get_gate_vma(current->mm))) {
234 228
235 __mlock_vma_pages_range(vma, start, end, NULL); 229 __mlock_vma_pages_range(vma, start, end, NULL);
236 230
@@ -325,7 +319,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
325 int lock = newflags & VM_LOCKED; 319 int lock = newflags & VM_LOCKED;
326 320
327 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || 321 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
328 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current)) 322 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
329 goto out; /* don't set VM_LOCKED, don't count */ 323 goto out; /* don't set VM_LOCKED, don't count */
330 324
331 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 325 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
diff --git a/mm/mmap.c b/mm/mmap.c
index 2ec8eb5a9cdd..e27e0cf0de03 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -259,7 +259,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
259 * randomize_va_space to 2, which will still cause mm->start_brk 259 * randomize_va_space to 2, which will still cause mm->start_brk
260 * to be arbitrarily shifted 260 * to be arbitrarily shifted
261 */ 261 */
262 if (mm->start_brk > PAGE_ALIGN(mm->end_data)) 262 if (current->brk_randomized)
263 min_brk = mm->start_brk; 263 min_brk = mm->start_brk;
264 else 264 else
265 min_brk = mm->end_data; 265 min_brk = mm->end_data;
@@ -1814,11 +1814,14 @@ static int expand_downwards(struct vm_area_struct *vma,
1814 size = vma->vm_end - address; 1814 size = vma->vm_end - address;
1815 grow = (vma->vm_start - address) >> PAGE_SHIFT; 1815 grow = (vma->vm_start - address) >> PAGE_SHIFT;
1816 1816
1817 error = acct_stack_growth(vma, size, grow); 1817 error = -ENOMEM;
1818 if (!error) { 1818 if (grow <= vma->vm_pgoff) {
1819 vma->vm_start = address; 1819 error = acct_stack_growth(vma, size, grow);
1820 vma->vm_pgoff -= grow; 1820 if (!error) {
1821 perf_event_mmap(vma); 1821 vma->vm_start = address;
1822 vma->vm_pgoff -= grow;
1823 perf_event_mmap(vma);
1824 }
1822 } 1825 }
1823 } 1826 }
1824 vma_unlock_anon_vma(vma); 1827 vma_unlock_anon_vma(vma);
diff --git a/mm/mremap.c b/mm/mremap.c
index 9925b6391b80..a7c1f9f9b941 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -94,9 +94,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
94 */ 94 */
95 mapping = vma->vm_file->f_mapping; 95 mapping = vma->vm_file->f_mapping;
96 spin_lock(&mapping->i_mmap_lock); 96 spin_lock(&mapping->i_mmap_lock);
97 if (new_vma->vm_truncate_count && 97 new_vma->vm_truncate_count = 0;
98 new_vma->vm_truncate_count != vma->vm_truncate_count)
99 new_vma->vm_truncate_count = 0;
100 } 98 }
101 99
102 /* 100 /*
@@ -279,9 +277,16 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
279 if (old_len > vma->vm_end - addr) 277 if (old_len > vma->vm_end - addr)
280 goto Efault; 278 goto Efault;
281 279
282 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { 280 /* Need to be careful about a growing mapping */
283 if (new_len > old_len) 281 if (new_len > old_len) {
282 unsigned long pgoff;
283
284 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
284 goto Efault; 285 goto Efault;
286 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
287 pgoff += vma->vm_pgoff;
288 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
289 goto Einval;
285 } 290 }
286 291
287 if (vma->vm_flags & VM_LOCKED) { 292 if (vma->vm_flags & VM_LOCKED) {
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
new file mode 100644
index 000000000000..9109049f0bbc
--- /dev/null
+++ b/mm/nobootmem.c
@@ -0,0 +1,427 @@
1/*
2 * bootmem - A boot-time physical memory allocator and configurator
3 *
4 * Copyright (C) 1999 Ingo Molnar
5 * 1999 Kanoj Sarcar, SGI
6 * 2008 Johannes Weiner
7 *
8 * Access to this subsystem has to be serialized externally (which is true
9 * for the boot process anyway).
10 */
11#include <linux/init.h>
12#include <linux/pfn.h>
13#include <linux/slab.h>
14#include <linux/bootmem.h>
15#include <linux/module.h>
16#include <linux/kmemleak.h>
17#include <linux/range.h>
18#include <linux/memblock.h>
19
20#include <asm/bug.h>
21#include <asm/io.h>
22#include <asm/processor.h>
23
24#include "internal.h"
25
26#ifndef CONFIG_NEED_MULTIPLE_NODES
27struct pglist_data __refdata contig_page_data;
28EXPORT_SYMBOL(contig_page_data);
29#endif
30
31unsigned long max_low_pfn;
32unsigned long min_low_pfn;
33unsigned long max_pfn;
34
35static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
36 u64 goal, u64 limit)
37{
38 void *ptr;
39 u64 addr;
40
41 if (limit > memblock.current_limit)
42 limit = memblock.current_limit;
43
44 addr = find_memory_core_early(nid, size, align, goal, limit);
45
46 if (addr == MEMBLOCK_ERROR)
47 return NULL;
48
49 ptr = phys_to_virt(addr);
50 memset(ptr, 0, size);
51 memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
52 /*
53 * The min_count is set to 0 so that bootmem allocated blocks
54 * are never reported as leaks.
55 */
56 kmemleak_alloc(ptr, size, 0, 0);
57 return ptr;
58}
59
60/*
61 * free_bootmem_late - free bootmem pages directly to page allocator
62 * @addr: starting address of the range
63 * @size: size of the range in bytes
64 *
65 * This is only useful when the bootmem allocator has already been torn
66 * down, but we are still initializing the system. Pages are given directly
67 * to the page allocator, no bootmem metadata is updated because it is gone.
68 */
69void __init free_bootmem_late(unsigned long addr, unsigned long size)
70{
71 unsigned long cursor, end;
72
73 kmemleak_free_part(__va(addr), size);
74
75 cursor = PFN_UP(addr);
76 end = PFN_DOWN(addr + size);
77
78 for (; cursor < end; cursor++) {
79 __free_pages_bootmem(pfn_to_page(cursor), 0);
80 totalram_pages++;
81 }
82}
83
84static void __init __free_pages_memory(unsigned long start, unsigned long end)
85{
86 int i;
87 unsigned long start_aligned, end_aligned;
88 int order = ilog2(BITS_PER_LONG);
89
90 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
91 end_aligned = end & ~(BITS_PER_LONG - 1);
92
93 if (end_aligned <= start_aligned) {
94 for (i = start; i < end; i++)
95 __free_pages_bootmem(pfn_to_page(i), 0);
96
97 return;
98 }
99
100 for (i = start; i < start_aligned; i++)
101 __free_pages_bootmem(pfn_to_page(i), 0);
102
103 for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
104 __free_pages_bootmem(pfn_to_page(i), order);
105
106 for (i = end_aligned; i < end; i++)
107 __free_pages_bootmem(pfn_to_page(i), 0);
108}
109
110unsigned long __init free_all_memory_core_early(int nodeid)
111{
112 int i;
113 u64 start, end;
114 unsigned long count = 0;
115 struct range *range = NULL;
116 int nr_range;
117
118 nr_range = get_free_all_memory_range(&range, nodeid);
119
120 for (i = 0; i < nr_range; i++) {
121 start = range[i].start;
122 end = range[i].end;
123 count += end - start;
124 __free_pages_memory(start, end);
125 }
126
127 return count;
128}
129
130/**
131 * free_all_bootmem_node - release a node's free pages to the buddy allocator
132 * @pgdat: node to be released
133 *
134 * Returns the number of pages actually released.
135 */
136unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
137{
138 register_page_bootmem_info_node(pgdat);
139
140 /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
141 return 0;
142}
143
144/**
145 * free_all_bootmem - release free pages to the buddy allocator
146 *
147 * Returns the number of pages actually released.
148 */
149unsigned long __init free_all_bootmem(void)
150{
151 /*
152 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
153 * because in some case like Node0 doesn't have RAM installed
154 * low ram will be on Node1
155 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
156 * will be used instead of only Node0 related
157 */
158 return free_all_memory_core_early(MAX_NUMNODES);
159}
160
161/**
162 * free_bootmem_node - mark a page range as usable
163 * @pgdat: node the range resides on
164 * @physaddr: starting address of the range
165 * @size: size of the range in bytes
166 *
167 * Partial pages will be considered reserved and left as they are.
168 *
169 * The range must reside completely on the specified node.
170 */
171void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
172 unsigned long size)
173{
174 kmemleak_free_part(__va(physaddr), size);
175 memblock_x86_free_range(physaddr, physaddr + size);
176}
177
178/**
179 * free_bootmem - mark a page range as usable
180 * @addr: starting address of the range
181 * @size: size of the range in bytes
182 *
183 * Partial pages will be considered reserved and left as they are.
184 *
185 * The range must be contiguous but may span node boundaries.
186 */
187void __init free_bootmem(unsigned long addr, unsigned long size)
188{
189 kmemleak_free_part(__va(addr), size);
190 memblock_x86_free_range(addr, addr + size);
191}
192
193static void * __init ___alloc_bootmem_nopanic(unsigned long size,
194 unsigned long align,
195 unsigned long goal,
196 unsigned long limit)
197{
198 void *ptr;
199
200 if (WARN_ON_ONCE(slab_is_available()))
201 return kzalloc(size, GFP_NOWAIT);
202
203restart:
204
205 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
206
207 if (ptr)
208 return ptr;
209
210 if (goal != 0) {
211 goal = 0;
212 goto restart;
213 }
214
215 return NULL;
216}
217
218/**
219 * __alloc_bootmem_nopanic - allocate boot memory without panicking
220 * @size: size of the request in bytes
221 * @align: alignment of the region
222 * @goal: preferred starting address of the region
223 *
224 * The goal is dropped if it can not be satisfied and the allocation will
225 * fall back to memory below @goal.
226 *
227 * Allocation may happen on any node in the system.
228 *
229 * Returns NULL on failure.
230 */
231void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
232 unsigned long goal)
233{
234 unsigned long limit = -1UL;
235
236 return ___alloc_bootmem_nopanic(size, align, goal, limit);
237}
238
239static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
240 unsigned long goal, unsigned long limit)
241{
242 void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
243
244 if (mem)
245 return mem;
246 /*
247 * Whoops, we cannot satisfy the allocation request.
248 */
249 printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
250 panic("Out of memory");
251 return NULL;
252}
253
254/**
255 * __alloc_bootmem - allocate boot memory
256 * @size: size of the request in bytes
257 * @align: alignment of the region
258 * @goal: preferred starting address of the region
259 *
260 * The goal is dropped if it can not be satisfied and the allocation will
261 * fall back to memory below @goal.
262 *
263 * Allocation may happen on any node in the system.
264 *
265 * The function panics if the request can not be satisfied.
266 */
267void * __init __alloc_bootmem(unsigned long size, unsigned long align,
268 unsigned long goal)
269{
270 unsigned long limit = -1UL;
271
272 return ___alloc_bootmem(size, align, goal, limit);
273}
274
275/**
276 * __alloc_bootmem_node - allocate boot memory from a specific node
277 * @pgdat: node to allocate from
278 * @size: size of the request in bytes
279 * @align: alignment of the region
280 * @goal: preferred starting address of the region
281 *
282 * The goal is dropped if it can not be satisfied and the allocation will
283 * fall back to memory below @goal.
284 *
285 * Allocation may fall back to any node in the system if the specified node
286 * can not hold the requested memory.
287 *
288 * The function panics if the request can not be satisfied.
289 */
290void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
291 unsigned long align, unsigned long goal)
292{
293 void *ptr;
294
295 if (WARN_ON_ONCE(slab_is_available()))
296 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
297
298 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
299 goal, -1ULL);
300 if (ptr)
301 return ptr;
302
303 return __alloc_memory_core_early(MAX_NUMNODES, size, align,
304 goal, -1ULL);
305}
306
307void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
308 unsigned long align, unsigned long goal)
309{
310#ifdef MAX_DMA32_PFN
311 unsigned long end_pfn;
312
313 if (WARN_ON_ONCE(slab_is_available()))
314 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
315
316 /* update goal according ...MAX_DMA32_PFN */
317 end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
318
319 if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
320 (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
321 void *ptr;
322 unsigned long new_goal;
323
324 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
325 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
326 new_goal, -1ULL);
327 if (ptr)
328 return ptr;
329 }
330#endif
331
332 return __alloc_bootmem_node(pgdat, size, align, goal);
333
334}
335
336#ifdef CONFIG_SPARSEMEM
337/**
338 * alloc_bootmem_section - allocate boot memory from a specific section
339 * @size: size of the request in bytes
340 * @section_nr: sparse map section to allocate from
341 *
342 * Return NULL on failure.
343 */
344void * __init alloc_bootmem_section(unsigned long size,
345 unsigned long section_nr)
346{
347 unsigned long pfn, goal, limit;
348
349 pfn = section_nr_to_pfn(section_nr);
350 goal = pfn << PAGE_SHIFT;
351 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
352
353 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
354 SMP_CACHE_BYTES, goal, limit);
355}
356#endif
357
358void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
359 unsigned long align, unsigned long goal)
360{
361 void *ptr;
362
363 if (WARN_ON_ONCE(slab_is_available()))
364 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
365
366 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
367 goal, -1ULL);
368 if (ptr)
369 return ptr;
370
371 return __alloc_bootmem_nopanic(size, align, goal);
372}
373
374#ifndef ARCH_LOW_ADDRESS_LIMIT
375#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
376#endif
377
378/**
379 * __alloc_bootmem_low - allocate low boot memory
380 * @size: size of the request in bytes
381 * @align: alignment of the region
382 * @goal: preferred starting address of the region
383 *
384 * The goal is dropped if it can not be satisfied and the allocation will
385 * fall back to memory below @goal.
386 *
387 * Allocation may happen on any node in the system.
388 *
389 * The function panics if the request can not be satisfied.
390 */
391void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
392 unsigned long goal)
393{
394 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
395}
396
397/**
398 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
399 * @pgdat: node to allocate from
400 * @size: size of the request in bytes
401 * @align: alignment of the region
402 * @goal: preferred starting address of the region
403 *
404 * The goal is dropped if it can not be satisfied and the allocation will
405 * fall back to memory below @goal.
406 *
407 * Allocation may fall back to any node in the system if the specified node
408 * can not hold the requested memory.
409 *
410 * The function panics if the request can not be satisfied.
411 */
412void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
413 unsigned long align, unsigned long goal)
414{
415 void *ptr;
416
417 if (WARN_ON_ONCE(slab_is_available()))
418 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
419
420 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
421 goal, ARCH_LOW_ADDRESS_LIMIT);
422 if (ptr)
423 return ptr;
424
425 return __alloc_memory_core_early(MAX_NUMNODES, size, align,
426 goal, ARCH_LOW_ADDRESS_LIMIT);
427}
diff --git a/mm/nommu.c b/mm/nommu.c
index f59e1424d3db..c4c542c736a9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1842,10 +1842,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1842} 1842}
1843EXPORT_SYMBOL(remap_vmalloc_range); 1843EXPORT_SYMBOL(remap_vmalloc_range);
1844 1844
1845void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1846{
1847}
1848
1849unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, 1845unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
1850 unsigned long len, unsigned long pgoff, unsigned long flags) 1846 unsigned long len, unsigned long pgoff, unsigned long flags)
1851{ 1847{
@@ -1963,7 +1959,7 @@ error:
1963 return -ENOMEM; 1959 return -ENOMEM;
1964} 1960}
1965 1961
1966int in_gate_area_no_task(unsigned long addr) 1962int in_gate_area_no_mm(unsigned long addr)
1967{ 1963{
1968 return 0; 1964 return 0;
1969} 1965}
@@ -1975,21 +1971,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1975} 1971}
1976EXPORT_SYMBOL(filemap_fault); 1972EXPORT_SYMBOL(filemap_fault);
1977 1973
1978/* 1974static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
1979 * Access another process' address space. 1975 unsigned long addr, void *buf, int len, int write)
1980 * - source/target buffer must be kernel space
1981 */
1982int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
1983{ 1976{
1984 struct vm_area_struct *vma; 1977 struct vm_area_struct *vma;
1985 struct mm_struct *mm;
1986
1987 if (addr + len < addr)
1988 return 0;
1989
1990 mm = get_task_mm(tsk);
1991 if (!mm)
1992 return 0;
1993 1978
1994 down_read(&mm->mmap_sem); 1979 down_read(&mm->mmap_sem);
1995 1980
@@ -2014,6 +1999,43 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2014 } 1999 }
2015 2000
2016 up_read(&mm->mmap_sem); 2001 up_read(&mm->mmap_sem);
2002
2003 return len;
2004}
2005
2006/**
2007 * @access_remote_vm - access another process' address space
2008 * @mm: the mm_struct of the target address space
2009 * @addr: start address to access
2010 * @buf: source or destination buffer
2011 * @len: number of bytes to transfer
2012 * @write: whether the access is a write
2013 *
2014 * The caller must hold a reference on @mm.
2015 */
2016int access_remote_vm(struct mm_struct *mm, unsigned long addr,
2017 void *buf, int len, int write)
2018{
2019 return __access_remote_vm(NULL, mm, addr, buf, len, write);
2020}
2021
2022/*
2023 * Access another process' address space.
2024 * - source/target buffer must be kernel space
2025 */
2026int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
2027{
2028 struct mm_struct *mm;
2029
2030 if (addr + len < addr)
2031 return 0;
2032
2033 mm = get_task_mm(tsk);
2034 if (!mm)
2035 return 0;
2036
2037 len = __access_remote_vm(tsk, mm, addr, buf, len, write);
2038
2017 mmput(mm); 2039 mmput(mm);
2018 return len; 2040 return len;
2019} 2041}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7dcca55ede7c..83fb72c108b7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -31,6 +31,7 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/mempolicy.h> 32#include <linux/mempolicy.h>
33#include <linux/security.h> 33#include <linux/security.h>
34#include <linux/ptrace.h>
34 35
35int sysctl_panic_on_oom; 36int sysctl_panic_on_oom;
36int sysctl_oom_kill_allocating_task; 37int sysctl_oom_kill_allocating_task;
@@ -83,24 +84,6 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
83#endif /* CONFIG_NUMA */ 84#endif /* CONFIG_NUMA */
84 85
85/* 86/*
86 * If this is a system OOM (not a memcg OOM) and the task selected to be
87 * killed is not already running at high (RT) priorities, speed up the
88 * recovery by boosting the dying task to the lowest FIFO priority.
89 * That helps with the recovery and avoids interfering with RT tasks.
90 */
91static void boost_dying_task_prio(struct task_struct *p,
92 struct mem_cgroup *mem)
93{
94 struct sched_param param = { .sched_priority = 1 };
95
96 if (mem)
97 return;
98
99 if (!rt_task(p))
100 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
101}
102
103/*
104 * The process p may have detached its own ->mm while exiting or through 87 * The process p may have detached its own ->mm while exiting or through
105 * use_mm(), but one or more of its subthreads may still have a valid 88 * use_mm(), but one or more of its subthreads may still have a valid
106 * pointer. Return p, or any of its subthreads with a valid ->mm, with 89 * pointer. Return p, or any of its subthreads with a valid ->mm, with
@@ -292,13 +275,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
292 unsigned long totalpages, struct mem_cgroup *mem, 275 unsigned long totalpages, struct mem_cgroup *mem,
293 const nodemask_t *nodemask) 276 const nodemask_t *nodemask)
294{ 277{
295 struct task_struct *p; 278 struct task_struct *g, *p;
296 struct task_struct *chosen = NULL; 279 struct task_struct *chosen = NULL;
297 *ppoints = 0; 280 *ppoints = 0;
298 281
299 for_each_process(p) { 282 do_each_thread(g, p) {
300 unsigned int points; 283 unsigned int points;
301 284
285 if (!p->mm)
286 continue;
302 if (oom_unkillable_task(p, mem, nodemask)) 287 if (oom_unkillable_task(p, mem, nodemask))
303 continue; 288 continue;
304 289
@@ -314,22 +299,29 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
314 if (test_tsk_thread_flag(p, TIF_MEMDIE)) 299 if (test_tsk_thread_flag(p, TIF_MEMDIE))
315 return ERR_PTR(-1UL); 300 return ERR_PTR(-1UL);
316 301
317 /* 302 if (p->flags & PF_EXITING) {
318 * This is in the process of releasing memory so wait for it 303 /*
319 * to finish before killing some other task by mistake. 304 * If p is the current task and is in the process of
320 * 305 * releasing memory, we allow the "kill" to set
321 * However, if p is the current task, we allow the 'kill' to 306 * TIF_MEMDIE, which will allow it to gain access to
322 * go ahead if it is exiting: this will simply set TIF_MEMDIE, 307 * memory reserves. Otherwise, it may stall forever.
323 * which will allow it to gain access to memory reserves in 308 *
324 * the process of exiting and releasing its resources. 309 * The loop isn't broken here, however, in case other
325 * Otherwise we could get an easy OOM deadlock. 310 * threads are found to have already been oom killed.
326 */ 311 */
327 if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) { 312 if (p == current) {
328 if (p != current) 313 chosen = p;
329 return ERR_PTR(-1UL); 314 *ppoints = 1000;
330 315 } else {
331 chosen = p; 316 /*
332 *ppoints = 1000; 317 * If this task is not being ptraced on exit,
318 * then wait for it to finish before killing
319 * some other task unnecessarily.
320 */
321 if (!(task_ptrace(p->group_leader) &
322 PT_TRACE_EXIT))
323 return ERR_PTR(-1UL);
324 }
333 } 325 }
334 326
335 points = oom_badness(p, mem, nodemask, totalpages); 327 points = oom_badness(p, mem, nodemask, totalpages);
@@ -337,7 +329,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
337 chosen = p; 329 chosen = p;
338 *ppoints = points; 330 *ppoints = points;
339 } 331 }
340 } 332 } while_each_thread(g, p);
341 333
342 return chosen; 334 return chosen;
343} 335}
@@ -396,7 +388,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
396 task_unlock(current); 388 task_unlock(current);
397 dump_stack(); 389 dump_stack();
398 mem_cgroup_print_oom_info(mem, p); 390 mem_cgroup_print_oom_info(mem, p);
399 show_mem(); 391 show_mem(SHOW_MEM_FILTER_NODES);
400 if (sysctl_oom_dump_tasks) 392 if (sysctl_oom_dump_tasks)
401 dump_tasks(mem, nodemask); 393 dump_tasks(mem, nodemask);
402} 394}
@@ -442,13 +434,6 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
442 set_tsk_thread_flag(p, TIF_MEMDIE); 434 set_tsk_thread_flag(p, TIF_MEMDIE);
443 force_sig(SIGKILL, p); 435 force_sig(SIGKILL, p);
444 436
445 /*
446 * We give our sacrificial lamb high priority and access to
447 * all the memory it needs. That way it should be able to
448 * exit() and clear out its resources quickly...
449 */
450 boost_dying_task_prio(p, mem);
451
452 return 0; 437 return 0;
453} 438}
454#undef K 439#undef K
@@ -472,7 +457,6 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
472 */ 457 */
473 if (p->flags & PF_EXITING) { 458 if (p->flags & PF_EXITING) {
474 set_tsk_thread_flag(p, TIF_MEMDIE); 459 set_tsk_thread_flag(p, TIF_MEMDIE);
475 boost_dying_task_prio(p, mem);
476 return 0; 460 return 0;
477 } 461 }
478 462
@@ -491,6 +475,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
491 list_for_each_entry(child, &t->children, sibling) { 475 list_for_each_entry(child, &t->children, sibling) {
492 unsigned int child_points; 476 unsigned int child_points;
493 477
478 if (child->mm == p->mm)
479 continue;
494 /* 480 /*
495 * oom_badness() returns 0 if the thread is unkillable 481 * oom_badness() returns 0 if the thread is unkillable
496 */ 482 */
@@ -537,6 +523,16 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
537 unsigned int points = 0; 523 unsigned int points = 0;
538 struct task_struct *p; 524 struct task_struct *p;
539 525
526 /*
527 * If current has a pending SIGKILL, then automatically select it. The
528 * goal is to allow it to allocate so that it may quickly exit and free
529 * its memory.
530 */
531 if (fatal_signal_pending(current)) {
532 set_thread_flag(TIF_MEMDIE);
533 return;
534 }
535
540 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); 536 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
541 limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; 537 limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
542 read_lock(&tasklist_lock); 538 read_lock(&tasklist_lock);
@@ -689,7 +685,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
689 */ 685 */
690 if (fatal_signal_pending(current)) { 686 if (fatal_signal_pending(current)) {
691 set_thread_flag(TIF_MEMDIE); 687 set_thread_flag(TIF_MEMDIE);
692 boost_dying_task_prio(current, NULL);
693 return; 688 return;
694 } 689 }
695 690
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2cb01f6ec5d0..31f698862420 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -927,7 +927,7 @@ retry:
927 break; 927 break;
928 } 928 }
929 929
930 done_index = page->index + 1; 930 done_index = page->index;
931 931
932 lock_page(page); 932 lock_page(page);
933 933
@@ -977,6 +977,7 @@ continue_unlock:
977 * not be suitable for data integrity 977 * not be suitable for data integrity
978 * writeout). 978 * writeout).
979 */ 979 */
980 done_index = page->index + 1;
980 done = 1; 981 done = 1;
981 break; 982 break;
982 } 983 }
@@ -1039,11 +1040,17 @@ static int __writepage(struct page *page, struct writeback_control *wbc,
1039int generic_writepages(struct address_space *mapping, 1040int generic_writepages(struct address_space *mapping,
1040 struct writeback_control *wbc) 1041 struct writeback_control *wbc)
1041{ 1042{
1043 struct blk_plug plug;
1044 int ret;
1045
1042 /* deal with chardevs and other special file */ 1046 /* deal with chardevs and other special file */
1043 if (!mapping->a_ops->writepage) 1047 if (!mapping->a_ops->writepage)
1044 return 0; 1048 return 0;
1045 1049
1046 return write_cache_pages(mapping, wbc, __writepage, mapping); 1050 blk_start_plug(&plug);
1051 ret = write_cache_pages(mapping, wbc, __writepage, mapping);
1052 blk_finish_plug(&plug);
1053 return ret;
1047} 1054}
1048 1055
1049EXPORT_SYMBOL(generic_writepages); 1056EXPORT_SYMBOL(generic_writepages);
@@ -1211,6 +1218,17 @@ int set_page_dirty(struct page *page)
1211 1218
1212 if (likely(mapping)) { 1219 if (likely(mapping)) {
1213 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; 1220 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
1221 /*
1222 * readahead/lru_deactivate_page could remain
1223 * PG_readahead/PG_reclaim due to race with end_page_writeback
1224 * About readahead, if the page is written, the flags would be
1225 * reset. So no problem.
1226 * About lru_deactivate_page, if the page is redirty, the flag
1227 * will be reset. So no problem. but if the page is used by readahead
1228 * it will confuse readahead and make it restart the size rampup
1229 * process. But it's a trivial problem.
1230 */
1231 ClearPageReclaim(page);
1214#ifdef CONFIG_BLOCK 1232#ifdef CONFIG_BLOCK
1215 if (!spd) 1233 if (!spd)
1216 spd = __set_page_dirty_buffers; 1234 spd = __set_page_dirty_buffers;
@@ -1239,7 +1257,7 @@ int set_page_dirty_lock(struct page *page)
1239{ 1257{
1240 int ret; 1258 int ret;
1241 1259
1242 lock_page_nosync(page); 1260 lock_page(page);
1243 ret = set_page_dirty(page); 1261 ret = set_page_dirty(page);
1244 unlock_page(page); 1262 unlock_page(page);
1245 return ret; 1263 return ret;
@@ -1266,7 +1284,6 @@ int clear_page_dirty_for_io(struct page *page)
1266 1284
1267 BUG_ON(!PageLocked(page)); 1285 BUG_ON(!PageLocked(page));
1268 1286
1269 ClearPageReclaim(page);
1270 if (mapping && mapping_cap_account_dirty(mapping)) { 1287 if (mapping && mapping_cap_account_dirty(mapping)) {
1271 /* 1288 /*
1272 * Yes, Virginia, this is indeed insane. 1289 * Yes, Virginia, this is indeed insane.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a873e61e312e..9f8a97b9a350 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,6 +53,7 @@
53#include <linux/compaction.h> 53#include <linux/compaction.h>
54#include <trace/events/kmem.h> 54#include <trace/events/kmem.h>
55#include <linux/ftrace_event.h> 55#include <linux/ftrace_event.h>
56#include <linux/memcontrol.h>
56 57
57#include <asm/tlbflush.h> 58#include <asm/tlbflush.h>
58#include <asm/div64.h> 59#include <asm/div64.h>
@@ -286,7 +287,7 @@ static void bad_page(struct page *page)
286 287
287 /* Don't complain about poisoned pages */ 288 /* Don't complain about poisoned pages */
288 if (PageHWPoison(page)) { 289 if (PageHWPoison(page)) {
289 __ClearPageBuddy(page); 290 reset_page_mapcount(page); /* remove PageBuddy */
290 return; 291 return;
291 } 292 }
292 293
@@ -317,7 +318,7 @@ static void bad_page(struct page *page)
317 dump_stack(); 318 dump_stack();
318out: 319out:
319 /* Leave bad fields for debug, except PageBuddy could make trouble */ 320 /* Leave bad fields for debug, except PageBuddy could make trouble */
320 __ClearPageBuddy(page); 321 reset_page_mapcount(page); /* remove PageBuddy */
321 add_taint(TAINT_BAD_PAGE); 322 add_taint(TAINT_BAD_PAGE);
322} 323}
323 324
@@ -565,7 +566,8 @@ static inline int free_pages_check(struct page *page)
565 if (unlikely(page_mapcount(page) | 566 if (unlikely(page_mapcount(page) |
566 (page->mapping != NULL) | 567 (page->mapping != NULL) |
567 (atomic_read(&page->_count) != 0) | 568 (atomic_read(&page->_count) != 0) |
568 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { 569 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
570 (mem_cgroup_bad_page_check(page)))) {
569 bad_page(page); 571 bad_page(page);
570 return 1; 572 return 1;
571 } 573 }
@@ -614,6 +616,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
614 list = &pcp->lists[migratetype]; 616 list = &pcp->lists[migratetype];
615 } while (list_empty(list)); 617 } while (list_empty(list));
616 618
619 /* This is the only non-empty list. Free them all. */
620 if (batch_free == MIGRATE_PCPTYPES)
621 batch_free = to_free;
622
617 do { 623 do {
618 page = list_entry(list->prev, struct page, lru); 624 page = list_entry(list->prev, struct page, lru);
619 /* must delete as __free_one_page list manipulates */ 625 /* must delete as __free_one_page list manipulates */
@@ -750,7 +756,8 @@ static inline int check_new_page(struct page *page)
750 if (unlikely(page_mapcount(page) | 756 if (unlikely(page_mapcount(page) |
751 (page->mapping != NULL) | 757 (page->mapping != NULL) |
752 (atomic_read(&page->_count) != 0) | 758 (atomic_read(&page->_count) != 0) |
753 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { 759 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
760 (mem_cgroup_bad_page_check(page)))) {
754 bad_page(page); 761 bad_page(page);
755 return 1; 762 return 1;
756 } 763 }
@@ -863,9 +870,8 @@ static int move_freepages(struct zone *zone,
863 } 870 }
864 871
865 order = page_order(page); 872 order = page_order(page);
866 list_del(&page->lru); 873 list_move(&page->lru,
867 list_add(&page->lru, 874 &zone->free_area[order].free_list[migratetype]);
868 &zone->free_area[order].free_list[migratetype]);
869 page += 1 << order; 875 page += 1 << order;
870 pages_moved += 1 << order; 876 pages_moved += 1 << order;
871 } 877 }
@@ -936,7 +942,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
936 * If breaking a large block of pages, move all free 942 * If breaking a large block of pages, move all free
937 * pages to the preferred allocation list. If falling 943 * pages to the preferred allocation list. If falling
938 * back for a reclaimable kernel allocation, be more 944 * back for a reclaimable kernel allocation, be more
939 * agressive about taking ownership of free pages 945 * aggressive about taking ownership of free pages
940 */ 946 */
941 if (unlikely(current_order >= (pageblock_order >> 1)) || 947 if (unlikely(current_order >= (pageblock_order >> 1)) ||
942 start_migratetype == MIGRATE_RECLAIMABLE || 948 start_migratetype == MIGRATE_RECLAIMABLE ||
@@ -1333,7 +1339,7 @@ again:
1333 } 1339 }
1334 1340
1335 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1341 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1336 zone_statistics(preferred_zone, zone); 1342 zone_statistics(preferred_zone, zone, gfp_flags);
1337 local_irq_restore(flags); 1343 local_irq_restore(flags);
1338 1344
1339 VM_BUG_ON(bad_range(zone, page)); 1345 VM_BUG_ON(bad_range(zone, page));
@@ -1714,6 +1720,20 @@ try_next_zone:
1714 return page; 1720 return page;
1715} 1721}
1716 1722
1723/*
1724 * Large machines with many possible nodes should not always dump per-node
1725 * meminfo in irq context.
1726 */
1727static inline bool should_suppress_show_mem(void)
1728{
1729 bool ret = false;
1730
1731#if NODES_SHIFT > 8
1732 ret = in_interrupt();
1733#endif
1734 return ret;
1735}
1736
1717static inline int 1737static inline int
1718should_alloc_retry(gfp_t gfp_mask, unsigned int order, 1738should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1719 unsigned long pages_reclaimed) 1739 unsigned long pages_reclaimed)
@@ -2085,7 +2105,7 @@ rebalance:
2085 sync_migration); 2105 sync_migration);
2086 if (page) 2106 if (page)
2087 goto got_pg; 2107 goto got_pg;
2088 sync_migration = true; 2108 sync_migration = !(gfp_mask & __GFP_NO_KSWAPD);
2089 2109
2090 /* Try direct reclaim and then allocating */ 2110 /* Try direct reclaim and then allocating */
2091 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2111 page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2157,11 +2177,25 @@ rebalance:
2157 2177
2158nopage: 2178nopage:
2159 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 2179 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
2160 printk(KERN_WARNING "%s: page allocation failure." 2180 unsigned int filter = SHOW_MEM_FILTER_NODES;
2161 " order:%d, mode:0x%x\n", 2181
2182 /*
2183 * This documents exceptions given to allocations in certain
2184 * contexts that are allowed to allocate outside current's set
2185 * of allowed nodes.
2186 */
2187 if (!(gfp_mask & __GFP_NOMEMALLOC))
2188 if (test_thread_flag(TIF_MEMDIE) ||
2189 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2190 filter &= ~SHOW_MEM_FILTER_NODES;
2191 if (in_interrupt() || !wait)
2192 filter &= ~SHOW_MEM_FILTER_NODES;
2193
2194 pr_warning("%s: page allocation failure. order:%d, mode:0x%x\n",
2162 current->comm, order, gfp_mask); 2195 current->comm, order, gfp_mask);
2163 dump_stack(); 2196 dump_stack();
2164 show_mem(); 2197 if (!should_suppress_show_mem())
2198 show_mem(filter);
2165 } 2199 }
2166 return page; 2200 return page;
2167got_pg: 2201got_pg:
@@ -2411,19 +2445,42 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2411} 2445}
2412#endif 2446#endif
2413 2447
2448/*
2449 * Determine whether the zone's node should be displayed or not, depending on
2450 * whether SHOW_MEM_FILTER_NODES was passed to __show_free_areas().
2451 */
2452static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone)
2453{
2454 bool ret = false;
2455
2456 if (!(flags & SHOW_MEM_FILTER_NODES))
2457 goto out;
2458
2459 get_mems_allowed();
2460 ret = !node_isset(zone->zone_pgdat->node_id,
2461 cpuset_current_mems_allowed);
2462 put_mems_allowed();
2463out:
2464 return ret;
2465}
2466
2414#define K(x) ((x) << (PAGE_SHIFT-10)) 2467#define K(x) ((x) << (PAGE_SHIFT-10))
2415 2468
2416/* 2469/*
2417 * Show free area list (used inside shift_scroll-lock stuff) 2470 * Show free area list (used inside shift_scroll-lock stuff)
2418 * We also calculate the percentage fragmentation. We do this by counting the 2471 * We also calculate the percentage fragmentation. We do this by counting the
2419 * memory on each free list with the exception of the first item on the list. 2472 * memory on each free list with the exception of the first item on the list.
2473 * Suppresses nodes that are not allowed by current's cpuset if
2474 * SHOW_MEM_FILTER_NODES is passed.
2420 */ 2475 */
2421void show_free_areas(void) 2476void __show_free_areas(unsigned int filter)
2422{ 2477{
2423 int cpu; 2478 int cpu;
2424 struct zone *zone; 2479 struct zone *zone;
2425 2480
2426 for_each_populated_zone(zone) { 2481 for_each_populated_zone(zone) {
2482 if (skip_free_areas_zone(filter, zone))
2483 continue;
2427 show_node(zone); 2484 show_node(zone);
2428 printk("%s per-cpu:\n", zone->name); 2485 printk("%s per-cpu:\n", zone->name);
2429 2486
@@ -2465,6 +2522,8 @@ void show_free_areas(void)
2465 for_each_populated_zone(zone) { 2522 for_each_populated_zone(zone) {
2466 int i; 2523 int i;
2467 2524
2525 if (skip_free_areas_zone(filter, zone))
2526 continue;
2468 show_node(zone); 2527 show_node(zone);
2469 printk("%s" 2528 printk("%s"
2470 " free:%lukB" 2529 " free:%lukB"
@@ -2532,6 +2591,8 @@ void show_free_areas(void)
2532 for_each_populated_zone(zone) { 2591 for_each_populated_zone(zone) {
2533 unsigned long nr[MAX_ORDER], flags, order, total = 0; 2592 unsigned long nr[MAX_ORDER], flags, order, total = 0;
2534 2593
2594 if (skip_free_areas_zone(filter, zone))
2595 continue;
2535 show_node(zone); 2596 show_node(zone);
2536 printk("%s: ", zone->name); 2597 printk("%s: ", zone->name);
2537 2598
@@ -2551,6 +2612,11 @@ void show_free_areas(void)
2551 show_swap_cache_info(); 2612 show_swap_cache_info();
2552} 2613}
2553 2614
2615void show_free_areas(void)
2616{
2617 __show_free_areas(0);
2618}
2619
2554static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 2620static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
2555{ 2621{
2556 zoneref->zone = zone; 2622 zoneref->zone = zone;
@@ -3110,7 +3176,7 @@ static __init_refok int __build_all_zonelists(void *data)
3110 * Called with zonelists_mutex held always 3176 * Called with zonelists_mutex held always
3111 * unless system_state == SYSTEM_BOOTING. 3177 * unless system_state == SYSTEM_BOOTING.
3112 */ 3178 */
3113void build_all_zonelists(void *data) 3179void __ref build_all_zonelists(void *data)
3114{ 3180{
3115 set_zonelist_order(); 3181 set_zonelist_order();
3116 3182
@@ -3699,13 +3765,45 @@ void __init free_bootmem_with_active_regions(int nid,
3699} 3765}
3700 3766
3701#ifdef CONFIG_HAVE_MEMBLOCK 3767#ifdef CONFIG_HAVE_MEMBLOCK
3768/*
3769 * Basic iterator support. Return the last range of PFNs for a node
3770 * Note: nid == MAX_NUMNODES returns last region regardless of node
3771 */
3772static int __meminit last_active_region_index_in_nid(int nid)
3773{
3774 int i;
3775
3776 for (i = nr_nodemap_entries - 1; i >= 0; i--)
3777 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
3778 return i;
3779
3780 return -1;
3781}
3782
3783/*
3784 * Basic iterator support. Return the previous active range of PFNs for a node
3785 * Note: nid == MAX_NUMNODES returns next region regardless of node
3786 */
3787static int __meminit previous_active_region_index_in_nid(int index, int nid)
3788{
3789 for (index = index - 1; index >= 0; index--)
3790 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
3791 return index;
3792
3793 return -1;
3794}
3795
3796#define for_each_active_range_index_in_nid_reverse(i, nid) \
3797 for (i = last_active_region_index_in_nid(nid); i != -1; \
3798 i = previous_active_region_index_in_nid(i, nid))
3799
3702u64 __init find_memory_core_early(int nid, u64 size, u64 align, 3800u64 __init find_memory_core_early(int nid, u64 size, u64 align,
3703 u64 goal, u64 limit) 3801 u64 goal, u64 limit)
3704{ 3802{
3705 int i; 3803 int i;
3706 3804
3707 /* Need to go over early_node_map to find out good range for node */ 3805 /* Need to go over early_node_map to find out good range for node */
3708 for_each_active_range_index_in_nid(i, nid) { 3806 for_each_active_range_index_in_nid_reverse(i, nid) {
3709 u64 addr; 3807 u64 addr;
3710 u64 ei_start, ei_last; 3808 u64 ei_start, ei_last;
3711 u64 final_start, final_end; 3809 u64 final_start, final_end;
@@ -3748,34 +3846,6 @@ int __init add_from_early_node_map(struct range *range, int az,
3748 return nr_range; 3846 return nr_range;
3749} 3847}
3750 3848
3751#ifdef CONFIG_NO_BOOTMEM
3752void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
3753 u64 goal, u64 limit)
3754{
3755 void *ptr;
3756 u64 addr;
3757
3758 if (limit > memblock.current_limit)
3759 limit = memblock.current_limit;
3760
3761 addr = find_memory_core_early(nid, size, align, goal, limit);
3762
3763 if (addr == MEMBLOCK_ERROR)
3764 return NULL;
3765
3766 ptr = phys_to_virt(addr);
3767 memset(ptr, 0, size);
3768 memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
3769 /*
3770 * The min_count is set to 0 so that bootmem allocated blocks
3771 * are never reported as leaks.
3772 */
3773 kmemleak_alloc(ptr, size, 0, 0);
3774 return ptr;
3775}
3776#endif
3777
3778
3779void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) 3849void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3780{ 3850{
3781 int i; 3851 int i;
@@ -3856,7 +3926,7 @@ static void __init find_usable_zone_for_movable(void)
3856 3926
3857/* 3927/*
3858 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 3928 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
3859 * because it is sized independant of architecture. Unlike the other zones, 3929 * because it is sized independent of architecture. Unlike the other zones,
3860 * the starting point for ZONE_MOVABLE is not fixed. It may be different 3930 * the starting point for ZONE_MOVABLE is not fixed. It may be different
3861 * in each node depending on the size of each node and how evenly kernelcore 3931 * in each node depending on the size of each node and how evenly kernelcore
3862 * is distributed. This helper function adjusts the zone ranges 3932 * is distributed. This helper function adjusts the zone ranges
@@ -4809,15 +4879,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4809 dma_reserve = new_dma_reserve; 4879 dma_reserve = new_dma_reserve;
4810} 4880}
4811 4881
4812#ifndef CONFIG_NEED_MULTIPLE_NODES
4813struct pglist_data __refdata contig_page_data = {
4814#ifndef CONFIG_NO_BOOTMEM
4815 .bdata = &bootmem_node_data[0]
4816#endif
4817 };
4818EXPORT_SYMBOL(contig_page_data);
4819#endif
4820
4821void __init free_area_init(unsigned long *zones_size) 4882void __init free_area_init(unsigned long *zones_size)
4822{ 4883{
4823 free_area_init_node(0, zones_size, 4884 free_area_init_node(0, zones_size,
@@ -5376,10 +5437,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5376 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5437 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5377 unsigned long check = pfn + iter; 5438 unsigned long check = pfn + iter;
5378 5439
5379 if (!pfn_valid_within(check)) { 5440 if (!pfn_valid_within(check))
5380 iter++;
5381 continue; 5441 continue;
5382 } 5442
5383 page = pfn_to_page(check); 5443 page = pfn_to_page(check);
5384 if (!page_count(page)) { 5444 if (!page_count(page)) {
5385 if (PageBuddy(page)) 5445 if (PageBuddy(page))
@@ -5627,4 +5687,5 @@ void dump_page(struct page *page)
5627 page, atomic_read(&page->_count), page_mapcount(page), 5687 page, atomic_read(&page->_count), page_mapcount(page),
5628 page->mapping, page->index); 5688 page->mapping, page->index);
5629 dump_page_flags(page->flags); 5689 dump_page_flags(page->flags);
5690 mem_cgroup_print_bad_page(page);
5630} 5691}
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5bffada7cde1..99055010cece 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -11,12 +11,11 @@
11#include <linux/swapops.h> 11#include <linux/swapops.h>
12#include <linux/kmemleak.h> 12#include <linux/kmemleak.h>
13 13
14static void __meminit 14static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
15__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
16{ 15{
17 pc->flags = 0; 16 pc->flags = 0;
17 set_page_cgroup_array_id(pc, id);
18 pc->mem_cgroup = NULL; 18 pc->mem_cgroup = NULL;
19 pc->page = pfn_to_page(pfn);
20 INIT_LIST_HEAD(&pc->lru); 19 INIT_LIST_HEAD(&pc->lru);
21} 20}
22static unsigned long total_usage; 21static unsigned long total_usage;
@@ -43,6 +42,19 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
43 return base + offset; 42 return base + offset;
44} 43}
45 44
45struct page *lookup_cgroup_page(struct page_cgroup *pc)
46{
47 unsigned long pfn;
48 struct page *page;
49 pg_data_t *pgdat;
50
51 pgdat = NODE_DATA(page_cgroup_array_id(pc));
52 pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
53 page = pfn_to_page(pfn);
54 VM_BUG_ON(pc != lookup_page_cgroup(page));
55 return page;
56}
57
46static int __init alloc_node_page_cgroup(int nid) 58static int __init alloc_node_page_cgroup(int nid)
47{ 59{
48 struct page_cgroup *base, *pc; 60 struct page_cgroup *base, *pc;
@@ -63,7 +75,7 @@ static int __init alloc_node_page_cgroup(int nid)
63 return -ENOMEM; 75 return -ENOMEM;
64 for (index = 0; index < nr_pages; index++) { 76 for (index = 0; index < nr_pages; index++) {
65 pc = base + index; 77 pc = base + index;
66 __init_page_cgroup(pc, start_pfn + index); 78 init_page_cgroup(pc, nid);
67 } 79 }
68 NODE_DATA(nid)->node_page_cgroup = base; 80 NODE_DATA(nid)->node_page_cgroup = base;
69 total_usage += table_size; 81 total_usage += table_size;
@@ -105,46 +117,75 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
105 return section->page_cgroup + pfn; 117 return section->page_cgroup + pfn;
106} 118}
107 119
108/* __alloc_bootmem...() is protected by !slab_available() */ 120struct page *lookup_cgroup_page(struct page_cgroup *pc)
121{
122 struct mem_section *section;
123 struct page *page;
124 unsigned long nr;
125
126 nr = page_cgroup_array_id(pc);
127 section = __nr_to_section(nr);
128 page = pfn_to_page(pc - section->page_cgroup);
129 VM_BUG_ON(pc != lookup_page_cgroup(page));
130 return page;
131}
132
133static void *__init_refok alloc_page_cgroup(size_t size, int nid)
134{
135 void *addr = NULL;
136
137 addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_NOWARN);
138 if (addr)
139 return addr;
140
141 if (node_state(nid, N_HIGH_MEMORY))
142 addr = vmalloc_node(size, nid);
143 else
144 addr = vmalloc(size);
145
146 return addr;
147}
148
149#ifdef CONFIG_MEMORY_HOTPLUG
150static void free_page_cgroup(void *addr)
151{
152 if (is_vmalloc_addr(addr)) {
153 vfree(addr);
154 } else {
155 struct page *page = virt_to_page(addr);
156 size_t table_size =
157 sizeof(struct page_cgroup) * PAGES_PER_SECTION;
158
159 BUG_ON(PageReserved(page));
160 free_pages_exact(addr, table_size);
161 }
162}
163#endif
164
109static int __init_refok init_section_page_cgroup(unsigned long pfn) 165static int __init_refok init_section_page_cgroup(unsigned long pfn)
110{ 166{
111 struct mem_section *section = __pfn_to_section(pfn);
112 struct page_cgroup *base, *pc; 167 struct page_cgroup *base, *pc;
168 struct mem_section *section;
113 unsigned long table_size; 169 unsigned long table_size;
170 unsigned long nr;
114 int nid, index; 171 int nid, index;
115 172
116 if (!section->page_cgroup) { 173 nr = pfn_to_section_nr(pfn);
117 nid = page_to_nid(pfn_to_page(pfn)); 174 section = __nr_to_section(nr);
118 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 175
119 VM_BUG_ON(!slab_is_available()); 176 if (section->page_cgroup)
120 if (node_state(nid, N_HIGH_MEMORY)) { 177 return 0;
121 base = kmalloc_node(table_size, 178
122 GFP_KERNEL | __GFP_NOWARN, nid); 179 nid = page_to_nid(pfn_to_page(pfn));
123 if (!base) 180 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
124 base = vmalloc_node(table_size, nid); 181 base = alloc_page_cgroup(table_size, nid);
125 } else { 182
126 base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); 183 /*
127 if (!base) 184 * The value stored in section->page_cgroup is (base - pfn)
128 base = vmalloc(table_size); 185 * and it does not point to the memory block allocated above,
129 } 186 * causing kmemleak false positives.
130 /* 187 */
131 * The value stored in section->page_cgroup is (base - pfn) 188 kmemleak_not_leak(base);
132 * and it does not point to the memory block allocated above,
133 * causing kmemleak false positives.
134 */
135 kmemleak_not_leak(base);
136 } else {
137 /*
138 * We don't have to allocate page_cgroup again, but
139 * address of memmap may be changed. So, we have to initialize
140 * again.
141 */
142 base = section->page_cgroup + pfn;
143 table_size = 0;
144 /* check address of memmap is changed or not. */
145 if (base->page == pfn_to_page(pfn))
146 return 0;
147 }
148 189
149 if (!base) { 190 if (!base) {
150 printk(KERN_ERR "page cgroup allocation failure\n"); 191 printk(KERN_ERR "page cgroup allocation failure\n");
@@ -153,7 +194,7 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
153 194
154 for (index = 0; index < PAGES_PER_SECTION; index++) { 195 for (index = 0; index < PAGES_PER_SECTION; index++) {
155 pc = base + index; 196 pc = base + index;
156 __init_page_cgroup(pc, pfn + index); 197 init_page_cgroup(pc, nr);
157 } 198 }
158 199
159 section->page_cgroup = base - pfn; 200 section->page_cgroup = base - pfn;
@@ -170,16 +211,8 @@ void __free_page_cgroup(unsigned long pfn)
170 if (!ms || !ms->page_cgroup) 211 if (!ms || !ms->page_cgroup)
171 return; 212 return;
172 base = ms->page_cgroup + pfn; 213 base = ms->page_cgroup + pfn;
173 if (is_vmalloc_addr(base)) { 214 free_page_cgroup(base);
174 vfree(base); 215 ms->page_cgroup = NULL;
175 ms->page_cgroup = NULL;
176 } else {
177 struct page *page = virt_to_page(base);
178 if (!PageReserved(page)) { /* Is bootmem ? */
179 kfree(base);
180 ms->page_cgroup = NULL;
181 }
182 }
183} 216}
184 217
185int __meminit online_page_cgroup(unsigned long start_pfn, 218int __meminit online_page_cgroup(unsigned long start_pfn,
@@ -243,12 +276,7 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
243 break; 276 break;
244 } 277 }
245 278
246 if (ret) 279 return notifier_from_errno(ret);
247 ret = notifier_from_errno(ret);
248 else
249 ret = NOTIFY_OK;
250
251 return ret;
252} 280}
253 281
254#endif 282#endif
@@ -349,7 +377,7 @@ not_enough_page:
349 * @new: new id 377 * @new: new id
350 * 378 *
351 * Returns old id at success, 0 at failure. 379 * Returns old id at success, 0 at failure.
352 * (There is no mem_cgroup useing 0 as its id) 380 * (There is no mem_cgroup using 0 as its id)
353 */ 381 */
354unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, 382unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
355 unsigned short old, unsigned short new) 383 unsigned short old, unsigned short new)
diff --git a/mm/page_io.c b/mm/page_io.c
index 2dee975bf469..dc76b4d0611e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
106 goto out; 106 goto out;
107 } 107 }
108 if (wbc->sync_mode == WB_SYNC_ALL) 108 if (wbc->sync_mode == WB_SYNC_ALL)
109 rw |= REQ_SYNC | REQ_UNPLUG; 109 rw |= REQ_SYNC;
110 count_vm_event(PSWPOUT); 110 count_vm_event(PSWPOUT);
111 set_page_writeback(page); 111 set_page_writeback(page);
112 unlock_page(page); 112 unlock_page(page);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 7cfa6ae02303..c3450d533611 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -33,19 +33,35 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
33 33
34 pmd = pmd_offset(pud, addr); 34 pmd = pmd_offset(pud, addr);
35 do { 35 do {
36again:
36 next = pmd_addr_end(addr, end); 37 next = pmd_addr_end(addr, end);
37 split_huge_page_pmd(walk->mm, pmd); 38 if (pmd_none(*pmd)) {
38 if (pmd_none_or_clear_bad(pmd)) {
39 if (walk->pte_hole) 39 if (walk->pte_hole)
40 err = walk->pte_hole(addr, next, walk); 40 err = walk->pte_hole(addr, next, walk);
41 if (err) 41 if (err)
42 break; 42 break;
43 continue; 43 continue;
44 } 44 }
45 /*
46 * This implies that each ->pmd_entry() handler
47 * needs to know about pmd_trans_huge() pmds
48 */
45 if (walk->pmd_entry) 49 if (walk->pmd_entry)
46 err = walk->pmd_entry(pmd, addr, next, walk); 50 err = walk->pmd_entry(pmd, addr, next, walk);
47 if (!err && walk->pte_entry) 51 if (err)
48 err = walk_pte_range(pmd, addr, next, walk); 52 break;
53
54 /*
55 * Check this here so we only break down trans_huge
56 * pages when we _need_ to
57 */
58 if (!walk->pte_entry)
59 continue;
60
61 split_huge_page_pmd(walk->mm, pmd);
62 if (pmd_none_or_clear_bad(pmd))
63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk);
49 if (err) 65 if (err)
50 break; 66 break;
51 } while (pmd++, addr = next, addr != end); 67 } while (pmd++, addr = next, addr != end);
diff --git a/mm/percpu.c b/mm/percpu.c
index 3f930018aa60..a160db39b810 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -342,7 +342,7 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
342 * @chunk: chunk of interest 342 * @chunk: chunk of interest
343 * 343 *
344 * Determine whether area map of @chunk needs to be extended to 344 * Determine whether area map of @chunk needs to be extended to
345 * accomodate a new allocation. 345 * accommodate a new allocation.
346 * 346 *
347 * CONTEXT: 347 * CONTEXT:
348 * pcpu_lock. 348 * pcpu_lock.
@@ -431,7 +431,7 @@ out_unlock:
431 * depending on @head, is reduced by @tail bytes and @tail byte block 431 * depending on @head, is reduced by @tail bytes and @tail byte block
432 * is inserted after the target block. 432 * is inserted after the target block.
433 * 433 *
434 * @chunk->map must have enough free slots to accomodate the split. 434 * @chunk->map must have enough free slots to accommodate the split.
435 * 435 *
436 * CONTEXT: 436 * CONTEXT:
437 * pcpu_lock. 437 * pcpu_lock.
@@ -1008,8 +1008,7 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
1008 } 1008 }
1009 1009
1010 if (in_first_chunk) { 1010 if (in_first_chunk) {
1011 if ((unsigned long)addr < VMALLOC_START || 1011 if (!is_vmalloc_addr(addr))
1012 (unsigned long)addr >= VMALLOC_END)
1013 return __pa(addr); 1012 return __pa(addr);
1014 else 1013 else
1015 return page_to_phys(vmalloc_to_page(addr)); 1014 return page_to_phys(vmalloc_to_page(addr));
@@ -1436,7 +1435,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1436 /* 1435 /*
1437 * Determine min_unit_size, alloc_size and max_upa such that 1436 * Determine min_unit_size, alloc_size and max_upa such that
1438 * alloc_size is multiple of atom_size and is the smallest 1437 * alloc_size is multiple of atom_size and is the smallest
1439 * which can accomodate 4k aligned segments which are equal to 1438 * which can accommodate 4k aligned segments which are equal to
1440 * or larger than min_unit_size. 1439 * or larger than min_unit_size.
1441 */ 1440 */
1442 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); 1441 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
@@ -1551,7 +1550,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1551 * @atom_size: allocation atom size 1550 * @atom_size: allocation atom size
1552 * @cpu_distance_fn: callback to determine distance between cpus, optional 1551 * @cpu_distance_fn: callback to determine distance between cpus, optional
1553 * @alloc_fn: function to allocate percpu page 1552 * @alloc_fn: function to allocate percpu page
1554 * @free_fn: funtion to free percpu page 1553 * @free_fn: function to free percpu page
1555 * 1554 *
1556 * This is a helper to ease setting up embedded first percpu chunk and 1555 * This is a helper to ease setting up embedded first percpu chunk and
1557 * can be called where pcpu_setup_first_chunk() is expected. 1556 * can be called where pcpu_setup_first_chunk() is expected.
@@ -1679,7 +1678,7 @@ out_free:
1679 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages 1678 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
1680 * @reserved_size: the size of reserved percpu area in bytes 1679 * @reserved_size: the size of reserved percpu area in bytes
1681 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE 1680 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
1682 * @free_fn: funtion to free percpu page, always called with PAGE_SIZE 1681 * @free_fn: function to free percpu page, always called with PAGE_SIZE
1683 * @populate_pte_fn: function to populate pte 1682 * @populate_pte_fn: function to populate pte
1684 * 1683 *
1685 * This is a helper to ease setting up page-remapped first percpu 1684 * This is a helper to ease setting up page-remapped first percpu
diff --git a/mm/readahead.c b/mm/readahead.c
index 77506a291a2d..2c0cc489e288 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -109,9 +109,12 @@ EXPORT_SYMBOL(read_cache_pages);
109static int read_pages(struct address_space *mapping, struct file *filp, 109static int read_pages(struct address_space *mapping, struct file *filp,
110 struct list_head *pages, unsigned nr_pages) 110 struct list_head *pages, unsigned nr_pages)
111{ 111{
112 struct blk_plug plug;
112 unsigned page_idx; 113 unsigned page_idx;
113 int ret; 114 int ret;
114 115
116 blk_start_plug(&plug);
117
115 if (mapping->a_ops->readpages) { 118 if (mapping->a_ops->readpages) {
116 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); 119 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
117 /* Clean up the remaining pages */ 120 /* Clean up the remaining pages */
@@ -129,7 +132,10 @@ static int read_pages(struct address_space *mapping, struct file *filp,
129 page_cache_release(page); 132 page_cache_release(page);
130 } 133 }
131 ret = 0; 134 ret = 0;
135
132out: 136out:
137 blk_finish_plug(&plug);
138
133 return ret; 139 return ret;
134} 140}
135 141
@@ -554,17 +560,5 @@ page_cache_async_readahead(struct address_space *mapping,
554 560
555 /* do read-ahead */ 561 /* do read-ahead */
556 ondemand_readahead(mapping, ra, filp, true, offset, req_size); 562 ondemand_readahead(mapping, ra, filp, true, offset, req_size);
557
558#ifdef CONFIG_BLOCK
559 /*
560 * Normally the current page is !uptodate and lock_page() will be
561 * immediately called to implicitly unplug the device. However this
562 * is not always true for RAID conifgurations, where data arrives
563 * not strictly in their submission order. In this case we need to
564 * explicitly kick off the IO.
565 */
566 if (PageUptodate(page))
567 blk_run_backing_dev(mapping->backing_dev_info, NULL);
568#endif
569} 563}
570EXPORT_SYMBOL_GPL(page_cache_async_readahead); 564EXPORT_SYMBOL_GPL(page_cache_async_readahead);
diff --git a/mm/rmap.c b/mm/rmap.c
index f21f4a1d6a1c..8da044a1db0f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -31,11 +31,12 @@
31 * swap_lock (in swap_duplicate, swap_info_get) 31 * swap_lock (in swap_duplicate, swap_info_get)
32 * mmlist_lock (in mmput, drain_mmlist and others) 32 * mmlist_lock (in mmput, drain_mmlist and others)
33 * mapping->private_lock (in __set_page_dirty_buffers) 33 * mapping->private_lock (in __set_page_dirty_buffers)
34 * inode_lock (in set_page_dirty's __mark_inode_dirty) 34 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
35 * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
35 * sb_lock (within inode_lock in fs/fs-writeback.c) 36 * sb_lock (within inode_lock in fs/fs-writeback.c)
36 * mapping->tree_lock (widely used, in set_page_dirty, 37 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 38 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_lock in __sync_single_inode) 39 * within inode_wb_list_lock in __sync_single_inode)
39 * 40 *
40 * (code doesn't rely on that order so it could be switched around) 41 * (code doesn't rely on that order so it could be switched around)
41 * ->tasklist_lock 42 * ->tasklist_lock
@@ -67,11 +68,24 @@ static struct kmem_cache *anon_vma_chain_cachep;
67 68
68static inline struct anon_vma *anon_vma_alloc(void) 69static inline struct anon_vma *anon_vma_alloc(void)
69{ 70{
70 return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); 71 struct anon_vma *anon_vma;
72
73 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
74 if (anon_vma) {
75 atomic_set(&anon_vma->refcount, 1);
76 /*
77 * Initialise the anon_vma root to point to itself. If called
78 * from fork, the root will be reset to the parents anon_vma.
79 */
80 anon_vma->root = anon_vma;
81 }
82
83 return anon_vma;
71} 84}
72 85
73void anon_vma_free(struct anon_vma *anon_vma) 86static inline void anon_vma_free(struct anon_vma *anon_vma)
74{ 87{
88 VM_BUG_ON(atomic_read(&anon_vma->refcount));
75 kmem_cache_free(anon_vma_cachep, anon_vma); 89 kmem_cache_free(anon_vma_cachep, anon_vma);
76} 90}
77 91
@@ -133,11 +147,6 @@ int anon_vma_prepare(struct vm_area_struct *vma)
133 if (unlikely(!anon_vma)) 147 if (unlikely(!anon_vma))
134 goto out_enomem_free_avc; 148 goto out_enomem_free_avc;
135 allocated = anon_vma; 149 allocated = anon_vma;
136 /*
137 * This VMA had no anon_vma yet. This anon_vma is
138 * the root of any anon_vma tree that might form.
139 */
140 anon_vma->root = anon_vma;
141 } 150 }
142 151
143 anon_vma_lock(anon_vma); 152 anon_vma_lock(anon_vma);
@@ -156,7 +165,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
156 anon_vma_unlock(anon_vma); 165 anon_vma_unlock(anon_vma);
157 166
158 if (unlikely(allocated)) 167 if (unlikely(allocated))
159 anon_vma_free(allocated); 168 put_anon_vma(allocated);
160 if (unlikely(avc)) 169 if (unlikely(avc))
161 anon_vma_chain_free(avc); 170 anon_vma_chain_free(avc);
162 } 171 }
@@ -241,9 +250,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
241 */ 250 */
242 anon_vma->root = pvma->anon_vma->root; 251 anon_vma->root = pvma->anon_vma->root;
243 /* 252 /*
244 * With KSM refcounts, an anon_vma can stay around longer than the 253 * With refcounts, an anon_vma can stay around longer than the
245 * process it belongs to. The root anon_vma needs to be pinned 254 * process it belongs to. The root anon_vma needs to be pinned until
246 * until this anon_vma is freed, because the lock lives in the root. 255 * this anon_vma is freed, because the lock lives in the root.
247 */ 256 */
248 get_anon_vma(anon_vma->root); 257 get_anon_vma(anon_vma->root);
249 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 258 /* Mark this anon_vma as the one where our new (COWed) pages go. */
@@ -253,7 +262,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
253 return 0; 262 return 0;
254 263
255 out_error_free_anon_vma: 264 out_error_free_anon_vma:
256 anon_vma_free(anon_vma); 265 put_anon_vma(anon_vma);
257 out_error: 266 out_error:
258 unlink_anon_vmas(vma); 267 unlink_anon_vmas(vma);
259 return -ENOMEM; 268 return -ENOMEM;
@@ -272,15 +281,11 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
272 list_del(&anon_vma_chain->same_anon_vma); 281 list_del(&anon_vma_chain->same_anon_vma);
273 282
274 /* We must garbage collect the anon_vma if it's empty */ 283 /* We must garbage collect the anon_vma if it's empty */
275 empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); 284 empty = list_empty(&anon_vma->head);
276 anon_vma_unlock(anon_vma); 285 anon_vma_unlock(anon_vma);
277 286
278 if (empty) { 287 if (empty)
279 /* We no longer need the root anon_vma */ 288 put_anon_vma(anon_vma);
280 if (anon_vma->root != anon_vma)
281 drop_anon_vma(anon_vma->root);
282 anon_vma_free(anon_vma);
283 }
284} 289}
285 290
286void unlink_anon_vmas(struct vm_area_struct *vma) 291void unlink_anon_vmas(struct vm_area_struct *vma)
@@ -303,7 +308,7 @@ static void anon_vma_ctor(void *data)
303 struct anon_vma *anon_vma = data; 308 struct anon_vma *anon_vma = data;
304 309
305 spin_lock_init(&anon_vma->lock); 310 spin_lock_init(&anon_vma->lock);
306 anonvma_external_refcount_init(anon_vma); 311 atomic_set(&anon_vma->refcount, 0);
307 INIT_LIST_HEAD(&anon_vma->head); 312 INIT_LIST_HEAD(&anon_vma->head);
308} 313}
309 314
@@ -497,41 +502,51 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
497 struct mm_struct *mm = vma->vm_mm; 502 struct mm_struct *mm = vma->vm_mm;
498 int referenced = 0; 503 int referenced = 0;
499 504
500 /*
501 * Don't want to elevate referenced for mlocked page that gets this far,
502 * in order that it progresses to try_to_unmap and is moved to the
503 * unevictable list.
504 */
505 if (vma->vm_flags & VM_LOCKED) {
506 *mapcount = 0; /* break early from loop */
507 *vm_flags |= VM_LOCKED;
508 goto out;
509 }
510
511 /* Pretend the page is referenced if the task has the
512 swap token and is in the middle of a page fault. */
513 if (mm != current->mm && has_swap_token(mm) &&
514 rwsem_is_locked(&mm->mmap_sem))
515 referenced++;
516
517 if (unlikely(PageTransHuge(page))) { 505 if (unlikely(PageTransHuge(page))) {
518 pmd_t *pmd; 506 pmd_t *pmd;
519 507
520 spin_lock(&mm->page_table_lock); 508 spin_lock(&mm->page_table_lock);
509 /*
510 * rmap might return false positives; we must filter
511 * these out using page_check_address_pmd().
512 */
521 pmd = page_check_address_pmd(page, mm, address, 513 pmd = page_check_address_pmd(page, mm, address,
522 PAGE_CHECK_ADDRESS_PMD_FLAG); 514 PAGE_CHECK_ADDRESS_PMD_FLAG);
523 if (pmd && !pmd_trans_splitting(*pmd) && 515 if (!pmd) {
524 pmdp_clear_flush_young_notify(vma, address, pmd)) 516 spin_unlock(&mm->page_table_lock);
517 goto out;
518 }
519
520 if (vma->vm_flags & VM_LOCKED) {
521 spin_unlock(&mm->page_table_lock);
522 *mapcount = 0; /* break early from loop */
523 *vm_flags |= VM_LOCKED;
524 goto out;
525 }
526
527 /* go ahead even if the pmd is pmd_trans_splitting() */
528 if (pmdp_clear_flush_young_notify(vma, address, pmd))
525 referenced++; 529 referenced++;
526 spin_unlock(&mm->page_table_lock); 530 spin_unlock(&mm->page_table_lock);
527 } else { 531 } else {
528 pte_t *pte; 532 pte_t *pte;
529 spinlock_t *ptl; 533 spinlock_t *ptl;
530 534
535 /*
536 * rmap might return false positives; we must filter
537 * these out using page_check_address().
538 */
531 pte = page_check_address(page, mm, address, &ptl, 0); 539 pte = page_check_address(page, mm, address, &ptl, 0);
532 if (!pte) 540 if (!pte)
533 goto out; 541 goto out;
534 542
543 if (vma->vm_flags & VM_LOCKED) {
544 pte_unmap_unlock(pte, ptl);
545 *mapcount = 0; /* break early from loop */
546 *vm_flags |= VM_LOCKED;
547 goto out;
548 }
549
535 if (ptep_clear_flush_young_notify(vma, address, pte)) { 550 if (ptep_clear_flush_young_notify(vma, address, pte)) {
536 /* 551 /*
537 * Don't treat a reference through a sequentially read 552 * Don't treat a reference through a sequentially read
@@ -546,6 +561,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
546 pte_unmap_unlock(pte, ptl); 561 pte_unmap_unlock(pte, ptl);
547 } 562 }
548 563
564 /* Pretend the page is referenced if the task has the
565 swap token and is in the middle of a page fault. */
566 if (mm != current->mm && has_swap_token(mm) &&
567 rwsem_is_locked(&mm->mmap_sem))
568 referenced++;
569
549 (*mapcount)--; 570 (*mapcount)--;
550 571
551 if (referenced) 572 if (referenced)
@@ -1470,41 +1491,15 @@ int try_to_munlock(struct page *page)
1470 return try_to_unmap_file(page, TTU_MUNLOCK); 1491 return try_to_unmap_file(page, TTU_MUNLOCK);
1471} 1492}
1472 1493
1473#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) 1494void __put_anon_vma(struct anon_vma *anon_vma)
1474/*
1475 * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root
1476 * if necessary. Be careful to do all the tests under the lock. Once
1477 * we know we are the last user, nobody else can get a reference and we
1478 * can do the freeing without the lock.
1479 */
1480void drop_anon_vma(struct anon_vma *anon_vma)
1481{ 1495{
1482 BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0); 1496 struct anon_vma *root = anon_vma->root;
1483 if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
1484 struct anon_vma *root = anon_vma->root;
1485 int empty = list_empty(&anon_vma->head);
1486 int last_root_user = 0;
1487 int root_empty = 0;
1488 1497
1489 /* 1498 if (root != anon_vma && atomic_dec_and_test(&root->refcount))
1490 * The refcount on a non-root anon_vma got dropped. Drop 1499 anon_vma_free(root);
1491 * the refcount on the root and check if we need to free it.
1492 */
1493 if (empty && anon_vma != root) {
1494 BUG_ON(atomic_read(&root->external_refcount) <= 0);
1495 last_root_user = atomic_dec_and_test(&root->external_refcount);
1496 root_empty = list_empty(&root->head);
1497 }
1498 anon_vma_unlock(anon_vma);
1499 1500
1500 if (empty) { 1501 anon_vma_free(anon_vma);
1501 anon_vma_free(anon_vma);
1502 if (root_empty && last_root_user)
1503 anon_vma_free(root);
1504 }
1505 }
1506} 1502}
1507#endif
1508 1503
1509#ifdef CONFIG_MIGRATION 1504#ifdef CONFIG_MIGRATION
1510/* 1505/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 5ee67c990602..8fa27e4e582a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -224,7 +224,6 @@ static const struct vm_operations_struct shmem_vm_ops;
224static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 224static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
225 .ra_pages = 0, /* No readahead */ 225 .ra_pages = 0, /* No readahead */
226 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 226 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
227 .unplug_io_fn = default_unplug_io_fn,
228}; 227};
229 228
230static LIST_HEAD(shmem_swaplist); 229static LIST_HEAD(shmem_swaplist);
@@ -422,7 +421,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
422 * a waste to allocate index if we cannot allocate data. 421 * a waste to allocate index if we cannot allocate data.
423 */ 422 */
424 if (sbinfo->max_blocks) { 423 if (sbinfo->max_blocks) {
425 if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0) 424 if (percpu_counter_compare(&sbinfo->used_blocks,
425 sbinfo->max_blocks - 1) >= 0)
426 return ERR_PTR(-ENOSPC); 426 return ERR_PTR(-ENOSPC);
427 percpu_counter_inc(&sbinfo->used_blocks); 427 percpu_counter_inc(&sbinfo->used_blocks);
428 spin_lock(&inode->i_lock); 428 spin_lock(&inode->i_lock);
@@ -779,7 +779,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
779 * If truncating down to a partial page, then 779 * If truncating down to a partial page, then
780 * if that page is already allocated, hold it 780 * if that page is already allocated, hold it
781 * in memory until the truncation is over, so 781 * in memory until the truncation is over, so
782 * truncate_partial_page cannnot miss it were 782 * truncate_partial_page cannot miss it were
783 * it assigned to swap. 783 * it assigned to swap.
784 */ 784 */
785 if (newsize & (PAGE_CACHE_SIZE-1)) { 785 if (newsize & (PAGE_CACHE_SIZE-1)) {
@@ -1081,7 +1081,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1081 shmem_recalc_inode(inode); 1081 shmem_recalc_inode(inode);
1082 1082
1083 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 1083 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1084 remove_from_page_cache(page); 1084 delete_from_page_cache(page);
1085 shmem_swp_set(info, entry, swap.val); 1085 shmem_swp_set(info, entry, swap.val);
1086 shmem_swp_unmap(entry); 1086 shmem_swp_unmap(entry);
1087 if (list_empty(&info->swaplist)) 1087 if (list_empty(&info->swaplist))
@@ -1091,7 +1091,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1091 spin_unlock(&info->lock); 1091 spin_unlock(&info->lock);
1092 swap_shmem_alloc(swap); 1092 swap_shmem_alloc(swap);
1093 BUG_ON(page_mapped(page)); 1093 BUG_ON(page_mapped(page));
1094 page_cache_release(page); /* pagecache ref */
1095 swap_writepage(page, wbc); 1094 swap_writepage(page, wbc);
1096 if (inode) { 1095 if (inode) {
1097 mutex_lock(&shmem_swaplist_mutex); 1096 mutex_lock(&shmem_swaplist_mutex);
@@ -1399,7 +1398,8 @@ repeat:
1399 shmem_swp_unmap(entry); 1398 shmem_swp_unmap(entry);
1400 sbinfo = SHMEM_SB(inode->i_sb); 1399 sbinfo = SHMEM_SB(inode->i_sb);
1401 if (sbinfo->max_blocks) { 1400 if (sbinfo->max_blocks) {
1402 if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) || 1401 if (percpu_counter_compare(&sbinfo->used_blocks,
1402 sbinfo->max_blocks) >= 0 ||
1403 shmem_acct_block(info->flags)) { 1403 shmem_acct_block(info->flags)) {
1404 spin_unlock(&info->lock); 1404 spin_unlock(&info->lock);
1405 error = -ENOSPC; 1405 error = -ENOSPC;
@@ -1843,8 +1843,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1843 1843
1844 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 1844 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1845 if (inode) { 1845 if (inode) {
1846 error = security_inode_init_security(inode, dir, NULL, NULL, 1846 error = security_inode_init_security(inode, dir,
1847 NULL); 1847 &dentry->d_name, NULL,
1848 NULL, NULL);
1848 if (error) { 1849 if (error) {
1849 if (error != -EOPNOTSUPP) { 1850 if (error != -EOPNOTSUPP) {
1850 iput(inode); 1851 iput(inode);
@@ -1983,8 +1984,8 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1983 if (!inode) 1984 if (!inode)
1984 return -ENOSPC; 1985 return -ENOSPC;
1985 1986
1986 error = security_inode_init_security(inode, dir, NULL, NULL, 1987 error = security_inode_init_security(inode, dir, &dentry->d_name, NULL,
1987 NULL); 1988 NULL, NULL);
1988 if (error) { 1989 if (error) {
1989 if (error != -EOPNOTSUPP) { 1990 if (error != -EOPNOTSUPP) {
1990 iput(inode); 1991 iput(inode);
@@ -2144,8 +2145,10 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2144{ 2145{
2145 struct inode *inode = dentry->d_inode; 2146 struct inode *inode = dentry->d_inode;
2146 2147
2147 if (*len < 3) 2148 if (*len < 3) {
2149 *len = 3;
2148 return 255; 2150 return 255;
2151 }
2149 2152
2150 if (inode_unhashed(inode)) { 2153 if (inode_unhashed(inode)) {
2151 /* Unfortunately insert_inode_hash is not idempotent, 2154 /* Unfortunately insert_inode_hash is not idempotent,
@@ -2791,5 +2794,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2791 fput(vma->vm_file); 2794 fput(vma->vm_file);
2792 vma->vm_file = file; 2795 vma->vm_file = file;
2793 vma->vm_ops = &shmem_vm_ops; 2796 vma->vm_ops = &shmem_vm_ops;
2797 vma->vm_flags |= VM_CAN_NONLINEAR;
2794 return 0; 2798 return 0;
2795} 2799}
diff --git a/mm/slab.c b/mm/slab.c
index 37961d1f584f..46a9c163a92f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -191,22 +191,6 @@ typedef unsigned int kmem_bufctl_t;
191#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) 191#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
192 192
193/* 193/*
194 * struct slab
195 *
196 * Manages the objs in a slab. Placed either at the beginning of mem allocated
197 * for a slab, or allocated from an general cache.
198 * Slabs are chained into three list: fully used, partial, fully free slabs.
199 */
200struct slab {
201 struct list_head list;
202 unsigned long colouroff;
203 void *s_mem; /* including colour offset */
204 unsigned int inuse; /* num of objs active in slab */
205 kmem_bufctl_t free;
206 unsigned short nodeid;
207};
208
209/*
210 * struct slab_rcu 194 * struct slab_rcu
211 * 195 *
212 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to 196 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
@@ -219,8 +203,6 @@ struct slab {
219 * 203 *
220 * rcu_read_lock before reading the address, then rcu_read_unlock after 204 * rcu_read_lock before reading the address, then rcu_read_unlock after
221 * taking the spinlock within the structure expected at that address. 205 * taking the spinlock within the structure expected at that address.
222 *
223 * We assume struct slab_rcu can overlay struct slab when destroying.
224 */ 206 */
225struct slab_rcu { 207struct slab_rcu {
226 struct rcu_head head; 208 struct rcu_head head;
@@ -229,6 +211,27 @@ struct slab_rcu {
229}; 211};
230 212
231/* 213/*
214 * struct slab
215 *
216 * Manages the objs in a slab. Placed either at the beginning of mem allocated
217 * for a slab, or allocated from an general cache.
218 * Slabs are chained into three list: fully used, partial, fully free slabs.
219 */
220struct slab {
221 union {
222 struct {
223 struct list_head list;
224 unsigned long colouroff;
225 void *s_mem; /* including colour offset */
226 unsigned int inuse; /* num of objs active in slab */
227 kmem_bufctl_t free;
228 unsigned short nodeid;
229 };
230 struct slab_rcu __slab_cover_slab_rcu;
231 };
232};
233
234/*
232 * struct array_cache 235 * struct array_cache
233 * 236 *
234 * Purpose: 237 * Purpose:
@@ -875,7 +878,7 @@ static struct array_cache *alloc_arraycache(int node, int entries,
875 nc = kmalloc_node(memsize, gfp, node); 878 nc = kmalloc_node(memsize, gfp, node);
876 /* 879 /*
877 * The array_cache structures contain pointers to free object. 880 * The array_cache structures contain pointers to free object.
878 * However, when such objects are allocated or transfered to another 881 * However, when such objects are allocated or transferred to another
879 * cache the pointers are not cleared and they could be counted as 882 * cache the pointers are not cleared and they could be counted as
880 * valid references during a kmemleak scan. Therefore, kmemleak must 883 * valid references during a kmemleak scan. Therefore, kmemleak must
881 * not scan such objects. 884 * not scan such objects.
@@ -1387,7 +1390,7 @@ static int __meminit slab_memory_callback(struct notifier_block *self,
1387 break; 1390 break;
1388 } 1391 }
1389out: 1392out:
1390 return ret ? notifier_from_errno(ret) : NOTIFY_OK; 1393 return notifier_from_errno(ret);
1391} 1394}
1392#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ 1395#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
1393 1396
@@ -2147,8 +2150,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2147 * 2150 *
2148 * @name must be valid until the cache is destroyed. This implies that 2151 * @name must be valid until the cache is destroyed. This implies that
2149 * the module calling this has to destroy the cache before getting unloaded. 2152 * the module calling this has to destroy the cache before getting unloaded.
2150 * Note that kmem_cache_name() is not guaranteed to return the same pointer,
2151 * therefore applications must manage it themselves.
2152 * 2153 *
2153 * The flags are 2154 * The flags are
2154 * 2155 *
@@ -2288,8 +2289,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2288 if (ralign < align) { 2289 if (ralign < align) {
2289 ralign = align; 2290 ralign = align;
2290 } 2291 }
2291 /* disable debug if not aligning with REDZONE_ALIGN */ 2292 /* disable debug if necessary */
2292 if (ralign & (__alignof__(unsigned long long) - 1)) 2293 if (ralign > __alignof__(unsigned long long))
2293 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2294 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2294 /* 2295 /*
2295 * 4) Store it. 2296 * 4) Store it.
@@ -2315,8 +2316,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2315 */ 2316 */
2316 if (flags & SLAB_RED_ZONE) { 2317 if (flags & SLAB_RED_ZONE) {
2317 /* add space for red zone words */ 2318 /* add space for red zone words */
2318 cachep->obj_offset += align; 2319 cachep->obj_offset += sizeof(unsigned long long);
2319 size += align + sizeof(unsigned long long); 2320 size += 2 * sizeof(unsigned long long);
2320 } 2321 }
2321 if (flags & SLAB_STORE_USER) { 2322 if (flags & SLAB_STORE_USER) {
2322 /* user store requires one word storage behind the end of 2323 /* user store requires one word storage behind the end of
@@ -2605,7 +2606,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2605 * 2606 *
2606 * The cache must be empty before calling this function. 2607 * The cache must be empty before calling this function.
2607 * 2608 *
2608 * The caller must guarantee that noone will allocate memory from the cache 2609 * The caller must guarantee that no one will allocate memory from the cache
2609 * during the kmem_cache_destroy(). 2610 * during the kmem_cache_destroy().
2610 */ 2611 */
2611void kmem_cache_destroy(struct kmem_cache *cachep) 2612void kmem_cache_destroy(struct kmem_cache *cachep)
@@ -3840,12 +3841,6 @@ unsigned int kmem_cache_size(struct kmem_cache *cachep)
3840} 3841}
3841EXPORT_SYMBOL(kmem_cache_size); 3842EXPORT_SYMBOL(kmem_cache_size);
3842 3843
3843const char *kmem_cache_name(struct kmem_cache *cachep)
3844{
3845 return cachep->name;
3846}
3847EXPORT_SYMBOL_GPL(kmem_cache_name);
3848
3849/* 3844/*
3850 * This initializes kmem_list3 or resizes various caches for all nodes. 3845 * This initializes kmem_list3 or resizes various caches for all nodes.
3851 */ 3846 */
diff --git a/mm/slob.c b/mm/slob.c
index 3588eaaef726..46e0aee33a23 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -666,12 +666,6 @@ unsigned int kmem_cache_size(struct kmem_cache *c)
666} 666}
667EXPORT_SYMBOL(kmem_cache_size); 667EXPORT_SYMBOL(kmem_cache_size);
668 668
669const char *kmem_cache_name(struct kmem_cache *c)
670{
671 return c->name;
672}
673EXPORT_SYMBOL(kmem_cache_name);
674
675int kmem_cache_shrink(struct kmem_cache *d) 669int kmem_cache_shrink(struct kmem_cache *d)
676{ 670{
677 return 0; 671 return 0;
diff --git a/mm/slub.c b/mm/slub.c
index e15aa7f193c9..94d2a33a866e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -64,7 +64,7 @@
64 * we must stay away from it for a while since we may cause a bouncing 64 * we must stay away from it for a while since we may cause a bouncing
65 * cacheline if we try to acquire the lock. So go onto the next slab. 65 * cacheline if we try to acquire the lock. So go onto the next slab.
66 * If all pages are busy then we may allocate a new slab instead of reusing 66 * If all pages are busy then we may allocate a new slab instead of reusing
67 * a partial slab. A new slab has noone operating on it and thus there is 67 * a partial slab. A new slab has no one operating on it and thus there is
68 * no danger of cacheline contention. 68 * no danger of cacheline contention.
69 * 69 *
70 * Interrupts are disabled during allocation and deallocation in order to 70 * Interrupts are disabled during allocation and deallocation in order to
@@ -217,7 +217,7 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
217 217
218#endif 218#endif
219 219
220static inline void stat(struct kmem_cache *s, enum stat_item si) 220static inline void stat(const struct kmem_cache *s, enum stat_item si)
221{ 221{
222#ifdef CONFIG_SLUB_STATS 222#ifdef CONFIG_SLUB_STATS
223 __this_cpu_inc(s->cpu_slab->stat[si]); 223 __this_cpu_inc(s->cpu_slab->stat[si]);
@@ -281,11 +281,40 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
281 return (p - addr) / s->size; 281 return (p - addr) / s->size;
282} 282}
283 283
284static inline size_t slab_ksize(const struct kmem_cache *s)
285{
286#ifdef CONFIG_SLUB_DEBUG
287 /*
288 * Debugging requires use of the padding between object
289 * and whatever may come after it.
290 */
291 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
292 return s->objsize;
293
294#endif
295 /*
296 * If we have the need to store the freelist pointer
297 * back there or track user information then we can
298 * only use the space before that information.
299 */
300 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
301 return s->inuse;
302 /*
303 * Else we can use all the padding etc for the allocation
304 */
305 return s->size;
306}
307
308static inline int order_objects(int order, unsigned long size, int reserved)
309{
310 return ((PAGE_SIZE << order) - reserved) / size;
311}
312
284static inline struct kmem_cache_order_objects oo_make(int order, 313static inline struct kmem_cache_order_objects oo_make(int order,
285 unsigned long size) 314 unsigned long size, int reserved)
286{ 315{
287 struct kmem_cache_order_objects x = { 316 struct kmem_cache_order_objects x = {
288 (order << OO_SHIFT) + (PAGE_SIZE << order) / size 317 (order << OO_SHIFT) + order_objects(order, size, reserved)
289 }; 318 };
290 319
291 return x; 320 return x;
@@ -617,7 +646,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
617 return 1; 646 return 1;
618 647
619 start = page_address(page); 648 start = page_address(page);
620 length = (PAGE_SIZE << compound_order(page)); 649 length = (PAGE_SIZE << compound_order(page)) - s->reserved;
621 end = start + length; 650 end = start + length;
622 remainder = length % s->size; 651 remainder = length % s->size;
623 if (!remainder) 652 if (!remainder)
@@ -698,7 +727,7 @@ static int check_slab(struct kmem_cache *s, struct page *page)
698 return 0; 727 return 0;
699 } 728 }
700 729
701 maxobj = (PAGE_SIZE << compound_order(page)) / s->size; 730 maxobj = order_objects(compound_order(page), s->size, s->reserved);
702 if (page->objects > maxobj) { 731 if (page->objects > maxobj) {
703 slab_err(s, page, "objects %u > max %u", 732 slab_err(s, page, "objects %u > max %u",
704 s->name, page->objects, maxobj); 733 s->name, page->objects, maxobj);
@@ -748,7 +777,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
748 nr++; 777 nr++;
749 } 778 }
750 779
751 max_objects = (PAGE_SIZE << compound_order(page)) / s->size; 780 max_objects = order_objects(compound_order(page), s->size, s->reserved);
752 if (max_objects > MAX_OBJS_PER_PAGE) 781 if (max_objects > MAX_OBJS_PER_PAGE)
753 max_objects = MAX_OBJS_PER_PAGE; 782 max_objects = MAX_OBJS_PER_PAGE;
754 783
@@ -800,21 +829,31 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
800static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) 829static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
801{ 830{
802 flags &= gfp_allowed_mask; 831 flags &= gfp_allowed_mask;
803 kmemcheck_slab_alloc(s, flags, object, s->objsize); 832 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
804 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); 833 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
805} 834}
806 835
807static inline void slab_free_hook(struct kmem_cache *s, void *x) 836static inline void slab_free_hook(struct kmem_cache *s, void *x)
808{ 837{
809 kmemleak_free_recursive(x, s->flags); 838 kmemleak_free_recursive(x, s->flags);
810}
811 839
812static inline void slab_free_hook_irq(struct kmem_cache *s, void *object) 840 /*
813{ 841 * Trouble is that we may no longer disable interrupts in the fast path
814 kmemcheck_slab_free(s, object, s->objsize); 842 * So in order to make the debug calls that expect irqs to be
815 debug_check_no_locks_freed(object, s->objsize); 843 * disabled we need to disable interrupts temporarily.
844 */
845#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
846 {
847 unsigned long flags;
848
849 local_irq_save(flags);
850 kmemcheck_slab_free(s, x, s->objsize);
851 debug_check_no_locks_freed(x, s->objsize);
852 local_irq_restore(flags);
853 }
854#endif
816 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 855 if (!(s->flags & SLAB_DEBUG_OBJECTS))
817 debug_check_no_obj_freed(object, s->objsize); 856 debug_check_no_obj_freed(x, s->objsize);
818} 857}
819 858
820/* 859/*
@@ -1101,9 +1140,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1101 1140
1102static inline void slab_free_hook(struct kmem_cache *s, void *x) {} 1141static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
1103 1142
1104static inline void slab_free_hook_irq(struct kmem_cache *s,
1105 void *object) {}
1106
1107#endif /* CONFIG_SLUB_DEBUG */ 1143#endif /* CONFIG_SLUB_DEBUG */
1108 1144
1109/* 1145/*
@@ -1249,21 +1285,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1249 __free_pages(page, order); 1285 __free_pages(page, order);
1250} 1286}
1251 1287
1288#define need_reserve_slab_rcu \
1289 (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
1290
1252static void rcu_free_slab(struct rcu_head *h) 1291static void rcu_free_slab(struct rcu_head *h)
1253{ 1292{
1254 struct page *page; 1293 struct page *page;
1255 1294
1256 page = container_of((struct list_head *)h, struct page, lru); 1295 if (need_reserve_slab_rcu)
1296 page = virt_to_head_page(h);
1297 else
1298 page = container_of((struct list_head *)h, struct page, lru);
1299
1257 __free_slab(page->slab, page); 1300 __free_slab(page->slab, page);
1258} 1301}
1259 1302
1260static void free_slab(struct kmem_cache *s, struct page *page) 1303static void free_slab(struct kmem_cache *s, struct page *page)
1261{ 1304{
1262 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { 1305 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
1263 /* 1306 struct rcu_head *head;
1264 * RCU free overloads the RCU head over the LRU 1307
1265 */ 1308 if (need_reserve_slab_rcu) {
1266 struct rcu_head *head = (void *)&page->lru; 1309 int order = compound_order(page);
1310 int offset = (PAGE_SIZE << order) - s->reserved;
1311
1312 VM_BUG_ON(s->reserved != sizeof(*head));
1313 head = page_address(page) + offset;
1314 } else {
1315 /*
1316 * RCU free overloads the RCU head over the LRU
1317 */
1318 head = (void *)&page->lru;
1319 }
1267 1320
1268 call_rcu(head, rcu_free_slab); 1321 call_rcu(head, rcu_free_slab);
1269 } else 1322 } else
@@ -1487,6 +1540,78 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1487 } 1540 }
1488} 1541}
1489 1542
1543#ifdef CONFIG_CMPXCHG_LOCAL
1544#ifdef CONFIG_PREEMPT
1545/*
1546 * Calculate the next globally unique transaction for disambiguation
1547 * during cmpxchg. The transactions start with the cpu number and are then
1548 * incremented by CONFIG_NR_CPUS.
1549 */
1550#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS)
1551#else
1552/*
1553 * No preemption supported therefore also no need to check for
1554 * different cpus.
1555 */
1556#define TID_STEP 1
1557#endif
1558
1559static inline unsigned long next_tid(unsigned long tid)
1560{
1561 return tid + TID_STEP;
1562}
1563
1564static inline unsigned int tid_to_cpu(unsigned long tid)
1565{
1566 return tid % TID_STEP;
1567}
1568
1569static inline unsigned long tid_to_event(unsigned long tid)
1570{
1571 return tid / TID_STEP;
1572}
1573
1574static inline unsigned int init_tid(int cpu)
1575{
1576 return cpu;
1577}
1578
1579static inline void note_cmpxchg_failure(const char *n,
1580 const struct kmem_cache *s, unsigned long tid)
1581{
1582#ifdef SLUB_DEBUG_CMPXCHG
1583 unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
1584
1585 printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
1586
1587#ifdef CONFIG_PREEMPT
1588 if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
1589 printk("due to cpu change %d -> %d\n",
1590 tid_to_cpu(tid), tid_to_cpu(actual_tid));
1591 else
1592#endif
1593 if (tid_to_event(tid) != tid_to_event(actual_tid))
1594 printk("due to cpu running other code. Event %ld->%ld\n",
1595 tid_to_event(tid), tid_to_event(actual_tid));
1596 else
1597 printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
1598 actual_tid, tid, next_tid(tid));
1599#endif
1600 stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
1601}
1602
1603#endif
1604
1605void init_kmem_cache_cpus(struct kmem_cache *s)
1606{
1607#ifdef CONFIG_CMPXCHG_LOCAL
1608 int cpu;
1609
1610 for_each_possible_cpu(cpu)
1611 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
1612#endif
1613
1614}
1490/* 1615/*
1491 * Remove the cpu slab 1616 * Remove the cpu slab
1492 */ 1617 */
@@ -1518,6 +1643,9 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1518 page->inuse--; 1643 page->inuse--;
1519 } 1644 }
1520 c->page = NULL; 1645 c->page = NULL;
1646#ifdef CONFIG_CMPXCHG_LOCAL
1647 c->tid = next_tid(c->tid);
1648#endif
1521 unfreeze_slab(s, page, tail); 1649 unfreeze_slab(s, page, tail);
1522} 1650}
1523 1651
@@ -1652,6 +1780,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1652{ 1780{
1653 void **object; 1781 void **object;
1654 struct page *new; 1782 struct page *new;
1783#ifdef CONFIG_CMPXCHG_LOCAL
1784 unsigned long flags;
1785
1786 local_irq_save(flags);
1787#ifdef CONFIG_PREEMPT
1788 /*
1789 * We may have been preempted and rescheduled on a different
1790 * cpu before disabling interrupts. Need to reload cpu area
1791 * pointer.
1792 */
1793 c = this_cpu_ptr(s->cpu_slab);
1794#endif
1795#endif
1655 1796
1656 /* We handle __GFP_ZERO in the caller */ 1797 /* We handle __GFP_ZERO in the caller */
1657 gfpflags &= ~__GFP_ZERO; 1798 gfpflags &= ~__GFP_ZERO;
@@ -1678,6 +1819,10 @@ load_freelist:
1678 c->node = page_to_nid(c->page); 1819 c->node = page_to_nid(c->page);
1679unlock_out: 1820unlock_out:
1680 slab_unlock(c->page); 1821 slab_unlock(c->page);
1822#ifdef CONFIG_CMPXCHG_LOCAL
1823 c->tid = next_tid(c->tid);
1824 local_irq_restore(flags);
1825#endif
1681 stat(s, ALLOC_SLOWPATH); 1826 stat(s, ALLOC_SLOWPATH);
1682 return object; 1827 return object;
1683 1828
@@ -1713,6 +1858,9 @@ new_slab:
1713 } 1858 }
1714 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 1859 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1715 slab_out_of_memory(s, gfpflags, node); 1860 slab_out_of_memory(s, gfpflags, node);
1861#ifdef CONFIG_CMPXCHG_LOCAL
1862 local_irq_restore(flags);
1863#endif
1716 return NULL; 1864 return NULL;
1717debug: 1865debug:
1718 if (!alloc_debug_processing(s, c->page, object, addr)) 1866 if (!alloc_debug_processing(s, c->page, object, addr))
@@ -1739,23 +1887,76 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1739{ 1887{
1740 void **object; 1888 void **object;
1741 struct kmem_cache_cpu *c; 1889 struct kmem_cache_cpu *c;
1890#ifdef CONFIG_CMPXCHG_LOCAL
1891 unsigned long tid;
1892#else
1742 unsigned long flags; 1893 unsigned long flags;
1894#endif
1743 1895
1744 if (slab_pre_alloc_hook(s, gfpflags)) 1896 if (slab_pre_alloc_hook(s, gfpflags))
1745 return NULL; 1897 return NULL;
1746 1898
1899#ifndef CONFIG_CMPXCHG_LOCAL
1747 local_irq_save(flags); 1900 local_irq_save(flags);
1901#else
1902redo:
1903#endif
1904
1905 /*
1906 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
1907 * enabled. We may switch back and forth between cpus while
1908 * reading from one cpu area. That does not matter as long
1909 * as we end up on the original cpu again when doing the cmpxchg.
1910 */
1748 c = __this_cpu_ptr(s->cpu_slab); 1911 c = __this_cpu_ptr(s->cpu_slab);
1912
1913#ifdef CONFIG_CMPXCHG_LOCAL
1914 /*
1915 * The transaction ids are globally unique per cpu and per operation on
1916 * a per cpu queue. Thus they can guarantee that the cmpxchg_double
1917 * occurs on the right processor and that there was no operation on the
1918 * linked list in between.
1919 */
1920 tid = c->tid;
1921 barrier();
1922#endif
1923
1749 object = c->freelist; 1924 object = c->freelist;
1750 if (unlikely(!object || !node_match(c, node))) 1925 if (unlikely(!object || !node_match(c, node)))
1751 1926
1752 object = __slab_alloc(s, gfpflags, node, addr, c); 1927 object = __slab_alloc(s, gfpflags, node, addr, c);
1753 1928
1754 else { 1929 else {
1930#ifdef CONFIG_CMPXCHG_LOCAL
1931 /*
1932 * The cmpxchg will only match if there was no additional
1933 * operation and if we are on the right processor.
1934 *
1935 * The cmpxchg does the following atomically (without lock semantics!)
1936 * 1. Relocate first pointer to the current per cpu area.
1937 * 2. Verify that tid and freelist have not been changed
1938 * 3. If they were not changed replace tid and freelist
1939 *
1940 * Since this is without lock semantics the protection is only against
1941 * code executing on this cpu *not* from access by other cpus.
1942 */
1943 if (unlikely(!this_cpu_cmpxchg_double(
1944 s->cpu_slab->freelist, s->cpu_slab->tid,
1945 object, tid,
1946 get_freepointer(s, object), next_tid(tid)))) {
1947
1948 note_cmpxchg_failure("slab_alloc", s, tid);
1949 goto redo;
1950 }
1951#else
1755 c->freelist = get_freepointer(s, object); 1952 c->freelist = get_freepointer(s, object);
1953#endif
1756 stat(s, ALLOC_FASTPATH); 1954 stat(s, ALLOC_FASTPATH);
1757 } 1955 }
1956
1957#ifndef CONFIG_CMPXCHG_LOCAL
1758 local_irq_restore(flags); 1958 local_irq_restore(flags);
1959#endif
1759 1960
1760 if (unlikely(gfpflags & __GFP_ZERO) && object) 1961 if (unlikely(gfpflags & __GFP_ZERO) && object)
1761 memset(object, 0, s->objsize); 1962 memset(object, 0, s->objsize);
@@ -1833,9 +2034,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1833{ 2034{
1834 void *prior; 2035 void *prior;
1835 void **object = (void *)x; 2036 void **object = (void *)x;
2037#ifdef CONFIG_CMPXCHG_LOCAL
2038 unsigned long flags;
1836 2039
1837 stat(s, FREE_SLOWPATH); 2040 local_irq_save(flags);
2041#endif
1838 slab_lock(page); 2042 slab_lock(page);
2043 stat(s, FREE_SLOWPATH);
1839 2044
1840 if (kmem_cache_debug(s)) 2045 if (kmem_cache_debug(s))
1841 goto debug; 2046 goto debug;
@@ -1865,6 +2070,9 @@ checks_ok:
1865 2070
1866out_unlock: 2071out_unlock:
1867 slab_unlock(page); 2072 slab_unlock(page);
2073#ifdef CONFIG_CMPXCHG_LOCAL
2074 local_irq_restore(flags);
2075#endif
1868 return; 2076 return;
1869 2077
1870slab_empty: 2078slab_empty:
@@ -1876,6 +2084,9 @@ slab_empty:
1876 stat(s, FREE_REMOVE_PARTIAL); 2084 stat(s, FREE_REMOVE_PARTIAL);
1877 } 2085 }
1878 slab_unlock(page); 2086 slab_unlock(page);
2087#ifdef CONFIG_CMPXCHG_LOCAL
2088 local_irq_restore(flags);
2089#endif
1879 stat(s, FREE_SLAB); 2090 stat(s, FREE_SLAB);
1880 discard_slab(s, page); 2091 discard_slab(s, page);
1881 return; 2092 return;
@@ -1902,23 +2113,56 @@ static __always_inline void slab_free(struct kmem_cache *s,
1902{ 2113{
1903 void **object = (void *)x; 2114 void **object = (void *)x;
1904 struct kmem_cache_cpu *c; 2115 struct kmem_cache_cpu *c;
2116#ifdef CONFIG_CMPXCHG_LOCAL
2117 unsigned long tid;
2118#else
1905 unsigned long flags; 2119 unsigned long flags;
2120#endif
1906 2121
1907 slab_free_hook(s, x); 2122 slab_free_hook(s, x);
1908 2123
2124#ifndef CONFIG_CMPXCHG_LOCAL
1909 local_irq_save(flags); 2125 local_irq_save(flags);
2126
2127#else
2128redo:
2129#endif
2130
2131 /*
2132 * Determine the current cpu's per cpu slab.
2133 * The cpu may change afterward. However that does not matter since
2134 * data is retrieved via this pointer. If we are on the same cpu
2135 * during the cmpxchg then the free will succeed.
2136 */
1910 c = __this_cpu_ptr(s->cpu_slab); 2137 c = __this_cpu_ptr(s->cpu_slab);
1911 2138
1912 slab_free_hook_irq(s, x); 2139#ifdef CONFIG_CMPXCHG_LOCAL
2140 tid = c->tid;
2141 barrier();
2142#endif
1913 2143
1914 if (likely(page == c->page && c->node != NUMA_NO_NODE)) { 2144 if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
1915 set_freepointer(s, object, c->freelist); 2145 set_freepointer(s, object, c->freelist);
2146
2147#ifdef CONFIG_CMPXCHG_LOCAL
2148 if (unlikely(!this_cpu_cmpxchg_double(
2149 s->cpu_slab->freelist, s->cpu_slab->tid,
2150 c->freelist, tid,
2151 object, next_tid(tid)))) {
2152
2153 note_cmpxchg_failure("slab_free", s, tid);
2154 goto redo;
2155 }
2156#else
1916 c->freelist = object; 2157 c->freelist = object;
2158#endif
1917 stat(s, FREE_FASTPATH); 2159 stat(s, FREE_FASTPATH);
1918 } else 2160 } else
1919 __slab_free(s, page, x, addr); 2161 __slab_free(s, page, x, addr);
1920 2162
2163#ifndef CONFIG_CMPXCHG_LOCAL
1921 local_irq_restore(flags); 2164 local_irq_restore(flags);
2165#endif
1922} 2166}
1923 2167
1924void kmem_cache_free(struct kmem_cache *s, void *x) 2168void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -1988,13 +2232,13 @@ static int slub_nomerge;
1988 * the smallest order which will fit the object. 2232 * the smallest order which will fit the object.
1989 */ 2233 */
1990static inline int slab_order(int size, int min_objects, 2234static inline int slab_order(int size, int min_objects,
1991 int max_order, int fract_leftover) 2235 int max_order, int fract_leftover, int reserved)
1992{ 2236{
1993 int order; 2237 int order;
1994 int rem; 2238 int rem;
1995 int min_order = slub_min_order; 2239 int min_order = slub_min_order;
1996 2240
1997 if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) 2241 if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
1998 return get_order(size * MAX_OBJS_PER_PAGE) - 1; 2242 return get_order(size * MAX_OBJS_PER_PAGE) - 1;
1999 2243
2000 for (order = max(min_order, 2244 for (order = max(min_order,
@@ -2003,10 +2247,10 @@ static inline int slab_order(int size, int min_objects,
2003 2247
2004 unsigned long slab_size = PAGE_SIZE << order; 2248 unsigned long slab_size = PAGE_SIZE << order;
2005 2249
2006 if (slab_size < min_objects * size) 2250 if (slab_size < min_objects * size + reserved)
2007 continue; 2251 continue;
2008 2252
2009 rem = slab_size % size; 2253 rem = (slab_size - reserved) % size;
2010 2254
2011 if (rem <= slab_size / fract_leftover) 2255 if (rem <= slab_size / fract_leftover)
2012 break; 2256 break;
@@ -2016,7 +2260,7 @@ static inline int slab_order(int size, int min_objects,
2016 return order; 2260 return order;
2017} 2261}
2018 2262
2019static inline int calculate_order(int size) 2263static inline int calculate_order(int size, int reserved)
2020{ 2264{
2021 int order; 2265 int order;
2022 int min_objects; 2266 int min_objects;
@@ -2034,14 +2278,14 @@ static inline int calculate_order(int size)
2034 min_objects = slub_min_objects; 2278 min_objects = slub_min_objects;
2035 if (!min_objects) 2279 if (!min_objects)
2036 min_objects = 4 * (fls(nr_cpu_ids) + 1); 2280 min_objects = 4 * (fls(nr_cpu_ids) + 1);
2037 max_objects = (PAGE_SIZE << slub_max_order)/size; 2281 max_objects = order_objects(slub_max_order, size, reserved);
2038 min_objects = min(min_objects, max_objects); 2282 min_objects = min(min_objects, max_objects);
2039 2283
2040 while (min_objects > 1) { 2284 while (min_objects > 1) {
2041 fraction = 16; 2285 fraction = 16;
2042 while (fraction >= 4) { 2286 while (fraction >= 4) {
2043 order = slab_order(size, min_objects, 2287 order = slab_order(size, min_objects,
2044 slub_max_order, fraction); 2288 slub_max_order, fraction, reserved);
2045 if (order <= slub_max_order) 2289 if (order <= slub_max_order)
2046 return order; 2290 return order;
2047 fraction /= 2; 2291 fraction /= 2;
@@ -2053,14 +2297,14 @@ static inline int calculate_order(int size)
2053 * We were unable to place multiple objects in a slab. Now 2297 * We were unable to place multiple objects in a slab. Now
2054 * lets see if we can place a single object there. 2298 * lets see if we can place a single object there.
2055 */ 2299 */
2056 order = slab_order(size, 1, slub_max_order, 1); 2300 order = slab_order(size, 1, slub_max_order, 1, reserved);
2057 if (order <= slub_max_order) 2301 if (order <= slub_max_order)
2058 return order; 2302 return order;
2059 2303
2060 /* 2304 /*
2061 * Doh this slab cannot be placed using slub_max_order. 2305 * Doh this slab cannot be placed using slub_max_order.
2062 */ 2306 */
2063 order = slab_order(size, 1, MAX_ORDER, 1); 2307 order = slab_order(size, 1, MAX_ORDER, 1, reserved);
2064 if (order < MAX_ORDER) 2308 if (order < MAX_ORDER)
2065 return order; 2309 return order;
2066 return -ENOSYS; 2310 return -ENOSYS;
@@ -2110,9 +2354,23 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2110 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < 2354 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
2111 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); 2355 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
2112 2356
2357#ifdef CONFIG_CMPXCHG_LOCAL
2358 /*
2359 * Must align to double word boundary for the double cmpxchg instructions
2360 * to work.
2361 */
2362 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *));
2363#else
2364 /* Regular alignment is sufficient */
2113 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); 2365 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2366#endif
2367
2368 if (!s->cpu_slab)
2369 return 0;
2114 2370
2115 return s->cpu_slab != NULL; 2371 init_kmem_cache_cpus(s);
2372
2373 return 1;
2116} 2374}
2117 2375
2118static struct kmem_cache *kmem_cache_node; 2376static struct kmem_cache *kmem_cache_node;
@@ -2311,7 +2569,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2311 if (forced_order >= 0) 2569 if (forced_order >= 0)
2312 order = forced_order; 2570 order = forced_order;
2313 else 2571 else
2314 order = calculate_order(size); 2572 order = calculate_order(size, s->reserved);
2315 2573
2316 if (order < 0) 2574 if (order < 0)
2317 return 0; 2575 return 0;
@@ -2329,8 +2587,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2329 /* 2587 /*
2330 * Determine the number of objects per slab 2588 * Determine the number of objects per slab
2331 */ 2589 */
2332 s->oo = oo_make(order, size); 2590 s->oo = oo_make(order, size, s->reserved);
2333 s->min = oo_make(get_order(size), size); 2591 s->min = oo_make(get_order(size), size, s->reserved);
2334 if (oo_objects(s->oo) > oo_objects(s->max)) 2592 if (oo_objects(s->oo) > oo_objects(s->max))
2335 s->max = s->oo; 2593 s->max = s->oo;
2336 2594
@@ -2349,6 +2607,10 @@ static int kmem_cache_open(struct kmem_cache *s,
2349 s->objsize = size; 2607 s->objsize = size;
2350 s->align = align; 2608 s->align = align;
2351 s->flags = kmem_cache_flags(size, flags, name, ctor); 2609 s->flags = kmem_cache_flags(size, flags, name, ctor);
2610 s->reserved = 0;
2611
2612 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
2613 s->reserved = sizeof(struct rcu_head);
2352 2614
2353 if (!calculate_sizes(s, -1)) 2615 if (!calculate_sizes(s, -1))
2354 goto error; 2616 goto error;
@@ -2399,12 +2661,6 @@ unsigned int kmem_cache_size(struct kmem_cache *s)
2399} 2661}
2400EXPORT_SYMBOL(kmem_cache_size); 2662EXPORT_SYMBOL(kmem_cache_size);
2401 2663
2402const char *kmem_cache_name(struct kmem_cache *s)
2403{
2404 return s->name;
2405}
2406EXPORT_SYMBOL(kmem_cache_name);
2407
2408static void list_slab_objects(struct kmem_cache *s, struct page *page, 2664static void list_slab_objects(struct kmem_cache *s, struct page *page,
2409 const char *text) 2665 const char *text)
2410{ 2666{
@@ -2696,7 +2952,6 @@ EXPORT_SYMBOL(__kmalloc_node);
2696size_t ksize(const void *object) 2952size_t ksize(const void *object)
2697{ 2953{
2698 struct page *page; 2954 struct page *page;
2699 struct kmem_cache *s;
2700 2955
2701 if (unlikely(object == ZERO_SIZE_PTR)) 2956 if (unlikely(object == ZERO_SIZE_PTR))
2702 return 0; 2957 return 0;
@@ -2707,28 +2962,8 @@ size_t ksize(const void *object)
2707 WARN_ON(!PageCompound(page)); 2962 WARN_ON(!PageCompound(page));
2708 return PAGE_SIZE << compound_order(page); 2963 return PAGE_SIZE << compound_order(page);
2709 } 2964 }
2710 s = page->slab;
2711
2712#ifdef CONFIG_SLUB_DEBUG
2713 /*
2714 * Debugging requires use of the padding between object
2715 * and whatever may come after it.
2716 */
2717 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2718 return s->objsize;
2719 2965
2720#endif 2966 return slab_ksize(page->slab);
2721 /*
2722 * If we have the need to store the freelist pointer
2723 * back there or track user information then we can
2724 * only use the space before that information.
2725 */
2726 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2727 return s->inuse;
2728 /*
2729 * Else we can use all the padding etc for the allocation
2730 */
2731 return s->size;
2732} 2967}
2733EXPORT_SYMBOL(ksize); 2968EXPORT_SYMBOL(ksize);
2734 2969
@@ -3312,7 +3547,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3312 3547
3313 ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); 3548 ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller);
3314 3549
3315 /* Honor the call site pointer we recieved. */ 3550 /* Honor the call site pointer we received. */
3316 trace_kmalloc(caller, ret, size, s->size, gfpflags); 3551 trace_kmalloc(caller, ret, size, s->size, gfpflags);
3317 3552
3318 return ret; 3553 return ret;
@@ -3342,7 +3577,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3342 3577
3343 ret = slab_alloc(s, gfpflags, node, caller); 3578 ret = slab_alloc(s, gfpflags, node, caller);
3344 3579
3345 /* Honor the call site pointer we recieved. */ 3580 /* Honor the call site pointer we received. */
3346 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); 3581 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
3347 3582
3348 return ret; 3583 return ret;
@@ -4017,6 +4252,12 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
4017} 4252}
4018SLAB_ATTR_RO(destroy_by_rcu); 4253SLAB_ATTR_RO(destroy_by_rcu);
4019 4254
4255static ssize_t reserved_show(struct kmem_cache *s, char *buf)
4256{
4257 return sprintf(buf, "%d\n", s->reserved);
4258}
4259SLAB_ATTR_RO(reserved);
4260
4020#ifdef CONFIG_SLUB_DEBUG 4261#ifdef CONFIG_SLUB_DEBUG
4021static ssize_t slabs_show(struct kmem_cache *s, char *buf) 4262static ssize_t slabs_show(struct kmem_cache *s, char *buf)
4022{ 4263{
@@ -4303,6 +4544,7 @@ static struct attribute *slab_attrs[] = {
4303 &reclaim_account_attr.attr, 4544 &reclaim_account_attr.attr,
4304 &destroy_by_rcu_attr.attr, 4545 &destroy_by_rcu_attr.attr,
4305 &shrink_attr.attr, 4546 &shrink_attr.attr,
4547 &reserved_attr.attr,
4306#ifdef CONFIG_SLUB_DEBUG 4548#ifdef CONFIG_SLUB_DEBUG
4307 &total_objects_attr.attr, 4549 &total_objects_attr.attr,
4308 &slabs_attr.attr, 4550 &slabs_attr.attr,
diff --git a/mm/sparse.c b/mm/sparse.c
index 93250207c5cf..aa64b12831a2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -500,7 +500,7 @@ void __init sparse_init(void)
500 * so alloc 2M (with 2M align) and 24 bytes in turn will 500 * so alloc 2M (with 2M align) and 24 bytes in turn will
501 * make next 2M slip to one more 2M later. 501 * make next 2M slip to one more 2M later.
502 * then in big system, the memory will have a lot of holes... 502 * then in big system, the memory will have a lot of holes...
503 * here try to allocate 2M pages continously. 503 * here try to allocate 2M pages continuously.
504 * 504 *
505 * powerpc need to call sparse_init_one_section right after each 505 * powerpc need to call sparse_init_one_section right after each
506 * sparse_early_mem_map_alloc, so allocate usemap_map at first. 506 * sparse_early_mem_map_alloc, so allocate usemap_map at first.
diff --git a/mm/swap.c b/mm/swap.c
index c02f93611a84..a448db377cb0 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -39,6 +39,7 @@ int page_cluster;
39 39
40static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); 40static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
41static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 41static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
42static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
42 43
43/* 44/*
44 * This path almost never happens for VM activity - pages are normally 45 * This path almost never happens for VM activity - pages are normally
@@ -178,15 +179,13 @@ void put_pages_list(struct list_head *pages)
178} 179}
179EXPORT_SYMBOL(put_pages_list); 180EXPORT_SYMBOL(put_pages_list);
180 181
181/* 182static void pagevec_lru_move_fn(struct pagevec *pvec,
182 * pagevec_move_tail() must be called with IRQ disabled. 183 void (*move_fn)(struct page *page, void *arg),
183 * Otherwise this may cause nasty races. 184 void *arg)
184 */
185static void pagevec_move_tail(struct pagevec *pvec)
186{ 185{
187 int i; 186 int i;
188 int pgmoved = 0;
189 struct zone *zone = NULL; 187 struct zone *zone = NULL;
188 unsigned long flags = 0;
190 189
191 for (i = 0; i < pagevec_count(pvec); i++) { 190 for (i = 0; i < pagevec_count(pvec); i++) {
192 struct page *page = pvec->pages[i]; 191 struct page *page = pvec->pages[i];
@@ -194,29 +193,50 @@ static void pagevec_move_tail(struct pagevec *pvec)
194 193
195 if (pagezone != zone) { 194 if (pagezone != zone) {
196 if (zone) 195 if (zone)
197 spin_unlock(&zone->lru_lock); 196 spin_unlock_irqrestore(&zone->lru_lock, flags);
198 zone = pagezone; 197 zone = pagezone;
199 spin_lock(&zone->lru_lock); 198 spin_lock_irqsave(&zone->lru_lock, flags);
200 }
201 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
202 int lru = page_lru_base_type(page);
203 list_move_tail(&page->lru, &zone->lru[lru].list);
204 pgmoved++;
205 } 199 }
200
201 (*move_fn)(page, arg);
206 } 202 }
207 if (zone) 203 if (zone)
208 spin_unlock(&zone->lru_lock); 204 spin_unlock_irqrestore(&zone->lru_lock, flags);
209 __count_vm_events(PGROTATED, pgmoved);
210 release_pages(pvec->pages, pvec->nr, pvec->cold); 205 release_pages(pvec->pages, pvec->nr, pvec->cold);
211 pagevec_reinit(pvec); 206 pagevec_reinit(pvec);
212} 207}
213 208
209static void pagevec_move_tail_fn(struct page *page, void *arg)
210{
211 int *pgmoved = arg;
212 struct zone *zone = page_zone(page);
213
214 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
215 enum lru_list lru = page_lru_base_type(page);
216 list_move_tail(&page->lru, &zone->lru[lru].list);
217 mem_cgroup_rotate_reclaimable_page(page);
218 (*pgmoved)++;
219 }
220}
221
222/*
223 * pagevec_move_tail() must be called with IRQ disabled.
224 * Otherwise this may cause nasty races.
225 */
226static void pagevec_move_tail(struct pagevec *pvec)
227{
228 int pgmoved = 0;
229
230 pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
231 __count_vm_events(PGROTATED, pgmoved);
232}
233
214/* 234/*
215 * Writeback is about to end against a page which has been marked for immediate 235 * Writeback is about to end against a page which has been marked for immediate
216 * reclaim. If it still appears to be reclaimable, move it to the tail of the 236 * reclaim. If it still appears to be reclaimable, move it to the tail of the
217 * inactive list. 237 * inactive list.
218 */ 238 */
219void rotate_reclaimable_page(struct page *page) 239void rotate_reclaimable_page(struct page *page)
220{ 240{
221 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && 241 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
222 !PageUnevictable(page) && PageLRU(page)) { 242 !PageUnevictable(page) && PageLRU(page)) {
@@ -347,6 +367,71 @@ void add_page_to_unevictable_list(struct page *page)
347} 367}
348 368
349/* 369/*
370 * If the page can not be invalidated, it is moved to the
371 * inactive list to speed up its reclaim. It is moved to the
372 * head of the list, rather than the tail, to give the flusher
373 * threads some time to write it out, as this is much more
374 * effective than the single-page writeout from reclaim.
375 *
376 * If the page isn't page_mapped and is dirty or under writeback, it
377 * can be reclaimed asap using PG_reclaim.
378 *
379 * 1. active, mapped page -> none
380 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
381 * 3. inactive, mapped page -> none
382 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
383 * 5. inactive, clean -> inactive, tail
384 * 6. Others -> none
385 *
386 * In case 4, the page is moved to the head of the inactive list because
387 * the VM expects it to be written out by flusher threads, as this is much
388 * more effective than the single-page writeout from reclaim.
389 */
390static void lru_deactivate_fn(struct page *page, void *arg)
391{
392 int lru, file;
393 bool active;
394 struct zone *zone = page_zone(page);
395
396 if (!PageLRU(page))
397 return;
398
399 /* Some processes are using the page */
400 if (page_mapped(page))
401 return;
402
403 active = PageActive(page);
404
405 file = page_is_file_cache(page);
406 lru = page_lru_base_type(page);
407 del_page_from_lru_list(zone, page, lru + active);
408 ClearPageActive(page);
409 ClearPageReferenced(page);
410 add_page_to_lru_list(zone, page, lru);
411
412 if (PageWriteback(page) || PageDirty(page)) {
413 /*
414 * Setting PG_reclaim can race with end_page_writeback,
415 * which can confuse readahead. But the race window
416 * is _really_ small and it's a non-critical problem.
417 */
418 SetPageReclaim(page);
419 } else {
420 /*
421 * The page's writeback ended while it sat in the pagevec.
422 * We move the page to the tail of the inactive list.
423 */
424 list_move_tail(&page->lru, &zone->lru[lru].list);
425 mem_cgroup_rotate_reclaimable_page(page);
426 __count_vm_event(PGROTATED);
427 }
428
429 if (active)
430 __count_vm_event(PGDEACTIVATE);
431 update_page_reclaim_stat(zone, page, file, 0);
432}
433
434/*
350 * Drain pages out of the cpu's pagevecs. 435 * Drain pages out of the cpu's pagevecs.
351 * Either "cpu" is the current CPU, and preemption has already been 436 * Either "cpu" is the current CPU, and preemption has already been
352 * disabled; or "cpu" is being hot-unplugged, and is already dead. 437 * disabled; or "cpu" is being hot-unplugged, and is already dead.
@@ -372,6 +457,29 @@ static void drain_cpu_pagevecs(int cpu)
372 pagevec_move_tail(pvec); 457 pagevec_move_tail(pvec);
373 local_irq_restore(flags); 458 local_irq_restore(flags);
374 } 459 }
460
461 pvec = &per_cpu(lru_deactivate_pvecs, cpu);
462 if (pagevec_count(pvec))
463 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
464}
465
466/**
467 * deactivate_page - forcefully deactivate a page
468 * @page: page to deactivate
469 *
470 * This function hints the VM that @page is a good reclaim candidate,
471 * for example if its invalidation fails due to the page being dirty
472 * or under writeback.
473 */
474void deactivate_page(struct page *page)
475{
476 if (likely(get_page_unless_zero(page))) {
477 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
478
479 if (!pagevec_add(pvec, page))
480 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
481 put_cpu_var(lru_deactivate_pvecs);
482 }
375} 483}
376 484
377void lru_add_drain(void) 485void lru_add_drain(void)
@@ -516,44 +624,33 @@ void lru_add_page_tail(struct zone* zone,
516 } 624 }
517} 625}
518 626
627static void ____pagevec_lru_add_fn(struct page *page, void *arg)
628{
629 enum lru_list lru = (enum lru_list)arg;
630 struct zone *zone = page_zone(page);
631 int file = is_file_lru(lru);
632 int active = is_active_lru(lru);
633
634 VM_BUG_ON(PageActive(page));
635 VM_BUG_ON(PageUnevictable(page));
636 VM_BUG_ON(PageLRU(page));
637
638 SetPageLRU(page);
639 if (active)
640 SetPageActive(page);
641 update_page_reclaim_stat(zone, page, file, active);
642 add_page_to_lru_list(zone, page, lru);
643}
644
519/* 645/*
520 * Add the passed pages to the LRU, then drop the caller's refcount 646 * Add the passed pages to the LRU, then drop the caller's refcount
521 * on them. Reinitialises the caller's pagevec. 647 * on them. Reinitialises the caller's pagevec.
522 */ 648 */
523void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) 649void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
524{ 650{
525 int i;
526 struct zone *zone = NULL;
527
528 VM_BUG_ON(is_unevictable_lru(lru)); 651 VM_BUG_ON(is_unevictable_lru(lru));
529 652
530 for (i = 0; i < pagevec_count(pvec); i++) { 653 pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru);
531 struct page *page = pvec->pages[i];
532 struct zone *pagezone = page_zone(page);
533 int file;
534 int active;
535
536 if (pagezone != zone) {
537 if (zone)
538 spin_unlock_irq(&zone->lru_lock);
539 zone = pagezone;
540 spin_lock_irq(&zone->lru_lock);
541 }
542 VM_BUG_ON(PageActive(page));
543 VM_BUG_ON(PageUnevictable(page));
544 VM_BUG_ON(PageLRU(page));
545 SetPageLRU(page);
546 active = is_active_lru(lru);
547 file = is_file_lru(lru);
548 if (active)
549 SetPageActive(page);
550 update_page_reclaim_stat(zone, page, file, active);
551 add_page_to_lru_list(zone, page, lru);
552 }
553 if (zone)
554 spin_unlock_irq(&zone->lru_lock);
555 release_pages(pvec->pages, pvec->nr, pvec->cold);
556 pagevec_reinit(pvec);
557} 654}
558 655
559EXPORT_SYMBOL(____pagevec_lru_add); 656EXPORT_SYMBOL(____pagevec_lru_add);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5c8cfabbc9bc..46680461785b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -24,12 +24,10 @@
24 24
25/* 25/*
26 * swapper_space is a fiction, retained to simplify the path through 26 * swapper_space is a fiction, retained to simplify the path through
27 * vmscan's shrink_page_list, to make sync_page look nicer, and to allow 27 * vmscan's shrink_page_list.
28 * future use of radix_tree tags in the swap cache.
29 */ 28 */
30static const struct address_space_operations swap_aops = { 29static const struct address_space_operations swap_aops = {
31 .writepage = swap_writepage, 30 .writepage = swap_writepage,
32 .sync_page = block_sync_page,
33 .set_page_dirty = __set_page_dirty_nobuffers, 31 .set_page_dirty = __set_page_dirty_nobuffers,
34 .migratepage = migrate_page, 32 .migratepage = migrate_page,
35}; 33};
@@ -37,7 +35,6 @@ static const struct address_space_operations swap_aops = {
37static struct backing_dev_info swap_backing_dev_info = { 35static struct backing_dev_info swap_backing_dev_info = {
38 .name = "swap", 36 .name = "swap",
39 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 37 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
40 .unplug_io_fn = swap_unplug_io_fn,
41}; 38};
42 39
43struct address_space swapper_space = { 40struct address_space swapper_space = {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 07a458d72fa8..8c6b3ce38f09 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -95,39 +95,6 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
95} 95}
96 96
97/* 97/*
98 * We need this because the bdev->unplug_fn can sleep and we cannot
99 * hold swap_lock while calling the unplug_fn. And swap_lock
100 * cannot be turned into a mutex.
101 */
102static DECLARE_RWSEM(swap_unplug_sem);
103
104void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
105{
106 swp_entry_t entry;
107
108 down_read(&swap_unplug_sem);
109 entry.val = page_private(page);
110 if (PageSwapCache(page)) {
111 struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
112 struct backing_dev_info *bdi;
113
114 /*
115 * If the page is removed from swapcache from under us (with a
116 * racy try_to_unuse/swapoff) we need an additional reference
117 * count to avoid reading garbage from page_private(page) above.
118 * If the WARN_ON triggers during a swapoff it maybe the race
119 * condition and it's harmless. However if it triggers without
120 * swapoff it signals a problem.
121 */
122 WARN_ON(page_count(page) <= 1);
123
124 bdi = bdev->bd_inode->i_mapping->backing_dev_info;
125 blk_run_backing_dev(bdi, page);
126 }
127 up_read(&swap_unplug_sem);
128}
129
130/*
131 * swapon tell device that all the old swap contents can be discarded, 98 * swapon tell device that all the old swap contents can be discarded,
132 * to allow the swap device to optimize its wear-levelling. 99 * to allow the swap device to optimize its wear-levelling.
133 */ 100 */
@@ -212,8 +179,8 @@ static int wait_for_discard(void *word)
212#define SWAPFILE_CLUSTER 256 179#define SWAPFILE_CLUSTER 256
213#define LATENCY_LIMIT 256 180#define LATENCY_LIMIT 256
214 181
215static inline unsigned long scan_swap_map(struct swap_info_struct *si, 182static unsigned long scan_swap_map(struct swap_info_struct *si,
216 unsigned char usage) 183 unsigned char usage)
217{ 184{
218 unsigned long offset; 185 unsigned long offset;
219 unsigned long scan_base; 186 unsigned long scan_base;
@@ -880,7 +847,7 @@ unsigned int count_swap_pages(int type, int free)
880static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 847static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
881 unsigned long addr, swp_entry_t entry, struct page *page) 848 unsigned long addr, swp_entry_t entry, struct page *page)
882{ 849{
883 struct mem_cgroup *ptr = NULL; 850 struct mem_cgroup *ptr;
884 spinlock_t *ptl; 851 spinlock_t *ptl;
885 pte_t *pte; 852 pte_t *pte;
886 int ret = 1; 853 int ret = 1;
@@ -1550,6 +1517,36 @@ bad_bmap:
1550 goto out; 1517 goto out;
1551} 1518}
1552 1519
1520static void enable_swap_info(struct swap_info_struct *p, int prio,
1521 unsigned char *swap_map)
1522{
1523 int i, prev;
1524
1525 spin_lock(&swap_lock);
1526 if (prio >= 0)
1527 p->prio = prio;
1528 else
1529 p->prio = --least_priority;
1530 p->swap_map = swap_map;
1531 p->flags |= SWP_WRITEOK;
1532 nr_swap_pages += p->pages;
1533 total_swap_pages += p->pages;
1534
1535 /* insert swap space into swap_list: */
1536 prev = -1;
1537 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1538 if (p->prio >= swap_info[i]->prio)
1539 break;
1540 prev = i;
1541 }
1542 p->next = i;
1543 if (prev < 0)
1544 swap_list.head = swap_list.next = p->type;
1545 else
1546 swap_info[prev]->next = p->type;
1547 spin_unlock(&swap_lock);
1548}
1549
1553SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 1550SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1554{ 1551{
1555 struct swap_info_struct *p = NULL; 1552 struct swap_info_struct *p = NULL;
@@ -1621,32 +1618,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1621 current->flags &= ~PF_OOM_ORIGIN; 1618 current->flags &= ~PF_OOM_ORIGIN;
1622 1619
1623 if (err) { 1620 if (err) {
1621 /*
1622 * reading p->prio and p->swap_map outside the lock is
1623 * safe here because only sys_swapon and sys_swapoff
1624 * change them, and there can be no other sys_swapon or
1625 * sys_swapoff for this swap_info_struct at this point.
1626 */
1624 /* re-insert swap space back into swap_list */ 1627 /* re-insert swap space back into swap_list */
1625 spin_lock(&swap_lock); 1628 enable_swap_info(p, p->prio, p->swap_map);
1626 if (p->prio < 0)
1627 p->prio = --least_priority;
1628 prev = -1;
1629 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1630 if (p->prio >= swap_info[i]->prio)
1631 break;
1632 prev = i;
1633 }
1634 p->next = i;
1635 if (prev < 0)
1636 swap_list.head = swap_list.next = type;
1637 else
1638 swap_info[prev]->next = type;
1639 nr_swap_pages += p->pages;
1640 total_swap_pages += p->pages;
1641 p->flags |= SWP_WRITEOK;
1642 spin_unlock(&swap_lock);
1643 goto out_dput; 1629 goto out_dput;
1644 } 1630 }
1645 1631
1646 /* wait for any unplug function to finish */
1647 down_write(&swap_unplug_sem);
1648 up_write(&swap_unplug_sem);
1649
1650 destroy_swap_extents(p); 1632 destroy_swap_extents(p);
1651 if (p->flags & SWP_CONTINUED) 1633 if (p->flags & SWP_CONTINUED)
1652 free_swap_count_continuations(p); 1634 free_swap_count_continuations(p);
@@ -1844,49 +1826,24 @@ static int __init max_swapfiles_check(void)
1844late_initcall(max_swapfiles_check); 1826late_initcall(max_swapfiles_check);
1845#endif 1827#endif
1846 1828
1847/* 1829static struct swap_info_struct *alloc_swap_info(void)
1848 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
1849 *
1850 * The swapon system call
1851 */
1852SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1853{ 1830{
1854 struct swap_info_struct *p; 1831 struct swap_info_struct *p;
1855 char *name = NULL;
1856 struct block_device *bdev = NULL;
1857 struct file *swap_file = NULL;
1858 struct address_space *mapping;
1859 unsigned int type; 1832 unsigned int type;
1860 int i, prev;
1861 int error;
1862 union swap_header *swap_header;
1863 unsigned int nr_good_pages;
1864 int nr_extents = 0;
1865 sector_t span;
1866 unsigned long maxpages;
1867 unsigned long swapfilepages;
1868 unsigned char *swap_map = NULL;
1869 struct page *page = NULL;
1870 struct inode *inode = NULL;
1871 int did_down = 0;
1872
1873 if (!capable(CAP_SYS_ADMIN))
1874 return -EPERM;
1875 1833
1876 p = kzalloc(sizeof(*p), GFP_KERNEL); 1834 p = kzalloc(sizeof(*p), GFP_KERNEL);
1877 if (!p) 1835 if (!p)
1878 return -ENOMEM; 1836 return ERR_PTR(-ENOMEM);
1879 1837
1880 spin_lock(&swap_lock); 1838 spin_lock(&swap_lock);
1881 for (type = 0; type < nr_swapfiles; type++) { 1839 for (type = 0; type < nr_swapfiles; type++) {
1882 if (!(swap_info[type]->flags & SWP_USED)) 1840 if (!(swap_info[type]->flags & SWP_USED))
1883 break; 1841 break;
1884 } 1842 }
1885 error = -EPERM;
1886 if (type >= MAX_SWAPFILES) { 1843 if (type >= MAX_SWAPFILES) {
1887 spin_unlock(&swap_lock); 1844 spin_unlock(&swap_lock);
1888 kfree(p); 1845 kfree(p);
1889 goto out; 1846 return ERR_PTR(-EPERM);
1890 } 1847 }
1891 if (type >= nr_swapfiles) { 1848 if (type >= nr_swapfiles) {
1892 p->type = type; 1849 p->type = type;
@@ -1911,81 +1868,49 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1911 p->next = -1; 1868 p->next = -1;
1912 spin_unlock(&swap_lock); 1869 spin_unlock(&swap_lock);
1913 1870
1914 name = getname(specialfile); 1871 return p;
1915 error = PTR_ERR(name); 1872}
1916 if (IS_ERR(name)) {
1917 name = NULL;
1918 goto bad_swap_2;
1919 }
1920 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
1921 error = PTR_ERR(swap_file);
1922 if (IS_ERR(swap_file)) {
1923 swap_file = NULL;
1924 goto bad_swap_2;
1925 }
1926
1927 p->swap_file = swap_file;
1928 mapping = swap_file->f_mapping;
1929 inode = mapping->host;
1930
1931 error = -EBUSY;
1932 for (i = 0; i < nr_swapfiles; i++) {
1933 struct swap_info_struct *q = swap_info[i];
1934 1873
1935 if (i == type || !q->swap_file) 1874static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
1936 continue; 1875{
1937 if (mapping == q->swap_file->f_mapping) 1876 int error;
1938 goto bad_swap;
1939 }
1940 1877
1941 error = -EINVAL;
1942 if (S_ISBLK(inode->i_mode)) { 1878 if (S_ISBLK(inode->i_mode)) {
1943 bdev = I_BDEV(inode); 1879 p->bdev = bdgrab(I_BDEV(inode));
1944 error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, 1880 error = blkdev_get(p->bdev,
1881 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1945 sys_swapon); 1882 sys_swapon);
1946 if (error < 0) { 1883 if (error < 0) {
1947 bdev = NULL; 1884 p->bdev = NULL;
1948 error = -EINVAL; 1885 return -EINVAL;
1949 goto bad_swap;
1950 } 1886 }
1951 p->old_block_size = block_size(bdev); 1887 p->old_block_size = block_size(p->bdev);
1952 error = set_blocksize(bdev, PAGE_SIZE); 1888 error = set_blocksize(p->bdev, PAGE_SIZE);
1953 if (error < 0) 1889 if (error < 0)
1954 goto bad_swap; 1890 return error;
1955 p->bdev = bdev;
1956 p->flags |= SWP_BLKDEV; 1891 p->flags |= SWP_BLKDEV;
1957 } else if (S_ISREG(inode->i_mode)) { 1892 } else if (S_ISREG(inode->i_mode)) {
1958 p->bdev = inode->i_sb->s_bdev; 1893 p->bdev = inode->i_sb->s_bdev;
1959 mutex_lock(&inode->i_mutex); 1894 mutex_lock(&inode->i_mutex);
1960 did_down = 1; 1895 if (IS_SWAPFILE(inode))
1961 if (IS_SWAPFILE(inode)) { 1896 return -EBUSY;
1962 error = -EBUSY; 1897 } else
1963 goto bad_swap; 1898 return -EINVAL;
1964 }
1965 } else {
1966 goto bad_swap;
1967 }
1968 1899
1969 swapfilepages = i_size_read(inode) >> PAGE_SHIFT; 1900 return 0;
1901}
1970 1902
1971 /* 1903static unsigned long read_swap_header(struct swap_info_struct *p,
1972 * Read the swap header. 1904 union swap_header *swap_header,
1973 */ 1905 struct inode *inode)
1974 if (!mapping->a_ops->readpage) { 1906{
1975 error = -EINVAL; 1907 int i;
1976 goto bad_swap; 1908 unsigned long maxpages;
1977 } 1909 unsigned long swapfilepages;
1978 page = read_mapping_page(mapping, 0, swap_file);
1979 if (IS_ERR(page)) {
1980 error = PTR_ERR(page);
1981 goto bad_swap;
1982 }
1983 swap_header = kmap(page);
1984 1910
1985 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { 1911 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1986 printk(KERN_ERR "Unable to find swap-space signature\n"); 1912 printk(KERN_ERR "Unable to find swap-space signature\n");
1987 error = -EINVAL; 1913 return 0;
1988 goto bad_swap;
1989 } 1914 }
1990 1915
1991 /* swap partition endianess hack... */ 1916 /* swap partition endianess hack... */
@@ -2001,8 +1926,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2001 printk(KERN_WARNING 1926 printk(KERN_WARNING
2002 "Unable to handle swap header version %d\n", 1927 "Unable to handle swap header version %d\n",
2003 swap_header->info.version); 1928 swap_header->info.version);
2004 error = -EINVAL; 1929 return 0;
2005 goto bad_swap;
2006 } 1930 }
2007 1931
2008 p->lowest_bit = 1; 1932 p->lowest_bit = 1;
@@ -2033,61 +1957,155 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2033 } 1957 }
2034 p->highest_bit = maxpages - 1; 1958 p->highest_bit = maxpages - 1;
2035 1959
2036 error = -EINVAL;
2037 if (!maxpages) 1960 if (!maxpages)
2038 goto bad_swap; 1961 return 0;
1962 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
2039 if (swapfilepages && maxpages > swapfilepages) { 1963 if (swapfilepages && maxpages > swapfilepages) {
2040 printk(KERN_WARNING 1964 printk(KERN_WARNING
2041 "Swap area shorter than signature indicates\n"); 1965 "Swap area shorter than signature indicates\n");
2042 goto bad_swap; 1966 return 0;
2043 } 1967 }
2044 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 1968 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
2045 goto bad_swap; 1969 return 0;
2046 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1970 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2047 goto bad_swap; 1971 return 0;
2048 1972
2049 /* OK, set up the swap map and apply the bad block list */ 1973 return maxpages;
2050 swap_map = vmalloc(maxpages); 1974}
2051 if (!swap_map) { 1975
2052 error = -ENOMEM; 1976static int setup_swap_map_and_extents(struct swap_info_struct *p,
2053 goto bad_swap; 1977 union swap_header *swap_header,
2054 } 1978 unsigned char *swap_map,
1979 unsigned long maxpages,
1980 sector_t *span)
1981{
1982 int i;
1983 unsigned int nr_good_pages;
1984 int nr_extents;
2055 1985
2056 memset(swap_map, 0, maxpages);
2057 nr_good_pages = maxpages - 1; /* omit header page */ 1986 nr_good_pages = maxpages - 1; /* omit header page */
2058 1987
2059 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1988 for (i = 0; i < swap_header->info.nr_badpages; i++) {
2060 unsigned int page_nr = swap_header->info.badpages[i]; 1989 unsigned int page_nr = swap_header->info.badpages[i];
2061 if (page_nr == 0 || page_nr > swap_header->info.last_page) { 1990 if (page_nr == 0 || page_nr > swap_header->info.last_page)
2062 error = -EINVAL; 1991 return -EINVAL;
2063 goto bad_swap;
2064 }
2065 if (page_nr < maxpages) { 1992 if (page_nr < maxpages) {
2066 swap_map[page_nr] = SWAP_MAP_BAD; 1993 swap_map[page_nr] = SWAP_MAP_BAD;
2067 nr_good_pages--; 1994 nr_good_pages--;
2068 } 1995 }
2069 } 1996 }
2070 1997
2071 error = swap_cgroup_swapon(type, maxpages);
2072 if (error)
2073 goto bad_swap;
2074
2075 if (nr_good_pages) { 1998 if (nr_good_pages) {
2076 swap_map[0] = SWAP_MAP_BAD; 1999 swap_map[0] = SWAP_MAP_BAD;
2077 p->max = maxpages; 2000 p->max = maxpages;
2078 p->pages = nr_good_pages; 2001 p->pages = nr_good_pages;
2079 nr_extents = setup_swap_extents(p, &span); 2002 nr_extents = setup_swap_extents(p, span);
2080 if (nr_extents < 0) { 2003 if (nr_extents < 0)
2081 error = nr_extents; 2004 return nr_extents;
2082 goto bad_swap;
2083 }
2084 nr_good_pages = p->pages; 2005 nr_good_pages = p->pages;
2085 } 2006 }
2086 if (!nr_good_pages) { 2007 if (!nr_good_pages) {
2087 printk(KERN_WARNING "Empty swap-file\n"); 2008 printk(KERN_WARNING "Empty swap-file\n");
2009 return -EINVAL;
2010 }
2011
2012 return nr_extents;
2013}
2014
2015SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2016{
2017 struct swap_info_struct *p;
2018 char *name;
2019 struct file *swap_file = NULL;
2020 struct address_space *mapping;
2021 int i;
2022 int prio;
2023 int error;
2024 union swap_header *swap_header;
2025 int nr_extents;
2026 sector_t span;
2027 unsigned long maxpages;
2028 unsigned char *swap_map = NULL;
2029 struct page *page = NULL;
2030 struct inode *inode = NULL;
2031
2032 if (!capable(CAP_SYS_ADMIN))
2033 return -EPERM;
2034
2035 p = alloc_swap_info();
2036 if (IS_ERR(p))
2037 return PTR_ERR(p);
2038
2039 name = getname(specialfile);
2040 if (IS_ERR(name)) {
2041 error = PTR_ERR(name);
2042 name = NULL;
2043 goto bad_swap;
2044 }
2045 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
2046 if (IS_ERR(swap_file)) {
2047 error = PTR_ERR(swap_file);
2048 swap_file = NULL;
2049 goto bad_swap;
2050 }
2051
2052 p->swap_file = swap_file;
2053 mapping = swap_file->f_mapping;
2054
2055 for (i = 0; i < nr_swapfiles; i++) {
2056 struct swap_info_struct *q = swap_info[i];
2057
2058 if (q == p || !q->swap_file)
2059 continue;
2060 if (mapping == q->swap_file->f_mapping) {
2061 error = -EBUSY;
2062 goto bad_swap;
2063 }
2064 }
2065
2066 inode = mapping->host;
2067 /* If S_ISREG(inode->i_mode), claim_swapfile() will do mutex_lock(&inode->i_mutex); */
2068 error = claim_swapfile(p, inode);
2069 if (unlikely(error))
2070 goto bad_swap;
2071
2072 /*
2073 * Read the swap header.
2074 */
2075 if (!mapping->a_ops->readpage) {
2088 error = -EINVAL; 2076 error = -EINVAL;
2089 goto bad_swap; 2077 goto bad_swap;
2090 } 2078 }
2079 page = read_mapping_page(mapping, 0, swap_file);
2080 if (IS_ERR(page)) {
2081 error = PTR_ERR(page);
2082 goto bad_swap;
2083 }
2084 swap_header = kmap(page);
2085
2086 maxpages = read_swap_header(p, swap_header, inode);
2087 if (unlikely(!maxpages)) {
2088 error = -EINVAL;
2089 goto bad_swap;
2090 }
2091
2092 /* OK, set up the swap map and apply the bad block list */
2093 swap_map = vzalloc(maxpages);
2094 if (!swap_map) {
2095 error = -ENOMEM;
2096 goto bad_swap;
2097 }
2098
2099 error = swap_cgroup_swapon(p->type, maxpages);
2100 if (error)
2101 goto bad_swap;
2102
2103 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2104 maxpages, &span);
2105 if (unlikely(nr_extents < 0)) {
2106 error = nr_extents;
2107 goto bad_swap;
2108 }
2091 2109
2092 if (p->bdev) { 2110 if (p->bdev) {
2093 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2111 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
@@ -2099,58 +2117,46 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2099 } 2117 }
2100 2118
2101 mutex_lock(&swapon_mutex); 2119 mutex_lock(&swapon_mutex);
2102 spin_lock(&swap_lock); 2120 prio = -1;
2103 if (swap_flags & SWAP_FLAG_PREFER) 2121 if (swap_flags & SWAP_FLAG_PREFER)
2104 p->prio = 2122 prio =
2105 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2123 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2106 else 2124 enable_swap_info(p, prio, swap_map);
2107 p->prio = --least_priority;
2108 p->swap_map = swap_map;
2109 p->flags |= SWP_WRITEOK;
2110 nr_swap_pages += nr_good_pages;
2111 total_swap_pages += nr_good_pages;
2112 2125
2113 printk(KERN_INFO "Adding %uk swap on %s. " 2126 printk(KERN_INFO "Adding %uk swap on %s. "
2114 "Priority:%d extents:%d across:%lluk %s%s\n", 2127 "Priority:%d extents:%d across:%lluk %s%s\n",
2115 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, 2128 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2116 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2129 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2117 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2130 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2118 (p->flags & SWP_DISCARDABLE) ? "D" : ""); 2131 (p->flags & SWP_DISCARDABLE) ? "D" : "");
2119 2132
2120 /* insert swap space into swap_list: */
2121 prev = -1;
2122 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
2123 if (p->prio >= swap_info[i]->prio)
2124 break;
2125 prev = i;
2126 }
2127 p->next = i;
2128 if (prev < 0)
2129 swap_list.head = swap_list.next = type;
2130 else
2131 swap_info[prev]->next = type;
2132 spin_unlock(&swap_lock);
2133 mutex_unlock(&swapon_mutex); 2133 mutex_unlock(&swapon_mutex);
2134 atomic_inc(&proc_poll_event); 2134 atomic_inc(&proc_poll_event);
2135 wake_up_interruptible(&proc_poll_wait); 2135 wake_up_interruptible(&proc_poll_wait);
2136 2136
2137 if (S_ISREG(inode->i_mode))
2138 inode->i_flags |= S_SWAPFILE;
2137 error = 0; 2139 error = 0;
2138 goto out; 2140 goto out;
2139bad_swap: 2141bad_swap:
2140 if (bdev) { 2142 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2141 set_blocksize(bdev, p->old_block_size); 2143 set_blocksize(p->bdev, p->old_block_size);
2142 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 2144 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2143 } 2145 }
2144 destroy_swap_extents(p); 2146 destroy_swap_extents(p);
2145 swap_cgroup_swapoff(type); 2147 swap_cgroup_swapoff(p->type);
2146bad_swap_2:
2147 spin_lock(&swap_lock); 2148 spin_lock(&swap_lock);
2148 p->swap_file = NULL; 2149 p->swap_file = NULL;
2149 p->flags = 0; 2150 p->flags = 0;
2150 spin_unlock(&swap_lock); 2151 spin_unlock(&swap_lock);
2151 vfree(swap_map); 2152 vfree(swap_map);
2152 if (swap_file) 2153 if (swap_file) {
2154 if (inode && S_ISREG(inode->i_mode)) {
2155 mutex_unlock(&inode->i_mutex);
2156 inode = NULL;
2157 }
2153 filp_close(swap_file, NULL); 2158 filp_close(swap_file, NULL);
2159 }
2154out: 2160out:
2155 if (page && !IS_ERR(page)) { 2161 if (page && !IS_ERR(page)) {
2156 kunmap(page); 2162 kunmap(page);
@@ -2158,11 +2164,8 @@ out:
2158 } 2164 }
2159 if (name) 2165 if (name)
2160 putname(name); 2166 putname(name);
2161 if (did_down) { 2167 if (inode && S_ISREG(inode->i_mode))
2162 if (!error)
2163 inode->i_flags |= S_SWAPFILE;
2164 mutex_unlock(&inode->i_mutex); 2168 mutex_unlock(&inode->i_mutex);
2165 }
2166 return error; 2169 return error;
2167} 2170}
2168 2171
diff --git a/mm/truncate.c b/mm/truncate.c
index 49feb46e77b8..a95667529135 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -106,9 +106,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
106 cancel_dirty_page(page, PAGE_CACHE_SIZE); 106 cancel_dirty_page(page, PAGE_CACHE_SIZE);
107 107
108 clear_page_mlock(page); 108 clear_page_mlock(page);
109 remove_from_page_cache(page);
110 ClearPageMappedToDisk(page); 109 ClearPageMappedToDisk(page);
111 page_cache_release(page); /* pagecache ref */ 110 delete_from_page_cache(page);
112 return 0; 111 return 0;
113} 112}
114 113
@@ -225,6 +224,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
225 next = start; 224 next = start;
226 while (next <= end && 225 while (next <= end &&
227 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 226 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
227 mem_cgroup_uncharge_start();
228 for (i = 0; i < pagevec_count(&pvec); i++) { 228 for (i = 0; i < pagevec_count(&pvec); i++) {
229 struct page *page = pvec.pages[i]; 229 struct page *page = pvec.pages[i];
230 pgoff_t page_index = page->index; 230 pgoff_t page_index = page->index;
@@ -247,6 +247,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
247 unlock_page(page); 247 unlock_page(page);
248 } 248 }
249 pagevec_release(&pvec); 249 pagevec_release(&pvec);
250 mem_cgroup_uncharge_end();
250 cond_resched(); 251 cond_resched();
251 } 252 }
252 253
@@ -320,11 +321,12 @@ EXPORT_SYMBOL(truncate_inode_pages);
320 * pagetables. 321 * pagetables.
321 */ 322 */
322unsigned long invalidate_mapping_pages(struct address_space *mapping, 323unsigned long invalidate_mapping_pages(struct address_space *mapping,
323 pgoff_t start, pgoff_t end) 324 pgoff_t start, pgoff_t end)
324{ 325{
325 struct pagevec pvec; 326 struct pagevec pvec;
326 pgoff_t next = start; 327 pgoff_t next = start;
327 unsigned long ret = 0; 328 unsigned long ret;
329 unsigned long count = 0;
328 int i; 330 int i;
329 331
330 pagevec_init(&pvec, 0); 332 pagevec_init(&pvec, 0);
@@ -351,9 +353,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
351 if (lock_failed) 353 if (lock_failed)
352 continue; 354 continue;
353 355
354 ret += invalidate_inode_page(page); 356 ret = invalidate_inode_page(page);
355
356 unlock_page(page); 357 unlock_page(page);
358 /*
359 * Invalidation is a hint that the page is no longer
361 * of interest, so try to speed up its reclaim.
361 */
362 if (!ret)
363 deactivate_page(page);
364 count += ret;
357 if (next > end) 365 if (next > end)
358 break; 366 break;
359 } 367 }
@@ -361,7 +369,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
361 mem_cgroup_uncharge_end(); 369 mem_cgroup_uncharge_end();
362 cond_resched(); 370 cond_resched();
363 } 371 }
364 return ret; 372 return count;
365} 373}
366EXPORT_SYMBOL(invalidate_mapping_pages); 374EXPORT_SYMBOL(invalidate_mapping_pages);
367 375
@@ -387,7 +395,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
387 395
388 clear_page_mlock(page); 396 clear_page_mlock(page);
389 BUG_ON(page_has_private(page)); 397 BUG_ON(page_has_private(page));
390 __remove_from_page_cache(page); 398 __delete_from_page_cache(page);
391 spin_unlock_irq(&mapping->tree_lock); 399 spin_unlock_irq(&mapping->tree_lock);
392 mem_cgroup_uncharge_cache_page(page); 400 mem_cgroup_uncharge_cache_page(page);
393 401
diff --git a/mm/util.c b/mm/util.c
index f126975ef23e..e7b103a6fd21 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -227,7 +227,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
227/* 227/*
228 * Like get_user_pages_fast() except its IRQ-safe in that it won't fall 228 * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
229 * back to the regular GUP. 229 * back to the regular GUP.
230 * If the architecture not support this fucntion, simply return with no 230 * If the architecture not support this function, simply return with no
231 * page pinned 231 * page pinned
232 */ 232 */
233int __attribute__((weak)) __get_user_pages_fast(unsigned long start, 233int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f9b166732e70..5d6030235d7a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -261,8 +261,15 @@ struct vmap_area {
261}; 261};
262 262
263static DEFINE_SPINLOCK(vmap_area_lock); 263static DEFINE_SPINLOCK(vmap_area_lock);
264static struct rb_root vmap_area_root = RB_ROOT;
265static LIST_HEAD(vmap_area_list); 264static LIST_HEAD(vmap_area_list);
265static struct rb_root vmap_area_root = RB_ROOT;
266
267/* The vmap cache globals are protected by vmap_area_lock */
268static struct rb_node *free_vmap_cache;
269static unsigned long cached_hole_size;
270static unsigned long cached_vstart;
271static unsigned long cached_align;
272
266static unsigned long vmap_area_pcpu_hole; 273static unsigned long vmap_area_pcpu_hole;
267 274
268static struct vmap_area *__find_vmap_area(unsigned long addr) 275static struct vmap_area *__find_vmap_area(unsigned long addr)
@@ -331,9 +338,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
331 struct rb_node *n; 338 struct rb_node *n;
332 unsigned long addr; 339 unsigned long addr;
333 int purged = 0; 340 int purged = 0;
341 struct vmap_area *first;
334 342
335 BUG_ON(!size); 343 BUG_ON(!size);
336 BUG_ON(size & ~PAGE_MASK); 344 BUG_ON(size & ~PAGE_MASK);
345 BUG_ON(!is_power_of_2(align));
337 346
338 va = kmalloc_node(sizeof(struct vmap_area), 347 va = kmalloc_node(sizeof(struct vmap_area),
339 gfp_mask & GFP_RECLAIM_MASK, node); 348 gfp_mask & GFP_RECLAIM_MASK, node);
@@ -341,79 +350,106 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
341 return ERR_PTR(-ENOMEM); 350 return ERR_PTR(-ENOMEM);
342 351
343retry: 352retry:
344 addr = ALIGN(vstart, align);
345
346 spin_lock(&vmap_area_lock); 353 spin_lock(&vmap_area_lock);
347 if (addr + size - 1 < addr) 354 /*
348 goto overflow; 355 * Invalidate cache if we have more permissive parameters.
356 * cached_hole_size notes the largest hole noticed _below_
357 * the vmap_area cached in free_vmap_cache: if size fits
358 * into that hole, we want to scan from vstart to reuse
359 * the hole instead of allocating above free_vmap_cache.
360 * Note that __free_vmap_area may update free_vmap_cache
361 * without updating cached_hole_size or cached_align.
362 */
363 if (!free_vmap_cache ||
364 size < cached_hole_size ||
365 vstart < cached_vstart ||
366 align < cached_align) {
367nocache:
368 cached_hole_size = 0;
369 free_vmap_cache = NULL;
370 }
371 /* record if we encounter less permissive parameters */
372 cached_vstart = vstart;
373 cached_align = align;
374
375 /* find starting point for our search */
376 if (free_vmap_cache) {
377 first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
378 addr = ALIGN(first->va_end + PAGE_SIZE, align);
379 if (addr < vstart)
380 goto nocache;
381 if (addr + size - 1 < addr)
382 goto overflow;
383
384 } else {
385 addr = ALIGN(vstart, align);
386 if (addr + size - 1 < addr)
387 goto overflow;
349 388
350 /* XXX: could have a last_hole cache */ 389 n = vmap_area_root.rb_node;
351 n = vmap_area_root.rb_node; 390 first = NULL;
352 if (n) {
353 struct vmap_area *first = NULL;
354 391
355 do { 392 while (n) {
356 struct vmap_area *tmp; 393 struct vmap_area *tmp;
357 tmp = rb_entry(n, struct vmap_area, rb_node); 394 tmp = rb_entry(n, struct vmap_area, rb_node);
358 if (tmp->va_end >= addr) { 395 if (tmp->va_end >= addr) {
359 if (!first && tmp->va_start < addr + size)
360 first = tmp;
361 n = n->rb_left;
362 } else {
363 first = tmp; 396 first = tmp;
397 if (tmp->va_start <= addr)
398 break;
399 n = n->rb_left;
400 } else
364 n = n->rb_right; 401 n = n->rb_right;
365 } 402 }
366 } while (n);
367 403
368 if (!first) 404 if (!first)
369 goto found; 405 goto found;
370
371 if (first->va_end < addr) {
372 n = rb_next(&first->rb_node);
373 if (n)
374 first = rb_entry(n, struct vmap_area, rb_node);
375 else
376 goto found;
377 }
378
379 while (addr + size > first->va_start && addr + size <= vend) {
380 addr = ALIGN(first->va_end + PAGE_SIZE, align);
381 if (addr + size - 1 < addr)
382 goto overflow;
383
384 n = rb_next(&first->rb_node);
385 if (n)
386 first = rb_entry(n, struct vmap_area, rb_node);
387 else
388 goto found;
389 }
390 } 406 }
391found: 407
392 if (addr + size > vend) { 408 /* from the starting point, walk areas until a suitable hole is found */
393overflow: 409 while (addr + size >= first->va_start && addr + size <= vend) {
394 spin_unlock(&vmap_area_lock); 410 if (addr + cached_hole_size < first->va_start)
395 if (!purged) { 411 cached_hole_size = first->va_start - addr;
396 purge_vmap_area_lazy(); 412 addr = ALIGN(first->va_end + PAGE_SIZE, align);
397 purged = 1; 413 if (addr + size - 1 < addr)
398 goto retry; 414 goto overflow;
399 } 415
400 if (printk_ratelimit()) 416 n = rb_next(&first->rb_node);
401 printk(KERN_WARNING 417 if (n)
402 "vmap allocation for size %lu failed: " 418 first = rb_entry(n, struct vmap_area, rb_node);
403 "use vmalloc=<size> to increase size.\n", size); 419 else
404 kfree(va); 420 goto found;
405 return ERR_PTR(-EBUSY);
406 } 421 }
407 422
408 BUG_ON(addr & (align-1)); 423found:
424 if (addr + size > vend)
425 goto overflow;
409 426
410 va->va_start = addr; 427 va->va_start = addr;
411 va->va_end = addr + size; 428 va->va_end = addr + size;
412 va->flags = 0; 429 va->flags = 0;
413 __insert_vmap_area(va); 430 __insert_vmap_area(va);
431 free_vmap_cache = &va->rb_node;
414 spin_unlock(&vmap_area_lock); 432 spin_unlock(&vmap_area_lock);
415 433
434 BUG_ON(va->va_start & (align-1));
435 BUG_ON(va->va_start < vstart);
436 BUG_ON(va->va_end > vend);
437
416 return va; 438 return va;
439
440overflow:
441 spin_unlock(&vmap_area_lock);
442 if (!purged) {
443 purge_vmap_area_lazy();
444 purged = 1;
445 goto retry;
446 }
447 if (printk_ratelimit())
448 printk(KERN_WARNING
449 "vmap allocation for size %lu failed: "
450 "use vmalloc=<size> to increase size.\n", size);
451 kfree(va);
452 return ERR_PTR(-EBUSY);
417} 453}
418 454
419static void rcu_free_va(struct rcu_head *head) 455static void rcu_free_va(struct rcu_head *head)
@@ -426,6 +462,22 @@ static void rcu_free_va(struct rcu_head *head)
426static void __free_vmap_area(struct vmap_area *va) 462static void __free_vmap_area(struct vmap_area *va)
427{ 463{
428 BUG_ON(RB_EMPTY_NODE(&va->rb_node)); 464 BUG_ON(RB_EMPTY_NODE(&va->rb_node));
465
466 if (free_vmap_cache) {
467 if (va->va_end < cached_vstart) {
468 free_vmap_cache = NULL;
469 } else {
470 struct vmap_area *cache;
471 cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
472 if (va->va_start <= cache->va_start) {
473 free_vmap_cache = rb_prev(&va->rb_node);
474 /*
475 * We don't try to update cached_hole_size or
476 * cached_align, but it won't go very wrong.
477 */
478 }
479 }
480 }
429 rb_erase(&va->rb_node, &vmap_area_root); 481 rb_erase(&va->rb_node, &vmap_area_root);
430 RB_CLEAR_NODE(&va->rb_node); 482 RB_CLEAR_NODE(&va->rb_node);
431 list_del_rcu(&va->list); 483 list_del_rcu(&va->list);
@@ -1951,8 +2003,6 @@ finished:
1951 * should know vmalloc() area is valid and can use memcpy(). 2003 * should know vmalloc() area is valid and can use memcpy().
1952 * This is for routines which have to access vmalloc area without 2004 * This is for routines which have to access vmalloc area without
1953 * any informaion, as /dev/kmem. 2005 * any informaion, as /dev/kmem.
1954 *
1955 * The caller should guarantee KM_USER1 is not used.
1956 */ 2006 */
1957 2007
1958long vwrite(char *buf, char *addr, unsigned long count) 2008long vwrite(char *buf, char *addr, unsigned long count)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 148c6e630df2..f6b435c80079 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -41,6 +41,7 @@
41#include <linux/memcontrol.h> 41#include <linux/memcontrol.h>
42#include <linux/delayacct.h> 42#include <linux/delayacct.h>
43#include <linux/sysctl.h> 43#include <linux/sysctl.h>
44#include <linux/oom.h>
44 45
45#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
46#include <asm/div64.h> 47#include <asm/div64.h>
@@ -358,7 +359,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
358static void handle_write_error(struct address_space *mapping, 359static void handle_write_error(struct address_space *mapping,
359 struct page *page, int error) 360 struct page *page, int error)
360{ 361{
361 lock_page_nosync(page); 362 lock_page(page);
362 if (page_mapping(page) == mapping) 363 if (page_mapping(page) == mapping)
363 mapping_set_error(mapping, error); 364 mapping_set_error(mapping, error);
364 unlock_page(page); 365 unlock_page(page);
@@ -514,7 +515,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
514 515
515 freepage = mapping->a_ops->freepage; 516 freepage = mapping->a_ops->freepage;
516 517
517 __remove_from_page_cache(page); 518 __delete_from_page_cache(page);
518 spin_unlock_irq(&mapping->tree_lock); 519 spin_unlock_irq(&mapping->tree_lock);
519 mem_cgroup_uncharge_cache_page(page); 520 mem_cgroup_uncharge_cache_page(page);
520 521
@@ -1065,7 +1066,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1065 * surrounding the tag page. Only take those pages of 1066 * surrounding the tag page. Only take those pages of
1066 * the same active state as that tag page. We may safely 1067 * the same active state as that tag page. We may safely
1067 * round the target page pfn down to the requested order 1068 * round the target page pfn down to the requested order
1068 * as the mem_map is guarenteed valid out to MAX_ORDER, 1069 * as the mem_map is guaranteed valid out to MAX_ORDER,
1069 * where that page is in a different zone we will detect 1070 * where that page is in a different zone we will detect
1070 * it from its zone id and abort this block scan. 1071 * it from its zone id and abort this block scan.
1071 */ 1072 */
@@ -1841,16 +1842,28 @@ static inline bool should_continue_reclaim(struct zone *zone,
1841 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) 1842 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
1842 return false; 1843 return false;
1843 1844
1844 /* 1845 /* Consider stopping depending on scan and reclaim activity */
1845 * If we failed to reclaim and have scanned the full list, stop. 1846 if (sc->gfp_mask & __GFP_REPEAT) {
1846 * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far 1847 /*
1847 * faster but obviously would be less likely to succeed 1848 * For __GFP_REPEAT allocations, stop reclaiming if the
1848 * allocation. If this is desirable, use GFP_REPEAT to decide 1849 * full LRU list has been scanned and we are still failing
1849 * if both reclaimed and scanned should be checked or just 1850 * to reclaim pages. This full LRU scan is potentially
1850 * reclaimed 1851 * expensive but a __GFP_REPEAT caller really wants to succeed
1851 */ 1852 */
1852 if (!nr_reclaimed && !nr_scanned) 1853 if (!nr_reclaimed && !nr_scanned)
1853 return false; 1854 return false;
1855 } else {
1856 /*
1857 * For non-__GFP_REPEAT allocations which can presumably
1858 * fail without consequence, stop if we failed to reclaim
1859 * any pages from the last SWAP_CLUSTER_MAX number of
1860 * pages that were scanned. This will return to the
1861 * caller faster at the risk reclaim/compaction and
1862 * the resulting allocation attempt fails
1863 */
1864 if (!nr_reclaimed)
1865 return false;
1866 }
1854 1867
1855 /* 1868 /*
1856 * If we have not reclaimed enough pages for compaction and the 1869 * If we have not reclaimed enough pages for compaction and the
@@ -1882,12 +1895,12 @@ static void shrink_zone(int priority, struct zone *zone,
1882 unsigned long nr[NR_LRU_LISTS]; 1895 unsigned long nr[NR_LRU_LISTS];
1883 unsigned long nr_to_scan; 1896 unsigned long nr_to_scan;
1884 enum lru_list l; 1897 enum lru_list l;
1885 unsigned long nr_reclaimed; 1898 unsigned long nr_reclaimed, nr_scanned;
1886 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1899 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1887 unsigned long nr_scanned = sc->nr_scanned;
1888 1900
1889restart: 1901restart:
1890 nr_reclaimed = 0; 1902 nr_reclaimed = 0;
1903 nr_scanned = sc->nr_scanned;
1891 get_scan_count(zone, sc, nr, priority); 1904 get_scan_count(zone, sc, nr, priority);
1892 1905
1893 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1906 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1976,17 +1989,12 @@ static bool zone_reclaimable(struct zone *zone)
1976 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; 1989 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
1977} 1990}
1978 1991
1979/* 1992/* All zones in zonelist are unreclaimable? */
1980 * As hibernation is going on, kswapd is freezed so that it can't mark
1981 * the zone into all_unreclaimable. It can't handle OOM during hibernation.
1982 * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
1983 */
1984static bool all_unreclaimable(struct zonelist *zonelist, 1993static bool all_unreclaimable(struct zonelist *zonelist,
1985 struct scan_control *sc) 1994 struct scan_control *sc)
1986{ 1995{
1987 struct zoneref *z; 1996 struct zoneref *z;
1988 struct zone *zone; 1997 struct zone *zone;
1989 bool all_unreclaimable = true;
1990 1998
1991 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1999 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1992 gfp_zone(sc->gfp_mask), sc->nodemask) { 2000 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1994,13 +2002,11 @@ static bool all_unreclaimable(struct zonelist *zonelist,
1994 continue; 2002 continue;
1995 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2003 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1996 continue; 2004 continue;
1997 if (zone_reclaimable(zone)) { 2005 if (!zone->all_unreclaimable)
1998 all_unreclaimable = false; 2006 return false;
1999 break;
2000 }
2001 } 2007 }
2002 2008
2003 return all_unreclaimable; 2009 return true;
2004} 2010}
2005 2011
2006/* 2012/*
@@ -2096,6 +2102,14 @@ out:
2096 if (sc->nr_reclaimed) 2102 if (sc->nr_reclaimed)
2097 return sc->nr_reclaimed; 2103 return sc->nr_reclaimed;
2098 2104
2105 /*
2106 * As hibernation is going on, kswapd is freezed so that it can't mark
2107 * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
2108 * check.
2109 */
2110 if (oom_killer_disabled)
2111 return 0;
2112
2099 /* top priority shrink_zones still had more to do? don't OOM, then */ 2113 /* top priority shrink_zones still had more to do? don't OOM, then */
2100 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) 2114 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
2101 return 1; 2115 return 1;
@@ -2212,7 +2226,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2212 * o a 16M DMA zone that is balanced will not balance a zone on any 2226 * o a 16M DMA zone that is balanced will not balance a zone on any
2213 * reasonable sized machine 2227 * reasonable sized machine
2214 * o On all other machines, the top zone must be at least a reasonable 2228 * o On all other machines, the top zone must be at least a reasonable
2215 * precentage of the middle zones. For example, on 32-bit x86, highmem 2229 * percentage of the middle zones. For example, on 32-bit x86, highmem
2216 * would need to be at least 256M for it to be balance a whole node. 2230 * would need to be at least 256M for it to be balance a whole node.
2217 * Similarly, on x86-64 the Normal zone would need to be at least 1G 2231 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2218 * to balance a node on its own. These seemed like reasonable ratios. 2232 * to balance a node on its own. These seemed like reasonable ratios.
@@ -2385,9 +2399,9 @@ loop_again:
2385 * cause too much scanning of the lower zones. 2399 * cause too much scanning of the lower zones.
2386 */ 2400 */
2387 for (i = 0; i <= end_zone; i++) { 2401 for (i = 0; i <= end_zone; i++) {
2388 int compaction;
2389 struct zone *zone = pgdat->node_zones + i; 2402 struct zone *zone = pgdat->node_zones + i;
2390 int nr_slab; 2403 int nr_slab;
2404 unsigned long balance_gap;
2391 2405
2392 if (!populated_zone(zone)) 2406 if (!populated_zone(zone))
2393 continue; 2407 continue;
@@ -2404,11 +2418,20 @@ loop_again:
2404 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); 2418 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
2405 2419
2406 /* 2420 /*
2407 * We put equal pressure on every zone, unless one 2421 * We put equal pressure on every zone, unless
2408 * zone has way too many pages free already. 2422 * one zone has way too many pages free
2423 * already. The "too many pages" is defined
2424 * as the high wmark plus a "gap" where the
2425 * gap is either the low watermark or 1%
2426 * of the zone, whichever is smaller.
2409 */ 2427 */
2428 balance_gap = min(low_wmark_pages(zone),
2429 (zone->present_pages +
2430 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2431 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2410 if (!zone_watermark_ok_safe(zone, order, 2432 if (!zone_watermark_ok_safe(zone, order,
2411 8*high_wmark_pages(zone), end_zone, 0)) 2433 high_wmark_pages(zone) + balance_gap,
2434 end_zone, 0))
2412 shrink_zone(priority, zone, &sc); 2435 shrink_zone(priority, zone, &sc);
2413 reclaim_state->reclaimed_slab = 0; 2436 reclaim_state->reclaimed_slab = 0;
2414 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 2437 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -2416,24 +2439,9 @@ loop_again:
2416 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2439 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2417 total_scanned += sc.nr_scanned; 2440 total_scanned += sc.nr_scanned;
2418 2441
2419 compaction = 0;
2420 if (order &&
2421 zone_watermark_ok(zone, 0,
2422 high_wmark_pages(zone),
2423 end_zone, 0) &&
2424 !zone_watermark_ok(zone, order,
2425 high_wmark_pages(zone),
2426 end_zone, 0)) {
2427 compact_zone_order(zone,
2428 order,
2429 sc.gfp_mask, false,
2430 COMPACT_MODE_KSWAPD);
2431 compaction = 1;
2432 }
2433
2434 if (zone->all_unreclaimable) 2442 if (zone->all_unreclaimable)
2435 continue; 2443 continue;
2436 if (!compaction && nr_slab == 0 && 2444 if (nr_slab == 0 &&
2437 !zone_reclaimable(zone)) 2445 !zone_reclaimable(zone))
2438 zone->all_unreclaimable = 1; 2446 zone->all_unreclaimable = 1;
2439 /* 2447 /*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0c3b5048773e..897ea9e88238 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -321,9 +321,12 @@ static inline void mod_state(struct zone *zone,
321 /* 321 /*
322 * The fetching of the stat_threshold is racy. We may apply 322 * The fetching of the stat_threshold is racy. We may apply
323 * a counter threshold to the wrong the cpu if we get 323 * a counter threshold to the wrong the cpu if we get
324 * rescheduled while executing here. However, the following 324 * rescheduled while executing here. However, the next
325 * will apply the threshold again and therefore bring the 325 * counter update will apply the threshold again and
326 * counter under the threshold. 326 * therefore bring the counter under the threshold again.
327 *
328 * Most of the time the thresholds are the same anyways
329 * for all cpus in a zone.
327 */ 330 */
328 t = this_cpu_read(pcp->stat_threshold); 331 t = this_cpu_read(pcp->stat_threshold);
329 332
@@ -500,8 +503,12 @@ void refresh_cpu_vm_stats(int cpu)
500 * z = the zone from which the allocation occurred. 503 * z = the zone from which the allocation occurred.
501 * 504 *
502 * Must be called with interrupts disabled. 505 * Must be called with interrupts disabled.
506 *
507 * When __GFP_OTHER_NODE is set assume the node of the preferred
508 * zone is the local node. This is useful for daemons who allocate
509 * memory on behalf of other processes.
503 */ 510 */
504void zone_statistics(struct zone *preferred_zone, struct zone *z) 511void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
505{ 512{
506 if (z->zone_pgdat == preferred_zone->zone_pgdat) { 513 if (z->zone_pgdat == preferred_zone->zone_pgdat) {
507 __inc_zone_state(z, NUMA_HIT); 514 __inc_zone_state(z, NUMA_HIT);
@@ -509,7 +516,8 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
509 __inc_zone_state(z, NUMA_MISS); 516 __inc_zone_state(z, NUMA_MISS);
510 __inc_zone_state(preferred_zone, NUMA_FOREIGN); 517 __inc_zone_state(preferred_zone, NUMA_FOREIGN);
511 } 518 }
512 if (z->node == numa_node_id()) 519 if (z->node == ((flags & __GFP_OTHER_NODE) ?
520 preferred_zone->node : numa_node_id()))
513 __inc_zone_state(z, NUMA_LOCAL); 521 __inc_zone_state(z, NUMA_LOCAL);
514 else 522 else
515 __inc_zone_state(z, NUMA_OTHER); 523 __inc_zone_state(z, NUMA_OTHER);
@@ -940,7 +948,16 @@ static const char * const vmstat_text[] = {
940 "unevictable_pgs_cleared", 948 "unevictable_pgs_cleared",
941 "unevictable_pgs_stranded", 949 "unevictable_pgs_stranded",
942 "unevictable_pgs_mlockfreed", 950 "unevictable_pgs_mlockfreed",
951
952#ifdef CONFIG_TRANSPARENT_HUGEPAGE
953 "thp_fault_alloc",
954 "thp_fault_fallback",
955 "thp_collapse_alloc",
956 "thp_collapse_alloc_failed",
957 "thp_split",
943#endif 958#endif
959
960#endif /* CONFIG_VM_EVENTS_COUNTERS */
944}; 961};
945 962
946static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, 963static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,