aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorTakashi Iwai <tiwai@suse.de>2009-11-01 05:11:07 -0500
committerTakashi Iwai <tiwai@suse.de>2009-11-01 05:11:07 -0500
commite87a3dd33eab30b4db539500064a9584867e4f2c (patch)
tree2f7ad16e46ae30518ff63bb5391b63f7f7cc74dd /mm
parentb14f5de731ae657d498d18d713c6431bfbeefb4b (diff)
parent3d00941371a765779c4e3509214c7e5793cce1fe (diff)
Merge branch 'fix/misc' into topic/misc
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig28
-rw-r--r--mm/Kconfig.debug12
-rw-r--r--mm/Makefile9
-rw-r--r--mm/filemap.c12
-rw-r--r--mm/filemap_xip.c2
-rw-r--r--mm/hugetlb.c265
-rw-r--r--mm/hwpoison-inject.c41
-rw-r--r--mm/internal.h10
-rw-r--r--mm/ksm.c1711
-rw-r--r--mm/madvise.c83
-rw-r--r--mm/memcontrol.c714
-rw-r--r--mm/memory-failure.c832
-rw-r--r--mm/memory.c298
-rw-r--r--mm/memory_hotplug.c13
-rw-r--r--mm/mempool.c7
-rw-r--r--mm/migrate.c26
-rw-r--r--mm/mlock.c128
-rw-r--r--mm/mmap.c59
-rw-r--r--mm/mmu_context.c58
-rw-r--r--mm/mmu_notifier.c20
-rw-r--r--mm/mprotect.c4
-rw-r--r--mm/mremap.c18
-rw-r--r--mm/nommu.c130
-rw-r--r--mm/oom_kill.c86
-rw-r--r--mm/page-writeback.c73
-rw-r--r--mm/page_alloc.c328
-rw-r--r--mm/page_cgroup.c12
-rw-r--r--mm/percpu.c83
-rw-r--r--mm/quicklist.c3
-rw-r--r--mm/rmap.c142
-rw-r--r--mm/shmem.c29
-rw-r--r--mm/slab.c2
-rw-r--r--mm/slub.c3
-rw-r--r--mm/sparse-vmemmap.c8
-rw-r--r--mm/sparse.c9
-rw-r--r--mm/swap.c8
-rw-r--r--mm/swap_state.c143
-rw-r--r--mm/swapfile.c20
-rw-r--r--mm/truncate.c136
-rw-r--r--mm/vmalloc.c223
-rw-r--r--mm/vmscan.c272
-rw-r--r--mm/vmstat.c5
42 files changed, 5037 insertions, 1028 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 3aa519f52e18..edd300aca173 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -214,6 +214,18 @@ config HAVE_MLOCKED_PAGE_BIT
214config MMU_NOTIFIER 214config MMU_NOTIFIER
215 bool 215 bool
216 216
217config KSM
218 bool "Enable KSM for page merging"
219 depends on MMU
220 help
221 Enable Kernel Samepage Merging: KSM periodically scans those areas
222 of an application's address space that an app has advised may be
223 mergeable. When it finds pages of identical content, it replaces
224 the many instances by a single resident page with that content, so
225 saving memory until one or another app needs to modify the content.
226 Recommended for use with KVM, or with other duplicative applications.
227 See Documentation/vm/ksm.txt for more information.
228
217config DEFAULT_MMAP_MIN_ADDR 229config DEFAULT_MMAP_MIN_ADDR
218 int "Low address space to protect from user allocation" 230 int "Low address space to protect from user allocation"
219 default 4096 231 default 4096
@@ -232,6 +244,22 @@ config DEFAULT_MMAP_MIN_ADDR
232 This value can be changed after boot using the 244 This value can be changed after boot using the
233 /proc/sys/vm/mmap_min_addr tunable. 245 /proc/sys/vm/mmap_min_addr tunable.
234 246
247config ARCH_SUPPORTS_MEMORY_FAILURE
248 bool
249
250config MEMORY_FAILURE
251 depends on MMU
252 depends on ARCH_SUPPORTS_MEMORY_FAILURE
253 bool "Enable recovery from hardware memory errors"
254 help
255 Enables code to recover from some memory failures on systems
256 with MCA recovery. This allows a system to continue running
257 even when some of its memory has uncorrected errors. This requires
258 special hardware support and typically ECC memory.
259
260config HWPOISON_INJECT
261 tristate "Poison pages injector"
262 depends on MEMORY_FAILURE && DEBUG_KERNEL
235 263
236config NOMMU_INITIAL_TRIM_EXCESS 264config NOMMU_INITIAL_TRIM_EXCESS
237 int "Turn on mmap() excess space trimming before booting" 265 int "Turn on mmap() excess space trimming before booting"
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index aa99fd1f7109..af7cfb43d2f0 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -6,7 +6,7 @@ config DEBUG_PAGEALLOC
6 ---help--- 6 ---help---
7 Unmap pages from the kernel linear mapping after free_pages(). 7 Unmap pages from the kernel linear mapping after free_pages().
8 This results in a large slowdown, but helps to find certain types 8 This results in a large slowdown, but helps to find certain types
9 of memory corruptions. 9 of memory corruption.
10 10
11config WANT_PAGE_DEBUG_FLAGS 11config WANT_PAGE_DEBUG_FLAGS
12 bool 12 bool
@@ -17,11 +17,11 @@ config PAGE_POISONING
17 depends on !HIBERNATION 17 depends on !HIBERNATION
18 select DEBUG_PAGEALLOC 18 select DEBUG_PAGEALLOC
19 select WANT_PAGE_DEBUG_FLAGS 19 select WANT_PAGE_DEBUG_FLAGS
20 help 20 ---help---
21 Fill the pages with poison patterns after free_pages() and verify 21 Fill the pages with poison patterns after free_pages() and verify
22 the patterns before alloc_pages(). This results in a large slowdown, 22 the patterns before alloc_pages(). This results in a large slowdown,
23 but helps to find certain types of memory corruptions. 23 but helps to find certain types of memory corruption.
24 24
25 This option cannot enalbe with hibernation. Otherwise, it will get 25 This option cannot be enabled in combination with hibernation as
26 wrong messages for memory corruption because the free pages are not 26 that would result in incorrect warnings of memory corruption after
27 saved to the suspend image. 27 a resume because free pages are not saved to the suspend image.
diff --git a/mm/Makefile b/mm/Makefile
index ea4b18bd3960..ebf849042ed3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,16 +5,16 @@
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o 8 vmalloc.o pagewalk.o
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o \ 11 maccess.o page_alloc.o page-writeback.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o mm_init.o $(mmu-y) 14 page_isolation.o mm_init.o mmu_context.o \
15 $(mmu-y)
15obj-y += init-mm.o 16obj-y += init-mm.o
16 17
17obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
18obj-$(CONFIG_BOUNCE) += bounce.o 18obj-$(CONFIG_BOUNCE) += bounce.o
19obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 19obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
20obj-$(CONFIG_HAS_DMA) += dmapool.o 20obj-$(CONFIG_HAS_DMA) += dmapool.o
@@ -25,6 +25,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o 25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
26obj-$(CONFIG_SLOB) += slob.o 26obj-$(CONFIG_SLOB) += slob.o
27obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 27obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
28obj-$(CONFIG_KSM) += ksm.o
28obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o 29obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
29obj-$(CONFIG_SLAB) += slab.o 30obj-$(CONFIG_SLAB) += slab.o
30obj-$(CONFIG_SLUB) += slub.o 31obj-$(CONFIG_SLUB) += slub.o
@@ -40,5 +41,7 @@ obj-$(CONFIG_SMP) += allocpercpu.o
40endif 41endif
41obj-$(CONFIG_QUICKLIST) += quicklist.o 42obj-$(CONFIG_QUICKLIST) += quicklist.o
42obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 43obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
44obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
45obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
43obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 46obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
44obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 47obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/mm/filemap.c b/mm/filemap.c
index dd51c68e2b86..ef169f37156d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -58,7 +58,7 @@
58/* 58/*
59 * Lock ordering: 59 * Lock ordering:
60 * 60 *
61 * ->i_mmap_lock (vmtruncate) 61 * ->i_mmap_lock (truncate_pagecache)
62 * ->private_lock (__free_pte->__set_page_dirty_buffers) 62 * ->private_lock (__free_pte->__set_page_dirty_buffers)
63 * ->swap_lock (exclusive_swap_page, others) 63 * ->swap_lock (exclusive_swap_page, others)
64 * ->mapping->tree_lock 64 * ->mapping->tree_lock
@@ -104,6 +104,10 @@
104 * 104 *
105 * ->task->proc_lock 105 * ->task->proc_lock
106 * ->dcache_lock (proc_pid_lookup) 106 * ->dcache_lock (proc_pid_lookup)
107 *
108 * (code doesn't rely on that order, so you could switch it around)
109 * ->tasklist_lock (memory_failure, collect_procs_ao)
110 * ->i_mmap_lock
107 */ 111 */
108 112
109/* 113/*
@@ -119,6 +123,8 @@ void __remove_from_page_cache(struct page *page)
119 page->mapping = NULL; 123 page->mapping = NULL;
120 mapping->nrpages--; 124 mapping->nrpages--;
121 __dec_zone_page_state(page, NR_FILE_PAGES); 125 __dec_zone_page_state(page, NR_FILE_PAGES);
126 if (PageSwapBacked(page))
127 __dec_zone_page_state(page, NR_SHMEM);
122 BUG_ON(page_mapped(page)); 128 BUG_ON(page_mapped(page));
123 129
124 /* 130 /*
@@ -431,6 +437,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
431 if (likely(!error)) { 437 if (likely(!error)) {
432 mapping->nrpages++; 438 mapping->nrpages++;
433 __inc_zone_page_state(page, NR_FILE_PAGES); 439 __inc_zone_page_state(page, NR_FILE_PAGES);
440 if (PageSwapBacked(page))
441 __inc_zone_page_state(page, NR_SHMEM);
434 spin_unlock_irq(&mapping->tree_lock); 442 spin_unlock_irq(&mapping->tree_lock);
435 } else { 443 } else {
436 page->mapping = NULL; 444 page->mapping = NULL;
@@ -1603,7 +1611,7 @@ page_not_uptodate:
1603} 1611}
1604EXPORT_SYMBOL(filemap_fault); 1612EXPORT_SYMBOL(filemap_fault);
1605 1613
1606struct vm_operations_struct generic_file_vm_ops = { 1614const struct vm_operations_struct generic_file_vm_ops = {
1607 .fault = filemap_fault, 1615 .fault = filemap_fault,
1608}; 1616};
1609 1617
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 427dfe3ce78c..1888b2d71bb8 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -296,7 +296,7 @@ out:
296 } 296 }
297} 297}
298 298
299static struct vm_operations_struct xip_file_vm_ops = { 299static const struct vm_operations_struct xip_file_vm_ops = {
300 .fault = xip_file_fault, 300 .fault = xip_file_fault,
301}; 301};
302 302
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b16d63634777..5d7601b02874 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -456,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
456 h->free_huge_pages_node[nid]++; 456 h->free_huge_pages_node[nid]++;
457} 457}
458 458
459static struct page *dequeue_huge_page(struct hstate *h)
460{
461 int nid;
462 struct page *page = NULL;
463
464 for (nid = 0; nid < MAX_NUMNODES; ++nid) {
465 if (!list_empty(&h->hugepage_freelists[nid])) {
466 page = list_entry(h->hugepage_freelists[nid].next,
467 struct page, lru);
468 list_del(&page->lru);
469 h->free_huge_pages--;
470 h->free_huge_pages_node[nid]--;
471 break;
472 }
473 }
474 return page;
475}
476
477static struct page *dequeue_huge_page_vma(struct hstate *h, 459static struct page *dequeue_huge_page_vma(struct hstate *h,
478 struct vm_area_struct *vma, 460 struct vm_area_struct *vma,
479 unsigned long address, int avoid_reserve) 461 unsigned long address, int avoid_reserve)
@@ -641,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
641 623
642/* 624/*
643 * Use a helper variable to find the next node and then 625 * Use a helper variable to find the next node and then
644 * copy it back to hugetlb_next_nid afterwards: 626 * copy it back to next_nid_to_alloc afterwards:
645 * otherwise there's a window in which a racer might 627 * otherwise there's a window in which a racer might
646 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. 628 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
647 * But we don't need to use a spin_lock here: it really 629 * But we don't need to use a spin_lock here: it really
@@ -650,13 +632,13 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
650 * if we just successfully allocated a hugepage so that 632 * if we just successfully allocated a hugepage so that
651 * the next caller gets hugepages on the next node. 633 * the next caller gets hugepages on the next node.
652 */ 634 */
653static int hstate_next_node(struct hstate *h) 635static int hstate_next_node_to_alloc(struct hstate *h)
654{ 636{
655 int next_nid; 637 int next_nid;
656 next_nid = next_node(h->hugetlb_next_nid, node_online_map); 638 next_nid = next_node(h->next_nid_to_alloc, node_online_map);
657 if (next_nid == MAX_NUMNODES) 639 if (next_nid == MAX_NUMNODES)
658 next_nid = first_node(node_online_map); 640 next_nid = first_node(node_online_map);
659 h->hugetlb_next_nid = next_nid; 641 h->next_nid_to_alloc = next_nid;
660 return next_nid; 642 return next_nid;
661} 643}
662 644
@@ -667,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h)
667 int next_nid; 649 int next_nid;
668 int ret = 0; 650 int ret = 0;
669 651
670 start_nid = h->hugetlb_next_nid; 652 start_nid = h->next_nid_to_alloc;
653 next_nid = start_nid;
671 654
672 do { 655 do {
673 page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); 656 page = alloc_fresh_huge_page_node(h, next_nid);
674 if (page) 657 if (page)
675 ret = 1; 658 ret = 1;
676 next_nid = hstate_next_node(h); 659 next_nid = hstate_next_node_to_alloc(h);
677 } while (!page && h->hugetlb_next_nid != start_nid); 660 } while (!page && next_nid != start_nid);
678 661
679 if (ret) 662 if (ret)
680 count_vm_event(HTLB_BUDDY_PGALLOC); 663 count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -684,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h)
684 return ret; 667 return ret;
685} 668}
686 669
670/*
671 * helper for free_pool_huge_page() - find next node
672 * from which to free a huge page
673 */
674static int hstate_next_node_to_free(struct hstate *h)
675{
676 int next_nid;
677 next_nid = next_node(h->next_nid_to_free, node_online_map);
678 if (next_nid == MAX_NUMNODES)
679 next_nid = first_node(node_online_map);
680 h->next_nid_to_free = next_nid;
681 return next_nid;
682}
683
684/*
685 * Free huge page from pool from next node to free.
686 * Attempt to keep persistent huge pages more or less
687 * balanced over allowed nodes.
688 * Called with hugetlb_lock locked.
689 */
690static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
691{
692 int start_nid;
693 int next_nid;
694 int ret = 0;
695
696 start_nid = h->next_nid_to_free;
697 next_nid = start_nid;
698
699 do {
700 /*
701 * If we're returning unused surplus pages, only examine
702 * nodes with surplus pages.
703 */
704 if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
705 !list_empty(&h->hugepage_freelists[next_nid])) {
706 struct page *page =
707 list_entry(h->hugepage_freelists[next_nid].next,
708 struct page, lru);
709 list_del(&page->lru);
710 h->free_huge_pages--;
711 h->free_huge_pages_node[next_nid]--;
712 if (acct_surplus) {
713 h->surplus_huge_pages--;
714 h->surplus_huge_pages_node[next_nid]--;
715 }
716 update_and_free_page(h, page);
717 ret = 1;
718 }
719 next_nid = hstate_next_node_to_free(h);
720 } while (!ret && next_nid != start_nid);
721
722 return ret;
723}
724
687static struct page *alloc_buddy_huge_page(struct hstate *h, 725static struct page *alloc_buddy_huge_page(struct hstate *h,
688 struct vm_area_struct *vma, unsigned long address) 726 struct vm_area_struct *vma, unsigned long address)
689{ 727{
@@ -855,22 +893,13 @@ free:
855 * When releasing a hugetlb pool reservation, any surplus pages that were 893 * When releasing a hugetlb pool reservation, any surplus pages that were
856 * allocated to satisfy the reservation must be explicitly freed if they were 894 * allocated to satisfy the reservation must be explicitly freed if they were
857 * never used. 895 * never used.
896 * Called with hugetlb_lock held.
858 */ 897 */
859static void return_unused_surplus_pages(struct hstate *h, 898static void return_unused_surplus_pages(struct hstate *h,
860 unsigned long unused_resv_pages) 899 unsigned long unused_resv_pages)
861{ 900{
862 static int nid = -1;
863 struct page *page;
864 unsigned long nr_pages; 901 unsigned long nr_pages;
865 902
866 /*
867 * We want to release as many surplus pages as possible, spread
868 * evenly across all nodes. Iterate across all nodes until we
869 * can no longer free unreserved surplus pages. This occurs when
870 * the nodes with surplus pages have no free pages.
871 */
872 unsigned long remaining_iterations = nr_online_nodes;
873
874 /* Uncommit the reservation */ 903 /* Uncommit the reservation */
875 h->resv_huge_pages -= unused_resv_pages; 904 h->resv_huge_pages -= unused_resv_pages;
876 905
@@ -880,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h,
880 909
881 nr_pages = min(unused_resv_pages, h->surplus_huge_pages); 910 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
882 911
883 while (remaining_iterations-- && nr_pages) { 912 /*
884 nid = next_node(nid, node_online_map); 913 * We want to release as many surplus pages as possible, spread
885 if (nid == MAX_NUMNODES) 914 * evenly across all nodes. Iterate across all nodes until we
886 nid = first_node(node_online_map); 915 * can no longer free unreserved surplus pages. This occurs when
887 916 * the nodes with surplus pages have no free pages.
888 if (!h->surplus_huge_pages_node[nid]) 917 * free_pool_huge_page() will balance the the frees across the
889 continue; 918 * on-line nodes for us and will handle the hstate accounting.
890 919 */
891 if (!list_empty(&h->hugepage_freelists[nid])) { 920 while (nr_pages--) {
892 page = list_entry(h->hugepage_freelists[nid].next, 921 if (!free_pool_huge_page(h, 1))
893 struct page, lru); 922 break;
894 list_del(&page->lru);
895 update_and_free_page(h, page);
896 h->free_huge_pages--;
897 h->free_huge_pages_node[nid]--;
898 h->surplus_huge_pages--;
899 h->surplus_huge_pages_node[nid]--;
900 nr_pages--;
901 remaining_iterations = nr_online_nodes;
902 }
903 } 923 }
904} 924}
905 925
@@ -1008,9 +1028,10 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1008 void *addr; 1028 void *addr;
1009 1029
1010 addr = __alloc_bootmem_node_nopanic( 1030 addr = __alloc_bootmem_node_nopanic(
1011 NODE_DATA(h->hugetlb_next_nid), 1031 NODE_DATA(h->next_nid_to_alloc),
1012 huge_page_size(h), huge_page_size(h), 0); 1032 huge_page_size(h), huge_page_size(h), 0);
1013 1033
1034 hstate_next_node_to_alloc(h);
1014 if (addr) { 1035 if (addr) {
1015 /* 1036 /*
1016 * Use the beginning of the huge page to store the 1037 * Use the beginning of the huge page to store the
@@ -1020,7 +1041,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1020 m = addr; 1041 m = addr;
1021 goto found; 1042 goto found;
1022 } 1043 }
1023 hstate_next_node(h);
1024 nr_nodes--; 1044 nr_nodes--;
1025 } 1045 }
1026 return 0; 1046 return 0;
@@ -1141,31 +1161,43 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
1141 */ 1161 */
1142static int adjust_pool_surplus(struct hstate *h, int delta) 1162static int adjust_pool_surplus(struct hstate *h, int delta)
1143{ 1163{
1144 static int prev_nid; 1164 int start_nid, next_nid;
1145 int nid = prev_nid;
1146 int ret = 0; 1165 int ret = 0;
1147 1166
1148 VM_BUG_ON(delta != -1 && delta != 1); 1167 VM_BUG_ON(delta != -1 && delta != 1);
1149 do {
1150 nid = next_node(nid, node_online_map);
1151 if (nid == MAX_NUMNODES)
1152 nid = first_node(node_online_map);
1153 1168
1154 /* To shrink on this node, there must be a surplus page */ 1169 if (delta < 0)
1155 if (delta < 0 && !h->surplus_huge_pages_node[nid]) 1170 start_nid = h->next_nid_to_alloc;
1156 continue; 1171 else
1157 /* Surplus cannot exceed the total number of pages */ 1172 start_nid = h->next_nid_to_free;
1158 if (delta > 0 && h->surplus_huge_pages_node[nid] >= 1173 next_nid = start_nid;
1174
1175 do {
1176 int nid = next_nid;
1177 if (delta < 0) {
1178 next_nid = hstate_next_node_to_alloc(h);
1179 /*
1180 * To shrink on this node, there must be a surplus page
1181 */
1182 if (!h->surplus_huge_pages_node[nid])
1183 continue;
1184 }
1185 if (delta > 0) {
1186 next_nid = hstate_next_node_to_free(h);
1187 /*
1188 * Surplus cannot exceed the total number of pages
1189 */
1190 if (h->surplus_huge_pages_node[nid] >=
1159 h->nr_huge_pages_node[nid]) 1191 h->nr_huge_pages_node[nid])
1160 continue; 1192 continue;
1193 }
1161 1194
1162 h->surplus_huge_pages += delta; 1195 h->surplus_huge_pages += delta;
1163 h->surplus_huge_pages_node[nid] += delta; 1196 h->surplus_huge_pages_node[nid] += delta;
1164 ret = 1; 1197 ret = 1;
1165 break; 1198 break;
1166 } while (nid != prev_nid); 1199 } while (next_nid != start_nid);
1167 1200
1168 prev_nid = nid;
1169 return ret; 1201 return ret;
1170} 1202}
1171 1203
@@ -1227,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1227 min_count = max(count, min_count); 1259 min_count = max(count, min_count);
1228 try_to_free_low(h, min_count); 1260 try_to_free_low(h, min_count);
1229 while (min_count < persistent_huge_pages(h)) { 1261 while (min_count < persistent_huge_pages(h)) {
1230 struct page *page = dequeue_huge_page(h); 1262 if (!free_pool_huge_page(h, 0))
1231 if (!page)
1232 break; 1263 break;
1233 update_and_free_page(h, page);
1234 } 1264 }
1235 while (count < persistent_huge_pages(h)) { 1265 while (count < persistent_huge_pages(h)) {
1236 if (!adjust_pool_surplus(h, 1)) 1266 if (!adjust_pool_surplus(h, 1))
@@ -1442,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order)
1442 h->free_huge_pages = 0; 1472 h->free_huge_pages = 0;
1443 for (i = 0; i < MAX_NUMNODES; ++i) 1473 for (i = 0; i < MAX_NUMNODES; ++i)
1444 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1474 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1445 h->hugetlb_next_nid = first_node(node_online_map); 1475 h->next_nid_to_alloc = first_node(node_online_map);
1476 h->next_nid_to_free = first_node(node_online_map);
1446 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1477 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1447 huge_page_size(h)/1024); 1478 huge_page_size(h)/1024);
1448 1479
@@ -1506,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
1506 1537
1507#ifdef CONFIG_SYSCTL 1538#ifdef CONFIG_SYSCTL
1508int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1539int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1509 struct file *file, void __user *buffer, 1540 void __user *buffer,
1510 size_t *length, loff_t *ppos) 1541 size_t *length, loff_t *ppos)
1511{ 1542{
1512 struct hstate *h = &default_hstate; 1543 struct hstate *h = &default_hstate;
@@ -1517,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1517 1548
1518 table->data = &tmp; 1549 table->data = &tmp;
1519 table->maxlen = sizeof(unsigned long); 1550 table->maxlen = sizeof(unsigned long);
1520 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1551 proc_doulongvec_minmax(table, write, buffer, length, ppos);
1521 1552
1522 if (write) 1553 if (write)
1523 h->max_huge_pages = set_max_huge_pages(h, tmp); 1554 h->max_huge_pages = set_max_huge_pages(h, tmp);
@@ -1526,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1526} 1557}
1527 1558
1528int hugetlb_treat_movable_handler(struct ctl_table *table, int write, 1559int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1529 struct file *file, void __user *buffer, 1560 void __user *buffer,
1530 size_t *length, loff_t *ppos) 1561 size_t *length, loff_t *ppos)
1531{ 1562{
1532 proc_dointvec(table, write, file, buffer, length, ppos); 1563 proc_dointvec(table, write, buffer, length, ppos);
1533 if (hugepages_treat_as_movable) 1564 if (hugepages_treat_as_movable)
1534 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; 1565 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
1535 else 1566 else
@@ -1538,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1538} 1569}
1539 1570
1540int hugetlb_overcommit_handler(struct ctl_table *table, int write, 1571int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1541 struct file *file, void __user *buffer, 1572 void __user *buffer,
1542 size_t *length, loff_t *ppos) 1573 size_t *length, loff_t *ppos)
1543{ 1574{
1544 struct hstate *h = &default_hstate; 1575 struct hstate *h = &default_hstate;
@@ -1549,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1549 1580
1550 table->data = &tmp; 1581 table->data = &tmp;
1551 table->maxlen = sizeof(unsigned long); 1582 table->maxlen = sizeof(unsigned long);
1552 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1583 proc_doulongvec_minmax(table, write, buffer, length, ppos);
1553 1584
1554 if (write) { 1585 if (write) {
1555 spin_lock(&hugetlb_lock); 1586 spin_lock(&hugetlb_lock);
@@ -1690,7 +1721,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1690 return 0; 1721 return 0;
1691} 1722}
1692 1723
1693struct vm_operations_struct hugetlb_vm_ops = { 1724const struct vm_operations_struct hugetlb_vm_ops = {
1694 .fault = hugetlb_vm_op_fault, 1725 .fault = hugetlb_vm_op_fault,
1695 .open = hugetlb_vm_op_open, 1726 .open = hugetlb_vm_op_open,
1696 .close = hugetlb_vm_op_close, 1727 .close = hugetlb_vm_op_close,
@@ -1985,6 +2016,26 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h,
1985 return find_lock_page(mapping, idx); 2016 return find_lock_page(mapping, idx);
1986} 2017}
1987 2018
2019/*
2020 * Return whether there is a pagecache page to back given address within VMA.
2021 * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
2022 */
2023static bool hugetlbfs_pagecache_present(struct hstate *h,
2024 struct vm_area_struct *vma, unsigned long address)
2025{
2026 struct address_space *mapping;
2027 pgoff_t idx;
2028 struct page *page;
2029
2030 mapping = vma->vm_file->f_mapping;
2031 idx = vma_hugecache_offset(h, vma, address);
2032
2033 page = find_get_page(mapping, idx);
2034 if (page)
2035 put_page(page);
2036 return page != NULL;
2037}
2038
1988static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 2039static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1989 unsigned long address, pte_t *ptep, unsigned int flags) 2040 unsigned long address, pte_t *ptep, unsigned int flags)
1990{ 2041{
@@ -2180,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
2180 return NULL; 2231 return NULL;
2181} 2232}
2182 2233
2183static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
2184{
2185 if (!ptep || write || shared)
2186 return 0;
2187 else
2188 return huge_pte_none(huge_ptep_get(ptep));
2189}
2190
2191int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2234int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2192 struct page **pages, struct vm_area_struct **vmas, 2235 struct page **pages, struct vm_area_struct **vmas,
2193 unsigned long *position, int *length, int i, 2236 unsigned long *position, int *length, int i,
2194 int write) 2237 unsigned int flags)
2195{ 2238{
2196 unsigned long pfn_offset; 2239 unsigned long pfn_offset;
2197 unsigned long vaddr = *position; 2240 unsigned long vaddr = *position;
2198 int remainder = *length; 2241 int remainder = *length;
2199 struct hstate *h = hstate_vma(vma); 2242 struct hstate *h = hstate_vma(vma);
2200 int zeropage_ok = 0;
2201 int shared = vma->vm_flags & VM_SHARED;
2202 2243
2203 spin_lock(&mm->page_table_lock); 2244 spin_lock(&mm->page_table_lock);
2204 while (vaddr < vma->vm_end && remainder) { 2245 while (vaddr < vma->vm_end && remainder) {
2205 pte_t *pte; 2246 pte_t *pte;
2247 int absent;
2206 struct page *page; 2248 struct page *page;
2207 2249
2208 /* 2250 /*
2209 * Some archs (sparc64, sh*) have multiple pte_ts to 2251 * Some archs (sparc64, sh*) have multiple pte_ts to
2210 * each hugepage. We have to make * sure we get the 2252 * each hugepage. We have to make sure we get the
2211 * first, for the page indexing below to work. 2253 * first, for the page indexing below to work.
2212 */ 2254 */
2213 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); 2255 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
2214 if (huge_zeropage_ok(pte, write, shared)) 2256 absent = !pte || huge_pte_none(huge_ptep_get(pte));
2215 zeropage_ok = 1; 2257
2258 /*
2259 * When coredumping, it suits get_dump_page if we just return
2260 * an error where there's an empty slot with no huge pagecache
2261 * to back it. This way, we avoid allocating a hugepage, and
2262 * the sparse dumpfile avoids allocating disk blocks, but its
2263 * huge holes still show up with zeroes where they need to be.
2264 */
2265 if (absent && (flags & FOLL_DUMP) &&
2266 !hugetlbfs_pagecache_present(h, vma, vaddr)) {
2267 remainder = 0;
2268 break;
2269 }
2216 2270
2217 if (!pte || 2271 if (absent ||
2218 (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || 2272 ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
2219 (write && !pte_write(huge_ptep_get(pte)))) {
2220 int ret; 2273 int ret;
2221 2274
2222 spin_unlock(&mm->page_table_lock); 2275 spin_unlock(&mm->page_table_lock);
2223 ret = hugetlb_fault(mm, vma, vaddr, write); 2276 ret = hugetlb_fault(mm, vma, vaddr,
2277 (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
2224 spin_lock(&mm->page_table_lock); 2278 spin_lock(&mm->page_table_lock);
2225 if (!(ret & VM_FAULT_ERROR)) 2279 if (!(ret & VM_FAULT_ERROR))
2226 continue; 2280 continue;
2227 2281
2228 remainder = 0; 2282 remainder = 0;
2229 if (!i)
2230 i = -EFAULT;
2231 break; 2283 break;
2232 } 2284 }
2233 2285
@@ -2235,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2235 page = pte_page(huge_ptep_get(pte)); 2287 page = pte_page(huge_ptep_get(pte));
2236same_page: 2288same_page:
2237 if (pages) { 2289 if (pages) {
2238 if (zeropage_ok) 2290 pages[i] = mem_map_offset(page, pfn_offset);
2239 pages[i] = ZERO_PAGE(0);
2240 else
2241 pages[i] = mem_map_offset(page, pfn_offset);
2242 get_page(pages[i]); 2291 get_page(pages[i]);
2243 } 2292 }
2244 2293
@@ -2262,7 +2311,7 @@ same_page:
2262 *length = remainder; 2311 *length = remainder;
2263 *position = vaddr; 2312 *position = vaddr;
2264 2313
2265 return i; 2314 return i ? i : -EFAULT;
2266} 2315}
2267 2316
2268void hugetlb_change_protection(struct vm_area_struct *vma, 2317void hugetlb_change_protection(struct vm_area_struct *vma,
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
new file mode 100644
index 000000000000..e1d85137f086
--- /dev/null
+++ b/mm/hwpoison-inject.c
@@ -0,0 +1,41 @@
1/* Inject a hwpoison memory failure on a arbitary pfn */
2#include <linux/module.h>
3#include <linux/debugfs.h>
4#include <linux/kernel.h>
5#include <linux/mm.h>
6
7static struct dentry *hwpoison_dir, *corrupt_pfn;
8
9static int hwpoison_inject(void *data, u64 val)
10{
11 if (!capable(CAP_SYS_ADMIN))
12 return -EPERM;
13 printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val);
14 return __memory_failure(val, 18, 0);
15}
16
17DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
18
19static void pfn_inject_exit(void)
20{
21 if (hwpoison_dir)
22 debugfs_remove_recursive(hwpoison_dir);
23}
24
25static int pfn_inject_init(void)
26{
27 hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
28 if (hwpoison_dir == NULL)
29 return -ENOMEM;
30 corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
31 NULL, &hwpoison_fops);
32 if (corrupt_pfn == NULL) {
33 pfn_inject_exit();
34 return -ENOMEM;
35 }
36 return 0;
37}
38
39module_init(pfn_inject_init);
40module_exit(pfn_inject_exit);
41MODULE_LICENSE("GPL");
diff --git a/mm/internal.h b/mm/internal.h
index f290c4db528b..22ec8d2b0fb8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,8 @@ static inline void __put_page(struct page *page)
37 atomic_dec(&page->_count); 37 atomic_dec(&page->_count);
38} 38}
39 39
40extern unsigned long highest_memmap_pfn;
41
40/* 42/*
41 * in mm/vmscan.c: 43 * in mm/vmscan.c:
42 */ 44 */
@@ -46,7 +48,6 @@ extern void putback_lru_page(struct page *page);
46/* 48/*
47 * in mm/page_alloc.c 49 * in mm/page_alloc.c
48 */ 50 */
49extern unsigned long highest_memmap_pfn;
50extern void __free_pages_bootmem(struct page *page, unsigned int order); 51extern void __free_pages_bootmem(struct page *page, unsigned int order);
51extern void prep_compound_page(struct page *page, unsigned long order); 52extern void prep_compound_page(struct page *page, unsigned long order);
52 53
@@ -250,13 +251,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
250} 251}
251#endif /* CONFIG_SPARSEMEM */ 252#endif /* CONFIG_SPARSEMEM */
252 253
253#define GUP_FLAGS_WRITE 0x1
254#define GUP_FLAGS_FORCE 0x2
255#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
256#define GUP_FLAGS_IGNORE_SIGKILL 0x8
257
258int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 254int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
259 unsigned long start, int len, int flags, 255 unsigned long start, int len, unsigned int foll_flags,
260 struct page **pages, struct vm_area_struct **vmas); 256 struct page **pages, struct vm_area_struct **vmas);
261 257
262#define ZONE_RECLAIM_NOSCAN -2 258#define ZONE_RECLAIM_NOSCAN -2
diff --git a/mm/ksm.c b/mm/ksm.c
new file mode 100644
index 000000000000..f7edac356f46
--- /dev/null
+++ b/mm/ksm.c
@@ -0,0 +1,1711 @@
1/*
2 * Memory merging support.
3 *
4 * This code enables dynamic sharing of identical pages found in different
5 * memory areas, even if they are not shared by fork()
6 *
7 * Copyright (C) 2008-2009 Red Hat, Inc.
8 * Authors:
9 * Izik Eidus
10 * Andrea Arcangeli
11 * Chris Wright
12 * Hugh Dickins
13 *
14 * This work is licensed under the terms of the GNU GPL, version 2.
15 */
16
17#include <linux/errno.h>
18#include <linux/mm.h>
19#include <linux/fs.h>
20#include <linux/mman.h>
21#include <linux/sched.h>
22#include <linux/rwsem.h>
23#include <linux/pagemap.h>
24#include <linux/rmap.h>
25#include <linux/spinlock.h>
26#include <linux/jhash.h>
27#include <linux/delay.h>
28#include <linux/kthread.h>
29#include <linux/wait.h>
30#include <linux/slab.h>
31#include <linux/rbtree.h>
32#include <linux/mmu_notifier.h>
33#include <linux/swap.h>
34#include <linux/ksm.h>
35
36#include <asm/tlbflush.h>
37
38/*
39 * A few notes about the KSM scanning process,
40 * to make it easier to understand the data structures below:
41 *
42 * In order to reduce excessive scanning, KSM sorts the memory pages by their
43 * contents into a data structure that holds pointers to the pages' locations.
44 *
45 * Since the contents of the pages may change at any moment, KSM cannot just
46 * insert the pages into a normal sorted tree and expect it to find anything.
47 * Therefore KSM uses two data structures - the stable and the unstable tree.
48 *
49 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
50 * by their contents. Because each such page is write-protected, searching on
51 * this tree is fully assured to be working (except when pages are unmapped),
52 * and therefore this tree is called the stable tree.
53 *
54 * In addition to the stable tree, KSM uses a second data structure called the
55 * unstable tree: this tree holds pointers to pages which have been found to
56 * be "unchanged for a period of time". The unstable tree sorts these pages
57 * by their contents, but since they are not write-protected, KSM cannot rely
58 * upon the unstable tree to work correctly - the unstable tree is liable to
59 * be corrupted as its contents are modified, and so it is called unstable.
60 *
61 * KSM solves this problem by several techniques:
62 *
63 * 1) The unstable tree is flushed every time KSM completes scanning all
64 * memory areas, and then the tree is rebuilt again from the beginning.
65 * 2) KSM will only insert into the unstable tree, pages whose hash value
66 * has not changed since the previous scan of all memory areas.
67 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
68 * colors of the nodes and not on their contents, assuring that even when
69 * the tree gets "corrupted" it won't get out of balance, so scanning time
70 * remains the same (also, searching and inserting nodes in an rbtree uses
71 * the same algorithm, so we have no overhead when we flush and rebuild).
72 * 4) KSM never flushes the stable tree, which means that even if it were to
73 * take 10 attempts to find a page in the unstable tree, once it is found,
74 * it is secured in the stable tree. (When we scan a new page, we first
75 * compare it against the stable tree, and then against the unstable tree.)
76 */
77
78/**
79 * struct mm_slot - ksm information per mm that is being scanned
80 * @link: link to the mm_slots hash list
81 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
82 * @rmap_list: head for this mm_slot's list of rmap_items
83 * @mm: the mm that this information is valid for
84 */
85struct mm_slot {
86 struct hlist_node link;
87 struct list_head mm_list;
88 struct list_head rmap_list;
89 struct mm_struct *mm;
90};
91
92/**
93 * struct ksm_scan - cursor for scanning
94 * @mm_slot: the current mm_slot we are scanning
95 * @address: the next address inside that to be scanned
96 * @rmap_item: the current rmap that we are scanning inside the rmap_list
97 * @seqnr: count of completed full scans (needed when removing unstable node)
98 *
99 * There is only the one ksm_scan instance of this cursor structure.
100 */
101struct ksm_scan {
102 struct mm_slot *mm_slot;
103 unsigned long address;
104 struct rmap_item *rmap_item;
105 unsigned long seqnr;
106};
107
108/**
109 * struct rmap_item - reverse mapping item for virtual addresses
110 * @link: link into mm_slot's rmap_list (rmap_list is per mm)
111 * @mm: the memory structure this rmap_item is pointing into
112 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
113 * @oldchecksum: previous checksum of the page at that virtual address
114 * @node: rb_node of this rmap_item in either unstable or stable tree
115 * @next: next rmap_item hanging off the same node of the stable tree
116 * @prev: previous rmap_item hanging off the same node of the stable tree
117 */
118struct rmap_item {
119 struct list_head link;
120 struct mm_struct *mm;
121 unsigned long address; /* + low bits used for flags below */
122 union {
123 unsigned int oldchecksum; /* when unstable */
124 struct rmap_item *next; /* when stable */
125 };
126 union {
127 struct rb_node node; /* when tree node */
128 struct rmap_item *prev; /* in stable list */
129 };
130};
131
132#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
133#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */
134#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */
135
136/* The stable and unstable tree heads */
137static struct rb_root root_stable_tree = RB_ROOT;
138static struct rb_root root_unstable_tree = RB_ROOT;
139
140#define MM_SLOTS_HASH_HEADS 1024
141static struct hlist_head *mm_slots_hash;
142
143static struct mm_slot ksm_mm_head = {
144 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
145};
146static struct ksm_scan ksm_scan = {
147 .mm_slot = &ksm_mm_head,
148};
149
150static struct kmem_cache *rmap_item_cache;
151static struct kmem_cache *mm_slot_cache;
152
153/* The number of nodes in the stable tree */
154static unsigned long ksm_pages_shared;
155
156/* The number of page slots additionally sharing those nodes */
157static unsigned long ksm_pages_sharing;
158
159/* The number of nodes in the unstable tree */
160static unsigned long ksm_pages_unshared;
161
162/* The number of rmap_items in use: to calculate pages_volatile */
163static unsigned long ksm_rmap_items;
164
165/* Limit on the number of unswappable pages used */
166static unsigned long ksm_max_kernel_pages;
167
168/* Number of pages ksmd should scan in one batch */
169static unsigned int ksm_thread_pages_to_scan = 100;
170
171/* Milliseconds ksmd should sleep between batches */
172static unsigned int ksm_thread_sleep_millisecs = 20;
173
174#define KSM_RUN_STOP 0
175#define KSM_RUN_MERGE 1
176#define KSM_RUN_UNMERGE 2
177static unsigned int ksm_run = KSM_RUN_STOP;
178
179static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
180static DEFINE_MUTEX(ksm_thread_mutex);
181static DEFINE_SPINLOCK(ksm_mmlist_lock);
182
183#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
184 sizeof(struct __struct), __alignof__(struct __struct),\
185 (__flags), NULL)
186
187static void __init ksm_init_max_kernel_pages(void)
188{
189 ksm_max_kernel_pages = nr_free_buffer_pages() / 4;
190}
191
192static int __init ksm_slab_init(void)
193{
194 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
195 if (!rmap_item_cache)
196 goto out;
197
198 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
199 if (!mm_slot_cache)
200 goto out_free;
201
202 return 0;
203
204out_free:
205 kmem_cache_destroy(rmap_item_cache);
206out:
207 return -ENOMEM;
208}
209
210static void __init ksm_slab_free(void)
211{
212 kmem_cache_destroy(mm_slot_cache);
213 kmem_cache_destroy(rmap_item_cache);
214 mm_slot_cache = NULL;
215}
216
217static inline struct rmap_item *alloc_rmap_item(void)
218{
219 struct rmap_item *rmap_item;
220
221 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
222 if (rmap_item)
223 ksm_rmap_items++;
224 return rmap_item;
225}
226
227static inline void free_rmap_item(struct rmap_item *rmap_item)
228{
229 ksm_rmap_items--;
230 rmap_item->mm = NULL; /* debug safety */
231 kmem_cache_free(rmap_item_cache, rmap_item);
232}
233
234static inline struct mm_slot *alloc_mm_slot(void)
235{
236 if (!mm_slot_cache) /* initialization failed */
237 return NULL;
238 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
239}
240
241static inline void free_mm_slot(struct mm_slot *mm_slot)
242{
243 kmem_cache_free(mm_slot_cache, mm_slot);
244}
245
246static int __init mm_slots_hash_init(void)
247{
248 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
249 GFP_KERNEL);
250 if (!mm_slots_hash)
251 return -ENOMEM;
252 return 0;
253}
254
255static void __init mm_slots_hash_free(void)
256{
257 kfree(mm_slots_hash);
258}
259
260static struct mm_slot *get_mm_slot(struct mm_struct *mm)
261{
262 struct mm_slot *mm_slot;
263 struct hlist_head *bucket;
264 struct hlist_node *node;
265
266 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
267 % MM_SLOTS_HASH_HEADS];
268 hlist_for_each_entry(mm_slot, node, bucket, link) {
269 if (mm == mm_slot->mm)
270 return mm_slot;
271 }
272 return NULL;
273}
274
275static void insert_to_mm_slots_hash(struct mm_struct *mm,
276 struct mm_slot *mm_slot)
277{
278 struct hlist_head *bucket;
279
280 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
281 % MM_SLOTS_HASH_HEADS];
282 mm_slot->mm = mm;
283 INIT_LIST_HEAD(&mm_slot->rmap_list);
284 hlist_add_head(&mm_slot->link, bucket);
285}
286
287static inline int in_stable_tree(struct rmap_item *rmap_item)
288{
289 return rmap_item->address & STABLE_FLAG;
290}
291
292/*
293 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
294 * page tables after it has passed through ksm_exit() - which, if necessary,
295 * takes mmap_sem briefly to serialize against them. ksm_exit() does not set
296 * a special flag: they can just back out as soon as mm_users goes to zero.
297 * ksm_test_exit() is used throughout to make this test for exit: in some
298 * places for correctness, in some places just to avoid unnecessary work.
299 */
300static inline bool ksm_test_exit(struct mm_struct *mm)
301{
302 return atomic_read(&mm->mm_users) == 0;
303}
304
305/*
306 * We use break_ksm to break COW on a ksm page: it's a stripped down
307 *
308 * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
309 * put_page(page);
310 *
311 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
312 * in case the application has unmapped and remapped mm,addr meanwhile.
313 * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
314 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
315 */
316static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
317{
318 struct page *page;
319 int ret = 0;
320
321 do {
322 cond_resched();
323 page = follow_page(vma, addr, FOLL_GET);
324 if (!page)
325 break;
326 if (PageKsm(page))
327 ret = handle_mm_fault(vma->vm_mm, vma, addr,
328 FAULT_FLAG_WRITE);
329 else
330 ret = VM_FAULT_WRITE;
331 put_page(page);
332 } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
333 /*
334 * We must loop because handle_mm_fault() may back out if there's
335 * any difficulty e.g. if pte accessed bit gets updated concurrently.
336 *
337 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
338 * COW has been broken, even if the vma does not permit VM_WRITE;
339 * but note that a concurrent fault might break PageKsm for us.
340 *
341 * VM_FAULT_SIGBUS could occur if we race with truncation of the
342 * backing file, which also invalidates anonymous pages: that's
343 * okay, that truncation will have unmapped the PageKsm for us.
344 *
345 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
346 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
347 * current task has TIF_MEMDIE set, and will be OOM killed on return
348 * to user; and ksmd, having no mm, would never be chosen for that.
349 *
350 * But if the mm is in a limited mem_cgroup, then the fault may fail
351 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
352 * even ksmd can fail in this way - though it's usually breaking ksm
353 * just to undo a merge it made a moment before, so unlikely to oom.
354 *
355 * That's a pity: we might therefore have more kernel pages allocated
356 * than we're counting as nodes in the stable tree; but ksm_do_scan
357 * will retry to break_cow on each pass, so should recover the page
358 * in due course. The important thing is to not let VM_MERGEABLE
359 * be cleared while any such pages might remain in the area.
360 */
361 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
362}
363
364static void break_cow(struct mm_struct *mm, unsigned long addr)
365{
366 struct vm_area_struct *vma;
367
368 down_read(&mm->mmap_sem);
369 if (ksm_test_exit(mm))
370 goto out;
371 vma = find_vma(mm, addr);
372 if (!vma || vma->vm_start > addr)
373 goto out;
374 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
375 goto out;
376 break_ksm(vma, addr);
377out:
378 up_read(&mm->mmap_sem);
379}
380
381static struct page *get_mergeable_page(struct rmap_item *rmap_item)
382{
383 struct mm_struct *mm = rmap_item->mm;
384 unsigned long addr = rmap_item->address;
385 struct vm_area_struct *vma;
386 struct page *page;
387
388 down_read(&mm->mmap_sem);
389 if (ksm_test_exit(mm))
390 goto out;
391 vma = find_vma(mm, addr);
392 if (!vma || vma->vm_start > addr)
393 goto out;
394 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
395 goto out;
396
397 page = follow_page(vma, addr, FOLL_GET);
398 if (!page)
399 goto out;
400 if (PageAnon(page)) {
401 flush_anon_page(vma, page, addr);
402 flush_dcache_page(page);
403 } else {
404 put_page(page);
405out: page = NULL;
406 }
407 up_read(&mm->mmap_sem);
408 return page;
409}
410
411/*
412 * get_ksm_page: checks if the page at the virtual address in rmap_item
413 * is still PageKsm, in which case we can trust the content of the page,
414 * and it returns the gotten page; but NULL if the page has been zapped.
415 */
416static struct page *get_ksm_page(struct rmap_item *rmap_item)
417{
418 struct page *page;
419
420 page = get_mergeable_page(rmap_item);
421 if (page && !PageKsm(page)) {
422 put_page(page);
423 page = NULL;
424 }
425 return page;
426}
427
428/*
429 * Removing rmap_item from stable or unstable tree.
430 * This function will clean the information from the stable/unstable tree.
431 */
432static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
433{
434 if (in_stable_tree(rmap_item)) {
435 struct rmap_item *next_item = rmap_item->next;
436
437 if (rmap_item->address & NODE_FLAG) {
438 if (next_item) {
439 rb_replace_node(&rmap_item->node,
440 &next_item->node,
441 &root_stable_tree);
442 next_item->address |= NODE_FLAG;
443 ksm_pages_sharing--;
444 } else {
445 rb_erase(&rmap_item->node, &root_stable_tree);
446 ksm_pages_shared--;
447 }
448 } else {
449 struct rmap_item *prev_item = rmap_item->prev;
450
451 BUG_ON(prev_item->next != rmap_item);
452 prev_item->next = next_item;
453 if (next_item) {
454 BUG_ON(next_item->prev != rmap_item);
455 next_item->prev = rmap_item->prev;
456 }
457 ksm_pages_sharing--;
458 }
459
460 rmap_item->next = NULL;
461
462 } else if (rmap_item->address & NODE_FLAG) {
463 unsigned char age;
464 /*
465 * Usually ksmd can and must skip the rb_erase, because
466 * root_unstable_tree was already reset to RB_ROOT.
467 * But be careful when an mm is exiting: do the rb_erase
468 * if this rmap_item was inserted by this scan, rather
469 * than left over from before.
470 */
471 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
472 BUG_ON(age > 1);
473 if (!age)
474 rb_erase(&rmap_item->node, &root_unstable_tree);
475 ksm_pages_unshared--;
476 }
477
478 rmap_item->address &= PAGE_MASK;
479
480 cond_resched(); /* we're called from many long loops */
481}
482
483static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
484 struct list_head *cur)
485{
486 struct rmap_item *rmap_item;
487
488 while (cur != &mm_slot->rmap_list) {
489 rmap_item = list_entry(cur, struct rmap_item, link);
490 cur = cur->next;
491 remove_rmap_item_from_tree(rmap_item);
492 list_del(&rmap_item->link);
493 free_rmap_item(rmap_item);
494 }
495}
496
497/*
498 * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
499 * than check every pte of a given vma, the locking doesn't quite work for
500 * that - an rmap_item is assigned to the stable tree after inserting ksm
501 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
502 * rmap_items from parent to child at fork time (so as not to waste time
503 * if exit comes before the next scan reaches it).
504 *
505 * Similarly, although we'd like to remove rmap_items (so updating counts
506 * and freeing memory) when unmerging an area, it's easier to leave that
507 * to the next pass of ksmd - consider, for example, how ksmd might be
508 * in cmp_and_merge_page on one of the rmap_items we would be removing.
509 */
510static int unmerge_ksm_pages(struct vm_area_struct *vma,
511 unsigned long start, unsigned long end)
512{
513 unsigned long addr;
514 int err = 0;
515
516 for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
517 if (ksm_test_exit(vma->vm_mm))
518 break;
519 if (signal_pending(current))
520 err = -ERESTARTSYS;
521 else
522 err = break_ksm(vma, addr);
523 }
524 return err;
525}
526
527#ifdef CONFIG_SYSFS
528/*
529 * Only called through the sysfs control interface:
530 */
531static int unmerge_and_remove_all_rmap_items(void)
532{
533 struct mm_slot *mm_slot;
534 struct mm_struct *mm;
535 struct vm_area_struct *vma;
536 int err = 0;
537
538 spin_lock(&ksm_mmlist_lock);
539 ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
540 struct mm_slot, mm_list);
541 spin_unlock(&ksm_mmlist_lock);
542
543 for (mm_slot = ksm_scan.mm_slot;
544 mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
545 mm = mm_slot->mm;
546 down_read(&mm->mmap_sem);
547 for (vma = mm->mmap; vma; vma = vma->vm_next) {
548 if (ksm_test_exit(mm))
549 break;
550 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
551 continue;
552 err = unmerge_ksm_pages(vma,
553 vma->vm_start, vma->vm_end);
554 if (err)
555 goto error;
556 }
557
558 remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
559
560 spin_lock(&ksm_mmlist_lock);
561 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
562 struct mm_slot, mm_list);
563 if (ksm_test_exit(mm)) {
564 hlist_del(&mm_slot->link);
565 list_del(&mm_slot->mm_list);
566 spin_unlock(&ksm_mmlist_lock);
567
568 free_mm_slot(mm_slot);
569 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
570 up_read(&mm->mmap_sem);
571 mmdrop(mm);
572 } else {
573 spin_unlock(&ksm_mmlist_lock);
574 up_read(&mm->mmap_sem);
575 }
576 }
577
578 ksm_scan.seqnr = 0;
579 return 0;
580
581error:
582 up_read(&mm->mmap_sem);
583 spin_lock(&ksm_mmlist_lock);
584 ksm_scan.mm_slot = &ksm_mm_head;
585 spin_unlock(&ksm_mmlist_lock);
586 return err;
587}
588#endif /* CONFIG_SYSFS */
589
590static u32 calc_checksum(struct page *page)
591{
592 u32 checksum;
593 void *addr = kmap_atomic(page, KM_USER0);
594 checksum = jhash2(addr, PAGE_SIZE / 4, 17);
595 kunmap_atomic(addr, KM_USER0);
596 return checksum;
597}
598
599static int memcmp_pages(struct page *page1, struct page *page2)
600{
601 char *addr1, *addr2;
602 int ret;
603
604 addr1 = kmap_atomic(page1, KM_USER0);
605 addr2 = kmap_atomic(page2, KM_USER1);
606 ret = memcmp(addr1, addr2, PAGE_SIZE);
607 kunmap_atomic(addr2, KM_USER1);
608 kunmap_atomic(addr1, KM_USER0);
609 return ret;
610}
611
612static inline int pages_identical(struct page *page1, struct page *page2)
613{
614 return !memcmp_pages(page1, page2);
615}
616
617static int write_protect_page(struct vm_area_struct *vma, struct page *page,
618 pte_t *orig_pte)
619{
620 struct mm_struct *mm = vma->vm_mm;
621 unsigned long addr;
622 pte_t *ptep;
623 spinlock_t *ptl;
624 int swapped;
625 int err = -EFAULT;
626
627 addr = page_address_in_vma(page, vma);
628 if (addr == -EFAULT)
629 goto out;
630
631 ptep = page_check_address(page, mm, addr, &ptl, 0);
632 if (!ptep)
633 goto out;
634
635 if (pte_write(*ptep)) {
636 pte_t entry;
637
638 swapped = PageSwapCache(page);
639 flush_cache_page(vma, addr, page_to_pfn(page));
640 /*
641 * Ok this is tricky, when get_user_pages_fast() run it doesnt
642 * take any lock, therefore the check that we are going to make
643 * with the pagecount against the mapcount is racey and
644 * O_DIRECT can happen right after the check.
645 * So we clear the pte and flush the tlb before the check
646 * this assure us that no O_DIRECT can happen after the check
647 * or in the middle of the check.
648 */
649 entry = ptep_clear_flush(vma, addr, ptep);
650 /*
651 * Check that no O_DIRECT or similar I/O is in progress on the
652 * page
653 */
654 if ((page_mapcount(page) + 2 + swapped) != page_count(page)) {
655 set_pte_at_notify(mm, addr, ptep, entry);
656 goto out_unlock;
657 }
658 entry = pte_wrprotect(entry);
659 set_pte_at_notify(mm, addr, ptep, entry);
660 }
661 *orig_pte = *ptep;
662 err = 0;
663
664out_unlock:
665 pte_unmap_unlock(ptep, ptl);
666out:
667 return err;
668}
669
670/**
671 * replace_page - replace page in vma by new ksm page
672 * @vma: vma that holds the pte pointing to oldpage
673 * @oldpage: the page we are replacing by newpage
674 * @newpage: the ksm page we replace oldpage by
675 * @orig_pte: the original value of the pte
676 *
677 * Returns 0 on success, -EFAULT on failure.
678 */
679static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
680 struct page *newpage, pte_t orig_pte)
681{
682 struct mm_struct *mm = vma->vm_mm;
683 pgd_t *pgd;
684 pud_t *pud;
685 pmd_t *pmd;
686 pte_t *ptep;
687 spinlock_t *ptl;
688 unsigned long addr;
689 pgprot_t prot;
690 int err = -EFAULT;
691
692 prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);
693
694 addr = page_address_in_vma(oldpage, vma);
695 if (addr == -EFAULT)
696 goto out;
697
698 pgd = pgd_offset(mm, addr);
699 if (!pgd_present(*pgd))
700 goto out;
701
702 pud = pud_offset(pgd, addr);
703 if (!pud_present(*pud))
704 goto out;
705
706 pmd = pmd_offset(pud, addr);
707 if (!pmd_present(*pmd))
708 goto out;
709
710 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
711 if (!pte_same(*ptep, orig_pte)) {
712 pte_unmap_unlock(ptep, ptl);
713 goto out;
714 }
715
716 get_page(newpage);
717 page_add_ksm_rmap(newpage);
718
719 flush_cache_page(vma, addr, pte_pfn(*ptep));
720 ptep_clear_flush(vma, addr, ptep);
721 set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));
722
723 page_remove_rmap(oldpage);
724 put_page(oldpage);
725
726 pte_unmap_unlock(ptep, ptl);
727 err = 0;
728out:
729 return err;
730}
731
732/*
733 * try_to_merge_one_page - take two pages and merge them into one
734 * @vma: the vma that hold the pte pointing into oldpage
735 * @oldpage: the page that we want to replace with newpage
736 * @newpage: the page that we want to map instead of oldpage
737 *
738 * Note:
739 * oldpage should be a PageAnon page, while newpage should be a PageKsm page,
740 * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
741 *
742 * This function returns 0 if the pages were merged, -EFAULT otherwise.
743 */
744static int try_to_merge_one_page(struct vm_area_struct *vma,
745 struct page *oldpage,
746 struct page *newpage)
747{
748 pte_t orig_pte = __pte(0);
749 int err = -EFAULT;
750
751 if (!(vma->vm_flags & VM_MERGEABLE))
752 goto out;
753
754 if (!PageAnon(oldpage))
755 goto out;
756
757 get_page(newpage);
758 get_page(oldpage);
759
760 /*
761 * We need the page lock to read a stable PageSwapCache in
762 * write_protect_page(). We use trylock_page() instead of
763 * lock_page() because we don't want to wait here - we
764 * prefer to continue scanning and merging different pages,
765 * then come back to this page when it is unlocked.
766 */
767 if (!trylock_page(oldpage))
768 goto out_putpage;
769 /*
770 * If this anonymous page is mapped only here, its pte may need
771 * to be write-protected. If it's mapped elsewhere, all of its
772 * ptes are necessarily already write-protected. But in either
773 * case, we need to lock and check page_count is not raised.
774 */
775 if (write_protect_page(vma, oldpage, &orig_pte)) {
776 unlock_page(oldpage);
777 goto out_putpage;
778 }
779 unlock_page(oldpage);
780
781 if (pages_identical(oldpage, newpage))
782 err = replace_page(vma, oldpage, newpage, orig_pte);
783
784out_putpage:
785 put_page(oldpage);
786 put_page(newpage);
787out:
788 return err;
789}
790
791/*
792 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
793 * but no new kernel page is allocated: kpage must already be a ksm page.
794 */
795static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
796 unsigned long addr1,
797 struct page *page1,
798 struct page *kpage)
799{
800 struct vm_area_struct *vma;
801 int err = -EFAULT;
802
803 down_read(&mm1->mmap_sem);
804 if (ksm_test_exit(mm1))
805 goto out;
806
807 vma = find_vma(mm1, addr1);
808 if (!vma || vma->vm_start > addr1)
809 goto out;
810
811 err = try_to_merge_one_page(vma, page1, kpage);
812out:
813 up_read(&mm1->mmap_sem);
814 return err;
815}
816
817/*
818 * try_to_merge_two_pages - take two identical pages and prepare them
819 * to be merged into one page.
820 *
821 * This function returns 0 if we successfully mapped two identical pages
822 * into one page, -EFAULT otherwise.
823 *
824 * Note that this function allocates a new kernel page: if one of the pages
825 * is already a ksm page, try_to_merge_with_ksm_page should be used.
826 */
827static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
828 struct page *page1, struct mm_struct *mm2,
829 unsigned long addr2, struct page *page2)
830{
831 struct vm_area_struct *vma;
832 struct page *kpage;
833 int err = -EFAULT;
834
835 /*
836 * The number of nodes in the stable tree
837 * is the number of kernel pages that we hold.
838 */
839 if (ksm_max_kernel_pages &&
840 ksm_max_kernel_pages <= ksm_pages_shared)
841 return err;
842
843 kpage = alloc_page(GFP_HIGHUSER);
844 if (!kpage)
845 return err;
846
847 down_read(&mm1->mmap_sem);
848 if (ksm_test_exit(mm1)) {
849 up_read(&mm1->mmap_sem);
850 goto out;
851 }
852 vma = find_vma(mm1, addr1);
853 if (!vma || vma->vm_start > addr1) {
854 up_read(&mm1->mmap_sem);
855 goto out;
856 }
857
858 copy_user_highpage(kpage, page1, addr1, vma);
859 err = try_to_merge_one_page(vma, page1, kpage);
860 up_read(&mm1->mmap_sem);
861
862 if (!err) {
863 err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage);
864 /*
865 * If that fails, we have a ksm page with only one pte
866 * pointing to it: so break it.
867 */
868 if (err)
869 break_cow(mm1, addr1);
870 }
871out:
872 put_page(kpage);
873 return err;
874}
875
876/*
877 * stable_tree_search - search page inside the stable tree
878 * @page: the page that we are searching identical pages to.
879 * @page2: pointer into identical page that we are holding inside the stable
880 * tree that we have found.
881 * @rmap_item: the reverse mapping item
882 *
883 * This function checks if there is a page inside the stable tree
884 * with identical content to the page that we are scanning right now.
885 *
886 * This function return rmap_item pointer to the identical item if found,
887 * NULL otherwise.
888 */
889static struct rmap_item *stable_tree_search(struct page *page,
890 struct page **page2,
891 struct rmap_item *rmap_item)
892{
893 struct rb_node *node = root_stable_tree.rb_node;
894
895 while (node) {
896 struct rmap_item *tree_rmap_item, *next_rmap_item;
897 int ret;
898
899 tree_rmap_item = rb_entry(node, struct rmap_item, node);
900 while (tree_rmap_item) {
901 BUG_ON(!in_stable_tree(tree_rmap_item));
902 cond_resched();
903 page2[0] = get_ksm_page(tree_rmap_item);
904 if (page2[0])
905 break;
906 next_rmap_item = tree_rmap_item->next;
907 remove_rmap_item_from_tree(tree_rmap_item);
908 tree_rmap_item = next_rmap_item;
909 }
910 if (!tree_rmap_item)
911 return NULL;
912
913 ret = memcmp_pages(page, page2[0]);
914
915 if (ret < 0) {
916 put_page(page2[0]);
917 node = node->rb_left;
918 } else if (ret > 0) {
919 put_page(page2[0]);
920 node = node->rb_right;
921 } else {
922 return tree_rmap_item;
923 }
924 }
925
926 return NULL;
927}
928
929/*
930 * stable_tree_insert - insert rmap_item pointing to new ksm page
931 * into the stable tree.
932 *
933 * @page: the page that we are searching identical page to inside the stable
934 * tree.
935 * @rmap_item: pointer to the reverse mapping item.
936 *
937 * This function returns rmap_item if success, NULL otherwise.
938 */
939static struct rmap_item *stable_tree_insert(struct page *page,
940 struct rmap_item *rmap_item)
941{
942 struct rb_node **new = &root_stable_tree.rb_node;
943 struct rb_node *parent = NULL;
944
945 while (*new) {
946 struct rmap_item *tree_rmap_item, *next_rmap_item;
947 struct page *tree_page;
948 int ret;
949
950 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
951 while (tree_rmap_item) {
952 BUG_ON(!in_stable_tree(tree_rmap_item));
953 cond_resched();
954 tree_page = get_ksm_page(tree_rmap_item);
955 if (tree_page)
956 break;
957 next_rmap_item = tree_rmap_item->next;
958 remove_rmap_item_from_tree(tree_rmap_item);
959 tree_rmap_item = next_rmap_item;
960 }
961 if (!tree_rmap_item)
962 return NULL;
963
964 ret = memcmp_pages(page, tree_page);
965 put_page(tree_page);
966
967 parent = *new;
968 if (ret < 0)
969 new = &parent->rb_left;
970 else if (ret > 0)
971 new = &parent->rb_right;
972 else {
973 /*
974 * It is not a bug that stable_tree_search() didn't
975 * find this node: because at that time our page was
976 * not yet write-protected, so may have changed since.
977 */
978 return NULL;
979 }
980 }
981
982 rmap_item->address |= NODE_FLAG | STABLE_FLAG;
983 rmap_item->next = NULL;
984 rb_link_node(&rmap_item->node, parent, new);
985 rb_insert_color(&rmap_item->node, &root_stable_tree);
986
987 ksm_pages_shared++;
988 return rmap_item;
989}
990
/*
 * unstable_tree_search_insert - search and insert items into the unstable tree.
 *
 * @page: the page that we are going to search for identical page or to insert
 * into the unstable tree
 * @page2: pointer into identical page that was found inside the unstable tree;
 * on a match, page2[0] is returned holding a reference the caller must put
 * @rmap_item: the reverse mapping item of page
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns pointer to rmap_item found to be identical
 * to the currently scanned page, NULL otherwise.
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */
static struct rmap_item *unstable_tree_search_insert(struct page *page,
						struct page **page2,
						struct rmap_item *rmap_item)
{
	struct rb_node **new = &root_unstable_tree.rb_node;
	struct rb_node *parent = NULL;

	while (*new) {
		struct rmap_item *tree_rmap_item;
		int ret;

		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		/* Drop out if the node's page cannot be pinned any more. */
		page2[0] = get_mergeable_page(tree_rmap_item);
		if (!page2[0])
			return NULL;

		/*
		 * Don't substitute an unswappable ksm page
		 * just for one good swappable forked page.
		 */
		if (page == page2[0]) {
			put_page(page2[0]);
			return NULL;
		}

		ret = memcmp_pages(page, page2[0]);

		parent = *new;
		if (ret < 0) {
			/* No match: release the pin and descend left. */
			put_page(page2[0]);
			new = &parent->rb_left;
		} else if (ret > 0) {
			put_page(page2[0]);
			new = &parent->rb_right;
		} else {
			/* Identical content: caller now owns page2[0] ref. */
			return tree_rmap_item;
		}
	}

	/* Nothing identical: insert @rmap_item, tagged with this scan pass. */
	rmap_item->address |= NODE_FLAG;
	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, &root_unstable_tree);

	ksm_pages_unshared++;
	return NULL;
}
1056
1057/*
1058 * stable_tree_append - add another rmap_item to the linked list of
1059 * rmap_items hanging off a given node of the stable tree, all sharing
1060 * the same ksm page.
1061 */
1062static void stable_tree_append(struct rmap_item *rmap_item,
1063 struct rmap_item *tree_rmap_item)
1064{
1065 rmap_item->next = tree_rmap_item->next;
1066 rmap_item->prev = tree_rmap_item;
1067
1068 if (tree_rmap_item->next)
1069 tree_rmap_item->next->prev = rmap_item;
1070
1071 tree_rmap_item->next = rmap_item;
1072 rmap_item->address |= STABLE_FLAG;
1073
1074 ksm_pages_sharing++;
1075}
1076
/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
	struct page *page2[1];
	struct rmap_item *tree_rmap_item;
	unsigned int checksum;
	int err;

	if (in_stable_tree(rmap_item))
		remove_rmap_item_from_tree(rmap_item);

	/* We first start with searching the page inside the stable tree */
	tree_rmap_item = stable_tree_search(page, page2, rmap_item);
	if (tree_rmap_item) {
		if (page == page2[0]) /* forked */
			err = 0;
		else
			err = try_to_merge_with_ksm_page(rmap_item->mm,
							 rmap_item->address,
							 page, page2[0]);
		put_page(page2[0]);

		if (!err) {
			/*
			 * The page was successfully merged:
			 * add its rmap_item to the stable tree.
			 */
			stable_tree_append(rmap_item, tree_rmap_item);
		}
		return;
	}

	/*
	 * A ksm page might have got here by fork, but its other
	 * references have already been removed from the stable tree.
	 * Or it might be left over from a break_ksm which failed
	 * when the mem_cgroup had reached its limit: try again now.
	 */
	if (PageKsm(page))
		break_cow(rmap_item->mm, rmap_item->address);

	/*
	 * If the checksum has changed since the last time we calculated it,
	 * this page is changing frequently: therefore we don't want to
	 * insert it into the unstable tree, and we don't want to waste our
	 * time searching for something identical to it there.
	 */
	checksum = calc_checksum(page);
	if (rmap_item->oldchecksum != checksum) {
		rmap_item->oldchecksum = checksum;
		return;
	}

	tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
	if (tree_rmap_item) {
		err = try_to_merge_two_pages(rmap_item->mm,
					     rmap_item->address, page,
					     tree_rmap_item->mm,
					     tree_rmap_item->address, page2[0]);
		/*
		 * As soon as we merge this page, we want to remove the
		 * rmap_item of the page we have merged with from the unstable
		 * tree, and insert it instead as new node in the stable tree.
		 */
		if (!err) {
			rb_erase(&tree_rmap_item->node, &root_unstable_tree);
			tree_rmap_item->address &= ~NODE_FLAG;
			ksm_pages_unshared--;

			/*
			 * If we fail to insert the page into the stable tree,
			 * we will have 2 virtual addresses that are pointing
			 * to a ksm page left outside the stable tree,
			 * in which case we need to break_cow on both.
			 */
			if (stable_tree_insert(page2[0], tree_rmap_item))
				stable_tree_append(rmap_item, tree_rmap_item);
			else {
				break_cow(tree_rmap_item->mm,
						tree_rmap_item->address);
				break_cow(rmap_item->mm, rmap_item->address);
			}
		}

		/* Release the reference taken by unstable_tree_search_insert. */
		put_page(page2[0]);
	}
}
1172
/*
 * Return the rmap_item for @addr in this mm's rmap_list, reusing an existing
 * item when the cursor @cur is already at that address, otherwise allocating
 * a fresh one and linking it in address order.  Items for addresses below
 * @addr that the scan has passed over are freed.  Returns NULL on allocation
 * failure.
 */
static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
					    struct list_head *cur,
					    unsigned long addr)
{
	struct rmap_item *rmap_item;

	while (cur != &mm_slot->rmap_list) {
		rmap_item = list_entry(cur, struct rmap_item, link);
		if ((rmap_item->address & PAGE_MASK) == addr) {
			/* Reuse it, but make sure it's off any merge tree. */
			if (!in_stable_tree(rmap_item))
				remove_rmap_item_from_tree(rmap_item);
			return rmap_item;
		}
		if (rmap_item->address > addr)
			break;
		/* Stale item for an address we've skipped: discard it. */
		cur = cur->next;
		remove_rmap_item_from_tree(rmap_item);
		list_del(&rmap_item->link);
		free_rmap_item(rmap_item);
	}

	rmap_item = alloc_rmap_item();
	if (rmap_item) {
		/* It has already been zeroed */
		rmap_item->mm = mm_slot->mm;
		rmap_item->address = addr;
		list_add_tail(&rmap_item->link, cur);
	}
	return rmap_item;
}
1203
/*
 * Advance the global scan cursor (ksm_scan) and return the rmap_item for the
 * next anonymous page in a VM_MERGEABLE area, with *page holding a reference
 * to that page.  Returns NULL when a full pass over all mms is complete.
 * mm_slots of exited mms are torn down as the cursor passes them.
 */
static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
	struct mm_struct *mm;
	struct mm_slot *slot;
	struct vm_area_struct *vma;
	struct rmap_item *rmap_item;

	if (list_empty(&ksm_mm_head.mm_list))
		return NULL;

	slot = ksm_scan.mm_slot;
	if (slot == &ksm_mm_head) {
		/* Starting a fresh pass: the unstable tree is rebuilt. */
		root_unstable_tree = RB_ROOT;

		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
		ksm_scan.mm_slot = slot;
		spin_unlock(&ksm_mmlist_lock);
next_mm:
		ksm_scan.address = 0;
		ksm_scan.rmap_item = list_entry(&slot->rmap_list,
						struct rmap_item, link);
	}

	mm = slot->mm;
	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		vma = NULL;
	else
		vma = find_vma(mm, ksm_scan.address);

	for (; vma; vma = vma->vm_next) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;
		if (ksm_scan.address < vma->vm_start)
			ksm_scan.address = vma->vm_start;
		if (!vma->anon_vma)
			/* No anon pages possible here: skip the whole vma. */
			ksm_scan.address = vma->vm_end;

		while (ksm_scan.address < vma->vm_end) {
			if (ksm_test_exit(mm))
				break;
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
			if (*page && PageAnon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
				flush_dcache_page(*page);
				rmap_item = get_next_rmap_item(slot,
					ksm_scan.rmap_item->link.next,
					ksm_scan.address);
				if (rmap_item) {
					ksm_scan.rmap_item = rmap_item;
					ksm_scan.address += PAGE_SIZE;
				} else
					/* Allocation failed: drop page ref. */
					put_page(*page);
				up_read(&mm->mmap_sem);
				return rmap_item;
			}
			if (*page)
				put_page(*page);
			ksm_scan.address += PAGE_SIZE;
			cond_resched();
		}
	}

	if (ksm_test_exit(mm)) {
		ksm_scan.address = 0;
		ksm_scan.rmap_item = list_entry(&slot->rmap_list,
						struct rmap_item, link);
	}
	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
						struct mm_slot, mm_list);
	if (ksm_scan.address == 0) {
		/*
		 * We've completed a full scan of all vmas, holding mmap_sem
		 * throughout, and found no VM_MERGEABLE: so do the same as
		 * __ksm_exit does to remove this mm from all our lists now.
		 * This applies either when cleaning up after __ksm_exit
		 * (but beware: we can reach here even before __ksm_exit),
		 * or when all VM_MERGEABLE areas have been unmapped (and
		 * mmap_sem then protects against race with MADV_MERGEABLE).
		 */
		hlist_del(&slot->link);
		list_del(&slot->mm_list);
		spin_unlock(&ksm_mmlist_lock);

		free_mm_slot(slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		up_read(&mm->mmap_sem);
		mmdrop(mm);
	} else {
		spin_unlock(&ksm_mmlist_lock);
		up_read(&mm->mmap_sem);
	}

	/* Repeat until we've completed scanning the whole list */
	slot = ksm_scan.mm_slot;
	if (slot != &ksm_mm_head)
		goto next_mm;

	/* Back at the list head: one complete pass finished. */
	ksm_scan.seqnr++;
	return NULL;
}
1313
/**
 * ksm_do_scan - the ksm scanner main worker function.
 * @scan_npages - number of pages we want to scan before we return.
 *
 * For each page returned by the scan cursor, either try to merge it
 * (via cmp_and_merge_page) or, for a ksm page whose sharing has dropped
 * to a single mapping, break it back out to an ordinary page.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
	struct rmap_item *rmap_item;
	struct page *page;

	while (scan_npages--) {
		cond_resched();
		rmap_item = scan_get_next_rmap_item(&page);
		if (!rmap_item)
			return;
		if (!PageKsm(page) || !in_stable_tree(rmap_item))
			cmp_and_merge_page(page, rmap_item);
		else if (page_mapcount(page) == 1) {
			/*
			 * Replace now-unshared ksm page by ordinary page.
			 */
			break_cow(rmap_item->mm, rmap_item->address);
			remove_rmap_item_from_tree(rmap_item);
			rmap_item->oldchecksum = calc_checksum(page);
		}
		/* Drop the reference taken by scan_get_next_rmap_item. */
		put_page(page);
	}
}
1341
1342static int ksmd_should_run(void)
1343{
1344 return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
1345}
1346
/*
 * Main loop of the ksmd kernel thread: scan a batch of pages under
 * ksm_thread_mutex, then either sleep for the configured interval or,
 * when there is no work, block until woken or asked to stop.
 */
static int ksm_scan_thread(void *nothing)
{
	/* Run at slightly lowered priority: merging is background work. */
	set_user_nice(current, 5);

	while (!kthread_should_stop()) {
		mutex_lock(&ksm_thread_mutex);
		if (ksmd_should_run())
			ksm_do_scan(ksm_thread_pages_to_scan);
		mutex_unlock(&ksm_thread_mutex);

		if (ksmd_should_run()) {
			schedule_timeout_interruptible(
				msecs_to_jiffies(ksm_thread_sleep_millisecs));
		} else {
			wait_event_interruptible(ksm_thread_wait,
				ksmd_should_run() || kthread_should_stop());
		}
	}
	return 0;
}
1367
/*
 * Handle MADV_MERGEABLE / MADV_UNMERGEABLE for one vma: adjust *vm_flags
 * and register the mm with ksm (or unmerge the range) as needed.
 * Returns 0 on success or when the advice is silently ignored; a negative
 * errno from __ksm_enter / unmerge_ksm_pages otherwise.
 */
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	int err;

	switch (advice) {
	case MADV_MERGEABLE:
		/*
		 * Be somewhat over-protective for now!
		 */
		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
				 VM_RESERVED  | VM_HUGETLB | VM_INSERTPAGE |
				 VM_MIXEDMAP  | VM_SAO))
			return 0;		/* just ignore the advice */

		/* First mergeable area in this mm: register it with ksmd. */
		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
			err = __ksm_enter(mm);
			if (err)
				return err;
		}

		*vm_flags |= VM_MERGEABLE;
		break;

	case MADV_UNMERGEABLE:
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;		/* just ignore the advice */

		/* No anon_vma means no pages can have been merged here. */
		if (vma->anon_vma) {
			err = unmerge_ksm_pages(vma, start, end);
			if (err)
				return err;
		}

		*vm_flags &= ~VM_MERGEABLE;
		break;
	}

	return 0;
}
1410
/*
 * Register @mm with ksm: allocate its mm_slot, hash it, and queue it on the
 * scan list.  Takes a reference on the mm (dropped when the slot is freed)
 * and wakes ksmd if the list was previously empty.  Returns 0 or -ENOMEM.
 */
int __ksm_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int needs_wakeup;

	mm_slot = alloc_mm_slot();
	if (!mm_slot)
		return -ENOMEM;

	/* Check ksm_run too?  Would need tighter locking */
	needs_wakeup = list_empty(&ksm_mm_head.mm_list);

	spin_lock(&ksm_mmlist_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * Insert just behind the scanning cursor, to let the area settle
	 * down a little; when fork is followed by immediate exec, we don't
	 * want ksmd to waste time setting up and tearing down an rmap_list.
	 */
	list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
	spin_unlock(&ksm_mmlist_lock);

	set_bit(MMF_VM_MERGEABLE, &mm->flags);
	/* Pin the mm_struct while it is on our lists. */
	atomic_inc(&mm->mm_count);

	if (needs_wakeup)
		wake_up_interruptible(&ksm_thread_wait);

	return 0;
}
1441
/*
 * Unregister an exiting @mm from ksm, either freeing its mm_slot here or
 * deferring the teardown to ksmd when the scanner may still touch it.
 */
void __ksm_exit(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int easy_to_free = 0;

	/*
	 * This process is exiting: if it's straightforward (as is the
	 * case when ksmd was never running), free mm_slot immediately.
	 * But if it's at the cursor or has rmap_items linked to it, use
	 * mmap_sem to synchronize with any break_cows before pagetables
	 * are freed, and leave the mm_slot on the list for ksmd to free.
	 * Beware: ksm may already have noticed it exiting and freed the slot.
	 */

	spin_lock(&ksm_mmlist_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
		if (list_empty(&mm_slot->rmap_list)) {
			hlist_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			easy_to_free = 1;
		} else {
			/* Move just past the cursor so ksmd cleans it up. */
			list_move(&mm_slot->mm_list,
				  &ksm_scan.mm_slot->mm_list);
		}
	}
	spin_unlock(&ksm_mmlist_lock);

	if (easy_to_free) {
		free_mm_slot(mm_slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		mmdrop(mm);
	} else if (mm_slot) {
		/* Barrier: wait out any break_cow still holding mmap_sem. */
		down_write(&mm->mmap_sem);
		up_write(&mm->mmap_sem);
	}
}
1479
#ifdef CONFIG_SYSFS
/*
 * This all compiles without CONFIG_SYSFS, but is a waste of space.
 */

/* Declare a read-only attribute under /sys/kernel/mm/ksm/. */
#define KSM_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
/* Declare a read-write (0644) attribute under /sys/kernel/mm/ksm/. */
#define KSM_ATTR(_name) \
	static struct kobj_attribute _name##_attr = \
		__ATTR(_name, 0644, _name##_show, _name##_store)
1490
/* Show/store the interval (ms) ksmd sleeps between scan batches. */
static ssize_t sleep_millisecs_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
}

static ssize_t sleep_millisecs_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = strict_strtoul(buf, 10, &msecs);
	/* Value must fit the unsigned int it is stored in. */
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_thread_sleep_millisecs = msecs;

	return count;
}
KSM_ATTR(sleep_millisecs);
1513
/* Show/store how many pages ksmd scans per batch. */
static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
}

static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = strict_strtoul(buf, 10, &nr_pages);
	/* Value must fit the unsigned int it is stored in. */
	if (err || nr_pages > UINT_MAX)
		return -EINVAL;

	ksm_thread_pages_to_scan = nr_pages;

	return count;
}
KSM_ATTR(pages_to_scan);
1536
/* Show/store the run state: 0 stop, KSM_RUN_MERGE, or KSM_RUN_UNMERGE. */
static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%u\n", ksm_run);
}

static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = strict_strtoul(buf, 10, &flags);
	if (err || flags > UINT_MAX)
		return -EINVAL;
	if (flags > KSM_RUN_UNMERGE)
		return -EINVAL;

	/*
	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
	 * breaking COW to free the unswappable pages_shared (but leaves
	 * mm_slots on the list for when ksmd may be set running again).
	 */

	/* Serialize against ksmd's scan loop. */
	mutex_lock(&ksm_thread_mutex);
	if (ksm_run != flags) {
		ksm_run = flags;
		if (flags & KSM_RUN_UNMERGE) {
			/* Unmerging COWs pages: let OOM pick us first. */
			current->flags |= PF_OOM_ORIGIN;
			err = unmerge_and_remove_all_rmap_items();
			current->flags &= ~PF_OOM_ORIGIN;
			if (err) {
				ksm_run = KSM_RUN_STOP;
				count = err;
			}
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	if (flags & KSM_RUN_MERGE)
		wake_up_interruptible(&ksm_thread_wait);

	return count;
}
KSM_ATTR(run);
1583
/* Show/store the cap on unswappable kernel pages ksm may hold. */
static ssize_t max_kernel_pages_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = strict_strtoul(buf, 10, &nr_pages);
	if (err)
		return -EINVAL;

	/* Stored as unsigned long, so no extra range check needed. */
	ksm_max_kernel_pages = nr_pages;

	return count;
}

static ssize_t max_kernel_pages_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
}
KSM_ATTR(max_kernel_pages);
1606
/* Number of shared ksm pages (stable-tree nodes). */
static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);

/* Number of extra mappings sharing those ksm pages. */
static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);

/* Number of unique pages checked but not merged (unstable-tree nodes). */
static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);
1627
1628static ssize_t pages_volatile_show(struct kobject *kobj,
1629 struct kobj_attribute *attr, char *buf)
1630{
1631 long ksm_pages_volatile;
1632
1633 ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
1634 - ksm_pages_sharing - ksm_pages_unshared;
1635 /*
1636 * It was not worth any locking to calculate that statistic,
1637 * but it might therefore sometimes be negative: conceal that.
1638 */
1639 if (ksm_pages_volatile < 0)
1640 ksm_pages_volatile = 0;
1641 return sprintf(buf, "%ld\n", ksm_pages_volatile);
1642}
1643KSM_ATTR_RO(pages_volatile);
1644
/* How many complete passes over all registered mms ksmd has made. */
static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);
1651
/* All ksm tunables and statistics, grouped under /sys/kernel/mm/ksm/. */
static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&max_kernel_pages_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&full_scans_attr.attr,
	NULL,
};

static struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",		/* subdirectory name under mm_kobj */
};
#endif /* CONFIG_SYSFS */
1670
/*
 * Module init: set up slabs and the mm_slot hash, start the ksmd thread,
 * and register the sysfs interface.  Unwinds in reverse order on failure.
 */
static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	/* Choose a default max_kernel_pages based on available memory. */
	ksm_init_max_kernel_pages();

	err = ksm_slab_init();
	if (err)
		goto out;

	err = mm_slots_hash_init();
	if (err)
		goto out_free1;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		printk(KERN_ERR "ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free2;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		printk(KERN_ERR "ksm: register sysfs failed\n");
		/* Without sysfs there is no way to enable ksm: stop ksmd. */
		kthread_stop(ksm_thread);
		goto out_free2;
	}
#endif /* CONFIG_SYSFS */

	return 0;

out_free2:
	mm_slots_hash_free();
out_free1:
	ksm_slab_free();
out:
	return err;
}
module_init(ksm_init)
diff --git a/mm/madvise.c b/mm/madvise.c
index 76eb4193acdd..35b1479b7c9d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,6 +11,7 @@
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/hugetlb.h> 12#include <linux/hugetlb.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/ksm.h>
14 15
15/* 16/*
16 * Any behaviour which results in changes to the vma->vm_flags needs to 17 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -41,7 +42,7 @@ static long madvise_behavior(struct vm_area_struct * vma,
41 struct mm_struct * mm = vma->vm_mm; 42 struct mm_struct * mm = vma->vm_mm;
42 int error = 0; 43 int error = 0;
43 pgoff_t pgoff; 44 pgoff_t pgoff;
44 int new_flags = vma->vm_flags; 45 unsigned long new_flags = vma->vm_flags;
45 46
46 switch (behavior) { 47 switch (behavior) {
47 case MADV_NORMAL: 48 case MADV_NORMAL:
@@ -57,8 +58,18 @@ static long madvise_behavior(struct vm_area_struct * vma,
57 new_flags |= VM_DONTCOPY; 58 new_flags |= VM_DONTCOPY;
58 break; 59 break;
59 case MADV_DOFORK: 60 case MADV_DOFORK:
61 if (vma->vm_flags & VM_IO) {
62 error = -EINVAL;
63 goto out;
64 }
60 new_flags &= ~VM_DONTCOPY; 65 new_flags &= ~VM_DONTCOPY;
61 break; 66 break;
67 case MADV_MERGEABLE:
68 case MADV_UNMERGEABLE:
69 error = ksm_madvise(vma, start, end, behavior, &new_flags);
70 if (error)
71 goto out;
72 break;
62 } 73 }
63 74
64 if (new_flags == vma->vm_flags) { 75 if (new_flags == vma->vm_flags) {
@@ -207,41 +218,46 @@ static long madvise_remove(struct vm_area_struct *vma,
207 return error; 218 return error;
208} 219}
209 220
221#ifdef CONFIG_MEMORY_FAILURE
222/*
223 * Error injection support for memory error handling.
224 */
225static int madvise_hwpoison(unsigned long start, unsigned long end)
226{
227 int ret = 0;
228
229 if (!capable(CAP_SYS_ADMIN))
230 return -EPERM;
231 for (; start < end; start += PAGE_SIZE) {
232 struct page *p;
233 int ret = get_user_pages(current, current->mm, start, 1,
234 0, 0, &p, NULL);
235 if (ret != 1)
236 return ret;
237 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
238 page_to_pfn(p), start);
239 /* Ignore return value for now */
240 __memory_failure(page_to_pfn(p), 0, 1);
241 put_page(p);
242 }
243 return ret;
244}
245#endif
246
210static long 247static long
211madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, 248madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
212 unsigned long start, unsigned long end, int behavior) 249 unsigned long start, unsigned long end, int behavior)
213{ 250{
214 long error;
215
216 switch (behavior) { 251 switch (behavior) {
217 case MADV_DOFORK:
218 if (vma->vm_flags & VM_IO) {
219 error = -EINVAL;
220 break;
221 }
222 case MADV_DONTFORK:
223 case MADV_NORMAL:
224 case MADV_SEQUENTIAL:
225 case MADV_RANDOM:
226 error = madvise_behavior(vma, prev, start, end, behavior);
227 break;
228 case MADV_REMOVE: 252 case MADV_REMOVE:
229 error = madvise_remove(vma, prev, start, end); 253 return madvise_remove(vma, prev, start, end);
230 break;
231
232 case MADV_WILLNEED: 254 case MADV_WILLNEED:
233 error = madvise_willneed(vma, prev, start, end); 255 return madvise_willneed(vma, prev, start, end);
234 break;
235
236 case MADV_DONTNEED: 256 case MADV_DONTNEED:
237 error = madvise_dontneed(vma, prev, start, end); 257 return madvise_dontneed(vma, prev, start, end);
238 break;
239
240 default: 258 default:
241 BUG(); 259 return madvise_behavior(vma, prev, start, end, behavior);
242 break;
243 } 260 }
244 return error;
245} 261}
246 262
247static int 263static int
@@ -256,12 +272,17 @@ madvise_behavior_valid(int behavior)
256 case MADV_REMOVE: 272 case MADV_REMOVE:
257 case MADV_WILLNEED: 273 case MADV_WILLNEED:
258 case MADV_DONTNEED: 274 case MADV_DONTNEED:
275#ifdef CONFIG_KSM
276 case MADV_MERGEABLE:
277 case MADV_UNMERGEABLE:
278#endif
259 return 1; 279 return 1;
260 280
261 default: 281 default:
262 return 0; 282 return 0;
263 } 283 }
264} 284}
285
265/* 286/*
266 * The madvise(2) system call. 287 * The madvise(2) system call.
267 * 288 *
@@ -286,6 +307,12 @@ madvise_behavior_valid(int behavior)
286 * so the kernel can free resources associated with it. 307 * so the kernel can free resources associated with it.
287 * MADV_REMOVE - the application wants to free up the given range of 308 * MADV_REMOVE - the application wants to free up the given range of
288 * pages and associated backing store. 309 * pages and associated backing store.
310 * MADV_DONTFORK - omit this area from child's address space when forking:
311 * typically, to avoid COWing pages pinned by get_user_pages().
312 * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
313 * MADV_MERGEABLE - the application recommends that KSM try to merge pages in
314 * this area with pages of identical content from other such areas.
315 * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
289 * 316 *
290 * return values: 317 * return values:
291 * zero - success 318 * zero - success
@@ -307,6 +334,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
307 int write; 334 int write;
308 size_t len; 335 size_t len;
309 336
337#ifdef CONFIG_MEMORY_FAILURE
338 if (behavior == MADV_HWPOISON)
339 return madvise_hwpoison(start, start+len_in);
340#endif
310 if (!madvise_behavior_valid(behavior)) 341 if (!madvise_behavior_valid(behavior))
311 return error; 342 return error;
312 343
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fd4529d86de5..f99f5991d6bb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
29#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/limits.h> 30#include <linux/limits.h>
31#include <linux/mutex.h> 31#include <linux/mutex.h>
32#include <linux/rbtree.h>
32#include <linux/slab.h> 33#include <linux/slab.h>
33#include <linux/swap.h> 34#include <linux/swap.h>
34#include <linux/spinlock.h> 35#include <linux/spinlock.h>
@@ -43,6 +44,7 @@
43 44
44struct cgroup_subsys mem_cgroup_subsys __read_mostly; 45struct cgroup_subsys mem_cgroup_subsys __read_mostly;
45#define MEM_CGROUP_RECLAIM_RETRIES 5 46#define MEM_CGROUP_RECLAIM_RETRIES 5
47struct mem_cgroup *root_mem_cgroup __read_mostly;
46 48
47#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 49#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
48/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 50/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
@@ -53,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
53#endif 55#endif
54 56
55static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ 57static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */
58#define SOFTLIMIT_EVENTS_THRESH (1000)
56 59
57/* 60/*
58 * Statistics for memory cgroup. 61 * Statistics for memory cgroup.
@@ -66,6 +69,8 @@ enum mem_cgroup_stat_index {
66 MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ 69 MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */
67 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 70 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
68 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 71 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
72 MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
73 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
69 74
70 MEM_CGROUP_STAT_NSTATS, 75 MEM_CGROUP_STAT_NSTATS,
71}; 76};
@@ -78,6 +83,20 @@ struct mem_cgroup_stat {
78 struct mem_cgroup_stat_cpu cpustat[0]; 83 struct mem_cgroup_stat_cpu cpustat[0];
79}; 84};
80 85
86static inline void
87__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
88 enum mem_cgroup_stat_index idx)
89{
90 stat->count[idx] = 0;
91}
92
93static inline s64
94__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
95 enum mem_cgroup_stat_index idx)
96{
97 return stat->count[idx];
98}
99
81/* 100/*
82 * For accounting under irq disable, no need for increment preempt count. 101 * For accounting under irq disable, no need for increment preempt count.
83 */ 102 */
@@ -117,6 +136,12 @@ struct mem_cgroup_per_zone {
117 unsigned long count[NR_LRU_LISTS]; 136 unsigned long count[NR_LRU_LISTS];
118 137
119 struct zone_reclaim_stat reclaim_stat; 138 struct zone_reclaim_stat reclaim_stat;
139 struct rb_node tree_node; /* RB tree node */
140 unsigned long long usage_in_excess;/* Set to the value by which */
141 /* the soft limit is exceeded*/
142 bool on_tree;
143 struct mem_cgroup *mem; /* Back pointer, we cannot */
144 /* use container_of */
120}; 145};
121/* Macro for accessing counter */ 146/* Macro for accessing counter */
122#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 147#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -130,6 +155,26 @@ struct mem_cgroup_lru_info {
130}; 155};
131 156
132/* 157/*
158 * Cgroups above their limits are maintained in a RB-Tree, independent of
159 * their hierarchy representation
160 */
161
162struct mem_cgroup_tree_per_zone {
163 struct rb_root rb_root;
164 spinlock_t lock;
165};
166
167struct mem_cgroup_tree_per_node {
168 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
169};
170
171struct mem_cgroup_tree {
172 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
173};
174
175static struct mem_cgroup_tree soft_limit_tree __read_mostly;
176
177/*
133 * The memory controller data structure. The memory controller controls both 178 * The memory controller data structure. The memory controller controls both
134 * page cache and RSS per cgroup. We would eventually like to provide 179 * page cache and RSS per cgroup. We would eventually like to provide
135 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 180 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -186,6 +231,13 @@ struct mem_cgroup {
186 struct mem_cgroup_stat stat; 231 struct mem_cgroup_stat stat;
187}; 232};
188 233
234/*
235 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
236 * limit reclaim to prevent infinite loops, if they ever occur.
237 */
238#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
239#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
240
189enum charge_type { 241enum charge_type {
190 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 242 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
191 MEM_CGROUP_CHARGE_TYPE_MAPPED, 243 MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -200,13 +252,8 @@ enum charge_type {
200#define PCGF_CACHE (1UL << PCG_CACHE) 252#define PCGF_CACHE (1UL << PCG_CACHE)
201#define PCGF_USED (1UL << PCG_USED) 253#define PCGF_USED (1UL << PCG_USED)
202#define PCGF_LOCK (1UL << PCG_LOCK) 254#define PCGF_LOCK (1UL << PCG_LOCK)
203static const unsigned long 255/* Not used, but added here for completeness */
204pcg_default_flags[NR_CHARGE_TYPE] = { 256#define PCGF_ACCT (1UL << PCG_ACCT)
205 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
206 PCGF_USED | PCGF_LOCK, /* Anon */
207 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
208 0, /* FORCE */
209};
210 257
211/* for encoding cft->private value on file */ 258/* for encoding cft->private value on file */
212#define _MEM (0) 259#define _MEM (0)
@@ -215,15 +262,237 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
215#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 262#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
216#define MEMFILE_ATTR(val) ((val) & 0xffff) 263#define MEMFILE_ATTR(val) ((val) & 0xffff)
217 264
265/*
266 * Reclaim flags for mem_cgroup_hierarchical_reclaim
267 */
268#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
269#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
270#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
271#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
272#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
273#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
274
218static void mem_cgroup_get(struct mem_cgroup *mem); 275static void mem_cgroup_get(struct mem_cgroup *mem);
219static void mem_cgroup_put(struct mem_cgroup *mem); 276static void mem_cgroup_put(struct mem_cgroup *mem);
220static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 277static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
221 278
279static struct mem_cgroup_per_zone *
280mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
281{
282 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
283}
284
285static struct mem_cgroup_per_zone *
286page_cgroup_zoneinfo(struct page_cgroup *pc)
287{
288 struct mem_cgroup *mem = pc->mem_cgroup;
289 int nid = page_cgroup_nid(pc);
290 int zid = page_cgroup_zid(pc);
291
292 if (!mem)
293 return NULL;
294
295 return mem_cgroup_zoneinfo(mem, nid, zid);
296}
297
298static struct mem_cgroup_tree_per_zone *
299soft_limit_tree_node_zone(int nid, int zid)
300{
301 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
302}
303
304static struct mem_cgroup_tree_per_zone *
305soft_limit_tree_from_page(struct page *page)
306{
307 int nid = page_to_nid(page);
308 int zid = page_zonenum(page);
309
310 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
311}
312
313static void
314__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
315 struct mem_cgroup_per_zone *mz,
316 struct mem_cgroup_tree_per_zone *mctz,
317 unsigned long long new_usage_in_excess)
318{
319 struct rb_node **p = &mctz->rb_root.rb_node;
320 struct rb_node *parent = NULL;
321 struct mem_cgroup_per_zone *mz_node;
322
323 if (mz->on_tree)
324 return;
325
326 mz->usage_in_excess = new_usage_in_excess;
327 if (!mz->usage_in_excess)
328 return;
329 while (*p) {
330 parent = *p;
331 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
332 tree_node);
333 if (mz->usage_in_excess < mz_node->usage_in_excess)
334 p = &(*p)->rb_left;
335 /*
336 * We can't avoid mem cgroups that are over their soft
337 * limit by the same amount
338 */
339 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
340 p = &(*p)->rb_right;
341 }
342 rb_link_node(&mz->tree_node, parent, p);
343 rb_insert_color(&mz->tree_node, &mctz->rb_root);
344 mz->on_tree = true;
345}
346
347static void
348__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
349 struct mem_cgroup_per_zone *mz,
350 struct mem_cgroup_tree_per_zone *mctz)
351{
352 if (!mz->on_tree)
353 return;
354 rb_erase(&mz->tree_node, &mctz->rb_root);
355 mz->on_tree = false;
356}
357
358static void
359mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
360 struct mem_cgroup_per_zone *mz,
361 struct mem_cgroup_tree_per_zone *mctz)
362{
363 spin_lock(&mctz->lock);
364 __mem_cgroup_remove_exceeded(mem, mz, mctz);
365 spin_unlock(&mctz->lock);
366}
367
368static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
369{
370 bool ret = false;
371 int cpu;
372 s64 val;
373 struct mem_cgroup_stat_cpu *cpustat;
374
375 cpu = get_cpu();
376 cpustat = &mem->stat.cpustat[cpu];
377 val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
378 if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
379 __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
380 ret = true;
381 }
382 put_cpu();
383 return ret;
384}
385
386static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
387{
388 unsigned long long excess;
389 struct mem_cgroup_per_zone *mz;
390 struct mem_cgroup_tree_per_zone *mctz;
391 int nid = page_to_nid(page);
392 int zid = page_zonenum(page);
393 mctz = soft_limit_tree_from_page(page);
394
395 /*
396 * Necessary to update all ancestors when hierarchy is used.
397 * because their event counter is not touched.
398 */
399 for (; mem; mem = parent_mem_cgroup(mem)) {
400 mz = mem_cgroup_zoneinfo(mem, nid, zid);
401 excess = res_counter_soft_limit_excess(&mem->res);
402 /*
403 * We have to update the tree if mz is on RB-tree or
404 * mem is over its softlimit.
405 */
406 if (excess || mz->on_tree) {
407 spin_lock(&mctz->lock);
408 /* if on-tree, remove it */
409 if (mz->on_tree)
410 __mem_cgroup_remove_exceeded(mem, mz, mctz);
411 /*
412 * Insert again. mz->usage_in_excess will be updated.
413 * If excess is 0, no tree ops.
414 */
415 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
416 spin_unlock(&mctz->lock);
417 }
418 }
419}
420
421static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
422{
423 int node, zone;
424 struct mem_cgroup_per_zone *mz;
425 struct mem_cgroup_tree_per_zone *mctz;
426
427 for_each_node_state(node, N_POSSIBLE) {
428 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
429 mz = mem_cgroup_zoneinfo(mem, node, zone);
430 mctz = soft_limit_tree_node_zone(node, zone);
431 mem_cgroup_remove_exceeded(mem, mz, mctz);
432 }
433 }
434}
435
436static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
437{
438 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
439}
440
441static struct mem_cgroup_per_zone *
442__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
443{
444 struct rb_node *rightmost = NULL;
445 struct mem_cgroup_per_zone *mz;
446
447retry:
448 mz = NULL;
449 rightmost = rb_last(&mctz->rb_root);
450 if (!rightmost)
451 goto done; /* Nothing to reclaim from */
452
453 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
454 /*
455 * Remove the node now but someone else can add it back,
456 * we will to add it back at the end of reclaim to its correct
457 * position in the tree.
458 */
459 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
460 if (!res_counter_soft_limit_excess(&mz->mem->res) ||
461 !css_tryget(&mz->mem->css))
462 goto retry;
463done:
464 return mz;
465}
466
467static struct mem_cgroup_per_zone *
468mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
469{
470 struct mem_cgroup_per_zone *mz;
471
472 spin_lock(&mctz->lock);
473 mz = __mem_cgroup_largest_soft_limit_node(mctz);
474 spin_unlock(&mctz->lock);
475 return mz;
476}
477
478static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
479 bool charge)
480{
481 int val = (charge) ? 1 : -1;
482 struct mem_cgroup_stat *stat = &mem->stat;
483 struct mem_cgroup_stat_cpu *cpustat;
484 int cpu = get_cpu();
485
486 cpustat = &stat->cpustat[cpu];
487 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
488 put_cpu();
489}
490
222static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 491static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
223 struct page_cgroup *pc, 492 struct page_cgroup *pc,
224 bool charge) 493 bool charge)
225{ 494{
226 int val = (charge)? 1 : -1; 495 int val = (charge) ? 1 : -1;
227 struct mem_cgroup_stat *stat = &mem->stat; 496 struct mem_cgroup_stat *stat = &mem->stat;
228 struct mem_cgroup_stat_cpu *cpustat; 497 struct mem_cgroup_stat_cpu *cpustat;
229 int cpu = get_cpu(); 498 int cpu = get_cpu();
@@ -240,28 +509,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
240 else 509 else
241 __mem_cgroup_stat_add_safe(cpustat, 510 __mem_cgroup_stat_add_safe(cpustat,
242 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 511 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
512 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
243 put_cpu(); 513 put_cpu();
244} 514}
245 515
246static struct mem_cgroup_per_zone *
247mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
248{
249 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
250}
251
252static struct mem_cgroup_per_zone *
253page_cgroup_zoneinfo(struct page_cgroup *pc)
254{
255 struct mem_cgroup *mem = pc->mem_cgroup;
256 int nid = page_cgroup_nid(pc);
257 int zid = page_cgroup_zid(pc);
258
259 if (!mem)
260 return NULL;
261
262 return mem_cgroup_zoneinfo(mem, nid, zid);
263}
264
265static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 516static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
266 enum lru_list idx) 517 enum lru_list idx)
267{ 518{
@@ -354,6 +605,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
354 return ret; 605 return ret;
355} 606}
356 607
608static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
609{
610 return (mem == root_mem_cgroup);
611}
612
357/* 613/*
358 * Following LRU functions are allowed to be used without PCG_LOCK. 614 * Following LRU functions are allowed to be used without PCG_LOCK.
359 * Operations are called by routine of global LRU independently from memcg. 615 * Operations are called by routine of global LRU independently from memcg.
@@ -371,22 +627,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
371void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 627void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
372{ 628{
373 struct page_cgroup *pc; 629 struct page_cgroup *pc;
374 struct mem_cgroup *mem;
375 struct mem_cgroup_per_zone *mz; 630 struct mem_cgroup_per_zone *mz;
376 631
377 if (mem_cgroup_disabled()) 632 if (mem_cgroup_disabled())
378 return; 633 return;
379 pc = lookup_page_cgroup(page); 634 pc = lookup_page_cgroup(page);
380 /* can happen while we handle swapcache. */ 635 /* can happen while we handle swapcache. */
381 if (list_empty(&pc->lru) || !pc->mem_cgroup) 636 if (!TestClearPageCgroupAcctLRU(pc))
382 return; 637 return;
638 VM_BUG_ON(!pc->mem_cgroup);
383 /* 639 /*
384 * We don't check PCG_USED bit. It's cleared when the "page" is finally 640 * We don't check PCG_USED bit. It's cleared when the "page" is finally
385 * removed from global LRU. 641 * removed from global LRU.
386 */ 642 */
387 mz = page_cgroup_zoneinfo(pc); 643 mz = page_cgroup_zoneinfo(pc);
388 mem = pc->mem_cgroup;
389 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 644 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
645 if (mem_cgroup_is_root(pc->mem_cgroup))
646 return;
647 VM_BUG_ON(list_empty(&pc->lru));
390 list_del_init(&pc->lru); 648 list_del_init(&pc->lru);
391 return; 649 return;
392} 650}
@@ -410,8 +668,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
410 * For making pc->mem_cgroup visible, insert smp_rmb() here. 668 * For making pc->mem_cgroup visible, insert smp_rmb() here.
411 */ 669 */
412 smp_rmb(); 670 smp_rmb();
413 /* unused page is not rotated. */ 671 /* unused or root page is not rotated. */
414 if (!PageCgroupUsed(pc)) 672 if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
415 return; 673 return;
416 mz = page_cgroup_zoneinfo(pc); 674 mz = page_cgroup_zoneinfo(pc);
417 list_move(&pc->lru, &mz->lists[lru]); 675 list_move(&pc->lru, &mz->lists[lru]);
@@ -425,6 +683,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
425 if (mem_cgroup_disabled()) 683 if (mem_cgroup_disabled())
426 return; 684 return;
427 pc = lookup_page_cgroup(page); 685 pc = lookup_page_cgroup(page);
686 VM_BUG_ON(PageCgroupAcctLRU(pc));
428 /* 687 /*
429 * Used bit is set without atomic ops but after smp_wmb(). 688 * Used bit is set without atomic ops but after smp_wmb().
430 * For making pc->mem_cgroup visible, insert smp_rmb() here. 689 * For making pc->mem_cgroup visible, insert smp_rmb() here.
@@ -435,6 +694,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
435 694
436 mz = page_cgroup_zoneinfo(pc); 695 mz = page_cgroup_zoneinfo(pc);
437 MEM_CGROUP_ZSTAT(mz, lru) += 1; 696 MEM_CGROUP_ZSTAT(mz, lru) += 1;
697 SetPageCgroupAcctLRU(pc);
698 if (mem_cgroup_is_root(pc->mem_cgroup))
699 return;
438 list_add(&pc->lru, &mz->lists[lru]); 700 list_add(&pc->lru, &mz->lists[lru]);
439} 701}
440 702
@@ -469,7 +731,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
469 731
470 spin_lock_irqsave(&zone->lru_lock, flags); 732 spin_lock_irqsave(&zone->lru_lock, flags);
471 /* link when the page is linked to LRU but page_cgroup isn't */ 733 /* link when the page is linked to LRU but page_cgroup isn't */
472 if (PageLRU(page) && list_empty(&pc->lru)) 734 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
473 mem_cgroup_add_lru_list(page, page_lru(page)); 735 mem_cgroup_add_lru_list(page, page_lru(page));
474 spin_unlock_irqrestore(&zone->lru_lock, flags); 736 spin_unlock_irqrestore(&zone->lru_lock, flags);
475} 737}
@@ -648,7 +910,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
648 int nid = z->zone_pgdat->node_id; 910 int nid = z->zone_pgdat->node_id;
649 int zid = zone_idx(z); 911 int zid = zone_idx(z);
650 struct mem_cgroup_per_zone *mz; 912 struct mem_cgroup_per_zone *mz;
651 int lru = LRU_FILE * !!file + !!active; 913 int lru = LRU_FILE * file + active;
652 int ret; 914 int ret;
653 915
654 BUG_ON(!mem_cont); 916 BUG_ON(!mem_cont);
@@ -855,28 +1117,62 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
855 * If shrink==true, for avoiding to free too much, this returns immedieately. 1117 * If shrink==true, for avoiding to free too much, this returns immedieately.
856 */ 1118 */
857static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1119static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
858 gfp_t gfp_mask, bool noswap, bool shrink) 1120 struct zone *zone,
1121 gfp_t gfp_mask,
1122 unsigned long reclaim_options)
859{ 1123{
860 struct mem_cgroup *victim; 1124 struct mem_cgroup *victim;
861 int ret, total = 0; 1125 int ret, total = 0;
862 int loop = 0; 1126 int loop = 0;
1127 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1128 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1129 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1130 unsigned long excess = mem_cgroup_get_excess(root_mem);
863 1131
864 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1132 /* If memsw_is_minimum==1, swap-out is of-no-use. */
865 if (root_mem->memsw_is_minimum) 1133 if (root_mem->memsw_is_minimum)
866 noswap = true; 1134 noswap = true;
867 1135
868 while (loop < 2) { 1136 while (1) {
869 victim = mem_cgroup_select_victim(root_mem); 1137 victim = mem_cgroup_select_victim(root_mem);
870 if (victim == root_mem) 1138 if (victim == root_mem) {
871 loop++; 1139 loop++;
1140 if (loop >= 2) {
1141 /*
1142 * If we have not been able to reclaim
1143 * anything, it might because there are
1144 * no reclaimable pages under this hierarchy
1145 */
1146 if (!check_soft || !total) {
1147 css_put(&victim->css);
1148 break;
1149 }
1150 /*
1151 * We want to do more targetted reclaim.
1152 * excess >> 2 is not to excessive so as to
1153 * reclaim too much, nor too less that we keep
1154 * coming back to reclaim from this cgroup
1155 */
1156 if (total >= (excess >> 2) ||
1157 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1158 css_put(&victim->css);
1159 break;
1160 }
1161 }
1162 }
872 if (!mem_cgroup_local_usage(&victim->stat)) { 1163 if (!mem_cgroup_local_usage(&victim->stat)) {
873 /* this cgroup's local usage == 0 */ 1164 /* this cgroup's local usage == 0 */
874 css_put(&victim->css); 1165 css_put(&victim->css);
875 continue; 1166 continue;
876 } 1167 }
877 /* we use swappiness of local cgroup */ 1168 /* we use swappiness of local cgroup */
878 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, 1169 if (check_soft)
879 get_swappiness(victim)); 1170 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1171 noswap, get_swappiness(victim), zone,
1172 zone->zone_pgdat->node_id);
1173 else
1174 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1175 noswap, get_swappiness(victim));
880 css_put(&victim->css); 1176 css_put(&victim->css);
881 /* 1177 /*
882 * At shrinking usage, we can't check we should stop here or 1178 * At shrinking usage, we can't check we should stop here or
@@ -886,7 +1182,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
886 if (shrink) 1182 if (shrink)
887 return ret; 1183 return ret;
888 total += ret; 1184 total += ret;
889 if (mem_cgroup_check_under_limit(root_mem)) 1185 if (check_soft) {
1186 if (res_counter_check_under_soft_limit(&root_mem->res))
1187 return total;
1188 } else if (mem_cgroup_check_under_limit(root_mem))
890 return 1 + total; 1189 return 1 + total;
891 } 1190 }
892 return total; 1191 return total;
@@ -965,7 +1264,7 @@ done:
965 */ 1264 */
966static int __mem_cgroup_try_charge(struct mm_struct *mm, 1265static int __mem_cgroup_try_charge(struct mm_struct *mm,
967 gfp_t gfp_mask, struct mem_cgroup **memcg, 1266 gfp_t gfp_mask, struct mem_cgroup **memcg,
968 bool oom) 1267 bool oom, struct page *page)
969{ 1268{
970 struct mem_cgroup *mem, *mem_over_limit; 1269 struct mem_cgroup *mem, *mem_over_limit;
971 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1270 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
@@ -996,9 +1295,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
996 VM_BUG_ON(css_is_removed(&mem->css)); 1295 VM_BUG_ON(css_is_removed(&mem->css));
997 1296
998 while (1) { 1297 while (1) {
999 int ret; 1298 int ret = 0;
1000 bool noswap = false; 1299 unsigned long flags = 0;
1001 1300
1301 if (mem_cgroup_is_root(mem))
1302 goto done;
1002 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); 1303 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
1003 if (likely(!ret)) { 1304 if (likely(!ret)) {
1004 if (!do_swap_account) 1305 if (!do_swap_account)
@@ -1009,7 +1310,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1009 break; 1310 break;
1010 /* mem+swap counter fails */ 1311 /* mem+swap counter fails */
1011 res_counter_uncharge(&mem->res, PAGE_SIZE); 1312 res_counter_uncharge(&mem->res, PAGE_SIZE);
1012 noswap = true; 1313 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1013 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1314 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1014 memsw); 1315 memsw);
1015 } else 1316 } else
@@ -1020,8 +1321,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1020 if (!(gfp_mask & __GFP_WAIT)) 1321 if (!(gfp_mask & __GFP_WAIT))
1021 goto nomem; 1322 goto nomem;
1022 1323
1023 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, 1324 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1024 noswap, false); 1325 gfp_mask, flags);
1025 if (ret) 1326 if (ret)
1026 continue; 1327 continue;
1027 1328
@@ -1046,13 +1347,19 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1046 goto nomem; 1347 goto nomem;
1047 } 1348 }
1048 } 1349 }
1350 /*
1351 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
1352 * if they exceeds softlimit.
1353 */
1354 if (mem_cgroup_soft_limit_check(mem))
1355 mem_cgroup_update_tree(mem, page);
1356done:
1049 return 0; 1357 return 0;
1050nomem: 1358nomem:
1051 css_put(&mem->css); 1359 css_put(&mem->css);
1052 return -ENOMEM; 1360 return -ENOMEM;
1053} 1361}
1054 1362
1055
1056/* 1363/*
1057 * A helper function to get mem_cgroup from ID. must be called under 1364 * A helper function to get mem_cgroup from ID. must be called under
1058 * rcu_read_lock(). The caller must check css_is_removed() or some if 1365 * rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -1119,15 +1426,37 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1119 lock_page_cgroup(pc); 1426 lock_page_cgroup(pc);
1120 if (unlikely(PageCgroupUsed(pc))) { 1427 if (unlikely(PageCgroupUsed(pc))) {
1121 unlock_page_cgroup(pc); 1428 unlock_page_cgroup(pc);
1122 res_counter_uncharge(&mem->res, PAGE_SIZE); 1429 if (!mem_cgroup_is_root(mem)) {
1123 if (do_swap_account) 1430 res_counter_uncharge(&mem->res, PAGE_SIZE);
1124 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1431 if (do_swap_account)
1432 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1433 }
1125 css_put(&mem->css); 1434 css_put(&mem->css);
1126 return; 1435 return;
1127 } 1436 }
1437
1128 pc->mem_cgroup = mem; 1438 pc->mem_cgroup = mem;
1439 /*
1440 * We access a page_cgroup asynchronously without lock_page_cgroup().
1441 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
1442 * is accessed after testing USED bit. To make pc->mem_cgroup visible
1443 * before USED bit, we need memory barrier here.
1444 * See mem_cgroup_add_lru_list(), etc.
1445 */
1129 smp_wmb(); 1446 smp_wmb();
1130 pc->flags = pcg_default_flags[ctype]; 1447 switch (ctype) {
1448 case MEM_CGROUP_CHARGE_TYPE_CACHE:
1449 case MEM_CGROUP_CHARGE_TYPE_SHMEM:
1450 SetPageCgroupCache(pc);
1451 SetPageCgroupUsed(pc);
1452 break;
1453 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1454 ClearPageCgroupCache(pc);
1455 SetPageCgroupUsed(pc);
1456 break;
1457 default:
1458 break;
1459 }
1131 1460
1132 mem_cgroup_charge_statistics(mem, pc, true); 1461 mem_cgroup_charge_statistics(mem, pc, true);
1133 1462
@@ -1178,7 +1507,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1178 if (pc->mem_cgroup != from) 1507 if (pc->mem_cgroup != from)
1179 goto out; 1508 goto out;
1180 1509
1181 res_counter_uncharge(&from->res, PAGE_SIZE); 1510 if (!mem_cgroup_is_root(from))
1511 res_counter_uncharge(&from->res, PAGE_SIZE);
1182 mem_cgroup_charge_statistics(from, pc, false); 1512 mem_cgroup_charge_statistics(from, pc, false);
1183 1513
1184 page = pc->page; 1514 page = pc->page;
@@ -1197,7 +1527,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1197 1); 1527 1);
1198 } 1528 }
1199 1529
1200 if (do_swap_account) 1530 if (do_swap_account && !mem_cgroup_is_root(from))
1201 res_counter_uncharge(&from->memsw, PAGE_SIZE); 1531 res_counter_uncharge(&from->memsw, PAGE_SIZE);
1202 css_put(&from->css); 1532 css_put(&from->css);
1203 1533
@@ -1238,7 +1568,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
1238 parent = mem_cgroup_from_cont(pcg); 1568 parent = mem_cgroup_from_cont(pcg);
1239 1569
1240 1570
1241 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 1571 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
1242 if (ret || !parent) 1572 if (ret || !parent)
1243 return ret; 1573 return ret;
1244 1574
@@ -1268,9 +1598,11 @@ uncharge:
1268 /* drop extra refcnt by try_charge() */ 1598 /* drop extra refcnt by try_charge() */
1269 css_put(&parent->css); 1599 css_put(&parent->css);
1270 /* uncharge if move fails */ 1600 /* uncharge if move fails */
1271 res_counter_uncharge(&parent->res, PAGE_SIZE); 1601 if (!mem_cgroup_is_root(parent)) {
1272 if (do_swap_account) 1602 res_counter_uncharge(&parent->res, PAGE_SIZE);
1273 res_counter_uncharge(&parent->memsw, PAGE_SIZE); 1603 if (do_swap_account)
1604 res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1605 }
1274 return ret; 1606 return ret;
1275} 1607}
1276 1608
@@ -1295,7 +1627,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1295 prefetchw(pc); 1627 prefetchw(pc);
1296 1628
1297 mem = memcg; 1629 mem = memcg;
1298 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 1630 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
1299 if (ret || !mem) 1631 if (ret || !mem)
1300 return ret; 1632 return ret;
1301 1633
@@ -1414,14 +1746,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1414 if (!mem) 1746 if (!mem)
1415 goto charge_cur_mm; 1747 goto charge_cur_mm;
1416 *ptr = mem; 1748 *ptr = mem;
1417 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 1749 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
1418 /* drop extra refcnt from tryget */ 1750 /* drop extra refcnt from tryget */
1419 css_put(&mem->css); 1751 css_put(&mem->css);
1420 return ret; 1752 return ret;
1421charge_cur_mm: 1753charge_cur_mm:
1422 if (unlikely(!mm)) 1754 if (unlikely(!mm))
1423 mm = &init_mm; 1755 mm = &init_mm;
1424 return __mem_cgroup_try_charge(mm, mask, ptr, true); 1756 return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
1425} 1757}
1426 1758
1427static void 1759static void
@@ -1459,7 +1791,9 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1459 * This recorded memcg can be obsolete one. So, avoid 1791 * This recorded memcg can be obsolete one. So, avoid
1460 * calling css_tryget 1792 * calling css_tryget
1461 */ 1793 */
1462 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1794 if (!mem_cgroup_is_root(memcg))
1795 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1796 mem_cgroup_swap_statistics(memcg, false);
1463 mem_cgroup_put(memcg); 1797 mem_cgroup_put(memcg);
1464 } 1798 }
1465 rcu_read_unlock(); 1799 rcu_read_unlock();
@@ -1484,9 +1818,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1484 return; 1818 return;
1485 if (!mem) 1819 if (!mem)
1486 return; 1820 return;
1487 res_counter_uncharge(&mem->res, PAGE_SIZE); 1821 if (!mem_cgroup_is_root(mem)) {
1488 if (do_swap_account) 1822 res_counter_uncharge(&mem->res, PAGE_SIZE);
1489 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1823 if (do_swap_account)
1824 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1825 }
1490 css_put(&mem->css); 1826 css_put(&mem->css);
1491} 1827}
1492 1828
@@ -1538,9 +1874,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1538 break; 1874 break;
1539 } 1875 }
1540 1876
1541 res_counter_uncharge(&mem->res, PAGE_SIZE); 1877 if (!mem_cgroup_is_root(mem)) {
1542 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) 1878 res_counter_uncharge(&mem->res, PAGE_SIZE);
1543 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1879 if (do_swap_account &&
1880 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1881 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1882 }
1883 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1884 mem_cgroup_swap_statistics(mem, true);
1544 mem_cgroup_charge_statistics(mem, pc, false); 1885 mem_cgroup_charge_statistics(mem, pc, false);
1545 1886
1546 ClearPageCgroupUsed(pc); 1887 ClearPageCgroupUsed(pc);
@@ -1554,6 +1895,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1554 mz = page_cgroup_zoneinfo(pc); 1895 mz = page_cgroup_zoneinfo(pc);
1555 unlock_page_cgroup(pc); 1896 unlock_page_cgroup(pc);
1556 1897
1898 if (mem_cgroup_soft_limit_check(mem))
1899 mem_cgroup_update_tree(mem, page);
1557 /* at swapout, this memcg will be accessed to record to swap */ 1900 /* at swapout, this memcg will be accessed to record to swap */
1558 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 1901 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1559 css_put(&mem->css); 1902 css_put(&mem->css);
@@ -1629,7 +1972,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
1629 * We uncharge this because swap is freed. 1972 * We uncharge this because swap is freed.
1630 * This memcg can be obsolete one. We avoid calling css_tryget 1973 * This memcg can be obsolete one. We avoid calling css_tryget
1631 */ 1974 */
1632 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1975 if (!mem_cgroup_is_root(memcg))
1976 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1977 mem_cgroup_swap_statistics(memcg, false);
1633 mem_cgroup_put(memcg); 1978 mem_cgroup_put(memcg);
1634 } 1979 }
1635 rcu_read_unlock(); 1980 rcu_read_unlock();
@@ -1658,7 +2003,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
1658 unlock_page_cgroup(pc); 2003 unlock_page_cgroup(pc);
1659 2004
1660 if (mem) { 2005 if (mem) {
1661 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 2006 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
2007 page);
1662 css_put(&mem->css); 2008 css_put(&mem->css);
1663 } 2009 }
1664 *ptr = mem; 2010 *ptr = mem;
@@ -1798,8 +2144,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1798 if (!ret) 2144 if (!ret)
1799 break; 2145 break;
1800 2146
1801 progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, 2147 progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
1802 false, true); 2148 GFP_KERNEL,
2149 MEM_CGROUP_RECLAIM_SHRINK);
1803 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2150 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
1804 /* Usage is reduced ? */ 2151 /* Usage is reduced ? */
1805 if (curusage >= oldusage) 2152 if (curusage >= oldusage)
@@ -1851,7 +2198,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1851 if (!ret) 2198 if (!ret)
1852 break; 2199 break;
1853 2200
1854 mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true); 2201 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2202 MEM_CGROUP_RECLAIM_NOSWAP |
2203 MEM_CGROUP_RECLAIM_SHRINK);
1855 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2204 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1856 /* Usage is reduced ? */ 2205 /* Usage is reduced ? */
1857 if (curusage >= oldusage) 2206 if (curusage >= oldusage)
@@ -1862,6 +2211,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1862 return ret; 2211 return ret;
1863} 2212}
1864 2213
2214unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2215 gfp_t gfp_mask, int nid,
2216 int zid)
2217{
2218 unsigned long nr_reclaimed = 0;
2219 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
2220 unsigned long reclaimed;
2221 int loop = 0;
2222 struct mem_cgroup_tree_per_zone *mctz;
2223 unsigned long long excess;
2224
2225 if (order > 0)
2226 return 0;
2227
2228 mctz = soft_limit_tree_node_zone(nid, zid);
2229 /*
2230 * This loop can run a while, specially if mem_cgroup's continuously
2231 * keep exceeding their soft limit and putting the system under
2232 * pressure
2233 */
2234 do {
2235 if (next_mz)
2236 mz = next_mz;
2237 else
2238 mz = mem_cgroup_largest_soft_limit_node(mctz);
2239 if (!mz)
2240 break;
2241
2242 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
2243 gfp_mask,
2244 MEM_CGROUP_RECLAIM_SOFT);
2245 nr_reclaimed += reclaimed;
2246 spin_lock(&mctz->lock);
2247
2248 /*
2249 * If we failed to reclaim anything from this memory cgroup
2250 * it is time to move on to the next cgroup
2251 */
2252 next_mz = NULL;
2253 if (!reclaimed) {
2254 do {
2255 /*
2256 * Loop until we find yet another one.
2257 *
2258 * By the time we get the soft_limit lock
2259 * again, someone might have aded the
2260 * group back on the RB tree. Iterate to
2261 * make sure we get a different mem.
2262 * mem_cgroup_largest_soft_limit_node returns
2263 * NULL if no other cgroup is present on
2264 * the tree
2265 */
2266 next_mz =
2267 __mem_cgroup_largest_soft_limit_node(mctz);
2268 if (next_mz == mz) {
2269 css_put(&next_mz->mem->css);
2270 next_mz = NULL;
2271 } else /* next_mz == NULL or other memcg */
2272 break;
2273 } while (1);
2274 }
2275 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
2276 excess = res_counter_soft_limit_excess(&mz->mem->res);
2277 /*
2278 * One school of thought says that we should not add
2279 * back the node to the tree if reclaim returns 0.
2280 * But our reclaim could return 0, simply because due
2281 * to priority we are exposing a smaller subset of
2282 * memory to reclaim from. Consider this as a longer
2283 * term TODO.
2284 */
2285 /* If excess == 0, no tree ops */
2286 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
2287 spin_unlock(&mctz->lock);
2288 css_put(&mz->mem->css);
2289 loop++;
2290 /*
2291 * Could not reclaim anything and there are no more
2292 * mem cgroups to try or we seem to be looping without
2293 * reclaiming anything.
2294 */
2295 if (!nr_reclaimed &&
2296 (next_mz == NULL ||
2297 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2298 break;
2299 } while (!nr_reclaimed);
2300 if (next_mz)
2301 css_put(&next_mz->mem->css);
2302 return nr_reclaimed;
2303}
2304
1865/* 2305/*
1866 * This routine traverse page_cgroup in given list and drop them all. 2306 * This routine traverse page_cgroup in given list and drop them all.
1867 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 2307 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
@@ -2046,20 +2486,64 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
2046 return retval; 2486 return retval;
2047} 2487}
2048 2488
2489struct mem_cgroup_idx_data {
2490 s64 val;
2491 enum mem_cgroup_stat_index idx;
2492};
2493
2494static int
2495mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2496{
2497 struct mem_cgroup_idx_data *d = data;
2498 d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
2499 return 0;
2500}
2501
2502static void
2503mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
2504 enum mem_cgroup_stat_index idx, s64 *val)
2505{
2506 struct mem_cgroup_idx_data d;
2507 d.idx = idx;
2508 d.val = 0;
2509 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
2510 *val = d.val;
2511}
2512
2049static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 2513static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
2050{ 2514{
2051 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2515 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2052 u64 val = 0; 2516 u64 idx_val, val;
2053 int type, name; 2517 int type, name;
2054 2518
2055 type = MEMFILE_TYPE(cft->private); 2519 type = MEMFILE_TYPE(cft->private);
2056 name = MEMFILE_ATTR(cft->private); 2520 name = MEMFILE_ATTR(cft->private);
2057 switch (type) { 2521 switch (type) {
2058 case _MEM: 2522 case _MEM:
2059 val = res_counter_read_u64(&mem->res, name); 2523 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2524 mem_cgroup_get_recursive_idx_stat(mem,
2525 MEM_CGROUP_STAT_CACHE, &idx_val);
2526 val = idx_val;
2527 mem_cgroup_get_recursive_idx_stat(mem,
2528 MEM_CGROUP_STAT_RSS, &idx_val);
2529 val += idx_val;
2530 val <<= PAGE_SHIFT;
2531 } else
2532 val = res_counter_read_u64(&mem->res, name);
2060 break; 2533 break;
2061 case _MEMSWAP: 2534 case _MEMSWAP:
2062 val = res_counter_read_u64(&mem->memsw, name); 2535 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2536 mem_cgroup_get_recursive_idx_stat(mem,
2537 MEM_CGROUP_STAT_CACHE, &idx_val);
2538 val = idx_val;
2539 mem_cgroup_get_recursive_idx_stat(mem,
2540 MEM_CGROUP_STAT_RSS, &idx_val);
2541 val += idx_val;
2542 mem_cgroup_get_recursive_idx_stat(mem,
2543 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2544 val <<= PAGE_SHIFT;
2545 } else
2546 val = res_counter_read_u64(&mem->memsw, name);
2063 break; 2547 break;
2064 default: 2548 default:
2065 BUG(); 2549 BUG();
@@ -2083,6 +2567,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
2083 name = MEMFILE_ATTR(cft->private); 2567 name = MEMFILE_ATTR(cft->private);
2084 switch (name) { 2568 switch (name) {
2085 case RES_LIMIT: 2569 case RES_LIMIT:
2570 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
2571 ret = -EINVAL;
2572 break;
2573 }
2086 /* This function does all necessary parse...reuse it */ 2574 /* This function does all necessary parse...reuse it */
2087 ret = res_counter_memparse_write_strategy(buffer, &val); 2575 ret = res_counter_memparse_write_strategy(buffer, &val);
2088 if (ret) 2576 if (ret)
@@ -2092,6 +2580,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
2092 else 2580 else
2093 ret = mem_cgroup_resize_memsw_limit(memcg, val); 2581 ret = mem_cgroup_resize_memsw_limit(memcg, val);
2094 break; 2582 break;
2583 case RES_SOFT_LIMIT:
2584 ret = res_counter_memparse_write_strategy(buffer, &val);
2585 if (ret)
2586 break;
2587 /*
2588 * For memsw, soft limits are hard to implement in terms
2589 * of semantics, for now, we support soft limits for
2590 * control without swap
2591 */
2592 if (type == _MEM)
2593 ret = res_counter_set_soft_limit(&memcg->res, val);
2594 else
2595 ret = -EINVAL;
2596 break;
2095 default: 2597 default:
2096 ret = -EINVAL; /* should be BUG() ? */ 2598 ret = -EINVAL; /* should be BUG() ? */
2097 break; 2599 break;
@@ -2149,6 +2651,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2149 res_counter_reset_failcnt(&mem->memsw); 2651 res_counter_reset_failcnt(&mem->memsw);
2150 break; 2652 break;
2151 } 2653 }
2654
2152 return 0; 2655 return 0;
2153} 2656}
2154 2657
@@ -2160,6 +2663,7 @@ enum {
2160 MCS_MAPPED_FILE, 2663 MCS_MAPPED_FILE,
2161 MCS_PGPGIN, 2664 MCS_PGPGIN,
2162 MCS_PGPGOUT, 2665 MCS_PGPGOUT,
2666 MCS_SWAP,
2163 MCS_INACTIVE_ANON, 2667 MCS_INACTIVE_ANON,
2164 MCS_ACTIVE_ANON, 2668 MCS_ACTIVE_ANON,
2165 MCS_INACTIVE_FILE, 2669 MCS_INACTIVE_FILE,
@@ -2181,6 +2685,7 @@ struct {
2181 {"mapped_file", "total_mapped_file"}, 2685 {"mapped_file", "total_mapped_file"},
2182 {"pgpgin", "total_pgpgin"}, 2686 {"pgpgin", "total_pgpgin"},
2183 {"pgpgout", "total_pgpgout"}, 2687 {"pgpgout", "total_pgpgout"},
2688 {"swap", "total_swap"},
2184 {"inactive_anon", "total_inactive_anon"}, 2689 {"inactive_anon", "total_inactive_anon"},
2185 {"active_anon", "total_active_anon"}, 2690 {"active_anon", "total_active_anon"},
2186 {"inactive_file", "total_inactive_file"}, 2691 {"inactive_file", "total_inactive_file"},
@@ -2205,6 +2710,10 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2205 s->stat[MCS_PGPGIN] += val; 2710 s->stat[MCS_PGPGIN] += val;
2206 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); 2711 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2207 s->stat[MCS_PGPGOUT] += val; 2712 s->stat[MCS_PGPGOUT] += val;
2713 if (do_swap_account) {
2714 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
2715 s->stat[MCS_SWAP] += val * PAGE_SIZE;
2716 }
2208 2717
2209 /* per zone stat */ 2718 /* per zone stat */
2210 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 2719 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -2236,8 +2745,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
2236 memset(&mystat, 0, sizeof(mystat)); 2745 memset(&mystat, 0, sizeof(mystat));
2237 mem_cgroup_get_local_stat(mem_cont, &mystat); 2746 mem_cgroup_get_local_stat(mem_cont, &mystat);
2238 2747
2239 for (i = 0; i < NR_MCS_STAT; i++) 2748 for (i = 0; i < NR_MCS_STAT; i++) {
2749 if (i == MCS_SWAP && !do_swap_account)
2750 continue;
2240 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 2751 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
2752 }
2241 2753
2242 /* Hierarchical information */ 2754 /* Hierarchical information */
2243 { 2755 {
@@ -2250,9 +2762,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
2250 2762
2251 memset(&mystat, 0, sizeof(mystat)); 2763 memset(&mystat, 0, sizeof(mystat));
2252 mem_cgroup_get_total_stat(mem_cont, &mystat); 2764 mem_cgroup_get_total_stat(mem_cont, &mystat);
2253 for (i = 0; i < NR_MCS_STAT; i++) 2765 for (i = 0; i < NR_MCS_STAT; i++) {
2766 if (i == MCS_SWAP && !do_swap_account)
2767 continue;
2254 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 2768 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
2255 2769 }
2256 2770
2257#ifdef CONFIG_DEBUG_VM 2771#ifdef CONFIG_DEBUG_VM
2258 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 2772 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
@@ -2345,6 +2859,12 @@ static struct cftype mem_cgroup_files[] = {
2345 .read_u64 = mem_cgroup_read, 2859 .read_u64 = mem_cgroup_read,
2346 }, 2860 },
2347 { 2861 {
2862 .name = "soft_limit_in_bytes",
2863 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
2864 .write_string = mem_cgroup_write,
2865 .read_u64 = mem_cgroup_read,
2866 },
2867 {
2348 .name = "failcnt", 2868 .name = "failcnt",
2349 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 2869 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
2350 .trigger = mem_cgroup_reset, 2870 .trigger = mem_cgroup_reset,
@@ -2438,6 +2958,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
2438 mz = &pn->zoneinfo[zone]; 2958 mz = &pn->zoneinfo[zone];
2439 for_each_lru(l) 2959 for_each_lru(l)
2440 INIT_LIST_HEAD(&mz->lists[l]); 2960 INIT_LIST_HEAD(&mz->lists[l]);
2961 mz->usage_in_excess = 0;
2962 mz->on_tree = false;
2963 mz->mem = mem;
2441 } 2964 }
2442 return 0; 2965 return 0;
2443} 2966}
@@ -2483,6 +3006,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
2483{ 3006{
2484 int node; 3007 int node;
2485 3008
3009 mem_cgroup_remove_from_trees(mem);
2486 free_css_id(&mem_cgroup_subsys, &mem->css); 3010 free_css_id(&mem_cgroup_subsys, &mem->css);
2487 3011
2488 for_each_node_state(node, N_POSSIBLE) 3012 for_each_node_state(node, N_POSSIBLE)
@@ -2531,6 +3055,31 @@ static void __init enable_swap_cgroup(void)
2531} 3055}
2532#endif 3056#endif
2533 3057
3058static int mem_cgroup_soft_limit_tree_init(void)
3059{
3060 struct mem_cgroup_tree_per_node *rtpn;
3061 struct mem_cgroup_tree_per_zone *rtpz;
3062 int tmp, node, zone;
3063
3064 for_each_node_state(node, N_POSSIBLE) {
3065 tmp = node;
3066 if (!node_state(node, N_NORMAL_MEMORY))
3067 tmp = -1;
3068 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
3069 if (!rtpn)
3070 return 1;
3071
3072 soft_limit_tree.rb_tree_per_node[node] = rtpn;
3073
3074 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3075 rtpz = &rtpn->rb_tree_per_zone[zone];
3076 rtpz->rb_root = RB_ROOT;
3077 spin_lock_init(&rtpz->lock);
3078 }
3079 }
3080 return 0;
3081}
3082
2534static struct cgroup_subsys_state * __ref 3083static struct cgroup_subsys_state * __ref
2535mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 3084mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2536{ 3085{
@@ -2545,10 +3094,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2545 for_each_node_state(node, N_POSSIBLE) 3094 for_each_node_state(node, N_POSSIBLE)
2546 if (alloc_mem_cgroup_per_zone_info(mem, node)) 3095 if (alloc_mem_cgroup_per_zone_info(mem, node))
2547 goto free_out; 3096 goto free_out;
3097
2548 /* root ? */ 3098 /* root ? */
2549 if (cont->parent == NULL) { 3099 if (cont->parent == NULL) {
2550 enable_swap_cgroup(); 3100 enable_swap_cgroup();
2551 parent = NULL; 3101 parent = NULL;
3102 root_mem_cgroup = mem;
3103 if (mem_cgroup_soft_limit_tree_init())
3104 goto free_out;
3105
2552 } else { 3106 } else {
2553 parent = mem_cgroup_from_cont(cont->parent); 3107 parent = mem_cgroup_from_cont(cont->parent);
2554 mem->use_hierarchy = parent->use_hierarchy; 3108 mem->use_hierarchy = parent->use_hierarchy;
@@ -2577,6 +3131,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2577 return &mem->css; 3131 return &mem->css;
2578free_out: 3132free_out:
2579 __mem_cgroup_free(mem); 3133 __mem_cgroup_free(mem);
3134 root_mem_cgroup = NULL;
2580 return ERR_PTR(error); 3135 return ERR_PTR(error);
2581} 3136}
2582 3137
@@ -2612,7 +3167,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
2612static void mem_cgroup_move_task(struct cgroup_subsys *ss, 3167static void mem_cgroup_move_task(struct cgroup_subsys *ss,
2613 struct cgroup *cont, 3168 struct cgroup *cont,
2614 struct cgroup *old_cont, 3169 struct cgroup *old_cont,
2615 struct task_struct *p) 3170 struct task_struct *p,
3171 bool threadgroup)
2616{ 3172{
2617 mutex_lock(&memcg_tasklist); 3173 mutex_lock(&memcg_tasklist);
2618 /* 3174 /*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
new file mode 100644
index 000000000000..729d4b15b645
--- /dev/null
+++ b/mm/memory-failure.c
@@ -0,0 +1,832 @@
1/*
2 * Copyright (C) 2008, 2009 Intel Corporation
3 * Authors: Andi Kleen, Fengguang Wu
4 *
5 * This software may be redistributed and/or modified under the terms of
6 * the GNU General Public License ("GPL") version 2 only as published by the
7 * Free Software Foundation.
8 *
9 * High level machine check handler. Handles pages reported by the
10 * hardware as being corrupted usually due to a 2bit ECC memory or cache
11 * failure.
12 *
13 * Handles page cache pages in various states. The tricky part
14 * here is that we can access any page asynchronous to other VM
15 * users, because memory failures could happen anytime and anywhere,
16 * possibly violating some of their assumptions. This is why this code
17 * has to be extremely careful. Generally it tries to use normal locking
18 * rules, as in get the standard locks, even if that means the
19 * error handling takes potentially a long time.
20 *
21 * The operation to map back from RMAP chains to processes has to walk
22 * the complete process list and has non linear complexity with the number
23 * mappings. In short it can be quite slow. But since memory corruptions
24 * are rare we hope to get away with this.
25 */
26
27/*
28 * Notebook:
29 * - hugetlb needs more code
30 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
31 * - pass bad pages to kdump next kernel
32 */
33#define DEBUG 1 /* remove me in 2.6.34 */
34#include <linux/kernel.h>
35#include <linux/mm.h>
36#include <linux/page-flags.h>
37#include <linux/sched.h>
38#include <linux/rmap.h>
39#include <linux/pagemap.h>
40#include <linux/swap.h>
41#include <linux/backing-dev.h>
42#include "internal.h"
43
44int sysctl_memory_failure_early_kill __read_mostly = 0;
45
46int sysctl_memory_failure_recovery __read_mostly = 1;
47
48atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
49
50/*
51 * Send all the processes who have the page mapped an ``action optional''
52 * signal.
53 */
54static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
55 unsigned long pfn)
56{
57 struct siginfo si;
58 int ret;
59
60 printk(KERN_ERR
61 "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
62 pfn, t->comm, t->pid);
63 si.si_signo = SIGBUS;
64 si.si_errno = 0;
65 si.si_code = BUS_MCEERR_AO;
66 si.si_addr = (void *)addr;
67#ifdef __ARCH_SI_TRAPNO
68 si.si_trapno = trapno;
69#endif
70 si.si_addr_lsb = PAGE_SHIFT;
71 /*
72 * Don't use force here, it's convenient if the signal
73 * can be temporarily blocked.
74 * This could cause a loop when the user sets SIGBUS
75 * to SIG_IGN, but hopefully noone will do that?
76 */
77 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
78 if (ret < 0)
79 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
80 t->comm, t->pid, ret);
81 return ret;
82}
83
84/*
85 * Kill all processes that have a poisoned page mapped and then isolate
86 * the page.
87 *
88 * General strategy:
89 * Find all processes having the page mapped and kill them.
90 * But we keep a page reference around so that the page is not
91 * actually freed yet.
92 * Then stash the page away
93 *
94 * There's no convenient way to get back to mapped processes
95 * from the VMAs. So do a brute-force search over all
96 * running processes.
97 *
98 * Remember that machine checks are not common (or rather
99 * if they are common you have other problems), so this shouldn't
100 * be a performance issue.
101 *
102 * Also there are some races possible while we get from the
103 * error detection to actually handle it.
104 */
105
106struct to_kill {
107 struct list_head nd;
108 struct task_struct *tsk;
109 unsigned long addr;
110 unsigned addr_valid:1;
111};
112
113/*
114 * Failure handling: if we can't find or can't kill a process there's
115 * not much we can do. We just print a message and ignore otherwise.
116 */
117
118/*
119 * Schedule a process for later kill.
120 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
121 * TBD would GFP_NOIO be enough?
122 */
123static void add_to_kill(struct task_struct *tsk, struct page *p,
124 struct vm_area_struct *vma,
125 struct list_head *to_kill,
126 struct to_kill **tkc)
127{
128 struct to_kill *tk;
129
130 if (*tkc) {
131 tk = *tkc;
132 *tkc = NULL;
133 } else {
134 tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
135 if (!tk) {
136 printk(KERN_ERR
137 "MCE: Out of memory while machine check handling\n");
138 return;
139 }
140 }
141 tk->addr = page_address_in_vma(p, vma);
142 tk->addr_valid = 1;
143
144 /*
145 * In theory we don't have to kill when the page was
146 * munmaped. But it could be also a mremap. Since that's
147 * likely very rare kill anyways just out of paranoia, but use
148 * a SIGKILL because the error is not contained anymore.
149 */
150 if (tk->addr == -EFAULT) {
151 pr_debug("MCE: Unable to find user space address %lx in %s\n",
152 page_to_pfn(p), tsk->comm);
153 tk->addr_valid = 0;
154 }
155 get_task_struct(tsk);
156 tk->tsk = tsk;
157 list_add_tail(&tk->nd, to_kill);
158}
159
160/*
161 * Kill the processes that have been collected earlier.
162 *
163 * Only do anything when DOIT is set, otherwise just free the list
164 * (this is used for clean pages which do not need killing)
165 * Also when FAIL is set do a force kill because something went
166 * wrong earlier.
167 */
168static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
169 int fail, unsigned long pfn)
170{
171 struct to_kill *tk, *next;
172
173 list_for_each_entry_safe (tk, next, to_kill, nd) {
174 if (doit) {
175 /*
176 * In case something went wrong with munmaping
177 * make sure the process doesn't catch the
178 * signal and then access the memory. Just kill it.
179 * the signal handlers
180 */
181 if (fail || tk->addr_valid == 0) {
182 printk(KERN_ERR
183 "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
184 pfn, tk->tsk->comm, tk->tsk->pid);
185 force_sig(SIGKILL, tk->tsk);
186 }
187
188 /*
189 * In theory the process could have mapped
190 * something else on the address in-between. We could
191 * check for that, but we need to tell the
192 * process anyways.
193 */
194 else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
195 pfn) < 0)
196 printk(KERN_ERR
197 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
198 pfn, tk->tsk->comm, tk->tsk->pid);
199 }
200 put_task_struct(tk->tsk);
201 kfree(tk);
202 }
203}
204
205static int task_early_kill(struct task_struct *tsk)
206{
207 if (!tsk->mm)
208 return 0;
209 if (tsk->flags & PF_MCE_PROCESS)
210 return !!(tsk->flags & PF_MCE_EARLY);
211 return sysctl_memory_failure_early_kill;
212}
213
214/*
215 * Collect processes when the error hit an anonymous page.
216 */
217static void collect_procs_anon(struct page *page, struct list_head *to_kill,
218 struct to_kill **tkc)
219{
220 struct vm_area_struct *vma;
221 struct task_struct *tsk;
222 struct anon_vma *av;
223
224 read_lock(&tasklist_lock);
225 av = page_lock_anon_vma(page);
226 if (av == NULL) /* Not actually mapped anymore */
227 goto out;
228 for_each_process (tsk) {
229 if (!task_early_kill(tsk))
230 continue;
231 list_for_each_entry (vma, &av->head, anon_vma_node) {
232 if (!page_mapped_in_vma(page, vma))
233 continue;
234 if (vma->vm_mm == tsk->mm)
235 add_to_kill(tsk, page, vma, to_kill, tkc);
236 }
237 }
238 page_unlock_anon_vma(av);
239out:
240 read_unlock(&tasklist_lock);
241}
242
243/*
244 * Collect processes when the error hit a file mapped page.
245 */
246static void collect_procs_file(struct page *page, struct list_head *to_kill,
247 struct to_kill **tkc)
248{
249 struct vm_area_struct *vma;
250 struct task_struct *tsk;
251 struct prio_tree_iter iter;
252 struct address_space *mapping = page->mapping;
253
254 /*
255 * A note on the locking order between the two locks.
256 * We don't rely on this particular order.
257 * If you have some other code that needs a different order
258 * feel free to switch them around. Or add a reverse link
259 * from mm_struct to task_struct, then this could be all
260 * done without taking tasklist_lock and looping over all tasks.
261 */
262
263 read_lock(&tasklist_lock);
264 spin_lock(&mapping->i_mmap_lock);
265 for_each_process(tsk) {
266 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
267
268 if (!task_early_kill(tsk))
269 continue;
270
271 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
272 pgoff) {
273 /*
274 * Send early kill signal to tasks where a vma covers
275 * the page but the corrupted page is not necessarily
276 * mapped it in its pte.
277 * Assume applications who requested early kill want
278 * to be informed of all such data corruptions.
279 */
280 if (vma->vm_mm == tsk->mm)
281 add_to_kill(tsk, page, vma, to_kill, tkc);
282 }
283 }
284 spin_unlock(&mapping->i_mmap_lock);
285 read_unlock(&tasklist_lock);
286}
287
288/*
289 * Collect the processes who have the corrupted page mapped to kill.
290 * This is done in two steps for locking reasons.
291 * First preallocate one tokill structure outside the spin locks,
292 * so that we can kill at least one process reasonably reliable.
293 */
294static void collect_procs(struct page *page, struct list_head *tokill)
295{
296 struct to_kill *tk;
297
298 if (!page->mapping)
299 return;
300
301 tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
302 if (!tk)
303 return;
304 if (PageAnon(page))
305 collect_procs_anon(page, tokill, &tk);
306 else
307 collect_procs_file(page, tokill, &tk);
308 kfree(tk);
309}
310
311/*
312 * Error handlers for various types of pages.
313 */
314
315enum outcome {
316 FAILED, /* Error handling failed */
317 DELAYED, /* Will be handled later */
318 IGNORED, /* Error safely ignored */
319 RECOVERED, /* Successfully recovered */
320};
321
322static const char *action_name[] = {
323 [FAILED] = "Failed",
324 [DELAYED] = "Delayed",
325 [IGNORED] = "Ignored",
326 [RECOVERED] = "Recovered",
327};
328
329/*
330 * Error hit kernel page.
331 * Do nothing, try to be lucky and not touch this instead. For a few cases we
332 * could be more sophisticated.
333 */
334static int me_kernel(struct page *p, unsigned long pfn)
335{
336 return DELAYED;
337}
338
339/*
340 * Already poisoned page.
341 */
342static int me_ignore(struct page *p, unsigned long pfn)
343{
344 return IGNORED;
345}
346
347/*
348 * Page in unknown state. Do nothing.
349 */
350static int me_unknown(struct page *p, unsigned long pfn)
351{
352 printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
353 return FAILED;
354}
355
356/*
357 * Free memory
358 */
359static int me_free(struct page *p, unsigned long pfn)
360{
361 return DELAYED;
362}
363
364/*
365 * Clean (or cleaned) page cache page.
366 */
367static int me_pagecache_clean(struct page *p, unsigned long pfn)
368{
369 int err;
370 int ret = FAILED;
371 struct address_space *mapping;
372
373 if (!isolate_lru_page(p))
374 page_cache_release(p);
375
376 /*
377 * For anonymous pages we're done the only reference left
378 * should be the one m_f() holds.
379 */
380 if (PageAnon(p))
381 return RECOVERED;
382
383 /*
384 * Now truncate the page in the page cache. This is really
385 * more like a "temporary hole punch"
386 * Don't do this for block devices when someone else
387 * has a reference, because it could be file system metadata
388 * and that's not safe to truncate.
389 */
390 mapping = page_mapping(p);
391 if (!mapping) {
392 /*
393 * Page has been teared down in the meanwhile
394 */
395 return FAILED;
396 }
397
398 /*
399 * Truncation is a bit tricky. Enable it per file system for now.
400 *
401 * Open: to take i_mutex or not for this? Right now we don't.
402 */
403 if (mapping->a_ops->error_remove_page) {
404 err = mapping->a_ops->error_remove_page(mapping, p);
405 if (err != 0) {
406 printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
407 pfn, err);
408 } else if (page_has_private(p) &&
409 !try_to_release_page(p, GFP_NOIO)) {
410 pr_debug("MCE %#lx: failed to release buffers\n", pfn);
411 } else {
412 ret = RECOVERED;
413 }
414 } else {
415 /*
416 * If the file system doesn't support it just invalidate
417 * This fails on dirty or anything with private pages
418 */
419 if (invalidate_inode_page(p))
420 ret = RECOVERED;
421 else
422 printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
423 pfn);
424 }
425 return ret;
426}
427
428/*
429 * Dirty cache page page
430 * Issues: when the error hit a hole page the error is not properly
431 * propagated.
432 */
433static int me_pagecache_dirty(struct page *p, unsigned long pfn)
434{
435 struct address_space *mapping = page_mapping(p);
436
437 SetPageError(p);
438 /* TBD: print more information about the file. */
439 if (mapping) {
440 /*
441 * IO error will be reported by write(), fsync(), etc.
442 * who check the mapping.
443 * This way the application knows that something went
444 * wrong with its dirty file data.
445 *
446 * There's one open issue:
447 *
448 * The EIO will be only reported on the next IO
449 * operation and then cleared through the IO map.
450 * Normally Linux has two mechanisms to pass IO error
451 * first through the AS_EIO flag in the address space
452 * and then through the PageError flag in the page.
453 * Since we drop pages on memory failure handling the
454 * only mechanism open to use is through AS_AIO.
455 *
456 * This has the disadvantage that it gets cleared on
457 * the first operation that returns an error, while
458 * the PageError bit is more sticky and only cleared
459 * when the page is reread or dropped. If an
460 * application assumes it will always get error on
461 * fsync, but does other operations on the fd before
462 * and the page is dropped inbetween then the error
463 * will not be properly reported.
464 *
465 * This can already happen even without hwpoisoned
466 * pages: first on metadata IO errors (which only
467 * report through AS_EIO) or when the page is dropped
468 * at the wrong time.
469 *
470 * So right now we assume that the application DTRT on
471 * the first EIO, but we're not worse than other parts
472 * of the kernel.
473 */
474 mapping_set_error(mapping, EIO);
475 }
476
477 return me_pagecache_clean(p, pfn);
478}
479
480/*
481 * Clean and dirty swap cache.
482 *
483 * Dirty swap cache page is tricky to handle. The page could live both in page
484 * cache and swap cache(ie. page is freshly swapped in). So it could be
485 * referenced concurrently by 2 types of PTEs:
486 * normal PTEs and swap PTEs. We try to handle them consistently by calling
487 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
488 * and then
489 * - clear dirty bit to prevent IO
490 * - remove from LRU
491 * - but keep in the swap cache, so that when we return to it on
492 * a later page fault, we know the application is accessing
493 * corrupted data and shall be killed (we installed simple
494 * interception code in do_swap_page to catch it).
495 *
496 * Clean swap cache pages can be directly isolated. A later page fault will
497 * bring in the known good data from disk.
498 */
499static int me_swapcache_dirty(struct page *p, unsigned long pfn)
500{
501 int ret = FAILED;
502
503 ClearPageDirty(p);
504 /* Trigger EIO in shmem: */
505 ClearPageUptodate(p);
506
507 if (!isolate_lru_page(p)) {
508 page_cache_release(p);
509 ret = DELAYED;
510 }
511
512 return ret;
513}
514
515static int me_swapcache_clean(struct page *p, unsigned long pfn)
516{
517 int ret = FAILED;
518
519 if (!isolate_lru_page(p)) {
520 page_cache_release(p);
521 ret = RECOVERED;
522 }
523 delete_from_swap_cache(p);
524 return ret;
525}
526
527/*
528 * Huge pages. Needs work.
529 * Issues:
530 * No rmap support so we cannot find the original mapper. In theory could walk
531 * all MMs and look for the mappings, but that would be non atomic and racy.
532 * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
533 * like just walking the current process and hoping it has it mapped (that
534 * should be usually true for the common "shared database cache" case)
535 * Should handle free huge pages and dequeue them too, but this needs to
536 * handle huge page accounting correctly.
537 */
538static int me_huge_page(struct page *p, unsigned long pfn)
539{
540 return FAILED;
541}
542
543/*
544 * Various page states we can handle.
545 *
546 * A page state is defined by its current page->flags bits.
547 * The table matches them in order and calls the right handler.
548 *
549 * This is quite tricky because we can access page at any time
550 * in its live cycle, so all accesses have to be extremly careful.
551 *
552 * This is not complete. More states could be added.
553 * For any missing state don't attempt recovery.
554 */
555
556#define dirty (1UL << PG_dirty)
557#define sc (1UL << PG_swapcache)
558#define unevict (1UL << PG_unevictable)
559#define mlock (1UL << PG_mlocked)
560#define writeback (1UL << PG_writeback)
561#define lru (1UL << PG_lru)
562#define swapbacked (1UL << PG_swapbacked)
563#define head (1UL << PG_head)
564#define tail (1UL << PG_tail)
565#define compound (1UL << PG_compound)
566#define slab (1UL << PG_slab)
567#define buddy (1UL << PG_buddy)
568#define reserved (1UL << PG_reserved)
569
570static struct page_state {
571 unsigned long mask;
572 unsigned long res;
573 char *msg;
574 int (*action)(struct page *p, unsigned long pfn);
575} error_states[] = {
576 { reserved, reserved, "reserved kernel", me_ignore },
577 { buddy, buddy, "free kernel", me_free },
578
579 /*
580 * Could in theory check if slab page is free or if we can drop
581 * currently unused objects without touching them. But just
582 * treat it as standard kernel for now.
583 */
584 { slab, slab, "kernel slab", me_kernel },
585
586#ifdef CONFIG_PAGEFLAGS_EXTENDED
587 { head, head, "huge", me_huge_page },
588 { tail, tail, "huge", me_huge_page },
589#else
590 { compound, compound, "huge", me_huge_page },
591#endif
592
593 { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },
594 { sc|dirty, sc, "swapcache", me_swapcache_clean },
595
596 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
597 { unevict, unevict, "unevictable LRU", me_pagecache_clean},
598
599#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
600 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
601 { mlock, mlock, "mlocked LRU", me_pagecache_clean },
602#endif
603
604 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
605 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
606 { swapbacked, swapbacked, "anonymous", me_pagecache_clean },
607
608 /*
609 * Catchall entry: must be at end.
610 */
611 { 0, 0, "unknown page state", me_unknown },
612};
613
614#undef lru
615
616static void action_result(unsigned long pfn, char *msg, int result)
617{
618 struct page *page = NULL;
619 if (pfn_valid(pfn))
620 page = pfn_to_page(pfn);
621
622 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
623 pfn,
624 page && PageDirty(page) ? "dirty " : "",
625 msg, action_name[result]);
626}
627
628static int page_action(struct page_state *ps, struct page *p,
629 unsigned long pfn, int ref)
630{
631 int result;
632
633 result = ps->action(p, pfn);
634 action_result(pfn, ps->msg, result);
635 if (page_count(p) != 1 + ref)
636 printk(KERN_ERR
637 "MCE %#lx: %s page still referenced by %d users\n",
638 pfn, ps->msg, page_count(p) - 1);
639
640 /* Could do more checks here if page looks ok */
641 /*
642 * Could adjust zone counters here to correct for the missing page.
643 */
644
645 return result == RECOVERED ? 0 : -EBUSY;
646}
647
/* How often to retry try_to_unmap() before giving up on racing unmaps. */
#define N_UNMAP_TRIES 5

/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 *
 * The statement order below is load-bearing: the dirty state must be
 * settled (page_mkclean) and the victim processes collected
 * (collect_procs) BEFORE try_to_unmap tears down the rmap structures.
 */
static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
				int trapno)
{
	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);	/* processes to signal, filled by collect_procs() */
	int ret;
	int i;
	int kill = 1;		/* default: SIGBUS the mapping processes */

	/* Reserved, compound and slab pages have no user mappings to unmap. */
	if (PageReserved(p) || PageCompound(p) || PageSlab(p))
		return;

	/*
	 * NOTE(review): presumably drains per-CPU pagevecs so a freshly
	 * faulted page reaches the LRU and rmap can find it -- confirm
	 * against lru_add_drain_all() semantics.
	 */
	if (!PageLRU(p))
		lru_add_drain_all();

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
	if (!page_mapped(p))
		return;

	if (PageSwapCache(p)) {
		printk(KERN_ERR
		       "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from PTEs to struct page first, because we
	 * need this to decide if we should kill or just drop the page.
	 */
	mapping = page_mapping(p);
	if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(p)) {
			/* Some PTE was dirty: treat the page as dirty data. */
			SetPageDirty(p);
		} else {
			/* Clean pagecache copy: safe to drop, nobody to kill. */
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			printk(KERN_INFO
	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * First collect all the processes that have the page
	 * mapped in dirty form. This has to be done before try_to_unmap,
	 * because ttu takes the rmap data structures down.
	 *
	 * Error handling: We ignore errors here because
	 * there's nothing that can be done.
	 */
	if (kill)
		collect_procs(p, &tokill);

	/*
	 * try_to_unmap can fail temporarily due to races.
	 * Try a few times (RED-PEN better strategy?)
	 */
	for (i = 0; i < N_UNMAP_TRIES; i++) {
		ret = try_to_unmap(p, ttu);
		if (ret == SWAP_SUCCESS)
			break;
		pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn,  ret);
	}

	if (ret != SWAP_SUCCESS)
		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
				pfn, page_mapcount(p));

	/*
	 * Now that the dirty bit has been propagated to the
	 * struct page and all unmaps done we can decide if
	 * killing is needed or not. Only kill when the page
	 * was dirty, otherwise the tokill list is merely
	 * freed. When there was a problem unmapping earlier
	 * use a more force-full uncatchable kill to prevent
	 * any accesses to the poisoned memory.
	 */
	kill_procs_ao(&tokill, !!PageDirty(p), trapno,
		      ret != SWAP_SUCCESS, pfn);
}
738
739int __memory_failure(unsigned long pfn, int trapno, int ref)
740{
741 struct page_state *ps;
742 struct page *p;
743 int res;
744
745 if (!sysctl_memory_failure_recovery)
746 panic("Memory failure from trap %d on page %lx", trapno, pfn);
747
748 if (!pfn_valid(pfn)) {
749 action_result(pfn, "memory outside kernel control", IGNORED);
750 return -EIO;
751 }
752
753 p = pfn_to_page(pfn);
754 if (TestSetPageHWPoison(p)) {
755 action_result(pfn, "already hardware poisoned", IGNORED);
756 return 0;
757 }
758
759 atomic_long_add(1, &mce_bad_pages);
760
761 /*
762 * We need/can do nothing about count=0 pages.
763 * 1) it's a free page, and therefore in safe hand:
764 * prep_new_page() will be the gate keeper.
765 * 2) it's part of a non-compound high order page.
766 * Implies some kernel user: cannot stop them from
767 * R/W the page; let's pray that the page has been
768 * used and will be freed some time later.
769 * In fact it's dangerous to directly bump up page count from 0,
770 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
771 */
772 if (!get_page_unless_zero(compound_head(p))) {
773 action_result(pfn, "free or high order kernel", IGNORED);
774 return PageBuddy(compound_head(p)) ? 0 : -EBUSY;
775 }
776
777 /*
778 * Lock the page and wait for writeback to finish.
779 * It's very difficult to mess with pages currently under IO
780 * and in many cases impossible, so we just avoid it here.
781 */
782 lock_page_nosync(p);
783 wait_on_page_writeback(p);
784
785 /*
786 * Now take care of user space mappings.
787 */
788 hwpoison_user_mappings(p, pfn, trapno);
789
790 /*
791 * Torn down by someone else?
792 */
793 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
794 action_result(pfn, "already truncated LRU", IGNORED);
795 res = 0;
796 goto out;
797 }
798
799 res = -EBUSY;
800 for (ps = error_states;; ps++) {
801 if ((p->flags & ps->mask) == ps->res) {
802 res = page_action(ps, p, pfn, ref);
803 break;
804 }
805 }
806out:
807 unlock_page(p);
808 return res;
809}
810EXPORT_SYMBOL_GPL(__memory_failure);
811
/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 */
void memory_failure(unsigned long pfn, int trapno)
{
	/* External callers hold no extra page reference, hence ref = 0. */
	__memory_failure(pfn, trapno, 0);
}
diff --git a/mm/memory.c b/mm/memory.c
index e8f63d9961ea..7e91b5f9f690 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -45,6 +45,7 @@
45#include <linux/swap.h> 45#include <linux/swap.h>
46#include <linux/highmem.h> 46#include <linux/highmem.h>
47#include <linux/pagemap.h> 47#include <linux/pagemap.h>
48#include <linux/ksm.h>
48#include <linux/rmap.h> 49#include <linux/rmap.h>
49#include <linux/module.h> 50#include <linux/module.h>
50#include <linux/delayacct.h> 51#include <linux/delayacct.h>
@@ -107,6 +108,18 @@ static int __init disable_randmaps(char *s)
107} 108}
108__setup("norandmaps", disable_randmaps); 109__setup("norandmaps", disable_randmaps);
109 110
111unsigned long zero_pfn __read_mostly;
112unsigned long highest_memmap_pfn __read_mostly;
113
114/*
115 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
116 */
117static int __init init_zero_pfn(void)
118{
119 zero_pfn = page_to_pfn(ZERO_PAGE(0));
120 return 0;
121}
122core_initcall(init_zero_pfn);
110 123
111/* 124/*
112 * If a p?d_bad entry is found while walking page tables, report 125 * If a p?d_bad entry is found while walking page tables, report
@@ -284,7 +297,8 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
284 unsigned long addr = vma->vm_start; 297 unsigned long addr = vma->vm_start;
285 298
286 /* 299 /*
287 * Hide vma from rmap and vmtruncate before freeing pgtables 300 * Hide vma from rmap and truncate_pagecache before freeing
301 * pgtables
288 */ 302 */
289 anon_vma_unlink(vma); 303 anon_vma_unlink(vma);
290 unlink_file_vma(vma); 304 unlink_file_vma(vma);
@@ -443,6 +457,20 @@ static inline int is_cow_mapping(unsigned int flags)
443 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 457 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
444} 458}
445 459
460#ifndef is_zero_pfn
461static inline int is_zero_pfn(unsigned long pfn)
462{
463 return pfn == zero_pfn;
464}
465#endif
466
467#ifndef my_zero_pfn
468static inline unsigned long my_zero_pfn(unsigned long addr)
469{
470 return zero_pfn;
471}
472#endif
473
446/* 474/*
447 * vm_normal_page -- This function gets the "struct page" associated with a pte. 475 * vm_normal_page -- This function gets the "struct page" associated with a pte.
448 * 476 *
@@ -498,7 +526,9 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
498 if (HAVE_PTE_SPECIAL) { 526 if (HAVE_PTE_SPECIAL) {
499 if (likely(!pte_special(pte))) 527 if (likely(!pte_special(pte)))
500 goto check_pfn; 528 goto check_pfn;
501 if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) 529 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
530 return NULL;
531 if (!is_zero_pfn(pfn))
502 print_bad_pte(vma, addr, pte, NULL); 532 print_bad_pte(vma, addr, pte, NULL);
503 return NULL; 533 return NULL;
504 } 534 }
@@ -520,6 +550,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
520 } 550 }
521 } 551 }
522 552
553 if (is_zero_pfn(pfn))
554 return NULL;
523check_pfn: 555check_pfn:
524 if (unlikely(pfn > highest_memmap_pfn)) { 556 if (unlikely(pfn > highest_memmap_pfn)) {
525 print_bad_pte(vma, addr, pte, NULL); 557 print_bad_pte(vma, addr, pte, NULL);
@@ -597,8 +629,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
597 page = vm_normal_page(vma, addr, pte); 629 page = vm_normal_page(vma, addr, pte);
598 if (page) { 630 if (page) {
599 get_page(page); 631 get_page(page);
600 page_dup_rmap(page, vma, addr); 632 page_dup_rmap(page);
601 rss[!!PageAnon(page)]++; 633 rss[PageAnon(page)]++;
602 } 634 }
603 635
604out_set_pte: 636out_set_pte:
@@ -1143,9 +1175,14 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1143 goto no_page; 1175 goto no_page;
1144 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1176 if ((flags & FOLL_WRITE) && !pte_write(pte))
1145 goto unlock; 1177 goto unlock;
1178
1146 page = vm_normal_page(vma, address, pte); 1179 page = vm_normal_page(vma, address, pte);
1147 if (unlikely(!page)) 1180 if (unlikely(!page)) {
1148 goto bad_page; 1181 if ((flags & FOLL_DUMP) ||
1182 !is_zero_pfn(pte_pfn(pte)))
1183 goto bad_page;
1184 page = pte_page(pte);
1185 }
1149 1186
1150 if (flags & FOLL_GET) 1187 if (flags & FOLL_GET)
1151 get_page(page); 1188 get_page(page);
@@ -1173,65 +1210,46 @@ no_page:
1173 pte_unmap_unlock(ptep, ptl); 1210 pte_unmap_unlock(ptep, ptl);
1174 if (!pte_none(pte)) 1211 if (!pte_none(pte))
1175 return page; 1212 return page;
1176 /* Fall through to ZERO_PAGE handling */ 1213
1177no_page_table: 1214no_page_table:
1178 /* 1215 /*
1179 * When core dumping an enormous anonymous area that nobody 1216 * When core dumping an enormous anonymous area that nobody
1180 * has touched so far, we don't want to allocate page tables. 1217 * has touched so far, we don't want to allocate unnecessary pages or
1218 * page tables. Return error instead of NULL to skip handle_mm_fault,
1219 * then get_dump_page() will return NULL to leave a hole in the dump.
1220 * But we can only make this optimization where a hole would surely
1221 * be zero-filled if handle_mm_fault() actually did handle it.
1181 */ 1222 */
1182 if (flags & FOLL_ANON) { 1223 if ((flags & FOLL_DUMP) &&
1183 page = ZERO_PAGE(0); 1224 (!vma->vm_ops || !vma->vm_ops->fault))
1184 if (flags & FOLL_GET) 1225 return ERR_PTR(-EFAULT);
1185 get_page(page);
1186 BUG_ON(flags & FOLL_WRITE);
1187 }
1188 return page; 1226 return page;
1189} 1227}
1190 1228
1191/* Can we do the FOLL_ANON optimization? */
1192static inline int use_zero_page(struct vm_area_struct *vma)
1193{
1194 /*
1195 * We don't want to optimize FOLL_ANON for make_pages_present()
1196 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
1197 * we want to get the page from the page tables to make sure
1198 * that we serialize and update with any other user of that
1199 * mapping.
1200 */
1201 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1202 return 0;
1203 /*
1204 * And if we have a fault routine, it's not an anonymous region.
1205 */
1206 return !vma->vm_ops || !vma->vm_ops->fault;
1207}
1208
1209
1210
1211int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1229int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1212 unsigned long start, int nr_pages, int flags, 1230 unsigned long start, int nr_pages, unsigned int gup_flags,
1213 struct page **pages, struct vm_area_struct **vmas) 1231 struct page **pages, struct vm_area_struct **vmas)
1214{ 1232{
1215 int i; 1233 int i;
1216 unsigned int vm_flags = 0; 1234 unsigned long vm_flags;
1217 int write = !!(flags & GUP_FLAGS_WRITE);
1218 int force = !!(flags & GUP_FLAGS_FORCE);
1219 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1220 int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
1221 1235
1222 if (nr_pages <= 0) 1236 if (nr_pages <= 0)
1223 return 0; 1237 return 0;
1238
1239 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
1240
1224 /* 1241 /*
1225 * Require read or write permissions. 1242 * Require read or write permissions.
1226 * If 'force' is set, we only require the "MAY" flags. 1243 * If FOLL_FORCE is set, we only require the "MAY" flags.
1227 */ 1244 */
1228 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 1245 vm_flags = (gup_flags & FOLL_WRITE) ?
1229 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 1246 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1247 vm_flags &= (gup_flags & FOLL_FORCE) ?
1248 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1230 i = 0; 1249 i = 0;
1231 1250
1232 do { 1251 do {
1233 struct vm_area_struct *vma; 1252 struct vm_area_struct *vma;
1234 unsigned int foll_flags;
1235 1253
1236 vma = find_extend_vma(mm, start); 1254 vma = find_extend_vma(mm, start);
1237 if (!vma && in_gate_area(tsk, start)) { 1255 if (!vma && in_gate_area(tsk, start)) {
@@ -1243,7 +1261,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1243 pte_t *pte; 1261 pte_t *pte;
1244 1262
1245 /* user gate pages are read-only */ 1263 /* user gate pages are read-only */
1246 if (!ignore && write) 1264 if (gup_flags & FOLL_WRITE)
1247 return i ? : -EFAULT; 1265 return i ? : -EFAULT;
1248 if (pg > TASK_SIZE) 1266 if (pg > TASK_SIZE)
1249 pgd = pgd_offset_k(pg); 1267 pgd = pgd_offset_k(pg);
@@ -1277,38 +1295,26 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1277 1295
1278 if (!vma || 1296 if (!vma ||
1279 (vma->vm_flags & (VM_IO | VM_PFNMAP)) || 1297 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1280 (!ignore && !(vm_flags & vma->vm_flags))) 1298 !(vm_flags & vma->vm_flags))
1281 return i ? : -EFAULT; 1299 return i ? : -EFAULT;
1282 1300
1283 if (is_vm_hugetlb_page(vma)) { 1301 if (is_vm_hugetlb_page(vma)) {
1284 i = follow_hugetlb_page(mm, vma, pages, vmas, 1302 i = follow_hugetlb_page(mm, vma, pages, vmas,
1285 &start, &nr_pages, i, write); 1303 &start, &nr_pages, i, gup_flags);
1286 continue; 1304 continue;
1287 } 1305 }
1288 1306
1289 foll_flags = FOLL_TOUCH;
1290 if (pages)
1291 foll_flags |= FOLL_GET;
1292 if (!write && use_zero_page(vma))
1293 foll_flags |= FOLL_ANON;
1294
1295 do { 1307 do {
1296 struct page *page; 1308 struct page *page;
1309 unsigned int foll_flags = gup_flags;
1297 1310
1298 /* 1311 /*
1299 * If we have a pending SIGKILL, don't keep faulting 1312 * If we have a pending SIGKILL, don't keep faulting
1300 * pages and potentially allocating memory, unless 1313 * pages and potentially allocating memory.
1301 * current is handling munlock--e.g., on exit. In
1302 * that case, we are not allocating memory. Rather,
1303 * we're only unlocking already resident/mapped pages.
1304 */ 1314 */
1305 if (unlikely(!ignore_sigkill && 1315 if (unlikely(fatal_signal_pending(current)))
1306 fatal_signal_pending(current)))
1307 return i ? i : -ERESTARTSYS; 1316 return i ? i : -ERESTARTSYS;
1308 1317
1309 if (write)
1310 foll_flags |= FOLL_WRITE;
1311
1312 cond_resched(); 1318 cond_resched();
1313 while (!(page = follow_page(vma, start, foll_flags))) { 1319 while (!(page = follow_page(vma, start, foll_flags))) {
1314 int ret; 1320 int ret;
@@ -1320,7 +1326,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1320 if (ret & VM_FAULT_ERROR) { 1326 if (ret & VM_FAULT_ERROR) {
1321 if (ret & VM_FAULT_OOM) 1327 if (ret & VM_FAULT_OOM)
1322 return i ? i : -ENOMEM; 1328 return i ? i : -ENOMEM;
1323 else if (ret & VM_FAULT_SIGBUS) 1329 if (ret &
1330 (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
1324 return i ? i : -EFAULT; 1331 return i ? i : -EFAULT;
1325 BUG(); 1332 BUG();
1326 } 1333 }
@@ -1419,18 +1426,47 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1419 unsigned long start, int nr_pages, int write, int force, 1426 unsigned long start, int nr_pages, int write, int force,
1420 struct page **pages, struct vm_area_struct **vmas) 1427 struct page **pages, struct vm_area_struct **vmas)
1421{ 1428{
1422 int flags = 0; 1429 int flags = FOLL_TOUCH;
1423 1430
1431 if (pages)
1432 flags |= FOLL_GET;
1424 if (write) 1433 if (write)
1425 flags |= GUP_FLAGS_WRITE; 1434 flags |= FOLL_WRITE;
1426 if (force) 1435 if (force)
1427 flags |= GUP_FLAGS_FORCE; 1436 flags |= FOLL_FORCE;
1428 1437
1429 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 1438 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
1430} 1439}
1431
1432EXPORT_SYMBOL(get_user_pages); 1440EXPORT_SYMBOL(get_user_pages);
1433 1441
1442/**
1443 * get_dump_page() - pin user page in memory while writing it to core dump
1444 * @addr: user address
1445 *
1446 * Returns struct page pointer of user page pinned for dump,
1447 * to be freed afterwards by page_cache_release() or put_page().
1448 *
1449 * Returns NULL on any kind of failure - a hole must then be inserted into
1450 * the corefile, to preserve alignment with its headers; and also returns
1451 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
1452 * allowing a hole to be left in the corefile to save diskspace.
1453 *
1454 * Called without mmap_sem, but after all other threads have been killed.
1455 */
1456#ifdef CONFIG_ELF_CORE
1457struct page *get_dump_page(unsigned long addr)
1458{
1459 struct vm_area_struct *vma;
1460 struct page *page;
1461
1462 if (__get_user_pages(current, current->mm, addr, 1,
1463 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
1464 return NULL;
1465 flush_cache_page(vma, addr, page_to_pfn(page));
1466 return page;
1467}
1468#endif /* CONFIG_ELF_CORE */
1469
1434pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1470pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
1435 spinlock_t **ptl) 1471 spinlock_t **ptl)
1436{ 1472{
@@ -1608,7 +1644,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1608 * If we don't have pte special, then we have to use the pfn_valid() 1644 * If we don't have pte special, then we have to use the pfn_valid()
1609 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* 1645 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
1610 * refcount the page if pfn_valid is true (hence insert_page rather 1646 * refcount the page if pfn_valid is true (hence insert_page rather
1611 * than insert_pfn). 1647 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
1648 * without pte special, it would there be refcounted as a normal page.
1612 */ 1649 */
1613 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { 1650 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
1614 struct page *page; 1651 struct page *page;
@@ -1974,7 +2011,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1974 * Take out anonymous pages first, anonymous shared vmas are 2011 * Take out anonymous pages first, anonymous shared vmas are
1975 * not dirty accountable. 2012 * not dirty accountable.
1976 */ 2013 */
1977 if (PageAnon(old_page)) { 2014 if (PageAnon(old_page) && !PageKsm(old_page)) {
1978 if (!trylock_page(old_page)) { 2015 if (!trylock_page(old_page)) {
1979 page_cache_get(old_page); 2016 page_cache_get(old_page);
1980 pte_unmap_unlock(page_table, ptl); 2017 pte_unmap_unlock(page_table, ptl);
@@ -2075,10 +2112,19 @@ gotten:
2075 2112
2076 if (unlikely(anon_vma_prepare(vma))) 2113 if (unlikely(anon_vma_prepare(vma)))
2077 goto oom; 2114 goto oom;
2078 VM_BUG_ON(old_page == ZERO_PAGE(0)); 2115
2079 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 2116 if (is_zero_pfn(pte_pfn(orig_pte))) {
2080 if (!new_page) 2117 new_page = alloc_zeroed_user_highpage_movable(vma, address);
2081 goto oom; 2118 if (!new_page)
2119 goto oom;
2120 } else {
2121 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2122 if (!new_page)
2123 goto oom;
2124 cow_user_page(new_page, old_page, address, vma);
2125 }
2126 __SetPageUptodate(new_page);
2127
2082 /* 2128 /*
2083 * Don't let another task, with possibly unlocked vma, 2129 * Don't let another task, with possibly unlocked vma,
2084 * keep the mlocked page. 2130 * keep the mlocked page.
@@ -2088,8 +2134,6 @@ gotten:
2088 clear_page_mlock(old_page); 2134 clear_page_mlock(old_page);
2089 unlock_page(old_page); 2135 unlock_page(old_page);
2090 } 2136 }
2091 cow_user_page(new_page, old_page, address, vma);
2092 __SetPageUptodate(new_page);
2093 2137
2094 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2138 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2095 goto oom_free_new; 2139 goto oom_free_new;
@@ -2115,9 +2159,14 @@ gotten:
2115 * seen in the presence of one thread doing SMC and another 2159 * seen in the presence of one thread doing SMC and another
2116 * thread doing COW. 2160 * thread doing COW.
2117 */ 2161 */
2118 ptep_clear_flush_notify(vma, address, page_table); 2162 ptep_clear_flush(vma, address, page_table);
2119 page_add_new_anon_rmap(new_page, vma, address); 2163 page_add_new_anon_rmap(new_page, vma, address);
2120 set_pte_at(mm, address, page_table, entry); 2164 /*
2165 * We call the notify macro here because, when using secondary
2166 * mmu page tables (such as kvm shadow page tables), we want the
2167 * new page to be mapped directly into the secondary page table.
2168 */
2169 set_pte_at_notify(mm, address, page_table, entry);
2121 update_mmu_cache(vma, address, entry); 2170 update_mmu_cache(vma, address, entry);
2122 if (old_page) { 2171 if (old_page) {
2123 /* 2172 /*
@@ -2360,7 +2409,7 @@ restart:
2360 * @mapping: the address space containing mmaps to be unmapped. 2409 * @mapping: the address space containing mmaps to be unmapped.
2361 * @holebegin: byte in first page to unmap, relative to the start of 2410 * @holebegin: byte in first page to unmap, relative to the start of
2362 * the underlying file. This will be rounded down to a PAGE_SIZE 2411 * the underlying file. This will be rounded down to a PAGE_SIZE
2363 * boundary. Note that this is different from vmtruncate(), which 2412 * boundary. Note that this is different from truncate_pagecache(), which
2364 * must keep the partial page. In contrast, we must get rid of 2413 * must keep the partial page. In contrast, we must get rid of
2365 * partial pages. 2414 * partial pages.
2366 * @holelen: size of prospective hole in bytes. This will be rounded 2415 * @holelen: size of prospective hole in bytes. This will be rounded
@@ -2411,63 +2460,6 @@ void unmap_mapping_range(struct address_space *mapping,
2411} 2460}
2412EXPORT_SYMBOL(unmap_mapping_range); 2461EXPORT_SYMBOL(unmap_mapping_range);
2413 2462
2414/**
2415 * vmtruncate - unmap mappings "freed" by truncate() syscall
2416 * @inode: inode of the file used
2417 * @offset: file offset to start truncating
2418 *
2419 * NOTE! We have to be ready to update the memory sharing
2420 * between the file and the memory map for a potential last
2421 * incomplete page. Ugly, but necessary.
2422 */
2423int vmtruncate(struct inode * inode, loff_t offset)
2424{
2425 if (inode->i_size < offset) {
2426 unsigned long limit;
2427
2428 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2429 if (limit != RLIM_INFINITY && offset > limit)
2430 goto out_sig;
2431 if (offset > inode->i_sb->s_maxbytes)
2432 goto out_big;
2433 i_size_write(inode, offset);
2434 } else {
2435 struct address_space *mapping = inode->i_mapping;
2436
2437 /*
2438 * truncation of in-use swapfiles is disallowed - it would
2439 * cause subsequent swapout to scribble on the now-freed
2440 * blocks.
2441 */
2442 if (IS_SWAPFILE(inode))
2443 return -ETXTBSY;
2444 i_size_write(inode, offset);
2445
2446 /*
2447 * unmap_mapping_range is called twice, first simply for
2448 * efficiency so that truncate_inode_pages does fewer
2449 * single-page unmaps. However after this first call, and
2450 * before truncate_inode_pages finishes, it is possible for
2451 * private pages to be COWed, which remain after
2452 * truncate_inode_pages finishes, hence the second
2453 * unmap_mapping_range call must be made for correctness.
2454 */
2455 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2456 truncate_inode_pages(mapping, offset);
2457 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2458 }
2459
2460 if (inode->i_op->truncate)
2461 inode->i_op->truncate(inode);
2462 return 0;
2463
2464out_sig:
2465 send_sig(SIGXFSZ, current, 0);
2466out_big:
2467 return -EFBIG;
2468}
2469EXPORT_SYMBOL(vmtruncate);
2470
2471int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) 2463int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2472{ 2464{
2473 struct address_space *mapping = inode->i_mapping; 2465 struct address_space *mapping = inode->i_mapping;
@@ -2512,8 +2504,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2512 goto out; 2504 goto out;
2513 2505
2514 entry = pte_to_swp_entry(orig_pte); 2506 entry = pte_to_swp_entry(orig_pte);
2515 if (is_migration_entry(entry)) { 2507 if (unlikely(non_swap_entry(entry))) {
2516 migration_entry_wait(mm, pmd, address); 2508 if (is_migration_entry(entry)) {
2509 migration_entry_wait(mm, pmd, address);
2510 } else if (is_hwpoison_entry(entry)) {
2511 ret = VM_FAULT_HWPOISON;
2512 } else {
2513 print_bad_pte(vma, address, orig_pte, NULL);
2514 ret = VM_FAULT_OOM;
2515 }
2517 goto out; 2516 goto out;
2518 } 2517 }
2519 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2518 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
@@ -2537,6 +2536,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2537 /* Had to read the page from swap area: Major fault */ 2536 /* Had to read the page from swap area: Major fault */
2538 ret = VM_FAULT_MAJOR; 2537 ret = VM_FAULT_MAJOR;
2539 count_vm_event(PGMAJFAULT); 2538 count_vm_event(PGMAJFAULT);
2539 } else if (PageHWPoison(page)) {
2540 ret = VM_FAULT_HWPOISON;
2541 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2542 goto out;
2540 } 2543 }
2541 2544
2542 lock_page(page); 2545 lock_page(page);
@@ -2625,6 +2628,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2625 spinlock_t *ptl; 2628 spinlock_t *ptl;
2626 pte_t entry; 2629 pte_t entry;
2627 2630
2631 if (!(flags & FAULT_FLAG_WRITE)) {
2632 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
2633 vma->vm_page_prot));
2634 ptl = pte_lockptr(mm, pmd);
2635 spin_lock(ptl);
2636 if (!pte_none(*page_table))
2637 goto unlock;
2638 goto setpte;
2639 }
2640
2628 /* Allocate our own private page. */ 2641 /* Allocate our own private page. */
2629 pte_unmap(page_table); 2642 pte_unmap(page_table);
2630 2643
@@ -2639,13 +2652,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2639 goto oom_free_page; 2652 goto oom_free_page;
2640 2653
2641 entry = mk_pte(page, vma->vm_page_prot); 2654 entry = mk_pte(page, vma->vm_page_prot);
2642 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2655 if (vma->vm_flags & VM_WRITE)
2656 entry = pte_mkwrite(pte_mkdirty(entry));
2643 2657
2644 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2658 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2645 if (!pte_none(*page_table)) 2659 if (!pte_none(*page_table))
2646 goto release; 2660 goto release;
2661
2647 inc_mm_counter(mm, anon_rss); 2662 inc_mm_counter(mm, anon_rss);
2648 page_add_new_anon_rmap(page, vma, address); 2663 page_add_new_anon_rmap(page, vma, address);
2664setpte:
2649 set_pte_at(mm, address, page_table, entry); 2665 set_pte_at(mm, address, page_table, entry);
2650 2666
2651 /* No need to invalidate - it was non-present before */ 2667 /* No need to invalidate - it was non-present before */
@@ -2700,6 +2716,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2700 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2716 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2701 return ret; 2717 return ret;
2702 2718
2719 if (unlikely(PageHWPoison(vmf.page))) {
2720 if (ret & VM_FAULT_LOCKED)
2721 unlock_page(vmf.page);
2722 return VM_FAULT_HWPOISON;
2723 }
2724
2703 /* 2725 /*
2704 * For consistency in subsequent calls, make the faulted page always 2726 * For consistency in subsequent calls, make the faulted page always
2705 * locked. 2727 * locked.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e4412a676c88..821dee596377 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -339,8 +339,11 @@ EXPORT_SYMBOL_GPL(__remove_pages);
339 339
340void online_page(struct page *page) 340void online_page(struct page *page)
341{ 341{
342 unsigned long pfn = page_to_pfn(page);
343
342 totalram_pages++; 344 totalram_pages++;
343 num_physpages++; 345 if (pfn >= num_physpages)
346 num_physpages = pfn + 1;
344 347
345#ifdef CONFIG_HIGHMEM 348#ifdef CONFIG_HIGHMEM
346 if (PageHighMem(page)) 349 if (PageHighMem(page))
@@ -410,7 +413,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
410 if (!populated_zone(zone)) 413 if (!populated_zone(zone))
411 need_zonelists_rebuild = 1; 414 need_zonelists_rebuild = 1;
412 415
413 ret = walk_memory_resource(pfn, nr_pages, &onlined_pages, 416 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
414 online_pages_range); 417 online_pages_range);
415 if (ret) { 418 if (ret) {
416 printk(KERN_DEBUG "online_pages %lx at %lx failed\n", 419 printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
@@ -422,6 +425,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
422 zone->present_pages += onlined_pages; 425 zone->present_pages += onlined_pages;
423 zone->zone_pgdat->node_present_pages += onlined_pages; 426 zone->zone_pgdat->node_present_pages += onlined_pages;
424 427
428 zone_pcp_update(zone);
425 setup_per_zone_wmarks(); 429 setup_per_zone_wmarks();
426 calculate_zone_inactive_ratio(zone); 430 calculate_zone_inactive_ratio(zone);
427 if (onlined_pages) { 431 if (onlined_pages) {
@@ -701,7 +705,7 @@ offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
701static void 705static void
702offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 706offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
703{ 707{
704 walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL, 708 walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
705 offline_isolated_pages_cb); 709 offline_isolated_pages_cb);
706} 710}
707 711
@@ -727,7 +731,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
727 long offlined = 0; 731 long offlined = 0;
728 int ret; 732 int ret;
729 733
730 ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined, 734 ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
731 check_pages_isolated_cb); 735 check_pages_isolated_cb);
732 if (ret < 0) 736 if (ret < 0)
733 offlined = (long)ret; 737 offlined = (long)ret;
@@ -831,7 +835,6 @@ repeat:
831 zone->present_pages -= offlined_pages; 835 zone->present_pages -= offlined_pages;
832 zone->zone_pgdat->node_present_pages -= offlined_pages; 836 zone->zone_pgdat->node_present_pages -= offlined_pages;
833 totalram_pages -= offlined_pages; 837 totalram_pages -= offlined_pages;
834 num_physpages -= offlined_pages;
835 838
836 setup_per_zone_wmarks(); 839 setup_per_zone_wmarks();
837 calculate_zone_inactive_ratio(zone); 840 calculate_zone_inactive_ratio(zone);
diff --git a/mm/mempool.c b/mm/mempool.c
index 32e75d400503..1a3bc3d4d554 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -308,13 +308,6 @@ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
308} 308}
309EXPORT_SYMBOL(mempool_kmalloc); 309EXPORT_SYMBOL(mempool_kmalloc);
310 310
311void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
312{
313 size_t size = (size_t)pool_data;
314 return kzalloc(size, gfp_mask);
315}
316EXPORT_SYMBOL(mempool_kzalloc);
317
318void mempool_kfree(void *element, void *pool_data) 311void mempool_kfree(void *element, void *pool_data)
319{ 312{
320 kfree(element); 313 kfree(element);
diff --git a/mm/migrate.c b/mm/migrate.c
index 939888f9ddab..1a4bf4813780 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -67,6 +67,8 @@ int putback_lru_pages(struct list_head *l)
67 67
68 list_for_each_entry_safe(page, page2, l, lru) { 68 list_for_each_entry_safe(page, page2, l, lru) {
69 list_del(&page->lru); 69 list_del(&page->lru);
70 dec_zone_page_state(page, NR_ISOLATED_ANON +
71 page_is_file_cache(page));
70 putback_lru_page(page); 72 putback_lru_page(page);
71 count++; 73 count++;
72 } 74 }
@@ -147,7 +149,7 @@ out:
147static void remove_file_migration_ptes(struct page *old, struct page *new) 149static void remove_file_migration_ptes(struct page *old, struct page *new)
148{ 150{
149 struct vm_area_struct *vma; 151 struct vm_area_struct *vma;
150 struct address_space *mapping = page_mapping(new); 152 struct address_space *mapping = new->mapping;
151 struct prio_tree_iter iter; 153 struct prio_tree_iter iter;
152 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 154 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
153 155
@@ -270,7 +272,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
270 pslot = radix_tree_lookup_slot(&mapping->page_tree, 272 pslot = radix_tree_lookup_slot(&mapping->page_tree,
271 page_index(page)); 273 page_index(page));
272 274
273 expected_count = 2 + !!page_has_private(page); 275 expected_count = 2 + page_has_private(page);
274 if (page_count(page) != expected_count || 276 if (page_count(page) != expected_count ||
275 (struct page *)radix_tree_deref_slot(pslot) != page) { 277 (struct page *)radix_tree_deref_slot(pslot) != page) {
276 spin_unlock_irq(&mapping->tree_lock); 278 spin_unlock_irq(&mapping->tree_lock);
@@ -312,7 +314,10 @@ static int migrate_page_move_mapping(struct address_space *mapping,
312 */ 314 */
313 __dec_zone_page_state(page, NR_FILE_PAGES); 315 __dec_zone_page_state(page, NR_FILE_PAGES);
314 __inc_zone_page_state(newpage, NR_FILE_PAGES); 316 __inc_zone_page_state(newpage, NR_FILE_PAGES);
315 317 if (PageSwapBacked(page)) {
318 __dec_zone_page_state(page, NR_SHMEM);
319 __inc_zone_page_state(newpage, NR_SHMEM);
320 }
316 spin_unlock_irq(&mapping->tree_lock); 321 spin_unlock_irq(&mapping->tree_lock);
317 322
318 return 0; 323 return 0;
@@ -664,13 +669,15 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
664 * needs to be effective. 669 * needs to be effective.
665 */ 670 */
666 try_to_free_buffers(page); 671 try_to_free_buffers(page);
672 goto rcu_unlock;
667 } 673 }
668 goto rcu_unlock; 674 goto skip_unmap;
669 } 675 }
670 676
671 /* Establish migration ptes or remove ptes */ 677 /* Establish migration ptes or remove ptes */
672 try_to_unmap(page, 1); 678 try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
673 679
680skip_unmap:
674 if (!page_mapped(page)) 681 if (!page_mapped(page))
675 rc = move_to_new_page(newpage, page); 682 rc = move_to_new_page(newpage, page);
676 683
@@ -693,6 +700,8 @@ unlock:
693 * restored. 700 * restored.
694 */ 701 */
695 list_del(&page->lru); 702 list_del(&page->lru);
703 dec_zone_page_state(page, NR_ISOLATED_ANON +
704 page_is_file_cache(page));
696 putback_lru_page(page); 705 putback_lru_page(page);
697 } 706 }
698 707
@@ -737,6 +746,13 @@ int migrate_pages(struct list_head *from,
737 struct page *page2; 746 struct page *page2;
738 int swapwrite = current->flags & PF_SWAPWRITE; 747 int swapwrite = current->flags & PF_SWAPWRITE;
739 int rc; 748 int rc;
749 unsigned long flags;
750
751 local_irq_save(flags);
752 list_for_each_entry(page, from, lru)
753 __inc_zone_page_state(page, NR_ISOLATED_ANON +
754 page_is_file_cache(page));
755 local_irq_restore(flags);
740 756
741 if (!swapwrite) 757 if (!swapwrite)
742 current->flags |= PF_SWAPWRITE; 758 current->flags |= PF_SWAPWRITE;
diff --git a/mm/mlock.c b/mm/mlock.c
index 45eb650b9654..bd6f0e466f6c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -139,49 +139,36 @@ static void munlock_vma_page(struct page *page)
139} 139}
140 140
141/** 141/**
142 * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma. 142 * __mlock_vma_pages_range() - mlock a range of pages in the vma.
143 * @vma: target vma 143 * @vma: target vma
144 * @start: start address 144 * @start: start address
145 * @end: end address 145 * @end: end address
146 * @mlock: 0 indicate munlock, otherwise mlock.
147 * 146 *
148 * If @mlock == 0, unlock an mlocked range; 147 * This takes care of making the pages present too.
149 * else mlock the range of pages. This takes care of making the pages present ,
150 * too.
151 * 148 *
152 * return 0 on success, negative error code on error. 149 * return 0 on success, negative error code on error.
153 * 150 *
154 * vma->vm_mm->mmap_sem must be held for at least read. 151 * vma->vm_mm->mmap_sem must be held for at least read.
155 */ 152 */
156static long __mlock_vma_pages_range(struct vm_area_struct *vma, 153static long __mlock_vma_pages_range(struct vm_area_struct *vma,
157 unsigned long start, unsigned long end, 154 unsigned long start, unsigned long end)
158 int mlock)
159{ 155{
160 struct mm_struct *mm = vma->vm_mm; 156 struct mm_struct *mm = vma->vm_mm;
161 unsigned long addr = start; 157 unsigned long addr = start;
162 struct page *pages[16]; /* 16 gives a reasonable batch */ 158 struct page *pages[16]; /* 16 gives a reasonable batch */
163 int nr_pages = (end - start) / PAGE_SIZE; 159 int nr_pages = (end - start) / PAGE_SIZE;
164 int ret = 0; 160 int ret = 0;
165 int gup_flags = 0; 161 int gup_flags;
166 162
167 VM_BUG_ON(start & ~PAGE_MASK); 163 VM_BUG_ON(start & ~PAGE_MASK);
168 VM_BUG_ON(end & ~PAGE_MASK); 164 VM_BUG_ON(end & ~PAGE_MASK);
169 VM_BUG_ON(start < vma->vm_start); 165 VM_BUG_ON(start < vma->vm_start);
170 VM_BUG_ON(end > vma->vm_end); 166 VM_BUG_ON(end > vma->vm_end);
171 VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) && 167 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
172 (atomic_read(&mm->mm_users) != 0));
173
174 /*
175 * mlock: don't page populate if vma has PROT_NONE permission.
176 * munlock: always do munlock although the vma has PROT_NONE
177 * permission, or SIGKILL is pending.
178 */
179 if (!mlock)
180 gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
181 GUP_FLAGS_IGNORE_SIGKILL;
182 168
169 gup_flags = FOLL_TOUCH | FOLL_GET;
183 if (vma->vm_flags & VM_WRITE) 170 if (vma->vm_flags & VM_WRITE)
184 gup_flags |= GUP_FLAGS_WRITE; 171 gup_flags |= FOLL_WRITE;
185 172
186 while (nr_pages > 0) { 173 while (nr_pages > 0) {
187 int i; 174 int i;
@@ -201,51 +188,45 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
201 * This can happen for, e.g., VM_NONLINEAR regions before 188 * This can happen for, e.g., VM_NONLINEAR regions before
202 * a page has been allocated and mapped at a given offset, 189 * a page has been allocated and mapped at a given offset,
203 * or for addresses that map beyond end of a file. 190 * or for addresses that map beyond end of a file.
204 * We'll mlock the the pages if/when they get faulted in. 191 * We'll mlock the pages if/when they get faulted in.
205 */ 192 */
206 if (ret < 0) 193 if (ret < 0)
207 break; 194 break;
208 if (ret == 0) {
209 /*
210 * We know the vma is there, so the only time
211 * we cannot get a single page should be an
212 * error (ret < 0) case.
213 */
214 WARN_ON(1);
215 break;
216 }
217 195
218 lru_add_drain(); /* push cached pages to LRU */ 196 lru_add_drain(); /* push cached pages to LRU */
219 197
220 for (i = 0; i < ret; i++) { 198 for (i = 0; i < ret; i++) {
221 struct page *page = pages[i]; 199 struct page *page = pages[i];
222 200
223 lock_page(page);
224 /*
225 * Because we lock page here and migration is blocked
226 * by the elevated reference, we need only check for
227 * page truncation (file-cache only).
228 */
229 if (page->mapping) { 201 if (page->mapping) {
230 if (mlock) 202 /*
203 * That preliminary check is mainly to avoid
204 * the pointless overhead of lock_page on the
205 * ZERO_PAGE: which might bounce very badly if
206 * there is contention. However, we're still
207 * dirtying its cacheline with get/put_page:
208 * we'll add another __get_user_pages flag to
209 * avoid it if that case turns out to matter.
210 */
211 lock_page(page);
212 /*
213 * Because we lock page here and migration is
214 * blocked by the elevated reference, we need
215 * only check for file-cache page truncation.
216 */
217 if (page->mapping)
231 mlock_vma_page(page); 218 mlock_vma_page(page);
232 else 219 unlock_page(page);
233 munlock_vma_page(page);
234 } 220 }
235 unlock_page(page); 221 put_page(page); /* ref from get_user_pages() */
236 put_page(page); /* ref from get_user_pages() */
237
238 /*
239 * here we assume that get_user_pages() has given us
240 * a list of virtually contiguous pages.
241 */
242 addr += PAGE_SIZE; /* for next get_user_pages() */
243 nr_pages--;
244 } 222 }
223
224 addr += ret * PAGE_SIZE;
225 nr_pages -= ret;
245 ret = 0; 226 ret = 0;
246 } 227 }
247 228
248 return ret; /* count entire vma as locked_vm */ 229 return ret; /* 0 or negative error code */
249} 230}
250 231
251/* 232/*
@@ -289,7 +270,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
289 is_vm_hugetlb_page(vma) || 270 is_vm_hugetlb_page(vma) ||
290 vma == get_gate_vma(current))) { 271 vma == get_gate_vma(current))) {
291 272
292 __mlock_vma_pages_range(vma, start, end, 1); 273 __mlock_vma_pages_range(vma, start, end);
293 274
294 /* Hide errors from mmap() and other callers */ 275 /* Hide errors from mmap() and other callers */
295 return 0; 276 return 0;
@@ -310,7 +291,6 @@ no_mlock:
310 return nr_pages; /* error or pages NOT mlocked */ 291 return nr_pages; /* error or pages NOT mlocked */
311} 292}
312 293
313
314/* 294/*
315 * munlock_vma_pages_range() - munlock all pages in the vma range.' 295 * munlock_vma_pages_range() - munlock all pages in the vma range.'
316 * @vma - vma containing range to be munlock()ed. 296 * @vma - vma containing range to be munlock()ed.
@@ -330,10 +310,38 @@ no_mlock:
330 * free them. This will result in freeing mlocked pages. 310 * free them. This will result in freeing mlocked pages.
331 */ 311 */
332void munlock_vma_pages_range(struct vm_area_struct *vma, 312void munlock_vma_pages_range(struct vm_area_struct *vma,
333 unsigned long start, unsigned long end) 313 unsigned long start, unsigned long end)
334{ 314{
315 unsigned long addr;
316
317 lru_add_drain();
335 vma->vm_flags &= ~VM_LOCKED; 318 vma->vm_flags &= ~VM_LOCKED;
336 __mlock_vma_pages_range(vma, start, end, 0); 319
320 for (addr = start; addr < end; addr += PAGE_SIZE) {
321 struct page *page;
322 /*
323 * Although FOLL_DUMP is intended for get_dump_page(),
324 * it just so happens that its special treatment of the
325 * ZERO_PAGE (returning an error instead of doing get_page)
326 * suits munlock very well (and if somehow an abnormal page
327 * has sneaked into the range, we won't oops here: great).
328 */
329 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
330 if (page && !IS_ERR(page)) {
331 lock_page(page);
332 /*
333 * Like in __mlock_vma_pages_range(),
334 * because we lock page here and migration is
335 * blocked by the elevated reference, we need
336 * only check for file-cache page truncation.
337 */
338 if (page->mapping)
339 munlock_vma_page(page);
340 unlock_page(page);
341 put_page(page);
342 }
343 cond_resched();
344 }
337} 345}
338 346
339/* 347/*
@@ -400,18 +408,14 @@ success:
400 * It's okay if try_to_unmap_one unmaps a page just after we 408 * It's okay if try_to_unmap_one unmaps a page just after we
401 * set VM_LOCKED, __mlock_vma_pages_range will bring it back. 409 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
402 */ 410 */
403 vma->vm_flags = newflags;
404 411
405 if (lock) { 412 if (lock) {
406 ret = __mlock_vma_pages_range(vma, start, end, 1); 413 vma->vm_flags = newflags;
407 414 ret = __mlock_vma_pages_range(vma, start, end);
408 if (ret > 0) { 415 if (ret < 0)
409 mm->locked_vm -= ret; 416 ret = __mlock_posix_error_return(ret);
410 ret = 0;
411 } else
412 ret = __mlock_posix_error_return(ret); /* translate if needed */
413 } else { 417 } else {
414 __mlock_vma_pages_range(vma, start, end, 0); 418 munlock_vma_pages_range(vma, start, end);
415 } 419 }
416 420
417out: 421out:
diff --git a/mm/mmap.c b/mm/mmap.c
index 26892e346d8f..73f5e4b64010 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,7 +28,7 @@
28#include <linux/mempolicy.h> 28#include <linux/mempolicy.h>
29#include <linux/rmap.h> 29#include <linux/rmap.h>
30#include <linux/mmu_notifier.h> 30#include <linux/mmu_notifier.h>
31#include <linux/perf_counter.h> 31#include <linux/perf_event.h>
32 32
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
34#include <asm/cacheflush.h> 34#include <asm/cacheflush.h>
@@ -570,9 +570,9 @@ again: remove_next = 1 + (end > next->vm_end);
570 570
571 /* 571 /*
572 * When changing only vma->vm_end, we don't really need 572 * When changing only vma->vm_end, we don't really need
573 * anon_vma lock: but is that case worth optimizing out? 573 * anon_vma lock.
574 */ 574 */
575 if (vma->anon_vma) 575 if (vma->anon_vma && (insert || importer || start != vma->vm_start))
576 anon_vma = vma->anon_vma; 576 anon_vma = vma->anon_vma;
577 if (anon_vma) { 577 if (anon_vma) {
578 spin_lock(&anon_vma->lock); 578 spin_lock(&anon_vma->lock);
@@ -656,9 +656,6 @@ again: remove_next = 1 + (end > next->vm_end);
656 validate_mm(mm); 656 validate_mm(mm);
657} 657}
658 658
659/* Flags that can be inherited from an existing mapping when merging */
660#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR)
661
662/* 659/*
663 * If the vma has a ->close operation then the driver probably needs to release 660 * If the vma has a ->close operation then the driver probably needs to release
664 * per-vma resources, so we don't attempt to merge those. 661 * per-vma resources, so we don't attempt to merge those.
@@ -666,7 +663,8 @@ again: remove_next = 1 + (end > next->vm_end);
666static inline int is_mergeable_vma(struct vm_area_struct *vma, 663static inline int is_mergeable_vma(struct vm_area_struct *vma,
667 struct file *file, unsigned long vm_flags) 664 struct file *file, unsigned long vm_flags)
668{ 665{
669 if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS) 666 /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */
667 if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR)
670 return 0; 668 return 0;
671 if (vma->vm_file != file) 669 if (vma->vm_file != file)
672 return 0; 670 return 0;
@@ -951,6 +949,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
951 if (mm->map_count > sysctl_max_map_count) 949 if (mm->map_count > sysctl_max_map_count)
952 return -ENOMEM; 950 return -ENOMEM;
953 951
952 if (flags & MAP_HUGETLB) {
953 struct user_struct *user = NULL;
954 if (file)
955 return -EINVAL;
956
957 /*
958 * VM_NORESERVE is used because the reservations will be
959 * taken when vm_ops->mmap() is called
960 * A dummy user value is used because we are not locking
961 * memory so no accounting is necessary
962 */
963 len = ALIGN(len, huge_page_size(&default_hstate));
964 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
965 &user, HUGETLB_ANONHUGE_INODE);
966 if (IS_ERR(file))
967 return PTR_ERR(file);
968 }
969
954 /* Obtain the address to map to. we verify (or select) it and ensure 970 /* Obtain the address to map to. we verify (or select) it and ensure
955 * that it represents a valid section of the address space. 971 * that it represents a valid section of the address space.
956 */ 972 */
@@ -965,11 +981,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
965 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | 981 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
966 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 982 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
967 983
968 if (flags & MAP_LOCKED) { 984 if (flags & MAP_LOCKED)
969 if (!can_do_mlock()) 985 if (!can_do_mlock())
970 return -EPERM; 986 return -EPERM;
971 vm_flags |= VM_LOCKED;
972 }
973 987
974 /* mlock MCL_FUTURE? */ 988 /* mlock MCL_FUTURE? */
975 if (vm_flags & VM_LOCKED) { 989 if (vm_flags & VM_LOCKED) {
@@ -1195,21 +1209,21 @@ munmap_back:
1195 goto unmap_and_free_vma; 1209 goto unmap_and_free_vma;
1196 if (vm_flags & VM_EXECUTABLE) 1210 if (vm_flags & VM_EXECUTABLE)
1197 added_exe_file_vma(mm); 1211 added_exe_file_vma(mm);
1212
1213 /* Can addr have changed??
1214 *
1215 * Answer: Yes, several device drivers can do it in their
1216 * f_op->mmap method. -DaveM
1217 */
1218 addr = vma->vm_start;
1219 pgoff = vma->vm_pgoff;
1220 vm_flags = vma->vm_flags;
1198 } else if (vm_flags & VM_SHARED) { 1221 } else if (vm_flags & VM_SHARED) {
1199 error = shmem_zero_setup(vma); 1222 error = shmem_zero_setup(vma);
1200 if (error) 1223 if (error)
1201 goto free_vma; 1224 goto free_vma;
1202 } 1225 }
1203 1226
1204 /* Can addr have changed??
1205 *
1206 * Answer: Yes, several device drivers can do it in their
1207 * f_op->mmap method. -DaveM
1208 */
1209 addr = vma->vm_start;
1210 pgoff = vma->vm_pgoff;
1211 vm_flags = vma->vm_flags;
1212
1213 if (vma_wants_writenotify(vma)) 1227 if (vma_wants_writenotify(vma))
1214 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); 1228 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1215 1229
@@ -1220,7 +1234,7 @@ munmap_back:
1220 if (correct_wcount) 1234 if (correct_wcount)
1221 atomic_inc(&inode->i_writecount); 1235 atomic_inc(&inode->i_writecount);
1222out: 1236out:
1223 perf_counter_mmap(vma); 1237 perf_event_mmap(vma);
1224 1238
1225 mm->total_vm += len >> PAGE_SHIFT; 1239 mm->total_vm += len >> PAGE_SHIFT;
1226 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1240 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
@@ -2111,6 +2125,7 @@ void exit_mmap(struct mm_struct *mm)
2111 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2125 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2112 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2126 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2113 vm_unacct_memory(nr_accounted); 2127 vm_unacct_memory(nr_accounted);
2128
2114 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); 2129 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
2115 tlb_finish_mmu(tlb, 0, end); 2130 tlb_finish_mmu(tlb, 0, end);
2116 2131
@@ -2267,7 +2282,7 @@ static void special_mapping_close(struct vm_area_struct *vma)
2267{ 2282{
2268} 2283}
2269 2284
2270static struct vm_operations_struct special_mapping_vmops = { 2285static const struct vm_operations_struct special_mapping_vmops = {
2271 .close = special_mapping_close, 2286 .close = special_mapping_close,
2272 .fault = special_mapping_fault, 2287 .fault = special_mapping_fault,
2273}; 2288};
@@ -2308,7 +2323,7 @@ int install_special_mapping(struct mm_struct *mm,
2308 2323
2309 mm->total_vm += len >> PAGE_SHIFT; 2324 mm->total_vm += len >> PAGE_SHIFT;
2310 2325
2311 perf_counter_mmap(vma); 2326 perf_event_mmap(vma);
2312 2327
2313 return 0; 2328 return 0;
2314} 2329}
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
new file mode 100644
index 000000000000..ded9081f4021
--- /dev/null
+++ b/mm/mmu_context.c
@@ -0,0 +1,58 @@
1/* Copyright (C) 2009 Red Hat, Inc.
2 *
3 * See ../COPYING for licensing terms.
4 */
5
6#include <linux/mm.h>
7#include <linux/mmu_context.h>
8#include <linux/sched.h>
9
10#include <asm/mmu_context.h>
11
12/*
13 * use_mm
14 * Makes the calling kernel thread take on the specified
15 * mm context.
16 * Called by the retry thread execute retries within the
17 * iocb issuer's mm context, so that copy_from/to_user
18 * operations work seamlessly for aio.
19 * (Note: this routine is intended to be called only
20 * from a kernel thread context)
21 */
22void use_mm(struct mm_struct *mm)
23{
24 struct mm_struct *active_mm;
25 struct task_struct *tsk = current;
26
27 task_lock(tsk);
28 active_mm = tsk->active_mm;
29 if (active_mm != mm) {
30 atomic_inc(&mm->mm_count);
31 tsk->active_mm = mm;
32 }
33 tsk->mm = mm;
34 switch_mm(active_mm, mm, tsk);
35 task_unlock(tsk);
36
37 if (active_mm != mm)
38 mmdrop(active_mm);
39}
40
41/*
42 * unuse_mm
43 * Reverses the effect of use_mm, i.e. releases the
44 * specified mm context which was earlier taken on
45 * by the calling kernel thread
46 * (Note: this routine is intended to be called only
47 * from a kernel thread context)
48 */
49void unuse_mm(struct mm_struct *mm)
50{
51 struct task_struct *tsk = current;
52
53 task_lock(tsk);
54 tsk->mm = NULL;
55 /* active_mm is still 'mm' */
56 enter_lazy_tlb(mm, tsk);
57 task_unlock(tsk);
58}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5f4ef0250bee..7e33f2cb3c77 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -99,6 +99,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
99 return young; 99 return young;
100} 100}
101 101
102void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
103 pte_t pte)
104{
105 struct mmu_notifier *mn;
106 struct hlist_node *n;
107
108 rcu_read_lock();
109 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
110 if (mn->ops->change_pte)
111 mn->ops->change_pte(mn, mm, address, pte);
112 /*
113 * Some drivers don't have change_pte,
114 * so we must call invalidate_page in that case.
115 */
116 else if (mn->ops->invalidate_page)
117 mn->ops->invalidate_page(mn, mm, address);
118 }
119 rcu_read_unlock();
120}
121
102void __mmu_notifier_invalidate_page(struct mm_struct *mm, 122void __mmu_notifier_invalidate_page(struct mm_struct *mm,
103 unsigned long address) 123 unsigned long address)
104{ 124{
diff --git a/mm/mprotect.c b/mm/mprotect.c
index d80311baeb2d..8bc969d8112d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,7 +23,7 @@
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h> 24#include <linux/mmu_notifier.h>
25#include <linux/migrate.h> 25#include <linux/migrate.h>
26#include <linux/perf_counter.h> 26#include <linux/perf_event.h>
27#include <asm/uaccess.h> 27#include <asm/uaccess.h>
28#include <asm/pgtable.h> 28#include <asm/pgtable.h>
29#include <asm/cacheflush.h> 29#include <asm/cacheflush.h>
@@ -300,7 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
300 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); 300 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
301 if (error) 301 if (error)
302 goto out; 302 goto out;
303 perf_counter_mmap(vma); 303 perf_event_mmap(vma);
304 nstart = tmp; 304 nstart = tmp;
305 305
306 if (nstart < prev->vm_end) 306 if (nstart < prev->vm_end)
diff --git a/mm/mremap.c b/mm/mremap.c
index a39b7b91be46..97bff2547719 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -11,6 +11,7 @@
11#include <linux/hugetlb.h> 11#include <linux/hugetlb.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/shm.h> 13#include <linux/shm.h>
14#include <linux/ksm.h>
14#include <linux/mman.h> 15#include <linux/mman.h>
15#include <linux/swap.h> 16#include <linux/swap.h>
16#include <linux/capability.h> 17#include <linux/capability.h>
@@ -85,8 +86,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
85 if (vma->vm_file) { 86 if (vma->vm_file) {
86 /* 87 /*
87 * Subtle point from Rajesh Venkatasubramanian: before 88 * Subtle point from Rajesh Venkatasubramanian: before
88 * moving file-based ptes, we must lock vmtruncate out, 89 * moving file-based ptes, we must lock truncate_pagecache
89 * since it might clean the dst vma before the src vma, 90 * out, since it might clean the dst vma before the src vma,
90 * and we propagate stale pages into the dst afterward. 91 * and we propagate stale pages into the dst afterward.
91 */ 92 */
92 mapping = vma->vm_file->f_mapping; 93 mapping = vma->vm_file->f_mapping;
@@ -174,6 +175,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
174 unsigned long excess = 0; 175 unsigned long excess = 0;
175 unsigned long hiwater_vm; 176 unsigned long hiwater_vm;
176 int split = 0; 177 int split = 0;
178 int err;
177 179
178 /* 180 /*
179 * We'd prefer to avoid failure later on in do_munmap: 181 * We'd prefer to avoid failure later on in do_munmap:
@@ -182,6 +184,18 @@ static unsigned long move_vma(struct vm_area_struct *vma,
182 if (mm->map_count >= sysctl_max_map_count - 3) 184 if (mm->map_count >= sysctl_max_map_count - 3)
183 return -ENOMEM; 185 return -ENOMEM;
184 186
187 /*
188 * Advise KSM to break any KSM pages in the area to be moved:
189 * it would be confusing if they were to turn up at the new
190 * location, where they happen to coincide with different KSM
191 * pages recently unmapped. But leave vma->vm_flags as it was,
192 * so KSM can come around to merge on vma and new_vma afterwards.
193 */
194 err = ksm_madvise(vma, old_addr, old_addr + old_len,
195 MADV_UNMERGEABLE, &vm_flags);
196 if (err)
197 return err;
198
185 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); 199 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
186 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); 200 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
187 if (!new_vma) 201 if (!new_vma)
diff --git a/mm/nommu.c b/mm/nommu.c
index 66e81e7e9fe9..5189b5aed8c0 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -33,6 +33,7 @@
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
34#include <asm/tlb.h> 34#include <asm/tlb.h>
35#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
36#include <asm/mmu_context.h>
36#include "internal.h" 37#include "internal.h"
37 38
38static inline __attribute__((format(printf, 1, 2))) 39static inline __attribute__((format(printf, 1, 2)))
@@ -56,12 +57,11 @@ void no_printk(const char *fmt, ...)
56 no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) 57 no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
57#endif 58#endif
58 59
59#include "internal.h"
60
61void *high_memory; 60void *high_memory;
62struct page *mem_map; 61struct page *mem_map;
63unsigned long max_mapnr; 62unsigned long max_mapnr;
64unsigned long num_physpages; 63unsigned long num_physpages;
64unsigned long highest_memmap_pfn;
65struct percpu_counter vm_committed_as; 65struct percpu_counter vm_committed_as;
66int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 66int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
67int sysctl_overcommit_ratio = 50; /* default is 50% */ 67int sysctl_overcommit_ratio = 50; /* default is 50% */
@@ -79,50 +79,10 @@ static struct kmem_cache *vm_region_jar;
79struct rb_root nommu_region_tree = RB_ROOT; 79struct rb_root nommu_region_tree = RB_ROOT;
80DECLARE_RWSEM(nommu_region_sem); 80DECLARE_RWSEM(nommu_region_sem);
81 81
82struct vm_operations_struct generic_file_vm_ops = { 82const struct vm_operations_struct generic_file_vm_ops = {
83}; 83};
84 84
85/* 85/*
86 * Handle all mappings that got truncated by a "truncate()"
87 * system call.
88 *
89 * NOTE! We have to be ready to update the memory sharing
90 * between the file and the memory map for a potential last
91 * incomplete page. Ugly, but necessary.
92 */
93int vmtruncate(struct inode *inode, loff_t offset)
94{
95 struct address_space *mapping = inode->i_mapping;
96 unsigned long limit;
97
98 if (inode->i_size < offset)
99 goto do_expand;
100 i_size_write(inode, offset);
101
102 truncate_inode_pages(mapping, offset);
103 goto out_truncate;
104
105do_expand:
106 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
107 if (limit != RLIM_INFINITY && offset > limit)
108 goto out_sig;
109 if (offset > inode->i_sb->s_maxbytes)
110 goto out;
111 i_size_write(inode, offset);
112
113out_truncate:
114 if (inode->i_op->truncate)
115 inode->i_op->truncate(inode);
116 return 0;
117out_sig:
118 send_sig(SIGXFSZ, current, 0);
119out:
120 return -EFBIG;
121}
122
123EXPORT_SYMBOL(vmtruncate);
124
125/*
126 * Return the total memory allocated for this pointer, not 86 * Return the total memory allocated for this pointer, not
127 * just what the caller asked for. 87 * just what the caller asked for.
128 * 88 *
@@ -170,21 +130,20 @@ unsigned int kobjsize(const void *objp)
170} 130}
171 131
172int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 132int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
173 unsigned long start, int nr_pages, int flags, 133 unsigned long start, int nr_pages, unsigned int foll_flags,
174 struct page **pages, struct vm_area_struct **vmas) 134 struct page **pages, struct vm_area_struct **vmas)
175{ 135{
176 struct vm_area_struct *vma; 136 struct vm_area_struct *vma;
177 unsigned long vm_flags; 137 unsigned long vm_flags;
178 int i; 138 int i;
179 int write = !!(flags & GUP_FLAGS_WRITE);
180 int force = !!(flags & GUP_FLAGS_FORCE);
181 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
182 139
183 /* calculate required read or write permissions. 140 /* calculate required read or write permissions.
184 * - if 'force' is set, we only require the "MAY" flags. 141 * If FOLL_FORCE is set, we only require the "MAY" flags.
185 */ 142 */
186 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 143 vm_flags = (foll_flags & FOLL_WRITE) ?
187 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 144 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
145 vm_flags &= (foll_flags & FOLL_FORCE) ?
146 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
188 147
189 for (i = 0; i < nr_pages; i++) { 148 for (i = 0; i < nr_pages; i++) {
190 vma = find_vma(mm, start); 149 vma = find_vma(mm, start);
@@ -192,8 +151,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
192 goto finish_or_fault; 151 goto finish_or_fault;
193 152
194 /* protect what we can, including chardevs */ 153 /* protect what we can, including chardevs */
195 if (vma->vm_flags & (VM_IO | VM_PFNMAP) || 154 if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
196 (!ignore && !(vm_flags & vma->vm_flags))) 155 !(vm_flags & vma->vm_flags))
197 goto finish_or_fault; 156 goto finish_or_fault;
198 157
199 if (pages) { 158 if (pages) {
@@ -212,7 +171,6 @@ finish_or_fault:
212 return i ? : -EFAULT; 171 return i ? : -EFAULT;
213} 172}
214 173
215
216/* 174/*
217 * get a list of pages in an address range belonging to the specified process 175 * get a list of pages in an address range belonging to the specified process
218 * and indicate the VMA that covers each page 176 * and indicate the VMA that covers each page
@@ -227,9 +185,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
227 int flags = 0; 185 int flags = 0;
228 186
229 if (write) 187 if (write)
230 flags |= GUP_FLAGS_WRITE; 188 flags |= FOLL_WRITE;
231 if (force) 189 if (force)
232 flags |= GUP_FLAGS_FORCE; 190 flags |= FOLL_FORCE;
233 191
234 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 192 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
235} 193}
@@ -627,6 +585,22 @@ static void put_nommu_region(struct vm_region *region)
627} 585}
628 586
629/* 587/*
588 * update protection on a vma
589 */
590static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
591{
592#ifdef CONFIG_MPU
593 struct mm_struct *mm = vma->vm_mm;
594 long start = vma->vm_start & PAGE_MASK;
595 while (start < vma->vm_end) {
596 protect_page(mm, start, flags);
597 start += PAGE_SIZE;
598 }
599 update_protections(mm);
600#endif
601}
602
603/*
630 * add a VMA into a process's mm_struct in the appropriate place in the list 604 * add a VMA into a process's mm_struct in the appropriate place in the list
631 * and tree and add to the address space's page tree also if not an anonymous 605 * and tree and add to the address space's page tree also if not an anonymous
632 * page 606 * page
@@ -645,6 +619,8 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
645 mm->map_count++; 619 mm->map_count++;
646 vma->vm_mm = mm; 620 vma->vm_mm = mm;
647 621
622 protect_vma(vma, vma->vm_flags);
623
648 /* add the VMA to the mapping */ 624 /* add the VMA to the mapping */
649 if (vma->vm_file) { 625 if (vma->vm_file) {
650 mapping = vma->vm_file->f_mapping; 626 mapping = vma->vm_file->f_mapping;
@@ -707,6 +683,8 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
707 683
708 kenter("%p", vma); 684 kenter("%p", vma);
709 685
686 protect_vma(vma, 0);
687
710 mm->map_count--; 688 mm->map_count--;
711 if (mm->mmap_cache == vma) 689 if (mm->mmap_cache == vma)
712 mm->mmap_cache = NULL; 690 mm->mmap_cache = NULL;
@@ -848,7 +826,7 @@ static int validate_mmap_request(struct file *file,
848 int ret; 826 int ret;
849 827
850 /* do the simple checks first */ 828 /* do the simple checks first */
851 if (flags & MAP_FIXED || addr) { 829 if (flags & MAP_FIXED) {
852 printk(KERN_DEBUG 830 printk(KERN_DEBUG
853 "%d: Can't do fixed-address/overlay mmap of RAM\n", 831 "%d: Can't do fixed-address/overlay mmap of RAM\n",
854 current->pid); 832 current->pid);
@@ -1056,7 +1034,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
1056 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1034 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1057 if (ret == 0) { 1035 if (ret == 0) {
1058 vma->vm_region->vm_top = vma->vm_region->vm_end; 1036 vma->vm_region->vm_top = vma->vm_region->vm_end;
1059 return ret; 1037 return 0;
1060 } 1038 }
1061 if (ret != -ENOSYS) 1039 if (ret != -ENOSYS)
1062 return ret; 1040 return ret;
@@ -1073,7 +1051,8 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
1073 */ 1051 */
1074static int do_mmap_private(struct vm_area_struct *vma, 1052static int do_mmap_private(struct vm_area_struct *vma,
1075 struct vm_region *region, 1053 struct vm_region *region,
1076 unsigned long len) 1054 unsigned long len,
1055 unsigned long capabilities)
1077{ 1056{
1078 struct page *pages; 1057 struct page *pages;
1079 unsigned long total, point, n, rlen; 1058 unsigned long total, point, n, rlen;
@@ -1084,13 +1063,13 @@ static int do_mmap_private(struct vm_area_struct *vma,
1084 * shared mappings on devices or memory 1063 * shared mappings on devices or memory
1085 * - VM_MAYSHARE will be set if it may attempt to share 1064 * - VM_MAYSHARE will be set if it may attempt to share
1086 */ 1065 */
1087 if (vma->vm_file) { 1066 if (capabilities & BDI_CAP_MAP_DIRECT) {
1088 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1067 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1089 if (ret == 0) { 1068 if (ret == 0) {
1090 /* shouldn't return success if we're not sharing */ 1069 /* shouldn't return success if we're not sharing */
1091 BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); 1070 BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
1092 vma->vm_region->vm_top = vma->vm_region->vm_end; 1071 vma->vm_region->vm_top = vma->vm_region->vm_end;
1093 return ret; 1072 return 0;
1094 } 1073 }
1095 if (ret != -ENOSYS) 1074 if (ret != -ENOSYS)
1096 return ret; 1075 return ret;
@@ -1203,9 +1182,6 @@ unsigned long do_mmap_pgoff(struct file *file,
1203 1182
1204 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); 1183 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
1205 1184
1206 if (!(flags & MAP_FIXED))
1207 addr = round_hint_to_min(addr);
1208
1209 /* decide whether we should attempt the mapping, and if so what sort of 1185 /* decide whether we should attempt the mapping, and if so what sort of
1210 * mapping */ 1186 * mapping */
1211 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, 1187 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
@@ -1215,6 +1191,9 @@ unsigned long do_mmap_pgoff(struct file *file,
1215 return ret; 1191 return ret;
1216 } 1192 }
1217 1193
1194 /* we ignore the address hint */
1195 addr = 0;
1196
1218 /* we've determined that we can make the mapping, now translate what we 1197 /* we've determined that we can make the mapping, now translate what we
1219 * now know into VMA flags */ 1198 * now know into VMA flags */
1220 vm_flags = determine_vm_flags(file, prot, flags, capabilities); 1199 vm_flags = determine_vm_flags(file, prot, flags, capabilities);
@@ -1328,7 +1307,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1328 * - this is the hook for quasi-memory character devices to 1307 * - this is the hook for quasi-memory character devices to
1329 * tell us the location of a shared mapping 1308 * tell us the location of a shared mapping
1330 */ 1309 */
1331 if (file && file->f_op->get_unmapped_area) { 1310 if (capabilities & BDI_CAP_MAP_DIRECT) {
1332 addr = file->f_op->get_unmapped_area(file, addr, len, 1311 addr = file->f_op->get_unmapped_area(file, addr, len,
1333 pgoff, flags); 1312 pgoff, flags);
1334 if (IS_ERR((void *) addr)) { 1313 if (IS_ERR((void *) addr)) {
@@ -1352,15 +1331,17 @@ unsigned long do_mmap_pgoff(struct file *file,
1352 } 1331 }
1353 1332
1354 vma->vm_region = region; 1333 vma->vm_region = region;
1355 add_nommu_region(region);
1356 1334
1357 /* set up the mapping */ 1335 /* set up the mapping
1336 * - the region is filled in if BDI_CAP_MAP_DIRECT is still set
1337 */
1358 if (file && vma->vm_flags & VM_SHARED) 1338 if (file && vma->vm_flags & VM_SHARED)
1359 ret = do_mmap_shared_file(vma); 1339 ret = do_mmap_shared_file(vma);
1360 else 1340 else
1361 ret = do_mmap_private(vma, region, len); 1341 ret = do_mmap_private(vma, region, len, capabilities);
1362 if (ret < 0) 1342 if (ret < 0)
1363 goto error_put_region; 1343 goto error_just_free;
1344 add_nommu_region(region);
1364 1345
1365 /* okay... we have a mapping; now we have to register it */ 1346 /* okay... we have a mapping; now we have to register it */
1366 result = vma->vm_start; 1347 result = vma->vm_start;
@@ -1378,19 +1359,6 @@ share:
1378 kleave(" = %lx", result); 1359 kleave(" = %lx", result);
1379 return result; 1360 return result;
1380 1361
1381error_put_region:
1382 __put_nommu_region(region);
1383 if (vma) {
1384 if (vma->vm_file) {
1385 fput(vma->vm_file);
1386 if (vma->vm_flags & VM_EXECUTABLE)
1387 removed_exe_file_vma(vma->vm_mm);
1388 }
1389 kmem_cache_free(vm_area_cachep, vma);
1390 }
1391 kleave(" = %d [pr]", ret);
1392 return ret;
1393
1394error_just_free: 1362error_just_free:
1395 up_write(&nommu_region_sem); 1363 up_write(&nommu_region_sem);
1396error: 1364error:
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a7b2460e922b..ea2147dabba6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -34,6 +34,23 @@ int sysctl_oom_dump_tasks;
34static DEFINE_SPINLOCK(zone_scan_lock); 34static DEFINE_SPINLOCK(zone_scan_lock);
35/* #define DEBUG */ 35/* #define DEBUG */
36 36
37/*
38 * Is all threads of the target process nodes overlap ours?
39 */
40static int has_intersects_mems_allowed(struct task_struct *tsk)
41{
42 struct task_struct *t;
43
44 t = tsk;
45 do {
46 if (cpuset_mems_allowed_intersects(current, t))
47 return 1;
48 t = next_thread(t);
49 } while (t != tsk);
50
51 return 0;
52}
53
37/** 54/**
38 * badness - calculate a numeric value for how bad this task has been 55 * badness - calculate a numeric value for how bad this task has been
39 * @p: task struct of which task we should calculate 56 * @p: task struct of which task we should calculate
@@ -58,6 +75,13 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 unsigned long points, cpu_time, run_time; 75 unsigned long points, cpu_time, run_time;
59 struct mm_struct *mm; 76 struct mm_struct *mm;
60 struct task_struct *child; 77 struct task_struct *child;
78 int oom_adj = p->signal->oom_adj;
79 struct task_cputime task_time;
80 unsigned long utime;
81 unsigned long stime;
82
83 if (oom_adj == OOM_DISABLE)
84 return 0;
61 85
62 task_lock(p); 86 task_lock(p);
63 mm = p->mm; 87 mm = p->mm;
@@ -79,7 +103,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
79 /* 103 /*
80 * swapoff can easily use up all memory, so kill those first. 104 * swapoff can easily use up all memory, so kill those first.
81 */ 105 */
82 if (p->flags & PF_SWAPOFF) 106 if (p->flags & PF_OOM_ORIGIN)
83 return ULONG_MAX; 107 return ULONG_MAX;
84 108
85 /* 109 /*
@@ -102,8 +126,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
102 * of seconds. There is no particular reason for this other than 126 * of seconds. There is no particular reason for this other than
103 * that it turned out to work very well in practice. 127 * that it turned out to work very well in practice.
104 */ 128 */
105 cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime)) 129 thread_group_cputime(p, &task_time);
106 >> (SHIFT_HZ + 3); 130 utime = cputime_to_jiffies(task_time.utime);
131 stime = cputime_to_jiffies(task_time.stime);
132 cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
133
107 134
108 if (uptime >= p->start_time.tv_sec) 135 if (uptime >= p->start_time.tv_sec)
109 run_time = (uptime - p->start_time.tv_sec) >> 10; 136 run_time = (uptime - p->start_time.tv_sec) >> 10;
@@ -144,19 +171,19 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
144 * because p may have allocated or otherwise mapped memory on 171 * because p may have allocated or otherwise mapped memory on
145 * this node before. However it will be less likely. 172 * this node before. However it will be less likely.
146 */ 173 */
147 if (!cpuset_mems_allowed_intersects(current, p)) 174 if (!has_intersects_mems_allowed(p))
148 points /= 8; 175 points /= 8;
149 176
150 /* 177 /*
151 * Adjust the score by oomkilladj. 178 * Adjust the score by oom_adj.
152 */ 179 */
153 if (p->oomkilladj) { 180 if (oom_adj) {
154 if (p->oomkilladj > 0) { 181 if (oom_adj > 0) {
155 if (!points) 182 if (!points)
156 points = 1; 183 points = 1;
157 points <<= p->oomkilladj; 184 points <<= oom_adj;
158 } else 185 } else
159 points >>= -(p->oomkilladj); 186 points >>= -(oom_adj);
160 } 187 }
161 188
162#ifdef DEBUG 189#ifdef DEBUG
@@ -200,13 +227,13 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
200static struct task_struct *select_bad_process(unsigned long *ppoints, 227static struct task_struct *select_bad_process(unsigned long *ppoints,
201 struct mem_cgroup *mem) 228 struct mem_cgroup *mem)
202{ 229{
203 struct task_struct *g, *p; 230 struct task_struct *p;
204 struct task_struct *chosen = NULL; 231 struct task_struct *chosen = NULL;
205 struct timespec uptime; 232 struct timespec uptime;
206 *ppoints = 0; 233 *ppoints = 0;
207 234
208 do_posix_clock_monotonic_gettime(&uptime); 235 do_posix_clock_monotonic_gettime(&uptime);
209 do_each_thread(g, p) { 236 for_each_process(p) {
210 unsigned long points; 237 unsigned long points;
211 238
212 /* 239 /*
@@ -251,7 +278,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
251 *ppoints = ULONG_MAX; 278 *ppoints = ULONG_MAX;
252 } 279 }
253 280
254 if (p->oomkilladj == OOM_DISABLE) 281 if (p->signal->oom_adj == OOM_DISABLE)
255 continue; 282 continue;
256 283
257 points = badness(p, uptime.tv_sec); 284 points = badness(p, uptime.tv_sec);
@@ -259,7 +286,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
259 chosen = p; 286 chosen = p;
260 *ppoints = points; 287 *ppoints = points;
261 } 288 }
262 } while_each_thread(g, p); 289 }
263 290
264 return chosen; 291 return chosen;
265} 292}
@@ -304,7 +331,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
304 } 331 }
305 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 332 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
306 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, 333 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
307 get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, 334 get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
308 p->comm); 335 p->comm);
309 task_unlock(p); 336 task_unlock(p);
310 } while_each_thread(g, p); 337 } while_each_thread(g, p);
@@ -346,11 +373,6 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
346 373
347static int oom_kill_task(struct task_struct *p) 374static int oom_kill_task(struct task_struct *p)
348{ 375{
349 struct mm_struct *mm;
350 struct task_struct *g, *q;
351
352 mm = p->mm;
353
354 /* WARNING: mm may not be dereferenced since we did not obtain its 376 /* WARNING: mm may not be dereferenced since we did not obtain its
355 * value from get_task_mm(p). This is OK since all we need to do is 377 * value from get_task_mm(p). This is OK since all we need to do is
356 * compare mm to q->mm below. 378 * compare mm to q->mm below.
@@ -359,30 +381,11 @@ static int oom_kill_task(struct task_struct *p)
359 * change to NULL at any time since we do not hold task_lock(p). 381 * change to NULL at any time since we do not hold task_lock(p).
360 * However, this is of no concern to us. 382 * However, this is of no concern to us.
361 */ 383 */
362 384 if (!p->mm || p->signal->oom_adj == OOM_DISABLE)
363 if (mm == NULL)
364 return 1; 385 return 1;
365 386
366 /*
367 * Don't kill the process if any threads are set to OOM_DISABLE
368 */
369 do_each_thread(g, q) {
370 if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
371 return 1;
372 } while_each_thread(g, q);
373
374 __oom_kill_task(p, 1); 387 __oom_kill_task(p, 1);
375 388
376 /*
377 * kill all processes that share the ->mm (i.e. all threads),
378 * but are in a different thread group. Don't let them have access
379 * to memory reserves though, otherwise we might deplete all memory.
380 */
381 do_each_thread(g, q) {
382 if (q->mm == mm && !same_thread_group(q, p))
383 force_sig(SIGKILL, q);
384 } while_each_thread(g, q);
385
386 return 0; 389 return 0;
387} 390}
388 391
@@ -394,8 +397,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
394 397
395 if (printk_ratelimit()) { 398 if (printk_ratelimit()) {
396 printk(KERN_WARNING "%s invoked oom-killer: " 399 printk(KERN_WARNING "%s invoked oom-killer: "
397 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", 400 "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
398 current->comm, gfp_mask, order, current->oomkilladj); 401 current->comm, gfp_mask, order,
402 current->signal->oom_adj);
399 task_lock(current); 403 task_lock(current);
400 cpuset_print_task_mems_allowed(current); 404 cpuset_print_task_mems_allowed(current);
401 task_unlock(current); 405 task_unlock(current);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 1eea4fa0d410..a3b14090b1fb 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -44,18 +44,21 @@ static long ratelimit_pages = 32;
44/* 44/*
45 * When balance_dirty_pages decides that the caller needs to perform some 45 * When balance_dirty_pages decides that the caller needs to perform some
46 * non-background writeback, this is how many pages it will attempt to write. 46 * non-background writeback, this is how many pages it will attempt to write.
47 * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably 47 * It should be somewhat larger than dirtied pages to ensure that reasonably
48 * large amounts of I/O are submitted. 48 * large amounts of I/O are submitted.
49 */ 49 */
50static inline long sync_writeback_pages(void) 50static inline long sync_writeback_pages(unsigned long dirtied)
51{ 51{
52 return ratelimit_pages + ratelimit_pages / 2; 52 if (dirtied < ratelimit_pages)
53 dirtied = ratelimit_pages;
54
55 return dirtied + dirtied / 2;
53} 56}
54 57
55/* The following parameters are exported via /proc/sys/vm */ 58/* The following parameters are exported via /proc/sys/vm */
56 59
57/* 60/*
58 * Start background writeback (via pdflush) at this percentage 61 * Start background writeback (via writeback threads) at this percentage
59 */ 62 */
60int dirty_background_ratio = 10; 63int dirty_background_ratio = 10;
61 64
@@ -155,37 +158,37 @@ static void update_completion_period(void)
155} 158}
156 159
157int dirty_background_ratio_handler(struct ctl_table *table, int write, 160int dirty_background_ratio_handler(struct ctl_table *table, int write,
158 struct file *filp, void __user *buffer, size_t *lenp, 161 void __user *buffer, size_t *lenp,
159 loff_t *ppos) 162 loff_t *ppos)
160{ 163{
161 int ret; 164 int ret;
162 165
163 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 166 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
164 if (ret == 0 && write) 167 if (ret == 0 && write)
165 dirty_background_bytes = 0; 168 dirty_background_bytes = 0;
166 return ret; 169 return ret;
167} 170}
168 171
169int dirty_background_bytes_handler(struct ctl_table *table, int write, 172int dirty_background_bytes_handler(struct ctl_table *table, int write,
170 struct file *filp, void __user *buffer, size_t *lenp, 173 void __user *buffer, size_t *lenp,
171 loff_t *ppos) 174 loff_t *ppos)
172{ 175{
173 int ret; 176 int ret;
174 177
175 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 178 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
176 if (ret == 0 && write) 179 if (ret == 0 && write)
177 dirty_background_ratio = 0; 180 dirty_background_ratio = 0;
178 return ret; 181 return ret;
179} 182}
180 183
181int dirty_ratio_handler(struct ctl_table *table, int write, 184int dirty_ratio_handler(struct ctl_table *table, int write,
182 struct file *filp, void __user *buffer, size_t *lenp, 185 void __user *buffer, size_t *lenp,
183 loff_t *ppos) 186 loff_t *ppos)
184{ 187{
185 int old_ratio = vm_dirty_ratio; 188 int old_ratio = vm_dirty_ratio;
186 int ret; 189 int ret;
187 190
188 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 191 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
189 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 192 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
190 update_completion_period(); 193 update_completion_period();
191 vm_dirty_bytes = 0; 194 vm_dirty_bytes = 0;
@@ -195,13 +198,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
195 198
196 199
197int dirty_bytes_handler(struct ctl_table *table, int write, 200int dirty_bytes_handler(struct ctl_table *table, int write,
198 struct file *filp, void __user *buffer, size_t *lenp, 201 void __user *buffer, size_t *lenp,
199 loff_t *ppos) 202 loff_t *ppos)
200{ 203{
201 unsigned long old_bytes = vm_dirty_bytes; 204 unsigned long old_bytes = vm_dirty_bytes;
202 int ret; 205 int ret;
203 206
204 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 207 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
205 if (ret == 0 && write && vm_dirty_bytes != old_bytes) { 208 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
206 update_completion_period(); 209 update_completion_period();
207 vm_dirty_ratio = 0; 210 vm_dirty_ratio = 0;
@@ -380,7 +383,8 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
380 struct zone *z = 383 struct zone *z =
381 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; 384 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
382 385
383 x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); 386 x += zone_page_state(z, NR_FREE_PAGES) +
387 zone_reclaimable_pages(z);
384 } 388 }
385 /* 389 /*
386 * Make sure that the number of highmem pages is never larger 390 * Make sure that the number of highmem pages is never larger
@@ -404,7 +408,7 @@ unsigned long determine_dirtyable_memory(void)
404{ 408{
405 unsigned long x; 409 unsigned long x;
406 410
407 x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); 411 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
408 412
409 if (!vm_highmem_is_dirtyable) 413 if (!vm_highmem_is_dirtyable)
410 x -= highmem_dirtyable_memory(x); 414 x -= highmem_dirtyable_memory(x);
@@ -473,10 +477,11 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
473 * balance_dirty_pages() must be called by processes which are generating dirty 477 * balance_dirty_pages() must be called by processes which are generating dirty
474 * data. It looks at the number of dirty pages in the machine and will force 478 * data. It looks at the number of dirty pages in the machine and will force
475 * the caller to perform writeback if the system is over `vm_dirty_ratio'. 479 * the caller to perform writeback if the system is over `vm_dirty_ratio'.
476 * If we're over `background_thresh' then pdflush is woken to perform some 480 * If we're over `background_thresh' then the writeback threads are woken to
477 * writeout. 481 * perform some writeout.
478 */ 482 */
479static void balance_dirty_pages(struct address_space *mapping) 483static void balance_dirty_pages(struct address_space *mapping,
484 unsigned long write_chunk)
480{ 485{
481 long nr_reclaimable, bdi_nr_reclaimable; 486 long nr_reclaimable, bdi_nr_reclaimable;
482 long nr_writeback, bdi_nr_writeback; 487 long nr_writeback, bdi_nr_writeback;
@@ -484,7 +489,7 @@ static void balance_dirty_pages(struct address_space *mapping)
484 unsigned long dirty_thresh; 489 unsigned long dirty_thresh;
485 unsigned long bdi_thresh; 490 unsigned long bdi_thresh;
486 unsigned long pages_written = 0; 491 unsigned long pages_written = 0;
487 unsigned long write_chunk = sync_writeback_pages(); 492 unsigned long pause = 1;
488 493
489 struct backing_dev_info *bdi = mapping->backing_dev_info; 494 struct backing_dev_info *bdi = mapping->backing_dev_info;
490 495
@@ -561,7 +566,15 @@ static void balance_dirty_pages(struct address_space *mapping)
561 if (pages_written >= write_chunk) 566 if (pages_written >= write_chunk)
562 break; /* We've done our duty */ 567 break; /* We've done our duty */
563 568
564 schedule_timeout(1); 569 schedule_timeout_interruptible(pause);
570
571 /*
572 * Increase the delay for each loop, up to our previous
573 * default of taking a 100ms nap.
574 */
575 pause <<= 1;
576 if (pause > HZ / 10)
577 pause = HZ / 10;
565 } 578 }
566 579
567 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 580 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -569,7 +582,7 @@ static void balance_dirty_pages(struct address_space *mapping)
569 bdi->dirty_exceeded = 0; 582 bdi->dirty_exceeded = 0;
570 583
571 if (writeback_in_progress(bdi)) 584 if (writeback_in_progress(bdi))
572 return; /* pdflush is already working this queue */ 585 return;
573 586
574 /* 587 /*
575 * In laptop mode, we wait until hitting the higher threshold before 588 * In laptop mode, we wait until hitting the higher threshold before
@@ -580,10 +593,10 @@ static void balance_dirty_pages(struct address_space *mapping)
580 * background_thresh, to keep the amount of dirty memory low. 593 * background_thresh, to keep the amount of dirty memory low.
581 */ 594 */
582 if ((laptop_mode && pages_written) || 595 if ((laptop_mode && pages_written) ||
583 (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY) 596 (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
584 + global_page_state(NR_UNSTABLE_NFS)) 597 + global_page_state(NR_UNSTABLE_NFS))
585 > background_thresh))) 598 > background_thresh)))
586 bdi_start_writeback(bdi, nr_writeback); 599 bdi_start_writeback(bdi, NULL, 0);
587} 600}
588 601
589void set_page_dirty_balance(struct page *page, int page_mkwrite) 602void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -630,9 +643,10 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
630 p = &__get_cpu_var(bdp_ratelimits); 643 p = &__get_cpu_var(bdp_ratelimits);
631 *p += nr_pages_dirtied; 644 *p += nr_pages_dirtied;
632 if (unlikely(*p >= ratelimit)) { 645 if (unlikely(*p >= ratelimit)) {
646 ratelimit = sync_writeback_pages(*p);
633 *p = 0; 647 *p = 0;
634 preempt_enable(); 648 preempt_enable();
635 balance_dirty_pages(mapping); 649 balance_dirty_pages(mapping, ratelimit);
636 return; 650 return;
637 } 651 }
638 preempt_enable(); 652 preempt_enable();
@@ -676,9 +690,9 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
676 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs 690 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
677 */ 691 */
678int dirty_writeback_centisecs_handler(ctl_table *table, int write, 692int dirty_writeback_centisecs_handler(ctl_table *table, int write,
679 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 693 void __user *buffer, size_t *length, loff_t *ppos)
680{ 694{
681 proc_dointvec(table, write, file, buffer, length, ppos); 695 proc_dointvec(table, write, buffer, length, ppos);
682 return 0; 696 return 0;
683} 697}
684 698
@@ -1139,6 +1153,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
1139EXPORT_SYMBOL(redirty_page_for_writepage); 1153EXPORT_SYMBOL(redirty_page_for_writepage);
1140 1154
1141/* 1155/*
1156 * Dirty a page.
1157 *
1158 * For pages with a mapping this should be done under the page lock
1159 * for the benefit of asynchronous memory errors who prefer a consistent
1160 * dirty state. This rule can be broken in some special cases,
1161 * but should be better not to.
1162 *
1142 * If the mapping doesn't provide a set_page_dirty a_op, then 1163 * If the mapping doesn't provide a set_page_dirty a_op, then
1143 * just fall through and assume that it wants buffer_heads. 1164 * just fall through and assume that it wants buffer_heads.
1144 */ 1165 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a0de15f46987..bf720550b44d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
48#include <linux/page_cgroup.h> 48#include <linux/page_cgroup.h>
49#include <linux/debugobjects.h> 49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h> 50#include <linux/kmemleak.h>
51#include <trace/events/kmem.h>
51 52
52#include <asm/tlbflush.h> 53#include <asm/tlbflush.h>
53#include <asm/div64.h> 54#include <asm/div64.h>
@@ -71,7 +72,6 @@ EXPORT_SYMBOL(node_states);
71 72
72unsigned long totalram_pages __read_mostly; 73unsigned long totalram_pages __read_mostly;
73unsigned long totalreserve_pages __read_mostly; 74unsigned long totalreserve_pages __read_mostly;
74unsigned long highest_memmap_pfn __read_mostly;
75int percpu_pagelist_fraction; 75int percpu_pagelist_fraction;
76gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 76gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
77 77
@@ -123,8 +123,8 @@ static char * const zone_names[MAX_NR_ZONES] = {
123 123
124int min_free_kbytes = 1024; 124int min_free_kbytes = 1024;
125 125
126unsigned long __meminitdata nr_kernel_pages; 126static unsigned long __meminitdata nr_kernel_pages;
127unsigned long __meminitdata nr_all_pages; 127static unsigned long __meminitdata nr_all_pages;
128static unsigned long __meminitdata dma_reserve; 128static unsigned long __meminitdata dma_reserve;
129 129
130#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 130#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
@@ -234,6 +234,12 @@ static void bad_page(struct page *page)
234 static unsigned long nr_shown; 234 static unsigned long nr_shown;
235 static unsigned long nr_unshown; 235 static unsigned long nr_unshown;
236 236
237 /* Don't complain about poisoned pages */
238 if (PageHWPoison(page)) {
239 __ClearPageBuddy(page);
240 return;
241 }
242
237 /* 243 /*
238 * Allow a burst of 60 reports, then keep quiet for that minute; 244 * Allow a burst of 60 reports, then keep quiet for that minute;
239 * or allow a steady drip of one report per second. 245 * or allow a steady drip of one report per second.
@@ -510,7 +516,7 @@ static inline int free_pages_check(struct page *page)
510} 516}
511 517
512/* 518/*
513 * Frees a list of pages. 519 * Frees a number of pages from the PCP lists
514 * Assumes all pages on list are in same zone, and of same order. 520 * Assumes all pages on list are in same zone, and of same order.
515 * count is the number of pages to free. 521 * count is the number of pages to free.
516 * 522 *
@@ -520,22 +526,42 @@ static inline int free_pages_check(struct page *page)
520 * And clear the zone's pages_scanned counter, to hold off the "all pages are 526 * And clear the zone's pages_scanned counter, to hold off the "all pages are
521 * pinned" detection logic. 527 * pinned" detection logic.
522 */ 528 */
523static void free_pages_bulk(struct zone *zone, int count, 529static void free_pcppages_bulk(struct zone *zone, int count,
524 struct list_head *list, int order) 530 struct per_cpu_pages *pcp)
525{ 531{
532 int migratetype = 0;
533 int batch_free = 0;
534
526 spin_lock(&zone->lock); 535 spin_lock(&zone->lock);
527 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 536 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
528 zone->pages_scanned = 0; 537 zone->pages_scanned = 0;
529 538
530 __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); 539 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
531 while (count--) { 540 while (count) {
532 struct page *page; 541 struct page *page;
542 struct list_head *list;
533 543
534 VM_BUG_ON(list_empty(list)); 544 /*
535 page = list_entry(list->prev, struct page, lru); 545 * Remove pages from lists in a round-robin fashion. A
536 /* have to delete it as __free_one_page list manipulates */ 546 * batch_free count is maintained that is incremented when an
537 list_del(&page->lru); 547 * empty list is encountered. This is so more pages are freed
538 __free_one_page(page, zone, order, page_private(page)); 548 * off fuller lists instead of spinning excessively around empty
549 * lists
550 */
551 do {
552 batch_free++;
553 if (++migratetype == MIGRATE_PCPTYPES)
554 migratetype = 0;
555 list = &pcp->lists[migratetype];
556 } while (list_empty(list));
557
558 do {
559 page = list_entry(list->prev, struct page, lru);
560 /* must delete as __free_one_page list manipulates */
561 list_del(&page->lru);
562 __free_one_page(page, zone, 0, migratetype);
563 trace_mm_page_pcpu_drain(page, 0, migratetype);
564 } while (--count && --batch_free && !list_empty(list));
539 } 565 }
540 spin_unlock(&zone->lock); 566 spin_unlock(&zone->lock);
541} 567}
@@ -557,7 +583,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
557 unsigned long flags; 583 unsigned long flags;
558 int i; 584 int i;
559 int bad = 0; 585 int bad = 0;
560 int wasMlocked = TestClearPageMlocked(page); 586 int wasMlocked = __TestClearPageMlocked(page);
561 587
562 kmemcheck_free_shadow(page, order); 588 kmemcheck_free_shadow(page, order);
563 589
@@ -646,7 +672,7 @@ static inline void expand(struct zone *zone, struct page *page,
646/* 672/*
647 * This page is about to be returned from the page allocator 673 * This page is about to be returned from the page allocator
648 */ 674 */
649static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 675static inline int check_new_page(struct page *page)
650{ 676{
651 if (unlikely(page_mapcount(page) | 677 if (unlikely(page_mapcount(page) |
652 (page->mapping != NULL) | 678 (page->mapping != NULL) |
@@ -655,6 +681,18 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
655 bad_page(page); 681 bad_page(page);
656 return 1; 682 return 1;
657 } 683 }
684 return 0;
685}
686
687static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
688{
689 int i;
690
691 for (i = 0; i < (1 << order); i++) {
692 struct page *p = page + i;
693 if (unlikely(check_new_page(p)))
694 return 1;
695 }
658 696
659 set_page_private(page, 0); 697 set_page_private(page, 0);
660 set_page_refcounted(page); 698 set_page_refcounted(page);
@@ -783,6 +821,17 @@ static int move_freepages_block(struct zone *zone, struct page *page,
783 return move_freepages(zone, start_page, end_page, migratetype); 821 return move_freepages(zone, start_page, end_page, migratetype);
784} 822}
785 823
824static void change_pageblock_range(struct page *pageblock_page,
825 int start_order, int migratetype)
826{
827 int nr_pageblocks = 1 << (start_order - pageblock_order);
828
829 while (nr_pageblocks--) {
830 set_pageblock_migratetype(pageblock_page, migratetype);
831 pageblock_page += pageblock_nr_pages;
832 }
833}
834
786/* Remove an element from the buddy allocator from the fallback list */ 835/* Remove an element from the buddy allocator from the fallback list */
787static inline struct page * 836static inline struct page *
788__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 837__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
@@ -836,11 +885,16 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
836 list_del(&page->lru); 885 list_del(&page->lru);
837 rmv_page_order(page); 886 rmv_page_order(page);
838 887
839 if (current_order == pageblock_order) 888 /* Take ownership for orders >= pageblock_order */
840 set_pageblock_migratetype(page, 889 if (current_order >= pageblock_order)
890 change_pageblock_range(page, current_order,
841 start_migratetype); 891 start_migratetype);
842 892
843 expand(zone, page, order, current_order, area, migratetype); 893 expand(zone, page, order, current_order, area, migratetype);
894
895 trace_mm_page_alloc_extfrag(page, order, current_order,
896 start_migratetype, migratetype);
897
844 return page; 898 return page;
845 } 899 }
846 } 900 }
@@ -874,6 +928,7 @@ retry_reserve:
874 } 928 }
875 } 929 }
876 930
931 trace_mm_page_alloc_zone_locked(page, order, migratetype);
877 return page; 932 return page;
878} 933}
879 934
@@ -934,7 +989,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
934 to_drain = pcp->batch; 989 to_drain = pcp->batch;
935 else 990 else
936 to_drain = pcp->count; 991 to_drain = pcp->count;
937 free_pages_bulk(zone, to_drain, &pcp->list, 0); 992 free_pcppages_bulk(zone, to_drain, pcp);
938 pcp->count -= to_drain; 993 pcp->count -= to_drain;
939 local_irq_restore(flags); 994 local_irq_restore(flags);
940} 995}
@@ -960,7 +1015,7 @@ static void drain_pages(unsigned int cpu)
960 1015
961 pcp = &pset->pcp; 1016 pcp = &pset->pcp;
962 local_irq_save(flags); 1017 local_irq_save(flags);
963 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 1018 free_pcppages_bulk(zone, pcp->count, pcp);
964 pcp->count = 0; 1019 pcp->count = 0;
965 local_irq_restore(flags); 1020 local_irq_restore(flags);
966 } 1021 }
@@ -1026,7 +1081,8 @@ static void free_hot_cold_page(struct page *page, int cold)
1026 struct zone *zone = page_zone(page); 1081 struct zone *zone = page_zone(page);
1027 struct per_cpu_pages *pcp; 1082 struct per_cpu_pages *pcp;
1028 unsigned long flags; 1083 unsigned long flags;
1029 int wasMlocked = TestClearPageMlocked(page); 1084 int migratetype;
1085 int wasMlocked = __TestClearPageMlocked(page);
1030 1086
1031 kmemcheck_free_shadow(page, 0); 1087 kmemcheck_free_shadow(page, 0);
1032 1088
@@ -1043,35 +1099,49 @@ static void free_hot_cold_page(struct page *page, int cold)
1043 kernel_map_pages(page, 1, 0); 1099 kernel_map_pages(page, 1, 0);
1044 1100
1045 pcp = &zone_pcp(zone, get_cpu())->pcp; 1101 pcp = &zone_pcp(zone, get_cpu())->pcp;
1046 set_page_private(page, get_pageblock_migratetype(page)); 1102 migratetype = get_pageblock_migratetype(page);
1103 set_page_private(page, migratetype);
1047 local_irq_save(flags); 1104 local_irq_save(flags);
1048 if (unlikely(wasMlocked)) 1105 if (unlikely(wasMlocked))
1049 free_page_mlock(page); 1106 free_page_mlock(page);
1050 __count_vm_event(PGFREE); 1107 __count_vm_event(PGFREE);
1051 1108
1109 /*
1110 * We only track unmovable, reclaimable and movable on pcp lists.
1111 * Free ISOLATE pages back to the allocator because they are being
1112 * offlined but treat RESERVE as movable pages so we can get those
1113 * areas back if necessary. Otherwise, we may have to free
1114 * excessively into the page allocator
1115 */
1116 if (migratetype >= MIGRATE_PCPTYPES) {
1117 if (unlikely(migratetype == MIGRATE_ISOLATE)) {
1118 free_one_page(zone, page, 0, migratetype);
1119 goto out;
1120 }
1121 migratetype = MIGRATE_MOVABLE;
1122 }
1123
1052 if (cold) 1124 if (cold)
1053 list_add_tail(&page->lru, &pcp->list); 1125 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1054 else 1126 else
1055 list_add(&page->lru, &pcp->list); 1127 list_add(&page->lru, &pcp->lists[migratetype]);
1056 pcp->count++; 1128 pcp->count++;
1057 if (pcp->count >= pcp->high) { 1129 if (pcp->count >= pcp->high) {
1058 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1130 free_pcppages_bulk(zone, pcp->batch, pcp);
1059 pcp->count -= pcp->batch; 1131 pcp->count -= pcp->batch;
1060 } 1132 }
1133
1134out:
1061 local_irq_restore(flags); 1135 local_irq_restore(flags);
1062 put_cpu(); 1136 put_cpu();
1063} 1137}
1064 1138
1065void free_hot_page(struct page *page) 1139void free_hot_page(struct page *page)
1066{ 1140{
1141 trace_mm_page_free_direct(page, 0);
1067 free_hot_cold_page(page, 0); 1142 free_hot_cold_page(page, 0);
1068} 1143}
1069 1144
1070void free_cold_page(struct page *page)
1071{
1072 free_hot_cold_page(page, 1);
1073}
1074
1075/* 1145/*
1076 * split_page takes a non-compound higher-order page, and splits it into 1146 * split_page takes a non-compound higher-order page, and splits it into
1077 * n (1<<order) sub-pages: page[0..n] 1147 * n (1<<order) sub-pages: page[0..n]
@@ -1119,35 +1189,23 @@ again:
1119 cpu = get_cpu(); 1189 cpu = get_cpu();
1120 if (likely(order == 0)) { 1190 if (likely(order == 0)) {
1121 struct per_cpu_pages *pcp; 1191 struct per_cpu_pages *pcp;
1192 struct list_head *list;
1122 1193
1123 pcp = &zone_pcp(zone, cpu)->pcp; 1194 pcp = &zone_pcp(zone, cpu)->pcp;
1195 list = &pcp->lists[migratetype];
1124 local_irq_save(flags); 1196 local_irq_save(flags);
1125 if (!pcp->count) { 1197 if (list_empty(list)) {
1126 pcp->count = rmqueue_bulk(zone, 0, 1198 pcp->count += rmqueue_bulk(zone, 0,
1127 pcp->batch, &pcp->list, 1199 pcp->batch, list,
1128 migratetype, cold); 1200 migratetype, cold);
1129 if (unlikely(!pcp->count)) 1201 if (unlikely(list_empty(list)))
1130 goto failed; 1202 goto failed;
1131 } 1203 }
1132 1204
1133 /* Find a page of the appropriate migrate type */ 1205 if (cold)
1134 if (cold) { 1206 page = list_entry(list->prev, struct page, lru);
1135 list_for_each_entry_reverse(page, &pcp->list, lru) 1207 else
1136 if (page_private(page) == migratetype) 1208 page = list_entry(list->next, struct page, lru);
1137 break;
1138 } else {
1139 list_for_each_entry(page, &pcp->list, lru)
1140 if (page_private(page) == migratetype)
1141 break;
1142 }
1143
1144 /* Allocate more to the pcp list if necessary */
1145 if (unlikely(&page->lru == &pcp->list)) {
1146 pcp->count += rmqueue_bulk(zone, 0,
1147 pcp->batch, &pcp->list,
1148 migratetype, cold);
1149 page = list_entry(pcp->list.next, struct page, lru);
1150 }
1151 1209
1152 list_del(&page->lru); 1210 list_del(&page->lru);
1153 pcp->count--; 1211 pcp->count--;
@@ -1627,10 +1685,6 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1627 1685
1628 /* We now go into synchronous reclaim */ 1686 /* We now go into synchronous reclaim */
1629 cpuset_memory_pressure_bump(); 1687 cpuset_memory_pressure_bump();
1630
1631 /*
1632 * The task's cpuset might have expanded its set of allowable nodes
1633 */
1634 p->flags |= PF_MEMALLOC; 1688 p->flags |= PF_MEMALLOC;
1635 lockdep_set_current_reclaim_state(gfp_mask); 1689 lockdep_set_current_reclaim_state(gfp_mask);
1636 reclaim_state.reclaimed_slab = 0; 1690 reclaim_state.reclaimed_slab = 0;
@@ -1765,6 +1819,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1765 1819
1766 wake_all_kswapd(order, zonelist, high_zoneidx); 1820 wake_all_kswapd(order, zonelist, high_zoneidx);
1767 1821
1822restart:
1768 /* 1823 /*
1769 * OK, we're below the kswapd watermark and have kicked background 1824 * OK, we're below the kswapd watermark and have kicked background
1770 * reclaim. Now things get more complex, so set up alloc_flags according 1825 * reclaim. Now things get more complex, so set up alloc_flags according
@@ -1772,7 +1827,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1772 */ 1827 */
1773 alloc_flags = gfp_to_alloc_flags(gfp_mask); 1828 alloc_flags = gfp_to_alloc_flags(gfp_mask);
1774 1829
1775restart:
1776 /* This is the last chance, in general, before the goto nopage. */ 1830 /* This is the last chance, in general, before the goto nopage. */
1777 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 1831 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1778 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 1832 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -1907,6 +1961,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1907 zonelist, high_zoneidx, nodemask, 1961 zonelist, high_zoneidx, nodemask,
1908 preferred_zone, migratetype); 1962 preferred_zone, migratetype);
1909 1963
1964 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
1910 return page; 1965 return page;
1911} 1966}
1912EXPORT_SYMBOL(__alloc_pages_nodemask); 1967EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -1916,44 +1971,41 @@ EXPORT_SYMBOL(__alloc_pages_nodemask);
1916 */ 1971 */
1917unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 1972unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1918{ 1973{
1919 struct page * page; 1974 struct page *page;
1975
1976 /*
1977 * __get_free_pages() returns a 32-bit address, which cannot represent
1978 * a highmem page
1979 */
1980 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1981
1920 page = alloc_pages(gfp_mask, order); 1982 page = alloc_pages(gfp_mask, order);
1921 if (!page) 1983 if (!page)
1922 return 0; 1984 return 0;
1923 return (unsigned long) page_address(page); 1985 return (unsigned long) page_address(page);
1924} 1986}
1925
1926EXPORT_SYMBOL(__get_free_pages); 1987EXPORT_SYMBOL(__get_free_pages);
1927 1988
1928unsigned long get_zeroed_page(gfp_t gfp_mask) 1989unsigned long get_zeroed_page(gfp_t gfp_mask)
1929{ 1990{
1930 struct page * page; 1991 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
1931
1932 /*
1933 * get_zeroed_page() returns a 32-bit address, which cannot represent
1934 * a highmem page
1935 */
1936 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1937
1938 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1939 if (page)
1940 return (unsigned long) page_address(page);
1941 return 0;
1942} 1992}
1943
1944EXPORT_SYMBOL(get_zeroed_page); 1993EXPORT_SYMBOL(get_zeroed_page);
1945 1994
1946void __pagevec_free(struct pagevec *pvec) 1995void __pagevec_free(struct pagevec *pvec)
1947{ 1996{
1948 int i = pagevec_count(pvec); 1997 int i = pagevec_count(pvec);
1949 1998
1950 while (--i >= 0) 1999 while (--i >= 0) {
2000 trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
1951 free_hot_cold_page(pvec->pages[i], pvec->cold); 2001 free_hot_cold_page(pvec->pages[i], pvec->cold);
2002 }
1952} 2003}
1953 2004
1954void __free_pages(struct page *page, unsigned int order) 2005void __free_pages(struct page *page, unsigned int order)
1955{ 2006{
1956 if (put_page_testzero(page)) { 2007 if (put_page_testzero(page)) {
2008 trace_mm_page_free_direct(page, order);
1957 if (order == 0) 2009 if (order == 0)
1958 free_hot_page(page); 2010 free_hot_page(page);
1959 else 2011 else
@@ -2128,23 +2180,28 @@ void show_free_areas(void)
2128 } 2180 }
2129 } 2181 }
2130 2182
2131 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" 2183 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
2132 " inactive_file:%lu" 2184 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
2133 " unevictable:%lu" 2185 " unevictable:%lu"
2134 " dirty:%lu writeback:%lu unstable:%lu\n" 2186 " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n"
2135 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 2187 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2188 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
2136 global_page_state(NR_ACTIVE_ANON), 2189 global_page_state(NR_ACTIVE_ANON),
2137 global_page_state(NR_ACTIVE_FILE),
2138 global_page_state(NR_INACTIVE_ANON), 2190 global_page_state(NR_INACTIVE_ANON),
2191 global_page_state(NR_ISOLATED_ANON),
2192 global_page_state(NR_ACTIVE_FILE),
2139 global_page_state(NR_INACTIVE_FILE), 2193 global_page_state(NR_INACTIVE_FILE),
2194 global_page_state(NR_ISOLATED_FILE),
2140 global_page_state(NR_UNEVICTABLE), 2195 global_page_state(NR_UNEVICTABLE),
2141 global_page_state(NR_FILE_DIRTY), 2196 global_page_state(NR_FILE_DIRTY),
2142 global_page_state(NR_WRITEBACK), 2197 global_page_state(NR_WRITEBACK),
2143 global_page_state(NR_UNSTABLE_NFS), 2198 global_page_state(NR_UNSTABLE_NFS),
2199 nr_blockdev_pages(),
2144 global_page_state(NR_FREE_PAGES), 2200 global_page_state(NR_FREE_PAGES),
2145 global_page_state(NR_SLAB_RECLAIMABLE) + 2201 global_page_state(NR_SLAB_RECLAIMABLE),
2146 global_page_state(NR_SLAB_UNRECLAIMABLE), 2202 global_page_state(NR_SLAB_UNRECLAIMABLE),
2147 global_page_state(NR_FILE_MAPPED), 2203 global_page_state(NR_FILE_MAPPED),
2204 global_page_state(NR_SHMEM),
2148 global_page_state(NR_PAGETABLE), 2205 global_page_state(NR_PAGETABLE),
2149 global_page_state(NR_BOUNCE)); 2206 global_page_state(NR_BOUNCE));
2150 2207
@@ -2162,7 +2219,21 @@ void show_free_areas(void)
2162 " active_file:%lukB" 2219 " active_file:%lukB"
2163 " inactive_file:%lukB" 2220 " inactive_file:%lukB"
2164 " unevictable:%lukB" 2221 " unevictable:%lukB"
2222 " isolated(anon):%lukB"
2223 " isolated(file):%lukB"
2165 " present:%lukB" 2224 " present:%lukB"
2225 " mlocked:%lukB"
2226 " dirty:%lukB"
2227 " writeback:%lukB"
2228 " mapped:%lukB"
2229 " shmem:%lukB"
2230 " slab_reclaimable:%lukB"
2231 " slab_unreclaimable:%lukB"
2232 " kernel_stack:%lukB"
2233 " pagetables:%lukB"
2234 " unstable:%lukB"
2235 " bounce:%lukB"
2236 " writeback_tmp:%lukB"
2166 " pages_scanned:%lu" 2237 " pages_scanned:%lu"
2167 " all_unreclaimable? %s" 2238 " all_unreclaimable? %s"
2168 "\n", 2239 "\n",
@@ -2176,7 +2247,22 @@ void show_free_areas(void)
2176 K(zone_page_state(zone, NR_ACTIVE_FILE)), 2247 K(zone_page_state(zone, NR_ACTIVE_FILE)),
2177 K(zone_page_state(zone, NR_INACTIVE_FILE)), 2248 K(zone_page_state(zone, NR_INACTIVE_FILE)),
2178 K(zone_page_state(zone, NR_UNEVICTABLE)), 2249 K(zone_page_state(zone, NR_UNEVICTABLE)),
2250 K(zone_page_state(zone, NR_ISOLATED_ANON)),
2251 K(zone_page_state(zone, NR_ISOLATED_FILE)),
2179 K(zone->present_pages), 2252 K(zone->present_pages),
2253 K(zone_page_state(zone, NR_MLOCK)),
2254 K(zone_page_state(zone, NR_FILE_DIRTY)),
2255 K(zone_page_state(zone, NR_WRITEBACK)),
2256 K(zone_page_state(zone, NR_FILE_MAPPED)),
2257 K(zone_page_state(zone, NR_SHMEM)),
2258 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
2259 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
2260 zone_page_state(zone, NR_KERNEL_STACK) *
2261 THREAD_SIZE / 1024,
2262 K(zone_page_state(zone, NR_PAGETABLE)),
2263 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
2264 K(zone_page_state(zone, NR_BOUNCE)),
2265 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2180 zone->pages_scanned, 2266 zone->pages_scanned,
2181 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 2267 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
2182 ); 2268 );
@@ -2305,7 +2391,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order);
2305 * sysctl handler for numa_zonelist_order 2391 * sysctl handler for numa_zonelist_order
2306 */ 2392 */
2307int numa_zonelist_order_handler(ctl_table *table, int write, 2393int numa_zonelist_order_handler(ctl_table *table, int write,
2308 struct file *file, void __user *buffer, size_t *length, 2394 void __user *buffer, size_t *length,
2309 loff_t *ppos) 2395 loff_t *ppos)
2310{ 2396{
2311 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 2397 char saved_string[NUMA_ZONELIST_ORDER_LEN];
@@ -2314,7 +2400,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2314 if (write) 2400 if (write)
2315 strncpy(saved_string, (char*)table->data, 2401 strncpy(saved_string, (char*)table->data,
2316 NUMA_ZONELIST_ORDER_LEN); 2402 NUMA_ZONELIST_ORDER_LEN);
2317 ret = proc_dostring(table, write, file, buffer, length, ppos); 2403 ret = proc_dostring(table, write, buffer, length, ppos);
2318 if (ret) 2404 if (ret)
2319 return ret; 2405 return ret;
2320 if (write) { 2406 if (write) {
@@ -2783,7 +2869,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2783{ 2869{
2784 unsigned long start_pfn, pfn, end_pfn; 2870 unsigned long start_pfn, pfn, end_pfn;
2785 struct page *page; 2871 struct page *page;
2786 unsigned long reserve, block_migratetype; 2872 unsigned long block_migratetype;
2873 int reserve;
2787 2874
2788 /* Get the start pfn, end pfn and the number of blocks to reserve */ 2875 /* Get the start pfn, end pfn and the number of blocks to reserve */
2789 start_pfn = zone->zone_start_pfn; 2876 start_pfn = zone->zone_start_pfn;
@@ -2791,6 +2878,15 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2791 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 2878 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
2792 pageblock_order; 2879 pageblock_order;
2793 2880
2881 /*
2882 * Reserve blocks are generally in place to help high-order atomic
2883 * allocations that are short-lived. A min_free_kbytes value that
2884 * would result in more than 2 reserve blocks for atomic allocations
2885 * is assumed to be in place to help anti-fragmentation for the
2886 * future allocation of hugepages at runtime.
2887 */
2888 reserve = min(2, reserve);
2889
2794 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 2890 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
2795 if (!pfn_valid(pfn)) 2891 if (!pfn_valid(pfn))
2796 continue; 2892 continue;
@@ -2961,6 +3057,7 @@ static int zone_batchsize(struct zone *zone)
2961static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 3057static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2962{ 3058{
2963 struct per_cpu_pages *pcp; 3059 struct per_cpu_pages *pcp;
3060 int migratetype;
2964 3061
2965 memset(p, 0, sizeof(*p)); 3062 memset(p, 0, sizeof(*p));
2966 3063
@@ -2968,7 +3065,8 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2968 pcp->count = 0; 3065 pcp->count = 0;
2969 pcp->high = 6 * batch; 3066 pcp->high = 6 * batch;
2970 pcp->batch = max(1UL, 1 * batch); 3067 pcp->batch = max(1UL, 1 * batch);
2971 INIT_LIST_HEAD(&pcp->list); 3068 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
3069 INIT_LIST_HEAD(&pcp->lists[migratetype]);
2972} 3070}
2973 3071
2974/* 3072/*
@@ -3146,6 +3244,32 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3146 return 0; 3244 return 0;
3147} 3245}
3148 3246
3247static int __zone_pcp_update(void *data)
3248{
3249 struct zone *zone = data;
3250 int cpu;
3251 unsigned long batch = zone_batchsize(zone), flags;
3252
3253 for (cpu = 0; cpu < NR_CPUS; cpu++) {
3254 struct per_cpu_pageset *pset;
3255 struct per_cpu_pages *pcp;
3256
3257 pset = zone_pcp(zone, cpu);
3258 pcp = &pset->pcp;
3259
3260 local_irq_save(flags);
3261 free_pcppages_bulk(zone, pcp->count, pcp);
3262 setup_pageset(pset, batch);
3263 local_irq_restore(flags);
3264 }
3265 return 0;
3266}
3267
3268void zone_pcp_update(struct zone *zone)
3269{
3270 stop_machine(__zone_pcp_update, zone, NULL);
3271}
3272
3149static __meminit void zone_pcp_init(struct zone *zone) 3273static __meminit void zone_pcp_init(struct zone *zone)
3150{ 3274{
3151 int cpu; 3275 int cpu;
@@ -3720,7 +3844,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3720 zone_pcp_init(zone); 3844 zone_pcp_init(zone);
3721 for_each_lru(l) { 3845 for_each_lru(l) {
3722 INIT_LIST_HEAD(&zone->lru[l].list); 3846 INIT_LIST_HEAD(&zone->lru[l].list);
3723 zone->lru[l].nr_saved_scan = 0; 3847 zone->reclaim_stat.nr_saved_scan[l] = 0;
3724 } 3848 }
3725 zone->reclaim_stat.recent_rotated[0] = 0; 3849 zone->reclaim_stat.recent_rotated[0] = 0;
3726 zone->reclaim_stat.recent_rotated[1] = 0; 3850 zone->reclaim_stat.recent_rotated[1] = 0;
@@ -4509,7 +4633,7 @@ void setup_per_zone_wmarks(void)
4509 calculate_totalreserve_pages(); 4633 calculate_totalreserve_pages();
4510} 4634}
4511 4635
4512/** 4636/*
4513 * The inactive anon list should be small enough that the VM never has to 4637 * The inactive anon list should be small enough that the VM never has to
4514 * do too much work, but large enough that each inactive page has a chance 4638 * do too much work, but large enough that each inactive page has a chance
4515 * to be referenced again before it is swapped out. 4639 * to be referenced again before it is swapped out.
@@ -4600,9 +4724,9 @@ module_init(init_per_zone_wmark_min)
4600 * changes. 4724 * changes.
4601 */ 4725 */
4602int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 4726int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4603 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4727 void __user *buffer, size_t *length, loff_t *ppos)
4604{ 4728{
4605 proc_dointvec(table, write, file, buffer, length, ppos); 4729 proc_dointvec(table, write, buffer, length, ppos);
4606 if (write) 4730 if (write)
4607 setup_per_zone_wmarks(); 4731 setup_per_zone_wmarks();
4608 return 0; 4732 return 0;
@@ -4610,12 +4734,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4610 4734
4611#ifdef CONFIG_NUMA 4735#ifdef CONFIG_NUMA
4612int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 4736int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
4613 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4737 void __user *buffer, size_t *length, loff_t *ppos)
4614{ 4738{
4615 struct zone *zone; 4739 struct zone *zone;
4616 int rc; 4740 int rc;
4617 4741
4618 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4742 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
4619 if (rc) 4743 if (rc)
4620 return rc; 4744 return rc;
4621 4745
@@ -4626,12 +4750,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
4626} 4750}
4627 4751
4628int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 4752int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4629 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4753 void __user *buffer, size_t *length, loff_t *ppos)
4630{ 4754{
4631 struct zone *zone; 4755 struct zone *zone;
4632 int rc; 4756 int rc;
4633 4757
4634 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4758 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
4635 if (rc) 4759 if (rc)
4636 return rc; 4760 return rc;
4637 4761
@@ -4652,9 +4776,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4652 * if in function of the boot time zone sizes. 4776 * if in function of the boot time zone sizes.
4653 */ 4777 */
4654int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 4778int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
4655 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4779 void __user *buffer, size_t *length, loff_t *ppos)
4656{ 4780{
4657 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4781 proc_dointvec_minmax(table, write, buffer, length, ppos);
4658 setup_per_zone_lowmem_reserve(); 4782 setup_per_zone_lowmem_reserve();
4659 return 0; 4783 return 0;
4660} 4784}
@@ -4666,13 +4790,13 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
4666 */ 4790 */
4667 4791
4668int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 4792int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4669 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4793 void __user *buffer, size_t *length, loff_t *ppos)
4670{ 4794{
4671 struct zone *zone; 4795 struct zone *zone;
4672 unsigned int cpu; 4796 unsigned int cpu;
4673 int ret; 4797 int ret;
4674 4798
4675 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4799 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
4676 if (!write || (ret == -EINVAL)) 4800 if (!write || (ret == -EINVAL))
4677 return ret; 4801 return ret;
4678 for_each_populated_zone(zone) { 4802 for_each_populated_zone(zone) {
@@ -4732,7 +4856,14 @@ void *__init alloc_large_system_hash(const char *tablename,
4732 numentries <<= (PAGE_SHIFT - scale); 4856 numentries <<= (PAGE_SHIFT - scale);
4733 4857
4734 /* Make sure we've got at least a 0-order allocation.. */ 4858 /* Make sure we've got at least a 0-order allocation.. */
4735 if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 4859 if (unlikely(flags & HASH_SMALL)) {
4860 /* Makes no sense without HASH_EARLY */
4861 WARN_ON(!(flags & HASH_EARLY));
4862 if (!(numentries >> *_hash_shift)) {
4863 numentries = 1UL << *_hash_shift;
4864 BUG_ON(!numentries);
4865 }
4866 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
4736 numentries = PAGE_SIZE / bucketsize; 4867 numentries = PAGE_SIZE / bucketsize;
4737 } 4868 }
4738 numentries = roundup_pow_of_two(numentries); 4869 numentries = roundup_pow_of_two(numentries);
@@ -4874,13 +5005,16 @@ int set_migratetype_isolate(struct page *page)
4874 struct zone *zone; 5005 struct zone *zone;
4875 unsigned long flags; 5006 unsigned long flags;
4876 int ret = -EBUSY; 5007 int ret = -EBUSY;
5008 int zone_idx;
4877 5009
4878 zone = page_zone(page); 5010 zone = page_zone(page);
5011 zone_idx = zone_idx(zone);
4879 spin_lock_irqsave(&zone->lock, flags); 5012 spin_lock_irqsave(&zone->lock, flags);
4880 /* 5013 /*
4881 * In future, more migrate types will be able to be isolation target. 5014 * In future, more migrate types will be able to be isolation target.
4882 */ 5015 */
4883 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) 5016 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
5017 zone_idx != ZONE_MOVABLE)
4884 goto out; 5018 goto out;
4885 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5019 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
4886 move_freepages_block(zone, page, MIGRATE_ISOLATE); 5020 move_freepages_block(zone, page, MIGRATE_ISOLATE);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index f22b4ebbd8dc..3d535d594826 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -116,10 +116,16 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
116 nid = page_to_nid(pfn_to_page(pfn)); 116 nid = page_to_nid(pfn_to_page(pfn));
117 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 117 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
118 VM_BUG_ON(!slab_is_available()); 118 VM_BUG_ON(!slab_is_available());
119 base = kmalloc_node(table_size, 119 if (node_state(nid, N_HIGH_MEMORY)) {
120 base = kmalloc_node(table_size,
120 GFP_KERNEL | __GFP_NOWARN, nid); 121 GFP_KERNEL | __GFP_NOWARN, nid);
121 if (!base) 122 if (!base)
122 base = vmalloc_node(table_size, nid); 123 base = vmalloc_node(table_size, nid);
124 } else {
125 base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
126 if (!base)
127 base = vmalloc(table_size);
128 }
123 } else { 129 } else {
124 /* 130 /*
125 * We don't have to allocate page_cgroup again, but 131 * We don't have to allocate page_cgroup again, but
diff --git a/mm/percpu.c b/mm/percpu.c
index 43d8cacfdaa5..4a048abad043 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1043,7 +1043,9 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
1043 */ 1043 */
1044static void *pcpu_alloc(size_t size, size_t align, bool reserved) 1044static void *pcpu_alloc(size_t size, size_t align, bool reserved)
1045{ 1045{
1046 static int warn_limit = 10;
1046 struct pcpu_chunk *chunk; 1047 struct pcpu_chunk *chunk;
1048 const char *err;
1047 int slot, off; 1049 int slot, off;
1048 1050
1049 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { 1051 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
@@ -1059,11 +1061,14 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
1059 if (reserved && pcpu_reserved_chunk) { 1061 if (reserved && pcpu_reserved_chunk) {
1060 chunk = pcpu_reserved_chunk; 1062 chunk = pcpu_reserved_chunk;
1061 if (size > chunk->contig_hint || 1063 if (size > chunk->contig_hint ||
1062 pcpu_extend_area_map(chunk) < 0) 1064 pcpu_extend_area_map(chunk) < 0) {
1065 err = "failed to extend area map of reserved chunk";
1063 goto fail_unlock; 1066 goto fail_unlock;
1067 }
1064 off = pcpu_alloc_area(chunk, size, align); 1068 off = pcpu_alloc_area(chunk, size, align);
1065 if (off >= 0) 1069 if (off >= 0)
1066 goto area_found; 1070 goto area_found;
1071 err = "alloc from reserved chunk failed";
1067 goto fail_unlock; 1072 goto fail_unlock;
1068 } 1073 }
1069 1074
@@ -1080,6 +1085,7 @@ restart:
1080 case 1: 1085 case 1:
1081 goto restart; /* pcpu_lock dropped, restart */ 1086 goto restart; /* pcpu_lock dropped, restart */
1082 default: 1087 default:
1088 err = "failed to extend area map";
1083 goto fail_unlock; 1089 goto fail_unlock;
1084 } 1090 }
1085 1091
@@ -1093,8 +1099,10 @@ restart:
1093 spin_unlock_irq(&pcpu_lock); 1099 spin_unlock_irq(&pcpu_lock);
1094 1100
1095 chunk = alloc_pcpu_chunk(); 1101 chunk = alloc_pcpu_chunk();
1096 if (!chunk) 1102 if (!chunk) {
1103 err = "failed to allocate new chunk";
1097 goto fail_unlock_mutex; 1104 goto fail_unlock_mutex;
1105 }
1098 1106
1099 spin_lock_irq(&pcpu_lock); 1107 spin_lock_irq(&pcpu_lock);
1100 pcpu_chunk_relocate(chunk, -1); 1108 pcpu_chunk_relocate(chunk, -1);
@@ -1107,6 +1115,7 @@ area_found:
1107 if (pcpu_populate_chunk(chunk, off, size)) { 1115 if (pcpu_populate_chunk(chunk, off, size)) {
1108 spin_lock_irq(&pcpu_lock); 1116 spin_lock_irq(&pcpu_lock);
1109 pcpu_free_area(chunk, off); 1117 pcpu_free_area(chunk, off);
1118 err = "failed to populate";
1110 goto fail_unlock; 1119 goto fail_unlock;
1111 } 1120 }
1112 1121
@@ -1119,6 +1128,13 @@ fail_unlock:
1119 spin_unlock_irq(&pcpu_lock); 1128 spin_unlock_irq(&pcpu_lock);
1120fail_unlock_mutex: 1129fail_unlock_mutex:
1121 mutex_unlock(&pcpu_alloc_mutex); 1130 mutex_unlock(&pcpu_alloc_mutex);
1131 if (warn_limit) {
1132 pr_warning("PERCPU: allocation failed, size=%zu align=%zu, "
1133 "%s\n", size, align, err);
1134 dump_stack();
1135 if (!--warn_limit)
1136 pr_info("PERCPU: limit reached, disable warning\n");
1137 }
1122 return NULL; 1138 return NULL;
1123} 1139}
1124 1140
@@ -1347,6 +1363,10 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1347 struct pcpu_alloc_info *ai; 1363 struct pcpu_alloc_info *ai;
1348 unsigned int *cpu_map; 1364 unsigned int *cpu_map;
1349 1365
1366 /* this function may be called multiple times */
1367 memset(group_map, 0, sizeof(group_map));
1368 memset(group_cnt, 0, sizeof(group_map));
1369
1350 /* 1370 /*
1351 * Determine min_unit_size, alloc_size and max_upa such that 1371 * Determine min_unit_size, alloc_size and max_upa such that
1352 * alloc_size is multiple of atom_size and is the smallest 1372 * alloc_size is multiple of atom_size and is the smallest
@@ -1574,6 +1594,7 @@ static void pcpu_dump_alloc_info(const char *lvl,
1574int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, 1594int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1575 void *base_addr) 1595 void *base_addr)
1576{ 1596{
1597 static char cpus_buf[4096] __initdata;
1577 static int smap[2], dmap[2]; 1598 static int smap[2], dmap[2];
1578 size_t dyn_size = ai->dyn_size; 1599 size_t dyn_size = ai->dyn_size;
1579 size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; 1600 size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
@@ -1585,17 +1606,26 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1585 int *unit_map; 1606 int *unit_map;
1586 int group, unit, i; 1607 int group, unit, i;
1587 1608
1609 cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);
1610
1611#define PCPU_SETUP_BUG_ON(cond) do { \
1612 if (unlikely(cond)) { \
1613 pr_emerg("PERCPU: failed to initialize, %s", #cond); \
1614 pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \
1615 pcpu_dump_alloc_info(KERN_EMERG, ai); \
1616 BUG(); \
1617 } \
1618} while (0)
1619
1588 /* sanity checks */ 1620 /* sanity checks */
1589 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || 1621 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
1590 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); 1622 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
1591 BUG_ON(ai->nr_groups <= 0); 1623 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
1592 BUG_ON(!ai->static_size); 1624 PCPU_SETUP_BUG_ON(!ai->static_size);
1593 BUG_ON(!base_addr); 1625 PCPU_SETUP_BUG_ON(!base_addr);
1594 BUG_ON(ai->unit_size < size_sum); 1626 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1595 BUG_ON(ai->unit_size & ~PAGE_MASK); 1627 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
1596 BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); 1628 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
1597
1598 pcpu_dump_alloc_info(KERN_DEBUG, ai);
1599 1629
1600 /* process group information and build config tables accordingly */ 1630 /* process group information and build config tables accordingly */
1601 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); 1631 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
@@ -1604,7 +1634,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1604 unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); 1634 unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
1605 1635
1606 for (cpu = 0; cpu < nr_cpu_ids; cpu++) 1636 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1607 unit_map[cpu] = NR_CPUS; 1637 unit_map[cpu] = UINT_MAX;
1608 pcpu_first_unit_cpu = NR_CPUS; 1638 pcpu_first_unit_cpu = NR_CPUS;
1609 1639
1610 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { 1640 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
@@ -1618,8 +1648,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1618 if (cpu == NR_CPUS) 1648 if (cpu == NR_CPUS)
1619 continue; 1649 continue;
1620 1650
1621 BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu)); 1651 PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids);
1622 BUG_ON(unit_map[cpu] != NR_CPUS); 1652 PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
1653 PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
1623 1654
1624 unit_map[cpu] = unit + i; 1655 unit_map[cpu] = unit + i;
1625 unit_off[cpu] = gi->base_offset + i * ai->unit_size; 1656 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
@@ -1632,7 +1663,11 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1632 pcpu_nr_units = unit; 1663 pcpu_nr_units = unit;
1633 1664
1634 for_each_possible_cpu(cpu) 1665 for_each_possible_cpu(cpu)
1635 BUG_ON(unit_map[cpu] == NR_CPUS); 1666 PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
1667
1668 /* we're done parsing the input, undefine BUG macro and dump config */
1669#undef PCPU_SETUP_BUG_ON
1670 pcpu_dump_alloc_info(KERN_INFO, ai);
1636 1671
1637 pcpu_nr_groups = ai->nr_groups; 1672 pcpu_nr_groups = ai->nr_groups;
1638 pcpu_group_offsets = group_offsets; 1673 pcpu_group_offsets = group_offsets;
@@ -1782,7 +1817,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
1782 void *base = (void *)ULONG_MAX; 1817 void *base = (void *)ULONG_MAX;
1783 void **areas = NULL; 1818 void **areas = NULL;
1784 struct pcpu_alloc_info *ai; 1819 struct pcpu_alloc_info *ai;
1785 size_t size_sum, areas_size; 1820 size_t size_sum, areas_size, max_distance;
1786 int group, i, rc; 1821 int group, i, rc;
1787 1822
1788 ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, 1823 ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
@@ -1832,8 +1867,24 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
1832 } 1867 }
1833 1868
1834 /* base address is now known, determine group base offsets */ 1869 /* base address is now known, determine group base offsets */
1835 for (group = 0; group < ai->nr_groups; group++) 1870 max_distance = 0;
1871 for (group = 0; group < ai->nr_groups; group++) {
1836 ai->groups[group].base_offset = areas[group] - base; 1872 ai->groups[group].base_offset = areas[group] - base;
1873 max_distance = max(max_distance, ai->groups[group].base_offset);
1874 }
1875 max_distance += ai->unit_size;
1876
1877 /* warn if maximum distance is further than 75% of vmalloc space */
1878 if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
1879 pr_warning("PERCPU: max_distance=0x%lx too large for vmalloc "
1880 "space 0x%lx\n",
1881 max_distance, VMALLOC_END - VMALLOC_START);
1882#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1883 /* and fail if we have fallback */
1884 rc = -EINVAL;
1885 goto out_free;
1886#endif
1887 }
1837 1888
1838 pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", 1889 pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
1839 PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, 1890 PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 6eedf7e473d1..6633965bb27b 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -29,7 +29,6 @@ static unsigned long max_pages(unsigned long min_pages)
29 int node = numa_node_id(); 29 int node = numa_node_id();
30 struct zone *zones = NODE_DATA(node)->node_zones; 30 struct zone *zones = NODE_DATA(node)->node_zones;
31 int num_cpus_on_node; 31 int num_cpus_on_node;
32 const struct cpumask *cpumask_on_node = cpumask_of_node(node);
33 32
34 node_free_pages = 33 node_free_pages =
35#ifdef CONFIG_ZONE_DMA 34#ifdef CONFIG_ZONE_DMA
@@ -42,7 +41,7 @@ static unsigned long max_pages(unsigned long min_pages)
42 41
43 max = node_free_pages / FRACTION_OF_NODE_MEM; 42 max = node_free_pages / FRACTION_OF_NODE_MEM;
44 43
45 num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); 44 num_cpus_on_node = cpumask_weight(cpumask_of_node(node));
46 max /= num_cpus_on_node; 45 max /= num_cpus_on_node;
47 46
48 return max(max, min_pages); 47 return max(max, min_pages);
diff --git a/mm/rmap.c b/mm/rmap.c
index 0895b5c7cbff..dd43373a483f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,6 +36,11 @@
36 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_lock in __sync_single_inode) 38 * within inode_lock in __sync_single_inode)
39 *
40 * (code doesn't rely on that order so it could be switched around)
41 * ->tasklist_lock
42 * anon_vma->lock (memory_failure, collect_procs_anon)
43 * pte map lock
39 */ 44 */
40 45
41#include <linux/mm.h> 46#include <linux/mm.h>
@@ -191,7 +196,7 @@ void __init anon_vma_init(void)
191 * Getting a lock on a stable anon_vma from a page off the LRU is 196 * Getting a lock on a stable anon_vma from a page off the LRU is
192 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 197 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
193 */ 198 */
194static struct anon_vma *page_lock_anon_vma(struct page *page) 199struct anon_vma *page_lock_anon_vma(struct page *page)
195{ 200{
196 struct anon_vma *anon_vma; 201 struct anon_vma *anon_vma;
197 unsigned long anon_mapping; 202 unsigned long anon_mapping;
@@ -211,7 +216,7 @@ out:
211 return NULL; 216 return NULL;
212} 217}
213 218
214static void page_unlock_anon_vma(struct anon_vma *anon_vma) 219void page_unlock_anon_vma(struct anon_vma *anon_vma)
215{ 220{
216 spin_unlock(&anon_vma->lock); 221 spin_unlock(&anon_vma->lock);
217 rcu_read_unlock(); 222 rcu_read_unlock();
@@ -237,8 +242,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
237} 242}
238 243
239/* 244/*
240 * At what user virtual address is page expected in vma? checking that the 245 * At what user virtual address is page expected in vma?
241 * page matches the vma: currently only used on anon pages, by unuse_vma; 246 * checking that the page matches the vma.
242 */ 247 */
243unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 248unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
244{ 249{
@@ -311,7 +316,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
311 * if the page is not mapped into the page tables of this VMA. Only 316 * if the page is not mapped into the page tables of this VMA. Only
312 * valid for normal file or anonymous VMAs. 317 * valid for normal file or anonymous VMAs.
313 */ 318 */
314static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) 319int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
315{ 320{
316 unsigned long address; 321 unsigned long address;
317 pte_t *pte; 322 pte_t *pte;
@@ -710,27 +715,6 @@ void page_add_file_rmap(struct page *page)
710 } 715 }
711} 716}
712 717
713#ifdef CONFIG_DEBUG_VM
714/**
715 * page_dup_rmap - duplicate pte mapping to a page
716 * @page: the page to add the mapping to
717 * @vma: the vm area being duplicated
718 * @address: the user virtual address mapped
719 *
720 * For copy_page_range only: minimal extract from page_add_file_rmap /
721 * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
722 * quicker.
723 *
724 * The caller needs to hold the pte lock.
725 */
726void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
727{
728 if (PageAnon(page))
729 __page_check_anon_rmap(page, vma, address);
730 atomic_inc(&page->_mapcount);
731}
732#endif
733
734/** 718/**
735 * page_remove_rmap - take down pte mapping from a page 719 * page_remove_rmap - take down pte mapping from a page
736 * @page: page to remove mapping from 720 * @page: page to remove mapping from
@@ -739,34 +723,37 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long
739 */ 723 */
740void page_remove_rmap(struct page *page) 724void page_remove_rmap(struct page *page)
741{ 725{
742 if (atomic_add_negative(-1, &page->_mapcount)) { 726 /* page still mapped by someone else? */
743 /* 727 if (!atomic_add_negative(-1, &page->_mapcount))
744 * Now that the last pte has gone, s390 must transfer dirty 728 return;
745 * flag from storage key to struct page. We can usually skip 729
746 * this if the page is anon, so about to be freed; but perhaps 730 /*
747 * not if it's in swapcache - there might be another pte slot 731 * Now that the last pte has gone, s390 must transfer dirty
748 * containing the swap entry, but page not yet written to swap. 732 * flag from storage key to struct page. We can usually skip
749 */ 733 * this if the page is anon, so about to be freed; but perhaps
750 if ((!PageAnon(page) || PageSwapCache(page)) && 734 * not if it's in swapcache - there might be another pte slot
751 page_test_dirty(page)) { 735 * containing the swap entry, but page not yet written to swap.
752 page_clear_dirty(page); 736 */
753 set_page_dirty(page); 737 if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
754 } 738 page_clear_dirty(page);
755 if (PageAnon(page)) 739 set_page_dirty(page);
756 mem_cgroup_uncharge_page(page); 740 }
757 __dec_zone_page_state(page, 741 if (PageAnon(page)) {
758 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); 742 mem_cgroup_uncharge_page(page);
759 mem_cgroup_update_mapped_file_stat(page, -1); 743 __dec_zone_page_state(page, NR_ANON_PAGES);
760 /* 744 } else {
761 * It would be tidy to reset the PageAnon mapping here, 745 __dec_zone_page_state(page, NR_FILE_MAPPED);
762 * but that might overwrite a racing page_add_anon_rmap
763 * which increments mapcount after us but sets mapping
764 * before us: so leave the reset to free_hot_cold_page,
765 * and remember that it's only reliable while mapped.
766 * Leaving it set also helps swapoff to reinstate ptes
767 * faster for those pages still in swapcache.
768 */
769 } 746 }
747 mem_cgroup_update_mapped_file_stat(page, -1);
748 /*
749 * It would be tidy to reset the PageAnon mapping here,
750 * but that might overwrite a racing page_add_anon_rmap
751 * which increments mapcount after us but sets mapping
752 * before us: so leave the reset to free_hot_cold_page,
753 * and remember that it's only reliable while mapped.
754 * Leaving it set also helps swapoff to reinstate ptes
755 * faster for those pages still in swapcache.
756 */
770} 757}
771 758
772/* 759/*
@@ -774,7 +761,7 @@ void page_remove_rmap(struct page *page)
774 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 761 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
775 */ 762 */
776static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 763static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
777 int migration) 764 enum ttu_flags flags)
778{ 765{
779 struct mm_struct *mm = vma->vm_mm; 766 struct mm_struct *mm = vma->vm_mm;
780 unsigned long address; 767 unsigned long address;
@@ -796,11 +783,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
796 * If it's recently referenced (perhaps page_referenced 783 * If it's recently referenced (perhaps page_referenced
797 * skipped over this mm) then we should reactivate it. 784 * skipped over this mm) then we should reactivate it.
798 */ 785 */
799 if (!migration) { 786 if (!(flags & TTU_IGNORE_MLOCK)) {
800 if (vma->vm_flags & VM_LOCKED) { 787 if (vma->vm_flags & VM_LOCKED) {
801 ret = SWAP_MLOCK; 788 ret = SWAP_MLOCK;
802 goto out_unmap; 789 goto out_unmap;
803 } 790 }
791 }
792 if (!(flags & TTU_IGNORE_ACCESS)) {
804 if (ptep_clear_flush_young_notify(vma, address, pte)) { 793 if (ptep_clear_flush_young_notify(vma, address, pte)) {
805 ret = SWAP_FAIL; 794 ret = SWAP_FAIL;
806 goto out_unmap; 795 goto out_unmap;
@@ -818,7 +807,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
818 /* Update high watermark before we lower rss */ 807 /* Update high watermark before we lower rss */
819 update_hiwater_rss(mm); 808 update_hiwater_rss(mm);
820 809
821 if (PageAnon(page)) { 810 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
811 if (PageAnon(page))
812 dec_mm_counter(mm, anon_rss);
813 else
814 dec_mm_counter(mm, file_rss);
815 set_pte_at(mm, address, pte,
816 swp_entry_to_pte(make_hwpoison_entry(page)));
817 } else if (PageAnon(page)) {
822 swp_entry_t entry = { .val = page_private(page) }; 818 swp_entry_t entry = { .val = page_private(page) };
823 819
824 if (PageSwapCache(page)) { 820 if (PageSwapCache(page)) {
@@ -840,12 +836,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
840 * pte. do_swap_page() will wait until the migration 836 * pte. do_swap_page() will wait until the migration
841 * pte is removed and then restart fault handling. 837 * pte is removed and then restart fault handling.
842 */ 838 */
843 BUG_ON(!migration); 839 BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
844 entry = make_migration_entry(page, pte_write(pteval)); 840 entry = make_migration_entry(page, pte_write(pteval));
845 } 841 }
846 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 842 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
847 BUG_ON(pte_file(*pte)); 843 BUG_ON(pte_file(*pte));
848 } else if (PAGE_MIGRATION && migration) { 844 } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
849 /* Establish migration entry for a file page */ 845 /* Establish migration entry for a file page */
850 swp_entry_t entry; 846 swp_entry_t entry;
851 entry = make_migration_entry(page, pte_write(pteval)); 847 entry = make_migration_entry(page, pte_write(pteval));
@@ -1014,12 +1010,13 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
1014 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1010 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1015 * 'LOCKED. 1011 * 'LOCKED.
1016 */ 1012 */
1017static int try_to_unmap_anon(struct page *page, int unlock, int migration) 1013static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1018{ 1014{
1019 struct anon_vma *anon_vma; 1015 struct anon_vma *anon_vma;
1020 struct vm_area_struct *vma; 1016 struct vm_area_struct *vma;
1021 unsigned int mlocked = 0; 1017 unsigned int mlocked = 0;
1022 int ret = SWAP_AGAIN; 1018 int ret = SWAP_AGAIN;
1019 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1023 1020
1024 if (MLOCK_PAGES && unlikely(unlock)) 1021 if (MLOCK_PAGES && unlikely(unlock))
1025 ret = SWAP_SUCCESS; /* default for try_to_munlock() */ 1022 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
@@ -1035,7 +1032,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1035 continue; /* must visit all unlocked vmas */ 1032 continue; /* must visit all unlocked vmas */
1036 ret = SWAP_MLOCK; /* saw at least one mlocked vma */ 1033 ret = SWAP_MLOCK; /* saw at least one mlocked vma */
1037 } else { 1034 } else {
1038 ret = try_to_unmap_one(page, vma, migration); 1035 ret = try_to_unmap_one(page, vma, flags);
1039 if (ret == SWAP_FAIL || !page_mapped(page)) 1036 if (ret == SWAP_FAIL || !page_mapped(page))
1040 break; 1037 break;
1041 } 1038 }
@@ -1059,8 +1056,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1059/** 1056/**
1060 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method 1057 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
1061 * @page: the page to unmap/unlock 1058 * @page: the page to unmap/unlock
1062 * @unlock: request for unlock rather than unmap [unlikely] 1059 * @flags: action and flags
1063 * @migration: unmapping for migration - ignored if @unlock
1064 * 1060 *
1065 * Find all the mappings of a page using the mapping pointer and the vma chains 1061 * Find all the mappings of a page using the mapping pointer and the vma chains
1066 * contained in the address_space struct it points to. 1062 * contained in the address_space struct it points to.
@@ -1072,7 +1068,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1072 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1068 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1073 * 'LOCKED. 1069 * 'LOCKED.
1074 */ 1070 */
1075static int try_to_unmap_file(struct page *page, int unlock, int migration) 1071static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1076{ 1072{
1077 struct address_space *mapping = page->mapping; 1073 struct address_space *mapping = page->mapping;
1078 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1074 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -1084,6 +1080,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1084 unsigned long max_nl_size = 0; 1080 unsigned long max_nl_size = 0;
1085 unsigned int mapcount; 1081 unsigned int mapcount;
1086 unsigned int mlocked = 0; 1082 unsigned int mlocked = 0;
1083 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1087 1084
1088 if (MLOCK_PAGES && unlikely(unlock)) 1085 if (MLOCK_PAGES && unlikely(unlock))
1089 ret = SWAP_SUCCESS; /* default for try_to_munlock() */ 1086 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
@@ -1096,7 +1093,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1096 continue; /* must visit all vmas */ 1093 continue; /* must visit all vmas */
1097 ret = SWAP_MLOCK; 1094 ret = SWAP_MLOCK;
1098 } else { 1095 } else {
1099 ret = try_to_unmap_one(page, vma, migration); 1096 ret = try_to_unmap_one(page, vma, flags);
1100 if (ret == SWAP_FAIL || !page_mapped(page)) 1097 if (ret == SWAP_FAIL || !page_mapped(page))
1101 goto out; 1098 goto out;
1102 } 1099 }
@@ -1121,7 +1118,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1121 ret = SWAP_MLOCK; /* leave mlocked == 0 */ 1118 ret = SWAP_MLOCK; /* leave mlocked == 0 */
1122 goto out; /* no need to look further */ 1119 goto out; /* no need to look further */
1123 } 1120 }
1124 if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) 1121 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1122 (vma->vm_flags & VM_LOCKED))
1125 continue; 1123 continue;
1126 cursor = (unsigned long) vma->vm_private_data; 1124 cursor = (unsigned long) vma->vm_private_data;
1127 if (cursor > max_nl_cursor) 1125 if (cursor > max_nl_cursor)
@@ -1155,7 +1153,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1155 do { 1153 do {
1156 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1154 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1157 shared.vm_set.list) { 1155 shared.vm_set.list) {
1158 if (!MLOCK_PAGES && !migration && 1156 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1159 (vma->vm_flags & VM_LOCKED)) 1157 (vma->vm_flags & VM_LOCKED))
1160 continue; 1158 continue;
1161 cursor = (unsigned long) vma->vm_private_data; 1159 cursor = (unsigned long) vma->vm_private_data;
@@ -1195,7 +1193,7 @@ out:
1195/** 1193/**
1196 * try_to_unmap - try to remove all page table mappings to a page 1194 * try_to_unmap - try to remove all page table mappings to a page
1197 * @page: the page to get unmapped 1195 * @page: the page to get unmapped
1198 * @migration: migration flag 1196 * @flags: action and flags
1199 * 1197 *
1200 * Tries to remove all the page table entries which are mapping this 1198 * Tries to remove all the page table entries which are mapping this
1201 * page, used in the pageout path. Caller must hold the page lock. 1199 * page, used in the pageout path. Caller must hold the page lock.
@@ -1206,16 +1204,16 @@ out:
1206 * SWAP_FAIL - the page is unswappable 1204 * SWAP_FAIL - the page is unswappable
1207 * SWAP_MLOCK - page is mlocked. 1205 * SWAP_MLOCK - page is mlocked.
1208 */ 1206 */
1209int try_to_unmap(struct page *page, int migration) 1207int try_to_unmap(struct page *page, enum ttu_flags flags)
1210{ 1208{
1211 int ret; 1209 int ret;
1212 1210
1213 BUG_ON(!PageLocked(page)); 1211 BUG_ON(!PageLocked(page));
1214 1212
1215 if (PageAnon(page)) 1213 if (PageAnon(page))
1216 ret = try_to_unmap_anon(page, 0, migration); 1214 ret = try_to_unmap_anon(page, flags);
1217 else 1215 else
1218 ret = try_to_unmap_file(page, 0, migration); 1216 ret = try_to_unmap_file(page, flags);
1219 if (ret != SWAP_MLOCK && !page_mapped(page)) 1217 if (ret != SWAP_MLOCK && !page_mapped(page))
1220 ret = SWAP_SUCCESS; 1218 ret = SWAP_SUCCESS;
1221 return ret; 1219 return ret;
@@ -1240,8 +1238,8 @@ int try_to_munlock(struct page *page)
1240 VM_BUG_ON(!PageLocked(page) || PageLRU(page)); 1238 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1241 1239
1242 if (PageAnon(page)) 1240 if (PageAnon(page))
1243 return try_to_unmap_anon(page, 1, 0); 1241 return try_to_unmap_anon(page, TTU_MUNLOCK);
1244 else 1242 else
1245 return try_to_unmap_file(page, 1, 0); 1243 return try_to_unmap_file(page, TTU_MUNLOCK);
1246} 1244}
1247 1245
diff --git a/mm/shmem.c b/mm/shmem.c
index bd20f8bb02aa..356dd99566ec 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -49,7 +49,6 @@ static struct vfsmount *shm_mnt;
49#include <linux/backing-dev.h> 49#include <linux/backing-dev.h>
50#include <linux/shmem_fs.h> 50#include <linux/shmem_fs.h>
51#include <linux/writeback.h> 51#include <linux/writeback.h>
52#include <linux/vfs.h>
53#include <linux/blkdev.h> 52#include <linux/blkdev.h>
54#include <linux/security.h> 53#include <linux/security.h>
55#include <linux/swapops.h> 54#include <linux/swapops.h>
@@ -219,7 +218,7 @@ static const struct file_operations shmem_file_operations;
219static const struct inode_operations shmem_inode_operations; 218static const struct inode_operations shmem_inode_operations;
220static const struct inode_operations shmem_dir_inode_operations; 219static const struct inode_operations shmem_dir_inode_operations;
221static const struct inode_operations shmem_special_inode_operations; 220static const struct inode_operations shmem_special_inode_operations;
222static struct vm_operations_struct shmem_vm_ops; 221static const struct vm_operations_struct shmem_vm_ops;
223 222
224static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 223static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
225 .ra_pages = 0, /* No readahead */ 224 .ra_pages = 0, /* No readahead */
@@ -1047,8 +1046,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1047 * sync from ever calling shmem_writepage; but a stacking filesystem 1046 * sync from ever calling shmem_writepage; but a stacking filesystem
1048 * may use the ->writepage of its underlying filesystem, in which case 1047 * may use the ->writepage of its underlying filesystem, in which case
1049 * tmpfs should write out to swap only in response to memory pressure, 1048 * tmpfs should write out to swap only in response to memory pressure,
1050 * and not for pdflush or sync. However, in those cases, we do still 1049 * and not for the writeback threads or sync. However, in those cases,
1051 * want to check if there's a redundant swappage to be discarded. 1050 * we do still want to check if there's a redundant swappage to be
1051 * discarded.
1052 */ 1052 */
1053 if (wbc->for_reclaim) 1053 if (wbc->for_reclaim)
1054 swap = get_swap_page(); 1054 swap = get_swap_page();
@@ -1097,6 +1097,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1097 shmem_swp_unmap(entry); 1097 shmem_swp_unmap(entry);
1098unlock: 1098unlock:
1099 spin_unlock(&info->lock); 1099 spin_unlock(&info->lock);
1100 /*
1101 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
1102 * clear SWAP_HAS_CACHE flag.
1103 */
1100 swapcache_free(swap, NULL); 1104 swapcache_free(swap, NULL);
1101redirty: 1105redirty:
1102 set_page_dirty(page); 1106 set_page_dirty(page);
@@ -1630,8 +1634,8 @@ shmem_write_end(struct file *file, struct address_space *mapping,
1630 if (pos + copied > inode->i_size) 1634 if (pos + copied > inode->i_size)
1631 i_size_write(inode, pos + copied); 1635 i_size_write(inode, pos + copied);
1632 1636
1633 unlock_page(page);
1634 set_page_dirty(page); 1637 set_page_dirty(page);
1638 unlock_page(page);
1635 page_cache_release(page); 1639 page_cache_release(page);
1636 1640
1637 return copied; 1641 return copied;
@@ -1968,13 +1972,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1968 iput(inode); 1972 iput(inode);
1969 return error; 1973 return error;
1970 } 1974 }
1971 unlock_page(page);
1972 inode->i_mapping->a_ops = &shmem_aops; 1975 inode->i_mapping->a_ops = &shmem_aops;
1973 inode->i_op = &shmem_symlink_inode_operations; 1976 inode->i_op = &shmem_symlink_inode_operations;
1974 kaddr = kmap_atomic(page, KM_USER0); 1977 kaddr = kmap_atomic(page, KM_USER0);
1975 memcpy(kaddr, symname, len); 1978 memcpy(kaddr, symname, len);
1976 kunmap_atomic(kaddr, KM_USER0); 1979 kunmap_atomic(kaddr, KM_USER0);
1977 set_page_dirty(page); 1980 set_page_dirty(page);
1981 unlock_page(page);
1978 page_cache_release(page); 1982 page_cache_release(page);
1979 } 1983 }
1980 if (dir->i_mode & S_ISGID) 1984 if (dir->i_mode & S_ISGID)
@@ -2306,17 +2310,14 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2306 int err = -ENOMEM; 2310 int err = -ENOMEM;
2307 2311
2308 /* Round up to L1_CACHE_BYTES to resist false sharing */ 2312 /* Round up to L1_CACHE_BYTES to resist false sharing */
2309 sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info), 2313 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
2310 L1_CACHE_BYTES), GFP_KERNEL); 2314 L1_CACHE_BYTES), GFP_KERNEL);
2311 if (!sbinfo) 2315 if (!sbinfo)
2312 return -ENOMEM; 2316 return -ENOMEM;
2313 2317
2314 sbinfo->max_blocks = 0;
2315 sbinfo->max_inodes = 0;
2316 sbinfo->mode = S_IRWXUGO | S_ISVTX; 2318 sbinfo->mode = S_IRWXUGO | S_ISVTX;
2317 sbinfo->uid = current_fsuid(); 2319 sbinfo->uid = current_fsuid();
2318 sbinfo->gid = current_fsgid(); 2320 sbinfo->gid = current_fsgid();
2319 sbinfo->mpol = NULL;
2320 sb->s_fs_info = sbinfo; 2321 sb->s_fs_info = sbinfo;
2321 2322
2322#ifdef CONFIG_TMPFS 2323#ifdef CONFIG_TMPFS
@@ -2420,6 +2421,7 @@ static const struct address_space_operations shmem_aops = {
2420 .write_end = shmem_write_end, 2421 .write_end = shmem_write_end,
2421#endif 2422#endif
2422 .migratepage = migrate_page, 2423 .migratepage = migrate_page,
2424 .error_remove_page = generic_error_remove_page,
2423}; 2425};
2424 2426
2425static const struct file_operations shmem_file_operations = { 2427static const struct file_operations shmem_file_operations = {
@@ -2496,7 +2498,7 @@ static const struct super_operations shmem_ops = {
2496 .put_super = shmem_put_super, 2498 .put_super = shmem_put_super,
2497}; 2499};
2498 2500
2499static struct vm_operations_struct shmem_vm_ops = { 2501static const struct vm_operations_struct shmem_vm_ops = {
2500 .fault = shmem_fault, 2502 .fault = shmem_fault,
2501#ifdef CONFIG_NUMA 2503#ifdef CONFIG_NUMA
2502 .set_policy = shmem_set_policy, 2504 .set_policy = shmem_set_policy,
@@ -2590,6 +2592,11 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
2590 return 0; 2592 return 0;
2591} 2593}
2592 2594
2595int shmem_lock(struct file *file, int lock, struct user_struct *user)
2596{
2597 return 0;
2598}
2599
2593#define shmem_vm_ops generic_file_vm_ops 2600#define shmem_vm_ops generic_file_vm_ops
2594#define shmem_file_operations ramfs_file_operations 2601#define shmem_file_operations ramfs_file_operations
2595#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) 2602#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev)
diff --git a/mm/slab.c b/mm/slab.c
index 7b5d4deacfcd..7dfa481c96ba 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1384,7 +1384,7 @@ void __init kmem_cache_init(void)
1384 * Fragmentation resistance on low memory - only use bigger 1384 * Fragmentation resistance on low memory - only use bigger
1385 * page orders on machines with more than 32MB of memory. 1385 * page orders on machines with more than 32MB of memory.
1386 */ 1386 */
1387 if (num_physpages > (32 << 20) >> PAGE_SHIFT) 1387 if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
1388 slab_break_gfp_order = BREAK_GFP_ORDER_HI; 1388 slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1389 1389
1390 /* Bootstrap is tricky, because several objects are allocated 1390 /* Bootstrap is tricky, because several objects are allocated
diff --git a/mm/slub.c b/mm/slub.c
index 0a216aae227e..4996fc719552 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3345,6 +3345,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3345{ 3345{
3346 struct kmem_cache *s; 3346 struct kmem_cache *s;
3347 3347
3348 if (WARN_ON(!name))
3349 return NULL;
3350
3348 down_write(&slub_lock); 3351 down_write(&slub_lock);
3349 s = find_mergeable(size, align, flags, name, ctor); 3352 s = find_mergeable(size, align, flags, name, ctor);
3350 if (s) { 3353 if (s) {
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a13ea6401ae7..d9714bdcb4a3 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -48,8 +48,14 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
48{ 48{
49 /* If the main allocator is up use that, fallback to bootmem. */ 49 /* If the main allocator is up use that, fallback to bootmem. */
50 if (slab_is_available()) { 50 if (slab_is_available()) {
51 struct page *page = alloc_pages_node(node, 51 struct page *page;
52
53 if (node_state(node, N_HIGH_MEMORY))
54 page = alloc_pages_node(node,
52 GFP_KERNEL | __GFP_ZERO, get_order(size)); 55 GFP_KERNEL | __GFP_ZERO, get_order(size));
56 else
57 page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
58 get_order(size));
53 if (page) 59 if (page)
54 return page_address(page); 60 return page_address(page);
55 return NULL; 61 return NULL;
diff --git a/mm/sparse.c b/mm/sparse.c
index da432d9f0ae8..6ce4aab69e99 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -62,9 +62,12 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
62 unsigned long array_size = SECTIONS_PER_ROOT * 62 unsigned long array_size = SECTIONS_PER_ROOT *
63 sizeof(struct mem_section); 63 sizeof(struct mem_section);
64 64
65 if (slab_is_available()) 65 if (slab_is_available()) {
66 section = kmalloc_node(array_size, GFP_KERNEL, nid); 66 if (node_state(nid, N_HIGH_MEMORY))
67 else 67 section = kmalloc_node(array_size, GFP_KERNEL, nid);
68 else
69 section = kmalloc(array_size, GFP_KERNEL);
70 } else
68 section = alloc_bootmem_node(NODE_DATA(nid), array_size); 71 section = alloc_bootmem_node(NODE_DATA(nid), array_size);
69 72
70 if (section) 73 if (section)
diff --git a/mm/swap.c b/mm/swap.c
index cb29ae5d33ab..308e57d8d7ed 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -118,7 +118,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
118 spin_lock(&zone->lru_lock); 118 spin_lock(&zone->lru_lock);
119 } 119 }
120 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 120 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
121 int lru = page_is_file_cache(page); 121 int lru = page_lru_base_type(page);
122 list_move_tail(&page->lru, &zone->lru[lru].list); 122 list_move_tail(&page->lru, &zone->lru[lru].list);
123 pgmoved++; 123 pgmoved++;
124 } 124 }
@@ -181,7 +181,7 @@ void activate_page(struct page *page)
181 spin_lock_irq(&zone->lru_lock); 181 spin_lock_irq(&zone->lru_lock);
182 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 182 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
183 int file = page_is_file_cache(page); 183 int file = page_is_file_cache(page);
184 int lru = LRU_BASE + file; 184 int lru = page_lru_base_type(page);
185 del_page_from_lru_list(zone, page, lru); 185 del_page_from_lru_list(zone, page, lru);
186 186
187 SetPageActive(page); 187 SetPageActive(page);
@@ -189,7 +189,7 @@ void activate_page(struct page *page)
189 add_page_to_lru_list(zone, page, lru); 189 add_page_to_lru_list(zone, page, lru);
190 __count_vm_event(PGACTIVATE); 190 __count_vm_event(PGACTIVATE);
191 191
192 update_page_reclaim_stat(zone, page, !!file, 1); 192 update_page_reclaim_stat(zone, page, file, 1);
193 } 193 }
194 spin_unlock_irq(&zone->lru_lock); 194 spin_unlock_irq(&zone->lru_lock);
195} 195}
@@ -496,7 +496,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
496 */ 496 */
497void __init swap_setup(void) 497void __init swap_setup(void)
498{ 498{
499 unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); 499 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
500 500
501#ifdef CONFIG_SWAP 501#ifdef CONFIG_SWAP
502 bdi_init(swapper_space.backing_dev_info); 502 bdi_init(swapper_space.backing_dev_info);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5ae6b8b78c80..6d1daeb1cb4a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -67,10 +67,10 @@ void show_swap_cache_info(void)
67} 67}
68 68
69/* 69/*
70 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, 70 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
71 * but sets SwapCache flag and private instead of mapping and index. 71 * but sets SwapCache flag and private instead of mapping and index.
72 */ 72 */
73int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) 73static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
74{ 74{
75 int error; 75 int error;
76 76
@@ -78,28 +78,43 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
78 VM_BUG_ON(PageSwapCache(page)); 78 VM_BUG_ON(PageSwapCache(page));
79 VM_BUG_ON(!PageSwapBacked(page)); 79 VM_BUG_ON(!PageSwapBacked(page));
80 80
81 page_cache_get(page);
82 SetPageSwapCache(page);
83 set_page_private(page, entry.val);
84
85 spin_lock_irq(&swapper_space.tree_lock);
86 error = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
87 if (likely(!error)) {
88 total_swapcache_pages++;
89 __inc_zone_page_state(page, NR_FILE_PAGES);
90 INC_CACHE_INFO(add_total);
91 }
92 spin_unlock_irq(&swapper_space.tree_lock);
93
94 if (unlikely(error)) {
95 /*
96 * Only the context which have set SWAP_HAS_CACHE flag
97 * would call add_to_swap_cache().
98 * So add_to_swap_cache() doesn't returns -EEXIST.
99 */
100 VM_BUG_ON(error == -EEXIST);
101 set_page_private(page, 0UL);
102 ClearPageSwapCache(page);
103 page_cache_release(page);
104 }
105
106 return error;
107}
108
109
110int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
111{
112 int error;
113
81 error = radix_tree_preload(gfp_mask); 114 error = radix_tree_preload(gfp_mask);
82 if (!error) { 115 if (!error) {
83 page_cache_get(page); 116 error = __add_to_swap_cache(page, entry);
84 SetPageSwapCache(page);
85 set_page_private(page, entry.val);
86
87 spin_lock_irq(&swapper_space.tree_lock);
88 error = radix_tree_insert(&swapper_space.page_tree,
89 entry.val, page);
90 if (likely(!error)) {
91 total_swapcache_pages++;
92 __inc_zone_page_state(page, NR_FILE_PAGES);
93 INC_CACHE_INFO(add_total);
94 }
95 spin_unlock_irq(&swapper_space.tree_lock);
96 radix_tree_preload_end(); 117 radix_tree_preload_end();
97
98 if (unlikely(error)) {
99 set_page_private(page, 0UL);
100 ClearPageSwapCache(page);
101 page_cache_release(page);
102 }
103 } 118 }
104 return error; 119 return error;
105} 120}
@@ -137,38 +152,34 @@ int add_to_swap(struct page *page)
137 VM_BUG_ON(!PageLocked(page)); 152 VM_BUG_ON(!PageLocked(page));
138 VM_BUG_ON(!PageUptodate(page)); 153 VM_BUG_ON(!PageUptodate(page));
139 154
140 for (;;) { 155 entry = get_swap_page();
141 entry = get_swap_page(); 156 if (!entry.val)
142 if (!entry.val) 157 return 0;
143 return 0;
144 158
159 /*
160 * Radix-tree node allocations from PF_MEMALLOC contexts could
161 * completely exhaust the page allocator. __GFP_NOMEMALLOC
162 * stops emergency reserves from being allocated.
163 *
164 * TODO: this could cause a theoretical memory reclaim
165 * deadlock in the swap out path.
166 */
167 /*
168 * Add it to the swap cache and mark it dirty
169 */
170 err = add_to_swap_cache(page, entry,
171 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
172
173 if (!err) { /* Success */
174 SetPageDirty(page);
175 return 1;
176 } else { /* -ENOMEM radix-tree allocation failure */
145 /* 177 /*
146 * Radix-tree node allocations from PF_MEMALLOC contexts could 178 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
147 * completely exhaust the page allocator. __GFP_NOMEMALLOC 179 * clear SWAP_HAS_CACHE flag.
148 * stops emergency reserves from being allocated.
149 *
150 * TODO: this could cause a theoretical memory reclaim
151 * deadlock in the swap out path.
152 */
153 /*
154 * Add it to the swap cache and mark it dirty
155 */ 180 */
156 err = add_to_swap_cache(page, entry, 181 swapcache_free(entry, NULL);
157 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); 182 return 0;
158
159 switch (err) {
160 case 0: /* Success */
161 SetPageDirty(page);
162 return 1;
163 case -EEXIST:
164 /* Raced with "speculative" read_swap_cache_async */
165 swapcache_free(entry, NULL);
166 continue;
167 default:
168 /* -ENOMEM radix-tree allocation failure */
169 swapcache_free(entry, NULL);
170 return 0;
171 }
172 } 183 }
173} 184}
174 185
@@ -290,26 +301,31 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
290 } 301 }
291 302
292 /* 303 /*
304 * call radix_tree_preload() while we can wait.
305 */
306 err = radix_tree_preload(gfp_mask & GFP_KERNEL);
307 if (err)
308 break;
309
310 /*
293 * Swap entry may have been freed since our caller observed it. 311 * Swap entry may have been freed since our caller observed it.
294 */ 312 */
295 err = swapcache_prepare(entry); 313 err = swapcache_prepare(entry);
296 if (err == -EEXIST) /* seems racy */ 314 if (err == -EEXIST) { /* seems racy */
315 radix_tree_preload_end();
297 continue; 316 continue;
298 if (err) /* swp entry is obsolete ? */ 317 }
318 if (err) { /* swp entry is obsolete ? */
319 radix_tree_preload_end();
299 break; 320 break;
321 }
300 322
301 /* 323 /* May fail (-ENOMEM) if radix-tree node allocation failed. */
302 * Associate the page with swap entry in the swap cache.
303 * May fail (-EEXIST) if there is already a page associated
304 * with this entry in the swap cache: added by a racing
305 * read_swap_cache_async, or add_to_swap or shmem_writepage
306 * re-using the just freed swap entry for an existing page.
307 * May fail (-ENOMEM) if radix-tree node allocation failed.
308 */
309 __set_page_locked(new_page); 324 __set_page_locked(new_page);
310 SetPageSwapBacked(new_page); 325 SetPageSwapBacked(new_page);
311 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); 326 err = __add_to_swap_cache(new_page, entry);
312 if (likely(!err)) { 327 if (likely(!err)) {
328 radix_tree_preload_end();
313 /* 329 /*
314 * Initiate read into locked page and return. 330 * Initiate read into locked page and return.
315 */ 331 */
@@ -317,8 +333,13 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
317 swap_readpage(new_page); 333 swap_readpage(new_page);
318 return new_page; 334 return new_page;
319 } 335 }
336 radix_tree_preload_end();
320 ClearPageSwapBacked(new_page); 337 ClearPageSwapBacked(new_page);
321 __clear_page_locked(new_page); 338 __clear_page_locked(new_page);
339 /*
340 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
341 * clear SWAP_HAS_CACHE flag.
342 */
322 swapcache_free(entry, NULL); 343 swapcache_free(entry, NULL);
323 } while (err != -ENOMEM); 344 } while (err != -ENOMEM);
324 345
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 74f1102e8749..a1bc6b9af9a2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -699,7 +699,7 @@ int free_swap_and_cache(swp_entry_t entry)
699 struct swap_info_struct *p; 699 struct swap_info_struct *p;
700 struct page *page = NULL; 700 struct page *page = NULL;
701 701
702 if (is_migration_entry(entry)) 702 if (non_swap_entry(entry))
703 return 1; 703 return 1;
704 704
705 p = swap_info_get(entry); 705 p = swap_info_get(entry);
@@ -1575,9 +1575,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1575 p->flags &= ~SWP_WRITEOK; 1575 p->flags &= ~SWP_WRITEOK;
1576 spin_unlock(&swap_lock); 1576 spin_unlock(&swap_lock);
1577 1577
1578 current->flags |= PF_SWAPOFF; 1578 current->flags |= PF_OOM_ORIGIN;
1579 err = try_to_unuse(type); 1579 err = try_to_unuse(type);
1580 current->flags &= ~PF_SWAPOFF; 1580 current->flags &= ~PF_OOM_ORIGIN;
1581 1581
1582 if (err) { 1582 if (err) {
1583 /* re-insert swap space back into swap_list */ 1583 /* re-insert swap space back into swap_list */
@@ -1974,12 +1974,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1974 goto bad_swap; 1974 goto bad_swap;
1975 } 1975 }
1976 1976
1977 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 1977 if (p->bdev) {
1978 p->flags |= SWP_SOLIDSTATE; 1978 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
1979 p->cluster_next = 1 + (random32() % p->highest_bit); 1979 p->flags |= SWP_SOLIDSTATE;
1980 p->cluster_next = 1 + (random32() % p->highest_bit);
1981 }
1982 if (discard_swap(p) == 0)
1983 p->flags |= SWP_DISCARDABLE;
1980 } 1984 }
1981 if (discard_swap(p) == 0)
1982 p->flags |= SWP_DISCARDABLE;
1983 1985
1984 mutex_lock(&swapon_mutex); 1986 mutex_lock(&swapon_mutex);
1985 spin_lock(&swap_lock); 1987 spin_lock(&swap_lock);
@@ -2085,7 +2087,7 @@ static int __swap_duplicate(swp_entry_t entry, bool cache)
2085 int count; 2087 int count;
2086 bool has_cache; 2088 bool has_cache;
2087 2089
2088 if (is_migration_entry(entry)) 2090 if (non_swap_entry(entry))
2089 return -EINVAL; 2091 return -EINVAL;
2090 2092
2091 type = swp_type(entry); 2093 type = swp_type(entry);
diff --git a/mm/truncate.c b/mm/truncate.c
index ccc3ecf7cb98..450cebdabfc0 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -93,11 +93,11 @@ EXPORT_SYMBOL(cancel_dirty_page);
93 * its lock, b) when a concurrent invalidate_mapping_pages got there first and 93 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
94 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. 94 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
95 */ 95 */
96static void 96static int
97truncate_complete_page(struct address_space *mapping, struct page *page) 97truncate_complete_page(struct address_space *mapping, struct page *page)
98{ 98{
99 if (page->mapping != mapping) 99 if (page->mapping != mapping)
100 return; 100 return -EIO;
101 101
102 if (page_has_private(page)) 102 if (page_has_private(page))
103 do_invalidatepage(page, 0); 103 do_invalidatepage(page, 0);
@@ -108,6 +108,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
108 remove_from_page_cache(page); 108 remove_from_page_cache(page);
109 ClearPageMappedToDisk(page); 109 ClearPageMappedToDisk(page);
110 page_cache_release(page); /* pagecache ref */ 110 page_cache_release(page); /* pagecache ref */
111 return 0;
111} 112}
112 113
113/* 114/*
@@ -135,6 +136,51 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
135 return ret; 136 return ret;
136} 137}
137 138
139int truncate_inode_page(struct address_space *mapping, struct page *page)
140{
141 if (page_mapped(page)) {
142 unmap_mapping_range(mapping,
143 (loff_t)page->index << PAGE_CACHE_SHIFT,
144 PAGE_CACHE_SIZE, 0);
145 }
146 return truncate_complete_page(mapping, page);
147}
148
149/*
150 * Used to get rid of pages on hardware memory corruption.
151 */
152int generic_error_remove_page(struct address_space *mapping, struct page *page)
153{
154 if (!mapping)
155 return -EINVAL;
156 /*
157 * Only punch for normal data pages for now.
158 * Handling other types like directories would need more auditing.
159 */
160 if (!S_ISREG(mapping->host->i_mode))
161 return -EIO;
162 return truncate_inode_page(mapping, page);
163}
164EXPORT_SYMBOL(generic_error_remove_page);
165
166/*
167 * Safely invalidate one page from its pagecache mapping.
168 * It only drops clean, unused pages. The page must be locked.
169 *
170 * Returns 1 if the page is successfully invalidated, otherwise 0.
171 */
172int invalidate_inode_page(struct page *page)
173{
174 struct address_space *mapping = page_mapping(page);
175 if (!mapping)
176 return 0;
177 if (PageDirty(page) || PageWriteback(page))
178 return 0;
179 if (page_mapped(page))
180 return 0;
181 return invalidate_complete_page(mapping, page);
182}
183
138/** 184/**
139 * truncate_inode_pages - truncate range of pages specified by start & end byte offsets 185 * truncate_inode_pages - truncate range of pages specified by start & end byte offsets
140 * @mapping: mapping to truncate 186 * @mapping: mapping to truncate
@@ -196,12 +242,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
196 unlock_page(page); 242 unlock_page(page);
197 continue; 243 continue;
198 } 244 }
199 if (page_mapped(page)) { 245 truncate_inode_page(mapping, page);
200 unmap_mapping_range(mapping,
201 (loff_t)page_index<<PAGE_CACHE_SHIFT,
202 PAGE_CACHE_SIZE, 0);
203 }
204 truncate_complete_page(mapping, page);
205 unlock_page(page); 246 unlock_page(page);
206 } 247 }
207 pagevec_release(&pvec); 248 pagevec_release(&pvec);
@@ -238,15 +279,10 @@ void truncate_inode_pages_range(struct address_space *mapping,
238 break; 279 break;
239 lock_page(page); 280 lock_page(page);
240 wait_on_page_writeback(page); 281 wait_on_page_writeback(page);
241 if (page_mapped(page)) { 282 truncate_inode_page(mapping, page);
242 unmap_mapping_range(mapping,
243 (loff_t)page->index<<PAGE_CACHE_SHIFT,
244 PAGE_CACHE_SIZE, 0);
245 }
246 if (page->index > next) 283 if (page->index > next)
247 next = page->index; 284 next = page->index;
248 next++; 285 next++;
249 truncate_complete_page(mapping, page);
250 unlock_page(page); 286 unlock_page(page);
251 } 287 }
252 pagevec_release(&pvec); 288 pagevec_release(&pvec);
@@ -311,12 +347,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
311 if (lock_failed) 347 if (lock_failed)
312 continue; 348 continue;
313 349
314 if (PageDirty(page) || PageWriteback(page)) 350 ret += invalidate_inode_page(page);
315 goto unlock; 351
316 if (page_mapped(page))
317 goto unlock;
318 ret += invalidate_complete_page(mapping, page);
319unlock:
320 unlock_page(page); 352 unlock_page(page);
321 if (next > end) 353 if (next > end)
322 break; 354 break;
@@ -465,3 +497,67 @@ int invalidate_inode_pages2(struct address_space *mapping)
465 return invalidate_inode_pages2_range(mapping, 0, -1); 497 return invalidate_inode_pages2_range(mapping, 0, -1);
466} 498}
467EXPORT_SYMBOL_GPL(invalidate_inode_pages2); 499EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
500
501/**
502 * truncate_pagecache - unmap and remove pagecache that has been truncated
503 * @inode: inode
504 * @old: old file offset
505 * @new: new file offset
506 *
507 * inode's new i_size must already be written before truncate_pagecache
508 * is called.
509 *
510 * This function should typically be called before the filesystem
511 * releases resources associated with the freed range (eg. deallocates
512 * blocks). This way, pagecache will always stay logically coherent
513 * with on-disk format, and the filesystem would not have to deal with
514 * situations such as writepage being called for a page that has already
515 * had its underlying blocks deallocated.
516 */
517void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
518{
519 if (new < old) {
520 struct address_space *mapping = inode->i_mapping;
521
522 /*
523 * unmap_mapping_range is called twice, first simply for
524 * efficiency so that truncate_inode_pages does fewer
525 * single-page unmaps. However after this first call, and
526 * before truncate_inode_pages finishes, it is possible for
527 * private pages to be COWed, which remain after
528 * truncate_inode_pages finishes, hence the second
529 * unmap_mapping_range call must be made for correctness.
530 */
531 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
532 truncate_inode_pages(mapping, new);
533 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
534 }
535}
536EXPORT_SYMBOL(truncate_pagecache);
537
538/**
539 * vmtruncate - unmap mappings "freed" by truncate() syscall
540 * @inode: inode of the file used
541 * @offset: file offset to start truncating
542 *
543 * NOTE! We have to be ready to update the memory sharing
544 * between the file and the memory map for a potential last
545 * incomplete page. Ugly, but necessary.
546 */
547int vmtruncate(struct inode *inode, loff_t offset)
548{
549 loff_t oldsize;
550 int error;
551
552 error = inode_newsize_ok(inode, offset);
553 if (error)
554 return error;
555 oldsize = inode->i_size;
556 i_size_write(inode, offset);
557 truncate_pagecache(inode, oldsize, offset);
558 if (inode->i_op->truncate)
559 inode->i_op->truncate(inode);
560
561 return error;
562}
563EXPORT_SYMBOL(vmtruncate);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 204b8243d8ab..69511e663234 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -25,7 +25,7 @@
25#include <linux/rcupdate.h> 25#include <linux/rcupdate.h>
26#include <linux/pfn.h> 26#include <linux/pfn.h>
27#include <linux/kmemleak.h> 27#include <linux/kmemleak.h>
28 28#include <linux/highmem.h>
29#include <asm/atomic.h> 29#include <asm/atomic.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
@@ -168,11 +168,9 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end,
168 next = pgd_addr_end(addr, end); 168 next = pgd_addr_end(addr, end);
169 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); 169 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
170 if (err) 170 if (err)
171 break; 171 return err;
172 } while (pgd++, addr = next, addr != end); 172 } while (pgd++, addr = next, addr != end);
173 173
174 if (unlikely(err))
175 return err;
176 return nr; 174 return nr;
177} 175}
178 176
@@ -186,7 +184,7 @@ static int vmap_page_range(unsigned long start, unsigned long end,
186 return ret; 184 return ret;
187} 185}
188 186
189static inline int is_vmalloc_or_module_addr(const void *x) 187int is_vmalloc_or_module_addr(const void *x)
190{ 188{
191 /* 189 /*
192 * ARM, x86-64 and sparc64 put modules in a special place, 190 * ARM, x86-64 and sparc64 put modules in a special place,
@@ -1272,17 +1270,21 @@ struct vm_struct *remove_vm_area(const void *addr)
1272 if (va && va->flags & VM_VM_AREA) { 1270 if (va && va->flags & VM_VM_AREA) {
1273 struct vm_struct *vm = va->private; 1271 struct vm_struct *vm = va->private;
1274 struct vm_struct *tmp, **p; 1272 struct vm_struct *tmp, **p;
1275 1273 /*
1276 vmap_debug_free_range(va->va_start, va->va_end); 1274 * remove from list and disallow access to this vm_struct
1277 free_unmap_vmap_area(va); 1275 * before unmap. (address range confliction is maintained by
1278 vm->size -= PAGE_SIZE; 1276 * vmap.)
1279 1277 */
1280 write_lock(&vmlist_lock); 1278 write_lock(&vmlist_lock);
1281 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) 1279 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1282 ; 1280 ;
1283 *p = tmp->next; 1281 *p = tmp->next;
1284 write_unlock(&vmlist_lock); 1282 write_unlock(&vmlist_lock);
1285 1283
1284 vmap_debug_free_range(va->va_start, va->va_end);
1285 free_unmap_vmap_area(va);
1286 vm->size -= PAGE_SIZE;
1287
1286 return vm; 1288 return vm;
1287 } 1289 }
1288 return NULL; 1290 return NULL;
@@ -1384,7 +1386,7 @@ void *vmap(struct page **pages, unsigned int count,
1384 1386
1385 might_sleep(); 1387 might_sleep();
1386 1388
1387 if (count > num_physpages) 1389 if (count > totalram_pages)
1388 return NULL; 1390 return NULL;
1389 1391
1390 area = get_vm_area_caller((count << PAGE_SHIFT), flags, 1392 area = get_vm_area_caller((count << PAGE_SHIFT), flags,
@@ -1491,7 +1493,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1491 unsigned long real_size = size; 1493 unsigned long real_size = size;
1492 1494
1493 size = PAGE_ALIGN(size); 1495 size = PAGE_ALIGN(size);
1494 if (!size || (size >> PAGE_SHIFT) > num_physpages) 1496 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1495 return NULL; 1497 return NULL;
1496 1498
1497 area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, 1499 area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END,
@@ -1641,10 +1643,120 @@ void *vmalloc_32_user(unsigned long size)
1641} 1643}
1642EXPORT_SYMBOL(vmalloc_32_user); 1644EXPORT_SYMBOL(vmalloc_32_user);
1643 1645
1646/*
1647 * small helper routine , copy contents to buf from addr.
1648 * If the page is not present, fill zero.
1649 */
1650
1651static int aligned_vread(char *buf, char *addr, unsigned long count)
1652{
1653 struct page *p;
1654 int copied = 0;
1655
1656 while (count) {
1657 unsigned long offset, length;
1658
1659 offset = (unsigned long)addr & ~PAGE_MASK;
1660 length = PAGE_SIZE - offset;
1661 if (length > count)
1662 length = count;
1663 p = vmalloc_to_page(addr);
1664 /*
1665 * To do safe access to this _mapped_ area, we need
1666 * lock. But adding lock here means that we need to add
1667 * overhead of vmalloc()/vfree() calles for this _debug_
1668 * interface, rarely used. Instead of that, we'll use
1669 * kmap() and get small overhead in this access function.
1670 */
1671 if (p) {
1672 /*
1673 * we can expect USER0 is not used (see vread/vwrite's
1674 * function description)
1675 */
1676 void *map = kmap_atomic(p, KM_USER0);
1677 memcpy(buf, map + offset, length);
1678 kunmap_atomic(map, KM_USER0);
1679 } else
1680 memset(buf, 0, length);
1681
1682 addr += length;
1683 buf += length;
1684 copied += length;
1685 count -= length;
1686 }
1687 return copied;
1688}
1689
1690static int aligned_vwrite(char *buf, char *addr, unsigned long count)
1691{
1692 struct page *p;
1693 int copied = 0;
1694
1695 while (count) {
1696 unsigned long offset, length;
1697
1698 offset = (unsigned long)addr & ~PAGE_MASK;
1699 length = PAGE_SIZE - offset;
1700 if (length > count)
1701 length = count;
1702 p = vmalloc_to_page(addr);
1703 /*
1704 * To do safe access to this _mapped_ area, we need
1705 * lock. But adding lock here means that we need to add
1706 * overhead of vmalloc()/vfree() calles for this _debug_
1707 * interface, rarely used. Instead of that, we'll use
1708 * kmap() and get small overhead in this access function.
1709 */
1710 if (p) {
1711 /*
1712 * we can expect USER0 is not used (see vread/vwrite's
1713 * function description)
1714 */
1715 void *map = kmap_atomic(p, KM_USER0);
1716 memcpy(map + offset, buf, length);
1717 kunmap_atomic(map, KM_USER0);
1718 }
1719 addr += length;
1720 buf += length;
1721 copied += length;
1722 count -= length;
1723 }
1724 return copied;
1725}
1726
1727/**
1728 * vread() - read vmalloc area in a safe way.
1729 * @buf: buffer for reading data
1730 * @addr: vm address.
1731 * @count: number of bytes to be read.
1732 *
1733 * Returns # of bytes which addr and buf should be increased.
1734 * (same number to @count). Returns 0 if [addr...addr+count) doesn't
1735 * includes any intersect with alive vmalloc area.
1736 *
1737 * This function checks that addr is a valid vmalloc'ed area, and
1738 * copy data from that area to a given buffer. If the given memory range
1739 * of [addr...addr+count) includes some valid address, data is copied to
1740 * proper area of @buf. If there are memory holes, they'll be zero-filled.
1741 * IOREMAP area is treated as memory hole and no copy is done.
1742 *
1743 * If [addr...addr+count) doesn't includes any intersects with alive
1744 * vm_struct area, returns 0.
1745 * @buf should be kernel's buffer. Because this function uses KM_USER0,
1746 * the caller should guarantee KM_USER0 is not used.
1747 *
1748 * Note: In usual ops, vread() is never necessary because the caller
1749 * should know vmalloc() area is valid and can use memcpy().
1750 * This is for routines which have to access vmalloc area without
1751 * any informaion, as /dev/kmem.
1752 *
1753 */
1754
1644long vread(char *buf, char *addr, unsigned long count) 1755long vread(char *buf, char *addr, unsigned long count)
1645{ 1756{
1646 struct vm_struct *tmp; 1757 struct vm_struct *tmp;
1647 char *vaddr, *buf_start = buf; 1758 char *vaddr, *buf_start = buf;
1759 unsigned long buflen = count;
1648 unsigned long n; 1760 unsigned long n;
1649 1761
1650 /* Don't allow overflow */ 1762 /* Don't allow overflow */
@@ -1652,7 +1764,7 @@ long vread(char *buf, char *addr, unsigned long count)
1652 count = -(unsigned long) addr; 1764 count = -(unsigned long) addr;
1653 1765
1654 read_lock(&vmlist_lock); 1766 read_lock(&vmlist_lock);
1655 for (tmp = vmlist; tmp; tmp = tmp->next) { 1767 for (tmp = vmlist; count && tmp; tmp = tmp->next) {
1656 vaddr = (char *) tmp->addr; 1768 vaddr = (char *) tmp->addr;
1657 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1769 if (addr >= vaddr + tmp->size - PAGE_SIZE)
1658 continue; 1770 continue;
@@ -1665,32 +1777,72 @@ long vread(char *buf, char *addr, unsigned long count)
1665 count--; 1777 count--;
1666 } 1778 }
1667 n = vaddr + tmp->size - PAGE_SIZE - addr; 1779 n = vaddr + tmp->size - PAGE_SIZE - addr;
1668 do { 1780 if (n > count)
1669 if (count == 0) 1781 n = count;
1670 goto finished; 1782 if (!(tmp->flags & VM_IOREMAP))
1671 *buf = *addr; 1783 aligned_vread(buf, addr, n);
1672 buf++; 1784 else /* IOREMAP area is treated as memory hole */
1673 addr++; 1785 memset(buf, 0, n);
1674 count--; 1786 buf += n;
1675 } while (--n > 0); 1787 addr += n;
1788 count -= n;
1676 } 1789 }
1677finished: 1790finished:
1678 read_unlock(&vmlist_lock); 1791 read_unlock(&vmlist_lock);
1679 return buf - buf_start; 1792
1793 if (buf == buf_start)
1794 return 0;
1795 /* zero-fill memory holes */
1796 if (buf != buf_start + buflen)
1797 memset(buf, 0, buflen - (buf - buf_start));
1798
1799 return buflen;
1680} 1800}
1681 1801
1802/**
1803 * vwrite() - write vmalloc area in a safe way.
1804 * @buf: buffer for source data
1805 * @addr: vm address.
1806 * @count: number of bytes to be read.
1807 *
1808 * Returns # of bytes which addr and buf should be incresed.
1809 * (same number to @count).
1810 * If [addr...addr+count) doesn't includes any intersect with valid
1811 * vmalloc area, returns 0.
1812 *
1813 * This function checks that addr is a valid vmalloc'ed area, and
1814 * copy data from a buffer to the given addr. If specified range of
1815 * [addr...addr+count) includes some valid address, data is copied from
1816 * proper area of @buf. If there are memory holes, no copy to hole.
1817 * IOREMAP area is treated as memory hole and no copy is done.
1818 *
1819 * If [addr...addr+count) doesn't includes any intersects with alive
1820 * vm_struct area, returns 0.
1821 * @buf should be kernel's buffer. Because this function uses KM_USER0,
1822 * the caller should guarantee KM_USER0 is not used.
1823 *
1824 * Note: In usual ops, vwrite() is never necessary because the caller
1825 * should know vmalloc() area is valid and can use memcpy().
1826 * This is for routines which have to access vmalloc area without
1827 * any informaion, as /dev/kmem.
1828 *
1829 * The caller should guarantee KM_USER1 is not used.
1830 */
1831
1682long vwrite(char *buf, char *addr, unsigned long count) 1832long vwrite(char *buf, char *addr, unsigned long count)
1683{ 1833{
1684 struct vm_struct *tmp; 1834 struct vm_struct *tmp;
1685 char *vaddr, *buf_start = buf; 1835 char *vaddr;
1686 unsigned long n; 1836 unsigned long n, buflen;
1837 int copied = 0;
1687 1838
1688 /* Don't allow overflow */ 1839 /* Don't allow overflow */
1689 if ((unsigned long) addr + count < count) 1840 if ((unsigned long) addr + count < count)
1690 count = -(unsigned long) addr; 1841 count = -(unsigned long) addr;
1842 buflen = count;
1691 1843
1692 read_lock(&vmlist_lock); 1844 read_lock(&vmlist_lock);
1693 for (tmp = vmlist; tmp; tmp = tmp->next) { 1845 for (tmp = vmlist; count && tmp; tmp = tmp->next) {
1694 vaddr = (char *) tmp->addr; 1846 vaddr = (char *) tmp->addr;
1695 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1847 if (addr >= vaddr + tmp->size - PAGE_SIZE)
1696 continue; 1848 continue;
@@ -1702,18 +1854,21 @@ long vwrite(char *buf, char *addr, unsigned long count)
1702 count--; 1854 count--;
1703 } 1855 }
1704 n = vaddr + tmp->size - PAGE_SIZE - addr; 1856 n = vaddr + tmp->size - PAGE_SIZE - addr;
1705 do { 1857 if (n > count)
1706 if (count == 0) 1858 n = count;
1707 goto finished; 1859 if (!(tmp->flags & VM_IOREMAP)) {
1708 *addr = *buf; 1860 aligned_vwrite(buf, addr, n);
1709 buf++; 1861 copied++;
1710 addr++; 1862 }
1711 count--; 1863 buf += n;
1712 } while (--n > 0); 1864 addr += n;
1865 count -= n;
1713 } 1866 }
1714finished: 1867finished:
1715 read_unlock(&vmlist_lock); 1868 read_unlock(&vmlist_lock);
1716 return buf - buf_start; 1869 if (!copied)
1870 return 0;
1871 return buflen;
1717} 1872}
1718 1873
1719/** 1874/**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ba8228e0a806..64e438898832 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -148,8 +148,8 @@ static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
148 return &zone->reclaim_stat; 148 return &zone->reclaim_stat;
149} 149}
150 150
151static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc, 151static unsigned long zone_nr_lru_pages(struct zone *zone,
152 enum lru_list lru) 152 struct scan_control *sc, enum lru_list lru)
153{ 153{
154 if (!scanning_global_lru(sc)) 154 if (!scanning_global_lru(sc))
155 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); 155 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
@@ -286,7 +286,12 @@ static inline int page_mapping_inuse(struct page *page)
286 286
287static inline int is_page_cache_freeable(struct page *page) 287static inline int is_page_cache_freeable(struct page *page)
288{ 288{
289 return page_count(page) - !!page_has_private(page) == 2; 289 /*
290 * A freeable page cache page is referenced only by the caller
291 * that isolated the page, the page cache radix tree and
292 * optional buffer heads at page->private.
293 */
294 return page_count(page) - page_has_private(page) == 2;
290} 295}
291 296
292static int may_write_to_queue(struct backing_dev_info *bdi) 297static int may_write_to_queue(struct backing_dev_info *bdi)
@@ -361,7 +366,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
361 * block, for some throttling. This happens by accident, because 366 * block, for some throttling. This happens by accident, because
362 * swap_backing_dev_info is bust: it doesn't reflect the 367 * swap_backing_dev_info is bust: it doesn't reflect the
363 * congestion state of the swapdevs. Easy to fix, if needed. 368 * congestion state of the swapdevs. Easy to fix, if needed.
364 * See swapfile.c:page_queue_congested().
365 */ 369 */
366 if (!is_page_cache_freeable(page)) 370 if (!is_page_cache_freeable(page))
367 return PAGE_KEEP; 371 return PAGE_KEEP;
@@ -531,7 +535,7 @@ redo:
531 * unevictable page on [in]active list. 535 * unevictable page on [in]active list.
532 * We know how to handle that. 536 * We know how to handle that.
533 */ 537 */
534 lru = active + page_is_file_cache(page); 538 lru = active + page_lru_base_type(page);
535 lru_cache_add_lru(page, lru); 539 lru_cache_add_lru(page, lru);
536 } else { 540 } else {
537 /* 541 /*
@@ -659,7 +663,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
659 * processes. Try to unmap it here. 663 * processes. Try to unmap it here.
660 */ 664 */
661 if (page_mapped(page) && mapping) { 665 if (page_mapped(page) && mapping) {
662 switch (try_to_unmap(page, 0)) { 666 switch (try_to_unmap(page, TTU_UNMAP)) {
663 case SWAP_FAIL: 667 case SWAP_FAIL:
664 goto activate_locked; 668 goto activate_locked;
665 case SWAP_AGAIN: 669 case SWAP_AGAIN:
@@ -821,7 +825,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
821 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 825 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
822 return ret; 826 return ret;
823 827
824 if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) 828 if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
825 return ret; 829 return ret;
826 830
827 /* 831 /*
@@ -935,6 +939,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
935 /* Check that we have not crossed a zone boundary. */ 939 /* Check that we have not crossed a zone boundary. */
936 if (unlikely(page_zone_id(cursor_page) != zone_id)) 940 if (unlikely(page_zone_id(cursor_page) != zone_id))
937 continue; 941 continue;
942
943 /*
944 * If we don't have enough swap space, reclaiming of
945 * anon page which don't already have a swap slot is
946 * pointless.
947 */
948 if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
949 !PageSwapCache(cursor_page))
950 continue;
951
938 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 952 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
939 list_move(&cursor_page->lru, dst); 953 list_move(&cursor_page->lru, dst);
940 mem_cgroup_del_lru(cursor_page); 954 mem_cgroup_del_lru(cursor_page);
@@ -961,7 +975,7 @@ static unsigned long isolate_pages_global(unsigned long nr,
961 if (file) 975 if (file)
962 lru += LRU_FILE; 976 lru += LRU_FILE;
963 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, 977 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
964 mode, !!file); 978 mode, file);
965} 979}
966 980
967/* 981/*
@@ -976,7 +990,7 @@ static unsigned long clear_active_flags(struct list_head *page_list,
976 struct page *page; 990 struct page *page;
977 991
978 list_for_each_entry(page, page_list, lru) { 992 list_for_each_entry(page, page_list, lru) {
979 lru = page_is_file_cache(page); 993 lru = page_lru_base_type(page);
980 if (PageActive(page)) { 994 if (PageActive(page)) {
981 lru += LRU_ACTIVE; 995 lru += LRU_ACTIVE;
982 ClearPageActive(page); 996 ClearPageActive(page);
@@ -1034,6 +1048,31 @@ int isolate_lru_page(struct page *page)
1034} 1048}
1035 1049
1036/* 1050/*
1051 * Are there way too many processes in the direct reclaim path already?
1052 */
1053static int too_many_isolated(struct zone *zone, int file,
1054 struct scan_control *sc)
1055{
1056 unsigned long inactive, isolated;
1057
1058 if (current_is_kswapd())
1059 return 0;
1060
1061 if (!scanning_global_lru(sc))
1062 return 0;
1063
1064 if (file) {
1065 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1066 isolated = zone_page_state(zone, NR_ISOLATED_FILE);
1067 } else {
1068 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1069 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1070 }
1071
1072 return isolated > inactive;
1073}
1074
1075/*
1037 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1076 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1038 * of reclaimed pages 1077 * of reclaimed pages
1039 */ 1078 */
@@ -1048,6 +1087,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1048 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1087 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1049 int lumpy_reclaim = 0; 1088 int lumpy_reclaim = 0;
1050 1089
1090 while (unlikely(too_many_isolated(zone, file, sc))) {
1091 congestion_wait(WRITE, HZ/10);
1092
1093 /* We are about to die and free our memory. Return now. */
1094 if (fatal_signal_pending(current))
1095 return SWAP_CLUSTER_MAX;
1096 }
1097
1051 /* 1098 /*
1052 * If we need a large contiguous chunk of memory, or have 1099 * If we need a large contiguous chunk of memory, or have
1053 * trouble getting a small set of contiguous pages, we 1100 * trouble getting a small set of contiguous pages, we
@@ -1072,10 +1119,26 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1072 unsigned long nr_active; 1119 unsigned long nr_active;
1073 unsigned int count[NR_LRU_LISTS] = { 0, }; 1120 unsigned int count[NR_LRU_LISTS] = { 0, };
1074 int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; 1121 int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
1122 unsigned long nr_anon;
1123 unsigned long nr_file;
1075 1124
1076 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 1125 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
1077 &page_list, &nr_scan, sc->order, mode, 1126 &page_list, &nr_scan, sc->order, mode,
1078 zone, sc->mem_cgroup, 0, file); 1127 zone, sc->mem_cgroup, 0, file);
1128
1129 if (scanning_global_lru(sc)) {
1130 zone->pages_scanned += nr_scan;
1131 if (current_is_kswapd())
1132 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
1133 nr_scan);
1134 else
1135 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1136 nr_scan);
1137 }
1138
1139 if (nr_taken == 0)
1140 goto done;
1141
1079 nr_active = clear_active_flags(&page_list, count); 1142 nr_active = clear_active_flags(&page_list, count);
1080 __count_vm_events(PGDEACTIVATE, nr_active); 1143 __count_vm_events(PGDEACTIVATE, nr_active);
1081 1144
@@ -1088,8 +1151,10 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1088 __mod_zone_page_state(zone, NR_INACTIVE_ANON, 1151 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1089 -count[LRU_INACTIVE_ANON]); 1152 -count[LRU_INACTIVE_ANON]);
1090 1153
1091 if (scanning_global_lru(sc)) 1154 nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1092 zone->pages_scanned += nr_scan; 1155 nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1156 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
1157 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1093 1158
1094 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; 1159 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
1095 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; 1160 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
@@ -1123,18 +1188,12 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1123 } 1188 }
1124 1189
1125 nr_reclaimed += nr_freed; 1190 nr_reclaimed += nr_freed;
1191
1126 local_irq_disable(); 1192 local_irq_disable();
1127 if (current_is_kswapd()) { 1193 if (current_is_kswapd())
1128 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
1129 __count_vm_events(KSWAPD_STEAL, nr_freed); 1194 __count_vm_events(KSWAPD_STEAL, nr_freed);
1130 } else if (scanning_global_lru(sc))
1131 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
1132
1133 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 1195 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
1134 1196
1135 if (nr_taken == 0)
1136 goto done;
1137
1138 spin_lock(&zone->lru_lock); 1197 spin_lock(&zone->lru_lock);
1139 /* 1198 /*
1140 * Put back any unfreeable pages. 1199 * Put back any unfreeable pages.
@@ -1153,8 +1212,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1153 SetPageLRU(page); 1212 SetPageLRU(page);
1154 lru = page_lru(page); 1213 lru = page_lru(page);
1155 add_page_to_lru_list(zone, page, lru); 1214 add_page_to_lru_list(zone, page, lru);
1156 if (PageActive(page)) { 1215 if (is_active_lru(lru)) {
1157 int file = !!page_is_file_cache(page); 1216 int file = is_file_lru(lru);
1158 reclaim_stat->recent_rotated[file]++; 1217 reclaim_stat->recent_rotated[file]++;
1159 } 1218 }
1160 if (!pagevec_add(&pvec, page)) { 1219 if (!pagevec_add(&pvec, page)) {
@@ -1163,10 +1222,13 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1163 spin_lock_irq(&zone->lru_lock); 1222 spin_lock_irq(&zone->lru_lock);
1164 } 1223 }
1165 } 1224 }
1225 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1226 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1227
1166 } while (nr_scanned < max_scan); 1228 } while (nr_scanned < max_scan);
1167 spin_unlock(&zone->lru_lock); 1229
1168done: 1230done:
1169 local_irq_enable(); 1231 spin_unlock_irq(&zone->lru_lock);
1170 pagevec_release(&pvec); 1232 pagevec_release(&pvec);
1171 return nr_reclaimed; 1233 return nr_reclaimed;
1172} 1234}
@@ -1215,15 +1277,10 @@ static void move_active_pages_to_lru(struct zone *zone,
1215 1277
1216 while (!list_empty(list)) { 1278 while (!list_empty(list)) {
1217 page = lru_to_page(list); 1279 page = lru_to_page(list);
1218 prefetchw_prev_lru_page(page, list, flags);
1219 1280
1220 VM_BUG_ON(PageLRU(page)); 1281 VM_BUG_ON(PageLRU(page));
1221 SetPageLRU(page); 1282 SetPageLRU(page);
1222 1283
1223 VM_BUG_ON(!PageActive(page));
1224 if (!is_active_lru(lru))
1225 ClearPageActive(page); /* we are de-activating */
1226
1227 list_move(&page->lru, &zone->lru[lru].list); 1284 list_move(&page->lru, &zone->lru[lru].list);
1228 mem_cgroup_add_lru_list(page, lru); 1285 mem_cgroup_add_lru_list(page, lru);
1229 pgmoved++; 1286 pgmoved++;
@@ -1244,7 +1301,7 @@ static void move_active_pages_to_lru(struct zone *zone,
1244static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1301static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1245 struct scan_control *sc, int priority, int file) 1302 struct scan_control *sc, int priority, int file)
1246{ 1303{
1247 unsigned long pgmoved; 1304 unsigned long nr_taken;
1248 unsigned long pgscanned; 1305 unsigned long pgscanned;
1249 unsigned long vm_flags; 1306 unsigned long vm_flags;
1250 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1307 LIST_HEAD(l_hold); /* The pages which were snipped off */
@@ -1252,10 +1309,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1252 LIST_HEAD(l_inactive); 1309 LIST_HEAD(l_inactive);
1253 struct page *page; 1310 struct page *page;
1254 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1311 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1312 unsigned long nr_rotated = 0;
1255 1313
1256 lru_add_drain(); 1314 lru_add_drain();
1257 spin_lock_irq(&zone->lru_lock); 1315 spin_lock_irq(&zone->lru_lock);
1258 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, 1316 nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1259 ISOLATE_ACTIVE, zone, 1317 ISOLATE_ACTIVE, zone,
1260 sc->mem_cgroup, 1, file); 1318 sc->mem_cgroup, 1, file);
1261 /* 1319 /*
@@ -1265,16 +1323,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1265 if (scanning_global_lru(sc)) { 1323 if (scanning_global_lru(sc)) {
1266 zone->pages_scanned += pgscanned; 1324 zone->pages_scanned += pgscanned;
1267 } 1325 }
1268 reclaim_stat->recent_scanned[!!file] += pgmoved; 1326 reclaim_stat->recent_scanned[file] += nr_taken;
1269 1327
1270 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1328 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1271 if (file) 1329 if (file)
1272 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); 1330 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
1273 else 1331 else
1274 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); 1332 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
1333 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1275 spin_unlock_irq(&zone->lru_lock); 1334 spin_unlock_irq(&zone->lru_lock);
1276 1335
1277 pgmoved = 0; /* count referenced (mapping) mapped pages */
1278 while (!list_empty(&l_hold)) { 1336 while (!list_empty(&l_hold)) {
1279 cond_resched(); 1337 cond_resched();
1280 page = lru_to_page(&l_hold); 1338 page = lru_to_page(&l_hold);
@@ -1288,7 +1346,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1288 /* page_referenced clears PageReferenced */ 1346 /* page_referenced clears PageReferenced */
1289 if (page_mapping_inuse(page) && 1347 if (page_mapping_inuse(page) &&
1290 page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { 1348 page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1291 pgmoved++; 1349 nr_rotated++;
1292 /* 1350 /*
1293 * Identify referenced, file-backed active pages and 1351 * Identify referenced, file-backed active pages and
1294 * give them one more trip around the active list. So 1352 * give them one more trip around the active list. So
@@ -1304,6 +1362,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1304 } 1362 }
1305 } 1363 }
1306 1364
1365 ClearPageActive(page); /* we are de-activating */
1307 list_add(&page->lru, &l_inactive); 1366 list_add(&page->lru, &l_inactive);
1308 } 1367 }
1309 1368
@@ -1317,13 +1376,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1317 * helps balance scan pressure between file and anonymous pages in 1376 * helps balance scan pressure between file and anonymous pages in
1318 * get_scan_ratio. 1377 * get_scan_ratio.
1319 */ 1378 */
1320 reclaim_stat->recent_rotated[!!file] += pgmoved; 1379 reclaim_stat->recent_rotated[file] += nr_rotated;
1321 1380
1322 move_active_pages_to_lru(zone, &l_active, 1381 move_active_pages_to_lru(zone, &l_active,
1323 LRU_ACTIVE + file * LRU_FILE); 1382 LRU_ACTIVE + file * LRU_FILE);
1324 move_active_pages_to_lru(zone, &l_inactive, 1383 move_active_pages_to_lru(zone, &l_inactive,
1325 LRU_BASE + file * LRU_FILE); 1384 LRU_BASE + file * LRU_FILE);
1326 1385 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1327 spin_unlock_irq(&zone->lru_lock); 1386 spin_unlock_irq(&zone->lru_lock);
1328} 1387}
1329 1388
@@ -1429,10 +1488,10 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1429 unsigned long ap, fp; 1488 unsigned long ap, fp;
1430 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1489 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1431 1490
1432 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + 1491 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1433 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); 1492 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1434 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + 1493 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1435 zone_nr_pages(zone, sc, LRU_INACTIVE_FILE); 1494 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1436 1495
1437 if (scanning_global_lru(sc)) { 1496 if (scanning_global_lru(sc)) {
1438 free = zone_page_state(zone, NR_FREE_PAGES); 1497 free = zone_page_state(zone, NR_FREE_PAGES);
@@ -1526,6 +1585,7 @@ static void shrink_zone(int priority, struct zone *zone,
1526 enum lru_list l; 1585 enum lru_list l;
1527 unsigned long nr_reclaimed = sc->nr_reclaimed; 1586 unsigned long nr_reclaimed = sc->nr_reclaimed;
1528 unsigned long swap_cluster_max = sc->swap_cluster_max; 1587 unsigned long swap_cluster_max = sc->swap_cluster_max;
1588 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1529 int noswap = 0; 1589 int noswap = 0;
1530 1590
1531 /* If we have no swap space, do not bother scanning anon pages. */ 1591 /* If we have no swap space, do not bother scanning anon pages. */
@@ -1540,17 +1600,14 @@ static void shrink_zone(int priority, struct zone *zone,
1540 int file = is_file_lru(l); 1600 int file = is_file_lru(l);
1541 unsigned long scan; 1601 unsigned long scan;
1542 1602
1543 scan = zone_nr_pages(zone, sc, l); 1603 scan = zone_nr_lru_pages(zone, sc, l);
1544 if (priority || noswap) { 1604 if (priority || noswap) {
1545 scan >>= priority; 1605 scan >>= priority;
1546 scan = (scan * percent[file]) / 100; 1606 scan = (scan * percent[file]) / 100;
1547 } 1607 }
1548 if (scanning_global_lru(sc)) 1608 nr[l] = nr_scan_try_batch(scan,
1549 nr[l] = nr_scan_try_batch(scan, 1609 &reclaim_stat->nr_saved_scan[l],
1550 &zone->lru[l].nr_saved_scan, 1610 swap_cluster_max);
1551 swap_cluster_max);
1552 else
1553 nr[l] = scan;
1554 } 1611 }
1555 1612
1556 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1613 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1652,10 +1709,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1652 * 1709 *
1653 * If the caller is !__GFP_FS then the probability of a failure is reasonably 1710 * If the caller is !__GFP_FS then the probability of a failure is reasonably
1654 * high - the zone may be full of dirty or under-writeback pages, which this 1711 * high - the zone may be full of dirty or under-writeback pages, which this
1655 * caller can't do much about. We kick pdflush and take explicit naps in the 1712 * caller can't do much about. We kick the writeback threads and take explicit
1656 * hope that some of these pages can be written. But if the allocating task 1713 * naps in the hope that some of these pages can be written. But if the
1657 * holds filesystem locks which prevent writeout this might not work, and the 1714 * allocating task holds filesystem locks which prevent writeout this might not
1658 * allocation attempt will fail. 1715 * work, and the allocation attempt will fail.
1659 * 1716 *
1660 * returns: 0, if no pages reclaimed 1717 * returns: 0, if no pages reclaimed
1661 * else, the number of pages reclaimed 1718 * else, the number of pages reclaimed
@@ -1685,7 +1742,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1685 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1742 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1686 continue; 1743 continue;
1687 1744
1688 lru_pages += zone_lru_pages(zone); 1745 lru_pages += zone_reclaimable_pages(zone);
1689 } 1746 }
1690 } 1747 }
1691 1748
@@ -1779,11 +1836,45 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1779 1836
1780#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1837#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1781 1838
1839unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1840 gfp_t gfp_mask, bool noswap,
1841 unsigned int swappiness,
1842 struct zone *zone, int nid)
1843{
1844 struct scan_control sc = {
1845 .may_writepage = !laptop_mode,
1846 .may_unmap = 1,
1847 .may_swap = !noswap,
1848 .swap_cluster_max = SWAP_CLUSTER_MAX,
1849 .swappiness = swappiness,
1850 .order = 0,
1851 .mem_cgroup = mem,
1852 .isolate_pages = mem_cgroup_isolate_pages,
1853 };
1854 nodemask_t nm = nodemask_of_node(nid);
1855
1856 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1857 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1858 sc.nodemask = &nm;
1859 sc.nr_reclaimed = 0;
1860 sc.nr_scanned = 0;
1861 /*
1862 * NOTE: Although we can get the priority field, using it
1863 * here is not a good idea, since it limits the pages we can scan.
1864 * if we don't reclaim here, the shrink_zone from balance_pgdat
1865 * will pick up pages from other mem cgroup's as well. We hack
1866 * the priority and make it zero.
1867 */
1868 shrink_zone(0, zone, &sc);
1869 return sc.nr_reclaimed;
1870}
1871
1782unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 1872unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1783 gfp_t gfp_mask, 1873 gfp_t gfp_mask,
1784 bool noswap, 1874 bool noswap,
1785 unsigned int swappiness) 1875 unsigned int swappiness)
1786{ 1876{
1877 struct zonelist *zonelist;
1787 struct scan_control sc = { 1878 struct scan_control sc = {
1788 .may_writepage = !laptop_mode, 1879 .may_writepage = !laptop_mode,
1789 .may_unmap = 1, 1880 .may_unmap = 1,
@@ -1795,7 +1886,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1795 .isolate_pages = mem_cgroup_isolate_pages, 1886 .isolate_pages = mem_cgroup_isolate_pages,
1796 .nodemask = NULL, /* we don't care the placement */ 1887 .nodemask = NULL, /* we don't care the placement */
1797 }; 1888 };
1798 struct zonelist *zonelist;
1799 1889
1800 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 1890 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1801 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 1891 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -1902,7 +1992,7 @@ loop_again:
1902 for (i = 0; i <= end_zone; i++) { 1992 for (i = 0; i <= end_zone; i++) {
1903 struct zone *zone = pgdat->node_zones + i; 1993 struct zone *zone = pgdat->node_zones + i;
1904 1994
1905 lru_pages += zone_lru_pages(zone); 1995 lru_pages += zone_reclaimable_pages(zone);
1906 } 1996 }
1907 1997
1908 /* 1998 /*
@@ -1917,6 +2007,7 @@ loop_again:
1917 for (i = 0; i <= end_zone; i++) { 2007 for (i = 0; i <= end_zone; i++) {
1918 struct zone *zone = pgdat->node_zones + i; 2008 struct zone *zone = pgdat->node_zones + i;
1919 int nr_slab; 2009 int nr_slab;
2010 int nid, zid;
1920 2011
1921 if (!populated_zone(zone)) 2012 if (!populated_zone(zone))
1922 continue; 2013 continue;
@@ -1931,6 +2022,15 @@ loop_again:
1931 temp_priority[i] = priority; 2022 temp_priority[i] = priority;
1932 sc.nr_scanned = 0; 2023 sc.nr_scanned = 0;
1933 note_zone_scanning_priority(zone, priority); 2024 note_zone_scanning_priority(zone, priority);
2025
2026 nid = pgdat->node_id;
2027 zid = zone_idx(zone);
2028 /*
2029 * Call soft limit reclaim before calling shrink_zone.
2030 * For now we ignore the return value
2031 */
2032 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
2033 nid, zid);
1934 /* 2034 /*
1935 * We put equal pressure on every zone, unless one 2035 * We put equal pressure on every zone, unless one
1936 * zone has way too many pages free already. 2036 * zone has way too many pages free already.
@@ -1946,7 +2046,7 @@ loop_again:
1946 if (zone_is_all_unreclaimable(zone)) 2046 if (zone_is_all_unreclaimable(zone))
1947 continue; 2047 continue;
1948 if (nr_slab == 0 && zone->pages_scanned >= 2048 if (nr_slab == 0 && zone->pages_scanned >=
1949 (zone_lru_pages(zone) * 6)) 2049 (zone_reclaimable_pages(zone) * 6))
1950 zone_set_flag(zone, 2050 zone_set_flag(zone,
1951 ZONE_ALL_UNRECLAIMABLE); 2051 ZONE_ALL_UNRECLAIMABLE);
1952 /* 2052 /*
@@ -2113,12 +2213,39 @@ void wakeup_kswapd(struct zone *zone, int order)
2113 wake_up_interruptible(&pgdat->kswapd_wait); 2213 wake_up_interruptible(&pgdat->kswapd_wait);
2114} 2214}
2115 2215
2116unsigned long global_lru_pages(void) 2216/*
2217 * The reclaimable count would be mostly accurate.
2218 * The less reclaimable pages may be
2219 * - mlocked pages, which will be moved to unevictable list when encountered
2220 * - mapped pages, which may require several travels to be reclaimed
2221 * - dirty pages, which is not "instantly" reclaimable
2222 */
2223unsigned long global_reclaimable_pages(void)
2224{
2225 int nr;
2226
2227 nr = global_page_state(NR_ACTIVE_FILE) +
2228 global_page_state(NR_INACTIVE_FILE);
2229
2230 if (nr_swap_pages > 0)
2231 nr += global_page_state(NR_ACTIVE_ANON) +
2232 global_page_state(NR_INACTIVE_ANON);
2233
2234 return nr;
2235}
2236
2237unsigned long zone_reclaimable_pages(struct zone *zone)
2117{ 2238{
2118 return global_page_state(NR_ACTIVE_ANON) 2239 int nr;
2119 + global_page_state(NR_ACTIVE_FILE) 2240
2120 + global_page_state(NR_INACTIVE_ANON) 2241 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
2121 + global_page_state(NR_INACTIVE_FILE); 2242 zone_page_state(zone, NR_INACTIVE_FILE);
2243
2244 if (nr_swap_pages > 0)
2245 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
2246 zone_page_state(zone, NR_INACTIVE_ANON);
2247
2248 return nr;
2122} 2249}
2123 2250
2124#ifdef CONFIG_HIBERNATION 2251#ifdef CONFIG_HIBERNATION
@@ -2133,6 +2260,7 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
2133{ 2260{
2134 struct zone *zone; 2261 struct zone *zone;
2135 unsigned long nr_reclaimed = 0; 2262 unsigned long nr_reclaimed = 0;
2263 struct zone_reclaim_stat *reclaim_stat;
2136 2264
2137 for_each_populated_zone(zone) { 2265 for_each_populated_zone(zone) {
2138 enum lru_list l; 2266 enum lru_list l;
@@ -2149,11 +2277,14 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
2149 l == LRU_ACTIVE_FILE)) 2277 l == LRU_ACTIVE_FILE))
2150 continue; 2278 continue;
2151 2279
2152 zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; 2280 reclaim_stat = get_reclaim_stat(zone, sc);
2153 if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { 2281 reclaim_stat->nr_saved_scan[l] +=
2282 (lru_pages >> prio) + 1;
2283 if (reclaim_stat->nr_saved_scan[l]
2284 >= nr_pages || pass > 3) {
2154 unsigned long nr_to_scan; 2285 unsigned long nr_to_scan;
2155 2286
2156 zone->lru[l].nr_saved_scan = 0; 2287 reclaim_stat->nr_saved_scan[l] = 0;
2157 nr_to_scan = min(nr_pages, lru_pages); 2288 nr_to_scan = min(nr_pages, lru_pages);
2158 nr_reclaimed += shrink_list(l, nr_to_scan, zone, 2289 nr_reclaimed += shrink_list(l, nr_to_scan, zone,
2159 sc, prio); 2290 sc, prio);
@@ -2190,7 +2321,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2190 2321
2191 current->reclaim_state = &reclaim_state; 2322 current->reclaim_state = &reclaim_state;
2192 2323
2193 lru_pages = global_lru_pages(); 2324 lru_pages = global_reclaimable_pages();
2194 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 2325 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
2195 /* If slab caches are huge, it's better to hit them first */ 2326 /* If slab caches are huge, it's better to hit them first */
2196 while (nr_slab >= lru_pages) { 2327 while (nr_slab >= lru_pages) {
@@ -2232,7 +2363,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2232 2363
2233 reclaim_state.reclaimed_slab = 0; 2364 reclaim_state.reclaimed_slab = 0;
2234 shrink_slab(sc.nr_scanned, sc.gfp_mask, 2365 shrink_slab(sc.nr_scanned, sc.gfp_mask,
2235 global_lru_pages()); 2366 global_reclaimable_pages());
2236 sc.nr_reclaimed += reclaim_state.reclaimed_slab; 2367 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2237 if (sc.nr_reclaimed >= nr_pages) 2368 if (sc.nr_reclaimed >= nr_pages)
2238 goto out; 2369 goto out;
@@ -2249,7 +2380,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2249 if (!sc.nr_reclaimed) { 2380 if (!sc.nr_reclaimed) {
2250 do { 2381 do {
2251 reclaim_state.reclaimed_slab = 0; 2382 reclaim_state.reclaimed_slab = 0;
2252 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); 2383 shrink_slab(nr_pages, sc.gfp_mask,
2384 global_reclaimable_pages());
2253 sc.nr_reclaimed += reclaim_state.reclaimed_slab; 2385 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2254 } while (sc.nr_reclaimed < nr_pages && 2386 } while (sc.nr_reclaimed < nr_pages &&
2255 reclaim_state.reclaimed_slab > 0); 2387 reclaim_state.reclaimed_slab > 0);
@@ -2569,7 +2701,7 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone)
2569retry: 2701retry:
2570 ClearPageUnevictable(page); 2702 ClearPageUnevictable(page);
2571 if (page_evictable(page, NULL)) { 2703 if (page_evictable(page, NULL)) {
2572 enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); 2704 enum lru_list l = page_lru_base_type(page);
2573 2705
2574 __dec_zone_state(zone, NR_UNEVICTABLE); 2706 __dec_zone_state(zone, NR_UNEVICTABLE);
2575 list_move(&page->lru, &zone->lru[l].list); 2707 list_move(&page->lru, &zone->lru[l].list);
@@ -2712,10 +2844,10 @@ static void scan_all_zones_unevictable_pages(void)
2712unsigned long scan_unevictable_pages; 2844unsigned long scan_unevictable_pages;
2713 2845
2714int scan_unevictable_handler(struct ctl_table *table, int write, 2846int scan_unevictable_handler(struct ctl_table *table, int write,
2715 struct file *file, void __user *buffer, 2847 void __user *buffer,
2716 size_t *length, loff_t *ppos) 2848 size_t *length, loff_t *ppos)
2717{ 2849{
2718 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 2850 proc_doulongvec_minmax(table, write, buffer, length, ppos);
2719 2851
2720 if (write && *(unsigned long *)table->data) 2852 if (write && *(unsigned long *)table->data)
2721 scan_all_zones_unevictable_pages(); 2853 scan_all_zones_unevictable_pages();
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 138bed53706e..c81321f9feec 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -639,11 +639,14 @@ static const char * const vmstat_text[] = {
639 "nr_slab_reclaimable", 639 "nr_slab_reclaimable",
640 "nr_slab_unreclaimable", 640 "nr_slab_unreclaimable",
641 "nr_page_table_pages", 641 "nr_page_table_pages",
642 "nr_kernel_stack",
642 "nr_unstable", 643 "nr_unstable",
643 "nr_bounce", 644 "nr_bounce",
644 "nr_vmscan_write", 645 "nr_vmscan_write",
645 "nr_writeback_temp", 646 "nr_writeback_temp",
646 647 "nr_isolated_anon",
648 "nr_isolated_file",
649 "nr_shmem",
647#ifdef CONFIG_NUMA 650#ifdef CONFIG_NUMA
648 "numa_hit", 651 "numa_hit",
649 "numa_miss", 652 "numa_miss",