Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |   12
-rw-r--r--  mm/Makefile          |    3
-rw-r--r--  mm/filemap.c         |    4
-rw-r--r--  mm/hugetlb.c         |  251
-rw-r--r--  mm/internal.h        |   10
-rw-r--r--  mm/ksm.c             | 1703
-rw-r--r--  mm/madvise.c         |   53
-rw-r--r--  mm/memcontrol.c      |    2
-rw-r--r--  mm/memory.c          |  212
-rw-r--r--  mm/memory_hotplug.c  |    7
-rw-r--r--  mm/mempool.c         |    7
-rw-r--r--  mm/migrate.c         |   24
-rw-r--r--  mm/mlock.c           |  128
-rw-r--r--  mm/mmap.c            |   57
-rw-r--r--  mm/mmu_context.c     |   58
-rw-r--r--  mm/mmu_notifier.c    |   20
-rw-r--r--  mm/mprotect.c        |    4
-rw-r--r--  mm/mremap.c          |   14
-rw-r--r--  mm/nommu.c           |   45
-rw-r--r--  mm/oom_kill.c        |   86
-rw-r--r--  mm/page-writeback.c  |   16
-rw-r--r--  mm/page_alloc.c      |  284
-rw-r--r--  mm/page_cgroup.c     |   12
-rw-r--r--  mm/rmap.c            |   78
-rw-r--r--  mm/shmem.c           |   15
-rw-r--r--  mm/slab.c            |    2
-rw-r--r--  mm/slub.c            |    3
-rw-r--r--  mm/sparse-vmemmap.c  |    8
-rw-r--r--  mm/sparse.c          |    9
-rw-r--r--  mm/swap.c            |    8
-rw-r--r--  mm/swap_state.c      |  143
-rw-r--r--  mm/swapfile.c        |    4
-rw-r--r--  mm/vmalloc.c         |  221
-rw-r--r--  mm/vmscan.c          |  213
-rw-r--r--  mm/vmstat.c          |    5
35 files changed, 3045 insertions, 676 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 3aa519f52e18..71eb0b4cce8d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -214,6 +214,18 @@ config HAVE_MLOCKED_PAGE_BIT
 config MMU_NOTIFIER
 	bool
 
+config KSM
+	bool "Enable KSM for page merging"
+	depends on MMU
+	help
+	  Enable Kernel Samepage Merging: KSM periodically scans those areas
+	  of an application's address space that an app has advised may be
+	  mergeable. When it finds pages of identical content, it replaces
+	  the many instances by a single resident page with that content, so
+	  saving memory until one or another app needs to modify the content.
+	  Recommended for use with KVM, or with other duplicative applications.
+	  See Documentation/vm/ksm.txt for more information.
+
 config DEFAULT_MMAP_MIN_ADDR
 	int "Low address space to protect from user allocation"
 	default 4096
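
The help text above says KSM only scans memory that an application "has advised may be mergeable". That advice is given per address range with madvise(2) and the MADV_MERGEABLE flag introduced alongside this option (see the mm/madvise.c entry in the diffstat and Documentation/vm/ksm.txt). A minimal userspace sketch, assuming a libc that exposes MADV_MERGEABLE and MAP_ANONYMOUS in <sys/mman.h>:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t len = 16 * 4096;         /* arbitrary: 16 page-sized chunks */
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        /* Give every page identical contents so KSM has something to merge */
        memset(buf, 0x5a, len);

        /* Advise the kernel that this anonymous range may be merged */
        if (madvise(buf, len, MADV_MERGEABLE) != 0)
            perror("madvise(MADV_MERGEABLE)"); /* e.g. kernel without CONFIG_KSM */

        pause();    /* keep the mapping alive so ksmd can find and merge it */
        return 0;
    }

Scanning itself is started and tuned through the sysfs controls described in Documentation/vm/ksm.txt (the ksm.c code below is reached "only through the sysfs control interface" when unmerging).
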
diff --git a/mm/Makefile b/mm/Makefile
index ea4b18bd3960..728a9fde49d1 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 		   maccess.o page_alloc.o page-writeback.o \
 		   readahead.o swap.o truncate.o vmscan.o shmem.o \
 		   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-		   page_isolation.o mm_init.o $(mmu-y)
+		   page_isolation.o mm_init.o mmu_context.o $(mmu-y)
 obj-y += init-mm.o
 
 obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
@@ -25,6 +25,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
 obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+obj-$(CONFIG_KSM) += ksm.o
 obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
diff --git a/mm/filemap.c b/mm/filemap.c
index dd51c68e2b86..bcc7372aebbc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -119,6 +119,8 @@ void __remove_from_page_cache(struct page *page)
 	page->mapping = NULL;
 	mapping->nrpages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
+	if (PageSwapBacked(page))
+		__dec_zone_page_state(page, NR_SHMEM);
 	BUG_ON(page_mapped(page));
 
 	/*
@@ -431,6 +433,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 	if (likely(!error)) {
 		mapping->nrpages++;
 		__inc_zone_page_state(page, NR_FILE_PAGES);
+		if (PageSwapBacked(page))
+			__inc_zone_page_state(page, NR_SHMEM);
 		spin_unlock_irq(&mapping->tree_lock);
 	} else {
 		page->mapping = NULL;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b16d63634777..815dbd4a6dcb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -456,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
456 h->free_huge_pages_node[nid]++; 456 h->free_huge_pages_node[nid]++;
457} 457}
458 458
459static struct page *dequeue_huge_page(struct hstate *h)
460{
461 int nid;
462 struct page *page = NULL;
463
464 for (nid = 0; nid < MAX_NUMNODES; ++nid) {
465 if (!list_empty(&h->hugepage_freelists[nid])) {
466 page = list_entry(h->hugepage_freelists[nid].next,
467 struct page, lru);
468 list_del(&page->lru);
469 h->free_huge_pages--;
470 h->free_huge_pages_node[nid]--;
471 break;
472 }
473 }
474 return page;
475}
476
477static struct page *dequeue_huge_page_vma(struct hstate *h, 459static struct page *dequeue_huge_page_vma(struct hstate *h,
478 struct vm_area_struct *vma, 460 struct vm_area_struct *vma,
479 unsigned long address, int avoid_reserve) 461 unsigned long address, int avoid_reserve)
@@ -641,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
641 623
642/* 624/*
643 * Use a helper variable to find the next node and then 625 * Use a helper variable to find the next node and then
644 * copy it back to hugetlb_next_nid afterwards: 626 * copy it back to next_nid_to_alloc afterwards:
645 * otherwise there's a window in which a racer might 627 * otherwise there's a window in which a racer might
646 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. 628 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
647 * But we don't need to use a spin_lock here: it really 629 * But we don't need to use a spin_lock here: it really
@@ -650,13 +632,13 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
650 * if we just successfully allocated a hugepage so that 632 * if we just successfully allocated a hugepage so that
651 * the next caller gets hugepages on the next node. 633 * the next caller gets hugepages on the next node.
652 */ 634 */
653static int hstate_next_node(struct hstate *h) 635static int hstate_next_node_to_alloc(struct hstate *h)
654{ 636{
655 int next_nid; 637 int next_nid;
656 next_nid = next_node(h->hugetlb_next_nid, node_online_map); 638 next_nid = next_node(h->next_nid_to_alloc, node_online_map);
657 if (next_nid == MAX_NUMNODES) 639 if (next_nid == MAX_NUMNODES)
658 next_nid = first_node(node_online_map); 640 next_nid = first_node(node_online_map);
659 h->hugetlb_next_nid = next_nid; 641 h->next_nid_to_alloc = next_nid;
660 return next_nid; 642 return next_nid;
661} 643}
662 644
@@ -667,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h)
667 int next_nid; 649 int next_nid;
668 int ret = 0; 650 int ret = 0;
669 651
670 start_nid = h->hugetlb_next_nid; 652 start_nid = h->next_nid_to_alloc;
653 next_nid = start_nid;
671 654
672 do { 655 do {
673 page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); 656 page = alloc_fresh_huge_page_node(h, next_nid);
674 if (page) 657 if (page)
675 ret = 1; 658 ret = 1;
676 next_nid = hstate_next_node(h); 659 next_nid = hstate_next_node_to_alloc(h);
677 } while (!page && h->hugetlb_next_nid != start_nid); 660 } while (!page && next_nid != start_nid);
678 661
679 if (ret) 662 if (ret)
680 count_vm_event(HTLB_BUDDY_PGALLOC); 663 count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -684,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h)
684 return ret; 667 return ret;
685} 668}
686 669
670/*
671 * helper for free_pool_huge_page() - find next node
672 * from which to free a huge page
673 */
674static int hstate_next_node_to_free(struct hstate *h)
675{
676 int next_nid;
677 next_nid = next_node(h->next_nid_to_free, node_online_map);
678 if (next_nid == MAX_NUMNODES)
679 next_nid = first_node(node_online_map);
680 h->next_nid_to_free = next_nid;
681 return next_nid;
682}
683
684/*
685 * Free huge page from pool from next node to free.
686 * Attempt to keep persistent huge pages more or less
687 * balanced over allowed nodes.
688 * Called with hugetlb_lock locked.
689 */
690static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
691{
692 int start_nid;
693 int next_nid;
694 int ret = 0;
695
696 start_nid = h->next_nid_to_free;
697 next_nid = start_nid;
698
699 do {
700 /*
701 * If we're returning unused surplus pages, only examine
702 * nodes with surplus pages.
703 */
704 if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
705 !list_empty(&h->hugepage_freelists[next_nid])) {
706 struct page *page =
707 list_entry(h->hugepage_freelists[next_nid].next,
708 struct page, lru);
709 list_del(&page->lru);
710 h->free_huge_pages--;
711 h->free_huge_pages_node[next_nid]--;
712 if (acct_surplus) {
713 h->surplus_huge_pages--;
714 h->surplus_huge_pages_node[next_nid]--;
715 }
716 update_and_free_page(h, page);
717 ret = 1;
718 }
719 next_nid = hstate_next_node_to_free(h);
720 } while (!ret && next_nid != start_nid);
721
722 return ret;
723}
724
687static struct page *alloc_buddy_huge_page(struct hstate *h, 725static struct page *alloc_buddy_huge_page(struct hstate *h,
688 struct vm_area_struct *vma, unsigned long address) 726 struct vm_area_struct *vma, unsigned long address)
689{ 727{
@@ -855,22 +893,13 @@ free:
855 * When releasing a hugetlb pool reservation, any surplus pages that were 893 * When releasing a hugetlb pool reservation, any surplus pages that were
856 * allocated to satisfy the reservation must be explicitly freed if they were 894 * allocated to satisfy the reservation must be explicitly freed if they were
857 * never used. 895 * never used.
896 * Called with hugetlb_lock held.
858 */ 897 */
859static void return_unused_surplus_pages(struct hstate *h, 898static void return_unused_surplus_pages(struct hstate *h,
860 unsigned long unused_resv_pages) 899 unsigned long unused_resv_pages)
861{ 900{
862 static int nid = -1;
863 struct page *page;
864 unsigned long nr_pages; 901 unsigned long nr_pages;
865 902
866 /*
867 * We want to release as many surplus pages as possible, spread
868 * evenly across all nodes. Iterate across all nodes until we
869 * can no longer free unreserved surplus pages. This occurs when
870 * the nodes with surplus pages have no free pages.
871 */
872 unsigned long remaining_iterations = nr_online_nodes;
873
874 /* Uncommit the reservation */ 903 /* Uncommit the reservation */
875 h->resv_huge_pages -= unused_resv_pages; 904 h->resv_huge_pages -= unused_resv_pages;
876 905
@@ -880,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h,
880 909
881 nr_pages = min(unused_resv_pages, h->surplus_huge_pages); 910 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
882 911
883 while (remaining_iterations-- && nr_pages) { 912 /*
884 nid = next_node(nid, node_online_map); 913 * We want to release as many surplus pages as possible, spread
885 if (nid == MAX_NUMNODES) 914 * evenly across all nodes. Iterate across all nodes until we
886 nid = first_node(node_online_map); 915 * can no longer free unreserved surplus pages. This occurs when
887 916 * the nodes with surplus pages have no free pages.
 888 if (!h->surplus_huge_pages_node[nid]) 917 * free_pool_huge_page() will balance the frees across the
889 continue; 918 * on-line nodes for us and will handle the hstate accounting.
890 919 */
891 if (!list_empty(&h->hugepage_freelists[nid])) { 920 while (nr_pages--) {
892 page = list_entry(h->hugepage_freelists[nid].next, 921 if (!free_pool_huge_page(h, 1))
893 struct page, lru); 922 break;
894 list_del(&page->lru);
895 update_and_free_page(h, page);
896 h->free_huge_pages--;
897 h->free_huge_pages_node[nid]--;
898 h->surplus_huge_pages--;
899 h->surplus_huge_pages_node[nid]--;
900 nr_pages--;
901 remaining_iterations = nr_online_nodes;
902 }
903 } 923 }
904} 924}
905 925
@@ -1008,9 +1028,10 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1008 void *addr; 1028 void *addr;
1009 1029
1010 addr = __alloc_bootmem_node_nopanic( 1030 addr = __alloc_bootmem_node_nopanic(
1011 NODE_DATA(h->hugetlb_next_nid), 1031 NODE_DATA(h->next_nid_to_alloc),
1012 huge_page_size(h), huge_page_size(h), 0); 1032 huge_page_size(h), huge_page_size(h), 0);
1013 1033
1034 hstate_next_node_to_alloc(h);
1014 if (addr) { 1035 if (addr) {
1015 /* 1036 /*
1016 * Use the beginning of the huge page to store the 1037 * Use the beginning of the huge page to store the
@@ -1020,7 +1041,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1020 m = addr; 1041 m = addr;
1021 goto found; 1042 goto found;
1022 } 1043 }
1023 hstate_next_node(h);
1024 nr_nodes--; 1044 nr_nodes--;
1025 } 1045 }
1026 return 0; 1046 return 0;
@@ -1141,31 +1161,43 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
1141 */ 1161 */
1142static int adjust_pool_surplus(struct hstate *h, int delta) 1162static int adjust_pool_surplus(struct hstate *h, int delta)
1143{ 1163{
1144 static int prev_nid; 1164 int start_nid, next_nid;
1145 int nid = prev_nid;
1146 int ret = 0; 1165 int ret = 0;
1147 1166
1148 VM_BUG_ON(delta != -1 && delta != 1); 1167 VM_BUG_ON(delta != -1 && delta != 1);
1149 do {
1150 nid = next_node(nid, node_online_map);
1151 if (nid == MAX_NUMNODES)
1152 nid = first_node(node_online_map);
1153 1168
1154 /* To shrink on this node, there must be a surplus page */ 1169 if (delta < 0)
1155 if (delta < 0 && !h->surplus_huge_pages_node[nid]) 1170 start_nid = h->next_nid_to_alloc;
1156 continue; 1171 else
1157 /* Surplus cannot exceed the total number of pages */ 1172 start_nid = h->next_nid_to_free;
1158 if (delta > 0 && h->surplus_huge_pages_node[nid] >= 1173 next_nid = start_nid;
1174
1175 do {
1176 int nid = next_nid;
1177 if (delta < 0) {
1178 next_nid = hstate_next_node_to_alloc(h);
1179 /*
1180 * To shrink on this node, there must be a surplus page
1181 */
1182 if (!h->surplus_huge_pages_node[nid])
1183 continue;
1184 }
1185 if (delta > 0) {
1186 next_nid = hstate_next_node_to_free(h);
1187 /*
1188 * Surplus cannot exceed the total number of pages
1189 */
1190 if (h->surplus_huge_pages_node[nid] >=
1159 h->nr_huge_pages_node[nid]) 1191 h->nr_huge_pages_node[nid])
1160 continue; 1192 continue;
1193 }
1161 1194
1162 h->surplus_huge_pages += delta; 1195 h->surplus_huge_pages += delta;
1163 h->surplus_huge_pages_node[nid] += delta; 1196 h->surplus_huge_pages_node[nid] += delta;
1164 ret = 1; 1197 ret = 1;
1165 break; 1198 break;
1166 } while (nid != prev_nid); 1199 } while (next_nid != start_nid);
1167 1200
1168 prev_nid = nid;
1169 return ret; 1201 return ret;
1170} 1202}
1171 1203
@@ -1227,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1227 min_count = max(count, min_count); 1259 min_count = max(count, min_count);
1228 try_to_free_low(h, min_count); 1260 try_to_free_low(h, min_count);
1229 while (min_count < persistent_huge_pages(h)) { 1261 while (min_count < persistent_huge_pages(h)) {
1230 struct page *page = dequeue_huge_page(h); 1262 if (!free_pool_huge_page(h, 0))
1231 if (!page)
1232 break; 1263 break;
1233 update_and_free_page(h, page);
1234 } 1264 }
1235 while (count < persistent_huge_pages(h)) { 1265 while (count < persistent_huge_pages(h)) {
1236 if (!adjust_pool_surplus(h, 1)) 1266 if (!adjust_pool_surplus(h, 1))
@@ -1442,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order)
1442 h->free_huge_pages = 0; 1472 h->free_huge_pages = 0;
1443 for (i = 0; i < MAX_NUMNODES; ++i) 1473 for (i = 0; i < MAX_NUMNODES; ++i)
1444 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1474 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1445 h->hugetlb_next_nid = first_node(node_online_map); 1475 h->next_nid_to_alloc = first_node(node_online_map);
1476 h->next_nid_to_free = first_node(node_online_map);
1446 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1477 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1447 huge_page_size(h)/1024); 1478 huge_page_size(h)/1024);
1448 1479
@@ -1985,6 +2016,26 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h,
1985 return find_lock_page(mapping, idx); 2016 return find_lock_page(mapping, idx);
1986} 2017}
1987 2018
2019/*
2020 * Return whether there is a pagecache page to back given address within VMA.
2021 * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
2022 */
2023static bool hugetlbfs_pagecache_present(struct hstate *h,
2024 struct vm_area_struct *vma, unsigned long address)
2025{
2026 struct address_space *mapping;
2027 pgoff_t idx;
2028 struct page *page;
2029
2030 mapping = vma->vm_file->f_mapping;
2031 idx = vma_hugecache_offset(h, vma, address);
2032
2033 page = find_get_page(mapping, idx);
2034 if (page)
2035 put_page(page);
2036 return page != NULL;
2037}
2038
1988static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 2039static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1989 unsigned long address, pte_t *ptep, unsigned int flags) 2040 unsigned long address, pte_t *ptep, unsigned int flags)
1990{ 2041{
@@ -2180,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
2180 return NULL; 2231 return NULL;
2181} 2232}
2182 2233
2183static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
2184{
2185 if (!ptep || write || shared)
2186 return 0;
2187 else
2188 return huge_pte_none(huge_ptep_get(ptep));
2189}
2190
2191int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2234int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2192 struct page **pages, struct vm_area_struct **vmas, 2235 struct page **pages, struct vm_area_struct **vmas,
2193 unsigned long *position, int *length, int i, 2236 unsigned long *position, int *length, int i,
2194 int write) 2237 unsigned int flags)
2195{ 2238{
2196 unsigned long pfn_offset; 2239 unsigned long pfn_offset;
2197 unsigned long vaddr = *position; 2240 unsigned long vaddr = *position;
2198 int remainder = *length; 2241 int remainder = *length;
2199 struct hstate *h = hstate_vma(vma); 2242 struct hstate *h = hstate_vma(vma);
2200 int zeropage_ok = 0;
2201 int shared = vma->vm_flags & VM_SHARED;
2202 2243
2203 spin_lock(&mm->page_table_lock); 2244 spin_lock(&mm->page_table_lock);
2204 while (vaddr < vma->vm_end && remainder) { 2245 while (vaddr < vma->vm_end && remainder) {
2205 pte_t *pte; 2246 pte_t *pte;
2247 int absent;
2206 struct page *page; 2248 struct page *page;
2207 2249
2208 /* 2250 /*
2209 * Some archs (sparc64, sh*) have multiple pte_ts to 2251 * Some archs (sparc64, sh*) have multiple pte_ts to
2210 * each hugepage. We have to make * sure we get the 2252 * each hugepage. We have to make sure we get the
2211 * first, for the page indexing below to work. 2253 * first, for the page indexing below to work.
2212 */ 2254 */
2213 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); 2255 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
2214 if (huge_zeropage_ok(pte, write, shared)) 2256 absent = !pte || huge_pte_none(huge_ptep_get(pte));
2215 zeropage_ok = 1; 2257
2258 /*
2259 * When coredumping, it suits get_dump_page if we just return
2260 * an error where there's an empty slot with no huge pagecache
2261 * to back it. This way, we avoid allocating a hugepage, and
2262 * the sparse dumpfile avoids allocating disk blocks, but its
2263 * huge holes still show up with zeroes where they need to be.
2264 */
2265 if (absent && (flags & FOLL_DUMP) &&
2266 !hugetlbfs_pagecache_present(h, vma, vaddr)) {
2267 remainder = 0;
2268 break;
2269 }
2216 2270
2217 if (!pte || 2271 if (absent ||
2218 (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || 2272 ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
2219 (write && !pte_write(huge_ptep_get(pte)))) {
2220 int ret; 2273 int ret;
2221 2274
2222 spin_unlock(&mm->page_table_lock); 2275 spin_unlock(&mm->page_table_lock);
2223 ret = hugetlb_fault(mm, vma, vaddr, write); 2276 ret = hugetlb_fault(mm, vma, vaddr,
2277 (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
2224 spin_lock(&mm->page_table_lock); 2278 spin_lock(&mm->page_table_lock);
2225 if (!(ret & VM_FAULT_ERROR)) 2279 if (!(ret & VM_FAULT_ERROR))
2226 continue; 2280 continue;
2227 2281
2228 remainder = 0; 2282 remainder = 0;
2229 if (!i)
2230 i = -EFAULT;
2231 break; 2283 break;
2232 } 2284 }
2233 2285
@@ -2235,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2235 page = pte_page(huge_ptep_get(pte)); 2287 page = pte_page(huge_ptep_get(pte));
2236same_page: 2288same_page:
2237 if (pages) { 2289 if (pages) {
2238 if (zeropage_ok) 2290 pages[i] = mem_map_offset(page, pfn_offset);
2239 pages[i] = ZERO_PAGE(0);
2240 else
2241 pages[i] = mem_map_offset(page, pfn_offset);
2242 get_page(pages[i]); 2291 get_page(pages[i]);
2243 } 2292 }
2244 2293
@@ -2262,7 +2311,7 @@ same_page:
2262 *length = remainder; 2311 *length = remainder;
2263 *position = vaddr; 2312 *position = vaddr;
2264 2313
2265 return i; 2314 return i ? i : -EFAULT;
2266} 2315}
2267 2316
2268void hugetlb_change_protection(struct vm_area_struct *vma, 2317void hugetlb_change_protection(struct vm_area_struct *vma,
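
The hugetlb changes above replace the per-function `static int nid` cursors with the shared hstate_next_node_to_alloc()/hstate_next_node_to_free() helpers, so allocating and freeing persistent huge pages each walk the online nodes round-robin and wrap back to the first node, keeping the pool roughly balanced. A standalone sketch of that pattern (plain arrays stand in for the hstate counters and node_online_map; all names and numbers here are illustrative, not kernel code):

    #include <stdio.h>

    #define NR_NODES 4                      /* illustrative node count */

    static int next_nid;                    /* mirrors h->next_nid_to_free */

    /* Advance the cursor round-robin over the nodes, wrapping back to node 0. */
    static int next_node_to_free(void)
    {
        next_nid = (next_nid + 1) % NR_NODES;
        return next_nid;
    }

    /* Free one page from the next node that has any, like free_pool_huge_page(). */
    static int free_one_page(int free_pages[])
    {
        int start = next_nid;
        int nid = next_nid;
        int freed = 0;

        do {
            if (free_pages[nid] > 0) {
                free_pages[nid]--;
                freed = 1;
                printf("freed one page from node %d\n", nid);
            }
            nid = next_node_to_free();
        } while (!freed && nid != start);

        return freed;
    }

    int main(void)
    {
        int free_pages[NR_NODES] = { 3, 0, 2, 1 };  /* per-node free huge pages */
        int nr_pages = 5;                           /* pages we want to release */

        /* Like return_unused_surplus_pages(): stop early once nothing is left */
        while (nr_pages-- > 0)
            if (!free_one_page(free_pages))
                break;
        return 0;
    }

As in free_pool_huge_page(), the cursor advances even after a successful free, so consecutive calls land on different nodes.
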
diff --git a/mm/internal.h b/mm/internal.h
index f290c4db528b..22ec8d2b0fb8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,8 @@ static inline void __put_page(struct page *page)
 	atomic_dec(&page->_count);
 }
 
+extern unsigned long highest_memmap_pfn;
+
 /*
  * in mm/vmscan.c:
  */
@@ -46,7 +48,6 @@ extern void putback_lru_page(struct page *page);
 /*
  * in mm/page_alloc.c
  */
-extern unsigned long highest_memmap_pfn;
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
 extern void prep_compound_page(struct page *page, unsigned long order);
 
@@ -250,13 +251,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 }
 #endif /* CONFIG_SPARSEMEM */
 
-#define GUP_FLAGS_WRITE 0x1
-#define GUP_FLAGS_FORCE 0x2
-#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
-#define GUP_FLAGS_IGNORE_SIGKILL 0x8
-
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		     unsigned long start, int len, int flags,
+		     unsigned long start, int len, unsigned int foll_flags,
 		     struct page **pages, struct vm_area_struct **vmas);
 
 #define ZONE_RECLAIM_NOSCAN	-2
diff --git a/mm/ksm.c b/mm/ksm.c
new file mode 100644
index 000000000000..37cc37325094
--- /dev/null
+++ b/mm/ksm.c
@@ -0,0 +1,1703 @@
1/*
2 * Memory merging support.
3 *
4 * This code enables dynamic sharing of identical pages found in different
5 * memory areas, even if they are not shared by fork()
6 *
7 * Copyright (C) 2008-2009 Red Hat, Inc.
8 * Authors:
9 * Izik Eidus
10 * Andrea Arcangeli
11 * Chris Wright
12 * Hugh Dickins
13 *
14 * This work is licensed under the terms of the GNU GPL, version 2.
15 */
16
17#include <linux/errno.h>
18#include <linux/mm.h>
19#include <linux/fs.h>
20#include <linux/mman.h>
21#include <linux/sched.h>
22#include <linux/rwsem.h>
23#include <linux/pagemap.h>
24#include <linux/rmap.h>
25#include <linux/spinlock.h>
26#include <linux/jhash.h>
27#include <linux/delay.h>
28#include <linux/kthread.h>
29#include <linux/wait.h>
30#include <linux/slab.h>
31#include <linux/rbtree.h>
32#include <linux/mmu_notifier.h>
33#include <linux/ksm.h>
34
35#include <asm/tlbflush.h>
36
37/*
38 * A few notes about the KSM scanning process,
39 * to make it easier to understand the data structures below:
40 *
41 * In order to reduce excessive scanning, KSM sorts the memory pages by their
42 * contents into a data structure that holds pointers to the pages' locations.
43 *
44 * Since the contents of the pages may change at any moment, KSM cannot just
45 * insert the pages into a normal sorted tree and expect it to find anything.
46 * Therefore KSM uses two data structures - the stable and the unstable tree.
47 *
48 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
49 * by their contents. Because each such page is write-protected, searching on
50 * this tree is fully assured to be working (except when pages are unmapped),
51 * and therefore this tree is called the stable tree.
52 *
53 * In addition to the stable tree, KSM uses a second data structure called the
54 * unstable tree: this tree holds pointers to pages which have been found to
55 * be "unchanged for a period of time". The unstable tree sorts these pages
56 * by their contents, but since they are not write-protected, KSM cannot rely
57 * upon the unstable tree to work correctly - the unstable tree is liable to
58 * be corrupted as its contents are modified, and so it is called unstable.
59 *
60 * KSM solves this problem by several techniques:
61 *
62 * 1) The unstable tree is flushed every time KSM completes scanning all
63 * memory areas, and then the tree is rebuilt again from the beginning.
64 * 2) KSM will only insert into the unstable tree, pages whose hash value
65 * has not changed since the previous scan of all memory areas.
66 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
67 * colors of the nodes and not on their contents, assuring that even when
68 * the tree gets "corrupted" it won't get out of balance, so scanning time
69 * remains the same (also, searching and inserting nodes in an rbtree uses
70 * the same algorithm, so we have no overhead when we flush and rebuild).
71 * 4) KSM never flushes the stable tree, which means that even if it were to
72 * take 10 attempts to find a page in the unstable tree, once it is found,
73 * it is secured in the stable tree. (When we scan a new page, we first
74 * compare it against the stable tree, and then against the unstable tree.)
75 */
76
77/**
78 * struct mm_slot - ksm information per mm that is being scanned
79 * @link: link to the mm_slots hash list
80 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
81 * @rmap_list: head for this mm_slot's list of rmap_items
82 * @mm: the mm that this information is valid for
83 */
84struct mm_slot {
85 struct hlist_node link;
86 struct list_head mm_list;
87 struct list_head rmap_list;
88 struct mm_struct *mm;
89};
90
91/**
92 * struct ksm_scan - cursor for scanning
93 * @mm_slot: the current mm_slot we are scanning
94 * @address: the next address inside that to be scanned
95 * @rmap_item: the current rmap that we are scanning inside the rmap_list
96 * @seqnr: count of completed full scans (needed when removing unstable node)
97 *
98 * There is only the one ksm_scan instance of this cursor structure.
99 */
100struct ksm_scan {
101 struct mm_slot *mm_slot;
102 unsigned long address;
103 struct rmap_item *rmap_item;
104 unsigned long seqnr;
105};
106
107/**
108 * struct rmap_item - reverse mapping item for virtual addresses
109 * @link: link into mm_slot's rmap_list (rmap_list is per mm)
110 * @mm: the memory structure this rmap_item is pointing into
111 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
112 * @oldchecksum: previous checksum of the page at that virtual address
113 * @node: rb_node of this rmap_item in either unstable or stable tree
114 * @next: next rmap_item hanging off the same node of the stable tree
115 * @prev: previous rmap_item hanging off the same node of the stable tree
116 */
117struct rmap_item {
118 struct list_head link;
119 struct mm_struct *mm;
120 unsigned long address; /* + low bits used for flags below */
121 union {
122 unsigned int oldchecksum; /* when unstable */
123 struct rmap_item *next; /* when stable */
124 };
125 union {
126 struct rb_node node; /* when tree node */
127 struct rmap_item *prev; /* in stable list */
128 };
129};
130
131#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
132#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */
133#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */
134
135/* The stable and unstable tree heads */
136static struct rb_root root_stable_tree = RB_ROOT;
137static struct rb_root root_unstable_tree = RB_ROOT;
138
139#define MM_SLOTS_HASH_HEADS 1024
140static struct hlist_head *mm_slots_hash;
141
142static struct mm_slot ksm_mm_head = {
143 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
144};
145static struct ksm_scan ksm_scan = {
146 .mm_slot = &ksm_mm_head,
147};
148
149static struct kmem_cache *rmap_item_cache;
150static struct kmem_cache *mm_slot_cache;
151
152/* The number of nodes in the stable tree */
153static unsigned long ksm_pages_shared;
154
155/* The number of page slots additionally sharing those nodes */
156static unsigned long ksm_pages_sharing;
157
158/* The number of nodes in the unstable tree */
159static unsigned long ksm_pages_unshared;
160
161/* The number of rmap_items in use: to calculate pages_volatile */
162static unsigned long ksm_rmap_items;
163
164/* Limit on the number of unswappable pages used */
165static unsigned long ksm_max_kernel_pages = 2000;
166
167/* Number of pages ksmd should scan in one batch */
168static unsigned int ksm_thread_pages_to_scan = 200;
169
170/* Milliseconds ksmd should sleep between batches */
171static unsigned int ksm_thread_sleep_millisecs = 20;
172
173#define KSM_RUN_STOP 0
174#define KSM_RUN_MERGE 1
175#define KSM_RUN_UNMERGE 2
176static unsigned int ksm_run = KSM_RUN_MERGE;
177
178static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
179static DEFINE_MUTEX(ksm_thread_mutex);
180static DEFINE_SPINLOCK(ksm_mmlist_lock);
181
182#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
183 sizeof(struct __struct), __alignof__(struct __struct),\
184 (__flags), NULL)
185
186static int __init ksm_slab_init(void)
187{
188 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
189 if (!rmap_item_cache)
190 goto out;
191
192 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
193 if (!mm_slot_cache)
194 goto out_free;
195
196 return 0;
197
198out_free:
199 kmem_cache_destroy(rmap_item_cache);
200out:
201 return -ENOMEM;
202}
203
204static void __init ksm_slab_free(void)
205{
206 kmem_cache_destroy(mm_slot_cache);
207 kmem_cache_destroy(rmap_item_cache);
208 mm_slot_cache = NULL;
209}
210
211static inline struct rmap_item *alloc_rmap_item(void)
212{
213 struct rmap_item *rmap_item;
214
215 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
216 if (rmap_item)
217 ksm_rmap_items++;
218 return rmap_item;
219}
220
221static inline void free_rmap_item(struct rmap_item *rmap_item)
222{
223 ksm_rmap_items--;
224 rmap_item->mm = NULL; /* debug safety */
225 kmem_cache_free(rmap_item_cache, rmap_item);
226}
227
228static inline struct mm_slot *alloc_mm_slot(void)
229{
230 if (!mm_slot_cache) /* initialization failed */
231 return NULL;
232 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
233}
234
235static inline void free_mm_slot(struct mm_slot *mm_slot)
236{
237 kmem_cache_free(mm_slot_cache, mm_slot);
238}
239
240static int __init mm_slots_hash_init(void)
241{
242 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
243 GFP_KERNEL);
244 if (!mm_slots_hash)
245 return -ENOMEM;
246 return 0;
247}
248
249static void __init mm_slots_hash_free(void)
250{
251 kfree(mm_slots_hash);
252}
253
254static struct mm_slot *get_mm_slot(struct mm_struct *mm)
255{
256 struct mm_slot *mm_slot;
257 struct hlist_head *bucket;
258 struct hlist_node *node;
259
260 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
261 % MM_SLOTS_HASH_HEADS];
262 hlist_for_each_entry(mm_slot, node, bucket, link) {
263 if (mm == mm_slot->mm)
264 return mm_slot;
265 }
266 return NULL;
267}
268
269static void insert_to_mm_slots_hash(struct mm_struct *mm,
270 struct mm_slot *mm_slot)
271{
272 struct hlist_head *bucket;
273
274 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
275 % MM_SLOTS_HASH_HEADS];
276 mm_slot->mm = mm;
277 INIT_LIST_HEAD(&mm_slot->rmap_list);
278 hlist_add_head(&mm_slot->link, bucket);
279}
280
281static inline int in_stable_tree(struct rmap_item *rmap_item)
282{
283 return rmap_item->address & STABLE_FLAG;
284}
285
286/*
287 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
288 * page tables after it has passed through ksm_exit() - which, if necessary,
289 * takes mmap_sem briefly to serialize against them. ksm_exit() does not set
290 * a special flag: they can just back out as soon as mm_users goes to zero.
291 * ksm_test_exit() is used throughout to make this test for exit: in some
292 * places for correctness, in some places just to avoid unnecessary work.
293 */
294static inline bool ksm_test_exit(struct mm_struct *mm)
295{
296 return atomic_read(&mm->mm_users) == 0;
297}
298
299/*
300 * We use break_ksm to break COW on a ksm page: it's a stripped down
301 *
302 * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
303 * put_page(page);
304 *
305 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
306 * in case the application has unmapped and remapped mm,addr meanwhile.
307 * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
308 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
309 */
310static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
311{
312 struct page *page;
313 int ret = 0;
314
315 do {
316 cond_resched();
317 page = follow_page(vma, addr, FOLL_GET);
318 if (!page)
319 break;
320 if (PageKsm(page))
321 ret = handle_mm_fault(vma->vm_mm, vma, addr,
322 FAULT_FLAG_WRITE);
323 else
324 ret = VM_FAULT_WRITE;
325 put_page(page);
326 } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
327 /*
328 * We must loop because handle_mm_fault() may back out if there's
329 * any difficulty e.g. if pte accessed bit gets updated concurrently.
330 *
331 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
332 * COW has been broken, even if the vma does not permit VM_WRITE;
333 * but note that a concurrent fault might break PageKsm for us.
334 *
335 * VM_FAULT_SIGBUS could occur if we race with truncation of the
336 * backing file, which also invalidates anonymous pages: that's
337 * okay, that truncation will have unmapped the PageKsm for us.
338 *
339 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
340 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
341 * current task has TIF_MEMDIE set, and will be OOM killed on return
342 * to user; and ksmd, having no mm, would never be chosen for that.
343 *
344 * But if the mm is in a limited mem_cgroup, then the fault may fail
345 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
346 * even ksmd can fail in this way - though it's usually breaking ksm
347 * just to undo a merge it made a moment before, so unlikely to oom.
348 *
349 * That's a pity: we might therefore have more kernel pages allocated
350 * than we're counting as nodes in the stable tree; but ksm_do_scan
351 * will retry to break_cow on each pass, so should recover the page
352 * in due course. The important thing is to not let VM_MERGEABLE
353 * be cleared while any such pages might remain in the area.
354 */
355 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
356}
357
358static void break_cow(struct mm_struct *mm, unsigned long addr)
359{
360 struct vm_area_struct *vma;
361
362 down_read(&mm->mmap_sem);
363 if (ksm_test_exit(mm))
364 goto out;
365 vma = find_vma(mm, addr);
366 if (!vma || vma->vm_start > addr)
367 goto out;
368 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
369 goto out;
370 break_ksm(vma, addr);
371out:
372 up_read(&mm->mmap_sem);
373}
374
375static struct page *get_mergeable_page(struct rmap_item *rmap_item)
376{
377 struct mm_struct *mm = rmap_item->mm;
378 unsigned long addr = rmap_item->address;
379 struct vm_area_struct *vma;
380 struct page *page;
381
382 down_read(&mm->mmap_sem);
383 if (ksm_test_exit(mm))
384 goto out;
385 vma = find_vma(mm, addr);
386 if (!vma || vma->vm_start > addr)
387 goto out;
388 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
389 goto out;
390
391 page = follow_page(vma, addr, FOLL_GET);
392 if (!page)
393 goto out;
394 if (PageAnon(page)) {
395 flush_anon_page(vma, page, addr);
396 flush_dcache_page(page);
397 } else {
398 put_page(page);
399out: page = NULL;
400 }
401 up_read(&mm->mmap_sem);
402 return page;
403}
404
405/*
406 * get_ksm_page: checks if the page at the virtual address in rmap_item
407 * is still PageKsm, in which case we can trust the content of the page,
408 * and it returns the gotten page; but NULL if the page has been zapped.
409 */
410static struct page *get_ksm_page(struct rmap_item *rmap_item)
411{
412 struct page *page;
413
414 page = get_mergeable_page(rmap_item);
415 if (page && !PageKsm(page)) {
416 put_page(page);
417 page = NULL;
418 }
419 return page;
420}
421
422/*
423 * Removing rmap_item from stable or unstable tree.
424 * This function will clean the information from the stable/unstable tree.
425 */
426static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
427{
428 if (in_stable_tree(rmap_item)) {
429 struct rmap_item *next_item = rmap_item->next;
430
431 if (rmap_item->address & NODE_FLAG) {
432 if (next_item) {
433 rb_replace_node(&rmap_item->node,
434 &next_item->node,
435 &root_stable_tree);
436 next_item->address |= NODE_FLAG;
437 ksm_pages_sharing--;
438 } else {
439 rb_erase(&rmap_item->node, &root_stable_tree);
440 ksm_pages_shared--;
441 }
442 } else {
443 struct rmap_item *prev_item = rmap_item->prev;
444
445 BUG_ON(prev_item->next != rmap_item);
446 prev_item->next = next_item;
447 if (next_item) {
448 BUG_ON(next_item->prev != rmap_item);
449 next_item->prev = rmap_item->prev;
450 }
451 ksm_pages_sharing--;
452 }
453
454 rmap_item->next = NULL;
455
456 } else if (rmap_item->address & NODE_FLAG) {
457 unsigned char age;
458 /*
459 * Usually ksmd can and must skip the rb_erase, because
460 * root_unstable_tree was already reset to RB_ROOT.
461 * But be careful when an mm is exiting: do the rb_erase
462 * if this rmap_item was inserted by this scan, rather
463 * than left over from before.
464 */
465 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
466 BUG_ON(age > 1);
467 if (!age)
468 rb_erase(&rmap_item->node, &root_unstable_tree);
469 ksm_pages_unshared--;
470 }
471
472 rmap_item->address &= PAGE_MASK;
473
474 cond_resched(); /* we're called from many long loops */
475}
476
477static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
478 struct list_head *cur)
479{
480 struct rmap_item *rmap_item;
481
482 while (cur != &mm_slot->rmap_list) {
483 rmap_item = list_entry(cur, struct rmap_item, link);
484 cur = cur->next;
485 remove_rmap_item_from_tree(rmap_item);
486 list_del(&rmap_item->link);
487 free_rmap_item(rmap_item);
488 }
489}
490
491/*
492 * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
493 * than check every pte of a given vma, the locking doesn't quite work for
494 * that - an rmap_item is assigned to the stable tree after inserting ksm
495 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
496 * rmap_items from parent to child at fork time (so as not to waste time
497 * if exit comes before the next scan reaches it).
498 *
499 * Similarly, although we'd like to remove rmap_items (so updating counts
500 * and freeing memory) when unmerging an area, it's easier to leave that
501 * to the next pass of ksmd - consider, for example, how ksmd might be
502 * in cmp_and_merge_page on one of the rmap_items we would be removing.
503 */
504static int unmerge_ksm_pages(struct vm_area_struct *vma,
505 unsigned long start, unsigned long end)
506{
507 unsigned long addr;
508 int err = 0;
509
510 for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
511 if (ksm_test_exit(vma->vm_mm))
512 break;
513 if (signal_pending(current))
514 err = -ERESTARTSYS;
515 else
516 err = break_ksm(vma, addr);
517 }
518 return err;
519}
520
521#ifdef CONFIG_SYSFS
522/*
523 * Only called through the sysfs control interface:
524 */
525static int unmerge_and_remove_all_rmap_items(void)
526{
527 struct mm_slot *mm_slot;
528 struct mm_struct *mm;
529 struct vm_area_struct *vma;
530 int err = 0;
531
532 spin_lock(&ksm_mmlist_lock);
533 ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
534 struct mm_slot, mm_list);
535 spin_unlock(&ksm_mmlist_lock);
536
537 for (mm_slot = ksm_scan.mm_slot;
538 mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
539 mm = mm_slot->mm;
540 down_read(&mm->mmap_sem);
541 for (vma = mm->mmap; vma; vma = vma->vm_next) {
542 if (ksm_test_exit(mm))
543 break;
544 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
545 continue;
546 err = unmerge_ksm_pages(vma,
547 vma->vm_start, vma->vm_end);
548 if (err)
549 goto error;
550 }
551
552 remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
553
554 spin_lock(&ksm_mmlist_lock);
555 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
556 struct mm_slot, mm_list);
557 if (ksm_test_exit(mm)) {
558 hlist_del(&mm_slot->link);
559 list_del(&mm_slot->mm_list);
560 spin_unlock(&ksm_mmlist_lock);
561
562 free_mm_slot(mm_slot);
563 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
564 up_read(&mm->mmap_sem);
565 mmdrop(mm);
566 } else {
567 spin_unlock(&ksm_mmlist_lock);
568 up_read(&mm->mmap_sem);
569 }
570 }
571
572 ksm_scan.seqnr = 0;
573 return 0;
574
575error:
576 up_read(&mm->mmap_sem);
577 spin_lock(&ksm_mmlist_lock);
578 ksm_scan.mm_slot = &ksm_mm_head;
579 spin_unlock(&ksm_mmlist_lock);
580 return err;
581}
582#endif /* CONFIG_SYSFS */
583
584static u32 calc_checksum(struct page *page)
585{
586 u32 checksum;
587 void *addr = kmap_atomic(page, KM_USER0);
588 checksum = jhash2(addr, PAGE_SIZE / 4, 17);
589 kunmap_atomic(addr, KM_USER0);
590 return checksum;
591}
592
593static int memcmp_pages(struct page *page1, struct page *page2)
594{
595 char *addr1, *addr2;
596 int ret;
597
598 addr1 = kmap_atomic(page1, KM_USER0);
599 addr2 = kmap_atomic(page2, KM_USER1);
600 ret = memcmp(addr1, addr2, PAGE_SIZE);
601 kunmap_atomic(addr2, KM_USER1);
602 kunmap_atomic(addr1, KM_USER0);
603 return ret;
604}
605
606static inline int pages_identical(struct page *page1, struct page *page2)
607{
608 return !memcmp_pages(page1, page2);
609}
610
611static int write_protect_page(struct vm_area_struct *vma, struct page *page,
612 pte_t *orig_pte)
613{
614 struct mm_struct *mm = vma->vm_mm;
615 unsigned long addr;
616 pte_t *ptep;
617 spinlock_t *ptl;
618 int swapped;
619 int err = -EFAULT;
620
621 addr = page_address_in_vma(page, vma);
622 if (addr == -EFAULT)
623 goto out;
624
625 ptep = page_check_address(page, mm, addr, &ptl, 0);
626 if (!ptep)
627 goto out;
628
629 if (pte_write(*ptep)) {
630 pte_t entry;
631
632 swapped = PageSwapCache(page);
633 flush_cache_page(vma, addr, page_to_pfn(page));
634 /*
 635 * Ok, this is tricky: when get_user_pages_fast() runs it doesn't
 636 * take any lock, therefore the check that we are going to make
 637 * with the page count against the mapcount is racy and
 638 * O_DIRECT can happen right after the check.
 639 * So we clear the pte and flush the tlb before the check;
 640 * this assures us that no O_DIRECT can happen after the check
 641 * or in the middle of the check.
642 */
643 entry = ptep_clear_flush(vma, addr, ptep);
644 /*
645 * Check that no O_DIRECT or similar I/O is in progress on the
646 * page
647 */
648 if ((page_mapcount(page) + 2 + swapped) != page_count(page)) {
649 set_pte_at_notify(mm, addr, ptep, entry);
650 goto out_unlock;
651 }
652 entry = pte_wrprotect(entry);
653 set_pte_at_notify(mm, addr, ptep, entry);
654 }
655 *orig_pte = *ptep;
656 err = 0;
657
658out_unlock:
659 pte_unmap_unlock(ptep, ptl);
660out:
661 return err;
662}
663
664/**
665 * replace_page - replace page in vma by new ksm page
666 * @vma: vma that holds the pte pointing to oldpage
667 * @oldpage: the page we are replacing by newpage
668 * @newpage: the ksm page we replace oldpage by
669 * @orig_pte: the original value of the pte
670 *
671 * Returns 0 on success, -EFAULT on failure.
672 */
673static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
674 struct page *newpage, pte_t orig_pte)
675{
676 struct mm_struct *mm = vma->vm_mm;
677 pgd_t *pgd;
678 pud_t *pud;
679 pmd_t *pmd;
680 pte_t *ptep;
681 spinlock_t *ptl;
682 unsigned long addr;
683 pgprot_t prot;
684 int err = -EFAULT;
685
686 prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);
687
688 addr = page_address_in_vma(oldpage, vma);
689 if (addr == -EFAULT)
690 goto out;
691
692 pgd = pgd_offset(mm, addr);
693 if (!pgd_present(*pgd))
694 goto out;
695
696 pud = pud_offset(pgd, addr);
697 if (!pud_present(*pud))
698 goto out;
699
700 pmd = pmd_offset(pud, addr);
701 if (!pmd_present(*pmd))
702 goto out;
703
704 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
705 if (!pte_same(*ptep, orig_pte)) {
706 pte_unmap_unlock(ptep, ptl);
707 goto out;
708 }
709
710 get_page(newpage);
711 page_add_ksm_rmap(newpage);
712
713 flush_cache_page(vma, addr, pte_pfn(*ptep));
714 ptep_clear_flush(vma, addr, ptep);
715 set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));
716
717 page_remove_rmap(oldpage);
718 put_page(oldpage);
719
720 pte_unmap_unlock(ptep, ptl);
721 err = 0;
722out:
723 return err;
724}
725
726/*
727 * try_to_merge_one_page - take two pages and merge them into one
728 * @vma: the vma that hold the pte pointing into oldpage
729 * @oldpage: the page that we want to replace with newpage
730 * @newpage: the page that we want to map instead of oldpage
731 *
732 * Note:
733 * oldpage should be a PageAnon page, while newpage should be a PageKsm page,
734 * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
735 *
736 * This function returns 0 if the pages were merged, -EFAULT otherwise.
737 */
738static int try_to_merge_one_page(struct vm_area_struct *vma,
739 struct page *oldpage,
740 struct page *newpage)
741{
742 pte_t orig_pte = __pte(0);
743 int err = -EFAULT;
744
745 if (!(vma->vm_flags & VM_MERGEABLE))
746 goto out;
747
748 if (!PageAnon(oldpage))
749 goto out;
750
751 get_page(newpage);
752 get_page(oldpage);
753
754 /*
755 * We need the page lock to read a stable PageSwapCache in
756 * write_protect_page(). We use trylock_page() instead of
757 * lock_page() because we don't want to wait here - we
758 * prefer to continue scanning and merging different pages,
759 * then come back to this page when it is unlocked.
760 */
761 if (!trylock_page(oldpage))
762 goto out_putpage;
763 /*
764 * If this anonymous page is mapped only here, its pte may need
765 * to be write-protected. If it's mapped elsewhere, all of its
766 * ptes are necessarily already write-protected. But in either
767 * case, we need to lock and check page_count is not raised.
768 */
769 if (write_protect_page(vma, oldpage, &orig_pte)) {
770 unlock_page(oldpage);
771 goto out_putpage;
772 }
773 unlock_page(oldpage);
774
775 if (pages_identical(oldpage, newpage))
776 err = replace_page(vma, oldpage, newpage, orig_pte);
777
778out_putpage:
779 put_page(oldpage);
780 put_page(newpage);
781out:
782 return err;
783}
784
785/*
786 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
787 * but no new kernel page is allocated: kpage must already be a ksm page.
788 */
789static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
790 unsigned long addr1,
791 struct page *page1,
792 struct page *kpage)
793{
794 struct vm_area_struct *vma;
795 int err = -EFAULT;
796
797 down_read(&mm1->mmap_sem);
798 if (ksm_test_exit(mm1))
799 goto out;
800
801 vma = find_vma(mm1, addr1);
802 if (!vma || vma->vm_start > addr1)
803 goto out;
804
805 err = try_to_merge_one_page(vma, page1, kpage);
806out:
807 up_read(&mm1->mmap_sem);
808 return err;
809}
810
811/*
812 * try_to_merge_two_pages - take two identical pages and prepare them
813 * to be merged into one page.
814 *
815 * This function returns 0 if we successfully mapped two identical pages
816 * into one page, -EFAULT otherwise.
817 *
818 * Note that this function allocates a new kernel page: if one of the pages
819 * is already a ksm page, try_to_merge_with_ksm_page should be used.
820 */
821static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
822 struct page *page1, struct mm_struct *mm2,
823 unsigned long addr2, struct page *page2)
824{
825 struct vm_area_struct *vma;
826 struct page *kpage;
827 int err = -EFAULT;
828
829 /*
830 * The number of nodes in the stable tree
831 * is the number of kernel pages that we hold.
832 */
833 if (ksm_max_kernel_pages &&
834 ksm_max_kernel_pages <= ksm_pages_shared)
835 return err;
836
837 kpage = alloc_page(GFP_HIGHUSER);
838 if (!kpage)
839 return err;
840
841 down_read(&mm1->mmap_sem);
842 if (ksm_test_exit(mm1)) {
843 up_read(&mm1->mmap_sem);
844 goto out;
845 }
846 vma = find_vma(mm1, addr1);
847 if (!vma || vma->vm_start > addr1) {
848 up_read(&mm1->mmap_sem);
849 goto out;
850 }
851
852 copy_user_highpage(kpage, page1, addr1, vma);
853 err = try_to_merge_one_page(vma, page1, kpage);
854 up_read(&mm1->mmap_sem);
855
856 if (!err) {
857 err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage);
858 /*
859 * If that fails, we have a ksm page with only one pte
860 * pointing to it: so break it.
861 */
862 if (err)
863 break_cow(mm1, addr1);
864 }
865out:
866 put_page(kpage);
867 return err;
868}
869
870/*
871 * stable_tree_search - search page inside the stable tree
872 * @page: the page that we are searching identical pages to.
873 * @page2: pointer into identical page that we are holding inside the stable
874 * tree that we have found.
875 * @rmap_item: the reverse mapping item
876 *
877 * This function checks if there is a page inside the stable tree
878 * with identical content to the page that we are scanning right now.
879 *
880 * This function return rmap_item pointer to the identical item if found,
881 * NULL otherwise.
882 */
883static struct rmap_item *stable_tree_search(struct page *page,
884 struct page **page2,
885 struct rmap_item *rmap_item)
886{
887 struct rb_node *node = root_stable_tree.rb_node;
888
889 while (node) {
890 struct rmap_item *tree_rmap_item, *next_rmap_item;
891 int ret;
892
893 tree_rmap_item = rb_entry(node, struct rmap_item, node);
894 while (tree_rmap_item) {
895 BUG_ON(!in_stable_tree(tree_rmap_item));
896 cond_resched();
897 page2[0] = get_ksm_page(tree_rmap_item);
898 if (page2[0])
899 break;
900 next_rmap_item = tree_rmap_item->next;
901 remove_rmap_item_from_tree(tree_rmap_item);
902 tree_rmap_item = next_rmap_item;
903 }
904 if (!tree_rmap_item)
905 return NULL;
906
907 ret = memcmp_pages(page, page2[0]);
908
909 if (ret < 0) {
910 put_page(page2[0]);
911 node = node->rb_left;
912 } else if (ret > 0) {
913 put_page(page2[0]);
914 node = node->rb_right;
915 } else {
916 return tree_rmap_item;
917 }
918 }
919
920 return NULL;
921}
922
923/*
924 * stable_tree_insert - insert rmap_item pointing to new ksm page
925 * into the stable tree.
926 *
927 * @page: the page that we are searching identical page to inside the stable
928 * tree.
929 * @rmap_item: pointer to the reverse mapping item.
930 *
931 * This function returns rmap_item if success, NULL otherwise.
932 */
933static struct rmap_item *stable_tree_insert(struct page *page,
934 struct rmap_item *rmap_item)
935{
936 struct rb_node **new = &root_stable_tree.rb_node;
937 struct rb_node *parent = NULL;
938
939 while (*new) {
940 struct rmap_item *tree_rmap_item, *next_rmap_item;
941 struct page *tree_page;
942 int ret;
943
944 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
945 while (tree_rmap_item) {
946 BUG_ON(!in_stable_tree(tree_rmap_item));
947 cond_resched();
948 tree_page = get_ksm_page(tree_rmap_item);
949 if (tree_page)
950 break;
951 next_rmap_item = tree_rmap_item->next;
952 remove_rmap_item_from_tree(tree_rmap_item);
953 tree_rmap_item = next_rmap_item;
954 }
955 if (!tree_rmap_item)
956 return NULL;
957
958 ret = memcmp_pages(page, tree_page);
959 put_page(tree_page);
960
961 parent = *new;
962 if (ret < 0)
963 new = &parent->rb_left;
964 else if (ret > 0)
965 new = &parent->rb_right;
966 else {
967 /*
968 * It is not a bug that stable_tree_search() didn't
969 * find this node: because at that time our page was
970 * not yet write-protected, so may have changed since.
971 */
972 return NULL;
973 }
974 }
975
976 rmap_item->address |= NODE_FLAG | STABLE_FLAG;
977 rmap_item->next = NULL;
978 rb_link_node(&rmap_item->node, parent, new);
979 rb_insert_color(&rmap_item->node, &root_stable_tree);
980
981 ksm_pages_shared++;
982 return rmap_item;
983}
984
985/*
986 * unstable_tree_search_insert - search and insert items into the unstable tree.
987 *
988 * @page: the page that we are going to search for identical page or to insert
989 * into the unstable tree
990 * @page2: pointer into identical page that was found inside the unstable tree
991 * @rmap_item: the reverse mapping item of page
992 *
993 * This function searches for a page in the unstable tree identical to the
994 * page currently being scanned; and if no identical page is found in the
995 * tree, we insert rmap_item as a new object into the unstable tree.
996 *
997 * This function returns pointer to rmap_item found to be identical
998 * to the currently scanned page, NULL otherwise.
999 *
1000 * This function does both searching and inserting, because they share
1001 * the same walking algorithm in an rbtree.
1002 */
1003static struct rmap_item *unstable_tree_search_insert(struct page *page,
1004 struct page **page2,
1005 struct rmap_item *rmap_item)
1006{
1007 struct rb_node **new = &root_unstable_tree.rb_node;
1008 struct rb_node *parent = NULL;
1009
1010 while (*new) {
1011 struct rmap_item *tree_rmap_item;
1012 int ret;
1013
1014 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1015 page2[0] = get_mergeable_page(tree_rmap_item);
1016 if (!page2[0])
1017 return NULL;
1018
1019 /*
1020 * Don't substitute an unswappable ksm page
1021 * just for one good swappable forked page.
1022 */
1023 if (page == page2[0]) {
1024 put_page(page2[0]);
1025 return NULL;
1026 }
1027
1028 ret = memcmp_pages(page, page2[0]);
1029
1030 parent = *new;
1031 if (ret < 0) {
1032 put_page(page2[0]);
1033 new = &parent->rb_left;
1034 } else if (ret > 0) {
1035 put_page(page2[0]);
1036 new = &parent->rb_right;
1037 } else {
1038 return tree_rmap_item;
1039 }
1040 }
1041
1042 rmap_item->address |= NODE_FLAG;
1043 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1044 rb_link_node(&rmap_item->node, parent, new);
1045 rb_insert_color(&rmap_item->node, &root_unstable_tree);
1046
1047 ksm_pages_unshared++;
1048 return NULL;
1049}
1050
1051/*
1052 * stable_tree_append - add another rmap_item to the linked list of
1053 * rmap_items hanging off a given node of the stable tree, all sharing
1054 * the same ksm page.
1055 */
1056static void stable_tree_append(struct rmap_item *rmap_item,
1057 struct rmap_item *tree_rmap_item)
1058{
1059 rmap_item->next = tree_rmap_item->next;
1060 rmap_item->prev = tree_rmap_item;
1061
1062 if (tree_rmap_item->next)
1063 tree_rmap_item->next->prev = rmap_item;
1064
1065 tree_rmap_item->next = rmap_item;
1066 rmap_item->address |= STABLE_FLAG;
1067
1068 ksm_pages_sharing++;
1069}
1070
1071/*
1072 * cmp_and_merge_page - first see if page can be merged into the stable tree;
1073 * if not, compare checksum to previous and if it's the same, see if page can
1074 * be inserted into the unstable tree, or merged with a page already there and
1075 * both transferred to the stable tree.
1076 *
1077 * @page: the page that we are searching identical page to.
1078 * @rmap_item: the reverse mapping into the virtual address of this page
1079 */
1080static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1081{
1082 struct page *page2[1];
1083 struct rmap_item *tree_rmap_item;
1084 unsigned int checksum;
1085 int err;
1086
1087 if (in_stable_tree(rmap_item))
1088 remove_rmap_item_from_tree(rmap_item);
1089
1090 /* We first start with searching the page inside the stable tree */
1091 tree_rmap_item = stable_tree_search(page, page2, rmap_item);
1092 if (tree_rmap_item) {
1093 if (page == page2[0]) /* forked */
1094 err = 0;
1095 else
1096 err = try_to_merge_with_ksm_page(rmap_item->mm,
1097 rmap_item->address,
1098 page, page2[0]);
1099 put_page(page2[0]);
1100
1101 if (!err) {
1102 /*
1103 * The page was successfully merged:
1104 * add its rmap_item to the stable tree.
1105 */
1106 stable_tree_append(rmap_item, tree_rmap_item);
1107 }
1108 return;
1109 }
1110
1111 /*
1112 * A ksm page might have got here by fork, but its other
1113 * references have already been removed from the stable tree.
1114 * Or it might be left over from a break_ksm which failed
1115 * when the mem_cgroup had reached its limit: try again now.
1116 */
1117 if (PageKsm(page))
1118 break_cow(rmap_item->mm, rmap_item->address);
1119
1120 /*
1121 * If the hash value of the page has changed since the last time we
1122 * calculated it, the page is being modified frequently, so we don't
1123 * want to insert it into the unstable tree, and we don't want to
1124 * waste our time searching for something identical to it there.
1125 */
1126 checksum = calc_checksum(page);
1127 if (rmap_item->oldchecksum != checksum) {
1128 rmap_item->oldchecksum = checksum;
1129 return;
1130 }
1131
1132 tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
1133 if (tree_rmap_item) {
1134 err = try_to_merge_two_pages(rmap_item->mm,
1135 rmap_item->address, page,
1136 tree_rmap_item->mm,
1137 tree_rmap_item->address, page2[0]);
1138 /*
1139 * As soon as we merge this page, we want to remove the
1140 * rmap_item of the page we have merged with from the unstable
1141 * tree, and insert it instead as new node in the stable tree.
1142 */
1143 if (!err) {
1144 rb_erase(&tree_rmap_item->node, &root_unstable_tree);
1145 tree_rmap_item->address &= ~NODE_FLAG;
1146 ksm_pages_unshared--;
1147
1148 /*
1149 * If we fail to insert the page into the stable tree,
1150 * we will have 2 virtual addresses that are pointing
1151 * to a ksm page left outside the stable tree,
1152 * in which case we need to break_cow on both.
1153 */
1154 if (stable_tree_insert(page2[0], tree_rmap_item))
1155 stable_tree_append(rmap_item, tree_rmap_item);
1156 else {
1157 break_cow(tree_rmap_item->mm,
1158 tree_rmap_item->address);
1159 break_cow(rmap_item->mm, rmap_item->address);
1160 }
1161 }
1162
1163 put_page(page2[0]);
1164 }
1165}
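
The checksum test in cmp_and_merge_page() is only a cheap volatility filter: a page becomes a candidate for the unstable tree only if its content hashed to the same value on two consecutive scans. A small userspace illustration of the idea, with a trivial rolling sum standing in for whatever hash the kernel actually uses:

#include <stddef.h>

static unsigned int calc_checksum(const unsigned char *page, size_t len)
{
	unsigned int sum = 0;		/* stand-in for the kernel's real hash */

	while (len--)
		sum = sum * 31 + *page++;
	return sum;
}

/* Returns 1 if the page looked stable across two scans, 0 if volatile. */
static int page_is_stable(const unsigned char *page, size_t len,
			  unsigned int *oldchecksum)
{
	unsigned int checksum = calc_checksum(page, len);

	if (*oldchecksum != checksum) {
		*oldchecksum = checksum;	/* remember for the next scan */
		return 0;
	}
	return 1;
}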
1166
1167static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
1168 struct list_head *cur,
1169 unsigned long addr)
1170{
1171 struct rmap_item *rmap_item;
1172
1173 while (cur != &mm_slot->rmap_list) {
1174 rmap_item = list_entry(cur, struct rmap_item, link);
1175 if ((rmap_item->address & PAGE_MASK) == addr) {
1176 if (!in_stable_tree(rmap_item))
1177 remove_rmap_item_from_tree(rmap_item);
1178 return rmap_item;
1179 }
1180 if (rmap_item->address > addr)
1181 break;
1182 cur = cur->next;
1183 remove_rmap_item_from_tree(rmap_item);
1184 list_del(&rmap_item->link);
1185 free_rmap_item(rmap_item);
1186 }
1187
1188 rmap_item = alloc_rmap_item();
1189 if (rmap_item) {
1190 /* It has already been zeroed */
1191 rmap_item->mm = mm_slot->mm;
1192 rmap_item->address = addr;
1193 list_add_tail(&rmap_item->link, cur);
1194 }
1195 return rmap_item;
1196}
1197
1198static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1199{
1200 struct mm_struct *mm;
1201 struct mm_slot *slot;
1202 struct vm_area_struct *vma;
1203 struct rmap_item *rmap_item;
1204
1205 if (list_empty(&ksm_mm_head.mm_list))
1206 return NULL;
1207
1208 slot = ksm_scan.mm_slot;
1209 if (slot == &ksm_mm_head) {
1210 root_unstable_tree = RB_ROOT;
1211
1212 spin_lock(&ksm_mmlist_lock);
1213 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
1214 ksm_scan.mm_slot = slot;
1215 spin_unlock(&ksm_mmlist_lock);
1216next_mm:
1217 ksm_scan.address = 0;
1218 ksm_scan.rmap_item = list_entry(&slot->rmap_list,
1219 struct rmap_item, link);
1220 }
1221
1222 mm = slot->mm;
1223 down_read(&mm->mmap_sem);
1224 if (ksm_test_exit(mm))
1225 vma = NULL;
1226 else
1227 vma = find_vma(mm, ksm_scan.address);
1228
1229 for (; vma; vma = vma->vm_next) {
1230 if (!(vma->vm_flags & VM_MERGEABLE))
1231 continue;
1232 if (ksm_scan.address < vma->vm_start)
1233 ksm_scan.address = vma->vm_start;
1234 if (!vma->anon_vma)
1235 ksm_scan.address = vma->vm_end;
1236
1237 while (ksm_scan.address < vma->vm_end) {
1238 if (ksm_test_exit(mm))
1239 break;
1240 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
1241 if (*page && PageAnon(*page)) {
1242 flush_anon_page(vma, *page, ksm_scan.address);
1243 flush_dcache_page(*page);
1244 rmap_item = get_next_rmap_item(slot,
1245 ksm_scan.rmap_item->link.next,
1246 ksm_scan.address);
1247 if (rmap_item) {
1248 ksm_scan.rmap_item = rmap_item;
1249 ksm_scan.address += PAGE_SIZE;
1250 } else
1251 put_page(*page);
1252 up_read(&mm->mmap_sem);
1253 return rmap_item;
1254 }
1255 if (*page)
1256 put_page(*page);
1257 ksm_scan.address += PAGE_SIZE;
1258 cond_resched();
1259 }
1260 }
1261
1262 if (ksm_test_exit(mm)) {
1263 ksm_scan.address = 0;
1264 ksm_scan.rmap_item = list_entry(&slot->rmap_list,
1265 struct rmap_item, link);
1266 }
1267 /*
1268 * Nuke all the rmap_items that are above this current rmap:
1269 * because there were no VM_MERGEABLE vmas with such addresses.
1270 */
1271 remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);
1272
1273 spin_lock(&ksm_mmlist_lock);
1274 ksm_scan.mm_slot = list_entry(slot->mm_list.next,
1275 struct mm_slot, mm_list);
1276 if (ksm_scan.address == 0) {
1277 /*
1278 * We've completed a full scan of all vmas, holding mmap_sem
1279 * throughout, and found no VM_MERGEABLE: so do the same as
1280 * __ksm_exit does to remove this mm from all our lists now.
1281 * This applies either when cleaning up after __ksm_exit
1282 * (but beware: we can reach here even before __ksm_exit),
1283 * or when all VM_MERGEABLE areas have been unmapped (and
1284 * mmap_sem then protects against race with MADV_MERGEABLE).
1285 */
1286 hlist_del(&slot->link);
1287 list_del(&slot->mm_list);
1288 spin_unlock(&ksm_mmlist_lock);
1289
1290 free_mm_slot(slot);
1291 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1292 up_read(&mm->mmap_sem);
1293 mmdrop(mm);
1294 } else {
1295 spin_unlock(&ksm_mmlist_lock);
1296 up_read(&mm->mmap_sem);
1297 }
1298
1299 /* Repeat until we've completed scanning the whole list */
1300 slot = ksm_scan.mm_slot;
1301 if (slot != &ksm_mm_head)
1302 goto next_mm;
1303
1304 ksm_scan.seqnr++;
1305 return NULL;
1306}
1307
1308/**
1309 * ksm_do_scan - the ksm scanner main worker function.
1310 * @scan_npages: number of pages we want to scan before we return.
1311 */
1312static void ksm_do_scan(unsigned int scan_npages)
1313{
1314 struct rmap_item *rmap_item;
1315 struct page *page;
1316
1317 while (scan_npages--) {
1318 cond_resched();
1319 rmap_item = scan_get_next_rmap_item(&page);
1320 if (!rmap_item)
1321 return;
1322 if (!PageKsm(page) || !in_stable_tree(rmap_item))
1323 cmp_and_merge_page(page, rmap_item);
1324 else if (page_mapcount(page) == 1) {
1325 /*
1326 * Replace now-unshared ksm page by ordinary page.
1327 */
1328 break_cow(rmap_item->mm, rmap_item->address);
1329 remove_rmap_item_from_tree(rmap_item);
1330 rmap_item->oldchecksum = calc_checksum(page);
1331 }
1332 put_page(page);
1333 }
1334}
1335
1336static int ksmd_should_run(void)
1337{
1338 return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
1339}
1340
1341static int ksm_scan_thread(void *nothing)
1342{
1343 set_user_nice(current, 5);
1344
1345 while (!kthread_should_stop()) {
1346 mutex_lock(&ksm_thread_mutex);
1347 if (ksmd_should_run())
1348 ksm_do_scan(ksm_thread_pages_to_scan);
1349 mutex_unlock(&ksm_thread_mutex);
1350
1351 if (ksmd_should_run()) {
1352 schedule_timeout_interruptible(
1353 msecs_to_jiffies(ksm_thread_sleep_millisecs));
1354 } else {
1355 wait_event_interruptible(ksm_thread_wait,
1356 ksmd_should_run() || kthread_should_stop());
1357 }
1358 }
1359 return 0;
1360}
1361
1362int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1363 unsigned long end, int advice, unsigned long *vm_flags)
1364{
1365 struct mm_struct *mm = vma->vm_mm;
1366 int err;
1367
1368 switch (advice) {
1369 case MADV_MERGEABLE:
1370 /*
1371 * Be somewhat over-protective for now!
1372 */
1373 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
1374 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1375 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1376 VM_MIXEDMAP | VM_SAO))
1377 return 0; /* just ignore the advice */
1378
1379 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
1380 err = __ksm_enter(mm);
1381 if (err)
1382 return err;
1383 }
1384
1385 *vm_flags |= VM_MERGEABLE;
1386 break;
1387
1388 case MADV_UNMERGEABLE:
1389 if (!(*vm_flags & VM_MERGEABLE))
1390 return 0; /* just ignore the advice */
1391
1392 if (vma->anon_vma) {
1393 err = unmerge_ksm_pages(vma, start, end);
1394 if (err)
1395 return err;
1396 }
1397
1398 *vm_flags &= ~VM_MERGEABLE;
1399 break;
1400 }
1401
1402 return 0;
1403}
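
From userspace, ksm_madvise() is reached through madvise(2). A minimal sketch of the intended usage, assuming MADV_MERGEABLE is 12 (its asm-generic value) in case the installed libc headers predate this interface:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE 12		/* value from asm-generic/mman-common.h */
#endif

#define LEN (16 * 4096)

int main(void)
{
	char *a = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *b = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (a == MAP_FAILED || b == MAP_FAILED)
		return 1;

	memset(a, 0x5a, LEN);		/* identical, non-zero content */
	memset(b, 0x5a, LEN);

	/* Ask KSM to consider these areas; sets VM_MERGEABLE on the vmas. */
	if (madvise(a, LEN, MADV_MERGEABLE) || madvise(b, LEN, MADV_MERGEABLE))
		perror("madvise");	/* e.g. EINVAL without CONFIG_KSM */

	pause();			/* give ksmd time to scan and merge */
	return 0;
}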
1404
1405int __ksm_enter(struct mm_struct *mm)
1406{
1407 struct mm_slot *mm_slot;
1408 int needs_wakeup;
1409
1410 mm_slot = alloc_mm_slot();
1411 if (!mm_slot)
1412 return -ENOMEM;
1413
1414 /* Check ksm_run too? Would need tighter locking */
1415 needs_wakeup = list_empty(&ksm_mm_head.mm_list);
1416
1417 spin_lock(&ksm_mmlist_lock);
1418 insert_to_mm_slots_hash(mm, mm_slot);
1419 /*
1420 * Insert just behind the scanning cursor, to let the area settle
1421 * down a little; when fork is followed by immediate exec, we don't
1422 * want ksmd to waste time setting up and tearing down an rmap_list.
1423 */
1424 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
1425 spin_unlock(&ksm_mmlist_lock);
1426
1427 set_bit(MMF_VM_MERGEABLE, &mm->flags);
1428 atomic_inc(&mm->mm_count);
1429
1430 if (needs_wakeup)
1431 wake_up_interruptible(&ksm_thread_wait);
1432
1433 return 0;
1434}
1435
1436void __ksm_exit(struct mm_struct *mm)
1437{
1438 struct mm_slot *mm_slot;
1439 int easy_to_free = 0;
1440
1441 /*
1442 * This process is exiting: if it's straightforward (as is the
1443 * case when ksmd was never running), free mm_slot immediately.
1444 * But if it's at the cursor or has rmap_items linked to it, use
1445 * mmap_sem to synchronize with any break_cows before pagetables
1446 * are freed, and leave the mm_slot on the list for ksmd to free.
1447 * Beware: ksm may already have noticed it exiting and freed the slot.
1448 */
1449
1450 spin_lock(&ksm_mmlist_lock);
1451 mm_slot = get_mm_slot(mm);
1452 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
1453 if (list_empty(&mm_slot->rmap_list)) {
1454 hlist_del(&mm_slot->link);
1455 list_del(&mm_slot->mm_list);
1456 easy_to_free = 1;
1457 } else {
1458 list_move(&mm_slot->mm_list,
1459 &ksm_scan.mm_slot->mm_list);
1460 }
1461 }
1462 spin_unlock(&ksm_mmlist_lock);
1463
1464 if (easy_to_free) {
1465 free_mm_slot(mm_slot);
1466 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1467 mmdrop(mm);
1468 } else if (mm_slot) {
1469 down_write(&mm->mmap_sem);
1470 up_write(&mm->mmap_sem);
1471 }
1472}
1473
1474#ifdef CONFIG_SYSFS
1475/*
1476 * This all compiles without CONFIG_SYSFS, but is a waste of space.
1477 */
1478
1479#define KSM_ATTR_RO(_name) \
1480 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1481#define KSM_ATTR(_name) \
1482 static struct kobj_attribute _name##_attr = \
1483 __ATTR(_name, 0644, _name##_show, _name##_store)
1484
1485static ssize_t sleep_millisecs_show(struct kobject *kobj,
1486 struct kobj_attribute *attr, char *buf)
1487{
1488 return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
1489}
1490
1491static ssize_t sleep_millisecs_store(struct kobject *kobj,
1492 struct kobj_attribute *attr,
1493 const char *buf, size_t count)
1494{
1495 unsigned long msecs;
1496 int err;
1497
1498 err = strict_strtoul(buf, 10, &msecs);
1499 if (err || msecs > UINT_MAX)
1500 return -EINVAL;
1501
1502 ksm_thread_sleep_millisecs = msecs;
1503
1504 return count;
1505}
1506KSM_ATTR(sleep_millisecs);
1507
1508static ssize_t pages_to_scan_show(struct kobject *kobj,
1509 struct kobj_attribute *attr, char *buf)
1510{
1511 return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
1512}
1513
1514static ssize_t pages_to_scan_store(struct kobject *kobj,
1515 struct kobj_attribute *attr,
1516 const char *buf, size_t count)
1517{
1518 int err;
1519 unsigned long nr_pages;
1520
1521 err = strict_strtoul(buf, 10, &nr_pages);
1522 if (err || nr_pages > UINT_MAX)
1523 return -EINVAL;
1524
1525 ksm_thread_pages_to_scan = nr_pages;
1526
1527 return count;
1528}
1529KSM_ATTR(pages_to_scan);
1530
1531static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
1532 char *buf)
1533{
1534 return sprintf(buf, "%u\n", ksm_run);
1535}
1536
1537static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1538 const char *buf, size_t count)
1539{
1540 int err;
1541 unsigned long flags;
1542
1543 err = strict_strtoul(buf, 10, &flags);
1544 if (err || flags > UINT_MAX)
1545 return -EINVAL;
1546 if (flags > KSM_RUN_UNMERGE)
1547 return -EINVAL;
1548
1549 /*
1550 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
1551 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
1552 * breaking COW to free the unswappable pages_shared (but leaves
1553 * mm_slots on the list for when ksmd may be set running again).
1554 */
1555
1556 mutex_lock(&ksm_thread_mutex);
1557 if (ksm_run != flags) {
1558 ksm_run = flags;
1559 if (flags & KSM_RUN_UNMERGE) {
1560 current->flags |= PF_OOM_ORIGIN;
1561 err = unmerge_and_remove_all_rmap_items();
1562 current->flags &= ~PF_OOM_ORIGIN;
1563 if (err) {
1564 ksm_run = KSM_RUN_STOP;
1565 count = err;
1566 }
1567 }
1568 }
1569 mutex_unlock(&ksm_thread_mutex);
1570
1571 if (flags & KSM_RUN_MERGE)
1572 wake_up_interruptible(&ksm_thread_wait);
1573
1574 return count;
1575}
1576KSM_ATTR(run);
1577
1578static ssize_t max_kernel_pages_store(struct kobject *kobj,
1579 struct kobj_attribute *attr,
1580 const char *buf, size_t count)
1581{
1582 int err;
1583 unsigned long nr_pages;
1584
1585 err = strict_strtoul(buf, 10, &nr_pages);
1586 if (err)
1587 return -EINVAL;
1588
1589 ksm_max_kernel_pages = nr_pages;
1590
1591 return count;
1592}
1593
1594static ssize_t max_kernel_pages_show(struct kobject *kobj,
1595 struct kobj_attribute *attr, char *buf)
1596{
1597 return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
1598}
1599KSM_ATTR(max_kernel_pages);
1600
1601static ssize_t pages_shared_show(struct kobject *kobj,
1602 struct kobj_attribute *attr, char *buf)
1603{
1604 return sprintf(buf, "%lu\n", ksm_pages_shared);
1605}
1606KSM_ATTR_RO(pages_shared);
1607
1608static ssize_t pages_sharing_show(struct kobject *kobj,
1609 struct kobj_attribute *attr, char *buf)
1610{
1611 return sprintf(buf, "%lu\n", ksm_pages_sharing);
1612}
1613KSM_ATTR_RO(pages_sharing);
1614
1615static ssize_t pages_unshared_show(struct kobject *kobj,
1616 struct kobj_attribute *attr, char *buf)
1617{
1618 return sprintf(buf, "%lu\n", ksm_pages_unshared);
1619}
1620KSM_ATTR_RO(pages_unshared);
1621
1622static ssize_t pages_volatile_show(struct kobject *kobj,
1623 struct kobj_attribute *attr, char *buf)
1624{
1625 long ksm_pages_volatile;
1626
1627 ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
1628 - ksm_pages_sharing - ksm_pages_unshared;
1629 /*
1630 * It was not worth any locking to calculate that statistic,
1631 * but it might therefore sometimes be negative: conceal that.
1632 */
1633 if (ksm_pages_volatile < 0)
1634 ksm_pages_volatile = 0;
1635 return sprintf(buf, "%ld\n", ksm_pages_volatile);
1636}
1637KSM_ATTR_RO(pages_volatile);
1638
1639static ssize_t full_scans_show(struct kobject *kobj,
1640 struct kobj_attribute *attr, char *buf)
1641{
1642 return sprintf(buf, "%lu\n", ksm_scan.seqnr);
1643}
1644KSM_ATTR_RO(full_scans);
1645
1646static struct attribute *ksm_attrs[] = {
1647 &sleep_millisecs_attr.attr,
1648 &pages_to_scan_attr.attr,
1649 &run_attr.attr,
1650 &max_kernel_pages_attr.attr,
1651 &pages_shared_attr.attr,
1652 &pages_sharing_attr.attr,
1653 &pages_unshared_attr.attr,
1654 &pages_volatile_attr.attr,
1655 &full_scans_attr.attr,
1656 NULL,
1657};
1658
1659static struct attribute_group ksm_attr_group = {
1660 .attrs = ksm_attrs,
1661 .name = "ksm",
1662};
1663#endif /* CONFIG_SYSFS */
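
With CONFIG_SYSFS these attributes appear under /sys/kernel/mm/ksm/. A hedged sketch of driving them from C as root (shell users would simply echo the same values); only the file names shown in the attribute group above are assumed:

#include <stdio.h>

static int ksm_write(const char *name, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	unsigned long sharing = 0;
	FILE *f;

	ksm_write("pages_to_scan", "100");	/* pages per batch */
	ksm_write("sleep_millisecs", "20");	/* sleep between batches */
	ksm_write("run", "1");			/* KSM_RUN_MERGE: start ksmd */

	f = fopen("/sys/kernel/mm/ksm/pages_sharing", "r");
	if (f && fscanf(f, "%lu", &sharing) == 1)
		printf("pages_sharing %lu\n", sharing);
	if (f)
		fclose(f);
	return 0;
}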
1664
1665static int __init ksm_init(void)
1666{
1667 struct task_struct *ksm_thread;
1668 int err;
1669
1670 err = ksm_slab_init();
1671 if (err)
1672 goto out;
1673
1674 err = mm_slots_hash_init();
1675 if (err)
1676 goto out_free1;
1677
1678 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
1679 if (IS_ERR(ksm_thread)) {
1680 printk(KERN_ERR "ksm: creating kthread failed\n");
1681 err = PTR_ERR(ksm_thread);
1682 goto out_free2;
1683 }
1684
1685#ifdef CONFIG_SYSFS
1686 err = sysfs_create_group(mm_kobj, &ksm_attr_group);
1687 if (err) {
1688 printk(KERN_ERR "ksm: register sysfs failed\n");
1689 kthread_stop(ksm_thread);
1690 goto out_free2;
1691 }
1692#endif /* CONFIG_SYSFS */
1693
1694 return 0;
1695
1696out_free2:
1697 mm_slots_hash_free();
1698out_free1:
1699 ksm_slab_free();
1700out:
1701 return err;
1702}
1703module_init(ksm_init)
diff --git a/mm/madvise.c b/mm/madvise.c
index 76eb4193acdd..d9ae2067952e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,6 +11,7 @@
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/hugetlb.h> 12#include <linux/hugetlb.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/ksm.h>
14 15
15/* 16/*
16 * Any behaviour which results in changes to the vma->vm_flags needs to 17 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -41,7 +42,7 @@ static long madvise_behavior(struct vm_area_struct * vma,
41 struct mm_struct * mm = vma->vm_mm; 42 struct mm_struct * mm = vma->vm_mm;
42 int error = 0; 43 int error = 0;
43 pgoff_t pgoff; 44 pgoff_t pgoff;
44 int new_flags = vma->vm_flags; 45 unsigned long new_flags = vma->vm_flags;
45 46
46 switch (behavior) { 47 switch (behavior) {
47 case MADV_NORMAL: 48 case MADV_NORMAL:
@@ -57,8 +58,18 @@ static long madvise_behavior(struct vm_area_struct * vma,
57 new_flags |= VM_DONTCOPY; 58 new_flags |= VM_DONTCOPY;
58 break; 59 break;
59 case MADV_DOFORK: 60 case MADV_DOFORK:
61 if (vma->vm_flags & VM_IO) {
62 error = -EINVAL;
63 goto out;
64 }
60 new_flags &= ~VM_DONTCOPY; 65 new_flags &= ~VM_DONTCOPY;
61 break; 66 break;
67 case MADV_MERGEABLE:
68 case MADV_UNMERGEABLE:
69 error = ksm_madvise(vma, start, end, behavior, &new_flags);
70 if (error)
71 goto out;
72 break;
62 } 73 }
63 74
64 if (new_flags == vma->vm_flags) { 75 if (new_flags == vma->vm_flags) {
@@ -211,37 +222,16 @@ static long
211madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, 222madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
212 unsigned long start, unsigned long end, int behavior) 223 unsigned long start, unsigned long end, int behavior)
213{ 224{
214 long error;
215
216 switch (behavior) { 225 switch (behavior) {
217 case MADV_DOFORK:
218 if (vma->vm_flags & VM_IO) {
219 error = -EINVAL;
220 break;
221 }
222 case MADV_DONTFORK:
223 case MADV_NORMAL:
224 case MADV_SEQUENTIAL:
225 case MADV_RANDOM:
226 error = madvise_behavior(vma, prev, start, end, behavior);
227 break;
228 case MADV_REMOVE: 226 case MADV_REMOVE:
229 error = madvise_remove(vma, prev, start, end); 227 return madvise_remove(vma, prev, start, end);
230 break;
231
232 case MADV_WILLNEED: 228 case MADV_WILLNEED:
233 error = madvise_willneed(vma, prev, start, end); 229 return madvise_willneed(vma, prev, start, end);
234 break;
235
236 case MADV_DONTNEED: 230 case MADV_DONTNEED:
237 error = madvise_dontneed(vma, prev, start, end); 231 return madvise_dontneed(vma, prev, start, end);
238 break;
239
240 default: 232 default:
241 BUG(); 233 return madvise_behavior(vma, prev, start, end, behavior);
242 break;
243 } 234 }
244 return error;
245} 235}
246 236
247static int 237static int
@@ -256,12 +246,17 @@ madvise_behavior_valid(int behavior)
256 case MADV_REMOVE: 246 case MADV_REMOVE:
257 case MADV_WILLNEED: 247 case MADV_WILLNEED:
258 case MADV_DONTNEED: 248 case MADV_DONTNEED:
249#ifdef CONFIG_KSM
250 case MADV_MERGEABLE:
251 case MADV_UNMERGEABLE:
252#endif
259 return 1; 253 return 1;
260 254
261 default: 255 default:
262 return 0; 256 return 0;
263 } 257 }
264} 258}
259
265/* 260/*
266 * The madvise(2) system call. 261 * The madvise(2) system call.
267 * 262 *
@@ -286,6 +281,12 @@ madvise_behavior_valid(int behavior)
286 * so the kernel can free resources associated with it. 281 * so the kernel can free resources associated with it.
287 * MADV_REMOVE - the application wants to free up the given range of 282 * MADV_REMOVE - the application wants to free up the given range of
288 * pages and associated backing store. 283 * pages and associated backing store.
284 * MADV_DONTFORK - omit this area from child's address space when forking:
285 * typically, to avoid COWing pages pinned by get_user_pages().
286 * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
287 * MADV_MERGEABLE - the application recommends that KSM try to merge pages in
288 * this area with pages of identical content from other such areas.
289 * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
289 * 290 *
290 * return values: 291 * return values:
291 * zero - success 292 * zero - success
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fd4529d86de5..9b10d8753784 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -648,7 +648,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
648 int nid = z->zone_pgdat->node_id; 648 int nid = z->zone_pgdat->node_id;
649 int zid = zone_idx(z); 649 int zid = zone_idx(z);
650 struct mem_cgroup_per_zone *mz; 650 struct mem_cgroup_per_zone *mz;
651 int lru = LRU_FILE * !!file + !!active; 651 int lru = LRU_FILE * file + active;
652 int ret; 652 int ret;
653 653
654 BUG_ON(!mem_cont); 654 BUG_ON(!mem_cont);
diff --git a/mm/memory.c b/mm/memory.c
index e8f63d9961ea..b1443ac07c00 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -45,6 +45,7 @@
45#include <linux/swap.h> 45#include <linux/swap.h>
46#include <linux/highmem.h> 46#include <linux/highmem.h>
47#include <linux/pagemap.h> 47#include <linux/pagemap.h>
48#include <linux/ksm.h>
48#include <linux/rmap.h> 49#include <linux/rmap.h>
49#include <linux/module.h> 50#include <linux/module.h>
50#include <linux/delayacct.h> 51#include <linux/delayacct.h>
@@ -107,6 +108,18 @@ static int __init disable_randmaps(char *s)
107} 108}
108__setup("norandmaps", disable_randmaps); 109__setup("norandmaps", disable_randmaps);
109 110
111unsigned long zero_pfn __read_mostly;
112unsigned long highest_memmap_pfn __read_mostly;
113
114/*
115 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
116 */
117static int __init init_zero_pfn(void)
118{
119 zero_pfn = page_to_pfn(ZERO_PAGE(0));
120 return 0;
121}
122core_initcall(init_zero_pfn);
110 123
111/* 124/*
112 * If a p?d_bad entry is found while walking page tables, report 125 * If a p?d_bad entry is found while walking page tables, report
@@ -443,6 +456,20 @@ static inline int is_cow_mapping(unsigned int flags)
443 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 456 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
444} 457}
445 458
459#ifndef is_zero_pfn
460static inline int is_zero_pfn(unsigned long pfn)
461{
462 return pfn == zero_pfn;
463}
464#endif
465
466#ifndef my_zero_pfn
467static inline unsigned long my_zero_pfn(unsigned long addr)
468{
469 return zero_pfn;
470}
471#endif
472
446/* 473/*
447 * vm_normal_page -- This function gets the "struct page" associated with a pte. 474 * vm_normal_page -- This function gets the "struct page" associated with a pte.
448 * 475 *
@@ -498,7 +525,9 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
498 if (HAVE_PTE_SPECIAL) { 525 if (HAVE_PTE_SPECIAL) {
499 if (likely(!pte_special(pte))) 526 if (likely(!pte_special(pte)))
500 goto check_pfn; 527 goto check_pfn;
501 if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) 528 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
529 return NULL;
530 if (!is_zero_pfn(pfn))
502 print_bad_pte(vma, addr, pte, NULL); 531 print_bad_pte(vma, addr, pte, NULL);
503 return NULL; 532 return NULL;
504 } 533 }
@@ -520,6 +549,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
520 } 549 }
521 } 550 }
522 551
552 if (is_zero_pfn(pfn))
553 return NULL;
523check_pfn: 554check_pfn:
524 if (unlikely(pfn > highest_memmap_pfn)) { 555 if (unlikely(pfn > highest_memmap_pfn)) {
525 print_bad_pte(vma, addr, pte, NULL); 556 print_bad_pte(vma, addr, pte, NULL);
@@ -597,8 +628,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
597 page = vm_normal_page(vma, addr, pte); 628 page = vm_normal_page(vma, addr, pte);
598 if (page) { 629 if (page) {
599 get_page(page); 630 get_page(page);
600 page_dup_rmap(page, vma, addr); 631 page_dup_rmap(page);
601 rss[!!PageAnon(page)]++; 632 rss[PageAnon(page)]++;
602 } 633 }
603 634
604out_set_pte: 635out_set_pte:
@@ -1143,9 +1174,14 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1143 goto no_page; 1174 goto no_page;
1144 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1175 if ((flags & FOLL_WRITE) && !pte_write(pte))
1145 goto unlock; 1176 goto unlock;
1177
1146 page = vm_normal_page(vma, address, pte); 1178 page = vm_normal_page(vma, address, pte);
1147 if (unlikely(!page)) 1179 if (unlikely(!page)) {
1148 goto bad_page; 1180 if ((flags & FOLL_DUMP) ||
1181 !is_zero_pfn(pte_pfn(pte)))
1182 goto bad_page;
1183 page = pte_page(pte);
1184 }
1149 1185
1150 if (flags & FOLL_GET) 1186 if (flags & FOLL_GET)
1151 get_page(page); 1187 get_page(page);
@@ -1173,65 +1209,46 @@ no_page:
1173 pte_unmap_unlock(ptep, ptl); 1209 pte_unmap_unlock(ptep, ptl);
1174 if (!pte_none(pte)) 1210 if (!pte_none(pte))
1175 return page; 1211 return page;
1176 /* Fall through to ZERO_PAGE handling */ 1212
1177no_page_table: 1213no_page_table:
1178 /* 1214 /*
1179 * When core dumping an enormous anonymous area that nobody 1215 * When core dumping an enormous anonymous area that nobody
1180 * has touched so far, we don't want to allocate page tables. 1216 * has touched so far, we don't want to allocate unnecessary pages or
1217 * page tables. Return error instead of NULL to skip handle_mm_fault,
1218 * then get_dump_page() will return NULL to leave a hole in the dump.
1219 * But we can only make this optimization where a hole would surely
1220 * be zero-filled if handle_mm_fault() actually did handle it.
1181 */ 1221 */
1182 if (flags & FOLL_ANON) { 1222 if ((flags & FOLL_DUMP) &&
1183 page = ZERO_PAGE(0); 1223 (!vma->vm_ops || !vma->vm_ops->fault))
1184 if (flags & FOLL_GET) 1224 return ERR_PTR(-EFAULT);
1185 get_page(page);
1186 BUG_ON(flags & FOLL_WRITE);
1187 }
1188 return page; 1225 return page;
1189} 1226}
1190 1227
1191/* Can we do the FOLL_ANON optimization? */
1192static inline int use_zero_page(struct vm_area_struct *vma)
1193{
1194 /*
1195 * We don't want to optimize FOLL_ANON for make_pages_present()
1196 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
1197 * we want to get the page from the page tables to make sure
1198 * that we serialize and update with any other user of that
1199 * mapping.
1200 */
1201 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1202 return 0;
1203 /*
1204 * And if we have a fault routine, it's not an anonymous region.
1205 */
1206 return !vma->vm_ops || !vma->vm_ops->fault;
1207}
1208
1209
1210
1211int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1228int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1212 unsigned long start, int nr_pages, int flags, 1229 unsigned long start, int nr_pages, unsigned int gup_flags,
1213 struct page **pages, struct vm_area_struct **vmas) 1230 struct page **pages, struct vm_area_struct **vmas)
1214{ 1231{
1215 int i; 1232 int i;
1216 unsigned int vm_flags = 0; 1233 unsigned long vm_flags;
1217 int write = !!(flags & GUP_FLAGS_WRITE);
1218 int force = !!(flags & GUP_FLAGS_FORCE);
1219 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1220 int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
1221 1234
1222 if (nr_pages <= 0) 1235 if (nr_pages <= 0)
1223 return 0; 1236 return 0;
1237
1238 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
1239
1224 /* 1240 /*
1225 * Require read or write permissions. 1241 * Require read or write permissions.
1226 * If 'force' is set, we only require the "MAY" flags. 1242 * If FOLL_FORCE is set, we only require the "MAY" flags.
1227 */ 1243 */
1228 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 1244 vm_flags = (gup_flags & FOLL_WRITE) ?
1229 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 1245 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1246 vm_flags &= (gup_flags & FOLL_FORCE) ?
1247 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1230 i = 0; 1248 i = 0;
1231 1249
1232 do { 1250 do {
1233 struct vm_area_struct *vma; 1251 struct vm_area_struct *vma;
1234 unsigned int foll_flags;
1235 1252
1236 vma = find_extend_vma(mm, start); 1253 vma = find_extend_vma(mm, start);
1237 if (!vma && in_gate_area(tsk, start)) { 1254 if (!vma && in_gate_area(tsk, start)) {
@@ -1243,7 +1260,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1243 pte_t *pte; 1260 pte_t *pte;
1244 1261
1245 /* user gate pages are read-only */ 1262 /* user gate pages are read-only */
1246 if (!ignore && write) 1263 if (gup_flags & FOLL_WRITE)
1247 return i ? : -EFAULT; 1264 return i ? : -EFAULT;
1248 if (pg > TASK_SIZE) 1265 if (pg > TASK_SIZE)
1249 pgd = pgd_offset_k(pg); 1266 pgd = pgd_offset_k(pg);
@@ -1277,38 +1294,26 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1277 1294
1278 if (!vma || 1295 if (!vma ||
1279 (vma->vm_flags & (VM_IO | VM_PFNMAP)) || 1296 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1280 (!ignore && !(vm_flags & vma->vm_flags))) 1297 !(vm_flags & vma->vm_flags))
1281 return i ? : -EFAULT; 1298 return i ? : -EFAULT;
1282 1299
1283 if (is_vm_hugetlb_page(vma)) { 1300 if (is_vm_hugetlb_page(vma)) {
1284 i = follow_hugetlb_page(mm, vma, pages, vmas, 1301 i = follow_hugetlb_page(mm, vma, pages, vmas,
1285 &start, &nr_pages, i, write); 1302 &start, &nr_pages, i, gup_flags);
1286 continue; 1303 continue;
1287 } 1304 }
1288 1305
1289 foll_flags = FOLL_TOUCH;
1290 if (pages)
1291 foll_flags |= FOLL_GET;
1292 if (!write && use_zero_page(vma))
1293 foll_flags |= FOLL_ANON;
1294
1295 do { 1306 do {
1296 struct page *page; 1307 struct page *page;
1308 unsigned int foll_flags = gup_flags;
1297 1309
1298 /* 1310 /*
1299 * If we have a pending SIGKILL, don't keep faulting 1311 * If we have a pending SIGKILL, don't keep faulting
1300 * pages and potentially allocating memory, unless 1312 * pages and potentially allocating memory.
1301 * current is handling munlock--e.g., on exit. In
1302 * that case, we are not allocating memory. Rather,
1303 * we're only unlocking already resident/mapped pages.
1304 */ 1313 */
1305 if (unlikely(!ignore_sigkill && 1314 if (unlikely(fatal_signal_pending(current)))
1306 fatal_signal_pending(current)))
1307 return i ? i : -ERESTARTSYS; 1315 return i ? i : -ERESTARTSYS;
1308 1316
1309 if (write)
1310 foll_flags |= FOLL_WRITE;
1311
1312 cond_resched(); 1317 cond_resched();
1313 while (!(page = follow_page(vma, start, foll_flags))) { 1318 while (!(page = follow_page(vma, start, foll_flags))) {
1314 int ret; 1319 int ret;
@@ -1419,18 +1424,47 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1419 unsigned long start, int nr_pages, int write, int force, 1424 unsigned long start, int nr_pages, int write, int force,
1420 struct page **pages, struct vm_area_struct **vmas) 1425 struct page **pages, struct vm_area_struct **vmas)
1421{ 1426{
1422 int flags = 0; 1427 int flags = FOLL_TOUCH;
1423 1428
1429 if (pages)
1430 flags |= FOLL_GET;
1424 if (write) 1431 if (write)
1425 flags |= GUP_FLAGS_WRITE; 1432 flags |= FOLL_WRITE;
1426 if (force) 1433 if (force)
1427 flags |= GUP_FLAGS_FORCE; 1434 flags |= FOLL_FORCE;
1428 1435
1429 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 1436 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
1430} 1437}
1431
1432EXPORT_SYMBOL(get_user_pages); 1438EXPORT_SYMBOL(get_user_pages);
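
The exported get_user_pages() keeps its old write/force integer arguments and now builds the FOLL_* mask internally. A hedged in-kernel fragment of the usual calling pattern, assuming mm and addr are in scope, pinning one page for writing and releasing the reference that FOLL_GET took:

static int touch_one_user_page(struct mm_struct *mm, unsigned long addr)
{
	struct page *page;
	int ret;

	down_read(&mm->mmap_sem);
	ret = get_user_pages(current, mm, addr & PAGE_MASK, 1,
			     1 /* write */, 0 /* force */, &page, NULL);
	up_read(&mm->mmap_sem);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... kmap(page) and read or modify the data here ... */
	set_page_dirty_lock(page);
	put_page(page);			/* drop the reference FOLL_GET took */
	return 0;
}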
1433 1439
1440/**
1441 * get_dump_page() - pin user page in memory while writing it to core dump
1442 * @addr: user address
1443 *
1444 * Returns struct page pointer of user page pinned for dump,
1445 * to be freed afterwards by page_cache_release() or put_page().
1446 *
1447 * Returns NULL on any kind of failure - a hole must then be inserted into
1448 * the corefile, to preserve alignment with its headers; and also returns
1449 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
1450 * allowing a hole to be left in the corefile to save diskspace.
1451 *
1452 * Called without mmap_sem, but after all other threads have been killed.
1453 */
1454#ifdef CONFIG_ELF_CORE
1455struct page *get_dump_page(unsigned long addr)
1456{
1457 struct vm_area_struct *vma;
1458 struct page *page;
1459
1460 if (__get_user_pages(current, current->mm, addr, 1,
1461 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
1462 return NULL;
1463 flush_cache_page(vma, addr, page_to_pfn(page));
1464 return page;
1465}
1466#endif /* CONFIG_ELF_CORE */
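
get_dump_page() is meant for the ELF core dumper, where a NULL return means "leave a hole". A hedged fragment of how a dump loop might consume it; dump_write() and dump_seek() stand in for the dumper's own output helpers and are not defined here:

	/* inside the dumper, for one vma of the core image: */
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
		struct page *page = get_dump_page(addr);

		if (page) {
			void *kaddr = kmap(page);

			dump_write(file, kaddr, PAGE_SIZE);	/* assumed helper */
			kunmap(page);
			page_cache_release(page);
		} else {
			dump_seek(file, PAGE_SIZE);		/* hole in the dump */
		}
	}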
1467
1434pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1468pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
1435 spinlock_t **ptl) 1469 spinlock_t **ptl)
1436{ 1470{
@@ -1608,7 +1642,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1608 * If we don't have pte special, then we have to use the pfn_valid() 1642 * If we don't have pte special, then we have to use the pfn_valid()
1609 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* 1643 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
1610 * refcount the page if pfn_valid is true (hence insert_page rather 1644 * refcount the page if pfn_valid is true (hence insert_page rather
1611 * than insert_pfn). 1645 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
1646 * without pte special, it would there be refcounted as a normal page.
1612 */ 1647 */
1613 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { 1648 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
1614 struct page *page; 1649 struct page *page;
@@ -1974,7 +2009,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1974 * Take out anonymous pages first, anonymous shared vmas are 2009 * Take out anonymous pages first, anonymous shared vmas are
1975 * not dirty accountable. 2010 * not dirty accountable.
1976 */ 2011 */
1977 if (PageAnon(old_page)) { 2012 if (PageAnon(old_page) && !PageKsm(old_page)) {
1978 if (!trylock_page(old_page)) { 2013 if (!trylock_page(old_page)) {
1979 page_cache_get(old_page); 2014 page_cache_get(old_page);
1980 pte_unmap_unlock(page_table, ptl); 2015 pte_unmap_unlock(page_table, ptl);
@@ -2075,10 +2110,19 @@ gotten:
2075 2110
2076 if (unlikely(anon_vma_prepare(vma))) 2111 if (unlikely(anon_vma_prepare(vma)))
2077 goto oom; 2112 goto oom;
2078 VM_BUG_ON(old_page == ZERO_PAGE(0)); 2113
2079 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 2114 if (is_zero_pfn(pte_pfn(orig_pte))) {
2080 if (!new_page) 2115 new_page = alloc_zeroed_user_highpage_movable(vma, address);
2081 goto oom; 2116 if (!new_page)
2117 goto oom;
2118 } else {
2119 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2120 if (!new_page)
2121 goto oom;
2122 cow_user_page(new_page, old_page, address, vma);
2123 }
2124 __SetPageUptodate(new_page);
2125
2082 /* 2126 /*
2083 * Don't let another task, with possibly unlocked vma, 2127 * Don't let another task, with possibly unlocked vma,
2084 * keep the mlocked page. 2128 * keep the mlocked page.
@@ -2088,8 +2132,6 @@ gotten:
2088 clear_page_mlock(old_page); 2132 clear_page_mlock(old_page);
2089 unlock_page(old_page); 2133 unlock_page(old_page);
2090 } 2134 }
2091 cow_user_page(new_page, old_page, address, vma);
2092 __SetPageUptodate(new_page);
2093 2135
2094 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2136 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2095 goto oom_free_new; 2137 goto oom_free_new;
@@ -2115,9 +2157,14 @@ gotten:
2115 * seen in the presence of one thread doing SMC and another 2157 * seen in the presence of one thread doing SMC and another
2116 * thread doing COW. 2158 * thread doing COW.
2117 */ 2159 */
2118 ptep_clear_flush_notify(vma, address, page_table); 2160 ptep_clear_flush(vma, address, page_table);
2119 page_add_new_anon_rmap(new_page, vma, address); 2161 page_add_new_anon_rmap(new_page, vma, address);
2120 set_pte_at(mm, address, page_table, entry); 2162 /*
2163 * We call the notify macro here because, when using secondary
2164 * mmu page tables (such as kvm shadow page tables), we want the
2165 * new page to be mapped directly into the secondary page table.
2166 */
2167 set_pte_at_notify(mm, address, page_table, entry);
2121 update_mmu_cache(vma, address, entry); 2168 update_mmu_cache(vma, address, entry);
2122 if (old_page) { 2169 if (old_page) {
2123 /* 2170 /*
@@ -2625,6 +2672,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2625 spinlock_t *ptl; 2672 spinlock_t *ptl;
2626 pte_t entry; 2673 pte_t entry;
2627 2674
2675 if (!(flags & FAULT_FLAG_WRITE)) {
2676 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
2677 vma->vm_page_prot));
2678 ptl = pte_lockptr(mm, pmd);
2679 spin_lock(ptl);
2680 if (!pte_none(*page_table))
2681 goto unlock;
2682 goto setpte;
2683 }
2684
2628 /* Allocate our own private page. */ 2685 /* Allocate our own private page. */
2629 pte_unmap(page_table); 2686 pte_unmap(page_table);
2630 2687
@@ -2639,13 +2696,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2639 goto oom_free_page; 2696 goto oom_free_page;
2640 2697
2641 entry = mk_pte(page, vma->vm_page_prot); 2698 entry = mk_pte(page, vma->vm_page_prot);
2642 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2699 if (vma->vm_flags & VM_WRITE)
2700 entry = pte_mkwrite(pte_mkdirty(entry));
2643 2701
2644 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2702 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2645 if (!pte_none(*page_table)) 2703 if (!pte_none(*page_table))
2646 goto release; 2704 goto release;
2705
2647 inc_mm_counter(mm, anon_rss); 2706 inc_mm_counter(mm, anon_rss);
2648 page_add_new_anon_rmap(page, vma, address); 2707 page_add_new_anon_rmap(page, vma, address);
2708setpte:
2649 set_pte_at(mm, address, page_table, entry); 2709 set_pte_at(mm, address, page_table, entry);
2650 2710
2651 /* No need to invalidate - it was non-present before */ 2711 /* No need to invalidate - it was non-present before */
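
With the do_anonymous_page() change above, a read fault on untouched anonymous memory maps the shared zero page instead of allocating. A small userspace check of that behaviour (the exact VmRSS figures depend on the running kernel): read a large anonymous mapping and watch resident memory barely grow.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define LEN (64UL << 20)	/* 64 MiB */

static void print_rss(const char *when)
{
	char line[128];
	FILE *f = fopen("/proc/self/status", "r");

	while (f && fgets(line, sizeof(line), f))
		if (!strncmp(line, "VmRSS", 5))
			printf("%s: %s", when, line);
	if (f)
		fclose(f);
}

int main(void)
{
	volatile char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	unsigned long i, sum = 0;

	if (p == MAP_FAILED)
		return 1;
	print_rss("before");
	for (i = 0; i < LEN; i += 4096)
		sum += p[i];		/* read faults map the zero page */
	print_rss("after reads");
	return (int)(sum & 1);
}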
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e4412a676c88..efe3e0ec2e61 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -339,8 +339,11 @@ EXPORT_SYMBOL_GPL(__remove_pages);
339 339
340void online_page(struct page *page) 340void online_page(struct page *page)
341{ 341{
342 unsigned long pfn = page_to_pfn(page);
343
342 totalram_pages++; 344 totalram_pages++;
343 num_physpages++; 345 if (pfn >= num_physpages)
346 num_physpages = pfn + 1;
344 347
345#ifdef CONFIG_HIGHMEM 348#ifdef CONFIG_HIGHMEM
346 if (PageHighMem(page)) 349 if (PageHighMem(page))
@@ -422,6 +425,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
422 zone->present_pages += onlined_pages; 425 zone->present_pages += onlined_pages;
423 zone->zone_pgdat->node_present_pages += onlined_pages; 426 zone->zone_pgdat->node_present_pages += onlined_pages;
424 427
428 zone_pcp_update(zone);
425 setup_per_zone_wmarks(); 429 setup_per_zone_wmarks();
426 calculate_zone_inactive_ratio(zone); 430 calculate_zone_inactive_ratio(zone);
427 if (onlined_pages) { 431 if (onlined_pages) {
@@ -831,7 +835,6 @@ repeat:
831 zone->present_pages -= offlined_pages; 835 zone->present_pages -= offlined_pages;
832 zone->zone_pgdat->node_present_pages -= offlined_pages; 836 zone->zone_pgdat->node_present_pages -= offlined_pages;
833 totalram_pages -= offlined_pages; 837 totalram_pages -= offlined_pages;
834 num_physpages -= offlined_pages;
835 838
836 setup_per_zone_wmarks(); 839 setup_per_zone_wmarks();
837 calculate_zone_inactive_ratio(zone); 840 calculate_zone_inactive_ratio(zone);
diff --git a/mm/mempool.c b/mm/mempool.c
index 32e75d400503..1a3bc3d4d554 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -308,13 +308,6 @@ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
308} 308}
309EXPORT_SYMBOL(mempool_kmalloc); 309EXPORT_SYMBOL(mempool_kmalloc);
310 310
311void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
312{
313 size_t size = (size_t)pool_data;
314 return kzalloc(size, gfp_mask);
315}
316EXPORT_SYMBOL(mempool_kzalloc);
317
318void mempool_kfree(void *element, void *pool_data) 311void mempool_kfree(void *element, void *pool_data)
319{ 312{
320 kfree(element); 313 kfree(element);
diff --git a/mm/migrate.c b/mm/migrate.c
index 939888f9ddab..16052e80aaac 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -67,6 +67,8 @@ int putback_lru_pages(struct list_head *l)
67 67
68 list_for_each_entry_safe(page, page2, l, lru) { 68 list_for_each_entry_safe(page, page2, l, lru) {
69 list_del(&page->lru); 69 list_del(&page->lru);
70 dec_zone_page_state(page, NR_ISOLATED_ANON +
71 page_is_file_cache(page));
70 putback_lru_page(page); 72 putback_lru_page(page);
71 count++; 73 count++;
72 } 74 }
@@ -147,7 +149,7 @@ out:
147static void remove_file_migration_ptes(struct page *old, struct page *new) 149static void remove_file_migration_ptes(struct page *old, struct page *new)
148{ 150{
149 struct vm_area_struct *vma; 151 struct vm_area_struct *vma;
150 struct address_space *mapping = page_mapping(new); 152 struct address_space *mapping = new->mapping;
151 struct prio_tree_iter iter; 153 struct prio_tree_iter iter;
152 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 154 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
153 155
@@ -270,7 +272,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
270 pslot = radix_tree_lookup_slot(&mapping->page_tree, 272 pslot = radix_tree_lookup_slot(&mapping->page_tree,
271 page_index(page)); 273 page_index(page));
272 274
273 expected_count = 2 + !!page_has_private(page); 275 expected_count = 2 + page_has_private(page);
274 if (page_count(page) != expected_count || 276 if (page_count(page) != expected_count ||
275 (struct page *)radix_tree_deref_slot(pslot) != page) { 277 (struct page *)radix_tree_deref_slot(pslot) != page) {
276 spin_unlock_irq(&mapping->tree_lock); 278 spin_unlock_irq(&mapping->tree_lock);
@@ -312,7 +314,10 @@ static int migrate_page_move_mapping(struct address_space *mapping,
312 */ 314 */
313 __dec_zone_page_state(page, NR_FILE_PAGES); 315 __dec_zone_page_state(page, NR_FILE_PAGES);
314 __inc_zone_page_state(newpage, NR_FILE_PAGES); 316 __inc_zone_page_state(newpage, NR_FILE_PAGES);
315 317 if (PageSwapBacked(page)) {
318 __dec_zone_page_state(page, NR_SHMEM);
319 __inc_zone_page_state(newpage, NR_SHMEM);
320 }
316 spin_unlock_irq(&mapping->tree_lock); 321 spin_unlock_irq(&mapping->tree_lock);
317 322
318 return 0; 323 return 0;
@@ -664,13 +669,15 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
664 * needs to be effective. 669 * needs to be effective.
665 */ 670 */
666 try_to_free_buffers(page); 671 try_to_free_buffers(page);
672 goto rcu_unlock;
667 } 673 }
668 goto rcu_unlock; 674 goto skip_unmap;
669 } 675 }
670 676
671 /* Establish migration ptes or remove ptes */ 677 /* Establish migration ptes or remove ptes */
672 try_to_unmap(page, 1); 678 try_to_unmap(page, 1);
673 679
680skip_unmap:
674 if (!page_mapped(page)) 681 if (!page_mapped(page))
675 rc = move_to_new_page(newpage, page); 682 rc = move_to_new_page(newpage, page);
676 683
@@ -693,6 +700,8 @@ unlock:
693 * restored. 700 * restored.
694 */ 701 */
695 list_del(&page->lru); 702 list_del(&page->lru);
703 dec_zone_page_state(page, NR_ISOLATED_ANON +
704 page_is_file_cache(page));
696 putback_lru_page(page); 705 putback_lru_page(page);
697 } 706 }
698 707
@@ -737,6 +746,13 @@ int migrate_pages(struct list_head *from,
737 struct page *page2; 746 struct page *page2;
738 int swapwrite = current->flags & PF_SWAPWRITE; 747 int swapwrite = current->flags & PF_SWAPWRITE;
739 int rc; 748 int rc;
749 unsigned long flags;
750
751 local_irq_save(flags);
752 list_for_each_entry(page, from, lru)
753 __inc_zone_page_state(page, NR_ISOLATED_ANON +
754 page_is_file_cache(page));
755 local_irq_restore(flags);
740 756
741 if (!swapwrite) 757 if (!swapwrite)
742 current->flags |= PF_SWAPWRITE; 758 current->flags |= PF_SWAPWRITE;
diff --git a/mm/mlock.c b/mm/mlock.c
index 45eb650b9654..bd6f0e466f6c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -139,49 +139,36 @@ static void munlock_vma_page(struct page *page)
139} 139}
140 140
141/** 141/**
142 * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma. 142 * __mlock_vma_pages_range() - mlock a range of pages in the vma.
143 * @vma: target vma 143 * @vma: target vma
144 * @start: start address 144 * @start: start address
145 * @end: end address 145 * @end: end address
146 * @mlock: 0 indicate munlock, otherwise mlock.
147 * 146 *
148 * If @mlock == 0, unlock an mlocked range; 147 * This takes care of making the pages present too.
149 * else mlock the range of pages. This takes care of making the pages present ,
150 * too.
151 * 148 *
152 * return 0 on success, negative error code on error. 149 * return 0 on success, negative error code on error.
153 * 150 *
154 * vma->vm_mm->mmap_sem must be held for at least read. 151 * vma->vm_mm->mmap_sem must be held for at least read.
155 */ 152 */
156static long __mlock_vma_pages_range(struct vm_area_struct *vma, 153static long __mlock_vma_pages_range(struct vm_area_struct *vma,
157 unsigned long start, unsigned long end, 154 unsigned long start, unsigned long end)
158 int mlock)
159{ 155{
160 struct mm_struct *mm = vma->vm_mm; 156 struct mm_struct *mm = vma->vm_mm;
161 unsigned long addr = start; 157 unsigned long addr = start;
162 struct page *pages[16]; /* 16 gives a reasonable batch */ 158 struct page *pages[16]; /* 16 gives a reasonable batch */
163 int nr_pages = (end - start) / PAGE_SIZE; 159 int nr_pages = (end - start) / PAGE_SIZE;
164 int ret = 0; 160 int ret = 0;
165 int gup_flags = 0; 161 int gup_flags;
166 162
167 VM_BUG_ON(start & ~PAGE_MASK); 163 VM_BUG_ON(start & ~PAGE_MASK);
168 VM_BUG_ON(end & ~PAGE_MASK); 164 VM_BUG_ON(end & ~PAGE_MASK);
169 VM_BUG_ON(start < vma->vm_start); 165 VM_BUG_ON(start < vma->vm_start);
170 VM_BUG_ON(end > vma->vm_end); 166 VM_BUG_ON(end > vma->vm_end);
171 VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) && 167 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
172 (atomic_read(&mm->mm_users) != 0));
173
174 /*
175 * mlock: don't page populate if vma has PROT_NONE permission.
176 * munlock: always do munlock although the vma has PROT_NONE
177 * permission, or SIGKILL is pending.
178 */
179 if (!mlock)
180 gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
181 GUP_FLAGS_IGNORE_SIGKILL;
182 168
169 gup_flags = FOLL_TOUCH | FOLL_GET;
183 if (vma->vm_flags & VM_WRITE) 170 if (vma->vm_flags & VM_WRITE)
184 gup_flags |= GUP_FLAGS_WRITE; 171 gup_flags |= FOLL_WRITE;
185 172
186 while (nr_pages > 0) { 173 while (nr_pages > 0) {
187 int i; 174 int i;
@@ -201,51 +188,45 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
201 * This can happen for, e.g., VM_NONLINEAR regions before 188 * This can happen for, e.g., VM_NONLINEAR regions before
202 * a page has been allocated and mapped at a given offset, 189 * a page has been allocated and mapped at a given offset,
203 * or for addresses that map beyond end of a file. 190 * or for addresses that map beyond end of a file.
204 * We'll mlock the the pages if/when they get faulted in. 191 * We'll mlock the pages if/when they get faulted in.
205 */ 192 */
206 if (ret < 0) 193 if (ret < 0)
207 break; 194 break;
208 if (ret == 0) {
209 /*
210 * We know the vma is there, so the only time
211 * we cannot get a single page should be an
212 * error (ret < 0) case.
213 */
214 WARN_ON(1);
215 break;
216 }
217 195
218 lru_add_drain(); /* push cached pages to LRU */ 196 lru_add_drain(); /* push cached pages to LRU */
219 197
220 for (i = 0; i < ret; i++) { 198 for (i = 0; i < ret; i++) {
221 struct page *page = pages[i]; 199 struct page *page = pages[i];
222 200
223 lock_page(page);
224 /*
225 * Because we lock page here and migration is blocked
226 * by the elevated reference, we need only check for
227 * page truncation (file-cache only).
228 */
229 if (page->mapping) { 201 if (page->mapping) {
230 if (mlock) 202 /*
203 * That preliminary check is mainly to avoid
204 * the pointless overhead of lock_page on the
205 * ZERO_PAGE: which might bounce very badly if
206 * there is contention. However, we're still
207 * dirtying its cacheline with get/put_page:
208 * we'll add another __get_user_pages flag to
209 * avoid it if that case turns out to matter.
210 */
211 lock_page(page);
212 /*
213 * Because we lock page here and migration is
214 * blocked by the elevated reference, we need
215 * only check for file-cache page truncation.
216 */
217 if (page->mapping)
231 mlock_vma_page(page); 218 mlock_vma_page(page);
232 else 219 unlock_page(page);
233 munlock_vma_page(page);
234 } 220 }
235 unlock_page(page); 221 put_page(page); /* ref from get_user_pages() */
236 put_page(page); /* ref from get_user_pages() */
237
238 /*
239 * here we assume that get_user_pages() has given us
240 * a list of virtually contiguous pages.
241 */
242 addr += PAGE_SIZE; /* for next get_user_pages() */
243 nr_pages--;
244 } 222 }
223
224 addr += ret * PAGE_SIZE;
225 nr_pages -= ret;
245 ret = 0; 226 ret = 0;
246 } 227 }
247 228
248 return ret; /* count entire vma as locked_vm */ 229 return ret; /* 0 or negative error code */
249} 230}
250 231
251/* 232/*
@@ -289,7 +270,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
289 is_vm_hugetlb_page(vma) || 270 is_vm_hugetlb_page(vma) ||
290 vma == get_gate_vma(current))) { 271 vma == get_gate_vma(current))) {
291 272
292 __mlock_vma_pages_range(vma, start, end, 1); 273 __mlock_vma_pages_range(vma, start, end);
293 274
294 /* Hide errors from mmap() and other callers */ 275 /* Hide errors from mmap() and other callers */
295 return 0; 276 return 0;
@@ -310,7 +291,6 @@ no_mlock:
310 return nr_pages; /* error or pages NOT mlocked */ 291 return nr_pages; /* error or pages NOT mlocked */
311} 292}
312 293
313
314/* 294/*
315 * munlock_vma_pages_range() - munlock all pages in the vma range.' 295 * munlock_vma_pages_range() - munlock all pages in the vma range.'
316 * @vma - vma containing range to be munlock()ed. 296 * @vma - vma containing range to be munlock()ed.
@@ -330,10 +310,38 @@ no_mlock:
330 * free them. This will result in freeing mlocked pages. 310 * free them. This will result in freeing mlocked pages.
331 */ 311 */
332void munlock_vma_pages_range(struct vm_area_struct *vma, 312void munlock_vma_pages_range(struct vm_area_struct *vma,
333 unsigned long start, unsigned long end) 313 unsigned long start, unsigned long end)
334{ 314{
315 unsigned long addr;
316
317 lru_add_drain();
335 vma->vm_flags &= ~VM_LOCKED; 318 vma->vm_flags &= ~VM_LOCKED;
336 __mlock_vma_pages_range(vma, start, end, 0); 319
320 for (addr = start; addr < end; addr += PAGE_SIZE) {
321 struct page *page;
322 /*
323 * Although FOLL_DUMP is intended for get_dump_page(),
324 * it just so happens that its special treatment of the
325 * ZERO_PAGE (returning an error instead of doing get_page)
326 * suits munlock very well (and if somehow an abnormal page
327 * has sneaked into the range, we won't oops here: great).
328 */
329 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
330 if (page && !IS_ERR(page)) {
331 lock_page(page);
332 /*
333 * Like in __mlock_vma_pages_range(),
334 * because we lock page here and migration is
335 * blocked by the elevated reference, we need
336 * only check for file-cache page truncation.
337 */
338 if (page->mapping)
339 munlock_vma_page(page);
340 unlock_page(page);
341 put_page(page);
342 }
343 cond_resched();
344 }
337} 345}
338 346
339/* 347/*
@@ -400,18 +408,14 @@ success:
400 * It's okay if try_to_unmap_one unmaps a page just after we 408 * It's okay if try_to_unmap_one unmaps a page just after we
401 * set VM_LOCKED, __mlock_vma_pages_range will bring it back. 409 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
402 */ 410 */
403 vma->vm_flags = newflags;
404 411
405 if (lock) { 412 if (lock) {
406 ret = __mlock_vma_pages_range(vma, start, end, 1); 413 vma->vm_flags = newflags;
407 414 ret = __mlock_vma_pages_range(vma, start, end);
408 if (ret > 0) { 415 if (ret < 0)
409 mm->locked_vm -= ret; 416 ret = __mlock_posix_error_return(ret);
410 ret = 0;
411 } else
412 ret = __mlock_posix_error_return(ret); /* translate if needed */
413 } else { 417 } else {
414 __mlock_vma_pages_range(vma, start, end, 0); 418 munlock_vma_pages_range(vma, start, end);
415 } 419 }
416 420
417out: 421out:
diff --git a/mm/mmap.c b/mm/mmap.c
index 26892e346d8f..21d4029a07b3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,7 +28,7 @@
28#include <linux/mempolicy.h> 28#include <linux/mempolicy.h>
29#include <linux/rmap.h> 29#include <linux/rmap.h>
30#include <linux/mmu_notifier.h> 30#include <linux/mmu_notifier.h>
31#include <linux/perf_counter.h> 31#include <linux/perf_event.h>
32 32
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
34#include <asm/cacheflush.h> 34#include <asm/cacheflush.h>
@@ -570,9 +570,9 @@ again: remove_next = 1 + (end > next->vm_end);
570 570
571 /* 571 /*
572 * When changing only vma->vm_end, we don't really need 572 * When changing only vma->vm_end, we don't really need
573 * anon_vma lock: but is that case worth optimizing out? 573 * anon_vma lock.
574 */ 574 */
575 if (vma->anon_vma) 575 if (vma->anon_vma && (insert || importer || start != vma->vm_start))
576 anon_vma = vma->anon_vma; 576 anon_vma = vma->anon_vma;
577 if (anon_vma) { 577 if (anon_vma) {
578 spin_lock(&anon_vma->lock); 578 spin_lock(&anon_vma->lock);
@@ -656,9 +656,6 @@ again: remove_next = 1 + (end > next->vm_end);
656 validate_mm(mm); 656 validate_mm(mm);
657} 657}
658 658
659/* Flags that can be inherited from an existing mapping when merging */
660#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR)
661
662/* 659/*
663 * If the vma has a ->close operation then the driver probably needs to release 660 * If the vma has a ->close operation then the driver probably needs to release
664 * per-vma resources, so we don't attempt to merge those. 661 * per-vma resources, so we don't attempt to merge those.
@@ -666,7 +663,8 @@ again: remove_next = 1 + (end > next->vm_end);
666static inline int is_mergeable_vma(struct vm_area_struct *vma, 663static inline int is_mergeable_vma(struct vm_area_struct *vma,
667 struct file *file, unsigned long vm_flags) 664 struct file *file, unsigned long vm_flags)
668{ 665{
669 if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS) 666 /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */
667 if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR)
670 return 0; 668 return 0;
671 if (vma->vm_file != file) 669 if (vma->vm_file != file)
672 return 0; 670 return 0;
@@ -951,6 +949,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
951 if (mm->map_count > sysctl_max_map_count) 949 if (mm->map_count > sysctl_max_map_count)
952 return -ENOMEM; 950 return -ENOMEM;
953 951
952 if (flags & MAP_HUGETLB) {
953 struct user_struct *user = NULL;
954 if (file)
955 return -EINVAL;
956
957 /*
958 * VM_NORESERVE is used because the reservations will be
959 * taken when vm_ops->mmap() is called
960 * A dummy user value is used because we are not locking
961 * memory so no accounting is necessary
962 */
963 len = ALIGN(len, huge_page_size(&default_hstate));
964 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
965 &user, HUGETLB_ANONHUGE_INODE);
966 if (IS_ERR(file))
967 return PTR_ERR(file);
968 }
969
954 /* Obtain the address to map to. we verify (or select) it and ensure 970 /* Obtain the address to map to. we verify (or select) it and ensure
955 * that it represents a valid section of the address space. 971 * that it represents a valid section of the address space.
956 */ 972 */
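
The MAP_HUGETLB branch added above lets an anonymous mmap() be backed by huge pages without an explicit hugetlbfs mount. A hedged userspace example; MAP_HUGETLB is defined as 0x40000 (the x86 value) in case older headers lack it, and huge pages must first be reserved via /proc/sys/vm/nr_hugepages:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000	/* x86 value; other arches may differ */
#endif

#define LEN (4UL << 20)		/* two 2 MiB huge pages on x86 */

int main(void)
{
	char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");	/* e.g. no huge pages reserved */
		return 1;
	}
	memset(p, 0, LEN);
	munmap(p, LEN);
	return 0;
}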
@@ -965,11 +981,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
965 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | 981 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
966 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 982 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
967 983
968 if (flags & MAP_LOCKED) { 984 if (flags & MAP_LOCKED)
969 if (!can_do_mlock()) 985 if (!can_do_mlock())
970 return -EPERM; 986 return -EPERM;
971 vm_flags |= VM_LOCKED;
972 }
973 987
974 /* mlock MCL_FUTURE? */ 988 /* mlock MCL_FUTURE? */
975 if (vm_flags & VM_LOCKED) { 989 if (vm_flags & VM_LOCKED) {
@@ -1195,21 +1209,21 @@ munmap_back:
1195 goto unmap_and_free_vma; 1209 goto unmap_and_free_vma;
1196 if (vm_flags & VM_EXECUTABLE) 1210 if (vm_flags & VM_EXECUTABLE)
1197 added_exe_file_vma(mm); 1211 added_exe_file_vma(mm);
1212
1213 /* Can addr have changed??
1214 *
1215 * Answer: Yes, several device drivers can do it in their
1216 * f_op->mmap method. -DaveM
1217 */
1218 addr = vma->vm_start;
1219 pgoff = vma->vm_pgoff;
1220 vm_flags = vma->vm_flags;
1198 } else if (vm_flags & VM_SHARED) { 1221 } else if (vm_flags & VM_SHARED) {
1199 error = shmem_zero_setup(vma); 1222 error = shmem_zero_setup(vma);
1200 if (error) 1223 if (error)
1201 goto free_vma; 1224 goto free_vma;
1202 } 1225 }
1203 1226
1204 /* Can addr have changed??
1205 *
1206 * Answer: Yes, several device drivers can do it in their
1207 * f_op->mmap method. -DaveM
1208 */
1209 addr = vma->vm_start;
1210 pgoff = vma->vm_pgoff;
1211 vm_flags = vma->vm_flags;
1212
1213 if (vma_wants_writenotify(vma)) 1227 if (vma_wants_writenotify(vma))
1214 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); 1228 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1215 1229
@@ -1220,7 +1234,7 @@ munmap_back:
1220 if (correct_wcount) 1234 if (correct_wcount)
1221 atomic_inc(&inode->i_writecount); 1235 atomic_inc(&inode->i_writecount);
1222out: 1236out:
1223 perf_counter_mmap(vma); 1237 perf_event_mmap(vma);
1224 1238
1225 mm->total_vm += len >> PAGE_SHIFT; 1239 mm->total_vm += len >> PAGE_SHIFT;
1226 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1240 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
@@ -2111,6 +2125,7 @@ void exit_mmap(struct mm_struct *mm)
2111 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2125 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2112 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2126 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2113 vm_unacct_memory(nr_accounted); 2127 vm_unacct_memory(nr_accounted);
2128
2114 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); 2129 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
2115 tlb_finish_mmu(tlb, 0, end); 2130 tlb_finish_mmu(tlb, 0, end);
2116 2131
@@ -2308,7 +2323,7 @@ int install_special_mapping(struct mm_struct *mm,
2308 2323
2309 mm->total_vm += len >> PAGE_SHIFT; 2324 mm->total_vm += len >> PAGE_SHIFT;
2310 2325
2311 perf_counter_mmap(vma); 2326 perf_event_mmap(vma);
2312 2327
2313 return 0; 2328 return 0;
2314} 2329}
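
The MAP_HUGETLB branch added to do_mmap_pgoff() above lets an anonymous mapping be backed by hugetlbfs without the caller ever opening a hugetlbfs file. A minimal userspace sketch of how such a mapping might be requested (assuming huge pages have been reserved, e.g. via /proc/sys/vm/nr_hugepages, and a 2MB huge page size):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000	/* x86 value; older libc headers may lack it */
#endif

int main(void)
{
	size_t len = 2 * 1024 * 1024;	/* one 2MB huge page */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	((char *)p)[0] = 1;	/* touching the mapping faults in a huge page */
	munmap(p, len);
	return 0;
}
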
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
new file mode 100644
index 000000000000..ded9081f4021
--- /dev/null
+++ b/mm/mmu_context.c
@@ -0,0 +1,58 @@
1/* Copyright (C) 2009 Red Hat, Inc.
2 *
3 * See ../COPYING for licensing terms.
4 */
5
6#include <linux/mm.h>
7#include <linux/mmu_context.h>
8#include <linux/sched.h>
9
10#include <asm/mmu_context.h>
11
12/*
13 * use_mm
14 * Makes the calling kernel thread take on the specified
15 * mm context.
 16 * Called by the retry thread to execute retries within the
17 * iocb issuer's mm context, so that copy_from/to_user
18 * operations work seamlessly for aio.
19 * (Note: this routine is intended to be called only
20 * from a kernel thread context)
21 */
22void use_mm(struct mm_struct *mm)
23{
24 struct mm_struct *active_mm;
25 struct task_struct *tsk = current;
26
27 task_lock(tsk);
28 active_mm = tsk->active_mm;
29 if (active_mm != mm) {
30 atomic_inc(&mm->mm_count);
31 tsk->active_mm = mm;
32 }
33 tsk->mm = mm;
34 switch_mm(active_mm, mm, tsk);
35 task_unlock(tsk);
36
37 if (active_mm != mm)
38 mmdrop(active_mm);
39}
40
41/*
42 * unuse_mm
43 * Reverses the effect of use_mm, i.e. releases the
44 * specified mm context which was earlier taken on
45 * by the calling kernel thread
46 * (Note: this routine is intended to be called only
47 * from a kernel thread context)
48 */
49void unuse_mm(struct mm_struct *mm)
50{
51 struct task_struct *tsk = current;
52
53 task_lock(tsk);
54 tsk->mm = NULL;
55 /* active_mm is still 'mm' */
56 enter_lazy_tlb(mm, tsk);
57 task_unlock(tsk);
58}
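
use_mm()/unuse_mm() were previously private to fs/aio.c; moving them into mm/ lets any kernel thread temporarily adopt a user address space. A hedged sketch of the calling pattern such a thread might follow (the helper name and buffer are illustrative, not part of this patch; the caller is assumed to already hold a reference on mm, e.g. from get_task_mm()):

#include <linux/sched.h>
#include <linux/mmu_context.h>
#include <asm/uaccess.h>

/* Sketch only: a kernel thread copying a result into a user buffer. */
static int example_push_result(struct mm_struct *mm, void __user *ubuf,
			       const void *kbuf, size_t len)
{
	int ret = 0;

	use_mm(mm);			/* adopt the issuer's address space */
	if (copy_to_user(ubuf, kbuf, len))
		ret = -EFAULT;
	unuse_mm(mm);			/* detach; thread returns to a lazy mm */
	return ret;
}
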
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5f4ef0250bee..7e33f2cb3c77 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -99,6 +99,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
99 return young; 99 return young;
100} 100}
101 101
102void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
103 pte_t pte)
104{
105 struct mmu_notifier *mn;
106 struct hlist_node *n;
107
108 rcu_read_lock();
109 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
110 if (mn->ops->change_pte)
111 mn->ops->change_pte(mn, mm, address, pte);
112 /*
113 * Some drivers don't have change_pte,
114 * so we must call invalidate_page in that case.
115 */
116 else if (mn->ops->invalidate_page)
117 mn->ops->invalidate_page(mn, mm, address);
118 }
119 rcu_read_unlock();
120}
121
102void __mmu_notifier_invalidate_page(struct mm_struct *mm, 122void __mmu_notifier_invalidate_page(struct mm_struct *mm,
103 unsigned long address) 123 unsigned long address)
104{ 124{
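
The new change_pte() callback lets KSM tell a secondary MMU (KVM being the intended user) that a pte now points at a different page with identical content, instead of forcing a full invalidate/refault cycle. A hedged sketch of what a registering driver's ops table might look like (all names illustrative):

#include <linux/mmu_notifier.h>

static void example_change_pte(struct mmu_notifier *mn, struct mm_struct *mm,
			       unsigned long address, pte_t pte)
{
	/* repoint the secondary mapping of 'address' at the new pte */
}

static void example_invalidate_page(struct mmu_notifier *mn,
				    struct mm_struct *mm, unsigned long address)
{
	/* fallback used when ->change_pte is not provided */
}

static const struct mmu_notifier_ops example_mmu_ops = {
	.change_pte	 = example_change_pte,
	.invalidate_page = example_invalidate_page,
};
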
diff --git a/mm/mprotect.c b/mm/mprotect.c
index d80311baeb2d..8bc969d8112d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,7 +23,7 @@
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h> 24#include <linux/mmu_notifier.h>
25#include <linux/migrate.h> 25#include <linux/migrate.h>
26#include <linux/perf_counter.h> 26#include <linux/perf_event.h>
27#include <asm/uaccess.h> 27#include <asm/uaccess.h>
28#include <asm/pgtable.h> 28#include <asm/pgtable.h>
29#include <asm/cacheflush.h> 29#include <asm/cacheflush.h>
@@ -300,7 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
300 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); 300 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
301 if (error) 301 if (error)
302 goto out; 302 goto out;
303 perf_counter_mmap(vma); 303 perf_event_mmap(vma);
304 nstart = tmp; 304 nstart = tmp;
305 305
306 if (nstart < prev->vm_end) 306 if (nstart < prev->vm_end)
diff --git a/mm/mremap.c b/mm/mremap.c
index a39b7b91be46..20a07dba6be0 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -11,6 +11,7 @@
11#include <linux/hugetlb.h> 11#include <linux/hugetlb.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/shm.h> 13#include <linux/shm.h>
14#include <linux/ksm.h>
14#include <linux/mman.h> 15#include <linux/mman.h>
15#include <linux/swap.h> 16#include <linux/swap.h>
16#include <linux/capability.h> 17#include <linux/capability.h>
@@ -174,6 +175,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
174 unsigned long excess = 0; 175 unsigned long excess = 0;
175 unsigned long hiwater_vm; 176 unsigned long hiwater_vm;
176 int split = 0; 177 int split = 0;
178 int err;
177 179
178 /* 180 /*
179 * We'd prefer to avoid failure later on in do_munmap: 181 * We'd prefer to avoid failure later on in do_munmap:
@@ -182,6 +184,18 @@ static unsigned long move_vma(struct vm_area_struct *vma,
182 if (mm->map_count >= sysctl_max_map_count - 3) 184 if (mm->map_count >= sysctl_max_map_count - 3)
183 return -ENOMEM; 185 return -ENOMEM;
184 186
187 /*
188 * Advise KSM to break any KSM pages in the area to be moved:
189 * it would be confusing if they were to turn up at the new
190 * location, where they happen to coincide with different KSM
191 * pages recently unmapped. But leave vma->vm_flags as it was,
192 * so KSM can come around to merge on vma and new_vma afterwards.
193 */
194 err = ksm_madvise(vma, old_addr, old_addr + old_len,
195 MADV_UNMERGEABLE, &vm_flags);
196 if (err)
197 return err;
198
185 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); 199 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
186 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); 200 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
187 if (!new_vma) 201 if (!new_vma)
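
ksm_madvise() above is the kernel-internal counterpart of the new madvise() advice values: before a VMA is moved, any merged pages in it are broken back out. From userspace, an application opts an area in or out of merging roughly as sketched below (assuming CONFIG_KSM and headers that define the new advice values):

#include <stddef.h>
#include <sys/mman.h>

#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE   12	/* values introduced by this patch series */
#define MADV_UNMERGEABLE 13
#endif

/* advise KSM that [addr, addr+len) may contain identical pages */
int make_mergeable(void *addr, size_t len)
{
	return madvise(addr, len, MADV_MERGEABLE);
}

/* break any merged (KSM) pages back out into private copies */
int make_unmergeable(void *addr, size_t len)
{
	return madvise(addr, len, MADV_UNMERGEABLE);
}
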
diff --git a/mm/nommu.c b/mm/nommu.c
index 66e81e7e9fe9..1a4473faac48 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -33,6 +33,7 @@
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
34#include <asm/tlb.h> 34#include <asm/tlb.h>
35#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
36#include <asm/mmu_context.h>
36#include "internal.h" 37#include "internal.h"
37 38
38static inline __attribute__((format(printf, 1, 2))) 39static inline __attribute__((format(printf, 1, 2)))
@@ -56,8 +57,6 @@ void no_printk(const char *fmt, ...)
56 no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) 57 no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
57#endif 58#endif
58 59
59#include "internal.h"
60
61void *high_memory; 60void *high_memory;
62struct page *mem_map; 61struct page *mem_map;
63unsigned long max_mapnr; 62unsigned long max_mapnr;
@@ -170,21 +169,20 @@ unsigned int kobjsize(const void *objp)
170} 169}
171 170
172int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 171int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
173 unsigned long start, int nr_pages, int flags, 172 unsigned long start, int nr_pages, int foll_flags,
174 struct page **pages, struct vm_area_struct **vmas) 173 struct page **pages, struct vm_area_struct **vmas)
175{ 174{
176 struct vm_area_struct *vma; 175 struct vm_area_struct *vma;
177 unsigned long vm_flags; 176 unsigned long vm_flags;
178 int i; 177 int i;
179 int write = !!(flags & GUP_FLAGS_WRITE);
180 int force = !!(flags & GUP_FLAGS_FORCE);
181 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
182 178
183 /* calculate required read or write permissions. 179 /* calculate required read or write permissions.
184 * - if 'force' is set, we only require the "MAY" flags. 180 * If FOLL_FORCE is set, we only require the "MAY" flags.
185 */ 181 */
186 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 182 vm_flags = (foll_flags & FOLL_WRITE) ?
187 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 183 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
184 vm_flags &= (foll_flags & FOLL_FORCE) ?
185 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
188 186
189 for (i = 0; i < nr_pages; i++) { 187 for (i = 0; i < nr_pages; i++) {
190 vma = find_vma(mm, start); 188 vma = find_vma(mm, start);
@@ -192,8 +190,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
192 goto finish_or_fault; 190 goto finish_or_fault;
193 191
194 /* protect what we can, including chardevs */ 192 /* protect what we can, including chardevs */
195 if (vma->vm_flags & (VM_IO | VM_PFNMAP) || 193 if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
196 (!ignore && !(vm_flags & vma->vm_flags))) 194 !(vm_flags & vma->vm_flags))
197 goto finish_or_fault; 195 goto finish_or_fault;
198 196
199 if (pages) { 197 if (pages) {
@@ -212,7 +210,6 @@ finish_or_fault:
212 return i ? : -EFAULT; 210 return i ? : -EFAULT;
213} 211}
214 212
215
216/* 213/*
217 * get a list of pages in an address range belonging to the specified process 214 * get a list of pages in an address range belonging to the specified process
218 * and indicate the VMA that covers each page 215 * and indicate the VMA that covers each page
@@ -227,9 +224,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
227 int flags = 0; 224 int flags = 0;
228 225
229 if (write) 226 if (write)
230 flags |= GUP_FLAGS_WRITE; 227 flags |= FOLL_WRITE;
231 if (force) 228 if (force)
232 flags |= GUP_FLAGS_FORCE; 229 flags |= FOLL_FORCE;
233 230
234 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 231 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
235} 232}
@@ -627,6 +624,22 @@ static void put_nommu_region(struct vm_region *region)
627} 624}
628 625
629/* 626/*
627 * update protection on a vma
628 */
629static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
630{
631#ifdef CONFIG_MPU
632 struct mm_struct *mm = vma->vm_mm;
633 long start = vma->vm_start & PAGE_MASK;
634 while (start < vma->vm_end) {
635 protect_page(mm, start, flags);
636 start += PAGE_SIZE;
637 }
638 update_protections(mm);
639#endif
640}
641
642/*
630 * add a VMA into a process's mm_struct in the appropriate place in the list 643 * add a VMA into a process's mm_struct in the appropriate place in the list
631 * and tree and add to the address space's page tree also if not an anonymous 644 * and tree and add to the address space's page tree also if not an anonymous
632 * page 645 * page
@@ -645,6 +658,8 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
645 mm->map_count++; 658 mm->map_count++;
646 vma->vm_mm = mm; 659 vma->vm_mm = mm;
647 660
661 protect_vma(vma, vma->vm_flags);
662
648 /* add the VMA to the mapping */ 663 /* add the VMA to the mapping */
649 if (vma->vm_file) { 664 if (vma->vm_file) {
650 mapping = vma->vm_file->f_mapping; 665 mapping = vma->vm_file->f_mapping;
@@ -707,6 +722,8 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
707 722
708 kenter("%p", vma); 723 kenter("%p", vma);
709 724
725 protect_vma(vma, 0);
726
710 mm->map_count--; 727 mm->map_count--;
711 if (mm->mmap_cache == vma) 728 if (mm->mmap_cache == vma)
712 mm->mmap_cache = NULL; 729 mm->mmap_cache = NULL;
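
The nommu __get_user_pages() now takes FOLL_* flags directly rather than the old GUP_FLAGS_* values; the public get_user_pages() wrapper still accepts write/force integers and translates them. A hedged sketch of a caller pinning a single page for writing, with error handling trimmed (the helper name is illustrative):

#include <linux/mm.h>
#include <linux/sched.h>

/* Sketch only: pin one user page at 'addr' for writing. */
static struct page *example_pin_page(unsigned long addr)
{
	struct page *page;
	int ret;

	down_read(&current->mm->mmap_sem);
	ret = get_user_pages(current, current->mm, addr, 1,
			     1 /* write */, 0 /* force */, &page, NULL);
	up_read(&current->mm->mmap_sem);

	/* caller must put_page(page) when finished with it */
	return ret == 1 ? page : NULL;
}
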
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a7b2460e922b..ea2147dabba6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -34,6 +34,23 @@ int sysctl_oom_dump_tasks;
34static DEFINE_SPINLOCK(zone_scan_lock); 34static DEFINE_SPINLOCK(zone_scan_lock);
35/* #define DEBUG */ 35/* #define DEBUG */
36 36
37/*
 38 * Does the mems_allowed of any thread of the target process intersect ours?
39 */
40static int has_intersects_mems_allowed(struct task_struct *tsk)
41{
42 struct task_struct *t;
43
44 t = tsk;
45 do {
46 if (cpuset_mems_allowed_intersects(current, t))
47 return 1;
48 t = next_thread(t);
49 } while (t != tsk);
50
51 return 0;
52}
53
37/** 54/**
38 * badness - calculate a numeric value for how bad this task has been 55 * badness - calculate a numeric value for how bad this task has been
39 * @p: task struct of which task we should calculate 56 * @p: task struct of which task we should calculate
@@ -58,6 +75,13 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 unsigned long points, cpu_time, run_time; 75 unsigned long points, cpu_time, run_time;
59 struct mm_struct *mm; 76 struct mm_struct *mm;
60 struct task_struct *child; 77 struct task_struct *child;
78 int oom_adj = p->signal->oom_adj;
79 struct task_cputime task_time;
80 unsigned long utime;
81 unsigned long stime;
82
83 if (oom_adj == OOM_DISABLE)
84 return 0;
61 85
62 task_lock(p); 86 task_lock(p);
63 mm = p->mm; 87 mm = p->mm;
@@ -79,7 +103,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
79 /* 103 /*
80 * swapoff can easily use up all memory, so kill those first. 104 * swapoff can easily use up all memory, so kill those first.
81 */ 105 */
82 if (p->flags & PF_SWAPOFF) 106 if (p->flags & PF_OOM_ORIGIN)
83 return ULONG_MAX; 107 return ULONG_MAX;
84 108
85 /* 109 /*
@@ -102,8 +126,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
102 * of seconds. There is no particular reason for this other than 126 * of seconds. There is no particular reason for this other than
103 * that it turned out to work very well in practice. 127 * that it turned out to work very well in practice.
104 */ 128 */
105 cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime)) 129 thread_group_cputime(p, &task_time);
106 >> (SHIFT_HZ + 3); 130 utime = cputime_to_jiffies(task_time.utime);
131 stime = cputime_to_jiffies(task_time.stime);
132 cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
133
107 134
108 if (uptime >= p->start_time.tv_sec) 135 if (uptime >= p->start_time.tv_sec)
109 run_time = (uptime - p->start_time.tv_sec) >> 10; 136 run_time = (uptime - p->start_time.tv_sec) >> 10;
@@ -144,19 +171,19 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
144 * because p may have allocated or otherwise mapped memory on 171 * because p may have allocated or otherwise mapped memory on
145 * this node before. However it will be less likely. 172 * this node before. However it will be less likely.
146 */ 173 */
147 if (!cpuset_mems_allowed_intersects(current, p)) 174 if (!has_intersects_mems_allowed(p))
148 points /= 8; 175 points /= 8;
149 176
150 /* 177 /*
151 * Adjust the score by oomkilladj. 178 * Adjust the score by oom_adj.
152 */ 179 */
153 if (p->oomkilladj) { 180 if (oom_adj) {
154 if (p->oomkilladj > 0) { 181 if (oom_adj > 0) {
155 if (!points) 182 if (!points)
156 points = 1; 183 points = 1;
157 points <<= p->oomkilladj; 184 points <<= oom_adj;
158 } else 185 } else
159 points >>= -(p->oomkilladj); 186 points >>= -(oom_adj);
160 } 187 }
161 188
162#ifdef DEBUG 189#ifdef DEBUG
@@ -200,13 +227,13 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
200static struct task_struct *select_bad_process(unsigned long *ppoints, 227static struct task_struct *select_bad_process(unsigned long *ppoints,
201 struct mem_cgroup *mem) 228 struct mem_cgroup *mem)
202{ 229{
203 struct task_struct *g, *p; 230 struct task_struct *p;
204 struct task_struct *chosen = NULL; 231 struct task_struct *chosen = NULL;
205 struct timespec uptime; 232 struct timespec uptime;
206 *ppoints = 0; 233 *ppoints = 0;
207 234
208 do_posix_clock_monotonic_gettime(&uptime); 235 do_posix_clock_monotonic_gettime(&uptime);
209 do_each_thread(g, p) { 236 for_each_process(p) {
210 unsigned long points; 237 unsigned long points;
211 238
212 /* 239 /*
@@ -251,7 +278,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
251 *ppoints = ULONG_MAX; 278 *ppoints = ULONG_MAX;
252 } 279 }
253 280
254 if (p->oomkilladj == OOM_DISABLE) 281 if (p->signal->oom_adj == OOM_DISABLE)
255 continue; 282 continue;
256 283
257 points = badness(p, uptime.tv_sec); 284 points = badness(p, uptime.tv_sec);
@@ -259,7 +286,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
259 chosen = p; 286 chosen = p;
260 *ppoints = points; 287 *ppoints = points;
261 } 288 }
262 } while_each_thread(g, p); 289 }
263 290
264 return chosen; 291 return chosen;
265} 292}
@@ -304,7 +331,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
304 } 331 }
305 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 332 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
306 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, 333 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
307 get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, 334 get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
308 p->comm); 335 p->comm);
309 task_unlock(p); 336 task_unlock(p);
310 } while_each_thread(g, p); 337 } while_each_thread(g, p);
@@ -346,11 +373,6 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
346 373
347static int oom_kill_task(struct task_struct *p) 374static int oom_kill_task(struct task_struct *p)
348{ 375{
349 struct mm_struct *mm;
350 struct task_struct *g, *q;
351
352 mm = p->mm;
353
354 /* WARNING: mm may not be dereferenced since we did not obtain its 376 /* WARNING: mm may not be dereferenced since we did not obtain its
355 * value from get_task_mm(p). This is OK since all we need to do is 377 * value from get_task_mm(p). This is OK since all we need to do is
356 * compare mm to q->mm below. 378 * compare mm to q->mm below.
@@ -359,30 +381,11 @@ static int oom_kill_task(struct task_struct *p)
359 * change to NULL at any time since we do not hold task_lock(p). 381 * change to NULL at any time since we do not hold task_lock(p).
360 * However, this is of no concern to us. 382 * However, this is of no concern to us.
361 */ 383 */
362 384 if (!p->mm || p->signal->oom_adj == OOM_DISABLE)
363 if (mm == NULL)
364 return 1; 385 return 1;
365 386
366 /*
367 * Don't kill the process if any threads are set to OOM_DISABLE
368 */
369 do_each_thread(g, q) {
370 if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
371 return 1;
372 } while_each_thread(g, q);
373
374 __oom_kill_task(p, 1); 387 __oom_kill_task(p, 1);
375 388
376 /*
377 * kill all processes that share the ->mm (i.e. all threads),
378 * but are in a different thread group. Don't let them have access
379 * to memory reserves though, otherwise we might deplete all memory.
380 */
381 do_each_thread(g, q) {
382 if (q->mm == mm && !same_thread_group(q, p))
383 force_sig(SIGKILL, q);
384 } while_each_thread(g, q);
385
386 return 0; 389 return 0;
387} 390}
388 391
@@ -394,8 +397,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
394 397
395 if (printk_ratelimit()) { 398 if (printk_ratelimit()) {
396 printk(KERN_WARNING "%s invoked oom-killer: " 399 printk(KERN_WARNING "%s invoked oom-killer: "
397 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", 400 "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
398 current->comm, gfp_mask, order, current->oomkilladj); 401 current->comm, gfp_mask, order,
402 current->signal->oom_adj);
399 task_lock(current); 403 task_lock(current);
400 cpuset_print_task_mems_allowed(current); 404 cpuset_print_task_mems_allowed(current);
401 task_unlock(current); 405 task_unlock(current);
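
With oom_adj now living in signal_struct, the whole thread group shares one adjustment, and badness() applies it as a plain bit shift on the score. A small sketch of just that adjustment, with the surrounding heuristics omitted:

/* Sketch: how oom_adj scales a badness score.
 * oom_adj = +3 multiplies the score by 8, oom_adj = -2 divides it by 4,
 * and OOM_DISABLE (-17) makes the task ineligible before we get here.
 */
static unsigned long apply_oom_adj(unsigned long points, int oom_adj)
{
	if (!oom_adj)
		return points;
	if (oom_adj > 0)
		return (points ? points : 1) << oom_adj;
	return points >> -oom_adj;
}
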
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 1eea4fa0d410..5f378dd58802 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -380,7 +380,8 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
380 struct zone *z = 380 struct zone *z =
381 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; 381 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
382 382
383 x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); 383 x += zone_page_state(z, NR_FREE_PAGES) +
384 zone_reclaimable_pages(z);
384 } 385 }
385 /* 386 /*
386 * Make sure that the number of highmem pages is never larger 387 * Make sure that the number of highmem pages is never larger
@@ -404,7 +405,7 @@ unsigned long determine_dirtyable_memory(void)
404{ 405{
405 unsigned long x; 406 unsigned long x;
406 407
407 x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); 408 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
408 409
409 if (!vm_highmem_is_dirtyable) 410 if (!vm_highmem_is_dirtyable)
410 x -= highmem_dirtyable_memory(x); 411 x -= highmem_dirtyable_memory(x);
@@ -485,6 +486,7 @@ static void balance_dirty_pages(struct address_space *mapping)
485 unsigned long bdi_thresh; 486 unsigned long bdi_thresh;
486 unsigned long pages_written = 0; 487 unsigned long pages_written = 0;
487 unsigned long write_chunk = sync_writeback_pages(); 488 unsigned long write_chunk = sync_writeback_pages();
489 unsigned long pause = 1;
488 490
489 struct backing_dev_info *bdi = mapping->backing_dev_info; 491 struct backing_dev_info *bdi = mapping->backing_dev_info;
490 492
@@ -561,7 +563,15 @@ static void balance_dirty_pages(struct address_space *mapping)
561 if (pages_written >= write_chunk) 563 if (pages_written >= write_chunk)
562 break; /* We've done our duty */ 564 break; /* We've done our duty */
563 565
564 schedule_timeout(1); 566 schedule_timeout_interruptible(pause);
567
568 /*
569 * Increase the delay for each loop, up to our previous
570 * default of taking a 100ms nap.
571 */
572 pause <<= 1;
573 if (pause > HZ / 10)
574 pause = HZ / 10;
565 } 575 }
566 576
567 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 577 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
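
The throttling loop now backs off exponentially instead of napping one jiffy per pass: the sleep doubles each time round (1, 2, 4, ... jiffies) and is capped at HZ/10, i.e. the previous fixed 100ms nap. A minimal sketch of that backoff in isolation; still_over_dirty_limits() is a hypothetical stand-in for the real dirty-threshold checks, not a kernel function:

#include <linux/sched.h>

static void example_throttle(void)
{
	unsigned long pause = 1;

	while (still_over_dirty_limits()) {	/* hypothetical predicate */
		schedule_timeout_interruptible(pause);
		pause <<= 1;			/* 1, 2, 4, ... jiffies */
		if (pause > HZ / 10)
			pause = HZ / 10;	/* cap at the old 100ms nap */
	}
}
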
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a0de15f46987..5717f27a0704 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
48#include <linux/page_cgroup.h> 48#include <linux/page_cgroup.h>
49#include <linux/debugobjects.h> 49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h> 50#include <linux/kmemleak.h>
51#include <trace/events/kmem.h>
51 52
52#include <asm/tlbflush.h> 53#include <asm/tlbflush.h>
53#include <asm/div64.h> 54#include <asm/div64.h>
@@ -71,7 +72,6 @@ EXPORT_SYMBOL(node_states);
71 72
72unsigned long totalram_pages __read_mostly; 73unsigned long totalram_pages __read_mostly;
73unsigned long totalreserve_pages __read_mostly; 74unsigned long totalreserve_pages __read_mostly;
74unsigned long highest_memmap_pfn __read_mostly;
75int percpu_pagelist_fraction; 75int percpu_pagelist_fraction;
76gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 76gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
77 77
@@ -123,8 +123,8 @@ static char * const zone_names[MAX_NR_ZONES] = {
123 123
124int min_free_kbytes = 1024; 124int min_free_kbytes = 1024;
125 125
126unsigned long __meminitdata nr_kernel_pages; 126static unsigned long __meminitdata nr_kernel_pages;
127unsigned long __meminitdata nr_all_pages; 127static unsigned long __meminitdata nr_all_pages;
128static unsigned long __meminitdata dma_reserve; 128static unsigned long __meminitdata dma_reserve;
129 129
130#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 130#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
@@ -510,7 +510,7 @@ static inline int free_pages_check(struct page *page)
510} 510}
511 511
512/* 512/*
513 * Frees a list of pages. 513 * Frees a number of pages from the PCP lists
514 * Assumes all pages on list are in same zone, and of same order. 514 * Assumes all pages on list are in same zone, and of same order.
515 * count is the number of pages to free. 515 * count is the number of pages to free.
516 * 516 *
@@ -520,22 +520,42 @@ static inline int free_pages_check(struct page *page)
520 * And clear the zone's pages_scanned counter, to hold off the "all pages are 520 * And clear the zone's pages_scanned counter, to hold off the "all pages are
521 * pinned" detection logic. 521 * pinned" detection logic.
522 */ 522 */
523static void free_pages_bulk(struct zone *zone, int count, 523static void free_pcppages_bulk(struct zone *zone, int count,
524 struct list_head *list, int order) 524 struct per_cpu_pages *pcp)
525{ 525{
526 int migratetype = 0;
527 int batch_free = 0;
528
526 spin_lock(&zone->lock); 529 spin_lock(&zone->lock);
527 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 530 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
528 zone->pages_scanned = 0; 531 zone->pages_scanned = 0;
529 532
530 __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); 533 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
531 while (count--) { 534 while (count) {
532 struct page *page; 535 struct page *page;
536 struct list_head *list;
533 537
534 VM_BUG_ON(list_empty(list)); 538 /*
535 page = list_entry(list->prev, struct page, lru); 539 * Remove pages from lists in a round-robin fashion. A
536 /* have to delete it as __free_one_page list manipulates */ 540 * batch_free count is maintained that is incremented when an
537 list_del(&page->lru); 541 * empty list is encountered. This is so more pages are freed
538 __free_one_page(page, zone, order, page_private(page)); 542 * off fuller lists instead of spinning excessively around empty
543 * lists
544 */
545 do {
546 batch_free++;
547 if (++migratetype == MIGRATE_PCPTYPES)
548 migratetype = 0;
549 list = &pcp->lists[migratetype];
550 } while (list_empty(list));
551
552 do {
553 page = list_entry(list->prev, struct page, lru);
554 /* must delete as __free_one_page list manipulates */
555 list_del(&page->lru);
556 __free_one_page(page, zone, 0, migratetype);
557 trace_mm_page_pcpu_drain(page, 0, migratetype);
558 } while (--count && --batch_free && !list_empty(list));
539 } 559 }
540 spin_unlock(&zone->lock); 560 spin_unlock(&zone->lock);
541} 561}
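
For illustration, with hypothetical list contents: suppose count is 6 and the pcp lists hold {unmovable: 0, reclaimable: 8, movable: 3}. The selection loop advances one list per outer pass, and each time it has to step over the empty unmovable list it leaves batch_free one higher, so the next non-empty list gives up two pages on that pass instead of one (count permitting). Over this drain the reclaimable list ends up supplying four of the six pages and the movable list the other two: fuller lists shed proportionally more pages, and the loop never spins repeatedly over an empty list.
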
@@ -557,7 +577,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
557 unsigned long flags; 577 unsigned long flags;
558 int i; 578 int i;
559 int bad = 0; 579 int bad = 0;
560 int wasMlocked = TestClearPageMlocked(page); 580 int wasMlocked = __TestClearPageMlocked(page);
561 581
562 kmemcheck_free_shadow(page, order); 582 kmemcheck_free_shadow(page, order);
563 583
@@ -783,6 +803,17 @@ static int move_freepages_block(struct zone *zone, struct page *page,
783 return move_freepages(zone, start_page, end_page, migratetype); 803 return move_freepages(zone, start_page, end_page, migratetype);
784} 804}
785 805
806static void change_pageblock_range(struct page *pageblock_page,
807 int start_order, int migratetype)
808{
809 int nr_pageblocks = 1 << (start_order - pageblock_order);
810
811 while (nr_pageblocks--) {
812 set_pageblock_migratetype(pageblock_page, migratetype);
813 pageblock_page += pageblock_nr_pages;
814 }
815}
816
786/* Remove an element from the buddy allocator from the fallback list */ 817/* Remove an element from the buddy allocator from the fallback list */
787static inline struct page * 818static inline struct page *
788__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 819__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
@@ -836,11 +867,16 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
836 list_del(&page->lru); 867 list_del(&page->lru);
837 rmv_page_order(page); 868 rmv_page_order(page);
838 869
839 if (current_order == pageblock_order) 870 /* Take ownership for orders >= pageblock_order */
840 set_pageblock_migratetype(page, 871 if (current_order >= pageblock_order)
872 change_pageblock_range(page, current_order,
841 start_migratetype); 873 start_migratetype);
842 874
843 expand(zone, page, order, current_order, area, migratetype); 875 expand(zone, page, order, current_order, area, migratetype);
876
877 trace_mm_page_alloc_extfrag(page, order, current_order,
878 start_migratetype, migratetype);
879
844 return page; 880 return page;
845 } 881 }
846 } 882 }
@@ -874,6 +910,7 @@ retry_reserve:
874 } 910 }
875 } 911 }
876 912
913 trace_mm_page_alloc_zone_locked(page, order, migratetype);
877 return page; 914 return page;
878} 915}
879 916
@@ -934,7 +971,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
934 to_drain = pcp->batch; 971 to_drain = pcp->batch;
935 else 972 else
936 to_drain = pcp->count; 973 to_drain = pcp->count;
937 free_pages_bulk(zone, to_drain, &pcp->list, 0); 974 free_pcppages_bulk(zone, to_drain, pcp);
938 pcp->count -= to_drain; 975 pcp->count -= to_drain;
939 local_irq_restore(flags); 976 local_irq_restore(flags);
940} 977}
@@ -960,7 +997,7 @@ static void drain_pages(unsigned int cpu)
960 997
961 pcp = &pset->pcp; 998 pcp = &pset->pcp;
962 local_irq_save(flags); 999 local_irq_save(flags);
963 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 1000 free_pcppages_bulk(zone, pcp->count, pcp);
964 pcp->count = 0; 1001 pcp->count = 0;
965 local_irq_restore(flags); 1002 local_irq_restore(flags);
966 } 1003 }
@@ -1026,7 +1063,8 @@ static void free_hot_cold_page(struct page *page, int cold)
1026 struct zone *zone = page_zone(page); 1063 struct zone *zone = page_zone(page);
1027 struct per_cpu_pages *pcp; 1064 struct per_cpu_pages *pcp;
1028 unsigned long flags; 1065 unsigned long flags;
1029 int wasMlocked = TestClearPageMlocked(page); 1066 int migratetype;
1067 int wasMlocked = __TestClearPageMlocked(page);
1030 1068
1031 kmemcheck_free_shadow(page, 0); 1069 kmemcheck_free_shadow(page, 0);
1032 1070
@@ -1043,35 +1081,49 @@ static void free_hot_cold_page(struct page *page, int cold)
1043 kernel_map_pages(page, 1, 0); 1081 kernel_map_pages(page, 1, 0);
1044 1082
1045 pcp = &zone_pcp(zone, get_cpu())->pcp; 1083 pcp = &zone_pcp(zone, get_cpu())->pcp;
1046 set_page_private(page, get_pageblock_migratetype(page)); 1084 migratetype = get_pageblock_migratetype(page);
1085 set_page_private(page, migratetype);
1047 local_irq_save(flags); 1086 local_irq_save(flags);
1048 if (unlikely(wasMlocked)) 1087 if (unlikely(wasMlocked))
1049 free_page_mlock(page); 1088 free_page_mlock(page);
1050 __count_vm_event(PGFREE); 1089 __count_vm_event(PGFREE);
1051 1090
1091 /*
1092 * We only track unmovable, reclaimable and movable on pcp lists.
1093 * Free ISOLATE pages back to the allocator because they are being
1094 * offlined but treat RESERVE as movable pages so we can get those
1095 * areas back if necessary. Otherwise, we may have to free
1096 * excessively into the page allocator
1097 */
1098 if (migratetype >= MIGRATE_PCPTYPES) {
1099 if (unlikely(migratetype == MIGRATE_ISOLATE)) {
1100 free_one_page(zone, page, 0, migratetype);
1101 goto out;
1102 }
1103 migratetype = MIGRATE_MOVABLE;
1104 }
1105
1052 if (cold) 1106 if (cold)
1053 list_add_tail(&page->lru, &pcp->list); 1107 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1054 else 1108 else
1055 list_add(&page->lru, &pcp->list); 1109 list_add(&page->lru, &pcp->lists[migratetype]);
1056 pcp->count++; 1110 pcp->count++;
1057 if (pcp->count >= pcp->high) { 1111 if (pcp->count >= pcp->high) {
1058 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1112 free_pcppages_bulk(zone, pcp->batch, pcp);
1059 pcp->count -= pcp->batch; 1113 pcp->count -= pcp->batch;
1060 } 1114 }
1115
1116out:
1061 local_irq_restore(flags); 1117 local_irq_restore(flags);
1062 put_cpu(); 1118 put_cpu();
1063} 1119}
1064 1120
1065void free_hot_page(struct page *page) 1121void free_hot_page(struct page *page)
1066{ 1122{
1123 trace_mm_page_free_direct(page, 0);
1067 free_hot_cold_page(page, 0); 1124 free_hot_cold_page(page, 0);
1068} 1125}
1069 1126
1070void free_cold_page(struct page *page)
1071{
1072 free_hot_cold_page(page, 1);
1073}
1074
1075/* 1127/*
1076 * split_page takes a non-compound higher-order page, and splits it into 1128 * split_page takes a non-compound higher-order page, and splits it into
1077 * n (1<<order) sub-pages: page[0..n] 1129 * n (1<<order) sub-pages: page[0..n]
@@ -1119,35 +1171,23 @@ again:
1119 cpu = get_cpu(); 1171 cpu = get_cpu();
1120 if (likely(order == 0)) { 1172 if (likely(order == 0)) {
1121 struct per_cpu_pages *pcp; 1173 struct per_cpu_pages *pcp;
1174 struct list_head *list;
1122 1175
1123 pcp = &zone_pcp(zone, cpu)->pcp; 1176 pcp = &zone_pcp(zone, cpu)->pcp;
1177 list = &pcp->lists[migratetype];
1124 local_irq_save(flags); 1178 local_irq_save(flags);
1125 if (!pcp->count) { 1179 if (list_empty(list)) {
1126 pcp->count = rmqueue_bulk(zone, 0, 1180 pcp->count += rmqueue_bulk(zone, 0,
1127 pcp->batch, &pcp->list, 1181 pcp->batch, list,
1128 migratetype, cold); 1182 migratetype, cold);
1129 if (unlikely(!pcp->count)) 1183 if (unlikely(list_empty(list)))
1130 goto failed; 1184 goto failed;
1131 } 1185 }
1132 1186
1133 /* Find a page of the appropriate migrate type */ 1187 if (cold)
1134 if (cold) { 1188 page = list_entry(list->prev, struct page, lru);
1135 list_for_each_entry_reverse(page, &pcp->list, lru) 1189 else
1136 if (page_private(page) == migratetype) 1190 page = list_entry(list->next, struct page, lru);
1137 break;
1138 } else {
1139 list_for_each_entry(page, &pcp->list, lru)
1140 if (page_private(page) == migratetype)
1141 break;
1142 }
1143
1144 /* Allocate more to the pcp list if necessary */
1145 if (unlikely(&page->lru == &pcp->list)) {
1146 pcp->count += rmqueue_bulk(zone, 0,
1147 pcp->batch, &pcp->list,
1148 migratetype, cold);
1149 page = list_entry(pcp->list.next, struct page, lru);
1150 }
1151 1191
1152 list_del(&page->lru); 1192 list_del(&page->lru);
1153 pcp->count--; 1193 pcp->count--;
@@ -1627,10 +1667,6 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1627 1667
1628 /* We now go into synchronous reclaim */ 1668 /* We now go into synchronous reclaim */
1629 cpuset_memory_pressure_bump(); 1669 cpuset_memory_pressure_bump();
1630
1631 /*
1632 * The task's cpuset might have expanded its set of allowable nodes
1633 */
1634 p->flags |= PF_MEMALLOC; 1670 p->flags |= PF_MEMALLOC;
1635 lockdep_set_current_reclaim_state(gfp_mask); 1671 lockdep_set_current_reclaim_state(gfp_mask);
1636 reclaim_state.reclaimed_slab = 0; 1672 reclaim_state.reclaimed_slab = 0;
@@ -1765,6 +1801,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1765 1801
1766 wake_all_kswapd(order, zonelist, high_zoneidx); 1802 wake_all_kswapd(order, zonelist, high_zoneidx);
1767 1803
1804restart:
1768 /* 1805 /*
1769 * OK, we're below the kswapd watermark and have kicked background 1806 * OK, we're below the kswapd watermark and have kicked background
1770 * reclaim. Now things get more complex, so set up alloc_flags according 1807 * reclaim. Now things get more complex, so set up alloc_flags according
@@ -1772,7 +1809,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1772 */ 1809 */
1773 alloc_flags = gfp_to_alloc_flags(gfp_mask); 1810 alloc_flags = gfp_to_alloc_flags(gfp_mask);
1774 1811
1775restart:
1776 /* This is the last chance, in general, before the goto nopage. */ 1812 /* This is the last chance, in general, before the goto nopage. */
1777 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 1813 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1778 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 1814 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -1907,6 +1943,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1907 zonelist, high_zoneidx, nodemask, 1943 zonelist, high_zoneidx, nodemask,
1908 preferred_zone, migratetype); 1944 preferred_zone, migratetype);
1909 1945
1946 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
1910 return page; 1947 return page;
1911} 1948}
1912EXPORT_SYMBOL(__alloc_pages_nodemask); 1949EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -1916,44 +1953,41 @@ EXPORT_SYMBOL(__alloc_pages_nodemask);
1916 */ 1953 */
1917unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 1954unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1918{ 1955{
1919 struct page * page; 1956 struct page *page;
1957
1958 /*
1959 * __get_free_pages() returns a 32-bit address, which cannot represent
1960 * a highmem page
1961 */
1962 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1963
1920 page = alloc_pages(gfp_mask, order); 1964 page = alloc_pages(gfp_mask, order);
1921 if (!page) 1965 if (!page)
1922 return 0; 1966 return 0;
1923 return (unsigned long) page_address(page); 1967 return (unsigned long) page_address(page);
1924} 1968}
1925
1926EXPORT_SYMBOL(__get_free_pages); 1969EXPORT_SYMBOL(__get_free_pages);
1927 1970
1928unsigned long get_zeroed_page(gfp_t gfp_mask) 1971unsigned long get_zeroed_page(gfp_t gfp_mask)
1929{ 1972{
1930 struct page * page; 1973 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
1931
1932 /*
1933 * get_zeroed_page() returns a 32-bit address, which cannot represent
1934 * a highmem page
1935 */
1936 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1937
1938 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1939 if (page)
1940 return (unsigned long) page_address(page);
1941 return 0;
1942} 1974}
1943
1944EXPORT_SYMBOL(get_zeroed_page); 1975EXPORT_SYMBOL(get_zeroed_page);
1945 1976
1946void __pagevec_free(struct pagevec *pvec) 1977void __pagevec_free(struct pagevec *pvec)
1947{ 1978{
1948 int i = pagevec_count(pvec); 1979 int i = pagevec_count(pvec);
1949 1980
1950 while (--i >= 0) 1981 while (--i >= 0) {
1982 trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
1951 free_hot_cold_page(pvec->pages[i], pvec->cold); 1983 free_hot_cold_page(pvec->pages[i], pvec->cold);
1984 }
1952} 1985}
1953 1986
1954void __free_pages(struct page *page, unsigned int order) 1987void __free_pages(struct page *page, unsigned int order)
1955{ 1988{
1956 if (put_page_testzero(page)) { 1989 if (put_page_testzero(page)) {
1990 trace_mm_page_free_direct(page, order);
1957 if (order == 0) 1991 if (order == 0)
1958 free_hot_page(page); 1992 free_hot_page(page);
1959 else 1993 else
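
After this cleanup, get_zeroed_page() simply delegates to __get_free_pages() with __GFP_ZERO, and the "no highmem through a 32-bit address" sanity check lives in one place. Callers are unaffected; a trivial hedged sketch of typical usage:

#include <linux/gfp.h>
#include <linux/errno.h>

static int example_scratch_page(void)
{
	unsigned long buf = get_zeroed_page(GFP_KERNEL);	/* zeroed lowmem page */

	if (!buf)
		return -ENOMEM;
	/* use the page through its kernel virtual address ... */
	free_page(buf);
	return 0;
}
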
@@ -2128,23 +2162,28 @@ void show_free_areas(void)
2128 } 2162 }
2129 } 2163 }
2130 2164
2131 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" 2165 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
2132 " inactive_file:%lu" 2166 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
2133 " unevictable:%lu" 2167 " unevictable:%lu"
2134 " dirty:%lu writeback:%lu unstable:%lu\n" 2168 " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n"
2135 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 2169 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2170 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
2136 global_page_state(NR_ACTIVE_ANON), 2171 global_page_state(NR_ACTIVE_ANON),
2137 global_page_state(NR_ACTIVE_FILE),
2138 global_page_state(NR_INACTIVE_ANON), 2172 global_page_state(NR_INACTIVE_ANON),
2173 global_page_state(NR_ISOLATED_ANON),
2174 global_page_state(NR_ACTIVE_FILE),
2139 global_page_state(NR_INACTIVE_FILE), 2175 global_page_state(NR_INACTIVE_FILE),
2176 global_page_state(NR_ISOLATED_FILE),
2140 global_page_state(NR_UNEVICTABLE), 2177 global_page_state(NR_UNEVICTABLE),
2141 global_page_state(NR_FILE_DIRTY), 2178 global_page_state(NR_FILE_DIRTY),
2142 global_page_state(NR_WRITEBACK), 2179 global_page_state(NR_WRITEBACK),
2143 global_page_state(NR_UNSTABLE_NFS), 2180 global_page_state(NR_UNSTABLE_NFS),
2181 nr_blockdev_pages(),
2144 global_page_state(NR_FREE_PAGES), 2182 global_page_state(NR_FREE_PAGES),
2145 global_page_state(NR_SLAB_RECLAIMABLE) + 2183 global_page_state(NR_SLAB_RECLAIMABLE),
2146 global_page_state(NR_SLAB_UNRECLAIMABLE), 2184 global_page_state(NR_SLAB_UNRECLAIMABLE),
2147 global_page_state(NR_FILE_MAPPED), 2185 global_page_state(NR_FILE_MAPPED),
2186 global_page_state(NR_SHMEM),
2148 global_page_state(NR_PAGETABLE), 2187 global_page_state(NR_PAGETABLE),
2149 global_page_state(NR_BOUNCE)); 2188 global_page_state(NR_BOUNCE));
2150 2189
@@ -2162,7 +2201,21 @@ void show_free_areas(void)
2162 " active_file:%lukB" 2201 " active_file:%lukB"
2163 " inactive_file:%lukB" 2202 " inactive_file:%lukB"
2164 " unevictable:%lukB" 2203 " unevictable:%lukB"
2204 " isolated(anon):%lukB"
2205 " isolated(file):%lukB"
2165 " present:%lukB" 2206 " present:%lukB"
2207 " mlocked:%lukB"
2208 " dirty:%lukB"
2209 " writeback:%lukB"
2210 " mapped:%lukB"
2211 " shmem:%lukB"
2212 " slab_reclaimable:%lukB"
2213 " slab_unreclaimable:%lukB"
2214 " kernel_stack:%lukB"
2215 " pagetables:%lukB"
2216 " unstable:%lukB"
2217 " bounce:%lukB"
2218 " writeback_tmp:%lukB"
2166 " pages_scanned:%lu" 2219 " pages_scanned:%lu"
2167 " all_unreclaimable? %s" 2220 " all_unreclaimable? %s"
2168 "\n", 2221 "\n",
@@ -2176,7 +2229,22 @@ void show_free_areas(void)
2176 K(zone_page_state(zone, NR_ACTIVE_FILE)), 2229 K(zone_page_state(zone, NR_ACTIVE_FILE)),
2177 K(zone_page_state(zone, NR_INACTIVE_FILE)), 2230 K(zone_page_state(zone, NR_INACTIVE_FILE)),
2178 K(zone_page_state(zone, NR_UNEVICTABLE)), 2231 K(zone_page_state(zone, NR_UNEVICTABLE)),
2232 K(zone_page_state(zone, NR_ISOLATED_ANON)),
2233 K(zone_page_state(zone, NR_ISOLATED_FILE)),
2179 K(zone->present_pages), 2234 K(zone->present_pages),
2235 K(zone_page_state(zone, NR_MLOCK)),
2236 K(zone_page_state(zone, NR_FILE_DIRTY)),
2237 K(zone_page_state(zone, NR_WRITEBACK)),
2238 K(zone_page_state(zone, NR_FILE_MAPPED)),
2239 K(zone_page_state(zone, NR_SHMEM)),
2240 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
2241 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
2242 zone_page_state(zone, NR_KERNEL_STACK) *
2243 THREAD_SIZE / 1024,
2244 K(zone_page_state(zone, NR_PAGETABLE)),
2245 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
2246 K(zone_page_state(zone, NR_BOUNCE)),
2247 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2180 zone->pages_scanned, 2248 zone->pages_scanned,
2181 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 2249 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
2182 ); 2250 );
@@ -2783,7 +2851,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2783{ 2851{
2784 unsigned long start_pfn, pfn, end_pfn; 2852 unsigned long start_pfn, pfn, end_pfn;
2785 struct page *page; 2853 struct page *page;
2786 unsigned long reserve, block_migratetype; 2854 unsigned long block_migratetype;
2855 int reserve;
2787 2856
2788 /* Get the start pfn, end pfn and the number of blocks to reserve */ 2857 /* Get the start pfn, end pfn and the number of blocks to reserve */
2789 start_pfn = zone->zone_start_pfn; 2858 start_pfn = zone->zone_start_pfn;
@@ -2791,6 +2860,15 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2791 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 2860 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
2792 pageblock_order; 2861 pageblock_order;
2793 2862
2863 /*
2864 * Reserve blocks are generally in place to help high-order atomic
2865 * allocations that are short-lived. A min_free_kbytes value that
2866 * would result in more than 2 reserve blocks for atomic allocations
2867 * is assumed to be in place to help anti-fragmentation for the
2868 * future allocation of hugepages at runtime.
2869 */
2870 reserve = min(2, reserve);
2871
2794 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 2872 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
2795 if (!pfn_valid(pfn)) 2873 if (!pfn_valid(pfn))
2796 continue; 2874 continue;
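
A worked example with hypothetical numbers: with 4KB pages and 4MB pageblocks (pageblock_nr_pages = 1024, pageblock_order = 10), a zone whose min watermark is 16384 pages would previously have reserved roundup(16384, 1024) >> 10 = 16 MIGRATE_RESERVE blocks. The new clamp caps that at 2, on the reasoning that a min_free_kbytes large enough to want more reserve blocks is really there for anti-fragmentation and future hugepage allocation, not for short-lived high-order atomic allocations.
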
@@ -2961,6 +3039,7 @@ static int zone_batchsize(struct zone *zone)
2961static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 3039static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2962{ 3040{
2963 struct per_cpu_pages *pcp; 3041 struct per_cpu_pages *pcp;
3042 int migratetype;
2964 3043
2965 memset(p, 0, sizeof(*p)); 3044 memset(p, 0, sizeof(*p));
2966 3045
@@ -2968,7 +3047,8 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2968 pcp->count = 0; 3047 pcp->count = 0;
2969 pcp->high = 6 * batch; 3048 pcp->high = 6 * batch;
2970 pcp->batch = max(1UL, 1 * batch); 3049 pcp->batch = max(1UL, 1 * batch);
2971 INIT_LIST_HEAD(&pcp->list); 3050 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
3051 INIT_LIST_HEAD(&pcp->lists[migratetype]);
2972} 3052}
2973 3053
2974/* 3054/*
@@ -3146,6 +3226,32 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3146 return 0; 3226 return 0;
3147} 3227}
3148 3228
3229static int __zone_pcp_update(void *data)
3230{
3231 struct zone *zone = data;
3232 int cpu;
3233 unsigned long batch = zone_batchsize(zone), flags;
3234
3235 for (cpu = 0; cpu < NR_CPUS; cpu++) {
3236 struct per_cpu_pageset *pset;
3237 struct per_cpu_pages *pcp;
3238
3239 pset = zone_pcp(zone, cpu);
3240 pcp = &pset->pcp;
3241
3242 local_irq_save(flags);
3243 free_pcppages_bulk(zone, pcp->count, pcp);
3244 setup_pageset(pset, batch);
3245 local_irq_restore(flags);
3246 }
3247 return 0;
3248}
3249
3250void zone_pcp_update(struct zone *zone)
3251{
3252 stop_machine(__zone_pcp_update, zone, NULL);
3253}
3254
3149static __meminit void zone_pcp_init(struct zone *zone) 3255static __meminit void zone_pcp_init(struct zone *zone)
3150{ 3256{
3151 int cpu; 3257 int cpu;
@@ -3720,7 +3826,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3720 zone_pcp_init(zone); 3826 zone_pcp_init(zone);
3721 for_each_lru(l) { 3827 for_each_lru(l) {
3722 INIT_LIST_HEAD(&zone->lru[l].list); 3828 INIT_LIST_HEAD(&zone->lru[l].list);
3723 zone->lru[l].nr_saved_scan = 0; 3829 zone->reclaim_stat.nr_saved_scan[l] = 0;
3724 } 3830 }
3725 zone->reclaim_stat.recent_rotated[0] = 0; 3831 zone->reclaim_stat.recent_rotated[0] = 0;
3726 zone->reclaim_stat.recent_rotated[1] = 0; 3832 zone->reclaim_stat.recent_rotated[1] = 0;
@@ -4509,7 +4615,7 @@ void setup_per_zone_wmarks(void)
4509 calculate_totalreserve_pages(); 4615 calculate_totalreserve_pages();
4510} 4616}
4511 4617
4512/** 4618/*
4513 * The inactive anon list should be small enough that the VM never has to 4619 * The inactive anon list should be small enough that the VM never has to
4514 * do too much work, but large enough that each inactive page has a chance 4620 * do too much work, but large enough that each inactive page has a chance
4515 * to be referenced again before it is swapped out. 4621 * to be referenced again before it is swapped out.
@@ -4732,7 +4838,14 @@ void *__init alloc_large_system_hash(const char *tablename,
4732 numentries <<= (PAGE_SHIFT - scale); 4838 numentries <<= (PAGE_SHIFT - scale);
4733 4839
4734 /* Make sure we've got at least a 0-order allocation.. */ 4840 /* Make sure we've got at least a 0-order allocation.. */
4735 if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 4841 if (unlikely(flags & HASH_SMALL)) {
4842 /* Makes no sense without HASH_EARLY */
4843 WARN_ON(!(flags & HASH_EARLY));
4844 if (!(numentries >> *_hash_shift)) {
4845 numentries = 1UL << *_hash_shift;
4846 BUG_ON(!numentries);
4847 }
4848 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
4736 numentries = PAGE_SIZE / bucketsize; 4849 numentries = PAGE_SIZE / bucketsize;
4737 } 4850 }
4738 numentries = roundup_pow_of_two(numentries); 4851 numentries = roundup_pow_of_two(numentries);
@@ -4874,13 +4987,16 @@ int set_migratetype_isolate(struct page *page)
4874 struct zone *zone; 4987 struct zone *zone;
4875 unsigned long flags; 4988 unsigned long flags;
4876 int ret = -EBUSY; 4989 int ret = -EBUSY;
4990 int zone_idx;
4877 4991
4878 zone = page_zone(page); 4992 zone = page_zone(page);
4993 zone_idx = zone_idx(zone);
4879 spin_lock_irqsave(&zone->lock, flags); 4994 spin_lock_irqsave(&zone->lock, flags);
4880 /* 4995 /*
4881 * In future, more migrate types will be able to be isolation target. 4996 * In future, more migrate types will be able to be isolation target.
4882 */ 4997 */
4883 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) 4998 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
4999 zone_idx != ZONE_MOVABLE)
4884 goto out; 5000 goto out;
4885 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5001 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
4886 move_freepages_block(zone, page, MIGRATE_ISOLATE); 5002 move_freepages_block(zone, page, MIGRATE_ISOLATE);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index f22b4ebbd8dc..3d535d594826 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -116,10 +116,16 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
116 nid = page_to_nid(pfn_to_page(pfn)); 116 nid = page_to_nid(pfn_to_page(pfn));
117 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 117 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
118 VM_BUG_ON(!slab_is_available()); 118 VM_BUG_ON(!slab_is_available());
119 base = kmalloc_node(table_size, 119 if (node_state(nid, N_HIGH_MEMORY)) {
120 base = kmalloc_node(table_size,
120 GFP_KERNEL | __GFP_NOWARN, nid); 121 GFP_KERNEL | __GFP_NOWARN, nid);
121 if (!base) 122 if (!base)
122 base = vmalloc_node(table_size, nid); 123 base = vmalloc_node(table_size, nid);
124 } else {
125 base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
126 if (!base)
127 base = vmalloc(table_size);
128 }
123 } else { 129 } else {
124 /* 130 /*
125 * We don't have to allocate page_cgroup again, but 131 * We don't have to allocate page_cgroup again, but
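
This is one of several places in the series (see also sparse-vmemmap.c and sparse.c below) that guard node-local allocation with node_state(nid, N_HIGH_MEMORY), so that memoryless nodes fall back to an unconstrained allocation instead of kmalloc_node() targeting a node with no memory. The pattern in isolation, as a hedged sketch with an illustrative helper name:

#include <linux/slab.h>
#include <linux/nodemask.h>
#include <linux/gfp.h>

/* Sketch: allocate near node 'nid' if it has memory, else anywhere. */
static void *example_alloc_near_node(size_t size, int nid)
{
	if (node_state(nid, N_HIGH_MEMORY))
		return kmalloc_node(size, GFP_KERNEL | __GFP_NOWARN, nid);
	return kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
}
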
diff --git a/mm/rmap.c b/mm/rmap.c
index 0895b5c7cbff..720fc03a7bc4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -710,27 +710,6 @@ void page_add_file_rmap(struct page *page)
710 } 710 }
711} 711}
712 712
713#ifdef CONFIG_DEBUG_VM
714/**
715 * page_dup_rmap - duplicate pte mapping to a page
716 * @page: the page to add the mapping to
717 * @vma: the vm area being duplicated
718 * @address: the user virtual address mapped
719 *
720 * For copy_page_range only: minimal extract from page_add_file_rmap /
721 * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
722 * quicker.
723 *
724 * The caller needs to hold the pte lock.
725 */
726void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
727{
728 if (PageAnon(page))
729 __page_check_anon_rmap(page, vma, address);
730 atomic_inc(&page->_mapcount);
731}
732#endif
733
734/** 713/**
735 * page_remove_rmap - take down pte mapping from a page 714 * page_remove_rmap - take down pte mapping from a page
736 * @page: page to remove mapping from 715 * @page: page to remove mapping from
@@ -739,34 +718,37 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long
739 */ 718 */
740void page_remove_rmap(struct page *page) 719void page_remove_rmap(struct page *page)
741{ 720{
742 if (atomic_add_negative(-1, &page->_mapcount)) { 721 /* page still mapped by someone else? */
743 /* 722 if (!atomic_add_negative(-1, &page->_mapcount))
744 * Now that the last pte has gone, s390 must transfer dirty 723 return;
745 * flag from storage key to struct page. We can usually skip 724
746 * this if the page is anon, so about to be freed; but perhaps 725 /*
747 * not if it's in swapcache - there might be another pte slot 726 * Now that the last pte has gone, s390 must transfer dirty
748 * containing the swap entry, but page not yet written to swap. 727 * flag from storage key to struct page. We can usually skip
749 */ 728 * this if the page is anon, so about to be freed; but perhaps
750 if ((!PageAnon(page) || PageSwapCache(page)) && 729 * not if it's in swapcache - there might be another pte slot
751 page_test_dirty(page)) { 730 * containing the swap entry, but page not yet written to swap.
752 page_clear_dirty(page); 731 */
753 set_page_dirty(page); 732 if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
754 } 733 page_clear_dirty(page);
755 if (PageAnon(page)) 734 set_page_dirty(page);
756 mem_cgroup_uncharge_page(page);
757 __dec_zone_page_state(page,
758 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
759 mem_cgroup_update_mapped_file_stat(page, -1);
760 /*
761 * It would be tidy to reset the PageAnon mapping here,
762 * but that might overwrite a racing page_add_anon_rmap
763 * which increments mapcount after us but sets mapping
764 * before us: so leave the reset to free_hot_cold_page,
765 * and remember that it's only reliable while mapped.
766 * Leaving it set also helps swapoff to reinstate ptes
767 * faster for those pages still in swapcache.
768 */
769 } 735 }
736 if (PageAnon(page)) {
737 mem_cgroup_uncharge_page(page);
738 __dec_zone_page_state(page, NR_ANON_PAGES);
739 } else {
740 __dec_zone_page_state(page, NR_FILE_MAPPED);
741 }
742 mem_cgroup_update_mapped_file_stat(page, -1);
743 /*
744 * It would be tidy to reset the PageAnon mapping here,
745 * but that might overwrite a racing page_add_anon_rmap
746 * which increments mapcount after us but sets mapping
747 * before us: so leave the reset to free_hot_cold_page,
748 * and remember that it's only reliable while mapped.
749 * Leaving it set also helps swapoff to reinstate ptes
750 * faster for those pages still in swapcache.
751 */
770} 752}
771 753
772/* 754/*
diff --git a/mm/shmem.c b/mm/shmem.c
index bd20f8bb02aa..b206a7a32e2a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -49,7 +49,6 @@ static struct vfsmount *shm_mnt;
49#include <linux/backing-dev.h> 49#include <linux/backing-dev.h>
50#include <linux/shmem_fs.h> 50#include <linux/shmem_fs.h>
51#include <linux/writeback.h> 51#include <linux/writeback.h>
52#include <linux/vfs.h>
53#include <linux/blkdev.h> 52#include <linux/blkdev.h>
54#include <linux/security.h> 53#include <linux/security.h>
55#include <linux/swapops.h> 54#include <linux/swapops.h>
@@ -1097,6 +1096,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1097 shmem_swp_unmap(entry); 1096 shmem_swp_unmap(entry);
1098unlock: 1097unlock:
1099 spin_unlock(&info->lock); 1098 spin_unlock(&info->lock);
1099 /*
1100 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
1101 * clear SWAP_HAS_CACHE flag.
1102 */
1100 swapcache_free(swap, NULL); 1103 swapcache_free(swap, NULL);
1101redirty: 1104redirty:
1102 set_page_dirty(page); 1105 set_page_dirty(page);
@@ -2306,17 +2309,14 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2306 int err = -ENOMEM; 2309 int err = -ENOMEM;
2307 2310
2308 /* Round up to L1_CACHE_BYTES to resist false sharing */ 2311 /* Round up to L1_CACHE_BYTES to resist false sharing */
2309 sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info), 2312 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
2310 L1_CACHE_BYTES), GFP_KERNEL); 2313 L1_CACHE_BYTES), GFP_KERNEL);
2311 if (!sbinfo) 2314 if (!sbinfo)
2312 return -ENOMEM; 2315 return -ENOMEM;
2313 2316
2314 sbinfo->max_blocks = 0;
2315 sbinfo->max_inodes = 0;
2316 sbinfo->mode = S_IRWXUGO | S_ISVTX; 2317 sbinfo->mode = S_IRWXUGO | S_ISVTX;
2317 sbinfo->uid = current_fsuid(); 2318 sbinfo->uid = current_fsuid();
2318 sbinfo->gid = current_fsgid(); 2319 sbinfo->gid = current_fsgid();
2319 sbinfo->mpol = NULL;
2320 sb->s_fs_info = sbinfo; 2320 sb->s_fs_info = sbinfo;
2321 2321
2322#ifdef CONFIG_TMPFS 2322#ifdef CONFIG_TMPFS
@@ -2590,6 +2590,11 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
2590 return 0; 2590 return 0;
2591} 2591}
2592 2592
2593int shmem_lock(struct file *file, int lock, struct user_struct *user)
2594{
2595 return 0;
2596}
2597
2593#define shmem_vm_ops generic_file_vm_ops 2598#define shmem_vm_ops generic_file_vm_ops
2594#define shmem_file_operations ramfs_file_operations 2599#define shmem_file_operations ramfs_file_operations
2595#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) 2600#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev)
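Aside on the shmem_fill_super() hunk above: switching kmalloc() to kzalloc() is what lets the explicit zeroing of max_blocks, max_inodes and mpol be dropped. A minimal userspace sketch of the same idea, with calloc() standing in for kzalloc() and a cut-down stand-in struct (not the real shmem_sb_info):

#include <stdio.h>
#include <stdlib.h>

struct sb_info {                 /* stand-in for struct shmem_sb_info */
	unsigned long max_blocks;
	unsigned long max_inodes;
	void *mpol;
};

int main(void)
{
	/* kzalloc(size, GFP_KERNEL) behaves like calloc(1, size) here. */
	struct sb_info *sbinfo = calloc(1, sizeof(*sbinfo));

	if (!sbinfo)
		return 1;

	/* Every field already reads as zero/NULL; only non-zero defaults
	 * (mode, uid, gid in the real code) still need to be set. */
	printf("%lu %lu %p\n", sbinfo->max_blocks, sbinfo->max_inodes, sbinfo->mpol);
	free(sbinfo);
	return 0;
}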
diff --git a/mm/slab.c b/mm/slab.c
index 7b5d4deacfcd..7dfa481c96ba 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1384,7 +1384,7 @@ void __init kmem_cache_init(void)
1384 * Fragmentation resistance on low memory - only use bigger 1384 * Fragmentation resistance on low memory - only use bigger
1385 * page orders on machines with more than 32MB of memory. 1385 * page orders on machines with more than 32MB of memory.
1386 */ 1386 */
1387 if (num_physpages > (32 << 20) >> PAGE_SHIFT) 1387 if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
1388 slab_break_gfp_order = BREAK_GFP_ORDER_HI; 1388 slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1389 1389
1390 /* Bootstrap is tricky, because several objects are allocated 1390 /* Bootstrap is tricky, because several objects are allocated
diff --git a/mm/slub.c b/mm/slub.c
index 0a216aae227e..4996fc719552 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3345,6 +3345,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3345{ 3345{
3346 struct kmem_cache *s; 3346 struct kmem_cache *s;
3347 3347
3348 if (WARN_ON(!name))
3349 return NULL;
3350
3348 down_write(&slub_lock); 3351 down_write(&slub_lock);
3349 s = find_mergeable(size, align, flags, name, ctor); 3352 s = find_mergeable(size, align, flags, name, ctor);
3350 if (s) { 3353 if (s) {
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a13ea6401ae7..d9714bdcb4a3 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -48,8 +48,14 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
48{ 48{
49 /* If the main allocator is up use that, fallback to bootmem. */ 49 /* If the main allocator is up use that, fallback to bootmem. */
50 if (slab_is_available()) { 50 if (slab_is_available()) {
51 struct page *page = alloc_pages_node(node, 51 struct page *page;
52
53 if (node_state(node, N_HIGH_MEMORY))
54 page = alloc_pages_node(node,
52 GFP_KERNEL | __GFP_ZERO, get_order(size)); 55 GFP_KERNEL | __GFP_ZERO, get_order(size));
56 else
57 page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
58 get_order(size));
53 if (page) 59 if (page)
54 return page_address(page); 60 return page_address(page);
55 return NULL; 61 return NULL;
diff --git a/mm/sparse.c b/mm/sparse.c
index da432d9f0ae8..6ce4aab69e99 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -62,9 +62,12 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
62 unsigned long array_size = SECTIONS_PER_ROOT * 62 unsigned long array_size = SECTIONS_PER_ROOT *
63 sizeof(struct mem_section); 63 sizeof(struct mem_section);
64 64
65 if (slab_is_available()) 65 if (slab_is_available()) {
66 section = kmalloc_node(array_size, GFP_KERNEL, nid); 66 if (node_state(nid, N_HIGH_MEMORY))
67 else 67 section = kmalloc_node(array_size, GFP_KERNEL, nid);
68 else
69 section = kmalloc(array_size, GFP_KERNEL);
70 } else
68 section = alloc_bootmem_node(NODE_DATA(nid), array_size); 71 section = alloc_bootmem_node(NODE_DATA(nid), array_size);
69 72
70 if (section) 73 if (section)
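The sparse-vmemmap.c and sparse.c hunks above share one pattern: only ask for node-local memory when the node actually has memory (node_state(nid, N_HIGH_MEMORY)), otherwise fall back to an unconstrained allocation. A rough userspace sketch of that decision, where node_has_memory(), alloc_on_node() and alloc_anywhere() are illustrative stand-ins, not kernel APIs:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

/* Stand-ins for node_state(nid, N_HIGH_MEMORY), alloc_pages_node(), alloc_pages(). */
static bool node_has_memory(int nid) { return nid == 0; }  /* pretend only node 0 has RAM */
static void *alloc_on_node(int nid, size_t size) { (void)nid; return malloc(size); }
static void *alloc_anywhere(size_t size) { return malloc(size); }

/* Prefer node-local memory, but never ask a memoryless node for pages. */
static void *alloc_pref_node(int nid, size_t size)
{
	if (node_has_memory(nid))
		return alloc_on_node(nid, size);
	return alloc_anywhere(size);
}

int main(void)
{
	void *a = alloc_pref_node(0, 4096);   /* node-local path */
	void *b = alloc_pref_node(1, 4096);   /* fallback path for a memoryless node */

	printf("%p %p\n", a, b);
	free(a);
	free(b);
	return 0;
}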
diff --git a/mm/swap.c b/mm/swap.c
index cb29ae5d33ab..308e57d8d7ed 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -118,7 +118,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
118 spin_lock(&zone->lru_lock); 118 spin_lock(&zone->lru_lock);
119 } 119 }
120 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 120 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
121 int lru = page_is_file_cache(page); 121 int lru = page_lru_base_type(page);
122 list_move_tail(&page->lru, &zone->lru[lru].list); 122 list_move_tail(&page->lru, &zone->lru[lru].list);
123 pgmoved++; 123 pgmoved++;
124 } 124 }
@@ -181,7 +181,7 @@ void activate_page(struct page *page)
181 spin_lock_irq(&zone->lru_lock); 181 spin_lock_irq(&zone->lru_lock);
182 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 182 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
183 int file = page_is_file_cache(page); 183 int file = page_is_file_cache(page);
184 int lru = LRU_BASE + file; 184 int lru = page_lru_base_type(page);
185 del_page_from_lru_list(zone, page, lru); 185 del_page_from_lru_list(zone, page, lru);
186 186
187 SetPageActive(page); 187 SetPageActive(page);
@@ -189,7 +189,7 @@ void activate_page(struct page *page)
189 add_page_to_lru_list(zone, page, lru); 189 add_page_to_lru_list(zone, page, lru);
190 __count_vm_event(PGACTIVATE); 190 __count_vm_event(PGACTIVATE);
191 191
192 update_page_reclaim_stat(zone, page, !!file, 1); 192 update_page_reclaim_stat(zone, page, file, 1);
193 } 193 }
194 spin_unlock_irq(&zone->lru_lock); 194 spin_unlock_irq(&zone->lru_lock);
195} 195}
@@ -496,7 +496,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
496 */ 496 */
497void __init swap_setup(void) 497void __init swap_setup(void)
498{ 498{
499 unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); 499 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
500 500
501#ifdef CONFIG_SWAP 501#ifdef CONFIG_SWAP
502 bdi_init(swapper_space.backing_dev_info); 502 bdi_init(swapper_space.backing_dev_info);
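The swap.c changes replace the open-coded "LRU_BASE + page_is_file_cache(page)" idiom with the page_lru_base_type() helper. Presumably the helper just selects the inactive anon or inactive file list for a page; the sketch below mirrors that with stub types (illustrative only, not the kernel's definitions):

#include <stdbool.h>
#include <stdio.h>

enum lru_list {
	LRU_INACTIVE_ANON,	/* == LRU_BASE */
	LRU_ACTIVE_ANON,
	LRU_INACTIVE_FILE,	/* == LRU_BASE + LRU_FILE */
	LRU_ACTIVE_FILE,
	LRU_UNEVICTABLE,
};

struct page { bool file_backed; };	/* stand-in for the real struct page */

static bool page_is_file_cache(const struct page *page)
{
	return page->file_backed;
}

/* Base (inactive) LRU list for a page: anon or file. */
static enum lru_list page_lru_base_type(const struct page *page)
{
	if (page_is_file_cache(page))
		return LRU_INACTIVE_FILE;
	return LRU_INACTIVE_ANON;
}

int main(void)
{
	struct page anon = { .file_backed = false }, file = { .file_backed = true };

	printf("anon -> %d, file -> %d\n",
	       page_lru_base_type(&anon), page_lru_base_type(&file));
	return 0;
}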
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5ae6b8b78c80..6d1daeb1cb4a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -67,10 +67,10 @@ void show_swap_cache_info(void)
67} 67}
68 68
69/* 69/*
70 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, 70 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
71 * but sets SwapCache flag and private instead of mapping and index. 71 * but sets SwapCache flag and private instead of mapping and index.
72 */ 72 */
73int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) 73static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
74{ 74{
75 int error; 75 int error;
76 76
@@ -78,28 +78,43 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
78 VM_BUG_ON(PageSwapCache(page)); 78 VM_BUG_ON(PageSwapCache(page));
79 VM_BUG_ON(!PageSwapBacked(page)); 79 VM_BUG_ON(!PageSwapBacked(page));
80 80
81 page_cache_get(page);
82 SetPageSwapCache(page);
83 set_page_private(page, entry.val);
84
85 spin_lock_irq(&swapper_space.tree_lock);
86 error = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
87 if (likely(!error)) {
88 total_swapcache_pages++;
89 __inc_zone_page_state(page, NR_FILE_PAGES);
90 INC_CACHE_INFO(add_total);
91 }
92 spin_unlock_irq(&swapper_space.tree_lock);
93
94 if (unlikely(error)) {
95 /*
96 * Only the context which has set the SWAP_HAS_CACHE flag
97 * would call add_to_swap_cache().
98 * So add_to_swap_cache() doesn't return -EEXIST.
99 */
100 VM_BUG_ON(error == -EEXIST);
101 set_page_private(page, 0UL);
102 ClearPageSwapCache(page);
103 page_cache_release(page);
104 }
105
106 return error;
107}
108
109
110int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
111{
112 int error;
113
81 error = radix_tree_preload(gfp_mask); 114 error = radix_tree_preload(gfp_mask);
82 if (!error) { 115 if (!error) {
83 page_cache_get(page); 116 error = __add_to_swap_cache(page, entry);
84 SetPageSwapCache(page);
85 set_page_private(page, entry.val);
86
87 spin_lock_irq(&swapper_space.tree_lock);
88 error = radix_tree_insert(&swapper_space.page_tree,
89 entry.val, page);
90 if (likely(!error)) {
91 total_swapcache_pages++;
92 __inc_zone_page_state(page, NR_FILE_PAGES);
93 INC_CACHE_INFO(add_total);
94 }
95 spin_unlock_irq(&swapper_space.tree_lock);
96 radix_tree_preload_end(); 117 radix_tree_preload_end();
97
98 if (unlikely(error)) {
99 set_page_private(page, 0UL);
100 ClearPageSwapCache(page);
101 page_cache_release(page);
102 }
103 } 118 }
104 return error; 119 return error;
105} 120}
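The hunk above splits add_to_swap_cache() into a locked insert (__add_to_swap_cache) plus a wrapper that preloads radix-tree nodes outside the tree lock; with SWAP_HAS_CACHE already claimed by the caller, the insert can no longer fail with -EEXIST, only -ENOMEM. A bare-bones userspace skeleton of the wrapper shape (mock helpers, control flow only):

#include <stdio.h>

/* Mock stand-ins for radix_tree_preload()/radix_tree_preload_end() and the
 * locked insert; they only mimic the control flow, not the real behaviour. */
static int preload_nodes(void)      { return 0; }   /* 0 on success, -ENOMEM on failure */
static void preload_end(void)       { }
static int locked_insert(long key)  { printf("insert %ld under lock\n", key); return 0; }

/* Wrapper: allocate tree nodes up front, then do the short locked insert. */
static int cache_add(long key)
{
	int error = preload_nodes();

	if (!error) {
		error = locked_insert(key);
		preload_end();
	}
	return error;
}

int main(void)
{
	return cache_add(42);
}

Keeping the preload in the wrapper also lets read_swap_cache_async() (further down in this file) call __add_to_swap_cache() directly after doing its own preload with its own gfp mask.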
@@ -137,38 +152,34 @@ int add_to_swap(struct page *page)
137 VM_BUG_ON(!PageLocked(page)); 152 VM_BUG_ON(!PageLocked(page));
138 VM_BUG_ON(!PageUptodate(page)); 153 VM_BUG_ON(!PageUptodate(page));
139 154
140 for (;;) { 155 entry = get_swap_page();
141 entry = get_swap_page(); 156 if (!entry.val)
142 if (!entry.val) 157 return 0;
143 return 0;
144 158
159 /*
160 * Radix-tree node allocations from PF_MEMALLOC contexts could
161 * completely exhaust the page allocator. __GFP_NOMEMALLOC
162 * stops emergency reserves from being allocated.
163 *
164 * TODO: this could cause a theoretical memory reclaim
165 * deadlock in the swap out path.
166 */
167 /*
168 * Add it to the swap cache and mark it dirty
169 */
170 err = add_to_swap_cache(page, entry,
171 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
172
173 if (!err) { /* Success */
174 SetPageDirty(page);
175 return 1;
176 } else { /* -ENOMEM radix-tree allocation failure */
145 /* 177 /*
146 * Radix-tree node allocations from PF_MEMALLOC contexts could 178 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
147 * completely exhaust the page allocator. __GFP_NOMEMALLOC 179 * clear SWAP_HAS_CACHE flag.
148 * stops emergency reserves from being allocated.
149 *
150 * TODO: this could cause a theoretical memory reclaim
151 * deadlock in the swap out path.
152 */
153 /*
154 * Add it to the swap cache and mark it dirty
155 */ 180 */
156 err = add_to_swap_cache(page, entry, 181 swapcache_free(entry, NULL);
157 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); 182 return 0;
158
159 switch (err) {
160 case 0: /* Success */
161 SetPageDirty(page);
162 return 1;
163 case -EEXIST:
164 /* Raced with "speculative" read_swap_cache_async */
165 swapcache_free(entry, NULL);
166 continue;
167 default:
168 /* -ENOMEM radix-tree allocation failure */
169 swapcache_free(entry, NULL);
170 return 0;
171 }
172 } 183 }
173} 184}
174 185
@@ -290,26 +301,31 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
290 } 301 }
291 302
292 /* 303 /*
304 * call radix_tree_preload() while we can wait.
305 */
306 err = radix_tree_preload(gfp_mask & GFP_KERNEL);
307 if (err)
308 break;
309
310 /*
293 * Swap entry may have been freed since our caller observed it. 311 * Swap entry may have been freed since our caller observed it.
294 */ 312 */
295 err = swapcache_prepare(entry); 313 err = swapcache_prepare(entry);
296 if (err == -EEXIST) /* seems racy */ 314 if (err == -EEXIST) { /* seems racy */
315 radix_tree_preload_end();
297 continue; 316 continue;
298 if (err) /* swp entry is obsolete ? */ 317 }
318 if (err) { /* swp entry is obsolete ? */
319 radix_tree_preload_end();
299 break; 320 break;
321 }
300 322
301 /* 323 /* May fail (-ENOMEM) if radix-tree node allocation failed. */
302 * Associate the page with swap entry in the swap cache.
303 * May fail (-EEXIST) if there is already a page associated
304 * with this entry in the swap cache: added by a racing
305 * read_swap_cache_async, or add_to_swap or shmem_writepage
306 * re-using the just freed swap entry for an existing page.
307 * May fail (-ENOMEM) if radix-tree node allocation failed.
308 */
309 __set_page_locked(new_page); 324 __set_page_locked(new_page);
310 SetPageSwapBacked(new_page); 325 SetPageSwapBacked(new_page);
311 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); 326 err = __add_to_swap_cache(new_page, entry);
312 if (likely(!err)) { 327 if (likely(!err)) {
328 radix_tree_preload_end();
313 /* 329 /*
314 * Initiate read into locked page and return. 330 * Initiate read into locked page and return.
315 */ 331 */
@@ -317,8 +333,13 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
317 swap_readpage(new_page); 333 swap_readpage(new_page);
318 return new_page; 334 return new_page;
319 } 335 }
336 radix_tree_preload_end();
320 ClearPageSwapBacked(new_page); 337 ClearPageSwapBacked(new_page);
321 __clear_page_locked(new_page); 338 __clear_page_locked(new_page);
339 /*
340 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
341 * clear SWAP_HAS_CACHE flag.
342 */
322 swapcache_free(entry, NULL); 343 swapcache_free(entry, NULL);
323 } while (err != -ENOMEM); 344 } while (err != -ENOMEM);
324 345
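For the read_swap_cache_async() hunks above, the key ordering change is that radix_tree_preload() now happens before swapcache_prepare(), and the preload is released on every exit path; the retry loop then only has to distinguish -EEXIST from swapcache_prepare() (retry) from -ENOMEM (give up). A compressed sketch of that control flow, with mock helpers in place of the real swap-cache calls and no real locking or page handling:

#include <errno.h>
#include <stdio.h>

static int preload(void)        { return 0; }   /* radix_tree_preload()     */
static void preload_end(void)   { }             /* radix_tree_preload_end() */
static int prepare_entry(void)  { return 0; }   /* swapcache_prepare()      */
static int insert_page(void)    { return 0; }   /* __add_to_swap_cache()    */
static void free_entry(void)    { }             /* swapcache_free()         */

static int read_async(void)
{
	int err;

	do {
		/* 1. Preload while we may still sleep. */
		err = preload();
		if (err)
			break;

		/* 2. Claim the swap entry (SWAP_HAS_CACHE). */
		err = prepare_entry();
		if (err == -EEXIST) {		/* raced with another adder */
			preload_end();
			continue;
		}
		if (err) {			/* entry went away */
			preload_end();
			break;
		}

		/* 3. Insert; cannot be -EEXIST, only -ENOMEM. */
		err = insert_page();
		preload_end();
		if (!err)
			return 0;		/* success: I/O would start here */

		free_entry();			/* drop SWAP_HAS_CACHE and give up */
	} while (err != -ENOMEM);

	return err;
}

int main(void)
{
	printf("read_async() -> %d\n", read_async());
	return 0;
}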
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 74f1102e8749..f1bf19daadc6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1575,9 +1575,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1575 p->flags &= ~SWP_WRITEOK; 1575 p->flags &= ~SWP_WRITEOK;
1576 spin_unlock(&swap_lock); 1576 spin_unlock(&swap_lock);
1577 1577
1578 current->flags |= PF_SWAPOFF; 1578 current->flags |= PF_OOM_ORIGIN;
1579 err = try_to_unuse(type); 1579 err = try_to_unuse(type);
1580 current->flags &= ~PF_SWAPOFF; 1580 current->flags &= ~PF_OOM_ORIGIN;
1581 1581
1582 if (err) { 1582 if (err) {
1583 /* re-insert swap space back into swap_list */ 1583 /* re-insert swap space back into swap_list */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 204b8243d8ab..5535da1d6961 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -25,7 +25,7 @@
25#include <linux/rcupdate.h> 25#include <linux/rcupdate.h>
26#include <linux/pfn.h> 26#include <linux/pfn.h>
27#include <linux/kmemleak.h> 27#include <linux/kmemleak.h>
28 28#include <linux/highmem.h>
29#include <asm/atomic.h> 29#include <asm/atomic.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
@@ -168,11 +168,9 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end,
168 next = pgd_addr_end(addr, end); 168 next = pgd_addr_end(addr, end);
169 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); 169 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
170 if (err) 170 if (err)
171 break; 171 return err;
172 } while (pgd++, addr = next, addr != end); 172 } while (pgd++, addr = next, addr != end);
173 173
174 if (unlikely(err))
175 return err;
176 return nr; 174 return nr;
177} 175}
178 176
@@ -1272,17 +1270,21 @@ struct vm_struct *remove_vm_area(const void *addr)
1272 if (va && va->flags & VM_VM_AREA) { 1270 if (va && va->flags & VM_VM_AREA) {
1273 struct vm_struct *vm = va->private; 1271 struct vm_struct *vm = va->private;
1274 struct vm_struct *tmp, **p; 1272 struct vm_struct *tmp, **p;
1275 1273 /*
1276 vmap_debug_free_range(va->va_start, va->va_end); 1274 * remove from list and disallow access to this vm_struct
1277 free_unmap_vmap_area(va); 1275 * before unmap. (address range confliction is maintained by
1278 vm->size -= PAGE_SIZE; 1276 * vmap.)
1279 1277 */
1280 write_lock(&vmlist_lock); 1278 write_lock(&vmlist_lock);
1281 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) 1279 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1282 ; 1280 ;
1283 *p = tmp->next; 1281 *p = tmp->next;
1284 write_unlock(&vmlist_lock); 1282 write_unlock(&vmlist_lock);
1285 1283
1284 vmap_debug_free_range(va->va_start, va->va_end);
1285 free_unmap_vmap_area(va);
1286 vm->size -= PAGE_SIZE;
1287
1286 return vm; 1288 return vm;
1287 } 1289 }
1288 return NULL; 1290 return NULL;
@@ -1384,7 +1386,7 @@ void *vmap(struct page **pages, unsigned int count,
1384 1386
1385 might_sleep(); 1387 might_sleep();
1386 1388
1387 if (count > num_physpages) 1389 if (count > totalram_pages)
1388 return NULL; 1390 return NULL;
1389 1391
1390 area = get_vm_area_caller((count << PAGE_SHIFT), flags, 1392 area = get_vm_area_caller((count << PAGE_SHIFT), flags,
@@ -1491,7 +1493,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1491 unsigned long real_size = size; 1493 unsigned long real_size = size;
1492 1494
1493 size = PAGE_ALIGN(size); 1495 size = PAGE_ALIGN(size);
1494 if (!size || (size >> PAGE_SHIFT) > num_physpages) 1496 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1495 return NULL; 1497 return NULL;
1496 1498
1497 area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, 1499 area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END,
@@ -1641,10 +1643,120 @@ void *vmalloc_32_user(unsigned long size)
1641} 1643}
1642EXPORT_SYMBOL(vmalloc_32_user); 1644EXPORT_SYMBOL(vmalloc_32_user);
1643 1645
1646/*
1647 * Small helper routine: copy contents from addr into buf.
1648 * If a page is not present, fill with zeroes.
1649 */
1650
1651static int aligned_vread(char *buf, char *addr, unsigned long count)
1652{
1653 struct page *p;
1654 int copied = 0;
1655
1656 while (count) {
1657 unsigned long offset, length;
1658
1659 offset = (unsigned long)addr & ~PAGE_MASK;
1660 length = PAGE_SIZE - offset;
1661 if (length > count)
1662 length = count;
1663 p = vmalloc_to_page(addr);
1664 /*
1665 * To do safe access to this _mapped_ area, we need
1666 * lock. But adding lock here means that we need to add
1667 * overhead of vmalloc()/vfree() calles for this _debug_
1668 * interface, rarely used. Instead of that, we'll use
1669 * kmap() and get small overhead in this access function.
1670 */
1671 if (p) {
1672 /*
1673 * we can expect USER0 is not used (see vread/vwrite's
1674 * function description)
1675 */
1676 void *map = kmap_atomic(p, KM_USER0);
1677 memcpy(buf, map + offset, length);
1678 kunmap_atomic(map, KM_USER0);
1679 } else
1680 memset(buf, 0, length);
1681
1682 addr += length;
1683 buf += length;
1684 copied += length;
1685 count -= length;
1686 }
1687 return copied;
1688}
1689
1690static int aligned_vwrite(char *buf, char *addr, unsigned long count)
1691{
1692 struct page *p;
1693 int copied = 0;
1694
1695 while (count) {
1696 unsigned long offset, length;
1697
1698 offset = (unsigned long)addr & ~PAGE_MASK;
1699 length = PAGE_SIZE - offset;
1700 if (length > count)
1701 length = count;
1702 p = vmalloc_to_page(addr);
1703 /*
1704 * To do safe access to this _mapped_ area, we need a
1705 * lock. But adding a lock here would add the overhead of
1706 * vmalloc()/vfree() calls to this _debug_ interface, which
1707 * is rarely used. Instead, we use kmap() and accept a small
1708 * overhead in this access function.
1709 */
1710 if (p) {
1711 /*
1712 * we can expect USER0 is not used (see vread/vwrite's
1713 * function description)
1714 */
1715 void *map = kmap_atomic(p, KM_USER0);
1716 memcpy(map + offset, buf, length);
1717 kunmap_atomic(map, KM_USER0);
1718 }
1719 addr += length;
1720 buf += length;
1721 copied += length;
1722 count -= length;
1723 }
1724 return copied;
1725}
1726
1727/**
1728 * vread() - read vmalloc area in a safe way.
1729 * @buf: buffer for reading data
1730 * @addr: vm address.
1731 * @count: number of bytes to be read.
1732 *
1733 * Returns the number of bytes by which addr and buf should be
1734 * increased (the same number as @count). Returns 0 if
1735 * [addr...addr+count) doesn't intersect any live vmalloc area.
1736 *
1737 * This function checks that addr is a valid vmalloc'ed area and
1738 * copies data from that area to the given buffer. If the range
1739 * [addr...addr+count) includes some valid address, data is copied to
1740 * the proper area of @buf. If there are memory holes, they'll be zero-filled.
1741 * An IOREMAP area is treated as a memory hole and no copy is done.
1742 *
1743 * If [addr...addr+count) doesn't intersect any live vm_struct
1744 * area, returns 0.
1745 * @buf should be a kernel buffer. Because this function uses KM_USER0,
1746 * the caller should guarantee KM_USER0 is not used.
1747 *
1748 * Note: In usual operation, vread() is never necessary because the caller
1749 * should know the vmalloc() area is valid and can use memcpy().
1750 * This is for routines which have to access the vmalloc area without
1751 * any prior information, such as /dev/kmem.
1752 *
1753 */
1754
1644long vread(char *buf, char *addr, unsigned long count) 1755long vread(char *buf, char *addr, unsigned long count)
1645{ 1756{
1646 struct vm_struct *tmp; 1757 struct vm_struct *tmp;
1647 char *vaddr, *buf_start = buf; 1758 char *vaddr, *buf_start = buf;
1759 unsigned long buflen = count;
1648 unsigned long n; 1760 unsigned long n;
1649 1761
1650 /* Don't allow overflow */ 1762 /* Don't allow overflow */
@@ -1652,7 +1764,7 @@ long vread(char *buf, char *addr, unsigned long count)
1652 count = -(unsigned long) addr; 1764 count = -(unsigned long) addr;
1653 1765
1654 read_lock(&vmlist_lock); 1766 read_lock(&vmlist_lock);
1655 for (tmp = vmlist; tmp; tmp = tmp->next) { 1767 for (tmp = vmlist; count && tmp; tmp = tmp->next) {
1656 vaddr = (char *) tmp->addr; 1768 vaddr = (char *) tmp->addr;
1657 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1769 if (addr >= vaddr + tmp->size - PAGE_SIZE)
1658 continue; 1770 continue;
@@ -1665,32 +1777,72 @@ long vread(char *buf, char *addr, unsigned long count)
1665 count--; 1777 count--;
1666 } 1778 }
1667 n = vaddr + tmp->size - PAGE_SIZE - addr; 1779 n = vaddr + tmp->size - PAGE_SIZE - addr;
1668 do { 1780 if (n > count)
1669 if (count == 0) 1781 n = count;
1670 goto finished; 1782 if (!(tmp->flags & VM_IOREMAP))
1671 *buf = *addr; 1783 aligned_vread(buf, addr, n);
1672 buf++; 1784 else /* IOREMAP area is treated as memory hole */
1673 addr++; 1785 memset(buf, 0, n);
1674 count--; 1786 buf += n;
1675 } while (--n > 0); 1787 addr += n;
1788 count -= n;
1676 } 1789 }
1677finished: 1790finished:
1678 read_unlock(&vmlist_lock); 1791 read_unlock(&vmlist_lock);
1679 return buf - buf_start; 1792
1793 if (buf == buf_start)
1794 return 0;
1795 /* zero-fill memory holes */
1796 if (buf != buf_start + buflen)
1797 memset(buf, 0, buflen - (buf - buf_start));
1798
1799 return buflen;
1680} 1800}
1681 1801
1802/**
1803 * vwrite() - write vmalloc area in a safe way.
1804 * @buf: buffer for source data
1805 * @addr: vm address.
1806 * @count: number of bytes to be written.
1807 *
1808 * Returns the number of bytes by which addr and buf should be
1809 * increased (the same number as @count).
1810 * If [addr...addr+count) doesn't intersect any valid
1811 * vmalloc area, returns 0.
1812 *
1813 * This function checks that addr is a valid vmalloc'ed area and
1814 * copies data from a buffer to the given addr. If the range
1815 * [addr...addr+count) includes some valid address, data is copied from
1816 * the proper area of @buf. If there are memory holes, no copy is done to them.
1817 * An IOREMAP area is treated as a memory hole and no copy is done.
1818 *
1819 * If [addr...addr+count) doesn't intersect any live vm_struct
1820 * area, returns 0.
1821 * @buf should be a kernel buffer. Because this function uses KM_USER0,
1822 * the caller should guarantee KM_USER0 is not used.
1823 *
1824 * Note: In usual operation, vwrite() is never necessary because the caller
1825 * should know the vmalloc() area is valid and can use memcpy().
1826 * This is for routines which have to access the vmalloc area without
1827 * any prior information, such as /dev/kmem.
1828 *
1829 * The caller should guarantee KM_USER1 is not used.
1830 */
1831
1682long vwrite(char *buf, char *addr, unsigned long count) 1832long vwrite(char *buf, char *addr, unsigned long count)
1683{ 1833{
1684 struct vm_struct *tmp; 1834 struct vm_struct *tmp;
1685 char *vaddr, *buf_start = buf; 1835 char *vaddr;
1686 unsigned long n; 1836 unsigned long n, buflen;
1837 int copied = 0;
1687 1838
1688 /* Don't allow overflow */ 1839 /* Don't allow overflow */
1689 if ((unsigned long) addr + count < count) 1840 if ((unsigned long) addr + count < count)
1690 count = -(unsigned long) addr; 1841 count = -(unsigned long) addr;
1842 buflen = count;
1691 1843
1692 read_lock(&vmlist_lock); 1844 read_lock(&vmlist_lock);
1693 for (tmp = vmlist; tmp; tmp = tmp->next) { 1845 for (tmp = vmlist; count && tmp; tmp = tmp->next) {
1694 vaddr = (char *) tmp->addr; 1846 vaddr = (char *) tmp->addr;
1695 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1847 if (addr >= vaddr + tmp->size - PAGE_SIZE)
1696 continue; 1848 continue;
@@ -1702,18 +1854,21 @@ long vwrite(char *buf, char *addr, unsigned long count)
1702 count--; 1854 count--;
1703 } 1855 }
1704 n = vaddr + tmp->size - PAGE_SIZE - addr; 1856 n = vaddr + tmp->size - PAGE_SIZE - addr;
1705 do { 1857 if (n > count)
1706 if (count == 0) 1858 n = count;
1707 goto finished; 1859 if (!(tmp->flags & VM_IOREMAP)) {
1708 *addr = *buf; 1860 aligned_vwrite(buf, addr, n);
1709 buf++; 1861 copied++;
1710 addr++; 1862 }
1711 count--; 1863 buf += n;
1712 } while (--n > 0); 1864 addr += n;
1865 count -= n;
1713 } 1866 }
1714finished: 1867finished:
1715 read_unlock(&vmlist_lock); 1868 read_unlock(&vmlist_lock);
1716 return buf - buf_start; 1869 if (!copied)
1870 return 0;
1871 return buflen;
1717} 1872}
1718 1873
1719/** 1874/**
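The new aligned_vread()/aligned_vwrite() helpers in the vmalloc.c hunks walk the range one page at a time: compute the offset within the current page, clamp the chunk to what is left, then either copy through a temporary kmap_atomic() mapping or treat the page as a hole. A userspace sketch of just that chunking arithmetic (PAGE_SIZE and page_present() are stand-ins):

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

static int page_present(uintptr_t addr) { (void)addr; return 1; } /* stand-in for vmalloc_to_page() */

static size_t chunked_read(char *buf, const char *addr, size_t count)
{
	size_t copied = 0;

	while (count) {
		size_t offset = (uintptr_t)addr & ~PAGE_MASK;  /* offset within this page */
		size_t length = PAGE_SIZE - offset;            /* bytes left in this page */

		if (length > count)
			length = count;

		if (page_present((uintptr_t)addr))
			memcpy(buf, addr, length);   /* kernel: kmap_atomic() + memcpy */
		else
			memset(buf, 0, length);      /* hole: zero-fill */

		addr += length;
		buf += length;
		copied += length;
		count -= length;
	}
	return copied;
}

int main(void)
{
	char src[10000], dst[10000];

	memset(src, 'x', sizeof(src));
	printf("copied %zu bytes\n", chunked_read(dst, src, sizeof(src)));
	return 0;
}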
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ba8228e0a806..613e89f471d9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -148,8 +148,8 @@ static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
148 return &zone->reclaim_stat; 148 return &zone->reclaim_stat;
149} 149}
150 150
151static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc, 151static unsigned long zone_nr_lru_pages(struct zone *zone,
152 enum lru_list lru) 152 struct scan_control *sc, enum lru_list lru)
153{ 153{
154 if (!scanning_global_lru(sc)) 154 if (!scanning_global_lru(sc))
155 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); 155 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
@@ -286,7 +286,12 @@ static inline int page_mapping_inuse(struct page *page)
286 286
287static inline int is_page_cache_freeable(struct page *page) 287static inline int is_page_cache_freeable(struct page *page)
288{ 288{
289 return page_count(page) - !!page_has_private(page) == 2; 289 /*
290 * A freeable page cache page is referenced only by the caller
291 * that isolated the page, the page cache radix tree and
292 * optional buffer heads at page->private.
293 */
294 return page_count(page) - page_has_private(page) == 2;
290} 295}
291 296
292static int may_write_to_queue(struct backing_dev_info *bdi) 297static int may_write_to_queue(struct backing_dev_info *bdi)
@@ -361,7 +366,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
361 * block, for some throttling. This happens by accident, because 366 * block, for some throttling. This happens by accident, because
362 * swap_backing_dev_info is bust: it doesn't reflect the 367 * swap_backing_dev_info is bust: it doesn't reflect the
363 * congestion state of the swapdevs. Easy to fix, if needed. 368 * congestion state of the swapdevs. Easy to fix, if needed.
364 * See swapfile.c:page_queue_congested().
365 */ 369 */
366 if (!is_page_cache_freeable(page)) 370 if (!is_page_cache_freeable(page))
367 return PAGE_KEEP; 371 return PAGE_KEEP;
@@ -531,7 +535,7 @@ redo:
531 * unevictable page on [in]active list. 535 * unevictable page on [in]active list.
532 * We know how to handle that. 536 * We know how to handle that.
533 */ 537 */
534 lru = active + page_is_file_cache(page); 538 lru = active + page_lru_base_type(page);
535 lru_cache_add_lru(page, lru); 539 lru_cache_add_lru(page, lru);
536 } else { 540 } else {
537 /* 541 /*
@@ -821,7 +825,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
821 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 825 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
822 return ret; 826 return ret;
823 827
824 if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) 828 if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
825 return ret; 829 return ret;
826 830
827 /* 831 /*
@@ -935,6 +939,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
935 /* Check that we have not crossed a zone boundary. */ 939 /* Check that we have not crossed a zone boundary. */
936 if (unlikely(page_zone_id(cursor_page) != zone_id)) 940 if (unlikely(page_zone_id(cursor_page) != zone_id))
937 continue; 941 continue;
942
943 /*
944 * If we don't have enough swap space, reclaiming an
945 * anon page which doesn't already have a swap slot is
946 * pointless.
947 */
948 if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
949 !PageSwapCache(cursor_page))
950 continue;
951
938 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 952 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
939 list_move(&cursor_page->lru, dst); 953 list_move(&cursor_page->lru, dst);
940 mem_cgroup_del_lru(cursor_page); 954 mem_cgroup_del_lru(cursor_page);
@@ -961,7 +975,7 @@ static unsigned long isolate_pages_global(unsigned long nr,
961 if (file) 975 if (file)
962 lru += LRU_FILE; 976 lru += LRU_FILE;
963 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, 977 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
964 mode, !!file); 978 mode, file);
965} 979}
966 980
967/* 981/*
@@ -976,7 +990,7 @@ static unsigned long clear_active_flags(struct list_head *page_list,
976 struct page *page; 990 struct page *page;
977 991
978 list_for_each_entry(page, page_list, lru) { 992 list_for_each_entry(page, page_list, lru) {
979 lru = page_is_file_cache(page); 993 lru = page_lru_base_type(page);
980 if (PageActive(page)) { 994 if (PageActive(page)) {
981 lru += LRU_ACTIVE; 995 lru += LRU_ACTIVE;
982 ClearPageActive(page); 996 ClearPageActive(page);
@@ -1034,6 +1048,31 @@ int isolate_lru_page(struct page *page)
1034} 1048}
1035 1049
1036/* 1050/*
1051 * Are there way too many processes in the direct reclaim path already?
1052 */
1053static int too_many_isolated(struct zone *zone, int file,
1054 struct scan_control *sc)
1055{
1056 unsigned long inactive, isolated;
1057
1058 if (current_is_kswapd())
1059 return 0;
1060
1061 if (!scanning_global_lru(sc))
1062 return 0;
1063
1064 if (file) {
1065 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1066 isolated = zone_page_state(zone, NR_ISOLATED_FILE);
1067 } else {
1068 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1069 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1070 }
1071
1072 return isolated > inactive;
1073}
1074
1075/*
1037 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1076 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1038 * of reclaimed pages 1077 * of reclaimed pages
1039 */ 1078 */
@@ -1048,6 +1087,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1048 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1087 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1049 int lumpy_reclaim = 0; 1088 int lumpy_reclaim = 0;
1050 1089
1090 while (unlikely(too_many_isolated(zone, file, sc))) {
1091 congestion_wait(WRITE, HZ/10);
1092
1093 /* We are about to die and free our memory. Return now. */
1094 if (fatal_signal_pending(current))
1095 return SWAP_CLUSTER_MAX;
1096 }
1097
1051 /* 1098 /*
1052 * If we need a large contiguous chunk of memory, or have 1099 * If we need a large contiguous chunk of memory, or have
1053 * trouble getting a small set of contiguous pages, we 1100 * trouble getting a small set of contiguous pages, we
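The too_many_isolated() check added above throttles direct reclaimers: kswapd and memcg reclaim are exempt, and everyone else waits (congestion_wait(), bailing out on a fatal signal) while more pages are isolated from an LRU than remain on its inactive list. A small sketch of the predicate itself, with a plain struct in place of the zone vmstat counters:

#include <stdio.h>
#include <stdbool.h>

struct zone_counters {
	unsigned long inactive_anon, isolated_anon;
	unsigned long inactive_file, isolated_file;
};

static bool too_many_isolated(const struct zone_counters *z, bool file)
{
	unsigned long inactive = file ? z->inactive_file : z->inactive_anon;
	unsigned long isolated = file ? z->isolated_file : z->isolated_anon;

	return isolated > inactive;
}

int main(void)
{
	struct zone_counters z = {
		.inactive_file = 100, .isolated_file = 150,
		.inactive_anon = 500, .isolated_anon = 10,
	};

	/* The file LRU is over-isolated: a direct reclaimer would throttle here
	 * (the kernel loops on congestion_wait() and bails on a fatal signal). */
	printf("throttle file reclaim: %d\n", too_many_isolated(&z, true));
	printf("throttle anon reclaim: %d\n", too_many_isolated(&z, false));
	return 0;
}

The new NR_ISOLATED_ANON/NR_ISOLATED_FILE accounting in the following hunks is what feeds this predicate.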
@@ -1072,10 +1119,26 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1072 unsigned long nr_active; 1119 unsigned long nr_active;
1073 unsigned int count[NR_LRU_LISTS] = { 0, }; 1120 unsigned int count[NR_LRU_LISTS] = { 0, };
1074 int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; 1121 int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
1122 unsigned long nr_anon;
1123 unsigned long nr_file;
1075 1124
1076 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 1125 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
1077 &page_list, &nr_scan, sc->order, mode, 1126 &page_list, &nr_scan, sc->order, mode,
1078 zone, sc->mem_cgroup, 0, file); 1127 zone, sc->mem_cgroup, 0, file);
1128
1129 if (scanning_global_lru(sc)) {
1130 zone->pages_scanned += nr_scan;
1131 if (current_is_kswapd())
1132 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
1133 nr_scan);
1134 else
1135 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1136 nr_scan);
1137 }
1138
1139 if (nr_taken == 0)
1140 goto done;
1141
1079 nr_active = clear_active_flags(&page_list, count); 1142 nr_active = clear_active_flags(&page_list, count);
1080 __count_vm_events(PGDEACTIVATE, nr_active); 1143 __count_vm_events(PGDEACTIVATE, nr_active);
1081 1144
@@ -1088,8 +1151,10 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1088 __mod_zone_page_state(zone, NR_INACTIVE_ANON, 1151 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1089 -count[LRU_INACTIVE_ANON]); 1152 -count[LRU_INACTIVE_ANON]);
1090 1153
1091 if (scanning_global_lru(sc)) 1154 nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1092 zone->pages_scanned += nr_scan; 1155 nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1156 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
1157 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1093 1158
1094 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; 1159 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
1095 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; 1160 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
@@ -1123,18 +1188,12 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1123 } 1188 }
1124 1189
1125 nr_reclaimed += nr_freed; 1190 nr_reclaimed += nr_freed;
1191
1126 local_irq_disable(); 1192 local_irq_disable();
1127 if (current_is_kswapd()) { 1193 if (current_is_kswapd())
1128 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
1129 __count_vm_events(KSWAPD_STEAL, nr_freed); 1194 __count_vm_events(KSWAPD_STEAL, nr_freed);
1130 } else if (scanning_global_lru(sc))
1131 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
1132
1133 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 1195 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
1134 1196
1135 if (nr_taken == 0)
1136 goto done;
1137
1138 spin_lock(&zone->lru_lock); 1197 spin_lock(&zone->lru_lock);
1139 /* 1198 /*
1140 * Put back any unfreeable pages. 1199 * Put back any unfreeable pages.
@@ -1153,8 +1212,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1153 SetPageLRU(page); 1212 SetPageLRU(page);
1154 lru = page_lru(page); 1213 lru = page_lru(page);
1155 add_page_to_lru_list(zone, page, lru); 1214 add_page_to_lru_list(zone, page, lru);
1156 if (PageActive(page)) { 1215 if (is_active_lru(lru)) {
1157 int file = !!page_is_file_cache(page); 1216 int file = is_file_lru(lru);
1158 reclaim_stat->recent_rotated[file]++; 1217 reclaim_stat->recent_rotated[file]++;
1159 } 1218 }
1160 if (!pagevec_add(&pvec, page)) { 1219 if (!pagevec_add(&pvec, page)) {
@@ -1163,10 +1222,13 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1163 spin_lock_irq(&zone->lru_lock); 1222 spin_lock_irq(&zone->lru_lock);
1164 } 1223 }
1165 } 1224 }
1225 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1226 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1227
1166 } while (nr_scanned < max_scan); 1228 } while (nr_scanned < max_scan);
1167 spin_unlock(&zone->lru_lock); 1229
1168done: 1230done:
1169 local_irq_enable(); 1231 spin_unlock_irq(&zone->lru_lock);
1170 pagevec_release(&pvec); 1232 pagevec_release(&pvec);
1171 return nr_reclaimed; 1233 return nr_reclaimed;
1172} 1234}
@@ -1215,15 +1277,10 @@ static void move_active_pages_to_lru(struct zone *zone,
1215 1277
1216 while (!list_empty(list)) { 1278 while (!list_empty(list)) {
1217 page = lru_to_page(list); 1279 page = lru_to_page(list);
1218 prefetchw_prev_lru_page(page, list, flags);
1219 1280
1220 VM_BUG_ON(PageLRU(page)); 1281 VM_BUG_ON(PageLRU(page));
1221 SetPageLRU(page); 1282 SetPageLRU(page);
1222 1283
1223 VM_BUG_ON(!PageActive(page));
1224 if (!is_active_lru(lru))
1225 ClearPageActive(page); /* we are de-activating */
1226
1227 list_move(&page->lru, &zone->lru[lru].list); 1284 list_move(&page->lru, &zone->lru[lru].list);
1228 mem_cgroup_add_lru_list(page, lru); 1285 mem_cgroup_add_lru_list(page, lru);
1229 pgmoved++; 1286 pgmoved++;
@@ -1244,7 +1301,7 @@ static void move_active_pages_to_lru(struct zone *zone,
1244static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1301static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1245 struct scan_control *sc, int priority, int file) 1302 struct scan_control *sc, int priority, int file)
1246{ 1303{
1247 unsigned long pgmoved; 1304 unsigned long nr_taken;
1248 unsigned long pgscanned; 1305 unsigned long pgscanned;
1249 unsigned long vm_flags; 1306 unsigned long vm_flags;
1250 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1307 LIST_HEAD(l_hold); /* The pages which were snipped off */
@@ -1252,10 +1309,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1252 LIST_HEAD(l_inactive); 1309 LIST_HEAD(l_inactive);
1253 struct page *page; 1310 struct page *page;
1254 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1311 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1312 unsigned long nr_rotated = 0;
1255 1313
1256 lru_add_drain(); 1314 lru_add_drain();
1257 spin_lock_irq(&zone->lru_lock); 1315 spin_lock_irq(&zone->lru_lock);
1258 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, 1316 nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1259 ISOLATE_ACTIVE, zone, 1317 ISOLATE_ACTIVE, zone,
1260 sc->mem_cgroup, 1, file); 1318 sc->mem_cgroup, 1, file);
1261 /* 1319 /*
@@ -1265,16 +1323,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1265 if (scanning_global_lru(sc)) { 1323 if (scanning_global_lru(sc)) {
1266 zone->pages_scanned += pgscanned; 1324 zone->pages_scanned += pgscanned;
1267 } 1325 }
1268 reclaim_stat->recent_scanned[!!file] += pgmoved; 1326 reclaim_stat->recent_scanned[file] += nr_taken;
1269 1327
1270 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1328 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1271 if (file) 1329 if (file)
1272 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); 1330 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
1273 else 1331 else
1274 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); 1332 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
1333 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1275 spin_unlock_irq(&zone->lru_lock); 1334 spin_unlock_irq(&zone->lru_lock);
1276 1335
1277 pgmoved = 0; /* count referenced (mapping) mapped pages */
1278 while (!list_empty(&l_hold)) { 1336 while (!list_empty(&l_hold)) {
1279 cond_resched(); 1337 cond_resched();
1280 page = lru_to_page(&l_hold); 1338 page = lru_to_page(&l_hold);
@@ -1288,7 +1346,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1288 /* page_referenced clears PageReferenced */ 1346 /* page_referenced clears PageReferenced */
1289 if (page_mapping_inuse(page) && 1347 if (page_mapping_inuse(page) &&
1290 page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { 1348 page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1291 pgmoved++; 1349 nr_rotated++;
1292 /* 1350 /*
1293 * Identify referenced, file-backed active pages and 1351 * Identify referenced, file-backed active pages and
1294 * give them one more trip around the active list. So 1352 * give them one more trip around the active list. So
@@ -1304,6 +1362,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1304 } 1362 }
1305 } 1363 }
1306 1364
1365 ClearPageActive(page); /* we are de-activating */
1307 list_add(&page->lru, &l_inactive); 1366 list_add(&page->lru, &l_inactive);
1308 } 1367 }
1309 1368
@@ -1317,13 +1376,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1317 * helps balance scan pressure between file and anonymous pages in 1376 * helps balance scan pressure between file and anonymous pages in
1318 * get_scan_ratio. 1377 * get_scan_ratio.
1319 */ 1378 */
1320 reclaim_stat->recent_rotated[!!file] += pgmoved; 1379 reclaim_stat->recent_rotated[file] += nr_rotated;
1321 1380
1322 move_active_pages_to_lru(zone, &l_active, 1381 move_active_pages_to_lru(zone, &l_active,
1323 LRU_ACTIVE + file * LRU_FILE); 1382 LRU_ACTIVE + file * LRU_FILE);
1324 move_active_pages_to_lru(zone, &l_inactive, 1383 move_active_pages_to_lru(zone, &l_inactive,
1325 LRU_BASE + file * LRU_FILE); 1384 LRU_BASE + file * LRU_FILE);
1326 1385 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1327 spin_unlock_irq(&zone->lru_lock); 1386 spin_unlock_irq(&zone->lru_lock);
1328} 1387}
1329 1388
@@ -1429,10 +1488,10 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1429 unsigned long ap, fp; 1488 unsigned long ap, fp;
1430 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1489 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1431 1490
1432 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + 1491 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1433 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); 1492 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1434 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + 1493 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1435 zone_nr_pages(zone, sc, LRU_INACTIVE_FILE); 1494 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1436 1495
1437 if (scanning_global_lru(sc)) { 1496 if (scanning_global_lru(sc)) {
1438 free = zone_page_state(zone, NR_FREE_PAGES); 1497 free = zone_page_state(zone, NR_FREE_PAGES);
@@ -1526,6 +1585,7 @@ static void shrink_zone(int priority, struct zone *zone,
1526 enum lru_list l; 1585 enum lru_list l;
1527 unsigned long nr_reclaimed = sc->nr_reclaimed; 1586 unsigned long nr_reclaimed = sc->nr_reclaimed;
1528 unsigned long swap_cluster_max = sc->swap_cluster_max; 1587 unsigned long swap_cluster_max = sc->swap_cluster_max;
1588 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1529 int noswap = 0; 1589 int noswap = 0;
1530 1590
1531 /* If we have no swap space, do not bother scanning anon pages. */ 1591 /* If we have no swap space, do not bother scanning anon pages. */
@@ -1540,17 +1600,14 @@ static void shrink_zone(int priority, struct zone *zone,
1540 int file = is_file_lru(l); 1600 int file = is_file_lru(l);
1541 unsigned long scan; 1601 unsigned long scan;
1542 1602
1543 scan = zone_nr_pages(zone, sc, l); 1603 scan = zone_nr_lru_pages(zone, sc, l);
1544 if (priority || noswap) { 1604 if (priority || noswap) {
1545 scan >>= priority; 1605 scan >>= priority;
1546 scan = (scan * percent[file]) / 100; 1606 scan = (scan * percent[file]) / 100;
1547 } 1607 }
1548 if (scanning_global_lru(sc)) 1608 nr[l] = nr_scan_try_batch(scan,
1549 nr[l] = nr_scan_try_batch(scan, 1609 &reclaim_stat->nr_saved_scan[l],
1550 &zone->lru[l].nr_saved_scan, 1610 swap_cluster_max);
1551 swap_cluster_max);
1552 else
1553 nr[l] = scan;
1554 } 1611 }
1555 1612
1556 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1613 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1685,7 +1742,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1685 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1742 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1686 continue; 1743 continue;
1687 1744
1688 lru_pages += zone_lru_pages(zone); 1745 lru_pages += zone_reclaimable_pages(zone);
1689 } 1746 }
1690 } 1747 }
1691 1748
@@ -1902,7 +1959,7 @@ loop_again:
1902 for (i = 0; i <= end_zone; i++) { 1959 for (i = 0; i <= end_zone; i++) {
1903 struct zone *zone = pgdat->node_zones + i; 1960 struct zone *zone = pgdat->node_zones + i;
1904 1961
1905 lru_pages += zone_lru_pages(zone); 1962 lru_pages += zone_reclaimable_pages(zone);
1906 } 1963 }
1907 1964
1908 /* 1965 /*
@@ -1946,7 +2003,7 @@ loop_again:
1946 if (zone_is_all_unreclaimable(zone)) 2003 if (zone_is_all_unreclaimable(zone))
1947 continue; 2004 continue;
1948 if (nr_slab == 0 && zone->pages_scanned >= 2005 if (nr_slab == 0 && zone->pages_scanned >=
1949 (zone_lru_pages(zone) * 6)) 2006 (zone_reclaimable_pages(zone) * 6))
1950 zone_set_flag(zone, 2007 zone_set_flag(zone,
1951 ZONE_ALL_UNRECLAIMABLE); 2008 ZONE_ALL_UNRECLAIMABLE);
1952 /* 2009 /*
@@ -2113,12 +2170,39 @@ void wakeup_kswapd(struct zone *zone, int order)
2113 wake_up_interruptible(&pgdat->kswapd_wait); 2170 wake_up_interruptible(&pgdat->kswapd_wait);
2114} 2171}
2115 2172
2116unsigned long global_lru_pages(void) 2173/*
2174 * The reclaimable count should be mostly accurate.
2175 * The less easily reclaimable pages include:
2176 * - mlocked pages, which will be moved to the unevictable list when encountered
2177 * - mapped pages, which may require several passes to be reclaimed
2178 * - dirty pages, which are not "instantly" reclaimable
2179 */
2180unsigned long global_reclaimable_pages(void)
2181{
2182 int nr;
2183
2184 nr = global_page_state(NR_ACTIVE_FILE) +
2185 global_page_state(NR_INACTIVE_FILE);
2186
2187 if (nr_swap_pages > 0)
2188 nr += global_page_state(NR_ACTIVE_ANON) +
2189 global_page_state(NR_INACTIVE_ANON);
2190
2191 return nr;
2192}
2193
2194unsigned long zone_reclaimable_pages(struct zone *zone)
2117{ 2195{
2118 return global_page_state(NR_ACTIVE_ANON) 2196 int nr;
2119 + global_page_state(NR_ACTIVE_FILE) 2197
2120 + global_page_state(NR_INACTIVE_ANON) 2198 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
2121 + global_page_state(NR_INACTIVE_FILE); 2199 zone_page_state(zone, NR_INACTIVE_FILE);
2200
2201 if (nr_swap_pages > 0)
2202 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
2203 zone_page_state(zone, NR_INACTIVE_ANON);
2204
2205 return nr;
2122} 2206}
2123 2207
2124#ifdef CONFIG_HIBERNATION 2208#ifdef CONFIG_HIBERNATION
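global_lru_pages() becomes global_reclaimable_pages()/zone_reclaimable_pages() above: file LRU pages always count, anon LRU pages only while swap space is available (without swap they cannot be reclaimed at all). A sketch of that arithmetic with stand-in counters:

#include <stdio.h>

struct lru_counts {
	unsigned long active_file, inactive_file;
	unsigned long active_anon, inactive_anon;
};

static unsigned long reclaimable_pages(const struct lru_counts *c,
					long nr_swap_pages)
{
	unsigned long nr = c->active_file + c->inactive_file;

	if (nr_swap_pages > 0)
		nr += c->active_anon + c->inactive_anon;

	return nr;
}

int main(void)
{
	struct lru_counts c = {
		.active_file = 1000, .inactive_file = 3000,
		.active_anon = 2000, .inactive_anon = 500,
	};

	printf("with swap:    %lu\n", reclaimable_pages(&c, 1 << 20));
	printf("without swap: %lu\n", reclaimable_pages(&c, 0));
	return 0;
}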
@@ -2133,6 +2217,7 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
2133{ 2217{
2134 struct zone *zone; 2218 struct zone *zone;
2135 unsigned long nr_reclaimed = 0; 2219 unsigned long nr_reclaimed = 0;
2220 struct zone_reclaim_stat *reclaim_stat;
2136 2221
2137 for_each_populated_zone(zone) { 2222 for_each_populated_zone(zone) {
2138 enum lru_list l; 2223 enum lru_list l;
@@ -2149,11 +2234,14 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
2149 l == LRU_ACTIVE_FILE)) 2234 l == LRU_ACTIVE_FILE))
2150 continue; 2235 continue;
2151 2236
2152 zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; 2237 reclaim_stat = get_reclaim_stat(zone, sc);
2153 if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { 2238 reclaim_stat->nr_saved_scan[l] +=
2239 (lru_pages >> prio) + 1;
2240 if (reclaim_stat->nr_saved_scan[l]
2241 >= nr_pages || pass > 3) {
2154 unsigned long nr_to_scan; 2242 unsigned long nr_to_scan;
2155 2243
2156 zone->lru[l].nr_saved_scan = 0; 2244 reclaim_stat->nr_saved_scan[l] = 0;
2157 nr_to_scan = min(nr_pages, lru_pages); 2245 nr_to_scan = min(nr_pages, lru_pages);
2158 nr_reclaimed += shrink_list(l, nr_to_scan, zone, 2246 nr_reclaimed += shrink_list(l, nr_to_scan, zone,
2159 sc, prio); 2247 sc, prio);
@@ -2190,7 +2278,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2190 2278
2191 current->reclaim_state = &reclaim_state; 2279 current->reclaim_state = &reclaim_state;
2192 2280
2193 lru_pages = global_lru_pages(); 2281 lru_pages = global_reclaimable_pages();
2194 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 2282 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
2195 /* If slab caches are huge, it's better to hit them first */ 2283 /* If slab caches are huge, it's better to hit them first */
2196 while (nr_slab >= lru_pages) { 2284 while (nr_slab >= lru_pages) {
@@ -2232,7 +2320,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2232 2320
2233 reclaim_state.reclaimed_slab = 0; 2321 reclaim_state.reclaimed_slab = 0;
2234 shrink_slab(sc.nr_scanned, sc.gfp_mask, 2322 shrink_slab(sc.nr_scanned, sc.gfp_mask,
2235 global_lru_pages()); 2323 global_reclaimable_pages());
2236 sc.nr_reclaimed += reclaim_state.reclaimed_slab; 2324 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2237 if (sc.nr_reclaimed >= nr_pages) 2325 if (sc.nr_reclaimed >= nr_pages)
2238 goto out; 2326 goto out;
@@ -2249,7 +2337,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2249 if (!sc.nr_reclaimed) { 2337 if (!sc.nr_reclaimed) {
2250 do { 2338 do {
2251 reclaim_state.reclaimed_slab = 0; 2339 reclaim_state.reclaimed_slab = 0;
2252 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); 2340 shrink_slab(nr_pages, sc.gfp_mask,
2341 global_reclaimable_pages());
2253 sc.nr_reclaimed += reclaim_state.reclaimed_slab; 2342 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2254 } while (sc.nr_reclaimed < nr_pages && 2343 } while (sc.nr_reclaimed < nr_pages &&
2255 reclaim_state.reclaimed_slab > 0); 2344 reclaim_state.reclaimed_slab > 0);
@@ -2569,7 +2658,7 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone)
2569retry: 2658retry:
2570 ClearPageUnevictable(page); 2659 ClearPageUnevictable(page);
2571 if (page_evictable(page, NULL)) { 2660 if (page_evictable(page, NULL)) {
2572 enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); 2661 enum lru_list l = page_lru_base_type(page);
2573 2662
2574 __dec_zone_state(zone, NR_UNEVICTABLE); 2663 __dec_zone_state(zone, NR_UNEVICTABLE);
2575 list_move(&page->lru, &zone->lru[l].list); 2664 list_move(&page->lru, &zone->lru[l].list);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 138bed53706e..c81321f9feec 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -639,11 +639,14 @@ static const char * const vmstat_text[] = {
639 "nr_slab_reclaimable", 639 "nr_slab_reclaimable",
640 "nr_slab_unreclaimable", 640 "nr_slab_unreclaimable",
641 "nr_page_table_pages", 641 "nr_page_table_pages",
642 "nr_kernel_stack",
642 "nr_unstable", 643 "nr_unstable",
643 "nr_bounce", 644 "nr_bounce",
644 "nr_vmscan_write", 645 "nr_vmscan_write",
645 "nr_writeback_temp", 646 "nr_writeback_temp",
646 647 "nr_isolated_anon",
648 "nr_isolated_file",
649 "nr_shmem",
647#ifdef CONFIG_NUMA 650#ifdef CONFIG_NUMA
648 "numa_hit", 651 "numa_hit",
649 "numa_miss", 652 "numa_miss",