author     David S. Miller <davem@davemloft.net>  2010-01-23 01:45:46 -0500
committer  David S. Miller <davem@davemloft.net>  2010-01-23 01:45:46 -0500
commit     6be325719b3e54624397e413efd4b33a997e55a3 (patch)
tree       57f321a56794cab2222e179b16731e0d76a4a68a /mm
parent     26d92f9276a56d55511a427fb70bd70886af647a (diff)
parent     92dcffb916d309aa01778bf8963a6932e4014d07 (diff)
Merge branch 'master' of /home/davem/src/GIT/linux-2.6/
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |  22
-rw-r--r--  mm/Makefile           |   5
-rw-r--r--  mm/allocpercpu.c      | 177
-rw-r--r--  mm/bootmem.c          |   8
-rw-r--r--  mm/filemap.c          |  15
-rw-r--r--  mm/hugetlb.c          | 553
-rw-r--r--  mm/hwpoison-inject.c  | 113
-rw-r--r--  mm/internal.h         |  35
-rw-r--r--  mm/kmemleak.c         | 188
-rw-r--r--  mm/ksm.c              | 953
-rw-r--r--  mm/maccess.c          |  11
-rw-r--r--  mm/madvise.c          |  21
-rw-r--r--  mm/memcontrol.c       | 453
-rw-r--r--  mm/memory-failure.c   | 571
-rw-r--r--  mm/memory.c           |  35
-rw-r--r--  mm/memory_hotplug.c   |  16
-rw-r--r--  mm/mempolicy.c        |  69
-rw-r--r--  mm/migrate.c          | 133
-rw-r--r--  mm/mincore.c          |  37
-rw-r--r--  mm/mlock.c            |  45
-rw-r--r--  mm/mmap.c             |  90
-rw-r--r--  mm/nommu.c            | 152
-rw-r--r--  mm/oom_kill.c         | 103
-rw-r--r--  mm/page_alloc.c       | 123
-rw-r--r--  mm/page_io.c          |  17
-rw-r--r--  mm/pagewalk.c         |  32
-rw-r--r--  mm/percpu.c           |  28
-rw-r--r--  mm/readahead.c        |  12
-rw-r--r--  mm/rmap.c             | 354
-rw-r--r--  mm/shmem.c            |  84
-rw-r--r--  mm/shmem_acl.c        | 171
-rw-r--r--  mm/slab.c             | 160
-rw-r--r--  mm/slub.c             |  24
-rw-r--r--  mm/swapfile.c         | 847
-rw-r--r--  mm/truncate.c         |  36
-rw-r--r--  mm/util.c             |  46
-rw-r--r--  mm/vmalloc.c          |  15
-rw-r--r--  mm/vmscan.c           | 324
-rw-r--r--  mm/vmstat.c           |  10
39 files changed, 3895 insertions, 2193 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 44cf6f0a3a6d..17b8947aa7da 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -158,11 +158,13 @@ config PAGEFLAGS_EXTENDED
158# Default to 4 for wider testing, though 8 might be more appropriate. 158# Default to 4 for wider testing, though 8 might be more appropriate.
159# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. 159# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
160# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. 160# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
161# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page.
161# 162#
162config SPLIT_PTLOCK_CPUS 163config SPLIT_PTLOCK_CPUS
163 int 164 int
164 default "4096" if ARM && !CPU_CACHE_VIPT 165 default "999999" if ARM && !CPU_CACHE_VIPT
165 default "4096" if PARISC && !PA20 166 default "999999" if PARISC && !PA20
167 default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC
166 default "4" 168 default "4"
167 169
168# 170#
@@ -200,14 +202,6 @@ config VIRT_TO_BUS
200 def_bool y 202 def_bool y
201 depends on !ARCH_NO_VIRT_TO_BUS 203 depends on !ARCH_NO_VIRT_TO_BUS
202 204
203config HAVE_MLOCK
204 bool
205 default y if MMU=y
206
207config HAVE_MLOCKED_PAGE_BIT
208 bool
209 default y if HAVE_MLOCK=y
210
211config MMU_NOTIFIER 205config MMU_NOTIFIER
212 bool 206 bool
213 207
@@ -218,7 +212,7 @@ config KSM
218 Enable Kernel Samepage Merging: KSM periodically scans those areas 212 Enable Kernel Samepage Merging: KSM periodically scans those areas
219 of an application's address space that an app has advised may be 213 of an application's address space that an app has advised may be
220 mergeable. When it finds pages of identical content, it replaces 214 mergeable. When it finds pages of identical content, it replaces
221 the many instances by a single resident page with that content, so 215 the many instances by a single page with that content, so
222 saving memory until one or another app needs to modify the content. 216 saving memory until one or another app needs to modify the content.
223 Recommended for use with KVM, or with other duplicative applications. 217 Recommended for use with KVM, or with other duplicative applications.
224 See Documentation/vm/ksm.txt for more information: KSM is inactive 218 See Documentation/vm/ksm.txt for more information: KSM is inactive
@@ -227,6 +221,7 @@ config KSM
227 221
228config DEFAULT_MMAP_MIN_ADDR 222config DEFAULT_MMAP_MIN_ADDR
229 int "Low address space to protect from user allocation" 223 int "Low address space to protect from user allocation"
224 depends on MMU
230 default 4096 225 default 4096
231 help 226 help
232 This is the portion of low virtual memory which should be protected 227 This is the portion of low virtual memory which should be protected
@@ -257,8 +252,9 @@ config MEMORY_FAILURE
257 special hardware support and typically ECC memory. 252 special hardware support and typically ECC memory.
258 253
259config HWPOISON_INJECT 254config HWPOISON_INJECT
260 tristate "Poison pages injector" 255 tristate "HWPoison pages injector"
261 depends on MEMORY_FAILURE && DEBUG_KERNEL 256 depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS
257 select PROC_PAGE_MONITOR
262 258
263config NOMMU_INITIAL_TRIM_EXCESS 259config NOMMU_INITIAL_TRIM_EXCESS
264 int "Turn on mmap() excess space trimming before booting" 260 int "Turn on mmap() excess space trimming before booting"
diff --git a/mm/Makefile b/mm/Makefile
index ebf849042ed3..7a68d2ab5560 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
22obj-$(CONFIG_NUMA) += mempolicy.o 22obj-$(CONFIG_NUMA) += mempolicy.o
23obj-$(CONFIG_SPARSEMEM) += sparse.o 23obj-$(CONFIG_SPARSEMEM) += sparse.o
24obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o 24obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
26obj-$(CONFIG_SLOB) += slob.o 25obj-$(CONFIG_SLOB) += slob.o
27obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 26obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
28obj-$(CONFIG_KSM) += ksm.o 27obj-$(CONFIG_KSM) += ksm.o
@@ -34,11 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
34obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 33obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
35obj-$(CONFIG_FS_XIP) += filemap_xip.o 34obj-$(CONFIG_FS_XIP) += filemap_xip.o
36obj-$(CONFIG_MIGRATION) += migrate.o 35obj-$(CONFIG_MIGRATION) += migrate.o
37ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
38obj-$(CONFIG_SMP) += percpu.o 36obj-$(CONFIG_SMP) += percpu.o
39else
40obj-$(CONFIG_SMP) += allocpercpu.o
41endif
42obj-$(CONFIG_QUICKLIST) += quicklist.o 37obj-$(CONFIG_QUICKLIST) += quicklist.o
43obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 38obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
44obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 39obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
deleted file mode 100644
index df34ceae0c67..000000000000
--- a/mm/allocpercpu.c
+++ /dev/null
@@ -1,177 +0,0 @@
1/*
2 * linux/mm/allocpercpu.c
3 *
4 * Separated from slab.c August 11, 2006 Christoph Lameter
5 */
6#include <linux/mm.h>
7#include <linux/module.h>
8#include <linux/bootmem.h>
9#include <asm/sections.h>
10
11#ifndef cache_line_size
12#define cache_line_size() L1_CACHE_BYTES
13#endif
14
15/**
16 * percpu_depopulate - depopulate per-cpu data for given cpu
17 * @__pdata: per-cpu data to depopulate
18 * @cpu: depopulate per-cpu data for this cpu
19 *
20 * Depopulating per-cpu data for a cpu going offline would be a typical
21 * use case. You need to register a cpu hotplug handler for that purpose.
22 */
23static void percpu_depopulate(void *__pdata, int cpu)
24{
25 struct percpu_data *pdata = __percpu_disguise(__pdata);
26
27 kfree(pdata->ptrs[cpu]);
28 pdata->ptrs[cpu] = NULL;
29}
30
31/**
32 * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
33 * @__pdata: per-cpu data to depopulate
34 * @mask: depopulate per-cpu data for cpu's selected through mask bits
35 */
36static void __percpu_depopulate_mask(void *__pdata, const cpumask_t *mask)
37{
38 int cpu;
39 for_each_cpu_mask_nr(cpu, *mask)
40 percpu_depopulate(__pdata, cpu);
41}
42
43#define percpu_depopulate_mask(__pdata, mask) \
44 __percpu_depopulate_mask((__pdata), &(mask))
45
46/**
47 * percpu_populate - populate per-cpu data for given cpu
48 * @__pdata: per-cpu data to populate further
49 * @size: size of per-cpu object
50 * @gfp: may sleep or not etc.
51 * @cpu: populate per-data for this cpu
52 *
53 * Populating per-cpu data for a cpu coming online would be a typical
54 * use case. You need to register a cpu hotplug handler for that purpose.
55 * Per-cpu object is populated with zeroed buffer.
56 */
57static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
58{
59 struct percpu_data *pdata = __percpu_disguise(__pdata);
60 int node = cpu_to_node(cpu);
61
62 /*
63 * We should make sure each CPU gets private memory.
64 */
65 size = roundup(size, cache_line_size());
66
67 BUG_ON(pdata->ptrs[cpu]);
68 if (node_online(node))
69 pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node);
70 else
71 pdata->ptrs[cpu] = kzalloc(size, gfp);
72 return pdata->ptrs[cpu];
73}
74
75/**
76 * percpu_populate_mask - populate per-cpu data for more cpu's
77 * @__pdata: per-cpu data to populate further
78 * @size: size of per-cpu object
79 * @gfp: may sleep or not etc.
80 * @mask: populate per-cpu data for cpu's selected through mask bits
81 *
82 * Per-cpu objects are populated with zeroed buffers.
83 */
84static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
85 cpumask_t *mask)
86{
87 cpumask_t populated;
88 int cpu;
89
90 cpus_clear(populated);
91 for_each_cpu_mask_nr(cpu, *mask)
92 if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
93 __percpu_depopulate_mask(__pdata, &populated);
94 return -ENOMEM;
95 } else
96 cpu_set(cpu, populated);
97 return 0;
98}
99
100#define percpu_populate_mask(__pdata, size, gfp, mask) \
101 __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
102
103/**
104 * alloc_percpu - initial setup of per-cpu data
105 * @size: size of per-cpu object
106 * @align: alignment
107 *
108 * Allocate dynamic percpu area. Percpu objects are populated with
109 * zeroed buffers.
110 */
111void *__alloc_percpu(size_t size, size_t align)
112{
113 /*
114 * We allocate whole cache lines to avoid false sharing
115 */
116 size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
117 void *pdata = kzalloc(sz, GFP_KERNEL);
118 void *__pdata = __percpu_disguise(pdata);
119
120 /*
121 * Can't easily make larger alignment work with kmalloc. WARN
122 * on it. Larger alignment should only be used for module
123 * percpu sections on SMP for which this path isn't used.
124 */
125 WARN_ON_ONCE(align > SMP_CACHE_BYTES);
126
127 if (unlikely(!pdata))
128 return NULL;
129 if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL,
130 &cpu_possible_map)))
131 return __pdata;
132 kfree(pdata);
133 return NULL;
134}
135EXPORT_SYMBOL_GPL(__alloc_percpu);
136
137/**
138 * free_percpu - final cleanup of per-cpu data
139 * @__pdata: object to clean up
140 *
141 * We simply clean up any per-cpu object left. No need for the client to
142 * track and specify through a bis mask which per-cpu objects are to free.
143 */
144void free_percpu(void *__pdata)
145{
146 if (unlikely(!__pdata))
147 return;
148 __percpu_depopulate_mask(__pdata, cpu_possible_mask);
149 kfree(__percpu_disguise(__pdata));
150}
151EXPORT_SYMBOL_GPL(free_percpu);
152
153/*
154 * Generic percpu area setup.
155 */
156#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
157unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
158
159EXPORT_SYMBOL(__per_cpu_offset);
160
161void __init setup_per_cpu_areas(void)
162{
163 unsigned long size, i;
164 char *ptr;
165 unsigned long nr_possible_cpus = num_possible_cpus();
166
167 /* Copy section for each CPU (we discard the original) */
168 size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
169 ptr = alloc_bootmem_pages(size * nr_possible_cpus);
170
171 for_each_possible_cpu(i) {
172 __per_cpu_offset[i] = ptr - __per_cpu_start;
173 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
174 ptr += size;
175 }
176}
177#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
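With allocpercpu.c removed, every CONFIG_SMP build now goes through the unified allocator in mm/percpu.c; the dynamic per-CPU API seen by callers (alloc_percpu / per_cpu_ptr / free_percpu) is unchanged. A minimal in-kernel usage sketch for context; the hit_counter names are illustrative and not from this tree:

    #include <linux/init.h>
    #include <linux/errno.h>
    #include <linux/percpu.h>
    #include <linux/smp.h>

    struct hit_counter {
    	unsigned long hits;
    };

    static struct hit_counter *counters;	/* one instance per possible CPU */

    static int __init counters_init(void)
    {
    	/* zeroed per-CPU storage, now always served by mm/percpu.c */
    	counters = alloc_percpu(struct hit_counter);
    	return counters ? 0 : -ENOMEM;
    }

    static void counters_hit(void)
    {
    	int cpu = get_cpu();	/* disable preemption while touching this CPU's copy */

    	per_cpu_ptr(counters, cpu)->hits++;
    	put_cpu();
    }

    static void counters_exit(void)
    {
    	free_percpu(counters);
    }
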
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d1dc23cc7f10..7d1486875e1c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -432,8 +432,8 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
432 return mark_bootmem(start, end, 1, flags); 432 return mark_bootmem(start, end, 1, flags);
433} 433}
434 434
435static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, 435static unsigned long __init align_idx(struct bootmem_data *bdata,
436 unsigned long step) 436 unsigned long idx, unsigned long step)
437{ 437{
438 unsigned long base = bdata->node_min_pfn; 438 unsigned long base = bdata->node_min_pfn;
439 439
@@ -445,8 +445,8 @@ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
445 return ALIGN(base + idx, step) - base; 445 return ALIGN(base + idx, step) - base;
446} 446}
447 447
448static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, 448static unsigned long __init align_off(struct bootmem_data *bdata,
449 unsigned long align) 449 unsigned long off, unsigned long align)
450{ 450{
451 unsigned long base = PFN_PHYS(bdata->node_min_pfn); 451 unsigned long base = PFN_PHYS(bdata->node_min_pfn);
452 452
diff --git a/mm/filemap.c b/mm/filemap.c
index 8b4d88f9249e..96ac6b0eb6cb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2240,7 +2240,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2240 size_t count, ssize_t written) 2240 size_t count, ssize_t written)
2241{ 2241{
2242 struct file *file = iocb->ki_filp; 2242 struct file *file = iocb->ki_filp;
2243 struct address_space *mapping = file->f_mapping;
2244 ssize_t status; 2243 ssize_t status;
2245 struct iov_iter i; 2244 struct iov_iter i;
2246 2245
@@ -2252,15 +2251,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2252 *ppos = pos + status; 2251 *ppos = pos + status;
2253 } 2252 }
2254 2253
2255 /*
2256 * If we get here for O_DIRECT writes then we must have fallen through
2257 * to buffered writes (block instantiation inside i_size). So we sync
2258 * the file data here, to try to honour O_DIRECT expectations.
2259 */
2260 if (unlikely(file->f_flags & O_DIRECT) && written)
2261 status = filemap_write_and_wait_range(mapping,
2262 pos, pos + written - 1);
2263
2264 return written ? written : status; 2254 return written ? written : status;
2265} 2255}
2266EXPORT_SYMBOL(generic_file_buffered_write); 2256EXPORT_SYMBOL(generic_file_buffered_write);
@@ -2359,10 +2349,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2359 * semantics. 2349 * semantics.
2360 */ 2350 */
2361 endbyte = pos + written_buffered - written - 1; 2351 endbyte = pos + written_buffered - written - 1;
2362 err = do_sync_mapping_range(file->f_mapping, pos, endbyte, 2352 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
2363 SYNC_FILE_RANGE_WAIT_BEFORE|
2364 SYNC_FILE_RANGE_WRITE|
2365 SYNC_FILE_RANGE_WAIT_AFTER);
2366 if (err == 0) { 2353 if (err == 0) {
2367 written = written_buffered; 2354 written = written_buffered;
2368 invalidate_mapping_pages(mapping, 2355 invalidate_mapping_pages(mapping,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5d7601b02874..e91b81b63670 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,6 +24,7 @@
24#include <asm/io.h> 24#include <asm/io.h>
25 25
26#include <linux/hugetlb.h> 26#include <linux/hugetlb.h>
27#include <linux/node.h>
27#include "internal.h" 28#include "internal.h"
28 29
29const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 30const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
@@ -401,7 +402,7 @@ static void clear_huge_page(struct page *page,
401{ 402{
402 int i; 403 int i;
403 404
404 if (unlikely(sz > MAX_ORDER_NR_PAGES)) { 405 if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
405 clear_gigantic_page(page, addr, sz); 406 clear_gigantic_page(page, addr, sz);
406 return; 407 return;
407 } 408 }
@@ -622,42 +623,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
622} 623}
623 624
624/* 625/*
625 * Use a helper variable to find the next node and then 626 * common helper functions for hstate_next_node_to_{alloc|free}.
626 * copy it back to next_nid_to_alloc afterwards: 627 * We may have allocated or freed a huge page based on a different
627 * otherwise there's a window in which a racer might 628 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
628 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. 629 * be outside of *nodes_allowed. Ensure that we use an allowed
629 * But we don't need to use a spin_lock here: it really 630 * node for alloc or free.
630 * doesn't matter if occasionally a racer chooses the
631 * same nid as we do. Move nid forward in the mask even
632 * if we just successfully allocated a hugepage so that
633 * the next caller gets hugepages on the next node.
634 */ 631 */
635static int hstate_next_node_to_alloc(struct hstate *h) 632static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
636{ 633{
637 int next_nid; 634 nid = next_node(nid, *nodes_allowed);
638 next_nid = next_node(h->next_nid_to_alloc, node_online_map); 635 if (nid == MAX_NUMNODES)
639 if (next_nid == MAX_NUMNODES) 636 nid = first_node(*nodes_allowed);
640 next_nid = first_node(node_online_map); 637 VM_BUG_ON(nid >= MAX_NUMNODES);
641 h->next_nid_to_alloc = next_nid; 638
642 return next_nid; 639 return nid;
640}
641
642static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
643{
644 if (!node_isset(nid, *nodes_allowed))
645 nid = next_node_allowed(nid, nodes_allowed);
646 return nid;
647}
648
649/*
650 * returns the previously saved node ["this node"] from which to
651 * allocate a persistent huge page for the pool and advance the
652 * next node from which to allocate, handling wrap at end of node
653 * mask.
654 */
655static int hstate_next_node_to_alloc(struct hstate *h,
656 nodemask_t *nodes_allowed)
657{
658 int nid;
659
660 VM_BUG_ON(!nodes_allowed);
661
662 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
663 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
664
665 return nid;
643} 666}
644 667
645static int alloc_fresh_huge_page(struct hstate *h) 668static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
646{ 669{
647 struct page *page; 670 struct page *page;
648 int start_nid; 671 int start_nid;
649 int next_nid; 672 int next_nid;
650 int ret = 0; 673 int ret = 0;
651 674
652 start_nid = h->next_nid_to_alloc; 675 start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
653 next_nid = start_nid; 676 next_nid = start_nid;
654 677
655 do { 678 do {
656 page = alloc_fresh_huge_page_node(h, next_nid); 679 page = alloc_fresh_huge_page_node(h, next_nid);
657 if (page) 680 if (page) {
658 ret = 1; 681 ret = 1;
659 next_nid = hstate_next_node_to_alloc(h); 682 break;
660 } while (!page && next_nid != start_nid); 683 }
684 next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
685 } while (next_nid != start_nid);
661 686
662 if (ret) 687 if (ret)
663 count_vm_event(HTLB_BUDDY_PGALLOC); 688 count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -668,17 +693,21 @@ static int alloc_fresh_huge_page(struct hstate *h)
668} 693}
669 694
670/* 695/*
671 * helper for free_pool_huge_page() - find next node 696 * helper for free_pool_huge_page() - return the previously saved
672 * from which to free a huge page 697 * node ["this node"] from which to free a huge page. Advance the
698 * next node id whether or not we find a free huge page to free so
699 * that the next attempt to free addresses the next node.
673 */ 700 */
674static int hstate_next_node_to_free(struct hstate *h) 701static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
675{ 702{
676 int next_nid; 703 int nid;
677 next_nid = next_node(h->next_nid_to_free, node_online_map); 704
678 if (next_nid == MAX_NUMNODES) 705 VM_BUG_ON(!nodes_allowed);
679 next_nid = first_node(node_online_map); 706
680 h->next_nid_to_free = next_nid; 707 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
681 return next_nid; 708 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
709
710 return nid;
682} 711}
683 712
684/* 713/*
@@ -687,13 +716,14 @@ static int hstate_next_node_to_free(struct hstate *h)
687 * balanced over allowed nodes. 716 * balanced over allowed nodes.
688 * Called with hugetlb_lock locked. 717 * Called with hugetlb_lock locked.
689 */ 718 */
690static int free_pool_huge_page(struct hstate *h, bool acct_surplus) 719static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
720 bool acct_surplus)
691{ 721{
692 int start_nid; 722 int start_nid;
693 int next_nid; 723 int next_nid;
694 int ret = 0; 724 int ret = 0;
695 725
696 start_nid = h->next_nid_to_free; 726 start_nid = hstate_next_node_to_free(h, nodes_allowed);
697 next_nid = start_nid; 727 next_nid = start_nid;
698 728
699 do { 729 do {
@@ -715,9 +745,10 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
715 } 745 }
716 update_and_free_page(h, page); 746 update_and_free_page(h, page);
717 ret = 1; 747 ret = 1;
748 break;
718 } 749 }
719 next_nid = hstate_next_node_to_free(h); 750 next_nid = hstate_next_node_to_free(h, nodes_allowed);
720 } while (!ret && next_nid != start_nid); 751 } while (next_nid != start_nid);
721 752
722 return ret; 753 return ret;
723} 754}
@@ -911,14 +942,14 @@ static void return_unused_surplus_pages(struct hstate *h,
911 942
912 /* 943 /*
913 * We want to release as many surplus pages as possible, spread 944 * We want to release as many surplus pages as possible, spread
914 * evenly across all nodes. Iterate across all nodes until we 945 * evenly across all nodes with memory. Iterate across these nodes
915 * can no longer free unreserved surplus pages. This occurs when 946 * until we can no longer free unreserved surplus pages. This occurs
916 * the nodes with surplus pages have no free pages. 947 * when the nodes with surplus pages have no free pages.
917 * free_pool_huge_page() will balance the the frees across the 948 * free_pool_huge_page() will balance the the freed pages across the
918 * on-line nodes for us and will handle the hstate accounting. 949 * on-line nodes with memory and will handle the hstate accounting.
919 */ 950 */
920 while (nr_pages--) { 951 while (nr_pages--) {
921 if (!free_pool_huge_page(h, 1)) 952 if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1))
922 break; 953 break;
923 } 954 }
924} 955}
@@ -1022,16 +1053,16 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1022int __weak alloc_bootmem_huge_page(struct hstate *h) 1053int __weak alloc_bootmem_huge_page(struct hstate *h)
1023{ 1054{
1024 struct huge_bootmem_page *m; 1055 struct huge_bootmem_page *m;
1025 int nr_nodes = nodes_weight(node_online_map); 1056 int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
1026 1057
1027 while (nr_nodes) { 1058 while (nr_nodes) {
1028 void *addr; 1059 void *addr;
1029 1060
1030 addr = __alloc_bootmem_node_nopanic( 1061 addr = __alloc_bootmem_node_nopanic(
1031 NODE_DATA(h->next_nid_to_alloc), 1062 NODE_DATA(hstate_next_node_to_alloc(h,
1063 &node_states[N_HIGH_MEMORY])),
1032 huge_page_size(h), huge_page_size(h), 0); 1064 huge_page_size(h), huge_page_size(h), 0);
1033 1065
1034 hstate_next_node_to_alloc(h);
1035 if (addr) { 1066 if (addr) {
1036 /* 1067 /*
1037 * Use the beginning of the huge page to store the 1068 * Use the beginning of the huge page to store the
@@ -1084,7 +1115,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1084 if (h->order >= MAX_ORDER) { 1115 if (h->order >= MAX_ORDER) {
1085 if (!alloc_bootmem_huge_page(h)) 1116 if (!alloc_bootmem_huge_page(h))
1086 break; 1117 break;
1087 } else if (!alloc_fresh_huge_page(h)) 1118 } else if (!alloc_fresh_huge_page(h,
1119 &node_states[N_HIGH_MEMORY]))
1088 break; 1120 break;
1089 } 1121 }
1090 h->max_huge_pages = i; 1122 h->max_huge_pages = i;
@@ -1126,14 +1158,15 @@ static void __init report_hugepages(void)
1126} 1158}
1127 1159
1128#ifdef CONFIG_HIGHMEM 1160#ifdef CONFIG_HIGHMEM
1129static void try_to_free_low(struct hstate *h, unsigned long count) 1161static void try_to_free_low(struct hstate *h, unsigned long count,
1162 nodemask_t *nodes_allowed)
1130{ 1163{
1131 int i; 1164 int i;
1132 1165
1133 if (h->order >= MAX_ORDER) 1166 if (h->order >= MAX_ORDER)
1134 return; 1167 return;
1135 1168
1136 for (i = 0; i < MAX_NUMNODES; ++i) { 1169 for_each_node_mask(i, *nodes_allowed) {
1137 struct page *page, *next; 1170 struct page *page, *next;
1138 struct list_head *freel = &h->hugepage_freelists[i]; 1171 struct list_head *freel = &h->hugepage_freelists[i];
1139 list_for_each_entry_safe(page, next, freel, lru) { 1172 list_for_each_entry_safe(page, next, freel, lru) {
@@ -1149,7 +1182,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count)
1149 } 1182 }
1150} 1183}
1151#else 1184#else
1152static inline void try_to_free_low(struct hstate *h, unsigned long count) 1185static inline void try_to_free_low(struct hstate *h, unsigned long count,
1186 nodemask_t *nodes_allowed)
1153{ 1187{
1154} 1188}
1155#endif 1189#endif
@@ -1159,7 +1193,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
1159 * balanced by operating on them in a round-robin fashion. 1193 * balanced by operating on them in a round-robin fashion.
1160 * Returns 1 if an adjustment was made. 1194 * Returns 1 if an adjustment was made.
1161 */ 1195 */
1162static int adjust_pool_surplus(struct hstate *h, int delta) 1196static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
1197 int delta)
1163{ 1198{
1164 int start_nid, next_nid; 1199 int start_nid, next_nid;
1165 int ret = 0; 1200 int ret = 0;
@@ -1167,29 +1202,33 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
1167 VM_BUG_ON(delta != -1 && delta != 1); 1202 VM_BUG_ON(delta != -1 && delta != 1);
1168 1203
1169 if (delta < 0) 1204 if (delta < 0)
1170 start_nid = h->next_nid_to_alloc; 1205 start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
1171 else 1206 else
1172 start_nid = h->next_nid_to_free; 1207 start_nid = hstate_next_node_to_free(h, nodes_allowed);
1173 next_nid = start_nid; 1208 next_nid = start_nid;
1174 1209
1175 do { 1210 do {
1176 int nid = next_nid; 1211 int nid = next_nid;
1177 if (delta < 0) { 1212 if (delta < 0) {
1178 next_nid = hstate_next_node_to_alloc(h);
1179 /* 1213 /*
1180 * To shrink on this node, there must be a surplus page 1214 * To shrink on this node, there must be a surplus page
1181 */ 1215 */
1182 if (!h->surplus_huge_pages_node[nid]) 1216 if (!h->surplus_huge_pages_node[nid]) {
1217 next_nid = hstate_next_node_to_alloc(h,
1218 nodes_allowed);
1183 continue; 1219 continue;
1220 }
1184 } 1221 }
1185 if (delta > 0) { 1222 if (delta > 0) {
1186 next_nid = hstate_next_node_to_free(h);
1187 /* 1223 /*
1188 * Surplus cannot exceed the total number of pages 1224 * Surplus cannot exceed the total number of pages
1189 */ 1225 */
1190 if (h->surplus_huge_pages_node[nid] >= 1226 if (h->surplus_huge_pages_node[nid] >=
1191 h->nr_huge_pages_node[nid]) 1227 h->nr_huge_pages_node[nid]) {
1228 next_nid = hstate_next_node_to_free(h,
1229 nodes_allowed);
1192 continue; 1230 continue;
1231 }
1193 } 1232 }
1194 1233
1195 h->surplus_huge_pages += delta; 1234 h->surplus_huge_pages += delta;
@@ -1202,7 +1241,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
1202} 1241}
1203 1242
1204#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 1243#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
1205static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) 1244static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
1245 nodemask_t *nodes_allowed)
1206{ 1246{
1207 unsigned long min_count, ret; 1247 unsigned long min_count, ret;
1208 1248
@@ -1222,7 +1262,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1222 */ 1262 */
1223 spin_lock(&hugetlb_lock); 1263 spin_lock(&hugetlb_lock);
1224 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 1264 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
1225 if (!adjust_pool_surplus(h, -1)) 1265 if (!adjust_pool_surplus(h, nodes_allowed, -1))
1226 break; 1266 break;
1227 } 1267 }
1228 1268
@@ -1233,11 +1273,14 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1233 * and reducing the surplus. 1273 * and reducing the surplus.
1234 */ 1274 */
1235 spin_unlock(&hugetlb_lock); 1275 spin_unlock(&hugetlb_lock);
1236 ret = alloc_fresh_huge_page(h); 1276 ret = alloc_fresh_huge_page(h, nodes_allowed);
1237 spin_lock(&hugetlb_lock); 1277 spin_lock(&hugetlb_lock);
1238 if (!ret) 1278 if (!ret)
1239 goto out; 1279 goto out;
1240 1280
1281 /* Bail for signals. Probably ctrl-c from user */
1282 if (signal_pending(current))
1283 goto out;
1241 } 1284 }
1242 1285
1243 /* 1286 /*
@@ -1257,13 +1300,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1257 */ 1300 */
1258 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; 1301 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
1259 min_count = max(count, min_count); 1302 min_count = max(count, min_count);
1260 try_to_free_low(h, min_count); 1303 try_to_free_low(h, min_count, nodes_allowed);
1261 while (min_count < persistent_huge_pages(h)) { 1304 while (min_count < persistent_huge_pages(h)) {
1262 if (!free_pool_huge_page(h, 0)) 1305 if (!free_pool_huge_page(h, nodes_allowed, 0))
1263 break; 1306 break;
1264 } 1307 }
1265 while (count < persistent_huge_pages(h)) { 1308 while (count < persistent_huge_pages(h)) {
1266 if (!adjust_pool_surplus(h, 1)) 1309 if (!adjust_pool_surplus(h, nodes_allowed, 1))
1267 break; 1310 break;
1268 } 1311 }
1269out: 1312out:
@@ -1282,43 +1325,117 @@ out:
1282static struct kobject *hugepages_kobj; 1325static struct kobject *hugepages_kobj;
1283static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 1326static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1284 1327
1285static struct hstate *kobj_to_hstate(struct kobject *kobj) 1328static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
1329
1330static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
1286{ 1331{
1287 int i; 1332 int i;
1333
1288 for (i = 0; i < HUGE_MAX_HSTATE; i++) 1334 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1289 if (hstate_kobjs[i] == kobj) 1335 if (hstate_kobjs[i] == kobj) {
1336 if (nidp)
1337 *nidp = NUMA_NO_NODE;
1290 return &hstates[i]; 1338 return &hstates[i];
1291 BUG(); 1339 }
1292 return NULL; 1340
1341 return kobj_to_node_hstate(kobj, nidp);
1293} 1342}
1294 1343
1295static ssize_t nr_hugepages_show(struct kobject *kobj, 1344static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1296 struct kobj_attribute *attr, char *buf) 1345 struct kobj_attribute *attr, char *buf)
1297{ 1346{
1298 struct hstate *h = kobj_to_hstate(kobj); 1347 struct hstate *h;
1299 return sprintf(buf, "%lu\n", h->nr_huge_pages); 1348 unsigned long nr_huge_pages;
1349 int nid;
1350
1351 h = kobj_to_hstate(kobj, &nid);
1352 if (nid == NUMA_NO_NODE)
1353 nr_huge_pages = h->nr_huge_pages;
1354 else
1355 nr_huge_pages = h->nr_huge_pages_node[nid];
1356
1357 return sprintf(buf, "%lu\n", nr_huge_pages);
1300} 1358}
1301static ssize_t nr_hugepages_store(struct kobject *kobj, 1359static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1302 struct kobj_attribute *attr, const char *buf, size_t count) 1360 struct kobject *kobj, struct kobj_attribute *attr,
1361 const char *buf, size_t len)
1303{ 1362{
1304 int err; 1363 int err;
1305 unsigned long input; 1364 int nid;
1306 struct hstate *h = kobj_to_hstate(kobj); 1365 unsigned long count;
1366 struct hstate *h;
1367 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
1307 1368
1308 err = strict_strtoul(buf, 10, &input); 1369 err = strict_strtoul(buf, 10, &count);
1309 if (err) 1370 if (err)
1310 return 0; 1371 return 0;
1311 1372
1312 h->max_huge_pages = set_max_huge_pages(h, input); 1373 h = kobj_to_hstate(kobj, &nid);
1374 if (nid == NUMA_NO_NODE) {
1375 /*
1376 * global hstate attribute
1377 */
1378 if (!(obey_mempolicy &&
1379 init_nodemask_of_mempolicy(nodes_allowed))) {
1380 NODEMASK_FREE(nodes_allowed);
1381 nodes_allowed = &node_states[N_HIGH_MEMORY];
1382 }
1383 } else if (nodes_allowed) {
1384 /*
1385 * per node hstate attribute: adjust count to global,
1386 * but restrict alloc/free to the specified node.
1387 */
1388 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
1389 init_nodemask_of_node(nodes_allowed, nid);
1390 } else
1391 nodes_allowed = &node_states[N_HIGH_MEMORY];
1392
1393 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
1313 1394
1314 return count; 1395 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1396 NODEMASK_FREE(nodes_allowed);
1397
1398 return len;
1399}
1400
1401static ssize_t nr_hugepages_show(struct kobject *kobj,
1402 struct kobj_attribute *attr, char *buf)
1403{
1404 return nr_hugepages_show_common(kobj, attr, buf);
1405}
1406
1407static ssize_t nr_hugepages_store(struct kobject *kobj,
1408 struct kobj_attribute *attr, const char *buf, size_t len)
1409{
1410 return nr_hugepages_store_common(false, kobj, attr, buf, len);
1315} 1411}
1316HSTATE_ATTR(nr_hugepages); 1412HSTATE_ATTR(nr_hugepages);
1317 1413
1414#ifdef CONFIG_NUMA
1415
1416/*
1417 * hstate attribute for optionally mempolicy-based constraint on persistent
1418 * huge page alloc/free.
1419 */
1420static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
1421 struct kobj_attribute *attr, char *buf)
1422{
1423 return nr_hugepages_show_common(kobj, attr, buf);
1424}
1425
1426static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
1427 struct kobj_attribute *attr, const char *buf, size_t len)
1428{
1429 return nr_hugepages_store_common(true, kobj, attr, buf, len);
1430}
1431HSTATE_ATTR(nr_hugepages_mempolicy);
1432#endif
1433
1434
1318static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, 1435static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1319 struct kobj_attribute *attr, char *buf) 1436 struct kobj_attribute *attr, char *buf)
1320{ 1437{
1321 struct hstate *h = kobj_to_hstate(kobj); 1438 struct hstate *h = kobj_to_hstate(kobj, NULL);
1322 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 1439 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1323} 1440}
1324static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 1441static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
@@ -1326,7 +1443,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1326{ 1443{
1327 int err; 1444 int err;
1328 unsigned long input; 1445 unsigned long input;
1329 struct hstate *h = kobj_to_hstate(kobj); 1446 struct hstate *h = kobj_to_hstate(kobj, NULL);
1330 1447
1331 err = strict_strtoul(buf, 10, &input); 1448 err = strict_strtoul(buf, 10, &input);
1332 if (err) 1449 if (err)
@@ -1343,15 +1460,24 @@ HSTATE_ATTR(nr_overcommit_hugepages);
1343static ssize_t free_hugepages_show(struct kobject *kobj, 1460static ssize_t free_hugepages_show(struct kobject *kobj,
1344 struct kobj_attribute *attr, char *buf) 1461 struct kobj_attribute *attr, char *buf)
1345{ 1462{
1346 struct hstate *h = kobj_to_hstate(kobj); 1463 struct hstate *h;
1347 return sprintf(buf, "%lu\n", h->free_huge_pages); 1464 unsigned long free_huge_pages;
1465 int nid;
1466
1467 h = kobj_to_hstate(kobj, &nid);
1468 if (nid == NUMA_NO_NODE)
1469 free_huge_pages = h->free_huge_pages;
1470 else
1471 free_huge_pages = h->free_huge_pages_node[nid];
1472
1473 return sprintf(buf, "%lu\n", free_huge_pages);
1348} 1474}
1349HSTATE_ATTR_RO(free_hugepages); 1475HSTATE_ATTR_RO(free_hugepages);
1350 1476
1351static ssize_t resv_hugepages_show(struct kobject *kobj, 1477static ssize_t resv_hugepages_show(struct kobject *kobj,
1352 struct kobj_attribute *attr, char *buf) 1478 struct kobj_attribute *attr, char *buf)
1353{ 1479{
1354 struct hstate *h = kobj_to_hstate(kobj); 1480 struct hstate *h = kobj_to_hstate(kobj, NULL);
1355 return sprintf(buf, "%lu\n", h->resv_huge_pages); 1481 return sprintf(buf, "%lu\n", h->resv_huge_pages);
1356} 1482}
1357HSTATE_ATTR_RO(resv_hugepages); 1483HSTATE_ATTR_RO(resv_hugepages);
@@ -1359,8 +1485,17 @@ HSTATE_ATTR_RO(resv_hugepages);
1359static ssize_t surplus_hugepages_show(struct kobject *kobj, 1485static ssize_t surplus_hugepages_show(struct kobject *kobj,
1360 struct kobj_attribute *attr, char *buf) 1486 struct kobj_attribute *attr, char *buf)
1361{ 1487{
1362 struct hstate *h = kobj_to_hstate(kobj); 1488 struct hstate *h;
1363 return sprintf(buf, "%lu\n", h->surplus_huge_pages); 1489 unsigned long surplus_huge_pages;
1490 int nid;
1491
1492 h = kobj_to_hstate(kobj, &nid);
1493 if (nid == NUMA_NO_NODE)
1494 surplus_huge_pages = h->surplus_huge_pages;
1495 else
1496 surplus_huge_pages = h->surplus_huge_pages_node[nid];
1497
1498 return sprintf(buf, "%lu\n", surplus_huge_pages);
1364} 1499}
1365HSTATE_ATTR_RO(surplus_hugepages); 1500HSTATE_ATTR_RO(surplus_hugepages);
1366 1501
@@ -1370,6 +1505,9 @@ static struct attribute *hstate_attrs[] = {
1370 &free_hugepages_attr.attr, 1505 &free_hugepages_attr.attr,
1371 &resv_hugepages_attr.attr, 1506 &resv_hugepages_attr.attr,
1372 &surplus_hugepages_attr.attr, 1507 &surplus_hugepages_attr.attr,
1508#ifdef CONFIG_NUMA
1509 &nr_hugepages_mempolicy_attr.attr,
1510#endif
1373 NULL, 1511 NULL,
1374}; 1512};
1375 1513
@@ -1377,19 +1515,21 @@ static struct attribute_group hstate_attr_group = {
1377 .attrs = hstate_attrs, 1515 .attrs = hstate_attrs,
1378}; 1516};
1379 1517
1380static int __init hugetlb_sysfs_add_hstate(struct hstate *h) 1518static int __init hugetlb_sysfs_add_hstate(struct hstate *h,
1519 struct kobject *parent,
1520 struct kobject **hstate_kobjs,
1521 struct attribute_group *hstate_attr_group)
1381{ 1522{
1382 int retval; 1523 int retval;
1524 int hi = h - hstates;
1383 1525
1384 hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, 1526 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
1385 hugepages_kobj); 1527 if (!hstate_kobjs[hi])
1386 if (!hstate_kobjs[h - hstates])
1387 return -ENOMEM; 1528 return -ENOMEM;
1388 1529
1389 retval = sysfs_create_group(hstate_kobjs[h - hstates], 1530 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
1390 &hstate_attr_group);
1391 if (retval) 1531 if (retval)
1392 kobject_put(hstate_kobjs[h - hstates]); 1532 kobject_put(hstate_kobjs[hi]);
1393 1533
1394 return retval; 1534 return retval;
1395} 1535}
@@ -1404,17 +1544,184 @@ static void __init hugetlb_sysfs_init(void)
1404 return; 1544 return;
1405 1545
1406 for_each_hstate(h) { 1546 for_each_hstate(h) {
1407 err = hugetlb_sysfs_add_hstate(h); 1547 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
1548 hstate_kobjs, &hstate_attr_group);
1408 if (err) 1549 if (err)
1409 printk(KERN_ERR "Hugetlb: Unable to add hstate %s", 1550 printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
1410 h->name); 1551 h->name);
1411 } 1552 }
1412} 1553}
1413 1554
1555#ifdef CONFIG_NUMA
1556
1557/*
1558 * node_hstate/s - associate per node hstate attributes, via their kobjects,
1559 * with node sysdevs in node_devices[] using a parallel array. The array
1560 * index of a node sysdev or _hstate == node id.
1561 * This is here to avoid any static dependency of the node sysdev driver, in
1562 * the base kernel, on the hugetlb module.
1563 */
1564struct node_hstate {
1565 struct kobject *hugepages_kobj;
1566 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1567};
1568struct node_hstate node_hstates[MAX_NUMNODES];
1569
1570/*
1571 * A subset of global hstate attributes for node sysdevs
1572 */
1573static struct attribute *per_node_hstate_attrs[] = {
1574 &nr_hugepages_attr.attr,
1575 &free_hugepages_attr.attr,
1576 &surplus_hugepages_attr.attr,
1577 NULL,
1578};
1579
1580static struct attribute_group per_node_hstate_attr_group = {
1581 .attrs = per_node_hstate_attrs,
1582};
1583
1584/*
1585 * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj.
1586 * Returns node id via non-NULL nidp.
1587 */
1588static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1589{
1590 int nid;
1591
1592 for (nid = 0; nid < nr_node_ids; nid++) {
1593 struct node_hstate *nhs = &node_hstates[nid];
1594 int i;
1595 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1596 if (nhs->hstate_kobjs[i] == kobj) {
1597 if (nidp)
1598 *nidp = nid;
1599 return &hstates[i];
1600 }
1601 }
1602
1603 BUG();
1604 return NULL;
1605}
1606
1607/*
1608 * Unregister hstate attributes from a single node sysdev.
1609 * No-op if no hstate attributes attached.
1610 */
1611void hugetlb_unregister_node(struct node *node)
1612{
1613 struct hstate *h;
1614 struct node_hstate *nhs = &node_hstates[node->sysdev.id];
1615
1616 if (!nhs->hugepages_kobj)
1617 return; /* no hstate attributes */
1618
1619 for_each_hstate(h)
1620 if (nhs->hstate_kobjs[h - hstates]) {
1621 kobject_put(nhs->hstate_kobjs[h - hstates]);
1622 nhs->hstate_kobjs[h - hstates] = NULL;
1623 }
1624
1625 kobject_put(nhs->hugepages_kobj);
1626 nhs->hugepages_kobj = NULL;
1627}
1628
1629/*
1630 * hugetlb module exit: unregister hstate attributes from node sysdevs
1631 * that have them.
1632 */
1633static void hugetlb_unregister_all_nodes(void)
1634{
1635 int nid;
1636
1637 /*
1638 * disable node sysdev registrations.
1639 */
1640 register_hugetlbfs_with_node(NULL, NULL);
1641
1642 /*
1643 * remove hstate attributes from any nodes that have them.
1644 */
1645 for (nid = 0; nid < nr_node_ids; nid++)
1646 hugetlb_unregister_node(&node_devices[nid]);
1647}
1648
1649/*
1650 * Register hstate attributes for a single node sysdev.
1651 * No-op if attributes already registered.
1652 */
1653void hugetlb_register_node(struct node *node)
1654{
1655 struct hstate *h;
1656 struct node_hstate *nhs = &node_hstates[node->sysdev.id];
1657 int err;
1658
1659 if (nhs->hugepages_kobj)
1660 return; /* already allocated */
1661
1662 nhs->hugepages_kobj = kobject_create_and_add("hugepages",
1663 &node->sysdev.kobj);
1664 if (!nhs->hugepages_kobj)
1665 return;
1666
1667 for_each_hstate(h) {
1668 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
1669 nhs->hstate_kobjs,
1670 &per_node_hstate_attr_group);
1671 if (err) {
1672 printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
1673 " for node %d\n",
1674 h->name, node->sysdev.id);
1675 hugetlb_unregister_node(node);
1676 break;
1677 }
1678 }
1679}
1680
1681/*
1682 * hugetlb init time: register hstate attributes for all registered node
1683 * sysdevs of nodes that have memory. All on-line nodes should have
1684 * registered their associated sysdev by this time.
1685 */
1686static void hugetlb_register_all_nodes(void)
1687{
1688 int nid;
1689
1690 for_each_node_state(nid, N_HIGH_MEMORY) {
1691 struct node *node = &node_devices[nid];
1692 if (node->sysdev.id == nid)
1693 hugetlb_register_node(node);
1694 }
1695
1696 /*
1697 * Let the node sysdev driver know we're here so it can
1698 * [un]register hstate attributes on node hotplug.
1699 */
1700 register_hugetlbfs_with_node(hugetlb_register_node,
1701 hugetlb_unregister_node);
1702}
1703#else /* !CONFIG_NUMA */
1704
1705static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1706{
1707 BUG();
1708 if (nidp)
1709 *nidp = -1;
1710 return NULL;
1711}
1712
1713static void hugetlb_unregister_all_nodes(void) { }
1714
1715static void hugetlb_register_all_nodes(void) { }
1716
1717#endif
1718
1414static void __exit hugetlb_exit(void) 1719static void __exit hugetlb_exit(void)
1415{ 1720{
1416 struct hstate *h; 1721 struct hstate *h;
1417 1722
1723 hugetlb_unregister_all_nodes();
1724
1418 for_each_hstate(h) { 1725 for_each_hstate(h) {
1419 kobject_put(hstate_kobjs[h - hstates]); 1726 kobject_put(hstate_kobjs[h - hstates]);
1420 } 1727 }
@@ -1449,6 +1756,8 @@ static int __init hugetlb_init(void)
1449 1756
1450 hugetlb_sysfs_init(); 1757 hugetlb_sysfs_init();
1451 1758
1759 hugetlb_register_all_nodes();
1760
1452 return 0; 1761 return 0;
1453} 1762}
1454module_init(hugetlb_init); 1763module_init(hugetlb_init);
@@ -1472,8 +1781,8 @@ void __init hugetlb_add_hstate(unsigned order)
1472 h->free_huge_pages = 0; 1781 h->free_huge_pages = 0;
1473 for (i = 0; i < MAX_NUMNODES; ++i) 1782 for (i = 0; i < MAX_NUMNODES; ++i)
1474 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1783 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1475 h->next_nid_to_alloc = first_node(node_online_map); 1784 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
1476 h->next_nid_to_free = first_node(node_online_map); 1785 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
1477 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1786 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1478 huge_page_size(h)/1024); 1787 huge_page_size(h)/1024);
1479 1788
@@ -1536,9 +1845,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
1536} 1845}
1537 1846
1538#ifdef CONFIG_SYSCTL 1847#ifdef CONFIG_SYSCTL
1539int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1848static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1540 void __user *buffer, 1849 struct ctl_table *table, int write,
1541 size_t *length, loff_t *ppos) 1850 void __user *buffer, size_t *length, loff_t *ppos)
1542{ 1851{
1543 struct hstate *h = &default_hstate; 1852 struct hstate *h = &default_hstate;
1544 unsigned long tmp; 1853 unsigned long tmp;
@@ -1550,12 +1859,40 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1550 table->maxlen = sizeof(unsigned long); 1859 table->maxlen = sizeof(unsigned long);
1551 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1860 proc_doulongvec_minmax(table, write, buffer, length, ppos);
1552 1861
1553 if (write) 1862 if (write) {
1554 h->max_huge_pages = set_max_huge_pages(h, tmp); 1863 NODEMASK_ALLOC(nodemask_t, nodes_allowed,
1864 GFP_KERNEL | __GFP_NORETRY);
1865 if (!(obey_mempolicy &&
1866 init_nodemask_of_mempolicy(nodes_allowed))) {
1867 NODEMASK_FREE(nodes_allowed);
1868 nodes_allowed = &node_states[N_HIGH_MEMORY];
1869 }
1870 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
1871
1872 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1873 NODEMASK_FREE(nodes_allowed);
1874 }
1555 1875
1556 return 0; 1876 return 0;
1557} 1877}
1558 1878
1879int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1880 void __user *buffer, size_t *length, loff_t *ppos)
1881{
1882
1883 return hugetlb_sysctl_handler_common(false, table, write,
1884 buffer, length, ppos);
1885}
1886
1887#ifdef CONFIG_NUMA
1888int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
1889 void __user *buffer, size_t *length, loff_t *ppos)
1890{
1891 return hugetlb_sysctl_handler_common(true, table, write,
1892 buffer, length, ppos);
1893}
1894#endif /* CONFIG_NUMA */
1895
1559int hugetlb_treat_movable_handler(struct ctl_table *table, int write, 1896int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1560 void __user *buffer, 1897 void __user *buffer,
1561 size_t *length, loff_t *ppos) 1898 size_t *length, loff_t *ppos)
@@ -1903,6 +2240,12 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
1903 + (vma->vm_pgoff >> PAGE_SHIFT); 2240 + (vma->vm_pgoff >> PAGE_SHIFT);
1904 mapping = (struct address_space *)page_private(page); 2241 mapping = (struct address_space *)page_private(page);
1905 2242
2243 /*
2244 * Take the mapping lock for the duration of the table walk. As
2245 * this mapping should be shared between all the VMAs,
2246 * __unmap_hugepage_range() is called as the lock is already held
2247 */
2248 spin_lock(&mapping->i_mmap_lock);
1906 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 2249 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1907 /* Do not unmap the current VMA */ 2250 /* Do not unmap the current VMA */
1908 if (iter_vma == vma) 2251 if (iter_vma == vma)
@@ -1916,10 +2259,11 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
1916 * from the time of fork. This would look like data corruption 2259 * from the time of fork. This would look like data corruption
1917 */ 2260 */
1918 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 2261 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
1919 unmap_hugepage_range(iter_vma, 2262 __unmap_hugepage_range(iter_vma,
1920 address, address + huge_page_size(h), 2263 address, address + huge_page_size(h),
1921 page); 2264 page);
1922 } 2265 }
2266 spin_unlock(&mapping->i_mmap_lock);
1923 2267
1924 return 1; 2268 return 1;
1925} 2269}
@@ -1959,6 +2303,9 @@ retry_avoidcopy:
1959 outside_reserve = 1; 2303 outside_reserve = 1;
1960 2304
1961 page_cache_get(old_page); 2305 page_cache_get(old_page);
2306
2307 /* Drop page_table_lock as buddy allocator may be called */
2308 spin_unlock(&mm->page_table_lock);
1962 new_page = alloc_huge_page(vma, address, outside_reserve); 2309 new_page = alloc_huge_page(vma, address, outside_reserve);
1963 2310
1964 if (IS_ERR(new_page)) { 2311 if (IS_ERR(new_page)) {
@@ -1976,19 +2323,25 @@ retry_avoidcopy:
1976 if (unmap_ref_private(mm, vma, old_page, address)) { 2323 if (unmap_ref_private(mm, vma, old_page, address)) {
1977 BUG_ON(page_count(old_page) != 1); 2324 BUG_ON(page_count(old_page) != 1);
1978 BUG_ON(huge_pte_none(pte)); 2325 BUG_ON(huge_pte_none(pte));
2326 spin_lock(&mm->page_table_lock);
1979 goto retry_avoidcopy; 2327 goto retry_avoidcopy;
1980 } 2328 }
1981 WARN_ON_ONCE(1); 2329 WARN_ON_ONCE(1);
1982 } 2330 }
1983 2331
2332 /* Caller expects lock to be held */
2333 spin_lock(&mm->page_table_lock);
1984 return -PTR_ERR(new_page); 2334 return -PTR_ERR(new_page);
1985 } 2335 }
1986 2336
1987 spin_unlock(&mm->page_table_lock);
1988 copy_huge_page(new_page, old_page, address, vma); 2337 copy_huge_page(new_page, old_page, address, vma);
1989 __SetPageUptodate(new_page); 2338 __SetPageUptodate(new_page);
1990 spin_lock(&mm->page_table_lock);
1991 2339
2340 /*
2341 * Retake the page_table_lock to check for racing updates
2342 * before the page tables are altered
2343 */
2344 spin_lock(&mm->page_table_lock);
1992 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2345 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
1993 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2346 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
1994 /* Break COW */ 2347 /* Break COW */
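The per-node hstate attributes registered above appear under each memory node's sysfs directory, alongside the existing global files under /sys/kernel/mm/hugepages. A small userspace sketch of driving the new per-node knob; the node number and the 2048 kB directory name are illustrative and depend on the system's hugepage sizes:

    #include <stdio.h>
    #include <stdlib.h>

    /*
     * Ask for 64 huge pages on node 0 through the per-node nr_hugepages
     * attribute added by this series, then read back how many the kernel
     * could actually allocate on that node.
     */
    int main(void)
    {
    	const char *path =
    		"/sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages";
    	unsigned long nr = 0;
    	FILE *f = fopen(path, "w");

    	if (!f) {
    		perror("nr_hugepages (write)");
    		return 1;
    	}
    	fprintf(f, "64\n");
    	fclose(f);

    	f = fopen(path, "r");
    	if (f && fscanf(f, "%lu", &nr) == 1)
    		printf("node0 now holds %lu huge pages\n", nr);
    	if (f)
    		fclose(f);
    	return 0;
    }
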
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index e1d85137f086..10ea71905c1f 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -3,18 +3,68 @@
3#include <linux/debugfs.h> 3#include <linux/debugfs.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/swap.h>
7#include <linux/pagemap.h>
8#include "internal.h"
6 9
7static struct dentry *hwpoison_dir, *corrupt_pfn; 10static struct dentry *hwpoison_dir;
8 11
9static int hwpoison_inject(void *data, u64 val) 12static int hwpoison_inject(void *data, u64 val)
10{ 13{
14 unsigned long pfn = val;
15 struct page *p;
16 int err;
17
18 if (!capable(CAP_SYS_ADMIN))
19 return -EPERM;
20
21 if (!hwpoison_filter_enable)
22 goto inject;
23 if (!pfn_valid(pfn))
24 return -ENXIO;
25
26 p = pfn_to_page(pfn);
27 /*
28 * This implies unable to support free buddy pages.
29 */
30 if (!get_page_unless_zero(p))
31 return 0;
32
33 if (!PageLRU(p))
34 shake_page(p, 0);
35 /*
36 * This implies unable to support non-LRU pages.
37 */
38 if (!PageLRU(p))
39 return 0;
40
41 /*
42 * do a racy check with elevated page count, to make sure PG_hwpoison
43 * will only be set for the targeted owner (or on a free page).
44 * We temporarily take page lock for try_get_mem_cgroup_from_page().
45 * __memory_failure() will redo the check reliably inside page lock.
46 */
47 lock_page(p);
48 err = hwpoison_filter(p);
49 unlock_page(p);
50 if (err)
51 return 0;
52
53inject:
54 printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
55 return __memory_failure(pfn, 18, MF_COUNT_INCREASED);
56}
57
58static int hwpoison_unpoison(void *data, u64 val)
59{
11 if (!capable(CAP_SYS_ADMIN)) 60 if (!capable(CAP_SYS_ADMIN))
12 return -EPERM; 61 return -EPERM;
13 printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); 62
14 return __memory_failure(val, 18, 0); 63 return unpoison_memory(val);
15} 64}
16 65
17DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); 66DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
67DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
18 68
19static void pfn_inject_exit(void) 69static void pfn_inject_exit(void)
20{ 70{
@@ -24,16 +74,63 @@ static void pfn_inject_exit(void)
24 74
25static int pfn_inject_init(void) 75static int pfn_inject_init(void)
26{ 76{
77 struct dentry *dentry;
78
27 hwpoison_dir = debugfs_create_dir("hwpoison", NULL); 79 hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
28 if (hwpoison_dir == NULL) 80 if (hwpoison_dir == NULL)
29 return -ENOMEM; 81 return -ENOMEM;
30 corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, 82
83 /*
84 * Note that the below poison/unpoison interfaces do not involve
85 * hardware status change, hence do not require hardware support.
86 * They are mainly for testing hwpoison in software level.
87 */
88 dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
31 NULL, &hwpoison_fops); 89 NULL, &hwpoison_fops);
32 if (corrupt_pfn == NULL) { 90 if (!dentry)
33 pfn_inject_exit(); 91 goto fail;
34 return -ENOMEM; 92
35 } 93 dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir,
94 NULL, &unpoison_fops);
95 if (!dentry)
96 goto fail;
97
98 dentry = debugfs_create_u32("corrupt-filter-enable", 0600,
99 hwpoison_dir, &hwpoison_filter_enable);
100 if (!dentry)
101 goto fail;
102
103 dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600,
104 hwpoison_dir, &hwpoison_filter_dev_major);
105 if (!dentry)
106 goto fail;
107
108 dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600,
109 hwpoison_dir, &hwpoison_filter_dev_minor);
110 if (!dentry)
111 goto fail;
112
113 dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600,
114 hwpoison_dir, &hwpoison_filter_flags_mask);
115 if (!dentry)
116 goto fail;
117
118 dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600,
119 hwpoison_dir, &hwpoison_filter_flags_value);
120 if (!dentry)
121 goto fail;
122
123#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
124 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
125 hwpoison_dir, &hwpoison_filter_memcg);
126 if (!dentry)
127 goto fail;
128#endif
129
36 return 0; 130 return 0;
131fail:
132 pfn_inject_exit();
133 return -ENOMEM;
37} 134}
38 135
39module_init(pfn_inject_init); 136module_init(pfn_inject_init);
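For reference, the injector is driven entirely through debugfs; the files created above (corrupt-pfn, unpoison-pfn, corrupt-filter-*) land in the hwpoison directory under the debugfs mount point, normally /sys/kernel/debug. A small userspace sketch, assuming that mount point and CAP_SYS_ADMIN; the pfn argument is a placeholder:

    #include <stdio.h>
    #include <stdlib.h>

    /*
     * Inject a software-simulated memory failure at the given page frame
     * number. The filter knobs (corrupt-filter-enable and friends) can be
     * written the same way before injecting; unpoison-pfn reverses a
     * software-injected poison.
     */
    static int write_value(const char *path, unsigned long long val)
    {
    	FILE *f = fopen(path, "w");

    	if (!f)
    		return -1;
    	fprintf(f, "%llu\n", val);
    	return fclose(f);
    }

    int main(int argc, char **argv)
    {
    	unsigned long long pfn;

    	if (argc < 2) {
    		fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
    		return 1;
    	}
    	pfn = strtoull(argv[1], NULL, 0);

    	if (write_value("/sys/kernel/debug/hwpoison/corrupt-pfn", pfn)) {
    		perror("corrupt-pfn");
    		return 1;
    	}
    	printf("injected memory failure at pfn 0x%llx\n", pfn);
    	return 0;
    }
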
diff --git a/mm/internal.h b/mm/internal.h
index 22ec8d2b0fb8..6a697bb97fc5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -50,6 +50,9 @@ extern void putback_lru_page(struct page *page);
50 */ 50 */
51extern void __free_pages_bootmem(struct page *page, unsigned int order); 51extern void __free_pages_bootmem(struct page *page, unsigned int order);
52extern void prep_compound_page(struct page *page, unsigned long order); 52extern void prep_compound_page(struct page *page, unsigned long order);
53#ifdef CONFIG_MEMORY_FAILURE
54extern bool is_free_buddy_page(struct page *page);
55#endif
53 56
54 57
55/* 58/*
@@ -63,7 +66,7 @@ static inline unsigned long page_order(struct page *page)
63 return page_private(page); 66 return page_private(page);
64} 67}
65 68
66#ifdef CONFIG_HAVE_MLOCK 69#ifdef CONFIG_MMU
67extern long mlock_vma_pages_range(struct vm_area_struct *vma, 70extern long mlock_vma_pages_range(struct vm_area_struct *vma,
68 unsigned long start, unsigned long end); 71 unsigned long start, unsigned long end);
69extern void munlock_vma_pages_range(struct vm_area_struct *vma, 72extern void munlock_vma_pages_range(struct vm_area_struct *vma,
@@ -72,22 +75,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
72{ 75{
73 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); 76 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
74} 77}
75#endif
76 78
77/* 79/*
78 * unevictable_migrate_page() called only from migrate_page_copy() to
79 * migrate unevictable flag to new page.
80 * Note that the old page has been isolated from the LRU lists at this
81 * point so we don't need to worry about LRU statistics.
82 */
83static inline void unevictable_migrate_page(struct page *new, struct page *old)
84{
85 if (TestClearPageUnevictable(old))
86 SetPageUnevictable(new);
87}
88
89#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
90/*
91 * Called only in fault path via page_evictable() for a new page 80 * Called only in fault path via page_evictable() for a new page
92 * to determine if it's being mapped into a LOCKED vma. 81 * to determine if it's being mapped into a LOCKED vma.
93 * If so, mark page as mlocked. 82 * If so, mark page as mlocked.
@@ -107,9 +96,10 @@ static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
107} 96}
108 97
109/* 98/*
110 * must be called with vma's mmap_sem held for read, and page locked. 99 * must be called with vma's mmap_sem held for read or write, and page locked.
111 */ 100 */
112extern void mlock_vma_page(struct page *page); 101extern void mlock_vma_page(struct page *page);
102extern void munlock_vma_page(struct page *page);
113 103
114/* 104/*
115 * Clear the page's PageMlocked(). This can be useful in a situation where 105 * Clear the page's PageMlocked(). This can be useful in a situation where
@@ -144,7 +134,7 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
144 } 134 }
145} 135}
146 136
147#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ 137#else /* !CONFIG_MMU */
148static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 138static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
149{ 139{
150 return 0; 140 return 0;
@@ -153,7 +143,7 @@ static inline void clear_page_mlock(struct page *page) { }
153static inline void mlock_vma_page(struct page *page) { } 143static inline void mlock_vma_page(struct page *page) { }
154static inline void mlock_migrate_page(struct page *new, struct page *old) { } 144static inline void mlock_migrate_page(struct page *new, struct page *old) { }
155 145
156#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ 146#endif /* !CONFIG_MMU */
157 147
158/* 148/*
159 * Return the mem_map entry representing the 'offset' subpage within 149 * Return the mem_map entry representing the 'offset' subpage within
@@ -260,3 +250,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
260#define ZONE_RECLAIM_SOME 0 250#define ZONE_RECLAIM_SOME 0
261#define ZONE_RECLAIM_SUCCESS 1 251#define ZONE_RECLAIM_SUCCESS 1
262#endif 252#endif
253
254extern int hwpoison_filter(struct page *p);
255
256extern u32 hwpoison_filter_dev_major;
257extern u32 hwpoison_filter_dev_minor;
258extern u64 hwpoison_filter_flags_mask;
259extern u64 hwpoison_filter_flags_value;
260extern u64 hwpoison_filter_memcg;
261extern u32 hwpoison_filter_enable;
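
These externs only publish the injector's filter knobs to the rest of mm/; the filtering itself is done by hwpoison_filter() in mm/memory-failure.c, which is not part of this hunk. The sketch below is a guess at the mask/value convention implied by corrupt-filter-flags-mask and corrupt-filter-flags-value, using hypothetical stand-in variables rather than the kernel's; treat it as an illustration of the idea, not the actual implementation.

#include <stdint.h>
#include <stdio.h>

/* hypothetical stand-ins for the knobs exported above */
static uint64_t filter_flags_mask;
static uint64_t filter_flags_value;

/* nonzero means: skip this page, its flags do not match the filter */
static int flags_filtered(uint64_t page_flags)
{
	if (!filter_flags_mask)
		return 0;		/* mask unset: flags filter disabled */
	return (page_flags & filter_flags_mask) != filter_flags_value;
}

int main(void)
{
	filter_flags_mask  = 0x4;	/* require an (arbitrary) flag bit */
	filter_flags_value = 0x4;
	printf("%d %d\n", flags_filtered(0x4), flags_filtered(0x0)); /* 0 1 */
	return 0;
}
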
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 13f33b3081ec..5b069e4f5e48 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -93,6 +93,7 @@
93#include <linux/nodemask.h> 93#include <linux/nodemask.h>
94#include <linux/mm.h> 94#include <linux/mm.h>
95#include <linux/workqueue.h> 95#include <linux/workqueue.h>
96#include <linux/crc32.h>
96 97
97#include <asm/sections.h> 98#include <asm/sections.h>
98#include <asm/processor.h> 99#include <asm/processor.h>
@@ -108,7 +109,6 @@
108#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ 109#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
109#define SECS_FIRST_SCAN 60 /* delay before the first scan */ 110#define SECS_FIRST_SCAN 60 /* delay before the first scan */
110#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ 111#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
111#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */
112#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ 112#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
113 113
114#define BYTES_PER_POINTER sizeof(void *) 114#define BYTES_PER_POINTER sizeof(void *)
@@ -119,8 +119,8 @@
119/* scanning area inside a memory block */ 119/* scanning area inside a memory block */
120struct kmemleak_scan_area { 120struct kmemleak_scan_area {
121 struct hlist_node node; 121 struct hlist_node node;
122 unsigned long offset; 122 unsigned long start;
123 size_t length; 123 size_t size;
124}; 124};
125 125
126#define KMEMLEAK_GREY 0 126#define KMEMLEAK_GREY 0
@@ -149,6 +149,8 @@ struct kmemleak_object {
149 int min_count; 149 int min_count;
150 /* the total number of pointers found pointing to this object */ 150 /* the total number of pointers found pointing to this object */
151 int count; 151 int count;
152 /* checksum for detecting modified objects */
153 u32 checksum;
152 /* memory ranges to be scanned inside an object (empty for all) */ 154 /* memory ranges to be scanned inside an object (empty for all) */
153 struct hlist_head area_list; 155 struct hlist_head area_list;
154 unsigned long trace[MAX_TRACE]; 156 unsigned long trace[MAX_TRACE];
@@ -164,8 +166,6 @@ struct kmemleak_object {
164#define OBJECT_REPORTED (1 << 1) 166#define OBJECT_REPORTED (1 << 1)
165/* flag set to not scan the object */ 167/* flag set to not scan the object */
166#define OBJECT_NO_SCAN (1 << 2) 168#define OBJECT_NO_SCAN (1 << 2)
167/* flag set on newly allocated objects */
168#define OBJECT_NEW (1 << 3)
169 169
170/* number of bytes to print per line; must be 16 or 32 */ 170/* number of bytes to print per line; must be 16 or 32 */
171#define HEX_ROW_SIZE 16 171#define HEX_ROW_SIZE 16
@@ -241,8 +241,6 @@ struct early_log {
241 const void *ptr; /* allocated/freed memory block */ 241 const void *ptr; /* allocated/freed memory block */
242 size_t size; /* memory block size */ 242 size_t size; /* memory block size */
243 int min_count; /* minimum reference count */ 243 int min_count; /* minimum reference count */
244 unsigned long offset; /* scan area offset */
245 size_t length; /* scan area length */
246 unsigned long trace[MAX_TRACE]; /* stack trace */ 244 unsigned long trace[MAX_TRACE]; /* stack trace */
247 unsigned int trace_len; /* stack trace length */ 245 unsigned int trace_len; /* stack trace length */
248}; 246};
@@ -323,11 +321,6 @@ static bool color_gray(const struct kmemleak_object *object)
323 object->count >= object->min_count; 321 object->count >= object->min_count;
324} 322}
325 323
326static bool color_black(const struct kmemleak_object *object)
327{
328 return object->min_count == KMEMLEAK_BLACK;
329}
330
331/* 324/*
332 * Objects are considered unreferenced only if their color is white, they have 325 * Objects are considered unreferenced only if their color is white, they have
333 * not be deleted and have a minimum age to avoid false positives caused by 326 * not be deleted and have a minimum age to avoid false positives caused by
@@ -335,7 +328,7 @@ static bool color_black(const struct kmemleak_object *object)
335 */ 328 */
336static bool unreferenced_object(struct kmemleak_object *object) 329static bool unreferenced_object(struct kmemleak_object *object)
337{ 330{
338 return (object->flags & OBJECT_ALLOCATED) && color_white(object) && 331 return (color_white(object) && object->flags & OBJECT_ALLOCATED) &&
339 time_before_eq(object->jiffies + jiffies_min_age, 332 time_before_eq(object->jiffies + jiffies_min_age,
340 jiffies_last_scan); 333 jiffies_last_scan);
341} 334}
@@ -348,11 +341,13 @@ static void print_unreferenced(struct seq_file *seq,
348 struct kmemleak_object *object) 341 struct kmemleak_object *object)
349{ 342{
350 int i; 343 int i;
344 unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);
351 345
352 seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", 346 seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
353 object->pointer, object->size); 347 object->pointer, object->size);
354 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", 348 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
355 object->comm, object->pid, object->jiffies); 349 object->comm, object->pid, object->jiffies,
350 msecs_age / 1000, msecs_age % 1000);
356 hex_dump_object(seq, object); 351 hex_dump_object(seq, object);
357 seq_printf(seq, " backtrace:\n"); 352 seq_printf(seq, " backtrace:\n");
358 353
@@ -381,6 +376,7 @@ static void dump_object_info(struct kmemleak_object *object)
381 pr_notice(" min_count = %d\n", object->min_count); 376 pr_notice(" min_count = %d\n", object->min_count);
382 pr_notice(" count = %d\n", object->count); 377 pr_notice(" count = %d\n", object->count);
383 pr_notice(" flags = 0x%lx\n", object->flags); 378 pr_notice(" flags = 0x%lx\n", object->flags);
379 pr_notice(" checksum = %d\n", object->checksum);
384 pr_notice(" backtrace:\n"); 380 pr_notice(" backtrace:\n");
385 print_stack_trace(&trace, 4); 381 print_stack_trace(&trace, 4);
386} 382}
@@ -522,12 +518,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
522 INIT_HLIST_HEAD(&object->area_list); 518 INIT_HLIST_HEAD(&object->area_list);
523 spin_lock_init(&object->lock); 519 spin_lock_init(&object->lock);
524 atomic_set(&object->use_count, 1); 520 atomic_set(&object->use_count, 1);
525 object->flags = OBJECT_ALLOCATED | OBJECT_NEW; 521 object->flags = OBJECT_ALLOCATED;
526 object->pointer = ptr; 522 object->pointer = ptr;
527 object->size = size; 523 object->size = size;
528 object->min_count = min_count; 524 object->min_count = min_count;
529 object->count = -1; /* no color initially */ 525 object->count = 0; /* white color initially */
530 object->jiffies = jiffies; 526 object->jiffies = jiffies;
527 object->checksum = 0;
531 528
532 /* task information */ 529 /* task information */
533 if (in_irq()) { 530 if (in_irq()) {
@@ -720,14 +717,13 @@ static void make_black_object(unsigned long ptr)
720 * Add a scanning area to the object. If at least one such area is added, 717 * Add a scanning area to the object. If at least one such area is added,
721 * kmemleak will only scan these ranges rather than the whole memory block. 718 * kmemleak will only scan these ranges rather than the whole memory block.
722 */ 719 */
723static void add_scan_area(unsigned long ptr, unsigned long offset, 720static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
724 size_t length, gfp_t gfp)
725{ 721{
726 unsigned long flags; 722 unsigned long flags;
727 struct kmemleak_object *object; 723 struct kmemleak_object *object;
728 struct kmemleak_scan_area *area; 724 struct kmemleak_scan_area *area;
729 725
730 object = find_and_get_object(ptr, 0); 726 object = find_and_get_object(ptr, 1);
731 if (!object) { 727 if (!object) {
732 kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", 728 kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n",
733 ptr); 729 ptr);
@@ -741,7 +737,7 @@ static void add_scan_area(unsigned long ptr, unsigned long offset,
741 } 737 }
742 738
743 spin_lock_irqsave(&object->lock, flags); 739 spin_lock_irqsave(&object->lock, flags);
744 if (offset + length > object->size) { 740 if (ptr + size > object->pointer + object->size) {
745 kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); 741 kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
746 dump_object_info(object); 742 dump_object_info(object);
747 kmem_cache_free(scan_area_cache, area); 743 kmem_cache_free(scan_area_cache, area);
@@ -749,8 +745,8 @@ static void add_scan_area(unsigned long ptr, unsigned long offset,
749 } 745 }
750 746
751 INIT_HLIST_NODE(&area->node); 747 INIT_HLIST_NODE(&area->node);
752 area->offset = offset; 748 area->start = ptr;
753 area->length = length; 749 area->size = size;
754 750
755 hlist_add_head(&area->node, &object->area_list); 751 hlist_add_head(&area->node, &object->area_list);
756out_unlock: 752out_unlock:
@@ -786,7 +782,7 @@ static void object_no_scan(unsigned long ptr)
786 * processed later once kmemleak is fully initialized. 782 * processed later once kmemleak is fully initialized.
787 */ 783 */
788static void __init log_early(int op_type, const void *ptr, size_t size, 784static void __init log_early(int op_type, const void *ptr, size_t size,
789 int min_count, unsigned long offset, size_t length) 785 int min_count)
790{ 786{
791 unsigned long flags; 787 unsigned long flags;
792 struct early_log *log; 788 struct early_log *log;
@@ -808,8 +804,6 @@ static void __init log_early(int op_type, const void *ptr, size_t size,
808 log->ptr = ptr; 804 log->ptr = ptr;
809 log->size = size; 805 log->size = size;
810 log->min_count = min_count; 806 log->min_count = min_count;
811 log->offset = offset;
812 log->length = length;
813 if (op_type == KMEMLEAK_ALLOC) 807 if (op_type == KMEMLEAK_ALLOC)
814 log->trace_len = __save_stack_trace(log->trace); 808 log->trace_len = __save_stack_trace(log->trace);
815 crt_early_log++; 809 crt_early_log++;
@@ -858,7 +852,7 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
858 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 852 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
859 create_object((unsigned long)ptr, size, min_count, gfp); 853 create_object((unsigned long)ptr, size, min_count, gfp);
860 else if (atomic_read(&kmemleak_early_log)) 854 else if (atomic_read(&kmemleak_early_log))
861 log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); 855 log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
862} 856}
863EXPORT_SYMBOL_GPL(kmemleak_alloc); 857EXPORT_SYMBOL_GPL(kmemleak_alloc);
864 858
@@ -873,7 +867,7 @@ void __ref kmemleak_free(const void *ptr)
873 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 867 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
874 delete_object_full((unsigned long)ptr); 868 delete_object_full((unsigned long)ptr);
875 else if (atomic_read(&kmemleak_early_log)) 869 else if (atomic_read(&kmemleak_early_log))
876 log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); 870 log_early(KMEMLEAK_FREE, ptr, 0, 0);
877} 871}
878EXPORT_SYMBOL_GPL(kmemleak_free); 872EXPORT_SYMBOL_GPL(kmemleak_free);
879 873
@@ -888,7 +882,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
888 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 882 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
889 delete_object_part((unsigned long)ptr, size); 883 delete_object_part((unsigned long)ptr, size);
890 else if (atomic_read(&kmemleak_early_log)) 884 else if (atomic_read(&kmemleak_early_log))
891 log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); 885 log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
892} 886}
893EXPORT_SYMBOL_GPL(kmemleak_free_part); 887EXPORT_SYMBOL_GPL(kmemleak_free_part);
894 888
@@ -903,7 +897,7 @@ void __ref kmemleak_not_leak(const void *ptr)
903 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 897 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
904 make_gray_object((unsigned long)ptr); 898 make_gray_object((unsigned long)ptr);
905 else if (atomic_read(&kmemleak_early_log)) 899 else if (atomic_read(&kmemleak_early_log))
906 log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); 900 log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
907} 901}
908EXPORT_SYMBOL(kmemleak_not_leak); 902EXPORT_SYMBOL(kmemleak_not_leak);
909 903
@@ -919,22 +913,21 @@ void __ref kmemleak_ignore(const void *ptr)
919 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 913 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
920 make_black_object((unsigned long)ptr); 914 make_black_object((unsigned long)ptr);
921 else if (atomic_read(&kmemleak_early_log)) 915 else if (atomic_read(&kmemleak_early_log))
922 log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); 916 log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
923} 917}
924EXPORT_SYMBOL(kmemleak_ignore); 918EXPORT_SYMBOL(kmemleak_ignore);
925 919
926/* 920/*
927 * Limit the range to be scanned in an allocated memory block. 921 * Limit the range to be scanned in an allocated memory block.
928 */ 922 */
929void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, 923void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
930 size_t length, gfp_t gfp)
931{ 924{
932 pr_debug("%s(0x%p)\n", __func__, ptr); 925 pr_debug("%s(0x%p)\n", __func__, ptr);
933 926
934 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 927 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
935 add_scan_area((unsigned long)ptr, offset, length, gfp); 928 add_scan_area((unsigned long)ptr, size, gfp);
936 else if (atomic_read(&kmemleak_early_log)) 929 else if (atomic_read(&kmemleak_early_log))
937 log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); 930 log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
938} 931}
939EXPORT_SYMBOL(kmemleak_scan_area); 932EXPORT_SYMBOL(kmemleak_scan_area);
940 933
@@ -948,11 +941,25 @@ void __ref kmemleak_no_scan(const void *ptr)
948 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 941 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
949 object_no_scan((unsigned long)ptr); 942 object_no_scan((unsigned long)ptr);
950 else if (atomic_read(&kmemleak_early_log)) 943 else if (atomic_read(&kmemleak_early_log))
951 log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); 944 log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
952} 945}
953EXPORT_SYMBOL(kmemleak_no_scan); 946EXPORT_SYMBOL(kmemleak_no_scan);
954 947
955/* 948/*
949 * Update an object's checksum and return true if it was modified.
950 */
951static bool update_checksum(struct kmemleak_object *object)
952{
953 u32 old_csum = object->checksum;
954
955 if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
956 return false;
957
958 object->checksum = crc32(0, (void *)object->pointer, object->size);
959 return object->checksum != old_csum;
960}
961
962/*
956 * Memory scanning is a long process and it needs to be interruptable. This 963 * Memory scanning is a long process and it needs to be interruptable. This
957 * function checks whether such interrupt condition occured. 964 * function checks whether such interrupt condition occured.
958 */ 965 */
@@ -1031,11 +1038,14 @@ static void scan_block(void *_start, void *_end,
1031 * added to the gray_list. 1038 * added to the gray_list.
1032 */ 1039 */
1033 object->count++; 1040 object->count++;
1034 if (color_gray(object)) 1041 if (color_gray(object)) {
1035 list_add_tail(&object->gray_list, &gray_list); 1042 list_add_tail(&object->gray_list, &gray_list);
1036 else 1043 spin_unlock_irqrestore(&object->lock, flags);
1037 put_object(object); 1044 continue;
1045 }
1046
1038 spin_unlock_irqrestore(&object->lock, flags); 1047 spin_unlock_irqrestore(&object->lock, flags);
1048 put_object(object);
1039 } 1049 }
1040} 1050}
1041 1051
@@ -1075,14 +1085,47 @@ static void scan_object(struct kmemleak_object *object)
1075 } 1085 }
1076 } else 1086 } else
1077 hlist_for_each_entry(area, elem, &object->area_list, node) 1087 hlist_for_each_entry(area, elem, &object->area_list, node)
1078 scan_block((void *)(object->pointer + area->offset), 1088 scan_block((void *)area->start,
1079 (void *)(object->pointer + area->offset 1089 (void *)(area->start + area->size),
1080 + area->length), object, 0); 1090 object, 0);
1081out: 1091out:
1082 spin_unlock_irqrestore(&object->lock, flags); 1092 spin_unlock_irqrestore(&object->lock, flags);
1083} 1093}
1084 1094
1085/* 1095/*
1096 * Scan the objects already referenced (gray objects). More objects will be
1097 * referenced and, if there are no memory leaks, all the objects are scanned.
1098 */
1099static void scan_gray_list(void)
1100{
1101 struct kmemleak_object *object, *tmp;
1102
1103 /*
1104 * The list traversal is safe for both tail additions and removals
1105 * from inside the loop. The kmemleak objects cannot be freed from
1106 * outside the loop because their use_count was incremented.
1107 */
1108 object = list_entry(gray_list.next, typeof(*object), gray_list);
1109 while (&object->gray_list != &gray_list) {
1110 cond_resched();
1111
1112 /* may add new objects to the list */
1113 if (!scan_should_stop())
1114 scan_object(object);
1115
1116 tmp = list_entry(object->gray_list.next, typeof(*object),
1117 gray_list);
1118
1119 /* remove the object from the list and release it */
1120 list_del(&object->gray_list);
1121 put_object(object);
1122
1123 object = tmp;
1124 }
1125 WARN_ON(!list_empty(&gray_list));
1126}
1127
1128/*
1086 * Scan data sections and all the referenced memory blocks allocated via the 1129 * Scan data sections and all the referenced memory blocks allocated via the
1087 * kernel's standard allocators. This function must be called with the 1130 * kernel's standard allocators. This function must be called with the
1088 * scan_mutex held. 1131 * scan_mutex held.
@@ -1090,10 +1133,9 @@ out:
1090static void kmemleak_scan(void) 1133static void kmemleak_scan(void)
1091{ 1134{
1092 unsigned long flags; 1135 unsigned long flags;
1093 struct kmemleak_object *object, *tmp; 1136 struct kmemleak_object *object;
1094 int i; 1137 int i;
1095 int new_leaks = 0; 1138 int new_leaks = 0;
1096 int gray_list_pass = 0;
1097 1139
1098 jiffies_last_scan = jiffies; 1140 jiffies_last_scan = jiffies;
1099 1141
@@ -1114,7 +1156,6 @@ static void kmemleak_scan(void)
1114#endif 1156#endif
1115 /* reset the reference count (whiten the object) */ 1157 /* reset the reference count (whiten the object) */
1116 object->count = 0; 1158 object->count = 0;
1117 object->flags &= ~OBJECT_NEW;
1118 if (color_gray(object) && get_object(object)) 1159 if (color_gray(object) && get_object(object))
1119 list_add_tail(&object->gray_list, &gray_list); 1160 list_add_tail(&object->gray_list, &gray_list);
1120 1161
@@ -1172,62 +1213,36 @@ static void kmemleak_scan(void)
1172 1213
1173 /* 1214 /*
1174 * Scan the objects already referenced from the sections scanned 1215 * Scan the objects already referenced from the sections scanned
1175 * above. More objects will be referenced and, if there are no memory 1216 * above.
1176 * leaks, all the objects will be scanned. The list traversal is safe
1177 * for both tail additions and removals from inside the loop. The
1178 * kmemleak objects cannot be freed from outside the loop because their
1179 * use_count was increased.
1180 */ 1217 */
1181repeat: 1218 scan_gray_list();
1182 object = list_entry(gray_list.next, typeof(*object), gray_list);
1183 while (&object->gray_list != &gray_list) {
1184 cond_resched();
1185
1186 /* may add new objects to the list */
1187 if (!scan_should_stop())
1188 scan_object(object);
1189
1190 tmp = list_entry(object->gray_list.next, typeof(*object),
1191 gray_list);
1192
1193 /* remove the object from the list and release it */
1194 list_del(&object->gray_list);
1195 put_object(object);
1196
1197 object = tmp;
1198 }
1199
1200 if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES)
1201 goto scan_end;
1202 1219
1203 /* 1220 /*
1204 * Check for new objects allocated during this scanning and add them 1221 * Check for new or unreferenced objects modified since the previous
1205 * to the gray list. 1222 * scan and color them gray until the next scan.
1206 */ 1223 */
1207 rcu_read_lock(); 1224 rcu_read_lock();
1208 list_for_each_entry_rcu(object, &object_list, object_list) { 1225 list_for_each_entry_rcu(object, &object_list, object_list) {
1209 spin_lock_irqsave(&object->lock, flags); 1226 spin_lock_irqsave(&object->lock, flags);
1210 if ((object->flags & OBJECT_NEW) && !color_black(object) && 1227 if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
1211 get_object(object)) { 1228 && update_checksum(object) && get_object(object)) {
1212 object->flags &= ~OBJECT_NEW; 1229 /* color it gray temporarily */
1230 object->count = object->min_count;
1213 list_add_tail(&object->gray_list, &gray_list); 1231 list_add_tail(&object->gray_list, &gray_list);
1214 } 1232 }
1215 spin_unlock_irqrestore(&object->lock, flags); 1233 spin_unlock_irqrestore(&object->lock, flags);
1216 } 1234 }
1217 rcu_read_unlock(); 1235 rcu_read_unlock();
1218 1236
1219 if (!list_empty(&gray_list)) 1237 /*
1220 goto repeat; 1238 * Re-scan the gray list for modified unreferenced objects.
1221 1239 */
1222scan_end: 1240 scan_gray_list();
1223 WARN_ON(!list_empty(&gray_list));
1224 1241
1225 /* 1242 /*
1226 * If scanning was stopped or new objects were being allocated at a 1243 * If scanning was stopped do not report any new unreferenced objects.
1227 * higher rate than gray list scanning, do not report any new
1228 * unreferenced objects.
1229 */ 1244 */
1230 if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) 1245 if (scan_should_stop())
1231 return; 1246 return;
1232 1247
1233 /* 1248 /*
@@ -1642,8 +1657,7 @@ void __init kmemleak_init(void)
1642 kmemleak_ignore(log->ptr); 1657 kmemleak_ignore(log->ptr);
1643 break; 1658 break;
1644 case KMEMLEAK_SCAN_AREA: 1659 case KMEMLEAK_SCAN_AREA:
1645 kmemleak_scan_area(log->ptr, log->offset, log->length, 1660 kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
1646 GFP_KERNEL);
1647 break; 1661 break;
1648 case KMEMLEAK_NO_SCAN: 1662 case KMEMLEAK_NO_SCAN:
1649 kmemleak_no_scan(log->ptr); 1663 kmemleak_no_scan(log->ptr);
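
Taken together, the kmemleak changes above drop the OBJECT_NEW flag and the bounded GRAY_LIST_PASSES loop in favour of a per-object CRC: after the normal scan, an object that still looks white is pushed back onto the gray list only if update_checksum() observes that its contents changed since the previous pass. A standalone toy model of that idea is sketched below; it uses a trivial checksum in place of the kernel's crc32() and plain C types in place of kmemleak's, so it is an illustration, not the kernel code.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct object {
	const unsigned char *ptr;	/* start of the tracked block */
	size_t size;
	uint32_t checksum;		/* value recorded by the last scan */
};

/* toy stand-in for crc32(0, ptr, size) */
static uint32_t toy_csum(const unsigned char *p, size_t n)
{
	uint32_t c = 0;

	while (n--)
		c = c * 31 + *p++;
	return c;
}

/* mirrors the new update_checksum(): returns nonzero if the block
 * changed since the previous scan */
static int block_changed(struct object *obj)
{
	uint32_t old = obj->checksum;

	obj->checksum = toy_csum(obj->ptr, obj->size);
	return obj->checksum != old;
}

int main(void)
{
	unsigned char block[16] = "leak candidate";
	struct object obj = { block, sizeof(block), 0 };

	block_changed(&obj);		/* first scan records the baseline */
	block[0] = 'L';			/* object modified between scans */
	printf("re-scan: %d\n", block_changed(&obj));	/* prints 1 */
	return 0;
}
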
diff --git a/mm/ksm.c b/mm/ksm.c
index 5575f8628fef..56a0da1f9979 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -29,11 +29,13 @@
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/rbtree.h> 31#include <linux/rbtree.h>
32#include <linux/memory.h>
32#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
33#include <linux/swap.h> 34#include <linux/swap.h>
34#include <linux/ksm.h> 35#include <linux/ksm.h>
35 36
36#include <asm/tlbflush.h> 37#include <asm/tlbflush.h>
38#include "internal.h"
37 39
38/* 40/*
39 * A few notes about the KSM scanning process, 41 * A few notes about the KSM scanning process,
@@ -79,13 +81,13 @@
79 * struct mm_slot - ksm information per mm that is being scanned 81 * struct mm_slot - ksm information per mm that is being scanned
80 * @link: link to the mm_slots hash list 82 * @link: link to the mm_slots hash list
81 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head 83 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
82 * @rmap_list: head for this mm_slot's list of rmap_items 84 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
83 * @mm: the mm that this information is valid for 85 * @mm: the mm that this information is valid for
84 */ 86 */
85struct mm_slot { 87struct mm_slot {
86 struct hlist_node link; 88 struct hlist_node link;
87 struct list_head mm_list; 89 struct list_head mm_list;
88 struct list_head rmap_list; 90 struct rmap_item *rmap_list;
89 struct mm_struct *mm; 91 struct mm_struct *mm;
90}; 92};
91 93
@@ -93,7 +95,7 @@ struct mm_slot {
93 * struct ksm_scan - cursor for scanning 95 * struct ksm_scan - cursor for scanning
94 * @mm_slot: the current mm_slot we are scanning 96 * @mm_slot: the current mm_slot we are scanning
95 * @address: the next address inside that to be scanned 97 * @address: the next address inside that to be scanned
96 * @rmap_item: the current rmap that we are scanning inside the rmap_list 98 * @rmap_list: link to the next rmap to be scanned in the rmap_list
97 * @seqnr: count of completed full scans (needed when removing unstable node) 99 * @seqnr: count of completed full scans (needed when removing unstable node)
98 * 100 *
99 * There is only the one ksm_scan instance of this cursor structure. 101 * There is only the one ksm_scan instance of this cursor structure.
@@ -101,37 +103,51 @@ struct mm_slot {
101struct ksm_scan { 103struct ksm_scan {
102 struct mm_slot *mm_slot; 104 struct mm_slot *mm_slot;
103 unsigned long address; 105 unsigned long address;
104 struct rmap_item *rmap_item; 106 struct rmap_item **rmap_list;
105 unsigned long seqnr; 107 unsigned long seqnr;
106}; 108};
107 109
108/** 110/**
111 * struct stable_node - node of the stable rbtree
112 * @node: rb node of this ksm page in the stable tree
113 * @hlist: hlist head of rmap_items using this ksm page
114 * @kpfn: page frame number of this ksm page
115 */
116struct stable_node {
117 struct rb_node node;
118 struct hlist_head hlist;
119 unsigned long kpfn;
120};
121
122/**
109 * struct rmap_item - reverse mapping item for virtual addresses 123 * struct rmap_item - reverse mapping item for virtual addresses
110 * @link: link into mm_slot's rmap_list (rmap_list is per mm) 124 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
125 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
111 * @mm: the memory structure this rmap_item is pointing into 126 * @mm: the memory structure this rmap_item is pointing into
112 * @address: the virtual address this rmap_item tracks (+ flags in low bits) 127 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
113 * @oldchecksum: previous checksum of the page at that virtual address 128 * @oldchecksum: previous checksum of the page at that virtual address
114 * @node: rb_node of this rmap_item in either unstable or stable tree 129 * @node: rb node of this rmap_item in the unstable tree
115 * @next: next rmap_item hanging off the same node of the stable tree 130 * @head: pointer to stable_node heading this list in the stable tree
116 * @prev: previous rmap_item hanging off the same node of the stable tree 131 * @hlist: link into hlist of rmap_items hanging off that stable_node
117 */ 132 */
118struct rmap_item { 133struct rmap_item {
119 struct list_head link; 134 struct rmap_item *rmap_list;
135 struct anon_vma *anon_vma; /* when stable */
120 struct mm_struct *mm; 136 struct mm_struct *mm;
121 unsigned long address; /* + low bits used for flags below */ 137 unsigned long address; /* + low bits used for flags below */
138 unsigned int oldchecksum; /* when unstable */
122 union { 139 union {
123 unsigned int oldchecksum; /* when unstable */ 140 struct rb_node node; /* when node of unstable tree */
124 struct rmap_item *next; /* when stable */ 141 struct { /* when listed from stable tree */
125 }; 142 struct stable_node *head;
126 union { 143 struct hlist_node hlist;
127 struct rb_node node; /* when tree node */ 144 };
128 struct rmap_item *prev; /* in stable list */
129 }; 145 };
130}; 146};
131 147
132#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ 148#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
133#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */ 149#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
134#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */ 150#define STABLE_FLAG 0x200 /* is listed from the stable tree */
135 151
136/* The stable and unstable tree heads */ 152/* The stable and unstable tree heads */
137static struct rb_root root_stable_tree = RB_ROOT; 153static struct rb_root root_stable_tree = RB_ROOT;
@@ -148,6 +164,7 @@ static struct ksm_scan ksm_scan = {
148}; 164};
149 165
150static struct kmem_cache *rmap_item_cache; 166static struct kmem_cache *rmap_item_cache;
167static struct kmem_cache *stable_node_cache;
151static struct kmem_cache *mm_slot_cache; 168static struct kmem_cache *mm_slot_cache;
152 169
153/* The number of nodes in the stable tree */ 170/* The number of nodes in the stable tree */
@@ -162,9 +179,6 @@ static unsigned long ksm_pages_unshared;
162/* The number of rmap_items in use: to calculate pages_volatile */ 179/* The number of rmap_items in use: to calculate pages_volatile */
163static unsigned long ksm_rmap_items; 180static unsigned long ksm_rmap_items;
164 181
165/* Limit on the number of unswappable pages used */
166static unsigned long ksm_max_kernel_pages;
167
168/* Number of pages ksmd should scan in one batch */ 182/* Number of pages ksmd should scan in one batch */
169static unsigned int ksm_thread_pages_to_scan = 100; 183static unsigned int ksm_thread_pages_to_scan = 100;
170 184
@@ -190,13 +204,19 @@ static int __init ksm_slab_init(void)
190 if (!rmap_item_cache) 204 if (!rmap_item_cache)
191 goto out; 205 goto out;
192 206
207 stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
208 if (!stable_node_cache)
209 goto out_free1;
210
193 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); 211 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
194 if (!mm_slot_cache) 212 if (!mm_slot_cache)
195 goto out_free; 213 goto out_free2;
196 214
197 return 0; 215 return 0;
198 216
199out_free: 217out_free2:
218 kmem_cache_destroy(stable_node_cache);
219out_free1:
200 kmem_cache_destroy(rmap_item_cache); 220 kmem_cache_destroy(rmap_item_cache);
201out: 221out:
202 return -ENOMEM; 222 return -ENOMEM;
@@ -205,6 +225,7 @@ out:
205static void __init ksm_slab_free(void) 225static void __init ksm_slab_free(void)
206{ 226{
207 kmem_cache_destroy(mm_slot_cache); 227 kmem_cache_destroy(mm_slot_cache);
228 kmem_cache_destroy(stable_node_cache);
208 kmem_cache_destroy(rmap_item_cache); 229 kmem_cache_destroy(rmap_item_cache);
209 mm_slot_cache = NULL; 230 mm_slot_cache = NULL;
210} 231}
@@ -226,6 +247,16 @@ static inline void free_rmap_item(struct rmap_item *rmap_item)
226 kmem_cache_free(rmap_item_cache, rmap_item); 247 kmem_cache_free(rmap_item_cache, rmap_item);
227} 248}
228 249
250static inline struct stable_node *alloc_stable_node(void)
251{
252 return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
253}
254
255static inline void free_stable_node(struct stable_node *stable_node)
256{
257 kmem_cache_free(stable_node_cache, stable_node);
258}
259
229static inline struct mm_slot *alloc_mm_slot(void) 260static inline struct mm_slot *alloc_mm_slot(void)
230{ 261{
231 if (!mm_slot_cache) /* initialization failed */ 262 if (!mm_slot_cache) /* initialization failed */
@@ -275,7 +306,6 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
275 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 306 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
276 % MM_SLOTS_HASH_HEADS]; 307 % MM_SLOTS_HASH_HEADS];
277 mm_slot->mm = mm; 308 mm_slot->mm = mm;
278 INIT_LIST_HEAD(&mm_slot->rmap_list);
279 hlist_add_head(&mm_slot->link, bucket); 309 hlist_add_head(&mm_slot->link, bucket);
280} 310}
281 311
@@ -284,6 +314,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
284 return rmap_item->address & STABLE_FLAG; 314 return rmap_item->address & STABLE_FLAG;
285} 315}
286 316
317static void hold_anon_vma(struct rmap_item *rmap_item,
318 struct anon_vma *anon_vma)
319{
320 rmap_item->anon_vma = anon_vma;
321 atomic_inc(&anon_vma->ksm_refcount);
322}
323
324static void drop_anon_vma(struct rmap_item *rmap_item)
325{
326 struct anon_vma *anon_vma = rmap_item->anon_vma;
327
328 if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) {
329 int empty = list_empty(&anon_vma->head);
330 spin_unlock(&anon_vma->lock);
331 if (empty)
332 anon_vma_free(anon_vma);
333 }
334}
335
287/* 336/*
288 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's 337 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
289 * page tables after it has passed through ksm_exit() - which, if necessary, 338 * page tables after it has passed through ksm_exit() - which, if necessary,
@@ -356,10 +405,18 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
356 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; 405 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
357} 406}
358 407
359static void break_cow(struct mm_struct *mm, unsigned long addr) 408static void break_cow(struct rmap_item *rmap_item)
360{ 409{
410 struct mm_struct *mm = rmap_item->mm;
411 unsigned long addr = rmap_item->address;
361 struct vm_area_struct *vma; 412 struct vm_area_struct *vma;
362 413
414 /*
415 * It is not an accident that whenever we want to break COW
416 * to undo, we also need to drop a reference to the anon_vma.
417 */
418 drop_anon_vma(rmap_item);
419
363 down_read(&mm->mmap_sem); 420 down_read(&mm->mmap_sem);
364 if (ksm_test_exit(mm)) 421 if (ksm_test_exit(mm))
365 goto out; 422 goto out;
@@ -403,21 +460,77 @@ out: page = NULL;
403 return page; 460 return page;
404} 461}
405 462
463static void remove_node_from_stable_tree(struct stable_node *stable_node)
464{
465 struct rmap_item *rmap_item;
466 struct hlist_node *hlist;
467
468 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
469 if (rmap_item->hlist.next)
470 ksm_pages_sharing--;
471 else
472 ksm_pages_shared--;
473 drop_anon_vma(rmap_item);
474 rmap_item->address &= PAGE_MASK;
475 cond_resched();
476 }
477
478 rb_erase(&stable_node->node, &root_stable_tree);
479 free_stable_node(stable_node);
480}
481
406/* 482/*
407 * get_ksm_page: checks if the page at the virtual address in rmap_item 483 * get_ksm_page: checks if the page indicated by the stable node
408 * is still PageKsm, in which case we can trust the content of the page, 484 * is still its ksm page, despite having held no reference to it.
409 * and it returns the gotten page; but NULL if the page has been zapped. 485 * In which case we can trust the content of the page, and it
486 * returns the gotten page; but if the page has now been zapped,
487 * remove the stale node from the stable tree and return NULL.
488 *
489 * You would expect the stable_node to hold a reference to the ksm page.
490 * But if it increments the page's count, swapping out has to wait for
491 * ksmd to come around again before it can free the page, which may take
492 * seconds or even minutes: much too unresponsive. So instead we use a
493 * "keyhole reference": access to the ksm page from the stable node peeps
494 * out through its keyhole to see if that page still holds the right key,
495 * pointing back to this stable node. This relies on freeing a PageAnon
496 * page to reset its page->mapping to NULL, and relies on no other use of
497 * a page to put something that might look like our key in page->mapping.
498 *
499 * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
500 * but this is different - made simpler by ksm_thread_mutex being held, but
501 * interesting for assuming that no other use of the struct page could ever
502 * put our expected_mapping into page->mapping (or a field of the union which
503 * coincides with page->mapping). The RCU calls are not for KSM at all, but
504 * to keep the page_count protocol described with page_cache_get_speculative.
505 *
506 * Note: it is possible that get_ksm_page() will return NULL one moment,
507 * then page the next, if the page is in between page_freeze_refs() and
508 * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
509 * is on its way to being freed; but it is an anomaly to bear in mind.
410 */ 510 */
411static struct page *get_ksm_page(struct rmap_item *rmap_item) 511static struct page *get_ksm_page(struct stable_node *stable_node)
412{ 512{
413 struct page *page; 513 struct page *page;
414 514 void *expected_mapping;
415 page = get_mergeable_page(rmap_item); 515
416 if (page && !PageKsm(page)) { 516 page = pfn_to_page(stable_node->kpfn);
517 expected_mapping = (void *)stable_node +
518 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
519 rcu_read_lock();
520 if (page->mapping != expected_mapping)
521 goto stale;
522 if (!get_page_unless_zero(page))
523 goto stale;
524 if (page->mapping != expected_mapping) {
417 put_page(page); 525 put_page(page);
418 page = NULL; 526 goto stale;
419 } 527 }
528 rcu_read_unlock();
420 return page; 529 return page;
530stale:
531 rcu_read_unlock();
532 remove_node_from_stable_tree(stable_node);
533 return NULL;
421} 534}
422 535
423/* 536/*
@@ -426,35 +539,29 @@ static struct page *get_ksm_page(struct rmap_item *rmap_item)
426 */ 539 */
427static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) 540static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
428{ 541{
429 if (in_stable_tree(rmap_item)) { 542 if (rmap_item->address & STABLE_FLAG) {
430 struct rmap_item *next_item = rmap_item->next; 543 struct stable_node *stable_node;
431 544 struct page *page;
432 if (rmap_item->address & NODE_FLAG) {
433 if (next_item) {
434 rb_replace_node(&rmap_item->node,
435 &next_item->node,
436 &root_stable_tree);
437 next_item->address |= NODE_FLAG;
438 ksm_pages_sharing--;
439 } else {
440 rb_erase(&rmap_item->node, &root_stable_tree);
441 ksm_pages_shared--;
442 }
443 } else {
444 struct rmap_item *prev_item = rmap_item->prev;
445 545
446 BUG_ON(prev_item->next != rmap_item); 546 stable_node = rmap_item->head;
447 prev_item->next = next_item; 547 page = get_ksm_page(stable_node);
448 if (next_item) { 548 if (!page)
449 BUG_ON(next_item->prev != rmap_item); 549 goto out;
450 next_item->prev = rmap_item->prev; 550
451 } 551 lock_page(page);
552 hlist_del(&rmap_item->hlist);
553 unlock_page(page);
554 put_page(page);
555
556 if (stable_node->hlist.first)
452 ksm_pages_sharing--; 557 ksm_pages_sharing--;
453 } 558 else
559 ksm_pages_shared--;
454 560
455 rmap_item->next = NULL; 561 drop_anon_vma(rmap_item);
562 rmap_item->address &= PAGE_MASK;
456 563
457 } else if (rmap_item->address & NODE_FLAG) { 564 } else if (rmap_item->address & UNSTABLE_FLAG) {
458 unsigned char age; 565 unsigned char age;
459 /* 566 /*
460 * Usually ksmd can and must skip the rb_erase, because 567 * Usually ksmd can and must skip the rb_erase, because
@@ -467,24 +574,21 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
467 BUG_ON(age > 1); 574 BUG_ON(age > 1);
468 if (!age) 575 if (!age)
469 rb_erase(&rmap_item->node, &root_unstable_tree); 576 rb_erase(&rmap_item->node, &root_unstable_tree);
577
470 ksm_pages_unshared--; 578 ksm_pages_unshared--;
579 rmap_item->address &= PAGE_MASK;
471 } 580 }
472 581out:
473 rmap_item->address &= PAGE_MASK;
474
475 cond_resched(); /* we're called from many long loops */ 582 cond_resched(); /* we're called from many long loops */
476} 583}
477 584
478static void remove_trailing_rmap_items(struct mm_slot *mm_slot, 585static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
479 struct list_head *cur) 586 struct rmap_item **rmap_list)
480{ 587{
481 struct rmap_item *rmap_item; 588 while (*rmap_list) {
482 589 struct rmap_item *rmap_item = *rmap_list;
483 while (cur != &mm_slot->rmap_list) { 590 *rmap_list = rmap_item->rmap_list;
484 rmap_item = list_entry(cur, struct rmap_item, link);
485 cur = cur->next;
486 remove_rmap_item_from_tree(rmap_item); 591 remove_rmap_item_from_tree(rmap_item);
487 list_del(&rmap_item->link);
488 free_rmap_item(rmap_item); 592 free_rmap_item(rmap_item);
489 } 593 }
490} 594}
@@ -550,7 +654,7 @@ static int unmerge_and_remove_all_rmap_items(void)
550 goto error; 654 goto error;
551 } 655 }
552 656
553 remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); 657 remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
554 658
555 spin_lock(&ksm_mmlist_lock); 659 spin_lock(&ksm_mmlist_lock);
556 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, 660 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
@@ -646,7 +750,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
646 * Check that no O_DIRECT or similar I/O is in progress on the 750 * Check that no O_DIRECT or similar I/O is in progress on the
647 * page 751 * page
648 */ 752 */
649 if ((page_mapcount(page) + 2 + swapped) != page_count(page)) { 753 if (page_mapcount(page) + 1 + swapped != page_count(page)) {
650 set_pte_at_notify(mm, addr, ptep, entry); 754 set_pte_at_notify(mm, addr, ptep, entry);
651 goto out_unlock; 755 goto out_unlock;
652 } 756 }
@@ -664,15 +768,15 @@ out:
664 768
665/** 769/**
666 * replace_page - replace page in vma by new ksm page 770 * replace_page - replace page in vma by new ksm page
667 * @vma: vma that holds the pte pointing to oldpage 771 * @vma: vma that holds the pte pointing to page
668 * @oldpage: the page we are replacing by newpage 772 * @page: the page we are replacing by kpage
669 * @newpage: the ksm page we replace oldpage by 773 * @kpage: the ksm page we replace page by
670 * @orig_pte: the original value of the pte 774 * @orig_pte: the original value of the pte
671 * 775 *
672 * Returns 0 on success, -EFAULT on failure. 776 * Returns 0 on success, -EFAULT on failure.
673 */ 777 */
674static int replace_page(struct vm_area_struct *vma, struct page *oldpage, 778static int replace_page(struct vm_area_struct *vma, struct page *page,
675 struct page *newpage, pte_t orig_pte) 779 struct page *kpage, pte_t orig_pte)
676{ 780{
677 struct mm_struct *mm = vma->vm_mm; 781 struct mm_struct *mm = vma->vm_mm;
678 pgd_t *pgd; 782 pgd_t *pgd;
@@ -681,12 +785,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
681 pte_t *ptep; 785 pte_t *ptep;
682 spinlock_t *ptl; 786 spinlock_t *ptl;
683 unsigned long addr; 787 unsigned long addr;
684 pgprot_t prot;
685 int err = -EFAULT; 788 int err = -EFAULT;
686 789
687 prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE); 790 addr = page_address_in_vma(page, vma);
688
689 addr = page_address_in_vma(oldpage, vma);
690 if (addr == -EFAULT) 791 if (addr == -EFAULT)
691 goto out; 792 goto out;
692 793
@@ -708,15 +809,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
708 goto out; 809 goto out;
709 } 810 }
710 811
711 get_page(newpage); 812 get_page(kpage);
712 page_add_ksm_rmap(newpage); 813 page_add_anon_rmap(kpage, vma, addr);
713 814
714 flush_cache_page(vma, addr, pte_pfn(*ptep)); 815 flush_cache_page(vma, addr, pte_pfn(*ptep));
715 ptep_clear_flush(vma, addr, ptep); 816 ptep_clear_flush(vma, addr, ptep);
716 set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot)); 817 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
717 818
718 page_remove_rmap(oldpage); 819 page_remove_rmap(page);
719 put_page(oldpage); 820 put_page(page);
720 821
721 pte_unmap_unlock(ptep, ptl); 822 pte_unmap_unlock(ptep, ptl);
722 err = 0; 823 err = 0;
@@ -726,32 +827,27 @@ out:
726 827
727/* 828/*
728 * try_to_merge_one_page - take two pages and merge them into one 829 * try_to_merge_one_page - take two pages and merge them into one
729 * @vma: the vma that hold the pte pointing into oldpage 830 * @vma: the vma that holds the pte pointing to page
730 * @oldpage: the page that we want to replace with newpage 831 * @page: the PageAnon page that we want to replace with kpage
731 * @newpage: the page that we want to map instead of oldpage 832 * @kpage: the PageKsm page that we want to map instead of page,
732 * 833 * or NULL the first time when we want to use page as kpage.
733 * Note:
734 * oldpage should be a PageAnon page, while newpage should be a PageKsm page,
735 * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
736 * 834 *
737 * This function returns 0 if the pages were merged, -EFAULT otherwise. 835 * This function returns 0 if the pages were merged, -EFAULT otherwise.
738 */ 836 */
739static int try_to_merge_one_page(struct vm_area_struct *vma, 837static int try_to_merge_one_page(struct vm_area_struct *vma,
740 struct page *oldpage, 838 struct page *page, struct page *kpage)
741 struct page *newpage)
742{ 839{
743 pte_t orig_pte = __pte(0); 840 pte_t orig_pte = __pte(0);
744 int err = -EFAULT; 841 int err = -EFAULT;
745 842
843 if (page == kpage) /* ksm page forked */
844 return 0;
845
746 if (!(vma->vm_flags & VM_MERGEABLE)) 846 if (!(vma->vm_flags & VM_MERGEABLE))
747 goto out; 847 goto out;
748 848 if (!PageAnon(page))
749 if (!PageAnon(oldpage))
750 goto out; 849 goto out;
751 850
752 get_page(newpage);
753 get_page(oldpage);
754
755 /* 851 /*
756 * We need the page lock to read a stable PageSwapCache in 852 * We need the page lock to read a stable PageSwapCache in
757 * write_protect_page(). We use trylock_page() instead of 853 * write_protect_page(). We use trylock_page() instead of
@@ -759,26 +855,39 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
759 * prefer to continue scanning and merging different pages, 855 * prefer to continue scanning and merging different pages,
760 * then come back to this page when it is unlocked. 856 * then come back to this page when it is unlocked.
761 */ 857 */
762 if (!trylock_page(oldpage)) 858 if (!trylock_page(page))
763 goto out_putpage; 859 goto out;
764 /* 860 /*
765 * If this anonymous page is mapped only here, its pte may need 861 * If this anonymous page is mapped only here, its pte may need
766 * to be write-protected. If it's mapped elsewhere, all of its 862 * to be write-protected. If it's mapped elsewhere, all of its
767 * ptes are necessarily already write-protected. But in either 863 * ptes are necessarily already write-protected. But in either
768 * case, we need to lock and check page_count is not raised. 864 * case, we need to lock and check page_count is not raised.
769 */ 865 */
770 if (write_protect_page(vma, oldpage, &orig_pte)) { 866 if (write_protect_page(vma, page, &orig_pte) == 0) {
771 unlock_page(oldpage); 867 if (!kpage) {
772 goto out_putpage; 868 /*
869 * While we hold page lock, upgrade page from
870 * PageAnon+anon_vma to PageKsm+NULL stable_node:
871 * stable_tree_insert() will update stable_node.
872 */
873 set_page_stable_node(page, NULL);
874 mark_page_accessed(page);
875 err = 0;
876 } else if (pages_identical(page, kpage))
877 err = replace_page(vma, page, kpage, orig_pte);
773 } 878 }
774 unlock_page(oldpage);
775 879
776 if (pages_identical(oldpage, newpage)) 880 if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
777 err = replace_page(vma, oldpage, newpage, orig_pte); 881 munlock_vma_page(page);
882 if (!PageMlocked(kpage)) {
883 unlock_page(page);
884 lock_page(kpage);
885 mlock_vma_page(kpage);
886 page = kpage; /* for final unlock */
887 }
888 }
778 889
779out_putpage: 890 unlock_page(page);
780 put_page(oldpage);
781 put_page(newpage);
782out: 891out:
783 return err; 892 return err;
784} 893}
@@ -786,26 +895,31 @@ out:
786/* 895/*
787 * try_to_merge_with_ksm_page - like try_to_merge_two_pages, 896 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
788 * but no new kernel page is allocated: kpage must already be a ksm page. 897 * but no new kernel page is allocated: kpage must already be a ksm page.
898 *
899 * This function returns 0 if the pages were merged, -EFAULT otherwise.
789 */ 900 */
790static int try_to_merge_with_ksm_page(struct mm_struct *mm1, 901static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
791 unsigned long addr1, 902 struct page *page, struct page *kpage)
792 struct page *page1,
793 struct page *kpage)
794{ 903{
904 struct mm_struct *mm = rmap_item->mm;
795 struct vm_area_struct *vma; 905 struct vm_area_struct *vma;
796 int err = -EFAULT; 906 int err = -EFAULT;
797 907
798 down_read(&mm1->mmap_sem); 908 down_read(&mm->mmap_sem);
799 if (ksm_test_exit(mm1)) 909 if (ksm_test_exit(mm))
910 goto out;
911 vma = find_vma(mm, rmap_item->address);
912 if (!vma || vma->vm_start > rmap_item->address)
800 goto out; 913 goto out;
801 914
802 vma = find_vma(mm1, addr1); 915 err = try_to_merge_one_page(vma, page, kpage);
803 if (!vma || vma->vm_start > addr1) 916 if (err)
804 goto out; 917 goto out;
805 918
806 err = try_to_merge_one_page(vma, page1, kpage); 919 /* Must get reference to anon_vma while still holding mmap_sem */
920 hold_anon_vma(rmap_item, vma->anon_vma);
807out: 921out:
808 up_read(&mm1->mmap_sem); 922 up_read(&mm->mmap_sem);
809 return err; 923 return err;
810} 924}
811 925
@@ -813,109 +927,73 @@ out:
813 * try_to_merge_two_pages - take two identical pages and prepare them 927 * try_to_merge_two_pages - take two identical pages and prepare them
814 * to be merged into one page. 928 * to be merged into one page.
815 * 929 *
816 * This function returns 0 if we successfully mapped two identical pages 930 * This function returns the kpage if we successfully merged two identical
817 * into one page, -EFAULT otherwise. 931 * pages into one ksm page, NULL otherwise.
818 * 932 *
819 * Note that this function allocates a new kernel page: if one of the pages 933 * Note that this function upgrades page to ksm page: if one of the pages
820 * is already a ksm page, try_to_merge_with_ksm_page should be used. 934 * is already a ksm page, try_to_merge_with_ksm_page should be used.
821 */ 935 */
822static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1, 936static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
823 struct page *page1, struct mm_struct *mm2, 937 struct page *page,
824 unsigned long addr2, struct page *page2) 938 struct rmap_item *tree_rmap_item,
939 struct page *tree_page)
825{ 940{
826 struct vm_area_struct *vma; 941 int err;
827 struct page *kpage;
828 int err = -EFAULT;
829
830 /*
831 * The number of nodes in the stable tree
832 * is the number of kernel pages that we hold.
833 */
834 if (ksm_max_kernel_pages &&
835 ksm_max_kernel_pages <= ksm_pages_shared)
836 return err;
837
838 kpage = alloc_page(GFP_HIGHUSER);
839 if (!kpage)
840 return err;
841
842 down_read(&mm1->mmap_sem);
843 if (ksm_test_exit(mm1)) {
844 up_read(&mm1->mmap_sem);
845 goto out;
846 }
847 vma = find_vma(mm1, addr1);
848 if (!vma || vma->vm_start > addr1) {
849 up_read(&mm1->mmap_sem);
850 goto out;
851 }
852
853 copy_user_highpage(kpage, page1, addr1, vma);
854 err = try_to_merge_one_page(vma, page1, kpage);
855 up_read(&mm1->mmap_sem);
856 942
943 err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
857 if (!err) { 944 if (!err) {
858 err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage); 945 err = try_to_merge_with_ksm_page(tree_rmap_item,
946 tree_page, page);
859 /* 947 /*
860 * If that fails, we have a ksm page with only one pte 948 * If that fails, we have a ksm page with only one pte
861 * pointing to it: so break it. 949 * pointing to it: so break it.
862 */ 950 */
863 if (err) 951 if (err)
864 break_cow(mm1, addr1); 952 break_cow(rmap_item);
865 } 953 }
866out: 954 return err ? NULL : page;
867 put_page(kpage);
868 return err;
869} 955}
870 956
871/* 957/*
872 * stable_tree_search - search page inside the stable tree 958 * stable_tree_search - search for page inside the stable tree
873 * @page: the page that we are searching identical pages to.
874 * @page2: pointer into identical page that we are holding inside the stable
875 * tree that we have found.
876 * @rmap_item: the reverse mapping item
877 * 959 *
878 * This function checks if there is a page inside the stable tree 960 * This function checks if there is a page inside the stable tree
879 * with identical content to the page that we are scanning right now. 961 * with identical content to the page that we are scanning right now.
880 * 962 *
881 * This function return rmap_item pointer to the identical item if found, 963 * This function returns the stable tree node of identical content if found,
882 * NULL otherwise. 964 * NULL otherwise.
883 */ 965 */
884static struct rmap_item *stable_tree_search(struct page *page, 966static struct page *stable_tree_search(struct page *page)
885 struct page **page2,
886 struct rmap_item *rmap_item)
887{ 967{
888 struct rb_node *node = root_stable_tree.rb_node; 968 struct rb_node *node = root_stable_tree.rb_node;
969 struct stable_node *stable_node;
970
971 stable_node = page_stable_node(page);
972 if (stable_node) { /* ksm page forked */
973 get_page(page);
974 return page;
975 }
889 976
890 while (node) { 977 while (node) {
891 struct rmap_item *tree_rmap_item, *next_rmap_item; 978 struct page *tree_page;
892 int ret; 979 int ret;
893 980
894 tree_rmap_item = rb_entry(node, struct rmap_item, node); 981 cond_resched();
895 while (tree_rmap_item) { 982 stable_node = rb_entry(node, struct stable_node, node);
896 BUG_ON(!in_stable_tree(tree_rmap_item)); 983 tree_page = get_ksm_page(stable_node);
897 cond_resched(); 984 if (!tree_page)
898 page2[0] = get_ksm_page(tree_rmap_item);
899 if (page2[0])
900 break;
901 next_rmap_item = tree_rmap_item->next;
902 remove_rmap_item_from_tree(tree_rmap_item);
903 tree_rmap_item = next_rmap_item;
904 }
905 if (!tree_rmap_item)
906 return NULL; 985 return NULL;
907 986
908 ret = memcmp_pages(page, page2[0]); 987 ret = memcmp_pages(page, tree_page);
909 988
910 if (ret < 0) { 989 if (ret < 0) {
911 put_page(page2[0]); 990 put_page(tree_page);
912 node = node->rb_left; 991 node = node->rb_left;
913 } else if (ret > 0) { 992 } else if (ret > 0) {
914 put_page(page2[0]); 993 put_page(tree_page);
915 node = node->rb_right; 994 node = node->rb_right;
916 } else { 995 } else
917 return tree_rmap_item; 996 return tree_page;
918 }
919 } 997 }
920 998
921 return NULL; 999 return NULL;
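
Both the unstable tree and the new stable tree remain ordinary rbtrees keyed by full page content: the walk above descends left or right on the sign of memcmp_pages() and stops on an exact match, and stable_tree_insert() / unstable_tree_search_insert() below follow the same shape. A standalone toy model of such a content-keyed lookup is sketched here, with a plain binary tree and fixed-size buffers standing in for rb_node and struct page; all names are illustrative, none are kernel API.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PSIZE 16			/* toy stand-in for PAGE_SIZE */

struct cnode {				/* toy stand-in for stable_node */
	unsigned char content[PSIZE];
	struct cnode *left, *right;
};

/* descend on the sign of the comparison, stop on identical content */
static struct cnode *content_search(struct cnode *node,
				    const unsigned char *page)
{
	while (node) {
		int ret = memcmp(page, node->content, PSIZE);

		if (ret < 0)
			node = node->left;
		else if (ret > 0)
			node = node->right;
		else
			return node;	/* identical page already in the tree */
	}
	return NULL;
}

/* same walk, but link a new node in when no identical content exists */
static struct cnode *content_insert(struct cnode **link,
				    const unsigned char *page)
{
	while (*link) {
		int ret = memcmp(page, (*link)->content, PSIZE);

		if (ret < 0)
			link = &(*link)->left;
		else if (ret > 0)
			link = &(*link)->right;
		else
			return *link;	/* duplicate: candidate for merging */
	}
	*link = calloc(1, sizeof(**link));
	if (*link)
		memcpy((*link)->content, page, PSIZE);
	return *link;
}

int main(void)
{
	struct cnode *root = NULL;
	unsigned char a[PSIZE] = "all zero page..";
	unsigned char b[PSIZE] = "all zero page..";

	content_insert(&root, a);
	printf("duplicate found: %s\n",
	       content_search(root, b) ? "yes" : "no");	/* yes */
	return 0;
}
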
@@ -925,38 +1003,26 @@ static struct rmap_item *stable_tree_search(struct page *page,
925 * stable_tree_insert - insert rmap_item pointing to new ksm page 1003 * stable_tree_insert - insert rmap_item pointing to new ksm page
926 * into the stable tree. 1004 * into the stable tree.
927 * 1005 *
928 * @page: the page that we are searching identical page to inside the stable 1006 * This function returns the stable tree node just allocated on success,
929 * tree. 1007 * NULL otherwise.
930 * @rmap_item: pointer to the reverse mapping item.
931 *
932 * This function returns rmap_item if success, NULL otherwise.
933 */ 1008 */
934static struct rmap_item *stable_tree_insert(struct page *page, 1009static struct stable_node *stable_tree_insert(struct page *kpage)
935 struct rmap_item *rmap_item)
936{ 1010{
937 struct rb_node **new = &root_stable_tree.rb_node; 1011 struct rb_node **new = &root_stable_tree.rb_node;
938 struct rb_node *parent = NULL; 1012 struct rb_node *parent = NULL;
1013 struct stable_node *stable_node;
939 1014
940 while (*new) { 1015 while (*new) {
941 struct rmap_item *tree_rmap_item, *next_rmap_item;
942 struct page *tree_page; 1016 struct page *tree_page;
943 int ret; 1017 int ret;
944 1018
945 tree_rmap_item = rb_entry(*new, struct rmap_item, node); 1019 cond_resched();
946 while (tree_rmap_item) { 1020 stable_node = rb_entry(*new, struct stable_node, node);
947 BUG_ON(!in_stable_tree(tree_rmap_item)); 1021 tree_page = get_ksm_page(stable_node);
948 cond_resched(); 1022 if (!tree_page)
949 tree_page = get_ksm_page(tree_rmap_item);
950 if (tree_page)
951 break;
952 next_rmap_item = tree_rmap_item->next;
953 remove_rmap_item_from_tree(tree_rmap_item);
954 tree_rmap_item = next_rmap_item;
955 }
956 if (!tree_rmap_item)
957 return NULL; 1023 return NULL;
958 1024
959 ret = memcmp_pages(page, tree_page); 1025 ret = memcmp_pages(kpage, tree_page);
960 put_page(tree_page); 1026 put_page(tree_page);
961 1027
962 parent = *new; 1028 parent = *new;
@@ -974,22 +1040,24 @@ static struct rmap_item *stable_tree_insert(struct page *page,
974 } 1040 }
975 } 1041 }
976 1042
977 rmap_item->address |= NODE_FLAG | STABLE_FLAG; 1043 stable_node = alloc_stable_node();
978 rmap_item->next = NULL; 1044 if (!stable_node)
979 rb_link_node(&rmap_item->node, parent, new); 1045 return NULL;
980 rb_insert_color(&rmap_item->node, &root_stable_tree);
981 1046
982 ksm_pages_shared++; 1047 rb_link_node(&stable_node->node, parent, new);
983 return rmap_item; 1048 rb_insert_color(&stable_node->node, &root_stable_tree);
1049
1050 INIT_HLIST_HEAD(&stable_node->hlist);
1051
1052 stable_node->kpfn = page_to_pfn(kpage);
1053 set_page_stable_node(kpage, stable_node);
1054
1055 return stable_node;
984} 1056}
985 1057
986/* 1058/*
987 * unstable_tree_search_insert - search and insert items into the unstable tree. 1059 * unstable_tree_search_insert - search for identical page,
988 * 1060 * else insert rmap_item into the unstable tree.
989 * @page: the page that we are going to search for identical page or to insert
990 * into the unstable tree
991 * @page2: pointer into identical page that was found inside the unstable tree
992 * @rmap_item: the reverse mapping item of page
993 * 1061 *
994 * This function searches for a page in the unstable tree identical to the 1062 * This function searches for a page in the unstable tree identical to the
995 * page currently being scanned; and if no identical page is found in the 1063 * page currently being scanned; and if no identical page is found in the
@@ -1001,47 +1069,50 @@ static struct rmap_item *stable_tree_insert(struct page *page,
1001 * This function does both searching and inserting, because they share 1069 * This function does both searching and inserting, because they share
1002 * the same walking algorithm in an rbtree. 1070 * the same walking algorithm in an rbtree.
1003 */ 1071 */
1004static struct rmap_item *unstable_tree_search_insert(struct page *page, 1072static
1005 struct page **page2, 1073struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1006 struct rmap_item *rmap_item) 1074 struct page *page,
1075 struct page **tree_pagep)
1076
1007{ 1077{
1008 struct rb_node **new = &root_unstable_tree.rb_node; 1078 struct rb_node **new = &root_unstable_tree.rb_node;
1009 struct rb_node *parent = NULL; 1079 struct rb_node *parent = NULL;
1010 1080
1011 while (*new) { 1081 while (*new) {
1012 struct rmap_item *tree_rmap_item; 1082 struct rmap_item *tree_rmap_item;
1083 struct page *tree_page;
1013 int ret; 1084 int ret;
1014 1085
1015 cond_resched(); 1086 cond_resched();
1016 tree_rmap_item = rb_entry(*new, struct rmap_item, node); 1087 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1017 page2[0] = get_mergeable_page(tree_rmap_item); 1088 tree_page = get_mergeable_page(tree_rmap_item);
1018 if (!page2[0]) 1089 if (!tree_page)
1019 return NULL; 1090 return NULL;
1020 1091
1021 /* 1092 /*
1022 * Don't substitute an unswappable ksm page 1093 * Don't substitute a ksm page for a forked page.
1023 * just for one good swappable forked page.
1024 */ 1094 */
1025 if (page == page2[0]) { 1095 if (page == tree_page) {
1026 put_page(page2[0]); 1096 put_page(tree_page);
1027 return NULL; 1097 return NULL;
1028 } 1098 }
1029 1099
1030 ret = memcmp_pages(page, page2[0]); 1100 ret = memcmp_pages(page, tree_page);
1031 1101
1032 parent = *new; 1102 parent = *new;
1033 if (ret < 0) { 1103 if (ret < 0) {
1034 put_page(page2[0]); 1104 put_page(tree_page);
1035 new = &parent->rb_left; 1105 new = &parent->rb_left;
1036 } else if (ret > 0) { 1106 } else if (ret > 0) {
1037 put_page(page2[0]); 1107 put_page(tree_page);
1038 new = &parent->rb_right; 1108 new = &parent->rb_right;
1039 } else { 1109 } else {
1110 *tree_pagep = tree_page;
1040 return tree_rmap_item; 1111 return tree_rmap_item;
1041 } 1112 }
1042 } 1113 }
1043 1114
1044 rmap_item->address |= NODE_FLAG; 1115 rmap_item->address |= UNSTABLE_FLAG;
1045 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); 1116 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1046 rb_link_node(&rmap_item->node, parent, new); 1117 rb_link_node(&rmap_item->node, parent, new);
1047 rb_insert_color(&rmap_item->node, &root_unstable_tree); 1118 rb_insert_color(&rmap_item->node, &root_unstable_tree);
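
unstable_tree_search_insert() relies on the usual rbtree insertion idiom: walk with a pointer-to-link (struct rb_node **new), remember the parent, and if no equal key is found, link the new node into the slot where the walk fell off. A hedged sketch of the same idiom over an int-keyed binary tree (illustrative only; the real code keys on memcmp_pages() and must drop page references as it goes):

#include <stdio.h>

struct node {
        int key;
        struct node *left, *right;
};

/*
 * Search for item->key; if an equal key exists return that node, otherwise
 * link item into the slot where the search fell off and return NULL
 * (mirroring "no identical page found, rmap_item inserted").
 */
static struct node *search_insert(struct node **root, struct node *item)
{
        struct node **new = root;

        while (*new) {
                struct node *parent = *new;

                if (item->key < parent->key)
                        new = &parent->left;
                else if (item->key > parent->key)
                        new = &parent->right;
                else
                        return parent;          /* match: nothing inserted */
        }
        item->left = item->right = NULL;
        *new = item;                            /* link in place */
        return NULL;
}

int main(void)
{
        struct node *root = NULL;
        struct node a = { 5 }, b = { 3 }, c = { 5 };

        search_insert(&root, &a);
        search_insert(&root, &b);
        printf("duplicate found: %s\n",
               search_insert(&root, &c) ? "yes" : "no");
        return 0;
}

In the kernel version the only extra work is reference handling: every tree_page obtained from get_mergeable_page() has to be released, and the matching page is handed back through *tree_pagep.
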
@@ -1056,18 +1127,16 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page,
1056 * the same ksm page. 1127 * the same ksm page.
1057 */ 1128 */
1058static void stable_tree_append(struct rmap_item *rmap_item, 1129static void stable_tree_append(struct rmap_item *rmap_item,
1059 struct rmap_item *tree_rmap_item) 1130 struct stable_node *stable_node)
1060{ 1131{
1061 rmap_item->next = tree_rmap_item->next; 1132 rmap_item->head = stable_node;
1062 rmap_item->prev = tree_rmap_item;
1063
1064 if (tree_rmap_item->next)
1065 tree_rmap_item->next->prev = rmap_item;
1066
1067 tree_rmap_item->next = rmap_item;
1068 rmap_item->address |= STABLE_FLAG; 1133 rmap_item->address |= STABLE_FLAG;
1134 hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
1069 1135
1070 ksm_pages_sharing++; 1136 if (rmap_item->hlist.next)
1137 ksm_pages_sharing++;
1138 else
1139 ksm_pages_shared++;
1071} 1140}
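
With stable_node in place, stable_tree_append() just chains the rmap_item onto the node's hlist: the first item on the list accounts a shared page (ksm_pages_shared), every later item accounts an additional sharer (ksm_pages_sharing). A minimal sketch of that accounting, assuming a plain singly linked list instead of the kernel hlist:

#include <stdio.h>

struct rmap_item {
        struct rmap_item *next;
};

struct stable_node {
        struct rmap_item *first;        /* stand-in for the hlist head */
};

static long pages_shared, pages_sharing;

static void stable_append(struct stable_node *sn, struct rmap_item *item)
{
        item->next = sn->first;         /* add at head, like hlist_add_head() */
        sn->first = item;

        if (item->next)                 /* someone was already on the list */
                pages_sharing++;
        else                            /* first mapper of this ksm page */
                pages_shared++;
}

int main(void)
{
        struct stable_node sn = { NULL };
        struct rmap_item r1, r2, r3;

        stable_append(&sn, &r1);
        stable_append(&sn, &r2);
        stable_append(&sn, &r3);
        printf("shared=%ld sharing=%ld\n", pages_shared, pages_sharing);
        return 0;
}

Three mappers of one ksm page therefore show up as pages_shared=1, pages_sharing=2, matching the documented meaning of the two sysfs counters.
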
1072 1141
1073/* 1142/*
@@ -1081,49 +1150,37 @@ static void stable_tree_append(struct rmap_item *rmap_item,
1081 */ 1150 */
1082static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) 1151static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1083{ 1152{
1084 struct page *page2[1];
1085 struct rmap_item *tree_rmap_item; 1153 struct rmap_item *tree_rmap_item;
1154 struct page *tree_page = NULL;
1155 struct stable_node *stable_node;
1156 struct page *kpage;
1086 unsigned int checksum; 1157 unsigned int checksum;
1087 int err; 1158 int err;
1088 1159
1089 if (in_stable_tree(rmap_item)) 1160 remove_rmap_item_from_tree(rmap_item);
1090 remove_rmap_item_from_tree(rmap_item);
1091 1161
1092 /* We first start with searching the page inside the stable tree */ 1162 /* We first start with searching the page inside the stable tree */
1093 tree_rmap_item = stable_tree_search(page, page2, rmap_item); 1163 kpage = stable_tree_search(page);
1094 if (tree_rmap_item) { 1164 if (kpage) {
1095 if (page == page2[0]) /* forked */ 1165 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
1096 err = 0;
1097 else
1098 err = try_to_merge_with_ksm_page(rmap_item->mm,
1099 rmap_item->address,
1100 page, page2[0]);
1101 put_page(page2[0]);
1102
1103 if (!err) { 1166 if (!err) {
1104 /* 1167 /*
1105 * The page was successfully merged: 1168 * The page was successfully merged:
1106 * add its rmap_item to the stable tree. 1169 * add its rmap_item to the stable tree.
1107 */ 1170 */
1108 stable_tree_append(rmap_item, tree_rmap_item); 1171 lock_page(kpage);
1172 stable_tree_append(rmap_item, page_stable_node(kpage));
1173 unlock_page(kpage);
1109 } 1174 }
1175 put_page(kpage);
1110 return; 1176 return;
1111 } 1177 }
1112 1178
1113 /* 1179 /*
1114 * A ksm page might have got here by fork, but its other 1180 * If the hash value of the page has changed from the last time
1115 * references have already been removed from the stable tree. 1181 * we calculated it, this page is changing frequently: therefore we
1116 * Or it might be left over from a break_ksm which failed 1182 * don't want to insert it in the unstable tree, and we don't want
1117 * when the mem_cgroup had reached its limit: try again now. 1183 * to waste our time searching for something identical to it there.
1118 */
1119 if (PageKsm(page))
1120 break_cow(rmap_item->mm, rmap_item->address);
1121
1122 /*
1123 * In case the hash value of the page was changed from the last time we
1124 * have calculated it, this page to be changed frequely, therefore we
1125 * don't want to insert it to the unstable tree, and we don't want to
1126 * waste our time to search if there is something identical to it there.
1127 */ 1184 */
1128 checksum = calc_checksum(page); 1185 checksum = calc_checksum(page);
1129 if (rmap_item->oldchecksum != checksum) { 1186 if (rmap_item->oldchecksum != checksum) {
@@ -1131,21 +1188,27 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1131 return; 1188 return;
1132 } 1189 }
1133 1190
1134 tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item); 1191 tree_rmap_item =
1192 unstable_tree_search_insert(rmap_item, page, &tree_page);
1135 if (tree_rmap_item) { 1193 if (tree_rmap_item) {
1136 err = try_to_merge_two_pages(rmap_item->mm, 1194 kpage = try_to_merge_two_pages(rmap_item, page,
1137 rmap_item->address, page, 1195 tree_rmap_item, tree_page);
1138 tree_rmap_item->mm, 1196 put_page(tree_page);
1139 tree_rmap_item->address, page2[0]);
1140 /* 1197 /*
1141 * As soon as we merge this page, we want to remove the 1198 * As soon as we merge this page, we want to remove the
1142 * rmap_item of the page we have merged with from the unstable 1199 * rmap_item of the page we have merged with from the unstable
1143 * tree, and insert it instead as new node in the stable tree. 1200 * tree, and insert it instead as new node in the stable tree.
1144 */ 1201 */
1145 if (!err) { 1202 if (kpage) {
1146 rb_erase(&tree_rmap_item->node, &root_unstable_tree); 1203 remove_rmap_item_from_tree(tree_rmap_item);
1147 tree_rmap_item->address &= ~NODE_FLAG; 1204
1148 ksm_pages_unshared--; 1205 lock_page(kpage);
1206 stable_node = stable_tree_insert(kpage);
1207 if (stable_node) {
1208 stable_tree_append(tree_rmap_item, stable_node);
1209 stable_tree_append(rmap_item, stable_node);
1210 }
1211 unlock_page(kpage);
1149 1212
1150 /* 1213 /*
1151 * If we fail to insert the page into the stable tree, 1214 * If we fail to insert the page into the stable tree,
@@ -1153,37 +1216,28 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1153 * to a ksm page left outside the stable tree, 1216 * to a ksm page left outside the stable tree,
1154 * in which case we need to break_cow on both. 1217 * in which case we need to break_cow on both.
1155 */ 1218 */
1156 if (stable_tree_insert(page2[0], tree_rmap_item)) 1219 if (!stable_node) {
1157 stable_tree_append(rmap_item, tree_rmap_item); 1220 break_cow(tree_rmap_item);
1158 else { 1221 break_cow(rmap_item);
1159 break_cow(tree_rmap_item->mm,
1160 tree_rmap_item->address);
1161 break_cow(rmap_item->mm, rmap_item->address);
1162 } 1222 }
1163 } 1223 }
1164
1165 put_page(page2[0]);
1166 } 1224 }
1167} 1225}
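
cmp_and_merge_page() still refuses to touch the unstable tree until a page's checksum has been seen unchanged across two scans, which keeps rapidly changing pages out of the tree. A small sketch of that volatility filter (the toy hash below merely stands in for calc_checksum()):

#include <stdio.h>

#define PAGE_SZ 16

struct tracked_page {
        unsigned char data[PAGE_SZ];
        unsigned int oldchecksum;
};

/* Toy hash standing in for calc_checksum(). */
static unsigned int calc_checksum(const unsigned char *d)
{
        unsigned int sum = 0;
        int i;

        for (i = 0; i < PAGE_SZ; i++)
                sum = sum * 31 + d[i];
        return sum;
}

/* Return 1 only once the contents have been seen unchanged twice in a row. */
static int worth_merging(struct tracked_page *p)
{
        unsigned int checksum = calc_checksum(p->data);

        if (p->oldchecksum != checksum) {
                p->oldchecksum = checksum;      /* remember, retry next pass */
                return 0;
        }
        return 1;
}

int main(void)
{
        struct tracked_page p = { "constant data", 0 };

        printf("pass 1: %d\n", worth_merging(&p));      /* 0: just recorded */
        printf("pass 2: %d\n", worth_merging(&p));      /* 1: unchanged */
        return 0;
}
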
1168 1226
1169static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, 1227static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
1170 struct list_head *cur, 1228 struct rmap_item **rmap_list,
1171 unsigned long addr) 1229 unsigned long addr)
1172{ 1230{
1173 struct rmap_item *rmap_item; 1231 struct rmap_item *rmap_item;
1174 1232
1175 while (cur != &mm_slot->rmap_list) { 1233 while (*rmap_list) {
1176 rmap_item = list_entry(cur, struct rmap_item, link); 1234 rmap_item = *rmap_list;
1177 if ((rmap_item->address & PAGE_MASK) == addr) { 1235 if ((rmap_item->address & PAGE_MASK) == addr)
1178 if (!in_stable_tree(rmap_item))
1179 remove_rmap_item_from_tree(rmap_item);
1180 return rmap_item; 1236 return rmap_item;
1181 }
1182 if (rmap_item->address > addr) 1237 if (rmap_item->address > addr)
1183 break; 1238 break;
1184 cur = cur->next; 1239 *rmap_list = rmap_item->rmap_list;
1185 remove_rmap_item_from_tree(rmap_item); 1240 remove_rmap_item_from_tree(rmap_item);
1186 list_del(&rmap_item->link);
1187 free_rmap_item(rmap_item); 1241 free_rmap_item(rmap_item);
1188 } 1242 }
1189 1243
@@ -1192,7 +1246,8 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
1192 /* It has already been zeroed */ 1246 /* It has already been zeroed */
1193 rmap_item->mm = mm_slot->mm; 1247 rmap_item->mm = mm_slot->mm;
1194 rmap_item->address = addr; 1248 rmap_item->address = addr;
1195 list_add_tail(&rmap_item->link, cur); 1249 rmap_item->rmap_list = *rmap_list;
1250 *rmap_list = rmap_item;
1196 } 1251 }
1197 return rmap_item; 1252 return rmap_item;
1198} 1253}
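
get_next_rmap_item() now walks a singly linked rmap_list hanging off the mm_slot through a cursor (rmap_item **): items below the scan address are stale and get freed, an exact match is reused, and otherwise a fresh item is spliced in at the cursor. A reduced user-space sketch of that cursor walk (tree removal and PAGE_MASK handling elided; names illustrative):

#include <stdio.h>
#include <stdlib.h>

struct rmap_item {
        unsigned long address;
        struct rmap_item *rmap_list;    /* next item, kept in address order */
};

/*
 * rmap_list is a cursor (address of the "next" pointer).  Items below addr
 * are stale and get freed, an exact match is reused, and otherwise a fresh
 * item is spliced in right at the cursor.
 */
static struct rmap_item *get_next_item(struct rmap_item **rmap_list,
                                       unsigned long addr)
{
        struct rmap_item *item;

        while (*rmap_list) {
                item = *rmap_list;
                if (item->address == addr)
                        return item;
                if (item->address > addr)
                        break;
                *rmap_list = item->rmap_list;   /* unlink the stale item */
                free(item);
        }

        item = calloc(1, sizeof(*item));
        if (item) {
                item->address = addr;
                item->rmap_list = *rmap_list;   /* splice in at the cursor */
                *rmap_list = item;
        }
        return item;
}

int main(void)
{
        struct rmap_item *head = NULL;
        struct rmap_item *it = get_next_item(&head, 0x1000);

        printf("item for %#lx created: %s\n", 0x1000UL, it ? "yes" : "no");
        return 0;
}
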
@@ -1217,8 +1272,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1217 spin_unlock(&ksm_mmlist_lock); 1272 spin_unlock(&ksm_mmlist_lock);
1218next_mm: 1273next_mm:
1219 ksm_scan.address = 0; 1274 ksm_scan.address = 0;
1220 ksm_scan.rmap_item = list_entry(&slot->rmap_list, 1275 ksm_scan.rmap_list = &slot->rmap_list;
1221 struct rmap_item, link);
1222 } 1276 }
1223 1277
1224 mm = slot->mm; 1278 mm = slot->mm;
@@ -1244,10 +1298,10 @@ next_mm:
1244 flush_anon_page(vma, *page, ksm_scan.address); 1298 flush_anon_page(vma, *page, ksm_scan.address);
1245 flush_dcache_page(*page); 1299 flush_dcache_page(*page);
1246 rmap_item = get_next_rmap_item(slot, 1300 rmap_item = get_next_rmap_item(slot,
1247 ksm_scan.rmap_item->link.next, 1301 ksm_scan.rmap_list, ksm_scan.address);
1248 ksm_scan.address);
1249 if (rmap_item) { 1302 if (rmap_item) {
1250 ksm_scan.rmap_item = rmap_item; 1303 ksm_scan.rmap_list =
1304 &rmap_item->rmap_list;
1251 ksm_scan.address += PAGE_SIZE; 1305 ksm_scan.address += PAGE_SIZE;
1252 } else 1306 } else
1253 put_page(*page); 1307 put_page(*page);
@@ -1263,14 +1317,13 @@ next_mm:
1263 1317
1264 if (ksm_test_exit(mm)) { 1318 if (ksm_test_exit(mm)) {
1265 ksm_scan.address = 0; 1319 ksm_scan.address = 0;
1266 ksm_scan.rmap_item = list_entry(&slot->rmap_list, 1320 ksm_scan.rmap_list = &slot->rmap_list;
1267 struct rmap_item, link);
1268 } 1321 }
1269 /* 1322 /*
1270 * Nuke all the rmap_items that are above this current rmap: 1323 * Nuke all the rmap_items that are above this current rmap:
1271 * because there were no VM_MERGEABLE vmas with such addresses. 1324 * because there were no VM_MERGEABLE vmas with such addresses.
1272 */ 1325 */
1273 remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next); 1326 remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
1274 1327
1275 spin_lock(&ksm_mmlist_lock); 1328 spin_lock(&ksm_mmlist_lock);
1276 ksm_scan.mm_slot = list_entry(slot->mm_list.next, 1329 ksm_scan.mm_slot = list_entry(slot->mm_list.next,
@@ -1323,14 +1376,6 @@ static void ksm_do_scan(unsigned int scan_npages)
1323 return; 1376 return;
1324 if (!PageKsm(page) || !in_stable_tree(rmap_item)) 1377 if (!PageKsm(page) || !in_stable_tree(rmap_item))
1325 cmp_and_merge_page(page, rmap_item); 1378 cmp_and_merge_page(page, rmap_item);
1326 else if (page_mapcount(page) == 1) {
1327 /*
1328 * Replace now-unshared ksm page by ordinary page.
1329 */
1330 break_cow(rmap_item->mm, rmap_item->address);
1331 remove_rmap_item_from_tree(rmap_item);
1332 rmap_item->oldchecksum = calc_checksum(page);
1333 }
1334 put_page(page); 1379 put_page(page);
1335 } 1380 }
1336} 1381}
@@ -1375,7 +1420,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1375 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | 1420 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
1376 VM_PFNMAP | VM_IO | VM_DONTEXPAND | 1421 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1377 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | 1422 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1378 VM_MIXEDMAP | VM_SAO)) 1423 VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
1379 return 0; /* just ignore the advice */ 1424 return 0; /* just ignore the advice */
1380 1425
1381 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { 1426 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
@@ -1452,7 +1497,7 @@ void __ksm_exit(struct mm_struct *mm)
1452 spin_lock(&ksm_mmlist_lock); 1497 spin_lock(&ksm_mmlist_lock);
1453 mm_slot = get_mm_slot(mm); 1498 mm_slot = get_mm_slot(mm);
1454 if (mm_slot && ksm_scan.mm_slot != mm_slot) { 1499 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
1455 if (list_empty(&mm_slot->rmap_list)) { 1500 if (!mm_slot->rmap_list) {
1456 hlist_del(&mm_slot->link); 1501 hlist_del(&mm_slot->link);
1457 list_del(&mm_slot->mm_list); 1502 list_del(&mm_slot->mm_list);
1458 easy_to_free = 1; 1503 easy_to_free = 1;
@@ -1473,6 +1518,249 @@ void __ksm_exit(struct mm_struct *mm)
1473 } 1518 }
1474} 1519}
1475 1520
1521struct page *ksm_does_need_to_copy(struct page *page,
1522 struct vm_area_struct *vma, unsigned long address)
1523{
1524 struct page *new_page;
1525
1526 unlock_page(page); /* any racers will COW it, not modify it */
1527
1528 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1529 if (new_page) {
1530 copy_user_highpage(new_page, page, address, vma);
1531
1532 SetPageDirty(new_page);
1533 __SetPageUptodate(new_page);
1534 SetPageSwapBacked(new_page);
1535 __set_page_locked(new_page);
1536
1537 if (page_evictable(new_page, vma))
1538 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
1539 else
1540 add_page_to_unevictable_list(new_page);
1541 }
1542
1543 page_cache_release(page);
1544 return new_page;
1545}
1546
1547int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
1548 unsigned long *vm_flags)
1549{
1550 struct stable_node *stable_node;
1551 struct rmap_item *rmap_item;
1552 struct hlist_node *hlist;
1553 unsigned int mapcount = page_mapcount(page);
1554 int referenced = 0;
1555 int search_new_forks = 0;
1556
1557 VM_BUG_ON(!PageKsm(page));
1558 VM_BUG_ON(!PageLocked(page));
1559
1560 stable_node = page_stable_node(page);
1561 if (!stable_node)
1562 return 0;
1563again:
1564 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1565 struct anon_vma *anon_vma = rmap_item->anon_vma;
1566 struct vm_area_struct *vma;
1567
1568 spin_lock(&anon_vma->lock);
1569 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
1570 if (rmap_item->address < vma->vm_start ||
1571 rmap_item->address >= vma->vm_end)
1572 continue;
1573 /*
1574 * Initially we examine only the vma which covers this
1575 * rmap_item; but later, if there is still work to do,
1576 * we examine covering vmas in other mms: in case they
1577 * were forked from the original since ksmd passed.
1578 */
1579 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
1580 continue;
1581
1582 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
1583 continue;
1584
1585 referenced += page_referenced_one(page, vma,
1586 rmap_item->address, &mapcount, vm_flags);
1587 if (!search_new_forks || !mapcount)
1588 break;
1589 }
1590 spin_unlock(&anon_vma->lock);
1591 if (!mapcount)
1592 goto out;
1593 }
1594 if (!search_new_forks++)
1595 goto again;
1596out:
1597 return referenced;
1598}
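
page_referenced_ksm(), try_to_unmap_ksm() and rmap_walk_ksm() all share the same two-pass walk over the stable node's hlist: pass 0 visits only the vma belonging to the mm that registered each rmap_item, pass 1 revisits the anon_vma lists to pick up mappings inherited by fork. A compact sketch of that filter, with the data structures trimmed down to integers (not the kernel types):

#include <stdio.h>

struct vma {
        int mm_id;
};

struct rmap_item {
        int mm_id;                      /* mm that registered this item */
};

/*
 * Visit each (item, vma) pair at most once across two passes: pass 0
 * handles vma->mm == item->mm, pass 1 handles mms created by fork.
 */
static int walk(struct rmap_item *items, int nitems,
                struct vma *vmas, int nvmas)
{
        int search_new_forks = 0;
        int visited = 0;
        int i, v;

again:
        for (i = 0; i < nitems; i++) {
                for (v = 0; v < nvmas; v++) {
                        if ((items[i].mm_id == vmas[v].mm_id) ==
                            search_new_forks)
                                continue;
                        visited++;      /* page_referenced_one() etc. here */
                }
        }
        if (!search_new_forks++)
                goto again;
        return visited;
}

int main(void)
{
        struct rmap_item items[] = { { 1 } };
        struct vma vmas[] = { { 1 }, { 2 } };   /* mm 2 forked from mm 1 */

        printf("visited %d mappings\n", walk(items, 1, vmas, 2));
        return 0;
}
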
1599
1600int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
1601{
1602 struct stable_node *stable_node;
1603 struct hlist_node *hlist;
1604 struct rmap_item *rmap_item;
1605 int ret = SWAP_AGAIN;
1606 int search_new_forks = 0;
1607
1608 VM_BUG_ON(!PageKsm(page));
1609 VM_BUG_ON(!PageLocked(page));
1610
1611 stable_node = page_stable_node(page);
1612 if (!stable_node)
1613 return SWAP_FAIL;
1614again:
1615 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1616 struct anon_vma *anon_vma = rmap_item->anon_vma;
1617 struct vm_area_struct *vma;
1618
1619 spin_lock(&anon_vma->lock);
1620 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
1621 if (rmap_item->address < vma->vm_start ||
1622 rmap_item->address >= vma->vm_end)
1623 continue;
1624 /*
1625 * Initially we examine only the vma which covers this
1626 * rmap_item; but later, if there is still work to do,
1627 * we examine covering vmas in other mms: in case they
1628 * were forked from the original since ksmd passed.
1629 */
1630 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
1631 continue;
1632
1633 ret = try_to_unmap_one(page, vma,
1634 rmap_item->address, flags);
1635 if (ret != SWAP_AGAIN || !page_mapped(page)) {
1636 spin_unlock(&anon_vma->lock);
1637 goto out;
1638 }
1639 }
1640 spin_unlock(&anon_vma->lock);
1641 }
1642 if (!search_new_forks++)
1643 goto again;
1644out:
1645 return ret;
1646}
1647
1648#ifdef CONFIG_MIGRATION
1649int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
1650 struct vm_area_struct *, unsigned long, void *), void *arg)
1651{
1652 struct stable_node *stable_node;
1653 struct hlist_node *hlist;
1654 struct rmap_item *rmap_item;
1655 int ret = SWAP_AGAIN;
1656 int search_new_forks = 0;
1657
1658 VM_BUG_ON(!PageKsm(page));
1659 VM_BUG_ON(!PageLocked(page));
1660
1661 stable_node = page_stable_node(page);
1662 if (!stable_node)
1663 return ret;
1664again:
1665 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1666 struct anon_vma *anon_vma = rmap_item->anon_vma;
1667 struct vm_area_struct *vma;
1668
1669 spin_lock(&anon_vma->lock);
1670 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
1671 if (rmap_item->address < vma->vm_start ||
1672 rmap_item->address >= vma->vm_end)
1673 continue;
1674 /*
1675 * Initially we examine only the vma which covers this
1676 * rmap_item; but later, if there is still work to do,
1677 * we examine covering vmas in other mms: in case they
1678 * were forked from the original since ksmd passed.
1679 */
1680 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
1681 continue;
1682
1683 ret = rmap_one(page, vma, rmap_item->address, arg);
1684 if (ret != SWAP_AGAIN) {
1685 spin_unlock(&anon_vma->lock);
1686 goto out;
1687 }
1688 }
1689 spin_unlock(&anon_vma->lock);
1690 }
1691 if (!search_new_forks++)
1692 goto again;
1693out:
1694 return ret;
1695}
1696
1697void ksm_migrate_page(struct page *newpage, struct page *oldpage)
1698{
1699 struct stable_node *stable_node;
1700
1701 VM_BUG_ON(!PageLocked(oldpage));
1702 VM_BUG_ON(!PageLocked(newpage));
1703 VM_BUG_ON(newpage->mapping != oldpage->mapping);
1704
1705 stable_node = page_stable_node(newpage);
1706 if (stable_node) {
1707 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
1708 stable_node->kpfn = page_to_pfn(newpage);
1709 }
1710}
1711#endif /* CONFIG_MIGRATION */
1712
1713#ifdef CONFIG_MEMORY_HOTREMOVE
1714static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn,
1715 unsigned long end_pfn)
1716{
1717 struct rb_node *node;
1718
1719 for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) {
1720 struct stable_node *stable_node;
1721
1722 stable_node = rb_entry(node, struct stable_node, node);
1723 if (stable_node->kpfn >= start_pfn &&
1724 stable_node->kpfn < end_pfn)
1725 return stable_node;
1726 }
1727 return NULL;
1728}
1729
1730static int ksm_memory_callback(struct notifier_block *self,
1731 unsigned long action, void *arg)
1732{
1733 struct memory_notify *mn = arg;
1734 struct stable_node *stable_node;
1735
1736 switch (action) {
1737 case MEM_GOING_OFFLINE:
1738 /*
1739 * Keep it very simple for now: just lock out ksmd and
1740 * MADV_UNMERGEABLE while any memory is going offline.
1741 */
1742 mutex_lock(&ksm_thread_mutex);
1743 break;
1744
1745 case MEM_OFFLINE:
1746 /*
1747 * Most of the work is done by page migration; but there might
1748 * be a few stable_nodes left over, still pointing to struct
1749 * pages which have been offlined: prune those from the tree.
1750 */
1751 while ((stable_node = ksm_check_stable_tree(mn->start_pfn,
1752 mn->start_pfn + mn->nr_pages)) != NULL)
1753 remove_node_from_stable_tree(stable_node);
1754 /* fallthrough */
1755
1756 case MEM_CANCEL_OFFLINE:
1757 mutex_unlock(&ksm_thread_mutex);
1758 break;
1759 }
1760 return NOTIFY_OK;
1761}
1762#endif /* CONFIG_MEMORY_HOTREMOVE */
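
For memory hot-remove, ksm_memory_callback() holds ksm_thread_mutex across the offline operation and, once MEM_OFFLINE arrives, repeatedly prunes any stable_node whose kpfn falls inside the offlined range. A simplified sketch of that pruning loop, using an array walk in place of rb_first()/rb_next() (illustrative names, not kernel API):

#include <stdio.h>

struct stable_node {
        unsigned long kpfn;     /* pfn of the ksm page this node tracks */
        int live;
};

/*
 * Stand-in for the rb_first()/rb_next() walk in ksm_check_stable_tree():
 * return the first live node whose pfn lies in the range being offlined.
 */
static struct stable_node *check_range(struct stable_node *nodes, int n,
                                       unsigned long start_pfn,
                                       unsigned long end_pfn)
{
        int i;

        for (i = 0; i < n; i++) {
                if (!nodes[i].live)
                        continue;
                if (nodes[i].kpfn >= start_pfn && nodes[i].kpfn < end_pfn)
                        return &nodes[i];
        }
        return NULL;
}

int main(void)
{
        struct stable_node nodes[] = {
                { 0x100, 1 }, { 0x205, 1 }, { 0x300, 1 },
        };
        struct stable_node *sn;
        int i;

        /* MEM_OFFLINE handling: prune every node in the offlined pfn range. */
        while ((sn = check_range(nodes, 3, 0x200, 0x280)) != NULL)
                sn->live = 0;           /* remove_node_from_stable_tree() */

        for (i = 0; i < 3; i++)
                printf("pfn %#lx: %s\n", nodes[i].kpfn,
                       nodes[i].live ? "kept" : "pruned");
        return 0;
}
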
1763
1476#ifdef CONFIG_SYSFS 1764#ifdef CONFIG_SYSFS
1477/* 1765/*
1478 * This all compiles without CONFIG_SYSFS, but is a waste of space. 1766 * This all compiles without CONFIG_SYSFS, but is a waste of space.
@@ -1551,8 +1839,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1551 /* 1839 /*
1552 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. 1840 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
1553 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, 1841 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
1554 * breaking COW to free the unswappable pages_shared (but leaves 1842 * breaking COW to free the pages_shared (but leaves mm_slots
1555 * mm_slots on the list for when ksmd may be set running again). 1843 * on the list for when ksmd may be set running again).
1556 */ 1844 */
1557 1845
1558 mutex_lock(&ksm_thread_mutex); 1846 mutex_lock(&ksm_thread_mutex);
@@ -1577,29 +1865,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1577} 1865}
1578KSM_ATTR(run); 1866KSM_ATTR(run);
1579 1867
1580static ssize_t max_kernel_pages_store(struct kobject *kobj,
1581 struct kobj_attribute *attr,
1582 const char *buf, size_t count)
1583{
1584 int err;
1585 unsigned long nr_pages;
1586
1587 err = strict_strtoul(buf, 10, &nr_pages);
1588 if (err)
1589 return -EINVAL;
1590
1591 ksm_max_kernel_pages = nr_pages;
1592
1593 return count;
1594}
1595
1596static ssize_t max_kernel_pages_show(struct kobject *kobj,
1597 struct kobj_attribute *attr, char *buf)
1598{
1599 return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
1600}
1601KSM_ATTR(max_kernel_pages);
1602
1603static ssize_t pages_shared_show(struct kobject *kobj, 1868static ssize_t pages_shared_show(struct kobject *kobj,
1604 struct kobj_attribute *attr, char *buf) 1869 struct kobj_attribute *attr, char *buf)
1605{ 1870{
@@ -1649,7 +1914,6 @@ static struct attribute *ksm_attrs[] = {
1649 &sleep_millisecs_attr.attr, 1914 &sleep_millisecs_attr.attr,
1650 &pages_to_scan_attr.attr, 1915 &pages_to_scan_attr.attr,
1651 &run_attr.attr, 1916 &run_attr.attr,
1652 &max_kernel_pages_attr.attr,
1653 &pages_shared_attr.attr, 1917 &pages_shared_attr.attr,
1654 &pages_sharing_attr.attr, 1918 &pages_sharing_attr.attr,
1655 &pages_unshared_attr.attr, 1919 &pages_unshared_attr.attr,
@@ -1669,8 +1933,6 @@ static int __init ksm_init(void)
1669 struct task_struct *ksm_thread; 1933 struct task_struct *ksm_thread;
1670 int err; 1934 int err;
1671 1935
1672 ksm_max_kernel_pages = totalram_pages / 4;
1673
1674 err = ksm_slab_init(); 1936 err = ksm_slab_init();
1675 if (err) 1937 if (err)
1676 goto out; 1938 goto out;
@@ -1698,6 +1960,13 @@ static int __init ksm_init(void)
1698 1960
1699#endif /* CONFIG_SYSFS */ 1961#endif /* CONFIG_SYSFS */
1700 1962
1963#ifdef CONFIG_MEMORY_HOTREMOVE
1964 /*
1965 * Choose a high priority since the callback takes ksm_thread_mutex:
1966 * later callbacks could only be taking locks which nest within that.
1967 */
1968 hotplug_memory_notifier(ksm_memory_callback, 100);
1969#endif
1701 return 0; 1970 return 0;
1702 1971
1703out_free2: 1972out_free2:
diff --git a/mm/maccess.c b/mm/maccess.c
index 9073695ff25f..4e348dbaecd7 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -14,7 +14,11 @@
14 * Safely read from address @src to the buffer at @dst. If a kernel fault 14 * Safely read from address @src to the buffer at @dst. If a kernel fault
15 * happens, handle that and return -EFAULT. 15 * happens, handle that and return -EFAULT.
16 */ 16 */
17long probe_kernel_read(void *dst, void *src, size_t size) 17
18long __weak probe_kernel_read(void *dst, void *src, size_t size)
19 __attribute__((alias("__probe_kernel_read")));
20
21long __probe_kernel_read(void *dst, void *src, size_t size)
18{ 22{
19 long ret; 23 long ret;
20 mm_segment_t old_fs = get_fs(); 24 mm_segment_t old_fs = get_fs();
@@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
39 * Safely write to address @dst from the buffer at @src. If a kernel fault 43 * Safely write to address @dst from the buffer at @src. If a kernel fault
40 * happens, handle that and return -EFAULT. 44 * happens, handle that and return -EFAULT.
41 */ 45 */
42long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) 46long __weak probe_kernel_write(void *dst, void *src, size_t size)
47 __attribute__((alias("__probe_kernel_write")));
48
49long __probe_kernel_write(void *dst, void *src, size_t size)
43{ 50{
44 long ret; 51 long ret;
45 mm_segment_t old_fs = get_fs(); 52 mm_segment_t old_fs = get_fs();
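
The maccess change turns probe_kernel_read/write into weak aliases of the generic __probe_kernel_read/write, so an architecture can override the entry points while the generic versions stay callable. A user-space illustration of the same GCC weak-alias pattern (ELF targets; the function names here are made up for the demo):

#include <stdio.h>
#include <string.h>

/* Generic implementation, always reachable under its __ name. */
long __probe_read(void *dst, const void *src, size_t size)
{
        memcpy(dst, src, size); /* the kernel wraps this in fault handling */
        return 0;
}

/*
 * Weak alias: by default probe_read() resolves to the generic version,
 * but another object file may supply a strong probe_read() that wins,
 * which is exactly what an architecture can now do for
 * probe_kernel_read()/probe_kernel_write().
 */
long probe_read(void *dst, const void *src, size_t size)
        __attribute__((weak, alias("__probe_read")));

int main(void)
{
        char buf[8] = "";

        probe_read(buf, "hello", 6);
        printf("%s\n", buf);
        return 0;
}
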
diff --git a/mm/madvise.c b/mm/madvise.c
index 35b1479b7c9d..319528b8db74 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -9,6 +9,7 @@
9#include <linux/pagemap.h> 9#include <linux/pagemap.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/page-isolation.h>
12#include <linux/hugetlb.h> 13#include <linux/hugetlb.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/ksm.h> 15#include <linux/ksm.h>
@@ -222,7 +223,7 @@ static long madvise_remove(struct vm_area_struct *vma,
222/* 223/*
223 * Error injection support for memory error handling. 224 * Error injection support for memory error handling.
224 */ 225 */
225static int madvise_hwpoison(unsigned long start, unsigned long end) 226static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
226{ 227{
227 int ret = 0; 228 int ret = 0;
228 229
@@ -230,15 +231,21 @@ static int madvise_hwpoison(unsigned long start, unsigned long end)
230 return -EPERM; 231 return -EPERM;
231 for (; start < end; start += PAGE_SIZE) { 232 for (; start < end; start += PAGE_SIZE) {
232 struct page *p; 233 struct page *p;
233 int ret = get_user_pages(current, current->mm, start, 1, 234 int ret = get_user_pages_fast(start, 1, 0, &p);
234 0, 0, &p, NULL);
235 if (ret != 1) 235 if (ret != 1)
236 return ret; 236 return ret;
237 if (bhv == MADV_SOFT_OFFLINE) {
238 printk(KERN_INFO "Soft offlining page %lx at %lx\n",
239 page_to_pfn(p), start);
240 ret = soft_offline_page(p, MF_COUNT_INCREASED);
241 if (ret)
242 break;
243 continue;
244 }
237 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", 245 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
238 page_to_pfn(p), start); 246 page_to_pfn(p), start);
239 /* Ignore return value for now */ 247 /* Ignore return value for now */
240 __memory_failure(page_to_pfn(p), 0, 1); 248 __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
241 put_page(p);
242 } 249 }
243 return ret; 250 return ret;
244} 251}
@@ -335,8 +342,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
335 size_t len; 342 size_t len;
336 343
337#ifdef CONFIG_MEMORY_FAILURE 344#ifdef CONFIG_MEMORY_FAILURE
338 if (behavior == MADV_HWPOISON) 345 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
339 return madvise_hwpoison(start, start+len_in); 346 return madvise_hwpoison(behavior, start, start+len_in);
340#endif 347#endif
341 if (!madvise_behavior_valid(behavior)) 348 if (!madvise_behavior_valid(behavior))
342 return error; 349 return error;
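
The madvise change wires MADV_SOFT_OFFLINE through madvise_hwpoison(), so user space can ask for a page to be migrated away and its backing frame soft-offlined. A hedged usage sketch from user space (assumes the asm-generic MADV_SOFT_OFFLINE value of 101 current at the time of this patch; the call needs CAP_SYS_ADMIN and CONFIG_MEMORY_FAILURE, so expect EPERM or EINVAL otherwise):

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/mman.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101   /* asm-generic value as of this patch */
#endif

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        memset(p, 0xaa, page);  /* make sure the page is actually backed */

        /*
         * Ask the kernel to migrate the data away and soft-offline the
         * backing frame.  Needs CAP_SYS_ADMIN and CONFIG_MEMORY_FAILURE;
         * expect EPERM or EINVAL otherwise.
         */
        if (madvise(p, page, MADV_SOFT_OFFLINE))
                fprintf(stderr, "madvise(MADV_SOFT_OFFLINE): %s\n",
                        strerror(errno));
        else
                printf("soft-offlined; data still readable: %#x\n",
                       (unsigned char)p[0]);
        return 0;
}

Unlike MADV_HWPOISON, the soft-offline path preserves the data by migrating it first, which is why the sketch can still read p[0] afterwards.
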
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c31a310aa146..954032b80bed 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -38,6 +38,7 @@
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/mm_inline.h> 39#include <linux/mm_inline.h>
40#include <linux/page_cgroup.h> 40#include <linux/page_cgroup.h>
41#include <linux/cpu.h>
41#include "internal.h" 42#include "internal.h"
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
@@ -54,7 +55,6 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
54#define do_swap_account (0) 55#define do_swap_account (0)
55#endif 56#endif
56 57
57static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */
58#define SOFTLIMIT_EVENTS_THRESH (1000) 58#define SOFTLIMIT_EVENTS_THRESH (1000)
59 59
60/* 60/*
@@ -66,7 +66,7 @@ enum mem_cgroup_stat_index {
66 */ 66 */
67 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 67 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
68 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 68 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
69 MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ 69 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
70 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 70 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
71 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 71 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
72 MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ 72 MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
@@ -275,6 +275,7 @@ enum charge_type {
275static void mem_cgroup_get(struct mem_cgroup *mem); 275static void mem_cgroup_get(struct mem_cgroup *mem);
276static void mem_cgroup_put(struct mem_cgroup *mem); 276static void mem_cgroup_put(struct mem_cgroup *mem);
277static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 277static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
278static void drain_all_stock_async(void);
278 279
279static struct mem_cgroup_per_zone * 280static struct mem_cgroup_per_zone *
280mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 281mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -282,6 +283,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
282 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 283 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
283} 284}
284 285
286struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
287{
288 return &mem->css;
289}
290
285static struct mem_cgroup_per_zone * 291static struct mem_cgroup_per_zone *
286page_cgroup_zoneinfo(struct page_cgroup *pc) 292page_cgroup_zoneinfo(struct page_cgroup *pc)
287{ 293{
@@ -758,7 +764,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
758 task_unlock(task); 764 task_unlock(task);
759 if (!curr) 765 if (!curr)
760 return 0; 766 return 0;
761 if (curr->use_hierarchy) 767 /*
 768 * We should check use_hierarchy of "mem", not "curr". Checking
 769 * use_hierarchy of "curr" here would make this function return true if
 770 * hierarchy is enabled in "curr" and "curr" is a child of "mem" in the
 771 * *cgroup* hierarchy (even if use_hierarchy is disabled in "mem").
772 */
773 if (mem->use_hierarchy)
762 ret = css_is_ancestor(&curr->css, &mem->css); 774 ret = css_is_ancestor(&curr->css, &mem->css);
763 else 775 else
764 ret = (curr == mem); 776 ret = (curr == mem);
@@ -1007,7 +1019,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1007 static char memcg_name[PATH_MAX]; 1019 static char memcg_name[PATH_MAX];
1008 int ret; 1020 int ret;
1009 1021
1010 if (!memcg) 1022 if (!memcg || !p)
1011 return; 1023 return;
1012 1024
1013 1025
@@ -1137,6 +1149,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1137 victim = mem_cgroup_select_victim(root_mem); 1149 victim = mem_cgroup_select_victim(root_mem);
1138 if (victim == root_mem) { 1150 if (victim == root_mem) {
1139 loop++; 1151 loop++;
1152 if (loop >= 1)
1153 drain_all_stock_async();
1140 if (loop >= 2) { 1154 if (loop >= 2) {
1141 /* 1155 /*
1142 * If we have not been able to reclaim 1156 * If we have not been able to reclaim
@@ -1223,7 +1237,7 @@ static void record_last_oom(struct mem_cgroup *mem)
1223 * Currently used to update mapped file statistics, but the routine can be 1237 * Currently used to update mapped file statistics, but the routine can be
1224 * generalized to update other statistics as well. 1238 * generalized to update other statistics as well.
1225 */ 1239 */
1226void mem_cgroup_update_mapped_file_stat(struct page *page, int val) 1240void mem_cgroup_update_file_mapped(struct page *page, int val)
1227{ 1241{
1228 struct mem_cgroup *mem; 1242 struct mem_cgroup *mem;
1229 struct mem_cgroup_stat *stat; 1243 struct mem_cgroup_stat *stat;
@@ -1231,9 +1245,6 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
1231 int cpu; 1245 int cpu;
1232 struct page_cgroup *pc; 1246 struct page_cgroup *pc;
1233 1247
1234 if (!page_is_file_cache(page))
1235 return;
1236
1237 pc = lookup_page_cgroup(page); 1248 pc = lookup_page_cgroup(page);
1238 if (unlikely(!pc)) 1249 if (unlikely(!pc))
1239 return; 1250 return;
@@ -1253,12 +1264,139 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
1253 stat = &mem->stat; 1264 stat = &mem->stat;
1254 cpustat = &stat->cpustat[cpu]; 1265 cpustat = &stat->cpustat[cpu];
1255 1266
1256 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); 1267 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
1257done: 1268done:
1258 unlock_page_cgroup(pc); 1269 unlock_page_cgroup(pc);
1259} 1270}
1260 1271
1261/* 1272/*
1273 * size of first charge trial. "32" comes from vmscan.c's magic value.
 1274 * TODO: bigger numbers may be necessary on big-iron machines.
1275 */
1276#define CHARGE_SIZE (32 * PAGE_SIZE)
1277struct memcg_stock_pcp {
1278 struct mem_cgroup *cached; /* this never be root cgroup */
1279 int charge;
1280 struct work_struct work;
1281};
1282static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1283static atomic_t memcg_drain_count;
1284
1285/*
 1286 * Try to consume stocked charge on this cpu. If successful, PAGE_SIZE is
 1287 * consumed from the local stock and true is returned. If the stock is empty
 1288 * or holds charges from a cgroup other than the current target, false is
 1289 * returned and the stock will be refilled.
1290 */
1291static bool consume_stock(struct mem_cgroup *mem)
1292{
1293 struct memcg_stock_pcp *stock;
1294 bool ret = true;
1295
1296 stock = &get_cpu_var(memcg_stock);
1297 if (mem == stock->cached && stock->charge)
1298 stock->charge -= PAGE_SIZE;
1299 else /* need to call res_counter_charge */
1300 ret = false;
1301 put_cpu_var(memcg_stock);
1302 return ret;
1303}
1304
1305/*
1306 * Returns stocks cached in percpu to res_counter and reset cached information.
1307 */
1308static void drain_stock(struct memcg_stock_pcp *stock)
1309{
1310 struct mem_cgroup *old = stock->cached;
1311
1312 if (stock->charge) {
1313 res_counter_uncharge(&old->res, stock->charge);
1314 if (do_swap_account)
1315 res_counter_uncharge(&old->memsw, stock->charge);
1316 }
1317 stock->cached = NULL;
1318 stock->charge = 0;
1319}
1320
1321/*
1322 * This must be called under preempt disabled or must be called by
1323 * a thread which is pinned to local cpu.
1324 */
1325static void drain_local_stock(struct work_struct *dummy)
1326{
1327 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1328 drain_stock(stock);
1329}
1330
1331/*
1332 * Cache charges(val) which is from res_counter, to local per_cpu area.
1333 * This will be consumed by consumt_stock() function, later.
1334 */
1335static void refill_stock(struct mem_cgroup *mem, int val)
1336{
1337 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1338
1339 if (stock->cached != mem) { /* reset if necessary */
1340 drain_stock(stock);
1341 stock->cached = mem;
1342 }
1343 stock->charge += val;
1344 put_cpu_var(memcg_stock);
1345}
1346
1347/*
 1348 * Tries to drain stocked charges on other cpus. This function is asynchronous:
 1349 * it just queues a work item per cpu to drain locally on that cpu. The caller
 1350 * can expect some charges to be returned to the res_counter later, but cannot
 1351 * wait for that.
1352 */
1353static void drain_all_stock_async(void)
1354{
1355 int cpu;
 1356	/* This function schedules "drain" asynchronously. The result of
 1357	 * "drain" is not handled directly by callers, so if a drain is
 1358	 * already in flight we don't have to schedule another one.
 1359	 * The WORK_STRUCT_PENDING check in queue_work_on() catches any
 1360	 * race; the check here is deliberately loose.
1361 */
1362 if (atomic_read(&memcg_drain_count))
1363 return;
1364 /* Notify other cpus that system-wide "drain" is running */
1365 atomic_inc(&memcg_drain_count);
1366 get_online_cpus();
1367 for_each_online_cpu(cpu) {
1368 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1369 schedule_work_on(cpu, &stock->work);
1370 }
1371 put_online_cpus();
1372 atomic_dec(&memcg_drain_count);
1373 /* We don't wait for flush_work */
1374}
1375
1376/* This is a synchronous drain interface. */
1377static void drain_all_stock_sync(void)
1378{
1379 /* called when force_empty is called */
1380 atomic_inc(&memcg_drain_count);
1381 schedule_on_each_cpu(drain_local_stock);
1382 atomic_dec(&memcg_drain_count);
1383}
1384
1385static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
1386 unsigned long action,
1387 void *hcpu)
1388{
1389 int cpu = (unsigned long)hcpu;
1390 struct memcg_stock_pcp *stock;
1391
1392 if (action != CPU_DEAD)
1393 return NOTIFY_OK;
1394 stock = &per_cpu(memcg_stock, cpu);
1395 drain_stock(stock);
1396 return NOTIFY_OK;
1397}
1398
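
The new memcg stock machinery pre-charges CHARGE_SIZE bytes from the res_counter and parks the surplus in a per-cpu stock, so most page charges never touch the shared counter; drain_stock() hands unused pre-charge back. A single-threaded user-space sketch of that batching (one global counter stands in for the per-cgroup res_counter, and "per cpu" collapses to a single struct):

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SIZE       4096
#define CHARGE_SIZE     (32 * PAGE_SIZE)        /* pre-charge 32 pages at once */

static long res_usage;                  /* one counter stands in for res_counter */

struct stock {                          /* per cpu in the kernel */
        int cgroup_id;                  /* whose charge is parked here */
        long charge;                    /* pre-charged bytes not yet used */
};

/* Fast path: satisfy one page charge from the local stock. */
static bool consume_stock(struct stock *s, int cgroup_id)
{
        if (s->cgroup_id == cgroup_id && s->charge >= PAGE_SIZE) {
                s->charge -= PAGE_SIZE;
                return true;
        }
        return false;
}

/* Hand unused pre-charge back to the shared counter. */
static void drain_stock(struct stock *s)
{
        res_usage -= s->charge;
        s->charge = 0;
        s->cgroup_id = -1;
}

/* Charge one page: hit the stock if possible, else charge a whole batch. */
static void charge_page(struct stock *s, int cgroup_id)
{
        if (consume_stock(s, cgroup_id))
                return;
        res_usage += CHARGE_SIZE;               /* one res_counter_charge() */
        if (s->cgroup_id != cgroup_id)
                drain_stock(s);                 /* stock belonged to someone else */
        s->cgroup_id = cgroup_id;
        s->charge += CHARGE_SIZE - PAGE_SIZE;   /* keep the rest for later */
}

int main(void)
{
        struct stock s = { -1, 0 };
        int i;

        for (i = 0; i < 10; i++)
                charge_page(&s, 1);
        printf("usage after 10 page charges: %ld (one batched charge)\n",
               res_usage);
        drain_stock(&s);
        printf("usage after drain: %ld (10 pages remain charged)\n",
               res_usage);
        return 0;
}
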
1399/*
1262 * Unlike exported interface, "oom" parameter is added. if oom==true, 1400 * Unlike exported interface, "oom" parameter is added. if oom==true,
1263 * oom-killer can be invoked. 1401 * oom-killer can be invoked.
1264 */ 1402 */
@@ -1269,6 +1407,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1269 struct mem_cgroup *mem, *mem_over_limit; 1407 struct mem_cgroup *mem, *mem_over_limit;
1270 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1408 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1271 struct res_counter *fail_res; 1409 struct res_counter *fail_res;
1410 int csize = CHARGE_SIZE;
1272 1411
1273 if (unlikely(test_thread_flag(TIF_MEMDIE))) { 1412 if (unlikely(test_thread_flag(TIF_MEMDIE))) {
1274 /* Don't account this! */ 1413 /* Don't account this! */
@@ -1293,23 +1432,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1293 return 0; 1432 return 0;
1294 1433
1295 VM_BUG_ON(css_is_removed(&mem->css)); 1434 VM_BUG_ON(css_is_removed(&mem->css));
1435 if (mem_cgroup_is_root(mem))
1436 goto done;
1296 1437
1297 while (1) { 1438 while (1) {
1298 int ret = 0; 1439 int ret = 0;
1299 unsigned long flags = 0; 1440 unsigned long flags = 0;
1300 1441
1301 if (mem_cgroup_is_root(mem)) 1442 if (consume_stock(mem))
1302 goto done; 1443 goto charged;
1303 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); 1444
1445 ret = res_counter_charge(&mem->res, csize, &fail_res);
1304 if (likely(!ret)) { 1446 if (likely(!ret)) {
1305 if (!do_swap_account) 1447 if (!do_swap_account)
1306 break; 1448 break;
1307 ret = res_counter_charge(&mem->memsw, PAGE_SIZE, 1449 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1308 &fail_res);
1309 if (likely(!ret)) 1450 if (likely(!ret))
1310 break; 1451 break;
1311 /* mem+swap counter fails */ 1452 /* mem+swap counter fails */
1312 res_counter_uncharge(&mem->res, PAGE_SIZE); 1453 res_counter_uncharge(&mem->res, csize);
1313 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 1454 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1314 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1455 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1315 memsw); 1456 memsw);
@@ -1318,6 +1459,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1318 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1459 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1319 res); 1460 res);
1320 1461
1462 /* reduce request size and retry */
1463 if (csize > PAGE_SIZE) {
1464 csize = PAGE_SIZE;
1465 continue;
1466 }
1321 if (!(gfp_mask & __GFP_WAIT)) 1467 if (!(gfp_mask & __GFP_WAIT))
1322 goto nomem; 1468 goto nomem;
1323 1469
@@ -1339,14 +1485,15 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1339 1485
1340 if (!nr_retries--) { 1486 if (!nr_retries--) {
1341 if (oom) { 1487 if (oom) {
1342 mutex_lock(&memcg_tasklist);
1343 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); 1488 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
1344 mutex_unlock(&memcg_tasklist);
1345 record_last_oom(mem_over_limit); 1489 record_last_oom(mem_over_limit);
1346 } 1490 }
1347 goto nomem; 1491 goto nomem;
1348 } 1492 }
1349 } 1493 }
1494 if (csize > PAGE_SIZE)
1495 refill_stock(mem, csize - PAGE_SIZE);
1496charged:
1350 /* 1497 /*
1351 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 1498 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
1352 * if they exceeds softlimit. 1499 * if they exceeds softlimit.
@@ -1361,6 +1508,21 @@ nomem:
1361} 1508}
1362 1509
1363/* 1510/*
 1511 * Sometimes we have to undo a charge we got by try_charge().
 1512 * This function does that: it uncharges and puts the css refcount
 1513 * taken by try_charge().
1514 */
1515static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
1516{
1517 if (!mem_cgroup_is_root(mem)) {
1518 res_counter_uncharge(&mem->res, PAGE_SIZE);
1519 if (do_swap_account)
1520 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1521 }
1522 css_put(&mem->css);
1523}
1524
1525/*
1364 * A helper function to get mem_cgroup from ID. must be called under 1526 * A helper function to get mem_cgroup from ID. must be called under
1365 * rcu_read_lock(). The caller must check css_is_removed() or some if 1527 * rcu_read_lock(). The caller must check css_is_removed() or some if
1366 * it's concern. (dropping refcnt from swap can be called against removed 1528 * it's concern. (dropping refcnt from swap can be called against removed
@@ -1379,25 +1541,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1379 return container_of(css, struct mem_cgroup, css); 1541 return container_of(css, struct mem_cgroup, css);
1380} 1542}
1381 1543
1382static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) 1544struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
1383{ 1545{
1384 struct mem_cgroup *mem; 1546 struct mem_cgroup *mem = NULL;
1385 struct page_cgroup *pc; 1547 struct page_cgroup *pc;
1386 unsigned short id; 1548 unsigned short id;
1387 swp_entry_t ent; 1549 swp_entry_t ent;
1388 1550
1389 VM_BUG_ON(!PageLocked(page)); 1551 VM_BUG_ON(!PageLocked(page));
1390 1552
1391 if (!PageSwapCache(page))
1392 return NULL;
1393
1394 pc = lookup_page_cgroup(page); 1553 pc = lookup_page_cgroup(page);
1395 lock_page_cgroup(pc); 1554 lock_page_cgroup(pc);
1396 if (PageCgroupUsed(pc)) { 1555 if (PageCgroupUsed(pc)) {
1397 mem = pc->mem_cgroup; 1556 mem = pc->mem_cgroup;
1398 if (mem && !css_tryget(&mem->css)) 1557 if (mem && !css_tryget(&mem->css))
1399 mem = NULL; 1558 mem = NULL;
1400 } else { 1559 } else if (PageSwapCache(page)) {
1401 ent.val = page_private(page); 1560 ent.val = page_private(page);
1402 id = lookup_swap_cgroup(ent); 1561 id = lookup_swap_cgroup(ent);
1403 rcu_read_lock(); 1562 rcu_read_lock();
@@ -1426,12 +1585,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1426 lock_page_cgroup(pc); 1585 lock_page_cgroup(pc);
1427 if (unlikely(PageCgroupUsed(pc))) { 1586 if (unlikely(PageCgroupUsed(pc))) {
1428 unlock_page_cgroup(pc); 1587 unlock_page_cgroup(pc);
1429 if (!mem_cgroup_is_root(mem)) { 1588 mem_cgroup_cancel_charge(mem);
1430 res_counter_uncharge(&mem->res, PAGE_SIZE);
1431 if (do_swap_account)
1432 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1433 }
1434 css_put(&mem->css);
1435 return; 1589 return;
1436 } 1590 }
1437 1591
@@ -1464,27 +1618,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1464} 1618}
1465 1619
1466/** 1620/**
1467 * mem_cgroup_move_account - move account of the page 1621 * __mem_cgroup_move_account - move account of the page
1468 * @pc: page_cgroup of the page. 1622 * @pc: page_cgroup of the page.
1469 * @from: mem_cgroup which the page is moved from. 1623 * @from: mem_cgroup which the page is moved from.
1470 * @to: mem_cgroup which the page is moved to. @from != @to. 1624 * @to: mem_cgroup which the page is moved to. @from != @to.
1471 * 1625 *
1472 * The caller must confirm following. 1626 * The caller must confirm following.
1473 * - page is not on LRU (isolate_page() is useful.) 1627 * - page is not on LRU (isolate_page() is useful.)
1474 * 1628 * - the pc is locked, used, and ->mem_cgroup points to @from.
1475 * returns 0 at success,
1476 * returns -EBUSY when lock is busy or "pc" is unstable.
1477 * 1629 *
1478 * This function does "uncharge" from old cgroup but doesn't do "charge" to 1630 * This function does "uncharge" from old cgroup but doesn't do "charge" to
1479 * new cgroup. It should be done by a caller. 1631 * new cgroup. It should be done by a caller.
1480 */ 1632 */
1481 1633
1482static int mem_cgroup_move_account(struct page_cgroup *pc, 1634static void __mem_cgroup_move_account(struct page_cgroup *pc,
1483 struct mem_cgroup *from, struct mem_cgroup *to) 1635 struct mem_cgroup *from, struct mem_cgroup *to)
1484{ 1636{
1485 struct mem_cgroup_per_zone *from_mz, *to_mz;
1486 int nid, zid;
1487 int ret = -EBUSY;
1488 struct page *page; 1637 struct page *page;
1489 int cpu; 1638 int cpu;
1490 struct mem_cgroup_stat *stat; 1639 struct mem_cgroup_stat *stat;
@@ -1492,38 +1641,27 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1492 1641
1493 VM_BUG_ON(from == to); 1642 VM_BUG_ON(from == to);
1494 VM_BUG_ON(PageLRU(pc->page)); 1643 VM_BUG_ON(PageLRU(pc->page));
1495 1644 VM_BUG_ON(!PageCgroupLocked(pc));
1496 nid = page_cgroup_nid(pc); 1645 VM_BUG_ON(!PageCgroupUsed(pc));
1497 zid = page_cgroup_zid(pc); 1646 VM_BUG_ON(pc->mem_cgroup != from);
1498 from_mz = mem_cgroup_zoneinfo(from, nid, zid);
1499 to_mz = mem_cgroup_zoneinfo(to, nid, zid);
1500
1501 if (!trylock_page_cgroup(pc))
1502 return ret;
1503
1504 if (!PageCgroupUsed(pc))
1505 goto out;
1506
1507 if (pc->mem_cgroup != from)
1508 goto out;
1509 1647
1510 if (!mem_cgroup_is_root(from)) 1648 if (!mem_cgroup_is_root(from))
1511 res_counter_uncharge(&from->res, PAGE_SIZE); 1649 res_counter_uncharge(&from->res, PAGE_SIZE);
1512 mem_cgroup_charge_statistics(from, pc, false); 1650 mem_cgroup_charge_statistics(from, pc, false);
1513 1651
1514 page = pc->page; 1652 page = pc->page;
1515 if (page_is_file_cache(page) && page_mapped(page)) { 1653 if (page_mapped(page) && !PageAnon(page)) {
1516 cpu = smp_processor_id(); 1654 cpu = smp_processor_id();
1517 /* Update mapped_file data for mem_cgroup "from" */ 1655 /* Update mapped_file data for mem_cgroup "from" */
1518 stat = &from->stat; 1656 stat = &from->stat;
1519 cpustat = &stat->cpustat[cpu]; 1657 cpustat = &stat->cpustat[cpu];
1520 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, 1658 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1521 -1); 1659 -1);
1522 1660
1523 /* Update mapped_file data for mem_cgroup "to" */ 1661 /* Update mapped_file data for mem_cgroup "to" */
1524 stat = &to->stat; 1662 stat = &to->stat;
1525 cpustat = &stat->cpustat[cpu]; 1663 cpustat = &stat->cpustat[cpu];
1526 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, 1664 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
1527 1); 1665 1);
1528 } 1666 }
1529 1667
@@ -1534,15 +1672,28 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1534 css_get(&to->css); 1672 css_get(&to->css);
1535 pc->mem_cgroup = to; 1673 pc->mem_cgroup = to;
1536 mem_cgroup_charge_statistics(to, pc, true); 1674 mem_cgroup_charge_statistics(to, pc, true);
1537 ret = 0;
1538out:
1539 unlock_page_cgroup(pc);
1540 /* 1675 /*
1541 * We charges against "to" which may not have any tasks. Then, "to" 1676 * We charges against "to" which may not have any tasks. Then, "to"
1542 * can be under rmdir(). But in current implementation, caller of 1677 * can be under rmdir(). But in current implementation, caller of
1543 * this function is just force_empty() and it's garanteed that 1678 * this function is just force_empty() and it's garanteed that
1544 * "to" is never removed. So, we don't check rmdir status here. 1679 * "to" is never removed. So, we don't check rmdir status here.
1545 */ 1680 */
1681}
1682
1683/*
1684 * check whether the @pc is valid for moving account and call
1685 * __mem_cgroup_move_account()
1686 */
1687static int mem_cgroup_move_account(struct page_cgroup *pc,
1688 struct mem_cgroup *from, struct mem_cgroup *to)
1689{
1690 int ret = -EINVAL;
1691 lock_page_cgroup(pc);
1692 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
1693 __mem_cgroup_move_account(pc, from, to);
1694 ret = 0;
1695 }
1696 unlock_page_cgroup(pc);
1546 return ret; 1697 return ret;
1547} 1698}
1548 1699
@@ -1564,45 +1715,27 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
1564 if (!pcg) 1715 if (!pcg)
1565 return -EINVAL; 1716 return -EINVAL;
1566 1717
1718 ret = -EBUSY;
1719 if (!get_page_unless_zero(page))
1720 goto out;
1721 if (isolate_lru_page(page))
1722 goto put;
1567 1723
1568 parent = mem_cgroup_from_cont(pcg); 1724 parent = mem_cgroup_from_cont(pcg);
1569
1570
1571 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); 1725 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
1572 if (ret || !parent) 1726 if (ret || !parent)
1573 return ret; 1727 goto put_back;
1574
1575 if (!get_page_unless_zero(page)) {
1576 ret = -EBUSY;
1577 goto uncharge;
1578 }
1579
1580 ret = isolate_lru_page(page);
1581
1582 if (ret)
1583 goto cancel;
1584 1728
1585 ret = mem_cgroup_move_account(pc, child, parent); 1729 ret = mem_cgroup_move_account(pc, child, parent);
1586 1730 if (!ret)
1731 css_put(&parent->css); /* drop extra refcnt by try_charge() */
1732 else
1733 mem_cgroup_cancel_charge(parent); /* does css_put */
1734put_back:
1587 putback_lru_page(page); 1735 putback_lru_page(page);
1588 if (!ret) { 1736put:
1589 put_page(page);
1590 /* drop extra refcnt by try_charge() */
1591 css_put(&parent->css);
1592 return 0;
1593 }
1594
1595cancel:
1596 put_page(page); 1737 put_page(page);
1597uncharge: 1738out:
1598 /* drop extra refcnt by try_charge() */
1599 css_put(&parent->css);
1600 /* uncharge if move fails */
1601 if (!mem_cgroup_is_root(parent)) {
1602 res_counter_uncharge(&parent->res, PAGE_SIZE);
1603 if (do_swap_account)
1604 res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1605 }
1606 return ret; 1739 return ret;
1607} 1740}
1608 1741
@@ -1737,12 +1870,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1737 goto charge_cur_mm; 1870 goto charge_cur_mm;
1738 /* 1871 /*
1739 * A racing thread's fault, or swapoff, may have already updated 1872 * A racing thread's fault, or swapoff, may have already updated
1740 * the pte, and even removed page from swap cache: return success 1873 * the pte, and even removed page from swap cache: in those cases
1741 * to go on to do_swap_page()'s pte_same() test, which should fail. 1874 * do_swap_page()'s pte_same() test will fail; but there's also a
1875 * KSM case which does need to charge the page.
1742 */ 1876 */
1743 if (!PageSwapCache(page)) 1877 if (!PageSwapCache(page))
1744 return 0; 1878 goto charge_cur_mm;
1745 mem = try_get_mem_cgroup_from_swapcache(page); 1879 mem = try_get_mem_cgroup_from_page(page);
1746 if (!mem) 1880 if (!mem)
1747 goto charge_cur_mm; 1881 goto charge_cur_mm;
1748 *ptr = mem; 1882 *ptr = mem;
@@ -1818,14 +1952,53 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1818 return; 1952 return;
1819 if (!mem) 1953 if (!mem)
1820 return; 1954 return;
1821 if (!mem_cgroup_is_root(mem)) { 1955 mem_cgroup_cancel_charge(mem);
1822 res_counter_uncharge(&mem->res, PAGE_SIZE);
1823 if (do_swap_account)
1824 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1825 }
1826 css_put(&mem->css);
1827} 1956}
1828 1957
1958static void
1959__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
1960{
1961 struct memcg_batch_info *batch = NULL;
1962 bool uncharge_memsw = true;
1963 /* If swapout, usage of swap doesn't decrease */
1964 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1965 uncharge_memsw = false;
1966 /*
1967 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
 1968 * In those cases, all pages freed continuously can be expected to be in
 1969 * the same cgroup, and we have a chance to coalesce uncharges.
 1970 * But we do uncharge one by one if this is killed by OOM (TIF_MEMDIE)
1971 * because we want to do uncharge as soon as possible.
1972 */
1973 if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
1974 goto direct_uncharge;
1975
1976 batch = &current->memcg_batch;
1977 /*
1978 * In usual, we do css_get() when we remember memcg pointer.
1979 * But in this case, we keep res->usage until end of a series of
1980 * uncharges. Then, it's ok to ignore memcg's refcnt.
1981 */
1982 if (!batch->memcg)
1983 batch->memcg = mem;
1984 /*
1985 * In typical case, batch->memcg == mem. This means we can
1986 * merge a series of uncharges to an uncharge of res_counter.
1987 * If not, we uncharge res_counter ony by one.
1988 */
1989 if (batch->memcg != mem)
1990 goto direct_uncharge;
1991 /* remember freed charge and uncharge it later */
1992 batch->bytes += PAGE_SIZE;
1993 if (uncharge_memsw)
1994 batch->memsw_bytes += PAGE_SIZE;
1995 return;
1996direct_uncharge:
1997 res_counter_uncharge(&mem->res, PAGE_SIZE);
1998 if (uncharge_memsw)
1999 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
2000 return;
2001}
1829 2002
1830/* 2003/*
1831 * uncharge if !page_mapped(page) 2004 * uncharge if !page_mapped(page)
@@ -1874,12 +2047,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1874 break; 2047 break;
1875 } 2048 }
1876 2049
1877 if (!mem_cgroup_is_root(mem)) { 2050 if (!mem_cgroup_is_root(mem))
1878 res_counter_uncharge(&mem->res, PAGE_SIZE); 2051 __do_uncharge(mem, ctype);
1879 if (do_swap_account &&
1880 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1881 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1882 }
1883 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2052 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1884 mem_cgroup_swap_statistics(mem, true); 2053 mem_cgroup_swap_statistics(mem, true);
1885 mem_cgroup_charge_statistics(mem, pc, false); 2054 mem_cgroup_charge_statistics(mem, pc, false);
@@ -1925,6 +2094,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
1925 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 2094 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
1926} 2095}
1927 2096
2097/*
 2098 * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
 2099 * In those cases, pages are freed continuously and can be expected to be
 2100 * in the same memcg. Each of those callers itself limits the number of
 2101 * pages freed at once, so uncharge_start/end() is called properly.
 2102 * This may be called several times (nested) in one context.
2103 */
2104
2105void mem_cgroup_uncharge_start(void)
2106{
2107 current->memcg_batch.do_batch++;
2108 /* We can do nest. */
2109 if (current->memcg_batch.do_batch == 1) {
2110 current->memcg_batch.memcg = NULL;
2111 current->memcg_batch.bytes = 0;
2112 current->memcg_batch.memsw_bytes = 0;
2113 }
2114}
2115
2116void mem_cgroup_uncharge_end(void)
2117{
2118 struct memcg_batch_info *batch = &current->memcg_batch;
2119
2120 if (!batch->do_batch)
2121 return;
2122
2123 batch->do_batch--;
2124 if (batch->do_batch) /* If stacked, do nothing. */
2125 return;
2126
2127 if (!batch->memcg)
2128 return;
2129 /*
2130 * This "batch->memcg" is valid without any css_get/put etc...
 2131 * because we hide charges behind us.
2132 */
2133 if (batch->bytes)
2134 res_counter_uncharge(&batch->memcg->res, batch->bytes);
2135 if (batch->memsw_bytes)
2136 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
2137 /* forget this pointer (for sanity check) */
2138 batch->memcg = NULL;
2139}
2140
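
The batching only pays off when a caller brackets a whole run of page frees; the diff wires this into unmap_page_range() further down. As a rough, hedged illustration only (release_one_page() is a made-up stand-in for whatever path ends in __mem_cgroup_uncharge_common()), a caller would look roughly like:

/*
 * Illustrative sketch, not part of this patch: bracket a run of frees so
 * that __do_uncharge() can coalesce them into one res_counter_uncharge().
 * release_one_page() is a hypothetical stand-in for the real freeing path.
 */
static void release_page_run(struct page **pages, int nr)
{
	int i;

	mem_cgroup_uncharge_start();		/* may nest; see do_batch */
	for (i = 0; i < nr; i++)
		release_one_page(pages[i]);	/* eventually uncharges */
	mem_cgroup_uncharge_end();		/* flush batch->bytes */
}
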
1928#ifdef CONFIG_SWAP 2141#ifdef CONFIG_SWAP
1929/* 2142/*
1930 * called after __delete_from_swap_cache() and drop "page" account. 2143 * called after __delete_from_swap_cache() and drop "page" account.
@@ -2100,7 +2313,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2100 unsigned long long val) 2313 unsigned long long val)
2101{ 2314{
2102 int retry_count; 2315 int retry_count;
2103 int progress;
2104 u64 memswlimit; 2316 u64 memswlimit;
2105 int ret = 0; 2317 int ret = 0;
2106 int children = mem_cgroup_count_children(memcg); 2318 int children = mem_cgroup_count_children(memcg);
@@ -2144,8 +2356,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2144 if (!ret) 2356 if (!ret)
2145 break; 2357 break;
2146 2358
2147 progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, 2359 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2148 GFP_KERNEL,
2149 MEM_CGROUP_RECLAIM_SHRINK); 2360 MEM_CGROUP_RECLAIM_SHRINK);
2150 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2361 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2151 /* Usage is reduced ? */ 2362 /* Usage is reduced ? */
@@ -2375,7 +2586,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
2375 if (free_all) 2586 if (free_all)
2376 goto try_to_free; 2587 goto try_to_free;
2377move_account: 2588move_account:
2378 while (mem->res.usage > 0) { 2589 do {
2379 ret = -EBUSY; 2590 ret = -EBUSY;
2380 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 2591 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
2381 goto out; 2592 goto out;
@@ -2384,6 +2595,7 @@ move_account:
2384 goto out; 2595 goto out;
2385 /* This is for making all *used* pages to be on LRU. */ 2596 /* This is for making all *used* pages to be on LRU. */
2386 lru_add_drain_all(); 2597 lru_add_drain_all();
2598 drain_all_stock_sync();
2387 ret = 0; 2599 ret = 0;
2388 for_each_node_state(node, N_HIGH_MEMORY) { 2600 for_each_node_state(node, N_HIGH_MEMORY) {
2389 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 2601 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
@@ -2402,8 +2614,8 @@ move_account:
2402 if (ret == -ENOMEM) 2614 if (ret == -ENOMEM)
2403 goto try_to_free; 2615 goto try_to_free;
2404 cond_resched(); 2616 cond_resched();
2405 } 2617 /* "ret" should also be checked to ensure all lists are empty. */
2406 ret = 0; 2618 } while (mem->res.usage > 0 || ret);
2407out: 2619out:
2408 css_put(&mem->css); 2620 css_put(&mem->css);
2409 return ret; 2621 return ret;
@@ -2436,10 +2648,7 @@ try_to_free:
2436 } 2648 }
2437 lru_add_drain(); 2649 lru_add_drain();
2438 /* try move_account...there may be some *locked* pages. */ 2650 /* try move_account...there may be some *locked* pages. */
2439 if (mem->res.usage) 2651 goto move_account;
2440 goto move_account;
2441 ret = 0;
2442 goto out;
2443} 2652}
2444 2653
2445int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 2654int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
@@ -2541,6 +2750,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
2541 val += idx_val; 2750 val += idx_val;
2542 mem_cgroup_get_recursive_idx_stat(mem, 2751 mem_cgroup_get_recursive_idx_stat(mem,
2543 MEM_CGROUP_STAT_SWAPOUT, &idx_val); 2752 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2753 val += idx_val;
2544 val <<= PAGE_SHIFT; 2754 val <<= PAGE_SHIFT;
2545 } else 2755 } else
2546 val = res_counter_read_u64(&mem->memsw, name); 2756 val = res_counter_read_u64(&mem->memsw, name);
@@ -2660,7 +2870,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2660enum { 2870enum {
2661 MCS_CACHE, 2871 MCS_CACHE,
2662 MCS_RSS, 2872 MCS_RSS,
2663 MCS_MAPPED_FILE, 2873 MCS_FILE_MAPPED,
2664 MCS_PGPGIN, 2874 MCS_PGPGIN,
2665 MCS_PGPGOUT, 2875 MCS_PGPGOUT,
2666 MCS_SWAP, 2876 MCS_SWAP,
@@ -2704,8 +2914,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2704 s->stat[MCS_CACHE] += val * PAGE_SIZE; 2914 s->stat[MCS_CACHE] += val * PAGE_SIZE;
2705 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); 2915 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
2706 s->stat[MCS_RSS] += val * PAGE_SIZE; 2916 s->stat[MCS_RSS] += val * PAGE_SIZE;
2707 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); 2917 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
2708 s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; 2918 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
2709 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); 2919 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
2710 s->stat[MCS_PGPGIN] += val; 2920 s->stat[MCS_PGPGIN] += val;
2711 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); 2921 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
@@ -3097,11 +3307,18 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
3097 3307
3098 /* root ? */ 3308 /* root ? */
3099 if (cont->parent == NULL) { 3309 if (cont->parent == NULL) {
3310 int cpu;
3100 enable_swap_cgroup(); 3311 enable_swap_cgroup();
3101 parent = NULL; 3312 parent = NULL;
3102 root_mem_cgroup = mem; 3313 root_mem_cgroup = mem;
3103 if (mem_cgroup_soft_limit_tree_init()) 3314 if (mem_cgroup_soft_limit_tree_init())
3104 goto free_out; 3315 goto free_out;
3316 for_each_possible_cpu(cpu) {
3317 struct memcg_stock_pcp *stock =
3318 &per_cpu(memcg_stock, cpu);
3319 INIT_WORK(&stock->work, drain_local_stock);
3320 }
3321 hotcpu_notifier(memcg_stock_cpu_callback, 0);
3105 3322
3106 } else { 3323 } else {
3107 parent = mem_cgroup_from_cont(cont->parent); 3324 parent = mem_cgroup_from_cont(cont->parent);
@@ -3170,12 +3387,10 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
3170 struct task_struct *p, 3387 struct task_struct *p,
3171 bool threadgroup) 3388 bool threadgroup)
3172{ 3389{
3173 mutex_lock(&memcg_tasklist);
3174 /* 3390 /*
3175 * FIXME: It's better to move charges of this process from old 3391 * FIXME: It's better to move charges of this process from old
3176 * memcg to new memcg. But it's just on TODO-List now. 3392 * memcg to new memcg. But it's just on TODO-List now.
3177 */ 3393 */
3178 mutex_unlock(&memcg_tasklist);
3179} 3394}
3180 3395
3181struct cgroup_subsys mem_cgroup_subsys = { 3396struct cgroup_subsys mem_cgroup_subsys = {
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1ac49fef95ab..17299fd4577c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -34,12 +34,16 @@
34#include <linux/kernel.h> 34#include <linux/kernel.h>
35#include <linux/mm.h> 35#include <linux/mm.h>
36#include <linux/page-flags.h> 36#include <linux/page-flags.h>
37#include <linux/kernel-page-flags.h>
37#include <linux/sched.h> 38#include <linux/sched.h>
38#include <linux/ksm.h> 39#include <linux/ksm.h>
39#include <linux/rmap.h> 40#include <linux/rmap.h>
40#include <linux/pagemap.h> 41#include <linux/pagemap.h>
41#include <linux/swap.h> 42#include <linux/swap.h>
42#include <linux/backing-dev.h> 43#include <linux/backing-dev.h>
44#include <linux/migrate.h>
45#include <linux/page-isolation.h>
46#include <linux/suspend.h>
43#include "internal.h" 47#include "internal.h"
44 48
45int sysctl_memory_failure_early_kill __read_mostly = 0; 49int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -48,6 +52,129 @@ int sysctl_memory_failure_recovery __read_mostly = 1;
48 52
49atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); 53atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
50 54
55#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
56
57u32 hwpoison_filter_enable = 0;
58u32 hwpoison_filter_dev_major = ~0U;
59u32 hwpoison_filter_dev_minor = ~0U;
60u64 hwpoison_filter_flags_mask;
61u64 hwpoison_filter_flags_value;
62EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
63EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
64EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
65EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
66EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
67
68static int hwpoison_filter_dev(struct page *p)
69{
70 struct address_space *mapping;
71 dev_t dev;
72
73 if (hwpoison_filter_dev_major == ~0U &&
74 hwpoison_filter_dev_minor == ~0U)
75 return 0;
76
77 /*
78 * page_mapping() does not accept slab page
79 */
80 if (PageSlab(p))
81 return -EINVAL;
82
83 mapping = page_mapping(p);
84 if (mapping == NULL || mapping->host == NULL)
85 return -EINVAL;
86
87 dev = mapping->host->i_sb->s_dev;
88 if (hwpoison_filter_dev_major != ~0U &&
89 hwpoison_filter_dev_major != MAJOR(dev))
90 return -EINVAL;
91 if (hwpoison_filter_dev_minor != ~0U &&
92 hwpoison_filter_dev_minor != MINOR(dev))
93 return -EINVAL;
94
95 return 0;
96}
97
98static int hwpoison_filter_flags(struct page *p)
99{
100 if (!hwpoison_filter_flags_mask)
101 return 0;
102
103 if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
104 hwpoison_filter_flags_value)
105 return 0;
106 else
107 return -EINVAL;
108}
109
110/*
111 * This allows stress tests to limit test scope to a collection of tasks
112 * by putting them under some memcg. This prevents killing unrelated/important
113 * processes such as /sbin/init. Note that the target task may share clean
 114 * pages with init (e.g. libc text), which is harmless. If the target task
 115 * shares _dirty_ pages with another task B, the test scheme must make sure B
 116 * is also included in the memcg. Finally, due to race conditions this filter
117 * can only guarantee that the page either belongs to the memcg tasks, or is
118 * a freed page.
119 */
120#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
121u64 hwpoison_filter_memcg;
122EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
123static int hwpoison_filter_task(struct page *p)
124{
125 struct mem_cgroup *mem;
126 struct cgroup_subsys_state *css;
127 unsigned long ino;
128
129 if (!hwpoison_filter_memcg)
130 return 0;
131
132 mem = try_get_mem_cgroup_from_page(p);
133 if (!mem)
134 return -EINVAL;
135
136 css = mem_cgroup_css(mem);
137 /* root_mem_cgroup has NULL dentries */
138 if (!css->cgroup->dentry)
139 return -EINVAL;
140
141 ino = css->cgroup->dentry->d_inode->i_ino;
142 css_put(css);
143
144 if (ino != hwpoison_filter_memcg)
145 return -EINVAL;
146
147 return 0;
148}
149#else
150static int hwpoison_filter_task(struct page *p) { return 0; }
151#endif
152
153int hwpoison_filter(struct page *p)
154{
155 if (!hwpoison_filter_enable)
156 return 0;
157
158 if (hwpoison_filter_dev(p))
159 return -EINVAL;
160
161 if (hwpoison_filter_flags(p))
162 return -EINVAL;
163
164 if (hwpoison_filter_task(p))
165 return -EINVAL;
166
167 return 0;
168}
169#else
170int hwpoison_filter(struct page *p)
171{
172 return 0;
173}
174#endif
175
176EXPORT_SYMBOL_GPL(hwpoison_filter);
177
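
For reference, the filters above default to "unset" and are effectively ANDed: a page is handled only when every enabled criterion matches, otherwise __memory_failure() clears the poison bit again. A minimal in-kernel sketch of how the knobs combine (the values are invented for the example; in practice the companion hwpoison-inject module sets them from debugfs):

/*
 * Sketch only: restrict injection to clean pages on a device with major
 * number 8, then inject. The numbers are examples, not recommendations.
 */
static int inject_clean_pages_only(unsigned long pfn)
{
	hwpoison_filter_enable = 1;
	hwpoison_filter_dev_major = 8;			/* e.g. sd* */
	hwpoison_filter_dev_minor = ~0U;		/* any minor */
	hwpoison_filter_flags_mask = 1ULL << KPF_DIRTY;	/* from kernel-page-flags.h */
	hwpoison_filter_flags_value = 0;		/* dirty bit must be clear */

	return __memory_failure(pfn, 0, 0);		/* filtered pages: no-op */
}
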
51/* 178/*
52 * Send all the processes who have the page mapped an ``action optional'' 179 * Send all the processes who have the page mapped an ``action optional''
53 * signal. 180 * signal.
@@ -83,6 +210,36 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
83} 210}
84 211
85/* 212/*
 213 * When an unknown page type is encountered, drain as many buffers as possible
 214 * in the hope of turning the page into an LRU or free page, which we can handle.
215 */
216void shake_page(struct page *p, int access)
217{
218 if (!PageSlab(p)) {
219 lru_add_drain_all();
220 if (PageLRU(p))
221 return;
222 drain_all_pages();
223 if (PageLRU(p) || is_free_buddy_page(p))
224 return;
225 }
226
227 /*
 228 * Only call shrink_slab here (which would also
229 * shrink other caches) if access is not potentially fatal.
230 */
231 if (access) {
232 int nr;
233 do {
234 nr = shrink_slab(1000, GFP_KERNEL, 1000);
235 if (page_count(p) == 0)
236 break;
237 } while (nr > 10);
238 }
239}
240EXPORT_SYMBOL_GPL(shake_page);
241
242/*
86 * Kill all processes that have a poisoned page mapped and then isolate 243 * Kill all processes that have a poisoned page mapped and then isolate
87 * the page. 244 * the page.
88 * 245 *
@@ -177,7 +334,6 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
177 * In case something went wrong with munmapping 334 * In case something went wrong with munmapping
178 * make sure the process doesn't catch the 335 * make sure the process doesn't catch the
179 * signal and then access the memory. Just kill it. 336 * signal and then access the memory. Just kill it.
180 * the signal handlers
181 */ 337 */
182 if (fail || tk->addr_valid == 0) { 338 if (fail || tk->addr_valid == 0) {
183 printk(KERN_ERR 339 printk(KERN_ERR
@@ -314,33 +470,49 @@ static void collect_procs(struct page *page, struct list_head *tokill)
314 */ 470 */
315 471
316enum outcome { 472enum outcome {
317 FAILED, /* Error handling failed */ 473 IGNORED, /* Error: cannot be handled */
474 FAILED, /* Error: handling failed */
318 DELAYED, /* Will be handled later */ 475 DELAYED, /* Will be handled later */
319 IGNORED, /* Error safely ignored */
320 RECOVERED, /* Successfully recovered */ 476 RECOVERED, /* Successfully recovered */
321}; 477};
322 478
323static const char *action_name[] = { 479static const char *action_name[] = {
480 [IGNORED] = "Ignored",
324 [FAILED] = "Failed", 481 [FAILED] = "Failed",
325 [DELAYED] = "Delayed", 482 [DELAYED] = "Delayed",
326 [IGNORED] = "Ignored",
327 [RECOVERED] = "Recovered", 483 [RECOVERED] = "Recovered",
328}; 484};
329 485
330/* 486/*
331 * Error hit kernel page. 487 * XXX: It is possible that a page is isolated from LRU cache,
332 * Do nothing, try to be lucky and not touch this instead. For a few cases we 488 * and then kept in swap cache or failed to remove from page cache.
333 * could be more sophisticated. 489 * The page count will stop it from being freed by unpoison.
490 * Stress tests should be aware of this memory leak problem.
334 */ 491 */
335static int me_kernel(struct page *p, unsigned long pfn) 492static int delete_from_lru_cache(struct page *p)
336{ 493{
337 return DELAYED; 494 if (!isolate_lru_page(p)) {
495 /*
 496 * Clear sensitive page flags, so that the buddy system won't
 497 * complain when the page is unpoisoned and freed.
498 */
499 ClearPageActive(p);
500 ClearPageUnevictable(p);
501 /*
502 * drop the page count elevated by isolate_lru_page()
503 */
504 page_cache_release(p);
505 return 0;
506 }
507 return -EIO;
338} 508}
339 509
340/* 510/*
341 * Already poisoned page. 511 * Error hit kernel page.
512 * Do nothing, try to be lucky and not touch this instead. For a few cases we
513 * could be more sophisticated.
342 */ 514 */
343static int me_ignore(struct page *p, unsigned long pfn) 515static int me_kernel(struct page *p, unsigned long pfn)
344{ 516{
345 return IGNORED; 517 return IGNORED;
346} 518}
@@ -355,14 +527,6 @@ static int me_unknown(struct page *p, unsigned long pfn)
355} 527}
356 528
357/* 529/*
358 * Free memory
359 */
360static int me_free(struct page *p, unsigned long pfn)
361{
362 return DELAYED;
363}
364
365/*
366 * Clean (or cleaned) page cache page. 530 * Clean (or cleaned) page cache page.
367 */ 531 */
368static int me_pagecache_clean(struct page *p, unsigned long pfn) 532static int me_pagecache_clean(struct page *p, unsigned long pfn)
@@ -371,6 +535,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
371 int ret = FAILED; 535 int ret = FAILED;
372 struct address_space *mapping; 536 struct address_space *mapping;
373 537
538 delete_from_lru_cache(p);
539
374 /* 540 /*
375 * For anonymous pages we're done the only reference left 541 * For anonymous pages we're done the only reference left
376 * should be the one m_f() holds. 542 * should be the one m_f() holds.
@@ -500,14 +666,20 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
500 /* Trigger EIO in shmem: */ 666 /* Trigger EIO in shmem: */
501 ClearPageUptodate(p); 667 ClearPageUptodate(p);
502 668
503 return DELAYED; 669 if (!delete_from_lru_cache(p))
670 return DELAYED;
671 else
672 return FAILED;
504} 673}
505 674
506static int me_swapcache_clean(struct page *p, unsigned long pfn) 675static int me_swapcache_clean(struct page *p, unsigned long pfn)
507{ 676{
508 delete_from_swap_cache(p); 677 delete_from_swap_cache(p);
509 678
510 return RECOVERED; 679 if (!delete_from_lru_cache(p))
680 return RECOVERED;
681 else
682 return FAILED;
511} 683}
512 684
513/* 685/*
@@ -550,7 +722,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
550#define tail (1UL << PG_tail) 722#define tail (1UL << PG_tail)
551#define compound (1UL << PG_compound) 723#define compound (1UL << PG_compound)
552#define slab (1UL << PG_slab) 724#define slab (1UL << PG_slab)
553#define buddy (1UL << PG_buddy)
554#define reserved (1UL << PG_reserved) 725#define reserved (1UL << PG_reserved)
555 726
556static struct page_state { 727static struct page_state {
@@ -559,8 +730,11 @@ static struct page_state {
559 char *msg; 730 char *msg;
560 int (*action)(struct page *p, unsigned long pfn); 731 int (*action)(struct page *p, unsigned long pfn);
561} error_states[] = { 732} error_states[] = {
562 { reserved, reserved, "reserved kernel", me_ignore }, 733 { reserved, reserved, "reserved kernel", me_kernel },
563 { buddy, buddy, "free kernel", me_free }, 734 /*
735 * free pages are specially detected outside this table:
736 * PG_buddy pages only make a small fraction of all free pages.
737 */
564 738
565 /* 739 /*
566 * Could in theory check if slab page is free or if we can drop 740 * Could in theory check if slab page is free or if we can drop
@@ -582,14 +756,11 @@ static struct page_state {
582 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, 756 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
583 { unevict, unevict, "unevictable LRU", me_pagecache_clean}, 757 { unevict, unevict, "unevictable LRU", me_pagecache_clean},
584 758
585#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
586 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, 759 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
587 { mlock, mlock, "mlocked LRU", me_pagecache_clean }, 760 { mlock, mlock, "mlocked LRU", me_pagecache_clean },
588#endif
589 761
590 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, 762 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
591 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 763 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
592 { swapbacked, swapbacked, "anonymous", me_pagecache_clean },
593 764
594 /* 765 /*
595 * Catchall entry: must be at end. 766 * Catchall entry: must be at end.
@@ -597,20 +768,31 @@ static struct page_state {
597 { 0, 0, "unknown page state", me_unknown }, 768 { 0, 0, "unknown page state", me_unknown },
598}; 769};
599 770
771#undef dirty
772#undef sc
773#undef unevict
774#undef mlock
775#undef writeback
776#undef lru
777#undef swapbacked
778#undef head
779#undef tail
780#undef compound
781#undef slab
782#undef reserved
783
600static void action_result(unsigned long pfn, char *msg, int result) 784static void action_result(unsigned long pfn, char *msg, int result)
601{ 785{
602 struct page *page = NULL; 786 struct page *page = pfn_to_page(pfn);
603 if (pfn_valid(pfn))
604 page = pfn_to_page(pfn);
605 787
606 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", 788 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
607 pfn, 789 pfn,
608 page && PageDirty(page) ? "dirty " : "", 790 PageDirty(page) ? "dirty " : "",
609 msg, action_name[result]); 791 msg, action_name[result]);
610} 792}
611 793
612static int page_action(struct page_state *ps, struct page *p, 794static int page_action(struct page_state *ps, struct page *p,
613 unsigned long pfn, int ref) 795 unsigned long pfn)
614{ 796{
615 int result; 797 int result;
616 int count; 798 int count;
@@ -618,18 +800,22 @@ static int page_action(struct page_state *ps, struct page *p,
618 result = ps->action(p, pfn); 800 result = ps->action(p, pfn);
619 action_result(pfn, ps->msg, result); 801 action_result(pfn, ps->msg, result);
620 802
621 count = page_count(p) - 1 - ref; 803 count = page_count(p) - 1;
622 if (count != 0) 804 if (ps->action == me_swapcache_dirty && result == DELAYED)
805 count--;
806 if (count != 0) {
623 printk(KERN_ERR 807 printk(KERN_ERR
624 "MCE %#lx: %s page still referenced by %d users\n", 808 "MCE %#lx: %s page still referenced by %d users\n",
625 pfn, ps->msg, count); 809 pfn, ps->msg, count);
810 result = FAILED;
811 }
626 812
627 /* Could do more checks here if page looks ok */ 813 /* Could do more checks here if page looks ok */
628 /* 814 /*
629 * Could adjust zone counters here to correct for the missing page. 815 * Could adjust zone counters here to correct for the missing page.
630 */ 816 */
631 817
632 return result == RECOVERED ? 0 : -EBUSY; 818 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
633} 819}
634 820
635#define N_UNMAP_TRIES 5 821#define N_UNMAP_TRIES 5
@@ -638,7 +824,7 @@ static int page_action(struct page_state *ps, struct page *p,
638 * Do all that is necessary to remove user space mappings. Unmap 824 * Do all that is necessary to remove user space mappings. Unmap
639 * the pages and send SIGBUS to the processes if the data was dirty. 825 * the pages and send SIGBUS to the processes if the data was dirty.
640 */ 826 */
641static void hwpoison_user_mappings(struct page *p, unsigned long pfn, 827static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
642 int trapno) 828 int trapno)
643{ 829{
644 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 830 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
@@ -648,15 +834,18 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
648 int i; 834 int i;
649 int kill = 1; 835 int kill = 1;
650 836
651 if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) 837 if (PageReserved(p) || PageSlab(p))
652 return; 838 return SWAP_SUCCESS;
653 839
654 /* 840 /*
655 * This check implies we don't kill processes if their pages 841 * This check implies we don't kill processes if their pages
656 * are in the swap cache early. Those are always late kills. 842 * are in the swap cache early. Those are always late kills.
657 */ 843 */
658 if (!page_mapped(p)) 844 if (!page_mapped(p))
659 return; 845 return SWAP_SUCCESS;
846
847 if (PageCompound(p) || PageKsm(p))
848 return SWAP_FAIL;
660 849
661 if (PageSwapCache(p)) { 850 if (PageSwapCache(p)) {
662 printk(KERN_ERR 851 printk(KERN_ERR
@@ -667,6 +856,8 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
667 /* 856 /*
668 * Propagate the dirty bit from PTEs to struct page first, because we 857 * Propagate the dirty bit from PTEs to struct page first, because we
669 * need this to decide if we should kill or just drop the page. 858 * need this to decide if we should kill or just drop the page.
859 * XXX: the dirty test could be racy: set_page_dirty() may not always
860 * be called inside page lock (it's recommended but not enforced).
670 */ 861 */
671 mapping = page_mapping(p); 862 mapping = page_mapping(p);
672 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { 863 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
@@ -718,11 +909,12 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
718 */ 909 */
719 kill_procs_ao(&tokill, !!PageDirty(p), trapno, 910 kill_procs_ao(&tokill, !!PageDirty(p), trapno,
720 ret != SWAP_SUCCESS, pfn); 911 ret != SWAP_SUCCESS, pfn);
912
913 return ret;
721} 914}
722 915
723int __memory_failure(unsigned long pfn, int trapno, int ref) 916int __memory_failure(unsigned long pfn, int trapno, int flags)
724{ 917{
725 unsigned long lru_flag;
726 struct page_state *ps; 918 struct page_state *ps;
727 struct page *p; 919 struct page *p;
728 int res; 920 int res;
@@ -731,13 +923,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
731 panic("Memory failure from trap %d on page %lx", trapno, pfn); 923 panic("Memory failure from trap %d on page %lx", trapno, pfn);
732 924
733 if (!pfn_valid(pfn)) { 925 if (!pfn_valid(pfn)) {
734 action_result(pfn, "memory outside kernel control", IGNORED); 926 printk(KERN_ERR
735 return -EIO; 927 "MCE %#lx: memory outside kernel control\n",
928 pfn);
929 return -ENXIO;
736 } 930 }
737 931
738 p = pfn_to_page(pfn); 932 p = pfn_to_page(pfn);
739 if (TestSetPageHWPoison(p)) { 933 if (TestSetPageHWPoison(p)) {
740 action_result(pfn, "already hardware poisoned", IGNORED); 934 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
741 return 0; 935 return 0;
742 } 936 }
743 937
@@ -754,9 +948,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
754 * In fact it's dangerous to directly bump up page count from 0, 948 * In fact it's dangerous to directly bump up page count from 0,
755 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 949 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
756 */ 950 */
757 if (!get_page_unless_zero(compound_head(p))) { 951 if (!(flags & MF_COUNT_INCREASED) &&
758 action_result(pfn, "free or high order kernel", IGNORED); 952 !get_page_unless_zero(compound_head(p))) {
759 return PageBuddy(compound_head(p)) ? 0 : -EBUSY; 953 if (is_free_buddy_page(p)) {
954 action_result(pfn, "free buddy", DELAYED);
955 return 0;
956 } else {
957 action_result(pfn, "high order kernel", IGNORED);
958 return -EBUSY;
959 }
760 } 960 }
761 961
762 /* 962 /*
@@ -768,14 +968,19 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
768 * walked by the page reclaim code, however that's not a big loss. 968 * walked by the page reclaim code, however that's not a big loss.
769 */ 969 */
770 if (!PageLRU(p)) 970 if (!PageLRU(p))
771 lru_add_drain_all(); 971 shake_page(p, 0);
772 lru_flag = p->flags & lru; 972 if (!PageLRU(p)) {
773 if (isolate_lru_page(p)) { 973 /*
974 * shake_page could have turned it free.
975 */
976 if (is_free_buddy_page(p)) {
977 action_result(pfn, "free buddy, 2nd try", DELAYED);
978 return 0;
979 }
774 action_result(pfn, "non LRU", IGNORED); 980 action_result(pfn, "non LRU", IGNORED);
775 put_page(p); 981 put_page(p);
776 return -EBUSY; 982 return -EBUSY;
777 } 983 }
778 page_cache_release(p);
779 984
780 /* 985 /*
781 * Lock the page and wait for writeback to finish. 986 * Lock the page and wait for writeback to finish.
@@ -783,26 +988,48 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
783 * and in many cases impossible, so we just avoid it here. 988 * and in many cases impossible, so we just avoid it here.
784 */ 989 */
785 lock_page_nosync(p); 990 lock_page_nosync(p);
991
992 /*
 993 * unpoison always clears PG_hwpoison inside the page lock
994 */
995 if (!PageHWPoison(p)) {
996 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
997 res = 0;
998 goto out;
999 }
1000 if (hwpoison_filter(p)) {
1001 if (TestClearPageHWPoison(p))
1002 atomic_long_dec(&mce_bad_pages);
1003 unlock_page(p);
1004 put_page(p);
1005 return 0;
1006 }
1007
786 wait_on_page_writeback(p); 1008 wait_on_page_writeback(p);
787 1009
788 /* 1010 /*
789 * Now take care of user space mappings. 1011 * Now take care of user space mappings.
1012 * Abort on fail: __remove_from_page_cache() assumes unmapped page.
790 */ 1013 */
791 hwpoison_user_mappings(p, pfn, trapno); 1014 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
1015 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1016 res = -EBUSY;
1017 goto out;
1018 }
792 1019
793 /* 1020 /*
794 * Torn down by someone else? 1021 * Torn down by someone else?
795 */ 1022 */
796 if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) { 1023 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
797 action_result(pfn, "already truncated LRU", IGNORED); 1024 action_result(pfn, "already truncated LRU", IGNORED);
798 res = 0; 1025 res = -EBUSY;
799 goto out; 1026 goto out;
800 } 1027 }
801 1028
802 res = -EBUSY; 1029 res = -EBUSY;
803 for (ps = error_states;; ps++) { 1030 for (ps = error_states;; ps++) {
804 if (((p->flags | lru_flag)& ps->mask) == ps->res) { 1031 if ((p->flags & ps->mask) == ps->res) {
805 res = page_action(ps, p, pfn, ref); 1032 res = page_action(ps, p, pfn);
806 break; 1033 break;
807 } 1034 }
808 } 1035 }
@@ -833,3 +1060,235 @@ void memory_failure(unsigned long pfn, int trapno)
833{ 1060{
834 __memory_failure(pfn, trapno, 0); 1061 __memory_failure(pfn, trapno, 0);
835} 1062}
1063
1064/**
1065 * unpoison_memory - Unpoison a previously poisoned page
1066 * @pfn: Page number of the to be unpoisoned page
1067 *
1068 * Software-unpoison a page that has been poisoned by
1069 * memory_failure() earlier.
1070 *
 1071 * This is done only at the software level, so it only works
 1072 * for Linux-injected failures, not real hardware failures.
1073 *
1074 * Returns 0 for success, otherwise -errno.
1075 */
1076int unpoison_memory(unsigned long pfn)
1077{
1078 struct page *page;
1079 struct page *p;
1080 int freeit = 0;
1081
1082 if (!pfn_valid(pfn))
1083 return -ENXIO;
1084
1085 p = pfn_to_page(pfn);
1086 page = compound_head(p);
1087
1088 if (!PageHWPoison(p)) {
1089 pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
1090 return 0;
1091 }
1092
1093 if (!get_page_unless_zero(page)) {
1094 if (TestClearPageHWPoison(p))
1095 atomic_long_dec(&mce_bad_pages);
1096 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
1097 return 0;
1098 }
1099
1100 lock_page_nosync(page);
1101 /*
 1102 * This test is racy because PG_hwpoison is set outside of the page lock.
 1103 * That's acceptable because it won't trigger a kernel panic. Instead,
 1104 * the PG_hwpoison page will be caught and isolated at the entrance to
1105 * the free buddy page pool.
1106 */
1107 if (TestClearPageHWPoison(p)) {
1108 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
1109 atomic_long_dec(&mce_bad_pages);
1110 freeit = 1;
1111 }
1112 unlock_page(page);
1113
1114 put_page(page);
1115 if (freeit)
1116 put_page(page);
1117
1118 return 0;
1119}
1120EXPORT_SYMBOL(unpoison_memory);
1121
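
Since unpoison_memory() only undoes software poisoning, a test harness can pair it with __memory_failure() on the same pfn; this is essentially what the hwpoison-inject debugfs module does. A hedged sketch (the wrapper itself is hypothetical):

/* Sketch: software-poison a pfn and then clear the poison again. */
static int hwpoison_roundtrip(unsigned long pfn)
{
	int ret = __memory_failure(pfn, 0, 0);

	if (ret)
		return ret;		/* handling failed or page still busy */

	return unpoison_memory(pfn);	/* drops PG_hwpoison and the extra ref */
}
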
1122static struct page *new_page(struct page *p, unsigned long private, int **x)
1123{
1124 int nid = page_to_nid(p);
1125 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1126}
1127
1128/*
1129 * Safely get reference count of an arbitrary page.
1130 * Returns 0 for a free page, -EIO for a zero refcount page
1131 * that is not free, and 1 for any other page type.
1132 * For 1 the page is returned with increased page count, otherwise not.
1133 */
1134static int get_any_page(struct page *p, unsigned long pfn, int flags)
1135{
1136 int ret;
1137
1138 if (flags & MF_COUNT_INCREASED)
1139 return 1;
1140
1141 /*
1142 * The lock_system_sleep prevents a race with memory hotplug,
1143 * because the isolation assumes there's only a single user.
 1144 * This is a big hammer; a better mechanism would be nicer.
1145 */
1146 lock_system_sleep();
1147
1148 /*
1149 * Isolate the page, so that it doesn't get reallocated if it
1150 * was free.
1151 */
1152 set_migratetype_isolate(p);
1153 if (!get_page_unless_zero(compound_head(p))) {
1154 if (is_free_buddy_page(p)) {
1155 pr_debug("get_any_page: %#lx free buddy page\n", pfn);
1156 /* Set hwpoison bit while page is still isolated */
1157 SetPageHWPoison(p);
1158 ret = 0;
1159 } else {
1160 pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1161 pfn, p->flags);
1162 ret = -EIO;
1163 }
1164 } else {
1165 /* Not a free page */
1166 ret = 1;
1167 }
1168 unset_migratetype_isolate(p);
1169 unlock_system_sleep();
1170 return ret;
1171}
1172
1173/**
1174 * soft_offline_page - Soft offline a page.
1175 * @page: page to offline
1176 * @flags: flags. Same as memory_failure().
1177 *
1178 * Returns 0 on success, otherwise negated errno.
1179 *
1180 * Soft offline a page, by migration or invalidation,
1181 * without killing anything. This is for the case when
1182 * a page is not corrupted yet (so it's still valid to access),
1183 * but has had a number of corrected errors and is better taken
1184 * out.
1185 *
1186 * The actual policy on when to do that is maintained by
1187 * user space.
1188 *
1189 * This should never impact any application or cause data loss,
1190 * however it might take some time.
1191 *
1192 * This is not a 100% solution for all memory, but tries to be
1193 * ``good enough'' for the majority of memory.
1194 */
1195int soft_offline_page(struct page *page, int flags)
1196{
1197 int ret;
1198 unsigned long pfn = page_to_pfn(page);
1199
1200 ret = get_any_page(page, pfn, flags);
1201 if (ret < 0)
1202 return ret;
1203 if (ret == 0)
1204 goto done;
1205
1206 /*
1207 * Page cache page we can handle?
1208 */
1209 if (!PageLRU(page)) {
1210 /*
1211 * Try to free it.
1212 */
1213 put_page(page);
1214 shake_page(page, 1);
1215
1216 /*
1217 * Did it turn free?
1218 */
1219 ret = get_any_page(page, pfn, 0);
1220 if (ret < 0)
1221 return ret;
1222 if (ret == 0)
1223 goto done;
1224 }
1225 if (!PageLRU(page)) {
1226 pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
1227 pfn, page->flags);
1228 return -EIO;
1229 }
1230
1231 lock_page(page);
1232 wait_on_page_writeback(page);
1233
1234 /*
1235 * Synchronized using the page lock with memory_failure()
1236 */
1237 if (PageHWPoison(page)) {
1238 unlock_page(page);
1239 put_page(page);
1240 pr_debug("soft offline: %#lx page already poisoned\n", pfn);
1241 return -EBUSY;
1242 }
1243
1244 /*
1245 * Try to invalidate first. This should work for
1246 * non dirty unmapped page cache pages.
1247 */
1248 ret = invalidate_inode_page(page);
1249 unlock_page(page);
1250
1251 /*
1252 * Drop count because page migration doesn't like raised
1253 * counts. The page could get re-allocated, but if it becomes
1254 * LRU the isolation will just fail.
 1255 * RED-PEN: it would be better to keep it isolated here, but we
1256 * would need to fix isolation locking first.
1257 */
1258 put_page(page);
1259 if (ret == 1) {
1260 ret = 0;
1261 pr_debug("soft_offline: %#lx: invalidated\n", pfn);
1262 goto done;
1263 }
1264
1265 /*
1266 * Simple invalidation didn't work.
1267 * Try to migrate to a new page instead. migrate.c
1268 * handles a large number of cases for us.
1269 */
1270 ret = isolate_lru_page(page);
1271 if (!ret) {
1272 LIST_HEAD(pagelist);
1273
1274 list_add(&page->lru, &pagelist);
1275 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1276 if (ret) {
1277 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1278 pfn, ret, page->flags);
1279 if (ret > 0)
1280 ret = -EIO;
1281 }
1282 } else {
1283 pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1284 pfn, ret, page_count(page), page->flags);
1285 }
1286 if (ret)
1287 return ret;
1288
1289done:
1290 atomic_long_add(1, &mce_bad_pages);
1291 SetPageHWPoison(page);
1292 /* keep elevated page count for bad page */
1293 return ret;
1294}
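
How soft_offline_page() gets driven is left to user space policy, but an in-kernel caller would look roughly like the sketch below. The threshold and the helper name are invented; only soft_offline_page(), pfn_valid() and pfn_to_page() come from the code above.

static void maybe_soft_offline(unsigned long pfn, unsigned int corrected)
{
	if (corrected < 16)		/* made-up policy threshold */
		return;
	if (!pfn_valid(pfn))
		return;

	/* Migrate or invalidate the page; never kills, may take a while. */
	if (soft_offline_page(pfn_to_page(pfn), 0))
		pr_debug("soft offline of pfn %#lx failed\n", pfn);
}
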
diff --git a/mm/memory.c b/mm/memory.c
index 6ab19dd4a199..09e4b1be7b67 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -572,7 +572,7 @@ out:
572 * covered by this vma. 572 * covered by this vma.
573 */ 573 */
574 574
575static inline void 575static inline unsigned long
576copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, 576copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
577 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, 577 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
578 unsigned long addr, int *rss) 578 unsigned long addr, int *rss)
@@ -586,7 +586,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
586 if (!pte_file(pte)) { 586 if (!pte_file(pte)) {
587 swp_entry_t entry = pte_to_swp_entry(pte); 587 swp_entry_t entry = pte_to_swp_entry(pte);
588 588
589 swap_duplicate(entry); 589 if (swap_duplicate(entry) < 0)
590 return entry.val;
591
590 /* make sure dst_mm is on swapoff's mmlist. */ 592 /* make sure dst_mm is on swapoff's mmlist. */
591 if (unlikely(list_empty(&dst_mm->mmlist))) { 593 if (unlikely(list_empty(&dst_mm->mmlist))) {
592 spin_lock(&mmlist_lock); 594 spin_lock(&mmlist_lock);
@@ -635,6 +637,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
635 637
636out_set_pte: 638out_set_pte:
637 set_pte_at(dst_mm, addr, dst_pte, pte); 639 set_pte_at(dst_mm, addr, dst_pte, pte);
640 return 0;
638} 641}
639 642
640static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 643static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -646,6 +649,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
646 spinlock_t *src_ptl, *dst_ptl; 649 spinlock_t *src_ptl, *dst_ptl;
647 int progress = 0; 650 int progress = 0;
648 int rss[2]; 651 int rss[2];
652 swp_entry_t entry = (swp_entry_t){0};
649 653
650again: 654again:
651 rss[1] = rss[0] = 0; 655 rss[1] = rss[0] = 0;
@@ -674,7 +678,10 @@ again:
674 progress++; 678 progress++;
675 continue; 679 continue;
676 } 680 }
677 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); 681 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
682 vma, addr, rss);
683 if (entry.val)
684 break;
678 progress += 8; 685 progress += 8;
679 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 686 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
680 687
@@ -684,6 +691,12 @@ again:
684 add_mm_rss(dst_mm, rss[0], rss[1]); 691 add_mm_rss(dst_mm, rss[0], rss[1]);
685 pte_unmap_unlock(orig_dst_pte, dst_ptl); 692 pte_unmap_unlock(orig_dst_pte, dst_ptl);
686 cond_resched(); 693 cond_resched();
694
695 if (entry.val) {
696 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
697 return -ENOMEM;
698 progress = 0;
699 }
687 if (addr != end) 700 if (addr != end)
688 goto again; 701 goto again;
689 return 0; 702 return 0;
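
The copy_pte_range() change above follows a common pattern: swap_duplicate() may report that the swap count needs a continuation page, which cannot be allocated under the page-table lock, so the entry is remembered, the locks are dropped, add_swap_count_continuation() runs with GFP_KERNEL, and the copy restarts. The same pattern in miniature (the helper and its lock parameter are hypothetical):

static int dup_swap_entry(spinlock_t *ptl, swp_entry_t entry)
{
again:
	spin_lock(ptl);
	if (swap_duplicate(entry) < 0) {	/* needs a continuation page */
		spin_unlock(ptl);
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			return -ENOMEM;		/* as copy_pte_range() does */
		goto again;
	}
	spin_unlock(ptl);
	return 0;
}
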
@@ -943,6 +956,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
943 details = NULL; 956 details = NULL;
944 957
945 BUG_ON(addr >= end); 958 BUG_ON(addr >= end);
959 mem_cgroup_uncharge_start();
946 tlb_start_vma(tlb, vma); 960 tlb_start_vma(tlb, vma);
947 pgd = pgd_offset(vma->vm_mm, addr); 961 pgd = pgd_offset(vma->vm_mm, addr);
948 do { 962 do {
@@ -955,6 +969,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
955 zap_work, details); 969 zap_work, details);
956 } while (pgd++, addr = next, (addr != end && *zap_work > 0)); 970 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
957 tlb_end_vma(tlb, vma); 971 tlb_end_vma(tlb, vma);
972 mem_cgroup_uncharge_end();
958 973
959 return addr; 974 return addr;
960} 975}
@@ -2514,7 +2529,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2514 ret = VM_FAULT_HWPOISON; 2529 ret = VM_FAULT_HWPOISON;
2515 } else { 2530 } else {
2516 print_bad_pte(vma, address, orig_pte, NULL); 2531 print_bad_pte(vma, address, orig_pte, NULL);
2517 ret = VM_FAULT_OOM; 2532 ret = VM_FAULT_SIGBUS;
2518 } 2533 }
2519 goto out; 2534 goto out;
2520 } 2535 }
@@ -2540,6 +2555,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2540 ret = VM_FAULT_MAJOR; 2555 ret = VM_FAULT_MAJOR;
2541 count_vm_event(PGMAJFAULT); 2556 count_vm_event(PGMAJFAULT);
2542 } else if (PageHWPoison(page)) { 2557 } else if (PageHWPoison(page)) {
2558 /*
2559 * hwpoisoned dirty swapcache pages are kept for killing
2560 * owner processes (which may be unknown at hwpoison time)
2561 */
2543 ret = VM_FAULT_HWPOISON; 2562 ret = VM_FAULT_HWPOISON;
2544 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2563 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2545 goto out_release; 2564 goto out_release;
@@ -2548,6 +2567,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2548 lock_page(page); 2567 lock_page(page);
2549 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2568 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2550 2569
2570 page = ksm_might_need_to_copy(page, vma, address);
2571 if (!page) {
2572 ret = VM_FAULT_OOM;
2573 goto out;
2574 }
2575
2551 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { 2576 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
2552 ret = VM_FAULT_OOM; 2577 ret = VM_FAULT_OOM;
2553 goto out_page; 2578 goto out_page;
@@ -2910,7 +2935,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2910 * Page table corrupted: show pte and kill process. 2935 * Page table corrupted: show pte and kill process.
2911 */ 2936 */
2912 print_bad_pte(vma, address, orig_pte, NULL); 2937 print_bad_pte(vma, address, orig_pte, NULL);
2913 return VM_FAULT_OOM; 2938 return VM_FAULT_SIGBUS;
2914 } 2939 }
2915 2940
2916 pgoff = pte_to_pgoff(orig_pte); 2941 pgoff = pte_to_pgoff(orig_pte);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2047465cd27c..030ce8a5bb0e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -27,6 +27,7 @@
27#include <linux/page-isolation.h> 27#include <linux/page-isolation.h>
28#include <linux/pfn.h> 28#include <linux/pfn.h>
29#include <linux/suspend.h> 29#include <linux/suspend.h>
30#include <linux/mm_inline.h>
30 31
31#include <asm/tlbflush.h> 32#include <asm/tlbflush.h>
32 33
@@ -71,7 +72,9 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type)
71 atomic_inc(&page->_count); 72 atomic_inc(&page->_count);
72} 73}
73 74
74void put_page_bootmem(struct page *page) 75/* reference to __meminit __free_pages_bootmem is valid
76 * so use __ref to tell modpost not to generate a warning */
77void __ref put_page_bootmem(struct page *page)
75{ 78{
76 int type; 79 int type;
77 80
@@ -672,6 +675,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
672 if (!ret) { /* Success */ 675 if (!ret) { /* Success */
673 list_add_tail(&page->lru, &source); 676 list_add_tail(&page->lru, &source);
674 move_pages--; 677 move_pages--;
678 inc_zone_page_state(page, NR_ISOLATED_ANON +
679 page_is_file_cache(page));
680
675 } else { 681 } else {
676 /* Because we don't have the big zone->lock, we should 682 /* Because we don't have the big zone->lock, we should
677 check this again here. */ 683 check this again here. */
@@ -694,7 +700,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
694 if (list_empty(&source)) 700 if (list_empty(&source))
695 goto out; 701 goto out;
696 /* this function returns # of failed pages */ 702 /* this function returns # of failed pages */
697 ret = migrate_pages(&source, hotremove_migrate_alloc, 0); 703 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
698 704
699out: 705out:
700 return ret; 706 return ret;
@@ -747,7 +753,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
747 return offlined; 753 return offlined;
748} 754}
749 755
750int offline_pages(unsigned long start_pfn, 756static int offline_pages(unsigned long start_pfn,
751 unsigned long end_pfn, unsigned long timeout) 757 unsigned long end_pfn, unsigned long timeout)
752{ 758{
753 unsigned long pfn, nr_pages, expire; 759 unsigned long pfn, nr_pages, expire;
@@ -849,6 +855,10 @@ repeat:
849 855
850 setup_per_zone_wmarks(); 856 setup_per_zone_wmarks();
851 calculate_zone_inactive_ratio(zone); 857 calculate_zone_inactive_ratio(zone);
858 if (!node_present_pages(node)) {
859 node_clear_state(node, N_HIGH_MEMORY);
860 kswapd_stop(node);
861 }
852 862
853 vm_total_pages = nr_free_pagecache_pages(); 863 vm_total_pages = nr_free_pagecache_pages();
854 writeback_set_ratelimit(); 864 writeback_set_ratelimit();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4545d5944243..290fb5bf0440 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -85,10 +85,12 @@
85#include <linux/seq_file.h> 85#include <linux/seq_file.h>
86#include <linux/proc_fs.h> 86#include <linux/proc_fs.h>
87#include <linux/migrate.h> 87#include <linux/migrate.h>
88#include <linux/ksm.h>
88#include <linux/rmap.h> 89#include <linux/rmap.h>
89#include <linux/security.h> 90#include <linux/security.h>
90#include <linux/syscalls.h> 91#include <linux/syscalls.h>
91#include <linux/ctype.h> 92#include <linux/ctype.h>
93#include <linux/mm_inline.h>
92 94
93#include <asm/tlbflush.h> 95#include <asm/tlbflush.h>
94#include <asm/uaccess.h> 96#include <asm/uaccess.h>
@@ -412,17 +414,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
412 if (!page) 414 if (!page)
413 continue; 415 continue;
414 /* 416 /*
415 * The check for PageReserved here is important to avoid 417 * vm_normal_page() filters out zero pages, but there might
416 * handling zero pages and other pages that may have been 418 * still be PageReserved pages to skip, perhaps in a VDSO.
417 * marked special by the system. 419 * And we cannot move PageKsm pages sensibly or safely yet.
418 *
419 * If the PageReserved would not be checked here then f.e.
420 * the location of the zero page could have an influence
421 * on MPOL_MF_STRICT, zero pages would be counted for
422 * the per node stats, and there would be useless attempts
423 * to put zero pages on the migration list.
424 */ 420 */
425 if (PageReserved(page)) 421 if (PageReserved(page) || PageKsm(page))
426 continue; 422 continue;
427 nid = page_to_nid(page); 423 nid = page_to_nid(page);
428 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 424 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -809,6 +805,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
809 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { 805 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
810 if (!isolate_lru_page(page)) { 806 if (!isolate_lru_page(page)) {
811 list_add_tail(&page->lru, pagelist); 807 list_add_tail(&page->lru, pagelist);
808 inc_zone_page_state(page, NR_ISOLATED_ANON +
809 page_is_file_cache(page));
812 } 810 }
813 } 811 }
814} 812}
@@ -836,7 +834,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
836 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 834 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
837 835
838 if (!list_empty(&pagelist)) 836 if (!list_empty(&pagelist))
839 err = migrate_pages(&pagelist, new_node_page, dest); 837 err = migrate_pages(&pagelist, new_node_page, dest, 0);
840 838
841 return err; 839 return err;
842} 840}
@@ -1053,7 +1051,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1053 1051
1054 if (!list_empty(&pagelist)) 1052 if (!list_empty(&pagelist))
1055 nr_failed = migrate_pages(&pagelist, new_vma_page, 1053 nr_failed = migrate_pages(&pagelist, new_vma_page,
1056 (unsigned long)vma); 1054 (unsigned long)vma, 0);
1057 1055
1058 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1056 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1059 err = -EIO; 1057 err = -EIO;
@@ -1565,6 +1563,53 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1565 } 1563 }
1566 return zl; 1564 return zl;
1567} 1565}
1566
1567/*
1568 * init_nodemask_of_mempolicy
1569 *
1570 * If the current task's mempolicy is "default" [NULL], return 'false'
1571 * to indicate default policy. Otherwise, extract the policy nodemask
1572 * for 'bind' or 'interleave' policy into the argument nodemask, or
1573 * initialize the argument nodemask to contain the single node for
1574 * 'preferred' or 'local' policy and return 'true' to indicate presence
1575 * of non-default mempolicy.
1576 *
1577 * We don't bother with reference counting the mempolicy [mpol_get/put]
 1578 * because the current task is examining its own mempolicy and a task's
1579 * mempolicy is only ever changed by the task itself.
1580 *
1581 * N.B., it is the caller's responsibility to free a returned nodemask.
1582 */
1583bool init_nodemask_of_mempolicy(nodemask_t *mask)
1584{
1585 struct mempolicy *mempolicy;
1586 int nid;
1587
1588 if (!(mask && current->mempolicy))
1589 return false;
1590
1591 mempolicy = current->mempolicy;
1592 switch (mempolicy->mode) {
1593 case MPOL_PREFERRED:
1594 if (mempolicy->flags & MPOL_F_LOCAL)
1595 nid = numa_node_id();
1596 else
1597 nid = mempolicy->v.preferred_node;
1598 init_nodemask_of_node(mask, nid);
1599 break;
1600
1601 case MPOL_BIND:
1602 /* Fall through */
1603 case MPOL_INTERLEAVE:
1604 *mask = mempolicy->v.nodes;
1605 break;
1606
1607 default:
1608 BUG();
1609 }
1610
1611 return true;
1612}
1568#endif 1613#endif
1569 1614
1570/* Allocate a page in interleaved policy. 1615/* Allocate a page in interleaved policy.
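
A hedged usage sketch for init_nodemask_of_mempolicy(): a caller that wants to visit only the nodes the current task's policy allows. The walker function is hypothetical; hugetlb-style per-node allocation is the kind of caller this helper was written for.

static void walk_permitted_nodes(void (*fn)(int nid))
{
	nodemask_t mask;
	const nodemask_t *nodes = &node_states[N_HIGH_MEMORY];
	int nid;

	/* Non-default task mempolicy? Then restrict to its nodes. */
	if (init_nodemask_of_mempolicy(&mask))
		nodes = &mask;

	for_each_node_mask(nid, *nodes)
		fn(nid);
}
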
diff --git a/mm/migrate.c b/mm/migrate.c
index 7dbcb22316d2..efddbf0926b2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -21,6 +21,7 @@
21#include <linux/mm_inline.h> 21#include <linux/mm_inline.h>
22#include <linux/nsproxy.h> 22#include <linux/nsproxy.h>
23#include <linux/pagevec.h> 23#include <linux/pagevec.h>
24#include <linux/ksm.h>
24#include <linux/rmap.h> 25#include <linux/rmap.h>
25#include <linux/topology.h> 26#include <linux/topology.h>
26#include <linux/cpu.h> 27#include <linux/cpu.h>
@@ -78,8 +79,8 @@ int putback_lru_pages(struct list_head *l)
78/* 79/*
79 * Restore a potential migration pte to a working pte entry 80 * Restore a potential migration pte to a working pte entry
80 */ 81 */
81static void remove_migration_pte(struct vm_area_struct *vma, 82static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
82 struct page *old, struct page *new) 83 unsigned long addr, void *old)
83{ 84{
84 struct mm_struct *mm = vma->vm_mm; 85 struct mm_struct *mm = vma->vm_mm;
85 swp_entry_t entry; 86 swp_entry_t entry;
@@ -88,40 +89,37 @@ static void remove_migration_pte(struct vm_area_struct *vma,
88 pmd_t *pmd; 89 pmd_t *pmd;
89 pte_t *ptep, pte; 90 pte_t *ptep, pte;
90 spinlock_t *ptl; 91 spinlock_t *ptl;
91 unsigned long addr = page_address_in_vma(new, vma);
92
93 if (addr == -EFAULT)
94 return;
95 92
96 pgd = pgd_offset(mm, addr); 93 pgd = pgd_offset(mm, addr);
97 if (!pgd_present(*pgd)) 94 if (!pgd_present(*pgd))
98 return; 95 goto out;
99 96
100 pud = pud_offset(pgd, addr); 97 pud = pud_offset(pgd, addr);
101 if (!pud_present(*pud)) 98 if (!pud_present(*pud))
102 return; 99 goto out;
103 100
104 pmd = pmd_offset(pud, addr); 101 pmd = pmd_offset(pud, addr);
105 if (!pmd_present(*pmd)) 102 if (!pmd_present(*pmd))
106 return; 103 goto out;
107 104
108 ptep = pte_offset_map(pmd, addr); 105 ptep = pte_offset_map(pmd, addr);
109 106
110 if (!is_swap_pte(*ptep)) { 107 if (!is_swap_pte(*ptep)) {
111 pte_unmap(ptep); 108 pte_unmap(ptep);
112 return; 109 goto out;
113 } 110 }
114 111
115 ptl = pte_lockptr(mm, pmd); 112 ptl = pte_lockptr(mm, pmd);
116 spin_lock(ptl); 113 spin_lock(ptl);
117 pte = *ptep; 114 pte = *ptep;
118 if (!is_swap_pte(pte)) 115 if (!is_swap_pte(pte))
119 goto out; 116 goto unlock;
120 117
121 entry = pte_to_swp_entry(pte); 118 entry = pte_to_swp_entry(pte);
122 119
123 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) 120 if (!is_migration_entry(entry) ||
124 goto out; 121 migration_entry_to_page(entry) != old)
122 goto unlock;
125 123
126 get_page(new); 124 get_page(new);
127 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 125 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
@@ -137,58 +135,10 @@ static void remove_migration_pte(struct vm_area_struct *vma,
137 135
138 /* No need to invalidate - it was non-present before */ 136 /* No need to invalidate - it was non-present before */
139 update_mmu_cache(vma, addr, pte); 137 update_mmu_cache(vma, addr, pte);
140 138unlock:
141out:
142 pte_unmap_unlock(ptep, ptl); 139 pte_unmap_unlock(ptep, ptl);
143} 140out:
144 141 return SWAP_AGAIN;
145/*
146 * Note that remove_file_migration_ptes will only work on regular mappings,
147 * Nonlinear mappings do not use migration entries.
148 */
149static void remove_file_migration_ptes(struct page *old, struct page *new)
150{
151 struct vm_area_struct *vma;
152 struct address_space *mapping = new->mapping;
153 struct prio_tree_iter iter;
154 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
155
156 if (!mapping)
157 return;
158
159 spin_lock(&mapping->i_mmap_lock);
160
161 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
162 remove_migration_pte(vma, old, new);
163
164 spin_unlock(&mapping->i_mmap_lock);
165}
166
167/*
168 * Must hold mmap_sem lock on at least one of the vmas containing
169 * the page so that the anon_vma cannot vanish.
170 */
171static void remove_anon_migration_ptes(struct page *old, struct page *new)
172{
173 struct anon_vma *anon_vma;
174 struct vm_area_struct *vma;
175 unsigned long mapping;
176
177 mapping = (unsigned long)new->mapping;
178
179 if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
180 return;
181
182 /*
183 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
184 */
185 anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
186 spin_lock(&anon_vma->lock);
187
188 list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
189 remove_migration_pte(vma, old, new);
190
191 spin_unlock(&anon_vma->lock);
192} 142}
193 143
194/* 144/*
@@ -197,10 +147,7 @@ static void remove_anon_migration_ptes(struct page *old, struct page *new)
197 */ 147 */
198static void remove_migration_ptes(struct page *old, struct page *new) 148static void remove_migration_ptes(struct page *old, struct page *new)
199{ 149{
200 if (PageAnon(new)) 150 rmap_walk(new, remove_migration_pte, old);
201 remove_anon_migration_ptes(old, new);
202 else
203 remove_file_migration_ptes(old, new);
204} 151}
205 152
206/* 153/*
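
remove_migration_pte() is now just one rmap_walk() client. The callback contract, as used above: the walker invokes the callback once per (vma, address) mapping of the page and keeps going while SWAP_AGAIN is returned. A minimal sketch of another, purely illustrative client:

static int count_one_mapping(struct page *page, struct vm_area_struct *vma,
			     unsigned long addr, void *arg)
{
	unsigned long *nr = arg;

	(*nr)++;			/* just count the mappings */
	return SWAP_AGAIN;		/* anything else stops the walk */
}

static unsigned long count_mappings(struct page *page)
{
	unsigned long nr = 0;

	rmap_walk(page, count_one_mapping, &nr);
	return nr;
}
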
@@ -341,8 +288,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
341 if (TestClearPageActive(page)) { 288 if (TestClearPageActive(page)) {
342 VM_BUG_ON(PageUnevictable(page)); 289 VM_BUG_ON(PageUnevictable(page));
343 SetPageActive(newpage); 290 SetPageActive(newpage);
344 } else 291 } else if (TestClearPageUnevictable(page))
345 unevictable_migrate_page(newpage, page); 292 SetPageUnevictable(newpage);
346 if (PageChecked(page)) 293 if (PageChecked(page))
347 SetPageChecked(newpage); 294 SetPageChecked(newpage);
348 if (PageMappedToDisk(page)) 295 if (PageMappedToDisk(page))
@@ -361,6 +308,7 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
361 } 308 }
362 309
363 mlock_migrate_page(newpage, page); 310 mlock_migrate_page(newpage, page);
311 ksm_migrate_page(newpage, page);
364 312
365 ClearPageSwapCache(page); 313 ClearPageSwapCache(page);
366 ClearPagePrivate(page); 314 ClearPagePrivate(page);
@@ -580,9 +528,9 @@ static int move_to_new_page(struct page *newpage, struct page *page)
580 else 528 else
581 rc = fallback_migrate_page(mapping, newpage, page); 529 rc = fallback_migrate_page(mapping, newpage, page);
582 530
583 if (!rc) { 531 if (!rc)
584 remove_migration_ptes(page, newpage); 532 remove_migration_ptes(page, newpage);
585 } else 533 else
586 newpage->mapping = NULL; 534 newpage->mapping = NULL;
587 535
588 unlock_page(newpage); 536 unlock_page(newpage);
@@ -595,7 +543,7 @@ static int move_to_new_page(struct page *newpage, struct page *page)
595 * to the newly allocated page in newpage. 543 * to the newly allocated page in newpage.
596 */ 544 */
597static int unmap_and_move(new_page_t get_new_page, unsigned long private, 545static int unmap_and_move(new_page_t get_new_page, unsigned long private,
598 struct page *page, int force) 546 struct page *page, int force, int offlining)
599{ 547{
600 int rc = 0; 548 int rc = 0;
601 int *result = NULL; 549 int *result = NULL;
@@ -621,6 +569,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
621 lock_page(page); 569 lock_page(page);
622 } 570 }
623 571
572 /*
573 * Only memory hotplug's offline_pages() caller has locked out KSM,
574 * and can safely migrate a KSM page. The other cases have skipped
575 * PageKsm along with PageReserved - but it is only now when we have
576 * the page lock that we can be certain it will not go KSM beneath us
577 * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
578 * its pagecount raised, but only here do we take the page lock which
579 * serializes that).
580 */
581 if (PageKsm(page) && !offlining) {
582 rc = -EBUSY;
583 goto unlock;
584 }
585
624 /* charge against new page */ 586 /* charge against new page */
625 charge = mem_cgroup_prepare_migration(page, &mem); 587 charge = mem_cgroup_prepare_migration(page, &mem);
626 if (charge == -ENOMEM) { 588 if (charge == -ENOMEM) {
@@ -737,7 +699,7 @@ move_newpage:
737 * Return: Number of pages not migrated or error code. 699 * Return: Number of pages not migrated or error code.
738 */ 700 */
739int migrate_pages(struct list_head *from, 701int migrate_pages(struct list_head *from,
740 new_page_t get_new_page, unsigned long private) 702 new_page_t get_new_page, unsigned long private, int offlining)
741{ 703{
742 int retry = 1; 704 int retry = 1;
743 int nr_failed = 0; 705 int nr_failed = 0;
@@ -746,13 +708,6 @@ int migrate_pages(struct list_head *from,
746 struct page *page2; 708 struct page *page2;
747 int swapwrite = current->flags & PF_SWAPWRITE; 709 int swapwrite = current->flags & PF_SWAPWRITE;
748 int rc; 710 int rc;
749 unsigned long flags;
750
751 local_irq_save(flags);
752 list_for_each_entry(page, from, lru)
753 __inc_zone_page_state(page, NR_ISOLATED_ANON +
754 page_is_file_cache(page));
755 local_irq_restore(flags);
756 711
757 if (!swapwrite) 712 if (!swapwrite)
758 current->flags |= PF_SWAPWRITE; 713 current->flags |= PF_SWAPWRITE;
@@ -764,7 +719,7 @@ int migrate_pages(struct list_head *from,
764 cond_resched(); 719 cond_resched();
765 720
766 rc = unmap_and_move(get_new_page, private, 721 rc = unmap_and_move(get_new_page, private,
767 page, pass > 2); 722 page, pass > 2, offlining);
768 723
769 switch(rc) { 724 switch(rc) {
770 case -ENOMEM: 725 case -ENOMEM:
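
The hunks above thread a new "offlining" argument from migrate_pages() down to unmap_and_move(), so that only callers which have locked out KSM (memory hotplug's offline_pages()) may migrate PageKsm pages. A hedged sketch of the calling convention this implies; the wrapper below is illustrative and not part of the patch:

#include <linux/types.h>
#include <linux/list.h>
#include <linux/migrate.h>

/* Hypothetical wrapper, not from this patch: only an offline_pages()-style
 * caller that has locked out KSM may pass offlining != 0; everyone else
 * passes 0 and unmap_and_move() fails PageKsm pages with -EBUSY instead. */
static int migrate_isolated_pages(struct list_head *pagelist,
				  new_page_t get_new_page,
				  unsigned long private, bool offlining)
{
	if (list_empty(pagelist))
		return 0;

	return migrate_pages(pagelist, get_new_page, private,
			     offlining ? 1 : 0);
}
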
@@ -860,7 +815,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
860 if (!page) 815 if (!page)
861 goto set_status; 816 goto set_status;
862 817
863 if (PageReserved(page)) /* Check for zero page */ 818 /* Use PageReserved to check for zero page */
819 if (PageReserved(page) || PageKsm(page))
864 goto put_and_set; 820 goto put_and_set;
865 821
866 pp->page = page; 822 pp->page = page;
@@ -878,8 +834,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
878 goto put_and_set; 834 goto put_and_set;
879 835
880 err = isolate_lru_page(page); 836 err = isolate_lru_page(page);
881 if (!err) 837 if (!err) {
882 list_add_tail(&page->lru, &pagelist); 838 list_add_tail(&page->lru, &pagelist);
839 inc_zone_page_state(page, NR_ISOLATED_ANON +
840 page_is_file_cache(page));
841 }
883put_and_set: 842put_and_set:
884 /* 843 /*
885 * Either remove the duplicate refcount from 844 * Either remove the duplicate refcount from
@@ -894,7 +853,7 @@ set_status:
894 err = 0; 853 err = 0;
895 if (!list_empty(&pagelist)) 854 if (!list_empty(&pagelist))
896 err = migrate_pages(&pagelist, new_page_node, 855 err = migrate_pages(&pagelist, new_page_node,
897 (unsigned long)pm); 856 (unsigned long)pm, 0);
898 857
899 up_read(&mm->mmap_sem); 858 up_read(&mm->mmap_sem);
900 return err; 859 return err;
@@ -1015,7 +974,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1015 974
1016 err = -ENOENT; 975 err = -ENOENT;
1017 /* Use PageReserved to check for zero page */ 976 /* Use PageReserved to check for zero page */
1018 if (!page || PageReserved(page)) 977 if (!page || PageReserved(page) || PageKsm(page))
1019 goto set_status; 978 goto set_status;
1020 979
1021 err = page_to_nid(page); 980 err = page_to_nid(page);
@@ -1044,7 +1003,7 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1044 int err; 1003 int err;
1045 1004
1046 for (i = 0; i < nr_pages; i += chunk_nr) { 1005 for (i = 0; i < nr_pages; i += chunk_nr) {
1047 if (chunk_nr + i > nr_pages) 1006 if (chunk_nr > nr_pages - i)
1048 chunk_nr = nr_pages - i; 1007 chunk_nr = nr_pages - i;
1049 1008
1050 err = copy_from_user(chunk_pages, &pages[i], 1009 err = copy_from_user(chunk_pages, &pages[i],
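
The last hunk above tightens the chunk clamp in do_pages_stat(): comparing "chunk_nr > nr_pages - i" instead of "chunk_nr + i > nr_pages" avoids any wrap-around in the sum. A minimal stand-alone sketch of that loop shape, with invented names:

#include <stddef.h>
#include <string.h>

#define CHUNK 16UL

/* Copy nr bytes in fixed-size chunks; the final chunk is clamped with a
 * subtraction so the comparison can never wrap, mirroring the fix above. */
static void copy_in_chunks(char *dst, const char *src, unsigned long nr)
{
	unsigned long i, chunk_nr = CHUNK;

	for (i = 0; i < nr; i += chunk_nr) {
		if (chunk_nr > nr - i)	/* safe: no "i + chunk_nr" sum */
			chunk_nr = nr - i;
		memcpy(dst + i, src + i, chunk_nr);
	}
}

int main(void)
{
	char src[40] = "the quick brown fox jumps over the dog", dst[40];

	copy_in_chunks(dst, src, sizeof(src));
	return 0;
}
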
diff --git a/mm/mincore.c b/mm/mincore.c
index 8cb508f84ea4..7a3436ef39eb 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -14,6 +14,7 @@
14#include <linux/syscalls.h> 14#include <linux/syscalls.h>
15#include <linux/swap.h> 15#include <linux/swap.h>
16#include <linux/swapops.h> 16#include <linux/swapops.h>
17#include <linux/hugetlb.h>
17 18
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
19#include <asm/pgtable.h> 20#include <asm/pgtable.h>
@@ -72,6 +73,42 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag
72 if (!vma || addr < vma->vm_start) 73 if (!vma || addr < vma->vm_start)
73 return -ENOMEM; 74 return -ENOMEM;
74 75
76#ifdef CONFIG_HUGETLB_PAGE
77 if (is_vm_hugetlb_page(vma)) {
78 struct hstate *h;
79 unsigned long nr_huge;
80 unsigned char present;
81
82 i = 0;
83 nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT);
84 h = hstate_vma(vma);
85 nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h))
86 - (addr >> huge_page_shift(h)) + 1;
87 nr_huge = min(nr_huge,
88 (vma->vm_end - addr) >> huge_page_shift(h));
89 while (1) {
90 /* hugepage always in RAM for now,
 91 * but generally it needs to be checked */
92 ptep = huge_pte_offset(current->mm,
93 addr & huge_page_mask(h));
94 present = !!(ptep &&
95 !huge_pte_none(huge_ptep_get(ptep)));
96 while (1) {
97 vec[i++] = present;
98 addr += PAGE_SIZE;
99 /* reach buffer limit */
100 if (i == nr)
101 return nr;
102 /* check hugepage border */
103 if (!((addr & ~huge_page_mask(h))
104 >> PAGE_SHIFT))
105 break;
106 }
107 }
108 return nr;
109 }
110#endif
111
75 /* 112 /*
76 * Calculate how many pages there are left in the last level of the 113 * Calculate how many pages there are left in the last level of the
77 * PTE array for our address. 114 * PTE array for our address.
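
The hugetlb branch added to do_mincore() above derives nr_huge from the huge-page-shifted start and end addresses. A quick stand-alone check of that arithmetic, assuming 4kB base pages and 2MB huge pages purely for illustration:

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12;	/* 4kB base pages (assumed) */
	unsigned long huge_shift = 21;	/* 2MB huge pages (assumed) */
	unsigned long addr = 0x1ff000;	/* just below a 2MB boundary */
	unsigned long pages = 3;	/* the request crosses that boundary */
	unsigned long end = addr + (pages << page_shift) - 1;
	unsigned long nr_huge = (end >> huge_shift) - (addr >> huge_shift) + 1;

	printf("range spans %lu huge page(s)\n", nr_huge);	/* prints 2 */
	return 0;
}
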
diff --git a/mm/mlock.c b/mm/mlock.c
index bd6f0e466f6c..2b8335a89400 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -88,25 +88,22 @@ void mlock_vma_page(struct page *page)
88 } 88 }
89} 89}
90 90
91/* 91/**
92 * called from munlock()/munmap() path with page supposedly on the LRU. 92 * munlock_vma_page - munlock a vma page
93 * @page - page to be unlocked
93 * 94 *
94 * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked 95 * called from munlock()/munmap() path with page supposedly on the LRU.
95 * [in try_to_munlock()] and then attempt to isolate the page. We must 96 * When we munlock a page, because the vma where we found the page is being
96 * isolate the page to keep others from messing with its unevictable 97 * munlock()ed or munmap()ed, we want to check whether other vmas hold the
97 * and mlocked state while trying to munlock. However, we pre-clear the 98 * page locked so that we can leave it on the unevictable lru list and not
98 * mlocked state anyway as we might lose the isolation race and we might 99 * bother vmscan with it. However, to walk the page's rmap list in
99 * not get another chance to clear PageMlocked. If we successfully 100 * try_to_munlock() we must isolate the page from the LRU. If some other
100 * isolate the page and try_to_munlock() detects other VM_LOCKED vmas 101 * task has removed the page from the LRU, we won't be able to do that.
101 * mapping the page, it will restore the PageMlocked state, unless the page 102 * So we clear the PageMlocked as we might not get another chance. If we
102 * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), 103 * can't isolate the page, we leave it for putback_lru_page() and vmscan
103 * perhaps redundantly. 104 * [page_referenced()/try_to_unmap()] to deal with.
104 * If we lose the isolation race, and the page is mapped by other VM_LOCKED
105 * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
106 * either of which will restore the PageMlocked state by calling
107 * mlock_vma_page() above, if it can grab the vma's mmap sem.
108 */ 105 */
109static void munlock_vma_page(struct page *page) 106void munlock_vma_page(struct page *page)
110{ 107{
111 BUG_ON(!PageLocked(page)); 108 BUG_ON(!PageLocked(page));
112 109
@@ -117,18 +114,18 @@ static void munlock_vma_page(struct page *page)
117 /* 114 /*
118 * did try_to_unlock() succeed or punt? 115 * did try_to_unlock() succeed or punt?
119 */ 116 */
120 if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) 117 if (ret != SWAP_MLOCK)
121 count_vm_event(UNEVICTABLE_PGMUNLOCKED); 118 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
122 119
123 putback_lru_page(page); 120 putback_lru_page(page);
124 } else { 121 } else {
125 /* 122 /*
126 * We lost the race. let try_to_unmap() deal 123 * Some other task has removed the page from the LRU.
127 * with it. At least we get the page state and 124 * putback_lru_page() will take care of removing the
128 * mlock stats right. However, page is still on 125 * page from the unevictable list, if necessary.
129 * the noreclaim list. We'll fix that up when 126 * vmscan [page_referenced()] will move the page back
130 * the page is eventually freed or we scan the 127 * to the unevictable list if some other vma has it
131 * noreclaim list. 128 * mlocked.
132 */ 129 */
133 if (PageUnevictable(page)) 130 if (PageUnevictable(page))
134 count_vm_event(UNEVICTABLE_PGSTRANDED); 131 count_vm_event(UNEVICTABLE_PGSTRANDED);
diff --git a/mm/mmap.c b/mm/mmap.c
index ed70a68e882a..ee2298936fe6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1043,6 +1043,46 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1043} 1043}
1044EXPORT_SYMBOL(do_mmap_pgoff); 1044EXPORT_SYMBOL(do_mmap_pgoff);
1045 1045
1046SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1047 unsigned long, prot, unsigned long, flags,
1048 unsigned long, fd, unsigned long, pgoff)
1049{
1050 struct file *file = NULL;
1051 unsigned long retval = -EBADF;
1052
1053 if (!(flags & MAP_ANONYMOUS)) {
1054 if (unlikely(flags & MAP_HUGETLB))
1055 return -EINVAL;
1056 file = fget(fd);
1057 if (!file)
1058 goto out;
1059 } else if (flags & MAP_HUGETLB) {
1060 struct user_struct *user = NULL;
1061 /*
1062 * VM_NORESERVE is used because the reservations will be
1063 * taken when vm_ops->mmap() is called
1064 * A dummy user value is used because we are not locking
1065 * memory so no accounting is necessary
1066 */
1067 len = ALIGN(len, huge_page_size(&default_hstate));
1068 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
1069 &user, HUGETLB_ANONHUGE_INODE);
1070 if (IS_ERR(file))
1071 return PTR_ERR(file);
1072 }
1073
1074 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1075
1076 down_write(&current->mm->mmap_sem);
1077 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1078 up_write(&current->mm->mmap_sem);
1079
1080 if (file)
1081 fput(file);
1082out:
1083 return retval;
1084}
1085
1046/* 1086/*
1047 * Some shared mappings will want the pages marked read-only 1087 * Some shared mappings will want the pages marked read-only
1048 * to track write events. If so, we'll downgrade vm_page_prot 1088 * to track write events. If so, we'll downgrade vm_page_prot
@@ -1198,8 +1238,20 @@ munmap_back:
1198 goto free_vma; 1238 goto free_vma;
1199 } 1239 }
1200 1240
1201 if (vma_wants_writenotify(vma)) 1241 if (vma_wants_writenotify(vma)) {
1242 pgprot_t pprot = vma->vm_page_prot;
1243
1244 /* Can vma->vm_page_prot have changed??
1245 *
1246 * Answer: Yes, drivers may have changed it in their
1247 * f_op->mmap method.
1248 *
1249 * This ensures that vmas marked as uncached stay that way.
1250 */
1202 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); 1251 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1252 if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
1253 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1254 }
1203 1255
1204 vma_link(mm, vma, prev, rb_link, rb_parent); 1256 vma_link(mm, vma, prev, rb_link, rb_parent);
1205 file = vma->vm_file; 1257 file = vma->vm_file;
@@ -1811,10 +1863,10 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1811} 1863}
1812 1864
1813/* 1865/*
1814 * Split a vma into two pieces at address 'addr', a new vma is allocated 1866 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the
1815 * either for the first part or the tail. 1867 * munmap path where it doesn't make sense to fail.
1816 */ 1868 */
1817int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, 1869static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1818 unsigned long addr, int new_below) 1870 unsigned long addr, int new_below)
1819{ 1871{
1820 struct mempolicy *pol; 1872 struct mempolicy *pol;
@@ -1824,9 +1876,6 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1824 ~(huge_page_mask(hstate_vma(vma))))) 1876 ~(huge_page_mask(hstate_vma(vma)))))
1825 return -EINVAL; 1877 return -EINVAL;
1826 1878
1827 if (mm->map_count >= sysctl_max_map_count)
1828 return -ENOMEM;
1829
1830 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 1879 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
1831 if (!new) 1880 if (!new)
1832 return -ENOMEM; 1881 return -ENOMEM;
@@ -1866,6 +1915,19 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1866 return 0; 1915 return 0;
1867} 1916}
1868 1917
1918/*
1919 * Split a vma into two pieces at address 'addr', a new vma is allocated
1920 * either for the first part or the tail.
1921 */
1922int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1923 unsigned long addr, int new_below)
1924{
1925 if (mm->map_count >= sysctl_max_map_count)
1926 return -ENOMEM;
1927
1928 return __split_vma(mm, vma, addr, new_below);
1929}
1930
1869/* Munmap is split into 2 main parts -- this part which finds 1931/* Munmap is split into 2 main parts -- this part which finds
1870 * what needs doing, and the areas themselves, which do the 1932 * what needs doing, and the areas themselves, which do the
1871 * work. This now handles partial unmappings. 1933 * work. This now handles partial unmappings.
@@ -1901,7 +1963,17 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1901 * places tmp vma above, and higher split_vma places tmp vma below. 1963 * places tmp vma above, and higher split_vma places tmp vma below.
1902 */ 1964 */
1903 if (start > vma->vm_start) { 1965 if (start > vma->vm_start) {
1904 int error = split_vma(mm, vma, start, 0); 1966 int error;
1967
1968 /*
1969 * Make sure that map_count on return from munmap() will
1970 * not exceed its limit; but let map_count go just above
1971 * its limit temporarily, to help free resources as expected.
1972 */
1973 if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
1974 return -ENOMEM;
1975
1976 error = __split_vma(mm, vma, start, 0);
1905 if (error) 1977 if (error)
1906 return error; 1978 return error;
1907 prev = vma; 1979 prev = vma;
@@ -1910,7 +1982,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1910 /* Does it split the last one? */ 1982 /* Does it split the last one? */
1911 last = find_vma(mm, end); 1983 last = find_vma(mm, end);
1912 if (last && end > last->vm_start) { 1984 if (last && end > last->vm_start) {
1913 int error = split_vma(mm, last, end, 1); 1985 int error = __split_vma(mm, last, end, 1);
1914 if (error) 1986 if (error)
1915 return error; 1987 return error;
1916 } 1988 }
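
The new sys_mmap_pgoff() above handles anonymous MAP_HUGETLB requests by creating a hugetlbfs file via hugetlb_file_setup(). A hedged user-space illustration of a call that would exercise that branch; the MAP_HUGETLB fallback value and the 2MB length are assumptions, and huge pages must already be reserved on the system:

#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000	/* x86 value; assumed for older headers */
#endif

int main(void)
{
	size_t len = 2 * 1024 * 1024;	/* one 2MB huge page assumed */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");	/* e.g. no hugepages reserved */
		return 1;
	}
	munmap(p, len);
	return 0;
}
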
diff --git a/mm/nommu.c b/mm/nommu.c
index 9876fa0c3ad3..48a2ecfaf059 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
432 /* 432 /*
433 * Ok, looks good - let it rip. 433 * Ok, looks good - let it rip.
434 */ 434 */
435 flush_icache_range(mm->brk, brk);
435 return mm->brk = brk; 436 return mm->brk = brk;
436} 437}
437 438
@@ -551,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to)
551static void __put_nommu_region(struct vm_region *region) 552static void __put_nommu_region(struct vm_region *region)
552 __releases(nommu_region_sem) 553 __releases(nommu_region_sem)
553{ 554{
554 kenter("%p{%d}", region, atomic_read(&region->vm_usage)); 555 kenter("%p{%d}", region, region->vm_usage);
555 556
556 BUG_ON(!nommu_region_tree.rb_node); 557 BUG_ON(!nommu_region_tree.rb_node);
557 558
558 if (atomic_dec_and_test(&region->vm_usage)) { 559 if (--region->vm_usage == 0) {
559 if (region->vm_top > region->vm_start) 560 if (region->vm_top > region->vm_start)
560 delete_nommu_region(region); 561 delete_nommu_region(region);
561 up_write(&nommu_region_sem); 562 up_write(&nommu_region_sem);
@@ -1143,9 +1144,6 @@ static int do_mmap_private(struct vm_area_struct *vma,
1143 if (ret < rlen) 1144 if (ret < rlen)
1144 memset(base + ret, 0, rlen - ret); 1145 memset(base + ret, 0, rlen - ret);
1145 1146
1146 } else {
1147 /* if it's an anonymous mapping, then just clear it */
1148 memset(base, 0, rlen);
1149 } 1147 }
1150 1148
1151 return 0; 1149 return 0;
@@ -1207,7 +1205,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1207 if (!vma) 1205 if (!vma)
1208 goto error_getting_vma; 1206 goto error_getting_vma;
1209 1207
1210 atomic_set(&region->vm_usage, 1); 1208 region->vm_usage = 1;
1211 region->vm_flags = vm_flags; 1209 region->vm_flags = vm_flags;
1212 region->vm_pgoff = pgoff; 1210 region->vm_pgoff = pgoff;
1213 1211
@@ -1274,7 +1272,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1274 } 1272 }
1275 1273
1276 /* we've found a region we can share */ 1274 /* we've found a region we can share */
1277 atomic_inc(&pregion->vm_usage); 1275 pregion->vm_usage++;
1278 vma->vm_region = pregion; 1276 vma->vm_region = pregion;
1279 start = pregion->vm_start; 1277 start = pregion->vm_start;
1280 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; 1278 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
@@ -1291,7 +1289,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1291 vma->vm_region = NULL; 1289 vma->vm_region = NULL;
1292 vma->vm_start = 0; 1290 vma->vm_start = 0;
1293 vma->vm_end = 0; 1291 vma->vm_end = 0;
1294 atomic_dec(&pregion->vm_usage); 1292 pregion->vm_usage--;
1295 pregion = NULL; 1293 pregion = NULL;
1296 goto error_just_free; 1294 goto error_just_free;
1297 } 1295 }
@@ -1343,6 +1341,11 @@ unsigned long do_mmap_pgoff(struct file *file,
1343 goto error_just_free; 1341 goto error_just_free;
1344 add_nommu_region(region); 1342 add_nommu_region(region);
1345 1343
1344 /* clear anonymous mappings that don't ask for uninitialized data */
1345 if (!vma->vm_file && !(flags & MAP_UNINITIALIZED))
1346 memset((void *)region->vm_start, 0,
1347 region->vm_end - region->vm_start);
1348
1346 /* okay... we have a mapping; now we have to register it */ 1349 /* okay... we have a mapping; now we have to register it */
1347 result = vma->vm_start; 1350 result = vma->vm_start;
1348 1351
@@ -1351,10 +1354,14 @@ unsigned long do_mmap_pgoff(struct file *file,
1351share: 1354share:
1352 add_vma_to_mm(current->mm, vma); 1355 add_vma_to_mm(current->mm, vma);
1353 1356
1354 up_write(&nommu_region_sem); 1357 /* we flush the region from the icache only when the first executable
1358 * mapping of it is made */
1359 if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
1360 flush_icache_range(region->vm_start, region->vm_end);
1361 region->vm_icache_flushed = true;
1362 }
1355 1363
1356 if (prot & PROT_EXEC) 1364 up_write(&nommu_region_sem);
1357 flush_icache_range(result, result + len);
1358 1365
1359 kleave(" = %lx", result); 1366 kleave(" = %lx", result);
1360 return result; 1367 return result;
@@ -1396,6 +1403,31 @@ error_getting_region:
1396} 1403}
1397EXPORT_SYMBOL(do_mmap_pgoff); 1404EXPORT_SYMBOL(do_mmap_pgoff);
1398 1405
1406SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1407 unsigned long, prot, unsigned long, flags,
1408 unsigned long, fd, unsigned long, pgoff)
1409{
1410 struct file *file = NULL;
1411 unsigned long retval = -EBADF;
1412
1413 if (!(flags & MAP_ANONYMOUS)) {
1414 file = fget(fd);
1415 if (!file)
1416 goto out;
1417 }
1418
1419 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1420
1421 down_write(&current->mm->mmap_sem);
1422 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1423 up_write(&current->mm->mmap_sem);
1424
1425 if (file)
1426 fput(file);
1427out:
1428 return retval;
1429}
1430
1399/* 1431/*
1400 * split a vma into two pieces at address 'addr', a new vma is allocated either 1432 * split a vma into two pieces at address 'addr', a new vma is allocated either
1401 * for the first part or the tail. 1433 * for the first part or the tail.
@@ -1409,10 +1441,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1409 1441
1410 kenter(""); 1442 kenter("");
1411 1443
1412 /* we're only permitted to split anonymous regions that have a single 1444 /* we're only permitted to split anonymous regions (these should have
1413 * owner */ 1445 * only a single usage on the region) */
1414 if (vma->vm_file || 1446 if (vma->vm_file)
1415 atomic_read(&vma->vm_region->vm_usage) != 1)
1416 return -ENOMEM; 1447 return -ENOMEM;
1417 1448
1418 if (mm->map_count >= sysctl_max_map_count) 1449 if (mm->map_count >= sysctl_max_map_count)
@@ -1486,7 +1517,7 @@ static int shrink_vma(struct mm_struct *mm,
1486 1517
1487 /* cut the backing region down to size */ 1518 /* cut the backing region down to size */
1488 region = vma->vm_region; 1519 region = vma->vm_region;
1489 BUG_ON(atomic_read(&region->vm_usage) != 1); 1520 BUG_ON(region->vm_usage != 1);
1490 1521
1491 down_write(&nommu_region_sem); 1522 down_write(&nommu_region_sem);
1492 delete_nommu_region(region); 1523 delete_nommu_region(region);
@@ -1730,27 +1761,6 @@ void unmap_mapping_range(struct address_space *mapping,
1730EXPORT_SYMBOL(unmap_mapping_range); 1761EXPORT_SYMBOL(unmap_mapping_range);
1731 1762
1732/* 1763/*
1733 * ask for an unmapped area at which to create a mapping on a file
1734 */
1735unsigned long get_unmapped_area(struct file *file, unsigned long addr,
1736 unsigned long len, unsigned long pgoff,
1737 unsigned long flags)
1738{
1739 unsigned long (*get_area)(struct file *, unsigned long, unsigned long,
1740 unsigned long, unsigned long);
1741
1742 get_area = current->mm->get_unmapped_area;
1743 if (file && file->f_op && file->f_op->get_unmapped_area)
1744 get_area = file->f_op->get_unmapped_area;
1745
1746 if (!get_area)
1747 return -ENOSYS;
1748
1749 return get_area(file, addr, len, pgoff, flags);
1750}
1751EXPORT_SYMBOL(get_unmapped_area);
1752
1753/*
1754 * Check that a process has enough memory to allocate a new virtual 1764 * Check that a process has enough memory to allocate a new virtual
1755 * mapping. 0 means there is enough memory for the allocation to 1765 * mapping. 0 means there is enough memory for the allocation to
1756 * succeed and -ENOMEM implies there is not. 1766 * succeed and -ENOMEM implies there is not.
@@ -1889,9 +1899,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
1889 1899
1890 /* only read or write mappings where it is permitted */ 1900 /* only read or write mappings where it is permitted */
1891 if (write && vma->vm_flags & VM_MAYWRITE) 1901 if (write && vma->vm_flags & VM_MAYWRITE)
1892 len -= copy_to_user((void *) addr, buf, len); 1902 copy_to_user_page(vma, NULL, addr,
1903 (void *) addr, buf, len);
1893 else if (!write && vma->vm_flags & VM_MAYREAD) 1904 else if (!write && vma->vm_flags & VM_MAYREAD)
1894 len -= copy_from_user(buf, (void *) addr, len); 1905 copy_from_user_page(vma, NULL, addr,
1906 buf, (void *) addr, len);
1895 else 1907 else
1896 len = 0; 1908 len = 0;
1897 } else { 1909 } else {
@@ -1902,3 +1914,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
1902 mmput(mm); 1914 mmput(mm);
1903 return len; 1915 return len;
1904} 1916}
1917
1918/**
1919 * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
1920 * @inode: The inode to check
1921 * @size: The current filesize of the inode
1922 * @newsize: The proposed filesize of the inode
1923 *
1924 * Check the shared mappings on an inode on behalf of a shrinking truncate to
1925 * make sure that any outstanding VMAs aren't broken and then shrink the
1926 * vm_regions that extend beyond the new size so that do_mmap_pgoff() doesn't
1927 * automatically grant mappings that are too large.
1928 */
1929int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
1930 size_t newsize)
1931{
1932 struct vm_area_struct *vma;
1933 struct prio_tree_iter iter;
1934 struct vm_region *region;
1935 pgoff_t low, high;
1936 size_t r_size, r_top;
1937
1938 low = newsize >> PAGE_SHIFT;
1939 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
1940
1941 down_write(&nommu_region_sem);
1942
1943 /* search for VMAs that fall within the dead zone */
1944 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
1945 low, high) {
1946 /* found one - only interested if it's shared out of the page
1947 * cache */
1948 if (vma->vm_flags & VM_SHARED) {
1949 up_write(&nommu_region_sem);
1950 return -ETXTBSY; /* not quite true, but near enough */
1951 }
1952 }
1953
1954 /* reduce any regions that overlap the dead zone - if in existence,
1955 * these will be pointed to by VMAs that don't overlap the dead zone
1956 *
1957 * we don't check for any regions that start beyond the EOF as there
1958 * shouldn't be any
1959 */
1960 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
1961 0, ULONG_MAX) {
1962 if (!(vma->vm_flags & VM_SHARED))
1963 continue;
1964
1965 region = vma->vm_region;
1966 r_size = region->vm_top - region->vm_start;
1967 r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
1968
1969 if (r_top > newsize) {
1970 region->vm_top -= r_top - newsize;
1971 if (region->vm_end > region->vm_top)
1972 region->vm_end = region->vm_top;
1973 }
1974 }
1975
1976 up_write(&nommu_region_sem);
1977 return 0;
1978}
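
nommu_shrink_inode_mappings() above is meant to be consulted on a shrinking truncate before the inode size changes. The sketch below shows, under assumptions, how a nommu filesystem's resize path might use it; the surrounding function is invented and omits the filesystem's own truncation work:

#include <linux/fs.h>
#include <linux/mm.h>

/* Hypothetical caller, not part of this patch. */
static int example_nommu_resize(struct inode *inode, loff_t newsize)
{
	int ret;

	if (newsize < inode->i_size) {
		/* refuse the shrink if a shared mapping still covers the
		 * region beyond newsize */
		ret = nommu_shrink_inode_mappings(inode, inode->i_size,
						  newsize);
		if (ret < 0)
			return ret;	/* typically -ETXTBSY */
	}

	i_size_write(inode, newsize);
	/* ... the filesystem's own page cache / block truncation here ... */
	return 0;
}
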
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ea2147dabba6..f52481b1c1e5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -196,27 +196,46 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
196/* 196/*
197 * Determine the type of allocation constraint. 197 * Determine the type of allocation constraint.
198 */ 198 */
199static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
200 gfp_t gfp_mask)
201{
202#ifdef CONFIG_NUMA 199#ifdef CONFIG_NUMA
200static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
201 gfp_t gfp_mask, nodemask_t *nodemask)
202{
203 struct zone *zone; 203 struct zone *zone;
204 struct zoneref *z; 204 struct zoneref *z;
205 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 205 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
206 nodemask_t nodes = node_states[N_HIGH_MEMORY];
207 206
208 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 207 /*
209 if (cpuset_zone_allowed_softwall(zone, gfp_mask)) 208 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
210 node_clear(zone_to_nid(zone), nodes); 209 * killing current. We have to fall back to a random task kill in this case.
211 else 210 * Hopefully this would be CONSTRAINT_THISNODE, but there is no way to handle it now.
212 return CONSTRAINT_CPUSET; 211 */
212 if (gfp_mask & __GFP_THISNODE)
213 return CONSTRAINT_NONE;
213 214
214 if (!nodes_empty(nodes)) 215 /*
216 * The nodemask here is a nodemask passed to alloc_pages(). Now,
217 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
218 * feature. mempolicy is the only user of the nodemask here.
219 * Check whether mempolicy's nodemask contains all N_HIGH_MEMORY nodes.
220 */
221 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask))
215 return CONSTRAINT_MEMORY_POLICY; 222 return CONSTRAINT_MEMORY_POLICY;
216#endif
217 223
224 /* Check this allocation failure is caused by cpuset's wall function */
225 for_each_zone_zonelist_nodemask(zone, z, zonelist,
226 high_zoneidx, nodemask)
227 if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
228 return CONSTRAINT_CPUSET;
229
230 return CONSTRAINT_NONE;
231}
232#else
233static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
234 gfp_t gfp_mask, nodemask_t *nodemask)
235{
218 return CONSTRAINT_NONE; 236 return CONSTRAINT_NONE;
219} 237}
238#endif
220 239
221/* 240/*
222 * Simple selection loop. We choose the process with the highest 241 * Simple selection loop. We choose the process with the highest
@@ -337,6 +356,24 @@ static void dump_tasks(const struct mem_cgroup *mem)
337 } while_each_thread(g, p); 356 } while_each_thread(g, p);
338} 357}
339 358
359static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
360 struct mem_cgroup *mem)
361{
362 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
363 "oom_adj=%d\n",
364 current->comm, gfp_mask, order, current->signal->oom_adj);
365 task_lock(current);
366 cpuset_print_task_mems_allowed(current);
367 task_unlock(current);
368 dump_stack();
369 mem_cgroup_print_oom_info(mem, p);
370 show_mem();
371 if (sysctl_oom_dump_tasks)
372 dump_tasks(mem);
373}
374
375#define K(x) ((x) << (PAGE_SHIFT-10))
376
340/* 377/*
341 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO 378 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
342 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO 379 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
@@ -350,15 +387,23 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
350 return; 387 return;
351 } 388 }
352 389
390 task_lock(p);
353 if (!p->mm) { 391 if (!p->mm) {
354 WARN_ON(1); 392 WARN_ON(1);
355 printk(KERN_WARNING "tried to kill an mm-less task!\n"); 393 printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n",
394 task_pid_nr(p), p->comm);
395 task_unlock(p);
356 return; 396 return;
357 } 397 }
358 398
359 if (verbose) 399 if (verbose)
360 printk(KERN_ERR "Killed process %d (%s)\n", 400 printk(KERN_ERR "Killed process %d (%s) "
361 task_pid_nr(p), p->comm); 401 "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
402 task_pid_nr(p), p->comm,
403 K(p->mm->total_vm),
404 K(get_mm_counter(p->mm, anon_rss)),
405 K(get_mm_counter(p->mm, file_rss)));
406 task_unlock(p);
362 407
363 /* 408 /*
364 * We give our sacrificial lamb high priority and access to 409 * We give our sacrificial lamb high priority and access to
@@ -395,20 +440,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
395{ 440{
396 struct task_struct *c; 441 struct task_struct *c;
397 442
398 if (printk_ratelimit()) { 443 if (printk_ratelimit())
399 printk(KERN_WARNING "%s invoked oom-killer: " 444 dump_header(p, gfp_mask, order, mem);
400 "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
401 current->comm, gfp_mask, order,
402 current->signal->oom_adj);
403 task_lock(current);
404 cpuset_print_task_mems_allowed(current);
405 task_unlock(current);
406 dump_stack();
407 mem_cgroup_print_oom_info(mem, current);
408 show_mem();
409 if (sysctl_oom_dump_tasks)
410 dump_tasks(mem);
411 }
412 445
413 /* 446 /*
414 * If the task is already exiting, don't alarm the sysadmin or kill 447 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -544,6 +577,7 @@ retry:
544 /* Found nothing?!?! Either we hang forever, or we panic. */ 577 /* Found nothing?!?! Either we hang forever, or we panic. */
545 if (!p) { 578 if (!p) {
546 read_unlock(&tasklist_lock); 579 read_unlock(&tasklist_lock);
580 dump_header(NULL, gfp_mask, order, NULL);
547 panic("Out of memory and no killable processes...\n"); 581 panic("Out of memory and no killable processes...\n");
548 } 582 }
549 583
@@ -599,7 +633,8 @@ rest_and_return:
599 * OR try to be smart about which process to kill. Note that we 633 * OR try to be smart about which process to kill. Note that we
600 * don't have to be perfect here, we just have to be good. 634 * don't have to be perfect here, we just have to be good.
601 */ 635 */
602void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) 636void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
637 int order, nodemask_t *nodemask)
603{ 638{
604 unsigned long freed = 0; 639 unsigned long freed = 0;
605 enum oom_constraint constraint; 640 enum oom_constraint constraint;
@@ -609,14 +644,16 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
609 /* Got some memory back in the last second. */ 644 /* Got some memory back in the last second. */
610 return; 645 return;
611 646
612 if (sysctl_panic_on_oom == 2) 647 if (sysctl_panic_on_oom == 2) {
648 dump_header(NULL, gfp_mask, order, NULL);
613 panic("out of memory. Compulsory panic_on_oom is selected.\n"); 649 panic("out of memory. Compulsory panic_on_oom is selected.\n");
650 }
614 651
615 /* 652 /*
616 * Check if there were limitations on the allocation (only relevant for 653 * Check if there were limitations on the allocation (only relevant for
617 * NUMA) that may require different handling. 654 * NUMA) that may require different handling.
618 */ 655 */
619 constraint = constrained_alloc(zonelist, gfp_mask); 656 constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
620 read_lock(&tasklist_lock); 657 read_lock(&tasklist_lock);
621 658
622 switch (constraint) { 659 switch (constraint) {
@@ -626,8 +663,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
626 break; 663 break;
627 664
628 case CONSTRAINT_NONE: 665 case CONSTRAINT_NONE:
629 if (sysctl_panic_on_oom) 666 if (sysctl_panic_on_oom) {
667 dump_header(NULL, gfp_mask, order, NULL);
630 panic("out of memory. panic_on_oom is selected\n"); 668 panic("out of memory. panic_on_oom is selected\n");
669 }
631 /* Fall-through */ 670 /* Fall-through */
632 case CONSTRAINT_CPUSET: 671 case CONSTRAINT_CPUSET:
633 __out_of_memory(gfp_mask, order); 672 __out_of_memory(gfp_mask, order);
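
The reworked constrained_alloc() above reports CONSTRAINT_MEMORY_POLICY when the set of nodes that actually have memory is not a subset of the caller's nodemask. A small user-space analogy of that subset test, with plain bitmasks standing in for nodemask_t and made-up node populations:

#include <stdio.h>

int main(void)
{
	unsigned long nodes_with_memory = 0xf;	/* nodes 0-3 populated (made up) */
	unsigned long policy_nodes = 0x3;	/* mempolicy allows nodes 0-1 */

	/* subset test: every populated node must appear in the policy mask */
	int covers_all = (nodes_with_memory & ~policy_nodes) == 0;

	printf("%s\n", covers_all ? "CONSTRAINT_NONE path"
				  : "CONSTRAINT_MEMORY_POLICY path");
	return 0;
}
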
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2bc2ac63f41e..d2a8889b4c58 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
48#include <linux/page_cgroup.h> 48#include <linux/page_cgroup.h>
49#include <linux/debugobjects.h> 49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h> 50#include <linux/kmemleak.h>
51#include <linux/memory.h>
51#include <trace/events/kmem.h> 52#include <trace/events/kmem.h>
52 53
53#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
@@ -486,7 +487,6 @@ static inline void __free_one_page(struct page *page,
486 zone->free_area[order].nr_free++; 487 zone->free_area[order].nr_free++;
487} 488}
488 489
489#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
490/* 490/*
491 * free_page_mlock() -- clean up attempts to free and mlocked() page. 491 * free_page_mlock() -- clean up attempts to free and mlocked() page.
492 * Page should not be on lru, so no need to fix that up. 492 * Page should not be on lru, so no need to fix that up.
@@ -497,9 +497,6 @@ static inline void free_page_mlock(struct page *page)
497 __dec_zone_page_state(page, NR_MLOCK); 497 __dec_zone_page_state(page, NR_MLOCK);
498 __count_vm_event(UNEVICTABLE_MLOCKFREED); 498 __count_vm_event(UNEVICTABLE_MLOCKFREED);
499} 499}
500#else
501static void free_page_mlock(struct page *page) { }
502#endif
503 500
504static inline int free_pages_check(struct page *page) 501static inline int free_pages_check(struct page *page)
505{ 502{
@@ -1225,10 +1222,10 @@ again:
1225 } 1222 }
1226 spin_lock_irqsave(&zone->lock, flags); 1223 spin_lock_irqsave(&zone->lock, flags);
1227 page = __rmqueue(zone, order, migratetype); 1224 page = __rmqueue(zone, order, migratetype);
1228 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1229 spin_unlock(&zone->lock); 1225 spin_unlock(&zone->lock);
1230 if (!page) 1226 if (!page)
1231 goto failed; 1227 goto failed;
1228 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1232 } 1229 }
1233 1230
1234 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1231 __count_zone_vm_events(PGALLOC, zone, 1 << order);
@@ -1658,12 +1655,22 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1658 if (page) 1655 if (page)
1659 goto out; 1656 goto out;
1660 1657
1661 /* The OOM killer will not help higher order allocs */ 1658 if (!(gfp_mask & __GFP_NOFAIL)) {
1662 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) 1659 /* The OOM killer will not help higher order allocs */
1663 goto out; 1660 if (order > PAGE_ALLOC_COSTLY_ORDER)
1664 1661 goto out;
1662 /*
1663 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
1664 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
1665 * The caller should handle page allocation failure by itself if
1666 * it specifies __GFP_THISNODE.
1667 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
1668 */
1669 if (gfp_mask & __GFP_THISNODE)
1670 goto out;
1671 }
1665 /* Exhausted what can be done so it's blamo time */ 1672 /* Exhausted what can be done so it's blamo time */
1666 out_of_memory(zonelist, gfp_mask, order); 1673 out_of_memory(zonelist, gfp_mask, order, nodemask);
1667 1674
1668out: 1675out:
1669 clear_zonelist_oom(zonelist, gfp_mask); 1676 clear_zonelist_oom(zonelist, gfp_mask);
@@ -2395,13 +2402,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2395{ 2402{
2396 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 2403 char saved_string[NUMA_ZONELIST_ORDER_LEN];
2397 int ret; 2404 int ret;
2405 static DEFINE_MUTEX(zl_order_mutex);
2398 2406
2407 mutex_lock(&zl_order_mutex);
2399 if (write) 2408 if (write)
2400 strncpy(saved_string, (char*)table->data, 2409 strcpy(saved_string, (char*)table->data);
2401 NUMA_ZONELIST_ORDER_LEN);
2402 ret = proc_dostring(table, write, buffer, length, ppos); 2410 ret = proc_dostring(table, write, buffer, length, ppos);
2403 if (ret) 2411 if (ret)
2404 return ret; 2412 goto out;
2405 if (write) { 2413 if (write) {
2406 int oldval = user_zonelist_order; 2414 int oldval = user_zonelist_order;
2407 if (__parse_numa_zonelist_order((char*)table->data)) { 2415 if (__parse_numa_zonelist_order((char*)table->data)) {
@@ -2414,7 +2422,9 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2414 } else if (oldval != user_zonelist_order) 2422 } else if (oldval != user_zonelist_order)
2415 build_all_zonelists(); 2423 build_all_zonelists();
2416 } 2424 }
2417 return 0; 2425out:
2426 mutex_unlock(&zl_order_mutex);
2427 return ret;
2418} 2428}
2419 2429
2420 2430
@@ -3127,7 +3137,7 @@ static int __cpuinit process_zones(int cpu)
3127 3137
3128 if (percpu_pagelist_fraction) 3138 if (percpu_pagelist_fraction)
3129 setup_pagelist_highmark(zone_pcp(zone, cpu), 3139 setup_pagelist_highmark(zone_pcp(zone, cpu),
3130 (zone->present_pages / percpu_pagelist_fraction)); 3140 (zone->present_pages / percpu_pagelist_fraction));
3131 } 3141 }
3132 3142
3133 return 0; 3143 return 0;
@@ -3573,7 +3583,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3573 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3583 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3574 * then all holes in the requested range will be accounted for. 3584 * then all holes in the requested range will be accounted for.
3575 */ 3585 */
3576static unsigned long __meminit __absent_pages_in_range(int nid, 3586unsigned long __meminit __absent_pages_in_range(int nid,
3577 unsigned long range_start_pfn, 3587 unsigned long range_start_pfn,
3578 unsigned long range_end_pfn) 3588 unsigned long range_end_pfn)
3579{ 3589{
@@ -3988,7 +3998,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3988 } 3998 }
3989 3999
3990 /* Merge backward if suitable */ 4000 /* Merge backward if suitable */
3991 if (start_pfn < early_node_map[i].end_pfn && 4001 if (start_pfn < early_node_map[i].start_pfn &&
3992 end_pfn >= early_node_map[i].start_pfn) { 4002 end_pfn >= early_node_map[i].start_pfn) {
3993 early_node_map[i].start_pfn = start_pfn; 4003 early_node_map[i].start_pfn = start_pfn;
3994 return; 4004 return;
@@ -4102,7 +4112,7 @@ static int __init cmp_node_active_region(const void *a, const void *b)
4102} 4112}
4103 4113
4104/* sort the node_map by start_pfn */ 4114/* sort the node_map by start_pfn */
4105static void __init sort_node_map(void) 4115void __init sort_node_map(void)
4106{ 4116{
4107 sort(early_node_map, (size_t)nr_nodemap_entries, 4117 sort(early_node_map, (size_t)nr_nodemap_entries,
4108 sizeof(struct node_active_region), 4118 sizeof(struct node_active_region),
@@ -5002,23 +5012,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5002int set_migratetype_isolate(struct page *page) 5012int set_migratetype_isolate(struct page *page)
5003{ 5013{
5004 struct zone *zone; 5014 struct zone *zone;
5005 unsigned long flags; 5015 struct page *curr_page;
5016 unsigned long flags, pfn, iter;
5017 unsigned long immobile = 0;
5018 struct memory_isolate_notify arg;
5019 int notifier_ret;
5006 int ret = -EBUSY; 5020 int ret = -EBUSY;
5007 int zone_idx; 5021 int zone_idx;
5008 5022
5009 zone = page_zone(page); 5023 zone = page_zone(page);
5010 zone_idx = zone_idx(zone); 5024 zone_idx = zone_idx(zone);
5025
5011 spin_lock_irqsave(&zone->lock, flags); 5026 spin_lock_irqsave(&zone->lock, flags);
5027 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
5028 zone_idx == ZONE_MOVABLE) {
5029 ret = 0;
5030 goto out;
5031 }
5032
5033 pfn = page_to_pfn(page);
5034 arg.start_pfn = pfn;
5035 arg.nr_pages = pageblock_nr_pages;
5036 arg.pages_found = 0;
5037
5012 /* 5038 /*
5013 * In future, more migrate types will be able to be isolation target. 5039 * It may be possible to isolate a pageblock even if the
5040 * migratetype is not MIGRATE_MOVABLE. The memory isolation
5041 * notifier chain is used by balloon drivers to return the
5042 * number of pages in a range that are held by the balloon
5043 * driver to shrink memory. If all the pages are accounted for
5044 * by balloons, are free, or on the LRU, isolation can continue.
5045 * Later, for example, when the memory hotplug notifier runs, these
5046 * pages reported as "can be isolated" should be isolated (freed)
5047 * by the balloon driver through the memory notifier chain.
5014 */ 5048 */
5015 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && 5049 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5016 zone_idx != ZONE_MOVABLE) 5050 notifier_ret = notifier_to_errno(notifier_ret);
5051 if (notifier_ret || !arg.pages_found)
5017 goto out; 5052 goto out;
5018 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5053
5019 move_freepages_block(zone, page, MIGRATE_ISOLATE); 5054 for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
5020 ret = 0; 5055 if (!pfn_valid_within(pfn))
5056 continue;
5057
5058 curr_page = pfn_to_page(iter);
5059 if (!page_count(curr_page) || PageLRU(curr_page))
5060 continue;
5061
5062 immobile++;
5063 }
5064
5065 if (arg.pages_found == immobile)
5066 ret = 0;
5067
5021out: 5068out:
5069 if (!ret) {
5070 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
5071 move_freepages_block(zone, page, MIGRATE_ISOLATE);
5072 }
5073
5022 spin_unlock_irqrestore(&zone->lock, flags); 5074 spin_unlock_irqrestore(&zone->lock, flags);
5023 if (!ret) 5075 if (!ret)
5024 drain_all_pages(); 5076 drain_all_pages();
@@ -5085,3 +5137,24 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5085 spin_unlock_irqrestore(&zone->lock, flags); 5137 spin_unlock_irqrestore(&zone->lock, flags);
5086} 5138}
5087#endif 5139#endif
5140
5141#ifdef CONFIG_MEMORY_FAILURE
5142bool is_free_buddy_page(struct page *page)
5143{
5144 struct zone *zone = page_zone(page);
5145 unsigned long pfn = page_to_pfn(page);
5146 unsigned long flags;
5147 int order;
5148
5149 spin_lock_irqsave(&zone->lock, flags);
5150 for (order = 0; order < MAX_ORDER; order++) {
5151 struct page *page_head = page - (pfn & ((1 << order) - 1));
5152
5153 if (PageBuddy(page_head) && page_order(page_head) >= order)
5154 break;
5155 }
5156 spin_unlock_irqrestore(&zone->lock, flags);
5157
5158 return order < MAX_ORDER;
5159}
5160#endif
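
is_free_buddy_page() above finds the head of each candidate buddy block with "page - (pfn & ((1 << order) - 1))". A tiny stand-alone illustration of that masking for an arbitrary pfn:

#include <stdio.h>

int main(void)
{
	unsigned long pfn = 0x12345;	/* arbitrary page frame number */
	int order;

	for (order = 0; order <= 3; order++) {
		unsigned long head_pfn = pfn - (pfn & ((1UL << order) - 1));
		printf("order %d: candidate block head pfn = 0x%lx\n",
		       order, head_pfn);
	}
	return 0;
}
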
diff --git a/mm/page_io.c b/mm/page_io.c
index c6f3e5071de3..a19af956ee1b 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -19,20 +19,15 @@
19#include <linux/writeback.h> 19#include <linux/writeback.h>
20#include <asm/pgtable.h> 20#include <asm/pgtable.h>
21 21
22static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, 22static struct bio *get_swap_bio(gfp_t gfp_flags,
23 struct page *page, bio_end_io_t end_io) 23 struct page *page, bio_end_io_t end_io)
24{ 24{
25 struct bio *bio; 25 struct bio *bio;
26 26
27 bio = bio_alloc(gfp_flags, 1); 27 bio = bio_alloc(gfp_flags, 1);
28 if (bio) { 28 if (bio) {
29 struct swap_info_struct *sis; 29 bio->bi_sector = map_swap_page(page, &bio->bi_bdev);
30 swp_entry_t entry = { .val = index, }; 30 bio->bi_sector <<= PAGE_SHIFT - 9;
31
32 sis = get_swap_info_struct(swp_type(entry));
33 bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
34 (PAGE_SIZE >> 9);
35 bio->bi_bdev = sis->bdev;
36 bio->bi_io_vec[0].bv_page = page; 31 bio->bi_io_vec[0].bv_page = page;
37 bio->bi_io_vec[0].bv_len = PAGE_SIZE; 32 bio->bi_io_vec[0].bv_len = PAGE_SIZE;
38 bio->bi_io_vec[0].bv_offset = 0; 33 bio->bi_io_vec[0].bv_offset = 0;
@@ -102,8 +97,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
102 unlock_page(page); 97 unlock_page(page);
103 goto out; 98 goto out;
104 } 99 }
105 bio = get_swap_bio(GFP_NOIO, page_private(page), page, 100 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
106 end_swap_bio_write);
107 if (bio == NULL) { 101 if (bio == NULL) {
108 set_page_dirty(page); 102 set_page_dirty(page);
109 unlock_page(page); 103 unlock_page(page);
@@ -127,8 +121,7 @@ int swap_readpage(struct page *page)
127 121
128 VM_BUG_ON(!PageLocked(page)); 122 VM_BUG_ON(!PageLocked(page));
129 VM_BUG_ON(PageUptodate(page)); 123 VM_BUG_ON(PageUptodate(page));
130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page, 124 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
131 end_swap_bio_read);
132 if (bio == NULL) { 125 if (bio == NULL) {
133 unlock_page(page); 126 unlock_page(page);
134 ret = -ENOMEM; 127 ret = -ENOMEM;
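
get_swap_bio() above now takes a page-granular device offset from map_swap_page() and converts it to 512-byte sectors with "<<= PAGE_SHIFT - 9". A quick stand-alone check of that conversion, assuming 4kB pages:

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12;	/* 4kB pages assumed */
	unsigned long swap_slot = 10;	/* page-sized offset on the device */
	unsigned long sector = swap_slot << (page_shift - 9);

	printf("slot %lu -> 512-byte sector %lu\n", swap_slot, sector);
	return 0;
}
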
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index d5878bed7841..7b47a57b6646 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -1,6 +1,7 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/highmem.h> 2#include <linux/highmem.h>
3#include <linux/sched.h> 3#include <linux/sched.h>
4#include <linux/hugetlb.h>
4 5
5static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 6static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
6 struct mm_walk *walk) 7 struct mm_walk *walk)
@@ -107,6 +108,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
107 pgd_t *pgd; 108 pgd_t *pgd;
108 unsigned long next; 109 unsigned long next;
109 int err = 0; 110 int err = 0;
111 struct vm_area_struct *vma;
110 112
111 if (addr >= end) 113 if (addr >= end)
112 return err; 114 return err;
@@ -117,11 +119,38 @@ int walk_page_range(unsigned long addr, unsigned long end,
117 pgd = pgd_offset(walk->mm, addr); 119 pgd = pgd_offset(walk->mm, addr);
118 do { 120 do {
119 next = pgd_addr_end(addr, end); 121 next = pgd_addr_end(addr, end);
122
123 /*
124 * handle hugetlb vmas individually because the pagetable walk for
125 * hugetlb pages is dependent on the architecture and
126 * we can't handle them in the same manner as non-huge pages.
127 */
128 vma = find_vma(walk->mm, addr);
129#ifdef CONFIG_HUGETLB_PAGE
130 if (vma && is_vm_hugetlb_page(vma)) {
131 pte_t *pte;
132 struct hstate *hs;
133
134 if (vma->vm_end < next)
135 next = vma->vm_end;
136 hs = hstate_vma(vma);
137 pte = huge_pte_offset(walk->mm,
138 addr & huge_page_mask(hs));
139 if (pte && !huge_pte_none(huge_ptep_get(pte))
140 && walk->hugetlb_entry)
141 err = walk->hugetlb_entry(pte, addr,
142 next, walk);
143 if (err)
144 break;
145 continue;
146 }
147#endif
120 if (pgd_none_or_clear_bad(pgd)) { 148 if (pgd_none_or_clear_bad(pgd)) {
121 if (walk->pte_hole) 149 if (walk->pte_hole)
122 err = walk->pte_hole(addr, next, walk); 150 err = walk->pte_hole(addr, next, walk);
123 if (err) 151 if (err)
124 break; 152 break;
153 pgd++;
125 continue; 154 continue;
126 } 155 }
127 if (walk->pgd_entry) 156 if (walk->pgd_entry)
@@ -131,7 +160,8 @@ int walk_page_range(unsigned long addr, unsigned long end,
131 err = walk_pud_range(pgd, addr, next, walk); 160 err = walk_pud_range(pgd, addr, next, walk);
132 if (err) 161 if (err)
133 break; 162 break;
134 } while (pgd++, addr = next, addr != end); 163 pgd++;
164 } while (addr = next, addr != end);
135 165
136 return err; 166 return err;
137} 167}
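
walk_page_range() above now hands hugetlb vmas to an optional walk->hugetlb_entry callback. A hedged sketch of a walker that supplies such a callback; the signature is inferred from the call site (pte, addr, next, walk) and the helper names are invented:

#include <linux/mm.h>

/* Count mapped huge pages in a range; illustrative only. */
static int count_huge_entry(pte_t *pte, unsigned long addr,
			    unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	(*count)++;
	return 0;
}

static unsigned long count_mapped_hugepages(struct mm_struct *mm,
					    unsigned long start,
					    unsigned long end)
{
	unsigned long count = 0;
	struct mm_walk walk = {
		.mm		= mm,
		.hugetlb_entry	= count_huge_entry,
		.private	= &count,
	};

	walk_page_range(start, end, &walk);
	return count;
}
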
diff --git a/mm/percpu.c b/mm/percpu.c
index 5adfc268b408..083e7c91e5f6 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -46,8 +46,6 @@
46 * 46 *
47 * To use this allocator, arch code should do the following. 47 * To use this allocator, arch code should do the following.
48 * 48 *
49 * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
50 *
51 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate 49 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
52 * regular address to percpu pointer and back if they need to be 50 * regular address to percpu pointer and back if they need to be
53 * different from the default 51 * different from the default
@@ -74,6 +72,7 @@
74#include <asm/cacheflush.h> 72#include <asm/cacheflush.h>
75#include <asm/sections.h> 73#include <asm/sections.h>
76#include <asm/tlbflush.h> 74#include <asm/tlbflush.h>
75#include <asm/io.h>
77 76
78#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 77#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
79#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 78#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
@@ -1272,7 +1271,7 @@ static void pcpu_reclaim(struct work_struct *work)
1272 */ 1271 */
1273void free_percpu(void *ptr) 1272void free_percpu(void *ptr)
1274{ 1273{
1275 void *addr = __pcpu_ptr_to_addr(ptr); 1274 void *addr;
1276 struct pcpu_chunk *chunk; 1275 struct pcpu_chunk *chunk;
1277 unsigned long flags; 1276 unsigned long flags;
1278 int off; 1277 int off;
@@ -1280,6 +1279,8 @@ void free_percpu(void *ptr)
1280 if (!ptr) 1279 if (!ptr)
1281 return; 1280 return;
1282 1281
1282 addr = __pcpu_ptr_to_addr(ptr);
1283
1283 spin_lock_irqsave(&pcpu_lock, flags); 1284 spin_lock_irqsave(&pcpu_lock, flags);
1284 1285
1285 chunk = pcpu_chunk_addr_search(addr); 1286 chunk = pcpu_chunk_addr_search(addr);
@@ -1302,6 +1303,27 @@ void free_percpu(void *ptr)
1302} 1303}
1303EXPORT_SYMBOL_GPL(free_percpu); 1304EXPORT_SYMBOL_GPL(free_percpu);
1304 1305
1306/**
1307 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
1308 * @addr: the address to be converted to physical address
1309 *
1310 * Given @addr, which is a dereferenceable address obtained via one of
1311 * percpu access macros, this function translates it into its physical
1312 * address. The caller is responsible for ensuring @addr stays valid
1313 * until this function finishes.
1314 *
1315 * RETURNS:
1316 * The physical address for @addr.
1317 */
1318phys_addr_t per_cpu_ptr_to_phys(void *addr)
1319{
1320 if ((unsigned long)addr < VMALLOC_START ||
1321 (unsigned long)addr >= VMALLOC_END)
1322 return __pa(addr);
1323 else
1324 return page_to_phys(vmalloc_to_page(addr));
1325}
1326
1305static inline size_t pcpu_calc_fc_sizes(size_t static_size, 1327static inline size_t pcpu_calc_fc_sizes(size_t static_size,
1306 size_t reserved_size, 1328 size_t reserved_size,
1307 ssize_t *dyn_sizep) 1329 ssize_t *dyn_sizep)
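
per_cpu_ptr_to_phys() above distinguishes percpu addresses in the linear mapping from those backed by vmalloc. A hedged sketch of the intended kind of caller, translating one CPU's slot of a dynamically allocated percpu object into a physical address; the names are invented and error handling is minimal:

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/percpu.h>

/* Hypothetical user, not part of this patch: the object must stay
 * allocated for as long as the returned physical address is in use. */
static u64 *example_notes;

static int example_notes_init(void)
{
	example_notes = alloc_percpu(u64);
	return example_notes ? 0 : -ENOMEM;
}

static phys_addr_t example_notes_phys(int cpu)
{
	return per_cpu_ptr_to_phys(per_cpu_ptr(example_notes, cpu));
}
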
diff --git a/mm/readahead.c b/mm/readahead.c
index aa1aa2345235..033bc135a41f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -547,5 +547,17 @@ page_cache_async_readahead(struct address_space *mapping,
547 547
548 /* do read-ahead */ 548 /* do read-ahead */
549 ondemand_readahead(mapping, ra, filp, true, offset, req_size); 549 ondemand_readahead(mapping, ra, filp, true, offset, req_size);
550
551#ifdef CONFIG_BLOCK
552 /*
553 * Normally the current page is !uptodate and lock_page() will be
554 * immediately called to implicitly unplug the device. However this
555 * is not always true for RAID configurations, where data does not
556 * arrive strictly in submission order. In this case we need to
557 * explicitly kick off the IO.
558 */
559 if (PageUptodate(page))
560 blk_run_backing_dev(mapping->backing_dev_info, NULL);
561#endif
550} 562}
551EXPORT_SYMBOL_GPL(page_cache_async_readahead); 563EXPORT_SYMBOL_GPL(page_cache_async_readahead);
diff --git a/mm/rmap.c b/mm/rmap.c
index dd43373a483f..278cd277bdec 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,6 +49,7 @@
49#include <linux/swapops.h> 49#include <linux/swapops.h>
50#include <linux/slab.h> 50#include <linux/slab.h>
51#include <linux/init.h> 51#include <linux/init.h>
52#include <linux/ksm.h>
52#include <linux/rmap.h> 53#include <linux/rmap.h>
53#include <linux/rcupdate.h> 54#include <linux/rcupdate.h>
54#include <linux/module.h> 55#include <linux/module.h>
@@ -67,7 +68,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
67 return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); 68 return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
68} 69}
69 70
70static inline void anon_vma_free(struct anon_vma *anon_vma) 71void anon_vma_free(struct anon_vma *anon_vma)
71{ 72{
72 kmem_cache_free(anon_vma_cachep, anon_vma); 73 kmem_cache_free(anon_vma_cachep, anon_vma);
73} 74}
@@ -171,7 +172,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
171 list_del(&vma->anon_vma_node); 172 list_del(&vma->anon_vma_node);
172 173
173 /* We must garbage collect the anon_vma if it's empty */ 174 /* We must garbage collect the anon_vma if it's empty */
174 empty = list_empty(&anon_vma->head); 175 empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
175 spin_unlock(&anon_vma->lock); 176 spin_unlock(&anon_vma->lock);
176 177
177 if (empty) 178 if (empty)
@@ -183,6 +184,7 @@ static void anon_vma_ctor(void *data)
183 struct anon_vma *anon_vma = data; 184 struct anon_vma *anon_vma = data;
184 185
185 spin_lock_init(&anon_vma->lock); 186 spin_lock_init(&anon_vma->lock);
187 ksm_refcount_init(anon_vma);
186 INIT_LIST_HEAD(&anon_vma->head); 188 INIT_LIST_HEAD(&anon_vma->head);
187} 189}
188 190
@@ -202,8 +204,8 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
202 unsigned long anon_mapping; 204 unsigned long anon_mapping;
203 205
204 rcu_read_lock(); 206 rcu_read_lock();
205 anon_mapping = (unsigned long) page->mapping; 207 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
206 if (!(anon_mapping & PAGE_MAPPING_ANON)) 208 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
207 goto out; 209 goto out;
208 if (!page_mapped(page)) 210 if (!page_mapped(page))
209 goto out; 211 goto out;
@@ -248,8 +250,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
248unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 250unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
249{ 251{
250 if (PageAnon(page)) { 252 if (PageAnon(page)) {
251 if ((void *)vma->anon_vma != 253 if (vma->anon_vma != page_anon_vma(page))
252 (void *)page->mapping - PAGE_MAPPING_ANON)
253 return -EFAULT; 254 return -EFAULT;
254 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { 255 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
255 if (!vma->vm_file || 256 if (!vma->vm_file ||
@@ -337,21 +338,15 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
337 * Subfunctions of page_referenced: page_referenced_one called 338 * Subfunctions of page_referenced: page_referenced_one called
338 * repeatedly from either page_referenced_anon or page_referenced_file. 339 * repeatedly from either page_referenced_anon or page_referenced_file.
339 */ 340 */
340static int page_referenced_one(struct page *page, 341int page_referenced_one(struct page *page, struct vm_area_struct *vma,
341 struct vm_area_struct *vma, 342 unsigned long address, unsigned int *mapcount,
342 unsigned int *mapcount, 343 unsigned long *vm_flags)
343 unsigned long *vm_flags)
344{ 344{
345 struct mm_struct *mm = vma->vm_mm; 345 struct mm_struct *mm = vma->vm_mm;
346 unsigned long address;
347 pte_t *pte; 346 pte_t *pte;
348 spinlock_t *ptl; 347 spinlock_t *ptl;
349 int referenced = 0; 348 int referenced = 0;
350 349
351 address = vma_address(page, vma);
352 if (address == -EFAULT)
353 goto out;
354
355 pte = page_check_address(page, mm, address, &ptl, 0); 350 pte = page_check_address(page, mm, address, &ptl, 0);
356 if (!pte) 351 if (!pte)
357 goto out; 352 goto out;
@@ -388,9 +383,10 @@ static int page_referenced_one(struct page *page,
388out_unmap: 383out_unmap:
389 (*mapcount)--; 384 (*mapcount)--;
390 pte_unmap_unlock(pte, ptl); 385 pte_unmap_unlock(pte, ptl);
391out: 386
392 if (referenced) 387 if (referenced)
393 *vm_flags |= vma->vm_flags; 388 *vm_flags |= vma->vm_flags;
389out:
394 return referenced; 390 return referenced;
395} 391}
396 392
@@ -409,6 +405,9 @@ static int page_referenced_anon(struct page *page,
409 405
410 mapcount = page_mapcount(page); 406 mapcount = page_mapcount(page);
411 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 407 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
408 unsigned long address = vma_address(page, vma);
409 if (address == -EFAULT)
410 continue;
412 /* 411 /*
413 * If we are reclaiming on behalf of a cgroup, skip 412 * If we are reclaiming on behalf of a cgroup, skip
414 * counting on behalf of references from different 413 * counting on behalf of references from different
@@ -416,7 +415,7 @@ static int page_referenced_anon(struct page *page,
416 */ 415 */
417 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 416 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
418 continue; 417 continue;
419 referenced += page_referenced_one(page, vma, 418 referenced += page_referenced_one(page, vma, address,
420 &mapcount, vm_flags); 419 &mapcount, vm_flags);
421 if (!mapcount) 420 if (!mapcount)
422 break; 421 break;
@@ -474,6 +473,9 @@ static int page_referenced_file(struct page *page,
474 mapcount = page_mapcount(page); 473 mapcount = page_mapcount(page);
475 474
476 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 475 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
476 unsigned long address = vma_address(page, vma);
477 if (address == -EFAULT)
478 continue;
477 /* 479 /*
478 * If we are reclaiming on behalf of a cgroup, skip 480 * If we are reclaiming on behalf of a cgroup, skip
479 * counting on behalf of references from different 481 * counting on behalf of references from different
@@ -481,7 +483,7 @@ static int page_referenced_file(struct page *page,
481 */ 483 */
482 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 484 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
483 continue; 485 continue;
484 referenced += page_referenced_one(page, vma, 486 referenced += page_referenced_one(page, vma, address,
485 &mapcount, vm_flags); 487 &mapcount, vm_flags);
486 if (!mapcount) 488 if (!mapcount)
487 break; 489 break;
@@ -507,46 +509,47 @@ int page_referenced(struct page *page,
507 unsigned long *vm_flags) 509 unsigned long *vm_flags)
508{ 510{
509 int referenced = 0; 511 int referenced = 0;
512 int we_locked = 0;
510 513
511 if (TestClearPageReferenced(page)) 514 if (TestClearPageReferenced(page))
512 referenced++; 515 referenced++;
513 516
514 *vm_flags = 0; 517 *vm_flags = 0;
515 if (page_mapped(page) && page->mapping) { 518 if (page_mapped(page) && page_rmapping(page)) {
516 if (PageAnon(page)) 519 if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
520 we_locked = trylock_page(page);
521 if (!we_locked) {
522 referenced++;
523 goto out;
524 }
525 }
526 if (unlikely(PageKsm(page)))
527 referenced += page_referenced_ksm(page, mem_cont,
528 vm_flags);
529 else if (PageAnon(page))
517 referenced += page_referenced_anon(page, mem_cont, 530 referenced += page_referenced_anon(page, mem_cont,
518 vm_flags); 531 vm_flags);
519 else if (is_locked) 532 else if (page->mapping)
520 referenced += page_referenced_file(page, mem_cont, 533 referenced += page_referenced_file(page, mem_cont,
521 vm_flags); 534 vm_flags);
522 else if (!trylock_page(page)) 535 if (we_locked)
523 referenced++;
524 else {
525 if (page->mapping)
526 referenced += page_referenced_file(page,
527 mem_cont, vm_flags);
528 unlock_page(page); 536 unlock_page(page);
529 }
530 } 537 }
531 538out:
532 if (page_test_and_clear_young(page)) 539 if (page_test_and_clear_young(page))
533 referenced++; 540 referenced++;
534 541
535 return referenced; 542 return referenced;
536} 543}
537 544
538static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) 545static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
546 unsigned long address)
539{ 547{
540 struct mm_struct *mm = vma->vm_mm; 548 struct mm_struct *mm = vma->vm_mm;
541 unsigned long address;
542 pte_t *pte; 549 pte_t *pte;
543 spinlock_t *ptl; 550 spinlock_t *ptl;
544 int ret = 0; 551 int ret = 0;
545 552
546 address = vma_address(page, vma);
547 if (address == -EFAULT)
548 goto out;
549
550 pte = page_check_address(page, mm, address, &ptl, 1); 553 pte = page_check_address(page, mm, address, &ptl, 1);
551 if (!pte) 554 if (!pte)
552 goto out; 555 goto out;
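
Editor's note: the page_referenced() hunk just above takes the page lock with trylock_page() and, when that fails, reports the page as referenced rather than sleeping while holding rmap locks. A minimal user-space sketch of that fallback follows; the pthread mutex and function names are illustrative stand-ins, not kernel API (build with -lpthread).

    /*
     * Editor's sketch, not kernel code: the trylock fallback used by
     * page_referenced() above, with a pthread mutex standing in for
     * the page lock. If the lock cannot be taken without sleeping,
     * the page is simply counted as referenced and skipped.
     */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

    static int page_referenced(void)
    {
            int referenced = 0;

            if (pthread_mutex_trylock(&page_lock) != 0) {
                    /* Can't lock without waiting: err on the side of
                     * keeping the page and report it as referenced. */
                    return 1;
            }
            /* ... the rmap walk would run here, under the lock ... */
            pthread_mutex_unlock(&page_lock);
            return referenced;
    }

    int main(void)
    {
            printf("uncontended: %d\n", page_referenced());
            pthread_mutex_lock(&page_lock);         /* simulate a holder */
            printf("contended:   %d\n", page_referenced());
            pthread_mutex_unlock(&page_lock);
            return 0;
    }
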
@@ -578,8 +581,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
578 581
579 spin_lock(&mapping->i_mmap_lock); 582 spin_lock(&mapping->i_mmap_lock);
580 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 583 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
581 if (vma->vm_flags & VM_SHARED) 584 if (vma->vm_flags & VM_SHARED) {
582 ret += page_mkclean_one(page, vma); 585 unsigned long address = vma_address(page, vma);
586 if (address == -EFAULT)
587 continue;
588 ret += page_mkclean_one(page, vma, address);
589 }
583 } 590 }
584 spin_unlock(&mapping->i_mmap_lock); 591 spin_unlock(&mapping->i_mmap_lock);
585 return ret; 592 return ret;
@@ -620,14 +627,7 @@ static void __page_set_anon_rmap(struct page *page,
620 BUG_ON(!anon_vma); 627 BUG_ON(!anon_vma);
621 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 628 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
622 page->mapping = (struct address_space *) anon_vma; 629 page->mapping = (struct address_space *) anon_vma;
623
624 page->index = linear_page_index(vma, address); 630 page->index = linear_page_index(vma, address);
625
626 /*
627 * nr_mapped state can be updated without turning off
628 * interrupts because it is not modified via interrupt.
629 */
630 __inc_zone_page_state(page, NR_ANON_PAGES);
631} 631}
632 632
633/** 633/**
@@ -665,14 +665,23 @@ static void __page_check_anon_rmap(struct page *page,
665 * @vma: the vm area in which the mapping is added 665 * @vma: the vm area in which the mapping is added
666 * @address: the user virtual address mapped 666 * @address: the user virtual address mapped
667 * 667 *
668 * The caller needs to hold the pte lock and the page must be locked. 668 * The caller needs to hold the pte lock, and the page must be locked in
669 * the anon_vma case: to serialize mapping,index checking after setting,
670 * and to ensure that PageAnon is not being upgraded racily to PageKsm
671 * (but PageKsm is never downgraded to PageAnon).
669 */ 672 */
670void page_add_anon_rmap(struct page *page, 673void page_add_anon_rmap(struct page *page,
671 struct vm_area_struct *vma, unsigned long address) 674 struct vm_area_struct *vma, unsigned long address)
672{ 675{
676 int first = atomic_inc_and_test(&page->_mapcount);
677 if (first)
678 __inc_zone_page_state(page, NR_ANON_PAGES);
679 if (unlikely(PageKsm(page)))
680 return;
681
673 VM_BUG_ON(!PageLocked(page)); 682 VM_BUG_ON(!PageLocked(page));
674 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 683 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
675 if (atomic_inc_and_test(&page->_mapcount)) 684 if (first)
676 __page_set_anon_rmap(page, vma, address); 685 __page_set_anon_rmap(page, vma, address);
677 else 686 else
678 __page_check_anon_rmap(page, vma, address); 687 __page_check_anon_rmap(page, vma, address);
@@ -694,6 +703,7 @@ void page_add_new_anon_rmap(struct page *page,
694 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 703 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
695 SetPageSwapBacked(page); 704 SetPageSwapBacked(page);
696 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 705 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
706 __inc_zone_page_state(page, NR_ANON_PAGES);
697 __page_set_anon_rmap(page, vma, address); 707 __page_set_anon_rmap(page, vma, address);
698 if (page_evictable(page, vma)) 708 if (page_evictable(page, vma))
699 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 709 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
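
Editor's note: the page_add_anon_rmap()/page_add_new_anon_rmap() hunks above depend on _mapcount starting at -1, so a single atomic increment-and-test flags the first mapping and the NR_ANON_PAGES accounting runs exactly once. A small C11 sketch of that idiom, with illustrative names rather than kernel API:

    /* Editor's sketch: user-space model of the "count starts at -1"
     * idiom used by page->_mapcount above. */
    #include <stdatomic.h>
    #include <stdio.h>

    struct fake_page {
            atomic_int mapcount;    /* -1 means "not mapped anywhere" */
    };

    static void page_init(struct fake_page *p)
    {
            atomic_init(&p->mapcount, -1);
    }

    /* True only for the mapping that moves the count from -1 to 0,
     * mirroring atomic_inc_and_test() on page->_mapcount. */
    static int page_add_mapping(struct fake_page *p)
    {
            return atomic_fetch_add(&p->mapcount, 1) == -1;
    }

    int main(void)
    {
            struct fake_page page;
            int i;

            page_init(&page);
            for (i = 0; i < 3; i++)
                    if (page_add_mapping(&page))
                            printf("mapping %d is the first: account it once\n", i);
            return 0;
    }
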
@@ -711,7 +721,7 @@ void page_add_file_rmap(struct page *page)
711{ 721{
712 if (atomic_inc_and_test(&page->_mapcount)) { 722 if (atomic_inc_and_test(&page->_mapcount)) {
713 __inc_zone_page_state(page, NR_FILE_MAPPED); 723 __inc_zone_page_state(page, NR_FILE_MAPPED);
714 mem_cgroup_update_mapped_file_stat(page, 1); 724 mem_cgroup_update_file_mapped(page, 1);
715 } 725 }
716} 726}
717 727
@@ -743,8 +753,8 @@ void page_remove_rmap(struct page *page)
743 __dec_zone_page_state(page, NR_ANON_PAGES); 753 __dec_zone_page_state(page, NR_ANON_PAGES);
744 } else { 754 } else {
745 __dec_zone_page_state(page, NR_FILE_MAPPED); 755 __dec_zone_page_state(page, NR_FILE_MAPPED);
756 mem_cgroup_update_file_mapped(page, -1);
746 } 757 }
747 mem_cgroup_update_mapped_file_stat(page, -1);
748 /* 758 /*
749 * It would be tidy to reset the PageAnon mapping here, 759 * It would be tidy to reset the PageAnon mapping here,
750 * but that might overwrite a racing page_add_anon_rmap 760 * but that might overwrite a racing page_add_anon_rmap
@@ -760,20 +770,15 @@ void page_remove_rmap(struct page *page)
760 * Subfunctions of try_to_unmap: try_to_unmap_one called 770 * Subfunctions of try_to_unmap: try_to_unmap_one called
761 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 771 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
762 */ 772 */
763static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 773int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
764 enum ttu_flags flags) 774 unsigned long address, enum ttu_flags flags)
765{ 775{
766 struct mm_struct *mm = vma->vm_mm; 776 struct mm_struct *mm = vma->vm_mm;
767 unsigned long address;
768 pte_t *pte; 777 pte_t *pte;
769 pte_t pteval; 778 pte_t pteval;
770 spinlock_t *ptl; 779 spinlock_t *ptl;
771 int ret = SWAP_AGAIN; 780 int ret = SWAP_AGAIN;
772 781
773 address = vma_address(page, vma);
774 if (address == -EFAULT)
775 goto out;
776
777 pte = page_check_address(page, mm, address, &ptl, 0); 782 pte = page_check_address(page, mm, address, &ptl, 0);
778 if (!pte) 783 if (!pte)
779 goto out; 784 goto out;
@@ -784,10 +789,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
784 * skipped over this mm) then we should reactivate it. 789 * skipped over this mm) then we should reactivate it.
785 */ 790 */
786 if (!(flags & TTU_IGNORE_MLOCK)) { 791 if (!(flags & TTU_IGNORE_MLOCK)) {
787 if (vma->vm_flags & VM_LOCKED) { 792 if (vma->vm_flags & VM_LOCKED)
788 ret = SWAP_MLOCK; 793 goto out_mlock;
794
795 if (TTU_ACTION(flags) == TTU_MUNLOCK)
789 goto out_unmap; 796 goto out_unmap;
790 }
791 } 797 }
792 if (!(flags & TTU_IGNORE_ACCESS)) { 798 if (!(flags & TTU_IGNORE_ACCESS)) {
793 if (ptep_clear_flush_young_notify(vma, address, pte)) { 799 if (ptep_clear_flush_young_notify(vma, address, pte)) {
@@ -822,7 +828,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
822 * Store the swap location in the pte. 828 * Store the swap location in the pte.
823 * See handle_pte_fault() ... 829 * See handle_pte_fault() ...
824 */ 830 */
825 swap_duplicate(entry); 831 if (swap_duplicate(entry) < 0) {
832 set_pte_at(mm, address, pte, pteval);
833 ret = SWAP_FAIL;
834 goto out_unmap;
835 }
826 if (list_empty(&mm->mmlist)) { 836 if (list_empty(&mm->mmlist)) {
827 spin_lock(&mmlist_lock); 837 spin_lock(&mmlist_lock);
828 if (list_empty(&mm->mmlist)) 838 if (list_empty(&mm->mmlist))
@@ -849,7 +859,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
849 } else 859 } else
850 dec_mm_counter(mm, file_rss); 860 dec_mm_counter(mm, file_rss);
851 861
852
853 page_remove_rmap(page); 862 page_remove_rmap(page);
854 page_cache_release(page); 863 page_cache_release(page);
855 864
@@ -857,6 +866,27 @@ out_unmap:
857 pte_unmap_unlock(pte, ptl); 866 pte_unmap_unlock(pte, ptl);
858out: 867out:
859 return ret; 868 return ret;
869
870out_mlock:
871 pte_unmap_unlock(pte, ptl);
872
873
874 /*
875 * We need mmap_sem locking; otherwise the VM_LOCKED check is
876 * racy and can give an unstable result. Plus, we can't wait here
877 * because we now hold anon_vma->lock or mapping->i_mmap_lock.
878 * If the trylock fails, the page stays on the evictable lru and
879 * vmscan may later retry moving it to the unevictable lru if the
880 * page is actually mlocked.
881 */
882 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
883 if (vma->vm_flags & VM_LOCKED) {
884 mlock_vma_page(page);
885 ret = SWAP_MLOCK;
886 }
887 up_read(&vma->vm_mm->mmap_sem);
888 }
889 return ret;
860} 890}
861 891
862/* 892/*
@@ -922,11 +952,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
922 return ret; 952 return ret;
923 953
924 /* 954 /*
925 * MLOCK_PAGES => feature is configured. 955 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
926 * if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
927 * keep the sem while scanning the cluster for mlocking pages. 956 * keep the sem while scanning the cluster for mlocking pages.
928 */ 957 */
929 if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { 958 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
930 locked_vma = (vma->vm_flags & VM_LOCKED); 959 locked_vma = (vma->vm_flags & VM_LOCKED);
931 if (!locked_vma) 960 if (!locked_vma)
932 up_read(&vma->vm_mm->mmap_sem); /* don't need it */ 961 up_read(&vma->vm_mm->mmap_sem); /* don't need it */
@@ -976,29 +1005,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
976 return ret; 1005 return ret;
977} 1006}
978 1007
979/*
980 * common handling for pages mapped in VM_LOCKED vmas
981 */
982static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
983{
984 int mlocked = 0;
985
986 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
987 if (vma->vm_flags & VM_LOCKED) {
988 mlock_vma_page(page);
989 mlocked++; /* really mlocked the page */
990 }
991 up_read(&vma->vm_mm->mmap_sem);
992 }
993 return mlocked;
994}
995
996/** 1008/**
997 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based 1009 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
998 * rmap method 1010 * rmap method
999 * @page: the page to unmap/unlock 1011 * @page: the page to unmap/unlock
1000 * @unlock: request for unlock rather than unmap [unlikely] 1012 * @flags: action and flags
1001 * @migration: unmapping for migration - ignored if @unlock
1002 * 1013 *
1003 * Find all the mappings of a page using the mapping pointer and the vma chains 1014 * Find all the mappings of a page using the mapping pointer and the vma chains
1004 * contained in the anon_vma struct it points to. 1015 * contained in the anon_vma struct it points to.
@@ -1014,42 +1025,22 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1014{ 1025{
1015 struct anon_vma *anon_vma; 1026 struct anon_vma *anon_vma;
1016 struct vm_area_struct *vma; 1027 struct vm_area_struct *vma;
1017 unsigned int mlocked = 0;
1018 int ret = SWAP_AGAIN; 1028 int ret = SWAP_AGAIN;
1019 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1020
1021 if (MLOCK_PAGES && unlikely(unlock))
1022 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
1023 1029
1024 anon_vma = page_lock_anon_vma(page); 1030 anon_vma = page_lock_anon_vma(page);
1025 if (!anon_vma) 1031 if (!anon_vma)
1026 return ret; 1032 return ret;
1027 1033
1028 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1034 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
1029 if (MLOCK_PAGES && unlikely(unlock)) { 1035 unsigned long address = vma_address(page, vma);
1030 if (!((vma->vm_flags & VM_LOCKED) && 1036 if (address == -EFAULT)
1031 page_mapped_in_vma(page, vma))) 1037 continue;
1032 continue; /* must visit all unlocked vmas */ 1038 ret = try_to_unmap_one(page, vma, address, flags);
1033 ret = SWAP_MLOCK; /* saw at least one mlocked vma */ 1039 if (ret != SWAP_AGAIN || !page_mapped(page))
1034 } else { 1040 break;
1035 ret = try_to_unmap_one(page, vma, flags);
1036 if (ret == SWAP_FAIL || !page_mapped(page))
1037 break;
1038 }
1039 if (ret == SWAP_MLOCK) {
1040 mlocked = try_to_mlock_page(page, vma);
1041 if (mlocked)
1042 break; /* stop if actually mlocked page */
1043 }
1044 } 1041 }
1045 1042
1046 page_unlock_anon_vma(anon_vma); 1043 page_unlock_anon_vma(anon_vma);
1047
1048 if (mlocked)
1049 ret = SWAP_MLOCK; /* actually mlocked the page */
1050 else if (ret == SWAP_MLOCK)
1051 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
1052
1053 return ret; 1044 return ret;
1054} 1045}
1055 1046
@@ -1079,48 +1070,30 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1079 unsigned long max_nl_cursor = 0; 1070 unsigned long max_nl_cursor = 0;
1080 unsigned long max_nl_size = 0; 1071 unsigned long max_nl_size = 0;
1081 unsigned int mapcount; 1072 unsigned int mapcount;
1082 unsigned int mlocked = 0;
1083 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1084
1085 if (MLOCK_PAGES && unlikely(unlock))
1086 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
1087 1073
1088 spin_lock(&mapping->i_mmap_lock); 1074 spin_lock(&mapping->i_mmap_lock);
1089 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1075 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1090 if (MLOCK_PAGES && unlikely(unlock)) { 1076 unsigned long address = vma_address(page, vma);
1091 if (!((vma->vm_flags & VM_LOCKED) && 1077 if (address == -EFAULT)
1092 page_mapped_in_vma(page, vma))) 1078 continue;
1093 continue; /* must visit all vmas */ 1079 ret = try_to_unmap_one(page, vma, address, flags);
1094 ret = SWAP_MLOCK; 1080 if (ret != SWAP_AGAIN || !page_mapped(page))
1095 } else { 1081 goto out;
1096 ret = try_to_unmap_one(page, vma, flags);
1097 if (ret == SWAP_FAIL || !page_mapped(page))
1098 goto out;
1099 }
1100 if (ret == SWAP_MLOCK) {
1101 mlocked = try_to_mlock_page(page, vma);
1102 if (mlocked)
1103 break; /* stop if actually mlocked page */
1104 }
1105 } 1082 }
1106 1083
1107 if (mlocked) 1084 if (list_empty(&mapping->i_mmap_nonlinear))
1108 goto out; 1085 goto out;
1109 1086
1110 if (list_empty(&mapping->i_mmap_nonlinear)) 1087 /*
1088 * We don't bother to try to find the munlocked page in nonlinears.
1089 * It's costly. Instead, later, page reclaim logic may call
1090 * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
1091 */
1092 if (TTU_ACTION(flags) == TTU_MUNLOCK)
1111 goto out; 1093 goto out;
1112 1094
1113 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1095 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1114 shared.vm_set.list) { 1096 shared.vm_set.list) {
1115 if (MLOCK_PAGES && unlikely(unlock)) {
1116 if (!(vma->vm_flags & VM_LOCKED))
1117 continue; /* must visit all vmas */
1118 ret = SWAP_MLOCK; /* leave mlocked == 0 */
1119 goto out; /* no need to look further */
1120 }
1121 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1122 (vma->vm_flags & VM_LOCKED))
1123 continue;
1124 cursor = (unsigned long) vma->vm_private_data; 1097 cursor = (unsigned long) vma->vm_private_data;
1125 if (cursor > max_nl_cursor) 1098 if (cursor > max_nl_cursor)
1126 max_nl_cursor = cursor; 1099 max_nl_cursor = cursor;
@@ -1153,16 +1126,12 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1153 do { 1126 do {
1154 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1127 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1155 shared.vm_set.list) { 1128 shared.vm_set.list) {
1156 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1157 (vma->vm_flags & VM_LOCKED))
1158 continue;
1159 cursor = (unsigned long) vma->vm_private_data; 1129 cursor = (unsigned long) vma->vm_private_data;
1160 while ( cursor < max_nl_cursor && 1130 while ( cursor < max_nl_cursor &&
1161 cursor < vma->vm_end - vma->vm_start) { 1131 cursor < vma->vm_end - vma->vm_start) {
1162 ret = try_to_unmap_cluster(cursor, &mapcount, 1132 if (try_to_unmap_cluster(cursor, &mapcount,
1163 vma, page); 1133 vma, page) == SWAP_MLOCK)
1164 if (ret == SWAP_MLOCK) 1134 ret = SWAP_MLOCK;
1165 mlocked = 2; /* to return below */
1166 cursor += CLUSTER_SIZE; 1135 cursor += CLUSTER_SIZE;
1167 vma->vm_private_data = (void *) cursor; 1136 vma->vm_private_data = (void *) cursor;
1168 if ((int)mapcount <= 0) 1137 if ((int)mapcount <= 0)
@@ -1183,10 +1152,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1183 vma->vm_private_data = NULL; 1152 vma->vm_private_data = NULL;
1184out: 1153out:
1185 spin_unlock(&mapping->i_mmap_lock); 1154 spin_unlock(&mapping->i_mmap_lock);
1186 if (mlocked)
1187 ret = SWAP_MLOCK; /* actually mlocked the page */
1188 else if (ret == SWAP_MLOCK)
1189 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
1190 return ret; 1155 return ret;
1191} 1156}
1192 1157
@@ -1210,7 +1175,9 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1210 1175
1211 BUG_ON(!PageLocked(page)); 1176 BUG_ON(!PageLocked(page));
1212 1177
1213 if (PageAnon(page)) 1178 if (unlikely(PageKsm(page)))
1179 ret = try_to_unmap_ksm(page, flags);
1180 else if (PageAnon(page))
1214 ret = try_to_unmap_anon(page, flags); 1181 ret = try_to_unmap_anon(page, flags);
1215 else 1182 else
1216 ret = try_to_unmap_file(page, flags); 1183 ret = try_to_unmap_file(page, flags);
@@ -1229,17 +1196,98 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1229 * 1196 *
1230 * Return values are: 1197 * Return values are:
1231 * 1198 *
1232 * SWAP_SUCCESS - no vma's holding page mlocked. 1199 * SWAP_AGAIN - no vma is holding page mlocked, or,
1233 * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem 1200 * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
1201 * SWAP_FAIL - page cannot be located at present
1234 * SWAP_MLOCK - page is now mlocked. 1202 * SWAP_MLOCK - page is now mlocked.
1235 */ 1203 */
1236int try_to_munlock(struct page *page) 1204int try_to_munlock(struct page *page)
1237{ 1205{
1238 VM_BUG_ON(!PageLocked(page) || PageLRU(page)); 1206 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1239 1207
1240 if (PageAnon(page)) 1208 if (unlikely(PageKsm(page)))
1209 return try_to_unmap_ksm(page, TTU_MUNLOCK);
1210 else if (PageAnon(page))
1241 return try_to_unmap_anon(page, TTU_MUNLOCK); 1211 return try_to_unmap_anon(page, TTU_MUNLOCK);
1242 else 1212 else
1243 return try_to_unmap_file(page, TTU_MUNLOCK); 1213 return try_to_unmap_file(page, TTU_MUNLOCK);
1244} 1214}
1245 1215
1216#ifdef CONFIG_MIGRATION
1217/*
1218 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
1219 * Called by migrate.c to remove migration ptes, but might be used more later.
1220 */
1221static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1222 struct vm_area_struct *, unsigned long, void *), void *arg)
1223{
1224 struct anon_vma *anon_vma;
1225 struct vm_area_struct *vma;
1226 int ret = SWAP_AGAIN;
1227
1228 /*
1229 * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
1230 * because that depends on page_mapped(); but not all its usages
1231 * are holding mmap_sem, which also gave the necessary guarantee
1232 * (that this anon_vma's slab has not already been destroyed).
1233 * This needs to be reviewed later: avoiding page_lock_anon_vma()
1234 * is risky, and currently limits the usefulness of rmap_walk().
1235 */
1236 anon_vma = page_anon_vma(page);
1237 if (!anon_vma)
1238 return ret;
1239 spin_lock(&anon_vma->lock);
1240 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
1241 unsigned long address = vma_address(page, vma);
1242 if (address == -EFAULT)
1243 continue;
1244 ret = rmap_one(page, vma, address, arg);
1245 if (ret != SWAP_AGAIN)
1246 break;
1247 }
1248 spin_unlock(&anon_vma->lock);
1249 return ret;
1250}
1251
1252static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
1253 struct vm_area_struct *, unsigned long, void *), void *arg)
1254{
1255 struct address_space *mapping = page->mapping;
1256 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1257 struct vm_area_struct *vma;
1258 struct prio_tree_iter iter;
1259 int ret = SWAP_AGAIN;
1260
1261 if (!mapping)
1262 return ret;
1263 spin_lock(&mapping->i_mmap_lock);
1264 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1265 unsigned long address = vma_address(page, vma);
1266 if (address == -EFAULT)
1267 continue;
1268 ret = rmap_one(page, vma, address, arg);
1269 if (ret != SWAP_AGAIN)
1270 break;
1271 }
1272 /*
1273 * No nonlinear handling: being always shared, nonlinear vmas
1274 * never contain migration ptes. Decide what to do about this
1275 * limitation to linear when we need rmap_walk() on nonlinear.
1276 */
1277 spin_unlock(&mapping->i_mmap_lock);
1278 return ret;
1279}
1280
1281int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
1282 struct vm_area_struct *, unsigned long, void *), void *arg)
1283{
1284 VM_BUG_ON(!PageLocked(page));
1285
1286 if (unlikely(PageKsm(page)))
1287 return rmap_walk_ksm(page, rmap_one, arg);
1288 else if (PageAnon(page))
1289 return rmap_walk_anon(page, rmap_one, arg);
1290 else
1291 return rmap_walk_file(page, rmap_one, arg);
1292}
1293#endif /* CONFIG_MIGRATION */
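
Editor's note: rmap_walk() above hands every (vma, address) mapping of the page to a caller-supplied callback and stops as soon as the callback returns something other than SWAP_AGAIN; migrate.c uses it to remove migration ptes. A rough user-space model of that walk-with-callback shape, with an array standing in for the anon_vma list and purely illustrative names:

    #include <stdio.h>

    #define SWAP_AGAIN 1

    struct mapping { const char *vma; unsigned long address; };

    static struct mapping mappings[] = {
            { "vma-a", 0x1000 }, { "vma-b", 0x2000 }, { "vma-c", 0x3000 },
    };

    static int rmap_walk(int (*rmap_one)(struct mapping *, void *), void *arg)
    {
            int ret = SWAP_AGAIN;
            unsigned int i;

            for (i = 0; i < sizeof(mappings) / sizeof(*mappings); i++) {
                    ret = rmap_one(&mappings[i], arg);
                    if (ret != SWAP_AGAIN)
                            break;  /* callback asked us to stop */
            }
            return ret;
    }

    static int print_one(struct mapping *m, void *arg)
    {
            unsigned long stop_at = *(unsigned long *)arg;

            printf("visit %s @ %#lx\n", m->vma, m->address);
            return m->address == stop_at ? 0 : SWAP_AGAIN;
    }

    int main(void)
    {
            unsigned long stop_at = 0x2000;

            rmap_walk(print_one, &stop_at);
            return 0;
    }
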
diff --git a/mm/shmem.c b/mm/shmem.c
index 356dd99566ec..eef4ebea5158 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -29,7 +29,6 @@
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/swap.h> 31#include <linux/swap.h>
32#include <linux/ima.h>
33 32
34static struct vfsmount *shm_mnt; 33static struct vfsmount *shm_mnt;
35 34
@@ -42,6 +41,7 @@ static struct vfsmount *shm_mnt;
42 41
43#include <linux/xattr.h> 42#include <linux/xattr.h>
44#include <linux/exportfs.h> 43#include <linux/exportfs.h>
44#include <linux/posix_acl.h>
45#include <linux/generic_acl.h> 45#include <linux/generic_acl.h>
46#include <linux/mman.h> 46#include <linux/mman.h>
47#include <linux/string.h> 47#include <linux/string.h>
@@ -810,7 +810,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
810 error = inode_setattr(inode, attr); 810 error = inode_setattr(inode, attr);
811#ifdef CONFIG_TMPFS_POSIX_ACL 811#ifdef CONFIG_TMPFS_POSIX_ACL
812 if (!error && (attr->ia_valid & ATTR_MODE)) 812 if (!error && (attr->ia_valid & ATTR_MODE))
813 error = generic_acl_chmod(inode, &shmem_acl_ops); 813 error = generic_acl_chmod(inode);
814#endif 814#endif
815 if (page) 815 if (page)
816 page_cache_release(page); 816 page_cache_release(page);
@@ -1017,7 +1017,14 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
1017 goto out; 1017 goto out;
1018 } 1018 }
1019 mutex_unlock(&shmem_swaplist_mutex); 1019 mutex_unlock(&shmem_swaplist_mutex);
1020out: return found; /* 0 or 1 or -ENOMEM */ 1020 /*
1021 * Can some race bring us here? We've been holding page lock,
1022 * so I think not; but would rather try again later than BUG()
1023 */
1024 unlock_page(page);
1025 page_cache_release(page);
1026out:
1027 return (found < 0) ? found : 0;
1021} 1028}
1022 1029
1023/* 1030/*
@@ -1080,7 +1087,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1080 else 1087 else
1081 inode = NULL; 1088 inode = NULL;
1082 spin_unlock(&info->lock); 1089 spin_unlock(&info->lock);
1083 swap_duplicate(swap); 1090 swap_shmem_alloc(swap);
1084 BUG_ON(page_mapped(page)); 1091 BUG_ON(page_mapped(page));
1085 page_cache_release(page); /* pagecache ref */ 1092 page_cache_release(page); /* pagecache ref */
1086 swap_writepage(page, wbc); 1093 swap_writepage(page, wbc);
@@ -1817,11 +1824,15 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1817 return error; 1824 return error;
1818 } 1825 }
1819 } 1826 }
1820 error = shmem_acl_init(inode, dir); 1827#ifdef CONFIG_TMPFS_POSIX_ACL
1828 error = generic_acl_init(inode, dir);
1821 if (error) { 1829 if (error) {
1822 iput(inode); 1830 iput(inode);
1823 return error; 1831 return error;
1824 } 1832 }
1833#else
1834 error = 0;
1835#endif
1825 if (dir->i_mode & S_ISGID) { 1836 if (dir->i_mode & S_ISGID) {
1826 inode->i_gid = dir->i_gid; 1837 inode->i_gid = dir->i_gid;
1827 if (S_ISDIR(mode)) 1838 if (S_ISDIR(mode))
@@ -2036,27 +2047,28 @@ static const struct inode_operations shmem_symlink_inode_operations = {
2036 * filesystem level, though. 2047 * filesystem level, though.
2037 */ 2048 */
2038 2049
2039static size_t shmem_xattr_security_list(struct inode *inode, char *list, 2050static size_t shmem_xattr_security_list(struct dentry *dentry, char *list,
2040 size_t list_len, const char *name, 2051 size_t list_len, const char *name,
2041 size_t name_len) 2052 size_t name_len, int handler_flags)
2042{ 2053{
2043 return security_inode_listsecurity(inode, list, list_len); 2054 return security_inode_listsecurity(dentry->d_inode, list, list_len);
2044} 2055}
2045 2056
2046static int shmem_xattr_security_get(struct inode *inode, const char *name, 2057static int shmem_xattr_security_get(struct dentry *dentry, const char *name,
2047 void *buffer, size_t size) 2058 void *buffer, size_t size, int handler_flags)
2048{ 2059{
2049 if (strcmp(name, "") == 0) 2060 if (strcmp(name, "") == 0)
2050 return -EINVAL; 2061 return -EINVAL;
2051 return xattr_getsecurity(inode, name, buffer, size); 2062 return xattr_getsecurity(dentry->d_inode, name, buffer, size);
2052} 2063}
2053 2064
2054static int shmem_xattr_security_set(struct inode *inode, const char *name, 2065static int shmem_xattr_security_set(struct dentry *dentry, const char *name,
2055 const void *value, size_t size, int flags) 2066 const void *value, size_t size, int flags, int handler_flags)
2056{ 2067{
2057 if (strcmp(name, "") == 0) 2068 if (strcmp(name, "") == 0)
2058 return -EINVAL; 2069 return -EINVAL;
2059 return security_inode_setsecurity(inode, name, value, size, flags); 2070 return security_inode_setsecurity(dentry->d_inode, name, value,
2071 size, flags);
2060} 2072}
2061 2073
2062static struct xattr_handler shmem_xattr_security_handler = { 2074static struct xattr_handler shmem_xattr_security_handler = {
@@ -2067,8 +2079,8 @@ static struct xattr_handler shmem_xattr_security_handler = {
2067}; 2079};
2068 2080
2069static struct xattr_handler *shmem_xattr_handlers[] = { 2081static struct xattr_handler *shmem_xattr_handlers[] = {
2070 &shmem_xattr_acl_access_handler, 2082 &generic_acl_access_handler,
2071 &shmem_xattr_acl_default_handler, 2083 &generic_acl_default_handler,
2072 &shmem_xattr_security_handler, 2084 &shmem_xattr_security_handler,
2073 NULL 2085 NULL
2074}; 2086};
@@ -2447,7 +2459,7 @@ static const struct inode_operations shmem_inode_operations = {
2447 .getxattr = generic_getxattr, 2459 .getxattr = generic_getxattr,
2448 .listxattr = generic_listxattr, 2460 .listxattr = generic_listxattr,
2449 .removexattr = generic_removexattr, 2461 .removexattr = generic_removexattr,
2450 .check_acl = shmem_check_acl, 2462 .check_acl = generic_check_acl,
2451#endif 2463#endif
2452 2464
2453}; 2465};
@@ -2470,7 +2482,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2470 .getxattr = generic_getxattr, 2482 .getxattr = generic_getxattr,
2471 .listxattr = generic_listxattr, 2483 .listxattr = generic_listxattr,
2472 .removexattr = generic_removexattr, 2484 .removexattr = generic_removexattr,
2473 .check_acl = shmem_check_acl, 2485 .check_acl = generic_check_acl,
2474#endif 2486#endif
2475}; 2487};
2476 2488
@@ -2481,7 +2493,7 @@ static const struct inode_operations shmem_special_inode_operations = {
2481 .getxattr = generic_getxattr, 2493 .getxattr = generic_getxattr,
2482 .listxattr = generic_listxattr, 2494 .listxattr = generic_listxattr,
2483 .removexattr = generic_removexattr, 2495 .removexattr = generic_removexattr,
2484 .check_acl = shmem_check_acl, 2496 .check_acl = generic_check_acl,
2485#endif 2497#endif
2486}; 2498};
2487 2499
@@ -2619,7 +2631,8 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2619 int error; 2631 int error;
2620 struct file *file; 2632 struct file *file;
2621 struct inode *inode; 2633 struct inode *inode;
2622 struct dentry *dentry, *root; 2634 struct path path;
2635 struct dentry *root;
2623 struct qstr this; 2636 struct qstr this;
2624 2637
2625 if (IS_ERR(shm_mnt)) 2638 if (IS_ERR(shm_mnt))
@@ -2636,38 +2649,35 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2636 this.len = strlen(name); 2649 this.len = strlen(name);
2637 this.hash = 0; /* will go */ 2650 this.hash = 0; /* will go */
2638 root = shm_mnt->mnt_root; 2651 root = shm_mnt->mnt_root;
2639 dentry = d_alloc(root, &this); 2652 path.dentry = d_alloc(root, &this);
2640 if (!dentry) 2653 if (!path.dentry)
2641 goto put_memory; 2654 goto put_memory;
2642 2655 path.mnt = mntget(shm_mnt);
2643 error = -ENFILE;
2644 file = get_empty_filp();
2645 if (!file)
2646 goto put_dentry;
2647 2656
2648 error = -ENOSPC; 2657 error = -ENOSPC;
2649 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); 2658 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags);
2650 if (!inode) 2659 if (!inode)
2651 goto close_file; 2660 goto put_dentry;
2652 2661
2653 d_instantiate(dentry, inode); 2662 d_instantiate(path.dentry, inode);
2654 inode->i_size = size; 2663 inode->i_size = size;
2655 inode->i_nlink = 0; /* It is unlinked */ 2664 inode->i_nlink = 0; /* It is unlinked */
2656 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
2657 &shmem_file_operations);
2658
2659#ifndef CONFIG_MMU 2665#ifndef CONFIG_MMU
2660 error = ramfs_nommu_expand_for_mapping(inode, size); 2666 error = ramfs_nommu_expand_for_mapping(inode, size);
2661 if (error) 2667 if (error)
2662 goto close_file; 2668 goto put_dentry;
2663#endif 2669#endif
2664 ima_counts_get(file); 2670
2671 error = -ENFILE;
2672 file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
2673 &shmem_file_operations);
2674 if (!file)
2675 goto put_dentry;
2676
2665 return file; 2677 return file;
2666 2678
2667close_file:
2668 put_filp(file);
2669put_dentry: 2679put_dentry:
2670 dput(dentry); 2680 path_put(&path);
2671put_memory: 2681put_memory:
2672 shmem_unacct_size(flags, size); 2682 shmem_unacct_size(flags, size);
2673 return ERR_PTR(error); 2683 return ERR_PTR(error);
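
Editor's note: the reworked shmem_file_setup() above follows the usual acquire-in-order, unwind-with-goto style; creating the inode before the file means each error label only releases what already exists (put_dentry, then put_memory). A stripped-down user-space sketch of that unwinding pattern, using malloc/free and invented names rather than any kernel API:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static char *setup(const char *name)
    {
            char *path, *inode, *file;

            path = strdup(name);
            if (!path)
                    goto out;

            inode = malloc(64);
            if (!inode)
                    goto put_path;

            file = malloc(64);
            if (!file)
                    goto put_inode;

            snprintf(file, 64, "file for %s", path);
            /* Success: only the object handed back stays allocated. */
            free(inode);
            free(path);
            return file;

    put_inode:
            free(inode);
    put_path:
            free(path);
    out:
            return NULL;
    }

    int main(void)
    {
            char *f = setup("shm-demo");

            if (f) {
                    puts(f);
                    free(f);
            }
            return 0;
    }
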
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
deleted file mode 100644
index df2c87fdae50..000000000000
--- a/mm/shmem_acl.c
+++ /dev/null
@@ -1,171 +0,0 @@
1/*
2 * mm/shmem_acl.c
3 *
4 * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
5 *
6 * This file is released under the GPL.
7 */
8
9#include <linux/fs.h>
10#include <linux/shmem_fs.h>
11#include <linux/xattr.h>
12#include <linux/generic_acl.h>
13
14/**
15 * shmem_get_acl - generic_acl_operations->getacl() operation
16 */
17static struct posix_acl *
18shmem_get_acl(struct inode *inode, int type)
19{
20 struct posix_acl *acl = NULL;
21
22 spin_lock(&inode->i_lock);
23 switch(type) {
24 case ACL_TYPE_ACCESS:
25 acl = posix_acl_dup(inode->i_acl);
26 break;
27
28 case ACL_TYPE_DEFAULT:
29 acl = posix_acl_dup(inode->i_default_acl);
30 break;
31 }
32 spin_unlock(&inode->i_lock);
33
34 return acl;
35}
36
37/**
38 * shmem_set_acl - generic_acl_operations->setacl() operation
39 */
40static void
41shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
42{
43 struct posix_acl *free = NULL;
44
45 spin_lock(&inode->i_lock);
46 switch(type) {
47 case ACL_TYPE_ACCESS:
48 free = inode->i_acl;
49 inode->i_acl = posix_acl_dup(acl);
50 break;
51
52 case ACL_TYPE_DEFAULT:
53 free = inode->i_default_acl;
54 inode->i_default_acl = posix_acl_dup(acl);
55 break;
56 }
57 spin_unlock(&inode->i_lock);
58 posix_acl_release(free);
59}
60
61struct generic_acl_operations shmem_acl_ops = {
62 .getacl = shmem_get_acl,
63 .setacl = shmem_set_acl,
64};
65
66/**
67 * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access,
68 * shmem_xattr_acl_access_handler - plumbing code to implement the
69 * system.posix_acl_access xattr using the generic acl functions.
70 */
71
72static size_t
73shmem_list_acl_access(struct inode *inode, char *list, size_t list_size,
74 const char *name, size_t name_len)
75{
76 return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS,
77 list, list_size);
78}
79
80static int
81shmem_get_acl_access(struct inode *inode, const char *name, void *buffer,
82 size_t size)
83{
84 if (strcmp(name, "") != 0)
85 return -EINVAL;
86 return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer,
87 size);
88}
89
90static int
91shmem_set_acl_access(struct inode *inode, const char *name, const void *value,
92 size_t size, int flags)
93{
94 if (strcmp(name, "") != 0)
95 return -EINVAL;
96 return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value,
97 size);
98}
99
100struct xattr_handler shmem_xattr_acl_access_handler = {
101 .prefix = POSIX_ACL_XATTR_ACCESS,
102 .list = shmem_list_acl_access,
103 .get = shmem_get_acl_access,
104 .set = shmem_set_acl_access,
105};
106
107/**
108 * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default,
109 * shmem_xattr_acl_default_handler - plumbing code to implement the
110 * system.posix_acl_default xattr using the generic acl functions.
111 */
112
113static size_t
114shmem_list_acl_default(struct inode *inode, char *list, size_t list_size,
115 const char *name, size_t name_len)
116{
117 return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT,
118 list, list_size);
119}
120
121static int
122shmem_get_acl_default(struct inode *inode, const char *name, void *buffer,
123 size_t size)
124{
125 if (strcmp(name, "") != 0)
126 return -EINVAL;
127 return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer,
128 size);
129}
130
131static int
132shmem_set_acl_default(struct inode *inode, const char *name, const void *value,
133 size_t size, int flags)
134{
135 if (strcmp(name, "") != 0)
136 return -EINVAL;
137 return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value,
138 size);
139}
140
141struct xattr_handler shmem_xattr_acl_default_handler = {
142 .prefix = POSIX_ACL_XATTR_DEFAULT,
143 .list = shmem_list_acl_default,
144 .get = shmem_get_acl_default,
145 .set = shmem_set_acl_default,
146};
147
148/**
149 * shmem_acl_init - Inizialize the acl(s) of a new inode
150 */
151int
152shmem_acl_init(struct inode *inode, struct inode *dir)
153{
154 return generic_acl_init(inode, dir, &shmem_acl_ops);
155}
156
157/**
158 * shmem_check_acl - check_acl() callback for generic_permission()
159 */
160int
161shmem_check_acl(struct inode *inode, int mask)
162{
163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
164
165 if (acl) {
166 int error = posix_acl_permission(inode, acl, mask);
167 posix_acl_release(acl);
168 return error;
169 }
170 return -EAGAIN;
171}
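
Editor's note: the deleted shmem_acl.c above routed ACL access through a struct generic_acl_operations of getacl/setacl function pointers; the diff removes that indirection in favour of the generic ACL handlers. A tiny user-space sketch of the ops-table pattern being retired, with made-up types rather than the kernel's:

    #include <stdio.h>

    struct acl { int mode; };

    struct acl_operations {
            struct acl *(*getacl)(void *inode, int type);
            void (*setacl)(void *inode, int type, struct acl *acl);
    };

    static struct acl the_acl = { 0644 };

    static struct acl *demo_getacl(void *inode, int type)
    {
            (void)inode; (void)type;
            return &the_acl;
    }

    static void demo_setacl(void *inode, int type, struct acl *acl)
    {
            (void)inode; (void)type;
            the_acl = *acl;
    }

    static const struct acl_operations demo_ops = {
            .getacl = demo_getacl,
            .setacl = demo_setacl,
    };

    int main(void)
    {
            struct acl a = { 0600 };

            demo_ops.setacl(NULL, 0, &a);
            printf("mode now %o\n", demo_ops.getacl(NULL, 0)->mode);
            return 0;
    }
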
diff --git a/mm/slab.c b/mm/slab.c
index 7dfa481c96ba..7451bdacaf18 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -490,7 +490,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
490 490
491#endif 491#endif
492 492
493#ifdef CONFIG_KMEMTRACE 493#ifdef CONFIG_TRACING
494size_t slab_buffer_size(struct kmem_cache *cachep) 494size_t slab_buffer_size(struct kmem_cache *cachep)
495{ 495{
496 return cachep->buffer_size; 496 return cachep->buffer_size;
@@ -604,6 +604,26 @@ static struct kmem_cache cache_cache = {
604 604
605#define BAD_ALIEN_MAGIC 0x01020304ul 605#define BAD_ALIEN_MAGIC 0x01020304ul
606 606
607/*
608 * chicken and egg problem: delay the per-cpu array allocation
609 * until the general caches are up.
610 */
611static enum {
612 NONE,
613 PARTIAL_AC,
614 PARTIAL_L3,
615 EARLY,
616 FULL
617} g_cpucache_up;
618
619/*
620 * used by boot code to determine if it can use slab based allocator
621 */
622int slab_is_available(void)
623{
624 return g_cpucache_up >= EARLY;
625}
626
607#ifdef CONFIG_LOCKDEP 627#ifdef CONFIG_LOCKDEP
608 628
609/* 629/*
@@ -620,40 +640,52 @@ static struct kmem_cache cache_cache = {
620static struct lock_class_key on_slab_l3_key; 640static struct lock_class_key on_slab_l3_key;
621static struct lock_class_key on_slab_alc_key; 641static struct lock_class_key on_slab_alc_key;
622 642
623static inline void init_lock_keys(void) 643static void init_node_lock_keys(int q)
624
625{ 644{
626 int q;
627 struct cache_sizes *s = malloc_sizes; 645 struct cache_sizes *s = malloc_sizes;
628 646
629 while (s->cs_size != ULONG_MAX) { 647 if (g_cpucache_up != FULL)
630 for_each_node(q) { 648 return;
631 struct array_cache **alc; 649
632 int r; 650 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
633 struct kmem_list3 *l3 = s->cs_cachep->nodelists[q]; 651 struct array_cache **alc;
634 if (!l3 || OFF_SLAB(s->cs_cachep)) 652 struct kmem_list3 *l3;
635 continue; 653 int r;
636 lockdep_set_class(&l3->list_lock, &on_slab_l3_key); 654
637 alc = l3->alien; 655 l3 = s->cs_cachep->nodelists[q];
638 /* 656 if (!l3 || OFF_SLAB(s->cs_cachep))
639 * FIXME: This check for BAD_ALIEN_MAGIC 657 continue;
640 * should go away when common slab code is taught to 658 lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
641 * work even without alien caches. 659 alc = l3->alien;
642 * Currently, non NUMA code returns BAD_ALIEN_MAGIC 660 /*
643 * for alloc_alien_cache, 661 * FIXME: This check for BAD_ALIEN_MAGIC
644 */ 662 * should go away when common slab code is taught to
645 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) 663 * work even without alien caches.
646 continue; 664 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
647 for_each_node(r) { 665 * for alloc_alien_cache,
648 if (alc[r]) 666 */
649 lockdep_set_class(&alc[r]->lock, 667 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
650 &on_slab_alc_key); 668 continue;
651 } 669 for_each_node(r) {
670 if (alc[r])
671 lockdep_set_class(&alc[r]->lock,
672 &on_slab_alc_key);
652 } 673 }
653 s++;
654 } 674 }
655} 675}
676
677static inline void init_lock_keys(void)
678{
679 int node;
680
681 for_each_node(node)
682 init_node_lock_keys(node);
683}
656#else 684#else
685static void init_node_lock_keys(int q)
686{
687}
688
657static inline void init_lock_keys(void) 689static inline void init_lock_keys(void)
658{ 690{
659} 691}
@@ -665,27 +697,7 @@ static inline void init_lock_keys(void)
665static DEFINE_MUTEX(cache_chain_mutex); 697static DEFINE_MUTEX(cache_chain_mutex);
666static struct list_head cache_chain; 698static struct list_head cache_chain;
667 699
668/* 700static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
669 * chicken and egg problem: delay the per-cpu array allocation
670 * until the general caches are up.
671 */
672static enum {
673 NONE,
674 PARTIAL_AC,
675 PARTIAL_L3,
676 EARLY,
677 FULL
678} g_cpucache_up;
679
680/*
681 * used by boot code to determine if it can use slab based allocator
682 */
683int slab_is_available(void)
684{
685 return g_cpucache_up >= EARLY;
686}
687
688static DEFINE_PER_CPU(struct delayed_work, reap_work);
689 701
690static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 702static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
691{ 703{
@@ -826,7 +838,7 @@ __setup("noaliencache", noaliencache_setup);
826 * objects freed on different nodes from which they were allocated) and the 838 * objects freed on different nodes from which they were allocated) and the
827 * flushing of remote pcps by calling drain_node_pages. 839 * flushing of remote pcps by calling drain_node_pages.
828 */ 840 */
829static DEFINE_PER_CPU(unsigned long, reap_node); 841static DEFINE_PER_CPU(unsigned long, slab_reap_node);
830 842
831static void init_reap_node(int cpu) 843static void init_reap_node(int cpu)
832{ 844{
@@ -836,17 +848,17 @@ static void init_reap_node(int cpu)
836 if (node == MAX_NUMNODES) 848 if (node == MAX_NUMNODES)
837 node = first_node(node_online_map); 849 node = first_node(node_online_map);
838 850
839 per_cpu(reap_node, cpu) = node; 851 per_cpu(slab_reap_node, cpu) = node;
840} 852}
841 853
842static void next_reap_node(void) 854static void next_reap_node(void)
843{ 855{
844 int node = __get_cpu_var(reap_node); 856 int node = __get_cpu_var(slab_reap_node);
845 857
846 node = next_node(node, node_online_map); 858 node = next_node(node, node_online_map);
847 if (unlikely(node >= MAX_NUMNODES)) 859 if (unlikely(node >= MAX_NUMNODES))
848 node = first_node(node_online_map); 860 node = first_node(node_online_map);
849 __get_cpu_var(reap_node) = node; 861 __get_cpu_var(slab_reap_node) = node;
850} 862}
851 863
852#else 864#else
@@ -863,7 +875,7 @@ static void next_reap_node(void)
863 */ 875 */
864static void __cpuinit start_cpu_timer(int cpu) 876static void __cpuinit start_cpu_timer(int cpu)
865{ 877{
866 struct delayed_work *reap_work = &per_cpu(reap_work, cpu); 878 struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
867 879
868 /* 880 /*
869 * When this gets called from do_initcalls via cpucache_init(), 881 * When this gets called from do_initcalls via cpucache_init(),
@@ -1027,7 +1039,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
1027 */ 1039 */
1028static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) 1040static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1029{ 1041{
1030 int node = __get_cpu_var(reap_node); 1042 int node = __get_cpu_var(slab_reap_node);
1031 1043
1032 if (l3->alien) { 1044 if (l3->alien) {
1033 struct array_cache *ac = l3->alien[node]; 1045 struct array_cache *ac = l3->alien[node];
@@ -1120,7 +1132,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1120 if (nc) 1132 if (nc)
1121 free_block(cachep, nc->entry, nc->avail, node); 1133 free_block(cachep, nc->entry, nc->avail, node);
1122 1134
1123 if (!cpus_empty(*mask)) { 1135 if (!cpumask_empty(mask)) {
1124 spin_unlock_irq(&l3->list_lock); 1136 spin_unlock_irq(&l3->list_lock);
1125 goto free_array_cache; 1137 goto free_array_cache;
1126 } 1138 }
@@ -1254,6 +1266,8 @@ static int __cpuinit cpuup_prepare(long cpu)
1254 kfree(shared); 1266 kfree(shared);
1255 free_alien_cache(alien); 1267 free_alien_cache(alien);
1256 } 1268 }
1269 init_node_lock_keys(node);
1270
1257 return 0; 1271 return 0;
1258bad: 1272bad:
1259 cpuup_canceled(cpu); 1273 cpuup_canceled(cpu);
@@ -1286,9 +1300,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1286 * anything expensive but will only modify reap_work 1300 * anything expensive but will only modify reap_work
1287 * and reschedule the timer. 1301 * and reschedule the timer.
1288 */ 1302 */
1289 cancel_rearming_delayed_work(&per_cpu(reap_work, cpu)); 1303 cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu));
1290 /* Now the cache_reaper is guaranteed to be not running. */ 1304 /* Now the cache_reaper is guaranteed to be not running. */
1291 per_cpu(reap_work, cpu).work.func = NULL; 1305 per_cpu(slab_reap_work, cpu).work.func = NULL;
1292 break; 1306 break;
1293 case CPU_DOWN_FAILED: 1307 case CPU_DOWN_FAILED:
1294 case CPU_DOWN_FAILED_FROZEN: 1308 case CPU_DOWN_FAILED_FROZEN:
@@ -2261,9 +2275,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2261 /* 2275 /*
2262 * Determine if the slab management is 'on' or 'off' slab. 2276 * Determine if the slab management is 'on' or 'off' slab.
2263 * (bootstrapping cannot cope with offslab caches so don't do 2277 * (bootstrapping cannot cope with offslab caches so don't do
2264 * it too early on.) 2278 * it too early on. Always use on-slab management when
2279 * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
2265 */ 2280 */
2266 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) 2281 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
2282 !(flags & SLAB_NOLEAKTRACE))
2267 /* 2283 /*
2268 * Size is large, assume best to place the slab management obj 2284 * Size is large, assume best to place the slab management obj
2269 * off-slab (should allow better packing of objs). 2285 * off-slab (should allow better packing of objs).
@@ -2582,8 +2598,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2582 * kmemleak does not treat the ->s_mem pointer as a reference 2598 * kmemleak does not treat the ->s_mem pointer as a reference
2583 * to the object. Otherwise we will not report the leak. 2599 * to the object. Otherwise we will not report the leak.
2584 */ 2600 */
2585 kmemleak_scan_area(slabp, offsetof(struct slab, list), 2601 kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
2586 sizeof(struct list_head), local_flags); 2602 local_flags);
2587 if (!slabp) 2603 if (!slabp)
2588 return NULL; 2604 return NULL;
2589 } else { 2605 } else {
@@ -3103,13 +3119,19 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3103 } else { 3119 } else {
3104 STATS_INC_ALLOCMISS(cachep); 3120 STATS_INC_ALLOCMISS(cachep);
3105 objp = cache_alloc_refill(cachep, flags); 3121 objp = cache_alloc_refill(cachep, flags);
3122 /*
3123 * the 'ac' may be updated by cache_alloc_refill(),
3124 * and kmemleak_erase() requires its correct value.
3125 */
3126 ac = cpu_cache_get(cachep);
3106 } 3127 }
3107 /* 3128 /*
3108 * To avoid a false negative, if an object that is in one of the 3129 * To avoid a false negative, if an object that is in one of the
3109 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 3130 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
3110 * treat the array pointers as a reference to the object. 3131 * treat the array pointers as a reference to the object.
3111 */ 3132 */
3112 kmemleak_erase(&ac->entry[ac->avail]); 3133 if (objp)
3134 kmemleak_erase(&ac->entry[ac->avail]);
3113 return objp; 3135 return objp;
3114} 3136}
3115 3137
@@ -3306,7 +3328,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3306 cache_alloc_debugcheck_before(cachep, flags); 3328 cache_alloc_debugcheck_before(cachep, flags);
3307 local_irq_save(save_flags); 3329 local_irq_save(save_flags);
3308 3330
3309 if (unlikely(nodeid == -1)) 3331 if (nodeid == -1)
3310 nodeid = numa_node_id(); 3332 nodeid = numa_node_id();
3311 3333
3312 if (unlikely(!cachep->nodelists[nodeid])) { 3334 if (unlikely(!cachep->nodelists[nodeid])) {
@@ -3558,7 +3580,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3558} 3580}
3559EXPORT_SYMBOL(kmem_cache_alloc); 3581EXPORT_SYMBOL(kmem_cache_alloc);
3560 3582
3561#ifdef CONFIG_KMEMTRACE 3583#ifdef CONFIG_TRACING
3562void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) 3584void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
3563{ 3585{
3564 return __cache_alloc(cachep, flags, __builtin_return_address(0)); 3586 return __cache_alloc(cachep, flags, __builtin_return_address(0));
@@ -3621,7 +3643,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3621} 3643}
3622EXPORT_SYMBOL(kmem_cache_alloc_node); 3644EXPORT_SYMBOL(kmem_cache_alloc_node);
3623 3645
3624#ifdef CONFIG_KMEMTRACE 3646#ifdef CONFIG_TRACING
3625void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, 3647void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
3626 gfp_t flags, 3648 gfp_t flags,
3627 int nodeid) 3649 int nodeid)
@@ -3649,7 +3671,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3649 return ret; 3671 return ret;
3650} 3672}
3651 3673
3652#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) 3674#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3653void *__kmalloc_node(size_t size, gfp_t flags, int node) 3675void *__kmalloc_node(size_t size, gfp_t flags, int node)
3654{ 3676{
3655 return __do_kmalloc_node(size, flags, node, 3677 return __do_kmalloc_node(size, flags, node,
@@ -3669,7 +3691,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
3669 return __do_kmalloc_node(size, flags, node, NULL); 3691 return __do_kmalloc_node(size, flags, node, NULL);
3670} 3692}
3671EXPORT_SYMBOL(__kmalloc_node); 3693EXPORT_SYMBOL(__kmalloc_node);
3672#endif /* CONFIG_DEBUG_SLAB */ 3694#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
3673#endif /* CONFIG_NUMA */ 3695#endif /* CONFIG_NUMA */
3674 3696
3675/** 3697/**
@@ -3701,7 +3723,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3701} 3723}
3702 3724
3703 3725
3704#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) 3726#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3705void *__kmalloc(size_t size, gfp_t flags) 3727void *__kmalloc(size_t size, gfp_t flags)
3706{ 3728{
3707 return __do_kmalloc(size, flags, __builtin_return_address(0)); 3729 return __do_kmalloc(size, flags, __builtin_return_address(0));
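
Editor's note: the ____cache_alloc() hunk above re-reads 'ac' after cache_alloc_refill() because the refill path can swap in a new per-cpu array cache, leaving the earlier pointer stale. The same rule applies whenever a helper may move the backing store; a small user-space sketch with realloc() and illustrative names:

    #include <stdio.h>
    #include <stdlib.h>

    struct cache {
            int *entry;
            int avail;
            int cap;
    };

    /* May grow, and therefore move, cache->entry, like a refill path. */
    static void refill(struct cache *c)
    {
            int *grown;
            int i;

            c->cap = c->cap ? c->cap * 2 : 4;
            grown = realloc(c->entry, c->cap * sizeof(*grown));
            if (!grown)
                    abort();
            for (i = 0; i < c->cap; i++)
                    grown[i] = i;
            c->entry = grown;
            c->avail = c->cap;
    }

    static int cache_alloc(struct cache *c)
    {
            int *entry = c->entry;          /* snapshot, may go stale */

            if (c->avail == 0) {
                    refill(c);
                    entry = c->entry;       /* must refetch after refill */
            }
            return entry[--c->avail];
    }

    int main(void)
    {
            struct cache c = { NULL, 0, 0 };

            printf("%d %d\n", cache_alloc(&c), cache_alloc(&c));
            free(c.entry);
            return 0;
    }
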
diff --git a/mm/slub.c b/mm/slub.c
index 4996fc719552..8d71aaf888d7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1735,7 +1735,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1735 } 1735 }
1736 local_irq_restore(flags); 1736 local_irq_restore(flags);
1737 1737
1738 if (unlikely((gfpflags & __GFP_ZERO) && object)) 1738 if (unlikely(gfpflags & __GFP_ZERO) && object)
1739 memset(object, 0, objsize); 1739 memset(object, 0, objsize);
1740 1740
1741 kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); 1741 kmemcheck_slab_alloc(s, gfpflags, object, c->objsize);
@@ -1754,7 +1754,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1754} 1754}
1755EXPORT_SYMBOL(kmem_cache_alloc); 1755EXPORT_SYMBOL(kmem_cache_alloc);
1756 1756
1757#ifdef CONFIG_KMEMTRACE 1757#ifdef CONFIG_TRACING
1758void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) 1758void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
1759{ 1759{
1760 return slab_alloc(s, gfpflags, -1, _RET_IP_); 1760 return slab_alloc(s, gfpflags, -1, _RET_IP_);
@@ -1775,7 +1775,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1775EXPORT_SYMBOL(kmem_cache_alloc_node); 1775EXPORT_SYMBOL(kmem_cache_alloc_node);
1776#endif 1776#endif
1777 1777
1778#ifdef CONFIG_KMEMTRACE 1778#ifdef CONFIG_TRACING
1779void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, 1779void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
1780 gfp_t gfpflags, 1780 gfp_t gfpflags,
1781 int node) 1781 int node)
@@ -4371,12 +4371,28 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
4371 return len + sprintf(buf + len, "\n"); 4371 return len + sprintf(buf + len, "\n");
4372} 4372}
4373 4373
4374static void clear_stat(struct kmem_cache *s, enum stat_item si)
4375{
4376 int cpu;
4377
4378 for_each_online_cpu(cpu)
4379 get_cpu_slab(s, cpu)->stat[si] = 0;
4380}
4381
4374#define STAT_ATTR(si, text) \ 4382#define STAT_ATTR(si, text) \
4375static ssize_t text##_show(struct kmem_cache *s, char *buf) \ 4383static ssize_t text##_show(struct kmem_cache *s, char *buf) \
4376{ \ 4384{ \
4377 return show_stat(s, buf, si); \ 4385 return show_stat(s, buf, si); \
4378} \ 4386} \
4379SLAB_ATTR_RO(text); \ 4387static ssize_t text##_store(struct kmem_cache *s, \
4388 const char *buf, size_t length) \
4389{ \
4390 if (buf[0] != '0') \
4391 return -EINVAL; \
4392 clear_stat(s, si); \
4393 return length; \
4394} \
4395SLAB_ATTR(text); \
4380 4396
4381STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); 4397STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
4382STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); 4398STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
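
Editor's note: the STAT_ATTR() change above makes the macro emit a _store next to each _show, and the store only accepts "0", so writing 0 to a statistics attribute clears the counter. A compact user-space sketch of that generate-both-accessors-by-macro pattern, with plain ints standing in for sysfs attributes and illustrative names throughout:

    #include <stdio.h>

    static long stats[2];
    enum stat_item { ALLOC_FASTPATH, ALLOC_SLOWPATH };

    #define STAT_ATTR(si, text)                                     \
    static long text##_show(void)                                   \
    {                                                               \
            return stats[si];                                       \
    }                                                               \
    static int text##_store(const char *buf)                        \
    {                                                               \
            if (buf[0] != '0')                                      \
                    return -1;      /* only clearing is allowed */  \
            stats[si] = 0;                                          \
            return 0;                                               \
    }

    STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath)
    STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath)

    int main(void)
    {
            stats[ALLOC_FASTPATH] = 42;
            printf("fastpath=%ld\n", alloc_fastpath_show());
            alloc_fastpath_store("0");
            printf("fastpath=%ld slowpath=%ld (store rc=%d)\n",
                   alloc_fastpath_show(), alloc_slowpath_show(),
                   alloc_slowpath_store("x"));
            return 0;
    }
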
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9c590eef7912..6c0585b16418 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -22,6 +22,7 @@
22#include <linux/seq_file.h> 22#include <linux/seq_file.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/ksm.h>
25#include <linux/rmap.h> 26#include <linux/rmap.h>
26#include <linux/security.h> 27#include <linux/security.h>
27#include <linux/backing-dev.h> 28#include <linux/backing-dev.h>
@@ -35,11 +36,15 @@
35#include <linux/swapops.h> 36#include <linux/swapops.h>
36#include <linux/page_cgroup.h> 37#include <linux/page_cgroup.h>
37 38
39static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
40 unsigned char);
41static void free_swap_count_continuations(struct swap_info_struct *);
42static sector_t map_swap_entry(swp_entry_t, struct block_device**);
43
38static DEFINE_SPINLOCK(swap_lock); 44static DEFINE_SPINLOCK(swap_lock);
39static unsigned int nr_swapfiles; 45static unsigned int nr_swapfiles;
40long nr_swap_pages; 46long nr_swap_pages;
41long total_swap_pages; 47long total_swap_pages;
42static int swap_overflow;
43static int least_priority; 48static int least_priority;
44 49
45static const char Bad_file[] = "Bad swap file entry "; 50static const char Bad_file[] = "Bad swap file entry ";
@@ -49,42 +54,20 @@ static const char Unused_offset[] = "Unused swap offset entry ";
49 54
50static struct swap_list_t swap_list = {-1, -1}; 55static struct swap_list_t swap_list = {-1, -1};
51 56
52static struct swap_info_struct swap_info[MAX_SWAPFILES]; 57static struct swap_info_struct *swap_info[MAX_SWAPFILES];
53 58
54static DEFINE_MUTEX(swapon_mutex); 59static DEFINE_MUTEX(swapon_mutex);
55 60
56/* For reference count accounting in swap_map */ 61static inline unsigned char swap_count(unsigned char ent)
57/* enum for swap_map[] handling. internal use only */
58enum {
59 SWAP_MAP = 0, /* ops for reference from swap users */
60 SWAP_CACHE, /* ops for reference from swap cache */
61};
62
63static inline int swap_count(unsigned short ent)
64{
65 return ent & SWAP_COUNT_MASK;
66}
67
68static inline bool swap_has_cache(unsigned short ent)
69{ 62{
70 return !!(ent & SWAP_HAS_CACHE); 63 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
71} 64}
72 65
73static inline unsigned short encode_swapmap(int count, bool has_cache) 66/* returns 1 if swap entry is freed */
74{
75 unsigned short ret = count;
76
77 if (has_cache)
78 return SWAP_HAS_CACHE | ret;
79 return ret;
80}
81
82/* returnes 1 if swap entry is freed */
83static int 67static int
84__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) 68__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
85{ 69{
86 int type = si - swap_info; 70 swp_entry_t entry = swp_entry(si->type, offset);
87 swp_entry_t entry = swp_entry(type, offset);
88 struct page *page; 71 struct page *page;
89 int ret = 0; 72 int ret = 0;
90 73
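Editor's note on the hunk above: the old SWAP_MAP/SWAP_CACHE enum and the encode_swapmap()/swap_has_cache() helpers are replaced by a single unsigned char per swap slot. One bit (SWAP_HAS_CACHE) records the swap-cache reference; the remaining bits hold the map count, possibly including the continuation flag. A standalone sketch of that encoding follows; the flag values are chosen here only for illustration, the real constants live in include/linux/swap.h.

#include <stdio.h>

/* Illustrative values patterned on <linux/swap.h>. */
#define SWAP_HAS_CACHE  0x40    /* page is in swap cache */
#define COUNT_CONTINUED 0x80    /* count carried into a continuation page */

/* Same shape as the new swap_count(): strip the cache bit, keep the rest. */
static unsigned char swap_count(unsigned char ent)
{
        return ent & ~SWAP_HAS_CACHE;   /* may include COUNT_CONTINUED */
}

int main(void)
{
        unsigned char slot;

        slot = 3 | SWAP_HAS_CACHE;      /* three map users plus swap cache */
        printf("count=%u cached=%d\n", swap_count(slot), !!(slot & SWAP_HAS_CACHE));

        slot &= ~SWAP_HAS_CACHE;        /* swapcache_free() drops the cache bit */
        printf("count=%u cached=%d\n", swap_count(slot), !!(slot & SWAP_HAS_CACHE));
        return 0;
}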
@@ -120,7 +103,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
120 down_read(&swap_unplug_sem); 103 down_read(&swap_unplug_sem);
121 entry.val = page_private(page); 104 entry.val = page_private(page);
122 if (PageSwapCache(page)) { 105 if (PageSwapCache(page)) {
123 struct block_device *bdev = swap_info[swp_type(entry)].bdev; 106 struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
124 struct backing_dev_info *bdi; 107 struct backing_dev_info *bdi;
125 108
126 /* 109 /*
@@ -146,23 +129,28 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
146static int discard_swap(struct swap_info_struct *si) 129static int discard_swap(struct swap_info_struct *si)
147{ 130{
148 struct swap_extent *se; 131 struct swap_extent *se;
132 sector_t start_block;
133 sector_t nr_blocks;
149 int err = 0; 134 int err = 0;
150 135
151 list_for_each_entry(se, &si->extent_list, list) { 136 /* Do not discard the swap header page! */
152 sector_t start_block = se->start_block << (PAGE_SHIFT - 9); 137 se = &si->first_swap_extent;
153 sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 138 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
139 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
140 if (nr_blocks) {
141 err = blkdev_issue_discard(si->bdev, start_block,
142 nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
143 if (err)
144 return err;
145 cond_resched();
146 }
154 147
155 if (se->start_page == 0) { 148 list_for_each_entry(se, &si->first_swap_extent.list, list) {
156 /* Do not discard the swap header page! */ 149 start_block = se->start_block << (PAGE_SHIFT - 9);
157 start_block += 1 << (PAGE_SHIFT - 9); 150 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
158 nr_blocks -= 1 << (PAGE_SHIFT - 9);
159 if (!nr_blocks)
160 continue;
161 }
162 151
163 err = blkdev_issue_discard(si->bdev, start_block, 152 err = blkdev_issue_discard(si->bdev, start_block,
164 nr_blocks, GFP_KERNEL, 153 nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
165 DISCARD_FL_BARRIER);
166 if (err) 154 if (err)
167 break; 155 break;
168 156
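Editor's note on discard_swap() above: with first_swap_extent embedded in swap_info_struct, the special start_page == 0 test inside the loop goes away. The first extent is discarded up front, shifted by one page so the swap header is never discarded, and the remaining extents are walked normally. The page-to-sector arithmetic (PAGE_SHIFT - 9, i.e. 512-byte sectors) can be checked in isolation; the sketch below assumes 4 KiB pages purely for the example.

#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4 KiB pages for this example */

struct swap_extent {
        unsigned long start_page;
        unsigned long nr_pages;
        unsigned long long start_block; /* in page-size units on the bdev */
};

int main(void)
{
        /* First extent of a hypothetical swap area: 1024 pages at block 0. */
        struct swap_extent first = { .start_page = 0, .nr_pages = 1024, .start_block = 0 };

        /* Skip the header page, exactly as the new discard_swap() does. */
        unsigned long long start_block = (first.start_block + 1) << (PAGE_SHIFT - 9);
        unsigned long long nr_blocks =
                (unsigned long long)(first.nr_pages - 1) << (PAGE_SHIFT - 9);

        printf("discard sectors [%llu, %llu) -- header page untouched\n",
               start_block, start_block + nr_blocks);
        return 0;
}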
@@ -201,14 +189,11 @@ static void discard_swap_cluster(struct swap_info_struct *si,
201 start_block <<= PAGE_SHIFT - 9; 189 start_block <<= PAGE_SHIFT - 9;
202 nr_blocks <<= PAGE_SHIFT - 9; 190 nr_blocks <<= PAGE_SHIFT - 9;
203 if (blkdev_issue_discard(si->bdev, start_block, 191 if (blkdev_issue_discard(si->bdev, start_block,
204 nr_blocks, GFP_NOIO, 192 nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER))
205 DISCARD_FL_BARRIER))
206 break; 193 break;
207 } 194 }
208 195
209 lh = se->list.next; 196 lh = se->list.next;
210 if (lh == &si->extent_list)
211 lh = lh->next;
212 se = list_entry(lh, struct swap_extent, list); 197 se = list_entry(lh, struct swap_extent, list);
213 } 198 }
214} 199}
@@ -223,7 +208,7 @@ static int wait_for_discard(void *word)
223#define LATENCY_LIMIT 256 208#define LATENCY_LIMIT 256
224 209
225static inline unsigned long scan_swap_map(struct swap_info_struct *si, 210static inline unsigned long scan_swap_map(struct swap_info_struct *si,
226 int cache) 211 unsigned char usage)
227{ 212{
228 unsigned long offset; 213 unsigned long offset;
229 unsigned long scan_base; 214 unsigned long scan_base;
@@ -354,10 +339,7 @@ checks:
354 si->lowest_bit = si->max; 339 si->lowest_bit = si->max;
355 si->highest_bit = 0; 340 si->highest_bit = 0;
356 } 341 }
357 if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ 342 si->swap_map[offset] = usage;
358 si->swap_map[offset] = encode_swapmap(0, true);
359 else /* at suspend */
360 si->swap_map[offset] = encode_swapmap(1, false);
361 si->cluster_next = offset + 1; 343 si->cluster_next = offset + 1;
362 si->flags -= SWP_SCANNING; 344 si->flags -= SWP_SCANNING;
363 345
@@ -467,10 +449,10 @@ swp_entry_t get_swap_page(void)
467 nr_swap_pages--; 449 nr_swap_pages--;
468 450
469 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 451 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
470 si = swap_info + type; 452 si = swap_info[type];
471 next = si->next; 453 next = si->next;
472 if (next < 0 || 454 if (next < 0 ||
473 (!wrapped && si->prio != swap_info[next].prio)) { 455 (!wrapped && si->prio != swap_info[next]->prio)) {
474 next = swap_list.head; 456 next = swap_list.head;
475 wrapped++; 457 wrapped++;
476 } 458 }
@@ -482,7 +464,7 @@ swp_entry_t get_swap_page(void)
482 464
483 swap_list.next = next; 465 swap_list.next = next;
484 /* This is called for allocating swap entry for cache */ 466 /* This is called for allocating swap entry for cache */
485 offset = scan_swap_map(si, SWAP_CACHE); 467 offset = scan_swap_map(si, SWAP_HAS_CACHE);
486 if (offset) { 468 if (offset) {
487 spin_unlock(&swap_lock); 469 spin_unlock(&swap_lock);
488 return swp_entry(type, offset); 470 return swp_entry(type, offset);
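Editor's note on get_swap_page() above: the conversion to swap_info[type] pointers does not change the allocation policy, which still walks swap_list from swap_list.next, rotates among areas of equal priority, and gives up only after wrapping twice. A much simplified standalone model of that rotation follows; locking and scan_swap_map() are omitted, and the "free" field is an invented stand-in for a successful scan.

#include <stdio.h>

struct area {
        int prio;
        int next;       /* next index in the priority-sorted list, -1 at tail */
        int free;       /* invented: free slots left in this area */
};

static struct area areas[3] = {
        /* two areas at prio 5 (round-robined), one fallback at prio 0 */
        { .prio = 5, .next = 1,  .free = 1 },
        { .prio = 5, .next = 2,  .free = 1 },
        { .prio = 0, .next = -1, .free = 2 },
};
static struct { int head, next; } swap_list = { 0, 0 };

/* Returns the area an entry was taken from, or -1 if everything is full. */
static int get_swap_area(void)
{
        int wrapped = 0;

        for (int type = swap_list.next; type >= 0 && wrapped < 2; ) {
                int next = areas[type].next;

                /* Wrap back to the head at the end of a priority band. */
                if (next < 0 || (!wrapped && areas[type].prio != areas[next].prio)) {
                        next = swap_list.head;
                        wrapped++;
                }
                if (areas[type].free > 0) {
                        areas[type].free--;
                        swap_list.next = next;  /* rotate for the next caller */
                        return type;
                }
                type = next;
        }
        return -1;
}

int main(void)
{
        for (int i = 0; i < 5; i++)
                printf("allocation %d -> area %d\n", i, get_swap_area());
        return 0;
}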
@@ -503,11 +485,11 @@ swp_entry_t get_swap_page_of_type(int type)
503 pgoff_t offset; 485 pgoff_t offset;
504 486
505 spin_lock(&swap_lock); 487 spin_lock(&swap_lock);
506 si = swap_info + type; 488 si = swap_info[type];
507 if (si->flags & SWP_WRITEOK) { 489 if (si && (si->flags & SWP_WRITEOK)) {
508 nr_swap_pages--; 490 nr_swap_pages--;
509 /* This is called for allocating swap entry, not cache */ 491 /* This is called for allocating swap entry, not cache */
510 offset = scan_swap_map(si, SWAP_MAP); 492 offset = scan_swap_map(si, 1);
511 if (offset) { 493 if (offset) {
512 spin_unlock(&swap_lock); 494 spin_unlock(&swap_lock);
513 return swp_entry(type, offset); 495 return swp_entry(type, offset);
@@ -518,9 +500,9 @@ swp_entry_t get_swap_page_of_type(int type)
518 return (swp_entry_t) {0}; 500 return (swp_entry_t) {0};
519} 501}
520 502
521static struct swap_info_struct * swap_info_get(swp_entry_t entry) 503static struct swap_info_struct *swap_info_get(swp_entry_t entry)
522{ 504{
523 struct swap_info_struct * p; 505 struct swap_info_struct *p;
524 unsigned long offset, type; 506 unsigned long offset, type;
525 507
526 if (!entry.val) 508 if (!entry.val)
@@ -528,7 +510,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry)
528 type = swp_type(entry); 510 type = swp_type(entry);
529 if (type >= nr_swapfiles) 511 if (type >= nr_swapfiles)
530 goto bad_nofile; 512 goto bad_nofile;
531 p = & swap_info[type]; 513 p = swap_info[type];
532 if (!(p->flags & SWP_USED)) 514 if (!(p->flags & SWP_USED))
533 goto bad_device; 515 goto bad_device;
534 offset = swp_offset(entry); 516 offset = swp_offset(entry);
@@ -554,41 +536,56 @@ out:
554 return NULL; 536 return NULL;
555} 537}
556 538
557static int swap_entry_free(struct swap_info_struct *p, 539static unsigned char swap_entry_free(struct swap_info_struct *p,
558 swp_entry_t ent, int cache) 540 swp_entry_t entry, unsigned char usage)
559{ 541{
560 unsigned long offset = swp_offset(ent); 542 unsigned long offset = swp_offset(entry);
561 int count = swap_count(p->swap_map[offset]); 543 unsigned char count;
562 bool has_cache; 544 unsigned char has_cache;
563 545
564 has_cache = swap_has_cache(p->swap_map[offset]); 546 count = p->swap_map[offset];
547 has_cache = count & SWAP_HAS_CACHE;
548 count &= ~SWAP_HAS_CACHE;
565 549
566 if (cache == SWAP_MAP) { /* dropping usage count of swap */ 550 if (usage == SWAP_HAS_CACHE) {
567 if (count < SWAP_MAP_MAX) {
568 count--;
569 p->swap_map[offset] = encode_swapmap(count, has_cache);
570 }
571 } else { /* dropping swap cache flag */
572 VM_BUG_ON(!has_cache); 551 VM_BUG_ON(!has_cache);
573 p->swap_map[offset] = encode_swapmap(count, false); 552 has_cache = 0;
574 553 } else if (count == SWAP_MAP_SHMEM) {
554 /*
555 * Or we could insist on shmem.c using a special
556 * swap_shmem_free() and free_shmem_swap_and_cache()...
557 */
558 count = 0;
559 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
560 if (count == COUNT_CONTINUED) {
561 if (swap_count_continued(p, offset, count))
562 count = SWAP_MAP_MAX | COUNT_CONTINUED;
563 else
564 count = SWAP_MAP_MAX;
565 } else
566 count--;
575 } 567 }
576 /* return code. */ 568
577 count = p->swap_map[offset]; 569 if (!count)
570 mem_cgroup_uncharge_swap(entry);
571
572 usage = count | has_cache;
573 p->swap_map[offset] = usage;
574
578 /* free if no reference */ 575 /* free if no reference */
579 if (!count) { 576 if (!usage) {
580 if (offset < p->lowest_bit) 577 if (offset < p->lowest_bit)
581 p->lowest_bit = offset; 578 p->lowest_bit = offset;
582 if (offset > p->highest_bit) 579 if (offset > p->highest_bit)
583 p->highest_bit = offset; 580 p->highest_bit = offset;
584 if (p->prio > swap_info[swap_list.next].prio) 581 if (swap_list.next >= 0 &&
585 swap_list.next = p - swap_info; 582 p->prio > swap_info[swap_list.next]->prio)
583 swap_list.next = p->type;
586 nr_swap_pages++; 584 nr_swap_pages++;
587 p->inuse_pages--; 585 p->inuse_pages--;
588 } 586 }
589 if (!swap_count(count)) 587
590 mem_cgroup_uncharge_swap(ent); 588 return usage;
591 return count;
592} 589}
593 590
594/* 591/*
@@ -597,11 +594,11 @@ static int swap_entry_free(struct swap_info_struct *p,
597 */ 594 */
598void swap_free(swp_entry_t entry) 595void swap_free(swp_entry_t entry)
599{ 596{
600 struct swap_info_struct * p; 597 struct swap_info_struct *p;
601 598
602 p = swap_info_get(entry); 599 p = swap_info_get(entry);
603 if (p) { 600 if (p) {
604 swap_entry_free(p, entry, SWAP_MAP); 601 swap_entry_free(p, entry, 1);
605 spin_unlock(&swap_lock); 602 spin_unlock(&swap_lock);
606 } 603 }
607} 604}
@@ -612,26 +609,21 @@ void swap_free(swp_entry_t entry)
612void swapcache_free(swp_entry_t entry, struct page *page) 609void swapcache_free(swp_entry_t entry, struct page *page)
613{ 610{
614 struct swap_info_struct *p; 611 struct swap_info_struct *p;
615 int ret; 612 unsigned char count;
616 613
617 p = swap_info_get(entry); 614 p = swap_info_get(entry);
618 if (p) { 615 if (p) {
619 ret = swap_entry_free(p, entry, SWAP_CACHE); 616 count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
620 if (page) { 617 if (page)
621 bool swapout; 618 mem_cgroup_uncharge_swapcache(page, entry, count != 0);
622 if (ret)
623 swapout = true; /* the end of swap out */
624 else
625 swapout = false; /* no more swap users! */
626 mem_cgroup_uncharge_swapcache(page, entry, swapout);
627 }
628 spin_unlock(&swap_lock); 619 spin_unlock(&swap_lock);
629 } 620 }
630 return;
631} 621}
632 622
633/* 623/*
634 * How many references to page are currently swapped out? 624 * How many references to page are currently swapped out?
625 * This does not give an exact answer when swap count is continued,
626 * but does include the high COUNT_CONTINUED flag to allow for that.
635 */ 627 */
636static inline int page_swapcount(struct page *page) 628static inline int page_swapcount(struct page *page)
637{ 629{
@@ -659,6 +651,8 @@ int reuse_swap_page(struct page *page)
659 int count; 651 int count;
660 652
661 VM_BUG_ON(!PageLocked(page)); 653 VM_BUG_ON(!PageLocked(page));
654 if (unlikely(PageKsm(page)))
655 return 0;
662 count = page_mapcount(page); 656 count = page_mapcount(page);
663 if (count <= 1 && PageSwapCache(page)) { 657 if (count <= 1 && PageSwapCache(page)) {
664 count += page_swapcount(page); 658 count += page_swapcount(page);
@@ -667,7 +661,7 @@ int reuse_swap_page(struct page *page)
667 SetPageDirty(page); 661 SetPageDirty(page);
668 } 662 }
669 } 663 }
670 return count == 1; 664 return count <= 1;
671} 665}
672 666
673/* 667/*
@@ -704,7 +698,7 @@ int free_swap_and_cache(swp_entry_t entry)
704 698
705 p = swap_info_get(entry); 699 p = swap_info_get(entry);
706 if (p) { 700 if (p) {
707 if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { 701 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
708 page = find_get_page(&swapper_space, entry.val); 702 page = find_get_page(&swapper_space, entry.val);
709 if (page && !trylock_page(page)) { 703 if (page && !trylock_page(page)) {
710 page_cache_release(page); 704 page_cache_release(page);
@@ -741,14 +735,14 @@ int free_swap_and_cache(swp_entry_t entry)
741int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) 735int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
742{ 736{
743 struct block_device *bdev = NULL; 737 struct block_device *bdev = NULL;
744 int i; 738 int type;
745 739
746 if (device) 740 if (device)
747 bdev = bdget(device); 741 bdev = bdget(device);
748 742
749 spin_lock(&swap_lock); 743 spin_lock(&swap_lock);
750 for (i = 0; i < nr_swapfiles; i++) { 744 for (type = 0; type < nr_swapfiles; type++) {
751 struct swap_info_struct *sis = swap_info + i; 745 struct swap_info_struct *sis = swap_info[type];
752 746
753 if (!(sis->flags & SWP_WRITEOK)) 747 if (!(sis->flags & SWP_WRITEOK))
754 continue; 748 continue;
@@ -758,20 +752,18 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
758 *bdev_p = bdgrab(sis->bdev); 752 *bdev_p = bdgrab(sis->bdev);
759 753
760 spin_unlock(&swap_lock); 754 spin_unlock(&swap_lock);
761 return i; 755 return type;
762 } 756 }
763 if (bdev == sis->bdev) { 757 if (bdev == sis->bdev) {
764 struct swap_extent *se; 758 struct swap_extent *se = &sis->first_swap_extent;
765 759
766 se = list_entry(sis->extent_list.next,
767 struct swap_extent, list);
768 if (se->start_block == offset) { 760 if (se->start_block == offset) {
769 if (bdev_p) 761 if (bdev_p)
770 *bdev_p = bdgrab(sis->bdev); 762 *bdev_p = bdgrab(sis->bdev);
771 763
772 spin_unlock(&swap_lock); 764 spin_unlock(&swap_lock);
773 bdput(bdev); 765 bdput(bdev);
774 return i; 766 return type;
775 } 767 }
776 } 768 }
777 } 769 }
@@ -783,6 +775,21 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
783} 775}
784 776
785/* 777/*
778 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
779 * corresponding to given index in swap_info (swap type).
780 */
781sector_t swapdev_block(int type, pgoff_t offset)
782{
783 struct block_device *bdev;
784
785 if ((unsigned int)type >= nr_swapfiles)
786 return 0;
787 if (!(swap_info[type]->flags & SWP_WRITEOK))
788 return 0;
789 return map_swap_entry(swp_entry(type, offset), &bdev);
790}
791
792/*
786 * Return either the total number of swap pages of given type, or the number 793 * Return either the total number of swap pages of given type, or the number
787 * of free pages of that type (depending on @free) 794 * of free pages of that type (depending on @free)
788 * 795 *
@@ -792,18 +799,20 @@ unsigned int count_swap_pages(int type, int free)
792{ 799{
793 unsigned int n = 0; 800 unsigned int n = 0;
794 801
795 if (type < nr_swapfiles) { 802 spin_lock(&swap_lock);
796 spin_lock(&swap_lock); 803 if ((unsigned int)type < nr_swapfiles) {
797 if (swap_info[type].flags & SWP_WRITEOK) { 804 struct swap_info_struct *sis = swap_info[type];
798 n = swap_info[type].pages; 805
806 if (sis->flags & SWP_WRITEOK) {
807 n = sis->pages;
799 if (free) 808 if (free)
800 n -= swap_info[type].inuse_pages; 809 n -= sis->inuse_pages;
801 } 810 }
802 spin_unlock(&swap_lock);
803 } 811 }
812 spin_unlock(&swap_lock);
804 return n; 813 return n;
805} 814}
806#endif 815#endif /* CONFIG_HIBERNATION */
807 816
808/* 817/*
809 * No need to decide whether this PTE shares the swap entry with others, 818 * No need to decide whether this PTE shares the swap entry with others,
@@ -932,7 +941,7 @@ static int unuse_vma(struct vm_area_struct *vma,
932 unsigned long addr, end, next; 941 unsigned long addr, end, next;
933 int ret; 942 int ret;
934 943
935 if (page->mapping) { 944 if (page_anon_vma(page)) {
936 addr = page_address_in_vma(page, vma); 945 addr = page_address_in_vma(page, vma);
937 if (addr == -EFAULT) 946 if (addr == -EFAULT)
938 return 0; 947 return 0;
@@ -988,7 +997,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
988{ 997{
989 unsigned int max = si->max; 998 unsigned int max = si->max;
990 unsigned int i = prev; 999 unsigned int i = prev;
991 int count; 1000 unsigned char count;
992 1001
993 /* 1002 /*
994 * No need for swap_lock here: we're just looking 1003 * No need for swap_lock here: we're just looking
@@ -1024,16 +1033,14 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1024 */ 1033 */
1025static int try_to_unuse(unsigned int type) 1034static int try_to_unuse(unsigned int type)
1026{ 1035{
1027 struct swap_info_struct * si = &swap_info[type]; 1036 struct swap_info_struct *si = swap_info[type];
1028 struct mm_struct *start_mm; 1037 struct mm_struct *start_mm;
1029 unsigned short *swap_map; 1038 unsigned char *swap_map;
1030 unsigned short swcount; 1039 unsigned char swcount;
1031 struct page *page; 1040 struct page *page;
1032 swp_entry_t entry; 1041 swp_entry_t entry;
1033 unsigned int i = 0; 1042 unsigned int i = 0;
1034 int retval = 0; 1043 int retval = 0;
1035 int reset_overflow = 0;
1036 int shmem;
1037 1044
1038 /* 1045 /*
1039 * When searching mms for an entry, a good strategy is to 1046 * When searching mms for an entry, a good strategy is to
@@ -1047,8 +1054,7 @@ static int try_to_unuse(unsigned int type)
1047 * together, child after parent. If we race with dup_mmap(), we 1054 * together, child after parent. If we race with dup_mmap(), we
1048 * prefer to resolve parent before child, lest we miss entries 1055 * prefer to resolve parent before child, lest we miss entries
1049 * duplicated after we scanned child: using last mm would invert 1056 * duplicated after we scanned child: using last mm would invert
1050 * that. Though it's only a serious concern when an overflowed 1057 * that.
1051 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
1052 */ 1058 */
1053 start_mm = &init_mm; 1059 start_mm = &init_mm;
1054 atomic_inc(&init_mm.mm_users); 1060 atomic_inc(&init_mm.mm_users);
@@ -1110,17 +1116,18 @@ static int try_to_unuse(unsigned int type)
1110 1116
1111 /* 1117 /*
1112 * Remove all references to entry. 1118 * Remove all references to entry.
1113 * Whenever we reach init_mm, there's no address space
1114 * to search, but use it as a reminder to search shmem.
1115 */ 1119 */
1116 shmem = 0;
1117 swcount = *swap_map; 1120 swcount = *swap_map;
1118 if (swap_count(swcount)) { 1121 if (swap_count(swcount) == SWAP_MAP_SHMEM) {
1119 if (start_mm == &init_mm) 1122 retval = shmem_unuse(entry, page);
1120 shmem = shmem_unuse(entry, page); 1123 /* page has already been unlocked and released */
1121 else 1124 if (retval < 0)
1122 retval = unuse_mm(start_mm, entry, page); 1125 break;
1126 continue;
1123 } 1127 }
1128 if (swap_count(swcount) && start_mm != &init_mm)
1129 retval = unuse_mm(start_mm, entry, page);
1130
1124 if (swap_count(*swap_map)) { 1131 if (swap_count(*swap_map)) {
1125 int set_start_mm = (*swap_map >= swcount); 1132 int set_start_mm = (*swap_map >= swcount);
1126 struct list_head *p = &start_mm->mmlist; 1133 struct list_head *p = &start_mm->mmlist;
@@ -1131,7 +1138,7 @@ static int try_to_unuse(unsigned int type)
1131 atomic_inc(&new_start_mm->mm_users); 1138 atomic_inc(&new_start_mm->mm_users);
1132 atomic_inc(&prev_mm->mm_users); 1139 atomic_inc(&prev_mm->mm_users);
1133 spin_lock(&mmlist_lock); 1140 spin_lock(&mmlist_lock);
1134 while (swap_count(*swap_map) && !retval && !shmem && 1141 while (swap_count(*swap_map) && !retval &&
1135 (p = p->next) != &start_mm->mmlist) { 1142 (p = p->next) != &start_mm->mmlist) {
1136 mm = list_entry(p, struct mm_struct, mmlist); 1143 mm = list_entry(p, struct mm_struct, mmlist);
1137 if (!atomic_inc_not_zero(&mm->mm_users)) 1144 if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1145,10 +1152,9 @@ static int try_to_unuse(unsigned int type)
1145 swcount = *swap_map; 1152 swcount = *swap_map;
1146 if (!swap_count(swcount)) /* any usage ? */ 1153 if (!swap_count(swcount)) /* any usage ? */
1147 ; 1154 ;
1148 else if (mm == &init_mm) { 1155 else if (mm == &init_mm)
1149 set_start_mm = 1; 1156 set_start_mm = 1;
1150 shmem = shmem_unuse(entry, page); 1157 else
1151 } else
1152 retval = unuse_mm(mm, entry, page); 1158 retval = unuse_mm(mm, entry, page);
1153 1159
1154 if (set_start_mm && *swap_map < swcount) { 1160 if (set_start_mm && *swap_map < swcount) {
@@ -1164,13 +1170,6 @@ static int try_to_unuse(unsigned int type)
1164 mmput(start_mm); 1170 mmput(start_mm);
1165 start_mm = new_start_mm; 1171 start_mm = new_start_mm;
1166 } 1172 }
1167 if (shmem) {
1168 /* page has already been unlocked and released */
1169 if (shmem > 0)
1170 continue;
1171 retval = shmem;
1172 break;
1173 }
1174 if (retval) { 1173 if (retval) {
1175 unlock_page(page); 1174 unlock_page(page);
1176 page_cache_release(page); 1175 page_cache_release(page);
@@ -1178,30 +1177,6 @@ static int try_to_unuse(unsigned int type)
1178 } 1177 }
1179 1178
1180 /* 1179 /*
1181 * How could swap count reach 0x7ffe ?
1182 * There's no way to repeat a swap page within an mm
1183 * (except in shmem, where it's the shared object which takes
1184 * the reference count)?
1185 * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
1186 * short is too small....)
1187 * If that's wrong, then we should worry more about
1188 * exit_mmap() and do_munmap() cases described above:
1189 * we might be resetting SWAP_MAP_MAX too early here.
1190 * We know "Undead"s can happen, they're okay, so don't
1191 * report them; but do report if we reset SWAP_MAP_MAX.
1192 */
1193 /* We might release the lock_page() in unuse_mm(). */
1194 if (!PageSwapCache(page) || page_private(page) != entry.val)
1195 goto retry;
1196
1197 if (swap_count(*swap_map) == SWAP_MAP_MAX) {
1198 spin_lock(&swap_lock);
1199 *swap_map = encode_swapmap(0, true);
1200 spin_unlock(&swap_lock);
1201 reset_overflow = 1;
1202 }
1203
1204 /*
1205 * If a reference remains (rare), we would like to leave 1180 * If a reference remains (rare), we would like to leave
1206 * the page in the swap cache; but try_to_unmap could 1181 * the page in the swap cache; but try_to_unmap could
1207 * then re-duplicate the entry once we drop page lock, 1182 * then re-duplicate the entry once we drop page lock,
@@ -1213,6 +1188,12 @@ static int try_to_unuse(unsigned int type)
1213 * read from disk into another page. Splitting into two 1188 * read from disk into another page. Splitting into two
1214 * pages would be incorrect if swap supported "shared 1189 * pages would be incorrect if swap supported "shared
1215 * private" pages, but they are handled by tmpfs files. 1190 * private" pages, but they are handled by tmpfs files.
1191 *
1192 * Given how unuse_vma() targets one particular offset
1193 * in an anon_vma, once the anon_vma has been determined,
1194 * this splitting happens to be just what is needed to
1195 * handle where KSM pages have been swapped out: re-reading
1196 * is unnecessarily slow, but we can fix that later on.
1216 */ 1197 */
1217 if (swap_count(*swap_map) && 1198 if (swap_count(*swap_map) &&
1218 PageDirty(page) && PageSwapCache(page)) { 1199 PageDirty(page) && PageSwapCache(page)) {
@@ -1242,7 +1223,6 @@ static int try_to_unuse(unsigned int type)
1242 * mark page dirty so shrink_page_list will preserve it. 1223 * mark page dirty so shrink_page_list will preserve it.
1243 */ 1224 */
1244 SetPageDirty(page); 1225 SetPageDirty(page);
1245retry:
1246 unlock_page(page); 1226 unlock_page(page);
1247 page_cache_release(page); 1227 page_cache_release(page);
1248 1228
@@ -1254,10 +1234,6 @@ retry:
1254 } 1234 }
1255 1235
1256 mmput(start_mm); 1236 mmput(start_mm);
1257 if (reset_overflow) {
1258 printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
1259 swap_overflow = 0;
1260 }
1261 return retval; 1237 return retval;
1262} 1238}
1263 1239
@@ -1270,10 +1246,10 @@ retry:
1270static void drain_mmlist(void) 1246static void drain_mmlist(void)
1271{ 1247{
1272 struct list_head *p, *next; 1248 struct list_head *p, *next;
1273 unsigned int i; 1249 unsigned int type;
1274 1250
1275 for (i = 0; i < nr_swapfiles; i++) 1251 for (type = 0; type < nr_swapfiles; type++)
1276 if (swap_info[i].inuse_pages) 1252 if (swap_info[type]->inuse_pages)
1277 return; 1253 return;
1278 spin_lock(&mmlist_lock); 1254 spin_lock(&mmlist_lock);
1279 list_for_each_safe(p, next, &init_mm.mmlist) 1255 list_for_each_safe(p, next, &init_mm.mmlist)
@@ -1283,12 +1259,23 @@ static void drain_mmlist(void)
1283 1259
1284/* 1260/*
1285 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which 1261 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
1286 * corresponds to page offset `offset'. 1262 * corresponds to page offset for the specified swap entry.
1263 * Note that the type of this function is sector_t, but it returns page offset
1264 * into the bdev, not sector offset.
1287 */ 1265 */
1288sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) 1266static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1289{ 1267{
1290 struct swap_extent *se = sis->curr_swap_extent; 1268 struct swap_info_struct *sis;
1291 struct swap_extent *start_se = se; 1269 struct swap_extent *start_se;
1270 struct swap_extent *se;
1271 pgoff_t offset;
1272
1273 sis = swap_info[swp_type(entry)];
1274 *bdev = sis->bdev;
1275
1276 offset = swp_offset(entry);
1277 start_se = sis->curr_swap_extent;
1278 se = start_se;
1292 1279
1293 for ( ; ; ) { 1280 for ( ; ; ) {
1294 struct list_head *lh; 1281 struct list_head *lh;
@@ -1298,40 +1285,31 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
1298 return se->start_block + (offset - se->start_page); 1285 return se->start_block + (offset - se->start_page);
1299 } 1286 }
1300 lh = se->list.next; 1287 lh = se->list.next;
1301 if (lh == &sis->extent_list)
1302 lh = lh->next;
1303 se = list_entry(lh, struct swap_extent, list); 1288 se = list_entry(lh, struct swap_extent, list);
1304 sis->curr_swap_extent = se; 1289 sis->curr_swap_extent = se;
1305 BUG_ON(se == start_se); /* It *must* be present */ 1290 BUG_ON(se == start_se); /* It *must* be present */
1306 } 1291 }
1307} 1292}
1308 1293
1309#ifdef CONFIG_HIBERNATION
1310/* 1294/*
1311 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev 1295 * Returns the page offset into bdev for the specified page's swap entry.
1312 * corresponding to given index in swap_info (swap type).
1313 */ 1296 */
1314sector_t swapdev_block(int swap_type, pgoff_t offset) 1297sector_t map_swap_page(struct page *page, struct block_device **bdev)
1315{ 1298{
1316 struct swap_info_struct *sis; 1299 swp_entry_t entry;
1317 1300 entry.val = page_private(page);
1318 if (swap_type >= nr_swapfiles) 1301 return map_swap_entry(entry, bdev);
1319 return 0;
1320
1321 sis = swap_info + swap_type;
1322 return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0;
1323} 1302}
1324#endif /* CONFIG_HIBERNATION */
1325 1303
1326/* 1304/*
1327 * Free all of a swapdev's extent information 1305 * Free all of a swapdev's extent information
1328 */ 1306 */
1329static void destroy_swap_extents(struct swap_info_struct *sis) 1307static void destroy_swap_extents(struct swap_info_struct *sis)
1330{ 1308{
1331 while (!list_empty(&sis->extent_list)) { 1309 while (!list_empty(&sis->first_swap_extent.list)) {
1332 struct swap_extent *se; 1310 struct swap_extent *se;
1333 1311
1334 se = list_entry(sis->extent_list.next, 1312 se = list_entry(sis->first_swap_extent.list.next,
1335 struct swap_extent, list); 1313 struct swap_extent, list);
1336 list_del(&se->list); 1314 list_del(&se->list);
1337 kfree(se); 1315 kfree(se);
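Editor's note on map_swap_entry() above: it replaces the old map_swap_page(sis, offset), taking a swp_entry_t, handing back the block device, and walking the extent list from curr_swap_extent until it finds the extent covering the offset, returning start_block + (offset - start_page). With first_swap_extent embedded in the structure, the old "skip the list head" test disappears. A self-contained model of the lookup over a plain array follows; the curr_swap_extent cache and the circular list are omitted.

#include <stdio.h>

struct swap_extent {
        unsigned long start_page;
        unsigned long nr_pages;
        unsigned long long start_block;
};

/* A hypothetical swap file split into three extents on disk. */
static const struct swap_extent extents[] = {
        { .start_page = 0,   .nr_pages = 100, .start_block = 5000 },
        { .start_page = 100, .nr_pages = 50,  .start_block = 9000 },
        { .start_page = 150, .nr_pages = 250, .start_block = 1234 },
};

/* Model of the extent walk in map_swap_entry(): page offset -> disk block. */
static unsigned long long map_offset(unsigned long offset)
{
        for (size_t i = 0; i < sizeof(extents) / sizeof(extents[0]); i++) {
                const struct swap_extent *se = &extents[i];

                if (offset >= se->start_page && offset < se->start_page + se->nr_pages)
                        return se->start_block + (offset - se->start_page);
        }
        return 0;       /* the kernel BUG()s instead: the extent must exist */
}

int main(void)
{
        printf("offset 7   -> block %llu\n", map_offset(7));    /* 5007 */
        printf("offset 120 -> block %llu\n", map_offset(120));  /* 9020 */
        printf("offset 160 -> block %llu\n", map_offset(160));  /* 1244 */
        return 0;
}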
@@ -1352,8 +1330,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1352 struct swap_extent *new_se; 1330 struct swap_extent *new_se;
1353 struct list_head *lh; 1331 struct list_head *lh;
1354 1332
1355 lh = sis->extent_list.prev; /* The highest page extent */ 1333 if (start_page == 0) {
1356 if (lh != &sis->extent_list) { 1334 se = &sis->first_swap_extent;
1335 sis->curr_swap_extent = se;
1336 se->start_page = 0;
1337 se->nr_pages = nr_pages;
1338 se->start_block = start_block;
1339 return 1;
1340 } else {
1341 lh = sis->first_swap_extent.list.prev; /* Highest extent */
1357 se = list_entry(lh, struct swap_extent, list); 1342 se = list_entry(lh, struct swap_extent, list);
1358 BUG_ON(se->start_page + se->nr_pages != start_page); 1343 BUG_ON(se->start_page + se->nr_pages != start_page);
1359 if (se->start_block + se->nr_pages == start_block) { 1344 if (se->start_block + se->nr_pages == start_block) {
@@ -1373,7 +1358,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1373 new_se->nr_pages = nr_pages; 1358 new_se->nr_pages = nr_pages;
1374 new_se->start_block = start_block; 1359 new_se->start_block = start_block;
1375 1360
1376 list_add_tail(&new_se->list, &sis->extent_list); 1361 list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1377 return 1; 1362 return 1;
1378} 1363}
1379 1364
@@ -1425,7 +1410,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1425 if (S_ISBLK(inode->i_mode)) { 1410 if (S_ISBLK(inode->i_mode)) {
1426 ret = add_swap_extent(sis, 0, sis->max, 0); 1411 ret = add_swap_extent(sis, 0, sis->max, 0);
1427 *span = sis->pages; 1412 *span = sis->pages;
1428 goto done; 1413 goto out;
1429 } 1414 }
1430 1415
1431 blkbits = inode->i_blkbits; 1416 blkbits = inode->i_blkbits;
@@ -1496,25 +1481,22 @@ reprobe:
1496 sis->max = page_no; 1481 sis->max = page_no;
1497 sis->pages = page_no - 1; 1482 sis->pages = page_no - 1;
1498 sis->highest_bit = page_no - 1; 1483 sis->highest_bit = page_no - 1;
1499done: 1484out:
1500 sis->curr_swap_extent = list_entry(sis->extent_list.prev, 1485 return ret;
1501 struct swap_extent, list);
1502 goto out;
1503bad_bmap: 1486bad_bmap:
1504 printk(KERN_ERR "swapon: swapfile has holes\n"); 1487 printk(KERN_ERR "swapon: swapfile has holes\n");
1505 ret = -EINVAL; 1488 ret = -EINVAL;
1506out: 1489 goto out;
1507 return ret;
1508} 1490}
1509 1491
1510SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 1492SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1511{ 1493{
1512 struct swap_info_struct * p = NULL; 1494 struct swap_info_struct *p = NULL;
1513 unsigned short *swap_map; 1495 unsigned char *swap_map;
1514 struct file *swap_file, *victim; 1496 struct file *swap_file, *victim;
1515 struct address_space *mapping; 1497 struct address_space *mapping;
1516 struct inode *inode; 1498 struct inode *inode;
1517 char * pathname; 1499 char *pathname;
1518 int i, type, prev; 1500 int i, type, prev;
1519 int err; 1501 int err;
1520 1502
@@ -1535,8 +1517,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1535 mapping = victim->f_mapping; 1517 mapping = victim->f_mapping;
1536 prev = -1; 1518 prev = -1;
1537 spin_lock(&swap_lock); 1519 spin_lock(&swap_lock);
1538 for (type = swap_list.head; type >= 0; type = swap_info[type].next) { 1520 for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
1539 p = swap_info + type; 1521 p = swap_info[type];
1540 if (p->flags & SWP_WRITEOK) { 1522 if (p->flags & SWP_WRITEOK) {
1541 if (p->swap_file->f_mapping == mapping) 1523 if (p->swap_file->f_mapping == mapping)
1542 break; 1524 break;
@@ -1555,18 +1537,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1555 spin_unlock(&swap_lock); 1537 spin_unlock(&swap_lock);
1556 goto out_dput; 1538 goto out_dput;
1557 } 1539 }
1558 if (prev < 0) { 1540 if (prev < 0)
1559 swap_list.head = p->next; 1541 swap_list.head = p->next;
1560 } else { 1542 else
1561 swap_info[prev].next = p->next; 1543 swap_info[prev]->next = p->next;
1562 }
1563 if (type == swap_list.next) { 1544 if (type == swap_list.next) {
1564 /* just pick something that's safe... */ 1545 /* just pick something that's safe... */
1565 swap_list.next = swap_list.head; 1546 swap_list.next = swap_list.head;
1566 } 1547 }
1567 if (p->prio < 0) { 1548 if (p->prio < 0) {
1568 for (i = p->next; i >= 0; i = swap_info[i].next) 1549 for (i = p->next; i >= 0; i = swap_info[i]->next)
1569 swap_info[i].prio = p->prio--; 1550 swap_info[i]->prio = p->prio--;
1570 least_priority++; 1551 least_priority++;
1571 } 1552 }
1572 nr_swap_pages -= p->pages; 1553 nr_swap_pages -= p->pages;
@@ -1584,16 +1565,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1584 if (p->prio < 0) 1565 if (p->prio < 0)
1585 p->prio = --least_priority; 1566 p->prio = --least_priority;
1586 prev = -1; 1567 prev = -1;
1587 for (i = swap_list.head; i >= 0; i = swap_info[i].next) { 1568 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1588 if (p->prio >= swap_info[i].prio) 1569 if (p->prio >= swap_info[i]->prio)
1589 break; 1570 break;
1590 prev = i; 1571 prev = i;
1591 } 1572 }
1592 p->next = i; 1573 p->next = i;
1593 if (prev < 0) 1574 if (prev < 0)
1594 swap_list.head = swap_list.next = p - swap_info; 1575 swap_list.head = swap_list.next = type;
1595 else 1576 else
1596 swap_info[prev].next = p - swap_info; 1577 swap_info[prev]->next = type;
1597 nr_swap_pages += p->pages; 1578 nr_swap_pages += p->pages;
1598 total_swap_pages += p->pages; 1579 total_swap_pages += p->pages;
1599 p->flags |= SWP_WRITEOK; 1580 p->flags |= SWP_WRITEOK;
@@ -1606,6 +1587,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1606 up_write(&swap_unplug_sem); 1587 up_write(&swap_unplug_sem);
1607 1588
1608 destroy_swap_extents(p); 1589 destroy_swap_extents(p);
1590 if (p->flags & SWP_CONTINUED)
1591 free_swap_count_continuations(p);
1592
1609 mutex_lock(&swapon_mutex); 1593 mutex_lock(&swapon_mutex);
1610 spin_lock(&swap_lock); 1594 spin_lock(&swap_lock);
1611 drain_mmlist(); 1595 drain_mmlist();
@@ -1653,8 +1637,8 @@ out:
1653/* iterator */ 1637/* iterator */
1654static void *swap_start(struct seq_file *swap, loff_t *pos) 1638static void *swap_start(struct seq_file *swap, loff_t *pos)
1655{ 1639{
1656 struct swap_info_struct *ptr = swap_info; 1640 struct swap_info_struct *si;
1657 int i; 1641 int type;
1658 loff_t l = *pos; 1642 loff_t l = *pos;
1659 1643
1660 mutex_lock(&swapon_mutex); 1644 mutex_lock(&swapon_mutex);
@@ -1662,11 +1646,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
1662 if (!l) 1646 if (!l)
1663 return SEQ_START_TOKEN; 1647 return SEQ_START_TOKEN;
1664 1648
1665 for (i = 0; i < nr_swapfiles; i++, ptr++) { 1649 for (type = 0; type < nr_swapfiles; type++) {
1666 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1650 smp_rmb(); /* read nr_swapfiles before swap_info[type] */
1651 si = swap_info[type];
1652 if (!(si->flags & SWP_USED) || !si->swap_map)
1667 continue; 1653 continue;
1668 if (!--l) 1654 if (!--l)
1669 return ptr; 1655 return si;
1670 } 1656 }
1671 1657
1672 return NULL; 1658 return NULL;
@@ -1674,21 +1660,21 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
1674 1660
1675static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 1661static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1676{ 1662{
1677 struct swap_info_struct *ptr; 1663 struct swap_info_struct *si = v;
1678 struct swap_info_struct *endptr = swap_info + nr_swapfiles; 1664 int type;
1679 1665
1680 if (v == SEQ_START_TOKEN) 1666 if (v == SEQ_START_TOKEN)
1681 ptr = swap_info; 1667 type = 0;
1682 else { 1668 else
1683 ptr = v; 1669 type = si->type + 1;
1684 ptr++;
1685 }
1686 1670
1687 for (; ptr < endptr; ptr++) { 1671 for (; type < nr_swapfiles; type++) {
1688 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1672 smp_rmb(); /* read nr_swapfiles before swap_info[type] */
1673 si = swap_info[type];
1674 if (!(si->flags & SWP_USED) || !si->swap_map)
1689 continue; 1675 continue;
1690 ++*pos; 1676 ++*pos;
1691 return ptr; 1677 return si;
1692 } 1678 }
1693 1679
1694 return NULL; 1680 return NULL;
@@ -1701,24 +1687,24 @@ static void swap_stop(struct seq_file *swap, void *v)
1701 1687
1702static int swap_show(struct seq_file *swap, void *v) 1688static int swap_show(struct seq_file *swap, void *v)
1703{ 1689{
1704 struct swap_info_struct *ptr = v; 1690 struct swap_info_struct *si = v;
1705 struct file *file; 1691 struct file *file;
1706 int len; 1692 int len;
1707 1693
1708 if (ptr == SEQ_START_TOKEN) { 1694 if (si == SEQ_START_TOKEN) {
1709 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); 1695 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1710 return 0; 1696 return 0;
1711 } 1697 }
1712 1698
1713 file = ptr->swap_file; 1699 file = si->swap_file;
1714 len = seq_path(swap, &file->f_path, " \t\n\\"); 1700 len = seq_path(swap, &file->f_path, " \t\n\\");
1715 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1701 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1716 len < 40 ? 40 - len : 1, " ", 1702 len < 40 ? 40 - len : 1, " ",
1717 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? 1703 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1718 "partition" : "file\t", 1704 "partition" : "file\t",
1719 ptr->pages << (PAGE_SHIFT - 10), 1705 si->pages << (PAGE_SHIFT - 10),
1720 ptr->inuse_pages << (PAGE_SHIFT - 10), 1706 si->inuse_pages << (PAGE_SHIFT - 10),
1721 ptr->prio); 1707 si->prio);
1722 return 0; 1708 return 0;
1723} 1709}
1724 1710
@@ -1765,7 +1751,7 @@ late_initcall(max_swapfiles_check);
1765 */ 1751 */
1766SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 1752SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1767{ 1753{
1768 struct swap_info_struct * p; 1754 struct swap_info_struct *p;
1769 char *name = NULL; 1755 char *name = NULL;
1770 struct block_device *bdev = NULL; 1756 struct block_device *bdev = NULL;
1771 struct file *swap_file = NULL; 1757 struct file *swap_file = NULL;
@@ -1779,30 +1765,52 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1779 sector_t span; 1765 sector_t span;
1780 unsigned long maxpages = 1; 1766 unsigned long maxpages = 1;
1781 unsigned long swapfilepages; 1767 unsigned long swapfilepages;
1782 unsigned short *swap_map = NULL; 1768 unsigned char *swap_map = NULL;
1783 struct page *page = NULL; 1769 struct page *page = NULL;
1784 struct inode *inode = NULL; 1770 struct inode *inode = NULL;
1785 int did_down = 0; 1771 int did_down = 0;
1786 1772
1787 if (!capable(CAP_SYS_ADMIN)) 1773 if (!capable(CAP_SYS_ADMIN))
1788 return -EPERM; 1774 return -EPERM;
1775
1776 p = kzalloc(sizeof(*p), GFP_KERNEL);
1777 if (!p)
1778 return -ENOMEM;
1779
1789 spin_lock(&swap_lock); 1780 spin_lock(&swap_lock);
1790 p = swap_info; 1781 for (type = 0; type < nr_swapfiles; type++) {
1791 for (type = 0 ; type < nr_swapfiles ; type++,p++) 1782 if (!(swap_info[type]->flags & SWP_USED))
1792 if (!(p->flags & SWP_USED))
1793 break; 1783 break;
1784 }
1794 error = -EPERM; 1785 error = -EPERM;
1795 if (type >= MAX_SWAPFILES) { 1786 if (type >= MAX_SWAPFILES) {
1796 spin_unlock(&swap_lock); 1787 spin_unlock(&swap_lock);
1788 kfree(p);
1797 goto out; 1789 goto out;
1798 } 1790 }
1799 if (type >= nr_swapfiles) 1791 if (type >= nr_swapfiles) {
1800 nr_swapfiles = type+1; 1792 p->type = type;
1801 memset(p, 0, sizeof(*p)); 1793 swap_info[type] = p;
1802 INIT_LIST_HEAD(&p->extent_list); 1794 /*
1795 * Write swap_info[type] before nr_swapfiles, in case a
1796 * racing procfs swap_start() or swap_next() is reading them.
1797 * (We never shrink nr_swapfiles, we never free this entry.)
1798 */
1799 smp_wmb();
1800 nr_swapfiles++;
1801 } else {
1802 kfree(p);
1803 p = swap_info[type];
1804 /*
1805 * Do not memset this entry: a racing procfs swap_next()
1806 * would be relying on p->type to remain valid.
1807 */
1808 }
1809 INIT_LIST_HEAD(&p->first_swap_extent.list);
1803 p->flags = SWP_USED; 1810 p->flags = SWP_USED;
1804 p->next = -1; 1811 p->next = -1;
1805 spin_unlock(&swap_lock); 1812 spin_unlock(&swap_lock);
1813
1806 name = getname(specialfile); 1814 name = getname(specialfile);
1807 error = PTR_ERR(name); 1815 error = PTR_ERR(name);
1808 if (IS_ERR(name)) { 1816 if (IS_ERR(name)) {
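Editor's note on the swapon() hunk above: each swap_info_struct is now allocated separately and published through the swap_info[] pointer array. The entry is stored first, then smp_wmb(), then nr_swapfiles is raised, pairing with the smp_rmb() after reading nr_swapfiles in swap_start()/swap_next(), so a racing reader never sees an index below nr_swapfiles whose pointer is not yet valid. Below is a userspace analogue of that publish/consume ordering using C11 release/acquire in place of the kernel barriers; the kernel uses smp_wmb()/smp_rmb(), not these primitives, and the names here are invented.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_AREAS 32

struct area { int type; };

static struct area *areas[MAX_AREAS];
static atomic_int nr_areas;             /* plays the role of nr_swapfiles */

/* Writer side: fill the slot, then publish by raising the count (release). */
static void publish_area(int type)
{
        struct area *a = malloc(sizeof(*a));

        a->type = type;
        areas[type] = a;                /* store the pointer first... */
        atomic_store_explicit(&nr_areas, type + 1,
                              memory_order_release);    /* ...then raise the count */
}

/* Reader side: read the count (acquire); slots below it are then valid. */
static void dump_areas(void)
{
        int n = atomic_load_explicit(&nr_areas, memory_order_acquire);

        for (int type = 0; type < n; type++)
                printf("area %d present (type=%d)\n", type, areas[type]->type);
}

int main(void)
{
        publish_area(0);
        publish_area(1);
        dump_areas();
        return 0;
}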
@@ -1822,7 +1830,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1822 1830
1823 error = -EBUSY; 1831 error = -EBUSY;
1824 for (i = 0; i < nr_swapfiles; i++) { 1832 for (i = 0; i < nr_swapfiles; i++) {
1825 struct swap_info_struct *q = &swap_info[i]; 1833 struct swap_info_struct *q = swap_info[i];
1826 1834
1827 if (i == type || !q->swap_file) 1835 if (i == type || !q->swap_file)
1828 continue; 1836 continue;
@@ -1897,6 +1905,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1897 1905
1898 p->lowest_bit = 1; 1906 p->lowest_bit = 1;
1899 p->cluster_next = 1; 1907 p->cluster_next = 1;
1908 p->cluster_nr = 0;
1900 1909
1901 /* 1910 /*
1902 * Find out how many pages are allowed for a single swap 1911 * Find out how many pages are allowed for a single swap
@@ -1932,13 +1941,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1932 goto bad_swap; 1941 goto bad_swap;
1933 1942
1934 /* OK, set up the swap map and apply the bad block list */ 1943 /* OK, set up the swap map and apply the bad block list */
1935 swap_map = vmalloc(maxpages * sizeof(short)); 1944 swap_map = vmalloc(maxpages);
1936 if (!swap_map) { 1945 if (!swap_map) {
1937 error = -ENOMEM; 1946 error = -ENOMEM;
1938 goto bad_swap; 1947 goto bad_swap;
1939 } 1948 }
1940 1949
1941 memset(swap_map, 0, maxpages * sizeof(short)); 1950 memset(swap_map, 0, maxpages);
1942 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1951 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1943 int page_nr = swap_header->info.badpages[i]; 1952 int page_nr = swap_header->info.badpages[i];
1944 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { 1953 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
@@ -2003,18 +2012,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2003 2012
2004 /* insert swap space into swap_list: */ 2013 /* insert swap space into swap_list: */
2005 prev = -1; 2014 prev = -1;
2006 for (i = swap_list.head; i >= 0; i = swap_info[i].next) { 2015 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
2007 if (p->prio >= swap_info[i].prio) { 2016 if (p->prio >= swap_info[i]->prio)
2008 break; 2017 break;
2009 }
2010 prev = i; 2018 prev = i;
2011 } 2019 }
2012 p->next = i; 2020 p->next = i;
2013 if (prev < 0) { 2021 if (prev < 0)
2014 swap_list.head = swap_list.next = p - swap_info; 2022 swap_list.head = swap_list.next = type;
2015 } else { 2023 else
2016 swap_info[prev].next = p - swap_info; 2024 swap_info[prev]->next = type;
2017 }
2018 spin_unlock(&swap_lock); 2025 spin_unlock(&swap_lock);
2019 mutex_unlock(&swapon_mutex); 2026 mutex_unlock(&swapon_mutex);
2020 error = 0; 2027 error = 0;
@@ -2051,15 +2058,15 @@ out:
2051 2058
2052void si_swapinfo(struct sysinfo *val) 2059void si_swapinfo(struct sysinfo *val)
2053{ 2060{
2054 unsigned int i; 2061 unsigned int type;
2055 unsigned long nr_to_be_unused = 0; 2062 unsigned long nr_to_be_unused = 0;
2056 2063
2057 spin_lock(&swap_lock); 2064 spin_lock(&swap_lock);
2058 for (i = 0; i < nr_swapfiles; i++) { 2065 for (type = 0; type < nr_swapfiles; type++) {
2059 if (!(swap_info[i].flags & SWP_USED) || 2066 struct swap_info_struct *si = swap_info[type];
2060 (swap_info[i].flags & SWP_WRITEOK)) 2067
2061 continue; 2068 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2062 nr_to_be_unused += swap_info[i].inuse_pages; 2069 nr_to_be_unused += si->inuse_pages;
2063 } 2070 }
2064 val->freeswap = nr_swap_pages + nr_to_be_unused; 2071 val->freeswap = nr_swap_pages + nr_to_be_unused;
2065 val->totalswap = total_swap_pages + nr_to_be_unused; 2072 val->totalswap = total_swap_pages + nr_to_be_unused;
@@ -2069,101 +2076,107 @@ void si_swapinfo(struct sysinfo *val)
2069/* 2076/*
2070 * Verify that a swap entry is valid and increment its swap map count. 2077 * Verify that a swap entry is valid and increment its swap map count.
2071 * 2078 *
2072 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
2073 * "permanent", but will be reclaimed by the next swapoff.
2074 * Returns error code in following case. 2079 * Returns error code in following case.
2075 * - success -> 0 2080 * - success -> 0
2076 * - swp_entry is invalid -> EINVAL 2081 * - swp_entry is invalid -> EINVAL
2077 * - swp_entry is migration entry -> EINVAL 2082 * - swp_entry is migration entry -> EINVAL
2078 * - swap-cache reference is requested but there is already one. -> EEXIST 2083 * - swap-cache reference is requested but there is already one. -> EEXIST
2079 * - swap-cache reference is requested but the entry is not used. -> ENOENT 2084 * - swap-cache reference is requested but the entry is not used. -> ENOENT
2085 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
2080 */ 2086 */
2081static int __swap_duplicate(swp_entry_t entry, bool cache) 2087static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2082{ 2088{
2083 struct swap_info_struct * p; 2089 struct swap_info_struct *p;
2084 unsigned long offset, type; 2090 unsigned long offset, type;
2085 int result = -EINVAL; 2091 unsigned char count;
2086 int count; 2092 unsigned char has_cache;
2087 bool has_cache; 2093 int err = -EINVAL;
2088 2094
2089 if (non_swap_entry(entry)) 2095 if (non_swap_entry(entry))
2090 return -EINVAL; 2096 goto out;
2091 2097
2092 type = swp_type(entry); 2098 type = swp_type(entry);
2093 if (type >= nr_swapfiles) 2099 if (type >= nr_swapfiles)
2094 goto bad_file; 2100 goto bad_file;
2095 p = type + swap_info; 2101 p = swap_info[type];
2096 offset = swp_offset(entry); 2102 offset = swp_offset(entry);
2097 2103
2098 spin_lock(&swap_lock); 2104 spin_lock(&swap_lock);
2099
2100 if (unlikely(offset >= p->max)) 2105 if (unlikely(offset >= p->max))
2101 goto unlock_out; 2106 goto unlock_out;
2102 2107
2103 count = swap_count(p->swap_map[offset]); 2108 count = p->swap_map[offset];
2104 has_cache = swap_has_cache(p->swap_map[offset]); 2109 has_cache = count & SWAP_HAS_CACHE;
2110 count &= ~SWAP_HAS_CACHE;
2111 err = 0;
2105 2112
2106 if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ 2113 if (usage == SWAP_HAS_CACHE) {
2107 2114
2108 /* set SWAP_HAS_CACHE if there is no cache and entry is used */ 2115 /* set SWAP_HAS_CACHE if there is no cache and entry is used */
2109 if (!has_cache && count) { 2116 if (!has_cache && count)
2110 p->swap_map[offset] = encode_swapmap(count, true); 2117 has_cache = SWAP_HAS_CACHE;
2111 result = 0; 2118 else if (has_cache) /* someone else added cache */
2112 } else if (has_cache) /* someone added cache */ 2119 err = -EEXIST;
2113 result = -EEXIST; 2120 else /* no users remaining */
2114 else if (!count) /* no users */ 2121 err = -ENOENT;
2115 result = -ENOENT;
2116 2122
2117 } else if (count || has_cache) { 2123 } else if (count || has_cache) {
2118 if (count < SWAP_MAP_MAX - 1) { 2124
2119 p->swap_map[offset] = encode_swapmap(count + 1, 2125 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
2120 has_cache); 2126 count += usage;
2121 result = 0; 2127 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
2122 } else if (count <= SWAP_MAP_MAX) { 2128 err = -EINVAL;
2123 if (swap_overflow++ < 5) 2129 else if (swap_count_continued(p, offset, count))
2124 printk(KERN_WARNING 2130 count = COUNT_CONTINUED;
2125 "swap_dup: swap entry overflow\n"); 2131 else
2126 p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, 2132 err = -ENOMEM;
2127 has_cache);
2128 result = 0;
2129 }
2130 } else 2133 } else
2131 result = -ENOENT; /* unused swap entry */ 2134 err = -ENOENT; /* unused swap entry */
2135
2136 p->swap_map[offset] = count | has_cache;
2137
2132unlock_out: 2138unlock_out:
2133 spin_unlock(&swap_lock); 2139 spin_unlock(&swap_lock);
2134out: 2140out:
2135 return result; 2141 return err;
2136 2142
2137bad_file: 2143bad_file:
2138 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); 2144 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
2139 goto out; 2145 goto out;
2140} 2146}
2147
2148/*
2149 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
2150 * (in which case its reference count is never incremented).
2151 */
2152void swap_shmem_alloc(swp_entry_t entry)
2153{
2154 __swap_duplicate(entry, SWAP_MAP_SHMEM);
2155}
2156
2141/* 2157/*
2142 * increase reference count of swap entry by 1. 2158 * increase reference count of swap entry by 1.
2143 */ 2159 */
2144void swap_duplicate(swp_entry_t entry) 2160int swap_duplicate(swp_entry_t entry)
2145{ 2161{
2146 __swap_duplicate(entry, SWAP_MAP); 2162 int err = 0;
2163
2164 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2165 err = add_swap_count_continuation(entry, GFP_ATOMIC);
2166 return err;
2147} 2167}
2148 2168
2149/* 2169/*
2150 * @entry: swap entry for which we allocate swap cache. 2170 * @entry: swap entry for which we allocate swap cache.
2151 * 2171 *
2152 * Called when allocating swap cache for exising swap entry, 2172 * Called when allocating swap cache for existing swap entry,
2153 * This can return error codes. Returns 0 at success. 2173 * This can return error codes. Returns 0 at success.
2154 * -EBUSY means there is a swap cache. 2174 * -EBUSY means there is a swap cache.
2155 * Note: return code is different from swap_duplicate(). 2175 * Note: return code is different from swap_duplicate().
2156 */ 2176 */
2157int swapcache_prepare(swp_entry_t entry) 2177int swapcache_prepare(swp_entry_t entry)
2158{ 2178{
2159 return __swap_duplicate(entry, SWAP_CACHE); 2179 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2160}
2161
2162
2163struct swap_info_struct *
2164get_swap_info_struct(unsigned type)
2165{
2166 return &swap_info[type];
2167} 2180}
2168 2181
2169/* 2182/*
@@ -2181,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2181 if (!our_page_cluster) /* no readahead */ 2194 if (!our_page_cluster) /* no readahead */
2182 return 0; 2195 return 0;
2183 2196
2184 si = &swap_info[swp_type(entry)]; 2197 si = swap_info[swp_type(entry)];
2185 target = swp_offset(entry); 2198 target = swp_offset(entry);
2186 base = (target >> our_page_cluster) << our_page_cluster; 2199 base = (target >> our_page_cluster) << our_page_cluster;
2187 end = base + (1 << our_page_cluster); 2200 end = base + (1 << our_page_cluster);
@@ -2217,3 +2230,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2217 *offset = ++toff; 2230 *offset = ++toff;
2218 return nr_pages? ++nr_pages: 0; 2231 return nr_pages? ++nr_pages: 0;
2219} 2232}
2233
2234/*
2235 * add_swap_count_continuation - called when a swap count is duplicated
2236 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
2237 * page of the original vmalloc'ed swap_map, to hold the continuation count
2238 * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
2239 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
2240 *
2241 * These continuation pages are seldom referenced: the common paths all work
2242 * on the original swap_map, only referring to a continuation page when the
2243 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
2244 *
2245 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
2246 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
2247 * can be called after dropping locks.
2248 */
2249int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2250{
2251 struct swap_info_struct *si;
2252 struct page *head;
2253 struct page *page;
2254 struct page *list_page;
2255 pgoff_t offset;
2256 unsigned char count;
2257
2258 /*
2259 * When debugging, it's easier to use __GFP_ZERO here; but it's better
2260 * for latency not to zero a page while GFP_ATOMIC and holding locks.
2261 */
2262 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
2263
2264 si = swap_info_get(entry);
2265 if (!si) {
2266 /*
2267 * An acceptable race has occurred since the failing
2268 * __swap_duplicate(): the swap entry has been freed,
2269 * perhaps even the whole swap_map cleared for swapoff.
2270 */
2271 goto outer;
2272 }
2273
2274 offset = swp_offset(entry);
2275 count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
2276
2277 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
2278 /*
2279 * The higher the swap count, the more likely it is that tasks
2280 * will race to add swap count continuation: we need to avoid
2281 * over-provisioning.
2282 */
2283 goto out;
2284 }
2285
2286 if (!page) {
2287 spin_unlock(&swap_lock);
2288 return -ENOMEM;
2289 }
2290
2291 /*
2292 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
2293 * no architecture is using highmem pages for kernel pagetables: so it
2294 * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
2295 */
2296 head = vmalloc_to_page(si->swap_map + offset);
2297 offset &= ~PAGE_MASK;
2298
2299 /*
2300 * Page allocation does not initialize the page's lru field,
2301 * but it does always reset its private field.
2302 */
2303 if (!page_private(head)) {
2304 BUG_ON(count & COUNT_CONTINUED);
2305 INIT_LIST_HEAD(&head->lru);
2306 set_page_private(head, SWP_CONTINUED);
2307 si->flags |= SWP_CONTINUED;
2308 }
2309
2310 list_for_each_entry(list_page, &head->lru, lru) {
2311 unsigned char *map;
2312
2313 /*
2314 * If the previous map said no continuation, but we've found
2315 * a continuation page, free our allocation and use this one.
2316 */
2317 if (!(count & COUNT_CONTINUED))
2318 goto out;
2319
2320 map = kmap_atomic(list_page, KM_USER0) + offset;
2321 count = *map;
2322 kunmap_atomic(map, KM_USER0);
2323
2324 /*
2325 * If this continuation count now has some space in it,
2326 * free our allocation and use this one.
2327 */
2328 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
2329 goto out;
2330 }
2331
2332 list_add_tail(&page->lru, &head->lru);
2333 page = NULL; /* now it's attached, don't free it */
2334out:
2335 spin_unlock(&swap_lock);
2336outer:
2337 if (page)
2338 __free_page(page);
2339 return 0;
2340}
2341
2342/*
2343 * swap_count_continued - when the original swap_map count is incremented
2344 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
2345 * into, carry if so, or else fail until a new continuation page is allocated;
2346 * when the original swap_map count is decremented from 0 with continuation,
2347 * borrow from the continuation and report whether it still holds more.
2348 * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
2349 */
2350static bool swap_count_continued(struct swap_info_struct *si,
2351 pgoff_t offset, unsigned char count)
2352{
2353 struct page *head;
2354 struct page *page;
2355 unsigned char *map;
2356
2357 head = vmalloc_to_page(si->swap_map + offset);
2358 if (page_private(head) != SWP_CONTINUED) {
2359 BUG_ON(count & COUNT_CONTINUED);
2360 return false; /* need to add count continuation */
2361 }
2362
2363 offset &= ~PAGE_MASK;
2364 page = list_entry(head->lru.next, struct page, lru);
2365 map = kmap_atomic(page, KM_USER0) + offset;
2366
2367 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
2368 goto init_map; /* jump over SWAP_CONT_MAX checks */
2369
2370 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
2371 /*
2372 * Think of how you add 1 to 999
2373 */
2374 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
2375 kunmap_atomic(map, KM_USER0);
2376 page = list_entry(page->lru.next, struct page, lru);
2377 BUG_ON(page == head);
2378 map = kmap_atomic(page, KM_USER0) + offset;
2379 }
2380 if (*map == SWAP_CONT_MAX) {
2381 kunmap_atomic(map, KM_USER0);
2382 page = list_entry(page->lru.next, struct page, lru);
2383 if (page == head)
2384 return false; /* add count continuation */
2385 map = kmap_atomic(page, KM_USER0) + offset;
2386init_map: *map = 0; /* we didn't zero the page */
2387 }
2388 *map += 1;
2389 kunmap_atomic(map, KM_USER0);
2390 page = list_entry(page->lru.prev, struct page, lru);
2391 while (page != head) {
2392 map = kmap_atomic(page, KM_USER0) + offset;
2393 *map = COUNT_CONTINUED;
2394 kunmap_atomic(map, KM_USER0);
2395 page = list_entry(page->lru.prev, struct page, lru);
2396 }
2397 return true; /* incremented */
2398
2399 } else { /* decrementing */
2400 /*
2401 * Think of how you subtract 1 from 1000
2402 */
2403 BUG_ON(count != COUNT_CONTINUED);
2404 while (*map == COUNT_CONTINUED) {
2405 kunmap_atomic(map, KM_USER0);
2406 page = list_entry(page->lru.next, struct page, lru);
2407 BUG_ON(page == head);
2408 map = kmap_atomic(page, KM_USER0) + offset;
2409 }
2410 BUG_ON(*map == 0);
2411 *map -= 1;
2412 if (*map == 0)
2413 count = 0;
2414 kunmap_atomic(map, KM_USER0);
2415 page = list_entry(page->lru.prev, struct page, lru);
2416 while (page != head) {
2417 map = kmap_atomic(page, KM_USER0) + offset;
2418 *map = SWAP_CONT_MAX | count;
2419 count = COUNT_CONTINUED;
2420 kunmap_atomic(map, KM_USER0);
2421 page = list_entry(page->lru.prev, struct page, lru);
2422 }
2423 return count == COUNT_CONTINUED;
2424 }
2425}
2426
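The "add 1 to 999" / "subtract 1 from 1000" comments above describe ordinary positional arithmetic over the chain of continuation bytes for one offset. Below is a standalone userspace model of the increment case, with CONT_MAX standing in for SWAP_CONT_MAX and the COUNT_CONTINUED flag bookkeeping left out — an illustration of the arithmetic only, not kernel code.

#include <stdio.h>

#define CONT_MAX 0x7f	/* stands in for SWAP_CONT_MAX */

/* Each "digit" is one continuation byte, least significant first. */
static int carry_increment(unsigned char *digit, int ndigits)
{
	int i;

	for (i = 0; i < ndigits; i++) {
		if (digit[i] < CONT_MAX) {
			digit[i]++;	/* room in this digit: done */
			return 1;
		}
		digit[i] = 0;		/* full, like a 9 in 999 + 1: carry on */
	}
	return 0;	/* overflow: a new continuation page would be needed */
}

int main(void)
{
	unsigned char digit[3] = { CONT_MAX, CONT_MAX, 0 };

	carry_increment(digit, 3);
	printf("%d %d %d\n", digit[0], digit[1], digit[2]);	/* prints: 0 0 1 */
	return 0;
}
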
2427/*
2428 * free_swap_count_continuations - swapoff free all the continuation pages
2429 * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
2430 */
2431static void free_swap_count_continuations(struct swap_info_struct *si)
2432{
2433 pgoff_t offset;
2434
2435 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2436 struct page *head;
2437 head = vmalloc_to_page(si->swap_map + offset);
2438 if (page_private(head)) {
2439 struct list_head *this, *next;
2440 list_for_each_safe(this, next, &head->lru) {
2441 struct page *page;
2442 page = list_entry(this, struct page, lru);
2443 list_del(this);
2444 __free_page(page);
2445 }
2446 }
2447 }
2448}
diff --git a/mm/truncate.c b/mm/truncate.c
index 2c147a7e5f2c..e87e37244829 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -272,6 +272,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
272 pagevec_release(&pvec); 272 pagevec_release(&pvec);
273 break; 273 break;
274 } 274 }
275 mem_cgroup_uncharge_start();
275 for (i = 0; i < pagevec_count(&pvec); i++) { 276 for (i = 0; i < pagevec_count(&pvec); i++) {
276 struct page *page = pvec.pages[i]; 277 struct page *page = pvec.pages[i];
277 278
@@ -286,6 +287,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
286 unlock_page(page); 287 unlock_page(page);
287 } 288 }
288 pagevec_release(&pvec); 289 pagevec_release(&pvec);
290 mem_cgroup_uncharge_end();
289 } 291 }
290} 292}
291EXPORT_SYMBOL(truncate_inode_pages_range); 293EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -327,6 +329,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
327 pagevec_init(&pvec, 0); 329 pagevec_init(&pvec, 0);
328 while (next <= end && 330 while (next <= end &&
329 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 331 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
332 mem_cgroup_uncharge_start();
330 for (i = 0; i < pagevec_count(&pvec); i++) { 333 for (i = 0; i < pagevec_count(&pvec); i++) {
331 struct page *page = pvec.pages[i]; 334 struct page *page = pvec.pages[i];
332 pgoff_t index; 335 pgoff_t index;
@@ -354,6 +357,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
354 break; 357 break;
355 } 358 }
356 pagevec_release(&pvec); 359 pagevec_release(&pvec);
360 mem_cgroup_uncharge_end();
357 cond_resched(); 361 cond_resched();
358 } 362 }
359 return ret; 363 return ret;
@@ -428,6 +432,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
428 while (next <= end && !wrapped && 432 while (next <= end && !wrapped &&
429 pagevec_lookup(&pvec, mapping, next, 433 pagevec_lookup(&pvec, mapping, next,
430 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { 434 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
435 mem_cgroup_uncharge_start();
431 for (i = 0; i < pagevec_count(&pvec); i++) { 436 for (i = 0; i < pagevec_count(&pvec); i++) {
432 struct page *page = pvec.pages[i]; 437 struct page *page = pvec.pages[i];
433 pgoff_t page_index; 438 pgoff_t page_index;
@@ -477,6 +482,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
477 unlock_page(page); 482 unlock_page(page);
478 } 483 }
479 pagevec_release(&pvec); 484 pagevec_release(&pvec);
485 mem_cgroup_uncharge_end();
480 cond_resched(); 486 cond_resched();
481 } 487 }
482 return ret; 488 return ret;
@@ -516,22 +522,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
516 */ 522 */
517void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) 523void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
518{ 524{
519 if (new < old) { 525 struct address_space *mapping = inode->i_mapping;
520 struct address_space *mapping = inode->i_mapping; 526
521 527 /*
522 /* 528 * unmap_mapping_range is called twice, first simply for
523 * unmap_mapping_range is called twice, first simply for 529 * efficiency so that truncate_inode_pages does fewer
524 * efficiency so that truncate_inode_pages does fewer 530 * single-page unmaps. However after this first call, and
525 * single-page unmaps. However after this first call, and 531 * before truncate_inode_pages finishes, it is possible for
526 * before truncate_inode_pages finishes, it is possible for 532 * private pages to be COWed, which remain after
527 * private pages to be COWed, which remain after 533 * truncate_inode_pages finishes, hence the second
528 * truncate_inode_pages finishes, hence the second 534 * unmap_mapping_range call must be made for correctness.
529 * unmap_mapping_range call must be made for correctness. 535 */
530 */ 536 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
531 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); 537 truncate_inode_pages(mapping, new);
532 truncate_inode_pages(mapping, new); 538 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
533 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
534 }
535} 539}
536EXPORT_SYMBOL(truncate_pagecache); 540EXPORT_SYMBOL(truncate_pagecache);
537 541
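The mem_cgroup_uncharge_start()/mem_cgroup_uncharge_end() calls added above bracket each pagevec pass so the memory cgroup can batch the uncharges. A minimal sketch of the pattern in isolation follows; drop_pagevec_batched() is a hypothetical helper and the loop body is elided — only the bracketing is the point.

/* Hypothetical helper illustrating the batching pattern used above. */
static void drop_pagevec_batched(struct pagevec *pvec)
{
	int i;

	mem_cgroup_uncharge_start();		/* open an uncharge batch */
	for (i = 0; i < pagevec_count(pvec); i++) {
		/* ... take pvec->pages[i], lock, remove from cache, unlock ... */
	}
	pagevec_release(pvec);
	mem_cgroup_uncharge_end();		/* flush the batched uncharges */
}
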
diff --git a/mm/util.c b/mm/util.c
index b377ce430803..834db7be240f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,10 +4,6 @@
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/hugetlb.h>
8#include <linux/syscalls.h>
9#include <linux/mman.h>
10#include <linux/file.h>
11#include <asm/uaccess.h> 7#include <asm/uaccess.h>
12 8
13#define CREATE_TRACE_POINTS 9#define CREATE_TRACE_POINTS
@@ -224,7 +220,7 @@ char *strndup_user(const char __user *s, long n)
224} 220}
225EXPORT_SYMBOL(strndup_user); 221EXPORT_SYMBOL(strndup_user);
226 222
227#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT 223#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
228void arch_pick_mmap_layout(struct mm_struct *mm) 224void arch_pick_mmap_layout(struct mm_struct *mm)
229{ 225{
230 mm->mmap_base = TASK_UNMAPPED_BASE; 226 mm->mmap_base = TASK_UNMAPPED_BASE;
@@ -272,46 +268,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
272} 268}
273EXPORT_SYMBOL_GPL(get_user_pages_fast); 269EXPORT_SYMBOL_GPL(get_user_pages_fast);
274 270
275SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
276 unsigned long, prot, unsigned long, flags,
277 unsigned long, fd, unsigned long, pgoff)
278{
279 struct file * file = NULL;
280 unsigned long retval = -EBADF;
281
282 if (!(flags & MAP_ANONYMOUS)) {
283 if (unlikely(flags & MAP_HUGETLB))
284 return -EINVAL;
285 file = fget(fd);
286 if (!file)
287 goto out;
288 } else if (flags & MAP_HUGETLB) {
289 struct user_struct *user = NULL;
290 /*
291 * VM_NORESERVE is used because the reservations will be
292 * taken when vm_ops->mmap() is called
293 * A dummy user value is used because we are not locking
294 * memory so no accounting is necessary
295 */
296 len = ALIGN(len, huge_page_size(&default_hstate));
297 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
298 &user, HUGETLB_ANONHUGE_INODE);
299 if (IS_ERR(file))
300 return PTR_ERR(file);
301 }
302
303 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
304
305 down_write(&current->mm->mmap_sem);
306 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
307 up_write(&current->mm->mmap_sem);
308
309 if (file)
310 fput(file);
311out:
312 return retval;
313}
314
315/* Tracepoints definitions. */ 271/* Tracepoints definitions. */
316EXPORT_TRACEPOINT_SYMBOL(kmalloc); 272EXPORT_TRACEPOINT_SYMBOL(kmalloc);
317EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 273EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0f551a4a44cd..d55d905463eb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -555,10 +555,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
555 } 555 }
556 rcu_read_unlock(); 556 rcu_read_unlock();
557 557
558 if (nr) { 558 if (nr)
559 BUG_ON(nr > atomic_read(&vmap_lazy_nr));
560 atomic_sub(nr, &vmap_lazy_nr); 559 atomic_sub(nr, &vmap_lazy_nr);
561 }
562 560
563 if (nr || force_flush) 561 if (nr || force_flush)
564 flush_tlb_kernel_range(*start, *end); 562 flush_tlb_kernel_range(*start, *end);
@@ -761,7 +759,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
761 spin_lock(&vbq->lock); 759 spin_lock(&vbq->lock);
762 list_add(&vb->free_list, &vbq->free); 760 list_add(&vb->free_list, &vbq->free);
763 spin_unlock(&vbq->lock); 761 spin_unlock(&vbq->lock);
764 put_cpu_var(vmap_cpu_blocks); 762 put_cpu_var(vmap_block_queue);
765 763
766 return vb; 764 return vb;
767} 765}
@@ -826,7 +824,7 @@ again:
826 } 824 }
827 spin_unlock(&vb->lock); 825 spin_unlock(&vb->lock);
828 } 826 }
829 put_cpu_var(vmap_cpu_blocks); 827 put_cpu_var(vmap_block_queue);
830 rcu_read_unlock(); 828 rcu_read_unlock();
831 829
832 if (!addr) { 830 if (!addr) {
@@ -1411,6 +1409,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1411{ 1409{
1412 struct page **pages; 1410 struct page **pages;
1413 unsigned int nr_pages, array_size, i; 1411 unsigned int nr_pages, array_size, i;
1412 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
1414 1413
1415 nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; 1414 nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
1416 array_size = (nr_pages * sizeof(struct page *)); 1415 array_size = (nr_pages * sizeof(struct page *));
@@ -1418,13 +1417,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1418 area->nr_pages = nr_pages; 1417 area->nr_pages = nr_pages;
1419 /* Please note that the recursion is strictly bounded. */ 1418 /* Please note that the recursion is strictly bounded. */
1420 if (array_size > PAGE_SIZE) { 1419 if (array_size > PAGE_SIZE) {
1421 pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO, 1420 pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
1422 PAGE_KERNEL, node, caller); 1421 PAGE_KERNEL, node, caller);
1423 area->flags |= VM_VPAGES; 1422 area->flags |= VM_VPAGES;
1424 } else { 1423 } else {
1425 pages = kmalloc_node(array_size, 1424 pages = kmalloc_node(array_size, nested_gfp, node);
1426 (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO,
1427 node);
1428 } 1425 }
1429 area->pages = pages; 1426 area->pages = pages;
1430 area->caller = caller; 1427 area->caller = caller;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 777af57fd8c8..c26986c85ce0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -55,6 +55,11 @@ struct scan_control {
55 /* Number of pages freed so far during a call to shrink_zones() */ 55 /* Number of pages freed so far during a call to shrink_zones() */
56 unsigned long nr_reclaimed; 56 unsigned long nr_reclaimed;
57 57
58 /* How many pages shrink_list() should reclaim */
59 unsigned long nr_to_reclaim;
60
61 unsigned long hibernation_mode;
62
58 /* This context's GFP mask */ 63 /* This context's GFP mask */
59 gfp_t gfp_mask; 64 gfp_t gfp_mask;
60 65
@@ -66,12 +71,6 @@ struct scan_control {
66 /* Can pages be swapped as part of reclaim? */ 71 /* Can pages be swapped as part of reclaim? */
67 int may_swap; 72 int may_swap;
68 73
69 /* This context's SWAP_CLUSTER_MAX. If freeing memory for
70 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
71 * In this context, it doesn't matter that we scan the
72 * whole list at once. */
73 int swap_cluster_max;
74
75 int swappiness; 74 int swappiness;
76 75
77 int all_unreclaimable; 76 int all_unreclaimable;
@@ -358,7 +357,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
358 * stalls if we need to run get_block(). We could test 357 * stalls if we need to run get_block(). We could test
359 * PagePrivate for that. 358 * PagePrivate for that.
360 * 359 *
361 * If this process is currently in generic_file_write() against 360 * If this process is currently in __generic_file_aio_write() against
362 * this page's queue, we can perform writeback even if that 361 * this page's queue, we can perform writeback even if that
363 * will block. 362 * will block.
364 * 363 *
@@ -1132,7 +1131,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1132 unsigned long nr_anon; 1131 unsigned long nr_anon;
1133 unsigned long nr_file; 1132 unsigned long nr_file;
1134 1133
1135 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 1134 nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX,
1136 &page_list, &nr_scan, sc->order, mode, 1135 &page_list, &nr_scan, sc->order, mode,
1137 zone, sc->mem_cgroup, 0, file); 1136 zone, sc->mem_cgroup, 0, file);
1138 1137
@@ -1166,10 +1165,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1166 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); 1165 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
1167 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); 1166 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1168 1167
1169 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; 1168 reclaim_stat->recent_scanned[0] += nr_anon;
1170 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; 1169 reclaim_stat->recent_scanned[1] += nr_file;
1171 reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE];
1172 reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE];
1173 1170
1174 spin_unlock_irq(&zone->lru_lock); 1171 spin_unlock_irq(&zone->lru_lock);
1175 1172
@@ -1464,20 +1461,26 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
1464 return low; 1461 return low;
1465} 1462}
1466 1463
1464static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
1465 int file)
1466{
1467 if (file)
1468 return inactive_file_is_low(zone, sc);
1469 else
1470 return inactive_anon_is_low(zone, sc);
1471}
1472
1467static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1473static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1468 struct zone *zone, struct scan_control *sc, int priority) 1474 struct zone *zone, struct scan_control *sc, int priority)
1469{ 1475{
1470 int file = is_file_lru(lru); 1476 int file = is_file_lru(lru);
1471 1477
1472 if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { 1478 if (is_active_lru(lru)) {
1473 shrink_active_list(nr_to_scan, zone, sc, priority, file); 1479 if (inactive_list_is_low(zone, sc, file))
1480 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1474 return 0; 1481 return 0;
1475 } 1482 }
1476 1483
1477 if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) {
1478 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1479 return 0;
1480 }
1481 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); 1484 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1482} 1485}
1483 1486
@@ -1567,15 +1570,14 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1567 * until we collected @swap_cluster_max pages to scan. 1570 * until we collected @swap_cluster_max pages to scan.
1568 */ 1571 */
1569static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, 1572static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1570 unsigned long *nr_saved_scan, 1573 unsigned long *nr_saved_scan)
1571 unsigned long swap_cluster_max)
1572{ 1574{
1573 unsigned long nr; 1575 unsigned long nr;
1574 1576
1575 *nr_saved_scan += nr_to_scan; 1577 *nr_saved_scan += nr_to_scan;
1576 nr = *nr_saved_scan; 1578 nr = *nr_saved_scan;
1577 1579
1578 if (nr >= swap_cluster_max) 1580 if (nr >= SWAP_CLUSTER_MAX)
1579 *nr_saved_scan = 0; 1581 *nr_saved_scan = 0;
1580 else 1582 else
1581 nr = 0; 1583 nr = 0;
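With swap_cluster_max gone from scan_control, nr_scan_try_batch() now batches against the global SWAP_CLUSTER_MAX. Here is a userspace model of that accumulation, mirroring the function as it now reads above; the demo loop and the value 32 for SWAP_CLUSTER_MAX are assumptions for illustration only.

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL	/* the kernel's usual value; assumed here */

static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
				       unsigned long *nr_saved_scan)
{
	unsigned long nr;

	*nr_saved_scan += nr_to_scan;
	nr = *nr_saved_scan;

	if (nr >= SWAP_CLUSTER_MAX)
		*nr_saved_scan = 0;	/* release the accumulated batch */
	else
		nr = 0;			/* keep saving; scan nothing yet */
	return nr;
}

int main(void)
{
	unsigned long saved = 0;
	int i;

	/* Small requests are deferred until they add up to a full batch. */
	for (i = 0; i < 5; i++) {
		unsigned long scan = nr_scan_try_batch(10, &saved);

		printf("ask 10 -> scan %lu, saved %lu\n", scan, saved);
	}
	return 0;
}
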
@@ -1594,7 +1596,7 @@ static void shrink_zone(int priority, struct zone *zone,
1594 unsigned long percent[2]; /* anon @ 0; file @ 1 */ 1596 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1595 enum lru_list l; 1597 enum lru_list l;
1596 unsigned long nr_reclaimed = sc->nr_reclaimed; 1598 unsigned long nr_reclaimed = sc->nr_reclaimed;
1597 unsigned long swap_cluster_max = sc->swap_cluster_max; 1599 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1598 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1600 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1599 int noswap = 0; 1601 int noswap = 0;
1600 1602
@@ -1616,15 +1618,15 @@ static void shrink_zone(int priority, struct zone *zone,
1616 scan = (scan * percent[file]) / 100; 1618 scan = (scan * percent[file]) / 100;
1617 } 1619 }
1618 nr[l] = nr_scan_try_batch(scan, 1620 nr[l] = nr_scan_try_batch(scan,
1619 &reclaim_stat->nr_saved_scan[l], 1621 &reclaim_stat->nr_saved_scan[l]);
1620 swap_cluster_max);
1621 } 1622 }
1622 1623
1623 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1624 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1624 nr[LRU_INACTIVE_FILE]) { 1625 nr[LRU_INACTIVE_FILE]) {
1625 for_each_evictable_lru(l) { 1626 for_each_evictable_lru(l) {
1626 if (nr[l]) { 1627 if (nr[l]) {
1627 nr_to_scan = min(nr[l], swap_cluster_max); 1628 nr_to_scan = min_t(unsigned long,
1629 nr[l], SWAP_CLUSTER_MAX);
1628 nr[l] -= nr_to_scan; 1630 nr[l] -= nr_to_scan;
1629 1631
1630 nr_reclaimed += shrink_list(l, nr_to_scan, 1632 nr_reclaimed += shrink_list(l, nr_to_scan,
@@ -1639,8 +1641,7 @@ static void shrink_zone(int priority, struct zone *zone,
1639 * with multiple processes reclaiming pages, the total 1641 * with multiple processes reclaiming pages, the total
1640 * freeing target can get unreasonably large. 1642 * freeing target can get unreasonably large.
1641 */ 1643 */
1642 if (nr_reclaimed > swap_cluster_max && 1644 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
1643 priority < DEF_PRIORITY && !current_is_kswapd())
1644 break; 1645 break;
1645 } 1646 }
1646 1647
@@ -1738,6 +1739,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1738 struct zoneref *z; 1739 struct zoneref *z;
1739 struct zone *zone; 1740 struct zone *zone;
1740 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1741 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1742 unsigned long writeback_threshold;
1741 1743
1742 delayacct_freepages_start(); 1744 delayacct_freepages_start();
1743 1745
@@ -1773,7 +1775,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1773 } 1775 }
1774 } 1776 }
1775 total_scanned += sc->nr_scanned; 1777 total_scanned += sc->nr_scanned;
1776 if (sc->nr_reclaimed >= sc->swap_cluster_max) { 1778 if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
1777 ret = sc->nr_reclaimed; 1779 ret = sc->nr_reclaimed;
1778 goto out; 1780 goto out;
1779 } 1781 }
@@ -1785,14 +1787,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1785 * that's undesirable in laptop mode, where we *want* lumpy 1787 * that's undesirable in laptop mode, where we *want* lumpy
1786 * writeout. So in laptop mode, write out the whole world. 1788 * writeout. So in laptop mode, write out the whole world.
1787 */ 1789 */
1788 if (total_scanned > sc->swap_cluster_max + 1790 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
1789 sc->swap_cluster_max / 2) { 1791 if (total_scanned > writeback_threshold) {
1790 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); 1792 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
1791 sc->may_writepage = 1; 1793 sc->may_writepage = 1;
1792 } 1794 }
1793 1795
1794 /* Take a nap, wait for some writeback to complete */ 1796 /* Take a nap, wait for some writeback to complete */
1795 if (sc->nr_scanned && priority < DEF_PRIORITY - 2) 1797 if (!sc->hibernation_mode && sc->nr_scanned &&
1798 priority < DEF_PRIORITY - 2)
1796 congestion_wait(BLK_RW_ASYNC, HZ/10); 1799 congestion_wait(BLK_RW_ASYNC, HZ/10);
1797 } 1800 }
1798 /* top priority shrink_zones still had more to do? don't OOM, then */ 1801 /* top priority shrink_zones still had more to do? don't OOM, then */
@@ -1831,7 +1834,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1831 struct scan_control sc = { 1834 struct scan_control sc = {
1832 .gfp_mask = gfp_mask, 1835 .gfp_mask = gfp_mask,
1833 .may_writepage = !laptop_mode, 1836 .may_writepage = !laptop_mode,
1834 .swap_cluster_max = SWAP_CLUSTER_MAX, 1837 .nr_to_reclaim = SWAP_CLUSTER_MAX,
1835 .may_unmap = 1, 1838 .may_unmap = 1,
1836 .may_swap = 1, 1839 .may_swap = 1,
1837 .swappiness = vm_swappiness, 1840 .swappiness = vm_swappiness,
@@ -1855,7 +1858,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1855 .may_writepage = !laptop_mode, 1858 .may_writepage = !laptop_mode,
1856 .may_unmap = 1, 1859 .may_unmap = 1,
1857 .may_swap = !noswap, 1860 .may_swap = !noswap,
1858 .swap_cluster_max = SWAP_CLUSTER_MAX,
1859 .swappiness = swappiness, 1861 .swappiness = swappiness,
1860 .order = 0, 1862 .order = 0,
1861 .mem_cgroup = mem, 1863 .mem_cgroup = mem,
@@ -1889,7 +1891,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1889 .may_writepage = !laptop_mode, 1891 .may_writepage = !laptop_mode,
1890 .may_unmap = 1, 1892 .may_unmap = 1,
1891 .may_swap = !noswap, 1893 .may_swap = !noswap,
1892 .swap_cluster_max = SWAP_CLUSTER_MAX, 1894 .nr_to_reclaim = SWAP_CLUSTER_MAX,
1893 .swappiness = swappiness, 1895 .swappiness = swappiness,
1894 .order = 0, 1896 .order = 0,
1895 .mem_cgroup = mem_cont, 1897 .mem_cgroup = mem_cont,
@@ -1904,6 +1906,33 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1904} 1906}
1905#endif 1907#endif
1906 1908
1909/* is kswapd sleeping prematurely? */
1910static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
1911{
1912 int i;
1913
1914 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
1915 if (remaining)
1916 return 1;
1917
1918 /* If after HZ/10, a zone is below the high mark, it's premature */
1919 for (i = 0; i < pgdat->nr_zones; i++) {
1920 struct zone *zone = pgdat->node_zones + i;
1921
1922 if (!populated_zone(zone))
1923 continue;
1924
1925 if (zone_is_all_unreclaimable(zone))
1926 continue;
1927
1928 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
1929 0, 0))
1930 return 1;
1931 }
1932
1933 return 0;
1934}
1935
1907/* 1936/*
1908 * For kswapd, balance_pgdat() will work across all this node's zones until 1937 * For kswapd, balance_pgdat() will work across all this node's zones until
1909 * they are all at high_wmark_pages(zone). 1938 * they are all at high_wmark_pages(zone).
@@ -1936,7 +1965,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1936 .gfp_mask = GFP_KERNEL, 1965 .gfp_mask = GFP_KERNEL,
1937 .may_unmap = 1, 1966 .may_unmap = 1,
1938 .may_swap = 1, 1967 .may_swap = 1,
1939 .swap_cluster_max = SWAP_CLUSTER_MAX, 1968 /*
1969 * kswapd doesn't want to be bailed out while reclaim. because
1970 * we want to put equal scanning pressure on each zone.
1971 */
1972 .nr_to_reclaim = ULONG_MAX,
1940 .swappiness = vm_swappiness, 1973 .swappiness = vm_swappiness,
1941 .order = order, 1974 .order = order,
1942 .mem_cgroup = NULL, 1975 .mem_cgroup = NULL,
@@ -1961,6 +1994,7 @@ loop_again:
1961 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1994 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1962 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 1995 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
1963 unsigned long lru_pages = 0; 1996 unsigned long lru_pages = 0;
1997 int has_under_min_watermark_zone = 0;
1964 1998
1965 /* The swap token gets in the way of swapout... */ 1999 /* The swap token gets in the way of swapout... */
1966 if (!priority) 2000 if (!priority)
@@ -2067,6 +2101,15 @@ loop_again:
2067 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 2101 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
2068 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2102 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2069 sc.may_writepage = 1; 2103 sc.may_writepage = 1;
2104
2105 /*
 2106 * We are still under the min watermark, which means we risk
 2107 * GFP_ATOMIC allocation failure. Hurry up!
2108 */
2109 if (!zone_watermark_ok(zone, order, min_wmark_pages(zone),
2110 end_zone, 0))
2111 has_under_min_watermark_zone = 1;
2112
2070 } 2113 }
2071 if (all_zones_ok) 2114 if (all_zones_ok)
2072 break; /* kswapd: all done */ 2115 break; /* kswapd: all done */
@@ -2074,8 +2117,12 @@ loop_again:
2074 * OK, kswapd is getting into trouble. Take a nap, then take 2117 * OK, kswapd is getting into trouble. Take a nap, then take
2075 * another pass across the zones. 2118 * another pass across the zones.
2076 */ 2119 */
2077 if (total_scanned && priority < DEF_PRIORITY - 2) 2120 if (total_scanned && (priority < DEF_PRIORITY - 2)) {
2078 congestion_wait(BLK_RW_ASYNC, HZ/10); 2121 if (has_under_min_watermark_zone)
2122 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2123 else
2124 congestion_wait(BLK_RW_ASYNC, HZ/10);
2125 }
2079 2126
2080 /* 2127 /*
2081 * We do this so kswapd doesn't build up large priorities for 2128 * We do this so kswapd doesn't build up large priorities for
@@ -2173,6 +2220,7 @@ static int kswapd(void *p)
2173 order = 0; 2220 order = 0;
2174 for ( ; ; ) { 2221 for ( ; ; ) {
2175 unsigned long new_order; 2222 unsigned long new_order;
2223 int ret;
2176 2224
2177 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2225 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2178 new_order = pgdat->kswapd_max_order; 2226 new_order = pgdat->kswapd_max_order;
@@ -2184,19 +2232,45 @@ static int kswapd(void *p)
2184 */ 2232 */
2185 order = new_order; 2233 order = new_order;
2186 } else { 2234 } else {
2187 if (!freezing(current)) 2235 if (!freezing(current) && !kthread_should_stop()) {
2188 schedule(); 2236 long remaining = 0;
2237
2238 /* Try to sleep for a short interval */
2239 if (!sleeping_prematurely(pgdat, order, remaining)) {
2240 remaining = schedule_timeout(HZ/10);
2241 finish_wait(&pgdat->kswapd_wait, &wait);
2242 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2243 }
2244
2245 /*
2246 * After a short sleep, check if it was a
2247 * premature sleep. If not, then go fully
2248 * to sleep until explicitly woken up
2249 */
2250 if (!sleeping_prematurely(pgdat, order, remaining))
2251 schedule();
2252 else {
2253 if (remaining)
2254 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2255 else
2256 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2257 }
2258 }
2189 2259
2190 order = pgdat->kswapd_max_order; 2260 order = pgdat->kswapd_max_order;
2191 } 2261 }
2192 finish_wait(&pgdat->kswapd_wait, &wait); 2262 finish_wait(&pgdat->kswapd_wait, &wait);
2193 2263
2194 if (!try_to_freeze()) { 2264 ret = try_to_freeze();
2195 /* We can speed up thawing tasks if we don't call 2265 if (kthread_should_stop())
2196 * balance_pgdat after returning from the refrigerator 2266 break;
2197 */ 2267
2268 /*
2269 * We can speed up thawing tasks if we don't call balance_pgdat
2270 * after returning from the refrigerator
2271 */
2272 if (!ret)
2198 balance_pgdat(pgdat, order); 2273 balance_pgdat(pgdat, order);
2199 }
2200 } 2274 }
2201 return 0; 2275 return 0;
2202} 2276}
@@ -2260,148 +2334,43 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
2260 2334
2261#ifdef CONFIG_HIBERNATION 2335#ifdef CONFIG_HIBERNATION
2262/* 2336/*
2263 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 2337 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
2264 * from LRU lists system-wide, for given pass and priority.
2265 *
2266 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
2267 */
2268static void shrink_all_zones(unsigned long nr_pages, int prio,
2269 int pass, struct scan_control *sc)
2270{
2271 struct zone *zone;
2272 unsigned long nr_reclaimed = 0;
2273 struct zone_reclaim_stat *reclaim_stat;
2274
2275 for_each_populated_zone(zone) {
2276 enum lru_list l;
2277
2278 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
2279 continue;
2280
2281 for_each_evictable_lru(l) {
2282 enum zone_stat_item ls = NR_LRU_BASE + l;
2283 unsigned long lru_pages = zone_page_state(zone, ls);
2284
2285 /* For pass = 0, we don't shrink the active list */
2286 if (pass == 0 && (l == LRU_ACTIVE_ANON ||
2287 l == LRU_ACTIVE_FILE))
2288 continue;
2289
2290 reclaim_stat = get_reclaim_stat(zone, sc);
2291 reclaim_stat->nr_saved_scan[l] +=
2292 (lru_pages >> prio) + 1;
2293 if (reclaim_stat->nr_saved_scan[l]
2294 >= nr_pages || pass > 3) {
2295 unsigned long nr_to_scan;
2296
2297 reclaim_stat->nr_saved_scan[l] = 0;
2298 nr_to_scan = min(nr_pages, lru_pages);
2299 nr_reclaimed += shrink_list(l, nr_to_scan, zone,
2300 sc, prio);
2301 if (nr_reclaimed >= nr_pages) {
2302 sc->nr_reclaimed += nr_reclaimed;
2303 return;
2304 }
2305 }
2306 }
2307 }
2308 sc->nr_reclaimed += nr_reclaimed;
2309}
2310
2311/*
2312 * Try to free `nr_pages' of memory, system-wide, and return the number of
2313 * freed pages. 2338 * freed pages.
2314 * 2339 *
2315 * Rather than trying to age LRUs the aim is to preserve the overall 2340 * Rather than trying to age LRUs the aim is to preserve the overall
2316 * LRU order by reclaiming preferentially 2341 * LRU order by reclaiming preferentially
2317 * inactive > active > active referenced > active mapped 2342 * inactive > active > active referenced > active mapped
2318 */ 2343 */
2319unsigned long shrink_all_memory(unsigned long nr_pages) 2344unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2320{ 2345{
2321 unsigned long lru_pages, nr_slab;
2322 int pass;
2323 struct reclaim_state reclaim_state; 2346 struct reclaim_state reclaim_state;
2324 struct scan_control sc = { 2347 struct scan_control sc = {
2325 .gfp_mask = GFP_KERNEL, 2348 .gfp_mask = GFP_HIGHUSER_MOVABLE,
2326 .may_unmap = 0, 2349 .may_swap = 1,
2350 .may_unmap = 1,
2327 .may_writepage = 1, 2351 .may_writepage = 1,
2352 .nr_to_reclaim = nr_to_reclaim,
2353 .hibernation_mode = 1,
2354 .swappiness = vm_swappiness,
2355 .order = 0,
2328 .isolate_pages = isolate_pages_global, 2356 .isolate_pages = isolate_pages_global,
2329 .nr_reclaimed = 0,
2330 }; 2357 };
2358 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
2359 struct task_struct *p = current;
2360 unsigned long nr_reclaimed;
2331 2361
2332 current->reclaim_state = &reclaim_state; 2362 p->flags |= PF_MEMALLOC;
2333 2363 lockdep_set_current_reclaim_state(sc.gfp_mask);
2334 lru_pages = global_reclaimable_pages(); 2364 reclaim_state.reclaimed_slab = 0;
2335 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 2365 p->reclaim_state = &reclaim_state;
2336 /* If slab caches are huge, it's better to hit them first */
2337 while (nr_slab >= lru_pages) {
2338 reclaim_state.reclaimed_slab = 0;
2339 shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
2340 if (!reclaim_state.reclaimed_slab)
2341 break;
2342
2343 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2344 if (sc.nr_reclaimed >= nr_pages)
2345 goto out;
2346
2347 nr_slab -= reclaim_state.reclaimed_slab;
2348 }
2349
2350 /*
2351 * We try to shrink LRUs in 5 passes:
2352 * 0 = Reclaim from inactive_list only
2353 * 1 = Reclaim from active list but don't reclaim mapped
2354 * 2 = 2nd pass of type 1
2355 * 3 = Reclaim mapped (normal reclaim)
2356 * 4 = 2nd pass of type 3
2357 */
2358 for (pass = 0; pass < 5; pass++) {
2359 int prio;
2360
2361 /* Force reclaiming mapped pages in the passes #3 and #4 */
2362 if (pass > 2)
2363 sc.may_unmap = 1;
2364
2365 for (prio = DEF_PRIORITY; prio >= 0; prio--) {
2366 unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed;
2367
2368 sc.nr_scanned = 0;
2369 sc.swap_cluster_max = nr_to_scan;
2370 shrink_all_zones(nr_to_scan, prio, pass, &sc);
2371 if (sc.nr_reclaimed >= nr_pages)
2372 goto out;
2373
2374 reclaim_state.reclaimed_slab = 0;
2375 shrink_slab(sc.nr_scanned, sc.gfp_mask,
2376 global_reclaimable_pages());
2377 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2378 if (sc.nr_reclaimed >= nr_pages)
2379 goto out;
2380
2381 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
2382 congestion_wait(BLK_RW_ASYNC, HZ / 10);
2383 }
2384 }
2385
2386 /*
2387 * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be
2388 * something in slab caches
2389 */
2390 if (!sc.nr_reclaimed) {
2391 do {
2392 reclaim_state.reclaimed_slab = 0;
2393 shrink_slab(nr_pages, sc.gfp_mask,
2394 global_reclaimable_pages());
2395 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2396 } while (sc.nr_reclaimed < nr_pages &&
2397 reclaim_state.reclaimed_slab > 0);
2398 }
2399 2366
2367 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2400 2368
2401out: 2369 p->reclaim_state = NULL;
2402 current->reclaim_state = NULL; 2370 lockdep_clear_current_reclaim_state();
2371 p->flags &= ~PF_MEMALLOC;
2403 2372
2404 return sc.nr_reclaimed; 2373 return nr_reclaimed;
2405} 2374}
2406#endif /* CONFIG_HIBERNATION */ 2375#endif /* CONFIG_HIBERNATION */
2407 2376
@@ -2451,6 +2420,17 @@ int kswapd_run(int nid)
2451 return ret; 2420 return ret;
2452} 2421}
2453 2422
2423/*
2424 * Called by memory hotplug when all memory in a node is offlined.
2425 */
2426void kswapd_stop(int nid)
2427{
2428 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
2429
2430 if (kswapd)
2431 kthread_stop(kswapd);
2432}
2433
2454static int __init kswapd_init(void) 2434static int __init kswapd_init(void)
2455{ 2435{
2456 int nid; 2436 int nid;
@@ -2553,8 +2533,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2553 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 2533 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
2554 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 2534 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
2555 .may_swap = 1, 2535 .may_swap = 1,
2556 .swap_cluster_max = max_t(unsigned long, nr_pages, 2536 .nr_to_reclaim = max_t(unsigned long, nr_pages,
2557 SWAP_CLUSTER_MAX), 2537 SWAP_CLUSTER_MAX),
2558 .gfp_mask = gfp_mask, 2538 .gfp_mask = gfp_mask,
2559 .swappiness = vm_swappiness, 2539 .swappiness = vm_swappiness,
2560 .order = order, 2540 .order = order,
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c81321f9feec..6051fbab67ba 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -683,6 +683,9 @@ static const char * const vmstat_text[] = {
683 "slabs_scanned", 683 "slabs_scanned",
684 "kswapd_steal", 684 "kswapd_steal",
685 "kswapd_inodesteal", 685 "kswapd_inodesteal",
686 "kswapd_low_wmark_hit_quickly",
687 "kswapd_high_wmark_hit_quickly",
688 "kswapd_skip_congestion_wait",
686 "pageoutrun", 689 "pageoutrun",
687 "allocstall", 690 "allocstall",
688 691
@@ -883,11 +886,10 @@ static void vmstat_update(struct work_struct *w)
883 886
884static void __cpuinit start_cpu_timer(int cpu) 887static void __cpuinit start_cpu_timer(int cpu)
885{ 888{
886 struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); 889 struct delayed_work *work = &per_cpu(vmstat_work, cpu);
887 890
888 INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); 891 INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update);
889 schedule_delayed_work_on(cpu, vmstat_work, 892 schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
890 __round_jiffies_relative(HZ, cpu));
891} 893}
892 894
893/* 895/*