Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig  28
-rw-r--r--  mm/Makefile  7
-rw-r--r--  mm/allocpercpu.c  177
-rw-r--r--  mm/backing-dev.c  39
-rw-r--r--  mm/bootmem.c  228
-rw-r--r--  mm/bounce.c  1
-rw-r--r--  mm/fadvise.c  10
-rw-r--r--  mm/failslab.c  19
-rw-r--r--  mm/filemap.c  175
-rw-r--r--  mm/filemap_xip.c  3
-rw-r--r--  mm/fremap.c  2
-rw-r--r--  mm/highmem.c  2
-rw-r--r--  mm/hugetlb.c  565
-rw-r--r--  mm/hwpoison-inject.c  113
-rw-r--r--  mm/internal.h  35
-rw-r--r--  mm/kmemleak.c  193
-rw-r--r--  mm/ksm.c  971
-rw-r--r--  mm/maccess.c  11
-rw-r--r--  mm/madvise.c  21
-rw-r--r--  mm/memcontrol.c  1835
-rw-r--r--  mm/memory-failure.c  579
-rw-r--r--  mm/memory.c  216
-rw-r--r--  mm/memory_hotplug.c  26
-rw-r--r--  mm/mempolicy.c  234
-rw-r--r--  mm/migrate.c  177
-rw-r--r--  mm/mincore.c  39
-rw-r--r--  mm/mlock.c  57
-rw-r--r--  mm/mmap.c  398
-rw-r--r--  mm/mmu_context.c  4
-rw-r--r--  mm/mmu_notifier.c  1
-rw-r--r--  mm/mprotect.c  1
-rw-r--r--  mm/mremap.c  249
-rw-r--r--  mm/nommu.c  187
-rw-r--r--  mm/oom_kill.c  116
-rw-r--r--  mm/page-writeback.c  12
-rw-r--r--  mm/page_alloc.c  527
-rw-r--r--  mm/page_cgroup.c  42
-rw-r--r--  mm/page_io.c  18
-rw-r--r--  mm/pagewalk.c  59
-rw-r--r--  mm/percpu.c  90
-rw-r--r--  mm/percpu_up.c  30
-rw-r--r--  mm/quicklist.c  1
-rw-r--r--  mm/readahead.c  19
-rw-r--r--  mm/rmap.c  568
-rw-r--r--  mm/shmem.c  84
-rw-r--r--  mm/shmem_acl.c  171
-rw-r--r--  mm/slab.c  186
-rw-r--r--  mm/slub.c  368
-rw-r--r--  mm/sparse-vmemmap.c  77
-rw-r--r--  mm/sparse.c  197
-rw-r--r--  mm/swap.c  3
-rw-r--r--  mm/swap_state.c  1
-rw-r--r--  mm/swapfile.c  918
-rw-r--r--  mm/truncate.c  39
-rw-r--r--  mm/util.c  23
-rw-r--r--  mm/vmalloc.c  125
-rw-r--r--  mm/vmscan.c  464
-rw-r--r--  mm/vmstat.c  28
58 files changed, 7338 insertions(+), 3431 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 44cf6f0a3a6d..9c61158308dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME
115config SPARSEMEM_VMEMMAP_ENABLE 115config SPARSEMEM_VMEMMAP_ENABLE
116 bool 116 bool
117 117
118config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
119 def_bool y
120 depends on SPARSEMEM && X86_64
121
118config SPARSEMEM_VMEMMAP 122config SPARSEMEM_VMEMMAP
119 bool "Sparse Memory virtual memmap" 123 bool "Sparse Memory virtual memmap"
120 depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE 124 depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE
@@ -158,11 +162,13 @@ config PAGEFLAGS_EXTENDED
158# Default to 4 for wider testing, though 8 might be more appropriate. 162# Default to 4 for wider testing, though 8 might be more appropriate.
159# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. 163# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
160# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. 164# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
165# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page.
161# 166#
162config SPLIT_PTLOCK_CPUS 167config SPLIT_PTLOCK_CPUS
163 int 168 int
164 default "4096" if ARM && !CPU_CACHE_VIPT 169 default "999999" if ARM && !CPU_CACHE_VIPT
165 default "4096" if PARISC && !PA20 170 default "999999" if PARISC && !PA20
171 default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC
166 default "4" 172 default "4"
167 173
168# 174#
@@ -193,21 +199,13 @@ config BOUNCE
193config NR_QUICK 199config NR_QUICK
194 int 200 int
195 depends on QUICKLIST 201 depends on QUICKLIST
196 default "2" if SUPERH || AVR32 202 default "2" if AVR32
197 default "1" 203 default "1"
198 204
199config VIRT_TO_BUS 205config VIRT_TO_BUS
200 def_bool y 206 def_bool y
201 depends on !ARCH_NO_VIRT_TO_BUS 207 depends on !ARCH_NO_VIRT_TO_BUS
202 208
203config HAVE_MLOCK
204 bool
205 default y if MMU=y
206
207config HAVE_MLOCKED_PAGE_BIT
208 bool
209 default y if HAVE_MLOCK=y
210
211config MMU_NOTIFIER 209config MMU_NOTIFIER
212 bool 210 bool
213 211
@@ -218,7 +216,7 @@ config KSM
218 Enable Kernel Samepage Merging: KSM periodically scans those areas 216 Enable Kernel Samepage Merging: KSM periodically scans those areas
219 of an application's address space that an app has advised may be 217 of an application's address space that an app has advised may be
220 mergeable. When it finds pages of identical content, it replaces 218 mergeable. When it finds pages of identical content, it replaces
221 the many instances by a single resident page with that content, so 219 the many instances by a single page with that content, so
222 saving memory until one or another app needs to modify the content. 220 saving memory until one or another app needs to modify the content.
223 Recommended for use with KVM, or with other duplicative applications. 221 Recommended for use with KVM, or with other duplicative applications.
224 See Documentation/vm/ksm.txt for more information: KSM is inactive 222 See Documentation/vm/ksm.txt for more information: KSM is inactive
@@ -227,6 +225,7 @@ config KSM
227 225
228config DEFAULT_MMAP_MIN_ADDR 226config DEFAULT_MMAP_MIN_ADDR
229 int "Low address space to protect from user allocation" 227 int "Low address space to protect from user allocation"
228 depends on MMU
230 default 4096 229 default 4096
231 help 230 help
232 This is the portion of low virtual memory which should be protected 231 This is the portion of low virtual memory which should be protected
@@ -257,8 +256,9 @@ config MEMORY_FAILURE
257 special hardware support and typically ECC memory. 256 special hardware support and typically ECC memory.
258 257
259config HWPOISON_INJECT 258config HWPOISON_INJECT
260 tristate "Poison pages injector" 259 tristate "HWPoison pages injector"
261 depends on MEMORY_FAILURE && DEBUG_KERNEL 260 depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS
261 select PROC_PAGE_MONITOR
262 262
263config NOMMU_INITIAL_TRIM_EXCESS 263config NOMMU_INITIAL_TRIM_EXCESS
264 int "Turn on mmap() excess space trimming before booting" 264 int "Turn on mmap() excess space trimming before booting"
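
For reference, the KSM help text above talks about address ranges an application has advised as mergeable; that advice is given from userspace with madvise(). A minimal sketch, assuming a kernel built with CONFIG_KSM and ksmd running (not part of this patch):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	size_t len = 64UL << 20;	/* 64 MB of anonymous, page-aligned memory */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	/* tell KSM this range may contain identical pages worth merging */
	if (madvise(buf, len, MADV_MERGEABLE) != 0)
		perror("madvise(MADV_MERGEABLE)");

	/* ... populate buf; ksmd may now merge duplicate pages ... */
	munmap(buf, len);
	return 0;
}
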
diff --git a/mm/Makefile b/mm/Makefile
index ebf849042ed3..6c2a73a54a43 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
22obj-$(CONFIG_NUMA) += mempolicy.o 22obj-$(CONFIG_NUMA) += mempolicy.o
23obj-$(CONFIG_SPARSEMEM) += sparse.o 23obj-$(CONFIG_SPARSEMEM) += sparse.o
24obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o 24obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
26obj-$(CONFIG_SLOB) += slob.o 25obj-$(CONFIG_SLOB) += slob.o
27obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 26obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
28obj-$(CONFIG_KSM) += ksm.o 27obj-$(CONFIG_KSM) += ksm.o
@@ -34,10 +33,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
34obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 33obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
35obj-$(CONFIG_FS_XIP) += filemap_xip.o 34obj-$(CONFIG_FS_XIP) += filemap_xip.o
36obj-$(CONFIG_MIGRATION) += migrate.o 35obj-$(CONFIG_MIGRATION) += migrate.o
37ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA 36ifdef CONFIG_SMP
38obj-$(CONFIG_SMP) += percpu.o 37obj-y += percpu.o
39else 38else
40obj-$(CONFIG_SMP) += allocpercpu.o 39obj-y += percpu_up.o
41endif 40endif
42obj-$(CONFIG_QUICKLIST) += quicklist.o 41obj-$(CONFIG_QUICKLIST) += quicklist.o
43obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 42obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
deleted file mode 100644
index df34ceae0c67..000000000000
--- a/mm/allocpercpu.c
+++ /dev/null
@@ -1,177 +0,0 @@
1/*
2 * linux/mm/allocpercpu.c
3 *
4 * Separated from slab.c August 11, 2006 Christoph Lameter
5 */
6#include <linux/mm.h>
7#include <linux/module.h>
8#include <linux/bootmem.h>
9#include <asm/sections.h>
10
11#ifndef cache_line_size
12#define cache_line_size() L1_CACHE_BYTES
13#endif
14
15/**
16 * percpu_depopulate - depopulate per-cpu data for given cpu
17 * @__pdata: per-cpu data to depopulate
18 * @cpu: depopulate per-cpu data for this cpu
19 *
20 * Depopulating per-cpu data for a cpu going offline would be a typical
21 * use case. You need to register a cpu hotplug handler for that purpose.
22 */
23static void percpu_depopulate(void *__pdata, int cpu)
24{
25 struct percpu_data *pdata = __percpu_disguise(__pdata);
26
27 kfree(pdata->ptrs[cpu]);
28 pdata->ptrs[cpu] = NULL;
29}
30
31/**
32 * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
33 * @__pdata: per-cpu data to depopulate
34 * @mask: depopulate per-cpu data for cpu's selected through mask bits
35 */
36static void __percpu_depopulate_mask(void *__pdata, const cpumask_t *mask)
37{
38 int cpu;
39 for_each_cpu_mask_nr(cpu, *mask)
40 percpu_depopulate(__pdata, cpu);
41}
42
43#define percpu_depopulate_mask(__pdata, mask) \
44 __percpu_depopulate_mask((__pdata), &(mask))
45
46/**
47 * percpu_populate - populate per-cpu data for given cpu
48 * @__pdata: per-cpu data to populate further
49 * @size: size of per-cpu object
50 * @gfp: may sleep or not etc.
51 * @cpu: populate per-data for this cpu
52 *
53 * Populating per-cpu data for a cpu coming online would be a typical
54 * use case. You need to register a cpu hotplug handler for that purpose.
55 * Per-cpu object is populated with zeroed buffer.
56 */
57static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
58{
59 struct percpu_data *pdata = __percpu_disguise(__pdata);
60 int node = cpu_to_node(cpu);
61
62 /*
63 * We should make sure each CPU gets private memory.
64 */
65 size = roundup(size, cache_line_size());
66
67 BUG_ON(pdata->ptrs[cpu]);
68 if (node_online(node))
69 pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node);
70 else
71 pdata->ptrs[cpu] = kzalloc(size, gfp);
72 return pdata->ptrs[cpu];
73}
74
75/**
76 * percpu_populate_mask - populate per-cpu data for more cpu's
77 * @__pdata: per-cpu data to populate further
78 * @size: size of per-cpu object
79 * @gfp: may sleep or not etc.
80 * @mask: populate per-cpu data for cpu's selected through mask bits
81 *
82 * Per-cpu objects are populated with zeroed buffers.
83 */
84static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
85 cpumask_t *mask)
86{
87 cpumask_t populated;
88 int cpu;
89
90 cpus_clear(populated);
91 for_each_cpu_mask_nr(cpu, *mask)
92 if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
93 __percpu_depopulate_mask(__pdata, &populated);
94 return -ENOMEM;
95 } else
96 cpu_set(cpu, populated);
97 return 0;
98}
99
100#define percpu_populate_mask(__pdata, size, gfp, mask) \
101 __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
102
103/**
104 * alloc_percpu - initial setup of per-cpu data
105 * @size: size of per-cpu object
106 * @align: alignment
107 *
108 * Allocate dynamic percpu area. Percpu objects are populated with
109 * zeroed buffers.
110 */
111void *__alloc_percpu(size_t size, size_t align)
112{
113 /*
114 * We allocate whole cache lines to avoid false sharing
115 */
116 size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
117 void *pdata = kzalloc(sz, GFP_KERNEL);
118 void *__pdata = __percpu_disguise(pdata);
119
120 /*
121 * Can't easily make larger alignment work with kmalloc. WARN
122 * on it. Larger alignment should only be used for module
123 * percpu sections on SMP for which this path isn't used.
124 */
125 WARN_ON_ONCE(align > SMP_CACHE_BYTES);
126
127 if (unlikely(!pdata))
128 return NULL;
129 if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL,
130 &cpu_possible_map)))
131 return __pdata;
132 kfree(pdata);
133 return NULL;
134}
135EXPORT_SYMBOL_GPL(__alloc_percpu);
136
137/**
138 * free_percpu - final cleanup of per-cpu data
139 * @__pdata: object to clean up
140 *
141 * We simply clean up any per-cpu object left. No need for the client to
142 * track and specify through a bis mask which per-cpu objects are to free.
143 */
144void free_percpu(void *__pdata)
145{
146 if (unlikely(!__pdata))
147 return;
148 __percpu_depopulate_mask(__pdata, cpu_possible_mask);
149 kfree(__percpu_disguise(__pdata));
150}
151EXPORT_SYMBOL_GPL(free_percpu);
152
153/*
154 * Generic percpu area setup.
155 */
156#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
157unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
158
159EXPORT_SYMBOL(__per_cpu_offset);
160
161void __init setup_per_cpu_areas(void)
162{
163 unsigned long size, i;
164 char *ptr;
165 unsigned long nr_possible_cpus = num_possible_cpus();
166
167 /* Copy section for each CPU (we discard the original) */
168 size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
169 ptr = alloc_bootmem_pages(size * nr_possible_cpus);
170
171 for_each_possible_cpu(i) {
172 __per_cpu_offset[i] = ptr - __per_cpu_start;
173 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
174 ptr += size;
175 }
176}
177#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
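
The file removed above was the old kmalloc-based dynamic per-cpu allocator; its entry points live on in mm/percpu.c (or percpu_up.c on UP, per the Makefile change). A minimal in-kernel sketch of that interface, with an illustrative counter structure of my own invention:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/cpumask.h>

struct hit_counter {
	unsigned long hits;
};

static struct hit_counter *counters;

static int __init hit_counters_init(void)
{
	int cpu;
	unsigned long total = 0;

	counters = alloc_percpu(struct hit_counter);	/* one zeroed copy per possible CPU */
	if (!counters)
		return -ENOMEM;

	per_cpu_ptr(counters, get_cpu())->hits++;	/* touch this CPU's copy with preemption off */
	put_cpu();

	for_each_possible_cpu(cpu)			/* fold every CPU's copy together */
		total += per_cpu_ptr(counters, cpu)->hits;

	pr_info("hit_counter total: %lu\n", total);
	free_percpu(counters);
	return 0;
}
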
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 67a33a5a1a93..707d0dc6da0f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -11,6 +11,8 @@
11#include <linux/writeback.h> 11#include <linux/writeback.h>
12#include <linux/device.h> 12#include <linux/device.h>
13 13
14static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
15
14void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 16void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
15{ 17{
16} 18}
@@ -25,6 +27,11 @@ struct backing_dev_info default_backing_dev_info = {
25}; 27};
26EXPORT_SYMBOL_GPL(default_backing_dev_info); 28EXPORT_SYMBOL_GPL(default_backing_dev_info);
27 29
30struct backing_dev_info noop_backing_dev_info = {
31 .name = "noop",
32};
33EXPORT_SYMBOL_GPL(noop_backing_dev_info);
34
28static struct class *bdi_class; 35static struct class *bdi_class;
29 36
30/* 37/*
@@ -227,6 +234,9 @@ static struct device_attribute bdi_dev_attrs[] = {
227static __init int bdi_class_init(void) 234static __init int bdi_class_init(void)
228{ 235{
229 bdi_class = class_create(THIS_MODULE, "bdi"); 236 bdi_class = class_create(THIS_MODULE, "bdi");
237 if (IS_ERR(bdi_class))
238 return PTR_ERR(bdi_class);
239
230 bdi_class->dev_attrs = bdi_dev_attrs; 240 bdi_class->dev_attrs = bdi_dev_attrs;
231 bdi_debug_init(); 241 bdi_debug_init();
232 return 0; 242 return 0;
@@ -609,7 +619,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
609 * it would never exet if it is currently stuck in the refrigerator. 619 * it would never exet if it is currently stuck in the refrigerator.
610 */ 620 */
611 list_for_each_entry(wb, &bdi->wb_list, list) { 621 list_for_each_entry(wb, &bdi->wb_list, list) {
612 wb->task->flags &= ~PF_FROZEN; 622 thaw_process(wb->task);
613 kthread_stop(wb->task); 623 kthread_stop(wb->task);
614 } 624 }
615} 625}
@@ -712,6 +722,33 @@ void bdi_destroy(struct backing_dev_info *bdi)
712} 722}
713EXPORT_SYMBOL(bdi_destroy); 723EXPORT_SYMBOL(bdi_destroy);
714 724
725/*
726 * For use from filesystems to quickly init and register a bdi associated
727 * with dirty writeback
728 */
729int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
730 unsigned int cap)
731{
732 char tmp[32];
733 int err;
734
735 bdi->name = name;
736 bdi->capabilities = cap;
737 err = bdi_init(bdi);
738 if (err)
739 return err;
740
741 sprintf(tmp, "%.28s%s", name, "-%d");
742 err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
743 if (err) {
744 bdi_destroy(bdi);
745 return err;
746 }
747
748 return 0;
749}
750EXPORT_SYMBOL(bdi_setup_and_register);
751
715static wait_queue_head_t congestion_wqh[2] = { 752static wait_queue_head_t congestion_wqh[2] = {
716 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), 753 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
717 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 754 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
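
A rough sketch of how a filesystem might use the new bdi_setup_and_register() helper from its fill_super path; the kzalloc/s_bdi wiring and the "examplefs" name are illustrative, not taken from this patch:

#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/slab.h>

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	struct backing_dev_info *bdi;
	int err;

	bdi = kzalloc(sizeof(*bdi), GFP_KERNEL);
	if (!bdi)
		return -ENOMEM;

	/* inits the bdi and registers it under a unique "examplefs-N" name */
	err = bdi_setup_and_register(bdi, "examplefs", BDI_CAP_MAP_COPY);
	if (err) {
		kfree(bdi);
		return err;
	}
	sb->s_bdi = bdi;

	/* ... the rest of the usual fill_super work ... */
	return 0;
}
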
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 555d5d2731c6..58c66cc5056a 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -10,9 +10,11 @@
10 */ 10 */
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/pfn.h> 12#include <linux/pfn.h>
13#include <linux/slab.h>
13#include <linux/bootmem.h> 14#include <linux/bootmem.h>
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/kmemleak.h> 16#include <linux/kmemleak.h>
17#include <linux/range.h>
16 18
17#include <asm/bug.h> 19#include <asm/bug.h>
18#include <asm/io.h> 20#include <asm/io.h>
@@ -32,6 +34,7 @@ unsigned long max_pfn;
32unsigned long saved_max_pfn; 34unsigned long saved_max_pfn;
33#endif 35#endif
34 36
37#ifndef CONFIG_NO_BOOTMEM
35bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; 38bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
36 39
37static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); 40static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -142,7 +145,78 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
142 min_low_pfn = start; 145 min_low_pfn = start;
143 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); 146 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
144} 147}
148#endif
149/*
150 * free_bootmem_late - free bootmem pages directly to page allocator
151 * @addr: starting address of the range
152 * @size: size of the range in bytes
153 *
154 * This is only useful when the bootmem allocator has already been torn
155 * down, but we are still initializing the system. Pages are given directly
156 * to the page allocator, no bootmem metadata is updated because it is gone.
157 */
158void __init free_bootmem_late(unsigned long addr, unsigned long size)
159{
160 unsigned long cursor, end;
145 161
162 kmemleak_free_part(__va(addr), size);
163
164 cursor = PFN_UP(addr);
165 end = PFN_DOWN(addr + size);
166
167 for (; cursor < end; cursor++) {
168 __free_pages_bootmem(pfn_to_page(cursor), 0);
169 totalram_pages++;
170 }
171}
172
173#ifdef CONFIG_NO_BOOTMEM
174static void __init __free_pages_memory(unsigned long start, unsigned long end)
175{
176 int i;
177 unsigned long start_aligned, end_aligned;
178 int order = ilog2(BITS_PER_LONG);
179
180 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
181 end_aligned = end & ~(BITS_PER_LONG - 1);
182
183 if (end_aligned <= start_aligned) {
184 for (i = start; i < end; i++)
185 __free_pages_bootmem(pfn_to_page(i), 0);
186
187 return;
188 }
189
190 for (i = start; i < start_aligned; i++)
191 __free_pages_bootmem(pfn_to_page(i), 0);
192
193 for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
194 __free_pages_bootmem(pfn_to_page(i), order);
195
196 for (i = end_aligned; i < end; i++)
197 __free_pages_bootmem(pfn_to_page(i), 0);
198}
199
200unsigned long __init free_all_memory_core_early(int nodeid)
201{
202 int i;
203 u64 start, end;
204 unsigned long count = 0;
205 struct range *range = NULL;
206 int nr_range;
207
208 nr_range = get_free_all_memory_range(&range, nodeid);
209
210 for (i = 0; i < nr_range; i++) {
211 start = range[i].start;
212 end = range[i].end;
213 count += end - start;
214 __free_pages_memory(start, end);
215 }
216
217 return count;
218}
219#else
146static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) 220static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
147{ 221{
148 int aligned; 222 int aligned;
@@ -203,6 +277,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
203 277
204 return count; 278 return count;
205} 279}
280#endif
206 281
207/** 282/**
208 * free_all_bootmem_node - release a node's free pages to the buddy allocator 283 * free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -213,7 +288,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
213unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 288unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
214{ 289{
215 register_page_bootmem_info_node(pgdat); 290 register_page_bootmem_info_node(pgdat);
291#ifdef CONFIG_NO_BOOTMEM
292 /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
293 return 0;
294#else
216 return free_all_bootmem_core(pgdat->bdata); 295 return free_all_bootmem_core(pgdat->bdata);
296#endif
217} 297}
218 298
219/** 299/**
@@ -223,9 +303,27 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
223 */ 303 */
224unsigned long __init free_all_bootmem(void) 304unsigned long __init free_all_bootmem(void)
225{ 305{
226 return free_all_bootmem_core(NODE_DATA(0)->bdata); 306#ifdef CONFIG_NO_BOOTMEM
307 /*
308 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
309 * because in some case like Node0 doesnt have RAM installed
310 * low ram will be on Node1
311 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
312 * will be used instead of only Node0 related
313 */
314 return free_all_memory_core_early(MAX_NUMNODES);
315#else
316 unsigned long total_pages = 0;
317 bootmem_data_t *bdata;
318
319 list_for_each_entry(bdata, &bdata_list, list)
320 total_pages += free_all_bootmem_core(bdata);
321
322 return total_pages;
323#endif
227} 324}
228 325
326#ifndef CONFIG_NO_BOOTMEM
229static void __init __free(bootmem_data_t *bdata, 327static void __init __free(bootmem_data_t *bdata,
230 unsigned long sidx, unsigned long eidx) 328 unsigned long sidx, unsigned long eidx)
231{ 329{
@@ -320,6 +418,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
320 } 418 }
321 BUG(); 419 BUG();
322} 420}
421#endif
323 422
324/** 423/**
325 * free_bootmem_node - mark a page range as usable 424 * free_bootmem_node - mark a page range as usable
@@ -334,6 +433,9 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
334void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 433void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
335 unsigned long size) 434 unsigned long size)
336{ 435{
436#ifdef CONFIG_NO_BOOTMEM
437 free_early(physaddr, physaddr + size);
438#else
337 unsigned long start, end; 439 unsigned long start, end;
338 440
339 kmemleak_free_part(__va(physaddr), size); 441 kmemleak_free_part(__va(physaddr), size);
@@ -342,6 +444,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
342 end = PFN_DOWN(physaddr + size); 444 end = PFN_DOWN(physaddr + size);
343 445
344 mark_bootmem_node(pgdat->bdata, start, end, 0, 0); 446 mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
447#endif
345} 448}
346 449
347/** 450/**
@@ -355,6 +458,9 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
355 */ 458 */
356void __init free_bootmem(unsigned long addr, unsigned long size) 459void __init free_bootmem(unsigned long addr, unsigned long size)
357{ 460{
461#ifdef CONFIG_NO_BOOTMEM
462 free_early(addr, addr + size);
463#else
358 unsigned long start, end; 464 unsigned long start, end;
359 465
360 kmemleak_free_part(__va(addr), size); 466 kmemleak_free_part(__va(addr), size);
@@ -363,6 +469,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
363 end = PFN_DOWN(addr + size); 469 end = PFN_DOWN(addr + size);
364 470
365 mark_bootmem(start, end, 0, 0); 471 mark_bootmem(start, end, 0, 0);
472#endif
366} 473}
367 474
368/** 475/**
@@ -379,12 +486,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
379int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 486int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
380 unsigned long size, int flags) 487 unsigned long size, int flags)
381{ 488{
489#ifdef CONFIG_NO_BOOTMEM
490 panic("no bootmem");
491 return 0;
492#else
382 unsigned long start, end; 493 unsigned long start, end;
383 494
384 start = PFN_DOWN(physaddr); 495 start = PFN_DOWN(physaddr);
385 end = PFN_UP(physaddr + size); 496 end = PFN_UP(physaddr + size);
386 497
387 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); 498 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
499#endif
388} 500}
389 501
390/** 502/**
@@ -400,16 +512,22 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
400int __init reserve_bootmem(unsigned long addr, unsigned long size, 512int __init reserve_bootmem(unsigned long addr, unsigned long size,
401 int flags) 513 int flags)
402{ 514{
515#ifdef CONFIG_NO_BOOTMEM
516 panic("no bootmem");
517 return 0;
518#else
403 unsigned long start, end; 519 unsigned long start, end;
404 520
405 start = PFN_DOWN(addr); 521 start = PFN_DOWN(addr);
406 end = PFN_UP(addr + size); 522 end = PFN_UP(addr + size);
407 523
408 return mark_bootmem(start, end, 1, flags); 524 return mark_bootmem(start, end, 1, flags);
525#endif
409} 526}
410 527
411static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, 528#ifndef CONFIG_NO_BOOTMEM
412 unsigned long step) 529static unsigned long __init align_idx(struct bootmem_data *bdata,
530 unsigned long idx, unsigned long step)
413{ 531{
414 unsigned long base = bdata->node_min_pfn; 532 unsigned long base = bdata->node_min_pfn;
415 533
@@ -421,8 +539,8 @@ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
421 return ALIGN(base + idx, step) - base; 539 return ALIGN(base + idx, step) - base;
422} 540}
423 541
424static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, 542static unsigned long __init align_off(struct bootmem_data *bdata,
425 unsigned long align) 543 unsigned long off, unsigned long align)
426{ 544{
427 unsigned long base = PFN_PHYS(bdata->node_min_pfn); 545 unsigned long base = PFN_PHYS(bdata->node_min_pfn);
428 546
@@ -558,12 +676,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
558#endif 676#endif
559 return NULL; 677 return NULL;
560} 678}
679#endif
561 680
562static void * __init ___alloc_bootmem_nopanic(unsigned long size, 681static void * __init ___alloc_bootmem_nopanic(unsigned long size,
563 unsigned long align, 682 unsigned long align,
564 unsigned long goal, 683 unsigned long goal,
565 unsigned long limit) 684 unsigned long limit)
566{ 685{
686#ifdef CONFIG_NO_BOOTMEM
687 void *ptr;
688
689 if (WARN_ON_ONCE(slab_is_available()))
690 return kzalloc(size, GFP_NOWAIT);
691
692restart:
693
694 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
695
696 if (ptr)
697 return ptr;
698
699 if (goal != 0) {
700 goal = 0;
701 goto restart;
702 }
703
704 return NULL;
705#else
567 bootmem_data_t *bdata; 706 bootmem_data_t *bdata;
568 void *region; 707 void *region;
569 708
@@ -589,6 +728,7 @@ restart:
589 } 728 }
590 729
591 return NULL; 730 return NULL;
731#endif
592} 732}
593 733
594/** 734/**
@@ -607,7 +747,13 @@ restart:
607void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, 747void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
608 unsigned long goal) 748 unsigned long goal)
609{ 749{
610 return ___alloc_bootmem_nopanic(size, align, goal, 0); 750 unsigned long limit = 0;
751
752#ifdef CONFIG_NO_BOOTMEM
753 limit = -1UL;
754#endif
755
756 return ___alloc_bootmem_nopanic(size, align, goal, limit);
611} 757}
612 758
613static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, 759static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
@@ -641,9 +787,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
641void * __init __alloc_bootmem(unsigned long size, unsigned long align, 787void * __init __alloc_bootmem(unsigned long size, unsigned long align,
642 unsigned long goal) 788 unsigned long goal)
643{ 789{
644 return ___alloc_bootmem(size, align, goal, 0); 790 unsigned long limit = 0;
791
792#ifdef CONFIG_NO_BOOTMEM
793 limit = -1UL;
794#endif
795
796 return ___alloc_bootmem(size, align, goal, limit);
645} 797}
646 798
799#ifndef CONFIG_NO_BOOTMEM
647static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, 800static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
648 unsigned long size, unsigned long align, 801 unsigned long size, unsigned long align,
649 unsigned long goal, unsigned long limit) 802 unsigned long goal, unsigned long limit)
@@ -660,6 +813,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
660 813
661 return ___alloc_bootmem(size, align, goal, limit); 814 return ___alloc_bootmem(size, align, goal, limit);
662} 815}
816#endif
663 817
664/** 818/**
665 * __alloc_bootmem_node - allocate boot memory from a specific node 819 * __alloc_bootmem_node - allocate boot memory from a specific node
@@ -682,7 +836,46 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
682 if (WARN_ON_ONCE(slab_is_available())) 836 if (WARN_ON_ONCE(slab_is_available()))
683 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 837 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
684 838
839#ifdef CONFIG_NO_BOOTMEM
840 return __alloc_memory_core_early(pgdat->node_id, size, align,
841 goal, -1ULL);
842#else
685 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); 843 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
844#endif
845}
846
847void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
848 unsigned long align, unsigned long goal)
849{
850#ifdef MAX_DMA32_PFN
851 unsigned long end_pfn;
852
853 if (WARN_ON_ONCE(slab_is_available()))
854 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
855
856 /* update goal according ...MAX_DMA32_PFN */
857 end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
858
859 if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
860 (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
861 void *ptr;
862 unsigned long new_goal;
863
864 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
865#ifdef CONFIG_NO_BOOTMEM
866 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
867 new_goal, -1ULL);
868#else
869 ptr = alloc_bootmem_core(pgdat->bdata, size, align,
870 new_goal, 0);
871#endif
872 if (ptr)
873 return ptr;
874 }
875#endif
876
877 return __alloc_bootmem_node(pgdat, size, align, goal);
878
686} 879}
687 880
688#ifdef CONFIG_SPARSEMEM 881#ifdef CONFIG_SPARSEMEM
@@ -696,6 +889,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
696void * __init alloc_bootmem_section(unsigned long size, 889void * __init alloc_bootmem_section(unsigned long size,
697 unsigned long section_nr) 890 unsigned long section_nr)
698{ 891{
892#ifdef CONFIG_NO_BOOTMEM
893 unsigned long pfn, goal, limit;
894
895 pfn = section_nr_to_pfn(section_nr);
896 goal = pfn << PAGE_SHIFT;
897 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
898
899 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
900 SMP_CACHE_BYTES, goal, limit);
901#else
699 bootmem_data_t *bdata; 902 bootmem_data_t *bdata;
700 unsigned long pfn, goal, limit; 903 unsigned long pfn, goal, limit;
701 904
@@ -705,6 +908,7 @@ void * __init alloc_bootmem_section(unsigned long size,
705 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; 908 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
706 909
707 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); 910 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
911#endif
708} 912}
709#endif 913#endif
710 914
@@ -716,11 +920,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
716 if (WARN_ON_ONCE(slab_is_available())) 920 if (WARN_ON_ONCE(slab_is_available()))
717 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 921 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
718 922
923#ifdef CONFIG_NO_BOOTMEM
924 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
925 goal, -1ULL);
926#else
719 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); 927 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
720 if (ptr) 928 if (ptr)
721 return ptr; 929 return ptr;
722 930
723 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 931 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
932#endif
724 if (ptr) 933 if (ptr)
725 return ptr; 934 return ptr;
726 935
@@ -771,6 +980,11 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
771 if (WARN_ON_ONCE(slab_is_available())) 980 if (WARN_ON_ONCE(slab_is_available()))
772 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 981 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
773 982
983#ifdef CONFIG_NO_BOOTMEM
984 return __alloc_memory_core_early(pgdat->node_id, size, align,
985 goal, ARCH_LOW_ADDRESS_LIMIT);
986#else
774 return ___alloc_bootmem_node(pgdat->bdata, size, align, 987 return ___alloc_bootmem_node(pgdat->bdata, size, align,
775 goal, ARCH_LOW_ADDRESS_LIMIT); 988 goal, ARCH_LOW_ADDRESS_LIMIT);
989#endif
776} 990}
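
free_bootmem_late() above is meant for memory that only becomes releasable after the bootmem allocator has been torn down. A hedged sketch of arch/init code releasing such a range; the firmware-log region is a made-up example:

#include <linux/bootmem.h>
#include <linux/init.h>

/* hypothetical firmware log area reserved early and released once parsed */
static unsigned long fw_log_start;	/* physical address */
static unsigned long fw_log_size;	/* bytes */

static void __init release_fw_log_area(void)
{
	/*
	 * Only valid once bootmem bookkeeping is gone: the pages go straight
	 * to the buddy allocator and totalram_pages is bumped, exactly as in
	 * free_bootmem_late() above.
	 */
	if (fw_log_size)
		free_bootmem_late(fw_log_start, fw_log_size);
}
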
diff --git a/mm/bounce.c b/mm/bounce.c
index a2b76a588e34..13b6dad1eed2 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -6,6 +6,7 @@
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/swap.h> 8#include <linux/swap.h>
9#include <linux/gfp.h>
9#include <linux/bio.h> 10#include <linux/bio.h>
10#include <linux/pagemap.h> 11#include <linux/pagemap.h>
11#include <linux/mempool.h> 12#include <linux/mempool.h>
diff --git a/mm/fadvise.c b/mm/fadvise.c
index e43359214f6f..8d723c9e8b75 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -77,12 +77,20 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
77 switch (advice) { 77 switch (advice) {
78 case POSIX_FADV_NORMAL: 78 case POSIX_FADV_NORMAL:
79 file->f_ra.ra_pages = bdi->ra_pages; 79 file->f_ra.ra_pages = bdi->ra_pages;
80 spin_lock(&file->f_lock);
81 file->f_mode &= ~FMODE_RANDOM;
82 spin_unlock(&file->f_lock);
80 break; 83 break;
81 case POSIX_FADV_RANDOM: 84 case POSIX_FADV_RANDOM:
82 file->f_ra.ra_pages = 0; 85 spin_lock(&file->f_lock);
86 file->f_mode |= FMODE_RANDOM;
87 spin_unlock(&file->f_lock);
83 break; 88 break;
84 case POSIX_FADV_SEQUENTIAL: 89 case POSIX_FADV_SEQUENTIAL:
85 file->f_ra.ra_pages = bdi->ra_pages * 2; 90 file->f_ra.ra_pages = bdi->ra_pages * 2;
91 spin_lock(&file->f_lock);
92 file->f_mode &= ~FMODE_RANDOM;
93 spin_unlock(&file->f_lock);
86 break; 94 break;
87 case POSIX_FADV_WILLNEED: 95 case POSIX_FADV_WILLNEED:
88 if (!mapping->a_ops->readpage) { 96 if (!mapping->a_ops->readpage) {
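
The fadvise change above stops zeroing ra_pages for POSIX_FADV_RANDOM and instead flags the struct file with FMODE_RANDOM. From userspace the advice is still requested the usual way; a small sketch:

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <unistd.h>

int open_for_random_io(const char *path)
{
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;

	/* the kernel now sets FMODE_RANDOM on this file instead of zeroing ra_pages */
	if (posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM) != 0) {
		close(fd);
		return -1;
	}
	return fd;
}
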
diff --git a/mm/failslab.c b/mm/failslab.c
index 9339de5f0a91..c5f88f240ddc 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,18 +1,21 @@
1#include <linux/fault-inject.h> 1#include <linux/fault-inject.h>
2#include <linux/gfp.h> 2#include <linux/slab.h>
3 3
4static struct { 4static struct {
5 struct fault_attr attr; 5 struct fault_attr attr;
6 u32 ignore_gfp_wait; 6 u32 ignore_gfp_wait;
7 int cache_filter;
7#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 8#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
8 struct dentry *ignore_gfp_wait_file; 9 struct dentry *ignore_gfp_wait_file;
10 struct dentry *cache_filter_file;
9#endif 11#endif
10} failslab = { 12} failslab = {
11 .attr = FAULT_ATTR_INITIALIZER, 13 .attr = FAULT_ATTR_INITIALIZER,
12 .ignore_gfp_wait = 1, 14 .ignore_gfp_wait = 1,
15 .cache_filter = 0,
13}; 16};
14 17
15bool should_failslab(size_t size, gfp_t gfpflags) 18bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
16{ 19{
17 if (gfpflags & __GFP_NOFAIL) 20 if (gfpflags & __GFP_NOFAIL)
18 return false; 21 return false;
@@ -20,6 +23,9 @@ bool should_failslab(size_t size, gfp_t gfpflags)
20 if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) 23 if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
21 return false; 24 return false;
22 25
26 if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
27 return false;
28
23 return should_fail(&failslab.attr, size); 29 return should_fail(&failslab.attr, size);
24} 30}
25 31
@@ -30,7 +36,6 @@ static int __init setup_failslab(char *str)
30__setup("failslab=", setup_failslab); 36__setup("failslab=", setup_failslab);
31 37
32#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 38#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
33
34static int __init failslab_debugfs_init(void) 39static int __init failslab_debugfs_init(void)
35{ 40{
36 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 41 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
@@ -46,8 +51,14 @@ static int __init failslab_debugfs_init(void)
46 debugfs_create_bool("ignore-gfp-wait", mode, dir, 51 debugfs_create_bool("ignore-gfp-wait", mode, dir,
47 &failslab.ignore_gfp_wait); 52 &failslab.ignore_gfp_wait);
48 53
49 if (!failslab.ignore_gfp_wait_file) { 54 failslab.cache_filter_file =
55 debugfs_create_bool("cache-filter", mode, dir,
56 &failslab.cache_filter);
57
58 if (!failslab.ignore_gfp_wait_file ||
59 !failslab.cache_filter_file) {
50 err = -ENOMEM; 60 err = -ENOMEM;
61 debugfs_remove(failslab.cache_filter_file);
51 debugfs_remove(failslab.ignore_gfp_wait_file); 62 debugfs_remove(failslab.ignore_gfp_wait_file);
52 cleanup_fault_attr_dentries(&failslab.attr); 63 cleanup_fault_attr_dentries(&failslab.attr);
53 } 64 }
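
With the new cache-filter knob enabled, only caches created with SLAB_FAILSLAB see injected allocation failures. A hedged sketch of opting a cache in (the foo_cache names are invented; the debugfs paths assume the standard fault-injection layout):

#include <linux/slab.h>
#include <linux/init.h>
#include <linux/errno.h>

struct foo {
	int id;
	void *payload;
};

static struct kmem_cache *foo_cache;

static int __init foo_cache_init(void)
{
	/* SLAB_FAILSLAB opts this cache in once cache-filter is enabled */
	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				      SLAB_FAILSLAB, NULL);
	return foo_cache ? 0 : -ENOMEM;
}

/*
 * Then, with debugfs mounted at /sys/kernel/debug:
 *   echo 1  > /sys/kernel/debug/failslab/cache-filter
 *   echo 10 > /sys/kernel/debug/failslab/probability
 *   echo -1 > /sys/kernel/debug/failslab/times
 */
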
diff --git a/mm/filemap.c b/mm/filemap.c
index ef169f37156d..140ebda9640f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -10,13 +10,13 @@
10 * the NFS filesystem used to do this differently, for example) 10 * the NFS filesystem used to do this differently, for example)
11 */ 11 */
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/slab.h>
14#include <linux/compiler.h> 13#include <linux/compiler.h>
15#include <linux/fs.h> 14#include <linux/fs.h>
16#include <linux/uaccess.h> 15#include <linux/uaccess.h>
17#include <linux/aio.h> 16#include <linux/aio.h>
18#include <linux/capability.h> 17#include <linux/capability.h>
19#include <linux/kernel_stat.h> 18#include <linux/kernel_stat.h>
19#include <linux/gfp.h>
20#include <linux/mm.h> 20#include <linux/mm.h>
21#include <linux/swap.h> 21#include <linux/swap.h>
22#include <linux/mman.h> 22#include <linux/mman.h>
@@ -260,27 +260,27 @@ int filemap_flush(struct address_space *mapping)
260EXPORT_SYMBOL(filemap_flush); 260EXPORT_SYMBOL(filemap_flush);
261 261
262/** 262/**
263 * wait_on_page_writeback_range - wait for writeback to complete 263 * filemap_fdatawait_range - wait for writeback to complete
264 * @mapping: target address_space 264 * @mapping: address space structure to wait for
265 * @start: beginning page index 265 * @start_byte: offset in bytes where the range starts
266 * @end: ending page index 266 * @end_byte: offset in bytes where the range ends (inclusive)
267 * 267 *
268 * Wait for writeback to complete against pages indexed by start->end 268 * Walk the list of under-writeback pages of the given address space
269 * inclusive 269 * in the given range and wait for all of them.
270 */ 270 */
271int wait_on_page_writeback_range(struct address_space *mapping, 271int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
272 pgoff_t start, pgoff_t end) 272 loff_t end_byte)
273{ 273{
274 pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
275 pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
274 struct pagevec pvec; 276 struct pagevec pvec;
275 int nr_pages; 277 int nr_pages;
276 int ret = 0; 278 int ret = 0;
277 pgoff_t index;
278 279
279 if (end < start) 280 if (end_byte < start_byte)
280 return 0; 281 return 0;
281 282
282 pagevec_init(&pvec, 0); 283 pagevec_init(&pvec, 0);
283 index = start;
284 while ((index <= end) && 284 while ((index <= end) &&
285 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 285 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
286 PAGECACHE_TAG_WRITEBACK, 286 PAGECACHE_TAG_WRITEBACK,
@@ -310,25 +310,6 @@ int wait_on_page_writeback_range(struct address_space *mapping,
310 310
311 return ret; 311 return ret;
312} 312}
313
314/**
315 * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range
316 * @mapping: address space structure to wait for
317 * @start: offset in bytes where the range starts
318 * @end: offset in bytes where the range ends (inclusive)
319 *
320 * Walk the list of under-writeback pages of the given address space
321 * in the given range and wait for all of them.
322 *
323 * This is just a simple wrapper so that callers don't have to convert offsets
324 * to page indexes themselves
325 */
326int filemap_fdatawait_range(struct address_space *mapping, loff_t start,
327 loff_t end)
328{
329 return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT,
330 end >> PAGE_CACHE_SHIFT);
331}
332EXPORT_SYMBOL(filemap_fdatawait_range); 313EXPORT_SYMBOL(filemap_fdatawait_range);
333 314
334/** 315/**
@@ -345,8 +326,7 @@ int filemap_fdatawait(struct address_space *mapping)
345 if (i_size == 0) 326 if (i_size == 0)
346 return 0; 327 return 0;
347 328
348 return wait_on_page_writeback_range(mapping, 0, 329 return filemap_fdatawait_range(mapping, 0, i_size - 1);
349 (i_size - 1) >> PAGE_CACHE_SHIFT);
350} 330}
351EXPORT_SYMBOL(filemap_fdatawait); 331EXPORT_SYMBOL(filemap_fdatawait);
352 332
@@ -393,9 +373,8 @@ int filemap_write_and_wait_range(struct address_space *mapping,
393 WB_SYNC_ALL); 373 WB_SYNC_ALL);
394 /* See comment of filemap_write_and_wait() */ 374 /* See comment of filemap_write_and_wait() */
395 if (err != -EIO) { 375 if (err != -EIO) {
396 int err2 = wait_on_page_writeback_range(mapping, 376 int err2 = filemap_fdatawait_range(mapping,
397 lstart >> PAGE_CACHE_SHIFT, 377 lstart, lend);
398 lend >> PAGE_CACHE_SHIFT);
399 if (!err) 378 if (!err)
400 err = err2; 379 err = err2;
401 } 380 }
@@ -1138,7 +1117,7 @@ readpage:
1138 if (!PageUptodate(page)) { 1117 if (!PageUptodate(page)) {
1139 if (page->mapping == NULL) { 1118 if (page->mapping == NULL) {
1140 /* 1119 /*
1141 * invalidate_inode_pages got it 1120 * invalidate_mapping_pages got it
1142 */ 1121 */
1143 unlock_page(page); 1122 unlock_page(page);
1144 page_cache_release(page); 1123 page_cache_release(page);
@@ -1655,14 +1634,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
1655static struct page *__read_cache_page(struct address_space *mapping, 1634static struct page *__read_cache_page(struct address_space *mapping,
1656 pgoff_t index, 1635 pgoff_t index,
1657 int (*filler)(void *,struct page*), 1636 int (*filler)(void *,struct page*),
1658 void *data) 1637 void *data,
1638 gfp_t gfp)
1659{ 1639{
1660 struct page *page; 1640 struct page *page;
1661 int err; 1641 int err;
1662repeat: 1642repeat:
1663 page = find_get_page(mapping, index); 1643 page = find_get_page(mapping, index);
1664 if (!page) { 1644 if (!page) {
1665 page = page_cache_alloc_cold(mapping); 1645 page = __page_cache_alloc(gfp | __GFP_COLD);
1666 if (!page) 1646 if (!page)
1667 return ERR_PTR(-ENOMEM); 1647 return ERR_PTR(-ENOMEM);
1668 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); 1648 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
@@ -1682,31 +1662,18 @@ repeat:
1682 return page; 1662 return page;
1683} 1663}
1684 1664
1685/** 1665static struct page *do_read_cache_page(struct address_space *mapping,
1686 * read_cache_page_async - read into page cache, fill it if needed
1687 * @mapping: the page's address_space
1688 * @index: the page index
1689 * @filler: function to perform the read
1690 * @data: destination for read data
1691 *
1692 * Same as read_cache_page, but don't wait for page to become unlocked
1693 * after submitting it to the filler.
1694 *
1695 * Read into the page cache. If a page already exists, and PageUptodate() is
1696 * not set, try to fill the page but don't wait for it to become unlocked.
1697 *
1698 * If the page does not get brought uptodate, return -EIO.
1699 */
1700struct page *read_cache_page_async(struct address_space *mapping,
1701 pgoff_t index, 1666 pgoff_t index,
1702 int (*filler)(void *,struct page*), 1667 int (*filler)(void *,struct page*),
1703 void *data) 1668 void *data,
1669 gfp_t gfp)
1670
1704{ 1671{
1705 struct page *page; 1672 struct page *page;
1706 int err; 1673 int err;
1707 1674
1708retry: 1675retry:
1709 page = __read_cache_page(mapping, index, filler, data); 1676 page = __read_cache_page(mapping, index, filler, data, gfp);
1710 if (IS_ERR(page)) 1677 if (IS_ERR(page))
1711 return page; 1678 return page;
1712 if (PageUptodate(page)) 1679 if (PageUptodate(page))
@@ -1731,8 +1698,67 @@ out:
1731 mark_page_accessed(page); 1698 mark_page_accessed(page);
1732 return page; 1699 return page;
1733} 1700}
1701
1702/**
1703 * read_cache_page_async - read into page cache, fill it if needed
1704 * @mapping: the page's address_space
1705 * @index: the page index
1706 * @filler: function to perform the read
1707 * @data: destination for read data
1708 *
1709 * Same as read_cache_page, but don't wait for page to become unlocked
1710 * after submitting it to the filler.
1711 *
1712 * Read into the page cache. If a page already exists, and PageUptodate() is
1713 * not set, try to fill the page but don't wait for it to become unlocked.
1714 *
1715 * If the page does not get brought uptodate, return -EIO.
1716 */
1717struct page *read_cache_page_async(struct address_space *mapping,
1718 pgoff_t index,
1719 int (*filler)(void *,struct page*),
1720 void *data)
1721{
1722 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
1723}
1734EXPORT_SYMBOL(read_cache_page_async); 1724EXPORT_SYMBOL(read_cache_page_async);
1735 1725
1726static struct page *wait_on_page_read(struct page *page)
1727{
1728 if (!IS_ERR(page)) {
1729 wait_on_page_locked(page);
1730 if (!PageUptodate(page)) {
1731 page_cache_release(page);
1732 page = ERR_PTR(-EIO);
1733 }
1734 }
1735 return page;
1736}
1737
1738/**
1739 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
1740 * @mapping: the page's address_space
1741 * @index: the page index
1742 * @gfp: the page allocator flags to use if allocating
1743 *
1744 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
1745 * any new page allocations done using the specified allocation flags. Note
1746 * that the Radix tree operations will still use GFP_KERNEL, so you can't
1747 * expect to do this atomically or anything like that - but you can pass in
1748 * other page requirements.
1749 *
1750 * If the page does not get brought uptodate, return -EIO.
1751 */
1752struct page *read_cache_page_gfp(struct address_space *mapping,
1753 pgoff_t index,
1754 gfp_t gfp)
1755{
1756 filler_t *filler = (filler_t *)mapping->a_ops->readpage;
1757
1758 return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
1759}
1760EXPORT_SYMBOL(read_cache_page_gfp);
1761
1736/** 1762/**
1737 * read_cache_page - read into page cache, fill it if needed 1763 * read_cache_page - read into page cache, fill it if needed
1738 * @mapping: the page's address_space 1764 * @mapping: the page's address_space
@@ -1750,18 +1776,7 @@ struct page *read_cache_page(struct address_space *mapping,
1750 int (*filler)(void *,struct page*), 1776 int (*filler)(void *,struct page*),
1751 void *data) 1777 void *data)
1752{ 1778{
1753 struct page *page; 1779 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
1754
1755 page = read_cache_page_async(mapping, index, filler, data);
1756 if (IS_ERR(page))
1757 goto out;
1758 wait_on_page_locked(page);
1759 if (!PageUptodate(page)) {
1760 page_cache_release(page);
1761 page = ERR_PTR(-EIO);
1762 }
1763 out:
1764 return page;
1765} 1780}
1766EXPORT_SYMBOL(read_cache_page); 1781EXPORT_SYMBOL(read_cache_page);
1767 1782
@@ -1844,7 +1859,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1844 1859
1845/* 1860/*
1846 * Copy as much as we can into the page and return the number of bytes which 1861 * Copy as much as we can into the page and return the number of bytes which
1847 * were sucessfully copied. If a fault is encountered then return the number of 1862 * were successfully copied. If a fault is encountered then return the number of
1848 * bytes which were copied. 1863 * bytes which were copied.
1849 */ 1864 */
1850size_t iov_iter_copy_from_user_atomic(struct page *page, 1865size_t iov_iter_copy_from_user_atomic(struct page *page,
@@ -1971,7 +1986,7 @@ EXPORT_SYMBOL(iov_iter_single_seg_count);
1971inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) 1986inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
1972{ 1987{
1973 struct inode *inode = file->f_mapping->host; 1988 struct inode *inode = file->f_mapping->host;
1974 unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 1989 unsigned long limit = rlimit(RLIMIT_FSIZE);
1975 1990
1976 if (unlikely(*pos < 0)) 1991 if (unlikely(*pos < 0))
1977 return -EINVAL; 1992 return -EINVAL;
@@ -2217,6 +2232,9 @@ again:
2217 if (unlikely(status)) 2232 if (unlikely(status))
2218 break; 2233 break;
2219 2234
2235 if (mapping_writably_mapped(mapping))
2236 flush_dcache_page(page);
2237
2220 pagefault_disable(); 2238 pagefault_disable();
2221 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 2239 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2222 pagefault_enable(); 2240 pagefault_enable();
@@ -2261,7 +2279,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2261 size_t count, ssize_t written) 2279 size_t count, ssize_t written)
2262{ 2280{
2263 struct file *file = iocb->ki_filp; 2281 struct file *file = iocb->ki_filp;
2264 struct address_space *mapping = file->f_mapping;
2265 ssize_t status; 2282 ssize_t status;
2266 struct iov_iter i; 2283 struct iov_iter i;
2267 2284
@@ -2273,15 +2290,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2273 *ppos = pos + status; 2290 *ppos = pos + status;
2274 } 2291 }
2275 2292
2276 /*
2277 * If we get here for O_DIRECT writes then we must have fallen through
2278 * to buffered writes (block instantiation inside i_size). So we sync
2279 * the file data here, to try to honour O_DIRECT expectations.
2280 */
2281 if (unlikely(file->f_flags & O_DIRECT) && written)
2282 status = filemap_write_and_wait_range(mapping,
2283 pos, pos + written - 1);
2284
2285 return written ? written : status; 2293 return written ? written : status;
2286} 2294}
2287EXPORT_SYMBOL(generic_file_buffered_write); 2295EXPORT_SYMBOL(generic_file_buffered_write);
@@ -2380,10 +2388,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2380 * semantics. 2388 * semantics.
2381 */ 2389 */
2382 endbyte = pos + written_buffered - written - 1; 2390 endbyte = pos + written_buffered - written - 1;
2383 err = do_sync_mapping_range(file->f_mapping, pos, endbyte, 2391 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
2384 SYNC_FILE_RANGE_WAIT_BEFORE|
2385 SYNC_FILE_RANGE_WRITE|
2386 SYNC_FILE_RANGE_WAIT_AFTER);
2387 if (err == 0) { 2392 if (err == 0) {
2388 written = written_buffered; 2393 written = written_buffered;
2389 invalidate_mapping_pages(mapping, 2394 invalidate_mapping_pages(mapping,
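
read_cache_page_gfp(), added above, lets a caller choose the allocation flags used for any newly allocated page while reusing the mapping's ->readpage as the filler. A minimal sketch of a caller; the read_meta_page() wrapper and the GFP_NOFS choice are illustrative:

#include <linux/pagemap.h>
#include <linux/gfp.h>
#include <linux/err.h>

static struct page *read_meta_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

	/* GFP_NOFS keeps the page allocation from recursing back into the filesystem */
	page = read_cache_page_gfp(mapping, index, GFP_NOFS);
	if (IS_ERR(page))
		return page;	/* ERR_PTR(-EIO) if the filler could not bring it uptodate */

	/* page is uptodate with an elevated refcount; release it later with page_cache_release() */
	return page;
}
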
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 1888b2d71bb8..83364df74a33 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -17,6 +17,7 @@
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/seqlock.h> 18#include <linux/seqlock.h>
19#include <linux/mutex.h> 19#include <linux/mutex.h>
20#include <linux/gfp.h>
20#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
21#include <asm/io.h> 22#include <asm/io.h>
22 23
@@ -194,7 +195,7 @@ retry:
194 flush_cache_page(vma, address, pte_pfn(*pte)); 195 flush_cache_page(vma, address, pte_pfn(*pte));
195 pteval = ptep_clear_flush_notify(vma, address, pte); 196 pteval = ptep_clear_flush_notify(vma, address, pte);
196 page_remove_rmap(page); 197 page_remove_rmap(page);
197 dec_mm_counter(mm, file_rss); 198 dec_mm_counter(mm, MM_FILEPAGES);
198 BUG_ON(pte_dirty(pteval)); 199 BUG_ON(pte_dirty(pteval));
199 pte_unmap_unlock(pte, ptl); 200 pte_unmap_unlock(pte, ptl);
200 page_cache_release(page); 201 page_cache_release(page);
diff --git a/mm/fremap.c b/mm/fremap.c
index b6ec85abbb39..46f5dacf90a2 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
40 page_remove_rmap(page); 40 page_remove_rmap(page);
41 page_cache_release(page); 41 page_cache_release(page);
42 update_hiwater_rss(mm); 42 update_hiwater_rss(mm);
43 dec_mm_counter(mm, file_rss); 43 dec_mm_counter(mm, MM_FILEPAGES);
44 } 44 }
45 } else { 45 } else {
46 if (!pte_file(pte)) 46 if (!pte_file(pte))
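
This hunk and the filemap_xip.c one above both switch from the old file_rss/anon_rss fields to the enum-indexed mm counters. A small sketch of that accessor family as I understand it from these hunks; the helper functions are illustrative:

#include <linux/mm.h>
#include <linux/sched.h>

static void account_cow_of_file_page(struct mm_struct *mm)
{
	/* one fewer resident file-backed page, one more anonymous page */
	dec_mm_counter(mm, MM_FILEPAGES);
	inc_mm_counter(mm, MM_ANONPAGES);
}

static unsigned long file_rss_of(struct mm_struct *mm)
{
	return get_mm_counter(mm, MM_FILEPAGES);
}
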
diff --git a/mm/highmem.c b/mm/highmem.c
index 9c1e627f282e..bed8a8bfd01f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -220,7 +220,7 @@ EXPORT_SYMBOL(kmap_high);
220 * @page: &struct page to pin 220 * @page: &struct page to pin
221 * 221 *
222 * Returns the page's current virtual memory address, or NULL if no mapping 222 * Returns the page's current virtual memory address, or NULL if no mapping
223 * exists. When and only when a non null address is returned then a 223 * exists. If and only if a non null address is returned then a
224 * matching call to kunmap_high() is necessary. 224 * matching call to kunmap_high() is necessary.
225 * 225 *
226 * This can be called from any context. 226 * This can be called from any context.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5d7601b02874..4c9e6bbf3772 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2,7 +2,6 @@
2 * Generic hugetlb support. 2 * Generic hugetlb support.
3 * (C) William Irwin, April 2004 3 * (C) William Irwin, April 2004
4 */ 4 */
5#include <linux/gfp.h>
6#include <linux/list.h> 5#include <linux/list.h>
7#include <linux/init.h> 6#include <linux/init.h>
8#include <linux/module.h> 7#include <linux/module.h>
@@ -18,12 +17,14 @@
18#include <linux/mutex.h> 17#include <linux/mutex.h>
19#include <linux/bootmem.h> 18#include <linux/bootmem.h>
20#include <linux/sysfs.h> 19#include <linux/sysfs.h>
20#include <linux/slab.h>
21 21
22#include <asm/page.h> 22#include <asm/page.h>
23#include <asm/pgtable.h> 23#include <asm/pgtable.h>
24#include <asm/io.h> 24#include <asm/io.h>
25 25
26#include <linux/hugetlb.h> 26#include <linux/hugetlb.h>
27#include <linux/node.h>
27#include "internal.h" 28#include "internal.h"
28 29
29const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 30const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
@@ -401,7 +402,7 @@ static void clear_huge_page(struct page *page,
401{ 402{
402 int i; 403 int i;
403 404
404 if (unlikely(sz > MAX_ORDER_NR_PAGES)) { 405 if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
405 clear_gigantic_page(page, addr, sz); 406 clear_gigantic_page(page, addr, sz);
406 return; 407 return;
407 } 408 }
@@ -545,6 +546,7 @@ static void free_huge_page(struct page *page)
545 546
546 mapping = (struct address_space *) page_private(page); 547 mapping = (struct address_space *) page_private(page);
547 set_page_private(page, 0); 548 set_page_private(page, 0);
549 page->mapping = NULL;
548 BUG_ON(page_count(page)); 550 BUG_ON(page_count(page));
549 INIT_LIST_HEAD(&page->lru); 551 INIT_LIST_HEAD(&page->lru);
550 552
@@ -622,42 +624,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
622} 624}
623 625
624/* 626/*
625 * Use a helper variable to find the next node and then 627 * common helper functions for hstate_next_node_to_{alloc|free}.
626 * copy it back to next_nid_to_alloc afterwards: 628 * We may have allocated or freed a huge page based on a different
627 * otherwise there's a window in which a racer might 629 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
628 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. 630 * be outside of *nodes_allowed. Ensure that we use an allowed
629 * But we don't need to use a spin_lock here: it really 631 * node for alloc or free.
630 * doesn't matter if occasionally a racer chooses the
631 * same nid as we do. Move nid forward in the mask even
632 * if we just successfully allocated a hugepage so that
633 * the next caller gets hugepages on the next node.
634 */ 632 */
635static int hstate_next_node_to_alloc(struct hstate *h) 633static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
636{ 634{
637 int next_nid; 635 nid = next_node(nid, *nodes_allowed);
638 next_nid = next_node(h->next_nid_to_alloc, node_online_map); 636 if (nid == MAX_NUMNODES)
639 if (next_nid == MAX_NUMNODES) 637 nid = first_node(*nodes_allowed);
640 next_nid = first_node(node_online_map); 638 VM_BUG_ON(nid >= MAX_NUMNODES);
641 h->next_nid_to_alloc = next_nid; 639
642 return next_nid; 640 return nid;
643} 641}
644 642
645static int alloc_fresh_huge_page(struct hstate *h) 643static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
644{
645 if (!node_isset(nid, *nodes_allowed))
646 nid = next_node_allowed(nid, nodes_allowed);
647 return nid;
648}
649
650/*
651 * returns the previously saved node ["this node"] from which to
652 * allocate a persistent huge page for the pool and advance the
653 * next node from which to allocate, handling wrap at end of node
654 * mask.
655 */
656static int hstate_next_node_to_alloc(struct hstate *h,
657 nodemask_t *nodes_allowed)
658{
659 int nid;
660
661 VM_BUG_ON(!nodes_allowed);
662
663 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
664 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
665
666 return nid;
667}
668
669static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
646{ 670{
647 struct page *page; 671 struct page *page;
648 int start_nid; 672 int start_nid;
649 int next_nid; 673 int next_nid;
650 int ret = 0; 674 int ret = 0;
651 675
652 start_nid = h->next_nid_to_alloc; 676 start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
653 next_nid = start_nid; 677 next_nid = start_nid;
654 678
655 do { 679 do {
656 page = alloc_fresh_huge_page_node(h, next_nid); 680 page = alloc_fresh_huge_page_node(h, next_nid);
657 if (page) 681 if (page) {
658 ret = 1; 682 ret = 1;
659 next_nid = hstate_next_node_to_alloc(h); 683 break;
660 } while (!page && next_nid != start_nid); 684 }
685 next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
686 } while (next_nid != start_nid);
661 687
662 if (ret) 688 if (ret)
663 count_vm_event(HTLB_BUDDY_PGALLOC); 689 count_vm_event(HTLB_BUDDY_PGALLOC);
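
The rewritten helpers round-robin both allocation (hstate_next_node_to_alloc) and, further down in this patch, freeing (hstate_next_node_to_free) over an arbitrary nodes_allowed mask: they wrap at the end of the mask and always advance the saved next-node, even when the previously saved node is no longer allowed. A small userspace model of that wrap-and-advance logic, using a plain 64-bit mask and invented helper names (next_allowed, next_node_to_use):

    #include <stdio.h>
    #include <stdint.h>

    #define MAX_NODES 64

    /* next allowed node strictly after nid, or MAX_NODES if none (models next_node()) */
    static int next_allowed(int nid, uint64_t allowed)
    {
        for (int n = nid + 1; n < MAX_NODES; n++)
            if (allowed & (1ULL << n))
                return n;
        return MAX_NODES;
    }

    static int first_allowed(uint64_t allowed)
    {
        return next_allowed(-1, allowed);
    }

    /* return the saved node and advance it, wrapping at the end of the mask */
    static int next_node_to_use(int *saved, uint64_t allowed)
    {
        int nid = *saved;

        if (nid >= MAX_NODES || !(allowed & (1ULL << nid)))
            nid = next_allowed(nid < MAX_NODES ? nid : -1, allowed);
        if (nid == MAX_NODES)
            nid = first_allowed(allowed);

        *saved = next_allowed(nid, allowed);
        if (*saved == MAX_NODES)
            *saved = first_allowed(allowed);
        return nid;
    }

    int main(void)
    {
        uint64_t allowed = (1ULL << 0) | (1ULL << 2) | (1ULL << 3);
        int next = 0;

        for (int i = 0; i < 6; i++)
            printf("use node %d\n", next_node_to_use(&next, allowed));
        return 0;    /* prints nodes 0 2 3 0 2 3 */
    }

Six calls with nodes {0,2,3} allowed yield 0 2 3 0 2 3, the same interleaving the kernel helpers aim for across allocations and frees.
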
@@ -668,17 +694,21 @@ static int alloc_fresh_huge_page(struct hstate *h)
668} 694}
669 695
670/* 696/*
671 * helper for free_pool_huge_page() - find next node 697 * helper for free_pool_huge_page() - return the previously saved
672 * from which to free a huge page 698 * node ["this node"] from which to free a huge page. Advance the
699 * next node id whether or not we find a free huge page to free so
700 * that the next attempt to free addresses the next node.
673 */ 701 */
674static int hstate_next_node_to_free(struct hstate *h) 702static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
675{ 703{
676 int next_nid; 704 int nid;
677 next_nid = next_node(h->next_nid_to_free, node_online_map); 705
678 if (next_nid == MAX_NUMNODES) 706 VM_BUG_ON(!nodes_allowed);
679 next_nid = first_node(node_online_map); 707
680 h->next_nid_to_free = next_nid; 708 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
681 return next_nid; 709 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
710
711 return nid;
682} 712}
683 713
684/* 714/*
@@ -687,13 +717,14 @@ static int hstate_next_node_to_free(struct hstate *h)
687 * balanced over allowed nodes. 717 * balanced over allowed nodes.
688 * Called with hugetlb_lock locked. 718 * Called with hugetlb_lock locked.
689 */ 719 */
690static int free_pool_huge_page(struct hstate *h, bool acct_surplus) 720static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
721 bool acct_surplus)
691{ 722{
692 int start_nid; 723 int start_nid;
693 int next_nid; 724 int next_nid;
694 int ret = 0; 725 int ret = 0;
695 726
696 start_nid = h->next_nid_to_free; 727 start_nid = hstate_next_node_to_free(h, nodes_allowed);
697 next_nid = start_nid; 728 next_nid = start_nid;
698 729
699 do { 730 do {
@@ -715,9 +746,10 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
715 } 746 }
716 update_and_free_page(h, page); 747 update_and_free_page(h, page);
717 ret = 1; 748 ret = 1;
749 break;
718 } 750 }
719 next_nid = hstate_next_node_to_free(h); 751 next_nid = hstate_next_node_to_free(h, nodes_allowed);
720 } while (!ret && next_nid != start_nid); 752 } while (next_nid != start_nid);
721 753
722 return ret; 754 return ret;
723} 755}
@@ -911,14 +943,14 @@ static void return_unused_surplus_pages(struct hstate *h,
911 943
912 /* 944 /*
913 * We want to release as many surplus pages as possible, spread 945 * We want to release as many surplus pages as possible, spread
914 * evenly across all nodes. Iterate across all nodes until we 946 * evenly across all nodes with memory. Iterate across these nodes
915 * can no longer free unreserved surplus pages. This occurs when 947 * until we can no longer free unreserved surplus pages. This occurs
916 * the nodes with surplus pages have no free pages. 948 * when the nodes with surplus pages have no free pages.
917 * free_pool_huge_page() will balance the frees across the 949 * free_pool_huge_page() will balance the freed pages across the
918 * on-line nodes for us and will handle the hstate accounting. 950 * on-line nodes with memory and will handle the hstate accounting.
919 */ 951 */
920 while (nr_pages--) { 952 while (nr_pages--) {
921 if (!free_pool_huge_page(h, 1)) 953 if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1))
922 break; 954 break;
923 } 955 }
924} 956}
@@ -1007,7 +1039,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1007 page = alloc_buddy_huge_page(h, vma, addr); 1039 page = alloc_buddy_huge_page(h, vma, addr);
1008 if (!page) { 1040 if (!page) {
1009 hugetlb_put_quota(inode->i_mapping, chg); 1041 hugetlb_put_quota(inode->i_mapping, chg);
1010 return ERR_PTR(-VM_FAULT_OOM); 1042 return ERR_PTR(-VM_FAULT_SIGBUS);
1011 } 1043 }
1012 } 1044 }
1013 1045
@@ -1022,16 +1054,16 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1022int __weak alloc_bootmem_huge_page(struct hstate *h) 1054int __weak alloc_bootmem_huge_page(struct hstate *h)
1023{ 1055{
1024 struct huge_bootmem_page *m; 1056 struct huge_bootmem_page *m;
1025 int nr_nodes = nodes_weight(node_online_map); 1057 int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
1026 1058
1027 while (nr_nodes) { 1059 while (nr_nodes) {
1028 void *addr; 1060 void *addr;
1029 1061
1030 addr = __alloc_bootmem_node_nopanic( 1062 addr = __alloc_bootmem_node_nopanic(
1031 NODE_DATA(h->next_nid_to_alloc), 1063 NODE_DATA(hstate_next_node_to_alloc(h,
1064 &node_states[N_HIGH_MEMORY])),
1032 huge_page_size(h), huge_page_size(h), 0); 1065 huge_page_size(h), huge_page_size(h), 0);
1033 1066
1034 hstate_next_node_to_alloc(h);
1035 if (addr) { 1067 if (addr) {
1036 /* 1068 /*
1037 * Use the beginning of the huge page to store the 1069 * Use the beginning of the huge page to store the
@@ -1084,7 +1116,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1084 if (h->order >= MAX_ORDER) { 1116 if (h->order >= MAX_ORDER) {
1085 if (!alloc_bootmem_huge_page(h)) 1117 if (!alloc_bootmem_huge_page(h))
1086 break; 1118 break;
1087 } else if (!alloc_fresh_huge_page(h)) 1119 } else if (!alloc_fresh_huge_page(h,
1120 &node_states[N_HIGH_MEMORY]))
1088 break; 1121 break;
1089 } 1122 }
1090 h->max_huge_pages = i; 1123 h->max_huge_pages = i;
@@ -1126,14 +1159,15 @@ static void __init report_hugepages(void)
1126} 1159}
1127 1160
1128#ifdef CONFIG_HIGHMEM 1161#ifdef CONFIG_HIGHMEM
1129static void try_to_free_low(struct hstate *h, unsigned long count) 1162static void try_to_free_low(struct hstate *h, unsigned long count,
1163 nodemask_t *nodes_allowed)
1130{ 1164{
1131 int i; 1165 int i;
1132 1166
1133 if (h->order >= MAX_ORDER) 1167 if (h->order >= MAX_ORDER)
1134 return; 1168 return;
1135 1169
1136 for (i = 0; i < MAX_NUMNODES; ++i) { 1170 for_each_node_mask(i, *nodes_allowed) {
1137 struct page *page, *next; 1171 struct page *page, *next;
1138 struct list_head *freel = &h->hugepage_freelists[i]; 1172 struct list_head *freel = &h->hugepage_freelists[i];
1139 list_for_each_entry_safe(page, next, freel, lru) { 1173 list_for_each_entry_safe(page, next, freel, lru) {
@@ -1149,7 +1183,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count)
1149 } 1183 }
1150} 1184}
1151#else 1185#else
1152static inline void try_to_free_low(struct hstate *h, unsigned long count) 1186static inline void try_to_free_low(struct hstate *h, unsigned long count,
1187 nodemask_t *nodes_allowed)
1153{ 1188{
1154} 1189}
1155#endif 1190#endif
@@ -1159,7 +1194,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
1159 * balanced by operating on them in a round-robin fashion. 1194 * balanced by operating on them in a round-robin fashion.
1160 * Returns 1 if an adjustment was made. 1195 * Returns 1 if an adjustment was made.
1161 */ 1196 */
1162static int adjust_pool_surplus(struct hstate *h, int delta) 1197static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
1198 int delta)
1163{ 1199{
1164 int start_nid, next_nid; 1200 int start_nid, next_nid;
1165 int ret = 0; 1201 int ret = 0;
@@ -1167,29 +1203,33 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
1167 VM_BUG_ON(delta != -1 && delta != 1); 1203 VM_BUG_ON(delta != -1 && delta != 1);
1168 1204
1169 if (delta < 0) 1205 if (delta < 0)
1170 start_nid = h->next_nid_to_alloc; 1206 start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
1171 else 1207 else
1172 start_nid = h->next_nid_to_free; 1208 start_nid = hstate_next_node_to_free(h, nodes_allowed);
1173 next_nid = start_nid; 1209 next_nid = start_nid;
1174 1210
1175 do { 1211 do {
1176 int nid = next_nid; 1212 int nid = next_nid;
1177 if (delta < 0) { 1213 if (delta < 0) {
1178 next_nid = hstate_next_node_to_alloc(h);
1179 /* 1214 /*
1180 * To shrink on this node, there must be a surplus page 1215 * To shrink on this node, there must be a surplus page
1181 */ 1216 */
1182 if (!h->surplus_huge_pages_node[nid]) 1217 if (!h->surplus_huge_pages_node[nid]) {
1218 next_nid = hstate_next_node_to_alloc(h,
1219 nodes_allowed);
1183 continue; 1220 continue;
1221 }
1184 } 1222 }
1185 if (delta > 0) { 1223 if (delta > 0) {
1186 next_nid = hstate_next_node_to_free(h);
1187 /* 1224 /*
1188 * Surplus cannot exceed the total number of pages 1225 * Surplus cannot exceed the total number of pages
1189 */ 1226 */
1190 if (h->surplus_huge_pages_node[nid] >= 1227 if (h->surplus_huge_pages_node[nid] >=
1191 h->nr_huge_pages_node[nid]) 1228 h->nr_huge_pages_node[nid]) {
1229 next_nid = hstate_next_node_to_free(h,
1230 nodes_allowed);
1192 continue; 1231 continue;
1232 }
1193 } 1233 }
1194 1234
1195 h->surplus_huge_pages += delta; 1235 h->surplus_huge_pages += delta;
@@ -1202,7 +1242,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
1202} 1242}
1203 1243
1204#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 1244#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
1205static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) 1245static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
1246 nodemask_t *nodes_allowed)
1206{ 1247{
1207 unsigned long min_count, ret; 1248 unsigned long min_count, ret;
1208 1249
@@ -1222,7 +1263,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1222 */ 1263 */
1223 spin_lock(&hugetlb_lock); 1264 spin_lock(&hugetlb_lock);
1224 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 1265 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
1225 if (!adjust_pool_surplus(h, -1)) 1266 if (!adjust_pool_surplus(h, nodes_allowed, -1))
1226 break; 1267 break;
1227 } 1268 }
1228 1269
@@ -1233,11 +1274,14 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1233 * and reducing the surplus. 1274 * and reducing the surplus.
1234 */ 1275 */
1235 spin_unlock(&hugetlb_lock); 1276 spin_unlock(&hugetlb_lock);
1236 ret = alloc_fresh_huge_page(h); 1277 ret = alloc_fresh_huge_page(h, nodes_allowed);
1237 spin_lock(&hugetlb_lock); 1278 spin_lock(&hugetlb_lock);
1238 if (!ret) 1279 if (!ret)
1239 goto out; 1280 goto out;
1240 1281
1282 /* Bail for signals. Probably ctrl-c from user */
1283 if (signal_pending(current))
1284 goto out;
1241 } 1285 }
1242 1286
1243 /* 1287 /*
@@ -1257,13 +1301,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1257 */ 1301 */
1258 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; 1302 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
1259 min_count = max(count, min_count); 1303 min_count = max(count, min_count);
1260 try_to_free_low(h, min_count); 1304 try_to_free_low(h, min_count, nodes_allowed);
1261 while (min_count < persistent_huge_pages(h)) { 1305 while (min_count < persistent_huge_pages(h)) {
1262 if (!free_pool_huge_page(h, 0)) 1306 if (!free_pool_huge_page(h, nodes_allowed, 0))
1263 break; 1307 break;
1264 } 1308 }
1265 while (count < persistent_huge_pages(h)) { 1309 while (count < persistent_huge_pages(h)) {
1266 if (!adjust_pool_surplus(h, 1)) 1310 if (!adjust_pool_surplus(h, nodes_allowed, 1))
1267 break; 1311 break;
1268 } 1312 }
1269out: 1313out:
@@ -1282,43 +1326,117 @@ out:
1282static struct kobject *hugepages_kobj; 1326static struct kobject *hugepages_kobj;
1283static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 1327static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1284 1328
1285static struct hstate *kobj_to_hstate(struct kobject *kobj) 1329static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
1330
1331static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
1286{ 1332{
1287 int i; 1333 int i;
1334
1288 for (i = 0; i < HUGE_MAX_HSTATE; i++) 1335 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1289 if (hstate_kobjs[i] == kobj) 1336 if (hstate_kobjs[i] == kobj) {
1337 if (nidp)
1338 *nidp = NUMA_NO_NODE;
1290 return &hstates[i]; 1339 return &hstates[i];
1291 BUG(); 1340 }
1292 return NULL; 1341
1342 return kobj_to_node_hstate(kobj, nidp);
1293} 1343}
1294 1344
1295static ssize_t nr_hugepages_show(struct kobject *kobj, 1345static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1296 struct kobj_attribute *attr, char *buf) 1346 struct kobj_attribute *attr, char *buf)
1297{ 1347{
1298 struct hstate *h = kobj_to_hstate(kobj); 1348 struct hstate *h;
1299 return sprintf(buf, "%lu\n", h->nr_huge_pages); 1349 unsigned long nr_huge_pages;
1350 int nid;
1351
1352 h = kobj_to_hstate(kobj, &nid);
1353 if (nid == NUMA_NO_NODE)
1354 nr_huge_pages = h->nr_huge_pages;
1355 else
1356 nr_huge_pages = h->nr_huge_pages_node[nid];
1357
1358 return sprintf(buf, "%lu\n", nr_huge_pages);
1300} 1359}
1301static ssize_t nr_hugepages_store(struct kobject *kobj, 1360static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1302 struct kobj_attribute *attr, const char *buf, size_t count) 1361 struct kobject *kobj, struct kobj_attribute *attr,
1362 const char *buf, size_t len)
1303{ 1363{
1304 int err; 1364 int err;
1305 unsigned long input; 1365 int nid;
1306 struct hstate *h = kobj_to_hstate(kobj); 1366 unsigned long count;
1367 struct hstate *h;
1368 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
1307 1369
1308 err = strict_strtoul(buf, 10, &input); 1370 err = strict_strtoul(buf, 10, &count);
1309 if (err) 1371 if (err)
1310 return 0; 1372 return 0;
1311 1373
1312 h->max_huge_pages = set_max_huge_pages(h, input); 1374 h = kobj_to_hstate(kobj, &nid);
1375 if (nid == NUMA_NO_NODE) {
1376 /*
1377 * global hstate attribute
1378 */
1379 if (!(obey_mempolicy &&
1380 init_nodemask_of_mempolicy(nodes_allowed))) {
1381 NODEMASK_FREE(nodes_allowed);
1382 nodes_allowed = &node_states[N_HIGH_MEMORY];
1383 }
1384 } else if (nodes_allowed) {
1385 /*
1386 * per node hstate attribute: adjust count to global,
1387 * but restrict alloc/free to the specified node.
1388 */
1389 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
1390 init_nodemask_of_node(nodes_allowed, nid);
1391 } else
1392 nodes_allowed = &node_states[N_HIGH_MEMORY];
1313 1393
1314 return count; 1394 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
1395
1396 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1397 NODEMASK_FREE(nodes_allowed);
1398
1399 return len;
1400}
1401
1402static ssize_t nr_hugepages_show(struct kobject *kobj,
1403 struct kobj_attribute *attr, char *buf)
1404{
1405 return nr_hugepages_show_common(kobj, attr, buf);
1406}
1407
1408static ssize_t nr_hugepages_store(struct kobject *kobj,
1409 struct kobj_attribute *attr, const char *buf, size_t len)
1410{
1411 return nr_hugepages_store_common(false, kobj, attr, buf, len);
1315} 1412}
1316HSTATE_ATTR(nr_hugepages); 1413HSTATE_ATTR(nr_hugepages);
1317 1414
1415#ifdef CONFIG_NUMA
1416
1417/*
1418 * hstate attribute for optionally mempolicy-based constraint on persistent
1419 * huge page alloc/free.
1420 */
1421static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
1422 struct kobj_attribute *attr, char *buf)
1423{
1424 return nr_hugepages_show_common(kobj, attr, buf);
1425}
1426
1427static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
1428 struct kobj_attribute *attr, const char *buf, size_t len)
1429{
1430 return nr_hugepages_store_common(true, kobj, attr, buf, len);
1431}
1432HSTATE_ATTR(nr_hugepages_mempolicy);
1433#endif
1434
1435
1318static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, 1436static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1319 struct kobj_attribute *attr, char *buf) 1437 struct kobj_attribute *attr, char *buf)
1320{ 1438{
1321 struct hstate *h = kobj_to_hstate(kobj); 1439 struct hstate *h = kobj_to_hstate(kobj, NULL);
1322 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 1440 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1323} 1441}
1324static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 1442static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
@@ -1326,7 +1444,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1326{ 1444{
1327 int err; 1445 int err;
1328 unsigned long input; 1446 unsigned long input;
1329 struct hstate *h = kobj_to_hstate(kobj); 1447 struct hstate *h = kobj_to_hstate(kobj, NULL);
1330 1448
1331 err = strict_strtoul(buf, 10, &input); 1449 err = strict_strtoul(buf, 10, &input);
1332 if (err) 1450 if (err)
@@ -1343,15 +1461,24 @@ HSTATE_ATTR(nr_overcommit_hugepages);
1343static ssize_t free_hugepages_show(struct kobject *kobj, 1461static ssize_t free_hugepages_show(struct kobject *kobj,
1344 struct kobj_attribute *attr, char *buf) 1462 struct kobj_attribute *attr, char *buf)
1345{ 1463{
1346 struct hstate *h = kobj_to_hstate(kobj); 1464 struct hstate *h;
1347 return sprintf(buf, "%lu\n", h->free_huge_pages); 1465 unsigned long free_huge_pages;
1466 int nid;
1467
1468 h = kobj_to_hstate(kobj, &nid);
1469 if (nid == NUMA_NO_NODE)
1470 free_huge_pages = h->free_huge_pages;
1471 else
1472 free_huge_pages = h->free_huge_pages_node[nid];
1473
1474 return sprintf(buf, "%lu\n", free_huge_pages);
1348} 1475}
1349HSTATE_ATTR_RO(free_hugepages); 1476HSTATE_ATTR_RO(free_hugepages);
1350 1477
1351static ssize_t resv_hugepages_show(struct kobject *kobj, 1478static ssize_t resv_hugepages_show(struct kobject *kobj,
1352 struct kobj_attribute *attr, char *buf) 1479 struct kobj_attribute *attr, char *buf)
1353{ 1480{
1354 struct hstate *h = kobj_to_hstate(kobj); 1481 struct hstate *h = kobj_to_hstate(kobj, NULL);
1355 return sprintf(buf, "%lu\n", h->resv_huge_pages); 1482 return sprintf(buf, "%lu\n", h->resv_huge_pages);
1356} 1483}
1357HSTATE_ATTR_RO(resv_hugepages); 1484HSTATE_ATTR_RO(resv_hugepages);
@@ -1359,8 +1486,17 @@ HSTATE_ATTR_RO(resv_hugepages);
1359static ssize_t surplus_hugepages_show(struct kobject *kobj, 1486static ssize_t surplus_hugepages_show(struct kobject *kobj,
1360 struct kobj_attribute *attr, char *buf) 1487 struct kobj_attribute *attr, char *buf)
1361{ 1488{
1362 struct hstate *h = kobj_to_hstate(kobj); 1489 struct hstate *h;
1363 return sprintf(buf, "%lu\n", h->surplus_huge_pages); 1490 unsigned long surplus_huge_pages;
1491 int nid;
1492
1493 h = kobj_to_hstate(kobj, &nid);
1494 if (nid == NUMA_NO_NODE)
1495 surplus_huge_pages = h->surplus_huge_pages;
1496 else
1497 surplus_huge_pages = h->surplus_huge_pages_node[nid];
1498
1499 return sprintf(buf, "%lu\n", surplus_huge_pages);
1364} 1500}
1365HSTATE_ATTR_RO(surplus_hugepages); 1501HSTATE_ATTR_RO(surplus_hugepages);
1366 1502
@@ -1370,6 +1506,9 @@ static struct attribute *hstate_attrs[] = {
1370 &free_hugepages_attr.attr, 1506 &free_hugepages_attr.attr,
1371 &resv_hugepages_attr.attr, 1507 &resv_hugepages_attr.attr,
1372 &surplus_hugepages_attr.attr, 1508 &surplus_hugepages_attr.attr,
1509#ifdef CONFIG_NUMA
1510 &nr_hugepages_mempolicy_attr.attr,
1511#endif
1373 NULL, 1512 NULL,
1374}; 1513};
1375 1514
@@ -1377,19 +1516,20 @@ static struct attribute_group hstate_attr_group = {
1377 .attrs = hstate_attrs, 1516 .attrs = hstate_attrs,
1378}; 1517};
1379 1518
1380static int __init hugetlb_sysfs_add_hstate(struct hstate *h) 1519static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
1520 struct kobject **hstate_kobjs,
1521 struct attribute_group *hstate_attr_group)
1381{ 1522{
1382 int retval; 1523 int retval;
1524 int hi = h - hstates;
1383 1525
1384 hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, 1526 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
1385 hugepages_kobj); 1527 if (!hstate_kobjs[hi])
1386 if (!hstate_kobjs[h - hstates])
1387 return -ENOMEM; 1528 return -ENOMEM;
1388 1529
1389 retval = sysfs_create_group(hstate_kobjs[h - hstates], 1530 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
1390 &hstate_attr_group);
1391 if (retval) 1531 if (retval)
1392 kobject_put(hstate_kobjs[h - hstates]); 1532 kobject_put(hstate_kobjs[hi]);
1393 1533
1394 return retval; 1534 return retval;
1395} 1535}
@@ -1404,17 +1544,184 @@ static void __init hugetlb_sysfs_init(void)
1404 return; 1544 return;
1405 1545
1406 for_each_hstate(h) { 1546 for_each_hstate(h) {
1407 err = hugetlb_sysfs_add_hstate(h); 1547 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
1548 hstate_kobjs, &hstate_attr_group);
1408 if (err) 1549 if (err)
1409 printk(KERN_ERR "Hugetlb: Unable to add hstate %s", 1550 printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
1410 h->name); 1551 h->name);
1411 } 1552 }
1412} 1553}
1413 1554
1555#ifdef CONFIG_NUMA
1556
1557/*
1558 * node_hstate/s - associate per node hstate attributes, via their kobjects,
1559 * with node sysdevs in node_devices[] using a parallel array. The array
1560 * index of a node sysdev or _hstate == node id.
1561 * This is here to avoid any static dependency of the node sysdev driver, in
1562 * the base kernel, on the hugetlb module.
1563 */
1564struct node_hstate {
1565 struct kobject *hugepages_kobj;
1566 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1567};
1568struct node_hstate node_hstates[MAX_NUMNODES];
1569
1570/*
1571 * A subset of global hstate attributes for node sysdevs
1572 */
1573static struct attribute *per_node_hstate_attrs[] = {
1574 &nr_hugepages_attr.attr,
1575 &free_hugepages_attr.attr,
1576 &surplus_hugepages_attr.attr,
1577 NULL,
1578};
1579
1580static struct attribute_group per_node_hstate_attr_group = {
1581 .attrs = per_node_hstate_attrs,
1582};
1583
1584/*
1585 * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj.
1586 * Returns node id via non-NULL nidp.
1587 */
1588static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1589{
1590 int nid;
1591
1592 for (nid = 0; nid < nr_node_ids; nid++) {
1593 struct node_hstate *nhs = &node_hstates[nid];
1594 int i;
1595 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1596 if (nhs->hstate_kobjs[i] == kobj) {
1597 if (nidp)
1598 *nidp = nid;
1599 return &hstates[i];
1600 }
1601 }
1602
1603 BUG();
1604 return NULL;
1605}
1606
1607/*
1608 * Unregister hstate attributes from a single node sysdev.
1609 * No-op if no hstate attributes attached.
1610 */
1611void hugetlb_unregister_node(struct node *node)
1612{
1613 struct hstate *h;
1614 struct node_hstate *nhs = &node_hstates[node->sysdev.id];
1615
1616 if (!nhs->hugepages_kobj)
1617 return; /* no hstate attributes */
1618
1619 for_each_hstate(h)
1620 if (nhs->hstate_kobjs[h - hstates]) {
1621 kobject_put(nhs->hstate_kobjs[h - hstates]);
1622 nhs->hstate_kobjs[h - hstates] = NULL;
1623 }
1624
1625 kobject_put(nhs->hugepages_kobj);
1626 nhs->hugepages_kobj = NULL;
1627}
1628
1629/*
1630 * hugetlb module exit: unregister hstate attributes from node sysdevs
1631 * that have them.
1632 */
1633static void hugetlb_unregister_all_nodes(void)
1634{
1635 int nid;
1636
1637 /*
1638 * disable node sysdev registrations.
1639 */
1640 register_hugetlbfs_with_node(NULL, NULL);
1641
1642 /*
1643 * remove hstate attributes from any nodes that have them.
1644 */
1645 for (nid = 0; nid < nr_node_ids; nid++)
1646 hugetlb_unregister_node(&node_devices[nid]);
1647}
1648
1649/*
1650 * Register hstate attributes for a single node sysdev.
1651 * No-op if attributes already registered.
1652 */
1653void hugetlb_register_node(struct node *node)
1654{
1655 struct hstate *h;
1656 struct node_hstate *nhs = &node_hstates[node->sysdev.id];
1657 int err;
1658
1659 if (nhs->hugepages_kobj)
1660 return; /* already allocated */
1661
1662 nhs->hugepages_kobj = kobject_create_and_add("hugepages",
1663 &node->sysdev.kobj);
1664 if (!nhs->hugepages_kobj)
1665 return;
1666
1667 for_each_hstate(h) {
1668 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
1669 nhs->hstate_kobjs,
1670 &per_node_hstate_attr_group);
1671 if (err) {
1672 printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
1673 " for node %d\n",
1674 h->name, node->sysdev.id);
1675 hugetlb_unregister_node(node);
1676 break;
1677 }
1678 }
1679}
1680
1681/*
1682 * hugetlb init time: register hstate attributes for all registered node
1683 * sysdevs of nodes that have memory. All on-line nodes should have
1684 * registered their associated sysdev by this time.
1685 */
1686static void hugetlb_register_all_nodes(void)
1687{
1688 int nid;
1689
1690 for_each_node_state(nid, N_HIGH_MEMORY) {
1691 struct node *node = &node_devices[nid];
1692 if (node->sysdev.id == nid)
1693 hugetlb_register_node(node);
1694 }
1695
1696 /*
1697 * Let the node sysdev driver know we're here so it can
1698 * [un]register hstate attributes on node hotplug.
1699 */
1700 register_hugetlbfs_with_node(hugetlb_register_node,
1701 hugetlb_unregister_node);
1702}
1703#else /* !CONFIG_NUMA */
1704
1705static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1706{
1707 BUG();
1708 if (nidp)
1709 *nidp = -1;
1710 return NULL;
1711}
1712
1713static void hugetlb_unregister_all_nodes(void) { }
1714
1715static void hugetlb_register_all_nodes(void) { }
1716
1717#endif
1718
1414static void __exit hugetlb_exit(void) 1719static void __exit hugetlb_exit(void)
1415{ 1720{
1416 struct hstate *h; 1721 struct hstate *h;
1417 1722
1723 hugetlb_unregister_all_nodes();
1724
1418 for_each_hstate(h) { 1725 for_each_hstate(h) {
1419 kobject_put(hstate_kobjs[h - hstates]); 1726 kobject_put(hstate_kobjs[h - hstates]);
1420 } 1727 }
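
The registration dance above keeps the node sysdev driver in the base kernel free of any static dependency on hugetlb: hugetlb hands a register/unregister callback pair to register_hugetlbfs_with_node(), and the node side simply invokes whichever callbacks are currently installed as nodes come and go. A minimal userspace model of that decoupling, with invented names (struct node is just an id, register_client() stands in for register_hugetlbfs_with_node()):

    #include <stdio.h>

    struct node { int id; };

    static void (*register_cb)(struct node *);
    static void (*unregister_cb)(struct node *);

    /* stands in for register_hugetlbfs_with_node(): stash the callbacks ... */
    static void register_client(void (*reg)(struct node *),
                                void (*unreg)(struct node *))
    {
        register_cb = reg;
        unregister_cb = unreg;
    }

    /* ... so the node side can notify a client it was never linked against */
    static void node_online(struct node *n)
    {
        if (register_cb)
            register_cb(n);
    }

    static void node_offline(struct node *n)
    {
        if (unregister_cb)
            unregister_cb(n);
    }

    static void add_attrs(struct node *n)
    {
        printf("adding hugepages attrs for node %d\n", n->id);
    }

    int main(void)
    {
        struct node n0 = { 0 };

        node_online(&n0);                 /* nothing: no client registered yet */
        register_client(add_attrs, NULL);
        node_online(&n0);                 /* prints for node 0 */
        node_offline(&n0);                /* no-op: no unregister callback set */
        return 0;
    }
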
@@ -1449,6 +1756,8 @@ static int __init hugetlb_init(void)
1449 1756
1450 hugetlb_sysfs_init(); 1757 hugetlb_sysfs_init();
1451 1758
1759 hugetlb_register_all_nodes();
1760
1452 return 0; 1761 return 0;
1453} 1762}
1454module_init(hugetlb_init); 1763module_init(hugetlb_init);
@@ -1472,8 +1781,8 @@ void __init hugetlb_add_hstate(unsigned order)
1472 h->free_huge_pages = 0; 1781 h->free_huge_pages = 0;
1473 for (i = 0; i < MAX_NUMNODES; ++i) 1782 for (i = 0; i < MAX_NUMNODES; ++i)
1474 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1783 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1475 h->next_nid_to_alloc = first_node(node_online_map); 1784 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
1476 h->next_nid_to_free = first_node(node_online_map); 1785 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
1477 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1786 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1478 huge_page_size(h)/1024); 1787 huge_page_size(h)/1024);
1479 1788
@@ -1536,9 +1845,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
1536} 1845}
1537 1846
1538#ifdef CONFIG_SYSCTL 1847#ifdef CONFIG_SYSCTL
1539int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1848static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1540 void __user *buffer, 1849 struct ctl_table *table, int write,
1541 size_t *length, loff_t *ppos) 1850 void __user *buffer, size_t *length, loff_t *ppos)
1542{ 1851{
1543 struct hstate *h = &default_hstate; 1852 struct hstate *h = &default_hstate;
1544 unsigned long tmp; 1853 unsigned long tmp;
@@ -1550,12 +1859,40 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1550 table->maxlen = sizeof(unsigned long); 1859 table->maxlen = sizeof(unsigned long);
1551 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1860 proc_doulongvec_minmax(table, write, buffer, length, ppos);
1552 1861
1553 if (write) 1862 if (write) {
1554 h->max_huge_pages = set_max_huge_pages(h, tmp); 1863 NODEMASK_ALLOC(nodemask_t, nodes_allowed,
1864 GFP_KERNEL | __GFP_NORETRY);
1865 if (!(obey_mempolicy &&
1866 init_nodemask_of_mempolicy(nodes_allowed))) {
1867 NODEMASK_FREE(nodes_allowed);
1868 nodes_allowed = &node_states[N_HIGH_MEMORY];
1869 }
1870 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
1871
1872 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1873 NODEMASK_FREE(nodes_allowed);
1874 }
1555 1875
1556 return 0; 1876 return 0;
1557} 1877}
1558 1878
1879int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1880 void __user *buffer, size_t *length, loff_t *ppos)
1881{
1882
1883 return hugetlb_sysctl_handler_common(false, table, write,
1884 buffer, length, ppos);
1885}
1886
1887#ifdef CONFIG_NUMA
1888int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
1889 void __user *buffer, size_t *length, loff_t *ppos)
1890{
1891 return hugetlb_sysctl_handler_common(true, table, write,
1892 buffer, length, ppos);
1893}
1894#endif /* CONFIG_NUMA */
1895
1559int hugetlb_treat_movable_handler(struct ctl_table *table, int write, 1896int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1560 void __user *buffer, 1897 void __user *buffer,
1561 size_t *length, loff_t *ppos) 1898 size_t *length, loff_t *ppos)
@@ -1751,7 +2088,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
1751 2088
1752 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); 2089 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
1753 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { 2090 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
1754 update_mmu_cache(vma, address, entry); 2091 update_mmu_cache(vma, address, ptep);
1755 } 2092 }
1756} 2093}
1757 2094
@@ -1903,6 +2240,12 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
1903 + (vma->vm_pgoff >> PAGE_SHIFT); 2240 + (vma->vm_pgoff >> PAGE_SHIFT);
1904 mapping = (struct address_space *)page_private(page); 2241 mapping = (struct address_space *)page_private(page);
1905 2242
2243 /*
2244 * Take the mapping lock for the duration of the table walk. As
2245 * this mapping should be shared between all the VMAs,
2246 * __unmap_hugepage_range() is called as the lock is already held
2247 */
2248 spin_lock(&mapping->i_mmap_lock);
1906 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 2249 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1907 /* Do not unmap the current VMA */ 2250 /* Do not unmap the current VMA */
1908 if (iter_vma == vma) 2251 if (iter_vma == vma)
@@ -1916,10 +2259,11 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
1916 * from the time of fork. This would look like data corruption 2259 * from the time of fork. This would look like data corruption
1917 */ 2260 */
1918 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 2261 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
1919 unmap_hugepage_range(iter_vma, 2262 __unmap_hugepage_range(iter_vma,
1920 address, address + huge_page_size(h), 2263 address, address + huge_page_size(h),
1921 page); 2264 page);
1922 } 2265 }
2266 spin_unlock(&mapping->i_mmap_lock);
1923 2267
1924 return 1; 2268 return 1;
1925} 2269}
@@ -1959,6 +2303,9 @@ retry_avoidcopy:
1959 outside_reserve = 1; 2303 outside_reserve = 1;
1960 2304
1961 page_cache_get(old_page); 2305 page_cache_get(old_page);
2306
2307 /* Drop page_table_lock as buddy allocator may be called */
2308 spin_unlock(&mm->page_table_lock);
1962 new_page = alloc_huge_page(vma, address, outside_reserve); 2309 new_page = alloc_huge_page(vma, address, outside_reserve);
1963 2310
1964 if (IS_ERR(new_page)) { 2311 if (IS_ERR(new_page)) {
@@ -1976,19 +2323,25 @@ retry_avoidcopy:
1976 if (unmap_ref_private(mm, vma, old_page, address)) { 2323 if (unmap_ref_private(mm, vma, old_page, address)) {
1977 BUG_ON(page_count(old_page) != 1); 2324 BUG_ON(page_count(old_page) != 1);
1978 BUG_ON(huge_pte_none(pte)); 2325 BUG_ON(huge_pte_none(pte));
2326 spin_lock(&mm->page_table_lock);
1979 goto retry_avoidcopy; 2327 goto retry_avoidcopy;
1980 } 2328 }
1981 WARN_ON_ONCE(1); 2329 WARN_ON_ONCE(1);
1982 } 2330 }
1983 2331
2332 /* Caller expects lock to be held */
2333 spin_lock(&mm->page_table_lock);
1984 return -PTR_ERR(new_page); 2334 return -PTR_ERR(new_page);
1985 } 2335 }
1986 2336
1987 spin_unlock(&mm->page_table_lock);
1988 copy_huge_page(new_page, old_page, address, vma); 2337 copy_huge_page(new_page, old_page, address, vma);
1989 __SetPageUptodate(new_page); 2338 __SetPageUptodate(new_page);
1990 spin_lock(&mm->page_table_lock);
1991 2339
2340 /*
2341 * Retake the page_table_lock to check for racing updates
2342 * before the page tables are altered
2343 */
2344 spin_lock(&mm->page_table_lock);
1992 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2345 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
1993 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2346 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
1994 /* Break COW */ 2347 /* Break COW */
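
The hunk drops mm->page_table_lock before calling into the allocator (which may sleep) and retakes it before touching the page tables, re-checking pte_same() so the COW break only happens if nothing raced in while the lock was down; the error paths retake the lock too, because the caller expects it held. A minimal userspace sketch of that drop/relock/revalidate pattern (the pthread mutex and the names update_if_unchanged/slot are invented):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static long slot;                 /* stands in for the huge PTE */

    /*
     * Drop the lock around work that may block, retake it, and revalidate
     * before touching shared state -- the same shape as the hunk above.
     */
    static int update_if_unchanged(long expected, long newval)
    {
        pthread_mutex_unlock(&lock);          /* the "allocation" may sleep */
        void *scratch = malloc(64);           /* stands in for alloc_huge_page() */
        pthread_mutex_lock(&lock);            /* caller expects the lock held */

        if (!scratch)
            return -1;                        /* error path: lock is still held */
        free(scratch);

        if (slot != expected)                 /* a racer changed it while unlocked */
            return 0;
        slot = newval;                        /* safe: revalidated under the lock */
        return 1;
    }

    int main(void)
    {
        int ret;

        pthread_mutex_lock(&lock);
        slot = 41;
        ret = update_if_unchanged(41, 42);
        printf("updated=%d slot=%ld\n", ret, slot);
        pthread_mutex_unlock(&lock);
        return 0;
    }
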
@@ -2095,8 +2448,10 @@ retry:
2095 spin_lock(&inode->i_lock); 2448 spin_lock(&inode->i_lock);
2096 inode->i_blocks += blocks_per_huge_page(h); 2449 inode->i_blocks += blocks_per_huge_page(h);
2097 spin_unlock(&inode->i_lock); 2450 spin_unlock(&inode->i_lock);
2098 } else 2451 } else {
2099 lock_page(page); 2452 lock_page(page);
2453 page->mapping = HUGETLB_POISON;
2454 }
2100 } 2455 }
2101 2456
2102 /* 2457 /*
@@ -2206,7 +2561,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2206 entry = pte_mkyoung(entry); 2561 entry = pte_mkyoung(entry);
2207 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 2562 if (huge_ptep_set_access_flags(vma, address, ptep, entry,
2208 flags & FAULT_FLAG_WRITE)) 2563 flags & FAULT_FLAG_WRITE))
2209 update_mmu_cache(vma, address, entry); 2564 update_mmu_cache(vma, address, ptep);
2210 2565
2211out_page_table_lock: 2566out_page_table_lock:
2212 spin_unlock(&mm->page_table_lock); 2567 spin_unlock(&mm->page_table_lock);
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index e1d85137f086..10ea71905c1f 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -3,18 +3,68 @@
3#include <linux/debugfs.h> 3#include <linux/debugfs.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/swap.h>
7#include <linux/pagemap.h>
8#include "internal.h"
6 9
7static struct dentry *hwpoison_dir, *corrupt_pfn; 10static struct dentry *hwpoison_dir;
8 11
9static int hwpoison_inject(void *data, u64 val) 12static int hwpoison_inject(void *data, u64 val)
10{ 13{
14 unsigned long pfn = val;
15 struct page *p;
16 int err;
17
18 if (!capable(CAP_SYS_ADMIN))
19 return -EPERM;
20
21 if (!hwpoison_filter_enable)
22 goto inject;
23 if (!pfn_valid(pfn))
24 return -ENXIO;
25
26 p = pfn_to_page(pfn);
27 /*
28 * This implies unable to support free buddy pages.
29 */
30 if (!get_page_unless_zero(p))
31 return 0;
32
33 if (!PageLRU(p))
34 shake_page(p, 0);
35 /*
36 * This implies we cannot support non-LRU pages.
37 */
38 if (!PageLRU(p))
39 return 0;
40
41 /*
42 * Do a racy check with an elevated page count, to make sure PG_hwpoison
43 * will only be set for the targeted owner (or on a free page).
44 * We temporarily take the page lock for try_get_mem_cgroup_from_page().
45 * __memory_failure() will redo the check reliably inside the page lock.
46 */
47 lock_page(p);
48 err = hwpoison_filter(p);
49 unlock_page(p);
50 if (err)
51 return 0;
52
53inject:
54 printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
55 return __memory_failure(pfn, 18, MF_COUNT_INCREASED);
56}
57
58static int hwpoison_unpoison(void *data, u64 val)
59{
11 if (!capable(CAP_SYS_ADMIN)) 60 if (!capable(CAP_SYS_ADMIN))
12 return -EPERM; 61 return -EPERM;
13 printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); 62
14 return __memory_failure(val, 18, 0); 63 return unpoison_memory(val);
15} 64}
16 65
17DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); 66DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
67DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
18 68
19static void pfn_inject_exit(void) 69static void pfn_inject_exit(void)
20{ 70{
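
Before injecting, the new code walks a set of debugfs-controlled filters; judging by the knob names added below (corrupt-filter-flags-mask / corrupt-filter-flags-value), the flags filter is a mask/value match on the page's flags, though hwpoison_filter() itself is not shown in this hunk. A small userspace sketch of that mask/value idea, with invented flag bits standing in for page flags:

    #include <stdio.h>
    #include <stdint.h>

    /* invented flag bits, standing in for page flags */
    #define PG_LRU      (1ULL << 0)
    #define PG_DIRTY    (1ULL << 1)
    #define PG_SLAB     (1ULL << 2)

    static uint64_t filter_flags_mask;    /* corrupt-filter-flags-mask  */
    static uint64_t filter_flags_value;   /* corrupt-filter-flags-value */

    /* 0 = inject, nonzero = filtered out (mirrors only the mask/value idea) */
    static int flags_filter(uint64_t page_flags)
    {
        if (!filter_flags_mask)
            return 0;                     /* filter disabled */
        return (page_flags & filter_flags_mask) != filter_flags_value;
    }

    int main(void)
    {
        filter_flags_mask  = PG_LRU | PG_SLAB;   /* only clean LRU, non-slab pages */
        filter_flags_value = PG_LRU;

        printf("LRU+dirty page: %s\n",
               flags_filter(PG_LRU | PG_DIRTY) ? "skipped" : "injected");
        printf("slab page:      %s\n",
               flags_filter(PG_SLAB) ? "skipped" : "injected");
        return 0;
    }

With mask = LRU|SLAB and value = LRU, only non-slab LRU pages pass this particular filter and would go on to __memory_failure().
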
@@ -24,16 +74,63 @@ static void pfn_inject_exit(void)
24 74
25static int pfn_inject_init(void) 75static int pfn_inject_init(void)
26{ 76{
77 struct dentry *dentry;
78
27 hwpoison_dir = debugfs_create_dir("hwpoison", NULL); 79 hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
28 if (hwpoison_dir == NULL) 80 if (hwpoison_dir == NULL)
29 return -ENOMEM; 81 return -ENOMEM;
30 corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, 82
83 /*
84 * Note that the below poison/unpoison interfaces do not involve
85 * hardware status change, hence do not require hardware support.
86 * They are mainly for testing hwpoison in software level.
87 */
88 dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
31 NULL, &hwpoison_fops); 89 NULL, &hwpoison_fops);
32 if (corrupt_pfn == NULL) { 90 if (!dentry)
33 pfn_inject_exit(); 91 goto fail;
34 return -ENOMEM; 92
35 } 93 dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir,
94 NULL, &unpoison_fops);
95 if (!dentry)
96 goto fail;
97
98 dentry = debugfs_create_u32("corrupt-filter-enable", 0600,
99 hwpoison_dir, &hwpoison_filter_enable);
100 if (!dentry)
101 goto fail;
102
103 dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600,
104 hwpoison_dir, &hwpoison_filter_dev_major);
105 if (!dentry)
106 goto fail;
107
108 dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600,
109 hwpoison_dir, &hwpoison_filter_dev_minor);
110 if (!dentry)
111 goto fail;
112
113 dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600,
114 hwpoison_dir, &hwpoison_filter_flags_mask);
115 if (!dentry)
116 goto fail;
117
118 dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600,
119 hwpoison_dir, &hwpoison_filter_flags_value);
120 if (!dentry)
121 goto fail;
122
123#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
124 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
125 hwpoison_dir, &hwpoison_filter_memcg);
126 if (!dentry)
127 goto fail;
128#endif
129
36 return 0; 130 return 0;
131fail:
132 pfn_inject_exit();
133 return -ENOMEM;
37} 134}
38 135
39module_init(pfn_inject_init); 136module_init(pfn_inject_init);
diff --git a/mm/internal.h b/mm/internal.h
index 22ec8d2b0fb8..6a697bb97fc5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -50,6 +50,9 @@ extern void putback_lru_page(struct page *page);
50 */ 50 */
51extern void __free_pages_bootmem(struct page *page, unsigned int order); 51extern void __free_pages_bootmem(struct page *page, unsigned int order);
52extern void prep_compound_page(struct page *page, unsigned long order); 52extern void prep_compound_page(struct page *page, unsigned long order);
53#ifdef CONFIG_MEMORY_FAILURE
54extern bool is_free_buddy_page(struct page *page);
55#endif
53 56
54 57
55/* 58/*
@@ -63,7 +66,7 @@ static inline unsigned long page_order(struct page *page)
63 return page_private(page); 66 return page_private(page);
64} 67}
65 68
66#ifdef CONFIG_HAVE_MLOCK 69#ifdef CONFIG_MMU
67extern long mlock_vma_pages_range(struct vm_area_struct *vma, 70extern long mlock_vma_pages_range(struct vm_area_struct *vma,
68 unsigned long start, unsigned long end); 71 unsigned long start, unsigned long end);
69extern void munlock_vma_pages_range(struct vm_area_struct *vma, 72extern void munlock_vma_pages_range(struct vm_area_struct *vma,
@@ -72,22 +75,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
72{ 75{
73 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); 76 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
74} 77}
75#endif
76 78
77/* 79/*
78 * unevictable_migrate_page() called only from migrate_page_copy() to
79 * migrate unevictable flag to new page.
80 * Note that the old page has been isolated from the LRU lists at this
81 * point so we don't need to worry about LRU statistics.
82 */
83static inline void unevictable_migrate_page(struct page *new, struct page *old)
84{
85 if (TestClearPageUnevictable(old))
86 SetPageUnevictable(new);
87}
88
89#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
90/*
91 * Called only in fault path via page_evictable() for a new page 80 * Called only in fault path via page_evictable() for a new page
92 * to determine if it's being mapped into a LOCKED vma. 81 * to determine if it's being mapped into a LOCKED vma.
93 * If so, mark page as mlocked. 82 * If so, mark page as mlocked.
@@ -107,9 +96,10 @@ static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
107} 96}
108 97
109/* 98/*
110 * must be called with vma's mmap_sem held for read, and page locked. 99 * must be called with vma's mmap_sem held for read or write, and page locked.
111 */ 100 */
112extern void mlock_vma_page(struct page *page); 101extern void mlock_vma_page(struct page *page);
102extern void munlock_vma_page(struct page *page);
113 103
114/* 104/*
115 * Clear the page's PageMlocked(). This can be useful in a situation where 105 * Clear the page's PageMlocked(). This can be useful in a situation where
@@ -144,7 +134,7 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
144 } 134 }
145} 135}
146 136
147#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ 137#else /* !CONFIG_MMU */
148static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 138static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
149{ 139{
150 return 0; 140 return 0;
@@ -153,7 +143,7 @@ static inline void clear_page_mlock(struct page *page) { }
153static inline void mlock_vma_page(struct page *page) { } 143static inline void mlock_vma_page(struct page *page) { }
154static inline void mlock_migrate_page(struct page *new, struct page *old) { } 144static inline void mlock_migrate_page(struct page *new, struct page *old) { }
155 145
156#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ 146#endif /* !CONFIG_MMU */
157 147
158/* 148/*
159 * Return the mem_map entry representing the 'offset' subpage within 149 * Return the mem_map entry representing the 'offset' subpage within
@@ -260,3 +250,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
260#define ZONE_RECLAIM_SOME 0 250#define ZONE_RECLAIM_SOME 0
261#define ZONE_RECLAIM_SUCCESS 1 251#define ZONE_RECLAIM_SUCCESS 1
262#endif 252#endif
253
254extern int hwpoison_filter(struct page *p);
255
256extern u32 hwpoison_filter_dev_major;
257extern u32 hwpoison_filter_dev_minor;
258extern u64 hwpoison_filter_flags_mask;
259extern u64 hwpoison_filter_flags_value;
260extern u64 hwpoison_filter_memcg;
261extern u32 hwpoison_filter_enable;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 8bf765c4f58d..2c0d032ac898 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -72,7 +72,6 @@
72#include <linux/module.h> 72#include <linux/module.h>
73#include <linux/kthread.h> 73#include <linux/kthread.h>
74#include <linux/prio_tree.h> 74#include <linux/prio_tree.h>
75#include <linux/gfp.h>
76#include <linux/fs.h> 75#include <linux/fs.h>
77#include <linux/debugfs.h> 76#include <linux/debugfs.h>
78#include <linux/seq_file.h> 77#include <linux/seq_file.h>
@@ -93,6 +92,7 @@
93#include <linux/nodemask.h> 92#include <linux/nodemask.h>
94#include <linux/mm.h> 93#include <linux/mm.h>
95#include <linux/workqueue.h> 94#include <linux/workqueue.h>
95#include <linux/crc32.h>
96 96
97#include <asm/sections.h> 97#include <asm/sections.h>
98#include <asm/processor.h> 98#include <asm/processor.h>
@@ -108,7 +108,6 @@
108#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ 108#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
109#define SECS_FIRST_SCAN 60 /* delay before the first scan */ 109#define SECS_FIRST_SCAN 60 /* delay before the first scan */
110#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ 110#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
111#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */
112#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ 111#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
113 112
114#define BYTES_PER_POINTER sizeof(void *) 113#define BYTES_PER_POINTER sizeof(void *)
@@ -119,8 +118,8 @@
119/* scanning area inside a memory block */ 118/* scanning area inside a memory block */
120struct kmemleak_scan_area { 119struct kmemleak_scan_area {
121 struct hlist_node node; 120 struct hlist_node node;
122 unsigned long offset; 121 unsigned long start;
123 size_t length; 122 size_t size;
124}; 123};
125 124
126#define KMEMLEAK_GREY 0 125#define KMEMLEAK_GREY 0
@@ -149,6 +148,8 @@ struct kmemleak_object {
149 int min_count; 148 int min_count;
150 /* the total number of pointers found pointing to this object */ 149 /* the total number of pointers found pointing to this object */
151 int count; 150 int count;
151 /* checksum for detecting modified objects */
152 u32 checksum;
152 /* memory ranges to be scanned inside an object (empty for all) */ 153 /* memory ranges to be scanned inside an object (empty for all) */
153 struct hlist_head area_list; 154 struct hlist_head area_list;
154 unsigned long trace[MAX_TRACE]; 155 unsigned long trace[MAX_TRACE];
@@ -164,8 +165,6 @@ struct kmemleak_object {
164#define OBJECT_REPORTED (1 << 1) 165#define OBJECT_REPORTED (1 << 1)
165/* flag set to not scan the object */ 166/* flag set to not scan the object */
166#define OBJECT_NO_SCAN (1 << 2) 167#define OBJECT_NO_SCAN (1 << 2)
167/* flag set on newly allocated objects */
168#define OBJECT_NEW (1 << 3)
169 168
170/* number of bytes to print per line; must be 16 or 32 */ 169/* number of bytes to print per line; must be 16 or 32 */
171#define HEX_ROW_SIZE 16 170#define HEX_ROW_SIZE 16
@@ -241,8 +240,6 @@ struct early_log {
241 const void *ptr; /* allocated/freed memory block */ 240 const void *ptr; /* allocated/freed memory block */
242 size_t size; /* memory block size */ 241 size_t size; /* memory block size */
243 int min_count; /* minimum reference count */ 242 int min_count; /* minimum reference count */
244 unsigned long offset; /* scan area offset */
245 size_t length; /* scan area length */
246 unsigned long trace[MAX_TRACE]; /* stack trace */ 243 unsigned long trace[MAX_TRACE]; /* stack trace */
247 unsigned int trace_len; /* stack trace length */ 244 unsigned int trace_len; /* stack trace length */
248}; 245};
@@ -323,11 +320,6 @@ static bool color_gray(const struct kmemleak_object *object)
323 object->count >= object->min_count; 320 object->count >= object->min_count;
324} 321}
325 322
326static bool color_black(const struct kmemleak_object *object)
327{
328 return object->min_count == KMEMLEAK_BLACK;
329}
330
331/* 323/*
332 * Objects are considered unreferenced only if their color is white, they have 324 * Objects are considered unreferenced only if their color is white, they have
333 * not be deleted and have a minimum age to avoid false positives caused by 325 * not be deleted and have a minimum age to avoid false positives caused by
@@ -335,7 +327,7 @@ static bool color_black(const struct kmemleak_object *object)
335 */ 327 */
336static bool unreferenced_object(struct kmemleak_object *object) 328static bool unreferenced_object(struct kmemleak_object *object)
337{ 329{
338 return (object->flags & OBJECT_ALLOCATED) && color_white(object) && 330 return (color_white(object) && object->flags & OBJECT_ALLOCATED) &&
339 time_before_eq(object->jiffies + jiffies_min_age, 331 time_before_eq(object->jiffies + jiffies_min_age,
340 jiffies_last_scan); 332 jiffies_last_scan);
341} 333}
@@ -348,11 +340,13 @@ static void print_unreferenced(struct seq_file *seq,
348 struct kmemleak_object *object) 340 struct kmemleak_object *object)
349{ 341{
350 int i; 342 int i;
343 unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);
351 344
352 seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", 345 seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
353 object->pointer, object->size); 346 object->pointer, object->size);
354 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", 347 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
355 object->comm, object->pid, object->jiffies); 348 object->comm, object->pid, object->jiffies,
349 msecs_age / 1000, msecs_age % 1000);
356 hex_dump_object(seq, object); 350 hex_dump_object(seq, object);
357 seq_printf(seq, " backtrace:\n"); 351 seq_printf(seq, " backtrace:\n");
358 352
@@ -381,6 +375,7 @@ static void dump_object_info(struct kmemleak_object *object)
381 pr_notice(" min_count = %d\n", object->min_count); 375 pr_notice(" min_count = %d\n", object->min_count);
382 pr_notice(" count = %d\n", object->count); 376 pr_notice(" count = %d\n", object->count);
383 pr_notice(" flags = 0x%lx\n", object->flags); 377 pr_notice(" flags = 0x%lx\n", object->flags);
378 pr_notice(" checksum = %d\n", object->checksum);
384 pr_notice(" backtrace:\n"); 379 pr_notice(" backtrace:\n");
385 print_stack_trace(&trace, 4); 380 print_stack_trace(&trace, 4);
386} 381}
@@ -522,12 +517,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
522 INIT_HLIST_HEAD(&object->area_list); 517 INIT_HLIST_HEAD(&object->area_list);
523 spin_lock_init(&object->lock); 518 spin_lock_init(&object->lock);
524 atomic_set(&object->use_count, 1); 519 atomic_set(&object->use_count, 1);
525 object->flags = OBJECT_ALLOCATED | OBJECT_NEW; 520 object->flags = OBJECT_ALLOCATED;
526 object->pointer = ptr; 521 object->pointer = ptr;
527 object->size = size; 522 object->size = size;
528 object->min_count = min_count; 523 object->min_count = min_count;
529 object->count = -1; /* no color initially */ 524 object->count = 0; /* white color initially */
530 object->jiffies = jiffies; 525 object->jiffies = jiffies;
526 object->checksum = 0;
531 527
532 /* task information */ 528 /* task information */
533 if (in_irq()) { 529 if (in_irq()) {
@@ -720,14 +716,13 @@ static void make_black_object(unsigned long ptr)
720 * Add a scanning area to the object. If at least one such area is added, 716 * Add a scanning area to the object. If at least one such area is added,
721 * kmemleak will only scan these ranges rather than the whole memory block. 717 * kmemleak will only scan these ranges rather than the whole memory block.
722 */ 718 */
723static void add_scan_area(unsigned long ptr, unsigned long offset, 719static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
724 size_t length, gfp_t gfp)
725{ 720{
726 unsigned long flags; 721 unsigned long flags;
727 struct kmemleak_object *object; 722 struct kmemleak_object *object;
728 struct kmemleak_scan_area *area; 723 struct kmemleak_scan_area *area;
729 724
730 object = find_and_get_object(ptr, 0); 725 object = find_and_get_object(ptr, 1);
731 if (!object) { 726 if (!object) {
732 kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", 727 kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n",
733 ptr); 728 ptr);
@@ -741,7 +736,7 @@ static void add_scan_area(unsigned long ptr, unsigned long offset,
741 } 736 }
742 737
743 spin_lock_irqsave(&object->lock, flags); 738 spin_lock_irqsave(&object->lock, flags);
744 if (offset + length > object->size) { 739 if (ptr + size > object->pointer + object->size) {
745 kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); 740 kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
746 dump_object_info(object); 741 dump_object_info(object);
747 kmem_cache_free(scan_area_cache, area); 742 kmem_cache_free(scan_area_cache, area);
@@ -749,8 +744,8 @@ static void add_scan_area(unsigned long ptr, unsigned long offset,
749 } 744 }
750 745
751 INIT_HLIST_NODE(&area->node); 746 INIT_HLIST_NODE(&area->node);
752 area->offset = offset; 747 area->start = ptr;
753 area->length = length; 748 area->size = size;
754 749
755 hlist_add_head(&area->node, &object->area_list); 750 hlist_add_head(&area->node, &object->area_list);
756out_unlock: 751out_unlock:
@@ -786,7 +781,7 @@ static void object_no_scan(unsigned long ptr)
786 * processed later once kmemleak is fully initialized. 781 * processed later once kmemleak is fully initialized.
787 */ 782 */
788static void __init log_early(int op_type, const void *ptr, size_t size, 783static void __init log_early(int op_type, const void *ptr, size_t size,
789 int min_count, unsigned long offset, size_t length) 784 int min_count)
790{ 785{
791 unsigned long flags; 786 unsigned long flags;
792 struct early_log *log; 787 struct early_log *log;
@@ -808,8 +803,6 @@ static void __init log_early(int op_type, const void *ptr, size_t size,
808 log->ptr = ptr; 803 log->ptr = ptr;
809 log->size = size; 804 log->size = size;
810 log->min_count = min_count; 805 log->min_count = min_count;
811 log->offset = offset;
812 log->length = length;
813 if (op_type == KMEMLEAK_ALLOC) 806 if (op_type == KMEMLEAK_ALLOC)
814 log->trace_len = __save_stack_trace(log->trace); 807 log->trace_len = __save_stack_trace(log->trace);
815 crt_early_log++; 808 crt_early_log++;
@@ -858,7 +851,7 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
858 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 851 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
859 create_object((unsigned long)ptr, size, min_count, gfp); 852 create_object((unsigned long)ptr, size, min_count, gfp);
860 else if (atomic_read(&kmemleak_early_log)) 853 else if (atomic_read(&kmemleak_early_log))
861 log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); 854 log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
862} 855}
863EXPORT_SYMBOL_GPL(kmemleak_alloc); 856EXPORT_SYMBOL_GPL(kmemleak_alloc);
864 857
@@ -873,7 +866,7 @@ void __ref kmemleak_free(const void *ptr)
873 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 866 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
874 delete_object_full((unsigned long)ptr); 867 delete_object_full((unsigned long)ptr);
875 else if (atomic_read(&kmemleak_early_log)) 868 else if (atomic_read(&kmemleak_early_log))
876 log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); 869 log_early(KMEMLEAK_FREE, ptr, 0, 0);
877} 870}
878EXPORT_SYMBOL_GPL(kmemleak_free); 871EXPORT_SYMBOL_GPL(kmemleak_free);
879 872
@@ -888,7 +881,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
888 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 881 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
889 delete_object_part((unsigned long)ptr, size); 882 delete_object_part((unsigned long)ptr, size);
890 else if (atomic_read(&kmemleak_early_log)) 883 else if (atomic_read(&kmemleak_early_log))
891 log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); 884 log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
892} 885}
893EXPORT_SYMBOL_GPL(kmemleak_free_part); 886EXPORT_SYMBOL_GPL(kmemleak_free_part);
894 887
@@ -903,7 +896,7 @@ void __ref kmemleak_not_leak(const void *ptr)
903 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 896 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
904 make_gray_object((unsigned long)ptr); 897 make_gray_object((unsigned long)ptr);
905 else if (atomic_read(&kmemleak_early_log)) 898 else if (atomic_read(&kmemleak_early_log))
906 log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); 899 log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
907} 900}
908EXPORT_SYMBOL(kmemleak_not_leak); 901EXPORT_SYMBOL(kmemleak_not_leak);
909 902
@@ -919,22 +912,21 @@ void __ref kmemleak_ignore(const void *ptr)
919 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 912 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
920 make_black_object((unsigned long)ptr); 913 make_black_object((unsigned long)ptr);
921 else if (atomic_read(&kmemleak_early_log)) 914 else if (atomic_read(&kmemleak_early_log))
922 log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); 915 log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
923} 916}
924EXPORT_SYMBOL(kmemleak_ignore); 917EXPORT_SYMBOL(kmemleak_ignore);
925 918
926/* 919/*
927 * Limit the range to be scanned in an allocated memory block. 920 * Limit the range to be scanned in an allocated memory block.
928 */ 921 */
929void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, 922void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
930 size_t length, gfp_t gfp)
931{ 923{
932 pr_debug("%s(0x%p)\n", __func__, ptr); 924 pr_debug("%s(0x%p)\n", __func__, ptr);
933 925
934 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 926 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
935 add_scan_area((unsigned long)ptr, offset, length, gfp); 927 add_scan_area((unsigned long)ptr, size, gfp);
936 else if (atomic_read(&kmemleak_early_log)) 928 else if (atomic_read(&kmemleak_early_log))
937 log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); 929 log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
938} 930}
939EXPORT_SYMBOL(kmemleak_scan_area); 931EXPORT_SYMBOL(kmemleak_scan_area);
940 932
@@ -948,11 +940,25 @@ void __ref kmemleak_no_scan(const void *ptr)
948 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 940 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
949 object_no_scan((unsigned long)ptr); 941 object_no_scan((unsigned long)ptr);
950 else if (atomic_read(&kmemleak_early_log)) 942 else if (atomic_read(&kmemleak_early_log))
951 log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); 943 log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
952} 944}
953EXPORT_SYMBOL(kmemleak_no_scan); 945EXPORT_SYMBOL(kmemleak_no_scan);
954 946
955/* 947/*
948 * Update an object's checksum and return true if it was modified.
949 */
950static bool update_checksum(struct kmemleak_object *object)
951{
952 u32 old_csum = object->checksum;
953
954 if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
955 return false;
956
957 object->checksum = crc32(0, (void *)object->pointer, object->size);
958 return object->checksum != old_csum;
959}
960
961/*
956 * Memory scanning is a long process and it needs to be interruptible. This 962 * Memory scanning is a long process and it needs to be interruptible. This
957 * function checks whether such interrupt condition occurred. 963 * function checks whether such interrupt condition occurred.
958 */ 964 */
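
The update_checksum() helper added above records a CRC32 of the tracked object and reports whether its contents changed since the previous pass; kmemleak_scan() later uses that to re-grey white objects that were modified between scans (the kernel version also skips objects kmemcheck reports as uninitialized, which is omitted here). A minimal userspace sketch of the same change-detection idea, assuming zlib's crc32() in place of the kernel's lib/crc32; struct tracked and the variable names are illustrative, not kmemleak's:

/* Sketch only: detect modification of a tracked buffer via CRC32.
 * Assumes zlib (compile with -lz); mirrors the idea of update_checksum(),
 * not kmemleak's actual object handling. */
#include <stdio.h>
#include <zlib.h>

struct tracked {
    unsigned char *ptr;
    size_t size;
    uLong checksum;         /* CRC of the contents at the last scan */
};

/* Recompute the CRC; return 1 if the contents changed since last time. */
static int update_checksum(struct tracked *obj)
{
    uLong old = obj->checksum;

    obj->checksum = crc32(0L, obj->ptr, obj->size);
    return obj->checksum != old;
}

int main(void)
{
    unsigned char buf[32] = "pretend this is a heap object";
    struct tracked obj = { buf, sizeof(buf), 0 };

    update_checksum(&obj);                            /* record baseline */
    printf("changed: %d\n", update_checksum(&obj));   /* 0: untouched    */
    buf[0] = 'P';                                     /* store into the object */
    printf("changed: %d\n", update_checksum(&obj));   /* 1: modified     */
    return 0;
}
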
@@ -1031,11 +1037,14 @@ static void scan_block(void *_start, void *_end,
1031 * added to the gray_list. 1037 * added to the gray_list.
1032 */ 1038 */
1033 object->count++; 1039 object->count++;
1034 if (color_gray(object)) 1040 if (color_gray(object)) {
1035 list_add_tail(&object->gray_list, &gray_list); 1041 list_add_tail(&object->gray_list, &gray_list);
1036 else 1042 spin_unlock_irqrestore(&object->lock, flags);
1037 put_object(object); 1043 continue;
1044 }
1045
1038 spin_unlock_irqrestore(&object->lock, flags); 1046 spin_unlock_irqrestore(&object->lock, flags);
1047 put_object(object);
1039 } 1048 }
1040} 1049}
1041 1050
@@ -1050,8 +1059,8 @@ static void scan_object(struct kmemleak_object *object)
1050 unsigned long flags; 1059 unsigned long flags;
1051 1060
1052 /* 1061 /*
1053 * Once the object->lock is aquired, the corresponding memory block 1062 * Once the object->lock is acquired, the corresponding memory block
1054 * cannot be freed (the same lock is aquired in delete_object). 1063 * cannot be freed (the same lock is acquired in delete_object).
1055 */ 1064 */
1056 spin_lock_irqsave(&object->lock, flags); 1065 spin_lock_irqsave(&object->lock, flags);
1057 if (object->flags & OBJECT_NO_SCAN) 1066 if (object->flags & OBJECT_NO_SCAN)
@@ -1075,14 +1084,47 @@ static void scan_object(struct kmemleak_object *object)
1075 } 1084 }
1076 } else 1085 } else
1077 hlist_for_each_entry(area, elem, &object->area_list, node) 1086 hlist_for_each_entry(area, elem, &object->area_list, node)
1078 scan_block((void *)(object->pointer + area->offset), 1087 scan_block((void *)area->start,
1079 (void *)(object->pointer + area->offset 1088 (void *)(area->start + area->size),
1080 + area->length), object, 0); 1089 object, 0);
1081out: 1090out:
1082 spin_unlock_irqrestore(&object->lock, flags); 1091 spin_unlock_irqrestore(&object->lock, flags);
1083} 1092}
1084 1093
1085/* 1094/*
1095 * Scan the objects already referenced (gray objects). More objects will be
1096 * referenced and, if there are no memory leaks, all the objects are scanned.
1097 */
1098static void scan_gray_list(void)
1099{
1100 struct kmemleak_object *object, *tmp;
1101
1102 /*
1103 * The list traversal is safe for both tail additions and removals
1104 * from inside the loop. The kmemleak objects cannot be freed from
1105 * outside the loop because their use_count was incremented.
1106 */
1107 object = list_entry(gray_list.next, typeof(*object), gray_list);
1108 while (&object->gray_list != &gray_list) {
1109 cond_resched();
1110
1111 /* may add new objects to the list */
1112 if (!scan_should_stop())
1113 scan_object(object);
1114
1115 tmp = list_entry(object->gray_list.next, typeof(*object),
1116 gray_list);
1117
1118 /* remove the object from the list and release it */
1119 list_del(&object->gray_list);
1120 put_object(object);
1121
1122 object = tmp;
1123 }
1124 WARN_ON(!list_empty(&gray_list));
1125}
1126
1127/*
1086 * Scan data sections and all the referenced memory blocks allocated via the 1128 * Scan data sections and all the referenced memory blocks allocated via the
1087 * kernel's standard allocators. This function must be called with the 1129 * kernel's standard allocators. This function must be called with the
1088 * scan_mutex held. 1130 * scan_mutex held.
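
scan_gray_list(), added above, factors the gray-list walk out of kmemleak_scan(). The property it relies on is that the work list may grow at its tail while being walked, because scanning one object can turn further objects gray. A rough standalone sketch of that worklist pattern follows, with a plain singly-linked queue standing in for the kernel's list_head and free() standing in for put_object(); all names here are illustrative:

/* Sketch: process a work list that may grow at the tail while we walk it. */
#include <stdio.h>
#include <stdlib.h>

struct item {
    int id;
    struct item *next;
};

static struct item *head;
static struct item **tail = &head;

static void enqueue(int id)
{
    struct item *it = malloc(sizeof(*it));

    it->id = id;
    it->next = NULL;
    *tail = it;
    tail = &it->next;
}

/* "Scanning" an item may discover and enqueue more work. */
static void scan_item(struct item *it)
{
    printf("scanning %d\n", it->id);
    if (it->id < 3)
        enqueue(it->id + 10);   /* newly greyed work joins the tail */
}

int main(void)
{
    struct item *it, *next;

    enqueue(1);
    enqueue(2);
    enqueue(3);

    /* Reading it->next only after scan_item() returns makes the walk safe
     * against items being appended to the tail during the scan. */
    for (it = head; it; it = next) {
        scan_item(it);
        next = it->next;
        free(it);               /* analogous to put_object() on the gray list */
    }
    return 0;
}
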
@@ -1090,10 +1132,9 @@ out:
1090static void kmemleak_scan(void) 1132static void kmemleak_scan(void)
1091{ 1133{
1092 unsigned long flags; 1134 unsigned long flags;
1093 struct kmemleak_object *object, *tmp; 1135 struct kmemleak_object *object;
1094 int i; 1136 int i;
1095 int new_leaks = 0; 1137 int new_leaks = 0;
1096 int gray_list_pass = 0;
1097 1138
1098 jiffies_last_scan = jiffies; 1139 jiffies_last_scan = jiffies;
1099 1140
@@ -1114,7 +1155,6 @@ static void kmemleak_scan(void)
1114#endif 1155#endif
1115 /* reset the reference count (whiten the object) */ 1156 /* reset the reference count (whiten the object) */
1116 object->count = 0; 1157 object->count = 0;
1117 object->flags &= ~OBJECT_NEW;
1118 if (color_gray(object) && get_object(object)) 1158 if (color_gray(object) && get_object(object))
1119 list_add_tail(&object->gray_list, &gray_list); 1159 list_add_tail(&object->gray_list, &gray_list);
1120 1160
@@ -1172,62 +1212,36 @@ static void kmemleak_scan(void)
1172 1212
1173 /* 1213 /*
1174 * Scan the objects already referenced from the sections scanned 1214 * Scan the objects already referenced from the sections scanned
1175 * above. More objects will be referenced and, if there are no memory 1215 * above.
1176 * leaks, all the objects will be scanned. The list traversal is safe
1177 * for both tail additions and removals from inside the loop. The
1178 * kmemleak objects cannot be freed from outside the loop because their
1179 * use_count was increased.
1180 */ 1216 */
1181repeat: 1217 scan_gray_list();
1182 object = list_entry(gray_list.next, typeof(*object), gray_list);
1183 while (&object->gray_list != &gray_list) {
1184 cond_resched();
1185
1186 /* may add new objects to the list */
1187 if (!scan_should_stop())
1188 scan_object(object);
1189
1190 tmp = list_entry(object->gray_list.next, typeof(*object),
1191 gray_list);
1192
1193 /* remove the object from the list and release it */
1194 list_del(&object->gray_list);
1195 put_object(object);
1196
1197 object = tmp;
1198 }
1199
1200 if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES)
1201 goto scan_end;
1202 1218
1203 /* 1219 /*
1204 * Check for new objects allocated during this scanning and add them 1220 * Check for new or unreferenced objects modified since the previous
1205 * to the gray list. 1221 * scan and color them gray until the next scan.
1206 */ 1222 */
1207 rcu_read_lock(); 1223 rcu_read_lock();
1208 list_for_each_entry_rcu(object, &object_list, object_list) { 1224 list_for_each_entry_rcu(object, &object_list, object_list) {
1209 spin_lock_irqsave(&object->lock, flags); 1225 spin_lock_irqsave(&object->lock, flags);
1210 if ((object->flags & OBJECT_NEW) && !color_black(object) && 1226 if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
1211 get_object(object)) { 1227 && update_checksum(object) && get_object(object)) {
1212 object->flags &= ~OBJECT_NEW; 1228 /* color it gray temporarily */
1229 object->count = object->min_count;
1213 list_add_tail(&object->gray_list, &gray_list); 1230 list_add_tail(&object->gray_list, &gray_list);
1214 } 1231 }
1215 spin_unlock_irqrestore(&object->lock, flags); 1232 spin_unlock_irqrestore(&object->lock, flags);
1216 } 1233 }
1217 rcu_read_unlock(); 1234 rcu_read_unlock();
1218 1235
1219 if (!list_empty(&gray_list)) 1236 /*
1220 goto repeat; 1237 * Re-scan the gray list for modified unreferenced objects.
1221 1238 */
1222scan_end: 1239 scan_gray_list();
1223 WARN_ON(!list_empty(&gray_list));
1224 1240
1225 /* 1241 /*
1226 * If scanning was stopped or new objects were being allocated at a 1242 * If scanning was stopped do not report any new unreferenced objects.
1227 * higher rate than gray list scanning, do not report any new
1228 * unreferenced objects.
1229 */ 1243 */
1230 if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) 1244 if (scan_should_stop())
1231 return; 1245 return;
1232 1246
1233 /* 1247 /*
@@ -1642,8 +1656,7 @@ void __init kmemleak_init(void)
1642 kmemleak_ignore(log->ptr); 1656 kmemleak_ignore(log->ptr);
1643 break; 1657 break;
1644 case KMEMLEAK_SCAN_AREA: 1658 case KMEMLEAK_SCAN_AREA:
1645 kmemleak_scan_area(log->ptr, log->offset, log->length, 1659 kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
1646 GFP_KERNEL);
1647 break; 1660 break;
1648 case KMEMLEAK_NO_SCAN: 1661 case KMEMLEAK_NO_SCAN:
1649 kmemleak_no_scan(log->ptr); 1662 kmemleak_no_scan(log->ptr);
diff --git a/mm/ksm.c b/mm/ksm.c
index 5575f8628fef..956880f2ff49 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -29,11 +29,13 @@
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/rbtree.h> 31#include <linux/rbtree.h>
32#include <linux/memory.h>
32#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
33#include <linux/swap.h> 34#include <linux/swap.h>
34#include <linux/ksm.h> 35#include <linux/ksm.h>
35 36
36#include <asm/tlbflush.h> 37#include <asm/tlbflush.h>
38#include "internal.h"
37 39
38/* 40/*
39 * A few notes about the KSM scanning process, 41 * A few notes about the KSM scanning process,
@@ -79,13 +81,13 @@
79 * struct mm_slot - ksm information per mm that is being scanned 81 * struct mm_slot - ksm information per mm that is being scanned
80 * @link: link to the mm_slots hash list 82 * @link: link to the mm_slots hash list
81 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head 83 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
82 * @rmap_list: head for this mm_slot's list of rmap_items 84 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
83 * @mm: the mm that this information is valid for 85 * @mm: the mm that this information is valid for
84 */ 86 */
85struct mm_slot { 87struct mm_slot {
86 struct hlist_node link; 88 struct hlist_node link;
87 struct list_head mm_list; 89 struct list_head mm_list;
88 struct list_head rmap_list; 90 struct rmap_item *rmap_list;
89 struct mm_struct *mm; 91 struct mm_struct *mm;
90}; 92};
91 93
@@ -93,7 +95,7 @@ struct mm_slot {
93 * struct ksm_scan - cursor for scanning 95 * struct ksm_scan - cursor for scanning
94 * @mm_slot: the current mm_slot we are scanning 96 * @mm_slot: the current mm_slot we are scanning
95 * @address: the next address inside that to be scanned 97 * @address: the next address inside that to be scanned
96 * @rmap_item: the current rmap that we are scanning inside the rmap_list 98 * @rmap_list: link to the next rmap to be scanned in the rmap_list
97 * @seqnr: count of completed full scans (needed when removing unstable node) 99 * @seqnr: count of completed full scans (needed when removing unstable node)
98 * 100 *
99 * There is only the one ksm_scan instance of this cursor structure. 101 * There is only the one ksm_scan instance of this cursor structure.
@@ -101,37 +103,51 @@ struct mm_slot {
101struct ksm_scan { 103struct ksm_scan {
102 struct mm_slot *mm_slot; 104 struct mm_slot *mm_slot;
103 unsigned long address; 105 unsigned long address;
104 struct rmap_item *rmap_item; 106 struct rmap_item **rmap_list;
105 unsigned long seqnr; 107 unsigned long seqnr;
106}; 108};
107 109
108/** 110/**
111 * struct stable_node - node of the stable rbtree
112 * @node: rb node of this ksm page in the stable tree
113 * @hlist: hlist head of rmap_items using this ksm page
114 * @kpfn: page frame number of this ksm page
115 */
116struct stable_node {
117 struct rb_node node;
118 struct hlist_head hlist;
119 unsigned long kpfn;
120};
121
122/**
109 * struct rmap_item - reverse mapping item for virtual addresses 123 * struct rmap_item - reverse mapping item for virtual addresses
110 * @link: link into mm_slot's rmap_list (rmap_list is per mm) 124 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
125 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
111 * @mm: the memory structure this rmap_item is pointing into 126 * @mm: the memory structure this rmap_item is pointing into
112 * @address: the virtual address this rmap_item tracks (+ flags in low bits) 127 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
113 * @oldchecksum: previous checksum of the page at that virtual address 128 * @oldchecksum: previous checksum of the page at that virtual address
114 * @node: rb_node of this rmap_item in either unstable or stable tree 129 * @node: rb node of this rmap_item in the unstable tree
115 * @next: next rmap_item hanging off the same node of the stable tree 130 * @head: pointer to stable_node heading this list in the stable tree
116 * @prev: previous rmap_item hanging off the same node of the stable tree 131 * @hlist: link into hlist of rmap_items hanging off that stable_node
117 */ 132 */
118struct rmap_item { 133struct rmap_item {
119 struct list_head link; 134 struct rmap_item *rmap_list;
135 struct anon_vma *anon_vma; /* when stable */
120 struct mm_struct *mm; 136 struct mm_struct *mm;
121 unsigned long address; /* + low bits used for flags below */ 137 unsigned long address; /* + low bits used for flags below */
138 unsigned int oldchecksum; /* when unstable */
122 union { 139 union {
123 unsigned int oldchecksum; /* when unstable */ 140 struct rb_node node; /* when node of unstable tree */
124 struct rmap_item *next; /* when stable */ 141 struct { /* when listed from stable tree */
125 }; 142 struct stable_node *head;
126 union { 143 struct hlist_node hlist;
127 struct rb_node node; /* when tree node */ 144 };
128 struct rmap_item *prev; /* in stable list */
129 }; 145 };
130}; 146};
131 147
132#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ 148#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
133#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */ 149#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
134#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */ 150#define STABLE_FLAG 0x200 /* is listed from the stable tree */
135 151
136/* The stable and unstable tree heads */ 152/* The stable and unstable tree heads */
137static struct rb_root root_stable_tree = RB_ROOT; 153static struct rb_root root_stable_tree = RB_ROOT;
@@ -148,6 +164,7 @@ static struct ksm_scan ksm_scan = {
148}; 164};
149 165
150static struct kmem_cache *rmap_item_cache; 166static struct kmem_cache *rmap_item_cache;
167static struct kmem_cache *stable_node_cache;
151static struct kmem_cache *mm_slot_cache; 168static struct kmem_cache *mm_slot_cache;
152 169
153/* The number of nodes in the stable tree */ 170/* The number of nodes in the stable tree */
@@ -162,9 +179,6 @@ static unsigned long ksm_pages_unshared;
162/* The number of rmap_items in use: to calculate pages_volatile */ 179/* The number of rmap_items in use: to calculate pages_volatile */
163static unsigned long ksm_rmap_items; 180static unsigned long ksm_rmap_items;
164 181
165/* Limit on the number of unswappable pages used */
166static unsigned long ksm_max_kernel_pages;
167
168/* Number of pages ksmd should scan in one batch */ 182/* Number of pages ksmd should scan in one batch */
169static unsigned int ksm_thread_pages_to_scan = 100; 183static unsigned int ksm_thread_pages_to_scan = 100;
170 184
@@ -190,13 +204,19 @@ static int __init ksm_slab_init(void)
190 if (!rmap_item_cache) 204 if (!rmap_item_cache)
191 goto out; 205 goto out;
192 206
207 stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
208 if (!stable_node_cache)
209 goto out_free1;
210
193 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); 211 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
194 if (!mm_slot_cache) 212 if (!mm_slot_cache)
195 goto out_free; 213 goto out_free2;
196 214
197 return 0; 215 return 0;
198 216
199out_free: 217out_free2:
218 kmem_cache_destroy(stable_node_cache);
219out_free1:
200 kmem_cache_destroy(rmap_item_cache); 220 kmem_cache_destroy(rmap_item_cache);
201out: 221out:
202 return -ENOMEM; 222 return -ENOMEM;
@@ -205,6 +225,7 @@ out:
205static void __init ksm_slab_free(void) 225static void __init ksm_slab_free(void)
206{ 226{
207 kmem_cache_destroy(mm_slot_cache); 227 kmem_cache_destroy(mm_slot_cache);
228 kmem_cache_destroy(stable_node_cache);
208 kmem_cache_destroy(rmap_item_cache); 229 kmem_cache_destroy(rmap_item_cache);
209 mm_slot_cache = NULL; 230 mm_slot_cache = NULL;
210} 231}
@@ -226,6 +247,16 @@ static inline void free_rmap_item(struct rmap_item *rmap_item)
226 kmem_cache_free(rmap_item_cache, rmap_item); 247 kmem_cache_free(rmap_item_cache, rmap_item);
227} 248}
228 249
250static inline struct stable_node *alloc_stable_node(void)
251{
252 return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
253}
254
255static inline void free_stable_node(struct stable_node *stable_node)
256{
257 kmem_cache_free(stable_node_cache, stable_node);
258}
259
229static inline struct mm_slot *alloc_mm_slot(void) 260static inline struct mm_slot *alloc_mm_slot(void)
230{ 261{
231 if (!mm_slot_cache) /* initialization failed */ 262 if (!mm_slot_cache) /* initialization failed */
@@ -275,7 +306,6 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
275 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 306 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
276 % MM_SLOTS_HASH_HEADS]; 307 % MM_SLOTS_HASH_HEADS];
277 mm_slot->mm = mm; 308 mm_slot->mm = mm;
278 INIT_LIST_HEAD(&mm_slot->rmap_list);
279 hlist_add_head(&mm_slot->link, bucket); 309 hlist_add_head(&mm_slot->link, bucket);
280} 310}
281 311
@@ -284,6 +314,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
284 return rmap_item->address & STABLE_FLAG; 314 return rmap_item->address & STABLE_FLAG;
285} 315}
286 316
317static void hold_anon_vma(struct rmap_item *rmap_item,
318 struct anon_vma *anon_vma)
319{
320 rmap_item->anon_vma = anon_vma;
321 atomic_inc(&anon_vma->ksm_refcount);
322}
323
324static void drop_anon_vma(struct rmap_item *rmap_item)
325{
326 struct anon_vma *anon_vma = rmap_item->anon_vma;
327
328 if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) {
329 int empty = list_empty(&anon_vma->head);
330 spin_unlock(&anon_vma->lock);
331 if (empty)
332 anon_vma_free(anon_vma);
333 }
334}
335
287/* 336/*
288 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's 337 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
289 * page tables after it has passed through ksm_exit() - which, if necessary, 338 * page tables after it has passed through ksm_exit() - which, if necessary,
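
hold_anon_vma()/drop_anon_vma() above let a stable-tree rmap_item pin the anon_vma it was merged under; the drop side uses atomic_dec_and_lock() so the anon_vma lock is only taken on the final put, and the structure is freed only if its VMA list is empty by then. A simplified userspace analogue of that pattern is sketched below; dec_and_lock() decrements under a pthread mutex rather than reproducing the kernel's lock-free fast path, and struct anon_vma_like and the other names are made up for illustration:

/* Sketch: free a shared structure only when the last holder drops it,
 * taking its lock just for the final teardown.  Compile with -lpthread. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct anon_vma_like {
    pthread_mutex_t lock;
    int refcount;
    int nr_vmas;            /* stands in for list_empty(&anon_vma->head) */
};

/* Simplified atomic_dec_and_lock(): decrement, and return with the lock
 * held only when the count reaches zero. */
static bool dec_and_lock(struct anon_vma_like *av)
{
    pthread_mutex_lock(&av->lock);
    if (--av->refcount == 0)
        return true;        /* caller now owns the lock */
    pthread_mutex_unlock(&av->lock);
    return false;
}

static void drop_ref(struct anon_vma_like *av)
{
    if (dec_and_lock(av)) {
        bool empty = (av->nr_vmas == 0);

        pthread_mutex_unlock(&av->lock);
        if (empty) {
            printf("last holder gone, freeing\n");
            pthread_mutex_destroy(&av->lock);
            free(av);
        }
    }
}

int main(void)
{
    struct anon_vma_like *av = malloc(sizeof(*av));

    pthread_mutex_init(&av->lock, NULL);
    av->refcount = 2;       /* e.g. two rmap_items pinning the anon_vma */
    av->nr_vmas = 0;

    drop_ref(av);           /* 2 -> 1: nothing to do */
    drop_ref(av);           /* 1 -> 0: teardown path runs */
    return 0;
}
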
@@ -316,7 +365,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
316 do { 365 do {
317 cond_resched(); 366 cond_resched();
318 page = follow_page(vma, addr, FOLL_GET); 367 page = follow_page(vma, addr, FOLL_GET);
319 if (!page) 368 if (IS_ERR_OR_NULL(page))
320 break; 369 break;
321 if (PageKsm(page)) 370 if (PageKsm(page))
322 ret = handle_mm_fault(vma->vm_mm, vma, addr, 371 ret = handle_mm_fault(vma->vm_mm, vma, addr,
@@ -356,10 +405,18 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
356 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; 405 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
357} 406}
358 407
359static void break_cow(struct mm_struct *mm, unsigned long addr) 408static void break_cow(struct rmap_item *rmap_item)
360{ 409{
410 struct mm_struct *mm = rmap_item->mm;
411 unsigned long addr = rmap_item->address;
361 struct vm_area_struct *vma; 412 struct vm_area_struct *vma;
362 413
414 /*
415 * It is not an accident that whenever we want to break COW
416 * to undo, we also need to drop a reference to the anon_vma.
417 */
418 drop_anon_vma(rmap_item);
419
363 down_read(&mm->mmap_sem); 420 down_read(&mm->mmap_sem);
364 if (ksm_test_exit(mm)) 421 if (ksm_test_exit(mm))
365 goto out; 422 goto out;
@@ -390,7 +447,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
390 goto out; 447 goto out;
391 448
392 page = follow_page(vma, addr, FOLL_GET); 449 page = follow_page(vma, addr, FOLL_GET);
393 if (!page) 450 if (IS_ERR_OR_NULL(page))
394 goto out; 451 goto out;
395 if (PageAnon(page)) { 452 if (PageAnon(page)) {
396 flush_anon_page(vma, page, addr); 453 flush_anon_page(vma, page, addr);
@@ -403,21 +460,77 @@ out: page = NULL;
403 return page; 460 return page;
404} 461}
405 462
463static void remove_node_from_stable_tree(struct stable_node *stable_node)
464{
465 struct rmap_item *rmap_item;
466 struct hlist_node *hlist;
467
468 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
469 if (rmap_item->hlist.next)
470 ksm_pages_sharing--;
471 else
472 ksm_pages_shared--;
473 drop_anon_vma(rmap_item);
474 rmap_item->address &= PAGE_MASK;
475 cond_resched();
476 }
477
478 rb_erase(&stable_node->node, &root_stable_tree);
479 free_stable_node(stable_node);
480}
481
406/* 482/*
407 * get_ksm_page: checks if the page at the virtual address in rmap_item 483 * get_ksm_page: checks if the page indicated by the stable node
408 * is still PageKsm, in which case we can trust the content of the page, 484 * is still its ksm page, despite having held no reference to it.
409 * and it returns the gotten page; but NULL if the page has been zapped. 485 * In which case we can trust the content of the page, and it
486 * returns the gotten page; but if the page has now been zapped,
487 * remove the stale node from the stable tree and return NULL.
488 *
489 * You would expect the stable_node to hold a reference to the ksm page.
490 * But if it increments the page's count, swapping out has to wait for
491 * ksmd to come around again before it can free the page, which may take
492 * seconds or even minutes: much too unresponsive. So instead we use a
493 * "keyhole reference": access to the ksm page from the stable node peeps
494 * out through its keyhole to see if that page still holds the right key,
495 * pointing back to this stable node. This relies on freeing a PageAnon
496 * page to reset its page->mapping to NULL, and relies on no other use of
497 * a page to put something that might look like our key in page->mapping.
498 *
499 * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
500 * but this is different - made simpler by ksm_thread_mutex being held, but
501 * interesting for assuming that no other use of the struct page could ever
502 * put our expected_mapping into page->mapping (or a field of the union which
503 * coincides with page->mapping). The RCU calls are not for KSM at all, but
504 * to keep the page_count protocol described with page_cache_get_speculative.
505 *
506 * Note: it is possible that get_ksm_page() will return NULL one moment,
507 * then page the next, if the page is in between page_freeze_refs() and
508 * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
509 * is on its way to being freed; but it is an anomaly to bear in mind.
410 */ 510 */
411static struct page *get_ksm_page(struct rmap_item *rmap_item) 511static struct page *get_ksm_page(struct stable_node *stable_node)
412{ 512{
413 struct page *page; 513 struct page *page;
414 514 void *expected_mapping;
415 page = get_mergeable_page(rmap_item); 515
416 if (page && !PageKsm(page)) { 516 page = pfn_to_page(stable_node->kpfn);
517 expected_mapping = (void *)stable_node +
518 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
519 rcu_read_lock();
520 if (page->mapping != expected_mapping)
521 goto stale;
522 if (!get_page_unless_zero(page))
523 goto stale;
524 if (page->mapping != expected_mapping) {
417 put_page(page); 525 put_page(page);
418 page = NULL; 526 goto stale;
419 } 527 }
528 rcu_read_unlock();
420 return page; 529 return page;
530stale:
531 rcu_read_unlock();
532 remove_node_from_stable_tree(stable_node);
533 return NULL;
421} 534}
422 535
423/* 536/*
@@ -426,35 +539,29 @@ static struct page *get_ksm_page(struct rmap_item *rmap_item)
426 */ 539 */
427static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) 540static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
428{ 541{
429 if (in_stable_tree(rmap_item)) { 542 if (rmap_item->address & STABLE_FLAG) {
430 struct rmap_item *next_item = rmap_item->next; 543 struct stable_node *stable_node;
431 544 struct page *page;
432 if (rmap_item->address & NODE_FLAG) {
433 if (next_item) {
434 rb_replace_node(&rmap_item->node,
435 &next_item->node,
436 &root_stable_tree);
437 next_item->address |= NODE_FLAG;
438 ksm_pages_sharing--;
439 } else {
440 rb_erase(&rmap_item->node, &root_stable_tree);
441 ksm_pages_shared--;
442 }
443 } else {
444 struct rmap_item *prev_item = rmap_item->prev;
445 545
446 BUG_ON(prev_item->next != rmap_item); 546 stable_node = rmap_item->head;
447 prev_item->next = next_item; 547 page = get_ksm_page(stable_node);
448 if (next_item) { 548 if (!page)
449 BUG_ON(next_item->prev != rmap_item); 549 goto out;
450 next_item->prev = rmap_item->prev; 550
451 } 551 lock_page(page);
552 hlist_del(&rmap_item->hlist);
553 unlock_page(page);
554 put_page(page);
555
556 if (stable_node->hlist.first)
452 ksm_pages_sharing--; 557 ksm_pages_sharing--;
453 } 558 else
559 ksm_pages_shared--;
454 560
455 rmap_item->next = NULL; 561 drop_anon_vma(rmap_item);
562 rmap_item->address &= PAGE_MASK;
456 563
457 } else if (rmap_item->address & NODE_FLAG) { 564 } else if (rmap_item->address & UNSTABLE_FLAG) {
458 unsigned char age; 565 unsigned char age;
459 /* 566 /*
460 * Usually ksmd can and must skip the rb_erase, because 567 * Usually ksmd can and must skip the rb_erase, because
@@ -467,24 +574,21 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
467 BUG_ON(age > 1); 574 BUG_ON(age > 1);
468 if (!age) 575 if (!age)
469 rb_erase(&rmap_item->node, &root_unstable_tree); 576 rb_erase(&rmap_item->node, &root_unstable_tree);
577
470 ksm_pages_unshared--; 578 ksm_pages_unshared--;
579 rmap_item->address &= PAGE_MASK;
471 } 580 }
472 581out:
473 rmap_item->address &= PAGE_MASK;
474
475 cond_resched(); /* we're called from many long loops */ 582 cond_resched(); /* we're called from many long loops */
476} 583}
477 584
478static void remove_trailing_rmap_items(struct mm_slot *mm_slot, 585static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
479 struct list_head *cur) 586 struct rmap_item **rmap_list)
480{ 587{
481 struct rmap_item *rmap_item; 588 while (*rmap_list) {
482 589 struct rmap_item *rmap_item = *rmap_list;
483 while (cur != &mm_slot->rmap_list) { 590 *rmap_list = rmap_item->rmap_list;
484 rmap_item = list_entry(cur, struct rmap_item, link);
485 cur = cur->next;
486 remove_rmap_item_from_tree(rmap_item); 591 remove_rmap_item_from_tree(rmap_item);
487 list_del(&rmap_item->link);
488 free_rmap_item(rmap_item); 592 free_rmap_item(rmap_item);
489 } 593 }
490} 594}
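
With rmap_list turned into a bare singly-linked list, remove_trailing_rmap_items() above (and get_next_rmap_item() further down) walk it through a struct rmap_item ** cursor: the cursor always addresses the link that would need rewriting, so nodes can be unlinked or inserted without carrying a separate previous pointer. A small standalone sketch of that pointer-to-pointer idiom, with plain ints instead of rmap_items and invented helper names:

/* Sketch: edit a singly-linked list through a pointer-to-pointer cursor,
 * the idiom used for mm_slot->rmap_list. */
#include <stdio.h>
#include <stdlib.h>

struct node {
    int val;
    struct node *next;
};

/* Free every node from *list onward and leave *list == NULL,
 * like remove_trailing_rmap_items(). */
static void remove_trailing(struct node **list)
{
    while (*list) {
        struct node *n = *list;

        *list = n->next;
        free(n);
    }
}

/* Splice a new node in at the cursor position, like get_next_rmap_item()
 * does when no rmap_item exists yet for the address being scanned. */
static struct node *insert_at(struct node **list, int val)
{
    struct node *n = malloc(sizeof(*n));

    n->val = val;
    n->next = *list;
    *list = n;
    return n;
}

int main(void)
{
    struct node *head = NULL, *n;
    struct node **cursor = &head;

    insert_at(cursor, 1);
    cursor = &head->next;           /* advance the cursor past the first node */
    insert_at(cursor, 2);
    insert_at(cursor, 3);           /* list is now 1 -> 3 -> 2 */

    for (n = head; n; n = n->next)
        printf("%d\n", n->val);

    remove_trailing(&head->next);   /* drop everything after the first node */
    printf("after trim: %d, next=%p\n", head->val, (void *)head->next);
    free(head);
    return 0;
}
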
@@ -550,7 +654,7 @@ static int unmerge_and_remove_all_rmap_items(void)
550 goto error; 654 goto error;
551 } 655 }
552 656
553 remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); 657 remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
554 658
555 spin_lock(&ksm_mmlist_lock); 659 spin_lock(&ksm_mmlist_lock);
556 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, 660 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
@@ -646,8 +750,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
646 * Check that no O_DIRECT or similar I/O is in progress on the 750 * Check that no O_DIRECT or similar I/O is in progress on the
647 * page 751 * page
648 */ 752 */
649 if ((page_mapcount(page) + 2 + swapped) != page_count(page)) { 753 if (page_mapcount(page) + 1 + swapped != page_count(page)) {
650 set_pte_at_notify(mm, addr, ptep, entry); 754 set_pte_at(mm, addr, ptep, entry);
651 goto out_unlock; 755 goto out_unlock;
652 } 756 }
653 entry = pte_wrprotect(entry); 757 entry = pte_wrprotect(entry);
@@ -664,15 +768,15 @@ out:
664 768
665/** 769/**
666 * replace_page - replace page in vma by new ksm page 770 * replace_page - replace page in vma by new ksm page
667 * @vma: vma that holds the pte pointing to oldpage 771 * @vma: vma that holds the pte pointing to page
668 * @oldpage: the page we are replacing by newpage 772 * @page: the page we are replacing by kpage
669 * @newpage: the ksm page we replace oldpage by 773 * @kpage: the ksm page we replace page by
670 * @orig_pte: the original value of the pte 774 * @orig_pte: the original value of the pte
671 * 775 *
672 * Returns 0 on success, -EFAULT on failure. 776 * Returns 0 on success, -EFAULT on failure.
673 */ 777 */
674static int replace_page(struct vm_area_struct *vma, struct page *oldpage, 778static int replace_page(struct vm_area_struct *vma, struct page *page,
675 struct page *newpage, pte_t orig_pte) 779 struct page *kpage, pte_t orig_pte)
676{ 780{
677 struct mm_struct *mm = vma->vm_mm; 781 struct mm_struct *mm = vma->vm_mm;
678 pgd_t *pgd; 782 pgd_t *pgd;
@@ -681,12 +785,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
681 pte_t *ptep; 785 pte_t *ptep;
682 spinlock_t *ptl; 786 spinlock_t *ptl;
683 unsigned long addr; 787 unsigned long addr;
684 pgprot_t prot;
685 int err = -EFAULT; 788 int err = -EFAULT;
686 789
687 prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE); 790 addr = page_address_in_vma(page, vma);
688
689 addr = page_address_in_vma(oldpage, vma);
690 if (addr == -EFAULT) 791 if (addr == -EFAULT)
691 goto out; 792 goto out;
692 793
@@ -708,15 +809,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
708 goto out; 809 goto out;
709 } 810 }
710 811
711 get_page(newpage); 812 get_page(kpage);
712 page_add_ksm_rmap(newpage); 813 page_add_anon_rmap(kpage, vma, addr);
713 814
714 flush_cache_page(vma, addr, pte_pfn(*ptep)); 815 flush_cache_page(vma, addr, pte_pfn(*ptep));
715 ptep_clear_flush(vma, addr, ptep); 816 ptep_clear_flush(vma, addr, ptep);
716 set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot)); 817 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
717 818
718 page_remove_rmap(oldpage); 819 page_remove_rmap(page);
719 put_page(oldpage); 820 put_page(page);
720 821
721 pte_unmap_unlock(ptep, ptl); 822 pte_unmap_unlock(ptep, ptl);
722 err = 0; 823 err = 0;
@@ -726,32 +827,27 @@ out:
726 827
727/* 828/*
728 * try_to_merge_one_page - take two pages and merge them into one 829 * try_to_merge_one_page - take two pages and merge them into one
729 * @vma: the vma that hold the pte pointing into oldpage 830 * @vma: the vma that holds the pte pointing to page
730 * @oldpage: the page that we want to replace with newpage 831 * @page: the PageAnon page that we want to replace with kpage
731 * @newpage: the page that we want to map instead of oldpage 832 * @kpage: the PageKsm page that we want to map instead of page,
732 * 833 * or NULL the first time when we want to use page as kpage.
733 * Note:
734 * oldpage should be a PageAnon page, while newpage should be a PageKsm page,
735 * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
736 * 834 *
737 * This function returns 0 if the pages were merged, -EFAULT otherwise. 835 * This function returns 0 if the pages were merged, -EFAULT otherwise.
738 */ 836 */
739static int try_to_merge_one_page(struct vm_area_struct *vma, 837static int try_to_merge_one_page(struct vm_area_struct *vma,
740 struct page *oldpage, 838 struct page *page, struct page *kpage)
741 struct page *newpage)
742{ 839{
743 pte_t orig_pte = __pte(0); 840 pte_t orig_pte = __pte(0);
744 int err = -EFAULT; 841 int err = -EFAULT;
745 842
843 if (page == kpage) /* ksm page forked */
844 return 0;
845
746 if (!(vma->vm_flags & VM_MERGEABLE)) 846 if (!(vma->vm_flags & VM_MERGEABLE))
747 goto out; 847 goto out;
748 848 if (!PageAnon(page))
749 if (!PageAnon(oldpage))
750 goto out; 849 goto out;
751 850
752 get_page(newpage);
753 get_page(oldpage);
754
755 /* 851 /*
756 * We need the page lock to read a stable PageSwapCache in 852 * We need the page lock to read a stable PageSwapCache in
757 * write_protect_page(). We use trylock_page() instead of 853 * write_protect_page(). We use trylock_page() instead of
@@ -759,26 +855,39 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
759 * prefer to continue scanning and merging different pages, 855 * prefer to continue scanning and merging different pages,
760 * then come back to this page when it is unlocked. 856 * then come back to this page when it is unlocked.
761 */ 857 */
762 if (!trylock_page(oldpage)) 858 if (!trylock_page(page))
763 goto out_putpage; 859 goto out;
764 /* 860 /*
765 * If this anonymous page is mapped only here, its pte may need 861 * If this anonymous page is mapped only here, its pte may need
766 * to be write-protected. If it's mapped elsewhere, all of its 862 * to be write-protected. If it's mapped elsewhere, all of its
767 * ptes are necessarily already write-protected. But in either 863 * ptes are necessarily already write-protected. But in either
768 * case, we need to lock and check page_count is not raised. 864 * case, we need to lock and check page_count is not raised.
769 */ 865 */
770 if (write_protect_page(vma, oldpage, &orig_pte)) { 866 if (write_protect_page(vma, page, &orig_pte) == 0) {
771 unlock_page(oldpage); 867 if (!kpage) {
772 goto out_putpage; 868 /*
869 * While we hold page lock, upgrade page from
870 * PageAnon+anon_vma to PageKsm+NULL stable_node:
871 * stable_tree_insert() will update stable_node.
872 */
873 set_page_stable_node(page, NULL);
874 mark_page_accessed(page);
875 err = 0;
876 } else if (pages_identical(page, kpage))
877 err = replace_page(vma, page, kpage, orig_pte);
773 } 878 }
774 unlock_page(oldpage);
775 879
776 if (pages_identical(oldpage, newpage)) 880 if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
777 err = replace_page(vma, oldpage, newpage, orig_pte); 881 munlock_vma_page(page);
882 if (!PageMlocked(kpage)) {
883 unlock_page(page);
884 lock_page(kpage);
885 mlock_vma_page(kpage);
886 page = kpage; /* for final unlock */
887 }
888 }
778 889
779out_putpage: 890 unlock_page(page);
780 put_page(oldpage);
781 put_page(newpage);
782out: 891out:
783 return err; 892 return err;
784} 893}
@@ -786,26 +895,31 @@ out:
786/* 895/*
787 * try_to_merge_with_ksm_page - like try_to_merge_two_pages, 896 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
788 * but no new kernel page is allocated: kpage must already be a ksm page. 897 * but no new kernel page is allocated: kpage must already be a ksm page.
898 *
899 * This function returns 0 if the pages were merged, -EFAULT otherwise.
789 */ 900 */
790static int try_to_merge_with_ksm_page(struct mm_struct *mm1, 901static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
791 unsigned long addr1, 902 struct page *page, struct page *kpage)
792 struct page *page1,
793 struct page *kpage)
794{ 903{
904 struct mm_struct *mm = rmap_item->mm;
795 struct vm_area_struct *vma; 905 struct vm_area_struct *vma;
796 int err = -EFAULT; 906 int err = -EFAULT;
797 907
798 down_read(&mm1->mmap_sem); 908 down_read(&mm->mmap_sem);
799 if (ksm_test_exit(mm1)) 909 if (ksm_test_exit(mm))
910 goto out;
911 vma = find_vma(mm, rmap_item->address);
912 if (!vma || vma->vm_start > rmap_item->address)
800 goto out; 913 goto out;
801 914
802 vma = find_vma(mm1, addr1); 915 err = try_to_merge_one_page(vma, page, kpage);
803 if (!vma || vma->vm_start > addr1) 916 if (err)
804 goto out; 917 goto out;
805 918
806 err = try_to_merge_one_page(vma, page1, kpage); 919 /* Must get reference to anon_vma while still holding mmap_sem */
920 hold_anon_vma(rmap_item, vma->anon_vma);
807out: 921out:
808 up_read(&mm1->mmap_sem); 922 up_read(&mm->mmap_sem);
809 return err; 923 return err;
810} 924}
811 925
@@ -813,109 +927,73 @@ out:
813 * try_to_merge_two_pages - take two identical pages and prepare them 927 * try_to_merge_two_pages - take two identical pages and prepare them
814 * to be merged into one page. 928 * to be merged into one page.
815 * 929 *
816 * This function returns 0 if we successfully mapped two identical pages 930 * This function returns the kpage if we successfully merged two identical
817 * into one page, -EFAULT otherwise. 931 * pages into one ksm page, NULL otherwise.
818 * 932 *
819 * Note that this function allocates a new kernel page: if one of the pages 933 * Note that this function upgrades page to ksm page: if one of the pages
820 * is already a ksm page, try_to_merge_with_ksm_page should be used. 934 * is already a ksm page, try_to_merge_with_ksm_page should be used.
821 */ 935 */
822static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1, 936static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
823 struct page *page1, struct mm_struct *mm2, 937 struct page *page,
824 unsigned long addr2, struct page *page2) 938 struct rmap_item *tree_rmap_item,
939 struct page *tree_page)
825{ 940{
826 struct vm_area_struct *vma; 941 int err;
827 struct page *kpage;
828 int err = -EFAULT;
829
830 /*
831 * The number of nodes in the stable tree
832 * is the number of kernel pages that we hold.
833 */
834 if (ksm_max_kernel_pages &&
835 ksm_max_kernel_pages <= ksm_pages_shared)
836 return err;
837
838 kpage = alloc_page(GFP_HIGHUSER);
839 if (!kpage)
840 return err;
841
842 down_read(&mm1->mmap_sem);
843 if (ksm_test_exit(mm1)) {
844 up_read(&mm1->mmap_sem);
845 goto out;
846 }
847 vma = find_vma(mm1, addr1);
848 if (!vma || vma->vm_start > addr1) {
849 up_read(&mm1->mmap_sem);
850 goto out;
851 }
852
853 copy_user_highpage(kpage, page1, addr1, vma);
854 err = try_to_merge_one_page(vma, page1, kpage);
855 up_read(&mm1->mmap_sem);
856 942
943 err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
857 if (!err) { 944 if (!err) {
858 err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage); 945 err = try_to_merge_with_ksm_page(tree_rmap_item,
946 tree_page, page);
859 /* 947 /*
860 * If that fails, we have a ksm page with only one pte 948 * If that fails, we have a ksm page with only one pte
861 * pointing to it: so break it. 949 * pointing to it: so break it.
862 */ 950 */
863 if (err) 951 if (err)
864 break_cow(mm1, addr1); 952 break_cow(rmap_item);
865 } 953 }
866out: 954 return err ? NULL : page;
867 put_page(kpage);
868 return err;
869} 955}
870 956
871/* 957/*
872 * stable_tree_search - search page inside the stable tree 958 * stable_tree_search - search for page inside the stable tree
873 * @page: the page that we are searching identical pages to.
874 * @page2: pointer into identical page that we are holding inside the stable
875 * tree that we have found.
876 * @rmap_item: the reverse mapping item
877 * 959 *
878 * This function checks if there is a page inside the stable tree 960 * This function checks if there is a page inside the stable tree
879 * with identical content to the page that we are scanning right now. 961 * with identical content to the page that we are scanning right now.
880 * 962 *
881 * This function return rmap_item pointer to the identical item if found, 963 * This function returns the stable tree node of identical content if found,
882 * NULL otherwise. 964 * NULL otherwise.
883 */ 965 */
884static struct rmap_item *stable_tree_search(struct page *page, 966static struct page *stable_tree_search(struct page *page)
885 struct page **page2,
886 struct rmap_item *rmap_item)
887{ 967{
888 struct rb_node *node = root_stable_tree.rb_node; 968 struct rb_node *node = root_stable_tree.rb_node;
969 struct stable_node *stable_node;
970
971 stable_node = page_stable_node(page);
972 if (stable_node) { /* ksm page forked */
973 get_page(page);
974 return page;
975 }
889 976
890 while (node) { 977 while (node) {
891 struct rmap_item *tree_rmap_item, *next_rmap_item; 978 struct page *tree_page;
892 int ret; 979 int ret;
893 980
894 tree_rmap_item = rb_entry(node, struct rmap_item, node); 981 cond_resched();
895 while (tree_rmap_item) { 982 stable_node = rb_entry(node, struct stable_node, node);
896 BUG_ON(!in_stable_tree(tree_rmap_item)); 983 tree_page = get_ksm_page(stable_node);
897 cond_resched(); 984 if (!tree_page)
898 page2[0] = get_ksm_page(tree_rmap_item);
899 if (page2[0])
900 break;
901 next_rmap_item = tree_rmap_item->next;
902 remove_rmap_item_from_tree(tree_rmap_item);
903 tree_rmap_item = next_rmap_item;
904 }
905 if (!tree_rmap_item)
906 return NULL; 985 return NULL;
907 986
908 ret = memcmp_pages(page, page2[0]); 987 ret = memcmp_pages(page, tree_page);
909 988
910 if (ret < 0) { 989 if (ret < 0) {
911 put_page(page2[0]); 990 put_page(tree_page);
912 node = node->rb_left; 991 node = node->rb_left;
913 } else if (ret > 0) { 992 } else if (ret > 0) {
914 put_page(page2[0]); 993 put_page(tree_page);
915 node = node->rb_right; 994 node = node->rb_right;
916 } else { 995 } else
917 return tree_rmap_item; 996 return tree_page;
918 }
919 } 997 }
920 998
921 return NULL; 999 return NULL;
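
stable_tree_search() now descends a tree keyed by the page contents themselves: each step compares the scanned page against the node's ksm page with memcmp_pages() and goes left or right on the sign of the result, dropping the page reference on a mismatch. The sketch below shows that search shape over a plain, unbalanced binary tree of fixed-size buffers; memcmp() stands in for memcmp_pages(), and the rb-tree rebalancing, get_ksm_page() validation and cond_resched() calls are omitted:

/* Sketch: search a binary tree keyed by buffer contents, the way the KSM
 * stable tree is keyed by page contents.  Plain BST, not an rbtree. */
#include <stdio.h>
#include <string.h>

#define PAGE_SZ 8                       /* toy "page" size */

struct tnode {
    unsigned char data[PAGE_SZ];        /* stands in for the ksm page */
    struct tnode *left, *right;
};

static struct tnode *tree_search(struct tnode *node, const unsigned char *page)
{
    while (node) {
        int ret = memcmp(page, node->data, PAGE_SZ);    /* memcmp_pages() */

        if (ret < 0)
            node = node->left;
        else if (ret > 0)
            node = node->right;
        else
            return node;                /* identical contents found */
    }
    return NULL;
}

int main(void)
{
    struct tnode b = { "bbbbbbb", NULL, NULL };
    struct tnode a = { "aaaaaaa", NULL, NULL };
    struct tnode c = { "ccccccc", NULL, NULL };
    struct tnode *root = &b;

    b.left = &a;
    b.right = &c;

    printf("found: %s\n", tree_search(root, (const unsigned char *)"ccccccc")
           ? "yes" : "no");
    printf("found: %s\n", tree_search(root, (const unsigned char *)"zzzzzzz")
           ? "yes" : "no");
    return 0;
}
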
@@ -925,38 +1003,26 @@ static struct rmap_item *stable_tree_search(struct page *page,
925 * stable_tree_insert - insert rmap_item pointing to new ksm page 1003 * stable_tree_insert - insert rmap_item pointing to new ksm page
926 * into the stable tree. 1004 * into the stable tree.
927 * 1005 *
928 * @page: the page that we are searching identical page to inside the stable 1006 * This function returns the stable tree node just allocated on success,
929 * tree. 1007 * NULL otherwise.
930 * @rmap_item: pointer to the reverse mapping item.
931 *
932 * This function returns rmap_item if success, NULL otherwise.
933 */ 1008 */
934static struct rmap_item *stable_tree_insert(struct page *page, 1009static struct stable_node *stable_tree_insert(struct page *kpage)
935 struct rmap_item *rmap_item)
936{ 1010{
937 struct rb_node **new = &root_stable_tree.rb_node; 1011 struct rb_node **new = &root_stable_tree.rb_node;
938 struct rb_node *parent = NULL; 1012 struct rb_node *parent = NULL;
1013 struct stable_node *stable_node;
939 1014
940 while (*new) { 1015 while (*new) {
941 struct rmap_item *tree_rmap_item, *next_rmap_item;
942 struct page *tree_page; 1016 struct page *tree_page;
943 int ret; 1017 int ret;
944 1018
945 tree_rmap_item = rb_entry(*new, struct rmap_item, node); 1019 cond_resched();
946 while (tree_rmap_item) { 1020 stable_node = rb_entry(*new, struct stable_node, node);
947 BUG_ON(!in_stable_tree(tree_rmap_item)); 1021 tree_page = get_ksm_page(stable_node);
948 cond_resched(); 1022 if (!tree_page)
949 tree_page = get_ksm_page(tree_rmap_item);
950 if (tree_page)
951 break;
952 next_rmap_item = tree_rmap_item->next;
953 remove_rmap_item_from_tree(tree_rmap_item);
954 tree_rmap_item = next_rmap_item;
955 }
956 if (!tree_rmap_item)
957 return NULL; 1023 return NULL;
958 1024
959 ret = memcmp_pages(page, tree_page); 1025 ret = memcmp_pages(kpage, tree_page);
960 put_page(tree_page); 1026 put_page(tree_page);
961 1027
962 parent = *new; 1028 parent = *new;
@@ -974,22 +1040,24 @@ static struct rmap_item *stable_tree_insert(struct page *page,
974 } 1040 }
975 } 1041 }
976 1042
977 rmap_item->address |= NODE_FLAG | STABLE_FLAG; 1043 stable_node = alloc_stable_node();
978 rmap_item->next = NULL; 1044 if (!stable_node)
979 rb_link_node(&rmap_item->node, parent, new); 1045 return NULL;
980 rb_insert_color(&rmap_item->node, &root_stable_tree);
981 1046
982 ksm_pages_shared++; 1047 rb_link_node(&stable_node->node, parent, new);
983 return rmap_item; 1048 rb_insert_color(&stable_node->node, &root_stable_tree);
1049
1050 INIT_HLIST_HEAD(&stable_node->hlist);
1051
1052 stable_node->kpfn = page_to_pfn(kpage);
1053 set_page_stable_node(kpage, stable_node);
1054
1055 return stable_node;
984} 1056}
985 1057
986/* 1058/*
987 * unstable_tree_search_insert - search and insert items into the unstable tree. 1059 * unstable_tree_search_insert - search for identical page,
988 * 1060 * else insert rmap_item into the unstable tree.
989 * @page: the page that we are going to search for identical page or to insert
990 * into the unstable tree
991 * @page2: pointer into identical page that was found inside the unstable tree
992 * @rmap_item: the reverse mapping item of page
993 * 1061 *
994 * This function searches for a page in the unstable tree identical to the 1062 * This function searches for a page in the unstable tree identical to the
995 * page currently being scanned; and if no identical page is found in the 1063 * page currently being scanned; and if no identical page is found in the
@@ -1001,47 +1069,50 @@ static struct rmap_item *stable_tree_insert(struct page *page,
1001 * This function does both searching and inserting, because they share 1069 * This function does both searching and inserting, because they share
1002 * the same walking algorithm in an rbtree. 1070 * the same walking algorithm in an rbtree.
1003 */ 1071 */
1004static struct rmap_item *unstable_tree_search_insert(struct page *page, 1072static
1005 struct page **page2, 1073struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1006 struct rmap_item *rmap_item) 1074 struct page *page,
1075 struct page **tree_pagep)
1076
1007{ 1077{
1008 struct rb_node **new = &root_unstable_tree.rb_node; 1078 struct rb_node **new = &root_unstable_tree.rb_node;
1009 struct rb_node *parent = NULL; 1079 struct rb_node *parent = NULL;
1010 1080
1011 while (*new) { 1081 while (*new) {
1012 struct rmap_item *tree_rmap_item; 1082 struct rmap_item *tree_rmap_item;
1083 struct page *tree_page;
1013 int ret; 1084 int ret;
1014 1085
1015 cond_resched(); 1086 cond_resched();
1016 tree_rmap_item = rb_entry(*new, struct rmap_item, node); 1087 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1017 page2[0] = get_mergeable_page(tree_rmap_item); 1088 tree_page = get_mergeable_page(tree_rmap_item);
1018 if (!page2[0]) 1089 if (IS_ERR_OR_NULL(tree_page))
1019 return NULL; 1090 return NULL;
1020 1091
1021 /* 1092 /*
1022 * Don't substitute an unswappable ksm page 1093 * Don't substitute a ksm page for a forked page.
1023 * just for one good swappable forked page.
1024 */ 1094 */
1025 if (page == page2[0]) { 1095 if (page == tree_page) {
1026 put_page(page2[0]); 1096 put_page(tree_page);
1027 return NULL; 1097 return NULL;
1028 } 1098 }
1029 1099
1030 ret = memcmp_pages(page, page2[0]); 1100 ret = memcmp_pages(page, tree_page);
1031 1101
1032 parent = *new; 1102 parent = *new;
1033 if (ret < 0) { 1103 if (ret < 0) {
1034 put_page(page2[0]); 1104 put_page(tree_page);
1035 new = &parent->rb_left; 1105 new = &parent->rb_left;
1036 } else if (ret > 0) { 1106 } else if (ret > 0) {
1037 put_page(page2[0]); 1107 put_page(tree_page);
1038 new = &parent->rb_right; 1108 new = &parent->rb_right;
1039 } else { 1109 } else {
1110 *tree_pagep = tree_page;
1040 return tree_rmap_item; 1111 return tree_rmap_item;
1041 } 1112 }
1042 } 1113 }
1043 1114
1044 rmap_item->address |= NODE_FLAG; 1115 rmap_item->address |= UNSTABLE_FLAG;
1045 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); 1116 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1046 rb_link_node(&rmap_item->node, parent, new); 1117 rb_link_node(&rmap_item->node, parent, new);
1047 rb_insert_color(&rmap_item->node, &root_unstable_tree); 1118 rb_insert_color(&rmap_item->node, &root_unstable_tree);
@@ -1056,18 +1127,16 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page,
1056 * the same ksm page. 1127 * the same ksm page.
1057 */ 1128 */
1058static void stable_tree_append(struct rmap_item *rmap_item, 1129static void stable_tree_append(struct rmap_item *rmap_item,
1059 struct rmap_item *tree_rmap_item) 1130 struct stable_node *stable_node)
1060{ 1131{
1061 rmap_item->next = tree_rmap_item->next; 1132 rmap_item->head = stable_node;
1062 rmap_item->prev = tree_rmap_item;
1063
1064 if (tree_rmap_item->next)
1065 tree_rmap_item->next->prev = rmap_item;
1066
1067 tree_rmap_item->next = rmap_item;
1068 rmap_item->address |= STABLE_FLAG; 1133 rmap_item->address |= STABLE_FLAG;
1134 hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
1069 1135
1070 ksm_pages_sharing++; 1136 if (rmap_item->hlist.next)
1137 ksm_pages_sharing++;
1138 else
1139 ksm_pages_shared++;
1071} 1140}
1072 1141
1073/* 1142/*
@@ -1081,49 +1150,37 @@ static void stable_tree_append(struct rmap_item *rmap_item,
1081 */ 1150 */
1082static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) 1151static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1083{ 1152{
1084 struct page *page2[1];
1085 struct rmap_item *tree_rmap_item; 1153 struct rmap_item *tree_rmap_item;
1154 struct page *tree_page = NULL;
1155 struct stable_node *stable_node;
1156 struct page *kpage;
1086 unsigned int checksum; 1157 unsigned int checksum;
1087 int err; 1158 int err;
1088 1159
1089 if (in_stable_tree(rmap_item)) 1160 remove_rmap_item_from_tree(rmap_item);
1090 remove_rmap_item_from_tree(rmap_item);
1091 1161
1092 /* We first start with searching the page inside the stable tree */ 1162 /* We first start with searching the page inside the stable tree */
1093 tree_rmap_item = stable_tree_search(page, page2, rmap_item); 1163 kpage = stable_tree_search(page);
1094 if (tree_rmap_item) { 1164 if (kpage) {
1095 if (page == page2[0]) /* forked */ 1165 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
1096 err = 0;
1097 else
1098 err = try_to_merge_with_ksm_page(rmap_item->mm,
1099 rmap_item->address,
1100 page, page2[0]);
1101 put_page(page2[0]);
1102
1103 if (!err) { 1166 if (!err) {
1104 /* 1167 /*
1105 * The page was successfully merged: 1168 * The page was successfully merged:
1106 * add its rmap_item to the stable tree. 1169 * add its rmap_item to the stable tree.
1107 */ 1170 */
1108 stable_tree_append(rmap_item, tree_rmap_item); 1171 lock_page(kpage);
1172 stable_tree_append(rmap_item, page_stable_node(kpage));
1173 unlock_page(kpage);
1109 } 1174 }
1175 put_page(kpage);
1110 return; 1176 return;
1111 } 1177 }
1112 1178
1113 /* 1179 /*
1114 * A ksm page might have got here by fork, but its other 1180 * If the hash value of the page has changed from the last time
1115 * references have already been removed from the stable tree. 1181 * we calculated it, this page is changing frequently: therefore we
1116 * Or it might be left over from a break_ksm which failed 1182 * don't want to insert it in the unstable tree, and we don't want
1117 * when the mem_cgroup had reached its limit: try again now. 1183 * to waste our time searching for something identical to it there.
1118 */
1119 if (PageKsm(page))
1120 break_cow(rmap_item->mm, rmap_item->address);
1121
1122 /*
1123 * In case the hash value of the page was changed from the last time we
1124 * have calculated it, this page to be changed frequely, therefore we
1125 * don't want to insert it to the unstable tree, and we don't want to
1126 * waste our time to search if there is something identical to it there.
1127 */ 1184 */
1128 checksum = calc_checksum(page); 1185 checksum = calc_checksum(page);
1129 if (rmap_item->oldchecksum != checksum) { 1186 if (rmap_item->oldchecksum != checksum) {
@@ -1131,21 +1188,27 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1131 return; 1188 return;
1132 } 1189 }
1133 1190
1134 tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item); 1191 tree_rmap_item =
1192 unstable_tree_search_insert(rmap_item, page, &tree_page);
1135 if (tree_rmap_item) { 1193 if (tree_rmap_item) {
1136 err = try_to_merge_two_pages(rmap_item->mm, 1194 kpage = try_to_merge_two_pages(rmap_item, page,
1137 rmap_item->address, page, 1195 tree_rmap_item, tree_page);
1138 tree_rmap_item->mm, 1196 put_page(tree_page);
1139 tree_rmap_item->address, page2[0]);
1140 /* 1197 /*
1141 * As soon as we merge this page, we want to remove the 1198 * As soon as we merge this page, we want to remove the
1142 * rmap_item of the page we have merged with from the unstable 1199 * rmap_item of the page we have merged with from the unstable
1143 * tree, and insert it instead as new node in the stable tree. 1200 * tree, and insert it instead as new node in the stable tree.
1144 */ 1201 */
1145 if (!err) { 1202 if (kpage) {
1146 rb_erase(&tree_rmap_item->node, &root_unstable_tree); 1203 remove_rmap_item_from_tree(tree_rmap_item);
1147 tree_rmap_item->address &= ~NODE_FLAG; 1204
1148 ksm_pages_unshared--; 1205 lock_page(kpage);
1206 stable_node = stable_tree_insert(kpage);
1207 if (stable_node) {
1208 stable_tree_append(tree_rmap_item, stable_node);
1209 stable_tree_append(rmap_item, stable_node);
1210 }
1211 unlock_page(kpage);
1149 1212
1150 /* 1213 /*
1151 * If we fail to insert the page into the stable tree, 1214 * If we fail to insert the page into the stable tree,
@@ -1153,37 +1216,28 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1153 * to a ksm page left outside the stable tree, 1216 * to a ksm page left outside the stable tree,
1154 * in which case we need to break_cow on both. 1217 * in which case we need to break_cow on both.
1155 */ 1218 */
1156 if (stable_tree_insert(page2[0], tree_rmap_item)) 1219 if (!stable_node) {
1157 stable_tree_append(rmap_item, tree_rmap_item); 1220 break_cow(tree_rmap_item);
1158 else { 1221 break_cow(rmap_item);
1159 break_cow(tree_rmap_item->mm,
1160 tree_rmap_item->address);
1161 break_cow(rmap_item->mm, rmap_item->address);
1162 } 1222 }
1163 } 1223 }
1164
1165 put_page(page2[0]);
1166 } 1224 }
1167} 1225}
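
Taken together, the cmp_and_merge_page() hunks above reorder the scan into: try the stable tree first, skip pages whose checksum changed since the last scan, and only then search or extend the unstable tree, promoting a match into a new stable node. The toy classify() below is a hypothetical reduction of just that decision order; forked-page handling, the break_cow() fallbacks and all locking are deliberately left out:

/* Sketch: the high-level decision order of cmp_and_merge_page() after this
 * patch, reduced to an enum.  Hypothetical names; no real merging. */
#include <stdbool.h>
#include <stdio.h>

enum ksm_outcome {
    MERGED_WITH_STABLE,     /* identical ksm page already in the stable tree */
    SKIPPED_VOLATILE,       /* checksum changed since last scan: too volatile */
    PARKED_IN_UNSTABLE,     /* no match yet: rmap_item left in unstable tree */
    PROMOTED_TO_STABLE,     /* matched an unstable page: new stable node */
};

static enum ksm_outcome classify(bool stable_match, bool checksum_changed,
                                 bool unstable_match)
{
    if (stable_match)
        return MERGED_WITH_STABLE;
    if (checksum_changed)
        return SKIPPED_VOLATILE;
    if (unstable_match)
        return PROMOTED_TO_STABLE;
    return PARKED_IN_UNSTABLE;
}

int main(void)
{
    printf("%d\n", classify(false, false, true));   /* 3: promoted to stable */
    printf("%d\n", classify(false, true, false));   /* 1: skipped as volatile */
    return 0;
}
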
1168 1226
1169static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, 1227static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
1170 struct list_head *cur, 1228 struct rmap_item **rmap_list,
1171 unsigned long addr) 1229 unsigned long addr)
1172{ 1230{
1173 struct rmap_item *rmap_item; 1231 struct rmap_item *rmap_item;
1174 1232
1175 while (cur != &mm_slot->rmap_list) { 1233 while (*rmap_list) {
1176 rmap_item = list_entry(cur, struct rmap_item, link); 1234 rmap_item = *rmap_list;
1177 if ((rmap_item->address & PAGE_MASK) == addr) { 1235 if ((rmap_item->address & PAGE_MASK) == addr)
1178 if (!in_stable_tree(rmap_item))
1179 remove_rmap_item_from_tree(rmap_item);
1180 return rmap_item; 1236 return rmap_item;
1181 }
1182 if (rmap_item->address > addr) 1237 if (rmap_item->address > addr)
1183 break; 1238 break;
1184 cur = cur->next; 1239 *rmap_list = rmap_item->rmap_list;
1185 remove_rmap_item_from_tree(rmap_item); 1240 remove_rmap_item_from_tree(rmap_item);
1186 list_del(&rmap_item->link);
1187 free_rmap_item(rmap_item); 1241 free_rmap_item(rmap_item);
1188 } 1242 }
1189 1243
@@ -1192,7 +1246,8 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
1192 /* It has already been zeroed */ 1246 /* It has already been zeroed */
1193 rmap_item->mm = mm_slot->mm; 1247 rmap_item->mm = mm_slot->mm;
1194 rmap_item->address = addr; 1248 rmap_item->address = addr;
1195 list_add_tail(&rmap_item->link, cur); 1249 rmap_item->rmap_list = *rmap_list;
1250 *rmap_list = rmap_item;
1196 } 1251 }
1197 return rmap_item; 1252 return rmap_item;
1198} 1253}
@@ -1217,8 +1272,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1217 spin_unlock(&ksm_mmlist_lock); 1272 spin_unlock(&ksm_mmlist_lock);
1218next_mm: 1273next_mm:
1219 ksm_scan.address = 0; 1274 ksm_scan.address = 0;
1220 ksm_scan.rmap_item = list_entry(&slot->rmap_list, 1275 ksm_scan.rmap_list = &slot->rmap_list;
1221 struct rmap_item, link);
1222 } 1276 }
1223 1277
1224 mm = slot->mm; 1278 mm = slot->mm;
@@ -1240,21 +1294,21 @@ next_mm:
1240 if (ksm_test_exit(mm)) 1294 if (ksm_test_exit(mm))
1241 break; 1295 break;
1242 *page = follow_page(vma, ksm_scan.address, FOLL_GET); 1296 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
1243 if (*page && PageAnon(*page)) { 1297 if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) {
1244 flush_anon_page(vma, *page, ksm_scan.address); 1298 flush_anon_page(vma, *page, ksm_scan.address);
1245 flush_dcache_page(*page); 1299 flush_dcache_page(*page);
1246 rmap_item = get_next_rmap_item(slot, 1300 rmap_item = get_next_rmap_item(slot,
1247 ksm_scan.rmap_item->link.next, 1301 ksm_scan.rmap_list, ksm_scan.address);
1248 ksm_scan.address);
1249 if (rmap_item) { 1302 if (rmap_item) {
1250 ksm_scan.rmap_item = rmap_item; 1303 ksm_scan.rmap_list =
1304 &rmap_item->rmap_list;
1251 ksm_scan.address += PAGE_SIZE; 1305 ksm_scan.address += PAGE_SIZE;
1252 } else 1306 } else
1253 put_page(*page); 1307 put_page(*page);
1254 up_read(&mm->mmap_sem); 1308 up_read(&mm->mmap_sem);
1255 return rmap_item; 1309 return rmap_item;
1256 } 1310 }
1257 if (*page) 1311 if (!IS_ERR_OR_NULL(*page))
1258 put_page(*page); 1312 put_page(*page);
1259 ksm_scan.address += PAGE_SIZE; 1313 ksm_scan.address += PAGE_SIZE;
1260 cond_resched(); 1314 cond_resched();
@@ -1263,14 +1317,13 @@ next_mm:
1263 1317
1264 if (ksm_test_exit(mm)) { 1318 if (ksm_test_exit(mm)) {
1265 ksm_scan.address = 0; 1319 ksm_scan.address = 0;
1266 ksm_scan.rmap_item = list_entry(&slot->rmap_list, 1320 ksm_scan.rmap_list = &slot->rmap_list;
1267 struct rmap_item, link);
1268 } 1321 }
1269 /* 1322 /*
1270 * Nuke all the rmap_items that are above this current rmap: 1323 * Nuke all the rmap_items that are above this current rmap:
1271 * because there were no VM_MERGEABLE vmas with such addresses. 1324 * because there were no VM_MERGEABLE vmas with such addresses.
1272 */ 1325 */
1273 remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next); 1326 remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
1274 1327
1275 spin_lock(&ksm_mmlist_lock); 1328 spin_lock(&ksm_mmlist_lock);
1276 ksm_scan.mm_slot = list_entry(slot->mm_list.next, 1329 ksm_scan.mm_slot = list_entry(slot->mm_list.next,
@@ -1314,7 +1367,7 @@ next_mm:
1314static void ksm_do_scan(unsigned int scan_npages) 1367static void ksm_do_scan(unsigned int scan_npages)
1315{ 1368{
1316 struct rmap_item *rmap_item; 1369 struct rmap_item *rmap_item;
1317 struct page *page; 1370 struct page *uninitialized_var(page);
1318 1371
1319 while (scan_npages--) { 1372 while (scan_npages--) {
1320 cond_resched(); 1373 cond_resched();
@@ -1323,14 +1376,6 @@ static void ksm_do_scan(unsigned int scan_npages)
1323 return; 1376 return;
1324 if (!PageKsm(page) || !in_stable_tree(rmap_item)) 1377 if (!PageKsm(page) || !in_stable_tree(rmap_item))
1325 cmp_and_merge_page(page, rmap_item); 1378 cmp_and_merge_page(page, rmap_item);
1326 else if (page_mapcount(page) == 1) {
1327 /*
1328 * Replace now-unshared ksm page by ordinary page.
1329 */
1330 break_cow(rmap_item->mm, rmap_item->address);
1331 remove_rmap_item_from_tree(rmap_item);
1332 rmap_item->oldchecksum = calc_checksum(page);
1333 }
1334 put_page(page); 1379 put_page(page);
1335 } 1380 }
1336} 1381}
@@ -1375,7 +1420,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1375 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | 1420 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
1376 VM_PFNMAP | VM_IO | VM_DONTEXPAND | 1421 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1377 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | 1422 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1378 VM_MIXEDMAP | VM_SAO)) 1423 VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
1379 return 0; /* just ignore the advice */ 1424 return 0; /* just ignore the advice */
1380 1425
1381 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { 1426 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
@@ -1452,7 +1497,7 @@ void __ksm_exit(struct mm_struct *mm)
1452 spin_lock(&ksm_mmlist_lock); 1497 spin_lock(&ksm_mmlist_lock);
1453 mm_slot = get_mm_slot(mm); 1498 mm_slot = get_mm_slot(mm);
1454 if (mm_slot && ksm_scan.mm_slot != mm_slot) { 1499 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
1455 if (list_empty(&mm_slot->rmap_list)) { 1500 if (!mm_slot->rmap_list) {
1456 hlist_del(&mm_slot->link); 1501 hlist_del(&mm_slot->link);
1457 list_del(&mm_slot->mm_list); 1502 list_del(&mm_slot->mm_list);
1458 easy_to_free = 1; 1503 easy_to_free = 1;
@@ -1473,6 +1518,255 @@ void __ksm_exit(struct mm_struct *mm)
1473 } 1518 }
1474} 1519}
1475 1520
1521struct page *ksm_does_need_to_copy(struct page *page,
1522 struct vm_area_struct *vma, unsigned long address)
1523{
1524 struct page *new_page;
1525
1526 unlock_page(page); /* any racers will COW it, not modify it */
1527
1528 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1529 if (new_page) {
1530 copy_user_highpage(new_page, page, address, vma);
1531
1532 SetPageDirty(new_page);
1533 __SetPageUptodate(new_page);
1534 SetPageSwapBacked(new_page);
1535 __set_page_locked(new_page);
1536
1537 if (page_evictable(new_page, vma))
1538 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
1539 else
1540 add_page_to_unevictable_list(new_page);
1541 }
1542
1543 page_cache_release(page);
1544 return new_page;
1545}
1546
1547int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
1548 unsigned long *vm_flags)
1549{
1550 struct stable_node *stable_node;
1551 struct rmap_item *rmap_item;
1552 struct hlist_node *hlist;
1553 unsigned int mapcount = page_mapcount(page);
1554 int referenced = 0;
1555 int search_new_forks = 0;
1556
1557 VM_BUG_ON(!PageKsm(page));
1558 VM_BUG_ON(!PageLocked(page));
1559
1560 stable_node = page_stable_node(page);
1561 if (!stable_node)
1562 return 0;
1563again:
1564 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1565 struct anon_vma *anon_vma = rmap_item->anon_vma;
1566 struct anon_vma_chain *vmac;
1567 struct vm_area_struct *vma;
1568
1569 spin_lock(&anon_vma->lock);
1570 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1571 vma = vmac->vma;
1572 if (rmap_item->address < vma->vm_start ||
1573 rmap_item->address >= vma->vm_end)
1574 continue;
1575 /*
1576 * Initially we examine only the vma which covers this
1577 * rmap_item; but later, if there is still work to do,
1578 * we examine covering vmas in other mms: in case they
1579 * were forked from the original since ksmd passed.
1580 */
1581 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
1582 continue;
1583
1584 if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
1585 continue;
1586
1587 referenced += page_referenced_one(page, vma,
1588 rmap_item->address, &mapcount, vm_flags);
1589 if (!search_new_forks || !mapcount)
1590 break;
1591 }
1592 spin_unlock(&anon_vma->lock);
1593 if (!mapcount)
1594 goto out;
1595 }
1596 if (!search_new_forks++)
1597 goto again;
1598out:
1599 return referenced;
1600}
1601
1602int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
1603{
1604 struct stable_node *stable_node;
1605 struct hlist_node *hlist;
1606 struct rmap_item *rmap_item;
1607 int ret = SWAP_AGAIN;
1608 int search_new_forks = 0;
1609
1610 VM_BUG_ON(!PageKsm(page));
1611 VM_BUG_ON(!PageLocked(page));
1612
1613 stable_node = page_stable_node(page);
1614 if (!stable_node)
1615 return SWAP_FAIL;
1616again:
1617 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1618 struct anon_vma *anon_vma = rmap_item->anon_vma;
1619 struct anon_vma_chain *vmac;
1620 struct vm_area_struct *vma;
1621
1622 spin_lock(&anon_vma->lock);
1623 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1624 vma = vmac->vma;
1625 if (rmap_item->address < vma->vm_start ||
1626 rmap_item->address >= vma->vm_end)
1627 continue;
1628 /*
1629 * Initially we examine only the vma which covers this
1630 * rmap_item; but later, if there is still work to do,
1631 * we examine covering vmas in other mms: in case they
1632 * were forked from the original since ksmd passed.
1633 */
1634 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
1635 continue;
1636
1637 ret = try_to_unmap_one(page, vma,
1638 rmap_item->address, flags);
1639 if (ret != SWAP_AGAIN || !page_mapped(page)) {
1640 spin_unlock(&anon_vma->lock);
1641 goto out;
1642 }
1643 }
1644 spin_unlock(&anon_vma->lock);
1645 }
1646 if (!search_new_forks++)
1647 goto again;
1648out:
1649 return ret;
1650}
1651
1652#ifdef CONFIG_MIGRATION
1653int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
1654 struct vm_area_struct *, unsigned long, void *), void *arg)
1655{
1656 struct stable_node *stable_node;
1657 struct hlist_node *hlist;
1658 struct rmap_item *rmap_item;
1659 int ret = SWAP_AGAIN;
1660 int search_new_forks = 0;
1661
1662 VM_BUG_ON(!PageKsm(page));
1663 VM_BUG_ON(!PageLocked(page));
1664
1665 stable_node = page_stable_node(page);
1666 if (!stable_node)
1667 return ret;
1668again:
1669 hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
1670 struct anon_vma *anon_vma = rmap_item->anon_vma;
1671 struct anon_vma_chain *vmac;
1672 struct vm_area_struct *vma;
1673
1674 spin_lock(&anon_vma->lock);
1675 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1676 vma = vmac->vma;
1677 if (rmap_item->address < vma->vm_start ||
1678 rmap_item->address >= vma->vm_end)
1679 continue;
1680 /*
1681 * Initially we examine only the vma which covers this
1682 * rmap_item; but later, if there is still work to do,
1683 * we examine covering vmas in other mms: in case they
1684 * were forked from the original since ksmd passed.
1685 */
1686 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
1687 continue;
1688
1689 ret = rmap_one(page, vma, rmap_item->address, arg);
1690 if (ret != SWAP_AGAIN) {
1691 spin_unlock(&anon_vma->lock);
1692 goto out;
1693 }
1694 }
1695 spin_unlock(&anon_vma->lock);
1696 }
1697 if (!search_new_forks++)
1698 goto again;
1699out:
1700 return ret;
1701}
1702
1703void ksm_migrate_page(struct page *newpage, struct page *oldpage)
1704{
1705 struct stable_node *stable_node;
1706
1707 VM_BUG_ON(!PageLocked(oldpage));
1708 VM_BUG_ON(!PageLocked(newpage));
1709 VM_BUG_ON(newpage->mapping != oldpage->mapping);
1710
1711 stable_node = page_stable_node(newpage);
1712 if (stable_node) {
1713 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
1714 stable_node->kpfn = page_to_pfn(newpage);
1715 }
1716}
1717#endif /* CONFIG_MIGRATION */
1718
1719#ifdef CONFIG_MEMORY_HOTREMOVE
1720static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn,
1721 unsigned long end_pfn)
1722{
1723 struct rb_node *node;
1724
1725 for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) {
1726 struct stable_node *stable_node;
1727
1728 stable_node = rb_entry(node, struct stable_node, node);
1729 if (stable_node->kpfn >= start_pfn &&
1730 stable_node->kpfn < end_pfn)
1731 return stable_node;
1732 }
1733 return NULL;
1734}
1735
1736static int ksm_memory_callback(struct notifier_block *self,
1737 unsigned long action, void *arg)
1738{
1739 struct memory_notify *mn = arg;
1740 struct stable_node *stable_node;
1741
1742 switch (action) {
1743 case MEM_GOING_OFFLINE:
1744 /*
1745 * Keep it very simple for now: just lock out ksmd and
1746 * MADV_UNMERGEABLE while any memory is going offline.
1747 */
1748 mutex_lock(&ksm_thread_mutex);
1749 break;
1750
1751 case MEM_OFFLINE:
1752 /*
1753 * Most of the work is done by page migration; but there might
1754 * be a few stable_nodes left over, still pointing to struct
1755 * pages which have been offlined: prune those from the tree.
1756 */
1757 while ((stable_node = ksm_check_stable_tree(mn->start_pfn,
1758 mn->start_pfn + mn->nr_pages)) != NULL)
1759 remove_node_from_stable_tree(stable_node);
1760 /* fallthrough */
1761
1762 case MEM_CANCEL_OFFLINE:
1763 mutex_unlock(&ksm_thread_mutex);
1764 break;
1765 }
1766 return NOTIFY_OK;
1767}
1768#endif /* CONFIG_MEMORY_HOTREMOVE */
1769
1476#ifdef CONFIG_SYSFS 1770#ifdef CONFIG_SYSFS
1477/* 1771/*
1478 * This all compiles without CONFIG_SYSFS, but is a waste of space. 1772 * This all compiles without CONFIG_SYSFS, but is a waste of space.
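
page_referenced_ksm, try_to_unmap_ksm and rmap_walk_ksm above all share the same two-pass shape: the first pass over a stable node's rmap_items restricts itself to the mm each item was created in, and only if work remains does a second pass visit the covering vmas of every other mm (forks made since ksmd last visited). The test `(rmap_item->mm == vma->vm_mm) == search_new_forks` skips forks on the first pass and originals on the second. A small standalone sketch of that control flow, with made-up names (walk_two_pass, struct mapping) not taken from the patch:

#include <stdbool.h>
#include <stdio.h>

struct mapping {
	int owner;		/* which mm this mapping belongs to */
	bool referenced;	/* what the walk is looking for */
};

/* First pass: only the original owner's mappings. Second pass: everyone
 * else (forks). Stop as soon as nothing is left to account for. */
static int walk_two_pass(struct mapping *maps, int n, int original, int budget)
{
	int found = 0;
	int search_new_forks = 0;

again:
	for (int i = 0; i < n && budget; i++) {
		/* skip forks on pass 0, skip the original on pass 1 */
		if ((maps[i].owner == original) == search_new_forks)
			continue;
		if (maps[i].referenced) {
			found++;
			budget--;	/* one less mapping left to explain */
		}
	}
	if (budget && !search_new_forks++)
		goto again;
	return found;
}

int main(void)
{
	struct mapping maps[] = {
		{ .owner = 1, .referenced = true },	/* original mm */
		{ .owner = 2, .referenced = false },	/* fork */
		{ .owner = 3, .referenced = true },	/* fork */
	};

	/* budget plays the role of page_mapcount(): 3 mappings to account for */
	printf("referenced = %d\n", walk_two_pass(maps, 3, 1, 3));
	return 0;
}
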
@@ -1551,8 +1845,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1551 /* 1845 /*
1552 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. 1846 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
1553 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, 1847 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
1554 * breaking COW to free the unswappable pages_shared (but leaves 1848 * breaking COW to free the pages_shared (but leaves mm_slots
1555 * mm_slots on the list for when ksmd may be set running again). 1849 * on the list for when ksmd may be set running again).
1556 */ 1850 */
1557 1851
1558 mutex_lock(&ksm_thread_mutex); 1852 mutex_lock(&ksm_thread_mutex);
@@ -1577,29 +1871,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1577} 1871}
1578KSM_ATTR(run); 1872KSM_ATTR(run);
1579 1873
1580static ssize_t max_kernel_pages_store(struct kobject *kobj,
1581 struct kobj_attribute *attr,
1582 const char *buf, size_t count)
1583{
1584 int err;
1585 unsigned long nr_pages;
1586
1587 err = strict_strtoul(buf, 10, &nr_pages);
1588 if (err)
1589 return -EINVAL;
1590
1591 ksm_max_kernel_pages = nr_pages;
1592
1593 return count;
1594}
1595
1596static ssize_t max_kernel_pages_show(struct kobject *kobj,
1597 struct kobj_attribute *attr, char *buf)
1598{
1599 return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
1600}
1601KSM_ATTR(max_kernel_pages);
1602
1603static ssize_t pages_shared_show(struct kobject *kobj, 1874static ssize_t pages_shared_show(struct kobject *kobj,
1604 struct kobj_attribute *attr, char *buf) 1875 struct kobj_attribute *attr, char *buf)
1605{ 1876{
@@ -1649,7 +1920,6 @@ static struct attribute *ksm_attrs[] = {
1649 &sleep_millisecs_attr.attr, 1920 &sleep_millisecs_attr.attr,
1650 &pages_to_scan_attr.attr, 1921 &pages_to_scan_attr.attr,
1651 &run_attr.attr, 1922 &run_attr.attr,
1652 &max_kernel_pages_attr.attr,
1653 &pages_shared_attr.attr, 1923 &pages_shared_attr.attr,
1654 &pages_sharing_attr.attr, 1924 &pages_sharing_attr.attr,
1655 &pages_unshared_attr.attr, 1925 &pages_unshared_attr.attr,
@@ -1669,8 +1939,6 @@ static int __init ksm_init(void)
1669 struct task_struct *ksm_thread; 1939 struct task_struct *ksm_thread;
1670 int err; 1940 int err;
1671 1941
1672 ksm_max_kernel_pages = totalram_pages / 4;
1673
1674 err = ksm_slab_init(); 1942 err = ksm_slab_init();
1675 if (err) 1943 if (err)
1676 goto out; 1944 goto out;
@@ -1698,6 +1966,13 @@ static int __init ksm_init(void)
1698 1966
1699#endif /* CONFIG_SYSFS */ 1967#endif /* CONFIG_SYSFS */
1700 1968
1969#ifdef CONFIG_MEMORY_HOTREMOVE
1970 /*
1971 * Choose a high priority since the callback takes ksm_thread_mutex:
1972 * later callbacks could only be taking locks which nest within that.
1973 */
1974 hotplug_memory_notifier(ksm_memory_callback, 100);
1975#endif
1701 return 0; 1976 return 0;
1702 1977
1703out_free2: 1978out_free2:
diff --git a/mm/maccess.c b/mm/maccess.c
index 9073695ff25f..4e348dbaecd7 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -14,7 +14,11 @@
14 * Safely read from address @src to the buffer at @dst. If a kernel fault 14 * Safely read from address @src to the buffer at @dst. If a kernel fault
15 * happens, handle that and return -EFAULT. 15 * happens, handle that and return -EFAULT.
16 */ 16 */
17long probe_kernel_read(void *dst, void *src, size_t size) 17
18long __weak probe_kernel_read(void *dst, void *src, size_t size)
19 __attribute__((alias("__probe_kernel_read")));
20
21long __probe_kernel_read(void *dst, void *src, size_t size)
18{ 22{
19 long ret; 23 long ret;
20 mm_segment_t old_fs = get_fs(); 24 mm_segment_t old_fs = get_fs();
@@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
39 * Safely write to address @dst from the buffer at @src. If a kernel fault 43 * Safely write to address @dst from the buffer at @src. If a kernel fault
40 * happens, handle that and return -EFAULT. 44 * happens, handle that and return -EFAULT.
41 */ 45 */
42long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) 46long __weak probe_kernel_write(void *dst, void *src, size_t size)
47 __attribute__((alias("__probe_kernel_write")));
48
49long __probe_kernel_write(void *dst, void *src, size_t size)
43{ 50{
44 long ret; 51 long ret;
45 mm_segment_t old_fs = get_fs(); 52 mm_segment_t old_fs = get_fs();
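
The maccess.c change turns probe_kernel_read/probe_kernel_write into weak symbols aliased to the generic __probe_kernel_read/__probe_kernel_write bodies, so an architecture can link in a stronger definition and still call the generic one for the common case. The same GCC mechanism can be shown in ordinary userspace C; the names below (generic_read, __generic_read) are purely illustrative:

#include <stdio.h>
#include <string.h>

/* Generic implementation, always available under its double-underscore name. */
long __generic_read(void *dst, const void *src, size_t size)
{
	memcpy(dst, src, size);
	return 0;
}

/* Weak alias: resolves to __generic_read unless some other object file
 * provides a strong definition of generic_read (the "arch" override). */
long generic_read(void *dst, const void *src, size_t size)
	__attribute__((weak, alias("__generic_read")));

int main(void)
{
	char buf[8] = "";

	generic_read(buf, "hello", 6);
	printf("%s\n", buf);
	return 0;
}

Linking in another object that defines a strong generic_read silently replaces the alias, which is exactly what an architecture-specific probe_kernel_write override does here, while __generic_read stays reachable as the fallback.
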
diff --git a/mm/madvise.c b/mm/madvise.c
index 35b1479b7c9d..319528b8db74 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -9,6 +9,7 @@
9#include <linux/pagemap.h> 9#include <linux/pagemap.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/page-isolation.h>
12#include <linux/hugetlb.h> 13#include <linux/hugetlb.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/ksm.h> 15#include <linux/ksm.h>
@@ -222,7 +223,7 @@ static long madvise_remove(struct vm_area_struct *vma,
222/* 223/*
223 * Error injection support for memory error handling. 224 * Error injection support for memory error handling.
224 */ 225 */
225static int madvise_hwpoison(unsigned long start, unsigned long end) 226static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
226{ 227{
227 int ret = 0; 228 int ret = 0;
228 229
@@ -230,15 +231,21 @@ static int madvise_hwpoison(unsigned long start, unsigned long end)
230 return -EPERM; 231 return -EPERM;
231 for (; start < end; start += PAGE_SIZE) { 232 for (; start < end; start += PAGE_SIZE) {
232 struct page *p; 233 struct page *p;
233 int ret = get_user_pages(current, current->mm, start, 1, 234 int ret = get_user_pages_fast(start, 1, 0, &p);
234 0, 0, &p, NULL);
235 if (ret != 1) 235 if (ret != 1)
236 return ret; 236 return ret;
237 if (bhv == MADV_SOFT_OFFLINE) {
238 printk(KERN_INFO "Soft offlining page %lx at %lx\n",
239 page_to_pfn(p), start);
240 ret = soft_offline_page(p, MF_COUNT_INCREASED);
241 if (ret)
242 break;
243 continue;
244 }
237 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", 245 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
238 page_to_pfn(p), start); 246 page_to_pfn(p), start);
239 /* Ignore return value for now */ 247 /* Ignore return value for now */
240 __memory_failure(page_to_pfn(p), 0, 1); 248 __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
241 put_page(p);
242 } 249 }
243 return ret; 250 return ret;
244} 251}
@@ -335,8 +342,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
335 size_t len; 342 size_t len;
336 343
337#ifdef CONFIG_MEMORY_FAILURE 344#ifdef CONFIG_MEMORY_FAILURE
338 if (behavior == MADV_HWPOISON) 345 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
339 return madvise_hwpoison(start, start+len_in); 346 return madvise_hwpoison(behavior, start, start+len_in);
340#endif 347#endif
341 if (!madvise_behavior_valid(behavior)) 348 if (!madvise_behavior_valid(behavior))
342 return error; 349 return error;
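
With this change MADV_SOFT_OFFLINE joins MADV_HWPOISON as a privileged error-injection hook: the page is looked up with get_user_pages_fast() and handed to soft_offline_page(), which migrates the contents away instead of killing users of the page. A hedged userspace sketch of driving it (the fallback constant mirrors the kernel's asm-generic/mman-common.h value; CAP_SYS_ADMIN and CONFIG_MEMORY_FAILURE are assumed):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* assumed uapi value; older libcs may lack it */
#endif

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	void *p;

	p = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0x5a, pagesize);	/* make sure a page actually backs it */

	/* Ask the kernel to soft-offline the backing page: contents are
	 * migrated to a new page, the old one is taken out of service. */
	if (madvise(p, pagesize, MADV_SOFT_OFFLINE))
		perror("madvise(MADV_SOFT_OFFLINE)");	/* needs CAP_SYS_ADMIN */
	else
		printf("still readable after soft offline: %#x\n",
		       ((unsigned char *)p)[0]);

	munmap(p, pagesize);
	return 0;
}

Unlike MADV_HWPOISON, which injects a real memory failure and may kill the caller on the next access, the soft-offline path leaves the mapping usable, which is why the sketch can read the byte back afterwards.
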
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f99f5991d6bb..8a79a6f0f029 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6,6 +6,10 @@
6 * Copyright 2007 OpenVZ SWsoft Inc 6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org> 7 * Author: Pavel Emelianov <xemul@openvz.org>
8 * 8 *
9 * Memory thresholds
10 * Copyright (C) 2009 Nokia Corporation
11 * Author: Kirill A. Shutemov
12 *
9 * This program is free software; you can redistribute it and/or modify 13 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by 14 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or 15 * the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +25,7 @@
21#include <linux/memcontrol.h> 25#include <linux/memcontrol.h>
22#include <linux/cgroup.h> 26#include <linux/cgroup.h>
23#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/hugetlb.h>
24#include <linux/pagemap.h> 29#include <linux/pagemap.h>
25#include <linux/smp.h> 30#include <linux/smp.h>
26#include <linux/page-flags.h> 31#include <linux/page-flags.h>
@@ -32,12 +37,16 @@
32#include <linux/rbtree.h> 37#include <linux/rbtree.h>
33#include <linux/slab.h> 38#include <linux/slab.h>
34#include <linux/swap.h> 39#include <linux/swap.h>
40#include <linux/swapops.h>
35#include <linux/spinlock.h> 41#include <linux/spinlock.h>
42#include <linux/eventfd.h>
43#include <linux/sort.h>
36#include <linux/fs.h> 44#include <linux/fs.h>
37#include <linux/seq_file.h> 45#include <linux/seq_file.h>
38#include <linux/vmalloc.h> 46#include <linux/vmalloc.h>
39#include <linux/mm_inline.h> 47#include <linux/mm_inline.h>
40#include <linux/page_cgroup.h> 48#include <linux/page_cgroup.h>
49#include <linux/cpu.h>
41#include "internal.h" 50#include "internal.h"
42 51
43#include <asm/uaccess.h> 52#include <asm/uaccess.h>
@@ -54,8 +63,15 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
54#define do_swap_account (0) 63#define do_swap_account (0)
55#endif 64#endif
56 65
57static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ 66/*
58#define SOFTLIMIT_EVENTS_THRESH (1000) 67 * Per memcg event counter is incremented at every pagein/pageout. This counter
68 * is used to trigger some periodic events. This is straightforward and better
69 * than using jiffies etc. to handle periodic memcg events.
70 *
71 * These values will be used as !((event) & ((1 <<(thresh)) - 1))
72 */
73#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
74#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
59 75
60/* 76/*
61 * Statistics for memory cgroup. 77 * Statistics for memory cgroup.
@@ -66,65 +82,19 @@ enum mem_cgroup_stat_index {
66 */ 82 */
67 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 83 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
68 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 84 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
69 MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ 85 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
70 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 86 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
71 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 87 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
72 MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
73 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 88 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
89 MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
74 90
75 MEM_CGROUP_STAT_NSTATS, 91 MEM_CGROUP_STAT_NSTATS,
76}; 92};
77 93
78struct mem_cgroup_stat_cpu { 94struct mem_cgroup_stat_cpu {
79 s64 count[MEM_CGROUP_STAT_NSTATS]; 95 s64 count[MEM_CGROUP_STAT_NSTATS];
80} ____cacheline_aligned_in_smp;
81
82struct mem_cgroup_stat {
83 struct mem_cgroup_stat_cpu cpustat[0];
84}; 96};
85 97
86static inline void
87__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
88 enum mem_cgroup_stat_index idx)
89{
90 stat->count[idx] = 0;
91}
92
93static inline s64
94__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
95 enum mem_cgroup_stat_index idx)
96{
97 return stat->count[idx];
98}
99
100/*
101 * For accounting under irq disable, no need for increment preempt count.
102 */
103static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
104 enum mem_cgroup_stat_index idx, int val)
105{
106 stat->count[idx] += val;
107}
108
109static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
110 enum mem_cgroup_stat_index idx)
111{
112 int cpu;
113 s64 ret = 0;
114 for_each_possible_cpu(cpu)
115 ret += stat->cpustat[cpu].count[idx];
116 return ret;
117}
118
119static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
120{
121 s64 ret;
122
123 ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
124 ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
125 return ret;
126}
127
128/* 98/*
129 * per-zone information in memory controller. 99 * per-zone information in memory controller.
130 */ 100 */
@@ -174,6 +144,22 @@ struct mem_cgroup_tree {
174 144
175static struct mem_cgroup_tree soft_limit_tree __read_mostly; 145static struct mem_cgroup_tree soft_limit_tree __read_mostly;
176 146
147struct mem_cgroup_threshold {
148 struct eventfd_ctx *eventfd;
149 u64 threshold;
150};
151
152struct mem_cgroup_threshold_ary {
153 /* An array index points to threshold just below usage. */
154 atomic_t current_threshold;
155 /* Size of entries[] */
156 unsigned int size;
157 /* Array of thresholds */
158 struct mem_cgroup_threshold entries[0];
159};
160
161static void mem_cgroup_threshold(struct mem_cgroup *mem);
162
177/* 163/*
178 * The memory controller data structure. The memory controller controls both 164 * The memory controller data structure. The memory controller controls both
179 * page cache and RSS per cgroup. We would eventually like to provide 165 * page cache and RSS per cgroup. We would eventually like to provide
@@ -209,7 +195,7 @@ struct mem_cgroup {
209 int prev_priority; /* for recording reclaim priority */ 195 int prev_priority; /* for recording reclaim priority */
210 196
211 /* 197 /*
212 * While reclaiming in a hiearchy, we cache the last child we 198 * While reclaiming in a hierarchy, we cache the last child we
213 * reclaimed from. 199 * reclaimed from.
214 */ 200 */
215 int last_scanned_child; 201 int last_scanned_child;
@@ -217,7 +203,7 @@ struct mem_cgroup {
217 * Should the accounting and control be hierarchical, per subtree? 203 * Should the accounting and control be hierarchical, per subtree?
218 */ 204 */
219 bool use_hierarchy; 205 bool use_hierarchy;
220 unsigned long last_oom_jiffies; 206 atomic_t oom_lock;
221 atomic_t refcnt; 207 atomic_t refcnt;
222 208
223 unsigned int swappiness; 209 unsigned int swappiness;
@@ -225,10 +211,48 @@ struct mem_cgroup {
225 /* set when res.limit == memsw.limit */ 211 /* set when res.limit == memsw.limit */
226 bool memsw_is_minimum; 212 bool memsw_is_minimum;
227 213
214 /* protect arrays of thresholds */
215 struct mutex thresholds_lock;
216
217 /* thresholds for memory usage. RCU-protected */
218 struct mem_cgroup_threshold_ary *thresholds;
219
220 /* thresholds for mem+swap usage. RCU-protected */
221 struct mem_cgroup_threshold_ary *memsw_thresholds;
222
223 /*
224 * Should we move charges of a task when a task is moved into this
225 * mem_cgroup ? And what type of charges should we move ?
226 */
227 unsigned long move_charge_at_immigrate;
228
228 /* 229 /*
229 * statistics. This must be placed at the end of memcg. 230 * percpu counter.
230 */ 231 */
231 struct mem_cgroup_stat stat; 232 struct mem_cgroup_stat_cpu *stat;
233};
234
235/* Stuffs for move charges at task migration. */
236/*
237 * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
238 * left-shifted bitmap of these types.
239 */
240enum move_type {
241 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
242 NR_MOVE_TYPE,
243};
244
245/* "mc" and its members are protected by cgroup_mutex */
246static struct move_charge_struct {
247 struct mem_cgroup *from;
248 struct mem_cgroup *to;
249 unsigned long precharge;
250 unsigned long moved_charge;
251 unsigned long moved_swap;
252 struct task_struct *moving_task; /* a task moving charges */
253 wait_queue_head_t waitq; /* a waitq for other context */
254} mc = {
255 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
232}; 256};
233 257
234/* 258/*
@@ -275,6 +299,7 @@ enum charge_type {
275static void mem_cgroup_get(struct mem_cgroup *mem); 299static void mem_cgroup_get(struct mem_cgroup *mem);
276static void mem_cgroup_put(struct mem_cgroup *mem); 300static void mem_cgroup_put(struct mem_cgroup *mem);
277static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 301static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
302static void drain_all_stock_async(void);
278 303
279static struct mem_cgroup_per_zone * 304static struct mem_cgroup_per_zone *
280mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 305mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -282,6 +307,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
282 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 307 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
283} 308}
284 309
310struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
311{
312 return &mem->css;
313}
314
285static struct mem_cgroup_per_zone * 315static struct mem_cgroup_per_zone *
286page_cgroup_zoneinfo(struct page_cgroup *pc) 316page_cgroup_zoneinfo(struct page_cgroup *pc)
287{ 317{
@@ -365,23 +395,6 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
365 spin_unlock(&mctz->lock); 395 spin_unlock(&mctz->lock);
366} 396}
367 397
368static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
369{
370 bool ret = false;
371 int cpu;
372 s64 val;
373 struct mem_cgroup_stat_cpu *cpustat;
374
375 cpu = get_cpu();
376 cpustat = &mem->stat.cpustat[cpu];
377 val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
378 if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
379 __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
380 ret = true;
381 }
382 put_cpu();
383 return ret;
384}
385 398
386static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) 399static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
387{ 400{
@@ -475,17 +488,31 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
475 return mz; 488 return mz;
476} 489}
477 490
491static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
492 enum mem_cgroup_stat_index idx)
493{
494 int cpu;
495 s64 val = 0;
496
497 for_each_possible_cpu(cpu)
498 val += per_cpu(mem->stat->count[idx], cpu);
499 return val;
500}
501
502static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
503{
504 s64 ret;
505
506 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
507 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
508 return ret;
509}
510
478static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 511static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
479 bool charge) 512 bool charge)
480{ 513{
481 int val = (charge) ? 1 : -1; 514 int val = (charge) ? 1 : -1;
482 struct mem_cgroup_stat *stat = &mem->stat; 515 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
483 struct mem_cgroup_stat_cpu *cpustat;
484 int cpu = get_cpu();
485
486 cpustat = &stat->cpustat[cpu];
487 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
488 put_cpu();
489} 516}
490 517
491static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 518static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
@@ -493,24 +520,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
493 bool charge) 520 bool charge)
494{ 521{
495 int val = (charge) ? 1 : -1; 522 int val = (charge) ? 1 : -1;
496 struct mem_cgroup_stat *stat = &mem->stat;
497 struct mem_cgroup_stat_cpu *cpustat;
498 int cpu = get_cpu();
499 523
500 cpustat = &stat->cpustat[cpu]; 524 preempt_disable();
525
501 if (PageCgroupCache(pc)) 526 if (PageCgroupCache(pc))
502 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); 527 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
503 else 528 else
504 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); 529 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
505 530
506 if (charge) 531 if (charge)
507 __mem_cgroup_stat_add_safe(cpustat, 532 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
508 MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
509 else 533 else
510 __mem_cgroup_stat_add_safe(cpustat, 534 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
511 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 535 __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
512 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); 536
513 put_cpu(); 537 preempt_enable();
514} 538}
515 539
516static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 540static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
@@ -528,6 +552,29 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
528 return total; 552 return total;
529} 553}
530 554
555static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
556{
557 s64 val;
558
559 val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
560
561 return !(val & ((1 << event_mask_shift) - 1));
562}
563
564/*
565 * Check events in order.
566 *
567 */
568static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
569{
570 /* threshold event is triggered in finer grain than soft limit */
571 if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
572 mem_cgroup_threshold(mem);
573 if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
574 mem_cgroup_update_tree(mem, page);
575 }
576}
577
531static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 578static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
532{ 579{
533 return container_of(cgroup_subsys_state(cont, 580 return container_of(cgroup_subsys_state(cont,
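
The replacement for the old per-cpu EVENTS bookkeeping keeps a single MEM_CGROUP_EVENTS counter and derives both periodic checks from it: __memcg_event_check() fires when the low event_mask_shift bits of the counter are all zero, i.e. once every 2^shift events, so with THRESHOLDS_EVENTS_THRESH = 7 the thresholds are checked every 128 events and, nested inside that, the softlimit tree every 1024. A tiny standalone check of the mask arithmetic:

#include <stdio.h>

/* Fires once every 2^shift increments of the event counter. */
static int event_check(unsigned long events, int shift)
{
	return !(events & ((1UL << shift) - 1));
}

int main(void)
{
	unsigned long events = 0;
	int thresholds = 0, softlimit = 0;

	for (int i = 0; i < 4096; i++) {
		events++;			/* one pagein or pageout */
		if (event_check(events, 7)) {	/* THRESHOLDS_EVENTS_THRESH */
			thresholds++;
			if (event_check(events, 10))	/* SOFTLIMIT_EVENTS_THRESH */
				softlimit++;
		}
	}
	/* 4096 events: 32 threshold checks, 4 softlimit checks */
	printf("thresholds=%d softlimit=%d\n", thresholds, softlimit);
	return 0;
}

Nesting the coarser check inside the finer one works because 1024 is a multiple of 128, so a softlimit-eligible count always passes the threshold test first.
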
@@ -758,7 +805,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
758 task_unlock(task); 805 task_unlock(task);
759 if (!curr) 806 if (!curr)
760 return 0; 807 return 0;
761 if (curr->use_hierarchy) 808 /*
809 * We should check use_hierarchy of "mem" not "curr". Because checking
810 * use_hierarchy of "curr" here make this function true if hierarchy is
811 * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
812 * hierarchy(even if use_hierarchy is disabled in "mem").
813 */
814 if (mem->use_hierarchy)
762 ret = css_is_ancestor(&curr->css, &mem->css); 815 ret = css_is_ancestor(&curr->css, &mem->css);
763 else 816 else
764 ret = (curr == mem); 817 ret = (curr == mem);
@@ -988,7 +1041,7 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
988} 1041}
989 1042
990/** 1043/**
991 * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. 1044 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
992 * @memcg: The memory cgroup that went over limit 1045 * @memcg: The memory cgroup that went over limit
993 * @p: Task that is going to be killed 1046 * @p: Task that is going to be killed
994 * 1047 *
@@ -1007,7 +1060,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1007 static char memcg_name[PATH_MAX]; 1060 static char memcg_name[PATH_MAX];
1008 int ret; 1061 int ret;
1009 1062
1010 if (!memcg) 1063 if (!memcg || !p)
1011 return; 1064 return;
1012 1065
1013 1066
@@ -1137,6 +1190,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1137 victim = mem_cgroup_select_victim(root_mem); 1190 victim = mem_cgroup_select_victim(root_mem);
1138 if (victim == root_mem) { 1191 if (victim == root_mem) {
1139 loop++; 1192 loop++;
1193 if (loop >= 1)
1194 drain_all_stock_async();
1140 if (loop >= 2) { 1195 if (loop >= 2) {
1141 /* 1196 /*
1142 * If we have not been able to reclaim 1197 * If we have not been able to reclaim
@@ -1160,7 +1215,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1160 } 1215 }
1161 } 1216 }
1162 } 1217 }
1163 if (!mem_cgroup_local_usage(&victim->stat)) { 1218 if (!mem_cgroup_local_usage(victim)) {
1164 /* this cgroup's local usage == 0 */ 1219 /* this cgroup's local usage == 0 */
1165 css_put(&victim->css); 1220 css_put(&victim->css);
1166 continue; 1221 continue;
@@ -1191,90 +1246,284 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1191 return total; 1246 return total;
1192} 1247}
1193 1248
1194bool mem_cgroup_oom_called(struct task_struct *task) 1249static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
1195{ 1250{
1196 bool ret = false; 1251 int *val = (int *)data;
1197 struct mem_cgroup *mem; 1252 int x;
1198 struct mm_struct *mm; 1253 /*
1254 * Logically, we can stop scanning immediately when we find
1255 * a memcg is already locked. But considering unlock ops and
1256 * creation/removal of memcg, scan-all is a simpler operation.
1257 */
1258 x = atomic_inc_return(&mem->oom_lock);
1259 *val = max(x, *val);
1260 return 0;
1261}
1262/*
1263 * Check whether the OOM-Killer is already running under our hierarchy.
1264 * If someone is running, return false.
1265 */
1266static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1267{
1268 int lock_count = 0;
1199 1269
1200 rcu_read_lock(); 1270 mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
1201 mm = task->mm; 1271
1202 if (!mm) 1272 if (lock_count == 1)
1203 mm = &init_mm; 1273 return true;
1204 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1274 return false;
1205 if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
1206 ret = true;
1207 rcu_read_unlock();
1208 return ret;
1209} 1275}
1210 1276
1211static int record_last_oom_cb(struct mem_cgroup *mem, void *data) 1277static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
1212{ 1278{
1213 mem->last_oom_jiffies = jiffies; 1279 /*
1280 * When a new child is created while the hierarchy is under oom,
1281 * mem_cgroup_oom_lock() may not be called. We have to use
1282 * atomic_add_unless() here.
1283 */
1284 atomic_add_unless(&mem->oom_lock, -1, 0);
1214 return 0; 1285 return 0;
1215} 1286}
1216 1287
1217static void record_last_oom(struct mem_cgroup *mem) 1288static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1289{
1290 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
1291}
1292
1293static DEFINE_MUTEX(memcg_oom_mutex);
1294static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1295
1296/*
1297 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1298 */
1299bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1218{ 1300{
1219 mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); 1301 DEFINE_WAIT(wait);
1302 bool locked;
1303
1304 /* At first, try to OOM lock hierarchy under mem.*/
1305 mutex_lock(&memcg_oom_mutex);
1306 locked = mem_cgroup_oom_lock(mem);
1307 /*
1308 * Even if signal_pending(), we can't quit charge() loop without
1309 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1310 * under OOM is always welcomed, use TASK_KILLABLE here.
1311 */
1312 if (!locked)
1313 prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE);
1314 mutex_unlock(&memcg_oom_mutex);
1315
1316 if (locked)
1317 mem_cgroup_out_of_memory(mem, mask);
1318 else {
1319 schedule();
1320 finish_wait(&memcg_oom_waitq, &wait);
1321 }
1322 mutex_lock(&memcg_oom_mutex);
1323 mem_cgroup_oom_unlock(mem);
1324 /*
1325 * Here, we use a global waitq; would a more fine-grained waitq be better?
1326 * Assume following hierarchy.
1327 * A/
1328 * 01
1329 * 02
1330 * assume OOM happens both in A and 01 at the same time. They are
1331 * mutually exclusive by lock. (kill in 01 helps A.)
1332 * When we use per memcg waitq, we have to wake up waiters on A and 02
1333 * in addition to waiters on 01. We use a global waitq to avoid the mess.
1334 * It will not be a big problem.
1335 * (And a task may be moved to other groups while it's waiting for OOM.)
1336 */
1337 wake_up_all(&memcg_oom_waitq);
1338 mutex_unlock(&memcg_oom_mutex);
1339
1340 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1341 return false;
1342 /* Give chance to dying process */
1343 schedule_timeout(1);
1344 return true;
1220} 1345}
1221 1346
1222/* 1347/*
1223 * Currently used to update mapped file statistics, but the routine can be 1348 * Currently used to update mapped file statistics, but the routine can be
1224 * generalized to update other statistics as well. 1349 * generalized to update other statistics as well.
1225 */ 1350 */
1226void mem_cgroup_update_mapped_file_stat(struct page *page, int val) 1351void mem_cgroup_update_file_mapped(struct page *page, int val)
1227{ 1352{
1228 struct mem_cgroup *mem; 1353 struct mem_cgroup *mem;
1229 struct mem_cgroup_stat *stat;
1230 struct mem_cgroup_stat_cpu *cpustat;
1231 int cpu;
1232 struct page_cgroup *pc; 1354 struct page_cgroup *pc;
1233 1355
1234 if (!page_is_file_cache(page))
1235 return;
1236
1237 pc = lookup_page_cgroup(page); 1356 pc = lookup_page_cgroup(page);
1238 if (unlikely(!pc)) 1357 if (unlikely(!pc))
1239 return; 1358 return;
1240 1359
1241 lock_page_cgroup(pc); 1360 lock_page_cgroup(pc);
1242 mem = pc->mem_cgroup; 1361 mem = pc->mem_cgroup;
1243 if (!mem) 1362 if (!mem || !PageCgroupUsed(pc))
1244 goto done;
1245
1246 if (!PageCgroupUsed(pc))
1247 goto done; 1363 goto done;
1248 1364
1249 /* 1365 /*
1250 * Preemption is already disabled, we don't need get_cpu() 1366 * Preemption is already disabled. We can use __this_cpu_xxx
1251 */ 1367 */
1252 cpu = smp_processor_id(); 1368 if (val > 0) {
1253 stat = &mem->stat; 1369 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1254 cpustat = &stat->cpustat[cpu]; 1370 SetPageCgroupFileMapped(pc);
1371 } else {
1372 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1373 ClearPageCgroupFileMapped(pc);
1374 }
1255 1375
1256 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val);
1257done: 1376done:
1258 unlock_page_cgroup(pc); 1377 unlock_page_cgroup(pc);
1259} 1378}
1260 1379
1261/* 1380/*
1381 * size of first charge trial. "32" comes from vmscan.c's magic value.
1382 * TODO: maybe necessary to use big numbers in big irons.
1383 */
1384#define CHARGE_SIZE (32 * PAGE_SIZE)
1385struct memcg_stock_pcp {
1386 struct mem_cgroup *cached; /* this never be root cgroup */
1387 int charge;
1388 struct work_struct work;
1389};
1390static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1391static atomic_t memcg_drain_count;
1392
1393/*
1394 * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed
1395 * from local stock and true is returned. If the stock is 0 or charges from a
1396 * cgroup which is not current target, returns false. This stock will be
1397 * refilled.
1398 */
1399static bool consume_stock(struct mem_cgroup *mem)
1400{
1401 struct memcg_stock_pcp *stock;
1402 bool ret = true;
1403
1404 stock = &get_cpu_var(memcg_stock);
1405 if (mem == stock->cached && stock->charge)
1406 stock->charge -= PAGE_SIZE;
1407 else /* need to call res_counter_charge */
1408 ret = false;
1409 put_cpu_var(memcg_stock);
1410 return ret;
1411}
1412
1413/*
1414 * Returns stocks cached in percpu to res_counter and reset cached information.
1415 */
1416static void drain_stock(struct memcg_stock_pcp *stock)
1417{
1418 struct mem_cgroup *old = stock->cached;
1419
1420 if (stock->charge) {
1421 res_counter_uncharge(&old->res, stock->charge);
1422 if (do_swap_account)
1423 res_counter_uncharge(&old->memsw, stock->charge);
1424 }
1425 stock->cached = NULL;
1426 stock->charge = 0;
1427}
1428
1429/*
1430 * This must be called under preempt disabled or must be called by
1431 * a thread which is pinned to local cpu.
1432 */
1433static void drain_local_stock(struct work_struct *dummy)
1434{
1435 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1436 drain_stock(stock);
1437}
1438
1439/*
1440 * Cache charges (val) taken from res_counter in the local per_cpu area.
1441 * They will be consumed by the consume_stock() function later.
1442 */
1443static void refill_stock(struct mem_cgroup *mem, int val)
1444{
1445 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1446
1447 if (stock->cached != mem) { /* reset if necessary */
1448 drain_stock(stock);
1449 stock->cached = mem;
1450 }
1451 stock->charge += val;
1452 put_cpu_var(memcg_stock);
1453}
1454
1455/*
1456 * Tries to drain stocked charges on other cpus. This function is asynchronous
1457 * and just queues a work item per cpu to drain locally on each cpu. The caller
1458 * can expect some charges to come back to res_counter later but cannot wait for
1459 * it.
1460 */
1461static void drain_all_stock_async(void)
1462{
1463 int cpu;
1464 /* This function is for scheduling "drain" in an asynchronous way.
1465 * The result of "drain" is not directly handled by callers. So,
1466 * if someone is already calling drain, we don't have to call drain again.
1467 * Anyway, the WORK_STRUCT_PENDING check in queue_work_on() will catch it if
1468 * there is a race. We just do a loose check here.
1469 */
1470 if (atomic_read(&memcg_drain_count))
1471 return;
1472 /* Notify other cpus that system-wide "drain" is running */
1473 atomic_inc(&memcg_drain_count);
1474 get_online_cpus();
1475 for_each_online_cpu(cpu) {
1476 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1477 schedule_work_on(cpu, &stock->work);
1478 }
1479 put_online_cpus();
1480 atomic_dec(&memcg_drain_count);
1481 /* We don't wait for flush_work */
1482}
1483
1484/* This is a synchronous drain interface. */
1485static void drain_all_stock_sync(void)
1486{
1487 /* called when force_empty is called */
1488 atomic_inc(&memcg_drain_count);
1489 schedule_on_each_cpu(drain_local_stock);
1490 atomic_dec(&memcg_drain_count);
1491}
1492
1493static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
1494 unsigned long action,
1495 void *hcpu)
1496{
1497 int cpu = (unsigned long)hcpu;
1498 struct memcg_stock_pcp *stock;
1499
1500 if (action != CPU_DEAD)
1501 return NOTIFY_OK;
1502 stock = &per_cpu(memcg_stock, cpu);
1503 drain_stock(stock);
1504 return NOTIFY_OK;
1505}
1506
1507/*
1262 * Unlike exported interface, "oom" parameter is added. if oom==true, 1508 * Unlike exported interface, "oom" parameter is added. if oom==true,
1263 * oom-killer can be invoked. 1509 * oom-killer can be invoked.
1264 */ 1510 */
1265static int __mem_cgroup_try_charge(struct mm_struct *mm, 1511static int __mem_cgroup_try_charge(struct mm_struct *mm,
1266 gfp_t gfp_mask, struct mem_cgroup **memcg, 1512 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
1267 bool oom, struct page *page)
1268{ 1513{
1269 struct mem_cgroup *mem, *mem_over_limit; 1514 struct mem_cgroup *mem, *mem_over_limit;
1270 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1515 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1271 struct res_counter *fail_res; 1516 struct res_counter *fail_res;
1517 int csize = CHARGE_SIZE;
1272 1518
1273 if (unlikely(test_thread_flag(TIF_MEMDIE))) { 1519 /*
1274 /* Don't account this! */ 1520 * Unlike the global VM's OOM-kill, we're not in a memory shortage
1275 *memcg = NULL; 1521 * at the system level. So, allow a dying process to go ahead, in addition to a
1276 return 0; 1522 * MEMDIE process.
1277 } 1523 */
1524 if (unlikely(test_thread_flag(TIF_MEMDIE)
1525 || fatal_signal_pending(current)))
1526 goto bypass;
1278 1527
1279 /* 1528 /*
1280 * We always charge the cgroup the mm_struct belongs to. 1529 * We always charge the cgroup the mm_struct belongs to.
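
consume_stock()/refill_stock() above let try_charge pull CHARGE_SIZE (32 pages) from the res_counter at once and park the surplus in a per-cpu stock, so the common path charges a page without touching the shared counter at all; drain_stock() hands whatever is left back. The batching idea can be sketched in userspace with a thread-local cache in front of an atomic counter (the names and sizes below are illustrative, not the kernel's):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define BATCH 32	/* grab this many units from the shared counter at once */

static atomic_long charged;		/* the shared "res_counter" */
static _Thread_local long stock;	/* per-thread cache of prepaid units */

/* Fast path: pay for one unit out of the local stock if possible. */
static bool consume_stock(void)
{
	if (stock > 0) {
		stock--;
		return true;
	}
	return false;
}

/* Slow path: charge a whole batch against the shared counter and
 * keep the surplus locally for the next requests on this thread. */
static void charge_one(void)
{
	if (consume_stock())
		return;
	atomic_fetch_add(&charged, BATCH);
	stock = BATCH - 1;	/* one unit pays for this request */
}

/* Return anything still stocked to the shared counter (cf. drain_stock). */
static void drain_stock(void)
{
	atomic_fetch_sub(&charged, stock);
	stock = 0;
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		charge_one();		/* only 4 touches of the shared counter */
	drain_stock();
	printf("charged=%ld (100 expected)\n", atomic_load(&charged));
	return 0;
}

As in the patch, the cost of the batching is that stocked charges sit idle until a drain runs, which is why reclaim kicks drain_all_stock_async() when it starts to struggle.
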
@@ -1293,23 +1542,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1293 return 0; 1542 return 0;
1294 1543
1295 VM_BUG_ON(css_is_removed(&mem->css)); 1544 VM_BUG_ON(css_is_removed(&mem->css));
1545 if (mem_cgroup_is_root(mem))
1546 goto done;
1296 1547
1297 while (1) { 1548 while (1) {
1298 int ret = 0; 1549 int ret = 0;
1299 unsigned long flags = 0; 1550 unsigned long flags = 0;
1300 1551
1301 if (mem_cgroup_is_root(mem)) 1552 if (consume_stock(mem))
1302 goto done; 1553 goto done;
1303 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); 1554
1555 ret = res_counter_charge(&mem->res, csize, &fail_res);
1304 if (likely(!ret)) { 1556 if (likely(!ret)) {
1305 if (!do_swap_account) 1557 if (!do_swap_account)
1306 break; 1558 break;
1307 ret = res_counter_charge(&mem->memsw, PAGE_SIZE, 1559 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1308 &fail_res);
1309 if (likely(!ret)) 1560 if (likely(!ret))
1310 break; 1561 break;
1311 /* mem+swap counter fails */ 1562 /* mem+swap counter fails */
1312 res_counter_uncharge(&mem->res, PAGE_SIZE); 1563 res_counter_uncharge(&mem->res, csize);
1313 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 1564 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1314 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1565 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1315 memsw); 1566 memsw);
@@ -1318,6 +1569,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1318 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1569 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1319 res); 1570 res);
1320 1571
1572 /* reduce request size and retry */
1573 if (csize > PAGE_SIZE) {
1574 csize = PAGE_SIZE;
1575 continue;
1576 }
1321 if (!(gfp_mask & __GFP_WAIT)) 1577 if (!(gfp_mask & __GFP_WAIT))
1322 goto nomem; 1578 goto nomem;
1323 1579
@@ -1337,27 +1593,92 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1337 if (mem_cgroup_check_under_limit(mem_over_limit)) 1593 if (mem_cgroup_check_under_limit(mem_over_limit))
1338 continue; 1594 continue;
1339 1595
1596 /* try to avoid oom while someone is moving charge */
1597 if (mc.moving_task && current != mc.moving_task) {
1598 struct mem_cgroup *from, *to;
1599 bool do_continue = false;
1600 /*
1601 * There is a small race that "from" or "to" can be
1602 * freed by rmdir, so we use css_tryget().
1603 */
1604 from = mc.from;
1605 to = mc.to;
1606 if (from && css_tryget(&from->css)) {
1607 if (mem_over_limit->use_hierarchy)
1608 do_continue = css_is_ancestor(
1609 &from->css,
1610 &mem_over_limit->css);
1611 else
1612 do_continue = (from == mem_over_limit);
1613 css_put(&from->css);
1614 }
1615 if (!do_continue && to && css_tryget(&to->css)) {
1616 if (mem_over_limit->use_hierarchy)
1617 do_continue = css_is_ancestor(
1618 &to->css,
1619 &mem_over_limit->css);
1620 else
1621 do_continue = (to == mem_over_limit);
1622 css_put(&to->css);
1623 }
1624 if (do_continue) {
1625 DEFINE_WAIT(wait);
1626 prepare_to_wait(&mc.waitq, &wait,
1627 TASK_INTERRUPTIBLE);
1628 /* moving charge context might have finished. */
1629 if (mc.moving_task)
1630 schedule();
1631 finish_wait(&mc.waitq, &wait);
1632 continue;
1633 }
1634 }
1635
1340 if (!nr_retries--) { 1636 if (!nr_retries--) {
1341 if (oom) { 1637 if (!oom)
1342 mutex_lock(&memcg_tasklist); 1638 goto nomem;
1343 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); 1639 if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
1344 mutex_unlock(&memcg_tasklist); 1640 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1345 record_last_oom(mem_over_limit); 1641 continue;
1346 } 1642 }
1347 goto nomem; 1643 /* When we reach here, the current task is dying. */
1644 css_put(&mem->css);
1645 goto bypass;
1348 } 1646 }
1349 } 1647 }
1350 /* 1648 if (csize > PAGE_SIZE)
1351 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 1649 refill_stock(mem, csize - PAGE_SIZE);
1352 * if they exceeds softlimit.
1353 */
1354 if (mem_cgroup_soft_limit_check(mem))
1355 mem_cgroup_update_tree(mem, page);
1356done: 1650done:
1357 return 0; 1651 return 0;
1358nomem: 1652nomem:
1359 css_put(&mem->css); 1653 css_put(&mem->css);
1360 return -ENOMEM; 1654 return -ENOMEM;
1655bypass:
1656 *memcg = NULL;
1657 return 0;
1658}
1659
1660/*
1661 * Sometimes we have to undo a charge we got by try_charge().
1662 * This function is for that: it uncharges and puts the css refcnt
1663 * gotten by try_charge().
1664 */
1665static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
1666 unsigned long count)
1667{
1668 if (!mem_cgroup_is_root(mem)) {
1669 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
1670 if (do_swap_account)
1671 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
1672 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
1673 WARN_ON_ONCE(count > INT_MAX);
1674 __css_put(&mem->css, (int)count);
1675 }
1676 /* we don't need css_put for root */
1677}
1678
1679static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
1680{
1681 __mem_cgroup_cancel_charge(mem, 1);
1361} 1682}
1362 1683
1363/* 1684/*
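
mem_cgroup_handle_oom() and the move-charge wait in __mem_cgroup_try_charge above follow the same shape: try to become the one context that does the heavy work (the OOM kill, or finishing a charge move); everyone else parks on a waitqueue and is woken with wake_up_all() when the holder is done, then re-evaluates and retries its charge. A compressed pthread analogue of that handoff, illustrative only:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;
static bool oom_in_progress;
static int handled;

static void handle_oom(int id)
{
	pthread_mutex_lock(&lock);
	if (!oom_in_progress) {
		oom_in_progress = true;		/* we won the oom lock */
		pthread_mutex_unlock(&lock);

		handled++;			/* stand-in for the oom kill */

		pthread_mutex_lock(&lock);
		oom_in_progress = false;
		pthread_cond_broadcast(&waitq);	/* wake_up_all() */
	} else {
		while (oom_in_progress)		/* sleep until the holder is done */
			pthread_cond_wait(&waitq, &lock);
	}
	pthread_mutex_unlock(&lock);
	printf("thread %d resumes and retries its charge\n", id);
}

static void *worker(void *arg)
{
	handle_oom((int)(long)arg);
	return NULL;
}

int main(void)
{
	pthread_t t[4];

	for (long i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, worker, (void *)i);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	printf("oom handled %d time(s)\n", handled);
	return 0;
}

Build with cc -pthread. As in the patch, waiters do not redo the holder's work themselves; they simply return and let the retry loop decide what to do next.
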
@@ -1379,25 +1700,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1379 return container_of(css, struct mem_cgroup, css); 1700 return container_of(css, struct mem_cgroup, css);
1380} 1701}
1381 1702
1382static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) 1703struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
1383{ 1704{
1384 struct mem_cgroup *mem; 1705 struct mem_cgroup *mem = NULL;
1385 struct page_cgroup *pc; 1706 struct page_cgroup *pc;
1386 unsigned short id; 1707 unsigned short id;
1387 swp_entry_t ent; 1708 swp_entry_t ent;
1388 1709
1389 VM_BUG_ON(!PageLocked(page)); 1710 VM_BUG_ON(!PageLocked(page));
1390 1711
1391 if (!PageSwapCache(page))
1392 return NULL;
1393
1394 pc = lookup_page_cgroup(page); 1712 pc = lookup_page_cgroup(page);
1395 lock_page_cgroup(pc); 1713 lock_page_cgroup(pc);
1396 if (PageCgroupUsed(pc)) { 1714 if (PageCgroupUsed(pc)) {
1397 mem = pc->mem_cgroup; 1715 mem = pc->mem_cgroup;
1398 if (mem && !css_tryget(&mem->css)) 1716 if (mem && !css_tryget(&mem->css))
1399 mem = NULL; 1717 mem = NULL;
1400 } else { 1718 } else if (PageSwapCache(page)) {
1401 ent.val = page_private(page); 1719 ent.val = page_private(page);
1402 id = lookup_swap_cgroup(ent); 1720 id = lookup_swap_cgroup(ent);
1403 rcu_read_lock(); 1721 rcu_read_lock();
@@ -1426,12 +1744,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1426 lock_page_cgroup(pc); 1744 lock_page_cgroup(pc);
1427 if (unlikely(PageCgroupUsed(pc))) { 1745 if (unlikely(PageCgroupUsed(pc))) {
1428 unlock_page_cgroup(pc); 1746 unlock_page_cgroup(pc);
1429 if (!mem_cgroup_is_root(mem)) { 1747 mem_cgroup_cancel_charge(mem);
1430 res_counter_uncharge(&mem->res, PAGE_SIZE);
1431 if (do_swap_account)
1432 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1433 }
1434 css_put(&mem->css);
1435 return; 1748 return;
1436 } 1749 }
1437 1750
@@ -1461,88 +1774,83 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1461 mem_cgroup_charge_statistics(mem, pc, true); 1774 mem_cgroup_charge_statistics(mem, pc, true);
1462 1775
1463 unlock_page_cgroup(pc); 1776 unlock_page_cgroup(pc);
1777 /*
1778 * "charge_statistics" updated event counter. Then, check it.
1779 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
1780 * if they exceeds softlimit.
1781 */
1782 memcg_check_events(mem, pc->page);
1464} 1783}
1465 1784
1466/** 1785/**
1467 * mem_cgroup_move_account - move account of the page 1786 * __mem_cgroup_move_account - move account of the page
1468 * @pc: page_cgroup of the page. 1787 * @pc: page_cgroup of the page.
1469 * @from: mem_cgroup which the page is moved from. 1788 * @from: mem_cgroup which the page is moved from.
1470 * @to: mem_cgroup which the page is moved to. @from != @to. 1789 * @to: mem_cgroup which the page is moved to. @from != @to.
1790 * @uncharge: whether we should call uncharge and css_put against @from.
1471 * 1791 *
1472 * The caller must confirm following. 1792 * The caller must confirm following.
1473 * - page is not on LRU (isolate_page() is useful.) 1793 * - page is not on LRU (isolate_page() is useful.)
1794 * - the pc is locked, used, and ->mem_cgroup points to @from.
1474 * 1795 *
1475 * returns 0 at success, 1796 * This function doesn't do "charge" nor css_get to new cgroup. It should be
1476 * returns -EBUSY when lock is busy or "pc" is unstable. 1797 * done by a caller (__mem_cgroup_try_charge would be useful). If @uncharge is
1477 * 1798 * true, this function does "uncharge" from old cgroup, but it doesn't if
1478 * This function does "uncharge" from old cgroup but doesn't do "charge" to 1799 * @uncharge is false, so a caller should do "uncharge".
1479 * new cgroup. It should be done by a caller.
1480 */ 1800 */
1481 1801
1482static int mem_cgroup_move_account(struct page_cgroup *pc, 1802static void __mem_cgroup_move_account(struct page_cgroup *pc,
1483 struct mem_cgroup *from, struct mem_cgroup *to) 1803 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1484{ 1804{
1485 struct mem_cgroup_per_zone *from_mz, *to_mz;
1486 int nid, zid;
1487 int ret = -EBUSY;
1488 struct page *page;
1489 int cpu;
1490 struct mem_cgroup_stat *stat;
1491 struct mem_cgroup_stat_cpu *cpustat;
1492
1493 VM_BUG_ON(from == to); 1805 VM_BUG_ON(from == to);
1494 VM_BUG_ON(PageLRU(pc->page)); 1806 VM_BUG_ON(PageLRU(pc->page));
1495 1807 VM_BUG_ON(!PageCgroupLocked(pc));
1496 nid = page_cgroup_nid(pc); 1808 VM_BUG_ON(!PageCgroupUsed(pc));
1497 zid = page_cgroup_zid(pc); 1809 VM_BUG_ON(pc->mem_cgroup != from);
1498 from_mz = mem_cgroup_zoneinfo(from, nid, zid); 1810
1499 to_mz = mem_cgroup_zoneinfo(to, nid, zid); 1811 if (PageCgroupFileMapped(pc)) {
1500 1812 /* Update mapped_file data for mem_cgroup */
1501 if (!trylock_page_cgroup(pc)) 1813 preempt_disable();
1502 return ret; 1814 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1503 1815 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1504 if (!PageCgroupUsed(pc)) 1816 preempt_enable();
1505 goto out;
1506
1507 if (pc->mem_cgroup != from)
1508 goto out;
1509
1510 if (!mem_cgroup_is_root(from))
1511 res_counter_uncharge(&from->res, PAGE_SIZE);
1512 mem_cgroup_charge_statistics(from, pc, false);
1513
1514 page = pc->page;
1515 if (page_is_file_cache(page) && page_mapped(page)) {
1516 cpu = smp_processor_id();
1517 /* Update mapped_file data for mem_cgroup "from" */
1518 stat = &from->stat;
1519 cpustat = &stat->cpustat[cpu];
1520 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
1521 -1);
1522
1523 /* Update mapped_file data for mem_cgroup "to" */
1524 stat = &to->stat;
1525 cpustat = &stat->cpustat[cpu];
1526 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
1527 1);
1528 } 1817 }
1818 mem_cgroup_charge_statistics(from, pc, false);
1819 if (uncharge)
1820 /* This is not "cancel", but cancel_charge does all we need. */
1821 mem_cgroup_cancel_charge(from);
1529 1822
1530 if (do_swap_account && !mem_cgroup_is_root(from)) 1823 /* caller should have done css_get */
1531 res_counter_uncharge(&from->memsw, PAGE_SIZE);
1532 css_put(&from->css);
1533
1534 css_get(&to->css);
1535 pc->mem_cgroup = to; 1824 pc->mem_cgroup = to;
1536 mem_cgroup_charge_statistics(to, pc, true); 1825 mem_cgroup_charge_statistics(to, pc, true);
1537 ret = 0;
1538out:
1539 unlock_page_cgroup(pc);
1540 /* 1826 /*
1541 * We charge against "to" which may not have any tasks. Then, "to" 1827 * We charge against "to" which may not have any tasks. Then, "to"
1542 * can be under rmdir(). But in current implementation, caller of 1828 * can be under rmdir(). But in current implementation, caller of
1543 * this function is just force_empty() and it's guaranteed that 1829 * this function is just force_empty() and move charge, so it's
1544 * "to" is never removed. So, we don't check rmdir status here. 1830 * guaranteed that "to" is never removed. So, we don't check rmdir
1831 * status here.
1832 */
1833}
1834
1835/*
1836 * check whether the @pc is valid for moving account and call
1837 * __mem_cgroup_move_account()
1838 */
1839static int mem_cgroup_move_account(struct page_cgroup *pc,
1840 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1841{
1842 int ret = -EINVAL;
1843 lock_page_cgroup(pc);
1844 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
1845 __mem_cgroup_move_account(pc, from, to, uncharge);
1846 ret = 0;
1847 }
1848 unlock_page_cgroup(pc);
1849 /*
1850 * check events
1545 */ 1851 */
1852 memcg_check_events(to, pc->page);
1853 memcg_check_events(from, pc->page);
1546 return ret; 1854 return ret;
1547} 1855}
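
The split above puts every validity check (the page_cgroup is locked, used, and still owned by @from) into mem_cgroup_move_account(), which takes lock_page_cgroup() and only then calls the unchecked __mem_cgroup_move_account(). Below is a rough userspace model of that check-under-lock-then-act split, not kernel code: struct record, move_record() and __move_record() are invented names, and a pthread mutex stands in for the page_cgroup bit lock.

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct record {
    pthread_mutex_t lock;
    bool used;          /* PageCgroupUsed() in the kernel */
    void *owner;        /* pc->mem_cgroup in the kernel */
};

/* Unchecked helper: caller must hold rec->lock and have validated rec. */
static void __move_record(struct record *rec, void *to)
{
    rec->owner = to;
}

/* Checked wrapper: mirrors the 0 / -EINVAL contract of mem_cgroup_move_account(). */
static int move_record(struct record *rec, void *from, void *to)
{
    int ret = -EINVAL;

    pthread_mutex_lock(&rec->lock);
    if (rec->used && rec->owner == from) {
        __move_record(rec, to);
        ret = 0;
    }
    pthread_mutex_unlock(&rec->lock);
    return ret;
}

int main(void)
{
    int a, b;
    struct record rec = {
        .lock = PTHREAD_MUTEX_INITIALIZER, .used = true, .owner = &a,
    };

    printf("move a->b: %d\n", move_record(&rec, &a, &b));       /* 0 */
    printf("move a->b again: %d\n", move_record(&rec, &a, &b)); /* -EINVAL */
    return 0;
}

As in the kernel variant, -EINVAL only means the record changed owner (or was freed) before the lock was taken; the caller decides whether to retry or give up.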
1548 1856
@@ -1564,45 +1872,25 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
1564 if (!pcg) 1872 if (!pcg)
1565 return -EINVAL; 1873 return -EINVAL;
1566 1874
1875 ret = -EBUSY;
1876 if (!get_page_unless_zero(page))
1877 goto out;
1878 if (isolate_lru_page(page))
1879 goto put;
1567 1880
1568 parent = mem_cgroup_from_cont(pcg); 1881 parent = mem_cgroup_from_cont(pcg);
1569 1882 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
1570
1571 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
1572 if (ret || !parent) 1883 if (ret || !parent)
1573 return ret; 1884 goto put_back;
1574
1575 if (!get_page_unless_zero(page)) {
1576 ret = -EBUSY;
1577 goto uncharge;
1578 }
1579
1580 ret = isolate_lru_page(page);
1581 1885
1886 ret = mem_cgroup_move_account(pc, child, parent, true);
1582 if (ret) 1887 if (ret)
1583 goto cancel; 1888 mem_cgroup_cancel_charge(parent);
1584 1889put_back:
1585 ret = mem_cgroup_move_account(pc, child, parent);
1586
1587 putback_lru_page(page); 1890 putback_lru_page(page);
1588 if (!ret) { 1891put:
1589 put_page(page);
1590 /* drop extra refcnt by try_charge() */
1591 css_put(&parent->css);
1592 return 0;
1593 }
1594
1595cancel:
1596 put_page(page); 1892 put_page(page);
1597uncharge: 1893out:
1598 /* drop extra refcnt by try_charge() */
1599 css_put(&parent->css);
1600 /* uncharge if move fails */
1601 if (!mem_cgroup_is_root(parent)) {
1602 res_counter_uncharge(&parent->res, PAGE_SIZE);
1603 if (do_swap_account)
1604 res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1605 }
1606 return ret; 1894 return ret;
1607} 1895}
1608 1896
@@ -1627,7 +1915,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1627 prefetchw(pc); 1915 prefetchw(pc);
1628 1916
1629 mem = memcg; 1917 mem = memcg;
1630 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); 1918 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1631 if (ret || !mem) 1919 if (ret || !mem)
1632 return ret; 1920 return ret;
1633 1921
@@ -1720,7 +2008,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1720/* 2008/*
1721 * While swap-in, try_charge -> commit or cancel, the page is locked. 2009 * While swap-in, try_charge -> commit or cancel, the page is locked.
1722 * And when try_charge() successfully returns, one refcnt to memcg without 2010 * And when try_charge() successfully returns, one refcnt to memcg without
1723 * struct page_cgroup is aquired. This refcnt will be cumsumed by 2011 * struct page_cgroup is acquired. This refcnt will be consumed by
1724 * "commit()" or removed by "cancel()" 2012 * "commit()" or removed by "cancel()"
1725 */ 2013 */
1726int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2014int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
@@ -1737,23 +2025,24 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1737 goto charge_cur_mm; 2025 goto charge_cur_mm;
1738 /* 2026 /*
1739 * A racing thread's fault, or swapoff, may have already updated 2027 * A racing thread's fault, or swapoff, may have already updated
1740 * the pte, and even removed page from swap cache: return success 2028 * the pte, and even removed page from swap cache: in those cases
1741 * to go on to do_swap_page()'s pte_same() test, which should fail. 2029 * do_swap_page()'s pte_same() test will fail; but there's also a
2030 * KSM case which does need to charge the page.
1742 */ 2031 */
1743 if (!PageSwapCache(page)) 2032 if (!PageSwapCache(page))
1744 return 0; 2033 goto charge_cur_mm;
1745 mem = try_get_mem_cgroup_from_swapcache(page); 2034 mem = try_get_mem_cgroup_from_page(page);
1746 if (!mem) 2035 if (!mem)
1747 goto charge_cur_mm; 2036 goto charge_cur_mm;
1748 *ptr = mem; 2037 *ptr = mem;
1749 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); 2038 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
1750 /* drop extra refcnt from tryget */ 2039 /* drop extra refcnt from tryget */
1751 css_put(&mem->css); 2040 css_put(&mem->css);
1752 return ret; 2041 return ret;
1753charge_cur_mm: 2042charge_cur_mm:
1754 if (unlikely(!mm)) 2043 if (unlikely(!mm))
1755 mm = &init_mm; 2044 mm = &init_mm;
1756 return __mem_cgroup_try_charge(mm, mask, ptr, true, page); 2045 return __mem_cgroup_try_charge(mm, mask, ptr, true);
1757} 2046}
1758 2047
1759static void 2048static void
@@ -1818,14 +2107,53 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1818 return; 2107 return;
1819 if (!mem) 2108 if (!mem)
1820 return; 2109 return;
1821 if (!mem_cgroup_is_root(mem)) { 2110 mem_cgroup_cancel_charge(mem);
1822 res_counter_uncharge(&mem->res, PAGE_SIZE);
1823 if (do_swap_account)
1824 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1825 }
1826 css_put(&mem->css);
1827} 2111}
1828 2112
2113static void
2114__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2115{
2116 struct memcg_batch_info *batch = NULL;
2117 bool uncharge_memsw = true;
2118 /* If swapout, usage of swap doesn't decrease */
2119 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2120 uncharge_memsw = false;
2121 /*
2122 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
2123 * In those cases, all pages freed continuously can be expected to be in
2124 * the same cgroup and we have a chance to coalesce uncharges.
2125 * But we uncharge one by one if this task is killed by OOM (TIF_MEMDIE)
2126 * because we want to do the uncharge as soon as possible.
2127 */
2128 if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
2129 goto direct_uncharge;
2130
2131 batch = &current->memcg_batch;
2132 /*
2133 * Usually, we do css_get() when we remember a memcg pointer.
2134 * But in this case, we keep res->usage until the end of a series of
2135 * uncharges. Then, it's ok to ignore the memcg's refcnt.
2136 */
2137 if (!batch->memcg)
2138 batch->memcg = mem;
2139 /*
2140 * In the typical case, batch->memcg == mem. This means we can
2141 * merge a series of uncharges into one uncharge of the res_counter.
2142 * If not, we uncharge the res_counter one by one.
2143 */
2144 if (batch->memcg != mem)
2145 goto direct_uncharge;
2146 /* remember freed charge and uncharge it later */
2147 batch->bytes += PAGE_SIZE;
2148 if (uncharge_memsw)
2149 batch->memsw_bytes += PAGE_SIZE;
2150 return;
2151direct_uncharge:
2152 res_counter_uncharge(&mem->res, PAGE_SIZE);
2153 if (uncharge_memsw)
2154 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
2155 return;
2156}
1829 2157
1830/* 2158/*
1831 * uncharge if !page_mapped(page) 2159 * uncharge if !page_mapped(page)
@@ -1874,12 +2202,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1874 break; 2202 break;
1875 } 2203 }
1876 2204
1877 if (!mem_cgroup_is_root(mem)) { 2205 if (!mem_cgroup_is_root(mem))
1878 res_counter_uncharge(&mem->res, PAGE_SIZE); 2206 __do_uncharge(mem, ctype);
1879 if (do_swap_account &&
1880 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1881 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1882 }
1883 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2207 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1884 mem_cgroup_swap_statistics(mem, true); 2208 mem_cgroup_swap_statistics(mem, true);
1885 mem_cgroup_charge_statistics(mem, pc, false); 2209 mem_cgroup_charge_statistics(mem, pc, false);
@@ -1895,8 +2219,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1895 mz = page_cgroup_zoneinfo(pc); 2219 mz = page_cgroup_zoneinfo(pc);
1896 unlock_page_cgroup(pc); 2220 unlock_page_cgroup(pc);
1897 2221
1898 if (mem_cgroup_soft_limit_check(mem)) 2222 memcg_check_events(mem, page);
1899 mem_cgroup_update_tree(mem, page);
1900 /* at swapout, this memcg will be accessed to record to swap */ 2223 /* at swapout, this memcg will be accessed to record to swap */
1901 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2224 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1902 css_put(&mem->css); 2225 css_put(&mem->css);
@@ -1925,6 +2248,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
1925 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 2248 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
1926} 2249}
1927 2250
2251/*
2252 * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
2253 * In those cases, pages are freed continuously and we can expect them to be
2254 * in the same memcg. Each of these callers itself limits the number of
2255 * pages freed at once, so uncharge_start/end() is called properly.
2256 * It may be called more than once (nested) in a single context.
2257 */
2258
2259void mem_cgroup_uncharge_start(void)
2260{
2261 current->memcg_batch.do_batch++;
2262 /* We can do nest. */
2263 if (current->memcg_batch.do_batch == 1) {
2264 current->memcg_batch.memcg = NULL;
2265 current->memcg_batch.bytes = 0;
2266 current->memcg_batch.memsw_bytes = 0;
2267 }
2268}
2269
2270void mem_cgroup_uncharge_end(void)
2271{
2272 struct memcg_batch_info *batch = &current->memcg_batch;
2273
2274 if (!batch->do_batch)
2275 return;
2276
2277 batch->do_batch--;
2278 if (batch->do_batch) /* If stacked, do nothing. */
2279 return;
2280
2281 if (!batch->memcg)
2282 return;
2283 /*
2284 * This "batch->memcg" is valid without any css_get/put etc...
2285 * because we hide charges behind us.
2286 */
2287 if (batch->bytes)
2288 res_counter_uncharge(&batch->memcg->res, batch->bytes);
2289 if (batch->memsw_bytes)
2290 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
2291 /* forget this pointer (for sanity check) */
2292 batch->memcg = NULL;
2293}
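
Taken together, __do_uncharge() and the start/end pair above coalesce uncharges per task: a caller brackets a run of frees with mem_cgroup_uncharge_start()/mem_cgroup_uncharge_end(), uncharges against the same memcg only accumulate byte counts in current->memcg_batch, and the res_counter is touched once at the end. The following is a self-contained userspace model of that coalescing; struct account, batch_start(), uncharge_one() and batch_end() are invented stand-ins for the kernel structures and calls.

#include <stdio.h>

struct account {
    const char *name;
    unsigned long usage;    /* stands in for res_counter usage */
};

static struct {
    int do_batch;
    struct account *acct;
    unsigned long bytes;
} batch;

static void direct_uncharge(struct account *acct, unsigned long bytes)
{
    acct->usage -= bytes;
}

static void batch_start(void)
{
    if (++batch.do_batch == 1) {    /* nesting is allowed */
        batch.acct = NULL;
        batch.bytes = 0;
    }
}

static void uncharge_one(struct account *acct, unsigned long bytes)
{
    if (!batch.do_batch) {
        direct_uncharge(acct, bytes);
        return;
    }
    if (!batch.acct)
        batch.acct = acct;
    if (batch.acct != acct) {       /* different account: cannot coalesce */
        direct_uncharge(acct, bytes);
        return;
    }
    batch.bytes += bytes;           /* remember now, settle in batch_end() */
}

static void batch_end(void)
{
    if (--batch.do_batch)
        return;                     /* still nested */
    if (batch.acct && batch.bytes)
        direct_uncharge(batch.acct, batch.bytes);
    batch.acct = NULL;
}

int main(void)
{
    struct account a = { "a", 40960 };
    int i;

    batch_start();
    for (i = 0; i < 10; i++)
        uncharge_one(&a, 4096);     /* ten frees, one subtraction */
    batch_end();
    printf("%s usage now %lu\n", a.name, a.usage);
    return 0;
}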
2294
1928#ifdef CONFIG_SWAP 2295#ifdef CONFIG_SWAP
1929/* 2296/*
1930 * called after __delete_from_swap_cache() and drop "page" account. 2297 * called after __delete_from_swap_cache() and drop "page" account.
@@ -1979,6 +2346,64 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
1979 } 2346 }
1980 rcu_read_unlock(); 2347 rcu_read_unlock();
1981} 2348}
2349
2350/**
2351 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2352 * @entry: swap entry to be moved
2353 * @from: mem_cgroup which the entry is moved from
2354 * @to: mem_cgroup which the entry is moved to
2355 * @need_fixup: whether we should fixup res_counters and refcounts.
2356 *
2357 * It succeeds only when the swap_cgroup's record for this entry is the same
2358 * as the mem_cgroup's id of @from.
2359 *
2360 * Returns 0 on success, -EINVAL on failure.
2361 *
2362 * The caller must have charged to @to, IOW, called res_counter_charge() about
2363 * both res and memsw, and called css_get().
2364 */
2365static int mem_cgroup_move_swap_account(swp_entry_t entry,
2366 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2367{
2368 unsigned short old_id, new_id;
2369
2370 old_id = css_id(&from->css);
2371 new_id = css_id(&to->css);
2372
2373 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2374 mem_cgroup_swap_statistics(from, false);
2375 mem_cgroup_swap_statistics(to, true);
2376 /*
2377 * This function is only called from task migration context now.
2378 * It postpones res_counter and refcount handling till the end
2379 * of task migration(mem_cgroup_clear_mc()) for performance
2380 * improvement. But we cannot postpone mem_cgroup_get(to)
2381 * because if the process that has been moved to @to does
2382 * swap-in, the refcount of @to might be decreased to 0.
2383 */
2384 mem_cgroup_get(to);
2385 if (need_fixup) {
2386 if (!mem_cgroup_is_root(from))
2387 res_counter_uncharge(&from->memsw, PAGE_SIZE);
2388 mem_cgroup_put(from);
2389 /*
2390 * we charged both to->res and to->memsw, so we should
2391 * uncharge to->res.
2392 */
2393 if (!mem_cgroup_is_root(to))
2394 res_counter_uncharge(&to->res, PAGE_SIZE);
2395 css_put(&to->css);
2396 }
2397 return 0;
2398 }
2399 return -EINVAL;
2400}
2401#else
2402static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2403 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2404{
2405 return -EINVAL;
2406}
1982#endif 2407#endif
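
The heart of mem_cgroup_move_swap_account() is the swap_cgroup_cmpxchg() call: the per-entry owner id is rewritten to @to only if it still reads @from, which is what keeps the move safe against a concurrent swap-in or free. A minimal userspace model of that compare-and-swap ownership handoff is sketched below; owner_id[], record_cmpxchg() and move_swap_record() are invented names and GCC's __sync_val_compare_and_swap stands in for the kernel primitive.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

static uint16_t owner_id[8];    /* one slot per "swap entry" */

static uint16_t record_cmpxchg(unsigned int ent, uint16_t old, uint16_t newid)
{
    return __sync_val_compare_and_swap(&owner_id[ent], old, newid);
}

static int move_swap_record(unsigned int ent, uint16_t from, uint16_t to)
{
    if (record_cmpxchg(ent, from, to) == from)
        return 0;       /* we owned it; statistics would move here */
    return -EINVAL;     /* someone else owns (or already freed) the entry */
}

int main(void)
{
    owner_id[3] = 7;
    printf("move 7->9: %d\n", move_swap_record(3, 7, 9));       /* 0 */
    printf("move 7->9 again: %d\n", move_swap_record(3, 7, 9)); /* -EINVAL */
    return 0;
}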
1983 2408
1984/* 2409/*
@@ -2002,12 +2427,11 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
2002 } 2427 }
2003 unlock_page_cgroup(pc); 2428 unlock_page_cgroup(pc);
2004 2429
2430 *ptr = mem;
2005 if (mem) { 2431 if (mem) {
2006 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, 2432 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
2007 page);
2008 css_put(&mem->css); 2433 css_put(&mem->css);
2009 } 2434 }
2010 *ptr = mem;
2011 return ret; 2435 return ret;
2012} 2436}
2013 2437
@@ -2100,7 +2524,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2100 unsigned long long val) 2524 unsigned long long val)
2101{ 2525{
2102 int retry_count; 2526 int retry_count;
2103 int progress;
2104 u64 memswlimit; 2527 u64 memswlimit;
2105 int ret = 0; 2528 int ret = 0;
2106 int children = mem_cgroup_count_children(memcg); 2529 int children = mem_cgroup_count_children(memcg);
@@ -2144,8 +2567,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2144 if (!ret) 2567 if (!ret)
2145 break; 2568 break;
2146 2569
2147 progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, 2570 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2148 GFP_KERNEL,
2149 MEM_CGROUP_RECLAIM_SHRINK); 2571 MEM_CGROUP_RECLAIM_SHRINK);
2150 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2572 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2151 /* Usage is reduced ? */ 2573 /* Usage is reduced ? */
@@ -2334,7 +2756,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
2334 pc = list_entry(list->prev, struct page_cgroup, lru); 2756 pc = list_entry(list->prev, struct page_cgroup, lru);
2335 if (busy == pc) { 2757 if (busy == pc) {
2336 list_move(&pc->lru, list); 2758 list_move(&pc->lru, list);
2337 busy = 0; 2759 busy = NULL;
2338 spin_unlock_irqrestore(&zone->lru_lock, flags); 2760 spin_unlock_irqrestore(&zone->lru_lock, flags);
2339 continue; 2761 continue;
2340 } 2762 }
@@ -2375,7 +2797,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
2375 if (free_all) 2797 if (free_all)
2376 goto try_to_free; 2798 goto try_to_free;
2377move_account: 2799move_account:
2378 while (mem->res.usage > 0) { 2800 do {
2379 ret = -EBUSY; 2801 ret = -EBUSY;
2380 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 2802 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
2381 goto out; 2803 goto out;
@@ -2384,6 +2806,7 @@ move_account:
2384 goto out; 2806 goto out;
2385 /* This is for making all *used* pages to be on LRU. */ 2807 /* This is for making all *used* pages to be on LRU. */
2386 lru_add_drain_all(); 2808 lru_add_drain_all();
2809 drain_all_stock_sync();
2387 ret = 0; 2810 ret = 0;
2388 for_each_node_state(node, N_HIGH_MEMORY) { 2811 for_each_node_state(node, N_HIGH_MEMORY) {
2389 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 2812 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
@@ -2402,8 +2825,8 @@ move_account:
2402 if (ret == -ENOMEM) 2825 if (ret == -ENOMEM)
2403 goto try_to_free; 2826 goto try_to_free;
2404 cond_resched(); 2827 cond_resched();
2405 } 2828 /* "ret" should also be checked to ensure all lists are empty. */
2406 ret = 0; 2829 } while (mem->res.usage > 0 || ret);
2407out: 2830out:
2408 css_put(&mem->css); 2831 css_put(&mem->css);
2409 return ret; 2832 return ret;
@@ -2436,10 +2859,7 @@ try_to_free:
2436 } 2859 }
2437 lru_add_drain(); 2860 lru_add_drain();
2438 /* try move_account...there may be some *locked* pages. */ 2861 /* try move_account...there may be some *locked* pages. */
2439 if (mem->res.usage) 2862 goto move_account;
2440 goto move_account;
2441 ret = 0;
2442 goto out;
2443} 2863}
2444 2864
2445int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 2865int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
@@ -2466,7 +2886,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
2466 2886
2467 cgroup_lock(); 2887 cgroup_lock();
2468 /* 2888 /*
2469 * If parent's use_hiearchy is set, we can't make any modifications 2889 * If parent's use_hierarchy is set, we can't make any modifications
2470 * in the child subtrees. If it is unset, then the change can 2890 * in the child subtrees. If it is unset, then the change can
2471 * occur, provided the current cgroup has no children. 2891 * occur, provided the current cgroup has no children.
2472 * 2892 *
@@ -2495,7 +2915,7 @@ static int
2495mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 2915mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2496{ 2916{
2497 struct mem_cgroup_idx_data *d = data; 2917 struct mem_cgroup_idx_data *d = data;
2498 d->val += mem_cgroup_read_stat(&mem->stat, d->idx); 2918 d->val += mem_cgroup_read_stat(mem, d->idx);
2499 return 0; 2919 return 0;
2500} 2920}
2501 2921
@@ -2510,39 +2930,50 @@ mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
2510 *val = d.val; 2930 *val = d.val;
2511} 2931}
2512 2932
2933static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
2934{
2935 u64 idx_val, val;
2936
2937 if (!mem_cgroup_is_root(mem)) {
2938 if (!swap)
2939 return res_counter_read_u64(&mem->res, RES_USAGE);
2940 else
2941 return res_counter_read_u64(&mem->memsw, RES_USAGE);
2942 }
2943
2944 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val);
2945 val = idx_val;
2946 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
2947 val += idx_val;
2948
2949 if (swap) {
2950 mem_cgroup_get_recursive_idx_stat(mem,
2951 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2952 val += idx_val;
2953 }
2954
2955 return val << PAGE_SHIFT;
2956}
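
For the root cgroup there is no res_counter to read, so mem_cgroup_usage() rebuilds the figure from the statistics: cache pages plus RSS pages (plus swapped-out pages when swap is accounted), converted from a page count to bytes by the final << PAGE_SHIFT. A tiny model of just that arithmetic, assuming 4 KiB pages and using invented field names, is:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12   /* assumed 4 KiB pages */

struct stats { uint64_t cache, rss, swapout; };

static uint64_t usage_bytes(const struct stats *s, int swap)
{
    uint64_t pages = s->cache + s->rss;

    if (swap)
        pages += s->swapout;
    return pages << PAGE_SHIFT;     /* pages -> bytes */
}

int main(void)
{
    struct stats s = { .cache = 100, .rss = 50, .swapout = 10 };

    printf("mem:      %llu bytes\n", (unsigned long long)usage_bytes(&s, 0));
    printf("mem+swap: %llu bytes\n", (unsigned long long)usage_bytes(&s, 1));
    return 0;
}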
2957
2513static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 2958static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
2514{ 2959{
2515 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2960 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2516 u64 idx_val, val; 2961 u64 val;
2517 int type, name; 2962 int type, name;
2518 2963
2519 type = MEMFILE_TYPE(cft->private); 2964 type = MEMFILE_TYPE(cft->private);
2520 name = MEMFILE_ATTR(cft->private); 2965 name = MEMFILE_ATTR(cft->private);
2521 switch (type) { 2966 switch (type) {
2522 case _MEM: 2967 case _MEM:
2523 if (name == RES_USAGE && mem_cgroup_is_root(mem)) { 2968 if (name == RES_USAGE)
2524 mem_cgroup_get_recursive_idx_stat(mem, 2969 val = mem_cgroup_usage(mem, false);
2525 MEM_CGROUP_STAT_CACHE, &idx_val); 2970 else
2526 val = idx_val;
2527 mem_cgroup_get_recursive_idx_stat(mem,
2528 MEM_CGROUP_STAT_RSS, &idx_val);
2529 val += idx_val;
2530 val <<= PAGE_SHIFT;
2531 } else
2532 val = res_counter_read_u64(&mem->res, name); 2971 val = res_counter_read_u64(&mem->res, name);
2533 break; 2972 break;
2534 case _MEMSWAP: 2973 case _MEMSWAP:
2535 if (name == RES_USAGE && mem_cgroup_is_root(mem)) { 2974 if (name == RES_USAGE)
2536 mem_cgroup_get_recursive_idx_stat(mem, 2975 val = mem_cgroup_usage(mem, true);
2537 MEM_CGROUP_STAT_CACHE, &idx_val); 2976 else
2538 val = idx_val;
2539 mem_cgroup_get_recursive_idx_stat(mem,
2540 MEM_CGROUP_STAT_RSS, &idx_val);
2541 val += idx_val;
2542 mem_cgroup_get_recursive_idx_stat(mem,
2543 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2544 val <<= PAGE_SHIFT;
2545 } else
2546 val = res_counter_read_u64(&mem->memsw, name); 2977 val = res_counter_read_u64(&mem->memsw, name);
2547 break; 2978 break;
2548 default: 2979 default:
@@ -2655,12 +3086,45 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2655 return 0; 3086 return 0;
2656} 3087}
2657 3088
3089static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
3090 struct cftype *cft)
3091{
3092 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
3093}
3094
3095#ifdef CONFIG_MMU
3096static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3097 struct cftype *cft, u64 val)
3098{
3099 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3100
3101 if (val >= (1 << NR_MOVE_TYPE))
3102 return -EINVAL;
3103 /*
3104 * We check this value several times in both in can_attach() and
3105 * attach(), so we need cgroup lock to prevent this value from being
3106 * inconsistent.
3107 */
3108 cgroup_lock();
3109 mem->move_charge_at_immigrate = val;
3110 cgroup_unlock();
3111
3112 return 0;
3113}
3114#else
3115static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3116 struct cftype *cft, u64 val)
3117{
3118 return -ENOSYS;
3119}
3120#endif
3121
2658 3122
2659/* For read statistics */ 3123/* For read statistics */
2660enum { 3124enum {
2661 MCS_CACHE, 3125 MCS_CACHE,
2662 MCS_RSS, 3126 MCS_RSS,
2663 MCS_MAPPED_FILE, 3127 MCS_FILE_MAPPED,
2664 MCS_PGPGIN, 3128 MCS_PGPGIN,
2665 MCS_PGPGOUT, 3129 MCS_PGPGOUT,
2666 MCS_SWAP, 3130 MCS_SWAP,
@@ -2700,18 +3164,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2700 s64 val; 3164 s64 val;
2701 3165
2702 /* per cpu stat */ 3166 /* per cpu stat */
2703 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); 3167 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
2704 s->stat[MCS_CACHE] += val * PAGE_SIZE; 3168 s->stat[MCS_CACHE] += val * PAGE_SIZE;
2705 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); 3169 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
2706 s->stat[MCS_RSS] += val * PAGE_SIZE; 3170 s->stat[MCS_RSS] += val * PAGE_SIZE;
2707 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); 3171 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
2708 s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; 3172 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
2709 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); 3173 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
2710 s->stat[MCS_PGPGIN] += val; 3174 s->stat[MCS_PGPGIN] += val;
2711 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); 3175 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2712 s->stat[MCS_PGPGOUT] += val; 3176 s->stat[MCS_PGPGOUT] += val;
2713 if (do_swap_account) { 3177 if (do_swap_account) {
2714 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); 3178 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
2715 s->stat[MCS_SWAP] += val * PAGE_SIZE; 3179 s->stat[MCS_SWAP] += val * PAGE_SIZE;
2716 } 3180 }
2717 3181
@@ -2839,12 +3303,249 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
2839 return 0; 3303 return 0;
2840} 3304}
2841 3305
3306static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3307{
3308 struct mem_cgroup_threshold_ary *t;
3309 u64 usage;
3310 int i;
3311
3312 rcu_read_lock();
3313 if (!swap)
3314 t = rcu_dereference(memcg->thresholds);
3315 else
3316 t = rcu_dereference(memcg->memsw_thresholds);
3317
3318 if (!t)
3319 goto unlock;
3320
3321 usage = mem_cgroup_usage(memcg, swap);
3322
3323 /*
3324 * current_threshold points to the threshold just below usage.
3325 * If that is not the case, a threshold was crossed after the last
3326 * call of __mem_cgroup_threshold().
3327 */
3328 i = atomic_read(&t->current_threshold);
3329
3330 /*
3331 * Iterate backward over array of thresholds starting from
3332 * current_threshold and check if a threshold is crossed.
3333 * If none of thresholds below usage is crossed, we read
3334 * only one element of the array here.
3335 */
3336 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3337 eventfd_signal(t->entries[i].eventfd, 1);
3338
3339 /* i = current_threshold + 1 */
3340 i++;
3341
3342 /*
3343 * Iterate forward over array of thresholds starting from
3344 * current_threshold+1 and check if a threshold is crossed.
3345 * If none of thresholds above usage is crossed, we read
3346 * only one element of the array here.
3347 */
3348 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3349 eventfd_signal(t->entries[i].eventfd, 1);
3350
3351 /* Update current_threshold */
3352 atomic_set(&t->current_threshold, i - 1);
3353unlock:
3354 rcu_read_unlock();
3355}
3356
3357static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3358{
3359 __mem_cgroup_threshold(memcg, false);
3360 if (do_swap_account)
3361 __mem_cgroup_threshold(memcg, true);
3362}
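
The threshold arrays are kept sorted and current_threshold caches the index of the largest threshold at or below the usage seen last time; each check walks left to signal thresholds that usage has fallen below, then walks right to signal thresholds it has climbed past, and re-caches the index. A runnable userspace model of that two-direction scan is below; printf() replaces eventfd_signal() and all names are invented.

#include <stdint.h>
#include <stdio.h>

static const uint64_t thresholds[] = { 100, 200, 300, 400 };    /* sorted */
#define NTHRESH ((int)(sizeof(thresholds) / sizeof(thresholds[0])))

static int current_threshold = -1;      /* index just below the last usage */

static void signal_event(int i)
{
    printf("crossed threshold %llu\n", (unsigned long long)thresholds[i]);
}

static void check_thresholds(uint64_t usage)
{
    int i = current_threshold;

    /* usage dropped: signal thresholds we fell below, right to left */
    for (; i >= 0 && thresholds[i] > usage; i--)
        signal_event(i);
    /* usage grew: signal thresholds we climbed past, left to right */
    for (i++; i < NTHRESH && thresholds[i] <= usage; i++)
        signal_event(i);
    current_threshold = i - 1;
}

int main(void)
{
    check_thresholds(250);  /* crosses 100 and 200 */
    check_thresholds(250);  /* nothing new to signal */
    check_thresholds(150);  /* falls back below 200 */
    return 0;
}

When no threshold has been crossed since the previous call, each loop above inspects only one array element, which is what the two "we read only one element" comments in the kernel code are pointing at.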
3363
3364static int compare_thresholds(const void *a, const void *b)
3365{
3366 const struct mem_cgroup_threshold *_a = a;
3367 const struct mem_cgroup_threshold *_b = b;
3368
3369 return _a->threshold - _b->threshold;
3370}
3371
3372static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft,
3373 struct eventfd_ctx *eventfd, const char *args)
3374{
3375 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3376 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
3377 int type = MEMFILE_TYPE(cft->private);
3378 u64 threshold, usage;
3379 int size;
3380 int i, ret;
3381
3382 ret = res_counter_memparse_write_strategy(args, &threshold);
3383 if (ret)
3384 return ret;
3385
3386 mutex_lock(&memcg->thresholds_lock);
3387 if (type == _MEM)
3388 thresholds = memcg->thresholds;
3389 else if (type == _MEMSWAP)
3390 thresholds = memcg->memsw_thresholds;
3391 else
3392 BUG();
3393
3394 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3395
3396 /* Check if a threshold crossed before adding a new one */
3397 if (thresholds)
3398 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3399
3400 if (thresholds)
3401 size = thresholds->size + 1;
3402 else
3403 size = 1;
3404
3405 /* Allocate memory for new array of thresholds */
3406 thresholds_new = kmalloc(sizeof(*thresholds_new) +
3407 size * sizeof(struct mem_cgroup_threshold),
3408 GFP_KERNEL);
3409 if (!thresholds_new) {
3410 ret = -ENOMEM;
3411 goto unlock;
3412 }
3413 thresholds_new->size = size;
3414
3415 /* Copy thresholds (if any) to new array */
3416 if (thresholds)
3417 memcpy(thresholds_new->entries, thresholds->entries,
3418 thresholds->size *
3419 sizeof(struct mem_cgroup_threshold));
3420 /* Add new threshold */
3421 thresholds_new->entries[size - 1].eventfd = eventfd;
3422 thresholds_new->entries[size - 1].threshold = threshold;
3423
3424 /* Sort thresholds. Registering of new threshold isn't time-critical */
3425 sort(thresholds_new->entries, size,
3426 sizeof(struct mem_cgroup_threshold),
3427 compare_thresholds, NULL);
3428
3429 /* Find current threshold */
3430 atomic_set(&thresholds_new->current_threshold, -1);
3431 for (i = 0; i < size; i++) {
3432 if (thresholds_new->entries[i].threshold < usage) {
3433 /*
3434 * thresholds_new->current_threshold will not be used
3435 * until rcu_assign_pointer(), so it's safe to increment
3436 * it here.
3437 */
3438 atomic_inc(&thresholds_new->current_threshold);
3439 }
3440 }
3441
3442 if (type == _MEM)
3443 rcu_assign_pointer(memcg->thresholds, thresholds_new);
3444 else
3445 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
3446
3447 /* To be sure that nobody uses thresholds before freeing it */
3448 synchronize_rcu();
3449
3450 kfree(thresholds);
3451unlock:
3452 mutex_unlock(&memcg->thresholds_lock);
3453
3454 return ret;
3455}
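
Registration is a copy-update: the existing entries plus the new threshold are copied into a freshly allocated array, sorted, the cached current index is recomputed against current usage, and only then is the new array published via rcu_assign_pointer() and the old one freed after synchronize_rcu(). Below is a userspace model of the copy/sort/recompute step with invented names; the RCU publication is reduced to a comment.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct thresh_ary {
    int current_threshold;
    int size;
    uint64_t entries[];
};

static int cmp_thresh(const void *a, const void *b)
{
    uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;

    return (x > y) - (x < y);   /* avoids u64 subtraction overflow */
}

static struct thresh_ary *add_threshold(const struct thresh_ary *old,
                                        uint64_t threshold, uint64_t usage)
{
    int i, size = (old ? old->size : 0) + 1;
    struct thresh_ary *new;

    new = malloc(sizeof(*new) + size * sizeof(new->entries[0]));
    if (!new)
        return NULL;
    new->size = size;
    if (old)
        memcpy(new->entries, old->entries,
               old->size * sizeof(new->entries[0]));
    new->entries[size - 1] = threshold;
    qsort(new->entries, size, sizeof(new->entries[0]), cmp_thresh);

    /* find the largest threshold strictly below current usage */
    new->current_threshold = -1;
    for (i = 0; i < size; i++)
        if (new->entries[i] < usage)
            new->current_threshold++;
    /*
     * Kernel: rcu_assign_pointer(memcg->thresholds, new);
     *         synchronize_rcu(); kfree(old);
     */
    return new;
}

int main(void)
{
    struct thresh_ary *old, *t = add_threshold(NULL, 300, 250);

    old = t;
    t = add_threshold(old, 100, 250);
    free(old);
    printf("size=%d current=%d\n", t->size, t->current_threshold);
    free(t);
    return 0;
}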
3456
3457static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft,
3458 struct eventfd_ctx *eventfd)
3459{
3460 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3461 struct mem_cgroup_threshold_ary *thresholds, *thresholds_new;
3462 int type = MEMFILE_TYPE(cft->private);
3463 u64 usage;
3464 int size = 0;
3465 int i, j, ret;
3466
3467 mutex_lock(&memcg->thresholds_lock);
3468 if (type == _MEM)
3469 thresholds = memcg->thresholds;
3470 else if (type == _MEMSWAP)
3471 thresholds = memcg->memsw_thresholds;
3472 else
3473 BUG();
3474
3475 /*
3476 * Something went wrong if we are trying to unregister a threshold
3477 * when we don't have any thresholds
3478 */
3479 BUG_ON(!thresholds);
3480
3481 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
3482
3483 /* Check if a threshold crossed before removing */
3484 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3485
3486 /* Calculate the new number of thresholds */
3487 for (i = 0; i < thresholds->size; i++) {
3488 if (thresholds->entries[i].eventfd != eventfd)
3489 size++;
3490 }
3491
3492 /* Set thresholds array to NULL if we don't have thresholds */
3493 if (!size) {
3494 thresholds_new = NULL;
3495 goto assign;
3496 }
3497
3498 /* Allocate memory for new array of thresholds */
3499 thresholds_new = kmalloc(sizeof(*thresholds_new) +
3500 size * sizeof(struct mem_cgroup_threshold),
3501 GFP_KERNEL);
3502 if (!thresholds_new) {
3503 ret = -ENOMEM;
3504 goto unlock;
3505 }
3506 thresholds_new->size = size;
3507
3508 /* Copy thresholds and find current threshold */
3509 atomic_set(&thresholds_new->current_threshold, -1);
3510 for (i = 0, j = 0; i < thresholds->size; i++) {
3511 if (thresholds->entries[i].eventfd == eventfd)
3512 continue;
3513
3514 thresholds_new->entries[j] = thresholds->entries[i];
3515 if (thresholds_new->entries[j].threshold < usage) {
3516 /*
3517 * thresholds_new->current_threshold will not be used
3518 * until rcu_assign_pointer(), so it's safe to increment
3519 * it here.
3520 */
3521 atomic_inc(&thresholds_new->current_threshold);
3522 }
3523 j++;
3524 }
3525
3526assign:
3527 if (type == _MEM)
3528 rcu_assign_pointer(memcg->thresholds, thresholds_new);
3529 else
3530 rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new);
3531
3532 /* To be sure that nobody uses thresholds before freeing it */
3533 synchronize_rcu();
3534
3535 kfree(thresholds);
3536unlock:
3537 mutex_unlock(&memcg->thresholds_lock);
3538
3539 return ret;
3540}
2842 3541
2843static struct cftype mem_cgroup_files[] = { 3542static struct cftype mem_cgroup_files[] = {
2844 { 3543 {
2845 .name = "usage_in_bytes", 3544 .name = "usage_in_bytes",
2846 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 3545 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
2847 .read_u64 = mem_cgroup_read, 3546 .read_u64 = mem_cgroup_read,
3547 .register_event = mem_cgroup_register_event,
3548 .unregister_event = mem_cgroup_unregister_event,
2848 }, 3549 },
2849 { 3550 {
2850 .name = "max_usage_in_bytes", 3551 .name = "max_usage_in_bytes",
@@ -2888,6 +3589,11 @@ static struct cftype mem_cgroup_files[] = {
2888 .read_u64 = mem_cgroup_swappiness_read, 3589 .read_u64 = mem_cgroup_swappiness_read,
2889 .write_u64 = mem_cgroup_swappiness_write, 3590 .write_u64 = mem_cgroup_swappiness_write,
2890 }, 3591 },
3592 {
3593 .name = "move_charge_at_immigrate",
3594 .read_u64 = mem_cgroup_move_charge_read,
3595 .write_u64 = mem_cgroup_move_charge_write,
3596 },
2891}; 3597};
2892 3598
2893#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3599#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -2896,6 +3602,8 @@ static struct cftype memsw_cgroup_files[] = {
2896 .name = "memsw.usage_in_bytes", 3602 .name = "memsw.usage_in_bytes",
2897 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 3603 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
2898 .read_u64 = mem_cgroup_read, 3604 .read_u64 = mem_cgroup_read,
3605 .register_event = mem_cgroup_register_event,
3606 .unregister_event = mem_cgroup_unregister_event,
2899 }, 3607 },
2900 { 3608 {
2901 .name = "memsw.max_usage_in_bytes", 3609 .name = "memsw.max_usage_in_bytes",
@@ -2970,24 +3678,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
2970 kfree(mem->info.nodeinfo[node]); 3678 kfree(mem->info.nodeinfo[node]);
2971} 3679}
2972 3680
2973static int mem_cgroup_size(void)
2974{
2975 int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
2976 return sizeof(struct mem_cgroup) + cpustat_size;
2977}
2978
2979static struct mem_cgroup *mem_cgroup_alloc(void) 3681static struct mem_cgroup *mem_cgroup_alloc(void)
2980{ 3682{
2981 struct mem_cgroup *mem; 3683 struct mem_cgroup *mem;
2982 int size = mem_cgroup_size(); 3684 int size = sizeof(struct mem_cgroup);
2983 3685
3686 /* Can be very big if MAX_NUMNODES is very big */
2984 if (size < PAGE_SIZE) 3687 if (size < PAGE_SIZE)
2985 mem = kmalloc(size, GFP_KERNEL); 3688 mem = kmalloc(size, GFP_KERNEL);
2986 else 3689 else
2987 mem = vmalloc(size); 3690 mem = vmalloc(size);
2988 3691
2989 if (mem) 3692 if (!mem)
2990 memset(mem, 0, size); 3693 return NULL;
3694
3695 memset(mem, 0, size);
3696 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
3697 if (!mem->stat) {
3698 if (size < PAGE_SIZE)
3699 kfree(mem);
3700 else
3701 vfree(mem);
3702 mem = NULL;
3703 }
2991 return mem; 3704 return mem;
2992} 3705}
2993 3706
@@ -3012,7 +3725,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
3012 for_each_node_state(node, N_POSSIBLE) 3725 for_each_node_state(node, N_POSSIBLE)
3013 free_mem_cgroup_per_zone_info(mem, node); 3726 free_mem_cgroup_per_zone_info(mem, node);
3014 3727
3015 if (mem_cgroup_size() < PAGE_SIZE) 3728 free_percpu(mem->stat);
3729 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
3016 kfree(mem); 3730 kfree(mem);
3017 else 3731 else
3018 vfree(mem); 3732 vfree(mem);
@@ -3023,9 +3737,9 @@ static void mem_cgroup_get(struct mem_cgroup *mem)
3023 atomic_inc(&mem->refcnt); 3737 atomic_inc(&mem->refcnt);
3024} 3738}
3025 3739
3026static void mem_cgroup_put(struct mem_cgroup *mem) 3740static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
3027{ 3741{
3028 if (atomic_dec_and_test(&mem->refcnt)) { 3742 if (atomic_sub_and_test(count, &mem->refcnt)) {
3029 struct mem_cgroup *parent = parent_mem_cgroup(mem); 3743 struct mem_cgroup *parent = parent_mem_cgroup(mem);
3030 __mem_cgroup_free(mem); 3744 __mem_cgroup_free(mem);
3031 if (parent) 3745 if (parent)
@@ -3033,6 +3747,11 @@ static void mem_cgroup_put(struct mem_cgroup *mem)
3033 } 3747 }
3034} 3748}
3035 3749
3750static void mem_cgroup_put(struct mem_cgroup *mem)
3751{
3752 __mem_cgroup_put(mem, 1);
3753}
3754
3036/* 3755/*
3037 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 3756 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
3038 */ 3757 */
@@ -3097,12 +3816,18 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
3097 3816
3098 /* root ? */ 3817 /* root ? */
3099 if (cont->parent == NULL) { 3818 if (cont->parent == NULL) {
3819 int cpu;
3100 enable_swap_cgroup(); 3820 enable_swap_cgroup();
3101 parent = NULL; 3821 parent = NULL;
3102 root_mem_cgroup = mem; 3822 root_mem_cgroup = mem;
3103 if (mem_cgroup_soft_limit_tree_init()) 3823 if (mem_cgroup_soft_limit_tree_init())
3104 goto free_out; 3824 goto free_out;
3105 3825 for_each_possible_cpu(cpu) {
3826 struct memcg_stock_pcp *stock =
3827 &per_cpu(memcg_stock, cpu);
3828 INIT_WORK(&stock->work, drain_local_stock);
3829 }
3830 hotcpu_notifier(memcg_stock_cpu_callback, 0);
3106 } else { 3831 } else {
3107 parent = mem_cgroup_from_cont(cont->parent); 3832 parent = mem_cgroup_from_cont(cont->parent);
3108 mem->use_hierarchy = parent->use_hierarchy; 3833 mem->use_hierarchy = parent->use_hierarchy;
@@ -3128,6 +3853,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
3128 if (parent) 3853 if (parent)
3129 mem->swappiness = get_swappiness(parent); 3854 mem->swappiness = get_swappiness(parent);
3130 atomic_set(&mem->refcnt, 1); 3855 atomic_set(&mem->refcnt, 1);
3856 mem->move_charge_at_immigrate = 0;
3857 mutex_init(&mem->thresholds_lock);
3131 return &mem->css; 3858 return &mem->css;
3132free_out: 3859free_out:
3133 __mem_cgroup_free(mem); 3860 __mem_cgroup_free(mem);
@@ -3164,19 +3891,445 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
3164 return ret; 3891 return ret;
3165} 3892}
3166 3893
3894#ifdef CONFIG_MMU
3895/* Handlers for move charge at task migration. */
3896#define PRECHARGE_COUNT_AT_ONCE 256
3897static int mem_cgroup_do_precharge(unsigned long count)
3898{
3899 int ret = 0;
3900 int batch_count = PRECHARGE_COUNT_AT_ONCE;
3901 struct mem_cgroup *mem = mc.to;
3902
3903 if (mem_cgroup_is_root(mem)) {
3904 mc.precharge += count;
3905 /* we don't need css_get for root */
3906 return ret;
3907 }
3908 /* try to charge at once */
3909 if (count > 1) {
3910 struct res_counter *dummy;
3911 /*
3912 * "mem" cannot be under rmdir() because we've already checked
3913 * by cgroup_lock_live_cgroup() that it is not removed and we
3914 * are still under the same cgroup_mutex. So we can postpone
3915 * css_get().
3916 */
3917 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
3918 goto one_by_one;
3919 if (do_swap_account && res_counter_charge(&mem->memsw,
3920 PAGE_SIZE * count, &dummy)) {
3921 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
3922 goto one_by_one;
3923 }
3924 mc.precharge += count;
3925 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
3926 WARN_ON_ONCE(count > INT_MAX);
3927 __css_get(&mem->css, (int)count);
3928 return ret;
3929 }
3930one_by_one:
3931 /* fall back to one by one charge */
3932 while (count--) {
3933 if (signal_pending(current)) {
3934 ret = -EINTR;
3935 break;
3936 }
3937 if (!batch_count--) {
3938 batch_count = PRECHARGE_COUNT_AT_ONCE;
3939 cond_resched();
3940 }
3941 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
3942 if (ret || !mem)
3943 /* mem_cgroup_clear_mc() will do uncharge later */
3944 return -ENOMEM;
3945 mc.precharge++;
3946 }
3947 return ret;
3948}
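
mem_cgroup_do_precharge() first attempts a single bulk res_counter charge for the whole count and, if that fails, falls back to charging page by page, rescheduling every PRECHARGE_COUNT_AT_ONCE pages and bailing out if a signal is pending. A rough userspace model of the bulk-then-fallback pattern is sketched below; try_charge_bytes(), yield_cpu() and the fixed byte budget are invented stand-ins for res_counter_charge() and cond_resched(), and signal handling is omitted.

#include <errno.h>
#include <stdio.h>

#define PAGE_SIZE           4096UL
#define PRECHARGE_AT_ONCE   256

static unsigned long budget = 1000 * PAGE_SIZE; /* pretend res_counter limit */
static unsigned long precharge;

static int try_charge_bytes(unsigned long bytes)
{
    if (bytes > budget)
        return -ENOMEM;
    budget -= bytes;
    return 0;
}

static void yield_cpu(void) { /* cond_resched() in the kernel */ }

static int do_precharge(unsigned long count)
{
    int batch = PRECHARGE_AT_ONCE;

    /* Fast path: charge everything at once if the counter allows it. */
    if (count > 1 && !try_charge_bytes(PAGE_SIZE * count)) {
        precharge += count;
        return 0;
    }
    /* Slow path: one page at a time, yielding now and then. */
    while (count--) {
        if (!batch--) {
            batch = PRECHARGE_AT_ONCE;
            yield_cpu();
        }
        if (try_charge_bytes(PAGE_SIZE))
            return -ENOMEM;     /* caller unwinds what was precharged */
        precharge++;
    }
    return 0;
}

int main(void)
{
    int ret;

    ret = do_precharge(500);
    printf("precharge 500: %d (have %lu)\n", ret, precharge);
    ret = do_precharge(800);
    printf("precharge 800: %d (have %lu)\n", ret, precharge);
    return 0;
}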
3949
3950/**
3951 * is_target_pte_for_mc - check a pte whether it is valid for move charge
3952 * @vma: the vma the pte to be checked belongs
3953 * @addr: the address corresponding to the pte to be checked
3954 * @ptent: the pte to be checked
3955 * @target: the pointer the target page or swap ent will be stored(can be NULL)
3956 *
3957 * Returns
3958 * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
3959 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
3960 * move charge. If @target is not NULL, the page is stored in target->page
3961 * with an extra refcount taken (callers should handle it).
3962 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
3963 * target for charge migration. If @target is not NULL, the entry is stored
3964 * in target->ent.
3965 *
3966 * Called with pte lock held.
3967 */
3968union mc_target {
3969 struct page *page;
3970 swp_entry_t ent;
3971};
3972
3973enum mc_target_type {
3974 MC_TARGET_NONE, /* not used */
3975 MC_TARGET_PAGE,
3976 MC_TARGET_SWAP,
3977};
3978
3979static int is_target_pte_for_mc(struct vm_area_struct *vma,
3980 unsigned long addr, pte_t ptent, union mc_target *target)
3981{
3982 struct page *page = NULL;
3983 struct page_cgroup *pc;
3984 int ret = 0;
3985 swp_entry_t ent = { .val = 0 };
3986 int usage_count = 0;
3987 bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON,
3988 &mc.to->move_charge_at_immigrate);
3989
3990 if (!pte_present(ptent)) {
3991 /* TODO: handle swap of shmem/tmpfs */
3992 if (pte_none(ptent) || pte_file(ptent))
3993 return 0;
3994 else if (is_swap_pte(ptent)) {
3995 ent = pte_to_swp_entry(ptent);
3996 if (!move_anon || non_swap_entry(ent))
3997 return 0;
3998 usage_count = mem_cgroup_count_swap_user(ent, &page);
3999 }
4000 } else {
4001 page = vm_normal_page(vma, addr, ptent);
4002 if (!page || !page_mapped(page))
4003 return 0;
4004 /*
4005 * TODO: We don't move charges of file(including shmem/tmpfs)
4006 * pages for now.
4007 */
4008 if (!move_anon || !PageAnon(page))
4009 return 0;
4010 if (!get_page_unless_zero(page))
4011 return 0;
4012 usage_count = page_mapcount(page);
4013 }
4014 if (usage_count > 1) {
4015 /*
4016 * TODO: We don't move charges of shared(used by multiple
4017 * processes) pages for now.
4018 */
4019 if (page)
4020 put_page(page);
4021 return 0;
4022 }
4023 if (page) {
4024 pc = lookup_page_cgroup(page);
4025 /*
4026 * Do only loose check w/o page_cgroup lock.
4027 * mem_cgroup_move_account() checks the pc is valid or not under
4028 * the lock.
4029 */
4030 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
4031 ret = MC_TARGET_PAGE;
4032 if (target)
4033 target->page = page;
4034 }
4035 if (!ret || !target)
4036 put_page(page);
4037 }
4038 /* fall through */
4039 if (ent.val && do_swap_account && !ret &&
4040 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
4041 ret = MC_TARGET_SWAP;
4042 if (target)
4043 target->ent = ent;
4044 }
4045 return ret;
4046}
4047
4048static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4049 unsigned long addr, unsigned long end,
4050 struct mm_walk *walk)
4051{
4052 struct vm_area_struct *vma = walk->private;
4053 pte_t *pte;
4054 spinlock_t *ptl;
4055
4056 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4057 for (; addr != end; pte++, addr += PAGE_SIZE)
4058 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
4059 mc.precharge++; /* increment precharge temporarily */
4060 pte_unmap_unlock(pte - 1, ptl);
4061 cond_resched();
4062
4063 return 0;
4064}
4065
4066static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4067{
4068 unsigned long precharge;
4069 struct vm_area_struct *vma;
4070
4071 down_read(&mm->mmap_sem);
4072 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4073 struct mm_walk mem_cgroup_count_precharge_walk = {
4074 .pmd_entry = mem_cgroup_count_precharge_pte_range,
4075 .mm = mm,
4076 .private = vma,
4077 };
4078 if (is_vm_hugetlb_page(vma))
4079 continue;
4080 /* TODO: We don't move charges of shmem/tmpfs pages for now. */
4081 if (vma->vm_flags & VM_SHARED)
4082 continue;
4083 walk_page_range(vma->vm_start, vma->vm_end,
4084 &mem_cgroup_count_precharge_walk);
4085 }
4086 up_read(&mm->mmap_sem);
4087
4088 precharge = mc.precharge;
4089 mc.precharge = 0;
4090
4091 return precharge;
4092}
4093
4094static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4095{
4096 return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
4097}
4098
4099static void mem_cgroup_clear_mc(void)
4100{
4101 /* we must uncharge all the leftover precharges from mc.to */
4102 if (mc.precharge) {
4103 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
4104 mc.precharge = 0;
4105 }
4106 /*
4107 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
4108 * we must uncharge here.
4109 */
4110 if (mc.moved_charge) {
4111 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
4112 mc.moved_charge = 0;
4113 }
4114 /* we must fixup refcnts and charges */
4115 if (mc.moved_swap) {
4116 WARN_ON_ONCE(mc.moved_swap > INT_MAX);
4117 /* uncharge swap account from the old cgroup */
4118 if (!mem_cgroup_is_root(mc.from))
4119 res_counter_uncharge(&mc.from->memsw,
4120 PAGE_SIZE * mc.moved_swap);
4121 __mem_cgroup_put(mc.from, mc.moved_swap);
4122
4123 if (!mem_cgroup_is_root(mc.to)) {
4124 /*
4125 * we charged both to->res and to->memsw, so we should
4126 * uncharge to->res.
4127 */
4128 res_counter_uncharge(&mc.to->res,
4129 PAGE_SIZE * mc.moved_swap);
4130 VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
4131 __css_put(&mc.to->css, mc.moved_swap);
4132 }
4133 /* we've already done mem_cgroup_get(mc.to) */
4134
4135 mc.moved_swap = 0;
4136 }
4137 mc.from = NULL;
4138 mc.to = NULL;
4139 mc.moving_task = NULL;
4140 wake_up_all(&mc.waitq);
4141}
4142
4143static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4144 struct cgroup *cgroup,
4145 struct task_struct *p,
4146 bool threadgroup)
4147{
4148 int ret = 0;
4149 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
4150
4151 if (mem->move_charge_at_immigrate) {
4152 struct mm_struct *mm;
4153 struct mem_cgroup *from = mem_cgroup_from_task(p);
4154
4155 VM_BUG_ON(from == mem);
4156
4157 mm = get_task_mm(p);
4158 if (!mm)
4159 return 0;
4160 /* We move charges only when we move an owner of the mm */
4161 if (mm->owner == p) {
4162 VM_BUG_ON(mc.from);
4163 VM_BUG_ON(mc.to);
4164 VM_BUG_ON(mc.precharge);
4165 VM_BUG_ON(mc.moved_charge);
4166 VM_BUG_ON(mc.moved_swap);
4167 VM_BUG_ON(mc.moving_task);
4168 mc.from = from;
4169 mc.to = mem;
4170 mc.precharge = 0;
4171 mc.moved_charge = 0;
4172 mc.moved_swap = 0;
4173 mc.moving_task = current;
4174
4175 ret = mem_cgroup_precharge_mc(mm);
4176 if (ret)
4177 mem_cgroup_clear_mc();
4178 }
4179 mmput(mm);
4180 }
4181 return ret;
4182}
4183
4184static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4185 struct cgroup *cgroup,
4186 struct task_struct *p,
4187 bool threadgroup)
4188{
4189 mem_cgroup_clear_mc();
4190}
4191
4192static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4193 unsigned long addr, unsigned long end,
4194 struct mm_walk *walk)
4195{
4196 int ret = 0;
4197 struct vm_area_struct *vma = walk->private;
4198 pte_t *pte;
4199 spinlock_t *ptl;
4200
4201retry:
4202 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4203 for (; addr != end; addr += PAGE_SIZE) {
4204 pte_t ptent = *(pte++);
4205 union mc_target target;
4206 int type;
4207 struct page *page;
4208 struct page_cgroup *pc;
4209 swp_entry_t ent;
4210
4211 if (!mc.precharge)
4212 break;
4213
4214 type = is_target_pte_for_mc(vma, addr, ptent, &target);
4215 switch (type) {
4216 case MC_TARGET_PAGE:
4217 page = target.page;
4218 if (isolate_lru_page(page))
4219 goto put;
4220 pc = lookup_page_cgroup(page);
4221 if (!mem_cgroup_move_account(pc,
4222 mc.from, mc.to, false)) {
4223 mc.precharge--;
4224 /* we uncharge from mc.from later. */
4225 mc.moved_charge++;
4226 }
4227 putback_lru_page(page);
4228put: /* is_target_pte_for_mc() gets the page */
4229 put_page(page);
4230 break;
4231 case MC_TARGET_SWAP:
4232 ent = target.ent;
4233 if (!mem_cgroup_move_swap_account(ent,
4234 mc.from, mc.to, false)) {
4235 mc.precharge--;
4236 /* we fixup refcnts and charges later. */
4237 mc.moved_swap++;
4238 }
4239 break;
4240 default:
4241 break;
4242 }
4243 }
4244 pte_unmap_unlock(pte - 1, ptl);
4245 cond_resched();
4246
4247 if (addr != end) {
4248 /*
4249 * We have consumed all precharges we got in can_attach().
4250 * We try to charge one by one, but we don't do any additional
4251 * charges to mc.to if we have already failed to charge once in the
4252 * attach() phase.
4253 */
4254 ret = mem_cgroup_do_precharge(1);
4255 if (!ret)
4256 goto retry;
4257 }
4258
4259 return ret;
4260}
4261
4262static void mem_cgroup_move_charge(struct mm_struct *mm)
4263{
4264 struct vm_area_struct *vma;
4265
4266 lru_add_drain_all();
4267 down_read(&mm->mmap_sem);
4268 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4269 int ret;
4270 struct mm_walk mem_cgroup_move_charge_walk = {
4271 .pmd_entry = mem_cgroup_move_charge_pte_range,
4272 .mm = mm,
4273 .private = vma,
4274 };
4275 if (is_vm_hugetlb_page(vma))
4276 continue;
4277 /* TODO: We don't move charges of shmem/tmpfs pages for now. */
4278 if (vma->vm_flags & VM_SHARED)
4279 continue;
4280 ret = walk_page_range(vma->vm_start, vma->vm_end,
4281 &mem_cgroup_move_charge_walk);
4282 if (ret)
4283 /*
4284 * This means we have consumed all precharges and failed to
4285 * do an additional charge. Just abandon here.
4286 */
4287 break;
4288 }
4289 up_read(&mm->mmap_sem);
4290}
4291
3167static void mem_cgroup_move_task(struct cgroup_subsys *ss, 4292static void mem_cgroup_move_task(struct cgroup_subsys *ss,
3168 struct cgroup *cont, 4293 struct cgroup *cont,
3169 struct cgroup *old_cont, 4294 struct cgroup *old_cont,
3170 struct task_struct *p, 4295 struct task_struct *p,
3171 bool threadgroup) 4296 bool threadgroup)
3172{ 4297{
3173 mutex_lock(&memcg_tasklist); 4298 struct mm_struct *mm;
3174 /* 4299
3175 * FIXME: It's better to move charges of this process from old 4300 if (!mc.to)
3176 * memcg to new memcg. But it's just on TODO-List now. 4301 /* no need to move charge */
3177 */ 4302 return;
3178 mutex_unlock(&memcg_tasklist); 4303
4304 mm = get_task_mm(p);
4305 if (mm) {
4306 mem_cgroup_move_charge(mm);
4307 mmput(mm);
4308 }
4309 mem_cgroup_clear_mc();
4310}
4311#else /* !CONFIG_MMU */
4312static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4313 struct cgroup *cgroup,
4314 struct task_struct *p,
4315 bool threadgroup)
4316{
4317 return 0;
3179} 4318}
4319static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4320 struct cgroup *cgroup,
4321 struct task_struct *p,
4322 bool threadgroup)
4323{
4324}
4325static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4326 struct cgroup *cont,
4327 struct cgroup *old_cont,
4328 struct task_struct *p,
4329 bool threadgroup)
4330{
4331}
4332#endif
3180 4333
3181struct cgroup_subsys mem_cgroup_subsys = { 4334struct cgroup_subsys mem_cgroup_subsys = {
3182 .name = "memory", 4335 .name = "memory",
@@ -3185,6 +4338,8 @@ struct cgroup_subsys mem_cgroup_subsys = {
3185 .pre_destroy = mem_cgroup_pre_destroy, 4338 .pre_destroy = mem_cgroup_pre_destroy,
3186 .destroy = mem_cgroup_destroy, 4339 .destroy = mem_cgroup_destroy,
3187 .populate = mem_cgroup_populate, 4340 .populate = mem_cgroup_populate,
4341 .can_attach = mem_cgroup_can_attach,
4342 .cancel_attach = mem_cgroup_cancel_attach,
3188 .attach = mem_cgroup_move_task, 4343 .attach = mem_cgroup_move_task,
3189 .early_init = 0, 4344 .early_init = 0,
3190 .use_id = 1, 4345 .use_id = 1,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index dacc64183874..620b0b461593 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -34,12 +34,17 @@
34#include <linux/kernel.h> 34#include <linux/kernel.h>
35#include <linux/mm.h> 35#include <linux/mm.h>
36#include <linux/page-flags.h> 36#include <linux/page-flags.h>
37#include <linux/kernel-page-flags.h>
37#include <linux/sched.h> 38#include <linux/sched.h>
38#include <linux/ksm.h> 39#include <linux/ksm.h>
39#include <linux/rmap.h> 40#include <linux/rmap.h>
40#include <linux/pagemap.h> 41#include <linux/pagemap.h>
41#include <linux/swap.h> 42#include <linux/swap.h>
42#include <linux/backing-dev.h> 43#include <linux/backing-dev.h>
44#include <linux/migrate.h>
45#include <linux/page-isolation.h>
46#include <linux/suspend.h>
47#include <linux/slab.h>
43#include "internal.h" 48#include "internal.h"
44 49
45int sysctl_memory_failure_early_kill __read_mostly = 0; 50int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -48,6 +53,129 @@ int sysctl_memory_failure_recovery __read_mostly = 1;
48 53
49atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); 54atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
50 55
56#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
57
58u32 hwpoison_filter_enable = 0;
59u32 hwpoison_filter_dev_major = ~0U;
60u32 hwpoison_filter_dev_minor = ~0U;
61u64 hwpoison_filter_flags_mask;
62u64 hwpoison_filter_flags_value;
63EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
64EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
65EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
66EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
67EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
68
69static int hwpoison_filter_dev(struct page *p)
70{
71 struct address_space *mapping;
72 dev_t dev;
73
74 if (hwpoison_filter_dev_major == ~0U &&
75 hwpoison_filter_dev_minor == ~0U)
76 return 0;
77
78 /*
79 * page_mapping() does not accept slab page
80 */
81 if (PageSlab(p))
82 return -EINVAL;
83
84 mapping = page_mapping(p);
85 if (mapping == NULL || mapping->host == NULL)
86 return -EINVAL;
87
88 dev = mapping->host->i_sb->s_dev;
89 if (hwpoison_filter_dev_major != ~0U &&
90 hwpoison_filter_dev_major != MAJOR(dev))
91 return -EINVAL;
92 if (hwpoison_filter_dev_minor != ~0U &&
93 hwpoison_filter_dev_minor != MINOR(dev))
94 return -EINVAL;
95
96 return 0;
97}
98
99static int hwpoison_filter_flags(struct page *p)
100{
101 if (!hwpoison_filter_flags_mask)
102 return 0;
103
104 if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
105 hwpoison_filter_flags_value)
106 return 0;
107 else
108 return -EINVAL;
109}
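
hwpoison_filter_flags() accepts a page only when (stable_page_flags(p) & mask) == value: the mask selects which flag bits are examined and the value says what they must read. A small standalone model of that mask/value test, with invented flag bits and names, is:

#include <stdint.h>
#include <stdio.h>

#define F_LRU   (1ULL << 0)
#define F_DIRTY (1ULL << 1)
#define F_SLAB  (1ULL << 2)

static uint64_t filter_flags_mask;
static uint64_t filter_flags_value;

static int filter_flags(uint64_t page_flags)
{
    if (!filter_flags_mask)
        return 0;       /* filtering disabled */
    return (page_flags & filter_flags_mask) == filter_flags_value
        ? 0 : -1;       /* -EINVAL in the kernel */
}

int main(void)
{
    /* only poison clean LRU pages: LRU must be set, DIRTY must be clear */
    filter_flags_mask = F_LRU | F_DIRTY;
    filter_flags_value = F_LRU;

    printf("clean lru: %d\n", filter_flags(F_LRU));             /* 0 */
    printf("dirty lru: %d\n", filter_flags(F_LRU | F_DIRTY));   /* -1 */
    printf("slab page: %d\n", filter_flags(F_SLAB));            /* -1 */
    return 0;
}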
110
111/*
112 * This allows stress tests to limit test scope to a collection of tasks
113 * by putting them under some memcg. This prevents killing unrelated/important
114 * processes such as /sbin/init. Note that the target task may share clean
115 * pages with init (e.g. libc text), which is harmless. If the target task
116 * shares _dirty_ pages with another task B, the test scheme must make sure B
117 * is also included in the memcg. Finally, due to race conditions this filter
118 * can only guarantee that the page either belongs to the memcg tasks, or is
119 * a freed page.
120 */
121#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
122u64 hwpoison_filter_memcg;
123EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
124static int hwpoison_filter_task(struct page *p)
125{
126 struct mem_cgroup *mem;
127 struct cgroup_subsys_state *css;
128 unsigned long ino;
129
130 if (!hwpoison_filter_memcg)
131 return 0;
132
133 mem = try_get_mem_cgroup_from_page(p);
134 if (!mem)
135 return -EINVAL;
136
137 css = mem_cgroup_css(mem);
138 /* root_mem_cgroup has NULL dentries */
139 if (!css->cgroup->dentry)
140 return -EINVAL;
141
142 ino = css->cgroup->dentry->d_inode->i_ino;
143 css_put(css);
144
145 if (ino != hwpoison_filter_memcg)
146 return -EINVAL;
147
148 return 0;
149}
150#else
151static int hwpoison_filter_task(struct page *p) { return 0; }
152#endif
153
154int hwpoison_filter(struct page *p)
155{
156 if (!hwpoison_filter_enable)
157 return 0;
158
159 if (hwpoison_filter_dev(p))
160 return -EINVAL;
161
162 if (hwpoison_filter_flags(p))
163 return -EINVAL;
164
165 if (hwpoison_filter_task(p))
166 return -EINVAL;
167
168 return 0;
169}
170#else
171int hwpoison_filter(struct page *p)
172{
173 return 0;
174}
175#endif
176
177EXPORT_SYMBOL_GPL(hwpoison_filter);
178
51/* 179/*
52 * Send all the processes who have the page mapped an ``action optional'' 180 * Send all the processes who have the page mapped an ``action optional''
53 * signal. 181 * signal.
@@ -83,6 +211,36 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
83} 211}
84 212
85/* 213/*
214 * When an unknown page type is encountered, drain as many buffers as possible
215 * in the hope of turning the page into an LRU or free page, which we can handle.
216 */
217void shake_page(struct page *p, int access)
218{
219 if (!PageSlab(p)) {
220 lru_add_drain_all();
221 if (PageLRU(p))
222 return;
223 drain_all_pages();
224 if (PageLRU(p) || is_free_buddy_page(p))
225 return;
226 }
227
228 /*
229 * Only call shrink_slab here (which would also
230 * shrink other caches) if access is not potentially fatal.
231 */
232 if (access) {
233 int nr;
234 do {
235 nr = shrink_slab(1000, GFP_KERNEL, 1000);
236 if (page_count(p) == 0)
237 break;
238 } while (nr > 10);
239 }
240}
241EXPORT_SYMBOL_GPL(shake_page);
242
243/*
86 * Kill all processes that have a poisoned page mapped and then isolate 244 * Kill all processes that have a poisoned page mapped and then isolate
87 * the page. 245 * the page.
88 * 246 *
@@ -174,10 +332,9 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
174 list_for_each_entry_safe (tk, next, to_kill, nd) { 332 list_for_each_entry_safe (tk, next, to_kill, nd) {
175 if (doit) { 333 if (doit) {
176 /* 334 /*
177 * In case something went wrong with munmaping 335 * In case something went wrong with munmapping
178 * make sure the process doesn't catch the 336 * make sure the process doesn't catch the
179 * signal and then access the memory. Just kill it. 337 * signal and then access the memory. Just kill it.
180 * the signal handlers
181 */ 338 */
182 if (fail || tk->addr_valid == 0) { 339 if (fail || tk->addr_valid == 0) {
183 printk(KERN_ERR 340 printk(KERN_ERR
@@ -227,9 +384,12 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
227 if (av == NULL) /* Not actually mapped anymore */ 384 if (av == NULL) /* Not actually mapped anymore */
228 goto out; 385 goto out;
229 for_each_process (tsk) { 386 for_each_process (tsk) {
387 struct anon_vma_chain *vmac;
388
230 if (!task_early_kill(tsk)) 389 if (!task_early_kill(tsk))
231 continue; 390 continue;
232 list_for_each_entry (vma, &av->head, anon_vma_node) { 391 list_for_each_entry(vmac, &av->head, same_anon_vma) {
392 vma = vmac->vma;
233 if (!page_mapped_in_vma(page, vma)) 393 if (!page_mapped_in_vma(page, vma))
234 continue; 394 continue;
235 if (vma->vm_mm == tsk->mm) 395 if (vma->vm_mm == tsk->mm)
@@ -314,33 +474,49 @@ static void collect_procs(struct page *page, struct list_head *tokill)
314 */ 474 */
315 475
316enum outcome { 476enum outcome {
317 FAILED, /* Error handling failed */ 477 IGNORED, /* Error: cannot be handled */
478 FAILED, /* Error: handling failed */
318 DELAYED, /* Will be handled later */ 479 DELAYED, /* Will be handled later */
319 IGNORED, /* Error safely ignored */
320 RECOVERED, /* Successfully recovered */ 480 RECOVERED, /* Successfully recovered */
321}; 481};
322 482
323static const char *action_name[] = { 483static const char *action_name[] = {
484 [IGNORED] = "Ignored",
324 [FAILED] = "Failed", 485 [FAILED] = "Failed",
325 [DELAYED] = "Delayed", 486 [DELAYED] = "Delayed",
326 [IGNORED] = "Ignored",
327 [RECOVERED] = "Recovered", 487 [RECOVERED] = "Recovered",
328}; 488};
329 489
330/* 490/*
331 * Error hit kernel page. 491 * XXX: It is possible that a page is isolated from LRU cache,
332 * Do nothing, try to be lucky and not touch this instead. For a few cases we 492 * and then kept in swap cache or failed to remove from page cache.
333 * could be more sophisticated. 493 * The page count will stop it from being freed by unpoison.
494 * Stress tests should be aware of this memory leak problem.
334 */ 495 */
335static int me_kernel(struct page *p, unsigned long pfn) 496static int delete_from_lru_cache(struct page *p)
336{ 497{
337 return DELAYED; 498 if (!isolate_lru_page(p)) {
499 /*
500 * Clear sensitive page flags, so that the buddy system won't
501 * complain when the page is later unpoisoned and freed.
502 */
503 ClearPageActive(p);
504 ClearPageUnevictable(p);
505 /*
506 * drop the page count elevated by isolate_lru_page()
507 */
508 page_cache_release(p);
509 return 0;
510 }
511 return -EIO;
338} 512}
339 513
340/* 514/*
341 * Already poisoned page. 515 * Error hit kernel page.
516 * Do nothing, try to be lucky and not touch this instead. For a few cases we
517 * could be more sophisticated.
342 */ 518 */
343static int me_ignore(struct page *p, unsigned long pfn) 519static int me_kernel(struct page *p, unsigned long pfn)
344{ 520{
345 return IGNORED; 521 return IGNORED;
346} 522}
@@ -355,14 +531,6 @@ static int me_unknown(struct page *p, unsigned long pfn)
355} 531}
356 532
357/* 533/*
358 * Free memory
359 */
360static int me_free(struct page *p, unsigned long pfn)
361{
362 return DELAYED;
363}
364
365/*
366 * Clean (or cleaned) page cache page. 534 * Clean (or cleaned) page cache page.
367 */ 535 */
368static int me_pagecache_clean(struct page *p, unsigned long pfn) 536static int me_pagecache_clean(struct page *p, unsigned long pfn)
@@ -371,6 +539,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
371 int ret = FAILED; 539 int ret = FAILED;
372 struct address_space *mapping; 540 struct address_space *mapping;
373 541
542 delete_from_lru_cache(p);
543
374 /* 544 /*
375 * For anonymous pages we're done; the only reference left 545
376 * should be the one m_f() holds. 546 * should be the one m_f() holds.
@@ -500,14 +670,20 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
500 /* Trigger EIO in shmem: */ 670 /* Trigger EIO in shmem: */
501 ClearPageUptodate(p); 671 ClearPageUptodate(p);
502 672
503 return DELAYED; 673 if (!delete_from_lru_cache(p))
674 return DELAYED;
675 else
676 return FAILED;
504} 677}
505 678
506static int me_swapcache_clean(struct page *p, unsigned long pfn) 679static int me_swapcache_clean(struct page *p, unsigned long pfn)
507{ 680{
508 delete_from_swap_cache(p); 681 delete_from_swap_cache(p);
509 682
510 return RECOVERED; 683 if (!delete_from_lru_cache(p))
684 return RECOVERED;
685 else
686 return FAILED;
511} 687}
512 688
513/* 689/*
@@ -550,7 +726,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
550#define tail (1UL << PG_tail) 726#define tail (1UL << PG_tail)
551#define compound (1UL << PG_compound) 727#define compound (1UL << PG_compound)
552#define slab (1UL << PG_slab) 728#define slab (1UL << PG_slab)
553#define buddy (1UL << PG_buddy)
554#define reserved (1UL << PG_reserved) 729#define reserved (1UL << PG_reserved)
555 730
556static struct page_state { 731static struct page_state {
@@ -559,8 +734,11 @@ static struct page_state {
559 char *msg; 734 char *msg;
560 int (*action)(struct page *p, unsigned long pfn); 735 int (*action)(struct page *p, unsigned long pfn);
561} error_states[] = { 736} error_states[] = {
562 { reserved, reserved, "reserved kernel", me_ignore }, 737 { reserved, reserved, "reserved kernel", me_kernel },
563 { buddy, buddy, "free kernel", me_free }, 738 /*
739 * free pages are specially detected outside this table:
740 * PG_buddy pages only make a small fraction of all free pages.
741 */
564 742
565 /* 743 /*
566 * Could in theory check if slab page is free or if we can drop 744 * Could in theory check if slab page is free or if we can drop
@@ -582,14 +760,11 @@ static struct page_state {
582 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, 760 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
583 { unevict, unevict, "unevictable LRU", me_pagecache_clean}, 761 { unevict, unevict, "unevictable LRU", me_pagecache_clean},
584 762
585#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
586 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, 763 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
587 { mlock, mlock, "mlocked LRU", me_pagecache_clean }, 764 { mlock, mlock, "mlocked LRU", me_pagecache_clean },
588#endif
589 765
590 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, 766 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
591 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 767 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
592 { swapbacked, swapbacked, "anonymous", me_pagecache_clean },
593 768
594 /* 769 /*
595 * Catchall entry: must be at end. 770 * Catchall entry: must be at end.
@@ -597,20 +772,31 @@ static struct page_state {
597 { 0, 0, "unknown page state", me_unknown }, 772 { 0, 0, "unknown page state", me_unknown },
598}; 773};
599 774
775#undef dirty
776#undef sc
777#undef unevict
778#undef mlock
779#undef writeback
780#undef lru
781#undef swapbacked
782#undef head
783#undef tail
784#undef compound
785#undef slab
786#undef reserved
787
600static void action_result(unsigned long pfn, char *msg, int result) 788static void action_result(unsigned long pfn, char *msg, int result)
601{ 789{
602 struct page *page = NULL; 790 struct page *page = pfn_to_page(pfn);
603 if (pfn_valid(pfn))
604 page = pfn_to_page(pfn);
605 791
606 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", 792 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
607 pfn, 793 pfn,
608 page && PageDirty(page) ? "dirty " : "", 794 PageDirty(page) ? "dirty " : "",
609 msg, action_name[result]); 795 msg, action_name[result]);
610} 796}
611 797
612static int page_action(struct page_state *ps, struct page *p, 798static int page_action(struct page_state *ps, struct page *p,
613 unsigned long pfn, int ref) 799 unsigned long pfn)
614{ 800{
615 int result; 801 int result;
616 int count; 802 int count;
@@ -618,18 +804,22 @@ static int page_action(struct page_state *ps, struct page *p,
618 result = ps->action(p, pfn); 804 result = ps->action(p, pfn);
619 action_result(pfn, ps->msg, result); 805 action_result(pfn, ps->msg, result);
620 806
621 count = page_count(p) - 1 - ref; 807 count = page_count(p) - 1;
622 if (count != 0) 808 if (ps->action == me_swapcache_dirty && result == DELAYED)
809 count--;
810 if (count != 0) {
623 printk(KERN_ERR 811 printk(KERN_ERR
624 "MCE %#lx: %s page still referenced by %d users\n", 812 "MCE %#lx: %s page still referenced by %d users\n",
625 pfn, ps->msg, count); 813 pfn, ps->msg, count);
814 result = FAILED;
815 }
626 816
627 /* Could do more checks here if page looks ok */ 817 /* Could do more checks here if page looks ok */
628 /* 818 /*
629 * Could adjust zone counters here to correct for the missing page. 819 * Could adjust zone counters here to correct for the missing page.
630 */ 820 */
631 821
632 return result == RECOVERED ? 0 : -EBUSY; 822 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
633} 823}
634 824
635#define N_UNMAP_TRIES 5 825#define N_UNMAP_TRIES 5
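
The reference-count check in page_action() above encodes a simple invariant: after a handler runs, the only references left should be the one __memory_failure() took itself, plus the swap-cache reference for a dirty swap-cache page whose handling was merely DELAYED. A hedged sketch of that bookkeeping, using the file's own types (illustrative only):

static int expected_refcount(const struct page_state *ps, int result)
{
	int refs = 1;	/* the get_page_unless_zero() taken by __memory_failure() */

	if (ps->action == me_swapcache_dirty && result == DELAYED)
		refs++;	/* the page is intentionally left in the swap cache */
	return refs;
}

Any count beyond that is reported as "still referenced" and turns the result into FAILED.
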
@@ -638,7 +828,7 @@ static int page_action(struct page_state *ps, struct page *p,
638 * Do all that is necessary to remove user space mappings. Unmap 828 * Do all that is necessary to remove user space mappings. Unmap
639 * the pages and send SIGBUS to the processes if the data was dirty. 829 * the pages and send SIGBUS to the processes if the data was dirty.
640 */ 830 */
641static void hwpoison_user_mappings(struct page *p, unsigned long pfn, 831static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
642 int trapno) 832 int trapno)
643{ 833{
644 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 834 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
@@ -648,15 +838,18 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
648 int i; 838 int i;
649 int kill = 1; 839 int kill = 1;
650 840
651 if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) 841 if (PageReserved(p) || PageSlab(p))
652 return; 842 return SWAP_SUCCESS;
653 843
654 /* 844 /*
655 * This check implies we don't kill processes if their pages 845 * This check implies we don't kill processes if their pages
656 * are in the swap cache early. Those are always late kills. 846 * are in the swap cache early. Those are always late kills.
657 */ 847 */
658 if (!page_mapped(p)) 848 if (!page_mapped(p))
659 return; 849 return SWAP_SUCCESS;
850
851 if (PageCompound(p) || PageKsm(p))
852 return SWAP_FAIL;
660 853
661 if (PageSwapCache(p)) { 854 if (PageSwapCache(p)) {
662 printk(KERN_ERR 855 printk(KERN_ERR
@@ -667,6 +860,8 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
667 /* 860 /*
668 * Propagate the dirty bit from PTEs to struct page first, because we 861 * Propagate the dirty bit from PTEs to struct page first, because we
669 * need this to decide if we should kill or just drop the page. 862 * need this to decide if we should kill or just drop the page.
863 * XXX: the dirty test could be racy: set_page_dirty() may not always
864 * be called inside page lock (it's recommended but not enforced).
670 */ 865 */
671 mapping = page_mapping(p); 866 mapping = page_mapping(p);
672 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { 867 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
@@ -718,11 +913,12 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
718 */ 913 */
719 kill_procs_ao(&tokill, !!PageDirty(p), trapno, 914 kill_procs_ao(&tokill, !!PageDirty(p), trapno,
720 ret != SWAP_SUCCESS, pfn); 915 ret != SWAP_SUCCESS, pfn);
916
917 return ret;
721} 918}
722 919
723int __memory_failure(unsigned long pfn, int trapno, int ref) 920int __memory_failure(unsigned long pfn, int trapno, int flags)
724{ 921{
725 unsigned long lru_flag;
726 struct page_state *ps; 922 struct page_state *ps;
727 struct page *p; 923 struct page *p;
728 int res; 924 int res;
@@ -731,13 +927,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
731 panic("Memory failure from trap %d on page %lx", trapno, pfn); 927 panic("Memory failure from trap %d on page %lx", trapno, pfn);
732 928
733 if (!pfn_valid(pfn)) { 929 if (!pfn_valid(pfn)) {
734 action_result(pfn, "memory outside kernel control", IGNORED); 930 printk(KERN_ERR
735 return -EIO; 931 "MCE %#lx: memory outside kernel control\n",
932 pfn);
933 return -ENXIO;
736 } 934 }
737 935
738 p = pfn_to_page(pfn); 936 p = pfn_to_page(pfn);
739 if (TestSetPageHWPoison(p)) { 937 if (TestSetPageHWPoison(p)) {
740 action_result(pfn, "already hardware poisoned", IGNORED); 938 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
741 return 0; 939 return 0;
742 } 940 }
743 941
@@ -754,9 +952,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
754 * In fact it's dangerous to directly bump up page count from 0, 952 * In fact it's dangerous to directly bump up page count from 0,
755 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 953 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
756 */ 954 */
757 if (!get_page_unless_zero(compound_head(p))) { 955 if (!(flags & MF_COUNT_INCREASED) &&
758 action_result(pfn, "free or high order kernel", IGNORED); 956 !get_page_unless_zero(compound_head(p))) {
759 return PageBuddy(compound_head(p)) ? 0 : -EBUSY; 957 if (is_free_buddy_page(p)) {
958 action_result(pfn, "free buddy", DELAYED);
959 return 0;
960 } else {
961 action_result(pfn, "high order kernel", IGNORED);
962 return -EBUSY;
963 }
760 } 964 }
761 965
762 /* 966 /*
@@ -768,14 +972,19 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
768 * walked by the page reclaim code, however that's not a big loss. 972 * walked by the page reclaim code, however that's not a big loss.
769 */ 973 */
770 if (!PageLRU(p)) 974 if (!PageLRU(p))
771 lru_add_drain_all(); 975 shake_page(p, 0);
772 lru_flag = p->flags & lru; 976 if (!PageLRU(p)) {
773 if (isolate_lru_page(p)) { 977 /*
978 * shake_page could have turned it free.
979 */
980 if (is_free_buddy_page(p)) {
981 action_result(pfn, "free buddy, 2nd try", DELAYED);
982 return 0;
983 }
774 action_result(pfn, "non LRU", IGNORED); 984 action_result(pfn, "non LRU", IGNORED);
775 put_page(p); 985 put_page(p);
776 return -EBUSY; 986 return -EBUSY;
777 } 987 }
778 page_cache_release(p);
779 988
780 /* 989 /*
781 * Lock the page and wait for writeback to finish. 990 * Lock the page and wait for writeback to finish.
@@ -783,26 +992,48 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
783 * and in many cases impossible, so we just avoid it here. 992 * and in many cases impossible, so we just avoid it here.
784 */ 993 */
785 lock_page_nosync(p); 994 lock_page_nosync(p);
995
996 /*
997 * unpoison always clears PG_hwpoison inside the page lock
998 */
999 if (!PageHWPoison(p)) {
1000 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
1001 res = 0;
1002 goto out;
1003 }
1004 if (hwpoison_filter(p)) {
1005 if (TestClearPageHWPoison(p))
1006 atomic_long_dec(&mce_bad_pages);
1007 unlock_page(p);
1008 put_page(p);
1009 return 0;
1010 }
1011
786 wait_on_page_writeback(p); 1012 wait_on_page_writeback(p);
787 1013
788 /* 1014 /*
789 * Now take care of user space mappings. 1015 * Now take care of user space mappings.
1016 * Abort on failure: __remove_from_page_cache() assumes an unmapped page.
790 */ 1017 */
791 hwpoison_user_mappings(p, pfn, trapno); 1018 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
1019 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1020 res = -EBUSY;
1021 goto out;
1022 }
792 1023
793 /* 1024 /*
794 * Torn down by someone else? 1025 * Torn down by someone else?
795 */ 1026 */
796 if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) { 1027 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
797 action_result(pfn, "already truncated LRU", IGNORED); 1028 action_result(pfn, "already truncated LRU", IGNORED);
798 res = 0; 1029 res = -EBUSY;
799 goto out; 1030 goto out;
800 } 1031 }
801 1032
802 res = -EBUSY; 1033 res = -EBUSY;
803 for (ps = error_states;; ps++) { 1034 for (ps = error_states;; ps++) {
804 if (((p->flags | lru_flag)& ps->mask) == ps->res) { 1035 if ((p->flags & ps->mask) == ps->res) {
805 res = page_action(ps, p, pfn, ref); 1036 res = page_action(ps, p, pfn);
806 break; 1037 break;
807 } 1038 }
808 } 1039 }
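
The loop above is a first-match scan of error_states[]: each entry masks the page flags and compares against the expected bit pattern, and the catch-all { 0, 0, ... } entry at the end of the table guarantees the scan terminates. A standalone sketch of the same lookup (illustrative, not part of the patch):

static struct page_state *find_page_state(unsigned long page_flags)
{
	struct page_state *ps;

	for (ps = error_states; ; ps++)
		if ((page_flags & ps->mask) == ps->res)
			return ps;	/* the catch-all entry always matches */
}
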
@@ -833,3 +1064,235 @@ void memory_failure(unsigned long pfn, int trapno)
833{ 1064{
834 __memory_failure(pfn, trapno, 0); 1065 __memory_failure(pfn, trapno, 0);
835} 1066}
1067
1068/**
1069 * unpoison_memory - Unpoison a previously poisoned page
1070 * @pfn: Page number of the page to be unpoisoned
1071 *
1072 * Software-unpoison a page that has been poisoned by
1073 * memory_failure() earlier.
1074 *
1075 * This is only done at the software level, so it only works
1076 * for Linux-injected failures, not real hardware failures.
1077 *
1078 * Returns 0 for success, otherwise -errno.
1079 */
1080int unpoison_memory(unsigned long pfn)
1081{
1082 struct page *page;
1083 struct page *p;
1084 int freeit = 0;
1085
1086 if (!pfn_valid(pfn))
1087 return -ENXIO;
1088
1089 p = pfn_to_page(pfn);
1090 page = compound_head(p);
1091
1092 if (!PageHWPoison(p)) {
1093 pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
1094 return 0;
1095 }
1096
1097 if (!get_page_unless_zero(page)) {
1098 if (TestClearPageHWPoison(p))
1099 atomic_long_dec(&mce_bad_pages);
1100 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
1101 return 0;
1102 }
1103
1104 lock_page_nosync(page);
1105 /*
1106 * This test is racy because PG_hwpoison is set outside of the page lock.
1107 * That's acceptable because it won't trigger a kernel panic. Instead,
1108 * the PG_hwpoison page will be caught and isolated at the entrance to
1109 * the free buddy page pool.
1110 */
1111 if (TestClearPageHWPoison(p)) {
1112 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
1113 atomic_long_dec(&mce_bad_pages);
1114 freeit = 1;
1115 }
1116 unlock_page(page);
1117
1118 put_page(page);
1119 if (freeit)
1120 put_page(page);
1121
1122 return 0;
1123}
1124EXPORT_SYMBOL(unpoison_memory);
1125
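
unpoison_memory() is meant to be driven from test tooling such as the hwpoison-inject module added elsewhere in this series. A hedged sketch of such a hook follows; the attribute and function names are made up for illustration (consult hwpoison-inject.c for the real interface):

static int sketch_unpoison_set(void *data, u64 val)
{
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	return unpoison_memory((unsigned long)val);
}
DEFINE_SIMPLE_ATTRIBUTE(sketch_unpoison_fops, NULL, sketch_unpoison_set, "%llu\n");

The resulting fops would then be registered with debugfs_create_file() by the test module.
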
1126static struct page *new_page(struct page *p, unsigned long private, int **x)
1127{
1128 int nid = page_to_nid(p);
1129 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1130}
1131
1132/*
1133 * Safely get reference count of an arbitrary page.
1134 * Returns 0 for a free page, -EIO for a zero refcount page
1135 * that is not free, and 1 for any other page type.
1136 * When 1 is returned the page comes back with an elevated page count; otherwise it does not.
1137 */
1138static int get_any_page(struct page *p, unsigned long pfn, int flags)
1139{
1140 int ret;
1141
1142 if (flags & MF_COUNT_INCREASED)
1143 return 1;
1144
1145 /*
1146 * The lock_system_sleep prevents a race with memory hotplug,
1147 * because the isolation assumes there's only a single user.
1148 * This is a big hammer; a finer-grained solution would be nicer.
1149 */
1150 lock_system_sleep();
1151
1152 /*
1153 * Isolate the page, so that it doesn't get reallocated if it
1154 * was free.
1155 */
1156 set_migratetype_isolate(p);
1157 if (!get_page_unless_zero(compound_head(p))) {
1158 if (is_free_buddy_page(p)) {
1159 pr_debug("get_any_page: %#lx free buddy page\n", pfn);
1160 /* Set hwpoison bit while page is still isolated */
1161 SetPageHWPoison(p);
1162 ret = 0;
1163 } else {
1164 pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1165 pfn, p->flags);
1166 ret = -EIO;
1167 }
1168 } else {
1169 /* Not a free page */
1170 ret = 1;
1171 }
1172 unset_migratetype_isolate(p);
1173 unlock_system_sleep();
1174 return ret;
1175}
1176
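
The return convention of get_any_page() matters to its callers: only a return value of 1 leaves the caller holding a page reference. A small usage sketch (illustrative only):

static void get_any_page_usage(struct page *page, unsigned long pfn)
{
	int ret = get_any_page(page, pfn, 0);

	if (ret < 0)
		return;		/* zero-refcount page of unknown type */
	if (ret == 0)
		return;		/* free page; already marked HWPoison */
	/* ret == 1: the page is pinned; work on it, then drop the pin */
	put_page(page);
}
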
1177/**
1178 * soft_offline_page - Soft offline a page.
1179 * @page: page to offline
1180 * @flags: flags. Same as memory_failure().
1181 *
1182 * Returns 0 on success, otherwise negated errno.
1183 *
1184 * Soft offline a page, by migration or invalidation,
1185 * without killing anything. This is for the case when
1186 * a page is not corrupted yet (so it's still valid to access),
1187 * but has had a number of corrected errors and is better taken
1188 * out.
1189 *
1190 * The actual policy on when to do that is maintained by
1191 * user space.
1192 *
1193 * This should never impact any application or cause data loss;
1194 * however, it might take some time.
1195 *
1196 * This is not a 100% solution for all memory, but tries to be
1197 * ``good enough'' for the majority of memory.
1198 */
1199int soft_offline_page(struct page *page, int flags)
1200{
1201 int ret;
1202 unsigned long pfn = page_to_pfn(page);
1203
1204 ret = get_any_page(page, pfn, flags);
1205 if (ret < 0)
1206 return ret;
1207 if (ret == 0)
1208 goto done;
1209
1210 /*
1211 * Page cache page we can handle?
1212 */
1213 if (!PageLRU(page)) {
1214 /*
1215 * Try to free it.
1216 */
1217 put_page(page);
1218 shake_page(page, 1);
1219
1220 /*
1221 * Did it turn free?
1222 */
1223 ret = get_any_page(page, pfn, 0);
1224 if (ret < 0)
1225 return ret;
1226 if (ret == 0)
1227 goto done;
1228 }
1229 if (!PageLRU(page)) {
1230 pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
1231 pfn, page->flags);
1232 return -EIO;
1233 }
1234
1235 lock_page(page);
1236 wait_on_page_writeback(page);
1237
1238 /*
1239 * Synchronized using the page lock with memory_failure()
1240 */
1241 if (PageHWPoison(page)) {
1242 unlock_page(page);
1243 put_page(page);
1244 pr_debug("soft offline: %#lx page already poisoned\n", pfn);
1245 return -EBUSY;
1246 }
1247
1248 /*
1249 * Try to invalidate first. This should work for
1250 * non-dirty, unmapped page cache pages.
1251 */
1252 ret = invalidate_inode_page(page);
1253 unlock_page(page);
1254
1255 /*
1256 * Drop count because page migration doesn't like raised
1257 * counts. The page could get re-allocated, but if it becomes
1258 * LRU the isolation will just fail.
1259 * RED-PEN: it would be better to keep the page isolated here, but we
1260 * would need to fix the isolation locking first.
1261 */
1262 put_page(page);
1263 if (ret == 1) {
1264 ret = 0;
1265 pr_debug("soft_offline: %#lx: invalidated\n", pfn);
1266 goto done;
1267 }
1268
1269 /*
1270 * Simple invalidation didn't work.
1271 * Try to migrate to a new page instead. migrate.c
1272 * handles a large number of cases for us.
1273 */
1274 ret = isolate_lru_page(page);
1275 if (!ret) {
1276 LIST_HEAD(pagelist);
1277
1278 list_add(&page->lru, &pagelist);
1279 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1280 if (ret) {
1281 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1282 pfn, ret, page->flags);
1283 if (ret > 0)
1284 ret = -EIO;
1285 }
1286 } else {
1287 pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1288 pfn, ret, page_count(page), page->flags);
1289 }
1290 if (ret)
1291 return ret;
1292
1293done:
1294 atomic_long_add(1, &mce_bad_pages);
1295 SetPageHWPoison(page);
1296 /* keep elevated page count for bad page */
1297 return ret;
1298}
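
soft_offline_page() is expected to be invoked by a user-space or driver-level policy once a page has accumulated enough corrected errors. The sketch below is a hypothetical policy hook; the threshold and function name are invented for illustration:

#define CE_SOFT_OFFLINE_THRESHOLD	16	/* hypothetical policy knob */

static void maybe_soft_offline(unsigned long pfn, unsigned int ce_count)
{
	struct page *page;

	if (ce_count < CE_SOFT_OFFLINE_THRESHOLD || !pfn_valid(pfn))
		return;

	page = pfn_to_page(pfn);
	if (soft_offline_page(page, 0))
		pr_debug("soft offline of pfn %#lx failed\n", pfn);
}
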
diff --git a/mm/memory.c b/mm/memory.c
index 6ab19dd4a199..833952d8b74d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -56,6 +56,7 @@
56#include <linux/kallsyms.h> 56#include <linux/kallsyms.h>
57#include <linux/swapops.h> 57#include <linux/swapops.h>
58#include <linux/elf.h> 58#include <linux/elf.h>
59#include <linux/gfp.h>
59 60
60#include <asm/io.h> 61#include <asm/io.h>
61#include <asm/pgalloc.h> 62#include <asm/pgalloc.h>
@@ -121,6 +122,77 @@ static int __init init_zero_pfn(void)
121} 122}
122core_initcall(init_zero_pfn); 123core_initcall(init_zero_pfn);
123 124
125
126#if defined(SPLIT_RSS_COUNTING)
127
128static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
129{
130 int i;
131
132 for (i = 0; i < NR_MM_COUNTERS; i++) {
133 if (task->rss_stat.count[i]) {
134 add_mm_counter(mm, i, task->rss_stat.count[i]);
135 task->rss_stat.count[i] = 0;
136 }
137 }
138 task->rss_stat.events = 0;
139}
140
141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
142{
143 struct task_struct *task = current;
144
145 if (likely(task->mm == mm))
146 task->rss_stat.count[member] += val;
147 else
148 add_mm_counter(mm, member, val);
149}
150#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
151#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
152
153/* sync counter once per 64 page faults */
154#define TASK_RSS_EVENTS_THRESH (64)
155static void check_sync_rss_stat(struct task_struct *task)
156{
157 if (unlikely(task != current))
158 return;
159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
160 __sync_task_rss_stat(task, task->mm);
161}
162
163unsigned long get_mm_counter(struct mm_struct *mm, int member)
164{
165 long val = 0;
166
167 /*
168 * Don't use task->mm here, to avoid having to use task_get_mm().
169 * The caller must guarantee that task->mm is valid.
170 */
171 val = atomic_long_read(&mm->rss_stat.count[member]);
172 /*
173 * The counter is updated asynchronously and may temporarily go negative,
174 * but a negative value is never what users expect, so clamp it to zero.
175 */
176 if (val < 0)
177 return 0;
178 return (unsigned long)val;
179}
180
181void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
182{
183 __sync_task_rss_stat(task, mm);
184}
185#else
186
187#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
188#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
189
190static void check_sync_rss_stat(struct task_struct *task)
191{
192}
193
194#endif
195
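
With SPLIT_RSS_COUNTING, each task caches its RSS deltas and folds them into mm->rss_stat only every TASK_RSS_EVENTS_THRESH page faults (or at explicit sync points), so readers see a value that may lag by roughly that many faults' worth of pages and is clamped at zero. A reader that accepts this bounded staleness might look like the following sketch (the helper name is illustrative):

static unsigned long approx_rss_pages(struct mm_struct *mm)
{
	/* Both reads go through get_mm_counter(), which clamps negative values. */
	return get_mm_counter(mm, MM_ANONPAGES) +
	       get_mm_counter(mm, MM_FILEPAGES);
}
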
124/* 196/*
125 * If a p?d_bad entry is found while walking page tables, report 197 * If a p?d_bad entry is found while walking page tables, report
126 * the error, before resetting entry to p?d_none. Usually (but 198 * the error, before resetting entry to p?d_none. Usually (but
@@ -300,7 +372,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
300 * Hide vma from rmap and truncate_pagecache before freeing 372 * Hide vma from rmap and truncate_pagecache before freeing
301 * pgtables 373 * pgtables
302 */ 374 */
303 anon_vma_unlink(vma); 375 unlink_anon_vmas(vma);
304 unlink_file_vma(vma); 376 unlink_file_vma(vma);
305 377
306 if (is_vm_hugetlb_page(vma)) { 378 if (is_vm_hugetlb_page(vma)) {
@@ -314,7 +386,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
314 && !is_vm_hugetlb_page(next)) { 386 && !is_vm_hugetlb_page(next)) {
315 vma = next; 387 vma = next;
316 next = vma->vm_next; 388 next = vma->vm_next;
317 anon_vma_unlink(vma); 389 unlink_anon_vmas(vma);
318 unlink_file_vma(vma); 390 unlink_file_vma(vma);
319 } 391 }
320 free_pgd_range(tlb, addr, vma->vm_end, 392 free_pgd_range(tlb, addr, vma->vm_end,
@@ -376,12 +448,20 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
376 return 0; 448 return 0;
377} 449}
378 450
379static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) 451static inline void init_rss_vec(int *rss)
380{ 452{
381 if (file_rss) 453 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
382 add_mm_counter(mm, file_rss, file_rss); 454}
383 if (anon_rss) 455
384 add_mm_counter(mm, anon_rss, anon_rss); 456static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
457{
458 int i;
459
460 if (current->mm == mm)
461 sync_mm_rss(current, mm);
462 for (i = 0; i < NR_MM_COUNTERS; i++)
463 if (rss[i])
464 add_mm_counter(mm, i, rss[i]);
385} 465}
386 466
387/* 467/*
@@ -430,12 +510,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
430 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", 510 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
431 current->comm, 511 current->comm,
432 (long long)pte_val(pte), (long long)pmd_val(*pmd)); 512 (long long)pte_val(pte), (long long)pmd_val(*pmd));
433 if (page) { 513 if (page)
434 printk(KERN_ALERT 514 dump_page(page);
435 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
436 page, (void *)page->flags, page_count(page),
437 page_mapcount(page), page->mapping, page->index);
438 }
439 printk(KERN_ALERT 515 printk(KERN_ALERT
440 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", 516 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
441 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); 517 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
@@ -572,7 +648,7 @@ out:
572 * covered by this vma. 648 * covered by this vma.
573 */ 649 */
574 650
575static inline void 651static inline unsigned long
576copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, 652copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
577 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, 653 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
578 unsigned long addr, int *rss) 654 unsigned long addr, int *rss)
@@ -586,7 +662,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
586 if (!pte_file(pte)) { 662 if (!pte_file(pte)) {
587 swp_entry_t entry = pte_to_swp_entry(pte); 663 swp_entry_t entry = pte_to_swp_entry(pte);
588 664
589 swap_duplicate(entry); 665 if (swap_duplicate(entry) < 0)
666 return entry.val;
667
590 /* make sure dst_mm is on swapoff's mmlist. */ 668 /* make sure dst_mm is on swapoff's mmlist. */
591 if (unlikely(list_empty(&dst_mm->mmlist))) { 669 if (unlikely(list_empty(&dst_mm->mmlist))) {
592 spin_lock(&mmlist_lock); 670 spin_lock(&mmlist_lock);
@@ -595,7 +673,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
595 &src_mm->mmlist); 673 &src_mm->mmlist);
596 spin_unlock(&mmlist_lock); 674 spin_unlock(&mmlist_lock);
597 } 675 }
598 if (is_write_migration_entry(entry) && 676 if (likely(!non_swap_entry(entry)))
677 rss[MM_SWAPENTS]++;
678 else if (is_write_migration_entry(entry) &&
599 is_cow_mapping(vm_flags)) { 679 is_cow_mapping(vm_flags)) {
600 /* 680 /*
601 * COW mappings require pages in both parent 681 * COW mappings require pages in both parent
@@ -630,11 +710,15 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
630 if (page) { 710 if (page) {
631 get_page(page); 711 get_page(page);
632 page_dup_rmap(page); 712 page_dup_rmap(page);
633 rss[PageAnon(page)]++; 713 if (PageAnon(page))
714 rss[MM_ANONPAGES]++;
715 else
716 rss[MM_FILEPAGES]++;
634 } 717 }
635 718
636out_set_pte: 719out_set_pte:
637 set_pte_at(dst_mm, addr, dst_pte, pte); 720 set_pte_at(dst_mm, addr, dst_pte, pte);
721 return 0;
638} 722}
639 723
640static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 724static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -645,10 +729,12 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
645 pte_t *src_pte, *dst_pte; 729 pte_t *src_pte, *dst_pte;
646 spinlock_t *src_ptl, *dst_ptl; 730 spinlock_t *src_ptl, *dst_ptl;
647 int progress = 0; 731 int progress = 0;
648 int rss[2]; 732 int rss[NR_MM_COUNTERS];
733 swp_entry_t entry = (swp_entry_t){0};
649 734
650again: 735again:
651 rss[1] = rss[0] = 0; 736 init_rss_vec(rss);
737
652 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 738 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
653 if (!dst_pte) 739 if (!dst_pte)
654 return -ENOMEM; 740 return -ENOMEM;
@@ -674,16 +760,25 @@ again:
674 progress++; 760 progress++;
675 continue; 761 continue;
676 } 762 }
677 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); 763 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
764 vma, addr, rss);
765 if (entry.val)
766 break;
678 progress += 8; 767 progress += 8;
679 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 768 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
680 769
681 arch_leave_lazy_mmu_mode(); 770 arch_leave_lazy_mmu_mode();
682 spin_unlock(src_ptl); 771 spin_unlock(src_ptl);
683 pte_unmap_nested(orig_src_pte); 772 pte_unmap_nested(orig_src_pte);
684 add_mm_rss(dst_mm, rss[0], rss[1]); 773 add_mm_rss_vec(dst_mm, rss);
685 pte_unmap_unlock(orig_dst_pte, dst_ptl); 774 pte_unmap_unlock(orig_dst_pte, dst_ptl);
686 cond_resched(); 775 cond_resched();
776
777 if (entry.val) {
778 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
779 return -ENOMEM;
780 progress = 0;
781 }
687 if (addr != end) 782 if (addr != end)
688 goto again; 783 goto again;
689 return 0; 784 return 0;
@@ -803,8 +898,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
803 struct mm_struct *mm = tlb->mm; 898 struct mm_struct *mm = tlb->mm;
804 pte_t *pte; 899 pte_t *pte;
805 spinlock_t *ptl; 900 spinlock_t *ptl;
806 int file_rss = 0; 901 int rss[NR_MM_COUNTERS];
807 int anon_rss = 0; 902
903 init_rss_vec(rss);
808 904
809 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 905 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
810 arch_enter_lazy_mmu_mode(); 906 arch_enter_lazy_mmu_mode();
@@ -850,14 +946,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
850 set_pte_at(mm, addr, pte, 946 set_pte_at(mm, addr, pte,
851 pgoff_to_pte(page->index)); 947 pgoff_to_pte(page->index));
852 if (PageAnon(page)) 948 if (PageAnon(page))
853 anon_rss--; 949 rss[MM_ANONPAGES]--;
854 else { 950 else {
855 if (pte_dirty(ptent)) 951 if (pte_dirty(ptent))
856 set_page_dirty(page); 952 set_page_dirty(page);
857 if (pte_young(ptent) && 953 if (pte_young(ptent) &&
858 likely(!VM_SequentialReadHint(vma))) 954 likely(!VM_SequentialReadHint(vma)))
859 mark_page_accessed(page); 955 mark_page_accessed(page);
860 file_rss--; 956 rss[MM_FILEPAGES]--;
861 } 957 }
862 page_remove_rmap(page); 958 page_remove_rmap(page);
863 if (unlikely(page_mapcount(page) < 0)) 959 if (unlikely(page_mapcount(page) < 0))
@@ -874,13 +970,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
874 if (pte_file(ptent)) { 970 if (pte_file(ptent)) {
875 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) 971 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
876 print_bad_pte(vma, addr, ptent, NULL); 972 print_bad_pte(vma, addr, ptent, NULL);
877 } else if 973 } else {
878 (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) 974 swp_entry_t entry = pte_to_swp_entry(ptent);
879 print_bad_pte(vma, addr, ptent, NULL); 975
976 if (!non_swap_entry(entry))
977 rss[MM_SWAPENTS]--;
978 if (unlikely(!free_swap_and_cache(entry)))
979 print_bad_pte(vma, addr, ptent, NULL);
980 }
880 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 981 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
881 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); 982 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
882 983
883 add_mm_rss(mm, file_rss, anon_rss); 984 add_mm_rss_vec(mm, rss);
884 arch_leave_lazy_mmu_mode(); 985 arch_leave_lazy_mmu_mode();
885 pte_unmap_unlock(pte - 1, ptl); 986 pte_unmap_unlock(pte - 1, ptl);
886 987
@@ -943,6 +1044,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
943 details = NULL; 1044 details = NULL;
944 1045
945 BUG_ON(addr >= end); 1046 BUG_ON(addr >= end);
1047 mem_cgroup_uncharge_start();
946 tlb_start_vma(tlb, vma); 1048 tlb_start_vma(tlb, vma);
947 pgd = pgd_offset(vma->vm_mm, addr); 1049 pgd = pgd_offset(vma->vm_mm, addr);
948 do { 1050 do {
@@ -955,6 +1057,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
955 zap_work, details); 1057 zap_work, details);
956 } while (pgd++, addr = next, (addr != end && *zap_work > 0)); 1058 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
957 tlb_end_vma(tlb, vma); 1059 tlb_end_vma(tlb, vma);
1060 mem_cgroup_uncharge_end();
958 1061
959 return addr; 1062 return addr;
960} 1063}
@@ -1512,7 +1615,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1512 1615
1513 /* Ok, finally just insert the thing.. */ 1616 /* Ok, finally just insert the thing.. */
1514 get_page(page); 1617 get_page(page);
1515 inc_mm_counter(mm, file_rss); 1618 inc_mm_counter_fast(mm, MM_FILEPAGES);
1516 page_add_file_rmap(page); 1619 page_add_file_rmap(page);
1517 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 1620 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1518 1621
@@ -1578,7 +1681,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1578 /* Ok, finally just insert the thing.. */ 1681 /* Ok, finally just insert the thing.. */
1579 entry = pte_mkspecial(pfn_pte(pfn, prot)); 1682 entry = pte_mkspecial(pfn_pte(pfn, prot));
1580 set_pte_at(mm, addr, pte, entry); 1683 set_pte_at(mm, addr, pte, entry);
1581 update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ 1684 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
1582 1685
1583 retval = 0; 1686 retval = 0;
1584out_unlock: 1687out_unlock:
@@ -2029,6 +2132,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2029 page_cache_release(old_page); 2132 page_cache_release(old_page);
2030 } 2133 }
2031 reuse = reuse_swap_page(old_page); 2134 reuse = reuse_swap_page(old_page);
2135 if (reuse)
2136 /*
2137 * The page is all ours. Move it to our anon_vma so
2138 * the rmap code will not search our parent or siblings.
2139 * Protected against the rmap code by the page lock.
2140 */
2141 page_move_anon_rmap(old_page, vma, address);
2032 unlock_page(old_page); 2142 unlock_page(old_page);
2033 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2143 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2034 (VM_WRITE|VM_SHARED))) { 2144 (VM_WRITE|VM_SHARED))) {
@@ -2101,7 +2211,7 @@ reuse:
2101 entry = pte_mkyoung(orig_pte); 2211 entry = pte_mkyoung(orig_pte);
2102 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2212 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2103 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2213 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2104 update_mmu_cache(vma, address, entry); 2214 update_mmu_cache(vma, address, page_table);
2105 ret |= VM_FAULT_WRITE; 2215 ret |= VM_FAULT_WRITE;
2106 goto unlock; 2216 goto unlock;
2107 } 2217 }
@@ -2148,11 +2258,11 @@ gotten:
2148 if (likely(pte_same(*page_table, orig_pte))) { 2258 if (likely(pte_same(*page_table, orig_pte))) {
2149 if (old_page) { 2259 if (old_page) {
2150 if (!PageAnon(old_page)) { 2260 if (!PageAnon(old_page)) {
2151 dec_mm_counter(mm, file_rss); 2261 dec_mm_counter_fast(mm, MM_FILEPAGES);
2152 inc_mm_counter(mm, anon_rss); 2262 inc_mm_counter_fast(mm, MM_ANONPAGES);
2153 } 2263 }
2154 } else 2264 } else
2155 inc_mm_counter(mm, anon_rss); 2265 inc_mm_counter_fast(mm, MM_ANONPAGES);
2156 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2266 flush_cache_page(vma, address, pte_pfn(orig_pte));
2157 entry = mk_pte(new_page, vma->vm_page_prot); 2267 entry = mk_pte(new_page, vma->vm_page_prot);
2158 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2268 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2170,7 +2280,7 @@ gotten:
2170 * new page to be mapped directly into the secondary page table. 2280 * new page to be mapped directly into the secondary page table.
2171 */ 2281 */
2172 set_pte_at_notify(mm, address, page_table, entry); 2282 set_pte_at_notify(mm, address, page_table, entry);
2173 update_mmu_cache(vma, address, entry); 2283 update_mmu_cache(vma, address, page_table);
2174 if (old_page) { 2284 if (old_page) {
2175 /* 2285 /*
2176 * Only after switching the pte to the new page may 2286 * Only after switching the pte to the new page may
@@ -2514,7 +2624,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2514 ret = VM_FAULT_HWPOISON; 2624 ret = VM_FAULT_HWPOISON;
2515 } else { 2625 } else {
2516 print_bad_pte(vma, address, orig_pte, NULL); 2626 print_bad_pte(vma, address, orig_pte, NULL);
2517 ret = VM_FAULT_OOM; 2627 ret = VM_FAULT_SIGBUS;
2518 } 2628 }
2519 goto out; 2629 goto out;
2520 } 2630 }
@@ -2540,6 +2650,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2540 ret = VM_FAULT_MAJOR; 2650 ret = VM_FAULT_MAJOR;
2541 count_vm_event(PGMAJFAULT); 2651 count_vm_event(PGMAJFAULT);
2542 } else if (PageHWPoison(page)) { 2652 } else if (PageHWPoison(page)) {
2653 /*
2654 * hwpoisoned dirty swapcache pages are kept for killing
2655 * owner processes (which may be unknown at hwpoison time)
2656 */
2543 ret = VM_FAULT_HWPOISON; 2657 ret = VM_FAULT_HWPOISON;
2544 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2658 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2545 goto out_release; 2659 goto out_release;
@@ -2548,6 +2662,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2548 lock_page(page); 2662 lock_page(page);
2549 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2663 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2550 2664
2665 page = ksm_might_need_to_copy(page, vma, address);
2666 if (!page) {
2667 ret = VM_FAULT_OOM;
2668 goto out;
2669 }
2670
2551 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { 2671 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
2552 ret = VM_FAULT_OOM; 2672 ret = VM_FAULT_OOM;
2553 goto out_page; 2673 goto out_page;
@@ -2579,7 +2699,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2579 * discarded at swap_free(). 2699 * discarded at swap_free().
2580 */ 2700 */
2581 2701
2582 inc_mm_counter(mm, anon_rss); 2702 inc_mm_counter_fast(mm, MM_ANONPAGES);
2703 dec_mm_counter_fast(mm, MM_SWAPENTS);
2583 pte = mk_pte(page, vma->vm_page_prot); 2704 pte = mk_pte(page, vma->vm_page_prot);
2584 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { 2705 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2585 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2706 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2604,7 +2725,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2604 } 2725 }
2605 2726
2606 /* No need to invalidate - it was non-present before */ 2727 /* No need to invalidate - it was non-present before */
2607 update_mmu_cache(vma, address, pte); 2728 update_mmu_cache(vma, address, page_table);
2608unlock: 2729unlock:
2609 pte_unmap_unlock(page_table, ptl); 2730 pte_unmap_unlock(page_table, ptl);
2610out: 2731out:
@@ -2663,13 +2784,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2663 if (!pte_none(*page_table)) 2784 if (!pte_none(*page_table))
2664 goto release; 2785 goto release;
2665 2786
2666 inc_mm_counter(mm, anon_rss); 2787 inc_mm_counter_fast(mm, MM_ANONPAGES);
2667 page_add_new_anon_rmap(page, vma, address); 2788 page_add_new_anon_rmap(page, vma, address);
2668setpte: 2789setpte:
2669 set_pte_at(mm, address, page_table, entry); 2790 set_pte_at(mm, address, page_table, entry);
2670 2791
2671 /* No need to invalidate - it was non-present before */ 2792 /* No need to invalidate - it was non-present before */
2672 update_mmu_cache(vma, address, entry); 2793 update_mmu_cache(vma, address, page_table);
2673unlock: 2794unlock:
2674 pte_unmap_unlock(page_table, ptl); 2795 pte_unmap_unlock(page_table, ptl);
2675 return 0; 2796 return 0;
@@ -2817,10 +2938,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2817 if (flags & FAULT_FLAG_WRITE) 2938 if (flags & FAULT_FLAG_WRITE)
2818 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2939 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2819 if (anon) { 2940 if (anon) {
2820 inc_mm_counter(mm, anon_rss); 2941 inc_mm_counter_fast(mm, MM_ANONPAGES);
2821 page_add_new_anon_rmap(page, vma, address); 2942 page_add_new_anon_rmap(page, vma, address);
2822 } else { 2943 } else {
2823 inc_mm_counter(mm, file_rss); 2944 inc_mm_counter_fast(mm, MM_FILEPAGES);
2824 page_add_file_rmap(page); 2945 page_add_file_rmap(page);
2825 if (flags & FAULT_FLAG_WRITE) { 2946 if (flags & FAULT_FLAG_WRITE) {
2826 dirty_page = page; 2947 dirty_page = page;
@@ -2830,7 +2951,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2830 set_pte_at(mm, address, page_table, entry); 2951 set_pte_at(mm, address, page_table, entry);
2831 2952
2832 /* no need to invalidate: a not-present page won't be cached */ 2953 /* no need to invalidate: a not-present page won't be cached */
2833 update_mmu_cache(vma, address, entry); 2954 update_mmu_cache(vma, address, page_table);
2834 } else { 2955 } else {
2835 if (charged) 2956 if (charged)
2836 mem_cgroup_uncharge_page(page); 2957 mem_cgroup_uncharge_page(page);
@@ -2910,7 +3031,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2910 * Page table corrupted: show pte and kill process. 3031 * Page table corrupted: show pte and kill process.
2911 */ 3032 */
2912 print_bad_pte(vma, address, orig_pte, NULL); 3033 print_bad_pte(vma, address, orig_pte, NULL);
2913 return VM_FAULT_OOM; 3034 return VM_FAULT_SIGBUS;
2914 } 3035 }
2915 3036
2916 pgoff = pte_to_pgoff(orig_pte); 3037 pgoff = pte_to_pgoff(orig_pte);
@@ -2967,7 +3088,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2967 } 3088 }
2968 entry = pte_mkyoung(entry); 3089 entry = pte_mkyoung(entry);
2969 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { 3090 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
2970 update_mmu_cache(vma, address, entry); 3091 update_mmu_cache(vma, address, pte);
2971 } else { 3092 } else {
2972 /* 3093 /*
2973 * This is needed only for protection faults but the arch code 3094 * This is needed only for protection faults but the arch code
@@ -2998,6 +3119,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2998 3119
2999 count_vm_event(PGFAULT); 3120 count_vm_event(PGFAULT);
3000 3121
3122 /* do counter updates before entering really critical section. */
3123 check_sync_rss_stat(current);
3124
3001 if (unlikely(is_vm_hugetlb_page(vma))) 3125 if (unlikely(is_vm_hugetlb_page(vma)))
3002 return hugetlb_fault(mm, vma, address, flags); 3126 return hugetlb_fault(mm, vma, address, flags);
3003 3127
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2047465cd27c..be211a582930 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -27,6 +27,8 @@
27#include <linux/page-isolation.h> 27#include <linux/page-isolation.h>
28#include <linux/pfn.h> 28#include <linux/pfn.h>
29#include <linux/suspend.h> 29#include <linux/suspend.h>
30#include <linux/mm_inline.h>
31#include <linux/firmware-map.h>
30 32
31#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
32 34
@@ -71,7 +73,9 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type)
71 atomic_inc(&page->_count); 73 atomic_inc(&page->_count);
72} 74}
73 75
74void put_page_bootmem(struct page *page) 76/* reference to __meminit __free_pages_bootmem is valid
77 * so use __ref to tell modpost not to generate a warning */
78void __ref put_page_bootmem(struct page *page)
75{ 79{
76 int type; 80 int type;
77 81
@@ -520,6 +524,9 @@ int __ref add_memory(int nid, u64 start, u64 size)
520 BUG_ON(ret); 524 BUG_ON(ret);
521 } 525 }
522 526
527 /* create new memmap entry */
528 firmware_map_add_hotplug(start, start + size, "System RAM");
529
523 goto out; 530 goto out;
524 531
525error: 532error:
@@ -672,15 +679,18 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
672 if (!ret) { /* Success */ 679 if (!ret) { /* Success */
673 list_add_tail(&page->lru, &source); 680 list_add_tail(&page->lru, &source);
674 move_pages--; 681 move_pages--;
682 inc_zone_page_state(page, NR_ISOLATED_ANON +
683 page_is_file_cache(page));
684
675 } else { 685 } else {
676 /* Because we don't have the big zone->lock, we should 686
677 check this again here. */ 687 check this again here. */
678 if (page_count(page)) 688 if (page_count(page))
679 not_managed++; 689 not_managed++;
680#ifdef CONFIG_DEBUG_VM 690#ifdef CONFIG_DEBUG_VM
681 printk(KERN_INFO "removing from LRU failed" 691 printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
682 " %lx/%d/%lx\n", 692 pfn);
683 pfn, page_count(page), page->flags); 693 dump_page(page);
684#endif 694#endif
685 } 695 }
686 } 696 }
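
The change above pairs each successful isolate_lru_page() with NR_ISOLATED_ANON/NR_ISOLATED_FILE accounting, so reclaim's isolation throttling can see pages pulled off the LRU for hot-remove migration; the putback/migration paths drop the counter again. The general pattern, sketched as a standalone helper (illustrative only):

static int isolate_page_for_migration(struct page *page, struct list_head *list)
{
	int ret = isolate_lru_page(page);	/* 0 on success */

	if (!ret) {
		list_add_tail(&page->lru, list);
		inc_zone_page_state(page, NR_ISOLATED_ANON +
					  page_is_file_cache(page));
	}
	return ret;
}
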
@@ -694,7 +704,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
694 if (list_empty(&source)) 704 if (list_empty(&source))
695 goto out; 705 goto out;
696 /* this function returns # of failed pages */ 706 /* this function returns # of failed pages */
697 ret = migrate_pages(&source, hotremove_migrate_alloc, 0); 707 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
698 708
699out: 709out:
700 return ret; 710 return ret;
@@ -747,7 +757,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
747 return offlined; 757 return offlined;
748} 758}
749 759
750int offline_pages(unsigned long start_pfn, 760static int offline_pages(unsigned long start_pfn,
751 unsigned long end_pfn, unsigned long timeout) 761 unsigned long end_pfn, unsigned long timeout)
752{ 762{
753 unsigned long pfn, nr_pages, expire; 763 unsigned long pfn, nr_pages, expire;
@@ -849,6 +859,10 @@ repeat:
849 859
850 setup_per_zone_wmarks(); 860 setup_per_zone_wmarks();
851 calculate_zone_inactive_ratio(zone); 861 calculate_zone_inactive_ratio(zone);
862 if (!node_present_pages(node)) {
863 node_clear_state(node, N_HIGH_MEMORY);
864 kswapd_stop(node);
865 }
852 866
853 vm_total_pages = nr_free_pagecache_pages(); 867 vm_total_pages = nr_free_pagecache_pages();
854 writeback_set_ratelimit(); 868 writeback_set_ratelimit();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4545d5944243..08f40a2f3fe0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -73,7 +73,6 @@
73#include <linux/sched.h> 73#include <linux/sched.h>
74#include <linux/nodemask.h> 74#include <linux/nodemask.h>
75#include <linux/cpuset.h> 75#include <linux/cpuset.h>
76#include <linux/gfp.h>
77#include <linux/slab.h> 76#include <linux/slab.h>
78#include <linux/string.h> 77#include <linux/string.h>
79#include <linux/module.h> 78#include <linux/module.h>
@@ -85,10 +84,12 @@
85#include <linux/seq_file.h> 84#include <linux/seq_file.h>
86#include <linux/proc_fs.h> 85#include <linux/proc_fs.h>
87#include <linux/migrate.h> 86#include <linux/migrate.h>
87#include <linux/ksm.h>
88#include <linux/rmap.h> 88#include <linux/rmap.h>
89#include <linux/security.h> 89#include <linux/security.h>
90#include <linux/syscalls.h> 90#include <linux/syscalls.h>
91#include <linux/ctype.h> 91#include <linux/ctype.h>
92#include <linux/mm_inline.h>
92 93
93#include <asm/tlbflush.h> 94#include <asm/tlbflush.h>
94#include <asm/uaccess.h> 95#include <asm/uaccess.h>
@@ -412,17 +413,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
412 if (!page) 413 if (!page)
413 continue; 414 continue;
414 /* 415 /*
415 * The check for PageReserved here is important to avoid 416 * vm_normal_page() filters out zero pages, but there might
416 * handling zero pages and other pages that may have been 417 * still be PageReserved pages to skip, perhaps in a VDSO.
417 * marked special by the system. 418 * And we cannot move PageKsm pages sensibly or safely yet.
418 *
419 * If the PageReserved would not be checked here then f.e.
420 * the location of the zero page could have an influence
421 * on MPOL_MF_STRICT, zero pages would be counted for
422 * the per node stats, and there would be useless attempts
423 * to put zero pages on the migration list.
424 */ 419 */
425 if (PageReserved(page)) 420 if (PageReserved(page) || PageKsm(page))
426 continue; 421 continue;
427 nid = page_to_nid(page); 422 nid = page_to_nid(page);
428 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 423 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -567,24 +562,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
567} 562}
568 563
569/* Step 2: apply policy to a range and do splits. */ 564/* Step 2: apply policy to a range and do splits. */
570static int mbind_range(struct vm_area_struct *vma, unsigned long start, 565static int mbind_range(struct mm_struct *mm, unsigned long start,
571 unsigned long end, struct mempolicy *new) 566 unsigned long end, struct mempolicy *new_pol)
572{ 567{
573 struct vm_area_struct *next; 568 struct vm_area_struct *next;
574 int err; 569 struct vm_area_struct *prev;
570 struct vm_area_struct *vma;
571 int err = 0;
572 pgoff_t pgoff;
573 unsigned long vmstart;
574 unsigned long vmend;
575 575
576 err = 0; 576 vma = find_vma_prev(mm, start, &prev);
577 for (; vma && vma->vm_start < end; vma = next) { 577 if (!vma || vma->vm_start > start)
578 return -EFAULT;
579
580 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
578 next = vma->vm_next; 581 next = vma->vm_next;
579 if (vma->vm_start < start) 582 vmstart = max(start, vma->vm_start);
580 err = split_vma(vma->vm_mm, vma, start, 1); 583 vmend = min(end, vma->vm_end);
581 if (!err && vma->vm_end > end) 584
582 err = split_vma(vma->vm_mm, vma, end, 0); 585 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
583 if (!err) 586 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
584 err = policy_vma(vma, new); 587 vma->anon_vma, vma->vm_file, pgoff, new_pol);
588 if (prev) {
589 vma = prev;
590 next = vma->vm_next;
591 continue;
592 }
593 if (vma->vm_start != vmstart) {
594 err = split_vma(vma->vm_mm, vma, vmstart, 1);
595 if (err)
596 goto out;
597 }
598 if (vma->vm_end != vmend) {
599 err = split_vma(vma->vm_mm, vma, vmend, 0);
600 if (err)
601 goto out;
602 }
603 err = policy_vma(vma, new_pol);
585 if (err) 604 if (err)
586 break; 605 goto out;
587 } 606 }
607
608 out:
588 return err; 609 return err;
589} 610}
590 611
@@ -784,9 +805,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
784 805
785 err = 0; 806 err = 0;
786 if (nmask) { 807 if (nmask) {
787 task_lock(current); 808 if (mpol_store_user_nodemask(pol)) {
788 get_policy_nodemask(pol, nmask); 809 *nmask = pol->w.user_nodemask;
789 task_unlock(current); 810 } else {
811 task_lock(current);
812 get_policy_nodemask(pol, nmask);
813 task_unlock(current);
814 }
790 } 815 }
791 816
792 out: 817 out:
@@ -809,6 +834,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
809 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { 834 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
810 if (!isolate_lru_page(page)) { 835 if (!isolate_lru_page(page)) {
811 list_add_tail(&page->lru, pagelist); 836 list_add_tail(&page->lru, pagelist);
837 inc_zone_page_state(page, NR_ISOLATED_ANON +
838 page_is_file_cache(page));
812 } 839 }
813 } 840 }
814} 841}
@@ -836,7 +863,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
836 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 863 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
837 864
838 if (!list_empty(&pagelist)) 865 if (!list_empty(&pagelist))
839 err = migrate_pages(&pagelist, new_node_page, dest); 866 err = migrate_pages(&pagelist, new_node_page, dest, 0);
840 867
841 return err; 868 return err;
842} 869}
@@ -864,36 +891,36 @@ int do_migrate_pages(struct mm_struct *mm,
864 if (err) 891 if (err)
865 goto out; 892 goto out;
866 893
867/* 894 /*
868 * Find a 'source' bit set in 'tmp' whose corresponding 'dest' 895 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
869 * bit in 'to' is not also set in 'tmp'. Clear the found 'source' 896 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
870 * bit in 'tmp', and return that <source, dest> pair for migration. 897 * bit in 'tmp', and return that <source, dest> pair for migration.
871 * The pair of nodemasks 'to' and 'from' define the map. 898 * The pair of nodemasks 'to' and 'from' define the map.
872 * 899 *
873 * If no pair of bits is found that way, fallback to picking some 900 * If no pair of bits is found that way, fallback to picking some
874 * pair of 'source' and 'dest' bits that are not the same. If the 901 * pair of 'source' and 'dest' bits that are not the same. If the
875 * 'source' and 'dest' bits are the same, this represents a node 902 * 'source' and 'dest' bits are the same, this represents a node
876 * that will be migrating to itself, so no pages need move. 903 * that will be migrating to itself, so no pages need move.
877 * 904 *
878 * If no bits are left in 'tmp', or if all remaining bits left 905 * If no bits are left in 'tmp', or if all remaining bits left
879 * in 'tmp' correspond to the same bit in 'to', return false 906 * in 'tmp' correspond to the same bit in 'to', return false
880 * (nothing left to migrate). 907 * (nothing left to migrate).
881 * 908 *
882 * This lets us pick a pair of nodes to migrate between, such that 909 * This lets us pick a pair of nodes to migrate between, such that
883 * if possible the dest node is not already occupied by some other 910 * if possible the dest node is not already occupied by some other
884 * source node, minimizing the risk of overloading the memory on a 911 * source node, minimizing the risk of overloading the memory on a
885 * node that would happen if we migrated incoming memory to a node 912 * node that would happen if we migrated incoming memory to a node
886 * before migrating outgoing memory source that same node. 913 * before migrating outgoing memory source that same node.
887 * 914 *
888 * A single scan of tmp is sufficient. As we go, we remember the 915 * A single scan of tmp is sufficient. As we go, we remember the
889 * most recent <s, d> pair that moved (s != d). If we find a pair 916 * most recent <s, d> pair that moved (s != d). If we find a pair
890 * that not only moved, but what's better, moved to an empty slot 917 * that not only moved, but what's better, moved to an empty slot
891 * (d is not set in tmp), then we break out then, with that pair. 918 * (d is not set in tmp), then we break out then, with that pair.
892 * Otherwise when we finish scanning from_tmp, we at least have the 919
893 * most recent <s, d> pair that moved. If we get all the way through 920 * most recent <s, d> pair that moved. If we get all the way through
894 * the scan of tmp without finding any node that moved, much less 921 * the scan of tmp without finding any node that moved, much less
895 * moved to an empty node, then there is nothing left worth migrating. 922 * moved to an empty node, then there is nothing left worth migrating.
896 */ 923 */
897 924
898 tmp = *from_nodes; 925 tmp = *from_nodes;
899 while (!nodes_empty(tmp)) { 926 while (!nodes_empty(tmp)) {
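The comment block above describes a plain bitmask algorithm, so it can be restated outside the kernel. Below is an illustrative userspace sketch: an unsigned long stands in for nodemask_t, a simplified ordinal remap stands in for node_remap(), and __builtin_popcountl is a GCC/Clang builtin; none of the names are kernel API.

#include <stdio.h>

#define NBITS (int)(8 * sizeof(unsigned long))

/* Map bit 's' of 'from' onto the set bit of 'to' with the same ordinal
 * position, wrapping by the weight of 'to' (simplified node_remap()). */
static int remap_bit(int s, unsigned long from, unsigned long to)
{
        int i, ord = 0, w = __builtin_popcountl(to);

        if (!w)
                return s;
        for (i = 0; i < s; i++)
                if (from & (1UL << i))
                        ord++;
        ord %= w;
        for (i = 0; i < NBITS; i++) {
                if (!(to & (1UL << i)))
                        continue;
                if (ord-- == 0)
                        return i;
        }
        return s;
}

/* One scan of 'tmp': prefer a pair whose dest is no longer a pending source;
 * otherwise keep the most recent moving pair; 0 means nothing left to do. */
static int pick_pair(unsigned long tmp, unsigned long from, unsigned long to,
                     int *s, int *d)
{
        int src, found = 0;

        for (src = 0; src < NBITS; src++) {
                int dst;

                if (!(tmp & (1UL << src)))
                        continue;
                dst = remap_bit(src, from, to);
                if (src == dst)
                        continue;               /* would migrate to itself */
                *s = src;
                *d = dst;
                found = 1;
                if (!(tmp & (1UL << dst)))
                        break;                  /* empty slot: best case */
        }
        return found;
}

int main(void)
{
        unsigned long from = 0x3, to = 0xc, tmp = from;
        int s, d;

        while (pick_pair(tmp, from, to, &s, &d)) {
                printf("migrate node %d -> node %d\n", s, d);
                tmp &= ~(1UL << s);             /* that source is handled */
        }
        return 0;
}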
@@ -1049,11 +1076,11 @@ static long do_mbind(unsigned long start, unsigned long len,
1049 if (!IS_ERR(vma)) { 1076 if (!IS_ERR(vma)) {
1050 int nr_failed = 0; 1077 int nr_failed = 0;
1051 1078
1052 err = mbind_range(vma, start, end, new); 1079 err = mbind_range(mm, start, end, new);
1053 1080
1054 if (!list_empty(&pagelist)) 1081 if (!list_empty(&pagelist))
1055 nr_failed = migrate_pages(&pagelist, new_vma_page, 1082 nr_failed = migrate_pages(&pagelist, new_vma_page,
1056 (unsigned long)vma); 1083 (unsigned long)vma, 0);
1057 1084
1058 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1085 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1059 err = -EIO; 1086 err = -EIO;
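For context, the pagelist that migrate_pages() consumes here is built when userspace calls mbind(2) with MPOL_MF_MOVE on memory that has already been faulted in. A minimal sketch, assuming libnuma's <numaif.h> (build with -lnuma) and a machine where node 0 exists:

#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 1 << 20;
        unsigned long node0 = 1UL << 0;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        memset(p, 0, len);                      /* fault the pages in first */

        /* MPOL_MF_MOVE asks the kernel to migrate the existing pages; this
         * is the request that ends up in the migrate_pages() call above. */
        if (mbind(p, len, MPOL_BIND, &node0, 8 * sizeof(node0),
                  MPOL_MF_MOVE | MPOL_MF_STRICT) != 0)
                perror("mbind");
        munmap(p, len);
        return 0;
}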
@@ -1565,6 +1592,53 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1565 } 1592 }
1566 return zl; 1593 return zl;
1567} 1594}
1595
1596/*
1597 * init_nodemask_of_mempolicy
1598 *
1599 * If the current task's mempolicy is "default" [NULL], return 'false'
1600 * to indicate default policy. Otherwise, extract the policy nodemask
1601 * for 'bind' or 'interleave' policy into the argument nodemask, or
1602 * initialize the argument nodemask to contain the single node for
1603 * 'preferred' or 'local' policy and return 'true' to indicate presence
1604 * of non-default mempolicy.
1605 *
1606 * We don't bother with reference counting the mempolicy [mpol_get/put]
1607 * because the current task is examining its own mempolicy and a task's
1608 * mempolicy is only ever changed by the task itself.
1609 *
1610 * N.B., it is the caller's responsibility to free a returned nodemask.
1611 */
1612bool init_nodemask_of_mempolicy(nodemask_t *mask)
1613{
1614 struct mempolicy *mempolicy;
1615 int nid;
1616
1617 if (!(mask && current->mempolicy))
1618 return false;
1619
1620 mempolicy = current->mempolicy;
1621 switch (mempolicy->mode) {
1622 case MPOL_PREFERRED:
1623 if (mempolicy->flags & MPOL_F_LOCAL)
1624 nid = numa_node_id();
1625 else
1626 nid = mempolicy->v.preferred_node;
1627 init_nodemask_of_node(mask, nid);
1628 break;
1629
1630 case MPOL_BIND:
1631 /* Fall through */
1632 case MPOL_INTERLEAVE:
1633 *mask = mempolicy->v.nodes;
1634 break;
1635
1636 default:
1637 BUG();
1638 }
1639
1640 return true;
1641}
1568#endif 1642#endif
1569 1643
1570/* Allocate a page in interleaved policy. 1644/* Allocate a page in interleaved policy.
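The switch in init_nodemask_of_mempolicy() is compact enough to restate as ordinary C. The sketch below is purely illustrative: a plain struct and an unsigned long bitmask stand in for struct mempolicy and nodemask_t, and every name is invented.

#include <stdbool.h>
#include <stdio.h>

enum { POL_DEFAULT, POL_PREFERRED, POL_BIND, POL_INTERLEAVE };
#define POL_F_LOCAL 0x1

struct policy {
        int mode;
        int flags;
        int preferred_node;
        unsigned long nodes;            /* bind/interleave nodemask */
};

static bool nodemask_of_policy(const struct policy *pol, unsigned long *mask,
                               int this_node)
{
        if (!pol)
                return false;           /* default policy: caller uses all nodes */

        switch (pol->mode) {
        case POL_PREFERRED:
                *mask = 1UL << ((pol->flags & POL_F_LOCAL) ?
                                this_node : pol->preferred_node);
                return true;
        case POL_BIND:
        case POL_INTERLEAVE:
                *mask = pol->nodes;
                return true;
        default:
                return false;
        }
}

int main(void)
{
        struct policy p = { .mode = POL_INTERLEAVE, .nodes = 0x5 };
        unsigned long mask;

        if (nodemask_of_policy(&p, &mask, 0))
                printf("policy nodes: %#lx\n", mask);
        return 0;
}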
@@ -1685,10 +1759,12 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
1685 1759
1686 if (!new) 1760 if (!new)
1687 return ERR_PTR(-ENOMEM); 1761 return ERR_PTR(-ENOMEM);
1762 rcu_read_lock();
1688 if (current_cpuset_is_being_rebound()) { 1763 if (current_cpuset_is_being_rebound()) {
1689 nodemask_t mems = cpuset_mems_allowed(current); 1764 nodemask_t mems = cpuset_mems_allowed(current);
1690 mpol_rebind_policy(old, &mems); 1765 mpol_rebind_policy(old, &mems);
1691 } 1766 }
1767 rcu_read_unlock();
1692 *new = *old; 1768 *new = *old;
1693 atomic_set(&new->refcnt, 1); 1769 atomic_set(&new->refcnt, 1);
1694 return new; 1770 return new;
@@ -2122,8 +2198,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2122 char *rest = nodelist; 2198 char *rest = nodelist;
2123 while (isdigit(*rest)) 2199 while (isdigit(*rest))
2124 rest++; 2200 rest++;
2125 if (!*rest) 2201 if (*rest)
2126 err = 0; 2202 goto out;
2127 } 2203 }
2128 break; 2204 break;
2129 case MPOL_INTERLEAVE: 2205 case MPOL_INTERLEAVE:
@@ -2132,7 +2208,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2132 */ 2208 */
2133 if (!nodelist) 2209 if (!nodelist)
2134 nodes = node_states[N_HIGH_MEMORY]; 2210 nodes = node_states[N_HIGH_MEMORY];
2135 err = 0;
2136 break; 2211 break;
2137 case MPOL_LOCAL: 2212 case MPOL_LOCAL:
2138 /* 2213 /*
@@ -2142,11 +2217,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2142 goto out; 2217 goto out;
2143 mode = MPOL_PREFERRED; 2218 mode = MPOL_PREFERRED;
2144 break; 2219 break;
2145 2220 case MPOL_DEFAULT:
2146 /* 2221 /*
2147 * case MPOL_BIND: mpol_new() enforces non-empty nodemask. 2222 * Insist on an empty nodelist
2148 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags. 2223 */
2149 */ 2224 if (!nodelist)
2225 err = 0;
2226 goto out;
2227 case MPOL_BIND:
2228 /*
2229 * Insist on a nodelist
2230 */
2231 if (!nodelist)
2232 goto out;
2150 } 2233 }
2151 2234
2152 mode_flags = 0; 2235 mode_flags = 0;
@@ -2160,13 +2243,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2160 else if (!strcmp(flags, "relative")) 2243 else if (!strcmp(flags, "relative"))
2161 mode_flags |= MPOL_F_RELATIVE_NODES; 2244 mode_flags |= MPOL_F_RELATIVE_NODES;
2162 else 2245 else
2163 err = 1; 2246 goto out;
2164 } 2247 }
2165 2248
2166 new = mpol_new(mode, mode_flags, &nodes); 2249 new = mpol_new(mode, mode_flags, &nodes);
2167 if (IS_ERR(new)) 2250 if (IS_ERR(new))
2168 err = 1; 2251 goto out;
2169 else { 2252
2253 {
2170 int ret; 2254 int ret;
2171 NODEMASK_SCRATCH(scratch); 2255 NODEMASK_SCRATCH(scratch);
2172 if (scratch) { 2256 if (scratch) {
@@ -2177,13 +2261,15 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2177 ret = -ENOMEM; 2261 ret = -ENOMEM;
2178 NODEMASK_SCRATCH_FREE(scratch); 2262 NODEMASK_SCRATCH_FREE(scratch);
2179 if (ret) { 2263 if (ret) {
2180 err = 1;
2181 mpol_put(new); 2264 mpol_put(new);
2182 } else if (no_context) { 2265 goto out;
2183 /* save for contextualization */
2184 new->w.user_nodemask = nodes;
2185 } 2266 }
2186 } 2267 }
2268 err = 0;
2269 if (no_context) {
2270 /* save for contextualization */
2271 new->w.user_nodemask = nodes;
2272 }
2187 2273
2188out: 2274out:
2189 /* Restore string for error message */ 2275 /* Restore string for error message */
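The rewritten parser above (the tmpfs "mpol=" mount-option parser) now routes every failure through "goto out" and only sets err = 0 once the whole string has been validated. A standalone sketch of just the nodelist rules it enforces; illustrative, not the kernel function:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool nodelist_ok(const char *mode, bool has_nodelist)
{
        if (!strcmp(mode, "default") || !strcmp(mode, "local"))
                return !has_nodelist;           /* must NOT name nodes */
        if (!strcmp(mode, "bind"))
                return has_nodelist;            /* must name nodes */
        /* "prefer" wants a single node; "interleave" falls back to every
         * node with memory when no nodelist is given. */
        return true;
}

int main(void)
{
        printf("bind with no nodelist ok?   %d\n", nodelist_ok("bind", false));
        printf("default with nodelist ok?   %d\n", nodelist_ok("default", true));
        printf("interleave, empty list ok?  %d\n", nodelist_ok("interleave", false));
        return 0;
}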
diff --git a/mm/migrate.c b/mm/migrate.c
index 7dbcb22316d2..d3f3f7f81075 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -21,6 +21,7 @@
21#include <linux/mm_inline.h> 21#include <linux/mm_inline.h>
22#include <linux/nsproxy.h> 22#include <linux/nsproxy.h>
23#include <linux/pagevec.h> 23#include <linux/pagevec.h>
24#include <linux/ksm.h>
24#include <linux/rmap.h> 25#include <linux/rmap.h>
25#include <linux/topology.h> 26#include <linux/topology.h>
26#include <linux/cpu.h> 27#include <linux/cpu.h>
@@ -31,6 +32,7 @@
31#include <linux/security.h> 32#include <linux/security.h>
32#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
33#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/gfp.h>
34 36
35#include "internal.h" 37#include "internal.h"
36 38
@@ -78,8 +80,8 @@ int putback_lru_pages(struct list_head *l)
78/* 80/*
79 * Restore a potential migration pte to a working pte entry 81 * Restore a potential migration pte to a working pte entry
80 */ 82 */
81static void remove_migration_pte(struct vm_area_struct *vma, 83static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
82 struct page *old, struct page *new) 84 unsigned long addr, void *old)
83{ 85{
84 struct mm_struct *mm = vma->vm_mm; 86 struct mm_struct *mm = vma->vm_mm;
85 swp_entry_t entry; 87 swp_entry_t entry;
@@ -88,40 +90,37 @@ static void remove_migration_pte(struct vm_area_struct *vma,
88 pmd_t *pmd; 90 pmd_t *pmd;
89 pte_t *ptep, pte; 91 pte_t *ptep, pte;
90 spinlock_t *ptl; 92 spinlock_t *ptl;
91 unsigned long addr = page_address_in_vma(new, vma);
92
93 if (addr == -EFAULT)
94 return;
95 93
96 pgd = pgd_offset(mm, addr); 94 pgd = pgd_offset(mm, addr);
97 if (!pgd_present(*pgd)) 95 if (!pgd_present(*pgd))
98 return; 96 goto out;
99 97
100 pud = pud_offset(pgd, addr); 98 pud = pud_offset(pgd, addr);
101 if (!pud_present(*pud)) 99 if (!pud_present(*pud))
102 return; 100 goto out;
103 101
104 pmd = pmd_offset(pud, addr); 102 pmd = pmd_offset(pud, addr);
105 if (!pmd_present(*pmd)) 103 if (!pmd_present(*pmd))
106 return; 104 goto out;
107 105
108 ptep = pte_offset_map(pmd, addr); 106 ptep = pte_offset_map(pmd, addr);
109 107
110 if (!is_swap_pte(*ptep)) { 108 if (!is_swap_pte(*ptep)) {
111 pte_unmap(ptep); 109 pte_unmap(ptep);
112 return; 110 goto out;
113 } 111 }
114 112
115 ptl = pte_lockptr(mm, pmd); 113 ptl = pte_lockptr(mm, pmd);
116 spin_lock(ptl); 114 spin_lock(ptl);
117 pte = *ptep; 115 pte = *ptep;
118 if (!is_swap_pte(pte)) 116 if (!is_swap_pte(pte))
119 goto out; 117 goto unlock;
120 118
121 entry = pte_to_swp_entry(pte); 119 entry = pte_to_swp_entry(pte);
122 120
123 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) 121 if (!is_migration_entry(entry) ||
124 goto out; 122 migration_entry_to_page(entry) != old)
123 goto unlock;
125 124
126 get_page(new); 125 get_page(new);
127 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 126 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
@@ -136,59 +135,11 @@ static void remove_migration_pte(struct vm_area_struct *vma,
136 page_add_file_rmap(new); 135 page_add_file_rmap(new);
137 136
138 /* No need to invalidate - it was non-present before */ 137 /* No need to invalidate - it was non-present before */
139 update_mmu_cache(vma, addr, pte); 138 update_mmu_cache(vma, addr, ptep);
140 139unlock:
141out:
142 pte_unmap_unlock(ptep, ptl); 140 pte_unmap_unlock(ptep, ptl);
143} 141out:
144 142 return SWAP_AGAIN;
145/*
146 * Note that remove_file_migration_ptes will only work on regular mappings,
147 * Nonlinear mappings do not use migration entries.
148 */
149static void remove_file_migration_ptes(struct page *old, struct page *new)
150{
151 struct vm_area_struct *vma;
152 struct address_space *mapping = new->mapping;
153 struct prio_tree_iter iter;
154 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
155
156 if (!mapping)
157 return;
158
159 spin_lock(&mapping->i_mmap_lock);
160
161 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
162 remove_migration_pte(vma, old, new);
163
164 spin_unlock(&mapping->i_mmap_lock);
165}
166
167/*
168 * Must hold mmap_sem lock on at least one of the vmas containing
169 * the page so that the anon_vma cannot vanish.
170 */
171static void remove_anon_migration_ptes(struct page *old, struct page *new)
172{
173 struct anon_vma *anon_vma;
174 struct vm_area_struct *vma;
175 unsigned long mapping;
176
177 mapping = (unsigned long)new->mapping;
178
179 if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
180 return;
181
182 /*
183 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
184 */
185 anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
186 spin_lock(&anon_vma->lock);
187
188 list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
189 remove_migration_pte(vma, old, new);
190
191 spin_unlock(&anon_vma->lock);
192} 143}
193 144
194/* 145/*
@@ -197,10 +148,7 @@ static void remove_anon_migration_ptes(struct page *old, struct page *new)
197 */ 148 */
198static void remove_migration_ptes(struct page *old, struct page *new) 149static void remove_migration_ptes(struct page *old, struct page *new)
199{ 150{
200 if (PageAnon(new)) 151 rmap_walk(new, remove_migration_pte, old);
201 remove_anon_migration_ptes(old, new);
202 else
203 remove_file_migration_ptes(old, new);
204} 152}
205 153
206/* 154/*
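remove_migration_ptes() now reuses the generic reverse-map walker instead of carrying separate file and anon variants: rmap_walk() invokes the callback once per mapping and keeps walking while the callback returns SWAP_AGAIN. A toy standalone model of that contract (every name below is invented, not kernel API):

#include <stdio.h>

enum { WALK_AGAIN, WALK_STOP };

struct mapping { unsigned long addr; };

/* Per-mapping callback, in the role of remove_migration_pte() above. */
static int fixup_one(struct mapping *m, void *old)
{
        printf("restore pte at %#lx (old page %p)\n", m->addr, old);
        return WALK_AGAIN;              /* keep walking the reverse map */
}

/* The walker, in the role of rmap_walk(): visit every mapping of the page
 * until the callback asks to stop. */
static void walk(struct mapping *maps, int n,
                 int (*cb)(struct mapping *, void *), void *arg)
{
        int i;

        for (i = 0; i < n; i++)
                if (cb(&maps[i], arg) != WALK_AGAIN)
                        break;
}

int main(void)
{
        struct mapping maps[] = { { 0x1000 }, { 0x2000 } };
        int old_page;                   /* dummy stand-in for the old page */

        walk(maps, 2, fixup_one, &old_page);
        return 0;
}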
@@ -328,8 +276,6 @@ static int migrate_page_move_mapping(struct address_space *mapping,
328 */ 276 */
329static void migrate_page_copy(struct page *newpage, struct page *page) 277static void migrate_page_copy(struct page *newpage, struct page *page)
330{ 278{
331 int anon;
332
333 copy_highpage(newpage, page); 279 copy_highpage(newpage, page);
334 280
335 if (PageError(page)) 281 if (PageError(page))
@@ -341,8 +287,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
341 if (TestClearPageActive(page)) { 287 if (TestClearPageActive(page)) {
342 VM_BUG_ON(PageUnevictable(page)); 288 VM_BUG_ON(PageUnevictable(page));
343 SetPageActive(newpage); 289 SetPageActive(newpage);
344 } else 290 } else if (TestClearPageUnevictable(page))
345 unevictable_migrate_page(newpage, page); 291 SetPageUnevictable(newpage);
346 if (PageChecked(page)) 292 if (PageChecked(page))
347 SetPageChecked(newpage); 293 SetPageChecked(newpage);
348 if (PageMappedToDisk(page)) 294 if (PageMappedToDisk(page))
@@ -361,12 +307,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
361 } 307 }
362 308
363 mlock_migrate_page(newpage, page); 309 mlock_migrate_page(newpage, page);
310 ksm_migrate_page(newpage, page);
364 311
365 ClearPageSwapCache(page); 312 ClearPageSwapCache(page);
366 ClearPagePrivate(page); 313 ClearPagePrivate(page);
367 set_page_private(page, 0); 314 set_page_private(page, 0);
368 /* page->mapping contains a flag for PageAnon() */
369 anon = PageAnon(page);
370 page->mapping = NULL; 315 page->mapping = NULL;
371 316
372 /* 317 /*
@@ -580,9 +525,9 @@ static int move_to_new_page(struct page *newpage, struct page *page)
580 else 525 else
581 rc = fallback_migrate_page(mapping, newpage, page); 526 rc = fallback_migrate_page(mapping, newpage, page);
582 527
583 if (!rc) { 528 if (!rc)
584 remove_migration_ptes(page, newpage); 529 remove_migration_ptes(page, newpage);
585 } else 530 else
586 newpage->mapping = NULL; 531 newpage->mapping = NULL;
587 532
588 unlock_page(newpage); 533 unlock_page(newpage);
@@ -595,7 +540,7 @@ static int move_to_new_page(struct page *newpage, struct page *page)
595 * to the newly allocated page in newpage. 540 * to the newly allocated page in newpage.
596 */ 541 */
597static int unmap_and_move(new_page_t get_new_page, unsigned long private, 542static int unmap_and_move(new_page_t get_new_page, unsigned long private,
598 struct page *page, int force) 543 struct page *page, int force, int offlining)
599{ 544{
600 int rc = 0; 545 int rc = 0;
601 int *result = NULL; 546 int *result = NULL;
@@ -621,6 +566,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
621 lock_page(page); 566 lock_page(page);
622 } 567 }
623 568
569 /*
570 * Only memory hotplug's offline_pages() caller has locked out KSM,
571 * and can safely migrate a KSM page. The other cases have skipped
572 * PageKsm along with PageReserved - but it is only now when we have
573 * the page lock that we can be certain it will not go KSM beneath us
574 * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
575 * its pagecount raised, but only here do we take the page lock which
576 * serializes that).
577 */
578 if (PageKsm(page) && !offlining) {
579 rc = -EBUSY;
580 goto unlock;
581 }
582
624 /* charge against new page */ 583 /* charge against new page */
625 charge = mem_cgroup_prepare_migration(page, &mem); 584 charge = mem_cgroup_prepare_migration(page, &mem);
626 if (charge == -ENOMEM) { 585 if (charge == -ENOMEM) {
@@ -737,7 +696,7 @@ move_newpage:
737 * Return: Number of pages not migrated or error code. 696 * Return: Number of pages not migrated or error code.
738 */ 697 */
739int migrate_pages(struct list_head *from, 698int migrate_pages(struct list_head *from,
740 new_page_t get_new_page, unsigned long private) 699 new_page_t get_new_page, unsigned long private, int offlining)
741{ 700{
742 int retry = 1; 701 int retry = 1;
743 int nr_failed = 0; 702 int nr_failed = 0;
@@ -746,13 +705,6 @@ int migrate_pages(struct list_head *from,
746 struct page *page2; 705 struct page *page2;
747 int swapwrite = current->flags & PF_SWAPWRITE; 706 int swapwrite = current->flags & PF_SWAPWRITE;
748 int rc; 707 int rc;
749 unsigned long flags;
750
751 local_irq_save(flags);
752 list_for_each_entry(page, from, lru)
753 __inc_zone_page_state(page, NR_ISOLATED_ANON +
754 page_is_file_cache(page));
755 local_irq_restore(flags);
756 708
757 if (!swapwrite) 709 if (!swapwrite)
758 current->flags |= PF_SWAPWRITE; 710 current->flags |= PF_SWAPWRITE;
@@ -764,7 +716,7 @@ int migrate_pages(struct list_head *from,
764 cond_resched(); 716 cond_resched();
765 717
766 rc = unmap_and_move(get_new_page, private, 718 rc = unmap_and_move(get_new_page, private,
767 page, pass > 2); 719 page, pass > 2, offlining);
768 720
769 switch(rc) { 721 switch(rc) {
770 case -ENOMEM: 722 case -ENOMEM:
@@ -860,7 +812,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
860 if (!page) 812 if (!page)
861 goto set_status; 813 goto set_status;
862 814
863 if (PageReserved(page)) /* Check for zero page */ 815 /* Use PageReserved to check for zero page */
816 if (PageReserved(page) || PageKsm(page))
864 goto put_and_set; 817 goto put_and_set;
865 818
866 pp->page = page; 819 pp->page = page;
@@ -878,8 +831,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
878 goto put_and_set; 831 goto put_and_set;
879 832
880 err = isolate_lru_page(page); 833 err = isolate_lru_page(page);
881 if (!err) 834 if (!err) {
882 list_add_tail(&page->lru, &pagelist); 835 list_add_tail(&page->lru, &pagelist);
836 inc_zone_page_state(page, NR_ISOLATED_ANON +
837 page_is_file_cache(page));
838 }
883put_and_set: 839put_and_set:
884 /* 840 /*
885 * Either remove the duplicate refcount from 841 * Either remove the duplicate refcount from
@@ -894,7 +850,7 @@ set_status:
894 err = 0; 850 err = 0;
895 if (!list_empty(&pagelist)) 851 if (!list_empty(&pagelist))
896 err = migrate_pages(&pagelist, new_page_node, 852 err = migrate_pages(&pagelist, new_page_node,
897 (unsigned long)pm); 853 (unsigned long)pm, 0);
898 854
899 up_read(&mm->mmap_sem); 855 up_read(&mm->mmap_sem);
900 return err; 856 return err;
@@ -953,6 +909,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
953 goto out_pm; 909 goto out_pm;
954 910
955 err = -ENODEV; 911 err = -ENODEV;
912 if (node < 0 || node >= MAX_NUMNODES)
913 goto out_pm;
914
956 if (!node_state(node, N_HIGH_MEMORY)) 915 if (!node_state(node, N_HIGH_MEMORY))
957 goto out_pm; 916 goto out_pm;
958 917
@@ -1015,7 +974,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1015 974
1016 err = -ENOENT; 975 err = -ENOENT;
1017 /* Use PageReserved to check for zero page */ 976 /* Use PageReserved to check for zero page */
1018 if (!page || PageReserved(page)) 977 if (!page || PageReserved(page) || PageKsm(page))
1019 goto set_status; 978 goto set_status;
1020 979
1021 err = page_to_nid(page); 980 err = page_to_nid(page);
@@ -1040,33 +999,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1040#define DO_PAGES_STAT_CHUNK_NR 16 999#define DO_PAGES_STAT_CHUNK_NR 16
1041 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; 1000 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1042 int chunk_status[DO_PAGES_STAT_CHUNK_NR]; 1001 int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1043 unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1044 int err;
1045 1002
1046 for (i = 0; i < nr_pages; i += chunk_nr) { 1003 while (nr_pages) {
1047 if (chunk_nr + i > nr_pages) 1004 unsigned long chunk_nr;
1048 chunk_nr = nr_pages - i;
1049 1005
1050 err = copy_from_user(chunk_pages, &pages[i], 1006 chunk_nr = nr_pages;
1051 chunk_nr * sizeof(*chunk_pages)); 1007 if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
1052 if (err) { 1008 chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1053 err = -EFAULT; 1009
1054 goto out; 1010 if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
1055 } 1011 break;
1056 1012
1057 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); 1013 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1058 1014
1059 err = copy_to_user(&status[i], chunk_status, 1015 if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1060 chunk_nr * sizeof(*chunk_status)); 1016 break;
1061 if (err) {
1062 err = -EFAULT;
1063 goto out;
1064 }
1065 }
1066 err = 0;
1067 1017
1068out: 1018 pages += chunk_nr;
1069 return err; 1019 status += chunk_nr;
1020 nr_pages -= chunk_nr;
1021 }
1022 return nr_pages ? -EFAULT : 0;
1070} 1023}
1071 1024
1072/* 1025/*
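The do_pages_stat() rewrite above drops the index arithmetic in favour of a consume-the-array loop: take up to 16 entries at a time, stop at the first failed copy, and report -EFAULT only if anything was left over. A standalone sketch of the same pattern, with memcpy()-backed stubs standing in for copy_from_user()/copy_to_user():

#include <stdio.h>
#include <string.h>

#define CHUNK 16
#define ERR_FAULT 14

/* Stand-ins for copy_from_user()/copy_to_user(): 0 means success. */
static int copy_in(void *dst, const void *src, size_t n)
{
        memcpy(dst, src, n);
        return 0;
}
static int copy_out(void *dst, const void *src, size_t n)
{
        memcpy(dst, src, n);
        return 0;
}

static int pages_stat(const void **pages, int *status, unsigned long nr)
{
        const void *in[CHUNK];
        int out[CHUNK];

        while (nr) {
                unsigned long i, chunk = nr > CHUNK ? CHUNK : nr;

                if (copy_in(in, pages, chunk * sizeof(*in)))
                        break;                  /* fault: stop early */
                for (i = 0; i < chunk; i++)
                        out[i] = 0;             /* "look up" one page */
                if (copy_out(status, out, chunk * sizeof(*status)))
                        break;

                pages += chunk;
                status += chunk;
                nr -= chunk;
        }
        return nr ? -ERR_FAULT : 0;             /* leftovers mean a fault */
}

int main(void)
{
        const void *pages[40] = { 0 };
        int status[40];

        printf("rc=%d\n", pages_stat(pages, status, 40));
        return 0;
}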
diff --git a/mm/mincore.c b/mm/mincore.c
index 8cb508f84ea4..f77433c20279 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -7,13 +7,14 @@
7/* 7/*
8 * The mincore() system call. 8 * The mincore() system call.
9 */ 9 */
10#include <linux/slab.h>
11#include <linux/pagemap.h> 10#include <linux/pagemap.h>
11#include <linux/gfp.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/mman.h> 13#include <linux/mman.h>
14#include <linux/syscalls.h> 14#include <linux/syscalls.h>
15#include <linux/swap.h> 15#include <linux/swap.h>
16#include <linux/swapops.h> 16#include <linux/swapops.h>
17#include <linux/hugetlb.h>
17 18
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
19#include <asm/pgtable.h> 20#include <asm/pgtable.h>
@@ -72,6 +73,42 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag
72 if (!vma || addr < vma->vm_start) 73 if (!vma || addr < vma->vm_start)
73 return -ENOMEM; 74 return -ENOMEM;
74 75
76#ifdef CONFIG_HUGETLB_PAGE
77 if (is_vm_hugetlb_page(vma)) {
78 struct hstate *h;
79 unsigned long nr_huge;
80 unsigned char present;
81
82 i = 0;
83 nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT);
84 h = hstate_vma(vma);
85 nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h))
86 - (addr >> huge_page_shift(h)) + 1;
87 nr_huge = min(nr_huge,
88 (vma->vm_end - addr) >> huge_page_shift(h));
89 while (1) {
90 /* hugepage always in RAM for now,
91 * but generally it needs to be checked */
92 ptep = huge_pte_offset(current->mm,
93 addr & huge_page_mask(h));
94 present = !!(ptep &&
95 !huge_pte_none(huge_ptep_get(ptep)));
96 while (1) {
97 vec[i++] = present;
98 addr += PAGE_SIZE;
99 /* reach buffer limit */
100 if (i == nr)
101 return nr;
102 /* check hugepage border */
103 if (!((addr & ~huge_page_mask(h))
104 >> PAGE_SHIFT))
105 break;
106 }
107 }
108 return nr;
109 }
110#endif
111
75 /* 112 /*
76 * Calculate how many pages there are left in the last level of the 113 * Calculate how many pages there are left in the last level of the
77 * PTE array for our address. 114 * PTE array for our address.
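mincore(2) is straightforward to exercise from userspace, and with the hugetlb branch above the same per-PAGE_SIZE vec is now filled for huge mappings too (every small-page slot covered by a present huge page reports 1). A minimal sketch against an ordinary anonymous mapping:

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        size_t len = 4 * psz;
        unsigned char vec[4];
        int i;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        p[0] = 1;                               /* fault in the first page only */

        if (mincore(p, len, vec) != 0) {
                perror("mincore");
                return 1;
        }
        for (i = 0; i < 4; i++)
                printf("page %d: %s\n", i, (vec[i] & 1) ? "resident" : "not resident");

        munmap(p, len);
        return 0;
}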
diff --git a/mm/mlock.c b/mm/mlock.c
index bd6f0e466f6c..8f4e2dfceec1 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -25,7 +25,7 @@ int can_do_mlock(void)
25{ 25{
26 if (capable(CAP_IPC_LOCK)) 26 if (capable(CAP_IPC_LOCK))
27 return 1; 27 return 1;
28 if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) 28 if (rlimit(RLIMIT_MEMLOCK) != 0)
29 return 1; 29 return 1;
30 return 0; 30 return 0;
31} 31}
@@ -88,25 +88,22 @@ void mlock_vma_page(struct page *page)
88 } 88 }
89} 89}
90 90
91/* 91/**
92 * called from munlock()/munmap() path with page supposedly on the LRU. 92 * munlock_vma_page - munlock a vma page
93 * @page - page to be unlocked
93 * 94 *
94 * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked 95 * called from munlock()/munmap() path with page supposedly on the LRU.
95 * [in try_to_munlock()] and then attempt to isolate the page. We must 96 * When we munlock a page, because the vma where we found the page is being
96 * isolate the page to keep others from messing with its unevictable 97 * munlock()ed or munmap()ed, we want to check whether other vmas hold the
97 * and mlocked state while trying to munlock. However, we pre-clear the 98 * page locked so that we can leave it on the unevictable lru list and not
98 * mlocked state anyway as we might lose the isolation race and we might 99 * bother vmscan with it. However, to walk the page's rmap list in
99 * not get another chance to clear PageMlocked. If we successfully 100 * try_to_munlock() we must isolate the page from the LRU. If some other
100 * isolate the page and try_to_munlock() detects other VM_LOCKED vmas 101 * task has removed the page from the LRU, we won't be able to do that.
101 * mapping the page, it will restore the PageMlocked state, unless the page 102 * So we clear the PageMlocked as we might not get another chance. If we
102 * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), 103 * can't isolate the page, we leave it for putback_lru_page() and vmscan
103 * perhaps redundantly. 104 * [page_referenced()/try_to_unmap()] to deal with.
104 * If we lose the isolation race, and the page is mapped by other VM_LOCKED
105 * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
106 * either of which will restore the PageMlocked state by calling
107 * mlock_vma_page() above, if it can grab the vma's mmap sem.
108 */ 105 */
109static void munlock_vma_page(struct page *page) 106void munlock_vma_page(struct page *page)
110{ 107{
111 BUG_ON(!PageLocked(page)); 108 BUG_ON(!PageLocked(page));
112 109
@@ -117,18 +114,18 @@ static void munlock_vma_page(struct page *page)
117 /* 114 /*
118 * did try_to_unlock() succeed or punt? 115 * did try_to_unlock() succeed or punt?
119 */ 116 */
120 if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) 117 if (ret != SWAP_MLOCK)
121 count_vm_event(UNEVICTABLE_PGMUNLOCKED); 118 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
122 119
123 putback_lru_page(page); 120 putback_lru_page(page);
124 } else { 121 } else {
125 /* 122 /*
126 * We lost the race. let try_to_unmap() deal 123 * Some other task has removed the page from the LRU.
127 * with it. At least we get the page state and 124 * putback_lru_page() will take care of removing the
128 * mlock stats right. However, page is still on 125 * page from the unevictable list, if necessary.
129 * the noreclaim list. We'll fix that up when 126 * vmscan [page_referenced()] will move the page back
130 * the page is eventually freed or we scan the 127 * to the unevictable list if some other vma has it
131 * noreclaim list. 128 * mlocked.
132 */ 129 */
133 if (PageUnevictable(page)) 130 if (PageUnevictable(page))
134 count_vm_event(UNEVICTABLE_PGSTRANDED); 131 count_vm_event(UNEVICTABLE_PGSTRANDED);
@@ -490,7 +487,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
490 locked = len >> PAGE_SHIFT; 487 locked = len >> PAGE_SHIFT;
491 locked += current->mm->locked_vm; 488 locked += current->mm->locked_vm;
492 489
493 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 490 lock_limit = rlimit(RLIMIT_MEMLOCK);
494 lock_limit >>= PAGE_SHIFT; 491 lock_limit >>= PAGE_SHIFT;
495 492
496 /* check against resource limits */ 493 /* check against resource limits */
@@ -553,7 +550,7 @@ SYSCALL_DEFINE1(mlockall, int, flags)
553 550
554 down_write(&current->mm->mmap_sem); 551 down_write(&current->mm->mmap_sem);
555 552
556 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 553 lock_limit = rlimit(RLIMIT_MEMLOCK);
557 lock_limit >>= PAGE_SHIFT; 554 lock_limit >>= PAGE_SHIFT;
558 555
559 ret = -ENOMEM; 556 ret = -ENOMEM;
@@ -587,7 +584,7 @@ int user_shm_lock(size_t size, struct user_struct *user)
587 int allowed = 0; 584 int allowed = 0;
588 585
589 locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 586 locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
590 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 587 lock_limit = rlimit(RLIMIT_MEMLOCK);
591 if (lock_limit == RLIM_INFINITY) 588 if (lock_limit == RLIM_INFINITY)
592 allowed = 1; 589 allowed = 1;
593 lock_limit >>= PAGE_SHIFT; 590 lock_limit >>= PAGE_SHIFT;
@@ -621,12 +618,12 @@ int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
621 618
622 down_write(&mm->mmap_sem); 619 down_write(&mm->mmap_sem);
623 620
624 lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; 621 lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT;
625 vm = mm->total_vm + pgsz; 622 vm = mm->total_vm + pgsz;
626 if (lim < vm) 623 if (lim < vm)
627 goto out; 624 goto out;
628 625
629 lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 626 lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT;
630 vm = mm->locked_vm + pgsz; 627 vm = mm->locked_vm + pgsz;
631 if (lim < vm) 628 if (lim < vm)
632 goto out; 629 goto out;
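The rlimit() conversions above are a readability change; the limit userspace runs into is the same. A small sketch of the check that can_do_mlock() backs (try it with different "ulimit -l" settings to see mlock() succeed or fail):

#include <stdio.h>
#include <sys/mman.h>
#include <sys/resource.h>

int main(void)
{
        struct rlimit rl;
        static char buf[1 << 20];               /* 1 MiB candidate for locking */

        getrlimit(RLIMIT_MEMLOCK, &rl);
        printf("RLIMIT_MEMLOCK soft limit: %llu bytes\n",
               (unsigned long long)rl.rlim_cur);

        /* can_do_mlock(): allowed with CAP_IPC_LOCK or a non-zero limit. */
        if (mlock(buf, sizeof(buf)) != 0)
                perror("mlock");                /* EPERM/ENOMEM over the limit */
        else
                munlock(buf, sizeof(buf));
        return 0;
}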
diff --git a/mm/mmap.c b/mm/mmap.c
index 73f5e4b64010..456ec6f27889 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -20,7 +20,6 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/personality.h> 21#include <linux/personality.h>
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/ima.h>
24#include <linux/hugetlb.h> 23#include <linux/hugetlb.h>
25#include <linux/profile.h> 24#include <linux/profile.h>
26#include <linux/module.h> 25#include <linux/module.h>
@@ -266,7 +265,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
266 * segment grow beyond its set limit in the case where the limit is 265
267 * not page aligned -Ram Gupta 266 * not page aligned -Ram Gupta
268 */ 267 */
269 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; 268 rlim = rlimit(RLIMIT_DATA);
270 if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + 269 if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
271 (mm->end_data - mm->start_data) > rlim) 270 (mm->end_data - mm->start_data) > rlim)
272 goto out; 271 goto out;
@@ -438,7 +437,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
438{ 437{
439 __vma_link_list(mm, vma, prev, rb_parent); 438 __vma_link_list(mm, vma, prev, rb_parent);
440 __vma_link_rb(mm, vma, rb_link, rb_parent); 439 __vma_link_rb(mm, vma, rb_link, rb_parent);
441 __anon_vma_link(vma);
442} 440}
443 441
444static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, 442static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -500,7 +498,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
500 * are necessary. The "insert" vma (if any) is to be inserted 498 * are necessary. The "insert" vma (if any) is to be inserted
501 * before we drop the necessary locks. 499 * before we drop the necessary locks.
502 */ 500 */
503void vma_adjust(struct vm_area_struct *vma, unsigned long start, 501int vma_adjust(struct vm_area_struct *vma, unsigned long start,
504 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) 502 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
505{ 503{
506 struct mm_struct *mm = vma->vm_mm; 504 struct mm_struct *mm = vma->vm_mm;
@@ -509,11 +507,12 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start,
509 struct address_space *mapping = NULL; 507 struct address_space *mapping = NULL;
510 struct prio_tree_root *root = NULL; 508 struct prio_tree_root *root = NULL;
511 struct file *file = vma->vm_file; 509 struct file *file = vma->vm_file;
512 struct anon_vma *anon_vma = NULL;
513 long adjust_next = 0; 510 long adjust_next = 0;
514 int remove_next = 0; 511 int remove_next = 0;
515 512
516 if (next && !insert) { 513 if (next && !insert) {
514 struct vm_area_struct *exporter = NULL;
515
517 if (end >= next->vm_end) { 516 if (end >= next->vm_end) {
518 /* 517 /*
519 * vma expands, overlapping all the next, and 518 * vma expands, overlapping all the next, and
@@ -521,7 +520,7 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start,
521 */ 520 */
522again: remove_next = 1 + (end > next->vm_end); 521again: remove_next = 1 + (end > next->vm_end);
523 end = next->vm_end; 522 end = next->vm_end;
524 anon_vma = next->anon_vma; 523 exporter = next;
525 importer = vma; 524 importer = vma;
526 } else if (end > next->vm_start) { 525 } else if (end > next->vm_start) {
527 /* 526 /*
@@ -529,7 +528,7 @@ again: remove_next = 1 + (end > next->vm_end);
529 * mprotect case 5 shifting the boundary up. 528 * mprotect case 5 shifting the boundary up.
530 */ 529 */
531 adjust_next = (end - next->vm_start) >> PAGE_SHIFT; 530 adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
532 anon_vma = next->anon_vma; 531 exporter = next;
533 importer = vma; 532 importer = vma;
534 } else if (end < vma->vm_end) { 533 } else if (end < vma->vm_end) {
535 /* 534 /*
@@ -538,9 +537,20 @@ again: remove_next = 1 + (end > next->vm_end);
538 * mprotect case 4 shifting the boundary down. 537 * mprotect case 4 shifting the boundary down.
539 */ 538 */
540 adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); 539 adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
541 anon_vma = next->anon_vma; 540 exporter = vma;
542 importer = next; 541 importer = next;
543 } 542 }
543
544 /*
545 * Easily overlooked: when mprotect shifts the boundary,
546 * make sure the expanding vma has anon_vma set if the
547 * shrinking vma had, to cover any anon pages imported.
548 */
549 if (exporter && exporter->anon_vma && !importer->anon_vma) {
550 if (anon_vma_clone(importer, exporter))
551 return -ENOMEM;
552 importer->anon_vma = exporter->anon_vma;
553 }
544 } 554 }
545 555
546 if (file) { 556 if (file) {
@@ -568,25 +578,6 @@ again: remove_next = 1 + (end > next->vm_end);
568 } 578 }
569 } 579 }
570 580
571 /*
572 * When changing only vma->vm_end, we don't really need
573 * anon_vma lock.
574 */
575 if (vma->anon_vma && (insert || importer || start != vma->vm_start))
576 anon_vma = vma->anon_vma;
577 if (anon_vma) {
578 spin_lock(&anon_vma->lock);
579 /*
580 * Easily overlooked: when mprotect shifts the boundary,
581 * make sure the expanding vma has anon_vma set if the
582 * shrinking vma had, to cover any anon pages imported.
583 */
584 if (importer && !importer->anon_vma) {
585 importer->anon_vma = anon_vma;
586 __anon_vma_link(importer);
587 }
588 }
589
590 if (root) { 581 if (root) {
591 flush_dcache_mmap_lock(mapping); 582 flush_dcache_mmap_lock(mapping);
592 vma_prio_tree_remove(vma, root); 583 vma_prio_tree_remove(vma, root);
@@ -617,8 +608,6 @@ again: remove_next = 1 + (end > next->vm_end);
617 __vma_unlink(mm, next, vma); 608 __vma_unlink(mm, next, vma);
618 if (file) 609 if (file)
619 __remove_shared_vm_struct(next, file, mapping); 610 __remove_shared_vm_struct(next, file, mapping);
620 if (next->anon_vma)
621 __anon_vma_merge(vma, next);
622 } else if (insert) { 611 } else if (insert) {
623 /* 612 /*
624 * split_vma has split insert from vma, and needs 613 * split_vma has split insert from vma, and needs
@@ -628,8 +617,6 @@ again: remove_next = 1 + (end > next->vm_end);
628 __insert_vm_struct(mm, insert); 617 __insert_vm_struct(mm, insert);
629 } 618 }
630 619
631 if (anon_vma)
632 spin_unlock(&anon_vma->lock);
633 if (mapping) 620 if (mapping)
634 spin_unlock(&mapping->i_mmap_lock); 621 spin_unlock(&mapping->i_mmap_lock);
635 622
@@ -639,6 +626,8 @@ again: remove_next = 1 + (end > next->vm_end);
639 if (next->vm_flags & VM_EXECUTABLE) 626 if (next->vm_flags & VM_EXECUTABLE)
640 removed_exe_file_vma(mm); 627 removed_exe_file_vma(mm);
641 } 628 }
629 if (next->anon_vma)
630 anon_vma_merge(vma, next);
642 mm->map_count--; 631 mm->map_count--;
643 mpol_put(vma_policy(next)); 632 mpol_put(vma_policy(next));
644 kmem_cache_free(vm_area_cachep, next); 633 kmem_cache_free(vm_area_cachep, next);
@@ -654,6 +643,8 @@ again: remove_next = 1 + (end > next->vm_end);
654 } 643 }
655 644
656 validate_mm(mm); 645 validate_mm(mm);
646
647 return 0;
657} 648}
658 649
659/* 650/*
@@ -760,6 +751,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
760{ 751{
761 pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 752 pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
762 struct vm_area_struct *area, *next; 753 struct vm_area_struct *area, *next;
754 int err;
763 755
764 /* 756 /*
765 * We later require that vma->vm_flags == vm_flags, 757 * We later require that vma->vm_flags == vm_flags,
@@ -793,11 +785,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
793 is_mergeable_anon_vma(prev->anon_vma, 785 is_mergeable_anon_vma(prev->anon_vma,
794 next->anon_vma)) { 786 next->anon_vma)) {
795 /* cases 1, 6 */ 787 /* cases 1, 6 */
796 vma_adjust(prev, prev->vm_start, 788 err = vma_adjust(prev, prev->vm_start,
797 next->vm_end, prev->vm_pgoff, NULL); 789 next->vm_end, prev->vm_pgoff, NULL);
798 } else /* cases 2, 5, 7 */ 790 } else /* cases 2, 5, 7 */
799 vma_adjust(prev, prev->vm_start, 791 err = vma_adjust(prev, prev->vm_start,
800 end, prev->vm_pgoff, NULL); 792 end, prev->vm_pgoff, NULL);
793 if (err)
794 return NULL;
801 return prev; 795 return prev;
802 } 796 }
803 797
@@ -809,11 +803,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
809 can_vma_merge_before(next, vm_flags, 803 can_vma_merge_before(next, vm_flags,
810 anon_vma, file, pgoff+pglen)) { 804 anon_vma, file, pgoff+pglen)) {
811 if (prev && addr < prev->vm_end) /* case 4 */ 805 if (prev && addr < prev->vm_end) /* case 4 */
812 vma_adjust(prev, prev->vm_start, 806 err = vma_adjust(prev, prev->vm_start,
813 addr, prev->vm_pgoff, NULL); 807 addr, prev->vm_pgoff, NULL);
814 else /* cases 3, 8 */ 808 else /* cases 3, 8 */
815 vma_adjust(area, addr, next->vm_end, 809 err = vma_adjust(area, addr, next->vm_end,
816 next->vm_pgoff - pglen, NULL); 810 next->vm_pgoff - pglen, NULL);
811 if (err)
812 return NULL;
817 return area; 813 return area;
818 } 814 }
819 815
@@ -821,6 +817,61 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
821} 817}
822 818
823/* 819/*
820 * Rough compatibility check to quickly see if it's even worth looking
821 * at sharing an anon_vma.
822 *
823 * They need to have the same vm_file, and the flags can only differ
824 * in things that mprotect may change.
825 *
826 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
827 * we can merge the two vma's. For example, we refuse to merge a vma if
828 * there is a vm_ops->close() function, because that indicates that the
829 * driver is doing some kind of reference counting. But that doesn't
830 * really matter for the anon_vma sharing case.
831 */
832static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
833{
834 return a->vm_end == b->vm_start &&
835 mpol_equal(vma_policy(a), vma_policy(b)) &&
836 a->vm_file == b->vm_file &&
837 !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) &&
838 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
839}
840
841/*
842 * Do some basic sanity checking to see if we can re-use the anon_vma
843 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
844 * the same as 'old', the other will be the new one that is trying
845 * to share the anon_vma.
846 *
847 * NOTE! This runs with mm_sem held for reading, so it is possible that
848 * the anon_vma of 'old' is concurrently in the process of being set up
849 * by another page fault trying to merge _that_. But that's ok: if it
850 * is being set up, that automatically means that it will be a singleton
851 * acceptable for merging, so we can do all of this optimistically. But
852 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer.
853 *
854 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
855 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
856 * is to return an anon_vma that is "complex" due to having gone through
857 * a fork).
858 *
859 * We also make sure that the two vma's are compatible (adjacent,
860 * and with the same memory policies). That's all stable, even with just
861 * a read lock on the mm_sem.
862 */
863static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
864{
865 if (anon_vma_compatible(a, b)) {
866 struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma);
867
868 if (anon_vma && list_is_singular(&old->anon_vma_chain))
869 return anon_vma;
870 }
871 return NULL;
872}
873
874/*
824 * find_mergeable_anon_vma is used by anon_vma_prepare, to check 875 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
825 * neighbouring vmas for a suitable anon_vma, before it goes off 876 * neighbouring vmas for a suitable anon_vma, before it goes off
826 * to allocate a new anon_vma. It checks because a repetitive 877 * to allocate a new anon_vma. It checks because a repetitive
@@ -830,28 +881,16 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
830 */ 881 */
831struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) 882struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
832{ 883{
884 struct anon_vma *anon_vma;
833 struct vm_area_struct *near; 885 struct vm_area_struct *near;
834 unsigned long vm_flags;
835 886
836 near = vma->vm_next; 887 near = vma->vm_next;
837 if (!near) 888 if (!near)
838 goto try_prev; 889 goto try_prev;
839 890
840 /* 891 anon_vma = reusable_anon_vma(near, vma, near);
841 * Since only mprotect tries to remerge vmas, match flags 892 if (anon_vma)
842 * which might be mprotected into each other later on. 893 return anon_vma;
843 * Neither mlock nor madvise tries to remerge at present,
844 * so leave their flags as obstructing a merge.
845 */
846 vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
847 vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
848
849 if (near->anon_vma && vma->vm_end == near->vm_start &&
850 mpol_equal(vma_policy(vma), vma_policy(near)) &&
851 can_vma_merge_before(near, vm_flags,
852 NULL, vma->vm_file, vma->vm_pgoff +
853 ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
854 return near->anon_vma;
855try_prev: 894try_prev:
856 /* 895 /*
857 * It is potentially slow to have to call find_vma_prev here. 896 * It is potentially slow to have to call find_vma_prev here.
@@ -864,14 +903,9 @@ try_prev:
864 if (!near) 903 if (!near)
865 goto none; 904 goto none;
866 905
867 vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); 906 anon_vma = reusable_anon_vma(near, near, vma);
868 vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); 907 if (anon_vma)
869 908 return anon_vma;
870 if (near->anon_vma && near->vm_end == vma->vm_start &&
871 mpol_equal(vma_policy(near), vma_policy(vma)) &&
872 can_vma_merge_after(near, vm_flags,
873 NULL, vma->vm_file, vma->vm_pgoff))
874 return near->anon_vma;
875none: 909none:
876 /* 910 /*
877 * There's no absolute need to look only at touching neighbours: 911 * There's no absolute need to look only at touching neighbours:
@@ -932,13 +966,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
932 if (!(flags & MAP_FIXED)) 966 if (!(flags & MAP_FIXED))
933 addr = round_hint_to_min(addr); 967 addr = round_hint_to_min(addr);
934 968
935 error = arch_mmap_check(addr, len, flags);
936 if (error)
937 return error;
938
939 /* Careful about overflows.. */ 969 /* Careful about overflows.. */
940 len = PAGE_ALIGN(len); 970 len = PAGE_ALIGN(len);
941 if (!len || len > TASK_SIZE) 971 if (!len)
942 return -ENOMEM; 972 return -ENOMEM;
943 973
944 /* offset overflow? */ 974 /* offset overflow? */
@@ -949,24 +979,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
949 if (mm->map_count > sysctl_max_map_count) 979 if (mm->map_count > sysctl_max_map_count)
950 return -ENOMEM; 980 return -ENOMEM;
951 981
952 if (flags & MAP_HUGETLB) {
953 struct user_struct *user = NULL;
954 if (file)
955 return -EINVAL;
956
957 /*
958 * VM_NORESERVE is used because the reservations will be
959 * taken when vm_ops->mmap() is called
960 * A dummy user value is used because we are not locking
961 * memory so no accounting is necessary
962 */
963 len = ALIGN(len, huge_page_size(&default_hstate));
964 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
965 &user, HUGETLB_ANONHUGE_INODE);
966 if (IS_ERR(file))
967 return PTR_ERR(file);
968 }
969
970 /* Obtain the address to map to. we verify (or select) it and ensure 982 /* Obtain the address to map to. we verify (or select) it and ensure
971 * that it represents a valid section of the address space. 983 * that it represents a valid section of the address space.
972 */ 984 */
@@ -990,7 +1002,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
990 unsigned long locked, lock_limit; 1002 unsigned long locked, lock_limit;
991 locked = len >> PAGE_SHIFT; 1003 locked = len >> PAGE_SHIFT;
992 locked += mm->locked_vm; 1004 locked += mm->locked_vm;
993 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 1005 lock_limit = rlimit(RLIMIT_MEMLOCK);
994 lock_limit >>= PAGE_SHIFT; 1006 lock_limit >>= PAGE_SHIFT;
995 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 1007 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
996 return -EAGAIN; 1008 return -EAGAIN;
@@ -1061,14 +1073,75 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1061 error = security_file_mmap(file, reqprot, prot, flags, addr, 0); 1073 error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
1062 if (error) 1074 if (error)
1063 return error; 1075 return error;
1064 error = ima_file_mmap(file, prot);
1065 if (error)
1066 return error;
1067 1076
1068 return mmap_region(file, addr, len, flags, vm_flags, pgoff); 1077 return mmap_region(file, addr, len, flags, vm_flags, pgoff);
1069} 1078}
1070EXPORT_SYMBOL(do_mmap_pgoff); 1079EXPORT_SYMBOL(do_mmap_pgoff);
1071 1080
1081SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1082 unsigned long, prot, unsigned long, flags,
1083 unsigned long, fd, unsigned long, pgoff)
1084{
1085 struct file *file = NULL;
1086 unsigned long retval = -EBADF;
1087
1088 if (!(flags & MAP_ANONYMOUS)) {
1089 if (unlikely(flags & MAP_HUGETLB))
1090 return -EINVAL;
1091 file = fget(fd);
1092 if (!file)
1093 goto out;
1094 } else if (flags & MAP_HUGETLB) {
1095 struct user_struct *user = NULL;
1096 /*
1097 * VM_NORESERVE is used because the reservations will be
1098 * taken when vm_ops->mmap() is called
1099 * A dummy user value is used because we are not locking
1100 * memory so no accounting is necessary
1101 */
1102 len = ALIGN(len, huge_page_size(&default_hstate));
1103 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
1104 &user, HUGETLB_ANONHUGE_INODE);
1105 if (IS_ERR(file))
1106 return PTR_ERR(file);
1107 }
1108
1109 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1110
1111 down_write(&current->mm->mmap_sem);
1112 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1113 up_write(&current->mm->mmap_sem);
1114
1115 if (file)
1116 fput(file);
1117out:
1118 return retval;
1119}
1120
1121#ifdef __ARCH_WANT_SYS_OLD_MMAP
1122struct mmap_arg_struct {
1123 unsigned long addr;
1124 unsigned long len;
1125 unsigned long prot;
1126 unsigned long flags;
1127 unsigned long fd;
1128 unsigned long offset;
1129};
1130
1131SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1132{
1133 struct mmap_arg_struct a;
1134
1135 if (copy_from_user(&a, arg, sizeof(a)))
1136 return -EFAULT;
1137 if (a.offset & ~PAGE_MASK)
1138 return -EINVAL;
1139
1140 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1141 a.offset >> PAGE_SHIFT);
1142}
1143#endif /* __ARCH_WANT_SYS_OLD_MMAP */
1144
1072/* 1145/*
1073 * Some shared mappings will want the pages marked read-only 1146
1074 * to track write events. If so, we'll downgrade vm_page_prot 1147 * to track write events. If so, we'll downgrade vm_page_prot
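With the MAP_HUGETLB handling moved into sys_mmap_pgoff(), an anonymous huge-page mapping needs no hugetlbfs mount from userspace. A hedged sketch: MAP_HUGETLB may have to be defined by hand on older libcs (the 0x40000 value below is the x86 one, an assumption), and the mapping fails unless huge pages have been reserved:

#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000             /* assumed x86 value; check your headers */
#endif

int main(void)
{
        size_t len = 2UL << 20;                 /* one 2 MiB huge page (x86 default) */
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap(MAP_HUGETLB)");    /* e.g. no huge pages reserved */
                return 1;
        }
        printf("huge mapping at %p\n", p);
        munmap(p, len);
        return 0;
}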
@@ -1191,6 +1264,7 @@ munmap_back:
1191 vma->vm_flags = vm_flags; 1264 vma->vm_flags = vm_flags;
1192 vma->vm_page_prot = vm_get_page_prot(vm_flags); 1265 vma->vm_page_prot = vm_get_page_prot(vm_flags);
1193 vma->vm_pgoff = pgoff; 1266 vma->vm_pgoff = pgoff;
1267 INIT_LIST_HEAD(&vma->anon_vma_chain);
1194 1268
1195 if (file) { 1269 if (file) {
1196 error = -EINVAL; 1270 error = -EINVAL;
@@ -1224,8 +1298,20 @@ munmap_back:
1224 goto free_vma; 1298 goto free_vma;
1225 } 1299 }
1226 1300
1227 if (vma_wants_writenotify(vma)) 1301 if (vma_wants_writenotify(vma)) {
1302 pgprot_t pprot = vma->vm_page_prot;
1303
1304 /* Can vma->vm_page_prot have changed??
1305 *
1306 * Answer: Yes, drivers may have changed it in their
1307 * f_op->mmap method.
1308 *
1309 * Ensures that vmas marked as uncached stay that way.
1310 */
1228 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); 1311 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1312 if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
1313 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1314 }
1229 1315
1230 vma_link(mm, vma, prev, rb_link, rb_parent); 1316 vma_link(mm, vma, prev, rb_link, rb_parent);
1231 file = vma->vm_file; 1317 file = vma->vm_file;
@@ -1239,13 +1325,8 @@ out:
1239 mm->total_vm += len >> PAGE_SHIFT; 1325 mm->total_vm += len >> PAGE_SHIFT;
1240 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1326 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1241 if (vm_flags & VM_LOCKED) { 1327 if (vm_flags & VM_LOCKED) {
1242 /* 1328 if (!mlock_vma_pages_range(vma, addr, addr + len))
1243 * makes pages present; downgrades, drops, reacquires mmap_sem 1329 mm->locked_vm += (len >> PAGE_SHIFT);
1244 */
1245 long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
1246 if (nr_pages < 0)
1247 return nr_pages; /* vma gone! */
1248 mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
1249 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1330 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
1250 make_pages_present(addr, addr + len); 1331 make_pages_present(addr, addr + len);
1251 return addr; 1332 return addr;
@@ -1459,6 +1540,14 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1459 unsigned long (*get_area)(struct file *, unsigned long, 1540 unsigned long (*get_area)(struct file *, unsigned long,
1460 unsigned long, unsigned long, unsigned long); 1541 unsigned long, unsigned long, unsigned long);
1461 1542
1543 unsigned long error = arch_mmap_check(addr, len, flags);
1544 if (error)
1545 return error;
1546
1547 /* Careful about overflows.. */
1548 if (len > TASK_SIZE)
1549 return -ENOMEM;
1550
1462 get_area = current->mm->get_unmapped_area; 1551 get_area = current->mm->get_unmapped_area;
1463 if (file && file->f_op && file->f_op->get_unmapped_area) 1552 if (file && file->f_op && file->f_op->get_unmapped_area)
1464 get_area = file->f_op->get_unmapped_area; 1553 get_area = file->f_op->get_unmapped_area;
@@ -1565,7 +1654,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1565 return -ENOMEM; 1654 return -ENOMEM;
1566 1655
1567 /* Stack limit test */ 1656 /* Stack limit test */
1568 if (size > rlim[RLIMIT_STACK].rlim_cur) 1657 if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
1569 return -ENOMEM; 1658 return -ENOMEM;
1570 1659
1571 /* mlock limit tests */ 1660 /* mlock limit tests */
@@ -1573,7 +1662,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1573 unsigned long locked; 1662 unsigned long locked;
1574 unsigned long limit; 1663 unsigned long limit;
1575 locked = mm->locked_vm + grow; 1664 locked = mm->locked_vm + grow;
1576 limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 1665 limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
1666 limit >>= PAGE_SHIFT;
1577 if (locked > limit && !capable(CAP_IPC_LOCK)) 1667 if (locked > limit && !capable(CAP_IPC_LOCK))
1578 return -ENOMEM; 1668 return -ENOMEM;
1579 } 1669 }
@@ -1720,8 +1810,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
1720 if (!prev || expand_stack(prev, addr)) 1810 if (!prev || expand_stack(prev, addr))
1721 return NULL; 1811 return NULL;
1722 if (prev->vm_flags & VM_LOCKED) { 1812 if (prev->vm_flags & VM_LOCKED) {
1723 if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) 1813 mlock_vma_pages_range(prev, addr, prev->vm_end);
1724 return NULL; /* vma gone! */
1725 } 1814 }
1726 return prev; 1815 return prev;
1727} 1816}
@@ -1749,8 +1838,7 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
1749 if (expand_stack(vma, addr)) 1838 if (expand_stack(vma, addr))
1750 return NULL; 1839 return NULL;
1751 if (vma->vm_flags & VM_LOCKED) { 1840 if (vma->vm_flags & VM_LOCKED) {
1752 if (mlock_vma_pages_range(vma, addr, start) < 0) 1841 mlock_vma_pages_range(vma, addr, start);
1753 return NULL; /* vma gone! */
1754 } 1842 }
1755 return vma; 1843 return vma;
1756} 1844}
@@ -1829,29 +1917,29 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1829} 1917}
1830 1918
1831/* 1919/*
1832 * Split a vma into two pieces at address 'addr', a new vma is allocated 1920 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the
1833 * either for the first part or the tail. 1921 * munmap path where it doesn't make sense to fail.
1834 */ 1922 */
1835int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, 1923static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1836 unsigned long addr, int new_below) 1924 unsigned long addr, int new_below)
1837{ 1925{
1838 struct mempolicy *pol; 1926 struct mempolicy *pol;
1839 struct vm_area_struct *new; 1927 struct vm_area_struct *new;
1928 int err = -ENOMEM;
1840 1929
1841 if (is_vm_hugetlb_page(vma) && (addr & 1930 if (is_vm_hugetlb_page(vma) && (addr &
1842 ~(huge_page_mask(hstate_vma(vma))))) 1931 ~(huge_page_mask(hstate_vma(vma)))))
1843 return -EINVAL; 1932 return -EINVAL;
1844 1933
1845 if (mm->map_count >= sysctl_max_map_count)
1846 return -ENOMEM;
1847
1848 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 1934 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
1849 if (!new) 1935 if (!new)
1850 return -ENOMEM; 1936 goto out_err;
1851 1937
1852 /* most fields are the same, copy all, and then fixup */ 1938 /* most fields are the same, copy all, and then fixup */
1853 *new = *vma; 1939 *new = *vma;
1854 1940
1941 INIT_LIST_HEAD(&new->anon_vma_chain);
1942
1855 if (new_below) 1943 if (new_below)
1856 new->vm_end = addr; 1944 new->vm_end = addr;
1857 else { 1945 else {
@@ -1861,11 +1949,14 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1861 1949
1862 pol = mpol_dup(vma_policy(vma)); 1950 pol = mpol_dup(vma_policy(vma));
1863 if (IS_ERR(pol)) { 1951 if (IS_ERR(pol)) {
1864 kmem_cache_free(vm_area_cachep, new); 1952 err = PTR_ERR(pol);
1865 return PTR_ERR(pol); 1953 goto out_free_vma;
1866 } 1954 }
1867 vma_set_policy(new, pol); 1955 vma_set_policy(new, pol);
1868 1956
1957 if (anon_vma_clone(new, vma))
1958 goto out_free_mpol;
1959
1869 if (new->vm_file) { 1960 if (new->vm_file) {
1870 get_file(new->vm_file); 1961 get_file(new->vm_file);
1871 if (vma->vm_flags & VM_EXECUTABLE) 1962 if (vma->vm_flags & VM_EXECUTABLE)
@@ -1876,12 +1967,42 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1876 new->vm_ops->open(new); 1967 new->vm_ops->open(new);
1877 1968
1878 if (new_below) 1969 if (new_below)
1879 vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + 1970 err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
1880 ((addr - new->vm_start) >> PAGE_SHIFT), new); 1971 ((addr - new->vm_start) >> PAGE_SHIFT), new);
1881 else 1972 else
1882 vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); 1973 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
1883 1974
1884 return 0; 1975 /* Success. */
1976 if (!err)
1977 return 0;
1978
1979 /* Clean everything up if vma_adjust failed. */
1980 if (new->vm_ops && new->vm_ops->close)
1981 new->vm_ops->close(new);
1982 if (new->vm_file) {
1983 if (vma->vm_flags & VM_EXECUTABLE)
1984 removed_exe_file_vma(mm);
1985 fput(new->vm_file);
1986 }
1987 out_free_mpol:
1988 mpol_put(pol);
1989 out_free_vma:
1990 kmem_cache_free(vm_area_cachep, new);
1991 out_err:
1992 return err;
1993}
1994
1995/*
1996 * Split a vma into two pieces at address 'addr', a new vma is allocated
1997 * either for the first part or the tail.
1998 */
1999int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
2000 unsigned long addr, int new_below)
2001{
2002 if (mm->map_count >= sysctl_max_map_count)
2003 return -ENOMEM;
2004
2005 return __split_vma(mm, vma, addr, new_below);
1885} 2006}
1886 2007
1887/* Munmap is split into 2 main parts -- this part which finds 2008/* Munmap is split into 2 main parts -- this part which finds
@@ -1919,7 +2040,17 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1919 * places tmp vma above, and higher split_vma places tmp vma below. 2040 * places tmp vma above, and higher split_vma places tmp vma below.
1920 */ 2041 */
1921 if (start > vma->vm_start) { 2042 if (start > vma->vm_start) {
1922 int error = split_vma(mm, vma, start, 0); 2043 int error;
2044
2045 /*
2046 * Make sure that map_count on return from munmap() will
2047 * not exceed its limit; but let map_count go just above
2048 * its limit temporarily, to help free resources as expected.
2049 */
2050 if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
2051 return -ENOMEM;
2052
2053 error = __split_vma(mm, vma, start, 0);
1923 if (error) 2054 if (error)
1924 return error; 2055 return error;
1925 prev = vma; 2056 prev = vma;
@@ -1928,7 +2059,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1928 /* Does it split the last one? */ 2059 /* Does it split the last one? */
1929 last = find_vma(mm, end); 2060 last = find_vma(mm, end);
1930 if (last && end > last->vm_start) { 2061 if (last && end > last->vm_start) {
1931 int error = split_vma(mm, last, end, 1); 2062 int error = __split_vma(mm, last, end, 1);
1932 if (error) 2063 if (error)
1933 return error; 2064 return error;
1934 } 2065 }
@@ -2003,20 +2134,14 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2003 if (!len) 2134 if (!len)
2004 return addr; 2135 return addr;
2005 2136
2006 if ((addr + len) > TASK_SIZE || (addr + len) < addr)
2007 return -EINVAL;
2008
2009 if (is_hugepage_only_range(mm, addr, len))
2010 return -EINVAL;
2011
2012 error = security_file_mmap(NULL, 0, 0, 0, addr, 1); 2137 error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
2013 if (error) 2138 if (error)
2014 return error; 2139 return error;
2015 2140
2016 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 2141 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
2017 2142
2018 error = arch_mmap_check(addr, len, flags); 2143 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
2019 if (error) 2144 if (error & ~PAGE_MASK)
2020 return error; 2145 return error;
2021 2146
2022 /* 2147 /*
@@ -2026,7 +2151,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2026 unsigned long locked, lock_limit; 2151 unsigned long locked, lock_limit;
2027 locked = len >> PAGE_SHIFT; 2152 locked = len >> PAGE_SHIFT;
2028 locked += mm->locked_vm; 2153 locked += mm->locked_vm;
2029 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 2154 lock_limit = rlimit(RLIMIT_MEMLOCK);
2030 lock_limit >>= PAGE_SHIFT; 2155 lock_limit >>= PAGE_SHIFT;
2031 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 2156 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
2032 return -EAGAIN; 2157 return -EAGAIN;
@@ -2074,6 +2199,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2074 return -ENOMEM; 2199 return -ENOMEM;
2075 } 2200 }
2076 2201
2202 INIT_LIST_HEAD(&vma->anon_vma_chain);
2077 vma->vm_mm = mm; 2203 vma->vm_mm = mm;
2078 vma->vm_start = addr; 2204 vma->vm_start = addr;
2079 vma->vm_end = addr + len; 2205 vma->vm_end = addr + len;
@@ -2210,10 +2336,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2210 if (new_vma) { 2336 if (new_vma) {
2211 *new_vma = *vma; 2337 *new_vma = *vma;
2212 pol = mpol_dup(vma_policy(vma)); 2338 pol = mpol_dup(vma_policy(vma));
2213 if (IS_ERR(pol)) { 2339 if (IS_ERR(pol))
2214 kmem_cache_free(vm_area_cachep, new_vma); 2340 goto out_free_vma;
2215 return NULL; 2341 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2216 } 2342 if (anon_vma_clone(new_vma, vma))
2343 goto out_free_mempol;
2217 vma_set_policy(new_vma, pol); 2344 vma_set_policy(new_vma, pol);
2218 new_vma->vm_start = addr; 2345 new_vma->vm_start = addr;
2219 new_vma->vm_end = addr + len; 2346 new_vma->vm_end = addr + len;
@@ -2229,6 +2356,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2229 } 2356 }
2230 } 2357 }
2231 return new_vma; 2358 return new_vma;
2359
2360 out_free_mempol:
2361 mpol_put(pol);
2362 out_free_vma:
2363 kmem_cache_free(vm_area_cachep, new_vma);
2364 return NULL;
2232} 2365}
2233 2366
2234/* 2367/*
@@ -2240,7 +2373,7 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages)
2240 unsigned long cur = mm->total_vm; /* pages */ 2373 unsigned long cur = mm->total_vm; /* pages */
2241 unsigned long lim; 2374 unsigned long lim;
2242 2375
2243 lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; 2376 lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;
2244 2377
2245 if (cur + npages > lim) 2378 if (cur + npages > lim)
2246 return 0; 2379 return 0;
@@ -2306,6 +2439,7 @@ int install_special_mapping(struct mm_struct *mm,
2306 if (unlikely(vma == NULL)) 2439 if (unlikely(vma == NULL))
2307 return -ENOMEM; 2440 return -ENOMEM;
2308 2441
2442 INIT_LIST_HEAD(&vma->anon_vma_chain);
2309 vma->vm_mm = mm; 2443 vma->vm_mm = mm;
2310 vma->vm_start = addr; 2444 vma->vm_start = addr;
2311 vma->vm_end = addr + len; 2445 vma->vm_end = addr + len;
@@ -2406,6 +2540,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2406int mm_take_all_locks(struct mm_struct *mm) 2540int mm_take_all_locks(struct mm_struct *mm)
2407{ 2541{
2408 struct vm_area_struct *vma; 2542 struct vm_area_struct *vma;
2543 struct anon_vma_chain *avc;
2409 int ret = -EINTR; 2544 int ret = -EINTR;
2410 2545
2411 BUG_ON(down_read_trylock(&mm->mmap_sem)); 2546 BUG_ON(down_read_trylock(&mm->mmap_sem));
@@ -2423,7 +2558,8 @@ int mm_take_all_locks(struct mm_struct *mm)
2423 if (signal_pending(current)) 2558 if (signal_pending(current))
2424 goto out_unlock; 2559 goto out_unlock;
2425 if (vma->anon_vma) 2560 if (vma->anon_vma)
2426 vm_lock_anon_vma(mm, vma->anon_vma); 2561 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
2562 vm_lock_anon_vma(mm, avc->anon_vma);
2427 } 2563 }
2428 2564
2429 ret = 0; 2565 ret = 0;
@@ -2478,13 +2614,15 @@ static void vm_unlock_mapping(struct address_space *mapping)
2478void mm_drop_all_locks(struct mm_struct *mm) 2614void mm_drop_all_locks(struct mm_struct *mm)
2479{ 2615{
2480 struct vm_area_struct *vma; 2616 struct vm_area_struct *vma;
2617 struct anon_vma_chain *avc;
2481 2618
2482 BUG_ON(down_read_trylock(&mm->mmap_sem)); 2619 BUG_ON(down_read_trylock(&mm->mmap_sem));
2483 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); 2620 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
2484 2621
2485 for (vma = mm->mmap; vma; vma = vma->vm_next) { 2622 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2486 if (vma->anon_vma) 2623 if (vma->anon_vma)
2487 vm_unlock_anon_vma(vma->anon_vma); 2624 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
2625 vm_unlock_anon_vma(avc->anon_vma);
2488 if (vma->vm_file && vma->vm_file->f_mapping) 2626 if (vma->vm_file && vma->vm_file->f_mapping)
2489 vm_unlock_mapping(vma->vm_file->f_mapping); 2627 vm_unlock_mapping(vma->vm_file->f_mapping);
2490 } 2628 }
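
The __split_vma() rewrite in the mmap.c hunks above centralises its error handling with backward-jumping labels (out_free_mpol, out_free_vma, out_err), so each failure path frees exactly what has been set up so far, in reverse order of acquisition. The following is a minimal userspace sketch of that goto-unwind pattern using plain malloc()/free(); the struct and function names are illustrative only, not kernel API.

/*
 * Sketch of the goto-unwind cleanup pattern used by __split_vma() above.
 * Each allocation gets a label; failures jump to the label that releases
 * only what was already acquired.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct policy { int dummy; };
struct area   { struct policy *pol; char *name; };

static struct area *area_split(const char *name)
{
	struct area *new;
	int err = -ENOMEM;

	new = malloc(sizeof(*new));
	if (!new)
		goto out_err;

	new->name = strdup(name);
	if (!new->name)
		goto out_free_area;

	new->pol = malloc(sizeof(*new->pol));
	if (!new->pol)
		goto out_free_name;

	return new;			/* success: everything initialised */

out_free_name:
	free(new->name);
out_free_area:
	free(new);
out_err:
	errno = -err;
	return NULL;
}

int main(void)
{
	struct area *a = area_split("example");

	if (!a)
		return 1;
	printf("split ok: %s\n", a->name);
	free(a->pol);
	free(a->name);
	free(a);
	return 0;
}
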
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index ded9081f4021..9e82e937000e 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -5,6 +5,7 @@
5 5
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/mmu_context.h> 7#include <linux/mmu_context.h>
8#include <linux/module.h>
8#include <linux/sched.h> 9#include <linux/sched.h>
9 10
10#include <asm/mmu_context.h> 11#include <asm/mmu_context.h>
@@ -37,6 +38,7 @@ void use_mm(struct mm_struct *mm)
37 if (active_mm != mm) 38 if (active_mm != mm)
38 mmdrop(active_mm); 39 mmdrop(active_mm);
39} 40}
41EXPORT_SYMBOL_GPL(use_mm);
40 42
41/* 43/*
42 * unuse_mm 44 * unuse_mm
@@ -51,8 +53,10 @@ void unuse_mm(struct mm_struct *mm)
51 struct task_struct *tsk = current; 53 struct task_struct *tsk = current;
52 54
53 task_lock(tsk); 55 task_lock(tsk);
56 sync_mm_rss(tsk, mm);
54 tsk->mm = NULL; 57 tsk->mm = NULL;
55 /* active_mm is still 'mm' */ 58 /* active_mm is still 'mm' */
56 enter_lazy_tlb(mm, tsk); 59 enter_lazy_tlb(mm, tsk);
57 task_unlock(tsk); 60 task_unlock(tsk);
58} 61}
62EXPORT_SYMBOL_GPL(unuse_mm);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 7e33f2cb3c77..438951d366f2 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -16,6 +16,7 @@
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/rcupdate.h> 17#include <linux/rcupdate.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/slab.h>
19 20
20/* 21/*
21 * This function can't run concurrently against mmu_notifier_register 22 * This function can't run concurrently against mmu_notifier_register
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8bc969d8112d..2d1bf7cf8851 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -10,7 +10,6 @@
10 10
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/hugetlb.h> 12#include <linux/hugetlb.h>
13#include <linux/slab.h>
14#include <linux/shm.h> 13#include <linux/shm.h>
15#include <linux/mman.h> 14#include <linux/mman.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
diff --git a/mm/mremap.c b/mm/mremap.c
index 97bff2547719..cde56ee51ef7 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/hugetlb.h> 11#include <linux/hugetlb.h>
12#include <linux/slab.h>
13#include <linux/shm.h> 12#include <linux/shm.h>
14#include <linux/ksm.h> 13#include <linux/ksm.h>
15#include <linux/mman.h> 14#include <linux/mman.h>
@@ -261,6 +260,137 @@ static unsigned long move_vma(struct vm_area_struct *vma,
261 return new_addr; 260 return new_addr;
262} 261}
263 262
263static struct vm_area_struct *vma_to_resize(unsigned long addr,
264 unsigned long old_len, unsigned long new_len, unsigned long *p)
265{
266 struct mm_struct *mm = current->mm;
267 struct vm_area_struct *vma = find_vma(mm, addr);
268
269 if (!vma || vma->vm_start > addr)
270 goto Efault;
271
272 if (is_vm_hugetlb_page(vma))
273 goto Einval;
274
275 /* We can't remap across vm area boundaries */
276 if (old_len > vma->vm_end - addr)
277 goto Efault;
278
279 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
280 if (new_len > old_len)
281 goto Efault;
282 }
283
284 if (vma->vm_flags & VM_LOCKED) {
285 unsigned long locked, lock_limit;
286 locked = mm->locked_vm << PAGE_SHIFT;
287 lock_limit = rlimit(RLIMIT_MEMLOCK);
288 locked += new_len - old_len;
289 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
290 goto Eagain;
291 }
292
293 if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
294 goto Enomem;
295
296 if (vma->vm_flags & VM_ACCOUNT) {
297 unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
298 if (security_vm_enough_memory(charged))
299 goto Efault;
300 *p = charged;
301 }
302
303 return vma;
304
305Efault: /* very odd choice for most of the cases, but... */
306 return ERR_PTR(-EFAULT);
307Einval:
308 return ERR_PTR(-EINVAL);
309Enomem:
310 return ERR_PTR(-ENOMEM);
311Eagain:
312 return ERR_PTR(-EAGAIN);
313}
314
315static unsigned long mremap_to(unsigned long addr,
316 unsigned long old_len, unsigned long new_addr,
317 unsigned long new_len)
318{
319 struct mm_struct *mm = current->mm;
320 struct vm_area_struct *vma;
321 unsigned long ret = -EINVAL;
322 unsigned long charged = 0;
323 unsigned long map_flags;
324
325 if (new_addr & ~PAGE_MASK)
326 goto out;
327
328 if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
329 goto out;
330
331 /* Check if the location we're moving into overlaps the
332 * old location at all, and fail if it does.
333 */
334 if ((new_addr <= addr) && (new_addr+new_len) > addr)
335 goto out;
336
337 if ((addr <= new_addr) && (addr+old_len) > new_addr)
338 goto out;
339
340 ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
341 if (ret)
342 goto out;
343
344 ret = do_munmap(mm, new_addr, new_len);
345 if (ret)
346 goto out;
347
348 if (old_len >= new_len) {
349 ret = do_munmap(mm, addr+new_len, old_len - new_len);
350 if (ret && old_len != new_len)
351 goto out;
352 old_len = new_len;
353 }
354
355 vma = vma_to_resize(addr, old_len, new_len, &charged);
356 if (IS_ERR(vma)) {
357 ret = PTR_ERR(vma);
358 goto out;
359 }
360
361 map_flags = MAP_FIXED;
362 if (vma->vm_flags & VM_MAYSHARE)
363 map_flags |= MAP_SHARED;
364
365 ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
366 ((addr - vma->vm_start) >> PAGE_SHIFT),
367 map_flags);
368 if (ret & ~PAGE_MASK)
369 goto out1;
370
371 ret = move_vma(vma, addr, old_len, new_len, new_addr);
372 if (!(ret & ~PAGE_MASK))
373 goto out;
374out1:
375 vm_unacct_memory(charged);
376
377out:
378 return ret;
379}
380
381static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
382{
383 unsigned long end = vma->vm_end + delta;
384 if (end < vma->vm_end) /* overflow */
385 return 0;
386 if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
387 return 0;
388 if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
389 0, MAP_FIXED) & ~PAGE_MASK)
390 return 0;
391 return 1;
392}
393
264/* 394/*
265 * Expand (or shrink) an existing mapping, potentially moving it at the 395 * Expand (or shrink) an existing mapping, potentially moving it at the
266 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) 396 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
@@ -294,32 +424,10 @@ unsigned long do_mremap(unsigned long addr,
294 if (!new_len) 424 if (!new_len)
295 goto out; 425 goto out;
296 426
297 /* new_addr is only valid if MREMAP_FIXED is specified */
298 if (flags & MREMAP_FIXED) { 427 if (flags & MREMAP_FIXED) {
299 if (new_addr & ~PAGE_MASK) 428 if (flags & MREMAP_MAYMOVE)
300 goto out; 429 ret = mremap_to(addr, old_len, new_addr, new_len);
301 if (!(flags & MREMAP_MAYMOVE)) 430 goto out;
302 goto out;
303
304 if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
305 goto out;
306
307 /* Check if the location we're moving into overlaps the
308 * old location at all, and fail if it does.
309 */
310 if ((new_addr <= addr) && (new_addr+new_len) > addr)
311 goto out;
312
313 if ((addr <= new_addr) && (addr+old_len) > new_addr)
314 goto out;
315
316 ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
317 if (ret)
318 goto out;
319
320 ret = do_munmap(mm, new_addr, new_len);
321 if (ret)
322 goto out;
323 } 431 }
324 432
325 /* 433 /*
@@ -332,64 +440,30 @@ unsigned long do_mremap(unsigned long addr,
332 if (ret && old_len != new_len) 440 if (ret && old_len != new_len)
333 goto out; 441 goto out;
334 ret = addr; 442 ret = addr;
335 if (!(flags & MREMAP_FIXED) || (new_addr == addr)) 443 goto out;
336 goto out;
337 old_len = new_len;
338 } 444 }
339 445
340 /* 446 /*
341 * Ok, we need to grow.. or relocate. 447 * Ok, we need to grow..
342 */ 448 */
343 ret = -EFAULT; 449 vma = vma_to_resize(addr, old_len, new_len, &charged);
344 vma = find_vma(mm, addr); 450 if (IS_ERR(vma)) {
345 if (!vma || vma->vm_start > addr) 451 ret = PTR_ERR(vma);
346 goto out;
347 if (is_vm_hugetlb_page(vma)) {
348 ret = -EINVAL;
349 goto out;
350 }
351 /* We can't remap across vm area boundaries */
352 if (old_len > vma->vm_end - addr)
353 goto out;
354 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
355 if (new_len > old_len)
356 goto out;
357 }
358 if (vma->vm_flags & VM_LOCKED) {
359 unsigned long locked, lock_limit;
360 locked = mm->locked_vm << PAGE_SHIFT;
361 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
362 locked += new_len - old_len;
363 ret = -EAGAIN;
364 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
365 goto out;
366 }
367 if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
368 ret = -ENOMEM;
369 goto out; 452 goto out;
370 } 453 }
371 454
372 if (vma->vm_flags & VM_ACCOUNT) {
373 charged = (new_len - old_len) >> PAGE_SHIFT;
374 if (security_vm_enough_memory(charged))
375 goto out_nc;
376 }
377
378 /* old_len exactly to the end of the area.. 455 /* old_len exactly to the end of the area..
379 * And we're not relocating the area.
380 */ 456 */
381 if (old_len == vma->vm_end - addr && 457 if (old_len == vma->vm_end - addr) {
382 !((flags & MREMAP_FIXED) && (addr != new_addr)) &&
383 (old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
384 unsigned long max_addr = TASK_SIZE;
385 if (vma->vm_next)
386 max_addr = vma->vm_next->vm_start;
387 /* can we just expand the current mapping? */ 458 /* can we just expand the current mapping? */
388 if (max_addr - addr >= new_len) { 459 if (vma_expandable(vma, new_len - old_len)) {
389 int pages = (new_len - old_len) >> PAGE_SHIFT; 460 int pages = (new_len - old_len) >> PAGE_SHIFT;
390 461
391 vma_adjust(vma, vma->vm_start, 462 if (vma_adjust(vma, vma->vm_start, addr + new_len,
392 addr + new_len, vma->vm_pgoff, NULL); 463 vma->vm_pgoff, NULL)) {
464 ret = -ENOMEM;
465 goto out;
466 }
393 467
394 mm->total_vm += pages; 468 mm->total_vm += pages;
395 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 469 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
@@ -409,28 +483,27 @@ unsigned long do_mremap(unsigned long addr,
409 */ 483 */
410 ret = -ENOMEM; 484 ret = -ENOMEM;
411 if (flags & MREMAP_MAYMOVE) { 485 if (flags & MREMAP_MAYMOVE) {
412 if (!(flags & MREMAP_FIXED)) { 486 unsigned long map_flags = 0;
413 unsigned long map_flags = 0; 487 if (vma->vm_flags & VM_MAYSHARE)
414 if (vma->vm_flags & VM_MAYSHARE) 488 map_flags |= MAP_SHARED;
415 map_flags |= MAP_SHARED; 489
416 490 new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
417 new_addr = get_unmapped_area(vma->vm_file, 0, new_len, 491 vma->vm_pgoff +
418 vma->vm_pgoff, map_flags); 492 ((addr - vma->vm_start) >> PAGE_SHIFT),
419 if (new_addr & ~PAGE_MASK) { 493 map_flags);
420 ret = new_addr; 494 if (new_addr & ~PAGE_MASK) {
421 goto out; 495 ret = new_addr;
422 } 496 goto out;
423
424 ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
425 if (ret)
426 goto out;
427 } 497 }
498
499 ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
500 if (ret)
501 goto out;
428 ret = move_vma(vma, addr, old_len, new_len, new_addr); 502 ret = move_vma(vma, addr, old_len, new_len, new_addr);
429 } 503 }
430out: 504out:
431 if (ret & ~PAGE_MASK) 505 if (ret & ~PAGE_MASK)
432 vm_unacct_memory(charged); 506 vm_unacct_memory(charged);
433out_nc:
434 return ret; 507 return ret;
435} 508}
436 509
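
The new vma_to_resize() helper above reports failures through the kernel's pointer-encoded errno convention (ERR_PTR()/IS_ERR()/PTR_ERR()), while mremap_to() and do_mremap() keep the older unsigned long convention where a page-aligned address means success and any value with bits set below PAGE_MASK is a negative errno. The sketch below re-implements both conventions in simplified userspace form purely for illustration; the constants and helpers are assumptions, not the definitions from <linux/err.h>.

/*
 * Two simplified error conventions, mirroring the mremap.c hunks above:
 *  1. ERR_PTR()/IS_ERR()/PTR_ERR(): a small negative errno stored in a
 *     pointer, since the top page of the address space is never valid.
 *  2. "address or error" in an unsigned long: page-aligned on success,
 *     so (ret & ~PAGE_MASK) != 0 means failure.
 */
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095
#define PAGE_SHIFT	12
#define PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))

static inline void *ERR_PTR(long error)       { return (void *)error; }
static inline long  PTR_ERR(const void *ptr)  { return (long)ptr; }
static inline int   IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* stand-in for vma_to_resize(): returns an object or an encoded errno */
static void *lookup(int fail)
{
	static int object;

	return fail ? ERR_PTR(-ENOMEM) : (void *)&object;
}

/* stand-in for get_unmapped_area(): returns an address or -errno */
static unsigned long pick_area(int fail)
{
	return fail ? (unsigned long)-EINVAL : 0x70000000UL;
}

int main(void)
{
	void *p = lookup(1);
	unsigned long addr = pick_area(1);

	if (IS_ERR(p))
		printf("lookup failed: %ld\n", PTR_ERR(p));
	if (addr & ~PAGE_MASK)
		printf("pick_area failed: %ld\n", (long)addr);
	return 0;
}
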
diff --git a/mm/nommu.c b/mm/nommu.c
index 9876fa0c3ad3..63fa17d121f0 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -162,7 +162,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
162 } 162 }
163 if (vmas) 163 if (vmas)
164 vmas[i] = vma; 164 vmas[i] = vma;
165 start += PAGE_SIZE; 165 start = (start + PAGE_SIZE) & PAGE_MASK;
166 } 166 }
167 167
168 return i; 168 return i;
@@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
432 /* 432 /*
433 * Ok, looks good - let it rip. 433 * Ok, looks good - let it rip.
434 */ 434 */
435 flush_icache_range(mm->brk, brk);
435 return mm->brk = brk; 436 return mm->brk = brk;
436} 437}
437 438
@@ -551,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to)
551static void __put_nommu_region(struct vm_region *region) 552static void __put_nommu_region(struct vm_region *region)
552 __releases(nommu_region_sem) 553 __releases(nommu_region_sem)
553{ 554{
554 kenter("%p{%d}", region, atomic_read(&region->vm_usage)); 555 kenter("%p{%d}", region, region->vm_usage);
555 556
556 BUG_ON(!nommu_region_tree.rb_node); 557 BUG_ON(!nommu_region_tree.rb_node);
557 558
558 if (atomic_dec_and_test(&region->vm_usage)) { 559 if (--region->vm_usage == 0) {
559 if (region->vm_top > region->vm_start) 560 if (region->vm_top > region->vm_start)
560 delete_nommu_region(region); 561 delete_nommu_region(region);
561 up_write(&nommu_region_sem); 562 up_write(&nommu_region_sem);
@@ -1039,10 +1040,9 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
1039 if (ret != -ENOSYS) 1040 if (ret != -ENOSYS)
1040 return ret; 1041 return ret;
1041 1042
1042 /* getting an ENOSYS error indicates that direct mmap isn't 1043 /* getting -ENOSYS indicates that direct mmap isn't possible (as
1043 * possible (as opposed to tried but failed) so we'll fall 1044 * opposed to tried but failed) so we can only give a suitable error as
1044 * through to making a private copy of the data and mapping 1045 * it's not possible to make a private copy if MAP_SHARED was given */
1045 * that if we can */
1046 return -ENODEV; 1046 return -ENODEV;
1047} 1047}
1048 1048
@@ -1143,9 +1143,6 @@ static int do_mmap_private(struct vm_area_struct *vma,
1143 if (ret < rlen) 1143 if (ret < rlen)
1144 memset(base + ret, 0, rlen - ret); 1144 memset(base + ret, 0, rlen - ret);
1145 1145
1146 } else {
1147 /* if it's an anonymous mapping, then just clear it */
1148 memset(base, 0, rlen);
1149 } 1146 }
1150 1147
1151 return 0; 1148 return 0;
@@ -1207,11 +1204,11 @@ unsigned long do_mmap_pgoff(struct file *file,
1207 if (!vma) 1204 if (!vma)
1208 goto error_getting_vma; 1205 goto error_getting_vma;
1209 1206
1210 atomic_set(&region->vm_usage, 1); 1207 region->vm_usage = 1;
1211 region->vm_flags = vm_flags; 1208 region->vm_flags = vm_flags;
1212 region->vm_pgoff = pgoff; 1209 region->vm_pgoff = pgoff;
1213 1210
1214 INIT_LIST_HEAD(&vma->anon_vma_node); 1211 INIT_LIST_HEAD(&vma->anon_vma_chain);
1215 vma->vm_flags = vm_flags; 1212 vma->vm_flags = vm_flags;
1216 vma->vm_pgoff = pgoff; 1213 vma->vm_pgoff = pgoff;
1217 1214
@@ -1274,7 +1271,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1274 } 1271 }
1275 1272
1276 /* we've found a region we can share */ 1273 /* we've found a region we can share */
1277 atomic_inc(&pregion->vm_usage); 1274 pregion->vm_usage++;
1278 vma->vm_region = pregion; 1275 vma->vm_region = pregion;
1279 start = pregion->vm_start; 1276 start = pregion->vm_start;
1280 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; 1277 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
@@ -1291,7 +1288,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1291 vma->vm_region = NULL; 1288 vma->vm_region = NULL;
1292 vma->vm_start = 0; 1289 vma->vm_start = 0;
1293 vma->vm_end = 0; 1290 vma->vm_end = 0;
1294 atomic_dec(&pregion->vm_usage); 1291 pregion->vm_usage--;
1295 pregion = NULL; 1292 pregion = NULL;
1296 goto error_just_free; 1293 goto error_just_free;
1297 } 1294 }
@@ -1343,6 +1340,11 @@ unsigned long do_mmap_pgoff(struct file *file,
1343 goto error_just_free; 1340 goto error_just_free;
1344 add_nommu_region(region); 1341 add_nommu_region(region);
1345 1342
1343 /* clear anonymous mappings that don't ask for uninitialized data */
1344 if (!vma->vm_file && !(flags & MAP_UNINITIALIZED))
1345 memset((void *)region->vm_start, 0,
1346 region->vm_end - region->vm_start);
1347
1346 /* okay... we have a mapping; now we have to register it */ 1348 /* okay... we have a mapping; now we have to register it */
1347 result = vma->vm_start; 1349 result = vma->vm_start;
1348 1350
@@ -1351,10 +1353,14 @@ unsigned long do_mmap_pgoff(struct file *file,
1351share: 1353share:
1352 add_vma_to_mm(current->mm, vma); 1354 add_vma_to_mm(current->mm, vma);
1353 1355
1354 up_write(&nommu_region_sem); 1356 /* we flush the region from the icache only when the first executable
1357 * mapping of it is made */
1358 if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
1359 flush_icache_range(region->vm_start, region->vm_end);
1360 region->vm_icache_flushed = true;
1361 }
1355 1362
1356 if (prot & PROT_EXEC) 1363 up_write(&nommu_region_sem);
1357 flush_icache_range(result, result + len);
1358 1364
1359 kleave(" = %lx", result); 1365 kleave(" = %lx", result);
1360 return result; 1366 return result;
@@ -1396,6 +1402,55 @@ error_getting_region:
1396} 1402}
1397EXPORT_SYMBOL(do_mmap_pgoff); 1403EXPORT_SYMBOL(do_mmap_pgoff);
1398 1404
1405SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1406 unsigned long, prot, unsigned long, flags,
1407 unsigned long, fd, unsigned long, pgoff)
1408{
1409 struct file *file = NULL;
1410 unsigned long retval = -EBADF;
1411
1412 if (!(flags & MAP_ANONYMOUS)) {
1413 file = fget(fd);
1414 if (!file)
1415 goto out;
1416 }
1417
1418 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1419
1420 down_write(&current->mm->mmap_sem);
1421 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1422 up_write(&current->mm->mmap_sem);
1423
1424 if (file)
1425 fput(file);
1426out:
1427 return retval;
1428}
1429
1430#ifdef __ARCH_WANT_SYS_OLD_MMAP
1431struct mmap_arg_struct {
1432 unsigned long addr;
1433 unsigned long len;
1434 unsigned long prot;
1435 unsigned long flags;
1436 unsigned long fd;
1437 unsigned long offset;
1438};
1439
1440SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1441{
1442 struct mmap_arg_struct a;
1443
1444 if (copy_from_user(&a, arg, sizeof(a)))
1445 return -EFAULT;
1446 if (a.offset & ~PAGE_MASK)
1447 return -EINVAL;
1448
1449 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1450 a.offset >> PAGE_SHIFT);
1451}
1452#endif /* __ARCH_WANT_SYS_OLD_MMAP */
1453
1399/* 1454/*
1400 * split a vma into two pieces at address 'addr', a new vma is allocated either 1455 * split a vma into two pieces at address 'addr', a new vma is allocated either
1401 * for the first part or the tail. 1456 * for the first part or the tail.
@@ -1409,10 +1464,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1409 1464
1410 kenter(""); 1465 kenter("");
1411 1466
1412 /* we're only permitted to split anonymous regions that have a single 1467 /* we're only permitted to split anonymous regions (these should have
1413 * owner */ 1468 * only a single usage on the region) */
1414 if (vma->vm_file || 1469 if (vma->vm_file)
1415 atomic_read(&vma->vm_region->vm_usage) != 1)
1416 return -ENOMEM; 1470 return -ENOMEM;
1417 1471
1418 if (mm->map_count >= sysctl_max_map_count) 1472 if (mm->map_count >= sysctl_max_map_count)
@@ -1486,7 +1540,7 @@ static int shrink_vma(struct mm_struct *mm,
1486 1540
1487 /* cut the backing region down to size */ 1541 /* cut the backing region down to size */
1488 region = vma->vm_region; 1542 region = vma->vm_region;
1489 BUG_ON(atomic_read(&region->vm_usage) != 1); 1543 BUG_ON(region->vm_usage != 1);
1490 1544
1491 down_write(&nommu_region_sem); 1545 down_write(&nommu_region_sem);
1492 delete_nommu_region(region); 1546 delete_nommu_region(region);
@@ -1730,27 +1784,6 @@ void unmap_mapping_range(struct address_space *mapping,
1730EXPORT_SYMBOL(unmap_mapping_range); 1784EXPORT_SYMBOL(unmap_mapping_range);
1731 1785
1732/* 1786/*
1733 * ask for an unmapped area at which to create a mapping on a file
1734 */
1735unsigned long get_unmapped_area(struct file *file, unsigned long addr,
1736 unsigned long len, unsigned long pgoff,
1737 unsigned long flags)
1738{
1739 unsigned long (*get_area)(struct file *, unsigned long, unsigned long,
1740 unsigned long, unsigned long);
1741
1742 get_area = current->mm->get_unmapped_area;
1743 if (file && file->f_op && file->f_op->get_unmapped_area)
1744 get_area = file->f_op->get_unmapped_area;
1745
1746 if (!get_area)
1747 return -ENOSYS;
1748
1749 return get_area(file, addr, len, pgoff, flags);
1750}
1751EXPORT_SYMBOL(get_unmapped_area);
1752
1753/*
1754 * Check that a process has enough memory to allocate a new virtual 1787 * Check that a process has enough memory to allocate a new virtual
1755 * mapping. 0 means there is enough memory for the allocation to 1788 * mapping. 0 means there is enough memory for the allocation to
1756 * succeed and -ENOMEM implies there is not. 1789 * succeed and -ENOMEM implies there is not.
@@ -1889,9 +1922,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
1889 1922
1890 /* only read or write mappings where it is permitted */ 1923 /* only read or write mappings where it is permitted */
1891 if (write && vma->vm_flags & VM_MAYWRITE) 1924 if (write && vma->vm_flags & VM_MAYWRITE)
1892 len -= copy_to_user((void *) addr, buf, len); 1925 copy_to_user_page(vma, NULL, addr,
1926 (void *) addr, buf, len);
1893 else if (!write && vma->vm_flags & VM_MAYREAD) 1927 else if (!write && vma->vm_flags & VM_MAYREAD)
1894 len -= copy_from_user(buf, (void *) addr, len); 1928 copy_from_user_page(vma, NULL, addr,
1929 buf, (void *) addr, len);
1895 else 1930 else
1896 len = 0; 1931 len = 0;
1897 } else { 1932 } else {
@@ -1902,3 +1937,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
1902 mmput(mm); 1937 mmput(mm);
1903 return len; 1938 return len;
1904} 1939}
1940
1941/**
1942 * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
1943 * @inode: The inode to check
1944 * @size: The current filesize of the inode
1945 * @newsize: The proposed filesize of the inode
1946 *
1947 * Check the shared mappings on an inode on behalf of a shrinking truncate to
1948 * make sure that that any outstanding VMAs aren't broken and then shrink the
1949 * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't
1950 * automatically grant mappings that are too large.
1951 */
1952int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
1953 size_t newsize)
1954{
1955 struct vm_area_struct *vma;
1956 struct prio_tree_iter iter;
1957 struct vm_region *region;
1958 pgoff_t low, high;
1959 size_t r_size, r_top;
1960
1961 low = newsize >> PAGE_SHIFT;
1962 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
1963
1964 down_write(&nommu_region_sem);
1965
1966 /* search for VMAs that fall within the dead zone */
1967 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
1968 low, high) {
1969 /* found one - only interested if it's shared out of the page
1970 * cache */
1971 if (vma->vm_flags & VM_SHARED) {
1972 up_write(&nommu_region_sem);
1973 return -ETXTBSY; /* not quite true, but near enough */
1974 }
1975 }
1976
1977 /* reduce any regions that overlap the dead zone - if in existence,
1978 * these will be pointed to by VMAs that don't overlap the dead zone
1979 *
1980 * we don't check for any regions that start beyond the EOF as there
1981 * shouldn't be any
1982 */
1983 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
1984 0, ULONG_MAX) {
1985 if (!(vma->vm_flags & VM_SHARED))
1986 continue;
1987
1988 region = vma->vm_region;
1989 r_size = region->vm_top - region->vm_start;
1990 r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
1991
1992 if (r_top > newsize) {
1993 region->vm_top -= r_top - newsize;
1994 if (region->vm_end > region->vm_top)
1995 region->vm_end = region->vm_top;
1996 }
1997 }
1998
1999 up_write(&nommu_region_sem);
2000 return 0;
2001}
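
Several of the nommu.c hunks above drop atomic_t for vm_region->vm_usage in favour of a plain int, which is safe because every increment and decrement already happens with nommu_region_sem held for writing; the lock, not the atomic op, provides the ordering. The pthreads program below is a userspace analogue of that design choice (a lock-protected plain counter instead of an atomic); the names are illustrative, not kernel code.

/*
 * Userspace analogue of the vm_usage change: a plain int refcount that is
 * only ever touched under a write lock, so no atomics are needed.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct region {
	int usage;			/* protected by region_sem */
};

static pthread_rwlock_t region_sem = PTHREAD_RWLOCK_INITIALIZER;

static void get_region(struct region *r)
{
	pthread_rwlock_wrlock(&region_sem);
	r->usage++;
	pthread_rwlock_unlock(&region_sem);
}

static void put_region(struct region *r)
{
	pthread_rwlock_wrlock(&region_sem);
	if (--r->usage == 0) {
		pthread_rwlock_unlock(&region_sem);
		free(r);
		return;
	}
	pthread_rwlock_unlock(&region_sem);
}

int main(void)
{
	struct region *r = calloc(1, sizeof(*r));

	if (!r)
		return 1;
	r->usage = 1;			/* initial reference */
	get_region(r);
	put_region(r);
	printf("usage now %d\n", r->usage);
	put_region(r);			/* drops the last reference and frees */
	return 0;
}
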
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ea2147dabba6..b68e802a7a7d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -18,6 +18,7 @@
18#include <linux/oom.h> 18#include <linux/oom.h>
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/err.h> 20#include <linux/err.h>
21#include <linux/gfp.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
22#include <linux/swap.h> 23#include <linux/swap.h>
23#include <linux/timex.h> 24#include <linux/timex.h>
@@ -196,27 +197,46 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
196/* 197/*
197 * Determine the type of allocation constraint. 198 * Determine the type of allocation constraint.
198 */ 199 */
199static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
200 gfp_t gfp_mask)
201{
202#ifdef CONFIG_NUMA 200#ifdef CONFIG_NUMA
201static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
202 gfp_t gfp_mask, nodemask_t *nodemask)
203{
203 struct zone *zone; 204 struct zone *zone;
204 struct zoneref *z; 205 struct zoneref *z;
205 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 206 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
206 nodemask_t nodes = node_states[N_HIGH_MEMORY];
207 207
208 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 208 /*
209 if (cpuset_zone_allowed_softwall(zone, gfp_mask)) 209 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
210 node_clear(zone_to_nid(zone), nodes); 210 * to kill current.We have to random task kill in this case.
211 else 211 * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
212 return CONSTRAINT_CPUSET; 212 */
213 if (gfp_mask & __GFP_THISNODE)
214 return CONSTRAINT_NONE;
213 215
214 if (!nodes_empty(nodes)) 216 /*
217 * The nodemask here is a nodemask passed to alloc_pages(). Now,
218 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
219 * feature. mempolicy is an only user of nodemask here.
220 * check mempolicy's nodemask contains all N_HIGH_MEMORY
221 */
222 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask))
215 return CONSTRAINT_MEMORY_POLICY; 223 return CONSTRAINT_MEMORY_POLICY;
216#endif 224
225 /* Check this allocation failure is caused by cpuset's wall function */
226 for_each_zone_zonelist_nodemask(zone, z, zonelist,
227 high_zoneidx, nodemask)
228 if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
229 return CONSTRAINT_CPUSET;
217 230
218 return CONSTRAINT_NONE; 231 return CONSTRAINT_NONE;
219} 232}
233#else
234static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
235 gfp_t gfp_mask, nodemask_t *nodemask)
236{
237 return CONSTRAINT_NONE;
238}
239#endif
220 240
221/* 241/*
222 * Simple selection loop. We chose the process with the highest 242 * Simple selection loop. We chose the process with the highest
@@ -337,6 +357,24 @@ static void dump_tasks(const struct mem_cgroup *mem)
337 } while_each_thread(g, p); 357 } while_each_thread(g, p);
338} 358}
339 359
360static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
361 struct mem_cgroup *mem)
362{
363 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
364 "oom_adj=%d\n",
365 current->comm, gfp_mask, order, current->signal->oom_adj);
366 task_lock(current);
367 cpuset_print_task_mems_allowed(current);
368 task_unlock(current);
369 dump_stack();
370 mem_cgroup_print_oom_info(mem, p);
371 show_mem();
372 if (sysctl_oom_dump_tasks)
373 dump_tasks(mem);
374}
375
376#define K(x) ((x) << (PAGE_SHIFT-10))
377
340/* 378/*
341 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO 379 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
342 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO 380 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
@@ -350,15 +388,23 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
350 return; 388 return;
351 } 389 }
352 390
391 task_lock(p);
353 if (!p->mm) { 392 if (!p->mm) {
354 WARN_ON(1); 393 WARN_ON(1);
355 printk(KERN_WARNING "tried to kill an mm-less task!\n"); 394 printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n",
395 task_pid_nr(p), p->comm);
396 task_unlock(p);
356 return; 397 return;
357 } 398 }
358 399
359 if (verbose) 400 if (verbose)
360 printk(KERN_ERR "Killed process %d (%s)\n", 401 printk(KERN_ERR "Killed process %d (%s) "
361 task_pid_nr(p), p->comm); 402 "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
403 task_pid_nr(p), p->comm,
404 K(p->mm->total_vm),
405 K(get_mm_counter(p->mm, MM_ANONPAGES)),
406 K(get_mm_counter(p->mm, MM_FILEPAGES)));
407 task_unlock(p);
362 408
363 /* 409 /*
364 * We give our sacrificial lamb high priority and access to 410 * We give our sacrificial lamb high priority and access to
@@ -395,20 +441,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
395{ 441{
396 struct task_struct *c; 442 struct task_struct *c;
397 443
398 if (printk_ratelimit()) { 444 if (printk_ratelimit())
399 printk(KERN_WARNING "%s invoked oom-killer: " 445 dump_header(p, gfp_mask, order, mem);
400 "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
401 current->comm, gfp_mask, order,
402 current->signal->oom_adj);
403 task_lock(current);
404 cpuset_print_task_mems_allowed(current);
405 task_unlock(current);
406 dump_stack();
407 mem_cgroup_print_oom_info(mem, current);
408 show_mem();
409 if (sysctl_oom_dump_tasks)
410 dump_tasks(mem);
411 }
412 446
413 /* 447 /*
414 * If the task is already exiting, don't alarm the sysadmin or kill 448 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -426,6 +460,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
426 list_for_each_entry(c, &p->children, sibling) { 460 list_for_each_entry(c, &p->children, sibling) {
427 if (c->mm == p->mm) 461 if (c->mm == p->mm)
428 continue; 462 continue;
463 if (mem && !task_in_mem_cgroup(c, mem))
464 continue;
429 if (!oom_kill_task(c)) 465 if (!oom_kill_task(c))
430 return 0; 466 return 0;
431 } 467 }
@@ -438,6 +474,8 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
438 unsigned long points = 0; 474 unsigned long points = 0;
439 struct task_struct *p; 475 struct task_struct *p;
440 476
477 if (sysctl_panic_on_oom == 2)
478 panic("out of memory(memcg). panic_on_oom is selected.\n");
441 read_lock(&tasklist_lock); 479 read_lock(&tasklist_lock);
442retry: 480retry:
443 p = select_bad_process(&points, mem); 481 p = select_bad_process(&points, mem);
@@ -544,6 +582,7 @@ retry:
544 /* Found nothing?!?! Either we hang forever, or we panic. */ 582 /* Found nothing?!?! Either we hang forever, or we panic. */
545 if (!p) { 583 if (!p) {
546 read_unlock(&tasklist_lock); 584 read_unlock(&tasklist_lock);
585 dump_header(NULL, gfp_mask, order, NULL);
547 panic("Out of memory and no killable processes...\n"); 586 panic("Out of memory and no killable processes...\n");
548 } 587 }
549 588
@@ -565,13 +604,6 @@ void pagefault_out_of_memory(void)
565 /* Got some memory back in the last second. */ 604 /* Got some memory back in the last second. */
566 return; 605 return;
567 606
568 /*
569 * If this is from memcg, oom-killer is already invoked.
570 * and not worth to go system-wide-oom.
571 */
572 if (mem_cgroup_oom_called(current))
573 goto rest_and_return;
574
575 if (sysctl_panic_on_oom) 607 if (sysctl_panic_on_oom)
576 panic("out of memory from page fault. panic_on_oom is selected.\n"); 608 panic("out of memory from page fault. panic_on_oom is selected.\n");
577 609
@@ -583,7 +615,6 @@ void pagefault_out_of_memory(void)
583 * Give "p" a good chance of killing itself before we 615 * Give "p" a good chance of killing itself before we
584 * retry to allocate memory. 616 * retry to allocate memory.
585 */ 617 */
586rest_and_return:
587 if (!test_thread_flag(TIF_MEMDIE)) 618 if (!test_thread_flag(TIF_MEMDIE))
588 schedule_timeout_uninterruptible(1); 619 schedule_timeout_uninterruptible(1);
589} 620}
@@ -599,7 +630,8 @@ rest_and_return:
599 * OR try to be smart about which process to kill. Note that we 630 * OR try to be smart about which process to kill. Note that we
600 * don't have to be perfect here, we just have to be good. 631 * don't have to be perfect here, we just have to be good.
601 */ 632 */
602void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) 633void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
634 int order, nodemask_t *nodemask)
603{ 635{
604 unsigned long freed = 0; 636 unsigned long freed = 0;
605 enum oom_constraint constraint; 637 enum oom_constraint constraint;
@@ -609,14 +641,16 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
609 /* Got some memory back in the last second. */ 641 /* Got some memory back in the last second. */
610 return; 642 return;
611 643
612 if (sysctl_panic_on_oom == 2) 644 if (sysctl_panic_on_oom == 2) {
645 dump_header(NULL, gfp_mask, order, NULL);
613 panic("out of memory. Compulsory panic_on_oom is selected.\n"); 646 panic("out of memory. Compulsory panic_on_oom is selected.\n");
647 }
614 648
615 /* 649 /*
616 * Check if there were limitations on the allocation (only relevant for 650 * Check if there were limitations on the allocation (only relevant for
617 * NUMA) that may require different handling. 651 * NUMA) that may require different handling.
618 */ 652 */
619 constraint = constrained_alloc(zonelist, gfp_mask); 653 constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
620 read_lock(&tasklist_lock); 654 read_lock(&tasklist_lock);
621 655
622 switch (constraint) { 656 switch (constraint) {
@@ -626,8 +660,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
626 break; 660 break;
627 661
628 case CONSTRAINT_NONE: 662 case CONSTRAINT_NONE:
629 if (sysctl_panic_on_oom) 663 if (sysctl_panic_on_oom) {
664 dump_header(NULL, gfp_mask, order, NULL);
630 panic("out of memory. panic_on_oom is selected\n"); 665 panic("out of memory. panic_on_oom is selected\n");
666 }
631 /* Fall-through */ 667 /* Fall-through */
632 case CONSTRAINT_CPUSET: 668 case CONSTRAINT_CPUSET:
633 __out_of_memory(gfp_mask, order); 669 __out_of_memory(gfp_mask, order);
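
The oom_kill.c hunk above also adds #define K(x) ((x) << (PAGE_SHIFT-10)) for the kill messages: a page is (1 << PAGE_SHIFT) bytes, and dividing by 1024 bytes per kB leaves a shift of PAGE_SHIFT - 10. A quick standalone check, assuming the common 4 kB page size:

/* Verify the page-to-kB conversion used by the new kill messages. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define K(x)		((x) << (PAGE_SHIFT - 10))

int main(void)
{
	unsigned long pages = 300;	/* e.g. an anon-rss counter */

	/* 300 pages * 4096 bytes / 1024 = 1200 kB */
	printf("%lu pages = %lukB\n", pages, K(pages));
	return 0;
}
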
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2c5d79236ead..0b19943ecf8b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -821,7 +821,6 @@ int write_cache_pages(struct address_space *mapping,
821 struct writeback_control *wbc, writepage_t writepage, 821 struct writeback_control *wbc, writepage_t writepage,
822 void *data) 822 void *data)
823{ 823{
824 struct backing_dev_info *bdi = mapping->backing_dev_info;
825 int ret = 0; 824 int ret = 0;
826 int done = 0; 825 int done = 0;
827 struct pagevec pvec; 826 struct pagevec pvec;
@@ -834,11 +833,6 @@ int write_cache_pages(struct address_space *mapping,
834 int range_whole = 0; 833 int range_whole = 0;
835 long nr_to_write = wbc->nr_to_write; 834 long nr_to_write = wbc->nr_to_write;
836 835
837 if (wbc->nonblocking && bdi_write_congested(bdi)) {
838 wbc->encountered_congestion = 1;
839 return 0;
840 }
841
842 pagevec_init(&pvec, 0); 836 pagevec_init(&pvec, 0);
843 if (wbc->range_cyclic) { 837 if (wbc->range_cyclic) {
844 writeback_index = mapping->writeback_index; /* prev offset */ 838 writeback_index = mapping->writeback_index; /* prev offset */
@@ -957,12 +951,6 @@ continue_unlock:
957 break; 951 break;
958 } 952 }
959 } 953 }
960
961 if (wbc->nonblocking && bdi_write_congested(bdi)) {
962 wbc->encountered_congestion = 1;
963 done = 1;
964 break;
965 }
966 } 954 }
967 pagevec_release(&pvec); 955 pagevec_release(&pvec);
968 cond_resched(); 956 cond_resched();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2bc2ac63f41e..d03c946d5566 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,7 +48,9 @@
48#include <linux/page_cgroup.h> 48#include <linux/page_cgroup.h>
49#include <linux/debugobjects.h> 49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h> 50#include <linux/kmemleak.h>
51#include <linux/memory.h>
51#include <trace/events/kmem.h> 52#include <trace/events/kmem.h>
53#include <linux/ftrace_event.h>
52 54
53#include <asm/tlbflush.h> 55#include <asm/tlbflush.h>
54#include <asm/div64.h> 56#include <asm/div64.h>
@@ -75,6 +77,31 @@ unsigned long totalreserve_pages __read_mostly;
75int percpu_pagelist_fraction; 77int percpu_pagelist_fraction;
76gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 78gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
77 79
80#ifdef CONFIG_PM_SLEEP
81/*
82 * The following functions are used by the suspend/hibernate code to temporarily
83 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
84 * while devices are suspended. To avoid races with the suspend/hibernate code,
85 * they should always be called with pm_mutex held (gfp_allowed_mask also should
86 * only be modified with pm_mutex held, unless the suspend/hibernate code is
87 * guaranteed not to run in parallel with that modification).
88 */
89void set_gfp_allowed_mask(gfp_t mask)
90{
91 WARN_ON(!mutex_is_locked(&pm_mutex));
92 gfp_allowed_mask = mask;
93}
94
95gfp_t clear_gfp_allowed_mask(gfp_t mask)
96{
97 gfp_t ret = gfp_allowed_mask;
98
99 WARN_ON(!mutex_is_locked(&pm_mutex));
100 gfp_allowed_mask &= ~mask;
101 return ret;
102}
103#endif /* CONFIG_PM_SLEEP */
104
78#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 105#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
79int pageblock_order __read_mostly; 106int pageblock_order __read_mostly;
80#endif 107#endif
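
The set_gfp_allowed_mask()/clear_gfp_allowed_mask() helpers added above temporarily narrow a global mask around suspend/hibernate and restore it afterwards, relying on pm_mutex for exclusion rather than on atomics. Below is a minimal userspace analogue of that save, narrow, restore pattern, with a pthread mutex standing in for pm_mutex; the flag names and values are made up for illustration.

/*
 * Sketch of the save/narrow/restore discipline used for gfp_allowed_mask:
 * the caller saves the old mask, clears some bits for the critical phase,
 * and writes the saved value back afterwards, all under one mutex.
 */
#include <pthread.h>
#include <stdio.h>

#define FLAG_IO		0x1u
#define FLAG_FS		0x2u
#define FLAG_WAIT	0x4u

static pthread_mutex_t pm_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned int allowed_mask = FLAG_IO | FLAG_FS | FLAG_WAIT;

/* callers must hold pm_mutex, mirroring the kernel comment above */
static unsigned int clear_allowed(unsigned int mask)
{
	unsigned int old = allowed_mask;

	allowed_mask &= ~mask;
	return old;
}

static void set_allowed(unsigned int mask)
{
	allowed_mask = mask;
}

int main(void)
{
	unsigned int saved;

	pthread_mutex_lock(&pm_mutex);
	saved = clear_allowed(FLAG_IO | FLAG_FS);	/* entering "suspend" */
	printf("during suspend: mask = %#x\n", allowed_mask);
	set_allowed(saved);				/* leaving "suspend" */
	printf("after resume:   mask = %#x\n", allowed_mask);
	pthread_mutex_unlock(&pm_mutex);
	return 0;
}
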
@@ -262,10 +289,7 @@ static void bad_page(struct page *page)
262 289
263 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", 290 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
264 current->comm, page_to_pfn(page)); 291 current->comm, page_to_pfn(page));
265 printk(KERN_ALERT 292 dump_page(page);
266 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
267 page, (void *)page->flags, page_count(page),
268 page_mapcount(page), page->mapping, page->index);
269 293
270 dump_stack(); 294 dump_stack();
271out: 295out:
@@ -486,7 +510,6 @@ static inline void __free_one_page(struct page *page,
486 zone->free_area[order].nr_free++; 510 zone->free_area[order].nr_free++;
487} 511}
488 512
489#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
490/* 513/*
491 * free_page_mlock() -- clean up attempts to free and mlocked() page. 514 * free_page_mlock() -- clean up attempts to free and mlocked() page.
492 * Page should not be on lru, so no need to fix that up. 515 * Page should not be on lru, so no need to fix that up.
@@ -497,9 +520,6 @@ static inline void free_page_mlock(struct page *page)
497 __dec_zone_page_state(page, NR_MLOCK); 520 __dec_zone_page_state(page, NR_MLOCK);
498 __count_vm_event(UNEVICTABLE_MLOCKFREED); 521 __count_vm_event(UNEVICTABLE_MLOCKFREED);
499} 522}
500#else
501static void free_page_mlock(struct page *page) { }
502#endif
503 523
504static inline int free_pages_check(struct page *page) 524static inline int free_pages_check(struct page *page)
505{ 525{
@@ -533,7 +553,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
533 int batch_free = 0; 553 int batch_free = 0;
534 554
535 spin_lock(&zone->lock); 555 spin_lock(&zone->lock);
536 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 556 zone->all_unreclaimable = 0;
537 zone->pages_scanned = 0; 557 zone->pages_scanned = 0;
538 558
539 __mod_zone_page_state(zone, NR_FREE_PAGES, count); 559 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
@@ -559,8 +579,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
559 page = list_entry(list->prev, struct page, lru); 579 page = list_entry(list->prev, struct page, lru);
560 /* must delete as __free_one_page list manipulates */ 580 /* must delete as __free_one_page list manipulates */
561 list_del(&page->lru); 581 list_del(&page->lru);
562 __free_one_page(page, zone, 0, migratetype); 582 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
563 trace_mm_page_pcpu_drain(page, 0, migratetype); 583 __free_one_page(page, zone, 0, page_private(page));
584 trace_mm_page_pcpu_drain(page, 0, page_private(page));
564 } while (--count && --batch_free && !list_empty(list)); 585 } while (--count && --batch_free && !list_empty(list));
565 } 586 }
566 spin_unlock(&zone->lock); 587 spin_unlock(&zone->lock);
@@ -570,7 +591,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
570 int migratetype) 591 int migratetype)
571{ 592{
572 spin_lock(&zone->lock); 593 spin_lock(&zone->lock);
573 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 594 zone->all_unreclaimable = 0;
574 zone->pages_scanned = 0; 595 zone->pages_scanned = 0;
575 596
576 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 597 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
@@ -585,6 +606,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
585 int bad = 0; 606 int bad = 0;
586 int wasMlocked = __TestClearPageMlocked(page); 607 int wasMlocked = __TestClearPageMlocked(page);
587 608
609 trace_mm_page_free_direct(page, order);
588 kmemcheck_free_shadow(page, order); 610 kmemcheck_free_shadow(page, order);
589 611
590 for (i = 0 ; i < (1 << order) ; ++i) 612 for (i = 0 ; i < (1 << order) ; ++i)
@@ -1011,10 +1033,10 @@ static void drain_pages(unsigned int cpu)
1011 struct per_cpu_pageset *pset; 1033 struct per_cpu_pageset *pset;
1012 struct per_cpu_pages *pcp; 1034 struct per_cpu_pages *pcp;
1013 1035
1014 pset = zone_pcp(zone, cpu); 1036 local_irq_save(flags);
1037 pset = per_cpu_ptr(zone->pageset, cpu);
1015 1038
1016 pcp = &pset->pcp; 1039 pcp = &pset->pcp;
1017 local_irq_save(flags);
1018 free_pcppages_bulk(zone, pcp->count, pcp); 1040 free_pcppages_bulk(zone, pcp->count, pcp);
1019 pcp->count = 0; 1041 pcp->count = 0;
1020 local_irq_restore(flags); 1042 local_irq_restore(flags);
@@ -1075,8 +1097,9 @@ void mark_free_pages(struct zone *zone)
1075 1097
1076/* 1098/*
1077 * Free a 0-order page 1099 * Free a 0-order page
1100 * cold == 1 ? free a cold page : free a hot page
1078 */ 1101 */
1079static void free_hot_cold_page(struct page *page, int cold) 1102void free_hot_cold_page(struct page *page, int cold)
1080{ 1103{
1081 struct zone *zone = page_zone(page); 1104 struct zone *zone = page_zone(page);
1082 struct per_cpu_pages *pcp; 1105 struct per_cpu_pages *pcp;
@@ -1084,6 +1107,7 @@ static void free_hot_cold_page(struct page *page, int cold)
1084 int migratetype; 1107 int migratetype;
1085 int wasMlocked = __TestClearPageMlocked(page); 1108 int wasMlocked = __TestClearPageMlocked(page);
1086 1109
1110 trace_mm_page_free_direct(page, 0);
1087 kmemcheck_free_shadow(page, 0); 1111 kmemcheck_free_shadow(page, 0);
1088 1112
1089 if (PageAnon(page)) 1113 if (PageAnon(page))
@@ -1098,7 +1122,6 @@ static void free_hot_cold_page(struct page *page, int cold)
1098 arch_free_page(page, 0); 1122 arch_free_page(page, 0);
1099 kernel_map_pages(page, 1, 0); 1123 kernel_map_pages(page, 1, 0);
1100 1124
1101 pcp = &zone_pcp(zone, get_cpu())->pcp;
1102 migratetype = get_pageblock_migratetype(page); 1125 migratetype = get_pageblock_migratetype(page);
1103 set_page_private(page, migratetype); 1126 set_page_private(page, migratetype);
1104 local_irq_save(flags); 1127 local_irq_save(flags);
@@ -1121,6 +1144,7 @@ static void free_hot_cold_page(struct page *page, int cold)
1121 migratetype = MIGRATE_MOVABLE; 1144 migratetype = MIGRATE_MOVABLE;
1122 } 1145 }
1123 1146
1147 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1124 if (cold) 1148 if (cold)
1125 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1149 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1126 else 1150 else
@@ -1133,15 +1157,8 @@ static void free_hot_cold_page(struct page *page, int cold)
1133 1157
1134out: 1158out:
1135 local_irq_restore(flags); 1159 local_irq_restore(flags);
1136 put_cpu();
1137} 1160}
1138 1161
1139void free_hot_page(struct page *page)
1140{
1141 trace_mm_page_free_direct(page, 0);
1142 free_hot_cold_page(page, 0);
1143}
1144
1145/* 1162/*
1146 * split_page takes a non-compound higher-order page, and splits it into 1163 * split_page takes a non-compound higher-order page, and splits it into
1147 * n (1<<order) sub-pages: page[0..n] 1164 * n (1<<order) sub-pages: page[0..n]
@@ -1183,17 +1200,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
1183 unsigned long flags; 1200 unsigned long flags;
1184 struct page *page; 1201 struct page *page;
1185 int cold = !!(gfp_flags & __GFP_COLD); 1202 int cold = !!(gfp_flags & __GFP_COLD);
1186 int cpu;
1187 1203
1188again: 1204again:
1189 cpu = get_cpu();
1190 if (likely(order == 0)) { 1205 if (likely(order == 0)) {
1191 struct per_cpu_pages *pcp; 1206 struct per_cpu_pages *pcp;
1192 struct list_head *list; 1207 struct list_head *list;
1193 1208
1194 pcp = &zone_pcp(zone, cpu)->pcp;
1195 list = &pcp->lists[migratetype];
1196 local_irq_save(flags); 1209 local_irq_save(flags);
1210 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1211 list = &pcp->lists[migratetype];
1197 if (list_empty(list)) { 1212 if (list_empty(list)) {
1198 pcp->count += rmqueue_bulk(zone, 0, 1213 pcp->count += rmqueue_bulk(zone, 0,
1199 pcp->batch, list, 1214 pcp->batch, list,
@@ -1225,16 +1240,15 @@ again:
1225 } 1240 }
1226 spin_lock_irqsave(&zone->lock, flags); 1241 spin_lock_irqsave(&zone->lock, flags);
1227 page = __rmqueue(zone, order, migratetype); 1242 page = __rmqueue(zone, order, migratetype);
1228 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1229 spin_unlock(&zone->lock); 1243 spin_unlock(&zone->lock);
1230 if (!page) 1244 if (!page)
1231 goto failed; 1245 goto failed;
1246 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1232 } 1247 }
1233 1248
1234 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1249 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1235 zone_statistics(preferred_zone, zone); 1250 zone_statistics(preferred_zone, zone);
1236 local_irq_restore(flags); 1251 local_irq_restore(flags);
1237 put_cpu();
1238 1252
1239 VM_BUG_ON(bad_range(zone, page)); 1253 VM_BUG_ON(bad_range(zone, page));
1240 if (prep_new_page(page, order, gfp_flags)) 1254 if (prep_new_page(page, order, gfp_flags))
@@ -1243,7 +1257,6 @@ again:
1243 1257
1244failed: 1258failed:
1245 local_irq_restore(flags); 1259 local_irq_restore(flags);
1246 put_cpu();
1247 return NULL; 1260 return NULL;
1248} 1261}
1249 1262
@@ -1658,12 +1671,22 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1658 if (page) 1671 if (page)
1659 goto out; 1672 goto out;
1660 1673
1661 /* The OOM killer will not help higher order allocs */ 1674 if (!(gfp_mask & __GFP_NOFAIL)) {
1662 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) 1675 /* The OOM killer will not help higher order allocs */
1663 goto out; 1676 if (order > PAGE_ALLOC_COSTLY_ORDER)
1664 1677 goto out;
1678 /*
1679 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
1680 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
1681 * The caller should handle page allocation failure by itself if
1682 * it specifies __GFP_THISNODE.
1683 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
1684 */
1685 if (gfp_mask & __GFP_THISNODE)
1686 goto out;
1687 }
1665 /* Exhausted what can be done so it's blamo time */ 1688 /* Exhausted what can be done so it's blamo time */
1666 out_of_memory(zonelist, gfp_mask, order); 1689 out_of_memory(zonelist, gfp_mask, order, nodemask);
1667 1690
1668out: 1691out:
1669 clear_zonelist_oom(zonelist, gfp_mask); 1692 clear_zonelist_oom(zonelist, gfp_mask);
@@ -2005,9 +2028,8 @@ void __pagevec_free(struct pagevec *pvec)
2005void __free_pages(struct page *page, unsigned int order) 2028void __free_pages(struct page *page, unsigned int order)
2006{ 2029{
2007 if (put_page_testzero(page)) { 2030 if (put_page_testzero(page)) {
2008 trace_mm_page_free_direct(page, order);
2009 if (order == 0) 2031 if (order == 0)
2010 free_hot_page(page); 2032 free_hot_cold_page(page, 0);
2011 else 2033 else
2012 __free_pages_ok(page, order); 2034 __free_pages_ok(page, order);
2013 } 2035 }
@@ -2172,7 +2194,7 @@ void show_free_areas(void)
2172 for_each_online_cpu(cpu) { 2194 for_each_online_cpu(cpu) {
2173 struct per_cpu_pageset *pageset; 2195 struct per_cpu_pageset *pageset;
2174 2196
2175 pageset = zone_pcp(zone, cpu); 2197 pageset = per_cpu_ptr(zone->pageset, cpu);
2176 2198
2177 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", 2199 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
2178 cpu, pageset->pcp.high, 2200 cpu, pageset->pcp.high,
@@ -2263,7 +2285,7 @@ void show_free_areas(void)
2263 K(zone_page_state(zone, NR_BOUNCE)), 2285 K(zone_page_state(zone, NR_BOUNCE)),
2264 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 2286 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2265 zone->pages_scanned, 2287 zone->pages_scanned,
2266 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 2288 (zone->all_unreclaimable ? "yes" : "no")
2267 ); 2289 );
2268 printk("lowmem_reserve[]:"); 2290 printk("lowmem_reserve[]:");
2269 for (i = 0; i < MAX_NR_ZONES; i++) 2291 for (i = 0; i < MAX_NR_ZONES; i++)
@@ -2395,13 +2417,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2395{ 2417{
2396 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 2418 char saved_string[NUMA_ZONELIST_ORDER_LEN];
2397 int ret; 2419 int ret;
2420 static DEFINE_MUTEX(zl_order_mutex);
2398 2421
2422 mutex_lock(&zl_order_mutex);
2399 if (write) 2423 if (write)
2400 strncpy(saved_string, (char*)table->data, 2424 strcpy(saved_string, (char*)table->data);
2401 NUMA_ZONELIST_ORDER_LEN);
2402 ret = proc_dostring(table, write, buffer, length, ppos); 2425 ret = proc_dostring(table, write, buffer, length, ppos);
2403 if (ret) 2426 if (ret)
2404 return ret; 2427 goto out;
2405 if (write) { 2428 if (write) {
2406 int oldval = user_zonelist_order; 2429 int oldval = user_zonelist_order;
2407 if (__parse_numa_zonelist_order((char*)table->data)) { 2430 if (__parse_numa_zonelist_order((char*)table->data)) {
@@ -2414,7 +2437,9 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2414 } else if (oldval != user_zonelist_order) 2437 } else if (oldval != user_zonelist_order)
2415 build_all_zonelists(); 2438 build_all_zonelists();
2416 } 2439 }
2417 return 0; 2440out:
2441 mutex_unlock(&zl_order_mutex);
2442 return ret;
2418} 2443}
2419 2444
2420 2445
@@ -2734,10 +2759,29 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2734 2759
2735#endif /* CONFIG_NUMA */ 2760#endif /* CONFIG_NUMA */
2736 2761
2762/*
2763 * Boot pageset table. One per cpu which is going to be used for all
2764 * zones and all nodes. The parameters will be set in such a way
2765 * that an item put on a list will immediately be handed over to
2766 * the buddy list. This is safe since pageset manipulation is done
2767 * with interrupts disabled.
2768 *
2769 * The boot_pagesets must be kept even after bootup is complete for
2770 * unused processors and/or zones. They do play a role for bootstrapping
2771 * hotplugged processors.
2772 *
2773 * zoneinfo_show() and maybe other functions do
2774 * not check if the processor is online before following the pageset pointer.
2775 * Other parts of the kernel may not check if the zone is available.
2776 */
2777static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
2778static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
2779
2737/* return values int ....just for stop_machine() */ 2780/* return values int ....just for stop_machine() */
2738static int __build_all_zonelists(void *dummy) 2781static int __build_all_zonelists(void *dummy)
2739{ 2782{
2740 int nid; 2783 int nid;
2784 int cpu;
2741 2785
2742#ifdef CONFIG_NUMA 2786#ifdef CONFIG_NUMA
2743 memset(node_load, 0, sizeof(node_load)); 2787 memset(node_load, 0, sizeof(node_load));
@@ -2748,6 +2792,23 @@ static int __build_all_zonelists(void *dummy)
2748 build_zonelists(pgdat); 2792 build_zonelists(pgdat);
2749 build_zonelist_cache(pgdat); 2793 build_zonelist_cache(pgdat);
2750 } 2794 }
2795
2796 /*
2797 * Initialize the boot_pagesets that are going to be used
2798 * for bootstrapping processors. The real pagesets for
2799 * each zone will be allocated later when the per cpu
2800 * allocator is available.
2801 *
2802 * boot_pagesets are used also for bootstrapping offline
2803 * cpus if the system is already booted because the pagesets
2804 * are needed to initialize allocators on a specific cpu too.
2805 * F.e. the percpu allocator needs the page allocator which
2806 * needs the percpu allocator in order to allocate its pagesets
2807 * (a chicken-egg dilemma).
2808 */
2809 for_each_possible_cpu(cpu)
2810 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
2811
2751 return 0; 2812 return 0;
2752} 2813}
2753 2814
@@ -3085,121 +3146,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3085 pcp->batch = PAGE_SHIFT * 8; 3146 pcp->batch = PAGE_SHIFT * 8;
3086} 3147}
3087 3148
3088
3089#ifdef CONFIG_NUMA
3090/*
3091 * Boot pageset table. One per cpu which is going to be used for all
3092 * zones and all nodes. The parameters will be set in such a way
3093 * that an item put on a list will immediately be handed over to
3094 * the buddy list. This is safe since pageset manipulation is done
3095 * with interrupts disabled.
3096 *
3097 * Some NUMA counter updates may also be caught by the boot pagesets.
3098 *
3099 * The boot_pagesets must be kept even after bootup is complete for
3100 * unused processors and/or zones. They do play a role for bootstrapping
3101 * hotplugged processors.
3102 *
3103 * zoneinfo_show() and maybe other functions do
3104 * not check if the processor is online before following the pageset pointer.
3105 * Other parts of the kernel may not check if the zone is available.
3106 */
3107static struct per_cpu_pageset boot_pageset[NR_CPUS];
3108
3109/* 3149/*
3110 * Dynamically allocate memory for the 3150 * Allocate per cpu pagesets and initialize them.
3111 * per cpu pageset array in struct zone. 3151 * Before this call only boot pagesets were available.
 3152 * Boot pagesets will no longer be used by this processor
3153 * after setup_per_cpu_pageset().
3112 */ 3154 */
3113static int __cpuinit process_zones(int cpu) 3155void __init setup_per_cpu_pageset(void)
3114{ 3156{
3115 struct zone *zone, *dzone; 3157 struct zone *zone;
3116 int node = cpu_to_node(cpu); 3158 int cpu;
3117
3118 node_set_state(node, N_CPU); /* this node has a cpu */
3119 3159
3120 for_each_populated_zone(zone) { 3160 for_each_populated_zone(zone) {
3121 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 3161 zone->pageset = alloc_percpu(struct per_cpu_pageset);
3122 GFP_KERNEL, node);
3123 if (!zone_pcp(zone, cpu))
3124 goto bad;
3125 3162
3126 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); 3163 for_each_possible_cpu(cpu) {
3164 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3127 3165
3128 if (percpu_pagelist_fraction) 3166 setup_pageset(pcp, zone_batchsize(zone));
3129 setup_pagelist_highmark(zone_pcp(zone, cpu),
3130 (zone->present_pages / percpu_pagelist_fraction));
3131 }
3132 3167
3133 return 0; 3168 if (percpu_pagelist_fraction)
3134bad: 3169 setup_pagelist_highmark(pcp,
3135 for_each_zone(dzone) { 3170 (zone->present_pages /
3136 if (!populated_zone(dzone)) 3171 percpu_pagelist_fraction));
3137 continue; 3172 }
3138 if (dzone == zone)
3139 break;
3140 kfree(zone_pcp(dzone, cpu));
3141 zone_pcp(dzone, cpu) = &boot_pageset[cpu];
3142 }
3143 return -ENOMEM;
3144}
3145
3146static inline void free_zone_pagesets(int cpu)
3147{
3148 struct zone *zone;
3149
3150 for_each_zone(zone) {
3151 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
3152
3153 /* Free per_cpu_pageset if it is slab allocated */
3154 if (pset != &boot_pageset[cpu])
3155 kfree(pset);
3156 zone_pcp(zone, cpu) = &boot_pageset[cpu];
3157 }
3158}
3159
3160static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
3161 unsigned long action,
3162 void *hcpu)
3163{
3164 int cpu = (long)hcpu;
3165 int ret = NOTIFY_OK;
3166
3167 switch (action) {
3168 case CPU_UP_PREPARE:
3169 case CPU_UP_PREPARE_FROZEN:
3170 if (process_zones(cpu))
3171 ret = NOTIFY_BAD;
3172 break;
3173 case CPU_UP_CANCELED:
3174 case CPU_UP_CANCELED_FROZEN:
3175 case CPU_DEAD:
3176 case CPU_DEAD_FROZEN:
3177 free_zone_pagesets(cpu);
3178 break;
3179 default:
3180 break;
3181 } 3173 }
3182 return ret;
3183} 3174}
3184 3175
3185static struct notifier_block __cpuinitdata pageset_notifier =
3186 { &pageset_cpuup_callback, NULL, 0 };
3187
3188void __init setup_per_cpu_pageset(void)
3189{
3190 int err;
3191
3192 /* Initialize per_cpu_pageset for cpu 0.
3193 * A cpuup callback will do this for every cpu
3194 * as it comes online
3195 */
3196 err = process_zones(smp_processor_id());
3197 BUG_ON(err);
3198 register_cpu_notifier(&pageset_notifier);
3199}
3200
3201#endif
3202
3203static noinline __init_refok 3176static noinline __init_refok
3204int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 3177int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3205{ 3178{
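setup_per_cpu_pageset() above is a straight application of the dynamic percpu API that this series converts the page allocator to. A minimal, generic sketch of that API under the same conventions (the struct and counter names below are made up for illustration):

struct foo_stats {
	unsigned long hits;
	unsigned long misses;
};

static struct foo_stats __percpu *foo_stats;

static int __init foo_stats_init(void)
{
	/* one zeroed instance per possible CPU, like zone->pageset above */
	foo_stats = alloc_percpu(struct foo_stats);
	if (!foo_stats)
		return -ENOMEM;
	return 0;
}

static void foo_count_hit(void)
{
	/* touch this CPU's copy; callers disable preemption as needed */
	per_cpu_ptr(foo_stats, smp_processor_id())->hits++;
}

static unsigned long foo_total_hits(void)
{
	unsigned long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += per_cpu_ptr(foo_stats, cpu)->hits;
	return sum;
}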
@@ -3249,11 +3222,11 @@ static int __zone_pcp_update(void *data)
3249 int cpu; 3222 int cpu;
3250 unsigned long batch = zone_batchsize(zone), flags; 3223 unsigned long batch = zone_batchsize(zone), flags;
3251 3224
3252 for (cpu = 0; cpu < NR_CPUS; cpu++) { 3225 for_each_possible_cpu(cpu) {
3253 struct per_cpu_pageset *pset; 3226 struct per_cpu_pageset *pset;
3254 struct per_cpu_pages *pcp; 3227 struct per_cpu_pages *pcp;
3255 3228
3256 pset = zone_pcp(zone, cpu); 3229 pset = per_cpu_ptr(zone->pageset, cpu);
3257 pcp = &pset->pcp; 3230 pcp = &pset->pcp;
3258 3231
3259 local_irq_save(flags); 3232 local_irq_save(flags);
@@ -3271,21 +3244,17 @@ void zone_pcp_update(struct zone *zone)
3271 3244
3272static __meminit void zone_pcp_init(struct zone *zone) 3245static __meminit void zone_pcp_init(struct zone *zone)
3273{ 3246{
3274 int cpu; 3247 /*
3275 unsigned long batch = zone_batchsize(zone); 3248 * per cpu subsystem is not up at this point. The following code
3249 * relies on the ability of the linker to provide the
3250 * offset of a (static) per cpu variable into the per cpu area.
3251 */
3252 zone->pageset = &boot_pageset;
3276 3253
3277 for (cpu = 0; cpu < NR_CPUS; cpu++) {
3278#ifdef CONFIG_NUMA
3279 /* Early boot. Slab allocator not functional yet */
3280 zone_pcp(zone, cpu) = &boot_pageset[cpu];
3281 setup_pageset(&boot_pageset[cpu],0);
3282#else
3283 setup_pageset(zone_pcp(zone,cpu), batch);
3284#endif
3285 }
3286 if (zone->present_pages) 3254 if (zone->present_pages)
3287 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 3255 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
3288 zone->name, zone->present_pages, batch); 3256 zone->name, zone->present_pages,
3257 zone_batchsize(zone));
3289} 3258}
3290 3259
3291__meminit int init_currently_empty_zone(struct zone *zone, 3260__meminit int init_currently_empty_zone(struct zone *zone,
@@ -3424,6 +3393,61 @@ void __init free_bootmem_with_active_regions(int nid,
3424 } 3393 }
3425} 3394}
3426 3395
3396int __init add_from_early_node_map(struct range *range, int az,
3397 int nr_range, int nid)
3398{
3399 int i;
3400 u64 start, end;
3401
3402 /* need to go over early_node_map to find out good range for node */
3403 for_each_active_range_index_in_nid(i, nid) {
3404 start = early_node_map[i].start_pfn;
3405 end = early_node_map[i].end_pfn;
3406 nr_range = add_range(range, az, nr_range, start, end);
3407 }
3408 return nr_range;
3409}
3410
3411#ifdef CONFIG_NO_BOOTMEM
3412void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
3413 u64 goal, u64 limit)
3414{
3415 int i;
3416 void *ptr;
3417
3418 /* need to go over early_node_map to find out good range for node */
3419 for_each_active_range_index_in_nid(i, nid) {
3420 u64 addr;
3421 u64 ei_start, ei_last;
3422
3423 ei_last = early_node_map[i].end_pfn;
3424 ei_last <<= PAGE_SHIFT;
3425 ei_start = early_node_map[i].start_pfn;
3426 ei_start <<= PAGE_SHIFT;
3427 addr = find_early_area(ei_start, ei_last,
3428 goal, limit, size, align);
3429
3430 if (addr == -1ULL)
3431 continue;
3432
3433#if 0
3434 printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
3435 nid,
3436 ei_start, ei_last, goal, limit, size,
3437 align, addr);
3438#endif
3439
3440 ptr = phys_to_virt(addr);
3441 memset(ptr, 0, size);
3442 reserve_early_without_check(addr, addr + size, "BOOTMEM");
3443 return ptr;
3444 }
3445
3446 return NULL;
3447}
3448#endif
3449
3450
3427void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) 3451void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3428{ 3452{
3429 int i; 3453 int i;
@@ -3573,7 +3597,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3573 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3597 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3574 * then all holes in the requested range will be accounted for. 3598 * then all holes in the requested range will be accounted for.
3575 */ 3599 */
3576static unsigned long __meminit __absent_pages_in_range(int nid, 3600unsigned long __meminit __absent_pages_in_range(int nid,
3577 unsigned long range_start_pfn, 3601 unsigned long range_start_pfn,
3578 unsigned long range_end_pfn) 3602 unsigned long range_end_pfn)
3579{ 3603{
@@ -3988,7 +4012,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3988 } 4012 }
3989 4013
3990 /* Merge backward if suitable */ 4014 /* Merge backward if suitable */
3991 if (start_pfn < early_node_map[i].end_pfn && 4015 if (start_pfn < early_node_map[i].start_pfn &&
3992 end_pfn >= early_node_map[i].start_pfn) { 4016 end_pfn >= early_node_map[i].start_pfn) {
3993 early_node_map[i].start_pfn = start_pfn; 4017 early_node_map[i].start_pfn = start_pfn;
3994 return; 4018 return;
@@ -4102,7 +4126,7 @@ static int __init cmp_node_active_region(const void *a, const void *b)
4102} 4126}
4103 4127
4104/* sort the node_map by start_pfn */ 4128/* sort the node_map by start_pfn */
4105static void __init sort_node_map(void) 4129void __init sort_node_map(void)
4106{ 4130{
4107 sort(early_node_map, (size_t)nr_nodemap_entries, 4131 sort(early_node_map, (size_t)nr_nodemap_entries,
4108 sizeof(struct node_active_region), 4132 sizeof(struct node_active_region),
@@ -4366,8 +4390,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4366 for (i = 0; i < MAX_NR_ZONES; i++) { 4390 for (i = 0; i < MAX_NR_ZONES; i++) {
4367 if (i == ZONE_MOVABLE) 4391 if (i == ZONE_MOVABLE)
4368 continue; 4392 continue;
4369 printk(" %-8s %0#10lx -> %0#10lx\n", 4393 printk(" %-8s ", zone_names[i]);
4370 zone_names[i], 4394 if (arch_zone_lowest_possible_pfn[i] ==
4395 arch_zone_highest_possible_pfn[i])
4396 printk("empty\n");
4397 else
4398 printk("%0#10lx -> %0#10lx\n",
4371 arch_zone_lowest_possible_pfn[i], 4399 arch_zone_lowest_possible_pfn[i],
4372 arch_zone_highest_possible_pfn[i]); 4400 arch_zone_highest_possible_pfn[i]);
4373 } 4401 }
@@ -4456,7 +4484,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4456} 4484}
4457 4485
4458#ifndef CONFIG_NEED_MULTIPLE_NODES 4486#ifndef CONFIG_NEED_MULTIPLE_NODES
4459struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; 4487struct pglist_data __refdata contig_page_data = {
4488#ifndef CONFIG_NO_BOOTMEM
4489 .bdata = &bootmem_node_data[0]
4490#endif
4491 };
4460EXPORT_SYMBOL(contig_page_data); 4492EXPORT_SYMBOL(contig_page_data);
4461#endif 4493#endif
4462 4494
@@ -4799,10 +4831,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4799 if (!write || (ret == -EINVAL)) 4831 if (!write || (ret == -EINVAL))
4800 return ret; 4832 return ret;
4801 for_each_populated_zone(zone) { 4833 for_each_populated_zone(zone) {
4802 for_each_online_cpu(cpu) { 4834 for_each_possible_cpu(cpu) {
4803 unsigned long high; 4835 unsigned long high;
4804 high = zone->present_pages / percpu_pagelist_fraction; 4836 high = zone->present_pages / percpu_pagelist_fraction;
4805 setup_pagelist_highmark(zone_pcp(zone, cpu), high); 4837 setup_pagelist_highmark(
4838 per_cpu_ptr(zone->pageset, cpu), high);
4806 } 4839 }
4807 } 4840 }
4808 return 0; 4841 return 0;
@@ -5002,23 +5035,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5002int set_migratetype_isolate(struct page *page) 5035int set_migratetype_isolate(struct page *page)
5003{ 5036{
5004 struct zone *zone; 5037 struct zone *zone;
5005 unsigned long flags; 5038 struct page *curr_page;
5039 unsigned long flags, pfn, iter;
5040 unsigned long immobile = 0;
5041 struct memory_isolate_notify arg;
5042 int notifier_ret;
5006 int ret = -EBUSY; 5043 int ret = -EBUSY;
5007 int zone_idx; 5044 int zone_idx;
5008 5045
5009 zone = page_zone(page); 5046 zone = page_zone(page);
5010 zone_idx = zone_idx(zone); 5047 zone_idx = zone_idx(zone);
5048
5011 spin_lock_irqsave(&zone->lock, flags); 5049 spin_lock_irqsave(&zone->lock, flags);
5050 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
5051 zone_idx == ZONE_MOVABLE) {
5052 ret = 0;
5053 goto out;
5054 }
5055
5056 pfn = page_to_pfn(page);
5057 arg.start_pfn = pfn;
5058 arg.nr_pages = pageblock_nr_pages;
5059 arg.pages_found = 0;
5060
5012 /* 5061 /*
5013 * In future, more migrate types will be able to be isolation target. 5062 * It may be possible to isolate a pageblock even if the
5063 * migratetype is not MIGRATE_MOVABLE. The memory isolation
5064 * notifier chain is used by balloon drivers to return the
5065 * number of pages in a range that are held by the balloon
5066 * driver to shrink memory. If all the pages are accounted for
5067 * by balloons, are free, or on the LRU, isolation can continue.
5068 * Later, for example, when memory hotplug notifier runs, these
5069 * pages reported as "can be isolated" should be isolated(freed)
5070 * by the balloon driver through the memory notifier chain.
5014 */ 5071 */
5015 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && 5072 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5016 zone_idx != ZONE_MOVABLE) 5073 notifier_ret = notifier_to_errno(notifier_ret);
5074 if (notifier_ret || !arg.pages_found)
5017 goto out; 5075 goto out;
5018 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5076
5019 move_freepages_block(zone, page, MIGRATE_ISOLATE); 5077 for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
5020 ret = 0; 5078 if (!pfn_valid_within(pfn))
5079 continue;
5080
5081 curr_page = pfn_to_page(iter);
5082 if (!page_count(curr_page) || PageLRU(curr_page))
5083 continue;
5084
5085 immobile++;
5086 }
5087
5088 if (arg.pages_found == immobile)
5089 ret = 0;
5090
5021out: 5091out:
5092 if (!ret) {
5093 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
5094 move_freepages_block(zone, page, MIGRATE_ISOLATE);
5095 }
5096
5022 spin_unlock_irqrestore(&zone->lock, flags); 5097 spin_unlock_irqrestore(&zone->lock, flags);
5023 if (!ret) 5098 if (!ret)
5024 drain_all_pages(); 5099 drain_all_pages();
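A hedged sketch of the balloon-driver side of the notifier chain described in the comment above: count the pages in the queried range that the balloon holds and report them through arg->pages_found. The registration helper name (register_memory_isolate_notifier) is assumed from the driver-core half of this series, and balloon_owns_pfn() is a made-up predicate.

static int balloon_isolate_notify(struct notifier_block *nb,
				  unsigned long action, void *data)
{
	struct memory_isolate_notify *arg = data;
	unsigned long pfn;

	if (action != MEM_ISOLATE_COUNT)
		return NOTIFY_OK;

	for (pfn = arg->start_pfn; pfn < arg->start_pfn + arg->nr_pages; pfn++)
		if (balloon_owns_pfn(pfn))	/* hypothetical helper */
			arg->pages_found++;

	return NOTIFY_OK;
}

static struct notifier_block balloon_isolate_nb = {
	.notifier_call = balloon_isolate_notify,
};

/* in driver init, assuming the registration helper from this series:
 *	register_memory_isolate_notifier(&balloon_isolate_nb);
 */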
@@ -5085,3 +5160,101 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5085 spin_unlock_irqrestore(&zone->lock, flags); 5160 spin_unlock_irqrestore(&zone->lock, flags);
5086} 5161}
5087#endif 5162#endif
5163
5164#ifdef CONFIG_MEMORY_FAILURE
5165bool is_free_buddy_page(struct page *page)
5166{
5167 struct zone *zone = page_zone(page);
5168 unsigned long pfn = page_to_pfn(page);
5169 unsigned long flags;
5170 int order;
5171
5172 spin_lock_irqsave(&zone->lock, flags);
5173 for (order = 0; order < MAX_ORDER; order++) {
5174 struct page *page_head = page - (pfn & ((1 << order) - 1));
5175
5176 if (PageBuddy(page_head) && page_order(page_head) >= order)
5177 break;
5178 }
5179 spin_unlock_irqrestore(&zone->lock, flags);
5180
5181 return order < MAX_ORDER;
5182}
5183#endif
5184
5185static struct trace_print_flags pageflag_names[] = {
5186 {1UL << PG_locked, "locked" },
5187 {1UL << PG_error, "error" },
5188 {1UL << PG_referenced, "referenced" },
5189 {1UL << PG_uptodate, "uptodate" },
5190 {1UL << PG_dirty, "dirty" },
5191 {1UL << PG_lru, "lru" },
5192 {1UL << PG_active, "active" },
5193 {1UL << PG_slab, "slab" },
5194 {1UL << PG_owner_priv_1, "owner_priv_1" },
5195 {1UL << PG_arch_1, "arch_1" },
5196 {1UL << PG_reserved, "reserved" },
5197 {1UL << PG_private, "private" },
5198 {1UL << PG_private_2, "private_2" },
5199 {1UL << PG_writeback, "writeback" },
5200#ifdef CONFIG_PAGEFLAGS_EXTENDED
5201 {1UL << PG_head, "head" },
5202 {1UL << PG_tail, "tail" },
5203#else
5204 {1UL << PG_compound, "compound" },
5205#endif
5206 {1UL << PG_swapcache, "swapcache" },
5207 {1UL << PG_mappedtodisk, "mappedtodisk" },
5208 {1UL << PG_reclaim, "reclaim" },
5209 {1UL << PG_buddy, "buddy" },
5210 {1UL << PG_swapbacked, "swapbacked" },
5211 {1UL << PG_unevictable, "unevictable" },
5212#ifdef CONFIG_MMU
5213 {1UL << PG_mlocked, "mlocked" },
5214#endif
5215#ifdef CONFIG_ARCH_USES_PG_UNCACHED
5216 {1UL << PG_uncached, "uncached" },
5217#endif
5218#ifdef CONFIG_MEMORY_FAILURE
5219 {1UL << PG_hwpoison, "hwpoison" },
5220#endif
5221 {-1UL, NULL },
5222};
5223
5224static void dump_page_flags(unsigned long flags)
5225{
5226 const char *delim = "";
5227 unsigned long mask;
5228 int i;
5229
5230 printk(KERN_ALERT "page flags: %#lx(", flags);
5231
5232 /* remove zone id */
5233 flags &= (1UL << NR_PAGEFLAGS) - 1;
5234
5235 for (i = 0; pageflag_names[i].name && flags; i++) {
5236
5237 mask = pageflag_names[i].mask;
5238 if ((flags & mask) != mask)
5239 continue;
5240
5241 flags &= ~mask;
5242 printk("%s%s", delim, pageflag_names[i].name);
5243 delim = "|";
5244 }
5245
5246 /* check for left over flags */
5247 if (flags)
5248 printk("%s%#lx", delim, flags);
5249
5250 printk(")\n");
5251}
5252
5253void dump_page(struct page *page)
5254{
5255 printk(KERN_ALERT
5256 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
5257 page, page_count(page), page_mapcount(page),
5258 page->mapping, page->index);
5259 dump_page_flags(page->flags);
5260}
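The decode loop in dump_page_flags() peels off each fully matched {mask, name} pair and then prints whatever bits are left over. A standalone userspace restatement of that logic, with a made-up three-entry table (not kernel code, just an illustration of the algorithm):

#include <stdio.h>

struct flag_name { unsigned long mask; const char *name; };

static void decode(unsigned long flags, const struct flag_name *tbl)
{
	const char *delim = "";

	printf("flags: %#lx(", flags);
	for (int i = 0; tbl[i].name && flags; i++) {
		if ((flags & tbl[i].mask) != tbl[i].mask)
			continue;
		flags &= ~tbl[i].mask;		/* consume the matched bits */
		printf("%s%s", delim, tbl[i].name);
		delim = "|";
	}
	if (flags)				/* leftover, unnamed bits */
		printf("%s%#lx", delim, flags);
	printf(")\n");
}

int main(void)
{
	const struct flag_name tbl[] = {
		{ 1UL << 0, "locked" },
		{ 1UL << 4, "dirty"  },
		{ 1UL << 5, "lru"    },
		{ 0, NULL },
	};

	/* prints: flags: 0x221(locked|lru|0x200) */
	decode((1UL << 0) | (1UL << 5) | (1UL << 9), tbl);
	return 0;
}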
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 3d535d594826..6c0081441a32 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -284,6 +284,7 @@ static DEFINE_MUTEX(swap_cgroup_mutex);
284struct swap_cgroup_ctrl { 284struct swap_cgroup_ctrl {
285 struct page **map; 285 struct page **map;
286 unsigned long length; 286 unsigned long length;
287 spinlock_t lock;
287}; 288};
288 289
289struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; 290struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
@@ -335,6 +336,43 @@ not_enough_page:
335} 336}
336 337
337/** 338/**
339 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 340 * @ent: swap entry to be cmpxchged
341 * @old: old id
342 * @new: new id
343 *
344 * Returns old id at success, 0 at failure.
 345 * (There is no mem_cgroup using 0 as its id)
346 */
347unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
348 unsigned short old, unsigned short new)
349{
350 int type = swp_type(ent);
351 unsigned long offset = swp_offset(ent);
352 unsigned long idx = offset / SC_PER_PAGE;
353 unsigned long pos = offset & SC_POS_MASK;
354 struct swap_cgroup_ctrl *ctrl;
355 struct page *mappage;
356 struct swap_cgroup *sc;
357 unsigned long flags;
358 unsigned short retval;
359
360 ctrl = &swap_cgroup_ctrl[type];
361
362 mappage = ctrl->map[idx];
363 sc = page_address(mappage);
364 sc += pos;
365 spin_lock_irqsave(&ctrl->lock, flags);
366 retval = sc->id;
367 if (retval == old)
368 sc->id = new;
369 else
370 retval = 0;
371 spin_unlock_irqrestore(&ctrl->lock, flags);
372 return retval;
373}
374
375/**
338 * swap_cgroup_record - record mem_cgroup for this swp_entry. 376 * swap_cgroup_record - record mem_cgroup for this swp_entry.
339 * @ent: swap entry to be recorded into 377 * @ent: swap entry to be recorded into
340 * @mem: mem_cgroup to be recorded 378 * @mem: mem_cgroup to be recorded
@@ -352,14 +390,17 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
352 struct page *mappage; 390 struct page *mappage;
353 struct swap_cgroup *sc; 391 struct swap_cgroup *sc;
354 unsigned short old; 392 unsigned short old;
393 unsigned long flags;
355 394
356 ctrl = &swap_cgroup_ctrl[type]; 395 ctrl = &swap_cgroup_ctrl[type];
357 396
358 mappage = ctrl->map[idx]; 397 mappage = ctrl->map[idx];
359 sc = page_address(mappage); 398 sc = page_address(mappage);
360 sc += pos; 399 sc += pos;
400 spin_lock_irqsave(&ctrl->lock, flags);
361 old = sc->id; 401 old = sc->id;
362 sc->id = id; 402 sc->id = id;
403 spin_unlock_irqrestore(&ctrl->lock, flags);
363 404
364 return old; 405 return old;
365} 406}
@@ -411,6 +452,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
411 mutex_lock(&swap_cgroup_mutex); 452 mutex_lock(&swap_cgroup_mutex);
412 ctrl->length = length; 453 ctrl->length = length;
413 ctrl->map = array; 454 ctrl->map = array;
455 spin_lock_init(&ctrl->lock);
414 if (swap_cgroup_prepare(type)) { 456 if (swap_cgroup_prepare(type)) {
415 /* memory shortage */ 457 /* memory shortage */
416 ctrl->map = NULL; 458 ctrl->map = NULL;
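A hedged sketch of how a caller can use the new cmpxchg helper to retarget a swap entry's owner id atomically (the function below is illustrative; the real user of this interface lives in mm/memcontrol.c):

/* Returns true if the entry still belonged to old_id and was moved to new_id.
 * swap_cgroup_cmpxchg() returns the old id on success and 0 on failure, and
 * id 0 is never used by a mem_cgroup, per the kerneldoc above. */
static bool move_swap_charge(swp_entry_t ent,
			     unsigned short old_id, unsigned short new_id)
{
	return swap_cgroup_cmpxchg(ent, old_id, new_id) == old_id;
}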
diff --git a/mm/page_io.c b/mm/page_io.c
index c6f3e5071de3..31a3b962230a 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -12,6 +12,7 @@
12 12
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/gfp.h>
15#include <linux/pagemap.h> 16#include <linux/pagemap.h>
16#include <linux/swap.h> 17#include <linux/swap.h>
17#include <linux/bio.h> 18#include <linux/bio.h>
@@ -19,20 +20,15 @@
19#include <linux/writeback.h> 20#include <linux/writeback.h>
20#include <asm/pgtable.h> 21#include <asm/pgtable.h>
21 22
22static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, 23static struct bio *get_swap_bio(gfp_t gfp_flags,
23 struct page *page, bio_end_io_t end_io) 24 struct page *page, bio_end_io_t end_io)
24{ 25{
25 struct bio *bio; 26 struct bio *bio;
26 27
27 bio = bio_alloc(gfp_flags, 1); 28 bio = bio_alloc(gfp_flags, 1);
28 if (bio) { 29 if (bio) {
29 struct swap_info_struct *sis; 30 bio->bi_sector = map_swap_page(page, &bio->bi_bdev);
30 swp_entry_t entry = { .val = index, }; 31 bio->bi_sector <<= PAGE_SHIFT - 9;
31
32 sis = get_swap_info_struct(swp_type(entry));
33 bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
34 (PAGE_SIZE >> 9);
35 bio->bi_bdev = sis->bdev;
36 bio->bi_io_vec[0].bv_page = page; 32 bio->bi_io_vec[0].bv_page = page;
37 bio->bi_io_vec[0].bv_len = PAGE_SIZE; 33 bio->bi_io_vec[0].bv_len = PAGE_SIZE;
38 bio->bi_io_vec[0].bv_offset = 0; 34 bio->bi_io_vec[0].bv_offset = 0;
@@ -102,8 +98,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
102 unlock_page(page); 98 unlock_page(page);
103 goto out; 99 goto out;
104 } 100 }
105 bio = get_swap_bio(GFP_NOIO, page_private(page), page, 101 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
106 end_swap_bio_write);
107 if (bio == NULL) { 102 if (bio == NULL) {
108 set_page_dirty(page); 103 set_page_dirty(page);
109 unlock_page(page); 104 unlock_page(page);
@@ -127,8 +122,7 @@ int swap_readpage(struct page *page)
127 122
128 VM_BUG_ON(!PageLocked(page)); 123 VM_BUG_ON(!PageLocked(page));
129 VM_BUG_ON(PageUptodate(page)); 124 VM_BUG_ON(PageUptodate(page));
130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page, 125 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
131 end_swap_bio_read);
132 if (bio == NULL) { 126 if (bio == NULL) {
133 unlock_page(page); 127 unlock_page(page);
134 ret = -ENOMEM; 128 ret = -ENOMEM;
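The new get_swap_bio() derives the device sector from the page-granular value returned by map_swap_page(): shifting left by PAGE_SHIFT - 9 converts page units into 512-byte sectors, i.e. 8 sectors per 4 KiB page. A trivial restatement of that arithmetic (illustrative helper name):

/* With 4 KiB pages: PAGE_SHIFT - 9 == 12 - 9 == 3, so sector = page_offset * 8. */
static sector_t swap_page_to_sector(sector_t page_offset)
{
	return page_offset << (PAGE_SHIFT - 9);
}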
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index d5878bed7841..8b1a2ce21ee5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -1,6 +1,7 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/highmem.h> 2#include <linux/highmem.h>
3#include <linux/sched.h> 3#include <linux/sched.h>
4#include <linux/hugetlb.h>
4 5
5static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 6static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
6 struct mm_walk *walk) 7 struct mm_walk *walk)
@@ -79,6 +80,37 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
79 return err; 80 return err;
80} 81}
81 82
83#ifdef CONFIG_HUGETLB_PAGE
84static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
85 unsigned long end)
86{
87 unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
88 return boundary < end ? boundary : end;
89}
90
91static int walk_hugetlb_range(struct vm_area_struct *vma,
92 unsigned long addr, unsigned long end,
93 struct mm_walk *walk)
94{
95 struct hstate *h = hstate_vma(vma);
96 unsigned long next;
97 unsigned long hmask = huge_page_mask(h);
98 pte_t *pte;
99 int err = 0;
100
101 do {
102 next = hugetlb_entry_end(h, addr, end);
103 pte = huge_pte_offset(walk->mm, addr & hmask);
104 if (pte && walk->hugetlb_entry)
105 err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
106 if (err)
107 return err;
108 } while (addr = next, addr != end);
109
110 return 0;
111}
112#endif
113
82/** 114/**
83 * walk_page_range - walk a memory map's page tables with a callback 115 * walk_page_range - walk a memory map's page tables with a callback
84 * @mm: memory map to walk 116 * @mm: memory map to walk
@@ -107,6 +139,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
107 pgd_t *pgd; 139 pgd_t *pgd;
108 unsigned long next; 140 unsigned long next;
109 int err = 0; 141 int err = 0;
142 struct vm_area_struct *vma;
110 143
111 if (addr >= end) 144 if (addr >= end)
112 return err; 145 return err;
@@ -117,11 +150,34 @@ int walk_page_range(unsigned long addr, unsigned long end,
117 pgd = pgd_offset(walk->mm, addr); 150 pgd = pgd_offset(walk->mm, addr);
118 do { 151 do {
119 next = pgd_addr_end(addr, end); 152 next = pgd_addr_end(addr, end);
153
154 /*
155 * handle hugetlb vma individually because pagetable walk for
156 * the hugetlb page is dependent on the architecture and
 157 * we can't handle it in the same manner as non-huge pages.
158 */
159 vma = find_vma(walk->mm, addr);
160#ifdef CONFIG_HUGETLB_PAGE
161 if (vma && is_vm_hugetlb_page(vma)) {
162 if (vma->vm_end < next)
163 next = vma->vm_end;
164 /*
165 * Hugepage is very tightly coupled with vma, so
166 * walk through hugetlb entries within a given vma.
167 */
168 err = walk_hugetlb_range(vma, addr, next, walk);
169 if (err)
170 break;
171 pgd = pgd_offset(walk->mm, next);
172 continue;
173 }
174#endif
120 if (pgd_none_or_clear_bad(pgd)) { 175 if (pgd_none_or_clear_bad(pgd)) {
121 if (walk->pte_hole) 176 if (walk->pte_hole)
122 err = walk->pte_hole(addr, next, walk); 177 err = walk->pte_hole(addr, next, walk);
123 if (err) 178 if (err)
124 break; 179 break;
180 pgd++;
125 continue; 181 continue;
126 } 182 }
127 if (walk->pgd_entry) 183 if (walk->pgd_entry)
@@ -131,7 +187,8 @@ int walk_page_range(unsigned long addr, unsigned long end,
131 err = walk_pud_range(pgd, addr, next, walk); 187 err = walk_pud_range(pgd, addr, next, walk);
132 if (err) 188 if (err)
133 break; 189 break;
134 } while (pgd++, addr = next, addr != end); 190 pgd++;
191 } while (addr = next, addr != end);
135 192
136 return err; 193 return err;
137} 194}
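With the hunks above, a page-table walker can register a ->hugetlb_entry callback and have walk_page_range() invoke it once per huge PTE. A hedged sketch of such a caller (the callback and counter names are made up; the real users are the /proc pagemap and smaps walkers):

/* Called once per huge PTE; signature matches the walk->hugetlb_entry()
 * invocation in walk_hugetlb_range() above. */
static int count_huge_pte(pte_t *pte, unsigned long hmask,
			  unsigned long addr, unsigned long end,
			  struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (!pte_none(*pte))
		(*count)++;
	return 0;
}

/* Caller is assumed to hold mm->mmap_sem for read, as walk_page_range()
 * now calls find_vma(). */
static unsigned long count_huge_mappings(struct mm_struct *mm,
					 unsigned long start, unsigned long end)
{
	unsigned long count = 0;
	struct mm_walk walk = {
		.hugetlb_entry	= count_huge_pte,
		.mm		= mm,
		.private	= &count,
	};

	walk_page_range(start, end, &walk);
	return count;
}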
diff --git a/mm/percpu.c b/mm/percpu.c
index 5adfc268b408..6e09741ddc62 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -46,8 +46,6 @@
46 * 46 *
 47 * To use this allocator, arch code should do the following. 47 * To use this allocator, arch code should do the following.
48 * 48 *
49 * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
50 *
51 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate 49 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
52 * regular address to percpu pointer and back if they need to be 50 * regular address to percpu pointer and back if they need to be
53 * different from the default 51 * different from the default
@@ -74,6 +72,7 @@
74#include <asm/cacheflush.h> 72#include <asm/cacheflush.h>
75#include <asm/sections.h> 73#include <asm/sections.h>
76#include <asm/tlbflush.h> 74#include <asm/tlbflush.h>
75#include <asm/io.h>
77 76
78#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 77#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
79#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 78#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
@@ -81,13 +80,15 @@
81/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ 80/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
82#ifndef __addr_to_pcpu_ptr 81#ifndef __addr_to_pcpu_ptr
83#define __addr_to_pcpu_ptr(addr) \ 82#define __addr_to_pcpu_ptr(addr) \
84 (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ 83 (void __percpu *)((unsigned long)(addr) - \
85 + (unsigned long)__per_cpu_start) 84 (unsigned long)pcpu_base_addr + \
85 (unsigned long)__per_cpu_start)
86#endif 86#endif
87#ifndef __pcpu_ptr_to_addr 87#ifndef __pcpu_ptr_to_addr
88#define __pcpu_ptr_to_addr(ptr) \ 88#define __pcpu_ptr_to_addr(ptr) \
89 (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ 89 (void __force *)((unsigned long)(ptr) + \
90 - (unsigned long)__per_cpu_start) 90 (unsigned long)pcpu_base_addr - \
91 (unsigned long)__per_cpu_start)
91#endif 92#endif
92 93
93struct pcpu_chunk { 94struct pcpu_chunk {
@@ -914,11 +915,10 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
914 int rs, re; 915 int rs, re;
915 916
916 /* quick path, check whether it's empty already */ 917 /* quick path, check whether it's empty already */
917 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 918 rs = page_start;
918 if (rs == page_start && re == page_end) 919 pcpu_next_unpop(chunk, &rs, &re, page_end);
919 return; 920 if (rs == page_start && re == page_end)
920 break; 921 return;
921 }
922 922
923 /* immutable chunks can't be depopulated */ 923 /* immutable chunks can't be depopulated */
924 WARN_ON(chunk->immutable); 924 WARN_ON(chunk->immutable);
@@ -969,11 +969,10 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
969 int rs, re, rc; 969 int rs, re, rc;
970 970
971 /* quick path, check whether all pages are already there */ 971 /* quick path, check whether all pages are already there */
972 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { 972 rs = page_start;
973 if (rs == page_start && re == page_end) 973 pcpu_next_pop(chunk, &rs, &re, page_end);
974 goto clear; 974 if (rs == page_start && re == page_end)
975 break; 975 goto clear;
976 }
977 976
978 /* need to allocate and map pages, this chunk can't be immutable */ 977 /* need to allocate and map pages, this chunk can't be immutable */
979 WARN_ON(chunk->immutable); 978 WARN_ON(chunk->immutable);
@@ -1068,7 +1067,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
1068 * RETURNS: 1067 * RETURNS:
1069 * Percpu pointer to the allocated area on success, NULL on failure. 1068 * Percpu pointer to the allocated area on success, NULL on failure.
1070 */ 1069 */
1071static void *pcpu_alloc(size_t size, size_t align, bool reserved) 1070static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
1072{ 1071{
1073 static int warn_limit = 10; 1072 static int warn_limit = 10;
1074 struct pcpu_chunk *chunk; 1073 struct pcpu_chunk *chunk;
@@ -1197,7 +1196,7 @@ fail_unlock_mutex:
1197 * RETURNS: 1196 * RETURNS:
1198 * Percpu pointer to the allocated area on success, NULL on failure. 1197 * Percpu pointer to the allocated area on success, NULL on failure.
1199 */ 1198 */
1200void *__alloc_percpu(size_t size, size_t align) 1199void __percpu *__alloc_percpu(size_t size, size_t align)
1201{ 1200{
1202 return pcpu_alloc(size, align, false); 1201 return pcpu_alloc(size, align, false);
1203} 1202}
@@ -1218,7 +1217,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
1218 * RETURNS: 1217 * RETURNS:
1219 * Percpu pointer to the allocated area on success, NULL on failure. 1218 * Percpu pointer to the allocated area on success, NULL on failure.
1220 */ 1219 */
1221void *__alloc_reserved_percpu(size_t size, size_t align) 1220void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
1222{ 1221{
1223 return pcpu_alloc(size, align, true); 1222 return pcpu_alloc(size, align, true);
1224} 1223}
@@ -1270,9 +1269,9 @@ static void pcpu_reclaim(struct work_struct *work)
1270 * CONTEXT: 1269 * CONTEXT:
1271 * Can be called from atomic context. 1270 * Can be called from atomic context.
1272 */ 1271 */
1273void free_percpu(void *ptr) 1272void free_percpu(void __percpu *ptr)
1274{ 1273{
1275 void *addr = __pcpu_ptr_to_addr(ptr); 1274 void *addr;
1276 struct pcpu_chunk *chunk; 1275 struct pcpu_chunk *chunk;
1277 unsigned long flags; 1276 unsigned long flags;
1278 int off; 1277 int off;
@@ -1280,6 +1279,8 @@ void free_percpu(void *ptr)
1280 if (!ptr) 1279 if (!ptr)
1281 return; 1280 return;
1282 1281
1282 addr = __pcpu_ptr_to_addr(ptr);
1283
1283 spin_lock_irqsave(&pcpu_lock, flags); 1284 spin_lock_irqsave(&pcpu_lock, flags);
1284 1285
1285 chunk = pcpu_chunk_addr_search(addr); 1286 chunk = pcpu_chunk_addr_search(addr);
@@ -1302,6 +1303,53 @@ void free_percpu(void *ptr)
1302} 1303}
1303EXPORT_SYMBOL_GPL(free_percpu); 1304EXPORT_SYMBOL_GPL(free_percpu);
1304 1305
1306/**
1307 * is_kernel_percpu_address - test whether address is from static percpu area
1308 * @addr: address to test
1309 *
1310 * Test whether @addr belongs to in-kernel static percpu area. Module
1311 * static percpu areas are not considered. For those, use
1312 * is_module_percpu_address().
1313 *
1314 * RETURNS:
1315 * %true if @addr is from in-kernel static percpu area, %false otherwise.
1316 */
1317bool is_kernel_percpu_address(unsigned long addr)
1318{
1319 const size_t static_size = __per_cpu_end - __per_cpu_start;
1320 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
1321 unsigned int cpu;
1322
1323 for_each_possible_cpu(cpu) {
1324 void *start = per_cpu_ptr(base, cpu);
1325
1326 if ((void *)addr >= start && (void *)addr < start + static_size)
1327 return true;
1328 }
1329 return false;
1330}
1331
1332/**
1333 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
1334 * @addr: the address to be converted to physical address
1335 *
 1336 * Given @addr, which is a dereferenceable address obtained via one of
1337 * percpu access macros, this function translates it into its physical
1338 * address. The caller is responsible for ensuring @addr stays valid
1339 * until this function finishes.
1340 *
1341 * RETURNS:
1342 * The physical address for @addr.
1343 */
1344phys_addr_t per_cpu_ptr_to_phys(void *addr)
1345{
1346 if ((unsigned long)addr < VMALLOC_START ||
1347 (unsigned long)addr >= VMALLOC_END)
1348 return __pa(addr);
1349 else
1350 return page_to_phys(vmalloc_to_page(addr));
1351}
1352
1305static inline size_t pcpu_calc_fc_sizes(size_t static_size, 1353static inline size_t pcpu_calc_fc_sizes(size_t static_size,
1306 size_t reserved_size, 1354 size_t reserved_size,
1307 ssize_t *dyn_sizep) 1355 ssize_t *dyn_sizep)
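A hedged sketch of how the two helpers added above might be used by a caller (both callers below are hypothetical; only the helper signatures come from this diff):

static bool addr_in_static_percpu(const void *addr)
{
	/* true only for the kernel's built-in static percpu area, not modules */
	return is_kernel_percpu_address((unsigned long)addr);
}

static phys_addr_t percpu_var_phys(unsigned long __percpu *ptr, int cpu)
{
	/* translate one CPU's copy into a physical address, e.g. for a dump tool */
	return per_cpu_ptr_to_phys(per_cpu_ptr(ptr, cpu));
}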
diff --git a/mm/percpu_up.c b/mm/percpu_up.c
new file mode 100644
index 000000000000..c4351c7f57d2
--- /dev/null
+++ b/mm/percpu_up.c
@@ -0,0 +1,30 @@
1/*
2 * mm/percpu_up.c - dummy percpu memory allocator implementation for UP
3 */
4
5#include <linux/module.h>
6#include <linux/percpu.h>
7#include <linux/slab.h>
8
9void __percpu *__alloc_percpu(size_t size, size_t align)
10{
11 /*
12 * Can't easily make larger alignment work with kmalloc. WARN
13 * on it. Larger alignment should only be used for module
14 * percpu sections on SMP for which this path isn't used.
15 */
16 WARN_ON_ONCE(align > SMP_CACHE_BYTES);
17 return kzalloc(size, GFP_KERNEL);
18}
19EXPORT_SYMBOL_GPL(__alloc_percpu);
20
21void free_percpu(void __percpu *p)
22{
23 kfree(p);
24}
25EXPORT_SYMBOL_GPL(free_percpu);
26
27phys_addr_t per_cpu_ptr_to_phys(void *addr)
28{
29 return __pa(addr);
30}
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 6633965bb27b..2876349339a7 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -14,6 +14,7 @@
14 */ 14 */
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16 16
17#include <linux/gfp.h>
17#include <linux/mm.h> 18#include <linux/mm.h>
18#include <linux/mmzone.h> 19#include <linux/mmzone.h>
19#include <linux/module.h> 20#include <linux/module.h>
diff --git a/mm/readahead.c b/mm/readahead.c
index aa1aa2345235..dfa9a1a03a11 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/gfp.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/blkdev.h> 15#include <linux/blkdev.h>
@@ -501,6 +502,12 @@ void page_cache_sync_readahead(struct address_space *mapping,
501 if (!ra->ra_pages) 502 if (!ra->ra_pages)
502 return; 503 return;
503 504
505 /* be dumb */
506 if (filp && (filp->f_mode & FMODE_RANDOM)) {
507 force_page_cache_readahead(mapping, filp, offset, req_size);
508 return;
509 }
510
504 /* do read-ahead */ 511 /* do read-ahead */
505 ondemand_readahead(mapping, ra, filp, false, offset, req_size); 512 ondemand_readahead(mapping, ra, filp, false, offset, req_size);
506} 513}
@@ -547,5 +554,17 @@ page_cache_async_readahead(struct address_space *mapping,
547 554
548 /* do read-ahead */ 555 /* do read-ahead */
549 ondemand_readahead(mapping, ra, filp, true, offset, req_size); 556 ondemand_readahead(mapping, ra, filp, true, offset, req_size);
557
558#ifdef CONFIG_BLOCK
559 /*
560 * Normally the current page is !uptodate and lock_page() will be
561 * immediately called to implicitly unplug the device. However this
 562 * is not always true for RAID configurations, where data arrives
563 * not strictly in their submission order. In this case we need to
564 * explicitly kick off the IO.
565 */
566 if (PageUptodate(page))
567 blk_run_backing_dev(mapping->backing_dev_info, NULL);
568#endif
550} 569}
551EXPORT_SYMBOL_GPL(page_cache_async_readahead); 570EXPORT_SYMBOL_GPL(page_cache_async_readahead);
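The "be dumb" branch added above fires when FMODE_RANDOM is set on the file, which (per the mm/fadvise.c change elsewhere in this series) happens when the application calls posix_fadvise() with POSIX_FADV_RANDOM. A userspace illustration of how a program would opt in to that path (a sketch, not part of the patch):

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd, err;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Mark the descriptor random-access: subsequent reads take the
	 * force_page_cache_readahead() path instead of the adaptive
	 * ondemand_readahead() heuristics. */
	err = posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);
	if (err)
		fprintf(stderr, "posix_fadvise: error %d\n", err);
	/* ... scattered pread() calls would go here ... */
	close(fd);
	return 0;
}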
diff --git a/mm/rmap.c b/mm/rmap.c
index dd43373a483f..0feeef860a8f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,6 +49,7 @@
49#include <linux/swapops.h> 49#include <linux/swapops.h>
50#include <linux/slab.h> 50#include <linux/slab.h>
51#include <linux/init.h> 51#include <linux/init.h>
52#include <linux/ksm.h>
52#include <linux/rmap.h> 53#include <linux/rmap.h>
53#include <linux/rcupdate.h> 54#include <linux/rcupdate.h>
54#include <linux/module.h> 55#include <linux/module.h>
@@ -61,17 +62,28 @@
61#include "internal.h" 62#include "internal.h"
62 63
63static struct kmem_cache *anon_vma_cachep; 64static struct kmem_cache *anon_vma_cachep;
65static struct kmem_cache *anon_vma_chain_cachep;
64 66
65static inline struct anon_vma *anon_vma_alloc(void) 67static inline struct anon_vma *anon_vma_alloc(void)
66{ 68{
67 return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); 69 return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
68} 70}
69 71
70static inline void anon_vma_free(struct anon_vma *anon_vma) 72void anon_vma_free(struct anon_vma *anon_vma)
71{ 73{
72 kmem_cache_free(anon_vma_cachep, anon_vma); 74 kmem_cache_free(anon_vma_cachep, anon_vma);
73} 75}
74 76
77static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
78{
79 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
80}
81
82void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
83{
84 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
85}
86
75/** 87/**
76 * anon_vma_prepare - attach an anon_vma to a memory region 88 * anon_vma_prepare - attach an anon_vma to a memory region
77 * @vma: the memory region in question 89 * @vma: the memory region in question
@@ -102,87 +114,167 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
102int anon_vma_prepare(struct vm_area_struct *vma) 114int anon_vma_prepare(struct vm_area_struct *vma)
103{ 115{
104 struct anon_vma *anon_vma = vma->anon_vma; 116 struct anon_vma *anon_vma = vma->anon_vma;
117 struct anon_vma_chain *avc;
105 118
106 might_sleep(); 119 might_sleep();
107 if (unlikely(!anon_vma)) { 120 if (unlikely(!anon_vma)) {
108 struct mm_struct *mm = vma->vm_mm; 121 struct mm_struct *mm = vma->vm_mm;
109 struct anon_vma *allocated; 122 struct anon_vma *allocated;
110 123
124 avc = anon_vma_chain_alloc();
125 if (!avc)
126 goto out_enomem;
127
111 anon_vma = find_mergeable_anon_vma(vma); 128 anon_vma = find_mergeable_anon_vma(vma);
112 allocated = NULL; 129 allocated = NULL;
113 if (!anon_vma) { 130 if (!anon_vma) {
114 anon_vma = anon_vma_alloc(); 131 anon_vma = anon_vma_alloc();
115 if (unlikely(!anon_vma)) 132 if (unlikely(!anon_vma))
116 return -ENOMEM; 133 goto out_enomem_free_avc;
117 allocated = anon_vma; 134 allocated = anon_vma;
118 } 135 }
119 spin_lock(&anon_vma->lock);
120 136
137 spin_lock(&anon_vma->lock);
121 /* page_table_lock to protect against threads */ 138 /* page_table_lock to protect against threads */
122 spin_lock(&mm->page_table_lock); 139 spin_lock(&mm->page_table_lock);
123 if (likely(!vma->anon_vma)) { 140 if (likely(!vma->anon_vma)) {
124 vma->anon_vma = anon_vma; 141 vma->anon_vma = anon_vma;
125 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 142 avc->anon_vma = anon_vma;
143 avc->vma = vma;
144 list_add(&avc->same_vma, &vma->anon_vma_chain);
145 list_add(&avc->same_anon_vma, &anon_vma->head);
126 allocated = NULL; 146 allocated = NULL;
147 avc = NULL;
127 } 148 }
128 spin_unlock(&mm->page_table_lock); 149 spin_unlock(&mm->page_table_lock);
129
130 spin_unlock(&anon_vma->lock); 150 spin_unlock(&anon_vma->lock);
151
131 if (unlikely(allocated)) 152 if (unlikely(allocated))
132 anon_vma_free(allocated); 153 anon_vma_free(allocated);
154 if (unlikely(avc))
155 anon_vma_chain_free(avc);
133 } 156 }
134 return 0; 157 return 0;
158
159 out_enomem_free_avc:
160 anon_vma_chain_free(avc);
161 out_enomem:
162 return -ENOMEM;
135} 163}
136 164
137void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) 165static void anon_vma_chain_link(struct vm_area_struct *vma,
166 struct anon_vma_chain *avc,
167 struct anon_vma *anon_vma)
138{ 168{
139 BUG_ON(vma->anon_vma != next->anon_vma); 169 avc->vma = vma;
140 list_del(&next->anon_vma_node); 170 avc->anon_vma = anon_vma;
171 list_add(&avc->same_vma, &vma->anon_vma_chain);
172
173 spin_lock(&anon_vma->lock);
174 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
175 spin_unlock(&anon_vma->lock);
141} 176}
142 177
143void __anon_vma_link(struct vm_area_struct *vma) 178/*
179 * Attach the anon_vmas from src to dst.
180 * Returns 0 on success, -ENOMEM on failure.
181 */
182int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
144{ 183{
145 struct anon_vma *anon_vma = vma->anon_vma; 184 struct anon_vma_chain *avc, *pavc;
146 185
147 if (anon_vma) 186 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
148 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 187 avc = anon_vma_chain_alloc();
188 if (!avc)
189 goto enomem_failure;
190 anon_vma_chain_link(dst, avc, pavc->anon_vma);
191 }
192 return 0;
193
194 enomem_failure:
195 unlink_anon_vmas(dst);
196 return -ENOMEM;
149} 197}
150 198
151void anon_vma_link(struct vm_area_struct *vma) 199/*
200 * Attach vma to its own anon_vma, as well as to the anon_vmas that
201 * the corresponding VMA in the parent process is attached to.
202 * Returns 0 on success, non-zero on failure.
203 */
204int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
152{ 205{
153 struct anon_vma *anon_vma = vma->anon_vma; 206 struct anon_vma_chain *avc;
207 struct anon_vma *anon_vma;
154 208
155 if (anon_vma) { 209 /* Don't bother if the parent process has no anon_vma here. */
156 spin_lock(&anon_vma->lock); 210 if (!pvma->anon_vma)
157 list_add_tail(&vma->anon_vma_node, &anon_vma->head); 211 return 0;
158 spin_unlock(&anon_vma->lock); 212
159 } 213 /*
214 * First, attach the new VMA to the parent VMA's anon_vmas,
215 * so rmap can find non-COWed pages in child processes.
216 */
217 if (anon_vma_clone(vma, pvma))
218 return -ENOMEM;
219
220 /* Then add our own anon_vma. */
221 anon_vma = anon_vma_alloc();
222 if (!anon_vma)
223 goto out_error;
224 avc = anon_vma_chain_alloc();
225 if (!avc)
226 goto out_error_free_anon_vma;
227 anon_vma_chain_link(vma, avc, anon_vma);
228 /* Mark this anon_vma as the one where our new (COWed) pages go. */
229 vma->anon_vma = anon_vma;
230
231 return 0;
232
233 out_error_free_anon_vma:
234 anon_vma_free(anon_vma);
235 out_error:
236 unlink_anon_vmas(vma);
237 return -ENOMEM;
160} 238}
161 239
162void anon_vma_unlink(struct vm_area_struct *vma) 240static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
163{ 241{
164 struct anon_vma *anon_vma = vma->anon_vma; 242 struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
165 int empty; 243 int empty;
166 244
245 /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
167 if (!anon_vma) 246 if (!anon_vma)
168 return; 247 return;
169 248
170 spin_lock(&anon_vma->lock); 249 spin_lock(&anon_vma->lock);
171 list_del(&vma->anon_vma_node); 250 list_del(&anon_vma_chain->same_anon_vma);
172 251
173 /* We must garbage collect the anon_vma if it's empty */ 252 /* We must garbage collect the anon_vma if it's empty */
174 empty = list_empty(&anon_vma->head); 253 empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
175 spin_unlock(&anon_vma->lock); 254 spin_unlock(&anon_vma->lock);
176 255
177 if (empty) 256 if (empty)
178 anon_vma_free(anon_vma); 257 anon_vma_free(anon_vma);
179} 258}
180 259
260void unlink_anon_vmas(struct vm_area_struct *vma)
261{
262 struct anon_vma_chain *avc, *next;
263
264 /* Unlink each anon_vma chained to the VMA. */
265 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
266 anon_vma_unlink(avc);
267 list_del(&avc->same_vma);
268 anon_vma_chain_free(avc);
269 }
270}
271
181static void anon_vma_ctor(void *data) 272static void anon_vma_ctor(void *data)
182{ 273{
183 struct anon_vma *anon_vma = data; 274 struct anon_vma *anon_vma = data;
184 275
185 spin_lock_init(&anon_vma->lock); 276 spin_lock_init(&anon_vma->lock);
277 ksm_refcount_init(anon_vma);
186 INIT_LIST_HEAD(&anon_vma->head); 278 INIT_LIST_HEAD(&anon_vma->head);
187} 279}
188 280
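For readers following the list manipulation above: the new linkage object has roughly the shape below. The authoritative declaration lives in include/linux/rmap.h in this series; the field set is taken from the hunks here, and the comments are illustrative.

struct anon_vma_chain {
	struct vm_area_struct *vma;	/* the VMA this link belongs to */
	struct anon_vma *anon_vma;	/* the anon_vma the VMA is attached to */
	struct list_head same_vma;	/* entry on vma->anon_vma_chain */
	struct list_head same_anon_vma;	/* entry on anon_vma->head */
};

rmap walks an anon_vma's head list (the same_anon_vma links) to find every VMA that may map a given anonymous page, while unlink_anon_vmas() walks vma->anon_vma_chain (the same_vma links) to detach a VMA from all of its anon_vmas.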
@@ -190,6 +282,7 @@ void __init anon_vma_init(void)
190{ 282{
191 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 283 anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
192 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); 284 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
285 anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
193} 286}
194 287
195/* 288/*
@@ -202,8 +295,8 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
202 unsigned long anon_mapping; 295 unsigned long anon_mapping;
203 296
204 rcu_read_lock(); 297 rcu_read_lock();
205 anon_mapping = (unsigned long) page->mapping; 298 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
206 if (!(anon_mapping & PAGE_MAPPING_ANON)) 299 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
207 goto out; 300 goto out;
208 if (!page_mapped(page)) 301 if (!page_mapped(page))
209 goto out; 302 goto out;
@@ -243,15 +336,13 @@ vma_address(struct page *page, struct vm_area_struct *vma)
243 336
244/* 337/*
245 * At what user virtual address is page expected in vma? 338 * At what user virtual address is page expected in vma?
246 * checking that the page matches the vma. 339 * Caller should check the page is actually part of the vma.
247 */ 340 */
248unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 341unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
249{ 342{
250 if (PageAnon(page)) { 343 if (PageAnon(page))
251 if ((void *)vma->anon_vma != 344 ;
252 (void *)page->mapping - PAGE_MAPPING_ANON) 345 else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
253 return -EFAULT;
254 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
255 if (!vma->vm_file || 346 if (!vma->vm_file ||
256 vma->vm_file->f_mapping != page->mapping) 347 vma->vm_file->f_mapping != page->mapping)
257 return -EFAULT; 348 return -EFAULT;
@@ -337,21 +428,15 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
337 * Subfunctions of page_referenced: page_referenced_one called 428 * Subfunctions of page_referenced: page_referenced_one called
338 * repeatedly from either page_referenced_anon or page_referenced_file. 429 * repeatedly from either page_referenced_anon or page_referenced_file.
339 */ 430 */
340static int page_referenced_one(struct page *page, 431int page_referenced_one(struct page *page, struct vm_area_struct *vma,
341 struct vm_area_struct *vma, 432 unsigned long address, unsigned int *mapcount,
342 unsigned int *mapcount, 433 unsigned long *vm_flags)
343 unsigned long *vm_flags)
344{ 434{
345 struct mm_struct *mm = vma->vm_mm; 435 struct mm_struct *mm = vma->vm_mm;
346 unsigned long address;
347 pte_t *pte; 436 pte_t *pte;
348 spinlock_t *ptl; 437 spinlock_t *ptl;
349 int referenced = 0; 438 int referenced = 0;
350 439
351 address = vma_address(page, vma);
352 if (address == -EFAULT)
353 goto out;
354
355 pte = page_check_address(page, mm, address, &ptl, 0); 440 pte = page_check_address(page, mm, address, &ptl, 0);
356 if (!pte) 441 if (!pte)
357 goto out; 442 goto out;
@@ -388,9 +473,10 @@ static int page_referenced_one(struct page *page,
388out_unmap: 473out_unmap:
389 (*mapcount)--; 474 (*mapcount)--;
390 pte_unmap_unlock(pte, ptl); 475 pte_unmap_unlock(pte, ptl);
391out: 476
392 if (referenced) 477 if (referenced)
393 *vm_flags |= vma->vm_flags; 478 *vm_flags |= vma->vm_flags;
479out:
394 return referenced; 480 return referenced;
395} 481}
396 482
@@ -400,7 +486,7 @@ static int page_referenced_anon(struct page *page,
400{ 486{
401 unsigned int mapcount; 487 unsigned int mapcount;
402 struct anon_vma *anon_vma; 488 struct anon_vma *anon_vma;
403 struct vm_area_struct *vma; 489 struct anon_vma_chain *avc;
404 int referenced = 0; 490 int referenced = 0;
405 491
406 anon_vma = page_lock_anon_vma(page); 492 anon_vma = page_lock_anon_vma(page);
@@ -408,7 +494,11 @@ static int page_referenced_anon(struct page *page,
408 return referenced; 494 return referenced;
409 495
410 mapcount = page_mapcount(page); 496 mapcount = page_mapcount(page);
411 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 497 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
498 struct vm_area_struct *vma = avc->vma;
499 unsigned long address = vma_address(page, vma);
500 if (address == -EFAULT)
501 continue;
412 /* 502 /*
413 * If we are reclaiming on behalf of a cgroup, skip 503 * If we are reclaiming on behalf of a cgroup, skip
414 * counting on behalf of references from different 504 * counting on behalf of references from different
@@ -416,7 +506,7 @@ static int page_referenced_anon(struct page *page,
416 */ 506 */
417 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 507 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
418 continue; 508 continue;
419 referenced += page_referenced_one(page, vma, 509 referenced += page_referenced_one(page, vma, address,
420 &mapcount, vm_flags); 510 &mapcount, vm_flags);
421 if (!mapcount) 511 if (!mapcount)
422 break; 512 break;
@@ -474,6 +564,9 @@ static int page_referenced_file(struct page *page,
474 mapcount = page_mapcount(page); 564 mapcount = page_mapcount(page);
475 565
476 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 566 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
567 unsigned long address = vma_address(page, vma);
568 if (address == -EFAULT)
569 continue;
477 /* 570 /*
478 * If we are reclaiming on behalf of a cgroup, skip 571 * If we are reclaiming on behalf of a cgroup, skip
479 * counting on behalf of references from different 572 * counting on behalf of references from different
@@ -481,7 +574,7 @@ static int page_referenced_file(struct page *page,
481 */ 574 */
482 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 575 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
483 continue; 576 continue;
484 referenced += page_referenced_one(page, vma, 577 referenced += page_referenced_one(page, vma, address,
485 &mapcount, vm_flags); 578 &mapcount, vm_flags);
486 if (!mapcount) 579 if (!mapcount)
487 break; 580 break;
@@ -507,46 +600,44 @@ int page_referenced(struct page *page,
507 unsigned long *vm_flags) 600 unsigned long *vm_flags)
508{ 601{
509 int referenced = 0; 602 int referenced = 0;
510 603 int we_locked = 0;
511 if (TestClearPageReferenced(page))
512 referenced++;
513 604
514 *vm_flags = 0; 605 *vm_flags = 0;
515 if (page_mapped(page) && page->mapping) { 606 if (page_mapped(page) && page_rmapping(page)) {
516 if (PageAnon(page)) 607 if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
608 we_locked = trylock_page(page);
609 if (!we_locked) {
610 referenced++;
611 goto out;
612 }
613 }
614 if (unlikely(PageKsm(page)))
615 referenced += page_referenced_ksm(page, mem_cont,
616 vm_flags);
617 else if (PageAnon(page))
517 referenced += page_referenced_anon(page, mem_cont, 618 referenced += page_referenced_anon(page, mem_cont,
518 vm_flags); 619 vm_flags);
519 else if (is_locked) 620 else if (page->mapping)
520 referenced += page_referenced_file(page, mem_cont, 621 referenced += page_referenced_file(page, mem_cont,
521 vm_flags); 622 vm_flags);
522 else if (!trylock_page(page)) 623 if (we_locked)
523 referenced++;
524 else {
525 if (page->mapping)
526 referenced += page_referenced_file(page,
527 mem_cont, vm_flags);
528 unlock_page(page); 624 unlock_page(page);
529 }
530 } 625 }
531 626out:
532 if (page_test_and_clear_young(page)) 627 if (page_test_and_clear_young(page))
533 referenced++; 628 referenced++;
534 629
535 return referenced; 630 return referenced;
536} 631}
537 632
538static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) 633static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
634 unsigned long address)
539{ 635{
540 struct mm_struct *mm = vma->vm_mm; 636 struct mm_struct *mm = vma->vm_mm;
541 unsigned long address;
542 pte_t *pte; 637 pte_t *pte;
543 spinlock_t *ptl; 638 spinlock_t *ptl;
544 int ret = 0; 639 int ret = 0;
545 640
546 address = vma_address(page, vma);
547 if (address == -EFAULT)
548 goto out;
549
550 pte = page_check_address(page, mm, address, &ptl, 1); 641 pte = page_check_address(page, mm, address, &ptl, 1);
551 if (!pte) 642 if (!pte)
552 goto out; 643 goto out;
@@ -578,8 +669,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
578 669
579 spin_lock(&mapping->i_mmap_lock); 670 spin_lock(&mapping->i_mmap_lock);
580 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 671 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
581 if (vma->vm_flags & VM_SHARED) 672 if (vma->vm_flags & VM_SHARED) {
582 ret += page_mkclean_one(page, vma); 673 unsigned long address = vma_address(page, vma);
674 if (address == -EFAULT)
675 continue;
676 ret += page_mkclean_one(page, vma, address);
677 }
583 } 678 }
584 spin_unlock(&mapping->i_mmap_lock); 679 spin_unlock(&mapping->i_mmap_lock);
585 return ret; 680 return ret;
@@ -607,27 +702,60 @@ int page_mkclean(struct page *page)
607EXPORT_SYMBOL_GPL(page_mkclean); 702EXPORT_SYMBOL_GPL(page_mkclean);
608 703
609/** 704/**
705 * page_move_anon_rmap - move a page to our anon_vma
706 * @page: the page to move to our anon_vma
707 * @vma: the vma the page belongs to
708 * @address: the user virtual address mapped
709 *
710 * When a page belongs exclusively to one process after a COW event,
711 * that page can be moved into the anon_vma that belongs to just that
712 * process, so the rmap code will not search the parent or sibling
713 * processes.
714 */
715void page_move_anon_rmap(struct page *page,
716 struct vm_area_struct *vma, unsigned long address)
717{
718 struct anon_vma *anon_vma = vma->anon_vma;
719
720 VM_BUG_ON(!PageLocked(page));
721 VM_BUG_ON(!anon_vma);
722 VM_BUG_ON(page->index != linear_page_index(vma, address));
723
724 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
725 page->mapping = (struct address_space *) anon_vma;
726}
727
728/**
610 * __page_set_anon_rmap - setup new anonymous rmap 729 * __page_set_anon_rmap - setup new anonymous rmap
611 * @page: the page to add the mapping to 730 * @page: the page to add the mapping to
612 * @vma: the vm area in which the mapping is added 731 * @vma: the vm area in which the mapping is added
613 * @address: the user virtual address mapped 732 * @address: the user virtual address mapped
733 * @exclusive: the page is exclusively owned by the current process
614 */ 734 */
615static void __page_set_anon_rmap(struct page *page, 735static void __page_set_anon_rmap(struct page *page,
616 struct vm_area_struct *vma, unsigned long address) 736 struct vm_area_struct *vma, unsigned long address, int exclusive)
617{ 737{
618 struct anon_vma *anon_vma = vma->anon_vma; 738 struct anon_vma *anon_vma = vma->anon_vma;
619 739
620 BUG_ON(!anon_vma); 740 BUG_ON(!anon_vma);
621 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
622 page->mapping = (struct address_space *) anon_vma;
623
624 page->index = linear_page_index(vma, address);
625 741
626 /* 742 /*
627 * nr_mapped state can be updated without turning off 743 * If the page isn't exclusively mapped into this vma,
628 * interrupts because it is not modified via interrupt. 744 * we must use the _oldest_ possible anon_vma for the
745 * page mapping!
746 *
747 * So take the last AVC chain entry in the vma, which is
748 * the deepest ancestor, and use the anon_vma from that.
629 */ 749 */
630 __inc_zone_page_state(page, NR_ANON_PAGES); 750 if (!exclusive) {
751 struct anon_vma_chain *avc;
752 avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma);
753 anon_vma = avc->anon_vma;
754 }
755
756 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
757 page->mapping = (struct address_space *) anon_vma;
758 page->index = linear_page_index(vma, address);
631} 759}
632 760
633/** 761/**
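
Both page_move_anon_rmap() and __page_set_anon_rmap() rely on the convention that page->mapping stores an anon_vma pointer with its low bit set. A simplified sketch of that encoding, ignoring the separate KSM bit; the sketch_ helpers are illustrative only:

#include <linux/mm.h>
#include <linux/rmap.h>

#define SKETCH_MAPPING_ANON	1UL	/* low bit of page->mapping */

static inline int sketch_page_is_anon(struct page *page)
{
	return ((unsigned long)page->mapping & SKETCH_MAPPING_ANON) != 0;
}

static inline struct anon_vma *sketch_page_anon_vma(struct page *page)
{
	unsigned long mapping = (unsigned long)page->mapping;

	if (!(mapping & SKETCH_MAPPING_ANON))
		return NULL;	/* file page: mapping is an address_space */
	return (struct anon_vma *)(mapping - SKETCH_MAPPING_ANON);
}
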
@@ -652,9 +780,6 @@ static void __page_check_anon_rmap(struct page *page,
652 * are initially only visible via the pagetables, and the pte is locked 780 * are initially only visible via the pagetables, and the pte is locked
653 * over the call to page_add_new_anon_rmap. 781 * over the call to page_add_new_anon_rmap.
654 */ 782 */
655 struct anon_vma *anon_vma = vma->anon_vma;
656 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
657 BUG_ON(page->mapping != (struct address_space *)anon_vma);
658 BUG_ON(page->index != linear_page_index(vma, address)); 783 BUG_ON(page->index != linear_page_index(vma, address));
659#endif 784#endif
660} 785}
@@ -665,15 +790,24 @@ static void __page_check_anon_rmap(struct page *page,
665 * @vma: the vm area in which the mapping is added 790 * @vma: the vm area in which the mapping is added
666 * @address: the user virtual address mapped 791 * @address: the user virtual address mapped
667 * 792 *
668 * The caller needs to hold the pte lock and the page must be locked. 793 * The caller needs to hold the pte lock, and the page must be locked in
794 * the anon_vma case: to serialize mapping,index checking after setting,
795 * and to ensure that PageAnon is not being upgraded racily to PageKsm
796 * (but PageKsm is never downgraded to PageAnon).
669 */ 797 */
670void page_add_anon_rmap(struct page *page, 798void page_add_anon_rmap(struct page *page,
671 struct vm_area_struct *vma, unsigned long address) 799 struct vm_area_struct *vma, unsigned long address)
672{ 800{
801 int first = atomic_inc_and_test(&page->_mapcount);
802 if (first)
803 __inc_zone_page_state(page, NR_ANON_PAGES);
804 if (unlikely(PageKsm(page)))
805 return;
806
673 VM_BUG_ON(!PageLocked(page)); 807 VM_BUG_ON(!PageLocked(page));
674 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 808 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
675 if (atomic_inc_and_test(&page->_mapcount)) 809 if (first)
676 __page_set_anon_rmap(page, vma, address); 810 __page_set_anon_rmap(page, vma, address, 0);
677 else 811 else
678 __page_check_anon_rmap(page, vma, address); 812 __page_check_anon_rmap(page, vma, address);
679} 813}
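
The reordering above works because of how _mapcount is defined: it starts at -1, so atomic_inc_and_test() is true exactly for the first mapping, and the NR_ANON_PAGES bump can be done before looking at the Anon/Ksm distinction. A small sketch of the convention, with invented helper names:

#include <linux/mm.h>

/* _mapcount starts at -1 for an unmapped page. */
static inline int sketch_first_mapping(struct page *page)
{
	return atomic_inc_and_test(&page->_mapcount);	/* -1 -> 0 */
}

static inline int sketch_mapcount(struct page *page)
{
	return atomic_read(&page->_mapcount) + 1;	/* number of ptes */
}
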
@@ -694,7 +828,8 @@ void page_add_new_anon_rmap(struct page *page,
694 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 828 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
695 SetPageSwapBacked(page); 829 SetPageSwapBacked(page);
696 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 830 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
697 __page_set_anon_rmap(page, vma, address); 831 __inc_zone_page_state(page, NR_ANON_PAGES);
832 __page_set_anon_rmap(page, vma, address, 1);
698 if (page_evictable(page, vma)) 833 if (page_evictable(page, vma))
699 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 834 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
700 else 835 else
@@ -711,7 +846,7 @@ void page_add_file_rmap(struct page *page)
711{ 846{
712 if (atomic_inc_and_test(&page->_mapcount)) { 847 if (atomic_inc_and_test(&page->_mapcount)) {
713 __inc_zone_page_state(page, NR_FILE_MAPPED); 848 __inc_zone_page_state(page, NR_FILE_MAPPED);
714 mem_cgroup_update_mapped_file_stat(page, 1); 849 mem_cgroup_update_file_mapped(page, 1);
715 } 850 }
716} 851}
717 852
@@ -743,8 +878,8 @@ void page_remove_rmap(struct page *page)
743 __dec_zone_page_state(page, NR_ANON_PAGES); 878 __dec_zone_page_state(page, NR_ANON_PAGES);
744 } else { 879 } else {
745 __dec_zone_page_state(page, NR_FILE_MAPPED); 880 __dec_zone_page_state(page, NR_FILE_MAPPED);
881 mem_cgroup_update_file_mapped(page, -1);
746 } 882 }
747 mem_cgroup_update_mapped_file_stat(page, -1);
748 /* 883 /*
749 * It would be tidy to reset the PageAnon mapping here, 884 * It would be tidy to reset the PageAnon mapping here,
750 * but that might overwrite a racing page_add_anon_rmap 885 * but that might overwrite a racing page_add_anon_rmap
@@ -760,20 +895,15 @@ void page_remove_rmap(struct page *page)
760 * Subfunctions of try_to_unmap: try_to_unmap_one called 895 * Subfunctions of try_to_unmap: try_to_unmap_one called
761 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 896 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
762 */ 897 */
763static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 898int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
764 enum ttu_flags flags) 899 unsigned long address, enum ttu_flags flags)
765{ 900{
766 struct mm_struct *mm = vma->vm_mm; 901 struct mm_struct *mm = vma->vm_mm;
767 unsigned long address;
768 pte_t *pte; 902 pte_t *pte;
769 pte_t pteval; 903 pte_t pteval;
770 spinlock_t *ptl; 904 spinlock_t *ptl;
771 int ret = SWAP_AGAIN; 905 int ret = SWAP_AGAIN;
772 906
773 address = vma_address(page, vma);
774 if (address == -EFAULT)
775 goto out;
776
777 pte = page_check_address(page, mm, address, &ptl, 0); 907 pte = page_check_address(page, mm, address, &ptl, 0);
778 if (!pte) 908 if (!pte)
779 goto out; 909 goto out;
@@ -784,10 +914,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
784 * skipped over this mm) then we should reactivate it. 914 * skipped over this mm) then we should reactivate it.
785 */ 915 */
786 if (!(flags & TTU_IGNORE_MLOCK)) { 916 if (!(flags & TTU_IGNORE_MLOCK)) {
787 if (vma->vm_flags & VM_LOCKED) { 917 if (vma->vm_flags & VM_LOCKED)
788 ret = SWAP_MLOCK; 918 goto out_mlock;
919
920 if (TTU_ACTION(flags) == TTU_MUNLOCK)
789 goto out_unmap; 921 goto out_unmap;
790 }
791 } 922 }
792 if (!(flags & TTU_IGNORE_ACCESS)) { 923 if (!(flags & TTU_IGNORE_ACCESS)) {
793 if (ptep_clear_flush_young_notify(vma, address, pte)) { 924 if (ptep_clear_flush_young_notify(vma, address, pte)) {
@@ -809,9 +940,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
809 940
810 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { 941 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
811 if (PageAnon(page)) 942 if (PageAnon(page))
812 dec_mm_counter(mm, anon_rss); 943 dec_mm_counter(mm, MM_ANONPAGES);
813 else 944 else
814 dec_mm_counter(mm, file_rss); 945 dec_mm_counter(mm, MM_FILEPAGES);
815 set_pte_at(mm, address, pte, 946 set_pte_at(mm, address, pte,
816 swp_entry_to_pte(make_hwpoison_entry(page))); 947 swp_entry_to_pte(make_hwpoison_entry(page)));
817 } else if (PageAnon(page)) { 948 } else if (PageAnon(page)) {
@@ -822,14 +953,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
822 * Store the swap location in the pte. 953 * Store the swap location in the pte.
823 * See handle_pte_fault() ... 954 * See handle_pte_fault() ...
824 */ 955 */
825 swap_duplicate(entry); 956 if (swap_duplicate(entry) < 0) {
957 set_pte_at(mm, address, pte, pteval);
958 ret = SWAP_FAIL;
959 goto out_unmap;
960 }
826 if (list_empty(&mm->mmlist)) { 961 if (list_empty(&mm->mmlist)) {
827 spin_lock(&mmlist_lock); 962 spin_lock(&mmlist_lock);
828 if (list_empty(&mm->mmlist)) 963 if (list_empty(&mm->mmlist))
829 list_add(&mm->mmlist, &init_mm.mmlist); 964 list_add(&mm->mmlist, &init_mm.mmlist);
830 spin_unlock(&mmlist_lock); 965 spin_unlock(&mmlist_lock);
831 } 966 }
832 dec_mm_counter(mm, anon_rss); 967 dec_mm_counter(mm, MM_ANONPAGES);
968 inc_mm_counter(mm, MM_SWAPENTS);
833 } else if (PAGE_MIGRATION) { 969 } else if (PAGE_MIGRATION) {
834 /* 970 /*
835 * Store the pfn of the page in a special migration 971 * Store the pfn of the page in a special migration
@@ -847,8 +983,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
847 entry = make_migration_entry(page, pte_write(pteval)); 983 entry = make_migration_entry(page, pte_write(pteval));
848 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 984 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
849 } else 985 } else
850 dec_mm_counter(mm, file_rss); 986 dec_mm_counter(mm, MM_FILEPAGES);
851
852 987
853 page_remove_rmap(page); 988 page_remove_rmap(page);
854 page_cache_release(page); 989 page_cache_release(page);
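
The anon_rss/file_rss counters give way here to enum-indexed per-mm counters, plus a new MM_SWAPENTS counter accounting swap entries held by the mm. Roughly, as inferred from the new inc_mm_counter()/dec_mm_counter() call sites; see this series' include/linux/mm_types.h for the authoritative definition:

/* Indexes into the per-mm RSS counter array used above. */
enum {
	MM_FILEPAGES,	/* resident file-backed pages */
	MM_ANONPAGES,	/* resident anonymous pages */
	MM_SWAPENTS,	/* swap entries referenced by this mm */
	NR_MM_COUNTERS
};
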
@@ -857,6 +992,27 @@ out_unmap:
857 pte_unmap_unlock(pte, ptl); 992 pte_unmap_unlock(pte, ptl);
858out: 993out:
859 return ret; 994 return ret;
995
996out_mlock:
997 pte_unmap_unlock(pte, ptl);
998
999
1000 /*
 1001 * We need mmap_sem locking; otherwise the VM_LOCKED check is racy
 1002 * and gives an unstable result. We also can't wait here, because
 1003 * we now hold anon_vma->lock or mapping->i_mmap_lock.
 1004 * If the trylock fails, the page remains on the evictable LRU and
 1005 * vmscan can later retry moving it to the unevictable LRU if the
 1006 * page is actually mlocked.
1007 */
1008 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
1009 if (vma->vm_flags & VM_LOCKED) {
1010 mlock_vma_page(page);
1011 ret = SWAP_MLOCK;
1012 }
1013 up_read(&vma->vm_mm->mmap_sem);
1014 }
1015 return ret;
860} 1016}
861 1017
862/* 1018/*
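
For reference, the SWAP_* codes juggled in try_to_unmap_one() and its callers are the rmap result values from include/linux/rmap.h; the values below are as generally defined in this era and should be double-checked against the tree:

/* try_to_unmap()/try_to_munlock() result codes (include/linux/rmap.h). */
#define SWAP_SUCCESS	0	/* all mappings gone / nothing mlocked */
#define SWAP_AGAIN	1	/* could not finish; caller may retry */
#define SWAP_FAIL	2	/* hard failure (e.g. swap_duplicate() above) */
#define SWAP_MLOCK	3	/* page was found, or has now been, mlocked */
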
@@ -922,11 +1078,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
922 return ret; 1078 return ret;
923 1079
924 /* 1080 /*
925 * MLOCK_PAGES => feature is configured. 1081 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
926 * if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
927 * keep the sem while scanning the cluster for mlocking pages. 1082 * keep the sem while scanning the cluster for mlocking pages.
928 */ 1083 */
929 if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { 1084 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
930 locked_vma = (vma->vm_flags & VM_LOCKED); 1085 locked_vma = (vma->vm_flags & VM_LOCKED);
931 if (!locked_vma) 1086 if (!locked_vma)
932 up_read(&vma->vm_mm->mmap_sem); /* don't need it */ 1087 up_read(&vma->vm_mm->mmap_sem); /* don't need it */
@@ -967,7 +1122,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
967 1122
968 page_remove_rmap(page); 1123 page_remove_rmap(page);
969 page_cache_release(page); 1124 page_cache_release(page);
970 dec_mm_counter(mm, file_rss); 1125 dec_mm_counter(mm, MM_FILEPAGES);
971 (*mapcount)--; 1126 (*mapcount)--;
972 } 1127 }
973 pte_unmap_unlock(pte - 1, ptl); 1128 pte_unmap_unlock(pte - 1, ptl);
@@ -976,29 +1131,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
976 return ret; 1131 return ret;
977} 1132}
978 1133
979/*
980 * common handling for pages mapped in VM_LOCKED vmas
981 */
982static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
983{
984 int mlocked = 0;
985
986 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
987 if (vma->vm_flags & VM_LOCKED) {
988 mlock_vma_page(page);
989 mlocked++; /* really mlocked the page */
990 }
991 up_read(&vma->vm_mm->mmap_sem);
992 }
993 return mlocked;
994}
995
996/** 1134/**
997 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based 1135 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
998 * rmap method 1136 * rmap method
999 * @page: the page to unmap/unlock 1137 * @page: the page to unmap/unlock
1000 * @unlock: request for unlock rather than unmap [unlikely] 1138 * @flags: action and flags
1001 * @migration: unmapping for migration - ignored if @unlock
1002 * 1139 *
1003 * Find all the mappings of a page using the mapping pointer and the vma chains 1140 * Find all the mappings of a page using the mapping pointer and the vma chains
1004 * contained in the anon_vma struct it points to. 1141 * contained in the anon_vma struct it points to.
@@ -1013,43 +1150,24 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
1013static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) 1150static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1014{ 1151{
1015 struct anon_vma *anon_vma; 1152 struct anon_vma *anon_vma;
1016 struct vm_area_struct *vma; 1153 struct anon_vma_chain *avc;
1017 unsigned int mlocked = 0;
1018 int ret = SWAP_AGAIN; 1154 int ret = SWAP_AGAIN;
1019 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1020
1021 if (MLOCK_PAGES && unlikely(unlock))
1022 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
1023 1155
1024 anon_vma = page_lock_anon_vma(page); 1156 anon_vma = page_lock_anon_vma(page);
1025 if (!anon_vma) 1157 if (!anon_vma)
1026 return ret; 1158 return ret;
1027 1159
1028 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1160 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1029 if (MLOCK_PAGES && unlikely(unlock)) { 1161 struct vm_area_struct *vma = avc->vma;
1030 if (!((vma->vm_flags & VM_LOCKED) && 1162 unsigned long address = vma_address(page, vma);
1031 page_mapped_in_vma(page, vma))) 1163 if (address == -EFAULT)
1032 continue; /* must visit all unlocked vmas */ 1164 continue;
1033 ret = SWAP_MLOCK; /* saw at least one mlocked vma */ 1165 ret = try_to_unmap_one(page, vma, address, flags);
1034 } else { 1166 if (ret != SWAP_AGAIN || !page_mapped(page))
1035 ret = try_to_unmap_one(page, vma, flags); 1167 break;
1036 if (ret == SWAP_FAIL || !page_mapped(page))
1037 break;
1038 }
1039 if (ret == SWAP_MLOCK) {
1040 mlocked = try_to_mlock_page(page, vma);
1041 if (mlocked)
1042 break; /* stop if actually mlocked page */
1043 }
1044 } 1168 }
1045 1169
1046 page_unlock_anon_vma(anon_vma); 1170 page_unlock_anon_vma(anon_vma);
1047
1048 if (mlocked)
1049 ret = SWAP_MLOCK; /* actually mlocked the page */
1050 else if (ret == SWAP_MLOCK)
1051 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
1052
1053 return ret; 1171 return ret;
1054} 1172}
1055 1173
@@ -1079,48 +1197,30 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1079 unsigned long max_nl_cursor = 0; 1197 unsigned long max_nl_cursor = 0;
1080 unsigned long max_nl_size = 0; 1198 unsigned long max_nl_size = 0;
1081 unsigned int mapcount; 1199 unsigned int mapcount;
1082 unsigned int mlocked = 0;
1083 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1084
1085 if (MLOCK_PAGES && unlikely(unlock))
1086 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
1087 1200
1088 spin_lock(&mapping->i_mmap_lock); 1201 spin_lock(&mapping->i_mmap_lock);
1089 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1202 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1090 if (MLOCK_PAGES && unlikely(unlock)) { 1203 unsigned long address = vma_address(page, vma);
1091 if (!((vma->vm_flags & VM_LOCKED) && 1204 if (address == -EFAULT)
1092 page_mapped_in_vma(page, vma))) 1205 continue;
1093 continue; /* must visit all vmas */ 1206 ret = try_to_unmap_one(page, vma, address, flags);
1094 ret = SWAP_MLOCK; 1207 if (ret != SWAP_AGAIN || !page_mapped(page))
1095 } else { 1208 goto out;
1096 ret = try_to_unmap_one(page, vma, flags);
1097 if (ret == SWAP_FAIL || !page_mapped(page))
1098 goto out;
1099 }
1100 if (ret == SWAP_MLOCK) {
1101 mlocked = try_to_mlock_page(page, vma);
1102 if (mlocked)
1103 break; /* stop if actually mlocked page */
1104 }
1105 } 1209 }
1106 1210
1107 if (mlocked) 1211 if (list_empty(&mapping->i_mmap_nonlinear))
1108 goto out; 1212 goto out;
1109 1213
1110 if (list_empty(&mapping->i_mmap_nonlinear)) 1214 /*
1215 * We don't bother to try to find the munlocked page in nonlinears.
1216 * It's costly. Instead, later, page reclaim logic may call
1217 * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
1218 */
1219 if (TTU_ACTION(flags) == TTU_MUNLOCK)
1111 goto out; 1220 goto out;
1112 1221
1113 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1222 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1114 shared.vm_set.list) { 1223 shared.vm_set.list) {
1115 if (MLOCK_PAGES && unlikely(unlock)) {
1116 if (!(vma->vm_flags & VM_LOCKED))
1117 continue; /* must visit all vmas */
1118 ret = SWAP_MLOCK; /* leave mlocked == 0 */
1119 goto out; /* no need to look further */
1120 }
1121 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1122 (vma->vm_flags & VM_LOCKED))
1123 continue;
1124 cursor = (unsigned long) vma->vm_private_data; 1224 cursor = (unsigned long) vma->vm_private_data;
1125 if (cursor > max_nl_cursor) 1225 if (cursor > max_nl_cursor)
1126 max_nl_cursor = cursor; 1226 max_nl_cursor = cursor;
@@ -1153,16 +1253,12 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1153 do { 1253 do {
1154 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1254 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1155 shared.vm_set.list) { 1255 shared.vm_set.list) {
1156 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1157 (vma->vm_flags & VM_LOCKED))
1158 continue;
1159 cursor = (unsigned long) vma->vm_private_data; 1256 cursor = (unsigned long) vma->vm_private_data;
1160 while ( cursor < max_nl_cursor && 1257 while ( cursor < max_nl_cursor &&
1161 cursor < vma->vm_end - vma->vm_start) { 1258 cursor < vma->vm_end - vma->vm_start) {
1162 ret = try_to_unmap_cluster(cursor, &mapcount, 1259 if (try_to_unmap_cluster(cursor, &mapcount,
1163 vma, page); 1260 vma, page) == SWAP_MLOCK)
1164 if (ret == SWAP_MLOCK) 1261 ret = SWAP_MLOCK;
1165 mlocked = 2; /* to return below */
1166 cursor += CLUSTER_SIZE; 1262 cursor += CLUSTER_SIZE;
1167 vma->vm_private_data = (void *) cursor; 1263 vma->vm_private_data = (void *) cursor;
1168 if ((int)mapcount <= 0) 1264 if ((int)mapcount <= 0)
@@ -1183,10 +1279,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1183 vma->vm_private_data = NULL; 1279 vma->vm_private_data = NULL;
1184out: 1280out:
1185 spin_unlock(&mapping->i_mmap_lock); 1281 spin_unlock(&mapping->i_mmap_lock);
1186 if (mlocked)
1187 ret = SWAP_MLOCK; /* actually mlocked the page */
1188 else if (ret == SWAP_MLOCK)
1189 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
1190 return ret; 1282 return ret;
1191} 1283}
1192 1284
@@ -1210,7 +1302,9 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1210 1302
1211 BUG_ON(!PageLocked(page)); 1303 BUG_ON(!PageLocked(page));
1212 1304
1213 if (PageAnon(page)) 1305 if (unlikely(PageKsm(page)))
1306 ret = try_to_unmap_ksm(page, flags);
1307 else if (PageAnon(page))
1214 ret = try_to_unmap_anon(page, flags); 1308 ret = try_to_unmap_anon(page, flags);
1215 else 1309 else
1216 ret = try_to_unmap_file(page, flags); 1310 ret = try_to_unmap_file(page, flags);
@@ -1229,17 +1323,99 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1229 * 1323 *
1230 * Return values are: 1324 * Return values are:
1231 * 1325 *
1232 * SWAP_SUCCESS - no vma's holding page mlocked. 1326 * SWAP_AGAIN - no vma is holding page mlocked, or,
1233 * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem 1327 * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
1328 * SWAP_FAIL - page cannot be located at present
1234 * SWAP_MLOCK - page is now mlocked. 1329 * SWAP_MLOCK - page is now mlocked.
1235 */ 1330 */
1236int try_to_munlock(struct page *page) 1331int try_to_munlock(struct page *page)
1237{ 1332{
1238 VM_BUG_ON(!PageLocked(page) || PageLRU(page)); 1333 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1239 1334
1240 if (PageAnon(page)) 1335 if (unlikely(PageKsm(page)))
1336 return try_to_unmap_ksm(page, TTU_MUNLOCK);
1337 else if (PageAnon(page))
1241 return try_to_unmap_anon(page, TTU_MUNLOCK); 1338 return try_to_unmap_anon(page, TTU_MUNLOCK);
1242 else 1339 else
1243 return try_to_unmap_file(page, TTU_MUNLOCK); 1340 return try_to_unmap_file(page, TTU_MUNLOCK);
1244} 1341}
1245 1342
1343#ifdef CONFIG_MIGRATION
1344/*
1345 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
1346 * Called by migrate.c to remove migration ptes, but might be used more later.
1347 */
1348static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1349 struct vm_area_struct *, unsigned long, void *), void *arg)
1350{
1351 struct anon_vma *anon_vma;
1352 struct anon_vma_chain *avc;
1353 int ret = SWAP_AGAIN;
1354
1355 /*
1356 * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
1357 * because that depends on page_mapped(); but not all its usages
1358 * are holding mmap_sem, which also gave the necessary guarantee
1359 * (that this anon_vma's slab has not already been destroyed).
1360 * This needs to be reviewed later: avoiding page_lock_anon_vma()
1361 * is risky, and currently limits the usefulness of rmap_walk().
1362 */
1363 anon_vma = page_anon_vma(page);
1364 if (!anon_vma)
1365 return ret;
1366 spin_lock(&anon_vma->lock);
1367 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1368 struct vm_area_struct *vma = avc->vma;
1369 unsigned long address = vma_address(page, vma);
1370 if (address == -EFAULT)
1371 continue;
1372 ret = rmap_one(page, vma, address, arg);
1373 if (ret != SWAP_AGAIN)
1374 break;
1375 }
1376 spin_unlock(&anon_vma->lock);
1377 return ret;
1378}
1379
1380static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
1381 struct vm_area_struct *, unsigned long, void *), void *arg)
1382{
1383 struct address_space *mapping = page->mapping;
1384 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1385 struct vm_area_struct *vma;
1386 struct prio_tree_iter iter;
1387 int ret = SWAP_AGAIN;
1388
1389 if (!mapping)
1390 return ret;
1391 spin_lock(&mapping->i_mmap_lock);
1392 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1393 unsigned long address = vma_address(page, vma);
1394 if (address == -EFAULT)
1395 continue;
1396 ret = rmap_one(page, vma, address, arg);
1397 if (ret != SWAP_AGAIN)
1398 break;
1399 }
1400 /*
1401 * No nonlinear handling: being always shared, nonlinear vmas
1402 * never contain migration ptes. Decide what to do about this
1403 * limitation to linear when we need rmap_walk() on nonlinear.
1404 */
1405 spin_unlock(&mapping->i_mmap_lock);
1406 return ret;
1407}
1408
1409int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
1410 struct vm_area_struct *, unsigned long, void *), void *arg)
1411{
1412 VM_BUG_ON(!PageLocked(page));
1413
1414 if (unlikely(PageKsm(page)))
1415 return rmap_walk_ksm(page, rmap_one, arg);
1416 else if (PageAnon(page))
1417 return rmap_walk_anon(page, rmap_one, arg);
1418 else
1419 return rmap_walk_file(page, rmap_one, arg);
1420}
1421#endif /* CONFIG_MIGRATION */
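
A hypothetical caller of the new rmap_walk() interface, in the shape mm/migrate.c uses it; the callback name and body here are invented for illustration:

#include <linux/mm.h>
#include <linux/rmap.h>

/* Called once per (vma, address) mapping of @page; return SWAP_AGAIN to
 * keep walking, anything else to stop early. */
static int sketch_rmap_one(struct page *page, struct vm_area_struct *vma,
			   unsigned long address, void *arg)
{
	/* e.g. look up the pte at @address and rewrite a migration entry */
	return SWAP_AGAIN;
}

static void sketch_walk_all_mappings(struct page *page)
{
	rmap_walk(page, sketch_rmap_one, NULL);	/* page must be locked */
}
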
diff --git a/mm/shmem.c b/mm/shmem.c
index 356dd99566ec..eef4ebea5158 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -29,7 +29,6 @@
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/swap.h> 31#include <linux/swap.h>
32#include <linux/ima.h>
33 32
34static struct vfsmount *shm_mnt; 33static struct vfsmount *shm_mnt;
35 34
@@ -42,6 +41,7 @@ static struct vfsmount *shm_mnt;
42 41
43#include <linux/xattr.h> 42#include <linux/xattr.h>
44#include <linux/exportfs.h> 43#include <linux/exportfs.h>
44#include <linux/posix_acl.h>
45#include <linux/generic_acl.h> 45#include <linux/generic_acl.h>
46#include <linux/mman.h> 46#include <linux/mman.h>
47#include <linux/string.h> 47#include <linux/string.h>
@@ -810,7 +810,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
810 error = inode_setattr(inode, attr); 810 error = inode_setattr(inode, attr);
811#ifdef CONFIG_TMPFS_POSIX_ACL 811#ifdef CONFIG_TMPFS_POSIX_ACL
812 if (!error && (attr->ia_valid & ATTR_MODE)) 812 if (!error && (attr->ia_valid & ATTR_MODE))
813 error = generic_acl_chmod(inode, &shmem_acl_ops); 813 error = generic_acl_chmod(inode);
814#endif 814#endif
815 if (page) 815 if (page)
816 page_cache_release(page); 816 page_cache_release(page);
@@ -1017,7 +1017,14 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
1017 goto out; 1017 goto out;
1018 } 1018 }
1019 mutex_unlock(&shmem_swaplist_mutex); 1019 mutex_unlock(&shmem_swaplist_mutex);
1020out: return found; /* 0 or 1 or -ENOMEM */ 1020 /*
1021 * Can some race bring us here? We've been holding page lock,
1022 * so I think not; but would rather try again later than BUG()
1023 */
1024 unlock_page(page);
1025 page_cache_release(page);
1026out:
1027 return (found < 0) ? found : 0;
1021} 1028}
1022 1029
1023/* 1030/*
@@ -1080,7 +1087,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1080 else 1087 else
1081 inode = NULL; 1088 inode = NULL;
1082 spin_unlock(&info->lock); 1089 spin_unlock(&info->lock);
1083 swap_duplicate(swap); 1090 swap_shmem_alloc(swap);
1084 BUG_ON(page_mapped(page)); 1091 BUG_ON(page_mapped(page));
1085 page_cache_release(page); /* pagecache ref */ 1092 page_cache_release(page); /* pagecache ref */
1086 swap_writepage(page, wbc); 1093 swap_writepage(page, wbc);
@@ -1817,11 +1824,15 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1817 return error; 1824 return error;
1818 } 1825 }
1819 } 1826 }
1820 error = shmem_acl_init(inode, dir); 1827#ifdef CONFIG_TMPFS_POSIX_ACL
1828 error = generic_acl_init(inode, dir);
1821 if (error) { 1829 if (error) {
1822 iput(inode); 1830 iput(inode);
1823 return error; 1831 return error;
1824 } 1832 }
1833#else
1834 error = 0;
1835#endif
1825 if (dir->i_mode & S_ISGID) { 1836 if (dir->i_mode & S_ISGID) {
1826 inode->i_gid = dir->i_gid; 1837 inode->i_gid = dir->i_gid;
1827 if (S_ISDIR(mode)) 1838 if (S_ISDIR(mode))
@@ -2036,27 +2047,28 @@ static const struct inode_operations shmem_symlink_inode_operations = {
2036 * filesystem level, though. 2047 * filesystem level, though.
2037 */ 2048 */
2038 2049
2039static size_t shmem_xattr_security_list(struct inode *inode, char *list, 2050static size_t shmem_xattr_security_list(struct dentry *dentry, char *list,
2040 size_t list_len, const char *name, 2051 size_t list_len, const char *name,
2041 size_t name_len) 2052 size_t name_len, int handler_flags)
2042{ 2053{
2043 return security_inode_listsecurity(inode, list, list_len); 2054 return security_inode_listsecurity(dentry->d_inode, list, list_len);
2044} 2055}
2045 2056
2046static int shmem_xattr_security_get(struct inode *inode, const char *name, 2057static int shmem_xattr_security_get(struct dentry *dentry, const char *name,
2047 void *buffer, size_t size) 2058 void *buffer, size_t size, int handler_flags)
2048{ 2059{
2049 if (strcmp(name, "") == 0) 2060 if (strcmp(name, "") == 0)
2050 return -EINVAL; 2061 return -EINVAL;
2051 return xattr_getsecurity(inode, name, buffer, size); 2062 return xattr_getsecurity(dentry->d_inode, name, buffer, size);
2052} 2063}
2053 2064
2054static int shmem_xattr_security_set(struct inode *inode, const char *name, 2065static int shmem_xattr_security_set(struct dentry *dentry, const char *name,
2055 const void *value, size_t size, int flags) 2066 const void *value, size_t size, int flags, int handler_flags)
2056{ 2067{
2057 if (strcmp(name, "") == 0) 2068 if (strcmp(name, "") == 0)
2058 return -EINVAL; 2069 return -EINVAL;
2059 return security_inode_setsecurity(inode, name, value, size, flags); 2070 return security_inode_setsecurity(dentry->d_inode, name, value,
2071 size, flags);
2060} 2072}
2061 2073
2062static struct xattr_handler shmem_xattr_security_handler = { 2074static struct xattr_handler shmem_xattr_security_handler = {
@@ -2067,8 +2079,8 @@ static struct xattr_handler shmem_xattr_security_handler = {
2067}; 2079};
2068 2080
2069static struct xattr_handler *shmem_xattr_handlers[] = { 2081static struct xattr_handler *shmem_xattr_handlers[] = {
2070 &shmem_xattr_acl_access_handler, 2082 &generic_acl_access_handler,
2071 &shmem_xattr_acl_default_handler, 2083 &generic_acl_default_handler,
2072 &shmem_xattr_security_handler, 2084 &shmem_xattr_security_handler,
2073 NULL 2085 NULL
2074}; 2086};
@@ -2447,7 +2459,7 @@ static const struct inode_operations shmem_inode_operations = {
2447 .getxattr = generic_getxattr, 2459 .getxattr = generic_getxattr,
2448 .listxattr = generic_listxattr, 2460 .listxattr = generic_listxattr,
2449 .removexattr = generic_removexattr, 2461 .removexattr = generic_removexattr,
2450 .check_acl = shmem_check_acl, 2462 .check_acl = generic_check_acl,
2451#endif 2463#endif
2452 2464
2453}; 2465};
@@ -2470,7 +2482,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2470 .getxattr = generic_getxattr, 2482 .getxattr = generic_getxattr,
2471 .listxattr = generic_listxattr, 2483 .listxattr = generic_listxattr,
2472 .removexattr = generic_removexattr, 2484 .removexattr = generic_removexattr,
2473 .check_acl = shmem_check_acl, 2485 .check_acl = generic_check_acl,
2474#endif 2486#endif
2475}; 2487};
2476 2488
@@ -2481,7 +2493,7 @@ static const struct inode_operations shmem_special_inode_operations = {
2481 .getxattr = generic_getxattr, 2493 .getxattr = generic_getxattr,
2482 .listxattr = generic_listxattr, 2494 .listxattr = generic_listxattr,
2483 .removexattr = generic_removexattr, 2495 .removexattr = generic_removexattr,
2484 .check_acl = shmem_check_acl, 2496 .check_acl = generic_check_acl,
2485#endif 2497#endif
2486}; 2498};
2487 2499
@@ -2619,7 +2631,8 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2619 int error; 2631 int error;
2620 struct file *file; 2632 struct file *file;
2621 struct inode *inode; 2633 struct inode *inode;
2622 struct dentry *dentry, *root; 2634 struct path path;
2635 struct dentry *root;
2623 struct qstr this; 2636 struct qstr this;
2624 2637
2625 if (IS_ERR(shm_mnt)) 2638 if (IS_ERR(shm_mnt))
@@ -2636,38 +2649,35 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2636 this.len = strlen(name); 2649 this.len = strlen(name);
2637 this.hash = 0; /* will go */ 2650 this.hash = 0; /* will go */
2638 root = shm_mnt->mnt_root; 2651 root = shm_mnt->mnt_root;
2639 dentry = d_alloc(root, &this); 2652 path.dentry = d_alloc(root, &this);
2640 if (!dentry) 2653 if (!path.dentry)
2641 goto put_memory; 2654 goto put_memory;
2642 2655 path.mnt = mntget(shm_mnt);
2643 error = -ENFILE;
2644 file = get_empty_filp();
2645 if (!file)
2646 goto put_dentry;
2647 2656
2648 error = -ENOSPC; 2657 error = -ENOSPC;
2649 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); 2658 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags);
2650 if (!inode) 2659 if (!inode)
2651 goto close_file; 2660 goto put_dentry;
2652 2661
2653 d_instantiate(dentry, inode); 2662 d_instantiate(path.dentry, inode);
2654 inode->i_size = size; 2663 inode->i_size = size;
2655 inode->i_nlink = 0; /* It is unlinked */ 2664 inode->i_nlink = 0; /* It is unlinked */
2656 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
2657 &shmem_file_operations);
2658
2659#ifndef CONFIG_MMU 2665#ifndef CONFIG_MMU
2660 error = ramfs_nommu_expand_for_mapping(inode, size); 2666 error = ramfs_nommu_expand_for_mapping(inode, size);
2661 if (error) 2667 if (error)
2662 goto close_file; 2668 goto put_dentry;
2663#endif 2669#endif
2664 ima_counts_get(file); 2670
2671 error = -ENFILE;
2672 file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
2673 &shmem_file_operations);
2674 if (!file)
2675 goto put_dentry;
2676
2665 return file; 2677 return file;
2666 2678
2667close_file:
2668 put_filp(file);
2669put_dentry: 2679put_dentry:
2670 dput(dentry); 2680 path_put(&path);
2671put_memory: 2681put_memory:
2672 shmem_unacct_size(flags, size); 2682 shmem_unacct_size(flags, size);
2673 return ERR_PTR(error); 2683 return ERR_PTR(error);
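
shmem_file_setup() now builds a struct path first and only then allocates the struct file with alloc_file(); any failure after that point is unwound with a single path_put(). A minimal sketch of the same ordering, assuming the alloc_file(path, mode, fops) prototype used above; the sketch_ name and error handling are illustrative:

#include <linux/fs.h>
#include <linux/file.h>
#include <linux/path.h>
#include <linux/mount.h>
#include <linux/dcache.h>

static struct file *sketch_file_on_dentry(struct vfsmount *mnt,
					  struct dentry *dentry,
					  const struct file_operations *fops)
{
	struct path path;
	struct file *file;

	path.dentry = dget(dentry);
	path.mnt = mntget(mnt);

	file = alloc_file(&path, FMODE_WRITE | FMODE_READ, fops);
	if (!file)
		path_put(&path);	/* drops both dentry and mnt refs */
	return file;
}
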
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
deleted file mode 100644
index df2c87fdae50..000000000000
--- a/mm/shmem_acl.c
+++ /dev/null
@@ -1,171 +0,0 @@
1/*
2 * mm/shmem_acl.c
3 *
4 * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
5 *
6 * This file is released under the GPL.
7 */
8
9#include <linux/fs.h>
10#include <linux/shmem_fs.h>
11#include <linux/xattr.h>
12#include <linux/generic_acl.h>
13
14/**
15 * shmem_get_acl - generic_acl_operations->getacl() operation
16 */
17static struct posix_acl *
18shmem_get_acl(struct inode *inode, int type)
19{
20 struct posix_acl *acl = NULL;
21
22 spin_lock(&inode->i_lock);
23 switch(type) {
24 case ACL_TYPE_ACCESS:
25 acl = posix_acl_dup(inode->i_acl);
26 break;
27
28 case ACL_TYPE_DEFAULT:
29 acl = posix_acl_dup(inode->i_default_acl);
30 break;
31 }
32 spin_unlock(&inode->i_lock);
33
34 return acl;
35}
36
37/**
38 * shmem_set_acl - generic_acl_operations->setacl() operation
39 */
40static void
41shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
42{
43 struct posix_acl *free = NULL;
44
45 spin_lock(&inode->i_lock);
46 switch(type) {
47 case ACL_TYPE_ACCESS:
48 free = inode->i_acl;
49 inode->i_acl = posix_acl_dup(acl);
50 break;
51
52 case ACL_TYPE_DEFAULT:
53 free = inode->i_default_acl;
54 inode->i_default_acl = posix_acl_dup(acl);
55 break;
56 }
57 spin_unlock(&inode->i_lock);
58 posix_acl_release(free);
59}
60
61struct generic_acl_operations shmem_acl_ops = {
62 .getacl = shmem_get_acl,
63 .setacl = shmem_set_acl,
64};
65
66/**
67 * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access,
68 * shmem_xattr_acl_access_handler - plumbing code to implement the
69 * system.posix_acl_access xattr using the generic acl functions.
70 */
71
72static size_t
73shmem_list_acl_access(struct inode *inode, char *list, size_t list_size,
74 const char *name, size_t name_len)
75{
76 return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS,
77 list, list_size);
78}
79
80static int
81shmem_get_acl_access(struct inode *inode, const char *name, void *buffer,
82 size_t size)
83{
84 if (strcmp(name, "") != 0)
85 return -EINVAL;
86 return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer,
87 size);
88}
89
90static int
91shmem_set_acl_access(struct inode *inode, const char *name, const void *value,
92 size_t size, int flags)
93{
94 if (strcmp(name, "") != 0)
95 return -EINVAL;
96 return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value,
97 size);
98}
99
100struct xattr_handler shmem_xattr_acl_access_handler = {
101 .prefix = POSIX_ACL_XATTR_ACCESS,
102 .list = shmem_list_acl_access,
103 .get = shmem_get_acl_access,
104 .set = shmem_set_acl_access,
105};
106
107/**
108 * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default,
109 * shmem_xattr_acl_default_handler - plumbing code to implement the
110 * system.posix_acl_default xattr using the generic acl functions.
111 */
112
113static size_t
114shmem_list_acl_default(struct inode *inode, char *list, size_t list_size,
115 const char *name, size_t name_len)
116{
117 return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT,
118 list, list_size);
119}
120
121static int
122shmem_get_acl_default(struct inode *inode, const char *name, void *buffer,
123 size_t size)
124{
125 if (strcmp(name, "") != 0)
126 return -EINVAL;
127 return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer,
128 size);
129}
130
131static int
132shmem_set_acl_default(struct inode *inode, const char *name, const void *value,
133 size_t size, int flags)
134{
135 if (strcmp(name, "") != 0)
136 return -EINVAL;
137 return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value,
138 size);
139}
140
141struct xattr_handler shmem_xattr_acl_default_handler = {
142 .prefix = POSIX_ACL_XATTR_DEFAULT,
143 .list = shmem_list_acl_default,
144 .get = shmem_get_acl_default,
145 .set = shmem_set_acl_default,
146};
147
148/**
149 * shmem_acl_init - Inizialize the acl(s) of a new inode
150 */
151int
152shmem_acl_init(struct inode *inode, struct inode *dir)
153{
154 return generic_acl_init(inode, dir, &shmem_acl_ops);
155}
156
157/**
158 * shmem_check_acl - check_acl() callback for generic_permission()
159 */
160int
161shmem_check_acl(struct inode *inode, int mask)
162{
163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
164
165 if (acl) {
166 int error = posix_acl_permission(inode, acl, mask);
167 posix_acl_release(acl);
168 return error;
169 }
170 return -EAGAIN;
171}
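
With mm/shmem_acl.c gone, tmpfs relies on the ACLs cached in the generic inode fields and on fs/generic_acl.c. The check_acl step it now points at amounts to roughly the following; this is a sketch, not the fs/generic_acl.c source:

#include <linux/fs.h>
#include <linux/posix_acl.h>

/* Sketch: permission check against the cached ACL_TYPE_ACCESS acl. */
static int sketch_check_acl(struct inode *inode, int mask)
{
	struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS);

	if (acl) {
		int error = posix_acl_permission(inode, acl, mask);
		posix_acl_release(acl);
		return error;
	}
	return -EAGAIN;	/* fall back to the normal mode bits */
}
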
diff --git a/mm/slab.c b/mm/slab.c
index 7dfa481c96ba..bac0f4fcc216 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -490,7 +490,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
490 490
491#endif 491#endif
492 492
493#ifdef CONFIG_KMEMTRACE 493#ifdef CONFIG_TRACING
494size_t slab_buffer_size(struct kmem_cache *cachep) 494size_t slab_buffer_size(struct kmem_cache *cachep)
495{ 495{
496 return cachep->buffer_size; 496 return cachep->buffer_size;
@@ -604,6 +604,26 @@ static struct kmem_cache cache_cache = {
604 604
605#define BAD_ALIEN_MAGIC 0x01020304ul 605#define BAD_ALIEN_MAGIC 0x01020304ul
606 606
607/*
608 * chicken and egg problem: delay the per-cpu array allocation
609 * until the general caches are up.
610 */
611static enum {
612 NONE,
613 PARTIAL_AC,
614 PARTIAL_L3,
615 EARLY,
616 FULL
617} g_cpucache_up;
618
619/*
620 * used by boot code to determine if it can use slab based allocator
621 */
622int slab_is_available(void)
623{
624 return g_cpucache_up >= EARLY;
625}
626
607#ifdef CONFIG_LOCKDEP 627#ifdef CONFIG_LOCKDEP
608 628
609/* 629/*
@@ -620,40 +640,52 @@ static struct kmem_cache cache_cache = {
620static struct lock_class_key on_slab_l3_key; 640static struct lock_class_key on_slab_l3_key;
621static struct lock_class_key on_slab_alc_key; 641static struct lock_class_key on_slab_alc_key;
622 642
623static inline void init_lock_keys(void) 643static void init_node_lock_keys(int q)
624
625{ 644{
626 int q;
627 struct cache_sizes *s = malloc_sizes; 645 struct cache_sizes *s = malloc_sizes;
628 646
629 while (s->cs_size != ULONG_MAX) { 647 if (g_cpucache_up != FULL)
630 for_each_node(q) { 648 return;
631 struct array_cache **alc; 649
632 int r; 650 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
633 struct kmem_list3 *l3 = s->cs_cachep->nodelists[q]; 651 struct array_cache **alc;
634 if (!l3 || OFF_SLAB(s->cs_cachep)) 652 struct kmem_list3 *l3;
635 continue; 653 int r;
636 lockdep_set_class(&l3->list_lock, &on_slab_l3_key); 654
637 alc = l3->alien; 655 l3 = s->cs_cachep->nodelists[q];
638 /* 656 if (!l3 || OFF_SLAB(s->cs_cachep))
639 * FIXME: This check for BAD_ALIEN_MAGIC 657 continue;
640 * should go away when common slab code is taught to 658 lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
641 * work even without alien caches. 659 alc = l3->alien;
642 * Currently, non NUMA code returns BAD_ALIEN_MAGIC 660 /*
643 * for alloc_alien_cache, 661 * FIXME: This check for BAD_ALIEN_MAGIC
644 */ 662 * should go away when common slab code is taught to
645 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) 663 * work even without alien caches.
646 continue; 664 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
647 for_each_node(r) { 665 * for alloc_alien_cache,
648 if (alc[r]) 666 */
649 lockdep_set_class(&alc[r]->lock, 667 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
650 &on_slab_alc_key); 668 continue;
651 } 669 for_each_node(r) {
670 if (alc[r])
671 lockdep_set_class(&alc[r]->lock,
672 &on_slab_alc_key);
652 } 673 }
653 s++;
654 } 674 }
655} 675}
676
677static inline void init_lock_keys(void)
678{
679 int node;
680
681 for_each_node(node)
682 init_node_lock_keys(node);
683}
656#else 684#else
685static void init_node_lock_keys(int q)
686{
687}
688
657static inline void init_lock_keys(void) 689static inline void init_lock_keys(void)
658{ 690{
659} 691}
@@ -665,27 +697,7 @@ static inline void init_lock_keys(void)
665static DEFINE_MUTEX(cache_chain_mutex); 697static DEFINE_MUTEX(cache_chain_mutex);
666static struct list_head cache_chain; 698static struct list_head cache_chain;
667 699
668/* 700static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
669 * chicken and egg problem: delay the per-cpu array allocation
670 * until the general caches are up.
671 */
672static enum {
673 NONE,
674 PARTIAL_AC,
675 PARTIAL_L3,
676 EARLY,
677 FULL
678} g_cpucache_up;
679
680/*
681 * used by boot code to determine if it can use slab based allocator
682 */
683int slab_is_available(void)
684{
685 return g_cpucache_up >= EARLY;
686}
687
688static DEFINE_PER_CPU(struct delayed_work, reap_work);
689 701
690static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 702static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
691{ 703{
@@ -826,7 +838,7 @@ __setup("noaliencache", noaliencache_setup);
826 * objects freed on different nodes from which they were allocated) and the 838 * objects freed on different nodes from which they were allocated) and the
827 * flushing of remote pcps by calling drain_node_pages. 839 * flushing of remote pcps by calling drain_node_pages.
828 */ 840 */
829static DEFINE_PER_CPU(unsigned long, reap_node); 841static DEFINE_PER_CPU(unsigned long, slab_reap_node);
830 842
831static void init_reap_node(int cpu) 843static void init_reap_node(int cpu)
832{ 844{
@@ -836,17 +848,17 @@ static void init_reap_node(int cpu)
836 if (node == MAX_NUMNODES) 848 if (node == MAX_NUMNODES)
837 node = first_node(node_online_map); 849 node = first_node(node_online_map);
838 850
839 per_cpu(reap_node, cpu) = node; 851 per_cpu(slab_reap_node, cpu) = node;
840} 852}
841 853
842static void next_reap_node(void) 854static void next_reap_node(void)
843{ 855{
844 int node = __get_cpu_var(reap_node); 856 int node = __get_cpu_var(slab_reap_node);
845 857
846 node = next_node(node, node_online_map); 858 node = next_node(node, node_online_map);
847 if (unlikely(node >= MAX_NUMNODES)) 859 if (unlikely(node >= MAX_NUMNODES))
848 node = first_node(node_online_map); 860 node = first_node(node_online_map);
849 __get_cpu_var(reap_node) = node; 861 __get_cpu_var(slab_reap_node) = node;
850} 862}
851 863
852#else 864#else
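
The reap_work/reap_node per-CPU variables are only being renamed into a slab_ namespace here (per-CPU symbol names share a global namespace under the new percpu allocator); the access pattern itself is unchanged. For orientation, the accessors work like this, using a hypothetical example variable:

#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, sketch_counter);

static void sketch_bump_this_cpu(void)
{
	__get_cpu_var(sketch_counter)++;	/* this CPU's copy */
}

static unsigned long sketch_read_cpu(int cpu)
{
	return per_cpu(sketch_counter, cpu);	/* a given CPU's copy */
}
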
@@ -863,7 +875,7 @@ static void next_reap_node(void)
863 */ 875 */
864static void __cpuinit start_cpu_timer(int cpu) 876static void __cpuinit start_cpu_timer(int cpu)
865{ 877{
866 struct delayed_work *reap_work = &per_cpu(reap_work, cpu); 878 struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
867 879
868 /* 880 /*
869 * When this gets called from do_initcalls via cpucache_init(), 881 * When this gets called from do_initcalls via cpucache_init(),
@@ -923,7 +935,6 @@ static int transfer_objects(struct array_cache *to,
923 935
924 from->avail -= nr; 936 from->avail -= nr;
925 to->avail += nr; 937 to->avail += nr;
926 to->touched = 1;
927 return nr; 938 return nr;
928} 939}
929 940
@@ -971,13 +982,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
971 982
972 if (limit > 1) 983 if (limit > 1)
973 limit = 12; 984 limit = 12;
974 ac_ptr = kmalloc_node(memsize, gfp, node); 985 ac_ptr = kzalloc_node(memsize, gfp, node);
975 if (ac_ptr) { 986 if (ac_ptr) {
976 for_each_node(i) { 987 for_each_node(i) {
977 if (i == node || !node_online(i)) { 988 if (i == node || !node_online(i))
978 ac_ptr[i] = NULL;
979 continue; 989 continue;
980 }
981 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); 990 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
982 if (!ac_ptr[i]) { 991 if (!ac_ptr[i]) {
983 for (i--; i >= 0; i--) 992 for (i--; i >= 0; i--)
@@ -1027,7 +1036,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
1027 */ 1036 */
1028static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) 1037static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1029{ 1038{
1030 int node = __get_cpu_var(reap_node); 1039 int node = __get_cpu_var(slab_reap_node);
1031 1040
1032 if (l3->alien) { 1041 if (l3->alien) {
1033 struct array_cache *ac = l3->alien[node]; 1042 struct array_cache *ac = l3->alien[node];
@@ -1120,7 +1129,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1120 if (nc) 1129 if (nc)
1121 free_block(cachep, nc->entry, nc->avail, node); 1130 free_block(cachep, nc->entry, nc->avail, node);
1122 1131
1123 if (!cpus_empty(*mask)) { 1132 if (!cpumask_empty(mask)) {
1124 spin_unlock_irq(&l3->list_lock); 1133 spin_unlock_irq(&l3->list_lock);
1125 goto free_array_cache; 1134 goto free_array_cache;
1126 } 1135 }
@@ -1254,6 +1263,8 @@ static int __cpuinit cpuup_prepare(long cpu)
1254 kfree(shared); 1263 kfree(shared);
1255 free_alien_cache(alien); 1264 free_alien_cache(alien);
1256 } 1265 }
1266 init_node_lock_keys(node);
1267
1257 return 0; 1268 return 0;
1258bad: 1269bad:
1259 cpuup_canceled(cpu); 1270 cpuup_canceled(cpu);
@@ -1286,9 +1297,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1286 * anything expensive but will only modify reap_work 1297 * anything expensive but will only modify reap_work
1287 * and reschedule the timer. 1298 * and reschedule the timer.
1288 */ 1299 */
1289 cancel_rearming_delayed_work(&per_cpu(reap_work, cpu)); 1300 cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu));
1290 /* Now the cache_reaper is guaranteed to be not running. */ 1301 /* Now the cache_reaper is guaranteed to be not running. */
1291 per_cpu(reap_work, cpu).work.func = NULL; 1302 per_cpu(slab_reap_work, cpu).work.func = NULL;
1292 break; 1303 break;
1293 case CPU_DOWN_FAILED: 1304 case CPU_DOWN_FAILED:
1294 case CPU_DOWN_FAILED_FROZEN: 1305 case CPU_DOWN_FAILED_FROZEN:
@@ -2261,9 +2272,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2261 /* 2272 /*
2262 * Determine if the slab management is 'on' or 'off' slab. 2273 * Determine if the slab management is 'on' or 'off' slab.
2263 * (bootstrapping cannot cope with offslab caches so don't do 2274 * (bootstrapping cannot cope with offslab caches so don't do
2264 * it too early on.) 2275 * it too early on. Always use on-slab management when
 2276 * SLAB_NOLEAKTRACE is set, to avoid recursive calls into kmemleak)
2265 */ 2277 */
2266 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) 2278 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
2279 !(flags & SLAB_NOLEAKTRACE))
2267 /* 2280 /*
2268 * Size is large, assume best to place the slab management obj 2281 * Size is large, assume best to place the slab management obj
2269 * off-slab (should allow better packing of objs). 2282 * off-slab (should allow better packing of objs).
@@ -2582,8 +2595,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2582 * kmemleak does not treat the ->s_mem pointer as a reference 2595 * kmemleak does not treat the ->s_mem pointer as a reference
2583 * to the object. Otherwise we will not report the leak. 2596 * to the object. Otherwise we will not report the leak.
2584 */ 2597 */
2585 kmemleak_scan_area(slabp, offsetof(struct slab, list), 2598 kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
2586 sizeof(struct list_head), local_flags); 2599 local_flags);
2587 if (!slabp) 2600 if (!slabp)
2588 return NULL; 2601 return NULL;
2589 } else { 2602 } else {
@@ -2947,8 +2960,10 @@ retry:
2947 spin_lock(&l3->list_lock); 2960 spin_lock(&l3->list_lock);
2948 2961
2949 /* See if we can refill from the shared array */ 2962 /* See if we can refill from the shared array */
2950 if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) 2963 if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
2964 l3->shared->touched = 1;
2951 goto alloc_done; 2965 goto alloc_done;
2966 }
2952 2967
2953 while (batchcount > 0) { 2968 while (batchcount > 0) {
2954 struct list_head *entry; 2969 struct list_head *entry;
@@ -3085,7 +3100,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3085 if (cachep == &cache_cache) 3100 if (cachep == &cache_cache)
3086 return false; 3101 return false;
3087 3102
3088 return should_failslab(obj_size(cachep), flags); 3103 return should_failslab(obj_size(cachep), flags, cachep->flags);
3089} 3104}
3090 3105
3091static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3106static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
@@ -3103,13 +3118,19 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3103 } else { 3118 } else {
3104 STATS_INC_ALLOCMISS(cachep); 3119 STATS_INC_ALLOCMISS(cachep);
3105 objp = cache_alloc_refill(cachep, flags); 3120 objp = cache_alloc_refill(cachep, flags);
3121 /*
3122 * the 'ac' may be updated by cache_alloc_refill(),
3123 * and kmemleak_erase() requires its correct value.
3124 */
3125 ac = cpu_cache_get(cachep);
3106 } 3126 }
3107 /* 3127 /*
3108 * To avoid a false negative, if an object that is in one of the 3128 * To avoid a false negative, if an object that is in one of the
3109 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 3129 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
3110 * treat the array pointers as a reference to the object. 3130 * treat the array pointers as a reference to the object.
3111 */ 3131 */
3112 kmemleak_erase(&ac->entry[ac->avail]); 3132 if (objp)
3133 kmemleak_erase(&ac->entry[ac->avail]);
3113 return objp; 3134 return objp;
3114} 3135}
3115 3136
@@ -3306,7 +3327,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3306 cache_alloc_debugcheck_before(cachep, flags); 3327 cache_alloc_debugcheck_before(cachep, flags);
3307 local_irq_save(save_flags); 3328 local_irq_save(save_flags);
3308 3329
3309 if (unlikely(nodeid == -1)) 3330 if (nodeid == -1)
3310 nodeid = numa_node_id(); 3331 nodeid = numa_node_id();
3311 3332
3312 if (unlikely(!cachep->nodelists[nodeid])) { 3333 if (unlikely(!cachep->nodelists[nodeid])) {
@@ -3558,7 +3579,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3558} 3579}
3559EXPORT_SYMBOL(kmem_cache_alloc); 3580EXPORT_SYMBOL(kmem_cache_alloc);
3560 3581
3561#ifdef CONFIG_KMEMTRACE 3582#ifdef CONFIG_TRACING
3562void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) 3583void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
3563{ 3584{
3564 return __cache_alloc(cachep, flags, __builtin_return_address(0)); 3585 return __cache_alloc(cachep, flags, __builtin_return_address(0));
@@ -3581,21 +3602,10 @@ EXPORT_SYMBOL(kmem_cache_alloc_notrace);
3581 */ 3602 */
3582int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) 3603int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3583{ 3604{
3584 unsigned long addr = (unsigned long)ptr;
3585 unsigned long min_addr = PAGE_OFFSET;
3586 unsigned long align_mask = BYTES_PER_WORD - 1;
3587 unsigned long size = cachep->buffer_size; 3605 unsigned long size = cachep->buffer_size;
3588 struct page *page; 3606 struct page *page;
3589 3607
3590 if (unlikely(addr < min_addr)) 3608 if (unlikely(!kern_ptr_validate(ptr, size)))
3591 goto out;
3592 if (unlikely(addr > (unsigned long)high_memory - size))
3593 goto out;
3594 if (unlikely(addr & align_mask))
3595 goto out;
3596 if (unlikely(!kern_addr_valid(addr)))
3597 goto out;
3598 if (unlikely(!kern_addr_valid(addr + size - 1)))
3599 goto out; 3609 goto out;
3600 page = virt_to_page(ptr); 3610 page = virt_to_page(ptr);
3601 if (unlikely(!PageSlab(page))) 3611 if (unlikely(!PageSlab(page)))
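
The open-coded pointer checks removed above are folded into a kern_ptr_validate() helper shared with SLUB. Reconstructed from the deleted lines, it performs roughly these checks; this is a sketch, not the mm/util.c implementation:

#include <linux/mm.h>

/* Sketch: is [ptr, ptr + size) a plausible, mapped kernel pointer? */
static inline int sketch_kern_ptr_validate(const void *ptr, unsigned long size)
{
	unsigned long addr = (unsigned long)ptr;

	if (addr < PAGE_OFFSET)
		return 0;
	if (addr > (unsigned long)high_memory - size)
		return 0;
	if (addr & (sizeof(void *) - 1))	/* word alignment */
		return 0;
	if (!kern_addr_valid(addr) || !kern_addr_valid(addr + size - 1))
		return 0;
	return 1;
}
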
@@ -3621,7 +3631,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3621} 3631}
3622EXPORT_SYMBOL(kmem_cache_alloc_node); 3632EXPORT_SYMBOL(kmem_cache_alloc_node);
3623 3633
3624#ifdef CONFIG_KMEMTRACE 3634#ifdef CONFIG_TRACING
3625void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, 3635void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
3626 gfp_t flags, 3636 gfp_t flags,
3627 int nodeid) 3637 int nodeid)
@@ -3649,7 +3659,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3649 return ret; 3659 return ret;
3650} 3660}
3651 3661
3652#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) 3662#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3653void *__kmalloc_node(size_t size, gfp_t flags, int node) 3663void *__kmalloc_node(size_t size, gfp_t flags, int node)
3654{ 3664{
3655 return __do_kmalloc_node(size, flags, node, 3665 return __do_kmalloc_node(size, flags, node,
@@ -3669,7 +3679,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
3669 return __do_kmalloc_node(size, flags, node, NULL); 3679 return __do_kmalloc_node(size, flags, node, NULL);
3670} 3680}
3671EXPORT_SYMBOL(__kmalloc_node); 3681EXPORT_SYMBOL(__kmalloc_node);
3672#endif /* CONFIG_DEBUG_SLAB */ 3682#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
3673#endif /* CONFIG_NUMA */ 3683#endif /* CONFIG_NUMA */
3674 3684
3675/** 3685/**
@@ -3701,7 +3711,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3701} 3711}
3702 3712
3703 3713
3704#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) 3714#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
3705void *__kmalloc(size_t size, gfp_t flags) 3715void *__kmalloc(size_t size, gfp_t flags)
3706{ 3716{
3707 return __do_kmalloc(size, flags, __builtin_return_address(0)); 3717 return __do_kmalloc(size, flags, __builtin_return_address(0));
diff --git a/mm/slub.c b/mm/slub.c
index 4996fc719552..d2a54fe71ea2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -151,7 +151,8 @@
151 * Set of flags that will prevent slab merging 151 * Set of flags that will prevent slab merging
152 */ 152 */
153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
154 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) 154 SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
155 SLAB_FAILSLAB)
155 156
156#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 157#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
157 SLAB_CACHE_DMA | SLAB_NOTRACK) 158 SLAB_CACHE_DMA | SLAB_NOTRACK)
@@ -217,10 +218,10 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
217 218
218#endif 219#endif
219 220
220static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) 221static inline void stat(struct kmem_cache *s, enum stat_item si)
221{ 222{
222#ifdef CONFIG_SLUB_STATS 223#ifdef CONFIG_SLUB_STATS
223 c->stat[si]++; 224 __this_cpu_inc(s->cpu_slab->stat[si]);
224#endif 225#endif
225} 226}
226 227
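
The stat() helper no longer needs a kmem_cache_cpu argument because the statistics now live in a percpu structure reachable from the cache itself, and __this_cpu_inc() resolves the current CPU's copy without an explicit smp_processor_id(). A stand-alone sketch of that idiom; the structure and names are invented for illustration:

#include <linux/percpu.h>

struct sketch_cpu_stats {
	unsigned int stat[16];		/* one slot per stat item */
};

static DEFINE_PER_CPU(struct sketch_cpu_stats, sketch_slub_stats);

static inline void sketch_stat(int si)
{
	/* bumps the current CPU's slot without an explicit cpu lookup */
	__this_cpu_inc(sketch_slub_stats.stat[si]);
}
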
@@ -242,15 +243,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
242#endif 243#endif
243} 244}
244 245
245static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
246{
247#ifdef CONFIG_SMP
248 return s->cpu_slab[cpu];
249#else
250 return &s->cpu_slab;
251#endif
252}
253
254/* Verify that a pointer has an address that is valid within a slab page */ 246/* Verify that a pointer has an address that is valid within a slab page */
255static inline int check_valid_pointer(struct kmem_cache *s, 247static inline int check_valid_pointer(struct kmem_cache *s,
256 struct page *page, const void *object) 248 struct page *page, const void *object)
@@ -269,13 +261,6 @@ static inline int check_valid_pointer(struct kmem_cache *s,
269 return 1; 261 return 1;
270} 262}
271 263
272/*
273 * Slow version of get and set free pointer.
274 *
275 * This version requires touching the cache lines of kmem_cache which
276 * we avoid to do in the fast alloc free paths. There we obtain the offset
277 * from the page struct.
278 */
279static inline void *get_freepointer(struct kmem_cache *s, void *object) 264static inline void *get_freepointer(struct kmem_cache *s, void *object)
280{ 265{
281 return *(void **)(object + s->offset); 266 return *(void **)(object + s->offset);
@@ -1020,6 +1005,9 @@ static int __init setup_slub_debug(char *str)
1020 case 't': 1005 case 't':
1021 slub_debug |= SLAB_TRACE; 1006 slub_debug |= SLAB_TRACE;
1022 break; 1007 break;
1008 case 'a':
1009 slub_debug |= SLAB_FAILSLAB;
1010 break;
1023 default: 1011 default:
1024 printk(KERN_ERR "slub_debug option '%c' " 1012 printk(KERN_ERR "slub_debug option '%c' "
1025 "unknown. skipped\n", *str); 1013 "unknown. skipped\n", *str);
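The new 'a' case above lets the slub_debug= boot option turn SLAB_FAILSLAB on per cache. A small sketch of that letter-to-flag parsing; the flag values and parse_debug_opts() are illustrative, not the kernel's:

    #include <stdio.h>

    #define SLAB_TRACE    0x1UL     /* values are illustrative only */
    #define SLAB_FAILSLAB 0x2UL

    static unsigned long parse_debug_opts(const char *str)
    {
        unsigned long flags = 0;

        for (; *str; str++) {
            switch (*str) {
            case 't':
                flags |= SLAB_TRACE;
                break;
            case 'a':                       /* the newly added option */
                flags |= SLAB_FAILSLAB;
                break;
            default:
                fprintf(stderr, "option '%c' unknown, skipped\n", *str);
            }
        }
        return flags;
    }

    int main(void)
    {
        printf("slub_debug=ta -> %#lx\n", parse_debug_opts("ta"));
        return 0;
    }
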
@@ -1124,7 +1112,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1124 if (!page) 1112 if (!page)
1125 return NULL; 1113 return NULL;
1126 1114
1127 stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); 1115 stat(s, ORDER_FALLBACK);
1128 } 1116 }
1129 1117
1130 if (kmemcheck_enabled 1118 if (kmemcheck_enabled
@@ -1422,23 +1410,22 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1422static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) 1410static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1423{ 1411{
1424 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1412 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1425 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
1426 1413
1427 __ClearPageSlubFrozen(page); 1414 __ClearPageSlubFrozen(page);
1428 if (page->inuse) { 1415 if (page->inuse) {
1429 1416
1430 if (page->freelist) { 1417 if (page->freelist) {
1431 add_partial(n, page, tail); 1418 add_partial(n, page, tail);
1432 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1419 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1433 } else { 1420 } else {
1434 stat(c, DEACTIVATE_FULL); 1421 stat(s, DEACTIVATE_FULL);
1435 if (SLABDEBUG && PageSlubDebug(page) && 1422 if (SLABDEBUG && PageSlubDebug(page) &&
1436 (s->flags & SLAB_STORE_USER)) 1423 (s->flags & SLAB_STORE_USER))
1437 add_full(n, page); 1424 add_full(n, page);
1438 } 1425 }
1439 slab_unlock(page); 1426 slab_unlock(page);
1440 } else { 1427 } else {
1441 stat(c, DEACTIVATE_EMPTY); 1428 stat(s, DEACTIVATE_EMPTY);
1442 if (n->nr_partial < s->min_partial) { 1429 if (n->nr_partial < s->min_partial) {
1443 /* 1430 /*
1444 * Adding an empty slab to the partial slabs in order 1431 * Adding an empty slab to the partial slabs in order
@@ -1454,7 +1441,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1454 slab_unlock(page); 1441 slab_unlock(page);
1455 } else { 1442 } else {
1456 slab_unlock(page); 1443 slab_unlock(page);
1457 stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); 1444 stat(s, FREE_SLAB);
1458 discard_slab(s, page); 1445 discard_slab(s, page);
1459 } 1446 }
1460 } 1447 }
@@ -1469,7 +1456,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1469 int tail = 1; 1456 int tail = 1;
1470 1457
1471 if (page->freelist) 1458 if (page->freelist)
1472 stat(c, DEACTIVATE_REMOTE_FREES); 1459 stat(s, DEACTIVATE_REMOTE_FREES);
1473 /* 1460 /*
1474 * Merge cpu freelist into slab freelist. Typically we get here 1461 * Merge cpu freelist into slab freelist. Typically we get here
1475 * because both freelists are empty. So this is unlikely 1462 * because both freelists are empty. So this is unlikely
@@ -1482,10 +1469,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1482 1469
1483 /* Retrieve object from cpu_freelist */ 1470 /* Retrieve object from cpu_freelist */
1484 object = c->freelist; 1471 object = c->freelist;
1485 c->freelist = c->freelist[c->offset]; 1472 c->freelist = get_freepointer(s, c->freelist);
1486 1473
1487 /* And put onto the regular freelist */ 1474 /* And put onto the regular freelist */
1488 object[c->offset] = page->freelist; 1475 set_freepointer(s, object, page->freelist);
1489 page->freelist = object; 1476 page->freelist = object;
1490 page->inuse--; 1477 page->inuse--;
1491 } 1478 }
@@ -1495,7 +1482,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1495 1482
1496static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1483static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1497{ 1484{
1498 stat(c, CPUSLAB_FLUSH); 1485 stat(s, CPUSLAB_FLUSH);
1499 slab_lock(c->page); 1486 slab_lock(c->page);
1500 deactivate_slab(s, c); 1487 deactivate_slab(s, c);
1501} 1488}
@@ -1507,7 +1494,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1507 */ 1494 */
1508static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 1495static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1509{ 1496{
1510 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 1497 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
1511 1498
1512 if (likely(c && c->page)) 1499 if (likely(c && c->page))
1513 flush_slab(s, c); 1500 flush_slab(s, c);
@@ -1635,7 +1622,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1635 if (unlikely(!node_match(c, node))) 1622 if (unlikely(!node_match(c, node)))
1636 goto another_slab; 1623 goto another_slab;
1637 1624
1638 stat(c, ALLOC_REFILL); 1625 stat(s, ALLOC_REFILL);
1639 1626
1640load_freelist: 1627load_freelist:
1641 object = c->page->freelist; 1628 object = c->page->freelist;
@@ -1644,13 +1631,13 @@ load_freelist:
1644 if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) 1631 if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
1645 goto debug; 1632 goto debug;
1646 1633
1647 c->freelist = object[c->offset]; 1634 c->freelist = get_freepointer(s, object);
1648 c->page->inuse = c->page->objects; 1635 c->page->inuse = c->page->objects;
1649 c->page->freelist = NULL; 1636 c->page->freelist = NULL;
1650 c->node = page_to_nid(c->page); 1637 c->node = page_to_nid(c->page);
1651unlock_out: 1638unlock_out:
1652 slab_unlock(c->page); 1639 slab_unlock(c->page);
1653 stat(c, ALLOC_SLOWPATH); 1640 stat(s, ALLOC_SLOWPATH);
1654 return object; 1641 return object;
1655 1642
1656another_slab: 1643another_slab:
@@ -1660,7 +1647,7 @@ new_slab:
1660 new = get_partial(s, gfpflags, node); 1647 new = get_partial(s, gfpflags, node);
1661 if (new) { 1648 if (new) {
1662 c->page = new; 1649 c->page = new;
1663 stat(c, ALLOC_FROM_PARTIAL); 1650 stat(s, ALLOC_FROM_PARTIAL);
1664 goto load_freelist; 1651 goto load_freelist;
1665 } 1652 }
1666 1653
@@ -1673,8 +1660,8 @@ new_slab:
1673 local_irq_disable(); 1660 local_irq_disable();
1674 1661
1675 if (new) { 1662 if (new) {
1676 c = get_cpu_slab(s, smp_processor_id()); 1663 c = __this_cpu_ptr(s->cpu_slab);
1677 stat(c, ALLOC_SLAB); 1664 stat(s, ALLOC_SLAB);
1678 if (c->page) 1665 if (c->page)
1679 flush_slab(s, c); 1666 flush_slab(s, c);
1680 slab_lock(new); 1667 slab_lock(new);
@@ -1690,7 +1677,7 @@ debug:
1690 goto another_slab; 1677 goto another_slab;
1691 1678
1692 c->page->inuse++; 1679 c->page->inuse++;
1693 c->page->freelist = object[c->offset]; 1680 c->page->freelist = get_freepointer(s, object);
1694 c->node = -1; 1681 c->node = -1;
1695 goto unlock_out; 1682 goto unlock_out;
1696} 1683}
@@ -1711,35 +1698,33 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1711 void **object; 1698 void **object;
1712 struct kmem_cache_cpu *c; 1699 struct kmem_cache_cpu *c;
1713 unsigned long flags; 1700 unsigned long flags;
1714 unsigned int objsize;
1715 1701
1716 gfpflags &= gfp_allowed_mask; 1702 gfpflags &= gfp_allowed_mask;
1717 1703
1718 lockdep_trace_alloc(gfpflags); 1704 lockdep_trace_alloc(gfpflags);
1719 might_sleep_if(gfpflags & __GFP_WAIT); 1705 might_sleep_if(gfpflags & __GFP_WAIT);
1720 1706
1721 if (should_failslab(s->objsize, gfpflags)) 1707 if (should_failslab(s->objsize, gfpflags, s->flags))
1722 return NULL; 1708 return NULL;
1723 1709
1724 local_irq_save(flags); 1710 local_irq_save(flags);
1725 c = get_cpu_slab(s, smp_processor_id()); 1711 c = __this_cpu_ptr(s->cpu_slab);
1726 objsize = c->objsize; 1712 object = c->freelist;
1727 if (unlikely(!c->freelist || !node_match(c, node))) 1713 if (unlikely(!object || !node_match(c, node)))
1728 1714
1729 object = __slab_alloc(s, gfpflags, node, addr, c); 1715 object = __slab_alloc(s, gfpflags, node, addr, c);
1730 1716
1731 else { 1717 else {
1732 object = c->freelist; 1718 c->freelist = get_freepointer(s, object);
1733 c->freelist = object[c->offset]; 1719 stat(s, ALLOC_FASTPATH);
1734 stat(c, ALLOC_FASTPATH);
1735 } 1720 }
1736 local_irq_restore(flags); 1721 local_irq_restore(flags);
1737 1722
1738 if (unlikely((gfpflags & __GFP_ZERO) && object)) 1723 if (unlikely(gfpflags & __GFP_ZERO) && object)
1739 memset(object, 0, objsize); 1724 memset(object, 0, s->objsize);
1740 1725
1741 kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); 1726 kmemcheck_slab_alloc(s, gfpflags, object, s->objsize);
1742 kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); 1727 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags);
1743 1728
1744 return object; 1729 return object;
1745} 1730}
@@ -1754,7 +1739,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1754} 1739}
1755EXPORT_SYMBOL(kmem_cache_alloc); 1740EXPORT_SYMBOL(kmem_cache_alloc);
1756 1741
1757#ifdef CONFIG_KMEMTRACE 1742#ifdef CONFIG_TRACING
1758void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) 1743void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
1759{ 1744{
1760 return slab_alloc(s, gfpflags, -1, _RET_IP_); 1745 return slab_alloc(s, gfpflags, -1, _RET_IP_);
@@ -1775,7 +1760,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1775EXPORT_SYMBOL(kmem_cache_alloc_node); 1760EXPORT_SYMBOL(kmem_cache_alloc_node);
1776#endif 1761#endif
1777 1762
1778#ifdef CONFIG_KMEMTRACE 1763#ifdef CONFIG_TRACING
1779void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, 1764void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
1780 gfp_t gfpflags, 1765 gfp_t gfpflags,
1781 int node) 1766 int node)
@@ -1794,26 +1779,25 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
1794 * handling required then we can return immediately. 1779 * handling required then we can return immediately.
1795 */ 1780 */
1796static void __slab_free(struct kmem_cache *s, struct page *page, 1781static void __slab_free(struct kmem_cache *s, struct page *page,
1797 void *x, unsigned long addr, unsigned int offset) 1782 void *x, unsigned long addr)
1798{ 1783{
1799 void *prior; 1784 void *prior;
1800 void **object = (void *)x; 1785 void **object = (void *)x;
1801 struct kmem_cache_cpu *c;
1802 1786
1803 c = get_cpu_slab(s, raw_smp_processor_id()); 1787 stat(s, FREE_SLOWPATH);
1804 stat(c, FREE_SLOWPATH);
1805 slab_lock(page); 1788 slab_lock(page);
1806 1789
1807 if (unlikely(SLABDEBUG && PageSlubDebug(page))) 1790 if (unlikely(SLABDEBUG && PageSlubDebug(page)))
1808 goto debug; 1791 goto debug;
1809 1792
1810checks_ok: 1793checks_ok:
1811 prior = object[offset] = page->freelist; 1794 prior = page->freelist;
1795 set_freepointer(s, object, prior);
1812 page->freelist = object; 1796 page->freelist = object;
1813 page->inuse--; 1797 page->inuse--;
1814 1798
1815 if (unlikely(PageSlubFrozen(page))) { 1799 if (unlikely(PageSlubFrozen(page))) {
1816 stat(c, FREE_FROZEN); 1800 stat(s, FREE_FROZEN);
1817 goto out_unlock; 1801 goto out_unlock;
1818 } 1802 }
1819 1803
@@ -1826,7 +1810,7 @@ checks_ok:
1826 */ 1810 */
1827 if (unlikely(!prior)) { 1811 if (unlikely(!prior)) {
1828 add_partial(get_node(s, page_to_nid(page)), page, 1); 1812 add_partial(get_node(s, page_to_nid(page)), page, 1);
1829 stat(c, FREE_ADD_PARTIAL); 1813 stat(s, FREE_ADD_PARTIAL);
1830 } 1814 }
1831 1815
1832out_unlock: 1816out_unlock:
@@ -1839,10 +1823,10 @@ slab_empty:
1839 * Slab still on the partial list. 1823 * Slab still on the partial list.
1840 */ 1824 */
1841 remove_partial(s, page); 1825 remove_partial(s, page);
1842 stat(c, FREE_REMOVE_PARTIAL); 1826 stat(s, FREE_REMOVE_PARTIAL);
1843 } 1827 }
1844 slab_unlock(page); 1828 slab_unlock(page);
1845 stat(c, FREE_SLAB); 1829 stat(s, FREE_SLAB);
1846 discard_slab(s, page); 1830 discard_slab(s, page);
1847 return; 1831 return;
1848 1832
@@ -1872,17 +1856,17 @@ static __always_inline void slab_free(struct kmem_cache *s,
1872 1856
1873 kmemleak_free_recursive(x, s->flags); 1857 kmemleak_free_recursive(x, s->flags);
1874 local_irq_save(flags); 1858 local_irq_save(flags);
1875 c = get_cpu_slab(s, smp_processor_id()); 1859 c = __this_cpu_ptr(s->cpu_slab);
1876 kmemcheck_slab_free(s, object, c->objsize); 1860 kmemcheck_slab_free(s, object, s->objsize);
1877 debug_check_no_locks_freed(object, c->objsize); 1861 debug_check_no_locks_freed(object, s->objsize);
1878 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1862 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1879 debug_check_no_obj_freed(object, c->objsize); 1863 debug_check_no_obj_freed(object, s->objsize);
1880 if (likely(page == c->page && c->node >= 0)) { 1864 if (likely(page == c->page && c->node >= 0)) {
1881 object[c->offset] = c->freelist; 1865 set_freepointer(s, object, c->freelist);
1882 c->freelist = object; 1866 c->freelist = object;
1883 stat(c, FREE_FASTPATH); 1867 stat(s, FREE_FASTPATH);
1884 } else 1868 } else
1885 __slab_free(s, page, x, addr, c->offset); 1869 __slab_free(s, page, x, addr);
1886 1870
1887 local_irq_restore(flags); 1871 local_irq_restore(flags);
1888} 1872}
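Both fast paths above now go through get_freepointer()/set_freepointer(): the pointer to the next free object lives at s->offset inside each free object, so the per-CPU copies of offset and objsize can go away. A self-contained sketch of that freelist threading; fake_cache, fast_alloc() and fast_free() are illustrative, and the slow path is reduced to returning NULL:

    #include <stdio.h>
    #include <stdlib.h>

    struct fake_cache {
        size_t size;        /* object size */
        size_t offset;      /* where the next-free pointer lives in a free object */
        void  *freelist;
    };

    static void *get_fp(struct fake_cache *s, void *object)
    {
        return *(void **)((char *)object + s->offset);
    }

    static void set_fp(struct fake_cache *s, void *object, void *fp)
    {
        *(void **)((char *)object + s->offset) = fp;
    }

    static void *fast_alloc(struct fake_cache *s)
    {
        void *object = s->freelist;

        if (!object)
            return NULL;                    /* the real code falls back to __slab_alloc() */
        s->freelist = get_fp(s, object);    /* pop: c->freelist = get_freepointer(s, object) */
        return object;
    }

    static void fast_free(struct fake_cache *s, void *object)
    {
        set_fp(s, object, s->freelist);     /* push: set_freepointer(s, object, c->freelist) */
        s->freelist = object;
    }

    int main(void)
    {
        struct fake_cache s = { .size = 64, .offset = 0, .freelist = NULL };
        void *a = malloc(s.size), *b = malloc(s.size);

        fast_free(&s, a);
        fast_free(&s, b);
        printf("alloc -> %p (expect %p)\n", fast_alloc(&s), b);
        printf("alloc -> %p (expect %p)\n", fast_alloc(&s), a);
        free(a);
        free(b);
        return 0;
    }
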
@@ -2069,19 +2053,6 @@ static unsigned long calculate_alignment(unsigned long flags,
2069 return ALIGN(align, sizeof(void *)); 2053 return ALIGN(align, sizeof(void *));
2070} 2054}
2071 2055
2072static void init_kmem_cache_cpu(struct kmem_cache *s,
2073 struct kmem_cache_cpu *c)
2074{
2075 c->page = NULL;
2076 c->freelist = NULL;
2077 c->node = 0;
2078 c->offset = s->offset / sizeof(void *);
2079 c->objsize = s->objsize;
2080#ifdef CONFIG_SLUB_STATS
2081 memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned));
2082#endif
2083}
2084
2085static void 2056static void
2086init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) 2057init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2087{ 2058{
@@ -2095,130 +2066,24 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2095#endif 2066#endif
2096} 2067}
2097 2068
2098#ifdef CONFIG_SMP 2069static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]);
2099/*
2100 * Per cpu array for per cpu structures.
2101 *
2102 * The per cpu array places all kmem_cache_cpu structures from one processor
2103 * close together meaning that it becomes possible that multiple per cpu
2104 * structures are contained in one cacheline. This may be particularly
2105 * beneficial for the kmalloc caches.
2106 *
2107 * A desktop system typically has around 60-80 slabs. With 100 here we are
2108 * likely able to get per cpu structures for all caches from the array defined
2109 * here. We must be able to cover all kmalloc caches during bootstrap.
2110 *
2111 * If the per cpu array is exhausted then fall back to kmalloc
2112 * of individual cachelines. No sharing is possible then.
2113 */
2114#define NR_KMEM_CACHE_CPU 100
2115
2116static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
2117 kmem_cache_cpu);
2118
2119static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
2120static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
2121
2122static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
2123 int cpu, gfp_t flags)
2124{
2125 struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
2126
2127 if (c)
2128 per_cpu(kmem_cache_cpu_free, cpu) =
2129 (void *)c->freelist;
2130 else {
2131 /* Table overflow: So allocate ourselves */
2132 c = kmalloc_node(
2133 ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
2134 flags, cpu_to_node(cpu));
2135 if (!c)
2136 return NULL;
2137 }
2138
2139 init_kmem_cache_cpu(s, c);
2140 return c;
2141}
2142
2143static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
2144{
2145 if (c < per_cpu(kmem_cache_cpu, cpu) ||
2146 c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
2147 kfree(c);
2148 return;
2149 }
2150 c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
2151 per_cpu(kmem_cache_cpu_free, cpu) = c;
2152}
2153
2154static void free_kmem_cache_cpus(struct kmem_cache *s)
2155{
2156 int cpu;
2157
2158 for_each_online_cpu(cpu) {
2159 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2160
2161 if (c) {
2162 s->cpu_slab[cpu] = NULL;
2163 free_kmem_cache_cpu(c, cpu);
2164 }
2165 }
2166}
2167
2168static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2169{
2170 int cpu;
2171
2172 for_each_online_cpu(cpu) {
2173 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2174
2175 if (c)
2176 continue;
2177
2178 c = alloc_kmem_cache_cpu(s, cpu, flags);
2179 if (!c) {
2180 free_kmem_cache_cpus(s);
2181 return 0;
2182 }
2183 s->cpu_slab[cpu] = c;
2184 }
2185 return 1;
2186}
2187
2188/*
2189 * Initialize the per cpu array.
2190 */
2191static void init_alloc_cpu_cpu(int cpu)
2192{
2193 int i;
2194
2195 if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)))
2196 return;
2197
2198 for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
2199 free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
2200
2201 cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once));
2202}
2203 2070
2204static void __init init_alloc_cpu(void) 2071static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2205{ 2072{
2206 int cpu; 2073 if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches)
2207 2074 /*
2208 for_each_online_cpu(cpu) 2075 * Boot time creation of the kmalloc array. Use static per cpu data
2209 init_alloc_cpu_cpu(cpu); 2076 * since the per cpu allocator is not available yet.
2210 } 2077 */
2078 s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches);
2079 else
2080 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2211 2081
2212#else 2082 if (!s->cpu_slab)
2213static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} 2083 return 0;
2214static inline void init_alloc_cpu(void) {}
2215 2084
2216static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2217{
2218 init_kmem_cache_cpu(s, &s->cpu_slab);
2219 return 1; 2085 return 1;
2220} 2086}
2221#endif
2222 2087
2223#ifdef CONFIG_NUMA 2088#ifdef CONFIG_NUMA
2224/* 2089/*
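The rewritten alloc_kmem_cache_cpus() above is left with two cases: boot-time kmalloc caches take their kmem_cache_cpu area out of the static kmalloc_percpu[] array (the per-CPU allocator is not up that early), and everything created later uses alloc_percpu(). A userspace model of that decision, with calloc() standing in for alloc_percpu(); the struct names and KMALLOC_CACHES value here are illustrative:

    #include <stdio.h>
    #include <stdlib.h>

    #define KMALLOC_CACHES 16

    struct cpu_state { void *freelist; };       /* stands in for struct kmem_cache_cpu */

    static struct fake_cache {
        struct cpu_state *cpu_slab;
    } kmalloc_caches[KMALLOC_CACHES];

    static struct cpu_state static_percpu[KMALLOC_CACHES];    /* ~ kmalloc_percpu[] */

    static int alloc_cpu_state(struct fake_cache *s)
    {
        if (s >= kmalloc_caches && s < kmalloc_caches + KMALLOC_CACHES)
            s->cpu_slab = static_percpu + (s - kmalloc_caches);   /* boot-time cache */
        else
            s->cpu_slab = calloc(1, sizeof(*s->cpu_slab));        /* ~ alloc_percpu() */

        return s->cpu_slab != NULL;
    }

    int main(void)
    {
        struct fake_cache *later = calloc(1, sizeof(*later));

        printf("boot cache ok:  %d\n", alloc_cpu_state(&kmalloc_caches[3]));
        printf("later cache ok: %d\n", alloc_cpu_state(later));
        free(later->cpu_slab);
        free(later);
        return 0;
    }
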
@@ -2287,7 +2152,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2287 int node; 2152 int node;
2288 int local_node; 2153 int local_node;
2289 2154
2290 if (slab_state >= UP) 2155 if (slab_state >= UP && (s < kmalloc_caches ||
2156 s >= kmalloc_caches + KMALLOC_CACHES))
2291 local_node = page_to_nid(virt_to_page(s)); 2157 local_node = page_to_nid(virt_to_page(s));
2292 else 2158 else
2293 local_node = 0; 2159 local_node = 0;
@@ -2502,6 +2368,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2502 2368
2503 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) 2369 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
2504 return 1; 2370 return 1;
2371
2505 free_kmem_cache_nodes(s); 2372 free_kmem_cache_nodes(s);
2506error: 2373error:
2507 if (flags & SLAB_PANIC) 2374 if (flags & SLAB_PANIC)
@@ -2519,6 +2386,9 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object)
2519{ 2386{
2520 struct page *page; 2387 struct page *page;
2521 2388
2389 if (!kern_ptr_validate(object, s->size))
2390 return 0;
2391
2522 page = get_object_page(object); 2392 page = get_object_page(object);
2523 2393
2524 if (!page || s != page->slab) 2394 if (!page || s != page->slab)
@@ -2609,9 +2479,8 @@ static inline int kmem_cache_close(struct kmem_cache *s)
2609 int node; 2479 int node;
2610 2480
2611 flush_all(s); 2481 flush_all(s);
2612 2482 free_percpu(s->cpu_slab);
2613 /* Attempt to free all objects */ 2483 /* Attempt to free all objects */
2614 free_kmem_cache_cpus(s);
2615 for_each_node_state(node, N_NORMAL_MEMORY) { 2484 for_each_node_state(node, N_NORMAL_MEMORY) {
2616 struct kmem_cache_node *n = get_node(s, node); 2485 struct kmem_cache_node *n = get_node(s, node);
2617 2486
@@ -2651,7 +2520,7 @@ EXPORT_SYMBOL(kmem_cache_destroy);
2651 * Kmalloc subsystem 2520 * Kmalloc subsystem
2652 *******************************************************************/ 2521 *******************************************************************/
2653 2522
2654struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; 2523struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned;
2655EXPORT_SYMBOL(kmalloc_caches); 2524EXPORT_SYMBOL(kmalloc_caches);
2656 2525
2657static int __init setup_slub_min_order(char *str) 2526static int __init setup_slub_min_order(char *str)
@@ -2741,6 +2610,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2741 char *text; 2610 char *text;
2742 size_t realsize; 2611 size_t realsize;
2743 unsigned long slabflags; 2612 unsigned long slabflags;
2613 int i;
2744 2614
2745 s = kmalloc_caches_dma[index]; 2615 s = kmalloc_caches_dma[index];
2746 if (s) 2616 if (s)
@@ -2760,7 +2630,14 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2760 realsize = kmalloc_caches[index].objsize; 2630 realsize = kmalloc_caches[index].objsize;
2761 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", 2631 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2762 (unsigned int)realsize); 2632 (unsigned int)realsize);
2763 s = kmalloc(kmem_size, flags & ~SLUB_DMA); 2633
2634 s = NULL;
2635 for (i = 0; i < KMALLOC_CACHES; i++)
2636 if (!kmalloc_caches[i].size)
2637 break;
2638
2639 BUG_ON(i >= KMALLOC_CACHES);
2640 s = kmalloc_caches + i;
2764 2641
2765 /* 2642 /*
2766 * Must defer sysfs creation to a workqueue because we don't know 2643 * Must defer sysfs creation to a workqueue because we don't know
@@ -2772,9 +2649,9 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2772 if (slab_state >= SYSFS) 2649 if (slab_state >= SYSFS)
2773 slabflags |= __SYSFS_ADD_DEFERRED; 2650 slabflags |= __SYSFS_ADD_DEFERRED;
2774 2651
2775 if (!s || !text || !kmem_cache_open(s, flags, text, 2652 if (!text || !kmem_cache_open(s, flags, text,
2776 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { 2653 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
2777 kfree(s); 2654 s->size = 0;
2778 kfree(text); 2655 kfree(text);
2779 goto unlock_out; 2656 goto unlock_out;
2780 } 2657 }
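dma_kmalloc_cache() above no longer kmalloc()s a fresh struct kmem_cache; it claims an unused slot (size == 0) from the enlarged static kmalloc_caches[KMALLOC_CACHES] array, since allocating the cache structure dynamically is awkward at this point. A sketch of that slot scan; the struct, sizes and grab_free_slot() are illustrative:

    #include <stdio.h>

    #define KMALLOC_CACHES 16

    struct fake_cache { unsigned size; };

    static struct fake_cache kmalloc_caches[KMALLOC_CACHES] = {
        { 8 }, { 16 }, { 32 },              /* slots already in use at boot */
    };

    static struct fake_cache *grab_free_slot(void)
    {
        for (int i = 0; i < KMALLOC_CACHES; i++)
            if (!kmalloc_caches[i].size)
                return &kmalloc_caches[i];  /* first unused slot */
        return NULL;                        /* the real code BUG()s instead */
    }

    int main(void)
    {
        struct fake_cache *s = grab_free_slot();

        if (s) {
            printf("free slot index: %ld\n", (long)(s - kmalloc_caches));
            s->size = 64;                   /* mark it used, as kmem_cache_open() would */
        }
        return 0;
    }
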
@@ -3086,7 +2963,7 @@ static void slab_mem_offline_callback(void *arg)
3086 /* 2963 /*
3087 * if n->nr_slabs > 0, slabs still exist on the node 2964 * if n->nr_slabs > 0, slabs still exist on the node
3088 * that is going down. We were unable to free them, 2965 * that is going down. We were unable to free them,
3089 * and offline_pages() function shoudn't call this 2966 * and offline_pages() function shouldn't call this
3090 * callback. So, we must fail. 2967 * callback. So, we must fail.
3091 */ 2968 */
3092 BUG_ON(slabs_node(s, offline_node)); 2969 BUG_ON(slabs_node(s, offline_node));
@@ -3176,8 +3053,6 @@ void __init kmem_cache_init(void)
3176 int i; 3053 int i;
3177 int caches = 0; 3054 int caches = 0;
3178 3055
3179 init_alloc_cpu();
3180
3181#ifdef CONFIG_NUMA 3056#ifdef CONFIG_NUMA
3182 /* 3057 /*
3183 * Must first have the slab cache available for the allocations of the 3058 * Must first have the slab cache available for the allocations of the
@@ -3261,8 +3136,10 @@ void __init kmem_cache_init(void)
3261 3136
3262#ifdef CONFIG_SMP 3137#ifdef CONFIG_SMP
3263 register_cpu_notifier(&slab_notifier); 3138 register_cpu_notifier(&slab_notifier);
3264 kmem_size = offsetof(struct kmem_cache, cpu_slab) + 3139#endif
3265 nr_cpu_ids * sizeof(struct kmem_cache_cpu *); 3140#ifdef CONFIG_NUMA
3141 kmem_size = offsetof(struct kmem_cache, node) +
3142 nr_node_ids * sizeof(struct kmem_cache_node *);
3266#else 3143#else
3267 kmem_size = sizeof(struct kmem_cache); 3144 kmem_size = sizeof(struct kmem_cache);
3268#endif 3145#endif
@@ -3351,22 +3228,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3351 down_write(&slub_lock); 3228 down_write(&slub_lock);
3352 s = find_mergeable(size, align, flags, name, ctor); 3229 s = find_mergeable(size, align, flags, name, ctor);
3353 if (s) { 3230 if (s) {
3354 int cpu;
3355
3356 s->refcount++; 3231 s->refcount++;
3357 /* 3232 /*
3358 * Adjust the object sizes so that we clear 3233 * Adjust the object sizes so that we clear
3359 * the complete object on kzalloc. 3234 * the complete object on kzalloc.
3360 */ 3235 */
3361 s->objsize = max(s->objsize, (int)size); 3236 s->objsize = max(s->objsize, (int)size);
3362
3363 /*
3364 * And then we need to update the object size in the
3365 * per cpu structures
3366 */
3367 for_each_online_cpu(cpu)
3368 get_cpu_slab(s, cpu)->objsize = s->objsize;
3369
3370 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3237 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3371 up_write(&slub_lock); 3238 up_write(&slub_lock);
3372 3239
@@ -3420,29 +3287,15 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3420 unsigned long flags; 3287 unsigned long flags;
3421 3288
3422 switch (action) { 3289 switch (action) {
3423 case CPU_UP_PREPARE:
3424 case CPU_UP_PREPARE_FROZEN:
3425 init_alloc_cpu_cpu(cpu);
3426 down_read(&slub_lock);
3427 list_for_each_entry(s, &slab_caches, list)
3428 s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
3429 GFP_KERNEL);
3430 up_read(&slub_lock);
3431 break;
3432
3433 case CPU_UP_CANCELED: 3290 case CPU_UP_CANCELED:
3434 case CPU_UP_CANCELED_FROZEN: 3291 case CPU_UP_CANCELED_FROZEN:
3435 case CPU_DEAD: 3292 case CPU_DEAD:
3436 case CPU_DEAD_FROZEN: 3293 case CPU_DEAD_FROZEN:
3437 down_read(&slub_lock); 3294 down_read(&slub_lock);
3438 list_for_each_entry(s, &slab_caches, list) { 3295 list_for_each_entry(s, &slab_caches, list) {
3439 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3440
3441 local_irq_save(flags); 3296 local_irq_save(flags);
3442 __flush_cpu_slab(s, cpu); 3297 __flush_cpu_slab(s, cpu);
3443 local_irq_restore(flags); 3298 local_irq_restore(flags);
3444 free_kmem_cache_cpu(c, cpu);
3445 s->cpu_slab[cpu] = NULL;
3446 } 3299 }
3447 up_read(&slub_lock); 3300 up_read(&slub_lock);
3448 break; 3301 break;
@@ -3928,7 +3781,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3928 int cpu; 3781 int cpu;
3929 3782
3930 for_each_possible_cpu(cpu) { 3783 for_each_possible_cpu(cpu) {
3931 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 3784 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
3932 3785
3933 if (!c || c->node < 0) 3786 if (!c || c->node < 0)
3934 continue; 3787 continue;
@@ -4171,6 +4024,23 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4171} 4024}
4172SLAB_ATTR(trace); 4025SLAB_ATTR(trace);
4173 4026
4027#ifdef CONFIG_FAILSLAB
4028static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4029{
4030 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
4031}
4032
4033static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4034 size_t length)
4035{
4036 s->flags &= ~SLAB_FAILSLAB;
4037 if (buf[0] == '1')
4038 s->flags |= SLAB_FAILSLAB;
4039 return length;
4040}
4041SLAB_ATTR(failslab);
4042#endif
4043
4174static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4044static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
4175{ 4045{
4176 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4046 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
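The new failslab attribute above makes SLAB_FAILSLAB visible and writable in sysfs, and (together with the should_failslab() signature change earlier in this diff) lets fault injection be restricted to caches that opted in. A rough userspace model of that gating; the flag value, the 25% failure policy and should_fail_alloc() are illustrative assumptions, not kernel behaviour:

    #include <stdio.h>
    #include <stdlib.h>

    #define SLAB_FAILSLAB 0x1UL             /* illustrative value */

    static int failslab_enabled = 1;        /* global switch, like a failslab boot knob */

    static int should_fail_alloc(unsigned long cache_flags)
    {
        if (!failslab_enabled)
            return 0;
        if (!(cache_flags & SLAB_FAILSLAB))
            return 0;                       /* this cache did not opt in */
        return rand() % 4 == 0;             /* fail ~25% of the time, demo policy only */
    }

    int main(void)
    {
        unsigned long flags = 0;
        int failures = 0;

        flags |= SLAB_FAILSLAB;             /* what writing "1" to the failslab file does */
        for (int i = 0; i < 100; i++)
            failures += should_fail_alloc(flags);
        printf("injected %d failures out of 100 attempts\n", failures);
        return 0;
    }
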
@@ -4353,7 +4223,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
4353 return -ENOMEM; 4223 return -ENOMEM;
4354 4224
4355 for_each_online_cpu(cpu) { 4225 for_each_online_cpu(cpu) {
4356 unsigned x = get_cpu_slab(s, cpu)->stat[si]; 4226 unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
4357 4227
4358 data[cpu] = x; 4228 data[cpu] = x;
4359 sum += x; 4229 sum += x;
@@ -4371,12 +4241,28 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
4371 return len + sprintf(buf + len, "\n"); 4241 return len + sprintf(buf + len, "\n");
4372} 4242}
4373 4243
4244static void clear_stat(struct kmem_cache *s, enum stat_item si)
4245{
4246 int cpu;
4247
4248 for_each_online_cpu(cpu)
4249 per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
4250}
4251
4374#define STAT_ATTR(si, text) \ 4252#define STAT_ATTR(si, text) \
4375static ssize_t text##_show(struct kmem_cache *s, char *buf) \ 4253static ssize_t text##_show(struct kmem_cache *s, char *buf) \
4376{ \ 4254{ \
4377 return show_stat(s, buf, si); \ 4255 return show_stat(s, buf, si); \
4378} \ 4256} \
4379SLAB_ATTR_RO(text); \ 4257static ssize_t text##_store(struct kmem_cache *s, \
4258 const char *buf, size_t length) \
4259{ \
4260 if (buf[0] != '0') \
4261 return -EINVAL; \
4262 clear_stat(s, si); \
4263 return length; \
4264} \
4265SLAB_ATTR(text); \
4380 4266
4381STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); 4267STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
4382STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); 4268STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
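STAT_ATTR() above now also generates a _store handler, so every per-CPU statistics file can be reset: only "0" is accepted, and it clears the counter on each CPU via clear_stat(). A compact model of that generated show/clear pair; NR_CPUS, the counter array and the return conventions are illustrative:

    #include <stdio.h>

    #define NR_CPUS 4

    static unsigned alloc_fastpath[NR_CPUS] = { 3, 1, 4, 1 };   /* one counter per CPU */

    #define STAT_ATTR(counter)                                      \
    static int counter##_show(char *buf, size_t len)                \
    {                                                               \
        unsigned sum = 0;                                           \
        for (int cpu = 0; cpu < NR_CPUS; cpu++)                     \
            sum += counter[cpu];                                    \
        return snprintf(buf, len, "%u\n", sum);                     \
    }                                                               \
    static int counter##_store(const char *buf)                     \
    {                                                               \
        if (buf[0] != '0')                                          \
            return -1;              /* only clearing is allowed */  \
        for (int cpu = 0; cpu < NR_CPUS; cpu++)                     \
            counter[cpu] = 0;                                       \
        return 0;                                                   \
    }

    STAT_ATTR(alloc_fastpath)

    int main(void)
    {
        char buf[32];

        alloc_fastpath_show(buf, sizeof(buf));
        printf("before: %s", buf);
        alloc_fastpath_store("0");
        alloc_fastpath_show(buf, sizeof(buf));
        printf("after:  %s", buf);
        return 0;
    }
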
@@ -4451,6 +4337,10 @@ static struct attribute *slab_attrs[] = {
4451 &deactivate_remote_frees_attr.attr, 4337 &deactivate_remote_frees_attr.attr,
4452 &order_fallback_attr.attr, 4338 &order_fallback_attr.attr,
4453#endif 4339#endif
4340#ifdef CONFIG_FAILSLAB
4341 &failslab_attr.attr,
4342#endif
4343
4454 NULL 4344 NULL
4455}; 4345};
4456 4346
@@ -4503,7 +4393,7 @@ static void kmem_cache_release(struct kobject *kobj)
4503 kfree(s); 4393 kfree(s);
4504} 4394}
4505 4395
4506static struct sysfs_ops slab_sysfs_ops = { 4396static const struct sysfs_ops slab_sysfs_ops = {
4507 .show = slab_attr_show, 4397 .show = slab_attr_show,
4508 .store = slab_attr_store, 4398 .store = slab_attr_store,
4509}; 4399};
@@ -4522,7 +4412,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj)
4522 return 0; 4412 return 0;
4523} 4413}
4524 4414
4525static struct kset_uevent_ops slab_uevent_ops = { 4415static const struct kset_uevent_ops slab_uevent_ops = {
4526 .filter = uevent_filter, 4416 .filter = uevent_filter,
4527}; 4417};
4528 4418
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index d9714bdcb4a3..aa33fd67fa41 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -22,6 +22,7 @@
22#include <linux/bootmem.h> 22#include <linux/bootmem.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/slab.h>
25#include <linux/spinlock.h> 26#include <linux/spinlock.h>
26#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
27#include <linux/sched.h> 28#include <linux/sched.h>
@@ -40,9 +41,11 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
40 unsigned long align, 41 unsigned long align,
41 unsigned long goal) 42 unsigned long goal)
42{ 43{
43 return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); 44 return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
44} 45}
45 46
47static void *vmemmap_buf;
48static void *vmemmap_buf_end;
46 49
47void * __meminit vmemmap_alloc_block(unsigned long size, int node) 50void * __meminit vmemmap_alloc_block(unsigned long size, int node)
48{ 51{
@@ -64,6 +67,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
64 __pa(MAX_DMA_ADDRESS)); 67 __pa(MAX_DMA_ADDRESS));
65} 68}
66 69
70/* need to make sure size is all the same during early stage */
71void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
72{
73 void *ptr;
74
75 if (!vmemmap_buf)
76 return vmemmap_alloc_block(size, node);
77
78		/* take it from the buf */
79 ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size);
80 if (ptr + size > vmemmap_buf_end)
81 return vmemmap_alloc_block(size, node);
82
83 vmemmap_buf = ptr + size;
84
85 return ptr;
86}
87
67void __meminit vmemmap_verify(pte_t *pte, int node, 88void __meminit vmemmap_verify(pte_t *pte, int node,
68 unsigned long start, unsigned long end) 89 unsigned long start, unsigned long end)
69{ 90{
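vmemmap_alloc_block_buf() above is a bump allocator: page-table backing blocks are carved out of one large buffer reserved up front for the node, and once the buffer is exhausted the code falls back to the ordinary vmemmap_alloc_block(). A userspace sketch of the pattern, assuming size is a power of two (as PAGE_SIZE is); the buffer size and malloc() as the fallback are illustrative:

    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096UL

    static char *buf;           /* ~ vmemmap_buf */
    static char *buf_end;       /* ~ vmemmap_buf_end */

    static void *alloc_block_buf(size_t size)
    {
        char *ptr;

        if (!buf)
            return malloc(size);            /* no buffer was set up */

        /* ALIGN(buf, size); only valid because size is a power of two */
        ptr = (char *)(((uintptr_t)buf + size - 1) & ~(uintptr_t)(size - 1));
        if (ptr + size > buf_end)
            return malloc(size);            /* buffer exhausted: fall back */

        buf = ptr + size;                   /* bump the cursor */
        return ptr;
    }

    int main(void)
    {
        char *start = malloc(8 * PAGE_SIZE);

        buf = start;
        buf_end = start + 8 * PAGE_SIZE;
        printf("chunk 1: %p\n", alloc_block_buf(PAGE_SIZE));
        printf("chunk 2: %p\n", alloc_block_buf(PAGE_SIZE));
        free(start);
        return 0;
    }
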
@@ -80,7 +101,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
80 pte_t *pte = pte_offset_kernel(pmd, addr); 101 pte_t *pte = pte_offset_kernel(pmd, addr);
81 if (pte_none(*pte)) { 102 if (pte_none(*pte)) {
82 pte_t entry; 103 pte_t entry;
83 void *p = vmemmap_alloc_block(PAGE_SIZE, node); 104 void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
84 if (!p) 105 if (!p)
85 return NULL; 106 return NULL;
86 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); 107 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
@@ -163,3 +184,55 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
163 184
164 return map; 185 return map;
165} 186}
187
188void __init sparse_mem_maps_populate_node(struct page **map_map,
189 unsigned long pnum_begin,
190 unsigned long pnum_end,
191 unsigned long map_count, int nodeid)
192{
193 unsigned long pnum;
194 unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
195 void *vmemmap_buf_start;
196
197 size = ALIGN(size, PMD_SIZE);
198 vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
199 PMD_SIZE, __pa(MAX_DMA_ADDRESS));
200
201 if (vmemmap_buf_start) {
202 vmemmap_buf = vmemmap_buf_start;
203 vmemmap_buf_end = vmemmap_buf_start + size * map_count;
204 }
205
206 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
207 struct mem_section *ms;
208
209 if (!present_section_nr(pnum))
210 continue;
211
212 map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
213 if (map_map[pnum])
214 continue;
215 ms = __nr_to_section(pnum);
216 printk(KERN_ERR "%s: sparsemem memory map backing failed "
217 "some memory will not be available.\n", __func__);
218 ms->section_mem_map = 0;
219 }
220
221 if (vmemmap_buf_start) {
222 /* need to free left buf */
223#ifdef CONFIG_NO_BOOTMEM
224 free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end));
225 if (vmemmap_buf_start < vmemmap_buf) {
226 char name[15];
227
228 snprintf(name, sizeof(name), "MEMMAP %d", nodeid);
229 reserve_early_without_check(__pa(vmemmap_buf_start),
230 __pa(vmemmap_buf), name);
231 }
232#else
233 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
234#endif
235 vmemmap_buf = NULL;
236 vmemmap_buf_end = NULL;
237 }
238}
diff --git a/mm/sparse.c b/mm/sparse.c
index 6ce4aab69e99..dc0cc4d43ff3 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -2,6 +2,7 @@
2 * sparse memory mappings. 2 * sparse memory mappings.
3 */ 3 */
4#include <linux/mm.h> 4#include <linux/mm.h>
5#include <linux/slab.h>
5#include <linux/mmzone.h> 6#include <linux/mmzone.h>
6#include <linux/bootmem.h> 7#include <linux/bootmem.h>
7#include <linux/highmem.h> 8#include <linux/highmem.h>
@@ -271,7 +272,8 @@ static unsigned long *__kmalloc_section_usemap(void)
271 272
272#ifdef CONFIG_MEMORY_HOTREMOVE 273#ifdef CONFIG_MEMORY_HOTREMOVE
273static unsigned long * __init 274static unsigned long * __init
274sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) 275sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
276 unsigned long count)
275{ 277{
276 unsigned long section_nr; 278 unsigned long section_nr;
277 279
@@ -286,7 +288,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
286 * this problem. 288 * this problem.
287 */ 289 */
288 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); 290 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
289 return alloc_bootmem_section(usemap_size(), section_nr); 291 return alloc_bootmem_section(usemap_size() * count, section_nr);
290} 292}
291 293
292static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 294static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -329,7 +331,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
329} 331}
330#else 332#else
331static unsigned long * __init 333static unsigned long * __init
332sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) 334sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
335 unsigned long count)
333{ 336{
334 return NULL; 337 return NULL;
335} 338}
@@ -339,27 +342,40 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
339} 342}
340#endif /* CONFIG_MEMORY_HOTREMOVE */ 343#endif /* CONFIG_MEMORY_HOTREMOVE */
341 344
342static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) 345static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
346 unsigned long pnum_begin,
347 unsigned long pnum_end,
348 unsigned long usemap_count, int nodeid)
343{ 349{
344 unsigned long *usemap; 350 void *usemap;
345 struct mem_section *ms = __nr_to_section(pnum); 351 unsigned long pnum;
346 int nid = sparse_early_nid(ms); 352 int size = usemap_size();
347
348 usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
349 if (usemap)
350 return usemap;
351 353
352 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); 354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
355 usemap_count);
353 if (usemap) { 356 if (usemap) {
354 check_usemap_section_nr(nid, usemap); 357 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
355 return usemap; 358 if (!present_section_nr(pnum))
359 continue;
360 usemap_map[pnum] = usemap;
361 usemap += size;
362 }
363 return;
356 } 364 }
357 365
358 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ 366 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
359 nid = 0; 367 if (usemap) {
368 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
369 if (!present_section_nr(pnum))
370 continue;
371 usemap_map[pnum] = usemap;
372 usemap += size;
373 check_usemap_section_nr(nodeid, usemap_map[pnum]);
374 }
375 return;
376 }
360 377
361 printk(KERN_WARNING "%s: allocation failed\n", __func__); 378 printk(KERN_WARNING "%s: allocation failed\n", __func__);
362 return NULL;
363} 379}
364 380
365#ifndef CONFIG_SPARSEMEM_VMEMMAP 381#ifndef CONFIG_SPARSEMEM_VMEMMAP
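sparse_early_usemaps_alloc_node() above replaces one bootmem allocation per section with a single allocation of usemap_size() * count for a whole node, handed out slice by slice to each present section. A minimal model of that slicing; the section count, usemap size and present[] mask are made up:

    #include <stdio.h>
    #include <stdlib.h>

    #define NR_SECTIONS 8
    #define USEMAP_SIZE 32

    static void *usemap_map[NR_SECTIONS];
    static int present[NR_SECTIONS] = { 1, 1, 0, 1, 0, 0, 1, 1 };

    static void *alloc_usemaps(unsigned long begin, unsigned long end,
                               unsigned long count)
    {
        char *base = malloc(USEMAP_SIZE * count);       /* one allocation per node */
        char *usemap = base;

        if (!base)
            return NULL;
        for (unsigned long pnum = begin; pnum < end; pnum++) {
            if (!present[pnum])
                continue;
            usemap_map[pnum] = usemap;                  /* slice off one usemap */
            usemap += USEMAP_SIZE;
        }
        return base;                                    /* so the caller can free it */
    }

    int main(void)
    {
        void *base = alloc_usemaps(0, NR_SECTIONS, 5);  /* 5 sections are present */

        for (int i = 0; i < NR_SECTIONS; i++)
            printf("section %d -> %p\n", i, usemap_map[i]);
        free(base);
        return 0;
    }
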
@@ -375,8 +391,65 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
375 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); 391 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
376 return map; 392 return map;
377} 393}
394void __init sparse_mem_maps_populate_node(struct page **map_map,
395 unsigned long pnum_begin,
396 unsigned long pnum_end,
397 unsigned long map_count, int nodeid)
398{
399 void *map;
400 unsigned long pnum;
401 unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
402
403 map = alloc_remap(nodeid, size * map_count);
404 if (map) {
405 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
406 if (!present_section_nr(pnum))
407 continue;
408 map_map[pnum] = map;
409 map += size;
410 }
411 return;
412 }
413
414 size = PAGE_ALIGN(size);
415 map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count);
416 if (map) {
417 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
418 if (!present_section_nr(pnum))
419 continue;
420 map_map[pnum] = map;
421 map += size;
422 }
423 return;
424 }
425
426 /* fallback */
427 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
428 struct mem_section *ms;
429
430 if (!present_section_nr(pnum))
431 continue;
432 map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
433 if (map_map[pnum])
434 continue;
435 ms = __nr_to_section(pnum);
436 printk(KERN_ERR "%s: sparsemem memory map backing failed "
437 "some memory will not be available.\n", __func__);
438 ms->section_mem_map = 0;
439 }
440}
378#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 441#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
379 442
443#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
444static void __init sparse_early_mem_maps_alloc_node(struct page **map_map,
445 unsigned long pnum_begin,
446 unsigned long pnum_end,
447 unsigned long map_count, int nodeid)
448{
449 sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
450 map_count, nodeid);
451}
452#else
380static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 453static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
381{ 454{
382 struct page *map; 455 struct page *map;
@@ -392,10 +465,12 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
392 ms->section_mem_map = 0; 465 ms->section_mem_map = 0;
393 return NULL; 466 return NULL;
394} 467}
468#endif
395 469
396void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) 470void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
397{ 471{
398} 472}
473
399/* 474/*
400 * Allocate the accumulated non-linear sections, allocate a mem_map 475 * Allocate the accumulated non-linear sections, allocate a mem_map
401 * for each and record the physical to section mapping. 476 * for each and record the physical to section mapping.
@@ -407,6 +482,14 @@ void __init sparse_init(void)
407 unsigned long *usemap; 482 unsigned long *usemap;
408 unsigned long **usemap_map; 483 unsigned long **usemap_map;
409 int size; 484 int size;
485 int nodeid_begin = 0;
486 unsigned long pnum_begin = 0;
487 unsigned long usemap_count;
488#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
489 unsigned long map_count;
490 int size2;
491 struct page **map_map;
492#endif
410 493
411 /* 494 /*
412 * map is using big page (aka 2M in x86 64 bit) 495 * map is using big page (aka 2M in x86 64 bit)
@@ -425,10 +508,81 @@ void __init sparse_init(void)
425 panic("can not allocate usemap_map\n"); 508 panic("can not allocate usemap_map\n");
426 509
427 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 510 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
511 struct mem_section *ms;
512
428 if (!present_section_nr(pnum)) 513 if (!present_section_nr(pnum))
429 continue; 514 continue;
430 usemap_map[pnum] = sparse_early_usemap_alloc(pnum); 515 ms = __nr_to_section(pnum);
516 nodeid_begin = sparse_early_nid(ms);
517 pnum_begin = pnum;
518 break;
431 } 519 }
520 usemap_count = 1;
521 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
522 struct mem_section *ms;
523 int nodeid;
524
525 if (!present_section_nr(pnum))
526 continue;
527 ms = __nr_to_section(pnum);
528 nodeid = sparse_early_nid(ms);
529 if (nodeid == nodeid_begin) {
530 usemap_count++;
531 continue;
532 }
533		/* ok, we need to take care of pnum_begin to pnum - 1 */
534 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum,
535 usemap_count, nodeid_begin);
536 /* new start, update count etc*/
537 nodeid_begin = nodeid;
538 pnum_begin = pnum;
539 usemap_count = 1;
540 }
541 /* ok, last chunk */
542 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS,
543 usemap_count, nodeid_begin);
544
545#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
546 size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
547 map_map = alloc_bootmem(size2);
548 if (!map_map)
549 panic("can not allocate map_map\n");
550
551 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
552 struct mem_section *ms;
553
554 if (!present_section_nr(pnum))
555 continue;
556 ms = __nr_to_section(pnum);
557 nodeid_begin = sparse_early_nid(ms);
558 pnum_begin = pnum;
559 break;
560 }
561 map_count = 1;
562 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
563 struct mem_section *ms;
564 int nodeid;
565
566 if (!present_section_nr(pnum))
567 continue;
568 ms = __nr_to_section(pnum);
569 nodeid = sparse_early_nid(ms);
570 if (nodeid == nodeid_begin) {
571 map_count++;
572 continue;
573 }
574		/* ok, we need to take care of pnum_begin to pnum - 1 */
575 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum,
576 map_count, nodeid_begin);
577 /* new start, update count etc*/
578 nodeid_begin = nodeid;
579 pnum_begin = pnum;
580 map_count = 1;
581 }
582 /* ok, last chunk */
583 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS,
584 map_count, nodeid_begin);
585#endif
432 586
433 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 587 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
434 if (!present_section_nr(pnum)) 588 if (!present_section_nr(pnum))
@@ -438,7 +592,11 @@ void __init sparse_init(void)
438 if (!usemap) 592 if (!usemap)
439 continue; 593 continue;
440 594
595#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
596 map = map_map[pnum];
597#else
441 map = sparse_early_mem_map_alloc(pnum); 598 map = sparse_early_mem_map_alloc(pnum);
599#endif
442 if (!map) 600 if (!map)
443 continue; 601 continue;
444 602
@@ -448,6 +606,9 @@ void __init sparse_init(void)
448 606
449 vmemmap_populate_print_last(); 607 vmemmap_populate_print_last();
450 608
609#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
610 free_bootmem(__pa(map_map), size2);
611#endif
451 free_bootmem(__pa(usemap_map), size); 612 free_bootmem(__pa(usemap_map), size);
452} 613}
453 614
diff --git a/mm/swap.c b/mm/swap.c
index 308e57d8d7ed..7cd60bf0a972 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -30,6 +30,7 @@
30#include <linux/notifier.h> 30#include <linux/notifier.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/gfp.h>
33 34
34#include "internal.h" 35#include "internal.h"
35 36
@@ -55,7 +56,7 @@ static void __page_cache_release(struct page *page)
55 del_page_from_lru(zone, page); 56 del_page_from_lru(zone, page);
56 spin_unlock_irqrestore(&zone->lru_lock, flags); 57 spin_unlock_irqrestore(&zone->lru_lock, flags);
57 } 58 }
58 free_hot_page(page); 59 free_hot_cold_page(page, 0);
59} 60}
60 61
61static void put_compound_page(struct page *page) 62static void put_compound_page(struct page *page)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 6d1daeb1cb4a..e10f5833167f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -8,6 +8,7 @@
8 */ 8 */
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/gfp.h>
11#include <linux/kernel_stat.h> 12#include <linux/kernel_stat.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13#include <linux/swapops.h> 14#include <linux/swapops.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9c590eef7912..6cd0a8f90dc7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -22,6 +22,7 @@
22#include <linux/seq_file.h> 22#include <linux/seq_file.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/ksm.h>
25#include <linux/rmap.h> 26#include <linux/rmap.h>
26#include <linux/security.h> 27#include <linux/security.h>
27#include <linux/backing-dev.h> 28#include <linux/backing-dev.h>
@@ -35,11 +36,15 @@
35#include <linux/swapops.h> 36#include <linux/swapops.h>
36#include <linux/page_cgroup.h> 37#include <linux/page_cgroup.h>
37 38
39static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
40 unsigned char);
41static void free_swap_count_continuations(struct swap_info_struct *);
42static sector_t map_swap_entry(swp_entry_t, struct block_device**);
43
38static DEFINE_SPINLOCK(swap_lock); 44static DEFINE_SPINLOCK(swap_lock);
39static unsigned int nr_swapfiles; 45static unsigned int nr_swapfiles;
40long nr_swap_pages; 46long nr_swap_pages;
41long total_swap_pages; 47long total_swap_pages;
42static int swap_overflow;
43static int least_priority; 48static int least_priority;
44 49
45static const char Bad_file[] = "Bad swap file entry "; 50static const char Bad_file[] = "Bad swap file entry ";
@@ -49,42 +54,20 @@ static const char Unused_offset[] = "Unused swap offset entry ";
49 54
50static struct swap_list_t swap_list = {-1, -1}; 55static struct swap_list_t swap_list = {-1, -1};
51 56
52static struct swap_info_struct swap_info[MAX_SWAPFILES]; 57static struct swap_info_struct *swap_info[MAX_SWAPFILES];
53 58
54static DEFINE_MUTEX(swapon_mutex); 59static DEFINE_MUTEX(swapon_mutex);
55 60
56/* For reference count accounting in swap_map */ 61static inline unsigned char swap_count(unsigned char ent)
57/* enum for swap_map[] handling. internal use only */
58enum {
59 SWAP_MAP = 0, /* ops for reference from swap users */
60 SWAP_CACHE, /* ops for reference from swap cache */
61};
62
63static inline int swap_count(unsigned short ent)
64{
65 return ent & SWAP_COUNT_MASK;
66}
67
68static inline bool swap_has_cache(unsigned short ent)
69{
70 return !!(ent & SWAP_HAS_CACHE);
71}
72
73static inline unsigned short encode_swapmap(int count, bool has_cache)
74{ 62{
75 unsigned short ret = count; 63 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
76
77 if (has_cache)
78 return SWAP_HAS_CACHE | ret;
79 return ret;
80} 64}
81 65
82/* returnes 1 if swap entry is freed */ 66/* returns 1 if swap entry is freed */
83static int 67static int
84__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) 68__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
85{ 69{
86 int type = si - swap_info; 70 swp_entry_t entry = swp_entry(si->type, offset);
87 swp_entry_t entry = swp_entry(type, offset);
88 struct page *page; 71 struct page *page;
89 int ret = 0; 72 int ret = 0;
90 73
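The new swap_count() above reflects the switch of swap_map[] entries from unsigned short to unsigned char: each entry is a small reference count with SWAP_HAS_CACHE (and, further down in this file, a continuation flag) folded in as high bits, so reading the count is a simple mask. A tiny illustration of that encoding; the bit value follows the kernel's layout but is used here only for the demo:

    #include <stdio.h>

    #define SWAP_HAS_CACHE 0x40             /* page also sits in the swap cache */

    static unsigned char swap_count(unsigned char ent)
    {
        return ent & ~SWAP_HAS_CACHE;       /* may still include a continuation flag */
    }

    int main(void)
    {
        unsigned char ent = 3 | SWAP_HAS_CACHE;     /* three map users + swap cache */

        printf("raw 0x%02x  count %u  cached %d\n",
               (unsigned)ent, (unsigned)swap_count(ent), !!(ent & SWAP_HAS_CACHE));
        return 0;
    }
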
@@ -120,7 +103,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
120 down_read(&swap_unplug_sem); 103 down_read(&swap_unplug_sem);
121 entry.val = page_private(page); 104 entry.val = page_private(page);
122 if (PageSwapCache(page)) { 105 if (PageSwapCache(page)) {
123 struct block_device *bdev = swap_info[swp_type(entry)].bdev; 106 struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
124 struct backing_dev_info *bdi; 107 struct backing_dev_info *bdi;
125 108
126 /* 109 /*
@@ -146,23 +129,28 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
146static int discard_swap(struct swap_info_struct *si) 129static int discard_swap(struct swap_info_struct *si)
147{ 130{
148 struct swap_extent *se; 131 struct swap_extent *se;
132 sector_t start_block;
133 sector_t nr_blocks;
149 int err = 0; 134 int err = 0;
150 135
151 list_for_each_entry(se, &si->extent_list, list) { 136 /* Do not discard the swap header page! */
152 sector_t start_block = se->start_block << (PAGE_SHIFT - 9); 137 se = &si->first_swap_extent;
153 sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 138 start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
139 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
140 if (nr_blocks) {
141 err = blkdev_issue_discard(si->bdev, start_block,
142 nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
143 if (err)
144 return err;
145 cond_resched();
146 }
154 147
155 if (se->start_page == 0) { 148 list_for_each_entry(se, &si->first_swap_extent.list, list) {
156 /* Do not discard the swap header page! */ 149 start_block = se->start_block << (PAGE_SHIFT - 9);
157 start_block += 1 << (PAGE_SHIFT - 9); 150 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
158 nr_blocks -= 1 << (PAGE_SHIFT - 9);
159 if (!nr_blocks)
160 continue;
161 }
162 151
163 err = blkdev_issue_discard(si->bdev, start_block, 152 err = blkdev_issue_discard(si->bdev, start_block,
164 nr_blocks, GFP_KERNEL, 153 nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
165 DISCARD_FL_BARRIER);
166 if (err) 154 if (err)
167 break; 155 break;
168 156
@@ -201,14 +189,11 @@ static void discard_swap_cluster(struct swap_info_struct *si,
201 start_block <<= PAGE_SHIFT - 9; 189 start_block <<= PAGE_SHIFT - 9;
202 nr_blocks <<= PAGE_SHIFT - 9; 190 nr_blocks <<= PAGE_SHIFT - 9;
203 if (blkdev_issue_discard(si->bdev, start_block, 191 if (blkdev_issue_discard(si->bdev, start_block,
204 nr_blocks, GFP_NOIO, 192 nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER))
205 DISCARD_FL_BARRIER))
206 break; 193 break;
207 } 194 }
208 195
209 lh = se->list.next; 196 lh = se->list.next;
210 if (lh == &si->extent_list)
211 lh = lh->next;
212 se = list_entry(lh, struct swap_extent, list); 197 se = list_entry(lh, struct swap_extent, list);
213 } 198 }
214} 199}
@@ -223,7 +208,7 @@ static int wait_for_discard(void *word)
223#define LATENCY_LIMIT 256 208#define LATENCY_LIMIT 256
224 209
225static inline unsigned long scan_swap_map(struct swap_info_struct *si, 210static inline unsigned long scan_swap_map(struct swap_info_struct *si,
226 int cache) 211 unsigned char usage)
227{ 212{
228 unsigned long offset; 213 unsigned long offset;
229 unsigned long scan_base; 214 unsigned long scan_base;
@@ -354,10 +339,7 @@ checks:
354 si->lowest_bit = si->max; 339 si->lowest_bit = si->max;
355 si->highest_bit = 0; 340 si->highest_bit = 0;
356 } 341 }
357 if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ 342 si->swap_map[offset] = usage;
358 si->swap_map[offset] = encode_swapmap(0, true);
359 else /* at suspend */
360 si->swap_map[offset] = encode_swapmap(1, false);
361 si->cluster_next = offset + 1; 343 si->cluster_next = offset + 1;
362 si->flags -= SWP_SCANNING; 344 si->flags -= SWP_SCANNING;
363 345
@@ -467,10 +449,10 @@ swp_entry_t get_swap_page(void)
467 nr_swap_pages--; 449 nr_swap_pages--;
468 450
469 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 451 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
470 si = swap_info + type; 452 si = swap_info[type];
471 next = si->next; 453 next = si->next;
472 if (next < 0 || 454 if (next < 0 ||
473 (!wrapped && si->prio != swap_info[next].prio)) { 455 (!wrapped && si->prio != swap_info[next]->prio)) {
474 next = swap_list.head; 456 next = swap_list.head;
475 wrapped++; 457 wrapped++;
476 } 458 }
@@ -482,7 +464,7 @@ swp_entry_t get_swap_page(void)
482 464
483 swap_list.next = next; 465 swap_list.next = next;
484 /* This is called for allocating swap entry for cache */ 466 /* This is called for allocating swap entry for cache */
485 offset = scan_swap_map(si, SWAP_CACHE); 467 offset = scan_swap_map(si, SWAP_HAS_CACHE);
486 if (offset) { 468 if (offset) {
487 spin_unlock(&swap_lock); 469 spin_unlock(&swap_lock);
488 return swp_entry(type, offset); 470 return swp_entry(type, offset);
@@ -503,11 +485,11 @@ swp_entry_t get_swap_page_of_type(int type)
503 pgoff_t offset; 485 pgoff_t offset;
504 486
505 spin_lock(&swap_lock); 487 spin_lock(&swap_lock);
506 si = swap_info + type; 488 si = swap_info[type];
507 if (si->flags & SWP_WRITEOK) { 489 if (si && (si->flags & SWP_WRITEOK)) {
508 nr_swap_pages--; 490 nr_swap_pages--;
509 /* This is called for allocating swap entry, not cache */ 491 /* This is called for allocating swap entry, not cache */
510 offset = scan_swap_map(si, SWAP_MAP); 492 offset = scan_swap_map(si, 1);
511 if (offset) { 493 if (offset) {
512 spin_unlock(&swap_lock); 494 spin_unlock(&swap_lock);
513 return swp_entry(type, offset); 495 return swp_entry(type, offset);
@@ -518,9 +500,9 @@ swp_entry_t get_swap_page_of_type(int type)
518 return (swp_entry_t) {0}; 500 return (swp_entry_t) {0};
519} 501}
520 502
521static struct swap_info_struct * swap_info_get(swp_entry_t entry) 503static struct swap_info_struct *swap_info_get(swp_entry_t entry)
522{ 504{
523 struct swap_info_struct * p; 505 struct swap_info_struct *p;
524 unsigned long offset, type; 506 unsigned long offset, type;
525 507
526 if (!entry.val) 508 if (!entry.val)
@@ -528,7 +510,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry)
528 type = swp_type(entry); 510 type = swp_type(entry);
529 if (type >= nr_swapfiles) 511 if (type >= nr_swapfiles)
530 goto bad_nofile; 512 goto bad_nofile;
531 p = & swap_info[type]; 513 p = swap_info[type];
532 if (!(p->flags & SWP_USED)) 514 if (!(p->flags & SWP_USED))
533 goto bad_device; 515 goto bad_device;
534 offset = swp_offset(entry); 516 offset = swp_offset(entry);
@@ -554,41 +536,56 @@ out:
554 return NULL; 536 return NULL;
555} 537}
556 538
557static int swap_entry_free(struct swap_info_struct *p, 539static unsigned char swap_entry_free(struct swap_info_struct *p,
558 swp_entry_t ent, int cache) 540 swp_entry_t entry, unsigned char usage)
559{ 541{
560 unsigned long offset = swp_offset(ent); 542 unsigned long offset = swp_offset(entry);
561 int count = swap_count(p->swap_map[offset]); 543 unsigned char count;
562 bool has_cache; 544 unsigned char has_cache;
563 545
564 has_cache = swap_has_cache(p->swap_map[offset]); 546 count = p->swap_map[offset];
547 has_cache = count & SWAP_HAS_CACHE;
548 count &= ~SWAP_HAS_CACHE;
565 549
566 if (cache == SWAP_MAP) { /* dropping usage count of swap */ 550 if (usage == SWAP_HAS_CACHE) {
567 if (count < SWAP_MAP_MAX) {
568 count--;
569 p->swap_map[offset] = encode_swapmap(count, has_cache);
570 }
571 } else { /* dropping swap cache flag */
572 VM_BUG_ON(!has_cache); 551 VM_BUG_ON(!has_cache);
573 p->swap_map[offset] = encode_swapmap(count, false); 552 has_cache = 0;
574 553 } else if (count == SWAP_MAP_SHMEM) {
554 /*
555 * Or we could insist on shmem.c using a special
556 * swap_shmem_free() and free_shmem_swap_and_cache()...
557 */
558 count = 0;
559 } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
560 if (count == COUNT_CONTINUED) {
561 if (swap_count_continued(p, offset, count))
562 count = SWAP_MAP_MAX | COUNT_CONTINUED;
563 else
564 count = SWAP_MAP_MAX;
565 } else
566 count--;
575 } 567 }
576 /* return code. */ 568
577 count = p->swap_map[offset]; 569 if (!count)
570 mem_cgroup_uncharge_swap(entry);
571
572 usage = count | has_cache;
573 p->swap_map[offset] = usage;
574
578 /* free if no reference */ 575 /* free if no reference */
579 if (!count) { 576 if (!usage) {
580 if (offset < p->lowest_bit) 577 if (offset < p->lowest_bit)
581 p->lowest_bit = offset; 578 p->lowest_bit = offset;
582 if (offset > p->highest_bit) 579 if (offset > p->highest_bit)
583 p->highest_bit = offset; 580 p->highest_bit = offset;
584 if (p->prio > swap_info[swap_list.next].prio) 581 if (swap_list.next >= 0 &&
585 swap_list.next = p - swap_info; 582 p->prio > swap_info[swap_list.next]->prio)
583 swap_list.next = p->type;
586 nr_swap_pages++; 584 nr_swap_pages++;
587 p->inuse_pages--; 585 p->inuse_pages--;
588 } 586 }
589 if (!swap_count(count)) 587
590 mem_cgroup_uncharge_swap(ent); 588 return usage;
591 return count;
592} 589}
593 590
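swap_entry_free() above now manipulates a single byte per entry: the usage count sits in the low bits, SWAP_HAS_CACHE marks an extra swap-cache reference, COUNT_CONTINUED means the count spills into a continuation page, and SWAP_MAP_SHMEM tags shmem-owned entries. Below is a small standalone decoder of such a byte; the two numeric values are assumptions chosen for illustration, not copied from swap.h:

#include <stdio.h>

#define DEMO_SWAP_HAS_CACHE	0x40	/* assumed: entry also held by the swap cache */
#define DEMO_COUNT_CONTINUED	0x80	/* assumed: count continues on another page */

static void demo_decompose(unsigned char ent)
{
	unsigned char has_cache = ent & DEMO_SWAP_HAS_CACHE;
	unsigned char count = ent & ~DEMO_SWAP_HAS_CACHE;	/* as swap_entry_free() does */

	printf("count bits 0x%02x%s, swap cache ref: %s\n",
	       count & ~DEMO_COUNT_CONTINUED,
	       (count & DEMO_COUNT_CONTINUED) ? " (+continued)" : "",
	       has_cache ? "yes" : "no");
}

int main(void)
{
	demo_decompose(0x41);	/* one map reference plus a swap cache reference */
	demo_decompose(0x01);	/* one map reference only */
	return 0;
}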
594/* 591/*
@@ -597,11 +594,11 @@ static int swap_entry_free(struct swap_info_struct *p,
597 */ 594 */
598void swap_free(swp_entry_t entry) 595void swap_free(swp_entry_t entry)
599{ 596{
600 struct swap_info_struct * p; 597 struct swap_info_struct *p;
601 598
602 p = swap_info_get(entry); 599 p = swap_info_get(entry);
603 if (p) { 600 if (p) {
604 swap_entry_free(p, entry, SWAP_MAP); 601 swap_entry_free(p, entry, 1);
605 spin_unlock(&swap_lock); 602 spin_unlock(&swap_lock);
606 } 603 }
607} 604}
@@ -612,26 +609,21 @@ void swap_free(swp_entry_t entry)
612void swapcache_free(swp_entry_t entry, struct page *page) 609void swapcache_free(swp_entry_t entry, struct page *page)
613{ 610{
614 struct swap_info_struct *p; 611 struct swap_info_struct *p;
615 int ret; 612 unsigned char count;
616 613
617 p = swap_info_get(entry); 614 p = swap_info_get(entry);
618 if (p) { 615 if (p) {
619 ret = swap_entry_free(p, entry, SWAP_CACHE); 616 count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
620 if (page) { 617 if (page)
621 bool swapout; 618 mem_cgroup_uncharge_swapcache(page, entry, count != 0);
622 if (ret)
623 swapout = true; /* the end of swap out */
624 else
625 swapout = false; /* no more swap users! */
626 mem_cgroup_uncharge_swapcache(page, entry, swapout);
627 }
628 spin_unlock(&swap_lock); 619 spin_unlock(&swap_lock);
629 } 620 }
630 return;
631} 621}
632 622
633/* 623/*
634 * How many references to page are currently swapped out? 624 * How many references to page are currently swapped out?
625 * This does not give an exact answer when swap count is continued,
626 * but does include the high COUNT_CONTINUED flag to allow for that.
635 */ 627 */
636static inline int page_swapcount(struct page *page) 628static inline int page_swapcount(struct page *page)
637{ 629{
@@ -659,6 +651,8 @@ int reuse_swap_page(struct page *page)
659 int count; 651 int count;
660 652
661 VM_BUG_ON(!PageLocked(page)); 653 VM_BUG_ON(!PageLocked(page));
654 if (unlikely(PageKsm(page)))
655 return 0;
662 count = page_mapcount(page); 656 count = page_mapcount(page);
663 if (count <= 1 && PageSwapCache(page)) { 657 if (count <= 1 && PageSwapCache(page)) {
664 count += page_swapcount(page); 658 count += page_swapcount(page);
@@ -667,7 +661,7 @@ int reuse_swap_page(struct page *page)
667 SetPageDirty(page); 661 SetPageDirty(page);
668 } 662 }
669 } 663 }
670 return count == 1; 664 return count <= 1;
671} 665}
672 666
673/* 667/*
@@ -704,7 +698,7 @@ int free_swap_and_cache(swp_entry_t entry)
704 698
705 p = swap_info_get(entry); 699 p = swap_info_get(entry);
706 if (p) { 700 if (p) {
707 if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { 701 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
708 page = find_get_page(&swapper_space, entry.val); 702 page = find_get_page(&swapper_space, entry.val);
709 if (page && !trylock_page(page)) { 703 if (page && !trylock_page(page)) {
710 page_cache_release(page); 704 page_cache_release(page);
@@ -729,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry)
729 return p != NULL; 723 return p != NULL;
730} 724}
731 725
726#ifdef CONFIG_CGROUP_MEM_RES_CTLR
727/**
728 * mem_cgroup_count_swap_user - count the user of a swap entry
729 * @ent: the swap entry to be checked
730 * @pagep: the pointer for the swap cache page of the entry to be stored
731 *
732 * Returns the number of the user of the swap entry. The number is valid only
733 * for swaps of anonymous pages.
734 * If the entry is found on swap cache, the page is stored to pagep with
735 * refcount of it being incremented.
736 */
737int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
738{
739 struct page *page;
740 struct swap_info_struct *p;
741 int count = 0;
742
743 page = find_get_page(&swapper_space, ent.val);
744 if (page)
745 count += page_mapcount(page);
746 p = swap_info_get(ent);
747 if (p) {
748 count += swap_count(p->swap_map[swp_offset(ent)]);
749 spin_unlock(&swap_lock);
750 }
751
752 *pagep = page;
753 return count;
754}
755#endif
756
732#ifdef CONFIG_HIBERNATION 757#ifdef CONFIG_HIBERNATION
733/* 758/*
734 * Find the swap type that corresponds to given device (if any). 759 * Find the swap type that corresponds to given device (if any).
@@ -741,14 +766,14 @@ int free_swap_and_cache(swp_entry_t entry)
741int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) 766int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
742{ 767{
743 struct block_device *bdev = NULL; 768 struct block_device *bdev = NULL;
744 int i; 769 int type;
745 770
746 if (device) 771 if (device)
747 bdev = bdget(device); 772 bdev = bdget(device);
748 773
749 spin_lock(&swap_lock); 774 spin_lock(&swap_lock);
750 for (i = 0; i < nr_swapfiles; i++) { 775 for (type = 0; type < nr_swapfiles; type++) {
751 struct swap_info_struct *sis = swap_info + i; 776 struct swap_info_struct *sis = swap_info[type];
752 777
753 if (!(sis->flags & SWP_WRITEOK)) 778 if (!(sis->flags & SWP_WRITEOK))
754 continue; 779 continue;
@@ -758,20 +783,18 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
758 *bdev_p = bdgrab(sis->bdev); 783 *bdev_p = bdgrab(sis->bdev);
759 784
760 spin_unlock(&swap_lock); 785 spin_unlock(&swap_lock);
761 return i; 786 return type;
762 } 787 }
763 if (bdev == sis->bdev) { 788 if (bdev == sis->bdev) {
764 struct swap_extent *se; 789 struct swap_extent *se = &sis->first_swap_extent;
765 790
766 se = list_entry(sis->extent_list.next,
767 struct swap_extent, list);
768 if (se->start_block == offset) { 791 if (se->start_block == offset) {
769 if (bdev_p) 792 if (bdev_p)
770 *bdev_p = bdgrab(sis->bdev); 793 *bdev_p = bdgrab(sis->bdev);
771 794
772 spin_unlock(&swap_lock); 795 spin_unlock(&swap_lock);
773 bdput(bdev); 796 bdput(bdev);
774 return i; 797 return type;
775 } 798 }
776 } 799 }
777 } 800 }
@@ -783,6 +806,21 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
783} 806}
784 807
785/* 808/*
809 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
810 * corresponding to given index in swap_info (swap type).
811 */
812sector_t swapdev_block(int type, pgoff_t offset)
813{
814 struct block_device *bdev;
815
816 if ((unsigned int)type >= nr_swapfiles)
817 return 0;
818 if (!(swap_info[type]->flags & SWP_WRITEOK))
819 return 0;
820 return map_swap_entry(swp_entry(type, offset), &bdev);
821}
822
823/*
786 * Return either the total number of swap pages of given type, or the number 824 * Return either the total number of swap pages of given type, or the number
787 * of free pages of that type (depending on @free) 825 * of free pages of that type (depending on @free)
788 * 826 *
@@ -792,18 +830,20 @@ unsigned int count_swap_pages(int type, int free)
792{ 830{
793 unsigned int n = 0; 831 unsigned int n = 0;
794 832
795 if (type < nr_swapfiles) { 833 spin_lock(&swap_lock);
796 spin_lock(&swap_lock); 834 if ((unsigned int)type < nr_swapfiles) {
797 if (swap_info[type].flags & SWP_WRITEOK) { 835 struct swap_info_struct *sis = swap_info[type];
798 n = swap_info[type].pages; 836
837 if (sis->flags & SWP_WRITEOK) {
838 n = sis->pages;
799 if (free) 839 if (free)
800 n -= swap_info[type].inuse_pages; 840 n -= sis->inuse_pages;
801 } 841 }
802 spin_unlock(&swap_lock);
803 } 842 }
843 spin_unlock(&swap_lock);
804 return n; 844 return n;
805} 845}
806#endif 846#endif /* CONFIG_HIBERNATION */
807 847
808/* 848/*
809 * No need to decide whether this PTE shares the swap entry with others, 849 * No need to decide whether this PTE shares the swap entry with others,
@@ -831,7 +871,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
831 goto out; 871 goto out;
832 } 872 }
833 873
834 inc_mm_counter(vma->vm_mm, anon_rss); 874 dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
875 inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
835 get_page(page); 876 get_page(page);
836 set_pte_at(vma->vm_mm, addr, pte, 877 set_pte_at(vma->vm_mm, addr, pte,
837 pte_mkold(mk_pte(page, vma->vm_page_prot))); 878 pte_mkold(mk_pte(page, vma->vm_page_prot)));
@@ -932,7 +973,7 @@ static int unuse_vma(struct vm_area_struct *vma,
932 unsigned long addr, end, next; 973 unsigned long addr, end, next;
933 int ret; 974 int ret;
934 975
935 if (page->mapping) { 976 if (page_anon_vma(page)) {
936 addr = page_address_in_vma(page, vma); 977 addr = page_address_in_vma(page, vma);
937 if (addr == -EFAULT) 978 if (addr == -EFAULT)
938 return 0; 979 return 0;
@@ -988,7 +1029,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
988{ 1029{
989 unsigned int max = si->max; 1030 unsigned int max = si->max;
990 unsigned int i = prev; 1031 unsigned int i = prev;
991 int count; 1032 unsigned char count;
992 1033
993 /* 1034 /*
994 * No need for swap_lock here: we're just looking 1035 * No need for swap_lock here: we're just looking
@@ -1024,16 +1065,14 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1024 */ 1065 */
1025static int try_to_unuse(unsigned int type) 1066static int try_to_unuse(unsigned int type)
1026{ 1067{
1027 struct swap_info_struct * si = &swap_info[type]; 1068 struct swap_info_struct *si = swap_info[type];
1028 struct mm_struct *start_mm; 1069 struct mm_struct *start_mm;
1029 unsigned short *swap_map; 1070 unsigned char *swap_map;
1030 unsigned short swcount; 1071 unsigned char swcount;
1031 struct page *page; 1072 struct page *page;
1032 swp_entry_t entry; 1073 swp_entry_t entry;
1033 unsigned int i = 0; 1074 unsigned int i = 0;
1034 int retval = 0; 1075 int retval = 0;
1035 int reset_overflow = 0;
1036 int shmem;
1037 1076
1038 /* 1077 /*
1039 * When searching mms for an entry, a good strategy is to 1078 * When searching mms for an entry, a good strategy is to
@@ -1047,8 +1086,7 @@ static int try_to_unuse(unsigned int type)
1047 * together, child after parent. If we race with dup_mmap(), we 1086 * together, child after parent. If we race with dup_mmap(), we
1048 * prefer to resolve parent before child, lest we miss entries 1087 * prefer to resolve parent before child, lest we miss entries
1049 * duplicated after we scanned child: using last mm would invert 1088 * duplicated after we scanned child: using last mm would invert
1050 * that. Though it's only a serious concern when an overflowed 1089 * that.
1051 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
1052 */ 1090 */
1053 start_mm = &init_mm; 1091 start_mm = &init_mm;
1054 atomic_inc(&init_mm.mm_users); 1092 atomic_inc(&init_mm.mm_users);
@@ -1110,17 +1148,18 @@ static int try_to_unuse(unsigned int type)
1110 1148
1111 /* 1149 /*
1112 * Remove all references to entry. 1150 * Remove all references to entry.
1113 * Whenever we reach init_mm, there's no address space
1114 * to search, but use it as a reminder to search shmem.
1115 */ 1151 */
1116 shmem = 0;
1117 swcount = *swap_map; 1152 swcount = *swap_map;
1118 if (swap_count(swcount)) { 1153 if (swap_count(swcount) == SWAP_MAP_SHMEM) {
1119 if (start_mm == &init_mm) 1154 retval = shmem_unuse(entry, page);
1120 shmem = shmem_unuse(entry, page); 1155 /* page has already been unlocked and released */
1121 else 1156 if (retval < 0)
1122 retval = unuse_mm(start_mm, entry, page); 1157 break;
1158 continue;
1123 } 1159 }
1160 if (swap_count(swcount) && start_mm != &init_mm)
1161 retval = unuse_mm(start_mm, entry, page);
1162
1124 if (swap_count(*swap_map)) { 1163 if (swap_count(*swap_map)) {
1125 int set_start_mm = (*swap_map >= swcount); 1164 int set_start_mm = (*swap_map >= swcount);
1126 struct list_head *p = &start_mm->mmlist; 1165 struct list_head *p = &start_mm->mmlist;
@@ -1131,7 +1170,7 @@ static int try_to_unuse(unsigned int type)
1131 atomic_inc(&new_start_mm->mm_users); 1170 atomic_inc(&new_start_mm->mm_users);
1132 atomic_inc(&prev_mm->mm_users); 1171 atomic_inc(&prev_mm->mm_users);
1133 spin_lock(&mmlist_lock); 1172 spin_lock(&mmlist_lock);
1134 while (swap_count(*swap_map) && !retval && !shmem && 1173 while (swap_count(*swap_map) && !retval &&
1135 (p = p->next) != &start_mm->mmlist) { 1174 (p = p->next) != &start_mm->mmlist) {
1136 mm = list_entry(p, struct mm_struct, mmlist); 1175 mm = list_entry(p, struct mm_struct, mmlist);
1137 if (!atomic_inc_not_zero(&mm->mm_users)) 1176 if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1145,10 +1184,9 @@ static int try_to_unuse(unsigned int type)
1145 swcount = *swap_map; 1184 swcount = *swap_map;
1146 if (!swap_count(swcount)) /* any usage ? */ 1185 if (!swap_count(swcount)) /* any usage ? */
1147 ; 1186 ;
1148 else if (mm == &init_mm) { 1187 else if (mm == &init_mm)
1149 set_start_mm = 1; 1188 set_start_mm = 1;
1150 shmem = shmem_unuse(entry, page); 1189 else
1151 } else
1152 retval = unuse_mm(mm, entry, page); 1190 retval = unuse_mm(mm, entry, page);
1153 1191
1154 if (set_start_mm && *swap_map < swcount) { 1192 if (set_start_mm && *swap_map < swcount) {
@@ -1164,13 +1202,6 @@ static int try_to_unuse(unsigned int type)
1164 mmput(start_mm); 1202 mmput(start_mm);
1165 start_mm = new_start_mm; 1203 start_mm = new_start_mm;
1166 } 1204 }
1167 if (shmem) {
1168 /* page has already been unlocked and released */
1169 if (shmem > 0)
1170 continue;
1171 retval = shmem;
1172 break;
1173 }
1174 if (retval) { 1205 if (retval) {
1175 unlock_page(page); 1206 unlock_page(page);
1176 page_cache_release(page); 1207 page_cache_release(page);
@@ -1178,30 +1209,6 @@ static int try_to_unuse(unsigned int type)
1178 } 1209 }
1179 1210
1180 /* 1211 /*
1181 * How could swap count reach 0x7ffe ?
1182 * There's no way to repeat a swap page within an mm
1183 * (except in shmem, where it's the shared object which takes
1184 * the reference count)?
1185 * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
1186 * short is too small....)
1187 * If that's wrong, then we should worry more about
1188 * exit_mmap() and do_munmap() cases described above:
1189 * we might be resetting SWAP_MAP_MAX too early here.
1190 * We know "Undead"s can happen, they're okay, so don't
1191 * report them; but do report if we reset SWAP_MAP_MAX.
1192 */
1193 /* We might release the lock_page() in unuse_mm(). */
1194 if (!PageSwapCache(page) || page_private(page) != entry.val)
1195 goto retry;
1196
1197 if (swap_count(*swap_map) == SWAP_MAP_MAX) {
1198 spin_lock(&swap_lock);
1199 *swap_map = encode_swapmap(0, true);
1200 spin_unlock(&swap_lock);
1201 reset_overflow = 1;
1202 }
1203
1204 /*
1205 * If a reference remains (rare), we would like to leave 1212 * If a reference remains (rare), we would like to leave
1206 * the page in the swap cache; but try_to_unmap could 1213 * the page in the swap cache; but try_to_unmap could
1207 * then re-duplicate the entry once we drop page lock, 1214 * then re-duplicate the entry once we drop page lock,
@@ -1213,6 +1220,12 @@ static int try_to_unuse(unsigned int type)
1213 * read from disk into another page. Splitting into two 1220 * read from disk into another page. Splitting into two
1214 * pages would be incorrect if swap supported "shared 1221 * pages would be incorrect if swap supported "shared
1215 * private" pages, but they are handled by tmpfs files. 1222 * private" pages, but they are handled by tmpfs files.
1223 *
1224 * Given how unuse_vma() targets one particular offset
1225 * in an anon_vma, once the anon_vma has been determined,
1226 * this splitting happens to be just what is needed to
1227 * handle where KSM pages have been swapped out: re-reading
1228 * is unnecessarily slow, but we can fix that later on.
1216 */ 1229 */
1217 if (swap_count(*swap_map) && 1230 if (swap_count(*swap_map) &&
1218 PageDirty(page) && PageSwapCache(page)) { 1231 PageDirty(page) && PageSwapCache(page)) {
@@ -1242,7 +1255,6 @@ static int try_to_unuse(unsigned int type)
1242 * mark page dirty so shrink_page_list will preserve it. 1255 * mark page dirty so shrink_page_list will preserve it.
1243 */ 1256 */
1244 SetPageDirty(page); 1257 SetPageDirty(page);
1245retry:
1246 unlock_page(page); 1258 unlock_page(page);
1247 page_cache_release(page); 1259 page_cache_release(page);
1248 1260
@@ -1254,10 +1266,6 @@ retry:
1254 } 1266 }
1255 1267
1256 mmput(start_mm); 1268 mmput(start_mm);
1257 if (reset_overflow) {
1258 printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
1259 swap_overflow = 0;
1260 }
1261 return retval; 1269 return retval;
1262} 1270}
1263 1271
@@ -1270,10 +1278,10 @@ retry:
1270static void drain_mmlist(void) 1278static void drain_mmlist(void)
1271{ 1279{
1272 struct list_head *p, *next; 1280 struct list_head *p, *next;
1273 unsigned int i; 1281 unsigned int type;
1274 1282
1275 for (i = 0; i < nr_swapfiles; i++) 1283 for (type = 0; type < nr_swapfiles; type++)
1276 if (swap_info[i].inuse_pages) 1284 if (swap_info[type]->inuse_pages)
1277 return; 1285 return;
1278 spin_lock(&mmlist_lock); 1286 spin_lock(&mmlist_lock);
1279 list_for_each_safe(p, next, &init_mm.mmlist) 1287 list_for_each_safe(p, next, &init_mm.mmlist)
@@ -1283,12 +1291,23 @@ static void drain_mmlist(void)
1283 1291
1284/* 1292/*
1285 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which 1293 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
1286 * corresponds to page offset `offset'. 1294 * corresponds to page offset for the specified swap entry.
1295 * Note that the type of this function is sector_t, but it returns page offset
1296 * into the bdev, not sector offset.
1287 */ 1297 */
1288sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) 1298static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
1289{ 1299{
1290 struct swap_extent *se = sis->curr_swap_extent; 1300 struct swap_info_struct *sis;
1291 struct swap_extent *start_se = se; 1301 struct swap_extent *start_se;
1302 struct swap_extent *se;
1303 pgoff_t offset;
1304
1305 sis = swap_info[swp_type(entry)];
1306 *bdev = sis->bdev;
1307
1308 offset = swp_offset(entry);
1309 start_se = sis->curr_swap_extent;
1310 se = start_se;
1292 1311
1293 for ( ; ; ) { 1312 for ( ; ; ) {
1294 struct list_head *lh; 1313 struct list_head *lh;
@@ -1298,40 +1317,31 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
1298 return se->start_block + (offset - se->start_page); 1317 return se->start_block + (offset - se->start_page);
1299 } 1318 }
1300 lh = se->list.next; 1319 lh = se->list.next;
1301 if (lh == &sis->extent_list)
1302 lh = lh->next;
1303 se = list_entry(lh, struct swap_extent, list); 1320 se = list_entry(lh, struct swap_extent, list);
1304 sis->curr_swap_extent = se; 1321 sis->curr_swap_extent = se;
1305 BUG_ON(se == start_se); /* It *must* be present */ 1322 BUG_ON(se == start_se); /* It *must* be present */
1306 } 1323 }
1307} 1324}
1308 1325
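map_swap_entry() above turns a swap offset into a disk block by walking the extent list now anchored at first_swap_extent, with curr_swap_extent caching the most recent hit. The same page-to-block mapping over a plain array, as a compilable sketch; the field names are borrowed from the diff, the sample extents are invented:

#include <stdio.h>

struct demo_extent {
	unsigned long start_page;	/* first swap page offset covered */
	unsigned long nr_pages;		/* length of the run */
	unsigned long start_block;	/* disk block backing start_page */
};

/* Invented layout: swap pages 0-99 and 100-299 map to two disk runs. */
static const struct demo_extent extents[] = {
	{ .start_page = 0,   .nr_pages = 100, .start_block = 5000 },
	{ .start_page = 100, .nr_pages = 200, .start_block = 9000 },
};

static unsigned long demo_map_swap_page(unsigned long offset)
{
	for (unsigned int i = 0; i < sizeof(extents) / sizeof(extents[0]); i++) {
		const struct demo_extent *se = &extents[i];

		if (offset >= se->start_page &&
		    offset < se->start_page + se->nr_pages)
			return se->start_block + (offset - se->start_page);
	}
	return 0;	/* the kernel walk BUG()s instead: the extent must exist */
}

int main(void)
{
	printf("offset 150 -> block %lu\n", demo_map_swap_page(150));	/* 9050 */
	return 0;
}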
1309#ifdef CONFIG_HIBERNATION
1310/* 1326/*
1311 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev 1327 * Returns the page offset into bdev for the specified page's swap entry.
1312 * corresponding to given index in swap_info (swap type).
1313 */ 1328 */
1314sector_t swapdev_block(int swap_type, pgoff_t offset) 1329sector_t map_swap_page(struct page *page, struct block_device **bdev)
1315{ 1330{
1316 struct swap_info_struct *sis; 1331 swp_entry_t entry;
1317 1332 entry.val = page_private(page);
1318 if (swap_type >= nr_swapfiles) 1333 return map_swap_entry(entry, bdev);
1319 return 0;
1320
1321 sis = swap_info + swap_type;
1322 return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0;
1323} 1334}
1324#endif /* CONFIG_HIBERNATION */
1325 1335
1326/* 1336/*
1327 * Free all of a swapdev's extent information 1337 * Free all of a swapdev's extent information
1328 */ 1338 */
1329static void destroy_swap_extents(struct swap_info_struct *sis) 1339static void destroy_swap_extents(struct swap_info_struct *sis)
1330{ 1340{
1331 while (!list_empty(&sis->extent_list)) { 1341 while (!list_empty(&sis->first_swap_extent.list)) {
1332 struct swap_extent *se; 1342 struct swap_extent *se;
1333 1343
1334 se = list_entry(sis->extent_list.next, 1344 se = list_entry(sis->first_swap_extent.list.next,
1335 struct swap_extent, list); 1345 struct swap_extent, list);
1336 list_del(&se->list); 1346 list_del(&se->list);
1337 kfree(se); 1347 kfree(se);
@@ -1352,8 +1362,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1352 struct swap_extent *new_se; 1362 struct swap_extent *new_se;
1353 struct list_head *lh; 1363 struct list_head *lh;
1354 1364
1355 lh = sis->extent_list.prev; /* The highest page extent */ 1365 if (start_page == 0) {
1356 if (lh != &sis->extent_list) { 1366 se = &sis->first_swap_extent;
1367 sis->curr_swap_extent = se;
1368 se->start_page = 0;
1369 se->nr_pages = nr_pages;
1370 se->start_block = start_block;
1371 return 1;
1372 } else {
1373 lh = sis->first_swap_extent.list.prev; /* Highest extent */
1357 se = list_entry(lh, struct swap_extent, list); 1374 se = list_entry(lh, struct swap_extent, list);
1358 BUG_ON(se->start_page + se->nr_pages != start_page); 1375 BUG_ON(se->start_page + se->nr_pages != start_page);
1359 if (se->start_block + se->nr_pages == start_block) { 1376 if (se->start_block + se->nr_pages == start_block) {
@@ -1373,7 +1390,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1373 new_se->nr_pages = nr_pages; 1390 new_se->nr_pages = nr_pages;
1374 new_se->start_block = start_block; 1391 new_se->start_block = start_block;
1375 1392
1376 list_add_tail(&new_se->list, &sis->extent_list); 1393 list_add_tail(&new_se->list, &sis->first_swap_extent.list);
1377 return 1; 1394 return 1;
1378} 1395}
1379 1396
@@ -1425,7 +1442,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1425 if (S_ISBLK(inode->i_mode)) { 1442 if (S_ISBLK(inode->i_mode)) {
1426 ret = add_swap_extent(sis, 0, sis->max, 0); 1443 ret = add_swap_extent(sis, 0, sis->max, 0);
1427 *span = sis->pages; 1444 *span = sis->pages;
1428 goto done; 1445 goto out;
1429 } 1446 }
1430 1447
1431 blkbits = inode->i_blkbits; 1448 blkbits = inode->i_blkbits;
@@ -1496,25 +1513,22 @@ reprobe:
1496 sis->max = page_no; 1513 sis->max = page_no;
1497 sis->pages = page_no - 1; 1514 sis->pages = page_no - 1;
1498 sis->highest_bit = page_no - 1; 1515 sis->highest_bit = page_no - 1;
1499done: 1516out:
1500 sis->curr_swap_extent = list_entry(sis->extent_list.prev, 1517 return ret;
1501 struct swap_extent, list);
1502 goto out;
1503bad_bmap: 1518bad_bmap:
1504 printk(KERN_ERR "swapon: swapfile has holes\n"); 1519 printk(KERN_ERR "swapon: swapfile has holes\n");
1505 ret = -EINVAL; 1520 ret = -EINVAL;
1506out: 1521 goto out;
1507 return ret;
1508} 1522}
1509 1523
1510SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 1524SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1511{ 1525{
1512 struct swap_info_struct * p = NULL; 1526 struct swap_info_struct *p = NULL;
1513 unsigned short *swap_map; 1527 unsigned char *swap_map;
1514 struct file *swap_file, *victim; 1528 struct file *swap_file, *victim;
1515 struct address_space *mapping; 1529 struct address_space *mapping;
1516 struct inode *inode; 1530 struct inode *inode;
1517 char * pathname; 1531 char *pathname;
1518 int i, type, prev; 1532 int i, type, prev;
1519 int err; 1533 int err;
1520 1534
@@ -1535,8 +1549,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1535 mapping = victim->f_mapping; 1549 mapping = victim->f_mapping;
1536 prev = -1; 1550 prev = -1;
1537 spin_lock(&swap_lock); 1551 spin_lock(&swap_lock);
1538 for (type = swap_list.head; type >= 0; type = swap_info[type].next) { 1552 for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
1539 p = swap_info + type; 1553 p = swap_info[type];
1540 if (p->flags & SWP_WRITEOK) { 1554 if (p->flags & SWP_WRITEOK) {
1541 if (p->swap_file->f_mapping == mapping) 1555 if (p->swap_file->f_mapping == mapping)
1542 break; 1556 break;
@@ -1555,18 +1569,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1555 spin_unlock(&swap_lock); 1569 spin_unlock(&swap_lock);
1556 goto out_dput; 1570 goto out_dput;
1557 } 1571 }
1558 if (prev < 0) { 1572 if (prev < 0)
1559 swap_list.head = p->next; 1573 swap_list.head = p->next;
1560 } else { 1574 else
1561 swap_info[prev].next = p->next; 1575 swap_info[prev]->next = p->next;
1562 }
1563 if (type == swap_list.next) { 1576 if (type == swap_list.next) {
1564 /* just pick something that's safe... */ 1577 /* just pick something that's safe... */
1565 swap_list.next = swap_list.head; 1578 swap_list.next = swap_list.head;
1566 } 1579 }
1567 if (p->prio < 0) { 1580 if (p->prio < 0) {
1568 for (i = p->next; i >= 0; i = swap_info[i].next) 1581 for (i = p->next; i >= 0; i = swap_info[i]->next)
1569 swap_info[i].prio = p->prio--; 1582 swap_info[i]->prio = p->prio--;
1570 least_priority++; 1583 least_priority++;
1571 } 1584 }
1572 nr_swap_pages -= p->pages; 1585 nr_swap_pages -= p->pages;
@@ -1584,16 +1597,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1584 if (p->prio < 0) 1597 if (p->prio < 0)
1585 p->prio = --least_priority; 1598 p->prio = --least_priority;
1586 prev = -1; 1599 prev = -1;
1587 for (i = swap_list.head; i >= 0; i = swap_info[i].next) { 1600 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1588 if (p->prio >= swap_info[i].prio) 1601 if (p->prio >= swap_info[i]->prio)
1589 break; 1602 break;
1590 prev = i; 1603 prev = i;
1591 } 1604 }
1592 p->next = i; 1605 p->next = i;
1593 if (prev < 0) 1606 if (prev < 0)
1594 swap_list.head = swap_list.next = p - swap_info; 1607 swap_list.head = swap_list.next = type;
1595 else 1608 else
1596 swap_info[prev].next = p - swap_info; 1609 swap_info[prev]->next = type;
1597 nr_swap_pages += p->pages; 1610 nr_swap_pages += p->pages;
1598 total_swap_pages += p->pages; 1611 total_swap_pages += p->pages;
1599 p->flags |= SWP_WRITEOK; 1612 p->flags |= SWP_WRITEOK;
@@ -1606,6 +1619,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1606 up_write(&swap_unplug_sem); 1619 up_write(&swap_unplug_sem);
1607 1620
1608 destroy_swap_extents(p); 1621 destroy_swap_extents(p);
1622 if (p->flags & SWP_CONTINUED)
1623 free_swap_count_continuations(p);
1624
1609 mutex_lock(&swapon_mutex); 1625 mutex_lock(&swapon_mutex);
1610 spin_lock(&swap_lock); 1626 spin_lock(&swap_lock);
1611 drain_mmlist(); 1627 drain_mmlist();
@@ -1653,8 +1669,8 @@ out:
1653/* iterator */ 1669/* iterator */
1654static void *swap_start(struct seq_file *swap, loff_t *pos) 1670static void *swap_start(struct seq_file *swap, loff_t *pos)
1655{ 1671{
1656 struct swap_info_struct *ptr = swap_info; 1672 struct swap_info_struct *si;
1657 int i; 1673 int type;
1658 loff_t l = *pos; 1674 loff_t l = *pos;
1659 1675
1660 mutex_lock(&swapon_mutex); 1676 mutex_lock(&swapon_mutex);
@@ -1662,11 +1678,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
1662 if (!l) 1678 if (!l)
1663 return SEQ_START_TOKEN; 1679 return SEQ_START_TOKEN;
1664 1680
1665 for (i = 0; i < nr_swapfiles; i++, ptr++) { 1681 for (type = 0; type < nr_swapfiles; type++) {
1666 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1682 smp_rmb(); /* read nr_swapfiles before swap_info[type] */
1683 si = swap_info[type];
1684 if (!(si->flags & SWP_USED) || !si->swap_map)
1667 continue; 1685 continue;
1668 if (!--l) 1686 if (!--l)
1669 return ptr; 1687 return si;
1670 } 1688 }
1671 1689
1672 return NULL; 1690 return NULL;
@@ -1674,21 +1692,21 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
1674 1692
1675static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) 1693static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1676{ 1694{
1677 struct swap_info_struct *ptr; 1695 struct swap_info_struct *si = v;
1678 struct swap_info_struct *endptr = swap_info + nr_swapfiles; 1696 int type;
1679 1697
1680 if (v == SEQ_START_TOKEN) 1698 if (v == SEQ_START_TOKEN)
1681 ptr = swap_info; 1699 type = 0;
1682 else { 1700 else
1683 ptr = v; 1701 type = si->type + 1;
1684 ptr++;
1685 }
1686 1702
1687 for (; ptr < endptr; ptr++) { 1703 for (; type < nr_swapfiles; type++) {
1688 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1704 smp_rmb(); /* read nr_swapfiles before swap_info[type] */
1705 si = swap_info[type];
1706 if (!(si->flags & SWP_USED) || !si->swap_map)
1689 continue; 1707 continue;
1690 ++*pos; 1708 ++*pos;
1691 return ptr; 1709 return si;
1692 } 1710 }
1693 1711
1694 return NULL; 1712 return NULL;
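The smp_rmb() added to swap_start() and swap_next() pairs with the smp_wmb() in the swapon hunk further down: swapon stores the new swap_info[type] pointer first, then bumps nr_swapfiles, so any reader that observes the larger count also observes the pointer. A self-contained sketch of that publish/consume ordering, using C11 acquire/release in place of the kernel's barriers and invented demo names:

#include <stdatomic.h>

struct demo_item { int payload; };

static struct demo_item *demo_slots[16];
static atomic_uint demo_nr;		/* models nr_swapfiles: it only ever grows */

/* Writer, swapon-like: publish the pointer, then advertise the new count. */
static void demo_publish(unsigned int type, struct demo_item *p)
{
	demo_slots[type] = p;
	/* release ordering stands in for the smp_wmb() before nr_swapfiles++ */
	atomic_store_explicit(&demo_nr, type + 1, memory_order_release);
}

/* Reader, swap_start()-like: load the count, then the pointer it covers. */
static struct demo_item *demo_peek(unsigned int type)
{
	/* acquire ordering stands in for the smp_rmb() added above */
	if (type >= atomic_load_explicit(&demo_nr, memory_order_acquire))
		return NULL;
	return demo_slots[type];	/* visible: it was published before the count */
}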
@@ -1701,24 +1719,24 @@ static void swap_stop(struct seq_file *swap, void *v)
1701 1719
1702static int swap_show(struct seq_file *swap, void *v) 1720static int swap_show(struct seq_file *swap, void *v)
1703{ 1721{
1704 struct swap_info_struct *ptr = v; 1722 struct swap_info_struct *si = v;
1705 struct file *file; 1723 struct file *file;
1706 int len; 1724 int len;
1707 1725
1708 if (ptr == SEQ_START_TOKEN) { 1726 if (si == SEQ_START_TOKEN) {
1709 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); 1727 seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
1710 return 0; 1728 return 0;
1711 } 1729 }
1712 1730
1713 file = ptr->swap_file; 1731 file = si->swap_file;
1714 len = seq_path(swap, &file->f_path, " \t\n\\"); 1732 len = seq_path(swap, &file->f_path, " \t\n\\");
1715 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1733 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1716 len < 40 ? 40 - len : 1, " ", 1734 len < 40 ? 40 - len : 1, " ",
1717 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? 1735 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1718 "partition" : "file\t", 1736 "partition" : "file\t",
1719 ptr->pages << (PAGE_SHIFT - 10), 1737 si->pages << (PAGE_SHIFT - 10),
1720 ptr->inuse_pages << (PAGE_SHIFT - 10), 1738 si->inuse_pages << (PAGE_SHIFT - 10),
1721 ptr->prio); 1739 si->prio);
1722 return 0; 1740 return 0;
1723} 1741}
1724 1742
@@ -1765,7 +1783,7 @@ late_initcall(max_swapfiles_check);
1765 */ 1783 */
1766SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 1784SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1767{ 1785{
1768 struct swap_info_struct * p; 1786 struct swap_info_struct *p;
1769 char *name = NULL; 1787 char *name = NULL;
1770 struct block_device *bdev = NULL; 1788 struct block_device *bdev = NULL;
1771 struct file *swap_file = NULL; 1789 struct file *swap_file = NULL;
@@ -1773,36 +1791,58 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1773 unsigned int type; 1791 unsigned int type;
1774 int i, prev; 1792 int i, prev;
1775 int error; 1793 int error;
1776 union swap_header *swap_header = NULL; 1794 union swap_header *swap_header;
1777 unsigned int nr_good_pages = 0; 1795 unsigned int nr_good_pages;
1778 int nr_extents = 0; 1796 int nr_extents = 0;
1779 sector_t span; 1797 sector_t span;
1780 unsigned long maxpages = 1; 1798 unsigned long maxpages;
1781 unsigned long swapfilepages; 1799 unsigned long swapfilepages;
1782 unsigned short *swap_map = NULL; 1800 unsigned char *swap_map = NULL;
1783 struct page *page = NULL; 1801 struct page *page = NULL;
1784 struct inode *inode = NULL; 1802 struct inode *inode = NULL;
1785 int did_down = 0; 1803 int did_down = 0;
1786 1804
1787 if (!capable(CAP_SYS_ADMIN)) 1805 if (!capable(CAP_SYS_ADMIN))
1788 return -EPERM; 1806 return -EPERM;
1807
1808 p = kzalloc(sizeof(*p), GFP_KERNEL);
1809 if (!p)
1810 return -ENOMEM;
1811
1789 spin_lock(&swap_lock); 1812 spin_lock(&swap_lock);
1790 p = swap_info; 1813 for (type = 0; type < nr_swapfiles; type++) {
1791 for (type = 0 ; type < nr_swapfiles ; type++,p++) 1814 if (!(swap_info[type]->flags & SWP_USED))
1792 if (!(p->flags & SWP_USED))
1793 break; 1815 break;
1816 }
1794 error = -EPERM; 1817 error = -EPERM;
1795 if (type >= MAX_SWAPFILES) { 1818 if (type >= MAX_SWAPFILES) {
1796 spin_unlock(&swap_lock); 1819 spin_unlock(&swap_lock);
1820 kfree(p);
1797 goto out; 1821 goto out;
1798 } 1822 }
1799 if (type >= nr_swapfiles) 1823 if (type >= nr_swapfiles) {
1800 nr_swapfiles = type+1; 1824 p->type = type;
1801 memset(p, 0, sizeof(*p)); 1825 swap_info[type] = p;
1802 INIT_LIST_HEAD(&p->extent_list); 1826 /*
1827 * Write swap_info[type] before nr_swapfiles, in case a
1828 * racing procfs swap_start() or swap_next() is reading them.
1829 * (We never shrink nr_swapfiles, we never free this entry.)
1830 */
1831 smp_wmb();
1832 nr_swapfiles++;
1833 } else {
1834 kfree(p);
1835 p = swap_info[type];
1836 /*
1837 * Do not memset this entry: a racing procfs swap_next()
1838 * would be relying on p->type to remain valid.
1839 */
1840 }
1841 INIT_LIST_HEAD(&p->first_swap_extent.list);
1803 p->flags = SWP_USED; 1842 p->flags = SWP_USED;
1804 p->next = -1; 1843 p->next = -1;
1805 spin_unlock(&swap_lock); 1844 spin_unlock(&swap_lock);
1845
1806 name = getname(specialfile); 1846 name = getname(specialfile);
1807 error = PTR_ERR(name); 1847 error = PTR_ERR(name);
1808 if (IS_ERR(name)) { 1848 if (IS_ERR(name)) {
@@ -1822,7 +1862,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1822 1862
1823 error = -EBUSY; 1863 error = -EBUSY;
1824 for (i = 0; i < nr_swapfiles; i++) { 1864 for (i = 0; i < nr_swapfiles; i++) {
1825 struct swap_info_struct *q = &swap_info[i]; 1865 struct swap_info_struct *q = swap_info[i];
1826 1866
1827 if (i == type || !q->swap_file) 1867 if (i == type || !q->swap_file)
1828 continue; 1868 continue;
@@ -1897,6 +1937,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1897 1937
1898 p->lowest_bit = 1; 1938 p->lowest_bit = 1;
1899 p->cluster_next = 1; 1939 p->cluster_next = 1;
1940 p->cluster_nr = 0;
1900 1941
1901 /* 1942 /*
1902 * Find out how many pages are allowed for a single swap 1943 * Find out how many pages are allowed for a single swap
@@ -1913,9 +1954,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1913 * swap pte. 1954 * swap pte.
1914 */ 1955 */
1915 maxpages = swp_offset(pte_to_swp_entry( 1956 maxpages = swp_offset(pte_to_swp_entry(
1916 swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; 1957 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1917 if (maxpages > swap_header->info.last_page) 1958 if (maxpages > swap_header->info.last_page) {
1918 maxpages = swap_header->info.last_page; 1959 maxpages = swap_header->info.last_page + 1;
1960 /* p->max is an unsigned int: don't overflow it */
1961 if ((unsigned int)maxpages == 0)
1962 maxpages = UINT_MAX;
1963 }
1919 p->highest_bit = maxpages - 1; 1964 p->highest_bit = maxpages - 1;
1920 1965
1921 error = -EINVAL; 1966 error = -EINVAL;
@@ -1932,30 +1977,31 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1932 goto bad_swap; 1977 goto bad_swap;
1933 1978
1934 /* OK, set up the swap map and apply the bad block list */ 1979 /* OK, set up the swap map and apply the bad block list */
1935 swap_map = vmalloc(maxpages * sizeof(short)); 1980 swap_map = vmalloc(maxpages);
1936 if (!swap_map) { 1981 if (!swap_map) {
1937 error = -ENOMEM; 1982 error = -ENOMEM;
1938 goto bad_swap; 1983 goto bad_swap;
1939 } 1984 }
1940 1985
1941 memset(swap_map, 0, maxpages * sizeof(short)); 1986 memset(swap_map, 0, maxpages);
1987 nr_good_pages = maxpages - 1; /* omit header page */
1988
1942 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1989 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1943 int page_nr = swap_header->info.badpages[i]; 1990 unsigned int page_nr = swap_header->info.badpages[i];
1944 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { 1991 if (page_nr == 0 || page_nr > swap_header->info.last_page) {
1945 error = -EINVAL; 1992 error = -EINVAL;
1946 goto bad_swap; 1993 goto bad_swap;
1947 } 1994 }
1948 swap_map[page_nr] = SWAP_MAP_BAD; 1995 if (page_nr < maxpages) {
1996 swap_map[page_nr] = SWAP_MAP_BAD;
1997 nr_good_pages--;
1998 }
1949 } 1999 }
1950 2000
1951 error = swap_cgroup_swapon(type, maxpages); 2001 error = swap_cgroup_swapon(type, maxpages);
1952 if (error) 2002 if (error)
1953 goto bad_swap; 2003 goto bad_swap;
1954 2004
1955 nr_good_pages = swap_header->info.last_page -
1956 swap_header->info.nr_badpages -
1957 1 /* header page */;
1958
1959 if (nr_good_pages) { 2005 if (nr_good_pages) {
1960 swap_map[0] = SWAP_MAP_BAD; 2006 swap_map[0] = SWAP_MAP_BAD;
1961 p->max = maxpages; 2007 p->max = maxpages;
@@ -2003,18 +2049,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2003 2049
2004 /* insert swap space into swap_list: */ 2050 /* insert swap space into swap_list: */
2005 prev = -1; 2051 prev = -1;
2006 for (i = swap_list.head; i >= 0; i = swap_info[i].next) { 2052 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
2007 if (p->prio >= swap_info[i].prio) { 2053 if (p->prio >= swap_info[i]->prio)
2008 break; 2054 break;
2009 }
2010 prev = i; 2055 prev = i;
2011 } 2056 }
2012 p->next = i; 2057 p->next = i;
2013 if (prev < 0) { 2058 if (prev < 0)
2014 swap_list.head = swap_list.next = p - swap_info; 2059 swap_list.head = swap_list.next = type;
2015 } else { 2060 else
2016 swap_info[prev].next = p - swap_info; 2061 swap_info[prev]->next = type;
2017 }
2018 spin_unlock(&swap_lock); 2062 spin_unlock(&swap_lock);
2019 mutex_unlock(&swapon_mutex); 2063 mutex_unlock(&swapon_mutex);
2020 error = 0; 2064 error = 0;
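Both here and in the swapoff hunk above, swap_list is a singly linked list threaded through integer indices (p->next holds the next swap type, -1 terminates) and kept sorted by descending priority. A standalone model of that insertion, with invented names:

struct demo_area {
	int prio;
	int next;			/* index of the next area, -1 ends the list */
};

/*
 * Insert area 'type' into the list headed by *head, keeping the list sorted
 * by descending prio: the same walk the swapon code above performs.
 */
static void demo_insert_by_prio(struct demo_area *areas, int *head, int type)
{
	int i, prev = -1;

	for (i = *head; i >= 0; i = areas[i].next) {
		if (areas[type].prio >= areas[i].prio)
			break;
		prev = i;
	}
	areas[type].next = i;
	if (prev < 0)
		*head = type;
	else
		areas[prev].next = type;
}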
@@ -2051,15 +2095,15 @@ out:
2051 2095
2052void si_swapinfo(struct sysinfo *val) 2096void si_swapinfo(struct sysinfo *val)
2053{ 2097{
2054 unsigned int i; 2098 unsigned int type;
2055 unsigned long nr_to_be_unused = 0; 2099 unsigned long nr_to_be_unused = 0;
2056 2100
2057 spin_lock(&swap_lock); 2101 spin_lock(&swap_lock);
2058 for (i = 0; i < nr_swapfiles; i++) { 2102 for (type = 0; type < nr_swapfiles; type++) {
2059 if (!(swap_info[i].flags & SWP_USED) || 2103 struct swap_info_struct *si = swap_info[type];
2060 (swap_info[i].flags & SWP_WRITEOK)) 2104
2061 continue; 2105 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2062 nr_to_be_unused += swap_info[i].inuse_pages; 2106 nr_to_be_unused += si->inuse_pages;
2063 } 2107 }
2064 val->freeswap = nr_swap_pages + nr_to_be_unused; 2108 val->freeswap = nr_swap_pages + nr_to_be_unused;
2065 val->totalswap = total_swap_pages + nr_to_be_unused; 2109 val->totalswap = total_swap_pages + nr_to_be_unused;
@@ -2069,101 +2113,111 @@ void si_swapinfo(struct sysinfo *val)
2069/* 2113/*
2070 * Verify that a swap entry is valid and increment its swap map count. 2114 * Verify that a swap entry is valid and increment its swap map count.
2071 * 2115 *
2072 * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
2073 * "permanent", but will be reclaimed by the next swapoff.
2074 * Returns error code in following case. 2116 * Returns error code in following case.
2075 * - success -> 0 2117 * - success -> 0
2076 * - swp_entry is invalid -> EINVAL 2118 * - swp_entry is invalid -> EINVAL
2077 * - swp_entry is migration entry -> EINVAL 2119 * - swp_entry is migration entry -> EINVAL
2078 * - swap-cache reference is requested but there is already one. -> EEXIST 2120 * - swap-cache reference is requested but there is already one. -> EEXIST
2079 * - swap-cache reference is requested but the entry is not used. -> ENOENT 2121 * - swap-cache reference is requested but the entry is not used. -> ENOENT
2122 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
2080 */ 2123 */
2081static int __swap_duplicate(swp_entry_t entry, bool cache) 2124static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2082{ 2125{
2083 struct swap_info_struct * p; 2126 struct swap_info_struct *p;
2084 unsigned long offset, type; 2127 unsigned long offset, type;
2085 int result = -EINVAL; 2128 unsigned char count;
2086 int count; 2129 unsigned char has_cache;
2087 bool has_cache; 2130 int err = -EINVAL;
2088 2131
2089 if (non_swap_entry(entry)) 2132 if (non_swap_entry(entry))
2090 return -EINVAL; 2133 goto out;
2091 2134
2092 type = swp_type(entry); 2135 type = swp_type(entry);
2093 if (type >= nr_swapfiles) 2136 if (type >= nr_swapfiles)
2094 goto bad_file; 2137 goto bad_file;
2095 p = type + swap_info; 2138 p = swap_info[type];
2096 offset = swp_offset(entry); 2139 offset = swp_offset(entry);
2097 2140
2098 spin_lock(&swap_lock); 2141 spin_lock(&swap_lock);
2099
2100 if (unlikely(offset >= p->max)) 2142 if (unlikely(offset >= p->max))
2101 goto unlock_out; 2143 goto unlock_out;
2102 2144
2103 count = swap_count(p->swap_map[offset]); 2145 count = p->swap_map[offset];
2104 has_cache = swap_has_cache(p->swap_map[offset]); 2146 has_cache = count & SWAP_HAS_CACHE;
2147 count &= ~SWAP_HAS_CACHE;
2148 err = 0;
2105 2149
2106 if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ 2150 if (usage == SWAP_HAS_CACHE) {
2107 2151
2108 /* set SWAP_HAS_CACHE if there is no cache and entry is used */ 2152 /* set SWAP_HAS_CACHE if there is no cache and entry is used */
2109 if (!has_cache && count) { 2153 if (!has_cache && count)
2110 p->swap_map[offset] = encode_swapmap(count, true); 2154 has_cache = SWAP_HAS_CACHE;
2111 result = 0; 2155 else if (has_cache) /* someone else added cache */
2112 } else if (has_cache) /* someone added cache */ 2156 err = -EEXIST;
2113 result = -EEXIST; 2157 else /* no users remaining */
2114 else if (!count) /* no users */ 2158 err = -ENOENT;
2115 result = -ENOENT;
2116 2159
2117 } else if (count || has_cache) { 2160 } else if (count || has_cache) {
2118 if (count < SWAP_MAP_MAX - 1) { 2161
2119 p->swap_map[offset] = encode_swapmap(count + 1, 2162 if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
2120 has_cache); 2163 count += usage;
2121 result = 0; 2164 else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
2122 } else if (count <= SWAP_MAP_MAX) { 2165 err = -EINVAL;
2123 if (swap_overflow++ < 5) 2166 else if (swap_count_continued(p, offset, count))
2124 printk(KERN_WARNING 2167 count = COUNT_CONTINUED;
2125 "swap_dup: swap entry overflow\n"); 2168 else
2126 p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, 2169 err = -ENOMEM;
2127 has_cache);
2128 result = 0;
2129 }
2130 } else 2170 } else
2131 result = -ENOENT; /* unused swap entry */ 2171 err = -ENOENT; /* unused swap entry */
2172
2173 p->swap_map[offset] = count | has_cache;
2174
2132unlock_out: 2175unlock_out:
2133 spin_unlock(&swap_lock); 2176 spin_unlock(&swap_lock);
2134out: 2177out:
2135 return result; 2178 return err;
2136 2179
2137bad_file: 2180bad_file:
2138 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); 2181 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
2139 goto out; 2182 goto out;
2140} 2183}
2184
2185/*
2186 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
2187 * (in which case its reference count is never incremented).
2188 */
2189void swap_shmem_alloc(swp_entry_t entry)
2190{
2191 __swap_duplicate(entry, SWAP_MAP_SHMEM);
2192}
2193
2141/* 2194/*
2142 * increase reference count of swap entry by 1. 2195 * Increase reference count of swap entry by 1.
2196 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
2197 * but could not be atomically allocated. Returns 0, just as if it succeeded,
2198 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
2199 * might occur if a page table entry has got corrupted.
2143 */ 2200 */
2144void swap_duplicate(swp_entry_t entry) 2201int swap_duplicate(swp_entry_t entry)
2145{ 2202{
2146 __swap_duplicate(entry, SWAP_MAP); 2203 int err = 0;
2204
2205 while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
2206 err = add_swap_count_continuation(entry, GFP_ATOMIC);
2207 return err;
2147} 2208}
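Per the comment above, swap_duplicate() already retries __swap_duplicate() with a GFP_ATOMIC continuation page, so -ENOMEM only reaches the caller when that atomic allocation fails; the intended recovery is to drop any spinlocks, call add_swap_count_continuation(entry, GFP_KERNEL), and try again. A hypothetical caller illustrating that contract (the -EAGAIN convention and the helper itself are invented, not part of this series):

/* Hypothetical: duplicate a swap entry noticed while copying a pte. */
static int demo_dup_swap_entry(swp_entry_t entry, spinlock_t *ptl)
{
	if (swap_duplicate(entry) < 0) {	/* only -ENOMEM is reported */
		spin_unlock(ptl);		/* cannot sleep under the lock */
		if (add_swap_count_continuation(entry, GFP_KERNEL))
			return -ENOMEM;		/* still no memory: give up */
		spin_lock(ptl);
		return -EAGAIN;			/* invented convention: caller retries */
	}
	return 0;
}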
2148 2209
2149/* 2210/*
2150 * @entry: swap entry for which we allocate swap cache. 2211 * @entry: swap entry for which we allocate swap cache.
2151 * 2212 *
2152 * Called when allocating swap cache for exising swap entry, 2213 * Called when allocating swap cache for existing swap entry,
2153 * This can return error codes. Returns 0 at success. 2214 * This can return error codes. Returns 0 at success.
2154 * -EBUSY means there is a swap cache. 2215 * -EBUSY means there is a swap cache.
2155 * Note: return code is different from swap_duplicate(). 2216 * Note: return code is different from swap_duplicate().
2156 */ 2217 */
2157int swapcache_prepare(swp_entry_t entry) 2218int swapcache_prepare(swp_entry_t entry)
2158{ 2219{
2159 return __swap_duplicate(entry, SWAP_CACHE); 2220 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2160}
2161
2162
2163struct swap_info_struct *
2164get_swap_info_struct(unsigned type)
2165{
2166 return &swap_info[type];
2167} 2221}
2168 2222
2169/* 2223/*
@@ -2181,7 +2235,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2181 if (!our_page_cluster) /* no readahead */ 2235 if (!our_page_cluster) /* no readahead */
2182 return 0; 2236 return 0;
2183 2237
2184 si = &swap_info[swp_type(entry)]; 2238 si = swap_info[swp_type(entry)];
2185 target = swp_offset(entry); 2239 target = swp_offset(entry);
2186 base = (target >> our_page_cluster) << our_page_cluster; 2240 base = (target >> our_page_cluster) << our_page_cluster;
2187 end = base + (1 << our_page_cluster); 2241 end = base + (1 << our_page_cluster);
@@ -2217,3 +2271,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2217 *offset = ++toff; 2271 *offset = ++toff;
2218 return nr_pages? ++nr_pages: 0; 2272 return nr_pages? ++nr_pages: 0;
2219} 2273}
2274
2275/*
2276 * add_swap_count_continuation - called when a swap count is duplicated
2277 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
2278 * page of the original vmalloc'ed swap_map, to hold the continuation count
2279 * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
2280 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
2281 *
2282 * These continuation pages are seldom referenced: the common paths all work
2283 * on the original swap_map, only referring to a continuation page when the
2284 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
2285 *
2286 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
2287 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
2288 * can be called after dropping locks.
2289 */
2290int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2291{
2292 struct swap_info_struct *si;
2293 struct page *head;
2294 struct page *page;
2295 struct page *list_page;
2296 pgoff_t offset;
2297 unsigned char count;
2298
2299 /*
2300 * When debugging, it's easier to use __GFP_ZERO here; but it's better
2301 * for latency not to zero a page while GFP_ATOMIC and holding locks.
2302 */
2303 page = alloc_page(gfp_mask | __GFP_HIGHMEM);
2304
2305 si = swap_info_get(entry);
2306 if (!si) {
2307 /*
2308 * An acceptable race has occurred since the failing
2309 * __swap_duplicate(): the swap entry has been freed,
2310 * perhaps even the whole swap_map cleared for swapoff.
2311 */
2312 goto outer;
2313 }
2314
2315 offset = swp_offset(entry);
2316 count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
2317
2318 if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
2319 /*
2320 * The higher the swap count, the more likely it is that tasks
2321 * will race to add swap count continuation: we need to avoid
2322 * over-provisioning.
2323 */
2324 goto out;
2325 }
2326
2327 if (!page) {
2328 spin_unlock(&swap_lock);
2329 return -ENOMEM;
2330 }
2331
2332 /*
2333 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
2334 * no architecture is using highmem pages for kernel pagetables: so it
2335 * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
2336 */
2337 head = vmalloc_to_page(si->swap_map + offset);
2338 offset &= ~PAGE_MASK;
2339
2340 /*
2341 * Page allocation does not initialize the page's lru field,
2342 * but it does always reset its private field.
2343 */
2344 if (!page_private(head)) {
2345 BUG_ON(count & COUNT_CONTINUED);
2346 INIT_LIST_HEAD(&head->lru);
2347 set_page_private(head, SWP_CONTINUED);
2348 si->flags |= SWP_CONTINUED;
2349 }
2350
2351 list_for_each_entry(list_page, &head->lru, lru) {
2352 unsigned char *map;
2353
2354 /*
2355 * If the previous map said no continuation, but we've found
2356 * a continuation page, free our allocation and use this one.
2357 */
2358 if (!(count & COUNT_CONTINUED))
2359 goto out;
2360
2361 map = kmap_atomic(list_page, KM_USER0) + offset;
2362 count = *map;
2363 kunmap_atomic(map, KM_USER0);
2364
2365 /*
2366 * If this continuation count now has some space in it,
2367 * free our allocation and use this one.
2368 */
2369 if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
2370 goto out;
2371 }
2372
2373 list_add_tail(&page->lru, &head->lru);
2374 page = NULL; /* now it's attached, don't free it */
2375out:
2376 spin_unlock(&swap_lock);
2377outer:
2378 if (page)
2379 __free_page(page);
2380 return 0;
2381}
2382
2383/*
2384 * swap_count_continued - when the original swap_map count is incremented
2385 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
2386 * into, carry if so, or else fail until a new continuation page is allocated;
2387 * when the original swap_map count is decremented from 0 with continuation,
2388 * borrow from the continuation and report whether it still holds more.
2389 * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
2390 */
2391static bool swap_count_continued(struct swap_info_struct *si,
2392 pgoff_t offset, unsigned char count)
2393{
2394 struct page *head;
2395 struct page *page;
2396 unsigned char *map;
2397
2398 head = vmalloc_to_page(si->swap_map + offset);
2399 if (page_private(head) != SWP_CONTINUED) {
2400 BUG_ON(count & COUNT_CONTINUED);
2401 return false; /* need to add count continuation */
2402 }
2403
2404 offset &= ~PAGE_MASK;
2405 page = list_entry(head->lru.next, struct page, lru);
2406 map = kmap_atomic(page, KM_USER0) + offset;
2407
2408 if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
2409 goto init_map; /* jump over SWAP_CONT_MAX checks */
2410
2411 if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
2412 /*
2413 * Think of how you add 1 to 999
2414 */
2415 while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
2416 kunmap_atomic(map, KM_USER0);
2417 page = list_entry(page->lru.next, struct page, lru);
2418 BUG_ON(page == head);
2419 map = kmap_atomic(page, KM_USER0) + offset;
2420 }
2421 if (*map == SWAP_CONT_MAX) {
2422 kunmap_atomic(map, KM_USER0);
2423 page = list_entry(page->lru.next, struct page, lru);
2424 if (page == head)
2425 return false; /* add count continuation */
2426 map = kmap_atomic(page, KM_USER0) + offset;
2427init_map: *map = 0; /* we didn't zero the page */
2428 }
2429 *map += 1;
2430 kunmap_atomic(map, KM_USER0);
2431 page = list_entry(page->lru.prev, struct page, lru);
2432 while (page != head) {
2433 map = kmap_atomic(page, KM_USER0) + offset;
2434 *map = COUNT_CONTINUED;
2435 kunmap_atomic(map, KM_USER0);
2436 page = list_entry(page->lru.prev, struct page, lru);
2437 }
2438 return true; /* incremented */
2439
2440 } else { /* decrementing */
2441 /*
2442 * Think of how you subtract 1 from 1000
2443 */
2444 BUG_ON(count != COUNT_CONTINUED);
2445 while (*map == COUNT_CONTINUED) {
2446 kunmap_atomic(map, KM_USER0);
2447 page = list_entry(page->lru.next, struct page, lru);
2448 BUG_ON(page == head);
2449 map = kmap_atomic(page, KM_USER0) + offset;
2450 }
2451 BUG_ON(*map == 0);
2452 *map -= 1;
2453 if (*map == 0)
2454 count = 0;
2455 kunmap_atomic(map, KM_USER0);
2456 page = list_entry(page->lru.prev, struct page, lru);
2457 while (page != head) {
2458 map = kmap_atomic(page, KM_USER0) + offset;
2459 *map = SWAP_CONT_MAX | count;
2460 count = COUNT_CONTINUED;
2461 kunmap_atomic(map, KM_USER0);
2462 page = list_entry(page->lru.prev, struct page, lru);
2463 }
2464 return count == COUNT_CONTINUED;
2465 }
2466}
2467
2468/*
2469 * free_swap_count_continuations - swapoff free all the continuation pages
2470 * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
2471 */
2472static void free_swap_count_continuations(struct swap_info_struct *si)
2473{
2474 pgoff_t offset;
2475
2476 for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
2477 struct page *head;
2478 head = vmalloc_to_page(si->swap_map + offset);
2479 if (page_private(head)) {
2480 struct list_head *this, *next;
2481 list_for_each_safe(this, next, &head->lru) {
2482 struct page *page;
2483 page = list_entry(this, struct page, lru);
2484 list_del(this);
2485 __free_page(page);
2486 }
2487 }
2488 }
2489}
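Taken together, the routines above store a large swap count as a little-endian string of digits: the low digit lives in swap_map[offset] and saturates at SWAP_MAP_MAX, while each continuation page contributes one further digit of up to SWAP_CONT_MAX for that offset, with COUNT_CONTINUED flagging that a higher digit exists (the "add 1 to 999" comment in swap_count_continued()). A standalone model of the carry and borrow arithmetic over a plain digit array; the digit width is illustrative, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define DEMO_DIGIT_MAX	9	/* illustrative stand-in for SWAP_CONT_MAX */
#define DEMO_NDIGITS	4	/* digit 0 models swap_map, the rest continuations */

/* Increment a count held in digits[0] (lowest) .. digits[DEMO_NDIGITS-1]. */
static bool demo_count_inc(unsigned char *digits)
{
	for (int i = 0; i < DEMO_NDIGITS; i++) {
		if (digits[i] < DEMO_DIGIT_MAX) {
			digits[i]++;		/* room here: done */
			return true;
		}
		digits[i] = 0;			/* carry into the next digit */
	}
	return false;				/* would overflow: a new page is needed */
}

/* Decrement a non-zero count (the kernel BUG()s on zero); true while refs remain. */
static bool demo_count_dec(unsigned char *digits)
{
	for (int i = 0; i < DEMO_NDIGITS; i++) {
		if (digits[i] > 0) {
			digits[i]--;		/* borrow resolved at this digit */
			break;
		}
		digits[i] = DEMO_DIGIT_MAX;	/* borrow from the next digit up */
	}
	for (int i = 0; i < DEMO_NDIGITS; i++)
		if (digits[i])
			return true;
	return false;
}

int main(void)
{
	unsigned char digits[DEMO_NDIGITS] = { 9, 9, 0, 0 };	/* count == 99 */

	demo_count_inc(digits);			/* carries twice, like "add 1 to 999" */
	printf("%d%d%d\n", digits[2], digits[1], digits[0]);	/* prints 100 */
	demo_count_dec(digits);					/* back to 99 */
	return 0;
}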
diff --git a/mm/truncate.c b/mm/truncate.c
index 450cebdabfc0..f42675a3615d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/backing-dev.h> 11#include <linux/backing-dev.h>
12#include <linux/gfp.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
13#include <linux/swap.h> 14#include <linux/swap.h>
14#include <linux/module.h> 15#include <linux/module.h>
@@ -272,6 +273,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
272 pagevec_release(&pvec); 273 pagevec_release(&pvec);
273 break; 274 break;
274 } 275 }
276 mem_cgroup_uncharge_start();
275 for (i = 0; i < pagevec_count(&pvec); i++) { 277 for (i = 0; i < pagevec_count(&pvec); i++) {
276 struct page *page = pvec.pages[i]; 278 struct page *page = pvec.pages[i];
277 279
@@ -286,6 +288,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
286 unlock_page(page); 288 unlock_page(page);
287 } 289 }
288 pagevec_release(&pvec); 290 pagevec_release(&pvec);
291 mem_cgroup_uncharge_end();
289 } 292 }
290} 293}
291EXPORT_SYMBOL(truncate_inode_pages_range); 294EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -327,6 +330,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
327 pagevec_init(&pvec, 0); 330 pagevec_init(&pvec, 0);
328 while (next <= end && 331 while (next <= end &&
329 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 332 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
333 mem_cgroup_uncharge_start();
330 for (i = 0; i < pagevec_count(&pvec); i++) { 334 for (i = 0; i < pagevec_count(&pvec); i++) {
331 struct page *page = pvec.pages[i]; 335 struct page *page = pvec.pages[i];
332 pgoff_t index; 336 pgoff_t index;
@@ -354,6 +358,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
354 break; 358 break;
355 } 359 }
356 pagevec_release(&pvec); 360 pagevec_release(&pvec);
361 mem_cgroup_uncharge_end();
357 cond_resched(); 362 cond_resched();
358 } 363 }
359 return ret; 364 return ret;
@@ -428,6 +433,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
428 while (next <= end && !wrapped && 433 while (next <= end && !wrapped &&
429 pagevec_lookup(&pvec, mapping, next, 434 pagevec_lookup(&pvec, mapping, next,
430 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { 435 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
436 mem_cgroup_uncharge_start();
431 for (i = 0; i < pagevec_count(&pvec); i++) { 437 for (i = 0; i < pagevec_count(&pvec); i++) {
432 struct page *page = pvec.pages[i]; 438 struct page *page = pvec.pages[i];
433 pgoff_t page_index; 439 pgoff_t page_index;
@@ -477,6 +483,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
477 unlock_page(page); 483 unlock_page(page);
478 } 484 }
479 pagevec_release(&pvec); 485 pagevec_release(&pvec);
486 mem_cgroup_uncharge_end();
480 cond_resched(); 487 cond_resched();
481 } 488 }
482 return ret; 489 return ret;
@@ -490,7 +497,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
490 * Any pages which are found to be mapped into pagetables are unmapped prior to 497 * Any pages which are found to be mapped into pagetables are unmapped prior to
491 * invalidation. 498 * invalidation.
492 * 499 *
493 * Returns -EIO if any pages could not be invalidated. 500 * Returns -EBUSY if any pages could not be invalidated.
494 */ 501 */
495int invalidate_inode_pages2(struct address_space *mapping) 502int invalidate_inode_pages2(struct address_space *mapping)
496{ 503{
@@ -516,22 +523,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
516 */ 523 */
517void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) 524void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
518{ 525{
519 if (new < old) { 526 struct address_space *mapping = inode->i_mapping;
520 struct address_space *mapping = inode->i_mapping; 527
521 528 /*
522 /* 529 * unmap_mapping_range is called twice, first simply for
523 * unmap_mapping_range is called twice, first simply for 530 * efficiency so that truncate_inode_pages does fewer
524 * efficiency so that truncate_inode_pages does fewer 531 * single-page unmaps. However after this first call, and
525 * single-page unmaps. However after this first call, and 532 * before truncate_inode_pages finishes, it is possible for
526 * before truncate_inode_pages finishes, it is possible for 533 * private pages to be COWed, which remain after
527 * private pages to be COWed, which remain after 534 * truncate_inode_pages finishes, hence the second
528 * truncate_inode_pages finishes, hence the second 535 * unmap_mapping_range call must be made for correctness.
529 * unmap_mapping_range call must be made for correctness. 536 */
530 */ 537 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
531 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); 538 truncate_inode_pages(mapping, new);
532 truncate_inode_pages(mapping, new); 539 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
533 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
534 }
535} 540}
536EXPORT_SYMBOL(truncate_pagecache); 541EXPORT_SYMBOL(truncate_pagecache);
537 542
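
The mem_cgroup_uncharge_start()/mem_cgroup_uncharge_end() pairs added above bracket each pagevec loop so that the per-page uncharges inside the loop are batched and the memcg counters are touched once per pagevec rather than once per page. A minimal userspace sketch of that bracketing pattern, with hypothetical batch_begin()/uncharge_one()/batch_end() helpers standing in for the memcg API:

#include <stdio.h>

/* Hypothetical stand-ins for mem_cgroup_uncharge_start()/_end(): while a
 * batch is open, individual uncharges only accumulate locally; the shared
 * counter is updated once when the batch closes. */
static int batch_open;
static long pending;
static long charged = 1000;

static void batch_begin(void) { batch_open = 1; }

static void uncharge_one(void)
{
        if (batch_open)
                pending++;      /* defer: no shared-counter update yet */
        else
                charged--;      /* unbatched path: update immediately */
}

static void batch_end(void)
{
        charged -= pending;     /* one update for the whole pagevec */
        pending = 0;
        batch_open = 0;
}

int main(void)
{
        /* One "pagevec" worth of pages released under a single bracket. */
        batch_begin();
        for (int i = 0; i < 14; i++)
                uncharge_one();
        batch_end();

        printf("charged after batch: %ld\n", charged);  /* 986 */
        return 0;
}
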
diff --git a/mm/util.c b/mm/util.c
index 7c35ad95f927..f5712e8964be 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -186,6 +186,27 @@ void kzfree(const void *p)
186} 186}
187EXPORT_SYMBOL(kzfree); 187EXPORT_SYMBOL(kzfree);
188 188
189int kern_ptr_validate(const void *ptr, unsigned long size)
190{
191 unsigned long addr = (unsigned long)ptr;
192 unsigned long min_addr = PAGE_OFFSET;
193 unsigned long align_mask = sizeof(void *) - 1;
194
195 if (unlikely(addr < min_addr))
196 goto out;
197 if (unlikely(addr > (unsigned long)high_memory - size))
198 goto out;
199 if (unlikely(addr & align_mask))
200 goto out;
201 if (unlikely(!kern_addr_valid(addr)))
202 goto out;
203 if (unlikely(!kern_addr_valid(addr + size - 1)))
204 goto out;
205 return 1;
206out:
207 return 0;
208}
209
189/* 210/*
190 * strndup_user - duplicate an existing string from user space 211 * strndup_user - duplicate an existing string from user space
191 * @s: The string to duplicate 212 * @s: The string to duplicate
@@ -220,7 +241,7 @@ char *strndup_user(const char __user *s, long n)
220} 241}
221EXPORT_SYMBOL(strndup_user); 242EXPORT_SYMBOL(strndup_user);
222 243
223#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT 244#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
224void arch_pick_mmap_layout(struct mm_struct *mm) 245void arch_pick_mmap_layout(struct mm_struct *mm)
225{ 246{
226 mm->mmap_base = TASK_UNMAPPED_BASE; 247 mm->mmap_base = TASK_UNMAPPED_BASE;
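
The new kern_ptr_validate() helper rejects a pointer when it lies below the start of the kernel mapping, when the object would extend past high_memory, when it is not pointer-aligned, or when either end fails kern_addr_valid(). A hedged userspace model of the same range-and-alignment arithmetic; MIN_ADDR and MAX_ADDR are made-up bounds standing in for PAGE_OFFSET and high_memory, and the kern_addr_valid() checks are omitted:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical bounds standing in for PAGE_OFFSET and high_memory. */
#define MIN_ADDR 0x1000u
#define MAX_ADDR 0x100000u

/* Returns 1 if [addr, addr + size) lies inside the window and is
 * pointer-aligned, 0 otherwise. */
static int ptr_validate(uintptr_t addr, size_t size)
{
        uintptr_t align_mask = sizeof(void *) - 1;

        if (addr < MIN_ADDR)
                return 0;               /* below the mapped region */
        if (addr > MAX_ADDR - size)
                return 0;               /* object would run past the end */
        if (addr & align_mask)
                return 0;               /* not pointer-aligned */
        return 1;
}

int main(void)
{
        printf("%d\n", ptr_validate(0x2000, 64));       /* 1: valid */
        printf("%d\n", ptr_validate(0x2003, 64));       /* 0: misaligned */
        printf("%d\n", ptr_validate(0xfffe0, 0x100));   /* 0: overruns end */
        return 0;
}
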
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0f551a4a44cd..ae007462b7f6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -509,6 +509,9 @@ static unsigned long lazy_max_pages(void)
509 509
510static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); 510static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
511 511
512/* for per-CPU blocks */
513static void purge_fragmented_blocks_allcpus(void);
514
512/* 515/*
513 * Purges all lazily-freed vmap areas. 516 * Purges all lazily-freed vmap areas.
514 * 517 *
@@ -539,6 +542,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
539 } else 542 } else
540 spin_lock(&purge_lock); 543 spin_lock(&purge_lock);
541 544
545 if (sync)
546 purge_fragmented_blocks_allcpus();
547
542 rcu_read_lock(); 548 rcu_read_lock();
543 list_for_each_entry_rcu(va, &vmap_area_list, list) { 549 list_for_each_entry_rcu(va, &vmap_area_list, list) {
544 if (va->flags & VM_LAZY_FREE) { 550 if (va->flags & VM_LAZY_FREE) {
@@ -555,10 +561,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
555 } 561 }
556 rcu_read_unlock(); 562 rcu_read_unlock();
557 563
558 if (nr) { 564 if (nr)
559 BUG_ON(nr > atomic_read(&vmap_lazy_nr));
560 atomic_sub(nr, &vmap_lazy_nr); 565 atomic_sub(nr, &vmap_lazy_nr);
561 }
562 566
563 if (nr || force_flush) 567 if (nr || force_flush)
564 flush_tlb_kernel_range(*start, *end); 568 flush_tlb_kernel_range(*start, *end);
@@ -669,8 +673,6 @@ static bool vmap_initialized __read_mostly = false;
669struct vmap_block_queue { 673struct vmap_block_queue {
670 spinlock_t lock; 674 spinlock_t lock;
671 struct list_head free; 675 struct list_head free;
672 struct list_head dirty;
673 unsigned int nr_dirty;
674}; 676};
675 677
676struct vmap_block { 678struct vmap_block {
@@ -680,10 +682,9 @@ struct vmap_block {
680 unsigned long free, dirty; 682 unsigned long free, dirty;
681 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); 683 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
682 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 684 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
683 union { 685 struct list_head free_list;
684 struct list_head free_list; 686 struct rcu_head rcu_head;
685 struct rcu_head rcu_head; 687 struct list_head purge;
686 };
687}; 688};
688 689
689/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ 690/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
@@ -759,9 +760,9 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
759 vbq = &get_cpu_var(vmap_block_queue); 760 vbq = &get_cpu_var(vmap_block_queue);
760 vb->vbq = vbq; 761 vb->vbq = vbq;
761 spin_lock(&vbq->lock); 762 spin_lock(&vbq->lock);
762 list_add(&vb->free_list, &vbq->free); 763 list_add_rcu(&vb->free_list, &vbq->free);
763 spin_unlock(&vbq->lock); 764 spin_unlock(&vbq->lock);
764 put_cpu_var(vmap_cpu_blocks); 765 put_cpu_var(vmap_block_queue);
765 766
766 return vb; 767 return vb;
767} 768}
@@ -778,8 +779,6 @@ static void free_vmap_block(struct vmap_block *vb)
778 struct vmap_block *tmp; 779 struct vmap_block *tmp;
779 unsigned long vb_idx; 780 unsigned long vb_idx;
780 781
781 BUG_ON(!list_empty(&vb->free_list));
782
783 vb_idx = addr_to_vb_idx(vb->va->va_start); 782 vb_idx = addr_to_vb_idx(vb->va->va_start);
784 spin_lock(&vmap_block_tree_lock); 783 spin_lock(&vmap_block_tree_lock);
785 tmp = radix_tree_delete(&vmap_block_tree, vb_idx); 784 tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
@@ -790,12 +789,61 @@ static void free_vmap_block(struct vmap_block *vb)
790 call_rcu(&vb->rcu_head, rcu_free_vb); 789 call_rcu(&vb->rcu_head, rcu_free_vb);
791} 790}
792 791
792static void purge_fragmented_blocks(int cpu)
793{
794 LIST_HEAD(purge);
795 struct vmap_block *vb;
796 struct vmap_block *n_vb;
797 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
798
799 rcu_read_lock();
800 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
801
802 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
803 continue;
804
805 spin_lock(&vb->lock);
806 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
807 vb->free = 0; /* prevent further allocs after releasing lock */
808 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
809 bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
810 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
811 spin_lock(&vbq->lock);
812 list_del_rcu(&vb->free_list);
813 spin_unlock(&vbq->lock);
814 spin_unlock(&vb->lock);
815 list_add_tail(&vb->purge, &purge);
816 } else
817 spin_unlock(&vb->lock);
818 }
819 rcu_read_unlock();
820
821 list_for_each_entry_safe(vb, n_vb, &purge, purge) {
822 list_del(&vb->purge);
823 free_vmap_block(vb);
824 }
825}
826
827static void purge_fragmented_blocks_thiscpu(void)
828{
829 purge_fragmented_blocks(smp_processor_id());
830}
831
832static void purge_fragmented_blocks_allcpus(void)
833{
834 int cpu;
835
836 for_each_possible_cpu(cpu)
837 purge_fragmented_blocks(cpu);
838}
839
793static void *vb_alloc(unsigned long size, gfp_t gfp_mask) 840static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
794{ 841{
795 struct vmap_block_queue *vbq; 842 struct vmap_block_queue *vbq;
796 struct vmap_block *vb; 843 struct vmap_block *vb;
797 unsigned long addr = 0; 844 unsigned long addr = 0;
798 unsigned int order; 845 unsigned int order;
846 int purge = 0;
799 847
800 BUG_ON(size & ~PAGE_MASK); 848 BUG_ON(size & ~PAGE_MASK);
801 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 849 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
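
purge_fragmented_blocks() walks the per-CPU free list under RCU, does a cheap unlocked test for blocks that are fully fragmented but idle, re-does the same test under the block lock, and only then moves the block onto a private purge list. A single-threaded sketch of that check-then-recheck shape, with the RCU walk and the real spinlocks left out:

#include <stdio.h>

#define NBITS 64        /* stands in for VMAP_BBMAP_BITS */

struct block {
        unsigned long free;     /* bits still allocatable */
        unsigned long dirty;    /* bits freed and awaiting a TLB flush */
};

/* A block is "fragmented but idle" when every bit is either free or dirty
 * (nothing is still in use) yet not everything is dirty, so the normal
 * free path would never reclaim it. */
static int purgeable(const struct block *b)
{
        return b->free + b->dirty == NBITS && b->dirty != NBITS;
}

/* Mirrors the shape of purge_fragmented_blocks(): an optimistic unlocked
 * check first, then the same test again once the block is "locked" before
 * it is pulled off the free list. The locking itself is omitted here. */
static int try_purge(struct block *b)
{
        if (!purgeable(b))              /* cheap racy pre-check */
                return 0;
        /* ... take b->lock here in the kernel ... */
        if (!purgeable(b))              /* re-check under the lock */
                return 0;
        b->free = 0;                    /* no further allocations */
        b->dirty = NBITS;               /* never considered for purging again */
        return 1;                       /* caller moves it to a private purge list */
}

int main(void)
{
        struct block idle = { .free = 40, .dirty = 24 };        /* 40 + 24 == 64 */
        struct block busy = { .free = 10, .dirty = 24 };        /* bits still in use */

        printf("idle purged: %d\n", try_purge(&idle));  /* 1 */
        printf("busy purged: %d\n", try_purge(&busy));  /* 0 */
        return 0;
}
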
@@ -808,25 +856,39 @@ again:
808 int i; 856 int i;
809 857
810 spin_lock(&vb->lock); 858 spin_lock(&vb->lock);
859 if (vb->free < 1UL << order)
860 goto next;
861
811 i = bitmap_find_free_region(vb->alloc_map, 862 i = bitmap_find_free_region(vb->alloc_map,
812 VMAP_BBMAP_BITS, order); 863 VMAP_BBMAP_BITS, order);
813 864
814 if (i >= 0) { 865 if (i < 0) {
815 addr = vb->va->va_start + (i << PAGE_SHIFT); 866 if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
816 BUG_ON(addr_to_vb_idx(addr) != 867 /* fragmented and no outstanding allocations */
817 addr_to_vb_idx(vb->va->va_start)); 868 BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
818 vb->free -= 1UL << order; 869 purge = 1;
819 if (vb->free == 0) {
820 spin_lock(&vbq->lock);
821 list_del_init(&vb->free_list);
822 spin_unlock(&vbq->lock);
823 } 870 }
824 spin_unlock(&vb->lock); 871 goto next;
825 break; 872 }
873 addr = vb->va->va_start + (i << PAGE_SHIFT);
874 BUG_ON(addr_to_vb_idx(addr) !=
875 addr_to_vb_idx(vb->va->va_start));
876 vb->free -= 1UL << order;
877 if (vb->free == 0) {
878 spin_lock(&vbq->lock);
879 list_del_rcu(&vb->free_list);
880 spin_unlock(&vbq->lock);
826 } 881 }
827 spin_unlock(&vb->lock); 882 spin_unlock(&vb->lock);
883 break;
884next:
885 spin_unlock(&vb->lock);
828 } 886 }
829 put_cpu_var(vmap_cpu_blocks); 887
888 if (purge)
889 purge_fragmented_blocks_thiscpu();
890
891 put_cpu_var(vmap_block_queue);
830 rcu_read_unlock(); 892 rcu_read_unlock();
831 893
832 if (!addr) { 894 if (!addr) {
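
In the reworked vb_alloc() a block is skipped outright when vb->free is smaller than 1 << order, and a fully fragmented block triggers a purge instead of an endless search. The search itself still relies on bitmap_find_free_region(), which hands out a naturally aligned run of 2^order bits. A small sketch of that allocation rule over a single 64-bit word; the kernel bitmap is arbitrarily long and the helper name here is invented:

#include <stdint.h>
#include <stdio.h>

/* Find a free, naturally aligned run of (1 << order) bits in *map, mark it
 * allocated, and return the index of its first bit; -1 if none is free. */
static int find_free_region64(uint64_t *map, unsigned int order)
{
        unsigned int nbits = 1u << order;
        uint64_t mask = (nbits == 64) ? ~0ull : ((1ull << nbits) - 1);

        for (unsigned int pos = 0; pos < 64; pos += nbits) {
                if ((*map >> pos) & mask)
                        continue;               /* some bit in the run is taken */
                *map |= mask << pos;            /* claim the whole run */
                return (int)pos;
        }
        return -1;
}

int main(void)
{
        uint64_t map = 0;

        printf("order-2 at %d\n", find_free_region64(&map, 2));  /* 0 */
        printf("order-0 at %d\n", find_free_region64(&map, 0));  /* 4 */
        printf("order-3 at %d\n", find_free_region64(&map, 3));  /* 8 */
        printf("map = %#llx\n", (unsigned long long)map);        /* 0xff1f */
        return 0;
}
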
@@ -862,11 +924,11 @@ static void vb_free(const void *addr, unsigned long size)
862 BUG_ON(!vb); 924 BUG_ON(!vb);
863 925
864 spin_lock(&vb->lock); 926 spin_lock(&vb->lock);
865 bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); 927 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
866 928
867 vb->dirty += 1UL << order; 929 vb->dirty += 1UL << order;
868 if (vb->dirty == VMAP_BBMAP_BITS) { 930 if (vb->dirty == VMAP_BBMAP_BITS) {
869 BUG_ON(vb->free || !list_empty(&vb->free_list)); 931 BUG_ON(vb->free);
870 spin_unlock(&vb->lock); 932 spin_unlock(&vb->lock);
871 free_vmap_block(vb); 933 free_vmap_block(vb);
872 } else 934 } else
@@ -1035,8 +1097,6 @@ void __init vmalloc_init(void)
1035 vbq = &per_cpu(vmap_block_queue, i); 1097 vbq = &per_cpu(vmap_block_queue, i);
1036 spin_lock_init(&vbq->lock); 1098 spin_lock_init(&vbq->lock);
1037 INIT_LIST_HEAD(&vbq->free); 1099 INIT_LIST_HEAD(&vbq->free);
1038 INIT_LIST_HEAD(&vbq->dirty);
1039 vbq->nr_dirty = 0;
1040 } 1100 }
1041 1101
1042 /* Import existing vmlist entries. */ 1102 /* Import existing vmlist entries. */
@@ -1411,6 +1471,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1411{ 1471{
1412 struct page **pages; 1472 struct page **pages;
1413 unsigned int nr_pages, array_size, i; 1473 unsigned int nr_pages, array_size, i;
1474 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
1414 1475
1415 nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; 1476 nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
1416 array_size = (nr_pages * sizeof(struct page *)); 1477 array_size = (nr_pages * sizeof(struct page *));
@@ -1418,13 +1479,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1418 area->nr_pages = nr_pages; 1479 area->nr_pages = nr_pages;
1419 /* Please note that the recursion is strictly bounded. */ 1480 /* Please note that the recursion is strictly bounded. */
1420 if (array_size > PAGE_SIZE) { 1481 if (array_size > PAGE_SIZE) {
1421 pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO, 1482 pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
1422 PAGE_KERNEL, node, caller); 1483 PAGE_KERNEL, node, caller);
1423 area->flags |= VM_VPAGES; 1484 area->flags |= VM_VPAGES;
1424 } else { 1485 } else {
1425 pages = kmalloc_node(array_size, 1486 pages = kmalloc_node(array_size, nested_gfp, node);
1426 (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO,
1427 node);
1428 } 1487 }
1429 area->pages = pages; 1488 area->pages = pages;
1430 area->caller = caller; 1489 area->caller = caller;
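
The nested_gfp line computes the allocation mask for the page-pointer array once: keep only the reclaim-related bits of the caller's gfp_mask and force __GFP_ZERO, so the kmalloc path and the recursive __vmalloc_node path behave the same way. A sketch of that flag arithmetic with illustrative flag values only; the real __GFP_* constants and GFP_RECLAIM_MASK differ:

#include <stdio.h>

/* Illustrative flag values only; the real __GFP_* bits are different. */
#define GFP_WAIT        0x01u
#define GFP_IO          0x02u
#define GFP_FS          0x04u
#define GFP_HIGH        0x08u   /* not reclaim-related in this model */
#define GFP_ZERO        0x10u
#define GFP_RECLAIM_BITS (GFP_WAIT | GFP_IO | GFP_FS)

int main(void)
{
        unsigned int gfp_mask = GFP_WAIT | GFP_IO | GFP_HIGH;

        /* Same shape as: nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
         * reclaim behaviour is inherited, everything else dropped, zeroing added. */
        unsigned int nested_gfp = (gfp_mask & GFP_RECLAIM_BITS) | GFP_ZERO;

        printf("gfp_mask   = %#x\n", gfp_mask);         /* 0xb  */
        printf("nested_gfp = %#x\n", nested_gfp);       /* 0x13 */
        return 0;
}
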
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 777af57fd8c8..3ff3311447f5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -13,7 +13,7 @@
13 13
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h> 16#include <linux/gfp.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18#include <linux/swap.h> 18#include <linux/swap.h>
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
@@ -55,6 +55,11 @@ struct scan_control {
55 /* Number of pages freed so far during a call to shrink_zones() */ 55 /* Number of pages freed so far during a call to shrink_zones() */
56 unsigned long nr_reclaimed; 56 unsigned long nr_reclaimed;
57 57
58 /* How many pages shrink_list() should reclaim */
59 unsigned long nr_to_reclaim;
60
61 unsigned long hibernation_mode;
62
58 /* This context's GFP mask */ 63 /* This context's GFP mask */
59 gfp_t gfp_mask; 64 gfp_t gfp_mask;
60 65
@@ -66,12 +71,6 @@ struct scan_control {
66 /* Can pages be swapped as part of reclaim? */ 71 /* Can pages be swapped as part of reclaim? */
67 int may_swap; 72 int may_swap;
68 73
69 /* This context's SWAP_CLUSTER_MAX. If freeing memory for
70 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
71 * In this context, it doesn't matter that we scan the
72 * whole list at once. */
73 int swap_cluster_max;
74
75 int swappiness; 74 int swappiness;
76 75
77 int all_unreclaimable; 76 int all_unreclaimable;
@@ -263,27 +262,6 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
263 return ret; 262 return ret;
264} 263}
265 264
266/* Called without lock on whether page is mapped, so answer is unstable */
267static inline int page_mapping_inuse(struct page *page)
268{
269 struct address_space *mapping;
270
271 /* Page is in somebody's page tables. */
272 if (page_mapped(page))
273 return 1;
274
275 /* Be more reluctant to reclaim swapcache than pagecache */
276 if (PageSwapCache(page))
277 return 1;
278
279 mapping = page_mapping(page);
280 if (!mapping)
281 return 0;
282
283 /* File is mmap'd by somebody? */
284 return mapping_mapped(mapping);
285}
286
287static inline int is_page_cache_freeable(struct page *page) 265static inline int is_page_cache_freeable(struct page *page)
288{ 266{
289 /* 267 /*
@@ -358,7 +336,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
358 * stalls if we need to run get_block(). We could test 336 * stalls if we need to run get_block(). We could test
359 * PagePrivate for that. 337 * PagePrivate for that.
360 * 338 *
361 * If this process is currently in generic_file_write() against 339 * If this process is currently in __generic_file_aio_write() against
362 * this page's queue, we can perform writeback even if that 340 * this page's queue, we can perform writeback even if that
363 * will block. 341 * will block.
364 * 342 *
@@ -580,6 +558,65 @@ redo:
580 put_page(page); /* drop ref from isolate */ 558 put_page(page); /* drop ref from isolate */
581} 559}
582 560
561enum page_references {
562 PAGEREF_RECLAIM,
563 PAGEREF_RECLAIM_CLEAN,
564 PAGEREF_KEEP,
565 PAGEREF_ACTIVATE,
566};
567
568static enum page_references page_check_references(struct page *page,
569 struct scan_control *sc)
570{
571 int referenced_ptes, referenced_page;
572 unsigned long vm_flags;
573
574 referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
575 referenced_page = TestClearPageReferenced(page);
576
577 /* Lumpy reclaim - ignore references */
578 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
579 return PAGEREF_RECLAIM;
580
581 /*
582 * Mlock lost the isolation race with us. Let try_to_unmap()
583 * move the page to the unevictable list.
584 */
585 if (vm_flags & VM_LOCKED)
586 return PAGEREF_RECLAIM;
587
588 if (referenced_ptes) {
589 if (PageAnon(page))
590 return PAGEREF_ACTIVATE;
591 /*
592 * All mapped pages start out with page table
593 * references from the instantiating fault, so we need
594 * to look twice if a mapped file page is used more
595 * than once.
596 *
597 * Mark it and spare it for another trip around the
598 * inactive list. Another page table reference will
599 * lead to its activation.
600 *
601 * Note: the mark is set for activated pages as well
602 * so that recently deactivated but used pages are
603 * quickly recovered.
604 */
605 SetPageReferenced(page);
606
607 if (referenced_page)
608 return PAGEREF_ACTIVATE;
609
610 return PAGEREF_KEEP;
611 }
612
613 /* Reclaim if clean, defer dirty pages to writeback */
614 if (referenced_page)
615 return PAGEREF_RECLAIM_CLEAN;
616
617 return PAGEREF_RECLAIM;
618}
619
583/* 620/*
584 * shrink_page_list() returns the number of reclaimed pages 621 * shrink_page_list() returns the number of reclaimed pages
585 */ 622 */
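
page_check_references() replaces the old referenced/page_mapping_inuse/VM_LOCKED tests with a single decision: ignore references during lumpy reclaim, let mlocked pages go to try_to_unmap(), activate referenced anon pages, give a once-referenced mapped file page one more trip around the inactive list, and otherwise reclaim (deferring dirty pages when the page had PG_referenced set). A compact userspace model of that decision table with simplified boolean inputs; the SetPageReferenced() side effect is left out:

#include <stdio.h>

enum page_references {
        PAGEREF_RECLAIM,
        PAGEREF_RECLAIM_CLEAN,
        PAGEREF_KEEP,
        PAGEREF_ACTIVATE,
};

struct page_model {
        int referenced_ptes;    /* page table references found by the rmap walk */
        int referenced_flag;    /* PG_referenced was set (and is now cleared) */
        int is_anon;            /* anonymous vs. file-backed */
        int vm_locked;          /* an mlocked VMA maps the page */
        int lumpy;              /* order > PAGE_ALLOC_COSTLY_ORDER reclaim */
};

static enum page_references check_references(const struct page_model *p)
{
        if (p->lumpy)
                return PAGEREF_RECLAIM;         /* lumpy reclaim ignores references */
        if (p->vm_locked)
                return PAGEREF_RECLAIM;         /* try_to_unmap() moves it to unevictable */
        if (p->referenced_ptes) {
                if (p->is_anon)
                        return PAGEREF_ACTIVATE;
                if (p->referenced_flag)
                        return PAGEREF_ACTIVATE;        /* second use of a mapped file page */
                return PAGEREF_KEEP;            /* mark it, give it another round */
        }
        if (p->referenced_flag)
                return PAGEREF_RECLAIM_CLEAN;   /* reclaim only if no writeback needed */
        return PAGEREF_RECLAIM;
}

int main(void)
{
        struct page_model anon       = { .referenced_ptes = 1, .is_anon = 1 };
        struct page_model file_once  = { .referenced_ptes = 1 };
        struct page_model file_twice = { .referenced_ptes = 1, .referenced_flag = 1 };

        /* Prints "3 2 3": ACTIVATE, KEEP, ACTIVATE. */
        printf("%d %d %d\n", check_references(&anon),
               check_references(&file_once), check_references(&file_twice));
        return 0;
}
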
@@ -591,16 +628,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
591 struct pagevec freed_pvec; 628 struct pagevec freed_pvec;
592 int pgactivate = 0; 629 int pgactivate = 0;
593 unsigned long nr_reclaimed = 0; 630 unsigned long nr_reclaimed = 0;
594 unsigned long vm_flags;
595 631
596 cond_resched(); 632 cond_resched();
597 633
598 pagevec_init(&freed_pvec, 1); 634 pagevec_init(&freed_pvec, 1);
599 while (!list_empty(page_list)) { 635 while (!list_empty(page_list)) {
636 enum page_references references;
600 struct address_space *mapping; 637 struct address_space *mapping;
601 struct page *page; 638 struct page *page;
602 int may_enter_fs; 639 int may_enter_fs;
603 int referenced;
604 640
605 cond_resched(); 641 cond_resched();
606 642
@@ -642,17 +678,16 @@ static unsigned long shrink_page_list(struct list_head *page_list,
642 goto keep_locked; 678 goto keep_locked;
643 } 679 }
644 680
645 referenced = page_referenced(page, 1, 681 references = page_check_references(page, sc);
646 sc->mem_cgroup, &vm_flags); 682 switch (references) {
647 /* 683 case PAGEREF_ACTIVATE:
648 * In active use or really unfreeable? Activate it.
649 * If page which have PG_mlocked lost isoltation race,
650 * try_to_unmap moves it to unevictable list
651 */
652 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
653 referenced && page_mapping_inuse(page)
654 && !(vm_flags & VM_LOCKED))
655 goto activate_locked; 684 goto activate_locked;
685 case PAGEREF_KEEP:
686 goto keep_locked;
687 case PAGEREF_RECLAIM:
688 case PAGEREF_RECLAIM_CLEAN:
689 ; /* try to reclaim the page below */
690 }
656 691
657 /* 692 /*
658 * Anonymous process memory has backing store? 693 * Anonymous process memory has backing store?
@@ -686,7 +721,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
686 } 721 }
687 722
688 if (PageDirty(page)) { 723 if (PageDirty(page)) {
689 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) 724 if (references == PAGEREF_RECLAIM_CLEAN)
690 goto keep_locked; 725 goto keep_locked;
691 if (!may_enter_fs) 726 if (!may_enter_fs)
692 goto keep_locked; 727 goto keep_locked;
@@ -1132,7 +1167,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1132 unsigned long nr_anon; 1167 unsigned long nr_anon;
1133 unsigned long nr_file; 1168 unsigned long nr_file;
1134 1169
1135 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 1170 nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX,
1136 &page_list, &nr_scan, sc->order, mode, 1171 &page_list, &nr_scan, sc->order, mode,
1137 zone, sc->mem_cgroup, 0, file); 1172 zone, sc->mem_cgroup, 0, file);
1138 1173
@@ -1166,10 +1201,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1166 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); 1201 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
1167 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); 1202 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1168 1203
1169 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; 1204 reclaim_stat->recent_scanned[0] += nr_anon;
1170 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; 1205 reclaim_stat->recent_scanned[1] += nr_file;
1171 reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE];
1172 reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE];
1173 1206
1174 spin_unlock_irq(&zone->lru_lock); 1207 spin_unlock_irq(&zone->lru_lock);
1175 1208
@@ -1353,9 +1386,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1353 continue; 1386 continue;
1354 } 1387 }
1355 1388
1356 /* page_referenced clears PageReferenced */ 1389 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1357 if (page_mapping_inuse(page) &&
1358 page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1359 nr_rotated++; 1390 nr_rotated++;
1360 /* 1391 /*
1361 * Identify referenced, file-backed active pages and 1392 * Identify referenced, file-backed active pages and
@@ -1464,20 +1495,26 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
1464 return low; 1495 return low;
1465} 1496}
1466 1497
1498static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
1499 int file)
1500{
1501 if (file)
1502 return inactive_file_is_low(zone, sc);
1503 else
1504 return inactive_anon_is_low(zone, sc);
1505}
1506
1467static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1507static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1468 struct zone *zone, struct scan_control *sc, int priority) 1508 struct zone *zone, struct scan_control *sc, int priority)
1469{ 1509{
1470 int file = is_file_lru(lru); 1510 int file = is_file_lru(lru);
1471 1511
1472 if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { 1512 if (is_active_lru(lru)) {
1473 shrink_active_list(nr_to_scan, zone, sc, priority, file); 1513 if (inactive_list_is_low(zone, sc, file))
1514 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1474 return 0; 1515 return 0;
1475 } 1516 }
1476 1517
1477 if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) {
1478 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1479 return 0;
1480 }
1481 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); 1518 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1482} 1519}
1483 1520
@@ -1567,15 +1604,14 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1567 * until we collected @swap_cluster_max pages to scan. 1604 * until we collected @swap_cluster_max pages to scan.
1568 */ 1605 */
1569static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, 1606static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1570 unsigned long *nr_saved_scan, 1607 unsigned long *nr_saved_scan)
1571 unsigned long swap_cluster_max)
1572{ 1608{
1573 unsigned long nr; 1609 unsigned long nr;
1574 1610
1575 *nr_saved_scan += nr_to_scan; 1611 *nr_saved_scan += nr_to_scan;
1576 nr = *nr_saved_scan; 1612 nr = *nr_saved_scan;
1577 1613
1578 if (nr >= swap_cluster_max) 1614 if (nr >= SWAP_CLUSTER_MAX)
1579 *nr_saved_scan = 0; 1615 *nr_saved_scan = 0;
1580 else 1616 else
1581 nr = 0; 1617 nr = 0;
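
With swap_cluster_max gone from scan_control, nr_scan_try_batch() always accumulates toward the fixed SWAP_CLUSTER_MAX: small per-priority scan counts pile up in *nr_saved_scan and are released only once a whole batch is available. A standalone sketch of that accumulator, with a hypothetical BATCH constant in place of SWAP_CLUSTER_MAX:

#include <stdio.h>

#define BATCH 32UL      /* stands in for SWAP_CLUSTER_MAX */

/* Accumulate small scan requests; return the whole saved total once it
 * reaches BATCH, and 0 otherwise. */
static unsigned long scan_try_batch(unsigned long nr_to_scan,
                                    unsigned long *nr_saved_scan)
{
        unsigned long nr;

        *nr_saved_scan += nr_to_scan;
        nr = *nr_saved_scan;

        if (nr >= BATCH)
                *nr_saved_scan = 0;
        else
                nr = 0;

        return nr;
}

int main(void)
{
        unsigned long saved = 0;

        /* Five requests of 10 pages each: nothing is released until the
         * running total crosses 32, then the whole total comes out at once. */
        for (int i = 0; i < 5; i++) {
                unsigned long nr = scan_try_batch(10, &saved);
                printf("request 10 -> scan %lu (saved %lu)\n", nr, saved);
        }
        return 0;
}
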
@@ -1594,7 +1630,7 @@ static void shrink_zone(int priority, struct zone *zone,
1594 unsigned long percent[2]; /* anon @ 0; file @ 1 */ 1630 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1595 enum lru_list l; 1631 enum lru_list l;
1596 unsigned long nr_reclaimed = sc->nr_reclaimed; 1632 unsigned long nr_reclaimed = sc->nr_reclaimed;
1597 unsigned long swap_cluster_max = sc->swap_cluster_max; 1633 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1598 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1634 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1599 int noswap = 0; 1635 int noswap = 0;
1600 1636
@@ -1616,15 +1652,15 @@ static void shrink_zone(int priority, struct zone *zone,
1616 scan = (scan * percent[file]) / 100; 1652 scan = (scan * percent[file]) / 100;
1617 } 1653 }
1618 nr[l] = nr_scan_try_batch(scan, 1654 nr[l] = nr_scan_try_batch(scan,
1619 &reclaim_stat->nr_saved_scan[l], 1655 &reclaim_stat->nr_saved_scan[l]);
1620 swap_cluster_max);
1621 } 1656 }
1622 1657
1623 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1658 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1624 nr[LRU_INACTIVE_FILE]) { 1659 nr[LRU_INACTIVE_FILE]) {
1625 for_each_evictable_lru(l) { 1660 for_each_evictable_lru(l) {
1626 if (nr[l]) { 1661 if (nr[l]) {
1627 nr_to_scan = min(nr[l], swap_cluster_max); 1662 nr_to_scan = min_t(unsigned long,
1663 nr[l], SWAP_CLUSTER_MAX);
1628 nr[l] -= nr_to_scan; 1664 nr[l] -= nr_to_scan;
1629 1665
1630 nr_reclaimed += shrink_list(l, nr_to_scan, 1666 nr_reclaimed += shrink_list(l, nr_to_scan,
@@ -1639,8 +1675,7 @@ static void shrink_zone(int priority, struct zone *zone,
1639 * with multiple processes reclaiming pages, the total 1675 * with multiple processes reclaiming pages, the total
1640 * freeing target can get unreasonably large. 1676 * freeing target can get unreasonably large.
1641 */ 1677 */
1642 if (nr_reclaimed > swap_cluster_max && 1678 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
1643 priority < DEF_PRIORITY && !current_is_kswapd())
1644 break; 1679 break;
1645 } 1680 }
1646 1681
@@ -1693,8 +1728,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1693 continue; 1728 continue;
1694 note_zone_scanning_priority(zone, priority); 1729 note_zone_scanning_priority(zone, priority);
1695 1730
1696 if (zone_is_all_unreclaimable(zone) && 1731 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1697 priority != DEF_PRIORITY)
1698 continue; /* Let kswapd poll it */ 1732 continue; /* Let kswapd poll it */
1699 sc->all_unreclaimable = 0; 1733 sc->all_unreclaimable = 0;
1700 } else { 1734 } else {
@@ -1738,6 +1772,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1738 struct zoneref *z; 1772 struct zoneref *z;
1739 struct zone *zone; 1773 struct zone *zone;
1740 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1774 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1775 unsigned long writeback_threshold;
1741 1776
1742 delayacct_freepages_start(); 1777 delayacct_freepages_start();
1743 1778
@@ -1773,7 +1808,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1773 } 1808 }
1774 } 1809 }
1775 total_scanned += sc->nr_scanned; 1810 total_scanned += sc->nr_scanned;
1776 if (sc->nr_reclaimed >= sc->swap_cluster_max) { 1811 if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
1777 ret = sc->nr_reclaimed; 1812 ret = sc->nr_reclaimed;
1778 goto out; 1813 goto out;
1779 } 1814 }
@@ -1785,14 +1820,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1785 * that's undesirable in laptop mode, where we *want* lumpy 1820 * that's undesirable in laptop mode, where we *want* lumpy
1786 * writeout. So in laptop mode, write out the whole world. 1821 * writeout. So in laptop mode, write out the whole world.
1787 */ 1822 */
1788 if (total_scanned > sc->swap_cluster_max + 1823 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
1789 sc->swap_cluster_max / 2) { 1824 if (total_scanned > writeback_threshold) {
1790 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); 1825 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
1791 sc->may_writepage = 1; 1826 sc->may_writepage = 1;
1792 } 1827 }
1793 1828
1794 /* Take a nap, wait for some writeback to complete */ 1829 /* Take a nap, wait for some writeback to complete */
1795 if (sc->nr_scanned && priority < DEF_PRIORITY - 2) 1830 if (!sc->hibernation_mode && sc->nr_scanned &&
1831 priority < DEF_PRIORITY - 2)
1796 congestion_wait(BLK_RW_ASYNC, HZ/10); 1832 congestion_wait(BLK_RW_ASYNC, HZ/10);
1797 } 1833 }
1798 /* top priority shrink_zones still had more to do? don't OOM, then */ 1834 /* top priority shrink_zones still had more to do? don't OOM, then */
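
The writeback heuristic is now phrased against the reclaim target instead of swap_cluster_max: once total_scanned exceeds roughly 1.5x nr_to_reclaim, the flusher threads are woken and the pass is allowed to write pages itself. A short sketch of that threshold arithmetic; wake_flushers() is a hypothetical stand-in for wakeup_flusher_threads():

#include <stdio.h>

static void wake_flushers(unsigned long nr_pages)
{
        printf("waking flusher threads for up to %lu pages\n", nr_pages);
}

/* Returns 1 if this reclaim pass should also start writing dirty pages. */
static int maybe_start_writepage(unsigned long total_scanned,
                                 unsigned long nr_to_reclaim, int laptop_mode)
{
        unsigned long writeback_threshold = nr_to_reclaim + nr_to_reclaim / 2;

        if (total_scanned > writeback_threshold) {
                /* Laptop mode writes out everything queued (0 = all),
                 * otherwise only about as much as was scanned. */
                wake_flushers(laptop_mode ? 0 : total_scanned);
                return 1;       /* corresponds to sc->may_writepage = 1 */
        }
        return 0;
}

int main(void)
{
        printf("%d\n", maybe_start_writepage(40, 32, 0));       /* 0: 40 <= 48 */
        printf("%d\n", maybe_start_writepage(64, 32, 0));       /* 1: 64 > 48  */
        return 0;
}
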
@@ -1831,7 +1867,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1831 struct scan_control sc = { 1867 struct scan_control sc = {
1832 .gfp_mask = gfp_mask, 1868 .gfp_mask = gfp_mask,
1833 .may_writepage = !laptop_mode, 1869 .may_writepage = !laptop_mode,
1834 .swap_cluster_max = SWAP_CLUSTER_MAX, 1870 .nr_to_reclaim = SWAP_CLUSTER_MAX,
1835 .may_unmap = 1, 1871 .may_unmap = 1,
1836 .may_swap = 1, 1872 .may_swap = 1,
1837 .swappiness = vm_swappiness, 1873 .swappiness = vm_swappiness,
@@ -1855,7 +1891,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1855 .may_writepage = !laptop_mode, 1891 .may_writepage = !laptop_mode,
1856 .may_unmap = 1, 1892 .may_unmap = 1,
1857 .may_swap = !noswap, 1893 .may_swap = !noswap,
1858 .swap_cluster_max = SWAP_CLUSTER_MAX,
1859 .swappiness = swappiness, 1894 .swappiness = swappiness,
1860 .order = 0, 1895 .order = 0,
1861 .mem_cgroup = mem, 1896 .mem_cgroup = mem,
@@ -1889,7 +1924,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1889 .may_writepage = !laptop_mode, 1924 .may_writepage = !laptop_mode,
1890 .may_unmap = 1, 1925 .may_unmap = 1,
1891 .may_swap = !noswap, 1926 .may_swap = !noswap,
1892 .swap_cluster_max = SWAP_CLUSTER_MAX, 1927 .nr_to_reclaim = SWAP_CLUSTER_MAX,
1893 .swappiness = swappiness, 1928 .swappiness = swappiness,
1894 .order = 0, 1929 .order = 0,
1895 .mem_cgroup = mem_cont, 1930 .mem_cgroup = mem_cont,
@@ -1904,6 +1939,33 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1904} 1939}
1905#endif 1940#endif
1906 1941
1942/* is kswapd sleeping prematurely? */
1943static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
1944{
1945 int i;
1946
1947 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
1948 if (remaining)
1949 return 1;
1950
1951 /* If after HZ/10, a zone is below the high mark, it's premature */
1952 for (i = 0; i < pgdat->nr_zones; i++) {
1953 struct zone *zone = pgdat->node_zones + i;
1954
1955 if (!populated_zone(zone))
1956 continue;
1957
1958 if (zone->all_unreclaimable)
1959 continue;
1960
1961 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
1962 0, 0))
1963 return 1;
1964 }
1965
1966 return 0;
1967}
1968
1907/* 1969/*
1908 * For kswapd, balance_pgdat() will work across all this node's zones until 1970 * For kswapd, balance_pgdat() will work across all this node's zones until
1909 * they are all at high_wmark_pages(zone). 1971 * they are all at high_wmark_pages(zone).
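
sleeping_prematurely() decides whether kswapd may go fully to sleep: not if a direct reclaimer woke it during the trial nap (remaining != 0), and not if any populated, still-reclaimable zone is below its high watermark. A simplified model over an array of zones; the reduced zone structure and the plain free < high comparison stand in for zone_watermark_ok():

#include <stdio.h>

struct zone_model {
        int populated;
        int all_unreclaimable;
        unsigned long free_pages;
        unsigned long high_wmark;
};

/* Returns 1 if kswapd would be sleeping prematurely. */
static int sleeping_prematurely(const struct zone_model *zones, int nr_zones,
                                long remaining)
{
        if (remaining)                          /* woken up during the trial nap */
                return 1;

        for (int i = 0; i < nr_zones; i++) {
                const struct zone_model *z = &zones[i];

                if (!z->populated || z->all_unreclaimable)
                        continue;               /* nothing kswapd can do here */
                if (z->free_pages < z->high_wmark)
                        return 1;               /* still below the high watermark */
        }
        return 0;
}

int main(void)
{
        struct zone_model node[2] = {
                { .populated = 1, .free_pages = 900, .high_wmark = 800 },
                { .populated = 1, .free_pages = 100, .high_wmark = 300 },
        };

        printf("%d\n", sleeping_prematurely(node, 2, 0));       /* 1: zone 1 is low */
        node[1].free_pages = 400;
        printf("%d\n", sleeping_prematurely(node, 2, 0));       /* 0: all zones ok  */
        return 0;
}
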
@@ -1936,7 +1998,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1936 .gfp_mask = GFP_KERNEL, 1998 .gfp_mask = GFP_KERNEL,
1937 .may_unmap = 1, 1999 .may_unmap = 1,
1938 .may_swap = 1, 2000 .may_swap = 1,
1939 .swap_cluster_max = SWAP_CLUSTER_MAX, 2001 /*
2002 * kswapd doesn't want to be bailed out while reclaim. because
2003 * we want to put equal scanning pressure on each zone.
2004 */
2005 .nr_to_reclaim = ULONG_MAX,
1940 .swappiness = vm_swappiness, 2006 .swappiness = vm_swappiness,
1941 .order = order, 2007 .order = order,
1942 .mem_cgroup = NULL, 2008 .mem_cgroup = NULL,
@@ -1961,6 +2027,7 @@ loop_again:
1961 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2027 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1962 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2028 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
1963 unsigned long lru_pages = 0; 2029 unsigned long lru_pages = 0;
2030 int has_under_min_watermark_zone = 0;
1964 2031
1965 /* The swap token gets in the way of swapout... */ 2032 /* The swap token gets in the way of swapout... */
1966 if (!priority) 2033 if (!priority)
@@ -1978,8 +2045,7 @@ loop_again:
1978 if (!populated_zone(zone)) 2045 if (!populated_zone(zone))
1979 continue; 2046 continue;
1980 2047
1981 if (zone_is_all_unreclaimable(zone) && 2048 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1982 priority != DEF_PRIORITY)
1983 continue; 2049 continue;
1984 2050
1985 /* 2051 /*
@@ -2022,13 +2088,9 @@ loop_again:
2022 if (!populated_zone(zone)) 2088 if (!populated_zone(zone))
2023 continue; 2089 continue;
2024 2090
2025 if (zone_is_all_unreclaimable(zone) && 2091 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2026 priority != DEF_PRIORITY)
2027 continue; 2092 continue;
2028 2093
2029 if (!zone_watermark_ok(zone, order,
2030 high_wmark_pages(zone), end_zone, 0))
2031 all_zones_ok = 0;
2032 temp_priority[i] = priority; 2094 temp_priority[i] = priority;
2033 sc.nr_scanned = 0; 2095 sc.nr_scanned = 0;
2034 note_zone_scanning_priority(zone, priority); 2096 note_zone_scanning_priority(zone, priority);
@@ -2053,12 +2115,11 @@ loop_again:
2053 lru_pages); 2115 lru_pages);
2054 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2116 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2055 total_scanned += sc.nr_scanned; 2117 total_scanned += sc.nr_scanned;
2056 if (zone_is_all_unreclaimable(zone)) 2118 if (zone->all_unreclaimable)
2057 continue; 2119 continue;
2058 if (nr_slab == 0 && zone->pages_scanned >= 2120 if (nr_slab == 0 &&
2059 (zone_reclaimable_pages(zone) * 6)) 2121 zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6))
2060 zone_set_flag(zone, 2122 zone->all_unreclaimable = 1;
2061 ZONE_ALL_UNRECLAIMABLE);
2062 /* 2123 /*
2063 * If we've done a decent amount of scanning and 2124 * If we've done a decent amount of scanning and
2064 * the reclaim ratio is low, start doing writepage 2125 * the reclaim ratio is low, start doing writepage
@@ -2067,6 +2128,20 @@ loop_again:
2067 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 2128 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
2068 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2129 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2069 sc.may_writepage = 1; 2130 sc.may_writepage = 1;
2131
2132 if (!zone_watermark_ok(zone, order,
2133 high_wmark_pages(zone), end_zone, 0)) {
2134 all_zones_ok = 0;
2135 /*
2136 * We are still under the min watermark. This
2137 * means that we have a GFP_ATOMIC allocation
2138 * failure risk. Hurry up!
2139 */
2140 if (!zone_watermark_ok(zone, order,
2141 min_wmark_pages(zone), end_zone, 0))
2142 has_under_min_watermark_zone = 1;
2143 }
2144
2070 } 2145 }
2071 if (all_zones_ok) 2146 if (all_zones_ok)
2072 break; /* kswapd: all done */ 2147 break; /* kswapd: all done */
@@ -2074,8 +2149,12 @@ loop_again:
2074 * OK, kswapd is getting into trouble. Take a nap, then take 2149 * OK, kswapd is getting into trouble. Take a nap, then take
2075 * another pass across the zones. 2150 * another pass across the zones.
2076 */ 2151 */
2077 if (total_scanned && priority < DEF_PRIORITY - 2) 2152 if (total_scanned && (priority < DEF_PRIORITY - 2)) {
2078 congestion_wait(BLK_RW_ASYNC, HZ/10); 2153 if (has_under_min_watermark_zone)
2154 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2155 else
2156 congestion_wait(BLK_RW_ASYNC, HZ/10);
2157 }
2079 2158
2080 /* 2159 /*
2081 * We do this so kswapd doesn't build up large priorities for 2160 * We do this so kswapd doesn't build up large priorities for
@@ -2173,6 +2252,7 @@ static int kswapd(void *p)
2173 order = 0; 2252 order = 0;
2174 for ( ; ; ) { 2253 for ( ; ; ) {
2175 unsigned long new_order; 2254 unsigned long new_order;
2255 int ret;
2176 2256
2177 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2257 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2178 new_order = pgdat->kswapd_max_order; 2258 new_order = pgdat->kswapd_max_order;
@@ -2184,19 +2264,45 @@ static int kswapd(void *p)
2184 */ 2264 */
2185 order = new_order; 2265 order = new_order;
2186 } else { 2266 } else {
2187 if (!freezing(current)) 2267 if (!freezing(current) && !kthread_should_stop()) {
2188 schedule(); 2268 long remaining = 0;
2269
2270 /* Try to sleep for a short interval */
2271 if (!sleeping_prematurely(pgdat, order, remaining)) {
2272 remaining = schedule_timeout(HZ/10);
2273 finish_wait(&pgdat->kswapd_wait, &wait);
2274 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2275 }
2276
2277 /*
2278 * After a short sleep, check if it was a
2279 * premature sleep. If not, then go fully
2280 * to sleep until explicitly woken up
2281 */
2282 if (!sleeping_prematurely(pgdat, order, remaining))
2283 schedule();
2284 else {
2285 if (remaining)
2286 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2287 else
2288 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2289 }
2290 }
2189 2291
2190 order = pgdat->kswapd_max_order; 2292 order = pgdat->kswapd_max_order;
2191 } 2293 }
2192 finish_wait(&pgdat->kswapd_wait, &wait); 2294 finish_wait(&pgdat->kswapd_wait, &wait);
2193 2295
2194 if (!try_to_freeze()) { 2296 ret = try_to_freeze();
2195 /* We can speed up thawing tasks if we don't call 2297 if (kthread_should_stop())
2196 * balance_pgdat after returning from the refrigerator 2298 break;
2197 */ 2299
2300 /*
2301 * We can speed up thawing tasks if we don't call balance_pgdat
2302 * after returning from the refrigerator
2303 */
2304 if (!ret)
2198 balance_pgdat(pgdat, order); 2305 balance_pgdat(pgdat, order);
2199 }
2200 } 2306 }
2201 return 0; 2307 return 0;
2202} 2308}
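
kswapd no longer sleeps unconditionally: it first takes a short trial nap (HZ/10), rechecks the node, and only then commits to an indefinite sleep; a failed recheck is recorded via the new KSWAPD_*_WMARK_HIT_QUICKLY counters. A control-flow sketch of that two-phase sleep, with the scheduler calls replaced by stand-ins and a pluggable premature() predicate modelling sleeping_prematurely():

#include <stdio.h>

enum sleep_outcome { SLEPT_FULLY, WOKEN_DURING_NAP, STILL_UNBALANCED };

/* premature(remaining) models sleeping_prematurely(pgdat, order, remaining);
 * nap_returns models the value schedule_timeout(HZ/10) would return. */
static enum sleep_outcome kswapd_try_to_sleep(int (*premature)(long),
                                              long nap_returns)
{
        long remaining = 0;

        /* Phase 1: only nap at all if the node currently looks balanced. */
        if (!premature(remaining))
                remaining = nap_returns;        /* the short trial nap */

        /* Phase 2: recheck after the nap before committing to a full sleep. */
        if (!premature(remaining))
                return SLEPT_FULLY;             /* the unconditional schedule() */

        /* In the kernel these two outcomes bump the LOW/HIGH_WMARK_HIT_QUICKLY
         * vm events respectively. */
        return remaining ? WOKEN_DURING_NAP : STILL_UNBALANCED;
}

/* Balanced node: sleeping is premature only if something woke us mid-nap. */
static int balanced_node(long remaining) { return remaining != 0; }
/* Node below its watermarks: sleeping is always premature. */
static int low_node(long remaining)      { (void)remaining; return 1; }

int main(void)
{
        printf("%d\n", kswapd_try_to_sleep(balanced_node, 0));  /* 0: full sleep */
        printf("%d\n", kswapd_try_to_sleep(balanced_node, 3));  /* 1: woken mid-nap */
        printf("%d\n", kswapd_try_to_sleep(low_node, 0));       /* 2: back to balance_pgdat() */
        return 0;
}
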
@@ -2260,148 +2366,43 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
2260 2366
2261#ifdef CONFIG_HIBERNATION 2367#ifdef CONFIG_HIBERNATION
2262/* 2368/*
2263 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 2369 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
2264 * from LRU lists system-wide, for given pass and priority.
2265 *
2266 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
2267 */
2268static void shrink_all_zones(unsigned long nr_pages, int prio,
2269 int pass, struct scan_control *sc)
2270{
2271 struct zone *zone;
2272 unsigned long nr_reclaimed = 0;
2273 struct zone_reclaim_stat *reclaim_stat;
2274
2275 for_each_populated_zone(zone) {
2276 enum lru_list l;
2277
2278 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
2279 continue;
2280
2281 for_each_evictable_lru(l) {
2282 enum zone_stat_item ls = NR_LRU_BASE + l;
2283 unsigned long lru_pages = zone_page_state(zone, ls);
2284
2285 /* For pass = 0, we don't shrink the active list */
2286 if (pass == 0 && (l == LRU_ACTIVE_ANON ||
2287 l == LRU_ACTIVE_FILE))
2288 continue;
2289
2290 reclaim_stat = get_reclaim_stat(zone, sc);
2291 reclaim_stat->nr_saved_scan[l] +=
2292 (lru_pages >> prio) + 1;
2293 if (reclaim_stat->nr_saved_scan[l]
2294 >= nr_pages || pass > 3) {
2295 unsigned long nr_to_scan;
2296
2297 reclaim_stat->nr_saved_scan[l] = 0;
2298 nr_to_scan = min(nr_pages, lru_pages);
2299 nr_reclaimed += shrink_list(l, nr_to_scan, zone,
2300 sc, prio);
2301 if (nr_reclaimed >= nr_pages) {
2302 sc->nr_reclaimed += nr_reclaimed;
2303 return;
2304 }
2305 }
2306 }
2307 }
2308 sc->nr_reclaimed += nr_reclaimed;
2309}
2310
2311/*
2312 * Try to free `nr_pages' of memory, system-wide, and return the number of
2313 * freed pages. 2370 * freed pages.
2314 * 2371 *
2315 * Rather than trying to age LRUs the aim is to preserve the overall 2372 * Rather than trying to age LRUs the aim is to preserve the overall
2316 * LRU order by reclaiming preferentially 2373 * LRU order by reclaiming preferentially
2317 * inactive > active > active referenced > active mapped 2374 * inactive > active > active referenced > active mapped
2318 */ 2375 */
2319unsigned long shrink_all_memory(unsigned long nr_pages) 2376unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2320{ 2377{
2321 unsigned long lru_pages, nr_slab;
2322 int pass;
2323 struct reclaim_state reclaim_state; 2378 struct reclaim_state reclaim_state;
2324 struct scan_control sc = { 2379 struct scan_control sc = {
2325 .gfp_mask = GFP_KERNEL, 2380 .gfp_mask = GFP_HIGHUSER_MOVABLE,
2326 .may_unmap = 0, 2381 .may_swap = 1,
2382 .may_unmap = 1,
2327 .may_writepage = 1, 2383 .may_writepage = 1,
2384 .nr_to_reclaim = nr_to_reclaim,
2385 .hibernation_mode = 1,
2386 .swappiness = vm_swappiness,
2387 .order = 0,
2328 .isolate_pages = isolate_pages_global, 2388 .isolate_pages = isolate_pages_global,
2329 .nr_reclaimed = 0,
2330 }; 2389 };
2390 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
2391 struct task_struct *p = current;
2392 unsigned long nr_reclaimed;
2331 2393
2332 current->reclaim_state = &reclaim_state; 2394 p->flags |= PF_MEMALLOC;
2333 2395 lockdep_set_current_reclaim_state(sc.gfp_mask);
2334 lru_pages = global_reclaimable_pages(); 2396 reclaim_state.reclaimed_slab = 0;
2335 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 2397 p->reclaim_state = &reclaim_state;
2336 /* If slab caches are huge, it's better to hit them first */
2337 while (nr_slab >= lru_pages) {
2338 reclaim_state.reclaimed_slab = 0;
2339 shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
2340 if (!reclaim_state.reclaimed_slab)
2341 break;
2342
2343 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2344 if (sc.nr_reclaimed >= nr_pages)
2345 goto out;
2346
2347 nr_slab -= reclaim_state.reclaimed_slab;
2348 }
2349
2350 /*
2351 * We try to shrink LRUs in 5 passes:
2352 * 0 = Reclaim from inactive_list only
2353 * 1 = Reclaim from active list but don't reclaim mapped
2354 * 2 = 2nd pass of type 1
2355 * 3 = Reclaim mapped (normal reclaim)
2356 * 4 = 2nd pass of type 3
2357 */
2358 for (pass = 0; pass < 5; pass++) {
2359 int prio;
2360
2361 /* Force reclaiming mapped pages in the passes #3 and #4 */
2362 if (pass > 2)
2363 sc.may_unmap = 1;
2364
2365 for (prio = DEF_PRIORITY; prio >= 0; prio--) {
2366 unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed;
2367
2368 sc.nr_scanned = 0;
2369 sc.swap_cluster_max = nr_to_scan;
2370 shrink_all_zones(nr_to_scan, prio, pass, &sc);
2371 if (sc.nr_reclaimed >= nr_pages)
2372 goto out;
2373
2374 reclaim_state.reclaimed_slab = 0;
2375 shrink_slab(sc.nr_scanned, sc.gfp_mask,
2376 global_reclaimable_pages());
2377 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2378 if (sc.nr_reclaimed >= nr_pages)
2379 goto out;
2380
2381 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
2382 congestion_wait(BLK_RW_ASYNC, HZ / 10);
2383 }
2384 }
2385
2386 /*
2387 * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be
2388 * something in slab caches
2389 */
2390 if (!sc.nr_reclaimed) {
2391 do {
2392 reclaim_state.reclaimed_slab = 0;
2393 shrink_slab(nr_pages, sc.gfp_mask,
2394 global_reclaimable_pages());
2395 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2396 } while (sc.nr_reclaimed < nr_pages &&
2397 reclaim_state.reclaimed_slab > 0);
2398 }
2399 2398
2399 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2400 2400
2401out: 2401 p->reclaim_state = NULL;
2402 current->reclaim_state = NULL; 2402 lockdep_clear_current_reclaim_state();
2403 p->flags &= ~PF_MEMALLOC;
2403 2404
2404 return sc.nr_reclaimed; 2405 return nr_reclaimed;
2405} 2406}
2406#endif /* CONFIG_HIBERNATION */ 2407#endif /* CONFIG_HIBERNATION */
2407 2408
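
The rewritten shrink_all_memory() drops the multi-pass loop and becomes a thin wrapper: set PF_MEMALLOC, the lockdep reclaim context and a reclaim_state, run do_try_to_free_pages() once with hibernation_mode set, then undo all of it. A sketch of that save/do/restore bracket with hypothetical flag and helper names only:

#include <stdio.h>

#define PF_MEMALLOC_MODEL 0x800u        /* illustrative value only */

struct task_model {
        unsigned int flags;
        void *reclaim_state;
};

static unsigned long do_reclaim(void)
{
        return 128;     /* pretend 128 pages were freed */
}

/* Same bracket shape as the new shrink_all_memory(): mark the task as a
 * reclaimer, run the reclaim pass once, then undo every bit of the setup. */
static unsigned long shrink_all_memory_model(struct task_model *p,
                                             unsigned long nr_to_reclaim)
{
        struct { unsigned long reclaimed_slab; } reclaim_state = { 0 };
        unsigned long nr_reclaimed;

        p->flags |= PF_MEMALLOC_MODEL;          /* allocations may dip into reserves */
        p->reclaim_state = &reclaim_state;      /* slab reclaim is accounted here */

        nr_reclaimed = do_reclaim();            /* stands in for do_try_to_free_pages() */
        (void)nr_to_reclaim;

        p->reclaim_state = NULL;
        p->flags &= ~PF_MEMALLOC_MODEL;
        return nr_reclaimed;
}

int main(void)
{
        struct task_model task = { 0, NULL };

        printf("freed %lu, flags %#x\n",
               shrink_all_memory_model(&task, 1024), task.flags);      /* 128, 0 */
        return 0;
}
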
@@ -2451,6 +2452,17 @@ int kswapd_run(int nid)
2451 return ret; 2452 return ret;
2452} 2453}
2453 2454
2455/*
2456 * Called by memory hotplug when all memory in a node is offlined.
2457 */
2458void kswapd_stop(int nid)
2459{
2460 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
2461
2462 if (kswapd)
2463 kthread_stop(kswapd);
2464}
2465
2454static int __init kswapd_init(void) 2466static int __init kswapd_init(void)
2455{ 2467{
2456 int nid; 2468 int nid;
@@ -2553,8 +2565,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2553 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 2565 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
2554 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 2566 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
2555 .may_swap = 1, 2567 .may_swap = 1,
2556 .swap_cluster_max = max_t(unsigned long, nr_pages, 2568 .nr_to_reclaim = max_t(unsigned long, nr_pages,
2557 SWAP_CLUSTER_MAX), 2569 SWAP_CLUSTER_MAX),
2558 .gfp_mask = gfp_mask, 2570 .gfp_mask = gfp_mask,
2559 .swappiness = vm_swappiness, 2571 .swappiness = vm_swappiness,
2560 .order = order, 2572 .order = order,
@@ -2570,6 +2582,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2570 * and RECLAIM_SWAP. 2582 * and RECLAIM_SWAP.
2571 */ 2583 */
2572 p->flags |= PF_MEMALLOC | PF_SWAPWRITE; 2584 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
2585 lockdep_set_current_reclaim_state(gfp_mask);
2573 reclaim_state.reclaimed_slab = 0; 2586 reclaim_state.reclaimed_slab = 0;
2574 p->reclaim_state = &reclaim_state; 2587 p->reclaim_state = &reclaim_state;
2575 2588
@@ -2613,6 +2626,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2613 2626
2614 p->reclaim_state = NULL; 2627 p->reclaim_state = NULL;
2615 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 2628 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
2629 lockdep_clear_current_reclaim_state();
2616 return sc.nr_reclaimed >= nr_pages; 2630 return sc.nr_reclaimed >= nr_pages;
2617} 2631}
2618 2632
@@ -2635,7 +2649,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2635 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) 2649 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
2636 return ZONE_RECLAIM_FULL; 2650 return ZONE_RECLAIM_FULL;
2637 2651
2638 if (zone_is_all_unreclaimable(zone)) 2652 if (zone->all_unreclaimable)
2639 return ZONE_RECLAIM_FULL; 2653 return ZONE_RECLAIM_FULL;
2640 2654
2641 /* 2655 /*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c81321f9feec..fa12ea3051fb 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -12,6 +12,7 @@
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/err.h> 13#include <linux/err.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/slab.h>
15#include <linux/cpu.h> 16#include <linux/cpu.h>
16#include <linux/vmstat.h> 17#include <linux/vmstat.h>
17#include <linux/sched.h> 18#include <linux/sched.h>
@@ -139,7 +140,8 @@ static void refresh_zone_stat_thresholds(void)
139 threshold = calculate_threshold(zone); 140 threshold = calculate_threshold(zone);
140 141
141 for_each_online_cpu(cpu) 142 for_each_online_cpu(cpu)
142 zone_pcp(zone, cpu)->stat_threshold = threshold; 143 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
144 = threshold;
143 } 145 }
144} 146}
145 147
@@ -149,7 +151,8 @@ static void refresh_zone_stat_thresholds(void)
149void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 151void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
150 int delta) 152 int delta)
151{ 153{
152 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 154 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
155
153 s8 *p = pcp->vm_stat_diff + item; 156 s8 *p = pcp->vm_stat_diff + item;
154 long x; 157 long x;
155 158
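
Switching from zone_pcp() to this_cpu_ptr(zone->pageset) changes how the per-CPU pageset is reached, not the accounting scheme: each CPU still collects a small signed diff per counter and folds it into the zone-wide counter only once the diff passes stat_threshold. A single-threaded model of that deferred fold, where per-CPU access is just an array index:

#include <stdio.h>

#define NR_CPUS_MODEL 4
#define STAT_THRESHOLD 8        /* stands in for pcp->stat_threshold */

static long zone_counter;                       /* global zone state       */
static signed char vm_stat_diff[NR_CPUS_MODEL]; /* per-"CPU" pending diffs */

/* Same shape as __mod_zone_page_state(): accumulate locally, fold into the
 * zone counter only when the local diff grows past the threshold. */
static void mod_zone_state(int cpu, int delta)
{
        long x = vm_stat_diff[cpu] + delta;

        if (x > STAT_THRESHOLD || x < -STAT_THRESHOLD) {
                zone_counter += x;
                x = 0;
        }
        vm_stat_diff[cpu] = (signed char)x;
}

int main(void)
{
        for (int i = 0; i < 10; i++)
                mod_zone_state(0, 1);   /* folds once the diff passes 8 */

        printf("zone counter = %ld, cpu0 pending = %d\n",
               zone_counter, vm_stat_diff[0]);  /* 9 and 1 */
        return 0;
}
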
@@ -202,7 +205,7 @@ EXPORT_SYMBOL(mod_zone_page_state);
202 */ 205 */
203void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 206void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
204{ 207{
205 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 208 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
206 s8 *p = pcp->vm_stat_diff + item; 209 s8 *p = pcp->vm_stat_diff + item;
207 210
208 (*p)++; 211 (*p)++;
@@ -223,7 +226,7 @@ EXPORT_SYMBOL(__inc_zone_page_state);
223 226
224void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 227void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
225{ 228{
226 struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); 229 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
227 s8 *p = pcp->vm_stat_diff + item; 230 s8 *p = pcp->vm_stat_diff + item;
228 231
229 (*p)--; 232 (*p)--;
@@ -300,7 +303,7 @@ void refresh_cpu_vm_stats(int cpu)
300 for_each_populated_zone(zone) { 303 for_each_populated_zone(zone) {
301 struct per_cpu_pageset *p; 304 struct per_cpu_pageset *p;
302 305
303 p = zone_pcp(zone, cpu); 306 p = per_cpu_ptr(zone->pageset, cpu);
304 307
305 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 308 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
306 if (p->vm_stat_diff[i]) { 309 if (p->vm_stat_diff[i]) {
@@ -683,6 +686,9 @@ static const char * const vmstat_text[] = {
683 "slabs_scanned", 686 "slabs_scanned",
684 "kswapd_steal", 687 "kswapd_steal",
685 "kswapd_inodesteal", 688 "kswapd_inodesteal",
689 "kswapd_low_wmark_hit_quickly",
690 "kswapd_high_wmark_hit_quickly",
691 "kswapd_skip_congestion_wait",
686 "pageoutrun", 692 "pageoutrun",
687 "allocstall", 693 "allocstall",
688 694
@@ -738,7 +744,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
738 for_each_online_cpu(i) { 744 for_each_online_cpu(i) {
739 struct per_cpu_pageset *pageset; 745 struct per_cpu_pageset *pageset;
740 746
741 pageset = zone_pcp(zone, i); 747 pageset = per_cpu_ptr(zone->pageset, i);
742 seq_printf(m, 748 seq_printf(m,
743 "\n cpu: %i" 749 "\n cpu: %i"
744 "\n count: %i" 750 "\n count: %i"
@@ -758,7 +764,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
758 "\n prev_priority: %i" 764 "\n prev_priority: %i"
759 "\n start_pfn: %lu" 765 "\n start_pfn: %lu"
760 "\n inactive_ratio: %u", 766 "\n inactive_ratio: %u",
761 zone_is_all_unreclaimable(zone), 767 zone->all_unreclaimable,
762 zone->prev_priority, 768 zone->prev_priority,
763 zone->zone_start_pfn, 769 zone->zone_start_pfn,
764 zone->inactive_ratio); 770 zone->inactive_ratio);
@@ -883,11 +889,10 @@ static void vmstat_update(struct work_struct *w)
883 889
884static void __cpuinit start_cpu_timer(int cpu) 890static void __cpuinit start_cpu_timer(int cpu)
885{ 891{
886 struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); 892 struct delayed_work *work = &per_cpu(vmstat_work, cpu);
887 893
888 INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); 894 INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update);
889 schedule_delayed_work_on(cpu, vmstat_work, 895 schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
890 __round_jiffies_relative(HZ, cpu));
891} 896}
892 897
893/* 898/*
@@ -904,6 +909,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
904 case CPU_ONLINE: 909 case CPU_ONLINE:
905 case CPU_ONLINE_FROZEN: 910 case CPU_ONLINE_FROZEN:
906 start_cpu_timer(cpu); 911 start_cpu_timer(cpu);
912 node_set_state(cpu_to_node(cpu), N_CPU);
907 break; 913 break;
908 case CPU_DOWN_PREPARE: 914 case CPU_DOWN_PREPARE:
909 case CPU_DOWN_PREPARE_FROZEN: 915 case CPU_DOWN_PREPARE_FROZEN: