Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 36
-rw-r--r--  mm/Kconfig.debug | 12
-rw-r--r--  mm/Makefile | 13
-rw-r--r--  mm/allocpercpu.c | 28
-rw-r--r--  mm/backing-dev.c | 427
-rw-r--r--  mm/bootmem.c | 6
-rw-r--r--  mm/filemap.c | 180
-rw-r--r--  mm/hugetlb.c | 264
-rw-r--r--  mm/hwpoison-inject.c | 41
-rw-r--r--  mm/internal.h | 10
-rw-r--r--  mm/kmemleak-test.c | 6
-rw-r--r--  mm/kmemleak.c | 336
-rw-r--r--  mm/ksm.c | 1711
-rw-r--r--  mm/madvise.c | 83
-rw-r--r--  mm/memcontrol.c | 739
-rw-r--r--  mm/memory-failure.c | 832
-rw-r--r--  mm/memory.c | 299
-rw-r--r--  mm/memory_hotplug.c | 13
-rw-r--r--  mm/mempool.c | 7
-rw-r--r--  mm/migrate.c | 26
-rw-r--r--  mm/mlock.c | 128
-rw-r--r--  mm/mmap.c | 62
-rw-r--r--  mm/mmu_context.c | 58
-rw-r--r--  mm/mmu_notifier.c | 20
-rw-r--r--  mm/mprotect.c | 4
-rw-r--r--  mm/mremap.c | 18
-rw-r--r--  mm/nommu.c | 134
-rw-r--r--  mm/oom_kill.c | 96
-rw-r--r--  mm/page-writeback.c | 244
-rw-r--r--  mm/page_alloc.c | 338
-rw-r--r--  mm/page_cgroup.c | 12
-rw-r--r--  mm/pdflush.c | 269
-rw-r--r--  mm/percpu.c | 1418
-rw-r--r--  mm/quicklist.c | 5
-rw-r--r--  mm/rmap.c | 139
-rw-r--r--  mm/shmem.c | 40
-rw-r--r--  mm/shmem_acl.c | 11
-rw-r--r--  mm/slab.c | 2
-rw-r--r--  mm/slob.c | 5
-rw-r--r--  mm/slub.c | 95
-rw-r--r--  mm/sparse-vmemmap.c | 8
-rw-r--r--  mm/sparse.c | 9
-rw-r--r--  mm/swap.c | 8
-rw-r--r--  mm/swap_state.c | 144
-rw-r--r--  mm/swapfile.c | 14
-rw-r--r--  mm/truncate.c | 136
-rw-r--r--  mm/vmalloc.c | 561
-rw-r--r--  mm/vmscan.c | 283
-rw-r--r--  mm/vmstat.c | 5
49 files changed, 7271 insertions(+), 2064 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index c948d4ca8bde..edd300aca173 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -153,7 +153,7 @@ config MEMORY_HOTREMOVE
153# 153#
154config PAGEFLAGS_EXTENDED 154config PAGEFLAGS_EXTENDED
155 def_bool y 155 def_bool y
156 depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM 156 depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM
157 157
158# Heavily threaded applications may benefit from splitting the mm-wide 158# Heavily threaded applications may benefit from splitting the mm-wide
159# page_table_lock, so that faults on different parts of the user address 159# page_table_lock, so that faults on different parts of the user address
@@ -214,6 +214,18 @@ config HAVE_MLOCKED_PAGE_BIT
214config MMU_NOTIFIER 214config MMU_NOTIFIER
215 bool 215 bool
216 216
217config KSM
218 bool "Enable KSM for page merging"
219 depends on MMU
220 help
221 Enable Kernel Samepage Merging: KSM periodically scans those areas
222 of an application's address space that an app has advised may be
223 mergeable. When it finds pages of identical content, it replaces
224 the many instances by a single resident page with that content, so
225 saving memory until one or another app needs to modify the content.
226 Recommended for use with KVM, or with other duplicative applications.
227 See Documentation/vm/ksm.txt for more information.
228
217config DEFAULT_MMAP_MIN_ADDR 229config DEFAULT_MMAP_MIN_ADDR
218 int "Low address space to protect from user allocation" 230 int "Low address space to protect from user allocation"
219 default 4096 231 default 4096
@@ -225,13 +237,29 @@ config DEFAULT_MMAP_MIN_ADDR
225 For most ia64, ppc64 and x86 users with lots of address space 237 For most ia64, ppc64 and x86 users with lots of address space
226 a value of 65536 is reasonable and should cause no problems. 238 a value of 65536 is reasonable and should cause no problems.
227 On arm and other archs it should not be higher than 32768. 239 On arm and other archs it should not be higher than 32768.
228 Programs which use vm86 functionality would either need additional 240 Programs which use vm86 functionality or have some need to map
229 permissions from either the LSM or the capabilities module or have 241 this low address space will need CAP_SYS_RAWIO or disable this
230 this protection disabled. 242 protection by setting the value to 0.
231 243
232 This value can be changed after boot using the 244 This value can be changed after boot using the
233 /proc/sys/vm/mmap_min_addr tunable. 245 /proc/sys/vm/mmap_min_addr tunable.
234 246
247config ARCH_SUPPORTS_MEMORY_FAILURE
248 bool
249
250config MEMORY_FAILURE
251 depends on MMU
252 depends on ARCH_SUPPORTS_MEMORY_FAILURE
253 bool "Enable recovery from hardware memory errors"
254 help
255 Enables code to recover from some memory failures on systems
256 with MCA recovery. This allows a system to continue running
257 even when some of its memory has uncorrected errors. This requires
258 special hardware support and typically ECC memory.
259
260config HWPOISON_INJECT
261 tristate "Poison pages injector"
262 depends on MEMORY_FAILURE && DEBUG_KERNEL
235 263
236config NOMMU_INITIAL_TRIM_EXCESS 264config NOMMU_INITIAL_TRIM_EXCESS
237 int "Turn on mmap() excess space trimming before booting" 265 int "Turn on mmap() excess space trimming before booting"
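Note on the new CONFIG_KSM option above: as the help text says, merging is strictly opt-in per address range; an application marks candidate regions with madvise(MADV_MERGEABLE), as described in Documentation/vm/ksm.txt. The following user-space sketch is illustrative and not part of this patch; the fallback value 12 for MADV_MERGEABLE is an assumption for older libc headers that do not yet define it.

/* Hedged illustration: opt an anonymous mapping into KSM merging. */
#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>

#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE 12	/* advice value used by this kernel series */
#endif

int main(void)
{
	size_t len = 16 * 4096;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}

	/* Ask the KSM scanner to consider this range for page merging. */
	if (madvise(buf, len, MADV_MERGEABLE) != 0)
		perror("madvise(MADV_MERGEABLE)");

	return EXIT_SUCCESS;
}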
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index aa99fd1f7109..af7cfb43d2f0 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -6,7 +6,7 @@ config DEBUG_PAGEALLOC
6 ---help--- 6 ---help---
7 Unmap pages from the kernel linear mapping after free_pages(). 7 Unmap pages from the kernel linear mapping after free_pages().
8 This results in a large slowdown, but helps to find certain types 8 This results in a large slowdown, but helps to find certain types
9 of memory corruptions. 9 of memory corruption.
10 10
11config WANT_PAGE_DEBUG_FLAGS 11config WANT_PAGE_DEBUG_FLAGS
12 bool 12 bool
@@ -17,11 +17,11 @@ config PAGE_POISONING
17 depends on !HIBERNATION 17 depends on !HIBERNATION
18 select DEBUG_PAGEALLOC 18 select DEBUG_PAGEALLOC
19 select WANT_PAGE_DEBUG_FLAGS 19 select WANT_PAGE_DEBUG_FLAGS
20 help 20 ---help---
21 Fill the pages with poison patterns after free_pages() and verify 21 Fill the pages with poison patterns after free_pages() and verify
22 the patterns before alloc_pages(). This results in a large slowdown, 22 the patterns before alloc_pages(). This results in a large slowdown,
23 but helps to find certain types of memory corruptions. 23 but helps to find certain types of memory corruption.
24 24
25 This option cannot enalbe with hibernation. Otherwise, it will get 25 This option cannot be enabled in combination with hibernation as
26 wrong messages for memory corruption because the free pages are not 26 that would result in incorrect warnings of memory corruption after
27 saved to the suspend image. 27 a resume because free pages are not saved to the suspend image.
diff --git a/mm/Makefile b/mm/Makefile
index 5e0bd6426693..ebf849042ed3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,16 +5,16 @@
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o 8 vmalloc.o pagewalk.o
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o pdflush.o \ 11 maccess.o page_alloc.o page-writeback.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o mm_init.o $(mmu-y) 14 page_isolation.o mm_init.o mmu_context.o \
15 $(mmu-y)
15obj-y += init-mm.o 16obj-y += init-mm.o
16 17
17obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
18obj-$(CONFIG_BOUNCE) += bounce.o 18obj-$(CONFIG_BOUNCE) += bounce.o
19obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 19obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
20obj-$(CONFIG_HAS_DMA) += dmapool.o 20obj-$(CONFIG_HAS_DMA) += dmapool.o
@@ -25,6 +25,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o 25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
26obj-$(CONFIG_SLOB) += slob.o 26obj-$(CONFIG_SLOB) += slob.o
27obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 27obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
28obj-$(CONFIG_KSM) += ksm.o
28obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o 29obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
29obj-$(CONFIG_SLAB) += slab.o 30obj-$(CONFIG_SLAB) += slab.o
30obj-$(CONFIG_SLUB) += slub.o 31obj-$(CONFIG_SLUB) += slub.o
@@ -33,12 +34,14 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
33obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 34obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
34obj-$(CONFIG_FS_XIP) += filemap_xip.o 35obj-$(CONFIG_FS_XIP) += filemap_xip.o
35obj-$(CONFIG_MIGRATION) += migrate.o 36obj-$(CONFIG_MIGRATION) += migrate.o
36ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 37ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
37obj-$(CONFIG_SMP) += percpu.o 38obj-$(CONFIG_SMP) += percpu.o
38else 39else
39obj-$(CONFIG_SMP) += allocpercpu.o 40obj-$(CONFIG_SMP) += allocpercpu.o
40endif 41endif
41obj-$(CONFIG_QUICKLIST) += quicklist.o 42obj-$(CONFIG_QUICKLIST) += quicklist.o
42obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 43obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
44obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
45obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
43obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 46obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
44obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 47obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index dfdee6a47359..df34ceae0c67 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -5,6 +5,8 @@
5 */ 5 */
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/bootmem.h>
9#include <asm/sections.h>
8 10
9#ifndef cache_line_size 11#ifndef cache_line_size
10#define cache_line_size() L1_CACHE_BYTES 12#define cache_line_size() L1_CACHE_BYTES
@@ -147,3 +149,29 @@ void free_percpu(void *__pdata)
147 kfree(__percpu_disguise(__pdata)); 149 kfree(__percpu_disguise(__pdata));
148} 150}
149EXPORT_SYMBOL_GPL(free_percpu); 151EXPORT_SYMBOL_GPL(free_percpu);
152
153/*
154 * Generic percpu area setup.
155 */
156#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
157unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
158
159EXPORT_SYMBOL(__per_cpu_offset);
160
161void __init setup_per_cpu_areas(void)
162{
163 unsigned long size, i;
164 char *ptr;
165 unsigned long nr_possible_cpus = num_possible_cpus();
166
167 /* Copy section for each CPU (we discard the original) */
168 size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
169 ptr = alloc_bootmem_pages(size * nr_possible_cpus);
170
171 for_each_possible_cpu(i) {
172 __per_cpu_offset[i] = ptr - __per_cpu_start;
173 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
174 ptr += size;
175 }
176}
177#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
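For orientation: the setup_per_cpu_areas() added above copies the per-CPU template section once per possible CPU and records each copy's offset in __per_cpu_offset[]; a per-CPU variable for CPU i is then reached by adding that offset to the variable's address inside the template, which is essentially what the kernel's per_cpu()/RELOC_HIDE() accessors do. The user-space model below is a sketch of that offset scheme only; the names pcpu_offset, pcpu_template and per_cpu_sketch are illustrative, not kernel API.

/* User-space model of the __per_cpu_offset[] scheme (GCC typeof used). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_CPUS 4

static long pcpu_offset[NR_CPUS];

/* Plays the role of the .data.percpu template section. */
static struct { long counter; } pcpu_template;

/* Reach CPU 'cpu's copy of 'var' by adding the recorded offset. */
#define per_cpu_sketch(var, cpu) \
	(*(typeof(&(var)))((char *)&(var) + pcpu_offset[cpu]))

int main(void)
{
	size_t size = sizeof(pcpu_template);
	char *base = malloc(size * NR_CPUS);
	char *ptr = base;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		pcpu_offset[cpu] = ptr - (char *)&pcpu_template;
		memcpy(ptr, &pcpu_template, size);	/* copy the template */
		ptr += size;
	}

	per_cpu_sketch(pcpu_template.counter, 2) = 42;
	printf("cpu 2 counter = %ld\n",
	       per_cpu_sketch(pcpu_template.counter, 2));
	free(base);
	return 0;
}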
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c86edd244294..3d3accb1f800 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,8 +1,11 @@
1 1
2#include <linux/wait.h> 2#include <linux/wait.h>
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/kthread.h>
5#include <linux/freezer.h>
4#include <linux/fs.h> 6#include <linux/fs.h>
5#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/mm.h>
6#include <linux/sched.h> 9#include <linux/sched.h>
7#include <linux/module.h> 10#include <linux/module.h>
8#include <linux/writeback.h> 11#include <linux/writeback.h>
@@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
14EXPORT_SYMBOL(default_unplug_io_fn); 17EXPORT_SYMBOL(default_unplug_io_fn);
15 18
16struct backing_dev_info default_backing_dev_info = { 19struct backing_dev_info default_backing_dev_info = {
20 .name = "default",
17 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, 21 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
18 .state = 0, 22 .state = 0,
19 .capabilities = BDI_CAP_MAP_COPY, 23 .capabilities = BDI_CAP_MAP_COPY,
@@ -23,6 +27,24 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info);
23 27
24static struct class *bdi_class; 28static struct class *bdi_class;
25 29
30/*
31 * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
32 * reader side protection for bdi_pending_list. bdi_list has RCU reader side
33 * locking.
34 */
35DEFINE_SPINLOCK(bdi_lock);
36LIST_HEAD(bdi_list);
37LIST_HEAD(bdi_pending_list);
38
39static struct task_struct *sync_supers_tsk;
40static struct timer_list sync_supers_timer;
41
42static int bdi_sync_supers(void *);
43static void sync_supers_timer_fn(unsigned long);
44static void arm_supers_timer(void);
45
46static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
47
26#ifdef CONFIG_DEBUG_FS 48#ifdef CONFIG_DEBUG_FS
27#include <linux/debugfs.h> 49#include <linux/debugfs.h>
28#include <linux/seq_file.h> 50#include <linux/seq_file.h>
@@ -37,9 +59,29 @@ static void bdi_debug_init(void)
37static int bdi_debug_stats_show(struct seq_file *m, void *v) 59static int bdi_debug_stats_show(struct seq_file *m, void *v)
38{ 60{
39 struct backing_dev_info *bdi = m->private; 61 struct backing_dev_info *bdi = m->private;
62 struct bdi_writeback *wb;
40 unsigned long background_thresh; 63 unsigned long background_thresh;
41 unsigned long dirty_thresh; 64 unsigned long dirty_thresh;
42 unsigned long bdi_thresh; 65 unsigned long bdi_thresh;
66 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
67 struct inode *inode;
68
69 /*
70 * inode lock is enough here, the bdi->wb_list is protected by
71 * RCU on the reader side
72 */
73 nr_wb = nr_dirty = nr_io = nr_more_io = 0;
74 spin_lock(&inode_lock);
75 list_for_each_entry(wb, &bdi->wb_list, list) {
76 nr_wb++;
77 list_for_each_entry(inode, &wb->b_dirty, i_list)
78 nr_dirty++;
79 list_for_each_entry(inode, &wb->b_io, i_list)
80 nr_io++;
81 list_for_each_entry(inode, &wb->b_more_io, i_list)
82 nr_more_io++;
83 }
84 spin_unlock(&inode_lock);
43 85
44 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); 86 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
45 87
@@ -49,12 +91,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
49 "BdiReclaimable: %8lu kB\n" 91 "BdiReclaimable: %8lu kB\n"
50 "BdiDirtyThresh: %8lu kB\n" 92 "BdiDirtyThresh: %8lu kB\n"
51 "DirtyThresh: %8lu kB\n" 93 "DirtyThresh: %8lu kB\n"
52 "BackgroundThresh: %8lu kB\n", 94 "BackgroundThresh: %8lu kB\n"
95 "WriteBack threads:%8lu\n"
96 "b_dirty: %8lu\n"
97 "b_io: %8lu\n"
98 "b_more_io: %8lu\n"
99 "bdi_list: %8u\n"
100 "state: %8lx\n"
101 "wb_mask: %8lx\n"
102 "wb_list: %8u\n"
103 "wb_cnt: %8u\n",
53 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 104 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
54 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 105 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
55 K(bdi_thresh), 106 K(bdi_thresh), K(dirty_thresh),
56 K(dirty_thresh), 107 K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
57 K(background_thresh)); 108 !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
109 !list_empty(&bdi->wb_list), bdi->wb_cnt);
58#undef K 110#undef K
59 111
60 return 0; 112 return 0;
@@ -185,6 +237,13 @@ static int __init default_bdi_init(void)
185{ 237{
186 int err; 238 int err;
187 239
240 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
241 BUG_ON(IS_ERR(sync_supers_tsk));
242
243 init_timer(&sync_supers_timer);
244 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
245 arm_supers_timer();
246
188 err = bdi_init(&default_backing_dev_info); 247 err = bdi_init(&default_backing_dev_info);
189 if (!err) 248 if (!err)
190 bdi_register(&default_backing_dev_info, NULL, "default"); 249 bdi_register(&default_backing_dev_info, NULL, "default");
@@ -193,6 +252,279 @@ static int __init default_bdi_init(void)
193} 252}
194subsys_initcall(default_bdi_init); 253subsys_initcall(default_bdi_init);
195 254
255static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
256{
257 memset(wb, 0, sizeof(*wb));
258
259 wb->bdi = bdi;
260 wb->last_old_flush = jiffies;
261 INIT_LIST_HEAD(&wb->b_dirty);
262 INIT_LIST_HEAD(&wb->b_io);
263 INIT_LIST_HEAD(&wb->b_more_io);
264}
265
266static void bdi_task_init(struct backing_dev_info *bdi,
267 struct bdi_writeback *wb)
268{
269 struct task_struct *tsk = current;
270
271 spin_lock(&bdi->wb_lock);
272 list_add_tail_rcu(&wb->list, &bdi->wb_list);
273 spin_unlock(&bdi->wb_lock);
274
275 tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
276 set_freezable();
277
278 /*
279 * Our parent may run at a different priority, just set us to normal
280 */
281 set_user_nice(tsk, 0);
282}
283
284static int bdi_start_fn(void *ptr)
285{
286 struct bdi_writeback *wb = ptr;
287 struct backing_dev_info *bdi = wb->bdi;
288 int ret;
289
290 /*
291 * Add us to the active bdi_list
292 */
293 spin_lock_bh(&bdi_lock);
294 list_add_rcu(&bdi->bdi_list, &bdi_list);
295 spin_unlock_bh(&bdi_lock);
296
297 bdi_task_init(bdi, wb);
298
299 /*
300 * Clear pending bit and wakeup anybody waiting to tear us down
301 */
302 clear_bit(BDI_pending, &bdi->state);
303 smp_mb__after_clear_bit();
304 wake_up_bit(&bdi->state, BDI_pending);
305
306 ret = bdi_writeback_task(wb);
307
308 /*
309 * Remove us from the list
310 */
311 spin_lock(&bdi->wb_lock);
312 list_del_rcu(&wb->list);
313 spin_unlock(&bdi->wb_lock);
314
315 /*
316 * Flush any work that raced with us exiting. No new work
317 * will be added, since this bdi isn't discoverable anymore.
318 */
319 if (!list_empty(&bdi->work_list))
320 wb_do_writeback(wb, 1);
321
322 wb->task = NULL;
323 return ret;
324}
325
326int bdi_has_dirty_io(struct backing_dev_info *bdi)
327{
328 return wb_has_dirty_io(&bdi->wb);
329}
330
331static void bdi_flush_io(struct backing_dev_info *bdi)
332{
333 struct writeback_control wbc = {
334 .bdi = bdi,
335 .sync_mode = WB_SYNC_NONE,
336 .older_than_this = NULL,
337 .range_cyclic = 1,
338 .nr_to_write = 1024,
339 };
340
341 writeback_inodes_wbc(&wbc);
342}
343
344/*
345 * kupdated() used to do this. We cannot do it from the bdi_forker_task()
346 * or we risk deadlocking on ->s_umount. The longer term solution would be
347 * to implement sync_supers_bdi() or similar and simply do it from the
348 * bdi writeback tasks individually.
349 */
350static int bdi_sync_supers(void *unused)
351{
352 set_user_nice(current, 0);
353
354 while (!kthread_should_stop()) {
355 set_current_state(TASK_INTERRUPTIBLE);
356 schedule();
357
358 /*
359 * Do this periodically, like kupdated() did before.
360 */
361 sync_supers();
362 }
363
364 return 0;
365}
366
367static void arm_supers_timer(void)
368{
369 unsigned long next;
370
371 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
372 mod_timer(&sync_supers_timer, round_jiffies_up(next));
373}
374
375static void sync_supers_timer_fn(unsigned long unused)
376{
377 wake_up_process(sync_supers_tsk);
378 arm_supers_timer();
379}
380
381static int bdi_forker_task(void *ptr)
382{
383 struct bdi_writeback *me = ptr;
384
385 bdi_task_init(me->bdi, me);
386
387 for (;;) {
388 struct backing_dev_info *bdi, *tmp;
389 struct bdi_writeback *wb;
390
391 /*
392 * Temporary measure, we want to make sure we don't see
393 * dirty data on the default backing_dev_info
394 */
395 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
396 wb_do_writeback(me, 0);
397
398 spin_lock_bh(&bdi_lock);
399
400 /*
401 * Check if any existing bdi's have dirty data without
402 * a thread registered. If so, set that up.
403 */
404 list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
405 if (bdi->wb.task)
406 continue;
407 if (list_empty(&bdi->work_list) &&
408 !bdi_has_dirty_io(bdi))
409 continue;
410
411 bdi_add_default_flusher_task(bdi);
412 }
413
414 set_current_state(TASK_INTERRUPTIBLE);
415
416 if (list_empty(&bdi_pending_list)) {
417 unsigned long wait;
418
419 spin_unlock_bh(&bdi_lock);
420 wait = msecs_to_jiffies(dirty_writeback_interval * 10);
421 schedule_timeout(wait);
422 try_to_freeze();
423 continue;
424 }
425
426 __set_current_state(TASK_RUNNING);
427
428 /*
429 * This is our real job - check for pending entries in
430 * bdi_pending_list, and create the tasks that got added
431 */
432 bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
433 bdi_list);
434 list_del_init(&bdi->bdi_list);
435 spin_unlock_bh(&bdi_lock);
436
437 wb = &bdi->wb;
438 wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
439 dev_name(bdi->dev));
440 /*
441 * If task creation fails, then readd the bdi to
442 * the pending list and force writeout of the bdi
443 * from this forker thread. That will free some memory
444 * and we can try again.
445 */
446 if (IS_ERR(wb->task)) {
447 wb->task = NULL;
448
449 /*
450 * Add this 'bdi' to the back, so we get
451 * a chance to flush other bdi's to free
452 * memory.
453 */
454 spin_lock_bh(&bdi_lock);
455 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
456 spin_unlock_bh(&bdi_lock);
457
458 bdi_flush_io(bdi);
459 }
460 }
461
462 return 0;
463}
464
465static void bdi_add_to_pending(struct rcu_head *head)
466{
467 struct backing_dev_info *bdi;
468
469 bdi = container_of(head, struct backing_dev_info, rcu_head);
470 INIT_LIST_HEAD(&bdi->bdi_list);
471
472 spin_lock(&bdi_lock);
473 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
474 spin_unlock(&bdi_lock);
475
476 /*
477 * We are now on the pending list, wake up bdi_forker_task()
478 * to finish the job and add us back to the active bdi_list
479 */
480 wake_up_process(default_backing_dev_info.wb.task);
481}
482
483/*
484 * Add the default flusher task that gets created for any bdi
485 * that has dirty data pending writeout
486 */
487void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
488{
489 if (!bdi_cap_writeback_dirty(bdi))
490 return;
491
492 if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
493 printk(KERN_ERR "bdi %p/%s is not registered!\n",
494 bdi, bdi->name);
495 return;
496 }
497
498 /*
499 * Check with the helper whether to proceed adding a task. Will only
500 * abort if two or more simultaneous calls to
501 * bdi_add_default_flusher_task() occurred; further additions will block
502 * waiting for previous additions to finish.
503 */
504 if (!test_and_set_bit(BDI_pending, &bdi->state)) {
505 list_del_rcu(&bdi->bdi_list);
506
507 /*
508 * We must wait for the current RCU period to end before
509 * moving to the pending list. So schedule that operation
510 * from an RCU callback.
511 */
512 call_rcu(&bdi->rcu_head, bdi_add_to_pending);
513 }
514}
515
516/*
517 * Remove bdi from bdi_list, and ensure that it is no longer visible
518 */
519static void bdi_remove_from_list(struct backing_dev_info *bdi)
520{
521 spin_lock_bh(&bdi_lock);
522 list_del_rcu(&bdi->bdi_list);
523 spin_unlock_bh(&bdi_lock);
524
525 synchronize_rcu();
526}
527
196int bdi_register(struct backing_dev_info *bdi, struct device *parent, 528int bdi_register(struct backing_dev_info *bdi, struct device *parent,
197 const char *fmt, ...) 529 const char *fmt, ...)
198{ 530{
@@ -211,9 +543,33 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
211 goto exit; 543 goto exit;
212 } 544 }
213 545
546 spin_lock_bh(&bdi_lock);
547 list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
548 spin_unlock_bh(&bdi_lock);
549
214 bdi->dev = dev; 550 bdi->dev = dev;
215 bdi_debug_register(bdi, dev_name(dev));
216 551
552 /*
553 * Just start the forker thread for our default backing_dev_info,
554 * and add other bdi's to the list. They will get a thread created
555 * on-demand when they need it.
556 */
557 if (bdi_cap_flush_forker(bdi)) {
558 struct bdi_writeback *wb = &bdi->wb;
559
560 wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
561 dev_name(dev));
562 if (IS_ERR(wb->task)) {
563 wb->task = NULL;
564 ret = -ENOMEM;
565
566 bdi_remove_from_list(bdi);
567 goto exit;
568 }
569 }
570
571 bdi_debug_register(bdi, dev_name(dev));
572 set_bit(BDI_registered, &bdi->state);
217exit: 573exit:
218 return ret; 574 return ret;
219} 575}
@@ -225,9 +581,40 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
225} 581}
226EXPORT_SYMBOL(bdi_register_dev); 582EXPORT_SYMBOL(bdi_register_dev);
227 583
584/*
585 * Remove bdi from the global list and shutdown any threads we have running
586 */
587static void bdi_wb_shutdown(struct backing_dev_info *bdi)
588{
589 struct bdi_writeback *wb;
590
591 if (!bdi_cap_writeback_dirty(bdi))
592 return;
593
594 /*
595 * If setup is pending, wait for that to complete first
596 */
597 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
598 TASK_UNINTERRUPTIBLE);
599
600 /*
601 * Make sure nobody finds us on the bdi_list anymore
602 */
603 bdi_remove_from_list(bdi);
604
605 /*
606 * Finally, kill the kernel threads. We don't need to be RCU
607 * safe anymore, since the bdi is gone from visibility.
608 */
609 list_for_each_entry(wb, &bdi->wb_list, list)
610 kthread_stop(wb->task);
611}
612
228void bdi_unregister(struct backing_dev_info *bdi) 613void bdi_unregister(struct backing_dev_info *bdi)
229{ 614{
230 if (bdi->dev) { 615 if (bdi->dev) {
616 if (!bdi_cap_flush_forker(bdi))
617 bdi_wb_shutdown(bdi);
231 bdi_debug_unregister(bdi); 618 bdi_debug_unregister(bdi);
232 device_unregister(bdi->dev); 619 device_unregister(bdi->dev);
233 bdi->dev = NULL; 620 bdi->dev = NULL;
@@ -237,14 +624,26 @@ EXPORT_SYMBOL(bdi_unregister);
237 624
238int bdi_init(struct backing_dev_info *bdi) 625int bdi_init(struct backing_dev_info *bdi)
239{ 626{
240 int i; 627 int i, err;
241 int err;
242 628
243 bdi->dev = NULL; 629 bdi->dev = NULL;
244 630
245 bdi->min_ratio = 0; 631 bdi->min_ratio = 0;
246 bdi->max_ratio = 100; 632 bdi->max_ratio = 100;
247 bdi->max_prop_frac = PROP_FRAC_BASE; 633 bdi->max_prop_frac = PROP_FRAC_BASE;
634 spin_lock_init(&bdi->wb_lock);
635 INIT_RCU_HEAD(&bdi->rcu_head);
636 INIT_LIST_HEAD(&bdi->bdi_list);
637 INIT_LIST_HEAD(&bdi->wb_list);
638 INIT_LIST_HEAD(&bdi->work_list);
639
640 bdi_wb_init(&bdi->wb, bdi);
641
642 /*
643 * Just one thread support for now, hard code mask and count
644 */
645 bdi->wb_mask = 1;
646 bdi->wb_cnt = 1;
248 647
249 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { 648 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
250 err = percpu_counter_init(&bdi->bdi_stat[i], 0); 649 err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -269,6 +668,20 @@ void bdi_destroy(struct backing_dev_info *bdi)
269{ 668{
270 int i; 669 int i;
271 670
671 /*
672 * Splice our entries to the default_backing_dev_info, if this
673 * bdi disappears
674 */
675 if (bdi_has_dirty_io(bdi)) {
676 struct bdi_writeback *dst = &default_backing_dev_info.wb;
677
678 spin_lock(&inode_lock);
679 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
680 list_splice(&bdi->wb.b_io, &dst->b_io);
681 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
682 spin_unlock(&inode_lock);
683 }
684
272 bdi_unregister(bdi); 685 bdi_unregister(bdi);
273 686
274 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 687 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
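For context on how the reworked backing-dev code is consumed: a driver or filesystem still goes through bdi_init()/bdi_register(), and with this change bdi_register() also adds the device to bdi_list; the forker thread started for the default bdi then spawns per-bdi flusher threads on demand, and the new .name field feeds the "bdi %p/%s is not registered" diagnostic above. The snippet below is a hedged usage sketch, not code from this patch; "example" is a hypothetical device name.

/* Illustrative only; assumes <linux/backing-dev.h>, <linux/mm.h>,
 * <linux/pagemap.h>. */
static struct backing_dev_info example_bdi = {
	.name		= "example",
	.ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
	.capabilities	= BDI_CAP_MAP_COPY,
};

static int __init example_bdi_setup(void)
{
	int err;

	err = bdi_init(&example_bdi);
	if (err)
		return err;

	err = bdi_register(&example_bdi, NULL, "example");
	if (err)
		bdi_destroy(&example_bdi);
	return err;
}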
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 701740c9e81b..555d5d2731c6 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -521,7 +521,11 @@ find_block:
521 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + 521 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
522 start_off); 522 start_off);
523 memset(region, 0, size); 523 memset(region, 0, size);
524 kmemleak_alloc(region, size, 1, 0); 524 /*
525 * The min_count is set to 0 so that bootmem allocated blocks
526 * are never reported as leaks.
527 */
528 kmemleak_alloc(region, size, 0, 0);
525 return region; 529 return region;
526 } 530 }
527 531
diff --git a/mm/filemap.c b/mm/filemap.c
index ccea3b665c12..6c84e598b4a9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -39,11 +39,10 @@
39/* 39/*
40 * FIXME: remove all knowledge of the buffer layer from the core VM 40 * FIXME: remove all knowledge of the buffer layer from the core VM
41 */ 41 */
42#include <linux/buffer_head.h> /* for generic_osync_inode */ 42#include <linux/buffer_head.h> /* for try_to_free_buffers */
43 43
44#include <asm/mman.h> 44#include <asm/mman.h>
45 45
46
47/* 46/*
48 * Shared mappings implemented 30.11.1994. It's not fully working yet, 47 * Shared mappings implemented 30.11.1994. It's not fully working yet,
49 * though. 48 * though.
@@ -59,7 +58,7 @@
59/* 58/*
60 * Lock ordering: 59 * Lock ordering:
61 * 60 *
62 * ->i_mmap_lock (vmtruncate) 61 * ->i_mmap_lock (truncate_pagecache)
63 * ->private_lock (__free_pte->__set_page_dirty_buffers) 62 * ->private_lock (__free_pte->__set_page_dirty_buffers)
64 * ->swap_lock (exclusive_swap_page, others) 63 * ->swap_lock (exclusive_swap_page, others)
65 * ->mapping->tree_lock 64 * ->mapping->tree_lock
@@ -105,6 +104,10 @@
105 * 104 *
106 * ->task->proc_lock 105 * ->task->proc_lock
107 * ->dcache_lock (proc_pid_lookup) 106 * ->dcache_lock (proc_pid_lookup)
107 *
108 * (code doesn't rely on that order, so you could switch it around)
109 * ->tasklist_lock (memory_failure, collect_procs_ao)
110 * ->i_mmap_lock
108 */ 111 */
109 112
110/* 113/*
@@ -120,6 +123,8 @@ void __remove_from_page_cache(struct page *page)
120 page->mapping = NULL; 123 page->mapping = NULL;
121 mapping->nrpages--; 124 mapping->nrpages--;
122 __dec_zone_page_state(page, NR_FILE_PAGES); 125 __dec_zone_page_state(page, NR_FILE_PAGES);
126 if (PageSwapBacked(page))
127 __dec_zone_page_state(page, NR_SHMEM);
123 BUG_ON(page_mapped(page)); 128 BUG_ON(page_mapped(page));
124 129
125 /* 130 /*
@@ -307,68 +312,24 @@ int wait_on_page_writeback_range(struct address_space *mapping,
307} 312}
308 313
309/** 314/**
310 * sync_page_range - write and wait on all pages in the passed range 315 * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range
311 * @inode: target inode 316 * @mapping: address space structure to wait for
312 * @mapping: target address_space 317 * @start: offset in bytes where the range starts
313 * @pos: beginning offset in pages to write 318 * @end: offset in bytes where the range ends (inclusive)
314 * @count: number of bytes to write
315 *
316 * Write and wait upon all the pages in the passed range. This is a "data
317 * integrity" operation. It waits upon in-flight writeout before starting and
318 * waiting upon new writeout. If there was an IO error, return it.
319 * 319 *
320 * We need to re-take i_mutex during the generic_osync_inode list walk because 320 * Walk the list of under-writeback pages of the given address space
321 * it is otherwise livelockable. 321 * in the given range and wait for all of them.
322 */
323int sync_page_range(struct inode *inode, struct address_space *mapping,
324 loff_t pos, loff_t count)
325{
326 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
327 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
328 int ret;
329
330 if (!mapping_cap_writeback_dirty(mapping) || !count)
331 return 0;
332 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
333 if (ret == 0) {
334 mutex_lock(&inode->i_mutex);
335 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
336 mutex_unlock(&inode->i_mutex);
337 }
338 if (ret == 0)
339 ret = wait_on_page_writeback_range(mapping, start, end);
340 return ret;
341}
342EXPORT_SYMBOL(sync_page_range);
343
344/**
345 * sync_page_range_nolock - write & wait on all pages in the passed range without locking
346 * @inode: target inode
347 * @mapping: target address_space
348 * @pos: beginning offset in pages to write
349 * @count: number of bytes to write
350 * 322 *
351 * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea 323 * This is just a simple wrapper so that callers don't have to convert offsets
352 * as it forces O_SYNC writers to different parts of the same file 324 * to page indexes themselves
353 * to be serialised right until io completion.
354 */ 325 */
355int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, 326int filemap_fdatawait_range(struct address_space *mapping, loff_t start,
356 loff_t pos, loff_t count) 327 loff_t end)
357{ 328{
358 pgoff_t start = pos >> PAGE_CACHE_SHIFT; 329 return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT,
359 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 330 end >> PAGE_CACHE_SHIFT);
360 int ret;
361
362 if (!mapping_cap_writeback_dirty(mapping) || !count)
363 return 0;
364 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
365 if (ret == 0)
366 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
367 if (ret == 0)
368 ret = wait_on_page_writeback_range(mapping, start, end);
369 return ret;
370} 331}
371EXPORT_SYMBOL(sync_page_range_nolock); 332EXPORT_SYMBOL(filemap_fdatawait_range);
372 333
373/** 334/**
374 * filemap_fdatawait - wait for all under-writeback pages to complete 335 * filemap_fdatawait - wait for all under-writeback pages to complete
@@ -476,6 +437,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
476 if (likely(!error)) { 437 if (likely(!error)) {
477 mapping->nrpages++; 438 mapping->nrpages++;
478 __inc_zone_page_state(page, NR_FILE_PAGES); 439 __inc_zone_page_state(page, NR_FILE_PAGES);
440 if (PageSwapBacked(page))
441 __inc_zone_page_state(page, NR_SHMEM);
479 spin_unlock_irq(&mapping->tree_lock); 442 spin_unlock_irq(&mapping->tree_lock);
480 } else { 443 } else {
481 page->mapping = NULL; 444 page->mapping = NULL;
@@ -2167,20 +2130,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2167 } 2130 }
2168 *ppos = end; 2131 *ppos = end;
2169 } 2132 }
2170
2171 /*
2172 * Sync the fs metadata but not the minor inode changes and
2173 * of course not the data as we did direct DMA for the IO.
2174 * i_mutex is held, which protects generic_osync_inode() from
2175 * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
2176 */
2177out: 2133out:
2178 if ((written >= 0 || written == -EIOCBQUEUED) &&
2179 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2180 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
2181 if (err < 0)
2182 written = err;
2183 }
2184 return written; 2134 return written;
2185} 2135}
2186EXPORT_SYMBOL(generic_file_direct_write); 2136EXPORT_SYMBOL(generic_file_direct_write);
@@ -2312,8 +2262,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2312{ 2262{
2313 struct file *file = iocb->ki_filp; 2263 struct file *file = iocb->ki_filp;
2314 struct address_space *mapping = file->f_mapping; 2264 struct address_space *mapping = file->f_mapping;
2315 const struct address_space_operations *a_ops = mapping->a_ops;
2316 struct inode *inode = mapping->host;
2317 ssize_t status; 2265 ssize_t status;
2318 struct iov_iter i; 2266 struct iov_iter i;
2319 2267
@@ -2323,16 +2271,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2323 if (likely(status >= 0)) { 2271 if (likely(status >= 0)) {
2324 written += status; 2272 written += status;
2325 *ppos = pos + status; 2273 *ppos = pos + status;
2326
2327 /*
2328 * For now, when the user asks for O_SYNC, we'll actually give
2329 * O_DSYNC
2330 */
2331 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2332 if (!a_ops->writepage || !is_sync_kiocb(iocb))
2333 status = generic_osync_inode(inode, mapping,
2334 OSYNC_METADATA|OSYNC_DATA);
2335 }
2336 } 2274 }
2337 2275
2338 /* 2276 /*
@@ -2348,9 +2286,27 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2348} 2286}
2349EXPORT_SYMBOL(generic_file_buffered_write); 2287EXPORT_SYMBOL(generic_file_buffered_write);
2350 2288
2351static ssize_t 2289/**
2352__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, 2290 * __generic_file_aio_write - write data to a file
2353 unsigned long nr_segs, loff_t *ppos) 2291 * @iocb: IO state structure (file, offset, etc.)
2292 * @iov: vector with data to write
2293 * @nr_segs: number of segments in the vector
2294 * @ppos: position where to write
2295 *
2296 * This function does all the work needed for actually writing data to a
2297 * file. It does all basic checks, removes SUID from the file, updates
2298 * modification times and calls proper subroutines depending on whether we
2299 * do direct IO or a standard buffered write.
2300 *
2301 * It expects i_mutex to be grabbed unless we work on a block device or similar
2302 * object which does not need locking at all.
2303 *
2304 * This function does *not* take care of syncing data in case of O_SYNC write.
2305 * A caller has to handle it. This is mainly due to the fact that we want to
2306 * avoid syncing under i_mutex.
2307 */
2308ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2309 unsigned long nr_segs, loff_t *ppos)
2354{ 2310{
2355 struct file *file = iocb->ki_filp; 2311 struct file *file = iocb->ki_filp;
2356 struct address_space * mapping = file->f_mapping; 2312 struct address_space * mapping = file->f_mapping;
@@ -2447,51 +2403,37 @@ out:
2447 current->backing_dev_info = NULL; 2403 current->backing_dev_info = NULL;
2448 return written ? written : err; 2404 return written ? written : err;
2449} 2405}
2406EXPORT_SYMBOL(__generic_file_aio_write);
2450 2407
2451ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, 2408/**
2452 const struct iovec *iov, unsigned long nr_segs, loff_t pos) 2409 * generic_file_aio_write - write data to a file
2453{ 2410 * @iocb: IO state structure
2454 struct file *file = iocb->ki_filp; 2411 * @iov: vector with data to write
2455 struct address_space *mapping = file->f_mapping; 2412 * @nr_segs: number of segments in the vector
2456 struct inode *inode = mapping->host; 2413 * @pos: position in file where to write
2457 ssize_t ret; 2414 *
2458 2415 * This is a wrapper around __generic_file_aio_write() to be used by most
2459 BUG_ON(iocb->ki_pos != pos); 2416 * filesystems. It takes care of syncing the file in case of O_SYNC file
2460 2417 * and acquires i_mutex as needed.
2461 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, 2418 */
2462 &iocb->ki_pos);
2463
2464 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2465 ssize_t err;
2466
2467 err = sync_page_range_nolock(inode, mapping, pos, ret);
2468 if (err < 0)
2469 ret = err;
2470 }
2471 return ret;
2472}
2473EXPORT_SYMBOL(generic_file_aio_write_nolock);
2474
2475ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 2419ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2476 unsigned long nr_segs, loff_t pos) 2420 unsigned long nr_segs, loff_t pos)
2477{ 2421{
2478 struct file *file = iocb->ki_filp; 2422 struct file *file = iocb->ki_filp;
2479 struct address_space *mapping = file->f_mapping; 2423 struct inode *inode = file->f_mapping->host;
2480 struct inode *inode = mapping->host;
2481 ssize_t ret; 2424 ssize_t ret;
2482 2425
2483 BUG_ON(iocb->ki_pos != pos); 2426 BUG_ON(iocb->ki_pos != pos);
2484 2427
2485 mutex_lock(&inode->i_mutex); 2428 mutex_lock(&inode->i_mutex);
2486 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, 2429 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2487 &iocb->ki_pos);
2488 mutex_unlock(&inode->i_mutex); 2430 mutex_unlock(&inode->i_mutex);
2489 2431
2490 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2432 if (ret > 0 || ret == -EIOCBQUEUED) {
2491 ssize_t err; 2433 ssize_t err;
2492 2434
2493 err = sync_page_range(inode, mapping, pos, ret); 2435 err = generic_write_sync(file, pos, ret);
2494 if (err < 0) 2436 if (err < 0 && ret > 0)
2495 ret = err; 2437 ret = err;
2496 } 2438 }
2497 return ret; 2439 return ret;
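The new filemap_fdatawait_range() is described above as a byte-offset wrapper around wait_on_page_writeback_range(). The sketch below shows the intended calling pattern for a ranged data-integrity sync; it is an illustration, not code from this patch, and sync_range_sketch is a hypothetical helper name.

/* Write back and then wait on a byte range; assumes <linux/fs.h>. */
static int sync_range_sketch(struct address_space *mapping,
			     loff_t pos, loff_t count)
{
	loff_t end = pos + count - 1;
	int err;

	if (!count)
		return 0;

	err = filemap_fdatawrite_range(mapping, pos, end);
	if (err)
		return err;

	/* Wait for the writeback started above (and any already in flight). */
	return filemap_fdatawait_range(mapping, pos, end);
}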
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cafdcee154e8..6f048fcc749c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -234,6 +234,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
234 234
235 return 1UL << (hstate->order + PAGE_SHIFT); 235 return 1UL << (hstate->order + PAGE_SHIFT);
236} 236}
237EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
237 238
238/* 239/*
239 * Return the page size being used by the MMU to back a VMA. In the majority 240 * Return the page size being used by the MMU to back a VMA. In the majority
@@ -455,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
455 h->free_huge_pages_node[nid]++; 456 h->free_huge_pages_node[nid]++;
456} 457}
457 458
458static struct page *dequeue_huge_page(struct hstate *h)
459{
460 int nid;
461 struct page *page = NULL;
462
463 for (nid = 0; nid < MAX_NUMNODES; ++nid) {
464 if (!list_empty(&h->hugepage_freelists[nid])) {
465 page = list_entry(h->hugepage_freelists[nid].next,
466 struct page, lru);
467 list_del(&page->lru);
468 h->free_huge_pages--;
469 h->free_huge_pages_node[nid]--;
470 break;
471 }
472 }
473 return page;
474}
475
476static struct page *dequeue_huge_page_vma(struct hstate *h, 459static struct page *dequeue_huge_page_vma(struct hstate *h,
477 struct vm_area_struct *vma, 460 struct vm_area_struct *vma,
478 unsigned long address, int avoid_reserve) 461 unsigned long address, int avoid_reserve)
@@ -640,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
640 623
641/* 624/*
642 * Use a helper variable to find the next node and then 625 * Use a helper variable to find the next node and then
643 * copy it back to hugetlb_next_nid afterwards: 626 * copy it back to next_nid_to_alloc afterwards:
644 * otherwise there's a window in which a racer might 627 * otherwise there's a window in which a racer might
645 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. 628 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
646 * But we don't need to use a spin_lock here: it really 629 * But we don't need to use a spin_lock here: it really
@@ -649,13 +632,13 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
649 * if we just successfully allocated a hugepage so that 632 * if we just successfully allocated a hugepage so that
650 * the next caller gets hugepages on the next node. 633 * the next caller gets hugepages on the next node.
651 */ 634 */
652static int hstate_next_node(struct hstate *h) 635static int hstate_next_node_to_alloc(struct hstate *h)
653{ 636{
654 int next_nid; 637 int next_nid;
655 next_nid = next_node(h->hugetlb_next_nid, node_online_map); 638 next_nid = next_node(h->next_nid_to_alloc, node_online_map);
656 if (next_nid == MAX_NUMNODES) 639 if (next_nid == MAX_NUMNODES)
657 next_nid = first_node(node_online_map); 640 next_nid = first_node(node_online_map);
658 h->hugetlb_next_nid = next_nid; 641 h->next_nid_to_alloc = next_nid;
659 return next_nid; 642 return next_nid;
660} 643}
661 644
@@ -666,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h)
666 int next_nid; 649 int next_nid;
667 int ret = 0; 650 int ret = 0;
668 651
669 start_nid = h->hugetlb_next_nid; 652 start_nid = h->next_nid_to_alloc;
653 next_nid = start_nid;
670 654
671 do { 655 do {
672 page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); 656 page = alloc_fresh_huge_page_node(h, next_nid);
673 if (page) 657 if (page)
674 ret = 1; 658 ret = 1;
675 next_nid = hstate_next_node(h); 659 next_nid = hstate_next_node_to_alloc(h);
676 } while (!page && h->hugetlb_next_nid != start_nid); 660 } while (!page && next_nid != start_nid);
677 661
678 if (ret) 662 if (ret)
679 count_vm_event(HTLB_BUDDY_PGALLOC); 663 count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -683,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h)
683 return ret; 667 return ret;
684} 668}
685 669
670/*
671 * helper for free_pool_huge_page() - find next node
672 * from which to free a huge page
673 */
674static int hstate_next_node_to_free(struct hstate *h)
675{
676 int next_nid;
677 next_nid = next_node(h->next_nid_to_free, node_online_map);
678 if (next_nid == MAX_NUMNODES)
679 next_nid = first_node(node_online_map);
680 h->next_nid_to_free = next_nid;
681 return next_nid;
682}
683
684/*
685 * Free huge page from pool from next node to free.
686 * Attempt to keep persistent huge pages more or less
687 * balanced over allowed nodes.
688 * Called with hugetlb_lock locked.
689 */
690static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
691{
692 int start_nid;
693 int next_nid;
694 int ret = 0;
695
696 start_nid = h->next_nid_to_free;
697 next_nid = start_nid;
698
699 do {
700 /*
701 * If we're returning unused surplus pages, only examine
702 * nodes with surplus pages.
703 */
704 if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
705 !list_empty(&h->hugepage_freelists[next_nid])) {
706 struct page *page =
707 list_entry(h->hugepage_freelists[next_nid].next,
708 struct page, lru);
709 list_del(&page->lru);
710 h->free_huge_pages--;
711 h->free_huge_pages_node[next_nid]--;
712 if (acct_surplus) {
713 h->surplus_huge_pages--;
714 h->surplus_huge_pages_node[next_nid]--;
715 }
716 update_and_free_page(h, page);
717 ret = 1;
718 }
719 next_nid = hstate_next_node_to_free(h);
720 } while (!ret && next_nid != start_nid);
721
722 return ret;
723}
724
686static struct page *alloc_buddy_huge_page(struct hstate *h, 725static struct page *alloc_buddy_huge_page(struct hstate *h,
687 struct vm_area_struct *vma, unsigned long address) 726 struct vm_area_struct *vma, unsigned long address)
688{ 727{
@@ -854,22 +893,13 @@ free:
854 * When releasing a hugetlb pool reservation, any surplus pages that were 893 * When releasing a hugetlb pool reservation, any surplus pages that were
855 * allocated to satisfy the reservation must be explicitly freed if they were 894 * allocated to satisfy the reservation must be explicitly freed if they were
856 * never used. 895 * never used.
896 * Called with hugetlb_lock held.
857 */ 897 */
858static void return_unused_surplus_pages(struct hstate *h, 898static void return_unused_surplus_pages(struct hstate *h,
859 unsigned long unused_resv_pages) 899 unsigned long unused_resv_pages)
860{ 900{
861 static int nid = -1;
862 struct page *page;
863 unsigned long nr_pages; 901 unsigned long nr_pages;
864 902
865 /*
866 * We want to release as many surplus pages as possible, spread
867 * evenly across all nodes. Iterate across all nodes until we
868 * can no longer free unreserved surplus pages. This occurs when
869 * the nodes with surplus pages have no free pages.
870 */
871 unsigned long remaining_iterations = nr_online_nodes;
872
873 /* Uncommit the reservation */ 903 /* Uncommit the reservation */
874 h->resv_huge_pages -= unused_resv_pages; 904 h->resv_huge_pages -= unused_resv_pages;
875 905
@@ -879,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h,
879 909
880 nr_pages = min(unused_resv_pages, h->surplus_huge_pages); 910 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
881 911
882 while (remaining_iterations-- && nr_pages) { 912 /*
883 nid = next_node(nid, node_online_map); 913 * We want to release as many surplus pages as possible, spread
884 if (nid == MAX_NUMNODES) 914 * evenly across all nodes. Iterate across all nodes until we
885 nid = first_node(node_online_map); 915 * can no longer free unreserved surplus pages. This occurs when
886 916 * the nodes with surplus pages have no free pages.
887 if (!h->surplus_huge_pages_node[nid]) 917 * free_pool_huge_page() will balance the frees across the
888 continue; 918 * on-line nodes for us and will handle the hstate accounting.
889 919 */
890 if (!list_empty(&h->hugepage_freelists[nid])) { 920 while (nr_pages--) {
891 page = list_entry(h->hugepage_freelists[nid].next, 921 if (!free_pool_huge_page(h, 1))
892 struct page, lru); 922 break;
893 list_del(&page->lru);
894 update_and_free_page(h, page);
895 h->free_huge_pages--;
896 h->free_huge_pages_node[nid]--;
897 h->surplus_huge_pages--;
898 h->surplus_huge_pages_node[nid]--;
899 nr_pages--;
900 remaining_iterations = nr_online_nodes;
901 }
902 } 923 }
903} 924}
904 925
@@ -1007,9 +1028,10 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1007 void *addr; 1028 void *addr;
1008 1029
1009 addr = __alloc_bootmem_node_nopanic( 1030 addr = __alloc_bootmem_node_nopanic(
1010 NODE_DATA(h->hugetlb_next_nid), 1031 NODE_DATA(h->next_nid_to_alloc),
1011 huge_page_size(h), huge_page_size(h), 0); 1032 huge_page_size(h), huge_page_size(h), 0);
1012 1033
1034 hstate_next_node_to_alloc(h);
1013 if (addr) { 1035 if (addr) {
1014 /* 1036 /*
1015 * Use the beginning of the huge page to store the 1037 * Use the beginning of the huge page to store the
@@ -1019,7 +1041,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1019 m = addr; 1041 m = addr;
1020 goto found; 1042 goto found;
1021 } 1043 }
1022 hstate_next_node(h);
1023 nr_nodes--; 1044 nr_nodes--;
1024 } 1045 }
1025 return 0; 1046 return 0;
@@ -1140,31 +1161,43 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
1140 */ 1161 */
1141static int adjust_pool_surplus(struct hstate *h, int delta) 1162static int adjust_pool_surplus(struct hstate *h, int delta)
1142{ 1163{
1143 static int prev_nid; 1164 int start_nid, next_nid;
1144 int nid = prev_nid;
1145 int ret = 0; 1165 int ret = 0;
1146 1166
1147 VM_BUG_ON(delta != -1 && delta != 1); 1167 VM_BUG_ON(delta != -1 && delta != 1);
1148 do {
1149 nid = next_node(nid, node_online_map);
1150 if (nid == MAX_NUMNODES)
1151 nid = first_node(node_online_map);
1152 1168
1153 /* To shrink on this node, there must be a surplus page */ 1169 if (delta < 0)
1154 if (delta < 0 && !h->surplus_huge_pages_node[nid]) 1170 start_nid = h->next_nid_to_alloc;
1155 continue; 1171 else
1156 /* Surplus cannot exceed the total number of pages */ 1172 start_nid = h->next_nid_to_free;
1157 if (delta > 0 && h->surplus_huge_pages_node[nid] >= 1173 next_nid = start_nid;
1174
1175 do {
1176 int nid = next_nid;
1177 if (delta < 0) {
1178 next_nid = hstate_next_node_to_alloc(h);
1179 /*
1180 * To shrink on this node, there must be a surplus page
1181 */
1182 if (!h->surplus_huge_pages_node[nid])
1183 continue;
1184 }
1185 if (delta > 0) {
1186 next_nid = hstate_next_node_to_free(h);
1187 /*
1188 * Surplus cannot exceed the total number of pages
1189 */
1190 if (h->surplus_huge_pages_node[nid] >=
1158 h->nr_huge_pages_node[nid]) 1191 h->nr_huge_pages_node[nid])
1159 continue; 1192 continue;
1193 }
1160 1194
1161 h->surplus_huge_pages += delta; 1195 h->surplus_huge_pages += delta;
1162 h->surplus_huge_pages_node[nid] += delta; 1196 h->surplus_huge_pages_node[nid] += delta;
1163 ret = 1; 1197 ret = 1;
1164 break; 1198 break;
1165 } while (nid != prev_nid); 1199 } while (next_nid != start_nid);
1166 1200
1167 prev_nid = nid;
1168 return ret; 1201 return ret;
1169} 1202}
1170 1203
@@ -1226,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1226 min_count = max(count, min_count); 1259 min_count = max(count, min_count);
1227 try_to_free_low(h, min_count); 1260 try_to_free_low(h, min_count);
1228 while (min_count < persistent_huge_pages(h)) { 1261 while (min_count < persistent_huge_pages(h)) {
1229 struct page *page = dequeue_huge_page(h); 1262 if (!free_pool_huge_page(h, 0))
1230 if (!page)
1231 break; 1263 break;
1232 update_and_free_page(h, page);
1233 } 1264 }
1234 while (count < persistent_huge_pages(h)) { 1265 while (count < persistent_huge_pages(h)) {
1235 if (!adjust_pool_surplus(h, 1)) 1266 if (!adjust_pool_surplus(h, 1))
@@ -1441,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order)
1441 h->free_huge_pages = 0; 1472 h->free_huge_pages = 0;
1442 for (i = 0; i < MAX_NUMNODES; ++i) 1473 for (i = 0; i < MAX_NUMNODES; ++i)
1443 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1474 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1444 h->hugetlb_next_nid = first_node(node_online_map); 1475 h->next_nid_to_alloc = first_node(node_online_map);
1476 h->next_nid_to_free = first_node(node_online_map);
1445 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1477 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1446 huge_page_size(h)/1024); 1478 huge_page_size(h)/1024);
1447 1479
@@ -1505,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
1505 1537
1506#ifdef CONFIG_SYSCTL 1538#ifdef CONFIG_SYSCTL
1507int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1539int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1508 struct file *file, void __user *buffer, 1540 void __user *buffer,
1509 size_t *length, loff_t *ppos) 1541 size_t *length, loff_t *ppos)
1510{ 1542{
1511 struct hstate *h = &default_hstate; 1543 struct hstate *h = &default_hstate;
@@ -1516,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1516 1548
1517 table->data = &tmp; 1549 table->data = &tmp;
1518 table->maxlen = sizeof(unsigned long); 1550 table->maxlen = sizeof(unsigned long);
1519 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1551 proc_doulongvec_minmax(table, write, buffer, length, ppos);
1520 1552
1521 if (write) 1553 if (write)
1522 h->max_huge_pages = set_max_huge_pages(h, tmp); 1554 h->max_huge_pages = set_max_huge_pages(h, tmp);
@@ -1525,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1525} 1557}
1526 1558
1527int hugetlb_treat_movable_handler(struct ctl_table *table, int write, 1559int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1528 struct file *file, void __user *buffer, 1560 void __user *buffer,
1529 size_t *length, loff_t *ppos) 1561 size_t *length, loff_t *ppos)
1530{ 1562{
1531 proc_dointvec(table, write, file, buffer, length, ppos); 1563 proc_dointvec(table, write, buffer, length, ppos);
1532 if (hugepages_treat_as_movable) 1564 if (hugepages_treat_as_movable)
1533 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; 1565 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
1534 else 1566 else
@@ -1537,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1537} 1569}
1538 1570
1539int hugetlb_overcommit_handler(struct ctl_table *table, int write, 1571int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1540 struct file *file, void __user *buffer, 1572 void __user *buffer,
1541 size_t *length, loff_t *ppos) 1573 size_t *length, loff_t *ppos)
1542{ 1574{
1543 struct hstate *h = &default_hstate; 1575 struct hstate *h = &default_hstate;
@@ -1548,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1548 1580
1549 table->data = &tmp; 1581 table->data = &tmp;
1550 table->maxlen = sizeof(unsigned long); 1582 table->maxlen = sizeof(unsigned long);
1551 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1583 proc_doulongvec_minmax(table, write, buffer, length, ppos);
1552 1584
1553 if (write) { 1585 if (write) {
1554 spin_lock(&hugetlb_lock); 1586 spin_lock(&hugetlb_lock);
@@ -1984,6 +2016,26 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h,
1984 return find_lock_page(mapping, idx); 2016 return find_lock_page(mapping, idx);
1985} 2017}
1986 2018
2019/*
2020 * Return whether there is a pagecache page to back given address within VMA.
2021 * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
2022 */
2023static bool hugetlbfs_pagecache_present(struct hstate *h,
2024 struct vm_area_struct *vma, unsigned long address)
2025{
2026 struct address_space *mapping;
2027 pgoff_t idx;
2028 struct page *page;
2029
2030 mapping = vma->vm_file->f_mapping;
2031 idx = vma_hugecache_offset(h, vma, address);
2032
2033 page = find_get_page(mapping, idx);
2034 if (page)
2035 put_page(page);
2036 return page != NULL;
2037}
2038
1987static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 2039static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1988 unsigned long address, pte_t *ptep, unsigned int flags) 2040 unsigned long address, pte_t *ptep, unsigned int flags)
1989{ 2041{
@@ -2179,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
2179 return NULL; 2231 return NULL;
2180} 2232}
2181 2233
2182static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
2183{
2184 if (!ptep || write || shared)
2185 return 0;
2186 else
2187 return huge_pte_none(huge_ptep_get(ptep));
2188}
2189
2190int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2234int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2191 struct page **pages, struct vm_area_struct **vmas, 2235 struct page **pages, struct vm_area_struct **vmas,
2192 unsigned long *position, int *length, int i, 2236 unsigned long *position, int *length, int i,
2193 int write) 2237 unsigned int flags)
2194{ 2238{
2195 unsigned long pfn_offset; 2239 unsigned long pfn_offset;
2196 unsigned long vaddr = *position; 2240 unsigned long vaddr = *position;
2197 int remainder = *length; 2241 int remainder = *length;
2198 struct hstate *h = hstate_vma(vma); 2242 struct hstate *h = hstate_vma(vma);
2199 int zeropage_ok = 0;
2200 int shared = vma->vm_flags & VM_SHARED;
2201 2243
2202 spin_lock(&mm->page_table_lock); 2244 spin_lock(&mm->page_table_lock);
2203 while (vaddr < vma->vm_end && remainder) { 2245 while (vaddr < vma->vm_end && remainder) {
2204 pte_t *pte; 2246 pte_t *pte;
2247 int absent;
2205 struct page *page; 2248 struct page *page;
2206 2249
2207 /* 2250 /*
2208 * Some archs (sparc64, sh*) have multiple pte_ts to 2251 * Some archs (sparc64, sh*) have multiple pte_ts to
2209 * each hugepage. We have to make * sure we get the 2252 * each hugepage. We have to make sure we get the
2210 * first, for the page indexing below to work. 2253 * first, for the page indexing below to work.
2211 */ 2254 */
2212 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); 2255 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
2213 if (huge_zeropage_ok(pte, write, shared)) 2256 absent = !pte || huge_pte_none(huge_ptep_get(pte));
2214 zeropage_ok = 1; 2257
2258 /*
2259 * When coredumping, it suits get_dump_page if we just return
2260 * an error where there's an empty slot with no huge pagecache
2261 * to back it. This way, we avoid allocating a hugepage, and
2262 * the sparse dumpfile avoids allocating disk blocks, but its
2263 * huge holes still show up with zeroes where they need to be.
2264 */
2265 if (absent && (flags & FOLL_DUMP) &&
2266 !hugetlbfs_pagecache_present(h, vma, vaddr)) {
2267 remainder = 0;
2268 break;
2269 }
2215 2270
2216 if (!pte || 2271 if (absent ||
2217 (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || 2272 ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
2218 (write && !pte_write(huge_ptep_get(pte)))) {
2219 int ret; 2273 int ret;
2220 2274
2221 spin_unlock(&mm->page_table_lock); 2275 spin_unlock(&mm->page_table_lock);
2222 ret = hugetlb_fault(mm, vma, vaddr, write); 2276 ret = hugetlb_fault(mm, vma, vaddr,
2277 (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
2223 spin_lock(&mm->page_table_lock); 2278 spin_lock(&mm->page_table_lock);
2224 if (!(ret & VM_FAULT_ERROR)) 2279 if (!(ret & VM_FAULT_ERROR))
2225 continue; 2280 continue;
2226 2281
2227 remainder = 0; 2282 remainder = 0;
2228 if (!i)
2229 i = -EFAULT;
2230 break; 2283 break;
2231 } 2284 }
2232 2285
@@ -2234,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2234 page = pte_page(huge_ptep_get(pte)); 2287 page = pte_page(huge_ptep_get(pte));
2235same_page: 2288same_page:
2236 if (pages) { 2289 if (pages) {
2237 if (zeropage_ok) 2290 pages[i] = mem_map_offset(page, pfn_offset);
2238 pages[i] = ZERO_PAGE(0);
2239 else
2240 pages[i] = mem_map_offset(page, pfn_offset);
2241 get_page(pages[i]); 2291 get_page(pages[i]);
2242 } 2292 }
2243 2293
@@ -2261,7 +2311,7 @@ same_page:
2261 *length = remainder; 2311 *length = remainder;
2262 *position = vaddr; 2312 *position = vaddr;
2263 2313
2264 return i; 2314 return i ? i : -EFAULT;
2265} 2315}
2266 2316
2267void hugetlb_change_protection(struct vm_area_struct *vma, 2317void hugetlb_change_protection(struct vm_area_struct *vma,
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
new file mode 100644
index 000000000000..e1d85137f086
--- /dev/null
+++ b/mm/hwpoison-inject.c
@@ -0,0 +1,41 @@
 1/* Inject a hwpoison memory failure on an arbitrary pfn */
2#include <linux/module.h>
3#include <linux/debugfs.h>
4#include <linux/kernel.h>
5#include <linux/mm.h>
6
7static struct dentry *hwpoison_dir, *corrupt_pfn;
8
9static int hwpoison_inject(void *data, u64 val)
10{
11 if (!capable(CAP_SYS_ADMIN))
12 return -EPERM;
13 printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val);
14 return __memory_failure(val, 18, 0);
15}
16
17DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
18
19static void pfn_inject_exit(void)
20{
21 if (hwpoison_dir)
22 debugfs_remove_recursive(hwpoison_dir);
23}
24
25static int pfn_inject_init(void)
26{
27 hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
28 if (hwpoison_dir == NULL)
29 return -ENOMEM;
30 corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
31 NULL, &hwpoison_fops);
32 if (corrupt_pfn == NULL) {
33 pfn_inject_exit();
34 return -ENOMEM;
35 }
36 return 0;
37}
38
39module_init(pfn_inject_init);
40module_exit(pfn_inject_exit);
41MODULE_LICENSE("GPL");
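
The hwpoison-inject module above only creates a write-only debugfs attribute; an injection is triggered by writing a pfn to it. A minimal user-space sketch of such a write might look like the following (illustrative only, not part of the patch: it assumes debugfs is mounted at /sys/kernel/debug, that the module is built and loaded, and that the caller has CAP_SYS_ADMIN):

/* Illustrative sketch, not part of the patch: write a pfn to the
 * corrupt-pfn attribute created by hwpoison-inject.c. Assumes debugfs
 * is mounted at /sys/kernel/debug and the caller has CAP_SYS_ADMIN. */
#include <stdio.h>

int main(int argc, char **argv)
{
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
		return 1;
	}
	f = fopen("/sys/kernel/debug/hwpoison/corrupt-pfn", "w");
	if (!f) {
		perror("corrupt-pfn");
		return 1;
	}
	/* the DEFINE_SIMPLE_ATTRIBUTE write path should accept a plain
	 * decimal or 0x-prefixed value here */
	fprintf(f, "%s\n", argv[1]);
	if (fclose(f)) {
		perror("corrupt-pfn");
		return 1;
	}
	return 0;
}

On a successful write the attempted failure is reported by the printk in hwpoison_inject() and then handled by __memory_failure().
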
diff --git a/mm/internal.h b/mm/internal.h
index f290c4db528b..22ec8d2b0fb8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,8 @@ static inline void __put_page(struct page *page)
37 atomic_dec(&page->_count); 37 atomic_dec(&page->_count);
38} 38}
39 39
40extern unsigned long highest_memmap_pfn;
41
40/* 42/*
41 * in mm/vmscan.c: 43 * in mm/vmscan.c:
42 */ 44 */
@@ -46,7 +48,6 @@ extern void putback_lru_page(struct page *page);
46/* 48/*
47 * in mm/page_alloc.c 49 * in mm/page_alloc.c
48 */ 50 */
49extern unsigned long highest_memmap_pfn;
50extern void __free_pages_bootmem(struct page *page, unsigned int order); 51extern void __free_pages_bootmem(struct page *page, unsigned int order);
51extern void prep_compound_page(struct page *page, unsigned long order); 52extern void prep_compound_page(struct page *page, unsigned long order);
52 53
@@ -250,13 +251,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
250} 251}
251#endif /* CONFIG_SPARSEMEM */ 252#endif /* CONFIG_SPARSEMEM */
252 253
253#define GUP_FLAGS_WRITE 0x1
254#define GUP_FLAGS_FORCE 0x2
255#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
256#define GUP_FLAGS_IGNORE_SIGKILL 0x8
257
258int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 254int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
259 unsigned long start, int len, int flags, 255 unsigned long start, int len, unsigned int foll_flags,
260 struct page **pages, struct vm_area_struct **vmas); 256 struct page **pages, struct vm_area_struct **vmas);
261 257
262#define ZONE_RECLAIM_NOSCAN -2 258#define ZONE_RECLAIM_NOSCAN -2
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index d5292fc6f523..177a5169bbde 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -36,7 +36,7 @@ struct test_node {
36}; 36};
37 37
38static LIST_HEAD(test_list); 38static LIST_HEAD(test_list);
39static DEFINE_PER_CPU(void *, test_pointer); 39static DEFINE_PER_CPU(void *, kmemleak_test_pointer);
40 40
41/* 41/*
42 * Some very simple testing. This function needs to be extended for 42 * Some very simple testing. This function needs to be extended for
@@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void)
86 } 86 }
87 87
88 for_each_possible_cpu(i) { 88 for_each_possible_cpu(i) {
89 per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); 89 per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL);
90 pr_info("kmemleak: kmalloc(129) = %p\n", 90 pr_info("kmemleak: kmalloc(129) = %p\n",
91 per_cpu(test_pointer, i)); 91 per_cpu(kmemleak_test_pointer, i));
92 } 92 }
93 93
94 return 0; 94 return 0;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 487267310a84..4ea4510e2996 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -92,11 +92,13 @@
92#include <linux/string.h> 92#include <linux/string.h>
93#include <linux/nodemask.h> 93#include <linux/nodemask.h>
94#include <linux/mm.h> 94#include <linux/mm.h>
95#include <linux/workqueue.h>
95 96
96#include <asm/sections.h> 97#include <asm/sections.h>
97#include <asm/processor.h> 98#include <asm/processor.h>
98#include <asm/atomic.h> 99#include <asm/atomic.h>
99 100
101#include <linux/kmemcheck.h>
100#include <linux/kmemleak.h> 102#include <linux/kmemleak.h>
101 103
102/* 104/*
@@ -107,6 +109,7 @@
107#define SECS_FIRST_SCAN 60 /* delay before the first scan */ 109#define SECS_FIRST_SCAN 60 /* delay before the first scan */
108#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ 110#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
109#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ 111#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */
112#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
110 113
111#define BYTES_PER_POINTER sizeof(void *) 114#define BYTES_PER_POINTER sizeof(void *)
112 115
@@ -120,6 +123,9 @@ struct kmemleak_scan_area {
120 size_t length; 123 size_t length;
121}; 124};
122 125
126#define KMEMLEAK_GREY 0
127#define KMEMLEAK_BLACK -1
128
123/* 129/*
124 * Structure holding the metadata for each allocated memory block. 130 * Structure holding the metadata for each allocated memory block.
125 * Modifications to such objects should be made while holding the 131 * Modifications to such objects should be made while holding the
@@ -161,6 +167,15 @@ struct kmemleak_object {
161/* flag set on newly allocated objects */ 167/* flag set on newly allocated objects */
162#define OBJECT_NEW (1 << 3) 168#define OBJECT_NEW (1 << 3)
163 169
170/* number of bytes to print per line; must be 16 or 32 */
171#define HEX_ROW_SIZE 16
172/* number of bytes to print at a time (1, 2, 4, 8) */
173#define HEX_GROUP_SIZE 1
174/* include ASCII after the hex output */
175#define HEX_ASCII 1
176/* max number of lines to be printed */
177#define HEX_MAX_LINES 2
178
164/* the list of all allocated objects */ 179/* the list of all allocated objects */
165static LIST_HEAD(object_list); 180static LIST_HEAD(object_list);
166/* the list of gray-colored objects (see color_gray comment below) */ 181/* the list of gray-colored objects (see color_gray comment below) */
@@ -228,11 +243,14 @@ struct early_log {
228 int min_count; /* minimum reference count */ 243 int min_count; /* minimum reference count */
229 unsigned long offset; /* scan area offset */ 244 unsigned long offset; /* scan area offset */
230 size_t length; /* scan area length */ 245 size_t length; /* scan area length */
246 unsigned long trace[MAX_TRACE]; /* stack trace */
247 unsigned int trace_len; /* stack trace length */
231}; 248};
232 249
233/* early logging buffer and current position */ 250/* early logging buffer and current position */
234static struct early_log early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE]; 251static struct early_log
235static int crt_early_log; 252 early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
253static int crt_early_log __initdata;
236 254
237static void kmemleak_disable(void); 255static void kmemleak_disable(void);
238 256
@@ -255,6 +273,35 @@ static void kmemleak_disable(void);
255} while (0) 273} while (0)
256 274
257/* 275/*
276 * Printing of the objects hex dump to the seq file. The number of lines to be
277 * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
278 * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called
279 * with the object->lock held.
280 */
281static void hex_dump_object(struct seq_file *seq,
282 struct kmemleak_object *object)
283{
284 const u8 *ptr = (const u8 *)object->pointer;
285 int i, len, remaining;
286 unsigned char linebuf[HEX_ROW_SIZE * 5];
287
288 /* limit the number of lines to HEX_MAX_LINES */
289 remaining = len =
290 min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
291
292 seq_printf(seq, " hex dump (first %d bytes):\n", len);
293 for (i = 0; i < len; i += HEX_ROW_SIZE) {
294 int linelen = min(remaining, HEX_ROW_SIZE);
295
296 remaining -= HEX_ROW_SIZE;
297 hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
298 HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
299 HEX_ASCII);
300 seq_printf(seq, " %s\n", linebuf);
301 }
302}
303
304/*
258 * Object colors, encoded with count and min_count: 305 * Object colors, encoded with count and min_count:
259 * - white - orphan object, not enough references to it (count < min_count) 306 * - white - orphan object, not enough references to it (count < min_count)
260 * - gray - not orphan, not marked as false positive (min_count == 0) or 307 * - gray - not orphan, not marked as false positive (min_count == 0) or
@@ -264,19 +311,21 @@ static void kmemleak_disable(void);
264 * Newly created objects don't have any color assigned (object->count == -1) 311 * Newly created objects don't have any color assigned (object->count == -1)
265 * before the next memory scan when they become white. 312 * before the next memory scan when they become white.
266 */ 313 */
267static int color_white(const struct kmemleak_object *object) 314static bool color_white(const struct kmemleak_object *object)
268{ 315{
269 return object->count != -1 && object->count < object->min_count; 316 return object->count != KMEMLEAK_BLACK &&
317 object->count < object->min_count;
270} 318}
271 319
272static int color_gray(const struct kmemleak_object *object) 320static bool color_gray(const struct kmemleak_object *object)
273{ 321{
274 return object->min_count != -1 && object->count >= object->min_count; 322 return object->min_count != KMEMLEAK_BLACK &&
323 object->count >= object->min_count;
275} 324}
276 325
277static int color_black(const struct kmemleak_object *object) 326static bool color_black(const struct kmemleak_object *object)
278{ 327{
279 return object->min_count == -1; 328 return object->min_count == KMEMLEAK_BLACK;
280} 329}
281 330
282/* 331/*
@@ -284,7 +333,7 @@ static int color_black(const struct kmemleak_object *object)
284 * not be deleted and have a minimum age to avoid false positives caused by 333 * not be deleted and have a minimum age to avoid false positives caused by
285 * pointers temporarily stored in CPU registers. 334 * pointers temporarily stored in CPU registers.
286 */ 335 */
287static int unreferenced_object(struct kmemleak_object *object) 336static bool unreferenced_object(struct kmemleak_object *object)
288{ 337{
289 return (object->flags & OBJECT_ALLOCATED) && color_white(object) && 338 return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
290 time_before_eq(object->jiffies + jiffies_min_age, 339 time_before_eq(object->jiffies + jiffies_min_age,
@@ -304,6 +353,7 @@ static void print_unreferenced(struct seq_file *seq,
304 object->pointer, object->size); 353 object->pointer, object->size);
305 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", 354 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n",
306 object->comm, object->pid, object->jiffies); 355 object->comm, object->pid, object->jiffies);
356 hex_dump_object(seq, object);
307 seq_printf(seq, " backtrace:\n"); 357 seq_printf(seq, " backtrace:\n");
308 358
309 for (i = 0; i < object->trace_len; i++) { 359 for (i = 0; i < object->trace_len; i++) {
@@ -330,6 +380,7 @@ static void dump_object_info(struct kmemleak_object *object)
330 object->comm, object->pid, object->jiffies); 380 object->comm, object->pid, object->jiffies);
331 pr_notice(" min_count = %d\n", object->min_count); 381 pr_notice(" min_count = %d\n", object->min_count);
332 pr_notice(" count = %d\n", object->count); 382 pr_notice(" count = %d\n", object->count);
383 pr_notice(" flags = 0x%lx\n", object->flags);
333 pr_notice(" backtrace:\n"); 384 pr_notice(" backtrace:\n");
334 print_stack_trace(&trace, 4); 385 print_stack_trace(&trace, 4);
335} 386}
@@ -434,21 +485,36 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
434} 485}
435 486
436/* 487/*
488 * Save stack trace to the given array of MAX_TRACE size.
489 */
490static int __save_stack_trace(unsigned long *trace)
491{
492 struct stack_trace stack_trace;
493
494 stack_trace.max_entries = MAX_TRACE;
495 stack_trace.nr_entries = 0;
496 stack_trace.entries = trace;
497 stack_trace.skip = 2;
498 save_stack_trace(&stack_trace);
499
500 return stack_trace.nr_entries;
501}
502
503/*
437 * Create the metadata (struct kmemleak_object) corresponding to an allocated 504 * Create the metadata (struct kmemleak_object) corresponding to an allocated
438 * memory block and add it to the object_list and object_tree_root. 505 * memory block and add it to the object_list and object_tree_root.
439 */ 506 */
440static void create_object(unsigned long ptr, size_t size, int min_count, 507static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
441 gfp_t gfp) 508 int min_count, gfp_t gfp)
442{ 509{
443 unsigned long flags; 510 unsigned long flags;
444 struct kmemleak_object *object; 511 struct kmemleak_object *object;
445 struct prio_tree_node *node; 512 struct prio_tree_node *node;
446 struct stack_trace trace;
447 513
448 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); 514 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK);
449 if (!object) { 515 if (!object) {
450 kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); 516 kmemleak_stop("Cannot allocate a kmemleak_object structure\n");
451 return; 517 return NULL;
452 } 518 }
453 519
454 INIT_LIST_HEAD(&object->object_list); 520 INIT_LIST_HEAD(&object->object_list);
@@ -482,18 +548,14 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
482 } 548 }
483 549
484 /* kernel backtrace */ 550 /* kernel backtrace */
485 trace.max_entries = MAX_TRACE; 551 object->trace_len = __save_stack_trace(object->trace);
486 trace.nr_entries = 0;
487 trace.entries = object->trace;
488 trace.skip = 1;
489 save_stack_trace(&trace);
490 object->trace_len = trace.nr_entries;
491 552
492 INIT_PRIO_TREE_NODE(&object->tree_node); 553 INIT_PRIO_TREE_NODE(&object->tree_node);
493 object->tree_node.start = ptr; 554 object->tree_node.start = ptr;
494 object->tree_node.last = ptr + size - 1; 555 object->tree_node.last = ptr + size - 1;
495 556
496 write_lock_irqsave(&kmemleak_lock, flags); 557 write_lock_irqsave(&kmemleak_lock, flags);
558
497 min_addr = min(min_addr, ptr); 559 min_addr = min(min_addr, ptr);
498 max_addr = max(max_addr, ptr + size); 560 max_addr = max(max_addr, ptr + size);
499 node = prio_tree_insert(&object_tree_root, &object->tree_node); 561 node = prio_tree_insert(&object_tree_root, &object->tree_node);
@@ -504,20 +566,19 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
504 * random memory blocks. 566 * random memory blocks.
505 */ 567 */
506 if (node != &object->tree_node) { 568 if (node != &object->tree_node) {
507 unsigned long flags;
508
509 kmemleak_stop("Cannot insert 0x%lx into the object search tree " 569 kmemleak_stop("Cannot insert 0x%lx into the object search tree "
510 "(already existing)\n", ptr); 570 "(already existing)\n", ptr);
511 object = lookup_object(ptr, 1); 571 object = lookup_object(ptr, 1);
512 spin_lock_irqsave(&object->lock, flags); 572 spin_lock(&object->lock);
513 dump_object_info(object); 573 dump_object_info(object);
514 spin_unlock_irqrestore(&object->lock, flags); 574 spin_unlock(&object->lock);
515 575
516 goto out; 576 goto out;
517 } 577 }
518 list_add_tail_rcu(&object->object_list, &object_list); 578 list_add_tail_rcu(&object->object_list, &object_list);
519out: 579out:
520 write_unlock_irqrestore(&kmemleak_lock, flags); 580 write_unlock_irqrestore(&kmemleak_lock, flags);
581 return object;
521} 582}
522 583
523/* 584/*
@@ -604,46 +665,55 @@ static void delete_object_part(unsigned long ptr, size_t size)
604 665
605 put_object(object); 666 put_object(object);
606} 667}
607/* 668
608 * Make a object permanently as gray-colored so that it can no longer be 669static void __paint_it(struct kmemleak_object *object, int color)
609 * reported as a leak. This is used in general to mark a false positive. 670{
610 */ 671 object->min_count = color;
611static void make_gray_object(unsigned long ptr) 672 if (color == KMEMLEAK_BLACK)
673 object->flags |= OBJECT_NO_SCAN;
674}
675
676static void paint_it(struct kmemleak_object *object, int color)
612{ 677{
613 unsigned long flags; 678 unsigned long flags;
679
680 spin_lock_irqsave(&object->lock, flags);
681 __paint_it(object, color);
682 spin_unlock_irqrestore(&object->lock, flags);
683}
684
685static void paint_ptr(unsigned long ptr, int color)
686{
614 struct kmemleak_object *object; 687 struct kmemleak_object *object;
615 688
616 object = find_and_get_object(ptr, 0); 689 object = find_and_get_object(ptr, 0);
617 if (!object) { 690 if (!object) {
618 kmemleak_warn("Graying unknown object at 0x%08lx\n", ptr); 691 kmemleak_warn("Trying to color unknown object "
692 "at 0x%08lx as %s\n", ptr,
693 (color == KMEMLEAK_GREY) ? "Grey" :
694 (color == KMEMLEAK_BLACK) ? "Black" : "Unknown");
619 return; 695 return;
620 } 696 }
621 697 paint_it(object, color);
622 spin_lock_irqsave(&object->lock, flags);
623 object->min_count = 0;
624 spin_unlock_irqrestore(&object->lock, flags);
625 put_object(object); 698 put_object(object);
626} 699}
627 700
628/* 701/*
 702 * Mark an object permanently as gray-colored so that it can no longer be
703 * reported as a leak. This is used in general to mark a false positive.
704 */
705static void make_gray_object(unsigned long ptr)
706{
707 paint_ptr(ptr, KMEMLEAK_GREY);
708}
709
710/*
629 * Mark the object as black-colored so that it is ignored from scans and 711 * Mark the object as black-colored so that it is ignored from scans and
630 * reporting. 712 * reporting.
631 */ 713 */
632static void make_black_object(unsigned long ptr) 714static void make_black_object(unsigned long ptr)
633{ 715{
634 unsigned long flags; 716 paint_ptr(ptr, KMEMLEAK_BLACK);
635 struct kmemleak_object *object;
636
637 object = find_and_get_object(ptr, 0);
638 if (!object) {
639 kmemleak_warn("Blacking unknown object at 0x%08lx\n", ptr);
640 return;
641 }
642
643 spin_lock_irqsave(&object->lock, flags);
644 object->min_count = -1;
645 spin_unlock_irqrestore(&object->lock, flags);
646 put_object(object);
647} 717}
648 718
649/* 719/*
@@ -715,14 +785,15 @@ static void object_no_scan(unsigned long ptr)
715 * Log an early kmemleak_* call to the early_log buffer. These calls will be 785 * Log an early kmemleak_* call to the early_log buffer. These calls will be
716 * processed later once kmemleak is fully initialized. 786 * processed later once kmemleak is fully initialized.
717 */ 787 */
718static void log_early(int op_type, const void *ptr, size_t size, 788static void __init log_early(int op_type, const void *ptr, size_t size,
719 int min_count, unsigned long offset, size_t length) 789 int min_count, unsigned long offset, size_t length)
720{ 790{
721 unsigned long flags; 791 unsigned long flags;
722 struct early_log *log; 792 struct early_log *log;
723 793
724 if (crt_early_log >= ARRAY_SIZE(early_log)) { 794 if (crt_early_log >= ARRAY_SIZE(early_log)) {
725 pr_warning("Early log buffer exceeded\n"); 795 pr_warning("Early log buffer exceeded, "
796 "please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n");
726 kmemleak_disable(); 797 kmemleak_disable();
727 return; 798 return;
728 } 799 }
@@ -739,16 +810,45 @@ static void log_early(int op_type, const void *ptr, size_t size,
739 log->min_count = min_count; 810 log->min_count = min_count;
740 log->offset = offset; 811 log->offset = offset;
741 log->length = length; 812 log->length = length;
813 if (op_type == KMEMLEAK_ALLOC)
814 log->trace_len = __save_stack_trace(log->trace);
742 crt_early_log++; 815 crt_early_log++;
743 local_irq_restore(flags); 816 local_irq_restore(flags);
744} 817}
745 818
746/* 819/*
820 * Log an early allocated block and populate the stack trace.
821 */
822static void early_alloc(struct early_log *log)
823{
824 struct kmemleak_object *object;
825 unsigned long flags;
826 int i;
827
828 if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr))
829 return;
830
831 /*
832 * RCU locking needed to ensure object is not freed via put_object().
833 */
834 rcu_read_lock();
835 object = create_object((unsigned long)log->ptr, log->size,
836 log->min_count, GFP_KERNEL);
837 spin_lock_irqsave(&object->lock, flags);
838 for (i = 0; i < log->trace_len; i++)
839 object->trace[i] = log->trace[i];
840 object->trace_len = log->trace_len;
841 spin_unlock_irqrestore(&object->lock, flags);
842 rcu_read_unlock();
843}
844
845/*
747 * Memory allocation function callback. This function is called from the 846 * Memory allocation function callback. This function is called from the
748 * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, 847 * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
749 * vmalloc etc.). 848 * vmalloc etc.).
750 */ 849 */
751void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) 850void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
851 gfp_t gfp)
752{ 852{
753 pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); 853 pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
754 854
@@ -763,7 +863,7 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc);
763 * Memory freeing function callback. This function is called from the kernel 863 * Memory freeing function callback. This function is called from the kernel
764 * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). 864 * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
765 */ 865 */
766void kmemleak_free(const void *ptr) 866void __ref kmemleak_free(const void *ptr)
767{ 867{
768 pr_debug("%s(0x%p)\n", __func__, ptr); 868 pr_debug("%s(0x%p)\n", __func__, ptr);
769 869
@@ -778,7 +878,7 @@ EXPORT_SYMBOL_GPL(kmemleak_free);
778 * Partial memory freeing function callback. This function is usually called 878 * Partial memory freeing function callback. This function is usually called
779 * from bootmem allocator when (part of) a memory block is freed. 879 * from bootmem allocator when (part of) a memory block is freed.
780 */ 880 */
781void kmemleak_free_part(const void *ptr, size_t size) 881void __ref kmemleak_free_part(const void *ptr, size_t size)
782{ 882{
783 pr_debug("%s(0x%p)\n", __func__, ptr); 883 pr_debug("%s(0x%p)\n", __func__, ptr);
784 884
@@ -793,7 +893,7 @@ EXPORT_SYMBOL_GPL(kmemleak_free_part);
793 * Mark an already allocated memory block as a false positive. This will cause 893 * Mark an already allocated memory block as a false positive. This will cause
794 * the block to no longer be reported as leak and always be scanned. 894 * the block to no longer be reported as leak and always be scanned.
795 */ 895 */
796void kmemleak_not_leak(const void *ptr) 896void __ref kmemleak_not_leak(const void *ptr)
797{ 897{
798 pr_debug("%s(0x%p)\n", __func__, ptr); 898 pr_debug("%s(0x%p)\n", __func__, ptr);
799 899
@@ -809,7 +909,7 @@ EXPORT_SYMBOL(kmemleak_not_leak);
809 * corresponding block is not a leak and does not contain any references to 909 * corresponding block is not a leak and does not contain any references to
810 * other allocated memory blocks. 910 * other allocated memory blocks.
811 */ 911 */
812void kmemleak_ignore(const void *ptr) 912void __ref kmemleak_ignore(const void *ptr)
813{ 913{
814 pr_debug("%s(0x%p)\n", __func__, ptr); 914 pr_debug("%s(0x%p)\n", __func__, ptr);
815 915
@@ -823,8 +923,8 @@ EXPORT_SYMBOL(kmemleak_ignore);
823/* 923/*
824 * Limit the range to be scanned in an allocated memory block. 924 * Limit the range to be scanned in an allocated memory block.
825 */ 925 */
826void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length, 926void __ref kmemleak_scan_area(const void *ptr, unsigned long offset,
827 gfp_t gfp) 927 size_t length, gfp_t gfp)
828{ 928{
829 pr_debug("%s(0x%p)\n", __func__, ptr); 929 pr_debug("%s(0x%p)\n", __func__, ptr);
830 930
@@ -838,7 +938,7 @@ EXPORT_SYMBOL(kmemleak_scan_area);
838/* 938/*
839 * Inform kmemleak not to scan the given memory block. 939 * Inform kmemleak not to scan the given memory block.
840 */ 940 */
841void kmemleak_no_scan(const void *ptr) 941void __ref kmemleak_no_scan(const void *ptr)
842{ 942{
843 pr_debug("%s(0x%p)\n", __func__, ptr); 943 pr_debug("%s(0x%p)\n", __func__, ptr);
844 944
@@ -882,15 +982,22 @@ static void scan_block(void *_start, void *_end,
882 unsigned long *end = _end - (BYTES_PER_POINTER - 1); 982 unsigned long *end = _end - (BYTES_PER_POINTER - 1);
883 983
884 for (ptr = start; ptr < end; ptr++) { 984 for (ptr = start; ptr < end; ptr++) {
885 unsigned long flags;
886 unsigned long pointer = *ptr;
887 struct kmemleak_object *object; 985 struct kmemleak_object *object;
986 unsigned long flags;
987 unsigned long pointer;
888 988
889 if (allow_resched) 989 if (allow_resched)
890 cond_resched(); 990 cond_resched();
891 if (scan_should_stop()) 991 if (scan_should_stop())
892 break; 992 break;
893 993
994 /* don't scan uninitialized memory */
995 if (!kmemcheck_is_obj_initialized((unsigned long)ptr,
996 BYTES_PER_POINTER))
997 continue;
998
999 pointer = *ptr;
1000
894 object = find_and_get_object(pointer, 1); 1001 object = find_and_get_object(pointer, 1);
895 if (!object) 1002 if (!object)
896 continue; 1003 continue;
@@ -949,10 +1056,21 @@ static void scan_object(struct kmemleak_object *object)
949 if (!(object->flags & OBJECT_ALLOCATED)) 1056 if (!(object->flags & OBJECT_ALLOCATED))
950 /* already freed object */ 1057 /* already freed object */
951 goto out; 1058 goto out;
952 if (hlist_empty(&object->area_list)) 1059 if (hlist_empty(&object->area_list)) {
953 scan_block((void *)object->pointer, 1060 void *start = (void *)object->pointer;
954 (void *)(object->pointer + object->size), object, 0); 1061 void *end = (void *)(object->pointer + object->size);
955 else 1062
1063 while (start < end && (object->flags & OBJECT_ALLOCATED) &&
1064 !(object->flags & OBJECT_NO_SCAN)) {
1065 scan_block(start, min(start + MAX_SCAN_SIZE, end),
1066 object, 0);
1067 start += MAX_SCAN_SIZE;
1068
1069 spin_unlock_irqrestore(&object->lock, flags);
1070 cond_resched();
1071 spin_lock_irqsave(&object->lock, flags);
1072 }
1073 } else
956 hlist_for_each_entry(area, elem, &object->area_list, node) 1074 hlist_for_each_entry(area, elem, &object->area_list, node)
957 scan_block((void *)(object->pointer + area->offset), 1075 scan_block((void *)(object->pointer + area->offset),
958 (void *)(object->pointer + area->offset 1076 (void *)(object->pointer + area->offset
@@ -970,7 +1088,6 @@ static void kmemleak_scan(void)
970{ 1088{
971 unsigned long flags; 1089 unsigned long flags;
972 struct kmemleak_object *object, *tmp; 1090 struct kmemleak_object *object, *tmp;
973 struct task_struct *task;
974 int i; 1091 int i;
975 int new_leaks = 0; 1092 int new_leaks = 0;
976 int gray_list_pass = 0; 1093 int gray_list_pass = 0;
@@ -1037,15 +1154,16 @@ static void kmemleak_scan(void)
1037 } 1154 }
1038 1155
1039 /* 1156 /*
1040 * Scanning the task stacks may introduce false negatives and it is 1157 * Scanning the task stacks (may introduce false negatives).
1041 * not enabled by default.
1042 */ 1158 */
1043 if (kmemleak_stack_scan) { 1159 if (kmemleak_stack_scan) {
1160 struct task_struct *p, *g;
1161
1044 read_lock(&tasklist_lock); 1162 read_lock(&tasklist_lock);
1045 for_each_process(task) 1163 do_each_thread(g, p) {
1046 scan_block(task_stack_page(task), 1164 scan_block(task_stack_page(p), task_stack_page(p) +
1047 task_stack_page(task) + THREAD_SIZE, 1165 THREAD_SIZE, NULL, 0);
1048 NULL, 0); 1166 } while_each_thread(g, p);
1049 read_unlock(&tasklist_lock); 1167 read_unlock(&tasklist_lock);
1050 } 1168 }
1051 1169
@@ -1170,7 +1288,7 @@ static int kmemleak_scan_thread(void *arg)
1170 * Start the automatic memory scanning thread. This function must be called 1288 * Start the automatic memory scanning thread. This function must be called
1171 * with the scan_mutex held. 1289 * with the scan_mutex held.
1172 */ 1290 */
1173void start_scan_thread(void) 1291static void start_scan_thread(void)
1174{ 1292{
1175 if (scan_thread) 1293 if (scan_thread)
1176 return; 1294 return;
@@ -1185,7 +1303,7 @@ void start_scan_thread(void)
1185 * Stop the automatic memory scanning thread. This function must be called 1303 * Stop the automatic memory scanning thread. This function must be called
1186 * with the scan_mutex held. 1304 * with the scan_mutex held.
1187 */ 1305 */
1188void stop_scan_thread(void) 1306static void stop_scan_thread(void)
1189{ 1307{
1190 if (scan_thread) { 1308 if (scan_thread) {
1191 kthread_stop(scan_thread); 1309 kthread_stop(scan_thread);
@@ -1294,6 +1412,49 @@ static int kmemleak_release(struct inode *inode, struct file *file)
1294 return seq_release(inode, file); 1412 return seq_release(inode, file);
1295} 1413}
1296 1414
1415static int dump_str_object_info(const char *str)
1416{
1417 unsigned long flags;
1418 struct kmemleak_object *object;
1419 unsigned long addr;
1420
 1421 addr = simple_strtoul(str, NULL, 0);
1422 object = find_and_get_object(addr, 0);
1423 if (!object) {
1424 pr_info("Unknown object at 0x%08lx\n", addr);
1425 return -EINVAL;
1426 }
1427
1428 spin_lock_irqsave(&object->lock, flags);
1429 dump_object_info(object);
1430 spin_unlock_irqrestore(&object->lock, flags);
1431
1432 put_object(object);
1433 return 0;
1434}
1435
1436/*
1437 * We use grey instead of black to ensure we can do future scans on the same
1438 * objects. If we did not do future scans these black objects could
1439 * potentially contain references to newly allocated objects in the future and
1440 * we'd end up with false positives.
1441 */
1442static void kmemleak_clear(void)
1443{
1444 struct kmemleak_object *object;
1445 unsigned long flags;
1446
1447 rcu_read_lock();
1448 list_for_each_entry_rcu(object, &object_list, object_list) {
1449 spin_lock_irqsave(&object->lock, flags);
1450 if ((object->flags & OBJECT_REPORTED) &&
1451 unreferenced_object(object))
1452 __paint_it(object, KMEMLEAK_GREY);
1453 spin_unlock_irqrestore(&object->lock, flags);
1454 }
1455 rcu_read_unlock();
1456}
1457
1297/* 1458/*
1298 * File write operation to configure kmemleak at run-time. The following 1459 * File write operation to configure kmemleak at run-time. The following
1299 * commands can be written to the /sys/kernel/debug/kmemleak file: 1460 * commands can be written to the /sys/kernel/debug/kmemleak file:
@@ -1305,6 +1466,9 @@ static int kmemleak_release(struct inode *inode, struct file *file)
1305 * scan=... - set the automatic memory scanning period in seconds (0 to 1466 * scan=... - set the automatic memory scanning period in seconds (0 to
1306 * disable it) 1467 * disable it)
1307 * scan - trigger a memory scan 1468 * scan - trigger a memory scan
 1469 * clear - mark all currently reported unreferenced kmemleak objects
 1470 * as grey so that they are no longer printed
1471 * dump=... - dump information about the object found at the given address
1308 */ 1472 */
1309static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, 1473static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1310 size_t size, loff_t *ppos) 1474 size_t size, loff_t *ppos)
@@ -1345,6 +1509,10 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1345 } 1509 }
1346 } else if (strncmp(buf, "scan", 4) == 0) 1510 } else if (strncmp(buf, "scan", 4) == 0)
1347 kmemleak_scan(); 1511 kmemleak_scan();
1512 else if (strncmp(buf, "clear", 5) == 0)
1513 kmemleak_clear();
1514 else if (strncmp(buf, "dump=", 5) == 0)
1515 ret = dump_str_object_info(buf + 5);
1348 else 1516 else
1349 ret = -EINVAL; 1517 ret = -EINVAL;
1350 1518
@@ -1371,7 +1539,7 @@ static const struct file_operations kmemleak_fops = {
1371 * Perform the freeing of the kmemleak internal objects after waiting for any 1539 * Perform the freeing of the kmemleak internal objects after waiting for any
1372 * current memory scan to complete. 1540 * current memory scan to complete.
1373 */ 1541 */
1374static int kmemleak_cleanup_thread(void *arg) 1542static void kmemleak_do_cleanup(struct work_struct *work)
1375{ 1543{
1376 struct kmemleak_object *object; 1544 struct kmemleak_object *object;
1377 1545
@@ -1383,22 +1551,9 @@ static int kmemleak_cleanup_thread(void *arg)
1383 delete_object_full(object->pointer); 1551 delete_object_full(object->pointer);
1384 rcu_read_unlock(); 1552 rcu_read_unlock();
1385 mutex_unlock(&scan_mutex); 1553 mutex_unlock(&scan_mutex);
1386
1387 return 0;
1388} 1554}
1389 1555
1390/* 1556static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
1391 * Start the clean-up thread.
1392 */
1393static void kmemleak_cleanup(void)
1394{
1395 struct task_struct *cleanup_thread;
1396
1397 cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL,
1398 "kmemleak-clean");
1399 if (IS_ERR(cleanup_thread))
1400 pr_warning("Failed to create the clean-up thread\n");
1401}
1402 1557
1403/* 1558/*
1404 * Disable kmemleak. No memory allocation/freeing will be traced once this 1559 * Disable kmemleak. No memory allocation/freeing will be traced once this
@@ -1416,7 +1571,7 @@ static void kmemleak_disable(void)
1416 1571
1417 /* check whether it is too early for a kernel thread */ 1572 /* check whether it is too early for a kernel thread */
1418 if (atomic_read(&kmemleak_initialized)) 1573 if (atomic_read(&kmemleak_initialized))
1419 kmemleak_cleanup(); 1574 schedule_work(&cleanup_work);
1420 1575
1421 pr_info("Kernel memory leak detector disabled\n"); 1576 pr_info("Kernel memory leak detector disabled\n");
1422} 1577}
@@ -1469,8 +1624,7 @@ void __init kmemleak_init(void)
1469 1624
1470 switch (log->op_type) { 1625 switch (log->op_type) {
1471 case KMEMLEAK_ALLOC: 1626 case KMEMLEAK_ALLOC:
1472 kmemleak_alloc(log->ptr, log->size, log->min_count, 1627 early_alloc(log);
1473 GFP_KERNEL);
1474 break; 1628 break;
1475 case KMEMLEAK_FREE: 1629 case KMEMLEAK_FREE:
1476 kmemleak_free(log->ptr); 1630 kmemleak_free(log->ptr);
@@ -1513,7 +1667,7 @@ static int __init kmemleak_late_init(void)
1513 * after setting kmemleak_initialized and we may end up with 1667 * after setting kmemleak_initialized and we may end up with
1514 * two clean-up threads but serialized by scan_mutex. 1668 * two clean-up threads but serialized by scan_mutex.
1515 */ 1669 */
1516 kmemleak_cleanup(); 1670 schedule_work(&cleanup_work);
1517 return -ENOMEM; 1671 return -ENOMEM;
1518 } 1672 }
1519 1673
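
To show how the new kmemleak "clear" and "dump=" commands documented above are meant to be driven, here is a small user-space sketch (illustrative only, not part of the patch; it assumes debugfs is mounted at /sys/kernel/debug and that kmemleak is enabled; the address passed to dump= is a made-up example):

/* Illustrative sketch, not part of the patch: exercise the new "clear"
 * and "dump=<addr>" commands of /sys/kernel/debug/kmemleak. */
#include <stdio.h>

static int kmemleak_cmd(const char *cmd)
{
	FILE *f = fopen("/sys/kernel/debug/kmemleak", "w");

	if (!f) {
		perror("kmemleak");
		return -1;
	}
	fprintf(f, "%s\n", cmd);
	return fclose(f);
}

int main(void)
{
	/* grey out everything reported so far; only later leaks are shown */
	kmemleak_cmd("clear");

	/* dump metadata for one object; the address is a made-up example,
	 * substitute one taken from an earlier kmemleak report */
	kmemleak_cmd("dump=0xffff880012345678");
	return 0;
}

The information produced by dump= is printed to the kernel log by dump_object_info(), so it appears in dmesg rather than in the debugfs file itself.
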
diff --git a/mm/ksm.c b/mm/ksm.c
new file mode 100644
index 000000000000..f7edac356f46
--- /dev/null
+++ b/mm/ksm.c
@@ -0,0 +1,1711 @@
1/*
2 * Memory merging support.
3 *
4 * This code enables dynamic sharing of identical pages found in different
5 * memory areas, even if they are not shared by fork()
6 *
7 * Copyright (C) 2008-2009 Red Hat, Inc.
8 * Authors:
9 * Izik Eidus
10 * Andrea Arcangeli
11 * Chris Wright
12 * Hugh Dickins
13 *
14 * This work is licensed under the terms of the GNU GPL, version 2.
15 */
16
17#include <linux/errno.h>
18#include <linux/mm.h>
19#include <linux/fs.h>
20#include <linux/mman.h>
21#include <linux/sched.h>
22#include <linux/rwsem.h>
23#include <linux/pagemap.h>
24#include <linux/rmap.h>
25#include <linux/spinlock.h>
26#include <linux/jhash.h>
27#include <linux/delay.h>
28#include <linux/kthread.h>
29#include <linux/wait.h>
30#include <linux/slab.h>
31#include <linux/rbtree.h>
32#include <linux/mmu_notifier.h>
33#include <linux/swap.h>
34#include <linux/ksm.h>
35
36#include <asm/tlbflush.h>
37
38/*
39 * A few notes about the KSM scanning process,
40 * to make it easier to understand the data structures below:
41 *
42 * In order to reduce excessive scanning, KSM sorts the memory pages by their
43 * contents into a data structure that holds pointers to the pages' locations.
44 *
45 * Since the contents of the pages may change at any moment, KSM cannot just
46 * insert the pages into a normal sorted tree and expect it to find anything.
47 * Therefore KSM uses two data structures - the stable and the unstable tree.
48 *
49 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
50 * by their contents. Because each such page is write-protected, searching on
51 * this tree is fully assured to be working (except when pages are unmapped),
52 * and therefore this tree is called the stable tree.
53 *
54 * In addition to the stable tree, KSM uses a second data structure called the
55 * unstable tree: this tree holds pointers to pages which have been found to
56 * be "unchanged for a period of time". The unstable tree sorts these pages
57 * by their contents, but since they are not write-protected, KSM cannot rely
58 * upon the unstable tree to work correctly - the unstable tree is liable to
59 * be corrupted as its contents are modified, and so it is called unstable.
60 *
61 * KSM solves this problem by several techniques:
62 *
63 * 1) The unstable tree is flushed every time KSM completes scanning all
64 * memory areas, and then the tree is rebuilt again from the beginning.
65 * 2) KSM will only insert into the unstable tree, pages whose hash value
66 * has not changed since the previous scan of all memory areas.
67 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
68 * colors of the nodes and not on their contents, assuring that even when
69 * the tree gets "corrupted" it won't get out of balance, so scanning time
70 * remains the same (also, searching and inserting nodes in an rbtree uses
71 * the same algorithm, so we have no overhead when we flush and rebuild).
72 * 4) KSM never flushes the stable tree, which means that even if it were to
73 * take 10 attempts to find a page in the unstable tree, once it is found,
74 * it is secured in the stable tree. (When we scan a new page, we first
75 * compare it against the stable tree, and then against the unstable tree.)
76 */
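
To make point 3 above concrete: because both trees are ordinary rbtrees ordered by a memcmp() of the pages' contents, a single walk can serve as both lookup and insertion, so rebuilding the unstable tree on every pass adds no extra comparisons. The following is only an illustrative sketch of that generic rbtree idiom, with made-up names (tree_item, ITEM_KEY_LEN); it is not the ksm.c code that follows.

#include <linux/rbtree.h>
#include <linux/string.h>

#define ITEM_KEY_LEN 64

/* hypothetical node type: an rb_node keyed by raw bytes, much as KSM keys
 * its trees by page contents */
struct tree_item {
	struct rb_node node;
	unsigned char key[ITEM_KEY_LEN];
};

/*
 * Walk the tree comparing contents; if an identical item exists return it,
 * otherwise link the new item at the point where the search fell off.
 */
static struct tree_item *tree_search_insert(struct rb_root *root,
					    struct tree_item *new_item)
{
	struct rb_node **link = &root->rb_node;
	struct rb_node *parent = NULL;

	while (*link) {
		struct tree_item *item = rb_entry(*link, struct tree_item, node);
		int ret = memcmp(new_item->key, item->key, ITEM_KEY_LEN);

		parent = *link;
		if (ret < 0)
			link = &(*link)->rb_left;
		else if (ret > 0)
			link = &(*link)->rb_right;
		else
			return item;	/* identical content already present */
	}

	rb_link_node(&new_item->node, parent, link);
	rb_insert_color(&new_item->node, root);
	return NULL;
}
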
77
78/**
79 * struct mm_slot - ksm information per mm that is being scanned
80 * @link: link to the mm_slots hash list
81 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
82 * @rmap_list: head for this mm_slot's list of rmap_items
83 * @mm: the mm that this information is valid for
84 */
85struct mm_slot {
86 struct hlist_node link;
87 struct list_head mm_list;
88 struct list_head rmap_list;
89 struct mm_struct *mm;
90};
91
92/**
93 * struct ksm_scan - cursor for scanning
94 * @mm_slot: the current mm_slot we are scanning
95 * @address: the next address inside that to be scanned
96 * @rmap_item: the current rmap that we are scanning inside the rmap_list
97 * @seqnr: count of completed full scans (needed when removing unstable node)
98 *
99 * There is only the one ksm_scan instance of this cursor structure.
100 */
101struct ksm_scan {
102 struct mm_slot *mm_slot;
103 unsigned long address;
104 struct rmap_item *rmap_item;
105 unsigned long seqnr;
106};
107
108/**
109 * struct rmap_item - reverse mapping item for virtual addresses
110 * @link: link into mm_slot's rmap_list (rmap_list is per mm)
111 * @mm: the memory structure this rmap_item is pointing into
112 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
113 * @oldchecksum: previous checksum of the page at that virtual address
114 * @node: rb_node of this rmap_item in either unstable or stable tree
115 * @next: next rmap_item hanging off the same node of the stable tree
116 * @prev: previous rmap_item hanging off the same node of the stable tree
117 */
118struct rmap_item {
119 struct list_head link;
120 struct mm_struct *mm;
121 unsigned long address; /* + low bits used for flags below */
122 union {
123 unsigned int oldchecksum; /* when unstable */
124 struct rmap_item *next; /* when stable */
125 };
126 union {
127 struct rb_node node; /* when tree node */
128 struct rmap_item *prev; /* in stable list */
129 };
130};
131
132#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
133#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */
134#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */
135
136/* The stable and unstable tree heads */
137static struct rb_root root_stable_tree = RB_ROOT;
138static struct rb_root root_unstable_tree = RB_ROOT;
139
140#define MM_SLOTS_HASH_HEADS 1024
141static struct hlist_head *mm_slots_hash;
142
143static struct mm_slot ksm_mm_head = {
144 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
145};
146static struct ksm_scan ksm_scan = {
147 .mm_slot = &ksm_mm_head,
148};
149
150static struct kmem_cache *rmap_item_cache;
151static struct kmem_cache *mm_slot_cache;
152
153/* The number of nodes in the stable tree */
154static unsigned long ksm_pages_shared;
155
156/* The number of page slots additionally sharing those nodes */
157static unsigned long ksm_pages_sharing;
158
159/* The number of nodes in the unstable tree */
160static unsigned long ksm_pages_unshared;
161
162/* The number of rmap_items in use: to calculate pages_volatile */
163static unsigned long ksm_rmap_items;
164
165/* Limit on the number of unswappable pages used */
166static unsigned long ksm_max_kernel_pages;
167
168/* Number of pages ksmd should scan in one batch */
169static unsigned int ksm_thread_pages_to_scan = 100;
170
171/* Milliseconds ksmd should sleep between batches */
172static unsigned int ksm_thread_sleep_millisecs = 20;
173
174#define KSM_RUN_STOP 0
175#define KSM_RUN_MERGE 1
176#define KSM_RUN_UNMERGE 2
177static unsigned int ksm_run = KSM_RUN_STOP;
178
179static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
180static DEFINE_MUTEX(ksm_thread_mutex);
181static DEFINE_SPINLOCK(ksm_mmlist_lock);
182
183#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
184 sizeof(struct __struct), __alignof__(struct __struct),\
185 (__flags), NULL)
186
187static void __init ksm_init_max_kernel_pages(void)
188{
189 ksm_max_kernel_pages = nr_free_buffer_pages() / 4;
190}
191
192static int __init ksm_slab_init(void)
193{
194 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
195 if (!rmap_item_cache)
196 goto out;
197
198 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
199 if (!mm_slot_cache)
200 goto out_free;
201
202 return 0;
203
204out_free:
205 kmem_cache_destroy(rmap_item_cache);
206out:
207 return -ENOMEM;
208}
209
210static void __init ksm_slab_free(void)
211{
212 kmem_cache_destroy(mm_slot_cache);
213 kmem_cache_destroy(rmap_item_cache);
214 mm_slot_cache = NULL;
215}
216
217static inline struct rmap_item *alloc_rmap_item(void)
218{
219 struct rmap_item *rmap_item;
220
221 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
222 if (rmap_item)
223 ksm_rmap_items++;
224 return rmap_item;
225}
226
227static inline void free_rmap_item(struct rmap_item *rmap_item)
228{
229 ksm_rmap_items--;
230 rmap_item->mm = NULL; /* debug safety */
231 kmem_cache_free(rmap_item_cache, rmap_item);
232}
233
234static inline struct mm_slot *alloc_mm_slot(void)
235{
236 if (!mm_slot_cache) /* initialization failed */
237 return NULL;
238 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
239}
240
241static inline void free_mm_slot(struct mm_slot *mm_slot)
242{
243 kmem_cache_free(mm_slot_cache, mm_slot);
244}
245
246static int __init mm_slots_hash_init(void)
247{
248 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
249 GFP_KERNEL);
250 if (!mm_slots_hash)
251 return -ENOMEM;
252 return 0;
253}
254
255static void __init mm_slots_hash_free(void)
256{
257 kfree(mm_slots_hash);
258}
259
260static struct mm_slot *get_mm_slot(struct mm_struct *mm)
261{
262 struct mm_slot *mm_slot;
263 struct hlist_head *bucket;
264 struct hlist_node *node;
265
266 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
267 % MM_SLOTS_HASH_HEADS];
268 hlist_for_each_entry(mm_slot, node, bucket, link) {
269 if (mm == mm_slot->mm)
270 return mm_slot;
271 }
272 return NULL;
273}
274
275static void insert_to_mm_slots_hash(struct mm_struct *mm,
276 struct mm_slot *mm_slot)
277{
278 struct hlist_head *bucket;
279
280 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
281 % MM_SLOTS_HASH_HEADS];
282 mm_slot->mm = mm;
283 INIT_LIST_HEAD(&mm_slot->rmap_list);
284 hlist_add_head(&mm_slot->link, bucket);
285}
286
287static inline int in_stable_tree(struct rmap_item *rmap_item)
288{
289 return rmap_item->address & STABLE_FLAG;
290}
291
292/*
293 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
294 * page tables after it has passed through ksm_exit() - which, if necessary,
295 * takes mmap_sem briefly to serialize against them. ksm_exit() does not set
296 * a special flag: they can just back out as soon as mm_users goes to zero.
297 * ksm_test_exit() is used throughout to make this test for exit: in some
298 * places for correctness, in some places just to avoid unnecessary work.
299 */
300static inline bool ksm_test_exit(struct mm_struct *mm)
301{
302 return atomic_read(&mm->mm_users) == 0;
303}
304
305/*
306 * We use break_ksm to break COW on a ksm page: it's a stripped down
307 *
308 * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
309 * put_page(page);
310 *
311 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
312 * in case the application has unmapped and remapped mm,addr meanwhile.
313 * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
314 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
315 */
316static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
317{
318 struct page *page;
319 int ret = 0;
320
321 do {
322 cond_resched();
323 page = follow_page(vma, addr, FOLL_GET);
324 if (!page)
325 break;
326 if (PageKsm(page))
327 ret = handle_mm_fault(vma->vm_mm, vma, addr,
328 FAULT_FLAG_WRITE);
329 else
330 ret = VM_FAULT_WRITE;
331 put_page(page);
332 } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
333 /*
334 * We must loop because handle_mm_fault() may back out if there's
335 * any difficulty e.g. if pte accessed bit gets updated concurrently.
336 *
337 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
338 * COW has been broken, even if the vma does not permit VM_WRITE;
339 * but note that a concurrent fault might break PageKsm for us.
340 *
341 * VM_FAULT_SIGBUS could occur if we race with truncation of the
342 * backing file, which also invalidates anonymous pages: that's
343 * okay, that truncation will have unmapped the PageKsm for us.
344 *
345 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
346 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
347 * current task has TIF_MEMDIE set, and will be OOM killed on return
348 * to user; and ksmd, having no mm, would never be chosen for that.
349 *
350 * But if the mm is in a limited mem_cgroup, then the fault may fail
351 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
352 * even ksmd can fail in this way - though it's usually breaking ksm
353 * just to undo a merge it made a moment before, so unlikely to oom.
354 *
355 * That's a pity: we might therefore have more kernel pages allocated
356 * than we're counting as nodes in the stable tree; but ksm_do_scan
357 * will retry to break_cow on each pass, so should recover the page
358 * in due course. The important thing is to not let VM_MERGEABLE
359 * be cleared while any such pages might remain in the area.
360 */
361 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
362}
363
364static void break_cow(struct mm_struct *mm, unsigned long addr)
365{
366 struct vm_area_struct *vma;
367
368 down_read(&mm->mmap_sem);
369 if (ksm_test_exit(mm))
370 goto out;
371 vma = find_vma(mm, addr);
372 if (!vma || vma->vm_start > addr)
373 goto out;
374 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
375 goto out;
376 break_ksm(vma, addr);
377out:
378 up_read(&mm->mmap_sem);
379}
380
381static struct page *get_mergeable_page(struct rmap_item *rmap_item)
382{
383 struct mm_struct *mm = rmap_item->mm;
384 unsigned long addr = rmap_item->address;
385 struct vm_area_struct *vma;
386 struct page *page;
387
388 down_read(&mm->mmap_sem);
389 if (ksm_test_exit(mm))
390 goto out;
391 vma = find_vma(mm, addr);
392 if (!vma || vma->vm_start > addr)
393 goto out;
394 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
395 goto out;
396
397 page = follow_page(vma, addr, FOLL_GET);
398 if (!page)
399 goto out;
400 if (PageAnon(page)) {
401 flush_anon_page(vma, page, addr);
402 flush_dcache_page(page);
403 } else {
404 put_page(page);
405out: page = NULL;
406 }
407 up_read(&mm->mmap_sem);
408 return page;
409}
410
411/*
412 * get_ksm_page: checks if the page at the virtual address in rmap_item
413 * is still PageKsm, in which case we can trust the content of the page,
414 * and it returns the gotten page; but NULL if the page has been zapped.
415 */
416static struct page *get_ksm_page(struct rmap_item *rmap_item)
417{
418 struct page *page;
419
420 page = get_mergeable_page(rmap_item);
421 if (page && !PageKsm(page)) {
422 put_page(page);
423 page = NULL;
424 }
425 return page;
426}
427
428/*
429 * Removing rmap_item from stable or unstable tree.
430 * This function will clean the information from the stable/unstable tree.
431 */
432static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
433{
434 if (in_stable_tree(rmap_item)) {
435 struct rmap_item *next_item = rmap_item->next;
436
437 if (rmap_item->address & NODE_FLAG) {
438 if (next_item) {
439 rb_replace_node(&rmap_item->node,
440 &next_item->node,
441 &root_stable_tree);
442 next_item->address |= NODE_FLAG;
443 ksm_pages_sharing--;
444 } else {
445 rb_erase(&rmap_item->node, &root_stable_tree);
446 ksm_pages_shared--;
447 }
448 } else {
449 struct rmap_item *prev_item = rmap_item->prev;
450
451 BUG_ON(prev_item->next != rmap_item);
452 prev_item->next = next_item;
453 if (next_item) {
454 BUG_ON(next_item->prev != rmap_item);
455 next_item->prev = rmap_item->prev;
456 }
457 ksm_pages_sharing--;
458 }
459
460 rmap_item->next = NULL;
461
462 } else if (rmap_item->address & NODE_FLAG) {
463 unsigned char age;
464 /*
465 * Usually ksmd can and must skip the rb_erase, because
466 * root_unstable_tree was already reset to RB_ROOT.
467 * But be careful when an mm is exiting: do the rb_erase
468 * if this rmap_item was inserted by this scan, rather
469 * than left over from before.
470 */
471 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
472 BUG_ON(age > 1);
473 if (!age)
474 rb_erase(&rmap_item->node, &root_unstable_tree);
475 ksm_pages_unshared--;
476 }
477
478 rmap_item->address &= PAGE_MASK;
479
480 cond_resched(); /* we're called from many long loops */
481}
482
483static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
484 struct list_head *cur)
485{
486 struct rmap_item *rmap_item;
487
488 while (cur != &mm_slot->rmap_list) {
489 rmap_item = list_entry(cur, struct rmap_item, link);
490 cur = cur->next;
491 remove_rmap_item_from_tree(rmap_item);
492 list_del(&rmap_item->link);
493 free_rmap_item(rmap_item);
494 }
495}
496
497/*
498 * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
499 * than check every pte of a given vma, the locking doesn't quite work for
500 * that - an rmap_item is assigned to the stable tree after inserting ksm
501 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
502 * rmap_items from parent to child at fork time (so as not to waste time
503 * if exit comes before the next scan reaches it).
504 *
505 * Similarly, although we'd like to remove rmap_items (so updating counts
506 * and freeing memory) when unmerging an area, it's easier to leave that
507 * to the next pass of ksmd - consider, for example, how ksmd might be
508 * in cmp_and_merge_page on one of the rmap_items we would be removing.
509 */
510static int unmerge_ksm_pages(struct vm_area_struct *vma,
511 unsigned long start, unsigned long end)
512{
513 unsigned long addr;
514 int err = 0;
515
516 for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
517 if (ksm_test_exit(vma->vm_mm))
518 break;
519 if (signal_pending(current))
520 err = -ERESTARTSYS;
521 else
522 err = break_ksm(vma, addr);
523 }
524 return err;
525}
526
527#ifdef CONFIG_SYSFS
528/*
529 * Only called through the sysfs control interface:
530 */
531static int unmerge_and_remove_all_rmap_items(void)
532{
533 struct mm_slot *mm_slot;
534 struct mm_struct *mm;
535 struct vm_area_struct *vma;
536 int err = 0;
537
538 spin_lock(&ksm_mmlist_lock);
539 ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
540 struct mm_slot, mm_list);
541 spin_unlock(&ksm_mmlist_lock);
542
543 for (mm_slot = ksm_scan.mm_slot;
544 mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
545 mm = mm_slot->mm;
546 down_read(&mm->mmap_sem);
547 for (vma = mm->mmap; vma; vma = vma->vm_next) {
548 if (ksm_test_exit(mm))
549 break;
550 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
551 continue;
552 err = unmerge_ksm_pages(vma,
553 vma->vm_start, vma->vm_end);
554 if (err)
555 goto error;
556 }
557
558 remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
559
560 spin_lock(&ksm_mmlist_lock);
561 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
562 struct mm_slot, mm_list);
563 if (ksm_test_exit(mm)) {
564 hlist_del(&mm_slot->link);
565 list_del(&mm_slot->mm_list);
566 spin_unlock(&ksm_mmlist_lock);
567
568 free_mm_slot(mm_slot);
569 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
570 up_read(&mm->mmap_sem);
571 mmdrop(mm);
572 } else {
573 spin_unlock(&ksm_mmlist_lock);
574 up_read(&mm->mmap_sem);
575 }
576 }
577
578 ksm_scan.seqnr = 0;
579 return 0;
580
581error:
582 up_read(&mm->mmap_sem);
583 spin_lock(&ksm_mmlist_lock);
584 ksm_scan.mm_slot = &ksm_mm_head;
585 spin_unlock(&ksm_mmlist_lock);
586 return err;
587}
588#endif /* CONFIG_SYSFS */
589
590static u32 calc_checksum(struct page *page)
591{
592 u32 checksum;
593 void *addr = kmap_atomic(page, KM_USER0);
594 checksum = jhash2(addr, PAGE_SIZE / 4, 17);
595 kunmap_atomic(addr, KM_USER0);
596 return checksum;
597}
598
599static int memcmp_pages(struct page *page1, struct page *page2)
600{
601 char *addr1, *addr2;
602 int ret;
603
604 addr1 = kmap_atomic(page1, KM_USER0);
605 addr2 = kmap_atomic(page2, KM_USER1);
606 ret = memcmp(addr1, addr2, PAGE_SIZE);
607 kunmap_atomic(addr2, KM_USER1);
608 kunmap_atomic(addr1, KM_USER0);
609 return ret;
610}
611
612static inline int pages_identical(struct page *page1, struct page *page2)
613{
614 return !memcmp_pages(page1, page2);
615}
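memcmp_pages() returns the raw signed memcmp() result rather than a boolean, so one comparison can both test identity (pages_identical) and order pages within the red-black trees used below. A minimal userspace sketch of that dual use, assuming a PAGE_SZ stand-in for PAGE_SIZE and heap buffers in place of kmapped struct pages:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SZ 4096	/* stand-in for the kernel's PAGE_SIZE */

/* Same contract as memcmp_pages(): <0, 0 or >0, usable as a tree key. */
static int cmp_pages(const void *p1, const void *p2)
{
	return memcmp(p1, p2, PAGE_SZ);
}

static int identical(const void *p1, const void *p2)
{
	return !cmp_pages(p1, p2);	/* mirrors pages_identical() */
}

int main(void)
{
	unsigned char *a = calloc(1, PAGE_SZ);
	unsigned char *b = calloc(1, PAGE_SZ);

	printf("identical: %d\n", identical(a, b));	/* 1: both zero-filled */
	b[100] = 0xff;
	/* A nonzero result also tells a tree walk which way to descend. */
	printf("cmp: %d -> descend %s\n", cmp_pages(a, b),
	       cmp_pages(a, b) < 0 ? "left" : "right");
	free(a);
	free(b);
	return 0;
}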
616
617static int write_protect_page(struct vm_area_struct *vma, struct page *page,
618 pte_t *orig_pte)
619{
620 struct mm_struct *mm = vma->vm_mm;
621 unsigned long addr;
622 pte_t *ptep;
623 spinlock_t *ptl;
624 int swapped;
625 int err = -EFAULT;
626
627 addr = page_address_in_vma(page, vma);
628 if (addr == -EFAULT)
629 goto out;
630
631 ptep = page_check_address(page, mm, addr, &ptl, 0);
632 if (!ptep)
633 goto out;
634
635 if (pte_write(*ptep)) {
636 pte_t entry;
637
638 swapped = PageSwapCache(page);
639 flush_cache_page(vma, addr, page_to_pfn(page));
640 /*
641 * Ok, this is tricky: when get_user_pages_fast() runs it doesn't
642 * take any lock, so the check we are about to make of the page
643 * count against the map count is racy, and an O_DIRECT transfer can
644 * start right after the check.
645 * So we clear the pte and flush the tlb before the check;
646 * this assures us that no O_DIRECT can start after the check
647 * or in the middle of the check.
648 */
649 entry = ptep_clear_flush(vma, addr, ptep);
650 /*
651 * Check that no O_DIRECT or similar I/O is in progress on the
652 * page
653 */
654 if ((page_mapcount(page) + 2 + swapped) != page_count(page)) {
655 set_pte_at_notify(mm, addr, ptep, entry);
656 goto out_unlock;
657 }
658 entry = pte_wrprotect(entry);
659 set_pte_at_notify(mm, addr, ptep, entry);
660 }
661 *orig_pte = *ptep;
662 err = 0;
663
664out_unlock:
665 pte_unmap_unlock(ptep, ptl);
666out:
667 return err;
668}
669
670/**
671 * replace_page - replace page in vma by new ksm page
672 * @vma: vma that holds the pte pointing to oldpage
673 * @oldpage: the page we are replacing by newpage
674 * @newpage: the ksm page we replace oldpage by
675 * @orig_pte: the original value of the pte
676 *
677 * Returns 0 on success, -EFAULT on failure.
678 */
679static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
680 struct page *newpage, pte_t orig_pte)
681{
682 struct mm_struct *mm = vma->vm_mm;
683 pgd_t *pgd;
684 pud_t *pud;
685 pmd_t *pmd;
686 pte_t *ptep;
687 spinlock_t *ptl;
688 unsigned long addr;
689 pgprot_t prot;
690 int err = -EFAULT;
691
692 prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);
693
694 addr = page_address_in_vma(oldpage, vma);
695 if (addr == -EFAULT)
696 goto out;
697
698 pgd = pgd_offset(mm, addr);
699 if (!pgd_present(*pgd))
700 goto out;
701
702 pud = pud_offset(pgd, addr);
703 if (!pud_present(*pud))
704 goto out;
705
706 pmd = pmd_offset(pud, addr);
707 if (!pmd_present(*pmd))
708 goto out;
709
710 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
711 if (!pte_same(*ptep, orig_pte)) {
712 pte_unmap_unlock(ptep, ptl);
713 goto out;
714 }
715
716 get_page(newpage);
717 page_add_ksm_rmap(newpage);
718
719 flush_cache_page(vma, addr, pte_pfn(*ptep));
720 ptep_clear_flush(vma, addr, ptep);
721 set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));
722
723 page_remove_rmap(oldpage);
724 put_page(oldpage);
725
726 pte_unmap_unlock(ptep, ptl);
727 err = 0;
728out:
729 return err;
730}
731
732/*
733 * try_to_merge_one_page - take two pages and merge them into one
734 * @vma: the vma that holds the pte pointing into oldpage
735 * @oldpage: the page that we want to replace with newpage
736 * @newpage: the page that we want to map instead of oldpage
737 *
738 * Note:
739 * oldpage should be a PageAnon page, while newpage should be a PageKsm page,
740 * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
741 *
742 * This function returns 0 if the pages were merged, -EFAULT otherwise.
743 */
744static int try_to_merge_one_page(struct vm_area_struct *vma,
745 struct page *oldpage,
746 struct page *newpage)
747{
748 pte_t orig_pte = __pte(0);
749 int err = -EFAULT;
750
751 if (!(vma->vm_flags & VM_MERGEABLE))
752 goto out;
753
754 if (!PageAnon(oldpage))
755 goto out;
756
757 get_page(newpage);
758 get_page(oldpage);
759
760 /*
761 * We need the page lock to read a stable PageSwapCache in
762 * write_protect_page(). We use trylock_page() instead of
763 * lock_page() because we don't want to wait here - we
764 * prefer to continue scanning and merging different pages,
765 * then come back to this page when it is unlocked.
766 */
767 if (!trylock_page(oldpage))
768 goto out_putpage;
769 /*
770 * If this anonymous page is mapped only here, its pte may need
771 * to be write-protected. If it's mapped elsewhere, all of its
772 * ptes are necessarily already write-protected. But in either
773 * case, we need to lock and check page_count is not raised.
774 */
775 if (write_protect_page(vma, oldpage, &orig_pte)) {
776 unlock_page(oldpage);
777 goto out_putpage;
778 }
779 unlock_page(oldpage);
780
781 if (pages_identical(oldpage, newpage))
782 err = replace_page(vma, oldpage, newpage, orig_pte);
783
784out_putpage:
785 put_page(oldpage);
786 put_page(newpage);
787out:
788 return err;
789}
790
791/*
792 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
793 * but no new kernel page is allocated: kpage must already be a ksm page.
794 */
795static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
796 unsigned long addr1,
797 struct page *page1,
798 struct page *kpage)
799{
800 struct vm_area_struct *vma;
801 int err = -EFAULT;
802
803 down_read(&mm1->mmap_sem);
804 if (ksm_test_exit(mm1))
805 goto out;
806
807 vma = find_vma(mm1, addr1);
808 if (!vma || vma->vm_start > addr1)
809 goto out;
810
811 err = try_to_merge_one_page(vma, page1, kpage);
812out:
813 up_read(&mm1->mmap_sem);
814 return err;
815}
816
817/*
818 * try_to_merge_two_pages - take two identical pages and prepare them
819 * to be merged into one page.
820 *
821 * This function returns 0 if we successfully mapped two identical pages
822 * into one page, -EFAULT otherwise.
823 *
824 * Note that this function allocates a new kernel page: if one of the pages
825 * is already a ksm page, try_to_merge_with_ksm_page should be used.
826 */
827static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
828 struct page *page1, struct mm_struct *mm2,
829 unsigned long addr2, struct page *page2)
830{
831 struct vm_area_struct *vma;
832 struct page *kpage;
833 int err = -EFAULT;
834
835 /*
836 * The number of nodes in the stable tree
837 * is the number of kernel pages that we hold.
838 */
839 if (ksm_max_kernel_pages &&
840 ksm_max_kernel_pages <= ksm_pages_shared)
841 return err;
842
843 kpage = alloc_page(GFP_HIGHUSER);
844 if (!kpage)
845 return err;
846
847 down_read(&mm1->mmap_sem);
848 if (ksm_test_exit(mm1)) {
849 up_read(&mm1->mmap_sem);
850 goto out;
851 }
852 vma = find_vma(mm1, addr1);
853 if (!vma || vma->vm_start > addr1) {
854 up_read(&mm1->mmap_sem);
855 goto out;
856 }
857
858 copy_user_highpage(kpage, page1, addr1, vma);
859 err = try_to_merge_one_page(vma, page1, kpage);
860 up_read(&mm1->mmap_sem);
861
862 if (!err) {
863 err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage);
864 /*
865 * If that fails, we have a ksm page with only one pte
866 * pointing to it: so break it.
867 */
868 if (err)
869 break_cow(mm1, addr1);
870 }
871out:
872 put_page(kpage);
873 return err;
874}
875
876/*
877 * stable_tree_search - search page inside the stable tree
878 * @page: the page for which we are searching identical pages.
879 * @page2: pointer filled with the identical page that we have found and
880 * are holding inside the stable tree.
881 * @rmap_item: the reverse mapping item
882 *
883 * This function checks if there is a page inside the stable tree
884 * with identical content to the page that we are scanning right now.
885 *
886 * This function returns the rmap_item pointer of the identical item if
887 * found, NULL otherwise.
888 */
889static struct rmap_item *stable_tree_search(struct page *page,
890 struct page **page2,
891 struct rmap_item *rmap_item)
892{
893 struct rb_node *node = root_stable_tree.rb_node;
894
895 while (node) {
896 struct rmap_item *tree_rmap_item, *next_rmap_item;
897 int ret;
898
899 tree_rmap_item = rb_entry(node, struct rmap_item, node);
900 while (tree_rmap_item) {
901 BUG_ON(!in_stable_tree(tree_rmap_item));
902 cond_resched();
903 page2[0] = get_ksm_page(tree_rmap_item);
904 if (page2[0])
905 break;
906 next_rmap_item = tree_rmap_item->next;
907 remove_rmap_item_from_tree(tree_rmap_item);
908 tree_rmap_item = next_rmap_item;
909 }
910 if (!tree_rmap_item)
911 return NULL;
912
913 ret = memcmp_pages(page, page2[0]);
914
915 if (ret < 0) {
916 put_page(page2[0]);
917 node = node->rb_left;
918 } else if (ret > 0) {
919 put_page(page2[0]);
920 node = node->rb_right;
921 } else {
922 return tree_rmap_item;
923 }
924 }
925
926 return NULL;
927}
928
929/*
930 * stable_tree_insert - insert rmap_item pointing to new ksm page
931 * into the stable tree.
932 *
933 * @page: the page for which we are searching an identical page inside the
934 * stable tree.
935 * @rmap_item: pointer to the reverse mapping item.
936 *
937 * This function returns rmap_item on success, NULL otherwise.
938 */
939static struct rmap_item *stable_tree_insert(struct page *page,
940 struct rmap_item *rmap_item)
941{
942 struct rb_node **new = &root_stable_tree.rb_node;
943 struct rb_node *parent = NULL;
944
945 while (*new) {
946 struct rmap_item *tree_rmap_item, *next_rmap_item;
947 struct page *tree_page;
948 int ret;
949
950 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
951 while (tree_rmap_item) {
952 BUG_ON(!in_stable_tree(tree_rmap_item));
953 cond_resched();
954 tree_page = get_ksm_page(tree_rmap_item);
955 if (tree_page)
956 break;
957 next_rmap_item = tree_rmap_item->next;
958 remove_rmap_item_from_tree(tree_rmap_item);
959 tree_rmap_item = next_rmap_item;
960 }
961 if (!tree_rmap_item)
962 return NULL;
963
964 ret = memcmp_pages(page, tree_page);
965 put_page(tree_page);
966
967 parent = *new;
968 if (ret < 0)
969 new = &parent->rb_left;
970 else if (ret > 0)
971 new = &parent->rb_right;
972 else {
973 /*
974 * It is not a bug that stable_tree_search() didn't
975 * find this node: because at that time our page was
976 * not yet write-protected, so it may have changed since.
977 */
978 return NULL;
979 }
980 }
981
982 rmap_item->address |= NODE_FLAG | STABLE_FLAG;
983 rmap_item->next = NULL;
984 rb_link_node(&rmap_item->node, parent, new);
985 rb_insert_color(&rmap_item->node, &root_stable_tree);
986
987 ksm_pages_shared++;
988 return rmap_item;
989}
990
991/*
992 * unstable_tree_search_insert - search and insert items into the unstable tree.
993 *
994 * @page: the page for which we search an identical page, or which we insert
995 * into the unstable tree if none is found
996 * @page2: pointer filled with the identical page found inside the unstable tree
997 * @rmap_item: the reverse mapping item of page
998 *
999 * This function searches for a page in the unstable tree identical to the
1000 * page currently being scanned; and if no identical page is found in the
1001 * tree, we insert rmap_item as a new object into the unstable tree.
1002 *
1003 * This function returns a pointer to the rmap_item found to be identical
1004 * to the currently scanned page, NULL otherwise.
1005 *
1006 * This function does both searching and inserting, because they share
1007 * the same walking algorithm in an rbtree.
1008 */
1009static struct rmap_item *unstable_tree_search_insert(struct page *page,
1010 struct page **page2,
1011 struct rmap_item *rmap_item)
1012{
1013 struct rb_node **new = &root_unstable_tree.rb_node;
1014 struct rb_node *parent = NULL;
1015
1016 while (*new) {
1017 struct rmap_item *tree_rmap_item;
1018 int ret;
1019
1020 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1021 page2[0] = get_mergeable_page(tree_rmap_item);
1022 if (!page2[0])
1023 return NULL;
1024
1025 /*
1026 * Don't substitute an unswappable ksm page
1027 * just for one good swappable forked page.
1028 */
1029 if (page == page2[0]) {
1030 put_page(page2[0]);
1031 return NULL;
1032 }
1033
1034 ret = memcmp_pages(page, page2[0]);
1035
1036 parent = *new;
1037 if (ret < 0) {
1038 put_page(page2[0]);
1039 new = &parent->rb_left;
1040 } else if (ret > 0) {
1041 put_page(page2[0]);
1042 new = &parent->rb_right;
1043 } else {
1044 return tree_rmap_item;
1045 }
1046 }
1047
1048 rmap_item->address |= NODE_FLAG;
1049 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1050 rb_link_node(&rmap_item->node, parent, new);
1051 rb_insert_color(&rmap_item->node, &root_unstable_tree);
1052
1053 ksm_pages_unshared++;
1054 return NULL;
1055}
1056
1057/*
1058 * stable_tree_append - add another rmap_item to the linked list of
1059 * rmap_items hanging off a given node of the stable tree, all sharing
1060 * the same ksm page.
1061 */
1062static void stable_tree_append(struct rmap_item *rmap_item,
1063 struct rmap_item *tree_rmap_item)
1064{
1065 rmap_item->next = tree_rmap_item->next;
1066 rmap_item->prev = tree_rmap_item;
1067
1068 if (tree_rmap_item->next)
1069 tree_rmap_item->next->prev = rmap_item;
1070
1071 tree_rmap_item->next = rmap_item;
1072 rmap_item->address |= STABLE_FLAG;
1073
1074 ksm_pages_sharing++;
1075}
1076
1077/*
1078 * cmp_and_merge_page - first see if page can be merged into the stable tree;
1079 * if not, compare checksum to previous and if it's the same, see if page can
1080 * be inserted into the unstable tree, or merged with a page already there and
1081 * both transferred to the stable tree.
1082 *
1083 * @page: the page for which we are searching an identical page.
1084 * @rmap_item: the reverse mapping into the virtual address of this page
1085 */
1086static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1087{
1088 struct page *page2[1];
1089 struct rmap_item *tree_rmap_item;
1090 unsigned int checksum;
1091 int err;
1092
1093 if (in_stable_tree(rmap_item))
1094 remove_rmap_item_from_tree(rmap_item);
1095
1096 /* We first start with searching the page inside the stable tree */
1097 tree_rmap_item = stable_tree_search(page, page2, rmap_item);
1098 if (tree_rmap_item) {
1099 if (page == page2[0]) /* forked */
1100 err = 0;
1101 else
1102 err = try_to_merge_with_ksm_page(rmap_item->mm,
1103 rmap_item->address,
1104 page, page2[0]);
1105 put_page(page2[0]);
1106
1107 if (!err) {
1108 /*
1109 * The page was successfully merged:
1110 * add its rmap_item to the stable tree.
1111 */
1112 stable_tree_append(rmap_item, tree_rmap_item);
1113 }
1114 return;
1115 }
1116
1117 /*
1118 * A ksm page might have got here by fork, but its other
1119 * references have already been removed from the stable tree.
1120 * Or it might be left over from a break_ksm which failed
1121 * when the mem_cgroup had reached its limit: try again now.
1122 */
1123 if (PageKsm(page))
1124 break_cow(rmap_item->mm, rmap_item->address);
1125
1126 /*
1127 * If the hash value of the page has changed since the last time we
1128 * calculated it, this page is being changed frequently: therefore we
1129 * don't want to insert it into the unstable tree, and we don't want to
1130 * waste our time searching for something identical to it there.
1131 */
1132 checksum = calc_checksum(page);
1133 if (rmap_item->oldchecksum != checksum) {
1134 rmap_item->oldchecksum = checksum;
1135 return;
1136 }
1137
1138 tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
1139 if (tree_rmap_item) {
1140 err = try_to_merge_two_pages(rmap_item->mm,
1141 rmap_item->address, page,
1142 tree_rmap_item->mm,
1143 tree_rmap_item->address, page2[0]);
1144 /*
1145 * As soon as we merge this page, we want to remove the
1146 * rmap_item of the page we have merged with from the unstable
1147 * tree, and insert it instead as new node in the stable tree.
1148 */
1149 if (!err) {
1150 rb_erase(&tree_rmap_item->node, &root_unstable_tree);
1151 tree_rmap_item->address &= ~NODE_FLAG;
1152 ksm_pages_unshared--;
1153
1154 /*
1155 * If we fail to insert the page into the stable tree,
1156 * we will have 2 virtual addresses that are pointing
1157 * to a ksm page left outside the stable tree,
1158 * in which case we need to break_cow on both.
1159 */
1160 if (stable_tree_insert(page2[0], tree_rmap_item))
1161 stable_tree_append(rmap_item, tree_rmap_item);
1162 else {
1163 break_cow(tree_rmap_item->mm,
1164 tree_rmap_item->address);
1165 break_cow(rmap_item->mm, rmap_item->address);
1166 }
1167 }
1168
1169 put_page(page2[0]);
1170 }
1171}
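The checksum computed by calc_checksum() acts as a volatility filter rather than a lookup key: cmp_and_merge_page() above only offers a page to the unstable tree once its content has produced the same checksum on two consecutive scans. A minimal userspace sketch of that filter, assuming a trivial polynomial hash in place of jhash2() and a fixed buffer in place of a scanned page:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SZ 4096

/* Trivial stand-in for calc_checksum()/jhash2(). */
static uint32_t checksum(const unsigned char *page)
{
	uint32_t sum = 0;
	for (size_t i = 0; i < PAGE_SZ; i++)
		sum = sum * 31 + page[i];
	return sum;
}

struct tracked_page {
	unsigned char data[PAGE_SZ];
	uint32_t oldchecksum;	/* mirrors rmap_item->oldchecksum */
};

/* Returns 1 if the page looked stable across two scans and may be merged. */
static int stable_enough_to_merge(struct tracked_page *p)
{
	uint32_t sum = checksum(p->data);

	if (p->oldchecksum != sum) {
		p->oldchecksum = sum;	/* remember it, try again next scan */
		return 0;
	}
	return 1;
}

int main(void)
{
	struct tracked_page p = { .oldchecksum = 0 };

	memset(p.data, 0x42, PAGE_SZ);
	printf("scan 1: merge? %d\n", stable_enough_to_merge(&p));	/* 0 */
	printf("scan 2: merge? %d\n", stable_enough_to_merge(&p));	/* 1 */
	p.data[7]++;	/* page written between scans */
	printf("scan 3: merge? %d\n", stable_enough_to_merge(&p));	/* 0 */
	return 0;
}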
1172
1173static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
1174 struct list_head *cur,
1175 unsigned long addr)
1176{
1177 struct rmap_item *rmap_item;
1178
1179 while (cur != &mm_slot->rmap_list) {
1180 rmap_item = list_entry(cur, struct rmap_item, link);
1181 if ((rmap_item->address & PAGE_MASK) == addr) {
1182 if (!in_stable_tree(rmap_item))
1183 remove_rmap_item_from_tree(rmap_item);
1184 return rmap_item;
1185 }
1186 if (rmap_item->address > addr)
1187 break;
1188 cur = cur->next;
1189 remove_rmap_item_from_tree(rmap_item);
1190 list_del(&rmap_item->link);
1191 free_rmap_item(rmap_item);
1192 }
1193
1194 rmap_item = alloc_rmap_item();
1195 if (rmap_item) {
1196 /* It has already been zeroed */
1197 rmap_item->mm = mm_slot->mm;
1198 rmap_item->address = addr;
1199 list_add_tail(&rmap_item->link, cur);
1200 }
1201 return rmap_item;
1202}
1203
1204static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1205{
1206 struct mm_struct *mm;
1207 struct mm_slot *slot;
1208 struct vm_area_struct *vma;
1209 struct rmap_item *rmap_item;
1210
1211 if (list_empty(&ksm_mm_head.mm_list))
1212 return NULL;
1213
1214 slot = ksm_scan.mm_slot;
1215 if (slot == &ksm_mm_head) {
1216 root_unstable_tree = RB_ROOT;
1217
1218 spin_lock(&ksm_mmlist_lock);
1219 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
1220 ksm_scan.mm_slot = slot;
1221 spin_unlock(&ksm_mmlist_lock);
1222next_mm:
1223 ksm_scan.address = 0;
1224 ksm_scan.rmap_item = list_entry(&slot->rmap_list,
1225 struct rmap_item, link);
1226 }
1227
1228 mm = slot->mm;
1229 down_read(&mm->mmap_sem);
1230 if (ksm_test_exit(mm))
1231 vma = NULL;
1232 else
1233 vma = find_vma(mm, ksm_scan.address);
1234
1235 for (; vma; vma = vma->vm_next) {
1236 if (!(vma->vm_flags & VM_MERGEABLE))
1237 continue;
1238 if (ksm_scan.address < vma->vm_start)
1239 ksm_scan.address = vma->vm_start;
1240 if (!vma->anon_vma)
1241 ksm_scan.address = vma->vm_end;
1242
1243 while (ksm_scan.address < vma->vm_end) {
1244 if (ksm_test_exit(mm))
1245 break;
1246 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
1247 if (*page && PageAnon(*page)) {
1248 flush_anon_page(vma, *page, ksm_scan.address);
1249 flush_dcache_page(*page);
1250 rmap_item = get_next_rmap_item(slot,
1251 ksm_scan.rmap_item->link.next,
1252 ksm_scan.address);
1253 if (rmap_item) {
1254 ksm_scan.rmap_item = rmap_item;
1255 ksm_scan.address += PAGE_SIZE;
1256 } else
1257 put_page(*page);
1258 up_read(&mm->mmap_sem);
1259 return rmap_item;
1260 }
1261 if (*page)
1262 put_page(*page);
1263 ksm_scan.address += PAGE_SIZE;
1264 cond_resched();
1265 }
1266 }
1267
1268 if (ksm_test_exit(mm)) {
1269 ksm_scan.address = 0;
1270 ksm_scan.rmap_item = list_entry(&slot->rmap_list,
1271 struct rmap_item, link);
1272 }
1273 /*
1274 * Nuke all the rmap_items that are above this current rmap:
1275 * because there were no VM_MERGEABLE vmas with such addresses.
1276 */
1277 remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);
1278
1279 spin_lock(&ksm_mmlist_lock);
1280 ksm_scan.mm_slot = list_entry(slot->mm_list.next,
1281 struct mm_slot, mm_list);
1282 if (ksm_scan.address == 0) {
1283 /*
1284 * We've completed a full scan of all vmas, holding mmap_sem
1285 * throughout, and found no VM_MERGEABLE: so do the same as
1286 * __ksm_exit does to remove this mm from all our lists now.
1287 * This applies either when cleaning up after __ksm_exit
1288 * (but beware: we can reach here even before __ksm_exit),
1289 * or when all VM_MERGEABLE areas have been unmapped (and
1290 * mmap_sem then protects against race with MADV_MERGEABLE).
1291 */
1292 hlist_del(&slot->link);
1293 list_del(&slot->mm_list);
1294 spin_unlock(&ksm_mmlist_lock);
1295
1296 free_mm_slot(slot);
1297 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1298 up_read(&mm->mmap_sem);
1299 mmdrop(mm);
1300 } else {
1301 spin_unlock(&ksm_mmlist_lock);
1302 up_read(&mm->mmap_sem);
1303 }
1304
1305 /* Repeat until we've completed scanning the whole list */
1306 slot = ksm_scan.mm_slot;
1307 if (slot != &ksm_mm_head)
1308 goto next_mm;
1309
1310 ksm_scan.seqnr++;
1311 return NULL;
1312}
1313
1314/**
1315 * ksm_do_scan - the ksm scanner main worker function.
1316 * @scan_npages - number of pages we want to scan before we return.
1317 */
1318static void ksm_do_scan(unsigned int scan_npages)
1319{
1320 struct rmap_item *rmap_item;
1321 struct page *page;
1322
1323 while (scan_npages--) {
1324 cond_resched();
1325 rmap_item = scan_get_next_rmap_item(&page);
1326 if (!rmap_item)
1327 return;
1328 if (!PageKsm(page) || !in_stable_tree(rmap_item))
1329 cmp_and_merge_page(page, rmap_item);
1330 else if (page_mapcount(page) == 1) {
1331 /*
1332 * Replace now-unshared ksm page by ordinary page.
1333 */
1334 break_cow(rmap_item->mm, rmap_item->address);
1335 remove_rmap_item_from_tree(rmap_item);
1336 rmap_item->oldchecksum = calc_checksum(page);
1337 }
1338 put_page(page);
1339 }
1340}
1341
1342static int ksmd_should_run(void)
1343{
1344 return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
1345}
1346
1347static int ksm_scan_thread(void *nothing)
1348{
1349 set_user_nice(current, 5);
1350
1351 while (!kthread_should_stop()) {
1352 mutex_lock(&ksm_thread_mutex);
1353 if (ksmd_should_run())
1354 ksm_do_scan(ksm_thread_pages_to_scan);
1355 mutex_unlock(&ksm_thread_mutex);
1356
1357 if (ksmd_should_run()) {
1358 schedule_timeout_interruptible(
1359 msecs_to_jiffies(ksm_thread_sleep_millisecs));
1360 } else {
1361 wait_event_interruptible(ksm_thread_wait,
1362 ksmd_should_run() || kthread_should_stop());
1363 }
1364 }
1365 return 0;
1366}
1367
1368int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1369 unsigned long end, int advice, unsigned long *vm_flags)
1370{
1371 struct mm_struct *mm = vma->vm_mm;
1372 int err;
1373
1374 switch (advice) {
1375 case MADV_MERGEABLE:
1376 /*
1377 * Be somewhat over-protective for now!
1378 */
1379 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
1380 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1381 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1382 VM_MIXEDMAP | VM_SAO))
1383 return 0; /* just ignore the advice */
1384
1385 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
1386 err = __ksm_enter(mm);
1387 if (err)
1388 return err;
1389 }
1390
1391 *vm_flags |= VM_MERGEABLE;
1392 break;
1393
1394 case MADV_UNMERGEABLE:
1395 if (!(*vm_flags & VM_MERGEABLE))
1396 return 0; /* just ignore the advice */
1397
1398 if (vma->anon_vma) {
1399 err = unmerge_ksm_pages(vma, start, end);
1400 if (err)
1401 return err;
1402 }
1403
1404 *vm_flags &= ~VM_MERGEABLE;
1405 break;
1406 }
1407
1408 return 0;
1409}
1410
1411int __ksm_enter(struct mm_struct *mm)
1412{
1413 struct mm_slot *mm_slot;
1414 int needs_wakeup;
1415
1416 mm_slot = alloc_mm_slot();
1417 if (!mm_slot)
1418 return -ENOMEM;
1419
1420 /* Check ksm_run too? Would need tighter locking */
1421 needs_wakeup = list_empty(&ksm_mm_head.mm_list);
1422
1423 spin_lock(&ksm_mmlist_lock);
1424 insert_to_mm_slots_hash(mm, mm_slot);
1425 /*
1426 * Insert just behind the scanning cursor, to let the area settle
1427 * down a little; when fork is followed by immediate exec, we don't
1428 * want ksmd to waste time setting up and tearing down an rmap_list.
1429 */
1430 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
1431 spin_unlock(&ksm_mmlist_lock);
1432
1433 set_bit(MMF_VM_MERGEABLE, &mm->flags);
1434 atomic_inc(&mm->mm_count);
1435
1436 if (needs_wakeup)
1437 wake_up_interruptible(&ksm_thread_wait);
1438
1439 return 0;
1440}
1441
1442void __ksm_exit(struct mm_struct *mm)
1443{
1444 struct mm_slot *mm_slot;
1445 int easy_to_free = 0;
1446
1447 /*
1448 * This process is exiting: if it's straightforward (as is the
1449 * case when ksmd was never running), free mm_slot immediately.
1450 * But if it's at the cursor or has rmap_items linked to it, use
1451 * mmap_sem to synchronize with any break_cows before pagetables
1452 * are freed, and leave the mm_slot on the list for ksmd to free.
1453 * Beware: ksm may already have noticed it exiting and freed the slot.
1454 */
1455
1456 spin_lock(&ksm_mmlist_lock);
1457 mm_slot = get_mm_slot(mm);
1458 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
1459 if (list_empty(&mm_slot->rmap_list)) {
1460 hlist_del(&mm_slot->link);
1461 list_del(&mm_slot->mm_list);
1462 easy_to_free = 1;
1463 } else {
1464 list_move(&mm_slot->mm_list,
1465 &ksm_scan.mm_slot->mm_list);
1466 }
1467 }
1468 spin_unlock(&ksm_mmlist_lock);
1469
1470 if (easy_to_free) {
1471 free_mm_slot(mm_slot);
1472 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1473 mmdrop(mm);
1474 } else if (mm_slot) {
1475 down_write(&mm->mmap_sem);
1476 up_write(&mm->mmap_sem);
1477 }
1478}
1479
1480#ifdef CONFIG_SYSFS
1481/*
1482 * This all compiles without CONFIG_SYSFS, but is a waste of space.
1483 */
1484
1485#define KSM_ATTR_RO(_name) \
1486 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1487#define KSM_ATTR(_name) \
1488 static struct kobj_attribute _name##_attr = \
1489 __ATTR(_name, 0644, _name##_show, _name##_store)
1490
1491static ssize_t sleep_millisecs_show(struct kobject *kobj,
1492 struct kobj_attribute *attr, char *buf)
1493{
1494 return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
1495}
1496
1497static ssize_t sleep_millisecs_store(struct kobject *kobj,
1498 struct kobj_attribute *attr,
1499 const char *buf, size_t count)
1500{
1501 unsigned long msecs;
1502 int err;
1503
1504 err = strict_strtoul(buf, 10, &msecs);
1505 if (err || msecs > UINT_MAX)
1506 return -EINVAL;
1507
1508 ksm_thread_sleep_millisecs = msecs;
1509
1510 return count;
1511}
1512KSM_ATTR(sleep_millisecs);
1513
1514static ssize_t pages_to_scan_show(struct kobject *kobj,
1515 struct kobj_attribute *attr, char *buf)
1516{
1517 return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
1518}
1519
1520static ssize_t pages_to_scan_store(struct kobject *kobj,
1521 struct kobj_attribute *attr,
1522 const char *buf, size_t count)
1523{
1524 int err;
1525 unsigned long nr_pages;
1526
1527 err = strict_strtoul(buf, 10, &nr_pages);
1528 if (err || nr_pages > UINT_MAX)
1529 return -EINVAL;
1530
1531 ksm_thread_pages_to_scan = nr_pages;
1532
1533 return count;
1534}
1535KSM_ATTR(pages_to_scan);
1536
1537static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
1538 char *buf)
1539{
1540 return sprintf(buf, "%u\n", ksm_run);
1541}
1542
1543static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1544 const char *buf, size_t count)
1545{
1546 int err;
1547 unsigned long flags;
1548
1549 err = strict_strtoul(buf, 10, &flags);
1550 if (err || flags > UINT_MAX)
1551 return -EINVAL;
1552 if (flags > KSM_RUN_UNMERGE)
1553 return -EINVAL;
1554
1555 /*
1556 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
1557 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
1558 * breaking COW to free the unswappable pages_shared (but leaves
1559 * mm_slots on the list for when ksmd may be set running again).
1560 */
1561
1562 mutex_lock(&ksm_thread_mutex);
1563 if (ksm_run != flags) {
1564 ksm_run = flags;
1565 if (flags & KSM_RUN_UNMERGE) {
1566 current->flags |= PF_OOM_ORIGIN;
1567 err = unmerge_and_remove_all_rmap_items();
1568 current->flags &= ~PF_OOM_ORIGIN;
1569 if (err) {
1570 ksm_run = KSM_RUN_STOP;
1571 count = err;
1572 }
1573 }
1574 }
1575 mutex_unlock(&ksm_thread_mutex);
1576
1577 if (flags & KSM_RUN_MERGE)
1578 wake_up_interruptible(&ksm_thread_wait);
1579
1580 return count;
1581}
1582KSM_ATTR(run);
1583
1584static ssize_t max_kernel_pages_store(struct kobject *kobj,
1585 struct kobj_attribute *attr,
1586 const char *buf, size_t count)
1587{
1588 int err;
1589 unsigned long nr_pages;
1590
1591 err = strict_strtoul(buf, 10, &nr_pages);
1592 if (err)
1593 return -EINVAL;
1594
1595 ksm_max_kernel_pages = nr_pages;
1596
1597 return count;
1598}
1599
1600static ssize_t max_kernel_pages_show(struct kobject *kobj,
1601 struct kobj_attribute *attr, char *buf)
1602{
1603 return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
1604}
1605KSM_ATTR(max_kernel_pages);
1606
1607static ssize_t pages_shared_show(struct kobject *kobj,
1608 struct kobj_attribute *attr, char *buf)
1609{
1610 return sprintf(buf, "%lu\n", ksm_pages_shared);
1611}
1612KSM_ATTR_RO(pages_shared);
1613
1614static ssize_t pages_sharing_show(struct kobject *kobj,
1615 struct kobj_attribute *attr, char *buf)
1616{
1617 return sprintf(buf, "%lu\n", ksm_pages_sharing);
1618}
1619KSM_ATTR_RO(pages_sharing);
1620
1621static ssize_t pages_unshared_show(struct kobject *kobj,
1622 struct kobj_attribute *attr, char *buf)
1623{
1624 return sprintf(buf, "%lu\n", ksm_pages_unshared);
1625}
1626KSM_ATTR_RO(pages_unshared);
1627
1628static ssize_t pages_volatile_show(struct kobject *kobj,
1629 struct kobj_attribute *attr, char *buf)
1630{
1631 long ksm_pages_volatile;
1632
1633 ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
1634 - ksm_pages_sharing - ksm_pages_unshared;
1635 /*
1636 * It was not worth any locking to calculate that statistic,
1637 * but it might therefore sometimes be negative: conceal that.
1638 */
1639 if (ksm_pages_volatile < 0)
1640 ksm_pages_volatile = 0;
1641 return sprintf(buf, "%ld\n", ksm_pages_volatile);
1642}
1643KSM_ATTR_RO(pages_volatile);
1644
1645static ssize_t full_scans_show(struct kobject *kobj,
1646 struct kobj_attribute *attr, char *buf)
1647{
1648 return sprintf(buf, "%lu\n", ksm_scan.seqnr);
1649}
1650KSM_ATTR_RO(full_scans);
1651
1652static struct attribute *ksm_attrs[] = {
1653 &sleep_millisecs_attr.attr,
1654 &pages_to_scan_attr.attr,
1655 &run_attr.attr,
1656 &max_kernel_pages_attr.attr,
1657 &pages_shared_attr.attr,
1658 &pages_sharing_attr.attr,
1659 &pages_unshared_attr.attr,
1660 &pages_volatile_attr.attr,
1661 &full_scans_attr.attr,
1662 NULL,
1663};
1664
1665static struct attribute_group ksm_attr_group = {
1666 .attrs = ksm_attrs,
1667 .name = "ksm",
1668};
1669#endif /* CONFIG_SYSFS */
1670
1671static int __init ksm_init(void)
1672{
1673 struct task_struct *ksm_thread;
1674 int err;
1675
1676 ksm_init_max_kernel_pages();
1677
1678 err = ksm_slab_init();
1679 if (err)
1680 goto out;
1681
1682 err = mm_slots_hash_init();
1683 if (err)
1684 goto out_free1;
1685
1686 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
1687 if (IS_ERR(ksm_thread)) {
1688 printk(KERN_ERR "ksm: creating kthread failed\n");
1689 err = PTR_ERR(ksm_thread);
1690 goto out_free2;
1691 }
1692
1693#ifdef CONFIG_SYSFS
1694 err = sysfs_create_group(mm_kobj, &ksm_attr_group);
1695 if (err) {
1696 printk(KERN_ERR "ksm: register sysfs failed\n");
1697 kthread_stop(ksm_thread);
1698 goto out_free2;
1699 }
1700#endif /* CONFIG_SYSFS */
1701
1702 return 0;
1703
1704out_free2:
1705 mm_slots_hash_free();
1706out_free1:
1707 ksm_slab_free();
1708out:
1709 return err;
1710}
1711module_init(ksm_init)
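When CONFIG_SYSFS is enabled, the attributes above appear under /sys/kernel/mm/ksm/. A minimal userspace sketch of driving them from C (needs root; writing 1 to /sys/kernel/mm/ksm/run from a shell does the same job), assuming only the attribute names defined in the group above:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static int ksm_write(const char *name, const char *val)
{
	char path[128];
	int fd, ok;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ok = write(fd, val, strlen(val)) == (ssize_t)strlen(val);
	close(fd);
	return ok ? 0 : -1;
}

static long ksm_read(const char *name)
{
	char path[128], buf[32];
	ssize_t n;
	int fd;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	n = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (n <= 0)
		return -1;
	buf[n] = '\0';
	return atol(buf);
}

int main(void)
{
	ksm_write("pages_to_scan", "100");
	ksm_write("sleep_millisecs", "20");
	if (ksm_write("run", "1"))		/* KSM_RUN_MERGE: start ksmd */
		perror("enabling ksm");

	printf("pages_shared:  %ld\n", ksm_read("pages_shared"));
	printf("pages_sharing: %ld\n", ksm_read("pages_sharing"));
	return 0;
}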
diff --git a/mm/madvise.c b/mm/madvise.c
index 76eb4193acdd..35b1479b7c9d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,6 +11,7 @@
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/hugetlb.h> 12#include <linux/hugetlb.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/ksm.h>
14 15
15/* 16/*
16 * Any behaviour which results in changes to the vma->vm_flags needs to 17 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -41,7 +42,7 @@ static long madvise_behavior(struct vm_area_struct * vma,
41 struct mm_struct * mm = vma->vm_mm; 42 struct mm_struct * mm = vma->vm_mm;
42 int error = 0; 43 int error = 0;
43 pgoff_t pgoff; 44 pgoff_t pgoff;
44 int new_flags = vma->vm_flags; 45 unsigned long new_flags = vma->vm_flags;
45 46
46 switch (behavior) { 47 switch (behavior) {
47 case MADV_NORMAL: 48 case MADV_NORMAL:
@@ -57,8 +58,18 @@ static long madvise_behavior(struct vm_area_struct * vma,
57 new_flags |= VM_DONTCOPY; 58 new_flags |= VM_DONTCOPY;
58 break; 59 break;
59 case MADV_DOFORK: 60 case MADV_DOFORK:
61 if (vma->vm_flags & VM_IO) {
62 error = -EINVAL;
63 goto out;
64 }
60 new_flags &= ~VM_DONTCOPY; 65 new_flags &= ~VM_DONTCOPY;
61 break; 66 break;
67 case MADV_MERGEABLE:
68 case MADV_UNMERGEABLE:
69 error = ksm_madvise(vma, start, end, behavior, &new_flags);
70 if (error)
71 goto out;
72 break;
62 } 73 }
63 74
64 if (new_flags == vma->vm_flags) { 75 if (new_flags == vma->vm_flags) {
@@ -207,41 +218,46 @@ static long madvise_remove(struct vm_area_struct *vma,
207 return error; 218 return error;
208} 219}
209 220
221#ifdef CONFIG_MEMORY_FAILURE
222/*
223 * Error injection support for memory error handling.
224 */
225static int madvise_hwpoison(unsigned long start, unsigned long end)
226{
227 int ret = 0;
228
229 if (!capable(CAP_SYS_ADMIN))
230 return -EPERM;
231 for (; start < end; start += PAGE_SIZE) {
232 struct page *p;
233 int ret = get_user_pages(current, current->mm, start, 1,
234 0, 0, &p, NULL);
235 if (ret != 1)
236 return ret;
237 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
238 page_to_pfn(p), start);
239 /* Ignore return value for now */
240 __memory_failure(page_to_pfn(p), 0, 1);
241 put_page(p);
242 }
243 return ret;
244}
245#endif
246
210static long 247static long
211madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, 248madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
212 unsigned long start, unsigned long end, int behavior) 249 unsigned long start, unsigned long end, int behavior)
213{ 250{
214 long error;
215
216 switch (behavior) { 251 switch (behavior) {
217 case MADV_DOFORK:
218 if (vma->vm_flags & VM_IO) {
219 error = -EINVAL;
220 break;
221 }
222 case MADV_DONTFORK:
223 case MADV_NORMAL:
224 case MADV_SEQUENTIAL:
225 case MADV_RANDOM:
226 error = madvise_behavior(vma, prev, start, end, behavior);
227 break;
228 case MADV_REMOVE: 252 case MADV_REMOVE:
229 error = madvise_remove(vma, prev, start, end); 253 return madvise_remove(vma, prev, start, end);
230 break;
231
232 case MADV_WILLNEED: 254 case MADV_WILLNEED:
233 error = madvise_willneed(vma, prev, start, end); 255 return madvise_willneed(vma, prev, start, end);
234 break;
235
236 case MADV_DONTNEED: 256 case MADV_DONTNEED:
237 error = madvise_dontneed(vma, prev, start, end); 257 return madvise_dontneed(vma, prev, start, end);
238 break;
239
240 default: 258 default:
241 BUG(); 259 return madvise_behavior(vma, prev, start, end, behavior);
242 break;
243 } 260 }
244 return error;
245} 261}
246 262
247static int 263static int
@@ -256,12 +272,17 @@ madvise_behavior_valid(int behavior)
256 case MADV_REMOVE: 272 case MADV_REMOVE:
257 case MADV_WILLNEED: 273 case MADV_WILLNEED:
258 case MADV_DONTNEED: 274 case MADV_DONTNEED:
275#ifdef CONFIG_KSM
276 case MADV_MERGEABLE:
277 case MADV_UNMERGEABLE:
278#endif
259 return 1; 279 return 1;
260 280
261 default: 281 default:
262 return 0; 282 return 0;
263 } 283 }
264} 284}
285
265/* 286/*
266 * The madvise(2) system call. 287 * The madvise(2) system call.
267 * 288 *
@@ -286,6 +307,12 @@ madvise_behavior_valid(int behavior)
286 * so the kernel can free resources associated with it. 307 * so the kernel can free resources associated with it.
287 * MADV_REMOVE - the application wants to free up the given range of 308 * MADV_REMOVE - the application wants to free up the given range of
288 * pages and associated backing store. 309 * pages and associated backing store.
310 * MADV_DONTFORK - omit this area from child's address space when forking:
311 * typically, to avoid COWing pages pinned by get_user_pages().
312 * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
313 * MADV_MERGEABLE - the application recommends that KSM try to merge pages in
314 * this area with pages of identical content from other such areas.
315 * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
289 * 316 *
290 * return values: 317 * return values:
291 * zero - success 318 * zero - success
@@ -307,6 +334,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
307 int write; 334 int write;
308 size_t len; 335 size_t len;
309 336
337#ifdef CONFIG_MEMORY_FAILURE
338 if (behavior == MADV_HWPOISON)
339 return madvise_hwpoison(start, start+len_in);
340#endif
310 if (!madvise_behavior_valid(behavior)) 341 if (!madvise_behavior_valid(behavior))
311 return error; 342 return error;
312 343
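From userspace the new advice values are used like any other madvise(2) behaviour: map page-aligned anonymous memory, fill it, then mark it mergeable. A minimal sketch, assuming the MADV_MERGEABLE/MADV_UNMERGEABLE constants added by this series are already visible through <sys/mman.h>; on older headers the #ifdef simply falls through:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define LEN (256 * 4096)	/* 256 pages of identical content */

int main(void)
{
	void *buf = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0x55, LEN);	/* every page now has identical content */

#ifdef MADV_MERGEABLE
	if (madvise(buf, LEN, MADV_MERGEABLE))
		perror("madvise(MADV_MERGEABLE)");
	/* ... ksmd may merge these pages over its next scans ... */
	if (madvise(buf, LEN, MADV_UNMERGEABLE))	/* undo: break COW again */
		perror("madvise(MADV_UNMERGEABLE)");
#else
	fprintf(stderr, "headers lack MADV_MERGEABLE\n");
#endif
	munmap(buf, LEN);
	return 0;
}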
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fd4529d86de5..e2b98a6875c0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
29#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/limits.h> 30#include <linux/limits.h>
31#include <linux/mutex.h> 31#include <linux/mutex.h>
32#include <linux/rbtree.h>
32#include <linux/slab.h> 33#include <linux/slab.h>
33#include <linux/swap.h> 34#include <linux/swap.h>
34#include <linux/spinlock.h> 35#include <linux/spinlock.h>
@@ -43,6 +44,7 @@
43 44
44struct cgroup_subsys mem_cgroup_subsys __read_mostly; 45struct cgroup_subsys mem_cgroup_subsys __read_mostly;
45#define MEM_CGROUP_RECLAIM_RETRIES 5 46#define MEM_CGROUP_RECLAIM_RETRIES 5
47struct mem_cgroup *root_mem_cgroup __read_mostly;
46 48
47#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 49#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
48/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 50/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
@@ -53,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
53#endif 55#endif
54 56
55static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ 57static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */
58#define SOFTLIMIT_EVENTS_THRESH (1000)
56 59
57/* 60/*
58 * Statistics for memory cgroup. 61 * Statistics for memory cgroup.
@@ -66,6 +69,8 @@ enum mem_cgroup_stat_index {
66 MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ 69 MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */
67 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 70 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
68 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 71 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
72 MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
73 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
69 74
70 MEM_CGROUP_STAT_NSTATS, 75 MEM_CGROUP_STAT_NSTATS,
71}; 76};
@@ -78,6 +83,20 @@ struct mem_cgroup_stat {
78 struct mem_cgroup_stat_cpu cpustat[0]; 83 struct mem_cgroup_stat_cpu cpustat[0];
79}; 84};
80 85
86static inline void
87__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
88 enum mem_cgroup_stat_index idx)
89{
90 stat->count[idx] = 0;
91}
92
93static inline s64
94__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
95 enum mem_cgroup_stat_index idx)
96{
97 return stat->count[idx];
98}
99
81/* 100/*
82 * For accounting under irq disable, no need for increment preempt count. 101 * For accounting under irq disable, no need for increment preempt count.
83 */ 102 */
@@ -117,6 +136,12 @@ struct mem_cgroup_per_zone {
117 unsigned long count[NR_LRU_LISTS]; 136 unsigned long count[NR_LRU_LISTS];
118 137
119 struct zone_reclaim_stat reclaim_stat; 138 struct zone_reclaim_stat reclaim_stat;
139 struct rb_node tree_node; /* RB tree node */
140 unsigned long long usage_in_excess;/* Set to the value by which */
141 /* the soft limit is exceeded*/
142 bool on_tree;
143 struct mem_cgroup *mem; /* Back pointer, we cannot */
144 /* use container_of */
120}; 145};
121/* Macro for accessing counter */ 146/* Macro for accessing counter */
122#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 147#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -130,6 +155,26 @@ struct mem_cgroup_lru_info {
130}; 155};
131 156
132/* 157/*
158 * Cgroups above their limits are maintained in a RB-Tree, independent of
159 * their hierarchy representation
160 */
161
162struct mem_cgroup_tree_per_zone {
163 struct rb_root rb_root;
164 spinlock_t lock;
165};
166
167struct mem_cgroup_tree_per_node {
168 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
169};
170
171struct mem_cgroup_tree {
172 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
173};
174
175static struct mem_cgroup_tree soft_limit_tree __read_mostly;
176
177/*
133 * The memory controller data structure. The memory controller controls both 178 * The memory controller data structure. The memory controller controls both
134 * page cache and RSS per cgroup. We would eventually like to provide 179 * page cache and RSS per cgroup. We would eventually like to provide
135 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 180 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -186,6 +231,13 @@ struct mem_cgroup {
186 struct mem_cgroup_stat stat; 231 struct mem_cgroup_stat stat;
187}; 232};
188 233
234/*
235 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
236 * limit reclaim to prevent infinite loops, if they ever occur.
237 */
238#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
239#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
240
189enum charge_type { 241enum charge_type {
190 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 242 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
191 MEM_CGROUP_CHARGE_TYPE_MAPPED, 243 MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -200,13 +252,8 @@ enum charge_type {
200#define PCGF_CACHE (1UL << PCG_CACHE) 252#define PCGF_CACHE (1UL << PCG_CACHE)
201#define PCGF_USED (1UL << PCG_USED) 253#define PCGF_USED (1UL << PCG_USED)
202#define PCGF_LOCK (1UL << PCG_LOCK) 254#define PCGF_LOCK (1UL << PCG_LOCK)
203static const unsigned long 255/* Not used, but added here for completeness */
204pcg_default_flags[NR_CHARGE_TYPE] = { 256#define PCGF_ACCT (1UL << PCG_ACCT)
205 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
206 PCGF_USED | PCGF_LOCK, /* Anon */
207 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
208 0, /* FORCE */
209};
210 257
211/* for encoding cft->private value on file */ 258/* for encoding cft->private value on file */
212#define _MEM (0) 259#define _MEM (0)
@@ -215,15 +262,241 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
215#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 262#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
216#define MEMFILE_ATTR(val) ((val) & 0xffff) 263#define MEMFILE_ATTR(val) ((val) & 0xffff)
217 264
265/*
266 * Reclaim flags for mem_cgroup_hierarchical_reclaim
267 */
268#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
269#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
270#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
271#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
272#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
273#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
274
218static void mem_cgroup_get(struct mem_cgroup *mem); 275static void mem_cgroup_get(struct mem_cgroup *mem);
219static void mem_cgroup_put(struct mem_cgroup *mem); 276static void mem_cgroup_put(struct mem_cgroup *mem);
220static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 277static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
221 278
279static struct mem_cgroup_per_zone *
280mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
281{
282 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
283}
284
285static struct mem_cgroup_per_zone *
286page_cgroup_zoneinfo(struct page_cgroup *pc)
287{
288 struct mem_cgroup *mem = pc->mem_cgroup;
289 int nid = page_cgroup_nid(pc);
290 int zid = page_cgroup_zid(pc);
291
292 if (!mem)
293 return NULL;
294
295 return mem_cgroup_zoneinfo(mem, nid, zid);
296}
297
298static struct mem_cgroup_tree_per_zone *
299soft_limit_tree_node_zone(int nid, int zid)
300{
301 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
302}
303
304static struct mem_cgroup_tree_per_zone *
305soft_limit_tree_from_page(struct page *page)
306{
307 int nid = page_to_nid(page);
308 int zid = page_zonenum(page);
309
310 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
311}
312
313static void
314__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
315 struct mem_cgroup_per_zone *mz,
316 struct mem_cgroup_tree_per_zone *mctz)
317{
318 struct rb_node **p = &mctz->rb_root.rb_node;
319 struct rb_node *parent = NULL;
320 struct mem_cgroup_per_zone *mz_node;
321
322 if (mz->on_tree)
323 return;
324
325 mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
326 while (*p) {
327 parent = *p;
328 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
329 tree_node);
330 if (mz->usage_in_excess < mz_node->usage_in_excess)
331 p = &(*p)->rb_left;
332 /*
333 * We can't avoid mem cgroups that are over their soft
334 * limit by the same amount
335 */
336 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
337 p = &(*p)->rb_right;
338 }
339 rb_link_node(&mz->tree_node, parent, p);
340 rb_insert_color(&mz->tree_node, &mctz->rb_root);
341 mz->on_tree = true;
342}
343
344static void
345__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
346 struct mem_cgroup_per_zone *mz,
347 struct mem_cgroup_tree_per_zone *mctz)
348{
349 if (!mz->on_tree)
350 return;
351 rb_erase(&mz->tree_node, &mctz->rb_root);
352 mz->on_tree = false;
353}
354
355static void
356mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
357 struct mem_cgroup_per_zone *mz,
358 struct mem_cgroup_tree_per_zone *mctz)
359{
360 spin_lock(&mctz->lock);
361 __mem_cgroup_insert_exceeded(mem, mz, mctz);
362 spin_unlock(&mctz->lock);
363}
364
365static void
366mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
367 struct mem_cgroup_per_zone *mz,
368 struct mem_cgroup_tree_per_zone *mctz)
369{
370 spin_lock(&mctz->lock);
371 __mem_cgroup_remove_exceeded(mem, mz, mctz);
372 spin_unlock(&mctz->lock);
373}
374
375static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
376{
377 bool ret = false;
378 int cpu;
379 s64 val;
380 struct mem_cgroup_stat_cpu *cpustat;
381
382 cpu = get_cpu();
383 cpustat = &mem->stat.cpustat[cpu];
384 val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
385 if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
386 __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
387 ret = true;
388 }
389 put_cpu();
390 return ret;
391}
392
393static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
394{
395 unsigned long long prev_usage_in_excess, new_usage_in_excess;
396 bool updated_tree = false;
397 struct mem_cgroup_per_zone *mz;
398 struct mem_cgroup_tree_per_zone *mctz;
399
400 mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
401 mctz = soft_limit_tree_from_page(page);
402
403 /*
404 * We do updates in lazy mode: mem cgroups are removed
405 * lazily from the per-zone, per-node rb tree
406 */
407 prev_usage_in_excess = mz->usage_in_excess;
408
409 new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
410 if (prev_usage_in_excess) {
411 mem_cgroup_remove_exceeded(mem, mz, mctz);
412 updated_tree = true;
413 }
414 if (!new_usage_in_excess)
415 goto done;
416 mem_cgroup_insert_exceeded(mem, mz, mctz);
417
418done:
419 if (updated_tree) {
420 spin_lock(&mctz->lock);
421 mz->usage_in_excess = new_usage_in_excess;
422 spin_unlock(&mctz->lock);
423 }
424}
425
426static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
427{
428 int node, zone;
429 struct mem_cgroup_per_zone *mz;
430 struct mem_cgroup_tree_per_zone *mctz;
431
432 for_each_node_state(node, N_POSSIBLE) {
433 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
434 mz = mem_cgroup_zoneinfo(mem, node, zone);
435 mctz = soft_limit_tree_node_zone(node, zone);
436 mem_cgroup_remove_exceeded(mem, mz, mctz);
437 }
438 }
439}
440
441static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
442{
443 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
444}
445
446static struct mem_cgroup_per_zone *
447__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
448{
449 struct rb_node *rightmost = NULL;
450 struct mem_cgroup_per_zone *mz = NULL;
451
452retry:
453 rightmost = rb_last(&mctz->rb_root);
454 if (!rightmost)
455 goto done; /* Nothing to reclaim from */
456
457 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
458 /*
459 * Remove the node now but someone else can add it back,
460 * we will add it back at the end of reclaim to its correct
461 * position in the tree.
462 */
463 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
464 if (!res_counter_soft_limit_excess(&mz->mem->res) ||
465 !css_tryget(&mz->mem->css))
466 goto retry;
467done:
468 return mz;
469}
470
471static struct mem_cgroup_per_zone *
472mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
473{
474 struct mem_cgroup_per_zone *mz;
475
476 spin_lock(&mctz->lock);
477 mz = __mem_cgroup_largest_soft_limit_node(mctz);
478 spin_unlock(&mctz->lock);
479 return mz;
480}
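mem_cgroup_largest_soft_limit_node() always picks the group whose usage most exceeds its soft limit, i.e. the rightmost node of the rb-tree keyed on usage_in_excess. A minimal userspace sketch of that selection policy, assuming a plain array in place of the rb-tree (the kernel uses the tree so that insertion and removal stay O(log n)) and made-up excess values:

#include <stdio.h>

struct group {
	const char *name;
	unsigned long long usage_in_excess;	/* bytes over the soft limit */
};

/* Pick the group with the largest excess -- the rb_last() of the kernel tree. */
static struct group *largest_excess(struct group *g, int n)
{
	struct group *best = NULL;

	for (int i = 0; i < n; i++)
		if (g[i].usage_in_excess &&
		    (!best || g[i].usage_in_excess > best->usage_in_excess))
			best = &g[i];
	return best;
}

int main(void)
{
	struct group groups[] = {
		{ "A", 4 << 20 }, { "B", 0 }, { "C", 64 << 20 },
	};
	struct group *victim = largest_excess(groups, 3);

	printf("reclaim first from %s (%llu bytes over)\n",
	       victim->name, victim->usage_in_excess);
	return 0;
}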
481
482static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
483 bool charge)
484{
485 int val = (charge) ? 1 : -1;
486 struct mem_cgroup_stat *stat = &mem->stat;
487 struct mem_cgroup_stat_cpu *cpustat;
488 int cpu = get_cpu();
489
490 cpustat = &stat->cpustat[cpu];
491 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
492 put_cpu();
493}
494
222static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 495static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
223 struct page_cgroup *pc, 496 struct page_cgroup *pc,
224 bool charge) 497 bool charge)
225{ 498{
226 int val = (charge)? 1 : -1; 499 int val = (charge) ? 1 : -1;
227 struct mem_cgroup_stat *stat = &mem->stat; 500 struct mem_cgroup_stat *stat = &mem->stat;
228 struct mem_cgroup_stat_cpu *cpustat; 501 struct mem_cgroup_stat_cpu *cpustat;
229 int cpu = get_cpu(); 502 int cpu = get_cpu();
@@ -240,28 +513,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
240 else 513 else
241 __mem_cgroup_stat_add_safe(cpustat, 514 __mem_cgroup_stat_add_safe(cpustat,
242 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 515 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
516 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
243 put_cpu(); 517 put_cpu();
244} 518}
245 519
246static struct mem_cgroup_per_zone *
247mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
248{
249 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
250}
251
252static struct mem_cgroup_per_zone *
253page_cgroup_zoneinfo(struct page_cgroup *pc)
254{
255 struct mem_cgroup *mem = pc->mem_cgroup;
256 int nid = page_cgroup_nid(pc);
257 int zid = page_cgroup_zid(pc);
258
259 if (!mem)
260 return NULL;
261
262 return mem_cgroup_zoneinfo(mem, nid, zid);
263}
264
265static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 520static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
266 enum lru_list idx) 521 enum lru_list idx)
267{ 522{
@@ -354,6 +609,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
354 return ret; 609 return ret;
355} 610}
356 611
612static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
613{
614 return (mem == root_mem_cgroup);
615}
616
357/* 617/*
358 * Following LRU functions are allowed to be used without PCG_LOCK. 618 * Following LRU functions are allowed to be used without PCG_LOCK.
359 * Operations are called by routine of global LRU independently from memcg. 619 * Operations are called by routine of global LRU independently from memcg.
@@ -371,22 +631,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
371void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 631void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
372{ 632{
373 struct page_cgroup *pc; 633 struct page_cgroup *pc;
374 struct mem_cgroup *mem;
375 struct mem_cgroup_per_zone *mz; 634 struct mem_cgroup_per_zone *mz;
376 635
377 if (mem_cgroup_disabled()) 636 if (mem_cgroup_disabled())
378 return; 637 return;
379 pc = lookup_page_cgroup(page); 638 pc = lookup_page_cgroup(page);
380 /* can happen while we handle swapcache. */ 639 /* can happen while we handle swapcache. */
381 if (list_empty(&pc->lru) || !pc->mem_cgroup) 640 if (!TestClearPageCgroupAcctLRU(pc))
382 return; 641 return;
642 VM_BUG_ON(!pc->mem_cgroup);
383 /* 643 /*
384 * We don't check PCG_USED bit. It's cleared when the "page" is finally 644 * We don't check PCG_USED bit. It's cleared when the "page" is finally
385 * removed from global LRU. 645 * removed from global LRU.
386 */ 646 */
387 mz = page_cgroup_zoneinfo(pc); 647 mz = page_cgroup_zoneinfo(pc);
388 mem = pc->mem_cgroup;
389 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 648 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
649 if (mem_cgroup_is_root(pc->mem_cgroup))
650 return;
651 VM_BUG_ON(list_empty(&pc->lru));
390 list_del_init(&pc->lru); 652 list_del_init(&pc->lru);
391 return; 653 return;
392} 654}
@@ -410,8 +672,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
410 * For making pc->mem_cgroup visible, insert smp_rmb() here. 672 * For making pc->mem_cgroup visible, insert smp_rmb() here.
411 */ 673 */
412 smp_rmb(); 674 smp_rmb();
413 /* unused page is not rotated. */ 675 /* unused or root page is not rotated. */
414 if (!PageCgroupUsed(pc)) 676 if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
415 return; 677 return;
416 mz = page_cgroup_zoneinfo(pc); 678 mz = page_cgroup_zoneinfo(pc);
417 list_move(&pc->lru, &mz->lists[lru]); 679 list_move(&pc->lru, &mz->lists[lru]);
@@ -425,6 +687,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
425 if (mem_cgroup_disabled()) 687 if (mem_cgroup_disabled())
426 return; 688 return;
427 pc = lookup_page_cgroup(page); 689 pc = lookup_page_cgroup(page);
690 VM_BUG_ON(PageCgroupAcctLRU(pc));
428 /* 691 /*
429 * Used bit is set without atomic ops but after smp_wmb(). 692 * Used bit is set without atomic ops but after smp_wmb().
430 * For making pc->mem_cgroup visible, insert smp_rmb() here. 693 * For making pc->mem_cgroup visible, insert smp_rmb() here.
@@ -435,6 +698,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
435 698
436 mz = page_cgroup_zoneinfo(pc); 699 mz = page_cgroup_zoneinfo(pc);
437 MEM_CGROUP_ZSTAT(mz, lru) += 1; 700 MEM_CGROUP_ZSTAT(mz, lru) += 1;
701 SetPageCgroupAcctLRU(pc);
702 if (mem_cgroup_is_root(pc->mem_cgroup))
703 return;
438 list_add(&pc->lru, &mz->lists[lru]); 704 list_add(&pc->lru, &mz->lists[lru]);
439} 705}
440 706
@@ -469,7 +735,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
469 735
470 spin_lock_irqsave(&zone->lru_lock, flags); 736 spin_lock_irqsave(&zone->lru_lock, flags);
471 /* link when the page is linked to LRU but page_cgroup isn't */ 737 /* link when the page is linked to LRU but page_cgroup isn't */
472 if (PageLRU(page) && list_empty(&pc->lru)) 738 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
473 mem_cgroup_add_lru_list(page, page_lru(page)); 739 mem_cgroup_add_lru_list(page, page_lru(page));
474 spin_unlock_irqrestore(&zone->lru_lock, flags); 740 spin_unlock_irqrestore(&zone->lru_lock, flags);
475} 741}
@@ -648,7 +914,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
648 int nid = z->zone_pgdat->node_id; 914 int nid = z->zone_pgdat->node_id;
649 int zid = zone_idx(z); 915 int zid = zone_idx(z);
650 struct mem_cgroup_per_zone *mz; 916 struct mem_cgroup_per_zone *mz;
651 int lru = LRU_FILE * !!file + !!active; 917 int lru = LRU_FILE * file + active;
652 int ret; 918 int ret;
653 919
654 BUG_ON(!mem_cont); 920 BUG_ON(!mem_cont);
@@ -855,28 +1121,62 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
855 * If shrink==true, for avoiding to free too much, this returns immedieately. 1121 * If shrink==true, for avoiding to free too much, this returns immedieately.
856 */ 1122 */
857static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1123static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
858 gfp_t gfp_mask, bool noswap, bool shrink) 1124 struct zone *zone,
1125 gfp_t gfp_mask,
1126 unsigned long reclaim_options)
859{ 1127{
860 struct mem_cgroup *victim; 1128 struct mem_cgroup *victim;
861 int ret, total = 0; 1129 int ret, total = 0;
862 int loop = 0; 1130 int loop = 0;
1131 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1132 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1133 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1134 unsigned long excess = mem_cgroup_get_excess(root_mem);
863 1135
864 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1136 /* If memsw_is_minimum==1, swap-out is of-no-use. */
865 if (root_mem->memsw_is_minimum) 1137 if (root_mem->memsw_is_minimum)
866 noswap = true; 1138 noswap = true;
867 1139
868 while (loop < 2) { 1140 while (1) {
869 victim = mem_cgroup_select_victim(root_mem); 1141 victim = mem_cgroup_select_victim(root_mem);
870 if (victim == root_mem) 1142 if (victim == root_mem) {
871 loop++; 1143 loop++;
1144 if (loop >= 2) {
1145 /*
1146 * If we have not been able to reclaim
 1147 * anything, it might be because there are
1148 * no reclaimable pages under this hierarchy
1149 */
1150 if (!check_soft || !total) {
1151 css_put(&victim->css);
1152 break;
1153 }
1154 /*
 1155 * We want to do more targeted reclaim.
 1156 * excess >> 2 is not so excessive that we
 1157 * reclaim too much, nor so small that we keep
 1158 * coming back to reclaim from this cgroup
1159 */
1160 if (total >= (excess >> 2) ||
1161 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1162 css_put(&victim->css);
1163 break;
1164 }
1165 }
1166 }
872 if (!mem_cgroup_local_usage(&victim->stat)) { 1167 if (!mem_cgroup_local_usage(&victim->stat)) {
873 /* this cgroup's local usage == 0 */ 1168 /* this cgroup's local usage == 0 */
874 css_put(&victim->css); 1169 css_put(&victim->css);
875 continue; 1170 continue;
876 } 1171 }
877 /* we use swappiness of local cgroup */ 1172 /* we use swappiness of local cgroup */
878 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, 1173 if (check_soft)
879 get_swappiness(victim)); 1174 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1175 noswap, get_swappiness(victim), zone,
1176 zone->zone_pgdat->node_id);
1177 else
1178 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1179 noswap, get_swappiness(victim));
880 css_put(&victim->css); 1180 css_put(&victim->css);
881 /* 1181 /*
882 * At shrinking usage, we can't check we should stop here or 1182 * At shrinking usage, we can't check we should stop here or
@@ -886,7 +1186,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
886 if (shrink) 1186 if (shrink)
887 return ret; 1187 return ret;
888 total += ret; 1188 total += ret;
889 if (mem_cgroup_check_under_limit(root_mem)) 1189 if (check_soft) {
1190 if (res_counter_check_under_soft_limit(&root_mem->res))
1191 return total;
1192 } else if (mem_cgroup_check_under_limit(root_mem))
890 return 1 + total; 1193 return 1 + total;
891 } 1194 }
892 return total; 1195 return total;
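
The two booleans of the old signature (noswap, shrink) are folded into a single reclaim_options bitmask above, with soft-limit reclaim added as a third mode. A minimal sketch of how callers compose the flags and how the function recovers them; the bit values are illustrative, since the real #defines live in a part of memcontrol.c this hunk does not show:

#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << 0)	/* reclaim without touching swap */
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << 1)	/* return after one pass         */
#define MEM_CGROUP_RECLAIM_SOFT		(1 << 2)	/* soft-limit driven reclaim     */

static void reclaim_options_example(void)
{
	/* What used to be (noswap = true, shrink = true) becomes one mask: */
	unsigned long opts = MEM_CGROUP_RECLAIM_NOSWAP | MEM_CGROUP_RECLAIM_SHRINK;

	/* Inside the function the booleans are recovered by masking: */
	bool noswap = opts & MEM_CGROUP_RECLAIM_NOSWAP;
	bool shrink = opts & MEM_CGROUP_RECLAIM_SHRINK;
	bool check_soft = opts & MEM_CGROUP_RECLAIM_SOFT;

	(void)noswap; (void)shrink; (void)check_soft;
}
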
@@ -965,11 +1268,11 @@ done:
965 */ 1268 */
966static int __mem_cgroup_try_charge(struct mm_struct *mm, 1269static int __mem_cgroup_try_charge(struct mm_struct *mm,
967 gfp_t gfp_mask, struct mem_cgroup **memcg, 1270 gfp_t gfp_mask, struct mem_cgroup **memcg,
968 bool oom) 1271 bool oom, struct page *page)
969{ 1272{
970 struct mem_cgroup *mem, *mem_over_limit; 1273 struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
971 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1274 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
972 struct res_counter *fail_res; 1275 struct res_counter *fail_res, *soft_fail_res = NULL;
973 1276
974 if (unlikely(test_thread_flag(TIF_MEMDIE))) { 1277 if (unlikely(test_thread_flag(TIF_MEMDIE))) {
975 /* Don't account this! */ 1278 /* Don't account this! */
@@ -996,20 +1299,23 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
996 VM_BUG_ON(css_is_removed(&mem->css)); 1299 VM_BUG_ON(css_is_removed(&mem->css));
997 1300
998 while (1) { 1301 while (1) {
999 int ret; 1302 int ret = 0;
1000 bool noswap = false; 1303 unsigned long flags = 0;
1001 1304
1002 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); 1305 if (mem_cgroup_is_root(mem))
1306 goto done;
1307 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
1308 &soft_fail_res);
1003 if (likely(!ret)) { 1309 if (likely(!ret)) {
1004 if (!do_swap_account) 1310 if (!do_swap_account)
1005 break; 1311 break;
1006 ret = res_counter_charge(&mem->memsw, PAGE_SIZE, 1312 ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
1007 &fail_res); 1313 &fail_res, NULL);
1008 if (likely(!ret)) 1314 if (likely(!ret))
1009 break; 1315 break;
1010 /* mem+swap counter fails */ 1316 /* mem+swap counter fails */
1011 res_counter_uncharge(&mem->res, PAGE_SIZE); 1317 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
1012 noswap = true; 1318 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1013 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1319 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1014 memsw); 1320 memsw);
1015 } else 1321 } else
@@ -1020,8 +1326,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1020 if (!(gfp_mask & __GFP_WAIT)) 1326 if (!(gfp_mask & __GFP_WAIT))
1021 goto nomem; 1327 goto nomem;
1022 1328
1023 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, 1329 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1024 noswap, false); 1330 gfp_mask, flags);
1025 if (ret) 1331 if (ret)
1026 continue; 1332 continue;
1027 1333
@@ -1046,13 +1352,24 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1046 goto nomem; 1352 goto nomem;
1047 } 1353 }
1048 } 1354 }
1355 /*
 1356 * Insert just the ancestor; we should trickle down to the correct
1357 * cgroup for reclaim, since the other nodes will be below their
1358 * soft limit
1359 */
1360 if (soft_fail_res) {
1361 mem_over_soft_limit =
1362 mem_cgroup_from_res_counter(soft_fail_res, res);
1363 if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
1364 mem_cgroup_update_tree(mem_over_soft_limit, page);
1365 }
1366done:
1049 return 0; 1367 return 0;
1050nomem: 1368nomem:
1051 css_put(&mem->css); 1369 css_put(&mem->css);
1052 return -ENOMEM; 1370 return -ENOMEM;
1053} 1371}
1054 1372
1055
1056/* 1373/*
1057 * A helper function to get mem_cgroup from ID. must be called under 1374 * A helper function to get mem_cgroup from ID. must be called under
1058 * rcu_read_lock(). The caller must check css_is_removed() or some if 1375 * rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -1119,15 +1436,38 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1119 lock_page_cgroup(pc); 1436 lock_page_cgroup(pc);
1120 if (unlikely(PageCgroupUsed(pc))) { 1437 if (unlikely(PageCgroupUsed(pc))) {
1121 unlock_page_cgroup(pc); 1438 unlock_page_cgroup(pc);
1122 res_counter_uncharge(&mem->res, PAGE_SIZE); 1439 if (!mem_cgroup_is_root(mem)) {
1123 if (do_swap_account) 1440 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
1124 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1441 if (do_swap_account)
1442 res_counter_uncharge(&mem->memsw, PAGE_SIZE,
1443 NULL);
1444 }
1125 css_put(&mem->css); 1445 css_put(&mem->css);
1126 return; 1446 return;
1127 } 1447 }
1448
1128 pc->mem_cgroup = mem; 1449 pc->mem_cgroup = mem;
1450 /*
1451 * We access a page_cgroup asynchronously without lock_page_cgroup().
1452 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
1453 * is accessed after testing USED bit. To make pc->mem_cgroup visible
1454 * before USED bit, we need memory barrier here.
1455 * See mem_cgroup_add_lru_list(), etc.
1456 */
1129 smp_wmb(); 1457 smp_wmb();
1130 pc->flags = pcg_default_flags[ctype]; 1458 switch (ctype) {
1459 case MEM_CGROUP_CHARGE_TYPE_CACHE:
1460 case MEM_CGROUP_CHARGE_TYPE_SHMEM:
1461 SetPageCgroupCache(pc);
1462 SetPageCgroupUsed(pc);
1463 break;
1464 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1465 ClearPageCgroupCache(pc);
1466 SetPageCgroupUsed(pc);
1467 break;
1468 default:
1469 break;
1470 }
1131 1471
1132 mem_cgroup_charge_statistics(mem, pc, true); 1472 mem_cgroup_charge_statistics(mem, pc, true);
1133 1473
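
The new comment spells out the ordering contract: pc->mem_cgroup must be published before the USED bit becomes visible, and readers must test the bit before dereferencing. A sketch of the write/read pairing, modelled on mem_cgroup_rotate_lru_list()/mem_cgroup_add_lru_list() from earlier hunks; it is an illustration of the barrier discipline, not a drop-in replacement for either side:

static void publish_page_cgroup(struct page_cgroup *pc, struct mem_cgroup *mem)
{
	pc->mem_cgroup = mem;		/* store the payload first...         */
	smp_wmb();			/* ...and order it before the flag... */
	SetPageCgroupUsed(pc);		/* ...so readers never see USED early */
}

static struct mem_cgroup_per_zone *observe_page_cgroup(struct page_cgroup *pc)
{
	if (!PageCgroupUsed(pc))	/* test the flag first                */
		return NULL;
	smp_rmb();			/* pairs with the smp_wmb() above     */
	return page_cgroup_zoneinfo(pc); /* pc->mem_cgroup is now visible     */
}
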
@@ -1178,7 +1518,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1178 if (pc->mem_cgroup != from) 1518 if (pc->mem_cgroup != from)
1179 goto out; 1519 goto out;
1180 1520
1181 res_counter_uncharge(&from->res, PAGE_SIZE); 1521 if (!mem_cgroup_is_root(from))
1522 res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
1182 mem_cgroup_charge_statistics(from, pc, false); 1523 mem_cgroup_charge_statistics(from, pc, false);
1183 1524
1184 page = pc->page; 1525 page = pc->page;
@@ -1197,8 +1538,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1197 1); 1538 1);
1198 } 1539 }
1199 1540
1200 if (do_swap_account) 1541 if (do_swap_account && !mem_cgroup_is_root(from))
1201 res_counter_uncharge(&from->memsw, PAGE_SIZE); 1542 res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
1202 css_put(&from->css); 1543 css_put(&from->css);
1203 1544
1204 css_get(&to->css); 1545 css_get(&to->css);
@@ -1238,7 +1579,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
1238 parent = mem_cgroup_from_cont(pcg); 1579 parent = mem_cgroup_from_cont(pcg);
1239 1580
1240 1581
1241 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 1582 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
1242 if (ret || !parent) 1583 if (ret || !parent)
1243 return ret; 1584 return ret;
1244 1585
@@ -1268,9 +1609,11 @@ uncharge:
1268 /* drop extra refcnt by try_charge() */ 1609 /* drop extra refcnt by try_charge() */
1269 css_put(&parent->css); 1610 css_put(&parent->css);
1270 /* uncharge if move fails */ 1611 /* uncharge if move fails */
1271 res_counter_uncharge(&parent->res, PAGE_SIZE); 1612 if (!mem_cgroup_is_root(parent)) {
1272 if (do_swap_account) 1613 res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
1273 res_counter_uncharge(&parent->memsw, PAGE_SIZE); 1614 if (do_swap_account)
1615 res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
1616 }
1274 return ret; 1617 return ret;
1275} 1618}
1276 1619
@@ -1295,7 +1638,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1295 prefetchw(pc); 1638 prefetchw(pc);
1296 1639
1297 mem = memcg; 1640 mem = memcg;
1298 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 1641 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
1299 if (ret || !mem) 1642 if (ret || !mem)
1300 return ret; 1643 return ret;
1301 1644
@@ -1414,14 +1757,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1414 if (!mem) 1757 if (!mem)
1415 goto charge_cur_mm; 1758 goto charge_cur_mm;
1416 *ptr = mem; 1759 *ptr = mem;
1417 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 1760 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
1418 /* drop extra refcnt from tryget */ 1761 /* drop extra refcnt from tryget */
1419 css_put(&mem->css); 1762 css_put(&mem->css);
1420 return ret; 1763 return ret;
1421charge_cur_mm: 1764charge_cur_mm:
1422 if (unlikely(!mm)) 1765 if (unlikely(!mm))
1423 mm = &init_mm; 1766 mm = &init_mm;
1424 return __mem_cgroup_try_charge(mm, mask, ptr, true); 1767 return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
1425} 1768}
1426 1769
1427static void 1770static void
@@ -1459,7 +1802,10 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1459 * This recorded memcg can be obsolete one. So, avoid 1802 * This recorded memcg can be obsolete one. So, avoid
1460 * calling css_tryget 1803 * calling css_tryget
1461 */ 1804 */
1462 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1805 if (!mem_cgroup_is_root(memcg))
1806 res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
1807 NULL);
1808 mem_cgroup_swap_statistics(memcg, false);
1463 mem_cgroup_put(memcg); 1809 mem_cgroup_put(memcg);
1464 } 1810 }
1465 rcu_read_unlock(); 1811 rcu_read_unlock();
@@ -1484,9 +1830,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1484 return; 1830 return;
1485 if (!mem) 1831 if (!mem)
1486 return; 1832 return;
1487 res_counter_uncharge(&mem->res, PAGE_SIZE); 1833 if (!mem_cgroup_is_root(mem)) {
1488 if (do_swap_account) 1834 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
1489 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1835 if (do_swap_account)
1836 res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
1837 }
1490 css_put(&mem->css); 1838 css_put(&mem->css);
1491} 1839}
1492 1840
@@ -1500,6 +1848,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1500 struct page_cgroup *pc; 1848 struct page_cgroup *pc;
1501 struct mem_cgroup *mem = NULL; 1849 struct mem_cgroup *mem = NULL;
1502 struct mem_cgroup_per_zone *mz; 1850 struct mem_cgroup_per_zone *mz;
1851 bool soft_limit_excess = false;
1503 1852
1504 if (mem_cgroup_disabled()) 1853 if (mem_cgroup_disabled())
1505 return NULL; 1854 return NULL;
@@ -1538,9 +1887,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1538 break; 1887 break;
1539 } 1888 }
1540 1889
1541 res_counter_uncharge(&mem->res, PAGE_SIZE); 1890 if (!mem_cgroup_is_root(mem)) {
1542 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) 1891 res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
1543 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1892 if (do_swap_account &&
1893 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1894 res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
1895 }
1896 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1897 mem_cgroup_swap_statistics(mem, true);
1544 mem_cgroup_charge_statistics(mem, pc, false); 1898 mem_cgroup_charge_statistics(mem, pc, false);
1545 1899
1546 ClearPageCgroupUsed(pc); 1900 ClearPageCgroupUsed(pc);
@@ -1554,6 +1908,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1554 mz = page_cgroup_zoneinfo(pc); 1908 mz = page_cgroup_zoneinfo(pc);
1555 unlock_page_cgroup(pc); 1909 unlock_page_cgroup(pc);
1556 1910
1911 if (soft_limit_excess && mem_cgroup_soft_limit_check(mem))
1912 mem_cgroup_update_tree(mem, page);
1557 /* at swapout, this memcg will be accessed to record to swap */ 1913 /* at swapout, this memcg will be accessed to record to swap */
1558 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 1914 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1559 css_put(&mem->css); 1915 css_put(&mem->css);
@@ -1629,7 +1985,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
1629 * We uncharge this because swap is freed. 1985 * We uncharge this because swap is freed.
1630 * This memcg can be obsolete one. We avoid calling css_tryget 1986 * This memcg can be obsolete one. We avoid calling css_tryget
1631 */ 1987 */
1632 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1988 if (!mem_cgroup_is_root(memcg))
1989 res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
1990 mem_cgroup_swap_statistics(memcg, false);
1633 mem_cgroup_put(memcg); 1991 mem_cgroup_put(memcg);
1634 } 1992 }
1635 rcu_read_unlock(); 1993 rcu_read_unlock();
@@ -1658,7 +2016,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
1658 unlock_page_cgroup(pc); 2016 unlock_page_cgroup(pc);
1659 2017
1660 if (mem) { 2018 if (mem) {
1661 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 2019 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
2020 page);
1662 css_put(&mem->css); 2021 css_put(&mem->css);
1663 } 2022 }
1664 *ptr = mem; 2023 *ptr = mem;
@@ -1798,8 +2157,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1798 if (!ret) 2157 if (!ret)
1799 break; 2158 break;
1800 2159
1801 progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, 2160 progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
1802 false, true); 2161 GFP_KERNEL,
2162 MEM_CGROUP_RECLAIM_SHRINK);
1803 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2163 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
1804 /* Usage is reduced ? */ 2164 /* Usage is reduced ? */
1805 if (curusage >= oldusage) 2165 if (curusage >= oldusage)
@@ -1851,7 +2211,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1851 if (!ret) 2211 if (!ret)
1852 break; 2212 break;
1853 2213
1854 mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true); 2214 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2215 MEM_CGROUP_RECLAIM_NOSWAP |
2216 MEM_CGROUP_RECLAIM_SHRINK);
1855 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2217 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1856 /* Usage is reduced ? */ 2218 /* Usage is reduced ? */
1857 if (curusage >= oldusage) 2219 if (curusage >= oldusage)
@@ -1862,6 +2224,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1862 return ret; 2224 return ret;
1863} 2225}
1864 2226
2227unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2228 gfp_t gfp_mask, int nid,
2229 int zid)
2230{
2231 unsigned long nr_reclaimed = 0;
2232 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
2233 unsigned long reclaimed;
2234 int loop = 0;
2235 struct mem_cgroup_tree_per_zone *mctz;
2236
2237 if (order > 0)
2238 return 0;
2239
2240 mctz = soft_limit_tree_node_zone(nid, zid);
2241 /*
 2242 * This loop can run for a while, especially if mem_cgroups continuously
2243 * keep exceeding their soft limit and putting the system under
2244 * pressure
2245 */
2246 do {
2247 if (next_mz)
2248 mz = next_mz;
2249 else
2250 mz = mem_cgroup_largest_soft_limit_node(mctz);
2251 if (!mz)
2252 break;
2253
2254 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
2255 gfp_mask,
2256 MEM_CGROUP_RECLAIM_SOFT);
2257 nr_reclaimed += reclaimed;
2258 spin_lock(&mctz->lock);
2259
2260 /*
2261 * If we failed to reclaim anything from this memory cgroup
2262 * it is time to move on to the next cgroup
2263 */
2264 next_mz = NULL;
2265 if (!reclaimed) {
2266 do {
2267 /*
2268 * Loop until we find yet another one.
2269 *
2270 * By the time we get the soft_limit lock
 2271 * again, someone might have added the
2272 * group back on the RB tree. Iterate to
2273 * make sure we get a different mem.
2274 * mem_cgroup_largest_soft_limit_node returns
2275 * NULL if no other cgroup is present on
2276 * the tree
2277 */
2278 next_mz =
2279 __mem_cgroup_largest_soft_limit_node(mctz);
2280 if (next_mz == mz) {
2281 css_put(&next_mz->mem->css);
2282 next_mz = NULL;
2283 } else /* next_mz == NULL or other memcg */
2284 break;
2285 } while (1);
2286 }
2287 mz->usage_in_excess =
2288 res_counter_soft_limit_excess(&mz->mem->res);
2289 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
2290 /*
2291 * One school of thought says that we should not add
2292 * back the node to the tree if reclaim returns 0.
 2293 * But our reclaim could return 0 simply because, due
 2294 * to priority, we are exposing a smaller subset of
2295 * memory to reclaim from. Consider this as a longer
2296 * term TODO.
2297 */
2298 if (mz->usage_in_excess)
2299 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
2300 spin_unlock(&mctz->lock);
2301 css_put(&mz->mem->css);
2302 loop++;
2303 /*
2304 * Could not reclaim anything and there are no more
2305 * mem cgroups to try or we seem to be looping without
2306 * reclaiming anything.
2307 */
2308 if (!nr_reclaimed &&
2309 (next_mz == NULL ||
2310 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2311 break;
2312 } while (!nr_reclaimed);
2313 if (next_mz)
2314 css_put(&next_mz->mem->css);
2315 return nr_reclaimed;
2316}
2317
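
mem_cgroup_soft_limit_reclaim() above leans on a per-node/per-zone rb-tree of memcgs ordered by how far they exceed their soft limit, always reclaiming from the worst offender first. A hedged sketch of what the "largest node" lookup amounts to; the rb-node member name (tree_node) and the rightmost ordering are assumptions about how __mem_cgroup_insert_exceeded() keys the tree in parts of the patch not shown in this hunk:

static struct mem_cgroup_per_zone *
pick_worst_offender(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = rb_last(&mctz->rb_root);

	if (!rightmost)
		return NULL;	/* no memcg over its soft limit in this zone */
	/* Largest usage_in_excess is assumed to sort to the right. */
	return rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
}
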
1865/* 2318/*
1866 * This routine traverse page_cgroup in given list and drop them all. 2319 * This routine traverse page_cgroup in given list and drop them all.
1867 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 2320 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
@@ -2046,20 +2499,64 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
2046 return retval; 2499 return retval;
2047} 2500}
2048 2501
2502struct mem_cgroup_idx_data {
2503 s64 val;
2504 enum mem_cgroup_stat_index idx;
2505};
2506
2507static int
2508mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2509{
2510 struct mem_cgroup_idx_data *d = data;
2511 d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
2512 return 0;
2513}
2514
2515static void
2516mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
2517 enum mem_cgroup_stat_index idx, s64 *val)
2518{
2519 struct mem_cgroup_idx_data d;
2520 d.idx = idx;
2521 d.val = 0;
2522 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
2523 *val = d.val;
2524}
2525
2049static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 2526static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
2050{ 2527{
2051 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2528 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2052 u64 val = 0; 2529 u64 idx_val, val;
2053 int type, name; 2530 int type, name;
2054 2531
2055 type = MEMFILE_TYPE(cft->private); 2532 type = MEMFILE_TYPE(cft->private);
2056 name = MEMFILE_ATTR(cft->private); 2533 name = MEMFILE_ATTR(cft->private);
2057 switch (type) { 2534 switch (type) {
2058 case _MEM: 2535 case _MEM:
2059 val = res_counter_read_u64(&mem->res, name); 2536 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2537 mem_cgroup_get_recursive_idx_stat(mem,
2538 MEM_CGROUP_STAT_CACHE, &idx_val);
2539 val = idx_val;
2540 mem_cgroup_get_recursive_idx_stat(mem,
2541 MEM_CGROUP_STAT_RSS, &idx_val);
2542 val += idx_val;
2543 val <<= PAGE_SHIFT;
2544 } else
2545 val = res_counter_read_u64(&mem->res, name);
2060 break; 2546 break;
2061 case _MEMSWAP: 2547 case _MEMSWAP:
2062 val = res_counter_read_u64(&mem->memsw, name); 2548 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2549 mem_cgroup_get_recursive_idx_stat(mem,
2550 MEM_CGROUP_STAT_CACHE, &idx_val);
2551 val = idx_val;
2552 mem_cgroup_get_recursive_idx_stat(mem,
2553 MEM_CGROUP_STAT_RSS, &idx_val);
2554 val += idx_val;
2555 mem_cgroup_get_recursive_idx_stat(mem,
2556 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
 2557 val = (val + idx_val) << PAGE_SHIFT;
2558 } else
2559 val = res_counter_read_u64(&mem->memsw, name);
2063 break; 2560 break;
2064 default: 2561 default:
2065 BUG(); 2562 BUG();
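
Because the root cgroup no longer charges its res_counter, its usage_in_bytes is synthesized from the hierarchical CACHE and RSS statistics (the memsw file folds in the SWAPOUT counter as well) and shifted up to bytes. A rough worked example of the arithmetic, with illustrative page counts and a 4 KiB page size:

static u64 example_root_usage(void)
{
	/* Illustrative only: 1000 page-cache pages and 500 anon (RSS) pages. */
	s64 cache = 1000, rss = 500;

	/* 1500 pages << PAGE_SHIFT (4 KiB pages) == 6144000 bytes reported. */
	return (u64)(cache + rss) << PAGE_SHIFT;
}
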
@@ -2083,6 +2580,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
2083 name = MEMFILE_ATTR(cft->private); 2580 name = MEMFILE_ATTR(cft->private);
2084 switch (name) { 2581 switch (name) {
2085 case RES_LIMIT: 2582 case RES_LIMIT:
2583 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
2584 ret = -EINVAL;
2585 break;
2586 }
2086 /* This function does all necessary parse...reuse it */ 2587 /* This function does all necessary parse...reuse it */
2087 ret = res_counter_memparse_write_strategy(buffer, &val); 2588 ret = res_counter_memparse_write_strategy(buffer, &val);
2088 if (ret) 2589 if (ret)
@@ -2092,6 +2593,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
2092 else 2593 else
2093 ret = mem_cgroup_resize_memsw_limit(memcg, val); 2594 ret = mem_cgroup_resize_memsw_limit(memcg, val);
2094 break; 2595 break;
2596 case RES_SOFT_LIMIT:
2597 ret = res_counter_memparse_write_strategy(buffer, &val);
2598 if (ret)
2599 break;
2600 /*
2601 * For memsw, soft limits are hard to implement in terms
2602 * of semantics, for now, we support soft limits for
2603 * control without swap
2604 */
2605 if (type == _MEM)
2606 ret = res_counter_set_soft_limit(&memcg->res, val);
2607 else
2608 ret = -EINVAL;
2609 break;
2095 default: 2610 default:
2096 ret = -EINVAL; /* should be BUG() ? */ 2611 ret = -EINVAL; /* should be BUG() ? */
2097 break; 2612 break;
@@ -2149,6 +2664,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2149 res_counter_reset_failcnt(&mem->memsw); 2664 res_counter_reset_failcnt(&mem->memsw);
2150 break; 2665 break;
2151 } 2666 }
2667
2152 return 0; 2668 return 0;
2153} 2669}
2154 2670
@@ -2160,6 +2676,7 @@ enum {
2160 MCS_MAPPED_FILE, 2676 MCS_MAPPED_FILE,
2161 MCS_PGPGIN, 2677 MCS_PGPGIN,
2162 MCS_PGPGOUT, 2678 MCS_PGPGOUT,
2679 MCS_SWAP,
2163 MCS_INACTIVE_ANON, 2680 MCS_INACTIVE_ANON,
2164 MCS_ACTIVE_ANON, 2681 MCS_ACTIVE_ANON,
2165 MCS_INACTIVE_FILE, 2682 MCS_INACTIVE_FILE,
@@ -2181,6 +2698,7 @@ struct {
2181 {"mapped_file", "total_mapped_file"}, 2698 {"mapped_file", "total_mapped_file"},
2182 {"pgpgin", "total_pgpgin"}, 2699 {"pgpgin", "total_pgpgin"},
2183 {"pgpgout", "total_pgpgout"}, 2700 {"pgpgout", "total_pgpgout"},
2701 {"swap", "total_swap"},
2184 {"inactive_anon", "total_inactive_anon"}, 2702 {"inactive_anon", "total_inactive_anon"},
2185 {"active_anon", "total_active_anon"}, 2703 {"active_anon", "total_active_anon"},
2186 {"inactive_file", "total_inactive_file"}, 2704 {"inactive_file", "total_inactive_file"},
@@ -2205,6 +2723,10 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2205 s->stat[MCS_PGPGIN] += val; 2723 s->stat[MCS_PGPGIN] += val;
2206 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); 2724 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2207 s->stat[MCS_PGPGOUT] += val; 2725 s->stat[MCS_PGPGOUT] += val;
2726 if (do_swap_account) {
2727 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
2728 s->stat[MCS_SWAP] += val * PAGE_SIZE;
2729 }
2208 2730
2209 /* per zone stat */ 2731 /* per zone stat */
2210 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 2732 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -2236,8 +2758,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
2236 memset(&mystat, 0, sizeof(mystat)); 2758 memset(&mystat, 0, sizeof(mystat));
2237 mem_cgroup_get_local_stat(mem_cont, &mystat); 2759 mem_cgroup_get_local_stat(mem_cont, &mystat);
2238 2760
2239 for (i = 0; i < NR_MCS_STAT; i++) 2761 for (i = 0; i < NR_MCS_STAT; i++) {
2762 if (i == MCS_SWAP && !do_swap_account)
2763 continue;
2240 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 2764 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
2765 }
2241 2766
2242 /* Hierarchical information */ 2767 /* Hierarchical information */
2243 { 2768 {
@@ -2250,9 +2775,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
2250 2775
2251 memset(&mystat, 0, sizeof(mystat)); 2776 memset(&mystat, 0, sizeof(mystat));
2252 mem_cgroup_get_total_stat(mem_cont, &mystat); 2777 mem_cgroup_get_total_stat(mem_cont, &mystat);
2253 for (i = 0; i < NR_MCS_STAT; i++) 2778 for (i = 0; i < NR_MCS_STAT; i++) {
2779 if (i == MCS_SWAP && !do_swap_account)
2780 continue;
2254 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 2781 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
2255 2782 }
2256 2783
2257#ifdef CONFIG_DEBUG_VM 2784#ifdef CONFIG_DEBUG_VM
2258 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 2785 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
@@ -2345,6 +2872,12 @@ static struct cftype mem_cgroup_files[] = {
2345 .read_u64 = mem_cgroup_read, 2872 .read_u64 = mem_cgroup_read,
2346 }, 2873 },
2347 { 2874 {
2875 .name = "soft_limit_in_bytes",
2876 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
2877 .write_string = mem_cgroup_write,
2878 .read_u64 = mem_cgroup_read,
2879 },
2880 {
2348 .name = "failcnt", 2881 .name = "failcnt",
2349 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 2882 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
2350 .trigger = mem_cgroup_reset, 2883 .trigger = mem_cgroup_reset,
@@ -2438,6 +2971,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
2438 mz = &pn->zoneinfo[zone]; 2971 mz = &pn->zoneinfo[zone];
2439 for_each_lru(l) 2972 for_each_lru(l)
2440 INIT_LIST_HEAD(&mz->lists[l]); 2973 INIT_LIST_HEAD(&mz->lists[l]);
2974 mz->usage_in_excess = 0;
2975 mz->on_tree = false;
2976 mz->mem = mem;
2441 } 2977 }
2442 return 0; 2978 return 0;
2443} 2979}
@@ -2483,6 +3019,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
2483{ 3019{
2484 int node; 3020 int node;
2485 3021
3022 mem_cgroup_remove_from_trees(mem);
2486 free_css_id(&mem_cgroup_subsys, &mem->css); 3023 free_css_id(&mem_cgroup_subsys, &mem->css);
2487 3024
2488 for_each_node_state(node, N_POSSIBLE) 3025 for_each_node_state(node, N_POSSIBLE)
@@ -2531,6 +3068,31 @@ static void __init enable_swap_cgroup(void)
2531} 3068}
2532#endif 3069#endif
2533 3070
3071static int mem_cgroup_soft_limit_tree_init(void)
3072{
3073 struct mem_cgroup_tree_per_node *rtpn;
3074 struct mem_cgroup_tree_per_zone *rtpz;
3075 int tmp, node, zone;
3076
3077 for_each_node_state(node, N_POSSIBLE) {
3078 tmp = node;
3079 if (!node_state(node, N_NORMAL_MEMORY))
3080 tmp = -1;
3081 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
3082 if (!rtpn)
3083 return 1;
3084
3085 soft_limit_tree.rb_tree_per_node[node] = rtpn;
3086
3087 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3088 rtpz = &rtpn->rb_tree_per_zone[zone];
3089 rtpz->rb_root = RB_ROOT;
3090 spin_lock_init(&rtpz->lock);
3091 }
3092 }
3093 return 0;
3094}
3095
2534static struct cgroup_subsys_state * __ref 3096static struct cgroup_subsys_state * __ref
2535mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 3097mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2536{ 3098{
@@ -2545,10 +3107,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2545 for_each_node_state(node, N_POSSIBLE) 3107 for_each_node_state(node, N_POSSIBLE)
2546 if (alloc_mem_cgroup_per_zone_info(mem, node)) 3108 if (alloc_mem_cgroup_per_zone_info(mem, node))
2547 goto free_out; 3109 goto free_out;
3110
2548 /* root ? */ 3111 /* root ? */
2549 if (cont->parent == NULL) { 3112 if (cont->parent == NULL) {
2550 enable_swap_cgroup(); 3113 enable_swap_cgroup();
2551 parent = NULL; 3114 parent = NULL;
3115 root_mem_cgroup = mem;
3116 if (mem_cgroup_soft_limit_tree_init())
3117 goto free_out;
3118
2552 } else { 3119 } else {
2553 parent = mem_cgroup_from_cont(cont->parent); 3120 parent = mem_cgroup_from_cont(cont->parent);
2554 mem->use_hierarchy = parent->use_hierarchy; 3121 mem->use_hierarchy = parent->use_hierarchy;
@@ -2577,6 +3144,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2577 return &mem->css; 3144 return &mem->css;
2578free_out: 3145free_out:
2579 __mem_cgroup_free(mem); 3146 __mem_cgroup_free(mem);
3147 root_mem_cgroup = NULL;
2580 return ERR_PTR(error); 3148 return ERR_PTR(error);
2581} 3149}
2582 3150
@@ -2612,7 +3180,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
2612static void mem_cgroup_move_task(struct cgroup_subsys *ss, 3180static void mem_cgroup_move_task(struct cgroup_subsys *ss,
2613 struct cgroup *cont, 3181 struct cgroup *cont,
2614 struct cgroup *old_cont, 3182 struct cgroup *old_cont,
2615 struct task_struct *p) 3183 struct task_struct *p,
3184 bool threadgroup)
2616{ 3185{
2617 mutex_lock(&memcg_tasklist); 3186 mutex_lock(&memcg_tasklist);
2618 /* 3187 /*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
new file mode 100644
index 000000000000..729d4b15b645
--- /dev/null
+++ b/mm/memory-failure.c
@@ -0,0 +1,832 @@
1/*
2 * Copyright (C) 2008, 2009 Intel Corporation
3 * Authors: Andi Kleen, Fengguang Wu
4 *
5 * This software may be redistributed and/or modified under the terms of
6 * the GNU General Public License ("GPL") version 2 only as published by the
7 * Free Software Foundation.
8 *
9 * High level machine check handler. Handles pages reported by the
 10 * hardware as being corrupted, usually due to a 2-bit ECC memory or cache
11 * failure.
12 *
13 * Handles page cache pages in various states. The tricky part
14 * here is that we can access any page asynchronous to other VM
15 * users, because memory failures could happen anytime and anywhere,
16 * possibly violating some of their assumptions. This is why this code
17 * has to be extremely careful. Generally it tries to use normal locking
18 * rules, as in get the standard locks, even if that means the
19 * error handling takes potentially a long time.
20 *
21 * The operation to map back from RMAP chains to processes has to walk
 22 * the complete process list and has non-linear complexity in the number
 23 * of mappings. In short it can be quite slow. But since memory corruptions
24 * are rare we hope to get away with this.
25 */
26
27/*
28 * Notebook:
29 * - hugetlb needs more code
30 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
31 * - pass bad pages to kdump next kernel
32 */
33#define DEBUG 1 /* remove me in 2.6.34 */
34#include <linux/kernel.h>
35#include <linux/mm.h>
36#include <linux/page-flags.h>
37#include <linux/sched.h>
38#include <linux/rmap.h>
39#include <linux/pagemap.h>
40#include <linux/swap.h>
41#include <linux/backing-dev.h>
42#include "internal.h"
43
44int sysctl_memory_failure_early_kill __read_mostly = 0;
45
46int sysctl_memory_failure_recovery __read_mostly = 1;
47
48atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
49
50/*
51 * Send all the processes who have the page mapped an ``action optional''
52 * signal.
53 */
54static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
55 unsigned long pfn)
56{
57 struct siginfo si;
58 int ret;
59
60 printk(KERN_ERR
61 "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
62 pfn, t->comm, t->pid);
63 si.si_signo = SIGBUS;
64 si.si_errno = 0;
65 si.si_code = BUS_MCEERR_AO;
66 si.si_addr = (void *)addr;
67#ifdef __ARCH_SI_TRAPNO
68 si.si_trapno = trapno;
69#endif
70 si.si_addr_lsb = PAGE_SHIFT;
71 /*
72 * Don't use force here, it's convenient if the signal
73 * can be temporarily blocked.
74 * This could cause a loop when the user sets SIGBUS
 75 * to SIG_IGN, but hopefully no one will do that.
76 */
77 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
78 if (ret < 0)
79 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
80 t->comm, t->pid, ret);
81 return ret;
82}
83
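
kill_proc_ao() delivers a catchable SIGBUS with si_code BUS_MCEERR_AO, so a process that asked for early kills can try to recover instead of dying. A minimal userspace sketch of such a handler; it is not part of this patch, and the fallback #define is an assumption for older libc headers:

#include <signal.h>
#include <string.h>
#include <unistd.h>

#ifndef BUS_MCEERR_AO
#define BUS_MCEERR_AO 5		/* assumption: matches recent kernel headers */
#endif

static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
	(void)sig; (void)ctx;
	if (si->si_code == BUS_MCEERR_AO) {
		/*
		 * si->si_addr points into the poisoned page; a real program
		 * would discard or rebuild whatever data lives there.  Only
		 * async-signal-safe calls belong here, hence write().
		 */
		static const char msg[] = "hw memory corruption reported, recovering\n";
		(void)write(STDERR_FILENO, msg, sizeof(msg) - 1);
		return;
	}
	_exit(1);	/* a synchronous (AR) or unknown bus error is fatal */
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_flags = SA_SIGINFO;
	sa.sa_sigaction = sigbus_handler;
	sigaction(SIGBUS, &sa, NULL);

	pause();	/* real work would go here */
	return 0;
}
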
84/*
85 * Kill all processes that have a poisoned page mapped and then isolate
86 * the page.
87 *
88 * General strategy:
89 * Find all processes having the page mapped and kill them.
90 * But we keep a page reference around so that the page is not
91 * actually freed yet.
92 * Then stash the page away
93 *
94 * There's no convenient way to get back to mapped processes
95 * from the VMAs. So do a brute-force search over all
96 * running processes.
97 *
98 * Remember that machine checks are not common (or rather
99 * if they are common you have other problems), so this shouldn't
100 * be a performance issue.
101 *
102 * Also there are some races possible while we get from the
103 * error detection to actually handle it.
104 */
105
106struct to_kill {
107 struct list_head nd;
108 struct task_struct *tsk;
109 unsigned long addr;
110 unsigned addr_valid:1;
111};
112
113/*
114 * Failure handling: if we can't find or can't kill a process there's
115 * not much we can do. We just print a message and ignore otherwise.
116 */
117
118/*
119 * Schedule a process for later kill.
120 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
121 * TBD would GFP_NOIO be enough?
122 */
123static void add_to_kill(struct task_struct *tsk, struct page *p,
124 struct vm_area_struct *vma,
125 struct list_head *to_kill,
126 struct to_kill **tkc)
127{
128 struct to_kill *tk;
129
130 if (*tkc) {
131 tk = *tkc;
132 *tkc = NULL;
133 } else {
134 tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
135 if (!tk) {
136 printk(KERN_ERR
137 "MCE: Out of memory while machine check handling\n");
138 return;
139 }
140 }
141 tk->addr = page_address_in_vma(p, vma);
142 tk->addr_valid = 1;
143
144 /*
145 * In theory we don't have to kill when the page was
 146 * munmapped. But it could also be a mremap. Since that's
 147 * likely very rare, kill anyway just out of paranoia, but use
148 * a SIGKILL because the error is not contained anymore.
149 */
150 if (tk->addr == -EFAULT) {
151 pr_debug("MCE: Unable to find user space address %lx in %s\n",
152 page_to_pfn(p), tsk->comm);
153 tk->addr_valid = 0;
154 }
155 get_task_struct(tsk);
156 tk->tsk = tsk;
157 list_add_tail(&tk->nd, to_kill);
158}
159
160/*
161 * Kill the processes that have been collected earlier.
162 *
163 * Only do anything when DOIT is set, otherwise just free the list
164 * (this is used for clean pages which do not need killing)
165 * Also when FAIL is set do a force kill because something went
166 * wrong earlier.
167 */
168static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
169 int fail, unsigned long pfn)
170{
171 struct to_kill *tk, *next;
172
173 list_for_each_entry_safe (tk, next, to_kill, nd) {
174 if (doit) {
175 /*
 176 * In case something went wrong with munmapping,
 177 * make sure the process doesn't catch the
 178 * signal and then access the memory: just kill
 179 * it instead of relying on its signal handlers.
180 */
181 if (fail || tk->addr_valid == 0) {
182 printk(KERN_ERR
183 "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
184 pfn, tk->tsk->comm, tk->tsk->pid);
185 force_sig(SIGKILL, tk->tsk);
186 }
187
188 /*
189 * In theory the process could have mapped
190 * something else on the address in-between. We could
191 * check for that, but we need to tell the
 192 * process anyway.
193 */
194 else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
195 pfn) < 0)
196 printk(KERN_ERR
197 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
198 pfn, tk->tsk->comm, tk->tsk->pid);
199 }
200 put_task_struct(tk->tsk);
201 kfree(tk);
202 }
203}
204
205static int task_early_kill(struct task_struct *tsk)
206{
207 if (!tsk->mm)
208 return 0;
209 if (tsk->flags & PF_MCE_PROCESS)
210 return !!(tsk->flags & PF_MCE_EARLY);
211 return sysctl_memory_failure_early_kill;
212}
213
214/*
215 * Collect processes when the error hit an anonymous page.
216 */
217static void collect_procs_anon(struct page *page, struct list_head *to_kill,
218 struct to_kill **tkc)
219{
220 struct vm_area_struct *vma;
221 struct task_struct *tsk;
222 struct anon_vma *av;
223
224 read_lock(&tasklist_lock);
225 av = page_lock_anon_vma(page);
226 if (av == NULL) /* Not actually mapped anymore */
227 goto out;
228 for_each_process (tsk) {
229 if (!task_early_kill(tsk))
230 continue;
231 list_for_each_entry (vma, &av->head, anon_vma_node) {
232 if (!page_mapped_in_vma(page, vma))
233 continue;
234 if (vma->vm_mm == tsk->mm)
235 add_to_kill(tsk, page, vma, to_kill, tkc);
236 }
237 }
238 page_unlock_anon_vma(av);
239out:
240 read_unlock(&tasklist_lock);
241}
242
243/*
244 * Collect processes when the error hit a file mapped page.
245 */
246static void collect_procs_file(struct page *page, struct list_head *to_kill,
247 struct to_kill **tkc)
248{
249 struct vm_area_struct *vma;
250 struct task_struct *tsk;
251 struct prio_tree_iter iter;
252 struct address_space *mapping = page->mapping;
253
254 /*
255 * A note on the locking order between the two locks.
256 * We don't rely on this particular order.
257 * If you have some other code that needs a different order
258 * feel free to switch them around. Or add a reverse link
259 * from mm_struct to task_struct, then this could be all
260 * done without taking tasklist_lock and looping over all tasks.
261 */
262
263 read_lock(&tasklist_lock);
264 spin_lock(&mapping->i_mmap_lock);
265 for_each_process(tsk) {
266 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
267
268 if (!task_early_kill(tsk))
269 continue;
270
271 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
272 pgoff) {
273 /*
274 * Send early kill signal to tasks where a vma covers
275 * the page but the corrupted page is not necessarily
 276 * mapped in its pte.
277 * Assume applications who requested early kill want
278 * to be informed of all such data corruptions.
279 */
280 if (vma->vm_mm == tsk->mm)
281 add_to_kill(tsk, page, vma, to_kill, tkc);
282 }
283 }
284 spin_unlock(&mapping->i_mmap_lock);
285 read_unlock(&tasklist_lock);
286}
287
288/*
289 * Collect the processes who have the corrupted page mapped to kill.
290 * This is done in two steps for locking reasons.
291 * First preallocate one tokill structure outside the spin locks,
 292 * so that we can kill at least one process reasonably reliably.
293 */
294static void collect_procs(struct page *page, struct list_head *tokill)
295{
296 struct to_kill *tk;
297
298 if (!page->mapping)
299 return;
300
301 tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
302 if (!tk)
303 return;
304 if (PageAnon(page))
305 collect_procs_anon(page, tokill, &tk);
306 else
307 collect_procs_file(page, tokill, &tk);
308 kfree(tk);
309}
310
311/*
312 * Error handlers for various types of pages.
313 */
314
315enum outcome {
316 FAILED, /* Error handling failed */
317 DELAYED, /* Will be handled later */
318 IGNORED, /* Error safely ignored */
319 RECOVERED, /* Successfully recovered */
320};
321
322static const char *action_name[] = {
323 [FAILED] = "Failed",
324 [DELAYED] = "Delayed",
325 [IGNORED] = "Ignored",
326 [RECOVERED] = "Recovered",
327};
328
329/*
330 * Error hit kernel page.
 331 * Do nothing and hope to get lucky by not touching it. For a few cases we
332 * could be more sophisticated.
333 */
334static int me_kernel(struct page *p, unsigned long pfn)
335{
336 return DELAYED;
337}
338
339/*
340 * Already poisoned page.
341 */
342static int me_ignore(struct page *p, unsigned long pfn)
343{
344 return IGNORED;
345}
346
347/*
348 * Page in unknown state. Do nothing.
349 */
350static int me_unknown(struct page *p, unsigned long pfn)
351{
352 printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
353 return FAILED;
354}
355
356/*
357 * Free memory
358 */
359static int me_free(struct page *p, unsigned long pfn)
360{
361 return DELAYED;
362}
363
364/*
365 * Clean (or cleaned) page cache page.
366 */
367static int me_pagecache_clean(struct page *p, unsigned long pfn)
368{
369 int err;
370 int ret = FAILED;
371 struct address_space *mapping;
372
373 if (!isolate_lru_page(p))
374 page_cache_release(p);
375
376 /*
 377 * For anonymous pages we're done; the only reference left
378 * should be the one m_f() holds.
379 */
380 if (PageAnon(p))
381 return RECOVERED;
382
383 /*
384 * Now truncate the page in the page cache. This is really
385 * more like a "temporary hole punch"
386 * Don't do this for block devices when someone else
387 * has a reference, because it could be file system metadata
388 * and that's not safe to truncate.
389 */
390 mapping = page_mapping(p);
391 if (!mapping) {
392 /*
 393 * Page has been torn down in the meantime
394 */
395 return FAILED;
396 }
397
398 /*
399 * Truncation is a bit tricky. Enable it per file system for now.
400 *
401 * Open: to take i_mutex or not for this? Right now we don't.
402 */
403 if (mapping->a_ops->error_remove_page) {
404 err = mapping->a_ops->error_remove_page(mapping, p);
405 if (err != 0) {
406 printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
407 pfn, err);
408 } else if (page_has_private(p) &&
409 !try_to_release_page(p, GFP_NOIO)) {
410 pr_debug("MCE %#lx: failed to release buffers\n", pfn);
411 } else {
412 ret = RECOVERED;
413 }
414 } else {
415 /*
 416 * If the file system doesn't support it, just invalidate.
 417 * This fails on dirty pages or anything with private data.
418 */
419 if (invalidate_inode_page(p))
420 ret = RECOVERED;
421 else
422 printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
423 pfn);
424 }
425 return ret;
426}
427
428/*
 429 * Dirty pagecache page
430 * Issues: when the error hit a hole page the error is not properly
431 * propagated.
432 */
433static int me_pagecache_dirty(struct page *p, unsigned long pfn)
434{
435 struct address_space *mapping = page_mapping(p);
436
437 SetPageError(p);
438 /* TBD: print more information about the file. */
439 if (mapping) {
440 /*
441 * IO error will be reported by write(), fsync(), etc.
442 * who check the mapping.
443 * This way the application knows that something went
444 * wrong with its dirty file data.
445 *
446 * There's one open issue:
447 *
448 * The EIO will be only reported on the next IO
449 * operation and then cleared through the IO map.
450 * Normally Linux has two mechanisms to pass IO error
451 * first through the AS_EIO flag in the address space
452 * and then through the PageError flag in the page.
453 * Since we drop pages on memory failure handling the
 454 * only mechanism open to use is through AS_EIO.
455 *
456 * This has the disadvantage that it gets cleared on
457 * the first operation that returns an error, while
458 * the PageError bit is more sticky and only cleared
459 * when the page is reread or dropped. If an
460 * application assumes it will always get error on
461 * fsync, but does other operations on the fd before
 462 * and the page is dropped in between, then the error
463 * will not be properly reported.
464 *
465 * This can already happen even without hwpoisoned
466 * pages: first on metadata IO errors (which only
467 * report through AS_EIO) or when the page is dropped
468 * at the wrong time.
469 *
470 * So right now we assume that the application DTRT on
471 * the first EIO, but we're not worse than other parts
472 * of the kernel.
473 */
474 mapping_set_error(mapping, EIO);
475 }
476
477 return me_pagecache_clean(p, pfn);
478}
479
480/*
481 * Clean and dirty swap cache.
482 *
483 * Dirty swap cache page is tricky to handle. The page could live both in page
 484 * cache and swap cache (i.e. the page is freshly swapped in). So it could be
485 * referenced concurrently by 2 types of PTEs:
486 * normal PTEs and swap PTEs. We try to handle them consistently by calling
487 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
488 * and then
489 * - clear dirty bit to prevent IO
490 * - remove from LRU
491 * - but keep in the swap cache, so that when we return to it on
492 * a later page fault, we know the application is accessing
493 * corrupted data and shall be killed (we installed simple
494 * interception code in do_swap_page to catch it).
495 *
496 * Clean swap cache pages can be directly isolated. A later page fault will
497 * bring in the known good data from disk.
498 */
499static int me_swapcache_dirty(struct page *p, unsigned long pfn)
500{
501 int ret = FAILED;
502
503 ClearPageDirty(p);
504 /* Trigger EIO in shmem: */
505 ClearPageUptodate(p);
506
507 if (!isolate_lru_page(p)) {
508 page_cache_release(p);
509 ret = DELAYED;
510 }
511
512 return ret;
513}
514
515static int me_swapcache_clean(struct page *p, unsigned long pfn)
516{
517 int ret = FAILED;
518
519 if (!isolate_lru_page(p)) {
520 page_cache_release(p);
521 ret = RECOVERED;
522 }
523 delete_from_swap_cache(p);
524 return ret;
525}
526
527/*
528 * Huge pages. Needs work.
529 * Issues:
530 * No rmap support so we cannot find the original mapper. In theory could walk
531 * all MMs and look for the mappings, but that would be non atomic and racy.
532 * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
533 * like just walking the current process and hoping it has it mapped (that
 534 * should usually be true for the common "shared database cache" case)
535 * Should handle free huge pages and dequeue them too, but this needs to
536 * handle huge page accounting correctly.
537 */
538static int me_huge_page(struct page *p, unsigned long pfn)
539{
540 return FAILED;
541}
542
543/*
544 * Various page states we can handle.
545 *
546 * A page state is defined by its current page->flags bits.
547 * The table matches them in order and calls the right handler.
548 *
 549 * This is quite tricky because we can access the page at any time
 550 * in its life cycle, so all accesses have to be extremely careful.
551 *
552 * This is not complete. More states could be added.
553 * For any missing state don't attempt recovery.
554 */
555
556#define dirty (1UL << PG_dirty)
557#define sc (1UL << PG_swapcache)
558#define unevict (1UL << PG_unevictable)
559#define mlock (1UL << PG_mlocked)
560#define writeback (1UL << PG_writeback)
561#define lru (1UL << PG_lru)
562#define swapbacked (1UL << PG_swapbacked)
563#define head (1UL << PG_head)
564#define tail (1UL << PG_tail)
565#define compound (1UL << PG_compound)
566#define slab (1UL << PG_slab)
567#define buddy (1UL << PG_buddy)
568#define reserved (1UL << PG_reserved)
569
570static struct page_state {
571 unsigned long mask;
572 unsigned long res;
573 char *msg;
574 int (*action)(struct page *p, unsigned long pfn);
575} error_states[] = {
576 { reserved, reserved, "reserved kernel", me_ignore },
577 { buddy, buddy, "free kernel", me_free },
578
579 /*
580 * Could in theory check if slab page is free or if we can drop
581 * currently unused objects without touching them. But just
582 * treat it as standard kernel for now.
583 */
584 { slab, slab, "kernel slab", me_kernel },
585
586#ifdef CONFIG_PAGEFLAGS_EXTENDED
587 { head, head, "huge", me_huge_page },
588 { tail, tail, "huge", me_huge_page },
589#else
590 { compound, compound, "huge", me_huge_page },
591#endif
592
593 { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },
594 { sc|dirty, sc, "swapcache", me_swapcache_clean },
595
596 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
597 { unevict, unevict, "unevictable LRU", me_pagecache_clean},
598
599#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
600 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
601 { mlock, mlock, "mlocked LRU", me_pagecache_clean },
602#endif
603
604 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
605 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
606 { swapbacked, swapbacked, "anonymous", me_pagecache_clean },
607
608 /*
609 * Catchall entry: must be at end.
610 */
611 { 0, 0, "unknown page state", me_unknown },
612};
613
614#undef lru
615
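
Entries are matched in order with (page->flags & mask) == res and the first hit wins, which is why the specific swapcache/unevictable/mlocked rows sit above the generic LRU ones and the catchall terminates the table. The helper below is not part of the patch, just a restatement of the lookup used later in __memory_failure(), with a worked case in the comment:

/*
 * Example: a dirty page-cache page on the LRU has PG_lru|PG_dirty set, misses
 * the swapcache and unevictable rows, and stops at
 * { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }; with PG_dirty clear the
 * same page matches { lru|dirty, lru, "clean LRU", ... } instead, because only
 * the bits selected by ->mask have to agree with ->res.
 */
static struct page_state *lookup_page_state(unsigned long flags)
{
	struct page_state *ps;

	for (ps = error_states; ; ps++)		/* catchall row ends the scan */
		if ((flags & ps->mask) == ps->res)
			return ps;
}
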
616static void action_result(unsigned long pfn, char *msg, int result)
617{
618 struct page *page = NULL;
619 if (pfn_valid(pfn))
620 page = pfn_to_page(pfn);
621
622 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
623 pfn,
624 page && PageDirty(page) ? "dirty " : "",
625 msg, action_name[result]);
626}
627
628static int page_action(struct page_state *ps, struct page *p,
629 unsigned long pfn, int ref)
630{
631 int result;
632
633 result = ps->action(p, pfn);
634 action_result(pfn, ps->msg, result);
635 if (page_count(p) != 1 + ref)
636 printk(KERN_ERR
637 "MCE %#lx: %s page still referenced by %d users\n",
638 pfn, ps->msg, page_count(p) - 1);
639
640 /* Could do more checks here if page looks ok */
641 /*
642 * Could adjust zone counters here to correct for the missing page.
643 */
644
645 return result == RECOVERED ? 0 : -EBUSY;
646}
647
648#define N_UNMAP_TRIES 5
649
650/*
651 * Do all that is necessary to remove user space mappings. Unmap
652 * the pages and send SIGBUS to the processes if the data was dirty.
653 */
654static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
655 int trapno)
656{
657 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
658 struct address_space *mapping;
659 LIST_HEAD(tokill);
660 int ret;
661 int i;
662 int kill = 1;
663
664 if (PageReserved(p) || PageCompound(p) || PageSlab(p))
665 return;
666
667 if (!PageLRU(p))
668 lru_add_drain_all();
669
670 /*
671 * This check implies we don't kill processes if their pages
672 * are in the swap cache early. Those are always late kills.
673 */
674 if (!page_mapped(p))
675 return;
676
677 if (PageSwapCache(p)) {
678 printk(KERN_ERR
679 "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
680 ttu |= TTU_IGNORE_HWPOISON;
681 }
682
683 /*
684 * Propagate the dirty bit from PTEs to struct page first, because we
685 * need this to decide if we should kill or just drop the page.
686 */
687 mapping = page_mapping(p);
688 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
689 if (page_mkclean(p)) {
690 SetPageDirty(p);
691 } else {
692 kill = 0;
693 ttu |= TTU_IGNORE_HWPOISON;
694 printk(KERN_INFO
695 "MCE %#lx: corrupted page was clean: dropped without side effects\n",
696 pfn);
697 }
698 }
699
700 /*
701 * First collect all the processes that have the page
702 * mapped in dirty form. This has to be done before try_to_unmap,
703 * because ttu takes the rmap data structures down.
704 *
705 * Error handling: We ignore errors here because
706 * there's nothing that can be done.
707 */
708 if (kill)
709 collect_procs(p, &tokill);
710
711 /*
712 * try_to_unmap can fail temporarily due to races.
713 * Try a few times (RED-PEN better strategy?)
714 */
715 for (i = 0; i < N_UNMAP_TRIES; i++) {
716 ret = try_to_unmap(p, ttu);
717 if (ret == SWAP_SUCCESS)
718 break;
719 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
720 }
721
722 if (ret != SWAP_SUCCESS)
723 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
724 pfn, page_mapcount(p));
725
726 /*
727 * Now that the dirty bit has been propagated to the
728 * struct page and all unmaps done we can decide if
729 * killing is needed or not. Only kill when the page
730 * was dirty, otherwise the tokill list is merely
731 * freed. When there was a problem unmapping earlier
732 * use a more force-full uncatchable kill to prevent
733 * any accesses to the poisoned memory.
734 */
735 kill_procs_ao(&tokill, !!PageDirty(p), trapno,
736 ret != SWAP_SUCCESS, pfn);
737}
738
739int __memory_failure(unsigned long pfn, int trapno, int ref)
740{
741 struct page_state *ps;
742 struct page *p;
743 int res;
744
745 if (!sysctl_memory_failure_recovery)
746 panic("Memory failure from trap %d on page %lx", trapno, pfn);
747
748 if (!pfn_valid(pfn)) {
749 action_result(pfn, "memory outside kernel control", IGNORED);
750 return -EIO;
751 }
752
753 p = pfn_to_page(pfn);
754 if (TestSetPageHWPoison(p)) {
755 action_result(pfn, "already hardware poisoned", IGNORED);
756 return 0;
757 }
758
759 atomic_long_add(1, &mce_bad_pages);
760
761 /*
762 * We need/can do nothing about count=0 pages.
763 * 1) it's a free page, and therefore in safe hand:
764 * prep_new_page() will be the gate keeper.
765 * 2) it's part of a non-compound high order page.
766 * Implies some kernel user: cannot stop them from
767 * R/W the page; let's pray that the page has been
768 * used and will be freed some time later.
769 * In fact it's dangerous to directly bump up page count from 0,
770 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
771 */
772 if (!get_page_unless_zero(compound_head(p))) {
773 action_result(pfn, "free or high order kernel", IGNORED);
774 return PageBuddy(compound_head(p)) ? 0 : -EBUSY;
775 }
776
777 /*
778 * Lock the page and wait for writeback to finish.
779 * It's very difficult to mess with pages currently under IO
780 * and in many cases impossible, so we just avoid it here.
781 */
782 lock_page_nosync(p);
783 wait_on_page_writeback(p);
784
785 /*
786 * Now take care of user space mappings.
787 */
788 hwpoison_user_mappings(p, pfn, trapno);
789
790 /*
791 * Torn down by someone else?
792 */
793 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
794 action_result(pfn, "already truncated LRU", IGNORED);
795 res = 0;
796 goto out;
797 }
798
799 res = -EBUSY;
800 for (ps = error_states;; ps++) {
801 if ((p->flags & ps->mask) == ps->res) {
802 res = page_action(ps, p, pfn, ref);
803 break;
804 }
805 }
806out:
807 unlock_page(p);
808 return res;
809}
810EXPORT_SYMBOL_GPL(__memory_failure);
811
812/**
813 * memory_failure - Handle memory failure of a page.
814 * @pfn: Page Number of the corrupted page
815 * @trapno: Trap number reported in the signal to user space.
816 *
817 * This function is called by the low level machine check code
818 * of an architecture when it detects hardware memory corruption
819 * of a page. It tries its best to recover, which includes
820 * dropping pages, killing processes etc.
821 *
822 * The function is primarily of use for corruptions that
823 * happen outside the current execution context (e.g. when
824 * detected by a background scrubber)
825 *
826 * Must run in process context (e.g. a work queue) with interrupts
 827 * enabled and no spinlocks held.
828 */
829void memory_failure(unsigned long pfn, int trapno)
830{
831 __memory_failure(pfn, trapno, 0);
832}
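
Since memory_failure() must run in process context with interrupts enabled, an architecture's machine-check path has to defer the call rather than invoke it from the exception handler. A hedged sketch of one possible deferral via a work item: the structure and function names are hypothetical, only memory_failure() itself comes from this file (its prototype is assumed to be exported via linux/mm.h elsewhere in this series), and a real architecture may well prefer its own notification machinery over a GFP_ATOMIC allocation in the atomic path:

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* Hypothetical deferral helper, not part of this patch. */
struct mce_page_work {
	struct work_struct work;
	unsigned long pfn;
	int trapno;
};

static void mce_page_worker(struct work_struct *work)
{
	struct mce_page_work *w = container_of(work, struct mce_page_work, work);

	memory_failure(w->pfn, w->trapno);	/* now safely in process context */
	kfree(w);
}

/* Called from the (atomic) machine-check notification path. */
static void mce_defer_page_poison(unsigned long pfn, int trapno)
{
	struct mce_page_work *w = kzalloc(sizeof(*w), GFP_ATOMIC);

	if (!w)
		return;		/* nothing more we can do; the page stays poisoned */
	w->pfn = pfn;
	w->trapno = trapno;
	INIT_WORK(&w->work, mce_page_worker);
	schedule_work(&w->work);
}
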
diff --git a/mm/memory.c b/mm/memory.c
index aede2ce3aba4..7e91b5f9f690 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -45,6 +45,7 @@
45#include <linux/swap.h> 45#include <linux/swap.h>
46#include <linux/highmem.h> 46#include <linux/highmem.h>
47#include <linux/pagemap.h> 47#include <linux/pagemap.h>
48#include <linux/ksm.h>
48#include <linux/rmap.h> 49#include <linux/rmap.h>
49#include <linux/module.h> 50#include <linux/module.h>
50#include <linux/delayacct.h> 51#include <linux/delayacct.h>
@@ -56,6 +57,7 @@
56#include <linux/swapops.h> 57#include <linux/swapops.h>
57#include <linux/elf.h> 58#include <linux/elf.h>
58 59
60#include <asm/io.h>
59#include <asm/pgalloc.h> 61#include <asm/pgalloc.h>
60#include <asm/uaccess.h> 62#include <asm/uaccess.h>
61#include <asm/tlb.h> 63#include <asm/tlb.h>
@@ -106,6 +108,18 @@ static int __init disable_randmaps(char *s)
106} 108}
107__setup("norandmaps", disable_randmaps); 109__setup("norandmaps", disable_randmaps);
108 110
111unsigned long zero_pfn __read_mostly;
112unsigned long highest_memmap_pfn __read_mostly;
113
114/*
115 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
116 */
117static int __init init_zero_pfn(void)
118{
119 zero_pfn = page_to_pfn(ZERO_PAGE(0));
120 return 0;
121}
122core_initcall(init_zero_pfn);
109 123
110/* 124/*
111 * If a p?d_bad entry is found while walking page tables, report 125 * If a p?d_bad entry is found while walking page tables, report
@@ -283,7 +297,8 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
283 unsigned long addr = vma->vm_start; 297 unsigned long addr = vma->vm_start;
284 298
285 /* 299 /*
286 * Hide vma from rmap and vmtruncate before freeing pgtables 300 * Hide vma from rmap and truncate_pagecache before freeing
301 * pgtables
287 */ 302 */
288 anon_vma_unlink(vma); 303 anon_vma_unlink(vma);
289 unlink_file_vma(vma); 304 unlink_file_vma(vma);
@@ -442,6 +457,20 @@ static inline int is_cow_mapping(unsigned int flags)
442 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 457 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
443} 458}
444 459
460#ifndef is_zero_pfn
461static inline int is_zero_pfn(unsigned long pfn)
462{
463 return pfn == zero_pfn;
464}
465#endif
466
467#ifndef my_zero_pfn
468static inline unsigned long my_zero_pfn(unsigned long addr)
469{
470 return zero_pfn;
471}
472#endif
473
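Both helpers are wrapped in #ifndef so an architecture can supply its own definitions, for example to expose several cache-colored zero pages. A purely hypothetical override, with arch_zero_pfn_base and NR_ZERO_PAGES invented for illustration:

/* hypothetical <asm/pgtable.h> override: 8 contiguous colored zero pages */
#define NR_ZERO_PAGES		8
extern unsigned long arch_zero_pfn_base;

#define is_zero_pfn(pfn)						\
	((pfn) - arch_zero_pfn_base < NR_ZERO_PAGES)
#define my_zero_pfn(addr)						\
	(arch_zero_pfn_base + (((addr) >> PAGE_SHIFT) & (NR_ZERO_PAGES - 1)))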
445/* 474/*
446 * vm_normal_page -- This function gets the "struct page" associated with a pte. 475 * vm_normal_page -- This function gets the "struct page" associated with a pte.
447 * 476 *
@@ -497,7 +526,9 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
497 if (HAVE_PTE_SPECIAL) { 526 if (HAVE_PTE_SPECIAL) {
498 if (likely(!pte_special(pte))) 527 if (likely(!pte_special(pte)))
499 goto check_pfn; 528 goto check_pfn;
500 if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) 529 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
530 return NULL;
531 if (!is_zero_pfn(pfn))
501 print_bad_pte(vma, addr, pte, NULL); 532 print_bad_pte(vma, addr, pte, NULL);
502 return NULL; 533 return NULL;
503 } 534 }
@@ -519,6 +550,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
519 } 550 }
520 } 551 }
521 552
553 if (is_zero_pfn(pfn))
554 return NULL;
522check_pfn: 555check_pfn:
523 if (unlikely(pfn > highest_memmap_pfn)) { 556 if (unlikely(pfn > highest_memmap_pfn)) {
524 print_bad_pte(vma, addr, pte, NULL); 557 print_bad_pte(vma, addr, pte, NULL);
@@ -596,8 +629,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
596 page = vm_normal_page(vma, addr, pte); 629 page = vm_normal_page(vma, addr, pte);
597 if (page) { 630 if (page) {
598 get_page(page); 631 get_page(page);
599 page_dup_rmap(page, vma, addr); 632 page_dup_rmap(page);
600 rss[!!PageAnon(page)]++; 633 rss[PageAnon(page)]++;
601 } 634 }
602 635
603out_set_pte: 636out_set_pte:
@@ -1142,9 +1175,14 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1142 goto no_page; 1175 goto no_page;
1143 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1176 if ((flags & FOLL_WRITE) && !pte_write(pte))
1144 goto unlock; 1177 goto unlock;
1178
1145 page = vm_normal_page(vma, address, pte); 1179 page = vm_normal_page(vma, address, pte);
1146 if (unlikely(!page)) 1180 if (unlikely(!page)) {
1147 goto bad_page; 1181 if ((flags & FOLL_DUMP) ||
1182 !is_zero_pfn(pte_pfn(pte)))
1183 goto bad_page;
1184 page = pte_page(pte);
1185 }
1148 1186
1149 if (flags & FOLL_GET) 1187 if (flags & FOLL_GET)
1150 get_page(page); 1188 get_page(page);
@@ -1172,65 +1210,46 @@ no_page:
1172 pte_unmap_unlock(ptep, ptl); 1210 pte_unmap_unlock(ptep, ptl);
1173 if (!pte_none(pte)) 1211 if (!pte_none(pte))
1174 return page; 1212 return page;
1175 /* Fall through to ZERO_PAGE handling */ 1213
1176no_page_table: 1214no_page_table:
1177 /* 1215 /*
1178 * When core dumping an enormous anonymous area that nobody 1216 * When core dumping an enormous anonymous area that nobody
1179 * has touched so far, we don't want to allocate page tables. 1217 * has touched so far, we don't want to allocate unnecessary pages or
1218 * page tables. Return error instead of NULL to skip handle_mm_fault,
1219 * then get_dump_page() will return NULL to leave a hole in the dump.
1220 * But we can only make this optimization where a hole would surely
1221 * be zero-filled if handle_mm_fault() actually did handle it.
1180 */ 1222 */
1181 if (flags & FOLL_ANON) { 1223 if ((flags & FOLL_DUMP) &&
1182 page = ZERO_PAGE(0); 1224 (!vma->vm_ops || !vma->vm_ops->fault))
1183 if (flags & FOLL_GET) 1225 return ERR_PTR(-EFAULT);
1184 get_page(page);
1185 BUG_ON(flags & FOLL_WRITE);
1186 }
1187 return page; 1226 return page;
1188} 1227}
1189 1228
1190/* Can we do the FOLL_ANON optimization? */
1191static inline int use_zero_page(struct vm_area_struct *vma)
1192{
1193 /*
1194 * We don't want to optimize FOLL_ANON for make_pages_present()
1195 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
1196 * we want to get the page from the page tables to make sure
1197 * that we serialize and update with any other user of that
1198 * mapping.
1199 */
1200 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1201 return 0;
1202 /*
1203 * And if we have a fault routine, it's not an anonymous region.
1204 */
1205 return !vma->vm_ops || !vma->vm_ops->fault;
1206}
1207
1208
1209
1210int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1229int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1211 unsigned long start, int nr_pages, int flags, 1230 unsigned long start, int nr_pages, unsigned int gup_flags,
1212 struct page **pages, struct vm_area_struct **vmas) 1231 struct page **pages, struct vm_area_struct **vmas)
1213{ 1232{
1214 int i; 1233 int i;
1215 unsigned int vm_flags = 0; 1234 unsigned long vm_flags;
1216 int write = !!(flags & GUP_FLAGS_WRITE);
1217 int force = !!(flags & GUP_FLAGS_FORCE);
1218 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1219 int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
1220 1235
1221 if (nr_pages <= 0) 1236 if (nr_pages <= 0)
1222 return 0; 1237 return 0;
1238
1239 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
1240
1223 /* 1241 /*
1224 * Require read or write permissions. 1242 * Require read or write permissions.
1225 * If 'force' is set, we only require the "MAY" flags. 1243 * If FOLL_FORCE is set, we only require the "MAY" flags.
1226 */ 1244 */
1227 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 1245 vm_flags = (gup_flags & FOLL_WRITE) ?
1228 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 1246 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1247 vm_flags &= (gup_flags & FOLL_FORCE) ?
1248 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1229 i = 0; 1249 i = 0;
1230 1250
1231 do { 1251 do {
1232 struct vm_area_struct *vma; 1252 struct vm_area_struct *vma;
1233 unsigned int foll_flags;
1234 1253
1235 vma = find_extend_vma(mm, start); 1254 vma = find_extend_vma(mm, start);
1236 if (!vma && in_gate_area(tsk, start)) { 1255 if (!vma && in_gate_area(tsk, start)) {
@@ -1242,7 +1261,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1242 pte_t *pte; 1261 pte_t *pte;
1243 1262
1244 /* user gate pages are read-only */ 1263 /* user gate pages are read-only */
1245 if (!ignore && write) 1264 if (gup_flags & FOLL_WRITE)
1246 return i ? : -EFAULT; 1265 return i ? : -EFAULT;
1247 if (pg > TASK_SIZE) 1266 if (pg > TASK_SIZE)
1248 pgd = pgd_offset_k(pg); 1267 pgd = pgd_offset_k(pg);
@@ -1276,38 +1295,26 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1276 1295
1277 if (!vma || 1296 if (!vma ||
1278 (vma->vm_flags & (VM_IO | VM_PFNMAP)) || 1297 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1279 (!ignore && !(vm_flags & vma->vm_flags))) 1298 !(vm_flags & vma->vm_flags))
1280 return i ? : -EFAULT; 1299 return i ? : -EFAULT;
1281 1300
1282 if (is_vm_hugetlb_page(vma)) { 1301 if (is_vm_hugetlb_page(vma)) {
1283 i = follow_hugetlb_page(mm, vma, pages, vmas, 1302 i = follow_hugetlb_page(mm, vma, pages, vmas,
1284 &start, &nr_pages, i, write); 1303 &start, &nr_pages, i, gup_flags);
1285 continue; 1304 continue;
1286 } 1305 }
1287 1306
1288 foll_flags = FOLL_TOUCH;
1289 if (pages)
1290 foll_flags |= FOLL_GET;
1291 if (!write && use_zero_page(vma))
1292 foll_flags |= FOLL_ANON;
1293
1294 do { 1307 do {
1295 struct page *page; 1308 struct page *page;
1309 unsigned int foll_flags = gup_flags;
1296 1310
1297 /* 1311 /*
1298 * If we have a pending SIGKILL, don't keep faulting 1312 * If we have a pending SIGKILL, don't keep faulting
1299 * pages and potentially allocating memory, unless 1313 * pages and potentially allocating memory.
1300 * current is handling munlock--e.g., on exit. In
1301 * that case, we are not allocating memory. Rather,
1302 * we're only unlocking already resident/mapped pages.
1303 */ 1314 */
1304 if (unlikely(!ignore_sigkill && 1315 if (unlikely(fatal_signal_pending(current)))
1305 fatal_signal_pending(current)))
1306 return i ? i : -ERESTARTSYS; 1316 return i ? i : -ERESTARTSYS;
1307 1317
1308 if (write)
1309 foll_flags |= FOLL_WRITE;
1310
1311 cond_resched(); 1318 cond_resched();
1312 while (!(page = follow_page(vma, start, foll_flags))) { 1319 while (!(page = follow_page(vma, start, foll_flags))) {
1313 int ret; 1320 int ret;
@@ -1319,7 +1326,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1319 if (ret & VM_FAULT_ERROR) { 1326 if (ret & VM_FAULT_ERROR) {
1320 if (ret & VM_FAULT_OOM) 1327 if (ret & VM_FAULT_OOM)
1321 return i ? i : -ENOMEM; 1328 return i ? i : -ENOMEM;
1322 else if (ret & VM_FAULT_SIGBUS) 1329 if (ret &
1330 (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
1323 return i ? i : -EFAULT; 1331 return i ? i : -EFAULT;
1324 BUG(); 1332 BUG();
1325 } 1333 }
@@ -1418,18 +1426,47 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1418 unsigned long start, int nr_pages, int write, int force, 1426 unsigned long start, int nr_pages, int write, int force,
1419 struct page **pages, struct vm_area_struct **vmas) 1427 struct page **pages, struct vm_area_struct **vmas)
1420{ 1428{
1421 int flags = 0; 1429 int flags = FOLL_TOUCH;
1422 1430
1431 if (pages)
1432 flags |= FOLL_GET;
1423 if (write) 1433 if (write)
1424 flags |= GUP_FLAGS_WRITE; 1434 flags |= FOLL_WRITE;
1425 if (force) 1435 if (force)
1426 flags |= GUP_FLAGS_FORCE; 1436 flags |= FOLL_FORCE;
1427 1437
1428 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 1438 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
1429} 1439}
1430
1431EXPORT_SYMBOL(get_user_pages); 1440EXPORT_SYMBOL(get_user_pages);
1432 1441
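Callers keep the old write/force calling convention; the FOLL_* translation is now entirely inside get_user_pages(). A hedged usage sketch, where pin_one_page and uaddr are assumed names:

/* pin a single user page for writing; caller must put_page() it */
static int pin_one_page(unsigned long uaddr, struct page **pagep)
{
	int ret;

	down_read(&current->mm->mmap_sem);
	ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
			     1,		/* nr_pages */
			     1,		/* write: becomes FOLL_WRITE */
			     0,		/* force: would become FOLL_FORCE */
			     pagep, NULL);
	up_read(&current->mm->mmap_sem);
	return ret == 1 ? 0 : -EFAULT;
}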
1442/**
1443 * get_dump_page() - pin user page in memory while writing it to core dump
1444 * @addr: user address
1445 *
1446 * Returns struct page pointer of user page pinned for dump,
1447 * to be freed afterwards by page_cache_release() or put_page().
1448 *
1449 * Returns NULL on any kind of failure - a hole must then be inserted into
1450 * the corefile, to preserve alignment with its headers; and also returns
1451 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
1452 * allowing a hole to be left in the corefile to save diskspace.
1453 *
1454 * Called without mmap_sem, but after all other threads have been killed.
1455 */
1456#ifdef CONFIG_ELF_CORE
1457struct page *get_dump_page(unsigned long addr)
1458{
1459 struct vm_area_struct *vma;
1460 struct page *page;
1461
1462 if (__get_user_pages(current, current->mm, addr, 1,
1463 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
1464 return NULL;
1465 flush_cache_page(vma, addr, page_to_pfn(page));
1466 return page;
1467}
1468#endif /* CONFIG_ELF_CORE */
1469
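A hedged sketch of how the ELF core writer is expected to consume this: NULL means "leave a hole", not "fail". The dump_vma_pages name is illustrative, and dump_write()/dump_seek() stand in for the simplified helpers in fs/binfmt_elf.c:

static int dump_vma_pages(struct file *file, struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
		struct page *page = get_dump_page(addr);
		int ok;

		if (page) {
			void *kaddr = kmap(page);

			ok = dump_write(file, kaddr, PAGE_SIZE);
			kunmap(page);
			page_cache_release(page);
		} else {
			ok = dump_seek(file, PAGE_SIZE);  /* hole: zero-filled */
		}
		if (!ok)
			return 0;
	}
	return 1;
}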
1433pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1470pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
1434 spinlock_t **ptl) 1471 spinlock_t **ptl)
1435{ 1472{
@@ -1607,7 +1644,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1607 * If we don't have pte special, then we have to use the pfn_valid() 1644 * If we don't have pte special, then we have to use the pfn_valid()
1608 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* 1645 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
1609 * refcount the page if pfn_valid is true (hence insert_page rather 1646 * refcount the page if pfn_valid is true (hence insert_page rather
1610 * than insert_pfn). 1647 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
1648 * without pte special, it would there be refcounted as a normal page.
1611 */ 1649 */
1612 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { 1650 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
1613 struct page *page; 1651 struct page *page;
@@ -1973,7 +2011,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1973 * Take out anonymous pages first, anonymous shared vmas are 2011 * Take out anonymous pages first, anonymous shared vmas are
1974 * not dirty accountable. 2012 * not dirty accountable.
1975 */ 2013 */
1976 if (PageAnon(old_page)) { 2014 if (PageAnon(old_page) && !PageKsm(old_page)) {
1977 if (!trylock_page(old_page)) { 2015 if (!trylock_page(old_page)) {
1978 page_cache_get(old_page); 2016 page_cache_get(old_page);
1979 pte_unmap_unlock(page_table, ptl); 2017 pte_unmap_unlock(page_table, ptl);
@@ -2074,10 +2112,19 @@ gotten:
2074 2112
2075 if (unlikely(anon_vma_prepare(vma))) 2113 if (unlikely(anon_vma_prepare(vma)))
2076 goto oom; 2114 goto oom;
2077 VM_BUG_ON(old_page == ZERO_PAGE(0)); 2115
2078 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 2116 if (is_zero_pfn(pte_pfn(orig_pte))) {
2079 if (!new_page) 2117 new_page = alloc_zeroed_user_highpage_movable(vma, address);
2080 goto oom; 2118 if (!new_page)
2119 goto oom;
2120 } else {
2121 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2122 if (!new_page)
2123 goto oom;
2124 cow_user_page(new_page, old_page, address, vma);
2125 }
2126 __SetPageUptodate(new_page);
2127
2081 /* 2128 /*
2082 * Don't let another task, with possibly unlocked vma, 2129 * Don't let another task, with possibly unlocked vma,
2083 * keep the mlocked page. 2130 * keep the mlocked page.
@@ -2087,8 +2134,6 @@ gotten:
2087 clear_page_mlock(old_page); 2134 clear_page_mlock(old_page);
2088 unlock_page(old_page); 2135 unlock_page(old_page);
2089 } 2136 }
2090 cow_user_page(new_page, old_page, address, vma);
2091 __SetPageUptodate(new_page);
2092 2137
2093 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2138 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2094 goto oom_free_new; 2139 goto oom_free_new;
@@ -2114,9 +2159,14 @@ gotten:
2114 * seen in the presence of one thread doing SMC and another 2159 * seen in the presence of one thread doing SMC and another
2115 * thread doing COW. 2160 * thread doing COW.
2116 */ 2161 */
2117 ptep_clear_flush_notify(vma, address, page_table); 2162 ptep_clear_flush(vma, address, page_table);
2118 page_add_new_anon_rmap(new_page, vma, address); 2163 page_add_new_anon_rmap(new_page, vma, address);
2119 set_pte_at(mm, address, page_table, entry); 2164 /*
2165 * We call the notify macro here because, when using secondary
2166 * mmu page tables (such as kvm shadow page tables), we want the
2167 * new page to be mapped directly into the secondary page table.
2168 */
2169 set_pte_at_notify(mm, address, page_table, entry);
2120 update_mmu_cache(vma, address, entry); 2170 update_mmu_cache(vma, address, entry);
2121 if (old_page) { 2171 if (old_page) {
2122 /* 2172 /*
@@ -2359,7 +2409,7 @@ restart:
2359 * @mapping: the address space containing mmaps to be unmapped. 2409 * @mapping: the address space containing mmaps to be unmapped.
2360 * @holebegin: byte in first page to unmap, relative to the start of 2410 * @holebegin: byte in first page to unmap, relative to the start of
2361 * the underlying file. This will be rounded down to a PAGE_SIZE 2411 * the underlying file. This will be rounded down to a PAGE_SIZE
2362 * boundary. Note that this is different from vmtruncate(), which 2412 * boundary. Note that this is different from truncate_pagecache(), which
2363 * must keep the partial page. In contrast, we must get rid of 2413 * must keep the partial page. In contrast, we must get rid of
2364 * partial pages. 2414 * partial pages.
2365 * @holelen: size of prospective hole in bytes. This will be rounded 2415 * @holelen: size of prospective hole in bytes. This will be rounded
@@ -2410,63 +2460,6 @@ void unmap_mapping_range(struct address_space *mapping,
2410} 2460}
2411EXPORT_SYMBOL(unmap_mapping_range); 2461EXPORT_SYMBOL(unmap_mapping_range);
2412 2462
2413/**
2414 * vmtruncate - unmap mappings "freed" by truncate() syscall
2415 * @inode: inode of the file used
2416 * @offset: file offset to start truncating
2417 *
2418 * NOTE! We have to be ready to update the memory sharing
2419 * between the file and the memory map for a potential last
2420 * incomplete page. Ugly, but necessary.
2421 */
2422int vmtruncate(struct inode * inode, loff_t offset)
2423{
2424 if (inode->i_size < offset) {
2425 unsigned long limit;
2426
2427 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2428 if (limit != RLIM_INFINITY && offset > limit)
2429 goto out_sig;
2430 if (offset > inode->i_sb->s_maxbytes)
2431 goto out_big;
2432 i_size_write(inode, offset);
2433 } else {
2434 struct address_space *mapping = inode->i_mapping;
2435
2436 /*
2437 * truncation of in-use swapfiles is disallowed - it would
2438 * cause subsequent swapout to scribble on the now-freed
2439 * blocks.
2440 */
2441 if (IS_SWAPFILE(inode))
2442 return -ETXTBSY;
2443 i_size_write(inode, offset);
2444
2445 /*
2446 * unmap_mapping_range is called twice, first simply for
2447 * efficiency so that truncate_inode_pages does fewer
2448 * single-page unmaps. However after this first call, and
2449 * before truncate_inode_pages finishes, it is possible for
2450 * private pages to be COWed, which remain after
2451 * truncate_inode_pages finishes, hence the second
2452 * unmap_mapping_range call must be made for correctness.
2453 */
2454 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2455 truncate_inode_pages(mapping, offset);
2456 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2457 }
2458
2459 if (inode->i_op->truncate)
2460 inode->i_op->truncate(inode);
2461 return 0;
2462
2463out_sig:
2464 send_sig(SIGXFSZ, current, 0);
2465out_big:
2466 return -EFBIG;
2467}
2468EXPORT_SYMBOL(vmtruncate);
2469
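The unmap/truncate/unmap dance removed here does not vanish; it is carried over into the new truncate_pagecache() helper introduced elsewhere in this series, roughly along these lines (a sketch, not the verbatim helper):

void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
{
	struct address_space *mapping = inode->i_mapping;

	/*
	 * The first unmap_mapping_range is an optimisation so that
	 * truncate_inode_pages does fewer single-page unmaps; the second
	 * is needed for correctness, because private pages COWed in the
	 * window would otherwise survive the truncate.
	 */
	unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
	truncate_inode_pages(mapping, new);
	unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
}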
2470int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) 2463int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2471{ 2464{
2472 struct address_space *mapping = inode->i_mapping; 2465 struct address_space *mapping = inode->i_mapping;
@@ -2511,8 +2504,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2511 goto out; 2504 goto out;
2512 2505
2513 entry = pte_to_swp_entry(orig_pte); 2506 entry = pte_to_swp_entry(orig_pte);
2514 if (is_migration_entry(entry)) { 2507 if (unlikely(non_swap_entry(entry))) {
2515 migration_entry_wait(mm, pmd, address); 2508 if (is_migration_entry(entry)) {
2509 migration_entry_wait(mm, pmd, address);
2510 } else if (is_hwpoison_entry(entry)) {
2511 ret = VM_FAULT_HWPOISON;
2512 } else {
2513 print_bad_pte(vma, address, orig_pte, NULL);
2514 ret = VM_FAULT_OOM;
2515 }
2516 goto out; 2516 goto out;
2517 } 2517 }
2518 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2518 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
@@ -2536,6 +2536,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2536 /* Had to read the page from swap area: Major fault */ 2536 /* Had to read the page from swap area: Major fault */
2537 ret = VM_FAULT_MAJOR; 2537 ret = VM_FAULT_MAJOR;
2538 count_vm_event(PGMAJFAULT); 2538 count_vm_event(PGMAJFAULT);
2539 } else if (PageHWPoison(page)) {
2540 ret = VM_FAULT_HWPOISON;
2541 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2542 goto out;
2539 } 2543 }
2540 2544
2541 lock_page(page); 2545 lock_page(page);
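The new VM_FAULT_HWPOISON branches rely on a dedicated software swap-entry type: when hwpoison_user_mappings() tears down a poisoned page, its ptes are replaced with such an entry so any later touch lands in this fault path. Approximately, the swapops.h helpers assumed here look like:

static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
	return swp_entry(SWP_HWPOISON, page_to_pfn(page));
}

static inline int is_hwpoison_entry(swp_entry_t entry)
{
	return swp_type(entry) == SWP_HWPOISON;
}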
@@ -2624,6 +2628,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2624 spinlock_t *ptl; 2628 spinlock_t *ptl;
2625 pte_t entry; 2629 pte_t entry;
2626 2630
2631 if (!(flags & FAULT_FLAG_WRITE)) {
2632 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
2633 vma->vm_page_prot));
2634 ptl = pte_lockptr(mm, pmd);
2635 spin_lock(ptl);
2636 if (!pte_none(*page_table))
2637 goto unlock;
2638 goto setpte;
2639 }
2640
2627 /* Allocate our own private page. */ 2641 /* Allocate our own private page. */
2628 pte_unmap(page_table); 2642 pte_unmap(page_table);
2629 2643
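The read-fault branch above is visible from userspace: untouched anonymous memory maps the shared zero page and adds nothing to RSS until the first write forces a COW. A small illustration (userspace C, not kernel code):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64UL << 20;	/* 64MB anonymous mapping */
	size_t i;
	long sum = 0;
	volatile char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	for (i = 0; i < len; i += 4096)	/* read faults: zero page, RSS flat */
		sum += p[i];
	printf("reads done, sum=%ld; check VmRSS now\n", sum);
	getchar();

	memset((void *)p, 1, len);	/* write faults: COW, RSS grows ~64MB */
	printf("writes done; check VmRSS again\n");
	getchar();
	munmap((void *)p, len);
	return 0;
}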
@@ -2638,13 +2652,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2638 goto oom_free_page; 2652 goto oom_free_page;
2639 2653
2640 entry = mk_pte(page, vma->vm_page_prot); 2654 entry = mk_pte(page, vma->vm_page_prot);
2641 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2655 if (vma->vm_flags & VM_WRITE)
2656 entry = pte_mkwrite(pte_mkdirty(entry));
2642 2657
2643 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2658 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2644 if (!pte_none(*page_table)) 2659 if (!pte_none(*page_table))
2645 goto release; 2660 goto release;
2661
2646 inc_mm_counter(mm, anon_rss); 2662 inc_mm_counter(mm, anon_rss);
2647 page_add_new_anon_rmap(page, vma, address); 2663 page_add_new_anon_rmap(page, vma, address);
2664setpte:
2648 set_pte_at(mm, address, page_table, entry); 2665 set_pte_at(mm, address, page_table, entry);
2649 2666
2650 /* No need to invalidate - it was non-present before */ 2667 /* No need to invalidate - it was non-present before */
@@ -2699,6 +2716,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2699 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2716 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2700 return ret; 2717 return ret;
2701 2718
2719 if (unlikely(PageHWPoison(vmf.page))) {
2720 if (ret & VM_FAULT_LOCKED)
2721 unlock_page(vmf.page);
2722 return VM_FAULT_HWPOISON;
2723 }
2724
2702 /* 2725 /*
2703 * For consistency in subsequent calls, make the faulted page always 2726 * For consistency in subsequent calls, make the faulted page always
2704 * locked. 2727 * locked.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e4412a676c88..821dee596377 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -339,8 +339,11 @@ EXPORT_SYMBOL_GPL(__remove_pages);
339 339
340void online_page(struct page *page) 340void online_page(struct page *page)
341{ 341{
342 unsigned long pfn = page_to_pfn(page);
343
342 totalram_pages++; 344 totalram_pages++;
343 num_physpages++; 345 if (pfn >= num_physpages)
346 num_physpages = pfn + 1;
344 347
345#ifdef CONFIG_HIGHMEM 348#ifdef CONFIG_HIGHMEM
346 if (PageHighMem(page)) 349 if (PageHighMem(page))
@@ -410,7 +413,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
410 if (!populated_zone(zone)) 413 if (!populated_zone(zone))
411 need_zonelists_rebuild = 1; 414 need_zonelists_rebuild = 1;
412 415
413 ret = walk_memory_resource(pfn, nr_pages, &onlined_pages, 416 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
414 online_pages_range); 417 online_pages_range);
415 if (ret) { 418 if (ret) {
416 printk(KERN_DEBUG "online_pages %lx at %lx failed\n", 419 printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
@@ -422,6 +425,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
422 zone->present_pages += onlined_pages; 425 zone->present_pages += onlined_pages;
423 zone->zone_pgdat->node_present_pages += onlined_pages; 426 zone->zone_pgdat->node_present_pages += onlined_pages;
424 427
428 zone_pcp_update(zone);
425 setup_per_zone_wmarks(); 429 setup_per_zone_wmarks();
426 calculate_zone_inactive_ratio(zone); 430 calculate_zone_inactive_ratio(zone);
427 if (onlined_pages) { 431 if (onlined_pages) {
@@ -701,7 +705,7 @@ offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
701static void 705static void
702offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 706offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
703{ 707{
704 walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL, 708 walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
705 offline_isolated_pages_cb); 709 offline_isolated_pages_cb);
706} 710}
707 711
@@ -727,7 +731,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
727 long offlined = 0; 731 long offlined = 0;
728 int ret; 732 int ret;
729 733
730 ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined, 734 ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
731 check_pages_isolated_cb); 735 check_pages_isolated_cb);
732 if (ret < 0) 736 if (ret < 0)
733 offlined = (long)ret; 737 offlined = (long)ret;
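walk_system_ram_range() takes over from walk_memory_resource() as the generic "System RAM" iterator, with hotplug only supplying per-chunk callbacks. A hedged sketch of the calling convention (callback names are illustrative):

/* invoked once per System RAM chunk within [pfn, pfn + nr) */
static int count_ram_pages_cb(unsigned long start_pfn,
			      unsigned long nr_pages, void *arg)
{
	unsigned long *total = arg;

	*total += nr_pages;
	return 0;		/* returning non-zero stops the walk */
}

static unsigned long count_ram_pages(unsigned long pfn, unsigned long nr)
{
	unsigned long total = 0;

	walk_system_ram_range(pfn, nr, &total, count_ram_pages_cb);
	return total;
}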
@@ -831,7 +835,6 @@ repeat:
831 zone->present_pages -= offlined_pages; 835 zone->present_pages -= offlined_pages;
832 zone->zone_pgdat->node_present_pages -= offlined_pages; 836 zone->zone_pgdat->node_present_pages -= offlined_pages;
833 totalram_pages -= offlined_pages; 837 totalram_pages -= offlined_pages;
834 num_physpages -= offlined_pages;
835 838
836 setup_per_zone_wmarks(); 839 setup_per_zone_wmarks();
837 calculate_zone_inactive_ratio(zone); 840 calculate_zone_inactive_ratio(zone);
diff --git a/mm/mempool.c b/mm/mempool.c
index 32e75d400503..1a3bc3d4d554 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -308,13 +308,6 @@ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
308} 308}
309EXPORT_SYMBOL(mempool_kmalloc); 309EXPORT_SYMBOL(mempool_kmalloc);
310 310
311void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
312{
313 size_t size = (size_t)pool_data;
314 return kzalloc(size, gfp_mask);
315}
316EXPORT_SYMBOL(mempool_kzalloc);
317
318void mempool_kfree(void *element, void *pool_data) 311void mempool_kfree(void *element, void *pool_data)
319{ 312{
320 kfree(element); 313 kfree(element);
diff --git a/mm/migrate.c b/mm/migrate.c
index 939888f9ddab..1a4bf4813780 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -67,6 +67,8 @@ int putback_lru_pages(struct list_head *l)
67 67
68 list_for_each_entry_safe(page, page2, l, lru) { 68 list_for_each_entry_safe(page, page2, l, lru) {
69 list_del(&page->lru); 69 list_del(&page->lru);
70 dec_zone_page_state(page, NR_ISOLATED_ANON +
71 page_is_file_cache(page));
70 putback_lru_page(page); 72 putback_lru_page(page);
71 count++; 73 count++;
72 } 74 }
@@ -147,7 +149,7 @@ out:
147static void remove_file_migration_ptes(struct page *old, struct page *new) 149static void remove_file_migration_ptes(struct page *old, struct page *new)
148{ 150{
149 struct vm_area_struct *vma; 151 struct vm_area_struct *vma;
150 struct address_space *mapping = page_mapping(new); 152 struct address_space *mapping = new->mapping;
151 struct prio_tree_iter iter; 153 struct prio_tree_iter iter;
152 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 154 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
153 155
@@ -270,7 +272,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
270 pslot = radix_tree_lookup_slot(&mapping->page_tree, 272 pslot = radix_tree_lookup_slot(&mapping->page_tree,
271 page_index(page)); 273 page_index(page));
272 274
273 expected_count = 2 + !!page_has_private(page); 275 expected_count = 2 + page_has_private(page);
274 if (page_count(page) != expected_count || 276 if (page_count(page) != expected_count ||
275 (struct page *)radix_tree_deref_slot(pslot) != page) { 277 (struct page *)radix_tree_deref_slot(pslot) != page) {
276 spin_unlock_irq(&mapping->tree_lock); 278 spin_unlock_irq(&mapping->tree_lock);
@@ -312,7 +314,10 @@ static int migrate_page_move_mapping(struct address_space *mapping,
312 */ 314 */
313 __dec_zone_page_state(page, NR_FILE_PAGES); 315 __dec_zone_page_state(page, NR_FILE_PAGES);
314 __inc_zone_page_state(newpage, NR_FILE_PAGES); 316 __inc_zone_page_state(newpage, NR_FILE_PAGES);
315 317 if (PageSwapBacked(page)) {
318 __dec_zone_page_state(page, NR_SHMEM);
319 __inc_zone_page_state(newpage, NR_SHMEM);
320 }
316 spin_unlock_irq(&mapping->tree_lock); 321 spin_unlock_irq(&mapping->tree_lock);
317 322
318 return 0; 323 return 0;
@@ -664,13 +669,15 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
664 * needs to be effective. 669 * needs to be effective.
665 */ 670 */
666 try_to_free_buffers(page); 671 try_to_free_buffers(page);
672 goto rcu_unlock;
667 } 673 }
668 goto rcu_unlock; 674 goto skip_unmap;
669 } 675 }
670 676
671 /* Establish migration ptes or remove ptes */ 677 /* Establish migration ptes or remove ptes */
672 try_to_unmap(page, 1); 678 try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
673 679
680skip_unmap:
674 if (!page_mapped(page)) 681 if (!page_mapped(page))
675 rc = move_to_new_page(newpage, page); 682 rc = move_to_new_page(newpage, page);
676 683
@@ -693,6 +700,8 @@ unlock:
693 * restored. 700 * restored.
694 */ 701 */
695 list_del(&page->lru); 702 list_del(&page->lru);
703 dec_zone_page_state(page, NR_ISOLATED_ANON +
704 page_is_file_cache(page));
696 putback_lru_page(page); 705 putback_lru_page(page);
697 } 706 }
698 707
@@ -737,6 +746,13 @@ int migrate_pages(struct list_head *from,
737 struct page *page2; 746 struct page *page2;
738 int swapwrite = current->flags & PF_SWAPWRITE; 747 int swapwrite = current->flags & PF_SWAPWRITE;
739 int rc; 748 int rc;
749 unsigned long flags;
750
751 local_irq_save(flags);
752 list_for_each_entry(page, from, lru)
753 __inc_zone_page_state(page, NR_ISOLATED_ANON +
754 page_is_file_cache(page));
755 local_irq_restore(flags);
740 756
741 if (!swapwrite) 757 if (!swapwrite)
742 current->flags |= PF_SWAPWRITE; 758 current->flags |= PF_SWAPWRITE;
diff --git a/mm/mlock.c b/mm/mlock.c
index 45eb650b9654..bd6f0e466f6c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -139,49 +139,36 @@ static void munlock_vma_page(struct page *page)
139} 139}
140 140
141/** 141/**
142 * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma. 142 * __mlock_vma_pages_range() - mlock a range of pages in the vma.
143 * @vma: target vma 143 * @vma: target vma
144 * @start: start address 144 * @start: start address
145 * @end: end address 145 * @end: end address
146 * @mlock: 0 indicate munlock, otherwise mlock.
147 * 146 *
148 * If @mlock == 0, unlock an mlocked range; 147 * This takes care of making the pages present too.
149 * else mlock the range of pages. This takes care of making the pages present ,
150 * too.
151 * 148 *
152 * return 0 on success, negative error code on error. 149 * return 0 on success, negative error code on error.
153 * 150 *
154 * vma->vm_mm->mmap_sem must be held for at least read. 151 * vma->vm_mm->mmap_sem must be held for at least read.
155 */ 152 */
156static long __mlock_vma_pages_range(struct vm_area_struct *vma, 153static long __mlock_vma_pages_range(struct vm_area_struct *vma,
157 unsigned long start, unsigned long end, 154 unsigned long start, unsigned long end)
158 int mlock)
159{ 155{
160 struct mm_struct *mm = vma->vm_mm; 156 struct mm_struct *mm = vma->vm_mm;
161 unsigned long addr = start; 157 unsigned long addr = start;
162 struct page *pages[16]; /* 16 gives a reasonable batch */ 158 struct page *pages[16]; /* 16 gives a reasonable batch */
163 int nr_pages = (end - start) / PAGE_SIZE; 159 int nr_pages = (end - start) / PAGE_SIZE;
164 int ret = 0; 160 int ret = 0;
165 int gup_flags = 0; 161 int gup_flags;
166 162
167 VM_BUG_ON(start & ~PAGE_MASK); 163 VM_BUG_ON(start & ~PAGE_MASK);
168 VM_BUG_ON(end & ~PAGE_MASK); 164 VM_BUG_ON(end & ~PAGE_MASK);
169 VM_BUG_ON(start < vma->vm_start); 165 VM_BUG_ON(start < vma->vm_start);
170 VM_BUG_ON(end > vma->vm_end); 166 VM_BUG_ON(end > vma->vm_end);
171 VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) && 167 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
172 (atomic_read(&mm->mm_users) != 0));
173
174 /*
175 * mlock: don't page populate if vma has PROT_NONE permission.
176 * munlock: always do munlock although the vma has PROT_NONE
177 * permission, or SIGKILL is pending.
178 */
179 if (!mlock)
180 gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
181 GUP_FLAGS_IGNORE_SIGKILL;
182 168
169 gup_flags = FOLL_TOUCH | FOLL_GET;
183 if (vma->vm_flags & VM_WRITE) 170 if (vma->vm_flags & VM_WRITE)
184 gup_flags |= GUP_FLAGS_WRITE; 171 gup_flags |= FOLL_WRITE;
185 172
186 while (nr_pages > 0) { 173 while (nr_pages > 0) {
187 int i; 174 int i;
@@ -201,51 +188,45 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
201 * This can happen for, e.g., VM_NONLINEAR regions before 188 * This can happen for, e.g., VM_NONLINEAR regions before
202 * a page has been allocated and mapped at a given offset, 189 * a page has been allocated and mapped at a given offset,
203 * or for addresses that map beyond end of a file. 190 * or for addresses that map beyond end of a file.
204 * We'll mlock the the pages if/when they get faulted in. 191 * We'll mlock the pages if/when they get faulted in.
205 */ 192 */
206 if (ret < 0) 193 if (ret < 0)
207 break; 194 break;
208 if (ret == 0) {
209 /*
210 * We know the vma is there, so the only time
211 * we cannot get a single page should be an
212 * error (ret < 0) case.
213 */
214 WARN_ON(1);
215 break;
216 }
217 195
218 lru_add_drain(); /* push cached pages to LRU */ 196 lru_add_drain(); /* push cached pages to LRU */
219 197
220 for (i = 0; i < ret; i++) { 198 for (i = 0; i < ret; i++) {
221 struct page *page = pages[i]; 199 struct page *page = pages[i];
222 200
223 lock_page(page);
224 /*
225 * Because we lock page here and migration is blocked
226 * by the elevated reference, we need only check for
227 * page truncation (file-cache only).
228 */
229 if (page->mapping) { 201 if (page->mapping) {
230 if (mlock) 202 /*
203 * That preliminary check is mainly to avoid
204 * the pointless overhead of lock_page on the
205 * ZERO_PAGE: which might bounce very badly if
206 * there is contention. However, we're still
207 * dirtying its cacheline with get/put_page:
208 * we'll add another __get_user_pages flag to
209 * avoid it if that case turns out to matter.
210 */
211 lock_page(page);
212 /*
213 * Because we lock page here and migration is
214 * blocked by the elevated reference, we need
215 * only check for file-cache page truncation.
216 */
217 if (page->mapping)
231 mlock_vma_page(page); 218 mlock_vma_page(page);
232 else 219 unlock_page(page);
233 munlock_vma_page(page);
234 } 220 }
235 unlock_page(page); 221 put_page(page); /* ref from get_user_pages() */
236 put_page(page); /* ref from get_user_pages() */
237
238 /*
239 * here we assume that get_user_pages() has given us
240 * a list of virtually contiguous pages.
241 */
242 addr += PAGE_SIZE; /* for next get_user_pages() */
243 nr_pages--;
244 } 222 }
223
224 addr += ret * PAGE_SIZE;
225 nr_pages -= ret;
245 ret = 0; 226 ret = 0;
246 } 227 }
247 228
248 return ret; /* count entire vma as locked_vm */ 229 return ret; /* 0 or negative error code */
249} 230}
250 231
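As the comment above says, __mlock_vma_pages_range() makes the pages present as well as locking them, which is easy to observe from userspace: mlock() alone bumps both VmLck and VmRSS without the program touching a byte. A small illustration (userspace C):

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16UL << 20;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	if (mlock(p, len)) {		/* may hit RLIMIT_MEMLOCK */
		perror("mlock");
		return 1;
	}
	getchar();	/* VmLck and VmRSS in /proc/self/status are now ~16MB */
	munlock(p, len);
	munmap(p, len);
	return 0;
}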
251/* 232/*
@@ -289,7 +270,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
289 is_vm_hugetlb_page(vma) || 270 is_vm_hugetlb_page(vma) ||
290 vma == get_gate_vma(current))) { 271 vma == get_gate_vma(current))) {
291 272
292 __mlock_vma_pages_range(vma, start, end, 1); 273 __mlock_vma_pages_range(vma, start, end);
293 274
294 /* Hide errors from mmap() and other callers */ 275 /* Hide errors from mmap() and other callers */
295 return 0; 276 return 0;
@@ -310,7 +291,6 @@ no_mlock:
310 return nr_pages; /* error or pages NOT mlocked */ 291 return nr_pages; /* error or pages NOT mlocked */
311} 292}
312 293
313
314/* 294/*
315 * munlock_vma_pages_range() - munlock all pages in the vma range.' 295 * munlock_vma_pages_range() - munlock all pages in the vma range.'
316 * @vma - vma containing range to be munlock()ed. 296 * @vma - vma containing range to be munlock()ed.
@@ -330,10 +310,38 @@ no_mlock:
330 * free them. This will result in freeing mlocked pages. 310 * free them. This will result in freeing mlocked pages.
331 */ 311 */
332void munlock_vma_pages_range(struct vm_area_struct *vma, 312void munlock_vma_pages_range(struct vm_area_struct *vma,
333 unsigned long start, unsigned long end) 313 unsigned long start, unsigned long end)
334{ 314{
315 unsigned long addr;
316
317 lru_add_drain();
335 vma->vm_flags &= ~VM_LOCKED; 318 vma->vm_flags &= ~VM_LOCKED;
336 __mlock_vma_pages_range(vma, start, end, 0); 319
320 for (addr = start; addr < end; addr += PAGE_SIZE) {
321 struct page *page;
322 /*
323 * Although FOLL_DUMP is intended for get_dump_page(),
324 * it just so happens that its special treatment of the
325 * ZERO_PAGE (returning an error instead of doing get_page)
326 * suits munlock very well (and if somehow an abnormal page
327 * has sneaked into the range, we won't oops here: great).
328 */
329 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
330 if (page && !IS_ERR(page)) {
331 lock_page(page);
332 /*
333 * Like in __mlock_vma_pages_range(),
334 * because we lock page here and migration is
335 * blocked by the elevated reference, we need
336 * only check for file-cache page truncation.
337 */
338 if (page->mapping)
339 munlock_vma_page(page);
340 unlock_page(page);
341 put_page(page);
342 }
343 cond_resched();
344 }
337} 345}
338 346
339/* 347/*
@@ -400,18 +408,14 @@ success:
400 * It's okay if try_to_unmap_one unmaps a page just after we 408 * It's okay if try_to_unmap_one unmaps a page just after we
401 * set VM_LOCKED, __mlock_vma_pages_range will bring it back. 409 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
402 */ 410 */
403 vma->vm_flags = newflags;
404 411
405 if (lock) { 412 if (lock) {
406 ret = __mlock_vma_pages_range(vma, start, end, 1); 413 vma->vm_flags = newflags;
407 414 ret = __mlock_vma_pages_range(vma, start, end);
408 if (ret > 0) { 415 if (ret < 0)
409 mm->locked_vm -= ret; 416 ret = __mlock_posix_error_return(ret);
410 ret = 0;
411 } else
412 ret = __mlock_posix_error_return(ret); /* translate if needed */
413 } else { 417 } else {
414 __mlock_vma_pages_range(vma, start, end, 0); 418 munlock_vma_pages_range(vma, start, end);
415 } 419 }
416 420
417out: 421out:
diff --git a/mm/mmap.c b/mm/mmap.c
index 34579b23ebd5..21d4029a07b3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,7 +28,7 @@
28#include <linux/mempolicy.h> 28#include <linux/mempolicy.h>
29#include <linux/rmap.h> 29#include <linux/rmap.h>
30#include <linux/mmu_notifier.h> 30#include <linux/mmu_notifier.h>
31#include <linux/perf_counter.h> 31#include <linux/perf_event.h>
32 32
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
34#include <asm/cacheflush.h> 34#include <asm/cacheflush.h>
@@ -88,9 +88,6 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */
88int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 88int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
89struct percpu_counter vm_committed_as; 89struct percpu_counter vm_committed_as;
90 90
91/* amount of vm to protect from userspace access */
92unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
93
94/* 91/*
95 * Check that a process has enough memory to allocate a new virtual 92 * Check that a process has enough memory to allocate a new virtual
96 * mapping. 0 means there is enough memory for the allocation to 93 * mapping. 0 means there is enough memory for the allocation to
@@ -573,9 +570,9 @@ again: remove_next = 1 + (end > next->vm_end);
573 570
574 /* 571 /*
575 * When changing only vma->vm_end, we don't really need 572 * When changing only vma->vm_end, we don't really need
576 * anon_vma lock: but is that case worth optimizing out? 573 * anon_vma lock.
577 */ 574 */
578 if (vma->anon_vma) 575 if (vma->anon_vma && (insert || importer || start != vma->vm_start))
579 anon_vma = vma->anon_vma; 576 anon_vma = vma->anon_vma;
580 if (anon_vma) { 577 if (anon_vma) {
581 spin_lock(&anon_vma->lock); 578 spin_lock(&anon_vma->lock);
@@ -659,9 +656,6 @@ again: remove_next = 1 + (end > next->vm_end);
659 validate_mm(mm); 656 validate_mm(mm);
660} 657}
661 658
662/* Flags that can be inherited from an existing mapping when merging */
663#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR)
664
665/* 659/*
666 * If the vma has a ->close operation then the driver probably needs to release 660 * If the vma has a ->close operation then the driver probably needs to release
667 * per-vma resources, so we don't attempt to merge those. 661 * per-vma resources, so we don't attempt to merge those.
@@ -669,7 +663,8 @@ again: remove_next = 1 + (end > next->vm_end);
669static inline int is_mergeable_vma(struct vm_area_struct *vma, 663static inline int is_mergeable_vma(struct vm_area_struct *vma,
670 struct file *file, unsigned long vm_flags) 664 struct file *file, unsigned long vm_flags)
671{ 665{
672 if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS) 666 /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */
667 if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR)
673 return 0; 668 return 0;
674 if (vma->vm_file != file) 669 if (vma->vm_file != file)
675 return 0; 670 return 0;
@@ -908,7 +903,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
908#endif /* CONFIG_PROC_FS */ 903#endif /* CONFIG_PROC_FS */
909 904
910/* 905/*
911 * The caller must hold down_write(current->mm->mmap_sem). 906 * The caller must hold down_write(&current->mm->mmap_sem).
912 */ 907 */
913 908
914unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 909unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
@@ -954,6 +949,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
954 if (mm->map_count > sysctl_max_map_count) 949 if (mm->map_count > sysctl_max_map_count)
955 return -ENOMEM; 950 return -ENOMEM;
956 951
952 if (flags & MAP_HUGETLB) {
953 struct user_struct *user = NULL;
954 if (file)
955 return -EINVAL;
956
957 /*
958 * VM_NORESERVE is used because the reservations will be
959 * taken when vm_ops->mmap() is called
960 * A dummy user value is used because we are not locking
961 * memory so no accounting is necessary
962 */
963 len = ALIGN(len, huge_page_size(&default_hstate));
964 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
965 &user, HUGETLB_ANONHUGE_INODE);
966 if (IS_ERR(file))
967 return PTR_ERR(file);
968 }
969
957 /* Obtain the address to map to. we verify (or select) it and ensure 970 /* Obtain the address to map to. we verify (or select) it and ensure
958 * that it represents a valid section of the address space. 971 * that it represents a valid section of the address space.
959 */ 972 */
@@ -968,11 +981,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
968 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | 981 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
969 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 982 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
970 983
971 if (flags & MAP_LOCKED) { 984 if (flags & MAP_LOCKED)
972 if (!can_do_mlock()) 985 if (!can_do_mlock())
973 return -EPERM; 986 return -EPERM;
974 vm_flags |= VM_LOCKED;
975 }
976 987
977 /* mlock MCL_FUTURE? */ 988 /* mlock MCL_FUTURE? */
978 if (vm_flags & VM_LOCKED) { 989 if (vm_flags & VM_LOCKED) {
@@ -1198,21 +1209,21 @@ munmap_back:
1198 goto unmap_and_free_vma; 1209 goto unmap_and_free_vma;
1199 if (vm_flags & VM_EXECUTABLE) 1210 if (vm_flags & VM_EXECUTABLE)
1200 added_exe_file_vma(mm); 1211 added_exe_file_vma(mm);
1212
1213 /* Can addr have changed??
1214 *
1215 * Answer: Yes, several device drivers can do it in their
1216 * f_op->mmap method. -DaveM
1217 */
1218 addr = vma->vm_start;
1219 pgoff = vma->vm_pgoff;
1220 vm_flags = vma->vm_flags;
1201 } else if (vm_flags & VM_SHARED) { 1221 } else if (vm_flags & VM_SHARED) {
1202 error = shmem_zero_setup(vma); 1222 error = shmem_zero_setup(vma);
1203 if (error) 1223 if (error)
1204 goto free_vma; 1224 goto free_vma;
1205 } 1225 }
1206 1226
1207 /* Can addr have changed??
1208 *
1209 * Answer: Yes, several device drivers can do it in their
1210 * f_op->mmap method. -DaveM
1211 */
1212 addr = vma->vm_start;
1213 pgoff = vma->vm_pgoff;
1214 vm_flags = vma->vm_flags;
1215
1216 if (vma_wants_writenotify(vma)) 1227 if (vma_wants_writenotify(vma))
1217 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); 1228 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1218 1229
@@ -1223,7 +1234,7 @@ munmap_back:
1223 if (correct_wcount) 1234 if (correct_wcount)
1224 atomic_inc(&inode->i_writecount); 1235 atomic_inc(&inode->i_writecount);
1225out: 1236out:
1226 perf_counter_mmap(vma); 1237 perf_event_mmap(vma);
1227 1238
1228 mm->total_vm += len >> PAGE_SHIFT; 1239 mm->total_vm += len >> PAGE_SHIFT;
1229 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1240 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
@@ -2114,6 +2125,7 @@ void exit_mmap(struct mm_struct *mm)
2114 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2125 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2115 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2126 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2116 vm_unacct_memory(nr_accounted); 2127 vm_unacct_memory(nr_accounted);
2128
2117 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); 2129 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
2118 tlb_finish_mmu(tlb, 0, end); 2130 tlb_finish_mmu(tlb, 0, end);
2119 2131
@@ -2311,7 +2323,7 @@ int install_special_mapping(struct mm_struct *mm,
2311 2323
2312 mm->total_vm += len >> PAGE_SHIFT; 2324 mm->total_vm += len >> PAGE_SHIFT;
2313 2325
2314 perf_counter_mmap(vma); 2326 perf_event_mmap(vma);
2315 2327
2316 return 0; 2328 return 0;
2317} 2329}
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
new file mode 100644
index 000000000000..ded9081f4021
--- /dev/null
+++ b/mm/mmu_context.c
@@ -0,0 +1,58 @@
1/* Copyright (C) 2009 Red Hat, Inc.
2 *
3 * See ../COPYING for licensing terms.
4 */
5
6#include <linux/mm.h>
7#include <linux/mmu_context.h>
8#include <linux/sched.h>
9
10#include <asm/mmu_context.h>
11
12/*
13 * use_mm
14 * Makes the calling kernel thread take on the specified
15 * mm context.
 16 * Called by the retry thread to execute retries within the
17 * iocb issuer's mm context, so that copy_from/to_user
18 * operations work seamlessly for aio.
19 * (Note: this routine is intended to be called only
20 * from a kernel thread context)
21 */
22void use_mm(struct mm_struct *mm)
23{
24 struct mm_struct *active_mm;
25 struct task_struct *tsk = current;
26
27 task_lock(tsk);
28 active_mm = tsk->active_mm;
29 if (active_mm != mm) {
30 atomic_inc(&mm->mm_count);
31 tsk->active_mm = mm;
32 }
33 tsk->mm = mm;
34 switch_mm(active_mm, mm, tsk);
35 task_unlock(tsk);
36
37 if (active_mm != mm)
38 mmdrop(active_mm);
39}
40
41/*
42 * unuse_mm
43 * Reverses the effect of use_mm, i.e. releases the
44 * specified mm context which was earlier taken on
45 * by the calling kernel thread
46 * (Note: this routine is intended to be called only
47 * from a kernel thread context)
48 */
49void unuse_mm(struct mm_struct *mm)
50{
51 struct task_struct *tsk = current;
52
53 task_lock(tsk);
54 tsk->mm = NULL;
55 /* active_mm is still 'mm' */
56 enter_lazy_tlb(mm, tsk);
57 task_unlock(tsk);
58}
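A hedged sketch of the intended use: a kernel thread temporarily adopts a user process's mm so copy_to_user()/copy_from_user() resolve against that address space, as the aio retry path does. struct my_ctx and my_worker_thread are hypothetical names for illustration:

#include <linux/mmu_context.h>
#include <linux/uaccess.h>

struct my_ctx {				/* hypothetical, not from this patch */
	struct mm_struct *mm;		/* issuer's mm, e.g. from get_task_mm() */
	void __user *user_buf;
	void *kbuf;
	size_t len;
	int err;
};

static int my_worker_thread(void *data)
{
	struct my_ctx *ctx = data;

	use_mm(ctx->mm);		/* adopt the issuer's address space */
	if (copy_to_user(ctx->user_buf, ctx->kbuf, ctx->len))
		ctx->err = -EFAULT;
	unuse_mm(ctx->mm);		/* back to the kernel thread's lazy mm */
	return 0;
}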
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5f4ef0250bee..7e33f2cb3c77 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -99,6 +99,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
99 return young; 99 return young;
100} 100}
101 101
102void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
103 pte_t pte)
104{
105 struct mmu_notifier *mn;
106 struct hlist_node *n;
107
108 rcu_read_lock();
109 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
110 if (mn->ops->change_pte)
111 mn->ops->change_pte(mn, mm, address, pte);
112 /*
113 * Some drivers don't have change_pte,
114 * so we must call invalidate_page in that case.
115 */
116 else if (mn->ops->invalidate_page)
117 mn->ops->invalidate_page(mn, mm, address);
118 }
119 rcu_read_unlock();
120}
121
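change_pte lets a driver with secondary page tables (KVM shadow pages being the motivating case) update its mapping in place instead of merely invalidating it; set_pte_at_notify() in do_wp_page and KSM's page replacement feed this hook. A hedged sketch of a notifier implementing it, with my_spte_update/my_spte_zap as hypothetical driver internals:

#include <linux/mmu_notifier.h>

extern void my_spte_update(struct mmu_notifier *mn, unsigned long address,
			   pte_t pte);
extern void my_spte_zap(struct mmu_notifier *mn, unsigned long address);

static void my_mn_change_pte(struct mmu_notifier *mn, struct mm_struct *mm,
			     unsigned long address, pte_t pte)
{
	/* repoint the secondary mapping at the new pfn, keeping it live */
	my_spte_update(mn, address, pte);
}

static void my_mn_invalidate_page(struct mmu_notifier *mn,
				  struct mm_struct *mm, unsigned long address)
{
	my_spte_zap(mn, address);	/* fallback path: drop, refault later */
}

static const struct mmu_notifier_ops my_mn_ops = {
	.invalidate_page	= my_mn_invalidate_page,
	.change_pte		= my_mn_change_pte,
};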
102void __mmu_notifier_invalidate_page(struct mm_struct *mm, 122void __mmu_notifier_invalidate_page(struct mm_struct *mm,
103 unsigned long address) 123 unsigned long address)
104{ 124{
diff --git a/mm/mprotect.c b/mm/mprotect.c
index d80311baeb2d..8bc969d8112d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,7 +23,7 @@
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h> 24#include <linux/mmu_notifier.h>
25#include <linux/migrate.h> 25#include <linux/migrate.h>
26#include <linux/perf_counter.h> 26#include <linux/perf_event.h>
27#include <asm/uaccess.h> 27#include <asm/uaccess.h>
28#include <asm/pgtable.h> 28#include <asm/pgtable.h>
29#include <asm/cacheflush.h> 29#include <asm/cacheflush.h>
@@ -300,7 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
300 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); 300 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
301 if (error) 301 if (error)
302 goto out; 302 goto out;
303 perf_counter_mmap(vma); 303 perf_event_mmap(vma);
304 nstart = tmp; 304 nstart = tmp;
305 305
306 if (nstart < prev->vm_end) 306 if (nstart < prev->vm_end)
diff --git a/mm/mremap.c b/mm/mremap.c
index a39b7b91be46..97bff2547719 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -11,6 +11,7 @@
11#include <linux/hugetlb.h> 11#include <linux/hugetlb.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/shm.h> 13#include <linux/shm.h>
14#include <linux/ksm.h>
14#include <linux/mman.h> 15#include <linux/mman.h>
15#include <linux/swap.h> 16#include <linux/swap.h>
16#include <linux/capability.h> 17#include <linux/capability.h>
@@ -85,8 +86,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
85 if (vma->vm_file) { 86 if (vma->vm_file) {
86 /* 87 /*
87 * Subtle point from Rajesh Venkatasubramanian: before 88 * Subtle point from Rajesh Venkatasubramanian: before
88 * moving file-based ptes, we must lock vmtruncate out, 89 * moving file-based ptes, we must lock truncate_pagecache
89 * since it might clean the dst vma before the src vma, 90 * out, since it might clean the dst vma before the src vma,
90 * and we propagate stale pages into the dst afterward. 91 * and we propagate stale pages into the dst afterward.
91 */ 92 */
92 mapping = vma->vm_file->f_mapping; 93 mapping = vma->vm_file->f_mapping;
@@ -174,6 +175,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
174 unsigned long excess = 0; 175 unsigned long excess = 0;
175 unsigned long hiwater_vm; 176 unsigned long hiwater_vm;
176 int split = 0; 177 int split = 0;
178 int err;
177 179
178 /* 180 /*
179 * We'd prefer to avoid failure later on in do_munmap: 181 * We'd prefer to avoid failure later on in do_munmap:
@@ -182,6 +184,18 @@ static unsigned long move_vma(struct vm_area_struct *vma,
182 if (mm->map_count >= sysctl_max_map_count - 3) 184 if (mm->map_count >= sysctl_max_map_count - 3)
183 return -ENOMEM; 185 return -ENOMEM;
184 186
187 /*
188 * Advise KSM to break any KSM pages in the area to be moved:
189 * it would be confusing if they were to turn up at the new
190 * location, where they happen to coincide with different KSM
191 * pages recently unmapped. But leave vma->vm_flags as it was,
192 * so KSM can come around to merge on vma and new_vma afterwards.
193 */
194 err = ksm_madvise(vma, old_addr, old_addr + old_len,
195 MADV_UNMERGEABLE, &vm_flags);
196 if (err)
197 return err;
198
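ksm_madvise() is the same entry point that userspace reaches through madvise(MADV_MERGEABLE)/madvise(MADV_UNMERGEABLE); move_vma() calls it directly so merged pages never travel with an mremap(). A hedged userspace illustration of opting a region in (advice values match this series' asm-generic/mman-common.h):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE   12
#define MADV_UNMERGEABLE 13
#endif

int main(void)
{
	size_t len = 32UL << 20;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 0x5a, len);			/* many identical pages */
	if (madvise(p, len, MADV_MERGEABLE))	/* fails if !CONFIG_KSM */
		perror("madvise(MADV_MERGEABLE)");
	/* merging also requires: echo 1 > /sys/kernel/mm/ksm/run */
	getchar();	/* watch /sys/kernel/mm/ksm/pages_sharing grow */
	return 0;
}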
185 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); 199 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
186 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); 200 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
187 if (!new_vma) 201 if (!new_vma)
diff --git a/mm/nommu.c b/mm/nommu.c
index 53cab10fece4..c73aa4753d79 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -33,6 +33,7 @@
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
34#include <asm/tlb.h> 34#include <asm/tlb.h>
35#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
36#include <asm/mmu_context.h>
36#include "internal.h" 37#include "internal.h"
37 38
38static inline __attribute__((format(printf, 1, 2))) 39static inline __attribute__((format(printf, 1, 2)))
@@ -56,12 +57,11 @@ void no_printk(const char *fmt, ...)
56 no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) 57 no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
57#endif 58#endif
58 59
59#include "internal.h"
60
61void *high_memory; 60void *high_memory;
62struct page *mem_map; 61struct page *mem_map;
63unsigned long max_mapnr; 62unsigned long max_mapnr;
64unsigned long num_physpages; 63unsigned long num_physpages;
64unsigned long highest_memmap_pfn;
65struct percpu_counter vm_committed_as; 65struct percpu_counter vm_committed_as;
66int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 66int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
67int sysctl_overcommit_ratio = 50; /* default is 50% */ 67int sysctl_overcommit_ratio = 50; /* default is 50% */
@@ -69,9 +69,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
69int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; 69int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
70int heap_stack_gap = 0; 70int heap_stack_gap = 0;
71 71
72/* amount of vm to protect from userspace access */
73unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
74
75atomic_long_t mmap_pages_allocated; 72atomic_long_t mmap_pages_allocated;
76 73
77EXPORT_SYMBOL(mem_map); 74EXPORT_SYMBOL(mem_map);
@@ -86,46 +83,6 @@ struct vm_operations_struct generic_file_vm_ops = {
86}; 83};
87 84
88/* 85/*
89 * Handle all mappings that got truncated by a "truncate()"
90 * system call.
91 *
92 * NOTE! We have to be ready to update the memory sharing
93 * between the file and the memory map for a potential last
94 * incomplete page. Ugly, but necessary.
95 */
96int vmtruncate(struct inode *inode, loff_t offset)
97{
98 struct address_space *mapping = inode->i_mapping;
99 unsigned long limit;
100
101 if (inode->i_size < offset)
102 goto do_expand;
103 i_size_write(inode, offset);
104
105 truncate_inode_pages(mapping, offset);
106 goto out_truncate;
107
108do_expand:
109 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
110 if (limit != RLIM_INFINITY && offset > limit)
111 goto out_sig;
112 if (offset > inode->i_sb->s_maxbytes)
113 goto out;
114 i_size_write(inode, offset);
115
116out_truncate:
117 if (inode->i_op->truncate)
118 inode->i_op->truncate(inode);
119 return 0;
120out_sig:
121 send_sig(SIGXFSZ, current, 0);
122out:
123 return -EFBIG;
124}
125
126EXPORT_SYMBOL(vmtruncate);
127
128/*
129 * Return the total memory allocated for this pointer, not 86 * Return the total memory allocated for this pointer, not
130 * just what the caller asked for. 87 * just what the caller asked for.
131 * 88 *
@@ -173,21 +130,20 @@ unsigned int kobjsize(const void *objp)
173} 130}
174 131
175int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 132int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
176 unsigned long start, int nr_pages, int flags, 133 unsigned long start, int nr_pages, unsigned int foll_flags,
177 struct page **pages, struct vm_area_struct **vmas) 134 struct page **pages, struct vm_area_struct **vmas)
178{ 135{
179 struct vm_area_struct *vma; 136 struct vm_area_struct *vma;
180 unsigned long vm_flags; 137 unsigned long vm_flags;
181 int i; 138 int i;
182 int write = !!(flags & GUP_FLAGS_WRITE);
183 int force = !!(flags & GUP_FLAGS_FORCE);
184 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
185 139
186 /* calculate required read or write permissions. 140 /* calculate required read or write permissions.
187 * - if 'force' is set, we only require the "MAY" flags. 141 * If FOLL_FORCE is set, we only require the "MAY" flags.
188 */ 142 */
189 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 143 vm_flags = (foll_flags & FOLL_WRITE) ?
190 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 144 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
145 vm_flags &= (foll_flags & FOLL_FORCE) ?
146 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
191 147
192 for (i = 0; i < nr_pages; i++) { 148 for (i = 0; i < nr_pages; i++) {
193 vma = find_vma(mm, start); 149 vma = find_vma(mm, start);
@@ -195,8 +151,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
195 goto finish_or_fault; 151 goto finish_or_fault;
196 152
197 /* protect what we can, including chardevs */ 153 /* protect what we can, including chardevs */
198 if (vma->vm_flags & (VM_IO | VM_PFNMAP) || 154 if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
199 (!ignore && !(vm_flags & vma->vm_flags))) 155 !(vm_flags & vma->vm_flags))
200 goto finish_or_fault; 156 goto finish_or_fault;
201 157
202 if (pages) { 158 if (pages) {
@@ -215,7 +171,6 @@ finish_or_fault:
215 return i ? : -EFAULT; 171 return i ? : -EFAULT;
216} 172}
217 173
218
219/* 174/*
220 * get a list of pages in an address range belonging to the specified process 175 * get a list of pages in an address range belonging to the specified process
221 * and indicate the VMA that covers each page 176 * and indicate the VMA that covers each page
@@ -230,9 +185,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
230 int flags = 0; 185 int flags = 0;
231 186
232 if (write) 187 if (write)
233 flags |= GUP_FLAGS_WRITE; 188 flags |= FOLL_WRITE;
234 if (force) 189 if (force)
235 flags |= GUP_FLAGS_FORCE; 190 flags |= FOLL_FORCE;
236 191
237 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 192 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
238} 193}
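The conversion above is mechanical: GUP_FLAGS_WRITE/GUP_FLAGS_FORCE become FOLL_WRITE/FOLL_FORCE, and the VMA permission mask required by __get_user_pages() is a pure function of those two bits. A standalone sketch of that derivation (the flag values below are illustrative constants, not copied from the kernel headers):

#include <stdio.h>

/* illustrative bit values; the kernel defines the real ones in linux/mm.h */
#define FOLL_WRITE	0x01
#define FOLL_FORCE	0x10

#define VM_READ		0x01
#define VM_WRITE	0x02
#define VM_MAYREAD	0x10
#define VM_MAYWRITE	0x20

/* same derivation as __get_user_pages(): FOLL_FORCE relaxes the check to
 * the MAY bits, so a forced write only needs VM_MAYWRITE on the VMA */
static unsigned long required_vm_flags(unsigned int foll_flags)
{
	unsigned long vm_flags;

	vm_flags = (foll_flags & FOLL_WRITE) ?
			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= (foll_flags & FOLL_FORCE) ?
			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
	return vm_flags;
}

int main(void)
{
	printf("plain read   -> %#lx\n", required_vm_flags(0));
	printf("plain write  -> %#lx\n", required_vm_flags(FOLL_WRITE));
	printf("forced write -> %#lx\n", required_vm_flags(FOLL_WRITE | FOLL_FORCE));
	return 0;
}

A VMA then passes the permission check when vm_flags & vma->vm_flags is non-zero, which is exactly the test retained in the hunk above.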
@@ -630,6 +585,22 @@ static void put_nommu_region(struct vm_region *region)
630} 585}
631 586
632/* 587/*
588 * update protection on a vma
589 */
590static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
591{
592#ifdef CONFIG_MPU
593 struct mm_struct *mm = vma->vm_mm;
594 long start = vma->vm_start & PAGE_MASK;
595 while (start < vma->vm_end) {
596 protect_page(mm, start, flags);
597 start += PAGE_SIZE;
598 }
599 update_protections(mm);
600#endif
601}
602
603/*
633 * add a VMA into a process's mm_struct in the appropriate place in the list 604 * add a VMA into a process's mm_struct in the appropriate place in the list
634 * and tree and add to the address space's page tree also if not an anonymous 605 * and tree and add to the address space's page tree also if not an anonymous
635 * page 606 * page
@@ -648,6 +619,8 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
648 mm->map_count++; 619 mm->map_count++;
649 vma->vm_mm = mm; 620 vma->vm_mm = mm;
650 621
622 protect_vma(vma, vma->vm_flags);
623
651 /* add the VMA to the mapping */ 624 /* add the VMA to the mapping */
652 if (vma->vm_file) { 625 if (vma->vm_file) {
653 mapping = vma->vm_file->f_mapping; 626 mapping = vma->vm_file->f_mapping;
@@ -710,6 +683,8 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
710 683
711 kenter("%p", vma); 684 kenter("%p", vma);
712 685
686 protect_vma(vma, 0);
687
713 mm->map_count--; 688 mm->map_count--;
714 if (mm->mmap_cache == vma) 689 if (mm->mmap_cache == vma)
715 mm->mmap_cache = NULL; 690 mm->mmap_cache = NULL;
@@ -851,7 +826,7 @@ static int validate_mmap_request(struct file *file,
851 int ret; 826 int ret;
852 827
853 /* do the simple checks first */ 828 /* do the simple checks first */
854 if (flags & MAP_FIXED || addr) { 829 if (flags & MAP_FIXED) {
855 printk(KERN_DEBUG 830 printk(KERN_DEBUG
856 "%d: Can't do fixed-address/overlay mmap of RAM\n", 831 "%d: Can't do fixed-address/overlay mmap of RAM\n",
857 current->pid); 832 current->pid);
@@ -922,6 +897,10 @@ static int validate_mmap_request(struct file *file,
922 if (!file->f_op->read) 897 if (!file->f_op->read)
923 capabilities &= ~BDI_CAP_MAP_COPY; 898 capabilities &= ~BDI_CAP_MAP_COPY;
924 899
900 /* The file shall have been opened with read permission. */
901 if (!(file->f_mode & FMODE_READ))
902 return -EACCES;
903
925 if (flags & MAP_SHARED) { 904 if (flags & MAP_SHARED) {
926 /* do checks for writing, appending and locking */ 905 /* do checks for writing, appending and locking */
927 if ((prot & PROT_WRITE) && 906 if ((prot & PROT_WRITE) &&
@@ -1055,7 +1034,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
1055 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1034 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1056 if (ret == 0) { 1035 if (ret == 0) {
1057 vma->vm_region->vm_top = vma->vm_region->vm_end; 1036 vma->vm_region->vm_top = vma->vm_region->vm_end;
1058 return ret; 1037 return 0;
1059 } 1038 }
1060 if (ret != -ENOSYS) 1039 if (ret != -ENOSYS)
1061 return ret; 1040 return ret;
@@ -1072,7 +1051,8 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
1072 */ 1051 */
1073static int do_mmap_private(struct vm_area_struct *vma, 1052static int do_mmap_private(struct vm_area_struct *vma,
1074 struct vm_region *region, 1053 struct vm_region *region,
1075 unsigned long len) 1054 unsigned long len,
1055 unsigned long capabilities)
1076{ 1056{
1077 struct page *pages; 1057 struct page *pages;
1078 unsigned long total, point, n, rlen; 1058 unsigned long total, point, n, rlen;
@@ -1083,13 +1063,13 @@ static int do_mmap_private(struct vm_area_struct *vma,
1083 * shared mappings on devices or memory 1063 * shared mappings on devices or memory
1084 * - VM_MAYSHARE will be set if it may attempt to share 1064 * - VM_MAYSHARE will be set if it may attempt to share
1085 */ 1065 */
1086 if (vma->vm_file) { 1066 if (capabilities & BDI_CAP_MAP_DIRECT) {
1087 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1067 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1088 if (ret == 0) { 1068 if (ret == 0) {
1089 /* shouldn't return success if we're not sharing */ 1069 /* shouldn't return success if we're not sharing */
1090 BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); 1070 BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
1091 vma->vm_region->vm_top = vma->vm_region->vm_end; 1071 vma->vm_region->vm_top = vma->vm_region->vm_end;
1092 return ret; 1072 return 0;
1093 } 1073 }
1094 if (ret != -ENOSYS) 1074 if (ret != -ENOSYS)
1095 return ret; 1075 return ret;
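Both do_mmap_shared_file() and the branch above follow the same convention: ask the driver's ->mmap() for a direct mapping first, treat -ENOSYS as "not supported, fall back to a copy", and propagate any other error. A compact standalone sketch of that pattern (try_direct_map() and make_private_copy() are hypothetical stand-ins, not kernel functions):

#include <stdio.h>
#include <errno.h>

/* hypothetical stand-ins for f_op->mmap() and the bounce-buffer fallback */
static int try_direct_map(void)		{ return -ENOSYS; }	/* "can't map directly" */
static int make_private_copy(void)	{ return 0; }

static int map_region(int cap_map_direct)
{
	int ret;

	if (cap_map_direct) {
		ret = try_direct_map();
		if (ret == 0)
			return 0;	/* driver mapped it for us */
		if (ret != -ENOSYS)
			return ret;	/* real error: give up */
		/* -ENOSYS means "not supported": fall back to a copy */
	}
	return make_private_copy();
}

int main(void)
{
	printf("mapped: %d\n", map_region(1));
	return 0;
}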
@@ -1202,9 +1182,6 @@ unsigned long do_mmap_pgoff(struct file *file,
1202 1182
1203 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); 1183 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
1204 1184
1205 if (!(flags & MAP_FIXED))
1206 addr = round_hint_to_min(addr);
1207
1208 /* decide whether we should attempt the mapping, and if so what sort of 1185 /* decide whether we should attempt the mapping, and if so what sort of
1209 * mapping */ 1186 * mapping */
1210 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, 1187 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
@@ -1214,6 +1191,9 @@ unsigned long do_mmap_pgoff(struct file *file,
1214 return ret; 1191 return ret;
1215 } 1192 }
1216 1193
1194 /* we ignore the address hint */
1195 addr = 0;
1196
1217 /* we've determined that we can make the mapping, now translate what we 1197 /* we've determined that we can make the mapping, now translate what we
1218 * now know into VMA flags */ 1198 * now know into VMA flags */
1219 vm_flags = determine_vm_flags(file, prot, flags, capabilities); 1199 vm_flags = determine_vm_flags(file, prot, flags, capabilities);
@@ -1327,7 +1307,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1327 * - this is the hook for quasi-memory character devices to 1307 * - this is the hook for quasi-memory character devices to
1328 * tell us the location of a shared mapping 1308 * tell us the location of a shared mapping
1329 */ 1309 */
1330 if (file && file->f_op->get_unmapped_area) { 1310 if (capabilities & BDI_CAP_MAP_DIRECT) {
1331 addr = file->f_op->get_unmapped_area(file, addr, len, 1311 addr = file->f_op->get_unmapped_area(file, addr, len,
1332 pgoff, flags); 1312 pgoff, flags);
1333 if (IS_ERR((void *) addr)) { 1313 if (IS_ERR((void *) addr)) {
@@ -1352,14 +1332,15 @@ unsigned long do_mmap_pgoff(struct file *file,
1352 1332
1353 vma->vm_region = region; 1333 vma->vm_region = region;
1354 1334
1355 /* set up the mapping */ 1335 /* set up the mapping
1336 * - the region is filled in if BDI_CAP_MAP_DIRECT is still set
1337 */
1356 if (file && vma->vm_flags & VM_SHARED) 1338 if (file && vma->vm_flags & VM_SHARED)
1357 ret = do_mmap_shared_file(vma); 1339 ret = do_mmap_shared_file(vma);
1358 else 1340 else
1359 ret = do_mmap_private(vma, region, len); 1341 ret = do_mmap_private(vma, region, len, capabilities);
1360 if (ret < 0) 1342 if (ret < 0)
1361 goto error_put_region; 1343 goto error_just_free;
1362
1363 add_nommu_region(region); 1344 add_nommu_region(region);
1364 1345
1365 /* okay... we have a mapping; now we have to register it */ 1346 /* okay... we have a mapping; now we have to register it */
@@ -1378,19 +1359,6 @@ share:
1378 kleave(" = %lx", result); 1359 kleave(" = %lx", result);
1379 return result; 1360 return result;
1380 1361
1381error_put_region:
1382 __put_nommu_region(region);
1383 if (vma) {
1384 if (vma->vm_file) {
1385 fput(vma->vm_file);
1386 if (vma->vm_flags & VM_EXECUTABLE)
1387 removed_exe_file_vma(vma->vm_mm);
1388 }
1389 kmem_cache_free(vm_area_cachep, vma);
1390 }
1391 kleave(" = %d [pr]", ret);
1392 return ret;
1393
1394error_just_free: 1362error_just_free:
1395 up_write(&nommu_region_sem); 1363 up_write(&nommu_region_sem);
1396error: 1364error:
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 175a67a78a99..ea2147dabba6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -34,6 +34,23 @@ int sysctl_oom_dump_tasks;
34static DEFINE_SPINLOCK(zone_scan_lock); 34static DEFINE_SPINLOCK(zone_scan_lock);
35/* #define DEBUG */ 35/* #define DEBUG */
36 36
37/*
38 * Do the allowed memory nodes of any thread in the target process overlap ours?
39 */
40static int has_intersects_mems_allowed(struct task_struct *tsk)
41{
42 struct task_struct *t;
43
44 t = tsk;
45 do {
46 if (cpuset_mems_allowed_intersects(current, t))
47 return 1;
48 t = next_thread(t);
49 } while (t != tsk);
50
51 return 0;
52}
53
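The helper above walks the target's circular thread list exactly once via next_thread() and answers "does any thread's allowed node set intersect ours?". A standalone model of that walk, with node masks as plain bitmasks (toy types here, not the kernel's task_struct or nodemask_t):

#include <stdio.h>
#include <stdbool.h>

/* toy stand-in for the thread-group links and cpuset node mask */
struct task {
	unsigned long mems_allowed;	/* bitmask of memory nodes */
	struct task *next_thread;	/* circular list around the group */
};

static bool has_intersects_mems_allowed(const struct task *current_task,
					const struct task *tsk)
{
	const struct task *t = tsk;

	do {
		if (current_task->mems_allowed & t->mems_allowed)
			return true;
		t = t->next_thread;
	} while (t != tsk);

	return false;
}

int main(void)
{
	struct task t1 = { .mems_allowed = 0x1 };	/* node 0 only */
	struct task t2 = { .mems_allowed = 0x2 };	/* node 1 only */
	struct task me = { .mems_allowed = 0x2 };

	t1.next_thread = &t2;
	t2.next_thread = &t1;	/* two-thread group, circular */
	me.next_thread = &me;

	printf("%d\n", has_intersects_mems_allowed(&me, &t1));	/* 1: t2 shares node 1 */
	return 0;
}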
37/** 54/**
38 * badness - calculate a numeric value for how bad this task has been 55 * badness - calculate a numeric value for how bad this task has been
39 * @p: task struct of which task we should calculate 56 * @p: task struct of which task we should calculate
@@ -58,7 +75,13 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 unsigned long points, cpu_time, run_time; 75 unsigned long points, cpu_time, run_time;
59 struct mm_struct *mm; 76 struct mm_struct *mm;
60 struct task_struct *child; 77 struct task_struct *child;
61 int oom_adj; 78 int oom_adj = p->signal->oom_adj;
79 struct task_cputime task_time;
80 unsigned long utime;
81 unsigned long stime;
82
83 if (oom_adj == OOM_DISABLE)
84 return 0;
62 85
63 task_lock(p); 86 task_lock(p);
64 mm = p->mm; 87 mm = p->mm;
@@ -66,11 +89,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
66 task_unlock(p); 89 task_unlock(p);
67 return 0; 90 return 0;
68 } 91 }
69 oom_adj = mm->oom_adj;
70 if (oom_adj == OOM_DISABLE) {
71 task_unlock(p);
72 return 0;
73 }
74 92
75 /* 93 /*
76 * The memory size of the process is the basis for the badness. 94 * The memory size of the process is the basis for the badness.
@@ -85,7 +103,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
85 /* 103 /*
86 * swapoff can easily use up all memory, so kill those first. 104 * swapoff can easily use up all memory, so kill those first.
87 */ 105 */
88 if (p->flags & PF_SWAPOFF) 106 if (p->flags & PF_OOM_ORIGIN)
89 return ULONG_MAX; 107 return ULONG_MAX;
90 108
91 /* 109 /*
@@ -108,8 +126,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
108 * of seconds. There is no particular reason for this other than 126 * of seconds. There is no particular reason for this other than
109 * that it turned out to work very well in practice. 127 * that it turned out to work very well in practice.
110 */ 128 */
111 cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime)) 129 thread_group_cputime(p, &task_time);
112 >> (SHIFT_HZ + 3); 130 utime = cputime_to_jiffies(task_time.utime);
131 stime = cputime_to_jiffies(task_time.stime);
132 cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
133
113 134
114 if (uptime >= p->start_time.tv_sec) 135 if (uptime >= p->start_time.tv_sec)
115 run_time = (uptime - p->start_time.tv_sec) >> 10; 136 run_time = (uptime - p->start_time.tv_sec) >> 10;
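The shift makes the CPU-time contribution very coarse. With HZ=1000, SHIFT_HZ is 10, so the sum of the whole thread group's utime and stime (in jiffies) is divided by 2^13 = 8192, i.e. roughly one unit per eight seconds of CPU consumed. A quick check of that arithmetic (HZ and SHIFT_HZ are assumed for the example, not taken from a real config):

#include <stdio.h>

#define SHIFT_HZ 10	/* assumption: HZ = 1000, so 1 << 10 is the nearest power of two */

int main(void)
{
	unsigned long utime = 5000, stime = 3192;	/* jiffies: ~8.2 s of CPU at HZ=1000 */
	unsigned long cpu_time = (utime + stime) >> (SHIFT_HZ + 3);

	printf("cpu_time = %lu\n", cpu_time);		/* prints 1 */
	return 0;
}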
@@ -150,7 +171,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
150 * because p may have allocated or otherwise mapped memory on 171 * because p may have allocated or otherwise mapped memory on
151 * this node before. However it will be less likely. 172 * this node before. However it will be less likely.
152 */ 173 */
153 if (!cpuset_mems_allowed_intersects(current, p)) 174 if (!has_intersects_mems_allowed(p))
154 points /= 8; 175 points /= 8;
155 176
156 /* 177 /*
@@ -206,13 +227,13 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
206static struct task_struct *select_bad_process(unsigned long *ppoints, 227static struct task_struct *select_bad_process(unsigned long *ppoints,
207 struct mem_cgroup *mem) 228 struct mem_cgroup *mem)
208{ 229{
209 struct task_struct *g, *p; 230 struct task_struct *p;
210 struct task_struct *chosen = NULL; 231 struct task_struct *chosen = NULL;
211 struct timespec uptime; 232 struct timespec uptime;
212 *ppoints = 0; 233 *ppoints = 0;
213 234
214 do_posix_clock_monotonic_gettime(&uptime); 235 do_posix_clock_monotonic_gettime(&uptime);
215 do_each_thread(g, p) { 236 for_each_process(p) {
216 unsigned long points; 237 unsigned long points;
217 238
218 /* 239 /*
@@ -257,12 +278,15 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
257 *ppoints = ULONG_MAX; 278 *ppoints = ULONG_MAX;
258 } 279 }
259 280
281 if (p->signal->oom_adj == OOM_DISABLE)
282 continue;
283
260 points = badness(p, uptime.tv_sec); 284 points = badness(p, uptime.tv_sec);
261 if (points > *ppoints) { 285 if (points > *ppoints || !chosen) {
262 chosen = p; 286 chosen = p;
263 *ppoints = points; 287 *ppoints = points;
264 } 288 }
265 } while_each_thread(g, p); 289 }
266 290
267 return chosen; 291 return chosen;
268} 292}
@@ -307,7 +331,8 @@ static void dump_tasks(const struct mem_cgroup *mem)
307 } 331 }
308 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 332 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
309 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, 333 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
310 get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); 334 get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
335 p->comm);
311 task_unlock(p); 336 task_unlock(p);
312 } while_each_thread(g, p); 337 } while_each_thread(g, p);
313} 338}
@@ -325,8 +350,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
325 return; 350 return;
326 } 351 }
327 352
328 if (!p->mm) 353 if (!p->mm) {
354 WARN_ON(1);
355 printk(KERN_WARNING "tried to kill an mm-less task!\n");
329 return; 356 return;
357 }
330 358
331 if (verbose) 359 if (verbose)
332 printk(KERN_ERR "Killed process %d (%s)\n", 360 printk(KERN_ERR "Killed process %d (%s)\n",
@@ -345,27 +373,18 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
345 373
346static int oom_kill_task(struct task_struct *p) 374static int oom_kill_task(struct task_struct *p)
347{ 375{
348 struct mm_struct *mm; 376 /* WARNING: mm may not be dereferenced since we did not obtain its
349 struct task_struct *g, *q; 377 * value from get_task_mm(p). This is OK since all we need to do is
350 378 * compare mm to q->mm below.
351 task_lock(p); 379 *
352 mm = p->mm; 380 * Furthermore, even if mm contains a non-NULL value, p->mm may
353 if (!mm || mm->oom_adj == OOM_DISABLE) { 381 * change to NULL at any time since we do not hold task_lock(p).
354 task_unlock(p); 382 * However, this is of no concern to us.
383 */
384 if (!p->mm || p->signal->oom_adj == OOM_DISABLE)
355 return 1; 385 return 1;
356 }
357 task_unlock(p);
358 __oom_kill_task(p, 1);
359 386
360 /* 387 __oom_kill_task(p, 1);
361 * kill all processes that share the ->mm (i.e. all threads),
362 * but are in a different thread group. Don't let them have access
363 * to memory reserves though, otherwise we might deplete all memory.
364 */
365 do_each_thread(g, q) {
366 if (q->mm == mm && !same_thread_group(q, p))
367 force_sig(SIGKILL, q);
368 } while_each_thread(g, q);
369 388
370 return 0; 389 return 0;
371} 390}
@@ -377,11 +396,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
377 struct task_struct *c; 396 struct task_struct *c;
378 397
379 if (printk_ratelimit()) { 398 if (printk_ratelimit()) {
380 task_lock(current);
381 printk(KERN_WARNING "%s invoked oom-killer: " 399 printk(KERN_WARNING "%s invoked oom-killer: "
382 "gfp_mask=0x%x, order=%d, oom_adj=%d\n", 400 "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
383 current->comm, gfp_mask, order, 401 current->comm, gfp_mask, order,
384 current->mm ? current->mm->oom_adj : OOM_DISABLE); 402 current->signal->oom_adj);
403 task_lock(current);
385 cpuset_print_task_mems_allowed(current); 404 cpuset_print_task_mems_allowed(current);
386 task_unlock(current); 405 task_unlock(current);
387 dump_stack(); 406 dump_stack();
@@ -394,9 +413,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
394 /* 413 /*
395 * If the task is already exiting, don't alarm the sysadmin or kill 414 * If the task is already exiting, don't alarm the sysadmin or kill
396 * its children or threads, just set TIF_MEMDIE so it can die quickly 415 * its children or threads, just set TIF_MEMDIE so it can die quickly
397 * if its mm is still attached.
398 */ 416 */
399 if (p->mm && (p->flags & PF_EXITING)) { 417 if (p->flags & PF_EXITING) {
400 __oom_kill_task(p, 0); 418 __oom_kill_task(p, 0);
401 return 0; 419 return 0;
402 } 420 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 81627ebcd313..a3b14090b1fb 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,15 +36,6 @@
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37 37
38/* 38/*
39 * The maximum number of pages to writeout in a single bdflush/kupdate
40 * operation. We do this so we don't hold I_SYNC against an inode for
41 * enormous amounts of time, which would block a userspace task which has
42 * been forced to throttle against that inode. Also, the code reevaluates
43 * the dirty each time it has written this many pages.
44 */
45#define MAX_WRITEBACK_PAGES 1024
46
47/*
48 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 39 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
49 * will look to see if it needs to force writeback or throttling. 40 * will look to see if it needs to force writeback or throttling.
50 */ 41 */
@@ -53,18 +44,21 @@ static long ratelimit_pages = 32;
53/* 44/*
54 * When balance_dirty_pages decides that the caller needs to perform some 45 * When balance_dirty_pages decides that the caller needs to perform some
55 * non-background writeback, this is how many pages it will attempt to write. 46 * non-background writeback, this is how many pages it will attempt to write.
56 * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably 47 * It should be somewhat larger than dirtied pages to ensure that reasonably
57 * large amounts of I/O are submitted. 48 * large amounts of I/O are submitted.
58 */ 49 */
59static inline long sync_writeback_pages(void) 50static inline long sync_writeback_pages(unsigned long dirtied)
60{ 51{
61 return ratelimit_pages + ratelimit_pages / 2; 52 if (dirtied < ratelimit_pages)
53 dirtied = ratelimit_pages;
54
55 return dirtied + dirtied / 2;
62} 56}
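In other words, the write chunk is now 1.5x whatever the task actually dirtied, with ratelimit_pages as a floor, instead of a fixed 1.5x ratelimit_pages. A tiny standalone check (ratelimit_pages = 32 mirrors the static default shown in the hunk context above):

#include <stdio.h>

static long ratelimit_pages = 32;

static long sync_writeback_pages(unsigned long dirtied)
{
	if (dirtied < ratelimit_pages)
		dirtied = ratelimit_pages;
	return dirtied + dirtied / 2;
}

int main(void)
{
	printf("%ld\n", sync_writeback_pages(8));	/* 48: the ratelimit_pages floor applies */
	printf("%ld\n", sync_writeback_pages(1024));	/* 1536: 1.5x what was actually dirtied */
	return 0;
}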
63 57
64/* The following parameters are exported via /proc/sys/vm */ 58/* The following parameters are exported via /proc/sys/vm */
65 59
66/* 60/*
67 * Start background writeback (via pdflush) at this percentage 61 * Start background writeback (via writeback threads) at this percentage
68 */ 62 */
69int dirty_background_ratio = 10; 63int dirty_background_ratio = 10;
70 64
@@ -117,8 +111,6 @@ EXPORT_SYMBOL(laptop_mode);
117/* End of sysctl-exported parameters */ 111/* End of sysctl-exported parameters */
118 112
119 113
120static void background_writeout(unsigned long _min_pages);
121
122/* 114/*
123 * Scale the writeback cache size proportional to the relative writeout speeds. 115 * Scale the writeback cache size proportional to the relative writeout speeds.
124 * 116 *
@@ -166,37 +158,37 @@ static void update_completion_period(void)
166} 158}
167 159
168int dirty_background_ratio_handler(struct ctl_table *table, int write, 160int dirty_background_ratio_handler(struct ctl_table *table, int write,
169 struct file *filp, void __user *buffer, size_t *lenp, 161 void __user *buffer, size_t *lenp,
170 loff_t *ppos) 162 loff_t *ppos)
171{ 163{
172 int ret; 164 int ret;
173 165
174 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 166 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
175 if (ret == 0 && write) 167 if (ret == 0 && write)
176 dirty_background_bytes = 0; 168 dirty_background_bytes = 0;
177 return ret; 169 return ret;
178} 170}
179 171
180int dirty_background_bytes_handler(struct ctl_table *table, int write, 172int dirty_background_bytes_handler(struct ctl_table *table, int write,
181 struct file *filp, void __user *buffer, size_t *lenp, 173 void __user *buffer, size_t *lenp,
182 loff_t *ppos) 174 loff_t *ppos)
183{ 175{
184 int ret; 176 int ret;
185 177
186 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 178 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
187 if (ret == 0 && write) 179 if (ret == 0 && write)
188 dirty_background_ratio = 0; 180 dirty_background_ratio = 0;
189 return ret; 181 return ret;
190} 182}
191 183
192int dirty_ratio_handler(struct ctl_table *table, int write, 184int dirty_ratio_handler(struct ctl_table *table, int write,
193 struct file *filp, void __user *buffer, size_t *lenp, 185 void __user *buffer, size_t *lenp,
194 loff_t *ppos) 186 loff_t *ppos)
195{ 187{
196 int old_ratio = vm_dirty_ratio; 188 int old_ratio = vm_dirty_ratio;
197 int ret; 189 int ret;
198 190
199 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 191 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
200 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 192 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
201 update_completion_period(); 193 update_completion_period();
202 vm_dirty_bytes = 0; 194 vm_dirty_bytes = 0;
@@ -206,13 +198,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
206 198
207 199
208int dirty_bytes_handler(struct ctl_table *table, int write, 200int dirty_bytes_handler(struct ctl_table *table, int write,
209 struct file *filp, void __user *buffer, size_t *lenp, 201 void __user *buffer, size_t *lenp,
210 loff_t *ppos) 202 loff_t *ppos)
211{ 203{
212 unsigned long old_bytes = vm_dirty_bytes; 204 unsigned long old_bytes = vm_dirty_bytes;
213 int ret; 205 int ret;
214 206
215 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 207 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
216 if (ret == 0 && write && vm_dirty_bytes != old_bytes) { 208 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
217 update_completion_period(); 209 update_completion_period();
218 vm_dirty_ratio = 0; 210 vm_dirty_ratio = 0;
@@ -320,15 +312,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
320/* 312/*
321 * 313 *
322 */ 314 */
323static DEFINE_SPINLOCK(bdi_lock);
324static unsigned int bdi_min_ratio; 315static unsigned int bdi_min_ratio;
325 316
326int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) 317int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
327{ 318{
328 int ret = 0; 319 int ret = 0;
329 unsigned long flags;
330 320
331 spin_lock_irqsave(&bdi_lock, flags); 321 spin_lock_bh(&bdi_lock);
332 if (min_ratio > bdi->max_ratio) { 322 if (min_ratio > bdi->max_ratio) {
333 ret = -EINVAL; 323 ret = -EINVAL;
334 } else { 324 } else {
@@ -340,27 +330,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
340 ret = -EINVAL; 330 ret = -EINVAL;
341 } 331 }
342 } 332 }
343 spin_unlock_irqrestore(&bdi_lock, flags); 333 spin_unlock_bh(&bdi_lock);
344 334
345 return ret; 335 return ret;
346} 336}
347 337
348int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) 338int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
349{ 339{
350 unsigned long flags;
351 int ret = 0; 340 int ret = 0;
352 341
353 if (max_ratio > 100) 342 if (max_ratio > 100)
354 return -EINVAL; 343 return -EINVAL;
355 344
356 spin_lock_irqsave(&bdi_lock, flags); 345 spin_lock_bh(&bdi_lock);
357 if (bdi->min_ratio > max_ratio) { 346 if (bdi->min_ratio > max_ratio) {
358 ret = -EINVAL; 347 ret = -EINVAL;
359 } else { 348 } else {
360 bdi->max_ratio = max_ratio; 349 bdi->max_ratio = max_ratio;
361 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; 350 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
362 } 351 }
363 spin_unlock_irqrestore(&bdi_lock, flags); 352 spin_unlock_bh(&bdi_lock);
364 353
365 return ret; 354 return ret;
366} 355}
@@ -394,7 +383,8 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
394 struct zone *z = 383 struct zone *z =
395 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; 384 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
396 385
397 x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); 386 x += zone_page_state(z, NR_FREE_PAGES) +
387 zone_reclaimable_pages(z);
398 } 388 }
399 /* 389 /*
400 * Make sure that the number of highmem pages is never larger 390 * Make sure that the number of highmem pages is never larger
@@ -418,7 +408,7 @@ unsigned long determine_dirtyable_memory(void)
418{ 408{
419 unsigned long x; 409 unsigned long x;
420 410
421 x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); 411 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
422 412
423 if (!vm_highmem_is_dirtyable) 413 if (!vm_highmem_is_dirtyable)
424 x -= highmem_dirtyable_memory(x); 414 x -= highmem_dirtyable_memory(x);
@@ -487,10 +477,11 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
487 * balance_dirty_pages() must be called by processes which are generating dirty 477 * balance_dirty_pages() must be called by processes which are generating dirty
488 * data. It looks at the number of dirty pages in the machine and will force 478 * data. It looks at the number of dirty pages in the machine and will force
489 * the caller to perform writeback if the system is over `vm_dirty_ratio'. 479 * the caller to perform writeback if the system is over `vm_dirty_ratio'.
490 * If we're over `background_thresh' then pdflush is woken to perform some 480 * If we're over `background_thresh' then the writeback threads are woken to
491 * writeout. 481 * perform some writeout.
492 */ 482 */
493static void balance_dirty_pages(struct address_space *mapping) 483static void balance_dirty_pages(struct address_space *mapping,
484 unsigned long write_chunk)
494{ 485{
495 long nr_reclaimable, bdi_nr_reclaimable; 486 long nr_reclaimable, bdi_nr_reclaimable;
496 long nr_writeback, bdi_nr_writeback; 487 long nr_writeback, bdi_nr_writeback;
@@ -498,7 +489,7 @@ static void balance_dirty_pages(struct address_space *mapping)
498 unsigned long dirty_thresh; 489 unsigned long dirty_thresh;
499 unsigned long bdi_thresh; 490 unsigned long bdi_thresh;
500 unsigned long pages_written = 0; 491 unsigned long pages_written = 0;
501 unsigned long write_chunk = sync_writeback_pages(); 492 unsigned long pause = 1;
502 493
503 struct backing_dev_info *bdi = mapping->backing_dev_info; 494 struct backing_dev_info *bdi = mapping->backing_dev_info;
504 495
@@ -546,7 +537,7 @@ static void balance_dirty_pages(struct address_space *mapping)
546 * up. 537 * up.
547 */ 538 */
548 if (bdi_nr_reclaimable > bdi_thresh) { 539 if (bdi_nr_reclaimable > bdi_thresh) {
549 writeback_inodes(&wbc); 540 writeback_inodes_wbc(&wbc);
550 pages_written += write_chunk - wbc.nr_to_write; 541 pages_written += write_chunk - wbc.nr_to_write;
551 get_dirty_limits(&background_thresh, &dirty_thresh, 542 get_dirty_limits(&background_thresh, &dirty_thresh,
552 &bdi_thresh, bdi); 543 &bdi_thresh, bdi);
@@ -575,7 +566,15 @@ static void balance_dirty_pages(struct address_space *mapping)
575 if (pages_written >= write_chunk) 566 if (pages_written >= write_chunk)
576 break; /* We've done our duty */ 567 break; /* We've done our duty */
577 568
578 congestion_wait(BLK_RW_ASYNC, HZ/10); 569 schedule_timeout_interruptible(pause);
570
571 /*
572 * Increase the delay for each loop, up to our previous
573 * default of taking a 100ms nap.
574 */
575 pause <<= 1;
576 if (pause > HZ / 10)
577 pause = HZ / 10;
579 } 578 }
580 579
581 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 580 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
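The replacement for the old fixed congestion_wait(BLK_RW_ASYNC, HZ/10) is a simple exponential backoff: the throttled task sleeps one jiffy on the first pass, doubles the sleep each time round the loop, and saturates at the previous HZ/10 value. The schedule this produces is easy to print (HZ = 1000 is an assumption for the illustration):

#include <stdio.h>

#define HZ 1000		/* assumed; the cap below is then 100 jiffies = 100 ms */

int main(void)
{
	unsigned long pause = 1;
	int loop;

	/* sleeps of 1, 2, 4, ... jiffies, saturating at HZ/10 */
	for (loop = 0; loop < 10; loop++) {
		printf("pass %d: sleep %lu jiffies\n", loop, pause);
		pause <<= 1;
		if (pause > HZ / 10)
			pause = HZ / 10;
	}
	return 0;
}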
@@ -583,7 +582,7 @@ static void balance_dirty_pages(struct address_space *mapping)
583 bdi->dirty_exceeded = 0; 582 bdi->dirty_exceeded = 0;
584 583
585 if (writeback_in_progress(bdi)) 584 if (writeback_in_progress(bdi))
586 return; /* pdflush is already working this queue */ 585 return;
587 586
588 /* 587 /*
589 * In laptop mode, we wait until hitting the higher threshold before 588 * In laptop mode, we wait until hitting the higher threshold before
@@ -594,10 +593,10 @@ static void balance_dirty_pages(struct address_space *mapping)
594 * background_thresh, to keep the amount of dirty memory low. 593 * background_thresh, to keep the amount of dirty memory low.
595 */ 594 */
596 if ((laptop_mode && pages_written) || 595 if ((laptop_mode && pages_written) ||
597 (!laptop_mode && (global_page_state(NR_FILE_DIRTY) 596 (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
598 + global_page_state(NR_UNSTABLE_NFS) 597 + global_page_state(NR_UNSTABLE_NFS))
599 > background_thresh))) 598 > background_thresh)))
600 pdflush_operation(background_writeout, 0); 599 bdi_start_writeback(bdi, NULL, 0);
601} 600}
602 601
603void set_page_dirty_balance(struct page *page, int page_mkwrite) 602void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -610,6 +609,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
610 } 609 }
611} 610}
612 611
612static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
613
613/** 614/**
614 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 615 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
615 * @mapping: address_space which was dirtied 616 * @mapping: address_space which was dirtied
@@ -627,7 +628,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
627void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 628void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
628 unsigned long nr_pages_dirtied) 629 unsigned long nr_pages_dirtied)
629{ 630{
630 static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
631 unsigned long ratelimit; 631 unsigned long ratelimit;
632 unsigned long *p; 632 unsigned long *p;
633 633
@@ -640,12 +640,13 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
640 * tasks in balance_dirty_pages(). Period. 640 * tasks in balance_dirty_pages(). Period.
641 */ 641 */
642 preempt_disable(); 642 preempt_disable();
643 p = &__get_cpu_var(ratelimits); 643 p = &__get_cpu_var(bdp_ratelimits);
644 *p += nr_pages_dirtied; 644 *p += nr_pages_dirtied;
645 if (unlikely(*p >= ratelimit)) { 645 if (unlikely(*p >= ratelimit)) {
646 ratelimit = sync_writeback_pages(*p);
646 *p = 0; 647 *p = 0;
647 preempt_enable(); 648 preempt_enable();
648 balance_dirty_pages(mapping); 649 balance_dirty_pages(mapping, ratelimit);
649 return; 650 return;
650 } 651 }
651 preempt_enable(); 652 preempt_enable();
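Moving the counter out of the function and feeding its value into sync_writeback_pages() means the write chunk now scales with how much this CPU actually dirtied since the last balance. A standalone model of the ratelimit gate (single-threaded, so one plain counter stands in for the per-CPU bdp_ratelimits; balance() is a hypothetical stub):

#include <stdio.h>

static long ratelimit_pages = 32;
static unsigned long bdp_ratelimits;	/* per-CPU in the kernel; one copy here */

static long sync_writeback_pages(unsigned long dirtied)
{
	if (dirtied < ratelimit_pages)
		dirtied = ratelimit_pages;
	return dirtied + dirtied / 2;
}

static void balance(long write_chunk)	/* hypothetical stand-in */
{
	printf("balance: write up to %ld pages\n", write_chunk);
}

static void ratelimited(unsigned long nr_pages_dirtied, unsigned long ratelimit)
{
	bdp_ratelimits += nr_pages_dirtied;
	if (bdp_ratelimits >= ratelimit) {
		long write_chunk = sync_writeback_pages(bdp_ratelimits);

		bdp_ratelimits = 0;
		balance(write_chunk);
	}
}

int main(void)
{
	int i;

	for (i = 0; i < 10; i++)
		ratelimited(8, 32);	/* fires every 4th call with a chunk of 48 */
	return 0;
}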
@@ -681,153 +682,35 @@ void throttle_vm_writeout(gfp_t gfp_mask)
681 } 682 }
682} 683}
683 684
684/*
685 * writeback at least _min_pages, and keep writing until the amount of dirty
686 * memory is less than the background threshold, or until we're all clean.
687 */
688static void background_writeout(unsigned long _min_pages)
689{
690 long min_pages = _min_pages;
691 struct writeback_control wbc = {
692 .bdi = NULL,
693 .sync_mode = WB_SYNC_NONE,
694 .older_than_this = NULL,
695 .nr_to_write = 0,
696 .nonblocking = 1,
697 .range_cyclic = 1,
698 };
699
700 for ( ; ; ) {
701 unsigned long background_thresh;
702 unsigned long dirty_thresh;
703
704 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
705 if (global_page_state(NR_FILE_DIRTY) +
706 global_page_state(NR_UNSTABLE_NFS) < background_thresh
707 && min_pages <= 0)
708 break;
709 wbc.more_io = 0;
710 wbc.encountered_congestion = 0;
711 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
712 wbc.pages_skipped = 0;
713 writeback_inodes(&wbc);
714 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
715 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
716 /* Wrote less than expected */
717 if (wbc.encountered_congestion || wbc.more_io)
718 congestion_wait(BLK_RW_ASYNC, HZ/10);
719 else
720 break;
721 }
722 }
723}
724
725/*
726 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
727 * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
728 * -1 if all pdflush threads were busy.
729 */
730int wakeup_pdflush(long nr_pages)
731{
732 if (nr_pages == 0)
733 nr_pages = global_page_state(NR_FILE_DIRTY) +
734 global_page_state(NR_UNSTABLE_NFS);
735 return pdflush_operation(background_writeout, nr_pages);
736}
737
738static void wb_timer_fn(unsigned long unused);
739static void laptop_timer_fn(unsigned long unused); 685static void laptop_timer_fn(unsigned long unused);
740 686
741static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
742static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); 687static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
743 688
744/* 689/*
745 * Periodic writeback of "old" data.
746 *
747 * Define "old": the first time one of an inode's pages is dirtied, we mark the
748 * dirtying-time in the inode's address_space. So this periodic writeback code
749 * just walks the superblock inode list, writing back any inodes which are
750 * older than a specific point in time.
751 *
752 * Try to run once per dirty_writeback_interval. But if a writeback event
753 * takes longer than a dirty_writeback_interval interval, then leave a
754 * one-second gap.
755 *
756 * older_than_this takes precedence over nr_to_write. So we'll only write back
757 * all dirty pages if they are all attached to "old" mappings.
758 */
759static void wb_kupdate(unsigned long arg)
760{
761 unsigned long oldest_jif;
762 unsigned long start_jif;
763 unsigned long next_jif;
764 long nr_to_write;
765 struct writeback_control wbc = {
766 .bdi = NULL,
767 .sync_mode = WB_SYNC_NONE,
768 .older_than_this = &oldest_jif,
769 .nr_to_write = 0,
770 .nonblocking = 1,
771 .for_kupdate = 1,
772 .range_cyclic = 1,
773 };
774
775 sync_supers();
776
777 oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
778 start_jif = jiffies;
779 next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
780 nr_to_write = global_page_state(NR_FILE_DIRTY) +
781 global_page_state(NR_UNSTABLE_NFS) +
782 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
783 while (nr_to_write > 0) {
784 wbc.more_io = 0;
785 wbc.encountered_congestion = 0;
786 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
787 writeback_inodes(&wbc);
788 if (wbc.nr_to_write > 0) {
789 if (wbc.encountered_congestion || wbc.more_io)
790 congestion_wait(BLK_RW_ASYNC, HZ/10);
791 else
792 break; /* All the old data is written */
793 }
794 nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
795 }
796 if (time_before(next_jif, jiffies + HZ))
797 next_jif = jiffies + HZ;
798 if (dirty_writeback_interval)
799 mod_timer(&wb_timer, next_jif);
800}
801
802/*
803 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs 690 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
804 */ 691 */
805int dirty_writeback_centisecs_handler(ctl_table *table, int write, 692int dirty_writeback_centisecs_handler(ctl_table *table, int write,
806 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 693 void __user *buffer, size_t *length, loff_t *ppos)
807{ 694{
808 proc_dointvec(table, write, file, buffer, length, ppos); 695 proc_dointvec(table, write, buffer, length, ppos);
809 if (dirty_writeback_interval)
810 mod_timer(&wb_timer, jiffies +
811 msecs_to_jiffies(dirty_writeback_interval * 10));
812 else
813 del_timer(&wb_timer);
814 return 0; 696 return 0;
815} 697}
816 698
817static void wb_timer_fn(unsigned long unused) 699static void do_laptop_sync(struct work_struct *work)
818{ 700{
819 if (pdflush_operation(wb_kupdate, 0) < 0) 701 wakeup_flusher_threads(0);
820 mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ 702 kfree(work);
821}
822
823static void laptop_flush(unsigned long unused)
824{
825 sys_sync();
826} 703}
827 704
828static void laptop_timer_fn(unsigned long unused) 705static void laptop_timer_fn(unsigned long unused)
829{ 706{
830 pdflush_operation(laptop_flush, 0); 707 struct work_struct *work;
708
709 work = kmalloc(sizeof(*work), GFP_ATOMIC);
710 if (work) {
711 INIT_WORK(work, do_laptop_sync);
712 schedule_work(work);
713 }
831} 714}
832 715
833/* 716/*
@@ -910,8 +793,6 @@ void __init page_writeback_init(void)
910{ 793{
911 int shift; 794 int shift;
912 795
913 mod_timer(&wb_timer,
914 jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
915 writeback_set_ratelimit(); 796 writeback_set_ratelimit();
916 register_cpu_notifier(&ratelimit_nb); 797 register_cpu_notifier(&ratelimit_nb);
917 798
@@ -1145,12 +1026,10 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
1145 1026
1146 if (wbc->nr_to_write <= 0) 1027 if (wbc->nr_to_write <= 0)
1147 return 0; 1028 return 0;
1148 wbc->for_writepages = 1;
1149 if (mapping->a_ops->writepages) 1029 if (mapping->a_ops->writepages)
1150 ret = mapping->a_ops->writepages(mapping, wbc); 1030 ret = mapping->a_ops->writepages(mapping, wbc);
1151 else 1031 else
1152 ret = generic_writepages(mapping, wbc); 1032 ret = generic_writepages(mapping, wbc);
1153 wbc->for_writepages = 0;
1154 return ret; 1033 return ret;
1155} 1034}
1156 1035
@@ -1274,6 +1153,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
1274EXPORT_SYMBOL(redirty_page_for_writepage); 1153EXPORT_SYMBOL(redirty_page_for_writepage);
1275 1154
1276/* 1155/*
1156 * Dirty a page.
1157 *
1158 * For pages with a mapping this should be done under the page lock
1159 * for the benefit of asynchronous memory error handling, which prefers
1160 * a consistent dirty state. This rule can be broken in some special
1161 * cases, but it is better not to.
1162 *
1277 * If the mapping doesn't provide a set_page_dirty a_op, then 1163 * If the mapping doesn't provide a set_page_dirty a_op, then
1278 * just fall through and assume that it wants buffer_heads. 1164 * just fall through and assume that it wants buffer_heads.
1279 */ 1165 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d052abbe3063..bf720550b44d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
48#include <linux/page_cgroup.h> 48#include <linux/page_cgroup.h>
49#include <linux/debugobjects.h> 49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h> 50#include <linux/kmemleak.h>
51#include <trace/events/kmem.h>
51 52
52#include <asm/tlbflush.h> 53#include <asm/tlbflush.h>
53#include <asm/div64.h> 54#include <asm/div64.h>
@@ -71,7 +72,6 @@ EXPORT_SYMBOL(node_states);
71 72
72unsigned long totalram_pages __read_mostly; 73unsigned long totalram_pages __read_mostly;
73unsigned long totalreserve_pages __read_mostly; 74unsigned long totalreserve_pages __read_mostly;
74unsigned long highest_memmap_pfn __read_mostly;
75int percpu_pagelist_fraction; 75int percpu_pagelist_fraction;
76gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 76gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
77 77
@@ -123,8 +123,8 @@ static char * const zone_names[MAX_NR_ZONES] = {
123 123
124int min_free_kbytes = 1024; 124int min_free_kbytes = 1024;
125 125
126unsigned long __meminitdata nr_kernel_pages; 126static unsigned long __meminitdata nr_kernel_pages;
127unsigned long __meminitdata nr_all_pages; 127static unsigned long __meminitdata nr_all_pages;
128static unsigned long __meminitdata dma_reserve; 128static unsigned long __meminitdata dma_reserve;
129 129
130#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 130#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
@@ -234,6 +234,12 @@ static void bad_page(struct page *page)
234 static unsigned long nr_shown; 234 static unsigned long nr_shown;
235 static unsigned long nr_unshown; 235 static unsigned long nr_unshown;
236 236
237 /* Don't complain about poisoned pages */
238 if (PageHWPoison(page)) {
239 __ClearPageBuddy(page);
240 return;
241 }
242
237 /* 243 /*
238 * Allow a burst of 60 reports, then keep quiet for that minute; 244 * Allow a burst of 60 reports, then keep quiet for that minute;
239 * or allow a steady drip of one report per second. 245 * or allow a steady drip of one report per second.
@@ -510,7 +516,7 @@ static inline int free_pages_check(struct page *page)
510} 516}
511 517
512/* 518/*
513 * Frees a list of pages. 519 * Frees a number of pages from the PCP lists
514 * Assumes all pages on list are in same zone, and of same order. 520 * Assumes all pages on list are in same zone, and of same order.
515 * count is the number of pages to free. 521 * count is the number of pages to free.
516 * 522 *
@@ -520,22 +526,42 @@ static inline int free_pages_check(struct page *page)
520 * And clear the zone's pages_scanned counter, to hold off the "all pages are 526 * And clear the zone's pages_scanned counter, to hold off the "all pages are
521 * pinned" detection logic. 527 * pinned" detection logic.
522 */ 528 */
523static void free_pages_bulk(struct zone *zone, int count, 529static void free_pcppages_bulk(struct zone *zone, int count,
524 struct list_head *list, int order) 530 struct per_cpu_pages *pcp)
525{ 531{
532 int migratetype = 0;
533 int batch_free = 0;
534
526 spin_lock(&zone->lock); 535 spin_lock(&zone->lock);
527 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 536 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
528 zone->pages_scanned = 0; 537 zone->pages_scanned = 0;
529 538
530 __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); 539 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
531 while (count--) { 540 while (count) {
532 struct page *page; 541 struct page *page;
542 struct list_head *list;
533 543
534 VM_BUG_ON(list_empty(list)); 544 /*
535 page = list_entry(list->prev, struct page, lru); 545 * Remove pages from lists in a round-robin fashion. A
536 /* have to delete it as __free_one_page list manipulates */ 546 * batch_free count is maintained that is incremented when an
537 list_del(&page->lru); 547 * empty list is encountered. This is so more pages are freed
538 __free_one_page(page, zone, order, page_private(page)); 548 * off fuller lists instead of spinning excessively around empty
549 * lists
550 */
551 do {
552 batch_free++;
553 if (++migratetype == MIGRATE_PCPTYPES)
554 migratetype = 0;
555 list = &pcp->lists[migratetype];
556 } while (list_empty(list));
557
558 do {
559 page = list_entry(list->prev, struct page, lru);
560 /* must delete as __free_one_page list manipulates */
561 list_del(&page->lru);
562 __free_one_page(page, zone, 0, migratetype);
563 trace_mm_page_pcpu_drain(page, 0, migratetype);
564 } while (--count && --batch_free && !list_empty(list));
539 } 565 }
540 spin_unlock(&zone->lock); 566 spin_unlock(&zone->lock);
541} 567}
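The round-robin/batch_free dance above is the interesting part: each time the scan has to skip empty lists, batch_free grows, so the list it finally lands on gives up proportionally more pages. A standalone model with three lists of queued page counts makes the behaviour visible (MIGRATE_PCPTYPES = 3 matches the patch; the queue depths are made up):

#include <stdio.h>

#define MIGRATE_PCPTYPES 3	/* unmovable, reclaimable, movable, as in the patch */

int main(void)
{
	int queued[MIGRATE_PCPTYPES] = { 5, 0, 20 };	/* made-up pcp list depths */
	int count = 16;			/* a pcp->batch worth of pages to free */
	int migratetype = 0;
	int batch_free = 0;

	while (count) {
		/* hunt for a non-empty list, counting how many we had to skip */
		do {
			batch_free++;
			if (++migratetype == MIGRATE_PCPTYPES)
				migratetype = 0;
		} while (!queued[migratetype]);

		/* drain up to batch_free pages from the list we landed on */
		do {
			queued[migratetype]--;
			printf("freed one page from list %d\n", migratetype);
		} while (--count && --batch_free && queued[migratetype]);
	}
	return 0;
}

Running it shows the well-stocked movable list (index 2) giving up two pages for every one taken from the sparse unmovable list, which is exactly the "free off fuller lists" bias the comment describes.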
@@ -557,7 +583,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
557 unsigned long flags; 583 unsigned long flags;
558 int i; 584 int i;
559 int bad = 0; 585 int bad = 0;
560 int wasMlocked = TestClearPageMlocked(page); 586 int wasMlocked = __TestClearPageMlocked(page);
561 587
562 kmemcheck_free_shadow(page, order); 588 kmemcheck_free_shadow(page, order);
563 589
@@ -646,7 +672,7 @@ static inline void expand(struct zone *zone, struct page *page,
646/* 672/*
647 * This page is about to be returned from the page allocator 673 * This page is about to be returned from the page allocator
648 */ 674 */
649static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 675static inline int check_new_page(struct page *page)
650{ 676{
651 if (unlikely(page_mapcount(page) | 677 if (unlikely(page_mapcount(page) |
652 (page->mapping != NULL) | 678 (page->mapping != NULL) |
@@ -655,6 +681,18 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
655 bad_page(page); 681 bad_page(page);
656 return 1; 682 return 1;
657 } 683 }
684 return 0;
685}
686
687static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
688{
689 int i;
690
691 for (i = 0; i < (1 << order); i++) {
692 struct page *p = page + i;
693 if (unlikely(check_new_page(p)))
694 return 1;
695 }
658 696
659 set_page_private(page, 0); 697 set_page_private(page, 0);
660 set_page_refcounted(page); 698 set_page_refcounted(page);
@@ -783,6 +821,17 @@ static int move_freepages_block(struct zone *zone, struct page *page,
783 return move_freepages(zone, start_page, end_page, migratetype); 821 return move_freepages(zone, start_page, end_page, migratetype);
784} 822}
785 823
824static void change_pageblock_range(struct page *pageblock_page,
825 int start_order, int migratetype)
826{
827 int nr_pageblocks = 1 << (start_order - pageblock_order);
828
829 while (nr_pageblocks--) {
830 set_pageblock_migratetype(pageblock_page, migratetype);
831 pageblock_page += pageblock_nr_pages;
832 }
833}
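change_pageblock_range() exists because an allocation of order >= pageblock_order spans more than one pageblock, and every block it covers has to change migratetype, not just the first. The count is simply 2^(start_order - pageblock_order); a worked check (the order values are assumed for the example):

#include <stdio.h>

int main(void)
{
	/* assumption: x86-style values, pageblock_order 9 (2 MiB), MAX_ORDER-1 = 10 */
	int pageblock_order = 9;
	int start_order = 10;
	int nr_pageblocks = 1 << (start_order - pageblock_order);

	printf("an order-%d block spans %d pageblocks\n", start_order, nr_pageblocks);	/* 2 */
	return 0;
}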
834
786/* Remove an element from the buddy allocator from the fallback list */ 835/* Remove an element from the buddy allocator from the fallback list */
787static inline struct page * 836static inline struct page *
788__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 837__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
@@ -817,13 +866,15 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
817 * agressive about taking ownership of free pages 866 * agressive about taking ownership of free pages
818 */ 867 */
819 if (unlikely(current_order >= (pageblock_order >> 1)) || 868 if (unlikely(current_order >= (pageblock_order >> 1)) ||
820 start_migratetype == MIGRATE_RECLAIMABLE) { 869 start_migratetype == MIGRATE_RECLAIMABLE ||
870 page_group_by_mobility_disabled) {
821 unsigned long pages; 871 unsigned long pages;
822 pages = move_freepages_block(zone, page, 872 pages = move_freepages_block(zone, page,
823 start_migratetype); 873 start_migratetype);
824 874
825 /* Claim the whole block if over half of it is free */ 875 /* Claim the whole block if over half of it is free */
826 if (pages >= (1 << (pageblock_order-1))) 876 if (pages >= (1 << (pageblock_order-1)) ||
877 page_group_by_mobility_disabled)
827 set_pageblock_migratetype(page, 878 set_pageblock_migratetype(page,
828 start_migratetype); 879 start_migratetype);
829 880
@@ -834,11 +885,16 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
834 list_del(&page->lru); 885 list_del(&page->lru);
835 rmv_page_order(page); 886 rmv_page_order(page);
836 887
837 if (current_order == pageblock_order) 888 /* Take ownership for orders >= pageblock_order */
838 set_pageblock_migratetype(page, 889 if (current_order >= pageblock_order)
890 change_pageblock_range(page, current_order,
839 start_migratetype); 891 start_migratetype);
840 892
841 expand(zone, page, order, current_order, area, migratetype); 893 expand(zone, page, order, current_order, area, migratetype);
894
895 trace_mm_page_alloc_extfrag(page, order, current_order,
896 start_migratetype, migratetype);
897
842 return page; 898 return page;
843 } 899 }
844 } 900 }
@@ -872,6 +928,7 @@ retry_reserve:
872 } 928 }
873 } 929 }
874 930
931 trace_mm_page_alloc_zone_locked(page, order, migratetype);
875 return page; 932 return page;
876} 933}
877 934
@@ -932,7 +989,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
932 to_drain = pcp->batch; 989 to_drain = pcp->batch;
933 else 990 else
934 to_drain = pcp->count; 991 to_drain = pcp->count;
935 free_pages_bulk(zone, to_drain, &pcp->list, 0); 992 free_pcppages_bulk(zone, to_drain, pcp);
936 pcp->count -= to_drain; 993 pcp->count -= to_drain;
937 local_irq_restore(flags); 994 local_irq_restore(flags);
938} 995}
@@ -958,7 +1015,7 @@ static void drain_pages(unsigned int cpu)
958 1015
959 pcp = &pset->pcp; 1016 pcp = &pset->pcp;
960 local_irq_save(flags); 1017 local_irq_save(flags);
961 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 1018 free_pcppages_bulk(zone, pcp->count, pcp);
962 pcp->count = 0; 1019 pcp->count = 0;
963 local_irq_restore(flags); 1020 local_irq_restore(flags);
964 } 1021 }
@@ -1024,7 +1081,8 @@ static void free_hot_cold_page(struct page *page, int cold)
1024 struct zone *zone = page_zone(page); 1081 struct zone *zone = page_zone(page);
1025 struct per_cpu_pages *pcp; 1082 struct per_cpu_pages *pcp;
1026 unsigned long flags; 1083 unsigned long flags;
1027 int wasMlocked = TestClearPageMlocked(page); 1084 int migratetype;
1085 int wasMlocked = __TestClearPageMlocked(page);
1028 1086
1029 kmemcheck_free_shadow(page, 0); 1087 kmemcheck_free_shadow(page, 0);
1030 1088
@@ -1041,35 +1099,49 @@ static void free_hot_cold_page(struct page *page, int cold)
1041 kernel_map_pages(page, 1, 0); 1099 kernel_map_pages(page, 1, 0);
1042 1100
1043 pcp = &zone_pcp(zone, get_cpu())->pcp; 1101 pcp = &zone_pcp(zone, get_cpu())->pcp;
1044 set_page_private(page, get_pageblock_migratetype(page)); 1102 migratetype = get_pageblock_migratetype(page);
1103 set_page_private(page, migratetype);
1045 local_irq_save(flags); 1104 local_irq_save(flags);
1046 if (unlikely(wasMlocked)) 1105 if (unlikely(wasMlocked))
1047 free_page_mlock(page); 1106 free_page_mlock(page);
1048 __count_vm_event(PGFREE); 1107 __count_vm_event(PGFREE);
1049 1108
1109 /*
1110 * We only track unmovable, reclaimable and movable on pcp lists.
1111 * Free ISOLATE pages back to the allocator because they are being
1112 * offlined but treat RESERVE as movable pages so we can get those
1113 * areas back if necessary. Otherwise, we may have to free
1114 * excessively into the page allocator
1115 */
1116 if (migratetype >= MIGRATE_PCPTYPES) {
1117 if (unlikely(migratetype == MIGRATE_ISOLATE)) {
1118 free_one_page(zone, page, 0, migratetype);
1119 goto out;
1120 }
1121 migratetype = MIGRATE_MOVABLE;
1122 }
1123
1050 if (cold) 1124 if (cold)
1051 list_add_tail(&page->lru, &pcp->list); 1125 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1052 else 1126 else
1053 list_add(&page->lru, &pcp->list); 1127 list_add(&page->lru, &pcp->lists[migratetype]);
1054 pcp->count++; 1128 pcp->count++;
1055 if (pcp->count >= pcp->high) { 1129 if (pcp->count >= pcp->high) {
1056 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1130 free_pcppages_bulk(zone, pcp->batch, pcp);
1057 pcp->count -= pcp->batch; 1131 pcp->count -= pcp->batch;
1058 } 1132 }
1133
1134out:
1059 local_irq_restore(flags); 1135 local_irq_restore(flags);
1060 put_cpu(); 1136 put_cpu();
1061} 1137}
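The routing rule the new comment describes is small enough to state as a table: the three types that have pcp lists are batched, MIGRATE_ISOLATE bypasses the pcp entirely, and MIGRATE_RESERVE rides along as movable. A sketch of that decision (the enum mirrors the ordering the patch relies on, but the values are written out here as an assumption):

#include <stdio.h>

/* assumed ordering: the three pcp-tracked types first, then RESERVE, ISOLATE */
enum migratetype {
	MIGRATE_UNMOVABLE,
	MIGRATE_RECLAIMABLE,
	MIGRATE_MOVABLE,
	MIGRATE_PCPTYPES,			/* number of types kept on pcp lists */
	MIGRATE_RESERVE = MIGRATE_PCPTYPES,
	MIGRATE_ISOLATE,
};

/* return the pcp list to use, or -1 for "free straight to the buddy lists" */
static int pcp_list_for(enum migratetype mt)
{
	if (mt >= MIGRATE_PCPTYPES) {
		if (mt == MIGRATE_ISOLATE)
			return -1;		/* offlined range: don't batch it */
		mt = MIGRATE_MOVABLE;		/* RESERVE is batched as movable */
	}
	return mt;
}

int main(void)
{
	printf("reclaimable -> %d\n", pcp_list_for(MIGRATE_RECLAIMABLE));	/* 1 */
	printf("reserve     -> %d\n", pcp_list_for(MIGRATE_RESERVE));		/* 2 */
	printf("isolate     -> %d\n", pcp_list_for(MIGRATE_ISOLATE));		/* -1 */
	return 0;
}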
1062 1138
1063void free_hot_page(struct page *page) 1139void free_hot_page(struct page *page)
1064{ 1140{
1141 trace_mm_page_free_direct(page, 0);
1065 free_hot_cold_page(page, 0); 1142 free_hot_cold_page(page, 0);
1066} 1143}
1067 1144
1068void free_cold_page(struct page *page)
1069{
1070 free_hot_cold_page(page, 1);
1071}
1072
1073/* 1145/*
1074 * split_page takes a non-compound higher-order page, and splits it into 1146 * split_page takes a non-compound higher-order page, and splits it into
1075 * n (1<<order) sub-pages: page[0..n] 1147 * n (1<<order) sub-pages: page[0..n]
@@ -1117,35 +1189,23 @@ again:
1117 cpu = get_cpu(); 1189 cpu = get_cpu();
1118 if (likely(order == 0)) { 1190 if (likely(order == 0)) {
1119 struct per_cpu_pages *pcp; 1191 struct per_cpu_pages *pcp;
1192 struct list_head *list;
1120 1193
1121 pcp = &zone_pcp(zone, cpu)->pcp; 1194 pcp = &zone_pcp(zone, cpu)->pcp;
1195 list = &pcp->lists[migratetype];
1122 local_irq_save(flags); 1196 local_irq_save(flags);
1123 if (!pcp->count) { 1197 if (list_empty(list)) {
1124 pcp->count = rmqueue_bulk(zone, 0, 1198 pcp->count += rmqueue_bulk(zone, 0,
1125 pcp->batch, &pcp->list, 1199 pcp->batch, list,
1126 migratetype, cold); 1200 migratetype, cold);
1127 if (unlikely(!pcp->count)) 1201 if (unlikely(list_empty(list)))
1128 goto failed; 1202 goto failed;
1129 } 1203 }
1130 1204
1131 /* Find a page of the appropriate migrate type */ 1205 if (cold)
1132 if (cold) { 1206 page = list_entry(list->prev, struct page, lru);
1133 list_for_each_entry_reverse(page, &pcp->list, lru) 1207 else
1134 if (page_private(page) == migratetype) 1208 page = list_entry(list->next, struct page, lru);
1135 break;
1136 } else {
1137 list_for_each_entry(page, &pcp->list, lru)
1138 if (page_private(page) == migratetype)
1139 break;
1140 }
1141
1142 /* Allocate more to the pcp list if necessary */
1143 if (unlikely(&page->lru == &pcp->list)) {
1144 pcp->count += rmqueue_bulk(zone, 0,
1145 pcp->batch, &pcp->list,
1146 migratetype, cold);
1147 page = list_entry(pcp->list.next, struct page, lru);
1148 }
1149 1209
1150 list_del(&page->lru); 1210 list_del(&page->lru);
1151 pcp->count--; 1211 pcp->count--;
@@ -1625,10 +1685,6 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1625 1685
1626 /* We now go into synchronous reclaim */ 1686 /* We now go into synchronous reclaim */
1627 cpuset_memory_pressure_bump(); 1687 cpuset_memory_pressure_bump();
1628
1629 /*
1630 * The task's cpuset might have expanded its set of allowable nodes
1631 */
1632 p->flags |= PF_MEMALLOC; 1688 p->flags |= PF_MEMALLOC;
1633 lockdep_set_current_reclaim_state(gfp_mask); 1689 lockdep_set_current_reclaim_state(gfp_mask);
1634 reclaim_state.reclaimed_slab = 0; 1690 reclaim_state.reclaimed_slab = 0;
@@ -1763,6 +1819,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1763 1819
1764 wake_all_kswapd(order, zonelist, high_zoneidx); 1820 wake_all_kswapd(order, zonelist, high_zoneidx);
1765 1821
1822restart:
1766 /* 1823 /*
1767 * OK, we're below the kswapd watermark and have kicked background 1824 * OK, we're below the kswapd watermark and have kicked background
1768 * reclaim. Now things get more complex, so set up alloc_flags according 1825 * reclaim. Now things get more complex, so set up alloc_flags according
@@ -1770,7 +1827,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1770 */ 1827 */
1771 alloc_flags = gfp_to_alloc_flags(gfp_mask); 1828 alloc_flags = gfp_to_alloc_flags(gfp_mask);
1772 1829
1773restart:
1774 /* This is the last chance, in general, before the goto nopage. */ 1830 /* This is the last chance, in general, before the goto nopage. */
1775 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 1831 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1776 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 1832 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -1905,6 +1961,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1905 zonelist, high_zoneidx, nodemask, 1961 zonelist, high_zoneidx, nodemask,
1906 preferred_zone, migratetype); 1962 preferred_zone, migratetype);
1907 1963
1964 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
1908 return page; 1965 return page;
1909} 1966}
1910EXPORT_SYMBOL(__alloc_pages_nodemask); 1967EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -1914,44 +1971,41 @@ EXPORT_SYMBOL(__alloc_pages_nodemask);
1914 */ 1971 */
1915unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 1972unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1916{ 1973{
1917 struct page * page; 1974 struct page *page;
1975
1976 /*
1977 * __get_free_pages() returns a 32-bit address, which cannot represent
1978 * a highmem page
1979 */
1980 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1981
1918 page = alloc_pages(gfp_mask, order); 1982 page = alloc_pages(gfp_mask, order);
1919 if (!page) 1983 if (!page)
1920 return 0; 1984 return 0;
1921 return (unsigned long) page_address(page); 1985 return (unsigned long) page_address(page);
1922} 1986}
1923
1924EXPORT_SYMBOL(__get_free_pages); 1987EXPORT_SYMBOL(__get_free_pages);
1925 1988
1926unsigned long get_zeroed_page(gfp_t gfp_mask) 1989unsigned long get_zeroed_page(gfp_t gfp_mask)
1927{ 1990{
1928 struct page * page; 1991 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
1929
1930 /*
1931 * get_zeroed_page() returns a 32-bit address, which cannot represent
1932 * a highmem page
1933 */
1934 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1935
1936 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1937 if (page)
1938 return (unsigned long) page_address(page);
1939 return 0;
1940} 1992}
1941
1942EXPORT_SYMBOL(get_zeroed_page); 1993EXPORT_SYMBOL(get_zeroed_page);
1943 1994
1944void __pagevec_free(struct pagevec *pvec) 1995void __pagevec_free(struct pagevec *pvec)
1945{ 1996{
1946 int i = pagevec_count(pvec); 1997 int i = pagevec_count(pvec);
1947 1998
1948 while (--i >= 0) 1999 while (--i >= 0) {
2000 trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
1949 free_hot_cold_page(pvec->pages[i], pvec->cold); 2001 free_hot_cold_page(pvec->pages[i], pvec->cold);
2002 }
1950} 2003}
1951 2004
1952void __free_pages(struct page *page, unsigned int order) 2005void __free_pages(struct page *page, unsigned int order)
1953{ 2006{
1954 if (put_page_testzero(page)) { 2007 if (put_page_testzero(page)) {
2008 trace_mm_page_free_direct(page, order);
1955 if (order == 0) 2009 if (order == 0)
1956 free_hot_page(page); 2010 free_hot_page(page);
1957 else 2011 else
@@ -2126,23 +2180,28 @@ void show_free_areas(void)
2126 } 2180 }
2127 } 2181 }
2128 2182
2129 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" 2183 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
2130 " inactive_file:%lu" 2184 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
2131 " unevictable:%lu" 2185 " unevictable:%lu"
2132 " dirty:%lu writeback:%lu unstable:%lu\n" 2186 " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n"
2133 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 2187 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2188 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
2134 global_page_state(NR_ACTIVE_ANON), 2189 global_page_state(NR_ACTIVE_ANON),
2135 global_page_state(NR_ACTIVE_FILE),
2136 global_page_state(NR_INACTIVE_ANON), 2190 global_page_state(NR_INACTIVE_ANON),
2191 global_page_state(NR_ISOLATED_ANON),
2192 global_page_state(NR_ACTIVE_FILE),
2137 global_page_state(NR_INACTIVE_FILE), 2193 global_page_state(NR_INACTIVE_FILE),
2194 global_page_state(NR_ISOLATED_FILE),
2138 global_page_state(NR_UNEVICTABLE), 2195 global_page_state(NR_UNEVICTABLE),
2139 global_page_state(NR_FILE_DIRTY), 2196 global_page_state(NR_FILE_DIRTY),
2140 global_page_state(NR_WRITEBACK), 2197 global_page_state(NR_WRITEBACK),
2141 global_page_state(NR_UNSTABLE_NFS), 2198 global_page_state(NR_UNSTABLE_NFS),
2199 nr_blockdev_pages(),
2142 global_page_state(NR_FREE_PAGES), 2200 global_page_state(NR_FREE_PAGES),
2143 global_page_state(NR_SLAB_RECLAIMABLE) + 2201 global_page_state(NR_SLAB_RECLAIMABLE),
2144 global_page_state(NR_SLAB_UNRECLAIMABLE), 2202 global_page_state(NR_SLAB_UNRECLAIMABLE),
2145 global_page_state(NR_FILE_MAPPED), 2203 global_page_state(NR_FILE_MAPPED),
2204 global_page_state(NR_SHMEM),
2146 global_page_state(NR_PAGETABLE), 2205 global_page_state(NR_PAGETABLE),
2147 global_page_state(NR_BOUNCE)); 2206 global_page_state(NR_BOUNCE));
2148 2207
@@ -2160,7 +2219,21 @@ void show_free_areas(void)
2160 " active_file:%lukB" 2219 " active_file:%lukB"
2161 " inactive_file:%lukB" 2220 " inactive_file:%lukB"
2162 " unevictable:%lukB" 2221 " unevictable:%lukB"
2222 " isolated(anon):%lukB"
2223 " isolated(file):%lukB"
2163 " present:%lukB" 2224 " present:%lukB"
2225 " mlocked:%lukB"
2226 " dirty:%lukB"
2227 " writeback:%lukB"
2228 " mapped:%lukB"
2229 " shmem:%lukB"
2230 " slab_reclaimable:%lukB"
2231 " slab_unreclaimable:%lukB"
2232 " kernel_stack:%lukB"
2233 " pagetables:%lukB"
2234 " unstable:%lukB"
2235 " bounce:%lukB"
2236 " writeback_tmp:%lukB"
2164 " pages_scanned:%lu" 2237 " pages_scanned:%lu"
2165 " all_unreclaimable? %s" 2238 " all_unreclaimable? %s"
2166 "\n", 2239 "\n",
@@ -2174,7 +2247,22 @@ void show_free_areas(void)
2174 K(zone_page_state(zone, NR_ACTIVE_FILE)), 2247 K(zone_page_state(zone, NR_ACTIVE_FILE)),
2175 K(zone_page_state(zone, NR_INACTIVE_FILE)), 2248 K(zone_page_state(zone, NR_INACTIVE_FILE)),
2176 K(zone_page_state(zone, NR_UNEVICTABLE)), 2249 K(zone_page_state(zone, NR_UNEVICTABLE)),
2250 K(zone_page_state(zone, NR_ISOLATED_ANON)),
2251 K(zone_page_state(zone, NR_ISOLATED_FILE)),
2177 K(zone->present_pages), 2252 K(zone->present_pages),
2253 K(zone_page_state(zone, NR_MLOCK)),
2254 K(zone_page_state(zone, NR_FILE_DIRTY)),
2255 K(zone_page_state(zone, NR_WRITEBACK)),
2256 K(zone_page_state(zone, NR_FILE_MAPPED)),
2257 K(zone_page_state(zone, NR_SHMEM)),
2258 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
2259 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
2260 zone_page_state(zone, NR_KERNEL_STACK) *
2261 THREAD_SIZE / 1024,
2262 K(zone_page_state(zone, NR_PAGETABLE)),
2263 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
2264 K(zone_page_state(zone, NR_BOUNCE)),
2265 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2178 zone->pages_scanned, 2266 zone->pages_scanned,
2179 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 2267 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
2180 ); 2268 );
@@ -2303,7 +2391,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order);
2303 * sysctl handler for numa_zonelist_order 2391 * sysctl handler for numa_zonelist_order
2304 */ 2392 */
2305int numa_zonelist_order_handler(ctl_table *table, int write, 2393int numa_zonelist_order_handler(ctl_table *table, int write,
2306 struct file *file, void __user *buffer, size_t *length, 2394 void __user *buffer, size_t *length,
2307 loff_t *ppos) 2395 loff_t *ppos)
2308{ 2396{
2309 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 2397 char saved_string[NUMA_ZONELIST_ORDER_LEN];
@@ -2312,7 +2400,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2312 if (write) 2400 if (write)
2313 strncpy(saved_string, (char*)table->data, 2401 strncpy(saved_string, (char*)table->data,
2314 NUMA_ZONELIST_ORDER_LEN); 2402 NUMA_ZONELIST_ORDER_LEN);
2315 ret = proc_dostring(table, write, file, buffer, length, ppos); 2403 ret = proc_dostring(table, write, buffer, length, ppos);
2316 if (ret) 2404 if (ret)
2317 return ret; 2405 return ret;
2318 if (write) { 2406 if (write) {
@@ -2544,7 +2632,6 @@ static void build_zonelists(pg_data_t *pgdat)
2544 prev_node = local_node; 2632 prev_node = local_node;
2545 nodes_clear(used_mask); 2633 nodes_clear(used_mask);
2546 2634
2547 memset(node_load, 0, sizeof(node_load));
2548 memset(node_order, 0, sizeof(node_order)); 2635 memset(node_order, 0, sizeof(node_order));
2549 j = 0; 2636 j = 0;
2550 2637
@@ -2653,6 +2740,9 @@ static int __build_all_zonelists(void *dummy)
2653{ 2740{
2654 int nid; 2741 int nid;
2655 2742
2743#ifdef CONFIG_NUMA
2744 memset(node_load, 0, sizeof(node_load));
2745#endif
2656 for_each_online_node(nid) { 2746 for_each_online_node(nid) {
2657 pg_data_t *pgdat = NODE_DATA(nid); 2747 pg_data_t *pgdat = NODE_DATA(nid);
2658 2748
@@ -2779,7 +2869,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2779{ 2869{
2780 unsigned long start_pfn, pfn, end_pfn; 2870 unsigned long start_pfn, pfn, end_pfn;
2781 struct page *page; 2871 struct page *page;
2782 unsigned long reserve, block_migratetype; 2872 unsigned long block_migratetype;
2873 int reserve;
2783 2874
2784 /* Get the start pfn, end pfn and the number of blocks to reserve */ 2875 /* Get the start pfn, end pfn and the number of blocks to reserve */
2785 start_pfn = zone->zone_start_pfn; 2876 start_pfn = zone->zone_start_pfn;
@@ -2787,6 +2878,15 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2787 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 2878 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
2788 pageblock_order; 2879 pageblock_order;
2789 2880
2881 /*
2882 * Reserve blocks are generally in place to help high-order atomic
2883 * allocations that are short-lived. A min_free_kbytes value that
2884 * would result in more than 2 reserve blocks for atomic allocations
2885 * is assumed to be in place to help anti-fragmentation for the
2886 * future allocation of hugepages at runtime.
2887 */
2888 reserve = min(2, reserve);
2889
2790 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 2890 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
2791 if (!pfn_valid(pfn)) 2891 if (!pfn_valid(pfn))
2792 continue; 2892 continue;
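Editor's note: the comment added in this hunk caps the MIGRATE_RESERVE pageblocks at two, so a min_free_kbytes raised for hugepage anti-fragmentation does not inflate the atomic-allocation reserve. A minimal userspace sketch of that calculation, not part of the patch; the watermark value and pageblock geometry below are invented for illustration:

#include <stdio.h>

#define PAGEBLOCK_ORDER    9                       /* assumed: 2MB pageblocks with 4K pages */
#define PAGEBLOCK_NR_PAGES (1UL << PAGEBLOCK_ORDER)

static unsigned long roundup_to(unsigned long x, unsigned long to)
{
        return ((x + to - 1) / to) * to;
}

int main(void)
{
        unsigned long min_wmark = 5120;            /* hypothetical min watermark, in pages */
        int reserve;

        /* number of reserve pageblocks implied by the watermark */
        reserve = roundup_to(min_wmark, PAGEBLOCK_NR_PAGES) >> PAGEBLOCK_ORDER;

        /* the hunk above clamps this to at most two blocks */
        if (reserve > 2)
                reserve = 2;

        printf("min_wmark=%lu pages -> %d reserve pageblock(s)\n", min_wmark, reserve);
        return 0;
}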
@@ -2957,6 +3057,7 @@ static int zone_batchsize(struct zone *zone)
2957static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 3057static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2958{ 3058{
2959 struct per_cpu_pages *pcp; 3059 struct per_cpu_pages *pcp;
3060 int migratetype;
2960 3061
2961 memset(p, 0, sizeof(*p)); 3062 memset(p, 0, sizeof(*p));
2962 3063
@@ -2964,7 +3065,8 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2964 pcp->count = 0; 3065 pcp->count = 0;
2965 pcp->high = 6 * batch; 3066 pcp->high = 6 * batch;
2966 pcp->batch = max(1UL, 1 * batch); 3067 pcp->batch = max(1UL, 1 * batch);
2967 INIT_LIST_HEAD(&pcp->list); 3068 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
3069 INIT_LIST_HEAD(&pcp->lists[migratetype]);
2968} 3070}
2969 3071
2970/* 3072/*
@@ -3142,6 +3244,32 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3142 return 0; 3244 return 0;
3143} 3245}
3144 3246
3247static int __zone_pcp_update(void *data)
3248{
3249 struct zone *zone = data;
3250 int cpu;
3251 unsigned long batch = zone_batchsize(zone), flags;
3252
3253 for (cpu = 0; cpu < NR_CPUS; cpu++) {
3254 struct per_cpu_pageset *pset;
3255 struct per_cpu_pages *pcp;
3256
3257 pset = zone_pcp(zone, cpu);
3258 pcp = &pset->pcp;
3259
3260 local_irq_save(flags);
3261 free_pcppages_bulk(zone, pcp->count, pcp);
3262 setup_pageset(pset, batch);
3263 local_irq_restore(flags);
3264 }
3265 return 0;
3266}
3267
3268void zone_pcp_update(struct zone *zone)
3269{
3270 stop_machine(__zone_pcp_update, zone, NULL);
3271}
3272
3145static __meminit void zone_pcp_init(struct zone *zone) 3273static __meminit void zone_pcp_init(struct zone *zone)
3146{ 3274{
3147 int cpu; 3275 int cpu;
@@ -3716,7 +3844,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3716 zone_pcp_init(zone); 3844 zone_pcp_init(zone);
3717 for_each_lru(l) { 3845 for_each_lru(l) {
3718 INIT_LIST_HEAD(&zone->lru[l].list); 3846 INIT_LIST_HEAD(&zone->lru[l].list);
3719 zone->lru[l].nr_saved_scan = 0; 3847 zone->reclaim_stat.nr_saved_scan[l] = 0;
3720 } 3848 }
3721 zone->reclaim_stat.recent_rotated[0] = 0; 3849 zone->reclaim_stat.recent_rotated[0] = 0;
3722 zone->reclaim_stat.recent_rotated[1] = 0; 3850 zone->reclaim_stat.recent_rotated[1] = 0;
@@ -4505,7 +4633,7 @@ void setup_per_zone_wmarks(void)
4505 calculate_totalreserve_pages(); 4633 calculate_totalreserve_pages();
4506} 4634}
4507 4635
4508/** 4636/*
4509 * The inactive anon list should be small enough that the VM never has to 4637 * The inactive anon list should be small enough that the VM never has to
4510 * do too much work, but large enough that each inactive page has a chance 4638 * do too much work, but large enough that each inactive page has a chance
4511 * to be referenced again before it is swapped out. 4639 * to be referenced again before it is swapped out.
@@ -4596,9 +4724,9 @@ module_init(init_per_zone_wmark_min)
4596 * changes. 4724 * changes.
4597 */ 4725 */
4598int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 4726int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4599 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4727 void __user *buffer, size_t *length, loff_t *ppos)
4600{ 4728{
4601 proc_dointvec(table, write, file, buffer, length, ppos); 4729 proc_dointvec(table, write, buffer, length, ppos);
4602 if (write) 4730 if (write)
4603 setup_per_zone_wmarks(); 4731 setup_per_zone_wmarks();
4604 return 0; 4732 return 0;
@@ -4606,12 +4734,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4606 4734
4607#ifdef CONFIG_NUMA 4735#ifdef CONFIG_NUMA
4608int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 4736int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
4609 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4737 void __user *buffer, size_t *length, loff_t *ppos)
4610{ 4738{
4611 struct zone *zone; 4739 struct zone *zone;
4612 int rc; 4740 int rc;
4613 4741
4614 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4742 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
4615 if (rc) 4743 if (rc)
4616 return rc; 4744 return rc;
4617 4745
@@ -4622,12 +4750,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
4622} 4750}
4623 4751
4624int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 4752int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4625 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4753 void __user *buffer, size_t *length, loff_t *ppos)
4626{ 4754{
4627 struct zone *zone; 4755 struct zone *zone;
4628 int rc; 4756 int rc;
4629 4757
4630 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4758 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
4631 if (rc) 4759 if (rc)
4632 return rc; 4760 return rc;
4633 4761
@@ -4648,9 +4776,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4648 * if in function of the boot time zone sizes. 4776 * if in function of the boot time zone sizes.
4649 */ 4777 */
4650int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 4778int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
4651 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4779 void __user *buffer, size_t *length, loff_t *ppos)
4652{ 4780{
4653 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4781 proc_dointvec_minmax(table, write, buffer, length, ppos);
4654 setup_per_zone_lowmem_reserve(); 4782 setup_per_zone_lowmem_reserve();
4655 return 0; 4783 return 0;
4656} 4784}
@@ -4662,13 +4790,13 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
4662 */ 4790 */
4663 4791
4664int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 4792int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4665 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4793 void __user *buffer, size_t *length, loff_t *ppos)
4666{ 4794{
4667 struct zone *zone; 4795 struct zone *zone;
4668 unsigned int cpu; 4796 unsigned int cpu;
4669 int ret; 4797 int ret;
4670 4798
4671 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4799 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
4672 if (!write || (ret == -EINVAL)) 4800 if (!write || (ret == -EINVAL))
4673 return ret; 4801 return ret;
4674 for_each_populated_zone(zone) { 4802 for_each_populated_zone(zone) {
@@ -4728,7 +4856,14 @@ void *__init alloc_large_system_hash(const char *tablename,
4728 numentries <<= (PAGE_SHIFT - scale); 4856 numentries <<= (PAGE_SHIFT - scale);
4729 4857
4730 /* Make sure we've got at least a 0-order allocation.. */ 4858 /* Make sure we've got at least a 0-order allocation.. */
4731 if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 4859 if (unlikely(flags & HASH_SMALL)) {
4860 /* Makes no sense without HASH_EARLY */
4861 WARN_ON(!(flags & HASH_EARLY));
4862 if (!(numentries >> *_hash_shift)) {
4863 numentries = 1UL << *_hash_shift;
4864 BUG_ON(!numentries);
4865 }
4866 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
4732 numentries = PAGE_SIZE / bucketsize; 4867 numentries = PAGE_SIZE / bucketsize;
4733 } 4868 }
4734 numentries = roundup_pow_of_two(numentries); 4869 numentries = roundup_pow_of_two(numentries);
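Editor's note: the new HASH_SMALL branch changes the lower bound on the hash size; instead of forcing at least one page worth of buckets, the table is floored at 1 << *_hash_shift. A rough userspace model of the two sizing paths; the HASH_SMALL value and the sample numbers are assumptions for illustration, not the kernel's definitions:

#include <stdio.h>

#define PAGE_SIZE  4096UL
#define HASH_SMALL 0x2UL   /* placeholder flag value, for the demo only */

static unsigned long roundup_pow_of_two(unsigned long n)
{
        unsigned long r = 1;

        while (r < n)
                r <<= 1;
        return r;
}

/* minimal model of the sizing decision in alloc_large_system_hash() */
static unsigned long hash_entries(unsigned long numentries, unsigned long bucketsize,
                                  unsigned long flags, unsigned int hash_shift)
{
        if (flags & HASH_SMALL) {
                /* small hashes: floor at the count implied by hash_shift */
                if (!(numentries >> hash_shift))
                        numentries = 1UL << hash_shift;
        } else if (numentries * bucketsize < PAGE_SIZE) {
                /* default path: never allocate less than one page of buckets */
                numentries = PAGE_SIZE / bucketsize;
        }
        return roundup_pow_of_two(numentries);
}

int main(void)
{
        printf("%lu\n", hash_entries(16, 8, 0, 4));          /* 512: one page of 8-byte buckets */
        printf("%lu\n", hash_entries(16, 8, HASH_SMALL, 4)); /* 16: floor is 1 << 4 */
        return 0;
}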
@@ -4870,13 +5005,16 @@ int set_migratetype_isolate(struct page *page)
4870 struct zone *zone; 5005 struct zone *zone;
4871 unsigned long flags; 5006 unsigned long flags;
4872 int ret = -EBUSY; 5007 int ret = -EBUSY;
5008 int zone_idx;
4873 5009
4874 zone = page_zone(page); 5010 zone = page_zone(page);
5011 zone_idx = zone_idx(zone);
4875 spin_lock_irqsave(&zone->lock, flags); 5012 spin_lock_irqsave(&zone->lock, flags);
4876 /* 5013 /*
4877 * In future, more migrate types will be able to be isolation target. 5014 * In future, more migrate types will be able to be isolation target.
4878 */ 5015 */
4879 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) 5016 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
5017 zone_idx != ZONE_MOVABLE)
4880 goto out; 5018 goto out;
4881 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5019 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
4882 move_freepages_block(zone, page, MIGRATE_ISOLATE); 5020 move_freepages_block(zone, page, MIGRATE_ISOLATE);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index f22b4ebbd8dc..3d535d594826 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -116,10 +116,16 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
116 nid = page_to_nid(pfn_to_page(pfn)); 116 nid = page_to_nid(pfn_to_page(pfn));
117 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 117 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
118 VM_BUG_ON(!slab_is_available()); 118 VM_BUG_ON(!slab_is_available());
119 base = kmalloc_node(table_size, 119 if (node_state(nid, N_HIGH_MEMORY)) {
120 base = kmalloc_node(table_size,
120 GFP_KERNEL | __GFP_NOWARN, nid); 121 GFP_KERNEL | __GFP_NOWARN, nid);
121 if (!base) 122 if (!base)
122 base = vmalloc_node(table_size, nid); 123 base = vmalloc_node(table_size, nid);
124 } else {
125 base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
126 if (!base)
127 base = vmalloc(table_size);
128 }
123 } else { 129 } else {
124 /* 130 /*
125 * We don't have to allocate page_cgroup again, but 131 * We don't have to allocate page_cgroup again, but
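Editor's note: the page_cgroup hunk above only uses the node-aware allocators when the target node actually has memory (N_HIGH_MEMORY) and falls back to the node-agnostic ones otherwise, in both cases trying a physically contiguous allocation before a virtually contiguous one. A hedged userspace sketch of that decision structure; every allocator below is a plain-malloc stand-in, not the kernel function it is named after:

#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

/* placeholder allocators standing in for kmalloc_node/vmalloc_node/kmalloc/vmalloc */
static void *alloc_node_contig(size_t size, int nid)   { (void)nid; return malloc(size); }
static void *alloc_node_virt(size_t size, int nid)     { (void)nid; return malloc(size); }
static void *alloc_any_contig(size_t size)             { return malloc(size); }
static void *alloc_any_virt(size_t size)               { return malloc(size); }

/* does the node have usable memory?  (kernel: node_state(nid, N_HIGH_MEMORY)) */
static bool node_has_memory(int nid) { (void)nid; return true; }

static void *alloc_section_table(size_t table_size, int nid)
{
        void *base;

        if (node_has_memory(nid)) {
                base = alloc_node_contig(table_size, nid);       /* kmalloc_node() */
                if (!base)
                        base = alloc_node_virt(table_size, nid); /* vmalloc_node() */
        } else {
                base = alloc_any_contig(table_size);             /* kmalloc() */
                if (!base)
                        base = alloc_any_virt(table_size);       /* vmalloc() */
        }
        return base;
}

int main(void)
{
        free(alloc_section_table(4096, 0));
        return 0;
}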
diff --git a/mm/pdflush.c b/mm/pdflush.c
deleted file mode 100644
index 235ac440c44e..000000000000
--- a/mm/pdflush.c
+++ /dev/null
@@ -1,269 +0,0 @@
1/*
2 * mm/pdflush.c - worker threads for writing back filesystem data
3 *
4 * Copyright (C) 2002, Linus Torvalds.
5 *
6 * 09Apr2002 Andrew Morton
7 * Initial version
8 * 29Feb2004 kaos@sgi.com
9 * Move worker thread creation to kthread to avoid chewing
10 * up stack space with nested calls to kernel_thread.
11 */
12
13#include <linux/sched.h>
14#include <linux/list.h>
15#include <linux/signal.h>
16#include <linux/spinlock.h>
17#include <linux/gfp.h>
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/fs.h> /* Needed by writeback.h */
21#include <linux/writeback.h> /* Prototypes pdflush_operation() */
22#include <linux/kthread.h>
23#include <linux/cpuset.h>
24#include <linux/freezer.h>
25
26
27/*
28 * Minimum and maximum number of pdflush instances
29 */
30#define MIN_PDFLUSH_THREADS 2
31#define MAX_PDFLUSH_THREADS 8
32
33static void start_one_pdflush_thread(void);
34
35
36/*
37 * The pdflush threads are worker threads for writing back dirty data.
38 * Ideally, we'd like one thread per active disk spindle. But the disk
39 * topology is very hard to divine at this level. Instead, we take
40 * care in various places to prevent more than one pdflush thread from
41 * performing writeback against a single filesystem. pdflush threads
42 * have the PF_FLUSHER flag set in current->flags to aid in this.
43 */
44
45/*
46 * All the pdflush threads. Protected by pdflush_lock
47 */
48static LIST_HEAD(pdflush_list);
49static DEFINE_SPINLOCK(pdflush_lock);
50
51/*
52 * The count of currently-running pdflush threads. Protected
53 * by pdflush_lock.
54 *
55 * Readable by sysctl, but not writable. Published to userspace at
56 * /proc/sys/vm/nr_pdflush_threads.
57 */
58int nr_pdflush_threads = 0;
59
60/*
61 * The time at which the pdflush thread pool last went empty
62 */
63static unsigned long last_empty_jifs;
64
65/*
66 * The pdflush thread.
67 *
68 * Thread pool management algorithm:
69 *
70 * - The minimum and maximum number of pdflush instances are bound
71 * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
72 *
73 * - If there have been no idle pdflush instances for 1 second, create
74 * a new one.
75 *
76 * - If the least-recently-went-to-sleep pdflush thread has been asleep
77 * for more than one second, terminate a thread.
78 */
79
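Editor's note: the comment block just above (part of the file being removed) spells out pdflush's pool-sizing policy. A purely illustrative, self-contained decision function capturing that policy; the one-second threshold and the thread limits come from the comment, everything else is a simplification and not the original code:

#include <stdbool.h>

#define MIN_PDFLUSH_THREADS 2
#define MAX_PDFLUSH_THREADS 8

enum pool_action { POOL_KEEP, POOL_GROW, POOL_SHRINK };

/*
 * Grow when the pool has had no idle thread for more than a second and we
 * are below the maximum; shrink when the longest-idle thread has been
 * asleep for more than a second and we are above the minimum.
 */
static enum pool_action pool_decide(unsigned long now, unsigned long hz,
                                    bool pool_empty, unsigned long last_empty,
                                    unsigned long idlest_slept_since,
                                    int nr_threads)
{
        if (pool_empty && now - last_empty > hz &&
            nr_threads < MAX_PDFLUSH_THREADS)
                return POOL_GROW;

        if (!pool_empty && now - idlest_slept_since > hz &&
            nr_threads > MIN_PDFLUSH_THREADS)
                return POOL_SHRINK;

        return POOL_KEEP;
}

int main(void)
{
        /* pool empty for two "seconds" with three threads running -> grow */
        return pool_decide(3000, 1000, true, 1000, 0, 3) == POOL_GROW ? 0 : 1;
}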
80/*
81 * A structure for passing work to a pdflush thread. Also for passing
82 * state information between pdflush threads. Protected by pdflush_lock.
83 */
84struct pdflush_work {
85 struct task_struct *who; /* The thread */
86 void (*fn)(unsigned long); /* A callback function */
87 unsigned long arg0; /* An argument to the callback */
88 struct list_head list; /* On pdflush_list, when idle */
89 unsigned long when_i_went_to_sleep;
90};
91
92static int __pdflush(struct pdflush_work *my_work)
93{
94 current->flags |= PF_FLUSHER | PF_SWAPWRITE;
95 set_freezable();
96 my_work->fn = NULL;
97 my_work->who = current;
98 INIT_LIST_HEAD(&my_work->list);
99
100 spin_lock_irq(&pdflush_lock);
101 for ( ; ; ) {
102 struct pdflush_work *pdf;
103
104 set_current_state(TASK_INTERRUPTIBLE);
105 list_move(&my_work->list, &pdflush_list);
106 my_work->when_i_went_to_sleep = jiffies;
107 spin_unlock_irq(&pdflush_lock);
108 schedule();
109 try_to_freeze();
110 spin_lock_irq(&pdflush_lock);
111 if (!list_empty(&my_work->list)) {
112 /*
113 * Someone woke us up, but without removing our control
114 * structure from the global list. swsusp will do this
115 * in try_to_freeze()->refrigerator(). Handle it.
116 */
117 my_work->fn = NULL;
118 continue;
119 }
120 if (my_work->fn == NULL) {
121 printk("pdflush: bogus wakeup\n");
122 continue;
123 }
124 spin_unlock_irq(&pdflush_lock);
125
126 (*my_work->fn)(my_work->arg0);
127
128 spin_lock_irq(&pdflush_lock);
129
130 /*
131 * Thread creation: For how long have there been zero
132 * available threads?
133 *
134 * To throttle creation, we reset last_empty_jifs.
135 */
136 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
137 if (list_empty(&pdflush_list)) {
138 if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
139 last_empty_jifs = jiffies;
140 nr_pdflush_threads++;
141 spin_unlock_irq(&pdflush_lock);
142 start_one_pdflush_thread();
143 spin_lock_irq(&pdflush_lock);
144 }
145 }
146 }
147
148 my_work->fn = NULL;
149
150 /*
151 * Thread destruction: For how long has the sleepiest
152 * thread slept?
153 */
154 if (list_empty(&pdflush_list))
155 continue;
156 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
157 continue;
158 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
159 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
160 /* Limit exit rate */
161 pdf->when_i_went_to_sleep = jiffies;
162 break; /* exeunt */
163 }
164 }
165 nr_pdflush_threads--;
166 spin_unlock_irq(&pdflush_lock);
167 return 0;
168}
169
170/*
171 * Of course, my_work wants to be just a local in __pdflush(). It is
172 * separated out in this manner to hopefully prevent the compiler from
173 * performing unfortunate optimisations against the auto variables. Because
174 * these are visible to other tasks and CPUs. (No problem has actually
175 * been observed. This is just paranoia).
176 */
177static int pdflush(void *dummy)
178{
179 struct pdflush_work my_work;
180 cpumask_var_t cpus_allowed;
181
182 /*
183 * Since the caller doesn't even check kthread_run() worked, let's not
184 * freak out too much if this fails.
185 */
186 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
187 printk(KERN_WARNING "pdflush failed to allocate cpumask\n");
188 return 0;
189 }
190
191 /*
192 * pdflush can spend a lot of time doing encryption via dm-crypt. We
193 * don't want to do that at keventd's priority.
194 */
195 set_user_nice(current, 0);
196
197 /*
198 * Some configs put our parent kthread in a limited cpuset,
199 * which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
200 * Our needs are more modest - cut back to our cpusets cpus_allowed.
201 * This is needed as pdflush's are dynamically created and destroyed.
202 * The boottime pdflush's are easily placed w/o these 2 lines.
203 */
204 cpuset_cpus_allowed(current, cpus_allowed);
205 set_cpus_allowed_ptr(current, cpus_allowed);
206 free_cpumask_var(cpus_allowed);
207
208 return __pdflush(&my_work);
209}
210
211/*
212 * Attempt to wake up a pdflush thread, and get it to do some work for you.
213 * Returns zero if it indeed managed to find a worker thread, and passed your
214 * payload to it.
215 */
216int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
217{
218 unsigned long flags;
219 int ret = 0;
220
221 BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
222
223 spin_lock_irqsave(&pdflush_lock, flags);
224 if (list_empty(&pdflush_list)) {
225 ret = -1;
226 } else {
227 struct pdflush_work *pdf;
228
229 pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
230 list_del_init(&pdf->list);
231 if (list_empty(&pdflush_list))
232 last_empty_jifs = jiffies;
233 pdf->fn = fn;
234 pdf->arg0 = arg0;
235 wake_up_process(pdf->who);
236 }
237 spin_unlock_irqrestore(&pdflush_lock, flags);
238
239 return ret;
240}
241
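Editor's note: the comment above pdflush_operation() says it only returns zero when an idle worker was found and handed the callback, so callers need a synchronous fallback. A hedged usage sketch, not taken from the patch; background_writeout() is a hypothetical callback and the pdflush_operation() stub exists only so the sketch compiles standalone:

/* stub standing in for the real pdflush_operation(); always reports "no idle worker" */
static int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
{
        (void)fn;
        (void)arg0;
        return -1;
}

/* hypothetical writeback callback */
static void background_writeout(unsigned long nr_pages)
{
        /* ... write back up to nr_pages pages ... */
        (void)nr_pages;
}

int main(void)
{
        unsigned long nr_pages = 1024;

        /*
         * pdflush_operation() returns 0 when an idle worker took the job and
         * nonzero when the pool was empty; fall back to doing the work (or a
         * cheaper approximation of it) in the calling context.
         */
        if (pdflush_operation(background_writeout, nr_pages))
                background_writeout(nr_pages);
        return 0;
}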
242static void start_one_pdflush_thread(void)
243{
244 struct task_struct *k;
245
246 k = kthread_run(pdflush, NULL, "pdflush");
247 if (unlikely(IS_ERR(k))) {
248 spin_lock_irq(&pdflush_lock);
249 nr_pdflush_threads--;
250 spin_unlock_irq(&pdflush_lock);
251 }
252}
253
254static int __init pdflush_init(void)
255{
256 int i;
257
258 /*
259 * Pre-set nr_pdflush_threads... If we fail to create,
260 * the count will be decremented.
261 */
262 nr_pdflush_threads = MIN_PDFLUSH_THREADS;
263
264 for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
265 start_one_pdflush_thread();
266 return 0;
267}
268
269module_init(pdflush_init);
diff --git a/mm/percpu.c b/mm/percpu.c
index b70f2acd8853..43d8cacfdaa5 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -8,12 +8,13 @@
8 * 8 *
9 * This is percpu allocator which can handle both static and dynamic 9 * This is percpu allocator which can handle both static and dynamic
10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each 10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each
 11 * chunk consists of num_possible_cpus() units and the first chunk 11 * chunk consists of boot-time determined number of units and the
12 * is used for static percpu variables in the kernel image (special 12 * first chunk is used for static percpu variables in the kernel image
13 * boot time alloc/init handling necessary as these areas need to be 13 * (special boot time alloc/init handling necessary as these areas
14 * brought up before allocation services are running). Unit grows as 14 * need to be brought up before allocation services are running).
15 * necessary and all units grow or shrink in unison. When a chunk is 15 * Unit grows as necessary and all units grow or shrink in unison.
16 * filled up, another chunk is allocated. ie. in vmalloc area 16 * When a chunk is filled up, another chunk is allocated. ie. in
17 * vmalloc area
17 * 18 *
18 * c0 c1 c2 19 * c0 c1 c2
19 * ------------------- ------------------- ------------ 20 * ------------------- ------------------- ------------
@@ -22,11 +23,13 @@
22 * 23 *
23 * Allocation is done in offset-size areas of single unit space. Ie, 24 * Allocation is done in offset-size areas of single unit space. Ie,
24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, 25 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
25 * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring 26 * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to
26 * percpu base registers pcpu_unit_size apart. 27 * cpus. On NUMA, the mapping can be non-linear and even sparse.
28 * Percpu access can be done by configuring percpu base registers
29 * according to cpu to unit mapping and pcpu_unit_size.
27 * 30 *
28 * There are usually many small percpu allocations many of them as 31 * There are usually many small percpu allocations many of them being
29 * small as 4 bytes. The allocator organizes chunks into lists 32 * as small as 4 bytes. The allocator organizes chunks into lists
30 * according to free size and tries to allocate from the fullest one. 33 * according to free size and tries to allocate from the fullest one.
31 * Each chunk keeps the maximum contiguous area size hint which is 34 * Each chunk keeps the maximum contiguous area size hint which is
 32 * guaranteed to be equal to or larger than the maximum contiguous 35 * guaranteed to be equal to or larger than the maximum contiguous
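Editor's note: the description rewritten in the hunks above says units no longer map 1:1 to cpu numbers; percpu access goes through a cpu-to-unit mapping plus per-unit offsets from the chunk base. A small userspace model of how an address inside a chunk is located for a given cpu; the unit size, mapping and offsets below are invented for illustration:

#include <stdio.h>

#define NR_CPUS    4
#define UNIT_SIZE  (16 * 1024)   /* assumed stand-in for pcpu_unit_size */

/* cpu -> unit mapping; on NUMA this may be non-linear or sparse (see above) */
static const int unit_map[NR_CPUS] = { 0, 1, 2, 3 };

/* per-cpu offset from the chunk base: here simply unit_map[cpu] * UNIT_SIZE */
static unsigned long unit_offset(int cpu)
{
        return (unsigned long)unit_map[cpu] * UNIT_SIZE;
}

/*
 * A dynamic percpu allocation is an (offset, size) area inside one chunk;
 * the address a particular cpu sees is base + its unit offset + area offset.
 */
static void *pcpu_addr(void *chunk_base, int cpu, unsigned long area_off)
{
        return (char *)chunk_base + unit_offset(cpu) + area_off;
}

int main(void)
{
        static char chunk[NR_CPUS * UNIT_SIZE];  /* stands in for the vmalloc'd chunk */
        unsigned long off = 512;                 /* offset of some allocated area */
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu%d sees the area at %p\n", cpu, pcpu_addr(chunk, cpu, off));
        return 0;
}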
@@ -43,7 +46,7 @@
43 * 46 *
 44 * To use this allocator, arch code should do the following. 47 * To use this allocator, arch code should do the following.
45 * 48 *
46 * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 49 * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
47 * 50 *
48 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate 51 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
49 * regular address to percpu pointer and back if they need to be 52 * regular address to percpu pointer and back if they need to be
@@ -55,7 +58,9 @@
55 58
56#include <linux/bitmap.h> 59#include <linux/bitmap.h>
57#include <linux/bootmem.h> 60#include <linux/bootmem.h>
61#include <linux/err.h>
58#include <linux/list.h> 62#include <linux/list.h>
63#include <linux/log2.h>
59#include <linux/mm.h> 64#include <linux/mm.h>
60#include <linux/module.h> 65#include <linux/module.h>
61#include <linux/mutex.h> 66#include <linux/mutex.h>
@@ -89,25 +94,38 @@ struct pcpu_chunk {
89 struct list_head list; /* linked to pcpu_slot lists */ 94 struct list_head list; /* linked to pcpu_slot lists */
90 int free_size; /* free bytes in the chunk */ 95 int free_size; /* free bytes in the chunk */
91 int contig_hint; /* max contiguous size hint */ 96 int contig_hint; /* max contiguous size hint */
92 struct vm_struct *vm; /* mapped vmalloc region */ 97 void *base_addr; /* base address of this chunk */
93 int map_used; /* # of map entries used */ 98 int map_used; /* # of map entries used */
94 int map_alloc; /* # of map entries allocated */ 99 int map_alloc; /* # of map entries allocated */
95 int *map; /* allocation map */ 100 int *map; /* allocation map */
101 struct vm_struct **vms; /* mapped vmalloc regions */
96 bool immutable; /* no [de]population allowed */ 102 bool immutable; /* no [de]population allowed */
97 struct page **page; /* points to page array */ 103 unsigned long populated[]; /* populated bitmap */
98 struct page *page_ar[]; /* #cpus * UNIT_PAGES */
99}; 104};
100 105
101static int pcpu_unit_pages __read_mostly; 106static int pcpu_unit_pages __read_mostly;
102static int pcpu_unit_size __read_mostly; 107static int pcpu_unit_size __read_mostly;
103static int pcpu_chunk_size __read_mostly; 108static int pcpu_nr_units __read_mostly;
109static int pcpu_atom_size __read_mostly;
104static int pcpu_nr_slots __read_mostly; 110static int pcpu_nr_slots __read_mostly;
105static size_t pcpu_chunk_struct_size __read_mostly; 111static size_t pcpu_chunk_struct_size __read_mostly;
106 112
113/* cpus with the lowest and highest unit numbers */
114static unsigned int pcpu_first_unit_cpu __read_mostly;
115static unsigned int pcpu_last_unit_cpu __read_mostly;
116
107/* the address of the first chunk which starts with the kernel static area */ 117/* the address of the first chunk which starts with the kernel static area */
108void *pcpu_base_addr __read_mostly; 118void *pcpu_base_addr __read_mostly;
109EXPORT_SYMBOL_GPL(pcpu_base_addr); 119EXPORT_SYMBOL_GPL(pcpu_base_addr);
110 120
121static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */
122const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */
123
124/* group information, used for vm allocation */
125static int pcpu_nr_groups __read_mostly;
126static const unsigned long *pcpu_group_offsets __read_mostly;
127static const size_t *pcpu_group_sizes __read_mostly;
128
111/* 129/*
112 * The first chunk which always exists. Note that unlike other 130 * The first chunk which always exists. Note that unlike other
113 * chunks, this one can be allocated and mapped in several different 131 * chunks, this one can be allocated and mapped in several different
@@ -129,9 +147,9 @@ static int pcpu_reserved_chunk_limit;
129 * Synchronization rules. 147 * Synchronization rules.
130 * 148 *
131 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former 149 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
132 * protects allocation/reclaim paths, chunks and chunk->page arrays. 150 * protects allocation/reclaim paths, chunks, populated bitmap and
133 * The latter is a spinlock and protects the index data structures - 151 * vmalloc mapping. The latter is a spinlock and protects the index
134 * chunk slots, chunks and area maps in chunks. 152 * data structures - chunk slots, chunks and area maps in chunks.
135 * 153 *
136 * During allocation, pcpu_alloc_mutex is kept locked all the time and 154 * During allocation, pcpu_alloc_mutex is kept locked all the time and
137 * pcpu_lock is grabbed and released as necessary. All actual memory 155 * pcpu_lock is grabbed and released as necessary. All actual memory
@@ -178,26 +196,23 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
178 196
179static int pcpu_page_idx(unsigned int cpu, int page_idx) 197static int pcpu_page_idx(unsigned int cpu, int page_idx)
180{ 198{
181 return cpu * pcpu_unit_pages + page_idx; 199 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
182}
183
184static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
185 unsigned int cpu, int page_idx)
186{
187 return &chunk->page[pcpu_page_idx(cpu, page_idx)];
188} 200}
189 201
190static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, 202static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
191 unsigned int cpu, int page_idx) 203 unsigned int cpu, int page_idx)
192{ 204{
193 return (unsigned long)chunk->vm->addr + 205 return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
194 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); 206 (page_idx << PAGE_SHIFT);
195} 207}
196 208
197static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, 209static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
198 int page_idx) 210 unsigned int cpu, int page_idx)
199{ 211{
200 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; 212 /* must not be used on pre-mapped chunk */
213 WARN_ON(chunk->immutable);
214
215 return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
201} 216}
202 217
203/* set the pointer to a chunk in a page struct */ 218/* set the pointer to a chunk in a page struct */
@@ -212,6 +227,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
212 return (struct pcpu_chunk *)page->index; 227 return (struct pcpu_chunk *)page->index;
213} 228}
214 229
230static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
231{
232 *rs = find_next_zero_bit(chunk->populated, end, *rs);
233 *re = find_next_bit(chunk->populated, end, *rs + 1);
234}
235
236static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
237{
238 *rs = find_next_bit(chunk->populated, end, *rs);
239 *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
240}
241
242/*
243 * (Un)populated page region iterators. Iterate over (un)populated
 244 * page regions between @start and @end in @chunk. @rs and @re should
245 * be integer variables and will be set to start and end page index of
246 * the current region.
247 */
248#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
249 for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
250 (rs) < (re); \
251 (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
252
253#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
254 for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
255 (rs) < (re); \
256 (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
257
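Editor's note: the two iterator macros above walk maximal runs of unpopulated (or populated) pages in chunk->populated. A compact userspace analogue using a byte-per-page bitmap and a naive search in place of find_next_bit()/find_next_zero_bit(); the sample bitmap is invented:

#include <stdio.h>

#define NPAGES 16

/* naive stand-in for find_next_bit()/find_next_zero_bit() */
static int next_with_value(const unsigned char *map, int size, int start, int want)
{
        int i;

        for (i = start; i < size; i++)
                if (map[i] == want)
                        return i;
        return size;
}

int main(void)
{
        /* 1 = populated page, 0 = not yet populated */
        unsigned char populated[NPAGES] = { 1,1,0,0,1,1,1,0,0,0,1,0,1,1,1,1 };
        int rs, re;

        /* same shape as pcpu_for_each_unpop_region(chunk, rs, re, 0, NPAGES) */
        for (rs = next_with_value(populated, NPAGES, 0, 0),
             re = next_with_value(populated, NPAGES, rs + 1, 1);
             rs < re;
             rs = next_with_value(populated, NPAGES, re + 1, 0),
             re = next_with_value(populated, NPAGES, rs + 1, 1))
                printf("unpopulated run: pages [%d, %d)\n", rs, re);

        return 0;
}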
215/** 258/**
216 * pcpu_mem_alloc - allocate memory 259 * pcpu_mem_alloc - allocate memory
217 * @size: bytes to allocate 260 * @size: bytes to allocate
@@ -287,16 +330,24 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
287 */ 330 */
288static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) 331static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
289{ 332{
290 void *first_start = pcpu_first_chunk->vm->addr; 333 void *first_start = pcpu_first_chunk->base_addr;
291 334
292 /* is it in the first chunk? */ 335 /* is it in the first chunk? */
293 if (addr >= first_start && addr < first_start + pcpu_chunk_size) { 336 if (addr >= first_start && addr < first_start + pcpu_unit_size) {
294 /* is it in the reserved area? */ 337 /* is it in the reserved area? */
295 if (addr < first_start + pcpu_reserved_chunk_limit) 338 if (addr < first_start + pcpu_reserved_chunk_limit)
296 return pcpu_reserved_chunk; 339 return pcpu_reserved_chunk;
297 return pcpu_first_chunk; 340 return pcpu_first_chunk;
298 } 341 }
299 342
343 /*
344 * The address is relative to unit0 which might be unused and
345 * thus unmapped. Offset the address to the unit space of the
346 * current processor before looking it up in the vmalloc
347 * space. Note that any possible cpu id can be used here, so
348 * there's no need to worry about preemption or cpu hotplug.
349 */
350 addr += pcpu_unit_offsets[raw_smp_processor_id()];
300 return pcpu_get_page_chunk(vmalloc_to_page(addr)); 351 return pcpu_get_page_chunk(vmalloc_to_page(addr));
301} 352}
302 353
@@ -545,125 +596,327 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
545} 596}
546 597
547/** 598/**
548 * pcpu_unmap - unmap pages out of a pcpu_chunk 599 * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
549 * @chunk: chunk of interest 600 * @chunk: chunk of interest
550 * @page_start: page index of the first page to unmap 601 * @bitmapp: output parameter for bitmap
551 * @page_end: page index of the last page to unmap + 1 602 * @may_alloc: may allocate the array
552 * @flush_tlb: whether to flush tlb or not
553 * 603 *
554 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. 604 * Returns pointer to array of pointers to struct page and bitmap,
555 * If @flush is true, vcache is flushed before unmapping and tlb 605 * both of which can be indexed with pcpu_page_idx(). The returned
556 * after. 606 * array is cleared to zero and *@bitmapp is copied from
607 * @chunk->populated. Note that there is only one array and bitmap
608 * and access exclusion is the caller's responsibility.
609 *
610 * CONTEXT:
611 * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
612 * Otherwise, don't care.
613 *
614 * RETURNS:
615 * Pointer to temp pages array on success, NULL on failure.
557 */ 616 */
558static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, 617static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
559 bool flush_tlb) 618 unsigned long **bitmapp,
619 bool may_alloc)
560{ 620{
561 unsigned int last = num_possible_cpus() - 1; 621 static struct page **pages;
562 unsigned int cpu; 622 static unsigned long *bitmap;
623 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
624 size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
625 sizeof(unsigned long);
626
627 if (!pages || !bitmap) {
628 if (may_alloc && !pages)
629 pages = pcpu_mem_alloc(pages_size);
630 if (may_alloc && !bitmap)
631 bitmap = pcpu_mem_alloc(bitmap_size);
632 if (!pages || !bitmap)
633 return NULL;
634 }
563 635
564 /* unmap must not be done on immutable chunk */ 636 memset(pages, 0, pages_size);
565 WARN_ON(chunk->immutable); 637 bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
566 638
567 /* 639 *bitmapp = bitmap;
568 * Each flushing trial can be very expensive, issue flush on 640 return pages;
569 * the whole region at once rather than doing it for each cpu. 641}
570 * This could be an overkill but is more scalable.
571 */
572 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
573 pcpu_chunk_addr(chunk, last, page_end));
574 642
575 for_each_possible_cpu(cpu) 643/**
576 unmap_kernel_range_noflush( 644 * pcpu_free_pages - free pages which were allocated for @chunk
577 pcpu_chunk_addr(chunk, cpu, page_start), 645 * @chunk: chunk pages were allocated for
578 (page_end - page_start) << PAGE_SHIFT); 646 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
579 647 * @populated: populated bitmap
580 /* ditto as flush_cache_vunmap() */ 648 * @page_start: page index of the first page to be freed
581 if (flush_tlb) 649 * @page_end: page index of the last page to be freed + 1
582 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), 650 *
583 pcpu_chunk_addr(chunk, last, page_end)); 651 * Free pages [@page_start and @page_end) in @pages for all units.
652 * The pages were allocated for @chunk.
653 */
654static void pcpu_free_pages(struct pcpu_chunk *chunk,
655 struct page **pages, unsigned long *populated,
656 int page_start, int page_end)
657{
658 unsigned int cpu;
659 int i;
660
661 for_each_possible_cpu(cpu) {
662 for (i = page_start; i < page_end; i++) {
663 struct page *page = pages[pcpu_page_idx(cpu, i)];
664
665 if (page)
666 __free_page(page);
667 }
668 }
584} 669}
585 670
586/** 671/**
587 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk 672 * pcpu_alloc_pages - allocates pages for @chunk
588 * @chunk: chunk to depopulate 673 * @chunk: target chunk
589 * @off: offset to the area to depopulate 674 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
590 * @size: size of the area to depopulate in bytes 675 * @populated: populated bitmap
591 * @flush: whether to flush cache and tlb or not 676 * @page_start: page index of the first page to be allocated
592 * 677 * @page_end: page index of the last page to be allocated + 1
593 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 678 *
594 * from @chunk. If @flush is true, vcache is flushed before unmapping 679 * Allocate pages [@page_start,@page_end) into @pages for all units.
595 * and tlb after. 680 * The allocation is for @chunk. Percpu core doesn't care about the
596 * 681 * content of @pages and will pass it verbatim to pcpu_map_pages().
597 * CONTEXT:
598 * pcpu_alloc_mutex.
599 */ 682 */
600static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, 683static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
601 bool flush) 684 struct page **pages, unsigned long *populated,
685 int page_start, int page_end)
602{ 686{
603 int page_start = PFN_DOWN(off); 687 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
604 int page_end = PFN_UP(off + size);
605 int unmap_start = -1;
606 int uninitialized_var(unmap_end);
607 unsigned int cpu; 688 unsigned int cpu;
608 int i; 689 int i;
609 690
610 for (i = page_start; i < page_end; i++) { 691 for_each_possible_cpu(cpu) {
611 for_each_possible_cpu(cpu) { 692 for (i = page_start; i < page_end; i++) {
612 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); 693 struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
694
695 *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
696 if (!*pagep) {
697 pcpu_free_pages(chunk, pages, populated,
698 page_start, page_end);
699 return -ENOMEM;
700 }
701 }
702 }
703 return 0;
704}
613 705
614 if (!*pagep) 706/**
615 continue; 707 * pcpu_pre_unmap_flush - flush cache prior to unmapping
708 * @chunk: chunk the regions to be flushed belongs to
709 * @page_start: page index of the first page to be flushed
710 * @page_end: page index of the last page to be flushed + 1
711 *
712 * Pages in [@page_start,@page_end) of @chunk are about to be
713 * unmapped. Flush cache. As each flushing trial can be very
714 * expensive, issue flush on the whole region at once rather than
715 * doing it for each cpu. This could be an overkill but is more
716 * scalable.
717 */
718static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
719 int page_start, int page_end)
720{
721 flush_cache_vunmap(
722 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
723 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
724}
725
726static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
727{
728 unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
729}
616 730
617 __free_page(*pagep); 731/**
732 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
733 * @chunk: chunk of interest
734 * @pages: pages array which can be used to pass information to free
735 * @populated: populated bitmap
736 * @page_start: page index of the first page to unmap
737 * @page_end: page index of the last page to unmap + 1
738 *
739 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
740 * Corresponding elements in @pages were cleared by the caller and can
741 * be used to carry information to pcpu_free_pages() which will be
742 * called after all unmaps are finished. The caller should call
743 * proper pre/post flush functions.
744 */
745static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
746 struct page **pages, unsigned long *populated,
747 int page_start, int page_end)
748{
749 unsigned int cpu;
750 int i;
618 751
619 /* 752 for_each_possible_cpu(cpu) {
620 * If it's partial depopulation, it might get 753 for (i = page_start; i < page_end; i++) {
621 * populated or depopulated again. Mark the 754 struct page *page;
622 * page gone.
623 */
624 *pagep = NULL;
625 755
626 unmap_start = unmap_start < 0 ? i : unmap_start; 756 page = pcpu_chunk_page(chunk, cpu, i);
627 unmap_end = i + 1; 757 WARN_ON(!page);
758 pages[pcpu_page_idx(cpu, i)] = page;
628 } 759 }
760 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
761 page_end - page_start);
629 } 762 }
630 763
631 if (unmap_start >= 0) 764 for (i = page_start; i < page_end; i++)
632 pcpu_unmap(chunk, unmap_start, unmap_end, flush); 765 __clear_bit(i, populated);
766}
767
768/**
769 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
770 * @chunk: pcpu_chunk the regions to be flushed belong to
771 * @page_start: page index of the first page to be flushed
772 * @page_end: page index of the last page to be flushed + 1
773 *
774 * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
775 * TLB for the regions. This can be skipped if the area is to be
776 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
777 *
778 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
779 * for the whole region.
780 */
781static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
782 int page_start, int page_end)
783{
784 flush_tlb_kernel_range(
785 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
786 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
787}
788
789static int __pcpu_map_pages(unsigned long addr, struct page **pages,
790 int nr_pages)
791{
792 return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
793 PAGE_KERNEL, pages);
633} 794}
634 795
635/** 796/**
636 * pcpu_map - map pages into a pcpu_chunk 797 * pcpu_map_pages - map pages into a pcpu_chunk
637 * @chunk: chunk of interest 798 * @chunk: chunk of interest
799 * @pages: pages array containing pages to be mapped
800 * @populated: populated bitmap
638 * @page_start: page index of the first page to map 801 * @page_start: page index of the first page to map
639 * @page_end: page index of the last page to map + 1 802 * @page_end: page index of the last page to map + 1
640 * 803 *
641 * For each cpu, map pages [@page_start,@page_end) into @chunk. 804 * For each cpu, map pages [@page_start,@page_end) into @chunk. The
642 * vcache is flushed afterwards. 805 * caller is responsible for calling pcpu_post_map_flush() after all
806 * mappings are complete.
807 *
808 * This function is responsible for setting corresponding bits in
809 * @chunk->populated bitmap and whatever is necessary for reverse
810 * lookup (addr -> chunk).
643 */ 811 */
644static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) 812static int pcpu_map_pages(struct pcpu_chunk *chunk,
813 struct page **pages, unsigned long *populated,
814 int page_start, int page_end)
645{ 815{
646 unsigned int last = num_possible_cpus() - 1; 816 unsigned int cpu, tcpu;
647 unsigned int cpu; 817 int i, err;
648 int err;
649
650 /* map must not be done on immutable chunk */
651 WARN_ON(chunk->immutable);
652 818
653 for_each_possible_cpu(cpu) { 819 for_each_possible_cpu(cpu) {
654 err = map_kernel_range_noflush( 820 err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
655 pcpu_chunk_addr(chunk, cpu, page_start), 821 &pages[pcpu_page_idx(cpu, page_start)],
656 (page_end - page_start) << PAGE_SHIFT, 822 page_end - page_start);
657 PAGE_KERNEL,
658 pcpu_chunk_pagep(chunk, cpu, page_start));
659 if (err < 0) 823 if (err < 0)
660 return err; 824 goto err;
825 }
826
827 /* mapping successful, link chunk and mark populated */
828 for (i = page_start; i < page_end; i++) {
829 for_each_possible_cpu(cpu)
830 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
831 chunk);
832 __set_bit(i, populated);
661 } 833 }
662 834
663 /* flush at once, please read comments in pcpu_unmap() */
664 flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
665 pcpu_chunk_addr(chunk, last, page_end));
666 return 0; 835 return 0;
836
837err:
838 for_each_possible_cpu(tcpu) {
839 if (tcpu == cpu)
840 break;
841 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
842 page_end - page_start);
843 }
844 return err;
845}
846
847/**
848 * pcpu_post_map_flush - flush cache after mapping
849 * @chunk: pcpu_chunk the regions to be flushed belong to
850 * @page_start: page index of the first page to be flushed
851 * @page_end: page index of the last page to be flushed + 1
852 *
853 * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
854 * cache.
855 *
856 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
857 * for the whole region.
858 */
859static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
860 int page_start, int page_end)
861{
862 flush_cache_vmap(
863 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
864 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
865}
866
867/**
868 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
869 * @chunk: chunk to depopulate
870 * @off: offset to the area to depopulate
871 * @size: size of the area to depopulate in bytes
872 * @flush: whether to flush cache and tlb or not
873 *
874 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
875 * from @chunk. If @flush is true, vcache is flushed before unmapping
876 * and tlb after.
877 *
878 * CONTEXT:
879 * pcpu_alloc_mutex.
880 */
881static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
882{
883 int page_start = PFN_DOWN(off);
884 int page_end = PFN_UP(off + size);
885 struct page **pages;
886 unsigned long *populated;
887 int rs, re;
888
889 /* quick path, check whether it's empty already */
890 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
891 if (rs == page_start && re == page_end)
892 return;
893 break;
894 }
895
896 /* immutable chunks can't be depopulated */
897 WARN_ON(chunk->immutable);
898
899 /*
900 * If control reaches here, there must have been at least one
901 * successful population attempt so the temp pages array must
902 * be available now.
903 */
904 pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
905 BUG_ON(!pages);
906
907 /* unmap and free */
908 pcpu_pre_unmap_flush(chunk, page_start, page_end);
909
910 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
911 pcpu_unmap_pages(chunk, pages, populated, rs, re);
912
913 /* no need to flush tlb, vmalloc will handle it lazily */
914
915 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
916 pcpu_free_pages(chunk, pages, populated, rs, re);
917
918 /* commit new bitmap */
919 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
667} 920}
668 921
669/** 922/**
@@ -680,58 +933,68 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
680 */ 933 */
681static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 934static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
682{ 935{
683 const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
684 int page_start = PFN_DOWN(off); 936 int page_start = PFN_DOWN(off);
685 int page_end = PFN_UP(off + size); 937 int page_end = PFN_UP(off + size);
686 int map_start = -1; 938 int free_end = page_start, unmap_end = page_start;
687 int uninitialized_var(map_end); 939 struct page **pages;
940 unsigned long *populated;
688 unsigned int cpu; 941 unsigned int cpu;
689 int i; 942 int rs, re, rc;
690 943
691 for (i = page_start; i < page_end; i++) { 944 /* quick path, check whether all pages are already there */
692 if (pcpu_chunk_page_occupied(chunk, i)) { 945 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
693 if (map_start >= 0) { 946 if (rs == page_start && re == page_end)
694 if (pcpu_map(chunk, map_start, map_end)) 947 goto clear;
695 goto err; 948 break;
696 map_start = -1; 949 }
697 }
698 continue;
699 }
700 950
701 map_start = map_start < 0 ? i : map_start; 951 /* need to allocate and map pages, this chunk can't be immutable */
702 map_end = i + 1; 952 WARN_ON(chunk->immutable);
703 953
704 for_each_possible_cpu(cpu) { 954 pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
705 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); 955 if (!pages)
956 return -ENOMEM;
706 957
707 *pagep = alloc_pages_node(cpu_to_node(cpu), 958 /* alloc and map */
708 alloc_mask, 0); 959 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
709 if (!*pagep) 960 rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
710 goto err; 961 if (rc)
711 pcpu_set_page_chunk(*pagep, chunk); 962 goto err_free;
712 } 963 free_end = re;
713 } 964 }
714 965
715 if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) 966 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
716 goto err; 967 rc = pcpu_map_pages(chunk, pages, populated, rs, re);
968 if (rc)
969 goto err_unmap;
970 unmap_end = re;
971 }
972 pcpu_post_map_flush(chunk, page_start, page_end);
717 973
974 /* commit new bitmap */
975 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
976clear:
718 for_each_possible_cpu(cpu) 977 for_each_possible_cpu(cpu)
719 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, 978 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
720 size);
721
722 return 0; 979 return 0;
723err: 980
724 /* likely under heavy memory pressure, give memory back */ 981err_unmap:
725 pcpu_depopulate_chunk(chunk, off, size, true); 982 pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
726 return -ENOMEM; 983 pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
984 pcpu_unmap_pages(chunk, pages, populated, rs, re);
985 pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
986err_free:
987 pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
988 pcpu_free_pages(chunk, pages, populated, rs, re);
989 return rc;
727} 990}
728 991
729static void free_pcpu_chunk(struct pcpu_chunk *chunk) 992static void free_pcpu_chunk(struct pcpu_chunk *chunk)
730{ 993{
731 if (!chunk) 994 if (!chunk)
732 return; 995 return;
733 if (chunk->vm) 996 if (chunk->vms)
734 free_vm_area(chunk->vm); 997 pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups);
735 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); 998 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
736 kfree(chunk); 999 kfree(chunk);
737} 1000}
@@ -747,10 +1010,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
747 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); 1010 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
748 chunk->map_alloc = PCPU_DFL_MAP_ALLOC; 1011 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
749 chunk->map[chunk->map_used++] = pcpu_unit_size; 1012 chunk->map[chunk->map_used++] = pcpu_unit_size;
750 chunk->page = chunk->page_ar;
751 1013
752 chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); 1014 chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
753 if (!chunk->vm) { 1015 pcpu_nr_groups, pcpu_atom_size,
1016 GFP_KERNEL);
1017 if (!chunk->vms) {
754 free_pcpu_chunk(chunk); 1018 free_pcpu_chunk(chunk);
755 return NULL; 1019 return NULL;
756 } 1020 }
@@ -758,6 +1022,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
758 INIT_LIST_HEAD(&chunk->list); 1022 INIT_LIST_HEAD(&chunk->list);
759 chunk->free_size = pcpu_unit_size; 1023 chunk->free_size = pcpu_unit_size;
760 chunk->contig_hint = pcpu_unit_size; 1024 chunk->contig_hint = pcpu_unit_size;
1025 chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0];
761 1026
762 return chunk; 1027 return chunk;
763} 1028}
@@ -847,7 +1112,8 @@ area_found:
847 1112
848 mutex_unlock(&pcpu_alloc_mutex); 1113 mutex_unlock(&pcpu_alloc_mutex);
849 1114
850 return __addr_to_pcpu_ptr(chunk->vm->addr + off); 1115 /* return address relative to base address */
1116 return __addr_to_pcpu_ptr(chunk->base_addr + off);
851 1117
852fail_unlock: 1118fail_unlock:
853 spin_unlock_irq(&pcpu_lock); 1119 spin_unlock_irq(&pcpu_lock);
@@ -925,12 +1191,13 @@ static void pcpu_reclaim(struct work_struct *work)
925 } 1191 }
926 1192
927 spin_unlock_irq(&pcpu_lock); 1193 spin_unlock_irq(&pcpu_lock);
928 mutex_unlock(&pcpu_alloc_mutex);
929 1194
930 list_for_each_entry_safe(chunk, next, &todo, list) { 1195 list_for_each_entry_safe(chunk, next, &todo, list) {
931 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); 1196 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
932 free_pcpu_chunk(chunk); 1197 free_pcpu_chunk(chunk);
933 } 1198 }
1199
1200 mutex_unlock(&pcpu_alloc_mutex);
934} 1201}
935 1202
936/** 1203/**
@@ -955,7 +1222,7 @@ void free_percpu(void *ptr)
955 spin_lock_irqsave(&pcpu_lock, flags); 1222 spin_lock_irqsave(&pcpu_lock, flags);
956 1223
957 chunk = pcpu_chunk_addr_search(addr); 1224 chunk = pcpu_chunk_addr_search(addr);
958 off = addr - chunk->vm->addr; 1225 off = addr - chunk->base_addr;
959 1226
960 pcpu_free_area(chunk, off); 1227 pcpu_free_area(chunk, off);
961 1228
@@ -974,30 +1241,295 @@ void free_percpu(void *ptr)
974} 1241}
975EXPORT_SYMBOL_GPL(free_percpu); 1242EXPORT_SYMBOL_GPL(free_percpu);
976 1243
1244static inline size_t pcpu_calc_fc_sizes(size_t static_size,
1245 size_t reserved_size,
1246 ssize_t *dyn_sizep)
1247{
1248 size_t size_sum;
1249
1250 size_sum = PFN_ALIGN(static_size + reserved_size +
1251 (*dyn_sizep >= 0 ? *dyn_sizep : 0));
1252 if (*dyn_sizep != 0)
1253 *dyn_sizep = size_sum - static_size - reserved_size;
1254
1255 return size_sum;
1256}
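
The helper above page-aligns static + reserved + dynamic and, when the dynamic size is auto (-1) or otherwise non-zero, folds the alignment slack back into it. A minimal standalone sketch of that math, with made-up sizes and an assumed 4096-byte page (not kernel code), compiles with any host C compiler:

/* Hypothetical standalone model of the sizing math above; not kernel code.
 * Assumes 4096-byte pages, mirroring PFN_ALIGN on such a configuration. */
#include <stdio.h>
#include <stddef.h>

#define MODEL_PAGE_SIZE 4096UL
#define MODEL_PFN_ALIGN(x) (((x) + MODEL_PAGE_SIZE - 1) & ~(MODEL_PAGE_SIZE - 1))

static size_t model_calc_fc_sizes(size_t static_size, size_t reserved_size,
                                  long *dyn_sizep)
{
        size_t size_sum;

        size_sum = MODEL_PFN_ALIGN(static_size + reserved_size +
                                   (*dyn_sizep >= 0 ? *dyn_sizep : 0));
        if (*dyn_sizep != 0)
                *dyn_sizep = size_sum - static_size - reserved_size;

        return size_sum;
}

int main(void)
{
        long dyn = -1;  /* -1 for "auto", as in the kernel interface */
        size_t sum = model_calc_fc_sizes(45000, 8192, &dyn);

        /* e.g. 45000 + 8192 = 53192 rounds up to 53248; dyn becomes the 56-byte slack */
        printf("size_sum=%zu dyn_size=%ld\n", sum, dyn);
        return 0;
}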
1257
977/** 1258/**
978 * pcpu_setup_first_chunk - initialize the first percpu chunk 1259 * pcpu_alloc_alloc_info - allocate percpu allocation info
979 * @get_page_fn: callback to fetch page pointer 1260 * @nr_groups: the number of groups
980 * @static_size: the size of static percpu area in bytes 1261 * @nr_units: the number of units
1262 *
1263 * Allocate ai which is large enough for @nr_groups groups containing
1264 * @nr_units units. The returned ai's groups[0].cpu_map points to the
1265 * cpu_map array which is long enough for @nr_units and filled with
1266 * NR_CPUS. It's the caller's responsibility to initialize the cpu_map
1267 * pointers of the other groups.
1268 *
1269 * RETURNS:
1270 * Pointer to the allocated pcpu_alloc_info on success, NULL on
1271 * failure.
1272 */
1273struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1274 int nr_units)
1275{
1276 struct pcpu_alloc_info *ai;
1277 size_t base_size, ai_size;
1278 void *ptr;
1279 int unit;
1280
1281 base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
1282 __alignof__(ai->groups[0].cpu_map[0]));
1283 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
1284
1285 ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
1286 if (!ptr)
1287 return NULL;
1288 ai = ptr;
1289 ptr += base_size;
1290
1291 ai->groups[0].cpu_map = ptr;
1292
1293 for (unit = 0; unit < nr_units; unit++)
1294 ai->groups[0].cpu_map[unit] = NR_CPUS;
1295
1296 ai->nr_groups = nr_groups;
1297 ai->__ai_size = PFN_ALIGN(ai_size);
1298
1299 return ai;
1300}
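
pcpu_alloc_alloc_info() packs everything into one bootmem block: the header, the group descriptors, then a single cpu_map array whose start is aligned for its element type. A simplified, compile-anywhere model of that layout arithmetic follows; the struct definitions are stand-ins, not the kernel's:

/* Simplified model of the single-block layout used by pcpu_alloc_alloc_info():
 * header + group array + one shared cpu_map tail. Stand-in types only. */
#include <stdio.h>
#include <stddef.h>

struct model_group_info {
        int             nr_units;
        long            base_offset;
        unsigned int    *cpu_map;
};

struct model_alloc_info {
        size_t  static_size, reserved_size, dyn_size, unit_size;
        int     nr_groups;
        struct model_group_info groups[];
};

#define MODEL_ALIGN(x, a)       (((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
        int nr_groups = 2, nr_units = 8;
        size_t base_size, ai_size;

        /* header plus group descriptors, aligned for the cpu_map entries */
        base_size = MODEL_ALIGN(sizeof(struct model_alloc_info) +
                                nr_groups * sizeof(struct model_group_info),
                                __alignof__(unsigned int));
        /* the shared cpu_map tail holds one slot per unit */
        ai_size = base_size + nr_units * sizeof(unsigned int);

        printf("base_size=%zu ai_size=%zu\n", base_size, ai_size);
        return 0;
}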
1301
1302/**
1303 * pcpu_free_alloc_info - free percpu allocation info
1304 * @ai: pcpu_alloc_info to free
1305 *
1306 * Free @ai which was allocated by pcpu_alloc_alloc_info().
1307 */
1308void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1309{
1310 free_bootmem(__pa(ai), ai->__ai_size);
1311}
1312
1313/**
1314 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
981 * @reserved_size: the size of reserved percpu area in bytes 1315 * @reserved_size: the size of reserved percpu area in bytes
982 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1316 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
983 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto 1317 * @atom_size: allocation atom size
984 * @base_addr: mapped address, NULL for auto 1318 * @cpu_distance_fn: callback to determine distance between cpus, optional
985 * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary 1319 *
1320 * This function determines grouping of units, their mappings to cpus
1321 * and other parameters considering needed percpu size, allocation
1322 * atom size and distances between CPUs.
1323 *
1324 * Groups are always multiples of atom size and CPUs which are of
1325 * LOCAL_DISTANCE both ways are grouped together and share space for
1326 * units in the same group. The returned configuration is guaranteed
1327 * to have CPUs on different nodes in different groups and >=75% usage
1328 * of allocated virtual address space.
1329 *
1330 * RETURNS:
1331 * On success, pointer to the new allocation_info is returned. On
1332 * failure, ERR_PTR value is returned.
1333 */
1334struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1335 size_t reserved_size, ssize_t dyn_size,
1336 size_t atom_size,
1337 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1338{
1339 static int group_map[NR_CPUS] __initdata;
1340 static int group_cnt[NR_CPUS] __initdata;
1341 const size_t static_size = __per_cpu_end - __per_cpu_start;
1342 int group_cnt_max = 0, nr_groups = 1, nr_units = 0;
1343 size_t size_sum, min_unit_size, alloc_size;
1344 int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
1345 int last_allocs, group, unit;
1346 unsigned int cpu, tcpu;
1347 struct pcpu_alloc_info *ai;
1348 unsigned int *cpu_map;
1349
1350 /*
1351 * Determine min_unit_size, alloc_size and max_upa such that
1352 * alloc_size is a multiple of atom_size and is the smallest
1353 * which can accommodate 4k aligned segments which are equal to
1354 * or larger than min_unit_size.
1355 */
1356 size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
1357 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1358
1359 alloc_size = roundup(min_unit_size, atom_size);
1360 upa = alloc_size / min_unit_size;
1361 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1362 upa--;
1363 max_upa = upa;
1364
1365 /* group cpus according to their proximity */
1366 for_each_possible_cpu(cpu) {
1367 group = 0;
1368 next_group:
1369 for_each_possible_cpu(tcpu) {
1370 if (cpu == tcpu)
1371 break;
1372 if (group_map[tcpu] == group && cpu_distance_fn &&
1373 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1374 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1375 group++;
1376 nr_groups = max(nr_groups, group + 1);
1377 goto next_group;
1378 }
1379 }
1380 group_map[cpu] = group;
1381 group_cnt[group]++;
1382 group_cnt_max = max(group_cnt_max, group_cnt[group]);
1383 }
1384
1385 /*
1386 * Expand unit size until address space usage goes over 75%
1387 * and then as much as possible without using more address
1388 * space.
1389 */
1390 last_allocs = INT_MAX;
1391 for (upa = max_upa; upa; upa--) {
1392 int allocs = 0, wasted = 0;
1393
1394 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1395 continue;
1396
1397 for (group = 0; group < nr_groups; group++) {
1398 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1399 allocs += this_allocs;
1400 wasted += this_allocs * upa - group_cnt[group];
1401 }
1402
1403 /*
1404 * Don't accept if wastage is over 25%. The
1405 * greater-than comparison ensures upa==1 always
1406 * passes the following check.
1407 */
1408 if (wasted > num_possible_cpus() / 3)
1409 continue;
1410
1411 /* and then don't consume more memory */
1412 if (allocs > last_allocs)
1413 break;
1414 last_allocs = allocs;
1415 best_upa = upa;
1416 }
1417 upa = best_upa;
1418
1419 /* allocate and fill alloc_info */
1420 for (group = 0; group < nr_groups; group++)
1421 nr_units += roundup(group_cnt[group], upa);
1422
1423 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1424 if (!ai)
1425 return ERR_PTR(-ENOMEM);
1426 cpu_map = ai->groups[0].cpu_map;
1427
1428 for (group = 0; group < nr_groups; group++) {
1429 ai->groups[group].cpu_map = cpu_map;
1430 cpu_map += roundup(group_cnt[group], upa);
1431 }
1432
1433 ai->static_size = static_size;
1434 ai->reserved_size = reserved_size;
1435 ai->dyn_size = dyn_size;
1436 ai->unit_size = alloc_size / upa;
1437 ai->atom_size = atom_size;
1438 ai->alloc_size = alloc_size;
1439
1440 for (group = 0, unit = 0; group_cnt[group]; group++) {
1441 struct pcpu_group_info *gi = &ai->groups[group];
1442
1443 /*
1444 * Initialize base_offset as if all groups are located
1445 * back-to-back. The caller should update this to
1446 * reflect actual allocation.
1447 */
1448 gi->base_offset = unit * ai->unit_size;
1449
1450 for_each_possible_cpu(cpu)
1451 if (group_map[cpu] == group)
1452 gi->cpu_map[gi->nr_units++] = cpu;
1453 gi->nr_units = roundup(gi->nr_units, upa);
1454 unit += gi->nr_units;
1455 }
1456 BUG_ON(unit != nr_units);
1457
1458 return ai;
1459}
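
The heart of pcpu_build_alloc_info() is the units-per-alloc (upa) search: start from the largest upa that keeps units page-aligned, skip any value that wastes more than roughly 25% of the allocated units, and stop once shrinking upa would need more atom-sized allocations. A toy standalone model of just that loop, with invented group sizes and a 2 MiB atom:

/* Toy, standalone model of the upa selection loop above. All configuration
 * values below are made up; a 4096-byte page is assumed. */
#include <stdio.h>

#define MODEL_PAGE_MASK (~(4096UL - 1))

int main(void)
{
        unsigned long alloc_size = 2UL << 20;   /* 2 MiB atom, e.g. a huge page */
        int group_cnt[] = { 5, 3 };             /* CPUs per group (hypothetical) */
        int nr_groups = 2, nr_cpus = 8;
        int upa, max_upa, best_upa = 1, last_allocs = 1 << 30;

        /* largest upa whose unit size divides alloc_size and stays page aligned */
        for (max_upa = alloc_size / 4096; max_upa > 1; max_upa--)
                if (!(alloc_size % max_upa) &&
                    !((alloc_size / max_upa) & ~MODEL_PAGE_MASK))
                        break;

        for (upa = max_upa; upa; upa--) {
                int allocs = 0, wasted = 0, g;

                if (alloc_size % upa || ((alloc_size / upa) & ~MODEL_PAGE_MASK))
                        continue;

                for (g = 0; g < nr_groups; g++) {
                        int this_allocs = (group_cnt[g] + upa - 1) / upa;
                        allocs += this_allocs;
                        wasted += this_allocs * upa - group_cnt[g];
                }

                if (wasted > nr_cpus / 3)       /* >25% of allocated units wasted */
                        continue;
                if (allocs > last_allocs)       /* would consume more memory */
                        break;
                last_allocs = allocs;
                best_upa = upa;
        }

        /* with the values above this settles on upa=2, i.e. 1 MiB units */
        printf("chosen upa=%d, unit_size=%lu\n", best_upa, alloc_size / best_upa);
        return 0;
}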
1460
1461/**
1462 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
1463 * @lvl: loglevel
1464 * @ai: allocation info to dump
1465 *
1466 * Print out information about @ai using loglevel @lvl.
1467 */
1468static void pcpu_dump_alloc_info(const char *lvl,
1469 const struct pcpu_alloc_info *ai)
1470{
1471 int group_width = 1, cpu_width = 1, width;
1472 char empty_str[] = "--------";
1473 int alloc = 0, alloc_end = 0;
1474 int group, v;
1475 int upa, apl; /* units per alloc, allocs per line */
1476
1477 v = ai->nr_groups;
1478 while (v /= 10)
1479 group_width++;
1480
1481 v = num_possible_cpus();
1482 while (v /= 10)
1483 cpu_width++;
1484 empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
1485
1486 upa = ai->alloc_size / ai->unit_size;
1487 width = upa * (cpu_width + 1) + group_width + 3;
1488 apl = rounddown_pow_of_two(max(60 / width, 1));
1489
1490 printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
1491 lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
1492 ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
1493
1494 for (group = 0; group < ai->nr_groups; group++) {
1495 const struct pcpu_group_info *gi = &ai->groups[group];
1496 int unit = 0, unit_end = 0;
1497
1498 BUG_ON(gi->nr_units % upa);
1499 for (alloc_end += gi->nr_units / upa;
1500 alloc < alloc_end; alloc++) {
1501 if (!(alloc % apl)) {
1502 printk("\n");
1503 printk("%spcpu-alloc: ", lvl);
1504 }
1505 printk("[%0*d] ", group_width, group);
1506
1507 for (unit_end += upa; unit < unit_end; unit++)
1508 if (gi->cpu_map[unit] != NR_CPUS)
1509 printk("%0*d ", cpu_width,
1510 gi->cpu_map[unit]);
1511 else
1512 printk("%s ", empty_str);
1513 }
1514 }
1515 printk("\n");
1516}
1517
1518/**
1519 * pcpu_setup_first_chunk - initialize the first percpu chunk
1520 * @ai: pcpu_alloc_info describing how the percpu area is shaped
1521 * @base_addr: mapped address
986 * 1522 *
987 * Initialize the first percpu chunk which contains the kernel static 1523 * Initialize the first percpu chunk which contains the kernel static
988 * percpu area. This function is to be called from arch percpu area 1524 * percpu area. This function is to be called from arch percpu area
989 * setup path. The first two parameters are mandatory. The rest are 1525 * setup path.
990 * optional. 1526 *
991 * 1527 * @ai contains all information necessary to initialize the first
992 * @get_page_fn() should return pointer to percpu page given cpu 1528 * chunk and prime the dynamic percpu allocator.
993 * number and page number. It should at least return enough pages to 1529 *
994 * cover the static area. The returned pages for static area should 1530 * @ai->static_size is the size of static percpu area.
995 * have been initialized with valid data. If @unit_size is specified, 1531 *
996 * it can also return pages after the static area. NULL return 1532 * @ai->reserved_size, if non-zero, specifies the amount of bytes to
997 * indicates end of pages for the cpu. Note that @get_page_fn() must
998 * return the same number of pages for all cpus.
999 *
1000 * @reserved_size, if non-zero, specifies the amount of bytes to
1001 * reserve after the static area in the first chunk. This reserves 1533 * reserve after the static area in the first chunk. This reserves
1002 * the first chunk such that it's available only through reserved 1534 * the first chunk such that it's available only through reserved
1003 * percpu allocation. This is primarily used to serve module percpu 1535 * percpu allocation. This is primarily used to serve module percpu
@@ -1005,22 +1537,29 @@ EXPORT_SYMBOL_GPL(free_percpu);
1005 * limited offset range for symbol relocations to guarantee module 1537 * limited offset range for symbol relocations to guarantee module
1006 * percpu symbols fall inside the relocatable range. 1538 * percpu symbols fall inside the relocatable range.
1007 * 1539 *
1008 * @dyn_size, if non-negative, determines the number of bytes 1540 * @ai->dyn_size determines the number of bytes available for dynamic
1009 * available for dynamic allocation in the first chunk. Specifying 1541 * allocation in the first chunk. The area between @ai->static_size +
1010 * non-negative value makes percpu leave alone the area beyond 1542 * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
1011 * @static_size + @reserved_size + @dyn_size.
1012 * 1543 *
1013 * @unit_size, if non-negative, specifies unit size and must be 1544 * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
1014 * aligned to PAGE_SIZE and equal to or larger than @static_size + 1545 * and equal to or larger than @ai->static_size + @ai->reserved_size +
1015 * @reserved_size + if non-negative, @dyn_size. 1546 * @ai->dyn_size.
1016 * 1547 *
1017 * Non-null @base_addr means that the caller already allocated virtual 1548 * @ai->atom_size is the allocation atom size and used as alignment
1018 * region for the first chunk and mapped it. percpu must not mess 1549 * for vm areas.
1019 * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL
1020 * @populate_pte_fn doesn't make any sense.
1021 * 1550 *
1022 * @populate_pte_fn is used to populate the pagetable. NULL means the 1551 * @ai->alloc_size is the allocation size and always multiple of
1023 * caller already populated the pagetable. 1552 * @ai->atom_size. This is larger than @ai->atom_size if
1553 * @ai->unit_size is larger than @ai->atom_size.
1554 *
1555 * @ai->nr_groups and @ai->groups describe virtual memory layout of
1556 * percpu areas. Units which should be colocated are put into the
1557 * same group. Dynamic VM areas will be allocated according to these
1558 * groupings. If @ai->nr_groups is zero, a single group containing
1559 * all units is assumed.
1560 *
1561 * The caller should have mapped the first chunk at @base_addr and
1562 * copied static data to each unit.
1024 * 1563 *
1025 * If the first chunk ends up with both reserved and dynamic areas, it 1564 * If the first chunk ends up with both reserved and dynamic areas, it
1026 * is served by two chunks - one to serve the core static and reserved 1565 * is served by two chunks - one to serve the core static and reserved
@@ -1030,49 +1569,83 @@ EXPORT_SYMBOL_GPL(free_percpu);
1030 * and available for dynamic allocation like any other chunks. 1569 * and available for dynamic allocation like any other chunks.
1031 * 1570 *
1032 * RETURNS: 1571 * RETURNS:
1033 * The determined pcpu_unit_size which can be used to initialize 1572 * 0 on success, -errno on failure.
1034 * percpu access.
1035 */ 1573 */
1036size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, 1574int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1037 size_t static_size, size_t reserved_size, 1575 void *base_addr)
1038 ssize_t dyn_size, ssize_t unit_size,
1039 void *base_addr,
1040 pcpu_populate_pte_fn_t populate_pte_fn)
1041{ 1576{
1042 static struct vm_struct first_vm;
1043 static int smap[2], dmap[2]; 1577 static int smap[2], dmap[2];
1044 size_t size_sum = static_size + reserved_size + 1578 size_t dyn_size = ai->dyn_size;
1045 (dyn_size >= 0 ? dyn_size : 0); 1579 size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
1046 struct pcpu_chunk *schunk, *dchunk = NULL; 1580 struct pcpu_chunk *schunk, *dchunk = NULL;
1581 unsigned long *group_offsets;
1582 size_t *group_sizes;
1583 unsigned long *unit_off;
1047 unsigned int cpu; 1584 unsigned int cpu;
1048 int nr_pages; 1585 int *unit_map;
1049 int err, i; 1586 int group, unit, i;
1050 1587
1051 /* santiy checks */ 1588 /* sanity checks */
1052 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || 1589 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
1053 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); 1590 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
1054 BUG_ON(!static_size); 1591 BUG_ON(ai->nr_groups <= 0);
1055 if (unit_size >= 0) { 1592 BUG_ON(!ai->static_size);
1056 BUG_ON(unit_size < size_sum); 1593 BUG_ON(!base_addr);
1057 BUG_ON(unit_size & ~PAGE_MASK); 1594 BUG_ON(ai->unit_size < size_sum);
1058 BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); 1595 BUG_ON(ai->unit_size & ~PAGE_MASK);
1059 } else 1596 BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
1060 BUG_ON(base_addr); 1597
1061 BUG_ON(base_addr && populate_pte_fn); 1598 pcpu_dump_alloc_info(KERN_DEBUG, ai);
1062 1599
1063 if (unit_size >= 0) 1600 /* process group information and build config tables accordingly */
1064 pcpu_unit_pages = unit_size >> PAGE_SHIFT; 1601 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
1065 else 1602 group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));
1066 pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, 1603 unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
1067 PFN_UP(size_sum)); 1604 unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
1605
1606 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1607 unit_map[cpu] = NR_CPUS;
1608 pcpu_first_unit_cpu = NR_CPUS;
1609
1610 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
1611 const struct pcpu_group_info *gi = &ai->groups[group];
1612
1613 group_offsets[group] = gi->base_offset;
1614 group_sizes[group] = gi->nr_units * ai->unit_size;
1615
1616 for (i = 0; i < gi->nr_units; i++) {
1617 cpu = gi->cpu_map[i];
1618 if (cpu == NR_CPUS)
1619 continue;
1068 1620
1069 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; 1621 BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu));
1070 pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; 1622 BUG_ON(unit_map[cpu] != NR_CPUS);
1071 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
1072 + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
1073 1623
1074 if (dyn_size < 0) 1624 unit_map[cpu] = unit + i;
1075 dyn_size = pcpu_unit_size - static_size - reserved_size; 1625 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
1626
1627 if (pcpu_first_unit_cpu == NR_CPUS)
1628 pcpu_first_unit_cpu = cpu;
1629 }
1630 }
1631 pcpu_last_unit_cpu = cpu;
1632 pcpu_nr_units = unit;
1633
1634 for_each_possible_cpu(cpu)
1635 BUG_ON(unit_map[cpu] == NR_CPUS);
1636
1637 pcpu_nr_groups = ai->nr_groups;
1638 pcpu_group_offsets = group_offsets;
1639 pcpu_group_sizes = group_sizes;
1640 pcpu_unit_map = unit_map;
1641 pcpu_unit_offsets = unit_off;
1642
1643 /* determine basic parameters */
1644 pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
1645 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1646 pcpu_atom_size = ai->atom_size;
1647 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
1648 BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
1076 1649
1077 /* 1650 /*
1078 * Allocate chunk slots. The additional last slot is for 1651 * Allocate chunk slots. The additional last slot is for
@@ -1092,186 +1665,351 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1092 */ 1665 */
1093 schunk = alloc_bootmem(pcpu_chunk_struct_size); 1666 schunk = alloc_bootmem(pcpu_chunk_struct_size);
1094 INIT_LIST_HEAD(&schunk->list); 1667 INIT_LIST_HEAD(&schunk->list);
1095 schunk->vm = &first_vm; 1668 schunk->base_addr = base_addr;
1096 schunk->map = smap; 1669 schunk->map = smap;
1097 schunk->map_alloc = ARRAY_SIZE(smap); 1670 schunk->map_alloc = ARRAY_SIZE(smap);
1098 schunk->page = schunk->page_ar; 1671 schunk->immutable = true;
1672 bitmap_fill(schunk->populated, pcpu_unit_pages);
1099 1673
1100 if (reserved_size) { 1674 if (ai->reserved_size) {
1101 schunk->free_size = reserved_size; 1675 schunk->free_size = ai->reserved_size;
1102 pcpu_reserved_chunk = schunk; 1676 pcpu_reserved_chunk = schunk;
1103 pcpu_reserved_chunk_limit = static_size + reserved_size; 1677 pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
1104 } else { 1678 } else {
1105 schunk->free_size = dyn_size; 1679 schunk->free_size = dyn_size;
1106 dyn_size = 0; /* dynamic area covered */ 1680 dyn_size = 0; /* dynamic area covered */
1107 } 1681 }
1108 schunk->contig_hint = schunk->free_size; 1682 schunk->contig_hint = schunk->free_size;
1109 1683
1110 schunk->map[schunk->map_used++] = -static_size; 1684 schunk->map[schunk->map_used++] = -ai->static_size;
1111 if (schunk->free_size) 1685 if (schunk->free_size)
1112 schunk->map[schunk->map_used++] = schunk->free_size; 1686 schunk->map[schunk->map_used++] = schunk->free_size;
1113 1687
1114 /* init dynamic chunk if necessary */ 1688 /* init dynamic chunk if necessary */
1115 if (dyn_size) { 1689 if (dyn_size) {
1116 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); 1690 dchunk = alloc_bootmem(pcpu_chunk_struct_size);
1117 INIT_LIST_HEAD(&dchunk->list); 1691 INIT_LIST_HEAD(&dchunk->list);
1118 dchunk->vm = &first_vm; 1692 dchunk->base_addr = base_addr;
1119 dchunk->map = dmap; 1693 dchunk->map = dmap;
1120 dchunk->map_alloc = ARRAY_SIZE(dmap); 1694 dchunk->map_alloc = ARRAY_SIZE(dmap);
1121 dchunk->page = schunk->page_ar; /* share page map with schunk */ 1695 dchunk->immutable = true;
1696 bitmap_fill(dchunk->populated, pcpu_unit_pages);
1122 1697
1123 dchunk->contig_hint = dchunk->free_size = dyn_size; 1698 dchunk->contig_hint = dchunk->free_size = dyn_size;
1124 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; 1699 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
1125 dchunk->map[dchunk->map_used++] = dchunk->free_size; 1700 dchunk->map[dchunk->map_used++] = dchunk->free_size;
1126 } 1701 }
1127 1702
1128 /* allocate vm address */
1129 first_vm.flags = VM_ALLOC;
1130 first_vm.size = pcpu_chunk_size;
1131
1132 if (!base_addr)
1133 vm_area_register_early(&first_vm, PAGE_SIZE);
1134 else {
1135 /*
1136 * Pages already mapped. No need to remap into
1137 * vmalloc area. In this case the first chunks can't
1138 * be mapped or unmapped by percpu and are marked
1139 * immutable.
1140 */
1141 first_vm.addr = base_addr;
1142 schunk->immutable = true;
1143 if (dchunk)
1144 dchunk->immutable = true;
1145 }
1146
1147 /* assign pages */
1148 nr_pages = -1;
1149 for_each_possible_cpu(cpu) {
1150 for (i = 0; i < pcpu_unit_pages; i++) {
1151 struct page *page = get_page_fn(cpu, i);
1152
1153 if (!page)
1154 break;
1155 *pcpu_chunk_pagep(schunk, cpu, i) = page;
1156 }
1157
1158 BUG_ON(i < PFN_UP(static_size));
1159
1160 if (nr_pages < 0)
1161 nr_pages = i;
1162 else
1163 BUG_ON(nr_pages != i);
1164 }
1165
1166 /* map them */
1167 if (populate_pte_fn) {
1168 for_each_possible_cpu(cpu)
1169 for (i = 0; i < nr_pages; i++)
1170 populate_pte_fn(pcpu_chunk_addr(schunk,
1171 cpu, i));
1172
1173 err = pcpu_map(schunk, 0, nr_pages);
1174 if (err)
1175 panic("failed to setup static percpu area, err=%d\n",
1176 err);
1177 }
1178
1179 /* link the first chunk in */ 1703 /* link the first chunk in */
1180 pcpu_first_chunk = dchunk ?: schunk; 1704 pcpu_first_chunk = dchunk ?: schunk;
1181 pcpu_chunk_relocate(pcpu_first_chunk, -1); 1705 pcpu_chunk_relocate(pcpu_first_chunk, -1);
1182 1706
1183 /* we're done */ 1707 /* we're done */
1184 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); 1708 pcpu_base_addr = base_addr;
1185 return pcpu_unit_size; 1709 return 0;
1186} 1710}
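
pcpu_setup_first_chunk() boils the group info down to flat tables: for each unit i of a group, unit_off[cpu] = base_offset + i * unit_size, and the generic setup_per_cpu_areas() at the end of this patch turns those offsets into __per_cpu_offset[]. A small standalone sketch of that table construction, using an invented two-group layout:

/* Standalone sketch of how the per-cpu offset table is derived from the
 * group info. The group layout below is invented for illustration. */
#include <stdio.h>

#define MODEL_NR_CPUS 4

int main(void)
{
        unsigned long unit_size = 1UL << 20;            /* 1 MiB units */
        /* two groups of two units each, second group placed 4 MiB further on */
        unsigned long base_offset[] = { 0, 4UL << 20 };
        unsigned int cpu_map[2][2] = { { 0, 2 }, { 1, 3 } };
        unsigned long unit_off[MODEL_NR_CPUS];
        int group, i;

        for (group = 0; group < 2; group++)
                for (i = 0; i < 2; i++) {
                        unsigned int cpu = cpu_map[group][i];
                        unit_off[cpu] = base_offset[group] + i * unit_size;
                }

        /* a per-cpu pointer for cpu N then lives at base_addr + unit_off[N] */
        for (i = 0; i < MODEL_NR_CPUS; i++)
                printf("cpu%d -> unit_off 0x%lx\n", i, unit_off[i]);
        return 0;
}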
1187 1711
1188/* 1712const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
1189 * Embedding first chunk setup helper. 1713 [PCPU_FC_AUTO] = "auto",
1190 */ 1714 [PCPU_FC_EMBED] = "embed",
1191static void *pcpue_ptr __initdata; 1715 [PCPU_FC_PAGE] = "page",
1192static size_t pcpue_size __initdata; 1716};
1193static size_t pcpue_unit_size __initdata;
1194 1717
1195static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) 1718enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
1196{
1197 size_t off = (size_t)pageno << PAGE_SHIFT;
1198 1719
1199 if (off >= pcpue_size) 1720static int __init percpu_alloc_setup(char *str)
1200 return NULL; 1721{
1722 if (0)
1723 /* nada */;
1724#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
1725 else if (!strcmp(str, "embed"))
1726 pcpu_chosen_fc = PCPU_FC_EMBED;
1727#endif
1728#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1729 else if (!strcmp(str, "page"))
1730 pcpu_chosen_fc = PCPU_FC_PAGE;
1731#endif
1732 else
1733 pr_warning("PERCPU: unknown allocator %s specified\n", str);
1201 1734
1202 return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); 1735 return 0;
1203} 1736}
1737early_param("percpu_alloc", percpu_alloc_setup);
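
The percpu_alloc= early parameter registered above lets the first-chunk allocator be overridden at boot when more than one is built in; for example, appending percpu_alloc=page to the kernel command line selects the page-by-page allocator, while an unrecognized value only prints the warning and keeps the auto default.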
1204 1738
1739#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
1740 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
1205/** 1741/**
1206 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem 1742 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
1207 * @static_size: the size of static percpu area in bytes
1208 * @reserved_size: the size of reserved percpu area in bytes 1743 * @reserved_size: the size of reserved percpu area in bytes
1209 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1744 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
1210 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto 1745 * @atom_size: allocation atom size
1746 * @cpu_distance_fn: callback to determine distance between cpus, optional
1747 * @alloc_fn: function to allocate percpu page
1748 * @free_fn: function to free percpu page
1211 * 1749 *
1212 * This is a helper to ease setting up embedded first percpu chunk and 1750 * This is a helper to ease setting up embedded first percpu chunk and
1213 * can be called where pcpu_setup_first_chunk() is expected. 1751 * can be called where pcpu_setup_first_chunk() is expected.
1214 * 1752 *
1215 * If this function is used to setup the first chunk, it is allocated 1753 * If this function is used to setup the first chunk, it is allocated
1216 * as a contiguous area using bootmem allocator and used as-is without 1754 * by calling @alloc_fn and used as-is without being mapped into
1217 * being mapped into vmalloc area. This enables the first chunk to 1755 * vmalloc area. Allocations are always whole multiples of @atom_size
1218 * piggy back on the linear physical mapping which often uses larger 1756 * aligned to @atom_size.
1219 * page size. 1757 *
1758 * This enables the first chunk to piggy back on the linear physical
1759 * mapping which often uses larger page size. Please note that this
1760 * can result in very sparse cpu->unit mapping on NUMA machines thus
1761 * requiring large vmalloc address space. Don't use this allocator if
1762 * vmalloc space is not orders of magnitude larger than distances
1763 * between node memory addresses (i.e. 32-bit NUMA machines).
1220 * 1764 *
1221 * When @dyn_size is positive, dynamic area might be larger than 1765 * When @dyn_size is positive, dynamic area might be larger than
1222 * specified to fill page alignment. Also, when @dyn_size is auto, 1766 * specified to fill page alignment. When @dyn_size is auto,
1223 * @dyn_size does not fill the whole first chunk but only what's 1767 * @dyn_size is just big enough to fill page alignment after static
1224 * necessary for page alignment after static and reserved areas. 1768 * and reserved areas.
1225 * 1769 *
1226 * If the needed size is smaller than the minimum or specified unit 1770 * If the needed size is smaller than the minimum or specified unit
1227 * size, the leftover is returned to the bootmem allocator. 1771 * size, the leftover is returned using @free_fn.
1228 * 1772 *
1229 * RETURNS: 1773 * RETURNS:
1230 * The determined pcpu_unit_size which can be used to initialize 1774 * 0 on success, -errno on failure.
1231 * percpu access on success, -errno on failure.
1232 */ 1775 */
1233ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, 1776int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
1234 ssize_t dyn_size, ssize_t unit_size) 1777 size_t atom_size,
1778 pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
1779 pcpu_fc_alloc_fn_t alloc_fn,
1780 pcpu_fc_free_fn_t free_fn)
1235{ 1781{
1236 size_t chunk_size; 1782 void *base = (void *)ULONG_MAX;
1237 unsigned int cpu; 1783 void **areas = NULL;
1784 struct pcpu_alloc_info *ai;
1785 size_t size_sum, areas_size;
1786 int group, i, rc;
1787
1788 ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
1789 cpu_distance_fn);
1790 if (IS_ERR(ai))
1791 return PTR_ERR(ai);
1792
1793 size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
1794 areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
1795
1796 areas = alloc_bootmem_nopanic(areas_size);
1797 if (!areas) {
1798 rc = -ENOMEM;
1799 goto out_free;
1800 }
1238 1801
1239 /* determine parameters and allocate */ 1802 /* allocate, copy and determine base address */
1240 pcpue_size = PFN_ALIGN(static_size + reserved_size + 1803 for (group = 0; group < ai->nr_groups; group++) {
1241 (dyn_size >= 0 ? dyn_size : 0)); 1804 struct pcpu_group_info *gi = &ai->groups[group];
1242 if (dyn_size != 0) 1805 unsigned int cpu = NR_CPUS;
1243 dyn_size = pcpue_size - static_size - reserved_size; 1806 void *ptr;
1244 1807
1245 if (unit_size >= 0) { 1808 for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
1246 BUG_ON(unit_size < pcpue_size); 1809 cpu = gi->cpu_map[i];
1247 pcpue_unit_size = unit_size; 1810 BUG_ON(cpu == NR_CPUS);
1248 } else 1811
1249 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); 1812 /* allocate space for the whole group */
1250 1813 ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
1251 chunk_size = pcpue_unit_size * num_possible_cpus(); 1814 if (!ptr) {
1252 1815 rc = -ENOMEM;
1253 pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, 1816 goto out_free_areas;
1254 __pa(MAX_DMA_ADDRESS)); 1817 }
1255 if (!pcpue_ptr) { 1818 areas[group] = ptr;
1256 pr_warning("PERCPU: failed to allocate %zu bytes for " 1819
1257 "embedding\n", chunk_size); 1820 base = min(ptr, base);
1258 return -ENOMEM; 1821
1822 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
1823 if (gi->cpu_map[i] == NR_CPUS) {
1824 /* unused unit, free whole */
1825 free_fn(ptr, ai->unit_size);
1826 continue;
1827 }
1828 /* copy and return the unused part */
1829 memcpy(ptr, __per_cpu_load, ai->static_size);
1830 free_fn(ptr + size_sum, ai->unit_size - size_sum);
1831 }
1259 } 1832 }
1260 1833
1261 /* return the leftover and copy */ 1834 /* base address is now known, determine group base offsets */
1262 for_each_possible_cpu(cpu) { 1835 for (group = 0; group < ai->nr_groups; group++)
1263 void *ptr = pcpue_ptr + cpu * pcpue_unit_size; 1836 ai->groups[group].base_offset = areas[group] - base;
1837
1838 pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
1839 PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
1840 ai->dyn_size, ai->unit_size);
1841
1842 rc = pcpu_setup_first_chunk(ai, base);
1843 goto out_free;
1844
1845out_free_areas:
1846 for (group = 0; group < ai->nr_groups; group++)
1847 free_fn(areas[group],
1848 ai->groups[group].nr_units * ai->unit_size);
1849out_free:
1850 pcpu_free_alloc_info(ai);
1851 if (areas)
1852 free_bootmem(__pa(areas), areas_size);
1853 return rc;
1854}
1855#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
1856 !CONFIG_HAVE_SETUP_PER_CPU_AREA */
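
After pcpu_embed_first_chunk() has allocated one block per group, the lowest block address becomes the chunk base and each group's base_offset is its distance from that base; on NUMA machines with widely separated node memory this is what makes the first chunk span a large virtual range. A minimal standalone model of that fixup, with invented addresses placed 1 GiB apart:

/* Minimal model of the post-allocation fixup above: the lowest group start
 * becomes the chunk base and each group records its distance from it.
 * Addresses are invented to mimic two nodes whose memory is far apart. */
#include <stdio.h>

int main(void)
{
        unsigned long areas[] = { 0x40000000UL, 0x80000000UL }; /* per-group allocs */
        unsigned long base = (unsigned long)-1;
        unsigned long base_offset[2];
        int group;

        for (group = 0; group < 2; group++)
                if (areas[group] < base)
                        base = areas[group];

        for (group = 0; group < 2; group++)
                base_offset[group] = areas[group] - base;

        /* the first chunk must cover base .. base + largest offset + group size,
         * which is why widely separated node memory needs a big vmalloc hole */
        for (group = 0; group < 2; group++)
                printf("group%d base_offset=0x%lx\n", group, base_offset[group]);
        return 0;
}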
1857
1858#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1859/**
1860 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
1861 * @reserved_size: the size of reserved percpu area in bytes
1862 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
1863 * @free_fn: function to free percpu page, always called with PAGE_SIZE
1864 * @populate_pte_fn: function to populate pte
1865 *
1866 * This is a helper to ease setting up page-remapped first percpu
1867 * chunk and can be called where pcpu_setup_first_chunk() is expected.
1868 *
1869 * This is the basic allocator. Static percpu area is allocated
1870 * page-by-page into vmalloc area.
1871 *
1872 * RETURNS:
1873 * 0 on success, -errno on failure.
1874 */
1875int __init pcpu_page_first_chunk(size_t reserved_size,
1876 pcpu_fc_alloc_fn_t alloc_fn,
1877 pcpu_fc_free_fn_t free_fn,
1878 pcpu_fc_populate_pte_fn_t populate_pte_fn)
1879{
1880 static struct vm_struct vm;
1881 struct pcpu_alloc_info *ai;
1882 char psize_str[16];
1883 int unit_pages;
1884 size_t pages_size;
1885 struct page **pages;
1886 int unit, i, j, rc;
1887
1888 snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
1889
1890 ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL);
1891 if (IS_ERR(ai))
1892 return PTR_ERR(ai);
1893 BUG_ON(ai->nr_groups != 1);
1894 BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
1895
1896 unit_pages = ai->unit_size >> PAGE_SHIFT;
1897
1898 /* unaligned allocations can't be freed, round up to page size */
1899 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
1900 sizeof(pages[0]));
1901 pages = alloc_bootmem(pages_size);
1902
1903 /* allocate pages */
1904 j = 0;
1905 for (unit = 0; unit < num_possible_cpus(); unit++)
1906 for (i = 0; i < unit_pages; i++) {
1907 unsigned int cpu = ai->groups[0].cpu_map[unit];
1908 void *ptr;
1909
1910 ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
1911 if (!ptr) {
1912 pr_warning("PERCPU: failed to allocate %s page "
1913 "for cpu%u\n", psize_str, cpu);
1914 goto enomem;
1915 }
1916 pages[j++] = virt_to_page(ptr);
1917 }
1918
1919 /* allocate vm area, map the pages and copy static data */
1920 vm.flags = VM_ALLOC;
1921 vm.size = num_possible_cpus() * ai->unit_size;
1922 vm_area_register_early(&vm, PAGE_SIZE);
1923
1924 for (unit = 0; unit < num_possible_cpus(); unit++) {
1925 unsigned long unit_addr =
1926 (unsigned long)vm.addr + unit * ai->unit_size;
1927
1928 for (i = 0; i < unit_pages; i++)
1929 populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
1264 1930
1265 free_bootmem(__pa(ptr + pcpue_size), 1931 /* pte already populated, the following shouldn't fail */
1266 pcpue_unit_size - pcpue_size); 1932 rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
1267 memcpy(ptr, __per_cpu_load, static_size); 1933 unit_pages);
1934 if (rc < 0)
1935 panic("failed to map percpu area, err=%d\n", rc);
1936
1937 /*
1938 * FIXME: Archs with virtual cache should flush local
1939 * cache for the linear mapping here - something
1940 * equivalent to flush_cache_vmap() on the local cpu.
1941 * flush_cache_vmap() can't be used as most supporting
1942 * data structures are not set up yet.
1943 */
1944
1945 /* copy static data */
1946 memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
1268 } 1947 }
1269 1948
1270 /* we're ready, commit */ 1949 /* we're ready, commit */
1271 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", 1950 pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
1272 pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); 1951 unit_pages, psize_str, vm.addr, ai->static_size,
1952 ai->reserved_size, ai->dyn_size);
1953
1954 rc = pcpu_setup_first_chunk(ai, vm.addr);
1955 goto out_free_ar;
1956
1957enomem:
1958 while (--j >= 0)
1959 free_fn(page_address(pages[j]), PAGE_SIZE);
1960 rc = -ENOMEM;
1961out_free_ar:
1962 free_bootmem(__pa(pages), pages_size);
1963 pcpu_free_alloc_info(ai);
1964 return rc;
1965}
1966#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
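
In pcpu_page_first_chunk() the pages[] array is filled unit-major, so page j = unit * unit_pages + i ends up mapped at vm.addr + unit * unit_size + i * PAGE_SIZE. A tiny standalone model of that indexing with illustrative sizes:

/* Tiny standalone model of the page bookkeeping above: pages[] is filled
 * unit-major and each unit gets a contiguous unit_size slice of the vm area.
 * Sizes and the base address are illustrative only. */
#include <stdio.h>

int main(void)
{
        unsigned long vm_addr = 0xf0000000UL;   /* pretend vmalloc base */
        int unit_pages = 3, nr_units = 2;
        int unit, i, j = 0;

        for (unit = 0; unit < nr_units; unit++) {
                unsigned long unit_addr = vm_addr + unit * (unit_pages * 4096UL);

                for (i = 0; i < unit_pages; i++, j++)
                        printf("pages[%d] maps at 0x%lx\n",
                               j, unit_addr + i * 4096UL);
        }
        return 0;
}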
1967
1968/*
1969 * Generic percpu area setup.
1970 *
1971 * The embedding helper is used because its behavior closely resembles
1972 * the original non-dynamic generic percpu area setup. This is
1973 * important because many archs have addressing restrictions and might
1974 * fail if the percpu area is located far away from the previous
1975 * location. As an added bonus, in non-NUMA cases, embedding is
1976 * generally a good idea TLB-wise because percpu area can piggy back
1977 * on the physical linear memory mapping which uses large page
1978 * mappings on applicable archs.
1979 */
1980#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
1981unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
1982EXPORT_SYMBOL(__per_cpu_offset);
1983
1984static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
1985 size_t align)
1986{
1987 return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
1988}
1273 1989
1274 return pcpu_setup_first_chunk(pcpue_get_page, static_size, 1990static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
1275 reserved_size, dyn_size, 1991{
1276 pcpue_unit_size, pcpue_ptr, NULL); 1992 free_bootmem(__pa(ptr), size);
1993}
1994
1995void __init setup_per_cpu_areas(void)
1996{
1997 unsigned long delta;
1998 unsigned int cpu;
1999 int rc;
2000
2001 /*
2002 * Always reserve area for module percpu variables. That's
2003 * what the legacy allocator did.
2004 */
2005 rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
2006 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
2007 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
2008 if (rc < 0)
2009 		panic("Failed to initialize percpu areas.");
2010
2011 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
2012 for_each_possible_cpu(cpu)
2013 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
1277} 2014}
2015#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
diff --git a/mm/quicklist.c b/mm/quicklist.c
index e66d07d1b4ff..6633965bb27b 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -19,7 +19,7 @@
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/quicklist.h> 20#include <linux/quicklist.h>
21 21
22DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; 22DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
23 23
24#define FRACTION_OF_NODE_MEM 16 24#define FRACTION_OF_NODE_MEM 16
25 25
@@ -29,7 +29,6 @@ static unsigned long max_pages(unsigned long min_pages)
29 int node = numa_node_id(); 29 int node = numa_node_id();
30 struct zone *zones = NODE_DATA(node)->node_zones; 30 struct zone *zones = NODE_DATA(node)->node_zones;
31 int num_cpus_on_node; 31 int num_cpus_on_node;
32 const struct cpumask *cpumask_on_node = cpumask_of_node(node);
33 32
34 node_free_pages = 33 node_free_pages =
35#ifdef CONFIG_ZONE_DMA 34#ifdef CONFIG_ZONE_DMA
@@ -42,7 +41,7 @@ static unsigned long max_pages(unsigned long min_pages)
42 41
43 max = node_free_pages / FRACTION_OF_NODE_MEM; 42 max = node_free_pages / FRACTION_OF_NODE_MEM;
44 43
45 num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); 44 num_cpus_on_node = cpumask_weight(cpumask_of_node(node));
46 max /= num_cpus_on_node; 45 max /= num_cpus_on_node;
47 46
48 return max(max, min_pages); 47 return max(max, min_pages);
diff --git a/mm/rmap.c b/mm/rmap.c
index 836c6c63e1f2..28aafe2b5306 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,6 +36,11 @@
36 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_lock in __sync_single_inode) 38 * within inode_lock in __sync_single_inode)
39 *
40 * (code doesn't rely on that order so it could be switched around)
41 * ->tasklist_lock
42 * anon_vma->lock (memory_failure, collect_procs_anon)
43 * pte map lock
39 */ 44 */
40 45
41#include <linux/mm.h> 46#include <linux/mm.h>
@@ -191,7 +196,7 @@ void __init anon_vma_init(void)
191 * Getting a lock on a stable anon_vma from a page off the LRU is 196 * Getting a lock on a stable anon_vma from a page off the LRU is
192 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 197 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
193 */ 198 */
194static struct anon_vma *page_lock_anon_vma(struct page *page) 199struct anon_vma *page_lock_anon_vma(struct page *page)
195{ 200{
196 struct anon_vma *anon_vma; 201 struct anon_vma *anon_vma;
197 unsigned long anon_mapping; 202 unsigned long anon_mapping;
@@ -211,7 +216,7 @@ out:
211 return NULL; 216 return NULL;
212} 217}
213 218
214static void page_unlock_anon_vma(struct anon_vma *anon_vma) 219void page_unlock_anon_vma(struct anon_vma *anon_vma)
215{ 220{
216 spin_unlock(&anon_vma->lock); 221 spin_unlock(&anon_vma->lock);
217 rcu_read_unlock(); 222 rcu_read_unlock();
@@ -311,7 +316,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
311 * if the page is not mapped into the page tables of this VMA. Only 316 * if the page is not mapped into the page tables of this VMA. Only
312 * valid for normal file or anonymous VMAs. 317 * valid for normal file or anonymous VMAs.
313 */ 318 */
314static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) 319int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
315{ 320{
316 unsigned long address; 321 unsigned long address;
317 pte_t *pte; 322 pte_t *pte;
@@ -358,6 +363,7 @@ static int page_referenced_one(struct page *page,
358 */ 363 */
359 if (vma->vm_flags & VM_LOCKED) { 364 if (vma->vm_flags & VM_LOCKED) {
360 *mapcount = 1; /* break early from loop */ 365 *mapcount = 1; /* break early from loop */
366 *vm_flags |= VM_LOCKED;
361 goto out_unmap; 367 goto out_unmap;
362 } 368 }
363 369
@@ -709,27 +715,6 @@ void page_add_file_rmap(struct page *page)
709 } 715 }
710} 716}
711 717
712#ifdef CONFIG_DEBUG_VM
713/**
714 * page_dup_rmap - duplicate pte mapping to a page
715 * @page: the page to add the mapping to
716 * @vma: the vm area being duplicated
717 * @address: the user virtual address mapped
718 *
719 * For copy_page_range only: minimal extract from page_add_file_rmap /
720 * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
721 * quicker.
722 *
723 * The caller needs to hold the pte lock.
724 */
725void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
726{
727 if (PageAnon(page))
728 __page_check_anon_rmap(page, vma, address);
729 atomic_inc(&page->_mapcount);
730}
731#endif
732
733/** 718/**
734 * page_remove_rmap - take down pte mapping from a page 719 * page_remove_rmap - take down pte mapping from a page
735 * @page: page to remove mapping from 720 * @page: page to remove mapping from
@@ -738,34 +723,37 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long
738 */ 723 */
739void page_remove_rmap(struct page *page) 724void page_remove_rmap(struct page *page)
740{ 725{
741 if (atomic_add_negative(-1, &page->_mapcount)) { 726 /* page still mapped by someone else? */
742 /* 727 if (!atomic_add_negative(-1, &page->_mapcount))
743 * Now that the last pte has gone, s390 must transfer dirty 728 return;
744 * flag from storage key to struct page. We can usually skip 729
745 * this if the page is anon, so about to be freed; but perhaps 730 /*
746 * not if it's in swapcache - there might be another pte slot 731 * Now that the last pte has gone, s390 must transfer dirty
747 * containing the swap entry, but page not yet written to swap. 732 * flag from storage key to struct page. We can usually skip
748 */ 733 * this if the page is anon, so about to be freed; but perhaps
749 if ((!PageAnon(page) || PageSwapCache(page)) && 734 * not if it's in swapcache - there might be another pte slot
750 page_test_dirty(page)) { 735 * containing the swap entry, but page not yet written to swap.
751 page_clear_dirty(page); 736 */
752 set_page_dirty(page); 737 if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
753 } 738 page_clear_dirty(page);
754 if (PageAnon(page)) 739 set_page_dirty(page);
755 mem_cgroup_uncharge_page(page); 740 }
756 __dec_zone_page_state(page, 741 if (PageAnon(page)) {
757 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); 742 mem_cgroup_uncharge_page(page);
758 mem_cgroup_update_mapped_file_stat(page, -1); 743 __dec_zone_page_state(page, NR_ANON_PAGES);
759 /* 744 } else {
760 * It would be tidy to reset the PageAnon mapping here, 745 __dec_zone_page_state(page, NR_FILE_MAPPED);
761 * but that might overwrite a racing page_add_anon_rmap
762 * which increments mapcount after us but sets mapping
763 * before us: so leave the reset to free_hot_cold_page,
764 * and remember that it's only reliable while mapped.
765 * Leaving it set also helps swapoff to reinstate ptes
766 * faster for those pages still in swapcache.
767 */
768 } 746 }
747 mem_cgroup_update_mapped_file_stat(page, -1);
748 /*
749 * It would be tidy to reset the PageAnon mapping here,
750 * but that might overwrite a racing page_add_anon_rmap
751 * which increments mapcount after us but sets mapping
752 * before us: so leave the reset to free_hot_cold_page,
753 * and remember that it's only reliable while mapped.
754 * Leaving it set also helps swapoff to reinstate ptes
755 * faster for those pages still in swapcache.
756 */
769} 757}
770 758
771/* 759/*
@@ -773,7 +761,7 @@ void page_remove_rmap(struct page *page)
773 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 761 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
774 */ 762 */
775static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 763static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
776 int migration) 764 enum ttu_flags flags)
777{ 765{
778 struct mm_struct *mm = vma->vm_mm; 766 struct mm_struct *mm = vma->vm_mm;
779 unsigned long address; 767 unsigned long address;
@@ -795,11 +783,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
795 * If it's recently referenced (perhaps page_referenced 783 * If it's recently referenced (perhaps page_referenced
796 * skipped over this mm) then we should reactivate it. 784 * skipped over this mm) then we should reactivate it.
797 */ 785 */
798 if (!migration) { 786 if (!(flags & TTU_IGNORE_MLOCK)) {
799 if (vma->vm_flags & VM_LOCKED) { 787 if (vma->vm_flags & VM_LOCKED) {
800 ret = SWAP_MLOCK; 788 ret = SWAP_MLOCK;
801 goto out_unmap; 789 goto out_unmap;
802 } 790 }
791 }
792 if (!(flags & TTU_IGNORE_ACCESS)) {
803 if (ptep_clear_flush_young_notify(vma, address, pte)) { 793 if (ptep_clear_flush_young_notify(vma, address, pte)) {
804 ret = SWAP_FAIL; 794 ret = SWAP_FAIL;
805 goto out_unmap; 795 goto out_unmap;
@@ -817,7 +807,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
817 /* Update high watermark before we lower rss */ 807 /* Update high watermark before we lower rss */
818 update_hiwater_rss(mm); 808 update_hiwater_rss(mm);
819 809
820 if (PageAnon(page)) { 810 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
811 if (PageAnon(page))
812 dec_mm_counter(mm, anon_rss);
813 else
814 dec_mm_counter(mm, file_rss);
815 set_pte_at(mm, address, pte,
816 swp_entry_to_pte(make_hwpoison_entry(page)));
817 } else if (PageAnon(page)) {
821 swp_entry_t entry = { .val = page_private(page) }; 818 swp_entry_t entry = { .val = page_private(page) };
822 819
823 if (PageSwapCache(page)) { 820 if (PageSwapCache(page)) {
@@ -839,12 +836,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
839 * pte. do_swap_page() will wait until the migration 836 * pte. do_swap_page() will wait until the migration
840 * pte is removed and then restart fault handling. 837 * pte is removed and then restart fault handling.
841 */ 838 */
842 BUG_ON(!migration); 839 BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
843 entry = make_migration_entry(page, pte_write(pteval)); 840 entry = make_migration_entry(page, pte_write(pteval));
844 } 841 }
845 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 842 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
846 BUG_ON(pte_file(*pte)); 843 BUG_ON(pte_file(*pte));
847 } else if (PAGE_MIGRATION && migration) { 844 } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
848 /* Establish migration entry for a file page */ 845 /* Establish migration entry for a file page */
849 swp_entry_t entry; 846 swp_entry_t entry;
850 entry = make_migration_entry(page, pte_write(pteval)); 847 entry = make_migration_entry(page, pte_write(pteval));
@@ -1013,12 +1010,13 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
1013 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1010 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1014 * 'LOCKED. 1011 * 'LOCKED.
1015 */ 1012 */
1016static int try_to_unmap_anon(struct page *page, int unlock, int migration) 1013static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1017{ 1014{
1018 struct anon_vma *anon_vma; 1015 struct anon_vma *anon_vma;
1019 struct vm_area_struct *vma; 1016 struct vm_area_struct *vma;
1020 unsigned int mlocked = 0; 1017 unsigned int mlocked = 0;
1021 int ret = SWAP_AGAIN; 1018 int ret = SWAP_AGAIN;
1019 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1022 1020
1023 if (MLOCK_PAGES && unlikely(unlock)) 1021 if (MLOCK_PAGES && unlikely(unlock))
1024 ret = SWAP_SUCCESS; /* default for try_to_munlock() */ 1022 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
@@ -1034,7 +1032,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1034 continue; /* must visit all unlocked vmas */ 1032 continue; /* must visit all unlocked vmas */
1035 ret = SWAP_MLOCK; /* saw at least one mlocked vma */ 1033 ret = SWAP_MLOCK; /* saw at least one mlocked vma */
1036 } else { 1034 } else {
1037 ret = try_to_unmap_one(page, vma, migration); 1035 ret = try_to_unmap_one(page, vma, flags);
1038 if (ret == SWAP_FAIL || !page_mapped(page)) 1036 if (ret == SWAP_FAIL || !page_mapped(page))
1039 break; 1037 break;
1040 } 1038 }
@@ -1058,8 +1056,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1058/** 1056/**
1059 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method 1057 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
1060 * @page: the page to unmap/unlock 1058 * @page: the page to unmap/unlock
1061 * @unlock: request for unlock rather than unmap [unlikely] 1059 * @flags: action and flags
1062 * @migration: unmapping for migration - ignored if @unlock
1063 * 1060 *
1064 * Find all the mappings of a page using the mapping pointer and the vma chains 1061 * Find all the mappings of a page using the mapping pointer and the vma chains
1065 * contained in the address_space struct it points to. 1062 * contained in the address_space struct it points to.
@@ -1071,7 +1068,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1071 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1068 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1072 * 'LOCKED. 1069 * 'LOCKED.
1073 */ 1070 */
1074static int try_to_unmap_file(struct page *page, int unlock, int migration) 1071static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1075{ 1072{
1076 struct address_space *mapping = page->mapping; 1073 struct address_space *mapping = page->mapping;
1077 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1074 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -1083,6 +1080,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1083 unsigned long max_nl_size = 0; 1080 unsigned long max_nl_size = 0;
1084 unsigned int mapcount; 1081 unsigned int mapcount;
1085 unsigned int mlocked = 0; 1082 unsigned int mlocked = 0;
1083 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1086 1084
1087 if (MLOCK_PAGES && unlikely(unlock)) 1085 if (MLOCK_PAGES && unlikely(unlock))
1088 ret = SWAP_SUCCESS; /* default for try_to_munlock() */ 1086 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
@@ -1095,7 +1093,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1095 continue; /* must visit all vmas */ 1093 continue; /* must visit all vmas */
1096 ret = SWAP_MLOCK; 1094 ret = SWAP_MLOCK;
1097 } else { 1095 } else {
1098 ret = try_to_unmap_one(page, vma, migration); 1096 ret = try_to_unmap_one(page, vma, flags);
1099 if (ret == SWAP_FAIL || !page_mapped(page)) 1097 if (ret == SWAP_FAIL || !page_mapped(page))
1100 goto out; 1098 goto out;
1101 } 1099 }
@@ -1120,7 +1118,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1120 ret = SWAP_MLOCK; /* leave mlocked == 0 */ 1118 ret = SWAP_MLOCK; /* leave mlocked == 0 */
1121 goto out; /* no need to look further */ 1119 goto out; /* no need to look further */
1122 } 1120 }
1123 if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) 1121 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1122 (vma->vm_flags & VM_LOCKED))
1124 continue; 1123 continue;
1125 cursor = (unsigned long) vma->vm_private_data; 1124 cursor = (unsigned long) vma->vm_private_data;
1126 if (cursor > max_nl_cursor) 1125 if (cursor > max_nl_cursor)
@@ -1154,7 +1153,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1154 do { 1153 do {
1155 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1154 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1156 shared.vm_set.list) { 1155 shared.vm_set.list) {
1157 if (!MLOCK_PAGES && !migration && 1156 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1158 (vma->vm_flags & VM_LOCKED)) 1157 (vma->vm_flags & VM_LOCKED))
1159 continue; 1158 continue;
1160 cursor = (unsigned long) vma->vm_private_data; 1159 cursor = (unsigned long) vma->vm_private_data;
@@ -1194,7 +1193,7 @@ out:
1194/** 1193/**
1195 * try_to_unmap - try to remove all page table mappings to a page 1194 * try_to_unmap - try to remove all page table mappings to a page
1196 * @page: the page to get unmapped 1195 * @page: the page to get unmapped
1197 * @migration: migration flag 1196 * @flags: action and flags
1198 * 1197 *
1199 * Tries to remove all the page table entries which are mapping this 1198 * Tries to remove all the page table entries which are mapping this
1200 * page, used in the pageout path. Caller must hold the page lock. 1199 * page, used in the pageout path. Caller must hold the page lock.
@@ -1205,16 +1204,16 @@ out:
1205 * SWAP_FAIL - the page is unswappable 1204 * SWAP_FAIL - the page is unswappable
1206 * SWAP_MLOCK - page is mlocked. 1205 * SWAP_MLOCK - page is mlocked.
1207 */ 1206 */
1208int try_to_unmap(struct page *page, int migration) 1207int try_to_unmap(struct page *page, enum ttu_flags flags)
1209{ 1208{
1210 int ret; 1209 int ret;
1211 1210
1212 BUG_ON(!PageLocked(page)); 1211 BUG_ON(!PageLocked(page));
1213 1212
1214 if (PageAnon(page)) 1213 if (PageAnon(page))
1215 ret = try_to_unmap_anon(page, 0, migration); 1214 ret = try_to_unmap_anon(page, flags);
1216 else 1215 else
1217 ret = try_to_unmap_file(page, 0, migration); 1216 ret = try_to_unmap_file(page, flags);
1218 if (ret != SWAP_MLOCK && !page_mapped(page)) 1217 if (ret != SWAP_MLOCK && !page_mapped(page))
1219 ret = SWAP_SUCCESS; 1218 ret = SWAP_SUCCESS;
1220 return ret; 1219 return ret;
@@ -1239,8 +1238,8 @@ int try_to_munlock(struct page *page)
1239 VM_BUG_ON(!PageLocked(page) || PageLRU(page)); 1238 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1240 1239
1241 if (PageAnon(page)) 1240 if (PageAnon(page))
1242 return try_to_unmap_anon(page, 1, 0); 1241 return try_to_unmap_anon(page, TTU_MUNLOCK);
1243 else 1242 else
1244 return try_to_unmap_file(page, 1, 0); 1243 return try_to_unmap_file(page, TTU_MUNLOCK);
1245} 1244}
1246 1245
diff --git a/mm/shmem.c b/mm/shmem.c
index d713239ce2ce..ccf446a9faa1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -49,7 +49,6 @@ static struct vfsmount *shm_mnt;
49#include <linux/backing-dev.h> 49#include <linux/backing-dev.h>
50#include <linux/shmem_fs.h> 50#include <linux/shmem_fs.h>
51#include <linux/writeback.h> 51#include <linux/writeback.h>
52#include <linux/vfs.h>
53#include <linux/blkdev.h> 52#include <linux/blkdev.h>
54#include <linux/security.h> 53#include <linux/security.h>
55#include <linux/swapops.h> 54#include <linux/swapops.h>
@@ -1047,8 +1046,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1047 * sync from ever calling shmem_writepage; but a stacking filesystem 1046 * sync from ever calling shmem_writepage; but a stacking filesystem
1048 * may use the ->writepage of its underlying filesystem, in which case 1047 * may use the ->writepage of its underlying filesystem, in which case
1049 * tmpfs should write out to swap only in response to memory pressure, 1048 * tmpfs should write out to swap only in response to memory pressure,
1050 * and not for pdflush or sync. However, in those cases, we do still 1049 * and not for the writeback threads or sync. However, in those cases,
1051 * want to check if there's a redundant swappage to be discarded. 1050 * we do still want to check if there's a redundant swappage to be
1051 * discarded.
1052 */ 1052 */
1053 if (wbc->for_reclaim) 1053 if (wbc->for_reclaim)
1054 swap = get_swap_page(); 1054 swap = get_swap_page();
@@ -1097,6 +1097,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1097 shmem_swp_unmap(entry); 1097 shmem_swp_unmap(entry);
1098unlock: 1098unlock:
1099 spin_unlock(&info->lock); 1099 spin_unlock(&info->lock);
1100 /*
1101 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
1102 * clear SWAP_HAS_CACHE flag.
1103 */
1100 swapcache_free(swap, NULL); 1104 swapcache_free(swap, NULL);
1101redirty: 1105redirty:
1102 set_page_dirty(page); 1106 set_page_dirty(page);
@@ -1630,8 +1634,8 @@ shmem_write_end(struct file *file, struct address_space *mapping,
1630 if (pos + copied > inode->i_size) 1634 if (pos + copied > inode->i_size)
1631 i_size_write(inode, pos + copied); 1635 i_size_write(inode, pos + copied);
1632 1636
1633 unlock_page(page);
1634 set_page_dirty(page); 1637 set_page_dirty(page);
1638 unlock_page(page);
1635 page_cache_release(page); 1639 page_cache_release(page);
1636 1640
1637 return copied; 1641 return copied;
@@ -1968,13 +1972,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1968 iput(inode); 1972 iput(inode);
1969 return error; 1973 return error;
1970 } 1974 }
1971 unlock_page(page);
1972 inode->i_mapping->a_ops = &shmem_aops; 1975 inode->i_mapping->a_ops = &shmem_aops;
1973 inode->i_op = &shmem_symlink_inode_operations; 1976 inode->i_op = &shmem_symlink_inode_operations;
1974 kaddr = kmap_atomic(page, KM_USER0); 1977 kaddr = kmap_atomic(page, KM_USER0);
1975 memcpy(kaddr, symname, len); 1978 memcpy(kaddr, symname, len);
1976 kunmap_atomic(kaddr, KM_USER0); 1979 kunmap_atomic(kaddr, KM_USER0);
1977 set_page_dirty(page); 1980 set_page_dirty(page);
1981 unlock_page(page);
1978 page_cache_release(page); 1982 page_cache_release(page);
1979 } 1983 }
1980 if (dir->i_mode & S_ISGID) 1984 if (dir->i_mode & S_ISGID)
@@ -2298,8 +2302,7 @@ static void shmem_put_super(struct super_block *sb)
2298 sb->s_fs_info = NULL; 2302 sb->s_fs_info = NULL;
2299} 2303}
2300 2304
2301static int shmem_fill_super(struct super_block *sb, 2305int shmem_fill_super(struct super_block *sb, void *data, int silent)
2302 void *data, int silent)
2303{ 2306{
2304 struct inode *inode; 2307 struct inode *inode;
2305 struct dentry *root; 2308 struct dentry *root;
@@ -2307,17 +2310,14 @@ static int shmem_fill_super(struct super_block *sb,
2307 int err = -ENOMEM; 2310 int err = -ENOMEM;
2308 2311
2309 /* Round up to L1_CACHE_BYTES to resist false sharing */ 2312 /* Round up to L1_CACHE_BYTES to resist false sharing */
2310 sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info), 2313 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
2311 L1_CACHE_BYTES), GFP_KERNEL); 2314 L1_CACHE_BYTES), GFP_KERNEL);
2312 if (!sbinfo) 2315 if (!sbinfo)
2313 return -ENOMEM; 2316 return -ENOMEM;
2314 2317
2315 sbinfo->max_blocks = 0;
2316 sbinfo->max_inodes = 0;
2317 sbinfo->mode = S_IRWXUGO | S_ISVTX; 2318 sbinfo->mode = S_IRWXUGO | S_ISVTX;
2318 sbinfo->uid = current_fsuid(); 2319 sbinfo->uid = current_fsuid();
2319 sbinfo->gid = current_fsgid(); 2320 sbinfo->gid = current_fsgid();
2320 sbinfo->mpol = NULL;
2321 sb->s_fs_info = sbinfo; 2321 sb->s_fs_info = sbinfo;
2322 2322
2323#ifdef CONFIG_TMPFS 2323#ifdef CONFIG_TMPFS
@@ -2421,6 +2421,7 @@ static const struct address_space_operations shmem_aops = {
2421 .write_end = shmem_write_end, 2421 .write_end = shmem_write_end,
2422#endif 2422#endif
2423 .migratepage = migrate_page, 2423 .migratepage = migrate_page,
2424 .error_remove_page = generic_error_remove_page,
2424}; 2425};
2425 2426
2426static const struct file_operations shmem_file_operations = { 2427static const struct file_operations shmem_file_operations = {
@@ -2446,7 +2447,7 @@ static const struct inode_operations shmem_inode_operations = {
2446 .getxattr = generic_getxattr, 2447 .getxattr = generic_getxattr,
2447 .listxattr = generic_listxattr, 2448 .listxattr = generic_listxattr,
2448 .removexattr = generic_removexattr, 2449 .removexattr = generic_removexattr,
2449 .permission = shmem_permission, 2450 .check_acl = shmem_check_acl,
2450#endif 2451#endif
2451 2452
2452}; 2453};
@@ -2469,7 +2470,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2469 .getxattr = generic_getxattr, 2470 .getxattr = generic_getxattr,
2470 .listxattr = generic_listxattr, 2471 .listxattr = generic_listxattr,
2471 .removexattr = generic_removexattr, 2472 .removexattr = generic_removexattr,
2472 .permission = shmem_permission, 2473 .check_acl = shmem_check_acl,
2473#endif 2474#endif
2474}; 2475};
2475 2476
@@ -2480,7 +2481,7 @@ static const struct inode_operations shmem_special_inode_operations = {
2480 .getxattr = generic_getxattr, 2481 .getxattr = generic_getxattr,
2481 .listxattr = generic_listxattr, 2482 .listxattr = generic_listxattr,
2482 .removexattr = generic_removexattr, 2483 .removexattr = generic_removexattr,
2483 .permission = shmem_permission, 2484 .check_acl = shmem_check_acl,
2484#endif 2485#endif
2485}; 2486};
2486 2487
@@ -2519,7 +2520,7 @@ static struct file_system_type tmpfs_fs_type = {
2519 .kill_sb = kill_litter_super, 2520 .kill_sb = kill_litter_super,
2520}; 2521};
2521 2522
2522static int __init init_tmpfs(void) 2523int __init init_tmpfs(void)
2523{ 2524{
2524 int error; 2525 int error;
2525 2526
@@ -2576,7 +2577,7 @@ static struct file_system_type tmpfs_fs_type = {
2576 .kill_sb = kill_litter_super, 2577 .kill_sb = kill_litter_super,
2577}; 2578};
2578 2579
2579static int __init init_tmpfs(void) 2580int __init init_tmpfs(void)
2580{ 2581{
2581 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); 2582 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
2582 2583
@@ -2591,6 +2592,11 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
2591 return 0; 2592 return 0;
2592} 2593}
2593 2594
2595int shmem_lock(struct file *file, int lock, struct user_struct *user)
2596{
2597 return 0;
2598}
2599
2594#define shmem_vm_ops generic_file_vm_ops 2600#define shmem_vm_ops generic_file_vm_ops
2595#define shmem_file_operations ramfs_file_operations 2601#define shmem_file_operations ramfs_file_operations
2596#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) 2602#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev)
@@ -2687,5 +2693,3 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2687 vma->vm_ops = &shmem_vm_ops; 2693 vma->vm_ops = &shmem_vm_ops;
2688 return 0; 2694 return 0;
2689} 2695}
2690
2691module_init(init_tmpfs)
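
The shmem_fill_super() hunk above swaps kmalloc() for kzalloc() so the explicit zeroing of max_blocks, max_inodes and mpol can be dropped. A userspace model of the same simplification, using a hypothetical sb_info struct and calloc() in place of kzalloc():

#include <stdlib.h>

struct sb_info {                        /* hypothetical, for illustration */
        unsigned long max_blocks;
        unsigned long max_inodes;
        void *mpol;
        unsigned int mode;
};

/* Before: plain allocation; every field that must start at zero is
 * assigned by hand and is easy to forget when the struct grows. */
static struct sb_info *fill_old(void)
{
        struct sb_info *s = malloc(sizeof(*s));
        if (!s)
                return NULL;
        s->max_blocks = 0;
        s->max_inodes = 0;
        s->mpol = NULL;
        s->mode = 0777;
        return s;
}

/* After: zeroed allocation (kzalloc in the kernel); only the genuinely
 * non-zero fields need explicit initialisation. */
static struct sb_info *fill_new(void)
{
        struct sb_info *s = calloc(1, sizeof(*s));
        if (!s)
                return NULL;
        s->mode = 0777;
        return s;
}

int main(void)
{
        free(fill_old());
        free(fill_new());
        return 0;
}
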
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index 606a8e757a42..df2c87fdae50 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -157,7 +157,7 @@ shmem_acl_init(struct inode *inode, struct inode *dir)
157/** 157/**
158 * shmem_check_acl - check_acl() callback for generic_permission() 158 * shmem_check_acl - check_acl() callback for generic_permission()
159 */ 159 */
160static int 160int
161shmem_check_acl(struct inode *inode, int mask) 161shmem_check_acl(struct inode *inode, int mask)
162{ 162{
163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); 163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
@@ -169,12 +169,3 @@ shmem_check_acl(struct inode *inode, int mask)
169 } 169 }
170 return -EAGAIN; 170 return -EAGAIN;
171} 171}
172
173/**
174 * shmem_permission - permission() inode operation
175 */
176int
177shmem_permission(struct inode *inode, int mask)
178{
179 return generic_permission(inode, mask, shmem_check_acl);
180}
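
With shmem_permission() removed, shmem no longer wraps generic_permission() itself; the inode_operations above simply expose shmem_check_acl through the new ->check_acl hook so the common permission code can call it. The sketch below models that dispatch in userspace; generic_permission_model() and fake_check_acl() are simplified stand-ins for the real VFS helpers.

#include <stdio.h>
#include <errno.h>

struct inode;                                   /* opaque in this model */
typedef int (*check_acl_fn)(struct inode *inode, int mask);

/* Simplified stand-in for generic_permission(): try the mode bits first,
 * then defer to the per-filesystem ACL callback (e.g. shmem_check_acl). */
static int generic_permission_model(struct inode *inode, int mask,
                                    check_acl_fn check_acl)
{
        int mode_bits_grant = 0;                /* pretend mode bits said no */

        if (mode_bits_grant)
                return 0;
        if (check_acl)
                return check_acl(inode, mask);
        return -EACCES;
}

static int fake_check_acl(struct inode *inode, int mask)
{
        (void)inode;
        (void)mask;
        return 0;                               /* ACL grants access */
}

int main(void)
{
        printf("result: %d\n", generic_permission_model(NULL, 4, fake_check_acl));
        return 0;
}
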
diff --git a/mm/slab.c b/mm/slab.c
index 7b5d4deacfcd..7dfa481c96ba 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1384,7 +1384,7 @@ void __init kmem_cache_init(void)
1384 * Fragmentation resistance on low memory - only use bigger 1384 * Fragmentation resistance on low memory - only use bigger
1385 * page orders on machines with more than 32MB of memory. 1385 * page orders on machines with more than 32MB of memory.
1386 */ 1386 */
1387 if (num_physpages > (32 << 20) >> PAGE_SHIFT) 1387 if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
1388 slab_break_gfp_order = BREAK_GFP_ORDER_HI; 1388 slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1389 1389
1390 /* Bootstrap is tricky, because several objects are allocated 1390 /* Bootstrap is tricky, because several objects are allocated
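
The slab.c hunk only swaps the obsolete num_physpages for totalram_pages; the threshold itself is unchanged. For reference, with a 4 KiB page size (PAGE_SHIFT of 12 is an assumption here, it is architecture dependent) the comparison works out as follows:

#include <stdio.h>

int main(void)
{
        unsigned int page_shift = 12;                   /* 4 KiB pages assumed */
        unsigned long threshold = (32UL << 20) >> page_shift;

        /* 32 MB / 4 KiB = 8192 pages: only machines with more RAM than
         * this get the higher slab_break_gfp_order. */
        printf("higher page orders allowed above %lu pages\n", threshold);
        return 0;
}
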
diff --git a/mm/slob.c b/mm/slob.c
index 9641da3d5e58..837ebd64cc34 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -692,3 +692,8 @@ void __init kmem_cache_init(void)
692{ 692{
693 slob_ready = 1; 693 slob_ready = 1;
694} 694}
695
696void __init kmem_cache_init_late(void)
697{
698 /* Nothing to do */
699}
diff --git a/mm/slub.c b/mm/slub.c
index b9f1491a58a1..4996fc719552 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -141,6 +141,13 @@
141 SLAB_POISON | SLAB_STORE_USER) 141 SLAB_POISON | SLAB_STORE_USER)
142 142
143/* 143/*
144 * Debugging flags that require metadata to be stored in the slab. These get
145 * disabled when slub_debug=O is used and a cache's min order increases with
146 * metadata.
147 */
148#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
149
150/*
144 * Set of flags that will prevent slab merging 151 * Set of flags that will prevent slab merging
145 */ 152 */
146#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
@@ -325,6 +332,7 @@ static int slub_debug;
325#endif 332#endif
326 333
327static char *slub_debug_slabs; 334static char *slub_debug_slabs;
335static int disable_higher_order_debug;
328 336
329/* 337/*
330 * Object debugging 338 * Object debugging
@@ -646,7 +654,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
646 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); 654 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
647 print_section("Padding", end - remainder, remainder); 655 print_section("Padding", end - remainder, remainder);
648 656
649 restore_bytes(s, "slab padding", POISON_INUSE, start, end); 657 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
650 return 0; 658 return 0;
651} 659}
652 660
@@ -976,6 +984,15 @@ static int __init setup_slub_debug(char *str)
976 */ 984 */
977 goto check_slabs; 985 goto check_slabs;
978 986
987 if (tolower(*str) == 'o') {
988 /*
989 * Avoid enabling debugging on caches if its minimum order
990 * would increase as a result.
991 */
992 disable_higher_order_debug = 1;
993 goto out;
994 }
995
979 slub_debug = 0; 996 slub_debug = 0;
980 if (*str == '-') 997 if (*str == '-')
981 /* 998 /*
@@ -1026,8 +1043,8 @@ static unsigned long kmem_cache_flags(unsigned long objsize,
1026 * Enable debugging if selected on the kernel commandline. 1043 * Enable debugging if selected on the kernel commandline.
1027 */ 1044 */
1028 if (slub_debug && (!slub_debug_slabs || 1045 if (slub_debug && (!slub_debug_slabs ||
1029 strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0)) 1046 !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
1030 flags |= slub_debug; 1047 flags |= slub_debug;
1031 1048
1032 return flags; 1049 return flags;
1033} 1050}
@@ -1054,6 +1071,8 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize,
1054} 1071}
1055#define slub_debug 0 1072#define slub_debug 0
1056 1073
1074#define disable_higher_order_debug 0
1075
1057static inline unsigned long slabs_node(struct kmem_cache *s, int node) 1076static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1058 { return 0; } 1077 { return 0; }
1059static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) 1078static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
@@ -1109,8 +1128,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1109 } 1128 }
1110 1129
1111 if (kmemcheck_enabled 1130 if (kmemcheck_enabled
1112 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) 1131 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1113 {
1114 int pages = 1 << oo_order(oo); 1132 int pages = 1 << oo_order(oo);
1115 1133
1116 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); 1134 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
@@ -1560,6 +1578,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
1560 "default order: %d, min order: %d\n", s->name, s->objsize, 1578 "default order: %d, min order: %d\n", s->name, s->objsize,
1561 s->size, oo_order(s->oo), oo_order(s->min)); 1579 s->size, oo_order(s->oo), oo_order(s->min));
1562 1580
1581 if (oo_order(s->min) > get_order(s->objsize))
1582 printk(KERN_WARNING " %s debugging increased min order, use "
1583 "slub_debug=O to disable.\n", s->name);
1584
1563 for_each_online_node(node) { 1585 for_each_online_node(node) {
1564 struct kmem_cache_node *n = get_node(s, node); 1586 struct kmem_cache_node *n = get_node(s, node);
1565 unsigned long nr_slabs; 1587 unsigned long nr_slabs;
@@ -2001,7 +2023,7 @@ static inline int calculate_order(int size)
2001 return order; 2023 return order;
2002 fraction /= 2; 2024 fraction /= 2;
2003 } 2025 }
2004 min_objects --; 2026 min_objects--;
2005 } 2027 }
2006 2028
2007 /* 2029 /*
@@ -2091,8 +2113,8 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2091 */ 2113 */
2092#define NR_KMEM_CACHE_CPU 100 2114#define NR_KMEM_CACHE_CPU 100
2093 2115
2094static DEFINE_PER_CPU(struct kmem_cache_cpu, 2116static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
2095 kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; 2117 kmem_cache_cpu);
2096 2118
2097static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); 2119static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
2098static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); 2120static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
@@ -2400,6 +2422,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2400 * on bootup. 2422 * on bootup.
2401 */ 2423 */
2402 align = calculate_alignment(flags, align, s->objsize); 2424 align = calculate_alignment(flags, align, s->objsize);
2425 s->align = align;
2403 2426
2404 /* 2427 /*
2405 * SLUB stores one object immediately after another beginning from 2428 * SLUB stores one object immediately after another beginning from
@@ -2452,6 +2475,18 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2452 2475
2453 if (!calculate_sizes(s, -1)) 2476 if (!calculate_sizes(s, -1))
2454 goto error; 2477 goto error;
2478 if (disable_higher_order_debug) {
2479 /*
2480 * Disable debugging flags that store metadata if the min slab
2481 * order increased.
2482 */
2483 if (get_order(s->size) > get_order(s->objsize)) {
2484 s->flags &= ~DEBUG_METADATA_FLAGS;
2485 s->offset = 0;
2486 if (!calculate_sizes(s, -1))
2487 goto error;
2488 }
2489 }
2455 2490
2456 /* 2491 /*
2457 * The larger the object size is, the more pages we want on the partial 2492 * The larger the object size is, the more pages we want on the partial
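
The kmem_cache_open() hunk is the consumer of the new slub_debug=O switch: once the debug metadata has been sized in, the cache compares the page order needed for s->size against the order the bare object would need, and strips the metadata flags again if debugging pushed the order up. A small userspace model of that order comparison; get_order() is reimplemented locally and the example sizes are made up.

#include <stdio.h>

#define PAGE_SIZE_MODEL 4096UL                  /* 4 KiB pages assumed */

/* Minimal model of get_order(): smallest order with PAGE_SIZE << order >= size. */
static int get_order_model(unsigned long size)
{
        int order = 0;

        while ((PAGE_SIZE_MODEL << order) < size)
                order++;
        return order;
}

int main(void)
{
        unsigned long objsize = 3000;           /* payload alone fits order 0    */
        unsigned long size = 4400;              /* payload + red zone + tracking */

        if (get_order_model(size) > get_order_model(objsize))
                printf("debug metadata raised the min order: "
                       "strip it when slub_debug=O was given\n");
        return 0;
}
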
@@ -2594,8 +2629,6 @@ static inline int kmem_cache_close(struct kmem_cache *s)
2594 */ 2629 */
2595void kmem_cache_destroy(struct kmem_cache *s) 2630void kmem_cache_destroy(struct kmem_cache *s)
2596{ 2631{
2597 if (s->flags & SLAB_DESTROY_BY_RCU)
2598 rcu_barrier();
2599 down_write(&slub_lock); 2632 down_write(&slub_lock);
2600 s->refcount--; 2633 s->refcount--;
2601 if (!s->refcount) { 2634 if (!s->refcount) {
@@ -2606,6 +2639,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
2606 "still has objects.\n", s->name, __func__); 2639 "still has objects.\n", s->name, __func__);
2607 dump_stack(); 2640 dump_stack();
2608 } 2641 }
2642 if (s->flags & SLAB_DESTROY_BY_RCU)
2643 rcu_barrier();
2609 sysfs_slab_remove(s); 2644 sysfs_slab_remove(s);
2610 } else 2645 } else
2611 up_write(&slub_lock); 2646 up_write(&slub_lock);
@@ -2790,6 +2825,11 @@ static s8 size_index[24] = {
2790 2 /* 192 */ 2825 2 /* 192 */
2791}; 2826};
2792 2827
2828static inline int size_index_elem(size_t bytes)
2829{
2830 return (bytes - 1) / 8;
2831}
2832
2793static struct kmem_cache *get_slab(size_t size, gfp_t flags) 2833static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2794{ 2834{
2795 int index; 2835 int index;
@@ -2798,7 +2838,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2798 if (!size) 2838 if (!size)
2799 return ZERO_SIZE_PTR; 2839 return ZERO_SIZE_PTR;
2800 2840
2801 index = size_index[(size - 1) / 8]; 2841 index = size_index[size_index_elem(size)];
2802 } else 2842 } else
2803 index = fls(size - 1); 2843 index = fls(size - 1);
2804 2844
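
size_index_elem() simply centralises the (bytes - 1) / 8 indexing into the 24-entry size_index[] table so the later hunks can bounds-check it. A few worked values, as a compilable sketch:

#include <stdio.h>

static int size_index_elem(unsigned long bytes)
{
        return (bytes - 1) / 8;
}

int main(void)
{
        /* Every 8-byte bucket up to 192 bytes maps to one table slot:
         * 1..8 -> 0, 9..16 -> 1, ..., 185..192 -> 23. */
        printf("%d %d %d %d\n",
               size_index_elem(8),              /* 0  */
               size_index_elem(9),              /* 1  */
               size_index_elem(96),             /* 11 */
               size_index_elem(192));           /* 23 */
        return 0;
}
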
@@ -3156,10 +3196,12 @@ void __init kmem_cache_init(void)
3156 slab_state = PARTIAL; 3196 slab_state = PARTIAL;
3157 3197
3158 /* Caches that are not of the two-to-the-power-of size */ 3198 /* Caches that are not of the two-to-the-power-of size */
3159 if (KMALLOC_MIN_SIZE <= 64) { 3199 if (KMALLOC_MIN_SIZE <= 32) {
3160 create_kmalloc_cache(&kmalloc_caches[1], 3200 create_kmalloc_cache(&kmalloc_caches[1],
3161 "kmalloc-96", 96, GFP_NOWAIT); 3201 "kmalloc-96", 96, GFP_NOWAIT);
3162 caches++; 3202 caches++;
3203 }
3204 if (KMALLOC_MIN_SIZE <= 64) {
3163 create_kmalloc_cache(&kmalloc_caches[2], 3205 create_kmalloc_cache(&kmalloc_caches[2],
3164 "kmalloc-192", 192, GFP_NOWAIT); 3206 "kmalloc-192", 192, GFP_NOWAIT);
3165 caches++; 3207 caches++;
@@ -3186,17 +3228,28 @@ void __init kmem_cache_init(void)
3186 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 3228 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
3187 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 3229 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
3188 3230
3189 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) 3231 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
3190 size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; 3232 int elem = size_index_elem(i);
3233 if (elem >= ARRAY_SIZE(size_index))
3234 break;
3235 size_index[elem] = KMALLOC_SHIFT_LOW;
3236 }
3191 3237
3192 if (KMALLOC_MIN_SIZE == 128) { 3238 if (KMALLOC_MIN_SIZE == 64) {
3239 /*
3240 * The 96 byte size cache is not used if the alignment
3241 * is 64 byte.
3242 */
3243 for (i = 64 + 8; i <= 96; i += 8)
3244 size_index[size_index_elem(i)] = 7;
3245 } else if (KMALLOC_MIN_SIZE == 128) {
3193 /* 3246 /*
3194 * The 192 byte sized cache is not used if the alignment 3247 * The 192 byte sized cache is not used if the alignment
3195 * is 128 byte. Redirect kmalloc to use the 256 byte cache 3248 * is 128 byte. Redirect kmalloc to use the 256 byte cache
3196 * instead. 3249 * instead.
3197 */ 3250 */
3198 for (i = 128 + 8; i <= 192; i += 8) 3251 for (i = 128 + 8; i <= 192; i += 8)
3199 size_index[(i - 1) / 8] = 8; 3252 size_index[size_index_elem(i)] = 8;
3200 } 3253 }
3201 3254
3202 slab_state = UP; 3255 slab_state = UP;
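
The kmem_cache_init() changes above make the handling of the odd-sized caches symmetric: when the minimum kmalloc alignment is 64 bytes the 96-byte cache is not used, so requests between 72 and 96 bytes are redirected to the 128-byte cache, mirroring the existing redirect of 136..192 byte requests to the 256-byte cache when the minimum is 128. A userspace sketch of the table patch-up; the index assignments (1 for the 96-byte cache, 7 for the 128-byte cache) are assumed to mirror the kernel's kmalloc_caches layout.

#include <stdio.h>

static signed char size_index[24];              /* slot = (bytes - 1) / 8 */

static int size_index_elem(unsigned long bytes)
{
        return (bytes - 1) / 8;
}

int main(void)
{
        unsigned long kmalloc_min_size = 64;    /* assumed arch minimum alignment */
        unsigned long i;

        /* Default table: 72..96 byte requests go to the 96-byte cache. */
        for (i = 64 + 8; i <= 96; i += 8)
                size_index[size_index_elem(i)] = 1;

        if (kmalloc_min_size == 64) {
                /* The 96-byte cache is not used with 64-byte alignment;
                 * redirect those sizes to the 128-byte (2^7) cache. */
                for (i = 64 + 8; i <= 96; i += 8)
                        size_index[size_index_elem(i)] = 7;
        }

        printf("96-byte requests -> kmalloc index %d\n",
               size_index[size_index_elem(96)]);
        return 0;
}
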
@@ -3292,6 +3345,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3292{ 3345{
3293 struct kmem_cache *s; 3346 struct kmem_cache *s;
3294 3347
3348 if (WARN_ON(!name))
3349 return NULL;
3350
3295 down_write(&slub_lock); 3351 down_write(&slub_lock);
3296 s = find_mergeable(size, align, flags, name, ctor); 3352 s = find_mergeable(size, align, flags, name, ctor);
3297 if (s) { 3353 if (s) {
@@ -4543,8 +4599,11 @@ static int sysfs_slab_add(struct kmem_cache *s)
4543 } 4599 }
4544 4600
4545 err = sysfs_create_group(&s->kobj, &slab_attr_group); 4601 err = sysfs_create_group(&s->kobj, &slab_attr_group);
4546 if (err) 4602 if (err) {
4603 kobject_del(&s->kobj);
4604 kobject_put(&s->kobj);
4547 return err; 4605 return err;
4606 }
4548 kobject_uevent(&s->kobj, KOBJ_ADD); 4607 kobject_uevent(&s->kobj, KOBJ_ADD);
4549 if (!unmergeable) { 4608 if (!unmergeable) {
4550 /* Setup first alias */ 4609 /* Setup first alias */
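
The sysfs_slab_add() fix above follows the usual unwind rule: when a later registration step fails, every earlier step has to be reversed before the error is returned, otherwise a half-registered kobject leaks. A generic userspace model of that error-unwind shape; register_a()/register_b() are placeholders, not kernel APIs.

#include <stdio.h>

static int register_a(void)   { return 0; }     /* succeeds */
static void unregister_a(void){ printf("undo step A\n"); }
static int register_b(void)   { return -1; }    /* fails */

static int setup(void)
{
        int err;

        err = register_a();
        if (err)
                return err;

        err = register_b();
        if (err) {
                /* Unwind step A before propagating the error, just as
                 * sysfs_slab_add() now does kobject_del()/kobject_put(). */
                unregister_a();
                return err;
        }
        return 0;
}

int main(void)
{
        printf("setup: %d\n", setup());
        return 0;
}
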
@@ -4726,7 +4785,7 @@ static const struct file_operations proc_slabinfo_operations = {
4726 4785
4727static int __init slab_proc_init(void) 4786static int __init slab_proc_init(void)
4728{ 4787{
4729 proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); 4788 proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations);
4730 return 0; 4789 return 0;
4731} 4790}
4732module_init(slab_proc_init); 4791module_init(slab_proc_init);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a13ea6401ae7..d9714bdcb4a3 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -48,8 +48,14 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
48{ 48{
49 /* If the main allocator is up use that, fallback to bootmem. */ 49 /* If the main allocator is up use that, fallback to bootmem. */
50 if (slab_is_available()) { 50 if (slab_is_available()) {
51 struct page *page = alloc_pages_node(node, 51 struct page *page;
52
53 if (node_state(node, N_HIGH_MEMORY))
54 page = alloc_pages_node(node,
52 GFP_KERNEL | __GFP_ZERO, get_order(size)); 55 GFP_KERNEL | __GFP_ZERO, get_order(size));
56 else
57 page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
58 get_order(size));
53 if (page) 59 if (page)
54 return page_address(page); 60 return page_address(page);
55 return NULL; 61 return NULL;
diff --git a/mm/sparse.c b/mm/sparse.c
index da432d9f0ae8..6ce4aab69e99 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -62,9 +62,12 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
62 unsigned long array_size = SECTIONS_PER_ROOT * 62 unsigned long array_size = SECTIONS_PER_ROOT *
63 sizeof(struct mem_section); 63 sizeof(struct mem_section);
64 64
65 if (slab_is_available()) 65 if (slab_is_available()) {
66 section = kmalloc_node(array_size, GFP_KERNEL, nid); 66 if (node_state(nid, N_HIGH_MEMORY))
67 else 67 section = kmalloc_node(array_size, GFP_KERNEL, nid);
68 else
69 section = kmalloc(array_size, GFP_KERNEL);
70 } else
68 section = alloc_bootmem_node(NODE_DATA(nid), array_size); 71 section = alloc_bootmem_node(NODE_DATA(nid), array_size);
69 72
70 if (section) 73 if (section)
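
Both the sparse-vmemmap.c and sparse.c hunks guard node-affine allocations with node_state(nid, N_HIGH_MEMORY): for a memoryless node there is nothing to allocate "on" that node, so the code falls back to an unconstrained allocation instead. A userspace model of the decision; has_memory(), alloc_on_node() and alloc_anywhere() are stand-ins for node_state(), kmalloc_node()/alloc_pages_node() and kmalloc()/alloc_pages().

#include <stdio.h>
#include <stdlib.h>

static int has_memory(int nid)
{
        return nid == 0;                        /* pretend only node 0 has RAM */
}

static void *alloc_on_node(size_t size, int nid)
{
        printf("allocating %zu bytes on node %d\n", size, nid);
        return malloc(size);
}

static void *alloc_anywhere(size_t size)
{
        printf("allocating %zu bytes wherever memory exists\n", size);
        return malloc(size);
}

static void *section_alloc(size_t size, int nid)
{
        if (has_memory(nid))
                return alloc_on_node(size, nid);
        return alloc_anywhere(size);            /* memoryless node: fall back */
}

int main(void)
{
        free(section_alloc(4096, 0));
        free(section_alloc(4096, 1));
        return 0;
}
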
diff --git a/mm/swap.c b/mm/swap.c
index cb29ae5d33ab..308e57d8d7ed 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -118,7 +118,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
118 spin_lock(&zone->lru_lock); 118 spin_lock(&zone->lru_lock);
119 } 119 }
120 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 120 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
121 int lru = page_is_file_cache(page); 121 int lru = page_lru_base_type(page);
122 list_move_tail(&page->lru, &zone->lru[lru].list); 122 list_move_tail(&page->lru, &zone->lru[lru].list);
123 pgmoved++; 123 pgmoved++;
124 } 124 }
@@ -181,7 +181,7 @@ void activate_page(struct page *page)
181 spin_lock_irq(&zone->lru_lock); 181 spin_lock_irq(&zone->lru_lock);
182 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 182 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
183 int file = page_is_file_cache(page); 183 int file = page_is_file_cache(page);
184 int lru = LRU_BASE + file; 184 int lru = page_lru_base_type(page);
185 del_page_from_lru_list(zone, page, lru); 185 del_page_from_lru_list(zone, page, lru);
186 186
187 SetPageActive(page); 187 SetPageActive(page);
@@ -189,7 +189,7 @@ void activate_page(struct page *page)
189 add_page_to_lru_list(zone, page, lru); 189 add_page_to_lru_list(zone, page, lru);
190 __count_vm_event(PGACTIVATE); 190 __count_vm_event(PGACTIVATE);
191 191
192 update_page_reclaim_stat(zone, page, !!file, 1); 192 update_page_reclaim_stat(zone, page, file, 1);
193 } 193 }
194 spin_unlock_irq(&zone->lru_lock); 194 spin_unlock_irq(&zone->lru_lock);
195} 195}
@@ -496,7 +496,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
496 */ 496 */
497void __init swap_setup(void) 497void __init swap_setup(void)
498{ 498{
499 unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); 499 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
500 500
501#ifdef CONFIG_SWAP 501#ifdef CONFIG_SWAP
502 bdi_init(swapper_space.backing_dev_info); 502 bdi_init(swapper_space.backing_dev_info);
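
pagevec_move_tail() and activate_page() now obtain the base LRU list from page_lru_base_type() rather than deriving it from page_is_file_cache()'s return value. A userspace model of the helper follows; the enum layout and the page_model struct are illustrative, not the kernel's definitions.

#include <stdio.h>

enum lru_list {                                 /* layout assumed for the model */
        LRU_INACTIVE_ANON,
        LRU_ACTIVE_ANON,
        LRU_INACTIVE_FILE,
        LRU_ACTIVE_FILE,
        NR_LRU_LISTS,
};

struct page_model {
        int is_file_backed;                     /* stand-in for the page flags */
};

/* Model of page_lru_base_type(): pick the inactive list of the right type. */
static enum lru_list page_lru_base_type(const struct page_model *page)
{
        return page->is_file_backed ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
}

int main(void)
{
        struct page_model anon = { 0 }, file = { 1 };

        printf("anon -> list %d, file -> list %d\n",
               page_lru_base_type(&anon), page_lru_base_type(&file));
        return 0;
}
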
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 42cd38eba79f..6d1daeb1cb4a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = {
34}; 34};
35 35
36static struct backing_dev_info swap_backing_dev_info = { 36static struct backing_dev_info swap_backing_dev_info = {
37 .name = "swap",
37 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 38 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
38 .unplug_io_fn = swap_unplug_io_fn, 39 .unplug_io_fn = swap_unplug_io_fn,
39}; 40};
@@ -66,10 +67,10 @@ void show_swap_cache_info(void)
66} 67}
67 68
68/* 69/*
69 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, 70 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
70 * but sets SwapCache flag and private instead of mapping and index. 71 * but sets SwapCache flag and private instead of mapping and index.
71 */ 72 */
72int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) 73static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
73{ 74{
74 int error; 75 int error;
75 76
@@ -77,28 +78,43 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
77 VM_BUG_ON(PageSwapCache(page)); 78 VM_BUG_ON(PageSwapCache(page));
78 VM_BUG_ON(!PageSwapBacked(page)); 79 VM_BUG_ON(!PageSwapBacked(page));
79 80
81 page_cache_get(page);
82 SetPageSwapCache(page);
83 set_page_private(page, entry.val);
84
85 spin_lock_irq(&swapper_space.tree_lock);
86 error = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
87 if (likely(!error)) {
88 total_swapcache_pages++;
89 __inc_zone_page_state(page, NR_FILE_PAGES);
90 INC_CACHE_INFO(add_total);
91 }
92 spin_unlock_irq(&swapper_space.tree_lock);
93
94 if (unlikely(error)) {
95 /*
 96	 * Only the context that has set the SWAP_HAS_CACHE flag
 97	 * would call add_to_swap_cache().
 98	 * So add_to_swap_cache() doesn't return -EEXIST.
99 */
100 VM_BUG_ON(error == -EEXIST);
101 set_page_private(page, 0UL);
102 ClearPageSwapCache(page);
103 page_cache_release(page);
104 }
105
106 return error;
107}
108
109
110int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
111{
112 int error;
113
80 error = radix_tree_preload(gfp_mask); 114 error = radix_tree_preload(gfp_mask);
81 if (!error) { 115 if (!error) {
82 page_cache_get(page); 116 error = __add_to_swap_cache(page, entry);
83 SetPageSwapCache(page);
84 set_page_private(page, entry.val);
85
86 spin_lock_irq(&swapper_space.tree_lock);
87 error = radix_tree_insert(&swapper_space.page_tree,
88 entry.val, page);
89 if (likely(!error)) {
90 total_swapcache_pages++;
91 __inc_zone_page_state(page, NR_FILE_PAGES);
92 INC_CACHE_INFO(add_total);
93 }
94 spin_unlock_irq(&swapper_space.tree_lock);
95 radix_tree_preload_end(); 117 radix_tree_preload_end();
96
97 if (unlikely(error)) {
98 set_page_private(page, 0UL);
99 ClearPageSwapCache(page);
100 page_cache_release(page);
101 }
102 } 118 }
103 return error; 119 return error;
104} 120}
@@ -136,38 +152,34 @@ int add_to_swap(struct page *page)
136 VM_BUG_ON(!PageLocked(page)); 152 VM_BUG_ON(!PageLocked(page));
137 VM_BUG_ON(!PageUptodate(page)); 153 VM_BUG_ON(!PageUptodate(page));
138 154
139 for (;;) { 155 entry = get_swap_page();
140 entry = get_swap_page(); 156 if (!entry.val)
141 if (!entry.val) 157 return 0;
142 return 0;
143 158
159 /*
160 * Radix-tree node allocations from PF_MEMALLOC contexts could
161 * completely exhaust the page allocator. __GFP_NOMEMALLOC
162 * stops emergency reserves from being allocated.
163 *
164 * TODO: this could cause a theoretical memory reclaim
165 * deadlock in the swap out path.
166 */
167 /*
168 * Add it to the swap cache and mark it dirty
169 */
170 err = add_to_swap_cache(page, entry,
171 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
172
173 if (!err) { /* Success */
174 SetPageDirty(page);
175 return 1;
176 } else { /* -ENOMEM radix-tree allocation failure */
144 /* 177 /*
145 * Radix-tree node allocations from PF_MEMALLOC contexts could 178 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
146 * completely exhaust the page allocator. __GFP_NOMEMALLOC 179 * clear SWAP_HAS_CACHE flag.
147 * stops emergency reserves from being allocated.
148 *
149 * TODO: this could cause a theoretical memory reclaim
150 * deadlock in the swap out path.
151 */
152 /*
153 * Add it to the swap cache and mark it dirty
154 */ 180 */
155 err = add_to_swap_cache(page, entry, 181 swapcache_free(entry, NULL);
156 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); 182 return 0;
157
158 switch (err) {
159 case 0: /* Success */
160 SetPageDirty(page);
161 return 1;
162 case -EEXIST:
163 /* Raced with "speculative" read_swap_cache_async */
164 swapcache_free(entry, NULL);
165 continue;
166 default:
167 /* -ENOMEM radix-tree allocation failure */
168 swapcache_free(entry, NULL);
169 return 0;
170 }
171 } 183 }
172} 184}
173 185
@@ -289,26 +301,31 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
289 } 301 }
290 302
291 /* 303 /*
304 * call radix_tree_preload() while we can wait.
305 */
306 err = radix_tree_preload(gfp_mask & GFP_KERNEL);
307 if (err)
308 break;
309
310 /*
292 * Swap entry may have been freed since our caller observed it. 311 * Swap entry may have been freed since our caller observed it.
293 */ 312 */
294 err = swapcache_prepare(entry); 313 err = swapcache_prepare(entry);
295 if (err == -EEXIST) /* seems racy */ 314 if (err == -EEXIST) { /* seems racy */
315 radix_tree_preload_end();
296 continue; 316 continue;
297 if (err) /* swp entry is obsolete ? */ 317 }
318 if (err) { /* swp entry is obsolete ? */
319 radix_tree_preload_end();
298 break; 320 break;
321 }
299 322
300 /* 323 /* May fail (-ENOMEM) if radix-tree node allocation failed. */
301 * Associate the page with swap entry in the swap cache.
302 * May fail (-EEXIST) if there is already a page associated
303 * with this entry in the swap cache: added by a racing
304 * read_swap_cache_async, or add_to_swap or shmem_writepage
305 * re-using the just freed swap entry for an existing page.
306 * May fail (-ENOMEM) if radix-tree node allocation failed.
307 */
308 __set_page_locked(new_page); 324 __set_page_locked(new_page);
309 SetPageSwapBacked(new_page); 325 SetPageSwapBacked(new_page);
310 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); 326 err = __add_to_swap_cache(new_page, entry);
311 if (likely(!err)) { 327 if (likely(!err)) {
328 radix_tree_preload_end();
312 /* 329 /*
313 * Initiate read into locked page and return. 330 * Initiate read into locked page and return.
314 */ 331 */
@@ -316,8 +333,13 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
316 swap_readpage(new_page); 333 swap_readpage(new_page);
317 return new_page; 334 return new_page;
318 } 335 }
336 radix_tree_preload_end();
319 ClearPageSwapBacked(new_page); 337 ClearPageSwapBacked(new_page);
320 __clear_page_locked(new_page); 338 __clear_page_locked(new_page);
339 /*
340 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
341 * clear SWAP_HAS_CACHE flag.
342 */
321 swapcache_free(entry, NULL); 343 swapcache_free(entry, NULL);
322 } while (err != -ENOMEM); 344 } while (err != -ENOMEM);
323 345
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8ffdc0d23c53..4de7f02f820b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -161,7 +161,8 @@ static int discard_swap(struct swap_info_struct *si)
161 } 161 }
162 162
163 err = blkdev_issue_discard(si->bdev, start_block, 163 err = blkdev_issue_discard(si->bdev, start_block,
164 nr_blocks, GFP_KERNEL); 164 nr_blocks, GFP_KERNEL,
165 DISCARD_FL_BARRIER);
165 if (err) 166 if (err)
166 break; 167 break;
167 168
@@ -200,7 +201,8 @@ static void discard_swap_cluster(struct swap_info_struct *si,
200 start_block <<= PAGE_SHIFT - 9; 201 start_block <<= PAGE_SHIFT - 9;
201 nr_blocks <<= PAGE_SHIFT - 9; 202 nr_blocks <<= PAGE_SHIFT - 9;
202 if (blkdev_issue_discard(si->bdev, start_block, 203 if (blkdev_issue_discard(si->bdev, start_block,
203 nr_blocks, GFP_NOIO)) 204 nr_blocks, GFP_NOIO,
205 DISCARD_FL_BARRIER))
204 break; 206 break;
205 } 207 }
206 208
@@ -697,7 +699,7 @@ int free_swap_and_cache(swp_entry_t entry)
697 struct swap_info_struct *p; 699 struct swap_info_struct *p;
698 struct page *page = NULL; 700 struct page *page = NULL;
699 701
700 if (is_migration_entry(entry)) 702 if (non_swap_entry(entry))
701 return 1; 703 return 1;
702 704
703 p = swap_info_get(entry); 705 p = swap_info_get(entry);
@@ -1573,9 +1575,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1573 p->flags &= ~SWP_WRITEOK; 1575 p->flags &= ~SWP_WRITEOK;
1574 spin_unlock(&swap_lock); 1576 spin_unlock(&swap_lock);
1575 1577
1576 current->flags |= PF_SWAPOFF; 1578 current->flags |= PF_OOM_ORIGIN;
1577 err = try_to_unuse(type); 1579 err = try_to_unuse(type);
1578 current->flags &= ~PF_SWAPOFF; 1580 current->flags &= ~PF_OOM_ORIGIN;
1579 1581
1580 if (err) { 1582 if (err) {
1581 /* re-insert swap space back into swap_list */ 1583 /* re-insert swap space back into swap_list */
@@ -2083,7 +2085,7 @@ static int __swap_duplicate(swp_entry_t entry, bool cache)
2083 int count; 2085 int count;
2084 bool has_cache; 2086 bool has_cache;
2085 2087
2086 if (is_migration_entry(entry)) 2088 if (non_swap_entry(entry))
2087 return -EINVAL; 2089 return -EINVAL;
2088 2090
2089 type = swp_type(entry); 2091 type = swp_type(entry);
diff --git a/mm/truncate.c b/mm/truncate.c
index ccc3ecf7cb98..450cebdabfc0 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -93,11 +93,11 @@ EXPORT_SYMBOL(cancel_dirty_page);
93 * its lock, b) when a concurrent invalidate_mapping_pages got there first and 93 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
94 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. 94 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
95 */ 95 */
96static void 96static int
97truncate_complete_page(struct address_space *mapping, struct page *page) 97truncate_complete_page(struct address_space *mapping, struct page *page)
98{ 98{
99 if (page->mapping != mapping) 99 if (page->mapping != mapping)
100 return; 100 return -EIO;
101 101
102 if (page_has_private(page)) 102 if (page_has_private(page))
103 do_invalidatepage(page, 0); 103 do_invalidatepage(page, 0);
@@ -108,6 +108,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
108 remove_from_page_cache(page); 108 remove_from_page_cache(page);
109 ClearPageMappedToDisk(page); 109 ClearPageMappedToDisk(page);
110 page_cache_release(page); /* pagecache ref */ 110 page_cache_release(page); /* pagecache ref */
111 return 0;
111} 112}
112 113
113/* 114/*
@@ -135,6 +136,51 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
135 return ret; 136 return ret;
136} 137}
137 138
139int truncate_inode_page(struct address_space *mapping, struct page *page)
140{
141 if (page_mapped(page)) {
142 unmap_mapping_range(mapping,
143 (loff_t)page->index << PAGE_CACHE_SHIFT,
144 PAGE_CACHE_SIZE, 0);
145 }
146 return truncate_complete_page(mapping, page);
147}
148
149/*
150 * Used to get rid of pages on hardware memory corruption.
151 */
152int generic_error_remove_page(struct address_space *mapping, struct page *page)
153{
154 if (!mapping)
155 return -EINVAL;
156 /*
157 * Only punch for normal data pages for now.
158 * Handling other types like directories would need more auditing.
159 */
160 if (!S_ISREG(mapping->host->i_mode))
161 return -EIO;
162 return truncate_inode_page(mapping, page);
163}
164EXPORT_SYMBOL(generic_error_remove_page);
165
166/*
167 * Safely invalidate one page from its pagecache mapping.
168 * It only drops clean, unused pages. The page must be locked.
169 *
170 * Returns 1 if the page is successfully invalidated, otherwise 0.
171 */
172int invalidate_inode_page(struct page *page)
173{
174 struct address_space *mapping = page_mapping(page);
175 if (!mapping)
176 return 0;
177 if (PageDirty(page) || PageWriteback(page))
178 return 0;
179 if (page_mapped(page))
180 return 0;
181 return invalidate_complete_page(mapping, page);
182}
183
138/** 184/**
139 * truncate_inode_pages - truncate range of pages specified by start & end byte offsets 185 * truncate_inode_pages - truncate range of pages specified by start & end byte offsets
140 * @mapping: mapping to truncate 186 * @mapping: mapping to truncate
@@ -196,12 +242,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
196 unlock_page(page); 242 unlock_page(page);
197 continue; 243 continue;
198 } 244 }
199 if (page_mapped(page)) { 245 truncate_inode_page(mapping, page);
200 unmap_mapping_range(mapping,
201 (loff_t)page_index<<PAGE_CACHE_SHIFT,
202 PAGE_CACHE_SIZE, 0);
203 }
204 truncate_complete_page(mapping, page);
205 unlock_page(page); 246 unlock_page(page);
206 } 247 }
207 pagevec_release(&pvec); 248 pagevec_release(&pvec);
@@ -238,15 +279,10 @@ void truncate_inode_pages_range(struct address_space *mapping,
238 break; 279 break;
239 lock_page(page); 280 lock_page(page);
240 wait_on_page_writeback(page); 281 wait_on_page_writeback(page);
241 if (page_mapped(page)) { 282 truncate_inode_page(mapping, page);
242 unmap_mapping_range(mapping,
243 (loff_t)page->index<<PAGE_CACHE_SHIFT,
244 PAGE_CACHE_SIZE, 0);
245 }
246 if (page->index > next) 283 if (page->index > next)
247 next = page->index; 284 next = page->index;
248 next++; 285 next++;
249 truncate_complete_page(mapping, page);
250 unlock_page(page); 286 unlock_page(page);
251 } 287 }
252 pagevec_release(&pvec); 288 pagevec_release(&pvec);
@@ -311,12 +347,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
311 if (lock_failed) 347 if (lock_failed)
312 continue; 348 continue;
313 349
314 if (PageDirty(page) || PageWriteback(page)) 350 ret += invalidate_inode_page(page);
315 goto unlock; 351
316 if (page_mapped(page))
317 goto unlock;
318 ret += invalidate_complete_page(mapping, page);
319unlock:
320 unlock_page(page); 352 unlock_page(page);
321 if (next > end) 353 if (next > end)
322 break; 354 break;
@@ -465,3 +497,67 @@ int invalidate_inode_pages2(struct address_space *mapping)
465 return invalidate_inode_pages2_range(mapping, 0, -1); 497 return invalidate_inode_pages2_range(mapping, 0, -1);
466} 498}
467EXPORT_SYMBOL_GPL(invalidate_inode_pages2); 499EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
500
501/**
502 * truncate_pagecache - unmap and remove pagecache that has been truncated
503 * @inode: inode
504 * @old: old file offset
505 * @new: new file offset
506 *
507 * inode's new i_size must already be written before truncate_pagecache
508 * is called.
509 *
510 * This function should typically be called before the filesystem
511 * releases resources associated with the freed range (eg. deallocates
512 * blocks). This way, pagecache will always stay logically coherent
513 * with on-disk format, and the filesystem would not have to deal with
514 * situations such as writepage being called for a page that has already
515 * had its underlying blocks deallocated.
516 */
517void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
518{
519 if (new < old) {
520 struct address_space *mapping = inode->i_mapping;
521
522 /*
523 * unmap_mapping_range is called twice, first simply for
524 * efficiency so that truncate_inode_pages does fewer
525 * single-page unmaps. However after this first call, and
526 * before truncate_inode_pages finishes, it is possible for
527 * private pages to be COWed, which remain after
528 * truncate_inode_pages finishes, hence the second
529 * unmap_mapping_range call must be made for correctness.
530 */
531 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
532 truncate_inode_pages(mapping, new);
533 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
534 }
535}
536EXPORT_SYMBOL(truncate_pagecache);
537
538/**
539 * vmtruncate - unmap mappings "freed" by truncate() syscall
540 * @inode: inode of the file used
541 * @offset: file offset to start truncating
542 *
543 * NOTE! We have to be ready to update the memory sharing
544 * between the file and the memory map for a potential last
545 * incomplete page. Ugly, but necessary.
546 */
547int vmtruncate(struct inode *inode, loff_t offset)
548{
549 loff_t oldsize;
550 int error;
551
552 error = inode_newsize_ok(inode, offset);
553 if (error)
554 return error;
555 oldsize = inode->i_size;
556 i_size_write(inode, offset);
557 truncate_pagecache(inode, oldsize, offset);
558 if (inode->i_op->truncate)
559 inode->i_op->truncate(inode);
560
561 return error;
562}
563EXPORT_SYMBOL(vmtruncate);
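
truncate_pagecache() and vmtruncate() give filesystems one place that gets the ordering right: validate the new size, publish it with i_size_write(), and only then drop the now-stale pagecache. As a rough sketch of how a filesystem's setattr path might use the helper; myfs_setattr is hypothetical, other attribute handling is omitted, and this is kernel-context code rather than a standalone program.

static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = dentry->d_inode;
        int error;

        if (attr->ia_valid & ATTR_SIZE) {
                /* vmtruncate() validates the new size, updates i_size and
                 * calls truncate_pagecache() before ->truncate(). */
                error = vmtruncate(inode, attr->ia_size);
                if (error)
                        return error;
        }
        /* ... apply the remaining attribute changes here ... */
        return 0;
}
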
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f8189a4b3e13..69511e663234 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -25,7 +25,7 @@
25#include <linux/rcupdate.h> 25#include <linux/rcupdate.h>
26#include <linux/pfn.h> 26#include <linux/pfn.h>
27#include <linux/kmemleak.h> 27#include <linux/kmemleak.h>
28 28#include <linux/highmem.h>
29#include <asm/atomic.h> 29#include <asm/atomic.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
@@ -168,11 +168,9 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end,
168 next = pgd_addr_end(addr, end); 168 next = pgd_addr_end(addr, end);
169 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); 169 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
170 if (err) 170 if (err)
171 break; 171 return err;
172 } while (pgd++, addr = next, addr != end); 172 } while (pgd++, addr = next, addr != end);
173 173
174 if (unlikely(err))
175 return err;
176 return nr; 174 return nr;
177} 175}
178 176
@@ -186,7 +184,7 @@ static int vmap_page_range(unsigned long start, unsigned long end,
186 return ret; 184 return ret;
187} 185}
188 186
189static inline int is_vmalloc_or_module_addr(const void *x) 187int is_vmalloc_or_module_addr(const void *x)
190{ 188{
191 /* 189 /*
192 * ARM, x86-64 and sparc64 put modules in a special place, 190 * ARM, x86-64 and sparc64 put modules in a special place,
@@ -265,6 +263,7 @@ struct vmap_area {
265static DEFINE_SPINLOCK(vmap_area_lock); 263static DEFINE_SPINLOCK(vmap_area_lock);
266static struct rb_root vmap_area_root = RB_ROOT; 264static struct rb_root vmap_area_root = RB_ROOT;
267static LIST_HEAD(vmap_area_list); 265static LIST_HEAD(vmap_area_list);
266static unsigned long vmap_area_pcpu_hole;
268 267
269static struct vmap_area *__find_vmap_area(unsigned long addr) 268static struct vmap_area *__find_vmap_area(unsigned long addr)
270{ 269{
@@ -431,6 +430,15 @@ static void __free_vmap_area(struct vmap_area *va)
431 RB_CLEAR_NODE(&va->rb_node); 430 RB_CLEAR_NODE(&va->rb_node);
432 list_del_rcu(&va->list); 431 list_del_rcu(&va->list);
433 432
433 /*
434 * Track the highest possible candidate for pcpu area
435 * allocation. Areas outside of vmalloc area can be returned
436 * here too, consider only end addresses which fall inside
437 * vmalloc area proper.
438 */
439 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
440 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
441
434 call_rcu(&va->rcu_head, rcu_free_va); 442 call_rcu(&va->rcu_head, rcu_free_va);
435} 443}
436 444
@@ -1038,6 +1046,9 @@ void __init vmalloc_init(void)
1038 va->va_end = va->va_start + tmp->size; 1046 va->va_end = va->va_start + tmp->size;
1039 __insert_vmap_area(va); 1047 __insert_vmap_area(va);
1040 } 1048 }
1049
1050 vmap_area_pcpu_hole = VMALLOC_END;
1051
1041 vmap_initialized = true; 1052 vmap_initialized = true;
1042} 1053}
1043 1054
@@ -1122,13 +1133,34 @@ EXPORT_SYMBOL_GPL(map_vm_area);
1122DEFINE_RWLOCK(vmlist_lock); 1133DEFINE_RWLOCK(vmlist_lock);
1123struct vm_struct *vmlist; 1134struct vm_struct *vmlist;
1124 1135
1136static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1137 unsigned long flags, void *caller)
1138{
1139 struct vm_struct *tmp, **p;
1140
1141 vm->flags = flags;
1142 vm->addr = (void *)va->va_start;
1143 vm->size = va->va_end - va->va_start;
1144 vm->caller = caller;
1145 va->private = vm;
1146 va->flags |= VM_VM_AREA;
1147
1148 write_lock(&vmlist_lock);
1149 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1150 if (tmp->addr >= vm->addr)
1151 break;
1152 }
1153 vm->next = *p;
1154 *p = vm;
1155 write_unlock(&vmlist_lock);
1156}
1157
1125static struct vm_struct *__get_vm_area_node(unsigned long size, 1158static struct vm_struct *__get_vm_area_node(unsigned long size,
1126 unsigned long flags, unsigned long start, unsigned long end, 1159 unsigned long flags, unsigned long start, unsigned long end,
1127 int node, gfp_t gfp_mask, void *caller) 1160 int node, gfp_t gfp_mask, void *caller)
1128{ 1161{
1129 static struct vmap_area *va; 1162 static struct vmap_area *va;
1130 struct vm_struct *area; 1163 struct vm_struct *area;
1131 struct vm_struct *tmp, **p;
1132 unsigned long align = 1; 1164 unsigned long align = 1;
1133 1165
1134 BUG_ON(in_interrupt()); 1166 BUG_ON(in_interrupt());
@@ -1147,7 +1179,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1147 if (unlikely(!size)) 1179 if (unlikely(!size))
1148 return NULL; 1180 return NULL;
1149 1181
1150 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 1182 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
1151 if (unlikely(!area)) 1183 if (unlikely(!area))
1152 return NULL; 1184 return NULL;
1153 1185
@@ -1162,25 +1194,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1162 return NULL; 1194 return NULL;
1163 } 1195 }
1164 1196
1165 area->flags = flags; 1197 insert_vmalloc_vm(area, va, flags, caller);
1166 area->addr = (void *)va->va_start;
1167 area->size = size;
1168 area->pages = NULL;
1169 area->nr_pages = 0;
1170 area->phys_addr = 0;
1171 area->caller = caller;
1172 va->private = area;
1173 va->flags |= VM_VM_AREA;
1174
1175 write_lock(&vmlist_lock);
1176 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1177 if (tmp->addr >= area->addr)
1178 break;
1179 }
1180 area->next = *p;
1181 *p = area;
1182 write_unlock(&vmlist_lock);
1183
1184 return area; 1198 return area;
1185} 1199}
1186 1200
@@ -1256,17 +1270,21 @@ struct vm_struct *remove_vm_area(const void *addr)
1256 if (va && va->flags & VM_VM_AREA) { 1270 if (va && va->flags & VM_VM_AREA) {
1257 struct vm_struct *vm = va->private; 1271 struct vm_struct *vm = va->private;
1258 struct vm_struct *tmp, **p; 1272 struct vm_struct *tmp, **p;
1259 1273 /*
1260 vmap_debug_free_range(va->va_start, va->va_end); 1274 * remove from list and disallow access to this vm_struct
1261 free_unmap_vmap_area(va); 1275 * before unmap. (address range confliction is maintained by
1262 vm->size -= PAGE_SIZE; 1276 * vmap.)
1263 1277 */
1264 write_lock(&vmlist_lock); 1278 write_lock(&vmlist_lock);
1265 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) 1279 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1266 ; 1280 ;
1267 *p = tmp->next; 1281 *p = tmp->next;
1268 write_unlock(&vmlist_lock); 1282 write_unlock(&vmlist_lock);
1269 1283
1284 vmap_debug_free_range(va->va_start, va->va_end);
1285 free_unmap_vmap_area(va);
1286 vm->size -= PAGE_SIZE;
1287
1270 return vm; 1288 return vm;
1271 } 1289 }
1272 return NULL; 1290 return NULL;
@@ -1368,7 +1386,7 @@ void *vmap(struct page **pages, unsigned int count,
1368 1386
1369 might_sleep(); 1387 might_sleep();
1370 1388
1371 if (count > num_physpages) 1389 if (count > totalram_pages)
1372 return NULL; 1390 return NULL;
1373 1391
1374 area = get_vm_area_caller((count << PAGE_SHIFT), flags, 1392 area = get_vm_area_caller((count << PAGE_SHIFT), flags,
@@ -1475,7 +1493,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1475 unsigned long real_size = size; 1493 unsigned long real_size = size;
1476 1494
1477 size = PAGE_ALIGN(size); 1495 size = PAGE_ALIGN(size);
1478 if (!size || (size >> PAGE_SHIFT) > num_physpages) 1496 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1479 return NULL; 1497 return NULL;
1480 1498
1481 area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, 1499 area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END,
@@ -1625,10 +1643,120 @@ void *vmalloc_32_user(unsigned long size)
1625} 1643}
1626EXPORT_SYMBOL(vmalloc_32_user); 1644EXPORT_SYMBOL(vmalloc_32_user);
1627 1645
1646/*
1647 * small helper routine , copy contents to buf from addr.
1648 * If the page is not present, fill zero.
1649 */
1650
1651static int aligned_vread(char *buf, char *addr, unsigned long count)
1652{
1653 struct page *p;
1654 int copied = 0;
1655
1656 while (count) {
1657 unsigned long offset, length;
1658
1659 offset = (unsigned long)addr & ~PAGE_MASK;
1660 length = PAGE_SIZE - offset;
1661 if (length > count)
1662 length = count;
1663 p = vmalloc_to_page(addr);
1664 /*
1665 * To do safe access to this _mapped_ area, we need
1666 * lock. But adding lock here means that we need to add
 1667 * overhead of vmalloc()/vfree() calls for this _debug_
1668 * interface, rarely used. Instead of that, we'll use
1669 * kmap() and get small overhead in this access function.
1670 */
1671 if (p) {
1672 /*
1673 * we can expect USER0 is not used (see vread/vwrite's
1674 * function description)
1675 */
1676 void *map = kmap_atomic(p, KM_USER0);
1677 memcpy(buf, map + offset, length);
1678 kunmap_atomic(map, KM_USER0);
1679 } else
1680 memset(buf, 0, length);
1681
1682 addr += length;
1683 buf += length;
1684 copied += length;
1685 count -= length;
1686 }
1687 return copied;
1688}
1689
1690static int aligned_vwrite(char *buf, char *addr, unsigned long count)
1691{
1692 struct page *p;
1693 int copied = 0;
1694
1695 while (count) {
1696 unsigned long offset, length;
1697
1698 offset = (unsigned long)addr & ~PAGE_MASK;
1699 length = PAGE_SIZE - offset;
1700 if (length > count)
1701 length = count;
1702 p = vmalloc_to_page(addr);
1703 /*
1704 * To do safe access to this _mapped_ area, we need
1705 * lock. But adding lock here means that we need to add
 1706 * overhead of vmalloc()/vfree() calls for this _debug_
1707 * interface, rarely used. Instead of that, we'll use
1708 * kmap() and get small overhead in this access function.
1709 */
1710 if (p) {
1711 /*
1712 * we can expect USER0 is not used (see vread/vwrite's
1713 * function description)
1714 */
1715 void *map = kmap_atomic(p, KM_USER0);
1716 memcpy(map + offset, buf, length);
1717 kunmap_atomic(map, KM_USER0);
1718 }
1719 addr += length;
1720 buf += length;
1721 copied += length;
1722 count -= length;
1723 }
1724 return copied;
1725}
1726
1727/**
1728 * vread() - read vmalloc area in a safe way.
1729 * @buf: buffer for reading data
1730 * @addr: vm address.
1731 * @count: number of bytes to be read.
1732 *
 1733 * Returns # of bytes by which addr and buf should be increased
 1734 * (same number as @count). Returns 0 if [addr...addr+count) doesn't
 1735 * include any intersection with a live vmalloc area.
1736 *
1737 * This function checks that addr is a valid vmalloc'ed area, and
1738 * copy data from that area to a given buffer. If the given memory range
1739 * of [addr...addr+count) includes some valid address, data is copied to
1740 * proper area of @buf. If there are memory holes, they'll be zero-filled.
1741 * IOREMAP area is treated as memory hole and no copy is done.
1742 *
 1743 * If [addr...addr+count) doesn't include any intersection with a live
1744 * vm_struct area, returns 0.
1745 * @buf should be kernel's buffer. Because this function uses KM_USER0,
1746 * the caller should guarantee KM_USER0 is not used.
1747 *
1748 * Note: In usual ops, vread() is never necessary because the caller
1749 * should know vmalloc() area is valid and can use memcpy().
1750 * This is for routines which have to access vmalloc area without
 1751 * any information, such as /dev/kmem.
1752 *
1753 */
1754
1628long vread(char *buf, char *addr, unsigned long count) 1755long vread(char *buf, char *addr, unsigned long count)
1629{ 1756{
1630 struct vm_struct *tmp; 1757 struct vm_struct *tmp;
1631 char *vaddr, *buf_start = buf; 1758 char *vaddr, *buf_start = buf;
1759 unsigned long buflen = count;
1632 unsigned long n; 1760 unsigned long n;
1633 1761
1634 /* Don't allow overflow */ 1762 /* Don't allow overflow */
@@ -1636,7 +1764,7 @@ long vread(char *buf, char *addr, unsigned long count)
1636 count = -(unsigned long) addr; 1764 count = -(unsigned long) addr;
1637 1765
1638 read_lock(&vmlist_lock); 1766 read_lock(&vmlist_lock);
1639 for (tmp = vmlist; tmp; tmp = tmp->next) { 1767 for (tmp = vmlist; count && tmp; tmp = tmp->next) {
1640 vaddr = (char *) tmp->addr; 1768 vaddr = (char *) tmp->addr;
1641 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1769 if (addr >= vaddr + tmp->size - PAGE_SIZE)
1642 continue; 1770 continue;
@@ -1649,32 +1777,72 @@ long vread(char *buf, char *addr, unsigned long count)
1649 count--; 1777 count--;
1650 } 1778 }
1651 n = vaddr + tmp->size - PAGE_SIZE - addr; 1779 n = vaddr + tmp->size - PAGE_SIZE - addr;
1652 do { 1780 if (n > count)
1653 if (count == 0) 1781 n = count;
1654 goto finished; 1782 if (!(tmp->flags & VM_IOREMAP))
1655 *buf = *addr; 1783 aligned_vread(buf, addr, n);
1656 buf++; 1784 else /* IOREMAP area is treated as memory hole */
1657 addr++; 1785 memset(buf, 0, n);
1658 count--; 1786 buf += n;
1659 } while (--n > 0); 1787 addr += n;
1788 count -= n;
1660 } 1789 }
1661finished: 1790finished:
1662 read_unlock(&vmlist_lock); 1791 read_unlock(&vmlist_lock);
1663 return buf - buf_start; 1792
1793 if (buf == buf_start)
1794 return 0;
1795 /* zero-fill memory holes */
1796 if (buf != buf_start + buflen)
1797 memset(buf, 0, buflen - (buf - buf_start));
1798
1799 return buflen;
1664} 1800}
1665 1801
1802/**
1803 * vwrite() - write vmalloc area in a safe way.
1804 * @buf: buffer for source data
1805 * @addr: vm address.
 1806 * @count: number of bytes to be written.
1807 *
 1808 * Returns # of bytes by which addr and buf should be increased
 1809 * (same number as @count).
 1810 * If [addr...addr+count) doesn't include any intersection with a valid
1811 * vmalloc area, returns 0.
1812 *
1813 * This function checks that addr is a valid vmalloc'ed area, and
 1814 * copies data from a buffer to the given addr. If the specified range of
1815 * [addr...addr+count) includes some valid address, data is copied from
1816 * proper area of @buf. If there are memory holes, no copy to hole.
1817 * IOREMAP area is treated as memory hole and no copy is done.
1818 *
 1819 * If [addr...addr+count) doesn't include any intersection with a live
1820 * vm_struct area, returns 0.
1821 * @buf should be kernel's buffer. Because this function uses KM_USER0,
1822 * the caller should guarantee KM_USER0 is not used.
1823 *
1824 * Note: In usual ops, vwrite() is never necessary because the caller
1825 * should know vmalloc() area is valid and can use memcpy().
1826 * This is for routines which have to access vmalloc area without
 1827 * any information, such as /dev/kmem.
1828 *
1829 * The caller should guarantee KM_USER1 is not used.
1830 */
1831
1666long vwrite(char *buf, char *addr, unsigned long count) 1832long vwrite(char *buf, char *addr, unsigned long count)
1667{ 1833{
1668 struct vm_struct *tmp; 1834 struct vm_struct *tmp;
1669 char *vaddr, *buf_start = buf; 1835 char *vaddr;
1670 unsigned long n; 1836 unsigned long n, buflen;
1837 int copied = 0;
1671 1838
1672 /* Don't allow overflow */ 1839 /* Don't allow overflow */
1673 if ((unsigned long) addr + count < count) 1840 if ((unsigned long) addr + count < count)
1674 count = -(unsigned long) addr; 1841 count = -(unsigned long) addr;
1842 buflen = count;
1675 1843
1676 read_lock(&vmlist_lock); 1844 read_lock(&vmlist_lock);
1677 for (tmp = vmlist; tmp; tmp = tmp->next) { 1845 for (tmp = vmlist; count && tmp; tmp = tmp->next) {
1678 vaddr = (char *) tmp->addr; 1846 vaddr = (char *) tmp->addr;
1679 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1847 if (addr >= vaddr + tmp->size - PAGE_SIZE)
1680 continue; 1848 continue;
@@ -1686,18 +1854,21 @@ long vwrite(char *buf, char *addr, unsigned long count)
1686 count--; 1854 count--;
1687 } 1855 }
1688 n = vaddr + tmp->size - PAGE_SIZE - addr; 1856 n = vaddr + tmp->size - PAGE_SIZE - addr;
1689 do { 1857 if (n > count)
1690 if (count == 0) 1858 n = count;
1691 goto finished; 1859 if (!(tmp->flags & VM_IOREMAP)) {
1692 *addr = *buf; 1860 aligned_vwrite(buf, addr, n);
1693 buf++; 1861 copied++;
1694 addr++; 1862 }
1695 count--; 1863 buf += n;
1696 } while (--n > 0); 1864 addr += n;
1865 count -= n;
1697 } 1866 }
1698finished: 1867finished:
1699 read_unlock(&vmlist_lock); 1868 read_unlock(&vmlist_lock);
1700 return buf - buf_start; 1869 if (!copied)
1870 return 0;
1871 return buflen;
1701} 1872}
1702 1873
1703/** 1874/**
@@ -1818,6 +1989,286 @@ void free_vm_area(struct vm_struct *area)
1818} 1989}
1819EXPORT_SYMBOL_GPL(free_vm_area); 1990EXPORT_SYMBOL_GPL(free_vm_area);
1820 1991
1992static struct vmap_area *node_to_va(struct rb_node *n)
1993{
1994 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
1995}
1996
1997/**
1998 * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
1999 * @end: target address
2000 * @pnext: out arg for the next vmap_area
2001 * @pprev: out arg for the previous vmap_area
2002 *
2003 * Returns: %true if either or both of next and prev are found,
2004 * %false if no vmap_area exists
2005 *
 2006 * Find the vmap_areas whose end addresses enclose @end, i.e. if not
2007 * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
2008 */
2009static bool pvm_find_next_prev(unsigned long end,
2010 struct vmap_area **pnext,
2011 struct vmap_area **pprev)
2012{
2013 struct rb_node *n = vmap_area_root.rb_node;
2014 struct vmap_area *va = NULL;
2015
2016 while (n) {
2017 va = rb_entry(n, struct vmap_area, rb_node);
2018 if (end < va->va_end)
2019 n = n->rb_left;
2020 else if (end > va->va_end)
2021 n = n->rb_right;
2022 else
2023 break;
2024 }
2025
2026 if (!va)
2027 return false;
2028
2029 if (va->va_end > end) {
2030 *pnext = va;
2031 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
2032 } else {
2033 *pprev = va;
2034 *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
2035 }
2036 return true;
2037}
2038
2039/**
2040 * pvm_determine_end - find the highest aligned address between two vmap_areas
2041 * @pnext: in/out arg for the next vmap_area
2042 * @pprev: in/out arg for the previous vmap_area
2043 * @align: alignment
2044 *
2045 * Returns: determined end address
2046 *
2047 * Find the highest aligned address between *@pnext and *@pprev below
2048 * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned
2049 * down address is between the end addresses of the two vmap_areas.
2050 *
2051 * Please note that the address returned by this function may fall
2052 * inside *@pnext vmap_area. The caller is responsible for checking
2053 * that.
2054 */
2055static unsigned long pvm_determine_end(struct vmap_area **pnext,
2056 struct vmap_area **pprev,
2057 unsigned long align)
2058{
2059 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
2060 unsigned long addr;
2061
2062 if (*pnext)
2063 addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
2064 else
2065 addr = vmalloc_end;
2066
2067 while (*pprev && (*pprev)->va_end > addr) {
2068 *pnext = *pprev;
2069 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
2070 }
2071
2072 return addr;
2073}
2074
2075/**
2076 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
2077 * @offsets: array containing offset of each area
2078 * @sizes: array containing size of each area
2079 * @nr_vms: the number of areas to allocate
2080 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
2081 * @gfp_mask: allocation mask
2082 *
2083 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
2084 * vm_structs on success, %NULL on failure
2085 *
2086 * Percpu allocator wants to use congruent vm areas so that it can
2087 * maintain the offsets among percpu areas. This function allocates
2088 * congruent vmalloc areas for it. These areas tend to be scattered
2089 * pretty far apart; the distance between two areas can easily be
2090 * gigabytes. To avoid interacting with regular vmallocs, these areas
2091 * are allocated from the top.
2092 *
2093 * Despite its complicated look, this allocator is rather simple. It
2094 * does everything top-down and scans areas from the end looking for
2095 * a matching slot. While scanning, if any of the areas overlaps with
2096 * an existing vmap_area, the base address is pulled down to fit the
2097 * area. Scanning is repeated until all the areas fit, and then all
2098 * necessary data structures are inserted and the result is returned.
2099 */
2100struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2101 const size_t *sizes, int nr_vms,
2102 size_t align, gfp_t gfp_mask)
2103{
2104 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
2105 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
2106 struct vmap_area **vas, *prev, *next;
2107 struct vm_struct **vms;
2108 int area, area2, last_area, term_area;
2109 unsigned long base, start, end, last_end;
2110 bool purged = false;
2111
2112 gfp_mask &= GFP_RECLAIM_MASK;
2113
2114 /* verify parameters and allocate data structures */
2115 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
2116 for (last_area = 0, area = 0; area < nr_vms; area++) {
2117 start = offsets[area];
2118 end = start + sizes[area];
2119
2120 /* is everything aligned properly? */
2121 BUG_ON(!IS_ALIGNED(offsets[area], align));
2122 BUG_ON(!IS_ALIGNED(sizes[area], align));
2123
2124 /* detect the area with the highest address */
2125 if (start > offsets[last_area])
2126 last_area = area;
2127
2128 for (area2 = 0; area2 < nr_vms; area2++) {
2129 unsigned long start2 = offsets[area2];
2130 unsigned long end2 = start2 + sizes[area2];
2131
2132 if (area2 == area)
2133 continue;
2134
2135 BUG_ON(start2 >= start && start2 < end);
2136 BUG_ON(end2 <= end && end2 > start);
2137 }
2138 }
2139 last_end = offsets[last_area] + sizes[last_area];
2140
2141 if (vmalloc_end - vmalloc_start < last_end) {
2142 WARN_ON(true);
2143 return NULL;
2144 }
2145
2146 vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask);
2147 vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask);
2148 if (!vas || !vms)
2149 goto err_free;
2150
2151 for (area = 0; area < nr_vms; area++) {
2152 vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
2153 vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
2154 if (!vas[area] || !vms[area])
2155 goto err_free;
2156 }
2157retry:
2158 spin_lock(&vmap_area_lock);
2159
2160 /* start scanning - we scan from the top, begin with the last area */
2161 area = term_area = last_area;
2162 start = offsets[area];
2163 end = start + sizes[area];
2164
2165 if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
2166 base = vmalloc_end - last_end;
2167 goto found;
2168 }
2169 base = pvm_determine_end(&next, &prev, align) - end;
2170
2171 while (true) {
2172 BUG_ON(next && next->va_end <= base + end);
2173 BUG_ON(prev && prev->va_end > base + end);
2174
2175 /*
2176 * base might have underflowed, add last_end before
2177 * comparing.
2178 */
2179 if (base + last_end < vmalloc_start + last_end) {
2180 spin_unlock(&vmap_area_lock);
2181 if (!purged) {
2182 purge_vmap_area_lazy();
2183 purged = true;
2184 goto retry;
2185 }
2186 goto err_free;
2187 }
2188
2189 /*
2190 * If next overlaps, move base downwards so that it's
2191 * right below next and then recheck.
2192 */
2193 if (next && next->va_start < base + end) {
2194 base = pvm_determine_end(&next, &prev, align) - end;
2195 term_area = area;
2196 continue;
2197 }
2198
2199 /*
2200 * If prev overlaps, shift down next and prev and move
2201 * base so that it's right below new next and then
2202 * recheck.
2203 */
2204 if (prev && prev->va_end > base + start) {
2205 next = prev;
2206 prev = node_to_va(rb_prev(&next->rb_node));
2207 base = pvm_determine_end(&next, &prev, align) - end;
2208 term_area = area;
2209 continue;
2210 }
2211
2212 /*
2213 * This area fits, move on to the previous one. If
2214 * the previous one is the terminal one, we're done.
2215 */
2216 area = (area + nr_vms - 1) % nr_vms;
2217 if (area == term_area)
2218 break;
2219 start = offsets[area];
2220 end = start + sizes[area];
2221 pvm_find_next_prev(base + end, &next, &prev);
2222 }
2223found:
2224 /* we've found a fitting base, insert all va's */
2225 for (area = 0; area < nr_vms; area++) {
2226 struct vmap_area *va = vas[area];
2227
2228 va->va_start = base + offsets[area];
2229 va->va_end = va->va_start + sizes[area];
2230 __insert_vmap_area(va);
2231 }
2232
2233 vmap_area_pcpu_hole = base + offsets[last_area];
2234
2235 spin_unlock(&vmap_area_lock);
2236
2237 /* insert all vm's */
2238 for (area = 0; area < nr_vms; area++)
2239 insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
2240 pcpu_get_vm_areas);
2241
2242 kfree(vas);
2243 return vms;
2244
2245err_free:
2246 for (area = 0; area < nr_vms; area++) {
2247 if (vas)
2248 kfree(vas[area]);
2249 if (vms)
2250 kfree(vms[area]);
2251 }
2252 kfree(vas);
2253 kfree(vms);
2254 return NULL;
2255}
2256
2257/**
2258 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
2259 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
2260 * @nr_vms: the number of allocated areas
2261 *
2262 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
2263 */
2264void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2265{
2266 int i;
2267
2268 for (i = 0; i < nr_vms; i++)
2269 free_vm_area(vms[i]);
2270 kfree(vms);
2271}
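A minimal sketch (not from this patch) of how a percpu first-chunk setup path might use this pair, with made-up offsets and sizes that satisfy the alignment and non-overlap checks above:

static int pcpu_vm_areas_sketch(void)
{
	/* two congruent areas, 2 pages each, starting 4 pages apart */
	const unsigned long offsets[] = { 0, 4 * PAGE_SIZE };
	const size_t sizes[] = { 2 * PAGE_SIZE, 2 * PAGE_SIZE };
	struct vm_struct **vms;

	vms = pcpu_get_vm_areas(offsets, sizes, 2, PAGE_SIZE, GFP_KERNEL);
	if (!vms)
		return -ENOMEM;

	/* vms[1]->addr - vms[0]->addr == offsets[1] - offsets[0] */
	/* ... map the percpu pages into the two areas here ... */

	pcpu_free_vm_areas(vms, 2);
	return 0;
}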
1821 2272
1822#ifdef CONFIG_PROC_FS 2273#ifdef CONFIG_PROC_FS
1823static void *s_start(struct seq_file *m, loff_t *pos) 2274static void *s_start(struct seq_file *m, loff_t *pos)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dea7abd31098..64e438898832 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -148,8 +148,8 @@ static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
148 return &zone->reclaim_stat; 148 return &zone->reclaim_stat;
149} 149}
150 150
151static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc, 151static unsigned long zone_nr_lru_pages(struct zone *zone,
152 enum lru_list lru) 152 struct scan_control *sc, enum lru_list lru)
153{ 153{
154 if (!scanning_global_lru(sc)) 154 if (!scanning_global_lru(sc))
155 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); 155 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
@@ -286,7 +286,12 @@ static inline int page_mapping_inuse(struct page *page)
286 286
287static inline int is_page_cache_freeable(struct page *page) 287static inline int is_page_cache_freeable(struct page *page)
288{ 288{
289 return page_count(page) - !!page_has_private(page) == 2; 289 /*
290 * A freeable page cache page is referenced only by the caller
291 * that isolated the page, the page cache radix tree and
292 * optional buffer heads at page->private.
293 */
294 return page_count(page) - page_has_private(page) == 2;
290} 295}
291 296
292static int may_write_to_queue(struct backing_dev_info *bdi) 297static int may_write_to_queue(struct backing_dev_info *bdi)
@@ -361,7 +366,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
361 * block, for some throttling. This happens by accident, because 366 * block, for some throttling. This happens by accident, because
362 * swap_backing_dev_info is bust: it doesn't reflect the 367 * swap_backing_dev_info is bust: it doesn't reflect the
363 * congestion state of the swapdevs. Easy to fix, if needed. 368 * congestion state of the swapdevs. Easy to fix, if needed.
364 * See swapfile.c:page_queue_congested().
365 */ 369 */
366 if (!is_page_cache_freeable(page)) 370 if (!is_page_cache_freeable(page))
367 return PAGE_KEEP; 371 return PAGE_KEEP;
@@ -531,7 +535,7 @@ redo:
531 * unevictable page on [in]active list. 535 * unevictable page on [in]active list.
532 * We know how to handle that. 536 * We know how to handle that.
533 */ 537 */
534 lru = active + page_is_file_cache(page); 538 lru = active + page_lru_base_type(page);
535 lru_cache_add_lru(page, lru); 539 lru_cache_add_lru(page, lru);
536 } else { 540 } else {
537 /* 541 /*
@@ -630,9 +634,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
630 634
631 referenced = page_referenced(page, 1, 635 referenced = page_referenced(page, 1,
632 sc->mem_cgroup, &vm_flags); 636 sc->mem_cgroup, &vm_flags);
633 /* In active use or really unfreeable? Activate it. */ 637 /*
638 * In active use or really unfreeable? Activate it.
639 * If a page which has PG_mlocked lost the isolation race,
640 * try_to_unmap() moves it to the unevictable list.
641 */
634 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && 642 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
635 referenced && page_mapping_inuse(page)) 643 referenced && page_mapping_inuse(page)
644 && !(vm_flags & VM_LOCKED))
636 goto activate_locked; 645 goto activate_locked;
637 646
638 /* 647 /*
@@ -654,7 +663,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
654 * processes. Try to unmap it here. 663 * processes. Try to unmap it here.
655 */ 664 */
656 if (page_mapped(page) && mapping) { 665 if (page_mapped(page) && mapping) {
657 switch (try_to_unmap(page, 0)) { 666 switch (try_to_unmap(page, TTU_UNMAP)) {
658 case SWAP_FAIL: 667 case SWAP_FAIL:
659 goto activate_locked; 668 goto activate_locked;
660 case SWAP_AGAIN: 669 case SWAP_AGAIN:
@@ -816,7 +825,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
816 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 825 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
817 return ret; 826 return ret;
818 827
819 if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) 828 if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
820 return ret; 829 return ret;
821 830
822 /* 831 /*
@@ -930,6 +939,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
930 /* Check that we have not crossed a zone boundary. */ 939 /* Check that we have not crossed a zone boundary. */
931 if (unlikely(page_zone_id(cursor_page) != zone_id)) 940 if (unlikely(page_zone_id(cursor_page) != zone_id))
932 continue; 941 continue;
942
943 /*
944 * If we don't have enough swap space, reclaiming
945 * anon pages which don't already have a swap slot is
946 * pointless.
947 */
948 if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
949 !PageSwapCache(cursor_page))
950 continue;
951
933 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 952 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
934 list_move(&cursor_page->lru, dst); 953 list_move(&cursor_page->lru, dst);
935 mem_cgroup_del_lru(cursor_page); 954 mem_cgroup_del_lru(cursor_page);
@@ -956,7 +975,7 @@ static unsigned long isolate_pages_global(unsigned long nr,
956 if (file) 975 if (file)
957 lru += LRU_FILE; 976 lru += LRU_FILE;
958 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, 977 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
959 mode, !!file); 978 mode, file);
960} 979}
961 980
962/* 981/*
@@ -971,7 +990,7 @@ static unsigned long clear_active_flags(struct list_head *page_list,
971 struct page *page; 990 struct page *page;
972 991
973 list_for_each_entry(page, page_list, lru) { 992 list_for_each_entry(page, page_list, lru) {
974 lru = page_is_file_cache(page); 993 lru = page_lru_base_type(page);
975 if (PageActive(page)) { 994 if (PageActive(page)) {
976 lru += LRU_ACTIVE; 995 lru += LRU_ACTIVE;
977 ClearPageActive(page); 996 ClearPageActive(page);
@@ -1029,6 +1048,31 @@ int isolate_lru_page(struct page *page)
1029} 1048}
1030 1049
1031/* 1050/*
1051 * Are there way too many processes in the direct reclaim path already?
1052 */
1053static int too_many_isolated(struct zone *zone, int file,
1054 struct scan_control *sc)
1055{
1056 unsigned long inactive, isolated;
1057
1058 if (current_is_kswapd())
1059 return 0;
1060
1061 if (!scanning_global_lru(sc))
1062 return 0;
1063
1064 if (file) {
1065 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1066 isolated = zone_page_state(zone, NR_ISOLATED_FILE);
1067 } else {
1068 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1069 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1070 }
1071
1072 return isolated > inactive;
1073}
1074
1075/*
1032 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1076 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1033 * of reclaimed pages 1077 * of reclaimed pages
1034 */ 1078 */
@@ -1043,6 +1087,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1043 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1087 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1044 int lumpy_reclaim = 0; 1088 int lumpy_reclaim = 0;
1045 1089
1090 while (unlikely(too_many_isolated(zone, file, sc))) {
1091 congestion_wait(WRITE, HZ/10);
1092
1093 /* We are about to die and free our memory. Return now. */
1094 if (fatal_signal_pending(current))
1095 return SWAP_CLUSTER_MAX;
1096 }
1097
1046 /* 1098 /*
1047 * If we need a large contiguous chunk of memory, or have 1099 * If we need a large contiguous chunk of memory, or have
1048 * trouble getting a small set of contiguous pages, we 1100 * trouble getting a small set of contiguous pages, we
@@ -1067,10 +1119,26 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1067 unsigned long nr_active; 1119 unsigned long nr_active;
1068 unsigned int count[NR_LRU_LISTS] = { 0, }; 1120 unsigned int count[NR_LRU_LISTS] = { 0, };
1069 int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; 1121 int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
1122 unsigned long nr_anon;
1123 unsigned long nr_file;
1070 1124
1071 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 1125 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
1072 &page_list, &nr_scan, sc->order, mode, 1126 &page_list, &nr_scan, sc->order, mode,
1073 zone, sc->mem_cgroup, 0, file); 1127 zone, sc->mem_cgroup, 0, file);
1128
1129 if (scanning_global_lru(sc)) {
1130 zone->pages_scanned += nr_scan;
1131 if (current_is_kswapd())
1132 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
1133 nr_scan);
1134 else
1135 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1136 nr_scan);
1137 }
1138
1139 if (nr_taken == 0)
1140 goto done;
1141
1074 nr_active = clear_active_flags(&page_list, count); 1142 nr_active = clear_active_flags(&page_list, count);
1075 __count_vm_events(PGDEACTIVATE, nr_active); 1143 __count_vm_events(PGDEACTIVATE, nr_active);
1076 1144
@@ -1083,8 +1151,10 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1083 __mod_zone_page_state(zone, NR_INACTIVE_ANON, 1151 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1084 -count[LRU_INACTIVE_ANON]); 1152 -count[LRU_INACTIVE_ANON]);
1085 1153
1086 if (scanning_global_lru(sc)) 1154 nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1087 zone->pages_scanned += nr_scan; 1155 nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1156 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
1157 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1088 1158
1089 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; 1159 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
1090 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; 1160 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
@@ -1118,18 +1188,12 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1118 } 1188 }
1119 1189
1120 nr_reclaimed += nr_freed; 1190 nr_reclaimed += nr_freed;
1191
1121 local_irq_disable(); 1192 local_irq_disable();
1122 if (current_is_kswapd()) { 1193 if (current_is_kswapd())
1123 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
1124 __count_vm_events(KSWAPD_STEAL, nr_freed); 1194 __count_vm_events(KSWAPD_STEAL, nr_freed);
1125 } else if (scanning_global_lru(sc))
1126 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
1127
1128 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 1195 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
1129 1196
1130 if (nr_taken == 0)
1131 goto done;
1132
1133 spin_lock(&zone->lru_lock); 1197 spin_lock(&zone->lru_lock);
1134 /* 1198 /*
1135 * Put back any unfreeable pages. 1199 * Put back any unfreeable pages.
@@ -1148,8 +1212,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1148 SetPageLRU(page); 1212 SetPageLRU(page);
1149 lru = page_lru(page); 1213 lru = page_lru(page);
1150 add_page_to_lru_list(zone, page, lru); 1214 add_page_to_lru_list(zone, page, lru);
1151 if (PageActive(page)) { 1215 if (is_active_lru(lru)) {
1152 int file = !!page_is_file_cache(page); 1216 int file = is_file_lru(lru);
1153 reclaim_stat->recent_rotated[file]++; 1217 reclaim_stat->recent_rotated[file]++;
1154 } 1218 }
1155 if (!pagevec_add(&pvec, page)) { 1219 if (!pagevec_add(&pvec, page)) {
@@ -1158,10 +1222,13 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1158 spin_lock_irq(&zone->lru_lock); 1222 spin_lock_irq(&zone->lru_lock);
1159 } 1223 }
1160 } 1224 }
1225 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1226 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1227
1161 } while (nr_scanned < max_scan); 1228 } while (nr_scanned < max_scan);
1162 spin_unlock(&zone->lru_lock); 1229
1163done: 1230done:
1164 local_irq_enable(); 1231 spin_unlock_irq(&zone->lru_lock);
1165 pagevec_release(&pvec); 1232 pagevec_release(&pvec);
1166 return nr_reclaimed; 1233 return nr_reclaimed;
1167} 1234}
@@ -1210,15 +1277,10 @@ static void move_active_pages_to_lru(struct zone *zone,
1210 1277
1211 while (!list_empty(list)) { 1278 while (!list_empty(list)) {
1212 page = lru_to_page(list); 1279 page = lru_to_page(list);
1213 prefetchw_prev_lru_page(page, list, flags);
1214 1280
1215 VM_BUG_ON(PageLRU(page)); 1281 VM_BUG_ON(PageLRU(page));
1216 SetPageLRU(page); 1282 SetPageLRU(page);
1217 1283
1218 VM_BUG_ON(!PageActive(page));
1219 if (!is_active_lru(lru))
1220 ClearPageActive(page); /* we are de-activating */
1221
1222 list_move(&page->lru, &zone->lru[lru].list); 1284 list_move(&page->lru, &zone->lru[lru].list);
1223 mem_cgroup_add_lru_list(page, lru); 1285 mem_cgroup_add_lru_list(page, lru);
1224 pgmoved++; 1286 pgmoved++;
@@ -1239,7 +1301,7 @@ static void move_active_pages_to_lru(struct zone *zone,
1239static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1301static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1240 struct scan_control *sc, int priority, int file) 1302 struct scan_control *sc, int priority, int file)
1241{ 1303{
1242 unsigned long pgmoved; 1304 unsigned long nr_taken;
1243 unsigned long pgscanned; 1305 unsigned long pgscanned;
1244 unsigned long vm_flags; 1306 unsigned long vm_flags;
1245 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1307 LIST_HEAD(l_hold); /* The pages which were snipped off */
@@ -1247,10 +1309,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1247 LIST_HEAD(l_inactive); 1309 LIST_HEAD(l_inactive);
1248 struct page *page; 1310 struct page *page;
1249 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1311 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1312 unsigned long nr_rotated = 0;
1250 1313
1251 lru_add_drain(); 1314 lru_add_drain();
1252 spin_lock_irq(&zone->lru_lock); 1315 spin_lock_irq(&zone->lru_lock);
1253 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, 1316 nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1254 ISOLATE_ACTIVE, zone, 1317 ISOLATE_ACTIVE, zone,
1255 sc->mem_cgroup, 1, file); 1318 sc->mem_cgroup, 1, file);
1256 /* 1319 /*
@@ -1260,16 +1323,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1260 if (scanning_global_lru(sc)) { 1323 if (scanning_global_lru(sc)) {
1261 zone->pages_scanned += pgscanned; 1324 zone->pages_scanned += pgscanned;
1262 } 1325 }
1263 reclaim_stat->recent_scanned[!!file] += pgmoved; 1326 reclaim_stat->recent_scanned[file] += nr_taken;
1264 1327
1265 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1328 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1266 if (file) 1329 if (file)
1267 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); 1330 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
1268 else 1331 else
1269 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); 1332 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
1333 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1270 spin_unlock_irq(&zone->lru_lock); 1334 spin_unlock_irq(&zone->lru_lock);
1271 1335
1272 pgmoved = 0; /* count referenced (mapping) mapped pages */
1273 while (!list_empty(&l_hold)) { 1336 while (!list_empty(&l_hold)) {
1274 cond_resched(); 1337 cond_resched();
1275 page = lru_to_page(&l_hold); 1338 page = lru_to_page(&l_hold);
@@ -1283,7 +1346,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1283 /* page_referenced clears PageReferenced */ 1346 /* page_referenced clears PageReferenced */
1284 if (page_mapping_inuse(page) && 1347 if (page_mapping_inuse(page) &&
1285 page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { 1348 page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1286 pgmoved++; 1349 nr_rotated++;
1287 /* 1350 /*
1288 * Identify referenced, file-backed active pages and 1351 * Identify referenced, file-backed active pages and
1289 * give them one more trip around the active list. So 1352 * give them one more trip around the active list. So
@@ -1299,6 +1362,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1299 } 1362 }
1300 } 1363 }
1301 1364
1365 ClearPageActive(page); /* we are de-activating */
1302 list_add(&page->lru, &l_inactive); 1366 list_add(&page->lru, &l_inactive);
1303 } 1367 }
1304 1368
@@ -1312,13 +1376,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1312 * helps balance scan pressure between file and anonymous pages in 1376 * helps balance scan pressure between file and anonymous pages in
1313 * get_scan_ratio. 1377 * get_scan_ratio.
1314 */ 1378 */
1315 reclaim_stat->recent_rotated[!!file] += pgmoved; 1379 reclaim_stat->recent_rotated[file] += nr_rotated;
1316 1380
1317 move_active_pages_to_lru(zone, &l_active, 1381 move_active_pages_to_lru(zone, &l_active,
1318 LRU_ACTIVE + file * LRU_FILE); 1382 LRU_ACTIVE + file * LRU_FILE);
1319 move_active_pages_to_lru(zone, &l_inactive, 1383 move_active_pages_to_lru(zone, &l_inactive,
1320 LRU_BASE + file * LRU_FILE); 1384 LRU_BASE + file * LRU_FILE);
1321 1385 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1322 spin_unlock_irq(&zone->lru_lock); 1386 spin_unlock_irq(&zone->lru_lock);
1323} 1387}
1324 1388
@@ -1424,10 +1488,10 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1424 unsigned long ap, fp; 1488 unsigned long ap, fp;
1425 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1489 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1426 1490
1427 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + 1491 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1428 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); 1492 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1429 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + 1493 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1430 zone_nr_pages(zone, sc, LRU_INACTIVE_FILE); 1494 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1431 1495
1432 if (scanning_global_lru(sc)) { 1496 if (scanning_global_lru(sc)) {
1433 free = zone_page_state(zone, NR_FREE_PAGES); 1497 free = zone_page_state(zone, NR_FREE_PAGES);
@@ -1521,6 +1585,7 @@ static void shrink_zone(int priority, struct zone *zone,
1521 enum lru_list l; 1585 enum lru_list l;
1522 unsigned long nr_reclaimed = sc->nr_reclaimed; 1586 unsigned long nr_reclaimed = sc->nr_reclaimed;
1523 unsigned long swap_cluster_max = sc->swap_cluster_max; 1587 unsigned long swap_cluster_max = sc->swap_cluster_max;
1588 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1524 int noswap = 0; 1589 int noswap = 0;
1525 1590
1526 /* If we have no swap space, do not bother scanning anon pages. */ 1591 /* If we have no swap space, do not bother scanning anon pages. */
@@ -1535,17 +1600,14 @@ static void shrink_zone(int priority, struct zone *zone,
1535 int file = is_file_lru(l); 1600 int file = is_file_lru(l);
1536 unsigned long scan; 1601 unsigned long scan;
1537 1602
1538 scan = zone_nr_pages(zone, sc, l); 1603 scan = zone_nr_lru_pages(zone, sc, l);
1539 if (priority || noswap) { 1604 if (priority || noswap) {
1540 scan >>= priority; 1605 scan >>= priority;
1541 scan = (scan * percent[file]) / 100; 1606 scan = (scan * percent[file]) / 100;
1542 } 1607 }
1543 if (scanning_global_lru(sc)) 1608 nr[l] = nr_scan_try_batch(scan,
1544 nr[l] = nr_scan_try_batch(scan, 1609 &reclaim_stat->nr_saved_scan[l],
1545 &zone->lru[l].nr_saved_scan, 1610 swap_cluster_max);
1546 swap_cluster_max);
1547 else
1548 nr[l] = scan;
1549 } 1611 }
1550 1612
1551 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1613 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1647,10 +1709,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1647 * 1709 *
1648 * If the caller is !__GFP_FS then the probability of a failure is reasonably 1710 * If the caller is !__GFP_FS then the probability of a failure is reasonably
1649 * high - the zone may be full of dirty or under-writeback pages, which this 1711 * high - the zone may be full of dirty or under-writeback pages, which this
1650 * caller can't do much about. We kick pdflush and take explicit naps in the 1712 * caller can't do much about. We kick the writeback threads and take explicit
1651 * hope that some of these pages can be written. But if the allocating task 1713 * naps in the hope that some of these pages can be written. But if the
1652 * holds filesystem locks which prevent writeout this might not work, and the 1714 * allocating task holds filesystem locks which prevent writeout this might not
1653 * allocation attempt will fail. 1715 * work, and the allocation attempt will fail.
1654 * 1716 *
1655 * returns: 0, if no pages reclaimed 1717 * returns: 0, if no pages reclaimed
1656 * else, the number of pages reclaimed 1718 * else, the number of pages reclaimed
@@ -1680,7 +1742,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1680 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1742 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1681 continue; 1743 continue;
1682 1744
1683 lru_pages += zone_lru_pages(zone); 1745 lru_pages += zone_reclaimable_pages(zone);
1684 } 1746 }
1685 } 1747 }
1686 1748
@@ -1715,7 +1777,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1715 */ 1777 */
1716 if (total_scanned > sc->swap_cluster_max + 1778 if (total_scanned > sc->swap_cluster_max +
1717 sc->swap_cluster_max / 2) { 1779 sc->swap_cluster_max / 2) {
1718 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1780 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
1719 sc->may_writepage = 1; 1781 sc->may_writepage = 1;
1720 } 1782 }
1721 1783
@@ -1774,11 +1836,45 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1774 1836
1775#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1837#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1776 1838
1839unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1840 gfp_t gfp_mask, bool noswap,
1841 unsigned int swappiness,
1842 struct zone *zone, int nid)
1843{
1844 struct scan_control sc = {
1845 .may_writepage = !laptop_mode,
1846 .may_unmap = 1,
1847 .may_swap = !noswap,
1848 .swap_cluster_max = SWAP_CLUSTER_MAX,
1849 .swappiness = swappiness,
1850 .order = 0,
1851 .mem_cgroup = mem,
1852 .isolate_pages = mem_cgroup_isolate_pages,
1853 };
1854 nodemask_t nm = nodemask_of_node(nid);
1855
1856 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1857 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1858 sc.nodemask = &nm;
1859 sc.nr_reclaimed = 0;
1860 sc.nr_scanned = 0;
1861 /*
1862 * NOTE: Although we can get the priority field, using it
1863 * here is not a good idea, since it limits the pages we can scan.
1864 * if we don't reclaim here, the shrink_zone from balance_pgdat
1865 * will pick up pages from other mem cgroup's as well. We hack
1866 * the priority and make it zero.
1867 */
1868 shrink_zone(0, zone, &sc);
1869 return sc.nr_reclaimed;
1870}
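For context, a hedged sketch of how the soft-limit side (in memcontrol.c, not shown in this hunk) is expected to drive the helper above per node and zone; the wrapper name and the swappiness value are assumptions, not code from this patch:

static unsigned long soft_limit_reclaim_sketch(struct mem_cgroup *mem,
					       int nid, int zid,
					       gfp_t gfp_mask)
{
	struct zone *zone = &NODE_DATA(nid)->node_zones[zid];

	/* priority is forced to 0 inside, see the NOTE above */
	return mem_cgroup_shrink_node_zone(mem, gfp_mask, false,
					   60 /* swappiness */, zone, nid);
}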
1871
1777unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 1872unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1778 gfp_t gfp_mask, 1873 gfp_t gfp_mask,
1779 bool noswap, 1874 bool noswap,
1780 unsigned int swappiness) 1875 unsigned int swappiness)
1781{ 1876{
1877 struct zonelist *zonelist;
1782 struct scan_control sc = { 1878 struct scan_control sc = {
1783 .may_writepage = !laptop_mode, 1879 .may_writepage = !laptop_mode,
1784 .may_unmap = 1, 1880 .may_unmap = 1,
@@ -1790,7 +1886,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1790 .isolate_pages = mem_cgroup_isolate_pages, 1886 .isolate_pages = mem_cgroup_isolate_pages,
1791 .nodemask = NULL, /* we don't care the placement */ 1887 .nodemask = NULL, /* we don't care the placement */
1792 }; 1888 };
1793 struct zonelist *zonelist;
1794 1889
1795 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 1890 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1796 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 1891 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -1897,7 +1992,7 @@ loop_again:
1897 for (i = 0; i <= end_zone; i++) { 1992 for (i = 0; i <= end_zone; i++) {
1898 struct zone *zone = pgdat->node_zones + i; 1993 struct zone *zone = pgdat->node_zones + i;
1899 1994
1900 lru_pages += zone_lru_pages(zone); 1995 lru_pages += zone_reclaimable_pages(zone);
1901 } 1996 }
1902 1997
1903 /* 1998 /*
@@ -1912,6 +2007,7 @@ loop_again:
1912 for (i = 0; i <= end_zone; i++) { 2007 for (i = 0; i <= end_zone; i++) {
1913 struct zone *zone = pgdat->node_zones + i; 2008 struct zone *zone = pgdat->node_zones + i;
1914 int nr_slab; 2009 int nr_slab;
2010 int nid, zid;
1915 2011
1916 if (!populated_zone(zone)) 2012 if (!populated_zone(zone))
1917 continue; 2013 continue;
@@ -1926,6 +2022,15 @@ loop_again:
1926 temp_priority[i] = priority; 2022 temp_priority[i] = priority;
1927 sc.nr_scanned = 0; 2023 sc.nr_scanned = 0;
1928 note_zone_scanning_priority(zone, priority); 2024 note_zone_scanning_priority(zone, priority);
2025
2026 nid = pgdat->node_id;
2027 zid = zone_idx(zone);
2028 /*
2029 * Call soft limit reclaim before calling shrink_zone.
2030 * For now we ignore the return value.
2031 */
2032 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
2033 nid, zid);
1929 /* 2034 /*
1930 * We put equal pressure on every zone, unless one 2035 * We put equal pressure on every zone, unless one
1931 * zone has way too many pages free already. 2036 * zone has way too many pages free already.
@@ -1941,7 +2046,7 @@ loop_again:
1941 if (zone_is_all_unreclaimable(zone)) 2046 if (zone_is_all_unreclaimable(zone))
1942 continue; 2047 continue;
1943 if (nr_slab == 0 && zone->pages_scanned >= 2048 if (nr_slab == 0 && zone->pages_scanned >=
1944 (zone_lru_pages(zone) * 6)) 2049 (zone_reclaimable_pages(zone) * 6))
1945 zone_set_flag(zone, 2050 zone_set_flag(zone,
1946 ZONE_ALL_UNRECLAIMABLE); 2051 ZONE_ALL_UNRECLAIMABLE);
1947 /* 2052 /*
@@ -2108,12 +2213,39 @@ void wakeup_kswapd(struct zone *zone, int order)
2108 wake_up_interruptible(&pgdat->kswapd_wait); 2213 wake_up_interruptible(&pgdat->kswapd_wait);
2109} 2214}
2110 2215
2111unsigned long global_lru_pages(void) 2216/*
2217 * The reclaimable count should be mostly accurate.
2218 * The less reclaimable pages may be:
2219 * - mlocked pages, which will be moved to the unevictable list when encountered
2220 * - mapped pages, which may require several passes to be reclaimed
2221 * - dirty pages, which are not "instantly" reclaimable
2222 */
2223unsigned long global_reclaimable_pages(void)
2112{ 2224{
2113 return global_page_state(NR_ACTIVE_ANON) 2225 int nr;
2114 + global_page_state(NR_ACTIVE_FILE) 2226
2115 + global_page_state(NR_INACTIVE_ANON) 2227 nr = global_page_state(NR_ACTIVE_FILE) +
2116 + global_page_state(NR_INACTIVE_FILE); 2228 global_page_state(NR_INACTIVE_FILE);
2229
2230 if (nr_swap_pages > 0)
2231 nr += global_page_state(NR_ACTIVE_ANON) +
2232 global_page_state(NR_INACTIVE_ANON);
2233
2234 return nr;
2235}
2236
2237unsigned long zone_reclaimable_pages(struct zone *zone)
2238{
2239 int nr;
2240
2241 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
2242 zone_page_state(zone, NR_INACTIVE_FILE);
2243
2244 if (nr_swap_pages > 0)
2245 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
2246 zone_page_state(zone, NR_INACTIVE_ANON);
2247
2248 return nr;
2117} 2249}
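To illustrate the behavioural change (illustrative only, not part of this patch): the old global_lru_pages() summed all four LRU lists unconditionally, while global_reclaimable_pages() counts the anon lists only when swap is available; the debug helper name is made up:

static void reclaimable_report_sketch(void)
{
	unsigned long all_lru = global_page_state(NR_ACTIVE_ANON) +
				global_page_state(NR_INACTIVE_ANON) +
				global_page_state(NR_ACTIVE_FILE) +
				global_page_state(NR_INACTIVE_FILE);

	/* with no swap configured the two counts differ by the anon LRUs */
	printk(KERN_DEBUG "lru: %lu reclaimable: %lu\n",
	       all_lru, global_reclaimable_pages());
}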
2118 2250
2119#ifdef CONFIG_HIBERNATION 2251#ifdef CONFIG_HIBERNATION
@@ -2128,6 +2260,7 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
2128{ 2260{
2129 struct zone *zone; 2261 struct zone *zone;
2130 unsigned long nr_reclaimed = 0; 2262 unsigned long nr_reclaimed = 0;
2263 struct zone_reclaim_stat *reclaim_stat;
2131 2264
2132 for_each_populated_zone(zone) { 2265 for_each_populated_zone(zone) {
2133 enum lru_list l; 2266 enum lru_list l;
@@ -2144,11 +2277,14 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
2144 l == LRU_ACTIVE_FILE)) 2277 l == LRU_ACTIVE_FILE))
2145 continue; 2278 continue;
2146 2279
2147 zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; 2280 reclaim_stat = get_reclaim_stat(zone, sc);
2148 if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { 2281 reclaim_stat->nr_saved_scan[l] +=
2282 (lru_pages >> prio) + 1;
2283 if (reclaim_stat->nr_saved_scan[l]
2284 >= nr_pages || pass > 3) {
2149 unsigned long nr_to_scan; 2285 unsigned long nr_to_scan;
2150 2286
2151 zone->lru[l].nr_saved_scan = 0; 2287 reclaim_stat->nr_saved_scan[l] = 0;
2152 nr_to_scan = min(nr_pages, lru_pages); 2288 nr_to_scan = min(nr_pages, lru_pages);
2153 nr_reclaimed += shrink_list(l, nr_to_scan, zone, 2289 nr_reclaimed += shrink_list(l, nr_to_scan, zone,
2154 sc, prio); 2290 sc, prio);
@@ -2185,7 +2321,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2185 2321
2186 current->reclaim_state = &reclaim_state; 2322 current->reclaim_state = &reclaim_state;
2187 2323
2188 lru_pages = global_lru_pages(); 2324 lru_pages = global_reclaimable_pages();
2189 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 2325 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
2190 /* If slab caches are huge, it's better to hit them first */ 2326 /* If slab caches are huge, it's better to hit them first */
2191 while (nr_slab >= lru_pages) { 2327 while (nr_slab >= lru_pages) {
@@ -2227,7 +2363,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2227 2363
2228 reclaim_state.reclaimed_slab = 0; 2364 reclaim_state.reclaimed_slab = 0;
2229 shrink_slab(sc.nr_scanned, sc.gfp_mask, 2365 shrink_slab(sc.nr_scanned, sc.gfp_mask,
2230 global_lru_pages()); 2366 global_reclaimable_pages());
2231 sc.nr_reclaimed += reclaim_state.reclaimed_slab; 2367 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2232 if (sc.nr_reclaimed >= nr_pages) 2368 if (sc.nr_reclaimed >= nr_pages)
2233 goto out; 2369 goto out;
@@ -2244,7 +2380,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2244 if (!sc.nr_reclaimed) { 2380 if (!sc.nr_reclaimed) {
2245 do { 2381 do {
2246 reclaim_state.reclaimed_slab = 0; 2382 reclaim_state.reclaimed_slab = 0;
2247 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); 2383 shrink_slab(nr_pages, sc.gfp_mask,
2384 global_reclaimable_pages());
2248 sc.nr_reclaimed += reclaim_state.reclaimed_slab; 2385 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2249 } while (sc.nr_reclaimed < nr_pages && 2386 } while (sc.nr_reclaimed < nr_pages &&
2250 reclaim_state.reclaimed_slab > 0); 2387 reclaim_state.reclaimed_slab > 0);
@@ -2564,7 +2701,7 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone)
2564retry: 2701retry:
2565 ClearPageUnevictable(page); 2702 ClearPageUnevictable(page);
2566 if (page_evictable(page, NULL)) { 2703 if (page_evictable(page, NULL)) {
2567 enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); 2704 enum lru_list l = page_lru_base_type(page);
2568 2705
2569 __dec_zone_state(zone, NR_UNEVICTABLE); 2706 __dec_zone_state(zone, NR_UNEVICTABLE);
2570 list_move(&page->lru, &zone->lru[l].list); 2707 list_move(&page->lru, &zone->lru[l].list);
@@ -2707,10 +2844,10 @@ static void scan_all_zones_unevictable_pages(void)
2707unsigned long scan_unevictable_pages; 2844unsigned long scan_unevictable_pages;
2708 2845
2709int scan_unevictable_handler(struct ctl_table *table, int write, 2846int scan_unevictable_handler(struct ctl_table *table, int write,
2710 struct file *file, void __user *buffer, 2847 void __user *buffer,
2711 size_t *length, loff_t *ppos) 2848 size_t *length, loff_t *ppos)
2712{ 2849{
2713 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 2850 proc_doulongvec_minmax(table, write, buffer, length, ppos);
2714 2851
2715 if (write && *(unsigned long *)table->data) 2852 if (write && *(unsigned long *)table->data)
2716 scan_all_zones_unevictable_pages(); 2853 scan_all_zones_unevictable_pages();
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 138bed53706e..c81321f9feec 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -639,11 +639,14 @@ static const char * const vmstat_text[] = {
639 "nr_slab_reclaimable", 639 "nr_slab_reclaimable",
640 "nr_slab_unreclaimable", 640 "nr_slab_unreclaimable",
641 "nr_page_table_pages", 641 "nr_page_table_pages",
642 "nr_kernel_stack",
642 "nr_unstable", 643 "nr_unstable",
643 "nr_bounce", 644 "nr_bounce",
644 "nr_vmscan_write", 645 "nr_vmscan_write",
645 "nr_writeback_temp", 646 "nr_writeback_temp",
646 647 "nr_isolated_anon",
648 "nr_isolated_file",
649 "nr_shmem",
647#ifdef CONFIG_NUMA 650#ifdef CONFIG_NUMA
648 "numa_hit", 651 "numa_hit",
649 "numa_miss", 652 "numa_miss",