Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig                32
-rw-r--r--  mm/Kconfig.debug          12
-rw-r--r--  mm/Makefile               13
-rw-r--r--  mm/allocpercpu.c          28
-rw-r--r--  mm/backing-dev.c         427
-rw-r--r--  mm/bootmem.c               6
-rw-r--r--  mm/filemap.c             182
-rw-r--r--  mm/filemap_xip.c           2
-rw-r--r--  mm/hugetlb.c             266
-rw-r--r--  mm/hwpoison-inject.c      41
-rw-r--r--  mm/internal.h             10
-rw-r--r--  mm/kmemleak-test.c         6
-rw-r--r--  mm/kmemleak.c            339
-rw-r--r--  mm/ksm.c                1709
-rw-r--r--  mm/madvise.c              83
-rw-r--r--  mm/memcontrol.c          714
-rw-r--r--  mm/memory-failure.c      832
-rw-r--r--  mm/memory.c              299
-rw-r--r--  mm/memory_hotplug.c       13
-rw-r--r--  mm/mempool.c               7
-rw-r--r--  mm/migrate.c              26
-rw-r--r--  mm/mlock.c               128
-rw-r--r--  mm/mmap.c                 61
-rw-r--r--  mm/mmu_context.c          58
-rw-r--r--  mm/mmu_notifier.c         20
-rw-r--r--  mm/mprotect.c              4
-rw-r--r--  mm/mremap.c               18
-rw-r--r--  mm/nommu.c               130
-rw-r--r--  mm/oom_kill.c             86
-rw-r--r--  mm/page-writeback.c      245
-rw-r--r--  mm/page_alloc.c          328
-rw-r--r--  mm/page_cgroup.c          12
-rw-r--r--  mm/pdflush.c             269
-rw-r--r--  mm/percpu.c             1476
-rw-r--r--  mm/quicklist.c             5
-rw-r--r--  mm/rmap.c                142
-rw-r--r--  mm/shmem.c                44
-rw-r--r--  mm/shmem_acl.c            11
-rw-r--r--  mm/slab.c                  2
-rw-r--r--  mm/slob.c                  5
-rw-r--r--  mm/slub.c                 91
-rw-r--r--  mm/sparse-vmemmap.c        8
-rw-r--r--  mm/sparse.c                9
-rw-r--r--  mm/swap.c                  8
-rw-r--r--  mm/swap_state.c          144
-rw-r--r--  mm/swapfile.c             26
-rw-r--r--  mm/truncate.c            136
-rw-r--r--  mm/vmalloc.c             609
-rw-r--r--  mm/vmscan.c              274
-rw-r--r--  mm/vmstat.c                5
50 files changed, 7310 insertions, 2091 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index fe5f674d7a7d..57963c6063d1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -153,7 +153,7 @@ config MEMORY_HOTREMOVE
153# 153#
154config PAGEFLAGS_EXTENDED 154config PAGEFLAGS_EXTENDED
155 def_bool y 155 def_bool y
156 depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM 156 depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM
157 157
158# Heavily threaded applications may benefit from splitting the mm-wide 158# Heavily threaded applications may benefit from splitting the mm-wide
159# page_table_lock, so that faults on different parts of the user address 159# page_table_lock, so that faults on different parts of the user address
@@ -214,6 +214,20 @@ config HAVE_MLOCKED_PAGE_BIT
214config MMU_NOTIFIER 214config MMU_NOTIFIER
215 bool 215 bool
216 216
217config KSM
218 bool "Enable KSM for page merging"
219 depends on MMU
220 help
221 Enable Kernel Samepage Merging: KSM periodically scans those areas
222 of an application's address space that an app has advised may be
223 mergeable. When it finds pages of identical content, it replaces
224 the many instances by a single resident page with that content, so
225 saving memory until one or another app needs to modify the content.
226 Recommended for use with KVM, or with other duplicative applications.
227 See Documentation/vm/ksm.txt for more information: KSM is inactive
228 until a program has madvised that an area is MADV_MERGEABLE, and
229 root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
230
217config DEFAULT_MMAP_MIN_ADDR 231config DEFAULT_MMAP_MIN_ADDR
218 int "Low address space to protect from user allocation" 232 int "Low address space to protect from user allocation"
219 default 4096 233 default 4096
@@ -232,6 +246,22 @@ config DEFAULT_MMAP_MIN_ADDR
232 This value can be changed after boot using the 246 This value can be changed after boot using the
233 /proc/sys/vm/mmap_min_addr tunable. 247 /proc/sys/vm/mmap_min_addr tunable.
234 248
249config ARCH_SUPPORTS_MEMORY_FAILURE
250 bool
251
252config MEMORY_FAILURE
253 depends on MMU
254 depends on ARCH_SUPPORTS_MEMORY_FAILURE
255 bool "Enable recovery from hardware memory errors"
256 help
257 Enables code to recover from some memory failures on systems
258 with MCA recovery. This allows a system to continue running
259 even when some of its memory has uncorrected errors. This requires
260 special hardware support and typically ECC memory.
261
262config HWPOISON_INJECT
263 tristate "Poison pages injector"
264 depends on MEMORY_FAILURE && DEBUG_KERNEL
235 265
236config NOMMU_INITIAL_TRIM_EXCESS 266config NOMMU_INITIAL_TRIM_EXCESS
237 int "Turn on mmap() excess space trimming before booting" 267 int "Turn on mmap() excess space trimming before booting"
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index aa99fd1f7109..af7cfb43d2f0 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -6,7 +6,7 @@ config DEBUG_PAGEALLOC
6 ---help--- 6 ---help---
7 Unmap pages from the kernel linear mapping after free_pages(). 7 Unmap pages from the kernel linear mapping after free_pages().
8 This results in a large slowdown, but helps to find certain types 8 This results in a large slowdown, but helps to find certain types
9 of memory corruptions. 9 of memory corruption.
10 10
11config WANT_PAGE_DEBUG_FLAGS 11config WANT_PAGE_DEBUG_FLAGS
12 bool 12 bool
@@ -17,11 +17,11 @@ config PAGE_POISONING
17 depends on !HIBERNATION 17 depends on !HIBERNATION
18 select DEBUG_PAGEALLOC 18 select DEBUG_PAGEALLOC
19 select WANT_PAGE_DEBUG_FLAGS 19 select WANT_PAGE_DEBUG_FLAGS
20 help 20 ---help---
21 Fill the pages with poison patterns after free_pages() and verify 21 Fill the pages with poison patterns after free_pages() and verify
22 the patterns before alloc_pages(). This results in a large slowdown, 22 the patterns before alloc_pages(). This results in a large slowdown,
23 but helps to find certain types of memory corruptions. 23 but helps to find certain types of memory corruption.
24 24
25 This option cannot enalbe with hibernation. Otherwise, it will get 25 This option cannot be enabled in combination with hibernation as
26 wrong messages for memory corruption because the free pages are not 26 that would result in incorrect warnings of memory corruption after
27 saved to the suspend image. 27 a resume because free pages are not saved to the suspend image.
diff --git a/mm/Makefile b/mm/Makefile
index 5e0bd6426693..ebf849042ed3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,16 +5,16 @@
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o 8 vmalloc.o pagewalk.o
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o pdflush.o \ 11 maccess.o page_alloc.o page-writeback.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o mm_init.o $(mmu-y) 14 page_isolation.o mm_init.o mmu_context.o \
15 $(mmu-y)
15obj-y += init-mm.o 16obj-y += init-mm.o
16 17
17obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
18obj-$(CONFIG_BOUNCE) += bounce.o 18obj-$(CONFIG_BOUNCE) += bounce.o
19obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 19obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
20obj-$(CONFIG_HAS_DMA) += dmapool.o 20obj-$(CONFIG_HAS_DMA) += dmapool.o
@@ -25,6 +25,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o 25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
26obj-$(CONFIG_SLOB) += slob.o 26obj-$(CONFIG_SLOB) += slob.o
27obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 27obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
28obj-$(CONFIG_KSM) += ksm.o
28obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o 29obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
29obj-$(CONFIG_SLAB) += slab.o 30obj-$(CONFIG_SLAB) += slab.o
30obj-$(CONFIG_SLUB) += slub.o 31obj-$(CONFIG_SLUB) += slub.o
@@ -33,12 +34,14 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
33obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 34obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
34obj-$(CONFIG_FS_XIP) += filemap_xip.o 35obj-$(CONFIG_FS_XIP) += filemap_xip.o
35obj-$(CONFIG_MIGRATION) += migrate.o 36obj-$(CONFIG_MIGRATION) += migrate.o
36ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 37ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
37obj-$(CONFIG_SMP) += percpu.o 38obj-$(CONFIG_SMP) += percpu.o
38else 39else
39obj-$(CONFIG_SMP) += allocpercpu.o 40obj-$(CONFIG_SMP) += allocpercpu.o
40endif 41endif
41obj-$(CONFIG_QUICKLIST) += quicklist.o 42obj-$(CONFIG_QUICKLIST) += quicklist.o
42obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 43obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
44obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
45obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
43obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 46obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
44obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 47obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index dfdee6a47359..df34ceae0c67 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -5,6 +5,8 @@
5 */ 5 */
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/bootmem.h>
9#include <asm/sections.h>
8 10
9#ifndef cache_line_size 11#ifndef cache_line_size
10#define cache_line_size() L1_CACHE_BYTES 12#define cache_line_size() L1_CACHE_BYTES
@@ -147,3 +149,29 @@ void free_percpu(void *__pdata)
147 kfree(__percpu_disguise(__pdata)); 149 kfree(__percpu_disguise(__pdata));
148} 150}
149EXPORT_SYMBOL_GPL(free_percpu); 151EXPORT_SYMBOL_GPL(free_percpu);
152
153/*
154 * Generic percpu area setup.
155 */
156#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
157unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
158
159EXPORT_SYMBOL(__per_cpu_offset);
160
161void __init setup_per_cpu_areas(void)
162{
163 unsigned long size, i;
164 char *ptr;
165 unsigned long nr_possible_cpus = num_possible_cpus();
166
167 /* Copy section for each CPU (we discard the original) */
168 size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
169 ptr = alloc_bootmem_pages(size * nr_possible_cpus);
170
171 for_each_possible_cpu(i) {
172 __per_cpu_offset[i] = ptr - __per_cpu_start;
173 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
174 ptr += size;
175 }
176}
177#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c86edd244294..5a37e2055717 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,8 +1,11 @@
1 1
2#include <linux/wait.h> 2#include <linux/wait.h>
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/kthread.h>
5#include <linux/freezer.h>
4#include <linux/fs.h> 6#include <linux/fs.h>
5#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/mm.h>
6#include <linux/sched.h> 9#include <linux/sched.h>
7#include <linux/module.h> 10#include <linux/module.h>
8#include <linux/writeback.h> 11#include <linux/writeback.h>
@@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
14EXPORT_SYMBOL(default_unplug_io_fn); 17EXPORT_SYMBOL(default_unplug_io_fn);
15 18
16struct backing_dev_info default_backing_dev_info = { 19struct backing_dev_info default_backing_dev_info = {
20 .name = "default",
17 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, 21 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
18 .state = 0, 22 .state = 0,
19 .capabilities = BDI_CAP_MAP_COPY, 23 .capabilities = BDI_CAP_MAP_COPY,
@@ -23,6 +27,24 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info);
23 27
24static struct class *bdi_class; 28static struct class *bdi_class;
25 29
30/*
31 * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
32 * reader side protection for bdi_pending_list. bdi_list has RCU reader side
33 * locking.
34 */
35DEFINE_SPINLOCK(bdi_lock);
36LIST_HEAD(bdi_list);
37LIST_HEAD(bdi_pending_list);
38
39static struct task_struct *sync_supers_tsk;
40static struct timer_list sync_supers_timer;
41
42static int bdi_sync_supers(void *);
43static void sync_supers_timer_fn(unsigned long);
44static void arm_supers_timer(void);
45
46static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
47
26#ifdef CONFIG_DEBUG_FS 48#ifdef CONFIG_DEBUG_FS
27#include <linux/debugfs.h> 49#include <linux/debugfs.h>
28#include <linux/seq_file.h> 50#include <linux/seq_file.h>
@@ -37,9 +59,29 @@ static void bdi_debug_init(void)
37static int bdi_debug_stats_show(struct seq_file *m, void *v) 59static int bdi_debug_stats_show(struct seq_file *m, void *v)
38{ 60{
39 struct backing_dev_info *bdi = m->private; 61 struct backing_dev_info *bdi = m->private;
62 struct bdi_writeback *wb;
40 unsigned long background_thresh; 63 unsigned long background_thresh;
41 unsigned long dirty_thresh; 64 unsigned long dirty_thresh;
42 unsigned long bdi_thresh; 65 unsigned long bdi_thresh;
66 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
67 struct inode *inode;
68
69 /*
70 * inode lock is enough here, the bdi->wb_list is protected by
71 * RCU on the reader side
72 */
73 nr_wb = nr_dirty = nr_io = nr_more_io = 0;
74 spin_lock(&inode_lock);
75 list_for_each_entry(wb, &bdi->wb_list, list) {
76 nr_wb++;
77 list_for_each_entry(inode, &wb->b_dirty, i_list)
78 nr_dirty++;
79 list_for_each_entry(inode, &wb->b_io, i_list)
80 nr_io++;
81 list_for_each_entry(inode, &wb->b_more_io, i_list)
82 nr_more_io++;
83 }
84 spin_unlock(&inode_lock);
43 85
44 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); 86 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
45 87
@@ -49,12 +91,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
49 "BdiReclaimable: %8lu kB\n" 91 "BdiReclaimable: %8lu kB\n"
50 "BdiDirtyThresh: %8lu kB\n" 92 "BdiDirtyThresh: %8lu kB\n"
51 "DirtyThresh: %8lu kB\n" 93 "DirtyThresh: %8lu kB\n"
52 "BackgroundThresh: %8lu kB\n", 94 "BackgroundThresh: %8lu kB\n"
95 "WritebackThreads: %8lu\n"
96 "b_dirty: %8lu\n"
97 "b_io: %8lu\n"
98 "b_more_io: %8lu\n"
99 "bdi_list: %8u\n"
100 "state: %8lx\n"
101 "wb_mask: %8lx\n"
102 "wb_list: %8u\n"
103 "wb_cnt: %8u\n",
53 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 104 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
54 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 105 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
55 K(bdi_thresh), 106 K(bdi_thresh), K(dirty_thresh),
56 K(dirty_thresh), 107 K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
57 K(background_thresh)); 108 !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
109 !list_empty(&bdi->wb_list), bdi->wb_cnt);
58#undef K 110#undef K
59 111
60 return 0; 112 return 0;
@@ -185,6 +237,13 @@ static int __init default_bdi_init(void)
185{ 237{
186 int err; 238 int err;
187 239
240 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
241 BUG_ON(IS_ERR(sync_supers_tsk));
242
243 init_timer(&sync_supers_timer);
244 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
245 arm_supers_timer();
246
188 err = bdi_init(&default_backing_dev_info); 247 err = bdi_init(&default_backing_dev_info);
189 if (!err) 248 if (!err)
190 bdi_register(&default_backing_dev_info, NULL, "default"); 249 bdi_register(&default_backing_dev_info, NULL, "default");
@@ -193,6 +252,279 @@ static int __init default_bdi_init(void)
193} 252}
194subsys_initcall(default_bdi_init); 253subsys_initcall(default_bdi_init);
195 254
255static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
256{
257 memset(wb, 0, sizeof(*wb));
258
259 wb->bdi = bdi;
260 wb->last_old_flush = jiffies;
261 INIT_LIST_HEAD(&wb->b_dirty);
262 INIT_LIST_HEAD(&wb->b_io);
263 INIT_LIST_HEAD(&wb->b_more_io);
264}
265
266static void bdi_task_init(struct backing_dev_info *bdi,
267 struct bdi_writeback *wb)
268{
269 struct task_struct *tsk = current;
270
271 spin_lock(&bdi->wb_lock);
272 list_add_tail_rcu(&wb->list, &bdi->wb_list);
273 spin_unlock(&bdi->wb_lock);
274
275 tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
276 set_freezable();
277
278 /*
279 * Our parent may run at a different priority, just set us to normal
280 */
281 set_user_nice(tsk, 0);
282}
283
284static int bdi_start_fn(void *ptr)
285{
286 struct bdi_writeback *wb = ptr;
287 struct backing_dev_info *bdi = wb->bdi;
288 int ret;
289
290 /*
291 * Add us to the active bdi_list
292 */
293 spin_lock_bh(&bdi_lock);
294 list_add_rcu(&bdi->bdi_list, &bdi_list);
295 spin_unlock_bh(&bdi_lock);
296
297 bdi_task_init(bdi, wb);
298
299 /*
300 * Clear pending bit and wakeup anybody waiting to tear us down
301 */
302 clear_bit(BDI_pending, &bdi->state);
303 smp_mb__after_clear_bit();
304 wake_up_bit(&bdi->state, BDI_pending);
305
306 ret = bdi_writeback_task(wb);
307
308 /*
309 * Remove us from the list
310 */
311 spin_lock(&bdi->wb_lock);
312 list_del_rcu(&wb->list);
313 spin_unlock(&bdi->wb_lock);
314
315 /*
316 * Flush any work that raced with us exiting. No new work
317 * will be added, since this bdi isn't discoverable anymore.
318 */
319 if (!list_empty(&bdi->work_list))
320 wb_do_writeback(wb, 1);
321
322 wb->task = NULL;
323 return ret;
324}
325
326int bdi_has_dirty_io(struct backing_dev_info *bdi)
327{
328 return wb_has_dirty_io(&bdi->wb);
329}
330
331static void bdi_flush_io(struct backing_dev_info *bdi)
332{
333 struct writeback_control wbc = {
334 .bdi = bdi,
335 .sync_mode = WB_SYNC_NONE,
336 .older_than_this = NULL,
337 .range_cyclic = 1,
338 .nr_to_write = 1024,
339 };
340
341 writeback_inodes_wbc(&wbc);
342}
343
344/*
345 * kupdated() used to do this. We cannot do it from the bdi_forker_task()
346 * or we risk deadlocking on ->s_umount. The longer term solution would be
347 * to implement sync_supers_bdi() or similar and simply do it from the
348 * bdi writeback tasks individually.
349 */
350static int bdi_sync_supers(void *unused)
351{
352 set_user_nice(current, 0);
353
354 while (!kthread_should_stop()) {
355 set_current_state(TASK_INTERRUPTIBLE);
356 schedule();
357
358 /*
359 * Do this periodically, like kupdated() did before.
360 */
361 sync_supers();
362 }
363
364 return 0;
365}
366
367static void arm_supers_timer(void)
368{
369 unsigned long next;
370
371 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
372 mod_timer(&sync_supers_timer, round_jiffies_up(next));
373}
374
375static void sync_supers_timer_fn(unsigned long unused)
376{
377 wake_up_process(sync_supers_tsk);
378 arm_supers_timer();
379}
380
381static int bdi_forker_task(void *ptr)
382{
383 struct bdi_writeback *me = ptr;
384
385 bdi_task_init(me->bdi, me);
386
387 for (;;) {
388 struct backing_dev_info *bdi, *tmp;
389 struct bdi_writeback *wb;
390
391 /*
392 * Temporary measure, we want to make sure we don't see
393 * dirty data on the default backing_dev_info
394 */
395 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
396 wb_do_writeback(me, 0);
397
398 spin_lock_bh(&bdi_lock);
399
400 /*
401 * Check if any existing bdi's have dirty data without
402 * a thread registered. If so, set that up.
403 */
404 list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
405 if (bdi->wb.task)
406 continue;
407 if (list_empty(&bdi->work_list) &&
408 !bdi_has_dirty_io(bdi))
409 continue;
410
411 bdi_add_default_flusher_task(bdi);
412 }
413
414 set_current_state(TASK_INTERRUPTIBLE);
415
416 if (list_empty(&bdi_pending_list)) {
417 unsigned long wait;
418
419 spin_unlock_bh(&bdi_lock);
420 wait = msecs_to_jiffies(dirty_writeback_interval * 10);
421 schedule_timeout(wait);
422 try_to_freeze();
423 continue;
424 }
425
426 __set_current_state(TASK_RUNNING);
427
428 /*
429 * This is our real job - check for pending entries in
430 * bdi_pending_list, and create the tasks that got added
431 */
432 bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
433 bdi_list);
434 list_del_init(&bdi->bdi_list);
435 spin_unlock_bh(&bdi_lock);
436
437 wb = &bdi->wb;
438 wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
439 dev_name(bdi->dev));
440 /*
441 * If task creation fails, then readd the bdi to
442 * the pending list and force writeout of the bdi
443 * from this forker thread. That will free some memory
444 * and we can try again.
445 */
446 if (IS_ERR(wb->task)) {
447 wb->task = NULL;
448
449 /*
450 * Add this 'bdi' to the back, so we get
451 * a chance to flush other bdi's to free
452 * memory.
453 */
454 spin_lock_bh(&bdi_lock);
455 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
456 spin_unlock_bh(&bdi_lock);
457
458 bdi_flush_io(bdi);
459 }
460 }
461
462 return 0;
463}
464
465static void bdi_add_to_pending(struct rcu_head *head)
466{
467 struct backing_dev_info *bdi;
468
469 bdi = container_of(head, struct backing_dev_info, rcu_head);
470 INIT_LIST_HEAD(&bdi->bdi_list);
471
472 spin_lock(&bdi_lock);
473 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
474 spin_unlock(&bdi_lock);
475
476 /*
477 * We are now on the pending list, wake up bdi_forker_task()
478 * to finish the job and add us back to the active bdi_list
479 */
480 wake_up_process(default_backing_dev_info.wb.task);
481}
482
483/*
484 * Add the default flusher task that gets created for any bdi
485 * that has dirty data pending writeout
486 */
487void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
488{
489 if (!bdi_cap_writeback_dirty(bdi))
490 return;
491
492 if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
493 printk(KERN_ERR "bdi %p/%s is not registered!\n",
494 bdi, bdi->name);
495 return;
496 }
497
498 /*
499 * Check with the helper whether to proceed adding a task. Will only
500 * abort if we two or more simultanous calls to
501 * bdi_add_default_flusher_task() occured, further additions will block
502 * waiting for previous additions to finish.
503 */
504 if (!test_and_set_bit(BDI_pending, &bdi->state)) {
505 list_del_rcu(&bdi->bdi_list);
506
507 /*
508 * We must wait for the current RCU period to end before
509 * moving to the pending list. So schedule that operation
510 * from an RCU callback.
511 */
512 call_rcu(&bdi->rcu_head, bdi_add_to_pending);
513 }
514}
515
516/*
517 * Remove bdi from bdi_list, and ensure that it is no longer visible
518 */
519static void bdi_remove_from_list(struct backing_dev_info *bdi)
520{
521 spin_lock_bh(&bdi_lock);
522 list_del_rcu(&bdi->bdi_list);
523 spin_unlock_bh(&bdi_lock);
524
525 synchronize_rcu();
526}
527
196int bdi_register(struct backing_dev_info *bdi, struct device *parent, 528int bdi_register(struct backing_dev_info *bdi, struct device *parent,
197 const char *fmt, ...) 529 const char *fmt, ...)
198{ 530{
@@ -211,9 +543,33 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
211 goto exit; 543 goto exit;
212 } 544 }
213 545
546 spin_lock_bh(&bdi_lock);
547 list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
548 spin_unlock_bh(&bdi_lock);
549
214 bdi->dev = dev; 550 bdi->dev = dev;
215 bdi_debug_register(bdi, dev_name(dev));
216 551
552 /*
553 * Just start the forker thread for our default backing_dev_info,
554 * and add other bdi's to the list. They will get a thread created
555 * on-demand when they need it.
556 */
557 if (bdi_cap_flush_forker(bdi)) {
558 struct bdi_writeback *wb = &bdi->wb;
559
560 wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
561 dev_name(dev));
562 if (IS_ERR(wb->task)) {
563 wb->task = NULL;
564 ret = -ENOMEM;
565
566 bdi_remove_from_list(bdi);
567 goto exit;
568 }
569 }
570
571 bdi_debug_register(bdi, dev_name(dev));
572 set_bit(BDI_registered, &bdi->state);
217exit: 573exit:
218 return ret; 574 return ret;
219} 575}
@@ -225,9 +581,40 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
225} 581}
226EXPORT_SYMBOL(bdi_register_dev); 582EXPORT_SYMBOL(bdi_register_dev);
227 583
584/*
585 * Remove bdi from the global list and shutdown any threads we have running
586 */
587static void bdi_wb_shutdown(struct backing_dev_info *bdi)
588{
589 struct bdi_writeback *wb;
590
591 if (!bdi_cap_writeback_dirty(bdi))
592 return;
593
594 /*
595 * If setup is pending, wait for that to complete first
596 */
597 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
598 TASK_UNINTERRUPTIBLE);
599
600 /*
601 * Make sure nobody finds us on the bdi_list anymore
602 */
603 bdi_remove_from_list(bdi);
604
605 /*
606 * Finally, kill the kernel threads. We don't need to be RCU
607 * safe anymore, since the bdi is gone from visibility.
608 */
609 list_for_each_entry(wb, &bdi->wb_list, list)
610 kthread_stop(wb->task);
611}
612
228void bdi_unregister(struct backing_dev_info *bdi) 613void bdi_unregister(struct backing_dev_info *bdi)
229{ 614{
230 if (bdi->dev) { 615 if (bdi->dev) {
616 if (!bdi_cap_flush_forker(bdi))
617 bdi_wb_shutdown(bdi);
231 bdi_debug_unregister(bdi); 618 bdi_debug_unregister(bdi);
232 device_unregister(bdi->dev); 619 device_unregister(bdi->dev);
233 bdi->dev = NULL; 620 bdi->dev = NULL;
@@ -237,14 +624,26 @@ EXPORT_SYMBOL(bdi_unregister);
237 624
238int bdi_init(struct backing_dev_info *bdi) 625int bdi_init(struct backing_dev_info *bdi)
239{ 626{
240 int i; 627 int i, err;
241 int err;
242 628
243 bdi->dev = NULL; 629 bdi->dev = NULL;
244 630
245 bdi->min_ratio = 0; 631 bdi->min_ratio = 0;
246 bdi->max_ratio = 100; 632 bdi->max_ratio = 100;
247 bdi->max_prop_frac = PROP_FRAC_BASE; 633 bdi->max_prop_frac = PROP_FRAC_BASE;
634 spin_lock_init(&bdi->wb_lock);
635 INIT_RCU_HEAD(&bdi->rcu_head);
636 INIT_LIST_HEAD(&bdi->bdi_list);
637 INIT_LIST_HEAD(&bdi->wb_list);
638 INIT_LIST_HEAD(&bdi->work_list);
639
640 bdi_wb_init(&bdi->wb, bdi);
641
642 /*
643 * Just one thread support for now, hard code mask and count
644 */
645 bdi->wb_mask = 1;
646 bdi->wb_cnt = 1;
248 647
249 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { 648 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
250 err = percpu_counter_init(&bdi->bdi_stat[i], 0); 649 err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -269,6 +668,20 @@ void bdi_destroy(struct backing_dev_info *bdi)
269{ 668{
270 int i; 669 int i;
271 670
671 /*
672 * Splice our entries to the default_backing_dev_info, if this
673 * bdi disappears
674 */
675 if (bdi_has_dirty_io(bdi)) {
676 struct bdi_writeback *dst = &default_backing_dev_info.wb;
677
678 spin_lock(&inode_lock);
679 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
680 list_splice(&bdi->wb.b_io, &dst->b_io);
681 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
682 spin_unlock(&inode_lock);
683 }
684
272 bdi_unregister(bdi); 685 bdi_unregister(bdi);
273 686
274 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 687 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 701740c9e81b..555d5d2731c6 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -521,7 +521,11 @@ find_block:
521 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + 521 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
522 start_off); 522 start_off);
523 memset(region, 0, size); 523 memset(region, 0, size);
524 kmemleak_alloc(region, size, 1, 0); 524 /*
525 * The min_count is set to 0 so that bootmem allocated blocks
526 * are never reported as leaks.
527 */
528 kmemleak_alloc(region, size, 0, 0);
525 return region; 529 return region;
526 } 530 }
527 531
diff --git a/mm/filemap.c b/mm/filemap.c
index ccea3b665c12..ef169f37156d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -39,11 +39,10 @@
39/* 39/*
40 * FIXME: remove all knowledge of the buffer layer from the core VM 40 * FIXME: remove all knowledge of the buffer layer from the core VM
41 */ 41 */
42#include <linux/buffer_head.h> /* for generic_osync_inode */ 42#include <linux/buffer_head.h> /* for try_to_free_buffers */
43 43
44#include <asm/mman.h> 44#include <asm/mman.h>
45 45
46
47/* 46/*
48 * Shared mappings implemented 30.11.1994. It's not fully working yet, 47 * Shared mappings implemented 30.11.1994. It's not fully working yet,
49 * though. 48 * though.
@@ -59,7 +58,7 @@
59/* 58/*
60 * Lock ordering: 59 * Lock ordering:
61 * 60 *
62 * ->i_mmap_lock (vmtruncate) 61 * ->i_mmap_lock (truncate_pagecache)
63 * ->private_lock (__free_pte->__set_page_dirty_buffers) 62 * ->private_lock (__free_pte->__set_page_dirty_buffers)
64 * ->swap_lock (exclusive_swap_page, others) 63 * ->swap_lock (exclusive_swap_page, others)
65 * ->mapping->tree_lock 64 * ->mapping->tree_lock
@@ -105,6 +104,10 @@
105 * 104 *
106 * ->task->proc_lock 105 * ->task->proc_lock
107 * ->dcache_lock (proc_pid_lookup) 106 * ->dcache_lock (proc_pid_lookup)
107 *
108 * (code doesn't rely on that order, so you could switch it around)
109 * ->tasklist_lock (memory_failure, collect_procs_ao)
110 * ->i_mmap_lock
108 */ 111 */
109 112
110/* 113/*
@@ -120,6 +123,8 @@ void __remove_from_page_cache(struct page *page)
120 page->mapping = NULL; 123 page->mapping = NULL;
121 mapping->nrpages--; 124 mapping->nrpages--;
122 __dec_zone_page_state(page, NR_FILE_PAGES); 125 __dec_zone_page_state(page, NR_FILE_PAGES);
126 if (PageSwapBacked(page))
127 __dec_zone_page_state(page, NR_SHMEM);
123 BUG_ON(page_mapped(page)); 128 BUG_ON(page_mapped(page));
124 129
125 /* 130 /*
@@ -307,68 +312,24 @@ int wait_on_page_writeback_range(struct address_space *mapping,
307} 312}
308 313
309/** 314/**
310 * sync_page_range - write and wait on all pages in the passed range 315 * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range
311 * @inode: target inode 316 * @mapping: address space structure to wait for
312 * @mapping: target address_space 317 * @start: offset in bytes where the range starts
313 * @pos: beginning offset in pages to write 318 * @end: offset in bytes where the range ends (inclusive)
314 * @count: number of bytes to write
315 *
316 * Write and wait upon all the pages in the passed range. This is a "data
317 * integrity" operation. It waits upon in-flight writeout before starting and
318 * waiting upon new writeout. If there was an IO error, return it.
319 * 319 *
320 * We need to re-take i_mutex during the generic_osync_inode list walk because 320 * Walk the list of under-writeback pages of the given address space
321 * it is otherwise livelockable. 321 * in the given range and wait for all of them.
322 */
323int sync_page_range(struct inode *inode, struct address_space *mapping,
324 loff_t pos, loff_t count)
325{
326 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
327 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
328 int ret;
329
330 if (!mapping_cap_writeback_dirty(mapping) || !count)
331 return 0;
332 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
333 if (ret == 0) {
334 mutex_lock(&inode->i_mutex);
335 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
336 mutex_unlock(&inode->i_mutex);
337 }
338 if (ret == 0)
339 ret = wait_on_page_writeback_range(mapping, start, end);
340 return ret;
341}
342EXPORT_SYMBOL(sync_page_range);
343
344/**
345 * sync_page_range_nolock - write & wait on all pages in the passed range without locking
346 * @inode: target inode
347 * @mapping: target address_space
348 * @pos: beginning offset in pages to write
349 * @count: number of bytes to write
350 * 322 *
351 * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea 323 * This is just a simple wrapper so that callers don't have to convert offsets
352 * as it forces O_SYNC writers to different parts of the same file 324 * to page indexes themselves
353 * to be serialised right until io completion.
354 */ 325 */
355int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, 326int filemap_fdatawait_range(struct address_space *mapping, loff_t start,
356 loff_t pos, loff_t count) 327 loff_t end)
357{ 328{
358 pgoff_t start = pos >> PAGE_CACHE_SHIFT; 329 return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT,
359 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 330 end >> PAGE_CACHE_SHIFT);
360 int ret;
361
362 if (!mapping_cap_writeback_dirty(mapping) || !count)
363 return 0;
364 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
365 if (ret == 0)
366 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
367 if (ret == 0)
368 ret = wait_on_page_writeback_range(mapping, start, end);
369 return ret;
370} 331}
371EXPORT_SYMBOL(sync_page_range_nolock); 332EXPORT_SYMBOL(filemap_fdatawait_range);
372 333
373/** 334/**
374 * filemap_fdatawait - wait for all under-writeback pages to complete 335 * filemap_fdatawait - wait for all under-writeback pages to complete
@@ -476,6 +437,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
476 if (likely(!error)) { 437 if (likely(!error)) {
477 mapping->nrpages++; 438 mapping->nrpages++;
478 __inc_zone_page_state(page, NR_FILE_PAGES); 439 __inc_zone_page_state(page, NR_FILE_PAGES);
440 if (PageSwapBacked(page))
441 __inc_zone_page_state(page, NR_SHMEM);
479 spin_unlock_irq(&mapping->tree_lock); 442 spin_unlock_irq(&mapping->tree_lock);
480 } else { 443 } else {
481 page->mapping = NULL; 444 page->mapping = NULL;
@@ -1648,7 +1611,7 @@ page_not_uptodate:
1648} 1611}
1649EXPORT_SYMBOL(filemap_fault); 1612EXPORT_SYMBOL(filemap_fault);
1650 1613
1651struct vm_operations_struct generic_file_vm_ops = { 1614const struct vm_operations_struct generic_file_vm_ops = {
1652 .fault = filemap_fault, 1615 .fault = filemap_fault,
1653}; 1616};
1654 1617
@@ -2167,20 +2130,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2167 } 2130 }
2168 *ppos = end; 2131 *ppos = end;
2169 } 2132 }
2170
2171 /*
2172 * Sync the fs metadata but not the minor inode changes and
2173 * of course not the data as we did direct DMA for the IO.
2174 * i_mutex is held, which protects generic_osync_inode() from
2175 * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
2176 */
2177out: 2133out:
2178 if ((written >= 0 || written == -EIOCBQUEUED) &&
2179 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2180 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
2181 if (err < 0)
2182 written = err;
2183 }
2184 return written; 2134 return written;
2185} 2135}
2186EXPORT_SYMBOL(generic_file_direct_write); 2136EXPORT_SYMBOL(generic_file_direct_write);
@@ -2312,8 +2262,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2312{ 2262{
2313 struct file *file = iocb->ki_filp; 2263 struct file *file = iocb->ki_filp;
2314 struct address_space *mapping = file->f_mapping; 2264 struct address_space *mapping = file->f_mapping;
2315 const struct address_space_operations *a_ops = mapping->a_ops;
2316 struct inode *inode = mapping->host;
2317 ssize_t status; 2265 ssize_t status;
2318 struct iov_iter i; 2266 struct iov_iter i;
2319 2267
@@ -2323,16 +2271,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2323 if (likely(status >= 0)) { 2271 if (likely(status >= 0)) {
2324 written += status; 2272 written += status;
2325 *ppos = pos + status; 2273 *ppos = pos + status;
2326
2327 /*
2328 * For now, when the user asks for O_SYNC, we'll actually give
2329 * O_DSYNC
2330 */
2331 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2332 if (!a_ops->writepage || !is_sync_kiocb(iocb))
2333 status = generic_osync_inode(inode, mapping,
2334 OSYNC_METADATA|OSYNC_DATA);
2335 }
2336 } 2274 }
2337 2275
2338 /* 2276 /*
@@ -2348,9 +2286,27 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2348} 2286}
2349EXPORT_SYMBOL(generic_file_buffered_write); 2287EXPORT_SYMBOL(generic_file_buffered_write);
2350 2288
2351static ssize_t 2289/**
2352__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, 2290 * __generic_file_aio_write - write data to a file
2353 unsigned long nr_segs, loff_t *ppos) 2291 * @iocb: IO state structure (file, offset, etc.)
2292 * @iov: vector with data to write
2293 * @nr_segs: number of segments in the vector
2294 * @ppos: position where to write
2295 *
2296 * This function does all the work needed for actually writing data to a
2297 * file. It does all basic checks, removes SUID from the file, updates
2298 * modification times and calls proper subroutines depending on whether we
2299 * do direct IO or a standard buffered write.
2300 *
2301 * It expects i_mutex to be grabbed unless we work on a block device or similar
2302 * object which does not need locking at all.
2303 *
2304 * This function does *not* take care of syncing data in case of O_SYNC write.
2305 * A caller has to handle it. This is mainly due to the fact that we want to
2306 * avoid syncing under i_mutex.
2307 */
2308ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2309 unsigned long nr_segs, loff_t *ppos)
2354{ 2310{
2355 struct file *file = iocb->ki_filp; 2311 struct file *file = iocb->ki_filp;
2356 struct address_space * mapping = file->f_mapping; 2312 struct address_space * mapping = file->f_mapping;
@@ -2447,51 +2403,37 @@ out:
2447 current->backing_dev_info = NULL; 2403 current->backing_dev_info = NULL;
2448 return written ? written : err; 2404 return written ? written : err;
2449} 2405}
2406EXPORT_SYMBOL(__generic_file_aio_write);
2450 2407
2451ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, 2408/**
2452 const struct iovec *iov, unsigned long nr_segs, loff_t pos) 2409 * generic_file_aio_write - write data to a file
2453{ 2410 * @iocb: IO state structure
2454 struct file *file = iocb->ki_filp; 2411 * @iov: vector with data to write
2455 struct address_space *mapping = file->f_mapping; 2412 * @nr_segs: number of segments in the vector
2456 struct inode *inode = mapping->host; 2413 * @pos: position in file where to write
2457 ssize_t ret; 2414 *
2458 2415 * This is a wrapper around __generic_file_aio_write() to be used by most
2459 BUG_ON(iocb->ki_pos != pos); 2416 * filesystems. It takes care of syncing the file in case of O_SYNC file
2460 2417 * and acquires i_mutex as needed.
2461 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, 2418 */
2462 &iocb->ki_pos);
2463
2464 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2465 ssize_t err;
2466
2467 err = sync_page_range_nolock(inode, mapping, pos, ret);
2468 if (err < 0)
2469 ret = err;
2470 }
2471 return ret;
2472}
2473EXPORT_SYMBOL(generic_file_aio_write_nolock);
2474
2475ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 2419ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2476 unsigned long nr_segs, loff_t pos) 2420 unsigned long nr_segs, loff_t pos)
2477{ 2421{
2478 struct file *file = iocb->ki_filp; 2422 struct file *file = iocb->ki_filp;
2479 struct address_space *mapping = file->f_mapping; 2423 struct inode *inode = file->f_mapping->host;
2480 struct inode *inode = mapping->host;
2481 ssize_t ret; 2424 ssize_t ret;
2482 2425
2483 BUG_ON(iocb->ki_pos != pos); 2426 BUG_ON(iocb->ki_pos != pos);
2484 2427
2485 mutex_lock(&inode->i_mutex); 2428 mutex_lock(&inode->i_mutex);
2486 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, 2429 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2487 &iocb->ki_pos);
2488 mutex_unlock(&inode->i_mutex); 2430 mutex_unlock(&inode->i_mutex);
2489 2431
2490 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2432 if (ret > 0 || ret == -EIOCBQUEUED) {
2491 ssize_t err; 2433 ssize_t err;
2492 2434
2493 err = sync_page_range(inode, mapping, pos, ret); 2435 err = generic_write_sync(file, pos, ret);
2494 if (err < 0) 2436 if (err < 0 && ret > 0)
2495 ret = err; 2437 ret = err;
2496 } 2438 }
2497 return ret; 2439 return ret;
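With sync_page_range()/sync_page_range_nolock() and the generic_osync_inode() calls removed above, O_SYNC handling moves into generic_file_aio_write(), which now calls generic_write_sync() after the write. For a caller that still wants "write out and wait" over a byte range, one option is to pair the existing filemap_fdatawrite_range() with the newly exported filemap_fdatawait_range(); a hedged sketch follows (the helper name is hypothetical, both arguments are inclusive byte offsets):

/*
 * Hypothetical helper, not part of the patch: write back and then wait on
 * a byte range, roughly what sync_page_range() used to do minus the
 * generic_osync_inode() metadata step.
 */
static int sync_range_sketch(struct address_space *mapping,
			     loff_t pos, loff_t count)
{
	int err;

	if (!mapping_cap_writeback_dirty(mapping) || !count)
		return 0;

	err = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
	if (err == 0)
		err = filemap_fdatawait_range(mapping, pos, pos + count - 1);
	return err;
}
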
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 427dfe3ce78c..1888b2d71bb8 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -296,7 +296,7 @@ out:
296 } 296 }
297} 297}
298 298
299static struct vm_operations_struct xip_file_vm_ops = { 299static const struct vm_operations_struct xip_file_vm_ops = {
300 .fault = xip_file_fault, 300 .fault = xip_file_fault,
301}; 301};
302 302
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cafdcee154e8..5d7601b02874 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -234,6 +234,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
234 234
235 return 1UL << (hstate->order + PAGE_SHIFT); 235 return 1UL << (hstate->order + PAGE_SHIFT);
236} 236}
237EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
237 238
238/* 239/*
239 * Return the page size being used by the MMU to back a VMA. In the majority 240 * Return the page size being used by the MMU to back a VMA. In the majority
@@ -455,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
455 h->free_huge_pages_node[nid]++; 456 h->free_huge_pages_node[nid]++;
456} 457}
457 458
458static struct page *dequeue_huge_page(struct hstate *h)
459{
460 int nid;
461 struct page *page = NULL;
462
463 for (nid = 0; nid < MAX_NUMNODES; ++nid) {
464 if (!list_empty(&h->hugepage_freelists[nid])) {
465 page = list_entry(h->hugepage_freelists[nid].next,
466 struct page, lru);
467 list_del(&page->lru);
468 h->free_huge_pages--;
469 h->free_huge_pages_node[nid]--;
470 break;
471 }
472 }
473 return page;
474}
475
476static struct page *dequeue_huge_page_vma(struct hstate *h, 459static struct page *dequeue_huge_page_vma(struct hstate *h,
477 struct vm_area_struct *vma, 460 struct vm_area_struct *vma,
478 unsigned long address, int avoid_reserve) 461 unsigned long address, int avoid_reserve)
@@ -640,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
640 623
641/* 624/*
642 * Use a helper variable to find the next node and then 625 * Use a helper variable to find the next node and then
643 * copy it back to hugetlb_next_nid afterwards: 626 * copy it back to next_nid_to_alloc afterwards:
644 * otherwise there's a window in which a racer might 627 * otherwise there's a window in which a racer might
645 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. 628 * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
646 * But we don't need to use a spin_lock here: it really 629 * But we don't need to use a spin_lock here: it really
@@ -649,13 +632,13 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
649 * if we just successfully allocated a hugepage so that 632 * if we just successfully allocated a hugepage so that
650 * the next caller gets hugepages on the next node. 633 * the next caller gets hugepages on the next node.
651 */ 634 */
652static int hstate_next_node(struct hstate *h) 635static int hstate_next_node_to_alloc(struct hstate *h)
653{ 636{
654 int next_nid; 637 int next_nid;
655 next_nid = next_node(h->hugetlb_next_nid, node_online_map); 638 next_nid = next_node(h->next_nid_to_alloc, node_online_map);
656 if (next_nid == MAX_NUMNODES) 639 if (next_nid == MAX_NUMNODES)
657 next_nid = first_node(node_online_map); 640 next_nid = first_node(node_online_map);
658 h->hugetlb_next_nid = next_nid; 641 h->next_nid_to_alloc = next_nid;
659 return next_nid; 642 return next_nid;
660} 643}
661 644
@@ -666,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h)
666 int next_nid; 649 int next_nid;
667 int ret = 0; 650 int ret = 0;
668 651
669 start_nid = h->hugetlb_next_nid; 652 start_nid = h->next_nid_to_alloc;
653 next_nid = start_nid;
670 654
671 do { 655 do {
672 page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); 656 page = alloc_fresh_huge_page_node(h, next_nid);
673 if (page) 657 if (page)
674 ret = 1; 658 ret = 1;
675 next_nid = hstate_next_node(h); 659 next_nid = hstate_next_node_to_alloc(h);
676 } while (!page && h->hugetlb_next_nid != start_nid); 660 } while (!page && next_nid != start_nid);
677 661
678 if (ret) 662 if (ret)
679 count_vm_event(HTLB_BUDDY_PGALLOC); 663 count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -683,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h)
683 return ret; 667 return ret;
684} 668}
685 669
670/*
671 * helper for free_pool_huge_page() - find next node
672 * from which to free a huge page
673 */
674static int hstate_next_node_to_free(struct hstate *h)
675{
676 int next_nid;
677 next_nid = next_node(h->next_nid_to_free, node_online_map);
678 if (next_nid == MAX_NUMNODES)
679 next_nid = first_node(node_online_map);
680 h->next_nid_to_free = next_nid;
681 return next_nid;
682}
683
684/*
685 * Free huge page from pool from next node to free.
686 * Attempt to keep persistent huge pages more or less
687 * balanced over allowed nodes.
688 * Called with hugetlb_lock locked.
689 */
690static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
691{
692 int start_nid;
693 int next_nid;
694 int ret = 0;
695
696 start_nid = h->next_nid_to_free;
697 next_nid = start_nid;
698
699 do {
700 /*
701 * If we're returning unused surplus pages, only examine
702 * nodes with surplus pages.
703 */
704 if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
705 !list_empty(&h->hugepage_freelists[next_nid])) {
706 struct page *page =
707 list_entry(h->hugepage_freelists[next_nid].next,
708 struct page, lru);
709 list_del(&page->lru);
710 h->free_huge_pages--;
711 h->free_huge_pages_node[next_nid]--;
712 if (acct_surplus) {
713 h->surplus_huge_pages--;
714 h->surplus_huge_pages_node[next_nid]--;
715 }
716 update_and_free_page(h, page);
717 ret = 1;
718 }
719 next_nid = hstate_next_node_to_free(h);
720 } while (!ret && next_nid != start_nid);
721
722 return ret;
723}
724
686static struct page *alloc_buddy_huge_page(struct hstate *h, 725static struct page *alloc_buddy_huge_page(struct hstate *h,
687 struct vm_area_struct *vma, unsigned long address) 726 struct vm_area_struct *vma, unsigned long address)
688{ 727{
@@ -854,22 +893,13 @@ free:
854 * When releasing a hugetlb pool reservation, any surplus pages that were 893 * When releasing a hugetlb pool reservation, any surplus pages that were
855 * allocated to satisfy the reservation must be explicitly freed if they were 894 * allocated to satisfy the reservation must be explicitly freed if they were
856 * never used. 895 * never used.
896 * Called with hugetlb_lock held.
857 */ 897 */
858static void return_unused_surplus_pages(struct hstate *h, 898static void return_unused_surplus_pages(struct hstate *h,
859 unsigned long unused_resv_pages) 899 unsigned long unused_resv_pages)
860{ 900{
861 static int nid = -1;
862 struct page *page;
863 unsigned long nr_pages; 901 unsigned long nr_pages;
864 902
865 /*
866 * We want to release as many surplus pages as possible, spread
867 * evenly across all nodes. Iterate across all nodes until we
868 * can no longer free unreserved surplus pages. This occurs when
869 * the nodes with surplus pages have no free pages.
870 */
871 unsigned long remaining_iterations = nr_online_nodes;
872
873 /* Uncommit the reservation */ 903 /* Uncommit the reservation */
874 h->resv_huge_pages -= unused_resv_pages; 904 h->resv_huge_pages -= unused_resv_pages;
875 905
@@ -879,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h,
879 909
880 nr_pages = min(unused_resv_pages, h->surplus_huge_pages); 910 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
881 911
882 while (remaining_iterations-- && nr_pages) { 912 /*
883 nid = next_node(nid, node_online_map); 913 * We want to release as many surplus pages as possible, spread
884 if (nid == MAX_NUMNODES) 914 * evenly across all nodes. Iterate across all nodes until we
885 nid = first_node(node_online_map); 915 * can no longer free unreserved surplus pages. This occurs when
886 916 * the nodes with surplus pages have no free pages.
887 if (!h->surplus_huge_pages_node[nid]) 917 * free_pool_huge_page() will balance the the frees across the
888 continue; 918 * on-line nodes for us and will handle the hstate accounting.
889 919 */
890 if (!list_empty(&h->hugepage_freelists[nid])) { 920 while (nr_pages--) {
891 page = list_entry(h->hugepage_freelists[nid].next, 921 if (!free_pool_huge_page(h, 1))
892 struct page, lru); 922 break;
893 list_del(&page->lru);
894 update_and_free_page(h, page);
895 h->free_huge_pages--;
896 h->free_huge_pages_node[nid]--;
897 h->surplus_huge_pages--;
898 h->surplus_huge_pages_node[nid]--;
899 nr_pages--;
900 remaining_iterations = nr_online_nodes;
901 }
902 } 923 }
903} 924}
904 925
@@ -1007,9 +1028,10 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1007 void *addr; 1028 void *addr;
1008 1029
1009 addr = __alloc_bootmem_node_nopanic( 1030 addr = __alloc_bootmem_node_nopanic(
1010 NODE_DATA(h->hugetlb_next_nid), 1031 NODE_DATA(h->next_nid_to_alloc),
1011 huge_page_size(h), huge_page_size(h), 0); 1032 huge_page_size(h), huge_page_size(h), 0);
1012 1033
1034 hstate_next_node_to_alloc(h);
1013 if (addr) { 1035 if (addr) {
1014 /* 1036 /*
1015 * Use the beginning of the huge page to store the 1037 * Use the beginning of the huge page to store the
@@ -1019,7 +1041,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1019 m = addr; 1041 m = addr;
1020 goto found; 1042 goto found;
1021 } 1043 }
1022 hstate_next_node(h);
1023 nr_nodes--; 1044 nr_nodes--;
1024 } 1045 }
1025 return 0; 1046 return 0;
@@ -1140,31 +1161,43 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
1140 */ 1161 */
1141static int adjust_pool_surplus(struct hstate *h, int delta) 1162static int adjust_pool_surplus(struct hstate *h, int delta)
1142{ 1163{
1143 static int prev_nid; 1164 int start_nid, next_nid;
1144 int nid = prev_nid;
1145 int ret = 0; 1165 int ret = 0;
1146 1166
1147 VM_BUG_ON(delta != -1 && delta != 1); 1167 VM_BUG_ON(delta != -1 && delta != 1);
1148 do {
1149 nid = next_node(nid, node_online_map);
1150 if (nid == MAX_NUMNODES)
1151 nid = first_node(node_online_map);
1152 1168
1153 /* To shrink on this node, there must be a surplus page */ 1169 if (delta < 0)
1154 if (delta < 0 && !h->surplus_huge_pages_node[nid]) 1170 start_nid = h->next_nid_to_alloc;
1155 continue; 1171 else
1156 /* Surplus cannot exceed the total number of pages */ 1172 start_nid = h->next_nid_to_free;
1157 if (delta > 0 && h->surplus_huge_pages_node[nid] >= 1173 next_nid = start_nid;
1174
1175 do {
1176 int nid = next_nid;
1177 if (delta < 0) {
1178 next_nid = hstate_next_node_to_alloc(h);
1179 /*
1180 * To shrink on this node, there must be a surplus page
1181 */
1182 if (!h->surplus_huge_pages_node[nid])
1183 continue;
1184 }
1185 if (delta > 0) {
1186 next_nid = hstate_next_node_to_free(h);
1187 /*
1188 * Surplus cannot exceed the total number of pages
1189 */
1190 if (h->surplus_huge_pages_node[nid] >=
1158 h->nr_huge_pages_node[nid]) 1191 h->nr_huge_pages_node[nid])
1159 continue; 1192 continue;
1193 }
1160 1194
1161 h->surplus_huge_pages += delta; 1195 h->surplus_huge_pages += delta;
1162 h->surplus_huge_pages_node[nid] += delta; 1196 h->surplus_huge_pages_node[nid] += delta;
1163 ret = 1; 1197 ret = 1;
1164 break; 1198 break;
1165 } while (nid != prev_nid); 1199 } while (next_nid != start_nid);
1166 1200
1167 prev_nid = nid;
1168 return ret; 1201 return ret;
1169} 1202}
1170 1203
@@ -1226,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
1226 min_count = max(count, min_count); 1259 min_count = max(count, min_count);
1227 try_to_free_low(h, min_count); 1260 try_to_free_low(h, min_count);
1228 while (min_count < persistent_huge_pages(h)) { 1261 while (min_count < persistent_huge_pages(h)) {
1229 struct page *page = dequeue_huge_page(h); 1262 if (!free_pool_huge_page(h, 0))
1230 if (!page)
1231 break; 1263 break;
1232 update_and_free_page(h, page);
1233 } 1264 }
1234 while (count < persistent_huge_pages(h)) { 1265 while (count < persistent_huge_pages(h)) {
1235 if (!adjust_pool_surplus(h, 1)) 1266 if (!adjust_pool_surplus(h, 1))
@@ -1441,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order)
1441 h->free_huge_pages = 0; 1472 h->free_huge_pages = 0;
1442 for (i = 0; i < MAX_NUMNODES; ++i) 1473 for (i = 0; i < MAX_NUMNODES; ++i)
1443 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1474 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1444 h->hugetlb_next_nid = first_node(node_online_map); 1475 h->next_nid_to_alloc = first_node(node_online_map);
1476 h->next_nid_to_free = first_node(node_online_map);
1445 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1477 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1446 huge_page_size(h)/1024); 1478 huge_page_size(h)/1024);
1447 1479
@@ -1505,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
1505 1537
1506#ifdef CONFIG_SYSCTL 1538#ifdef CONFIG_SYSCTL
1507int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1539int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1508 struct file *file, void __user *buffer, 1540 void __user *buffer,
1509 size_t *length, loff_t *ppos) 1541 size_t *length, loff_t *ppos)
1510{ 1542{
1511 struct hstate *h = &default_hstate; 1543 struct hstate *h = &default_hstate;
@@ -1516,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1516 1548
1517 table->data = &tmp; 1549 table->data = &tmp;
1518 table->maxlen = sizeof(unsigned long); 1550 table->maxlen = sizeof(unsigned long);
1519 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1551 proc_doulongvec_minmax(table, write, buffer, length, ppos);
1520 1552
1521 if (write) 1553 if (write)
1522 h->max_huge_pages = set_max_huge_pages(h, tmp); 1554 h->max_huge_pages = set_max_huge_pages(h, tmp);
@@ -1525,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1525} 1557}
1526 1558
1527int hugetlb_treat_movable_handler(struct ctl_table *table, int write, 1559int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1528 struct file *file, void __user *buffer, 1560 void __user *buffer,
1529 size_t *length, loff_t *ppos) 1561 size_t *length, loff_t *ppos)
1530{ 1562{
1531 proc_dointvec(table, write, file, buffer, length, ppos); 1563 proc_dointvec(table, write, buffer, length, ppos);
1532 if (hugepages_treat_as_movable) 1564 if (hugepages_treat_as_movable)
1533 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; 1565 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
1534 else 1566 else
@@ -1537,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1537} 1569}
1538 1570
1539int hugetlb_overcommit_handler(struct ctl_table *table, int write, 1571int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1540 struct file *file, void __user *buffer, 1572 void __user *buffer,
1541 size_t *length, loff_t *ppos) 1573 size_t *length, loff_t *ppos)
1542{ 1574{
1543 struct hstate *h = &default_hstate; 1575 struct hstate *h = &default_hstate;
@@ -1548,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1548 1580
1549 table->data = &tmp; 1581 table->data = &tmp;
1550 table->maxlen = sizeof(unsigned long); 1582 table->maxlen = sizeof(unsigned long);
1551 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1583 proc_doulongvec_minmax(table, write, buffer, length, ppos);
1552 1584
1553 if (write) { 1585 if (write) {
1554 spin_lock(&hugetlb_lock); 1586 spin_lock(&hugetlb_lock);
@@ -1689,7 +1721,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1689 return 0; 1721 return 0;
1690} 1722}
1691 1723
1692struct vm_operations_struct hugetlb_vm_ops = { 1724const struct vm_operations_struct hugetlb_vm_ops = {
1693 .fault = hugetlb_vm_op_fault, 1725 .fault = hugetlb_vm_op_fault,
1694 .open = hugetlb_vm_op_open, 1726 .open = hugetlb_vm_op_open,
1695 .close = hugetlb_vm_op_close, 1727 .close = hugetlb_vm_op_close,
@@ -1984,6 +2016,26 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h,
1984 return find_lock_page(mapping, idx); 2016 return find_lock_page(mapping, idx);
1985} 2017}
1986 2018
2019/*
2020 * Return whether there is a pagecache page to back given address within VMA.
2021 * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
2022 */
2023static bool hugetlbfs_pagecache_present(struct hstate *h,
2024 struct vm_area_struct *vma, unsigned long address)
2025{
2026 struct address_space *mapping;
2027 pgoff_t idx;
2028 struct page *page;
2029
2030 mapping = vma->vm_file->f_mapping;
2031 idx = vma_hugecache_offset(h, vma, address);
2032
2033 page = find_get_page(mapping, idx);
2034 if (page)
2035 put_page(page);
2036 return page != NULL;
2037}
2038
1987static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 2039static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1988 unsigned long address, pte_t *ptep, unsigned int flags) 2040 unsigned long address, pte_t *ptep, unsigned int flags)
1989{ 2041{
@@ -2179,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
2179 return NULL; 2231 return NULL;
2180} 2232}
2181 2233
2182static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
2183{
2184 if (!ptep || write || shared)
2185 return 0;
2186 else
2187 return huge_pte_none(huge_ptep_get(ptep));
2188}
2189
2190int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2234int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2191 struct page **pages, struct vm_area_struct **vmas, 2235 struct page **pages, struct vm_area_struct **vmas,
2192 unsigned long *position, int *length, int i, 2236 unsigned long *position, int *length, int i,
2193 int write) 2237 unsigned int flags)
2194{ 2238{
2195 unsigned long pfn_offset; 2239 unsigned long pfn_offset;
2196 unsigned long vaddr = *position; 2240 unsigned long vaddr = *position;
2197 int remainder = *length; 2241 int remainder = *length;
2198 struct hstate *h = hstate_vma(vma); 2242 struct hstate *h = hstate_vma(vma);
2199 int zeropage_ok = 0;
2200 int shared = vma->vm_flags & VM_SHARED;
2201 2243
2202 spin_lock(&mm->page_table_lock); 2244 spin_lock(&mm->page_table_lock);
2203 while (vaddr < vma->vm_end && remainder) { 2245 while (vaddr < vma->vm_end && remainder) {
2204 pte_t *pte; 2246 pte_t *pte;
2247 int absent;
2205 struct page *page; 2248 struct page *page;
2206 2249
2207 /* 2250 /*
2208 * Some archs (sparc64, sh*) have multiple pte_ts to 2251 * Some archs (sparc64, sh*) have multiple pte_ts to
2209 * each hugepage. We have to make * sure we get the 2252 * each hugepage. We have to make sure we get the
2210 * first, for the page indexing below to work. 2253 * first, for the page indexing below to work.
2211 */ 2254 */
2212 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); 2255 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
2213 if (huge_zeropage_ok(pte, write, shared)) 2256 absent = !pte || huge_pte_none(huge_ptep_get(pte));
2214 zeropage_ok = 1; 2257
2258 /*
2259 * When coredumping, it suits get_dump_page if we just return
2260 * an error where there's an empty slot with no huge pagecache
2261 * to back it. This way, we avoid allocating a hugepage, and
2262 * the sparse dumpfile avoids allocating disk blocks, but its
2263 * huge holes still show up with zeroes where they need to be.
2264 */
2265 if (absent && (flags & FOLL_DUMP) &&
2266 !hugetlbfs_pagecache_present(h, vma, vaddr)) {
2267 remainder = 0;
2268 break;
2269 }
2215 2270
2216 if (!pte || 2271 if (absent ||
2217 (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || 2272 ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
2218 (write && !pte_write(huge_ptep_get(pte)))) {
2219 int ret; 2273 int ret;
2220 2274
2221 spin_unlock(&mm->page_table_lock); 2275 spin_unlock(&mm->page_table_lock);
2222 ret = hugetlb_fault(mm, vma, vaddr, write); 2276 ret = hugetlb_fault(mm, vma, vaddr,
2277 (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
2223 spin_lock(&mm->page_table_lock); 2278 spin_lock(&mm->page_table_lock);
2224 if (!(ret & VM_FAULT_ERROR)) 2279 if (!(ret & VM_FAULT_ERROR))
2225 continue; 2280 continue;
2226 2281
2227 remainder = 0; 2282 remainder = 0;
2228 if (!i)
2229 i = -EFAULT;
2230 break; 2283 break;
2231 } 2284 }
2232 2285
@@ -2234,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2234 page = pte_page(huge_ptep_get(pte)); 2287 page = pte_page(huge_ptep_get(pte));
2235same_page: 2288same_page:
2236 if (pages) { 2289 if (pages) {
2237 if (zeropage_ok) 2290 pages[i] = mem_map_offset(page, pfn_offset);
2238 pages[i] = ZERO_PAGE(0);
2239 else
2240 pages[i] = mem_map_offset(page, pfn_offset);
2241 get_page(pages[i]); 2291 get_page(pages[i]);
2242 } 2292 }
2243 2293
@@ -2261,7 +2311,7 @@ same_page:
2261 *length = remainder; 2311 *length = remainder;
2262 *position = vaddr; 2312 *position = vaddr;
2263 2313
2264 return i; 2314 return i ? i : -EFAULT;
2265} 2315}
2266 2316
2267void hugetlb_change_protection(struct vm_area_struct *vma, 2317void hugetlb_change_protection(struct vm_area_struct *vma,
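The rewritten follow_hugetlb_page() above drops the old zeropage special case: an absent huge pte with no hugetlbfs pagecache behind it now ends the walk under FOLL_DUMP, and the final "i ? i : -EFAULT" turns an empty result into an error, so a coredump never allocates hugepages just to write out zeroes. A minimal illustrative sketch of a dump-path caller, assuming only the __get_user_pages() prototype shown in the mm/internal.h hunk below; sketch_get_dump_page() is a made-up name, not code from this patch:

    /*
     * Illustrative sketch: probe a single user page for dumping.
     * A return < 1 with FOLL_DUMP set means "hole": the dumper can
     * emit zeroes without forcing a hugepage allocation.
     */
    static struct page *sketch_get_dump_page(struct task_struct *tsk,
                                             struct mm_struct *mm,
                                             unsigned long addr)
    {
        struct page *page;

        if (__get_user_pages(tsk, mm, addr, 1, FOLL_GET | FOLL_DUMP,
                             &page, NULL) < 1)
            return NULL;    /* hole: caller writes zeroes to the dumpfile */
        return page;
    }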
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
new file mode 100644
index 000000000000..e1d85137f086
--- /dev/null
+++ b/mm/hwpoison-inject.c
@@ -0,0 +1,41 @@
1/* Inject a hwpoison memory failure on an arbitrary pfn */
2#include <linux/module.h>
3#include <linux/debugfs.h>
4#include <linux/kernel.h>
5#include <linux/mm.h>
6
7static struct dentry *hwpoison_dir, *corrupt_pfn;
8
9static int hwpoison_inject(void *data, u64 val)
10{
11 if (!capable(CAP_SYS_ADMIN))
12 return -EPERM;
13 printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val);
14 return __memory_failure(val, 18, 0);
15}
16
17DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
18
19static void pfn_inject_exit(void)
20{
21 if (hwpoison_dir)
22 debugfs_remove_recursive(hwpoison_dir);
23}
24
25static int pfn_inject_init(void)
26{
27 hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
28 if (hwpoison_dir == NULL)
29 return -ENOMEM;
30 corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
31 NULL, &hwpoison_fops);
32 if (corrupt_pfn == NULL) {
33 pfn_inject_exit();
34 return -ENOMEM;
35 }
36 return 0;
37}
38
39module_init(pfn_inject_init);
40module_exit(pfn_inject_exit);
41MODULE_LICENSE("GPL");
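The module exposes a single debugfs file; writing a pfn to it ends up in __memory_failure() as above. An illustrative user-space sketch (not part of the patch), assuming debugfs is mounted at /sys/kernel/debug and the caller has CAP_SYS_ADMIN:

    #include <stdio.h>

    /* Sketch: inject a hwpoison event for the pfn given on the command line. */
    int main(int argc, char **argv)
    {
        FILE *f;

        if (argc < 2)
            return 1;
        f = fopen("/sys/kernel/debug/hwpoison/corrupt-pfn", "w");
        if (!f) {
            perror("corrupt-pfn");
            return 1;
        }
        fprintf(f, "%s\n", argv[1]);    /* pfn to poison, e.g. 0x12345 */
        return fclose(f) ? 1 : 0;
    }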
diff --git a/mm/internal.h b/mm/internal.h
index f290c4db528b..22ec8d2b0fb8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,8 @@ static inline void __put_page(struct page *page)
37 atomic_dec(&page->_count); 37 atomic_dec(&page->_count);
38} 38}
39 39
40extern unsigned long highest_memmap_pfn;
41
40/* 42/*
41 * in mm/vmscan.c: 43 * in mm/vmscan.c:
42 */ 44 */
@@ -46,7 +48,6 @@ extern void putback_lru_page(struct page *page);
46/* 48/*
47 * in mm/page_alloc.c 49 * in mm/page_alloc.c
48 */ 50 */
49extern unsigned long highest_memmap_pfn;
50extern void __free_pages_bootmem(struct page *page, unsigned int order); 51extern void __free_pages_bootmem(struct page *page, unsigned int order);
51extern void prep_compound_page(struct page *page, unsigned long order); 52extern void prep_compound_page(struct page *page, unsigned long order);
52 53
@@ -250,13 +251,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
250} 251}
251#endif /* CONFIG_SPARSEMEM */ 252#endif /* CONFIG_SPARSEMEM */
252 253
253#define GUP_FLAGS_WRITE 0x1
254#define GUP_FLAGS_FORCE 0x2
255#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
256#define GUP_FLAGS_IGNORE_SIGKILL 0x8
257
258int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 254int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
259 unsigned long start, int len, int flags, 255 unsigned long start, int len, unsigned int foll_flags,
260 struct page **pages, struct vm_area_struct **vmas); 256 struct page **pages, struct vm_area_struct **vmas);
261 257
262#define ZONE_RECLAIM_NOSCAN -2 258#define ZONE_RECLAIM_NOSCAN -2
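With the private GUP_FLAGS_* constants gone, callers of __get_user_pages() pass the same FOLL_* bits that follow_page() understands. A hedged sketch of how the old write/force booleans map onto those bits (flag names as used elsewhere in this series; the exact mix any real caller wants will differ, and gup_foll_flags() is a made-up helper):

    /* Illustrative mapping of the old boolean arguments onto FOLL_* bits. */
    static inline unsigned int gup_foll_flags(int write, int force)
    {
        unsigned int flags = FOLL_TOUCH;

        if (write)
            flags |= FOLL_WRITE;
        if (force)
            flags |= FOLL_FORCE;
        return flags;
    }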
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index d5292fc6f523..177a5169bbde 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -36,7 +36,7 @@ struct test_node {
36}; 36};
37 37
38static LIST_HEAD(test_list); 38static LIST_HEAD(test_list);
39static DEFINE_PER_CPU(void *, test_pointer); 39static DEFINE_PER_CPU(void *, kmemleak_test_pointer);
40 40
41/* 41/*
42 * Some very simple testing. This function needs to be extended for 42 * Some very simple testing. This function needs to be extended for
@@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void)
86 } 86 }
87 87
88 for_each_possible_cpu(i) { 88 for_each_possible_cpu(i) {
89 per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); 89 per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL);
90 pr_info("kmemleak: kmalloc(129) = %p\n", 90 pr_info("kmemleak: kmalloc(129) = %p\n",
91 per_cpu(test_pointer, i)); 91 per_cpu(kmemleak_test_pointer, i));
92 } 92 }
93 93
94 return 0; 94 return 0;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 487267310a84..8bf765c4f58d 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -92,11 +92,13 @@
92#include <linux/string.h> 92#include <linux/string.h>
93#include <linux/nodemask.h> 93#include <linux/nodemask.h>
94#include <linux/mm.h> 94#include <linux/mm.h>
95#include <linux/workqueue.h>
95 96
96#include <asm/sections.h> 97#include <asm/sections.h>
97#include <asm/processor.h> 98#include <asm/processor.h>
98#include <asm/atomic.h> 99#include <asm/atomic.h>
99 100
101#include <linux/kmemcheck.h>
100#include <linux/kmemleak.h> 102#include <linux/kmemleak.h>
101 103
102/* 104/*
@@ -107,6 +109,7 @@
107#define SECS_FIRST_SCAN 60 /* delay before the first scan */ 109#define SECS_FIRST_SCAN 60 /* delay before the first scan */
108#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ 110#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
109#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ 111#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */
112#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
110 113
111#define BYTES_PER_POINTER sizeof(void *) 114#define BYTES_PER_POINTER sizeof(void *)
112 115
@@ -120,6 +123,9 @@ struct kmemleak_scan_area {
120 size_t length; 123 size_t length;
121}; 124};
122 125
126#define KMEMLEAK_GREY 0
127#define KMEMLEAK_BLACK -1
128
123/* 129/*
124 * Structure holding the metadata for each allocated memory block. 130 * Structure holding the metadata for each allocated memory block.
125 * Modifications to such objects should be made while holding the 131 * Modifications to such objects should be made while holding the
@@ -161,6 +167,15 @@ struct kmemleak_object {
161/* flag set on newly allocated objects */ 167/* flag set on newly allocated objects */
162#define OBJECT_NEW (1 << 3) 168#define OBJECT_NEW (1 << 3)
163 169
170/* number of bytes to print per line; must be 16 or 32 */
171#define HEX_ROW_SIZE 16
172/* number of bytes to print at a time (1, 2, 4, 8) */
173#define HEX_GROUP_SIZE 1
174/* include ASCII after the hex output */
175#define HEX_ASCII 1
176/* max number of lines to be printed */
177#define HEX_MAX_LINES 2
178
164/* the list of all allocated objects */ 179/* the list of all allocated objects */
165static LIST_HEAD(object_list); 180static LIST_HEAD(object_list);
166/* the list of gray-colored objects (see color_gray comment below) */ 181/* the list of gray-colored objects (see color_gray comment below) */
@@ -228,11 +243,14 @@ struct early_log {
228 int min_count; /* minimum reference count */ 243 int min_count; /* minimum reference count */
229 unsigned long offset; /* scan area offset */ 244 unsigned long offset; /* scan area offset */
230 size_t length; /* scan area length */ 245 size_t length; /* scan area length */
246 unsigned long trace[MAX_TRACE]; /* stack trace */
247 unsigned int trace_len; /* stack trace length */
231}; 248};
232 249
233/* early logging buffer and current position */ 250/* early logging buffer and current position */
234static struct early_log early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE]; 251static struct early_log
235static int crt_early_log; 252 early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
253static int crt_early_log __initdata;
236 254
237static void kmemleak_disable(void); 255static void kmemleak_disable(void);
238 256
@@ -255,6 +273,35 @@ static void kmemleak_disable(void);
255} while (0) 273} while (0)
256 274
257/* 275/*
276 * Printing of the objects hex dump to the seq file. The number of lines to be
277 * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
278 * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called
279 * with the object->lock held.
280 */
281static void hex_dump_object(struct seq_file *seq,
282 struct kmemleak_object *object)
283{
284 const u8 *ptr = (const u8 *)object->pointer;
285 int i, len, remaining;
286 unsigned char linebuf[HEX_ROW_SIZE * 5];
287
288 /* limit the number of lines to HEX_MAX_LINES */
289 remaining = len =
290 min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
291
292 seq_printf(seq, " hex dump (first %d bytes):\n", len);
293 for (i = 0; i < len; i += HEX_ROW_SIZE) {
294 int linelen = min(remaining, HEX_ROW_SIZE);
295
296 remaining -= HEX_ROW_SIZE;
297 hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
298 HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
299 HEX_ASCII);
300 seq_printf(seq, " %s\n", linebuf);
301 }
302}
303
304/*
258 * Object colors, encoded with count and min_count: 305 * Object colors, encoded with count and min_count:
259 * - white - orphan object, not enough references to it (count < min_count) 306 * - white - orphan object, not enough references to it (count < min_count)
260 * - gray - not orphan, not marked as false positive (min_count == 0) or 307 * - gray - not orphan, not marked as false positive (min_count == 0) or
@@ -264,19 +311,21 @@ static void kmemleak_disable(void);
264 * Newly created objects don't have any color assigned (object->count == -1) 311 * Newly created objects don't have any color assigned (object->count == -1)
265 * before the next memory scan when they become white. 312 * before the next memory scan when they become white.
266 */ 313 */
267static int color_white(const struct kmemleak_object *object) 314static bool color_white(const struct kmemleak_object *object)
268{ 315{
269 return object->count != -1 && object->count < object->min_count; 316 return object->count != KMEMLEAK_BLACK &&
317 object->count < object->min_count;
270} 318}
271 319
272static int color_gray(const struct kmemleak_object *object) 320static bool color_gray(const struct kmemleak_object *object)
273{ 321{
274 return object->min_count != -1 && object->count >= object->min_count; 322 return object->min_count != KMEMLEAK_BLACK &&
323 object->count >= object->min_count;
275} 324}
276 325
277static int color_black(const struct kmemleak_object *object) 326static bool color_black(const struct kmemleak_object *object)
278{ 327{
279 return object->min_count == -1; 328 return object->min_count == KMEMLEAK_BLACK;
280} 329}
281 330
282/* 331/*
@@ -284,7 +333,7 @@ static int color_black(const struct kmemleak_object *object)
284 * not be deleted and have a minimum age to avoid false positives caused by 333 * not be deleted and have a minimum age to avoid false positives caused by
285 * pointers temporarily stored in CPU registers. 334 * pointers temporarily stored in CPU registers.
286 */ 335 */
287static int unreferenced_object(struct kmemleak_object *object) 336static bool unreferenced_object(struct kmemleak_object *object)
288{ 337{
289 return (object->flags & OBJECT_ALLOCATED) && color_white(object) && 338 return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
290 time_before_eq(object->jiffies + jiffies_min_age, 339 time_before_eq(object->jiffies + jiffies_min_age,
@@ -304,6 +353,7 @@ static void print_unreferenced(struct seq_file *seq,
304 object->pointer, object->size); 353 object->pointer, object->size);
305 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", 354 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n",
306 object->comm, object->pid, object->jiffies); 355 object->comm, object->pid, object->jiffies);
356 hex_dump_object(seq, object);
307 seq_printf(seq, " backtrace:\n"); 357 seq_printf(seq, " backtrace:\n");
308 358
309 for (i = 0; i < object->trace_len; i++) { 359 for (i = 0; i < object->trace_len; i++) {
@@ -330,6 +380,7 @@ static void dump_object_info(struct kmemleak_object *object)
330 object->comm, object->pid, object->jiffies); 380 object->comm, object->pid, object->jiffies);
331 pr_notice(" min_count = %d\n", object->min_count); 381 pr_notice(" min_count = %d\n", object->min_count);
332 pr_notice(" count = %d\n", object->count); 382 pr_notice(" count = %d\n", object->count);
383 pr_notice(" flags = 0x%lx\n", object->flags);
333 pr_notice(" backtrace:\n"); 384 pr_notice(" backtrace:\n");
334 print_stack_trace(&trace, 4); 385 print_stack_trace(&trace, 4);
335} 386}
@@ -434,21 +485,36 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
434} 485}
435 486
436/* 487/*
488 * Save stack trace to the given array of MAX_TRACE size.
489 */
490static int __save_stack_trace(unsigned long *trace)
491{
492 struct stack_trace stack_trace;
493
494 stack_trace.max_entries = MAX_TRACE;
495 stack_trace.nr_entries = 0;
496 stack_trace.entries = trace;
497 stack_trace.skip = 2;
498 save_stack_trace(&stack_trace);
499
500 return stack_trace.nr_entries;
501}
502
503/*
437 * Create the metadata (struct kmemleak_object) corresponding to an allocated 504 * Create the metadata (struct kmemleak_object) corresponding to an allocated
438 * memory block and add it to the object_list and object_tree_root. 505 * memory block and add it to the object_list and object_tree_root.
439 */ 506 */
440static void create_object(unsigned long ptr, size_t size, int min_count, 507static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
441 gfp_t gfp) 508 int min_count, gfp_t gfp)
442{ 509{
443 unsigned long flags; 510 unsigned long flags;
444 struct kmemleak_object *object; 511 struct kmemleak_object *object;
445 struct prio_tree_node *node; 512 struct prio_tree_node *node;
446 struct stack_trace trace;
447 513
448 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); 514 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK);
449 if (!object) { 515 if (!object) {
450 kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); 516 kmemleak_stop("Cannot allocate a kmemleak_object structure\n");
451 return; 517 return NULL;
452 } 518 }
453 519
454 INIT_LIST_HEAD(&object->object_list); 520 INIT_LIST_HEAD(&object->object_list);
@@ -482,18 +548,14 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
482 } 548 }
483 549
484 /* kernel backtrace */ 550 /* kernel backtrace */
485 trace.max_entries = MAX_TRACE; 551 object->trace_len = __save_stack_trace(object->trace);
486 trace.nr_entries = 0;
487 trace.entries = object->trace;
488 trace.skip = 1;
489 save_stack_trace(&trace);
490 object->trace_len = trace.nr_entries;
491 552
492 INIT_PRIO_TREE_NODE(&object->tree_node); 553 INIT_PRIO_TREE_NODE(&object->tree_node);
493 object->tree_node.start = ptr; 554 object->tree_node.start = ptr;
494 object->tree_node.last = ptr + size - 1; 555 object->tree_node.last = ptr + size - 1;
495 556
496 write_lock_irqsave(&kmemleak_lock, flags); 557 write_lock_irqsave(&kmemleak_lock, flags);
558
497 min_addr = min(min_addr, ptr); 559 min_addr = min(min_addr, ptr);
498 max_addr = max(max_addr, ptr + size); 560 max_addr = max(max_addr, ptr + size);
499 node = prio_tree_insert(&object_tree_root, &object->tree_node); 561 node = prio_tree_insert(&object_tree_root, &object->tree_node);
@@ -504,20 +566,19 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
504 * random memory blocks. 566 * random memory blocks.
505 */ 567 */
506 if (node != &object->tree_node) { 568 if (node != &object->tree_node) {
507 unsigned long flags;
508
509 kmemleak_stop("Cannot insert 0x%lx into the object search tree " 569 kmemleak_stop("Cannot insert 0x%lx into the object search tree "
510 "(already existing)\n", ptr); 570 "(already existing)\n", ptr);
511 object = lookup_object(ptr, 1); 571 object = lookup_object(ptr, 1);
512 spin_lock_irqsave(&object->lock, flags); 572 spin_lock(&object->lock);
513 dump_object_info(object); 573 dump_object_info(object);
514 spin_unlock_irqrestore(&object->lock, flags); 574 spin_unlock(&object->lock);
515 575
516 goto out; 576 goto out;
517 } 577 }
518 list_add_tail_rcu(&object->object_list, &object_list); 578 list_add_tail_rcu(&object->object_list, &object_list);
519out: 579out:
520 write_unlock_irqrestore(&kmemleak_lock, flags); 580 write_unlock_irqrestore(&kmemleak_lock, flags);
581 return object;
521} 582}
522 583
523/* 584/*
@@ -604,46 +665,55 @@ static void delete_object_part(unsigned long ptr, size_t size)
604 665
605 put_object(object); 666 put_object(object);
606} 667}
607/* 668
608 * Make a object permanently as gray-colored so that it can no longer be 669static void __paint_it(struct kmemleak_object *object, int color)
609 * reported as a leak. This is used in general to mark a false positive. 670{
610 */ 671 object->min_count = color;
611static void make_gray_object(unsigned long ptr) 672 if (color == KMEMLEAK_BLACK)
673 object->flags |= OBJECT_NO_SCAN;
674}
675
676static void paint_it(struct kmemleak_object *object, int color)
612{ 677{
613 unsigned long flags; 678 unsigned long flags;
679
680 spin_lock_irqsave(&object->lock, flags);
681 __paint_it(object, color);
682 spin_unlock_irqrestore(&object->lock, flags);
683}
684
685static void paint_ptr(unsigned long ptr, int color)
686{
614 struct kmemleak_object *object; 687 struct kmemleak_object *object;
615 688
616 object = find_and_get_object(ptr, 0); 689 object = find_and_get_object(ptr, 0);
617 if (!object) { 690 if (!object) {
618 kmemleak_warn("Graying unknown object at 0x%08lx\n", ptr); 691 kmemleak_warn("Trying to color unknown object "
692 "at 0x%08lx as %s\n", ptr,
693 (color == KMEMLEAK_GREY) ? "Grey" :
694 (color == KMEMLEAK_BLACK) ? "Black" : "Unknown");
619 return; 695 return;
620 } 696 }
621 697 paint_it(object, color);
622 spin_lock_irqsave(&object->lock, flags);
623 object->min_count = 0;
624 spin_unlock_irqrestore(&object->lock, flags);
625 put_object(object); 698 put_object(object);
626} 699}
627 700
628/* 701/*
702 * Make an object permanently gray-colored so that it can no longer be
703 * reported as a leak. This is used in general to mark a false positive.
704 */
705static void make_gray_object(unsigned long ptr)
706{
707 paint_ptr(ptr, KMEMLEAK_GREY);
708}
709
710/*
629 * Mark the object as black-colored so that it is ignored from scans and 711 * Mark the object as black-colored so that it is ignored from scans and
630 * reporting. 712 * reporting.
631 */ 713 */
632static void make_black_object(unsigned long ptr) 714static void make_black_object(unsigned long ptr)
633{ 715{
634 unsigned long flags; 716 paint_ptr(ptr, KMEMLEAK_BLACK);
635 struct kmemleak_object *object;
636
637 object = find_and_get_object(ptr, 0);
638 if (!object) {
639 kmemleak_warn("Blacking unknown object at 0x%08lx\n", ptr);
640 return;
641 }
642
643 spin_lock_irqsave(&object->lock, flags);
644 object->min_count = -1;
645 spin_unlock_irqrestore(&object->lock, flags);
646 put_object(object);
647} 717}
648 718
649/* 719/*
@@ -715,14 +785,15 @@ static void object_no_scan(unsigned long ptr)
715 * Log an early kmemleak_* call to the early_log buffer. These calls will be 785 * Log an early kmemleak_* call to the early_log buffer. These calls will be
716 * processed later once kmemleak is fully initialized. 786 * processed later once kmemleak is fully initialized.
717 */ 787 */
718static void log_early(int op_type, const void *ptr, size_t size, 788static void __init log_early(int op_type, const void *ptr, size_t size,
719 int min_count, unsigned long offset, size_t length) 789 int min_count, unsigned long offset, size_t length)
720{ 790{
721 unsigned long flags; 791 unsigned long flags;
722 struct early_log *log; 792 struct early_log *log;
723 793
724 if (crt_early_log >= ARRAY_SIZE(early_log)) { 794 if (crt_early_log >= ARRAY_SIZE(early_log)) {
725 pr_warning("Early log buffer exceeded\n"); 795 pr_warning("Early log buffer exceeded, "
796 "please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n");
726 kmemleak_disable(); 797 kmemleak_disable();
727 return; 798 return;
728 } 799 }
@@ -739,16 +810,48 @@ static void log_early(int op_type, const void *ptr, size_t size,
739 log->min_count = min_count; 810 log->min_count = min_count;
740 log->offset = offset; 811 log->offset = offset;
741 log->length = length; 812 log->length = length;
813 if (op_type == KMEMLEAK_ALLOC)
814 log->trace_len = __save_stack_trace(log->trace);
742 crt_early_log++; 815 crt_early_log++;
743 local_irq_restore(flags); 816 local_irq_restore(flags);
744} 817}
745 818
746/* 819/*
820 * Log an early allocated block and populate the stack trace.
821 */
822static void early_alloc(struct early_log *log)
823{
824 struct kmemleak_object *object;
825 unsigned long flags;
826 int i;
827
828 if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr))
829 return;
830
831 /*
832 * RCU locking needed to ensure object is not freed via put_object().
833 */
834 rcu_read_lock();
835 object = create_object((unsigned long)log->ptr, log->size,
836 log->min_count, GFP_ATOMIC);
837 if (!object)
838 goto out;
839 spin_lock_irqsave(&object->lock, flags);
840 for (i = 0; i < log->trace_len; i++)
841 object->trace[i] = log->trace[i];
842 object->trace_len = log->trace_len;
843 spin_unlock_irqrestore(&object->lock, flags);
844out:
845 rcu_read_unlock();
846}
847
848/*
747 * Memory allocation function callback. This function is called from the 849 * Memory allocation function callback. This function is called from the
748 * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, 850 * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
749 * vmalloc etc.). 851 * vmalloc etc.).
750 */ 852 */
751void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) 853void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
854 gfp_t gfp)
752{ 855{
753 pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); 856 pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
754 857
@@ -763,7 +866,7 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc);
763 * Memory freeing function callback. This function is called from the kernel 866 * Memory freeing function callback. This function is called from the kernel
764 * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). 867 * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
765 */ 868 */
766void kmemleak_free(const void *ptr) 869void __ref kmemleak_free(const void *ptr)
767{ 870{
768 pr_debug("%s(0x%p)\n", __func__, ptr); 871 pr_debug("%s(0x%p)\n", __func__, ptr);
769 872
@@ -778,7 +881,7 @@ EXPORT_SYMBOL_GPL(kmemleak_free);
778 * Partial memory freeing function callback. This function is usually called 881 * Partial memory freeing function callback. This function is usually called
779 * from bootmem allocator when (part of) a memory block is freed. 882 * from bootmem allocator when (part of) a memory block is freed.
780 */ 883 */
781void kmemleak_free_part(const void *ptr, size_t size) 884void __ref kmemleak_free_part(const void *ptr, size_t size)
782{ 885{
783 pr_debug("%s(0x%p)\n", __func__, ptr); 886 pr_debug("%s(0x%p)\n", __func__, ptr);
784 887
@@ -793,7 +896,7 @@ EXPORT_SYMBOL_GPL(kmemleak_free_part);
793 * Mark an already allocated memory block as a false positive. This will cause 896 * Mark an already allocated memory block as a false positive. This will cause
794 * the block to no longer be reported as leak and always be scanned. 897 * the block to no longer be reported as leak and always be scanned.
795 */ 898 */
796void kmemleak_not_leak(const void *ptr) 899void __ref kmemleak_not_leak(const void *ptr)
797{ 900{
798 pr_debug("%s(0x%p)\n", __func__, ptr); 901 pr_debug("%s(0x%p)\n", __func__, ptr);
799 902
@@ -809,7 +912,7 @@ EXPORT_SYMBOL(kmemleak_not_leak);
809 * corresponding block is not a leak and does not contain any references to 912 * corresponding block is not a leak and does not contain any references to
810 * other allocated memory blocks. 913 * other allocated memory blocks.
811 */ 914 */
812void kmemleak_ignore(const void *ptr) 915void __ref kmemleak_ignore(const void *ptr)
813{ 916{
814 pr_debug("%s(0x%p)\n", __func__, ptr); 917 pr_debug("%s(0x%p)\n", __func__, ptr);
815 918
@@ -823,8 +926,8 @@ EXPORT_SYMBOL(kmemleak_ignore);
823/* 926/*
824 * Limit the range to be scanned in an allocated memory block. 927 * Limit the range to be scanned in an allocated memory block.
825 */ 928 */
826void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length, 929void __ref kmemleak_scan_area(const void *ptr, unsigned long offset,
827 gfp_t gfp) 930 size_t length, gfp_t gfp)
828{ 931{
829 pr_debug("%s(0x%p)\n", __func__, ptr); 932 pr_debug("%s(0x%p)\n", __func__, ptr);
830 933
@@ -838,7 +941,7 @@ EXPORT_SYMBOL(kmemleak_scan_area);
838/* 941/*
839 * Inform kmemleak not to scan the given memory block. 942 * Inform kmemleak not to scan the given memory block.
840 */ 943 */
841void kmemleak_no_scan(const void *ptr) 944void __ref kmemleak_no_scan(const void *ptr)
842{ 945{
843 pr_debug("%s(0x%p)\n", __func__, ptr); 946 pr_debug("%s(0x%p)\n", __func__, ptr);
844 947
@@ -882,15 +985,22 @@ static void scan_block(void *_start, void *_end,
882 unsigned long *end = _end - (BYTES_PER_POINTER - 1); 985 unsigned long *end = _end - (BYTES_PER_POINTER - 1);
883 986
884 for (ptr = start; ptr < end; ptr++) { 987 for (ptr = start; ptr < end; ptr++) {
885 unsigned long flags;
886 unsigned long pointer = *ptr;
887 struct kmemleak_object *object; 988 struct kmemleak_object *object;
989 unsigned long flags;
990 unsigned long pointer;
888 991
889 if (allow_resched) 992 if (allow_resched)
890 cond_resched(); 993 cond_resched();
891 if (scan_should_stop()) 994 if (scan_should_stop())
892 break; 995 break;
893 996
997 /* don't scan uninitialized memory */
998 if (!kmemcheck_is_obj_initialized((unsigned long)ptr,
999 BYTES_PER_POINTER))
1000 continue;
1001
1002 pointer = *ptr;
1003
894 object = find_and_get_object(pointer, 1); 1004 object = find_and_get_object(pointer, 1);
895 if (!object) 1005 if (!object)
896 continue; 1006 continue;
@@ -949,10 +1059,21 @@ static void scan_object(struct kmemleak_object *object)
949 if (!(object->flags & OBJECT_ALLOCATED)) 1059 if (!(object->flags & OBJECT_ALLOCATED))
950 /* already freed object */ 1060 /* already freed object */
951 goto out; 1061 goto out;
952 if (hlist_empty(&object->area_list)) 1062 if (hlist_empty(&object->area_list)) {
953 scan_block((void *)object->pointer, 1063 void *start = (void *)object->pointer;
954 (void *)(object->pointer + object->size), object, 0); 1064 void *end = (void *)(object->pointer + object->size);
955 else 1065
1066 while (start < end && (object->flags & OBJECT_ALLOCATED) &&
1067 !(object->flags & OBJECT_NO_SCAN)) {
1068 scan_block(start, min(start + MAX_SCAN_SIZE, end),
1069 object, 0);
1070 start += MAX_SCAN_SIZE;
1071
1072 spin_unlock_irqrestore(&object->lock, flags);
1073 cond_resched();
1074 spin_lock_irqsave(&object->lock, flags);
1075 }
1076 } else
956 hlist_for_each_entry(area, elem, &object->area_list, node) 1077 hlist_for_each_entry(area, elem, &object->area_list, node)
957 scan_block((void *)(object->pointer + area->offset), 1078 scan_block((void *)(object->pointer + area->offset),
958 (void *)(object->pointer + area->offset 1079 (void *)(object->pointer + area->offset
@@ -970,7 +1091,6 @@ static void kmemleak_scan(void)
970{ 1091{
971 unsigned long flags; 1092 unsigned long flags;
972 struct kmemleak_object *object, *tmp; 1093 struct kmemleak_object *object, *tmp;
973 struct task_struct *task;
974 int i; 1094 int i;
975 int new_leaks = 0; 1095 int new_leaks = 0;
976 int gray_list_pass = 0; 1096 int gray_list_pass = 0;
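The scan_object() hunk above bounds scanning latency by working through large objects MAX_SCAN_SIZE bytes at a time, dropping the object lock and rescheduling between chunks. The same pattern, detached from kmemleak's object type, might look like the sketch below (scan_in_chunks() is a made-up name; scan_block() and MAX_SCAN_SIZE are the helpers from this file):

    /* Sketch: process [start, end) in bounded chunks, yielding in between. */
    static void scan_in_chunks(void *start, void *end, spinlock_t *lock)
    {
        unsigned long flags;

        spin_lock_irqsave(lock, flags);
        while (start < end) {
            void *next = min(start + MAX_SCAN_SIZE, end);

            scan_block(start, next, NULL, 0);
            start = next;

            spin_unlock_irqrestore(lock, flags);
            cond_resched();
            spin_lock_irqsave(lock, flags);
        }
        spin_unlock_irqrestore(lock, flags);
    }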
@@ -1037,15 +1157,16 @@ static void kmemleak_scan(void)
1037 } 1157 }
1038 1158
1039 /* 1159 /*
1040 * Scanning the task stacks may introduce false negatives and it is 1160 * Scanning the task stacks (may introduce false negatives).
1041 * not enabled by default.
1042 */ 1161 */
1043 if (kmemleak_stack_scan) { 1162 if (kmemleak_stack_scan) {
1163 struct task_struct *p, *g;
1164
1044 read_lock(&tasklist_lock); 1165 read_lock(&tasklist_lock);
1045 for_each_process(task) 1166 do_each_thread(g, p) {
1046 scan_block(task_stack_page(task), 1167 scan_block(task_stack_page(p), task_stack_page(p) +
1047 task_stack_page(task) + THREAD_SIZE, 1168 THREAD_SIZE, NULL, 0);
1048 NULL, 0); 1169 } while_each_thread(g, p);
1049 read_unlock(&tasklist_lock); 1170 read_unlock(&tasklist_lock);
1050 } 1171 }
1051 1172
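do_each_thread()/while_each_thread() visit every thread in the system, whereas for_each_process() only visits thread group leaders, which is the point of the stack-scanning change above. An illustrative sketch of the idiom (visit_all_thread_stacks() is a made-up name):

    /* Sketch: walk the stack pages of every thread, not just group leaders. */
    static void visit_all_thread_stacks(void)
    {
        struct task_struct *g, *p;

        read_lock(&tasklist_lock);
        do_each_thread(g, p) {
            scan_block(task_stack_page(p),
                       task_stack_page(p) + THREAD_SIZE, NULL, 0);
        } while_each_thread(g, p);
        read_unlock(&tasklist_lock);
    }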
@@ -1170,7 +1291,7 @@ static int kmemleak_scan_thread(void *arg)
1170 * Start the automatic memory scanning thread. This function must be called 1291 * Start the automatic memory scanning thread. This function must be called
1171 * with the scan_mutex held. 1292 * with the scan_mutex held.
1172 */ 1293 */
1173void start_scan_thread(void) 1294static void start_scan_thread(void)
1174{ 1295{
1175 if (scan_thread) 1296 if (scan_thread)
1176 return; 1297 return;
@@ -1185,7 +1306,7 @@ void start_scan_thread(void)
1185 * Stop the automatic memory scanning thread. This function must be called 1306 * Stop the automatic memory scanning thread. This function must be called
1186 * with the scan_mutex held. 1307 * with the scan_mutex held.
1187 */ 1308 */
1188void stop_scan_thread(void) 1309static void stop_scan_thread(void)
1189{ 1310{
1190 if (scan_thread) { 1311 if (scan_thread) {
1191 kthread_stop(scan_thread); 1312 kthread_stop(scan_thread);
@@ -1294,6 +1415,49 @@ static int kmemleak_release(struct inode *inode, struct file *file)
1294 return seq_release(inode, file); 1415 return seq_release(inode, file);
1295} 1416}
1296 1417
1418static int dump_str_object_info(const char *str)
1419{
1420 unsigned long flags;
1421 struct kmemleak_object *object;
1422 unsigned long addr;
1423
1424 addr = simple_strtoul(str, NULL, 0);
1425 object = find_and_get_object(addr, 0);
1426 if (!object) {
1427 pr_info("Unknown object at 0x%08lx\n", addr);
1428 return -EINVAL;
1429 }
1430
1431 spin_lock_irqsave(&object->lock, flags);
1432 dump_object_info(object);
1433 spin_unlock_irqrestore(&object->lock, flags);
1434
1435 put_object(object);
1436 return 0;
1437}
1438
1439/*
1440 * We use grey instead of black to ensure we can do future scans on the same
1441 * objects. If we did not do future scans these black objects could
1442 * potentially contain references to newly allocated objects in the future and
1443 * we'd end up with false positives.
1444 */
1445static void kmemleak_clear(void)
1446{
1447 struct kmemleak_object *object;
1448 unsigned long flags;
1449
1450 rcu_read_lock();
1451 list_for_each_entry_rcu(object, &object_list, object_list) {
1452 spin_lock_irqsave(&object->lock, flags);
1453 if ((object->flags & OBJECT_REPORTED) &&
1454 unreferenced_object(object))
1455 __paint_it(object, KMEMLEAK_GREY);
1456 spin_unlock_irqrestore(&object->lock, flags);
1457 }
1458 rcu_read_unlock();
1459}
1460
1297/* 1461/*
1298 * File write operation to configure kmemleak at run-time. The following 1462 * File write operation to configure kmemleak at run-time. The following
1299 * commands can be written to the /sys/kernel/debug/kmemleak file: 1463 * commands can be written to the /sys/kernel/debug/kmemleak file:
@@ -1305,6 +1469,9 @@ static int kmemleak_release(struct inode *inode, struct file *file)
1305 * scan=... - set the automatic memory scanning period in seconds (0 to 1469 * scan=... - set the automatic memory scanning period in seconds (0 to
1306 * disable it) 1470 * disable it)
1307 * scan - trigger a memory scan 1471 * scan - trigger a memory scan
1472 * clear - mark all current reported unreferenced kmemleak objects as
1473 * grey to ignore printing them
1474 * dump=... - dump information about the object found at the given address
1308 */ 1475 */
1309static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, 1476static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1310 size_t size, loff_t *ppos) 1477 size_t size, loff_t *ppos)
@@ -1345,6 +1512,10 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1345 } 1512 }
1346 } else if (strncmp(buf, "scan", 4) == 0) 1513 } else if (strncmp(buf, "scan", 4) == 0)
1347 kmemleak_scan(); 1514 kmemleak_scan();
1515 else if (strncmp(buf, "clear", 5) == 0)
1516 kmemleak_clear();
1517 else if (strncmp(buf, "dump=", 5) == 0)
1518 ret = dump_str_object_info(buf + 5);
1348 else 1519 else
1349 ret = -EINVAL; 1520 ret = -EINVAL;
1350 1521
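The new "clear" and "dump=" commands are driven the same way as the existing ones, by writing to the kmemleak debugfs file. An illustrative user-space sketch (not part of the patch), assuming debugfs is mounted at /sys/kernel/debug:

    #include <stdio.h>

    /* Sketch: send one command string to the kmemleak control file. */
    static int kmemleak_cmd(const char *cmd)
    {
        FILE *f = fopen("/sys/kernel/debug/kmemleak", "w");

        if (!f)
            return -1;
        fputs(cmd, f);
        return fclose(f);
    }

    /* e.g. kmemleak_cmd("clear"); kmemleak_cmd("dump=0xffff88001234"); */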
@@ -1371,7 +1542,7 @@ static const struct file_operations kmemleak_fops = {
1371 * Perform the freeing of the kmemleak internal objects after waiting for any 1542 * Perform the freeing of the kmemleak internal objects after waiting for any
1372 * current memory scan to complete. 1543 * current memory scan to complete.
1373 */ 1544 */
1374static int kmemleak_cleanup_thread(void *arg) 1545static void kmemleak_do_cleanup(struct work_struct *work)
1375{ 1546{
1376 struct kmemleak_object *object; 1547 struct kmemleak_object *object;
1377 1548
@@ -1383,22 +1554,9 @@ static int kmemleak_cleanup_thread(void *arg)
1383 delete_object_full(object->pointer); 1554 delete_object_full(object->pointer);
1384 rcu_read_unlock(); 1555 rcu_read_unlock();
1385 mutex_unlock(&scan_mutex); 1556 mutex_unlock(&scan_mutex);
1386
1387 return 0;
1388} 1557}
1389 1558
1390/* 1559static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
1391 * Start the clean-up thread.
1392 */
1393static void kmemleak_cleanup(void)
1394{
1395 struct task_struct *cleanup_thread;
1396
1397 cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL,
1398 "kmemleak-clean");
1399 if (IS_ERR(cleanup_thread))
1400 pr_warning("Failed to create the clean-up thread\n");
1401}
1402 1560
1403/* 1561/*
1404 * Disable kmemleak. No memory allocation/freeing will be traced once this 1562 * Disable kmemleak. No memory allocation/freeing will be traced once this
@@ -1416,7 +1574,7 @@ static void kmemleak_disable(void)
1416 1574
1417 /* check whether it is too early for a kernel thread */ 1575 /* check whether it is too early for a kernel thread */
1418 if (atomic_read(&kmemleak_initialized)) 1576 if (atomic_read(&kmemleak_initialized))
1419 kmemleak_cleanup(); 1577 schedule_work(&cleanup_work);
1420 1578
1421 pr_info("Kernel memory leak detector disabled\n"); 1579 pr_info("Kernel memory leak detector disabled\n");
1422} 1580}
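The clean-up path now relies on a statically declared work item instead of spawning a kthread, which is cheaper and safe to schedule from the failure paths above. The general pattern, as an illustrative sketch with made-up names:

    /* Sketch: static work item replacing an on-demand clean-up kthread. */
    static void my_cleanup(struct work_struct *work)
    {
        /* teardown that is allowed to sleep goes here */
    }
    static DECLARE_WORK(my_cleanup_work, my_cleanup);

    static void trigger_cleanup(void)
    {
        schedule_work(&my_cleanup_work);
    }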
@@ -1469,8 +1627,7 @@ void __init kmemleak_init(void)
1469 1627
1470 switch (log->op_type) { 1628 switch (log->op_type) {
1471 case KMEMLEAK_ALLOC: 1629 case KMEMLEAK_ALLOC:
1472 kmemleak_alloc(log->ptr, log->size, log->min_count, 1630 early_alloc(log);
1473 GFP_KERNEL);
1474 break; 1631 break;
1475 case KMEMLEAK_FREE: 1632 case KMEMLEAK_FREE:
1476 kmemleak_free(log->ptr); 1633 kmemleak_free(log->ptr);
@@ -1513,7 +1670,7 @@ static int __init kmemleak_late_init(void)
1513 * after setting kmemleak_initialized and we may end up with 1670 * after setting kmemleak_initialized and we may end up with
1514 * two clean-up threads but serialized by scan_mutex. 1671 * two clean-up threads but serialized by scan_mutex.
1515 */ 1672 */
1516 kmemleak_cleanup(); 1673 schedule_work(&cleanup_work);
1517 return -ENOMEM; 1674 return -ENOMEM;
1518 } 1675 }
1519 1676
diff --git a/mm/ksm.c b/mm/ksm.c
new file mode 100644
index 000000000000..bef1af4f77e3
--- /dev/null
+++ b/mm/ksm.c
@@ -0,0 +1,1709 @@
1/*
2 * Memory merging support.
3 *
4 * This code enables dynamic sharing of identical pages found in different
5 * memory areas, even if they are not shared by fork()
6 *
7 * Copyright (C) 2008-2009 Red Hat, Inc.
8 * Authors:
9 * Izik Eidus
10 * Andrea Arcangeli
11 * Chris Wright
12 * Hugh Dickins
13 *
14 * This work is licensed under the terms of the GNU GPL, version 2.
15 */
16
17#include <linux/errno.h>
18#include <linux/mm.h>
19#include <linux/fs.h>
20#include <linux/mman.h>
21#include <linux/sched.h>
22#include <linux/rwsem.h>
23#include <linux/pagemap.h>
24#include <linux/rmap.h>
25#include <linux/spinlock.h>
26#include <linux/jhash.h>
27#include <linux/delay.h>
28#include <linux/kthread.h>
29#include <linux/wait.h>
30#include <linux/slab.h>
31#include <linux/rbtree.h>
32#include <linux/mmu_notifier.h>
33#include <linux/swap.h>
34#include <linux/ksm.h>
35
36#include <asm/tlbflush.h>
37
38/*
39 * A few notes about the KSM scanning process,
40 * to make it easier to understand the data structures below:
41 *
42 * In order to reduce excessive scanning, KSM sorts the memory pages by their
43 * contents into a data structure that holds pointers to the pages' locations.
44 *
45 * Since the contents of the pages may change at any moment, KSM cannot just
46 * insert the pages into a normal sorted tree and expect it to find anything.
47 * Therefore KSM uses two data structures - the stable and the unstable tree.
48 *
49 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
50 * by their contents. Because each such page is write-protected, searching on
51 * this tree is fully assured to be working (except when pages are unmapped),
52 * and therefore this tree is called the stable tree.
53 *
54 * In addition to the stable tree, KSM uses a second data structure called the
55 * unstable tree: this tree holds pointers to pages which have been found to
56 * be "unchanged for a period of time". The unstable tree sorts these pages
57 * by their contents, but since they are not write-protected, KSM cannot rely
58 * upon the unstable tree to work correctly - the unstable tree is liable to
59 * be corrupted as its contents are modified, and so it is called unstable.
60 *
61 * KSM solves this problem by several techniques:
62 *
63 * 1) The unstable tree is flushed every time KSM completes scanning all
64 * memory areas, and then the tree is rebuilt again from the beginning.
65 * 2) KSM will only insert into the unstable tree, pages whose hash value
66 * has not changed since the previous scan of all memory areas.
67 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
68 * colors of the nodes and not on their contents, assuring that even when
69 * the tree gets "corrupted" it won't get out of balance, so scanning time
70 * remains the same (also, searching and inserting nodes in an rbtree uses
71 * the same algorithm, so we have no overhead when we flush and rebuild).
72 * 4) KSM never flushes the stable tree, which means that even if it were to
73 * take 10 attempts to find a page in the unstable tree, once it is found,
74 * it is secured in the stable tree. (When we scan a new page, we first
75 * compare it against the stable tree, and then against the unstable tree.)
76 */
77
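As an illustrative aside (not part of ksm.c): the scanning described above only ever touches areas an application has registered as mergeable. A minimal user-space sketch of doing so, assuming a kernel with KSM enabled and ksmd started; MADV_MERGEABLE is defined locally (value 12 in this series) in case the libc headers predate it:

    #include <stddef.h>
    #include <sys/mman.h>

    #ifndef MADV_MERGEABLE
    #define MADV_MERGEABLE 12
    #endif

    int main(void)
    {
        size_t len = 64 * 4096;
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED || madvise(buf, len, MADV_MERGEABLE))
            return 1;
        /* identical pages written into buf may now be merged by ksmd */
        return 0;
    }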
78/**
79 * struct mm_slot - ksm information per mm that is being scanned
80 * @link: link to the mm_slots hash list
81 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
82 * @rmap_list: head for this mm_slot's list of rmap_items
83 * @mm: the mm that this information is valid for
84 */
85struct mm_slot {
86 struct hlist_node link;
87 struct list_head mm_list;
88 struct list_head rmap_list;
89 struct mm_struct *mm;
90};
91
92/**
93 * struct ksm_scan - cursor for scanning
94 * @mm_slot: the current mm_slot we are scanning
95 * @address: the next address inside that to be scanned
96 * @rmap_item: the current rmap that we are scanning inside the rmap_list
97 * @seqnr: count of completed full scans (needed when removing unstable node)
98 *
99 * There is only the one ksm_scan instance of this cursor structure.
100 */
101struct ksm_scan {
102 struct mm_slot *mm_slot;
103 unsigned long address;
104 struct rmap_item *rmap_item;
105 unsigned long seqnr;
106};
107
108/**
109 * struct rmap_item - reverse mapping item for virtual addresses
110 * @link: link into mm_slot's rmap_list (rmap_list is per mm)
111 * @mm: the memory structure this rmap_item is pointing into
112 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
113 * @oldchecksum: previous checksum of the page at that virtual address
114 * @node: rb_node of this rmap_item in either unstable or stable tree
115 * @next: next rmap_item hanging off the same node of the stable tree
116 * @prev: previous rmap_item hanging off the same node of the stable tree
117 */
118struct rmap_item {
119 struct list_head link;
120 struct mm_struct *mm;
121 unsigned long address; /* + low bits used for flags below */
122 union {
123 unsigned int oldchecksum; /* when unstable */
124 struct rmap_item *next; /* when stable */
125 };
126 union {
127 struct rb_node node; /* when tree node */
128 struct rmap_item *prev; /* in stable list */
129 };
130};
131
132#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
133#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */
134#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */
135
136/* The stable and unstable tree heads */
137static struct rb_root root_stable_tree = RB_ROOT;
138static struct rb_root root_unstable_tree = RB_ROOT;
139
140#define MM_SLOTS_HASH_HEADS 1024
141static struct hlist_head *mm_slots_hash;
142
143static struct mm_slot ksm_mm_head = {
144 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
145};
146static struct ksm_scan ksm_scan = {
147 .mm_slot = &ksm_mm_head,
148};
149
150static struct kmem_cache *rmap_item_cache;
151static struct kmem_cache *mm_slot_cache;
152
153/* The number of nodes in the stable tree */
154static unsigned long ksm_pages_shared;
155
156/* The number of page slots additionally sharing those nodes */
157static unsigned long ksm_pages_sharing;
158
159/* The number of nodes in the unstable tree */
160static unsigned long ksm_pages_unshared;
161
162/* The number of rmap_items in use: to calculate pages_volatile */
163static unsigned long ksm_rmap_items;
164
165/* Limit on the number of unswappable pages used */
166static unsigned long ksm_max_kernel_pages;
167
168/* Number of pages ksmd should scan in one batch */
169static unsigned int ksm_thread_pages_to_scan = 100;
170
171/* Milliseconds ksmd should sleep between batches */
172static unsigned int ksm_thread_sleep_millisecs = 20;
173
174#define KSM_RUN_STOP 0
175#define KSM_RUN_MERGE 1
176#define KSM_RUN_UNMERGE 2
177static unsigned int ksm_run = KSM_RUN_STOP;
178
179static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
180static DEFINE_MUTEX(ksm_thread_mutex);
181static DEFINE_SPINLOCK(ksm_mmlist_lock);
182
183#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
184 sizeof(struct __struct), __alignof__(struct __struct),\
185 (__flags), NULL)
186
187static int __init ksm_slab_init(void)
188{
189 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
190 if (!rmap_item_cache)
191 goto out;
192
193 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
194 if (!mm_slot_cache)
195 goto out_free;
196
197 return 0;
198
199out_free:
200 kmem_cache_destroy(rmap_item_cache);
201out:
202 return -ENOMEM;
203}
204
205static void __init ksm_slab_free(void)
206{
207 kmem_cache_destroy(mm_slot_cache);
208 kmem_cache_destroy(rmap_item_cache);
209 mm_slot_cache = NULL;
210}
211
212static inline struct rmap_item *alloc_rmap_item(void)
213{
214 struct rmap_item *rmap_item;
215
216 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
217 if (rmap_item)
218 ksm_rmap_items++;
219 return rmap_item;
220}
221
222static inline void free_rmap_item(struct rmap_item *rmap_item)
223{
224 ksm_rmap_items--;
225 rmap_item->mm = NULL; /* debug safety */
226 kmem_cache_free(rmap_item_cache, rmap_item);
227}
228
229static inline struct mm_slot *alloc_mm_slot(void)
230{
231 if (!mm_slot_cache) /* initialization failed */
232 return NULL;
233 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
234}
235
236static inline void free_mm_slot(struct mm_slot *mm_slot)
237{
238 kmem_cache_free(mm_slot_cache, mm_slot);
239}
240
241static int __init mm_slots_hash_init(void)
242{
243 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
244 GFP_KERNEL);
245 if (!mm_slots_hash)
246 return -ENOMEM;
247 return 0;
248}
249
250static void __init mm_slots_hash_free(void)
251{
252 kfree(mm_slots_hash);
253}
254
255static struct mm_slot *get_mm_slot(struct mm_struct *mm)
256{
257 struct mm_slot *mm_slot;
258 struct hlist_head *bucket;
259 struct hlist_node *node;
260
261 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
262 % MM_SLOTS_HASH_HEADS];
263 hlist_for_each_entry(mm_slot, node, bucket, link) {
264 if (mm == mm_slot->mm)
265 return mm_slot;
266 }
267 return NULL;
268}
269
270static void insert_to_mm_slots_hash(struct mm_struct *mm,
271 struct mm_slot *mm_slot)
272{
273 struct hlist_head *bucket;
274
275 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
276 % MM_SLOTS_HASH_HEADS];
277 mm_slot->mm = mm;
278 INIT_LIST_HEAD(&mm_slot->rmap_list);
279 hlist_add_head(&mm_slot->link, bucket);
280}
281
282static inline int in_stable_tree(struct rmap_item *rmap_item)
283{
284 return rmap_item->address & STABLE_FLAG;
285}
286
287/*
288 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
289 * page tables after it has passed through ksm_exit() - which, if necessary,
290 * takes mmap_sem briefly to serialize against them. ksm_exit() does not set
291 * a special flag: they can just back out as soon as mm_users goes to zero.
292 * ksm_test_exit() is used throughout to make this test for exit: in some
293 * places for correctness, in some places just to avoid unnecessary work.
294 */
295static inline bool ksm_test_exit(struct mm_struct *mm)
296{
297 return atomic_read(&mm->mm_users) == 0;
298}
299
300/*
301 * We use break_ksm to break COW on a ksm page: it's a stripped down
302 *
303 * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
304 * put_page(page);
305 *
306 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
307 * in case the application has unmapped and remapped mm,addr meanwhile.
308 * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
309 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
310 */
311static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
312{
313 struct page *page;
314 int ret = 0;
315
316 do {
317 cond_resched();
318 page = follow_page(vma, addr, FOLL_GET);
319 if (!page)
320 break;
321 if (PageKsm(page))
322 ret = handle_mm_fault(vma->vm_mm, vma, addr,
323 FAULT_FLAG_WRITE);
324 else
325 ret = VM_FAULT_WRITE;
326 put_page(page);
327 } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
328 /*
329 * We must loop because handle_mm_fault() may back out if there's
330 * any difficulty e.g. if pte accessed bit gets updated concurrently.
331 *
332 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
333 * COW has been broken, even if the vma does not permit VM_WRITE;
334 * but note that a concurrent fault might break PageKsm for us.
335 *
336 * VM_FAULT_SIGBUS could occur if we race with truncation of the
337 * backing file, which also invalidates anonymous pages: that's
338 * okay, that truncation will have unmapped the PageKsm for us.
339 *
340 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
341 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
342 * current task has TIF_MEMDIE set, and will be OOM killed on return
343 * to user; and ksmd, having no mm, would never be chosen for that.
344 *
345 * But if the mm is in a limited mem_cgroup, then the fault may fail
346 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
347 * even ksmd can fail in this way - though it's usually breaking ksm
348 * just to undo a merge it made a moment before, so unlikely to oom.
349 *
350 * That's a pity: we might therefore have more kernel pages allocated
351 * than we're counting as nodes in the stable tree; but ksm_do_scan
352 * will retry to break_cow on each pass, so should recover the page
353 * in due course. The important thing is to not let VM_MERGEABLE
354 * be cleared while any such pages might remain in the area.
355 */
356 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
357}
358
359static void break_cow(struct mm_struct *mm, unsigned long addr)
360{
361 struct vm_area_struct *vma;
362
363 down_read(&mm->mmap_sem);
364 if (ksm_test_exit(mm))
365 goto out;
366 vma = find_vma(mm, addr);
367 if (!vma || vma->vm_start > addr)
368 goto out;
369 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
370 goto out;
371 break_ksm(vma, addr);
372out:
373 up_read(&mm->mmap_sem);
374}
375
376static struct page *get_mergeable_page(struct rmap_item *rmap_item)
377{
378 struct mm_struct *mm = rmap_item->mm;
379 unsigned long addr = rmap_item->address;
380 struct vm_area_struct *vma;
381 struct page *page;
382
383 down_read(&mm->mmap_sem);
384 if (ksm_test_exit(mm))
385 goto out;
386 vma = find_vma(mm, addr);
387 if (!vma || vma->vm_start > addr)
388 goto out;
389 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
390 goto out;
391
392 page = follow_page(vma, addr, FOLL_GET);
393 if (!page)
394 goto out;
395 if (PageAnon(page)) {
396 flush_anon_page(vma, page, addr);
397 flush_dcache_page(page);
398 } else {
399 put_page(page);
400out: page = NULL;
401 }
402 up_read(&mm->mmap_sem);
403 return page;
404}
405
406/*
407 * get_ksm_page: checks if the page at the virtual address in rmap_item
408 * is still PageKsm, in which case we can trust the content of the page,
409 * and it returns the gotten page; but NULL if the page has been zapped.
410 */
411static struct page *get_ksm_page(struct rmap_item *rmap_item)
412{
413 struct page *page;
414
415 page = get_mergeable_page(rmap_item);
416 if (page && !PageKsm(page)) {
417 put_page(page);
418 page = NULL;
419 }
420 return page;
421}
422
423/*
424 * Removing rmap_item from stable or unstable tree.
425 * This function will clean the information from the stable/unstable tree.
426 */
427static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
428{
429 if (in_stable_tree(rmap_item)) {
430 struct rmap_item *next_item = rmap_item->next;
431
432 if (rmap_item->address & NODE_FLAG) {
433 if (next_item) {
434 rb_replace_node(&rmap_item->node,
435 &next_item->node,
436 &root_stable_tree);
437 next_item->address |= NODE_FLAG;
438 ksm_pages_sharing--;
439 } else {
440 rb_erase(&rmap_item->node, &root_stable_tree);
441 ksm_pages_shared--;
442 }
443 } else {
444 struct rmap_item *prev_item = rmap_item->prev;
445
446 BUG_ON(prev_item->next != rmap_item);
447 prev_item->next = next_item;
448 if (next_item) {
449 BUG_ON(next_item->prev != rmap_item);
450 next_item->prev = rmap_item->prev;
451 }
452 ksm_pages_sharing--;
453 }
454
455 rmap_item->next = NULL;
456
457 } else if (rmap_item->address & NODE_FLAG) {
458 unsigned char age;
459 /*
460 * Usually ksmd can and must skip the rb_erase, because
461 * root_unstable_tree was already reset to RB_ROOT.
462 * But be careful when an mm is exiting: do the rb_erase
463 * if this rmap_item was inserted by this scan, rather
464 * than left over from before.
465 */
466 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
467 BUG_ON(age > 1);
468 if (!age)
469 rb_erase(&rmap_item->node, &root_unstable_tree);
470 ksm_pages_unshared--;
471 }
472
473 rmap_item->address &= PAGE_MASK;
474
475 cond_resched(); /* we're called from many long loops */
476}
477
478static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
479 struct list_head *cur)
480{
481 struct rmap_item *rmap_item;
482
483 while (cur != &mm_slot->rmap_list) {
484 rmap_item = list_entry(cur, struct rmap_item, link);
485 cur = cur->next;
486 remove_rmap_item_from_tree(rmap_item);
487 list_del(&rmap_item->link);
488 free_rmap_item(rmap_item);
489 }
490}
491
492/*
493 * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
494 * than check every pte of a given vma, the locking doesn't quite work for
495 * that - an rmap_item is assigned to the stable tree after inserting ksm
496 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
497 * rmap_items from parent to child at fork time (so as not to waste time
498 * if exit comes before the next scan reaches it).
499 *
500 * Similarly, although we'd like to remove rmap_items (so updating counts
501 * and freeing memory) when unmerging an area, it's easier to leave that
502 * to the next pass of ksmd - consider, for example, how ksmd might be
503 * in cmp_and_merge_page on one of the rmap_items we would be removing.
504 */
505static int unmerge_ksm_pages(struct vm_area_struct *vma,
506 unsigned long start, unsigned long end)
507{
508 unsigned long addr;
509 int err = 0;
510
511 for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
512 if (ksm_test_exit(vma->vm_mm))
513 break;
514 if (signal_pending(current))
515 err = -ERESTARTSYS;
516 else
517 err = break_ksm(vma, addr);
518 }
519 return err;
520}
521
522#ifdef CONFIG_SYSFS
523/*
524 * Only called through the sysfs control interface:
525 */
526static int unmerge_and_remove_all_rmap_items(void)
527{
528 struct mm_slot *mm_slot;
529 struct mm_struct *mm;
530 struct vm_area_struct *vma;
531 int err = 0;
532
533 spin_lock(&ksm_mmlist_lock);
534 ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
535 struct mm_slot, mm_list);
536 spin_unlock(&ksm_mmlist_lock);
537
538 for (mm_slot = ksm_scan.mm_slot;
539 mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
540 mm = mm_slot->mm;
541 down_read(&mm->mmap_sem);
542 for (vma = mm->mmap; vma; vma = vma->vm_next) {
543 if (ksm_test_exit(mm))
544 break;
545 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
546 continue;
547 err = unmerge_ksm_pages(vma,
548 vma->vm_start, vma->vm_end);
549 if (err)
550 goto error;
551 }
552
553 remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
554
555 spin_lock(&ksm_mmlist_lock);
556 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
557 struct mm_slot, mm_list);
558 if (ksm_test_exit(mm)) {
559 hlist_del(&mm_slot->link);
560 list_del(&mm_slot->mm_list);
561 spin_unlock(&ksm_mmlist_lock);
562
563 free_mm_slot(mm_slot);
564 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
565 up_read(&mm->mmap_sem);
566 mmdrop(mm);
567 } else {
568 spin_unlock(&ksm_mmlist_lock);
569 up_read(&mm->mmap_sem);
570 }
571 }
572
573 ksm_scan.seqnr = 0;
574 return 0;
575
576error:
577 up_read(&mm->mmap_sem);
578 spin_lock(&ksm_mmlist_lock);
579 ksm_scan.mm_slot = &ksm_mm_head;
580 spin_unlock(&ksm_mmlist_lock);
581 return err;
582}
583#endif /* CONFIG_SYSFS */
584
585static u32 calc_checksum(struct page *page)
586{
587 u32 checksum;
588 void *addr = kmap_atomic(page, KM_USER0);
589 checksum = jhash2(addr, PAGE_SIZE / 4, 17);
590 kunmap_atomic(addr, KM_USER0);
591 return checksum;
592}
593
594static int memcmp_pages(struct page *page1, struct page *page2)
595{
596 char *addr1, *addr2;
597 int ret;
598
599 addr1 = kmap_atomic(page1, KM_USER0);
600 addr2 = kmap_atomic(page2, KM_USER1);
601 ret = memcmp(addr1, addr2, PAGE_SIZE);
602 kunmap_atomic(addr2, KM_USER1);
603 kunmap_atomic(addr1, KM_USER0);
604 return ret;
605}
606
607static inline int pages_identical(struct page *page1, struct page *page2)
608{
609 return !memcmp_pages(page1, page2);
610}
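
To see the strategy these helpers support in isolation, here is a minimal, hypothetical userspace-style sketch (the names, the FNV-1a hash and the fixed page size are illustrative, not part of ksm.c): a cheap checksum detects a page whose contents are still changing between scans, and only a page whose checksum has settled is compared byte-for-byte, mirroring the checksum gate in cmp_and_merge_page() further down.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SKETCH_PAGE_SIZE 4096

/* Cheap FNV-1a hash standing in for jhash2(): only used to detect change. */
static uint32_t sketch_checksum(const unsigned char *page)
{
	uint32_t h = 2166136261u;
	size_t i;

	for (i = 0; i < SKETCH_PAGE_SIZE; i++)
		h = (h ^ page[i]) * 16777619u;
	return h;
}

int main(void)
{
	static unsigned char page[SKETCH_PAGE_SIZE], candidate[SKETCH_PAGE_SIZE];
	uint32_t oldchecksum, checksum;

	memset(page, 0x5a, sizeof(page));
	memset(candidate, 0x5a, sizeof(candidate));

	/* First scan: just remember the checksum, treat the page as volatile. */
	oldchecksum = sketch_checksum(page);

	/* Later scan: checksum unchanged, so pay for the full comparison. */
	checksum = sketch_checksum(page);
	if (checksum == oldchecksum && !memcmp(page, candidate, sizeof(page)))
		printf("pages identical: a real ksmd would now try to merge\n");
	return 0;
}
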
611
612static int write_protect_page(struct vm_area_struct *vma, struct page *page,
613 pte_t *orig_pte)
614{
615 struct mm_struct *mm = vma->vm_mm;
616 unsigned long addr;
617 pte_t *ptep;
618 spinlock_t *ptl;
619 int swapped;
620 int err = -EFAULT;
621
622 addr = page_address_in_vma(page, vma);
623 if (addr == -EFAULT)
624 goto out;
625
626 ptep = page_check_address(page, mm, addr, &ptl, 0);
627 if (!ptep)
628 goto out;
629
630 if (pte_write(*ptep)) {
631 pte_t entry;
632
633 swapped = PageSwapCache(page);
634 flush_cache_page(vma, addr, page_to_pfn(page));
635 /*
636	 * This is tricky: when get_user_pages_fast() runs, it doesn't
637	 * take any lock, therefore the check that we are about to make
638	 * of the page count against the map count is racy, and
639	 * O_DIRECT can happen right after the check.
640	 * So we clear the pte and flush the tlb before the check:
641	 * this assures us that no O_DIRECT can happen after the check
642	 * or in the middle of the check.
643 */
644 entry = ptep_clear_flush(vma, addr, ptep);
645 /*
646 * Check that no O_DIRECT or similar I/O is in progress on the
647	 * page: ksm holds two references of its own, and swapcache may hold one more
648 */
649 if ((page_mapcount(page) + 2 + swapped) != page_count(page)) {
650 set_pte_at_notify(mm, addr, ptep, entry);
651 goto out_unlock;
652 }
653 entry = pte_wrprotect(entry);
654 set_pte_at_notify(mm, addr, ptep, entry);
655 }
656 *orig_pte = *ptep;
657 err = 0;
658
659out_unlock:
660 pte_unmap_unlock(ptep, ptl);
661out:
662 return err;
663}
664
665/**
666 * replace_page - replace page in vma by new ksm page
667 * @vma: vma that holds the pte pointing to oldpage
668 * @oldpage: the page we are replacing by newpage
669 * @newpage: the ksm page we replace oldpage by
670 * @orig_pte: the original value of the pte
671 *
672 * Returns 0 on success, -EFAULT on failure.
673 */
674static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
675 struct page *newpage, pte_t orig_pte)
676{
677 struct mm_struct *mm = vma->vm_mm;
678 pgd_t *pgd;
679 pud_t *pud;
680 pmd_t *pmd;
681 pte_t *ptep;
682 spinlock_t *ptl;
683 unsigned long addr;
684 pgprot_t prot;
685 int err = -EFAULT;
686
687 prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);
688
689 addr = page_address_in_vma(oldpage, vma);
690 if (addr == -EFAULT)
691 goto out;
692
693 pgd = pgd_offset(mm, addr);
694 if (!pgd_present(*pgd))
695 goto out;
696
697 pud = pud_offset(pgd, addr);
698 if (!pud_present(*pud))
699 goto out;
700
701 pmd = pmd_offset(pud, addr);
702 if (!pmd_present(*pmd))
703 goto out;
704
705 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
706 if (!pte_same(*ptep, orig_pte)) {
707 pte_unmap_unlock(ptep, ptl);
708 goto out;
709 }
710
711 get_page(newpage);
712 page_add_ksm_rmap(newpage);
713
714 flush_cache_page(vma, addr, pte_pfn(*ptep));
715 ptep_clear_flush(vma, addr, ptep);
716 set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));
717
718 page_remove_rmap(oldpage);
719 put_page(oldpage);
720
721 pte_unmap_unlock(ptep, ptl);
722 err = 0;
723out:
724 return err;
725}
726
727/*
728 * try_to_merge_one_page - take two pages and merge them into one
729 * @vma: the vma that holds the pte pointing into oldpage
730 * @oldpage: the page that we want to replace with newpage
731 * @newpage: the page that we want to map instead of oldpage
732 *
733 * Note:
734 * oldpage should be a PageAnon page, while newpage should be a PageKsm page,
735 * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
736 *
737 * This function returns 0 if the pages were merged, -EFAULT otherwise.
738 */
739static int try_to_merge_one_page(struct vm_area_struct *vma,
740 struct page *oldpage,
741 struct page *newpage)
742{
743 pte_t orig_pte = __pte(0);
744 int err = -EFAULT;
745
746 if (!(vma->vm_flags & VM_MERGEABLE))
747 goto out;
748
749 if (!PageAnon(oldpage))
750 goto out;
751
752 get_page(newpage);
753 get_page(oldpage);
754
755 /*
756 * We need the page lock to read a stable PageSwapCache in
757 * write_protect_page(). We use trylock_page() instead of
758 * lock_page() because we don't want to wait here - we
759 * prefer to continue scanning and merging different pages,
760 * then come back to this page when it is unlocked.
761 */
762 if (!trylock_page(oldpage))
763 goto out_putpage;
764 /*
765 * If this anonymous page is mapped only here, its pte may need
766 * to be write-protected. If it's mapped elsewhere, all of its
767 * ptes are necessarily already write-protected. But in either
768 * case, we need to lock and check page_count is not raised.
769 */
770 if (write_protect_page(vma, oldpage, &orig_pte)) {
771 unlock_page(oldpage);
772 goto out_putpage;
773 }
774 unlock_page(oldpage);
775
776 if (pages_identical(oldpage, newpage))
777 err = replace_page(vma, oldpage, newpage, orig_pte);
778
779out_putpage:
780 put_page(oldpage);
781 put_page(newpage);
782out:
783 return err;
784}
785
786/*
787 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
788 * but no new kernel page is allocated: kpage must already be a ksm page.
789 */
790static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
791 unsigned long addr1,
792 struct page *page1,
793 struct page *kpage)
794{
795 struct vm_area_struct *vma;
796 int err = -EFAULT;
797
798 down_read(&mm1->mmap_sem);
799 if (ksm_test_exit(mm1))
800 goto out;
801
802 vma = find_vma(mm1, addr1);
803 if (!vma || vma->vm_start > addr1)
804 goto out;
805
806 err = try_to_merge_one_page(vma, page1, kpage);
807out:
808 up_read(&mm1->mmap_sem);
809 return err;
810}
811
812/*
813 * try_to_merge_two_pages - take two identical pages and prepare them
814 * to be merged into one page.
815 *
816 * This function returns 0 if we successfully mapped two identical pages
817 * into one page, -EFAULT otherwise.
818 *
819 * Note that this function allocates a new kernel page: if one of the pages
820 * is already a ksm page, try_to_merge_with_ksm_page should be used.
821 */
822static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
823 struct page *page1, struct mm_struct *mm2,
824 unsigned long addr2, struct page *page2)
825{
826 struct vm_area_struct *vma;
827 struct page *kpage;
828 int err = -EFAULT;
829
830 /*
831 * The number of nodes in the stable tree
832 * is the number of kernel pages that we hold.
833 */
834 if (ksm_max_kernel_pages &&
835 ksm_max_kernel_pages <= ksm_pages_shared)
836 return err;
837
838 kpage = alloc_page(GFP_HIGHUSER);
839 if (!kpage)
840 return err;
841
842 down_read(&mm1->mmap_sem);
843 if (ksm_test_exit(mm1)) {
844 up_read(&mm1->mmap_sem);
845 goto out;
846 }
847 vma = find_vma(mm1, addr1);
848 if (!vma || vma->vm_start > addr1) {
849 up_read(&mm1->mmap_sem);
850 goto out;
851 }
852
853 copy_user_highpage(kpage, page1, addr1, vma);
854 err = try_to_merge_one_page(vma, page1, kpage);
855 up_read(&mm1->mmap_sem);
856
857 if (!err) {
858 err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage);
859 /*
860 * If that fails, we have a ksm page with only one pte
861 * pointing to it: so break it.
862 */
863 if (err)
864 break_cow(mm1, addr1);
865 }
866out:
867 put_page(kpage);
868 return err;
869}
870
871/*
872 * stable_tree_search - search page inside the stable tree
873 * @page: the page for which we are searching identical pages.
874 * @page2: used to return the identical page that we found and are holding
875 * inside the stable tree.
876 * @rmap_item: the reverse mapping item
877 *
878 * This function checks if there is a page inside the stable tree
879 * with identical content to the page that we are scanning right now.
880 *
881 * This function returns a pointer to the identical rmap_item if found,
882 * NULL otherwise.
883 */
884static struct rmap_item *stable_tree_search(struct page *page,
885 struct page **page2,
886 struct rmap_item *rmap_item)
887{
888 struct rb_node *node = root_stable_tree.rb_node;
889
890 while (node) {
891 struct rmap_item *tree_rmap_item, *next_rmap_item;
892 int ret;
893
894 tree_rmap_item = rb_entry(node, struct rmap_item, node);
895 while (tree_rmap_item) {
896 BUG_ON(!in_stable_tree(tree_rmap_item));
897 cond_resched();
898 page2[0] = get_ksm_page(tree_rmap_item);
899 if (page2[0])
900 break;
901 next_rmap_item = tree_rmap_item->next;
902 remove_rmap_item_from_tree(tree_rmap_item);
903 tree_rmap_item = next_rmap_item;
904 }
905 if (!tree_rmap_item)
906 return NULL;
907
908 ret = memcmp_pages(page, page2[0]);
909
910 if (ret < 0) {
911 put_page(page2[0]);
912 node = node->rb_left;
913 } else if (ret > 0) {
914 put_page(page2[0]);
915 node = node->rb_right;
916 } else {
917 return tree_rmap_item;
918 }
919 }
920
921 return NULL;
922}
923
924/*
925 * stable_tree_insert - insert rmap_item pointing to new ksm page
926 * into the stable tree.
927 *
928 * @page: the page for which we are searching an identical page inside the
929 * stable tree.
930 * @rmap_item: pointer to the reverse mapping item.
931 *
932 * This function returns the rmap_item on success, NULL otherwise.
933 */
934static struct rmap_item *stable_tree_insert(struct page *page,
935 struct rmap_item *rmap_item)
936{
937 struct rb_node **new = &root_stable_tree.rb_node;
938 struct rb_node *parent = NULL;
939
940 while (*new) {
941 struct rmap_item *tree_rmap_item, *next_rmap_item;
942 struct page *tree_page;
943 int ret;
944
945 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
946 while (tree_rmap_item) {
947 BUG_ON(!in_stable_tree(tree_rmap_item));
948 cond_resched();
949 tree_page = get_ksm_page(tree_rmap_item);
950 if (tree_page)
951 break;
952 next_rmap_item = tree_rmap_item->next;
953 remove_rmap_item_from_tree(tree_rmap_item);
954 tree_rmap_item = next_rmap_item;
955 }
956 if (!tree_rmap_item)
957 return NULL;
958
959 ret = memcmp_pages(page, tree_page);
960 put_page(tree_page);
961
962 parent = *new;
963 if (ret < 0)
964 new = &parent->rb_left;
965 else if (ret > 0)
966 new = &parent->rb_right;
967 else {
968 /*
969 * It is not a bug that stable_tree_search() didn't
970 * find this node: because at that time our page was
971	 * not yet write-protected, so it may have changed since.
972 */
973 return NULL;
974 }
975 }
976
977 rmap_item->address |= NODE_FLAG | STABLE_FLAG;
978 rmap_item->next = NULL;
979 rb_link_node(&rmap_item->node, parent, new);
980 rb_insert_color(&rmap_item->node, &root_stable_tree);
981
982 ksm_pages_shared++;
983 return rmap_item;
984}
985
986/*
987 * unstable_tree_search_insert - search and insert items into the unstable tree.
988 *
989 * @page: the page for which we search an identical page, or which we insert
990 * into the unstable tree if none is found
991 * @page2: used to return the identical page found inside the unstable tree
992 * @rmap_item: the reverse mapping item of page
993 *
994 * This function searches for a page in the unstable tree identical to the
995 * page currently being scanned; and if no identical page is found in the
996 * tree, we insert rmap_item as a new object into the unstable tree.
997 *
998 * This function returns a pointer to the rmap_item found to be identical
999 * to the currently scanned page, NULL otherwise.
1000 *
1001 * This function does both searching and inserting, because they share
1002 * the same walking algorithm in an rbtree.
1003 */
1004static struct rmap_item *unstable_tree_search_insert(struct page *page,
1005 struct page **page2,
1006 struct rmap_item *rmap_item)
1007{
1008 struct rb_node **new = &root_unstable_tree.rb_node;
1009 struct rb_node *parent = NULL;
1010
1011 while (*new) {
1012 struct rmap_item *tree_rmap_item;
1013 int ret;
1014
1015 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1016 page2[0] = get_mergeable_page(tree_rmap_item);
1017 if (!page2[0])
1018 return NULL;
1019
1020 /*
1021 * Don't substitute an unswappable ksm page
1022 * just for one good swappable forked page.
1023 */
1024 if (page == page2[0]) {
1025 put_page(page2[0]);
1026 return NULL;
1027 }
1028
1029 ret = memcmp_pages(page, page2[0]);
1030
1031 parent = *new;
1032 if (ret < 0) {
1033 put_page(page2[0]);
1034 new = &parent->rb_left;
1035 } else if (ret > 0) {
1036 put_page(page2[0]);
1037 new = &parent->rb_right;
1038 } else {
1039 return tree_rmap_item;
1040 }
1041 }
1042
1043 rmap_item->address |= NODE_FLAG;
1044 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1045 rb_link_node(&rmap_item->node, parent, new);
1046 rb_insert_color(&rmap_item->node, &root_unstable_tree);
1047
1048 ksm_pages_unshared++;
1049 return NULL;
1050}
1051
1052/*
1053 * stable_tree_append - add another rmap_item to the linked list of
1054 * rmap_items hanging off a given node of the stable tree, all sharing
1055 * the same ksm page.
1056 */
1057static void stable_tree_append(struct rmap_item *rmap_item,
1058 struct rmap_item *tree_rmap_item)
1059{
1060 rmap_item->next = tree_rmap_item->next;
1061 rmap_item->prev = tree_rmap_item;
1062
1063 if (tree_rmap_item->next)
1064 tree_rmap_item->next->prev = rmap_item;
1065
1066 tree_rmap_item->next = rmap_item;
1067 rmap_item->address |= STABLE_FLAG;
1068
1069 ksm_pages_sharing++;
1070}
1071
1072/*
1073 * cmp_and_merge_page - first see if page can be merged into the stable tree;
1074 * if not, compare checksum to previous and if it's the same, see if page can
1075 * be inserted into the unstable tree, or merged with a page already there and
1076 * both transferred to the stable tree.
1077 *
1078 * @page: the page for which we are searching an identical page.
1079 * @rmap_item: the reverse mapping into the virtual address of this page
1080 */
1081static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1082{
1083 struct page *page2[1];
1084 struct rmap_item *tree_rmap_item;
1085 unsigned int checksum;
1086 int err;
1087
1088 if (in_stable_tree(rmap_item))
1089 remove_rmap_item_from_tree(rmap_item);
1090
1091 /* We first start with searching the page inside the stable tree */
1092 tree_rmap_item = stable_tree_search(page, page2, rmap_item);
1093 if (tree_rmap_item) {
1094 if (page == page2[0]) /* forked */
1095 err = 0;
1096 else
1097 err = try_to_merge_with_ksm_page(rmap_item->mm,
1098 rmap_item->address,
1099 page, page2[0]);
1100 put_page(page2[0]);
1101
1102 if (!err) {
1103 /*
1104 * The page was successfully merged:
1105 * add its rmap_item to the stable tree.
1106 */
1107 stable_tree_append(rmap_item, tree_rmap_item);
1108 }
1109 return;
1110 }
1111
1112 /*
1113 * A ksm page might have got here by fork, but its other
1114 * references have already been removed from the stable tree.
1115 * Or it might be left over from a break_ksm which failed
1116 * when the mem_cgroup had reached its limit: try again now.
1117 */
1118 if (PageKsm(page))
1119 break_cow(rmap_item->mm, rmap_item->address);
1120
1121 /*
1122	 * If the checksum of the page has changed since the last time we
1123	 * calculated it, this page is probably being changed frequently: so we
1124	 * don't want to insert it into the unstable tree, and we don't want
1125	 * to waste our time searching for something identical to it there.
1126 */
1127 checksum = calc_checksum(page);
1128 if (rmap_item->oldchecksum != checksum) {
1129 rmap_item->oldchecksum = checksum;
1130 return;
1131 }
1132
1133 tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
1134 if (tree_rmap_item) {
1135 err = try_to_merge_two_pages(rmap_item->mm,
1136 rmap_item->address, page,
1137 tree_rmap_item->mm,
1138 tree_rmap_item->address, page2[0]);
1139 /*
1140 * As soon as we merge this page, we want to remove the
1141 * rmap_item of the page we have merged with from the unstable
1142		 * tree, and insert it instead as a new node in the stable tree.
1143 */
1144 if (!err) {
1145 rb_erase(&tree_rmap_item->node, &root_unstable_tree);
1146 tree_rmap_item->address &= ~NODE_FLAG;
1147 ksm_pages_unshared--;
1148
1149 /*
1150 * If we fail to insert the page into the stable tree,
1151 * we will have 2 virtual addresses that are pointing
1152 * to a ksm page left outside the stable tree,
1153 * in which case we need to break_cow on both.
1154 */
1155 if (stable_tree_insert(page2[0], tree_rmap_item))
1156 stable_tree_append(rmap_item, tree_rmap_item);
1157 else {
1158 break_cow(tree_rmap_item->mm,
1159 tree_rmap_item->address);
1160 break_cow(rmap_item->mm, rmap_item->address);
1161 }
1162 }
1163
1164 put_page(page2[0]);
1165 }
1166}
1167
1168static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
1169 struct list_head *cur,
1170 unsigned long addr)
1171{
1172 struct rmap_item *rmap_item;
1173
1174 while (cur != &mm_slot->rmap_list) {
1175 rmap_item = list_entry(cur, struct rmap_item, link);
1176 if ((rmap_item->address & PAGE_MASK) == addr) {
1177 if (!in_stable_tree(rmap_item))
1178 remove_rmap_item_from_tree(rmap_item);
1179 return rmap_item;
1180 }
1181 if (rmap_item->address > addr)
1182 break;
1183 cur = cur->next;
1184 remove_rmap_item_from_tree(rmap_item);
1185 list_del(&rmap_item->link);
1186 free_rmap_item(rmap_item);
1187 }
1188
1189 rmap_item = alloc_rmap_item();
1190 if (rmap_item) {
1191 /* It has already been zeroed */
1192 rmap_item->mm = mm_slot->mm;
1193 rmap_item->address = addr;
1194 list_add_tail(&rmap_item->link, cur);
1195 }
1196 return rmap_item;
1197}
1198
1199static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1200{
1201 struct mm_struct *mm;
1202 struct mm_slot *slot;
1203 struct vm_area_struct *vma;
1204 struct rmap_item *rmap_item;
1205
1206 if (list_empty(&ksm_mm_head.mm_list))
1207 return NULL;
1208
1209 slot = ksm_scan.mm_slot;
1210 if (slot == &ksm_mm_head) {
1211 root_unstable_tree = RB_ROOT;
1212
1213 spin_lock(&ksm_mmlist_lock);
1214 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
1215 ksm_scan.mm_slot = slot;
1216 spin_unlock(&ksm_mmlist_lock);
1217next_mm:
1218 ksm_scan.address = 0;
1219 ksm_scan.rmap_item = list_entry(&slot->rmap_list,
1220 struct rmap_item, link);
1221 }
1222
1223 mm = slot->mm;
1224 down_read(&mm->mmap_sem);
1225 if (ksm_test_exit(mm))
1226 vma = NULL;
1227 else
1228 vma = find_vma(mm, ksm_scan.address);
1229
1230 for (; vma; vma = vma->vm_next) {
1231 if (!(vma->vm_flags & VM_MERGEABLE))
1232 continue;
1233 if (ksm_scan.address < vma->vm_start)
1234 ksm_scan.address = vma->vm_start;
1235 if (!vma->anon_vma)
1236 ksm_scan.address = vma->vm_end;
1237
1238 while (ksm_scan.address < vma->vm_end) {
1239 if (ksm_test_exit(mm))
1240 break;
1241 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
1242 if (*page && PageAnon(*page)) {
1243 flush_anon_page(vma, *page, ksm_scan.address);
1244 flush_dcache_page(*page);
1245 rmap_item = get_next_rmap_item(slot,
1246 ksm_scan.rmap_item->link.next,
1247 ksm_scan.address);
1248 if (rmap_item) {
1249 ksm_scan.rmap_item = rmap_item;
1250 ksm_scan.address += PAGE_SIZE;
1251 } else
1252 put_page(*page);
1253 up_read(&mm->mmap_sem);
1254 return rmap_item;
1255 }
1256 if (*page)
1257 put_page(*page);
1258 ksm_scan.address += PAGE_SIZE;
1259 cond_resched();
1260 }
1261 }
1262
1263 if (ksm_test_exit(mm)) {
1264 ksm_scan.address = 0;
1265 ksm_scan.rmap_item = list_entry(&slot->rmap_list,
1266 struct rmap_item, link);
1267 }
1268 /*
1269 * Nuke all the rmap_items that are above this current rmap:
1270 * because there were no VM_MERGEABLE vmas with such addresses.
1271 */
1272 remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);
1273
1274 spin_lock(&ksm_mmlist_lock);
1275 ksm_scan.mm_slot = list_entry(slot->mm_list.next,
1276 struct mm_slot, mm_list);
1277 if (ksm_scan.address == 0) {
1278 /*
1279 * We've completed a full scan of all vmas, holding mmap_sem
1280 * throughout, and found no VM_MERGEABLE: so do the same as
1281 * __ksm_exit does to remove this mm from all our lists now.
1282 * This applies either when cleaning up after __ksm_exit
1283 * (but beware: we can reach here even before __ksm_exit),
1284 * or when all VM_MERGEABLE areas have been unmapped (and
1285 * mmap_sem then protects against race with MADV_MERGEABLE).
1286 */
1287 hlist_del(&slot->link);
1288 list_del(&slot->mm_list);
1289 spin_unlock(&ksm_mmlist_lock);
1290
1291 free_mm_slot(slot);
1292 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1293 up_read(&mm->mmap_sem);
1294 mmdrop(mm);
1295 } else {
1296 spin_unlock(&ksm_mmlist_lock);
1297 up_read(&mm->mmap_sem);
1298 }
1299
1300 /* Repeat until we've completed scanning the whole list */
1301 slot = ksm_scan.mm_slot;
1302 if (slot != &ksm_mm_head)
1303 goto next_mm;
1304
1305 ksm_scan.seqnr++;
1306 return NULL;
1307}
1308
1309/**
1310 * ksm_do_scan - the ksm scanner main worker function.
1311 * @scan_npages: number of pages we want to scan before we return.
1312 */
1313static void ksm_do_scan(unsigned int scan_npages)
1314{
1315 struct rmap_item *rmap_item;
1316 struct page *page;
1317
1318 while (scan_npages--) {
1319 cond_resched();
1320 rmap_item = scan_get_next_rmap_item(&page);
1321 if (!rmap_item)
1322 return;
1323 if (!PageKsm(page) || !in_stable_tree(rmap_item))
1324 cmp_and_merge_page(page, rmap_item);
1325 else if (page_mapcount(page) == 1) {
1326 /*
1327 * Replace now-unshared ksm page by ordinary page.
1328 */
1329 break_cow(rmap_item->mm, rmap_item->address);
1330 remove_rmap_item_from_tree(rmap_item);
1331 rmap_item->oldchecksum = calc_checksum(page);
1332 }
1333 put_page(page);
1334 }
1335}
1336
1337static int ksmd_should_run(void)
1338{
1339 return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
1340}
1341
1342static int ksm_scan_thread(void *nothing)
1343{
1344 set_user_nice(current, 5);
1345
1346 while (!kthread_should_stop()) {
1347 mutex_lock(&ksm_thread_mutex);
1348 if (ksmd_should_run())
1349 ksm_do_scan(ksm_thread_pages_to_scan);
1350 mutex_unlock(&ksm_thread_mutex);
1351
1352 if (ksmd_should_run()) {
1353 schedule_timeout_interruptible(
1354 msecs_to_jiffies(ksm_thread_sleep_millisecs));
1355 } else {
1356 wait_event_interruptible(ksm_thread_wait,
1357 ksmd_should_run() || kthread_should_stop());
1358 }
1359 }
1360 return 0;
1361}
1362
1363int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1364 unsigned long end, int advice, unsigned long *vm_flags)
1365{
1366 struct mm_struct *mm = vma->vm_mm;
1367 int err;
1368
1369 switch (advice) {
1370 case MADV_MERGEABLE:
1371 /*
1372 * Be somewhat over-protective for now!
1373 */
1374 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
1375 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1376 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1377 VM_MIXEDMAP | VM_SAO))
1378 return 0; /* just ignore the advice */
1379
1380 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
1381 err = __ksm_enter(mm);
1382 if (err)
1383 return err;
1384 }
1385
1386 *vm_flags |= VM_MERGEABLE;
1387 break;
1388
1389 case MADV_UNMERGEABLE:
1390 if (!(*vm_flags & VM_MERGEABLE))
1391 return 0; /* just ignore the advice */
1392
1393 if (vma->anon_vma) {
1394 err = unmerge_ksm_pages(vma, start, end);
1395 if (err)
1396 return err;
1397 }
1398
1399 *vm_flags &= ~VM_MERGEABLE;
1400 break;
1401 }
1402
1403 return 0;
1404}
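
For completeness, the user-space half of this interface is plain madvise(2) on page-aligned anonymous memory. A minimal sketch, assuming the installed headers already define MADV_MERGEABLE and MADV_UNMERGEABLE (older toolchains would need the values from the patched mman headers), with error handling kept to perror():

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 100 * 4096;
	char *buf;

	/* madvise() needs a page-aligned range, so use mmap rather than malloc. */
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0x5a, len);		/* identical contents: good merge candidates */

	if (madvise(buf, len, MADV_MERGEABLE))
		perror("madvise(MADV_MERGEABLE)");

	/* ... run the workload; ksmd merges duplicates over its next scans ... */
	sleep(60);

	if (madvise(buf, len, MADV_UNMERGEABLE))	/* unmerge_ksm_pages breaks COW */
		perror("madvise(MADV_UNMERGEABLE)");

	munmap(buf, len);
	return 0;
}
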
1405
1406int __ksm_enter(struct mm_struct *mm)
1407{
1408 struct mm_slot *mm_slot;
1409 int needs_wakeup;
1410
1411 mm_slot = alloc_mm_slot();
1412 if (!mm_slot)
1413 return -ENOMEM;
1414
1415 /* Check ksm_run too? Would need tighter locking */
1416 needs_wakeup = list_empty(&ksm_mm_head.mm_list);
1417
1418 spin_lock(&ksm_mmlist_lock);
1419 insert_to_mm_slots_hash(mm, mm_slot);
1420 /*
1421 * Insert just behind the scanning cursor, to let the area settle
1422 * down a little; when fork is followed by immediate exec, we don't
1423 * want ksmd to waste time setting up and tearing down an rmap_list.
1424 */
1425 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
1426 spin_unlock(&ksm_mmlist_lock);
1427
1428 set_bit(MMF_VM_MERGEABLE, &mm->flags);
1429 atomic_inc(&mm->mm_count);
1430
1431 if (needs_wakeup)
1432 wake_up_interruptible(&ksm_thread_wait);
1433
1434 return 0;
1435}
1436
1437void __ksm_exit(struct mm_struct *mm)
1438{
1439 struct mm_slot *mm_slot;
1440 int easy_to_free = 0;
1441
1442 /*
1443 * This process is exiting: if it's straightforward (as is the
1444 * case when ksmd was never running), free mm_slot immediately.
1445 * But if it's at the cursor or has rmap_items linked to it, use
1446 * mmap_sem to synchronize with any break_cows before pagetables
1447 * are freed, and leave the mm_slot on the list for ksmd to free.
1448 * Beware: ksm may already have noticed it exiting and freed the slot.
1449 */
1450
1451 spin_lock(&ksm_mmlist_lock);
1452 mm_slot = get_mm_slot(mm);
1453 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
1454 if (list_empty(&mm_slot->rmap_list)) {
1455 hlist_del(&mm_slot->link);
1456 list_del(&mm_slot->mm_list);
1457 easy_to_free = 1;
1458 } else {
1459 list_move(&mm_slot->mm_list,
1460 &ksm_scan.mm_slot->mm_list);
1461 }
1462 }
1463 spin_unlock(&ksm_mmlist_lock);
1464
1465 if (easy_to_free) {
1466 free_mm_slot(mm_slot);
1467 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1468 mmdrop(mm);
1469 } else if (mm_slot) {
1470 down_write(&mm->mmap_sem);
1471 up_write(&mm->mmap_sem);
1472 }
1473}
1474
1475#ifdef CONFIG_SYSFS
1476/*
1477 * This all compiles without CONFIG_SYSFS, but is a waste of space.
1478 */
1479
1480#define KSM_ATTR_RO(_name) \
1481 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1482#define KSM_ATTR(_name) \
1483 static struct kobj_attribute _name##_attr = \
1484 __ATTR(_name, 0644, _name##_show, _name##_store)
1485
1486static ssize_t sleep_millisecs_show(struct kobject *kobj,
1487 struct kobj_attribute *attr, char *buf)
1488{
1489 return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
1490}
1491
1492static ssize_t sleep_millisecs_store(struct kobject *kobj,
1493 struct kobj_attribute *attr,
1494 const char *buf, size_t count)
1495{
1496 unsigned long msecs;
1497 int err;
1498
1499 err = strict_strtoul(buf, 10, &msecs);
1500 if (err || msecs > UINT_MAX)
1501 return -EINVAL;
1502
1503 ksm_thread_sleep_millisecs = msecs;
1504
1505 return count;
1506}
1507KSM_ATTR(sleep_millisecs);
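
For readers unfamiliar with the kobj_attribute plumbing, the KSM_ATTR(sleep_millisecs) invocation just above expands, per the two macros at the top of this block, to roughly the following (shown purely as an explanatory comment, not additional code):

/*
 * static struct kobj_attribute sleep_millisecs_attr =
 *	__ATTR(sleep_millisecs, 0644, sleep_millisecs_show,
 *	       sleep_millisecs_store);
 *
 * i.e. a mode-0644 sysfs file named "sleep_millisecs" wired to the _show
 * and _store handlers above; KSM_ATTR_RO() likewise uses __ATTR_RO() to
 * create a read-only file bound only to its _show handler.
 */
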
1508
1509static ssize_t pages_to_scan_show(struct kobject *kobj,
1510 struct kobj_attribute *attr, char *buf)
1511{
1512 return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
1513}
1514
1515static ssize_t pages_to_scan_store(struct kobject *kobj,
1516 struct kobj_attribute *attr,
1517 const char *buf, size_t count)
1518{
1519 int err;
1520 unsigned long nr_pages;
1521
1522 err = strict_strtoul(buf, 10, &nr_pages);
1523 if (err || nr_pages > UINT_MAX)
1524 return -EINVAL;
1525
1526 ksm_thread_pages_to_scan = nr_pages;
1527
1528 return count;
1529}
1530KSM_ATTR(pages_to_scan);
1531
1532static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
1533 char *buf)
1534{
1535 return sprintf(buf, "%u\n", ksm_run);
1536}
1537
1538static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1539 const char *buf, size_t count)
1540{
1541 int err;
1542 unsigned long flags;
1543
1544 err = strict_strtoul(buf, 10, &flags);
1545 if (err || flags > UINT_MAX)
1546 return -EINVAL;
1547 if (flags > KSM_RUN_UNMERGE)
1548 return -EINVAL;
1549
1550 /*
1551 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
1552 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
1553 * breaking COW to free the unswappable pages_shared (but leaves
1554 * mm_slots on the list for when ksmd may be set running again).
1555 */
1556
1557 mutex_lock(&ksm_thread_mutex);
1558 if (ksm_run != flags) {
1559 ksm_run = flags;
1560 if (flags & KSM_RUN_UNMERGE) {
1561 current->flags |= PF_OOM_ORIGIN;
1562 err = unmerge_and_remove_all_rmap_items();
1563 current->flags &= ~PF_OOM_ORIGIN;
1564 if (err) {
1565 ksm_run = KSM_RUN_STOP;
1566 count = err;
1567 }
1568 }
1569 }
1570 mutex_unlock(&ksm_thread_mutex);
1571
1572 if (flags & KSM_RUN_MERGE)
1573 wake_up_interruptible(&ksm_thread_wait);
1574
1575 return count;
1576}
1577KSM_ATTR(run);
1578
1579static ssize_t max_kernel_pages_store(struct kobject *kobj,
1580 struct kobj_attribute *attr,
1581 const char *buf, size_t count)
1582{
1583 int err;
1584 unsigned long nr_pages;
1585
1586 err = strict_strtoul(buf, 10, &nr_pages);
1587 if (err)
1588 return -EINVAL;
1589
1590 ksm_max_kernel_pages = nr_pages;
1591
1592 return count;
1593}
1594
1595static ssize_t max_kernel_pages_show(struct kobject *kobj,
1596 struct kobj_attribute *attr, char *buf)
1597{
1598 return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
1599}
1600KSM_ATTR(max_kernel_pages);
1601
1602static ssize_t pages_shared_show(struct kobject *kobj,
1603 struct kobj_attribute *attr, char *buf)
1604{
1605 return sprintf(buf, "%lu\n", ksm_pages_shared);
1606}
1607KSM_ATTR_RO(pages_shared);
1608
1609static ssize_t pages_sharing_show(struct kobject *kobj,
1610 struct kobj_attribute *attr, char *buf)
1611{
1612 return sprintf(buf, "%lu\n", ksm_pages_sharing);
1613}
1614KSM_ATTR_RO(pages_sharing);
1615
1616static ssize_t pages_unshared_show(struct kobject *kobj,
1617 struct kobj_attribute *attr, char *buf)
1618{
1619 return sprintf(buf, "%lu\n", ksm_pages_unshared);
1620}
1621KSM_ATTR_RO(pages_unshared);
1622
1623static ssize_t pages_volatile_show(struct kobject *kobj,
1624 struct kobj_attribute *attr, char *buf)
1625{
1626 long ksm_pages_volatile;
1627
1628 ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
1629 - ksm_pages_sharing - ksm_pages_unshared;
1630 /*
1631 * It was not worth any locking to calculate that statistic,
1632 * but it might therefore sometimes be negative: conceal that.
1633 */
1634 if (ksm_pages_volatile < 0)
1635 ksm_pages_volatile = 0;
1636 return sprintf(buf, "%ld\n", ksm_pages_volatile);
1637}
1638KSM_ATTR_RO(pages_volatile);
1639
1640static ssize_t full_scans_show(struct kobject *kobj,
1641 struct kobj_attribute *attr, char *buf)
1642{
1643 return sprintf(buf, "%lu\n", ksm_scan.seqnr);
1644}
1645KSM_ATTR_RO(full_scans);
1646
1647static struct attribute *ksm_attrs[] = {
1648 &sleep_millisecs_attr.attr,
1649 &pages_to_scan_attr.attr,
1650 &run_attr.attr,
1651 &max_kernel_pages_attr.attr,
1652 &pages_shared_attr.attr,
1653 &pages_sharing_attr.attr,
1654 &pages_unshared_attr.attr,
1655 &pages_volatile_attr.attr,
1656 &full_scans_attr.attr,
1657 NULL,
1658};
1659
1660static struct attribute_group ksm_attr_group = {
1661 .attrs = ksm_attrs,
1662 .name = "ksm",
1663};
1664#endif /* CONFIG_SYSFS */
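
Putting the sysfs pieces together: once ksm_init() below registers this group on mm_kobj, the files live under /sys/kernel/mm/ksm/. A minimal user-space sketch of driving them (assumes root privileges, error handling trimmed):

#include <stdio.h>

static int ksm_write(const char *name, const char *val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

static unsigned long ksm_read(const char *name)
{
	char path[128];
	unsigned long val = 0;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
	f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%lu", &val) != 1)
			val = 0;
		fclose(f);
	}
	return val;
}

int main(void)
{
	ksm_write("pages_to_scan", "100");	/* ksm_thread_pages_to_scan */
	ksm_write("sleep_millisecs", "20");	/* ksm_thread_sleep_millisecs */
	ksm_write("run", "1");			/* KSM_RUN_MERGE: start ksmd */

	printf("pages_shared=%lu pages_sharing=%lu full_scans=%lu\n",
	       ksm_read("pages_shared"), ksm_read("pages_sharing"),
	       ksm_read("full_scans"));
	return 0;
}
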
1665
1666static int __init ksm_init(void)
1667{
1668 struct task_struct *ksm_thread;
1669 int err;
1670
1671 ksm_max_kernel_pages = totalram_pages / 4;
1672
1673 err = ksm_slab_init();
1674 if (err)
1675 goto out;
1676
1677 err = mm_slots_hash_init();
1678 if (err)
1679 goto out_free1;
1680
1681 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
1682 if (IS_ERR(ksm_thread)) {
1683 printk(KERN_ERR "ksm: creating kthread failed\n");
1684 err = PTR_ERR(ksm_thread);
1685 goto out_free2;
1686 }
1687
1688#ifdef CONFIG_SYSFS
1689 err = sysfs_create_group(mm_kobj, &ksm_attr_group);
1690 if (err) {
1691 printk(KERN_ERR "ksm: register sysfs failed\n");
1692 kthread_stop(ksm_thread);
1693 goto out_free2;
1694 }
1695#else
1696 ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
1697
1698#endif /* CONFIG_SYSFS */
1699
1700 return 0;
1701
1702out_free2:
1703 mm_slots_hash_free();
1704out_free1:
1705 ksm_slab_free();
1706out:
1707 return err;
1708}
1709module_init(ksm_init)
diff --git a/mm/madvise.c b/mm/madvise.c
index 76eb4193acdd..35b1479b7c9d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,6 +11,7 @@
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/hugetlb.h> 12#include <linux/hugetlb.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/ksm.h>
14 15
15/* 16/*
16 * Any behaviour which results in changes to the vma->vm_flags needs to 17 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -41,7 +42,7 @@ static long madvise_behavior(struct vm_area_struct * vma,
41 struct mm_struct * mm = vma->vm_mm; 42 struct mm_struct * mm = vma->vm_mm;
42 int error = 0; 43 int error = 0;
43 pgoff_t pgoff; 44 pgoff_t pgoff;
44 int new_flags = vma->vm_flags; 45 unsigned long new_flags = vma->vm_flags;
45 46
46 switch (behavior) { 47 switch (behavior) {
47 case MADV_NORMAL: 48 case MADV_NORMAL:
@@ -57,8 +58,18 @@ static long madvise_behavior(struct vm_area_struct * vma,
57 new_flags |= VM_DONTCOPY; 58 new_flags |= VM_DONTCOPY;
58 break; 59 break;
59 case MADV_DOFORK: 60 case MADV_DOFORK:
61 if (vma->vm_flags & VM_IO) {
62 error = -EINVAL;
63 goto out;
64 }
60 new_flags &= ~VM_DONTCOPY; 65 new_flags &= ~VM_DONTCOPY;
61 break; 66 break;
67 case MADV_MERGEABLE:
68 case MADV_UNMERGEABLE:
69 error = ksm_madvise(vma, start, end, behavior, &new_flags);
70 if (error)
71 goto out;
72 break;
62 } 73 }
63 74
64 if (new_flags == vma->vm_flags) { 75 if (new_flags == vma->vm_flags) {
@@ -207,41 +218,46 @@ static long madvise_remove(struct vm_area_struct *vma,
207 return error; 218 return error;
208} 219}
209 220
221#ifdef CONFIG_MEMORY_FAILURE
222/*
223 * Error injection support for memory error handling.
224 */
225static int madvise_hwpoison(unsigned long start, unsigned long end)
226{
227 int ret = 0;
228
229 if (!capable(CAP_SYS_ADMIN))
230 return -EPERM;
231 for (; start < end; start += PAGE_SIZE) {
232 struct page *p;
233 int ret = get_user_pages(current, current->mm, start, 1,
234 0, 0, &p, NULL);
235 if (ret != 1)
236 return ret;
237 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
238 page_to_pfn(p), start);
239 /* Ignore return value for now */
240 __memory_failure(page_to_pfn(p), 0, 1);
241 put_page(p);
242 }
243 return ret;
244}
245#endif
246
210static long 247static long
211madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, 248madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
212 unsigned long start, unsigned long end, int behavior) 249 unsigned long start, unsigned long end, int behavior)
213{ 250{
214 long error;
215
216 switch (behavior) { 251 switch (behavior) {
217 case MADV_DOFORK:
218 if (vma->vm_flags & VM_IO) {
219 error = -EINVAL;
220 break;
221 }
222 case MADV_DONTFORK:
223 case MADV_NORMAL:
224 case MADV_SEQUENTIAL:
225 case MADV_RANDOM:
226 error = madvise_behavior(vma, prev, start, end, behavior);
227 break;
228 case MADV_REMOVE: 252 case MADV_REMOVE:
229 error = madvise_remove(vma, prev, start, end); 253 return madvise_remove(vma, prev, start, end);
230 break;
231
232 case MADV_WILLNEED: 254 case MADV_WILLNEED:
233 error = madvise_willneed(vma, prev, start, end); 255 return madvise_willneed(vma, prev, start, end);
234 break;
235
236 case MADV_DONTNEED: 256 case MADV_DONTNEED:
237 error = madvise_dontneed(vma, prev, start, end); 257 return madvise_dontneed(vma, prev, start, end);
238 break;
239
240 default: 258 default:
241 BUG(); 259 return madvise_behavior(vma, prev, start, end, behavior);
242 break;
243 } 260 }
244 return error;
245} 261}
246 262
247static int 263static int
@@ -256,12 +272,17 @@ madvise_behavior_valid(int behavior)
256 case MADV_REMOVE: 272 case MADV_REMOVE:
257 case MADV_WILLNEED: 273 case MADV_WILLNEED:
258 case MADV_DONTNEED: 274 case MADV_DONTNEED:
275#ifdef CONFIG_KSM
276 case MADV_MERGEABLE:
277 case MADV_UNMERGEABLE:
278#endif
259 return 1; 279 return 1;
260 280
261 default: 281 default:
262 return 0; 282 return 0;
263 } 283 }
264} 284}
285
265/* 286/*
266 * The madvise(2) system call. 287 * The madvise(2) system call.
267 * 288 *
@@ -286,6 +307,12 @@ madvise_behavior_valid(int behavior)
286 * so the kernel can free resources associated with it. 307 * so the kernel can free resources associated with it.
287 * MADV_REMOVE - the application wants to free up the given range of 308 * MADV_REMOVE - the application wants to free up the given range of
288 * pages and associated backing store. 309 * pages and associated backing store.
310 * MADV_DONTFORK - omit this area from child's address space when forking:
311 * typically, to avoid COWing pages pinned by get_user_pages().
312 * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
313 * MADV_MERGEABLE - the application recommends that KSM try to merge pages in
314 * this area with pages of identical content from other such areas.
315 * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
289 * 316 *
290 * return values: 317 * return values:
291 * zero - success 318 * zero - success
@@ -307,6 +334,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
307 int write; 334 int write;
308 size_t len; 335 size_t len;
309 336
337#ifdef CONFIG_MEMORY_FAILURE
338 if (behavior == MADV_HWPOISON)
339 return madvise_hwpoison(start, start+len_in);
340#endif
310 if (!madvise_behavior_valid(behavior)) 341 if (!madvise_behavior_valid(behavior))
311 return error; 342 return error;
312 343
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fd4529d86de5..f99f5991d6bb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
29#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/limits.h> 30#include <linux/limits.h>
31#include <linux/mutex.h> 31#include <linux/mutex.h>
32#include <linux/rbtree.h>
32#include <linux/slab.h> 33#include <linux/slab.h>
33#include <linux/swap.h> 34#include <linux/swap.h>
34#include <linux/spinlock.h> 35#include <linux/spinlock.h>
@@ -43,6 +44,7 @@
43 44
44struct cgroup_subsys mem_cgroup_subsys __read_mostly; 45struct cgroup_subsys mem_cgroup_subsys __read_mostly;
45#define MEM_CGROUP_RECLAIM_RETRIES 5 46#define MEM_CGROUP_RECLAIM_RETRIES 5
47struct mem_cgroup *root_mem_cgroup __read_mostly;
46 48
47#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 49#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
48/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 50/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
@@ -53,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
53#endif 55#endif
54 56
55static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ 57static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */
58#define SOFTLIMIT_EVENTS_THRESH (1000)
56 59
57/* 60/*
58 * Statistics for memory cgroup. 61 * Statistics for memory cgroup.
@@ -66,6 +69,8 @@ enum mem_cgroup_stat_index {
66 MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ 69 MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */
67 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 70 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
68 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 71 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
72 MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
73 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
69 74
70 MEM_CGROUP_STAT_NSTATS, 75 MEM_CGROUP_STAT_NSTATS,
71}; 76};
@@ -78,6 +83,20 @@ struct mem_cgroup_stat {
78 struct mem_cgroup_stat_cpu cpustat[0]; 83 struct mem_cgroup_stat_cpu cpustat[0];
79}; 84};
80 85
86static inline void
87__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
88 enum mem_cgroup_stat_index idx)
89{
90 stat->count[idx] = 0;
91}
92
93static inline s64
94__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
95 enum mem_cgroup_stat_index idx)
96{
97 return stat->count[idx];
98}
99
81/* 100/*
82 * For accounting under irq disable, no need for increment preempt count. 101 * For accounting under irq disable, no need for increment preempt count.
83 */ 102 */
@@ -117,6 +136,12 @@ struct mem_cgroup_per_zone {
117 unsigned long count[NR_LRU_LISTS]; 136 unsigned long count[NR_LRU_LISTS];
118 137
119 struct zone_reclaim_stat reclaim_stat; 138 struct zone_reclaim_stat reclaim_stat;
139 struct rb_node tree_node; /* RB tree node */
140 unsigned long long usage_in_excess;/* Set to the value by which */
141 /* the soft limit is exceeded*/
142 bool on_tree;
143 struct mem_cgroup *mem; /* Back pointer, we cannot */
144 /* use container_of */
120}; 145};
121/* Macro for accessing counter */ 146/* Macro for accessing counter */
122#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 147#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -130,6 +155,26 @@ struct mem_cgroup_lru_info {
130}; 155};
131 156
132/* 157/*
158 * Cgroups above their limits are maintained in a RB-Tree, independent of
159 * their hierarchy representation
160 */
161
162struct mem_cgroup_tree_per_zone {
163 struct rb_root rb_root;
164 spinlock_t lock;
165};
166
167struct mem_cgroup_tree_per_node {
168 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
169};
170
171struct mem_cgroup_tree {
172 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
173};
174
175static struct mem_cgroup_tree soft_limit_tree __read_mostly;
176
177/*
133 * The memory controller data structure. The memory controller controls both 178 * The memory controller data structure. The memory controller controls both
134 * page cache and RSS per cgroup. We would eventually like to provide 179 * page cache and RSS per cgroup. We would eventually like to provide
135 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 180 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -186,6 +231,13 @@ struct mem_cgroup {
186 struct mem_cgroup_stat stat; 231 struct mem_cgroup_stat stat;
187}; 232};
188 233
234/*
235 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
236 * limit reclaim to prevent infinite loops, if they ever occur.
237 */
238#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
239#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
240
189enum charge_type { 241enum charge_type {
190 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 242 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
191 MEM_CGROUP_CHARGE_TYPE_MAPPED, 243 MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -200,13 +252,8 @@ enum charge_type {
200#define PCGF_CACHE (1UL << PCG_CACHE) 252#define PCGF_CACHE (1UL << PCG_CACHE)
201#define PCGF_USED (1UL << PCG_USED) 253#define PCGF_USED (1UL << PCG_USED)
202#define PCGF_LOCK (1UL << PCG_LOCK) 254#define PCGF_LOCK (1UL << PCG_LOCK)
203static const unsigned long 255/* Not used, but added here for completeness */
204pcg_default_flags[NR_CHARGE_TYPE] = { 256#define PCGF_ACCT (1UL << PCG_ACCT)
205 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
206 PCGF_USED | PCGF_LOCK, /* Anon */
207 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
208 0, /* FORCE */
209};
210 257
211/* for encoding cft->private value on file */ 258/* for encoding cft->private value on file */
212#define _MEM (0) 259#define _MEM (0)
@@ -215,15 +262,237 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
215#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 262#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
216#define MEMFILE_ATTR(val) ((val) & 0xffff) 263#define MEMFILE_ATTR(val) ((val) & 0xffff)
217 264
265/*
266 * Reclaim flags for mem_cgroup_hierarchical_reclaim
267 */
268#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
269#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
270#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
271#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
272#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
273#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
274
218static void mem_cgroup_get(struct mem_cgroup *mem); 275static void mem_cgroup_get(struct mem_cgroup *mem);
219static void mem_cgroup_put(struct mem_cgroup *mem); 276static void mem_cgroup_put(struct mem_cgroup *mem);
220static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 277static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
221 278
279static struct mem_cgroup_per_zone *
280mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
281{
282 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
283}
284
285static struct mem_cgroup_per_zone *
286page_cgroup_zoneinfo(struct page_cgroup *pc)
287{
288 struct mem_cgroup *mem = pc->mem_cgroup;
289 int nid = page_cgroup_nid(pc);
290 int zid = page_cgroup_zid(pc);
291
292 if (!mem)
293 return NULL;
294
295 return mem_cgroup_zoneinfo(mem, nid, zid);
296}
297
298static struct mem_cgroup_tree_per_zone *
299soft_limit_tree_node_zone(int nid, int zid)
300{
301 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
302}
303
304static struct mem_cgroup_tree_per_zone *
305soft_limit_tree_from_page(struct page *page)
306{
307 int nid = page_to_nid(page);
308 int zid = page_zonenum(page);
309
310 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
311}
312
313static void
314__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
315 struct mem_cgroup_per_zone *mz,
316 struct mem_cgroup_tree_per_zone *mctz,
317 unsigned long long new_usage_in_excess)
318{
319 struct rb_node **p = &mctz->rb_root.rb_node;
320 struct rb_node *parent = NULL;
321 struct mem_cgroup_per_zone *mz_node;
322
323 if (mz->on_tree)
324 return;
325
326 mz->usage_in_excess = new_usage_in_excess;
327 if (!mz->usage_in_excess)
328 return;
329 while (*p) {
330 parent = *p;
331 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
332 tree_node);
333 if (mz->usage_in_excess < mz_node->usage_in_excess)
334 p = &(*p)->rb_left;
335 /*
336 * We can't avoid mem cgroups that are over their soft
337 * limit by the same amount
338 */
339 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
340 p = &(*p)->rb_right;
341 }
342 rb_link_node(&mz->tree_node, parent, p);
343 rb_insert_color(&mz->tree_node, &mctz->rb_root);
344 mz->on_tree = true;
345}
346
347static void
348__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
349 struct mem_cgroup_per_zone *mz,
350 struct mem_cgroup_tree_per_zone *mctz)
351{
352 if (!mz->on_tree)
353 return;
354 rb_erase(&mz->tree_node, &mctz->rb_root);
355 mz->on_tree = false;
356}
357
358static void
359mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
360 struct mem_cgroup_per_zone *mz,
361 struct mem_cgroup_tree_per_zone *mctz)
362{
363 spin_lock(&mctz->lock);
364 __mem_cgroup_remove_exceeded(mem, mz, mctz);
365 spin_unlock(&mctz->lock);
366}
367
368static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
369{
370 bool ret = false;
371 int cpu;
372 s64 val;
373 struct mem_cgroup_stat_cpu *cpustat;
374
375 cpu = get_cpu();
376 cpustat = &mem->stat.cpustat[cpu];
377 val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
378 if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
379 __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
380 ret = true;
381 }
382 put_cpu();
383 return ret;
384}
385
386static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
387{
388 unsigned long long excess;
389 struct mem_cgroup_per_zone *mz;
390 struct mem_cgroup_tree_per_zone *mctz;
391 int nid = page_to_nid(page);
392 int zid = page_zonenum(page);
393 mctz = soft_limit_tree_from_page(page);
394
395 /*
396	 * Necessary to update all ancestors when hierarchy is used,
397	 * because their event counter is not touched.
398 */
399 for (; mem; mem = parent_mem_cgroup(mem)) {
400 mz = mem_cgroup_zoneinfo(mem, nid, zid);
401 excess = res_counter_soft_limit_excess(&mem->res);
402 /*
403 * We have to update the tree if mz is on RB-tree or
404 * mem is over its softlimit.
405 */
406 if (excess || mz->on_tree) {
407 spin_lock(&mctz->lock);
408 /* if on-tree, remove it */
409 if (mz->on_tree)
410 __mem_cgroup_remove_exceeded(mem, mz, mctz);
411 /*
412 * Insert again. mz->usage_in_excess will be updated.
413 * If excess is 0, no tree ops.
414 */
415 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
416 spin_unlock(&mctz->lock);
417 }
418 }
419}
420
421static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
422{
423 int node, zone;
424 struct mem_cgroup_per_zone *mz;
425 struct mem_cgroup_tree_per_zone *mctz;
426
427 for_each_node_state(node, N_POSSIBLE) {
428 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
429 mz = mem_cgroup_zoneinfo(mem, node, zone);
430 mctz = soft_limit_tree_node_zone(node, zone);
431 mem_cgroup_remove_exceeded(mem, mz, mctz);
432 }
433 }
434}
435
436static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
437{
438 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
439}
440
441static struct mem_cgroup_per_zone *
442__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
443{
444 struct rb_node *rightmost = NULL;
445 struct mem_cgroup_per_zone *mz;
446
447retry:
448 mz = NULL;
449 rightmost = rb_last(&mctz->rb_root);
450 if (!rightmost)
451 goto done; /* Nothing to reclaim from */
452
453 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
454 /*
455 * Remove the node now but someone else can add it back,
456 * we will to add it back at the end of reclaim to its correct
457 * position in the tree.
458 */
459 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
460 if (!res_counter_soft_limit_excess(&mz->mem->res) ||
461 !css_tryget(&mz->mem->css))
462 goto retry;
463done:
464 return mz;
465}
466
467static struct mem_cgroup_per_zone *
468mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
469{
470 struct mem_cgroup_per_zone *mz;
471
472 spin_lock(&mctz->lock);
473 mz = __mem_cgroup_largest_soft_limit_node(mctz);
474 spin_unlock(&mctz->lock);
475 return mz;
476}
477
478static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
479 bool charge)
480{
481 int val = (charge) ? 1 : -1;
482 struct mem_cgroup_stat *stat = &mem->stat;
483 struct mem_cgroup_stat_cpu *cpustat;
484 int cpu = get_cpu();
485
486 cpustat = &stat->cpustat[cpu];
487 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
488 put_cpu();
489}
490
222static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 491static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
223 struct page_cgroup *pc, 492 struct page_cgroup *pc,
224 bool charge) 493 bool charge)
225{ 494{
226 int val = (charge)? 1 : -1; 495 int val = (charge) ? 1 : -1;
227 struct mem_cgroup_stat *stat = &mem->stat; 496 struct mem_cgroup_stat *stat = &mem->stat;
228 struct mem_cgroup_stat_cpu *cpustat; 497 struct mem_cgroup_stat_cpu *cpustat;
229 int cpu = get_cpu(); 498 int cpu = get_cpu();
@@ -240,28 +509,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
240 else 509 else
241 __mem_cgroup_stat_add_safe(cpustat, 510 __mem_cgroup_stat_add_safe(cpustat,
242 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 511 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
512 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
243 put_cpu(); 513 put_cpu();
244} 514}
245 515
246static struct mem_cgroup_per_zone *
247mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
248{
249 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
250}
251
252static struct mem_cgroup_per_zone *
253page_cgroup_zoneinfo(struct page_cgroup *pc)
254{
255 struct mem_cgroup *mem = pc->mem_cgroup;
256 int nid = page_cgroup_nid(pc);
257 int zid = page_cgroup_zid(pc);
258
259 if (!mem)
260 return NULL;
261
262 return mem_cgroup_zoneinfo(mem, nid, zid);
263}
264
265static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 516static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
266 enum lru_list idx) 517 enum lru_list idx)
267{ 518{
@@ -354,6 +605,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
354 return ret; 605 return ret;
355} 606}
356 607
608static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
609{
610 return (mem == root_mem_cgroup);
611}
612
357/* 613/*
358 * Following LRU functions are allowed to be used without PCG_LOCK. 614 * Following LRU functions are allowed to be used without PCG_LOCK.
359 * Operations are called by routine of global LRU independently from memcg. 615 * Operations are called by routine of global LRU independently from memcg.
@@ -371,22 +627,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
371void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 627void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
372{ 628{
373 struct page_cgroup *pc; 629 struct page_cgroup *pc;
374 struct mem_cgroup *mem;
375 struct mem_cgroup_per_zone *mz; 630 struct mem_cgroup_per_zone *mz;
376 631
377 if (mem_cgroup_disabled()) 632 if (mem_cgroup_disabled())
378 return; 633 return;
379 pc = lookup_page_cgroup(page); 634 pc = lookup_page_cgroup(page);
380 /* can happen while we handle swapcache. */ 635 /* can happen while we handle swapcache. */
381 if (list_empty(&pc->lru) || !pc->mem_cgroup) 636 if (!TestClearPageCgroupAcctLRU(pc))
382 return; 637 return;
638 VM_BUG_ON(!pc->mem_cgroup);
383 /* 639 /*
384 * We don't check PCG_USED bit. It's cleared when the "page" is finally 640 * We don't check PCG_USED bit. It's cleared when the "page" is finally
385 * removed from global LRU. 641 * removed from global LRU.
386 */ 642 */
387 mz = page_cgroup_zoneinfo(pc); 643 mz = page_cgroup_zoneinfo(pc);
388 mem = pc->mem_cgroup;
389 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 644 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
645 if (mem_cgroup_is_root(pc->mem_cgroup))
646 return;
647 VM_BUG_ON(list_empty(&pc->lru));
390 list_del_init(&pc->lru); 648 list_del_init(&pc->lru);
391 return; 649 return;
392} 650}
@@ -410,8 +668,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
410 * For making pc->mem_cgroup visible, insert smp_rmb() here. 668 * For making pc->mem_cgroup visible, insert smp_rmb() here.
411 */ 669 */
412 smp_rmb(); 670 smp_rmb();
413 /* unused page is not rotated. */ 671 /* unused or root page is not rotated. */
414 if (!PageCgroupUsed(pc)) 672 if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
415 return; 673 return;
416 mz = page_cgroup_zoneinfo(pc); 674 mz = page_cgroup_zoneinfo(pc);
417 list_move(&pc->lru, &mz->lists[lru]); 675 list_move(&pc->lru, &mz->lists[lru]);
@@ -425,6 +683,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
425 if (mem_cgroup_disabled()) 683 if (mem_cgroup_disabled())
426 return; 684 return;
427 pc = lookup_page_cgroup(page); 685 pc = lookup_page_cgroup(page);
686 VM_BUG_ON(PageCgroupAcctLRU(pc));
428 /* 687 /*
429 * Used bit is set without atomic ops but after smp_wmb(). 688 * Used bit is set without atomic ops but after smp_wmb().
430 * For making pc->mem_cgroup visible, insert smp_rmb() here. 689 * For making pc->mem_cgroup visible, insert smp_rmb() here.
@@ -435,6 +694,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
435 694
436 mz = page_cgroup_zoneinfo(pc); 695 mz = page_cgroup_zoneinfo(pc);
437 MEM_CGROUP_ZSTAT(mz, lru) += 1; 696 MEM_CGROUP_ZSTAT(mz, lru) += 1;
697 SetPageCgroupAcctLRU(pc);
698 if (mem_cgroup_is_root(pc->mem_cgroup))
699 return;
438 list_add(&pc->lru, &mz->lists[lru]); 700 list_add(&pc->lru, &mz->lists[lru]);
439} 701}
440 702
@@ -469,7 +731,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
469 731
470 spin_lock_irqsave(&zone->lru_lock, flags); 732 spin_lock_irqsave(&zone->lru_lock, flags);
471 /* link when the page is linked to LRU but page_cgroup isn't */ 733 /* link when the page is linked to LRU but page_cgroup isn't */
472 if (PageLRU(page) && list_empty(&pc->lru)) 734 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
473 mem_cgroup_add_lru_list(page, page_lru(page)); 735 mem_cgroup_add_lru_list(page, page_lru(page));
474 spin_unlock_irqrestore(&zone->lru_lock, flags); 736 spin_unlock_irqrestore(&zone->lru_lock, flags);
475} 737}
@@ -648,7 +910,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
648 int nid = z->zone_pgdat->node_id; 910 int nid = z->zone_pgdat->node_id;
649 int zid = zone_idx(z); 911 int zid = zone_idx(z);
650 struct mem_cgroup_per_zone *mz; 912 struct mem_cgroup_per_zone *mz;
651 int lru = LRU_FILE * !!file + !!active; 913 int lru = LRU_FILE * file + active;
652 int ret; 914 int ret;
653 915
654 BUG_ON(!mem_cont); 916 BUG_ON(!mem_cont);
@@ -855,28 +1117,62 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
855 * If shrink==true, for avoiding to free too much, this returns immedieately. 1117 * If shrink==true, for avoiding to free too much, this returns immedieately.
856 */ 1118 */
857static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1119static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
858 gfp_t gfp_mask, bool noswap, bool shrink) 1120 struct zone *zone,
1121 gfp_t gfp_mask,
1122 unsigned long reclaim_options)
859{ 1123{
860 struct mem_cgroup *victim; 1124 struct mem_cgroup *victim;
861 int ret, total = 0; 1125 int ret, total = 0;
862 int loop = 0; 1126 int loop = 0;
1127 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1128 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1129 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1130 unsigned long excess = mem_cgroup_get_excess(root_mem);
863 1131
864 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1132 /* If memsw_is_minimum==1, swap-out is of-no-use. */
865 if (root_mem->memsw_is_minimum) 1133 if (root_mem->memsw_is_minimum)
866 noswap = true; 1134 noswap = true;
867 1135
868 while (loop < 2) { 1136 while (1) {
869 victim = mem_cgroup_select_victim(root_mem); 1137 victim = mem_cgroup_select_victim(root_mem);
870 if (victim == root_mem) 1138 if (victim == root_mem) {
871 loop++; 1139 loop++;
1140 if (loop >= 2) {
1141 /*
1142 * If we have not been able to reclaim
1143 * anything, it might be because there are
1144 * no reclaimable pages under this hierarchy
1145 */
1146 if (!check_soft || !total) {
1147 css_put(&victim->css);
1148 break;
1149 }
1150 /*
1151 * We want to do more targeted reclaim.
1152 * excess >> 2 is not too excessive, so we do not
1153 * reclaim too much, nor too little, which would keep
1154 * us coming back to reclaim from this cgroup
1155 */
1156 if (total >= (excess >> 2) ||
1157 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1158 css_put(&victim->css);
1159 break;
1160 }
1161 }
1162 }
872 if (!mem_cgroup_local_usage(&victim->stat)) { 1163 if (!mem_cgroup_local_usage(&victim->stat)) {
873 /* this cgroup's local usage == 0 */ 1164 /* this cgroup's local usage == 0 */
874 css_put(&victim->css); 1165 css_put(&victim->css);
875 continue; 1166 continue;
876 } 1167 }
877 /* we use swappiness of local cgroup */ 1168 /* we use swappiness of local cgroup */
878 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, 1169 if (check_soft)
879 get_swappiness(victim)); 1170 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1171 noswap, get_swappiness(victim), zone,
1172 zone->zone_pgdat->node_id);
1173 else
1174 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1175 noswap, get_swappiness(victim));
880 css_put(&victim->css); 1176 css_put(&victim->css);
881 /* 1177 /*
882 * At shrinking usage, we can't check we should stop here or 1178 * At shrinking usage, we can't check we should stop here or
@@ -886,7 +1182,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
886 if (shrink) 1182 if (shrink)
887 return ret; 1183 return ret;
888 total += ret; 1184 total += ret;
889 if (mem_cgroup_check_under_limit(root_mem)) 1185 if (check_soft) {
1186 if (res_counter_check_under_soft_limit(&root_mem->res))
1187 return total;
1188 } else if (mem_cgroup_check_under_limit(root_mem))
890 return 1 + total; 1189 return 1 + total;
891 } 1190 }
892 return total; 1191 return total;
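The hunk above folds the old noswap/shrink booleans into a single reclaim_options bit mask and adds a soft-limit mode. A minimal userspace sketch of that pattern (the flag names follow the diff; the bit values are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative values; the kernel defines its own bit positions. */
#define MEM_CGROUP_RECLAIM_NOSWAP  (1UL << 0)
#define MEM_CGROUP_RECLAIM_SHRINK  (1UL << 1)
#define MEM_CGROUP_RECLAIM_SOFT    (1UL << 2)

/* Decode the combined option word the way the new signature does. */
static void show_reclaim_mode(unsigned long reclaim_options)
{
	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;

	printf("noswap=%d shrink=%d check_soft=%d\n", noswap, shrink, check_soft);
}

int main(void)
{
	/* Callers pass any combination of modes in one argument. */
	show_reclaim_mode(MEM_CGROUP_RECLAIM_NOSWAP | MEM_CGROUP_RECLAIM_SHRINK);
	show_reclaim_mode(MEM_CGROUP_RECLAIM_SOFT);
	return 0;
}

Packing the modes into one word keeps the function signature stable as further reclaim variants are added.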
@@ -965,7 +1264,7 @@ done:
965 */ 1264 */
966static int __mem_cgroup_try_charge(struct mm_struct *mm, 1265static int __mem_cgroup_try_charge(struct mm_struct *mm,
967 gfp_t gfp_mask, struct mem_cgroup **memcg, 1266 gfp_t gfp_mask, struct mem_cgroup **memcg,
968 bool oom) 1267 bool oom, struct page *page)
969{ 1268{
970 struct mem_cgroup *mem, *mem_over_limit; 1269 struct mem_cgroup *mem, *mem_over_limit;
971 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1270 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
@@ -996,9 +1295,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
996 VM_BUG_ON(css_is_removed(&mem->css)); 1295 VM_BUG_ON(css_is_removed(&mem->css));
997 1296
998 while (1) { 1297 while (1) {
999 int ret; 1298 int ret = 0;
1000 bool noswap = false; 1299 unsigned long flags = 0;
1001 1300
1301 if (mem_cgroup_is_root(mem))
1302 goto done;
1002 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); 1303 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
1003 if (likely(!ret)) { 1304 if (likely(!ret)) {
1004 if (!do_swap_account) 1305 if (!do_swap_account)
@@ -1009,7 +1310,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1009 break; 1310 break;
1010 /* mem+swap counter fails */ 1311 /* mem+swap counter fails */
1011 res_counter_uncharge(&mem->res, PAGE_SIZE); 1312 res_counter_uncharge(&mem->res, PAGE_SIZE);
1012 noswap = true; 1313 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1013 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1314 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1014 memsw); 1315 memsw);
1015 } else 1316 } else
@@ -1020,8 +1321,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1020 if (!(gfp_mask & __GFP_WAIT)) 1321 if (!(gfp_mask & __GFP_WAIT))
1021 goto nomem; 1322 goto nomem;
1022 1323
1023 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, 1324 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1024 noswap, false); 1325 gfp_mask, flags);
1025 if (ret) 1326 if (ret)
1026 continue; 1327 continue;
1027 1328
@@ -1046,13 +1347,19 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1046 goto nomem; 1347 goto nomem;
1047 } 1348 }
1048 } 1349 }
1350 /*
1351 * Insert the ancestor (and the ancestor's ancestors) into the softlimit RB-tree
1352 * if they exceed the softlimit.
1353 */
1354 if (mem_cgroup_soft_limit_check(mem))
1355 mem_cgroup_update_tree(mem, page);
1356done:
1049 return 0; 1357 return 0;
1050nomem: 1358nomem:
1051 css_put(&mem->css); 1359 css_put(&mem->css);
1052 return -ENOMEM; 1360 return -ENOMEM;
1053} 1361}
1054 1362
1055
1056/* 1363/*
1057 * A helper function to get mem_cgroup from ID. must be called under 1364 * A helper function to get mem_cgroup from ID. must be called under
1058 * rcu_read_lock(). The caller must check css_is_removed() or some if 1365 * rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -1119,15 +1426,37 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1119 lock_page_cgroup(pc); 1426 lock_page_cgroup(pc);
1120 if (unlikely(PageCgroupUsed(pc))) { 1427 if (unlikely(PageCgroupUsed(pc))) {
1121 unlock_page_cgroup(pc); 1428 unlock_page_cgroup(pc);
1122 res_counter_uncharge(&mem->res, PAGE_SIZE); 1429 if (!mem_cgroup_is_root(mem)) {
1123 if (do_swap_account) 1430 res_counter_uncharge(&mem->res, PAGE_SIZE);
1124 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1431 if (do_swap_account)
1432 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1433 }
1125 css_put(&mem->css); 1434 css_put(&mem->css);
1126 return; 1435 return;
1127 } 1436 }
1437
1128 pc->mem_cgroup = mem; 1438 pc->mem_cgroup = mem;
1439 /*
1440 * We access a page_cgroup asynchronously without lock_page_cgroup().
1441 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
1442 * is accessed after testing the USED bit. To make pc->mem_cgroup visible
1443 * before the USED bit, we need a memory barrier here.
1444 * See mem_cgroup_add_lru_list(), etc.
1445 */
1129 smp_wmb(); 1446 smp_wmb();
1130 pc->flags = pcg_default_flags[ctype]; 1447 switch (ctype) {
1448 case MEM_CGROUP_CHARGE_TYPE_CACHE:
1449 case MEM_CGROUP_CHARGE_TYPE_SHMEM:
1450 SetPageCgroupCache(pc);
1451 SetPageCgroupUsed(pc);
1452 break;
1453 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1454 ClearPageCgroupCache(pc);
1455 SetPageCgroupUsed(pc);
1456 break;
1457 default:
1458 break;
1459 }
1131 1460
1132 mem_cgroup_charge_statistics(mem, pc, true); 1461 mem_cgroup_charge_statistics(mem, pc, true);
1133 1462
@@ -1178,7 +1507,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1178 if (pc->mem_cgroup != from) 1507 if (pc->mem_cgroup != from)
1179 goto out; 1508 goto out;
1180 1509
1181 res_counter_uncharge(&from->res, PAGE_SIZE); 1510 if (!mem_cgroup_is_root(from))
1511 res_counter_uncharge(&from->res, PAGE_SIZE);
1182 mem_cgroup_charge_statistics(from, pc, false); 1512 mem_cgroup_charge_statistics(from, pc, false);
1183 1513
1184 page = pc->page; 1514 page = pc->page;
@@ -1197,7 +1527,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1197 1); 1527 1);
1198 } 1528 }
1199 1529
1200 if (do_swap_account) 1530 if (do_swap_account && !mem_cgroup_is_root(from))
1201 res_counter_uncharge(&from->memsw, PAGE_SIZE); 1531 res_counter_uncharge(&from->memsw, PAGE_SIZE);
1202 css_put(&from->css); 1532 css_put(&from->css);
1203 1533
@@ -1238,7 +1568,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
1238 parent = mem_cgroup_from_cont(pcg); 1568 parent = mem_cgroup_from_cont(pcg);
1239 1569
1240 1570
1241 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 1571 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
1242 if (ret || !parent) 1572 if (ret || !parent)
1243 return ret; 1573 return ret;
1244 1574
@@ -1268,9 +1598,11 @@ uncharge:
1268 /* drop extra refcnt by try_charge() */ 1598 /* drop extra refcnt by try_charge() */
1269 css_put(&parent->css); 1599 css_put(&parent->css);
1270 /* uncharge if move fails */ 1600 /* uncharge if move fails */
1271 res_counter_uncharge(&parent->res, PAGE_SIZE); 1601 if (!mem_cgroup_is_root(parent)) {
1272 if (do_swap_account) 1602 res_counter_uncharge(&parent->res, PAGE_SIZE);
1273 res_counter_uncharge(&parent->memsw, PAGE_SIZE); 1603 if (do_swap_account)
1604 res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1605 }
1274 return ret; 1606 return ret;
1275} 1607}
1276 1608
@@ -1295,7 +1627,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1295 prefetchw(pc); 1627 prefetchw(pc);
1296 1628
1297 mem = memcg; 1629 mem = memcg;
1298 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 1630 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
1299 if (ret || !mem) 1631 if (ret || !mem)
1300 return ret; 1632 return ret;
1301 1633
@@ -1414,14 +1746,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1414 if (!mem) 1746 if (!mem)
1415 goto charge_cur_mm; 1747 goto charge_cur_mm;
1416 *ptr = mem; 1748 *ptr = mem;
1417 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 1749 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
1418 /* drop extra refcnt from tryget */ 1750 /* drop extra refcnt from tryget */
1419 css_put(&mem->css); 1751 css_put(&mem->css);
1420 return ret; 1752 return ret;
1421charge_cur_mm: 1753charge_cur_mm:
1422 if (unlikely(!mm)) 1754 if (unlikely(!mm))
1423 mm = &init_mm; 1755 mm = &init_mm;
1424 return __mem_cgroup_try_charge(mm, mask, ptr, true); 1756 return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
1425} 1757}
1426 1758
1427static void 1759static void
@@ -1459,7 +1791,9 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1459 * This recorded memcg can be obsolete one. So, avoid 1791 * This recorded memcg can be obsolete one. So, avoid
1460 * calling css_tryget 1792 * calling css_tryget
1461 */ 1793 */
1462 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1794 if (!mem_cgroup_is_root(memcg))
1795 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1796 mem_cgroup_swap_statistics(memcg, false);
1463 mem_cgroup_put(memcg); 1797 mem_cgroup_put(memcg);
1464 } 1798 }
1465 rcu_read_unlock(); 1799 rcu_read_unlock();
@@ -1484,9 +1818,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1484 return; 1818 return;
1485 if (!mem) 1819 if (!mem)
1486 return; 1820 return;
1487 res_counter_uncharge(&mem->res, PAGE_SIZE); 1821 if (!mem_cgroup_is_root(mem)) {
1488 if (do_swap_account) 1822 res_counter_uncharge(&mem->res, PAGE_SIZE);
1489 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1823 if (do_swap_account)
1824 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1825 }
1490 css_put(&mem->css); 1826 css_put(&mem->css);
1491} 1827}
1492 1828
@@ -1538,9 +1874,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1538 break; 1874 break;
1539 } 1875 }
1540 1876
1541 res_counter_uncharge(&mem->res, PAGE_SIZE); 1877 if (!mem_cgroup_is_root(mem)) {
1542 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) 1878 res_counter_uncharge(&mem->res, PAGE_SIZE);
1543 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1879 if (do_swap_account &&
1880 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1881 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1882 }
1883 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1884 mem_cgroup_swap_statistics(mem, true);
1544 mem_cgroup_charge_statistics(mem, pc, false); 1885 mem_cgroup_charge_statistics(mem, pc, false);
1545 1886
1546 ClearPageCgroupUsed(pc); 1887 ClearPageCgroupUsed(pc);
@@ -1554,6 +1895,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1554 mz = page_cgroup_zoneinfo(pc); 1895 mz = page_cgroup_zoneinfo(pc);
1555 unlock_page_cgroup(pc); 1896 unlock_page_cgroup(pc);
1556 1897
1898 if (mem_cgroup_soft_limit_check(mem))
1899 mem_cgroup_update_tree(mem, page);
1557 /* at swapout, this memcg will be accessed to record to swap */ 1900 /* at swapout, this memcg will be accessed to record to swap */
1558 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 1901 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1559 css_put(&mem->css); 1902 css_put(&mem->css);
@@ -1629,7 +1972,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
1629 * We uncharge this because swap is freed. 1972 * We uncharge this because swap is freed.
1630 * This memcg can be obsolete one. We avoid calling css_tryget 1973 * This memcg can be obsolete one. We avoid calling css_tryget
1631 */ 1974 */
1632 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1975 if (!mem_cgroup_is_root(memcg))
1976 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1977 mem_cgroup_swap_statistics(memcg, false);
1633 mem_cgroup_put(memcg); 1978 mem_cgroup_put(memcg);
1634 } 1979 }
1635 rcu_read_unlock(); 1980 rcu_read_unlock();
@@ -1658,7 +2003,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
1658 unlock_page_cgroup(pc); 2003 unlock_page_cgroup(pc);
1659 2004
1660 if (mem) { 2005 if (mem) {
1661 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 2006 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
2007 page);
1662 css_put(&mem->css); 2008 css_put(&mem->css);
1663 } 2009 }
1664 *ptr = mem; 2010 *ptr = mem;
@@ -1798,8 +2144,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1798 if (!ret) 2144 if (!ret)
1799 break; 2145 break;
1800 2146
1801 progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, 2147 progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
1802 false, true); 2148 GFP_KERNEL,
2149 MEM_CGROUP_RECLAIM_SHRINK);
1803 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2150 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
1804 /* Usage is reduced ? */ 2151 /* Usage is reduced ? */
1805 if (curusage >= oldusage) 2152 if (curusage >= oldusage)
@@ -1851,7 +2198,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1851 if (!ret) 2198 if (!ret)
1852 break; 2199 break;
1853 2200
1854 mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true); 2201 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2202 MEM_CGROUP_RECLAIM_NOSWAP |
2203 MEM_CGROUP_RECLAIM_SHRINK);
1855 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2204 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1856 /* Usage is reduced ? */ 2205 /* Usage is reduced ? */
1857 if (curusage >= oldusage) 2206 if (curusage >= oldusage)
@@ -1862,6 +2211,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1862 return ret; 2211 return ret;
1863} 2212}
1864 2213
2214unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2215 gfp_t gfp_mask, int nid,
2216 int zid)
2217{
2218 unsigned long nr_reclaimed = 0;
2219 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
2220 unsigned long reclaimed;
2221 int loop = 0;
2222 struct mem_cgroup_tree_per_zone *mctz;
2223 unsigned long long excess;
2224
2225 if (order > 0)
2226 return 0;
2227
2228 mctz = soft_limit_tree_node_zone(nid, zid);
2229 /*
2230 * This loop can run a while, especially if mem_cgroups continuously
2231 * keep exceeding their soft limit and putting the system under
2232 * pressure.
2233 */
2234 do {
2235 if (next_mz)
2236 mz = next_mz;
2237 else
2238 mz = mem_cgroup_largest_soft_limit_node(mctz);
2239 if (!mz)
2240 break;
2241
2242 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
2243 gfp_mask,
2244 MEM_CGROUP_RECLAIM_SOFT);
2245 nr_reclaimed += reclaimed;
2246 spin_lock(&mctz->lock);
2247
2248 /*
2249 * If we failed to reclaim anything from this memory cgroup
2250 * it is time to move on to the next cgroup
2251 */
2252 next_mz = NULL;
2253 if (!reclaimed) {
2254 do {
2255 /*
2256 * Loop until we find yet another one.
2257 *
2258 * By the time we get the soft_limit lock
2259 * again, someone might have added the
2260 * group back on the RB tree. Iterate to
2261 * make sure we get a different mem.
2262 * mem_cgroup_largest_soft_limit_node returns
2263 * NULL if no other cgroup is present on
2264 * the tree
2265 */
2266 next_mz =
2267 __mem_cgroup_largest_soft_limit_node(mctz);
2268 if (next_mz == mz) {
2269 css_put(&next_mz->mem->css);
2270 next_mz = NULL;
2271 } else /* next_mz == NULL or other memcg */
2272 break;
2273 } while (1);
2274 }
2275 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
2276 excess = res_counter_soft_limit_excess(&mz->mem->res);
2277 /*
2278 * One school of thought says that we should not add
2279 * back the node to the tree if reclaim returns 0.
2280 * But our reclaim could return 0, simply because, due
2281 * to priority, we are exposing a smaller subset of
2282 * memory to reclaim from. Consider this a longer
2283 * term TODO.
2284 */
2285 /* If excess == 0, no tree ops */
2286 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
2287 spin_unlock(&mctz->lock);
2288 css_put(&mz->mem->css);
2289 loop++;
2290 /*
2291 * Could not reclaim anything and there are no more
2292 * mem cgroups to try or we seem to be looping without
2293 * reclaiming anything.
2294 */
2295 if (!nr_reclaimed &&
2296 (next_mz == NULL ||
2297 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2298 break;
2299 } while (!nr_reclaimed);
2300 if (next_mz)
2301 css_put(&next_mz->mem->css);
2302 return nr_reclaimed;
2303}
2304
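mem_cgroup_soft_limit_reclaim() repeatedly picks the cgroup with the largest soft-limit excess, reclaims from it, and re-inserts it if it is still over its limit. A toy model of that selection loop, with a flat array standing in for the per-zone RB-tree (all names and numbers here are made up for illustration):

#include <stdio.h>

/* Each entry is one cgroup's usage and soft limit, in arbitrary units. */
struct cg { const char *name; long usage, soft_limit; };

static long excess(const struct cg *c)
{
	return c->usage > c->soft_limit ? c->usage - c->soft_limit : 0;
}

static struct cg *largest_excess(struct cg *v, int n)
{
	struct cg *best = NULL;
	for (int i = 0; i < n; i++)
		if (excess(&v[i]) && (!best || excess(&v[i]) > excess(best)))
			best = &v[i];
	return best;	/* NULL once nobody exceeds its soft limit */
}

int main(void)
{
	struct cg v[] = { {"A", 900, 500}, {"B", 400, 300}, {"C", 100, 200} };
	long reclaimed = 0;
	struct cg *mz;

	/* Keep reclaiming from the worst offender, as the kernel loop does,
	 * until every group is back under its soft limit. */
	while ((mz = largest_excess(v, 3)) != NULL) {
		long chunk = 100;	/* stand-in for one reclaim pass */
		mz->usage -= chunk;
		reclaimed += chunk;
		printf("reclaimed %ld from %s (excess now %ld)\n",
		       chunk, mz->name, excess(mz));
	}
	printf("total reclaimed: %ld\n", reclaimed);
	return 0;
}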
1865/* 2305/*
1866 * This routine traverse page_cgroup in given list and drop them all. 2306 * This routine traverse page_cgroup in given list and drop them all.
1867 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 2307 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
@@ -2046,20 +2486,64 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
2046 return retval; 2486 return retval;
2047} 2487}
2048 2488
2489struct mem_cgroup_idx_data {
2490 s64 val;
2491 enum mem_cgroup_stat_index idx;
2492};
2493
2494static int
2495mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2496{
2497 struct mem_cgroup_idx_data *d = data;
2498 d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
2499 return 0;
2500}
2501
2502static void
2503mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
2504 enum mem_cgroup_stat_index idx, s64 *val)
2505{
2506 struct mem_cgroup_idx_data d;
2507 d.idx = idx;
2508 d.val = 0;
2509 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
2510 *val = d.val;
2511}
2512
2049static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 2513static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
2050{ 2514{
2051 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2515 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2052 u64 val = 0; 2516 u64 idx_val, val;
2053 int type, name; 2517 int type, name;
2054 2518
2055 type = MEMFILE_TYPE(cft->private); 2519 type = MEMFILE_TYPE(cft->private);
2056 name = MEMFILE_ATTR(cft->private); 2520 name = MEMFILE_ATTR(cft->private);
2057 switch (type) { 2521 switch (type) {
2058 case _MEM: 2522 case _MEM:
2059 val = res_counter_read_u64(&mem->res, name); 2523 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2524 mem_cgroup_get_recursive_idx_stat(mem,
2525 MEM_CGROUP_STAT_CACHE, &idx_val);
2526 val = idx_val;
2527 mem_cgroup_get_recursive_idx_stat(mem,
2528 MEM_CGROUP_STAT_RSS, &idx_val);
2529 val += idx_val;
2530 val <<= PAGE_SHIFT;
2531 } else
2532 val = res_counter_read_u64(&mem->res, name);
2060 break; 2533 break;
2061 case _MEMSWAP: 2534 case _MEMSWAP:
2062 val = res_counter_read_u64(&mem->memsw, name); 2535 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2536 mem_cgroup_get_recursive_idx_stat(mem,
2537 MEM_CGROUP_STAT_CACHE, &idx_val);
2538 val = idx_val;
2539 mem_cgroup_get_recursive_idx_stat(mem,
2540 MEM_CGROUP_STAT_RSS, &idx_val);
2541 val += idx_val;
2542 mem_cgroup_get_recursive_idx_stat(mem,
2543 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2544 val <<= PAGE_SHIFT;
2545 } else
2546 val = res_counter_read_u64(&mem->memsw, name);
2063 break; 2547 break;
2064 default: 2548 default:
2065 BUG(); 2549 BUG();
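For the root cgroup the res_counter is no longer charged, so usage_in_bytes is reconstructed by walking the hierarchy and summing the per-cgroup CACHE and RSS statistics (plus SWAPOUT for memsw), then shifting by PAGE_SHIFT. A small sketch of that accumulation, assuming a hand-rolled two-child tree and 4K pages:

#include <stdio.h>

#define PAGE_SHIFT 12	/* 4K pages assumed for the example */

/* Minimal stand-in for one memcg's local counters, in pages. */
struct memcg { long cache, rss; struct memcg *children[2]; };

/* Walk the subtree and add up one statistic, like
 * mem_cgroup_get_recursive_idx_stat() does via mem_cgroup_walk_tree(). */
static long sum_pages(const struct memcg *m)
{
	if (!m)
		return 0;
	return m->cache + m->rss +
	       sum_pages(m->children[0]) + sum_pages(m->children[1]);
}

int main(void)
{
	struct memcg leaf1 = { 10, 20, {0, 0} };
	struct memcg leaf2 = { 5, 5, {0, 0} };
	struct memcg root  = { 1, 2, {&leaf1, &leaf2} };

	/* Root usage is derived from the statistics and converted from
	 * pages to bytes, since root no longer charges a res_counter. */
	long long usage = (long long)sum_pages(&root) << PAGE_SHIFT;
	printf("root usage_in_bytes = %lld\n", usage);
	return 0;
}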
@@ -2083,6 +2567,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
2083 name = MEMFILE_ATTR(cft->private); 2567 name = MEMFILE_ATTR(cft->private);
2084 switch (name) { 2568 switch (name) {
2085 case RES_LIMIT: 2569 case RES_LIMIT:
2570 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
2571 ret = -EINVAL;
2572 break;
2573 }
2086 /* This function does all necessary parse...reuse it */ 2574 /* This function does all necessary parse...reuse it */
2087 ret = res_counter_memparse_write_strategy(buffer, &val); 2575 ret = res_counter_memparse_write_strategy(buffer, &val);
2088 if (ret) 2576 if (ret)
@@ -2092,6 +2580,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
2092 else 2580 else
2093 ret = mem_cgroup_resize_memsw_limit(memcg, val); 2581 ret = mem_cgroup_resize_memsw_limit(memcg, val);
2094 break; 2582 break;
2583 case RES_SOFT_LIMIT:
2584 ret = res_counter_memparse_write_strategy(buffer, &val);
2585 if (ret)
2586 break;
2587 /*
2588 * For memsw, soft limits are hard to implement in terms
2589 * of semantics; for now, we support soft limits only for
2590 * memory control without swap.
2591 */
2592 if (type == _MEM)
2593 ret = res_counter_set_soft_limit(&memcg->res, val);
2594 else
2595 ret = -EINVAL;
2596 break;
2095 default: 2597 default:
2096 ret = -EINVAL; /* should be BUG() ? */ 2598 ret = -EINVAL; /* should be BUG() ? */
2097 break; 2599 break;
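From userspace the new knob is driven like the other res_counter files: write a memparse-style value to memory.soft_limit_in_bytes. A hedged example (the /cgroup/foo mount point is an assumption; adapt the path to your hierarchy):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Suffixes like "256M" are accepted by the memparse-based parser
	 * used in mem_cgroup_write(). */
	const char *path = "/cgroup/foo/memory.soft_limit_in_bytes";
	const char *val = "256M";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
	return 0;
}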
@@ -2149,6 +2651,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2149 res_counter_reset_failcnt(&mem->memsw); 2651 res_counter_reset_failcnt(&mem->memsw);
2150 break; 2652 break;
2151 } 2653 }
2654
2152 return 0; 2655 return 0;
2153} 2656}
2154 2657
@@ -2160,6 +2663,7 @@ enum {
2160 MCS_MAPPED_FILE, 2663 MCS_MAPPED_FILE,
2161 MCS_PGPGIN, 2664 MCS_PGPGIN,
2162 MCS_PGPGOUT, 2665 MCS_PGPGOUT,
2666 MCS_SWAP,
2163 MCS_INACTIVE_ANON, 2667 MCS_INACTIVE_ANON,
2164 MCS_ACTIVE_ANON, 2668 MCS_ACTIVE_ANON,
2165 MCS_INACTIVE_FILE, 2669 MCS_INACTIVE_FILE,
@@ -2181,6 +2685,7 @@ struct {
2181 {"mapped_file", "total_mapped_file"}, 2685 {"mapped_file", "total_mapped_file"},
2182 {"pgpgin", "total_pgpgin"}, 2686 {"pgpgin", "total_pgpgin"},
2183 {"pgpgout", "total_pgpgout"}, 2687 {"pgpgout", "total_pgpgout"},
2688 {"swap", "total_swap"},
2184 {"inactive_anon", "total_inactive_anon"}, 2689 {"inactive_anon", "total_inactive_anon"},
2185 {"active_anon", "total_active_anon"}, 2690 {"active_anon", "total_active_anon"},
2186 {"inactive_file", "total_inactive_file"}, 2691 {"inactive_file", "total_inactive_file"},
@@ -2205,6 +2710,10 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2205 s->stat[MCS_PGPGIN] += val; 2710 s->stat[MCS_PGPGIN] += val;
2206 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); 2711 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2207 s->stat[MCS_PGPGOUT] += val; 2712 s->stat[MCS_PGPGOUT] += val;
2713 if (do_swap_account) {
2714 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
2715 s->stat[MCS_SWAP] += val * PAGE_SIZE;
2716 }
2208 2717
2209 /* per zone stat */ 2718 /* per zone stat */
2210 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 2719 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -2236,8 +2745,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
2236 memset(&mystat, 0, sizeof(mystat)); 2745 memset(&mystat, 0, sizeof(mystat));
2237 mem_cgroup_get_local_stat(mem_cont, &mystat); 2746 mem_cgroup_get_local_stat(mem_cont, &mystat);
2238 2747
2239 for (i = 0; i < NR_MCS_STAT; i++) 2748 for (i = 0; i < NR_MCS_STAT; i++) {
2749 if (i == MCS_SWAP && !do_swap_account)
2750 continue;
2240 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 2751 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
2752 }
2241 2753
2242 /* Hierarchical information */ 2754 /* Hierarchical information */
2243 { 2755 {
@@ -2250,9 +2762,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
2250 2762
2251 memset(&mystat, 0, sizeof(mystat)); 2763 memset(&mystat, 0, sizeof(mystat));
2252 mem_cgroup_get_total_stat(mem_cont, &mystat); 2764 mem_cgroup_get_total_stat(mem_cont, &mystat);
2253 for (i = 0; i < NR_MCS_STAT; i++) 2765 for (i = 0; i < NR_MCS_STAT; i++) {
2766 if (i == MCS_SWAP && !do_swap_account)
2767 continue;
2254 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 2768 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
2255 2769 }
2256 2770
2257#ifdef CONFIG_DEBUG_VM 2771#ifdef CONFIG_DEBUG_VM
2258 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 2772 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
@@ -2345,6 +2859,12 @@ static struct cftype mem_cgroup_files[] = {
2345 .read_u64 = mem_cgroup_read, 2859 .read_u64 = mem_cgroup_read,
2346 }, 2860 },
2347 { 2861 {
2862 .name = "soft_limit_in_bytes",
2863 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
2864 .write_string = mem_cgroup_write,
2865 .read_u64 = mem_cgroup_read,
2866 },
2867 {
2348 .name = "failcnt", 2868 .name = "failcnt",
2349 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 2869 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
2350 .trigger = mem_cgroup_reset, 2870 .trigger = mem_cgroup_reset,
@@ -2438,6 +2958,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
2438 mz = &pn->zoneinfo[zone]; 2958 mz = &pn->zoneinfo[zone];
2439 for_each_lru(l) 2959 for_each_lru(l)
2440 INIT_LIST_HEAD(&mz->lists[l]); 2960 INIT_LIST_HEAD(&mz->lists[l]);
2961 mz->usage_in_excess = 0;
2962 mz->on_tree = false;
2963 mz->mem = mem;
2441 } 2964 }
2442 return 0; 2965 return 0;
2443} 2966}
@@ -2483,6 +3006,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
2483{ 3006{
2484 int node; 3007 int node;
2485 3008
3009 mem_cgroup_remove_from_trees(mem);
2486 free_css_id(&mem_cgroup_subsys, &mem->css); 3010 free_css_id(&mem_cgroup_subsys, &mem->css);
2487 3011
2488 for_each_node_state(node, N_POSSIBLE) 3012 for_each_node_state(node, N_POSSIBLE)
@@ -2531,6 +3055,31 @@ static void __init enable_swap_cgroup(void)
2531} 3055}
2532#endif 3056#endif
2533 3057
3058static int mem_cgroup_soft_limit_tree_init(void)
3059{
3060 struct mem_cgroup_tree_per_node *rtpn;
3061 struct mem_cgroup_tree_per_zone *rtpz;
3062 int tmp, node, zone;
3063
3064 for_each_node_state(node, N_POSSIBLE) {
3065 tmp = node;
3066 if (!node_state(node, N_NORMAL_MEMORY))
3067 tmp = -1;
3068 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
3069 if (!rtpn)
3070 return 1;
3071
3072 soft_limit_tree.rb_tree_per_node[node] = rtpn;
3073
3074 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3075 rtpz = &rtpn->rb_tree_per_zone[zone];
3076 rtpz->rb_root = RB_ROOT;
3077 spin_lock_init(&rtpz->lock);
3078 }
3079 }
3080 return 0;
3081}
3082
2534static struct cgroup_subsys_state * __ref 3083static struct cgroup_subsys_state * __ref
2535mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 3084mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2536{ 3085{
@@ -2545,10 +3094,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2545 for_each_node_state(node, N_POSSIBLE) 3094 for_each_node_state(node, N_POSSIBLE)
2546 if (alloc_mem_cgroup_per_zone_info(mem, node)) 3095 if (alloc_mem_cgroup_per_zone_info(mem, node))
2547 goto free_out; 3096 goto free_out;
3097
2548 /* root ? */ 3098 /* root ? */
2549 if (cont->parent == NULL) { 3099 if (cont->parent == NULL) {
2550 enable_swap_cgroup(); 3100 enable_swap_cgroup();
2551 parent = NULL; 3101 parent = NULL;
3102 root_mem_cgroup = mem;
3103 if (mem_cgroup_soft_limit_tree_init())
3104 goto free_out;
3105
2552 } else { 3106 } else {
2553 parent = mem_cgroup_from_cont(cont->parent); 3107 parent = mem_cgroup_from_cont(cont->parent);
2554 mem->use_hierarchy = parent->use_hierarchy; 3108 mem->use_hierarchy = parent->use_hierarchy;
@@ -2577,6 +3131,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2577 return &mem->css; 3131 return &mem->css;
2578free_out: 3132free_out:
2579 __mem_cgroup_free(mem); 3133 __mem_cgroup_free(mem);
3134 root_mem_cgroup = NULL;
2580 return ERR_PTR(error); 3135 return ERR_PTR(error);
2581} 3136}
2582 3137
@@ -2612,7 +3167,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
2612static void mem_cgroup_move_task(struct cgroup_subsys *ss, 3167static void mem_cgroup_move_task(struct cgroup_subsys *ss,
2613 struct cgroup *cont, 3168 struct cgroup *cont,
2614 struct cgroup *old_cont, 3169 struct cgroup *old_cont,
2615 struct task_struct *p) 3170 struct task_struct *p,
3171 bool threadgroup)
2616{ 3172{
2617 mutex_lock(&memcg_tasklist); 3173 mutex_lock(&memcg_tasklist);
2618 /* 3174 /*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
new file mode 100644
index 000000000000..729d4b15b645
--- /dev/null
+++ b/mm/memory-failure.c
@@ -0,0 +1,832 @@
1/*
2 * Copyright (C) 2008, 2009 Intel Corporation
3 * Authors: Andi Kleen, Fengguang Wu
4 *
5 * This software may be redistributed and/or modified under the terms of
6 * the GNU General Public License ("GPL") version 2 only as published by the
7 * Free Software Foundation.
8 *
9 * High level machine check handler. Handles pages reported by the
10 * hardware as being corrupted, usually due to a 2-bit ECC memory or cache
11 * failure.
12 *
13 * Handles page cache pages in various states. The tricky part
14 * here is that we can access any page asynchronously to other VM
15 * users, because memory failures could happen anytime and anywhere,
16 * possibly violating some of their assumptions. This is why this code
17 * has to be extremely careful. Generally it tries to use normal locking
18 * rules, as in get the standard locks, even if that means the
19 * error handling takes potentially a long time.
20 *
21 * The operation to map back from RMAP chains to processes has to walk
22 * the complete process list and has non-linear complexity with the number
23 * of mappings. In short, it can be quite slow. But since memory corruptions
24 * are rare we hope to get away with this.
25 */
26
27/*
28 * Notebook:
29 * - hugetlb needs more code
30 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
31 * - pass bad pages to kdump next kernel
32 */
33#define DEBUG 1 /* remove me in 2.6.34 */
34#include <linux/kernel.h>
35#include <linux/mm.h>
36#include <linux/page-flags.h>
37#include <linux/sched.h>
38#include <linux/rmap.h>
39#include <linux/pagemap.h>
40#include <linux/swap.h>
41#include <linux/backing-dev.h>
42#include "internal.h"
43
44int sysctl_memory_failure_early_kill __read_mostly = 0;
45
46int sysctl_memory_failure_recovery __read_mostly = 1;
47
48atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
49
50/*
51 * Send all the processes who have the page mapped an ``action optional''
52 * signal.
53 */
54static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
55 unsigned long pfn)
56{
57 struct siginfo si;
58 int ret;
59
60 printk(KERN_ERR
61 "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
62 pfn, t->comm, t->pid);
63 si.si_signo = SIGBUS;
64 si.si_errno = 0;
65 si.si_code = BUS_MCEERR_AO;
66 si.si_addr = (void *)addr;
67#ifdef __ARCH_SI_TRAPNO
68 si.si_trapno = trapno;
69#endif
70 si.si_addr_lsb = PAGE_SHIFT;
71 /*
72 * Don't use force here, it's convenient if the signal
73 * can be temporarily blocked.
74 * This could cause a loop when the user sets SIGBUS
75 * to SIG_IGN, but hopefully no one will do that?
76 */
77 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
78 if (ret < 0)
79 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
80 t->comm, t->pid, ret);
81 return ret;
82}
83
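On the receiving side, an early-kill process sees a SIGBUS whose si_code is BUS_MCEERR_AO and whose si_addr points into the poisoned page (si_addr_lsb, where the libc exposes it, carries the PAGE_SHIFT granularity set above). A userspace sketch of a handler reacting to the advisory signal (assumes a libc that defines the BUS_MCEERR_* codes, otherwise the asm-generic value is used):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#ifndef BUS_MCEERR_AO
#define BUS_MCEERR_AO 5	/* "action optional" machine-check code, asm-generic value */
#endif

/* Advisory handler: the kernel says a page we map went bad, but we have not
 * consumed it yet, so we can try to save state before exiting. */
static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
	(void)sig; (void)ctx;
	if (si->si_code == BUS_MCEERR_AO)
		fprintf(stderr, "hwpoison near %p, bailing out cleanly\n",
			si->si_addr);
	_exit(1);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = sigbus_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &sa, NULL);

	pause();	/* wait; the signal, if it ever comes, is asynchronous */
	return 0;
}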
84/*
85 * Kill all processes that have a poisoned page mapped and then isolate
86 * the page.
87 *
88 * General strategy:
89 * Find all processes having the page mapped and kill them.
90 * But we keep a page reference around so that the page is not
91 * actually freed yet.
92 * Then stash the page away
93 *
94 * There's no convenient way to get back to mapped processes
95 * from the VMAs. So do a brute-force search over all
96 * running processes.
97 *
98 * Remember that machine checks are not common (or rather
99 * if they are common you have other problems), so this shouldn't
100 * be a performance issue.
101 *
102 * Also there are some races possible while we get from the
103 * error detection to actually handle it.
104 */
105
106struct to_kill {
107 struct list_head nd;
108 struct task_struct *tsk;
109 unsigned long addr;
110 unsigned addr_valid:1;
111};
112
113/*
114 * Failure handling: if we can't find or can't kill a process there's
115 * not much we can do. We just print a message and ignore otherwise.
116 */
117
118/*
119 * Schedule a process for later kill.
120 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
121 * TBD would GFP_NOIO be enough?
122 */
123static void add_to_kill(struct task_struct *tsk, struct page *p,
124 struct vm_area_struct *vma,
125 struct list_head *to_kill,
126 struct to_kill **tkc)
127{
128 struct to_kill *tk;
129
130 if (*tkc) {
131 tk = *tkc;
132 *tkc = NULL;
133 } else {
134 tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
135 if (!tk) {
136 printk(KERN_ERR
137 "MCE: Out of memory while machine check handling\n");
138 return;
139 }
140 }
141 tk->addr = page_address_in_vma(p, vma);
142 tk->addr_valid = 1;
143
144 /*
145 * In theory we don't have to kill when the page was
146 * munmapped. But it could also be a mremap. Since that's
147 * likely very rare, kill anyway just out of paranoia, but use
148 * a SIGKILL because the error is not contained anymore.
149 */
150 if (tk->addr == -EFAULT) {
151 pr_debug("MCE: Unable to find user space address %lx in %s\n",
152 page_to_pfn(p), tsk->comm);
153 tk->addr_valid = 0;
154 }
155 get_task_struct(tsk);
156 tk->tsk = tsk;
157 list_add_tail(&tk->nd, to_kill);
158}
159
160/*
161 * Kill the processes that have been collected earlier.
162 *
163 * Only do anything when DOIT is set, otherwise just free the list
164 * (this is used for clean pages which do not need killing)
165 * Also when FAIL is set do a force kill because something went
166 * wrong earlier.
167 */
168static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
169 int fail, unsigned long pfn)
170{
171 struct to_kill *tk, *next;
172
173 list_for_each_entry_safe (tk, next, to_kill, nd) {
174 if (doit) {
175 /*
176 * In case something went wrong with munmapping,
177 * make sure the process doesn't catch the
178 * signal and then access the memory. Just kill it
179 * instead of trusting its signal handlers.
180 */
181 if (fail || tk->addr_valid == 0) {
182 printk(KERN_ERR
183 "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
184 pfn, tk->tsk->comm, tk->tsk->pid);
185 force_sig(SIGKILL, tk->tsk);
186 }
187
188 /*
189 * In theory the process could have mapped
190 * something else at the address in between. We could
191 * check for that, but we need to tell the
192 * process anyway.
193 */
194 else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
195 pfn) < 0)
196 printk(KERN_ERR
197 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
198 pfn, tk->tsk->comm, tk->tsk->pid);
199 }
200 put_task_struct(tk->tsk);
201 kfree(tk);
202 }
203}
204
205static int task_early_kill(struct task_struct *tsk)
206{
207 if (!tsk->mm)
208 return 0;
209 if (tsk->flags & PF_MCE_PROCESS)
210 return !!(tsk->flags & PF_MCE_EARLY);
211 return sysctl_memory_failure_early_kill;
212}
213
214/*
215 * Collect processes when the error hit an anonymous page.
216 */
217static void collect_procs_anon(struct page *page, struct list_head *to_kill,
218 struct to_kill **tkc)
219{
220 struct vm_area_struct *vma;
221 struct task_struct *tsk;
222 struct anon_vma *av;
223
224 read_lock(&tasklist_lock);
225 av = page_lock_anon_vma(page);
226 if (av == NULL) /* Not actually mapped anymore */
227 goto out;
228 for_each_process (tsk) {
229 if (!task_early_kill(tsk))
230 continue;
231 list_for_each_entry (vma, &av->head, anon_vma_node) {
232 if (!page_mapped_in_vma(page, vma))
233 continue;
234 if (vma->vm_mm == tsk->mm)
235 add_to_kill(tsk, page, vma, to_kill, tkc);
236 }
237 }
238 page_unlock_anon_vma(av);
239out:
240 read_unlock(&tasklist_lock);
241}
242
243/*
244 * Collect processes when the error hit a file mapped page.
245 */
246static void collect_procs_file(struct page *page, struct list_head *to_kill,
247 struct to_kill **tkc)
248{
249 struct vm_area_struct *vma;
250 struct task_struct *tsk;
251 struct prio_tree_iter iter;
252 struct address_space *mapping = page->mapping;
253
254 /*
255 * A note on the locking order between the two locks.
256 * We don't rely on this particular order.
257 * If you have some other code that needs a different order
258 * feel free to switch them around. Or add a reverse link
259 * from mm_struct to task_struct, then this could be all
260 * done without taking tasklist_lock and looping over all tasks.
261 */
262
263 read_lock(&tasklist_lock);
264 spin_lock(&mapping->i_mmap_lock);
265 for_each_process(tsk) {
266 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
267
268 if (!task_early_kill(tsk))
269 continue;
270
271 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
272 pgoff) {
273 /*
274 * Send early kill signal to tasks where a vma covers
275 * the page but the corrupted page is not necessarily
276 * mapped in its pte.
277 * Assume applications that requested early kill want
278 * to be informed of all such data corruptions.
279 */
280 if (vma->vm_mm == tsk->mm)
281 add_to_kill(tsk, page, vma, to_kill, tkc);
282 }
283 }
284 spin_unlock(&mapping->i_mmap_lock);
285 read_unlock(&tasklist_lock);
286}
287
288/*
289 * Collect the processes that have the corrupted page mapped to kill.
290 * This is done in two steps for locking reasons.
291 * First preallocate one tokill structure outside the spin locks,
292 * so that we can kill at least one process reasonably reliably.
293 */
294static void collect_procs(struct page *page, struct list_head *tokill)
295{
296 struct to_kill *tk;
297
298 if (!page->mapping)
299 return;
300
301 tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
302 if (!tk)
303 return;
304 if (PageAnon(page))
305 collect_procs_anon(page, tokill, &tk);
306 else
307 collect_procs_file(page, tokill, &tk);
308 kfree(tk);
309}
310
311/*
312 * Error handlers for various types of pages.
313 */
314
315enum outcome {
316 FAILED, /* Error handling failed */
317 DELAYED, /* Will be handled later */
318 IGNORED, /* Error safely ignored */
319 RECOVERED, /* Successfully recovered */
320};
321
322static const char *action_name[] = {
323 [FAILED] = "Failed",
324 [DELAYED] = "Delayed",
325 [IGNORED] = "Ignored",
326 [RECOVERED] = "Recovered",
327};
328
329/*
330 * Error hit a kernel page.
331 * Do nothing; try to be lucky and not touch it. For a few cases we
332 * could be more sophisticated.
333 */
334static int me_kernel(struct page *p, unsigned long pfn)
335{
336 return DELAYED;
337}
338
339/*
340 * Already poisoned page.
341 */
342static int me_ignore(struct page *p, unsigned long pfn)
343{
344 return IGNORED;
345}
346
347/*
348 * Page in unknown state. Do nothing.
349 */
350static int me_unknown(struct page *p, unsigned long pfn)
351{
352 printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
353 return FAILED;
354}
355
356/*
357 * Free memory
358 */
359static int me_free(struct page *p, unsigned long pfn)
360{
361 return DELAYED;
362}
363
364/*
365 * Clean (or cleaned) page cache page.
366 */
367static int me_pagecache_clean(struct page *p, unsigned long pfn)
368{
369 int err;
370 int ret = FAILED;
371 struct address_space *mapping;
372
373 if (!isolate_lru_page(p))
374 page_cache_release(p);
375
376 /*
377 * For anonymous pages we're done; the only reference left
378 * should be the one m_f() holds.
379 */
380 if (PageAnon(p))
381 return RECOVERED;
382
383 /*
384 * Now truncate the page in the page cache. This is really
385 * more like a "temporary hole punch"
386 * Don't do this for block devices when someone else
387 * has a reference, because it could be file system metadata
388 * and that's not safe to truncate.
389 */
390 mapping = page_mapping(p);
391 if (!mapping) {
392 /*
393 * Page has been torn down in the meantime
394 */
395 return FAILED;
396 }
397
398 /*
399 * Truncation is a bit tricky. Enable it per file system for now.
400 *
401 * Open: to take i_mutex or not for this? Right now we don't.
402 */
403 if (mapping->a_ops->error_remove_page) {
404 err = mapping->a_ops->error_remove_page(mapping, p);
405 if (err != 0) {
406 printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
407 pfn, err);
408 } else if (page_has_private(p) &&
409 !try_to_release_page(p, GFP_NOIO)) {
410 pr_debug("MCE %#lx: failed to release buffers\n", pfn);
411 } else {
412 ret = RECOVERED;
413 }
414 } else {
415 /*
416 * If the file system doesn't support it, just invalidate.
417 * This fails on dirty pages or anything with private data.
418 */
419 if (invalidate_inode_page(p))
420 ret = RECOVERED;
421 else
422 printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
423 pfn);
424 }
425 return ret;
426}
427
428/*
429 * Dirty pagecache page
430 * Issues: when the error hits a hole page the error is not properly
431 * propagated.
432 */
433static int me_pagecache_dirty(struct page *p, unsigned long pfn)
434{
435 struct address_space *mapping = page_mapping(p);
436
437 SetPageError(p);
438 /* TBD: print more information about the file. */
439 if (mapping) {
440 /*
441 * IO error will be reported by write(), fsync(), etc.
442 * who check the mapping.
443 * This way the application knows that something went
444 * wrong with its dirty file data.
445 *
446 * There's one open issue:
447 *
448 * The EIO will be only reported on the next IO
449 * operation and then cleared through the IO map.
450 * Normally Linux has two mechanisms to pass IO error
451 * first through the AS_EIO flag in the address space
452 * and then through the PageError flag in the page.
453 * Since we drop pages on memory failure handling the
454 * only mechanism open to use is through AS_EIO.
455 *
456 * This has the disadvantage that it gets cleared on
457 * the first operation that returns an error, while
458 * the PageError bit is more sticky and only cleared
459 * when the page is reread or dropped. If an
460 * application assumes it will always get an error on
461 * fsync, but does other operations on the fd before
462 * and the page is dropped in between, then the error
463 * will not be properly reported.
464 *
465 * This can already happen even without hwpoisoned
466 * pages: first on metadata IO errors (which only
467 * report through AS_EIO) or when the page is dropped
468 * at the wrong time.
469 *
470 * So right now we assume that the application DTRT on
471 * the first EIO, but we're not worse than other parts
472 * of the kernel.
473 */
474 mapping_set_error(mapping, EIO);
475 }
476
477 return me_pagecache_clean(p, pfn);
478}
479
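As the comment above describes, once a dirty page has been dropped the application only learns about the loss through AS_EIO, i.e. as -EIO from its next write() or fsync(). A minimal userspace sketch of the check an application would need (the file name is arbitrary):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("data.tmp", O_CREAT | O_WRONLY, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "payload", 7) != 7)
		perror("write");

	/* If the kernel dropped a dirty page of this file after a memory
	 * failure, the mapping carries AS_EIO and the next fsync() (or
	 * write()) returns -EIO exactly once; applications that care must
	 * check for it here. */
	if (fsync(fd) < 0 && errno == EIO)
		fprintf(stderr, "data loss reported on fsync: %s\n",
			strerror(errno));
	close(fd);
	return 0;
}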
480/*
481 * Clean and dirty swap cache.
482 *
483 * Dirty swap cache page is tricky to handle. The page could live both in page
484 * cache and swap cache (i.e. the page is freshly swapped in). So it could be
485 * referenced concurrently by 2 types of PTEs:
486 * normal PTEs and swap PTEs. We try to handle them consistently by calling
487 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
488 * and then
489 * - clear dirty bit to prevent IO
490 * - remove from LRU
491 * - but keep in the swap cache, so that when we return to it on
492 * a later page fault, we know the application is accessing
493 * corrupted data and shall be killed (we installed simple
494 * interception code in do_swap_page to catch it).
495 *
496 * Clean swap cache pages can be directly isolated. A later page fault will
497 * bring in the known good data from disk.
498 */
499static int me_swapcache_dirty(struct page *p, unsigned long pfn)
500{
501 int ret = FAILED;
502
503 ClearPageDirty(p);
504 /* Trigger EIO in shmem: */
505 ClearPageUptodate(p);
506
507 if (!isolate_lru_page(p)) {
508 page_cache_release(p);
509 ret = DELAYED;
510 }
511
512 return ret;
513}
514
515static int me_swapcache_clean(struct page *p, unsigned long pfn)
516{
517 int ret = FAILED;
518
519 if (!isolate_lru_page(p)) {
520 page_cache_release(p);
521 ret = RECOVERED;
522 }
523 delete_from_swap_cache(p);
524 return ret;
525}
526
527/*
528 * Huge pages. Needs work.
529 * Issues:
530 * No rmap support, so we cannot find the original mapper. In theory we could walk
531 * all MMs and look for the mappings, but that would be non-atomic and racy.
532 * We need rmap for hugepages for this. Alternatively we could employ a heuristic,
533 * like just walking the current process and hoping it has it mapped (that
534 * should usually be true for the common "shared database cache" case).
535 * Should handle free huge pages and dequeue them too, but this needs to
536 * handle huge page accounting correctly.
537 */
538static int me_huge_page(struct page *p, unsigned long pfn)
539{
540 return FAILED;
541}
542
543/*
544 * Various page states we can handle.
545 *
546 * A page state is defined by its current page->flags bits.
547 * The table matches them in order and calls the right handler.
548 *
549 * This is quite tricky because we can access the page at any time
550 * in its life cycle, so all accesses have to be extremely careful.
551 *
552 * This is not complete. More states could be added.
553 * For any missing state don't attempt recovery.
554 */
555
556#define dirty (1UL << PG_dirty)
557#define sc (1UL << PG_swapcache)
558#define unevict (1UL << PG_unevictable)
559#define mlock (1UL << PG_mlocked)
560#define writeback (1UL << PG_writeback)
561#define lru (1UL << PG_lru)
562#define swapbacked (1UL << PG_swapbacked)
563#define head (1UL << PG_head)
564#define tail (1UL << PG_tail)
565#define compound (1UL << PG_compound)
566#define slab (1UL << PG_slab)
567#define buddy (1UL << PG_buddy)
568#define reserved (1UL << PG_reserved)
569
570static struct page_state {
571 unsigned long mask;
572 unsigned long res;
573 char *msg;
574 int (*action)(struct page *p, unsigned long pfn);
575} error_states[] = {
576 { reserved, reserved, "reserved kernel", me_ignore },
577 { buddy, buddy, "free kernel", me_free },
578
579 /*
580 * Could in theory check if slab page is free or if we can drop
581 * currently unused objects without touching them. But just
582 * treat it as standard kernel for now.
583 */
584 { slab, slab, "kernel slab", me_kernel },
585
586#ifdef CONFIG_PAGEFLAGS_EXTENDED
587 { head, head, "huge", me_huge_page },
588 { tail, tail, "huge", me_huge_page },
589#else
590 { compound, compound, "huge", me_huge_page },
591#endif
592
593 { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },
594 { sc|dirty, sc, "swapcache", me_swapcache_clean },
595
596 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
597 { unevict, unevict, "unevictable LRU", me_pagecache_clean},
598
599#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
600 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
601 { mlock, mlock, "mlocked LRU", me_pagecache_clean },
602#endif
603
604 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
605 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
606 { swapbacked, swapbacked, "anonymous", me_pagecache_clean },
607
608 /*
609 * Catchall entry: must be at end.
610 */
611 { 0, 0, "unknown page state", me_unknown },
612};
613
614#undef lru
615
616static void action_result(unsigned long pfn, char *msg, int result)
617{
618 struct page *page = NULL;
619 if (pfn_valid(pfn))
620 page = pfn_to_page(pfn);
621
622 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
623 pfn,
624 page && PageDirty(page) ? "dirty " : "",
625 msg, action_name[result]);
626}
627
628static int page_action(struct page_state *ps, struct page *p,
629 unsigned long pfn, int ref)
630{
631 int result;
632
633 result = ps->action(p, pfn);
634 action_result(pfn, ps->msg, result);
635 if (page_count(p) != 1 + ref)
636 printk(KERN_ERR
637 "MCE %#lx: %s page still referenced by %d users\n",
638 pfn, ps->msg, page_count(p) - 1);
639
640 /* Could do more checks here if page looks ok */
641 /*
642 * Could adjust zone counters here to correct for the missing page.
643 */
644
645 return result == RECOVERED ? 0 : -EBUSY;
646}
647
648#define N_UNMAP_TRIES 5
649
650/*
651 * Do all that is necessary to remove user space mappings. Unmap
652 * the pages and send SIGBUS to the processes if the data was dirty.
653 */
654static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
655 int trapno)
656{
657 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
658 struct address_space *mapping;
659 LIST_HEAD(tokill);
660 int ret;
661 int i;
662 int kill = 1;
663
664 if (PageReserved(p) || PageCompound(p) || PageSlab(p))
665 return;
666
667 if (!PageLRU(p))
668 lru_add_drain_all();
669
670 /*
671 * This check implies we don't kill processes if their pages
672 * are in the swap cache early. Those are always late kills.
673 */
674 if (!page_mapped(p))
675 return;
676
677 if (PageSwapCache(p)) {
678 printk(KERN_ERR
679 "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
680 ttu |= TTU_IGNORE_HWPOISON;
681 }
682
683 /*
684 * Propagate the dirty bit from PTEs to struct page first, because we
685 * need this to decide if we should kill or just drop the page.
686 */
687 mapping = page_mapping(p);
688 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
689 if (page_mkclean(p)) {
690 SetPageDirty(p);
691 } else {
692 kill = 0;
693 ttu |= TTU_IGNORE_HWPOISON;
694 printk(KERN_INFO
695 "MCE %#lx: corrupted page was clean: dropped without side effects\n",
696 pfn);
697 }
698 }
699
700 /*
701 * First collect all the processes that have the page
702 * mapped in dirty form. This has to be done before try_to_unmap,
703 * because ttu takes the rmap data structures down.
704 *
705 * Error handling: We ignore errors here because
706 * there's nothing that can be done.
707 */
708 if (kill)
709 collect_procs(p, &tokill);
710
711 /*
712 * try_to_unmap can fail temporarily due to races.
713 * Try a few times (RED-PEN better strategy?)
714 */
715 for (i = 0; i < N_UNMAP_TRIES; i++) {
716 ret = try_to_unmap(p, ttu);
717 if (ret == SWAP_SUCCESS)
718 break;
719 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
720 }
721
722 if (ret != SWAP_SUCCESS)
723 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
724 pfn, page_mapcount(p));
725
726 /*
727 * Now that the dirty bit has been propagated to the
728 * struct page and all unmaps done we can decide if
729 * killing is needed or not. Only kill when the page
730 * was dirty, otherwise the tokill list is merely
731 * freed. When there was a problem unmapping earlier
732 * use a more forceful, uncatchable kill to prevent
733 * any accesses to the poisoned memory.
734 */
735 kill_procs_ao(&tokill, !!PageDirty(p), trapno,
736 ret != SWAP_SUCCESS, pfn);
737}
738
739int __memory_failure(unsigned long pfn, int trapno, int ref)
740{
741 struct page_state *ps;
742 struct page *p;
743 int res;
744
745 if (!sysctl_memory_failure_recovery)
746 panic("Memory failure from trap %d on page %lx", trapno, pfn);
747
748 if (!pfn_valid(pfn)) {
749 action_result(pfn, "memory outside kernel control", IGNORED);
750 return -EIO;
751 }
752
753 p = pfn_to_page(pfn);
754 if (TestSetPageHWPoison(p)) {
755 action_result(pfn, "already hardware poisoned", IGNORED);
756 return 0;
757 }
758
759 atomic_long_add(1, &mce_bad_pages);
760
761 /*
762 * We need/can do nothing about count=0 pages.
763 * 1) it's a free page, and therefore in safe hand:
764 * prep_new_page() will be the gate keeper.
765 * 2) it's part of a non-compound high order page.
766 * Implies some kernel user: cannot stop them from
767 * R/W the page; let's pray that the page has been
768 * used and will be freed some time later.
769 * In fact it's dangerous to directly bump up page count from 0,
770 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
771 */
772 if (!get_page_unless_zero(compound_head(p))) {
773 action_result(pfn, "free or high order kernel", IGNORED);
774 return PageBuddy(compound_head(p)) ? 0 : -EBUSY;
775 }
776
777 /*
778 * Lock the page and wait for writeback to finish.
779 * It's very difficult to mess with pages currently under IO
780 * and in many cases impossible, so we just avoid it here.
781 */
782 lock_page_nosync(p);
783 wait_on_page_writeback(p);
784
785 /*
786 * Now take care of user space mappings.
787 */
788 hwpoison_user_mappings(p, pfn, trapno);
789
790 /*
791 * Torn down by someone else?
792 */
793 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
794 action_result(pfn, "already truncated LRU", IGNORED);
795 res = 0;
796 goto out;
797 }
798
799 res = -EBUSY;
800 for (ps = error_states;; ps++) {
801 if ((p->flags & ps->mask) == ps->res) {
802 res = page_action(ps, p, pfn, ref);
803 break;
804 }
805 }
806out:
807 unlock_page(p);
808 return res;
809}
810EXPORT_SYMBOL_GPL(__memory_failure);
811
812/**
813 * memory_failure - Handle memory failure of a page.
814 * @pfn: Page Number of the corrupted page
815 * @trapno: Trap number reported in the signal to user space.
816 *
817 * This function is called by the low level machine check code
818 * of an architecture when it detects hardware memory corruption
819 * of a page. It tries its best to recover, which includes
820 * dropping pages, killing processes etc.
821 *
822 * The function is primarily of use for corruptions that
823 * happen outside the current execution context (e.g. when
824 * detected by a background scrubber)
825 *
826 * Must run in process context (e.g. a work queue) with interrupts
827 * enabled and no spinlocks hold.
828 */
829void memory_failure(unsigned long pfn, int trapno)
830{
831 __memory_failure(pfn, trapno, 0);
832}
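For controlled testing, the same patch series wires up an madvise(MADV_HWPOISON) hook (normally root-only and dependent on the memory-failure config option). A hedged sketch that poisons one of its own pages, assuming that hook is built in:

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_HWPOISON
#define MADV_HWPOISON 100	/* hwpoison test hook added by this series */
#endif

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p[0] = 1;	/* fault the page in so there is something to poison */

	/* Typically needs root and a kernel built with memory-failure
	 * support; on success the page is handled exactly as if the
	 * hardware had reported it bad. */
	if (madvise(p, psz, MADV_HWPOISON) < 0)
		perror("madvise(MADV_HWPOISON)");
	else
		printf("page at %p poisoned; touching it now raises SIGBUS\n",
		       (void *)p);
	return 0;
}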
diff --git a/mm/memory.c b/mm/memory.c
index aede2ce3aba4..7e91b5f9f690 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -45,6 +45,7 @@
45#include <linux/swap.h> 45#include <linux/swap.h>
46#include <linux/highmem.h> 46#include <linux/highmem.h>
47#include <linux/pagemap.h> 47#include <linux/pagemap.h>
48#include <linux/ksm.h>
48#include <linux/rmap.h> 49#include <linux/rmap.h>
49#include <linux/module.h> 50#include <linux/module.h>
50#include <linux/delayacct.h> 51#include <linux/delayacct.h>
@@ -56,6 +57,7 @@
56#include <linux/swapops.h> 57#include <linux/swapops.h>
57#include <linux/elf.h> 58#include <linux/elf.h>
58 59
60#include <asm/io.h>
59#include <asm/pgalloc.h> 61#include <asm/pgalloc.h>
60#include <asm/uaccess.h> 62#include <asm/uaccess.h>
61#include <asm/tlb.h> 63#include <asm/tlb.h>
@@ -106,6 +108,18 @@ static int __init disable_randmaps(char *s)
106} 108}
107__setup("norandmaps", disable_randmaps); 109__setup("norandmaps", disable_randmaps);
108 110
111unsigned long zero_pfn __read_mostly;
112unsigned long highest_memmap_pfn __read_mostly;
113
114/*
115 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
116 */
117static int __init init_zero_pfn(void)
118{
119 zero_pfn = page_to_pfn(ZERO_PAGE(0));
120 return 0;
121}
122core_initcall(init_zero_pfn);
109 123
110/* 124/*
111 * If a p?d_bad entry is found while walking page tables, report 125 * If a p?d_bad entry is found while walking page tables, report
@@ -283,7 +297,8 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
283 unsigned long addr = vma->vm_start; 297 unsigned long addr = vma->vm_start;
284 298
285 /* 299 /*
286 * Hide vma from rmap and vmtruncate before freeing pgtables 300 * Hide vma from rmap and truncate_pagecache before freeing
301 * pgtables
287 */ 302 */
288 anon_vma_unlink(vma); 303 anon_vma_unlink(vma);
289 unlink_file_vma(vma); 304 unlink_file_vma(vma);
@@ -442,6 +457,20 @@ static inline int is_cow_mapping(unsigned int flags)
442 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 457 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
443} 458}
444 459
460#ifndef is_zero_pfn
461static inline int is_zero_pfn(unsigned long pfn)
462{
463 return pfn == zero_pfn;
464}
465#endif
466
467#ifndef my_zero_pfn
468static inline unsigned long my_zero_pfn(unsigned long addr)
469{
470 return zero_pfn;
471}
472#endif
473
445/* 474/*
446 * vm_normal_page -- This function gets the "struct page" associated with a pte. 475 * vm_normal_page -- This function gets the "struct page" associated with a pte.
447 * 476 *
@@ -497,7 +526,9 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
497 if (HAVE_PTE_SPECIAL) { 526 if (HAVE_PTE_SPECIAL) {
498 if (likely(!pte_special(pte))) 527 if (likely(!pte_special(pte)))
499 goto check_pfn; 528 goto check_pfn;
500 if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) 529 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
530 return NULL;
531 if (!is_zero_pfn(pfn))
501 print_bad_pte(vma, addr, pte, NULL); 532 print_bad_pte(vma, addr, pte, NULL);
502 return NULL; 533 return NULL;
503 } 534 }
@@ -519,6 +550,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
519 } 550 }
520 } 551 }
521 552
553 if (is_zero_pfn(pfn))
554 return NULL;
522check_pfn: 555check_pfn:
523 if (unlikely(pfn > highest_memmap_pfn)) { 556 if (unlikely(pfn > highest_memmap_pfn)) {
524 print_bad_pte(vma, addr, pte, NULL); 557 print_bad_pte(vma, addr, pte, NULL);
@@ -596,8 +629,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
596 page = vm_normal_page(vma, addr, pte); 629 page = vm_normal_page(vma, addr, pte);
597 if (page) { 630 if (page) {
598 get_page(page); 631 get_page(page);
599 page_dup_rmap(page, vma, addr); 632 page_dup_rmap(page);
600 rss[!!PageAnon(page)]++; 633 rss[PageAnon(page)]++;
601 } 634 }
602 635
603out_set_pte: 636out_set_pte:
@@ -1142,9 +1175,14 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1142 goto no_page; 1175 goto no_page;
1143 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1176 if ((flags & FOLL_WRITE) && !pte_write(pte))
1144 goto unlock; 1177 goto unlock;
1178
1145 page = vm_normal_page(vma, address, pte); 1179 page = vm_normal_page(vma, address, pte);
1146 if (unlikely(!page)) 1180 if (unlikely(!page)) {
1147 goto bad_page; 1181 if ((flags & FOLL_DUMP) ||
1182 !is_zero_pfn(pte_pfn(pte)))
1183 goto bad_page;
1184 page = pte_page(pte);
1185 }
1148 1186
1149 if (flags & FOLL_GET) 1187 if (flags & FOLL_GET)
1150 get_page(page); 1188 get_page(page);
@@ -1172,65 +1210,46 @@ no_page:
1172 pte_unmap_unlock(ptep, ptl); 1210 pte_unmap_unlock(ptep, ptl);
1173 if (!pte_none(pte)) 1211 if (!pte_none(pte))
1174 return page; 1212 return page;
1175 /* Fall through to ZERO_PAGE handling */ 1213
1176no_page_table: 1214no_page_table:
1177 /* 1215 /*
1178 * When core dumping an enormous anonymous area that nobody 1216 * When core dumping an enormous anonymous area that nobody
1179 * has touched so far, we don't want to allocate page tables. 1217 * has touched so far, we don't want to allocate unnecessary pages or
1218 * page tables. Return error instead of NULL to skip handle_mm_fault,
1219 * then get_dump_page() will return NULL to leave a hole in the dump.
1220 * But we can only make this optimization where a hole would surely
1221 * be zero-filled if handle_mm_fault() actually did handle it.
1180 */ 1222 */
1181 if (flags & FOLL_ANON) { 1223 if ((flags & FOLL_DUMP) &&
1182 page = ZERO_PAGE(0); 1224 (!vma->vm_ops || !vma->vm_ops->fault))
1183 if (flags & FOLL_GET) 1225 return ERR_PTR(-EFAULT);
1184 get_page(page);
1185 BUG_ON(flags & FOLL_WRITE);
1186 }
1187 return page; 1226 return page;
1188} 1227}
1189 1228
1190/* Can we do the FOLL_ANON optimization? */
1191static inline int use_zero_page(struct vm_area_struct *vma)
1192{
1193 /*
1194 * We don't want to optimize FOLL_ANON for make_pages_present()
1195 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
1196 * we want to get the page from the page tables to make sure
1197 * that we serialize and update with any other user of that
1198 * mapping.
1199 */
1200 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1201 return 0;
1202 /*
1203 * And if we have a fault routine, it's not an anonymous region.
1204 */
1205 return !vma->vm_ops || !vma->vm_ops->fault;
1206}
1207
1208
1209
1210int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1229int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1211 unsigned long start, int nr_pages, int flags, 1230 unsigned long start, int nr_pages, unsigned int gup_flags,
1212 struct page **pages, struct vm_area_struct **vmas) 1231 struct page **pages, struct vm_area_struct **vmas)
1213{ 1232{
1214 int i; 1233 int i;
1215 unsigned int vm_flags = 0; 1234 unsigned long vm_flags;
1216 int write = !!(flags & GUP_FLAGS_WRITE);
1217 int force = !!(flags & GUP_FLAGS_FORCE);
1218 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1219 int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
1220 1235
1221 if (nr_pages <= 0) 1236 if (nr_pages <= 0)
1222 return 0; 1237 return 0;
1238
1239 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
1240
1223 /* 1241 /*
1224 * Require read or write permissions. 1242 * Require read or write permissions.
1225 * If 'force' is set, we only require the "MAY" flags. 1243 * If FOLL_FORCE is set, we only require the "MAY" flags.
1226 */ 1244 */
1227 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 1245 vm_flags = (gup_flags & FOLL_WRITE) ?
1228 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 1246 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1247 vm_flags &= (gup_flags & FOLL_FORCE) ?
1248 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1229 i = 0; 1249 i = 0;
1230 1250
1231 do { 1251 do {
1232 struct vm_area_struct *vma; 1252 struct vm_area_struct *vma;
1233 unsigned int foll_flags;
1234 1253
1235 vma = find_extend_vma(mm, start); 1254 vma = find_extend_vma(mm, start);
1236 if (!vma && in_gate_area(tsk, start)) { 1255 if (!vma && in_gate_area(tsk, start)) {
@@ -1242,7 +1261,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1242 pte_t *pte; 1261 pte_t *pte;
1243 1262
1244 /* user gate pages are read-only */ 1263 /* user gate pages are read-only */
1245 if (!ignore && write) 1264 if (gup_flags & FOLL_WRITE)
1246 return i ? : -EFAULT; 1265 return i ? : -EFAULT;
1247 if (pg > TASK_SIZE) 1266 if (pg > TASK_SIZE)
1248 pgd = pgd_offset_k(pg); 1267 pgd = pgd_offset_k(pg);
@@ -1276,38 +1295,26 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1276 1295
1277 if (!vma || 1296 if (!vma ||
1278 (vma->vm_flags & (VM_IO | VM_PFNMAP)) || 1297 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1279 (!ignore && !(vm_flags & vma->vm_flags))) 1298 !(vm_flags & vma->vm_flags))
1280 return i ? : -EFAULT; 1299 return i ? : -EFAULT;
1281 1300
1282 if (is_vm_hugetlb_page(vma)) { 1301 if (is_vm_hugetlb_page(vma)) {
1283 i = follow_hugetlb_page(mm, vma, pages, vmas, 1302 i = follow_hugetlb_page(mm, vma, pages, vmas,
1284 &start, &nr_pages, i, write); 1303 &start, &nr_pages, i, gup_flags);
1285 continue; 1304 continue;
1286 } 1305 }
1287 1306
1288 foll_flags = FOLL_TOUCH;
1289 if (pages)
1290 foll_flags |= FOLL_GET;
1291 if (!write && use_zero_page(vma))
1292 foll_flags |= FOLL_ANON;
1293
1294 do { 1307 do {
1295 struct page *page; 1308 struct page *page;
1309 unsigned int foll_flags = gup_flags;
1296 1310
1297 /* 1311 /*
1298 * If we have a pending SIGKILL, don't keep faulting 1312 * If we have a pending SIGKILL, don't keep faulting
1299 * pages and potentially allocating memory, unless 1313 * pages and potentially allocating memory.
1300 * current is handling munlock--e.g., on exit. In
1301 * that case, we are not allocating memory. Rather,
1302 * we're only unlocking already resident/mapped pages.
1303 */ 1314 */
1304 if (unlikely(!ignore_sigkill && 1315 if (unlikely(fatal_signal_pending(current)))
1305 fatal_signal_pending(current)))
1306 return i ? i : -ERESTARTSYS; 1316 return i ? i : -ERESTARTSYS;
1307 1317
1308 if (write)
1309 foll_flags |= FOLL_WRITE;
1310
1311 cond_resched(); 1318 cond_resched();
1312 while (!(page = follow_page(vma, start, foll_flags))) { 1319 while (!(page = follow_page(vma, start, foll_flags))) {
1313 int ret; 1320 int ret;
@@ -1319,7 +1326,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1319 if (ret & VM_FAULT_ERROR) { 1326 if (ret & VM_FAULT_ERROR) {
1320 if (ret & VM_FAULT_OOM) 1327 if (ret & VM_FAULT_OOM)
1321 return i ? i : -ENOMEM; 1328 return i ? i : -ENOMEM;
1322 else if (ret & VM_FAULT_SIGBUS) 1329 if (ret &
1330 (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
1323 return i ? i : -EFAULT; 1331 return i ? i : -EFAULT;
1324 BUG(); 1332 BUG();
1325 } 1333 }
@@ -1418,18 +1426,47 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1418 unsigned long start, int nr_pages, int write, int force, 1426 unsigned long start, int nr_pages, int write, int force,
1419 struct page **pages, struct vm_area_struct **vmas) 1427 struct page **pages, struct vm_area_struct **vmas)
1420{ 1428{
1421 int flags = 0; 1429 int flags = FOLL_TOUCH;
1422 1430
1431 if (pages)
1432 flags |= FOLL_GET;
1423 if (write) 1433 if (write)
1424 flags |= GUP_FLAGS_WRITE; 1434 flags |= FOLL_WRITE;
1425 if (force) 1435 if (force)
1426 flags |= GUP_FLAGS_FORCE; 1436 flags |= FOLL_FORCE;
1427 1437
1428 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 1438 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
1429} 1439}
1430
1431EXPORT_SYMBOL(get_user_pages); 1440EXPORT_SYMBOL(get_user_pages);
1432 1441
1442/**
1443 * get_dump_page() - pin user page in memory while writing it to core dump
1444 * @addr: user address
1445 *
1446 * Returns struct page pointer of user page pinned for dump,
1447 * to be freed afterwards by page_cache_release() or put_page().
1448 *
1449 * Returns NULL on any kind of failure - a hole must then be inserted into
1450 * the corefile, to preserve alignment with its headers; and also returns
1451 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
1452 * allowing a hole to be left in the corefile to save disk space.
1453 *
1454 * Called without mmap_sem, but after all other threads have been killed.
1455 */
1456#ifdef CONFIG_ELF_CORE
1457struct page *get_dump_page(unsigned long addr)
1458{
1459 struct vm_area_struct *vma;
1460 struct page *page;
1461
1462 if (__get_user_pages(current, current->mm, addr, 1,
1463 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
1464 return NULL;
1465 flush_cache_page(vma, addr, page_to_pfn(page));
1466 return page;
1467}
1468#endif /* CONFIG_ELF_CORE */
1469
1433pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1470pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
1434 spinlock_t **ptl) 1471 spinlock_t **ptl)
1435{ 1472{
@@ -1607,7 +1644,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1607 * If we don't have pte special, then we have to use the pfn_valid() 1644 * If we don't have pte special, then we have to use the pfn_valid()
1608 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* 1645 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
1609 * refcount the page if pfn_valid is true (hence insert_page rather 1646 * refcount the page if pfn_valid is true (hence insert_page rather
1610 * than insert_pfn). 1647 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
1648 * without pte special, it would there be refcounted as a normal page.
1611 */ 1649 */
1612 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { 1650 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
1613 struct page *page; 1651 struct page *page;
@@ -1973,7 +2011,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1973 * Take out anonymous pages first, anonymous shared vmas are 2011 * Take out anonymous pages first, anonymous shared vmas are
1974 * not dirty accountable. 2012 * not dirty accountable.
1975 */ 2013 */
1976 if (PageAnon(old_page)) { 2014 if (PageAnon(old_page) && !PageKsm(old_page)) {
1977 if (!trylock_page(old_page)) { 2015 if (!trylock_page(old_page)) {
1978 page_cache_get(old_page); 2016 page_cache_get(old_page);
1979 pte_unmap_unlock(page_table, ptl); 2017 pte_unmap_unlock(page_table, ptl);
@@ -2074,10 +2112,19 @@ gotten:
2074 2112
2075 if (unlikely(anon_vma_prepare(vma))) 2113 if (unlikely(anon_vma_prepare(vma)))
2076 goto oom; 2114 goto oom;
2077 VM_BUG_ON(old_page == ZERO_PAGE(0)); 2115
2078 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 2116 if (is_zero_pfn(pte_pfn(orig_pte))) {
2079 if (!new_page) 2117 new_page = alloc_zeroed_user_highpage_movable(vma, address);
2080 goto oom; 2118 if (!new_page)
2119 goto oom;
2120 } else {
2121 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2122 if (!new_page)
2123 goto oom;
2124 cow_user_page(new_page, old_page, address, vma);
2125 }
2126 __SetPageUptodate(new_page);
2127
2081 /* 2128 /*
2082 * Don't let another task, with possibly unlocked vma, 2129 * Don't let another task, with possibly unlocked vma,
2083 * keep the mlocked page. 2130 * keep the mlocked page.
@@ -2087,8 +2134,6 @@ gotten:
2087 clear_page_mlock(old_page); 2134 clear_page_mlock(old_page);
2088 unlock_page(old_page); 2135 unlock_page(old_page);
2089 } 2136 }
2090 cow_user_page(new_page, old_page, address, vma);
2091 __SetPageUptodate(new_page);
2092 2137
2093 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2138 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2094 goto oom_free_new; 2139 goto oom_free_new;
@@ -2114,9 +2159,14 @@ gotten:
2114 * seen in the presence of one thread doing SMC and another 2159 * seen in the presence of one thread doing SMC and another
2115 * thread doing COW. 2160 * thread doing COW.
2116 */ 2161 */
2117 ptep_clear_flush_notify(vma, address, page_table); 2162 ptep_clear_flush(vma, address, page_table);
2118 page_add_new_anon_rmap(new_page, vma, address); 2163 page_add_new_anon_rmap(new_page, vma, address);
2119 set_pte_at(mm, address, page_table, entry); 2164 /*
2165 * We call the notify macro here because, when using secondary
2166 * mmu page tables (such as kvm shadow page tables), we want the
2167 * new page to be mapped directly into the secondary page table.
2168 */
2169 set_pte_at_notify(mm, address, page_table, entry);
2120 update_mmu_cache(vma, address, entry); 2170 update_mmu_cache(vma, address, entry);
2121 if (old_page) { 2171 if (old_page) {
2122 /* 2172 /*
@@ -2359,7 +2409,7 @@ restart:
2359 * @mapping: the address space containing mmaps to be unmapped. 2409 * @mapping: the address space containing mmaps to be unmapped.
2360 * @holebegin: byte in first page to unmap, relative to the start of 2410 * @holebegin: byte in first page to unmap, relative to the start of
2361 * the underlying file. This will be rounded down to a PAGE_SIZE 2411 * the underlying file. This will be rounded down to a PAGE_SIZE
2362 * boundary. Note that this is different from vmtruncate(), which 2412 * boundary. Note that this is different from truncate_pagecache(), which
2363 * must keep the partial page. In contrast, we must get rid of 2413 * must keep the partial page. In contrast, we must get rid of
2364 * partial pages. 2414 * partial pages.
2365 * @holelen: size of prospective hole in bytes. This will be rounded 2415 * @holelen: size of prospective hole in bytes. This will be rounded
@@ -2410,63 +2460,6 @@ void unmap_mapping_range(struct address_space *mapping,
2410} 2460}
2411EXPORT_SYMBOL(unmap_mapping_range); 2461EXPORT_SYMBOL(unmap_mapping_range);
2412 2462
2413/**
2414 * vmtruncate - unmap mappings "freed" by truncate() syscall
2415 * @inode: inode of the file used
2416 * @offset: file offset to start truncating
2417 *
2418 * NOTE! We have to be ready to update the memory sharing
2419 * between the file and the memory map for a potential last
2420 * incomplete page. Ugly, but necessary.
2421 */
2422int vmtruncate(struct inode * inode, loff_t offset)
2423{
2424 if (inode->i_size < offset) {
2425 unsigned long limit;
2426
2427 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2428 if (limit != RLIM_INFINITY && offset > limit)
2429 goto out_sig;
2430 if (offset > inode->i_sb->s_maxbytes)
2431 goto out_big;
2432 i_size_write(inode, offset);
2433 } else {
2434 struct address_space *mapping = inode->i_mapping;
2435
2436 /*
2437 * truncation of in-use swapfiles is disallowed - it would
2438 * cause subsequent swapout to scribble on the now-freed
2439 * blocks.
2440 */
2441 if (IS_SWAPFILE(inode))
2442 return -ETXTBSY;
2443 i_size_write(inode, offset);
2444
2445 /*
2446 * unmap_mapping_range is called twice, first simply for
2447 * efficiency so that truncate_inode_pages does fewer
2448 * single-page unmaps. However after this first call, and
2449 * before truncate_inode_pages finishes, it is possible for
2450 * private pages to be COWed, which remain after
2451 * truncate_inode_pages finishes, hence the second
2452 * unmap_mapping_range call must be made for correctness.
2453 */
2454 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2455 truncate_inode_pages(mapping, offset);
2456 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2457 }
2458
2459 if (inode->i_op->truncate)
2460 inode->i_op->truncate(inode);
2461 return 0;
2462
2463out_sig:
2464 send_sig(SIGXFSZ, current, 0);
2465out_big:
2466 return -EFBIG;
2467}
2468EXPORT_SYMBOL(vmtruncate);
2469
2470int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) 2463int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2471{ 2464{
2472 struct address_space *mapping = inode->i_mapping; 2465 struct address_space *mapping = inode->i_mapping;
@@ -2511,8 +2504,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2511 goto out; 2504 goto out;
2512 2505
2513 entry = pte_to_swp_entry(orig_pte); 2506 entry = pte_to_swp_entry(orig_pte);
2514 if (is_migration_entry(entry)) { 2507 if (unlikely(non_swap_entry(entry))) {
2515 migration_entry_wait(mm, pmd, address); 2508 if (is_migration_entry(entry)) {
2509 migration_entry_wait(mm, pmd, address);
2510 } else if (is_hwpoison_entry(entry)) {
2511 ret = VM_FAULT_HWPOISON;
2512 } else {
2513 print_bad_pte(vma, address, orig_pte, NULL);
2514 ret = VM_FAULT_OOM;
2515 }
2516 goto out; 2516 goto out;
2517 } 2517 }
2518 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2518 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
@@ -2536,6 +2536,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2536 /* Had to read the page from swap area: Major fault */ 2536 /* Had to read the page from swap area: Major fault */
2537 ret = VM_FAULT_MAJOR; 2537 ret = VM_FAULT_MAJOR;
2538 count_vm_event(PGMAJFAULT); 2538 count_vm_event(PGMAJFAULT);
2539 } else if (PageHWPoison(page)) {
2540 ret = VM_FAULT_HWPOISON;
2541 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2542 goto out;
2539 } 2543 }
2540 2544
2541 lock_page(page); 2545 lock_page(page);
@@ -2624,6 +2628,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2624 spinlock_t *ptl; 2628 spinlock_t *ptl;
2625 pte_t entry; 2629 pte_t entry;
2626 2630
2631 if (!(flags & FAULT_FLAG_WRITE)) {
2632 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
2633 vma->vm_page_prot));
2634 ptl = pte_lockptr(mm, pmd);
2635 spin_lock(ptl);
2636 if (!pte_none(*page_table))
2637 goto unlock;
2638 goto setpte;
2639 }
2640
2627 /* Allocate our own private page. */ 2641 /* Allocate our own private page. */
2628 pte_unmap(page_table); 2642 pte_unmap(page_table);
2629 2643
@@ -2638,13 +2652,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2638 goto oom_free_page; 2652 goto oom_free_page;
2639 2653
2640 entry = mk_pte(page, vma->vm_page_prot); 2654 entry = mk_pte(page, vma->vm_page_prot);
2641 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2655 if (vma->vm_flags & VM_WRITE)
2656 entry = pte_mkwrite(pte_mkdirty(entry));
2642 2657
2643 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2658 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2644 if (!pte_none(*page_table)) 2659 if (!pte_none(*page_table))
2645 goto release; 2660 goto release;
2661
2646 inc_mm_counter(mm, anon_rss); 2662 inc_mm_counter(mm, anon_rss);
2647 page_add_new_anon_rmap(page, vma, address); 2663 page_add_new_anon_rmap(page, vma, address);
2664setpte:
2648 set_pte_at(mm, address, page_table, entry); 2665 set_pte_at(mm, address, page_table, entry);
2649 2666
2650 /* No need to invalidate - it was non-present before */ 2667 /* No need to invalidate - it was non-present before */
@@ -2699,6 +2716,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2699 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2716 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2700 return ret; 2717 return ret;
2701 2718
2719 if (unlikely(PageHWPoison(vmf.page))) {
2720 if (ret & VM_FAULT_LOCKED)
2721 unlock_page(vmf.page);
2722 return VM_FAULT_HWPOISON;
2723 }
2724
2702 /* 2725 /*
2703 * For consistency in subsequent calls, make the faulted page always 2726 * For consistency in subsequent calls, make the faulted page always
2704 * locked. 2727 * locked.
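
For context, a hedged sketch of how an in-kernel caller pins a user page after this rework: the public get_user_pages() interface keeps its write/force ints, while FOLL_TOUCH, FOLL_GET, FOLL_WRITE and FOLL_FORCE are now composed internally. The helper below is illustrative only; pin_one_user_page is not a real kernel function.

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/pagemap.h>

/* The caller releases the pin with put_page(*pagep) when done. */
static int pin_one_user_page(unsigned long addr, struct page **pagep)
{
	int ret;

	down_read(&current->mm->mmap_sem);
	/* write=1, force=0: internally becomes FOLL_TOUCH|FOLL_GET|FOLL_WRITE */
	ret = get_user_pages(current, current->mm, addr, 1, 1, 0, pagep, NULL);
	up_read(&current->mm->mmap_sem);

	if (ret < 0)
		return ret;
	return (ret == 1) ? 0 : -EFAULT;
}
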
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e4412a676c88..821dee596377 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -339,8 +339,11 @@ EXPORT_SYMBOL_GPL(__remove_pages);
339 339
340void online_page(struct page *page) 340void online_page(struct page *page)
341{ 341{
342 unsigned long pfn = page_to_pfn(page);
343
342 totalram_pages++; 344 totalram_pages++;
343 num_physpages++; 345 if (pfn >= num_physpages)
346 num_physpages = pfn + 1;
344 347
345#ifdef CONFIG_HIGHMEM 348#ifdef CONFIG_HIGHMEM
346 if (PageHighMem(page)) 349 if (PageHighMem(page))
@@ -410,7 +413,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
410 if (!populated_zone(zone)) 413 if (!populated_zone(zone))
411 need_zonelists_rebuild = 1; 414 need_zonelists_rebuild = 1;
412 415
413 ret = walk_memory_resource(pfn, nr_pages, &onlined_pages, 416 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
414 online_pages_range); 417 online_pages_range);
415 if (ret) { 418 if (ret) {
416 printk(KERN_DEBUG "online_pages %lx at %lx failed\n", 419 printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
@@ -422,6 +425,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
422 zone->present_pages += onlined_pages; 425 zone->present_pages += onlined_pages;
423 zone->zone_pgdat->node_present_pages += onlined_pages; 426 zone->zone_pgdat->node_present_pages += onlined_pages;
424 427
428 zone_pcp_update(zone);
425 setup_per_zone_wmarks(); 429 setup_per_zone_wmarks();
426 calculate_zone_inactive_ratio(zone); 430 calculate_zone_inactive_ratio(zone);
427 if (onlined_pages) { 431 if (onlined_pages) {
@@ -701,7 +705,7 @@ offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
701static void 705static void
702offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) 706offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
703{ 707{
704 walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL, 708 walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
705 offline_isolated_pages_cb); 709 offline_isolated_pages_cb);
706} 710}
707 711
@@ -727,7 +731,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
727 long offlined = 0; 731 long offlined = 0;
728 int ret; 732 int ret;
729 733
730 ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined, 734 ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
731 check_pages_isolated_cb); 735 check_pages_isolated_cb);
732 if (ret < 0) 736 if (ret < 0)
733 offlined = (long)ret; 737 offlined = (long)ret;
@@ -831,7 +835,6 @@ repeat:
831 zone->present_pages -= offlined_pages; 835 zone->present_pages -= offlined_pages;
832 zone->zone_pgdat->node_present_pages -= offlined_pages; 836 zone->zone_pgdat->node_present_pages -= offlined_pages;
833 totalram_pages -= offlined_pages; 837 totalram_pages -= offlined_pages;
834 num_physpages -= offlined_pages;
835 838
836 setup_per_zone_wmarks(); 839 setup_per_zone_wmarks();
837 calculate_zone_inactive_ratio(zone); 840 calculate_zone_inactive_ratio(zone);
diff --git a/mm/mempool.c b/mm/mempool.c
index 32e75d400503..1a3bc3d4d554 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -308,13 +308,6 @@ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
308} 308}
309EXPORT_SYMBOL(mempool_kmalloc); 309EXPORT_SYMBOL(mempool_kmalloc);
310 310
311void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
312{
313 size_t size = (size_t)pool_data;
314 return kzalloc(size, gfp_mask);
315}
316EXPORT_SYMBOL(mempool_kzalloc);
317
318void mempool_kfree(void *element, void *pool_data) 311void mempool_kfree(void *element, void *pool_data)
319{ 312{
320 kfree(element); 313 kfree(element);
diff --git a/mm/migrate.c b/mm/migrate.c
index 939888f9ddab..1a4bf4813780 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -67,6 +67,8 @@ int putback_lru_pages(struct list_head *l)
67 67
68 list_for_each_entry_safe(page, page2, l, lru) { 68 list_for_each_entry_safe(page, page2, l, lru) {
69 list_del(&page->lru); 69 list_del(&page->lru);
70 dec_zone_page_state(page, NR_ISOLATED_ANON +
71 page_is_file_cache(page));
70 putback_lru_page(page); 72 putback_lru_page(page);
71 count++; 73 count++;
72 } 74 }
@@ -147,7 +149,7 @@ out:
147static void remove_file_migration_ptes(struct page *old, struct page *new) 149static void remove_file_migration_ptes(struct page *old, struct page *new)
148{ 150{
149 struct vm_area_struct *vma; 151 struct vm_area_struct *vma;
150 struct address_space *mapping = page_mapping(new); 152 struct address_space *mapping = new->mapping;
151 struct prio_tree_iter iter; 153 struct prio_tree_iter iter;
152 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 154 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
153 155
@@ -270,7 +272,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
270 pslot = radix_tree_lookup_slot(&mapping->page_tree, 272 pslot = radix_tree_lookup_slot(&mapping->page_tree,
271 page_index(page)); 273 page_index(page));
272 274
273 expected_count = 2 + !!page_has_private(page); 275 expected_count = 2 + page_has_private(page);
274 if (page_count(page) != expected_count || 276 if (page_count(page) != expected_count ||
275 (struct page *)radix_tree_deref_slot(pslot) != page) { 277 (struct page *)radix_tree_deref_slot(pslot) != page) {
276 spin_unlock_irq(&mapping->tree_lock); 278 spin_unlock_irq(&mapping->tree_lock);
@@ -312,7 +314,10 @@ static int migrate_page_move_mapping(struct address_space *mapping,
312 */ 314 */
313 __dec_zone_page_state(page, NR_FILE_PAGES); 315 __dec_zone_page_state(page, NR_FILE_PAGES);
314 __inc_zone_page_state(newpage, NR_FILE_PAGES); 316 __inc_zone_page_state(newpage, NR_FILE_PAGES);
315 317 if (PageSwapBacked(page)) {
318 __dec_zone_page_state(page, NR_SHMEM);
319 __inc_zone_page_state(newpage, NR_SHMEM);
320 }
316 spin_unlock_irq(&mapping->tree_lock); 321 spin_unlock_irq(&mapping->tree_lock);
317 322
318 return 0; 323 return 0;
@@ -664,13 +669,15 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
664 * needs to be effective. 669 * needs to be effective.
665 */ 670 */
666 try_to_free_buffers(page); 671 try_to_free_buffers(page);
672 goto rcu_unlock;
667 } 673 }
668 goto rcu_unlock; 674 goto skip_unmap;
669 } 675 }
670 676
671 /* Establish migration ptes or remove ptes */ 677 /* Establish migration ptes or remove ptes */
672 try_to_unmap(page, 1); 678 try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
673 679
680skip_unmap:
674 if (!page_mapped(page)) 681 if (!page_mapped(page))
675 rc = move_to_new_page(newpage, page); 682 rc = move_to_new_page(newpage, page);
676 683
@@ -693,6 +700,8 @@ unlock:
693 * restored. 700 * restored.
694 */ 701 */
695 list_del(&page->lru); 702 list_del(&page->lru);
703 dec_zone_page_state(page, NR_ISOLATED_ANON +
704 page_is_file_cache(page));
696 putback_lru_page(page); 705 putback_lru_page(page);
697 } 706 }
698 707
@@ -737,6 +746,13 @@ int migrate_pages(struct list_head *from,
737 struct page *page2; 746 struct page *page2;
738 int swapwrite = current->flags & PF_SWAPWRITE; 747 int swapwrite = current->flags & PF_SWAPWRITE;
739 int rc; 748 int rc;
749 unsigned long flags;
750
751 local_irq_save(flags);
752 list_for_each_entry(page, from, lru)
753 __inc_zone_page_state(page, NR_ISOLATED_ANON +
754 page_is_file_cache(page));
755 local_irq_restore(flags);
740 756
741 if (!swapwrite) 757 if (!swapwrite)
742 current->flags |= PF_SWAPWRITE; 758 current->flags |= PF_SWAPWRITE;
diff --git a/mm/mlock.c b/mm/mlock.c
index 45eb650b9654..bd6f0e466f6c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -139,49 +139,36 @@ static void munlock_vma_page(struct page *page)
139} 139}
140 140
141/** 141/**
142 * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma. 142 * __mlock_vma_pages_range() - mlock a range of pages in the vma.
143 * @vma: target vma 143 * @vma: target vma
144 * @start: start address 144 * @start: start address
145 * @end: end address 145 * @end: end address
146 * @mlock: 0 indicate munlock, otherwise mlock.
147 * 146 *
148 * If @mlock == 0, unlock an mlocked range; 147 * This takes care of making the pages present too.
149 * else mlock the range of pages. This takes care of making the pages present ,
150 * too.
151 * 148 *
152 * return 0 on success, negative error code on error. 149 * return 0 on success, negative error code on error.
153 * 150 *
154 * vma->vm_mm->mmap_sem must be held for at least read. 151 * vma->vm_mm->mmap_sem must be held for at least read.
155 */ 152 */
156static long __mlock_vma_pages_range(struct vm_area_struct *vma, 153static long __mlock_vma_pages_range(struct vm_area_struct *vma,
157 unsigned long start, unsigned long end, 154 unsigned long start, unsigned long end)
158 int mlock)
159{ 155{
160 struct mm_struct *mm = vma->vm_mm; 156 struct mm_struct *mm = vma->vm_mm;
161 unsigned long addr = start; 157 unsigned long addr = start;
162 struct page *pages[16]; /* 16 gives a reasonable batch */ 158 struct page *pages[16]; /* 16 gives a reasonable batch */
163 int nr_pages = (end - start) / PAGE_SIZE; 159 int nr_pages = (end - start) / PAGE_SIZE;
164 int ret = 0; 160 int ret = 0;
165 int gup_flags = 0; 161 int gup_flags;
166 162
167 VM_BUG_ON(start & ~PAGE_MASK); 163 VM_BUG_ON(start & ~PAGE_MASK);
168 VM_BUG_ON(end & ~PAGE_MASK); 164 VM_BUG_ON(end & ~PAGE_MASK);
169 VM_BUG_ON(start < vma->vm_start); 165 VM_BUG_ON(start < vma->vm_start);
170 VM_BUG_ON(end > vma->vm_end); 166 VM_BUG_ON(end > vma->vm_end);
171 VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) && 167 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
172 (atomic_read(&mm->mm_users) != 0));
173
174 /*
175 * mlock: don't page populate if vma has PROT_NONE permission.
176 * munlock: always do munlock although the vma has PROT_NONE
177 * permission, or SIGKILL is pending.
178 */
179 if (!mlock)
180 gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
181 GUP_FLAGS_IGNORE_SIGKILL;
182 168
169 gup_flags = FOLL_TOUCH | FOLL_GET;
183 if (vma->vm_flags & VM_WRITE) 170 if (vma->vm_flags & VM_WRITE)
184 gup_flags |= GUP_FLAGS_WRITE; 171 gup_flags |= FOLL_WRITE;
185 172
186 while (nr_pages > 0) { 173 while (nr_pages > 0) {
187 int i; 174 int i;
@@ -201,51 +188,45 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
201 * This can happen for, e.g., VM_NONLINEAR regions before 188 * This can happen for, e.g., VM_NONLINEAR regions before
202 * a page has been allocated and mapped at a given offset, 189 * a page has been allocated and mapped at a given offset,
203 * or for addresses that map beyond end of a file. 190 * or for addresses that map beyond end of a file.
204 * We'll mlock the the pages if/when they get faulted in. 191 * We'll mlock the pages if/when they get faulted in.
205 */ 192 */
206 if (ret < 0) 193 if (ret < 0)
207 break; 194 break;
208 if (ret == 0) {
209 /*
210 * We know the vma is there, so the only time
211 * we cannot get a single page should be an
212 * error (ret < 0) case.
213 */
214 WARN_ON(1);
215 break;
216 }
217 195
218 lru_add_drain(); /* push cached pages to LRU */ 196 lru_add_drain(); /* push cached pages to LRU */
219 197
220 for (i = 0; i < ret; i++) { 198 for (i = 0; i < ret; i++) {
221 struct page *page = pages[i]; 199 struct page *page = pages[i];
222 200
223 lock_page(page);
224 /*
225 * Because we lock page here and migration is blocked
226 * by the elevated reference, we need only check for
227 * page truncation (file-cache only).
228 */
229 if (page->mapping) { 201 if (page->mapping) {
230 if (mlock) 202 /*
203 * That preliminary check is mainly to avoid
204 * the pointless overhead of lock_page on the
205 * ZERO_PAGE: which might bounce very badly if
206 * there is contention. However, we're still
207 * dirtying its cacheline with get/put_page:
208 * we'll add another __get_user_pages flag to
209 * avoid it if that case turns out to matter.
210 */
211 lock_page(page);
212 /*
213 * Because we lock page here and migration is
214 * blocked by the elevated reference, we need
215 * only check for file-cache page truncation.
216 */
217 if (page->mapping)
231 mlock_vma_page(page); 218 mlock_vma_page(page);
232 else 219 unlock_page(page);
233 munlock_vma_page(page);
234 } 220 }
235 unlock_page(page); 221 put_page(page); /* ref from get_user_pages() */
236 put_page(page); /* ref from get_user_pages() */
237
238 /*
239 * here we assume that get_user_pages() has given us
240 * a list of virtually contiguous pages.
241 */
242 addr += PAGE_SIZE; /* for next get_user_pages() */
243 nr_pages--;
244 } 222 }
223
224 addr += ret * PAGE_SIZE;
225 nr_pages -= ret;
245 ret = 0; 226 ret = 0;
246 } 227 }
247 228
248 return ret; /* count entire vma as locked_vm */ 229 return ret; /* 0 or negative error code */
249} 230}
250 231
251/* 232/*
@@ -289,7 +270,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
289 is_vm_hugetlb_page(vma) || 270 is_vm_hugetlb_page(vma) ||
290 vma == get_gate_vma(current))) { 271 vma == get_gate_vma(current))) {
291 272
292 __mlock_vma_pages_range(vma, start, end, 1); 273 __mlock_vma_pages_range(vma, start, end);
293 274
294 /* Hide errors from mmap() and other callers */ 275 /* Hide errors from mmap() and other callers */
295 return 0; 276 return 0;
@@ -310,7 +291,6 @@ no_mlock:
310 return nr_pages; /* error or pages NOT mlocked */ 291 return nr_pages; /* error or pages NOT mlocked */
311} 292}
312 293
313
314/* 294/*
315 * munlock_vma_pages_range() - munlock all pages in the vma range. 295 * munlock_vma_pages_range() - munlock all pages in the vma range.
316 * @vma - vma containing range to be munlock()ed. 296 * @vma - vma containing range to be munlock()ed.
@@ -330,10 +310,38 @@ no_mlock:
330 * free them. This will result in freeing mlocked pages. 310 * free them. This will result in freeing mlocked pages.
331 */ 311 */
332void munlock_vma_pages_range(struct vm_area_struct *vma, 312void munlock_vma_pages_range(struct vm_area_struct *vma,
333 unsigned long start, unsigned long end) 313 unsigned long start, unsigned long end)
334{ 314{
315 unsigned long addr;
316
317 lru_add_drain();
335 vma->vm_flags &= ~VM_LOCKED; 318 vma->vm_flags &= ~VM_LOCKED;
336 __mlock_vma_pages_range(vma, start, end, 0); 319
320 for (addr = start; addr < end; addr += PAGE_SIZE) {
321 struct page *page;
322 /*
323 * Although FOLL_DUMP is intended for get_dump_page(),
324 * it just so happens that its special treatment of the
325 * ZERO_PAGE (returning an error instead of doing get_page)
326 * suits munlock very well (and if somehow an abnormal page
327 * has sneaked into the range, we won't oops here: great).
328 */
329 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
330 if (page && !IS_ERR(page)) {
331 lock_page(page);
332 /*
333 * Like in __mlock_vma_pages_range(),
334 * because we lock page here and migration is
335 * blocked by the elevated reference, we need
336 * only check for file-cache page truncation.
337 */
338 if (page->mapping)
339 munlock_vma_page(page);
340 unlock_page(page);
341 put_page(page);
342 }
343 cond_resched();
344 }
337} 345}
338 346
339/* 347/*
@@ -400,18 +408,14 @@ success:
400 * It's okay if try_to_unmap_one unmaps a page just after we 408 * It's okay if try_to_unmap_one unmaps a page just after we
401 * set VM_LOCKED, __mlock_vma_pages_range will bring it back. 409 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
402 */ 410 */
403 vma->vm_flags = newflags;
404 411
405 if (lock) { 412 if (lock) {
406 ret = __mlock_vma_pages_range(vma, start, end, 1); 413 vma->vm_flags = newflags;
407 414 ret = __mlock_vma_pages_range(vma, start, end);
408 if (ret > 0) { 415 if (ret < 0)
409 mm->locked_vm -= ret; 416 ret = __mlock_posix_error_return(ret);
410 ret = 0;
411 } else
412 ret = __mlock_posix_error_return(ret); /* translate if needed */
413 } else { 417 } else {
414 __mlock_vma_pages_range(vma, start, end, 0); 418 munlock_vma_pages_range(vma, start, end);
415 } 419 }
416 420
417out: 421out:
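
As a userspace counterpart (illustrative only, not part of the patch), the rewritten paths above are what service an ordinary mlock()/munlock() pair such as the one below; a 4 KiB page size is assumed.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4 * 4096;		/* assumes 4 KiB pages */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	if (mlock(buf, len) != 0) {	/* __mlock_vma_pages_range() faults the pages in */
		perror("mlock");
		return 1;
	}
	memset(buf, 0xab, len);		/* already resident: no major faults expected */
	munlock(buf, len);		/* munlock_vma_pages_range() clears the mlock state */
	munmap(buf, len);
	return 0;
}
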
diff --git a/mm/mmap.c b/mm/mmap.c
index 8101de490c73..73f5e4b64010 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,7 +28,7 @@
28#include <linux/mempolicy.h> 28#include <linux/mempolicy.h>
29#include <linux/rmap.h> 29#include <linux/rmap.h>
30#include <linux/mmu_notifier.h> 30#include <linux/mmu_notifier.h>
31#include <linux/perf_counter.h> 31#include <linux/perf_event.h>
32 32
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
34#include <asm/cacheflush.h> 34#include <asm/cacheflush.h>
@@ -570,9 +570,9 @@ again: remove_next = 1 + (end > next->vm_end);
570 570
571 /* 571 /*
572 * When changing only vma->vm_end, we don't really need 572 * When changing only vma->vm_end, we don't really need
573 * anon_vma lock: but is that case worth optimizing out? 573 * anon_vma lock.
574 */ 574 */
575 if (vma->anon_vma) 575 if (vma->anon_vma && (insert || importer || start != vma->vm_start))
576 anon_vma = vma->anon_vma; 576 anon_vma = vma->anon_vma;
577 if (anon_vma) { 577 if (anon_vma) {
578 spin_lock(&anon_vma->lock); 578 spin_lock(&anon_vma->lock);
@@ -656,9 +656,6 @@ again: remove_next = 1 + (end > next->vm_end);
656 validate_mm(mm); 656 validate_mm(mm);
657} 657}
658 658
659/* Flags that can be inherited from an existing mapping when merging */
660#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR)
661
662/* 659/*
663 * If the vma has a ->close operation then the driver probably needs to release 660 * If the vma has a ->close operation then the driver probably needs to release
664 * per-vma resources, so we don't attempt to merge those. 661 * per-vma resources, so we don't attempt to merge those.
@@ -666,7 +663,8 @@ again: remove_next = 1 + (end > next->vm_end);
666static inline int is_mergeable_vma(struct vm_area_struct *vma, 663static inline int is_mergeable_vma(struct vm_area_struct *vma,
667 struct file *file, unsigned long vm_flags) 664 struct file *file, unsigned long vm_flags)
668{ 665{
669 if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS) 666 /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */
667 if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR)
670 return 0; 668 return 0;
671 if (vma->vm_file != file) 669 if (vma->vm_file != file)
672 return 0; 670 return 0;
@@ -905,7 +903,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
905#endif /* CONFIG_PROC_FS */ 903#endif /* CONFIG_PROC_FS */
906 904
907/* 905/*
908 * The caller must hold down_write(current->mm->mmap_sem). 906 * The caller must hold down_write(&current->mm->mmap_sem).
909 */ 907 */
910 908
911unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 909unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
@@ -951,6 +949,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
951 if (mm->map_count > sysctl_max_map_count) 949 if (mm->map_count > sysctl_max_map_count)
952 return -ENOMEM; 950 return -ENOMEM;
953 951
952 if (flags & MAP_HUGETLB) {
953 struct user_struct *user = NULL;
954 if (file)
955 return -EINVAL;
956
957 /*
958 * VM_NORESERVE is used because the reservations will be
959 * taken when vm_ops->mmap() is called
960 * A dummy user value is used because we are not locking
961 * memory so no accounting is necessary
962 */
963 len = ALIGN(len, huge_page_size(&default_hstate));
964 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
965 &user, HUGETLB_ANONHUGE_INODE);
966 if (IS_ERR(file))
967 return PTR_ERR(file);
968 }
969
954 /* Obtain the address to map to. we verify (or select) it and ensure 970 /* Obtain the address to map to. we verify (or select) it and ensure
955 * that it represents a valid section of the address space. 971 * that it represents a valid section of the address space.
956 */ 972 */
@@ -965,11 +981,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
965 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | 981 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
966 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 982 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
967 983
968 if (flags & MAP_LOCKED) { 984 if (flags & MAP_LOCKED)
969 if (!can_do_mlock()) 985 if (!can_do_mlock())
970 return -EPERM; 986 return -EPERM;
971 vm_flags |= VM_LOCKED;
972 }
973 987
974 /* mlock MCL_FUTURE? */ 988 /* mlock MCL_FUTURE? */
975 if (vm_flags & VM_LOCKED) { 989 if (vm_flags & VM_LOCKED) {
@@ -1195,21 +1209,21 @@ munmap_back:
1195 goto unmap_and_free_vma; 1209 goto unmap_and_free_vma;
1196 if (vm_flags & VM_EXECUTABLE) 1210 if (vm_flags & VM_EXECUTABLE)
1197 added_exe_file_vma(mm); 1211 added_exe_file_vma(mm);
1212
1213 /* Can addr have changed??
1214 *
1215 * Answer: Yes, several device drivers can do it in their
1216 * f_op->mmap method. -DaveM
1217 */
1218 addr = vma->vm_start;
1219 pgoff = vma->vm_pgoff;
1220 vm_flags = vma->vm_flags;
1198 } else if (vm_flags & VM_SHARED) { 1221 } else if (vm_flags & VM_SHARED) {
1199 error = shmem_zero_setup(vma); 1222 error = shmem_zero_setup(vma);
1200 if (error) 1223 if (error)
1201 goto free_vma; 1224 goto free_vma;
1202 } 1225 }
1203 1226
1204 /* Can addr have changed??
1205 *
1206 * Answer: Yes, several device drivers can do it in their
1207 * f_op->mmap method. -DaveM
1208 */
1209 addr = vma->vm_start;
1210 pgoff = vma->vm_pgoff;
1211 vm_flags = vma->vm_flags;
1212
1213 if (vma_wants_writenotify(vma)) 1227 if (vma_wants_writenotify(vma))
1214 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); 1228 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1215 1229
@@ -1220,7 +1234,7 @@ munmap_back:
1220 if (correct_wcount) 1234 if (correct_wcount)
1221 atomic_inc(&inode->i_writecount); 1235 atomic_inc(&inode->i_writecount);
1222out: 1236out:
1223 perf_counter_mmap(vma); 1237 perf_event_mmap(vma);
1224 1238
1225 mm->total_vm += len >> PAGE_SHIFT; 1239 mm->total_vm += len >> PAGE_SHIFT;
1226 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1240 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
@@ -2111,6 +2125,7 @@ void exit_mmap(struct mm_struct *mm)
2111 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2125 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2112 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2126 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2113 vm_unacct_memory(nr_accounted); 2127 vm_unacct_memory(nr_accounted);
2128
2114 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); 2129 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
2115 tlb_finish_mmu(tlb, 0, end); 2130 tlb_finish_mmu(tlb, 0, end);
2116 2131
@@ -2267,7 +2282,7 @@ static void special_mapping_close(struct vm_area_struct *vma)
2267{ 2282{
2268} 2283}
2269 2284
2270static struct vm_operations_struct special_mapping_vmops = { 2285static const struct vm_operations_struct special_mapping_vmops = {
2271 .close = special_mapping_close, 2286 .close = special_mapping_close,
2272 .fault = special_mapping_fault, 2287 .fault = special_mapping_fault,
2273}; 2288};
@@ -2308,7 +2323,7 @@ int install_special_mapping(struct mm_struct *mm,
2308 2323
2309 mm->total_vm += len >> PAGE_SHIFT; 2324 mm->total_vm += len >> PAGE_SHIFT;
2310 2325
2311 perf_counter_mmap(vma); 2326 perf_event_mmap(vma);
2312 2327
2313 return 0; 2328 return 0;
2314} 2329}
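
A userspace sketch (illustrative, not from the patch) of the new MAP_HUGETLB path handled in do_mmap_pgoff() above; the fallback constant is the x86 value, and hugepages must have been reserved (e.g. via vm.nr_hugepages) for the mmap() to succeed.

#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000	/* x86 value; older userspace headers may lack it */
#endif

int main(void)
{
	size_t len = 2 * 1024 * 1024;	/* one 2 MB hugepage on x86 */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	munmap(p, len);
	return 0;
}
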
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
new file mode 100644
index 000000000000..ded9081f4021
--- /dev/null
+++ b/mm/mmu_context.c
@@ -0,0 +1,58 @@
1/* Copyright (C) 2009 Red Hat, Inc.
2 *
3 * See ../COPYING for licensing terms.
4 */
5
6#include <linux/mm.h>
7#include <linux/mmu_context.h>
8#include <linux/sched.h>
9
10#include <asm/mmu_context.h>
11
12/*
13 * use_mm
14 * Makes the calling kernel thread take on the specified
15 * mm context.
16 *	Called by the retry thread to execute retries within the
17 * iocb issuer's mm context, so that copy_from/to_user
18 * operations work seamlessly for aio.
19 * (Note: this routine is intended to be called only
20 * from a kernel thread context)
21 */
22void use_mm(struct mm_struct *mm)
23{
24 struct mm_struct *active_mm;
25 struct task_struct *tsk = current;
26
27 task_lock(tsk);
28 active_mm = tsk->active_mm;
29 if (active_mm != mm) {
30 atomic_inc(&mm->mm_count);
31 tsk->active_mm = mm;
32 }
33 tsk->mm = mm;
34 switch_mm(active_mm, mm, tsk);
35 task_unlock(tsk);
36
37 if (active_mm != mm)
38 mmdrop(active_mm);
39}
40
41/*
42 * unuse_mm
43 * Reverses the effect of use_mm, i.e. releases the
44 * specified mm context which was earlier taken on
45 * by the calling kernel thread
46 * (Note: this routine is intended to be called only
47 * from a kernel thread context)
48 */
49void unuse_mm(struct mm_struct *mm)
50{
51 struct task_struct *tsk = current;
52
53 task_lock(tsk);
54 tsk->mm = NULL;
55 /* active_mm is still 'mm' */
56 enter_lazy_tlb(mm, tsk);
57 task_unlock(tsk);
58}
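
A hedged sketch of the intended use of the new helpers: a kernel thread temporarily adopts a user mm so that copy_to_user() resolves against that process's address space, as the aio retry path does. copy_result_to_user and its arguments are hypothetical.

#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/uaccess.h>

static int copy_result_to_user(struct mm_struct *target_mm,
			       void __user *uaddr, const void *data, size_t len)
{
	int ret = 0;

	use_mm(target_mm);		/* take on the user mm from a kernel thread */
	if (copy_to_user(uaddr, data, len))
		ret = -EFAULT;
	unuse_mm(target_mm);		/* drop it again; active_mm is kept lazily */

	return ret;
}
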
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5f4ef0250bee..7e33f2cb3c77 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -99,6 +99,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
99 return young; 99 return young;
100} 100}
101 101
102void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
103 pte_t pte)
104{
105 struct mmu_notifier *mn;
106 struct hlist_node *n;
107
108 rcu_read_lock();
109 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
110 if (mn->ops->change_pte)
111 mn->ops->change_pte(mn, mm, address, pte);
112 /*
113 * Some drivers don't have change_pte,
114 * so we must call invalidate_page in that case.
115 */
116 else if (mn->ops->invalidate_page)
117 mn->ops->invalidate_page(mn, mm, address);
118 }
119 rcu_read_unlock();
120}
121
102void __mmu_notifier_invalidate_page(struct mm_struct *mm, 122void __mmu_notifier_invalidate_page(struct mm_struct *mm,
103 unsigned long address) 123 unsigned long address)
104{ 124{
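
A sketch, under stated assumptions, of the driver side of the new callback: a secondary-MMU user implementing change_pte() so that set_pte_at_notify() updates its shadow mappings directly, with invalidate_page() as the fallback the code above uses when change_pte is absent. The my_* names are hypothetical; a real driver would attach with mmu_notifier_register(&my_notifier, mm) and maintain real shadow page tables instead of printing.

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>

static void my_change_pte(struct mmu_notifier *mn, struct mm_struct *mm,
			  unsigned long address, pte_t pte)
{
	/* A real driver would rewrite its shadow pte for 'address' here. */
	pr_debug("change_pte at %#lx\n", address);
}

static void my_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm,
			       unsigned long address)
{
	/* Fallback path: just drop any shadow mapping of 'address'. */
	pr_debug("invalidate_page at %#lx\n", address);
}

static const struct mmu_notifier_ops my_mmu_notifier_ops = {
	.change_pte	 = my_change_pte,
	.invalidate_page = my_invalidate_page,
};

static struct mmu_notifier my_notifier = {
	.ops = &my_mmu_notifier_ops,
};
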
diff --git a/mm/mprotect.c b/mm/mprotect.c
index d80311baeb2d..8bc969d8112d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,7 +23,7 @@
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h> 24#include <linux/mmu_notifier.h>
25#include <linux/migrate.h> 25#include <linux/migrate.h>
26#include <linux/perf_counter.h> 26#include <linux/perf_event.h>
27#include <asm/uaccess.h> 27#include <asm/uaccess.h>
28#include <asm/pgtable.h> 28#include <asm/pgtable.h>
29#include <asm/cacheflush.h> 29#include <asm/cacheflush.h>
@@ -300,7 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
300 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); 300 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
301 if (error) 301 if (error)
302 goto out; 302 goto out;
303 perf_counter_mmap(vma); 303 perf_event_mmap(vma);
304 nstart = tmp; 304 nstart = tmp;
305 305
306 if (nstart < prev->vm_end) 306 if (nstart < prev->vm_end)
diff --git a/mm/mremap.c b/mm/mremap.c
index a39b7b91be46..97bff2547719 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -11,6 +11,7 @@
11#include <linux/hugetlb.h> 11#include <linux/hugetlb.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/shm.h> 13#include <linux/shm.h>
14#include <linux/ksm.h>
14#include <linux/mman.h> 15#include <linux/mman.h>
15#include <linux/swap.h> 16#include <linux/swap.h>
16#include <linux/capability.h> 17#include <linux/capability.h>
@@ -85,8 +86,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
85 if (vma->vm_file) { 86 if (vma->vm_file) {
86 /* 87 /*
87 * Subtle point from Rajesh Venkatasubramanian: before 88 * Subtle point from Rajesh Venkatasubramanian: before
88 * moving file-based ptes, we must lock vmtruncate out, 89 * moving file-based ptes, we must lock truncate_pagecache
89 * since it might clean the dst vma before the src vma, 90 * out, since it might clean the dst vma before the src vma,
90 * and we propagate stale pages into the dst afterward. 91 * and we propagate stale pages into the dst afterward.
91 */ 92 */
92 mapping = vma->vm_file->f_mapping; 93 mapping = vma->vm_file->f_mapping;
@@ -174,6 +175,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
174 unsigned long excess = 0; 175 unsigned long excess = 0;
175 unsigned long hiwater_vm; 176 unsigned long hiwater_vm;
176 int split = 0; 177 int split = 0;
178 int err;
177 179
178 /* 180 /*
179 * We'd prefer to avoid failure later on in do_munmap: 181 * We'd prefer to avoid failure later on in do_munmap:
@@ -182,6 +184,18 @@ static unsigned long move_vma(struct vm_area_struct *vma,
182 if (mm->map_count >= sysctl_max_map_count - 3) 184 if (mm->map_count >= sysctl_max_map_count - 3)
183 return -ENOMEM; 185 return -ENOMEM;
184 186
187 /*
188 * Advise KSM to break any KSM pages in the area to be moved:
189 * it would be confusing if they were to turn up at the new
190 * location, where they happen to coincide with different KSM
191 * pages recently unmapped. But leave vma->vm_flags as it was,
192 * so KSM can come around to merge on vma and new_vma afterwards.
193 */
194 err = ksm_madvise(vma, old_addr, old_addr + old_len,
195 MADV_UNMERGEABLE, &vm_flags);
196 if (err)
197 return err;
198
185 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); 199 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
186 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); 200 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
187 if (!new_vma) 201 if (!new_vma)
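
A userspace counterpart (illustrative only): an application advising a region as mergeable for KSM; the mremap() change above ensures such pages are un-merged before the region is moved. Older libc headers may not define the new advice values yet, hence the fallback constants.

#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE   12
#define MADV_UNMERGEABLE 13
#endif

int main(void)
{
	size_t len = 16 * 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	if (madvise(p, len, MADV_MERGEABLE) != 0)
		perror("madvise(MADV_MERGEABLE)");	/* e.g. CONFIG_KSM not set */

	/* ... use the memory; ksmd may merge identical pages ... */

	madvise(p, len, MADV_UNMERGEABLE);
	munmap(p, len);
	return 0;
}
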
diff --git a/mm/nommu.c b/mm/nommu.c
index 66e81e7e9fe9..5189b5aed8c0 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -33,6 +33,7 @@
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
34#include <asm/tlb.h> 34#include <asm/tlb.h>
35#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
36#include <asm/mmu_context.h>
36#include "internal.h" 37#include "internal.h"
37 38
38static inline __attribute__((format(printf, 1, 2))) 39static inline __attribute__((format(printf, 1, 2)))
@@ -56,12 +57,11 @@ void no_printk(const char *fmt, ...)
56 no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) 57 no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
57#endif 58#endif
58 59
59#include "internal.h"
60
61void *high_memory; 60void *high_memory;
62struct page *mem_map; 61struct page *mem_map;
63unsigned long max_mapnr; 62unsigned long max_mapnr;
64unsigned long num_physpages; 63unsigned long num_physpages;
64unsigned long highest_memmap_pfn;
65struct percpu_counter vm_committed_as; 65struct percpu_counter vm_committed_as;
66int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 66int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
67int sysctl_overcommit_ratio = 50; /* default is 50% */ 67int sysctl_overcommit_ratio = 50; /* default is 50% */
@@ -79,50 +79,10 @@ static struct kmem_cache *vm_region_jar;
79struct rb_root nommu_region_tree = RB_ROOT; 79struct rb_root nommu_region_tree = RB_ROOT;
80DECLARE_RWSEM(nommu_region_sem); 80DECLARE_RWSEM(nommu_region_sem);
81 81
82struct vm_operations_struct generic_file_vm_ops = { 82const struct vm_operations_struct generic_file_vm_ops = {
83}; 83};
84 84
85/* 85/*
86 * Handle all mappings that got truncated by a "truncate()"
87 * system call.
88 *
89 * NOTE! We have to be ready to update the memory sharing
90 * between the file and the memory map for a potential last
91 * incomplete page. Ugly, but necessary.
92 */
93int vmtruncate(struct inode *inode, loff_t offset)
94{
95 struct address_space *mapping = inode->i_mapping;
96 unsigned long limit;
97
98 if (inode->i_size < offset)
99 goto do_expand;
100 i_size_write(inode, offset);
101
102 truncate_inode_pages(mapping, offset);
103 goto out_truncate;
104
105do_expand:
106 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
107 if (limit != RLIM_INFINITY && offset > limit)
108 goto out_sig;
109 if (offset > inode->i_sb->s_maxbytes)
110 goto out;
111 i_size_write(inode, offset);
112
113out_truncate:
114 if (inode->i_op->truncate)
115 inode->i_op->truncate(inode);
116 return 0;
117out_sig:
118 send_sig(SIGXFSZ, current, 0);
119out:
120 return -EFBIG;
121}
122
123EXPORT_SYMBOL(vmtruncate);
124
125/*
126 * Return the total memory allocated for this pointer, not 86 * Return the total memory allocated for this pointer, not
127 * just what the caller asked for. 87 * just what the caller asked for.
128 * 88 *
@@ -170,21 +130,20 @@ unsigned int kobjsize(const void *objp)
170} 130}
171 131
172int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 132int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
173 unsigned long start, int nr_pages, int flags, 133 unsigned long start, int nr_pages, unsigned int foll_flags,
174 struct page **pages, struct vm_area_struct **vmas) 134 struct page **pages, struct vm_area_struct **vmas)
175{ 135{
176 struct vm_area_struct *vma; 136 struct vm_area_struct *vma;
177 unsigned long vm_flags; 137 unsigned long vm_flags;
178 int i; 138 int i;
179 int write = !!(flags & GUP_FLAGS_WRITE);
180 int force = !!(flags & GUP_FLAGS_FORCE);
181 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
182 139
183 /* calculate required read or write permissions. 140 /* calculate required read or write permissions.
184 * - if 'force' is set, we only require the "MAY" flags. 141 * If FOLL_FORCE is set, we only require the "MAY" flags.
185 */ 142 */
186 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 143 vm_flags = (foll_flags & FOLL_WRITE) ?
187 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 144 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
145 vm_flags &= (foll_flags & FOLL_FORCE) ?
146 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
188 147
189 for (i = 0; i < nr_pages; i++) { 148 for (i = 0; i < nr_pages; i++) {
190 vma = find_vma(mm, start); 149 vma = find_vma(mm, start);
@@ -192,8 +151,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
192 goto finish_or_fault; 151 goto finish_or_fault;
193 152
194 /* protect what we can, including chardevs */ 153 /* protect what we can, including chardevs */
195 if (vma->vm_flags & (VM_IO | VM_PFNMAP) || 154 if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
196 (!ignore && !(vm_flags & vma->vm_flags))) 155 !(vm_flags & vma->vm_flags))
197 goto finish_or_fault; 156 goto finish_or_fault;
198 157
199 if (pages) { 158 if (pages) {
@@ -212,7 +171,6 @@ finish_or_fault:
212 return i ? : -EFAULT; 171 return i ? : -EFAULT;
213} 172}
214 173
215
216/* 174/*
217 * get a list of pages in an address range belonging to the specified process 175 * get a list of pages in an address range belonging to the specified process
218 * and indicate the VMA that covers each page 176 * and indicate the VMA that covers each page
@@ -227,9 +185,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
227 int flags = 0; 185 int flags = 0;
228 186
229 if (write) 187 if (write)
230 flags |= GUP_FLAGS_WRITE; 188 flags |= FOLL_WRITE;
231 if (force) 189 if (force)
232 flags |= GUP_FLAGS_FORCE; 190 flags |= FOLL_FORCE;
233 191
234 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 192 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
235} 193}
@@ -627,6 +585,22 @@ static void put_nommu_region(struct vm_region *region)
627} 585}
628 586
629/* 587/*
588 * update protection on a vma
589 */
590static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
591{
592#ifdef CONFIG_MPU
593 struct mm_struct *mm = vma->vm_mm;
594 long start = vma->vm_start & PAGE_MASK;
595 while (start < vma->vm_end) {
596 protect_page(mm, start, flags);
597 start += PAGE_SIZE;
598 }
599 update_protections(mm);
600#endif
601}
602
603/*
630 * add a VMA into a process's mm_struct in the appropriate place in the list 604 * add a VMA into a process's mm_struct in the appropriate place in the list
631 * and tree and add to the address space's page tree also if not an anonymous 605 * and tree and add to the address space's page tree also if not an anonymous
632 * page 606 * page
@@ -645,6 +619,8 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
645 mm->map_count++; 619 mm->map_count++;
646 vma->vm_mm = mm; 620 vma->vm_mm = mm;
647 621
622 protect_vma(vma, vma->vm_flags);
623
648 /* add the VMA to the mapping */ 624 /* add the VMA to the mapping */
649 if (vma->vm_file) { 625 if (vma->vm_file) {
650 mapping = vma->vm_file->f_mapping; 626 mapping = vma->vm_file->f_mapping;
@@ -707,6 +683,8 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
707 683
708 kenter("%p", vma); 684 kenter("%p", vma);
709 685
686 protect_vma(vma, 0);
687
710 mm->map_count--; 688 mm->map_count--;
711 if (mm->mmap_cache == vma) 689 if (mm->mmap_cache == vma)
712 mm->mmap_cache = NULL; 690 mm->mmap_cache = NULL;
@@ -848,7 +826,7 @@ static int validate_mmap_request(struct file *file,
848 int ret; 826 int ret;
849 827
850 /* do the simple checks first */ 828 /* do the simple checks first */
851 if (flags & MAP_FIXED || addr) { 829 if (flags & MAP_FIXED) {
852 printk(KERN_DEBUG 830 printk(KERN_DEBUG
853 "%d: Can't do fixed-address/overlay mmap of RAM\n", 831 "%d: Can't do fixed-address/overlay mmap of RAM\n",
854 current->pid); 832 current->pid);
@@ -1056,7 +1034,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
1056 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1034 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1057 if (ret == 0) { 1035 if (ret == 0) {
1058 vma->vm_region->vm_top = vma->vm_region->vm_end; 1036 vma->vm_region->vm_top = vma->vm_region->vm_end;
1059 return ret; 1037 return 0;
1060 } 1038 }
1061 if (ret != -ENOSYS) 1039 if (ret != -ENOSYS)
1062 return ret; 1040 return ret;
@@ -1073,7 +1051,8 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
1073 */ 1051 */
1074static int do_mmap_private(struct vm_area_struct *vma, 1052static int do_mmap_private(struct vm_area_struct *vma,
1075 struct vm_region *region, 1053 struct vm_region *region,
1076 unsigned long len) 1054 unsigned long len,
1055 unsigned long capabilities)
1077{ 1056{
1078 struct page *pages; 1057 struct page *pages;
1079 unsigned long total, point, n, rlen; 1058 unsigned long total, point, n, rlen;
@@ -1084,13 +1063,13 @@ static int do_mmap_private(struct vm_area_struct *vma,
1084 * shared mappings on devices or memory 1063 * shared mappings on devices or memory
1085 * - VM_MAYSHARE will be set if it may attempt to share 1064 * - VM_MAYSHARE will be set if it may attempt to share
1086 */ 1065 */
1087 if (vma->vm_file) { 1066 if (capabilities & BDI_CAP_MAP_DIRECT) {
1088 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1067 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1089 if (ret == 0) { 1068 if (ret == 0) {
1090 /* shouldn't return success if we're not sharing */ 1069 /* shouldn't return success if we're not sharing */
1091 BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); 1070 BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
1092 vma->vm_region->vm_top = vma->vm_region->vm_end; 1071 vma->vm_region->vm_top = vma->vm_region->vm_end;
1093 return ret; 1072 return 0;
1094 } 1073 }
1095 if (ret != -ENOSYS) 1074 if (ret != -ENOSYS)
1096 return ret; 1075 return ret;
@@ -1203,9 +1182,6 @@ unsigned long do_mmap_pgoff(struct file *file,
1203 1182
1204 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); 1183 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
1205 1184
1206 if (!(flags & MAP_FIXED))
1207 addr = round_hint_to_min(addr);
1208
1209 /* decide whether we should attempt the mapping, and if so what sort of 1185 /* decide whether we should attempt the mapping, and if so what sort of
1210 * mapping */ 1186 * mapping */
1211 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, 1187 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
@@ -1215,6 +1191,9 @@ unsigned long do_mmap_pgoff(struct file *file,
1215 return ret; 1191 return ret;
1216 } 1192 }
1217 1193
1194 /* we ignore the address hint */
1195 addr = 0;
1196
1218 /* we've determined that we can make the mapping, now translate what we 1197 /* we've determined that we can make the mapping, now translate what we
1219 * now know into VMA flags */ 1198 * now know into VMA flags */
1220 vm_flags = determine_vm_flags(file, prot, flags, capabilities); 1199 vm_flags = determine_vm_flags(file, prot, flags, capabilities);
@@ -1328,7 +1307,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1328 * - this is the hook for quasi-memory character devices to 1307 * - this is the hook for quasi-memory character devices to
1329 * tell us the location of a shared mapping 1308 * tell us the location of a shared mapping
1330 */ 1309 */
1331 if (file && file->f_op->get_unmapped_area) { 1310 if (capabilities & BDI_CAP_MAP_DIRECT) {
1332 addr = file->f_op->get_unmapped_area(file, addr, len, 1311 addr = file->f_op->get_unmapped_area(file, addr, len,
1333 pgoff, flags); 1312 pgoff, flags);
1334 if (IS_ERR((void *) addr)) { 1313 if (IS_ERR((void *) addr)) {
@@ -1352,15 +1331,17 @@ unsigned long do_mmap_pgoff(struct file *file,
1352 } 1331 }
1353 1332
1354 vma->vm_region = region; 1333 vma->vm_region = region;
1355 add_nommu_region(region);
1356 1334
1357 /* set up the mapping */ 1335 /* set up the mapping
1336 * - the region is filled in if BDI_CAP_MAP_DIRECT is still set
1337 */
1358 if (file && vma->vm_flags & VM_SHARED) 1338 if (file && vma->vm_flags & VM_SHARED)
1359 ret = do_mmap_shared_file(vma); 1339 ret = do_mmap_shared_file(vma);
1360 else 1340 else
1361 ret = do_mmap_private(vma, region, len); 1341 ret = do_mmap_private(vma, region, len, capabilities);
1362 if (ret < 0) 1342 if (ret < 0)
1363 goto error_put_region; 1343 goto error_just_free;
1344 add_nommu_region(region);
1364 1345
1365 /* okay... we have a mapping; now we have to register it */ 1346 /* okay... we have a mapping; now we have to register it */
1366 result = vma->vm_start; 1347 result = vma->vm_start;
@@ -1378,19 +1359,6 @@ share:
1378 kleave(" = %lx", result); 1359 kleave(" = %lx", result);
1379 return result; 1360 return result;
1380 1361
1381error_put_region:
1382 __put_nommu_region(region);
1383 if (vma) {
1384 if (vma->vm_file) {
1385 fput(vma->vm_file);
1386 if (vma->vm_flags & VM_EXECUTABLE)
1387 removed_exe_file_vma(vma->vm_mm);
1388 }
1389 kmem_cache_free(vm_area_cachep, vma);
1390 }
1391 kleave(" = %d [pr]", ret);
1392 return ret;
1393
1394error_just_free: 1362error_just_free:
1395 up_write(&nommu_region_sem); 1363 up_write(&nommu_region_sem);
1396error: 1364error:
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a7b2460e922b..ea2147dabba6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -34,6 +34,23 @@ int sysctl_oom_dump_tasks;
34static DEFINE_SPINLOCK(zone_scan_lock); 34static DEFINE_SPINLOCK(zone_scan_lock);
35/* #define DEBUG */ 35/* #define DEBUG */
36 36
37/*
38 * Do the allowed memory nodes of any of the target process's threads overlap ours?
39 */
40static int has_intersects_mems_allowed(struct task_struct *tsk)
41{
42 struct task_struct *t;
43
44 t = tsk;
45 do {
46 if (cpuset_mems_allowed_intersects(current, t))
47 return 1;
48 t = next_thread(t);
49 } while (t != tsk);
50
51 return 0;
52}
53
37/** 54/**
38 * badness - calculate a numeric value for how bad this task has been 55 * badness - calculate a numeric value for how bad this task has been
39 * @p: task struct of which task we should calculate 56 * @p: task struct of which task we should calculate
@@ -58,6 +75,13 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 unsigned long points, cpu_time, run_time; 75 unsigned long points, cpu_time, run_time;
59 struct mm_struct *mm; 76 struct mm_struct *mm;
60 struct task_struct *child; 77 struct task_struct *child;
78 int oom_adj = p->signal->oom_adj;
79 struct task_cputime task_time;
80 unsigned long utime;
81 unsigned long stime;
82
83 if (oom_adj == OOM_DISABLE)
84 return 0;
61 85
62 task_lock(p); 86 task_lock(p);
63 mm = p->mm; 87 mm = p->mm;
@@ -79,7 +103,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
79 /* 103 /*
80 * swapoff can easily use up all memory, so kill those first. 104 * swapoff can easily use up all memory, so kill those first.
81 */ 105 */
82 if (p->flags & PF_SWAPOFF) 106 if (p->flags & PF_OOM_ORIGIN)
83 return ULONG_MAX; 107 return ULONG_MAX;
84 108
85 /* 109 /*
@@ -102,8 +126,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
102 * of seconds. There is no particular reason for this other than 126 * of seconds. There is no particular reason for this other than
103 * that it turned out to work very well in practice. 127 * that it turned out to work very well in practice.
104 */ 128 */
105 cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime)) 129 thread_group_cputime(p, &task_time);
106 >> (SHIFT_HZ + 3); 130 utime = cputime_to_jiffies(task_time.utime);
131 stime = cputime_to_jiffies(task_time.stime);
132 cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
133
107 134
108 if (uptime >= p->start_time.tv_sec) 135 if (uptime >= p->start_time.tv_sec)
109 run_time = (uptime - p->start_time.tv_sec) >> 10; 136 run_time = (uptime - p->start_time.tv_sec) >> 10;
@@ -144,19 +171,19 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
144 * because p may have allocated or otherwise mapped memory on 171 * because p may have allocated or otherwise mapped memory on
145 * this node before. However it will be less likely. 172 * this node before. However it will be less likely.
146 */ 173 */
147 if (!cpuset_mems_allowed_intersects(current, p)) 174 if (!has_intersects_mems_allowed(p))
148 points /= 8; 175 points /= 8;
149 176
150 /* 177 /*
151 * Adjust the score by oomkilladj. 178 * Adjust the score by oom_adj.
152 */ 179 */
153 if (p->oomkilladj) { 180 if (oom_adj) {
154 if (p->oomkilladj > 0) { 181 if (oom_adj > 0) {
155 if (!points) 182 if (!points)
156 points = 1; 183 points = 1;
157 points <<= p->oomkilladj; 184 points <<= oom_adj;
158 } else 185 } else
159 points >>= -(p->oomkilladj); 186 points >>= -(oom_adj);
160 } 187 }
161 188
162#ifdef DEBUG 189#ifdef DEBUG
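
As the hunk above shows, oom_adj now scales the badness score with a plain bit shift: positive values multiply the score by 2^oom_adj, negative values divide it, and OOM_DISABLE short-circuits to zero before any accounting is done. A standalone sketch with made-up scores:

    #include <stdio.h>

    /* toy model of the oom_adj shift applied at the end of badness() */
    static unsigned long adjust_points(unsigned long points, int oom_adj)
    {
        if (oom_adj > 0) {
            if (!points)
                points = 1;    /* make sure a positive adj still raises the score */
            points <<= oom_adj;
        } else if (oom_adj < 0) {
            points >>= -oom_adj;
        }
        return points;
    }

    int main(void)
    {
        /* e.g. a task whose memory footprint scored 4000 "points" */
        printf("oom_adj +3 -> %lu\n", adjust_points(4000, 3));   /* 32000 */
        printf("oom_adj -2 -> %lu\n", adjust_points(4000, -2));  /*  1000 */
        return 0;
    }
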
@@ -200,13 +227,13 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
200static struct task_struct *select_bad_process(unsigned long *ppoints, 227static struct task_struct *select_bad_process(unsigned long *ppoints,
201 struct mem_cgroup *mem) 228 struct mem_cgroup *mem)
202{ 229{
203 struct task_struct *g, *p; 230 struct task_struct *p;
204 struct task_struct *chosen = NULL; 231 struct task_struct *chosen = NULL;
205 struct timespec uptime; 232 struct timespec uptime;
206 *ppoints = 0; 233 *ppoints = 0;
207 234
208 do_posix_clock_monotonic_gettime(&uptime); 235 do_posix_clock_monotonic_gettime(&uptime);
209 do_each_thread(g, p) { 236 for_each_process(p) {
210 unsigned long points; 237 unsigned long points;
211 238
212 /* 239 /*
@@ -251,7 +278,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
251 *ppoints = ULONG_MAX; 278 *ppoints = ULONG_MAX;
252 } 279 }
253 280
254 if (p->oomkilladj == OOM_DISABLE) 281 if (p->signal->oom_adj == OOM_DISABLE)
255 continue; 282 continue;
256 283
257 points = badness(p, uptime.tv_sec); 284 points = badness(p, uptime.tv_sec);
@@ -259,7 +286,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
259 chosen = p; 286 chosen = p;
260 *ppoints = points; 287 *ppoints = points;
261 } 288 }
262 } while_each_thread(g, p); 289 }
263 290
264 return chosen; 291 return chosen;
265} 292}
@@ -304,7 +331,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
304 } 331 }
305 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 332 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
306 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, 333 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
307 get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, 334 get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
308 p->comm); 335 p->comm);
309 task_unlock(p); 336 task_unlock(p);
310 } while_each_thread(g, p); 337 } while_each_thread(g, p);
@@ -346,11 +373,6 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
346 373
347static int oom_kill_task(struct task_struct *p) 374static int oom_kill_task(struct task_struct *p)
348{ 375{
349 struct mm_struct *mm;
350 struct task_struct *g, *q;
351
352 mm = p->mm;
353
354 /* WARNING: mm may not be dereferenced since we did not obtain its 376 /* WARNING: mm may not be dereferenced since we did not obtain its
355 * value from get_task_mm(p). This is OK since all we need to do is 377 * value from get_task_mm(p). This is OK since all we need to do is
356 * compare mm to q->mm below. 378 * compare mm to q->mm below.
@@ -359,30 +381,11 @@ static int oom_kill_task(struct task_struct *p)
359 * change to NULL at any time since we do not hold task_lock(p). 381 * change to NULL at any time since we do not hold task_lock(p).
360 * However, this is of no concern to us. 382 * However, this is of no concern to us.
361 */ 383 */
362 384 if (!p->mm || p->signal->oom_adj == OOM_DISABLE)
363 if (mm == NULL)
364 return 1; 385 return 1;
365 386
366 /*
367 * Don't kill the process if any threads are set to OOM_DISABLE
368 */
369 do_each_thread(g, q) {
370 if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
371 return 1;
372 } while_each_thread(g, q);
373
374 __oom_kill_task(p, 1); 387 __oom_kill_task(p, 1);
375 388
376 /*
377 * kill all processes that share the ->mm (i.e. all threads),
378 * but are in a different thread group. Don't let them have access
379 * to memory reserves though, otherwise we might deplete all memory.
380 */
381 do_each_thread(g, q) {
382 if (q->mm == mm && !same_thread_group(q, p))
383 force_sig(SIGKILL, q);
384 } while_each_thread(g, q);
385
386 return 0; 389 return 0;
387} 390}
388 391
@@ -394,8 +397,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
394 397
395 if (printk_ratelimit()) { 398 if (printk_ratelimit()) {
396 printk(KERN_WARNING "%s invoked oom-killer: " 399 printk(KERN_WARNING "%s invoked oom-killer: "
397 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", 400 "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
398 current->comm, gfp_mask, order, current->oomkilladj); 401 current->comm, gfp_mask, order,
402 current->signal->oom_adj);
399 task_lock(current); 403 task_lock(current);
400 cpuset_print_task_mems_allowed(current); 404 cpuset_print_task_mems_allowed(current);
401 task_unlock(current); 405 task_unlock(current);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 81627ebcd313..2c5d79236ead 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,15 +36,6 @@
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37 37
38/* 38/*
39 * The maximum number of pages to writeout in a single bdflush/kupdate
40 * operation. We do this so we don't hold I_SYNC against an inode for
41 * enormous amounts of time, which would block a userspace task which has
42 * been forced to throttle against that inode. Also, the code reevaluates
43 * the dirty each time it has written this many pages.
44 */
45#define MAX_WRITEBACK_PAGES 1024
46
47/*
48 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 39 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
49 * will look to see if it needs to force writeback or throttling. 40 * will look to see if it needs to force writeback or throttling.
50 */ 41 */
@@ -53,18 +44,21 @@ static long ratelimit_pages = 32;
53/* 44/*
54 * When balance_dirty_pages decides that the caller needs to perform some 45 * When balance_dirty_pages decides that the caller needs to perform some
55 * non-background writeback, this is how many pages it will attempt to write. 46 * non-background writeback, this is how many pages it will attempt to write.
56 * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably 47 * It should be somewhat larger than dirtied pages to ensure that reasonably
57 * large amounts of I/O are submitted. 48 * large amounts of I/O are submitted.
58 */ 49 */
59static inline long sync_writeback_pages(void) 50static inline long sync_writeback_pages(unsigned long dirtied)
60{ 51{
61 return ratelimit_pages + ratelimit_pages / 2; 52 if (dirtied < ratelimit_pages)
53 dirtied = ratelimit_pages;
54
55 return dirtied + dirtied / 2;
62} 56}
63 57
64/* The following parameters are exported via /proc/sys/vm */ 58/* The following parameters are exported via /proc/sys/vm */
65 59
66/* 60/*
67 * Start background writeback (via pdflush) at this percentage 61 * Start background writeback (via writeback threads) at this percentage
68 */ 62 */
69int dirty_background_ratio = 10; 63int dirty_background_ratio = 10;
70 64
@@ -117,8 +111,6 @@ EXPORT_SYMBOL(laptop_mode);
117/* End of sysctl-exported parameters */ 111/* End of sysctl-exported parameters */
118 112
119 113
120static void background_writeout(unsigned long _min_pages);
121
122/* 114/*
123 * Scale the writeback cache size proportional to the relative writeout speeds. 115 * Scale the writeback cache size proportional to the relative writeout speeds.
124 * 116 *
@@ -166,37 +158,37 @@ static void update_completion_period(void)
166} 158}
167 159
168int dirty_background_ratio_handler(struct ctl_table *table, int write, 160int dirty_background_ratio_handler(struct ctl_table *table, int write,
169 struct file *filp, void __user *buffer, size_t *lenp, 161 void __user *buffer, size_t *lenp,
170 loff_t *ppos) 162 loff_t *ppos)
171{ 163{
172 int ret; 164 int ret;
173 165
174 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 166 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
175 if (ret == 0 && write) 167 if (ret == 0 && write)
176 dirty_background_bytes = 0; 168 dirty_background_bytes = 0;
177 return ret; 169 return ret;
178} 170}
179 171
180int dirty_background_bytes_handler(struct ctl_table *table, int write, 172int dirty_background_bytes_handler(struct ctl_table *table, int write,
181 struct file *filp, void __user *buffer, size_t *lenp, 173 void __user *buffer, size_t *lenp,
182 loff_t *ppos) 174 loff_t *ppos)
183{ 175{
184 int ret; 176 int ret;
185 177
186 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 178 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
187 if (ret == 0 && write) 179 if (ret == 0 && write)
188 dirty_background_ratio = 0; 180 dirty_background_ratio = 0;
189 return ret; 181 return ret;
190} 182}
191 183
192int dirty_ratio_handler(struct ctl_table *table, int write, 184int dirty_ratio_handler(struct ctl_table *table, int write,
193 struct file *filp, void __user *buffer, size_t *lenp, 185 void __user *buffer, size_t *lenp,
194 loff_t *ppos) 186 loff_t *ppos)
195{ 187{
196 int old_ratio = vm_dirty_ratio; 188 int old_ratio = vm_dirty_ratio;
197 int ret; 189 int ret;
198 190
199 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 191 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
200 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 192 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
201 update_completion_period(); 193 update_completion_period();
202 vm_dirty_bytes = 0; 194 vm_dirty_bytes = 0;
@@ -206,13 +198,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
206 198
207 199
208int dirty_bytes_handler(struct ctl_table *table, int write, 200int dirty_bytes_handler(struct ctl_table *table, int write,
209 struct file *filp, void __user *buffer, size_t *lenp, 201 void __user *buffer, size_t *lenp,
210 loff_t *ppos) 202 loff_t *ppos)
211{ 203{
212 unsigned long old_bytes = vm_dirty_bytes; 204 unsigned long old_bytes = vm_dirty_bytes;
213 int ret; 205 int ret;
214 206
215 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 207 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
216 if (ret == 0 && write && vm_dirty_bytes != old_bytes) { 208 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
217 update_completion_period(); 209 update_completion_period();
218 vm_dirty_ratio = 0; 210 vm_dirty_ratio = 0;
@@ -320,15 +312,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
320/* 312/*
321 * 313 *
322 */ 314 */
323static DEFINE_SPINLOCK(bdi_lock);
324static unsigned int bdi_min_ratio; 315static unsigned int bdi_min_ratio;
325 316
326int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) 317int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
327{ 318{
328 int ret = 0; 319 int ret = 0;
329 unsigned long flags;
330 320
331 spin_lock_irqsave(&bdi_lock, flags); 321 spin_lock_bh(&bdi_lock);
332 if (min_ratio > bdi->max_ratio) { 322 if (min_ratio > bdi->max_ratio) {
333 ret = -EINVAL; 323 ret = -EINVAL;
334 } else { 324 } else {
@@ -340,27 +330,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
340 ret = -EINVAL; 330 ret = -EINVAL;
341 } 331 }
342 } 332 }
343 spin_unlock_irqrestore(&bdi_lock, flags); 333 spin_unlock_bh(&bdi_lock);
344 334
345 return ret; 335 return ret;
346} 336}
347 337
348int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) 338int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
349{ 339{
350 unsigned long flags;
351 int ret = 0; 340 int ret = 0;
352 341
353 if (max_ratio > 100) 342 if (max_ratio > 100)
354 return -EINVAL; 343 return -EINVAL;
355 344
356 spin_lock_irqsave(&bdi_lock, flags); 345 spin_lock_bh(&bdi_lock);
357 if (bdi->min_ratio > max_ratio) { 346 if (bdi->min_ratio > max_ratio) {
358 ret = -EINVAL; 347 ret = -EINVAL;
359 } else { 348 } else {
360 bdi->max_ratio = max_ratio; 349 bdi->max_ratio = max_ratio;
361 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; 350 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
362 } 351 }
363 spin_unlock_irqrestore(&bdi_lock, flags); 352 spin_unlock_bh(&bdi_lock);
364 353
365 return ret; 354 return ret;
366} 355}
@@ -394,7 +383,8 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
394 struct zone *z = 383 struct zone *z =
395 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; 384 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
396 385
397 x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); 386 x += zone_page_state(z, NR_FREE_PAGES) +
387 zone_reclaimable_pages(z);
398 } 388 }
399 /* 389 /*
400 * Make sure that the number of highmem pages is never larger 390 * Make sure that the number of highmem pages is never larger
@@ -418,7 +408,7 @@ unsigned long determine_dirtyable_memory(void)
418{ 408{
419 unsigned long x; 409 unsigned long x;
420 410
421 x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); 411 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
422 412
423 if (!vm_highmem_is_dirtyable) 413 if (!vm_highmem_is_dirtyable)
424 x -= highmem_dirtyable_memory(x); 414 x -= highmem_dirtyable_memory(x);
@@ -487,10 +477,11 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
487 * balance_dirty_pages() must be called by processes which are generating dirty 477 * balance_dirty_pages() must be called by processes which are generating dirty
488 * data. It looks at the number of dirty pages in the machine and will force 478 * data. It looks at the number of dirty pages in the machine and will force
489 * the caller to perform writeback if the system is over `vm_dirty_ratio'. 479 * the caller to perform writeback if the system is over `vm_dirty_ratio'.
490 * If we're over `background_thresh' then pdflush is woken to perform some 480 * If we're over `background_thresh' then the writeback threads are woken to
491 * writeout. 481 * perform some writeout.
492 */ 482 */
493static void balance_dirty_pages(struct address_space *mapping) 483static void balance_dirty_pages(struct address_space *mapping,
484 unsigned long write_chunk)
494{ 485{
495 long nr_reclaimable, bdi_nr_reclaimable; 486 long nr_reclaimable, bdi_nr_reclaimable;
496 long nr_writeback, bdi_nr_writeback; 487 long nr_writeback, bdi_nr_writeback;
@@ -498,7 +489,7 @@ static void balance_dirty_pages(struct address_space *mapping)
498 unsigned long dirty_thresh; 489 unsigned long dirty_thresh;
499 unsigned long bdi_thresh; 490 unsigned long bdi_thresh;
500 unsigned long pages_written = 0; 491 unsigned long pages_written = 0;
501 unsigned long write_chunk = sync_writeback_pages(); 492 unsigned long pause = 1;
502 493
503 struct backing_dev_info *bdi = mapping->backing_dev_info; 494 struct backing_dev_info *bdi = mapping->backing_dev_info;
504 495
@@ -546,7 +537,7 @@ static void balance_dirty_pages(struct address_space *mapping)
546 * up. 537 * up.
547 */ 538 */
548 if (bdi_nr_reclaimable > bdi_thresh) { 539 if (bdi_nr_reclaimable > bdi_thresh) {
549 writeback_inodes(&wbc); 540 writeback_inodes_wbc(&wbc);
550 pages_written += write_chunk - wbc.nr_to_write; 541 pages_written += write_chunk - wbc.nr_to_write;
551 get_dirty_limits(&background_thresh, &dirty_thresh, 542 get_dirty_limits(&background_thresh, &dirty_thresh,
552 &bdi_thresh, bdi); 543 &bdi_thresh, bdi);
@@ -575,7 +566,16 @@ static void balance_dirty_pages(struct address_space *mapping)
575 if (pages_written >= write_chunk) 566 if (pages_written >= write_chunk)
576 break; /* We've done our duty */ 567 break; /* We've done our duty */
577 568
578 congestion_wait(BLK_RW_ASYNC, HZ/10); 569 __set_current_state(TASK_INTERRUPTIBLE);
570 io_schedule_timeout(pause);
571
572 /*
573 * Increase the delay for each loop, up to our previous
574 * default of taking a 100ms nap.
575 */
576 pause <<= 1;
577 if (pause > HZ / 10)
578 pause = HZ / 10;
579 } 579 }
580 580
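
The loop above swaps the fixed congestion_wait() for an exponential backoff: the throttled task sleeps 1 jiffy, then 2, 4, 8, ... up to a cap of HZ/10, the old 100ms nap. A standalone sketch of the progression, assuming HZ = 1000:

    #include <stdio.h>

    #define HZ 1000    /* assumed tick rate, for illustration only */

    int main(void)
    {
        long pause = 1;    /* jiffies */
        int iter;

        for (iter = 1; iter <= 10; iter++) {
            printf("iteration %2d: sleep %ld jiffies\n", iter, pause);
            pause <<= 1;
            if (pause > HZ / 10)
                pause = HZ / 10;    /* cap at ~100ms, the old fixed nap */
        }
        return 0;    /* prints 1, 2, 4, 8, 16, 32, 64, 100, 100, 100 */
    }
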
581 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 581 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -583,7 +583,7 @@ static void balance_dirty_pages(struct address_space *mapping)
583 bdi->dirty_exceeded = 0; 583 bdi->dirty_exceeded = 0;
584 584
585 if (writeback_in_progress(bdi)) 585 if (writeback_in_progress(bdi))
586 return; /* pdflush is already working this queue */ 586 return;
587 587
588 /* 588 /*
589 * In laptop mode, we wait until hitting the higher threshold before 589 * In laptop mode, we wait until hitting the higher threshold before
@@ -594,10 +594,10 @@ static void balance_dirty_pages(struct address_space *mapping)
594 * background_thresh, to keep the amount of dirty memory low. 594 * background_thresh, to keep the amount of dirty memory low.
595 */ 595 */
596 if ((laptop_mode && pages_written) || 596 if ((laptop_mode && pages_written) ||
597 (!laptop_mode && (global_page_state(NR_FILE_DIRTY) 597 (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
598 + global_page_state(NR_UNSTABLE_NFS) 598 + global_page_state(NR_UNSTABLE_NFS))
599 > background_thresh))) 599 > background_thresh)))
600 pdflush_operation(background_writeout, 0); 600 bdi_start_writeback(bdi, NULL, 0);
601} 601}
602 602
603void set_page_dirty_balance(struct page *page, int page_mkwrite) 603void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -610,6 +610,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
610 } 610 }
611} 611}
612 612
613static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
614
613/** 615/**
614 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 616 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
615 * @mapping: address_space which was dirtied 617 * @mapping: address_space which was dirtied
@@ -627,7 +629,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
627void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 629void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
628 unsigned long nr_pages_dirtied) 630 unsigned long nr_pages_dirtied)
629{ 631{
630 static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
631 unsigned long ratelimit; 632 unsigned long ratelimit;
632 unsigned long *p; 633 unsigned long *p;
633 634
@@ -640,12 +641,13 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
640 * tasks in balance_dirty_pages(). Period. 641 * tasks in balance_dirty_pages(). Period.
641 */ 642 */
642 preempt_disable(); 643 preempt_disable();
643 p = &__get_cpu_var(ratelimits); 644 p = &__get_cpu_var(bdp_ratelimits);
644 *p += nr_pages_dirtied; 645 *p += nr_pages_dirtied;
645 if (unlikely(*p >= ratelimit)) { 646 if (unlikely(*p >= ratelimit)) {
647 ratelimit = sync_writeback_pages(*p);
646 *p = 0; 648 *p = 0;
647 preempt_enable(); 649 preempt_enable();
648 balance_dirty_pages(mapping); 650 balance_dirty_pages(mapping, ratelimit);
649 return; 651 return;
650 } 652 }
651 preempt_enable(); 653 preempt_enable();
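
The renamed per-CPU counter (bdp_ratelimits) batches throttling: each CPU accumulates the pages it has dirtied and only drops into balance_dirty_pages() once the total crosses the ratelimit, now passing that total along so the write chunk matches what was actually dirtied. A single-threaded model of the pattern (the per-CPU and preemption machinery is elided):

    #include <stdio.h>

    static unsigned long ratelimit = 32;    /* illustrative threshold */
    static unsigned long bdp_count;         /* stands in for the per-CPU counter */

    static void dirtied_pages(unsigned long nr)
    {
        bdp_count += nr;
        if (bdp_count >= ratelimit) {
            unsigned long chunk = bdp_count;

            bdp_count = 0;
            printf("throttle: balance against %lu dirtied pages\n", chunk);
        }
    }

    int main(void)
    {
        int i;

        for (i = 0; i < 10; i++)
            dirtied_pages(7);    /* crosses the threshold on every 5th call */
        return 0;
    }
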
@@ -681,153 +683,35 @@ void throttle_vm_writeout(gfp_t gfp_mask)
681 } 683 }
682} 684}
683 685
684/*
685 * writeback at least _min_pages, and keep writing until the amount of dirty
686 * memory is less than the background threshold, or until we're all clean.
687 */
688static void background_writeout(unsigned long _min_pages)
689{
690 long min_pages = _min_pages;
691 struct writeback_control wbc = {
692 .bdi = NULL,
693 .sync_mode = WB_SYNC_NONE,
694 .older_than_this = NULL,
695 .nr_to_write = 0,
696 .nonblocking = 1,
697 .range_cyclic = 1,
698 };
699
700 for ( ; ; ) {
701 unsigned long background_thresh;
702 unsigned long dirty_thresh;
703
704 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
705 if (global_page_state(NR_FILE_DIRTY) +
706 global_page_state(NR_UNSTABLE_NFS) < background_thresh
707 && min_pages <= 0)
708 break;
709 wbc.more_io = 0;
710 wbc.encountered_congestion = 0;
711 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
712 wbc.pages_skipped = 0;
713 writeback_inodes(&wbc);
714 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
715 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
716 /* Wrote less than expected */
717 if (wbc.encountered_congestion || wbc.more_io)
718 congestion_wait(BLK_RW_ASYNC, HZ/10);
719 else
720 break;
721 }
722 }
723}
724
725/*
726 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
727 * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
728 * -1 if all pdflush threads were busy.
729 */
730int wakeup_pdflush(long nr_pages)
731{
732 if (nr_pages == 0)
733 nr_pages = global_page_state(NR_FILE_DIRTY) +
734 global_page_state(NR_UNSTABLE_NFS);
735 return pdflush_operation(background_writeout, nr_pages);
736}
737
738static void wb_timer_fn(unsigned long unused);
739static void laptop_timer_fn(unsigned long unused); 686static void laptop_timer_fn(unsigned long unused);
740 687
741static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
742static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); 688static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
743 689
744/* 690/*
745 * Periodic writeback of "old" data.
746 *
747 * Define "old": the first time one of an inode's pages is dirtied, we mark the
748 * dirtying-time in the inode's address_space. So this periodic writeback code
749 * just walks the superblock inode list, writing back any inodes which are
750 * older than a specific point in time.
751 *
752 * Try to run once per dirty_writeback_interval. But if a writeback event
753 * takes longer than a dirty_writeback_interval interval, then leave a
754 * one-second gap.
755 *
756 * older_than_this takes precedence over nr_to_write. So we'll only write back
757 * all dirty pages if they are all attached to "old" mappings.
758 */
759static void wb_kupdate(unsigned long arg)
760{
761 unsigned long oldest_jif;
762 unsigned long start_jif;
763 unsigned long next_jif;
764 long nr_to_write;
765 struct writeback_control wbc = {
766 .bdi = NULL,
767 .sync_mode = WB_SYNC_NONE,
768 .older_than_this = &oldest_jif,
769 .nr_to_write = 0,
770 .nonblocking = 1,
771 .for_kupdate = 1,
772 .range_cyclic = 1,
773 };
774
775 sync_supers();
776
777 oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
778 start_jif = jiffies;
779 next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
780 nr_to_write = global_page_state(NR_FILE_DIRTY) +
781 global_page_state(NR_UNSTABLE_NFS) +
782 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
783 while (nr_to_write > 0) {
784 wbc.more_io = 0;
785 wbc.encountered_congestion = 0;
786 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
787 writeback_inodes(&wbc);
788 if (wbc.nr_to_write > 0) {
789 if (wbc.encountered_congestion || wbc.more_io)
790 congestion_wait(BLK_RW_ASYNC, HZ/10);
791 else
792 break; /* All the old data is written */
793 }
794 nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
795 }
796 if (time_before(next_jif, jiffies + HZ))
797 next_jif = jiffies + HZ;
798 if (dirty_writeback_interval)
799 mod_timer(&wb_timer, next_jif);
800}
801
802/*
803 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs 691 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
804 */ 692 */
805int dirty_writeback_centisecs_handler(ctl_table *table, int write, 693int dirty_writeback_centisecs_handler(ctl_table *table, int write,
806 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 694 void __user *buffer, size_t *length, loff_t *ppos)
807{ 695{
808 proc_dointvec(table, write, file, buffer, length, ppos); 696 proc_dointvec(table, write, buffer, length, ppos);
809 if (dirty_writeback_interval)
810 mod_timer(&wb_timer, jiffies +
811 msecs_to_jiffies(dirty_writeback_interval * 10));
812 else
813 del_timer(&wb_timer);
814 return 0; 697 return 0;
815} 698}
816 699
817static void wb_timer_fn(unsigned long unused) 700static void do_laptop_sync(struct work_struct *work)
818{ 701{
819 if (pdflush_operation(wb_kupdate, 0) < 0) 702 wakeup_flusher_threads(0);
820 mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ 703 kfree(work);
821}
822
823static void laptop_flush(unsigned long unused)
824{
825 sys_sync();
826} 704}
827 705
828static void laptop_timer_fn(unsigned long unused) 706static void laptop_timer_fn(unsigned long unused)
829{ 707{
830 pdflush_operation(laptop_flush, 0); 708 struct work_struct *work;
709
710 work = kmalloc(sizeof(*work), GFP_ATOMIC);
711 if (work) {
712 INIT_WORK(work, do_laptop_sync);
713 schedule_work(work);
714 }
831} 715}
832 716
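
With pdflush gone, laptop_timer_fn() can no longer hand the sync off via pdflush_operation(), and it runs in timer (atomic) context where it must not sleep, so it allocates a work item with GFP_ATOMIC, queues it, and lets the handler free it after doing the sleepable work. The same allocate-queue-free pattern in a minimal kernel-style sketch (function names here are hypothetical):

    #include <linux/workqueue.h>
    #include <linux/slab.h>

    /* the handler runs in process context and owns the work item */
    static void my_deferred_fn(struct work_struct *work)
    {
        /* ... sleepable work goes here ... */
        kfree(work);
    }

    /* called from atomic context, e.g. a timer callback */
    static void my_kick(void)
    {
        struct work_struct *work = kmalloc(sizeof(*work), GFP_ATOMIC);

        if (work) {
            INIT_WORK(work, my_deferred_fn);
            schedule_work(work);
        }
    }
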
833/* 717/*
@@ -910,8 +794,6 @@ void __init page_writeback_init(void)
910{ 794{
911 int shift; 795 int shift;
912 796
913 mod_timer(&wb_timer,
914 jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
915 writeback_set_ratelimit(); 797 writeback_set_ratelimit();
916 register_cpu_notifier(&ratelimit_nb); 798 register_cpu_notifier(&ratelimit_nb);
917 799
@@ -1145,12 +1027,10 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
1145 1027
1146 if (wbc->nr_to_write <= 0) 1028 if (wbc->nr_to_write <= 0)
1147 return 0; 1029 return 0;
1148 wbc->for_writepages = 1;
1149 if (mapping->a_ops->writepages) 1030 if (mapping->a_ops->writepages)
1150 ret = mapping->a_ops->writepages(mapping, wbc); 1031 ret = mapping->a_ops->writepages(mapping, wbc);
1151 else 1032 else
1152 ret = generic_writepages(mapping, wbc); 1033 ret = generic_writepages(mapping, wbc);
1153 wbc->for_writepages = 0;
1154 return ret; 1034 return ret;
1155} 1035}
1156 1036
@@ -1274,6 +1154,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
1274EXPORT_SYMBOL(redirty_page_for_writepage); 1154EXPORT_SYMBOL(redirty_page_for_writepage);
1275 1155
1276/* 1156/*
1157 * Dirty a page.
1158 *
1159 * For pages with a mapping this should be done under the page lock
1160 * for the benefit of asynchronous memory error handling, which prefers a
1161 * consistent dirty state. This rule can be broken in some special cases,
1162 * but it is better not to.
1163 *
1277 * If the mapping doesn't provide a set_page_dirty a_op, then 1164 * If the mapping doesn't provide a set_page_dirty a_op, then
1278 * just fall through and assume that it wants buffer_heads. 1165 * just fall through and assume that it wants buffer_heads.
1279 */ 1166 */
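
The new comment documents the rule the memory-failure code depends on: when a page has a mapping, dirty it while holding the page lock so the dirty bit and the mapping state are observed together. In caller terms the expected sequence is roughly the following (a sketch, not a complete API reference):

    #include <linux/mm.h>
    #include <linux/pagemap.h>

    /* sketch: dirty a pagecache page under its page lock */
    static void dirty_page_locked(struct page *page)
    {
        lock_page(page);
        set_page_dirty(page);    /* mapping and dirty state change together */
        unlock_page(page);
    }
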
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a0de15f46987..bf720550b44d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
48#include <linux/page_cgroup.h> 48#include <linux/page_cgroup.h>
49#include <linux/debugobjects.h> 49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h> 50#include <linux/kmemleak.h>
51#include <trace/events/kmem.h>
51 52
52#include <asm/tlbflush.h> 53#include <asm/tlbflush.h>
53#include <asm/div64.h> 54#include <asm/div64.h>
@@ -71,7 +72,6 @@ EXPORT_SYMBOL(node_states);
71 72
72unsigned long totalram_pages __read_mostly; 73unsigned long totalram_pages __read_mostly;
73unsigned long totalreserve_pages __read_mostly; 74unsigned long totalreserve_pages __read_mostly;
74unsigned long highest_memmap_pfn __read_mostly;
75int percpu_pagelist_fraction; 75int percpu_pagelist_fraction;
76gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 76gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
77 77
@@ -123,8 +123,8 @@ static char * const zone_names[MAX_NR_ZONES] = {
123 123
124int min_free_kbytes = 1024; 124int min_free_kbytes = 1024;
125 125
126unsigned long __meminitdata nr_kernel_pages; 126static unsigned long __meminitdata nr_kernel_pages;
127unsigned long __meminitdata nr_all_pages; 127static unsigned long __meminitdata nr_all_pages;
128static unsigned long __meminitdata dma_reserve; 128static unsigned long __meminitdata dma_reserve;
129 129
130#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 130#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
@@ -234,6 +234,12 @@ static void bad_page(struct page *page)
234 static unsigned long nr_shown; 234 static unsigned long nr_shown;
235 static unsigned long nr_unshown; 235 static unsigned long nr_unshown;
236 236
237 /* Don't complain about poisoned pages */
238 if (PageHWPoison(page)) {
239 __ClearPageBuddy(page);
240 return;
241 }
242
237 /* 243 /*
238 * Allow a burst of 60 reports, then keep quiet for that minute; 244 * Allow a burst of 60 reports, then keep quiet for that minute;
239 * or allow a steady drip of one report per second. 245 * or allow a steady drip of one report per second.
@@ -510,7 +516,7 @@ static inline int free_pages_check(struct page *page)
510} 516}
511 517
512/* 518/*
513 * Frees a list of pages. 519 * Frees a number of pages from the PCP lists
514 * Assumes all pages on list are in same zone, and of same order. 520 * Assumes all pages on list are in same zone, and of same order.
515 * count is the number of pages to free. 521 * count is the number of pages to free.
516 * 522 *
@@ -520,22 +526,42 @@ static inline int free_pages_check(struct page *page)
520 * And clear the zone's pages_scanned counter, to hold off the "all pages are 526 * And clear the zone's pages_scanned counter, to hold off the "all pages are
521 * pinned" detection logic. 527 * pinned" detection logic.
522 */ 528 */
523static void free_pages_bulk(struct zone *zone, int count, 529static void free_pcppages_bulk(struct zone *zone, int count,
524 struct list_head *list, int order) 530 struct per_cpu_pages *pcp)
525{ 531{
532 int migratetype = 0;
533 int batch_free = 0;
534
526 spin_lock(&zone->lock); 535 spin_lock(&zone->lock);
527 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 536 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
528 zone->pages_scanned = 0; 537 zone->pages_scanned = 0;
529 538
530 __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); 539 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
531 while (count--) { 540 while (count) {
532 struct page *page; 541 struct page *page;
542 struct list_head *list;
533 543
534 VM_BUG_ON(list_empty(list)); 544 /*
535 page = list_entry(list->prev, struct page, lru); 545 * Remove pages from lists in a round-robin fashion. A
536 /* have to delete it as __free_one_page list manipulates */ 546 * batch_free count is maintained that is incremented when an
537 list_del(&page->lru); 547 * empty list is encountered. This is so more pages are freed
538 __free_one_page(page, zone, order, page_private(page)); 548 * off fuller lists instead of spinning excessively around empty
549 * lists
550 */
551 do {
552 batch_free++;
553 if (++migratetype == MIGRATE_PCPTYPES)
554 migratetype = 0;
555 list = &pcp->lists[migratetype];
556 } while (list_empty(list));
557
558 do {
559 page = list_entry(list->prev, struct page, lru);
560 /* must delete as __free_one_page list manipulates */
561 list_del(&page->lru);
562 __free_one_page(page, zone, 0, migratetype);
563 trace_mm_page_pcpu_drain(page, 0, migratetype);
564 } while (--count && --batch_free && !list_empty(list));
539 } 565 }
540 spin_unlock(&zone->lock); 566 spin_unlock(&zone->lock);
541} 567}
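
free_pcppages_bulk() now drains the per-migratetype pcp lists round-robin; batch_free grows by one for every list visited (including empty ones that get skipped), so fuller lists give up proportionally more pages per pass instead of the scan spinning over empty lists. A standalone model of the selection order, with made-up list lengths that add up to more than the drain count:

    #include <stdio.h>

    #define MIGRATE_PCPTYPES 3    /* unmovable, reclaimable, movable */

    int main(void)
    {
        int lists[MIGRATE_PCPTYPES] = { 0, 2, 9 };    /* made-up pcp list lengths */
        int count = 8;                                /* pages to free this drain */
        int migratetype = 0, batch_free = 0;

        while (count) {
            /* pick the next non-empty list; batch_free grows per list visited */
            do {
                batch_free++;
                if (++migratetype == MIGRATE_PCPTYPES)
                    migratetype = 0;
            } while (!lists[migratetype]);

            /* free up to batch_free pages from it, like the inner do/while above */
            do {
                lists[migratetype]--;
                printf("free one page from list %d\n", migratetype);
            } while (--count && --batch_free && lists[migratetype]);
        }
        return 0;
    }
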
@@ -557,7 +583,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
557 unsigned long flags; 583 unsigned long flags;
558 int i; 584 int i;
559 int bad = 0; 585 int bad = 0;
560 int wasMlocked = TestClearPageMlocked(page); 586 int wasMlocked = __TestClearPageMlocked(page);
561 587
562 kmemcheck_free_shadow(page, order); 588 kmemcheck_free_shadow(page, order);
563 589
@@ -646,7 +672,7 @@ static inline void expand(struct zone *zone, struct page *page,
646/* 672/*
647 * This page is about to be returned from the page allocator 673 * This page is about to be returned from the page allocator
648 */ 674 */
649static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 675static inline int check_new_page(struct page *page)
650{ 676{
651 if (unlikely(page_mapcount(page) | 677 if (unlikely(page_mapcount(page) |
652 (page->mapping != NULL) | 678 (page->mapping != NULL) |
@@ -655,6 +681,18 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
655 bad_page(page); 681 bad_page(page);
656 return 1; 682 return 1;
657 } 683 }
684 return 0;
685}
686
687static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
688{
689 int i;
690
691 for (i = 0; i < (1 << order); i++) {
692 struct page *p = page + i;
693 if (unlikely(check_new_page(p)))
694 return 1;
695 }
658 696
659 set_page_private(page, 0); 697 set_page_private(page, 0);
660 set_page_refcounted(page); 698 set_page_refcounted(page);
@@ -783,6 +821,17 @@ static int move_freepages_block(struct zone *zone, struct page *page,
783 return move_freepages(zone, start_page, end_page, migratetype); 821 return move_freepages(zone, start_page, end_page, migratetype);
784} 822}
785 823
824static void change_pageblock_range(struct page *pageblock_page,
825 int start_order, int migratetype)
826{
827 int nr_pageblocks = 1 << (start_order - pageblock_order);
828
829 while (nr_pageblocks--) {
830 set_pageblock_migratetype(pageblock_page, migratetype);
831 pageblock_page += pageblock_nr_pages;
832 }
833}
834
786/* Remove an element from the buddy allocator from the fallback list */ 835/* Remove an element from the buddy allocator from the fallback list */
787static inline struct page * 836static inline struct page *
788__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 837__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
@@ -836,11 +885,16 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
836 list_del(&page->lru); 885 list_del(&page->lru);
837 rmv_page_order(page); 886 rmv_page_order(page);
838 887
839 if (current_order == pageblock_order) 888 /* Take ownership for orders >= pageblock_order */
840 set_pageblock_migratetype(page, 889 if (current_order >= pageblock_order)
890 change_pageblock_range(page, current_order,
841 start_migratetype); 891 start_migratetype);
842 892
843 expand(zone, page, order, current_order, area, migratetype); 893 expand(zone, page, order, current_order, area, migratetype);
894
895 trace_mm_page_alloc_extfrag(page, order, current_order,
896 start_migratetype, migratetype);
897
844 return page; 898 return page;
845 } 899 }
846 } 900 }
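
For a fallback allocation whose order is at least pageblock_order, the whole stolen range now changes migratetype pageblock by pageblock: change_pageblock_range() walks 1 << (start_order - pageblock_order) pageblocks. A tiny worked example, assuming pageblock_order = 9 (512-page, 2MB pageblocks with 4KB pages):

    #include <stdio.h>

    int main(void)
    {
        int pageblock_order = 9;    /* assumed configuration, not a fixed constant */
        int start_order = 10;       /* the high-order page just taken over */
        int nr_pageblocks = 1 << (start_order - pageblock_order);

        printf("an order-%d page spans %d pageblock(s)\n", start_order, nr_pageblocks);
        return 0;    /* here: 2 pageblocks both get the new migratetype */
    }
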
@@ -874,6 +928,7 @@ retry_reserve:
874 } 928 }
875 } 929 }
876 930
931 trace_mm_page_alloc_zone_locked(page, order, migratetype);
877 return page; 932 return page;
878} 933}
879 934
@@ -934,7 +989,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
934 to_drain = pcp->batch; 989 to_drain = pcp->batch;
935 else 990 else
936 to_drain = pcp->count; 991 to_drain = pcp->count;
937 free_pages_bulk(zone, to_drain, &pcp->list, 0); 992 free_pcppages_bulk(zone, to_drain, pcp);
938 pcp->count -= to_drain; 993 pcp->count -= to_drain;
939 local_irq_restore(flags); 994 local_irq_restore(flags);
940} 995}
@@ -960,7 +1015,7 @@ static void drain_pages(unsigned int cpu)
960 1015
961 pcp = &pset->pcp; 1016 pcp = &pset->pcp;
962 local_irq_save(flags); 1017 local_irq_save(flags);
963 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 1018 free_pcppages_bulk(zone, pcp->count, pcp);
964 pcp->count = 0; 1019 pcp->count = 0;
965 local_irq_restore(flags); 1020 local_irq_restore(flags);
966 } 1021 }
@@ -1026,7 +1081,8 @@ static void free_hot_cold_page(struct page *page, int cold)
1026 struct zone *zone = page_zone(page); 1081 struct zone *zone = page_zone(page);
1027 struct per_cpu_pages *pcp; 1082 struct per_cpu_pages *pcp;
1028 unsigned long flags; 1083 unsigned long flags;
1029 int wasMlocked = TestClearPageMlocked(page); 1084 int migratetype;
1085 int wasMlocked = __TestClearPageMlocked(page);
1030 1086
1031 kmemcheck_free_shadow(page, 0); 1087 kmemcheck_free_shadow(page, 0);
1032 1088
@@ -1043,35 +1099,49 @@ static void free_hot_cold_page(struct page *page, int cold)
1043 kernel_map_pages(page, 1, 0); 1099 kernel_map_pages(page, 1, 0);
1044 1100
1045 pcp = &zone_pcp(zone, get_cpu())->pcp; 1101 pcp = &zone_pcp(zone, get_cpu())->pcp;
1046 set_page_private(page, get_pageblock_migratetype(page)); 1102 migratetype = get_pageblock_migratetype(page);
1103 set_page_private(page, migratetype);
1047 local_irq_save(flags); 1104 local_irq_save(flags);
1048 if (unlikely(wasMlocked)) 1105 if (unlikely(wasMlocked))
1049 free_page_mlock(page); 1106 free_page_mlock(page);
1050 __count_vm_event(PGFREE); 1107 __count_vm_event(PGFREE);
1051 1108
1109 /*
1110 * We only track unmovable, reclaimable and movable on pcp lists.
1111 * Free ISOLATE pages back to the allocator because they are being
1112 * offlined but treat RESERVE as movable pages so we can get those
1113 * areas back if necessary. Otherwise, we may have to free
1114 * excessively into the page allocator
1115 */
1116 if (migratetype >= MIGRATE_PCPTYPES) {
1117 if (unlikely(migratetype == MIGRATE_ISOLATE)) {
1118 free_one_page(zone, page, 0, migratetype);
1119 goto out;
1120 }
1121 migratetype = MIGRATE_MOVABLE;
1122 }
1123
1052 if (cold) 1124 if (cold)
1053 list_add_tail(&page->lru, &pcp->list); 1125 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1054 else 1126 else
1055 list_add(&page->lru, &pcp->list); 1127 list_add(&page->lru, &pcp->lists[migratetype]);
1056 pcp->count++; 1128 pcp->count++;
1057 if (pcp->count >= pcp->high) { 1129 if (pcp->count >= pcp->high) {
1058 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1130 free_pcppages_bulk(zone, pcp->batch, pcp);
1059 pcp->count -= pcp->batch; 1131 pcp->count -= pcp->batch;
1060 } 1132 }
1133
1134out:
1061 local_irq_restore(flags); 1135 local_irq_restore(flags);
1062 put_cpu(); 1136 put_cpu();
1063} 1137}
1064 1138
1065void free_hot_page(struct page *page) 1139void free_hot_page(struct page *page)
1066{ 1140{
1141 trace_mm_page_free_direct(page, 0);
1067 free_hot_cold_page(page, 0); 1142 free_hot_cold_page(page, 0);
1068} 1143}
1069 1144
1070void free_cold_page(struct page *page)
1071{
1072 free_hot_cold_page(page, 1);
1073}
1074
1075/* 1145/*
1076 * split_page takes a non-compound higher-order page, and splits it into 1146 * split_page takes a non-compound higher-order page, and splits it into
1077 * n (1<<order) sub-pages: page[0..n] 1147 * n (1<<order) sub-pages: page[0..n]
@@ -1119,35 +1189,23 @@ again:
1119 cpu = get_cpu(); 1189 cpu = get_cpu();
1120 if (likely(order == 0)) { 1190 if (likely(order == 0)) {
1121 struct per_cpu_pages *pcp; 1191 struct per_cpu_pages *pcp;
1192 struct list_head *list;
1122 1193
1123 pcp = &zone_pcp(zone, cpu)->pcp; 1194 pcp = &zone_pcp(zone, cpu)->pcp;
1195 list = &pcp->lists[migratetype];
1124 local_irq_save(flags); 1196 local_irq_save(flags);
1125 if (!pcp->count) { 1197 if (list_empty(list)) {
1126 pcp->count = rmqueue_bulk(zone, 0, 1198 pcp->count += rmqueue_bulk(zone, 0,
1127 pcp->batch, &pcp->list, 1199 pcp->batch, list,
1128 migratetype, cold); 1200 migratetype, cold);
1129 if (unlikely(!pcp->count)) 1201 if (unlikely(list_empty(list)))
1130 goto failed; 1202 goto failed;
1131 } 1203 }
1132 1204
1133 /* Find a page of the appropriate migrate type */ 1205 if (cold)
1134 if (cold) { 1206 page = list_entry(list->prev, struct page, lru);
1135 list_for_each_entry_reverse(page, &pcp->list, lru) 1207 else
1136 if (page_private(page) == migratetype) 1208 page = list_entry(list->next, struct page, lru);
1137 break;
1138 } else {
1139 list_for_each_entry(page, &pcp->list, lru)
1140 if (page_private(page) == migratetype)
1141 break;
1142 }
1143
1144 /* Allocate more to the pcp list if necessary */
1145 if (unlikely(&page->lru == &pcp->list)) {
1146 pcp->count += rmqueue_bulk(zone, 0,
1147 pcp->batch, &pcp->list,
1148 migratetype, cold);
1149 page = list_entry(pcp->list.next, struct page, lru);
1150 }
1151 1209
1152 list_del(&page->lru); 1210 list_del(&page->lru);
1153 pcp->count--; 1211 pcp->count--;
@@ -1627,10 +1685,6 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1627 1685
1628 /* We now go into synchronous reclaim */ 1686 /* We now go into synchronous reclaim */
1629 cpuset_memory_pressure_bump(); 1687 cpuset_memory_pressure_bump();
1630
1631 /*
1632 * The task's cpuset might have expanded its set of allowable nodes
1633 */
1634 p->flags |= PF_MEMALLOC; 1688 p->flags |= PF_MEMALLOC;
1635 lockdep_set_current_reclaim_state(gfp_mask); 1689 lockdep_set_current_reclaim_state(gfp_mask);
1636 reclaim_state.reclaimed_slab = 0; 1690 reclaim_state.reclaimed_slab = 0;
@@ -1765,6 +1819,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1765 1819
1766 wake_all_kswapd(order, zonelist, high_zoneidx); 1820 wake_all_kswapd(order, zonelist, high_zoneidx);
1767 1821
1822restart:
1768 /* 1823 /*
1769 * OK, we're below the kswapd watermark and have kicked background 1824 * OK, we're below the kswapd watermark and have kicked background
1770 * reclaim. Now things get more complex, so set up alloc_flags according 1825 * reclaim. Now things get more complex, so set up alloc_flags according
@@ -1772,7 +1827,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1772 */ 1827 */
1773 alloc_flags = gfp_to_alloc_flags(gfp_mask); 1828 alloc_flags = gfp_to_alloc_flags(gfp_mask);
1774 1829
1775restart:
1776 /* This is the last chance, in general, before the goto nopage. */ 1830 /* This is the last chance, in general, before the goto nopage. */
1777 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 1831 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1778 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 1832 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -1907,6 +1961,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1907 zonelist, high_zoneidx, nodemask, 1961 zonelist, high_zoneidx, nodemask,
1908 preferred_zone, migratetype); 1962 preferred_zone, migratetype);
1909 1963
1964 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
1910 return page; 1965 return page;
1911} 1966}
1912EXPORT_SYMBOL(__alloc_pages_nodemask); 1967EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -1916,44 +1971,41 @@ EXPORT_SYMBOL(__alloc_pages_nodemask);
1916 */ 1971 */
1917unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 1972unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1918{ 1973{
1919 struct page * page; 1974 struct page *page;
1975
1976 /*
1977 * __get_free_pages() returns a 32-bit address, which cannot represent
1978 * a highmem page
1979 */
1980 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1981
1920 page = alloc_pages(gfp_mask, order); 1982 page = alloc_pages(gfp_mask, order);
1921 if (!page) 1983 if (!page)
1922 return 0; 1984 return 0;
1923 return (unsigned long) page_address(page); 1985 return (unsigned long) page_address(page);
1924} 1986}
1925
1926EXPORT_SYMBOL(__get_free_pages); 1987EXPORT_SYMBOL(__get_free_pages);
1927 1988
1928unsigned long get_zeroed_page(gfp_t gfp_mask) 1989unsigned long get_zeroed_page(gfp_t gfp_mask)
1929{ 1990{
1930 struct page * page; 1991 return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
1931
1932 /*
1933 * get_zeroed_page() returns a 32-bit address, which cannot represent
1934 * a highmem page
1935 */
1936 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1937
1938 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1939 if (page)
1940 return (unsigned long) page_address(page);
1941 return 0;
1942} 1992}
1943
1944EXPORT_SYMBOL(get_zeroed_page); 1993EXPORT_SYMBOL(get_zeroed_page);
1945 1994
1946void __pagevec_free(struct pagevec *pvec) 1995void __pagevec_free(struct pagevec *pvec)
1947{ 1996{
1948 int i = pagevec_count(pvec); 1997 int i = pagevec_count(pvec);
1949 1998
1950 while (--i >= 0) 1999 while (--i >= 0) {
2000 trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
1951 free_hot_cold_page(pvec->pages[i], pvec->cold); 2001 free_hot_cold_page(pvec->pages[i], pvec->cold);
2002 }
1952} 2003}
1953 2004
1954void __free_pages(struct page *page, unsigned int order) 2005void __free_pages(struct page *page, unsigned int order)
1955{ 2006{
1956 if (put_page_testzero(page)) { 2007 if (put_page_testzero(page)) {
2008 trace_mm_page_free_direct(page, order);
1957 if (order == 0) 2009 if (order == 0)
1958 free_hot_page(page); 2010 free_hot_page(page);
1959 else 2011 else
@@ -2128,23 +2180,28 @@ void show_free_areas(void)
2128 } 2180 }
2129 } 2181 }
2130 2182
2131 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" 2183 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
2132 " inactive_file:%lu" 2184 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
2133 " unevictable:%lu" 2185 " unevictable:%lu"
2134 " dirty:%lu writeback:%lu unstable:%lu\n" 2186 " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n"
2135 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 2187 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2188 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
2136 global_page_state(NR_ACTIVE_ANON), 2189 global_page_state(NR_ACTIVE_ANON),
2137 global_page_state(NR_ACTIVE_FILE),
2138 global_page_state(NR_INACTIVE_ANON), 2190 global_page_state(NR_INACTIVE_ANON),
2191 global_page_state(NR_ISOLATED_ANON),
2192 global_page_state(NR_ACTIVE_FILE),
2139 global_page_state(NR_INACTIVE_FILE), 2193 global_page_state(NR_INACTIVE_FILE),
2194 global_page_state(NR_ISOLATED_FILE),
2140 global_page_state(NR_UNEVICTABLE), 2195 global_page_state(NR_UNEVICTABLE),
2141 global_page_state(NR_FILE_DIRTY), 2196 global_page_state(NR_FILE_DIRTY),
2142 global_page_state(NR_WRITEBACK), 2197 global_page_state(NR_WRITEBACK),
2143 global_page_state(NR_UNSTABLE_NFS), 2198 global_page_state(NR_UNSTABLE_NFS),
2199 nr_blockdev_pages(),
2144 global_page_state(NR_FREE_PAGES), 2200 global_page_state(NR_FREE_PAGES),
2145 global_page_state(NR_SLAB_RECLAIMABLE) + 2201 global_page_state(NR_SLAB_RECLAIMABLE),
2146 global_page_state(NR_SLAB_UNRECLAIMABLE), 2202 global_page_state(NR_SLAB_UNRECLAIMABLE),
2147 global_page_state(NR_FILE_MAPPED), 2203 global_page_state(NR_FILE_MAPPED),
2204 global_page_state(NR_SHMEM),
2148 global_page_state(NR_PAGETABLE), 2205 global_page_state(NR_PAGETABLE),
2149 global_page_state(NR_BOUNCE)); 2206 global_page_state(NR_BOUNCE));
2150 2207
@@ -2162,7 +2219,21 @@ void show_free_areas(void)
2162 " active_file:%lukB" 2219 " active_file:%lukB"
2163 " inactive_file:%lukB" 2220 " inactive_file:%lukB"
2164 " unevictable:%lukB" 2221 " unevictable:%lukB"
2222 " isolated(anon):%lukB"
2223 " isolated(file):%lukB"
2165 " present:%lukB" 2224 " present:%lukB"
2225 " mlocked:%lukB"
2226 " dirty:%lukB"
2227 " writeback:%lukB"
2228 " mapped:%lukB"
2229 " shmem:%lukB"
2230 " slab_reclaimable:%lukB"
2231 " slab_unreclaimable:%lukB"
2232 " kernel_stack:%lukB"
2233 " pagetables:%lukB"
2234 " unstable:%lukB"
2235 " bounce:%lukB"
2236 " writeback_tmp:%lukB"
2166 " pages_scanned:%lu" 2237 " pages_scanned:%lu"
2167 " all_unreclaimable? %s" 2238 " all_unreclaimable? %s"
2168 "\n", 2239 "\n",
@@ -2176,7 +2247,22 @@ void show_free_areas(void)
2176 K(zone_page_state(zone, NR_ACTIVE_FILE)), 2247 K(zone_page_state(zone, NR_ACTIVE_FILE)),
2177 K(zone_page_state(zone, NR_INACTIVE_FILE)), 2248 K(zone_page_state(zone, NR_INACTIVE_FILE)),
2178 K(zone_page_state(zone, NR_UNEVICTABLE)), 2249 K(zone_page_state(zone, NR_UNEVICTABLE)),
2250 K(zone_page_state(zone, NR_ISOLATED_ANON)),
2251 K(zone_page_state(zone, NR_ISOLATED_FILE)),
2179 K(zone->present_pages), 2252 K(zone->present_pages),
2253 K(zone_page_state(zone, NR_MLOCK)),
2254 K(zone_page_state(zone, NR_FILE_DIRTY)),
2255 K(zone_page_state(zone, NR_WRITEBACK)),
2256 K(zone_page_state(zone, NR_FILE_MAPPED)),
2257 K(zone_page_state(zone, NR_SHMEM)),
2258 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
2259 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
2260 zone_page_state(zone, NR_KERNEL_STACK) *
2261 THREAD_SIZE / 1024,
2262 K(zone_page_state(zone, NR_PAGETABLE)),
2263 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
2264 K(zone_page_state(zone, NR_BOUNCE)),
2265 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2180 zone->pages_scanned, 2266 zone->pages_scanned,
2181 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 2267 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
2182 ); 2268 );
@@ -2305,7 +2391,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order);
2305 * sysctl handler for numa_zonelist_order 2391 * sysctl handler for numa_zonelist_order
2306 */ 2392 */
2307int numa_zonelist_order_handler(ctl_table *table, int write, 2393int numa_zonelist_order_handler(ctl_table *table, int write,
2308 struct file *file, void __user *buffer, size_t *length, 2394 void __user *buffer, size_t *length,
2309 loff_t *ppos) 2395 loff_t *ppos)
2310{ 2396{
2311 char saved_string[NUMA_ZONELIST_ORDER_LEN]; 2397 char saved_string[NUMA_ZONELIST_ORDER_LEN];
@@ -2314,7 +2400,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2314 if (write) 2400 if (write)
2315 strncpy(saved_string, (char*)table->data, 2401 strncpy(saved_string, (char*)table->data,
2316 NUMA_ZONELIST_ORDER_LEN); 2402 NUMA_ZONELIST_ORDER_LEN);
2317 ret = proc_dostring(table, write, file, buffer, length, ppos); 2403 ret = proc_dostring(table, write, buffer, length, ppos);
2318 if (ret) 2404 if (ret)
2319 return ret; 2405 return ret;
2320 if (write) { 2406 if (write) {
@@ -2783,7 +2869,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2783{ 2869{
2784 unsigned long start_pfn, pfn, end_pfn; 2870 unsigned long start_pfn, pfn, end_pfn;
2785 struct page *page; 2871 struct page *page;
2786 unsigned long reserve, block_migratetype; 2872 unsigned long block_migratetype;
2873 int reserve;
2787 2874
2788 /* Get the start pfn, end pfn and the number of blocks to reserve */ 2875 /* Get the start pfn, end pfn and the number of blocks to reserve */
2789 start_pfn = zone->zone_start_pfn; 2876 start_pfn = zone->zone_start_pfn;
@@ -2791,6 +2878,15 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2791 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 2878 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
2792 pageblock_order; 2879 pageblock_order;
2793 2880
2881 /*
2882 * Reserve blocks are generally in place to help high-order atomic
2883 * allocations that are short-lived. A min_free_kbytes value that
2884 * would result in more than 2 reserve blocks for atomic allocations
2885 * is assumed to be in place to help anti-fragmentation for the
2886 * future allocation of hugepages at runtime.
2887 */
2888 reserve = min(2, reserve);
2889
2794 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 2890 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
2795 if (!pfn_valid(pfn)) 2891 if (!pfn_valid(pfn))
2796 continue; 2892 continue;
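
The new comment and the min(2, reserve) clamp above cap the number of MIGRATE_RESERVE pageblocks: beyond two, a large min_free_kbytes is taken to be an anti-fragmentation setting rather than headroom for short-lived high-order atomic allocations. A standalone userspace sketch of just that sizing rule (not kernel code; the watermark and pageblock size below are made-up values):

#include <stdio.h>

/* Hypothetical zone parameters, chosen only for illustration. */
#define PAGEBLOCK_NR_PAGES 1024UL          /* pages per pageblock */

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
        return ((x + to - 1) / to) * to;   /* same idea as the kernel's roundup() */
}

int main(void)
{
        unsigned long min_wmark_pages = 5000;  /* made-up zone min watermark */
        int reserve;

        /* One reserve block per pageblock's worth of the min watermark... */
        reserve = (int)(roundup_ul(min_wmark_pages, PAGEBLOCK_NR_PAGES) /
                        PAGEBLOCK_NR_PAGES);

        /* ...capped at two: larger watermarks are presumed to be tuned for
         * anti-fragmentation (future hugepage allocations), not for
         * short-lived high-order atomic allocations. */
        if (reserve > 2)
                reserve = 2;

        printf("reserve pageblocks = %d\n", reserve);   /* prints 2 here */
        return 0;
}
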
@@ -2961,6 +3057,7 @@ static int zone_batchsize(struct zone *zone)
2961static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 3057static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2962{ 3058{
2963 struct per_cpu_pages *pcp; 3059 struct per_cpu_pages *pcp;
3060 int migratetype;
2964 3061
2965 memset(p, 0, sizeof(*p)); 3062 memset(p, 0, sizeof(*p));
2966 3063
@@ -2968,7 +3065,8 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2968 pcp->count = 0; 3065 pcp->count = 0;
2969 pcp->high = 6 * batch; 3066 pcp->high = 6 * batch;
2970 pcp->batch = max(1UL, 1 * batch); 3067 pcp->batch = max(1UL, 1 * batch);
2971 INIT_LIST_HEAD(&pcp->list); 3068 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
3069 INIT_LIST_HEAD(&pcp->lists[migratetype]);
2972} 3070}
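
setup_pageset() now keeps one free list per migrate type on each per-cpu pageset instead of the single pcp->list it replaces. A minimal sketch of that structure, assuming a trimmed-down list_head and only the MIGRATE_PCPTYPES lists the per-cpu cache actually carries (everything else about struct per_cpu_pages is simplified):

#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,
       MIGRATE_PCPTYPES };                     /* per-cpu page list types */

struct list_head { struct list_head *next, *prev; };

static void init_list_head(struct list_head *h)
{
        h->next = h->prev = h;                 /* empty circular list */
}

/* Simplified per-cpu pages structure: one list per migrate type. */
struct per_cpu_pages {
        int count;
        int high;
        int batch;
        struct list_head lists[MIGRATE_PCPTYPES];
};

int main(void)
{
        struct per_cpu_pages pcp = { .count = 0, .high = 6 * 16, .batch = 16 };
        int mt;

        for (mt = 0; mt < MIGRATE_PCPTYPES; mt++)
                init_list_head(&pcp.lists[mt]);

        printf("initialised %d per-migratetype lists, batch=%d high=%d\n",
               MIGRATE_PCPTYPES, pcp.batch, pcp.high);
        return 0;
}
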
2973 3071
2974/* 3072/*
@@ -3146,6 +3244,32 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3146 return 0; 3244 return 0;
3147} 3245}
3148 3246
3247static int __zone_pcp_update(void *data)
3248{
3249 struct zone *zone = data;
3250 int cpu;
3251 unsigned long batch = zone_batchsize(zone), flags;
3252
3253 for (cpu = 0; cpu < NR_CPUS; cpu++) {
3254 struct per_cpu_pageset *pset;
3255 struct per_cpu_pages *pcp;
3256
3257 pset = zone_pcp(zone, cpu);
3258 pcp = &pset->pcp;
3259
3260 local_irq_save(flags);
3261 free_pcppages_bulk(zone, pcp->count, pcp);
3262 setup_pageset(pset, batch);
3263 local_irq_restore(flags);
3264 }
3265 return 0;
3266}
3267
3268void zone_pcp_update(struct zone *zone)
3269{
3270 stop_machine(__zone_pcp_update, zone, NULL);
3271}
3272
3149static __meminit void zone_pcp_init(struct zone *zone) 3273static __meminit void zone_pcp_init(struct zone *zone)
3150{ 3274{
3151 int cpu; 3275 int cpu;
@@ -3720,7 +3844,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3720 zone_pcp_init(zone); 3844 zone_pcp_init(zone);
3721 for_each_lru(l) { 3845 for_each_lru(l) {
3722 INIT_LIST_HEAD(&zone->lru[l].list); 3846 INIT_LIST_HEAD(&zone->lru[l].list);
3723 zone->lru[l].nr_saved_scan = 0; 3847 zone->reclaim_stat.nr_saved_scan[l] = 0;
3724 } 3848 }
3725 zone->reclaim_stat.recent_rotated[0] = 0; 3849 zone->reclaim_stat.recent_rotated[0] = 0;
3726 zone->reclaim_stat.recent_rotated[1] = 0; 3850 zone->reclaim_stat.recent_rotated[1] = 0;
@@ -4509,7 +4633,7 @@ void setup_per_zone_wmarks(void)
4509 calculate_totalreserve_pages(); 4633 calculate_totalreserve_pages();
4510} 4634}
4511 4635
4512/** 4636/*
4513 * The inactive anon list should be small enough that the VM never has to 4637 * The inactive anon list should be small enough that the VM never has to
4514 * do too much work, but large enough that each inactive page has a chance 4638 * do too much work, but large enough that each inactive page has a chance
4515 * to be referenced again before it is swapped out. 4639 * to be referenced again before it is swapped out.
@@ -4600,9 +4724,9 @@ module_init(init_per_zone_wmark_min)
4600 * changes. 4724 * changes.
4601 */ 4725 */
4602int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 4726int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4603 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4727 void __user *buffer, size_t *length, loff_t *ppos)
4604{ 4728{
4605 proc_dointvec(table, write, file, buffer, length, ppos); 4729 proc_dointvec(table, write, buffer, length, ppos);
4606 if (write) 4730 if (write)
4607 setup_per_zone_wmarks(); 4731 setup_per_zone_wmarks();
4608 return 0; 4732 return 0;
@@ -4610,12 +4734,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4610 4734
4611#ifdef CONFIG_NUMA 4735#ifdef CONFIG_NUMA
4612int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 4736int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
4613 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4737 void __user *buffer, size_t *length, loff_t *ppos)
4614{ 4738{
4615 struct zone *zone; 4739 struct zone *zone;
4616 int rc; 4740 int rc;
4617 4741
4618 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4742 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
4619 if (rc) 4743 if (rc)
4620 return rc; 4744 return rc;
4621 4745
@@ -4626,12 +4750,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
4626} 4750}
4627 4751
4628int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 4752int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4629 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4753 void __user *buffer, size_t *length, loff_t *ppos)
4630{ 4754{
4631 struct zone *zone; 4755 struct zone *zone;
4632 int rc; 4756 int rc;
4633 4757
4634 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4758 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
4635 if (rc) 4759 if (rc)
4636 return rc; 4760 return rc;
4637 4761
@@ -4652,9 +4776,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4652 * if in function of the boot time zone sizes. 4776 * if in function of the boot time zone sizes.
4653 */ 4777 */
4654int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 4778int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
4655 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4779 void __user *buffer, size_t *length, loff_t *ppos)
4656{ 4780{
4657 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4781 proc_dointvec_minmax(table, write, buffer, length, ppos);
4658 setup_per_zone_lowmem_reserve(); 4782 setup_per_zone_lowmem_reserve();
4659 return 0; 4783 return 0;
4660} 4784}
@@ -4666,13 +4790,13 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
4666 */ 4790 */
4667 4791
4668int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 4792int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4669 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 4793 void __user *buffer, size_t *length, loff_t *ppos)
4670{ 4794{
4671 struct zone *zone; 4795 struct zone *zone;
4672 unsigned int cpu; 4796 unsigned int cpu;
4673 int ret; 4797 int ret;
4674 4798
4675 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4799 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
4676 if (!write || (ret == -EINVAL)) 4800 if (!write || (ret == -EINVAL))
4677 return ret; 4801 return ret;
4678 for_each_populated_zone(zone) { 4802 for_each_populated_zone(zone) {
@@ -4732,7 +4856,14 @@ void *__init alloc_large_system_hash(const char *tablename,
4732 numentries <<= (PAGE_SHIFT - scale); 4856 numentries <<= (PAGE_SHIFT - scale);
4733 4857
4734 /* Make sure we've got at least a 0-order allocation.. */ 4858 /* Make sure we've got at least a 0-order allocation.. */
4735 if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 4859 if (unlikely(flags & HASH_SMALL)) {
4860 /* Makes no sense without HASH_EARLY */
4861 WARN_ON(!(flags & HASH_EARLY));
4862 if (!(numentries >> *_hash_shift)) {
4863 numentries = 1UL << *_hash_shift;
4864 BUG_ON(!numentries);
4865 }
4866 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
4736 numentries = PAGE_SIZE / bucketsize; 4867 numentries = PAGE_SIZE / bucketsize;
4737 } 4868 }
4738 numentries = roundup_pow_of_two(numentries); 4869 numentries = roundup_pow_of_two(numentries);
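
The HASH_SMALL branch above stops tiny boot-time hashes from being inflated to a full page of buckets and instead uses the requested shift as a floor. A userspace sketch of the sizing arithmetic; the flag values, bucket size and inputs are invented for illustration, only the rounding decisions mirror the hunk:

#include <stdio.h>

#define PAGE_SIZE   4096UL
#define HASH_EARLY  0x1u        /* flag values here are illustrative only */
#define HASH_SMALL  0x2u

static unsigned long roundup_pow_of_two(unsigned long n)
{
        unsigned long r = 1;
        while (r < n)
                r <<= 1;
        return r;
}

static unsigned long hash_entries(unsigned long numentries,
                                  unsigned long bucketsize,
                                  unsigned int flags,
                                  unsigned int hash_shift)
{
        if (flags & HASH_SMALL) {
                /* HASH_SMALL only makes sense for early (boot-time) hashes;
                 * honour the requested shift as a floor instead of a page. */
                if (!(numentries >> hash_shift))
                        numentries = 1UL << hash_shift;
        } else if (numentries * bucketsize < PAGE_SIZE) {
                /* otherwise never allocate less than one page of buckets */
                numentries = PAGE_SIZE / bucketsize;
        }
        return roundup_pow_of_two(numentries);
}

int main(void)
{
        /* 10 entries of 64-byte buckets: HASH_SMALL keeps 1 << 4 = 16
         * entries, the default path rounds up to a page of buckets (64). */
        printf("small:   %lu\n", hash_entries(10, 64, HASH_SMALL | HASH_EARLY, 4));
        printf("default: %lu\n", hash_entries(10, 64, 0, 4));
        return 0;
}
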
@@ -4874,13 +5005,16 @@ int set_migratetype_isolate(struct page *page)
4874 struct zone *zone; 5005 struct zone *zone;
4875 unsigned long flags; 5006 unsigned long flags;
4876 int ret = -EBUSY; 5007 int ret = -EBUSY;
5008 int zone_idx;
4877 5009
4878 zone = page_zone(page); 5010 zone = page_zone(page);
5011 zone_idx = zone_idx(zone);
4879 spin_lock_irqsave(&zone->lock, flags); 5012 spin_lock_irqsave(&zone->lock, flags);
4880 /* 5013 /*
4881 * In future, more migrate types will be able to be isolation target. 5014 * In future, more migrate types will be able to be isolation target.
4882 */ 5015 */
4883 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) 5016 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
5017 zone_idx != ZONE_MOVABLE)
4884 goto out; 5018 goto out;
4885 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5019 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
4886 move_freepages_block(zone, page, MIGRATE_ISOLATE); 5020 move_freepages_block(zone, page, MIGRATE_ISOLATE);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index f22b4ebbd8dc..3d535d594826 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -116,10 +116,16 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
116 nid = page_to_nid(pfn_to_page(pfn)); 116 nid = page_to_nid(pfn_to_page(pfn));
117 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 117 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
118 VM_BUG_ON(!slab_is_available()); 118 VM_BUG_ON(!slab_is_available());
119 base = kmalloc_node(table_size, 119 if (node_state(nid, N_HIGH_MEMORY)) {
120 base = kmalloc_node(table_size,
120 GFP_KERNEL | __GFP_NOWARN, nid); 121 GFP_KERNEL | __GFP_NOWARN, nid);
121 if (!base) 122 if (!base)
122 base = vmalloc_node(table_size, nid); 123 base = vmalloc_node(table_size, nid);
124 } else {
125 base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
126 if (!base)
127 base = vmalloc(table_size);
128 }
123 } else { 129 } else {
124 /* 130 /*
125 * We don't have to allocate page_cgroup again, but 131 * We don't have to allocate page_cgroup again, but
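
The page_cgroup change above only requests node-local memory when the node is in N_HIGH_MEMORY, i.e. actually has pages to hand out, and keeps the kmalloc-then-vmalloc fallback on both branches. A hedged sketch of that decision in plain C; contig_alloc() and virt_alloc() are malloc-based stand-ins, not the kernel allocators:

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the allocators in the hunk above; both are simulated with
 * malloc, with the "contiguous" one failing for large sizes. */
static void *contig_alloc(size_t size, int nid)   /* ~ kmalloc_node()/kmalloc() */
{
        (void)nid;                                /* node hint ignored in this model */
        return size <= 128 * 1024 ? malloc(size) : NULL;
}

static void *virt_alloc(size_t size)              /* ~ vmalloc_node()/vmalloc() */
{
        return malloc(size);
}

#define ANY_NODE (-1)

static void *alloc_page_cgroup_table(size_t size, int nid, int node_has_memory)
{
        /* Only ask for node-local memory when the node can satisfy it;
         * memoryless nodes fall back to an unconstrained allocation. */
        int target = node_has_memory ? nid : ANY_NODE;
        void *base = contig_alloc(size, target);

        if (!base)
                base = virt_alloc(size);          /* virtually contiguous fallback */
        return base;
}

int main(void)
{
        void *t = alloc_page_cgroup_table(256 * 1024, 2, 0);

        printf("table %sallocated\n", t ? "" : "not ");
        free(t);
        return 0;
}
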
diff --git a/mm/pdflush.c b/mm/pdflush.c
deleted file mode 100644
index 235ac440c44e..000000000000
--- a/mm/pdflush.c
+++ /dev/null
@@ -1,269 +0,0 @@
1/*
2 * mm/pdflush.c - worker threads for writing back filesystem data
3 *
4 * Copyright (C) 2002, Linus Torvalds.
5 *
6 * 09Apr2002 Andrew Morton
7 * Initial version
8 * 29Feb2004 kaos@sgi.com
9 * Move worker thread creation to kthread to avoid chewing
10 * up stack space with nested calls to kernel_thread.
11 */
12
13#include <linux/sched.h>
14#include <linux/list.h>
15#include <linux/signal.h>
16#include <linux/spinlock.h>
17#include <linux/gfp.h>
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/fs.h> /* Needed by writeback.h */
21#include <linux/writeback.h> /* Prototypes pdflush_operation() */
22#include <linux/kthread.h>
23#include <linux/cpuset.h>
24#include <linux/freezer.h>
25
26
27/*
28 * Minimum and maximum number of pdflush instances
29 */
30#define MIN_PDFLUSH_THREADS 2
31#define MAX_PDFLUSH_THREADS 8
32
33static void start_one_pdflush_thread(void);
34
35
36/*
37 * The pdflush threads are worker threads for writing back dirty data.
38 * Ideally, we'd like one thread per active disk spindle. But the disk
39 * topology is very hard to divine at this level. Instead, we take
40 * care in various places to prevent more than one pdflush thread from
41 * performing writeback against a single filesystem. pdflush threads
42 * have the PF_FLUSHER flag set in current->flags to aid in this.
43 */
44
45/*
46 * All the pdflush threads. Protected by pdflush_lock
47 */
48static LIST_HEAD(pdflush_list);
49static DEFINE_SPINLOCK(pdflush_lock);
50
51/*
52 * The count of currently-running pdflush threads. Protected
53 * by pdflush_lock.
54 *
55 * Readable by sysctl, but not writable. Published to userspace at
56 * /proc/sys/vm/nr_pdflush_threads.
57 */
58int nr_pdflush_threads = 0;
59
60/*
61 * The time at which the pdflush thread pool last went empty
62 */
63static unsigned long last_empty_jifs;
64
65/*
66 * The pdflush thread.
67 *
68 * Thread pool management algorithm:
69 *
70 * - The minimum and maximum number of pdflush instances are bound
71 * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
72 *
73 * - If there have been no idle pdflush instances for 1 second, create
74 * a new one.
75 *
76 * - If the least-recently-went-to-sleep pdflush thread has been asleep
77 * for more than one second, terminate a thread.
78 */
79
80/*
81 * A structure for passing work to a pdflush thread. Also for passing
82 * state information between pdflush threads. Protected by pdflush_lock.
83 */
84struct pdflush_work {
85 struct task_struct *who; /* The thread */
86 void (*fn)(unsigned long); /* A callback function */
87 unsigned long arg0; /* An argument to the callback */
88 struct list_head list; /* On pdflush_list, when idle */
89 unsigned long when_i_went_to_sleep;
90};
91
92static int __pdflush(struct pdflush_work *my_work)
93{
94 current->flags |= PF_FLUSHER | PF_SWAPWRITE;
95 set_freezable();
96 my_work->fn = NULL;
97 my_work->who = current;
98 INIT_LIST_HEAD(&my_work->list);
99
100 spin_lock_irq(&pdflush_lock);
101 for ( ; ; ) {
102 struct pdflush_work *pdf;
103
104 set_current_state(TASK_INTERRUPTIBLE);
105 list_move(&my_work->list, &pdflush_list);
106 my_work->when_i_went_to_sleep = jiffies;
107 spin_unlock_irq(&pdflush_lock);
108 schedule();
109 try_to_freeze();
110 spin_lock_irq(&pdflush_lock);
111 if (!list_empty(&my_work->list)) {
112 /*
113 * Someone woke us up, but without removing our control
114 * structure from the global list. swsusp will do this
115 * in try_to_freeze()->refrigerator(). Handle it.
116 */
117 my_work->fn = NULL;
118 continue;
119 }
120 if (my_work->fn == NULL) {
121 printk("pdflush: bogus wakeup\n");
122 continue;
123 }
124 spin_unlock_irq(&pdflush_lock);
125
126 (*my_work->fn)(my_work->arg0);
127
128 spin_lock_irq(&pdflush_lock);
129
130 /*
131 * Thread creation: For how long have there been zero
132 * available threads?
133 *
134 * To throttle creation, we reset last_empty_jifs.
135 */
136 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
137 if (list_empty(&pdflush_list)) {
138 if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
139 last_empty_jifs = jiffies;
140 nr_pdflush_threads++;
141 spin_unlock_irq(&pdflush_lock);
142 start_one_pdflush_thread();
143 spin_lock_irq(&pdflush_lock);
144 }
145 }
146 }
147
148 my_work->fn = NULL;
149
150 /*
151 * Thread destruction: For how long has the sleepiest
152 * thread slept?
153 */
154 if (list_empty(&pdflush_list))
155 continue;
156 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
157 continue;
158 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
159 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
160 /* Limit exit rate */
161 pdf->when_i_went_to_sleep = jiffies;
162 break; /* exeunt */
163 }
164 }
165 nr_pdflush_threads--;
166 spin_unlock_irq(&pdflush_lock);
167 return 0;
168}
169
170/*
171 * Of course, my_work wants to be just a local in __pdflush(). It is
172 * separated out in this manner to hopefully prevent the compiler from
173 * performing unfortunate optimisations against the auto variables. Because
174 * these are visible to other tasks and CPUs. (No problem has actually
175 * been observed. This is just paranoia).
176 */
177static int pdflush(void *dummy)
178{
179 struct pdflush_work my_work;
180 cpumask_var_t cpus_allowed;
181
182 /*
183 * Since the caller doesn't even check kthread_run() worked, let's not
184 * freak out too much if this fails.
185 */
186 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
187 printk(KERN_WARNING "pdflush failed to allocate cpumask\n");
188 return 0;
189 }
190
191 /*
192 * pdflush can spend a lot of time doing encryption via dm-crypt. We
193 * don't want to do that at keventd's priority.
194 */
195 set_user_nice(current, 0);
196
197 /*
198 * Some configs put our parent kthread in a limited cpuset,
199 * which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
200 * Our needs are more modest - cut back to our cpusets cpus_allowed.
201 * This is needed as pdflush's are dynamically created and destroyed.
202 * The boottime pdflush's are easily placed w/o these 2 lines.
203 */
204 cpuset_cpus_allowed(current, cpus_allowed);
205 set_cpus_allowed_ptr(current, cpus_allowed);
206 free_cpumask_var(cpus_allowed);
207
208 return __pdflush(&my_work);
209}
210
211/*
212 * Attempt to wake up a pdflush thread, and get it to do some work for you.
213 * Returns zero if it indeed managed to find a worker thread, and passed your
214 * payload to it.
215 */
216int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
217{
218 unsigned long flags;
219 int ret = 0;
220
221 BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
222
223 spin_lock_irqsave(&pdflush_lock, flags);
224 if (list_empty(&pdflush_list)) {
225 ret = -1;
226 } else {
227 struct pdflush_work *pdf;
228
229 pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
230 list_del_init(&pdf->list);
231 if (list_empty(&pdflush_list))
232 last_empty_jifs = jiffies;
233 pdf->fn = fn;
234 pdf->arg0 = arg0;
235 wake_up_process(pdf->who);
236 }
237 spin_unlock_irqrestore(&pdflush_lock, flags);
238
239 return ret;
240}
241
242static void start_one_pdflush_thread(void)
243{
244 struct task_struct *k;
245
246 k = kthread_run(pdflush, NULL, "pdflush");
247 if (unlikely(IS_ERR(k))) {
248 spin_lock_irq(&pdflush_lock);
249 nr_pdflush_threads--;
250 spin_unlock_irq(&pdflush_lock);
251 }
252}
253
254static int __init pdflush_init(void)
255{
256 int i;
257
258 /*
259 * Pre-set nr_pdflush_threads... If we fail to create,
260 * the count will be decremented.
261 */
262 nr_pdflush_threads = MIN_PDFLUSH_THREADS;
263
264 for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
265 start_one_pdflush_thread();
266 return 0;
267}
268
269module_init(pdflush_init);
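
The removed pdflush.c sized its pool with two one-second rules: start another worker when the idle list has been empty for a second (up to MAX_PDFLUSH_THREADS), and let the sleepiest worker exit once it has been idle for more than a second (down to MIN_PDFLUSH_THREADS). A compressed userspace model of only that timing policy, using wall-clock seconds instead of jiffies; the thresholds and bounds mirror the deleted code, everything else is simplified:

#include <stdio.h>
#include <time.h>

#define MIN_THREADS 2
#define MAX_THREADS 8

enum pool_action { POOL_KEEP, POOL_SPAWN, POOL_REAP };

/* Decide whether the pool should grow or shrink: grow when no worker has
 * been idle for a whole second, shrink when the least recently used worker
 * has already slept for more than a second. */
static enum pool_action pool_decide(time_t now, time_t last_empty,
                                    time_t sleepiest_went_to_sleep,
                                    int nr_threads, int nr_idle)
{
        if (nr_idle == 0 && nr_threads < MAX_THREADS &&
            now - last_empty >= 1)
                return POOL_SPAWN;

        if (nr_idle > 0 && nr_threads > MIN_THREADS &&
            now - sleepiest_went_to_sleep > 1)
                return POOL_REAP;

        return POOL_KEEP;
}

int main(void)
{
        time_t now = time(NULL);

        /* busy for over a second with no idle workers -> spawn one more */
        printf("busy:  %d\n", pool_decide(now, now - 2, now, 4, 0));
        /* an idle worker has slept for three seconds -> let one exit */
        printf("quiet: %d\n", pool_decide(now, now, now - 3, 4, 1));
        return 0;
}
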
diff --git a/mm/percpu.c b/mm/percpu.c
index 3311c8919f37..6af78c1ee704 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -8,12 +8,13 @@
8 * 8 *
9 * This is percpu allocator which can handle both static and dynamic 9 * This is percpu allocator which can handle both static and dynamic
10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each 10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each
11 * chunk is consisted of nr_cpu_ids units and the first chunk is used 11 * chunk is consisted of boot-time determined number of units and the
12 * for static percpu variables in the kernel image (special boot time 12 * first chunk is used for static percpu variables in the kernel image
13 * alloc/init handling necessary as these areas need to be brought up 13 * (special boot time alloc/init handling necessary as these areas
14 * before allocation services are running). Unit grows as necessary 14 * need to be brought up before allocation services are running).
15 * and all units grow or shrink in unison. When a chunk is filled up, 15 * Unit grows as necessary and all units grow or shrink in unison.
16 * another chunk is allocated. ie. in vmalloc area 16 * When a chunk is filled up, another chunk is allocated. ie. in
17 * vmalloc area
17 * 18 *
18 * c0 c1 c2 19 * c0 c1 c2
19 * ------------------- ------------------- ------------ 20 * ------------------- ------------------- ------------
@@ -22,11 +23,13 @@
22 * 23 *
23 * Allocation is done in offset-size areas of single unit space. Ie, 24 * Allocation is done in offset-size areas of single unit space. Ie,
24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, 25 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
25 * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring 26 * c1:u1, c1:u2 and c1:u3. On UMA, units correspond directly to
26 * percpu base registers pcpu_unit_size apart. 27 * cpus. On NUMA, the mapping can be non-linear and even sparse.
28 * Percpu access can be done by configuring percpu base registers
29 * according to cpu to unit mapping and pcpu_unit_size.
27 * 30 *
28 * There are usually many small percpu allocations many of them as 31 * There are usually many small percpu allocations many of them being
29 * small as 4 bytes. The allocator organizes chunks into lists 32 * as small as 4 bytes. The allocator organizes chunks into lists
30 * according to free size and tries to allocate from the fullest one. 33 * according to free size and tries to allocate from the fullest one.
31 * Each chunk keeps the maximum contiguous area size hint which is 34 * Each chunk keeps the maximum contiguous area size hint which is
32 * guaranteed to be equal to or larger than the maximum contiguous 35
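
With the rework in this file, a chunk is no longer addressed as cpu * pcpu_unit_size inside a single vmalloc area: each cpu maps to a unit, and each unit to an offset from the chunk's base address, which may leave holes between NUMA groups. A userspace sketch of that address arithmetic; the 4-cpu unit map, offsets and sizes are invented, only the base_addr + unit_offset[cpu] + offset formula reflects the code:

#include <stdio.h>
#include <stdint.h>

#define NR_CPUS     4
#define UNIT_SIZE   (64 * 1024UL)       /* made-up pcpu_unit_size */

/* Invented cpu->unit mapping: cpus 0,1 share group 0, cpus 2,3 group 1,
 * with a gap between the groups as can happen on NUMA. */
static const int unit_map[NR_CPUS] = { 0, 1, 2, 3 };
static const unsigned long unit_offset[NR_CPUS] = {
        0 * UNIT_SIZE, 1 * UNIT_SIZE,                 /* group 0 */
        4 * UNIT_SIZE, 5 * UNIT_SIZE,                 /* group 1, sparse */
};

/* Address of a given in-chunk offset for @cpu: the rule the rewritten
 * pcpu_chunk_addr() implements. */
static uintptr_t pcpu_addr(uintptr_t base_addr, int cpu, unsigned long off)
{
        return base_addr + unit_offset[cpu] + off;
}

int main(void)
{
        uintptr_t base = 0x100000;      /* pretend chunk base address */
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu%d (unit %d): offset 0x40 -> %#lx\n",
                       cpu, unit_map[cpu],
                       (unsigned long)pcpu_addr(base, cpu, 0x40));
        return 0;
}
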
@@ -43,7 +46,7 @@
43 * 46 *
44 * To use this allocator, arch code should do the following. 47
45 * 48 *
46 * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 49 * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
47 * 50 *
48 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate 51 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
49 * regular address to percpu pointer and back if they need to be 52 * regular address to percpu pointer and back if they need to be
@@ -55,7 +58,9 @@
55 58
56#include <linux/bitmap.h> 59#include <linux/bitmap.h>
57#include <linux/bootmem.h> 60#include <linux/bootmem.h>
61#include <linux/err.h>
58#include <linux/list.h> 62#include <linux/list.h>
63#include <linux/log2.h>
59#include <linux/mm.h> 64#include <linux/mm.h>
60#include <linux/module.h> 65#include <linux/module.h>
61#include <linux/mutex.h> 66#include <linux/mutex.h>
@@ -89,25 +94,38 @@ struct pcpu_chunk {
89 struct list_head list; /* linked to pcpu_slot lists */ 94 struct list_head list; /* linked to pcpu_slot lists */
90 int free_size; /* free bytes in the chunk */ 95 int free_size; /* free bytes in the chunk */
91 int contig_hint; /* max contiguous size hint */ 96 int contig_hint; /* max contiguous size hint */
92 struct vm_struct *vm; /* mapped vmalloc region */ 97 void *base_addr; /* base address of this chunk */
93 int map_used; /* # of map entries used */ 98 int map_used; /* # of map entries used */
94 int map_alloc; /* # of map entries allocated */ 99 int map_alloc; /* # of map entries allocated */
95 int *map; /* allocation map */ 100 int *map; /* allocation map */
101 struct vm_struct **vms; /* mapped vmalloc regions */
96 bool immutable; /* no [de]population allowed */ 102 bool immutable; /* no [de]population allowed */
97 struct page **page; /* points to page array */ 103 unsigned long populated[]; /* populated bitmap */
98 struct page *page_ar[]; /* #cpus * UNIT_PAGES */
99}; 104};
100 105
101static int pcpu_unit_pages __read_mostly; 106static int pcpu_unit_pages __read_mostly;
102static int pcpu_unit_size __read_mostly; 107static int pcpu_unit_size __read_mostly;
103static int pcpu_chunk_size __read_mostly; 108static int pcpu_nr_units __read_mostly;
109static int pcpu_atom_size __read_mostly;
104static int pcpu_nr_slots __read_mostly; 110static int pcpu_nr_slots __read_mostly;
105static size_t pcpu_chunk_struct_size __read_mostly; 111static size_t pcpu_chunk_struct_size __read_mostly;
106 112
113/* cpus with the lowest and highest unit numbers */
114static unsigned int pcpu_first_unit_cpu __read_mostly;
115static unsigned int pcpu_last_unit_cpu __read_mostly;
116
107/* the address of the first chunk which starts with the kernel static area */ 117/* the address of the first chunk which starts with the kernel static area */
108void *pcpu_base_addr __read_mostly; 118void *pcpu_base_addr __read_mostly;
109EXPORT_SYMBOL_GPL(pcpu_base_addr); 119EXPORT_SYMBOL_GPL(pcpu_base_addr);
110 120
121static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */
122const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */
123
124/* group information, used for vm allocation */
125static int pcpu_nr_groups __read_mostly;
126static const unsigned long *pcpu_group_offsets __read_mostly;
127static const size_t *pcpu_group_sizes __read_mostly;
128
111/* 129/*
112 * The first chunk which always exists. Note that unlike other 130 * The first chunk which always exists. Note that unlike other
113 * chunks, this one can be allocated and mapped in several different 131 * chunks, this one can be allocated and mapped in several different
@@ -129,9 +147,9 @@ static int pcpu_reserved_chunk_limit;
129 * Synchronization rules. 147 * Synchronization rules.
130 * 148 *
131 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former 149 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
132 * protects allocation/reclaim paths, chunks and chunk->page arrays. 150 * protects allocation/reclaim paths, chunks, populated bitmap and
133 * The latter is a spinlock and protects the index data structures - 151 * vmalloc mapping. The latter is a spinlock and protects the index
134 * chunk slots, chunks and area maps in chunks. 152 * data structures - chunk slots, chunks and area maps in chunks.
135 * 153 *
136 * During allocation, pcpu_alloc_mutex is kept locked all the time and 154 * During allocation, pcpu_alloc_mutex is kept locked all the time and
137 * pcpu_lock is grabbed and released as necessary. All actual memory 155 * pcpu_lock is grabbed and released as necessary. All actual memory
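
The synchronization comment above keeps the allocator's two-lock split: a sleeping mutex serializes the slow allocation/reclaim paths (which may allocate memory and now also touch the populated bitmap and vmalloc mappings), while a spinlock covers only the short index updates. A minimal pthread sketch of the same shape, with invented names; build with -pthread:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t alloc_mutex = PTHREAD_MUTEX_INITIALIZER; /* slow paths */
static pthread_spinlock_t index_lock;                           /* index data */

static int free_bytes = 4096;           /* toy "index" state */

/* Slow path: hold the mutex across the whole operation, take the spinlock
 * only around the short bookkeeping update - mirroring pcpu_alloc()'s shape. */
static void *alloc_path(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&alloc_mutex);

        /* ... potentially sleeping work (page allocation, mapping) ... */

        pthread_spin_lock(&index_lock);
        free_bytes -= 256;              /* quick index update */
        pthread_spin_unlock(&index_lock);

        pthread_mutex_unlock(&alloc_mutex);
        return NULL;
}

int main(void)
{
        pthread_t t1, t2;

        pthread_spin_init(&index_lock, PTHREAD_PROCESS_PRIVATE);
        pthread_create(&t1, NULL, alloc_path, NULL);
        pthread_create(&t2, NULL, alloc_path, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        printf("free bytes now %d\n", free_bytes);
        pthread_spin_destroy(&index_lock);
        return 0;
}
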
@@ -178,31 +196,23 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
178 196
179static int pcpu_page_idx(unsigned int cpu, int page_idx) 197static int pcpu_page_idx(unsigned int cpu, int page_idx)
180{ 198{
181 return cpu * pcpu_unit_pages + page_idx; 199 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
182}
183
184static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
185 unsigned int cpu, int page_idx)
186{
187 return &chunk->page[pcpu_page_idx(cpu, page_idx)];
188} 200}
189 201
190static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, 202static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
191 unsigned int cpu, int page_idx) 203 unsigned int cpu, int page_idx)
192{ 204{
193 return (unsigned long)chunk->vm->addr + 205 return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
194 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); 206 (page_idx << PAGE_SHIFT);
195} 207}
196 208
197static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, 209static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
198 int page_idx) 210 unsigned int cpu, int page_idx)
199{ 211{
200 /* 212 /* must not be used on pre-mapped chunk */
201 * Any possible cpu id can be used here, so there's no need to 213 WARN_ON(chunk->immutable);
202 * worry about preemption or cpu hotplug. 214
203 */ 215 return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
204 return *pcpu_chunk_pagep(chunk, raw_smp_processor_id(),
205 page_idx) != NULL;
206} 216}
207 217
208/* set the pointer to a chunk in a page struct */ 218/* set the pointer to a chunk in a page struct */
@@ -217,6 +227,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
217 return (struct pcpu_chunk *)page->index; 227 return (struct pcpu_chunk *)page->index;
218} 228}
219 229
230static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
231{
232 *rs = find_next_zero_bit(chunk->populated, end, *rs);
233 *re = find_next_bit(chunk->populated, end, *rs + 1);
234}
235
236static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
237{
238 *rs = find_next_bit(chunk->populated, end, *rs);
239 *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
240}
241
242/*
243 * (Un)populated page region iterators. Iterate over (un)populated
244 * page regions between @start and @end in @chunk. @rs and @re should
245 * be integer variables and will be set to start and end page index of
246 * the current region.
247 */
248#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
249 for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
250 (rs) < (re); \
251 (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
252
253#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
254 for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
255 (rs) < (re); \
256 (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
257
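
The two iterators above walk maximal runs of set or clear bits in the chunk's populated bitmap, yielding [rs, re) page-index ranges. A self-contained userspace version of the populated-region walk, with a toy find_next() standing in for the kernel's find_next_bit()/find_next_zero_bit() and an invented 16-bit test pattern:

#include <stdio.h>

#define NBITS 16

/* Toy stand-in for find_next_bit()/find_next_zero_bit(), operating on a
 * plain unsigned int bitmap for brevity; returns end if nothing matches. */
static int find_next(unsigned int map, int start, int end, int want_set)
{
        int i;

        for (i = start; i < end; i++)
                if (!!(map & (1u << i)) == want_set)
                        return i;
        return end;
}

static void next_pop(unsigned int map, int *rs, int *re, int end)
{
        *rs = find_next(map, *rs, end, 1);      /* first populated page */
        *re = find_next(map, *rs + 1, end, 0);  /* first hole after it */
}

/* Same shape as pcpu_for_each_pop_region(): each pass covers [rs, re). */
#define for_each_pop_region(map, rs, re, start, end)                    \
        for ((rs) = (start), next_pop((map), &(rs), &(re), (end));      \
             (rs) < (re);                                               \
             (rs) = (re) + 1, next_pop((map), &(rs), &(re), (end)))

int main(void)
{
        unsigned int populated = 0x0f3c;        /* pages 2-5 and 8-11 populated */
        int rs, re;

        for_each_pop_region(populated, rs, re, 0, NBITS)
                printf("populated region: [%d, %d)\n", rs, re);
        return 0;
}
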
220/** 258/**
221 * pcpu_mem_alloc - allocate memory 259 * pcpu_mem_alloc - allocate memory
222 * @size: bytes to allocate 260 * @size: bytes to allocate
@@ -292,10 +330,10 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
292 */ 330 */
293static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) 331static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
294{ 332{
295 void *first_start = pcpu_first_chunk->vm->addr; 333 void *first_start = pcpu_first_chunk->base_addr;
296 334
297 /* is it in the first chunk? */ 335 /* is it in the first chunk? */
298 if (addr >= first_start && addr < first_start + pcpu_chunk_size) { 336 if (addr >= first_start && addr < first_start + pcpu_unit_size) {
299 /* is it in the reserved area? */ 337 /* is it in the reserved area? */
300 if (addr < first_start + pcpu_reserved_chunk_limit) 338 if (addr < first_start + pcpu_reserved_chunk_limit)
301 return pcpu_reserved_chunk; 339 return pcpu_reserved_chunk;
@@ -309,7 +347,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
309 * space. Note that any possible cpu id can be used here, so 347 * space. Note that any possible cpu id can be used here, so
310 * there's no need to worry about preemption or cpu hotplug. 348 * there's no need to worry about preemption or cpu hotplug.
311 */ 349 */
312 addr += raw_smp_processor_id() * pcpu_unit_size; 350 addr += pcpu_unit_offsets[raw_smp_processor_id()];
313 return pcpu_get_page_chunk(vmalloc_to_page(addr)); 351 return pcpu_get_page_chunk(vmalloc_to_page(addr));
314} 352}
315 353
@@ -558,125 +596,327 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
558} 596}
559 597
560/** 598/**
561 * pcpu_unmap - unmap pages out of a pcpu_chunk 599 * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
562 * @chunk: chunk of interest 600 * @chunk: chunk of interest
563 * @page_start: page index of the first page to unmap 601 * @bitmapp: output parameter for bitmap
564 * @page_end: page index of the last page to unmap + 1 602 * @may_alloc: may allocate the array
565 * @flush_tlb: whether to flush tlb or not
566 * 603 *
567 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. 604 * Returns pointer to array of pointers to struct page and bitmap,
568 * If @flush is true, vcache is flushed before unmapping and tlb 605 * both of which can be indexed with pcpu_page_idx(). The returned
569 * after. 606 * array is cleared to zero and *@bitmapp is copied from
607 * @chunk->populated. Note that there is only one array and bitmap
608 * and access exclusion is the caller's responsibility.
609 *
610 * CONTEXT:
611 * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
612 * Otherwise, don't care.
613 *
614 * RETURNS:
615 * Pointer to temp pages array on success, NULL on failure.
570 */ 616 */
571static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, 617static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
572 bool flush_tlb) 618 unsigned long **bitmapp,
619 bool may_alloc)
573{ 620{
574 unsigned int last = nr_cpu_ids - 1; 621 static struct page **pages;
575 unsigned int cpu; 622 static unsigned long *bitmap;
623 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
624 size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
625 sizeof(unsigned long);
626
627 if (!pages || !bitmap) {
628 if (may_alloc && !pages)
629 pages = pcpu_mem_alloc(pages_size);
630 if (may_alloc && !bitmap)
631 bitmap = pcpu_mem_alloc(bitmap_size);
632 if (!pages || !bitmap)
633 return NULL;
634 }
576 635
577 /* unmap must not be done on immutable chunk */ 636 memset(pages, 0, pages_size);
578 WARN_ON(chunk->immutable); 637 bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
579 638
580 /* 639 *bitmapp = bitmap;
581 * Each flushing trial can be very expensive, issue flush on 640 return pages;
582 * the whole region at once rather than doing it for each cpu. 641}
583 * This could be an overkill but is more scalable.
584 */
585 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
586 pcpu_chunk_addr(chunk, last, page_end));
587 642
588 for_each_possible_cpu(cpu) 643/**
589 unmap_kernel_range_noflush( 644 * pcpu_free_pages - free pages which were allocated for @chunk
590 pcpu_chunk_addr(chunk, cpu, page_start), 645 * @chunk: chunk pages were allocated for
591 (page_end - page_start) << PAGE_SHIFT); 646 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
592 647 * @populated: populated bitmap
593 /* ditto as flush_cache_vunmap() */ 648 * @page_start: page index of the first page to be freed
594 if (flush_tlb) 649 * @page_end: page index of the last page to be freed + 1
595 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), 650 *
596 pcpu_chunk_addr(chunk, last, page_end)); 651 * Free pages [@page_start and @page_end) in @pages for all units.
652 * The pages were allocated for @chunk.
653 */
654static void pcpu_free_pages(struct pcpu_chunk *chunk,
655 struct page **pages, unsigned long *populated,
656 int page_start, int page_end)
657{
658 unsigned int cpu;
659 int i;
660
661 for_each_possible_cpu(cpu) {
662 for (i = page_start; i < page_end; i++) {
663 struct page *page = pages[pcpu_page_idx(cpu, i)];
664
665 if (page)
666 __free_page(page);
667 }
668 }
597} 669}
598 670
599/** 671/**
600 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk 672 * pcpu_alloc_pages - allocates pages for @chunk
601 * @chunk: chunk to depopulate 673 * @chunk: target chunk
602 * @off: offset to the area to depopulate 674 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
603 * @size: size of the area to depopulate in bytes 675 * @populated: populated bitmap
604 * @flush: whether to flush cache and tlb or not 676 * @page_start: page index of the first page to be allocated
605 * 677 * @page_end: page index of the last page to be allocated + 1
606 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 678 *
607 * from @chunk. If @flush is true, vcache is flushed before unmapping 679 * Allocate pages [@page_start,@page_end) into @pages for all units.
608 * and tlb after. 680 * The allocation is for @chunk. Percpu core doesn't care about the
609 * 681 * content of @pages and will pass it verbatim to pcpu_map_pages().
610 * CONTEXT:
611 * pcpu_alloc_mutex.
612 */ 682 */
613static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, 683static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
614 bool flush) 684 struct page **pages, unsigned long *populated,
685 int page_start, int page_end)
615{ 686{
616 int page_start = PFN_DOWN(off); 687 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
617 int page_end = PFN_UP(off + size);
618 int unmap_start = -1;
619 int uninitialized_var(unmap_end);
620 unsigned int cpu; 688 unsigned int cpu;
621 int i; 689 int i;
622 690
623 for (i = page_start; i < page_end; i++) { 691 for_each_possible_cpu(cpu) {
624 for_each_possible_cpu(cpu) { 692 for (i = page_start; i < page_end; i++) {
625 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); 693 struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
694
695 *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
696 if (!*pagep) {
697 pcpu_free_pages(chunk, pages, populated,
698 page_start, page_end);
699 return -ENOMEM;
700 }
701 }
702 }
703 return 0;
704}
626 705
627 if (!*pagep) 706/**
628 continue; 707 * pcpu_pre_unmap_flush - flush cache prior to unmapping
708 * @chunk: chunk the regions to be flushed belongs to
709 * @page_start: page index of the first page to be flushed
710 * @page_end: page index of the last page to be flushed + 1
711 *
712 * Pages in [@page_start,@page_end) of @chunk are about to be
713 * unmapped. Flush cache. As each flushing trial can be very
714 * expensive, issue flush on the whole region at once rather than
715 * doing it for each cpu. This could be an overkill but is more
716 * scalable.
717 */
718static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
719 int page_start, int page_end)
720{
721 flush_cache_vunmap(
722 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
723 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
724}
629 725
630 __free_page(*pagep); 726static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
727{
728 unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
729}
631 730
632 /* 731/**
633 * If it's partial depopulation, it might get 732 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
634 * populated or depopulated again. Mark the 733 * @chunk: chunk of interest
635 * page gone. 734 * @pages: pages array which can be used to pass information to free
636 */ 735 * @populated: populated bitmap
637 *pagep = NULL; 736 * @page_start: page index of the first page to unmap
737 * @page_end: page index of the last page to unmap + 1
738 *
739 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
740 * Corresponding elements in @pages were cleared by the caller and can
741 * be used to carry information to pcpu_free_pages() which will be
742 * called after all unmaps are finished. The caller should call
743 * proper pre/post flush functions.
744 */
745static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
746 struct page **pages, unsigned long *populated,
747 int page_start, int page_end)
748{
749 unsigned int cpu;
750 int i;
751
752 for_each_possible_cpu(cpu) {
753 for (i = page_start; i < page_end; i++) {
754 struct page *page;
638 755
639 unmap_start = unmap_start < 0 ? i : unmap_start; 756 page = pcpu_chunk_page(chunk, cpu, i);
640 unmap_end = i + 1; 757 WARN_ON(!page);
758 pages[pcpu_page_idx(cpu, i)] = page;
641 } 759 }
760 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
761 page_end - page_start);
642 } 762 }
643 763
644 if (unmap_start >= 0) 764 for (i = page_start; i < page_end; i++)
645 pcpu_unmap(chunk, unmap_start, unmap_end, flush); 765 __clear_bit(i, populated);
646} 766}
647 767
648/** 768/**
649 * pcpu_map - map pages into a pcpu_chunk 769 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
770 * @chunk: pcpu_chunk the regions to be flushed belong to
771 * @page_start: page index of the first page to be flushed
772 * @page_end: page index of the last page to be flushed + 1
773 *
774 * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
775 * TLB for the regions. This can be skipped if the area is to be
776 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
777 *
778 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
779 * for the whole region.
780 */
781static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
782 int page_start, int page_end)
783{
784 flush_tlb_kernel_range(
785 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
786 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
787}
788
789static int __pcpu_map_pages(unsigned long addr, struct page **pages,
790 int nr_pages)
791{
792 return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
793 PAGE_KERNEL, pages);
794}
795
796/**
797 * pcpu_map_pages - map pages into a pcpu_chunk
650 * @chunk: chunk of interest 798 * @chunk: chunk of interest
799 * @pages: pages array containing pages to be mapped
800 * @populated: populated bitmap
651 * @page_start: page index of the first page to map 801 * @page_start: page index of the first page to map
652 * @page_end: page index of the last page to map + 1 802 * @page_end: page index of the last page to map + 1
653 * 803 *
654 * For each cpu, map pages [@page_start,@page_end) into @chunk. 804 * For each cpu, map pages [@page_start,@page_end) into @chunk. The
655 * vcache is flushed afterwards. 805 * caller is responsible for calling pcpu_post_map_flush() after all
806 * mappings are complete.
807 *
808 * This function is responsible for setting corresponding bits in
809 * @chunk->populated bitmap and whatever is necessary for reverse
810 * lookup (addr -> chunk).
656 */ 811 */
657static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) 812static int pcpu_map_pages(struct pcpu_chunk *chunk,
813 struct page **pages, unsigned long *populated,
814 int page_start, int page_end)
658{ 815{
659 unsigned int last = nr_cpu_ids - 1; 816 unsigned int cpu, tcpu;
660 unsigned int cpu; 817 int i, err;
661 int err;
662
663 /* map must not be done on immutable chunk */
664 WARN_ON(chunk->immutable);
665 818
666 for_each_possible_cpu(cpu) { 819 for_each_possible_cpu(cpu) {
667 err = map_kernel_range_noflush( 820 err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
668 pcpu_chunk_addr(chunk, cpu, page_start), 821 &pages[pcpu_page_idx(cpu, page_start)],
669 (page_end - page_start) << PAGE_SHIFT, 822 page_end - page_start);
670 PAGE_KERNEL,
671 pcpu_chunk_pagep(chunk, cpu, page_start));
672 if (err < 0) 823 if (err < 0)
673 return err; 824 goto err;
825 }
826
827 /* mapping successful, link chunk and mark populated */
828 for (i = page_start; i < page_end; i++) {
829 for_each_possible_cpu(cpu)
830 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
831 chunk);
832 __set_bit(i, populated);
674 } 833 }
675 834
676 /* flush at once, please read comments in pcpu_unmap() */
677 flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
678 pcpu_chunk_addr(chunk, last, page_end));
679 return 0; 835 return 0;
836
837err:
838 for_each_possible_cpu(tcpu) {
839 if (tcpu == cpu)
840 break;
841 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
842 page_end - page_start);
843 }
844 return err;
845}
846
847/**
848 * pcpu_post_map_flush - flush cache after mapping
849 * @chunk: pcpu_chunk the regions to be flushed belong to
850 * @page_start: page index of the first page to be flushed
851 * @page_end: page index of the last page to be flushed + 1
852 *
853 * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
854 * cache.
855 *
856 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
857 * for the whole region.
858 */
859static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
860 int page_start, int page_end)
861{
862 flush_cache_vmap(
863 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
864 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
865}
866
867/**
868 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
869 * @chunk: chunk to depopulate
870 * @off: offset to the area to depopulate
871 * @size: size of the area to depopulate in bytes
872 * @flush: whether to flush cache and tlb or not
873 *
874 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
875 * from @chunk. If @flush is true, vcache is flushed before unmapping
876 * and tlb after.
877 *
878 * CONTEXT:
879 * pcpu_alloc_mutex.
880 */
881static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
882{
883 int page_start = PFN_DOWN(off);
884 int page_end = PFN_UP(off + size);
885 struct page **pages;
886 unsigned long *populated;
887 int rs, re;
888
889 /* quick path, check whether it's empty already */
890 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
891 if (rs == page_start && re == page_end)
892 return;
893 break;
894 }
895
896 /* immutable chunks can't be depopulated */
897 WARN_ON(chunk->immutable);
898
899 /*
900 * If control reaches here, there must have been at least one
901 * successful population attempt so the temp pages array must
902 * be available now.
903 */
904 pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
905 BUG_ON(!pages);
906
907 /* unmap and free */
908 pcpu_pre_unmap_flush(chunk, page_start, page_end);
909
910 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
911 pcpu_unmap_pages(chunk, pages, populated, rs, re);
912
913 /* no need to flush tlb, vmalloc will handle it lazily */
914
915 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
916 pcpu_free_pages(chunk, pages, populated, rs, re);
917
918 /* commit new bitmap */
919 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
680} 920}
681 921
682/** 922/**
@@ -693,58 +933,68 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
693 */ 933 */
694static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 934static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
695{ 935{
696 const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
697 int page_start = PFN_DOWN(off); 936 int page_start = PFN_DOWN(off);
698 int page_end = PFN_UP(off + size); 937 int page_end = PFN_UP(off + size);
699 int map_start = -1; 938 int free_end = page_start, unmap_end = page_start;
700 int uninitialized_var(map_end); 939 struct page **pages;
940 unsigned long *populated;
701 unsigned int cpu; 941 unsigned int cpu;
702 int i; 942 int rs, re, rc;
703 943
704 for (i = page_start; i < page_end; i++) { 944 /* quick path, check whether all pages are already there */
705 if (pcpu_chunk_page_occupied(chunk, i)) { 945 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
706 if (map_start >= 0) { 946 if (rs == page_start && re == page_end)
707 if (pcpu_map(chunk, map_start, map_end)) 947 goto clear;
708 goto err; 948 break;
709 map_start = -1; 949 }
710 }
711 continue;
712 }
713 950
714 map_start = map_start < 0 ? i : map_start; 951 /* need to allocate and map pages, this chunk can't be immutable */
715 map_end = i + 1; 952 WARN_ON(chunk->immutable);
716 953
717 for_each_possible_cpu(cpu) { 954 pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
718 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); 955 if (!pages)
956 return -ENOMEM;
719 957
720 *pagep = alloc_pages_node(cpu_to_node(cpu), 958 /* alloc and map */
721 alloc_mask, 0); 959 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
722 if (!*pagep) 960 rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
723 goto err; 961 if (rc)
724 pcpu_set_page_chunk(*pagep, chunk); 962 goto err_free;
725 } 963 free_end = re;
726 } 964 }
727 965
728 if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) 966 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
729 goto err; 967 rc = pcpu_map_pages(chunk, pages, populated, rs, re);
968 if (rc)
969 goto err_unmap;
970 unmap_end = re;
971 }
972 pcpu_post_map_flush(chunk, page_start, page_end);
730 973
974 /* commit new bitmap */
975 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
976clear:
731 for_each_possible_cpu(cpu) 977 for_each_possible_cpu(cpu)
732 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, 978 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
733 size);
734
735 return 0; 979 return 0;
736err: 980
737 /* likely under heavy memory pressure, give memory back */ 981err_unmap:
738 pcpu_depopulate_chunk(chunk, off, size, true); 982 pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
739 return -ENOMEM; 983 pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
984 pcpu_unmap_pages(chunk, pages, populated, rs, re);
985 pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
986err_free:
987 pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
988 pcpu_free_pages(chunk, pages, populated, rs, re);
989 return rc;
740} 990}
741 991
742static void free_pcpu_chunk(struct pcpu_chunk *chunk) 992static void free_pcpu_chunk(struct pcpu_chunk *chunk)
743{ 993{
744 if (!chunk) 994 if (!chunk)
745 return; 995 return;
746 if (chunk->vm) 996 if (chunk->vms)
747 free_vm_area(chunk->vm); 997 pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups);
748 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); 998 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
749 kfree(chunk); 999 kfree(chunk);
750} 1000}
@@ -760,10 +1010,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
760 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); 1010 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
761 chunk->map_alloc = PCPU_DFL_MAP_ALLOC; 1011 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
762 chunk->map[chunk->map_used++] = pcpu_unit_size; 1012 chunk->map[chunk->map_used++] = pcpu_unit_size;
763 chunk->page = chunk->page_ar;
764 1013
765 chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC); 1014 chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
766 if (!chunk->vm) { 1015 pcpu_nr_groups, pcpu_atom_size,
1016 GFP_KERNEL);
1017 if (!chunk->vms) {
767 free_pcpu_chunk(chunk); 1018 free_pcpu_chunk(chunk);
768 return NULL; 1019 return NULL;
769 } 1020 }
@@ -771,6 +1022,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
771 INIT_LIST_HEAD(&chunk->list); 1022 INIT_LIST_HEAD(&chunk->list);
772 chunk->free_size = pcpu_unit_size; 1023 chunk->free_size = pcpu_unit_size;
773 chunk->contig_hint = pcpu_unit_size; 1024 chunk->contig_hint = pcpu_unit_size;
1025 chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0];
774 1026
775 return chunk; 1027 return chunk;
776} 1028}
@@ -791,7 +1043,9 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
791 */ 1043 */
792static void *pcpu_alloc(size_t size, size_t align, bool reserved) 1044static void *pcpu_alloc(size_t size, size_t align, bool reserved)
793{ 1045{
1046 static int warn_limit = 10;
794 struct pcpu_chunk *chunk; 1047 struct pcpu_chunk *chunk;
1048 const char *err;
795 int slot, off; 1049 int slot, off;
796 1050
797 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { 1051 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
@@ -807,11 +1061,14 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
807 if (reserved && pcpu_reserved_chunk) { 1061 if (reserved && pcpu_reserved_chunk) {
808 chunk = pcpu_reserved_chunk; 1062 chunk = pcpu_reserved_chunk;
809 if (size > chunk->contig_hint || 1063 if (size > chunk->contig_hint ||
810 pcpu_extend_area_map(chunk) < 0) 1064 pcpu_extend_area_map(chunk) < 0) {
1065 err = "failed to extend area map of reserved chunk";
811 goto fail_unlock; 1066 goto fail_unlock;
1067 }
812 off = pcpu_alloc_area(chunk, size, align); 1068 off = pcpu_alloc_area(chunk, size, align);
813 if (off >= 0) 1069 if (off >= 0)
814 goto area_found; 1070 goto area_found;
1071 err = "alloc from reserved chunk failed";
815 goto fail_unlock; 1072 goto fail_unlock;
816 } 1073 }
817 1074
@@ -828,6 +1085,7 @@ restart:
828 case 1: 1085 case 1:
829 goto restart; /* pcpu_lock dropped, restart */ 1086 goto restart; /* pcpu_lock dropped, restart */
830 default: 1087 default:
1088 err = "failed to extend area map";
831 goto fail_unlock; 1089 goto fail_unlock;
832 } 1090 }
833 1091
@@ -841,8 +1099,10 @@ restart:
841 spin_unlock_irq(&pcpu_lock); 1099 spin_unlock_irq(&pcpu_lock);
842 1100
843 chunk = alloc_pcpu_chunk(); 1101 chunk = alloc_pcpu_chunk();
844 if (!chunk) 1102 if (!chunk) {
1103 err = "failed to allocate new chunk";
845 goto fail_unlock_mutex; 1104 goto fail_unlock_mutex;
1105 }
846 1106
847 spin_lock_irq(&pcpu_lock); 1107 spin_lock_irq(&pcpu_lock);
848 pcpu_chunk_relocate(chunk, -1); 1108 pcpu_chunk_relocate(chunk, -1);
@@ -855,17 +1115,26 @@ area_found:
855 if (pcpu_populate_chunk(chunk, off, size)) { 1115 if (pcpu_populate_chunk(chunk, off, size)) {
856 spin_lock_irq(&pcpu_lock); 1116 spin_lock_irq(&pcpu_lock);
857 pcpu_free_area(chunk, off); 1117 pcpu_free_area(chunk, off);
1118 err = "failed to populate";
858 goto fail_unlock; 1119 goto fail_unlock;
859 } 1120 }
860 1121
861 mutex_unlock(&pcpu_alloc_mutex); 1122 mutex_unlock(&pcpu_alloc_mutex);
862 1123
863 return __addr_to_pcpu_ptr(chunk->vm->addr + off); 1124 /* return address relative to base address */
1125 return __addr_to_pcpu_ptr(chunk->base_addr + off);
864 1126
865fail_unlock: 1127fail_unlock:
866 spin_unlock_irq(&pcpu_lock); 1128 spin_unlock_irq(&pcpu_lock);
867fail_unlock_mutex: 1129fail_unlock_mutex:
868 mutex_unlock(&pcpu_alloc_mutex); 1130 mutex_unlock(&pcpu_alloc_mutex);
1131 if (warn_limit) {
1132 pr_warning("PERCPU: allocation failed, size=%zu align=%zu, "
1133 "%s\n", size, align, err);
1134 dump_stack();
1135 if (!--warn_limit)
1136 pr_info("PERCPU: limit reached, disable warning\n");
1137 }
869 return NULL; 1138 return NULL;
870} 1139}
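
pcpu_alloc() now records why it failed and prints the reason at most ten times, so a misbehaving caller cannot flood the log. The same counter pattern in standalone form; fprintf replaces pr_warning()/dump_stack() and only the limit of ten is taken from the hunk:

#include <stdio.h>
#include <stdarg.h>

/* Print at most 'warn_limit' warnings, then announce that further warnings
 * are suppressed - the pattern used in pcpu_alloc() above. */
static void limited_warn(const char *fmt, ...)
{
        static int warn_limit = 10;
        va_list ap;

        if (!warn_limit)
                return;

        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);

        if (!--warn_limit)
                fprintf(stderr, "warning limit reached, disabling warnings\n");
}

int main(void)
{
        int i;

        for (i = 0; i < 15; i++)
                limited_warn("allocation failed, size=%d align=%d: %s\n",
                             64, 8, "failed to populate");
        return 0;
}
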
871 1140
@@ -938,12 +1207,13 @@ static void pcpu_reclaim(struct work_struct *work)
938 } 1207 }
939 1208
940 spin_unlock_irq(&pcpu_lock); 1209 spin_unlock_irq(&pcpu_lock);
941 mutex_unlock(&pcpu_alloc_mutex);
942 1210
943 list_for_each_entry_safe(chunk, next, &todo, list) { 1211 list_for_each_entry_safe(chunk, next, &todo, list) {
944 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); 1212 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
945 free_pcpu_chunk(chunk); 1213 free_pcpu_chunk(chunk);
946 } 1214 }
1215
1216 mutex_unlock(&pcpu_alloc_mutex);
947} 1217}
948 1218
949/** 1219/**
@@ -968,7 +1238,7 @@ void free_percpu(void *ptr)
968 spin_lock_irqsave(&pcpu_lock, flags); 1238 spin_lock_irqsave(&pcpu_lock, flags);
969 1239
970 chunk = pcpu_chunk_addr_search(addr); 1240 chunk = pcpu_chunk_addr_search(addr);
971 off = addr - chunk->vm->addr; 1241 off = addr - chunk->base_addr;
972 1242
973 pcpu_free_area(chunk, off); 1243 pcpu_free_area(chunk, off);
974 1244
@@ -987,30 +1257,299 @@ void free_percpu(void *ptr)
987} 1257}
988EXPORT_SYMBOL_GPL(free_percpu); 1258EXPORT_SYMBOL_GPL(free_percpu);
989 1259
1260static inline size_t pcpu_calc_fc_sizes(size_t static_size,
1261 size_t reserved_size,
1262 ssize_t *dyn_sizep)
1263{
1264 size_t size_sum;
1265
1266 size_sum = PFN_ALIGN(static_size + reserved_size +
1267 (*dyn_sizep >= 0 ? *dyn_sizep : 0));
1268 if (*dyn_sizep != 0)
1269 *dyn_sizep = size_sum - static_size - reserved_size;
1270
1271 return size_sum;
1272}
1273
990/** 1274/**
991 * pcpu_setup_first_chunk - initialize the first percpu chunk 1275 * pcpu_alloc_alloc_info - allocate percpu allocation info
992 * @get_page_fn: callback to fetch page pointer 1276 * @nr_groups: the number of groups
993 * @static_size: the size of static percpu area in bytes 1277 * @nr_units: the number of units
1278 *
1279 * Allocate ai which is large enough for @nr_groups groups containing
1280 * @nr_units units. The returned ai's groups[0].cpu_map points to the
1281 * cpu_map array which is long enough for @nr_units and filled with
1282 * NR_CPUS. It's the caller's responsibility to initialize cpu_map
1283 * pointer of other groups.
1284 *
1285 * RETURNS:
1286 * Pointer to the allocated pcpu_alloc_info on success, NULL on
1287 * failure.
1288 */
1289struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1290 int nr_units)
1291{
1292 struct pcpu_alloc_info *ai;
1293 size_t base_size, ai_size;
1294 void *ptr;
1295 int unit;
1296
1297 base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
1298 __alignof__(ai->groups[0].cpu_map[0]));
1299 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
1300
1301 ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
1302 if (!ptr)
1303 return NULL;
1304 ai = ptr;
1305 ptr += base_size;
1306
1307 ai->groups[0].cpu_map = ptr;
1308
1309 for (unit = 0; unit < nr_units; unit++)
1310 ai->groups[0].cpu_map[unit] = NR_CPUS;
1311
1312 ai->nr_groups = nr_groups;
1313 ai->__ai_size = PFN_ALIGN(ai_size);
1314
1315 return ai;
1316}
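
pcpu_alloc_alloc_info() carves the header, the group array and the first group's cpu_map out of one page-aligned bootmem block. The userspace sketch below mirrors that layout computation with simplified stand-in structures (not the kernel definitions) to show where base_size and the cpu_map pointer come from:

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

#define NR_CPUS 64              /* illustrative */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

struct group_info {             /* stand-in for struct pcpu_group_info */
        int nr_units;
        unsigned long base_offset;
        unsigned int *cpu_map;
};

struct alloc_info {             /* stand-in for struct pcpu_alloc_info */
        int nr_groups;
        size_t ai_size;
        struct group_info groups[];
};

int main(void)
{
        int nr_groups = 2, nr_units = 8, unit;
        size_t base_size, ai_size;
        struct alloc_info *ai;
        void *ptr;

        base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
                          __alignof__(ai->groups[0].cpu_map[0]));
        ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);

        ptr = calloc(1, ai_size);
        ai = ptr;
        /* cpu_map lives right after the header + group array */
        ai->groups[0].cpu_map = (unsigned int *)((char *)ptr + base_size);
        for (unit = 0; unit < nr_units; unit++)
                ai->groups[0].cpu_map[unit] = NR_CPUS;  /* "unused" marker */
        ai->nr_groups = nr_groups;
        ai->ai_size = ai_size;

        printf("header+groups: %zu bytes, cpu_map at offset %zu, total %zu\n",
               base_size, base_size, ai_size);
        free(ptr);
        return 0;
}
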
1317
1318/**
1319 * pcpu_free_alloc_info - free percpu allocation info
1320 * @ai: pcpu_alloc_info to free
1321 *
1322 * Free @ai which was allocated by pcpu_alloc_alloc_info().
1323 */
1324void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1325{
1326 free_bootmem(__pa(ai), ai->__ai_size);
1327}
1328
1329/**
1330 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
994 * @reserved_size: the size of reserved percpu area in bytes 1331 * @reserved_size: the size of reserved percpu area in bytes
995 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1332 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
996 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto 1333 * @atom_size: allocation atom size
997 * @base_addr: mapped address, NULL for auto 1334 * @cpu_distance_fn: callback to determine distance between cpus, optional
998 * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary 1335 *
1336 * This function determines grouping of units, their mappings to cpus
1337 * and other parameters considering needed percpu size, allocation
1338 * atom size and distances between CPUs.
1339 *
 1340 * Groups are always multiples of atom size and CPUs which are of
1341 * LOCAL_DISTANCE both ways are grouped together and share space for
1342 * units in the same group. The returned configuration is guaranteed
1343 * to have CPUs on different nodes on different groups and >=75% usage
1344 * of allocated virtual address space.
1345 *
1346 * RETURNS:
1347 * On success, pointer to the new allocation_info is returned. On
1348 * failure, ERR_PTR value is returned.
1349 */
1350struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1351 size_t reserved_size, ssize_t dyn_size,
1352 size_t atom_size,
1353 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1354{
1355 static int group_map[NR_CPUS] __initdata;
1356 static int group_cnt[NR_CPUS] __initdata;
1357 const size_t static_size = __per_cpu_end - __per_cpu_start;
1358 int group_cnt_max = 0, nr_groups = 1, nr_units = 0;
1359 size_t size_sum, min_unit_size, alloc_size;
1360 int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
1361 int last_allocs, group, unit;
1362 unsigned int cpu, tcpu;
1363 struct pcpu_alloc_info *ai;
1364 unsigned int *cpu_map;
1365
1366 /* this function may be called multiple times */
1367 memset(group_map, 0, sizeof(group_map));
 1368 memset(group_cnt, 0, sizeof(group_cnt));
1369
1370 /*
1371 * Determine min_unit_size, alloc_size and max_upa such that
1372 * alloc_size is multiple of atom_size and is the smallest
 1373 * which can accommodate 4k aligned segments which are equal to
1374 * or larger than min_unit_size.
1375 */
1376 size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
1377 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1378
1379 alloc_size = roundup(min_unit_size, atom_size);
1380 upa = alloc_size / min_unit_size;
1381 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1382 upa--;
1383 max_upa = upa;
1384
1385 /* group cpus according to their proximity */
1386 for_each_possible_cpu(cpu) {
1387 group = 0;
1388 next_group:
1389 for_each_possible_cpu(tcpu) {
1390 if (cpu == tcpu)
1391 break;
1392 if (group_map[tcpu] == group && cpu_distance_fn &&
1393 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1394 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1395 group++;
1396 nr_groups = max(nr_groups, group + 1);
1397 goto next_group;
1398 }
1399 }
1400 group_map[cpu] = group;
1401 group_cnt[group]++;
1402 group_cnt_max = max(group_cnt_max, group_cnt[group]);
1403 }
1404
1405 /*
1406 * Expand unit size until address space usage goes over 75%
1407 * and then as much as possible without using more address
1408 * space.
1409 */
1410 last_allocs = INT_MAX;
1411 for (upa = max_upa; upa; upa--) {
1412 int allocs = 0, wasted = 0;
1413
1414 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1415 continue;
1416
1417 for (group = 0; group < nr_groups; group++) {
1418 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1419 allocs += this_allocs;
1420 wasted += this_allocs * upa - group_cnt[group];
1421 }
1422
1423 /*
1424 * Don't accept if wastage is over 25%. The
1425 * greater-than comparison ensures upa==1 always
1426 * passes the following check.
1427 */
1428 if (wasted > num_possible_cpus() / 3)
1429 continue;
1430
1431 /* and then don't consume more memory */
1432 if (allocs > last_allocs)
1433 break;
1434 last_allocs = allocs;
1435 best_upa = upa;
1436 }
1437 upa = best_upa;
1438
1439 /* allocate and fill alloc_info */
1440 for (group = 0; group < nr_groups; group++)
1441 nr_units += roundup(group_cnt[group], upa);
1442
1443 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1444 if (!ai)
1445 return ERR_PTR(-ENOMEM);
1446 cpu_map = ai->groups[0].cpu_map;
1447
1448 for (group = 0; group < nr_groups; group++) {
1449 ai->groups[group].cpu_map = cpu_map;
1450 cpu_map += roundup(group_cnt[group], upa);
1451 }
1452
1453 ai->static_size = static_size;
1454 ai->reserved_size = reserved_size;
1455 ai->dyn_size = dyn_size;
1456 ai->unit_size = alloc_size / upa;
1457 ai->atom_size = atom_size;
1458 ai->alloc_size = alloc_size;
1459
1460 for (group = 0, unit = 0; group_cnt[group]; group++) {
1461 struct pcpu_group_info *gi = &ai->groups[group];
1462
1463 /*
1464 * Initialize base_offset as if all groups are located
1465 * back-to-back. The caller should update this to
1466 * reflect actual allocation.
1467 */
1468 gi->base_offset = unit * ai->unit_size;
1469
1470 for_each_possible_cpu(cpu)
1471 if (group_map[cpu] == group)
1472 gi->cpu_map[gi->nr_units++] = cpu;
1473 gi->nr_units = roundup(gi->nr_units, upa);
1474 unit += gi->nr_units;
1475 }
1476 BUG_ON(unit != nr_units);
1477
1478 return ai;
1479}
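
The core of the sizing pass above is the units-per-allocation (upa) search: find the largest upa that divides alloc_size into page-aligned units, then walk downward rejecting layouts that waste too much space. The standalone sketch below reproduces just the max_upa step, with made-up atom and unit sizes:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

static unsigned long roundup_ul(unsigned long x, unsigned long a)
{
        return ((x + a - 1) / a) * a;
}

int main(void)
{
        unsigned long min_unit_size = 80 * 1024;   /* illustrative */
        unsigned long atom_size = 2 * 1024 * 1024; /* e.g. a 2MB huge page */
        unsigned long alloc_size, upa, max_upa;

        alloc_size = roundup_ul(min_unit_size, atom_size);
        upa = alloc_size / min_unit_size;
        /* shrink upa until units divide alloc_size evenly and stay page aligned */
        while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
                upa--;
        max_upa = upa;

        printf("alloc_size=%lu max_upa=%lu unit_size=%lu\n",
               alloc_size, max_upa, alloc_size / max_upa);
        return 0;
}
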
1480
1481/**
1482 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
1483 * @lvl: loglevel
1484 * @ai: allocation info to dump
1485 *
1486 * Print out information about @ai using loglevel @lvl.
1487 */
1488static void pcpu_dump_alloc_info(const char *lvl,
1489 const struct pcpu_alloc_info *ai)
1490{
1491 int group_width = 1, cpu_width = 1, width;
1492 char empty_str[] = "--------";
1493 int alloc = 0, alloc_end = 0;
1494 int group, v;
1495 int upa, apl; /* units per alloc, allocs per line */
1496
1497 v = ai->nr_groups;
1498 while (v /= 10)
1499 group_width++;
1500
1501 v = num_possible_cpus();
1502 while (v /= 10)
1503 cpu_width++;
1504 empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
1505
1506 upa = ai->alloc_size / ai->unit_size;
1507 width = upa * (cpu_width + 1) + group_width + 3;
1508 apl = rounddown_pow_of_two(max(60 / width, 1));
1509
1510 printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
1511 lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
1512 ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
1513
1514 for (group = 0; group < ai->nr_groups; group++) {
1515 const struct pcpu_group_info *gi = &ai->groups[group];
1516 int unit = 0, unit_end = 0;
1517
1518 BUG_ON(gi->nr_units % upa);
1519 for (alloc_end += gi->nr_units / upa;
1520 alloc < alloc_end; alloc++) {
1521 if (!(alloc % apl)) {
1522 printk("\n");
1523 printk("%spcpu-alloc: ", lvl);
1524 }
1525 printk("[%0*d] ", group_width, group);
1526
1527 for (unit_end += upa; unit < unit_end; unit++)
1528 if (gi->cpu_map[unit] != NR_CPUS)
1529 printk("%0*d ", cpu_width,
1530 gi->cpu_map[unit]);
1531 else
1532 printk("%s ", empty_str);
1533 }
1534 }
1535 printk("\n");
1536}
1537
1538/**
1539 * pcpu_setup_first_chunk - initialize the first percpu chunk
 1540 * @ai: pcpu_alloc_info describing how the percpu area is shaped
1541 * @base_addr: mapped address
999 * 1542 *
1000 * Initialize the first percpu chunk which contains the kernel static 1543 * Initialize the first percpu chunk which contains the kernel static
 1001 * percpu area. This function is to be called from arch percpu area 1544 * percpu area. This function is to be called from arch percpu area
1002 * setup path. The first two parameters are mandatory. The rest are 1545 * setup path.
1003 * optional. 1546 *
1004 * 1547 * @ai contains all information necessary to initialize the first
1005 * @get_page_fn() should return pointer to percpu page given cpu 1548 * chunk and prime the dynamic percpu allocator.
1006 * number and page number. It should at least return enough pages to 1549 *
1007 * cover the static area. The returned pages for static area should 1550 * @ai->static_size is the size of static percpu area.
1008 * have been initialized with valid data. If @unit_size is specified, 1551 *
1009 * it can also return pages after the static area. NULL return 1552 * @ai->reserved_size, if non-zero, specifies the amount of bytes to
1010 * indicates end of pages for the cpu. Note that @get_page_fn() must
1011 * return the same number of pages for all cpus.
1012 *
1013 * @reserved_size, if non-zero, specifies the amount of bytes to
1014 * reserve after the static area in the first chunk. This reserves 1553 * reserve after the static area in the first chunk. This reserves
1015 * the first chunk such that it's available only through reserved 1554 * the first chunk such that it's available only through reserved
1016 * percpu allocation. This is primarily used to serve module percpu 1555 * percpu allocation. This is primarily used to serve module percpu
@@ -1018,22 +1557,29 @@ EXPORT_SYMBOL_GPL(free_percpu);
1018 * limited offset range for symbol relocations to guarantee module 1557 * limited offset range for symbol relocations to guarantee module
1019 * percpu symbols fall inside the relocatable range. 1558 * percpu symbols fall inside the relocatable range.
1020 * 1559 *
1021 * @dyn_size, if non-negative, determines the number of bytes 1560 * @ai->dyn_size determines the number of bytes available for dynamic
1022 * available for dynamic allocation in the first chunk. Specifying 1561 * allocation in the first chunk. The area between @ai->static_size +
1023 * non-negative value makes percpu leave alone the area beyond 1562 * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
1024 * @static_size + @reserved_size + @dyn_size. 1563 *
1564 * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
1565 * and equal to or larger than @ai->static_size + @ai->reserved_size +
1566 * @ai->dyn_size.
1567 *
1568 * @ai->atom_size is the allocation atom size and used as alignment
1569 * for vm areas.
1025 * 1570 *
1026 * @unit_size, if non-negative, specifies unit size and must be 1571 * @ai->alloc_size is the allocation size and always multiple of
1027 * aligned to PAGE_SIZE and equal to or larger than @static_size + 1572 * @ai->atom_size. This is larger than @ai->atom_size if
1028 * @reserved_size + if non-negative, @dyn_size. 1573 * @ai->unit_size is larger than @ai->atom_size.
1029 * 1574 *
1030 * Non-null @base_addr means that the caller already allocated virtual 1575 * @ai->nr_groups and @ai->groups describe virtual memory layout of
1031 * region for the first chunk and mapped it. percpu must not mess 1576 * percpu areas. Units which should be colocated are put into the
1032 * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL 1577 * same group. Dynamic VM areas will be allocated according to these
1033 * @populate_pte_fn doesn't make any sense. 1578 * groupings. If @ai->nr_groups is zero, a single group containing
1579 * all units is assumed.
1034 * 1580 *
1035 * @populate_pte_fn is used to populate the pagetable. NULL means the 1581 * The caller should have mapped the first chunk at @base_addr and
1036 * caller already populated the pagetable. 1582 * copied static data to each unit.
1037 * 1583 *
1038 * If the first chunk ends up with both reserved and dynamic areas, it 1584 * If the first chunk ends up with both reserved and dynamic areas, it
1039 * is served by two chunks - one to serve the core static and reserved 1585 * is served by two chunks - one to serve the core static and reserved
@@ -1043,49 +1589,98 @@ EXPORT_SYMBOL_GPL(free_percpu);
1043 * and available for dynamic allocation like any other chunks. 1589 * and available for dynamic allocation like any other chunks.
1044 * 1590 *
1045 * RETURNS: 1591 * RETURNS:
1046 * The determined pcpu_unit_size which can be used to initialize 1592 * 0 on success, -errno on failure.
1047 * percpu access.
1048 */ 1593 */
1049size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, 1594int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1050 size_t static_size, size_t reserved_size, 1595 void *base_addr)
1051 ssize_t dyn_size, ssize_t unit_size,
1052 void *base_addr,
1053 pcpu_populate_pte_fn_t populate_pte_fn)
1054{ 1596{
1055 static struct vm_struct first_vm; 1597 static char cpus_buf[4096] __initdata;
1056 static int smap[2], dmap[2]; 1598 static int smap[2], dmap[2];
1057 size_t size_sum = static_size + reserved_size + 1599 size_t dyn_size = ai->dyn_size;
1058 (dyn_size >= 0 ? dyn_size : 0); 1600 size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
1059 struct pcpu_chunk *schunk, *dchunk = NULL; 1601 struct pcpu_chunk *schunk, *dchunk = NULL;
1602 unsigned long *group_offsets;
1603 size_t *group_sizes;
1604 unsigned long *unit_off;
1060 unsigned int cpu; 1605 unsigned int cpu;
1061 int nr_pages; 1606 int *unit_map;
1062 int err, i; 1607 int group, unit, i;
1608
1609 cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);
1610
1611#define PCPU_SETUP_BUG_ON(cond) do { \
1612 if (unlikely(cond)) { \
1613 pr_emerg("PERCPU: failed to initialize, %s", #cond); \
1614 pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \
1615 pcpu_dump_alloc_info(KERN_EMERG, ai); \
1616 BUG(); \
1617 } \
1618} while (0)
1063 1619
1064 /* santiy checks */ 1620 /* sanity checks */
1065 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || 1621 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
1066 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); 1622 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
1067 BUG_ON(!static_size); 1623 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
1068 if (unit_size >= 0) { 1624 PCPU_SETUP_BUG_ON(!ai->static_size);
1069 BUG_ON(unit_size < size_sum); 1625 PCPU_SETUP_BUG_ON(!base_addr);
1070 BUG_ON(unit_size & ~PAGE_MASK); 1626 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1071 BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); 1627 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
1072 } else 1628 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
1073 BUG_ON(base_addr); 1629
1074 BUG_ON(base_addr && populate_pte_fn); 1630 /* process group information and build config tables accordingly */
1075 1631 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
1076 if (unit_size >= 0) 1632 group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));
1077 pcpu_unit_pages = unit_size >> PAGE_SHIFT; 1633 unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
1078 else 1634 unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
1079 pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, 1635
1080 PFN_UP(size_sum)); 1636 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1637 unit_map[cpu] = UINT_MAX;
1638 pcpu_first_unit_cpu = NR_CPUS;
1639
1640 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
1641 const struct pcpu_group_info *gi = &ai->groups[group];
1642
1643 group_offsets[group] = gi->base_offset;
1644 group_sizes[group] = gi->nr_units * ai->unit_size;
1645
1646 for (i = 0; i < gi->nr_units; i++) {
1647 cpu = gi->cpu_map[i];
1648 if (cpu == NR_CPUS)
1649 continue;
1081 1650
1082 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; 1651 PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids);
1083 pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size; 1652 PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
1084 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) 1653 PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
1085 + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *);
1086 1654
1087 if (dyn_size < 0) 1655 unit_map[cpu] = unit + i;
1088 dyn_size = pcpu_unit_size - static_size - reserved_size; 1656 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
1657
1658 if (pcpu_first_unit_cpu == NR_CPUS)
1659 pcpu_first_unit_cpu = cpu;
1660 }
1661 }
1662 pcpu_last_unit_cpu = cpu;
1663 pcpu_nr_units = unit;
1664
1665 for_each_possible_cpu(cpu)
1666 PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
1667
1668 /* we're done parsing the input, undefine BUG macro and dump config */
1669#undef PCPU_SETUP_BUG_ON
1670 pcpu_dump_alloc_info(KERN_INFO, ai);
1671
1672 pcpu_nr_groups = ai->nr_groups;
1673 pcpu_group_offsets = group_offsets;
1674 pcpu_group_sizes = group_sizes;
1675 pcpu_unit_map = unit_map;
1676 pcpu_unit_offsets = unit_off;
1677
1678 /* determine basic parameters */
1679 pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
1680 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1681 pcpu_atom_size = ai->atom_size;
1682 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
1683 BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
1089 1684
1090 /* 1685 /*
1091 * Allocate chunk slots. The additional last slot is for 1686 * Allocate chunk slots. The additional last slot is for
@@ -1105,189 +1700,368 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1105 */ 1700 */
1106 schunk = alloc_bootmem(pcpu_chunk_struct_size); 1701 schunk = alloc_bootmem(pcpu_chunk_struct_size);
1107 INIT_LIST_HEAD(&schunk->list); 1702 INIT_LIST_HEAD(&schunk->list);
1108 schunk->vm = &first_vm; 1703 schunk->base_addr = base_addr;
1109 schunk->map = smap; 1704 schunk->map = smap;
1110 schunk->map_alloc = ARRAY_SIZE(smap); 1705 schunk->map_alloc = ARRAY_SIZE(smap);
1111 schunk->page = schunk->page_ar; 1706 schunk->immutable = true;
1707 bitmap_fill(schunk->populated, pcpu_unit_pages);
1112 1708
1113 if (reserved_size) { 1709 if (ai->reserved_size) {
1114 schunk->free_size = reserved_size; 1710 schunk->free_size = ai->reserved_size;
1115 pcpu_reserved_chunk = schunk; 1711 pcpu_reserved_chunk = schunk;
1116 pcpu_reserved_chunk_limit = static_size + reserved_size; 1712 pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
1117 } else { 1713 } else {
1118 schunk->free_size = dyn_size; 1714 schunk->free_size = dyn_size;
1119 dyn_size = 0; /* dynamic area covered */ 1715 dyn_size = 0; /* dynamic area covered */
1120 } 1716 }
1121 schunk->contig_hint = schunk->free_size; 1717 schunk->contig_hint = schunk->free_size;
1122 1718
1123 schunk->map[schunk->map_used++] = -static_size; 1719 schunk->map[schunk->map_used++] = -ai->static_size;
1124 if (schunk->free_size) 1720 if (schunk->free_size)
1125 schunk->map[schunk->map_used++] = schunk->free_size; 1721 schunk->map[schunk->map_used++] = schunk->free_size;
1126 1722
1127 /* init dynamic chunk if necessary */ 1723 /* init dynamic chunk if necessary */
1128 if (dyn_size) { 1724 if (dyn_size) {
1129 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); 1725 dchunk = alloc_bootmem(pcpu_chunk_struct_size);
1130 INIT_LIST_HEAD(&dchunk->list); 1726 INIT_LIST_HEAD(&dchunk->list);
1131 dchunk->vm = &first_vm; 1727 dchunk->base_addr = base_addr;
1132 dchunk->map = dmap; 1728 dchunk->map = dmap;
1133 dchunk->map_alloc = ARRAY_SIZE(dmap); 1729 dchunk->map_alloc = ARRAY_SIZE(dmap);
1134 dchunk->page = schunk->page_ar; /* share page map with schunk */ 1730 dchunk->immutable = true;
1731 bitmap_fill(dchunk->populated, pcpu_unit_pages);
1135 1732
1136 dchunk->contig_hint = dchunk->free_size = dyn_size; 1733 dchunk->contig_hint = dchunk->free_size = dyn_size;
1137 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; 1734 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
1138 dchunk->map[dchunk->map_used++] = dchunk->free_size; 1735 dchunk->map[dchunk->map_used++] = dchunk->free_size;
1139 } 1736 }
1140 1737
1141 /* allocate vm address */
1142 first_vm.flags = VM_ALLOC;
1143 first_vm.size = pcpu_chunk_size;
1144
1145 if (!base_addr)
1146 vm_area_register_early(&first_vm, PAGE_SIZE);
1147 else {
1148 /*
1149 * Pages already mapped. No need to remap into
1150 * vmalloc area. In this case the first chunks can't
1151 * be mapped or unmapped by percpu and are marked
1152 * immutable.
1153 */
1154 first_vm.addr = base_addr;
1155 schunk->immutable = true;
1156 if (dchunk)
1157 dchunk->immutable = true;
1158 }
1159
1160 /* assign pages */
1161 nr_pages = -1;
1162 for_each_possible_cpu(cpu) {
1163 for (i = 0; i < pcpu_unit_pages; i++) {
1164 struct page *page = get_page_fn(cpu, i);
1165
1166 if (!page)
1167 break;
1168 *pcpu_chunk_pagep(schunk, cpu, i) = page;
1169 }
1170
1171 BUG_ON(i < PFN_UP(static_size));
1172
1173 if (nr_pages < 0)
1174 nr_pages = i;
1175 else
1176 BUG_ON(nr_pages != i);
1177 }
1178
1179 /* map them */
1180 if (populate_pte_fn) {
1181 for_each_possible_cpu(cpu)
1182 for (i = 0; i < nr_pages; i++)
1183 populate_pte_fn(pcpu_chunk_addr(schunk,
1184 cpu, i));
1185
1186 err = pcpu_map(schunk, 0, nr_pages);
1187 if (err)
1188 panic("failed to setup static percpu area, err=%d\n",
1189 err);
1190 }
1191
1192 /* link the first chunk in */ 1738 /* link the first chunk in */
1193 pcpu_first_chunk = dchunk ?: schunk; 1739 pcpu_first_chunk = dchunk ?: schunk;
1194 pcpu_chunk_relocate(pcpu_first_chunk, -1); 1740 pcpu_chunk_relocate(pcpu_first_chunk, -1);
1195 1741
1196 /* we're done */ 1742 /* we're done */
1197 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); 1743 pcpu_base_addr = base_addr;
1198 return pcpu_unit_size; 1744 return 0;
1199} 1745}
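
pcpu_setup_first_chunk() flattens the group description into two lookup tables, unit_map[cpu] and unit_off[cpu], by walking each group's cpu_map and skipping the NR_CPUS holes left by rounding to upa. A userspace sketch of that walk with an invented two-group layout:

#include <stdio.h>

#define NR_CPUS 8
#define UNIT_SIZE (128 * 1024UL)

struct group {
        unsigned long base_offset;
        int nr_units;
        unsigned int cpu_map[4];        /* NR_CPUS marks an unused unit */
};

int main(void)
{
        struct group groups[2] = {
                { .base_offset = 0,             .nr_units = 4,
                  .cpu_map = { 0, 1, 2, 3 } },
                { .base_offset = 4 * UNIT_SIZE, .nr_units = 4,
                  .cpu_map = { 4, 5, NR_CPUS, NR_CPUS } },
        };
        unsigned int unit_map[NR_CPUS];
        unsigned long unit_off[NR_CPUS];
        int g, i, unit = 0;
        unsigned int cpu;

        for (g = 0; g < 2; g++) {
                for (i = 0; i < groups[g].nr_units; i++) {
                        cpu = groups[g].cpu_map[i];
                        if (cpu == NR_CPUS)
                                continue;       /* hole left by rounding to upa */
                        unit_map[cpu] = unit + i;
                        unit_off[cpu] = groups[g].base_offset + i * UNIT_SIZE;
                }
                unit += groups[g].nr_units;
        }

        for (cpu = 0; cpu < 6; cpu++)
                printf("cpu%u -> unit %u, offset 0x%lx\n",
                       cpu, unit_map[cpu], unit_off[cpu]);
        return 0;
}
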
1200 1746
1201/* 1747const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
1202 * Embedding first chunk setup helper. 1748 [PCPU_FC_AUTO] = "auto",
1203 */ 1749 [PCPU_FC_EMBED] = "embed",
1204static void *pcpue_ptr __initdata; 1750 [PCPU_FC_PAGE] = "page",
1205static size_t pcpue_size __initdata; 1751};
1206static size_t pcpue_unit_size __initdata;
1207 1752
1208static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) 1753enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
1209{
1210 size_t off = (size_t)pageno << PAGE_SHIFT;
1211 1754
1212 if (off >= pcpue_size) 1755static int __init percpu_alloc_setup(char *str)
1213 return NULL; 1756{
1757 if (0)
1758 /* nada */;
1759#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
1760 else if (!strcmp(str, "embed"))
1761 pcpu_chosen_fc = PCPU_FC_EMBED;
1762#endif
1763#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1764 else if (!strcmp(str, "page"))
1765 pcpu_chosen_fc = PCPU_FC_PAGE;
1766#endif
1767 else
1768 pr_warning("PERCPU: unknown allocator %s specified\n", str);
1214 1769
1215 return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); 1770 return 0;
1216} 1771}
1772early_param("percpu_alloc", percpu_alloc_setup);
1217 1773
1774#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
1775 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
1218/** 1776/**
1219 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem 1777 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
1220 * @static_size: the size of static percpu area in bytes
1221 * @reserved_size: the size of reserved percpu area in bytes 1778 * @reserved_size: the size of reserved percpu area in bytes
1222 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1779 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
1223 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto 1780 * @atom_size: allocation atom size
1781 * @cpu_distance_fn: callback to determine distance between cpus, optional
1782 * @alloc_fn: function to allocate percpu page
 1783 * @free_fn: function to free percpu page
1224 * 1784 *
1225 * This is a helper to ease setting up embedded first percpu chunk and 1785 * This is a helper to ease setting up embedded first percpu chunk and
1226 * can be called where pcpu_setup_first_chunk() is expected. 1786 * can be called where pcpu_setup_first_chunk() is expected.
1227 * 1787 *
1228 * If this function is used to setup the first chunk, it is allocated 1788 * If this function is used to setup the first chunk, it is allocated
1229 * as a contiguous area using bootmem allocator and used as-is without 1789 * by calling @alloc_fn and used as-is without being mapped into
1230 * being mapped into vmalloc area. This enables the first chunk to 1790 * vmalloc area. Allocations are always whole multiples of @atom_size
1231 * piggy back on the linear physical mapping which often uses larger 1791 * aligned to @atom_size.
1232 * page size. 1792 *
1793 * This enables the first chunk to piggy back on the linear physical
1794 * mapping which often uses larger page size. Please note that this
1795 * can result in very sparse cpu->unit mapping on NUMA machines thus
1796 * requiring large vmalloc address space. Don't use this allocator if
1797 * vmalloc space is not orders of magnitude larger than distances
1798 * between node memory addresses (ie. 32bit NUMA machines).
1233 * 1799 *
1234 * When @dyn_size is positive, dynamic area might be larger than 1800 * When @dyn_size is positive, dynamic area might be larger than
1235 * specified to fill page alignment. Also, when @dyn_size is auto, 1801 * specified to fill page alignment. When @dyn_size is auto,
1236 * @dyn_size does not fill the whole first chunk but only what's 1802 * @dyn_size is just big enough to fill page alignment after static
1237 * necessary for page alignment after static and reserved areas. 1803 * and reserved areas.
1238 * 1804 *
1239 * If the needed size is smaller than the minimum or specified unit 1805 * If the needed size is smaller than the minimum or specified unit
1240 * size, the leftover is returned to the bootmem allocator. 1806 * size, the leftover is returned using @free_fn.
1241 * 1807 *
1242 * RETURNS: 1808 * RETURNS:
1243 * The determined pcpu_unit_size which can be used to initialize 1809 * 0 on success, -errno on failure.
1244 * percpu access on success, -errno on failure.
1245 */ 1810 */
1246ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, 1811int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
1247 ssize_t dyn_size, ssize_t unit_size) 1812 size_t atom_size,
1813 pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
1814 pcpu_fc_alloc_fn_t alloc_fn,
1815 pcpu_fc_free_fn_t free_fn)
1248{ 1816{
1249 size_t chunk_size; 1817 void *base = (void *)ULONG_MAX;
1250 unsigned int cpu; 1818 void **areas = NULL;
1819 struct pcpu_alloc_info *ai;
1820 size_t size_sum, areas_size, max_distance;
1821 int group, i, rc;
1822
1823 ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
1824 cpu_distance_fn);
1825 if (IS_ERR(ai))
1826 return PTR_ERR(ai);
1827
1828 size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
1829 areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
1830
1831 areas = alloc_bootmem_nopanic(areas_size);
1832 if (!areas) {
1833 rc = -ENOMEM;
1834 goto out_free;
1835 }
1251 1836
1252 /* determine parameters and allocate */ 1837 /* allocate, copy and determine base address */
1253 pcpue_size = PFN_ALIGN(static_size + reserved_size + 1838 for (group = 0; group < ai->nr_groups; group++) {
1254 (dyn_size >= 0 ? dyn_size : 0)); 1839 struct pcpu_group_info *gi = &ai->groups[group];
1255 if (dyn_size != 0) 1840 unsigned int cpu = NR_CPUS;
1256 dyn_size = pcpue_size - static_size - reserved_size; 1841 void *ptr;
1257 1842
1258 if (unit_size >= 0) { 1843 for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
1259 BUG_ON(unit_size < pcpue_size); 1844 cpu = gi->cpu_map[i];
1260 pcpue_unit_size = unit_size; 1845 BUG_ON(cpu == NR_CPUS);
1261 } else 1846
1262 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); 1847 /* allocate space for the whole group */
1263 1848 ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
1264 chunk_size = pcpue_unit_size * nr_cpu_ids; 1849 if (!ptr) {
1265 1850 rc = -ENOMEM;
1266 pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, 1851 goto out_free_areas;
1267 __pa(MAX_DMA_ADDRESS)); 1852 }
1268 if (!pcpue_ptr) { 1853 areas[group] = ptr;
1269 pr_warning("PERCPU: failed to allocate %zu bytes for " 1854
1270 "embedding\n", chunk_size); 1855 base = min(ptr, base);
1271 return -ENOMEM; 1856
1857 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
1858 if (gi->cpu_map[i] == NR_CPUS) {
1859 /* unused unit, free whole */
1860 free_fn(ptr, ai->unit_size);
1861 continue;
1862 }
1863 /* copy and return the unused part */
1864 memcpy(ptr, __per_cpu_load, ai->static_size);
1865 free_fn(ptr + size_sum, ai->unit_size - size_sum);
1866 }
1272 } 1867 }
1273 1868
1274 /* return the leftover and copy */ 1869 /* base address is now known, determine group base offsets */
1275 for (cpu = 0; cpu < nr_cpu_ids; cpu++) { 1870 max_distance = 0;
1276 void *ptr = pcpue_ptr + cpu * pcpue_unit_size; 1871 for (group = 0; group < ai->nr_groups; group++) {
1872 ai->groups[group].base_offset = areas[group] - base;
1873 max_distance = max_t(size_t, max_distance,
1874 ai->groups[group].base_offset);
1875 }
1876 max_distance += ai->unit_size;
1877
1878 /* warn if maximum distance is further than 75% of vmalloc space */
1879 if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
1880 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
1881 "space 0x%lx\n",
1882 max_distance, VMALLOC_END - VMALLOC_START);
1883#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1884 /* and fail if we have fallback */
1885 rc = -EINVAL;
1886 goto out_free;
1887#endif
1888 }
1889
1890 pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
1891 PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
1892 ai->dyn_size, ai->unit_size);
1893
1894 rc = pcpu_setup_first_chunk(ai, base);
1895 goto out_free;
1896
1897out_free_areas:
1898 for (group = 0; group < ai->nr_groups; group++)
1899 free_fn(areas[group],
1900 ai->groups[group].nr_units * ai->unit_size);
1901out_free:
1902 pcpu_free_alloc_info(ai);
1903 if (areas)
1904 free_bootmem(__pa(areas), areas_size);
1905 return rc;
1906}
1907#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
1908 !CONFIG_HAVE_SETUP_PER_CPU_AREA */
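
In the embedding path each group gets its own allocation, the lowest returned pointer becomes the common base, and the span (largest base_offset plus one unit) must stay under roughly 75% of vmalloc space. The sketch below runs that check on made-up addresses:

#include <stdio.h>

int main(void)
{
        /* pretend bootmem handed back these group allocations (illustrative) */
        unsigned long areas[3] = { 0x43000000UL, 0x41000000UL, 0x46000000UL };
        unsigned long unit_size = 0x20000UL;            /* 128K */
        unsigned long vmalloc_span = 0x8000000UL;       /* pretend VMALLOC_END - VMALLOC_START */
        unsigned long base = (unsigned long)-1;
        unsigned long max_distance = 0, off;
        int g;

        for (g = 0; g < 3; g++)
                if (areas[g] < base)
                        base = areas[g];

        for (g = 0; g < 3; g++) {
                off = areas[g] - base;                  /* group base_offset */
                if (off > max_distance)
                        max_distance = off;
        }
        max_distance += unit_size;

        printf("base=%#lx max_distance=%#lx limit=%#lx -> %s\n",
               base, max_distance, vmalloc_span * 3 / 4,
               max_distance > vmalloc_span * 3 / 4 ? "too sparse" : "ok");
        return 0;
}
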
1909
1910#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1911/**
1912 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
1913 * @reserved_size: the size of reserved percpu area in bytes
1914 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 1915 * @free_fn: function to free percpu page, always called with PAGE_SIZE
1916 * @populate_pte_fn: function to populate pte
1917 *
1918 * This is a helper to ease setting up page-remapped first percpu
1919 * chunk and can be called where pcpu_setup_first_chunk() is expected.
1920 *
1921 * This is the basic allocator. Static percpu area is allocated
1922 * page-by-page into vmalloc area.
1923 *
1924 * RETURNS:
1925 * 0 on success, -errno on failure.
1926 */
1927int __init pcpu_page_first_chunk(size_t reserved_size,
1928 pcpu_fc_alloc_fn_t alloc_fn,
1929 pcpu_fc_free_fn_t free_fn,
1930 pcpu_fc_populate_pte_fn_t populate_pte_fn)
1931{
1932 static struct vm_struct vm;
1933 struct pcpu_alloc_info *ai;
1934 char psize_str[16];
1935 int unit_pages;
1936 size_t pages_size;
1937 struct page **pages;
1938 int unit, i, j, rc;
1939
1940 snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
1941
1942 ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL);
1943 if (IS_ERR(ai))
1944 return PTR_ERR(ai);
1945 BUG_ON(ai->nr_groups != 1);
1946 BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
1947
1948 unit_pages = ai->unit_size >> PAGE_SHIFT;
1949
1950 /* unaligned allocations can't be freed, round up to page size */
1951 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
1952 sizeof(pages[0]));
1953 pages = alloc_bootmem(pages_size);
1954
1955 /* allocate pages */
1956 j = 0;
1957 for (unit = 0; unit < num_possible_cpus(); unit++)
1958 for (i = 0; i < unit_pages; i++) {
1959 unsigned int cpu = ai->groups[0].cpu_map[unit];
1960 void *ptr;
1961
1962 ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
1963 if (!ptr) {
1964 pr_warning("PERCPU: failed to allocate %s page "
1965 "for cpu%u\n", psize_str, cpu);
1966 goto enomem;
1967 }
1968 pages[j++] = virt_to_page(ptr);
1969 }
1970
1971 /* allocate vm area, map the pages and copy static data */
1972 vm.flags = VM_ALLOC;
1973 vm.size = num_possible_cpus() * ai->unit_size;
1974 vm_area_register_early(&vm, PAGE_SIZE);
1975
1976 for (unit = 0; unit < num_possible_cpus(); unit++) {
1977 unsigned long unit_addr =
1978 (unsigned long)vm.addr + unit * ai->unit_size;
1979
1980 for (i = 0; i < unit_pages; i++)
1981 populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
1277 1982
1278 if (cpu_possible(cpu)) { 1983 /* pte already populated, the following shouldn't fail */
1279 free_bootmem(__pa(ptr + pcpue_size), 1984 rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
1280 pcpue_unit_size - pcpue_size); 1985 unit_pages);
1281 memcpy(ptr, __per_cpu_load, static_size); 1986 if (rc < 0)
1282 } else 1987 panic("failed to map percpu area, err=%d\n", rc);
1283 free_bootmem(__pa(ptr), pcpue_unit_size); 1988
1989 /*
1990 * FIXME: Archs with virtual cache should flush local
1991 * cache for the linear mapping here - something
1992 * equivalent to flush_cache_vmap() on the local cpu.
1993 * flush_cache_vmap() can't be used as most supporting
1994 * data structures are not set up yet.
1995 */
1996
1997 /* copy static data */
1998 memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
1284 } 1999 }
1285 2000
1286 /* we're ready, commit */ 2001 /* we're ready, commit */
1287 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", 2002 pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
1288 pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); 2003 unit_pages, psize_str, vm.addr, ai->static_size,
2004 ai->reserved_size, ai->dyn_size);
2005
2006 rc = pcpu_setup_first_chunk(ai, vm.addr);
2007 goto out_free_ar;
2008
2009enomem:
2010 while (--j >= 0)
2011 free_fn(page_address(pages[j]), PAGE_SIZE);
2012 rc = -ENOMEM;
2013out_free_ar:
2014 free_bootmem(__pa(pages), pages_size);
2015 pcpu_free_alloc_info(ai);
2016 return rc;
2017}
2018#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
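
The page-backed path fills one flat pages[] array with unit_pages pages per possible cpu and maps each unit at vm.addr + unit * unit_size, so pages[j] belongs to unit j / unit_pages at page index j % unit_pages. A small index-arithmetic check with invented numbers:

#include <stdio.h>

int main(void)
{
        int unit_pages = 8, nr_units = 4;       /* illustrative */
        unsigned long unit_size = unit_pages * 4096UL;
        unsigned long vm_addr = 0xf8000000UL;   /* invented vmalloc base */
        int j;

        for (j = 0; j < unit_pages * nr_units; j += 13) {
                int unit = j / unit_pages, idx = j % unit_pages;
                printf("pages[%2d] -> unit %d page %d, mapped at %#lx\n",
                       j, unit, idx,
                       vm_addr + unit * unit_size + idx * 4096UL);
        }
        return 0;
}
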
2019
2020/*
2021 * Generic percpu area setup.
2022 *
2023 * The embedding helper is used because its behavior closely resembles
2024 * the original non-dynamic generic percpu area setup. This is
2025 * important because many archs have addressing restrictions and might
2026 * fail if the percpu area is located far away from the previous
2027 * location. As an added bonus, in non-NUMA cases, embedding is
2028 * generally a good idea TLB-wise because percpu area can piggy back
2029 * on the physical linear memory mapping which uses large page
2030 * mappings on applicable archs.
2031 */
2032#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
2033unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
2034EXPORT_SYMBOL(__per_cpu_offset);
2035
2036static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
2037 size_t align)
2038{
2039 return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
2040}
1289 2041
1290 return pcpu_setup_first_chunk(pcpue_get_page, static_size, 2042static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
1291 reserved_size, dyn_size, 2043{
1292 pcpue_unit_size, pcpue_ptr, NULL); 2044 free_bootmem(__pa(ptr), size);
2045}
2046
2047void __init setup_per_cpu_areas(void)
2048{
2049 unsigned long delta;
2050 unsigned int cpu;
2051 int rc;
2052
2053 /*
2054 * Always reserve area for module percpu variables. That's
2055 * what the legacy allocator did.
2056 */
2057 rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
2058 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
2059 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
2060 if (rc < 0)
 2061 panic("Failed to initialize percpu areas.");
2062
2063 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
2064 for_each_possible_cpu(cpu)
2065 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
1293} 2066}
2067#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
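
The generic setup_per_cpu_areas() above converts the allocator's unit offsets into the familiar __per_cpu_offset[] table: a per-cpu variable's address for a given cpu is its link-time address plus that cpu's offset. The userspace sketch below walks the same delta arithmetic with invented addresses:

#include <stdio.h>

int main(void)
{
        /* all addresses below are invented for illustration */
        unsigned long per_cpu_start = 0xc1500000UL;     /* stands in for __per_cpu_start */
        unsigned long pcpu_base_addr = 0xf7a00000UL;    /* first chunk base address */
        unsigned long unit_offsets[4] = { 0x0, 0x20000, 0x40000, 0x60000 };
        unsigned long var_link_addr = 0xc1500040UL;     /* some DEFINE_PER_CPU variable */
        unsigned long delta = pcpu_base_addr - per_cpu_start;
        int cpu;

        for (cpu = 0; cpu < 4; cpu++) {
                unsigned long offset = delta + unit_offsets[cpu]; /* __per_cpu_offset[cpu] */
                printf("cpu%d: variable lives at %#lx\n",
                       cpu, var_link_addr + offset);
        }
        return 0;
}
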
diff --git a/mm/quicklist.c b/mm/quicklist.c
index e66d07d1b4ff..6633965bb27b 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -19,7 +19,7 @@
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/quicklist.h> 20#include <linux/quicklist.h>
21 21
22DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; 22DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
23 23
24#define FRACTION_OF_NODE_MEM 16 24#define FRACTION_OF_NODE_MEM 16
25 25
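
The quicklist hunk above reflects the updated convention for per-cpu arrays: the array bound moves into the type argument of DEFINE_PER_CPU so the macro can decorate the whole array type. A short kernel-style sketch (hypothetical element type and accessor, shown only to illustrate the declaration and indexing style):

#include <linux/percpu.h>

struct counter {                /* hypothetical per-cpu array element */
        unsigned long count;
};

/* the array bound now belongs to the type argument, not to the name */
DEFINE_PER_CPU(struct counter [4], my_counters);

static void bump(int idx)
{
        /* index the cpu-local copy of the whole array */
        get_cpu_var(my_counters)[idx].count++;
        put_cpu_var(my_counters);
}
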
@@ -29,7 +29,6 @@ static unsigned long max_pages(unsigned long min_pages)
29 int node = numa_node_id(); 29 int node = numa_node_id();
30 struct zone *zones = NODE_DATA(node)->node_zones; 30 struct zone *zones = NODE_DATA(node)->node_zones;
31 int num_cpus_on_node; 31 int num_cpus_on_node;
32 const struct cpumask *cpumask_on_node = cpumask_of_node(node);
33 32
34 node_free_pages = 33 node_free_pages =
35#ifdef CONFIG_ZONE_DMA 34#ifdef CONFIG_ZONE_DMA
@@ -42,7 +41,7 @@ static unsigned long max_pages(unsigned long min_pages)
42 41
43 max = node_free_pages / FRACTION_OF_NODE_MEM; 42 max = node_free_pages / FRACTION_OF_NODE_MEM;
44 43
45 num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); 44 num_cpus_on_node = cpumask_weight(cpumask_of_node(node));
46 max /= num_cpus_on_node; 45 max /= num_cpus_on_node;
47 46
48 return max(max, min_pages); 47 return max(max, min_pages);
diff --git a/mm/rmap.c b/mm/rmap.c
index 0895b5c7cbff..dd43373a483f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,6 +36,11 @@
36 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_lock in __sync_single_inode) 38 * within inode_lock in __sync_single_inode)
39 *
40 * (code doesn't rely on that order so it could be switched around)
41 * ->tasklist_lock
42 * anon_vma->lock (memory_failure, collect_procs_anon)
43 * pte map lock
39 */ 44 */
40 45
41#include <linux/mm.h> 46#include <linux/mm.h>
@@ -191,7 +196,7 @@ void __init anon_vma_init(void)
191 * Getting a lock on a stable anon_vma from a page off the LRU is 196 * Getting a lock on a stable anon_vma from a page off the LRU is
 192 * tricky: page_lock_anon_vma relies on RCU to guard against the races. 197 * tricky: page_lock_anon_vma relies on RCU to guard against the races.
193 */ 198 */
194static struct anon_vma *page_lock_anon_vma(struct page *page) 199struct anon_vma *page_lock_anon_vma(struct page *page)
195{ 200{
196 struct anon_vma *anon_vma; 201 struct anon_vma *anon_vma;
197 unsigned long anon_mapping; 202 unsigned long anon_mapping;
@@ -211,7 +216,7 @@ out:
211 return NULL; 216 return NULL;
212} 217}
213 218
214static void page_unlock_anon_vma(struct anon_vma *anon_vma) 219void page_unlock_anon_vma(struct anon_vma *anon_vma)
215{ 220{
216 spin_unlock(&anon_vma->lock); 221 spin_unlock(&anon_vma->lock);
217 rcu_read_unlock(); 222 rcu_read_unlock();
@@ -237,8 +242,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
237} 242}
238 243
239/* 244/*
240 * At what user virtual address is page expected in vma? checking that the 245 * At what user virtual address is page expected in vma?
241 * page matches the vma: currently only used on anon pages, by unuse_vma; 246 * checking that the page matches the vma.
242 */ 247 */
243unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 248unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
244{ 249{
@@ -311,7 +316,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
311 * if the page is not mapped into the page tables of this VMA. Only 316 * if the page is not mapped into the page tables of this VMA. Only
312 * valid for normal file or anonymous VMAs. 317 * valid for normal file or anonymous VMAs.
313 */ 318 */
314static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) 319int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
315{ 320{
316 unsigned long address; 321 unsigned long address;
317 pte_t *pte; 322 pte_t *pte;
@@ -710,27 +715,6 @@ void page_add_file_rmap(struct page *page)
710 } 715 }
711} 716}
712 717
713#ifdef CONFIG_DEBUG_VM
714/**
715 * page_dup_rmap - duplicate pte mapping to a page
716 * @page: the page to add the mapping to
717 * @vma: the vm area being duplicated
718 * @address: the user virtual address mapped
719 *
720 * For copy_page_range only: minimal extract from page_add_file_rmap /
721 * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
722 * quicker.
723 *
724 * The caller needs to hold the pte lock.
725 */
726void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
727{
728 if (PageAnon(page))
729 __page_check_anon_rmap(page, vma, address);
730 atomic_inc(&page->_mapcount);
731}
732#endif
733
734/** 718/**
735 * page_remove_rmap - take down pte mapping from a page 719 * page_remove_rmap - take down pte mapping from a page
736 * @page: page to remove mapping from 720 * @page: page to remove mapping from
@@ -739,34 +723,37 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long
739 */ 723 */
740void page_remove_rmap(struct page *page) 724void page_remove_rmap(struct page *page)
741{ 725{
742 if (atomic_add_negative(-1, &page->_mapcount)) { 726 /* page still mapped by someone else? */
743 /* 727 if (!atomic_add_negative(-1, &page->_mapcount))
744 * Now that the last pte has gone, s390 must transfer dirty 728 return;
745 * flag from storage key to struct page. We can usually skip 729
746 * this if the page is anon, so about to be freed; but perhaps 730 /*
747 * not if it's in swapcache - there might be another pte slot 731 * Now that the last pte has gone, s390 must transfer dirty
748 * containing the swap entry, but page not yet written to swap. 732 * flag from storage key to struct page. We can usually skip
749 */ 733 * this if the page is anon, so about to be freed; but perhaps
750 if ((!PageAnon(page) || PageSwapCache(page)) && 734 * not if it's in swapcache - there might be another pte slot
751 page_test_dirty(page)) { 735 * containing the swap entry, but page not yet written to swap.
752 page_clear_dirty(page); 736 */
753 set_page_dirty(page); 737 if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
754 } 738 page_clear_dirty(page);
755 if (PageAnon(page)) 739 set_page_dirty(page);
756 mem_cgroup_uncharge_page(page); 740 }
757 __dec_zone_page_state(page, 741 if (PageAnon(page)) {
758 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); 742 mem_cgroup_uncharge_page(page);
759 mem_cgroup_update_mapped_file_stat(page, -1); 743 __dec_zone_page_state(page, NR_ANON_PAGES);
760 /* 744 } else {
761 * It would be tidy to reset the PageAnon mapping here, 745 __dec_zone_page_state(page, NR_FILE_MAPPED);
762 * but that might overwrite a racing page_add_anon_rmap
763 * which increments mapcount after us but sets mapping
764 * before us: so leave the reset to free_hot_cold_page,
765 * and remember that it's only reliable while mapped.
766 * Leaving it set also helps swapoff to reinstate ptes
767 * faster for those pages still in swapcache.
768 */
769 } 746 }
747 mem_cgroup_update_mapped_file_stat(page, -1);
748 /*
749 * It would be tidy to reset the PageAnon mapping here,
750 * but that might overwrite a racing page_add_anon_rmap
751 * which increments mapcount after us but sets mapping
752 * before us: so leave the reset to free_hot_cold_page,
753 * and remember that it's only reliable while mapped.
754 * Leaving it set also helps swapoff to reinstate ptes
755 * faster for those pages still in swapcache.
756 */
770} 757}
771 758
772/* 759/*
@@ -774,7 +761,7 @@ void page_remove_rmap(struct page *page)
774 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 761 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
775 */ 762 */
776static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 763static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
777 int migration) 764 enum ttu_flags flags)
778{ 765{
779 struct mm_struct *mm = vma->vm_mm; 766 struct mm_struct *mm = vma->vm_mm;
780 unsigned long address; 767 unsigned long address;
@@ -796,11 +783,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
796 * If it's recently referenced (perhaps page_referenced 783 * If it's recently referenced (perhaps page_referenced
797 * skipped over this mm) then we should reactivate it. 784 * skipped over this mm) then we should reactivate it.
798 */ 785 */
799 if (!migration) { 786 if (!(flags & TTU_IGNORE_MLOCK)) {
800 if (vma->vm_flags & VM_LOCKED) { 787 if (vma->vm_flags & VM_LOCKED) {
801 ret = SWAP_MLOCK; 788 ret = SWAP_MLOCK;
802 goto out_unmap; 789 goto out_unmap;
803 } 790 }
791 }
792 if (!(flags & TTU_IGNORE_ACCESS)) {
804 if (ptep_clear_flush_young_notify(vma, address, pte)) { 793 if (ptep_clear_flush_young_notify(vma, address, pte)) {
805 ret = SWAP_FAIL; 794 ret = SWAP_FAIL;
806 goto out_unmap; 795 goto out_unmap;
@@ -818,7 +807,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
818 /* Update high watermark before we lower rss */ 807 /* Update high watermark before we lower rss */
819 update_hiwater_rss(mm); 808 update_hiwater_rss(mm);
820 809
821 if (PageAnon(page)) { 810 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
811 if (PageAnon(page))
812 dec_mm_counter(mm, anon_rss);
813 else
814 dec_mm_counter(mm, file_rss);
815 set_pte_at(mm, address, pte,
816 swp_entry_to_pte(make_hwpoison_entry(page)));
817 } else if (PageAnon(page)) {
822 swp_entry_t entry = { .val = page_private(page) }; 818 swp_entry_t entry = { .val = page_private(page) };
823 819
824 if (PageSwapCache(page)) { 820 if (PageSwapCache(page)) {
@@ -840,12 +836,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
840 * pte. do_swap_page() will wait until the migration 836 * pte. do_swap_page() will wait until the migration
841 * pte is removed and then restart fault handling. 837 * pte is removed and then restart fault handling.
842 */ 838 */
843 BUG_ON(!migration); 839 BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
844 entry = make_migration_entry(page, pte_write(pteval)); 840 entry = make_migration_entry(page, pte_write(pteval));
845 } 841 }
846 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 842 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
847 BUG_ON(pte_file(*pte)); 843 BUG_ON(pte_file(*pte));
848 } else if (PAGE_MIGRATION && migration) { 844 } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
849 /* Establish migration entry for a file page */ 845 /* Establish migration entry for a file page */
850 swp_entry_t entry; 846 swp_entry_t entry;
851 entry = make_migration_entry(page, pte_write(pteval)); 847 entry = make_migration_entry(page, pte_write(pteval));
@@ -1014,12 +1010,13 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
1014 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1010 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1015 * 'LOCKED. 1011 * 'LOCKED.
1016 */ 1012 */
1017static int try_to_unmap_anon(struct page *page, int unlock, int migration) 1013static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1018{ 1014{
1019 struct anon_vma *anon_vma; 1015 struct anon_vma *anon_vma;
1020 struct vm_area_struct *vma; 1016 struct vm_area_struct *vma;
1021 unsigned int mlocked = 0; 1017 unsigned int mlocked = 0;
1022 int ret = SWAP_AGAIN; 1018 int ret = SWAP_AGAIN;
1019 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1023 1020
1024 if (MLOCK_PAGES && unlikely(unlock)) 1021 if (MLOCK_PAGES && unlikely(unlock))
1025 ret = SWAP_SUCCESS; /* default for try_to_munlock() */ 1022 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
@@ -1035,7 +1032,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1035 continue; /* must visit all unlocked vmas */ 1032 continue; /* must visit all unlocked vmas */
1036 ret = SWAP_MLOCK; /* saw at least one mlocked vma */ 1033 ret = SWAP_MLOCK; /* saw at least one mlocked vma */
1037 } else { 1034 } else {
1038 ret = try_to_unmap_one(page, vma, migration); 1035 ret = try_to_unmap_one(page, vma, flags);
1039 if (ret == SWAP_FAIL || !page_mapped(page)) 1036 if (ret == SWAP_FAIL || !page_mapped(page))
1040 break; 1037 break;
1041 } 1038 }
@@ -1059,8 +1056,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1059/** 1056/**
1060 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method 1057 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
1061 * @page: the page to unmap/unlock 1058 * @page: the page to unmap/unlock
1062 * @unlock: request for unlock rather than unmap [unlikely] 1059 * @flags: action and flags
1063 * @migration: unmapping for migration - ignored if @unlock
1064 * 1060 *
1065 * Find all the mappings of a page using the mapping pointer and the vma chains 1061 * Find all the mappings of a page using the mapping pointer and the vma chains
1066 * contained in the address_space struct it points to. 1062 * contained in the address_space struct it points to.
@@ -1072,7 +1068,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
1072 * vm_flags for that VMA. That should be OK, because that vma shouldn't be 1068 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1073 * 'LOCKED. 1069 * 'LOCKED.
1074 */ 1070 */
1075static int try_to_unmap_file(struct page *page, int unlock, int migration) 1071static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1076{ 1072{
1077 struct address_space *mapping = page->mapping; 1073 struct address_space *mapping = page->mapping;
1078 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1074 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -1084,6 +1080,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1084 unsigned long max_nl_size = 0; 1080 unsigned long max_nl_size = 0;
1085 unsigned int mapcount; 1081 unsigned int mapcount;
1086 unsigned int mlocked = 0; 1082 unsigned int mlocked = 0;
1083 int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
1087 1084
1088 if (MLOCK_PAGES && unlikely(unlock)) 1085 if (MLOCK_PAGES && unlikely(unlock))
1089 ret = SWAP_SUCCESS; /* default for try_to_munlock() */ 1086 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
@@ -1096,7 +1093,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1096 continue; /* must visit all vmas */ 1093 continue; /* must visit all vmas */
1097 ret = SWAP_MLOCK; 1094 ret = SWAP_MLOCK;
1098 } else { 1095 } else {
1099 ret = try_to_unmap_one(page, vma, migration); 1096 ret = try_to_unmap_one(page, vma, flags);
1100 if (ret == SWAP_FAIL || !page_mapped(page)) 1097 if (ret == SWAP_FAIL || !page_mapped(page))
1101 goto out; 1098 goto out;
1102 } 1099 }
@@ -1121,7 +1118,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1121 ret = SWAP_MLOCK; /* leave mlocked == 0 */ 1118 ret = SWAP_MLOCK; /* leave mlocked == 0 */
1122 goto out; /* no need to look further */ 1119 goto out; /* no need to look further */
1123 } 1120 }
1124 if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) 1121 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1122 (vma->vm_flags & VM_LOCKED))
1125 continue; 1123 continue;
1126 cursor = (unsigned long) vma->vm_private_data; 1124 cursor = (unsigned long) vma->vm_private_data;
1127 if (cursor > max_nl_cursor) 1125 if (cursor > max_nl_cursor)
@@ -1155,7 +1153,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
1155 do { 1153 do {
1156 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1154 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1157 shared.vm_set.list) { 1155 shared.vm_set.list) {
1158 if (!MLOCK_PAGES && !migration && 1156 if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
1159 (vma->vm_flags & VM_LOCKED)) 1157 (vma->vm_flags & VM_LOCKED))
1160 continue; 1158 continue;
1161 cursor = (unsigned long) vma->vm_private_data; 1159 cursor = (unsigned long) vma->vm_private_data;
@@ -1195,7 +1193,7 @@ out:
1195/** 1193/**
1196 * try_to_unmap - try to remove all page table mappings to a page 1194 * try_to_unmap - try to remove all page table mappings to a page
1197 * @page: the page to get unmapped 1195 * @page: the page to get unmapped
1198 * @migration: migration flag 1196 * @flags: action and flags
1199 * 1197 *
1200 * Tries to remove all the page table entries which are mapping this 1198 * Tries to remove all the page table entries which are mapping this
1201 * page, used in the pageout path. Caller must hold the page lock. 1199 * page, used in the pageout path. Caller must hold the page lock.
@@ -1206,16 +1204,16 @@ out:
1206 * SWAP_FAIL - the page is unswappable 1204 * SWAP_FAIL - the page is unswappable
1207 * SWAP_MLOCK - page is mlocked. 1205 * SWAP_MLOCK - page is mlocked.
1208 */ 1206 */
1209int try_to_unmap(struct page *page, int migration) 1207int try_to_unmap(struct page *page, enum ttu_flags flags)
1210{ 1208{
1211 int ret; 1209 int ret;
1212 1210
1213 BUG_ON(!PageLocked(page)); 1211 BUG_ON(!PageLocked(page));
1214 1212
1215 if (PageAnon(page)) 1213 if (PageAnon(page))
1216 ret = try_to_unmap_anon(page, 0, migration); 1214 ret = try_to_unmap_anon(page, flags);
1217 else 1215 else
1218 ret = try_to_unmap_file(page, 0, migration); 1216 ret = try_to_unmap_file(page, flags);
1219 if (ret != SWAP_MLOCK && !page_mapped(page)) 1217 if (ret != SWAP_MLOCK && !page_mapped(page))
1220 ret = SWAP_SUCCESS; 1218 ret = SWAP_SUCCESS;
1221 return ret; 1219 return ret;
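
try_to_unmap() now takes a single enum ttu_flags word packing the action (unmap, migration, munlock) with modifier bits such as TTU_IGNORE_MLOCK, TTU_IGNORE_ACCESS and TTU_IGNORE_HWPOISON, and TTU_ACTION() masks out the action part. The standalone sketch below shows one plausible encoding; the numeric values and the TTU_UNMAP name are illustrative, not quoted from the real header:

#include <stdio.h>

enum ttu_flags {                        /* values are illustrative */
        TTU_UNMAP               = 0,    /* plain unmap, hypothetical name */
        TTU_MIGRATION           = 1,
        TTU_MUNLOCK             = 2,
        TTU_ACTION_MASK         = 0xff,

        TTU_IGNORE_MLOCK        = 1 << 8,
        TTU_IGNORE_ACCESS       = 1 << 9,
        TTU_IGNORE_HWPOISON     = 1 << 10,
};

#define TTU_ACTION(x)   ((x) & TTU_ACTION_MASK)

int main(void)
{
        enum ttu_flags flags = TTU_MIGRATION | TTU_IGNORE_MLOCK;

        printf("action=%d ignore_mlock=%d ignore_access=%d\n",
               TTU_ACTION(flags),
               !!(flags & TTU_IGNORE_MLOCK),
               !!(flags & TTU_IGNORE_ACCESS));
        return 0;
}
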
@@ -1240,8 +1238,8 @@ int try_to_munlock(struct page *page)
1240 VM_BUG_ON(!PageLocked(page) || PageLRU(page)); 1238 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1241 1239
1242 if (PageAnon(page)) 1240 if (PageAnon(page))
1243 return try_to_unmap_anon(page, 1, 0); 1241 return try_to_unmap_anon(page, TTU_MUNLOCK);
1244 else 1242 else
1245 return try_to_unmap_file(page, 1, 0); 1243 return try_to_unmap_file(page, TTU_MUNLOCK);
1246} 1244}
1247 1245
diff --git a/mm/shmem.c b/mm/shmem.c
index d713239ce2ce..356dd99566ec 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -49,7 +49,6 @@ static struct vfsmount *shm_mnt;
49#include <linux/backing-dev.h> 49#include <linux/backing-dev.h>
50#include <linux/shmem_fs.h> 50#include <linux/shmem_fs.h>
51#include <linux/writeback.h> 51#include <linux/writeback.h>
52#include <linux/vfs.h>
53#include <linux/blkdev.h> 52#include <linux/blkdev.h>
54#include <linux/security.h> 53#include <linux/security.h>
55#include <linux/swapops.h> 54#include <linux/swapops.h>
@@ -219,7 +218,7 @@ static const struct file_operations shmem_file_operations;
219static const struct inode_operations shmem_inode_operations; 218static const struct inode_operations shmem_inode_operations;
220static const struct inode_operations shmem_dir_inode_operations; 219static const struct inode_operations shmem_dir_inode_operations;
221static const struct inode_operations shmem_special_inode_operations; 220static const struct inode_operations shmem_special_inode_operations;
222static struct vm_operations_struct shmem_vm_ops; 221static const struct vm_operations_struct shmem_vm_ops;
223 222
224static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 223static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
225 .ra_pages = 0, /* No readahead */ 224 .ra_pages = 0, /* No readahead */
@@ -1047,8 +1046,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1047 * sync from ever calling shmem_writepage; but a stacking filesystem 1046 * sync from ever calling shmem_writepage; but a stacking filesystem
1048 * may use the ->writepage of its underlying filesystem, in which case 1047 * may use the ->writepage of its underlying filesystem, in which case
1049 * tmpfs should write out to swap only in response to memory pressure, 1048 * tmpfs should write out to swap only in response to memory pressure,
1050 * and not for pdflush or sync. However, in those cases, we do still 1049 * and not for the writeback threads or sync. However, in those cases,
1051 * want to check if there's a redundant swappage to be discarded. 1050 * we do still want to check if there's a redundant swappage to be
1051 * discarded.
1052 */ 1052 */
1053 if (wbc->for_reclaim) 1053 if (wbc->for_reclaim)
1054 swap = get_swap_page(); 1054 swap = get_swap_page();
@@ -1097,6 +1097,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1097 shmem_swp_unmap(entry); 1097 shmem_swp_unmap(entry);
1098unlock: 1098unlock:
1099 spin_unlock(&info->lock); 1099 spin_unlock(&info->lock);
1100 /*
1101 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
1102 * clear SWAP_HAS_CACHE flag.
1103 */
1100 swapcache_free(swap, NULL); 1104 swapcache_free(swap, NULL);
1101redirty: 1105redirty:
1102 set_page_dirty(page); 1106 set_page_dirty(page);
@@ -1630,8 +1634,8 @@ shmem_write_end(struct file *file, struct address_space *mapping,
1630 if (pos + copied > inode->i_size) 1634 if (pos + copied > inode->i_size)
1631 i_size_write(inode, pos + copied); 1635 i_size_write(inode, pos + copied);
1632 1636
1633 unlock_page(page);
1634 set_page_dirty(page); 1637 set_page_dirty(page);
1638 unlock_page(page);
1635 page_cache_release(page); 1639 page_cache_release(page);
1636 1640
1637 return copied; 1641 return copied;
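
A minimal sketch of the ordering this hunk establishes: the page is marked dirty while it is still locked, and only then unlocked and released. The helper name is hypothetical; only the set_page_dirty()-before-unlock_page() ordering is taken from the change.

static int example_write_end_tail(struct inode *inode, struct page *page,
				  loff_t pos, unsigned copied)
{
	if (pos + copied > inode->i_size)
		i_size_write(inode, pos + copied);

	set_page_dirty(page);		/* dirty the page while the page lock is held ... */
	unlock_page(page);		/* ... only then drop the lock */
	page_cache_release(page);	/* and the pagecache reference */

	return copied;
}
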
@@ -1968,13 +1972,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1968 iput(inode); 1972 iput(inode);
1969 return error; 1973 return error;
1970 } 1974 }
1971 unlock_page(page);
1972 inode->i_mapping->a_ops = &shmem_aops; 1975 inode->i_mapping->a_ops = &shmem_aops;
1973 inode->i_op = &shmem_symlink_inode_operations; 1976 inode->i_op = &shmem_symlink_inode_operations;
1974 kaddr = kmap_atomic(page, KM_USER0); 1977 kaddr = kmap_atomic(page, KM_USER0);
1975 memcpy(kaddr, symname, len); 1978 memcpy(kaddr, symname, len);
1976 kunmap_atomic(kaddr, KM_USER0); 1979 kunmap_atomic(kaddr, KM_USER0);
1977 set_page_dirty(page); 1980 set_page_dirty(page);
1981 unlock_page(page);
1978 page_cache_release(page); 1982 page_cache_release(page);
1979 } 1983 }
1980 if (dir->i_mode & S_ISGID) 1984 if (dir->i_mode & S_ISGID)
@@ -2298,8 +2302,7 @@ static void shmem_put_super(struct super_block *sb)
2298 sb->s_fs_info = NULL; 2302 sb->s_fs_info = NULL;
2299} 2303}
2300 2304
2301static int shmem_fill_super(struct super_block *sb, 2305int shmem_fill_super(struct super_block *sb, void *data, int silent)
2302 void *data, int silent)
2303{ 2306{
2304 struct inode *inode; 2307 struct inode *inode;
2305 struct dentry *root; 2308 struct dentry *root;
@@ -2307,17 +2310,14 @@ static int shmem_fill_super(struct super_block *sb,
2307 int err = -ENOMEM; 2310 int err = -ENOMEM;
2308 2311
2309 /* Round up to L1_CACHE_BYTES to resist false sharing */ 2312 /* Round up to L1_CACHE_BYTES to resist false sharing */
2310 sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info), 2313 sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
2311 L1_CACHE_BYTES), GFP_KERNEL); 2314 L1_CACHE_BYTES), GFP_KERNEL);
2312 if (!sbinfo) 2315 if (!sbinfo)
2313 return -ENOMEM; 2316 return -ENOMEM;
2314 2317
2315 sbinfo->max_blocks = 0;
2316 sbinfo->max_inodes = 0;
2317 sbinfo->mode = S_IRWXUGO | S_ISVTX; 2318 sbinfo->mode = S_IRWXUGO | S_ISVTX;
2318 sbinfo->uid = current_fsuid(); 2319 sbinfo->uid = current_fsuid();
2319 sbinfo->gid = current_fsgid(); 2320 sbinfo->gid = current_fsgid();
2320 sbinfo->mpol = NULL;
2321 sb->s_fs_info = sbinfo; 2321 sb->s_fs_info = sbinfo;
2322 2322
2323#ifdef CONFIG_TMPFS 2323#ifdef CONFIG_TMPFS
@@ -2421,6 +2421,7 @@ static const struct address_space_operations shmem_aops = {
2421 .write_end = shmem_write_end, 2421 .write_end = shmem_write_end,
2422#endif 2422#endif
2423 .migratepage = migrate_page, 2423 .migratepage = migrate_page,
2424 .error_remove_page = generic_error_remove_page,
2424}; 2425};
2425 2426
2426static const struct file_operations shmem_file_operations = { 2427static const struct file_operations shmem_file_operations = {
@@ -2446,7 +2447,7 @@ static const struct inode_operations shmem_inode_operations = {
2446 .getxattr = generic_getxattr, 2447 .getxattr = generic_getxattr,
2447 .listxattr = generic_listxattr, 2448 .listxattr = generic_listxattr,
2448 .removexattr = generic_removexattr, 2449 .removexattr = generic_removexattr,
2449 .permission = shmem_permission, 2450 .check_acl = shmem_check_acl,
2450#endif 2451#endif
2451 2452
2452}; 2453};
@@ -2469,7 +2470,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2469 .getxattr = generic_getxattr, 2470 .getxattr = generic_getxattr,
2470 .listxattr = generic_listxattr, 2471 .listxattr = generic_listxattr,
2471 .removexattr = generic_removexattr, 2472 .removexattr = generic_removexattr,
2472 .permission = shmem_permission, 2473 .check_acl = shmem_check_acl,
2473#endif 2474#endif
2474}; 2475};
2475 2476
@@ -2480,7 +2481,7 @@ static const struct inode_operations shmem_special_inode_operations = {
2480 .getxattr = generic_getxattr, 2481 .getxattr = generic_getxattr,
2481 .listxattr = generic_listxattr, 2482 .listxattr = generic_listxattr,
2482 .removexattr = generic_removexattr, 2483 .removexattr = generic_removexattr,
2483 .permission = shmem_permission, 2484 .check_acl = shmem_check_acl,
2484#endif 2485#endif
2485}; 2486};
2486 2487
@@ -2497,7 +2498,7 @@ static const struct super_operations shmem_ops = {
2497 .put_super = shmem_put_super, 2498 .put_super = shmem_put_super,
2498}; 2499};
2499 2500
2500static struct vm_operations_struct shmem_vm_ops = { 2501static const struct vm_operations_struct shmem_vm_ops = {
2501 .fault = shmem_fault, 2502 .fault = shmem_fault,
2502#ifdef CONFIG_NUMA 2503#ifdef CONFIG_NUMA
2503 .set_policy = shmem_set_policy, 2504 .set_policy = shmem_set_policy,
@@ -2519,7 +2520,7 @@ static struct file_system_type tmpfs_fs_type = {
2519 .kill_sb = kill_litter_super, 2520 .kill_sb = kill_litter_super,
2520}; 2521};
2521 2522
2522static int __init init_tmpfs(void) 2523int __init init_tmpfs(void)
2523{ 2524{
2524 int error; 2525 int error;
2525 2526
@@ -2576,7 +2577,7 @@ static struct file_system_type tmpfs_fs_type = {
2576 .kill_sb = kill_litter_super, 2577 .kill_sb = kill_litter_super,
2577}; 2578};
2578 2579
2579static int __init init_tmpfs(void) 2580int __init init_tmpfs(void)
2580{ 2581{
2581 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); 2582 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
2582 2583
@@ -2591,6 +2592,11 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
2591 return 0; 2592 return 0;
2592} 2593}
2593 2594
2595int shmem_lock(struct file *file, int lock, struct user_struct *user)
2596{
2597 return 0;
2598}
2599
2594#define shmem_vm_ops generic_file_vm_ops 2600#define shmem_vm_ops generic_file_vm_ops
2595#define shmem_file_operations ramfs_file_operations 2601#define shmem_file_operations ramfs_file_operations
2596#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) 2602#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev)
@@ -2687,5 +2693,3 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2687 vma->vm_ops = &shmem_vm_ops; 2693 vma->vm_ops = &shmem_vm_ops;
2688 return 0; 2694 return 0;
2689} 2695}
2690
2691module_init(init_tmpfs)
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index 606a8e757a42..df2c87fdae50 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -157,7 +157,7 @@ shmem_acl_init(struct inode *inode, struct inode *dir)
157/** 157/**
158 * shmem_check_acl - check_acl() callback for generic_permission() 158 * shmem_check_acl - check_acl() callback for generic_permission()
159 */ 159 */
160static int 160int
161shmem_check_acl(struct inode *inode, int mask) 161shmem_check_acl(struct inode *inode, int mask)
162{ 162{
163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); 163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
@@ -169,12 +169,3 @@ shmem_check_acl(struct inode *inode, int mask)
169 } 169 }
170 return -EAGAIN; 170 return -EAGAIN;
171} 171}
172
173/**
174 * shmem_permission - permission() inode operation
175 */
176int
177shmem_permission(struct inode *inode, int mask)
178{
179 return generic_permission(inode, mask, shmem_check_acl);
180}
diff --git a/mm/slab.c b/mm/slab.c
index 7b5d4deacfcd..7dfa481c96ba 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1384,7 +1384,7 @@ void __init kmem_cache_init(void)
1384 * Fragmentation resistance on low memory - only use bigger 1384 * Fragmentation resistance on low memory - only use bigger
1385 * page orders on machines with more than 32MB of memory. 1385 * page orders on machines with more than 32MB of memory.
1386 */ 1386 */
1387 if (num_physpages > (32 << 20) >> PAGE_SHIFT) 1387 if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
1388 slab_break_gfp_order = BREAK_GFP_ORDER_HI; 1388 slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1389 1389
1390 /* Bootstrap is tricky, because several objects are allocated 1390 /* Bootstrap is tricky, because several objects are allocated
diff --git a/mm/slob.c b/mm/slob.c
index 9641da3d5e58..837ebd64cc34 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -692,3 +692,8 @@ void __init kmem_cache_init(void)
692{ 692{
693 slob_ready = 1; 693 slob_ready = 1;
694} 694}
695
696void __init kmem_cache_init_late(void)
697{
698 /* Nothing to do */
699}
diff --git a/mm/slub.c b/mm/slub.c
index b6276753626e..4996fc719552 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -141,6 +141,13 @@
141 SLAB_POISON | SLAB_STORE_USER) 141 SLAB_POISON | SLAB_STORE_USER)
142 142
143/* 143/*
144 * Debugging flags that require metadata to be stored in the slab. These get
145 * disabled when slub_debug=O is used and a cache's min order increases with
146 * metadata.
147 */
148#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
149
150/*
144 * Set of flags that will prevent slab merging 151 * Set of flags that will prevent slab merging
145 */ 152 */
146#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
@@ -325,6 +332,7 @@ static int slub_debug;
325#endif 332#endif
326 333
327static char *slub_debug_slabs; 334static char *slub_debug_slabs;
335static int disable_higher_order_debug;
328 336
329/* 337/*
330 * Object debugging 338 * Object debugging
@@ -646,7 +654,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
646 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); 654 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
647 print_section("Padding", end - remainder, remainder); 655 print_section("Padding", end - remainder, remainder);
648 656
649 restore_bytes(s, "slab padding", POISON_INUSE, start, end); 657 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
650 return 0; 658 return 0;
651} 659}
652 660
@@ -976,6 +984,15 @@ static int __init setup_slub_debug(char *str)
976 */ 984 */
977 goto check_slabs; 985 goto check_slabs;
978 986
987 if (tolower(*str) == 'o') {
988 /*
 989 * Avoid enabling debugging on a cache if its minimum order
990 * would increase as a result.
991 */
992 disable_higher_order_debug = 1;
993 goto out;
994 }
995
979 slub_debug = 0; 996 slub_debug = 0;
980 if (*str == '-') 997 if (*str == '-')
981 /* 998 /*
@@ -1026,8 +1043,8 @@ static unsigned long kmem_cache_flags(unsigned long objsize,
1026 * Enable debugging if selected on the kernel commandline. 1043 * Enable debugging if selected on the kernel commandline.
1027 */ 1044 */
1028 if (slub_debug && (!slub_debug_slabs || 1045 if (slub_debug && (!slub_debug_slabs ||
1029 strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0)) 1046 !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
1030 flags |= slub_debug; 1047 flags |= slub_debug;
1031 1048
1032 return flags; 1049 return flags;
1033} 1050}
@@ -1054,6 +1071,8 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize,
1054} 1071}
1055#define slub_debug 0 1072#define slub_debug 0
1056 1073
1074#define disable_higher_order_debug 0
1075
1057static inline unsigned long slabs_node(struct kmem_cache *s, int node) 1076static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1058 { return 0; } 1077 { return 0; }
1059static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) 1078static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
@@ -1109,8 +1128,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1109 } 1128 }
1110 1129
1111 if (kmemcheck_enabled 1130 if (kmemcheck_enabled
1112 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) 1131 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1113 {
1114 int pages = 1 << oo_order(oo); 1132 int pages = 1 << oo_order(oo);
1115 1133
1116 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); 1134 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
@@ -1560,6 +1578,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
1560 "default order: %d, min order: %d\n", s->name, s->objsize, 1578 "default order: %d, min order: %d\n", s->name, s->objsize,
1561 s->size, oo_order(s->oo), oo_order(s->min)); 1579 s->size, oo_order(s->oo), oo_order(s->min));
1562 1580
1581 if (oo_order(s->min) > get_order(s->objsize))
1582 printk(KERN_WARNING " %s debugging increased min order, use "
1583 "slub_debug=O to disable.\n", s->name);
1584
1563 for_each_online_node(node) { 1585 for_each_online_node(node) {
1564 struct kmem_cache_node *n = get_node(s, node); 1586 struct kmem_cache_node *n = get_node(s, node);
1565 unsigned long nr_slabs; 1587 unsigned long nr_slabs;
@@ -2001,7 +2023,7 @@ static inline int calculate_order(int size)
2001 return order; 2023 return order;
2002 fraction /= 2; 2024 fraction /= 2;
2003 } 2025 }
2004 min_objects --; 2026 min_objects--;
2005 } 2027 }
2006 2028
2007 /* 2029 /*
@@ -2091,8 +2113,8 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2091 */ 2113 */
2092#define NR_KMEM_CACHE_CPU 100 2114#define NR_KMEM_CACHE_CPU 100
2093 2115
2094static DEFINE_PER_CPU(struct kmem_cache_cpu, 2116static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
2095 kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; 2117 kmem_cache_cpu);
2096 2118
2097static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); 2119static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
2098static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); 2120static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
@@ -2400,6 +2422,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2400 * on bootup. 2422 * on bootup.
2401 */ 2423 */
2402 align = calculate_alignment(flags, align, s->objsize); 2424 align = calculate_alignment(flags, align, s->objsize);
2425 s->align = align;
2403 2426
2404 /* 2427 /*
2405 * SLUB stores one object immediately after another beginning from 2428 * SLUB stores one object immediately after another beginning from
@@ -2452,6 +2475,18 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2452 2475
2453 if (!calculate_sizes(s, -1)) 2476 if (!calculate_sizes(s, -1))
2454 goto error; 2477 goto error;
2478 if (disable_higher_order_debug) {
2479 /*
2480 * Disable debugging flags that store metadata if the min slab
2481 * order increased.
2482 */
2483 if (get_order(s->size) > get_order(s->objsize)) {
2484 s->flags &= ~DEBUG_METADATA_FLAGS;
2485 s->offset = 0;
2486 if (!calculate_sizes(s, -1))
2487 goto error;
2488 }
2489 }
2455 2490
2456 /* 2491 /*
2457 * The larger the object size is, the more pages we want on the partial 2492 * The larger the object size is, the more pages we want on the partial
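
A condensed sketch of what slub_debug=O does with the pieces added above: setup_slub_debug() sets disable_higher_order_debug, and kmem_cache_open() then strips the metadata-carrying flags whenever they would push the slab to a higher page order. The helper below is a hypothetical restatement, not a new function in this patch; calculate_sizes() and DEBUG_METADATA_FLAGS are the static items from mm/slub.c shown in this diff.

static int example_trim_debug_metadata(struct kmem_cache *s)
{
	if (!disable_higher_order_debug)
		return 0;
	if (get_order(s->size) <= get_order(s->objsize))
		return 0;				/* metadata still fits in the base order */

	s->flags &= ~DEBUG_METADATA_FLAGS;		/* drop red zoning, poisoning, track data */
	s->offset = 0;					/* reset the free pointer offset before recomputing */
	return calculate_sizes(s, -1) ? 0 : -EINVAL;	/* recompute the layout without metadata */
}
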
@@ -2790,6 +2825,11 @@ static s8 size_index[24] = {
2790 2 /* 192 */ 2825 2 /* 192 */
2791}; 2826};
2792 2827
2828static inline int size_index_elem(size_t bytes)
2829{
2830 return (bytes - 1) / 8;
2831}
2832
2793static struct kmem_cache *get_slab(size_t size, gfp_t flags) 2833static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2794{ 2834{
2795 int index; 2835 int index;
@@ -2798,7 +2838,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2798 if (!size) 2838 if (!size)
2799 return ZERO_SIZE_PTR; 2839 return ZERO_SIZE_PTR;
2800 2840
2801 index = size_index[(size - 1) / 8]; 2841 index = size_index[size_index_elem(size)];
2802 } else 2842 } else
2803 index = fls(size - 1); 2843 index = fls(size - 1);
2804 2844
@@ -3156,10 +3196,12 @@ void __init kmem_cache_init(void)
3156 slab_state = PARTIAL; 3196 slab_state = PARTIAL;
3157 3197
3158 /* Caches that are not of the two-to-the-power-of size */ 3198 /* Caches that are not of the two-to-the-power-of size */
3159 if (KMALLOC_MIN_SIZE <= 64) { 3199 if (KMALLOC_MIN_SIZE <= 32) {
3160 create_kmalloc_cache(&kmalloc_caches[1], 3200 create_kmalloc_cache(&kmalloc_caches[1],
3161 "kmalloc-96", 96, GFP_NOWAIT); 3201 "kmalloc-96", 96, GFP_NOWAIT);
3162 caches++; 3202 caches++;
3203 }
3204 if (KMALLOC_MIN_SIZE <= 64) {
3163 create_kmalloc_cache(&kmalloc_caches[2], 3205 create_kmalloc_cache(&kmalloc_caches[2],
3164 "kmalloc-192", 192, GFP_NOWAIT); 3206 "kmalloc-192", 192, GFP_NOWAIT);
3165 caches++; 3207 caches++;
@@ -3186,17 +3228,28 @@ void __init kmem_cache_init(void)
3186 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 3228 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
3187 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 3229 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
3188 3230
3189 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) 3231 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
3190 size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; 3232 int elem = size_index_elem(i);
3233 if (elem >= ARRAY_SIZE(size_index))
3234 break;
3235 size_index[elem] = KMALLOC_SHIFT_LOW;
3236 }
3191 3237
3192 if (KMALLOC_MIN_SIZE == 128) { 3238 if (KMALLOC_MIN_SIZE == 64) {
3239 /*
3240 * The 96 byte size cache is not used if the alignment
3241 * is 64 byte.
3242 */
3243 for (i = 64 + 8; i <= 96; i += 8)
3244 size_index[size_index_elem(i)] = 7;
3245 } else if (KMALLOC_MIN_SIZE == 128) {
3193 /* 3246 /*
3194 * The 192 byte sized cache is not used if the alignment 3247 * The 192 byte sized cache is not used if the alignment
3195 * is 128 byte. Redirect kmalloc to use the 256 byte cache 3248 * is 128 byte. Redirect kmalloc to use the 256 byte cache
3196 * instead. 3249 * instead.
3197 */ 3250 */
3198 for (i = 128 + 8; i <= 192; i += 8) 3251 for (i = 128 + 8; i <= 192; i += 8)
3199 size_index[(i - 1) / 8] = 8; 3252 size_index[size_index_elem(i)] = 8;
3200 } 3253 }
3201 3254
3202 slab_state = UP; 3255 slab_state = UP;
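
The size_index_elem() helper above just packages the (size - 1) / 8 arithmetic so that kmem_cache_init() can bounds-check it against ARRAY_SIZE(size_index). A condensed restatement of the lookup get_slab() now performs, with a worked number: a 100-byte request maps to element (100 - 1) / 8 = 12 of size_index[], while anything above 192 bytes falls back to a power-of-two cache via fls(). The wrapper name is illustrative only.

static struct kmem_cache *example_small_kmalloc_slab(size_t size)
{
	int index;

	if (size <= 192) {
		if (!size)
			return ZERO_SIZE_PTR;			/* kmalloc(0) convention */
		index = size_index[size_index_elem(size)];	/* e.g. 100 -> size_index[12] */
	} else
		index = fls(size - 1);				/* power-of-two caches above 192 */

	return &kmalloc_caches[index];
}
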
@@ -3292,6 +3345,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3292{ 3345{
3293 struct kmem_cache *s; 3346 struct kmem_cache *s;
3294 3347
3348 if (WARN_ON(!name))
3349 return NULL;
3350
3295 down_write(&slub_lock); 3351 down_write(&slub_lock);
3296 s = find_mergeable(size, align, flags, name, ctor); 3352 s = find_mergeable(size, align, flags, name, ctor);
3297 if (s) { 3353 if (s) {
@@ -4543,8 +4599,11 @@ static int sysfs_slab_add(struct kmem_cache *s)
4543 } 4599 }
4544 4600
4545 err = sysfs_create_group(&s->kobj, &slab_attr_group); 4601 err = sysfs_create_group(&s->kobj, &slab_attr_group);
4546 if (err) 4602 if (err) {
4603 kobject_del(&s->kobj);
4604 kobject_put(&s->kobj);
4547 return err; 4605 return err;
4606 }
4548 kobject_uevent(&s->kobj, KOBJ_ADD); 4607 kobject_uevent(&s->kobj, KOBJ_ADD);
4549 if (!unmergeable) { 4608 if (!unmergeable) {
4550 /* Setup first alias */ 4609 /* Setup first alias */
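
A minimal sketch of the error-handling rule this hunk applies: once kobject_init_and_add() has succeeded, a later sysfs failure must both unlink the entry (kobject_del) and drop the reference (kobject_put) before bailing out. The wrapper, its stub ktype and the attribute group are illustrative assumptions.

static struct kobj_type example_ktype;	/* stub ktype, assumed for the sketch */

static int example_sysfs_register(struct kobject *kobj, struct kobject *parent,
				  const struct attribute_group *grp)
{
	int err;

	err = kobject_init_and_add(kobj, &example_ktype, parent, "example");
	if (err)
		return err;

	err = sysfs_create_group(kobj, grp);
	if (err) {
		kobject_del(kobj);	/* unlink the half-registered sysfs entry ... */
		kobject_put(kobj);	/* ... and release the reference taken at init */
		return err;
	}
	kobject_uevent(kobj, KOBJ_ADD);
	return 0;
}
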
@@ -4726,7 +4785,7 @@ static const struct file_operations proc_slabinfo_operations = {
4726 4785
4727static int __init slab_proc_init(void) 4786static int __init slab_proc_init(void)
4728{ 4787{
4729 proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); 4788 proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations);
4730 return 0; 4789 return 0;
4731} 4790}
4732module_init(slab_proc_init); 4791module_init(slab_proc_init);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a13ea6401ae7..d9714bdcb4a3 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -48,8 +48,14 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
48{ 48{
49 /* If the main allocator is up use that, fallback to bootmem. */ 49 /* If the main allocator is up use that, fallback to bootmem. */
50 if (slab_is_available()) { 50 if (slab_is_available()) {
51 struct page *page = alloc_pages_node(node, 51 struct page *page;
52
53 if (node_state(node, N_HIGH_MEMORY))
54 page = alloc_pages_node(node,
52 GFP_KERNEL | __GFP_ZERO, get_order(size)); 55 GFP_KERNEL | __GFP_ZERO, get_order(size));
56 else
57 page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
58 get_order(size));
53 if (page) 59 if (page)
54 return page_address(page); 60 return page_address(page);
55 return NULL; 61 return NULL;
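
This hunk and the mm/sparse.c hunk that follows apply the same rule: only ask for node-local memory when the node is in the N_HIGH_MEMORY mask, i.e. actually has memory; for a memoryless node, fall back to an unconstrained allocation. A minimal sketch of that rule with a hypothetical helper:

static void *example_node_local_alloc(size_t size, int nid, gfp_t gfp)
{
	/* a memoryless node cannot satisfy a node-local request ... */
	if (node_state(nid, N_HIGH_MEMORY))
		return kmalloc_node(size, gfp, nid);
	/* ... so let the allocator pick any node instead */
	return kmalloc(size, gfp);
}
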
diff --git a/mm/sparse.c b/mm/sparse.c
index da432d9f0ae8..6ce4aab69e99 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -62,9 +62,12 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
62 unsigned long array_size = SECTIONS_PER_ROOT * 62 unsigned long array_size = SECTIONS_PER_ROOT *
63 sizeof(struct mem_section); 63 sizeof(struct mem_section);
64 64
65 if (slab_is_available()) 65 if (slab_is_available()) {
66 section = kmalloc_node(array_size, GFP_KERNEL, nid); 66 if (node_state(nid, N_HIGH_MEMORY))
67 else 67 section = kmalloc_node(array_size, GFP_KERNEL, nid);
68 else
69 section = kmalloc(array_size, GFP_KERNEL);
70 } else
68 section = alloc_bootmem_node(NODE_DATA(nid), array_size); 71 section = alloc_bootmem_node(NODE_DATA(nid), array_size);
69 72
70 if (section) 73 if (section)
diff --git a/mm/swap.c b/mm/swap.c
index cb29ae5d33ab..308e57d8d7ed 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -118,7 +118,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
118 spin_lock(&zone->lru_lock); 118 spin_lock(&zone->lru_lock);
119 } 119 }
120 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 120 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
121 int lru = page_is_file_cache(page); 121 int lru = page_lru_base_type(page);
122 list_move_tail(&page->lru, &zone->lru[lru].list); 122 list_move_tail(&page->lru, &zone->lru[lru].list);
123 pgmoved++; 123 pgmoved++;
124 } 124 }
@@ -181,7 +181,7 @@ void activate_page(struct page *page)
181 spin_lock_irq(&zone->lru_lock); 181 spin_lock_irq(&zone->lru_lock);
182 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 182 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
183 int file = page_is_file_cache(page); 183 int file = page_is_file_cache(page);
184 int lru = LRU_BASE + file; 184 int lru = page_lru_base_type(page);
185 del_page_from_lru_list(zone, page, lru); 185 del_page_from_lru_list(zone, page, lru);
186 186
187 SetPageActive(page); 187 SetPageActive(page);
@@ -189,7 +189,7 @@ void activate_page(struct page *page)
189 add_page_to_lru_list(zone, page, lru); 189 add_page_to_lru_list(zone, page, lru);
190 __count_vm_event(PGACTIVATE); 190 __count_vm_event(PGACTIVATE);
191 191
192 update_page_reclaim_stat(zone, page, !!file, 1); 192 update_page_reclaim_stat(zone, page, file, 1);
193 } 193 }
194 spin_unlock_irq(&zone->lru_lock); 194 spin_unlock_irq(&zone->lru_lock);
195} 195}
@@ -496,7 +496,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
496 */ 496 */
497void __init swap_setup(void) 497void __init swap_setup(void)
498{ 498{
499 unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); 499 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
500 500
501#ifdef CONFIG_SWAP 501#ifdef CONFIG_SWAP
502 bdi_init(swapper_space.backing_dev_info); 502 bdi_init(swapper_space.backing_dev_info);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 42cd38eba79f..6d1daeb1cb4a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = {
34}; 34};
35 35
36static struct backing_dev_info swap_backing_dev_info = { 36static struct backing_dev_info swap_backing_dev_info = {
37 .name = "swap",
37 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 38 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
38 .unplug_io_fn = swap_unplug_io_fn, 39 .unplug_io_fn = swap_unplug_io_fn,
39}; 40};
@@ -66,10 +67,10 @@ void show_swap_cache_info(void)
66} 67}
67 68
68/* 69/*
69 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, 70 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
70 * but sets SwapCache flag and private instead of mapping and index. 71 * but sets SwapCache flag and private instead of mapping and index.
71 */ 72 */
72int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) 73static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
73{ 74{
74 int error; 75 int error;
75 76
@@ -77,28 +78,43 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
77 VM_BUG_ON(PageSwapCache(page)); 78 VM_BUG_ON(PageSwapCache(page));
78 VM_BUG_ON(!PageSwapBacked(page)); 79 VM_BUG_ON(!PageSwapBacked(page));
79 80
81 page_cache_get(page);
82 SetPageSwapCache(page);
83 set_page_private(page, entry.val);
84
85 spin_lock_irq(&swapper_space.tree_lock);
86 error = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
87 if (likely(!error)) {
88 total_swapcache_pages++;
89 __inc_zone_page_state(page, NR_FILE_PAGES);
90 INC_CACHE_INFO(add_total);
91 }
92 spin_unlock_irq(&swapper_space.tree_lock);
93
94 if (unlikely(error)) {
95 /*
 96 * Only the context which has set SWAP_HAS_CACHE flag
97 * would call add_to_swap_cache().
 98 * So add_to_swap_cache() doesn't return -EEXIST.
99 */
100 VM_BUG_ON(error == -EEXIST);
101 set_page_private(page, 0UL);
102 ClearPageSwapCache(page);
103 page_cache_release(page);
104 }
105
106 return error;
107}
108
109
110int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
111{
112 int error;
113
80 error = radix_tree_preload(gfp_mask); 114 error = radix_tree_preload(gfp_mask);
81 if (!error) { 115 if (!error) {
82 page_cache_get(page); 116 error = __add_to_swap_cache(page, entry);
83 SetPageSwapCache(page);
84 set_page_private(page, entry.val);
85
86 spin_lock_irq(&swapper_space.tree_lock);
87 error = radix_tree_insert(&swapper_space.page_tree,
88 entry.val, page);
89 if (likely(!error)) {
90 total_swapcache_pages++;
91 __inc_zone_page_state(page, NR_FILE_PAGES);
92 INC_CACHE_INFO(add_total);
93 }
94 spin_unlock_irq(&swapper_space.tree_lock);
95 radix_tree_preload_end(); 117 radix_tree_preload_end();
96
97 if (unlikely(error)) {
98 set_page_private(page, 0UL);
99 ClearPageSwapCache(page);
100 page_cache_release(page);
101 }
102 } 118 }
103 return error; 119 return error;
104} 120}
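
The split above keeps the sleeping radix_tree_preload() outside the actual insertion path. A minimal sketch of that general discipline, with an assumed helper and lock rather than the swap-cache specifics: preload while sleeping is still legal, take the lock that serializes inserters, and always end the preload, since it leaves preemption disabled.

static int example_locked_insert(struct radix_tree_root *root, spinlock_t *lock,
				 unsigned long index, void *item, gfp_t gfp)
{
	int err;

	err = radix_tree_preload(gfp & GFP_KERNEL);	/* may sleep; do it before locking */
	if (err)
		return err;

	spin_lock_irq(lock);				/* serialize against other inserters */
	err = radix_tree_insert(root, index, item);
	spin_unlock_irq(lock);

	radix_tree_preload_end();			/* re-enable preemption on every path */
	return err;
}
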
@@ -136,38 +152,34 @@ int add_to_swap(struct page *page)
136 VM_BUG_ON(!PageLocked(page)); 152 VM_BUG_ON(!PageLocked(page));
137 VM_BUG_ON(!PageUptodate(page)); 153 VM_BUG_ON(!PageUptodate(page));
138 154
139 for (;;) { 155 entry = get_swap_page();
140 entry = get_swap_page(); 156 if (!entry.val)
141 if (!entry.val) 157 return 0;
142 return 0;
143 158
159 /*
160 * Radix-tree node allocations from PF_MEMALLOC contexts could
161 * completely exhaust the page allocator. __GFP_NOMEMALLOC
162 * stops emergency reserves from being allocated.
163 *
164 * TODO: this could cause a theoretical memory reclaim
165 * deadlock in the swap out path.
166 */
167 /*
168 * Add it to the swap cache and mark it dirty
169 */
170 err = add_to_swap_cache(page, entry,
171 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
172
173 if (!err) { /* Success */
174 SetPageDirty(page);
175 return 1;
176 } else { /* -ENOMEM radix-tree allocation failure */
144 /* 177 /*
145 * Radix-tree node allocations from PF_MEMALLOC contexts could 178 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
146 * completely exhaust the page allocator. __GFP_NOMEMALLOC 179 * clear SWAP_HAS_CACHE flag.
147 * stops emergency reserves from being allocated.
148 *
149 * TODO: this could cause a theoretical memory reclaim
150 * deadlock in the swap out path.
151 */
152 /*
153 * Add it to the swap cache and mark it dirty
154 */ 180 */
155 err = add_to_swap_cache(page, entry, 181 swapcache_free(entry, NULL);
156 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); 182 return 0;
157
158 switch (err) {
159 case 0: /* Success */
160 SetPageDirty(page);
161 return 1;
162 case -EEXIST:
163 /* Raced with "speculative" read_swap_cache_async */
164 swapcache_free(entry, NULL);
165 continue;
166 default:
167 /* -ENOMEM radix-tree allocation failure */
168 swapcache_free(entry, NULL);
169 return 0;
170 }
171 } 183 }
172} 184}
173 185
@@ -289,26 +301,31 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
289 } 301 }
290 302
291 /* 303 /*
304 * call radix_tree_preload() while we can wait.
305 */
306 err = radix_tree_preload(gfp_mask & GFP_KERNEL);
307 if (err)
308 break;
309
310 /*
292 * Swap entry may have been freed since our caller observed it. 311 * Swap entry may have been freed since our caller observed it.
293 */ 312 */
294 err = swapcache_prepare(entry); 313 err = swapcache_prepare(entry);
295 if (err == -EEXIST) /* seems racy */ 314 if (err == -EEXIST) { /* seems racy */
315 radix_tree_preload_end();
296 continue; 316 continue;
297 if (err) /* swp entry is obsolete ? */ 317 }
318 if (err) { /* swp entry is obsolete ? */
319 radix_tree_preload_end();
298 break; 320 break;
321 }
299 322
300 /* 323 /* May fail (-ENOMEM) if radix-tree node allocation failed. */
301 * Associate the page with swap entry in the swap cache.
302 * May fail (-EEXIST) if there is already a page associated
303 * with this entry in the swap cache: added by a racing
304 * read_swap_cache_async, or add_to_swap or shmem_writepage
305 * re-using the just freed swap entry for an existing page.
306 * May fail (-ENOMEM) if radix-tree node allocation failed.
307 */
308 __set_page_locked(new_page); 324 __set_page_locked(new_page);
309 SetPageSwapBacked(new_page); 325 SetPageSwapBacked(new_page);
310 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); 326 err = __add_to_swap_cache(new_page, entry);
311 if (likely(!err)) { 327 if (likely(!err)) {
328 radix_tree_preload_end();
312 /* 329 /*
313 * Initiate read into locked page and return. 330 * Initiate read into locked page and return.
314 */ 331 */
@@ -316,8 +333,13 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
316 swap_readpage(new_page); 333 swap_readpage(new_page);
317 return new_page; 334 return new_page;
318 } 335 }
336 radix_tree_preload_end();
319 ClearPageSwapBacked(new_page); 337 ClearPageSwapBacked(new_page);
320 __clear_page_locked(new_page); 338 __clear_page_locked(new_page);
339 /*
340 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
341 * clear SWAP_HAS_CACHE flag.
342 */
321 swapcache_free(entry, NULL); 343 swapcache_free(entry, NULL);
322 } while (err != -ENOMEM); 344 } while (err != -ENOMEM);
323 345
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8ffdc0d23c53..a1bc6b9af9a2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -161,7 +161,8 @@ static int discard_swap(struct swap_info_struct *si)
161 } 161 }
162 162
163 err = blkdev_issue_discard(si->bdev, start_block, 163 err = blkdev_issue_discard(si->bdev, start_block,
164 nr_blocks, GFP_KERNEL); 164 nr_blocks, GFP_KERNEL,
165 DISCARD_FL_BARRIER);
165 if (err) 166 if (err)
166 break; 167 break;
167 168
@@ -200,7 +201,8 @@ static void discard_swap_cluster(struct swap_info_struct *si,
200 start_block <<= PAGE_SHIFT - 9; 201 start_block <<= PAGE_SHIFT - 9;
201 nr_blocks <<= PAGE_SHIFT - 9; 202 nr_blocks <<= PAGE_SHIFT - 9;
202 if (blkdev_issue_discard(si->bdev, start_block, 203 if (blkdev_issue_discard(si->bdev, start_block,
203 nr_blocks, GFP_NOIO)) 204 nr_blocks, GFP_NOIO,
205 DISCARD_FL_BARRIER))
204 break; 206 break;
205 } 207 }
206 208
@@ -697,7 +699,7 @@ int free_swap_and_cache(swp_entry_t entry)
697 struct swap_info_struct *p; 699 struct swap_info_struct *p;
698 struct page *page = NULL; 700 struct page *page = NULL;
699 701
700 if (is_migration_entry(entry)) 702 if (non_swap_entry(entry))
701 return 1; 703 return 1;
702 704
703 p = swap_info_get(entry); 705 p = swap_info_get(entry);
@@ -1573,9 +1575,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1573 p->flags &= ~SWP_WRITEOK; 1575 p->flags &= ~SWP_WRITEOK;
1574 spin_unlock(&swap_lock); 1576 spin_unlock(&swap_lock);
1575 1577
1576 current->flags |= PF_SWAPOFF; 1578 current->flags |= PF_OOM_ORIGIN;
1577 err = try_to_unuse(type); 1579 err = try_to_unuse(type);
1578 current->flags &= ~PF_SWAPOFF; 1580 current->flags &= ~PF_OOM_ORIGIN;
1579 1581
1580 if (err) { 1582 if (err) {
1581 /* re-insert swap space back into swap_list */ 1583 /* re-insert swap space back into swap_list */
@@ -1972,12 +1974,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1972 goto bad_swap; 1974 goto bad_swap;
1973 } 1975 }
1974 1976
1975 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 1977 if (p->bdev) {
1976 p->flags |= SWP_SOLIDSTATE; 1978 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
1977 p->cluster_next = 1 + (random32() % p->highest_bit); 1979 p->flags |= SWP_SOLIDSTATE;
1980 p->cluster_next = 1 + (random32() % p->highest_bit);
1981 }
1982 if (discard_swap(p) == 0)
1983 p->flags |= SWP_DISCARDABLE;
1978 } 1984 }
1979 if (discard_swap(p) == 0)
1980 p->flags |= SWP_DISCARDABLE;
1981 1985
1982 mutex_lock(&swapon_mutex); 1986 mutex_lock(&swapon_mutex);
1983 spin_lock(&swap_lock); 1987 spin_lock(&swap_lock);
@@ -2083,7 +2087,7 @@ static int __swap_duplicate(swp_entry_t entry, bool cache)
2083 int count; 2087 int count;
2084 bool has_cache; 2088 bool has_cache;
2085 2089
2086 if (is_migration_entry(entry)) 2090 if (non_swap_entry(entry))
2087 return -EINVAL; 2091 return -EINVAL;
2088 2092
2089 type = swp_type(entry); 2093 type = swp_type(entry);
diff --git a/mm/truncate.c b/mm/truncate.c
index ccc3ecf7cb98..450cebdabfc0 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -93,11 +93,11 @@ EXPORT_SYMBOL(cancel_dirty_page);
93 * its lock, b) when a concurrent invalidate_mapping_pages got there first and 93 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
94 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. 94 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
95 */ 95 */
96static void 96static int
97truncate_complete_page(struct address_space *mapping, struct page *page) 97truncate_complete_page(struct address_space *mapping, struct page *page)
98{ 98{
99 if (page->mapping != mapping) 99 if (page->mapping != mapping)
100 return; 100 return -EIO;
101 101
102 if (page_has_private(page)) 102 if (page_has_private(page))
103 do_invalidatepage(page, 0); 103 do_invalidatepage(page, 0);
@@ -108,6 +108,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
108 remove_from_page_cache(page); 108 remove_from_page_cache(page);
109 ClearPageMappedToDisk(page); 109 ClearPageMappedToDisk(page);
110 page_cache_release(page); /* pagecache ref */ 110 page_cache_release(page); /* pagecache ref */
111 return 0;
111} 112}
112 113
113/* 114/*
@@ -135,6 +136,51 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
135 return ret; 136 return ret;
136} 137}
137 138
139int truncate_inode_page(struct address_space *mapping, struct page *page)
140{
141 if (page_mapped(page)) {
142 unmap_mapping_range(mapping,
143 (loff_t)page->index << PAGE_CACHE_SHIFT,
144 PAGE_CACHE_SIZE, 0);
145 }
146 return truncate_complete_page(mapping, page);
147}
148
149/*
150 * Used to get rid of pages on hardware memory corruption.
151 */
152int generic_error_remove_page(struct address_space *mapping, struct page *page)
153{
154 if (!mapping)
155 return -EINVAL;
156 /*
157 * Only punch for normal data pages for now.
158 * Handling other types like directories would need more auditing.
159 */
160 if (!S_ISREG(mapping->host->i_mode))
161 return -EIO;
162 return truncate_inode_page(mapping, page);
163}
164EXPORT_SYMBOL(generic_error_remove_page);
165
166/*
167 * Safely invalidate one page from its pagecache mapping.
168 * It only drops clean, unused pages. The page must be locked.
169 *
170 * Returns 1 if the page is successfully invalidated, otherwise 0.
171 */
172int invalidate_inode_page(struct page *page)
173{
174 struct address_space *mapping = page_mapping(page);
175 if (!mapping)
176 return 0;
177 if (PageDirty(page) || PageWriteback(page))
178 return 0;
179 if (page_mapped(page))
180 return 0;
181 return invalidate_complete_page(mapping, page);
182}
183
138/** 184/**
139 * truncate_inode_pages - truncate range of pages specified by start & end byte offsets 185 * truncate_inode_pages - truncate range of pages specified by start & end byte offsets
140 * @mapping: mapping to truncate 186 * @mapping: mapping to truncate
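
The truncate_inode_page(), generic_error_remove_page() and invalidate_inode_page() helpers added in this hunk back the new ->error_remove_page address-space operation, which the mm/shmem.c hunk earlier in this diff wires up for tmpfs. A minimal sketch of how a filesystem opts in and how the hook gets driven; example_aops and the caller are illustrative, and the real consumer is the hwpoison code in mm/memory-failure.c.

static const struct address_space_operations example_aops = {
	.error_remove_page	= generic_error_remove_page,
};

static int example_drop_corrupted_page(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (!mapping || !mapping->a_ops->error_remove_page)
		return -EINVAL;
	/* the page must be locked, as truncate_complete_page() expects */
	return mapping->a_ops->error_remove_page(mapping, page);
}
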
@@ -196,12 +242,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
196 unlock_page(page); 242 unlock_page(page);
197 continue; 243 continue;
198 } 244 }
199 if (page_mapped(page)) { 245 truncate_inode_page(mapping, page);
200 unmap_mapping_range(mapping,
201 (loff_t)page_index<<PAGE_CACHE_SHIFT,
202 PAGE_CACHE_SIZE, 0);
203 }
204 truncate_complete_page(mapping, page);
205 unlock_page(page); 246 unlock_page(page);
206 } 247 }
207 pagevec_release(&pvec); 248 pagevec_release(&pvec);
@@ -238,15 +279,10 @@ void truncate_inode_pages_range(struct address_space *mapping,
238 break; 279 break;
239 lock_page(page); 280 lock_page(page);
240 wait_on_page_writeback(page); 281 wait_on_page_writeback(page);
241 if (page_mapped(page)) { 282 truncate_inode_page(mapping, page);
242 unmap_mapping_range(mapping,
243 (loff_t)page->index<<PAGE_CACHE_SHIFT,
244 PAGE_CACHE_SIZE, 0);
245 }
246 if (page->index > next) 283 if (page->index > next)
247 next = page->index; 284 next = page->index;
248 next++; 285 next++;
249 truncate_complete_page(mapping, page);
250 unlock_page(page); 286 unlock_page(page);
251 } 287 }
252 pagevec_release(&pvec); 288 pagevec_release(&pvec);
@@ -311,12 +347,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
311 if (lock_failed) 347 if (lock_failed)
312 continue; 348 continue;
313 349
314 if (PageDirty(page) || PageWriteback(page)) 350 ret += invalidate_inode_page(page);
315 goto unlock; 351
316 if (page_mapped(page))
317 goto unlock;
318 ret += invalidate_complete_page(mapping, page);
319unlock:
320 unlock_page(page); 352 unlock_page(page);
321 if (next > end) 353 if (next > end)
322 break; 354 break;
@@ -465,3 +497,67 @@ int invalidate_inode_pages2(struct address_space *mapping)
465 return invalidate_inode_pages2_range(mapping, 0, -1); 497 return invalidate_inode_pages2_range(mapping, 0, -1);
466} 498}
467EXPORT_SYMBOL_GPL(invalidate_inode_pages2); 499EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
500
501/**
502 * truncate_pagecache - unmap and remove pagecache that has been truncated
503 * @inode: inode
504 * @old: old file offset
505 * @new: new file offset
506 *
507 * inode's new i_size must already be written before truncate_pagecache
508 * is called.
509 *
510 * This function should typically be called before the filesystem
511 * releases resources associated with the freed range (eg. deallocates
512 * blocks). This way, pagecache will always stay logically coherent
513 * with on-disk format, and the filesystem would not have to deal with
514 * situations such as writepage being called for a page that has already
515 * had its underlying blocks deallocated.
516 */
517void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
518{
519 if (new < old) {
520 struct address_space *mapping = inode->i_mapping;
521
522 /*
523 * unmap_mapping_range is called twice, first simply for
524 * efficiency so that truncate_inode_pages does fewer
525 * single-page unmaps. However after this first call, and
526 * before truncate_inode_pages finishes, it is possible for
527 * private pages to be COWed, which remain after
528 * truncate_inode_pages finishes, hence the second
529 * unmap_mapping_range call must be made for correctness.
530 */
531 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
532 truncate_inode_pages(mapping, new);
533 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
534 }
535}
536EXPORT_SYMBOL(truncate_pagecache);
537
538/**
539 * vmtruncate - unmap mappings "freed" by truncate() syscall
540 * @inode: inode of the file used
541 * @offset: file offset to start truncating
542 *
543 * NOTE! We have to be ready to update the memory sharing
544 * between the file and the memory map for a potential last
545 * incomplete page. Ugly, but necessary.
546 */
547int vmtruncate(struct inode *inode, loff_t offset)
548{
549 loff_t oldsize;
550 int error;
551
552 error = inode_newsize_ok(inode, offset);
553 if (error)
554 return error;
555 oldsize = inode->i_size;
556 i_size_write(inode, offset);
557 truncate_pagecache(inode, oldsize, offset);
558 if (inode->i_op->truncate)
559 inode->i_op->truncate(inode);
560
561 return error;
562}
563EXPORT_SYMBOL(vmtruncate);
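
A minimal sketch of the calling convention these helpers document: the new i_size is published first, and only then is the pagecache for the shrunk range torn down, so concurrent faults never repopulate pages beyond the new size. The function name is hypothetical; a real filesystem would do this from its truncate/->setattr path, typically under i_mutex, and deallocate blocks afterwards.

static void example_shrink_file(struct inode *inode, loff_t newsize)
{
	loff_t oldsize = inode->i_size;

	i_size_write(inode, newsize);			/* make the new size visible first ... */
	truncate_pagecache(inode, oldsize, newsize);	/* ... then drop pages past it */
	/* filesystem-specific block deallocation would follow here */
}
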
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f8189a4b3e13..0f551a4a44cd 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -12,6 +12,7 @@
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/highmem.h> 14#include <linux/highmem.h>
15#include <linux/sched.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/spinlock.h> 17#include <linux/spinlock.h>
17#include <linux/interrupt.h> 18#include <linux/interrupt.h>
@@ -25,10 +26,10 @@
25#include <linux/rcupdate.h> 26#include <linux/rcupdate.h>
26#include <linux/pfn.h> 27#include <linux/pfn.h>
27#include <linux/kmemleak.h> 28#include <linux/kmemleak.h>
28
29#include <asm/atomic.h> 29#include <asm/atomic.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/shmparam.h>
32 33
33 34
34/*** Page table manipulation functions ***/ 35/*** Page table manipulation functions ***/
@@ -168,11 +169,9 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end,
168 next = pgd_addr_end(addr, end); 169 next = pgd_addr_end(addr, end);
169 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); 170 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
170 if (err) 171 if (err)
171 break; 172 return err;
172 } while (pgd++, addr = next, addr != end); 173 } while (pgd++, addr = next, addr != end);
173 174
174 if (unlikely(err))
175 return err;
176 return nr; 175 return nr;
177} 176}
178 177
@@ -186,7 +185,7 @@ static int vmap_page_range(unsigned long start, unsigned long end,
186 return ret; 185 return ret;
187} 186}
188 187
189static inline int is_vmalloc_or_module_addr(const void *x) 188int is_vmalloc_or_module_addr(const void *x)
190{ 189{
191 /* 190 /*
192 * ARM, x86-64 and sparc64 put modules in a special place, 191 * ARM, x86-64 and sparc64 put modules in a special place,
@@ -265,6 +264,7 @@ struct vmap_area {
265static DEFINE_SPINLOCK(vmap_area_lock); 264static DEFINE_SPINLOCK(vmap_area_lock);
266static struct rb_root vmap_area_root = RB_ROOT; 265static struct rb_root vmap_area_root = RB_ROOT;
267static LIST_HEAD(vmap_area_list); 266static LIST_HEAD(vmap_area_list);
267static unsigned long vmap_area_pcpu_hole;
268 268
269static struct vmap_area *__find_vmap_area(unsigned long addr) 269static struct vmap_area *__find_vmap_area(unsigned long addr)
270{ 270{
@@ -431,6 +431,15 @@ static void __free_vmap_area(struct vmap_area *va)
431 RB_CLEAR_NODE(&va->rb_node); 431 RB_CLEAR_NODE(&va->rb_node);
432 list_del_rcu(&va->list); 432 list_del_rcu(&va->list);
433 433
434 /*
435 * Track the highest possible candidate for pcpu area
436 * allocation. Areas outside of vmalloc area can be returned
437 * here too, consider only end addresses which fall inside
438 * vmalloc area proper.
439 */
440 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
441 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
442
434 call_rcu(&va->rcu_head, rcu_free_va); 443 call_rcu(&va->rcu_head, rcu_free_va);
435} 444}
436 445
@@ -1038,6 +1047,9 @@ void __init vmalloc_init(void)
1038 va->va_end = va->va_start + tmp->size; 1047 va->va_end = va->va_start + tmp->size;
1039 __insert_vmap_area(va); 1048 __insert_vmap_area(va);
1040 } 1049 }
1050
1051 vmap_area_pcpu_hole = VMALLOC_END;
1052
1041 vmap_initialized = true; 1053 vmap_initialized = true;
1042} 1054}
1043 1055
@@ -1122,14 +1134,34 @@ EXPORT_SYMBOL_GPL(map_vm_area);
1122DEFINE_RWLOCK(vmlist_lock); 1134DEFINE_RWLOCK(vmlist_lock);
1123struct vm_struct *vmlist; 1135struct vm_struct *vmlist;
1124 1136
1137static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1138 unsigned long flags, void *caller)
1139{
1140 struct vm_struct *tmp, **p;
1141
1142 vm->flags = flags;
1143 vm->addr = (void *)va->va_start;
1144 vm->size = va->va_end - va->va_start;
1145 vm->caller = caller;
1146 va->private = vm;
1147 va->flags |= VM_VM_AREA;
1148
1149 write_lock(&vmlist_lock);
1150 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1151 if (tmp->addr >= vm->addr)
1152 break;
1153 }
1154 vm->next = *p;
1155 *p = vm;
1156 write_unlock(&vmlist_lock);
1157}
1158
1125static struct vm_struct *__get_vm_area_node(unsigned long size, 1159static struct vm_struct *__get_vm_area_node(unsigned long size,
1126 unsigned long flags, unsigned long start, unsigned long end, 1160 unsigned long align, unsigned long flags, unsigned long start,
1127 int node, gfp_t gfp_mask, void *caller) 1161 unsigned long end, int node, gfp_t gfp_mask, void *caller)
1128{ 1162{
1129 static struct vmap_area *va; 1163 static struct vmap_area *va;
1130 struct vm_struct *area; 1164 struct vm_struct *area;
1131 struct vm_struct *tmp, **p;
1132 unsigned long align = 1;
1133 1165
1134 BUG_ON(in_interrupt()); 1166 BUG_ON(in_interrupt());
1135 if (flags & VM_IOREMAP) { 1167 if (flags & VM_IOREMAP) {
@@ -1147,7 +1179,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1147 if (unlikely(!size)) 1179 if (unlikely(!size))
1148 return NULL; 1180 return NULL;
1149 1181
1150 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 1182 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
1151 if (unlikely(!area)) 1183 if (unlikely(!area))
1152 return NULL; 1184 return NULL;
1153 1185
@@ -1162,32 +1194,14 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1162 return NULL; 1194 return NULL;
1163 } 1195 }
1164 1196
1165 area->flags = flags; 1197 insert_vmalloc_vm(area, va, flags, caller);
1166 area->addr = (void *)va->va_start;
1167 area->size = size;
1168 area->pages = NULL;
1169 area->nr_pages = 0;
1170 area->phys_addr = 0;
1171 area->caller = caller;
1172 va->private = area;
1173 va->flags |= VM_VM_AREA;
1174
1175 write_lock(&vmlist_lock);
1176 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1177 if (tmp->addr >= area->addr)
1178 break;
1179 }
1180 area->next = *p;
1181 *p = area;
1182 write_unlock(&vmlist_lock);
1183
1184 return area; 1198 return area;
1185} 1199}
1186 1200
1187struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 1201struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1188 unsigned long start, unsigned long end) 1202 unsigned long start, unsigned long end)
1189{ 1203{
1190 return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, 1204 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
1191 __builtin_return_address(0)); 1205 __builtin_return_address(0));
1192} 1206}
1193EXPORT_SYMBOL_GPL(__get_vm_area); 1207EXPORT_SYMBOL_GPL(__get_vm_area);
@@ -1196,7 +1210,7 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1196 unsigned long start, unsigned long end, 1210 unsigned long start, unsigned long end,
1197 void *caller) 1211 void *caller)
1198{ 1212{
1199 return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, 1213 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
1200 caller); 1214 caller);
1201} 1215}
1202 1216
@@ -1211,22 +1225,22 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1211 */ 1225 */
1212struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 1226struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1213{ 1227{
1214 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, 1228 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1215 -1, GFP_KERNEL, __builtin_return_address(0)); 1229 -1, GFP_KERNEL, __builtin_return_address(0));
1216} 1230}
1217 1231
1218struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 1232struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1219 void *caller) 1233 void *caller)
1220{ 1234{
1221 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, 1235 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1222 -1, GFP_KERNEL, caller); 1236 -1, GFP_KERNEL, caller);
1223} 1237}
1224 1238
1225struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, 1239struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
1226 int node, gfp_t gfp_mask) 1240 int node, gfp_t gfp_mask)
1227{ 1241{
1228 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, 1242 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1229 gfp_mask, __builtin_return_address(0)); 1243 node, gfp_mask, __builtin_return_address(0));
1230} 1244}
1231 1245
1232static struct vm_struct *find_vm_area(const void *addr) 1246static struct vm_struct *find_vm_area(const void *addr)
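
The point of threading an align argument through __get_vm_area_node() shows up in the vmalloc_user() hunk below, which requests SHMLBA alignment so the area can later be mapped to userspace without cache aliasing on virtually indexed caches. A sketch of an allocation using the new parameter; the wrapper name is hypothetical and, like __vmalloc_node() itself, it would have to live in mm/vmalloc.c because the helper is static.

static void *example_vmalloc_user_aligned(unsigned long size)
{
	/* SHMLBA alignment keeps later user mappings alias-free */
	return __vmalloc_node(size, SHMLBA,
			      GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
			      PAGE_KERNEL, -1, __builtin_return_address(0));
}
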
@@ -1256,17 +1270,21 @@ struct vm_struct *remove_vm_area(const void *addr)
1256 if (va && va->flags & VM_VM_AREA) { 1270 if (va && va->flags & VM_VM_AREA) {
1257 struct vm_struct *vm = va->private; 1271 struct vm_struct *vm = va->private;
1258 struct vm_struct *tmp, **p; 1272 struct vm_struct *tmp, **p;
1259 1273 /*
1260 vmap_debug_free_range(va->va_start, va->va_end); 1274 * remove from list and disallow access to this vm_struct
 1261 free_unmap_vmap_area(va); 1275 * before unmap. (address range conflicts are handled by
1262 vm->size -= PAGE_SIZE; 1276 * vmap.)
1263 1277 */
1264 write_lock(&vmlist_lock); 1278 write_lock(&vmlist_lock);
1265 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) 1279 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1266 ; 1280 ;
1267 *p = tmp->next; 1281 *p = tmp->next;
1268 write_unlock(&vmlist_lock); 1282 write_unlock(&vmlist_lock);
1269 1283
1284 vmap_debug_free_range(va->va_start, va->va_end);
1285 free_unmap_vmap_area(va);
1286 vm->size -= PAGE_SIZE;
1287
1270 return vm; 1288 return vm;
1271 } 1289 }
1272 return NULL; 1290 return NULL;
@@ -1368,7 +1386,7 @@ void *vmap(struct page **pages, unsigned int count,
1368 1386
1369 might_sleep(); 1387 might_sleep();
1370 1388
1371 if (count > num_physpages) 1389 if (count > totalram_pages)
1372 return NULL; 1390 return NULL;
1373 1391
1374 area = get_vm_area_caller((count << PAGE_SHIFT), flags, 1392 area = get_vm_area_caller((count << PAGE_SHIFT), flags,
@@ -1385,7 +1403,8 @@ void *vmap(struct page **pages, unsigned int count,
1385} 1403}
1386EXPORT_SYMBOL(vmap); 1404EXPORT_SYMBOL(vmap);
1387 1405
1388static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 1406static void *__vmalloc_node(unsigned long size, unsigned long align,
1407 gfp_t gfp_mask, pgprot_t prot,
1389 int node, void *caller); 1408 int node, void *caller);
1390static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1409static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1391 pgprot_t prot, int node, void *caller) 1410 pgprot_t prot, int node, void *caller)
@@ -1399,7 +1418,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1399 area->nr_pages = nr_pages; 1418 area->nr_pages = nr_pages;
1400 /* Please note that the recursion is strictly bounded. */ 1419 /* Please note that the recursion is strictly bounded. */
1401 if (array_size > PAGE_SIZE) { 1420 if (array_size > PAGE_SIZE) {
1402 pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, 1421 pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO,
1403 PAGE_KERNEL, node, caller); 1422 PAGE_KERNEL, node, caller);
1404 area->flags |= VM_VPAGES; 1423 area->flags |= VM_VPAGES;
1405 } else { 1424 } else {
@@ -1458,6 +1477,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1458/** 1477/**
1459 * __vmalloc_node - allocate virtually contiguous memory 1478 * __vmalloc_node - allocate virtually contiguous memory
1460 * @size: allocation size 1479 * @size: allocation size
1480 * @align: desired alignment
1461 * @gfp_mask: flags for the page level allocator 1481 * @gfp_mask: flags for the page level allocator
1462 * @prot: protection mask for the allocated pages 1482 * @prot: protection mask for the allocated pages
1463 * @node: node to use for allocation or -1 1483 * @node: node to use for allocation or -1
@@ -1467,19 +1487,20 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1467 * allocator with @gfp_mask flags. Map them into contiguous 1487 * allocator with @gfp_mask flags. Map them into contiguous
1468 * kernel virtual space, using a pagetable protection of @prot. 1488 * kernel virtual space, using a pagetable protection of @prot.
1469 */ 1489 */
1470static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 1490static void *__vmalloc_node(unsigned long size, unsigned long align,
1471 int node, void *caller) 1491 gfp_t gfp_mask, pgprot_t prot,
1492 int node, void *caller)
1472{ 1493{
1473 struct vm_struct *area; 1494 struct vm_struct *area;
1474 void *addr; 1495 void *addr;
1475 unsigned long real_size = size; 1496 unsigned long real_size = size;
1476 1497
1477 size = PAGE_ALIGN(size); 1498 size = PAGE_ALIGN(size);
1478 if (!size || (size >> PAGE_SHIFT) > num_physpages) 1499 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1479 return NULL; 1500 return NULL;
1480 1501
1481 area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, 1502 area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START,
1482 node, gfp_mask, caller); 1503 VMALLOC_END, node, gfp_mask, caller);
1483 1504
1484 if (!area) 1505 if (!area)
1485 return NULL; 1506 return NULL;
@@ -1498,7 +1519,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1498 1519
1499void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1520void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1500{ 1521{
1501 return __vmalloc_node(size, gfp_mask, prot, -1, 1522 return __vmalloc_node(size, 1, gfp_mask, prot, -1,
1502 __builtin_return_address(0)); 1523 __builtin_return_address(0));
1503} 1524}
1504EXPORT_SYMBOL(__vmalloc); 1525EXPORT_SYMBOL(__vmalloc);
@@ -1514,7 +1535,7 @@ EXPORT_SYMBOL(__vmalloc);
1514 */ 1535 */
1515void *vmalloc(unsigned long size) 1536void *vmalloc(unsigned long size)
1516{ 1537{
1517 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1538 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
1518 -1, __builtin_return_address(0)); 1539 -1, __builtin_return_address(0));
1519} 1540}
1520EXPORT_SYMBOL(vmalloc); 1541EXPORT_SYMBOL(vmalloc);
@@ -1531,7 +1552,8 @@ void *vmalloc_user(unsigned long size)
1531 struct vm_struct *area; 1552 struct vm_struct *area;
1532 void *ret; 1553 void *ret;
1533 1554
1534 ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, 1555 ret = __vmalloc_node(size, SHMLBA,
1556 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1535 PAGE_KERNEL, -1, __builtin_return_address(0)); 1557 PAGE_KERNEL, -1, __builtin_return_address(0));
1536 if (ret) { 1558 if (ret) {
1537 area = find_vm_area(ret); 1559 area = find_vm_area(ret);
@@ -1554,7 +1576,7 @@ EXPORT_SYMBOL(vmalloc_user);
1554 */ 1576 */
1555void *vmalloc_node(unsigned long size, int node) 1577void *vmalloc_node(unsigned long size, int node)
1556{ 1578{
1557 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1579 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
1558 node, __builtin_return_address(0)); 1580 node, __builtin_return_address(0));
1559} 1581}
1560EXPORT_SYMBOL(vmalloc_node); 1582EXPORT_SYMBOL(vmalloc_node);
@@ -1577,7 +1599,7 @@ EXPORT_SYMBOL(vmalloc_node);
1577 1599
1578void *vmalloc_exec(unsigned long size) 1600void *vmalloc_exec(unsigned long size)
1579{ 1601{
1580 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, 1602 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
1581 -1, __builtin_return_address(0)); 1603 -1, __builtin_return_address(0));
1582} 1604}
1583 1605
@@ -1598,7 +1620,7 @@ void *vmalloc_exec(unsigned long size)
1598 */ 1620 */
1599void *vmalloc_32(unsigned long size) 1621void *vmalloc_32(unsigned long size)
1600{ 1622{
1601 return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, 1623 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
1602 -1, __builtin_return_address(0)); 1624 -1, __builtin_return_address(0));
1603} 1625}
1604EXPORT_SYMBOL(vmalloc_32); 1626EXPORT_SYMBOL(vmalloc_32);
@@ -1615,7 +1637,7 @@ void *vmalloc_32_user(unsigned long size)
1615 struct vm_struct *area; 1637 struct vm_struct *area;
1616 void *ret; 1638 void *ret;
1617 1639
1618 ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 1640 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1619 -1, __builtin_return_address(0)); 1641 -1, __builtin_return_address(0));
1620 if (ret) { 1642 if (ret) {
1621 area = find_vm_area(ret); 1643 area = find_vm_area(ret);
@@ -1625,10 +1647,120 @@ void *vmalloc_32_user(unsigned long size)
1625} 1647}
1626EXPORT_SYMBOL(vmalloc_32_user); 1648EXPORT_SYMBOL(vmalloc_32_user);
1627 1649
1650/*
1651 * Small helper routine: copy contents to buf from addr.
1652 * If the page is not present, fill zero.
1653 */
1654
1655static int aligned_vread(char *buf, char *addr, unsigned long count)
1656{
1657 struct page *p;
1658 int copied = 0;
1659
1660 while (count) {
1661 unsigned long offset, length;
1662
1663 offset = (unsigned long)addr & ~PAGE_MASK;
1664 length = PAGE_SIZE - offset;
1665 if (length > count)
1666 length = count;
1667 p = vmalloc_to_page(addr);
1668 /*
1669 * To do safe access to this _mapped_ area, we need
1670 * lock. But adding lock here means that we need to add
1671 * overhead of vmalloc()/vfree() calls for this _debug_
1672 * interface, rarely used. Instead of that, we'll use
1673 * kmap() and get small overhead in this access function.
1674 */
1675 if (p) {
1676 /*
1677 * we can expect USER0 is not used (see vread/vwrite's
1678 * function description)
1679 */
1680 void *map = kmap_atomic(p, KM_USER0);
1681 memcpy(buf, map + offset, length);
1682 kunmap_atomic(map, KM_USER0);
1683 } else
1684 memset(buf, 0, length);
1685
1686 addr += length;
1687 buf += length;
1688 copied += length;
1689 count -= length;
1690 }
1691 return copied;
1692}
1693
1694static int aligned_vwrite(char *buf, char *addr, unsigned long count)
1695{
1696 struct page *p;
1697 int copied = 0;
1698
1699 while (count) {
1700 unsigned long offset, length;
1701
1702 offset = (unsigned long)addr & ~PAGE_MASK;
1703 length = PAGE_SIZE - offset;
1704 if (length > count)
1705 length = count;
1706 p = vmalloc_to_page(addr);
1707 /*
1708 * To do safe access to this _mapped_ area, we need
1709 * lock. But adding lock here means that we need to add
1710 * overhead of vmalloc()/vfree() calls for this _debug_
1711 * interface, rarely used. Instead of that, we'll use
1712 * kmap() and get small overhead in this access function.
1713 */
1714 if (p) {
1715 /*
1716 * we can expect USER0 is not used (see vread/vwrite's
1717 * function description)
1718 */
1719 void *map = kmap_atomic(p, KM_USER0);
1720 memcpy(map + offset, buf, length);
1721 kunmap_atomic(map, KM_USER0);
1722 }
1723 addr += length;
1724 buf += length;
1725 copied += length;
1726 count -= length;
1727 }
1728 return copied;
1729}
1730
1731/**
1732 * vread() - read vmalloc area in a safe way.
1733 * @buf: buffer for reading data
1734 * @addr: vm address.
1735 * @count: number of bytes to be read.
1736 *
1737 * Returns # of bytes by which addr and buf should be increased
1738 * (same number as @count). Returns 0 if [addr...addr+count) doesn't
1739 * include any intersection with an alive vmalloc area.
1740 *
1741 * This function checks that addr is a valid vmalloc'ed area, and
1742 * copies data from that area to a given buffer. If the given memory range
1743 * of [addr...addr+count) includes some valid address, data is copied to
1744 * the proper area of @buf. If there are memory holes, they'll be zero-filled.
1745 * IOREMAP area is treated as memory hole and no copy is done.
1746 *
1747 * If [addr...addr+count) doesn't include any intersection with an alive
1748 * vm_struct area, returns 0.
1749 * @buf should be kernel's buffer. Because this function uses KM_USER0,
1750 * the caller should guarantee KM_USER0 is not used.
1751 *
1752 * Note: In usual operation, vread() is never necessary because the caller
1753 * should know the vmalloc() area is valid and can use memcpy().
1754 * This is for routines which have to access the vmalloc area without
1755 * any information, such as /dev/kmem.
1756 *
1757 */
1758
1628long vread(char *buf, char *addr, unsigned long count) 1759long vread(char *buf, char *addr, unsigned long count)
1629{ 1760{
1630 struct vm_struct *tmp; 1761 struct vm_struct *tmp;
1631 char *vaddr, *buf_start = buf; 1762 char *vaddr, *buf_start = buf;
1763 unsigned long buflen = count;
1632 unsigned long n; 1764 unsigned long n;
1633 1765
1634 /* Don't allow overflow */ 1766 /* Don't allow overflow */
@@ -1636,7 +1768,7 @@ long vread(char *buf, char *addr, unsigned long count)
1636 count = -(unsigned long) addr; 1768 count = -(unsigned long) addr;
1637 1769
1638 read_lock(&vmlist_lock); 1770 read_lock(&vmlist_lock);
1639 for (tmp = vmlist; tmp; tmp = tmp->next) { 1771 for (tmp = vmlist; count && tmp; tmp = tmp->next) {
1640 vaddr = (char *) tmp->addr; 1772 vaddr = (char *) tmp->addr;
1641 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1773 if (addr >= vaddr + tmp->size - PAGE_SIZE)
1642 continue; 1774 continue;
@@ -1649,32 +1781,72 @@ long vread(char *buf, char *addr, unsigned long count)
1649 count--; 1781 count--;
1650 } 1782 }
1651 n = vaddr + tmp->size - PAGE_SIZE - addr; 1783 n = vaddr + tmp->size - PAGE_SIZE - addr;
1652 do { 1784 if (n > count)
1653 if (count == 0) 1785 n = count;
1654 goto finished; 1786 if (!(tmp->flags & VM_IOREMAP))
1655 *buf = *addr; 1787 aligned_vread(buf, addr, n);
1656 buf++; 1788 else /* IOREMAP area is treated as memory hole */
1657 addr++; 1789 memset(buf, 0, n);
1658 count--; 1790 buf += n;
1659 } while (--n > 0); 1791 addr += n;
1792 count -= n;
1660 } 1793 }
1661finished: 1794finished:
1662 read_unlock(&vmlist_lock); 1795 read_unlock(&vmlist_lock);
1663 return buf - buf_start; 1796
1797 if (buf == buf_start)
1798 return 0;
1799 /* zero-fill memory holes */
1800 if (buf != buf_start + buflen)
1801 memset(buf, 0, buflen - (buf - buf_start));
1802
1803 return buflen;
1664} 1804}
1665 1805
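A hedged usage sketch of the vread() contract documented above (invented for illustration, not part of the patch): once any live vm_struct intersects the range, the full length is returned with holes zero-filled, so a /dev/kmem-style caller only has to test for zero. The function name and error code are hypothetical.

	static long read_vmalloc_range(char *kbuf, char *vaddr, unsigned long len)
	{
		long copied = vread(kbuf, vaddr, len);

		if (!copied)		/* no live vmalloc area intersects [vaddr, vaddr+len) */
			return -ENXIO;	/* illustrative error code */
		return copied;		/* normally == len; memory holes were zero-filled */
	}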
1806/**
1807 * vwrite() - write vmalloc area in a safe way.
1808 * @buf: buffer for source data
1809 * @addr: vm address.
1810 * @count: number of bytes to be written.
1811 *
1812 * Returns # of bytes by which addr and buf should be increased
1813 * (same number as @count).
1814 * If [addr...addr+count) doesn't include any intersection with a valid
1815 * vmalloc area, returns 0.
1816 *
1817 * This function checks that addr is a valid vmalloc'ed area, and
1818 * copies data from a buffer to the given addr. If the specified range of
1819 * [addr...addr+count) includes some valid address, data is copied from
1820 * the proper area of @buf. If there are memory holes, no copy is done to them.
1821 * IOREMAP area is treated as memory hole and no copy is done.
1822 *
1823 * If [addr...addr+count) doesn't include any intersection with an alive
1824 * vm_struct area, returns 0.
1825 * @buf should be kernel's buffer. Because this function uses KM_USER0,
1826 * the caller should guarantee KM_USER0 is not used.
1827 *
1828 * Note: In usual operation, vwrite() is never necessary because the caller
1829 * should know the vmalloc() area is valid and can use memcpy().
1830 * This is for routines which have to access the vmalloc area without
1831 * any information, such as /dev/kmem.
1832 *
1833 * The caller should guarantee KM_USER1 is not used.
1834 */
1835
1666long vwrite(char *buf, char *addr, unsigned long count) 1836long vwrite(char *buf, char *addr, unsigned long count)
1667{ 1837{
1668 struct vm_struct *tmp; 1838 struct vm_struct *tmp;
1669 char *vaddr, *buf_start = buf; 1839 char *vaddr;
1670 unsigned long n; 1840 unsigned long n, buflen;
1841 int copied = 0;
1671 1842
1672 /* Don't allow overflow */ 1843 /* Don't allow overflow */
1673 if ((unsigned long) addr + count < count) 1844 if ((unsigned long) addr + count < count)
1674 count = -(unsigned long) addr; 1845 count = -(unsigned long) addr;
1846 buflen = count;
1675 1847
1676 read_lock(&vmlist_lock); 1848 read_lock(&vmlist_lock);
1677 for (tmp = vmlist; tmp; tmp = tmp->next) { 1849 for (tmp = vmlist; count && tmp; tmp = tmp->next) {
1678 vaddr = (char *) tmp->addr; 1850 vaddr = (char *) tmp->addr;
1679 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1851 if (addr >= vaddr + tmp->size - PAGE_SIZE)
1680 continue; 1852 continue;
@@ -1686,18 +1858,21 @@ long vwrite(char *buf, char *addr, unsigned long count)
1686 count--; 1858 count--;
1687 } 1859 }
1688 n = vaddr + tmp->size - PAGE_SIZE - addr; 1860 n = vaddr + tmp->size - PAGE_SIZE - addr;
1689 do { 1861 if (n > count)
1690 if (count == 0) 1862 n = count;
1691 goto finished; 1863 if (!(tmp->flags & VM_IOREMAP)) {
1692 *addr = *buf; 1864 aligned_vwrite(buf, addr, n);
1693 buf++; 1865 copied++;
1694 addr++; 1866 }
1695 count--; 1867 buf += n;
1696 } while (--n > 0); 1868 addr += n;
1869 count -= n;
1697 } 1870 }
1698finished: 1871finished:
1699 read_unlock(&vmlist_lock); 1872 read_unlock(&vmlist_lock);
1700 return buf - buf_start; 1873 if (!copied)
1874 return 0;
1875 return buflen;
1701} 1876}
1702 1877
1703/** 1878/**
@@ -1818,6 +1993,286 @@ void free_vm_area(struct vm_struct *area)
1818} 1993}
1819EXPORT_SYMBOL_GPL(free_vm_area); 1994EXPORT_SYMBOL_GPL(free_vm_area);
1820 1995
1996static struct vmap_area *node_to_va(struct rb_node *n)
1997{
1998 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
1999}
2000
2001/**
2002 * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
2003 * @end: target address
2004 * @pnext: out arg for the next vmap_area
2005 * @pprev: out arg for the previous vmap_area
2006 *
2007 * Returns: %true if either or both of next and prev are found,
2008 * %false if no vmap_area exists
2009 *
2010 * Find the vmap_areas whose end addresses enclose @end, i.e. if not
2011 * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
2012 */
2013static bool pvm_find_next_prev(unsigned long end,
2014 struct vmap_area **pnext,
2015 struct vmap_area **pprev)
2016{
2017 struct rb_node *n = vmap_area_root.rb_node;
2018 struct vmap_area *va = NULL;
2019
2020 while (n) {
2021 va = rb_entry(n, struct vmap_area, rb_node);
2022 if (end < va->va_end)
2023 n = n->rb_left;
2024 else if (end > va->va_end)
2025 n = n->rb_right;
2026 else
2027 break;
2028 }
2029
2030 if (!va)
2031 return false;
2032
2033 if (va->va_end > end) {
2034 *pnext = va;
2035 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
2036 } else {
2037 *pprev = va;
2038 *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
2039 }
2040 return true;
2041}
2042
2043/**
2044 * pvm_determine_end - find the highest aligned address between two vmap_areas
2045 * @pnext: in/out arg for the next vmap_area
2046 * @pprev: in/out arg for the previous vmap_area
2047 * @align: alignment
2048 *
2049 * Returns: determined end address
2050 *
2051 * Find the highest aligned address between *@pnext and *@pprev below
2052 * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned
2053 * down address is between the end addresses of the two vmap_areas.
2054 *
2055 * Please note that the address returned by this function may fall
2056 * inside *@pnext vmap_area. The caller is responsible for checking
2057 * that.
2058 */
2059static unsigned long pvm_determine_end(struct vmap_area **pnext,
2060 struct vmap_area **pprev,
2061 unsigned long align)
2062{
2063 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
2064 unsigned long addr;
2065
2066 if (*pnext)
2067 addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
2068 else
2069 addr = vmalloc_end;
2070
2071 while (*pprev && (*pprev)->va_end > addr) {
2072 *pnext = *pprev;
2073 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
2074 }
2075
2076 return addr;
2077}
2078
2079/**
2080 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
2081 * @offsets: array containing offset of each area
2082 * @sizes: array containing size of each area
2083 * @nr_vms: the number of areas to allocate
2084 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
2085 * @gfp_mask: allocation mask
2086 *
2087 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
2088 * vm_structs on success, %NULL on failure
2089 *
2090 * Percpu allocator wants to use congruent vm areas so that it can
2091 * maintain the offsets among percpu areas. This function allocates
2092 * congruent vmalloc areas for it. These areas tend to be scattered
2093 * pretty far apart, with the distance between two areas easily going up to
2094 * gigabytes. To avoid interacting with regular vmallocs, these areas
2095 * are allocated from the top.
2096 *
2097 * Despite its complicated look, this allocator is rather simple. It
2098 * does everything top-down and scans areas from the end looking for
2099 * a matching slot. While scanning, if any of the areas overlaps with an
2100 * existing vmap_area, the base address is pulled down to fit the
2101 * area. Scanning is repeated till all the areas fit and then all
2102 * necessary data structures are inserted and the result is returned.
2103 */
2104struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2105 const size_t *sizes, int nr_vms,
2106 size_t align, gfp_t gfp_mask)
2107{
2108 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
2109 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
2110 struct vmap_area **vas, *prev, *next;
2111 struct vm_struct **vms;
2112 int area, area2, last_area, term_area;
2113 unsigned long base, start, end, last_end;
2114 bool purged = false;
2115
2116 gfp_mask &= GFP_RECLAIM_MASK;
2117
2118 /* verify parameters and allocate data structures */
2119 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
2120 for (last_area = 0, area = 0; area < nr_vms; area++) {
2121 start = offsets[area];
2122 end = start + sizes[area];
2123
2124 /* is everything aligned properly? */
2125 BUG_ON(!IS_ALIGNED(offsets[area], align));
2126 BUG_ON(!IS_ALIGNED(sizes[area], align));
2127
2128 /* detect the area with the highest address */
2129 if (start > offsets[last_area])
2130 last_area = area;
2131
2132 for (area2 = 0; area2 < nr_vms; area2++) {
2133 unsigned long start2 = offsets[area2];
2134 unsigned long end2 = start2 + sizes[area2];
2135
2136 if (area2 == area)
2137 continue;
2138
2139 BUG_ON(start2 >= start && start2 < end);
2140 BUG_ON(end2 <= end && end2 > start);
2141 }
2142 }
2143 last_end = offsets[last_area] + sizes[last_area];
2144
2145 if (vmalloc_end - vmalloc_start < last_end) {
2146 WARN_ON(true);
2147 return NULL;
2148 }
2149
2150 vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask);
2151 vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask);
2152 if (!vas || !vms)
2153 goto err_free;
2154
2155 for (area = 0; area < nr_vms; area++) {
2156 vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
2157 vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
2158 if (!vas[area] || !vms[area])
2159 goto err_free;
2160 }
2161retry:
2162 spin_lock(&vmap_area_lock);
2163
2164 /* start scanning - we scan from the top, begin with the last area */
2165 area = term_area = last_area;
2166 start = offsets[area];
2167 end = start + sizes[area];
2168
2169 if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
2170 base = vmalloc_end - last_end;
2171 goto found;
2172 }
2173 base = pvm_determine_end(&next, &prev, align) - end;
2174
2175 while (true) {
2176 BUG_ON(next && next->va_end <= base + end);
2177 BUG_ON(prev && prev->va_end > base + end);
2178
2179 /*
2180 * base might have underflowed, add last_end before
2181 * comparing.
2182 */
2183 if (base + last_end < vmalloc_start + last_end) {
2184 spin_unlock(&vmap_area_lock);
2185 if (!purged) {
2186 purge_vmap_area_lazy();
2187 purged = true;
2188 goto retry;
2189 }
2190 goto err_free;
2191 }
2192
2193 /*
2194 * If next overlaps, move base downwards so that it's
2195 * right below next and then recheck.
2196 */
2197 if (next && next->va_start < base + end) {
2198 base = pvm_determine_end(&next, &prev, align) - end;
2199 term_area = area;
2200 continue;
2201 }
2202
2203 /*
2204 * If prev overlaps, shift down next and prev and move
2205 * base so that it's right below new next and then
2206 * recheck.
2207 */
2208 if (prev && prev->va_end > base + start) {
2209 next = prev;
2210 prev = node_to_va(rb_prev(&next->rb_node));
2211 base = pvm_determine_end(&next, &prev, align) - end;
2212 term_area = area;
2213 continue;
2214 }
2215
2216 /*
2217 * This area fits, move on to the previous one. If
2218 * the previous one is the terminal one, we're done.
2219 */
2220 area = (area + nr_vms - 1) % nr_vms;
2221 if (area == term_area)
2222 break;
2223 start = offsets[area];
2224 end = start + sizes[area];
2225 pvm_find_next_prev(base + end, &next, &prev);
2226 }
2227found:
2228 /* we've found a fitting base, insert all va's */
2229 for (area = 0; area < nr_vms; area++) {
2230 struct vmap_area *va = vas[area];
2231
2232 va->va_start = base + offsets[area];
2233 va->va_end = va->va_start + sizes[area];
2234 __insert_vmap_area(va);
2235 }
2236
2237 vmap_area_pcpu_hole = base + offsets[last_area];
2238
2239 spin_unlock(&vmap_area_lock);
2240
2241 /* insert all vm's */
2242 for (area = 0; area < nr_vms; area++)
2243 insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
2244 pcpu_get_vm_areas);
2245
2246 kfree(vas);
2247 return vms;
2248
2249err_free:
2250 for (area = 0; area < nr_vms; area++) {
2251 if (vas)
2252 kfree(vas[area]);
2253 if (vms)
2254 kfree(vms[area]);
2255 }
2256 kfree(vas);
2257 kfree(vms);
2258 return NULL;
2259}
2260
2261/**
2262 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
2263 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
2264 * @nr_vms: the number of allocated areas
2265 *
2266 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
2267 */
2268void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2269{
2270 int i;
2271
2272 for (i = 0; i < nr_vms; i++)
2273 free_vm_area(vms[i]);
2274 kfree(vms);
2275}
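A hedged sketch of how a caller such as the percpu first-chunk setup might use this pair of functions (the offsets, sizes, and function name below are invented for illustration): the offsets and sizes are aligned to @align and do not overlap, which is exactly what the BUG_ON()s in pcpu_get_vm_areas() enforce.

	static int pcpu_vm_areas_example(void)
	{
		/* unit 0 at offset 0, unit 1 one megabyte higher; both page aligned */
		const unsigned long offsets[] = { 0, 1UL << 20 };
		const size_t sizes[] = { PAGE_SIZE, PAGE_SIZE };
		struct vm_struct **vms;

		vms = pcpu_get_vm_areas(offsets, sizes, 2, PAGE_SIZE, GFP_KERNEL);
		if (!vms)
			return -ENOMEM;

		/* vms[1]->addr - vms[0]->addr == 1MB, so the areas stay congruent */
		pcpu_free_vm_areas(vms, 2);
		return 0;
	}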
1821 2276
1822#ifdef CONFIG_PROC_FS 2277#ifdef CONFIG_PROC_FS
1823static void *s_start(struct seq_file *m, loff_t *pos) 2278static void *s_start(struct seq_file *m, loff_t *pos)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 94e86dd6954c..64e438898832 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -148,8 +148,8 @@ static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
148 return &zone->reclaim_stat; 148 return &zone->reclaim_stat;
149} 149}
150 150
151static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc, 151static unsigned long zone_nr_lru_pages(struct zone *zone,
152 enum lru_list lru) 152 struct scan_control *sc, enum lru_list lru)
153{ 153{
154 if (!scanning_global_lru(sc)) 154 if (!scanning_global_lru(sc))
155 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); 155 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
@@ -286,7 +286,12 @@ static inline int page_mapping_inuse(struct page *page)
286 286
287static inline int is_page_cache_freeable(struct page *page) 287static inline int is_page_cache_freeable(struct page *page)
288{ 288{
289 return page_count(page) - !!page_has_private(page) == 2; 289 /*
290 * A freeable page cache page is referenced only by the caller
291 * that isolated the page, the page cache radix tree and
292 * optional buffer heads at page->private.
293 */
294 return page_count(page) - page_has_private(page) == 2;
290} 295}
291 296
292static int may_write_to_queue(struct backing_dev_info *bdi) 297static int may_write_to_queue(struct backing_dev_info *bdi)
@@ -361,7 +366,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
361 * block, for some throttling. This happens by accident, because 366 * block, for some throttling. This happens by accident, because
362 * swap_backing_dev_info is bust: it doesn't reflect the 367 * swap_backing_dev_info is bust: it doesn't reflect the
363 * congestion state of the swapdevs. Easy to fix, if needed. 368 * congestion state of the swapdevs. Easy to fix, if needed.
364 * See swapfile.c:page_queue_congested().
365 */ 369 */
366 if (!is_page_cache_freeable(page)) 370 if (!is_page_cache_freeable(page))
367 return PAGE_KEEP; 371 return PAGE_KEEP;
@@ -531,7 +535,7 @@ redo:
531 * unevictable page on [in]active list. 535 * unevictable page on [in]active list.
532 * We know how to handle that. 536 * We know how to handle that.
533 */ 537 */
534 lru = active + page_is_file_cache(page); 538 lru = active + page_lru_base_type(page);
535 lru_cache_add_lru(page, lru); 539 lru_cache_add_lru(page, lru);
536 } else { 540 } else {
537 /* 541 /*
@@ -659,7 +663,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
659 * processes. Try to unmap it here. 663 * processes. Try to unmap it here.
660 */ 664 */
661 if (page_mapped(page) && mapping) { 665 if (page_mapped(page) && mapping) {
662 switch (try_to_unmap(page, 0)) { 666 switch (try_to_unmap(page, TTU_UNMAP)) {
663 case SWAP_FAIL: 667 case SWAP_FAIL:
664 goto activate_locked; 668 goto activate_locked;
665 case SWAP_AGAIN: 669 case SWAP_AGAIN:
@@ -821,7 +825,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
821 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 825 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
822 return ret; 826 return ret;
823 827
824 if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) 828 if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
825 return ret; 829 return ret;
826 830
827 /* 831 /*
@@ -935,6 +939,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
935 /* Check that we have not crossed a zone boundary. */ 939 /* Check that we have not crossed a zone boundary. */
936 if (unlikely(page_zone_id(cursor_page) != zone_id)) 940 if (unlikely(page_zone_id(cursor_page) != zone_id))
937 continue; 941 continue;
942
943 /*
944 * If we don't have enough swap space, reclaiming of
945 * anon pages which don't already have a swap slot is
946 * pointless.
947 */
948 if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
949 !PageSwapCache(cursor_page))
950 continue;
951
938 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 952 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
939 list_move(&cursor_page->lru, dst); 953 list_move(&cursor_page->lru, dst);
940 mem_cgroup_del_lru(cursor_page); 954 mem_cgroup_del_lru(cursor_page);
@@ -961,7 +975,7 @@ static unsigned long isolate_pages_global(unsigned long nr,
961 if (file) 975 if (file)
962 lru += LRU_FILE; 976 lru += LRU_FILE;
963 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, 977 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
964 mode, !!file); 978 mode, file);
965} 979}
966 980
967/* 981/*
@@ -976,7 +990,7 @@ static unsigned long clear_active_flags(struct list_head *page_list,
976 struct page *page; 990 struct page *page;
977 991
978 list_for_each_entry(page, page_list, lru) { 992 list_for_each_entry(page, page_list, lru) {
979 lru = page_is_file_cache(page); 993 lru = page_lru_base_type(page);
980 if (PageActive(page)) { 994 if (PageActive(page)) {
981 lru += LRU_ACTIVE; 995 lru += LRU_ACTIVE;
982 ClearPageActive(page); 996 ClearPageActive(page);
@@ -1034,6 +1048,31 @@ int isolate_lru_page(struct page *page)
1034} 1048}
1035 1049
1036/* 1050/*
1051 * Are there way too many processes in the direct reclaim path already?
1052 */
1053static int too_many_isolated(struct zone *zone, int file,
1054 struct scan_control *sc)
1055{
1056 unsigned long inactive, isolated;
1057
1058 if (current_is_kswapd())
1059 return 0;
1060
1061 if (!scanning_global_lru(sc))
1062 return 0;
1063
1064 if (file) {
1065 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1066 isolated = zone_page_state(zone, NR_ISOLATED_FILE);
1067 } else {
1068 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1069 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1070 }
1071
1072 return isolated > inactive;
1073}
1074
1075/*
1037 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1076 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1038 * of reclaimed pages 1077 * of reclaimed pages
1039 */ 1078 */
@@ -1048,6 +1087,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1048 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1087 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1049 int lumpy_reclaim = 0; 1088 int lumpy_reclaim = 0;
1050 1089
1090 while (unlikely(too_many_isolated(zone, file, sc))) {
1091 congestion_wait(WRITE, HZ/10);
1092
1093 /* We are about to die and free our memory. Return now. */
1094 if (fatal_signal_pending(current))
1095 return SWAP_CLUSTER_MAX;
1096 }
1097
1051 /* 1098 /*
1052 * If we need a large contiguous chunk of memory, or have 1099 * If we need a large contiguous chunk of memory, or have
1053 * trouble getting a small set of contiguous pages, we 1100 * trouble getting a small set of contiguous pages, we
@@ -1072,10 +1119,26 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1072 unsigned long nr_active; 1119 unsigned long nr_active;
1073 unsigned int count[NR_LRU_LISTS] = { 0, }; 1120 unsigned int count[NR_LRU_LISTS] = { 0, };
1074 int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; 1121 int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
1122 unsigned long nr_anon;
1123 unsigned long nr_file;
1075 1124
1076 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 1125 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
1077 &page_list, &nr_scan, sc->order, mode, 1126 &page_list, &nr_scan, sc->order, mode,
1078 zone, sc->mem_cgroup, 0, file); 1127 zone, sc->mem_cgroup, 0, file);
1128
1129 if (scanning_global_lru(sc)) {
1130 zone->pages_scanned += nr_scan;
1131 if (current_is_kswapd())
1132 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
1133 nr_scan);
1134 else
1135 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1136 nr_scan);
1137 }
1138
1139 if (nr_taken == 0)
1140 goto done;
1141
1079 nr_active = clear_active_flags(&page_list, count); 1142 nr_active = clear_active_flags(&page_list, count);
1080 __count_vm_events(PGDEACTIVATE, nr_active); 1143 __count_vm_events(PGDEACTIVATE, nr_active);
1081 1144
@@ -1088,8 +1151,10 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1088 __mod_zone_page_state(zone, NR_INACTIVE_ANON, 1151 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1089 -count[LRU_INACTIVE_ANON]); 1152 -count[LRU_INACTIVE_ANON]);
1090 1153
1091 if (scanning_global_lru(sc)) 1154 nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1092 zone->pages_scanned += nr_scan; 1155 nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1156 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
1157 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1093 1158
1094 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; 1159 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
1095 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; 1160 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
@@ -1123,18 +1188,12 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1123 } 1188 }
1124 1189
1125 nr_reclaimed += nr_freed; 1190 nr_reclaimed += nr_freed;
1191
1126 local_irq_disable(); 1192 local_irq_disable();
1127 if (current_is_kswapd()) { 1193 if (current_is_kswapd())
1128 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
1129 __count_vm_events(KSWAPD_STEAL, nr_freed); 1194 __count_vm_events(KSWAPD_STEAL, nr_freed);
1130 } else if (scanning_global_lru(sc))
1131 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
1132
1133 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 1195 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
1134 1196
1135 if (nr_taken == 0)
1136 goto done;
1137
1138 spin_lock(&zone->lru_lock); 1197 spin_lock(&zone->lru_lock);
1139 /* 1198 /*
1140 * Put back any unfreeable pages. 1199 * Put back any unfreeable pages.
@@ -1153,8 +1212,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1153 SetPageLRU(page); 1212 SetPageLRU(page);
1154 lru = page_lru(page); 1213 lru = page_lru(page);
1155 add_page_to_lru_list(zone, page, lru); 1214 add_page_to_lru_list(zone, page, lru);
1156 if (PageActive(page)) { 1215 if (is_active_lru(lru)) {
1157 int file = !!page_is_file_cache(page); 1216 int file = is_file_lru(lru);
1158 reclaim_stat->recent_rotated[file]++; 1217 reclaim_stat->recent_rotated[file]++;
1159 } 1218 }
1160 if (!pagevec_add(&pvec, page)) { 1219 if (!pagevec_add(&pvec, page)) {
@@ -1163,10 +1222,13 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1163 spin_lock_irq(&zone->lru_lock); 1222 spin_lock_irq(&zone->lru_lock);
1164 } 1223 }
1165 } 1224 }
1225 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1226 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1227
1166 } while (nr_scanned < max_scan); 1228 } while (nr_scanned < max_scan);
1167 spin_unlock(&zone->lru_lock); 1229
1168done: 1230done:
1169 local_irq_enable(); 1231 spin_unlock_irq(&zone->lru_lock);
1170 pagevec_release(&pvec); 1232 pagevec_release(&pvec);
1171 return nr_reclaimed; 1233 return nr_reclaimed;
1172} 1234}
@@ -1215,15 +1277,10 @@ static void move_active_pages_to_lru(struct zone *zone,
1215 1277
1216 while (!list_empty(list)) { 1278 while (!list_empty(list)) {
1217 page = lru_to_page(list); 1279 page = lru_to_page(list);
1218 prefetchw_prev_lru_page(page, list, flags);
1219 1280
1220 VM_BUG_ON(PageLRU(page)); 1281 VM_BUG_ON(PageLRU(page));
1221 SetPageLRU(page); 1282 SetPageLRU(page);
1222 1283
1223 VM_BUG_ON(!PageActive(page));
1224 if (!is_active_lru(lru))
1225 ClearPageActive(page); /* we are de-activating */
1226
1227 list_move(&page->lru, &zone->lru[lru].list); 1284 list_move(&page->lru, &zone->lru[lru].list);
1228 mem_cgroup_add_lru_list(page, lru); 1285 mem_cgroup_add_lru_list(page, lru);
1229 pgmoved++; 1286 pgmoved++;
@@ -1244,7 +1301,7 @@ static void move_active_pages_to_lru(struct zone *zone,
1244static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1301static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1245 struct scan_control *sc, int priority, int file) 1302 struct scan_control *sc, int priority, int file)
1246{ 1303{
1247 unsigned long pgmoved; 1304 unsigned long nr_taken;
1248 unsigned long pgscanned; 1305 unsigned long pgscanned;
1249 unsigned long vm_flags; 1306 unsigned long vm_flags;
1250 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1307 LIST_HEAD(l_hold); /* The pages which were snipped off */
@@ -1252,10 +1309,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1252 LIST_HEAD(l_inactive); 1309 LIST_HEAD(l_inactive);
1253 struct page *page; 1310 struct page *page;
1254 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1311 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1312 unsigned long nr_rotated = 0;
1255 1313
1256 lru_add_drain(); 1314 lru_add_drain();
1257 spin_lock_irq(&zone->lru_lock); 1315 spin_lock_irq(&zone->lru_lock);
1258 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, 1316 nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1259 ISOLATE_ACTIVE, zone, 1317 ISOLATE_ACTIVE, zone,
1260 sc->mem_cgroup, 1, file); 1318 sc->mem_cgroup, 1, file);
1261 /* 1319 /*
@@ -1265,16 +1323,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1265 if (scanning_global_lru(sc)) { 1323 if (scanning_global_lru(sc)) {
1266 zone->pages_scanned += pgscanned; 1324 zone->pages_scanned += pgscanned;
1267 } 1325 }
1268 reclaim_stat->recent_scanned[!!file] += pgmoved; 1326 reclaim_stat->recent_scanned[file] += nr_taken;
1269 1327
1270 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1328 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1271 if (file) 1329 if (file)
1272 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); 1330 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
1273 else 1331 else
1274 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); 1332 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
1333 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1275 spin_unlock_irq(&zone->lru_lock); 1334 spin_unlock_irq(&zone->lru_lock);
1276 1335
1277 pgmoved = 0; /* count referenced (mapping) mapped pages */
1278 while (!list_empty(&l_hold)) { 1336 while (!list_empty(&l_hold)) {
1279 cond_resched(); 1337 cond_resched();
1280 page = lru_to_page(&l_hold); 1338 page = lru_to_page(&l_hold);
@@ -1288,7 +1346,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1288 /* page_referenced clears PageReferenced */ 1346 /* page_referenced clears PageReferenced */
1289 if (page_mapping_inuse(page) && 1347 if (page_mapping_inuse(page) &&
1290 page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { 1348 page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1291 pgmoved++; 1349 nr_rotated++;
1292 /* 1350 /*
1293 * Identify referenced, file-backed active pages and 1351 * Identify referenced, file-backed active pages and
1294 * give them one more trip around the active list. So 1352 * give them one more trip around the active list. So
@@ -1304,6 +1362,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1304 } 1362 }
1305 } 1363 }
1306 1364
1365 ClearPageActive(page); /* we are de-activating */
1307 list_add(&page->lru, &l_inactive); 1366 list_add(&page->lru, &l_inactive);
1308 } 1367 }
1309 1368
@@ -1317,13 +1376,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1317 * helps balance scan pressure between file and anonymous pages in 1376 * helps balance scan pressure between file and anonymous pages in
1318 * get_scan_ratio. 1377 * get_scan_ratio.
1319 */ 1378 */
1320 reclaim_stat->recent_rotated[!!file] += pgmoved; 1379 reclaim_stat->recent_rotated[file] += nr_rotated;
1321 1380
1322 move_active_pages_to_lru(zone, &l_active, 1381 move_active_pages_to_lru(zone, &l_active,
1323 LRU_ACTIVE + file * LRU_FILE); 1382 LRU_ACTIVE + file * LRU_FILE);
1324 move_active_pages_to_lru(zone, &l_inactive, 1383 move_active_pages_to_lru(zone, &l_inactive,
1325 LRU_BASE + file * LRU_FILE); 1384 LRU_BASE + file * LRU_FILE);
1326 1385 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1327 spin_unlock_irq(&zone->lru_lock); 1386 spin_unlock_irq(&zone->lru_lock);
1328} 1387}
1329 1388
@@ -1429,10 +1488,10 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1429 unsigned long ap, fp; 1488 unsigned long ap, fp;
1430 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1489 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1431 1490
1432 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + 1491 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1433 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); 1492 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1434 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + 1493 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1435 zone_nr_pages(zone, sc, LRU_INACTIVE_FILE); 1494 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1436 1495
1437 if (scanning_global_lru(sc)) { 1496 if (scanning_global_lru(sc)) {
1438 free = zone_page_state(zone, NR_FREE_PAGES); 1497 free = zone_page_state(zone, NR_FREE_PAGES);
@@ -1526,6 +1585,7 @@ static void shrink_zone(int priority, struct zone *zone,
1526 enum lru_list l; 1585 enum lru_list l;
1527 unsigned long nr_reclaimed = sc->nr_reclaimed; 1586 unsigned long nr_reclaimed = sc->nr_reclaimed;
1528 unsigned long swap_cluster_max = sc->swap_cluster_max; 1587 unsigned long swap_cluster_max = sc->swap_cluster_max;
1588 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1529 int noswap = 0; 1589 int noswap = 0;
1530 1590
1531 /* If we have no swap space, do not bother scanning anon pages. */ 1591 /* If we have no swap space, do not bother scanning anon pages. */
@@ -1540,17 +1600,14 @@ static void shrink_zone(int priority, struct zone *zone,
1540 int file = is_file_lru(l); 1600 int file = is_file_lru(l);
1541 unsigned long scan; 1601 unsigned long scan;
1542 1602
1543 scan = zone_nr_pages(zone, sc, l); 1603 scan = zone_nr_lru_pages(zone, sc, l);
1544 if (priority || noswap) { 1604 if (priority || noswap) {
1545 scan >>= priority; 1605 scan >>= priority;
1546 scan = (scan * percent[file]) / 100; 1606 scan = (scan * percent[file]) / 100;
1547 } 1607 }
1548 if (scanning_global_lru(sc)) 1608 nr[l] = nr_scan_try_batch(scan,
1549 nr[l] = nr_scan_try_batch(scan, 1609 &reclaim_stat->nr_saved_scan[l],
1550 &zone->lru[l].nr_saved_scan, 1610 swap_cluster_max);
1551 swap_cluster_max);
1552 else
1553 nr[l] = scan;
1554 } 1611 }
1555 1612
1556 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1613 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1652,10 +1709,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1652 * 1709 *
1653 * If the caller is !__GFP_FS then the probability of a failure is reasonably 1710 * If the caller is !__GFP_FS then the probability of a failure is reasonably
1654 * high - the zone may be full of dirty or under-writeback pages, which this 1711 * high - the zone may be full of dirty or under-writeback pages, which this
1655 * caller can't do much about. We kick pdflush and take explicit naps in the 1712 * caller can't do much about. We kick the writeback threads and take explicit
1656 * hope that some of these pages can be written. But if the allocating task 1713 * naps in the hope that some of these pages can be written. But if the
1657 * holds filesystem locks which prevent writeout this might not work, and the 1714 * allocating task holds filesystem locks which prevent writeout this might not
1658 * allocation attempt will fail. 1715 * work, and the allocation attempt will fail.
1659 * 1716 *
1660 * returns: 0, if no pages reclaimed 1717 * returns: 0, if no pages reclaimed
1661 * else, the number of pages reclaimed 1718 * else, the number of pages reclaimed
@@ -1685,7 +1742,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1685 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1742 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1686 continue; 1743 continue;
1687 1744
1688 lru_pages += zone_lru_pages(zone); 1745 lru_pages += zone_reclaimable_pages(zone);
1689 } 1746 }
1690 } 1747 }
1691 1748
@@ -1720,7 +1777,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1720 */ 1777 */
1721 if (total_scanned > sc->swap_cluster_max + 1778 if (total_scanned > sc->swap_cluster_max +
1722 sc->swap_cluster_max / 2) { 1779 sc->swap_cluster_max / 2) {
1723 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1780 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
1724 sc->may_writepage = 1; 1781 sc->may_writepage = 1;
1725 } 1782 }
1726 1783
@@ -1779,11 +1836,45 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1779 1836
1780#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1837#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1781 1838
1839unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1840 gfp_t gfp_mask, bool noswap,
1841 unsigned int swappiness,
1842 struct zone *zone, int nid)
1843{
1844 struct scan_control sc = {
1845 .may_writepage = !laptop_mode,
1846 .may_unmap = 1,
1847 .may_swap = !noswap,
1848 .swap_cluster_max = SWAP_CLUSTER_MAX,
1849 .swappiness = swappiness,
1850 .order = 0,
1851 .mem_cgroup = mem,
1852 .isolate_pages = mem_cgroup_isolate_pages,
1853 };
1854 nodemask_t nm = nodemask_of_node(nid);
1855
1856 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1857 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1858 sc.nodemask = &nm;
1859 sc.nr_reclaimed = 0;
1860 sc.nr_scanned = 0;
1861 /*
1862 * NOTE: Although we can get the priority field, using it
1863 * here is not a good idea, since it limits the pages we can scan.
1864 * If we don't reclaim here, the shrink_zone from balance_pgdat
1865 * will pick up pages from other mem cgroups as well. We hack
1866 * the priority and make it zero.
1867 */
1868 shrink_zone(0, zone, &sc);
1869 return sc.nr_reclaimed;
1870}
1871
1782unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 1872unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1783 gfp_t gfp_mask, 1873 gfp_t gfp_mask,
1784 bool noswap, 1874 bool noswap,
1785 unsigned int swappiness) 1875 unsigned int swappiness)
1786{ 1876{
1877 struct zonelist *zonelist;
1787 struct scan_control sc = { 1878 struct scan_control sc = {
1788 .may_writepage = !laptop_mode, 1879 .may_writepage = !laptop_mode,
1789 .may_unmap = 1, 1880 .may_unmap = 1,
@@ -1795,7 +1886,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1795 .isolate_pages = mem_cgroup_isolate_pages, 1886 .isolate_pages = mem_cgroup_isolate_pages,
1796 .nodemask = NULL, /* we don't care the placement */ 1887 .nodemask = NULL, /* we don't care the placement */
1797 }; 1888 };
1798 struct zonelist *zonelist;
1799 1889
1800 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 1890 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1801 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 1891 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -1902,7 +1992,7 @@ loop_again:
1902 for (i = 0; i <= end_zone; i++) { 1992 for (i = 0; i <= end_zone; i++) {
1903 struct zone *zone = pgdat->node_zones + i; 1993 struct zone *zone = pgdat->node_zones + i;
1904 1994
1905 lru_pages += zone_lru_pages(zone); 1995 lru_pages += zone_reclaimable_pages(zone);
1906 } 1996 }
1907 1997
1908 /* 1998 /*
@@ -1917,6 +2007,7 @@ loop_again:
1917 for (i = 0; i <= end_zone; i++) { 2007 for (i = 0; i <= end_zone; i++) {
1918 struct zone *zone = pgdat->node_zones + i; 2008 struct zone *zone = pgdat->node_zones + i;
1919 int nr_slab; 2009 int nr_slab;
2010 int nid, zid;
1920 2011
1921 if (!populated_zone(zone)) 2012 if (!populated_zone(zone))
1922 continue; 2013 continue;
@@ -1931,6 +2022,15 @@ loop_again:
1931 temp_priority[i] = priority; 2022 temp_priority[i] = priority;
1932 sc.nr_scanned = 0; 2023 sc.nr_scanned = 0;
1933 note_zone_scanning_priority(zone, priority); 2024 note_zone_scanning_priority(zone, priority);
2025
2026 nid = pgdat->node_id;
2027 zid = zone_idx(zone);
2028 /*
2029 * Call soft limit reclaim before calling shrink_zone.
2030 * For now we ignore the return value
2031 */
2032 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
2033 nid, zid);
1934 /* 2034 /*
1935 * We put equal pressure on every zone, unless one 2035 * We put equal pressure on every zone, unless one
1936 * zone has way too many pages free already. 2036 * zone has way too many pages free already.
@@ -1946,7 +2046,7 @@ loop_again:
1946 if (zone_is_all_unreclaimable(zone)) 2046 if (zone_is_all_unreclaimable(zone))
1947 continue; 2047 continue;
1948 if (nr_slab == 0 && zone->pages_scanned >= 2048 if (nr_slab == 0 && zone->pages_scanned >=
1949 (zone_lru_pages(zone) * 6)) 2049 (zone_reclaimable_pages(zone) * 6))
1950 zone_set_flag(zone, 2050 zone_set_flag(zone,
1951 ZONE_ALL_UNRECLAIMABLE); 2051 ZONE_ALL_UNRECLAIMABLE);
1952 /* 2052 /*
@@ -2113,12 +2213,39 @@ void wakeup_kswapd(struct zone *zone, int order)
2113 wake_up_interruptible(&pgdat->kswapd_wait); 2213 wake_up_interruptible(&pgdat->kswapd_wait);
2114} 2214}
2115 2215
2116unsigned long global_lru_pages(void) 2216/*
2217 * The reclaimable count should be mostly accurate.
2218 * The less reclaimable pages may be:
2219 * - mlocked pages, which will be moved to the unevictable list when encountered
2220 * - mapped pages, which may require several passes to be reclaimed
2221 * - dirty pages, which are not "instantly" reclaimable
2222 */
2223unsigned long global_reclaimable_pages(void)
2224{
2225 int nr;
2226
2227 nr = global_page_state(NR_ACTIVE_FILE) +
2228 global_page_state(NR_INACTIVE_FILE);
2229
2230 if (nr_swap_pages > 0)
2231 nr += global_page_state(NR_ACTIVE_ANON) +
2232 global_page_state(NR_INACTIVE_ANON);
2233
2234 return nr;
2235}
2236
2237unsigned long zone_reclaimable_pages(struct zone *zone)
2117{ 2238{
2118 return global_page_state(NR_ACTIVE_ANON) 2239 int nr;
2119 + global_page_state(NR_ACTIVE_FILE) 2240
2120 + global_page_state(NR_INACTIVE_ANON) 2241 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
2121 + global_page_state(NR_INACTIVE_FILE); 2242 zone_page_state(zone, NR_INACTIVE_FILE);
2243
2244 if (nr_swap_pages > 0)
2245 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
2246 zone_page_state(zone, NR_INACTIVE_ANON);
2247
2248 return nr;
2122} 2249}
2123 2250
2124#ifdef CONFIG_HIBERNATION 2251#ifdef CONFIG_HIBERNATION
@@ -2133,6 +2260,7 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
2133{ 2260{
2134 struct zone *zone; 2261 struct zone *zone;
2135 unsigned long nr_reclaimed = 0; 2262 unsigned long nr_reclaimed = 0;
2263 struct zone_reclaim_stat *reclaim_stat;
2136 2264
2137 for_each_populated_zone(zone) { 2265 for_each_populated_zone(zone) {
2138 enum lru_list l; 2266 enum lru_list l;
@@ -2149,11 +2277,14 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
2149 l == LRU_ACTIVE_FILE)) 2277 l == LRU_ACTIVE_FILE))
2150 continue; 2278 continue;
2151 2279
2152 zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; 2280 reclaim_stat = get_reclaim_stat(zone, sc);
2153 if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { 2281 reclaim_stat->nr_saved_scan[l] +=
2282 (lru_pages >> prio) + 1;
2283 if (reclaim_stat->nr_saved_scan[l]
2284 >= nr_pages || pass > 3) {
2154 unsigned long nr_to_scan; 2285 unsigned long nr_to_scan;
2155 2286
2156 zone->lru[l].nr_saved_scan = 0; 2287 reclaim_stat->nr_saved_scan[l] = 0;
2157 nr_to_scan = min(nr_pages, lru_pages); 2288 nr_to_scan = min(nr_pages, lru_pages);
2158 nr_reclaimed += shrink_list(l, nr_to_scan, zone, 2289 nr_reclaimed += shrink_list(l, nr_to_scan, zone,
2159 sc, prio); 2290 sc, prio);
@@ -2190,7 +2321,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2190 2321
2191 current->reclaim_state = &reclaim_state; 2322 current->reclaim_state = &reclaim_state;
2192 2323
2193 lru_pages = global_lru_pages(); 2324 lru_pages = global_reclaimable_pages();
2194 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 2325 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
2195 /* If slab caches are huge, it's better to hit them first */ 2326 /* If slab caches are huge, it's better to hit them first */
2196 while (nr_slab >= lru_pages) { 2327 while (nr_slab >= lru_pages) {
@@ -2232,7 +2363,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2232 2363
2233 reclaim_state.reclaimed_slab = 0; 2364 reclaim_state.reclaimed_slab = 0;
2234 shrink_slab(sc.nr_scanned, sc.gfp_mask, 2365 shrink_slab(sc.nr_scanned, sc.gfp_mask,
2235 global_lru_pages()); 2366 global_reclaimable_pages());
2236 sc.nr_reclaimed += reclaim_state.reclaimed_slab; 2367 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2237 if (sc.nr_reclaimed >= nr_pages) 2368 if (sc.nr_reclaimed >= nr_pages)
2238 goto out; 2369 goto out;
@@ -2249,7 +2380,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2249 if (!sc.nr_reclaimed) { 2380 if (!sc.nr_reclaimed) {
2250 do { 2381 do {
2251 reclaim_state.reclaimed_slab = 0; 2382 reclaim_state.reclaimed_slab = 0;
2252 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); 2383 shrink_slab(nr_pages, sc.gfp_mask,
2384 global_reclaimable_pages());
2253 sc.nr_reclaimed += reclaim_state.reclaimed_slab; 2385 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2254 } while (sc.nr_reclaimed < nr_pages && 2386 } while (sc.nr_reclaimed < nr_pages &&
2255 reclaim_state.reclaimed_slab > 0); 2387 reclaim_state.reclaimed_slab > 0);
@@ -2569,7 +2701,7 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone)
2569retry: 2701retry:
2570 ClearPageUnevictable(page); 2702 ClearPageUnevictable(page);
2571 if (page_evictable(page, NULL)) { 2703 if (page_evictable(page, NULL)) {
2572 enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); 2704 enum lru_list l = page_lru_base_type(page);
2573 2705
2574 __dec_zone_state(zone, NR_UNEVICTABLE); 2706 __dec_zone_state(zone, NR_UNEVICTABLE);
2575 list_move(&page->lru, &zone->lru[l].list); 2707 list_move(&page->lru, &zone->lru[l].list);
@@ -2712,10 +2844,10 @@ static void scan_all_zones_unevictable_pages(void)
2712unsigned long scan_unevictable_pages; 2844unsigned long scan_unevictable_pages;
2713 2845
2714int scan_unevictable_handler(struct ctl_table *table, int write, 2846int scan_unevictable_handler(struct ctl_table *table, int write,
2715 struct file *file, void __user *buffer, 2847 void __user *buffer,
2716 size_t *length, loff_t *ppos) 2848 size_t *length, loff_t *ppos)
2717{ 2849{
2718 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 2850 proc_doulongvec_minmax(table, write, buffer, length, ppos);
2719 2851
2720 if (write && *(unsigned long *)table->data) 2852 if (write && *(unsigned long *)table->data)
2721 scan_all_zones_unevictable_pages(); 2853 scan_all_zones_unevictable_pages();
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 138bed53706e..c81321f9feec 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -639,11 +639,14 @@ static const char * const vmstat_text[] = {
639 "nr_slab_reclaimable", 639 "nr_slab_reclaimable",
640 "nr_slab_unreclaimable", 640 "nr_slab_unreclaimable",
641 "nr_page_table_pages", 641 "nr_page_table_pages",
642 "nr_kernel_stack",
642 "nr_unstable", 643 "nr_unstable",
643 "nr_bounce", 644 "nr_bounce",
644 "nr_vmscan_write", 645 "nr_vmscan_write",
645 "nr_writeback_temp", 646 "nr_writeback_temp",
646 647 "nr_isolated_anon",
648 "nr_isolated_file",
649 "nr_shmem",
647#ifdef CONFIG_NUMA 650#ifdef CONFIG_NUMA
648 "numa_hit", 651 "numa_hit",
649 "numa_miss", 652 "numa_miss",