author    David Woodhouse <David.Woodhouse@intel.com>  2009-09-20 08:55:36 -0400
committer David Woodhouse <David.Woodhouse@intel.com>  2009-09-20 08:55:36 -0400
commit    6469f540ea37d53db089c8fea9c0c77a3d9353d4
tree      1dc9dc077150d57f4424cae49e711b5dd6e903a1
parent    304e6d5fe294b80e6d3107f99ec241816390ebcc
parent    78f28b7c555359c67c2a0d23f7436e915329421e
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts:
	drivers/mtd/mtdcore.c

Merged in order that I can apply the Nomadik nand/onenand support patches.
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              2
-rw-r--r--  mm/Makefile             4
-rw-r--r--  mm/allocpercpu.c       28
-rw-r--r--  mm/backing-dev.c      427
-rw-r--r--  mm/bootmem.c            6
-rw-r--r--  mm/filemap.c          170
-rw-r--r--  mm/hugetlb.c            1
-rw-r--r--  mm/kmemleak-test.c      6
-rw-r--r--  mm/kmemleak.c         336
-rw-r--r--  mm/memory.c             1
-rw-r--r--  mm/mmap.c               2
-rw-r--r--  mm/nommu.c              3
-rw-r--r--  mm/page-writeback.c   179
-rw-r--r--  mm/page_alloc.c         6
-rw-r--r--  mm/pdflush.c          269
-rw-r--r--  mm/percpu.c          1421
-rw-r--r--  mm/quicklist.c          2
-rw-r--r--  mm/shmem.c             15
-rw-r--r--  mm/shmem_acl.c         11
-rw-r--r--  mm/slob.c               5
-rw-r--r--  mm/slub.c              92
-rw-r--r--  mm/swap_state.c         1
-rw-r--r--  mm/swapfile.c           6
-rw-r--r--  mm/vmalloc.c          338
-rw-r--r--  mm/vmscan.c             2
25 files changed, 2274 insertions, 1059 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index fe5f674d7a7d..3aa519f52e18 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -153,7 +153,7 @@ config MEMORY_HOTREMOVE
153# 153#
154config PAGEFLAGS_EXTENDED 154config PAGEFLAGS_EXTENDED
155 def_bool y 155 def_bool y
156 depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM 156 depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM
157 157
158# Heavily threaded applications may benefit from splitting the mm-wide 158# Heavily threaded applications may benefit from splitting the mm-wide
159# page_table_lock, so that faults on different parts of the user address 159# page_table_lock, so that faults on different parts of the user address
diff --git a/mm/Makefile b/mm/Makefile
index 5e0bd6426693..ea4b18bd3960 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
8 vmalloc.o 8 vmalloc.o
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o pdflush.o \ 11 maccess.o page_alloc.o page-writeback.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o mm_init.o $(mmu-y) 14 page_isolation.o mm_init.o $(mmu-y)
@@ -33,7 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
33obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 33obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
34obj-$(CONFIG_FS_XIP) += filemap_xip.o 34obj-$(CONFIG_FS_XIP) += filemap_xip.o
35obj-$(CONFIG_MIGRATION) += migrate.o 35obj-$(CONFIG_MIGRATION) += migrate.o
36ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 36ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
37obj-$(CONFIG_SMP) += percpu.o 37obj-$(CONFIG_SMP) += percpu.o
38else 38else
39obj-$(CONFIG_SMP) += allocpercpu.o 39obj-$(CONFIG_SMP) += allocpercpu.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index dfdee6a47359..df34ceae0c67 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -5,6 +5,8 @@
5 */ 5 */
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/bootmem.h>
9#include <asm/sections.h>
8 10
9#ifndef cache_line_size 11#ifndef cache_line_size
10#define cache_line_size() L1_CACHE_BYTES 12#define cache_line_size() L1_CACHE_BYTES
@@ -147,3 +149,29 @@ void free_percpu(void *__pdata)
147 kfree(__percpu_disguise(__pdata)); 149 kfree(__percpu_disguise(__pdata));
148} 150}
149EXPORT_SYMBOL_GPL(free_percpu); 151EXPORT_SYMBOL_GPL(free_percpu);
152
153/*
154 * Generic percpu area setup.
155 */
156#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
157unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
158
159EXPORT_SYMBOL(__per_cpu_offset);
160
161void __init setup_per_cpu_areas(void)
162{
163 unsigned long size, i;
164 char *ptr;
165 unsigned long nr_possible_cpus = num_possible_cpus();
166
167 /* Copy section for each CPU (we discard the original) */
168 size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
169 ptr = alloc_bootmem_pages(size * nr_possible_cpus);
170
171 for_each_possible_cpu(i) {
172 __per_cpu_offset[i] = ptr - __per_cpu_start;
173 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
174 ptr += size;
175 }
176}
177#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c86edd244294..3d3accb1f800 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,8 +1,11 @@
1 1
2#include <linux/wait.h> 2#include <linux/wait.h>
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/kthread.h>
5#include <linux/freezer.h>
4#include <linux/fs.h> 6#include <linux/fs.h>
5#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/mm.h>
6#include <linux/sched.h> 9#include <linux/sched.h>
7#include <linux/module.h> 10#include <linux/module.h>
8#include <linux/writeback.h> 11#include <linux/writeback.h>
@@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
14EXPORT_SYMBOL(default_unplug_io_fn); 17EXPORT_SYMBOL(default_unplug_io_fn);
15 18
16struct backing_dev_info default_backing_dev_info = { 19struct backing_dev_info default_backing_dev_info = {
20 .name = "default",
17 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, 21 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
18 .state = 0, 22 .state = 0,
19 .capabilities = BDI_CAP_MAP_COPY, 23 .capabilities = BDI_CAP_MAP_COPY,
@@ -23,6 +27,24 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info);
23 27
24static struct class *bdi_class; 28static struct class *bdi_class;
25 29
30/*
31 * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
32 * reader side protection for bdi_pending_list. bdi_list has RCU reader side
33 * locking.
34 */
35DEFINE_SPINLOCK(bdi_lock);
36LIST_HEAD(bdi_list);
37LIST_HEAD(bdi_pending_list);
38
39static struct task_struct *sync_supers_tsk;
40static struct timer_list sync_supers_timer;
41
42static int bdi_sync_supers(void *);
43static void sync_supers_timer_fn(unsigned long);
44static void arm_supers_timer(void);
45
46static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
47
26#ifdef CONFIG_DEBUG_FS 48#ifdef CONFIG_DEBUG_FS
27#include <linux/debugfs.h> 49#include <linux/debugfs.h>
28#include <linux/seq_file.h> 50#include <linux/seq_file.h>
@@ -37,9 +59,29 @@ static void bdi_debug_init(void)
37static int bdi_debug_stats_show(struct seq_file *m, void *v) 59static int bdi_debug_stats_show(struct seq_file *m, void *v)
38{ 60{
39 struct backing_dev_info *bdi = m->private; 61 struct backing_dev_info *bdi = m->private;
62 struct bdi_writeback *wb;
40 unsigned long background_thresh; 63 unsigned long background_thresh;
41 unsigned long dirty_thresh; 64 unsigned long dirty_thresh;
42 unsigned long bdi_thresh; 65 unsigned long bdi_thresh;
66 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
67 struct inode *inode;
68
69 /*
70 * inode lock is enough here, the bdi->wb_list is protected by
71 * RCU on the reader side
72 */
73 nr_wb = nr_dirty = nr_io = nr_more_io = 0;
74 spin_lock(&inode_lock);
75 list_for_each_entry(wb, &bdi->wb_list, list) {
76 nr_wb++;
77 list_for_each_entry(inode, &wb->b_dirty, i_list)
78 nr_dirty++;
79 list_for_each_entry(inode, &wb->b_io, i_list)
80 nr_io++;
81 list_for_each_entry(inode, &wb->b_more_io, i_list)
82 nr_more_io++;
83 }
84 spin_unlock(&inode_lock);
43 85
44 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); 86 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
45 87
@@ -49,12 +91,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
49 "BdiReclaimable: %8lu kB\n" 91 "BdiReclaimable: %8lu kB\n"
50 "BdiDirtyThresh: %8lu kB\n" 92 "BdiDirtyThresh: %8lu kB\n"
51 "DirtyThresh: %8lu kB\n" 93 "DirtyThresh: %8lu kB\n"
52 "BackgroundThresh: %8lu kB\n", 94 "BackgroundThresh: %8lu kB\n"
95 "WriteBack threads:%8lu\n"
96 "b_dirty: %8lu\n"
97 "b_io: %8lu\n"
98 "b_more_io: %8lu\n"
99 "bdi_list: %8u\n"
100 "state: %8lx\n"
101 "wb_mask: %8lx\n"
102 "wb_list: %8u\n"
103 "wb_cnt: %8u\n",
53 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 104 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
54 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 105 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
55 K(bdi_thresh), 106 K(bdi_thresh), K(dirty_thresh),
56 K(dirty_thresh), 107 K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
57 K(background_thresh)); 108 !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
109 !list_empty(&bdi->wb_list), bdi->wb_cnt);
58#undef K 110#undef K
59 111
60 return 0; 112 return 0;
@@ -185,6 +237,13 @@ static int __init default_bdi_init(void)
185{ 237{
186 int err; 238 int err;
187 239
240 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
241 BUG_ON(IS_ERR(sync_supers_tsk));
242
243 init_timer(&sync_supers_timer);
244 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
245 arm_supers_timer();
246
188 err = bdi_init(&default_backing_dev_info); 247 err = bdi_init(&default_backing_dev_info);
189 if (!err) 248 if (!err)
190 bdi_register(&default_backing_dev_info, NULL, "default"); 249 bdi_register(&default_backing_dev_info, NULL, "default");
@@ -193,6 +252,279 @@ static int __init default_bdi_init(void)
193} 252}
194subsys_initcall(default_bdi_init); 253subsys_initcall(default_bdi_init);
195 254
255static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
256{
257 memset(wb, 0, sizeof(*wb));
258
259 wb->bdi = bdi;
260 wb->last_old_flush = jiffies;
261 INIT_LIST_HEAD(&wb->b_dirty);
262 INIT_LIST_HEAD(&wb->b_io);
263 INIT_LIST_HEAD(&wb->b_more_io);
264}
265
266static void bdi_task_init(struct backing_dev_info *bdi,
267 struct bdi_writeback *wb)
268{
269 struct task_struct *tsk = current;
270
271 spin_lock(&bdi->wb_lock);
272 list_add_tail_rcu(&wb->list, &bdi->wb_list);
273 spin_unlock(&bdi->wb_lock);
274
275 tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
276 set_freezable();
277
278 /*
279 * Our parent may run at a different priority, just set us to normal
280 */
281 set_user_nice(tsk, 0);
282}
283
284static int bdi_start_fn(void *ptr)
285{
286 struct bdi_writeback *wb = ptr;
287 struct backing_dev_info *bdi = wb->bdi;
288 int ret;
289
290 /*
291 * Add us to the active bdi_list
292 */
293 spin_lock_bh(&bdi_lock);
294 list_add_rcu(&bdi->bdi_list, &bdi_list);
295 spin_unlock_bh(&bdi_lock);
296
297 bdi_task_init(bdi, wb);
298
299 /*
300 * Clear pending bit and wakeup anybody waiting to tear us down
301 */
302 clear_bit(BDI_pending, &bdi->state);
303 smp_mb__after_clear_bit();
304 wake_up_bit(&bdi->state, BDI_pending);
305
306 ret = bdi_writeback_task(wb);
307
308 /*
309 * Remove us from the list
310 */
311 spin_lock(&bdi->wb_lock);
312 list_del_rcu(&wb->list);
313 spin_unlock(&bdi->wb_lock);
314
315 /*
316 * Flush any work that raced with us exiting. No new work
317 * will be added, since this bdi isn't discoverable anymore.
318 */
319 if (!list_empty(&bdi->work_list))
320 wb_do_writeback(wb, 1);
321
322 wb->task = NULL;
323 return ret;
324}
325
326int bdi_has_dirty_io(struct backing_dev_info *bdi)
327{
328 return wb_has_dirty_io(&bdi->wb);
329}
330
331static void bdi_flush_io(struct backing_dev_info *bdi)
332{
333 struct writeback_control wbc = {
334 .bdi = bdi,
335 .sync_mode = WB_SYNC_NONE,
336 .older_than_this = NULL,
337 .range_cyclic = 1,
338 .nr_to_write = 1024,
339 };
340
341 writeback_inodes_wbc(&wbc);
342}
343
344/*
345 * kupdated() used to do this. We cannot do it from the bdi_forker_task()
346 * or we risk deadlocking on ->s_umount. The longer term solution would be
347 * to implement sync_supers_bdi() or similar and simply do it from the
348 * bdi writeback tasks individually.
349 */
350static int bdi_sync_supers(void *unused)
351{
352 set_user_nice(current, 0);
353
354 while (!kthread_should_stop()) {
355 set_current_state(TASK_INTERRUPTIBLE);
356 schedule();
357
358 /*
359 * Do this periodically, like kupdated() did before.
360 */
361 sync_supers();
362 }
363
364 return 0;
365}
366
367static void arm_supers_timer(void)
368{
369 unsigned long next;
370
371 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
372 mod_timer(&sync_supers_timer, round_jiffies_up(next));
373}
374
375static void sync_supers_timer_fn(unsigned long unused)
376{
377 wake_up_process(sync_supers_tsk);
378 arm_supers_timer();
379}
380
381static int bdi_forker_task(void *ptr)
382{
383 struct bdi_writeback *me = ptr;
384
385 bdi_task_init(me->bdi, me);
386
387 for (;;) {
388 struct backing_dev_info *bdi, *tmp;
389 struct bdi_writeback *wb;
390
391 /*
392 * Temporary measure, we want to make sure we don't see
393 * dirty data on the default backing_dev_info
394 */
395 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
396 wb_do_writeback(me, 0);
397
398 spin_lock_bh(&bdi_lock);
399
400 /*
401 * Check if any existing bdi's have dirty data without
402 * a thread registered. If so, set that up.
403 */
404 list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
405 if (bdi->wb.task)
406 continue;
407 if (list_empty(&bdi->work_list) &&
408 !bdi_has_dirty_io(bdi))
409 continue;
410
411 bdi_add_default_flusher_task(bdi);
412 }
413
414 set_current_state(TASK_INTERRUPTIBLE);
415
416 if (list_empty(&bdi_pending_list)) {
417 unsigned long wait;
418
419 spin_unlock_bh(&bdi_lock);
420 wait = msecs_to_jiffies(dirty_writeback_interval * 10);
421 schedule_timeout(wait);
422 try_to_freeze();
423 continue;
424 }
425
426 __set_current_state(TASK_RUNNING);
427
428 /*
429 * This is our real job - check for pending entries in
430 * bdi_pending_list, and create the tasks that got added
431 */
432 bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
433 bdi_list);
434 list_del_init(&bdi->bdi_list);
435 spin_unlock_bh(&bdi_lock);
436
437 wb = &bdi->wb;
438 wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
439 dev_name(bdi->dev));
440 /*
441 * If task creation fails, then readd the bdi to
442 * the pending list and force writeout of the bdi
443 * from this forker thread. That will free some memory
444 * and we can try again.
445 */
446 if (IS_ERR(wb->task)) {
447 wb->task = NULL;
448
449 /*
450 * Add this 'bdi' to the back, so we get
451 * a chance to flush other bdi's to free
452 * memory.
453 */
454 spin_lock_bh(&bdi_lock);
455 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
456 spin_unlock_bh(&bdi_lock);
457
458 bdi_flush_io(bdi);
459 }
460 }
461
462 return 0;
463}
464
465static void bdi_add_to_pending(struct rcu_head *head)
466{
467 struct backing_dev_info *bdi;
468
469 bdi = container_of(head, struct backing_dev_info, rcu_head);
470 INIT_LIST_HEAD(&bdi->bdi_list);
471
472 spin_lock(&bdi_lock);
473 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
474 spin_unlock(&bdi_lock);
475
476 /*
477 * We are now on the pending list, wake up bdi_forker_task()
478 * to finish the job and add us back to the active bdi_list
479 */
480 wake_up_process(default_backing_dev_info.wb.task);
481}
482
483/*
484 * Add the default flusher task that gets created for any bdi
485 * that has dirty data pending writeout
486 */
487void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
488{
489 if (!bdi_cap_writeback_dirty(bdi))
490 return;
491
492 if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
493 printk(KERN_ERR "bdi %p/%s is not registered!\n",
494 bdi, bdi->name);
495 return;
496 }
497
498 /*
499 * Check with the helper whether to proceed adding a task. Will only
500 * abort if we two or more simultanous calls to
501 * bdi_add_default_flusher_task() occured, further additions will block
502 * waiting for previous additions to finish.
503 */
504 if (!test_and_set_bit(BDI_pending, &bdi->state)) {
505 list_del_rcu(&bdi->bdi_list);
506
507 /*
508 * We must wait for the current RCU period to end before
509 * moving to the pending list. So schedule that operation
510 * from an RCU callback.
511 */
512 call_rcu(&bdi->rcu_head, bdi_add_to_pending);
513 }
514}
515
516/*
517 * Remove bdi from bdi_list, and ensure that it is no longer visible
518 */
519static void bdi_remove_from_list(struct backing_dev_info *bdi)
520{
521 spin_lock_bh(&bdi_lock);
522 list_del_rcu(&bdi->bdi_list);
523 spin_unlock_bh(&bdi_lock);
524
525 synchronize_rcu();
526}
527
196int bdi_register(struct backing_dev_info *bdi, struct device *parent, 528int bdi_register(struct backing_dev_info *bdi, struct device *parent,
197 const char *fmt, ...) 529 const char *fmt, ...)
198{ 530{
@@ -211,9 +543,33 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
211 goto exit; 543 goto exit;
212 } 544 }
213 545
546 spin_lock_bh(&bdi_lock);
547 list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
548 spin_unlock_bh(&bdi_lock);
549
214 bdi->dev = dev; 550 bdi->dev = dev;
215 bdi_debug_register(bdi, dev_name(dev));
216 551
552 /*
553 * Just start the forker thread for our default backing_dev_info,
554 * and add other bdi's to the list. They will get a thread created
555 * on-demand when they need it.
556 */
557 if (bdi_cap_flush_forker(bdi)) {
558 struct bdi_writeback *wb = &bdi->wb;
559
560 wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
561 dev_name(dev));
562 if (IS_ERR(wb->task)) {
563 wb->task = NULL;
564 ret = -ENOMEM;
565
566 bdi_remove_from_list(bdi);
567 goto exit;
568 }
569 }
570
571 bdi_debug_register(bdi, dev_name(dev));
572 set_bit(BDI_registered, &bdi->state);
217exit: 573exit:
218 return ret; 574 return ret;
219} 575}
@@ -225,9 +581,40 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
225} 581}
226EXPORT_SYMBOL(bdi_register_dev); 582EXPORT_SYMBOL(bdi_register_dev);
227 583
584/*
585 * Remove bdi from the global list and shutdown any threads we have running
586 */
587static void bdi_wb_shutdown(struct backing_dev_info *bdi)
588{
589 struct bdi_writeback *wb;
590
591 if (!bdi_cap_writeback_dirty(bdi))
592 return;
593
594 /*
595 * If setup is pending, wait for that to complete first
596 */
597 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
598 TASK_UNINTERRUPTIBLE);
599
600 /*
601 * Make sure nobody finds us on the bdi_list anymore
602 */
603 bdi_remove_from_list(bdi);
604
605 /*
606 * Finally, kill the kernel threads. We don't need to be RCU
607 * safe anymore, since the bdi is gone from visibility.
608 */
609 list_for_each_entry(wb, &bdi->wb_list, list)
610 kthread_stop(wb->task);
611}
612
228void bdi_unregister(struct backing_dev_info *bdi) 613void bdi_unregister(struct backing_dev_info *bdi)
229{ 614{
230 if (bdi->dev) { 615 if (bdi->dev) {
616 if (!bdi_cap_flush_forker(bdi))
617 bdi_wb_shutdown(bdi);
231 bdi_debug_unregister(bdi); 618 bdi_debug_unregister(bdi);
232 device_unregister(bdi->dev); 619 device_unregister(bdi->dev);
233 bdi->dev = NULL; 620 bdi->dev = NULL;
@@ -237,14 +624,26 @@ EXPORT_SYMBOL(bdi_unregister);
237 624
238int bdi_init(struct backing_dev_info *bdi) 625int bdi_init(struct backing_dev_info *bdi)
239{ 626{
240 int i; 627 int i, err;
241 int err;
242 628
243 bdi->dev = NULL; 629 bdi->dev = NULL;
244 630
245 bdi->min_ratio = 0; 631 bdi->min_ratio = 0;
246 bdi->max_ratio = 100; 632 bdi->max_ratio = 100;
247 bdi->max_prop_frac = PROP_FRAC_BASE; 633 bdi->max_prop_frac = PROP_FRAC_BASE;
634 spin_lock_init(&bdi->wb_lock);
635 INIT_RCU_HEAD(&bdi->rcu_head);
636 INIT_LIST_HEAD(&bdi->bdi_list);
637 INIT_LIST_HEAD(&bdi->wb_list);
638 INIT_LIST_HEAD(&bdi->work_list);
639
640 bdi_wb_init(&bdi->wb, bdi);
641
642 /*
643 * Just one thread support for now, hard code mask and count
644 */
645 bdi->wb_mask = 1;
646 bdi->wb_cnt = 1;
248 647
249 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { 648 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
250 err = percpu_counter_init(&bdi->bdi_stat[i], 0); 649 err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -269,6 +668,20 @@ void bdi_destroy(struct backing_dev_info *bdi)
269{ 668{
270 int i; 669 int i;
271 670
671 /*
672 * Splice our entries to the default_backing_dev_info, if this
673 * bdi disappears
674 */
675 if (bdi_has_dirty_io(bdi)) {
676 struct bdi_writeback *dst = &default_backing_dev_info.wb;
677
678 spin_lock(&inode_lock);
679 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
680 list_splice(&bdi->wb.b_io, &dst->b_io);
681 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
682 spin_unlock(&inode_lock);
683 }
684
272 bdi_unregister(bdi); 685 bdi_unregister(bdi);
273 686
274 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 687 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 701740c9e81b..555d5d2731c6 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -521,7 +521,11 @@ find_block:
521 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + 521 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
522 start_off); 522 start_off);
523 memset(region, 0, size); 523 memset(region, 0, size);
524 kmemleak_alloc(region, size, 1, 0); 524 /*
525 * The min_count is set to 0 so that bootmem allocated blocks
526 * are never reported as leaks.
527 */
528 kmemleak_alloc(region, size, 0, 0);
525 return region; 529 return region;
526 } 530 }
527 531
diff --git a/mm/filemap.c b/mm/filemap.c
index ccea3b665c12..dd51c68e2b86 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -39,11 +39,10 @@
39/* 39/*
40 * FIXME: remove all knowledge of the buffer layer from the core VM 40 * FIXME: remove all knowledge of the buffer layer from the core VM
41 */ 41 */
42#include <linux/buffer_head.h> /* for generic_osync_inode */ 42#include <linux/buffer_head.h> /* for try_to_free_buffers */
43 43
44#include <asm/mman.h> 44#include <asm/mman.h>
45 45
46
47/* 46/*
48 * Shared mappings implemented 30.11.1994. It's not fully working yet, 47 * Shared mappings implemented 30.11.1994. It's not fully working yet,
49 * though. 48 * though.
@@ -307,68 +306,24 @@ int wait_on_page_writeback_range(struct address_space *mapping,
307} 306}
308 307
309/** 308/**
310 * sync_page_range - write and wait on all pages in the passed range 309 * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range
311 * @inode: target inode 310 * @mapping: address space structure to wait for
312 * @mapping: target address_space 311 * @start: offset in bytes where the range starts
313 * @pos: beginning offset in pages to write 312 * @end: offset in bytes where the range ends (inclusive)
314 * @count: number of bytes to write
315 *
316 * Write and wait upon all the pages in the passed range. This is a "data
317 * integrity" operation. It waits upon in-flight writeout before starting and
318 * waiting upon new writeout. If there was an IO error, return it.
319 * 313 *
320 * We need to re-take i_mutex during the generic_osync_inode list walk because 314 * Walk the list of under-writeback pages of the given address space
321 * it is otherwise livelockable. 315 * in the given range and wait for all of them.
322 */
323int sync_page_range(struct inode *inode, struct address_space *mapping,
324 loff_t pos, loff_t count)
325{
326 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
327 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
328 int ret;
329
330 if (!mapping_cap_writeback_dirty(mapping) || !count)
331 return 0;
332 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
333 if (ret == 0) {
334 mutex_lock(&inode->i_mutex);
335 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
336 mutex_unlock(&inode->i_mutex);
337 }
338 if (ret == 0)
339 ret = wait_on_page_writeback_range(mapping, start, end);
340 return ret;
341}
342EXPORT_SYMBOL(sync_page_range);
343
344/**
345 * sync_page_range_nolock - write & wait on all pages in the passed range without locking
346 * @inode: target inode
347 * @mapping: target address_space
348 * @pos: beginning offset in pages to write
349 * @count: number of bytes to write
350 * 316 *
351 * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea 317 * This is just a simple wrapper so that callers don't have to convert offsets
352 * as it forces O_SYNC writers to different parts of the same file 318 * to page indexes themselves
353 * to be serialised right until io completion.
354 */ 319 */
355int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, 320int filemap_fdatawait_range(struct address_space *mapping, loff_t start,
356 loff_t pos, loff_t count) 321 loff_t end)
357{ 322{
358 pgoff_t start = pos >> PAGE_CACHE_SHIFT; 323 return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT,
359 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 324 end >> PAGE_CACHE_SHIFT);
360 int ret;
361
362 if (!mapping_cap_writeback_dirty(mapping) || !count)
363 return 0;
364 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
365 if (ret == 0)
366 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
367 if (ret == 0)
368 ret = wait_on_page_writeback_range(mapping, start, end);
369 return ret;
370} 325}
371EXPORT_SYMBOL(sync_page_range_nolock); 326EXPORT_SYMBOL(filemap_fdatawait_range);
372 327
373/** 328/**
374 * filemap_fdatawait - wait for all under-writeback pages to complete 329 * filemap_fdatawait - wait for all under-writeback pages to complete
@@ -2167,20 +2122,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2167 } 2122 }
2168 *ppos = end; 2123 *ppos = end;
2169 } 2124 }
2170
2171 /*
2172 * Sync the fs metadata but not the minor inode changes and
2173 * of course not the data as we did direct DMA for the IO.
2174 * i_mutex is held, which protects generic_osync_inode() from
2175 * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
2176 */
2177out: 2125out:
2178 if ((written >= 0 || written == -EIOCBQUEUED) &&
2179 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2180 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
2181 if (err < 0)
2182 written = err;
2183 }
2184 return written; 2126 return written;
2185} 2127}
2186EXPORT_SYMBOL(generic_file_direct_write); 2128EXPORT_SYMBOL(generic_file_direct_write);
@@ -2312,8 +2254,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2312{ 2254{
2313 struct file *file = iocb->ki_filp; 2255 struct file *file = iocb->ki_filp;
2314 struct address_space *mapping = file->f_mapping; 2256 struct address_space *mapping = file->f_mapping;
2315 const struct address_space_operations *a_ops = mapping->a_ops;
2316 struct inode *inode = mapping->host;
2317 ssize_t status; 2257 ssize_t status;
2318 struct iov_iter i; 2258 struct iov_iter i;
2319 2259
@@ -2323,16 +2263,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2323 if (likely(status >= 0)) { 2263 if (likely(status >= 0)) {
2324 written += status; 2264 written += status;
2325 *ppos = pos + status; 2265 *ppos = pos + status;
2326
2327 /*
2328 * For now, when the user asks for O_SYNC, we'll actually give
2329 * O_DSYNC
2330 */
2331 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2332 if (!a_ops->writepage || !is_sync_kiocb(iocb))
2333 status = generic_osync_inode(inode, mapping,
2334 OSYNC_METADATA|OSYNC_DATA);
2335 }
2336 } 2266 }
2337 2267
2338 /* 2268 /*
@@ -2348,9 +2278,27 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2348} 2278}
2349EXPORT_SYMBOL(generic_file_buffered_write); 2279EXPORT_SYMBOL(generic_file_buffered_write);
2350 2280
2351static ssize_t 2281/**
2352__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, 2282 * __generic_file_aio_write - write data to a file
2353 unsigned long nr_segs, loff_t *ppos) 2283 * @iocb: IO state structure (file, offset, etc.)
2284 * @iov: vector with data to write
2285 * @nr_segs: number of segments in the vector
2286 * @ppos: position where to write
2287 *
2288 * This function does all the work needed for actually writing data to a
2289 * file. It does all basic checks, removes SUID from the file, updates
2290 * modification times and calls proper subroutines depending on whether we
2291 * do direct IO or a standard buffered write.
2292 *
2293 * It expects i_mutex to be grabbed unless we work on a block device or similar
2294 * object which does not need locking at all.
2295 *
2296 * This function does *not* take care of syncing data in case of O_SYNC write.
2297 * A caller has to handle it. This is mainly due to the fact that we want to
2298 * avoid syncing under i_mutex.
2299 */
2300ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2301 unsigned long nr_segs, loff_t *ppos)
2354{ 2302{
2355 struct file *file = iocb->ki_filp; 2303 struct file *file = iocb->ki_filp;
2356 struct address_space * mapping = file->f_mapping; 2304 struct address_space * mapping = file->f_mapping;
@@ -2447,51 +2395,37 @@ out:
2447 current->backing_dev_info = NULL; 2395 current->backing_dev_info = NULL;
2448 return written ? written : err; 2396 return written ? written : err;
2449} 2397}
2398EXPORT_SYMBOL(__generic_file_aio_write);
2450 2399
2451ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, 2400/**
2452 const struct iovec *iov, unsigned long nr_segs, loff_t pos) 2401 * generic_file_aio_write - write data to a file
2453{ 2402 * @iocb: IO state structure
2454 struct file *file = iocb->ki_filp; 2403 * @iov: vector with data to write
2455 struct address_space *mapping = file->f_mapping; 2404 * @nr_segs: number of segments in the vector
2456 struct inode *inode = mapping->host; 2405 * @pos: position in file where to write
2457 ssize_t ret; 2406 *
2458 2407 * This is a wrapper around __generic_file_aio_write() to be used by most
2459 BUG_ON(iocb->ki_pos != pos); 2408 * filesystems. It takes care of syncing the file in case of O_SYNC file
2460 2409 * and acquires i_mutex as needed.
2461 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, 2410 */
2462 &iocb->ki_pos);
2463
2464 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2465 ssize_t err;
2466
2467 err = sync_page_range_nolock(inode, mapping, pos, ret);
2468 if (err < 0)
2469 ret = err;
2470 }
2471 return ret;
2472}
2473EXPORT_SYMBOL(generic_file_aio_write_nolock);
2474
2475ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, 2411ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2476 unsigned long nr_segs, loff_t pos) 2412 unsigned long nr_segs, loff_t pos)
2477{ 2413{
2478 struct file *file = iocb->ki_filp; 2414 struct file *file = iocb->ki_filp;
2479 struct address_space *mapping = file->f_mapping; 2415 struct inode *inode = file->f_mapping->host;
2480 struct inode *inode = mapping->host;
2481 ssize_t ret; 2416 ssize_t ret;
2482 2417
2483 BUG_ON(iocb->ki_pos != pos); 2418 BUG_ON(iocb->ki_pos != pos);
2484 2419
2485 mutex_lock(&inode->i_mutex); 2420 mutex_lock(&inode->i_mutex);
2486 ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, 2421 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2487 &iocb->ki_pos);
2488 mutex_unlock(&inode->i_mutex); 2422 mutex_unlock(&inode->i_mutex);
2489 2423
2490 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2424 if (ret > 0 || ret == -EIOCBQUEUED) {
2491 ssize_t err; 2425 ssize_t err;
2492 2426
2493 err = sync_page_range(inode, mapping, pos, ret); 2427 err = generic_write_sync(file, pos, ret);
2494 if (err < 0) 2428 if (err < 0 && ret > 0)
2495 ret = err; 2429 ret = err;
2496 } 2430 }
2497 return ret; 2431 return ret;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cafdcee154e8..b16d63634777 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -234,6 +234,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
234 234
235 return 1UL << (hstate->order + PAGE_SHIFT); 235 return 1UL << (hstate->order + PAGE_SHIFT);
236} 236}
237EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
237 238
238/* 239/*
239 * Return the page size being used by the MMU to back a VMA. In the majority 240 * Return the page size being used by the MMU to back a VMA. In the majority
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index d5292fc6f523..177a5169bbde 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -36,7 +36,7 @@ struct test_node {
36}; 36};
37 37
38static LIST_HEAD(test_list); 38static LIST_HEAD(test_list);
39static DEFINE_PER_CPU(void *, test_pointer); 39static DEFINE_PER_CPU(void *, kmemleak_test_pointer);
40 40
41/* 41/*
42 * Some very simple testing. This function needs to be extended for 42 * Some very simple testing. This function needs to be extended for
@@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void)
86 } 86 }
87 87
88 for_each_possible_cpu(i) { 88 for_each_possible_cpu(i) {
89 per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); 89 per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL);
90 pr_info("kmemleak: kmalloc(129) = %p\n", 90 pr_info("kmemleak: kmalloc(129) = %p\n",
91 per_cpu(test_pointer, i)); 91 per_cpu(kmemleak_test_pointer, i));
92 } 92 }
93 93
94 return 0; 94 return 0;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 487267310a84..4ea4510e2996 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -92,11 +92,13 @@
92#include <linux/string.h> 92#include <linux/string.h>
93#include <linux/nodemask.h> 93#include <linux/nodemask.h>
94#include <linux/mm.h> 94#include <linux/mm.h>
95#include <linux/workqueue.h>
95 96
96#include <asm/sections.h> 97#include <asm/sections.h>
97#include <asm/processor.h> 98#include <asm/processor.h>
98#include <asm/atomic.h> 99#include <asm/atomic.h>
99 100
101#include <linux/kmemcheck.h>
100#include <linux/kmemleak.h> 102#include <linux/kmemleak.h>
101 103
102/* 104/*
@@ -107,6 +109,7 @@
107#define SECS_FIRST_SCAN 60 /* delay before the first scan */ 109#define SECS_FIRST_SCAN 60 /* delay before the first scan */
108#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ 110#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
109#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ 111#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */
112#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */
110 113
111#define BYTES_PER_POINTER sizeof(void *) 114#define BYTES_PER_POINTER sizeof(void *)
112 115
@@ -120,6 +123,9 @@ struct kmemleak_scan_area {
120 size_t length; 123 size_t length;
121}; 124};
122 125
126#define KMEMLEAK_GREY 0
127#define KMEMLEAK_BLACK -1
128
123/* 129/*
124 * Structure holding the metadata for each allocated memory block. 130 * Structure holding the metadata for each allocated memory block.
125 * Modifications to such objects should be made while holding the 131 * Modifications to such objects should be made while holding the
@@ -161,6 +167,15 @@ struct kmemleak_object {
161/* flag set on newly allocated objects */ 167/* flag set on newly allocated objects */
162#define OBJECT_NEW (1 << 3) 168#define OBJECT_NEW (1 << 3)
163 169
170/* number of bytes to print per line; must be 16 or 32 */
171#define HEX_ROW_SIZE 16
172/* number of bytes to print at a time (1, 2, 4, 8) */
173#define HEX_GROUP_SIZE 1
174/* include ASCII after the hex output */
175#define HEX_ASCII 1
176/* max number of lines to be printed */
177#define HEX_MAX_LINES 2
178
164/* the list of all allocated objects */ 179/* the list of all allocated objects */
165static LIST_HEAD(object_list); 180static LIST_HEAD(object_list);
166/* the list of gray-colored objects (see color_gray comment below) */ 181/* the list of gray-colored objects (see color_gray comment below) */
@@ -228,11 +243,14 @@ struct early_log {
228 int min_count; /* minimum reference count */ 243 int min_count; /* minimum reference count */
229 unsigned long offset; /* scan area offset */ 244 unsigned long offset; /* scan area offset */
230 size_t length; /* scan area length */ 245 size_t length; /* scan area length */
246 unsigned long trace[MAX_TRACE]; /* stack trace */
247 unsigned int trace_len; /* stack trace length */
231}; 248};
232 249
233/* early logging buffer and current position */ 250/* early logging buffer and current position */
234static struct early_log early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE]; 251static struct early_log
235static int crt_early_log; 252 early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
253static int crt_early_log __initdata;
236 254
237static void kmemleak_disable(void); 255static void kmemleak_disable(void);
238 256
@@ -255,6 +273,35 @@ static void kmemleak_disable(void);
255} while (0) 273} while (0)
256 274
257/* 275/*
276 * Printing of the objects hex dump to the seq file. The number of lines to be
277 * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
278 * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called
279 * with the object->lock held.
280 */
281static void hex_dump_object(struct seq_file *seq,
282 struct kmemleak_object *object)
283{
284 const u8 *ptr = (const u8 *)object->pointer;
285 int i, len, remaining;
286 unsigned char linebuf[HEX_ROW_SIZE * 5];
287
288 /* limit the number of lines to HEX_MAX_LINES */
289 remaining = len =
290 min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE));
291
292 seq_printf(seq, " hex dump (first %d bytes):\n", len);
293 for (i = 0; i < len; i += HEX_ROW_SIZE) {
294 int linelen = min(remaining, HEX_ROW_SIZE);
295
296 remaining -= HEX_ROW_SIZE;
297 hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE,
298 HEX_GROUP_SIZE, linebuf, sizeof(linebuf),
299 HEX_ASCII);
300 seq_printf(seq, " %s\n", linebuf);
301 }
302}
303
304/*
258 * Object colors, encoded with count and min_count: 305 * Object colors, encoded with count and min_count:
259 * - white - orphan object, not enough references to it (count < min_count) 306 * - white - orphan object, not enough references to it (count < min_count)
260 * - gray - not orphan, not marked as false positive (min_count == 0) or 307 * - gray - not orphan, not marked as false positive (min_count == 0) or
@@ -264,19 +311,21 @@ static void kmemleak_disable(void);
264 * Newly created objects don't have any color assigned (object->count == -1) 311 * Newly created objects don't have any color assigned (object->count == -1)
265 * before the next memory scan when they become white. 312 * before the next memory scan when they become white.
266 */ 313 */
267static int color_white(const struct kmemleak_object *object) 314static bool color_white(const struct kmemleak_object *object)
268{ 315{
269 return object->count != -1 && object->count < object->min_count; 316 return object->count != KMEMLEAK_BLACK &&
317 object->count < object->min_count;
270} 318}
271 319
272static int color_gray(const struct kmemleak_object *object) 320static bool color_gray(const struct kmemleak_object *object)
273{ 321{
274 return object->min_count != -1 && object->count >= object->min_count; 322 return object->min_count != KMEMLEAK_BLACK &&
323 object->count >= object->min_count;
275} 324}
276 325
277static int color_black(const struct kmemleak_object *object) 326static bool color_black(const struct kmemleak_object *object)
278{ 327{
279 return object->min_count == -1; 328 return object->min_count == KMEMLEAK_BLACK;
280} 329}
281 330
282/* 331/*
@@ -284,7 +333,7 @@ static int color_black(const struct kmemleak_object *object)
284 * not be deleted and have a minimum age to avoid false positives caused by 333 * not be deleted and have a minimum age to avoid false positives caused by
285 * pointers temporarily stored in CPU registers. 334 * pointers temporarily stored in CPU registers.
286 */ 335 */
287static int unreferenced_object(struct kmemleak_object *object) 336static bool unreferenced_object(struct kmemleak_object *object)
288{ 337{
289 return (object->flags & OBJECT_ALLOCATED) && color_white(object) && 338 return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
290 time_before_eq(object->jiffies + jiffies_min_age, 339 time_before_eq(object->jiffies + jiffies_min_age,
@@ -304,6 +353,7 @@ static void print_unreferenced(struct seq_file *seq,
304 object->pointer, object->size); 353 object->pointer, object->size);
305 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", 354 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n",
306 object->comm, object->pid, object->jiffies); 355 object->comm, object->pid, object->jiffies);
356 hex_dump_object(seq, object);
307 seq_printf(seq, " backtrace:\n"); 357 seq_printf(seq, " backtrace:\n");
308 358
309 for (i = 0; i < object->trace_len; i++) { 359 for (i = 0; i < object->trace_len; i++) {
@@ -330,6 +380,7 @@ static void dump_object_info(struct kmemleak_object *object)
330 object->comm, object->pid, object->jiffies); 380 object->comm, object->pid, object->jiffies);
331 pr_notice(" min_count = %d\n", object->min_count); 381 pr_notice(" min_count = %d\n", object->min_count);
332 pr_notice(" count = %d\n", object->count); 382 pr_notice(" count = %d\n", object->count);
383 pr_notice(" flags = 0x%lx\n", object->flags);
333 pr_notice(" backtrace:\n"); 384 pr_notice(" backtrace:\n");
334 print_stack_trace(&trace, 4); 385 print_stack_trace(&trace, 4);
335} 386}
@@ -434,21 +485,36 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
434} 485}
435 486
436/* 487/*
488 * Save stack trace to the given array of MAX_TRACE size.
489 */
490static int __save_stack_trace(unsigned long *trace)
491{
492 struct stack_trace stack_trace;
493
494 stack_trace.max_entries = MAX_TRACE;
495 stack_trace.nr_entries = 0;
496 stack_trace.entries = trace;
497 stack_trace.skip = 2;
498 save_stack_trace(&stack_trace);
499
500 return stack_trace.nr_entries;
501}
502
503/*
437 * Create the metadata (struct kmemleak_object) corresponding to an allocated 504 * Create the metadata (struct kmemleak_object) corresponding to an allocated
438 * memory block and add it to the object_list and object_tree_root. 505 * memory block and add it to the object_list and object_tree_root.
439 */ 506 */
440static void create_object(unsigned long ptr, size_t size, int min_count, 507static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
441 gfp_t gfp) 508 int min_count, gfp_t gfp)
442{ 509{
443 unsigned long flags; 510 unsigned long flags;
444 struct kmemleak_object *object; 511 struct kmemleak_object *object;
445 struct prio_tree_node *node; 512 struct prio_tree_node *node;
446 struct stack_trace trace;
447 513
448 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); 514 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK);
449 if (!object) { 515 if (!object) {
450 kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); 516 kmemleak_stop("Cannot allocate a kmemleak_object structure\n");
451 return; 517 return NULL;
452 } 518 }
453 519
454 INIT_LIST_HEAD(&object->object_list); 520 INIT_LIST_HEAD(&object->object_list);
@@ -482,18 +548,14 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
482 } 548 }
483 549
484 /* kernel backtrace */ 550 /* kernel backtrace */
485 trace.max_entries = MAX_TRACE; 551 object->trace_len = __save_stack_trace(object->trace);
486 trace.nr_entries = 0;
487 trace.entries = object->trace;
488 trace.skip = 1;
489 save_stack_trace(&trace);
490 object->trace_len = trace.nr_entries;
491 552
492 INIT_PRIO_TREE_NODE(&object->tree_node); 553 INIT_PRIO_TREE_NODE(&object->tree_node);
493 object->tree_node.start = ptr; 554 object->tree_node.start = ptr;
494 object->tree_node.last = ptr + size - 1; 555 object->tree_node.last = ptr + size - 1;
495 556
496 write_lock_irqsave(&kmemleak_lock, flags); 557 write_lock_irqsave(&kmemleak_lock, flags);
558
497 min_addr = min(min_addr, ptr); 559 min_addr = min(min_addr, ptr);
498 max_addr = max(max_addr, ptr + size); 560 max_addr = max(max_addr, ptr + size);
499 node = prio_tree_insert(&object_tree_root, &object->tree_node); 561 node = prio_tree_insert(&object_tree_root, &object->tree_node);
@@ -504,20 +566,19 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
504 * random memory blocks. 566 * random memory blocks.
505 */ 567 */
506 if (node != &object->tree_node) { 568 if (node != &object->tree_node) {
507 unsigned long flags;
508
509 kmemleak_stop("Cannot insert 0x%lx into the object search tree " 569 kmemleak_stop("Cannot insert 0x%lx into the object search tree "
510 "(already existing)\n", ptr); 570 "(already existing)\n", ptr);
511 object = lookup_object(ptr, 1); 571 object = lookup_object(ptr, 1);
512 spin_lock_irqsave(&object->lock, flags); 572 spin_lock(&object->lock);
513 dump_object_info(object); 573 dump_object_info(object);
514 spin_unlock_irqrestore(&object->lock, flags); 574 spin_unlock(&object->lock);
515 575
516 goto out; 576 goto out;
517 } 577 }
518 list_add_tail_rcu(&object->object_list, &object_list); 578 list_add_tail_rcu(&object->object_list, &object_list);
519out: 579out:
520 write_unlock_irqrestore(&kmemleak_lock, flags); 580 write_unlock_irqrestore(&kmemleak_lock, flags);
581 return object;
521} 582}
522 583
523/* 584/*
@@ -604,46 +665,55 @@ static void delete_object_part(unsigned long ptr, size_t size)
604 665
605 put_object(object); 666 put_object(object);
606} 667}
607/* 668
608 * Make a object permanently as gray-colored so that it can no longer be 669static void __paint_it(struct kmemleak_object *object, int color)
609 * reported as a leak. This is used in general to mark a false positive. 670{
610 */ 671 object->min_count = color;
611static void make_gray_object(unsigned long ptr) 672 if (color == KMEMLEAK_BLACK)
673 object->flags |= OBJECT_NO_SCAN;
674}
675
676static void paint_it(struct kmemleak_object *object, int color)
612{ 677{
613 unsigned long flags; 678 unsigned long flags;
679
680 spin_lock_irqsave(&object->lock, flags);
681 __paint_it(object, color);
682 spin_unlock_irqrestore(&object->lock, flags);
683}
684
685static void paint_ptr(unsigned long ptr, int color)
686{
614 struct kmemleak_object *object; 687 struct kmemleak_object *object;
615 688
616 object = find_and_get_object(ptr, 0); 689 object = find_and_get_object(ptr, 0);
617 if (!object) { 690 if (!object) {
618 kmemleak_warn("Graying unknown object at 0x%08lx\n", ptr); 691 kmemleak_warn("Trying to color unknown object "
692 "at 0x%08lx as %s\n", ptr,
693 (color == KMEMLEAK_GREY) ? "Grey" :
694 (color == KMEMLEAK_BLACK) ? "Black" : "Unknown");
619 return; 695 return;
620 } 696 }
621 697 paint_it(object, color);
622 spin_lock_irqsave(&object->lock, flags);
623 object->min_count = 0;
624 spin_unlock_irqrestore(&object->lock, flags);
625 put_object(object); 698 put_object(object);
626} 699}
627 700
628/* 701/*
702 * Make a object permanently as gray-colored so that it can no longer be
703 * reported as a leak. This is used in general to mark a false positive.
704 */
705static void make_gray_object(unsigned long ptr)
706{
707 paint_ptr(ptr, KMEMLEAK_GREY);
708}
709
710/*
629 * Mark the object as black-colored so that it is ignored from scans and 711 * Mark the object as black-colored so that it is ignored from scans and
630 * reporting. 712 * reporting.
631 */ 713 */
632static void make_black_object(unsigned long ptr) 714static void make_black_object(unsigned long ptr)
633{ 715{
634 unsigned long flags; 716 paint_ptr(ptr, KMEMLEAK_BLACK);
635 struct kmemleak_object *object;
636
637 object = find_and_get_object(ptr, 0);
638 if (!object) {
639 kmemleak_warn("Blacking unknown object at 0x%08lx\n", ptr);
640 return;
641 }
642
643 spin_lock_irqsave(&object->lock, flags);
644 object->min_count = -1;
645 spin_unlock_irqrestore(&object->lock, flags);
646 put_object(object);
647} 717}
648 718
649/* 719/*
@@ -715,14 +785,15 @@ static void object_no_scan(unsigned long ptr)
715 * Log an early kmemleak_* call to the early_log buffer. These calls will be 785 * Log an early kmemleak_* call to the early_log buffer. These calls will be
716 * processed later once kmemleak is fully initialized. 786 * processed later once kmemleak is fully initialized.
717 */ 787 */
718static void log_early(int op_type, const void *ptr, size_t size, 788static void __init log_early(int op_type, const void *ptr, size_t size,
719 int min_count, unsigned long offset, size_t length) 789 int min_count, unsigned long offset, size_t length)
720{ 790{
721 unsigned long flags; 791 unsigned long flags;
722 struct early_log *log; 792 struct early_log *log;
723 793
724 if (crt_early_log >= ARRAY_SIZE(early_log)) { 794 if (crt_early_log >= ARRAY_SIZE(early_log)) {
725 pr_warning("Early log buffer exceeded\n"); 795 pr_warning("Early log buffer exceeded, "
796 "please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n");
726 kmemleak_disable(); 797 kmemleak_disable();
727 return; 798 return;
728 } 799 }
@@ -739,16 +810,45 @@ static void log_early(int op_type, const void *ptr, size_t size,
739 log->min_count = min_count; 810 log->min_count = min_count;
740 log->offset = offset; 811 log->offset = offset;
741 log->length = length; 812 log->length = length;
813 if (op_type == KMEMLEAK_ALLOC)
814 log->trace_len = __save_stack_trace(log->trace);
742 crt_early_log++; 815 crt_early_log++;
743 local_irq_restore(flags); 816 local_irq_restore(flags);
744} 817}
745 818
746/* 819/*
820 * Log an early allocated block and populate the stack trace.
821 */
822static void early_alloc(struct early_log *log)
823{
824 struct kmemleak_object *object;
825 unsigned long flags;
826 int i;
827
828 if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr))
829 return;
830
831 /*
832 * RCU locking needed to ensure object is not freed via put_object().
833 */
834 rcu_read_lock();
835 object = create_object((unsigned long)log->ptr, log->size,
836 log->min_count, GFP_KERNEL);
837 spin_lock_irqsave(&object->lock, flags);
838 for (i = 0; i < log->trace_len; i++)
839 object->trace[i] = log->trace[i];
840 object->trace_len = log->trace_len;
841 spin_unlock_irqrestore(&object->lock, flags);
842 rcu_read_unlock();
843}
844
845/*
747 * Memory allocation function callback. This function is called from the 846 * Memory allocation function callback. This function is called from the
748 * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, 847 * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
749 * vmalloc etc.). 848 * vmalloc etc.).
750 */ 849 */
751void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) 850void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
851 gfp_t gfp)
752{ 852{
753 pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); 853 pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
754 854
@@ -763,7 +863,7 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc);
763 * Memory freeing function callback. This function is called from the kernel 863 * Memory freeing function callback. This function is called from the kernel
764 * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). 864 * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
765 */ 865 */
766void kmemleak_free(const void *ptr) 866void __ref kmemleak_free(const void *ptr)
767{ 867{
768 pr_debug("%s(0x%p)\n", __func__, ptr); 868 pr_debug("%s(0x%p)\n", __func__, ptr);
769 869
@@ -778,7 +878,7 @@ EXPORT_SYMBOL_GPL(kmemleak_free);
778 * Partial memory freeing function callback. This function is usually called 878 * Partial memory freeing function callback. This function is usually called
779 * from bootmem allocator when (part of) a memory block is freed. 879 * from bootmem allocator when (part of) a memory block is freed.
780 */ 880 */
781void kmemleak_free_part(const void *ptr, size_t size) 881void __ref kmemleak_free_part(const void *ptr, size_t size)
782{ 882{
783 pr_debug("%s(0x%p)\n", __func__, ptr); 883 pr_debug("%s(0x%p)\n", __func__, ptr);
784 884
@@ -793,7 +893,7 @@ EXPORT_SYMBOL_GPL(kmemleak_free_part);
793 * Mark an already allocated memory block as a false positive. This will cause 893 * Mark an already allocated memory block as a false positive. This will cause
794 * the block to no longer be reported as leak and always be scanned. 894 * the block to no longer be reported as leak and always be scanned.
795 */ 895 */
796void kmemleak_not_leak(const void *ptr) 896void __ref kmemleak_not_leak(const void *ptr)
797{ 897{
798 pr_debug("%s(0x%p)\n", __func__, ptr); 898 pr_debug("%s(0x%p)\n", __func__, ptr);
799 899
@@ -809,7 +909,7 @@ EXPORT_SYMBOL(kmemleak_not_leak);
809 * corresponding block is not a leak and does not contain any references to 909 * corresponding block is not a leak and does not contain any references to
810 * other allocated memory blocks. 910 * other allocated memory blocks.
811 */ 911 */
812void kmemleak_ignore(const void *ptr) 912void __ref kmemleak_ignore(const void *ptr)
813{ 913{
814 pr_debug("%s(0x%p)\n", __func__, ptr); 914 pr_debug("%s(0x%p)\n", __func__, ptr);
815 915
@@ -823,8 +923,8 @@ EXPORT_SYMBOL(kmemleak_ignore);
823/* 923/*
824 * Limit the range to be scanned in an allocated memory block. 924 * Limit the range to be scanned in an allocated memory block.
825 */ 925 */
826void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length, 926void __ref kmemleak_scan_area(const void *ptr, unsigned long offset,
827 gfp_t gfp) 927 size_t length, gfp_t gfp)
828{ 928{
829 pr_debug("%s(0x%p)\n", __func__, ptr); 929 pr_debug("%s(0x%p)\n", __func__, ptr);
830 930
@@ -838,7 +938,7 @@ EXPORT_SYMBOL(kmemleak_scan_area);
838/* 938/*
839 * Inform kmemleak not to scan the given memory block. 939 * Inform kmemleak not to scan the given memory block.
840 */ 940 */
841void kmemleak_no_scan(const void *ptr) 941void __ref kmemleak_no_scan(const void *ptr)
842{ 942{
843 pr_debug("%s(0x%p)\n", __func__, ptr); 943 pr_debug("%s(0x%p)\n", __func__, ptr);
844 944
@@ -882,15 +982,22 @@ static void scan_block(void *_start, void *_end,
882 unsigned long *end = _end - (BYTES_PER_POINTER - 1); 982 unsigned long *end = _end - (BYTES_PER_POINTER - 1);
883 983
884 for (ptr = start; ptr < end; ptr++) { 984 for (ptr = start; ptr < end; ptr++) {
885 unsigned long flags;
886 unsigned long pointer = *ptr;
887 struct kmemleak_object *object; 985 struct kmemleak_object *object;
986 unsigned long flags;
987 unsigned long pointer;
888 988
889 if (allow_resched) 989 if (allow_resched)
890 cond_resched(); 990 cond_resched();
891 if (scan_should_stop()) 991 if (scan_should_stop())
892 break; 992 break;
893 993
994 /* don't scan uninitialized memory */
995 if (!kmemcheck_is_obj_initialized((unsigned long)ptr,
996 BYTES_PER_POINTER))
997 continue;
998
999 pointer = *ptr;
1000
894 object = find_and_get_object(pointer, 1); 1001 object = find_and_get_object(pointer, 1);
895 if (!object) 1002 if (!object)
896 continue; 1003 continue;
@@ -949,10 +1056,21 @@ static void scan_object(struct kmemleak_object *object)
949 if (!(object->flags & OBJECT_ALLOCATED)) 1056 if (!(object->flags & OBJECT_ALLOCATED))
950 /* already freed object */ 1057 /* already freed object */
951 goto out; 1058 goto out;
952 if (hlist_empty(&object->area_list)) 1059 if (hlist_empty(&object->area_list)) {
953 scan_block((void *)object->pointer, 1060 void *start = (void *)object->pointer;
954 (void *)(object->pointer + object->size), object, 0); 1061 void *end = (void *)(object->pointer + object->size);
955 else 1062
1063 while (start < end && (object->flags & OBJECT_ALLOCATED) &&
1064 !(object->flags & OBJECT_NO_SCAN)) {
1065 scan_block(start, min(start + MAX_SCAN_SIZE, end),
1066 object, 0);
1067 start += MAX_SCAN_SIZE;
1068
1069 spin_unlock_irqrestore(&object->lock, flags);
1070 cond_resched();
1071 spin_lock_irqsave(&object->lock, flags);
1072 }
1073 } else
956 hlist_for_each_entry(area, elem, &object->area_list, node) 1074 hlist_for_each_entry(area, elem, &object->area_list, node)
957 scan_block((void *)(object->pointer + area->offset), 1075 scan_block((void *)(object->pointer + area->offset),
958 (void *)(object->pointer + area->offset 1076 (void *)(object->pointer + area->offset
@@ -970,7 +1088,6 @@ static void kmemleak_scan(void)
970{ 1088{
971 unsigned long flags; 1089 unsigned long flags;
972 struct kmemleak_object *object, *tmp; 1090 struct kmemleak_object *object, *tmp;
973 struct task_struct *task;
974 int i; 1091 int i;
975 int new_leaks = 0; 1092 int new_leaks = 0;
976 int gray_list_pass = 0; 1093 int gray_list_pass = 0;
@@ -1037,15 +1154,16 @@ static void kmemleak_scan(void)
1037 } 1154 }
1038 1155
1039 /* 1156 /*
1040 * Scanning the task stacks may introduce false negatives and it is 1157 * Scanning the task stacks (may introduce false negatives).
1041 * not enabled by default.
1042 */ 1158 */
1043 if (kmemleak_stack_scan) { 1159 if (kmemleak_stack_scan) {
1160 struct task_struct *p, *g;
1161
1044 read_lock(&tasklist_lock); 1162 read_lock(&tasklist_lock);
1045 for_each_process(task) 1163 do_each_thread(g, p) {
1046 scan_block(task_stack_page(task), 1164 scan_block(task_stack_page(p), task_stack_page(p) +
1047 task_stack_page(task) + THREAD_SIZE, 1165 THREAD_SIZE, NULL, 0);
1048 NULL, 0); 1166 } while_each_thread(g, p);
1049 read_unlock(&tasklist_lock); 1167 read_unlock(&tasklist_lock);
1050 } 1168 }
1051 1169
@@ -1170,7 +1288,7 @@ static int kmemleak_scan_thread(void *arg)
1170 * Start the automatic memory scanning thread. This function must be called 1288 * Start the automatic memory scanning thread. This function must be called
1171 * with the scan_mutex held. 1289 * with the scan_mutex held.
1172 */ 1290 */
1173void start_scan_thread(void) 1291static void start_scan_thread(void)
1174{ 1292{
1175 if (scan_thread) 1293 if (scan_thread)
1176 return; 1294 return;
@@ -1185,7 +1303,7 @@ void start_scan_thread(void)
1185 * Stop the automatic memory scanning thread. This function must be called 1303 * Stop the automatic memory scanning thread. This function must be called
1186 * with the scan_mutex held. 1304 * with the scan_mutex held.
1187 */ 1305 */
1188void stop_scan_thread(void) 1306static void stop_scan_thread(void)
1189{ 1307{
1190 if (scan_thread) { 1308 if (scan_thread) {
1191 kthread_stop(scan_thread); 1309 kthread_stop(scan_thread);
@@ -1294,6 +1412,49 @@ static int kmemleak_release(struct inode *inode, struct file *file)
1294 return seq_release(inode, file); 1412 return seq_release(inode, file);
1295} 1413}
1296 1414
1415static int dump_str_object_info(const char *str)
1416{
1417 unsigned long flags;
1418 struct kmemleak_object *object;
1419 unsigned long addr;
1420
1421	addr = simple_strtoul(str, NULL, 0);
1422 object = find_and_get_object(addr, 0);
1423 if (!object) {
1424 pr_info("Unknown object at 0x%08lx\n", addr);
1425 return -EINVAL;
1426 }
1427
1428 spin_lock_irqsave(&object->lock, flags);
1429 dump_object_info(object);
1430 spin_unlock_irqrestore(&object->lock, flags);
1431
1432 put_object(object);
1433 return 0;
1434}
1435
1436/*
1437 * We use grey instead of black to ensure we can do future scans on the same
1438 * objects. If we did not do future scans these black objects could
1439 * potentially contain references to newly allocated objects in the future and
1440 * we'd end up with false positives.
1441 */
1442static void kmemleak_clear(void)
1443{
1444 struct kmemleak_object *object;
1445 unsigned long flags;
1446
1447 rcu_read_lock();
1448 list_for_each_entry_rcu(object, &object_list, object_list) {
1449 spin_lock_irqsave(&object->lock, flags);
1450 if ((object->flags & OBJECT_REPORTED) &&
1451 unreferenced_object(object))
1452 __paint_it(object, KMEMLEAK_GREY);
1453 spin_unlock_irqrestore(&object->lock, flags);
1454 }
1455 rcu_read_unlock();
1456}
1457
1297/* 1458/*
1298 * File write operation to configure kmemleak at run-time. The following 1459 * File write operation to configure kmemleak at run-time. The following
1299 * commands can be written to the /sys/kernel/debug/kmemleak file: 1460 * commands can be written to the /sys/kernel/debug/kmemleak file:
@@ -1305,6 +1466,9 @@ static int kmemleak_release(struct inode *inode, struct file *file)
1305 * scan=... - set the automatic memory scanning period in seconds (0 to 1466 * scan=... - set the automatic memory scanning period in seconds (0 to
1306 * disable it) 1467 * disable it)
1307 * scan - trigger a memory scan 1468 * scan - trigger a memory scan
1469 * clear - mark all current reported unreferenced kmemleak objects as
1470 * grey to ignore printing them
1471 * dump=... - dump information about the object found at the given address
1308 */ 1472 */
1309static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, 1473static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1310 size_t size, loff_t *ppos) 1474 size_t size, loff_t *ppos)
@@ -1345,6 +1509,10 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1345 } 1509 }
1346 } else if (strncmp(buf, "scan", 4) == 0) 1510 } else if (strncmp(buf, "scan", 4) == 0)
1347 kmemleak_scan(); 1511 kmemleak_scan();
1512 else if (strncmp(buf, "clear", 5) == 0)
1513 kmemleak_clear();
1514 else if (strncmp(buf, "dump=", 5) == 0)
1515 ret = dump_str_object_info(buf + 5);
1348 else 1516 else
1349 ret = -EINVAL; 1517 ret = -EINVAL;
1350 1518
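
With the two new commands wired into kmemleak_write(), the debugfs file understands clear and dump=<addr> in addition to scan and the other settings listed above. A minimal sketch of driving the interface from userspace, assuming a kernel with kmemleak enabled and debugfs mounted at /sys/kernel/debug (the dump address below is only a placeholder):

#include <stdio.h>

/* Write one command string to the kmemleak control file. */
static int kmemleak_cmd(const char *cmd)
{
	FILE *f = fopen("/sys/kernel/debug/kmemleak", "w");

	if (!f) {
		perror("open /sys/kernel/debug/kmemleak");
		return -1;
	}
	fputs(cmd, f);
	fclose(f);
	return 0;
}

int main(void)
{
	kmemleak_cmd("scan");                     /* trigger an immediate scan */
	kmemleak_cmd("clear");                    /* grey out everything reported so far */
	kmemleak_cmd("dump=0xffff880012345678");  /* dump one object (example address) */
	return 0;
}
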
@@ -1371,7 +1539,7 @@ static const struct file_operations kmemleak_fops = {
1371 * Perform the freeing of the kmemleak internal objects after waiting for any 1539 * Perform the freeing of the kmemleak internal objects after waiting for any
1372 * current memory scan to complete. 1540 * current memory scan to complete.
1373 */ 1541 */
1374static int kmemleak_cleanup_thread(void *arg) 1542static void kmemleak_do_cleanup(struct work_struct *work)
1375{ 1543{
1376 struct kmemleak_object *object; 1544 struct kmemleak_object *object;
1377 1545
@@ -1383,22 +1551,9 @@ static int kmemleak_cleanup_thread(void *arg)
1383 delete_object_full(object->pointer); 1551 delete_object_full(object->pointer);
1384 rcu_read_unlock(); 1552 rcu_read_unlock();
1385 mutex_unlock(&scan_mutex); 1553 mutex_unlock(&scan_mutex);
1386
1387 return 0;
1388} 1554}
1389 1555
1390/* 1556static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
1391 * Start the clean-up thread.
1392 */
1393static void kmemleak_cleanup(void)
1394{
1395 struct task_struct *cleanup_thread;
1396
1397 cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL,
1398 "kmemleak-clean");
1399 if (IS_ERR(cleanup_thread))
1400 pr_warning("Failed to create the clean-up thread\n");
1401}
1402 1557
1403/* 1558/*
1404 * Disable kmemleak. No memory allocation/freeing will be traced once this 1559 * Disable kmemleak. No memory allocation/freeing will be traced once this
@@ -1416,7 +1571,7 @@ static void kmemleak_disable(void)
1416 1571
1417 /* check whether it is too early for a kernel thread */ 1572 /* check whether it is too early for a kernel thread */
1418 if (atomic_read(&kmemleak_initialized)) 1573 if (atomic_read(&kmemleak_initialized))
1419 kmemleak_cleanup(); 1574 schedule_work(&cleanup_work);
1420 1575
1421 pr_info("Kernel memory leak detector disabled\n"); 1576 pr_info("Kernel memory leak detector disabled\n");
1422} 1577}
@@ -1469,8 +1624,7 @@ void __init kmemleak_init(void)
1469 1624
1470 switch (log->op_type) { 1625 switch (log->op_type) {
1471 case KMEMLEAK_ALLOC: 1626 case KMEMLEAK_ALLOC:
1472 kmemleak_alloc(log->ptr, log->size, log->min_count, 1627 early_alloc(log);
1473 GFP_KERNEL);
1474 break; 1628 break;
1475 case KMEMLEAK_FREE: 1629 case KMEMLEAK_FREE:
1476 kmemleak_free(log->ptr); 1630 kmemleak_free(log->ptr);
@@ -1513,7 +1667,7 @@ static int __init kmemleak_late_init(void)
1513 * after setting kmemleak_initialized and we may end up with 1667 * after setting kmemleak_initialized and we may end up with
1514 * two clean-up threads but serialized by scan_mutex. 1668 * two clean-up threads but serialized by scan_mutex.
1515 */ 1669 */
1516 kmemleak_cleanup(); 1670 schedule_work(&cleanup_work);
1517 return -ENOMEM; 1671 return -ENOMEM;
1518 } 1672 }
1519 1673
diff --git a/mm/memory.c b/mm/memory.c
index aede2ce3aba4..e8f63d9961ea 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -56,6 +56,7 @@
56#include <linux/swapops.h> 56#include <linux/swapops.h>
57#include <linux/elf.h> 57#include <linux/elf.h>
58 58
59#include <asm/io.h>
59#include <asm/pgalloc.h> 60#include <asm/pgalloc.h>
60#include <asm/uaccess.h> 61#include <asm/uaccess.h>
61#include <asm/tlb.h> 62#include <asm/tlb.h>
diff --git a/mm/mmap.c b/mm/mmap.c
index 8101de490c73..26892e346d8f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -905,7 +905,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
905#endif /* CONFIG_PROC_FS */ 905#endif /* CONFIG_PROC_FS */
906 906
907/* 907/*
908 * The caller must hold down_write(current->mm->mmap_sem). 908 * The caller must hold down_write(&current->mm->mmap_sem).
909 */ 909 */
910 910
911unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 911unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
diff --git a/mm/nommu.c b/mm/nommu.c
index 4bde489ec431..66e81e7e9fe9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1352,6 +1352,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1352 } 1352 }
1353 1353
1354 vma->vm_region = region; 1354 vma->vm_region = region;
1355 add_nommu_region(region);
1355 1356
1356 /* set up the mapping */ 1357 /* set up the mapping */
1357 if (file && vma->vm_flags & VM_SHARED) 1358 if (file && vma->vm_flags & VM_SHARED)
@@ -1361,8 +1362,6 @@ unsigned long do_mmap_pgoff(struct file *file,
1361 if (ret < 0) 1362 if (ret < 0)
1362 goto error_put_region; 1363 goto error_put_region;
1363 1364
1364 add_nommu_region(region);
1365
1366 /* okay... we have a mapping; now we have to register it */ 1365 /* okay... we have a mapping; now we have to register it */
1367 result = vma->vm_start; 1366 result = vma->vm_start;
1368 1367
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 81627ebcd313..1eea4fa0d410 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,15 +36,6 @@
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37 37
38/* 38/*
39 * The maximum number of pages to writeout in a single bdflush/kupdate
40 * operation. We do this so we don't hold I_SYNC against an inode for
41 * enormous amounts of time, which would block a userspace task which has
42 * been forced to throttle against that inode. Also, the code reevaluates
43 * the dirty each time it has written this many pages.
44 */
45#define MAX_WRITEBACK_PAGES 1024
46
47/*
48 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 39 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
49 * will look to see if it needs to force writeback or throttling. 40 * will look to see if it needs to force writeback or throttling.
50 */ 41 */
@@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode);
117/* End of sysctl-exported parameters */ 108/* End of sysctl-exported parameters */
118 109
119 110
120static void background_writeout(unsigned long _min_pages);
121
122/* 111/*
123 * Scale the writeback cache size proportional to the relative writeout speeds. 112 * Scale the writeback cache size proportional to the relative writeout speeds.
124 * 113 *
@@ -320,15 +309,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
320/* 309/*
321 * 310 *
322 */ 311 */
323static DEFINE_SPINLOCK(bdi_lock);
324static unsigned int bdi_min_ratio; 312static unsigned int bdi_min_ratio;
325 313
326int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) 314int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
327{ 315{
328 int ret = 0; 316 int ret = 0;
329 unsigned long flags;
330 317
331 spin_lock_irqsave(&bdi_lock, flags); 318 spin_lock_bh(&bdi_lock);
332 if (min_ratio > bdi->max_ratio) { 319 if (min_ratio > bdi->max_ratio) {
333 ret = -EINVAL; 320 ret = -EINVAL;
334 } else { 321 } else {
@@ -340,27 +327,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
340 ret = -EINVAL; 327 ret = -EINVAL;
341 } 328 }
342 } 329 }
343 spin_unlock_irqrestore(&bdi_lock, flags); 330 spin_unlock_bh(&bdi_lock);
344 331
345 return ret; 332 return ret;
346} 333}
347 334
348int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) 335int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
349{ 336{
350 unsigned long flags;
351 int ret = 0; 337 int ret = 0;
352 338
353 if (max_ratio > 100) 339 if (max_ratio > 100)
354 return -EINVAL; 340 return -EINVAL;
355 341
356 spin_lock_irqsave(&bdi_lock, flags); 342 spin_lock_bh(&bdi_lock);
357 if (bdi->min_ratio > max_ratio) { 343 if (bdi->min_ratio > max_ratio) {
358 ret = -EINVAL; 344 ret = -EINVAL;
359 } else { 345 } else {
360 bdi->max_ratio = max_ratio; 346 bdi->max_ratio = max_ratio;
361 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; 347 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
362 } 348 }
363 spin_unlock_irqrestore(&bdi_lock, flags); 349 spin_unlock_bh(&bdi_lock);
364 350
365 return ret; 351 return ret;
366} 352}
@@ -546,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping)
546 * up. 532 * up.
547 */ 533 */
548 if (bdi_nr_reclaimable > bdi_thresh) { 534 if (bdi_nr_reclaimable > bdi_thresh) {
549 writeback_inodes(&wbc); 535 writeback_inodes_wbc(&wbc);
550 pages_written += write_chunk - wbc.nr_to_write; 536 pages_written += write_chunk - wbc.nr_to_write;
551 get_dirty_limits(&background_thresh, &dirty_thresh, 537 get_dirty_limits(&background_thresh, &dirty_thresh,
552 &bdi_thresh, bdi); 538 &bdi_thresh, bdi);
@@ -575,7 +561,7 @@ static void balance_dirty_pages(struct address_space *mapping)
575 if (pages_written >= write_chunk) 561 if (pages_written >= write_chunk)
576 break; /* We've done our duty */ 562 break; /* We've done our duty */
577 563
578 congestion_wait(BLK_RW_ASYNC, HZ/10); 564 schedule_timeout(1);
579 } 565 }
580 566
581 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 567 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -594,10 +580,10 @@ static void balance_dirty_pages(struct address_space *mapping)
594 * background_thresh, to keep the amount of dirty memory low. 580 * background_thresh, to keep the amount of dirty memory low.
595 */ 581 */
596 if ((laptop_mode && pages_written) || 582 if ((laptop_mode && pages_written) ||
597 (!laptop_mode && (global_page_state(NR_FILE_DIRTY) 583 (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
598 + global_page_state(NR_UNSTABLE_NFS) 584 + global_page_state(NR_UNSTABLE_NFS))
599 > background_thresh))) 585 > background_thresh)))
600 pdflush_operation(background_writeout, 0); 586 bdi_start_writeback(bdi, nr_writeback);
601} 587}
602 588
603void set_page_dirty_balance(struct page *page, int page_mkwrite) 589void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -610,6 +596,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
610 } 596 }
611} 597}
612 598
599static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
600
613/** 601/**
614 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 602 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
615 * @mapping: address_space which was dirtied 603 * @mapping: address_space which was dirtied
@@ -627,7 +615,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
627void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 615void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
628 unsigned long nr_pages_dirtied) 616 unsigned long nr_pages_dirtied)
629{ 617{
630 static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
631 unsigned long ratelimit; 618 unsigned long ratelimit;
632 unsigned long *p; 619 unsigned long *p;
633 620
@@ -640,7 +627,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
640 * tasks in balance_dirty_pages(). Period. 627 * tasks in balance_dirty_pages(). Period.
641 */ 628 */
642 preempt_disable(); 629 preempt_disable();
643 p = &__get_cpu_var(ratelimits); 630 p = &__get_cpu_var(bdp_ratelimits);
644 *p += nr_pages_dirtied; 631 *p += nr_pages_dirtied;
645 if (unlikely(*p >= ratelimit)) { 632 if (unlikely(*p >= ratelimit)) {
646 *p = 0; 633 *p = 0;
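
The ratelimit counter that used to be a function-local static is now the file-scope per-CPU variable bdp_ratelimits; either way, each CPU accumulates recently dirtied pages and only enters the expensive balance_dirty_pages() path once the count crosses the ratelimit. A sketch of the same batching idea with a per-thread counter standing in for the per-CPU one (the threshold value is made up):

#include <stdio.h>

#define RATELIMIT 32   /* pages dirtied before we bother rebalancing */

/* One counter per thread, analogous to the per-CPU bdp_ratelimits. */
static __thread unsigned long dirtied;

static void balance_dirty_pages(void)
{
	printf("rebalancing after %d dirtied pages\n", RATELIMIT);
}

static void dirty_pages_ratelimited(unsigned long nr_pages)
{
	dirtied += nr_pages;
	if (dirtied >= RATELIMIT) {
		dirtied = 0;
		balance_dirty_pages();   /* expensive path, taken rarely */
	}
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		dirty_pages_ratelimited(1);
	return 0;
}
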
@@ -681,153 +668,35 @@ void throttle_vm_writeout(gfp_t gfp_mask)
681 } 668 }
682} 669}
683 670
684/*
685 * writeback at least _min_pages, and keep writing until the amount of dirty
686 * memory is less than the background threshold, or until we're all clean.
687 */
688static void background_writeout(unsigned long _min_pages)
689{
690 long min_pages = _min_pages;
691 struct writeback_control wbc = {
692 .bdi = NULL,
693 .sync_mode = WB_SYNC_NONE,
694 .older_than_this = NULL,
695 .nr_to_write = 0,
696 .nonblocking = 1,
697 .range_cyclic = 1,
698 };
699
700 for ( ; ; ) {
701 unsigned long background_thresh;
702 unsigned long dirty_thresh;
703
704 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
705 if (global_page_state(NR_FILE_DIRTY) +
706 global_page_state(NR_UNSTABLE_NFS) < background_thresh
707 && min_pages <= 0)
708 break;
709 wbc.more_io = 0;
710 wbc.encountered_congestion = 0;
711 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
712 wbc.pages_skipped = 0;
713 writeback_inodes(&wbc);
714 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
715 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
716 /* Wrote less than expected */
717 if (wbc.encountered_congestion || wbc.more_io)
718 congestion_wait(BLK_RW_ASYNC, HZ/10);
719 else
720 break;
721 }
722 }
723}
724
725/*
726 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
727 * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
728 * -1 if all pdflush threads were busy.
729 */
730int wakeup_pdflush(long nr_pages)
731{
732 if (nr_pages == 0)
733 nr_pages = global_page_state(NR_FILE_DIRTY) +
734 global_page_state(NR_UNSTABLE_NFS);
735 return pdflush_operation(background_writeout, nr_pages);
736}
737
738static void wb_timer_fn(unsigned long unused);
739static void laptop_timer_fn(unsigned long unused); 671static void laptop_timer_fn(unsigned long unused);
740 672
741static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
742static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); 673static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
743 674
744/* 675/*
745 * Periodic writeback of "old" data.
746 *
747 * Define "old": the first time one of an inode's pages is dirtied, we mark the
748 * dirtying-time in the inode's address_space. So this periodic writeback code
749 * just walks the superblock inode list, writing back any inodes which are
750 * older than a specific point in time.
751 *
752 * Try to run once per dirty_writeback_interval. But if a writeback event
753 * takes longer than a dirty_writeback_interval interval, then leave a
754 * one-second gap.
755 *
756 * older_than_this takes precedence over nr_to_write. So we'll only write back
757 * all dirty pages if they are all attached to "old" mappings.
758 */
759static void wb_kupdate(unsigned long arg)
760{
761 unsigned long oldest_jif;
762 unsigned long start_jif;
763 unsigned long next_jif;
764 long nr_to_write;
765 struct writeback_control wbc = {
766 .bdi = NULL,
767 .sync_mode = WB_SYNC_NONE,
768 .older_than_this = &oldest_jif,
769 .nr_to_write = 0,
770 .nonblocking = 1,
771 .for_kupdate = 1,
772 .range_cyclic = 1,
773 };
774
775 sync_supers();
776
777 oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
778 start_jif = jiffies;
779 next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
780 nr_to_write = global_page_state(NR_FILE_DIRTY) +
781 global_page_state(NR_UNSTABLE_NFS) +
782 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
783 while (nr_to_write > 0) {
784 wbc.more_io = 0;
785 wbc.encountered_congestion = 0;
786 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
787 writeback_inodes(&wbc);
788 if (wbc.nr_to_write > 0) {
789 if (wbc.encountered_congestion || wbc.more_io)
790 congestion_wait(BLK_RW_ASYNC, HZ/10);
791 else
792 break; /* All the old data is written */
793 }
794 nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
795 }
796 if (time_before(next_jif, jiffies + HZ))
797 next_jif = jiffies + HZ;
798 if (dirty_writeback_interval)
799 mod_timer(&wb_timer, next_jif);
800}
801
802/*
803 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs 676 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
804 */ 677 */
805int dirty_writeback_centisecs_handler(ctl_table *table, int write, 678int dirty_writeback_centisecs_handler(ctl_table *table, int write,
806 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 679 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
807{ 680{
808 proc_dointvec(table, write, file, buffer, length, ppos); 681 proc_dointvec(table, write, file, buffer, length, ppos);
809 if (dirty_writeback_interval)
810 mod_timer(&wb_timer, jiffies +
811 msecs_to_jiffies(dirty_writeback_interval * 10));
812 else
813 del_timer(&wb_timer);
814 return 0; 682 return 0;
815} 683}
816 684
817static void wb_timer_fn(unsigned long unused) 685static void do_laptop_sync(struct work_struct *work)
818{ 686{
819 if (pdflush_operation(wb_kupdate, 0) < 0) 687 wakeup_flusher_threads(0);
820 mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ 688 kfree(work);
821}
822
823static void laptop_flush(unsigned long unused)
824{
825 sys_sync();
826} 689}
827 690
828static void laptop_timer_fn(unsigned long unused) 691static void laptop_timer_fn(unsigned long unused)
829{ 692{
830 pdflush_operation(laptop_flush, 0); 693 struct work_struct *work;
694
695 work = kmalloc(sizeof(*work), GFP_ATOMIC);
696 if (work) {
697 INIT_WORK(work, do_laptop_sync);
698 schedule_work(work);
699 }
831} 700}
832 701
833/* 702/*
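
laptop_timer_fn() runs in timer (atomic) context, so instead of kicking pdflush it now allocates a work item with GFP_ATOMIC and lets the handler perform the flush in process context and free the item afterwards. A rough userspace analogue of that hand-off, with a detached thread playing the part of the workqueue and sync(2) standing in for wakeup_flusher_threads(); names and payload are illustrative:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct sync_work {
	const char *reason;   /* illustrative payload */
};

/* Worker: performs the deferred "sync" and frees its own work item,
 * mirroring do_laptop_sync() calling kfree(work) after the flush. */
static void *do_sync(void *arg)
{
	struct sync_work *w = arg;

	printf("flushing dirty data (%s)\n", w->reason);
	sync();               /* userspace stand-in for wakeup_flusher_threads(0) */
	free(w);
	return NULL;
}

/* Timer-side path: allocate a work item and hand it off without blocking. */
static void laptop_timer_fired(void)
{
	struct sync_work *w = malloc(sizeof(*w));
	pthread_t tid;

	if (!w)
		return;       /* like the GFP_ATOMIC failure path: just skip */
	w->reason = "laptop mode timer";
	if (pthread_create(&tid, NULL, do_sync, w) != 0) {
		free(w);
		return;
	}
	pthread_detach(tid);
}

int main(void)
{
	laptop_timer_fired();
	sleep(1);             /* give the detached worker time to run */
	return 0;
}
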
@@ -910,8 +779,6 @@ void __init page_writeback_init(void)
910{ 779{
911 int shift; 780 int shift;
912 781
913 mod_timer(&wb_timer,
914 jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
915 writeback_set_ratelimit(); 782 writeback_set_ratelimit();
916 register_cpu_notifier(&ratelimit_nb); 783 register_cpu_notifier(&ratelimit_nb);
917 784
@@ -1145,12 +1012,10 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
1145 1012
1146 if (wbc->nr_to_write <= 0) 1013 if (wbc->nr_to_write <= 0)
1147 return 0; 1014 return 0;
1148 wbc->for_writepages = 1;
1149 if (mapping->a_ops->writepages) 1015 if (mapping->a_ops->writepages)
1150 ret = mapping->a_ops->writepages(mapping, wbc); 1016 ret = mapping->a_ops->writepages(mapping, wbc);
1151 else 1017 else
1152 ret = generic_writepages(mapping, wbc); 1018 ret = generic_writepages(mapping, wbc);
1153 wbc->for_writepages = 0;
1154 return ret; 1019 return ret;
1155} 1020}
1156 1021
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5cc986eb9f6f..a0de15f46987 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -817,13 +817,15 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
817 * aggressive about taking ownership of free pages 817
818 */ 818 */
819 if (unlikely(current_order >= (pageblock_order >> 1)) || 819 if (unlikely(current_order >= (pageblock_order >> 1)) ||
820 start_migratetype == MIGRATE_RECLAIMABLE) { 820 start_migratetype == MIGRATE_RECLAIMABLE ||
821 page_group_by_mobility_disabled) {
821 unsigned long pages; 822 unsigned long pages;
822 pages = move_freepages_block(zone, page, 823 pages = move_freepages_block(zone, page,
823 start_migratetype); 824 start_migratetype);
824 825
825 /* Claim the whole block if over half of it is free */ 826 /* Claim the whole block if over half of it is free */
826 if (pages >= (1 << (pageblock_order-1))) 827 if (pages >= (1 << (pageblock_order-1)) ||
828 page_group_by_mobility_disabled)
827 set_pageblock_migratetype(page, 829 set_pageblock_migratetype(page,
828 start_migratetype); 830 start_migratetype);
829 831
diff --git a/mm/pdflush.c b/mm/pdflush.c
deleted file mode 100644
index 235ac440c44e..000000000000
--- a/mm/pdflush.c
+++ /dev/null
@@ -1,269 +0,0 @@
1/*
2 * mm/pdflush.c - worker threads for writing back filesystem data
3 *
4 * Copyright (C) 2002, Linus Torvalds.
5 *
6 * 09Apr2002 Andrew Morton
7 * Initial version
8 * 29Feb2004 kaos@sgi.com
9 * Move worker thread creation to kthread to avoid chewing
10 * up stack space with nested calls to kernel_thread.
11 */
12
13#include <linux/sched.h>
14#include <linux/list.h>
15#include <linux/signal.h>
16#include <linux/spinlock.h>
17#include <linux/gfp.h>
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/fs.h> /* Needed by writeback.h */
21#include <linux/writeback.h> /* Prototypes pdflush_operation() */
22#include <linux/kthread.h>
23#include <linux/cpuset.h>
24#include <linux/freezer.h>
25
26
27/*
28 * Minimum and maximum number of pdflush instances
29 */
30#define MIN_PDFLUSH_THREADS 2
31#define MAX_PDFLUSH_THREADS 8
32
33static void start_one_pdflush_thread(void);
34
35
36/*
37 * The pdflush threads are worker threads for writing back dirty data.
38 * Ideally, we'd like one thread per active disk spindle. But the disk
39 * topology is very hard to divine at this level. Instead, we take
40 * care in various places to prevent more than one pdflush thread from
41 * performing writeback against a single filesystem. pdflush threads
42 * have the PF_FLUSHER flag set in current->flags to aid in this.
43 */
44
45/*
46 * All the pdflush threads. Protected by pdflush_lock
47 */
48static LIST_HEAD(pdflush_list);
49static DEFINE_SPINLOCK(pdflush_lock);
50
51/*
52 * The count of currently-running pdflush threads. Protected
53 * by pdflush_lock.
54 *
55 * Readable by sysctl, but not writable. Published to userspace at
56 * /proc/sys/vm/nr_pdflush_threads.
57 */
58int nr_pdflush_threads = 0;
59
60/*
61 * The time at which the pdflush thread pool last went empty
62 */
63static unsigned long last_empty_jifs;
64
65/*
66 * The pdflush thread.
67 *
68 * Thread pool management algorithm:
69 *
70 * - The minimum and maximum number of pdflush instances are bound
71 * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
72 *
73 * - If there have been no idle pdflush instances for 1 second, create
74 * a new one.
75 *
76 * - If the least-recently-went-to-sleep pdflush thread has been asleep
77 * for more than one second, terminate a thread.
78 */
79
80/*
81 * A structure for passing work to a pdflush thread. Also for passing
82 * state information between pdflush threads. Protected by pdflush_lock.
83 */
84struct pdflush_work {
85 struct task_struct *who; /* The thread */
86 void (*fn)(unsigned long); /* A callback function */
87 unsigned long arg0; /* An argument to the callback */
88 struct list_head list; /* On pdflush_list, when idle */
89 unsigned long when_i_went_to_sleep;
90};
91
92static int __pdflush(struct pdflush_work *my_work)
93{
94 current->flags |= PF_FLUSHER | PF_SWAPWRITE;
95 set_freezable();
96 my_work->fn = NULL;
97 my_work->who = current;
98 INIT_LIST_HEAD(&my_work->list);
99
100 spin_lock_irq(&pdflush_lock);
101 for ( ; ; ) {
102 struct pdflush_work *pdf;
103
104 set_current_state(TASK_INTERRUPTIBLE);
105 list_move(&my_work->list, &pdflush_list);
106 my_work->when_i_went_to_sleep = jiffies;
107 spin_unlock_irq(&pdflush_lock);
108 schedule();
109 try_to_freeze();
110 spin_lock_irq(&pdflush_lock);
111 if (!list_empty(&my_work->list)) {
112 /*
113 * Someone woke us up, but without removing our control
114 * structure from the global list. swsusp will do this
115 * in try_to_freeze()->refrigerator(). Handle it.
116 */
117 my_work->fn = NULL;
118 continue;
119 }
120 if (my_work->fn == NULL) {
121 printk("pdflush: bogus wakeup\n");
122 continue;
123 }
124 spin_unlock_irq(&pdflush_lock);
125
126 (*my_work->fn)(my_work->arg0);
127
128 spin_lock_irq(&pdflush_lock);
129
130 /*
131 * Thread creation: For how long have there been zero
132 * available threads?
133 *
134 * To throttle creation, we reset last_empty_jifs.
135 */
136 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
137 if (list_empty(&pdflush_list)) {
138 if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
139 last_empty_jifs = jiffies;
140 nr_pdflush_threads++;
141 spin_unlock_irq(&pdflush_lock);
142 start_one_pdflush_thread();
143 spin_lock_irq(&pdflush_lock);
144 }
145 }
146 }
147
148 my_work->fn = NULL;
149
150 /*
151 * Thread destruction: For how long has the sleepiest
152 * thread slept?
153 */
154 if (list_empty(&pdflush_list))
155 continue;
156 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
157 continue;
158 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
159 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
160 /* Limit exit rate */
161 pdf->when_i_went_to_sleep = jiffies;
162 break; /* exeunt */
163 }
164 }
165 nr_pdflush_threads--;
166 spin_unlock_irq(&pdflush_lock);
167 return 0;
168}
169
170/*
171 * Of course, my_work wants to be just a local in __pdflush(). It is
172 * separated out in this manner to hopefully prevent the compiler from
173 * performing unfortunate optimisations against the auto variables. Because
174 * these are visible to other tasks and CPUs. (No problem has actually
175 * been observed. This is just paranoia).
176 */
177static int pdflush(void *dummy)
178{
179 struct pdflush_work my_work;
180 cpumask_var_t cpus_allowed;
181
182 /*
183 * Since the caller doesn't even check kthread_run() worked, let's not
184 * freak out too much if this fails.
185 */
186 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
187 printk(KERN_WARNING "pdflush failed to allocate cpumask\n");
188 return 0;
189 }
190
191 /*
192 * pdflush can spend a lot of time doing encryption via dm-crypt. We
193 * don't want to do that at keventd's priority.
194 */
195 set_user_nice(current, 0);
196
197 /*
198 * Some configs put our parent kthread in a limited cpuset,
199 * which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
200 * Our needs are more modest - cut back to our cpusets cpus_allowed.
201 * This is needed as pdflush's are dynamically created and destroyed.
202 * The boottime pdflush's are easily placed w/o these 2 lines.
203 */
204 cpuset_cpus_allowed(current, cpus_allowed);
205 set_cpus_allowed_ptr(current, cpus_allowed);
206 free_cpumask_var(cpus_allowed);
207
208 return __pdflush(&my_work);
209}
210
211/*
212 * Attempt to wake up a pdflush thread, and get it to do some work for you.
213 * Returns zero if it indeed managed to find a worker thread, and passed your
214 * payload to it.
215 */
216int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
217{
218 unsigned long flags;
219 int ret = 0;
220
221 BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
222
223 spin_lock_irqsave(&pdflush_lock, flags);
224 if (list_empty(&pdflush_list)) {
225 ret = -1;
226 } else {
227 struct pdflush_work *pdf;
228
229 pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
230 list_del_init(&pdf->list);
231 if (list_empty(&pdflush_list))
232 last_empty_jifs = jiffies;
233 pdf->fn = fn;
234 pdf->arg0 = arg0;
235 wake_up_process(pdf->who);
236 }
237 spin_unlock_irqrestore(&pdflush_lock, flags);
238
239 return ret;
240}
241
242static void start_one_pdflush_thread(void)
243{
244 struct task_struct *k;
245
246 k = kthread_run(pdflush, NULL, "pdflush");
247 if (unlikely(IS_ERR(k))) {
248 spin_lock_irq(&pdflush_lock);
249 nr_pdflush_threads--;
250 spin_unlock_irq(&pdflush_lock);
251 }
252}
253
254static int __init pdflush_init(void)
255{
256 int i;
257
258 /*
259 * Pre-set nr_pdflush_threads... If we fail to create,
260 * the count will be decremented.
261 */
262 nr_pdflush_threads = MIN_PDFLUSH_THREADS;
263
264 for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
265 start_one_pdflush_thread();
266 return 0;
267}
268
269module_init(pdflush_init);
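
mm/pdflush.c goes away in favour of the per-BDI flusher threads. The heuristic its header comment describes - grow the pool when no thread has been idle for a second, let a thread exit when the sleepiest one has slept for over a second, bounded by MIN/MAX_PDFLUSH_THREADS - condenses into two decision functions; the sketch below is a simplification using millisecond timestamps rather than jiffies:

#include <stdbool.h>
#include <stdio.h>

#define MIN_PDFLUSH_THREADS 2
#define MAX_PDFLUSH_THREADS 8
#define IDLE_MS 1000   /* the "one second" thresholds from the original heuristic */

/* Should the pool spawn another worker?  Yes if the pool has had no idle
 * worker for the last second and we are below the maximum. */
static bool should_grow(unsigned long now_ms, unsigned long last_empty_ms,
			int nr_threads, int nr_idle)
{
	return nr_idle == 0 &&
	       now_ms - last_empty_ms >= IDLE_MS &&
	       nr_threads < MAX_PDFLUSH_THREADS;
}

/* Should the calling worker exit?  Yes if the sleepiest worker has been
 * asleep for over a second and we are above the minimum. */
static bool should_shrink(unsigned long now_ms, unsigned long sleepiest_since_ms,
			  int nr_threads)
{
	return nr_threads > MIN_PDFLUSH_THREADS &&
	       now_ms - sleepiest_since_ms >= IDLE_MS;
}

int main(void)
{
	printf("grow:   %d\n", should_grow(5000, 3500, 4, 0));   /* 1: empty for 1.5s */
	printf("shrink: %d\n", should_shrink(5000, 2000, 4));    /* 1: idle for 3s */
	return 0;
}
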
diff --git a/mm/percpu.c b/mm/percpu.c
index 5fe37842e0ea..43d8cacfdaa5 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -8,12 +8,13 @@
8 * 8 *
9 * This is percpu allocator which can handle both static and dynamic 9 * This is percpu allocator which can handle both static and dynamic
10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each 10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each
11 * chunk is consisted of nr_cpu_ids units and the first chunk is used 11 * chunk is consisted of boot-time determined number of units and the
12 * for static percpu variables in the kernel image (special boot time 12 * first chunk is used for static percpu variables in the kernel image
13 * alloc/init handling necessary as these areas need to be brought up 13 * (special boot time alloc/init handling necessary as these areas
14 * before allocation services are running). Unit grows as necessary 14 * need to be brought up before allocation services are running).
15 * and all units grow or shrink in unison. When a chunk is filled up, 15 * Unit grows as necessary and all units grow or shrink in unison.
16 * another chunk is allocated. ie. in vmalloc area 16 * When a chunk is filled up, another chunk is allocated. ie. in
17 * vmalloc area
17 * 18 *
18 * c0 c1 c2 19 * c0 c1 c2
19 * ------------------- ------------------- ------------ 20 * ------------------- ------------------- ------------
@@ -22,11 +23,13 @@
22 * 23 *
23 * Allocation is done in offset-size areas of single unit space. Ie, 24 * Allocation is done in offset-size areas of single unit space. Ie,
24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, 25 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
25 * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring 26 * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to
26 * percpu base registers pcpu_unit_size apart. 27 * cpus. On NUMA, the mapping can be non-linear and even sparse.
28 * Percpu access can be done by configuring percpu base registers
29 * according to cpu to unit mapping and pcpu_unit_size.
27 * 30 *
28 * There are usually many small percpu allocations many of them as 31 * There are usually many small percpu allocations many of them being
29 * small as 4 bytes. The allocator organizes chunks into lists 32 * as small as 4 bytes. The allocator organizes chunks into lists
30 * according to free size and tries to allocate from the fullest one. 33 * according to free size and tries to allocate from the fullest one.
31 * Each chunk keeps the maximum contiguous area size hint which is 34 * Each chunk keeps the maximum contiguous area size hint which is
32 * guaranteed to be equal to or larger than the maximum contiguous 35
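
The rewritten header comment describes each chunk as a set of per-cpu units whose offsets inside the chunk may be non-linear or even sparse, so a given cpu's copy of an allocation at offset off lives at base_addr + unit_offsets[cpu] + off. A toy userspace model of that layout (unit count, unit size and the linear offset table are made-up numbers):

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS    4
#define UNIT_SIZE  1024   /* bytes per cpu unit in our toy chunk */

/* cpu -> byte offset of that cpu's unit inside the chunk.  In the real
 * allocator this mapping may be non-linear or sparse; here it is linear. */
static const size_t unit_offsets[NR_CPUS] = {
	0 * UNIT_SIZE, 1 * UNIT_SIZE, 2 * UNIT_SIZE, 3 * UNIT_SIZE,
};

/* Address of @cpu's copy of the per-cpu area at @off within the chunk. */
static void *per_cpu_ptr(void *base_addr, unsigned int cpu, size_t off)
{
	return (char *)base_addr + unit_offsets[cpu] + off;
}

int main(void)
{
	void *chunk = calloc(NR_CPUS, UNIT_SIZE);
	size_t off = 64;      /* pretend an area was allocated at offset 64 */

	/* each cpu's copy is independent storage */
	for (unsigned int cpu = 0; cpu < NR_CPUS; cpu++)
		*(int *)per_cpu_ptr(chunk, cpu, off) = 100 + cpu;

	for (unsigned int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%u copy = %d\n", cpu, *(int *)per_cpu_ptr(chunk, cpu, off));

	free(chunk);
	return 0;
}
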
@@ -43,7 +46,7 @@
43 * 46 *
44 * To use this allocator, arch code should do the followings. 47 * To use this allocator, arch code should do the followings.
45 * 48 *
46 * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 49 * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
47 * 50 *
48 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate 51 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
49 * regular address to percpu pointer and back if they need to be 52 * regular address to percpu pointer and back if they need to be
@@ -55,7 +58,9 @@
55 58
56#include <linux/bitmap.h> 59#include <linux/bitmap.h>
57#include <linux/bootmem.h> 60#include <linux/bootmem.h>
61#include <linux/err.h>
58#include <linux/list.h> 62#include <linux/list.h>
63#include <linux/log2.h>
59#include <linux/mm.h> 64#include <linux/mm.h>
60#include <linux/module.h> 65#include <linux/module.h>
61#include <linux/mutex.h> 66#include <linux/mutex.h>
@@ -89,25 +94,38 @@ struct pcpu_chunk {
89 struct list_head list; /* linked to pcpu_slot lists */ 94 struct list_head list; /* linked to pcpu_slot lists */
90 int free_size; /* free bytes in the chunk */ 95 int free_size; /* free bytes in the chunk */
91 int contig_hint; /* max contiguous size hint */ 96 int contig_hint; /* max contiguous size hint */
92 struct vm_struct *vm; /* mapped vmalloc region */ 97 void *base_addr; /* base address of this chunk */
93 int map_used; /* # of map entries used */ 98 int map_used; /* # of map entries used */
94 int map_alloc; /* # of map entries allocated */ 99 int map_alloc; /* # of map entries allocated */
95 int *map; /* allocation map */ 100 int *map; /* allocation map */
101 struct vm_struct **vms; /* mapped vmalloc regions */
96 bool immutable; /* no [de]population allowed */ 102 bool immutable; /* no [de]population allowed */
97 struct page **page; /* points to page array */ 103 unsigned long populated[]; /* populated bitmap */
98 struct page *page_ar[]; /* #cpus * UNIT_PAGES */
99}; 104};
100 105
101static int pcpu_unit_pages __read_mostly; 106static int pcpu_unit_pages __read_mostly;
102static int pcpu_unit_size __read_mostly; 107static int pcpu_unit_size __read_mostly;
103static int pcpu_chunk_size __read_mostly; 108static int pcpu_nr_units __read_mostly;
109static int pcpu_atom_size __read_mostly;
104static int pcpu_nr_slots __read_mostly; 110static int pcpu_nr_slots __read_mostly;
105static size_t pcpu_chunk_struct_size __read_mostly; 111static size_t pcpu_chunk_struct_size __read_mostly;
106 112
113/* cpus with the lowest and highest unit numbers */
114static unsigned int pcpu_first_unit_cpu __read_mostly;
115static unsigned int pcpu_last_unit_cpu __read_mostly;
116
107/* the address of the first chunk which starts with the kernel static area */ 117/* the address of the first chunk which starts with the kernel static area */
108void *pcpu_base_addr __read_mostly; 118void *pcpu_base_addr __read_mostly;
109EXPORT_SYMBOL_GPL(pcpu_base_addr); 119EXPORT_SYMBOL_GPL(pcpu_base_addr);
110 120
121static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */
122const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */
123
124/* group information, used for vm allocation */
125static int pcpu_nr_groups __read_mostly;
126static const unsigned long *pcpu_group_offsets __read_mostly;
127static const size_t *pcpu_group_sizes __read_mostly;
128
111/* 129/*
112 * The first chunk which always exists. Note that unlike other 130 * The first chunk which always exists. Note that unlike other
113 * chunks, this one can be allocated and mapped in several different 131 * chunks, this one can be allocated and mapped in several different
@@ -129,9 +147,9 @@ static int pcpu_reserved_chunk_limit;
129 * Synchronization rules. 147 * Synchronization rules.
130 * 148 *
131 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former 149 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
132 * protects allocation/reclaim paths, chunks and chunk->page arrays. 150 * protects allocation/reclaim paths, chunks, populated bitmap and
133 * The latter is a spinlock and protects the index data structures - 151 * vmalloc mapping. The latter is a spinlock and protects the index
134 * chunk slots, chunks and area maps in chunks. 152 * data structures - chunk slots, chunks and area maps in chunks.
135 * 153 *
136 * During allocation, pcpu_alloc_mutex is kept locked all the time and 154 * During allocation, pcpu_alloc_mutex is kept locked all the time and
137 * pcpu_lock is grabbed and released as necessary. All actual memory 155 * pcpu_lock is grabbed and released as necessary. All actual memory
@@ -178,26 +196,23 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
178 196
179static int pcpu_page_idx(unsigned int cpu, int page_idx) 197static int pcpu_page_idx(unsigned int cpu, int page_idx)
180{ 198{
181 return cpu * pcpu_unit_pages + page_idx; 199 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
182}
183
184static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
185 unsigned int cpu, int page_idx)
186{
187 return &chunk->page[pcpu_page_idx(cpu, page_idx)];
188} 200}
189 201
190static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, 202static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
191 unsigned int cpu, int page_idx) 203 unsigned int cpu, int page_idx)
192{ 204{
193 return (unsigned long)chunk->vm->addr + 205 return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
194 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); 206 (page_idx << PAGE_SHIFT);
195} 207}
196 208
197static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, 209static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
198 int page_idx) 210 unsigned int cpu, int page_idx)
199{ 211{
200 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; 212 /* must not be used on pre-mapped chunk */
213 WARN_ON(chunk->immutable);
214
215 return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
201} 216}
202 217
203/* set the pointer to a chunk in a page struct */ 218/* set the pointer to a chunk in a page struct */
@@ -212,6 +227,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
212 return (struct pcpu_chunk *)page->index; 227 return (struct pcpu_chunk *)page->index;
213} 228}
214 229
230static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
231{
232 *rs = find_next_zero_bit(chunk->populated, end, *rs);
233 *re = find_next_bit(chunk->populated, end, *rs + 1);
234}
235
236static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
237{
238 *rs = find_next_bit(chunk->populated, end, *rs);
239 *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
240}
241
242/*
243 * (Un)populated page region iterators. Iterate over (un)populated
244 * page regions between @start and @end in @chunk. @rs and @re should
245 * be integer variables and will be set to start and end page index of
246 * the current region.
247 */
248#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
249 for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
250 (rs) < (re); \
251 (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
252
253#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
254 for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
255 (rs) < (re); \
256 (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
257
215/** 258/**
216 * pcpu_mem_alloc - allocate memory 259 * pcpu_mem_alloc - allocate memory
217 * @size: bytes to allocate 260 * @size: bytes to allocate
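
pcpu_next_unpop()/pcpu_next_pop() and the two iterator macros walk chunk->populated as runs of set or clear bits, so population and depopulation can operate on whole page regions at once. A self-contained sketch of the same run-finding over a plain bitmap, with toy helpers replacing find_next_bit()/find_next_zero_bit():

#include <stdio.h>

#define NBITS 16

static int test_bit(unsigned long map, int bit)
{
	return (map >> bit) & 1UL;
}

/* Return the first index in [from, end) whose bit equals @val,
 * or end if there is none - a toy find_next_(zero_)bit(). */
static int find_next(unsigned long map, int from, int end, int val)
{
	for (int i = from; i < end; i++)
		if (test_bit(map, i) == val)
			return i;
	return end;
}

/* Walk [start, end) as runs of populated (set) bits, like
 * pcpu_for_each_pop_region(). */
static void for_each_pop_region(unsigned long map, int start, int end)
{
	int rs = start, re;

	while (rs < end) {
		rs = find_next(map, rs, end, 1);      /* first populated page  */
		re = find_next(map, rs + 1, end, 0);  /* first hole after it   */
		if (rs >= re)
			break;
		printf("populated region [%d, %d)\n", rs, re);
		rs = re + 1;
	}
}

int main(void)
{
	/* pages 1-3 and 6-7 populated: 0b11001110 */
	unsigned long populated = 0xCE;

	for_each_pop_region(populated, 0, NBITS);
	return 0;
}
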
@@ -287,16 +330,24 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
287 */ 330 */
288static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) 331static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
289{ 332{
290 void *first_start = pcpu_first_chunk->vm->addr; 333 void *first_start = pcpu_first_chunk->base_addr;
291 334
292 /* is it in the first chunk? */ 335 /* is it in the first chunk? */
293 if (addr >= first_start && addr < first_start + pcpu_chunk_size) { 336 if (addr >= first_start && addr < first_start + pcpu_unit_size) {
294 /* is it in the reserved area? */ 337 /* is it in the reserved area? */
295 if (addr < first_start + pcpu_reserved_chunk_limit) 338 if (addr < first_start + pcpu_reserved_chunk_limit)
296 return pcpu_reserved_chunk; 339 return pcpu_reserved_chunk;
297 return pcpu_first_chunk; 340 return pcpu_first_chunk;
298 } 341 }
299 342
343 /*
344 * The address is relative to unit0 which might be unused and
345 * thus unmapped. Offset the address to the unit space of the
346 * current processor before looking it up in the vmalloc
347 * space. Note that any possible cpu id can be used here, so
348 * there's no need to worry about preemption or cpu hotplug.
349 */
350 addr += pcpu_unit_offsets[raw_smp_processor_id()];
300 return pcpu_get_page_chunk(vmalloc_to_page(addr)); 351 return pcpu_get_page_chunk(vmalloc_to_page(addr));
301} 352}
302 353
@@ -545,125 +596,327 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
545} 596}
546 597
547/** 598/**
548 * pcpu_unmap - unmap pages out of a pcpu_chunk 599 * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
549 * @chunk: chunk of interest 600 * @chunk: chunk of interest
550 * @page_start: page index of the first page to unmap 601 * @bitmapp: output parameter for bitmap
551 * @page_end: page index of the last page to unmap + 1 602 * @may_alloc: may allocate the array
552 * @flush_tlb: whether to flush tlb or not
553 * 603 *
554 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. 604 * Returns pointer to array of pointers to struct page and bitmap,
555 * If @flush is true, vcache is flushed before unmapping and tlb 605 * both of which can be indexed with pcpu_page_idx(). The returned
556 * after. 606 * array is cleared to zero and *@bitmapp is copied from
607 * @chunk->populated. Note that there is only one array and bitmap
608 * and access exclusion is the caller's responsibility.
609 *
610 * CONTEXT:
611 * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
612 * Otherwise, don't care.
613 *
614 * RETURNS:
615 * Pointer to temp pages array on success, NULL on failure.
557 */ 616 */
558static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, 617static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
559 bool flush_tlb) 618 unsigned long **bitmapp,
619 bool may_alloc)
560{ 620{
561 unsigned int last = nr_cpu_ids - 1; 621 static struct page **pages;
562 unsigned int cpu; 622 static unsigned long *bitmap;
623 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
624 size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
625 sizeof(unsigned long);
626
627 if (!pages || !bitmap) {
628 if (may_alloc && !pages)
629 pages = pcpu_mem_alloc(pages_size);
630 if (may_alloc && !bitmap)
631 bitmap = pcpu_mem_alloc(bitmap_size);
632 if (!pages || !bitmap)
633 return NULL;
634 }
563 635
564 /* unmap must not be done on immutable chunk */ 636 memset(pages, 0, pages_size);
565 WARN_ON(chunk->immutable); 637 bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
566 638
567 /* 639 *bitmapp = bitmap;
568 * Each flushing trial can be very expensive, issue flush on 640 return pages;
569 * the whole region at once rather than doing it for each cpu. 641}
570 * This could be an overkill but is more scalable.
571 */
572 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
573 pcpu_chunk_addr(chunk, last, page_end));
574 642
575 for_each_possible_cpu(cpu) 643/**
576 unmap_kernel_range_noflush( 644 * pcpu_free_pages - free pages which were allocated for @chunk
577 pcpu_chunk_addr(chunk, cpu, page_start), 645 * @chunk: chunk pages were allocated for
578 (page_end - page_start) << PAGE_SHIFT); 646 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
579 647 * @populated: populated bitmap
580 /* ditto as flush_cache_vunmap() */ 648 * @page_start: page index of the first page to be freed
581 if (flush_tlb) 649 * @page_end: page index of the last page to be freed + 1
582 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), 650 *
583 pcpu_chunk_addr(chunk, last, page_end)); 651 * Free pages [@page_start and @page_end) in @pages for all units.
652 * The pages were allocated for @chunk.
653 */
654static void pcpu_free_pages(struct pcpu_chunk *chunk,
655 struct page **pages, unsigned long *populated,
656 int page_start, int page_end)
657{
658 unsigned int cpu;
659 int i;
660
661 for_each_possible_cpu(cpu) {
662 for (i = page_start; i < page_end; i++) {
663 struct page *page = pages[pcpu_page_idx(cpu, i)];
664
665 if (page)
666 __free_page(page);
667 }
668 }
584} 669}
585 670
586/** 671/**
587 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk 672 * pcpu_alloc_pages - allocates pages for @chunk
588 * @chunk: chunk to depopulate 673 * @chunk: target chunk
589 * @off: offset to the area to depopulate 674 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
590 * @size: size of the area to depopulate in bytes 675 * @populated: populated bitmap
591 * @flush: whether to flush cache and tlb or not 676 * @page_start: page index of the first page to be allocated
592 * 677 * @page_end: page index of the last page to be allocated + 1
593 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 678 *
594 * from @chunk. If @flush is true, vcache is flushed before unmapping 679 * Allocate pages [@page_start,@page_end) into @pages for all units.
595 * and tlb after. 680 * The allocation is for @chunk. Percpu core doesn't care about the
596 * 681 * content of @pages and will pass it verbatim to pcpu_map_pages().
597 * CONTEXT:
598 * pcpu_alloc_mutex.
599 */ 682 */
600static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, 683static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
601 bool flush) 684 struct page **pages, unsigned long *populated,
685 int page_start, int page_end)
602{ 686{
603 int page_start = PFN_DOWN(off); 687 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
604 int page_end = PFN_UP(off + size);
605 int unmap_start = -1;
606 int uninitialized_var(unmap_end);
607 unsigned int cpu; 688 unsigned int cpu;
608 int i; 689 int i;
609 690
610 for (i = page_start; i < page_end; i++) { 691 for_each_possible_cpu(cpu) {
611 for_each_possible_cpu(cpu) { 692 for (i = page_start; i < page_end; i++) {
612 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); 693 struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
694
695 *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
696 if (!*pagep) {
697 pcpu_free_pages(chunk, pages, populated,
698 page_start, page_end);
699 return -ENOMEM;
700 }
701 }
702 }
703 return 0;
704}
613 705
614 if (!*pagep) 706/**
615 continue; 707 * pcpu_pre_unmap_flush - flush cache prior to unmapping
708 * @chunk: chunk the regions to be flushed belongs to
709 * @page_start: page index of the first page to be flushed
710 * @page_end: page index of the last page to be flushed + 1
711 *
712 * Pages in [@page_start,@page_end) of @chunk are about to be
713 * unmapped. Flush cache. As each flushing trial can be very
714 * expensive, issue flush on the whole region at once rather than
715 * doing it for each cpu. This could be an overkill but is more
716 * scalable.
717 */
718static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
719 int page_start, int page_end)
720{
721 flush_cache_vunmap(
722 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
723 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
724}
725
726static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
727{
728 unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
729}
616 730
617 __free_page(*pagep); 731/**
732 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
733 * @chunk: chunk of interest
734 * @pages: pages array which can be used to pass information to free
735 * @populated: populated bitmap
736 * @page_start: page index of the first page to unmap
737 * @page_end: page index of the last page to unmap + 1
738 *
739 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
740 * Corresponding elements in @pages were cleared by the caller and can
741 * be used to carry information to pcpu_free_pages() which will be
742 * called after all unmaps are finished. The caller should call
743 * proper pre/post flush functions.
744 */
745static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
746 struct page **pages, unsigned long *populated,
747 int page_start, int page_end)
748{
749 unsigned int cpu;
750 int i;
618 751
619 /* 752 for_each_possible_cpu(cpu) {
620 * If it's partial depopulation, it might get 753 for (i = page_start; i < page_end; i++) {
621 * populated or depopulated again. Mark the 754 struct page *page;
622 * page gone.
623 */
624 *pagep = NULL;
625 755
626 unmap_start = unmap_start < 0 ? i : unmap_start; 756 page = pcpu_chunk_page(chunk, cpu, i);
627 unmap_end = i + 1; 757 WARN_ON(!page);
758 pages[pcpu_page_idx(cpu, i)] = page;
628 } 759 }
760 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
761 page_end - page_start);
629 } 762 }
630 763
631 if (unmap_start >= 0) 764 for (i = page_start; i < page_end; i++)
632 pcpu_unmap(chunk, unmap_start, unmap_end, flush); 765 __clear_bit(i, populated);
766}
767
768/**
769 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
770 * @chunk: pcpu_chunk the regions to be flushed belong to
771 * @page_start: page index of the first page to be flushed
772 * @page_end: page index of the last page to be flushed + 1
773 *
774 * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
775 * TLB for the regions. This can be skipped if the area is to be
776 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
777 *
778 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
779 * for the whole region.
780 */
781static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
782 int page_start, int page_end)
783{
784 flush_tlb_kernel_range(
785 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
786 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
787}
788
789static int __pcpu_map_pages(unsigned long addr, struct page **pages,
790 int nr_pages)
791{
792 return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
793 PAGE_KERNEL, pages);
633} 794}
634 795
635/** 796/**
636 * pcpu_map - map pages into a pcpu_chunk 797 * pcpu_map_pages - map pages into a pcpu_chunk
637 * @chunk: chunk of interest 798 * @chunk: chunk of interest
799 * @pages: pages array containing pages to be mapped
800 * @populated: populated bitmap
638 * @page_start: page index of the first page to map 801 * @page_start: page index of the first page to map
639 * @page_end: page index of the last page to map + 1 802 * @page_end: page index of the last page to map + 1
640 * 803 *
641 * For each cpu, map pages [@page_start,@page_end) into @chunk. 804 * For each cpu, map pages [@page_start,@page_end) into @chunk. The
642 * vcache is flushed afterwards. 805 * caller is responsible for calling pcpu_post_map_flush() after all
806 * mappings are complete.
807 *
808 * This function is responsible for setting corresponding bits in
809 * @chunk->populated bitmap and whatever is necessary for reverse
810 * lookup (addr -> chunk).
643 */ 811 */
644static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) 812static int pcpu_map_pages(struct pcpu_chunk *chunk,
813 struct page **pages, unsigned long *populated,
814 int page_start, int page_end)
645{ 815{
646 unsigned int last = nr_cpu_ids - 1; 816 unsigned int cpu, tcpu;
647 unsigned int cpu; 817 int i, err;
648 int err;
649
650 /* map must not be done on immutable chunk */
651 WARN_ON(chunk->immutable);
652 818
653 for_each_possible_cpu(cpu) { 819 for_each_possible_cpu(cpu) {
654 err = map_kernel_range_noflush( 820 err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
655 pcpu_chunk_addr(chunk, cpu, page_start), 821 &pages[pcpu_page_idx(cpu, page_start)],
656 (page_end - page_start) << PAGE_SHIFT, 822 page_end - page_start);
657 PAGE_KERNEL,
658 pcpu_chunk_pagep(chunk, cpu, page_start));
659 if (err < 0) 823 if (err < 0)
660 return err; 824 goto err;
825 }
826
827 /* mapping successful, link chunk and mark populated */
828 for (i = page_start; i < page_end; i++) {
829 for_each_possible_cpu(cpu)
830 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
831 chunk);
832 __set_bit(i, populated);
661 } 833 }
662 834
663 /* flush at once, please read comments in pcpu_unmap() */
664 flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
665 pcpu_chunk_addr(chunk, last, page_end));
666 return 0; 835 return 0;
836
837err:
838 for_each_possible_cpu(tcpu) {
839 if (tcpu == cpu)
840 break;
841 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
842 page_end - page_start);
843 }
844 return err;
845}
846
847/**
848 * pcpu_post_map_flush - flush cache after mapping
849 * @chunk: pcpu_chunk the regions to be flushed belong to
850 * @page_start: page index of the first page to be flushed
851 * @page_end: page index of the last page to be flushed + 1
852 *
853 * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
854 * cache.
855 *
856 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
857 * for the whole region.
858 */
859static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
860 int page_start, int page_end)
861{
862 flush_cache_vmap(
863 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
864 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
865}
866
867/**
868 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
869 * @chunk: chunk to depopulate
870 * @off: offset to the area to depopulate
871 * @size: size of the area to depopulate in bytes
872 * @flush: whether to flush cache and tlb or not
873 *
874 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
875 * from @chunk. If @flush is true, vcache is flushed before unmapping
876 * and tlb after.
877 *
878 * CONTEXT:
879 * pcpu_alloc_mutex.
880 */
881static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
882{
883 int page_start = PFN_DOWN(off);
884 int page_end = PFN_UP(off + size);
885 struct page **pages;
886 unsigned long *populated;
887 int rs, re;
888
889 /* quick path, check whether it's empty already */
890 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
891 if (rs == page_start && re == page_end)
892 return;
893 break;
894 }
895
896 /* immutable chunks can't be depopulated */
897 WARN_ON(chunk->immutable);
898
899 /*
900 * If control reaches here, there must have been at least one
901 * successful population attempt so the temp pages array must
902 * be available now.
903 */
904 pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
905 BUG_ON(!pages);
906
907 /* unmap and free */
908 pcpu_pre_unmap_flush(chunk, page_start, page_end);
909
910 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
911 pcpu_unmap_pages(chunk, pages, populated, rs, re);
912
913 /* no need to flush tlb, vmalloc will handle it lazily */
914
915 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
916 pcpu_free_pages(chunk, pages, populated, rs, re);
917
918 /* commit new bitmap */
919 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
667} 920}
668 921
669/** 922/**
@@ -680,58 +933,68 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
680 */ 933 */
681static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 934static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
682{ 935{
683 const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
684 int page_start = PFN_DOWN(off); 936 int page_start = PFN_DOWN(off);
685 int page_end = PFN_UP(off + size); 937 int page_end = PFN_UP(off + size);
686 int map_start = -1; 938 int free_end = page_start, unmap_end = page_start;
687 int uninitialized_var(map_end); 939 struct page **pages;
940 unsigned long *populated;
688 unsigned int cpu; 941 unsigned int cpu;
689 int i; 942 int rs, re, rc;
690 943
691 for (i = page_start; i < page_end; i++) { 944 /* quick path, check whether all pages are already there */
692 if (pcpu_chunk_page_occupied(chunk, i)) { 945 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
693 if (map_start >= 0) { 946 if (rs == page_start && re == page_end)
694 if (pcpu_map(chunk, map_start, map_end)) 947 goto clear;
695 goto err; 948 break;
696 map_start = -1; 949 }
697 }
698 continue;
699 }
700 950
701 map_start = map_start < 0 ? i : map_start; 951 /* need to allocate and map pages, this chunk can't be immutable */
702 map_end = i + 1; 952 WARN_ON(chunk->immutable);
703 953
704 for_each_possible_cpu(cpu) { 954 pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
705 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); 955 if (!pages)
956 return -ENOMEM;
706 957
707 *pagep = alloc_pages_node(cpu_to_node(cpu), 958 /* alloc and map */
708 alloc_mask, 0); 959 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
709 if (!*pagep) 960 rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
710 goto err; 961 if (rc)
711 pcpu_set_page_chunk(*pagep, chunk); 962 goto err_free;
712 } 963 free_end = re;
713 } 964 }
714 965
715 if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) 966 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
716 goto err; 967 rc = pcpu_map_pages(chunk, pages, populated, rs, re);
968 if (rc)
969 goto err_unmap;
970 unmap_end = re;
971 }
972 pcpu_post_map_flush(chunk, page_start, page_end);
717 973
974 /* commit new bitmap */
975 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
976clear:
718 for_each_possible_cpu(cpu) 977 for_each_possible_cpu(cpu)
719 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, 978 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
720 size);
721
722 return 0; 979 return 0;
723err: 980
724 /* likely under heavy memory pressure, give memory back */ 981err_unmap:
725 pcpu_depopulate_chunk(chunk, off, size, true); 982 pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
726 return -ENOMEM; 983 pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
984 pcpu_unmap_pages(chunk, pages, populated, rs, re);
985 pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
986err_free:
987 pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
988 pcpu_free_pages(chunk, pages, populated, rs, re);
989 return rc;
727} 990}
728 991
729static void free_pcpu_chunk(struct pcpu_chunk *chunk) 992static void free_pcpu_chunk(struct pcpu_chunk *chunk)
730{ 993{
731 if (!chunk) 994 if (!chunk)
732 return; 995 return;
733 if (chunk->vm) 996 if (chunk->vms)
734 free_vm_area(chunk->vm); 997 pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups);
735 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); 998 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
736 kfree(chunk); 999 kfree(chunk);
737} 1000}
@@ -747,10 +1010,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
747 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); 1010 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
748 chunk->map_alloc = PCPU_DFL_MAP_ALLOC; 1011 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
749 chunk->map[chunk->map_used++] = pcpu_unit_size; 1012 chunk->map[chunk->map_used++] = pcpu_unit_size;
750 chunk->page = chunk->page_ar;
751 1013
752 chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC); 1014 chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
753 if (!chunk->vm) { 1015 pcpu_nr_groups, pcpu_atom_size,
1016 GFP_KERNEL);
1017 if (!chunk->vms) {
754 free_pcpu_chunk(chunk); 1018 free_pcpu_chunk(chunk);
755 return NULL; 1019 return NULL;
756 } 1020 }
@@ -758,6 +1022,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
758 INIT_LIST_HEAD(&chunk->list); 1022 INIT_LIST_HEAD(&chunk->list);
759 chunk->free_size = pcpu_unit_size; 1023 chunk->free_size = pcpu_unit_size;
760 chunk->contig_hint = pcpu_unit_size; 1024 chunk->contig_hint = pcpu_unit_size;
1025 chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0];
761 1026
762 return chunk; 1027 return chunk;
763} 1028}
@@ -847,7 +1112,8 @@ area_found:
847 1112
848 mutex_unlock(&pcpu_alloc_mutex); 1113 mutex_unlock(&pcpu_alloc_mutex);
849 1114
850 return __addr_to_pcpu_ptr(chunk->vm->addr + off); 1115 /* return address relative to base address */
1116 return __addr_to_pcpu_ptr(chunk->base_addr + off);
851 1117
852fail_unlock: 1118fail_unlock:
853 spin_unlock_irq(&pcpu_lock); 1119 spin_unlock_irq(&pcpu_lock);
@@ -925,12 +1191,13 @@ static void pcpu_reclaim(struct work_struct *work)
925 } 1191 }
926 1192
927 spin_unlock_irq(&pcpu_lock); 1193 spin_unlock_irq(&pcpu_lock);
928 mutex_unlock(&pcpu_alloc_mutex);
929 1194
930 list_for_each_entry_safe(chunk, next, &todo, list) { 1195 list_for_each_entry_safe(chunk, next, &todo, list) {
931 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); 1196 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
932 free_pcpu_chunk(chunk); 1197 free_pcpu_chunk(chunk);
933 } 1198 }
1199
1200 mutex_unlock(&pcpu_alloc_mutex);
934} 1201}
935 1202
936/** 1203/**
@@ -955,7 +1222,7 @@ void free_percpu(void *ptr)
955 spin_lock_irqsave(&pcpu_lock, flags); 1222 spin_lock_irqsave(&pcpu_lock, flags);
956 1223
957 chunk = pcpu_chunk_addr_search(addr); 1224 chunk = pcpu_chunk_addr_search(addr);
958 off = addr - chunk->vm->addr; 1225 off = addr - chunk->base_addr;
959 1226
960 pcpu_free_area(chunk, off); 1227 pcpu_free_area(chunk, off);
961 1228
@@ -974,30 +1241,295 @@ void free_percpu(void *ptr)
974} 1241}
975EXPORT_SYMBOL_GPL(free_percpu); 1242EXPORT_SYMBOL_GPL(free_percpu);
976 1243
1244static inline size_t pcpu_calc_fc_sizes(size_t static_size,
1245 size_t reserved_size,
1246 ssize_t *dyn_sizep)
1247{
1248 size_t size_sum;
1249
1250 size_sum = PFN_ALIGN(static_size + reserved_size +
1251 (*dyn_sizep >= 0 ? *dyn_sizep : 0));
1252 if (*dyn_sizep != 0)
1253 *dyn_sizep = size_sum - static_size - reserved_size;
1254
1255 return size_sum;
1256}
1257
977/** 1258/**
978 * pcpu_setup_first_chunk - initialize the first percpu chunk 1259 * pcpu_alloc_alloc_info - allocate percpu allocation info
979 * @get_page_fn: callback to fetch page pointer 1260 * @nr_groups: the number of groups
980 * @static_size: the size of static percpu area in bytes 1261 * @nr_units: the number of units
1262 *
1263 * Allocate ai which is large enough for @nr_groups groups containing
1264 * @nr_units units. The returned ai's groups[0].cpu_map points to the
1265 * cpu_map array which is long enough for @nr_units and filled with
1266 * NR_CPUS. It's the caller's responsibility to initialize cpu_map
1267 * pointer of other groups.
1268 *
1269 * RETURNS:
1270 * Pointer to the allocated pcpu_alloc_info on success, NULL on
1271 * failure.
1272 */
1273struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
1274 int nr_units)
1275{
1276 struct pcpu_alloc_info *ai;
1277 size_t base_size, ai_size;
1278 void *ptr;
1279 int unit;
1280
1281 base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
1282 __alignof__(ai->groups[0].cpu_map[0]));
1283 ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
1284
1285 ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
1286 if (!ptr)
1287 return NULL;
1288 ai = ptr;
1289 ptr += base_size;
1290
1291 ai->groups[0].cpu_map = ptr;
1292
1293 for (unit = 0; unit < nr_units; unit++)
1294 ai->groups[0].cpu_map[unit] = NR_CPUS;
1295
1296 ai->nr_groups = nr_groups;
1297 ai->__ai_size = PFN_ALIGN(ai_size);
1298
1299 return ai;
1300}
1301
1302/**
1303 * pcpu_free_alloc_info - free percpu allocation info
1304 * @ai: pcpu_alloc_info to free
1305 *
1306 * Free @ai which was allocated by pcpu_alloc_alloc_info().
1307 */
1308void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1309{
1310 free_bootmem(__pa(ai), ai->__ai_size);
1311}
1312
1313/**
1314 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
981 * @reserved_size: the size of reserved percpu area in bytes 1315 * @reserved_size: the size of reserved percpu area in bytes
982 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1316 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
983 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto 1317 * @atom_size: allocation atom size
984 * @base_addr: mapped address, NULL for auto 1318 * @cpu_distance_fn: callback to determine distance between cpus, optional
985 * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary 1319 *
1320 * This function determines grouping of units, their mappings to cpus
1321 * and other parameters considering needed percpu size, allocation
1322 * atom size and distances between CPUs.
1323 *
 1324 * Groups are always multiples of atom size and CPUs which are of
1325 * LOCAL_DISTANCE both ways are grouped together and share space for
1326 * units in the same group. The returned configuration is guaranteed
 1327 * to have CPUs on different nodes in different groups and >=75% usage
1328 * of allocated virtual address space.
1329 *
1330 * RETURNS:
1331 * On success, pointer to the new allocation_info is returned. On
1332 * failure, ERR_PTR value is returned.
1333 */
1334struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1335 size_t reserved_size, ssize_t dyn_size,
1336 size_t atom_size,
1337 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1338{
1339 static int group_map[NR_CPUS] __initdata;
1340 static int group_cnt[NR_CPUS] __initdata;
1341 const size_t static_size = __per_cpu_end - __per_cpu_start;
1342 int group_cnt_max = 0, nr_groups = 1, nr_units = 0;
1343 size_t size_sum, min_unit_size, alloc_size;
1344 int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
1345 int last_allocs, group, unit;
1346 unsigned int cpu, tcpu;
1347 struct pcpu_alloc_info *ai;
1348 unsigned int *cpu_map;
1349
1350 /*
1351 * Determine min_unit_size, alloc_size and max_upa such that
1352 * alloc_size is multiple of atom_size and is the smallest
 1353 * which can accommodate 4k aligned segments which are equal to
1354 * or larger than min_unit_size.
1355 */
1356 size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
1357 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1358
1359 alloc_size = roundup(min_unit_size, atom_size);
1360 upa = alloc_size / min_unit_size;
1361 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1362 upa--;
1363 max_upa = upa;
1364
1365 /* group cpus according to their proximity */
1366 for_each_possible_cpu(cpu) {
1367 group = 0;
1368 next_group:
1369 for_each_possible_cpu(tcpu) {
1370 if (cpu == tcpu)
1371 break;
1372 if (group_map[tcpu] == group && cpu_distance_fn &&
1373 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1374 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1375 group++;
1376 nr_groups = max(nr_groups, group + 1);
1377 goto next_group;
1378 }
1379 }
1380 group_map[cpu] = group;
1381 group_cnt[group]++;
1382 group_cnt_max = max(group_cnt_max, group_cnt[group]);
1383 }
1384
1385 /*
1386 * Expand unit size until address space usage goes over 75%
1387 * and then as much as possible without using more address
1388 * space.
1389 */
1390 last_allocs = INT_MAX;
1391 for (upa = max_upa; upa; upa--) {
1392 int allocs = 0, wasted = 0;
1393
1394 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1395 continue;
1396
1397 for (group = 0; group < nr_groups; group++) {
1398 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1399 allocs += this_allocs;
1400 wasted += this_allocs * upa - group_cnt[group];
1401 }
1402
1403 /*
1404 * Don't accept if wastage is over 25%. The
1405 * greater-than comparison ensures upa==1 always
1406 * passes the following check.
1407 */
1408 if (wasted > num_possible_cpus() / 3)
1409 continue;
1410
1411 /* and then don't consume more memory */
1412 if (allocs > last_allocs)
1413 break;
1414 last_allocs = allocs;
1415 best_upa = upa;
1416 }
1417 upa = best_upa;
1418
1419 /* allocate and fill alloc_info */
1420 for (group = 0; group < nr_groups; group++)
1421 nr_units += roundup(group_cnt[group], upa);
1422
1423 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1424 if (!ai)
1425 return ERR_PTR(-ENOMEM);
1426 cpu_map = ai->groups[0].cpu_map;
1427
1428 for (group = 0; group < nr_groups; group++) {
1429 ai->groups[group].cpu_map = cpu_map;
1430 cpu_map += roundup(group_cnt[group], upa);
1431 }
1432
1433 ai->static_size = static_size;
1434 ai->reserved_size = reserved_size;
1435 ai->dyn_size = dyn_size;
1436 ai->unit_size = alloc_size / upa;
1437 ai->atom_size = atom_size;
1438 ai->alloc_size = alloc_size;
1439
1440 for (group = 0, unit = 0; group_cnt[group]; group++) {
1441 struct pcpu_group_info *gi = &ai->groups[group];
1442
1443 /*
1444 * Initialize base_offset as if all groups are located
1445 * back-to-back. The caller should update this to
1446 * reflect actual allocation.
1447 */
1448 gi->base_offset = unit * ai->unit_size;
1449
1450 for_each_possible_cpu(cpu)
1451 if (group_map[cpu] == group)
1452 gi->cpu_map[gi->nr_units++] = cpu;
1453 gi->nr_units = roundup(gi->nr_units, upa);
1454 unit += gi->nr_units;
1455 }
1456 BUG_ON(unit != nr_units);
1457
1458 return ai;
1459}
1460
1461/**
1462 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
1463 * @lvl: loglevel
1464 * @ai: allocation info to dump
1465 *
1466 * Print out information about @ai using loglevel @lvl.
1467 */
1468static void pcpu_dump_alloc_info(const char *lvl,
1469 const struct pcpu_alloc_info *ai)
1470{
1471 int group_width = 1, cpu_width = 1, width;
1472 char empty_str[] = "--------";
1473 int alloc = 0, alloc_end = 0;
1474 int group, v;
1475 int upa, apl; /* units per alloc, allocs per line */
1476
1477 v = ai->nr_groups;
1478 while (v /= 10)
1479 group_width++;
1480
1481 v = num_possible_cpus();
1482 while (v /= 10)
1483 cpu_width++;
1484 empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
1485
1486 upa = ai->alloc_size / ai->unit_size;
1487 width = upa * (cpu_width + 1) + group_width + 3;
1488 apl = rounddown_pow_of_two(max(60 / width, 1));
1489
1490 printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
1491 lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
1492 ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
1493
1494 for (group = 0; group < ai->nr_groups; group++) {
1495 const struct pcpu_group_info *gi = &ai->groups[group];
1496 int unit = 0, unit_end = 0;
1497
1498 BUG_ON(gi->nr_units % upa);
1499 for (alloc_end += gi->nr_units / upa;
1500 alloc < alloc_end; alloc++) {
1501 if (!(alloc % apl)) {
1502 printk("\n");
1503 printk("%spcpu-alloc: ", lvl);
1504 }
1505 printk("[%0*d] ", group_width, group);
1506
1507 for (unit_end += upa; unit < unit_end; unit++)
1508 if (gi->cpu_map[unit] != NR_CPUS)
1509 printk("%0*d ", cpu_width,
1510 gi->cpu_map[unit]);
1511 else
1512 printk("%s ", empty_str);
1513 }
1514 }
1515 printk("\n");
1516}
1517
1518/**
1519 * pcpu_setup_first_chunk - initialize the first percpu chunk
 1520 * @ai: pcpu_alloc_info describing how the percpu area is shaped
1521 * @base_addr: mapped address
986 * 1522 *
987 * Initialize the first percpu chunk which contains the kernel static 1523 * Initialize the first percpu chunk which contains the kernel static
 988 * percpu area. This function is to be called from arch percpu area 1524 * percpu area. This function is to be called from arch percpu area
989 * setup path. The first two parameters are mandatory. The rest are 1525 * setup path.
990 * optional. 1526 *
991 * 1527 * @ai contains all information necessary to initialize the first
992 * @get_page_fn() should return pointer to percpu page given cpu 1528 * chunk and prime the dynamic percpu allocator.
993 * number and page number. It should at least return enough pages to 1529 *
994 * cover the static area. The returned pages for static area should 1530 * @ai->static_size is the size of static percpu area.
995 * have been initialized with valid data. If @unit_size is specified, 1531 *
996 * it can also return pages after the static area. NULL return 1532 * @ai->reserved_size, if non-zero, specifies the amount of bytes to
997 * indicates end of pages for the cpu. Note that @get_page_fn() must
998 * return the same number of pages for all cpus.
999 *
1000 * @reserved_size, if non-zero, specifies the amount of bytes to
1001 * reserve after the static area in the first chunk. This reserves 1533 * reserve after the static area in the first chunk. This reserves
1002 * the first chunk such that it's available only through reserved 1534 * the first chunk such that it's available only through reserved
1003 * percpu allocation. This is primarily used to serve module percpu 1535 * percpu allocation. This is primarily used to serve module percpu
@@ -1005,22 +1537,29 @@ EXPORT_SYMBOL_GPL(free_percpu);
1005 * limited offset range for symbol relocations to guarantee module 1537 * limited offset range for symbol relocations to guarantee module
1006 * percpu symbols fall inside the relocatable range. 1538 * percpu symbols fall inside the relocatable range.
1007 * 1539 *
1008 * @dyn_size, if non-negative, determines the number of bytes 1540 * @ai->dyn_size determines the number of bytes available for dynamic
1009 * available for dynamic allocation in the first chunk. Specifying 1541 * allocation in the first chunk. The area between @ai->static_size +
1010 * non-negative value makes percpu leave alone the area beyond 1542 * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
1011 * @static_size + @reserved_size + @dyn_size.
1012 * 1543 *
1013 * @unit_size, if non-negative, specifies unit size and must be 1544 * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
1014 * aligned to PAGE_SIZE and equal to or larger than @static_size + 1545 * and equal to or larger than @ai->static_size + @ai->reserved_size +
1015 * @reserved_size + if non-negative, @dyn_size. 1546 * @ai->dyn_size.
1016 * 1547 *
1017 * Non-null @base_addr means that the caller already allocated virtual 1548 * @ai->atom_size is the allocation atom size and used as alignment
1018 * region for the first chunk and mapped it. percpu must not mess 1549 * for vm areas.
1019 * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL
1020 * @populate_pte_fn doesn't make any sense.
1021 * 1550 *
 1022 * @populate_pte_fn is used to populate the pagetable. NULL means the 1551 * @ai->alloc_size is the allocation size and always a multiple of
1023 * caller already populated the pagetable. 1552 * @ai->atom_size. This is larger than @ai->atom_size if
1553 * @ai->unit_size is larger than @ai->atom_size.
1554 *
1555 * @ai->nr_groups and @ai->groups describe virtual memory layout of
1556 * percpu areas. Units which should be colocated are put into the
1557 * same group. Dynamic VM areas will be allocated according to these
1558 * groupings. If @ai->nr_groups is zero, a single group containing
1559 * all units is assumed.
1560 *
1561 * The caller should have mapped the first chunk at @base_addr and
1562 * copied static data to each unit.
1024 * 1563 *
1025 * If the first chunk ends up with both reserved and dynamic areas, it 1564 * If the first chunk ends up with both reserved and dynamic areas, it
1026 * is served by two chunks - one to serve the core static and reserved 1565 * is served by two chunks - one to serve the core static and reserved
@@ -1030,49 +1569,83 @@ EXPORT_SYMBOL_GPL(free_percpu);
1030 * and available for dynamic allocation like any other chunks. 1569 * and available for dynamic allocation like any other chunks.
1031 * 1570 *
1032 * RETURNS: 1571 * RETURNS:
1033 * The determined pcpu_unit_size which can be used to initialize 1572 * 0 on success, -errno on failure.
1034 * percpu access.
1035 */ 1573 */
1036size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, 1574int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1037 size_t static_size, size_t reserved_size, 1575 void *base_addr)
1038 ssize_t dyn_size, ssize_t unit_size,
1039 void *base_addr,
1040 pcpu_populate_pte_fn_t populate_pte_fn)
1041{ 1576{
1042 static struct vm_struct first_vm;
1043 static int smap[2], dmap[2]; 1577 static int smap[2], dmap[2];
1044 size_t size_sum = static_size + reserved_size + 1578 size_t dyn_size = ai->dyn_size;
1045 (dyn_size >= 0 ? dyn_size : 0); 1579 size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
1046 struct pcpu_chunk *schunk, *dchunk = NULL; 1580 struct pcpu_chunk *schunk, *dchunk = NULL;
1581 unsigned long *group_offsets;
1582 size_t *group_sizes;
1583 unsigned long *unit_off;
1047 unsigned int cpu; 1584 unsigned int cpu;
1048 int nr_pages; 1585 int *unit_map;
1049 int err, i; 1586 int group, unit, i;
1050 1587
1051 /* santiy checks */ 1588 /* sanity checks */
1052 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || 1589 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
1053 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); 1590 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
1054 BUG_ON(!static_size); 1591 BUG_ON(ai->nr_groups <= 0);
1055 if (unit_size >= 0) { 1592 BUG_ON(!ai->static_size);
1056 BUG_ON(unit_size < size_sum); 1593 BUG_ON(!base_addr);
1057 BUG_ON(unit_size & ~PAGE_MASK); 1594 BUG_ON(ai->unit_size < size_sum);
1058 BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); 1595 BUG_ON(ai->unit_size & ~PAGE_MASK);
1059 } else 1596 BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
1060 BUG_ON(base_addr); 1597
1061 BUG_ON(base_addr && populate_pte_fn); 1598 pcpu_dump_alloc_info(KERN_DEBUG, ai);
1062 1599
1063 if (unit_size >= 0) 1600 /* process group information and build config tables accordingly */
1064 pcpu_unit_pages = unit_size >> PAGE_SHIFT; 1601 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
1065 else 1602 group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));
1066 pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, 1603 unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
1067 PFN_UP(size_sum)); 1604 unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
1605
1606 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
1607 unit_map[cpu] = NR_CPUS;
1608 pcpu_first_unit_cpu = NR_CPUS;
1609
1610 for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
1611 const struct pcpu_group_info *gi = &ai->groups[group];
1612
1613 group_offsets[group] = gi->base_offset;
1614 group_sizes[group] = gi->nr_units * ai->unit_size;
1615
1616 for (i = 0; i < gi->nr_units; i++) {
1617 cpu = gi->cpu_map[i];
1618 if (cpu == NR_CPUS)
1619 continue;
1068 1620
1069 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; 1621 BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu));
1070 pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size; 1622 BUG_ON(unit_map[cpu] != NR_CPUS);
1071 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
1072 + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *);
1073 1623
1074 if (dyn_size < 0) 1624 unit_map[cpu] = unit + i;
1075 dyn_size = pcpu_unit_size - static_size - reserved_size; 1625 unit_off[cpu] = gi->base_offset + i * ai->unit_size;
1626
1627 if (pcpu_first_unit_cpu == NR_CPUS)
1628 pcpu_first_unit_cpu = cpu;
1629 }
1630 }
1631 pcpu_last_unit_cpu = cpu;
1632 pcpu_nr_units = unit;
1633
1634 for_each_possible_cpu(cpu)
1635 BUG_ON(unit_map[cpu] == NR_CPUS);
1636
1637 pcpu_nr_groups = ai->nr_groups;
1638 pcpu_group_offsets = group_offsets;
1639 pcpu_group_sizes = group_sizes;
1640 pcpu_unit_map = unit_map;
1641 pcpu_unit_offsets = unit_off;
1642
1643 /* determine basic parameters */
1644 pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
1645 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1646 pcpu_atom_size = ai->atom_size;
1647 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
1648 BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
1076 1649
1077 /* 1650 /*
1078 * Allocate chunk slots. The additional last slot is for 1651 * Allocate chunk slots. The additional last slot is for
@@ -1092,189 +1665,351 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1092 */ 1665 */
1093 schunk = alloc_bootmem(pcpu_chunk_struct_size); 1666 schunk = alloc_bootmem(pcpu_chunk_struct_size);
1094 INIT_LIST_HEAD(&schunk->list); 1667 INIT_LIST_HEAD(&schunk->list);
1095 schunk->vm = &first_vm; 1668 schunk->base_addr = base_addr;
1096 schunk->map = smap; 1669 schunk->map = smap;
1097 schunk->map_alloc = ARRAY_SIZE(smap); 1670 schunk->map_alloc = ARRAY_SIZE(smap);
1098 schunk->page = schunk->page_ar; 1671 schunk->immutable = true;
1672 bitmap_fill(schunk->populated, pcpu_unit_pages);
1099 1673
1100 if (reserved_size) { 1674 if (ai->reserved_size) {
1101 schunk->free_size = reserved_size; 1675 schunk->free_size = ai->reserved_size;
1102 pcpu_reserved_chunk = schunk; 1676 pcpu_reserved_chunk = schunk;
1103 pcpu_reserved_chunk_limit = static_size + reserved_size; 1677 pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
1104 } else { 1678 } else {
1105 schunk->free_size = dyn_size; 1679 schunk->free_size = dyn_size;
1106 dyn_size = 0; /* dynamic area covered */ 1680 dyn_size = 0; /* dynamic area covered */
1107 } 1681 }
1108 schunk->contig_hint = schunk->free_size; 1682 schunk->contig_hint = schunk->free_size;
1109 1683
1110 schunk->map[schunk->map_used++] = -static_size; 1684 schunk->map[schunk->map_used++] = -ai->static_size;
1111 if (schunk->free_size) 1685 if (schunk->free_size)
1112 schunk->map[schunk->map_used++] = schunk->free_size; 1686 schunk->map[schunk->map_used++] = schunk->free_size;
1113 1687
1114 /* init dynamic chunk if necessary */ 1688 /* init dynamic chunk if necessary */
1115 if (dyn_size) { 1689 if (dyn_size) {
1116 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); 1690 dchunk = alloc_bootmem(pcpu_chunk_struct_size);
1117 INIT_LIST_HEAD(&dchunk->list); 1691 INIT_LIST_HEAD(&dchunk->list);
1118 dchunk->vm = &first_vm; 1692 dchunk->base_addr = base_addr;
1119 dchunk->map = dmap; 1693 dchunk->map = dmap;
1120 dchunk->map_alloc = ARRAY_SIZE(dmap); 1694 dchunk->map_alloc = ARRAY_SIZE(dmap);
1121 dchunk->page = schunk->page_ar; /* share page map with schunk */ 1695 dchunk->immutable = true;
1696 bitmap_fill(dchunk->populated, pcpu_unit_pages);
1122 1697
1123 dchunk->contig_hint = dchunk->free_size = dyn_size; 1698 dchunk->contig_hint = dchunk->free_size = dyn_size;
1124 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; 1699 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
1125 dchunk->map[dchunk->map_used++] = dchunk->free_size; 1700 dchunk->map[dchunk->map_used++] = dchunk->free_size;
1126 } 1701 }
1127 1702
1128 /* allocate vm address */
1129 first_vm.flags = VM_ALLOC;
1130 first_vm.size = pcpu_chunk_size;
1131
1132 if (!base_addr)
1133 vm_area_register_early(&first_vm, PAGE_SIZE);
1134 else {
1135 /*
1136 * Pages already mapped. No need to remap into
1137 * vmalloc area. In this case the first chunks can't
1138 * be mapped or unmapped by percpu and are marked
1139 * immutable.
1140 */
1141 first_vm.addr = base_addr;
1142 schunk->immutable = true;
1143 if (dchunk)
1144 dchunk->immutable = true;
1145 }
1146
1147 /* assign pages */
1148 nr_pages = -1;
1149 for_each_possible_cpu(cpu) {
1150 for (i = 0; i < pcpu_unit_pages; i++) {
1151 struct page *page = get_page_fn(cpu, i);
1152
1153 if (!page)
1154 break;
1155 *pcpu_chunk_pagep(schunk, cpu, i) = page;
1156 }
1157
1158 BUG_ON(i < PFN_UP(static_size));
1159
1160 if (nr_pages < 0)
1161 nr_pages = i;
1162 else
1163 BUG_ON(nr_pages != i);
1164 }
1165
1166 /* map them */
1167 if (populate_pte_fn) {
1168 for_each_possible_cpu(cpu)
1169 for (i = 0; i < nr_pages; i++)
1170 populate_pte_fn(pcpu_chunk_addr(schunk,
1171 cpu, i));
1172
1173 err = pcpu_map(schunk, 0, nr_pages);
1174 if (err)
1175 panic("failed to setup static percpu area, err=%d\n",
1176 err);
1177 }
1178
1179 /* link the first chunk in */ 1703 /* link the first chunk in */
1180 pcpu_first_chunk = dchunk ?: schunk; 1704 pcpu_first_chunk = dchunk ?: schunk;
1181 pcpu_chunk_relocate(pcpu_first_chunk, -1); 1705 pcpu_chunk_relocate(pcpu_first_chunk, -1);
1182 1706
1183 /* we're done */ 1707 /* we're done */
1184 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); 1708 pcpu_base_addr = base_addr;
1185 return pcpu_unit_size; 1709 return 0;
1186} 1710}
1187 1711
1188/* 1712const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
1189 * Embedding first chunk setup helper. 1713 [PCPU_FC_AUTO] = "auto",
1190 */ 1714 [PCPU_FC_EMBED] = "embed",
1191static void *pcpue_ptr __initdata; 1715 [PCPU_FC_PAGE] = "page",
1192static size_t pcpue_size __initdata; 1716};
1193static size_t pcpue_unit_size __initdata;
1194 1717
1195static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) 1718enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
1196{
1197 size_t off = (size_t)pageno << PAGE_SHIFT;
1198 1719
1199 if (off >= pcpue_size) 1720static int __init percpu_alloc_setup(char *str)
1200 return NULL; 1721{
1722 if (0)
1723 /* nada */;
1724#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
1725 else if (!strcmp(str, "embed"))
1726 pcpu_chosen_fc = PCPU_FC_EMBED;
1727#endif
1728#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1729 else if (!strcmp(str, "page"))
1730 pcpu_chosen_fc = PCPU_FC_PAGE;
1731#endif
1732 else
1733 pr_warning("PERCPU: unknown allocator %s specified\n", str);
1201 1734
1202 return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); 1735 return 0;
1203} 1736}
1737early_param("percpu_alloc", percpu_alloc_setup);
1204 1738
1739#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
1740 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
1205/** 1741/**
1206 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem 1742 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
1207 * @static_size: the size of static percpu area in bytes
1208 * @reserved_size: the size of reserved percpu area in bytes 1743 * @reserved_size: the size of reserved percpu area in bytes
1209 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1744 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
1210 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto 1745 * @atom_size: allocation atom size
1746 * @cpu_distance_fn: callback to determine distance between cpus, optional
1747 * @alloc_fn: function to allocate percpu page
1748 * @free_fn: funtion to free percpu page
1211 * 1749 *
1212 * This is a helper to ease setting up embedded first percpu chunk and 1750 * This is a helper to ease setting up embedded first percpu chunk and
1213 * can be called where pcpu_setup_first_chunk() is expected. 1751 * can be called where pcpu_setup_first_chunk() is expected.
1214 * 1752 *
1215 * If this function is used to setup the first chunk, it is allocated 1753 * If this function is used to setup the first chunk, it is allocated
1216 * as a contiguous area using bootmem allocator and used as-is without 1754 * by calling @alloc_fn and used as-is without being mapped into
1217 * being mapped into vmalloc area. This enables the first chunk to 1755 * vmalloc area. Allocations are always whole multiples of @atom_size
1218 * piggy back on the linear physical mapping which often uses larger 1756 * aligned to @atom_size.
1219 * page size. 1757 *
1758 * This enables the first chunk to piggy back on the linear physical
1759 * mapping which often uses larger page size. Please note that this
1760 * can result in very sparse cpu->unit mapping on NUMA machines thus
1761 * requiring large vmalloc address space. Don't use this allocator if
1762 * vmalloc space is not orders of magnitude larger than distances
 1763 * between node memory addresses (i.e. 32bit NUMA machines).
1220 * 1764 *
1221 * When @dyn_size is positive, dynamic area might be larger than 1765 * When @dyn_size is positive, dynamic area might be larger than
1222 * specified to fill page alignment. Also, when @dyn_size is auto, 1766 * specified to fill page alignment. When @dyn_size is auto,
1223 * @dyn_size does not fill the whole first chunk but only what's 1767 * @dyn_size is just big enough to fill page alignment after static
1224 * necessary for page alignment after static and reserved areas. 1768 * and reserved areas.
1225 * 1769 *
1226 * If the needed size is smaller than the minimum or specified unit 1770 * If the needed size is smaller than the minimum or specified unit
1227 * size, the leftover is returned to the bootmem allocator. 1771 * size, the leftover is returned using @free_fn.
1228 * 1772 *
1229 * RETURNS: 1773 * RETURNS:
1230 * The determined pcpu_unit_size which can be used to initialize 1774 * 0 on success, -errno on failure.
1231 * percpu access on success, -errno on failure.
1232 */ 1775 */
1233ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, 1776int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
1234 ssize_t dyn_size, ssize_t unit_size) 1777 size_t atom_size,
1778 pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
1779 pcpu_fc_alloc_fn_t alloc_fn,
1780 pcpu_fc_free_fn_t free_fn)
1235{ 1781{
1236 size_t chunk_size; 1782 void *base = (void *)ULONG_MAX;
1237 unsigned int cpu; 1783 void **areas = NULL;
1784 struct pcpu_alloc_info *ai;
1785 size_t size_sum, areas_size;
1786 int group, i, rc;
1787
1788 ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
1789 cpu_distance_fn);
1790 if (IS_ERR(ai))
1791 return PTR_ERR(ai);
1792
1793 size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
1794 areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
1795
1796 areas = alloc_bootmem_nopanic(areas_size);
1797 if (!areas) {
1798 rc = -ENOMEM;
1799 goto out_free;
1800 }
1238 1801
1239 /* determine parameters and allocate */ 1802 /* allocate, copy and determine base address */
1240 pcpue_size = PFN_ALIGN(static_size + reserved_size + 1803 for (group = 0; group < ai->nr_groups; group++) {
1241 (dyn_size >= 0 ? dyn_size : 0)); 1804 struct pcpu_group_info *gi = &ai->groups[group];
1242 if (dyn_size != 0) 1805 unsigned int cpu = NR_CPUS;
1243 dyn_size = pcpue_size - static_size - reserved_size; 1806 void *ptr;
1244 1807
1245 if (unit_size >= 0) { 1808 for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
1246 BUG_ON(unit_size < pcpue_size); 1809 cpu = gi->cpu_map[i];
1247 pcpue_unit_size = unit_size; 1810 BUG_ON(cpu == NR_CPUS);
1248 } else 1811
1249 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); 1812 /* allocate space for the whole group */
1250 1813 ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
1251 chunk_size = pcpue_unit_size * nr_cpu_ids; 1814 if (!ptr) {
1252 1815 rc = -ENOMEM;
1253 pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, 1816 goto out_free_areas;
1254 __pa(MAX_DMA_ADDRESS)); 1817 }
1255 if (!pcpue_ptr) { 1818 areas[group] = ptr;
1256 pr_warning("PERCPU: failed to allocate %zu bytes for " 1819
1257 "embedding\n", chunk_size); 1820 base = min(ptr, base);
1258 return -ENOMEM; 1821
1822 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
1823 if (gi->cpu_map[i] == NR_CPUS) {
1824 /* unused unit, free whole */
1825 free_fn(ptr, ai->unit_size);
1826 continue;
1827 }
1828 /* copy and return the unused part */
1829 memcpy(ptr, __per_cpu_load, ai->static_size);
1830 free_fn(ptr + size_sum, ai->unit_size - size_sum);
1831 }
1259 } 1832 }
1260 1833
1261 /* return the leftover and copy */ 1834 /* base address is now known, determine group base offsets */
1262 for (cpu = 0; cpu < nr_cpu_ids; cpu++) { 1835 for (group = 0; group < ai->nr_groups; group++)
1263 void *ptr = pcpue_ptr + cpu * pcpue_unit_size; 1836 ai->groups[group].base_offset = areas[group] - base;
1837
1838 pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
1839 PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
1840 ai->dyn_size, ai->unit_size);
1841
1842 rc = pcpu_setup_first_chunk(ai, base);
1843 goto out_free;
1844
1845out_free_areas:
1846 for (group = 0; group < ai->nr_groups; group++)
1847 free_fn(areas[group],
1848 ai->groups[group].nr_units * ai->unit_size);
1849out_free:
1850 pcpu_free_alloc_info(ai);
1851 if (areas)
1852 free_bootmem(__pa(areas), areas_size);
1853 return rc;
1854}
1855#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
1856 !CONFIG_HAVE_SETUP_PER_CPU_AREA */
1857
1858#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1859/**
1860 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
1861 * @reserved_size: the size of reserved percpu area in bytes
1862 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 1863 * @free_fn: function to free percpu page, always called with PAGE_SIZE
1864 * @populate_pte_fn: function to populate pte
1865 *
1866 * This is a helper to ease setting up page-remapped first percpu
1867 * chunk and can be called where pcpu_setup_first_chunk() is expected.
1868 *
1869 * This is the basic allocator. Static percpu area is allocated
1870 * page-by-page into vmalloc area.
1871 *
1872 * RETURNS:
1873 * 0 on success, -errno on failure.
1874 */
1875int __init pcpu_page_first_chunk(size_t reserved_size,
1876 pcpu_fc_alloc_fn_t alloc_fn,
1877 pcpu_fc_free_fn_t free_fn,
1878 pcpu_fc_populate_pte_fn_t populate_pte_fn)
1879{
1880 static struct vm_struct vm;
1881 struct pcpu_alloc_info *ai;
1882 char psize_str[16];
1883 int unit_pages;
1884 size_t pages_size;
1885 struct page **pages;
1886 int unit, i, j, rc;
1887
1888 snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
1889
1890 ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL);
1891 if (IS_ERR(ai))
1892 return PTR_ERR(ai);
1893 BUG_ON(ai->nr_groups != 1);
1894 BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
1895
1896 unit_pages = ai->unit_size >> PAGE_SHIFT;
1897
1898 /* unaligned allocations can't be freed, round up to page size */
1899 pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
1900 sizeof(pages[0]));
1901 pages = alloc_bootmem(pages_size);
1902
1903 /* allocate pages */
1904 j = 0;
1905 for (unit = 0; unit < num_possible_cpus(); unit++)
1906 for (i = 0; i < unit_pages; i++) {
1907 unsigned int cpu = ai->groups[0].cpu_map[unit];
1908 void *ptr;
1909
1910 ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
1911 if (!ptr) {
1912 pr_warning("PERCPU: failed to allocate %s page "
1913 "for cpu%u\n", psize_str, cpu);
1914 goto enomem;
1915 }
1916 pages[j++] = virt_to_page(ptr);
1917 }
1918
1919 /* allocate vm area, map the pages and copy static data */
1920 vm.flags = VM_ALLOC;
1921 vm.size = num_possible_cpus() * ai->unit_size;
1922 vm_area_register_early(&vm, PAGE_SIZE);
1923
1924 for (unit = 0; unit < num_possible_cpus(); unit++) {
1925 unsigned long unit_addr =
1926 (unsigned long)vm.addr + unit * ai->unit_size;
1927
1928 for (i = 0; i < unit_pages; i++)
1929 populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
1930
1931 /* pte already populated, the following shouldn't fail */
1932 rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
1933 unit_pages);
1934 if (rc < 0)
1935 panic("failed to map percpu area, err=%d\n", rc);
1264 1936
1265 if (cpu_possible(cpu)) { 1937 /*
1266 free_bootmem(__pa(ptr + pcpue_size), 1938 * FIXME: Archs with virtual cache should flush local
1267 pcpue_unit_size - pcpue_size); 1939 * cache for the linear mapping here - something
1268 memcpy(ptr, __per_cpu_load, static_size); 1940 * equivalent to flush_cache_vmap() on the local cpu.
1269 } else 1941 * flush_cache_vmap() can't be used as most supporting
1270 free_bootmem(__pa(ptr), pcpue_unit_size); 1942 * data structures are not set up yet.
1943 */
1944
1945 /* copy static data */
1946 memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
1271 } 1947 }
1272 1948
1273 /* we're ready, commit */ 1949 /* we're ready, commit */
1274 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", 1950 pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
1275 pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); 1951 unit_pages, psize_str, vm.addr, ai->static_size,
1952 ai->reserved_size, ai->dyn_size);
1953
1954 rc = pcpu_setup_first_chunk(ai, vm.addr);
1955 goto out_free_ar;
1956
1957enomem:
1958 while (--j >= 0)
1959 free_fn(page_address(pages[j]), PAGE_SIZE);
1960 rc = -ENOMEM;
1961out_free_ar:
1962 free_bootmem(__pa(pages), pages_size);
1963 pcpu_free_alloc_info(ai);
1964 return rc;
1965}
1966#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
1967
1968/*
1969 * Generic percpu area setup.
1970 *
1971 * The embedding helper is used because its behavior closely resembles
1972 * the original non-dynamic generic percpu area setup. This is
1973 * important because many archs have addressing restrictions and might
1974 * fail if the percpu area is located far away from the previous
1975 * location. As an added bonus, in non-NUMA cases, embedding is
1976 * generally a good idea TLB-wise because percpu area can piggy back
1977 * on the physical linear memory mapping which uses large page
1978 * mappings on applicable archs.
1979 */
1980#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
1981unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
1982EXPORT_SYMBOL(__per_cpu_offset);
1983
1984static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
1985 size_t align)
1986{
1987 return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
1988}
1276 1989
1277 return pcpu_setup_first_chunk(pcpue_get_page, static_size, 1990static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
1278 reserved_size, dyn_size, 1991{
1279 pcpue_unit_size, pcpue_ptr, NULL); 1992 free_bootmem(__pa(ptr), size);
1993}
1994
1995void __init setup_per_cpu_areas(void)
1996{
1997 unsigned long delta;
1998 unsigned int cpu;
1999 int rc;
2000
2001 /*
2002 * Always reserve area for module percpu variables. That's
2003 * what the legacy allocator did.
2004 */
2005 rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
2006 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
2007 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
2008 if (rc < 0)
 2009 panic("Failed to initialize percpu areas.");
2010
2011 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
2012 for_each_possible_cpu(cpu)
2013 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
1280} 2014}
2015#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
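
For orientation, the sketch below shows roughly how an architecture that selects CONFIG_HAVE_SETUP_PER_CPU_AREA might drive the reworked embed helper. It mirrors the generic setup_per_cpu_areas() in the hunk above, only adding the optional NUMA hooks. The helper names (my_pcpu_alloc, my_pcpu_free, my_cpu_distance), the PMD_SIZE atom size and the cpu_to_node()-based distance are illustrative assumptions, not part of this patch; a real port would substitute its own early, node-aware bootmem wrappers.

/* Illustrative arch-side wiring of pcpu_embed_first_chunk(); not from this patch. */
static void * __init my_pcpu_alloc(unsigned int cpu, size_t size, size_t align)
{
	/* a NUMA-aware port would allocate from @cpu's node here */
	return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
}

static void __init my_pcpu_free(void *ptr, size_t size)
{
	free_bootmem(__pa(ptr), size);
}

static int __init my_cpu_distance(unsigned int from, unsigned int to)
{
	/* CPUs on the same node share a group; everything else counts as remote */
	return cpu_to_node(from) == cpu_to_node(to) ?
	       LOCAL_DISTANCE : REMOTE_DISTANCE;
}

void __init setup_per_cpu_areas(void)
{
	unsigned long delta;
	unsigned int cpu;
	int rc;

	/* a PMD_SIZE atom lets group allocations ride on large page mappings */
	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
				    PERCPU_DYNAMIC_RESERVE, PMD_SIZE,
				    my_cpu_distance, my_pcpu_alloc,
				    my_pcpu_free);
	if (rc < 0)
		panic("percpu: embedding the first chunk failed (%d)", rc);

	/* same offset fixup as the generic path above */
	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu)
		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}

On a UMA machine the distance callback can simply be NULL, exactly as the generic path above passes it; pcpu_build_alloc_info() then places every unit in a single group.
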
diff --git a/mm/quicklist.c b/mm/quicklist.c
index e66d07d1b4ff..6eedf7e473d1 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -19,7 +19,7 @@
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/quicklist.h> 20#include <linux/quicklist.h>
21 21
22DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; 22DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
23 23
24#define FRACTION_OF_NODE_MEM 16 24#define FRACTION_OF_NODE_MEM 16
25 25
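
The hunk above (and the matching kmem_cache_cpu change in mm/slub.c further down) moves the array length into the type argument of DEFINE_PER_CPU() so that the name argument stays a plain identifier the per-cpu macros can decorate; the array is accessed exactly as before. A minimal sketch of the resulting declaration and access pattern follows: the declaration line is taken from the hunk, while my_quicklist_pages() and the nr_pages lookup are illustrative assumptions, not part of this patch.

/* Declaration as patched above; the helper below is illustrative only. */
DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);

static int my_quicklist_pages(int nr)
{
	struct quicklist *q;
	int n;

	q = &get_cpu_var(quicklist)[nr];	/* whole per-cpu array, indexed as before */
	n = q->nr_pages;			/* assumes the nr_pages member from <linux/quicklist.h> */
	put_cpu_var(quicklist);
	return n;
}
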
diff --git a/mm/shmem.c b/mm/shmem.c
index d713239ce2ce..bd20f8bb02aa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2298,8 +2298,7 @@ static void shmem_put_super(struct super_block *sb)
2298 sb->s_fs_info = NULL; 2298 sb->s_fs_info = NULL;
2299} 2299}
2300 2300
2301static int shmem_fill_super(struct super_block *sb, 2301int shmem_fill_super(struct super_block *sb, void *data, int silent)
2302 void *data, int silent)
2303{ 2302{
2304 struct inode *inode; 2303 struct inode *inode;
2305 struct dentry *root; 2304 struct dentry *root;
@@ -2446,7 +2445,7 @@ static const struct inode_operations shmem_inode_operations = {
2446 .getxattr = generic_getxattr, 2445 .getxattr = generic_getxattr,
2447 .listxattr = generic_listxattr, 2446 .listxattr = generic_listxattr,
2448 .removexattr = generic_removexattr, 2447 .removexattr = generic_removexattr,
2449 .permission = shmem_permission, 2448 .check_acl = shmem_check_acl,
2450#endif 2449#endif
2451 2450
2452}; 2451};
@@ -2469,7 +2468,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2469 .getxattr = generic_getxattr, 2468 .getxattr = generic_getxattr,
2470 .listxattr = generic_listxattr, 2469 .listxattr = generic_listxattr,
2471 .removexattr = generic_removexattr, 2470 .removexattr = generic_removexattr,
2472 .permission = shmem_permission, 2471 .check_acl = shmem_check_acl,
2473#endif 2472#endif
2474}; 2473};
2475 2474
@@ -2480,7 +2479,7 @@ static const struct inode_operations shmem_special_inode_operations = {
2480 .getxattr = generic_getxattr, 2479 .getxattr = generic_getxattr,
2481 .listxattr = generic_listxattr, 2480 .listxattr = generic_listxattr,
2482 .removexattr = generic_removexattr, 2481 .removexattr = generic_removexattr,
2483 .permission = shmem_permission, 2482 .check_acl = shmem_check_acl,
2484#endif 2483#endif
2485}; 2484};
2486 2485
@@ -2519,7 +2518,7 @@ static struct file_system_type tmpfs_fs_type = {
2519 .kill_sb = kill_litter_super, 2518 .kill_sb = kill_litter_super,
2520}; 2519};
2521 2520
2522static int __init init_tmpfs(void) 2521int __init init_tmpfs(void)
2523{ 2522{
2524 int error; 2523 int error;
2525 2524
@@ -2576,7 +2575,7 @@ static struct file_system_type tmpfs_fs_type = {
2576 .kill_sb = kill_litter_super, 2575 .kill_sb = kill_litter_super,
2577}; 2576};
2578 2577
2579static int __init init_tmpfs(void) 2578int __init init_tmpfs(void)
2580{ 2579{
2581 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); 2580 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
2582 2581
@@ -2687,5 +2686,3 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2687 vma->vm_ops = &shmem_vm_ops; 2686 vma->vm_ops = &shmem_vm_ops;
2688 return 0; 2687 return 0;
2689} 2688}
2690
2691module_init(init_tmpfs)
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index 606a8e757a42..df2c87fdae50 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -157,7 +157,7 @@ shmem_acl_init(struct inode *inode, struct inode *dir)
157/** 157/**
158 * shmem_check_acl - check_acl() callback for generic_permission() 158 * shmem_check_acl - check_acl() callback for generic_permission()
159 */ 159 */
160static int 160int
161shmem_check_acl(struct inode *inode, int mask) 161shmem_check_acl(struct inode *inode, int mask)
162{ 162{
163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); 163 struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
@@ -169,12 +169,3 @@ shmem_check_acl(struct inode *inode, int mask)
169 } 169 }
170 return -EAGAIN; 170 return -EAGAIN;
171} 171}
172
173/**
174 * shmem_permission - permission() inode operation
175 */
176int
177shmem_permission(struct inode *inode, int mask)
178{
179 return generic_permission(inode, mask, shmem_check_acl);
180}
diff --git a/mm/slob.c b/mm/slob.c
index 9641da3d5e58..837ebd64cc34 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -692,3 +692,8 @@ void __init kmem_cache_init(void)
692{ 692{
693 slob_ready = 1; 693 slob_ready = 1;
694} 694}
695
696void __init kmem_cache_init_late(void)
697{
698 /* Nothing to do */
699}
diff --git a/mm/slub.c b/mm/slub.c
index b9f1491a58a1..0a216aae227e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -141,6 +141,13 @@
141 SLAB_POISON | SLAB_STORE_USER) 141 SLAB_POISON | SLAB_STORE_USER)
142 142
143/* 143/*
144 * Debugging flags that require metadata to be stored in the slab. These get
145 * disabled when slub_debug=O is used and a cache's min order increases with
146 * metadata.
147 */
148#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
149
150/*
144 * Set of flags that will prevent slab merging 151 * Set of flags that will prevent slab merging
145 */ 152 */
146#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 153#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
@@ -325,6 +332,7 @@ static int slub_debug;
325#endif 332#endif
326 333
327static char *slub_debug_slabs; 334static char *slub_debug_slabs;
335static int disable_higher_order_debug;
328 336
329/* 337/*
330 * Object debugging 338 * Object debugging
@@ -646,7 +654,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
646 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); 654 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
647 print_section("Padding", end - remainder, remainder); 655 print_section("Padding", end - remainder, remainder);
648 656
649 restore_bytes(s, "slab padding", POISON_INUSE, start, end); 657 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
650 return 0; 658 return 0;
651} 659}
652 660
@@ -976,6 +984,15 @@ static int __init setup_slub_debug(char *str)
976 */ 984 */
977 goto check_slabs; 985 goto check_slabs;
978 986
987 if (tolower(*str) == 'o') {
988 /*
989 * Avoid enabling debugging on caches if its minimum order
990 * would increase as a result.
991 */
992 disable_higher_order_debug = 1;
993 goto out;
994 }
995
979 slub_debug = 0; 996 slub_debug = 0;
980 if (*str == '-') 997 if (*str == '-')
981 /* 998 /*
@@ -1026,8 +1043,8 @@ static unsigned long kmem_cache_flags(unsigned long objsize,
1026 * Enable debugging if selected on the kernel commandline. 1043 * Enable debugging if selected on the kernel commandline.
1027 */ 1044 */
1028 if (slub_debug && (!slub_debug_slabs || 1045 if (slub_debug && (!slub_debug_slabs ||
1029 strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0)) 1046 !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
1030 flags |= slub_debug; 1047 flags |= slub_debug;
1031 1048
1032 return flags; 1049 return flags;
1033} 1050}
@@ -1054,6 +1071,8 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize,
1054} 1071}
1055#define slub_debug 0 1072#define slub_debug 0
1056 1073
1074#define disable_higher_order_debug 0
1075
1057static inline unsigned long slabs_node(struct kmem_cache *s, int node) 1076static inline unsigned long slabs_node(struct kmem_cache *s, int node)
1058 { return 0; } 1077 { return 0; }
1059static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) 1078static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
@@ -1109,8 +1128,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1109 } 1128 }
1110 1129
1111 if (kmemcheck_enabled 1130 if (kmemcheck_enabled
1112 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) 1131 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1113 {
1114 int pages = 1 << oo_order(oo); 1132 int pages = 1 << oo_order(oo);
1115 1133
1116 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); 1134 kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
@@ -1560,6 +1578,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
1560 "default order: %d, min order: %d\n", s->name, s->objsize, 1578 "default order: %d, min order: %d\n", s->name, s->objsize,
1561 s->size, oo_order(s->oo), oo_order(s->min)); 1579 s->size, oo_order(s->oo), oo_order(s->min));
1562 1580
1581 if (oo_order(s->min) > get_order(s->objsize))
1582 printk(KERN_WARNING " %s debugging increased min order, use "
1583 "slub_debug=O to disable.\n", s->name);
1584
1563 for_each_online_node(node) { 1585 for_each_online_node(node) {
1564 struct kmem_cache_node *n = get_node(s, node); 1586 struct kmem_cache_node *n = get_node(s, node);
1565 unsigned long nr_slabs; 1587 unsigned long nr_slabs;
@@ -2001,7 +2023,7 @@ static inline int calculate_order(int size)
2001 return order; 2023 return order;
2002 fraction /= 2; 2024 fraction /= 2;
2003 } 2025 }
2004 min_objects --; 2026 min_objects--;
2005 } 2027 }
2006 2028
2007 /* 2029 /*
@@ -2091,8 +2113,8 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2091 */ 2113 */
2092#define NR_KMEM_CACHE_CPU 100 2114#define NR_KMEM_CACHE_CPU 100
2093 2115
2094static DEFINE_PER_CPU(struct kmem_cache_cpu, 2116static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
2095 kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; 2117 kmem_cache_cpu);
2096 2118
2097static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); 2119static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
2098static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); 2120static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
@@ -2400,6 +2422,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2400 * on bootup. 2422 * on bootup.
2401 */ 2423 */
2402 align = calculate_alignment(flags, align, s->objsize); 2424 align = calculate_alignment(flags, align, s->objsize);
2425 s->align = align;
2403 2426
2404 /* 2427 /*
2405 * SLUB stores one object immediately after another beginning from 2428 * SLUB stores one object immediately after another beginning from
@@ -2452,6 +2475,18 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2452 2475
2453 if (!calculate_sizes(s, -1)) 2476 if (!calculate_sizes(s, -1))
2454 goto error; 2477 goto error;
2478 if (disable_higher_order_debug) {
2479 /*
2480 * Disable debugging flags that store metadata if the min slab
2481 * order increased.
2482 */
2483 if (get_order(s->size) > get_order(s->objsize)) {
2484 s->flags &= ~DEBUG_METADATA_FLAGS;
2485 s->offset = 0;
2486 if (!calculate_sizes(s, -1))
2487 goto error;
2488 }
2489 }
2455 2490
2456 /* 2491 /*
2457 * The larger the object size is, the more pages we want on the partial 2492 * The larger the object size is, the more pages we want on the partial
@@ -2594,8 +2629,6 @@ static inline int kmem_cache_close(struct kmem_cache *s)
2594 */ 2629 */
2595void kmem_cache_destroy(struct kmem_cache *s) 2630void kmem_cache_destroy(struct kmem_cache *s)
2596{ 2631{
2597 if (s->flags & SLAB_DESTROY_BY_RCU)
2598 rcu_barrier();
2599 down_write(&slub_lock); 2632 down_write(&slub_lock);
2600 s->refcount--; 2633 s->refcount--;
2601 if (!s->refcount) { 2634 if (!s->refcount) {
@@ -2606,6 +2639,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
2606 "still has objects.\n", s->name, __func__); 2639 "still has objects.\n", s->name, __func__);
2607 dump_stack(); 2640 dump_stack();
2608 } 2641 }
2642 if (s->flags & SLAB_DESTROY_BY_RCU)
2643 rcu_barrier();
2609 sysfs_slab_remove(s); 2644 sysfs_slab_remove(s);
2610 } else 2645 } else
2611 up_write(&slub_lock); 2646 up_write(&slub_lock);
@@ -2790,6 +2825,11 @@ static s8 size_index[24] = {
2790 2 /* 192 */ 2825 2 /* 192 */
2791}; 2826};
2792 2827
2828static inline int size_index_elem(size_t bytes)
2829{
2830 return (bytes - 1) / 8;
2831}
2832
2793static struct kmem_cache *get_slab(size_t size, gfp_t flags) 2833static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2794{ 2834{
2795 int index; 2835 int index;
@@ -2798,7 +2838,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2798 if (!size) 2838 if (!size)
2799 return ZERO_SIZE_PTR; 2839 return ZERO_SIZE_PTR;
2800 2840
2801 index = size_index[(size - 1) / 8]; 2841 index = size_index[size_index_elem(size)];
2802 } else 2842 } else
2803 index = fls(size - 1); 2843 index = fls(size - 1);
2804 2844
@@ -3156,10 +3196,12 @@ void __init kmem_cache_init(void)
3156 slab_state = PARTIAL; 3196 slab_state = PARTIAL;
3157 3197
3158 /* Caches that are not of the two-to-the-power-of size */ 3198 /* Caches that are not of the two-to-the-power-of size */
3159 if (KMALLOC_MIN_SIZE <= 64) { 3199 if (KMALLOC_MIN_SIZE <= 32) {
3160 create_kmalloc_cache(&kmalloc_caches[1], 3200 create_kmalloc_cache(&kmalloc_caches[1],
3161 "kmalloc-96", 96, GFP_NOWAIT); 3201 "kmalloc-96", 96, GFP_NOWAIT);
3162 caches++; 3202 caches++;
3203 }
3204 if (KMALLOC_MIN_SIZE <= 64) {
3163 create_kmalloc_cache(&kmalloc_caches[2], 3205 create_kmalloc_cache(&kmalloc_caches[2],
3164 "kmalloc-192", 192, GFP_NOWAIT); 3206 "kmalloc-192", 192, GFP_NOWAIT);
3165 caches++; 3207 caches++;
@@ -3186,17 +3228,28 @@ void __init kmem_cache_init(void)
3186 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 3228 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
3187 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 3229 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
3188 3230
3189 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) 3231 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
3190 size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; 3232 int elem = size_index_elem(i);
3233 if (elem >= ARRAY_SIZE(size_index))
3234 break;
3235 size_index[elem] = KMALLOC_SHIFT_LOW;
3236 }
3191 3237
3192 if (KMALLOC_MIN_SIZE == 128) { 3238 if (KMALLOC_MIN_SIZE == 64) {
3239 /*
3240 * The 96 byte size cache is not used if the alignment
3241 * is 64 byte.
3242 */
3243 for (i = 64 + 8; i <= 96; i += 8)
3244 size_index[size_index_elem(i)] = 7;
3245 } else if (KMALLOC_MIN_SIZE == 128) {
3193 /* 3246 /*
3194 * The 192 byte sized cache is not used if the alignment 3247 * The 192 byte sized cache is not used if the alignment
3195 * is 128 byte. Redirect kmalloc to use the 256 byte cache 3248 * is 128 byte. Redirect kmalloc to use the 256 byte cache
3196 * instead. 3249 * instead.
3197 */ 3250 */
3198 for (i = 128 + 8; i <= 192; i += 8) 3251 for (i = 128 + 8; i <= 192; i += 8)
3199 size_index[(i - 1) / 8] = 8; 3252 size_index[size_index_elem(i)] = 8;
3200 } 3253 }
3201 3254
3202 slab_state = UP; 3255 slab_state = UP;
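The new KMALLOC_MIN_SIZE <= 32 / == 64 handling mirrors the existing 128-byte case: with a 64-byte minimum alignment the kmalloc-96 cache is never created, so every request between 65 and 96 bytes is redirected to cache index 7, i.e. the 2^7 = 128 byte cache. A user-space sketch of just that fixup loop, for illustration:

#include <stdio.h>
#include <stddef.h>

static int size_index_elem(size_t bytes)
{
        return (bytes - 1) / 8;
}

int main(void)
{
        int i;

        /*
         * Mirror of the KMALLOC_MIN_SIZE == 64 branch: point the slots
         * covering 72..96 bytes at cache index 7 (the 128 byte cache),
         * since a 96 byte cache cannot honour 64 byte alignment.
         */
        for (i = 64 + 8; i <= 96; i += 8)
                printf("size_index[%2d] = 7   /* %d..%d byte requests -> kmalloc-128 */\n",
                       size_index_elem(i), i - 7, i);
        return 0;
}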
@@ -4543,8 +4596,11 @@ static int sysfs_slab_add(struct kmem_cache *s)
4543 } 4596 }
4544 4597
4545 err = sysfs_create_group(&s->kobj, &slab_attr_group); 4598 err = sysfs_create_group(&s->kobj, &slab_attr_group);
4546 if (err) 4599 if (err) {
4600 kobject_del(&s->kobj);
4601 kobject_put(&s->kobj);
4547 return err; 4602 return err;
4603 }
4548 kobject_uevent(&s->kobj, KOBJ_ADD); 4604 kobject_uevent(&s->kobj, KOBJ_ADD);
4549 if (!unmergeable) { 4605 if (!unmergeable) {
4550 /* Setup first alias */ 4606 /* Setup first alias */
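The error path above now unregisters the kobject that sysfs_slab_add() had already added when sysfs_create_group() fails, instead of leaking it. A generic sketch of the same unwind pattern; the kobject, ktype and attribute group are hypothetical, not SLUB's:

#include <linux/kobject.h>
#include <linux/sysfs.h>

static int demo_register(struct kobject *kobj, struct kobj_type *ktype,
                         struct kobject *parent,
                         const struct attribute_group *grp)
{
        int err;

        err = kobject_init_and_add(kobj, ktype, parent, "%s", "demo");
        if (err) {
                kobject_put(kobj);      /* init_and_add failure still needs a put */
                return err;
        }

        err = sysfs_create_group(kobj, grp);
        if (err) {
                /* Same unwind as the slub fix: drop the sysfs entry, then the ref. */
                kobject_del(kobj);
                kobject_put(kobj);
                return err;
        }

        kobject_uevent(kobj, KOBJ_ADD);
        return 0;
}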
@@ -4726,7 +4782,7 @@ static const struct file_operations proc_slabinfo_operations = {
4726 4782
4727static int __init slab_proc_init(void) 4783static int __init slab_proc_init(void)
4728{ 4784{
4729 proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); 4785 proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations);
4730 return 0; 4786 return 0;
4731} 4787}
4732module_init(slab_proc_init); 4788module_init(slab_proc_init);
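For reference, the proc_create() change above drops S_IWUSR because SLUB's slabinfo file has no write handler, so advertising owner write permission was misleading; numerically:

/*
 * old mode: S_IWUSR | S_IRUGO == 0200 | 0444 == 0644  (rw-r--r--)
 * new mode: S_IRUGO           == 0444                 (r--r--r--)
 */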
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 42cd38eba79f..5ae6b8b78c80 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = {
34}; 34};
35 35
36static struct backing_dev_info swap_backing_dev_info = { 36static struct backing_dev_info swap_backing_dev_info = {
37 .name = "swap",
37 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 38 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
38 .unplug_io_fn = swap_unplug_io_fn, 39 .unplug_io_fn = swap_unplug_io_fn,
39}; 40};
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8ffdc0d23c53..74f1102e8749 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -161,7 +161,8 @@ static int discard_swap(struct swap_info_struct *si)
161 } 161 }
162 162
163 err = blkdev_issue_discard(si->bdev, start_block, 163 err = blkdev_issue_discard(si->bdev, start_block,
164 nr_blocks, GFP_KERNEL); 164 nr_blocks, GFP_KERNEL,
165 DISCARD_FL_BARRIER);
165 if (err) 166 if (err)
166 break; 167 break;
167 168
@@ -200,7 +201,8 @@ static void discard_swap_cluster(struct swap_info_struct *si,
200 start_block <<= PAGE_SHIFT - 9; 201 start_block <<= PAGE_SHIFT - 9;
201 nr_blocks <<= PAGE_SHIFT - 9; 202 nr_blocks <<= PAGE_SHIFT - 9;
202 if (blkdev_issue_discard(si->bdev, start_block, 203 if (blkdev_issue_discard(si->bdev, start_block,
203 nr_blocks, GFP_NOIO)) 204 nr_blocks, GFP_NOIO,
205 DISCARD_FL_BARRIER))
204 break; 206 break;
205 } 207 }
206 208
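Both swap discard call sites pick up the extra flags argument that blkdev_issue_discard() now takes, asking for barrier semantics so the discard stays ordered against surrounding swap I/O. A minimal sketch of the updated calling convention; the helper and its range are made up:

#include <linux/blkdev.h>
#include <linux/gfp.h>

/* demo_trim_range() is hypothetical; only the call below is from the API. */
static int demo_trim_range(struct block_device *bdev,
                           sector_t start, sector_t nr_sects)
{
        /* Fifth argument is new: issue the discard as a barrier request. */
        return blkdev_issue_discard(bdev, start, nr_sects, GFP_KERNEL,
                                    DISCARD_FL_BARRIER);
}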
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f8189a4b3e13..204b8243d8ab 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -265,6 +265,7 @@ struct vmap_area {
265static DEFINE_SPINLOCK(vmap_area_lock); 265static DEFINE_SPINLOCK(vmap_area_lock);
266static struct rb_root vmap_area_root = RB_ROOT; 266static struct rb_root vmap_area_root = RB_ROOT;
267static LIST_HEAD(vmap_area_list); 267static LIST_HEAD(vmap_area_list);
268static unsigned long vmap_area_pcpu_hole;
268 269
269static struct vmap_area *__find_vmap_area(unsigned long addr) 270static struct vmap_area *__find_vmap_area(unsigned long addr)
270{ 271{
@@ -431,6 +432,15 @@ static void __free_vmap_area(struct vmap_area *va)
431 RB_CLEAR_NODE(&va->rb_node); 432 RB_CLEAR_NODE(&va->rb_node);
432 list_del_rcu(&va->list); 433 list_del_rcu(&va->list);
433 434
435 /*
436 * Track the highest possible candidate for pcpu area
437 * allocation. Areas outside of vmalloc area can be returned
438 * here too, consider only end addresses which fall inside
439 * vmalloc area proper.
440 */
441 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
442 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
443
434 call_rcu(&va->rcu_head, rcu_free_va); 444 call_rcu(&va->rcu_head, rcu_free_va);
435} 445}
436 446
@@ -1038,6 +1048,9 @@ void __init vmalloc_init(void)
1038 va->va_end = va->va_start + tmp->size; 1048 va->va_end = va->va_start + tmp->size;
1039 __insert_vmap_area(va); 1049 __insert_vmap_area(va);
1040 } 1050 }
1051
1052 vmap_area_pcpu_hole = VMALLOC_END;
1053
1041 vmap_initialized = true; 1054 vmap_initialized = true;
1042} 1055}
1043 1056
@@ -1122,13 +1135,34 @@ EXPORT_SYMBOL_GPL(map_vm_area);
1122DEFINE_RWLOCK(vmlist_lock); 1135DEFINE_RWLOCK(vmlist_lock);
1123struct vm_struct *vmlist; 1136struct vm_struct *vmlist;
1124 1137
1138static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1139 unsigned long flags, void *caller)
1140{
1141 struct vm_struct *tmp, **p;
1142
1143 vm->flags = flags;
1144 vm->addr = (void *)va->va_start;
1145 vm->size = va->va_end - va->va_start;
1146 vm->caller = caller;
1147 va->private = vm;
1148 va->flags |= VM_VM_AREA;
1149
1150 write_lock(&vmlist_lock);
1151 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1152 if (tmp->addr >= vm->addr)
1153 break;
1154 }
1155 vm->next = *p;
1156 *p = vm;
1157 write_unlock(&vmlist_lock);
1158}
1159
1125static struct vm_struct *__get_vm_area_node(unsigned long size, 1160static struct vm_struct *__get_vm_area_node(unsigned long size,
1126 unsigned long flags, unsigned long start, unsigned long end, 1161 unsigned long flags, unsigned long start, unsigned long end,
1127 int node, gfp_t gfp_mask, void *caller) 1162 int node, gfp_t gfp_mask, void *caller)
1128{ 1163{
1129 static struct vmap_area *va; 1164 static struct vmap_area *va;
1130 struct vm_struct *area; 1165 struct vm_struct *area;
1131 struct vm_struct *tmp, **p;
1132 unsigned long align = 1; 1166 unsigned long align = 1;
1133 1167
1134 BUG_ON(in_interrupt()); 1168 BUG_ON(in_interrupt());
@@ -1147,7 +1181,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1147 if (unlikely(!size)) 1181 if (unlikely(!size))
1148 return NULL; 1182 return NULL;
1149 1183
1150 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 1184 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
1151 if (unlikely(!area)) 1185 if (unlikely(!area))
1152 return NULL; 1186 return NULL;
1153 1187
@@ -1162,25 +1196,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1162 return NULL; 1196 return NULL;
1163 } 1197 }
1164 1198
1165 area->flags = flags; 1199 insert_vmalloc_vm(area, va, flags, caller);
1166 area->addr = (void *)va->va_start;
1167 area->size = size;
1168 area->pages = NULL;
1169 area->nr_pages = 0;
1170 area->phys_addr = 0;
1171 area->caller = caller;
1172 va->private = area;
1173 va->flags |= VM_VM_AREA;
1174
1175 write_lock(&vmlist_lock);
1176 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1177 if (tmp->addr >= area->addr)
1178 break;
1179 }
1180 area->next = *p;
1181 *p = area;
1182 write_unlock(&vmlist_lock);
1183
1184 return area; 1200 return area;
1185} 1201}
1186 1202
@@ -1818,6 +1834,286 @@ void free_vm_area(struct vm_struct *area)
1818} 1834}
1819EXPORT_SYMBOL_GPL(free_vm_area); 1835EXPORT_SYMBOL_GPL(free_vm_area);
1820 1836
1837static struct vmap_area *node_to_va(struct rb_node *n)
1838{
1839 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
1840}
1841
1842/**
1843 * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
1844 * @end: target address
1845 * @pnext: out arg for the next vmap_area
1846 * @pprev: out arg for the previous vmap_area
1847 *
1848 * Returns: %true if either or both of next and prev are found,
1849 * %false if no vmap_area exists
1850 *
1851 * Find the vmap_areas whose end addresses enclose @end, i.e. if not
1852 * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
1853 */
1854static bool pvm_find_next_prev(unsigned long end,
1855 struct vmap_area **pnext,
1856 struct vmap_area **pprev)
1857{
1858 struct rb_node *n = vmap_area_root.rb_node;
1859 struct vmap_area *va = NULL;
1860
1861 while (n) {
1862 va = rb_entry(n, struct vmap_area, rb_node);
1863 if (end < va->va_end)
1864 n = n->rb_left;
1865 else if (end > va->va_end)
1866 n = n->rb_right;
1867 else
1868 break;
1869 }
1870
1871 if (!va)
1872 return false;
1873
1874 if (va->va_end > end) {
1875 *pnext = va;
1876 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
1877 } else {
1878 *pprev = va;
1879 *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
1880 }
1881 return true;
1882}
1883
1884/**
1885 * pvm_determine_end - find the highest aligned address between two vmap_areas
1886 * @pnext: in/out arg for the next vmap_area
1887 * @pprev: in/out arg for the previous vmap_area
1888 * @align: alignment
1889 *
1890 * Returns: determined end address
1891 *
1892 * Find the highest aligned address between *@pnext and *@pprev below
1893 * VMALLOC_END. *@pnext and *@pprev are adjusted so that the
1894 * aligned-down address falls between the end addresses of the two vmap_areas.
1895 *
1896 * Please note that the address returned by this function may fall
1897 * inside *@pnext vmap_area. The caller is responsible for checking
1898 * that.
1899 */
1900static unsigned long pvm_determine_end(struct vmap_area **pnext,
1901 struct vmap_area **pprev,
1902 unsigned long align)
1903{
1904 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
1905 unsigned long addr;
1906
1907 if (*pnext)
1908 addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
1909 else
1910 addr = vmalloc_end;
1911
1912 while (*pprev && (*pprev)->va_end > addr) {
1913 *pnext = *pprev;
1914 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
1915 }
1916
1917 return addr;
1918}
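/*
 * Worked example (hypothetical addresses, illustration only): with
 * align == 0x1000, VMALLOC_END == 0xf0000000 and
 * (*pnext)->va_start == 0xe1234567, the candidate end address is
 *
 *      min(0xe1234567 & ~0xfff, 0xf0000000) == 0xe1234000
 *
 * i.e. the highest page-aligned address still below the next area.
 * If *pprev ends above that address, next/prev are walked further
 * down until the pair brackets the candidate.
 */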
1919
1920/**
1921 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
1922 * @offsets: array containing offset of each area
1923 * @sizes: array containing size of each area
1924 * @nr_vms: the number of areas to allocate
1925 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
1926 * @gfp_mask: allocation mask
1927 *
1928 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
1929 * vm_structs on success, %NULL on failure
1930 *
1931 * Percpu allocator wants to use congruent vm areas so that it can
1932 * maintain the offsets among percpu areas. This function allocates
1933 * congruent vmalloc areas for it. These areas tend to be scattered
1934 * pretty far apart, with the distance between two areas easily
1935 * reaching gigabytes. To avoid interacting with regular vmallocs,
1936 * these areas are allocated from the top of the vmalloc address space.
1937 *
1938 * Despite its complicated look, this allocator is rather simple. It
1939 * does everything top-down and scans areas from the end looking for a
1940 * matching slot. While scanning, if any of the areas overlaps with an
1941 * existing vmap_area, the base address is pulled down to fit the
1942 * area. Scanning is repeated until all the areas fit, and then all
1943 * necessary data structures are inserted and the result is returned.
1944 */
1945struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
1946 const size_t *sizes, int nr_vms,
1947 size_t align, gfp_t gfp_mask)
1948{
1949 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
1950 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
1951 struct vmap_area **vas, *prev, *next;
1952 struct vm_struct **vms;
1953 int area, area2, last_area, term_area;
1954 unsigned long base, start, end, last_end;
1955 bool purged = false;
1956
1957 gfp_mask &= GFP_RECLAIM_MASK;
1958
1959 /* verify parameters and allocate data structures */
1960 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
1961 for (last_area = 0, area = 0; area < nr_vms; area++) {
1962 start = offsets[area];
1963 end = start + sizes[area];
1964
1965 /* is everything aligned properly? */
1966 BUG_ON(!IS_ALIGNED(offsets[area], align));
1967 BUG_ON(!IS_ALIGNED(sizes[area], align));
1968
1969 /* detect the area with the highest address */
1970 if (start > offsets[last_area])
1971 last_area = area;
1972
1973 for (area2 = 0; area2 < nr_vms; area2++) {
1974 unsigned long start2 = offsets[area2];
1975 unsigned long end2 = start2 + sizes[area2];
1976
1977 if (area2 == area)
1978 continue;
1979
1980 BUG_ON(start2 >= start && start2 < end);
1981 BUG_ON(end2 <= end && end2 > start);
1982 }
1983 }
1984 last_end = offsets[last_area] + sizes[last_area];
1985
1986 if (vmalloc_end - vmalloc_start < last_end) {
1987 WARN_ON(true);
1988 return NULL;
1989 }
1990
1991 vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask);
1992 vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask);
1993 if (!vas || !vms)
1994 goto err_free;
1995
1996 for (area = 0; area < nr_vms; area++) {
1997 vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
1998 vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
1999 if (!vas[area] || !vms[area])
2000 goto err_free;
2001 }
2002retry:
2003 spin_lock(&vmap_area_lock);
2004
2005 /* start scanning - we scan from the top, begin with the last area */
2006 area = term_area = last_area;
2007 start = offsets[area];
2008 end = start + sizes[area];
2009
2010 if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
2011 base = vmalloc_end - last_end;
2012 goto found;
2013 }
2014 base = pvm_determine_end(&next, &prev, align) - end;
2015
2016 while (true) {
2017 BUG_ON(next && next->va_end <= base + end);
2018 BUG_ON(prev && prev->va_end > base + end);
2019
2020 /*
2021 * base might have underflowed, add last_end before
2022 * comparing.
2023 */
2024 if (base + last_end < vmalloc_start + last_end) {
2025 spin_unlock(&vmap_area_lock);
2026 if (!purged) {
2027 purge_vmap_area_lazy();
2028 purged = true;
2029 goto retry;
2030 }
2031 goto err_free;
2032 }
2033
2034 /*
2035 * If next overlaps, move base downwards so that it's
2036 * right below next and then recheck.
2037 */
2038 if (next && next->va_start < base + end) {
2039 base = pvm_determine_end(&next, &prev, align) - end;
2040 term_area = area;
2041 continue;
2042 }
2043
2044 /*
2045 * If prev overlaps, shift down next and prev and move
2046 * base so that it's right below new next and then
2047 * recheck.
2048 */
2049 if (prev && prev->va_end > base + start) {
2050 next = prev;
2051 prev = node_to_va(rb_prev(&next->rb_node));
2052 base = pvm_determine_end(&next, &prev, align) - end;
2053 term_area = area;
2054 continue;
2055 }
2056
2057 /*
2058 * This area fits, move on to the previous one. If
2059 * the previous one is the terminal one, we're done.
2060 */
2061 area = (area + nr_vms - 1) % nr_vms;
2062 if (area == term_area)
2063 break;
2064 start = offsets[area];
2065 end = start + sizes[area];
2066 pvm_find_next_prev(base + end, &next, &prev);
2067 }
2068found:
2069 /* we've found a fitting base, insert all va's */
2070 for (area = 0; area < nr_vms; area++) {
2071 struct vmap_area *va = vas[area];
2072
2073 va->va_start = base + offsets[area];
2074 va->va_end = va->va_start + sizes[area];
2075 __insert_vmap_area(va);
2076 }
2077
2078 vmap_area_pcpu_hole = base + offsets[last_area];
2079
2080 spin_unlock(&vmap_area_lock);
2081
2082 /* insert all vm's */
2083 for (area = 0; area < nr_vms; area++)
2084 insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
2085 pcpu_get_vm_areas);
2086
2087 kfree(vas);
2088 return vms;
2089
2090err_free:
2091 for (area = 0; area < nr_vms; area++) {
2092 if (vas)
2093 kfree(vas[area]);
2094 if (vms)
2095 kfree(vms[area]);
2096 }
2097 kfree(vas);
2098 kfree(vms);
2099 return NULL;
2100}
2101
2102/**
2103 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
2104 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
2105 * @nr_vms: the number of allocated areas
2106 *
2107 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
2108 */
2109void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2110{
2111 int i;
2112
2113 for (i = 0; i < nr_vms; i++)
2114 free_vm_area(vms[i]);
2115 kfree(vms);
2116}
1821 2117
1822#ifdef CONFIG_PROC_FS 2118#ifdef CONFIG_PROC_FS
1823static void *s_start(struct seq_file *m, loff_t *pos) 2119static void *s_start(struct seq_file *m, loff_t *pos)
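As the kernel-doc above says, the intended caller of the two new interfaces is the percpu allocator, which needs one congruent vmalloc area per allocation group. A hedged sketch of how a caller might drive them, assuming the declarations land in linux/vmalloc.h as part of this series; the group layout (two 2 MB areas, 4 MB apart) is invented for illustration:

#include <linux/vmalloc.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static struct vm_struct **demo_vms;

static int demo_reserve_congruent_areas(void)
{
        /* Offsets and sizes must all be multiples of the alignment (2 MB). */
        static const unsigned long offsets[] = { 0, 4UL << 20 };
        static const size_t sizes[] = { 2UL << 20, 2UL << 20 };

        demo_vms = pcpu_get_vm_areas(offsets, sizes, 2, 2UL << 20, GFP_KERNEL);
        if (!demo_vms)
                return -ENOMEM;

        /*
         * demo_vms[1]->addr - demo_vms[0]->addr is exactly
         * offsets[1] - offsets[0], so one base address plus the fixed
         * per-group offsets reaches every area.
         */
        return 0;
}

static void demo_release_congruent_areas(void)
{
        pcpu_free_vm_areas(demo_vms, 2);
}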
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 94e86dd6954c..ba8228e0a806 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1720,7 +1720,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1720 */ 1720 */
1721 if (total_scanned > sc->swap_cluster_max + 1721 if (total_scanned > sc->swap_cluster_max +
1722 sc->swap_cluster_max / 2) { 1722 sc->swap_cluster_max / 2) {
1723 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1723 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
1724 sc->may_writepage = 1; 1724 sc->may_writepage = 1;
1725 } 1725 }
1726 1726