aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Makefile6
-rw-r--r--mm/backing-dev.c52
-rw-r--r--mm/bootmem.c31
-rw-r--r--mm/bounce.c1
-rw-r--r--mm/failslab.c1
-rw-r--r--mm/filemap.c2
-rw-r--r--mm/filemap_xip.c1
-rw-r--r--mm/hugetlb.c9
-rw-r--r--mm/kmemleak.c1
-rw-r--r--mm/ksm.c14
-rw-r--r--mm/memcontrol.c76
-rw-r--r--mm/memory-failure.c1
-rw-r--r--mm/memory.c3
-rw-r--r--mm/mempolicy.c51
-rw-r--r--mm/migrate.c1
-rw-r--r--mm/mincore.c2
-rw-r--r--mm/mlock.c41
-rw-r--r--mm/mmap.c113
-rw-r--r--mm/mmu_context.c1
-rw-r--r--mm/mmu_notifier.c1
-rw-r--r--mm/mprotect.c1
-rw-r--r--mm/mremap.c1
-rw-r--r--mm/msync.c2
-rw-r--r--mm/nommu.c13
-rw-r--r--mm/oom_kill.c1
-rw-r--r--mm/page-writeback.c44
-rw-r--r--mm/page_alloc.c2
-rw-r--r--mm/page_io.c1
-rw-r--r--mm/pagewalk.c47
-rw-r--r--mm/percpu-km.c104
-rw-r--r--mm/percpu-vm.c451
-rw-r--r--mm/percpu.c611
-rw-r--r--mm/percpu_up.c30
-rw-r--r--mm/quicklist.c1
-rw-r--r--mm/readahead.c3
-rw-r--r--mm/rmap.c43
-rw-r--r--mm/shmem.c29
-rw-r--r--mm/slab.c54
-rw-r--r--mm/slob.c8
-rw-r--r--mm/slub.c51
-rw-r--r--mm/sparse-vmemmap.c1
-rw-r--r--mm/sparse.c1
-rw-r--r--mm/swap.c1
-rw-r--r--mm/swap_state.c1
-rw-r--r--mm/swapfile.c14
-rw-r--r--mm/truncate.c1
-rw-r--r--mm/util.c21
-rw-r--r--mm/vmscan.c25
-rw-r--r--mm/vmstat.c1
49 files changed, 1158 insertions, 813 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 7a68d2ab556..6c2a73a54a4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,7 +33,11 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
33obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 33obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
34obj-$(CONFIG_FS_XIP) += filemap_xip.o 34obj-$(CONFIG_FS_XIP) += filemap_xip.o
35obj-$(CONFIG_MIGRATION) += migrate.o 35obj-$(CONFIG_MIGRATION) += migrate.o
36obj-$(CONFIG_SMP) += percpu.o 36ifdef CONFIG_SMP
37obj-y += percpu.o
38else
39obj-y += percpu_up.o
40endif
37obj-$(CONFIG_QUICKLIST) += quicklist.o 41obj-$(CONFIG_QUICKLIST) += quicklist.o
38obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 42obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
39obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 43obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0e8ca034770..660a87a2251 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -11,6 +11,8 @@
11#include <linux/writeback.h> 11#include <linux/writeback.h>
12#include <linux/device.h> 12#include <linux/device.h>
13 13
14static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
15
14void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 16void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
15{ 17{
16} 18}
@@ -25,6 +27,11 @@ struct backing_dev_info default_backing_dev_info = {
25}; 27};
26EXPORT_SYMBOL_GPL(default_backing_dev_info); 28EXPORT_SYMBOL_GPL(default_backing_dev_info);
27 29
30struct backing_dev_info noop_backing_dev_info = {
31 .name = "noop",
32};
33EXPORT_SYMBOL_GPL(noop_backing_dev_info);
34
28static struct class *bdi_class; 35static struct class *bdi_class;
29 36
30/* 37/*
@@ -41,7 +48,6 @@ static struct timer_list sync_supers_timer;
41 48
42static int bdi_sync_supers(void *); 49static int bdi_sync_supers(void *);
43static void sync_supers_timer_fn(unsigned long); 50static void sync_supers_timer_fn(unsigned long);
44static void arm_supers_timer(void);
45 51
46static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); 52static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
47 53
@@ -227,6 +233,9 @@ static struct device_attribute bdi_dev_attrs[] = {
227static __init int bdi_class_init(void) 233static __init int bdi_class_init(void)
228{ 234{
229 bdi_class = class_create(THIS_MODULE, "bdi"); 235 bdi_class = class_create(THIS_MODULE, "bdi");
236 if (IS_ERR(bdi_class))
237 return PTR_ERR(bdi_class);
238
230 bdi_class->dev_attrs = bdi_dev_attrs; 239 bdi_class->dev_attrs = bdi_dev_attrs;
231 bdi_debug_init(); 240 bdi_debug_init();
232 return 0; 241 return 0;
@@ -242,7 +251,7 @@ static int __init default_bdi_init(void)
242 251
243 init_timer(&sync_supers_timer); 252 init_timer(&sync_supers_timer);
244 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); 253 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
245 arm_supers_timer(); 254 bdi_arm_supers_timer();
246 255
247 err = bdi_init(&default_backing_dev_info); 256 err = bdi_init(&default_backing_dev_info);
248 if (!err) 257 if (!err)
@@ -364,10 +373,13 @@ static int bdi_sync_supers(void *unused)
364 return 0; 373 return 0;
365} 374}
366 375
367static void arm_supers_timer(void) 376void bdi_arm_supers_timer(void)
368{ 377{
369 unsigned long next; 378 unsigned long next;
370 379
380 if (!dirty_writeback_interval)
381 return;
382
371 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; 383 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
372 mod_timer(&sync_supers_timer, round_jiffies_up(next)); 384 mod_timer(&sync_supers_timer, round_jiffies_up(next));
373} 385}
@@ -375,7 +387,7 @@ static void arm_supers_timer(void)
375static void sync_supers_timer_fn(unsigned long unused) 387static void sync_supers_timer_fn(unsigned long unused)
376{ 388{
377 wake_up_process(sync_supers_tsk); 389 wake_up_process(sync_supers_tsk);
378 arm_supers_timer(); 390 bdi_arm_supers_timer();
379} 391}
380 392
381static int bdi_forker_task(void *ptr) 393static int bdi_forker_task(void *ptr)
@@ -418,7 +430,10 @@ static int bdi_forker_task(void *ptr)
418 430
419 spin_unlock_bh(&bdi_lock); 431 spin_unlock_bh(&bdi_lock);
420 wait = msecs_to_jiffies(dirty_writeback_interval * 10); 432 wait = msecs_to_jiffies(dirty_writeback_interval * 10);
421 schedule_timeout(wait); 433 if (wait)
434 schedule_timeout(wait);
435 else
436 schedule();
422 try_to_freeze(); 437 try_to_freeze();
423 continue; 438 continue;
424 } 439 }
@@ -712,6 +727,33 @@ void bdi_destroy(struct backing_dev_info *bdi)
712} 727}
713EXPORT_SYMBOL(bdi_destroy); 728EXPORT_SYMBOL(bdi_destroy);
714 729
730/*
731 * For use from filesystems to quickly init and register a bdi associated
732 * with dirty writeback
733 */
734int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
735 unsigned int cap)
736{
737 char tmp[32];
738 int err;
739
740 bdi->name = name;
741 bdi->capabilities = cap;
742 err = bdi_init(bdi);
743 if (err)
744 return err;
745
746 sprintf(tmp, "%.28s%s", name, "-%d");
747 err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
748 if (err) {
749 bdi_destroy(bdi);
750 return err;
751 }
752
753 return 0;
754}
755EXPORT_SYMBOL(bdi_setup_and_register);
756
715static wait_queue_head_t congestion_wqh[2] = { 757static wait_queue_head_t congestion_wqh[2] = {
716 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), 758 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
717 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 759 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d7c791ef003..58c66cc5056 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -10,6 +10,7 @@
10 */ 10 */
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/pfn.h> 12#include <linux/pfn.h>
13#include <linux/slab.h>
13#include <linux/bootmem.h> 14#include <linux/bootmem.h>
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/kmemleak.h> 16#include <linux/kmemleak.h>
@@ -180,19 +181,12 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
180 end_aligned = end & ~(BITS_PER_LONG - 1); 181 end_aligned = end & ~(BITS_PER_LONG - 1);
181 182
182 if (end_aligned <= start_aligned) { 183 if (end_aligned <= start_aligned) {
183#if 1
184 printk(KERN_DEBUG " %lx - %lx\n", start, end);
185#endif
186 for (i = start; i < end; i++) 184 for (i = start; i < end; i++)
187 __free_pages_bootmem(pfn_to_page(i), 0); 185 __free_pages_bootmem(pfn_to_page(i), 0);
188 186
189 return; 187 return;
190 } 188 }
191 189
192#if 1
193 printk(KERN_DEBUG " %lx %lx - %lx %lx\n",
194 start, start_aligned, end_aligned, end);
195#endif
196 for (i = start; i < start_aligned; i++) 190 for (i = start; i < start_aligned; i++)
197 __free_pages_bootmem(pfn_to_page(i), 0); 191 __free_pages_bootmem(pfn_to_page(i), 0);
198 192
@@ -310,9 +304,22 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
310unsigned long __init free_all_bootmem(void) 304unsigned long __init free_all_bootmem(void)
311{ 305{
312#ifdef CONFIG_NO_BOOTMEM 306#ifdef CONFIG_NO_BOOTMEM
313 return free_all_memory_core_early(NODE_DATA(0)->node_id); 307 /*
308 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
309 * because in some case like Node0 doesnt have RAM installed
310 * low ram will be on Node1
311 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
312 * will be used instead of only Node0 related
313 */
314 return free_all_memory_core_early(MAX_NUMNODES);
314#else 315#else
315 return free_all_bootmem_core(NODE_DATA(0)->bdata); 316 unsigned long total_pages = 0;
317 bootmem_data_t *bdata;
318
319 list_for_each_entry(bdata, &bdata_list, list)
320 total_pages += free_all_bootmem_core(bdata);
321
322 return total_pages;
316#endif 323#endif
317} 324}
318 325
@@ -428,9 +435,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
428{ 435{
429#ifdef CONFIG_NO_BOOTMEM 436#ifdef CONFIG_NO_BOOTMEM
430 free_early(physaddr, physaddr + size); 437 free_early(physaddr, physaddr + size);
431#if 0
432 printk(KERN_DEBUG "free %lx %lx\n", physaddr, size);
433#endif
434#else 438#else
435 unsigned long start, end; 439 unsigned long start, end;
436 440
@@ -456,9 +460,6 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
456{ 460{
457#ifdef CONFIG_NO_BOOTMEM 461#ifdef CONFIG_NO_BOOTMEM
458 free_early(addr, addr + size); 462 free_early(addr, addr + size);
459#if 0
460 printk(KERN_DEBUG "free %lx %lx\n", addr, size);
461#endif
462#else 463#else
463 unsigned long start, end; 464 unsigned long start, end;
464 465
diff --git a/mm/bounce.c b/mm/bounce.c
index a2b76a588e3..13b6dad1eed 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -6,6 +6,7 @@
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/swap.h> 8#include <linux/swap.h>
9#include <linux/gfp.h>
9#include <linux/bio.h> 10#include <linux/bio.h>
10#include <linux/pagemap.h> 11#include <linux/pagemap.h>
11#include <linux/mempool.h> 12#include <linux/mempool.h>
diff --git a/mm/failslab.c b/mm/failslab.c
index bb41f98dd8b..c5f88f240dd 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,5 +1,4 @@
1#include <linux/fault-inject.h> 1#include <linux/fault-inject.h>
2#include <linux/gfp.h>
3#include <linux/slab.h> 2#include <linux/slab.h>
4 3
5static struct { 4static struct {
diff --git a/mm/filemap.c b/mm/filemap.c
index 045b31c3765..140ebda9640 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -10,13 +10,13 @@
10 * the NFS filesystem used to do this differently, for example) 10 * the NFS filesystem used to do this differently, for example)
11 */ 11 */
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/slab.h>
14#include <linux/compiler.h> 13#include <linux/compiler.h>
15#include <linux/fs.h> 14#include <linux/fs.h>
16#include <linux/uaccess.h> 15#include <linux/uaccess.h>
17#include <linux/aio.h> 16#include <linux/aio.h>
18#include <linux/capability.h> 17#include <linux/capability.h>
19#include <linux/kernel_stat.h> 18#include <linux/kernel_stat.h>
19#include <linux/gfp.h>
20#include <linux/mm.h> 20#include <linux/mm.h>
21#include <linux/swap.h> 21#include <linux/swap.h>
22#include <linux/mman.h> 22#include <linux/mman.h>
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 78b94f0b6d5..83364df74a3 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -17,6 +17,7 @@
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/seqlock.h> 18#include <linux/seqlock.h>
19#include <linux/mutex.h> 19#include <linux/mutex.h>
20#include <linux/gfp.h>
20#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
21#include <asm/io.h> 22#include <asm/io.h>
22 23
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3a5aeb37c11..4c9e6bbf377 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2,7 +2,6 @@
2 * Generic hugetlb support. 2 * Generic hugetlb support.
3 * (C) William Irwin, April 2004 3 * (C) William Irwin, April 2004
4 */ 4 */
5#include <linux/gfp.h>
6#include <linux/list.h> 5#include <linux/list.h>
7#include <linux/init.h> 6#include <linux/init.h>
8#include <linux/module.h> 7#include <linux/module.h>
@@ -18,6 +17,7 @@
18#include <linux/mutex.h> 17#include <linux/mutex.h>
19#include <linux/bootmem.h> 18#include <linux/bootmem.h>
20#include <linux/sysfs.h> 19#include <linux/sysfs.h>
20#include <linux/slab.h>
21 21
22#include <asm/page.h> 22#include <asm/page.h>
23#include <asm/pgtable.h> 23#include <asm/pgtable.h>
@@ -546,6 +546,7 @@ static void free_huge_page(struct page *page)
546 546
547 mapping = (struct address_space *) page_private(page); 547 mapping = (struct address_space *) page_private(page);
548 set_page_private(page, 0); 548 set_page_private(page, 0);
549 page->mapping = NULL;
549 BUG_ON(page_count(page)); 550 BUG_ON(page_count(page));
550 INIT_LIST_HEAD(&page->lru); 551 INIT_LIST_HEAD(&page->lru);
551 552
@@ -1038,7 +1039,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1038 page = alloc_buddy_huge_page(h, vma, addr); 1039 page = alloc_buddy_huge_page(h, vma, addr);
1039 if (!page) { 1040 if (!page) {
1040 hugetlb_put_quota(inode->i_mapping, chg); 1041 hugetlb_put_quota(inode->i_mapping, chg);
1041 return ERR_PTR(-VM_FAULT_OOM); 1042 return ERR_PTR(-VM_FAULT_SIGBUS);
1042 } 1043 }
1043 } 1044 }
1044 1045
@@ -2447,8 +2448,10 @@ retry:
2447 spin_lock(&inode->i_lock); 2448 spin_lock(&inode->i_lock);
2448 inode->i_blocks += blocks_per_huge_page(h); 2449 inode->i_blocks += blocks_per_huge_page(h);
2449 spin_unlock(&inode->i_lock); 2450 spin_unlock(&inode->i_lock);
2450 } else 2451 } else {
2451 lock_page(page); 2452 lock_page(page);
2453 page->mapping = HUGETLB_POISON;
2454 }
2452 } 2455 }
2453 2456
2454 /* 2457 /*
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 5b069e4f5e4..2c0d032ac89 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -72,7 +72,6 @@
72#include <linux/module.h> 72#include <linux/module.h>
73#include <linux/kthread.h> 73#include <linux/kthread.h>
74#include <linux/prio_tree.h> 74#include <linux/prio_tree.h>
75#include <linux/gfp.h>
76#include <linux/fs.h> 75#include <linux/fs.h>
77#include <linux/debugfs.h> 76#include <linux/debugfs.h>
78#include <linux/seq_file.h> 77#include <linux/seq_file.h>
diff --git a/mm/ksm.c b/mm/ksm.c
index a93f1b7f508..956880f2ff4 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -365,7 +365,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
365 do { 365 do {
366 cond_resched(); 366 cond_resched();
367 page = follow_page(vma, addr, FOLL_GET); 367 page = follow_page(vma, addr, FOLL_GET);
368 if (!page) 368 if (IS_ERR_OR_NULL(page))
369 break; 369 break;
370 if (PageKsm(page)) 370 if (PageKsm(page))
371 ret = handle_mm_fault(vma->vm_mm, vma, addr, 371 ret = handle_mm_fault(vma->vm_mm, vma, addr,
@@ -447,7 +447,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
447 goto out; 447 goto out;
448 448
449 page = follow_page(vma, addr, FOLL_GET); 449 page = follow_page(vma, addr, FOLL_GET);
450 if (!page) 450 if (IS_ERR_OR_NULL(page))
451 goto out; 451 goto out;
452 if (PageAnon(page)) { 452 if (PageAnon(page)) {
453 flush_anon_page(vma, page, addr); 453 flush_anon_page(vma, page, addr);
@@ -751,7 +751,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
751 * page 751 * page
752 */ 752 */
753 if (page_mapcount(page) + 1 + swapped != page_count(page)) { 753 if (page_mapcount(page) + 1 + swapped != page_count(page)) {
754 set_pte_at_notify(mm, addr, ptep, entry); 754 set_pte_at(mm, addr, ptep, entry);
755 goto out_unlock; 755 goto out_unlock;
756 } 756 }
757 entry = pte_wrprotect(entry); 757 entry = pte_wrprotect(entry);
@@ -1086,7 +1086,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1086 cond_resched(); 1086 cond_resched();
1087 tree_rmap_item = rb_entry(*new, struct rmap_item, node); 1087 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1088 tree_page = get_mergeable_page(tree_rmap_item); 1088 tree_page = get_mergeable_page(tree_rmap_item);
1089 if (!tree_page) 1089 if (IS_ERR_OR_NULL(tree_page))
1090 return NULL; 1090 return NULL;
1091 1091
1092 /* 1092 /*
@@ -1294,7 +1294,7 @@ next_mm:
1294 if (ksm_test_exit(mm)) 1294 if (ksm_test_exit(mm))
1295 break; 1295 break;
1296 *page = follow_page(vma, ksm_scan.address, FOLL_GET); 1296 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
1297 if (*page && PageAnon(*page)) { 1297 if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) {
1298 flush_anon_page(vma, *page, ksm_scan.address); 1298 flush_anon_page(vma, *page, ksm_scan.address);
1299 flush_dcache_page(*page); 1299 flush_dcache_page(*page);
1300 rmap_item = get_next_rmap_item(slot, 1300 rmap_item = get_next_rmap_item(slot,
@@ -1308,7 +1308,7 @@ next_mm:
1308 up_read(&mm->mmap_sem); 1308 up_read(&mm->mmap_sem);
1309 return rmap_item; 1309 return rmap_item;
1310 } 1310 }
1311 if (*page) 1311 if (!IS_ERR_OR_NULL(*page))
1312 put_page(*page); 1312 put_page(*page);
1313 ksm_scan.address += PAGE_SIZE; 1313 ksm_scan.address += PAGE_SIZE;
1314 cond_resched(); 1314 cond_resched();
@@ -1367,7 +1367,7 @@ next_mm:
1367static void ksm_do_scan(unsigned int scan_npages) 1367static void ksm_do_scan(unsigned int scan_npages)
1368{ 1368{
1369 struct rmap_item *rmap_item; 1369 struct rmap_item *rmap_item;
1370 struct page *page; 1370 struct page *uninitialized_var(page);
1371 1371
1372 while (scan_npages--) { 1372 while (scan_npages--) {
1373 cond_resched(); 1373 cond_resched();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7973b5221fb..c8569bc298f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1359,16 +1359,19 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
1359 1359
1360 lock_page_cgroup(pc); 1360 lock_page_cgroup(pc);
1361 mem = pc->mem_cgroup; 1361 mem = pc->mem_cgroup;
1362 if (!mem) 1362 if (!mem || !PageCgroupUsed(pc))
1363 goto done;
1364
1365 if (!PageCgroupUsed(pc))
1366 goto done; 1363 goto done;
1367 1364
1368 /* 1365 /*
1369 * Preemption is already disabled. We can use __this_cpu_xxx 1366 * Preemption is already disabled. We can use __this_cpu_xxx
1370 */ 1367 */
1371 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val); 1368 if (val > 0) {
1369 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1370 SetPageCgroupFileMapped(pc);
1371 } else {
1372 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1373 ClearPageCgroupFileMapped(pc);
1374 }
1372 1375
1373done: 1376done:
1374 unlock_page_cgroup(pc); 1377 unlock_page_cgroup(pc);
@@ -1435,7 +1438,7 @@ static void drain_local_stock(struct work_struct *dummy)
1435 1438
1436/* 1439/*
1437 * Cache charges(val) which is from res_counter, to local per_cpu area. 1440 * Cache charges(val) which is from res_counter, to local per_cpu area.
1438 * This will be consumed by consumt_stock() function, later. 1441 * This will be consumed by consume_stock() function, later.
1439 */ 1442 */
1440static void refill_stock(struct mem_cgroup *mem, int val) 1443static void refill_stock(struct mem_cgroup *mem, int val)
1441{ 1444{
@@ -1598,7 +1601,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1598 * There is a small race that "from" or "to" can be 1601 * There is a small race that "from" or "to" can be
1599 * freed by rmdir, so we use css_tryget(). 1602 * freed by rmdir, so we use css_tryget().
1600 */ 1603 */
1601 rcu_read_lock();
1602 from = mc.from; 1604 from = mc.from;
1603 to = mc.to; 1605 to = mc.to;
1604 if (from && css_tryget(&from->css)) { 1606 if (from && css_tryget(&from->css)) {
@@ -1619,7 +1621,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1619 do_continue = (to == mem_over_limit); 1621 do_continue = (to == mem_over_limit);
1620 css_put(&to->css); 1622 css_put(&to->css);
1621 } 1623 }
1622 rcu_read_unlock();
1623 if (do_continue) { 1624 if (do_continue) {
1624 DEFINE_WAIT(wait); 1625 DEFINE_WAIT(wait);
1625 prepare_to_wait(&mc.waitq, &wait, 1626 prepare_to_wait(&mc.waitq, &wait,
@@ -1801,16 +1802,13 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1801static void __mem_cgroup_move_account(struct page_cgroup *pc, 1802static void __mem_cgroup_move_account(struct page_cgroup *pc,
1802 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) 1803 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
1803{ 1804{
1804 struct page *page;
1805
1806 VM_BUG_ON(from == to); 1805 VM_BUG_ON(from == to);
1807 VM_BUG_ON(PageLRU(pc->page)); 1806 VM_BUG_ON(PageLRU(pc->page));
1808 VM_BUG_ON(!PageCgroupLocked(pc)); 1807 VM_BUG_ON(!PageCgroupLocked(pc));
1809 VM_BUG_ON(!PageCgroupUsed(pc)); 1808 VM_BUG_ON(!PageCgroupUsed(pc));
1810 VM_BUG_ON(pc->mem_cgroup != from); 1809 VM_BUG_ON(pc->mem_cgroup != from);
1811 1810
1812 page = pc->page; 1811 if (PageCgroupFileMapped(pc)) {
1813 if (page_mapped(page) && !PageAnon(page)) {
1814 /* Update mapped_file data for mem_cgroup */ 1812 /* Update mapped_file data for mem_cgroup */
1815 preempt_disable(); 1813 preempt_disable();
1816 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1814 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
@@ -2429,11 +2427,11 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
2429 } 2427 }
2430 unlock_page_cgroup(pc); 2428 unlock_page_cgroup(pc);
2431 2429
2430 *ptr = mem;
2432 if (mem) { 2431 if (mem) {
2433 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 2432 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
2434 css_put(&mem->css); 2433 css_put(&mem->css);
2435 } 2434 }
2436 *ptr = mem;
2437 return ret; 2435 return ret;
2438} 2436}
2439 2437
@@ -3691,8 +3689,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
3691 else 3689 else
3692 mem = vmalloc(size); 3690 mem = vmalloc(size);
3693 3691
3694 if (mem) 3692 if (!mem)
3695 memset(mem, 0, size); 3693 return NULL;
3694
3695 memset(mem, 0, size);
3696 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 3696 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
3697 if (!mem->stat) { 3697 if (!mem->stat) {
3698 if (size < PAGE_SIZE) 3698 if (size < PAGE_SIZE)
@@ -3946,28 +3946,6 @@ one_by_one:
3946 } 3946 }
3947 return ret; 3947 return ret;
3948} 3948}
3949#else /* !CONFIG_MMU */
3950static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
3951 struct cgroup *cgroup,
3952 struct task_struct *p,
3953 bool threadgroup)
3954{
3955 return 0;
3956}
3957static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
3958 struct cgroup *cgroup,
3959 struct task_struct *p,
3960 bool threadgroup)
3961{
3962}
3963static void mem_cgroup_move_task(struct cgroup_subsys *ss,
3964 struct cgroup *cont,
3965 struct cgroup *old_cont,
3966 struct task_struct *p,
3967 bool threadgroup)
3968{
3969}
3970#endif
3971 3949
3972/** 3950/**
3973 * is_target_pte_for_mc - check a pte whether it is valid for move charge 3951 * is_target_pte_for_mc - check a pte whether it is valid for move charge
@@ -4330,6 +4308,28 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4330 } 4308 }
4331 mem_cgroup_clear_mc(); 4309 mem_cgroup_clear_mc();
4332} 4310}
4311#else /* !CONFIG_MMU */
4312static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4313 struct cgroup *cgroup,
4314 struct task_struct *p,
4315 bool threadgroup)
4316{
4317 return 0;
4318}
4319static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4320 struct cgroup *cgroup,
4321 struct task_struct *p,
4322 bool threadgroup)
4323{
4324}
4325static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4326 struct cgroup *cont,
4327 struct cgroup *old_cont,
4328 struct task_struct *p,
4329 bool threadgroup)
4330{
4331}
4332#endif
4333 4333
4334struct cgroup_subsys mem_cgroup_subsys = { 4334struct cgroup_subsys mem_cgroup_subsys = {
4335 .name = "memory", 4335 .name = "memory",
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d1f33516297..620b0b46159 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -44,6 +44,7 @@
44#include <linux/migrate.h> 44#include <linux/migrate.h>
45#include <linux/page-isolation.h> 45#include <linux/page-isolation.h>
46#include <linux/suspend.h> 46#include <linux/suspend.h>
47#include <linux/slab.h>
47#include "internal.h" 48#include "internal.h"
48 49
49int sysctl_memory_failure_early_kill __read_mostly = 0; 50int sysctl_memory_failure_early_kill __read_mostly = 0;
diff --git a/mm/memory.c b/mm/memory.c
index 5b7f2002e54..833952d8b74 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -56,6 +56,7 @@
56#include <linux/kallsyms.h> 56#include <linux/kallsyms.h>
57#include <linux/swapops.h> 57#include <linux/swapops.h>
58#include <linux/elf.h> 58#include <linux/elf.h>
59#include <linux/gfp.h>
59 60
60#include <asm/io.h> 61#include <asm/io.h>
61#include <asm/pgalloc.h> 62#include <asm/pgalloc.h>
@@ -124,7 +125,7 @@ core_initcall(init_zero_pfn);
124 125
125#if defined(SPLIT_RSS_COUNTING) 126#if defined(SPLIT_RSS_COUNTING)
126 127
127void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) 128static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
128{ 129{
129 int i; 130 int i;
130 131
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 643f66e1018..08f40a2f3fe 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -73,7 +73,6 @@
73#include <linux/sched.h> 73#include <linux/sched.h>
74#include <linux/nodemask.h> 74#include <linux/nodemask.h>
75#include <linux/cpuset.h> 75#include <linux/cpuset.h>
76#include <linux/gfp.h>
77#include <linux/slab.h> 76#include <linux/slab.h>
78#include <linux/string.h> 77#include <linux/string.h>
79#include <linux/module.h> 78#include <linux/module.h>
@@ -806,9 +805,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
806 805
807 err = 0; 806 err = 0;
808 if (nmask) { 807 if (nmask) {
809 task_lock(current); 808 if (mpol_store_user_nodemask(pol)) {
810 get_policy_nodemask(pol, nmask); 809 *nmask = pol->w.user_nodemask;
811 task_unlock(current); 810 } else {
811 task_lock(current);
812 get_policy_nodemask(pol, nmask);
813 task_unlock(current);
814 }
812 } 815 }
813 816
814 out: 817 out:
@@ -2195,8 +2198,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2195 char *rest = nodelist; 2198 char *rest = nodelist;
2196 while (isdigit(*rest)) 2199 while (isdigit(*rest))
2197 rest++; 2200 rest++;
2198 if (!*rest) 2201 if (*rest)
2199 err = 0; 2202 goto out;
2200 } 2203 }
2201 break; 2204 break;
2202 case MPOL_INTERLEAVE: 2205 case MPOL_INTERLEAVE:
@@ -2205,7 +2208,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2205 */ 2208 */
2206 if (!nodelist) 2209 if (!nodelist)
2207 nodes = node_states[N_HIGH_MEMORY]; 2210 nodes = node_states[N_HIGH_MEMORY];
2208 err = 0;
2209 break; 2211 break;
2210 case MPOL_LOCAL: 2212 case MPOL_LOCAL:
2211 /* 2213 /*
@@ -2215,11 +2217,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2215 goto out; 2217 goto out;
2216 mode = MPOL_PREFERRED; 2218 mode = MPOL_PREFERRED;
2217 break; 2219 break;
2218 2220 case MPOL_DEFAULT:
2219 /* 2221 /*
2220 * case MPOL_BIND: mpol_new() enforces non-empty nodemask. 2222 * Insist on a empty nodelist
2221 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags. 2223 */
2222 */ 2224 if (!nodelist)
2225 err = 0;
2226 goto out;
2227 case MPOL_BIND:
2228 /*
2229 * Insist on a nodelist
2230 */
2231 if (!nodelist)
2232 goto out;
2223 } 2233 }
2224 2234
2225 mode_flags = 0; 2235 mode_flags = 0;
@@ -2233,13 +2243,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2233 else if (!strcmp(flags, "relative")) 2243 else if (!strcmp(flags, "relative"))
2234 mode_flags |= MPOL_F_RELATIVE_NODES; 2244 mode_flags |= MPOL_F_RELATIVE_NODES;
2235 else 2245 else
2236 err = 1; 2246 goto out;
2237 } 2247 }
2238 2248
2239 new = mpol_new(mode, mode_flags, &nodes); 2249 new = mpol_new(mode, mode_flags, &nodes);
2240 if (IS_ERR(new)) 2250 if (IS_ERR(new))
2241 err = 1; 2251 goto out;
2242 else { 2252
2253 {
2243 int ret; 2254 int ret;
2244 NODEMASK_SCRATCH(scratch); 2255 NODEMASK_SCRATCH(scratch);
2245 if (scratch) { 2256 if (scratch) {
@@ -2250,13 +2261,15 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2250 ret = -ENOMEM; 2261 ret = -ENOMEM;
2251 NODEMASK_SCRATCH_FREE(scratch); 2262 NODEMASK_SCRATCH_FREE(scratch);
2252 if (ret) { 2263 if (ret) {
2253 err = 1;
2254 mpol_put(new); 2264 mpol_put(new);
2255 } else if (no_context) { 2265 goto out;
2256 /* save for contextualization */
2257 new->w.user_nodemask = nodes;
2258 } 2266 }
2259 } 2267 }
2268 err = 0;
2269 if (no_context) {
2270 /* save for contextualization */
2271 new->w.user_nodemask = nodes;
2272 }
2260 2273
2261out: 2274out:
2262 /* Restore string for error message */ 2275 /* Restore string for error message */
diff --git a/mm/migrate.c b/mm/migrate.c
index 88000b89fc9..d3f3f7f8107 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,6 +32,7 @@
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/gfp.h>
35 36
36#include "internal.h" 37#include "internal.h"
37 38
diff --git a/mm/mincore.c b/mm/mincore.c
index 7a3436ef39e..f77433c2027 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -7,8 +7,8 @@
7/* 7/*
8 * The mincore() system call. 8 * The mincore() system call.
9 */ 9 */
10#include <linux/slab.h>
11#include <linux/pagemap.h> 10#include <linux/pagemap.h>
11#include <linux/gfp.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/mman.h> 13#include <linux/mman.h>
14#include <linux/syscalls.h> 14#include <linux/syscalls.h>
diff --git a/mm/mlock.c b/mm/mlock.c
index 8f4e2dfceec..3f82720e051 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -607,44 +607,3 @@ void user_shm_unlock(size_t size, struct user_struct *user)
607 spin_unlock(&shmlock_user_lock); 607 spin_unlock(&shmlock_user_lock);
608 free_uid(user); 608 free_uid(user);
609} 609}
610
611int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
612 size_t size)
613{
614 unsigned long lim, vm, pgsz;
615 int error = -ENOMEM;
616
617 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
618
619 down_write(&mm->mmap_sem);
620
621 lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT;
622 vm = mm->total_vm + pgsz;
623 if (lim < vm)
624 goto out;
625
626 lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT;
627 vm = mm->locked_vm + pgsz;
628 if (lim < vm)
629 goto out;
630
631 mm->total_vm += pgsz;
632 mm->locked_vm += pgsz;
633
634 error = 0;
635 out:
636 up_write(&mm->mmap_sem);
637 return error;
638}
639
640void refund_locked_memory(struct mm_struct *mm, size_t size)
641{
642 unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
643
644 down_write(&mm->mmap_sem);
645
646 mm->total_vm -= pgsz;
647 mm->locked_vm -= pgsz;
648
649 up_write(&mm->mmap_sem);
650}
diff --git a/mm/mmap.c b/mm/mmap.c
index 75557c639ad..456ec6f2788 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -507,11 +507,12 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
507 struct address_space *mapping = NULL; 507 struct address_space *mapping = NULL;
508 struct prio_tree_root *root = NULL; 508 struct prio_tree_root *root = NULL;
509 struct file *file = vma->vm_file; 509 struct file *file = vma->vm_file;
510 struct anon_vma *anon_vma = NULL;
511 long adjust_next = 0; 510 long adjust_next = 0;
512 int remove_next = 0; 511 int remove_next = 0;
513 512
514 if (next && !insert) { 513 if (next && !insert) {
514 struct vm_area_struct *exporter = NULL;
515
515 if (end >= next->vm_end) { 516 if (end >= next->vm_end) {
516 /* 517 /*
517 * vma expands, overlapping all the next, and 518 * vma expands, overlapping all the next, and
@@ -519,7 +520,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
519 */ 520 */
520again: remove_next = 1 + (end > next->vm_end); 521again: remove_next = 1 + (end > next->vm_end);
521 end = next->vm_end; 522 end = next->vm_end;
522 anon_vma = next->anon_vma; 523 exporter = next;
523 importer = vma; 524 importer = vma;
524 } else if (end > next->vm_start) { 525 } else if (end > next->vm_start) {
525 /* 526 /*
@@ -527,7 +528,7 @@ again: remove_next = 1 + (end > next->vm_end);
527 * mprotect case 5 shifting the boundary up. 528 * mprotect case 5 shifting the boundary up.
528 */ 529 */
529 adjust_next = (end - next->vm_start) >> PAGE_SHIFT; 530 adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
530 anon_vma = next->anon_vma; 531 exporter = next;
531 importer = vma; 532 importer = vma;
532 } else if (end < vma->vm_end) { 533 } else if (end < vma->vm_end) {
533 /* 534 /*
@@ -536,28 +537,19 @@ again: remove_next = 1 + (end > next->vm_end);
536 * mprotect case 4 shifting the boundary down. 537 * mprotect case 4 shifting the boundary down.
537 */ 538 */
538 adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); 539 adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
539 anon_vma = next->anon_vma; 540 exporter = vma;
540 importer = next; 541 importer = next;
541 } 542 }
542 }
543 543
544 /*
545 * When changing only vma->vm_end, we don't really need anon_vma lock.
546 */
547 if (vma->anon_vma && (insert || importer || start != vma->vm_start))
548 anon_vma = vma->anon_vma;
549 if (anon_vma) {
550 /* 544 /*
551 * Easily overlooked: when mprotect shifts the boundary, 545 * Easily overlooked: when mprotect shifts the boundary,
552 * make sure the expanding vma has anon_vma set if the 546 * make sure the expanding vma has anon_vma set if the
553 * shrinking vma had, to cover any anon pages imported. 547 * shrinking vma had, to cover any anon pages imported.
554 */ 548 */
555 if (importer && !importer->anon_vma) { 549 if (exporter && exporter->anon_vma && !importer->anon_vma) {
556 /* Block reverse map lookups until things are set up. */ 550 if (anon_vma_clone(importer, exporter))
557 if (anon_vma_clone(importer, vma)) {
558 return -ENOMEM; 551 return -ENOMEM;
559 } 552 importer->anon_vma = exporter->anon_vma;
560 importer->anon_vma = anon_vma;
561 } 553 }
562 } 554 }
563 555
@@ -825,6 +817,61 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
825} 817}
826 818
827/* 819/*
820 * Rough compatbility check to quickly see if it's even worth looking
821 * at sharing an anon_vma.
822 *
823 * They need to have the same vm_file, and the flags can only differ
824 * in things that mprotect may change.
825 *
826 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
827 * we can merge the two vma's. For example, we refuse to merge a vma if
828 * there is a vm_ops->close() function, because that indicates that the
829 * driver is doing some kind of reference counting. But that doesn't
830 * really matter for the anon_vma sharing case.
831 */
832static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
833{
834 return a->vm_end == b->vm_start &&
835 mpol_equal(vma_policy(a), vma_policy(b)) &&
836 a->vm_file == b->vm_file &&
837 !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) &&
838 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
839}
840
841/*
842 * Do some basic sanity checking to see if we can re-use the anon_vma
843 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
844 * the same as 'old', the other will be the new one that is trying
845 * to share the anon_vma.
846 *
847 * NOTE! This runs with mm_sem held for reading, so it is possible that
848 * the anon_vma of 'old' is concurrently in the process of being set up
849 * by another page fault trying to merge _that_. But that's ok: if it
850 * is being set up, that automatically means that it will be a singleton
851 * acceptable for merging, so we can do all of this optimistically. But
852 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer.
853 *
854 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
855 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
856 * is to return an anon_vma that is "complex" due to having gone through
857 * a fork).
858 *
859 * We also make sure that the two vma's are compatible (adjacent,
860 * and with the same memory policies). That's all stable, even with just
861 * a read lock on the mm_sem.
862 */
863static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
864{
865 if (anon_vma_compatible(a, b)) {
866 struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma);
867
868 if (anon_vma && list_is_singular(&old->anon_vma_chain))
869 return anon_vma;
870 }
871 return NULL;
872}
873
874/*
828 * find_mergeable_anon_vma is used by anon_vma_prepare, to check 875 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
829 * neighbouring vmas for a suitable anon_vma, before it goes off 876 * neighbouring vmas for a suitable anon_vma, before it goes off
830 * to allocate a new anon_vma. It checks because a repetitive 877 * to allocate a new anon_vma. It checks because a repetitive
@@ -834,28 +881,16 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
834 */ 881 */
835struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) 882struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
836{ 883{
884 struct anon_vma *anon_vma;
837 struct vm_area_struct *near; 885 struct vm_area_struct *near;
838 unsigned long vm_flags;
839 886
840 near = vma->vm_next; 887 near = vma->vm_next;
841 if (!near) 888 if (!near)
842 goto try_prev; 889 goto try_prev;
843 890
844 /* 891 anon_vma = reusable_anon_vma(near, vma, near);
845 * Since only mprotect tries to remerge vmas, match flags 892 if (anon_vma)
846 * which might be mprotected into each other later on. 893 return anon_vma;
847 * Neither mlock nor madvise tries to remerge at present,
848 * so leave their flags as obstructing a merge.
849 */
850 vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
851 vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
852
853 if (near->anon_vma && vma->vm_end == near->vm_start &&
854 mpol_equal(vma_policy(vma), vma_policy(near)) &&
855 can_vma_merge_before(near, vm_flags,
856 NULL, vma->vm_file, vma->vm_pgoff +
857 ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
858 return near->anon_vma;
859try_prev: 894try_prev:
860 /* 895 /*
861 * It is potentially slow to have to call find_vma_prev here. 896 * It is potentially slow to have to call find_vma_prev here.
@@ -868,14 +903,9 @@ try_prev:
868 if (!near) 903 if (!near)
869 goto none; 904 goto none;
870 905
871 vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); 906 anon_vma = reusable_anon_vma(near, near, vma);
872 vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); 907 if (anon_vma)
873 908 return anon_vma;
874 if (near->anon_vma && near->vm_end == vma->vm_start &&
875 mpol_equal(vma_policy(near), vma_policy(vma)) &&
876 can_vma_merge_after(near, vm_flags,
877 NULL, vma->vm_file, vma->vm_pgoff))
878 return near->anon_vma;
879none: 909none:
880 /* 910 /*
881 * There's no absolute need to look only at touching neighbours: 911 * There's no absolute need to look only at touching neighbours:
@@ -1947,7 +1977,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1947 return 0; 1977 return 0;
1948 1978
1949 /* Clean everything up if vma_adjust failed. */ 1979 /* Clean everything up if vma_adjust failed. */
1950 new->vm_ops->close(new); 1980 if (new->vm_ops && new->vm_ops->close)
1981 new->vm_ops->close(new);
1951 if (new->vm_file) { 1982 if (new->vm_file) {
1952 if (vma->vm_flags & VM_EXECUTABLE) 1983 if (vma->vm_flags & VM_EXECUTABLE)
1953 removed_exe_file_vma(mm); 1984 removed_exe_file_vma(mm);
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 0777654147c..9e82e937000 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -53,6 +53,7 @@ void unuse_mm(struct mm_struct *mm)
53 struct task_struct *tsk = current; 53 struct task_struct *tsk = current;
54 54
55 task_lock(tsk); 55 task_lock(tsk);
56 sync_mm_rss(tsk, mm);
56 tsk->mm = NULL; 57 tsk->mm = NULL;
57 /* active_mm is still 'mm' */ 58 /* active_mm is still 'mm' */
58 enter_lazy_tlb(mm, tsk); 59 enter_lazy_tlb(mm, tsk);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 7e33f2cb3c7..438951d366f 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -16,6 +16,7 @@
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/rcupdate.h> 17#include <linux/rcupdate.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/slab.h>
19 20
20/* 21/*
21 * This function can't run concurrently against mmu_notifier_register 22 * This function can't run concurrently against mmu_notifier_register
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8bc969d8112..2d1bf7cf885 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -10,7 +10,6 @@
10 10
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/hugetlb.h> 12#include <linux/hugetlb.h>
13#include <linux/slab.h>
14#include <linux/shm.h> 13#include <linux/shm.h>
15#include <linux/mman.h> 14#include <linux/mman.h>
16#include <linux/fs.h> 15#include <linux/fs.h>
diff --git a/mm/mremap.c b/mm/mremap.c
index e9c75efce60..cde56ee51ef 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -9,7 +9,6 @@
9 9
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/hugetlb.h> 11#include <linux/hugetlb.h>
12#include <linux/slab.h>
13#include <linux/shm.h> 12#include <linux/shm.h>
14#include <linux/ksm.h> 13#include <linux/ksm.h>
15#include <linux/mman.h> 14#include <linux/mman.h>
diff --git a/mm/msync.c b/mm/msync.c
index 4083209b7f0..632df4527c0 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -82,7 +82,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
82 (vma->vm_flags & VM_SHARED)) { 82 (vma->vm_flags & VM_SHARED)) {
83 get_file(file); 83 get_file(file);
84 up_read(&mm->mmap_sem); 84 up_read(&mm->mmap_sem);
85 error = vfs_fsync(file, file->f_path.dentry, 0); 85 error = vfs_fsync(file, 0);
86 fput(file); 86 fput(file);
87 if (error || start >= end) 87 if (error || start >= end)
88 goto out; 88 goto out;
diff --git a/mm/nommu.c b/mm/nommu.c
index 605ace8982a..63fa17d121f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -146,7 +146,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
146 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 146 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
147 147
148 for (i = 0; i < nr_pages; i++) { 148 for (i = 0; i < nr_pages; i++) {
149 vma = find_extend_vma(mm, start); 149 vma = find_vma(mm, start);
150 if (!vma) 150 if (!vma)
151 goto finish_or_fault; 151 goto finish_or_fault;
152 152
@@ -162,7 +162,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
162 } 162 }
163 if (vmas) 163 if (vmas)
164 vmas[i] = vma; 164 vmas[i] = vma;
165 start += PAGE_SIZE; 165 start = (start + PAGE_SIZE) & PAGE_MASK;
166 } 166 }
167 167
168 return i; 168 return i;
@@ -764,7 +764,7 @@ EXPORT_SYMBOL(find_vma);
764 */ 764 */
765struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) 765struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
766{ 766{
767 return find_vma(mm, addr & PAGE_MASK); 767 return find_vma(mm, addr);
768} 768}
769 769
770/* 770/*
@@ -1040,10 +1040,9 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
1040 if (ret != -ENOSYS) 1040 if (ret != -ENOSYS)
1041 return ret; 1041 return ret;
1042 1042
1043 /* getting an ENOSYS error indicates that direct mmap isn't 1043 /* getting -ENOSYS indicates that direct mmap isn't possible (as
1044 * possible (as opposed to tried but failed) so we'll fall 1044 * opposed to tried but failed) so we can only give a suitable error as
1045 * through to making a private copy of the data and mapping 1045 * it's not possible to make a private copy if MAP_SHARED was given */
1046 * that if we can */
1047 return -ENODEV; 1046 return -ENODEV;
1048} 1047}
1049 1048
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9b223af6a14..b68e802a7a7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -18,6 +18,7 @@
18#include <linux/oom.h> 18#include <linux/oom.h>
19#include <linux/mm.h> 19#include <linux/mm.h>
20#include <linux/err.h> 20#include <linux/err.h>
21#include <linux/gfp.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
22#include <linux/swap.h> 23#include <linux/swap.h>
23#include <linux/timex.h> 24#include <linux/timex.h>
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943ecf8..b289310e2c8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -597,7 +597,7 @@ static void balance_dirty_pages(struct address_space *mapping,
597 (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) 597 (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
598 + global_page_state(NR_UNSTABLE_NFS)) 598 + global_page_state(NR_UNSTABLE_NFS))
599 > background_thresh))) 599 > background_thresh)))
600 bdi_start_writeback(bdi, NULL, 0); 600 bdi_start_writeback(bdi, NULL, 0, 0);
601} 601}
602 602
603void set_page_dirty_balance(struct page *page, int page_mkwrite) 603void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -683,10 +683,6 @@ void throttle_vm_writeout(gfp_t gfp_mask)
683 } 683 }
684} 684}
685 685
686static void laptop_timer_fn(unsigned long unused);
687
688static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
689
690/* 686/*
691 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs 687 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
692 */ 688 */
@@ -694,24 +690,24 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
694 void __user *buffer, size_t *length, loff_t *ppos) 690 void __user *buffer, size_t *length, loff_t *ppos)
695{ 691{
696 proc_dointvec(table, write, buffer, length, ppos); 692 proc_dointvec(table, write, buffer, length, ppos);
693 bdi_arm_supers_timer();
697 return 0; 694 return 0;
698} 695}
699 696
700static void do_laptop_sync(struct work_struct *work) 697#ifdef CONFIG_BLOCK
698void laptop_mode_timer_fn(unsigned long data)
701{ 699{
702 wakeup_flusher_threads(0); 700 struct request_queue *q = (struct request_queue *)data;
703 kfree(work); 701 int nr_pages = global_page_state(NR_FILE_DIRTY) +
704} 702 global_page_state(NR_UNSTABLE_NFS);
705 703
706static void laptop_timer_fn(unsigned long unused) 704 /*
707{ 705 * We want to write everything out, not just down to the dirty
708 struct work_struct *work; 706 * threshold
707 */
709 708
710 work = kmalloc(sizeof(*work), GFP_ATOMIC); 709 if (bdi_has_dirty_io(&q->backing_dev_info))
711 if (work) { 710 bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages, 0);
712 INIT_WORK(work, do_laptop_sync);
713 schedule_work(work);
714 }
715} 711}
716 712
717/* 713/*
@@ -719,9 +715,9 @@ static void laptop_timer_fn(unsigned long unused)
719 * of all dirty data a few seconds from now. If the flush is already scheduled 715 * of all dirty data a few seconds from now. If the flush is already scheduled
720 * then push it back - the user is still using the disk. 716 * then push it back - the user is still using the disk.
721 */ 717 */
722void laptop_io_completion(void) 718void laptop_io_completion(struct backing_dev_info *info)
723{ 719{
724 mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); 720 mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
725} 721}
726 722
727/* 723/*
@@ -731,8 +727,16 @@ void laptop_io_completion(void)
731 */ 727 */
732void laptop_sync_completion(void) 728void laptop_sync_completion(void)
733{ 729{
734 del_timer(&laptop_mode_wb_timer); 730 struct backing_dev_info *bdi;
731
732 rcu_read_lock();
733
734 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
735 del_timer(&bdi->laptop_mode_wb_timer);
736
737 rcu_read_unlock();
735} 738}
739#endif
736 740
737/* 741/*
738 * If ratelimit_pages is too high then we can get into dirty-data overload 742 * If ratelimit_pages is too high then we can get into dirty-data overload
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d03c946d556..a6326c71b66 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2579,7 +2579,7 @@ static int default_zonelist_order(void)
2579 struct zone *z; 2579 struct zone *z;
2580 int average_size; 2580 int average_size;
2581 /* 2581 /*
2582 * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem. 2582 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
2583 * If they are really small and used heavily, the system can fall 2583 * If they are really small and used heavily, the system can fall
2584 * into OOM very easily. 2584 * into OOM very easily.
2585 * This function detect ZONE_DMA/DMA32 size and confgigures zone order. 2585 * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
diff --git a/mm/page_io.c b/mm/page_io.c
index a19af956ee1..31a3b962230 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -12,6 +12,7 @@
12 12
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/gfp.h>
15#include <linux/pagemap.h> 16#include <linux/pagemap.h>
16#include <linux/swap.h> 17#include <linux/swap.h>
17#include <linux/bio.h> 18#include <linux/bio.h>
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 7b47a57b664..8b1a2ce21ee 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -80,6 +80,37 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
80 return err; 80 return err;
81} 81}
82 82
83#ifdef CONFIG_HUGETLB_PAGE
84static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
85 unsigned long end)
86{
87 unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
88 return boundary < end ? boundary : end;
89}
90
91static int walk_hugetlb_range(struct vm_area_struct *vma,
92 unsigned long addr, unsigned long end,
93 struct mm_walk *walk)
94{
95 struct hstate *h = hstate_vma(vma);
96 unsigned long next;
97 unsigned long hmask = huge_page_mask(h);
98 pte_t *pte;
99 int err = 0;
100
101 do {
102 next = hugetlb_entry_end(h, addr, end);
103 pte = huge_pte_offset(walk->mm, addr & hmask);
104 if (pte && walk->hugetlb_entry)
105 err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
106 if (err)
107 return err;
108 } while (addr = next, addr != end);
109
110 return 0;
111}
112#endif
113
83/** 114/**
84 * walk_page_range - walk a memory map's page tables with a callback 115 * walk_page_range - walk a memory map's page tables with a callback
85 * @mm: memory map to walk 116 * @mm: memory map to walk
@@ -128,20 +159,16 @@ int walk_page_range(unsigned long addr, unsigned long end,
128 vma = find_vma(walk->mm, addr); 159 vma = find_vma(walk->mm, addr);
129#ifdef CONFIG_HUGETLB_PAGE 160#ifdef CONFIG_HUGETLB_PAGE
130 if (vma && is_vm_hugetlb_page(vma)) { 161 if (vma && is_vm_hugetlb_page(vma)) {
131 pte_t *pte;
132 struct hstate *hs;
133
134 if (vma->vm_end < next) 162 if (vma->vm_end < next)
135 next = vma->vm_end; 163 next = vma->vm_end;
136 hs = hstate_vma(vma); 164 /*
137 pte = huge_pte_offset(walk->mm, 165 * Hugepage is very tightly coupled with vma, so
138 addr & huge_page_mask(hs)); 166 * walk through hugetlb entries within a given vma.
139 if (pte && !huge_pte_none(huge_ptep_get(pte)) 167 */
140 && walk->hugetlb_entry) 168 err = walk_hugetlb_range(vma, addr, next, walk);
141 err = walk->hugetlb_entry(pte, addr,
142 next, walk);
143 if (err) 169 if (err)
144 break; 170 break;
171 pgd = pgd_offset(walk->mm, next);
145 continue; 172 continue;
146 } 173 }
147#endif 174#endif
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
new file mode 100644
index 00000000000..df680855540
--- /dev/null
+++ b/mm/percpu-km.c
@@ -0,0 +1,104 @@
1/*
2 * mm/percpu-km.c - kernel memory based chunk allocation
3 *
4 * Copyright (C) 2010 SUSE Linux Products GmbH
5 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
6 *
7 * This file is released under the GPLv2.
8 *
9 * Chunks are allocated as a contiguous kernel memory using gfp
10 * allocation. This is to be used on nommu architectures.
11 *
12 * To use percpu-km,
13 *
14 * - define CONFIG_NEED_PER_CPU_KM from the arch Kconfig.
15 *
16 * - CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK must not be defined. It's
17 * not compatible with PER_CPU_KM. EMBED_FIRST_CHUNK should work
18 * fine.
19 *
20 * - NUMA is not supported. When setting up the first chunk,
21 * @cpu_distance_fn should be NULL or report all CPUs to be nearer
22 * than or at LOCAL_DISTANCE.
23 *
24 * - It's best if the chunk size is power of two multiple of
25 * PAGE_SIZE. Because each chunk is allocated as a contiguous
26 * kernel memory block using alloc_pages(), memory will be wasted if
27 * chunk size is not aligned. percpu-km code will whine about it.
28 */
29
30#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
31#error "contiguous percpu allocation is incompatible with paged first chunk"
32#endif
33
34#include <linux/log2.h>
35
36static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
37{
38 /* noop */
39 return 0;
40}
41
42static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
43{
44 /* nada */
45}
46
47static struct pcpu_chunk *pcpu_create_chunk(void)
48{
49 const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
50 struct pcpu_chunk *chunk;
51 struct page *pages;
52 int i;
53
54 chunk = pcpu_alloc_chunk();
55 if (!chunk)
56 return NULL;
57
58 pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
59 if (!pages) {
60 pcpu_free_chunk(chunk);
61 return NULL;
62 }
63
64 for (i = 0; i < nr_pages; i++)
65 pcpu_set_page_chunk(nth_page(pages, i), chunk);
66
67 chunk->data = pages;
68 chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
69 return chunk;
70}
71
72static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
73{
74 const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
75
76 if (chunk && chunk->data)
77 __free_pages(chunk->data, order_base_2(nr_pages));
78 pcpu_free_chunk(chunk);
79}
80
81static struct page *pcpu_addr_to_page(void *addr)
82{
83 return virt_to_page(addr);
84}
85
86static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
87{
88 size_t nr_pages, alloc_pages;
89
90 /* all units must be in a single group */
91 if (ai->nr_groups != 1) {
92 printk(KERN_CRIT "percpu: can't handle more than one groups\n");
93 return -EINVAL;
94 }
95
96 nr_pages = (ai->groups[0].nr_units * ai->unit_size) >> PAGE_SHIFT;
97 alloc_pages = roundup_pow_of_two(nr_pages);
98
99 if (alloc_pages > nr_pages)
100 printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n",
101 alloc_pages - nr_pages);
102
103 return 0;
104}
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
new file mode 100644
index 00000000000..7d9c1d0ebd3
--- /dev/null
+++ b/mm/percpu-vm.c
@@ -0,0 +1,451 @@
1/*
2 * mm/percpu-vm.c - vmalloc area based chunk allocation
3 *
4 * Copyright (C) 2010 SUSE Linux Products GmbH
5 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
6 *
7 * This file is released under the GPLv2.
8 *
9 * Chunks are mapped into vmalloc areas and populated page by page.
10 * This is the default chunk allocator.
11 */
12
13static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
14 unsigned int cpu, int page_idx)
15{
16 /* must not be used on pre-mapped chunk */
17 WARN_ON(chunk->immutable);
18
19 return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
20}
21
22/**
23 * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
24 * @chunk: chunk of interest
25 * @bitmapp: output parameter for bitmap
26 * @may_alloc: may allocate the array
27 *
28 * Returns pointer to array of pointers to struct page and bitmap,
29 * both of which can be indexed with pcpu_page_idx(). The returned
30 * array is cleared to zero and *@bitmapp is copied from
31 * @chunk->populated. Note that there is only one array and bitmap
32 * and access exclusion is the caller's responsibility.
33 *
34 * CONTEXT:
35 * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
36 * Otherwise, don't care.
37 *
38 * RETURNS:
39 * Pointer to temp pages array on success, NULL on failure.
40 */
41static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
42 unsigned long **bitmapp,
43 bool may_alloc)
44{
45 static struct page **pages;
46 static unsigned long *bitmap;
47 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
48 size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
49 sizeof(unsigned long);
50
51 if (!pages || !bitmap) {
52 if (may_alloc && !pages)
53 pages = pcpu_mem_alloc(pages_size);
54 if (may_alloc && !bitmap)
55 bitmap = pcpu_mem_alloc(bitmap_size);
56 if (!pages || !bitmap)
57 return NULL;
58 }
59
60 memset(pages, 0, pages_size);
61 bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
62
63 *bitmapp = bitmap;
64 return pages;
65}
66
67/**
68 * pcpu_free_pages - free pages which were allocated for @chunk
69 * @chunk: chunk pages were allocated for
70 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
71 * @populated: populated bitmap
72 * @page_start: page index of the first page to be freed
73 * @page_end: page index of the last page to be freed + 1
74 *
75 * Free pages [@page_start and @page_end) in @pages for all units.
76 * The pages were allocated for @chunk.
77 */
78static void pcpu_free_pages(struct pcpu_chunk *chunk,
79 struct page **pages, unsigned long *populated,
80 int page_start, int page_end)
81{
82 unsigned int cpu;
83 int i;
84
85 for_each_possible_cpu(cpu) {
86 for (i = page_start; i < page_end; i++) {
87 struct page *page = pages[pcpu_page_idx(cpu, i)];
88
89 if (page)
90 __free_page(page);
91 }
92 }
93}
94
95/**
96 * pcpu_alloc_pages - allocates pages for @chunk
97 * @chunk: target chunk
98 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
99 * @populated: populated bitmap
100 * @page_start: page index of the first page to be allocated
101 * @page_end: page index of the last page to be allocated + 1
102 *
103 * Allocate pages [@page_start,@page_end) into @pages for all units.
104 * The allocation is for @chunk. Percpu core doesn't care about the
105 * content of @pages and will pass it verbatim to pcpu_map_pages().
106 */
107static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
108 struct page **pages, unsigned long *populated,
109 int page_start, int page_end)
110{
111 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
112 unsigned int cpu;
113 int i;
114
115 for_each_possible_cpu(cpu) {
116 for (i = page_start; i < page_end; i++) {
117 struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
118
119 *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
120 if (!*pagep) {
121 pcpu_free_pages(chunk, pages, populated,
122 page_start, page_end);
123 return -ENOMEM;
124 }
125 }
126 }
127 return 0;
128}
129
130/**
131 * pcpu_pre_unmap_flush - flush cache prior to unmapping
132 * @chunk: chunk the regions to be flushed belongs to
133 * @page_start: page index of the first page to be flushed
134 * @page_end: page index of the last page to be flushed + 1
135 *
136 * Pages in [@page_start,@page_end) of @chunk are about to be
137 * unmapped. Flush cache. As each flushing trial can be very
138 * expensive, issue flush on the whole region at once rather than
139 * doing it for each cpu. This could be an overkill but is more
140 * scalable.
141 */
142static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
143 int page_start, int page_end)
144{
145 flush_cache_vunmap(
146 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
147 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
148}
149
150static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
151{
152 unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
153}
154
155/**
156 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
157 * @chunk: chunk of interest
158 * @pages: pages array which can be used to pass information to free
159 * @populated: populated bitmap
160 * @page_start: page index of the first page to unmap
161 * @page_end: page index of the last page to unmap + 1
162 *
163 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
164 * Corresponding elements in @pages were cleared by the caller and can
165 * be used to carry information to pcpu_free_pages() which will be
166 * called after all unmaps are finished. The caller should call
167 * proper pre/post flush functions.
168 */
169static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
170 struct page **pages, unsigned long *populated,
171 int page_start, int page_end)
172{
173 unsigned int cpu;
174 int i;
175
176 for_each_possible_cpu(cpu) {
177 for (i = page_start; i < page_end; i++) {
178 struct page *page;
179
180 page = pcpu_chunk_page(chunk, cpu, i);
181 WARN_ON(!page);
182 pages[pcpu_page_idx(cpu, i)] = page;
183 }
184 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
185 page_end - page_start);
186 }
187
188 for (i = page_start; i < page_end; i++)
189 __clear_bit(i, populated);
190}
191
192/**
193 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
194 * @chunk: pcpu_chunk the regions to be flushed belong to
195 * @page_start: page index of the first page to be flushed
196 * @page_end: page index of the last page to be flushed + 1
197 *
198 * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
199 * TLB for the regions. This can be skipped if the area is to be
200 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
201 *
202 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
203 * for the whole region.
204 */
205static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
206 int page_start, int page_end)
207{
208 flush_tlb_kernel_range(
209 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
210 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
211}
212
213static int __pcpu_map_pages(unsigned long addr, struct page **pages,
214 int nr_pages)
215{
216 return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
217 PAGE_KERNEL, pages);
218}
219
220/**
221 * pcpu_map_pages - map pages into a pcpu_chunk
222 * @chunk: chunk of interest
223 * @pages: pages array containing pages to be mapped
224 * @populated: populated bitmap
225 * @page_start: page index of the first page to map
226 * @page_end: page index of the last page to map + 1
227 *
228 * For each cpu, map pages [@page_start,@page_end) into @chunk. The
229 * caller is responsible for calling pcpu_post_map_flush() after all
230 * mappings are complete.
231 *
232 * This function is responsible for setting corresponding bits in
233 * @chunk->populated bitmap and whatever is necessary for reverse
234 * lookup (addr -> chunk).
235 */
236static int pcpu_map_pages(struct pcpu_chunk *chunk,
237 struct page **pages, unsigned long *populated,
238 int page_start, int page_end)
239{
240 unsigned int cpu, tcpu;
241 int i, err;
242
243 for_each_possible_cpu(cpu) {
244 err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
245 &pages[pcpu_page_idx(cpu, page_start)],
246 page_end - page_start);
247 if (err < 0)
248 goto err;
249 }
250
251 /* mapping successful, link chunk and mark populated */
252 for (i = page_start; i < page_end; i++) {
253 for_each_possible_cpu(cpu)
254 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
255 chunk);
256 __set_bit(i, populated);
257 }
258
259 return 0;
260
261err:
262 for_each_possible_cpu(tcpu) {
263 if (tcpu == cpu)
264 break;
265 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
266 page_end - page_start);
267 }
268 return err;
269}
270
271/**
272 * pcpu_post_map_flush - flush cache after mapping
273 * @chunk: pcpu_chunk the regions to be flushed belong to
274 * @page_start: page index of the first page to be flushed
275 * @page_end: page index of the last page to be flushed + 1
276 *
277 * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
278 * cache.
279 *
280 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
281 * for the whole region.
282 */
283static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
284 int page_start, int page_end)
285{
286 flush_cache_vmap(
287 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
288 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
289}
290
291/**
292 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
293 * @chunk: chunk of interest
294 * @off: offset to the area to populate
295 * @size: size of the area to populate in bytes
296 *
297 * For each cpu, populate and map pages [@page_start,@page_end) into
298 * @chunk. The area is cleared on return.
299 *
300 * CONTEXT:
301 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
302 */
303static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
304{
305 int page_start = PFN_DOWN(off);
306 int page_end = PFN_UP(off + size);
307 int free_end = page_start, unmap_end = page_start;
308 struct page **pages;
309 unsigned long *populated;
310 unsigned int cpu;
311 int rs, re, rc;
312
313 /* quick path, check whether all pages are already there */
314 rs = page_start;
315 pcpu_next_pop(chunk, &rs, &re, page_end);
316 if (rs == page_start && re == page_end)
317 goto clear;
318
319 /* need to allocate and map pages, this chunk can't be immutable */
320 WARN_ON(chunk->immutable);
321
322 pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
323 if (!pages)
324 return -ENOMEM;
325
326 /* alloc and map */
327 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
328 rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
329 if (rc)
330 goto err_free;
331 free_end = re;
332 }
333
334 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
335 rc = pcpu_map_pages(chunk, pages, populated, rs, re);
336 if (rc)
337 goto err_unmap;
338 unmap_end = re;
339 }
340 pcpu_post_map_flush(chunk, page_start, page_end);
341
342 /* commit new bitmap */
343 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
344clear:
345 for_each_possible_cpu(cpu)
346 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
347 return 0;
348
349err_unmap:
350 pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
351 pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
352 pcpu_unmap_pages(chunk, pages, populated, rs, re);
353 pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
354err_free:
355 pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
356 pcpu_free_pages(chunk, pages, populated, rs, re);
357 return rc;
358}
359
360/**
361 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
362 * @chunk: chunk to depopulate
363 * @off: offset to the area to depopulate
364 * @size: size of the area to depopulate in bytes
365 * @flush: whether to flush cache and tlb or not
366 *
367 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
368 * from @chunk. If @flush is true, vcache is flushed before unmapping
369 * and tlb after.
370 *
371 * CONTEXT:
372 * pcpu_alloc_mutex.
373 */
374static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
375{
376 int page_start = PFN_DOWN(off);
377 int page_end = PFN_UP(off + size);
378 struct page **pages;
379 unsigned long *populated;
380 int rs, re;
381
382 /* quick path, check whether it's empty already */
383 rs = page_start;
384 pcpu_next_unpop(chunk, &rs, &re, page_end);
385 if (rs == page_start && re == page_end)
386 return;
387
388 /* immutable chunks can't be depopulated */
389 WARN_ON(chunk->immutable);
390
391 /*
392 * If control reaches here, there must have been at least one
393 * successful population attempt so the temp pages array must
394 * be available now.
395 */
396 pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
397 BUG_ON(!pages);
398
399 /* unmap and free */
400 pcpu_pre_unmap_flush(chunk, page_start, page_end);
401
402 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
403 pcpu_unmap_pages(chunk, pages, populated, rs, re);
404
405 /* no need to flush tlb, vmalloc will handle it lazily */
406
407 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
408 pcpu_free_pages(chunk, pages, populated, rs, re);
409
410 /* commit new bitmap */
411 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
412}
413
414static struct pcpu_chunk *pcpu_create_chunk(void)
415{
416 struct pcpu_chunk *chunk;
417 struct vm_struct **vms;
418
419 chunk = pcpu_alloc_chunk();
420 if (!chunk)
421 return NULL;
422
423 vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
424 pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL);
425 if (!vms) {
426 pcpu_free_chunk(chunk);
427 return NULL;
428 }
429
430 chunk->data = vms;
431 chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];
432 return chunk;
433}
434
435static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
436{
437 if (chunk && chunk->data)
438 pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);
439 pcpu_free_chunk(chunk);
440}
441
/**
 * pcpu_addr_to_page - translate an address to its struct page
 * @addr: address to translate
 *
 * RETURNS:
 * Whatever vmalloc_to_page() returns for @addr.
 */
static struct page *pcpu_addr_to_page(void *addr)
{
	struct page *page = vmalloc_to_page(addr);

	return page;
}
446
447static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
448{
449 /* no extra restriction */
450 return 0;
451}
diff --git a/mm/percpu.c b/mm/percpu.c
index 768419d44ad..39f7dfd5958 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * linux/mm/percpu.c - percpu memory allocator 2 * mm/percpu.c - percpu memory allocator
3 * 3 *
4 * Copyright (C) 2009 SUSE Linux Products GmbH 4 * Copyright (C) 2009 SUSE Linux Products GmbH
5 * Copyright (C) 2009 Tejun Heo <tj@kernel.org> 5 * Copyright (C) 2009 Tejun Heo <tj@kernel.org>
@@ -7,14 +7,13 @@
7 * This file is released under the GPLv2. 7 * This file is released under the GPLv2.
8 * 8 *
9 * This is percpu allocator which can handle both static and dynamic 9 * This is percpu allocator which can handle both static and dynamic
10 * areas. Percpu areas are allocated in chunks in vmalloc area. Each 10 * areas. Percpu areas are allocated in chunks. Each chunk is
11 * chunk is consisted of boot-time determined number of units and the 11 * consisted of boot-time determined number of units and the first
12 * first chunk is used for static percpu variables in the kernel image 12 * chunk is used for static percpu variables in the kernel image
13 * (special boot time alloc/init handling necessary as these areas 13 * (special boot time alloc/init handling necessary as these areas
14 * need to be brought up before allocation services are running). 14 * need to be brought up before allocation services are running).
15 * Unit grows as necessary and all units grow or shrink in unison. 15 * Unit grows as necessary and all units grow or shrink in unison.
16 * When a chunk is filled up, another chunk is allocated. ie. in 16 * When a chunk is filled up, another chunk is allocated.
17 * vmalloc area
18 * 17 *
19 * c0 c1 c2 18 * c0 c1 c2
20 * ------------------- ------------------- ------------ 19 * ------------------- ------------------- ------------
@@ -99,7 +98,7 @@ struct pcpu_chunk {
99 int map_used; /* # of map entries used */ 98 int map_used; /* # of map entries used */
100 int map_alloc; /* # of map entries allocated */ 99 int map_alloc; /* # of map entries allocated */
101 int *map; /* allocation map */ 100 int *map; /* allocation map */
102 struct vm_struct **vms; /* mapped vmalloc regions */ 101 void *data; /* chunk data */
103 bool immutable; /* no [de]population allowed */ 102 bool immutable; /* no [de]population allowed */
104 unsigned long populated[]; /* populated bitmap */ 103 unsigned long populated[]; /* populated bitmap */
105}; 104};
@@ -177,6 +176,21 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
177static void pcpu_reclaim(struct work_struct *work); 176static void pcpu_reclaim(struct work_struct *work);
178static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); 177static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
179 178
179static bool pcpu_addr_in_first_chunk(void *addr)
180{
181 void *first_start = pcpu_first_chunk->base_addr;
182
183 return addr >= first_start && addr < first_start + pcpu_unit_size;
184}
185
186static bool pcpu_addr_in_reserved_chunk(void *addr)
187{
188 void *first_start = pcpu_first_chunk->base_addr;
189
190 return addr >= first_start &&
191 addr < first_start + pcpu_reserved_chunk_limit;
192}
193
180static int __pcpu_size_to_slot(int size) 194static int __pcpu_size_to_slot(int size)
181{ 195{
182 int highbit = fls(size); /* size is in bytes */ 196 int highbit = fls(size); /* size is in bytes */
@@ -198,27 +212,6 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
198 return pcpu_size_to_slot(chunk->free_size); 212 return pcpu_size_to_slot(chunk->free_size);
199} 213}
200 214
201static int pcpu_page_idx(unsigned int cpu, int page_idx)
202{
203 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
204}
205
206static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
207 unsigned int cpu, int page_idx)
208{
209 return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
210 (page_idx << PAGE_SHIFT);
211}
212
213static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
214 unsigned int cpu, int page_idx)
215{
216 /* must not be used on pre-mapped chunk */
217 WARN_ON(chunk->immutable);
218
219 return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
220}
221
222/* set the pointer to a chunk in a page struct */ 215/* set the pointer to a chunk in a page struct */
223static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) 216static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
224{ 217{
@@ -231,13 +224,27 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
231 return (struct pcpu_chunk *)page->index; 224 return (struct pcpu_chunk *)page->index;
232} 225}
233 226
234static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end) 227static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
228{
229 return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
230}
231
232static unsigned long __maybe_unused pcpu_chunk_addr(struct pcpu_chunk *chunk,
233 unsigned int cpu, int page_idx)
234{
235 return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
236 (page_idx << PAGE_SHIFT);
237}
238
239static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
240 int *rs, int *re, int end)
235{ 241{
236 *rs = find_next_zero_bit(chunk->populated, end, *rs); 242 *rs = find_next_zero_bit(chunk->populated, end, *rs);
237 *re = find_next_bit(chunk->populated, end, *rs + 1); 243 *re = find_next_bit(chunk->populated, end, *rs + 1);
238} 244}
239 245
240static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end) 246static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
247 int *rs, int *re, int end)
241{ 248{
242 *rs = find_next_bit(chunk->populated, end, *rs); 249 *rs = find_next_bit(chunk->populated, end, *rs);
243 *re = find_next_zero_bit(chunk->populated, end, *rs + 1); 250 *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
@@ -326,36 +333,6 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
326} 333}
327 334
328/** 335/**
329 * pcpu_chunk_addr_search - determine chunk containing specified address
330 * @addr: address for which the chunk needs to be determined.
331 *
332 * RETURNS:
333 * The address of the found chunk.
334 */
335static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
336{
337 void *first_start = pcpu_first_chunk->base_addr;
338
339 /* is it in the first chunk? */
340 if (addr >= first_start && addr < first_start + pcpu_unit_size) {
341 /* is it in the reserved area? */
342 if (addr < first_start + pcpu_reserved_chunk_limit)
343 return pcpu_reserved_chunk;
344 return pcpu_first_chunk;
345 }
346
347 /*
348 * The address is relative to unit0 which might be unused and
349 * thus unmapped. Offset the address to the unit space of the
350 * current processor before looking it up in the vmalloc
351 * space. Note that any possible cpu id can be used here, so
352 * there's no need to worry about preemption or cpu hotplug.
353 */
354 addr += pcpu_unit_offsets[raw_smp_processor_id()];
355 return pcpu_get_page_chunk(vmalloc_to_page(addr));
356}
357
358/**
359 * pcpu_need_to_extend - determine whether chunk area map needs to be extended 336 * pcpu_need_to_extend - determine whether chunk area map needs to be extended
360 * @chunk: chunk of interest 337 * @chunk: chunk of interest
361 * 338 *
@@ -623,434 +600,92 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
623 pcpu_chunk_relocate(chunk, oslot); 600 pcpu_chunk_relocate(chunk, oslot);
624} 601}
625 602
626/** 603static struct pcpu_chunk *pcpu_alloc_chunk(void)
627 * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
628 * @chunk: chunk of interest
629 * @bitmapp: output parameter for bitmap
630 * @may_alloc: may allocate the array
631 *
632 * Returns pointer to array of pointers to struct page and bitmap,
633 * both of which can be indexed with pcpu_page_idx(). The returned
634 * array is cleared to zero and *@bitmapp is copied from
635 * @chunk->populated. Note that there is only one array and bitmap
636 * and access exclusion is the caller's responsibility.
637 *
638 * CONTEXT:
639 * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
640 * Otherwise, don't care.
641 *
642 * RETURNS:
643 * Pointer to temp pages array on success, NULL on failure.
644 */
645static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
646 unsigned long **bitmapp,
647 bool may_alloc)
648{
649 static struct page **pages;
650 static unsigned long *bitmap;
651 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
652 size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
653 sizeof(unsigned long);
654
655 if (!pages || !bitmap) {
656 if (may_alloc && !pages)
657 pages = pcpu_mem_alloc(pages_size);
658 if (may_alloc && !bitmap)
659 bitmap = pcpu_mem_alloc(bitmap_size);
660 if (!pages || !bitmap)
661 return NULL;
662 }
663
664 memset(pages, 0, pages_size);
665 bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
666
667 *bitmapp = bitmap;
668 return pages;
669}
670
671/**
672 * pcpu_free_pages - free pages which were allocated for @chunk
673 * @chunk: chunk pages were allocated for
674 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
675 * @populated: populated bitmap
676 * @page_start: page index of the first page to be freed
677 * @page_end: page index of the last page to be freed + 1
678 *
679 * Free pages [@page_start and @page_end) in @pages for all units.
680 * The pages were allocated for @chunk.
681 */
682static void pcpu_free_pages(struct pcpu_chunk *chunk,
683 struct page **pages, unsigned long *populated,
684 int page_start, int page_end)
685{ 604{
686 unsigned int cpu; 605 struct pcpu_chunk *chunk;
687 int i;
688 606
689 for_each_possible_cpu(cpu) { 607 chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
690 for (i = page_start; i < page_end; i++) { 608 if (!chunk)
691 struct page *page = pages[pcpu_page_idx(cpu, i)]; 609 return NULL;
692 610
693 if (page) 611 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
694 __free_page(page); 612 if (!chunk->map) {
695 } 613 kfree(chunk);
614 return NULL;
696 } 615 }
697}
698 616
699/** 617 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
700 * pcpu_alloc_pages - allocates pages for @chunk 618 chunk->map[chunk->map_used++] = pcpu_unit_size;
701 * @chunk: target chunk
702 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
703 * @populated: populated bitmap
704 * @page_start: page index of the first page to be allocated
705 * @page_end: page index of the last page to be allocated + 1
706 *
707 * Allocate pages [@page_start,@page_end) into @pages for all units.
708 * The allocation is for @chunk. Percpu core doesn't care about the
709 * content of @pages and will pass it verbatim to pcpu_map_pages().
710 */
711static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
712 struct page **pages, unsigned long *populated,
713 int page_start, int page_end)
714{
715 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
716 unsigned int cpu;
717 int i;
718 619
719 for_each_possible_cpu(cpu) { 620 INIT_LIST_HEAD(&chunk->list);
720 for (i = page_start; i < page_end; i++) { 621 chunk->free_size = pcpu_unit_size;
721 struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; 622 chunk->contig_hint = pcpu_unit_size;
722
723 *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
724 if (!*pagep) {
725 pcpu_free_pages(chunk, pages, populated,
726 page_start, page_end);
727 return -ENOMEM;
728 }
729 }
730 }
731 return 0;
732}
733 623
734/** 624 return chunk;
735 * pcpu_pre_unmap_flush - flush cache prior to unmapping
736 * @chunk: chunk the regions to be flushed belongs to
737 * @page_start: page index of the first page to be flushed
738 * @page_end: page index of the last page to be flushed + 1
739 *
740 * Pages in [@page_start,@page_end) of @chunk are about to be
741 * unmapped. Flush cache. As each flushing trial can be very
742 * expensive, issue flush on the whole region at once rather than
743 * doing it for each cpu. This could be an overkill but is more
744 * scalable.
745 */
746static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
747 int page_start, int page_end)
748{
749 flush_cache_vunmap(
750 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
751 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
752} 625}
753 626
754static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) 627static void pcpu_free_chunk(struct pcpu_chunk *chunk)
755{ 628{
756 unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); 629 if (!chunk)
630 return;
631 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
632 kfree(chunk);
757} 633}
758 634
759/** 635/*
760 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk 636 * Chunk management implementation.
761 * @chunk: chunk of interest 637 *
762 * @pages: pages array which can be used to pass information to free 638 * To allow different implementations, chunk alloc/free and
763 * @populated: populated bitmap 639 * [de]population are implemented in a separate file which is pulled
764 * @page_start: page index of the first page to unmap 640 * into this file and compiled together. The following functions
765 * @page_end: page index of the last page to unmap + 1 641 * should be implemented.
766 * 642 *
767 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. 643 * pcpu_populate_chunk - populate the specified range of a chunk
768 * Corresponding elements in @pages were cleared by the caller and can 644 * pcpu_depopulate_chunk - depopulate the specified range of a chunk
769 * be used to carry information to pcpu_free_pages() which will be 645 * pcpu_create_chunk - create a new chunk
770 * called after all unmaps are finished. The caller should call 646 * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop
771 * proper pre/post flush functions. 647 * pcpu_addr_to_page - translate address to the struct page backing it
648 * pcpu_verify_alloc_info - check alloc_info is acceptable during init
772 */ 649 */
773static void pcpu_unmap_pages(struct pcpu_chunk *chunk, 650static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
774 struct page **pages, unsigned long *populated, 651static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
775 int page_start, int page_end) 652static struct pcpu_chunk *pcpu_create_chunk(void);
776{ 653static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
777 unsigned int cpu; 654static struct page *pcpu_addr_to_page(void *addr);
778 int i; 655static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
779 656
780 for_each_possible_cpu(cpu) { 657#ifdef CONFIG_NEED_PER_CPU_KM
781 for (i = page_start; i < page_end; i++) { 658#include "percpu-km.c"
782 struct page *page; 659#else
783 660#include "percpu-vm.c"
784 page = pcpu_chunk_page(chunk, cpu, i); 661#endif
785 WARN_ON(!page);
786 pages[pcpu_page_idx(cpu, i)] = page;
787 }
788 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
789 page_end - page_start);
790 }
791
792 for (i = page_start; i < page_end; i++)
793 __clear_bit(i, populated);
794}
795 662
796/** 663/**
797 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping 664 * pcpu_chunk_addr_search - determine chunk containing specified address
798 * @chunk: pcpu_chunk the regions to be flushed belong to 665 * @addr: address for which the chunk needs to be determined.
799 * @page_start: page index of the first page to be flushed
800 * @page_end: page index of the last page to be flushed + 1
801 *
802 * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush
803 * TLB for the regions. This can be skipped if the area is to be
804 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
805 * 666 *
806 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once 667 * RETURNS:
807 * for the whole region. 668 * The address of the found chunk.
808 */
809static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
810 int page_start, int page_end)
811{
812 flush_tlb_kernel_range(
813 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
814 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
815}
816
817static int __pcpu_map_pages(unsigned long addr, struct page **pages,
818 int nr_pages)
819{
820 return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
821 PAGE_KERNEL, pages);
822}
823
824/**
825 * pcpu_map_pages - map pages into a pcpu_chunk
826 * @chunk: chunk of interest
827 * @pages: pages array containing pages to be mapped
828 * @populated: populated bitmap
829 * @page_start: page index of the first page to map
830 * @page_end: page index of the last page to map + 1
831 *
832 * For each cpu, map pages [@page_start,@page_end) into @chunk. The
833 * caller is responsible for calling pcpu_post_map_flush() after all
834 * mappings are complete.
835 *
836 * This function is responsible for setting corresponding bits in
837 * @chunk->populated bitmap and whatever is necessary for reverse
838 * lookup (addr -> chunk).
839 */ 669 */
840static int pcpu_map_pages(struct pcpu_chunk *chunk, 670static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
841 struct page **pages, unsigned long *populated,
842 int page_start, int page_end)
843{ 671{
844 unsigned int cpu, tcpu; 672 /* is it in the first chunk? */
845 int i, err; 673 if (pcpu_addr_in_first_chunk(addr)) {
846 674 /* is it in the reserved area? */
847 for_each_possible_cpu(cpu) { 675 if (pcpu_addr_in_reserved_chunk(addr))
848 err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), 676 return pcpu_reserved_chunk;
849 &pages[pcpu_page_idx(cpu, page_start)], 677 return pcpu_first_chunk;
850 page_end - page_start);
851 if (err < 0)
852 goto err;
853 }
854
855 /* mapping successful, link chunk and mark populated */
856 for (i = page_start; i < page_end; i++) {
857 for_each_possible_cpu(cpu)
858 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
859 chunk);
860 __set_bit(i, populated);
861 }
862
863 return 0;
864
865err:
866 for_each_possible_cpu(tcpu) {
867 if (tcpu == cpu)
868 break;
869 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
870 page_end - page_start);
871 } 678 }
872 return err;
873}
874
875/**
876 * pcpu_post_map_flush - flush cache after mapping
877 * @chunk: pcpu_chunk the regions to be flushed belong to
878 * @page_start: page index of the first page to be flushed
879 * @page_end: page index of the last page to be flushed + 1
880 *
881 * Pages [@page_start,@page_end) of @chunk have been mapped. Flush
882 * cache.
883 *
884 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
885 * for the whole region.
886 */
887static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
888 int page_start, int page_end)
889{
890 flush_cache_vmap(
891 pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
892 pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
893}
894
895/**
896 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
897 * @chunk: chunk to depopulate
898 * @off: offset to the area to depopulate
899 * @size: size of the area to depopulate in bytes
900 * @flush: whether to flush cache and tlb or not
901 *
902 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
903 * from @chunk. If @flush is true, vcache is flushed before unmapping
904 * and tlb after.
905 *
906 * CONTEXT:
907 * pcpu_alloc_mutex.
908 */
909static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
910{
911 int page_start = PFN_DOWN(off);
912 int page_end = PFN_UP(off + size);
913 struct page **pages;
914 unsigned long *populated;
915 int rs, re;
916
917 /* quick path, check whether it's empty already */
918 rs = page_start;
919 pcpu_next_unpop(chunk, &rs, &re, page_end);
920 if (rs == page_start && re == page_end)
921 return;
922
923 /* immutable chunks can't be depopulated */
924 WARN_ON(chunk->immutable);
925 679
926 /* 680 /*
927 * If control reaches here, there must have been at least one 681 * The address is relative to unit0 which might be unused and
928 * successful population attempt so the temp pages array must 682 * thus unmapped. Offset the address to the unit space of the
929 * be available now. 683 * current processor before looking it up in the vmalloc
684 * space. Note that any possible cpu id can be used here, so
685 * there's no need to worry about preemption or cpu hotplug.
930 */ 686 */
931 pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); 687 addr += pcpu_unit_offsets[raw_smp_processor_id()];
932 BUG_ON(!pages); 688 return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
933
934 /* unmap and free */
935 pcpu_pre_unmap_flush(chunk, page_start, page_end);
936
937 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
938 pcpu_unmap_pages(chunk, pages, populated, rs, re);
939
940 /* no need to flush tlb, vmalloc will handle it lazily */
941
942 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
943 pcpu_free_pages(chunk, pages, populated, rs, re);
944
945 /* commit new bitmap */
946 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
947}
948
949/**
950 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
951 * @chunk: chunk of interest
952 * @off: offset to the area to populate
953 * @size: size of the area to populate in bytes
954 *
955 * For each cpu, populate and map pages [@page_start,@page_end) into
956 * @chunk. The area is cleared on return.
957 *
958 * CONTEXT:
959 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
960 */
961static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
962{
963 int page_start = PFN_DOWN(off);
964 int page_end = PFN_UP(off + size);
965 int free_end = page_start, unmap_end = page_start;
966 struct page **pages;
967 unsigned long *populated;
968 unsigned int cpu;
969 int rs, re, rc;
970
971 /* quick path, check whether all pages are already there */
972 rs = page_start;
973 pcpu_next_pop(chunk, &rs, &re, page_end);
974 if (rs == page_start && re == page_end)
975 goto clear;
976
977 /* need to allocate and map pages, this chunk can't be immutable */
978 WARN_ON(chunk->immutable);
979
980 pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
981 if (!pages)
982 return -ENOMEM;
983
984 /* alloc and map */
985 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
986 rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
987 if (rc)
988 goto err_free;
989 free_end = re;
990 }
991
992 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
993 rc = pcpu_map_pages(chunk, pages, populated, rs, re);
994 if (rc)
995 goto err_unmap;
996 unmap_end = re;
997 }
998 pcpu_post_map_flush(chunk, page_start, page_end);
999
1000 /* commit new bitmap */
1001 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
1002clear:
1003 for_each_possible_cpu(cpu)
1004 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
1005 return 0;
1006
1007err_unmap:
1008 pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
1009 pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
1010 pcpu_unmap_pages(chunk, pages, populated, rs, re);
1011 pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
1012err_free:
1013 pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
1014 pcpu_free_pages(chunk, pages, populated, rs, re);
1015 return rc;
1016}
1017
1018static void free_pcpu_chunk(struct pcpu_chunk *chunk)
1019{
1020 if (!chunk)
1021 return;
1022 if (chunk->vms)
1023 pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups);
1024 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
1025 kfree(chunk);
1026}
1027
1028static struct pcpu_chunk *alloc_pcpu_chunk(void)
1029{
1030 struct pcpu_chunk *chunk;
1031
1032 chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
1033 if (!chunk)
1034 return NULL;
1035
1036 chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
1037 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
1038 chunk->map[chunk->map_used++] = pcpu_unit_size;
1039
1040 chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
1041 pcpu_nr_groups, pcpu_atom_size,
1042 GFP_KERNEL);
1043 if (!chunk->vms) {
1044 free_pcpu_chunk(chunk);
1045 return NULL;
1046 }
1047
1048 INIT_LIST_HEAD(&chunk->list);
1049 chunk->free_size = pcpu_unit_size;
1050 chunk->contig_hint = pcpu_unit_size;
1051 chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0];
1052
1053 return chunk;
1054} 689}
1055 690
1056/** 691/**
@@ -1142,7 +777,7 @@ restart:
1142 /* hmmm... no space left, create a new chunk */ 777 /* hmmm... no space left, create a new chunk */
1143 spin_unlock_irqrestore(&pcpu_lock, flags); 778 spin_unlock_irqrestore(&pcpu_lock, flags);
1144 779
1145 chunk = alloc_pcpu_chunk(); 780 chunk = pcpu_create_chunk();
1146 if (!chunk) { 781 if (!chunk) {
1147 err = "failed to allocate new chunk"; 782 err = "failed to allocate new chunk";
1148 goto fail_unlock_mutex; 783 goto fail_unlock_mutex;
@@ -1254,7 +889,7 @@ static void pcpu_reclaim(struct work_struct *work)
1254 889
1255 list_for_each_entry_safe(chunk, next, &todo, list) { 890 list_for_each_entry_safe(chunk, next, &todo, list) {
1256 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); 891 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
1257 free_pcpu_chunk(chunk); 892 pcpu_destroy_chunk(chunk);
1258 } 893 }
1259 894
1260 mutex_unlock(&pcpu_alloc_mutex); 895 mutex_unlock(&pcpu_alloc_mutex);
@@ -1304,6 +939,32 @@ void free_percpu(void __percpu *ptr)
1304EXPORT_SYMBOL_GPL(free_percpu); 939EXPORT_SYMBOL_GPL(free_percpu);
1305 940
1306/** 941/**
942 * is_kernel_percpu_address - test whether address is from static percpu area
943 * @addr: address to test
944 *
945 * Test whether @addr belongs to in-kernel static percpu area. Module
946 * static percpu areas are not considered. For those, use
947 * is_module_percpu_address().
948 *
949 * RETURNS:
950 * %true if @addr is from in-kernel static percpu area, %false otherwise.
951 */
952bool is_kernel_percpu_address(unsigned long addr)
953{
954 const size_t static_size = __per_cpu_end - __per_cpu_start;
955 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
956 unsigned int cpu;
957
958 for_each_possible_cpu(cpu) {
959 void *start = per_cpu_ptr(base, cpu);
960
961 if ((void *)addr >= start && (void *)addr < start + static_size)
962 return true;
963 }
964 return false;
965}
966
967/**
1307 * per_cpu_ptr_to_phys - convert translated percpu address to physical address 968 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
1308 * @addr: the address to be converted to physical address 969 * @addr: the address to be converted to physical address
1309 * 970 *
@@ -1317,11 +978,14 @@ EXPORT_SYMBOL_GPL(free_percpu);
1317 */ 978 */
1318phys_addr_t per_cpu_ptr_to_phys(void *addr) 979phys_addr_t per_cpu_ptr_to_phys(void *addr)
1319{ 980{
1320 if ((unsigned long)addr < VMALLOC_START || 981 if (pcpu_addr_in_first_chunk(addr)) {
1321 (unsigned long)addr >= VMALLOC_END) 982 if ((unsigned long)addr < VMALLOC_START ||
1322 return __pa(addr); 983 (unsigned long)addr >= VMALLOC_END)
1323 else 984 return __pa(addr);
1324 return page_to_phys(vmalloc_to_page(addr)); 985 else
986 return page_to_phys(vmalloc_to_page(addr));
987 } else
988 return page_to_phys(pcpu_addr_to_page(addr));
1325} 989}
1326 990
1327static inline size_t pcpu_calc_fc_sizes(size_t static_size, 991static inline size_t pcpu_calc_fc_sizes(size_t static_size,
@@ -1693,6 +1357,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1693 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); 1357 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1694 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); 1358 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
1695 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); 1359 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
1360 PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
1696 1361
1697 /* process group information and build config tables accordingly */ 1362 /* process group information and build config tables accordingly */
1698 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); 1363 group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
diff --git a/mm/percpu_up.c b/mm/percpu_up.c
new file mode 100644
index 00000000000..c4351c7f57d
--- /dev/null
+++ b/mm/percpu_up.c
@@ -0,0 +1,30 @@
1/*
2 * mm/percpu_up.c - dummy percpu memory allocator implementation for UP
3 */
4
5#include <linux/module.h>
6#include <linux/percpu.h>
7#include <linux/slab.h>
8
9void __percpu *__alloc_percpu(size_t size, size_t align)
10{
11 /*
12 * Can't easily make larger alignment work with kmalloc. WARN
13 * on it. Larger alignment should only be used for module
14 * percpu sections on SMP for which this path isn't used.
15 */
16 WARN_ON_ONCE(align > SMP_CACHE_BYTES);
17 return kzalloc(size, GFP_KERNEL);
18}
19EXPORT_SYMBOL_GPL(__alloc_percpu);
20
21void free_percpu(void __percpu *p)
22{
23 kfree(p);
24}
25EXPORT_SYMBOL_GPL(free_percpu);
26
27phys_addr_t per_cpu_ptr_to_phys(void *addr)
28{
29 return __pa(addr);
30}
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 6633965bb27..2876349339a 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -14,6 +14,7 @@
14 */ 14 */
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16 16
17#include <linux/gfp.h>
17#include <linux/mm.h> 18#include <linux/mm.h>
18#include <linux/mmzone.h> 19#include <linux/mmzone.h>
19#include <linux/module.h> 20#include <linux/module.h>
diff --git a/mm/readahead.c b/mm/readahead.c
index 337b20e946f..dfa9a1a03a1 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/gfp.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/blkdev.h> 15#include <linux/blkdev.h>
@@ -502,7 +503,7 @@ void page_cache_sync_readahead(struct address_space *mapping,
502 return; 503 return;
503 504
504 /* be dumb */ 505 /* be dumb */
505 if (filp->f_mode & FMODE_RANDOM) { 506 if (filp && (filp->f_mode & FMODE_RANDOM)) {
506 force_page_cache_readahead(mapping, filp, offset, req_size); 507 force_page_cache_readahead(mapping, filp, offset, req_size);
507 return; 508 return;
508 } 509 }
diff --git a/mm/rmap.c b/mm/rmap.c
index fcd593c9c99..0feeef860a8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -133,8 +133,8 @@ int anon_vma_prepare(struct vm_area_struct *vma)
133 goto out_enomem_free_avc; 133 goto out_enomem_free_avc;
134 allocated = anon_vma; 134 allocated = anon_vma;
135 } 135 }
136 spin_lock(&anon_vma->lock);
137 136
137 spin_lock(&anon_vma->lock);
138 /* page_table_lock to protect against threads */ 138 /* page_table_lock to protect against threads */
139 spin_lock(&mm->page_table_lock); 139 spin_lock(&mm->page_table_lock);
140 if (likely(!vma->anon_vma)) { 140 if (likely(!vma->anon_vma)) {
@@ -144,14 +144,15 @@ int anon_vma_prepare(struct vm_area_struct *vma)
144 list_add(&avc->same_vma, &vma->anon_vma_chain); 144 list_add(&avc->same_vma, &vma->anon_vma_chain);
145 list_add(&avc->same_anon_vma, &anon_vma->head); 145 list_add(&avc->same_anon_vma, &anon_vma->head);
146 allocated = NULL; 146 allocated = NULL;
147 avc = NULL;
147 } 148 }
148 spin_unlock(&mm->page_table_lock); 149 spin_unlock(&mm->page_table_lock);
149
150 spin_unlock(&anon_vma->lock); 150 spin_unlock(&anon_vma->lock);
151 if (unlikely(allocated)) { 151
152 if (unlikely(allocated))
152 anon_vma_free(allocated); 153 anon_vma_free(allocated);
154 if (unlikely(avc))
153 anon_vma_chain_free(avc); 155 anon_vma_chain_free(avc);
154 }
155 } 156 }
156 return 0; 157 return 0;
157 158
@@ -182,7 +183,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
182{ 183{
183 struct anon_vma_chain *avc, *pavc; 184 struct anon_vma_chain *avc, *pavc;
184 185
185 list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) { 186 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
186 avc = anon_vma_chain_alloc(); 187 avc = anon_vma_chain_alloc();
187 if (!avc) 188 if (!avc)
188 goto enomem_failure; 189 goto enomem_failure;
@@ -232,6 +233,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
232 out_error_free_anon_vma: 233 out_error_free_anon_vma:
233 anon_vma_free(anon_vma); 234 anon_vma_free(anon_vma);
234 out_error: 235 out_error:
236 unlink_anon_vmas(vma);
235 return -ENOMEM; 237 return -ENOMEM;
236} 238}
237 239
@@ -334,14 +336,13 @@ vma_address(struct page *page, struct vm_area_struct *vma)
334 336
335/* 337/*
336 * At what user virtual address is page expected in vma? 338 * At what user virtual address is page expected in vma?
337 * checking that the page matches the vma. 339 * Caller should check the page is actually part of the vma.
338 */ 340 */
339unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 341unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
340{ 342{
341 if (PageAnon(page)) { 343 if (PageAnon(page))
342 if (vma->anon_vma != page_anon_vma(page)) 344 ;
343 return -EFAULT; 345 else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
344 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
345 if (!vma->vm_file || 346 if (!vma->vm_file ||
346 vma->vm_file->f_mapping != page->mapping) 347 vma->vm_file->f_mapping != page->mapping)
347 return -EFAULT; 348 return -EFAULT;
@@ -729,13 +730,29 @@ void page_move_anon_rmap(struct page *page,
729 * @page: the page to add the mapping to 730 * @page: the page to add the mapping to
730 * @vma: the vm area in which the mapping is added 731 * @vma: the vm area in which the mapping is added
731 * @address: the user virtual address mapped 732 * @address: the user virtual address mapped
733 * @exclusive: the page is exclusively owned by the current process
732 */ 734 */
733static void __page_set_anon_rmap(struct page *page, 735static void __page_set_anon_rmap(struct page *page,
734 struct vm_area_struct *vma, unsigned long address) 736 struct vm_area_struct *vma, unsigned long address, int exclusive)
735{ 737{
736 struct anon_vma *anon_vma = vma->anon_vma; 738 struct anon_vma *anon_vma = vma->anon_vma;
737 739
738 BUG_ON(!anon_vma); 740 BUG_ON(!anon_vma);
741
742 /*
743 * If the page isn't exclusively mapped into this vma,
744 * we must use the _oldest_ possible anon_vma for the
745 * page mapping!
746 *
747 * So take the last AVC chain entry in the vma, which is
748 * the deepest ancestor, and use the anon_vma from that.
749 */
750 if (!exclusive) {
751 struct anon_vma_chain *avc;
752 avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma);
753 anon_vma = avc->anon_vma;
754 }
755
739 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 756 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
740 page->mapping = (struct address_space *) anon_vma; 757 page->mapping = (struct address_space *) anon_vma;
741 page->index = linear_page_index(vma, address); 758 page->index = linear_page_index(vma, address);
@@ -790,7 +807,7 @@ void page_add_anon_rmap(struct page *page,
790 VM_BUG_ON(!PageLocked(page)); 807 VM_BUG_ON(!PageLocked(page));
791 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 808 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
792 if (first) 809 if (first)
793 __page_set_anon_rmap(page, vma, address); 810 __page_set_anon_rmap(page, vma, address, 0);
794 else 811 else
795 __page_check_anon_rmap(page, vma, address); 812 __page_check_anon_rmap(page, vma, address);
796} 813}
@@ -812,7 +829,7 @@ void page_add_new_anon_rmap(struct page *page,
812 SetPageSwapBacked(page); 829 SetPageSwapBacked(page);
813 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 830 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
814 __inc_zone_page_state(page, NR_ANON_PAGES); 831 __inc_zone_page_state(page, NR_ANON_PAGES);
815 __page_set_anon_rmap(page, vma, address); 832 __page_set_anon_rmap(page, vma, address, 1);
816 if (page_evictable(page, vma)) 833 if (page_evictable(page, vma))
817 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 834 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
818 else 835 else
diff --git a/mm/shmem.c b/mm/shmem.c
index eef4ebea515..0cd7f66f1c6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1545,8 +1545,8 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1545 return 0; 1545 return 0;
1546} 1546}
1547 1547
1548static struct inode *shmem_get_inode(struct super_block *sb, int mode, 1548static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
1549 dev_t dev, unsigned long flags) 1549 int mode, dev_t dev, unsigned long flags)
1550{ 1550{
1551 struct inode *inode; 1551 struct inode *inode;
1552 struct shmem_inode_info *info; 1552 struct shmem_inode_info *info;
@@ -1557,9 +1557,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode,
1557 1557
1558 inode = new_inode(sb); 1558 inode = new_inode(sb);
1559 if (inode) { 1559 if (inode) {
1560 inode->i_mode = mode; 1560 inode_init_owner(inode, dir, mode);
1561 inode->i_uid = current_fsuid();
1562 inode->i_gid = current_fsgid();
1563 inode->i_blocks = 0; 1561 inode->i_blocks = 0;
1564 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1562 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1565 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1563 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -1814,7 +1812,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1814 struct inode *inode; 1812 struct inode *inode;
1815 int error = -ENOSPC; 1813 int error = -ENOSPC;
1816 1814
1817 inode = shmem_get_inode(dir->i_sb, mode, dev, VM_NORESERVE); 1815 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1818 if (inode) { 1816 if (inode) {
1819 error = security_inode_init_security(inode, dir, NULL, NULL, 1817 error = security_inode_init_security(inode, dir, NULL, NULL,
1820 NULL); 1818 NULL);
@@ -1833,11 +1831,6 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1833#else 1831#else
1834 error = 0; 1832 error = 0;
1835#endif 1833#endif
1836 if (dir->i_mode & S_ISGID) {
1837 inode->i_gid = dir->i_gid;
1838 if (S_ISDIR(mode))
1839 inode->i_mode |= S_ISGID;
1840 }
1841 dir->i_size += BOGO_DIRENT_SIZE; 1834 dir->i_size += BOGO_DIRENT_SIZE;
1842 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1835 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1843 d_instantiate(dentry, inode); 1836 d_instantiate(dentry, inode);
@@ -1957,7 +1950,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1957 if (len > PAGE_CACHE_SIZE) 1950 if (len > PAGE_CACHE_SIZE)
1958 return -ENAMETOOLONG; 1951 return -ENAMETOOLONG;
1959 1952
1960 inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); 1953 inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
1961 if (!inode) 1954 if (!inode)
1962 return -ENOSPC; 1955 return -ENOSPC;
1963 1956
@@ -1992,8 +1985,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1992 unlock_page(page); 1985 unlock_page(page);
1993 page_cache_release(page); 1986 page_cache_release(page);
1994 } 1987 }
1995 if (dir->i_mode & S_ISGID)
1996 inode->i_gid = dir->i_gid;
1997 dir->i_size += BOGO_DIRENT_SIZE; 1988 dir->i_size += BOGO_DIRENT_SIZE;
1998 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1989 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1999 d_instantiate(dentry, inode); 1990 d_instantiate(dentry, inode);
@@ -2071,14 +2062,14 @@ static int shmem_xattr_security_set(struct dentry *dentry, const char *name,
2071 size, flags); 2062 size, flags);
2072} 2063}
2073 2064
2074static struct xattr_handler shmem_xattr_security_handler = { 2065static const struct xattr_handler shmem_xattr_security_handler = {
2075 .prefix = XATTR_SECURITY_PREFIX, 2066 .prefix = XATTR_SECURITY_PREFIX,
2076 .list = shmem_xattr_security_list, 2067 .list = shmem_xattr_security_list,
2077 .get = shmem_xattr_security_get, 2068 .get = shmem_xattr_security_get,
2078 .set = shmem_xattr_security_set, 2069 .set = shmem_xattr_security_set,
2079}; 2070};
2080 2071
2081static struct xattr_handler *shmem_xattr_handlers[] = { 2072static const struct xattr_handler *shmem_xattr_handlers[] = {
2082 &generic_acl_access_handler, 2073 &generic_acl_access_handler,
2083 &generic_acl_default_handler, 2074 &generic_acl_default_handler,
2084 &shmem_xattr_security_handler, 2075 &shmem_xattr_security_handler,
@@ -2366,7 +2357,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2366 sb->s_flags |= MS_POSIXACL; 2357 sb->s_flags |= MS_POSIXACL;
2367#endif 2358#endif
2368 2359
2369 inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); 2360 inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
2370 if (!inode) 2361 if (!inode)
2371 goto failed; 2362 goto failed;
2372 inode->i_uid = sbinfo->uid; 2363 inode->i_uid = sbinfo->uid;
@@ -2611,7 +2602,7 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
2611 2602
2612#define shmem_vm_ops generic_file_vm_ops 2603#define shmem_vm_ops generic_file_vm_ops
2613#define shmem_file_operations ramfs_file_operations 2604#define shmem_file_operations ramfs_file_operations
2614#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) 2605#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
2615#define shmem_acct_size(flags, size) 0 2606#define shmem_acct_size(flags, size) 0
2616#define shmem_unacct_size(flags, size) do {} while (0) 2607#define shmem_unacct_size(flags, size) do {} while (0)
2617#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE 2608#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE
@@ -2655,7 +2646,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2655 path.mnt = mntget(shm_mnt); 2646 path.mnt = mntget(shm_mnt);
2656 2647
2657 error = -ENOSPC; 2648 error = -ENOSPC;
2658 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); 2649 inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
2659 if (!inode) 2650 if (!inode)
2660 goto put_dentry; 2651 goto put_dentry;
2661 2652
diff --git a/mm/slab.c b/mm/slab.c
index 3230cd2c6b3..50a73fca19c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -145,30 +145,6 @@
145#define BYTES_PER_WORD sizeof(void *) 145#define BYTES_PER_WORD sizeof(void *)
146#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) 146#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
147 147
148#ifndef ARCH_KMALLOC_MINALIGN
149/*
150 * Enforce a minimum alignment for the kmalloc caches.
151 * Usually, the kmalloc caches are cache_line_size() aligned, except when
152 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
153 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
154 * alignment larger than the alignment of a 64-bit integer.
155 * ARCH_KMALLOC_MINALIGN allows that.
156 * Note that increasing this value may disable some debug features.
157 */
158#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
159#endif
160
161#ifndef ARCH_SLAB_MINALIGN
162/*
163 * Enforce a minimum alignment for all caches.
164 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
165 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
166 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
167 * some debug features.
168 */
169#define ARCH_SLAB_MINALIGN 0
170#endif
171
172#ifndef ARCH_KMALLOC_FLAGS 148#ifndef ARCH_KMALLOC_FLAGS
173#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 149#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
174#endif 150#endif
@@ -2313,8 +2289,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2313 if (ralign < align) { 2289 if (ralign < align) {
2314 ralign = align; 2290 ralign = align;
2315 } 2291 }
2316 /* disable debug if necessary */ 2292 /* disable debug if not aligning with REDZONE_ALIGN */
2317 if (ralign > __alignof__(unsigned long long)) 2293 if (ralign & (__alignof__(unsigned long long) - 1))
2318 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2294 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2319 /* 2295 /*
2320 * 4) Store it. 2296 * 4) Store it.
@@ -2340,8 +2316,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2340 */ 2316 */
2341 if (flags & SLAB_RED_ZONE) { 2317 if (flags & SLAB_RED_ZONE) {
2342 /* add space for red zone words */ 2318 /* add space for red zone words */
2343 cachep->obj_offset += sizeof(unsigned long long); 2319 cachep->obj_offset += align;
2344 size += 2 * sizeof(unsigned long long); 2320 size += align + sizeof(unsigned long long);
2345 } 2321 }
2346 if (flags & SLAB_STORE_USER) { 2322 if (flags & SLAB_STORE_USER) {
2347 /* user store requires one word storage behind the end of 2323 /* user store requires one word storage behind the end of
@@ -3695,21 +3671,10 @@ EXPORT_SYMBOL(kmem_cache_alloc_notrace);
3695 */ 3671 */
3696int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) 3672int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3697{ 3673{
3698 unsigned long addr = (unsigned long)ptr;
3699 unsigned long min_addr = PAGE_OFFSET;
3700 unsigned long align_mask = BYTES_PER_WORD - 1;
3701 unsigned long size = cachep->buffer_size; 3674 unsigned long size = cachep->buffer_size;
3702 struct page *page; 3675 struct page *page;
3703 3676
3704 if (unlikely(addr < min_addr)) 3677 if (unlikely(!kern_ptr_validate(ptr, size)))
3705 goto out;
3706 if (unlikely(addr > (unsigned long)high_memory - size))
3707 goto out;
3708 if (unlikely(addr & align_mask))
3709 goto out;
3710 if (unlikely(!kern_addr_valid(addr)))
3711 goto out;
3712 if (unlikely(!kern_addr_valid(addr + size - 1)))
3713 goto out; 3678 goto out;
3714 page = virt_to_page(ptr); 3679 page = virt_to_page(ptr);
3715 if (unlikely(!PageSlab(page))) 3680 if (unlikely(!PageSlab(page)))
@@ -4320,10 +4285,11 @@ static int s_show(struct seq_file *m, void *p)
4320 unsigned long node_frees = cachep->node_frees; 4285 unsigned long node_frees = cachep->node_frees;
4321 unsigned long overflows = cachep->node_overflow; 4286 unsigned long overflows = cachep->node_overflow;
4322 4287
4323 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 4288 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
4324 %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, 4289 "%4lu %4lu %4lu %4lu %4lu",
4325 reaped, errors, max_freeable, node_allocs, 4290 allocs, high, grown,
4326 node_frees, overflows); 4291 reaped, errors, max_freeable, node_allocs,
4292 node_frees, overflows);
4327 } 4293 }
4328 /* cpu stats */ 4294 /* cpu stats */
4329 { 4295 {
diff --git a/mm/slob.c b/mm/slob.c
index 837ebd64cc3..23631e2bb57 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -467,14 +467,6 @@ out:
467 * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. 467 * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
468 */ 468 */
469 469
470#ifndef ARCH_KMALLOC_MINALIGN
471#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long)
472#endif
473
474#ifndef ARCH_SLAB_MINALIGN
475#define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
476#endif
477
478void *__kmalloc_node(size_t size, gfp_t gfp, int node) 470void *__kmalloc_node(size_t size, gfp_t gfp, int node)
479{ 471{
480 unsigned int *m; 472 unsigned int *m;
diff --git a/mm/slub.c b/mm/slub.c
index b364844a106..e46e3129697 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -157,14 +157,6 @@
157#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ 157#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
158 SLAB_CACHE_DMA | SLAB_NOTRACK) 158 SLAB_CACHE_DMA | SLAB_NOTRACK)
159 159
160#ifndef ARCH_KMALLOC_MINALIGN
161#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
162#endif
163
164#ifndef ARCH_SLAB_MINALIGN
165#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
166#endif
167
168#define OO_SHIFT 16 160#define OO_SHIFT 16
169#define OO_MASK ((1 << OO_SHIFT) - 1) 161#define OO_MASK ((1 << OO_SHIFT) - 1)
170#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ 162#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */
@@ -1084,7 +1076,7 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node,
1084 if (node == -1) 1076 if (node == -1)
1085 return alloc_pages(flags, order); 1077 return alloc_pages(flags, order);
1086 else 1078 else
1087 return alloc_pages_node(node, flags, order); 1079 return alloc_pages_exact_node(node, flags, order);
1088} 1080}
1089 1081
1090static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 1082static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -2153,7 +2145,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2153 int local_node; 2145 int local_node;
2154 2146
2155 if (slab_state >= UP && (s < kmalloc_caches || 2147 if (slab_state >= UP && (s < kmalloc_caches ||
2156 s > kmalloc_caches + KMALLOC_CACHES)) 2148 s >= kmalloc_caches + KMALLOC_CACHES))
2157 local_node = page_to_nid(virt_to_page(s)); 2149 local_node = page_to_nid(virt_to_page(s));
2158 else 2150 else
2159 local_node = 0; 2151 local_node = 0;
@@ -2386,6 +2378,9 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object)
2386{ 2378{
2387 struct page *page; 2379 struct page *page;
2388 2380
2381 if (!kern_ptr_validate(object, s->size))
2382 return 0;
2383
2389 page = get_object_page(object); 2384 page = get_object_page(object);
2390 2385
2391 if (!page || s != page->slab) 2386 if (!page || s != page->slab)
@@ -2426,9 +2421,11 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
2426#ifdef CONFIG_SLUB_DEBUG 2421#ifdef CONFIG_SLUB_DEBUG
2427 void *addr = page_address(page); 2422 void *addr = page_address(page);
2428 void *p; 2423 void *p;
2429 DECLARE_BITMAP(map, page->objects); 2424 long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long),
2425 GFP_ATOMIC);
2430 2426
2431 bitmap_zero(map, page->objects); 2427 if (!map)
2428 return;
2432 slab_err(s, page, "%s", text); 2429 slab_err(s, page, "%s", text);
2433 slab_lock(page); 2430 slab_lock(page);
2434 for_each_free_object(p, s, page->freelist) 2431 for_each_free_object(p, s, page->freelist)
@@ -2443,6 +2440,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
2443 } 2440 }
2444 } 2441 }
2445 slab_unlock(page); 2442 slab_unlock(page);
2443 kfree(map);
2446#endif 2444#endif
2447} 2445}
2448 2446
@@ -3335,8 +3333,15 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3335 struct kmem_cache *s; 3333 struct kmem_cache *s;
3336 void *ret; 3334 void *ret;
3337 3335
3338 if (unlikely(size > SLUB_MAX_SIZE)) 3336 if (unlikely(size > SLUB_MAX_SIZE)) {
3339 return kmalloc_large_node(size, gfpflags, node); 3337 ret = kmalloc_large_node(size, gfpflags, node);
3338
3339 trace_kmalloc_node(caller, ret,
3340 size, PAGE_SIZE << get_order(size),
3341 gfpflags, node);
3342
3343 return ret;
3344 }
3340 3345
3341 s = get_slab(size, gfpflags); 3346 s = get_slab(size, gfpflags);
3342 3347
@@ -3648,10 +3653,10 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
3648} 3653}
3649 3654
3650static void process_slab(struct loc_track *t, struct kmem_cache *s, 3655static void process_slab(struct loc_track *t, struct kmem_cache *s,
3651 struct page *page, enum track_item alloc) 3656 struct page *page, enum track_item alloc,
3657 long *map)
3652{ 3658{
3653 void *addr = page_address(page); 3659 void *addr = page_address(page);
3654 DECLARE_BITMAP(map, page->objects);
3655 void *p; 3660 void *p;
3656 3661
3657 bitmap_zero(map, page->objects); 3662 bitmap_zero(map, page->objects);
@@ -3670,11 +3675,14 @@ static int list_locations(struct kmem_cache *s, char *buf,
3670 unsigned long i; 3675 unsigned long i;
3671 struct loc_track t = { 0, 0, NULL }; 3676 struct loc_track t = { 0, 0, NULL };
3672 int node; 3677 int node;
3678 unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
3679 sizeof(unsigned long), GFP_KERNEL);
3673 3680
3674 if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 3681 if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
3675 GFP_TEMPORARY)) 3682 GFP_TEMPORARY)) {
3683 kfree(map);
3676 return sprintf(buf, "Out of memory\n"); 3684 return sprintf(buf, "Out of memory\n");
3677 3685 }
3678 /* Push back cpu slabs */ 3686 /* Push back cpu slabs */
3679 flush_all(s); 3687 flush_all(s);
3680 3688
@@ -3688,9 +3696,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
3688 3696
3689 spin_lock_irqsave(&n->list_lock, flags); 3697 spin_lock_irqsave(&n->list_lock, flags);
3690 list_for_each_entry(page, &n->partial, lru) 3698 list_for_each_entry(page, &n->partial, lru)
3691 process_slab(&t, s, page, alloc); 3699 process_slab(&t, s, page, alloc, map);
3692 list_for_each_entry(page, &n->full, lru) 3700 list_for_each_entry(page, &n->full, lru)
3693 process_slab(&t, s, page, alloc); 3701 process_slab(&t, s, page, alloc, map);
3694 spin_unlock_irqrestore(&n->list_lock, flags); 3702 spin_unlock_irqrestore(&n->list_lock, flags);
3695 } 3703 }
3696 3704
@@ -3741,6 +3749,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
3741 } 3749 }
3742 3750
3743 free_loc_track(&t); 3751 free_loc_track(&t);
3752 kfree(map);
3744 if (!t.count) 3753 if (!t.count)
3745 len += sprintf(buf, "No data\n"); 3754 len += sprintf(buf, "No data\n");
3746 return len; 3755 return len;
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 392b9bb5bc0..aa33fd67fa4 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -22,6 +22,7 @@
22#include <linux/bootmem.h> 22#include <linux/bootmem.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/slab.h>
25#include <linux/spinlock.h> 26#include <linux/spinlock.h>
26#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
27#include <linux/sched.h> 28#include <linux/sched.h>
diff --git a/mm/sparse.c b/mm/sparse.c
index 22896d58913..dc0cc4d43ff 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -2,6 +2,7 @@
2 * sparse memory mappings. 2 * sparse memory mappings.
3 */ 3 */
4#include <linux/mm.h> 4#include <linux/mm.h>
5#include <linux/slab.h>
5#include <linux/mmzone.h> 6#include <linux/mmzone.h>
6#include <linux/bootmem.h> 7#include <linux/bootmem.h>
7#include <linux/highmem.h> 8#include <linux/highmem.h>
diff --git a/mm/swap.c b/mm/swap.c
index 9036b89813a..7cd60bf0a97 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -30,6 +30,7 @@
30#include <linux/notifier.h> 30#include <linux/notifier.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/gfp.h>
33 34
34#include "internal.h" 35#include "internal.h"
35 36
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 6d1daeb1cb4..e10f5833167 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -8,6 +8,7 @@
8 */ 8 */
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/gfp.h>
11#include <linux/kernel_stat.h> 12#include <linux/kernel_stat.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13#include <linux/swapops.h> 14#include <linux/swapops.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6cd0a8f90dc..03aa2d55f1a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -139,7 +139,8 @@ static int discard_swap(struct swap_info_struct *si)
139 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); 139 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
140 if (nr_blocks) { 140 if (nr_blocks) {
141 err = blkdev_issue_discard(si->bdev, start_block, 141 err = blkdev_issue_discard(si->bdev, start_block,
142 nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); 142 nr_blocks, GFP_KERNEL,
143 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
143 if (err) 144 if (err)
144 return err; 145 return err;
145 cond_resched(); 146 cond_resched();
@@ -150,7 +151,8 @@ static int discard_swap(struct swap_info_struct *si)
150 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 151 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
151 152
152 err = blkdev_issue_discard(si->bdev, start_block, 153 err = blkdev_issue_discard(si->bdev, start_block,
153 nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); 154 nr_blocks, GFP_KERNEL,
155 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
154 if (err) 156 if (err)
155 break; 157 break;
156 158
@@ -189,7 +191,8 @@ static void discard_swap_cluster(struct swap_info_struct *si,
189 start_block <<= PAGE_SHIFT - 9; 191 start_block <<= PAGE_SHIFT - 9;
190 nr_blocks <<= PAGE_SHIFT - 9; 192 nr_blocks <<= PAGE_SHIFT - 9;
191 if (blkdev_issue_discard(si->bdev, start_block, 193 if (blkdev_issue_discard(si->bdev, start_block,
192 nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER)) 194 nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT |
195 BLKDEV_IFL_BARRIER))
193 break; 196 break;
194 } 197 }
195 198
@@ -574,6 +577,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
574 577
575 /* free if no reference */ 578 /* free if no reference */
576 if (!usage) { 579 if (!usage) {
580 struct gendisk *disk = p->bdev->bd_disk;
577 if (offset < p->lowest_bit) 581 if (offset < p->lowest_bit)
578 p->lowest_bit = offset; 582 p->lowest_bit = offset;
579 if (offset > p->highest_bit) 583 if (offset > p->highest_bit)
@@ -583,6 +587,9 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
583 swap_list.next = p->type; 587 swap_list.next = p->type;
584 nr_swap_pages++; 588 nr_swap_pages++;
585 p->inuse_pages--; 589 p->inuse_pages--;
590 if ((p->flags & SWP_BLKDEV) &&
591 disk->fops->swap_slot_free_notify)
592 disk->fops->swap_slot_free_notify(p->bdev, offset);
586 } 593 }
587 594
588 return usage; 595 return usage;
@@ -1884,6 +1891,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1884 if (error < 0) 1891 if (error < 0)
1885 goto bad_swap; 1892 goto bad_swap;
1886 p->bdev = bdev; 1893 p->bdev = bdev;
1894 p->flags |= SWP_BLKDEV;
1887 } else if (S_ISREG(inode->i_mode)) { 1895 } else if (S_ISREG(inode->i_mode)) {
1888 p->bdev = inode->i_sb->s_bdev; 1896 p->bdev = inode->i_sb->s_bdev;
1889 mutex_lock(&inode->i_mutex); 1897 mutex_lock(&inode->i_mutex);
diff --git a/mm/truncate.c b/mm/truncate.c
index e87e3724482..f42675a3615 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -9,6 +9,7 @@
9 9
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/backing-dev.h> 11#include <linux/backing-dev.h>
12#include <linux/gfp.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
13#include <linux/swap.h> 14#include <linux/swap.h>
14#include <linux/module.h> 15#include <linux/module.h>
diff --git a/mm/util.c b/mm/util.c
index 834db7be240..f5712e8964b 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -186,6 +186,27 @@ void kzfree(const void *p)
186} 186}
187EXPORT_SYMBOL(kzfree); 187EXPORT_SYMBOL(kzfree);
188 188
189int kern_ptr_validate(const void *ptr, unsigned long size)
190{
191 unsigned long addr = (unsigned long)ptr;
192 unsigned long min_addr = PAGE_OFFSET;
193 unsigned long align_mask = sizeof(void *) - 1;
194
195 if (unlikely(addr < min_addr))
196 goto out;
197 if (unlikely(addr > (unsigned long)high_memory - size))
198 goto out;
199 if (unlikely(addr & align_mask))
200 goto out;
201 if (unlikely(!kern_addr_valid(addr)))
202 goto out;
203 if (unlikely(!kern_addr_valid(addr + size - 1)))
204 goto out;
205 return 1;
206out:
207 return 0;
208}
209
189/* 210/*
190 * strndup_user - duplicate an existing string from user space 211 * strndup_user - duplicate an existing string from user space
191 * @s: The string to duplicate 212 * @s: The string to duplicate
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 79c809895fb..3ff3311447f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -13,7 +13,7 @@
13 13
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/slab.h> 16#include <linux/gfp.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18#include <linux/swap.h> 18#include <linux/swap.h>
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
@@ -1535,13 +1535,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1535 unsigned long ap, fp; 1535 unsigned long ap, fp;
1536 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1536 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1537 1537
1538 /* If we have no swap space, do not bother scanning anon pages. */
1539 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1540 percent[0] = 0;
1541 percent[1] = 100;
1542 return;
1543 }
1544
1545 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1538 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1546 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1539 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1547 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + 1540 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
@@ -1639,20 +1632,22 @@ static void shrink_zone(int priority, struct zone *zone,
1639 unsigned long nr_reclaimed = sc->nr_reclaimed; 1632 unsigned long nr_reclaimed = sc->nr_reclaimed;
1640 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1633 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1641 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1634 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1635 int noswap = 0;
1642 1636
1643 get_scan_ratio(zone, sc, percent); 1637 /* If we have no swap space, do not bother scanning anon pages. */
1638 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1639 noswap = 1;
1640 percent[0] = 0;
1641 percent[1] = 100;
1642 } else
1643 get_scan_ratio(zone, sc, percent);
1644 1644
1645 for_each_evictable_lru(l) { 1645 for_each_evictable_lru(l) {
1646 int file = is_file_lru(l); 1646 int file = is_file_lru(l);
1647 unsigned long scan; 1647 unsigned long scan;
1648 1648
1649 if (percent[file] == 0) {
1650 nr[l] = 0;
1651 continue;
1652 }
1653
1654 scan = zone_nr_lru_pages(zone, sc, l); 1649 scan = zone_nr_lru_pages(zone, sc, l);
1655 if (priority) { 1650 if (priority || noswap) {
1656 scan >>= priority; 1651 scan >>= priority;
1657 scan = (scan * percent[file]) / 100; 1652 scan = (scan * percent[file]) / 100;
1658 } 1653 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7f760cbc73f..fa12ea3051f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -12,6 +12,7 @@
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/err.h> 13#include <linux/err.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/slab.h>
15#include <linux/cpu.h> 16#include <linux/cpu.h>
16#include <linux/vmstat.h> 17#include <linux/vmstat.h>
17#include <linux/sched.h> 18#include <linux/sched.h>