author    Len Brown <len.brown@intel.com>    2009-04-05 02:14:15 -0400
committer Len Brown <len.brown@intel.com>    2009-04-05 02:14:15 -0400
commit    478c6a43fcbc6c11609f8cee7c7b57223907754f (patch)
tree      a7f7952099da60d33032aed6de9c0c56c9f8779e /mm
parent    8a3f257c704e02aee9869decd069a806b45be3f1 (diff)
parent    6bb597507f9839b13498781e481f5458aea33620 (diff)

Merge branch 'linus' into release

Conflicts:
    arch/x86/kernel/cpu/cpufreq/longhaul.c

Signed-off-by: Len Brown <len.brown@intel.com>
Diffstat (limited to 'mm')
 mm/Kconfig           |    9
 mm/Kconfig.debug     |   26
 mm/Makefile          |    5
 mm/allocpercpu.c     |   34
 mm/backing-dev.c     |   26
 mm/bootmem.c         |   35
 mm/debug-pagealloc.c |  129
 mm/filemap.c         |   28
 mm/filemap_xip.c     |    4
 mm/highmem.c         |  110
 mm/hugetlb.c         |    6
 mm/internal.h        |    8
 mm/memcontrol.c      |  687
 mm/memory.c          |   39
 mm/migrate.c         |   10
 mm/mmap.c            |    7
 mm/nommu.c           |   52
 mm/oom_kill.c        |   13
 mm/page-writeback.c  |   46
 mm/page_alloc.c      |   36
 mm/page_cgroup.c     |   37
 mm/pdflush.c         |    2
 mm/percpu.c          | 1326
 mm/readahead.c       |   65
 mm/shmem.c           |    5
 mm/slab.c            |    7
 mm/slob.c            |   45
 mm/slub.c            |   83
 mm/sparse.c          |    4
 mm/swap.c            |   27
 mm/truncate.c        |   10
 mm/util.c            |   30
 mm/vmalloc.c         |  116
 mm/vmscan.c          |  109
 mm/vmstat.c          |   18
 35 files changed, 2598 insertions(+), 596 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index a5b77811fdf2..b53427ad30a3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -206,7 +206,6 @@ config VIRT_TO_BUS
206config UNEVICTABLE_LRU 206config UNEVICTABLE_LRU
207 bool "Add LRU list to track non-evictable pages" 207 bool "Add LRU list to track non-evictable pages"
208 default y 208 default y
209 depends on MMU
210 help 209 help
211 Keeps unevictable pages off of the active and inactive pageout 210 Keeps unevictable pages off of the active and inactive pageout
212 lists, so kswapd will not waste CPU time or have its balancing 211 lists, so kswapd will not waste CPU time or have its balancing
@@ -214,5 +213,13 @@ config UNEVICTABLE_LRU
214 will use one page flag and increase the code size a little, 213 will use one page flag and increase the code size a little,
215 say Y unless you know what you are doing. 214 say Y unless you know what you are doing.
216 215
216config HAVE_MLOCK
217 bool
218 default y if MMU=y
219
220config HAVE_MLOCKED_PAGE_BIT
221 bool
222 default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y
223
217config MMU_NOTIFIER 224config MMU_NOTIFIER
218 bool 225 bool
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
new file mode 100644
index 000000000000..bb01e298f260
--- /dev/null
+++ b/mm/Kconfig.debug
@@ -0,0 +1,26 @@
1config DEBUG_PAGEALLOC
2 bool "Debug page memory allocations"
3 depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC
4 depends on !HIBERNATION || !PPC && !SPARC
5 ---help---
6 Unmap pages from the kernel linear mapping after free_pages().
7 This results in a large slowdown, but helps to find certain types
8 of memory corruptions.
9
10config WANT_PAGE_DEBUG_FLAGS
11 bool
12
13config PAGE_POISONING
14 bool "Debug page memory allocations"
15 depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC
16 depends on !HIBERNATION
17 select DEBUG_PAGEALLOC
18 select WANT_PAGE_DEBUG_FLAGS
19 help
20 Fill the pages with poison patterns after free_pages() and verify
21 the patterns before alloc_pages(). This results in a large slowdown,
22 but helps to find certain types of memory corruptions.
23
24 This option cannot be enabled together with hibernation. Otherwise, it
25 will report false memory corruption, because the free pages are not
26 saved to the suspend image.
diff --git a/mm/Makefile b/mm/Makefile
index 72255be57f89..ec73c68b6015 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -24,12 +24,17 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
24obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o 24obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
25obj-$(CONFIG_SLOB) += slob.o 25obj-$(CONFIG_SLOB) += slob.o
26obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 26obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
27obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
27obj-$(CONFIG_SLAB) += slab.o 28obj-$(CONFIG_SLAB) += slab.o
28obj-$(CONFIG_SLUB) += slub.o 29obj-$(CONFIG_SLUB) += slub.o
29obj-$(CONFIG_FAILSLAB) += failslab.o 30obj-$(CONFIG_FAILSLAB) += failslab.o
30obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 31obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
31obj-$(CONFIG_FS_XIP) += filemap_xip.o 32obj-$(CONFIG_FS_XIP) += filemap_xip.o
32obj-$(CONFIG_MIGRATION) += migrate.o 33obj-$(CONFIG_MIGRATION) += migrate.o
34ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
35obj-$(CONFIG_SMP) += percpu.o
36else
33obj-$(CONFIG_SMP) += allocpercpu.o 37obj-$(CONFIG_SMP) += allocpercpu.o
38endif
34obj-$(CONFIG_QUICKLIST) += quicklist.o 39obj-$(CONFIG_QUICKLIST) += quicklist.o
35obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 40obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 4297bc41bfd2..139d5b7b6621 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -99,45 +99,51 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
99 __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) 99 __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
100 100
101/** 101/**
102 * percpu_alloc_mask - initial setup of per-cpu data 102 * alloc_percpu - initial setup of per-cpu data
103 * @size: size of per-cpu object 103 * @size: size of per-cpu object
104 * @gfp: may sleep or not etc. 104 * @align: alignment
105 * @mask: populate per-data for cpu's selected through mask bits
106 * 105 *
107 * Populating per-cpu data for all online cpu's would be a typical use case, 106 * Allocate dynamic percpu area. Percpu objects are populated with
108 * which is simplified by the percpu_alloc() wrapper. 107 * zeroed buffers.
109 * Per-cpu objects are populated with zeroed buffers.
110 */ 108 */
111void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) 109void *__alloc_percpu(size_t size, size_t align)
112{ 110{
113 /* 111 /*
114 * We allocate whole cache lines to avoid false sharing 112 * We allocate whole cache lines to avoid false sharing
115 */ 113 */
116 size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); 114 size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
117 void *pdata = kzalloc(sz, gfp); 115 void *pdata = kzalloc(sz, GFP_KERNEL);
118 void *__pdata = __percpu_disguise(pdata); 116 void *__pdata = __percpu_disguise(pdata);
119 117
118 /*
119 * Can't easily make larger alignment work with kmalloc. WARN
120 * on it. Larger alignment should only be used for module
121 * percpu sections on SMP for which this path isn't used.
122 */
123 WARN_ON_ONCE(align > SMP_CACHE_BYTES);
124
120 if (unlikely(!pdata)) 125 if (unlikely(!pdata))
121 return NULL; 126 return NULL;
122 if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask))) 127 if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL,
128 &cpu_possible_map)))
123 return __pdata; 129 return __pdata;
124 kfree(pdata); 130 kfree(pdata);
125 return NULL; 131 return NULL;
126} 132}
127EXPORT_SYMBOL_GPL(__percpu_alloc_mask); 133EXPORT_SYMBOL_GPL(__alloc_percpu);
128 134
129/** 135/**
130 * percpu_free - final cleanup of per-cpu data 136 * free_percpu - final cleanup of per-cpu data
131 * @__pdata: object to clean up 137 * @__pdata: object to clean up
132 * 138 *
133 * We simply clean up any per-cpu object left. No need for the client to 139 * We simply clean up any per-cpu object left. No need for the client to
134 * track and specify through a bis mask which per-cpu objects are to free. 140 * track and specify through a bis mask which per-cpu objects are to free.
135 */ 141 */
136void percpu_free(void *__pdata) 142void free_percpu(void *__pdata)
137{ 143{
138 if (unlikely(!__pdata)) 144 if (unlikely(!__pdata))
139 return; 145 return;
140 __percpu_depopulate_mask(__pdata, &cpu_possible_map); 146 __percpu_depopulate_mask(__pdata, cpu_possible_mask);
141 kfree(__percpu_disguise(__pdata)); 147 kfree(__percpu_disguise(__pdata));
142} 148}
143EXPORT_SYMBOL_GPL(percpu_free); 149EXPORT_SYMBOL_GPL(free_percpu);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8e8587444132..be68c956a660 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -2,11 +2,24 @@
2#include <linux/wait.h> 2#include <linux/wait.h>
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
5#include <linux/pagemap.h>
5#include <linux/sched.h> 6#include <linux/sched.h>
6#include <linux/module.h> 7#include <linux/module.h>
7#include <linux/writeback.h> 8#include <linux/writeback.h>
8#include <linux/device.h> 9#include <linux/device.h>
9 10
11void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
12{
13}
14EXPORT_SYMBOL(default_unplug_io_fn);
15
16struct backing_dev_info default_backing_dev_info = {
17 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
18 .state = 0,
19 .capabilities = BDI_CAP_MAP_COPY,
20 .unplug_io_fn = default_unplug_io_fn,
21};
22EXPORT_SYMBOL_GPL(default_backing_dev_info);
10 23
11static struct class *bdi_class; 24static struct class *bdi_class;
12 25
@@ -166,9 +179,20 @@ static __init int bdi_class_init(void)
166 bdi_debug_init(); 179 bdi_debug_init();
167 return 0; 180 return 0;
168} 181}
169
170postcore_initcall(bdi_class_init); 182postcore_initcall(bdi_class_init);
171 183
184static int __init default_bdi_init(void)
185{
186 int err;
187
188 err = bdi_init(&default_backing_dev_info);
189 if (!err)
190 bdi_register(&default_backing_dev_info, NULL, "default");
191
192 return err;
193}
194subsys_initcall(default_bdi_init);
195
172int bdi_register(struct backing_dev_info *bdi, struct device *parent, 196int bdi_register(struct backing_dev_info *bdi, struct device *parent,
173 const char *fmt, ...) 197 const char *fmt, ...)
174{ 198{
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 51a0ccf61e0e..daf92713f7de 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -382,7 +382,6 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
382 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); 382 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
383} 383}
384 384
385#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
386/** 385/**
387 * reserve_bootmem - mark a page range as usable 386 * reserve_bootmem - mark a page range as usable
388 * @addr: starting address of the range 387 * @addr: starting address of the range
@@ -403,7 +402,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
403 402
404 return mark_bootmem(start, end, 1, flags); 403 return mark_bootmem(start, end, 1, flags);
405} 404}
406#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
407 405
408static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, 406static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
409 unsigned long step) 407 unsigned long step)
@@ -429,8 +427,8 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
429} 427}
430 428
431static void * __init alloc_bootmem_core(struct bootmem_data *bdata, 429static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
432 unsigned long size, unsigned long align, 430 unsigned long size, unsigned long align,
433 unsigned long goal, unsigned long limit) 431 unsigned long goal, unsigned long limit)
434{ 432{
435 unsigned long fallback = 0; 433 unsigned long fallback = 0;
436 unsigned long min, max, start, sidx, midx, step; 434 unsigned long min, max, start, sidx, midx, step;
@@ -530,17 +528,34 @@ find_block:
530 return NULL; 528 return NULL;
531} 529}
532 530
531static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
532 unsigned long size, unsigned long align,
533 unsigned long goal, unsigned long limit)
534{
535#ifdef CONFIG_HAVE_ARCH_BOOTMEM
536 bootmem_data_t *p_bdata;
537
538 p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit);
539 if (p_bdata)
540 return alloc_bootmem_core(p_bdata, size, align, goal, limit);
541#endif
542 return NULL;
543}
544
533static void * __init ___alloc_bootmem_nopanic(unsigned long size, 545static void * __init ___alloc_bootmem_nopanic(unsigned long size,
534 unsigned long align, 546 unsigned long align,
535 unsigned long goal, 547 unsigned long goal,
536 unsigned long limit) 548 unsigned long limit)
537{ 549{
538 bootmem_data_t *bdata; 550 bootmem_data_t *bdata;
551 void *region;
539 552
540restart: 553restart:
541 list_for_each_entry(bdata, &bdata_list, list) { 554 region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
542 void *region; 555 if (region)
556 return region;
543 557
558 list_for_each_entry(bdata, &bdata_list, list) {
544 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) 559 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
545 continue; 560 continue;
546 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) 561 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
@@ -618,6 +633,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
618{ 633{
619 void *ptr; 634 void *ptr;
620 635
636 ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit);
637 if (ptr)
638 return ptr;
639
621 ptr = alloc_bootmem_core(bdata, size, align, goal, limit); 640 ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
622 if (ptr) 641 if (ptr)
623 return ptr; 642 return ptr;
@@ -674,6 +693,10 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
674{ 693{
675 void *ptr; 694 void *ptr;
676 695
696 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
697 if (ptr)
698 return ptr;
699
677 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 700 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
678 if (ptr) 701 if (ptr)
679 return ptr; 702 return ptr;
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
new file mode 100644
index 000000000000..a1e3324de2b5
--- /dev/null
+++ b/mm/debug-pagealloc.c
@@ -0,0 +1,129 @@
1#include <linux/kernel.h>
2#include <linux/mm.h>
3#include <linux/page-debug-flags.h>
4#include <linux/poison.h>
5
6static inline void set_page_poison(struct page *page)
7{
8 __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
9}
10
11static inline void clear_page_poison(struct page *page)
12{
13 __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
14}
15
16static inline bool page_poison(struct page *page)
17{
18 return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
19}
20
21static void poison_highpage(struct page *page)
22{
23 /*
24 * Page poisoning for highmem pages is not implemented.
25 *
26 * This can be called from interrupt contexts.
27 * So we need to create a new kmap_atomic slot for this
28 * application and it will need interrupt protection.
29 */
30}
31
32static void poison_page(struct page *page)
33{
34 void *addr;
35
36 if (PageHighMem(page)) {
37 poison_highpage(page);
38 return;
39 }
40 set_page_poison(page);
41 addr = page_address(page);
42 memset(addr, PAGE_POISON, PAGE_SIZE);
43}
44
45static void poison_pages(struct page *page, int n)
46{
47 int i;
48
49 for (i = 0; i < n; i++)
50 poison_page(page + i);
51}
52
53static bool single_bit_flip(unsigned char a, unsigned char b)
54{
55 unsigned char error = a ^ b;
56
57 return error && !(error & (error - 1));
58}
59
60static void check_poison_mem(unsigned char *mem, size_t bytes)
61{
62 unsigned char *start;
63 unsigned char *end;
64
65 for (start = mem; start < mem + bytes; start++) {
66 if (*start != PAGE_POISON)
67 break;
68 }
69 if (start == mem + bytes)
70 return;
71
72 for (end = mem + bytes - 1; end > start; end--) {
73 if (*end != PAGE_POISON)
74 break;
75 }
76
77 if (!printk_ratelimit())
78 return;
79 else if (start == end && single_bit_flip(*start, PAGE_POISON))
80 printk(KERN_ERR "pagealloc: single bit error\n");
81 else
82 printk(KERN_ERR "pagealloc: memory corruption\n");
83
84 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
85 end - start + 1, 1);
86 dump_stack();
87}
88
89static void unpoison_highpage(struct page *page)
90{
91 /*
92 * See comment in poison_highpage().
93 * Highmem pages should not be poisoned for now
94 */
95 BUG_ON(page_poison(page));
96}
97
98static void unpoison_page(struct page *page)
99{
100 if (PageHighMem(page)) {
101 unpoison_highpage(page);
102 return;
103 }
104 if (page_poison(page)) {
105 void *addr = page_address(page);
106
107 check_poison_mem(addr, PAGE_SIZE);
108 clear_page_poison(page);
109 }
110}
111
112static void unpoison_pages(struct page *page, int n)
113{
114 int i;
115
116 for (i = 0; i < n; i++)
117 unpoison_page(page + i);
118}
119
120void kernel_map_pages(struct page *page, int numpages, int enable)
121{
122 if (!debug_pagealloc_enabled)
123 return;
124
125 if (enable)
126 unpoison_pages(page, numpages);
127 else
128 poison_pages(page, numpages);
129}
diff --git a/mm/filemap.c b/mm/filemap.c
index 23acefe51808..fc11974f2bee 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -565,6 +565,24 @@ void wait_on_page_bit(struct page *page, int bit_nr)
565EXPORT_SYMBOL(wait_on_page_bit); 565EXPORT_SYMBOL(wait_on_page_bit);
566 566
567/** 567/**
568 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
569 * @page - Page defining the wait queue of interest
570 * @waiter - Waiter to add to the queue
571 *
572 * Add an arbitrary @waiter to the wait queue for the nominated @page.
573 */
574void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
575{
576 wait_queue_head_t *q = page_waitqueue(page);
577 unsigned long flags;
578
579 spin_lock_irqsave(&q->lock, flags);
580 __add_wait_queue(q, waiter);
581 spin_unlock_irqrestore(&q->lock, flags);
582}
583EXPORT_SYMBOL_GPL(add_page_wait_queue);
584
585/**
568 * unlock_page - unlock a locked page 586 * unlock_page - unlock a locked page
569 * @page: the page 587 * @page: the page
570 * 588 *
@@ -1823,7 +1841,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1823 int copy = min(bytes, iov->iov_len - base); 1841 int copy = min(bytes, iov->iov_len - base);
1824 1842
1825 base = 0; 1843 base = 0;
1826 left = __copy_from_user_inatomic_nocache(vaddr, buf, copy); 1844 left = __copy_from_user_inatomic(vaddr, buf, copy);
1827 copied += copy; 1845 copied += copy;
1828 bytes -= copy; 1846 bytes -= copy;
1829 vaddr += copy; 1847 vaddr += copy;
@@ -1851,8 +1869,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
1851 if (likely(i->nr_segs == 1)) { 1869 if (likely(i->nr_segs == 1)) {
1852 int left; 1870 int left;
1853 char __user *buf = i->iov->iov_base + i->iov_offset; 1871 char __user *buf = i->iov->iov_base + i->iov_offset;
1854 left = __copy_from_user_inatomic_nocache(kaddr + offset, 1872 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
1855 buf, bytes);
1856 copied = bytes - left; 1873 copied = bytes - left;
1857 } else { 1874 } else {
1858 copied = __iovec_copy_from_user_inatomic(kaddr + offset, 1875 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
@@ -1880,7 +1897,7 @@ size_t iov_iter_copy_from_user(struct page *page,
1880 if (likely(i->nr_segs == 1)) { 1897 if (likely(i->nr_segs == 1)) {
1881 int left; 1898 int left;
1882 char __user *buf = i->iov->iov_base + i->iov_offset; 1899 char __user *buf = i->iov->iov_base + i->iov_offset;
1883 left = __copy_from_user_nocache(kaddr + offset, buf, bytes); 1900 left = __copy_from_user(kaddr + offset, buf, bytes);
1884 copied = bytes - left; 1901 copied = bytes - left;
1885 } else { 1902 } else {
1886 copied = __iovec_copy_from_user_inatomic(kaddr + offset, 1903 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
@@ -2464,6 +2481,9 @@ EXPORT_SYMBOL(generic_file_aio_write);
2464 * (presumably at page->private). If the release was successful, return `1'. 2481 * (presumably at page->private). If the release was successful, return `1'.
2465 * Otherwise return zero. 2482 * Otherwise return zero.
2466 * 2483 *
2484 * This may also be called if PG_fscache is set on a page, indicating that the
2485 * page is known to the local caching routines.
2486 *
2467 * The @gfp_mask argument specifies whether I/O may be performed to release 2487 * The @gfp_mask argument specifies whether I/O may be performed to release
2468 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). 2488 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
2469 * 2489 *
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 0c04615651b7..427dfe3ce78c 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -89,8 +89,8 @@ do_xip_mapping_read(struct address_space *mapping,
89 } 89 }
90 } 90 }
91 nr = nr - offset; 91 nr = nr - offset;
92 if (nr > len) 92 if (nr > len - copied)
93 nr = len; 93 nr = len - copied;
94 94
95 error = mapping->a_ops->get_xip_mem(mapping, index, 0, 95 error = mapping->a_ops->get_xip_mem(mapping, index, 0,
96 &xip_mem, &xip_pfn); 96 &xip_mem, &xip_pfn);
diff --git a/mm/highmem.c b/mm/highmem.c
index b36b83b920ff..68eb1d9b63fa 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -67,6 +67,25 @@ pte_t * pkmap_page_table;
67 67
68static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); 68static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
69 69
70/*
71 * Most architectures have no use for kmap_high_get(), so let's abstract
72 * the disabling of IRQ out of the locking in that case to save on a
73 * potential useless overhead.
74 */
75#ifdef ARCH_NEEDS_KMAP_HIGH_GET
76#define lock_kmap() spin_lock_irq(&kmap_lock)
77#define unlock_kmap() spin_unlock_irq(&kmap_lock)
78#define lock_kmap_any(flags) spin_lock_irqsave(&kmap_lock, flags)
79#define unlock_kmap_any(flags) spin_unlock_irqrestore(&kmap_lock, flags)
80#else
81#define lock_kmap() spin_lock(&kmap_lock)
82#define unlock_kmap() spin_unlock(&kmap_lock)
83#define lock_kmap_any(flags) \
84 do { spin_lock(&kmap_lock); (void)(flags); } while (0)
85#define unlock_kmap_any(flags) \
86 do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
87#endif
88
70static void flush_all_zero_pkmaps(void) 89static void flush_all_zero_pkmaps(void)
71{ 90{
72 int i; 91 int i;
@@ -113,9 +132,9 @@ static void flush_all_zero_pkmaps(void)
113 */ 132 */
114void kmap_flush_unused(void) 133void kmap_flush_unused(void)
115{ 134{
116 spin_lock(&kmap_lock); 135 lock_kmap();
117 flush_all_zero_pkmaps(); 136 flush_all_zero_pkmaps();
118 spin_unlock(&kmap_lock); 137 unlock_kmap();
119} 138}
120 139
121static inline unsigned long map_new_virtual(struct page *page) 140static inline unsigned long map_new_virtual(struct page *page)
@@ -145,10 +164,10 @@ start:
145 164
146 __set_current_state(TASK_UNINTERRUPTIBLE); 165 __set_current_state(TASK_UNINTERRUPTIBLE);
147 add_wait_queue(&pkmap_map_wait, &wait); 166 add_wait_queue(&pkmap_map_wait, &wait);
148 spin_unlock(&kmap_lock); 167 unlock_kmap();
149 schedule(); 168 schedule();
150 remove_wait_queue(&pkmap_map_wait, &wait); 169 remove_wait_queue(&pkmap_map_wait, &wait);
151 spin_lock(&kmap_lock); 170 lock_kmap();
152 171
153 /* Somebody else might have mapped it while we slept */ 172 /* Somebody else might have mapped it while we slept */
154 if (page_address(page)) 173 if (page_address(page))
@@ -184,29 +203,59 @@ void *kmap_high(struct page *page)
184 * For highmem pages, we can't trust "virtual" until 203 * For highmem pages, we can't trust "virtual" until
185 * after we have the lock. 204 * after we have the lock.
186 */ 205 */
187 spin_lock(&kmap_lock); 206 lock_kmap();
188 vaddr = (unsigned long)page_address(page); 207 vaddr = (unsigned long)page_address(page);
189 if (!vaddr) 208 if (!vaddr)
190 vaddr = map_new_virtual(page); 209 vaddr = map_new_virtual(page);
191 pkmap_count[PKMAP_NR(vaddr)]++; 210 pkmap_count[PKMAP_NR(vaddr)]++;
192 BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2); 211 BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
193 spin_unlock(&kmap_lock); 212 unlock_kmap();
194 return (void*) vaddr; 213 return (void*) vaddr;
195} 214}
196 215
197EXPORT_SYMBOL(kmap_high); 216EXPORT_SYMBOL(kmap_high);
198 217
218#ifdef ARCH_NEEDS_KMAP_HIGH_GET
219/**
220 * kmap_high_get - pin a highmem page into memory
221 * @page: &struct page to pin
222 *
223 * Returns the page's current virtual memory address, or NULL if no mapping
224 * exists. When and only when a non null address is returned then a
225 * matching call to kunmap_high() is necessary.
226 *
227 * This can be called from any context.
228 */
229void *kmap_high_get(struct page *page)
230{
231 unsigned long vaddr, flags;
232
233 lock_kmap_any(flags);
234 vaddr = (unsigned long)page_address(page);
235 if (vaddr) {
236 BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 1);
237 pkmap_count[PKMAP_NR(vaddr)]++;
238 }
239 unlock_kmap_any(flags);
240 return (void*) vaddr;
241}
242#endif
243
199/** 244/**
200 * kunmap_high - map a highmem page into memory 245 * kunmap_high - map a highmem page into memory
201 * @page: &struct page to unmap 246 * @page: &struct page to unmap
247 *
248 * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called
249 * only from user context.
202 */ 250 */
203void kunmap_high(struct page *page) 251void kunmap_high(struct page *page)
204{ 252{
205 unsigned long vaddr; 253 unsigned long vaddr;
206 unsigned long nr; 254 unsigned long nr;
255 unsigned long flags;
207 int need_wakeup; 256 int need_wakeup;
208 257
209 spin_lock(&kmap_lock); 258 lock_kmap_any(flags);
210 vaddr = (unsigned long)page_address(page); 259 vaddr = (unsigned long)page_address(page);
211 BUG_ON(!vaddr); 260 BUG_ON(!vaddr);
212 nr = PKMAP_NR(vaddr); 261 nr = PKMAP_NR(vaddr);
@@ -232,7 +281,7 @@ void kunmap_high(struct page *page)
232 */ 281 */
233 need_wakeup = waitqueue_active(&pkmap_map_wait); 282 need_wakeup = waitqueue_active(&pkmap_map_wait);
234 } 283 }
235 spin_unlock(&kmap_lock); 284 unlock_kmap_any(flags);
236 285
237 /* do wake-up, if needed, race-free outside of the spin lock */ 286 /* do wake-up, if needed, race-free outside of the spin lock */
238 if (need_wakeup) 287 if (need_wakeup)
@@ -373,3 +422,48 @@ void __init page_address_init(void)
373} 422}
374 423
375#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ 424#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
425
426#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT)
427
428void debug_kmap_atomic(enum km_type type)
429{
430 static unsigned warn_count = 10;
431
432 if (unlikely(warn_count == 0))
433 return;
434
435 if (unlikely(in_interrupt())) {
436 if (in_irq()) {
437 if (type != KM_IRQ0 && type != KM_IRQ1 &&
438 type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
439 type != KM_BOUNCE_READ) {
440 WARN_ON(1);
441 warn_count--;
442 }
443 } else if (!irqs_disabled()) { /* softirq */
444 if (type != KM_IRQ0 && type != KM_IRQ1 &&
445 type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
446 type != KM_SKB_SUNRPC_DATA &&
447 type != KM_SKB_DATA_SOFTIRQ &&
448 type != KM_BOUNCE_READ) {
449 WARN_ON(1);
450 warn_count--;
451 }
452 }
453 }
454
455 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
456 type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
457 if (!irqs_disabled()) {
458 WARN_ON(1);
459 warn_count--;
460 }
461 } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
462 if (irq_count() == 0 && !irqs_disabled()) {
463 WARN_ON(1);
464 warn_count--;
465 }
466 }
467}
468
469#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 107da3d809a8..28c655ba9353 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -918,7 +918,7 @@ static void return_unused_surplus_pages(struct hstate *h,
918 * an instantiated the change should be committed via vma_commit_reservation. 918 * an instantiated the change should be committed via vma_commit_reservation.
919 * No action is required on failure. 919 * No action is required on failure.
920 */ 920 */
921static int vma_needs_reservation(struct hstate *h, 921static long vma_needs_reservation(struct hstate *h,
922 struct vm_area_struct *vma, unsigned long addr) 922 struct vm_area_struct *vma, unsigned long addr)
923{ 923{
924 struct address_space *mapping = vma->vm_file->f_mapping; 924 struct address_space *mapping = vma->vm_file->f_mapping;
@@ -933,7 +933,7 @@ static int vma_needs_reservation(struct hstate *h,
933 return 1; 933 return 1;
934 934
935 } else { 935 } else {
936 int err; 936 long err;
937 pgoff_t idx = vma_hugecache_offset(h, vma, addr); 937 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
938 struct resv_map *reservations = vma_resv_map(vma); 938 struct resv_map *reservations = vma_resv_map(vma);
939 939
@@ -969,7 +969,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
969 struct page *page; 969 struct page *page;
970 struct address_space *mapping = vma->vm_file->f_mapping; 970 struct address_space *mapping = vma->vm_file->f_mapping;
971 struct inode *inode = mapping->host; 971 struct inode *inode = mapping->host;
972 unsigned int chg; 972 long chg;
973 973
974 /* 974 /*
975 * Processes that did not create the mapping will have no reserves and 975 * Processes that did not create the mapping will have no reserves and
diff --git a/mm/internal.h b/mm/internal.h
index 478223b73a2a..987bb03fbdd8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -63,6 +63,7 @@ static inline unsigned long page_order(struct page *page)
63 return page_private(page); 63 return page_private(page);
64} 64}
65 65
66#ifdef CONFIG_HAVE_MLOCK
66extern long mlock_vma_pages_range(struct vm_area_struct *vma, 67extern long mlock_vma_pages_range(struct vm_area_struct *vma,
67 unsigned long start, unsigned long end); 68 unsigned long start, unsigned long end);
68extern void munlock_vma_pages_range(struct vm_area_struct *vma, 69extern void munlock_vma_pages_range(struct vm_area_struct *vma,
@@ -71,6 +72,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
71{ 72{
72 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); 73 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
73} 74}
75#endif
74 76
75#ifdef CONFIG_UNEVICTABLE_LRU 77#ifdef CONFIG_UNEVICTABLE_LRU
76/* 78/*
@@ -90,7 +92,7 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
90} 92}
91#endif 93#endif
92 94
93#ifdef CONFIG_UNEVICTABLE_LRU 95#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
94/* 96/*
95 * Called only in fault path via page_evictable() for a new page 97 * Called only in fault path via page_evictable() for a new page
96 * to determine if it's being mapped into a LOCKED vma. 98 * to determine if it's being mapped into a LOCKED vma.
@@ -165,7 +167,7 @@ static inline void free_page_mlock(struct page *page)
165 } 167 }
166} 168}
167 169
168#else /* CONFIG_UNEVICTABLE_LRU */ 170#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
169static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 171static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
170{ 172{
171 return 0; 173 return 0;
@@ -175,7 +177,7 @@ static inline void mlock_vma_page(struct page *page) { }
175static inline void mlock_migrate_page(struct page *new, struct page *old) { } 177static inline void mlock_migrate_page(struct page *new, struct page *old) { }
176static inline void free_page_mlock(struct page *page) { } 178static inline void free_page_mlock(struct page *page) { }
177 179
178#endif /* CONFIG_UNEVICTABLE_LRU */ 180#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
179 181
180/* 182/*
181 * Return the mem_map entry representing the 'offset' subpage within 183 * Return the mem_map entry representing the 'offset' subpage within
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8e4be9cb2a6a..2fc6d6c48238 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -27,6 +27,7 @@
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/bit_spinlock.h> 28#include <linux/bit_spinlock.h>
29#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/limits.h>
30#include <linux/mutex.h> 31#include <linux/mutex.h>
31#include <linux/slab.h> 32#include <linux/slab.h>
32#include <linux/swap.h> 33#include <linux/swap.h>
@@ -95,6 +96,15 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
95 return ret; 96 return ret;
96} 97}
97 98
99static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
100{
101 s64 ret;
102
103 ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
104 ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
105 return ret;
106}
107
98/* 108/*
99 * per-zone information in memory controller. 109 * per-zone information in memory controller.
100 */ 110 */
@@ -154,9 +164,9 @@ struct mem_cgroup {
154 164
155 /* 165 /*
156 * While reclaiming in a hiearchy, we cache the last child we 166 * While reclaiming in a hiearchy, we cache the last child we
157 * reclaimed from. Protected by hierarchy_mutex 167 * reclaimed from.
158 */ 168 */
159 struct mem_cgroup *last_scanned_child; 169 int last_scanned_child;
160 /* 170 /*
161 * Should the accounting and control be hierarchical, per subtree? 171 * Should the accounting and control be hierarchical, per subtree?
162 */ 172 */
@@ -247,7 +257,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
247 return mem_cgroup_zoneinfo(mem, nid, zid); 257 return mem_cgroup_zoneinfo(mem, nid, zid);
248} 258}
249 259
250static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, 260static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
251 enum lru_list idx) 261 enum lru_list idx)
252{ 262{
253 int nid, zid; 263 int nid, zid;
@@ -286,6 +296,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
286static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 296static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
287{ 297{
288 struct mem_cgroup *mem = NULL; 298 struct mem_cgroup *mem = NULL;
299
300 if (!mm)
301 return NULL;
289 /* 302 /*
290 * Because we have no locks, mm->owner's may be being moved to other 303 * Because we have no locks, mm->owner's may be being moved to other
291 * cgroup. We use css_tryget() here even if this looks 304 * cgroup. We use css_tryget() here even if this looks
@@ -308,6 +321,42 @@ static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
308 return css_is_removed(&mem->css); 321 return css_is_removed(&mem->css);
309} 322}
310 323
324
325/*
326 * Call callback function against all cgroup under hierarchy tree.
327 */
328static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
329 int (*func)(struct mem_cgroup *, void *))
330{
331 int found, ret, nextid;
332 struct cgroup_subsys_state *css;
333 struct mem_cgroup *mem;
334
335 if (!root->use_hierarchy)
336 return (*func)(root, data);
337
338 nextid = 1;
339 do {
340 ret = 0;
341 mem = NULL;
342
343 rcu_read_lock();
344 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
345 &found);
346 if (css && css_tryget(css))
347 mem = container_of(css, struct mem_cgroup, css);
348 rcu_read_unlock();
349
350 if (mem) {
351 ret = (*func)(mem, data);
352 css_put(&mem->css);
353 }
354 nextid = found + 1;
355 } while (!ret && css);
356
357 return ret;
358}
359
311/* 360/*
312 * Following LRU functions are allowed to be used without PCG_LOCK. 361 * Following LRU functions are allowed to be used without PCG_LOCK.
313 * Operations are called by routine of global LRU independently from memcg. 362 * Operations are called by routine of global LRU independently from memcg.
@@ -441,31 +490,24 @@ void mem_cgroup_move_lists(struct page *page,
441int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 490int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
442{ 491{
443 int ret; 492 int ret;
493 struct mem_cgroup *curr = NULL;
444 494
445 task_lock(task); 495 task_lock(task);
446 ret = task->mm && mm_match_cgroup(task->mm, mem); 496 rcu_read_lock();
497 curr = try_get_mem_cgroup_from_mm(task->mm);
498 rcu_read_unlock();
447 task_unlock(task); 499 task_unlock(task);
500 if (!curr)
501 return 0;
502 if (curr->use_hierarchy)
503 ret = css_is_ancestor(&curr->css, &mem->css);
504 else
505 ret = (curr == mem);
506 css_put(&curr->css);
448 return ret; 507 return ret;
449} 508}
450 509
451/* 510/*
452 * Calculate mapped_ratio under memory controller. This will be used in
453 * vmscan.c for deteremining we have to reclaim mapped pages.
454 */
455int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
456{
457 long total, rss;
458
459 /*
460 * usage is recorded in bytes. But, here, we assume the number of
461 * physical pages can be represented by "long" on any arch.
462 */
463 total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
464 rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
465 return (int)((rss * 100L) / total);
466}
467
468/*
469 * prev_priority control...this will be used in memory reclaim path. 511 * prev_priority control...this will be used in memory reclaim path.
470 */ 512 */
471int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 513int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
@@ -501,8 +543,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
501 unsigned long gb; 543 unsigned long gb;
502 unsigned long inactive_ratio; 544 unsigned long inactive_ratio;
503 545
504 inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON); 546 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
505 active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON); 547 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
506 548
507 gb = (inactive + active) >> (30 - PAGE_SHIFT); 549 gb = (inactive + active) >> (30 - PAGE_SHIFT);
508 if (gb) 550 if (gb)
@@ -629,172 +671,202 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
629#define mem_cgroup_from_res_counter(counter, member) \ 671#define mem_cgroup_from_res_counter(counter, member) \
630 container_of(counter, struct mem_cgroup, member) 672 container_of(counter, struct mem_cgroup, member)
631 673
632/* 674static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
633 * This routine finds the DFS walk successor. This routine should be
634 * called with hierarchy_mutex held
635 */
636static struct mem_cgroup *
637__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
638{ 675{
639 struct cgroup *cgroup, *curr_cgroup, *root_cgroup; 676 if (do_swap_account) {
640 677 if (res_counter_check_under_limit(&mem->res) &&
641 curr_cgroup = curr->css.cgroup; 678 res_counter_check_under_limit(&mem->memsw))
642 root_cgroup = root_mem->css.cgroup; 679 return true;
680 } else
681 if (res_counter_check_under_limit(&mem->res))
682 return true;
683 return false;
684}
643 685
644 if (!list_empty(&curr_cgroup->children)) { 686static unsigned int get_swappiness(struct mem_cgroup *memcg)
645 /* 687{
646 * Walk down to children 688 struct cgroup *cgrp = memcg->css.cgroup;
647 */ 689 unsigned int swappiness;
648 cgroup = list_entry(curr_cgroup->children.next,
649 struct cgroup, sibling);
650 curr = mem_cgroup_from_cont(cgroup);
651 goto done;
652 }
653 690
654visit_parent: 691 /* root ? */
655 if (curr_cgroup == root_cgroup) { 692 if (cgrp->parent == NULL)
656 /* caller handles NULL case */ 693 return vm_swappiness;
657 curr = NULL;
658 goto done;
659 }
660 694
661 /* 695 spin_lock(&memcg->reclaim_param_lock);
662 * Goto next sibling 696 swappiness = memcg->swappiness;
663 */ 697 spin_unlock(&memcg->reclaim_param_lock);
664 if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
665 cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
666 sibling);
667 curr = mem_cgroup_from_cont(cgroup);
668 goto done;
669 }
670 698
671 /* 699 return swappiness;
672 * Go up to next parent and next parent's sibling if need be 700}
673 */
674 curr_cgroup = curr_cgroup->parent;
675 goto visit_parent;
676 701
677done: 702static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
678 return curr; 703{
704 int *val = data;
705 (*val)++;
706 return 0;
679} 707}
680 708
681/* 709/**
682 * Visit the first child (need not be the first child as per the ordering 710 * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode.
683 * of the cgroup list, since we track last_scanned_child) of @mem and use 711 * @memcg: The memory cgroup that went over limit
684 * that to reclaim free pages from. 712 * @p: Task that is going to be killed
713 *
714 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
715 * enabled
685 */ 716 */
686static struct mem_cgroup * 717void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
687mem_cgroup_get_next_node(struct mem_cgroup *root_mem)
688{ 718{
689 struct cgroup *cgroup; 719 struct cgroup *task_cgrp;
690 struct mem_cgroup *orig, *next; 720 struct cgroup *mem_cgrp;
691 bool obsolete;
692
693 /* 721 /*
694 * Scan all children under the mem_cgroup mem 722 * Need a buffer in BSS, can't rely on allocations. The code relies
723 * on the assumption that OOM is serialized for memory controller.
724 * If this assumption is broken, revisit this code.
695 */ 725 */
696 mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); 726 static char memcg_name[PATH_MAX];
727 int ret;
728
729 if (!memcg)
730 return;
697 731
698 orig = root_mem->last_scanned_child;
699 obsolete = mem_cgroup_is_obsolete(orig);
700 732
701 if (list_empty(&root_mem->css.cgroup->children)) { 733 rcu_read_lock();
734
735 mem_cgrp = memcg->css.cgroup;
736 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
737
738 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
739 if (ret < 0) {
702 /* 740 /*
703 * root_mem might have children before and last_scanned_child 741 * Unfortunately, we are unable to convert to a useful name
704 * may point to one of them. We put it later. 742 * But we'll still print out the usage information
705 */ 743 */
706 if (orig) 744 rcu_read_unlock();
707 VM_BUG_ON(!obsolete);
708 next = NULL;
709 goto done; 745 goto done;
710 } 746 }
747 rcu_read_unlock();
711 748
712 if (!orig || obsolete) { 749 printk(KERN_INFO "Task in %s killed", memcg_name);
713 cgroup = list_first_entry(&root_mem->css.cgroup->children,
714 struct cgroup, sibling);
715 next = mem_cgroup_from_cont(cgroup);
716 } else
717 next = __mem_cgroup_get_next_node(orig, root_mem);
718 750
751 rcu_read_lock();
752 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
753 if (ret < 0) {
754 rcu_read_unlock();
755 goto done;
756 }
757 rcu_read_unlock();
758
759 /*
760 * Continues from above, so we don't need an KERN_ level
761 */
762 printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
719done: 763done:
720 if (next) 764
721 mem_cgroup_get(next); 765 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
722 root_mem->last_scanned_child = next; 766 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
723 if (orig) 767 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
724 mem_cgroup_put(orig); 768 res_counter_read_u64(&memcg->res, RES_FAILCNT));
725 mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); 769 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
726 return (next) ? next : root_mem; 770 "failcnt %llu\n",
771 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
772 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
773 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
727} 774}
728 775
729static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 776/*
777 * This function returns the number of memcg under hierarchy tree. Returns
778 * 1(self count) if no children.
779 */
780static int mem_cgroup_count_children(struct mem_cgroup *mem)
730{ 781{
731 if (do_swap_account) { 782 int num = 0;
732 if (res_counter_check_under_limit(&mem->res) && 783 mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
733 res_counter_check_under_limit(&mem->memsw)) 784 return num;
734 return true;
735 } else
736 if (res_counter_check_under_limit(&mem->res))
737 return true;
738 return false;
739} 785}
740 786
741static unsigned int get_swappiness(struct mem_cgroup *memcg) 787/*
788 * Visit the first child (need not be the first child as per the ordering
789 * of the cgroup list, since we track last_scanned_child) of @mem and use
790 * that to reclaim free pages from.
791 */
792static struct mem_cgroup *
793mem_cgroup_select_victim(struct mem_cgroup *root_mem)
742{ 794{
743 struct cgroup *cgrp = memcg->css.cgroup; 795 struct mem_cgroup *ret = NULL;
744 unsigned int swappiness; 796 struct cgroup_subsys_state *css;
797 int nextid, found;
745 798
746 /* root ? */ 799 if (!root_mem->use_hierarchy) {
747 if (cgrp->parent == NULL) 800 css_get(&root_mem->css);
748 return vm_swappiness; 801 ret = root_mem;
802 }
749 803
750 spin_lock(&memcg->reclaim_param_lock); 804 while (!ret) {
751 swappiness = memcg->swappiness; 805 rcu_read_lock();
752 spin_unlock(&memcg->reclaim_param_lock); 806 nextid = root_mem->last_scanned_child + 1;
807 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
808 &found);
809 if (css && css_tryget(css))
810 ret = container_of(css, struct mem_cgroup, css);
811
812 rcu_read_unlock();
813 /* Updates scanning parameter */
814 spin_lock(&root_mem->reclaim_param_lock);
815 if (!css) {
816 /* this means start scan from ID:1 */
817 root_mem->last_scanned_child = 0;
818 } else
819 root_mem->last_scanned_child = found;
820 spin_unlock(&root_mem->reclaim_param_lock);
821 }
753 822
754 return swappiness; 823 return ret;
755} 824}
756 825
757/* 826/*
758 * Dance down the hierarchy if needed to reclaim memory. We remember the 827 * Scan the hierarchy if needed to reclaim memory. We remember the last child
759 * last child we reclaimed from, so that we don't end up penalizing 828 * we reclaimed from, so that we don't end up penalizing one child extensively
760 * one child extensively based on its position in the children list. 829 * based on its position in the children list.
761 * 830 *
762 * root_mem is the original ancestor that we've been reclaim from. 831 * root_mem is the original ancestor that we've been reclaim from.
832 *
833 * We give up and return to the caller when we visit root_mem twice.
834 * (other groups can be removed while we're walking....)
835 *
836 * If shrink==true, this returns immediately to avoid freeing too much.
763 */ 837 */
764static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 838static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
765 gfp_t gfp_mask, bool noswap) 839 gfp_t gfp_mask, bool noswap, bool shrink)
766{ 840{
767 struct mem_cgroup *next_mem; 841 struct mem_cgroup *victim;
768 int ret = 0; 842 int ret, total = 0;
769 843 int loop = 0;
770 /* 844
771 * Reclaim unconditionally and don't check for return value. 845 while (loop < 2) {
772 * We need to reclaim in the current group and down the tree. 846 victim = mem_cgroup_select_victim(root_mem);
773 * One might think about checking for children before reclaiming, 847 if (victim == root_mem)
774 * but there might be left over accounting, even after children 848 loop++;
775 * have left. 849 if (!mem_cgroup_local_usage(&victim->stat)) {
776 */ 850 /* this cgroup's local usage == 0 */
777 ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, 851 css_put(&victim->css);
778 get_swappiness(root_mem));
779 if (mem_cgroup_check_under_limit(root_mem))
780 return 1; /* indicate reclaim has succeeded */
781 if (!root_mem->use_hierarchy)
782 return ret;
783
784 next_mem = mem_cgroup_get_next_node(root_mem);
785
786 while (next_mem != root_mem) {
787 if (mem_cgroup_is_obsolete(next_mem)) {
788 next_mem = mem_cgroup_get_next_node(root_mem);
789 continue; 852 continue;
790 } 853 }
791 ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, 854 /* we use swappiness of local cgroup */
792 get_swappiness(next_mem)); 855 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
856 get_swappiness(victim));
857 css_put(&victim->css);
858 /*
859 * At shrinking usage, we can't check we should stop here or
860 * reclaim more. It depends on the callers. last_scanned_child
861 * will work enough for keeping fairness under tree.
862 */
863 if (shrink)
864 return ret;
865 total += ret;
793 if (mem_cgroup_check_under_limit(root_mem)) 866 if (mem_cgroup_check_under_limit(root_mem))
794 return 1; /* indicate reclaim has succeeded */ 867 return 1 + total;
795 next_mem = mem_cgroup_get_next_node(root_mem);
796 } 868 }
797 return ret; 869 return total;
798} 870}
799 871
800bool mem_cgroup_oom_called(struct task_struct *task) 872bool mem_cgroup_oom_called(struct task_struct *task)
@@ -813,6 +885,19 @@ bool mem_cgroup_oom_called(struct task_struct *task)
813 rcu_read_unlock(); 885 rcu_read_unlock();
814 return ret; 886 return ret;
815} 887}
888
889static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
890{
891 mem->last_oom_jiffies = jiffies;
892 return 0;
893}
894
895static void record_last_oom(struct mem_cgroup *mem)
896{
897 mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
898}
899
900
816/* 901/*
817 * Unlike exported interface, "oom" parameter is added. if oom==true, 902 * Unlike exported interface, "oom" parameter is added. if oom==true,
818 * oom-killer can be invoked. 903 * oom-killer can be invoked.
@@ -875,7 +960,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
875 goto nomem; 960 goto nomem;
876 961
877 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, 962 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
878 noswap); 963 noswap, false);
879 if (ret) 964 if (ret)
880 continue; 965 continue;
881 966
@@ -895,7 +980,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
895 mutex_lock(&memcg_tasklist); 980 mutex_lock(&memcg_tasklist);
896 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); 981 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
897 mutex_unlock(&memcg_tasklist); 982 mutex_unlock(&memcg_tasklist);
898 mem_over_limit->last_oom_jiffies = jiffies; 983 record_last_oom(mem_over_limit);
899 } 984 }
900 goto nomem; 985 goto nomem;
901 } 986 }
@@ -906,20 +991,55 @@ nomem:
906 return -ENOMEM; 991 return -ENOMEM;
907} 992}
908 993
994
995/*
996 * A helper function to get mem_cgroup from ID. must be called under
997 * rcu_read_lock(). The caller must check css_is_removed() or some if
998 * it's concern. (dropping refcnt from swap can be called against removed
999 * memcg.)
1000 */
1001static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1002{
1003 struct cgroup_subsys_state *css;
1004
1005 /* ID 0 is unused ID */
1006 if (!id)
1007 return NULL;
1008 css = css_lookup(&mem_cgroup_subsys, id);
1009 if (!css)
1010 return NULL;
1011 return container_of(css, struct mem_cgroup, css);
1012}
1013
909static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) 1014static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
910{ 1015{
911 struct mem_cgroup *mem; 1016 struct mem_cgroup *mem;
1017 struct page_cgroup *pc;
1018 unsigned short id;
912 swp_entry_t ent; 1019 swp_entry_t ent;
913 1020
1021 VM_BUG_ON(!PageLocked(page));
1022
914 if (!PageSwapCache(page)) 1023 if (!PageSwapCache(page))
915 return NULL; 1024 return NULL;
916 1025
917 ent.val = page_private(page); 1026 pc = lookup_page_cgroup(page);
918 mem = lookup_swap_cgroup(ent); 1027 /*
919 if (!mem) 1028 * Used bit of swapcache is solid under page lock.
920 return NULL; 1029 */
921 if (!css_tryget(&mem->css)) 1030 if (PageCgroupUsed(pc)) {
922 return NULL; 1031 mem = pc->mem_cgroup;
1032 if (mem && !css_tryget(&mem->css))
1033 mem = NULL;
1034 } else {
1035 ent.val = page_private(page);
1036 id = lookup_swap_cgroup(ent);
1037 rcu_read_lock();
1038 mem = mem_cgroup_lookup(id);
1039 if (mem && !css_tryget(&mem->css))
1040 mem = NULL;
1041 rcu_read_unlock();
1042 }
923 return mem; 1043 return mem;
924} 1044}
925 1045
@@ -1118,6 +1238,10 @@ int mem_cgroup_newpage_charge(struct page *page,
1118 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); 1238 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
1119} 1239}
1120 1240
1241static void
1242__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1243 enum charge_type ctype);
1244
1121int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 1245int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1122 gfp_t gfp_mask) 1246 gfp_t gfp_mask)
1123{ 1247{
@@ -1154,16 +1278,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1154 unlock_page_cgroup(pc); 1278 unlock_page_cgroup(pc);
1155 } 1279 }
1156 1280
1157 if (do_swap_account && PageSwapCache(page)) {
1158 mem = try_get_mem_cgroup_from_swapcache(page);
1159 if (mem)
1160 mm = NULL;
1161 else
1162 mem = NULL;
1163 /* SwapCache may be still linked to LRU now. */
1164 mem_cgroup_lru_del_before_commit_swapcache(page);
1165 }
1166
1167 if (unlikely(!mm && !mem)) 1281 if (unlikely(!mm && !mem))
1168 mm = &init_mm; 1282 mm = &init_mm;
1169 1283
@@ -1171,22 +1285,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1171 return mem_cgroup_charge_common(page, mm, gfp_mask, 1285 return mem_cgroup_charge_common(page, mm, gfp_mask,
1172 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 1286 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
1173 1287
1174 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 1288 /* shmem */
1175 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); 1289 if (PageSwapCache(page)) {
1176 if (mem) 1290 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
1177 css_put(&mem->css); 1291 if (!ret)
1178 if (PageSwapCache(page)) 1292 __mem_cgroup_commit_charge_swapin(page, mem,
1179 mem_cgroup_lru_add_after_commit_swapcache(page); 1293 MEM_CGROUP_CHARGE_TYPE_SHMEM);
1294 } else
1295 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
1296 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
1180 1297
1181 if (do_swap_account && !ret && PageSwapCache(page)) {
1182 swp_entry_t ent = {.val = page_private(page)};
1183 /* avoid double counting */
1184 mem = swap_cgroup_record(ent, NULL);
1185 if (mem) {
1186 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1187 mem_cgroup_put(mem);
1188 }
1189 }
1190 return ret; 1298 return ret;
1191} 1299}
1192 1300
@@ -1229,7 +1337,9 @@ charge_cur_mm:
1229 return __mem_cgroup_try_charge(mm, mask, ptr, true); 1337 return __mem_cgroup_try_charge(mm, mask, ptr, true);
1230} 1338}
1231 1339
1232void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 1340static void
1341__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1342 enum charge_type ctype)
1233{ 1343{
1234 struct page_cgroup *pc; 1344 struct page_cgroup *pc;
1235 1345
@@ -1239,7 +1349,7 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1239 return; 1349 return;
1240 pc = lookup_page_cgroup(page); 1350 pc = lookup_page_cgroup(page);
1241 mem_cgroup_lru_del_before_commit_swapcache(page); 1351 mem_cgroup_lru_del_before_commit_swapcache(page);
1242 __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); 1352 __mem_cgroup_commit_charge(ptr, pc, ctype);
1243 mem_cgroup_lru_add_after_commit_swapcache(page); 1353 mem_cgroup_lru_add_after_commit_swapcache(page);
1244 /* 1354 /*
1245 * Now swap is on-memory. This means this page may be 1355 * Now swap is on-memory. This means this page may be
@@ -1250,18 +1360,32 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1250 */ 1360 */
1251 if (do_swap_account && PageSwapCache(page)) { 1361 if (do_swap_account && PageSwapCache(page)) {
1252 swp_entry_t ent = {.val = page_private(page)}; 1362 swp_entry_t ent = {.val = page_private(page)};
1363 unsigned short id;
1253 struct mem_cgroup *memcg; 1364 struct mem_cgroup *memcg;
1254 memcg = swap_cgroup_record(ent, NULL); 1365
1366 id = swap_cgroup_record(ent, 0);
1367 rcu_read_lock();
1368 memcg = mem_cgroup_lookup(id);
1255 if (memcg) { 1369 if (memcg) {
1370 /*
1371 * This recorded memcg can be obsolete one. So, avoid
1372 * calling css_tryget
1373 */
1256 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1374 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1257 mem_cgroup_put(memcg); 1375 mem_cgroup_put(memcg);
1258 } 1376 }
1259 1377 rcu_read_unlock();
1260 } 1378 }
1261 /* add this page(page_cgroup) to the LRU we want. */ 1379 /* add this page(page_cgroup) to the LRU we want. */
1262 1380
1263} 1381}
1264 1382
1383void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1384{
1385 __mem_cgroup_commit_charge_swapin(page, ptr,
1386 MEM_CGROUP_CHARGE_TYPE_MAPPED);
1387}
1388
1265void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 1389void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1266{ 1390{
1267 if (mem_cgroup_disabled()) 1391 if (mem_cgroup_disabled())
@@ -1324,8 +1448,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1324 res_counter_uncharge(&mem->res, PAGE_SIZE); 1448 res_counter_uncharge(&mem->res, PAGE_SIZE);
1325 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) 1449 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1326 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1450 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1327
1328 mem_cgroup_charge_statistics(mem, pc, false); 1451 mem_cgroup_charge_statistics(mem, pc, false);
1452
1329 ClearPageCgroupUsed(pc); 1453 ClearPageCgroupUsed(pc);
1330 /* 1454 /*
1331 * pc->mem_cgroup is not cleared here. It will be accessed when it's 1455 * pc->mem_cgroup is not cleared here. It will be accessed when it's
@@ -1377,7 +1501,7 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
1377 MEM_CGROUP_CHARGE_TYPE_SWAPOUT); 1501 MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
1378 /* record memcg information */ 1502 /* record memcg information */
1379 if (do_swap_account && memcg) { 1503 if (do_swap_account && memcg) {
1380 swap_cgroup_record(ent, memcg); 1504 swap_cgroup_record(ent, css_id(&memcg->css));
1381 mem_cgroup_get(memcg); 1505 mem_cgroup_get(memcg);
1382 } 1506 }
1383 if (memcg) 1507 if (memcg)
@@ -1392,15 +1516,23 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
1392void mem_cgroup_uncharge_swap(swp_entry_t ent) 1516void mem_cgroup_uncharge_swap(swp_entry_t ent)
1393{ 1517{
1394 struct mem_cgroup *memcg; 1518 struct mem_cgroup *memcg;
1519 unsigned short id;
1395 1520
1396 if (!do_swap_account) 1521 if (!do_swap_account)
1397 return; 1522 return;
1398 1523
1399 memcg = swap_cgroup_record(ent, NULL); 1524 id = swap_cgroup_record(ent, 0);
1525 rcu_read_lock();
1526 memcg = mem_cgroup_lookup(id);
1400 if (memcg) { 1527 if (memcg) {
1528 /*
1529 * We uncharge this because swap is freed.
 1530 * This memcg can be an obsolete one. We avoid calling css_tryget
1531 */
1401 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1532 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1402 mem_cgroup_put(memcg); 1533 mem_cgroup_put(memcg);
1403 } 1534 }
1535 rcu_read_unlock();
1404} 1536}
1405#endif 1537#endif
1406 1538
@@ -1508,7 +1640,8 @@ int mem_cgroup_shrink_usage(struct page *page,
1508 return 0; 1640 return 0;
1509 1641
1510 do { 1642 do {
1511 progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true); 1643 progress = mem_cgroup_hierarchical_reclaim(mem,
1644 gfp_mask, true, false);
1512 progress += mem_cgroup_check_under_limit(mem); 1645 progress += mem_cgroup_check_under_limit(mem);
1513 } while (!progress && --retry); 1646 } while (!progress && --retry);
1514 1647
@@ -1523,11 +1656,21 @@ static DEFINE_MUTEX(set_limit_mutex);
1523static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 1656static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1524 unsigned long long val) 1657 unsigned long long val)
1525{ 1658{
1526 1659 int retry_count;
1527 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
1528 int progress; 1660 int progress;
1529 u64 memswlimit; 1661 u64 memswlimit;
1530 int ret = 0; 1662 int ret = 0;
1663 int children = mem_cgroup_count_children(memcg);
1664 u64 curusage, oldusage;
1665
1666 /*
 1667 * To keep hierarchical_reclaim simple, how long we should retry
 1668 * depends on the caller. We set our retry-count to be a function
 1669 * of the # of children which we should visit in this loop.
1670 */
1671 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
1672
1673 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
1531 1674
1532 while (retry_count) { 1675 while (retry_count) {
1533 if (signal_pending(current)) { 1676 if (signal_pending(current)) {
@@ -1553,8 +1696,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1553 break; 1696 break;
1554 1697
1555 progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, 1698 progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
1556 false); 1699 false, true);
1557 if (!progress) retry_count--; 1700 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
1701 /* Usage is reduced ? */
1702 if (curusage >= oldusage)
1703 retry_count--;
1704 else
1705 oldusage = curusage;
1558 } 1706 }
1559 1707
1560 return ret; 1708 return ret;
@@ -1563,13 +1711,16 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1563int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 1711int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1564 unsigned long long val) 1712 unsigned long long val)
1565{ 1713{
1566 int retry_count = MEM_CGROUP_RECLAIM_RETRIES; 1714 int retry_count;
1567 u64 memlimit, oldusage, curusage; 1715 u64 memlimit, oldusage, curusage;
1568 int ret; 1716 int children = mem_cgroup_count_children(memcg);
1717 int ret = -EBUSY;
1569 1718
1570 if (!do_swap_account) 1719 if (!do_swap_account)
1571 return -EINVAL; 1720 return -EINVAL;
 1572 1721 /* see mem_cgroup_resize_limit */
1722 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
1723 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1573 while (retry_count) { 1724 while (retry_count) {
1574 if (signal_pending(current)) { 1725 if (signal_pending(current)) {
1575 ret = -EINTR; 1726 ret = -EINTR;
@@ -1593,11 +1744,13 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1593 if (!ret) 1744 if (!ret)
1594 break; 1745 break;
1595 1746
1596 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 1747 mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true);
1597 mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true);
1598 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 1748 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1749 /* Usage is reduced ? */
1599 if (curusage >= oldusage) 1750 if (curusage >= oldusage)
1600 retry_count--; 1751 retry_count--;
1752 else
1753 oldusage = curusage;
1601 } 1754 }
1602 return ret; 1755 return ret;
1603} 1756}
@@ -1893,54 +2046,90 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
1893 return 0; 2046 return 0;
1894} 2047}
1895 2048
1896static const struct mem_cgroup_stat_desc { 2049
1897 const char *msg; 2050/* For read statistics */
1898 u64 unit; 2051enum {
1899} mem_cgroup_stat_desc[] = { 2052 MCS_CACHE,
1900 [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, 2053 MCS_RSS,
1901 [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, 2054 MCS_PGPGIN,
1902 [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, 2055 MCS_PGPGOUT,
1903 [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, 2056 MCS_INACTIVE_ANON,
2057 MCS_ACTIVE_ANON,
2058 MCS_INACTIVE_FILE,
2059 MCS_ACTIVE_FILE,
2060 MCS_UNEVICTABLE,
2061 NR_MCS_STAT,
2062};
2063
2064struct mcs_total_stat {
2065 s64 stat[NR_MCS_STAT];
2066};
2067
2068struct {
2069 char *local_name;
2070 char *total_name;
2071} memcg_stat_strings[NR_MCS_STAT] = {
2072 {"cache", "total_cache"},
2073 {"rss", "total_rss"},
2074 {"pgpgin", "total_pgpgin"},
2075 {"pgpgout", "total_pgpgout"},
2076 {"inactive_anon", "total_inactive_anon"},
2077 {"active_anon", "total_active_anon"},
2078 {"inactive_file", "total_inactive_file"},
2079 {"active_file", "total_active_file"},
2080 {"unevictable", "total_unevictable"}
1904}; 2081};
1905 2082
2083
2084static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2085{
2086 struct mcs_total_stat *s = data;
2087 s64 val;
2088
2089 /* per cpu stat */
2090 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
2091 s->stat[MCS_CACHE] += val * PAGE_SIZE;
2092 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
2093 s->stat[MCS_RSS] += val * PAGE_SIZE;
2094 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
2095 s->stat[MCS_PGPGIN] += val;
2096 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2097 s->stat[MCS_PGPGOUT] += val;
2098
2099 /* per zone stat */
2100 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
2101 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
2102 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
2103 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
2104 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
2105 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
2106 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
2107 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
2108 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
2109 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
2110 return 0;
2111}
2112
2113static void
2114mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
2115{
2116 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
2117}
2118
1906static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 2119static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
1907 struct cgroup_map_cb *cb) 2120 struct cgroup_map_cb *cb)
1908{ 2121{
1909 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 2122 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
1910 struct mem_cgroup_stat *stat = &mem_cont->stat; 2123 struct mcs_total_stat mystat;
1911 int i; 2124 int i;
1912 2125
1913 for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { 2126 memset(&mystat, 0, sizeof(mystat));
1914 s64 val; 2127 mem_cgroup_get_local_stat(mem_cont, &mystat);
1915 2128
1916 val = mem_cgroup_read_stat(stat, i); 2129 for (i = 0; i < NR_MCS_STAT; i++)
1917 val *= mem_cgroup_stat_desc[i].unit; 2130 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
1918 cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
1919 }
1920 /* showing # of active pages */
1921 {
1922 unsigned long active_anon, inactive_anon;
1923 unsigned long active_file, inactive_file;
1924 unsigned long unevictable;
1925
1926 inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
1927 LRU_INACTIVE_ANON);
1928 active_anon = mem_cgroup_get_all_zonestat(mem_cont,
1929 LRU_ACTIVE_ANON);
1930 inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
1931 LRU_INACTIVE_FILE);
1932 active_file = mem_cgroup_get_all_zonestat(mem_cont,
1933 LRU_ACTIVE_FILE);
1934 unevictable = mem_cgroup_get_all_zonestat(mem_cont,
1935 LRU_UNEVICTABLE);
1936
1937 cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
1938 cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
1939 cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
1940 cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
1941 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
1942 2131
1943 } 2132 /* Hierarchical information */
1944 { 2133 {
1945 unsigned long long limit, memsw_limit; 2134 unsigned long long limit, memsw_limit;
1946 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 2135 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
@@ -1949,6 +2138,12 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
1949 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 2138 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
1950 } 2139 }
1951 2140
2141 memset(&mystat, 0, sizeof(mystat));
2142 mem_cgroup_get_total_stat(mem_cont, &mystat);
2143 for (i = 0; i < NR_MCS_STAT; i++)
2144 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
2145
2146
1952#ifdef CONFIG_DEBUG_VM 2147#ifdef CONFIG_DEBUG_VM
1953 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 2148 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
1954 2149
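Taken together, the local-stat loop, the hierarchical-limit block and the total_* loop above lay out the per-cgroup memory.stat file roughly in the order sketched below (field names only; cache/rss and the LRU counters are byte values, pgpgin/pgpgout are raw event counts, and the actual numbers depend on the workload):

    cache, rss, pgpgin, pgpgout,
    inactive_anon, active_anon, inactive_file, active_file, unevictable,
    hierarchical_memory_limit, hierarchical_memsw_limit,
    total_cache, total_rss, total_pgpgin, total_pgpgout,
    total_inactive_anon, total_active_anon, total_inactive_file,
    total_active_file, total_unevictable
    (plus inactive_ratio when CONFIG_DEBUG_VM is set)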
@@ -2178,6 +2373,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
2178{ 2373{
2179 int node; 2374 int node;
2180 2375
2376 free_css_id(&mem_cgroup_subsys, &mem->css);
2377
2181 for_each_node_state(node, N_POSSIBLE) 2378 for_each_node_state(node, N_POSSIBLE)
2182 free_mem_cgroup_per_zone_info(mem, node); 2379 free_mem_cgroup_per_zone_info(mem, node);
2183 2380
@@ -2228,11 +2425,12 @@ static struct cgroup_subsys_state * __ref
2228mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 2425mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2229{ 2426{
2230 struct mem_cgroup *mem, *parent; 2427 struct mem_cgroup *mem, *parent;
2428 long error = -ENOMEM;
2231 int node; 2429 int node;
2232 2430
2233 mem = mem_cgroup_alloc(); 2431 mem = mem_cgroup_alloc();
2234 if (!mem) 2432 if (!mem)
2235 return ERR_PTR(-ENOMEM); 2433 return ERR_PTR(error);
2236 2434
2237 for_each_node_state(node, N_POSSIBLE) 2435 for_each_node_state(node, N_POSSIBLE)
2238 if (alloc_mem_cgroup_per_zone_info(mem, node)) 2436 if (alloc_mem_cgroup_per_zone_info(mem, node))
@@ -2260,7 +2458,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2260 res_counter_init(&mem->res, NULL); 2458 res_counter_init(&mem->res, NULL);
2261 res_counter_init(&mem->memsw, NULL); 2459 res_counter_init(&mem->memsw, NULL);
2262 } 2460 }
2263 mem->last_scanned_child = NULL; 2461 mem->last_scanned_child = 0;
2264 spin_lock_init(&mem->reclaim_param_lock); 2462 spin_lock_init(&mem->reclaim_param_lock);
2265 2463
2266 if (parent) 2464 if (parent)
@@ -2269,26 +2467,22 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2269 return &mem->css; 2467 return &mem->css;
2270free_out: 2468free_out:
2271 __mem_cgroup_free(mem); 2469 __mem_cgroup_free(mem);
2272 return ERR_PTR(-ENOMEM); 2470 return ERR_PTR(error);
2273} 2471}
2274 2472
2275static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 2473static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
2276 struct cgroup *cont) 2474 struct cgroup *cont)
2277{ 2475{
2278 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2476 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2279 mem_cgroup_force_empty(mem, false); 2477
2478 return mem_cgroup_force_empty(mem, false);
2280} 2479}
2281 2480
2282static void mem_cgroup_destroy(struct cgroup_subsys *ss, 2481static void mem_cgroup_destroy(struct cgroup_subsys *ss,
2283 struct cgroup *cont) 2482 struct cgroup *cont)
2284{ 2483{
2285 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2484 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2286 struct mem_cgroup *last_scanned_child = mem->last_scanned_child;
2287 2485
2288 if (last_scanned_child) {
2289 VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child));
2290 mem_cgroup_put(last_scanned_child);
2291 }
2292 mem_cgroup_put(mem); 2486 mem_cgroup_put(mem);
2293} 2487}
2294 2488
@@ -2327,6 +2521,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
2327 .populate = mem_cgroup_populate, 2521 .populate = mem_cgroup_populate,
2328 .attach = mem_cgroup_move_task, 2522 .attach = mem_cgroup_move_task,
2329 .early_init = 0, 2523 .early_init = 0,
2524 .use_id = 1,
2330}; 2525};
2331 2526
2332#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2527#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
diff --git a/mm/memory.c b/mm/memory.c
index baa999e87cd2..cf6873e91c6a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1151,6 +1151,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1151 if ((flags & FOLL_WRITE) && 1151 if ((flags & FOLL_WRITE) &&
1152 !pte_dirty(pte) && !PageDirty(page)) 1152 !pte_dirty(pte) && !PageDirty(page))
1153 set_page_dirty(page); 1153 set_page_dirty(page);
1154 /*
1155 * pte_mkyoung() would be more correct here, but atomic care
1156 * is needed to avoid losing the dirty bit: it is easier to use
1157 * mark_page_accessed().
1158 */
1154 mark_page_accessed(page); 1159 mark_page_accessed(page);
1155 } 1160 }
1156unlock: 1161unlock:
@@ -1665,9 +1670,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1665 * behaviour that some programs depend on. We mark the "original" 1670 * behaviour that some programs depend on. We mark the "original"
1666 * un-COW'ed pages by matching them up with "vma->vm_pgoff". 1671 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
1667 */ 1672 */
1668 if (addr == vma->vm_start && end == vma->vm_end) 1673 if (addr == vma->vm_start && end == vma->vm_end) {
1669 vma->vm_pgoff = pfn; 1674 vma->vm_pgoff = pfn;
1670 else if (is_cow_mapping(vma->vm_flags)) 1675 vma->vm_flags |= VM_PFN_AT_MMAP;
1676 } else if (is_cow_mapping(vma->vm_flags))
1671 return -EINVAL; 1677 return -EINVAL;
1672 1678
1673 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; 1679 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
@@ -1679,6 +1685,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1679 * needed from higher level routine calling unmap_vmas 1685 * needed from higher level routine calling unmap_vmas
1680 */ 1686 */
1681 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); 1687 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
1688 vma->vm_flags &= ~VM_PFN_AT_MMAP;
1682 return -EINVAL; 1689 return -EINVAL;
1683 } 1690 }
1684 1691
@@ -1938,6 +1945,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1938 * get_user_pages(.write=1, .force=1). 1945 * get_user_pages(.write=1, .force=1).
1939 */ 1946 */
1940 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 1947 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
1948 struct vm_fault vmf;
1949 int tmp;
1950
1951 vmf.virtual_address = (void __user *)(address &
1952 PAGE_MASK);
1953 vmf.pgoff = old_page->index;
1954 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
1955 vmf.page = old_page;
1956
1941 /* 1957 /*
1942 * Notify the address space that the page is about to 1958 * Notify the address space that the page is about to
1943 * become writable so that it can prohibit this or wait 1959 * become writable so that it can prohibit this or wait
@@ -1949,8 +1965,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1949 page_cache_get(old_page); 1965 page_cache_get(old_page);
1950 pte_unmap_unlock(page_table, ptl); 1966 pte_unmap_unlock(page_table, ptl);
1951 1967
1952 if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) 1968 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
1969 if (unlikely(tmp &
1970 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
1971 ret = tmp;
1953 goto unwritable_page; 1972 goto unwritable_page;
1973 }
1954 1974
1955 /* 1975 /*
1956 * Since we dropped the lock we need to revalidate 1976 * Since we dropped the lock we need to revalidate
@@ -2099,7 +2119,7 @@ oom:
2099 2119
2100unwritable_page: 2120unwritable_page:
2101 page_cache_release(old_page); 2121 page_cache_release(old_page);
2102 return VM_FAULT_SIGBUS; 2122 return ret;
2103} 2123}
2104 2124
2105/* 2125/*
@@ -2433,8 +2453,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2433 count_vm_event(PGMAJFAULT); 2453 count_vm_event(PGMAJFAULT);
2434 } 2454 }
2435 2455
2436 mark_page_accessed(page);
2437
2438 lock_page(page); 2456 lock_page(page);
2439 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2457 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2440 2458
@@ -2643,9 +2661,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2643 * to become writable 2661 * to become writable
2644 */ 2662 */
2645 if (vma->vm_ops->page_mkwrite) { 2663 if (vma->vm_ops->page_mkwrite) {
2664 int tmp;
2665
2646 unlock_page(page); 2666 unlock_page(page);
2647 if (vma->vm_ops->page_mkwrite(vma, page) < 0) { 2667 vmf.flags |= FAULT_FLAG_MKWRITE;
2648 ret = VM_FAULT_SIGBUS; 2668 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
2669 if (unlikely(tmp &
2670 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2671 ret = tmp;
2649 anon = 1; /* no anon but release vmf.page */ 2672 anon = 1; /* no anon but release vmf.page */
2650 goto out_unlocked; 2673 goto out_unlocked;
2651 } 2674 }
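For driver and filesystem authors, here is a minimal sketch of a ->page_mkwrite handler against the new prototype used above (a struct vm_fault instead of a bare struct page). It is a fragment shown for the calling convention only; example_can_write() and the policy it stands for are hypothetical, and only the signature and the VM_FAULT_* return convention come from this patch.

/* hypothetical policy hook, not part of this patch */
static bool example_can_write(struct file *file, struct page *page);

static int example_page_mkwrite(struct vm_area_struct *vma,
                                struct vm_fault *vmf)
{
        struct page *page = vmf->page;  /* FAULT_FLAG_MKWRITE is set in vmf->flags */

        /* ... wait for writeback, reserve blocks, etc. ... */
        if (!example_can_write(vma->vm_file, page))
                return VM_FAULT_SIGBUS; /* VM_FAULT_ERROR bits abort the write fault */

        return 0;                       /* the page may now be made writable */
}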
diff --git a/mm/migrate.c b/mm/migrate.c
index a9eff3f092f6..068655d8f883 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -250,7 +250,7 @@ out:
250 * The number of remaining references must be: 250 * The number of remaining references must be:
251 * 1 for anonymous pages without a mapping 251 * 1 for anonymous pages without a mapping
252 * 2 for pages with a mapping 252 * 2 for pages with a mapping
253 * 3 for pages with a mapping and PagePrivate set. 253 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
254 */ 254 */
255static int migrate_page_move_mapping(struct address_space *mapping, 255static int migrate_page_move_mapping(struct address_space *mapping,
256 struct page *newpage, struct page *page) 256 struct page *newpage, struct page *page)
@@ -270,7 +270,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
270 pslot = radix_tree_lookup_slot(&mapping->page_tree, 270 pslot = radix_tree_lookup_slot(&mapping->page_tree,
271 page_index(page)); 271 page_index(page));
272 272
273 expected_count = 2 + !!PagePrivate(page); 273 expected_count = 2 + !!page_has_private(page);
274 if (page_count(page) != expected_count || 274 if (page_count(page) != expected_count ||
275 (struct page *)radix_tree_deref_slot(pslot) != page) { 275 (struct page *)radix_tree_deref_slot(pslot) != page) {
276 spin_unlock_irq(&mapping->tree_lock); 276 spin_unlock_irq(&mapping->tree_lock);
@@ -386,7 +386,7 @@ EXPORT_SYMBOL(fail_migrate_page);
386 386
387/* 387/*
388 * Common logic to directly migrate a single page suitable for 388 * Common logic to directly migrate a single page suitable for
389 * pages that do not use PagePrivate. 389 * pages that do not use PagePrivate/PagePrivate2.
390 * 390 *
391 * Pages are locked upon entry and exit. 391 * Pages are locked upon entry and exit.
392 */ 392 */
@@ -522,7 +522,7 @@ static int fallback_migrate_page(struct address_space *mapping,
522 * Buffers may be managed in a filesystem specific way. 522 * Buffers may be managed in a filesystem specific way.
523 * We must have no buffers or drop them. 523 * We must have no buffers or drop them.
524 */ 524 */
525 if (PagePrivate(page) && 525 if (page_has_private(page) &&
526 !try_to_release_page(page, GFP_KERNEL)) 526 !try_to_release_page(page, GFP_KERNEL))
527 return -EAGAIN; 527 return -EAGAIN;
528 528
@@ -655,7 +655,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
655 * free the metadata, so the page can be freed. 655 * free the metadata, so the page can be freed.
656 */ 656 */
657 if (!page->mapping) { 657 if (!page->mapping) {
658 if (!PageAnon(page) && PagePrivate(page)) { 658 if (!PageAnon(page) && page_has_private(page)) {
659 /* 659 /*
660 * Go direct to try_to_free_buffers() here because 660 * Go direct to try_to_free_buffers() here because
661 * a) that's what try_to_release_page() would do anyway 661 * a) that's what try_to_release_page() would do anyway
diff --git a/mm/mmap.c b/mm/mmap.c
index 00ced3ee49a8..4a3841186c11 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -20,6 +20,7 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/personality.h> 21#include <linux/personality.h>
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/ima.h>
23#include <linux/hugetlb.h> 24#include <linux/hugetlb.h>
24#include <linux/profile.h> 25#include <linux/profile.h>
25#include <linux/module.h> 26#include <linux/module.h>
@@ -1049,6 +1050,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1049 error = security_file_mmap(file, reqprot, prot, flags, addr, 0); 1050 error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
1050 if (error) 1051 if (error)
1051 return error; 1052 return error;
1053 error = ima_file_mmap(file, prot);
1054 if (error)
1055 return error;
1052 1056
1053 return mmap_region(file, addr, len, flags, vm_flags, pgoff); 1057 return mmap_region(file, addr, len, flags, vm_flags, pgoff);
1054} 1058}
@@ -2477,7 +2481,4 @@ void mm_drop_all_locks(struct mm_struct *mm)
2477 */ 2481 */
2478void __init mmap_init(void) 2482void __init mmap_init(void)
2479{ 2483{
2480 vm_area_cachep = kmem_cache_create("vm_area_struct",
2481 sizeof(struct vm_area_struct), 0,
2482 SLAB_PANIC, NULL);
2483} 2484}
diff --git a/mm/nommu.c b/mm/nommu.c
index 2fcf47d449b4..72eda4aee2cb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -69,7 +69,7 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
69int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ 69int sysctl_nr_trim_pages = 1; /* page trimming behaviour */
70int heap_stack_gap = 0; 70int heap_stack_gap = 0;
71 71
72atomic_t mmap_pages_allocated; 72atomic_long_t mmap_pages_allocated;
73 73
74EXPORT_SYMBOL(mem_map); 74EXPORT_SYMBOL(mem_map);
75EXPORT_SYMBOL(num_physpages); 75EXPORT_SYMBOL(num_physpages);
@@ -463,12 +463,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
463 */ 463 */
464void __init mmap_init(void) 464void __init mmap_init(void)
465{ 465{
466 vm_region_jar = kmem_cache_create("vm_region_jar", 466 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
467 sizeof(struct vm_region), 0,
468 SLAB_PANIC, NULL);
469 vm_area_cachep = kmem_cache_create("vm_area_struct",
470 sizeof(struct vm_area_struct), 0,
471 SLAB_PANIC, NULL);
472} 467}
473 468
474/* 469/*
@@ -486,27 +481,24 @@ static noinline void validate_nommu_regions(void)
486 return; 481 return;
487 482
488 last = rb_entry(lastp, struct vm_region, vm_rb); 483 last = rb_entry(lastp, struct vm_region, vm_rb);
489 if (unlikely(last->vm_end <= last->vm_start)) 484 BUG_ON(unlikely(last->vm_end <= last->vm_start));
490 BUG(); 485 BUG_ON(unlikely(last->vm_top < last->vm_end));
491 if (unlikely(last->vm_top < last->vm_end))
492 BUG();
493 486
494 while ((p = rb_next(lastp))) { 487 while ((p = rb_next(lastp))) {
495 region = rb_entry(p, struct vm_region, vm_rb); 488 region = rb_entry(p, struct vm_region, vm_rb);
496 last = rb_entry(lastp, struct vm_region, vm_rb); 489 last = rb_entry(lastp, struct vm_region, vm_rb);
497 490
498 if (unlikely(region->vm_end <= region->vm_start)) 491 BUG_ON(unlikely(region->vm_end <= region->vm_start));
499 BUG(); 492 BUG_ON(unlikely(region->vm_top < region->vm_end));
500 if (unlikely(region->vm_top < region->vm_end)) 493 BUG_ON(unlikely(region->vm_start < last->vm_top));
501 BUG();
502 if (unlikely(region->vm_start < last->vm_top))
503 BUG();
504 494
505 lastp = p; 495 lastp = p;
506 } 496 }
507} 497}
508#else 498#else
509#define validate_nommu_regions() do {} while(0) 499static void validate_nommu_regions(void)
500{
501}
510#endif 502#endif
511 503
512/* 504/*
@@ -563,16 +555,17 @@ static void free_page_series(unsigned long from, unsigned long to)
563 struct page *page = virt_to_page(from); 555 struct page *page = virt_to_page(from);
564 556
565 kdebug("- free %lx", from); 557 kdebug("- free %lx", from);
566 atomic_dec(&mmap_pages_allocated); 558 atomic_long_dec(&mmap_pages_allocated);
567 if (page_count(page) != 1) 559 if (page_count(page) != 1)
568 kdebug("free page %p [%d]", page, page_count(page)); 560 kdebug("free page %p: refcount not one: %d",
561 page, page_count(page));
569 put_page(page); 562 put_page(page);
570 } 563 }
571} 564}
572 565
573/* 566/*
574 * release a reference to a region 567 * release a reference to a region
575 * - the caller must hold the region semaphore, which this releases 568 * - the caller must hold the region semaphore for writing, which this releases
576 * - the region may not have been added to the tree yet, in which case vm_top 569 * - the region may not have been added to the tree yet, in which case vm_top
577 * will equal vm_start 570 * will equal vm_start
578 */ 571 */
@@ -1096,7 +1089,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1096 goto enomem; 1089 goto enomem;
1097 1090
1098 total = 1 << order; 1091 total = 1 << order;
1099 atomic_add(total, &mmap_pages_allocated); 1092 atomic_long_add(total, &mmap_pages_allocated);
1100 1093
1101 point = rlen >> PAGE_SHIFT; 1094 point = rlen >> PAGE_SHIFT;
1102 1095
@@ -1107,7 +1100,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1107 order = ilog2(total - point); 1100 order = ilog2(total - point);
1108 n = 1 << order; 1101 n = 1 << order;
1109 kdebug("shave %lu/%lu @%lu", n, total - point, total); 1102 kdebug("shave %lu/%lu @%lu", n, total - point, total);
1110 atomic_sub(n, &mmap_pages_allocated); 1103 atomic_long_sub(n, &mmap_pages_allocated);
1111 total -= n; 1104 total -= n;
1112 set_page_refcounted(pages + total); 1105 set_page_refcounted(pages + total);
1113 __free_pages(pages + total, order); 1106 __free_pages(pages + total, order);
@@ -1536,10 +1529,15 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1536 /* find the first potentially overlapping VMA */ 1529 /* find the first potentially overlapping VMA */
1537 vma = find_vma(mm, start); 1530 vma = find_vma(mm, start);
1538 if (!vma) { 1531 if (!vma) {
1539 printk(KERN_WARNING 1532 static int limit = 0;
1540 "munmap of memory not mmapped by process %d (%s):" 1533 if (limit < 5) {
1541 " 0x%lx-0x%lx\n", 1534 printk(KERN_WARNING
1542 current->pid, current->comm, start, start + len - 1); 1535 "munmap of memory not mmapped by process %d"
1536 " (%s): 0x%lx-0x%lx\n",
1537 current->pid, current->comm,
1538 start, start + len - 1);
1539 limit++;
1540 }
1543 return -EINVAL; 1541 return -EINVAL;
1544 } 1542 }
1545 1543
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 40ba05061a4f..2f3166e308d9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -55,7 +55,7 @@ static DEFINE_SPINLOCK(zone_scan_lock);
55 55
56unsigned long badness(struct task_struct *p, unsigned long uptime) 56unsigned long badness(struct task_struct *p, unsigned long uptime)
57{ 57{
58 unsigned long points, cpu_time, run_time, s; 58 unsigned long points, cpu_time, run_time;
59 struct mm_struct *mm; 59 struct mm_struct *mm;
60 struct task_struct *child; 60 struct task_struct *child;
61 61
@@ -110,12 +110,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
110 else 110 else
111 run_time = 0; 111 run_time = 0;
112 112
113 s = int_sqrt(cpu_time); 113 if (cpu_time)
114 if (s) 114 points /= int_sqrt(cpu_time);
115 points /= s; 115 if (run_time)
116 s = int_sqrt(int_sqrt(run_time)); 116 points /= int_sqrt(int_sqrt(run_time));
117 if (s)
118 points /= s;
119 117
120 /* 118 /*
121 * Niced processes are most likely less important, so double 119 * Niced processes are most likely less important, so double
@@ -396,6 +394,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
396 cpuset_print_task_mems_allowed(current); 394 cpuset_print_task_mems_allowed(current);
397 task_unlock(current); 395 task_unlock(current);
398 dump_stack(); 396 dump_stack();
397 mem_cgroup_print_oom_info(mem, current);
399 show_mem(); 398 show_mem();
400 if (sysctl_oom_dump_tasks) 399 if (sysctl_oom_dump_tasks)
401 dump_tasks(mem); 400 dump_tasks(mem);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 74dc57c74349..30351f0063ac 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -66,7 +66,7 @@ static inline long sync_writeback_pages(void)
66/* 66/*
67 * Start background writeback (via pdflush) at this percentage 67 * Start background writeback (via pdflush) at this percentage
68 */ 68 */
69int dirty_background_ratio = 5; 69int dirty_background_ratio = 10;
70 70
71/* 71/*
72 * dirty_background_bytes starts at 0 (disabled) so that it is a function of 72 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
@@ -83,7 +83,7 @@ int vm_highmem_is_dirtyable;
83/* 83/*
84 * The generator of dirty data starts writeback at this percentage 84 * The generator of dirty data starts writeback at this percentage
85 */ 85 */
86int vm_dirty_ratio = 10; 86int vm_dirty_ratio = 20;
87 87
88/* 88/*
89 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of 89 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
@@ -92,14 +92,14 @@ int vm_dirty_ratio = 10;
92unsigned long vm_dirty_bytes; 92unsigned long vm_dirty_bytes;
93 93
94/* 94/*
95 * The interval between `kupdate'-style writebacks, in jiffies 95 * The interval between `kupdate'-style writebacks
96 */ 96 */
 97int dirty_writeback_interval = 5 * HZ; 97unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
98 98
99/* 99/*
100 * The longest number of jiffies for which data is allowed to remain dirty 100 * The longest time for which data is allowed to remain dirty
101 */ 101 */
102int dirty_expire_interval = 30 * HZ; 102unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
103 103
104/* 104/*
105 * Flag that makes the machine dump writes/reads and block dirtyings. 105 * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -770,9 +770,9 @@ static void wb_kupdate(unsigned long arg)
770 770
771 sync_supers(); 771 sync_supers();
772 772
773 oldest_jif = jiffies - dirty_expire_interval; 773 oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval);
774 start_jif = jiffies; 774 start_jif = jiffies;
775 next_jif = start_jif + dirty_writeback_interval; 775 next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
776 nr_to_write = global_page_state(NR_FILE_DIRTY) + 776 nr_to_write = global_page_state(NR_FILE_DIRTY) +
777 global_page_state(NR_UNSTABLE_NFS) + 777 global_page_state(NR_UNSTABLE_NFS) +
778 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 778 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
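The "* 10" factors above are plain unit conversion: the intervals are now kept in centiseconds while msecs_to_jiffies() takes milliseconds. A trivial standalone check of the arithmetic (userspace sketch, illustrative values only):

#include <stdio.h>

int main(void)
{
        unsigned int dirty_writeback_interval = 5 * 100;        /* centiseconds */
        unsigned int msecs = dirty_writeback_interval * 10;     /* milliseconds */

        printf("%u cs = %u ms = %u s\n",
               dirty_writeback_interval, msecs, msecs / 1000);
        return 0;
}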
@@ -801,9 +801,10 @@ static void wb_kupdate(unsigned long arg)
801int dirty_writeback_centisecs_handler(ctl_table *table, int write, 801int dirty_writeback_centisecs_handler(ctl_table *table, int write,
802 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 802 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
803{ 803{
804 proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); 804 proc_dointvec(table, write, file, buffer, length, ppos);
805 if (dirty_writeback_interval) 805 if (dirty_writeback_interval)
806 mod_timer(&wb_timer, jiffies + dirty_writeback_interval); 806 mod_timer(&wb_timer, jiffies +
807 msecs_to_jiffies(dirty_writeback_interval * 10));
807 else 808 else
808 del_timer(&wb_timer); 809 del_timer(&wb_timer);
809 return 0; 810 return 0;
@@ -905,7 +906,8 @@ void __init page_writeback_init(void)
905{ 906{
906 int shift; 907 int shift;
907 908
908 mod_timer(&wb_timer, jiffies + dirty_writeback_interval); 909 mod_timer(&wb_timer,
910 jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
909 writeback_set_ratelimit(); 911 writeback_set_ratelimit();
910 register_cpu_notifier(&ratelimit_nb); 912 register_cpu_notifier(&ratelimit_nb);
911 913
@@ -1198,6 +1200,20 @@ int __set_page_dirty_no_writeback(struct page *page)
1198} 1200}
1199 1201
1200/* 1202/*
1203 * Helper function for set_page_dirty family.
1204 * NOTE: This relies on being atomic wrt interrupts.
1205 */
1206void account_page_dirtied(struct page *page, struct address_space *mapping)
1207{
1208 if (mapping_cap_account_dirty(mapping)) {
1209 __inc_zone_page_state(page, NR_FILE_DIRTY);
1210 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1211 task_dirty_inc(current);
1212 task_io_account_write(PAGE_CACHE_SIZE);
1213 }
1214}
1215
1216/*
1201 * For address_spaces which do not use buffers. Just tag the page as dirty in 1217 * For address_spaces which do not use buffers. Just tag the page as dirty in
1202 * its radix tree. 1218 * its radix tree.
1203 * 1219 *
@@ -1226,13 +1242,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1226 if (mapping2) { /* Race with truncate? */ 1242 if (mapping2) { /* Race with truncate? */
1227 BUG_ON(mapping2 != mapping); 1243 BUG_ON(mapping2 != mapping);
1228 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); 1244 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
1229 if (mapping_cap_account_dirty(mapping)) { 1245 account_page_dirtied(page, mapping);
1230 __inc_zone_page_state(page, NR_FILE_DIRTY);
1231 __inc_bdi_stat(mapping->backing_dev_info,
1232 BDI_RECLAIMABLE);
1233 task_dirty_inc(current);
1234 task_io_account_write(PAGE_CACHE_SIZE);
1235 }
1236 radix_tree_tag_set(&mapping->page_tree, 1246 radix_tree_tag_set(&mapping->page_tree,
1237 page_index(page), PAGECACHE_TAG_DIRTY); 1247 page_index(page), PAGECACHE_TAG_DIRTY);
1238 } 1248 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5c44ed49ca93..3f30189896fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -331,7 +331,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
331 for (i = 1; i < nr_pages; i++) { 331 for (i = 1; i < nr_pages; i++) {
332 struct page *p = page + i; 332 struct page *p = page + i;
333 333
334 if (unlikely(!PageTail(p) | (p->first_page != page))) { 334 if (unlikely(!PageTail(p) || (p->first_page != page))) {
335 bad_page(page); 335 bad_page(page);
336 bad++; 336 bad++;
337 } 337 }
@@ -922,13 +922,10 @@ static void drain_pages(unsigned int cpu)
922 unsigned long flags; 922 unsigned long flags;
923 struct zone *zone; 923 struct zone *zone;
924 924
925 for_each_zone(zone) { 925 for_each_populated_zone(zone) {
926 struct per_cpu_pageset *pset; 926 struct per_cpu_pageset *pset;
927 struct per_cpu_pages *pcp; 927 struct per_cpu_pages *pcp;
928 928
929 if (!populated_zone(zone))
930 continue;
931
932 pset = zone_pcp(zone, cpu); 929 pset = zone_pcp(zone, cpu);
933 930
934 pcp = &pset->pcp; 931 pcp = &pset->pcp;
@@ -1479,6 +1476,8 @@ __alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
1479 unsigned long did_some_progress; 1476 unsigned long did_some_progress;
1480 unsigned long pages_reclaimed = 0; 1477 unsigned long pages_reclaimed = 0;
1481 1478
1479 lockdep_trace_alloc(gfp_mask);
1480
1482 might_sleep_if(wait); 1481 might_sleep_if(wait);
1483 1482
1484 if (should_fail_alloc_page(gfp_mask, order)) 1483 if (should_fail_alloc_page(gfp_mask, order))
@@ -1578,12 +1577,16 @@ nofail_alloc:
1578 */ 1577 */
1579 cpuset_update_task_memory_state(); 1578 cpuset_update_task_memory_state();
1580 p->flags |= PF_MEMALLOC; 1579 p->flags |= PF_MEMALLOC;
1580
1581 lockdep_set_current_reclaim_state(gfp_mask);
1581 reclaim_state.reclaimed_slab = 0; 1582 reclaim_state.reclaimed_slab = 0;
1582 p->reclaim_state = &reclaim_state; 1583 p->reclaim_state = &reclaim_state;
1583 1584
1584 did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); 1585 did_some_progress = try_to_free_pages(zonelist, order,
1586 gfp_mask, nodemask);
1585 1587
1586 p->reclaim_state = NULL; 1588 p->reclaim_state = NULL;
1589 lockdep_clear_current_reclaim_state();
1587 p->flags &= ~PF_MEMALLOC; 1590 p->flags &= ~PF_MEMALLOC;
1588 1591
1589 cond_resched(); 1592 cond_resched();
@@ -1874,10 +1877,7 @@ void show_free_areas(void)
1874 int cpu; 1877 int cpu;
1875 struct zone *zone; 1878 struct zone *zone;
1876 1879
1877 for_each_zone(zone) { 1880 for_each_populated_zone(zone) {
1878 if (!populated_zone(zone))
1879 continue;
1880
1881 show_node(zone); 1881 show_node(zone);
1882 printk("%s per-cpu:\n", zone->name); 1882 printk("%s per-cpu:\n", zone->name);
1883 1883
@@ -1917,12 +1917,9 @@ void show_free_areas(void)
1917 global_page_state(NR_PAGETABLE), 1917 global_page_state(NR_PAGETABLE),
1918 global_page_state(NR_BOUNCE)); 1918 global_page_state(NR_BOUNCE));
1919 1919
1920 for_each_zone(zone) { 1920 for_each_populated_zone(zone) {
1921 int i; 1921 int i;
1922 1922
1923 if (!populated_zone(zone))
1924 continue;
1925
1926 show_node(zone); 1923 show_node(zone);
1927 printk("%s" 1924 printk("%s"
1928 " free:%lukB" 1925 " free:%lukB"
@@ -1962,12 +1959,9 @@ void show_free_areas(void)
1962 printk("\n"); 1959 printk("\n");
1963 } 1960 }
1964 1961
1965 for_each_zone(zone) { 1962 for_each_populated_zone(zone) {
1966 unsigned long nr[MAX_ORDER], flags, order, total = 0; 1963 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1967 1964
1968 if (!populated_zone(zone))
1969 continue;
1970
1971 show_node(zone); 1965 show_node(zone);
1972 printk("%s: ", zone->name); 1966 printk("%s: ", zone->name);
1973 1967
@@ -2779,11 +2773,7 @@ static int __cpuinit process_zones(int cpu)
2779 2773
2780 node_set_state(node, N_CPU); /* this node has a cpu */ 2774 node_set_state(node, N_CPU); /* this node has a cpu */
2781 2775
2782 for_each_zone(zone) { 2776 for_each_populated_zone(zone) {
2783
2784 if (!populated_zone(zone))
2785 continue;
2786
2787 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 2777 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
2788 GFP_KERNEL, node); 2778 GFP_KERNEL, node);
2789 if (!zone_pcp(zone, cpu)) 2779 if (!zone_pcp(zone, cpu))
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index ceecfbb143fa..791905c991df 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -285,12 +285,8 @@ struct swap_cgroup_ctrl {
285 285
286struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; 286struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
287 287
288/*
289 * This 8bytes seems big..maybe we can reduce this when we can use "id" for
290 * cgroup rather than pointer.
291 */
292struct swap_cgroup { 288struct swap_cgroup {
293 struct mem_cgroup *val; 289 unsigned short id;
294}; 290};
295#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) 291#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
296#define SC_POS_MASK (SC_PER_PAGE - 1) 292#define SC_POS_MASK (SC_PER_PAGE - 1)
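Switching the record from a mem_cgroup pointer to an unsigned short css id shrinks each swap_cgroup entry from pointer size (8 bytes on a 64-bit build) to 2 bytes, so SC_PER_PAGE grows accordingly. A quick userspace check of the space saving (assumes 4 KiB pages and a 64-bit build; the structs are stand-ins, not the kernel definitions):

#include <stdio.h>

struct swap_cgroup_old { void *val; };          /* pointer-sized record */
struct swap_cgroup_new { unsigned short id; };  /* css-id record */

int main(void)
{
        const long page_size = 4096;

        printf("entries per page: %ld -> %ld\n",
               page_size / (long)sizeof(struct swap_cgroup_old),
               page_size / (long)sizeof(struct swap_cgroup_new));
        return 0;
}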
@@ -342,10 +338,10 @@ not_enough_page:
342 * @ent: swap entry to be recorded into 338 * @ent: swap entry to be recorded into
343 * @mem: mem_cgroup to be recorded 339 * @mem: mem_cgroup to be recorded
344 * 340 *
345 * Returns old value at success, NULL at failure. 341 * Returns old value at success, 0 at failure.
346 * (Of course, old value can be NULL.) 342 * (Of course, old value can be 0.)
347 */ 343 */
348struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) 344unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
349{ 345{
350 int type = swp_type(ent); 346 int type = swp_type(ent);
351 unsigned long offset = swp_offset(ent); 347 unsigned long offset = swp_offset(ent);
@@ -354,18 +350,18 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
354 struct swap_cgroup_ctrl *ctrl; 350 struct swap_cgroup_ctrl *ctrl;
355 struct page *mappage; 351 struct page *mappage;
356 struct swap_cgroup *sc; 352 struct swap_cgroup *sc;
357 struct mem_cgroup *old; 353 unsigned short old;
358 354
359 if (!do_swap_account) 355 if (!do_swap_account)
360 return NULL; 356 return 0;
361 357
362 ctrl = &swap_cgroup_ctrl[type]; 358 ctrl = &swap_cgroup_ctrl[type];
363 359
364 mappage = ctrl->map[idx]; 360 mappage = ctrl->map[idx];
365 sc = page_address(mappage); 361 sc = page_address(mappage);
366 sc += pos; 362 sc += pos;
367 old = sc->val; 363 old = sc->id;
368 sc->val = mem; 364 sc->id = id;
369 365
370 return old; 366 return old;
371} 367}
@@ -374,9 +370,9 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
374 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry 370 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
375 * @ent: swap entry to be looked up. 371 * @ent: swap entry to be looked up.
376 * 372 *
377 * Returns pointer to mem_cgroup at success. NULL at failure. 373 * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
378 */ 374 */
379struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) 375unsigned short lookup_swap_cgroup(swp_entry_t ent)
380{ 376{
381 int type = swp_type(ent); 377 int type = swp_type(ent);
382 unsigned long offset = swp_offset(ent); 378 unsigned long offset = swp_offset(ent);
@@ -385,16 +381,16 @@ struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent)
385 struct swap_cgroup_ctrl *ctrl; 381 struct swap_cgroup_ctrl *ctrl;
386 struct page *mappage; 382 struct page *mappage;
387 struct swap_cgroup *sc; 383 struct swap_cgroup *sc;
388 struct mem_cgroup *ret; 384 unsigned short ret;
389 385
390 if (!do_swap_account) 386 if (!do_swap_account)
391 return NULL; 387 return 0;
392 388
393 ctrl = &swap_cgroup_ctrl[type]; 389 ctrl = &swap_cgroup_ctrl[type];
394 mappage = ctrl->map[idx]; 390 mappage = ctrl->map[idx];
395 sc = page_address(mappage); 391 sc = page_address(mappage);
396 sc += pos; 392 sc += pos;
397 ret = sc->val; 393 ret = sc->id;
398 return ret; 394 return ret;
399} 395}
400 396
@@ -430,13 +426,6 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
430 } 426 }
431 mutex_unlock(&swap_cgroup_mutex); 427 mutex_unlock(&swap_cgroup_mutex);
432 428
433 printk(KERN_INFO
434 "swap_cgroup: uses %ld bytes of vmalloc for pointer array space"
435 " and %ld bytes to hold mem_cgroup pointers on swap\n",
436 array_size, length * PAGE_SIZE);
437 printk(KERN_INFO
438 "swap_cgroup can be disabled by noswapaccount boot option.\n");
439
440 return 0; 429 return 0;
441nomem: 430nomem:
442 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); 431 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 15de509b68fd..118905e3d788 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -191,7 +191,7 @@ static int pdflush(void *dummy)
191 191
192 /* 192 /*
193 * Some configs put our parent kthread in a limited cpuset, 193 * Some configs put our parent kthread in a limited cpuset,
194 * which kthread() overrides, forcing cpus_allowed == CPU_MASK_ALL. 194 * which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
195 * Our needs are more modest - cut back to our cpusets cpus_allowed. 195 * Our needs are more modest - cut back to our cpusets cpus_allowed.
196 * This is needed as pdflush's are dynamically created and destroyed. 196 * This is needed as pdflush's are dynamically created and destroyed.
197 * The boottime pdflush's are easily placed w/o these 2 lines. 197 * The boottime pdflush's are easily placed w/o these 2 lines.
diff --git a/mm/percpu.c b/mm/percpu.c
new file mode 100644
index 000000000000..1aa5d8fbca12
--- /dev/null
+++ b/mm/percpu.c
@@ -0,0 +1,1326 @@
1/*
2 * linux/mm/percpu.c - percpu memory allocator
3 *
4 * Copyright (C) 2009 SUSE Linux Products GmbH
5 * Copyright (C) 2009 Tejun Heo <tj@kernel.org>
6 *
7 * This file is released under the GPLv2.
8 *
 9 * This is the percpu allocator, which can handle both static and dynamic
 10 * areas. Percpu areas are allocated in chunks in the vmalloc area. Each
 11 * chunk consists of num_possible_cpus() units and the first chunk
 12 * is used for static percpu variables in the kernel image (special
 13 * boot time alloc/init handling is necessary as these areas need to be
 14 * brought up before allocation services are running). Units grow as
 15 * necessary and all units grow or shrink in unison. When a chunk is
 16 * filled up, another chunk is allocated; in the vmalloc area this looks like
17 *
18 * c0 c1 c2
19 * ------------------- ------------------- ------------
20 * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u
21 * ------------------- ...... ------------------- .... ------------
22 *
 23 * Allocation is done in offset-size areas of single unit space; i.e.,
24 * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
25 * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
26 * percpu base registers UNIT_SIZE apart.
27 *
 28 * There are usually many small percpu allocations, many of them as
29 * small as 4 bytes. The allocator organizes chunks into lists
30 * according to free size and tries to allocate from the fullest one.
31 * Each chunk keeps the maximum contiguous area size hint which is
 32 * guaranteed to be equal to or larger than the maximum contiguous
33 * area in the chunk. This helps the allocator not to iterate the
34 * chunk maps unnecessarily.
35 *
36 * Allocation state in each chunk is kept using an array of integers
37 * on chunk->map. A positive value in the map represents a free
38 * region and negative allocated. Allocation inside a chunk is done
39 * by scanning this map sequentially and serving the first matching
40 * entry. This is mostly copied from the percpu_modalloc() allocator.
41 * Chunks are also linked into a rb tree to ease address to chunk
42 * mapping during free.
43 *
 44 * To use this allocator, arch code should do the following:
45 *
46 * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
47 *
48 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
49 * regular address to percpu pointer and back if they need to be
50 * different from the default
51 *
52 * - use pcpu_setup_first_chunk() during percpu area initialization to
53 * setup the first chunk containing the kernel static percpu area
54 */
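The positive-free / negative-used map encoding described above can be tried out in isolation. The following standalone userspace sketch (not kernel code, illustrative sizes for a 4 KiB unit) walks such a map and prints the regions:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        /* illustrative unit layout: 512 used, 1536 free, 1024 used, 1024 free */
        int map[] = { -512, 1536, -1024, 1024 };
        int nr = sizeof(map) / sizeof(map[0]);
        int off = 0, i;

        for (i = 0; i < nr; i++) {
                printf("%4d..%4d  %s (%d bytes)\n", off, off + abs(map[i]) - 1,
                       map[i] > 0 ? "free" : "used", abs(map[i]));
                off += abs(map[i]);
        }
        return 0;
}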
55
56#include <linux/bitmap.h>
57#include <linux/bootmem.h>
58#include <linux/list.h>
59#include <linux/mm.h>
60#include <linux/module.h>
61#include <linux/mutex.h>
62#include <linux/percpu.h>
63#include <linux/pfn.h>
64#include <linux/rbtree.h>
65#include <linux/slab.h>
66#include <linux/spinlock.h>
67#include <linux/vmalloc.h>
68#include <linux/workqueue.h>
69
70#include <asm/cacheflush.h>
71#include <asm/sections.h>
72#include <asm/tlbflush.h>
73
74#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
75#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
76
77/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
78#ifndef __addr_to_pcpu_ptr
79#define __addr_to_pcpu_ptr(addr) \
80 (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \
81 + (unsigned long)__per_cpu_start)
82#endif
83#ifndef __pcpu_ptr_to_addr
84#define __pcpu_ptr_to_addr(ptr) \
85 (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \
86 - (unsigned long)__per_cpu_start)
87#endif
88
89struct pcpu_chunk {
90 struct list_head list; /* linked to pcpu_slot lists */
91 struct rb_node rb_node; /* key is chunk->vm->addr */
92 int free_size; /* free bytes in the chunk */
93 int contig_hint; /* max contiguous size hint */
94 struct vm_struct *vm; /* mapped vmalloc region */
95 int map_used; /* # of map entries used */
96 int map_alloc; /* # of map entries allocated */
97 int *map; /* allocation map */
98 bool immutable; /* no [de]population allowed */
99 struct page **page; /* points to page array */
100 struct page *page_ar[]; /* #cpus * UNIT_PAGES */
101};
102
103static int pcpu_unit_pages __read_mostly;
104static int pcpu_unit_size __read_mostly;
105static int pcpu_chunk_size __read_mostly;
106static int pcpu_nr_slots __read_mostly;
107static size_t pcpu_chunk_struct_size __read_mostly;
108
109/* the address of the first chunk which starts with the kernel static area */
110void *pcpu_base_addr __read_mostly;
111EXPORT_SYMBOL_GPL(pcpu_base_addr);
112
113/* optional reserved chunk, only accessible for reserved allocations */
114static struct pcpu_chunk *pcpu_reserved_chunk;
115/* offset limit of the reserved chunk */
116static int pcpu_reserved_chunk_limit;
117
118/*
119 * Synchronization rules.
120 *
121 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
122 * protects allocation/reclaim paths, chunks and chunk->page arrays.
123 * The latter is a spinlock and protects the index data structures -
124 * chunk slots, rbtree, chunks and area maps in chunks.
125 *
126 * During allocation, pcpu_alloc_mutex is kept locked all the time and
127 * pcpu_lock is grabbed and released as necessary. All actual memory
128 * allocations are done using GFP_KERNEL with pcpu_lock released.
129 *
130 * Free path accesses and alters only the index data structures, so it
131 * can be safely called from atomic context. When memory needs to be
132 * returned to the system, free path schedules reclaim_work which
133 * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
 134 * reclaimed, releases both locks and frees the chunks. Note that it's
135 * necessary to grab both locks to remove a chunk from circulation as
136 * allocation path might be referencing the chunk with only
137 * pcpu_alloc_mutex locked.
138 */
139static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
140static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
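As a rough userspace analogy of the two-lock scheme just described (illustrative only, not kernel code; pcpu_lock is a spinlock but is modelled with a second pthread mutex here): the outer mutex stays held across the whole allocation path while the index lock is dropped around anything that may block.

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t alloc_mutex = PTHREAD_MUTEX_INITIALIZER; /* ~ pcpu_alloc_mutex */
static pthread_mutex_t index_lock = PTHREAD_MUTEX_INITIALIZER;  /* ~ pcpu_lock */

static void *toy_alloc(size_t size)
{
        void *backing;

        pthread_mutex_lock(&alloc_mutex);       /* held for the whole allocation path */
        pthread_mutex_lock(&index_lock);        /* protects the index data structures */

        /* ... scan chunk slots and area maps under index_lock ... */

        pthread_mutex_unlock(&index_lock);      /* drop before a blocking allocation */
        backing = malloc(size);                 /* stands in for a GFP_KERNEL allocation */
        pthread_mutex_lock(&index_lock);        /* reacquire to publish the result */

        /* ... record the new area in the index ... */

        pthread_mutex_unlock(&index_lock);
        pthread_mutex_unlock(&alloc_mutex);
        return backing;
}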
141
142static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
143static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
144
145/* reclaim work to release fully free chunks, scheduled from free path */
146static void pcpu_reclaim(struct work_struct *work);
147static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
148
149static int __pcpu_size_to_slot(int size)
150{
151 int highbit = fls(size); /* size is in bytes */
152 return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
153}
154
155static int pcpu_size_to_slot(int size)
156{
157 if (size == pcpu_unit_size)
158 return pcpu_nr_slots - 1;
159 return __pcpu_size_to_slot(size);
160}
161
162static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
163{
164 if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
165 return 0;
166
167 return pcpu_size_to_slot(chunk->free_size);
168}
169
170static int pcpu_page_idx(unsigned int cpu, int page_idx)
171{
172 return cpu * pcpu_unit_pages + page_idx;
173}
174
175static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
176 unsigned int cpu, int page_idx)
177{
178 return &chunk->page[pcpu_page_idx(cpu, page_idx)];
179}
180
181static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
182 unsigned int cpu, int page_idx)
183{
184 return (unsigned long)chunk->vm->addr +
185 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
186}
187
188static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
189 int page_idx)
190{
191 return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
192}
193
194/**
195 * pcpu_mem_alloc - allocate memory
196 * @size: bytes to allocate
197 *
198 * Allocate @size bytes. If @size is smaller than PAGE_SIZE,
199 * kzalloc() is used; otherwise, vmalloc() is used. The returned
200 * memory is always zeroed.
201 *
202 * CONTEXT:
203 * Does GFP_KERNEL allocation.
204 *
205 * RETURNS:
206 * Pointer to the allocated area on success, NULL on failure.
207 */
208static void *pcpu_mem_alloc(size_t size)
209{
210 if (size <= PAGE_SIZE)
211 return kzalloc(size, GFP_KERNEL);
212 else {
213 void *ptr = vmalloc(size);
214 if (ptr)
215 memset(ptr, 0, size);
216 return ptr;
217 }
218}
219
220/**
221 * pcpu_mem_free - free memory
222 * @ptr: memory to free
223 * @size: size of the area
224 *
225 * Free @ptr. @ptr should have been allocated using pcpu_mem_alloc().
226 */
227static void pcpu_mem_free(void *ptr, size_t size)
228{
229 if (size <= PAGE_SIZE)
230 kfree(ptr);
231 else
232 vfree(ptr);
233}
234
235/**
236 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
237 * @chunk: chunk of interest
238 * @oslot: the previous slot it was on
239 *
240 * This function is called after an allocation or free changed @chunk.
241 * New slot according to the changed state is determined and @chunk is
242 * moved to the slot. Note that the reserved chunk is never put on
243 * chunk slots.
244 *
245 * CONTEXT:
246 * pcpu_lock.
247 */
248static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
249{
250 int nslot = pcpu_chunk_slot(chunk);
251
252 if (chunk != pcpu_reserved_chunk && oslot != nslot) {
253 if (oslot < nslot)
254 list_move(&chunk->list, &pcpu_slot[nslot]);
255 else
256 list_move_tail(&chunk->list, &pcpu_slot[nslot]);
257 }
258}
259
260static struct rb_node **pcpu_chunk_rb_search(void *addr,
261 struct rb_node **parentp)
262{
263 struct rb_node **p = &pcpu_addr_root.rb_node;
264 struct rb_node *parent = NULL;
265 struct pcpu_chunk *chunk;
266
267 while (*p) {
268 parent = *p;
269 chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
270
271 if (addr < chunk->vm->addr)
272 p = &(*p)->rb_left;
273 else if (addr > chunk->vm->addr)
274 p = &(*p)->rb_right;
275 else
276 break;
277 }
278
279 if (parentp)
280 *parentp = parent;
281 return p;
282}
283
284/**
285 * pcpu_chunk_addr_search - search for chunk containing specified address
286 * @addr: address to search for
287 *
288 * Look for chunk which might contain @addr. More specifically, it
 289 * searches for the chunk with the highest start address which isn't
290 * beyond @addr.
291 *
292 * CONTEXT:
293 * pcpu_lock.
294 *
295 * RETURNS:
296 * The address of the found chunk.
297 */
298static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
299{
300 struct rb_node *n, *parent;
301 struct pcpu_chunk *chunk;
302
303 /* is it in the reserved chunk? */
304 if (pcpu_reserved_chunk) {
305 void *start = pcpu_reserved_chunk->vm->addr;
306
307 if (addr >= start && addr < start + pcpu_reserved_chunk_limit)
308 return pcpu_reserved_chunk;
309 }
310
311 /* nah... search the regular ones */
312 n = *pcpu_chunk_rb_search(addr, &parent);
313 if (!n) {
314 /* no exactly matching chunk, the parent is the closest */
315 n = parent;
316 BUG_ON(!n);
317 }
318 chunk = rb_entry(n, struct pcpu_chunk, rb_node);
319
320 if (addr < chunk->vm->addr) {
321 /* the parent was the next one, look for the previous one */
322 n = rb_prev(n);
323 BUG_ON(!n);
324 chunk = rb_entry(n, struct pcpu_chunk, rb_node);
325 }
326
327 return chunk;
328}
329
330/**
331 * pcpu_chunk_addr_insert - insert chunk into address rb tree
332 * @new: chunk to insert
333 *
334 * Insert @new into address rb tree.
335 *
336 * CONTEXT:
337 * pcpu_lock.
338 */
339static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
340{
341 struct rb_node **p, *parent;
342
343 p = pcpu_chunk_rb_search(new->vm->addr, &parent);
344 BUG_ON(*p);
345 rb_link_node(&new->rb_node, parent, p);
346 rb_insert_color(&new->rb_node, &pcpu_addr_root);
347}
348
349/**
350 * pcpu_extend_area_map - extend area map for allocation
351 * @chunk: target chunk
352 *
 353 * Extend area map of @chunk so that it can accommodate an allocation.
354 * A single allocation can split an area into three areas, so this
355 * function makes sure that @chunk->map has at least two extra slots.
356 *
357 * CONTEXT:
358 * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired
359 * if area map is extended.
360 *
361 * RETURNS:
362 * 0 if noop, 1 if successfully extended, -errno on failure.
363 */
364static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
365{
366 int new_alloc;
367 int *new;
368 size_t size;
369
370 /* has enough? */
371 if (chunk->map_alloc >= chunk->map_used + 2)
372 return 0;
373
374 spin_unlock_irq(&pcpu_lock);
375
376 new_alloc = PCPU_DFL_MAP_ALLOC;
377 while (new_alloc < chunk->map_used + 2)
378 new_alloc *= 2;
379
380 new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
381 if (!new) {
382 spin_lock_irq(&pcpu_lock);
383 return -ENOMEM;
384 }
385
386 /*
387 * Acquire pcpu_lock and switch to new area map. Only free
388 * could have happened in between, so map_used couldn't have
389 * grown.
390 */
391 spin_lock_irq(&pcpu_lock);
392 BUG_ON(new_alloc < chunk->map_used + 2);
393
394 size = chunk->map_alloc * sizeof(chunk->map[0]);
395 memcpy(new, chunk->map, size);
396
397 /*
398 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
399 * one of the first chunks and still using static map.
400 */
401 if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
402 pcpu_mem_free(chunk->map, size);
403
404 chunk->map_alloc = new_alloc;
405 chunk->map = new;
406	return 1;	/* map extended; pcpu_lock was dropped meanwhile */
407}
408
409/**
410 * pcpu_split_block - split a map block
411 * @chunk: chunk of interest
412 * @i: index of map block to split
413 * @head: head size in bytes (can be 0)
414 * @tail: tail size in bytes (can be 0)
415 *
416 * Split the @i'th map block into two or three blocks. If @head is
417 * non-zero, a block of @head bytes is inserted before block @i, moving it
418 * to @i+1 and reducing its size by @head bytes.
419 *
420 * If @tail is non-zero, the target block, which can be @i or @i+1
421 * depending on @head, is reduced by @tail bytes and a block of @tail
422 * bytes is inserted after the target block.
423 *
424 * @chunk->map must have enough free slots to accommodate the split.
425 *
426 * CONTEXT:
427 * pcpu_lock.
428 */
429static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
430 int head, int tail)
431{
432 int nr_extra = !!head + !!tail;
433
434 BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);
435
436 /* insert new subblocks */
437 memmove(&chunk->map[i + nr_extra], &chunk->map[i],
438 sizeof(chunk->map[0]) * (chunk->map_used - i));
439 chunk->map_used += nr_extra;
440
441 if (head) {
442 chunk->map[i + 1] = chunk->map[i] - head;
443 chunk->map[i++] = head;
444 }
445 if (tail) {
446 chunk->map[i++] -= tail;
447 chunk->map[i] = tail;
448 }
449}
450
451/**
452 * pcpu_alloc_area - allocate area from a pcpu_chunk
453 * @chunk: chunk of interest
454 * @size: wanted size in bytes
455 * @align: wanted align
456 *
457 * Try to allocate @size bytes area aligned at @align from @chunk.
458 * Note that this function only allocates the offset. It doesn't
459 * populate or map the area.
460 *
461 * @chunk->map must have at least two free slots.
462 *
463 * CONTEXT:
464 * pcpu_lock.
465 *
466 * RETURNS:
467 * Allocated offset in @chunk on success, -1 if no matching area is
468 * found.
469 */
470static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
471{
472 int oslot = pcpu_chunk_slot(chunk);
473 int max_contig = 0;
474 int i, off;
475
476 for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
477 bool is_last = i + 1 == chunk->map_used;
478 int head, tail;
479
480 /* extra for alignment requirement */
481 head = ALIGN(off, align) - off;
482 BUG_ON(i == 0 && head != 0);
483
484 if (chunk->map[i] < 0)
485 continue;
486 if (chunk->map[i] < head + size) {
487 max_contig = max(chunk->map[i], max_contig);
488 continue;
489 }
490
491 /*
492 * If head is small or the previous block is free,
493 * merge'em. Note that 'small' is defined as smaller
494 * than sizeof(int), which is very small but isn't too
495 * uncommon for percpu allocations.
496 */
497 if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
498 if (chunk->map[i - 1] > 0)
499 chunk->map[i - 1] += head;
500 else {
501 chunk->map[i - 1] -= head;
502 chunk->free_size -= head;
503 }
504 chunk->map[i] -= head;
505 off += head;
506 head = 0;
507 }
508
509 /* if tail is small, just keep it around */
510 tail = chunk->map[i] - head - size;
511 if (tail < sizeof(int))
512 tail = 0;
513
514 /* split if warranted */
515 if (head || tail) {
516 pcpu_split_block(chunk, i, head, tail);
517 if (head) {
518 i++;
519 off += head;
520 max_contig = max(chunk->map[i - 1], max_contig);
521 }
522 if (tail)
523 max_contig = max(chunk->map[i + 1], max_contig);
524 }
525
526 /* update hint and mark allocated */
527 if (is_last)
528 chunk->contig_hint = max_contig; /* fully scanned */
529 else
530 chunk->contig_hint = max(chunk->contig_hint,
531 max_contig);
532
533 chunk->free_size -= chunk->map[i];
534 chunk->map[i] = -chunk->map[i];
535
536 pcpu_chunk_relocate(chunk, oslot);
537 return off;
538 }
539
540 chunk->contig_hint = max_contig; /* fully scanned */
541 pcpu_chunk_relocate(chunk, oslot);
542
543 /* tell the upper layer that this chunk has no matching area */
544 return -1;
545}
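/*
 * Editorial illustration, not part of the original file: chunk->map[]
 * records area sizes in bytes, positive for free areas and negative
 * for allocated ones.  Assuming a chunk whose map is { -64, 4096 }
 * (a 64-byte allocation followed by 4096 free bytes), a call to
 * pcpu_alloc_area(chunk, 64, 128) would leave
 *
 *	{ -64, 64, -64, 3968 },		map_used == 4
 *
 * i.e. a 64-byte free head is split off by pcpu_split_block() to
 * satisfy the 128-byte alignment, the allocation itself is flipped
 * negative, and the remaining 3968 bytes stay free.  The returned
 * offset is 128.
 */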
546
547/**
548 * pcpu_free_area - free area to a pcpu_chunk
549 * @chunk: chunk of interest
550 * @freeme: offset of area to free
551 *
552 * Free the area starting at @freeme back to @chunk. Note that this function
553 * only modifies the allocation map. It doesn't depopulate or unmap
554 * the area.
555 *
556 * CONTEXT:
557 * pcpu_lock.
558 */
559static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
560{
561 int oslot = pcpu_chunk_slot(chunk);
562 int i, off;
563
564 for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
565 if (off == freeme)
566 break;
567 BUG_ON(off != freeme);
568 BUG_ON(chunk->map[i] > 0);
569
570 chunk->map[i] = -chunk->map[i];
571 chunk->free_size += chunk->map[i];
572
573 /* merge with previous? */
574 if (i > 0 && chunk->map[i - 1] >= 0) {
575 chunk->map[i - 1] += chunk->map[i];
576 chunk->map_used--;
577 memmove(&chunk->map[i], &chunk->map[i + 1],
578 (chunk->map_used - i) * sizeof(chunk->map[0]));
579 i--;
580 }
581 /* merge with next? */
582 if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
583 chunk->map[i] += chunk->map[i + 1];
584 chunk->map_used--;
585 memmove(&chunk->map[i + 1], &chunk->map[i + 2],
586 (chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
587 }
588
589 chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
590 pcpu_chunk_relocate(chunk, oslot);
591}
592
593/**
594 * pcpu_unmap - unmap pages out of a pcpu_chunk
595 * @chunk: chunk of interest
596 * @page_start: page index of the first page to unmap
597 * @page_end: page index of the last page to unmap + 1
598 * @flush: whether to flush cache and tlb or not
599 *
600 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
601 * If @flush is true, vcache is flushed before unmapping and tlb
602 * after.
603 */
604static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
605 bool flush)
606{
607 unsigned int last = num_possible_cpus() - 1;
608 unsigned int cpu;
609
610 /* unmap must not be done on immutable chunk */
611 WARN_ON(chunk->immutable);
612
613 /*
614 * Each flushing trial can be very expensive; issue the flush on
615 * the whole region at once rather than doing it for each cpu.
616 * This could be overkill but is more scalable.
617 */
618 if (flush)
619 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
620 pcpu_chunk_addr(chunk, last, page_end));
621
622 for_each_possible_cpu(cpu)
623 unmap_kernel_range_noflush(
624 pcpu_chunk_addr(chunk, cpu, page_start),
625 (page_end - page_start) << PAGE_SHIFT);
626
627 /* ditto as flush_cache_vunmap() */
628 if (flush)
629 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
630 pcpu_chunk_addr(chunk, last, page_end));
631}
632
633/**
634 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
635 * @chunk: chunk to depopulate
636 * @off: offset to the area to depopulate
637 * @size: size of the area to depopulate in bytes
638 * @flush: whether to flush cache and tlb or not
639 *
640 * For each cpu, depopulate and unmap the pages covering [@off, @off +
641 * @size) from @chunk. If @flush is true, the vcache is flushed before
642 * unmapping and the tlb after.
643 *
644 * CONTEXT:
645 * pcpu_alloc_mutex.
646 */
647static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
648 bool flush)
649{
650 int page_start = PFN_DOWN(off);
651 int page_end = PFN_UP(off + size);
652 int unmap_start = -1;
653 int uninitialized_var(unmap_end);
654 unsigned int cpu;
655 int i;
656
657 for (i = page_start; i < page_end; i++) {
658 for_each_possible_cpu(cpu) {
659 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
660
661 if (!*pagep)
662 continue;
663
664 __free_page(*pagep);
665
666 /*
667 * If it's partial depopulation, it might get
668 * populated or depopulated again. Mark the
669 * page gone.
670 */
671 *pagep = NULL;
672
673 unmap_start = unmap_start < 0 ? i : unmap_start;
674 unmap_end = i + 1;
675 }
676 }
677
678 if (unmap_start >= 0)
679 pcpu_unmap(chunk, unmap_start, unmap_end, flush);
680}
681
682/**
683 * pcpu_map - map pages into a pcpu_chunk
684 * @chunk: chunk of interest
685 * @page_start: page index of the first page to map
686 * @page_end: page index of the last page to map + 1
687 *
688 * For each cpu, map pages [@page_start,@page_end) into @chunk.
689 * vcache is flushed afterwards.
690 */
691static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
692{
693 unsigned int last = num_possible_cpus() - 1;
694 unsigned int cpu;
695 int err;
696
697 /* map must not be done on immutable chunk */
698 WARN_ON(chunk->immutable);
699
700 for_each_possible_cpu(cpu) {
701 err = map_kernel_range_noflush(
702 pcpu_chunk_addr(chunk, cpu, page_start),
703 (page_end - page_start) << PAGE_SHIFT,
704 PAGE_KERNEL,
705 pcpu_chunk_pagep(chunk, cpu, page_start));
706 if (err < 0)
707 return err;
708 }
709
710 /* flush at once, please read comments in pcpu_unmap() */
711 flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
712 pcpu_chunk_addr(chunk, last, page_end));
713 return 0;
714}
715
716/**
717 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
718 * @chunk: chunk of interest
719 * @off: offset to the area to populate
720 * @size: size of the area to populate in bytes
721 *
722 * For each cpu, populate and map the pages covering [@off, @off + @size)
723 * into @chunk. The area is cleared on return.
724 *
725 * CONTEXT:
726 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
727 */
728static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
729{
730 const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
731 int page_start = PFN_DOWN(off);
732 int page_end = PFN_UP(off + size);
733 int map_start = -1;
734 int uninitialized_var(map_end);
735 unsigned int cpu;
736 int i;
737
738 for (i = page_start; i < page_end; i++) {
739 if (pcpu_chunk_page_occupied(chunk, i)) {
740 if (map_start >= 0) {
741 if (pcpu_map(chunk, map_start, map_end))
742 goto err;
743 map_start = -1;
744 }
745 continue;
746 }
747
748 map_start = map_start < 0 ? i : map_start;
749 map_end = i + 1;
750
751 for_each_possible_cpu(cpu) {
752 struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
753
754 *pagep = alloc_pages_node(cpu_to_node(cpu),
755 alloc_mask, 0);
756 if (!*pagep)
757 goto err;
758 }
759 }
760
761 if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
762 goto err;
763
764 for_each_possible_cpu(cpu)
765 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
766 size);
767
768 return 0;
769err:
770 /* likely under heavy memory pressure, give memory back */
771 pcpu_depopulate_chunk(chunk, off, size, true);
772 return -ENOMEM;
773}
774
775static void free_pcpu_chunk(struct pcpu_chunk *chunk)
776{
777 if (!chunk)
778 return;
779 if (chunk->vm)
780 free_vm_area(chunk->vm);
781 pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
782 kfree(chunk);
783}
784
785static struct pcpu_chunk *alloc_pcpu_chunk(void)
786{
787 struct pcpu_chunk *chunk;
788
789 chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
790 if (!chunk)
791 return NULL;
792
793	chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
	if (!chunk->map) {		/* editorial fix: pcpu_mem_alloc() can fail */
		kfree(chunk);
		return NULL;
	}
794 chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
795 chunk->map[chunk->map_used++] = pcpu_unit_size;
796 chunk->page = chunk->page_ar;
797
798 chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
799 if (!chunk->vm) {
800 free_pcpu_chunk(chunk);
801 return NULL;
802 }
803
804 INIT_LIST_HEAD(&chunk->list);
805 chunk->free_size = pcpu_unit_size;
806 chunk->contig_hint = pcpu_unit_size;
807
808 return chunk;
809}
810
811/**
812 * pcpu_alloc - the percpu allocator
813 * @size: size of area to allocate in bytes
814 * @align: alignment of area (max PAGE_SIZE)
815 * @reserved: allocate from the reserved chunk if available
816 *
817 * Allocate percpu area of @size bytes aligned at @align.
818 *
819 * CONTEXT:
820 * Does GFP_KERNEL allocation.
821 *
822 * RETURNS:
823 * Percpu pointer to the allocated area on success, NULL on failure.
824 */
825static void *pcpu_alloc(size_t size, size_t align, bool reserved)
826{
827 struct pcpu_chunk *chunk;
828 int slot, off;
829
830 if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
831 WARN(true, "illegal size (%zu) or align (%zu) for "
832 "percpu allocation\n", size, align);
833 return NULL;
834 }
835
836 mutex_lock(&pcpu_alloc_mutex);
837 spin_lock_irq(&pcpu_lock);
838
839 /* serve reserved allocations from the reserved chunk if available */
840 if (reserved && pcpu_reserved_chunk) {
841 chunk = pcpu_reserved_chunk;
842 if (size > chunk->contig_hint ||
843 pcpu_extend_area_map(chunk) < 0)
844 goto fail_unlock;
845 off = pcpu_alloc_area(chunk, size, align);
846 if (off >= 0)
847 goto area_found;
848 goto fail_unlock;
849 }
850
851restart:
852 /* search through normal chunks */
853 for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
854 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
855 if (size > chunk->contig_hint)
856 continue;
857
858 switch (pcpu_extend_area_map(chunk)) {
859 case 0:
860 break;
861 case 1:
862 goto restart; /* pcpu_lock dropped, restart */
863 default:
864 goto fail_unlock;
865 }
866
867 off = pcpu_alloc_area(chunk, size, align);
868 if (off >= 0)
869 goto area_found;
870 }
871 }
872
873 /* hmmm... no space left, create a new chunk */
874 spin_unlock_irq(&pcpu_lock);
875
876 chunk = alloc_pcpu_chunk();
877 if (!chunk)
878 goto fail_unlock_mutex;
879
880 spin_lock_irq(&pcpu_lock);
881 pcpu_chunk_relocate(chunk, -1);
882 pcpu_chunk_addr_insert(chunk);
883 goto restart;
884
885area_found:
886 spin_unlock_irq(&pcpu_lock);
887
888 /* populate, map and clear the area */
889 if (pcpu_populate_chunk(chunk, off, size)) {
890 spin_lock_irq(&pcpu_lock);
891 pcpu_free_area(chunk, off);
892 goto fail_unlock;
893 }
894
895 mutex_unlock(&pcpu_alloc_mutex);
896
897 return __addr_to_pcpu_ptr(chunk->vm->addr + off);
898
899fail_unlock:
900 spin_unlock_irq(&pcpu_lock);
901fail_unlock_mutex:
902 mutex_unlock(&pcpu_alloc_mutex);
903 return NULL;
904}
905
906/**
907 * __alloc_percpu - allocate dynamic percpu area
908 * @size: size of area to allocate in bytes
909 * @align: alignment of area (max PAGE_SIZE)
910 *
911 * Allocate percpu area of @size bytes aligned at @align. Might
912 * sleep. Might trigger writeouts.
913 *
914 * CONTEXT:
915 * Does GFP_KERNEL allocation.
916 *
917 * RETURNS:
918 * Percpu pointer to the allocated area on success, NULL on failure.
919 */
920void *__alloc_percpu(size_t size, size_t align)
921{
922 return pcpu_alloc(size, align, false);
923}
924EXPORT_SYMBOL_GPL(__alloc_percpu);
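/*
 * Editorial sketch, not part of the original file: a typical caller of
 * the dynamic percpu API above.  struct foo_stats and the foo_* names
 * are hypothetical; the usual <linux/percpu.h>, <linux/init.h> and
 * <linux/errno.h> includes are assumed.
 */
#if 0	/* illustration only, never compiled */
struct foo_stats {
	unsigned long events;
};

static struct foo_stats *foo_stats;	/* percpu pointer */

static int __init foo_init(void)
{
	/* the returned area is already cleared by pcpu_populate_chunk() */
	foo_stats = __alloc_percpu(sizeof(struct foo_stats),
				   __alignof__(struct foo_stats));
	if (!foo_stats)
		return -ENOMEM;
	return 0;
}

static void foo_count_event(void)
{
	/* access this cpu's copy with preemption disabled */
	per_cpu_ptr(foo_stats, get_cpu())->events++;
	put_cpu();
}

static void __exit foo_exit(void)
{
	free_percpu(foo_stats);
}
#endif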
925
926/**
927 * __alloc_reserved_percpu - allocate reserved percpu area
928 * @size: size of area to allocate in bytes
929 * @align: alignment of area (max PAGE_SIZE)
930 *
931 * Allocate percpu area of @size bytes aligned at @align from reserved
932 * percpu area if arch has set it up; otherwise, allocation is served
933 * from the same dynamic area. Might sleep. Might trigger writeouts.
934 *
935 * CONTEXT:
936 * Does GFP_KERNEL allocation.
937 *
938 * RETURNS:
939 * Percpu pointer to the allocated area on success, NULL on failure.
940 */
941void *__alloc_reserved_percpu(size_t size, size_t align)
942{
943 return pcpu_alloc(size, align, true);
944}
945
946/**
947 * pcpu_reclaim - reclaim fully free chunks, workqueue function
948 * @work: unused
949 *
950 * Reclaim all fully free chunks except for the first one.
951 *
952 * CONTEXT:
953 * workqueue context.
954 */
955static void pcpu_reclaim(struct work_struct *work)
956{
957 LIST_HEAD(todo);
958 struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
959 struct pcpu_chunk *chunk, *next;
960
961 mutex_lock(&pcpu_alloc_mutex);
962 spin_lock_irq(&pcpu_lock);
963
964 list_for_each_entry_safe(chunk, next, head, list) {
965 WARN_ON(chunk->immutable);
966
967 /* spare the first one */
968 if (chunk == list_first_entry(head, struct pcpu_chunk, list))
969 continue;
970
971 rb_erase(&chunk->rb_node, &pcpu_addr_root);
972 list_move(&chunk->list, &todo);
973 }
974
975 spin_unlock_irq(&pcpu_lock);
976 mutex_unlock(&pcpu_alloc_mutex);
977
978 list_for_each_entry_safe(chunk, next, &todo, list) {
979 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
980 free_pcpu_chunk(chunk);
981 }
982}
983
984/**
985 * free_percpu - free percpu area
986 * @ptr: pointer to area to free
987 *
988 * Free percpu area @ptr.
989 *
990 * CONTEXT:
991 * Can be called from atomic context.
992 */
993void free_percpu(void *ptr)
994{
995 void *addr = __pcpu_ptr_to_addr(ptr);
996 struct pcpu_chunk *chunk;
997 unsigned long flags;
998 int off;
999
1000 if (!ptr)
1001 return;
1002
1003 spin_lock_irqsave(&pcpu_lock, flags);
1004
1005 chunk = pcpu_chunk_addr_search(addr);
1006 off = addr - chunk->vm->addr;
1007
1008 pcpu_free_area(chunk, off);
1009
1010	/* if there is more than one fully free chunk, wake up the grim reaper */
1011 if (chunk->free_size == pcpu_unit_size) {
1012 struct pcpu_chunk *pos;
1013
1014 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
1015 if (pos != chunk) {
1016 schedule_work(&pcpu_reclaim_work);
1017 break;
1018 }
1019 }
1020
1021 spin_unlock_irqrestore(&pcpu_lock, flags);
1022}
1023EXPORT_SYMBOL_GPL(free_percpu);
1024
1025/**
1026 * pcpu_setup_first_chunk - initialize the first percpu chunk
1027 * @get_page_fn: callback to fetch page pointer
1028 * @static_size: the size of static percpu area in bytes
1029 * @reserved_size: the size of reserved percpu area in bytes
1030 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
1031 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
1032 * @base_addr: mapped address, NULL for auto
1033 * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
1034 *
1035 * Initialize the first percpu chunk which contains the kernel static
1036 * percpu area. This function is to be called from the arch percpu area
1037 * setup path. The first two parameters are mandatory. The rest are
1038 * optional.
1039 *
1040 * @get_page_fn() should return pointer to percpu page given cpu
1041 * number and page number. It should at least return enough pages to
1042 * cover the static area. The returned pages for static area should
1043 * have been initialized with valid data. If @unit_size is specified,
1044 * it can also return pages after the static area. NULL return
1045 * indicates end of pages for the cpu. Note that @get_page_fn() must
1046 * return the same number of pages for all cpus.
1047 *
1048 * @reserved_size, if non-zero, specifies the amount of bytes to
1049 * reserve after the static area in the first chunk. The reserved
1050 * area in the first chunk is then available only through reserved
1051 * percpu allocation. This is primarily used to serve module percpu
1052 * static areas on architectures where the addressing model has
1053 * limited offset range for symbol relocations to guarantee module
1054 * percpu symbols fall inside the relocatable range.
1055 *
1056 * @dyn_size, if non-negative, determines the number of bytes
1057 * available for dynamic allocation in the first chunk. Specifying a
1058 * non-negative value makes percpu leave alone the area beyond
1059 * @static_size + @reserved_size + @dyn_size.
1060 *
1061 * @unit_size, if non-negative, specifies the unit size. It must be
1062 * aligned to PAGE_SIZE and at least as large as @static_size +
1063 * @reserved_size (+ @dyn_size if @dyn_size is non-negative).
1064 *
1065 * Non-null @base_addr means that the caller already allocated virtual
1066 * region for the first chunk and mapped it. percpu must not mess
1067 * with the chunk. Note that specifying @base_addr with auto @unit_size
1068 * or a non-NULL @populate_pte_fn doesn't make any sense.
1069 *
1070 * @populate_pte_fn is used to populate the pagetable. NULL means the
1071 * caller already populated the pagetable.
1072 *
1073 * If the first chunk ends up with both reserved and dynamic areas, it
1074 * is served by two chunks - one to serve the core static and reserved
1075 * areas and the other for the dynamic area. They share the same vm
1076 * and page map but use different area allocation maps to stay away
1077 * from each other. The latter chunk is circulated in the chunk slots
1078 * and available for dynamic allocation like any other chunk.
1079 *
1080 * RETURNS:
1081 * The determined pcpu_unit_size which can be used to initialize
1082 * percpu access.
1083 */
1084size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1085 size_t static_size, size_t reserved_size,
1086 ssize_t dyn_size, ssize_t unit_size,
1087 void *base_addr,
1088 pcpu_populate_pte_fn_t populate_pte_fn)
1089{
1090 static struct vm_struct first_vm;
1091 static int smap[2], dmap[2];
1092 size_t size_sum = static_size + reserved_size +
1093 (dyn_size >= 0 ? dyn_size : 0);
1094 struct pcpu_chunk *schunk, *dchunk = NULL;
1095 unsigned int cpu;
1096 int nr_pages;
1097 int err, i;
1098
1099	/* sanity checks */
1100 BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
1101 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
1102 BUG_ON(!static_size);
1103 if (unit_size >= 0) {
1104 BUG_ON(unit_size < size_sum);
1105 BUG_ON(unit_size & ~PAGE_MASK);
1106 BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
1107 } else
1108 BUG_ON(base_addr);
1109 BUG_ON(base_addr && populate_pte_fn);
1110
1111 if (unit_size >= 0)
1112 pcpu_unit_pages = unit_size >> PAGE_SHIFT;
1113 else
1114 pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
1115 PFN_UP(size_sum));
1116
1117 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1118 pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
1119 pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
1120 + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
1121
1122 if (dyn_size < 0)
1123 dyn_size = pcpu_unit_size - static_size - reserved_size;
1124
1125 /*
1126 * Allocate chunk slots. The additional last slot is for
1127 * empty chunks.
1128 */
1129 pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
1130 pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
1131 for (i = 0; i < pcpu_nr_slots; i++)
1132 INIT_LIST_HEAD(&pcpu_slot[i]);
1133
1134 /*
1135 * Initialize static chunk. If reserved_size is zero, the
1136 * static chunk covers static area + dynamic allocation area
1137 * in the first chunk. If reserved_size is not zero, it
1138 * covers static area + reserved area (mostly used for module
1139 * static percpu allocation).
1140 */
1141 schunk = alloc_bootmem(pcpu_chunk_struct_size);
1142 INIT_LIST_HEAD(&schunk->list);
1143 schunk->vm = &first_vm;
1144 schunk->map = smap;
1145 schunk->map_alloc = ARRAY_SIZE(smap);
1146 schunk->page = schunk->page_ar;
1147
1148 if (reserved_size) {
1149 schunk->free_size = reserved_size;
1150 pcpu_reserved_chunk = schunk; /* not for dynamic alloc */
1151 } else {
1152 schunk->free_size = dyn_size;
1153 dyn_size = 0; /* dynamic area covered */
1154 }
1155 schunk->contig_hint = schunk->free_size;
1156
1157 schunk->map[schunk->map_used++] = -static_size;
1158 if (schunk->free_size)
1159 schunk->map[schunk->map_used++] = schunk->free_size;
1160
1161 pcpu_reserved_chunk_limit = static_size + schunk->free_size;
1162
1163 /* init dynamic chunk if necessary */
1164 if (dyn_size) {
1165 dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
1166 INIT_LIST_HEAD(&dchunk->list);
1167 dchunk->vm = &first_vm;
1168 dchunk->map = dmap;
1169 dchunk->map_alloc = ARRAY_SIZE(dmap);
1170 dchunk->page = schunk->page_ar; /* share page map with schunk */
1171
1172 dchunk->contig_hint = dchunk->free_size = dyn_size;
1173 dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
1174 dchunk->map[dchunk->map_used++] = dchunk->free_size;
1175 }
1176
1177 /* allocate vm address */
1178 first_vm.flags = VM_ALLOC;
1179 first_vm.size = pcpu_chunk_size;
1180
1181 if (!base_addr)
1182 vm_area_register_early(&first_vm, PAGE_SIZE);
1183 else {
1184 /*
1185 * Pages already mapped. No need to remap into
1186 * vmalloc area. In this case the first chunks can't
1187 * be mapped or unmapped by percpu and are marked
1188 * immutable.
1189 */
1190 first_vm.addr = base_addr;
1191 schunk->immutable = true;
1192 if (dchunk)
1193 dchunk->immutable = true;
1194 }
1195
1196 /* assign pages */
1197 nr_pages = -1;
1198 for_each_possible_cpu(cpu) {
1199 for (i = 0; i < pcpu_unit_pages; i++) {
1200 struct page *page = get_page_fn(cpu, i);
1201
1202 if (!page)
1203 break;
1204 *pcpu_chunk_pagep(schunk, cpu, i) = page;
1205 }
1206
1207 BUG_ON(i < PFN_UP(static_size));
1208
1209 if (nr_pages < 0)
1210 nr_pages = i;
1211 else
1212 BUG_ON(nr_pages != i);
1213 }
1214
1215 /* map them */
1216 if (populate_pte_fn) {
1217 for_each_possible_cpu(cpu)
1218 for (i = 0; i < nr_pages; i++)
1219 populate_pte_fn(pcpu_chunk_addr(schunk,
1220 cpu, i));
1221
1222 err = pcpu_map(schunk, 0, nr_pages);
1223 if (err)
1224 panic("failed to setup static percpu area, err=%d\n",
1225 err);
1226 }
1227
1228 /* link the first chunk in */
1229 if (!dchunk) {
1230 pcpu_chunk_relocate(schunk, -1);
1231 pcpu_chunk_addr_insert(schunk);
1232 } else {
1233 pcpu_chunk_relocate(dchunk, -1);
1234 pcpu_chunk_addr_insert(dchunk);
1235 }
1236
1237 /* we're done */
1238 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
1239 return pcpu_unit_size;
1240}
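/*
 * Editorial note, not part of the original file: when both a reserved
 * and a dynamic area are requested, each cpu's unit in the first chunk
 * is laid out roughly as
 *
 *	| static | reserved | dynamic | (possibly unused up to unit_size)
 *
 * schunk covers [0, static_size + reserved_size) and is reachable only
 * through __alloc_reserved_percpu(), while dchunk covers the dynamic
 * area and circulates in the chunk slots like any later chunk.
 */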
1241
1242/*
1243 * Embedding first chunk setup helper.
1244 */
1245static void *pcpue_ptr __initdata;
1246static size_t pcpue_size __initdata;
1247static size_t pcpue_unit_size __initdata;
1248
1249static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
1250{
1251 size_t off = (size_t)pageno << PAGE_SHIFT;
1252
1253 if (off >= pcpue_size)
1254 return NULL;
1255
1256 return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
1257}
1258
1259/**
1260 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
1261 * @static_size: the size of static percpu area in bytes
1262 * @reserved_size: the size of reserved percpu area in bytes
1263 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
1264 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
1265 *
1266 * This is a helper to ease setting up the embedded first percpu chunk and
1267 * can be called where pcpu_setup_first_chunk() is expected.
1268 *
1269 * If this function is used to set up the first chunk, it is allocated
1270 * as a contiguous area using bootmem allocator and used as-is without
1271 * being mapped into vmalloc area. This enables the first chunk to
1272 * piggyback on the linear physical mapping, which often uses larger
1273 * page size.
1274 *
1275 * When @dyn_size is positive, the dynamic area might be larger than
1276 * specified to fill page alignment. Also, when @dyn_size is auto,
1277 * @dyn_size does not fill the whole first chunk but only what's
1278 * necessary for page alignment after static and reserved areas.
1279 *
1280 * If the needed size is smaller than the minimum or specified unit
1281 * size, the leftover is returned to the bootmem allocator.
1282 *
1283 * RETURNS:
1284 * The determined pcpu_unit_size which can be used to initialize
1285 * percpu access on success, -errno on failure.
1286 */
1287ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1288 ssize_t dyn_size, ssize_t unit_size)
1289{
1290 unsigned int cpu;
1291
1292 /* determine parameters and allocate */
1293 pcpue_size = PFN_ALIGN(static_size + reserved_size +
1294 (dyn_size >= 0 ? dyn_size : 0));
1295 if (dyn_size != 0)
1296 dyn_size = pcpue_size - static_size - reserved_size;
1297
1298 if (unit_size >= 0) {
1299 BUG_ON(unit_size < pcpue_size);
1300 pcpue_unit_size = unit_size;
1301 } else
1302 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
1303
1304 pcpue_ptr = __alloc_bootmem_nopanic(
1305 num_possible_cpus() * pcpue_unit_size,
1306 PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
1307 if (!pcpue_ptr)
1308 return -ENOMEM;
1309
1310 /* return the leftover and copy */
1311 for_each_possible_cpu(cpu) {
1312 void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
1313
1314 free_bootmem(__pa(ptr + pcpue_size),
1315 pcpue_unit_size - pcpue_size);
1316 memcpy(ptr, __per_cpu_load, static_size);
1317 }
1318
1319 /* we're ready, commit */
1320 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
1321 pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
1322
1323 return pcpu_setup_first_chunk(pcpue_get_page, static_size,
1324 reserved_size, dyn_size,
1325 pcpue_unit_size, pcpue_ptr, NULL);
1326}
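/*
 * Editorial sketch, not part of the original file: roughly how an
 * architecture's setup_per_cpu_areas() might drive the embed helper.
 * The linker symbols, __per_cpu_offset[] and PERCPU_MODULE_RESERVE are
 * assumptions about the calling arch; the real callers live under arch/.
 */
#if 0	/* illustration only, never compiled */
void __init setup_per_cpu_areas(void)
{
	size_t static_size = __per_cpu_end - __per_cpu_start;
	ssize_t unit_size;
	unsigned long delta;
	unsigned int cpu;

	/* embed the first chunk, letting dyn_size and unit_size auto-size */
	unit_size = pcpu_embed_first_chunk(static_size,
					   PERCPU_MODULE_RESERVE, -1, -1);
	if (unit_size < 0)
		panic("percpu: failed to embed first chunk (%zd)", unit_size);

	/* point each cpu's percpu offset at its unit */
	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu)
		__per_cpu_offset[cpu] = delta + cpu * unit_size;
}
#endif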
diff --git a/mm/readahead.c b/mm/readahead.c
index bec83c15a78f..133b6d525513 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -17,19 +17,6 @@
17#include <linux/pagevec.h> 17#include <linux/pagevec.h>
18#include <linux/pagemap.h> 18#include <linux/pagemap.h>
19 19
20void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
21{
22}
23EXPORT_SYMBOL(default_unplug_io_fn);
24
25struct backing_dev_info default_backing_dev_info = {
26 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
27 .state = 0,
28 .capabilities = BDI_CAP_MAP_COPY,
29 .unplug_io_fn = default_unplug_io_fn,
30};
31EXPORT_SYMBOL_GPL(default_backing_dev_info);
32
33/* 20/*
34 * Initialise a struct file's readahead state. Assumes that the caller has 21 * Initialise a struct file's readahead state. Assumes that the caller has
35 * memset *ra to zero. 22 * memset *ra to zero.
@@ -44,6 +31,42 @@ EXPORT_SYMBOL_GPL(file_ra_state_init);
44 31
45#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) 32#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
46 33
34/*
35 * see if a page needs releasing upon read_cache_pages() failure
36 * - the caller of read_cache_pages() may have set PG_private or PG_fscache
37 * before calling, such as the NFS fs marking pages that are cached locally
38 * on disk, thus we need to give the fs a chance to clean up in the event of
39 * an error
40 */
41static void read_cache_pages_invalidate_page(struct address_space *mapping,
42 struct page *page)
43{
44 if (page_has_private(page)) {
45 if (!trylock_page(page))
46 BUG();
47 page->mapping = mapping;
48 do_invalidatepage(page, 0);
49 page->mapping = NULL;
50 unlock_page(page);
51 }
52 page_cache_release(page);
53}
54
55/*
56 * release a list of pages, invalidating them first if need be
57 */
58static void read_cache_pages_invalidate_pages(struct address_space *mapping,
59 struct list_head *pages)
60{
61 struct page *victim;
62
63 while (!list_empty(pages)) {
64 victim = list_to_page(pages);
65 list_del(&victim->lru);
66 read_cache_pages_invalidate_page(mapping, victim);
67 }
68}
69
47/** 70/**
48 * read_cache_pages - populate an address space with some pages & start reads against them 71 * read_cache_pages - populate an address space with some pages & start reads against them
49 * @mapping: the address_space 72 * @mapping: the address_space
@@ -65,14 +88,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
65 list_del(&page->lru); 88 list_del(&page->lru);
66 if (add_to_page_cache_lru(page, mapping, 89 if (add_to_page_cache_lru(page, mapping,
67 page->index, GFP_KERNEL)) { 90 page->index, GFP_KERNEL)) {
68 page_cache_release(page); 91 read_cache_pages_invalidate_page(mapping, page);
69 continue; 92 continue;
70 } 93 }
71 page_cache_release(page); 94 page_cache_release(page);
72 95
73 ret = filler(data, page); 96 ret = filler(data, page);
74 if (unlikely(ret)) { 97 if (unlikely(ret)) {
75 put_pages_list(pages); 98 read_cache_pages_invalidate_pages(mapping, pages);
76 break; 99 break;
77 } 100 }
78 task_io_account_read(PAGE_CACHE_SIZE); 101 task_io_account_read(PAGE_CACHE_SIZE);
@@ -233,18 +256,6 @@ unsigned long max_sane_readahead(unsigned long nr)
233 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); 256 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
234} 257}
235 258
236static int __init readahead_init(void)
237{
238 int err;
239
240 err = bdi_init(&default_backing_dev_info);
241 if (!err)
242 bdi_register(&default_backing_dev_info, NULL, "default");
243
244 return err;
245}
246subsys_initcall(readahead_init);
247
248/* 259/*
249 * Submit IO for the read-ahead request in file_ra_state. 260 * Submit IO for the read-ahead request in file_ra_state.
250 */ 261 */
diff --git a/mm/shmem.c b/mm/shmem.c
index 4103a239ce84..d94d2e9146bc 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,6 +28,7 @@
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/swap.h> 30#include <linux/swap.h>
31#include <linux/ima.h>
31 32
32static struct vfsmount *shm_mnt; 33static struct vfsmount *shm_mnt;
33 34
@@ -1067,8 +1068,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1067 swap_duplicate(swap); 1068 swap_duplicate(swap);
1068 BUG_ON(page_mapped(page)); 1069 BUG_ON(page_mapped(page));
1069 page_cache_release(page); /* pagecache ref */ 1070 page_cache_release(page); /* pagecache ref */
1070 set_page_dirty(page); 1071 swap_writepage(page, wbc);
1071 unlock_page(page);
1072 if (inode) { 1072 if (inode) {
1073 mutex_lock(&shmem_swaplist_mutex); 1073 mutex_lock(&shmem_swaplist_mutex);
1074 /* move instead of add in case we're racing */ 1074 /* move instead of add in case we're racing */
@@ -2665,6 +2665,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2665 if (IS_ERR(file)) 2665 if (IS_ERR(file))
2666 return PTR_ERR(file); 2666 return PTR_ERR(file);
2667 2667
2668 ima_shm_check(file);
2668 if (vma->vm_file) 2669 if (vma->vm_file)
2669 fput(vma->vm_file); 2670 fput(vma->vm_file);
2670 vma->vm_file = file; 2671 vma->vm_file = file;
diff --git a/mm/slab.c b/mm/slab.c
index 4d00855629c4..208323fd37bc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3318,6 +3318,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3318 unsigned long save_flags; 3318 unsigned long save_flags;
3319 void *ptr; 3319 void *ptr;
3320 3320
3321 lockdep_trace_alloc(flags);
3322
3321 if (slab_should_failslab(cachep, flags)) 3323 if (slab_should_failslab(cachep, flags))
3322 return NULL; 3324 return NULL;
3323 3325
@@ -3394,6 +3396,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3394 unsigned long save_flags; 3396 unsigned long save_flags;
3395 void *objp; 3397 void *objp;
3396 3398
3399 lockdep_trace_alloc(flags);
3400
3397 if (slab_should_failslab(cachep, flags)) 3401 if (slab_should_failslab(cachep, flags))
3398 return NULL; 3402 return NULL;
3399 3403
@@ -3988,8 +3992,7 @@ static void cache_reap(struct work_struct *w)
3988 struct kmem_cache *searchp; 3992 struct kmem_cache *searchp;
3989 struct kmem_list3 *l3; 3993 struct kmem_list3 *l3;
3990 int node = numa_node_id(); 3994 int node = numa_node_id();
3991 struct delayed_work *work = 3995 struct delayed_work *work = to_delayed_work(w);
3992 container_of(w, struct delayed_work, work);
3993 3996
3994 if (!mutex_trylock(&cache_chain_mutex)) 3997 if (!mutex_trylock(&cache_chain_mutex))
3995 /* Give up. Setup the next iteration. */ 3998 /* Give up. Setup the next iteration. */
diff --git a/mm/slob.c b/mm/slob.c
index 52bc8a2bd9ef..7a3411524dac 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -126,9 +126,9 @@ static LIST_HEAD(free_slob_medium);
126static LIST_HEAD(free_slob_large); 126static LIST_HEAD(free_slob_large);
127 127
128/* 128/*
129 * slob_page: True for all slob pages (false for bigblock pages) 129 * is_slob_page: True for all slob pages (false for bigblock pages)
130 */ 130 */
131static inline int slob_page(struct slob_page *sp) 131static inline int is_slob_page(struct slob_page *sp)
132{ 132{
133 return PageSlobPage((struct page *)sp); 133 return PageSlobPage((struct page *)sp);
134} 134}
@@ -143,6 +143,11 @@ static inline void clear_slob_page(struct slob_page *sp)
143 __ClearPageSlobPage((struct page *)sp); 143 __ClearPageSlobPage((struct page *)sp);
144} 144}
145 145
146static inline struct slob_page *slob_page(const void *addr)
147{
148 return (struct slob_page *)virt_to_page(addr);
149}
150
146/* 151/*
147 * slob_page_free: true for pages on free_slob_pages list. 152 * slob_page_free: true for pages on free_slob_pages list.
148 */ 153 */
@@ -230,7 +235,7 @@ static int slob_last(slob_t *s)
230 return !((unsigned long)slob_next(s) & ~PAGE_MASK); 235 return !((unsigned long)slob_next(s) & ~PAGE_MASK);
231} 236}
232 237
233static void *slob_new_page(gfp_t gfp, int order, int node) 238static void *slob_new_pages(gfp_t gfp, int order, int node)
234{ 239{
235 void *page; 240 void *page;
236 241
@@ -247,12 +252,17 @@ static void *slob_new_page(gfp_t gfp, int order, int node)
247 return page_address(page); 252 return page_address(page);
248} 253}
249 254
255static void slob_free_pages(void *b, int order)
256{
257 free_pages((unsigned long)b, order);
258}
259
250/* 260/*
251 * Allocate a slob block within a given slob_page sp. 261 * Allocate a slob block within a given slob_page sp.
252 */ 262 */
253static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) 263static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
254{ 264{
255 slob_t *prev, *cur, *aligned = 0; 265 slob_t *prev, *cur, *aligned = NULL;
256 int delta = 0, units = SLOB_UNITS(size); 266 int delta = 0, units = SLOB_UNITS(size);
257 267
258 for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { 268 for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) {
@@ -349,10 +359,10 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
349 359
350 /* Not enough space: must allocate a new page */ 360 /* Not enough space: must allocate a new page */
351 if (!b) { 361 if (!b) {
352 b = slob_new_page(gfp & ~__GFP_ZERO, 0, node); 362 b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
353 if (!b) 363 if (!b)
354 return 0; 364 return NULL;
355 sp = (struct slob_page *)virt_to_page(b); 365 sp = slob_page(b);
356 set_slob_page(sp); 366 set_slob_page(sp);
357 367
358 spin_lock_irqsave(&slob_lock, flags); 368 spin_lock_irqsave(&slob_lock, flags);
@@ -384,7 +394,7 @@ static void slob_free(void *block, int size)
384 return; 394 return;
385 BUG_ON(!size); 395 BUG_ON(!size);
386 396
387 sp = (struct slob_page *)virt_to_page(block); 397 sp = slob_page(block);
388 units = SLOB_UNITS(size); 398 units = SLOB_UNITS(size);
389 399
390 spin_lock_irqsave(&slob_lock, flags); 400 spin_lock_irqsave(&slob_lock, flags);
@@ -393,10 +403,11 @@ static void slob_free(void *block, int size)
393 /* Go directly to page allocator. Do not pass slob allocator */ 403 /* Go directly to page allocator. Do not pass slob allocator */
394 if (slob_page_free(sp)) 404 if (slob_page_free(sp))
395 clear_slob_page_free(sp); 405 clear_slob_page_free(sp);
406 spin_unlock_irqrestore(&slob_lock, flags);
396 clear_slob_page(sp); 407 clear_slob_page(sp);
397 free_slob_page(sp); 408 free_slob_page(sp);
398 free_page((unsigned long)b); 409 free_page((unsigned long)b);
399 goto out; 410 return;
400 } 411 }
401 412
402 if (!slob_page_free(sp)) { 413 if (!slob_page_free(sp)) {
@@ -464,6 +475,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
464 unsigned int *m; 475 unsigned int *m;
465 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 476 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
466 477
478 lockdep_trace_alloc(gfp);
479
467 if (size < PAGE_SIZE - align) { 480 if (size < PAGE_SIZE - align) {
468 if (!size) 481 if (!size)
469 return ZERO_SIZE_PTR; 482 return ZERO_SIZE_PTR;
@@ -476,7 +489,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
476 } else { 489 } else {
477 void *ret; 490 void *ret;
478 491
479 ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); 492 ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node);
480 if (ret) { 493 if (ret) {
481 struct page *page; 494 struct page *page;
482 page = virt_to_page(ret); 495 page = virt_to_page(ret);
@@ -494,8 +507,8 @@ void kfree(const void *block)
494 if (unlikely(ZERO_OR_NULL_PTR(block))) 507 if (unlikely(ZERO_OR_NULL_PTR(block)))
495 return; 508 return;
496 509
497 sp = (struct slob_page *)virt_to_page(block); 510 sp = slob_page(block);
498 if (slob_page(sp)) { 511 if (is_slob_page(sp)) {
499 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 512 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
500 unsigned int *m = (unsigned int *)(block - align); 513 unsigned int *m = (unsigned int *)(block - align);
501 slob_free(m, *m + align); 514 slob_free(m, *m + align);
@@ -513,8 +526,8 @@ size_t ksize(const void *block)
513 if (unlikely(block == ZERO_SIZE_PTR)) 526 if (unlikely(block == ZERO_SIZE_PTR))
514 return 0; 527 return 0;
515 528
516 sp = (struct slob_page *)virt_to_page(block); 529 sp = slob_page(block);
517 if (slob_page(sp)) { 530 if (is_slob_page(sp)) {
518 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 531 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
519 unsigned int *m = (unsigned int *)(block - align); 532 unsigned int *m = (unsigned int *)(block - align);
520 return SLOB_UNITS(*m) * SLOB_UNIT; 533 return SLOB_UNITS(*m) * SLOB_UNIT;
@@ -573,7 +586,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
573 if (c->size < PAGE_SIZE) 586 if (c->size < PAGE_SIZE)
574 b = slob_alloc(c->size, flags, c->align, node); 587 b = slob_alloc(c->size, flags, c->align, node);
575 else 588 else
576 b = slob_new_page(flags, get_order(c->size), node); 589 b = slob_new_pages(flags, get_order(c->size), node);
577 590
578 if (c->ctor) 591 if (c->ctor)
579 c->ctor(b); 592 c->ctor(b);
@@ -587,7 +600,7 @@ static void __kmem_cache_free(void *b, int size)
587 if (size < PAGE_SIZE) 600 if (size < PAGE_SIZE)
588 slob_free(b, size); 601 slob_free(b, size);
589 else 602 else
590 free_pages((unsigned long)b, get_order(size)); 603 slob_free_pages(b, get_order(size));
591} 604}
592 605
593static void kmem_rcu_free(struct rcu_head *head) 606static void kmem_rcu_free(struct rcu_head *head)
diff --git a/mm/slub.c b/mm/slub.c
index 0280eee6cf37..c4ea9158c9fb 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -374,14 +374,8 @@ static struct track *get_track(struct kmem_cache *s, void *object,
374static void set_track(struct kmem_cache *s, void *object, 374static void set_track(struct kmem_cache *s, void *object,
375 enum track_item alloc, unsigned long addr) 375 enum track_item alloc, unsigned long addr)
376{ 376{
377 struct track *p; 377 struct track *p = get_track(s, object, alloc);
378
379 if (s->offset)
380 p = object + s->offset + sizeof(void *);
381 else
382 p = object + s->inuse;
383 378
384 p += alloc;
385 if (addr) { 379 if (addr) {
386 p->addr = addr; 380 p->addr = addr;
387 p->cpu = smp_processor_id(); 381 p->cpu = smp_processor_id();
@@ -1335,7 +1329,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1335 n = get_node(s, zone_to_nid(zone)); 1329 n = get_node(s, zone_to_nid(zone));
1336 1330
1337 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1331 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1338 n->nr_partial > n->min_partial) { 1332 n->nr_partial > s->min_partial) {
1339 page = get_partial_node(n); 1333 page = get_partial_node(n);
1340 if (page) 1334 if (page)
1341 return page; 1335 return page;
@@ -1387,7 +1381,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1387 slab_unlock(page); 1381 slab_unlock(page);
1388 } else { 1382 } else {
1389 stat(c, DEACTIVATE_EMPTY); 1383 stat(c, DEACTIVATE_EMPTY);
1390 if (n->nr_partial < n->min_partial) { 1384 if (n->nr_partial < s->min_partial) {
1391 /* 1385 /*
1392 * Adding an empty slab to the partial slabs in order 1386 * Adding an empty slab to the partial slabs in order
1393 * to avoid page allocator overhead. This slab needs 1387 * to avoid page allocator overhead. This slab needs
@@ -1596,6 +1590,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1596 unsigned long flags; 1590 unsigned long flags;
1597 unsigned int objsize; 1591 unsigned int objsize;
1598 1592
1593 lockdep_trace_alloc(gfpflags);
1599 might_sleep_if(gfpflags & __GFP_WAIT); 1594 might_sleep_if(gfpflags & __GFP_WAIT);
1600 1595
1601 if (should_failslab(s->objsize, gfpflags)) 1596 if (should_failslab(s->objsize, gfpflags))
@@ -1724,7 +1719,7 @@ static __always_inline void slab_free(struct kmem_cache *s,
1724 c = get_cpu_slab(s, smp_processor_id()); 1719 c = get_cpu_slab(s, smp_processor_id());
1725 debug_check_no_locks_freed(object, c->objsize); 1720 debug_check_no_locks_freed(object, c->objsize);
1726 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1721 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1727 debug_check_no_obj_freed(object, s->objsize); 1722 debug_check_no_obj_freed(object, c->objsize);
1728 if (likely(page == c->page && c->node >= 0)) { 1723 if (likely(page == c->page && c->node >= 0)) {
1729 object[c->offset] = c->freelist; 1724 object[c->offset] = c->freelist;
1730 c->freelist = object; 1725 c->freelist = object;
@@ -1844,6 +1839,7 @@ static inline int calculate_order(int size)
1844 int order; 1839 int order;
1845 int min_objects; 1840 int min_objects;
1846 int fraction; 1841 int fraction;
1842 int max_objects;
1847 1843
1848 /* 1844 /*
1849 * Attempt to find best configuration for a slab. This 1845 * Attempt to find best configuration for a slab. This
@@ -1856,6 +1852,9 @@ static inline int calculate_order(int size)
1856 min_objects = slub_min_objects; 1852 min_objects = slub_min_objects;
1857 if (!min_objects) 1853 if (!min_objects)
1858 min_objects = 4 * (fls(nr_cpu_ids) + 1); 1854 min_objects = 4 * (fls(nr_cpu_ids) + 1);
1855 max_objects = (PAGE_SIZE << slub_max_order)/size;
1856 min_objects = min(min_objects, max_objects);
1857
1859 while (min_objects > 1) { 1858 while (min_objects > 1) {
1860 fraction = 16; 1859 fraction = 16;
1861 while (fraction >= 4) { 1860 while (fraction >= 4) {
@@ -1865,7 +1864,7 @@ static inline int calculate_order(int size)
1865 return order; 1864 return order;
1866 fraction /= 2; 1865 fraction /= 2;
1867 } 1866 }
1868 min_objects /= 2; 1867 min_objects --;
1869 } 1868 }
1870 1869
1871 /* 1870 /*
@@ -1928,17 +1927,6 @@ static void
1928init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) 1927init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
1929{ 1928{
1930 n->nr_partial = 0; 1929 n->nr_partial = 0;
1931
1932 /*
1933 * The larger the object size is, the more pages we want on the partial
1934 * list to avoid pounding the page allocator excessively.
1935 */
1936 n->min_partial = ilog2(s->size);
1937 if (n->min_partial < MIN_PARTIAL)
1938 n->min_partial = MIN_PARTIAL;
1939 else if (n->min_partial > MAX_PARTIAL)
1940 n->min_partial = MAX_PARTIAL;
1941
1942 spin_lock_init(&n->list_lock); 1930 spin_lock_init(&n->list_lock);
1943 INIT_LIST_HEAD(&n->partial); 1931 INIT_LIST_HEAD(&n->partial);
1944#ifdef CONFIG_SLUB_DEBUG 1932#ifdef CONFIG_SLUB_DEBUG
@@ -2181,6 +2169,15 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2181} 2169}
2182#endif 2170#endif
2183 2171
2172static void set_min_partial(struct kmem_cache *s, unsigned long min)
2173{
2174 if (min < MIN_PARTIAL)
2175 min = MIN_PARTIAL;
2176 else if (min > MAX_PARTIAL)
2177 min = MAX_PARTIAL;
2178 s->min_partial = min;
2179}
2180
2184/* 2181/*
2185 * calculate_sizes() determines the order and the distribution of data within 2182 * calculate_sizes() determines the order and the distribution of data within
2186 * a slab object. 2183 * a slab object.
@@ -2319,6 +2316,11 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2319 if (!calculate_sizes(s, -1)) 2316 if (!calculate_sizes(s, -1))
2320 goto error; 2317 goto error;
2321 2318
2319 /*
2320 * The larger the object size is, the more pages we want on the partial
2321 * list to avoid pounding the page allocator excessively.
2322 */
2323 set_min_partial(s, ilog2(s->size));
2322 s->refcount = 1; 2324 s->refcount = 1;
2323#ifdef CONFIG_NUMA 2325#ifdef CONFIG_NUMA
2324 s->remote_node_defrag_ratio = 1000; 2326 s->remote_node_defrag_ratio = 1000;
@@ -2475,7 +2477,7 @@ EXPORT_SYMBOL(kmem_cache_destroy);
2475 * Kmalloc subsystem 2477 * Kmalloc subsystem
2476 *******************************************************************/ 2478 *******************************************************************/
2477 2479
2478struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; 2480struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned;
2479EXPORT_SYMBOL(kmalloc_caches); 2481EXPORT_SYMBOL(kmalloc_caches);
2480 2482
2481static int __init setup_slub_min_order(char *str) 2483static int __init setup_slub_min_order(char *str)
@@ -2537,7 +2539,7 @@ panic:
2537} 2539}
2538 2540
2539#ifdef CONFIG_ZONE_DMA 2541#ifdef CONFIG_ZONE_DMA
2540static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; 2542static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT];
2541 2543
2542static void sysfs_add_func(struct work_struct *w) 2544static void sysfs_add_func(struct work_struct *w)
2543{ 2545{
@@ -2658,7 +2660,7 @@ void *__kmalloc(size_t size, gfp_t flags)
2658{ 2660{
2659 struct kmem_cache *s; 2661 struct kmem_cache *s;
2660 2662
2661 if (unlikely(size > PAGE_SIZE)) 2663 if (unlikely(size > SLUB_MAX_SIZE))
2662 return kmalloc_large(size, flags); 2664 return kmalloc_large(size, flags);
2663 2665
2664 s = get_slab(size, flags); 2666 s = get_slab(size, flags);
@@ -2686,7 +2688,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
2686{ 2688{
2687 struct kmem_cache *s; 2689 struct kmem_cache *s;
2688 2690
2689 if (unlikely(size > PAGE_SIZE)) 2691 if (unlikely(size > SLUB_MAX_SIZE))
2690 return kmalloc_large_node(size, flags, node); 2692 return kmalloc_large_node(size, flags, node);
2691 2693
2692 s = get_slab(size, flags); 2694 s = get_slab(size, flags);
@@ -2986,7 +2988,7 @@ void __init kmem_cache_init(void)
2986 caches++; 2988 caches++;
2987 } 2989 }
2988 2990
2989 for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { 2991 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
2990 create_kmalloc_cache(&kmalloc_caches[i], 2992 create_kmalloc_cache(&kmalloc_caches[i],
2991 "kmalloc", 1 << i, GFP_KERNEL); 2993 "kmalloc", 1 << i, GFP_KERNEL);
2992 caches++; 2994 caches++;
@@ -3023,7 +3025,7 @@ void __init kmem_cache_init(void)
3023 slab_state = UP; 3025 slab_state = UP;
3024 3026
3025 /* Provide the correct kmalloc names now that the caches are up */ 3027 /* Provide the correct kmalloc names now that the caches are up */
3026 for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) 3028 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
3027 kmalloc_caches[i]. name = 3029 kmalloc_caches[i]. name =
3028 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); 3030 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
3029 3031
@@ -3223,7 +3225,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3223{ 3225{
3224 struct kmem_cache *s; 3226 struct kmem_cache *s;
3225 3227
3226 if (unlikely(size > PAGE_SIZE)) 3228 if (unlikely(size > SLUB_MAX_SIZE))
3227 return kmalloc_large(size, gfpflags); 3229 return kmalloc_large(size, gfpflags);
3228 3230
3229 s = get_slab(size, gfpflags); 3231 s = get_slab(size, gfpflags);
@@ -3239,7 +3241,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3239{ 3241{
3240 struct kmem_cache *s; 3242 struct kmem_cache *s;
3241 3243
3242 if (unlikely(size > PAGE_SIZE)) 3244 if (unlikely(size > SLUB_MAX_SIZE))
3243 return kmalloc_large_node(size, gfpflags, node); 3245 return kmalloc_large_node(size, gfpflags, node);
3244 3246
3245 s = get_slab(size, gfpflags); 3247 s = get_slab(size, gfpflags);
@@ -3836,6 +3838,26 @@ static ssize_t order_show(struct kmem_cache *s, char *buf)
3836} 3838}
3837SLAB_ATTR(order); 3839SLAB_ATTR(order);
3838 3840
3841static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
3842{
3843 return sprintf(buf, "%lu\n", s->min_partial);
3844}
3845
3846static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
3847 size_t length)
3848{
3849 unsigned long min;
3850 int err;
3851
3852 err = strict_strtoul(buf, 10, &min);
3853 if (err)
3854 return err;
3855
3856 set_min_partial(s, min);
3857 return length;
3858}
3859SLAB_ATTR(min_partial);
3860
3839static ssize_t ctor_show(struct kmem_cache *s, char *buf) 3861static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3840{ 3862{
3841 if (s->ctor) { 3863 if (s->ctor) {
@@ -4151,6 +4173,7 @@ static struct attribute *slab_attrs[] = {
4151 &object_size_attr.attr, 4173 &object_size_attr.attr,
4152 &objs_per_slab_attr.attr, 4174 &objs_per_slab_attr.attr,
4153 &order_attr.attr, 4175 &order_attr.attr,
4176 &min_partial_attr.attr,
4154 &objects_attr.attr, 4177 &objects_attr.attr,
4155 &objects_partial_attr.attr, 4178 &objects_partial_attr.attr,
4156 &total_objects_attr.attr, 4179 &total_objects_attr.attr,
diff --git a/mm/sparse.c b/mm/sparse.c
index 083f5b63e7a8..da432d9f0ae8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -164,9 +164,7 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
164 WARN_ON_ONCE(1); 164 WARN_ON_ONCE(1);
165 *start_pfn = max_sparsemem_pfn; 165 *start_pfn = max_sparsemem_pfn;
166 *end_pfn = max_sparsemem_pfn; 166 *end_pfn = max_sparsemem_pfn;
167 } 167 } else if (*end_pfn > max_sparsemem_pfn) {
168
169 if (*end_pfn > max_sparsemem_pfn) {
170 mminit_dprintk(MMINIT_WARNING, "pfnvalidation", 168 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
171 "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n", 169 "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
172 *start_pfn, *end_pfn, max_sparsemem_pfn); 170 *start_pfn, *end_pfn, max_sparsemem_pfn);
diff --git a/mm/swap.c b/mm/swap.c
index 8adb9feb61e1..bede23ce64ea 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -448,8 +448,8 @@ void pagevec_strip(struct pagevec *pvec)
448 for (i = 0; i < pagevec_count(pvec); i++) { 448 for (i = 0; i < pagevec_count(pvec); i++) {
449 struct page *page = pvec->pages[i]; 449 struct page *page = pvec->pages[i];
450 450
451 if (PagePrivate(page) && trylock_page(page)) { 451 if (page_has_private(page) && trylock_page(page)) {
452 if (PagePrivate(page)) 452 if (page_has_private(page))
453 try_to_release_page(page, 0); 453 try_to_release_page(page, 0);
454 unlock_page(page); 454 unlock_page(page);
455 } 455 }
@@ -457,29 +457,6 @@ void pagevec_strip(struct pagevec *pvec)
457} 457}
458 458
459/** 459/**
460 * pagevec_swap_free - try to free swap space from the pages in a pagevec
461 * @pvec: pagevec with swapcache pages to free the swap space of
462 *
463 * The caller needs to hold an extra reference to each page and
464 * not hold the page lock on the pages. This function uses a
465 * trylock on the page lock so it may not always free the swap
466 * space associated with a page.
467 */
468void pagevec_swap_free(struct pagevec *pvec)
469{
470 int i;
471
472 for (i = 0; i < pagevec_count(pvec); i++) {
473 struct page *page = pvec->pages[i];
474
475 if (PageSwapCache(page) && trylock_page(page)) {
476 try_to_free_swap(page);
477 unlock_page(page);
478 }
479 }
480}
481
482/**
483 * pagevec_lookup - gang pagecache lookup 460 * pagevec_lookup - gang pagecache lookup
484 * @pvec: Where the resulting pages are placed 461 * @pvec: Where the resulting pages are placed
485 * @mapping: The address_space to search 462 * @mapping: The address_space to search
diff --git a/mm/truncate.c b/mm/truncate.c
index 1229211104f8..55206fab7b99 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -50,7 +50,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
 	zero_user_segment(page, partial, PAGE_CACHE_SIZE);
-	if (PagePrivate(page))
+	if (page_has_private(page))
 		do_invalidatepage(page, partial);
 }

@@ -99,7 +99,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 	if (page->mapping != mapping)
 		return;

-	if (PagePrivate(page))
+	if (page_has_private(page))
 		do_invalidatepage(page, 0);

 	cancel_dirty_page(page, PAGE_CACHE_SIZE);
@@ -126,7 +126,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
 	if (page->mapping != mapping)
 		return 0;

-	if (PagePrivate(page) && !try_to_release_page(page, 0))
+	if (page_has_private(page) && !try_to_release_page(page, 0))
 		return 0;

 	clear_page_mlock(page);
@@ -348,7 +348,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	if (page->mapping != mapping)
 		return 0;

-	if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
+	if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
 		return 0;

 	spin_lock_irq(&mapping->tree_lock);
@@ -356,7 +356,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 		goto failed;

 	clear_page_mlock(page);
-	BUG_ON(PagePrivate(page));
+	BUG_ON(page_has_private(page));
 	__remove_from_page_cache(page);
 	spin_unlock_irq(&mapping->tree_lock);
 	page_cache_release(page);	/* pagecache ref */
diff --git a/mm/util.c b/mm/util.c
index 37eaccdf3054..7c122e49f769 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -70,6 +70,36 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
70EXPORT_SYMBOL(kmemdup); 70EXPORT_SYMBOL(kmemdup);
71 71
72/** 72/**
73 * memdup_user - duplicate memory region from user space
74 *
75 * @src: source address in user space
76 * @len: number of bytes to copy
77 *
78 * Returns an ERR_PTR() on failure.
79 */
80void *memdup_user(const void __user *src, size_t len)
81{
82 void *p;
83
84 /*
85 * Always use GFP_KERNEL, since copy_from_user() can sleep and
86 * cause pagefault, which makes it pointless to use GFP_NOFS
87 * or GFP_ATOMIC.
88 */
89 p = kmalloc_track_caller(len, GFP_KERNEL);
90 if (!p)
91 return ERR_PTR(-ENOMEM);
92
93 if (copy_from_user(p, src, len)) {
94 kfree(p);
95 return ERR_PTR(-EFAULT);
96 }
97
98 return p;
99}
100EXPORT_SYMBOL(memdup_user);
101
102/**
73 * __krealloc - like krealloc() but don't free @p. 103 * __krealloc - like krealloc() but don't free @p.
74 * @p: object to reallocate memory for. 104 * @p: object to reallocate memory for.
75 * @new_size: how many bytes of memory are required. 105 * @new_size: how many bytes of memory are required.
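memdup_user() above packages the usual allocate / copy_from_user() / unwind sequence behind one call. A minimal, hypothetical caller (the config structure and function names are invented for illustration) could look like:

/* Hypothetical ioctl-style helper using the new memdup_user(). */
static long example_set_config(void __user *ubuf, size_t len)
{
	struct example_config *cfg;

	cfg = memdup_user(ubuf, len);	/* GFP_KERNEL alloc + copy_from_user() */
	if (IS_ERR(cfg))
		return PTR_ERR(cfg);	/* -ENOMEM or -EFAULT */

	/* ... validate and apply cfg ... */

	kfree(cfg);
	return 0;
}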
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 520a75980269..fab19876b4d1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -24,6 +24,7 @@
 #include <linux/radix-tree.h>
 #include <linux/rcupdate.h>
 #include <linux/bootmem.h>
+#include <linux/pfn.h>

 #include <asm/atomic.h>
 #include <asm/uaccess.h>
@@ -152,8 +153,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
  *
  * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
  */
-static int vmap_page_range(unsigned long start, unsigned long end,
-				pgprot_t prot, struct page **pages)
+static int vmap_page_range_noflush(unsigned long start, unsigned long end,
+				   pgprot_t prot, struct page **pages)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -169,13 +170,22 @@ static int vmap_page_range(unsigned long start, unsigned long end,
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
-	flush_cache_vmap(start, end);

 	if (unlikely(err))
 		return err;
 	return nr;
 }

+static int vmap_page_range(unsigned long start, unsigned long end,
+			   pgprot_t prot, struct page **pages)
+{
+	int ret;
+
+	ret = vmap_page_range_noflush(start, end, prot, pages);
+	flush_cache_vmap(start, end);
+	return ret;
+}
+
 static inline int is_vmalloc_or_module_addr(const void *x)
 {
 	/*
@@ -661,10 +671,7 @@ struct vmap_block {
 	DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
 	DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
 	union {
-		struct {
-			struct list_head free_list;
-			struct list_head dirty_list;
-		};
+		struct list_head free_list;
 		struct rcu_head rcu_head;
 	};
 };
@@ -731,7 +738,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
 	bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
 	bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
 	INIT_LIST_HEAD(&vb->free_list);
-	INIT_LIST_HEAD(&vb->dirty_list);

 	vb_idx = addr_to_vb_idx(va->va_start);
 	spin_lock(&vmap_block_tree_lock);
@@ -762,12 +768,7 @@ static void free_vmap_block(struct vmap_block *vb)
 	struct vmap_block *tmp;
 	unsigned long vb_idx;

-	spin_lock(&vb->vbq->lock);
-	if (!list_empty(&vb->free_list))
-		list_del(&vb->free_list);
-	if (!list_empty(&vb->dirty_list))
-		list_del(&vb->dirty_list);
-	spin_unlock(&vb->vbq->lock);
+	BUG_ON(!list_empty(&vb->free_list));

 	vb_idx = addr_to_vb_idx(vb->va->va_start);
 	spin_lock(&vmap_block_tree_lock);
@@ -852,11 +853,7 @@ static void vb_free(const void *addr, unsigned long size)

 	spin_lock(&vb->lock);
 	bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
-	if (!vb->dirty) {
-		spin_lock(&vb->vbq->lock);
-		list_add(&vb->dirty_list, &vb->vbq->dirty);
-		spin_unlock(&vb->vbq->lock);
-	}
+
 	vb->dirty += 1UL << order;
 	if (vb->dirty == VMAP_BBMAP_BITS) {
 		BUG_ON(vb->free || !list_empty(&vb->free_list));
@@ -990,6 +987,32 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
 }
 EXPORT_SYMBOL(vm_map_ram);

+/**
+ * vm_area_register_early - register vmap area early during boot
+ * @vm: vm_struct to register
+ * @align: requested alignment
+ *
+ * This function is used to register kernel vm area before
+ * vmalloc_init() is called. @vm->size and @vm->flags should contain
+ * proper values on entry and other fields should be zero. On return,
+ * vm->addr contains the allocated address.
+ *
+ * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
+ */
+void __init vm_area_register_early(struct vm_struct *vm, size_t align)
+{
+	static size_t vm_init_off __initdata;
+	unsigned long addr;
+
+	addr = ALIGN(VMALLOC_START + vm_init_off, align);
+	vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
+
+	vm->addr = (void *)addr;
+
+	vm->next = vmlist;
+	vmlist = vm;
+}
+
 void __init vmalloc_init(void)
 {
 	struct vmap_area *va;
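The kernel-doc above implies a calling pattern along these lines; the snippet is a hypothetical boot-time user, not code from this patch:

/* Hypothetical early-boot caller of vm_area_register_early(). */
static struct vm_struct example_early_vm;	/* must outlive boot, hence static */

void __init example_reserve_early_area(size_t size)
{
	example_early_vm.flags = VM_ALLOC;	/* other fields stay zeroed */
	example_early_vm.size  = size;

	/* Runs before vmalloc_init(); the chosen address comes back in ->addr. */
	vm_area_register_early(&example_early_vm, PAGE_SIZE);
}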
@@ -1017,6 +1040,58 @@ void __init vmalloc_init(void)
 	vmap_initialized = true;
 }

+/**
+ * map_kernel_range_noflush - map kernel VM area with the specified pages
+ * @addr: start of the VM area to map
+ * @size: size of the VM area to map
+ * @prot: page protection flags to use
+ * @pages: pages to map
+ *
+ * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
+ * specify should have been allocated using get_vm_area() and its
+ * friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is
+ * responsible for calling flush_cache_vmap() on to-be-mapped areas
+ * before calling this function.
+ *
+ * RETURNS:
+ * The number of pages mapped on success, -errno on failure.
+ */
+int map_kernel_range_noflush(unsigned long addr, unsigned long size,
+			     pgprot_t prot, struct page **pages)
+{
+	return vmap_page_range_noflush(addr, addr + size, prot, pages);
+}
+
+/**
+ * unmap_kernel_range_noflush - unmap kernel VM area
+ * @addr: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size
+ * specify should have been allocated using get_vm_area() and its
+ * friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is
+ * responsible for calling flush_cache_vunmap() on to-be-mapped areas
+ * before calling this function and flush_tlb_kernel_range() after.
+ */
+void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
+{
+	vunmap_page_range(addr, addr + size);
+}
+
+/**
+ * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
+ * @addr: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Similar to unmap_kernel_range_noflush() but flushes vcache before
+ * the unmapping and tlb after.
+ */
 void unmap_kernel_range(unsigned long addr, unsigned long size)
 {
 	unsigned long end = addr + size;
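A sketch of how the no-flush primitives are meant to be combined with explicit flushes, mirroring the vmap_page_range() wrapper added earlier in this patch and the existing unmap_kernel_range() path (the wrapper names below are illustrative, not from the patch):

/* Illustrative pairing of the noflush helpers with caller-side flushes. */
static int example_map_area(struct vm_struct *area, struct page **pages)
{
	unsigned long addr = (unsigned long)area->addr;
	int ret;

	ret = map_kernel_range_noflush(addr, area->size, PAGE_KERNEL, pages);
	if (ret < 0)
		return ret;
	flush_cache_vmap(addr, addr + area->size);	/* caller's responsibility */
	return 0;
}

static void example_unmap_area(struct vm_struct *area)
{
	unsigned long addr = (unsigned long)area->addr;

	flush_cache_vunmap(addr, addr + area->size);
	unmap_kernel_range_noflush(addr, area->size);
	flush_tlb_kernel_range(addr, addr + area->size);
}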
@@ -1267,6 +1342,7 @@ EXPORT_SYMBOL(vfree);
 void vunmap(const void *addr)
 {
 	BUG_ON(in_interrupt());
+	might_sleep();
 	__vunmap(addr, 0);
 }
 EXPORT_SYMBOL(vunmap);
@@ -1286,6 +1362,8 @@ void *vmap(struct page **pages, unsigned int count,
 {
 	struct vm_struct *area;

+	might_sleep();
+
 	if (count > num_physpages)
 		return NULL;

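The might_sleep() annotations added here document that vmap()/vunmap() may block, and make the sleep-inside-atomic debug checks fire right at the entry point rather than somewhere deeper. A minimal illustration of the idiom (hypothetical function, not from this patch):

/* might_sleep() complains early if this is ever called from atomic context. */
static void example_needs_process_context(void)
{
	might_sleep();
	msleep(10);		/* stand-in for anything that can schedule */
}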
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 56ddf41149eb..425244988bb2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -60,8 +60,8 @@ struct scan_control {

 	int may_writepage;

-	/* Can pages be swapped as part of reclaim? */
-	int may_swap;
+	/* Can mapped pages be reclaimed? */
+	int may_unmap;

 	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
 	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
@@ -78,6 +78,12 @@ struct scan_control {
 	/* Which cgroup do we reclaim from */
 	struct mem_cgroup *mem_cgroup;

+	/*
+	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
+	 * are scanned.
+	 */
+	nodemask_t *nodemask;
+
 	/* Pluggable isolate pages callback */
 	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
 			unsigned long *scanned, int order, int mode,
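The new nodemask field lets direct reclaim honour the node restrictions of the allocation that triggered it. A hedged sketch of the intended call path (the surrounding page-allocator code is paraphrased, not taken from this patch):

/* Sketch: the allocator slow path forwards its nodemask into reclaim. */
static unsigned long example_direct_reclaim(struct zonelist *zonelist, int order,
					    gfp_t gfp_mask, nodemask_t *nodemask)
{
	/* New 4th argument; NULL keeps the old "scan all nodes" behaviour. */
	return try_to_free_pages(zonelist, order, gfp_mask, nodemask);
}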
@@ -214,8 +220,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 		do_div(delta, lru_pages + 1);
 		shrinker->nr += delta;
 		if (shrinker->nr < 0) {
-			printk(KERN_ERR "%s: nr=%ld\n",
-					__func__, shrinker->nr);
+			printk(KERN_ERR "shrink_slab: %pF negative objects to "
+			       "delete nr=%ld\n",
+			       shrinker->shrink, shrinker->nr);
 			shrinker->nr = max_pass;
 		}

@@ -276,7 +283,7 @@ static inline int page_mapping_inuse(struct page *page)

 static inline int is_page_cache_freeable(struct page *page)
 {
-	return page_count(page) - !!PagePrivate(page) == 2;
+	return page_count(page) - !!page_has_private(page) == 2;
 }

 static int may_write_to_queue(struct backing_dev_info *bdi)
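The reworked message relies on printk's %pF conversion, which prints a function pointer as a symbol (name plus offset) instead of a raw address, so the misbehaving shrinker can be named directly. A tiny illustration with a made-up callback:

/* %pF prints something like "example_shrink+0x0/0x20" rather than a hex value. */
static int example_shrink(int nr_to_scan, gfp_t gfp_mask)
{
	return 0;
}

static void example_report(void)
{
	printk(KERN_INFO "registered shrinker callback: %pF\n", example_shrink);
}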
@@ -360,7 +367,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 	 * Some data journaling orphaned pages can have
 	 * page->mapping == NULL while being dirty with clean buffers.
 	 */
-	if (PagePrivate(page)) {
+	if (page_has_private(page)) {
 		if (try_to_free_buffers(page)) {
 			ClearPageDirty(page);
 			printk("%s: orphaned page\n", __func__);
@@ -606,7 +613,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (unlikely(!page_evictable(page, NULL)))
 			goto cull_mlocked;

-		if (!sc->may_swap && page_mapped(page))
+		if (!sc->may_unmap && page_mapped(page))
 			goto keep_locked;

 		/* Double the slab pressure for mapped and swapcache pages */
@@ -720,7 +727,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * process address space (page_count == 1) it can be freed.
 		 * Otherwise, leave the page on the LRU so it is swappable.
 		 */
-		if (PagePrivate(page)) {
+		if (page_has_private(page)) {
 			if (!try_to_release_page(page, sc->gfp_mask))
 				goto activate_locked;
 			if (!mapping && page_count(page) == 1) {
@@ -1298,17 +1305,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	}
 	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
 	pgdeactivate += pgmoved;
-	if (buffer_heads_over_limit) {
-		spin_unlock_irq(&zone->lru_lock);
-		pagevec_strip(&pvec);
-		spin_lock_irq(&zone->lru_lock);
-	}
 	__count_zone_vm_events(PGREFILL, zone, pgscanned);
 	__count_vm_events(PGDEACTIVATE, pgdeactivate);
 	spin_unlock_irq(&zone->lru_lock);
-	if (vm_swap_full())
-		pagevec_swap_free(&pvec);
-
+	if (buffer_heads_over_limit)
+		pagevec_strip(&pvec);
 	pagevec_release(&pvec);
 }

@@ -1543,7 +1544,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
 	struct zone *zone;

 	sc->all_unreclaimable = 1;
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
+					sc->nodemask) {
 		if (!populated_zone(zone))
 			continue;
 		/*
@@ -1688,17 +1690,18 @@ out:
 }

 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
-								gfp_t gfp_mask)
+				gfp_t gfp_mask, nodemask_t *nodemask)
 {
 	struct scan_control sc = {
 		.gfp_mask = gfp_mask,
 		.may_writepage = !laptop_mode,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
-		.may_swap = 1,
+		.may_unmap = 1,
 		.swappiness = vm_swappiness,
 		.order = order,
 		.mem_cgroup = NULL,
 		.isolate_pages = isolate_pages_global,
+		.nodemask = nodemask,
 	};

 	return do_try_to_free_pages(zonelist, &sc);
@@ -1713,17 +1716,18 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 {
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
-		.may_swap = 1,
+		.may_unmap = 1,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = swappiness,
 		.order = 0,
 		.mem_cgroup = mem_cont,
 		.isolate_pages = mem_cgroup_isolate_pages,
+		.nodemask = NULL, /* we don't care the placement */
 	};
 	struct zonelist *zonelist;

 	if (noswap)
-		sc.may_swap = 0;
+		sc.may_unmap = 0;

 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -1762,7 +1766,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
-		.may_swap = 1,
+		.may_unmap = 1,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.swappiness = vm_swappiness,
 		.order = order,
@@ -1965,6 +1969,8 @@ static int kswapd(void *p)
 	};
 	node_to_cpumask_ptr(cpumask, pgdat->node_id);

+	lockdep_set_current_reclaim_state(GFP_KERNEL);
+
 	if (!cpumask_empty(cpumask))
 		set_cpus_allowed_ptr(tsk, cpumask);
 	current->reclaim_state = &reclaim_state;
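lockdep_set_current_reclaim_state() tags kswapd as running in reclaim context so lockdep can check lock usage against memory-reclaim recursion. kswapd sets it once for its lifetime; a direct-reclaim path would presumably bracket the call with a matching clear, roughly as below (illustrative, not from this patch):

/* Sketch of the paired annotation a direct-reclaim path might use. */
static unsigned long example_reclaim(struct zonelist *zonelist, int order,
				     gfp_t gfp_mask, nodemask_t *nodemask)
{
	unsigned long progress;

	lockdep_set_current_reclaim_state(gfp_mask);
	progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
	lockdep_clear_current_reclaim_state();

	return progress;
}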
@@ -2048,22 +2054,19 @@ unsigned long global_lru_pages(void)
 #ifdef CONFIG_PM
 /*
  * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
- * from LRU lists system-wide, for given pass and priority, and returns the
- * number of reclaimed pages
+ * from LRU lists system-wide, for given pass and priority.
  *
  * For pass > 3 we also try to shrink the LRU lists that contain a few pages
  */
-static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
+static void shrink_all_zones(unsigned long nr_pages, int prio,
 				      int pass, struct scan_control *sc)
 {
 	struct zone *zone;
-	unsigned long ret = 0;
+	unsigned long nr_reclaimed = 0;

-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		enum lru_list l;

-		if (!populated_zone(zone))
-			continue;
 		if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
 			continue;

@@ -2082,14 +2085,16 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,

 				zone->lru[l].nr_scan = 0;
 				nr_to_scan = min(nr_pages, lru_pages);
-				ret += shrink_list(l, nr_to_scan, zone,
+				nr_reclaimed += shrink_list(l, nr_to_scan, zone,
 								sc, prio);
-				if (ret >= nr_pages)
-					return ret;
+				if (nr_reclaimed >= nr_pages) {
+					sc->nr_reclaimed = nr_reclaimed;
+					return;
+				}
 			}
 		}
 	}
-	return ret;
+	sc->nr_reclaimed = nr_reclaimed;
 }

 /*
@@ -2103,13 +2108,11 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 unsigned long shrink_all_memory(unsigned long nr_pages)
 {
 	unsigned long lru_pages, nr_slab;
-	unsigned long ret = 0;
 	int pass;
 	struct reclaim_state reclaim_state;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
-		.may_swap = 0,
-		.swap_cluster_max = nr_pages,
+		.may_unmap = 0,
 		.may_writepage = 1,
 		.isolate_pages = isolate_pages_global,
 	};
@@ -2125,8 +2128,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 		if (!reclaim_state.reclaimed_slab)
 			break;

-		ret += reclaim_state.reclaimed_slab;
-		if (ret >= nr_pages)
+		sc.nr_reclaimed += reclaim_state.reclaimed_slab;
+		if (sc.nr_reclaimed >= nr_pages)
 			goto out;

 		nr_slab -= reclaim_state.reclaimed_slab;
@@ -2145,21 +2148,22 @@ unsigned long shrink_all_memory(unsigned long nr_pages)

 		/* Force reclaiming mapped pages in the passes #3 and #4 */
 		if (pass > 2)
-			sc.may_swap = 1;
+			sc.may_unmap = 1;

 		for (prio = DEF_PRIORITY; prio >= 0; prio--) {
-			unsigned long nr_to_scan = nr_pages - ret;
+			unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed;

 			sc.nr_scanned = 0;
-			ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
-			if (ret >= nr_pages)
+			sc.swap_cluster_max = nr_to_scan;
+			shrink_all_zones(nr_to_scan, prio, pass, &sc);
+			if (sc.nr_reclaimed >= nr_pages)
 				goto out;

 			reclaim_state.reclaimed_slab = 0;
 			shrink_slab(sc.nr_scanned, sc.gfp_mask,
 					global_lru_pages());
-			ret += reclaim_state.reclaimed_slab;
-			if (ret >= nr_pages)
+			sc.nr_reclaimed += reclaim_state.reclaimed_slab;
+			if (sc.nr_reclaimed >= nr_pages)
 				goto out;

 			if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
@@ -2168,21 +2172,23 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 	}

 	/*
-	 * If ret = 0, we could not shrink LRUs, but there may be something
-	 * in slab caches
+	 * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be
+	 * something in slab caches
 	 */
-	if (!ret) {
+	if (!sc.nr_reclaimed) {
 		do {
 			reclaim_state.reclaimed_slab = 0;
 			shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
-			ret += reclaim_state.reclaimed_slab;
-		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+			sc.nr_reclaimed += reclaim_state.reclaimed_slab;
+		} while (sc.nr_reclaimed < nr_pages &&
+				reclaim_state.reclaimed_slab > 0);
 	}

+
 out:
 	current->reclaim_state = NULL;

-	return ret;
+	return sc.nr_reclaimed;
 }
 #endif

@@ -2288,11 +2294,12 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	int priority;
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
-		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
 		.swap_cluster_max = max_t(unsigned long, nr_pages,
 					SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
 		.swappiness = vm_swappiness,
+		.order = order,
 		.isolate_pages = isolate_pages_global,
 	};
 	unsigned long slab_reclaimable;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 91149746bb8d..66f6130976cb 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -27,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)

 	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

-	for_each_cpu_mask_nr(cpu, *cpumask) {
+	for_each_cpu(cpu, cpumask) {
 		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

 		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
@@ -135,11 +135,7 @@ static void refresh_zone_stat_thresholds(void)
 	int cpu;
 	int threshold;

-	for_each_zone(zone) {
-
-		if (!zone->present_pages)
-			continue;
-
+	for_each_populated_zone(zone) {
 		threshold = calculate_threshold(zone);

 		for_each_online_cpu(cpu)
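These hunks (and the matching ones in vmscan.c above) replace the open-coded "skip empty zones" pattern with for_each_populated_zone(). Judging only from the code it replaces, the macro is assumed to be equivalent to:

/* Assumed equivalent of the new iterator, inferred from the removed loops. */
#define example_for_each_populated_zone(zone)		\
	for_each_zone(zone)				\
		if (!populated_zone(zone))		\
			; /* skip zones with no pages */ \
		else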
@@ -301,12 +297,9 @@ void refresh_cpu_vm_stats(int cpu)
 	int i;
 	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };

-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		struct per_cpu_pageset *p;

-		if (!populated_zone(zone))
-			continue;
-
 		p = zone_pcp(zone, cpu);

 		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
@@ -898,7 +891,7 @@ static void vmstat_update(struct work_struct *w)
 {
 	refresh_cpu_vm_stats(smp_processor_id());
 	schedule_delayed_work(&__get_cpu_var(vmstat_work),
-		sysctl_stat_interval);
+		round_jiffies_relative(sysctl_stat_interval));
 }

 static void __cpuinit start_cpu_timer(int cpu)
@@ -906,7 +899,8 @@ static void __cpuinit start_cpu_timer(int cpu)
 	struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu);

 	INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update);
-	schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu);
+	schedule_delayed_work_on(cpu, vmstat_work,
+				 __round_jiffies_relative(HZ, cpu));
 }

 /*
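Rounding the vmstat timers with round_jiffies_relative()/__round_jiffies_relative() aligns their expiry to whole-jiffy (and per-CPU staggered) boundaries so the periodic wakeups can coalesce with other rounded timers. The same idiom in a hypothetical self-rearming work item:

/* Hypothetical periodic work rescheduling itself on rounded boundaries. */
static void example_poll(struct work_struct *w);
static DECLARE_DELAYED_WORK(example_work, example_poll);

static void example_poll(struct work_struct *w)
{
	/* ... collect stats ... */

	/* roughly one-second period, rounded so wakeups can batch together */
	schedule_delayed_work(&example_work, round_jiffies_relative(HZ));
}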