Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |   9
-rw-r--r--  mm/Kconfig.debug      |  17
-rw-r--r--  mm/Makefile           |   1
-rw-r--r--  mm/allocpercpu.c      |   4
-rw-r--r--  mm/backing-dev.c      |  26
-rw-r--r--  mm/debug-pagealloc.c  | 129
-rw-r--r--  mm/highmem.c          | 110
-rw-r--r--  mm/hugetlb.c          |   6
-rw-r--r--  mm/internal.h         |   8
-rw-r--r--  mm/memory.c           |  39
-rw-r--r--  mm/mmap.c             |   4
-rw-r--r--  mm/oom_kill.c         |  12
-rw-r--r--  mm/page-writeback.c   |  46
-rw-r--r--  mm/page_alloc.c       |  29
-rw-r--r--  mm/pdflush.c          |   2
-rw-r--r--  mm/percpu.c           | 130
-rw-r--r--  mm/readahead.c        |  25
-rw-r--r--  mm/shmem.c            |   5
-rw-r--r--  mm/slob.c             |  45
-rw-r--r--  mm/slub.c             |  66
-rw-r--r--  mm/sparse.c           |   4
-rw-r--r--  mm/swap.c             |  23
-rw-r--r--  mm/util.c             |  30
-rw-r--r--  mm/vmalloc.c          |  19
-rw-r--r--  mm/vmscan.c           | 101
-rw-r--r--  mm/vmstat.c           |  13
26 files changed, 648 insertions, 255 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a5b77811fdf2..b53427ad30a3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -206,7 +206,6 @@ config VIRT_TO_BUS
206config UNEVICTABLE_LRU 206config UNEVICTABLE_LRU
207 bool "Add LRU list to track non-evictable pages" 207 bool "Add LRU list to track non-evictable pages"
208 default y 208 default y
209 depends on MMU
210 help 209 help
211 Keeps unevictable pages off of the active and inactive pageout 210 Keeps unevictable pages off of the active and inactive pageout
212 lists, so kswapd will not waste CPU time or have its balancing 211 lists, so kswapd will not waste CPU time or have its balancing
@@ -214,5 +213,13 @@ config UNEVICTABLE_LRU
214 will use one page flag and increase the code size a little, 213 will use one page flag and increase the code size a little,
215 say Y unless you know what you are doing. 214 say Y unless you know what you are doing.
216 215
216config HAVE_MLOCK
217 bool
218 default y if MMU=y
219
220config HAVE_MLOCKED_PAGE_BIT
221 bool
222 default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y
223
217config MMU_NOTIFIER 224config MMU_NOTIFIER
218 bool 225 bool
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
new file mode 100644
index 000000000000..c8d62d49a44e
--- /dev/null
+++ b/mm/Kconfig.debug
@@ -0,0 +1,17 @@
1config WANT_PAGE_DEBUG_FLAGS
2 bool
3
4config PAGE_POISONING
5 bool "Debug page memory allocations"
6 depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC
7 depends on !HIBERNATION
8 select DEBUG_PAGEALLOC
9 select WANT_PAGE_DEBUG_FLAGS
10 help
11 Fill the pages with poison patterns after free_pages() and verify
12 the patterns before alloc_pages(). This results in a large slowdown,
13 but helps to find certain types of memory corruptions.
14
15 This option cannot be enabled together with hibernation. Otherwise, it
16 would report spurious memory corruption because the free pages are not
17 saved to the suspend image.
diff --git a/mm/Makefile b/mm/Makefile
index 818569b68f46..ec73c68b6015 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
24obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o 24obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
25obj-$(CONFIG_SLOB) += slob.o 25obj-$(CONFIG_SLOB) += slob.o
26obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 26obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
27obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
27obj-$(CONFIG_SLAB) += slab.o 28obj-$(CONFIG_SLAB) += slab.o
28obj-$(CONFIG_SLUB) += slub.o 29obj-$(CONFIG_SLUB) += slub.o
29obj-$(CONFIG_FAILSLAB) += failslab.o 30obj-$(CONFIG_FAILSLAB) += failslab.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 3653c570232b..139d5b7b6621 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -120,7 +120,7 @@ void *__alloc_percpu(size_t size, size_t align)
120 * on it. Larger alignment should only be used for module 120 * on it. Larger alignment should only be used for module
121 * percpu sections on SMP for which this path isn't used. 121 * percpu sections on SMP for which this path isn't used.
122 */ 122 */
123 WARN_ON_ONCE(align > __alignof__(unsigned long long)); 123 WARN_ON_ONCE(align > SMP_CACHE_BYTES);
124 124
125 if (unlikely(!pdata)) 125 if (unlikely(!pdata))
126 return NULL; 126 return NULL;
@@ -143,7 +143,7 @@ void free_percpu(void *__pdata)
143{ 143{
144 if (unlikely(!__pdata)) 144 if (unlikely(!__pdata))
145 return; 145 return;
146 __percpu_depopulate_mask(__pdata, &cpu_possible_map); 146 __percpu_depopulate_mask(__pdata, cpu_possible_mask);
147 kfree(__percpu_disguise(__pdata)); 147 kfree(__percpu_disguise(__pdata));
148} 148}
149EXPORT_SYMBOL_GPL(free_percpu); 149EXPORT_SYMBOL_GPL(free_percpu);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8e8587444132..be68c956a660 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -2,11 +2,24 @@
2#include <linux/wait.h> 2#include <linux/wait.h>
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
5#include <linux/pagemap.h>
5#include <linux/sched.h> 6#include <linux/sched.h>
6#include <linux/module.h> 7#include <linux/module.h>
7#include <linux/writeback.h> 8#include <linux/writeback.h>
8#include <linux/device.h> 9#include <linux/device.h>
9 10
11void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
12{
13}
14EXPORT_SYMBOL(default_unplug_io_fn);
15
16struct backing_dev_info default_backing_dev_info = {
17 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
18 .state = 0,
19 .capabilities = BDI_CAP_MAP_COPY,
20 .unplug_io_fn = default_unplug_io_fn,
21};
22EXPORT_SYMBOL_GPL(default_backing_dev_info);
10 23
11static struct class *bdi_class; 24static struct class *bdi_class;
12 25
@@ -166,9 +179,20 @@ static __init int bdi_class_init(void)
166 bdi_debug_init(); 179 bdi_debug_init();
167 return 0; 180 return 0;
168} 181}
169
170postcore_initcall(bdi_class_init); 182postcore_initcall(bdi_class_init);
171 183
184static int __init default_bdi_init(void)
185{
186 int err;
187
188 err = bdi_init(&default_backing_dev_info);
189 if (!err)
190 bdi_register(&default_backing_dev_info, NULL, "default");
191
192 return err;
193}
194subsys_initcall(default_bdi_init);
195
172int bdi_register(struct backing_dev_info *bdi, struct device *parent, 196int bdi_register(struct backing_dev_info *bdi, struct device *parent,
173 const char *fmt, ...) 197 const char *fmt, ...)
174{ 198{
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
new file mode 100644
index 000000000000..a1e3324de2b5
--- /dev/null
+++ b/mm/debug-pagealloc.c
@@ -0,0 +1,129 @@
1#include <linux/kernel.h>
2#include <linux/mm.h>
3#include <linux/page-debug-flags.h>
4#include <linux/poison.h>
5
6static inline void set_page_poison(struct page *page)
7{
8 __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
9}
10
11static inline void clear_page_poison(struct page *page)
12{
13 __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
14}
15
16static inline bool page_poison(struct page *page)
17{
18 return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
19}
20
21static void poison_highpage(struct page *page)
22{
23 /*
24 * Page poisoning for highmem pages is not implemented.
25 *
26 * This can be called from interrupt contexts.
27 * So we need to create a new kmap_atomic slot for this
28 * application and it will need interrupt protection.
29 */
30}
31
32static void poison_page(struct page *page)
33{
34 void *addr;
35
36 if (PageHighMem(page)) {
37 poison_highpage(page);
38 return;
39 }
40 set_page_poison(page);
41 addr = page_address(page);
42 memset(addr, PAGE_POISON, PAGE_SIZE);
43}
44
45static void poison_pages(struct page *page, int n)
46{
47 int i;
48
49 for (i = 0; i < n; i++)
50 poison_page(page + i);
51}
52
53static bool single_bit_flip(unsigned char a, unsigned char b)
54{
55 unsigned char error = a ^ b;
56
57 return error && !(error & (error - 1));
58}
59
60static void check_poison_mem(unsigned char *mem, size_t bytes)
61{
62 unsigned char *start;
63 unsigned char *end;
64
65 for (start = mem; start < mem + bytes; start++) {
66 if (*start != PAGE_POISON)
67 break;
68 }
69 if (start == mem + bytes)
70 return;
71
72 for (end = mem + bytes - 1; end > start; end--) {
73 if (*end != PAGE_POISON)
74 break;
75 }
76
77 if (!printk_ratelimit())
78 return;
79 else if (start == end && single_bit_flip(*start, PAGE_POISON))
80 printk(KERN_ERR "pagealloc: single bit error\n");
81 else
82 printk(KERN_ERR "pagealloc: memory corruption\n");
83
84 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
85 end - start + 1, 1);
86 dump_stack();
87}
88
89static void unpoison_highpage(struct page *page)
90{
91 /*
92 * See comment in poison_highpage().
93 * Highmem pages should not be poisoned for now
94 */
95 BUG_ON(page_poison(page));
96}
97
98static void unpoison_page(struct page *page)
99{
100 if (PageHighMem(page)) {
101 unpoison_highpage(page);
102 return;
103 }
104 if (page_poison(page)) {
105 void *addr = page_address(page);
106
107 check_poison_mem(addr, PAGE_SIZE);
108 clear_page_poison(page);
109 }
110}
111
112static void unpoison_pages(struct page *page, int n)
113{
114 int i;
115
116 for (i = 0; i < n; i++)
117 unpoison_page(page + i);
118}
119
120void kernel_map_pages(struct page *page, int numpages, int enable)
121{
122 if (!debug_pagealloc_enabled)
123 return;
124
125 if (enable)
126 unpoison_pages(page, numpages);
127 else
128 poison_pages(page, numpages);
129}
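
For orientation only: kernel_map_pages() above is the hook that the page
allocator already drives when CONFIG_DEBUG_PAGEALLOC is enabled, so the
poisoning variant slots in without new call sites. A rough sketch of the
expected call pattern (the real callers live in mm/page_alloc.c and are
not part of this file):

/*
 * Sketch of the allocator side (illustrative, not part of this patch):
 * pages are poisoned when they go back to the buddy allocator and the
 * pattern is verified when they are handed out again.
 */
static void debug_pagealloc_sketch(struct page *page, unsigned int order)
{
	/* free path: fill the pages with PAGE_POISON */
	kernel_map_pages(page, 1 << order, 0);

	/* alloc path: check the poison pattern before reuse */
	kernel_map_pages(page, 1 << order, 1);
}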
diff --git a/mm/highmem.c b/mm/highmem.c
index b36b83b920ff..68eb1d9b63fa 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -67,6 +67,25 @@ pte_t * pkmap_page_table;
67 67
68static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); 68static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
69 69
70/*
71 * Most architectures have no use for kmap_high_get(), so let's abstract
72 * the disabling of IRQ out of the locking in that case to save on a
73 * potential useless overhead.
74 */
75#ifdef ARCH_NEEDS_KMAP_HIGH_GET
76#define lock_kmap() spin_lock_irq(&kmap_lock)
77#define unlock_kmap() spin_unlock_irq(&kmap_lock)
78#define lock_kmap_any(flags) spin_lock_irqsave(&kmap_lock, flags)
79#define unlock_kmap_any(flags) spin_unlock_irqrestore(&kmap_lock, flags)
80#else
81#define lock_kmap() spin_lock(&kmap_lock)
82#define unlock_kmap() spin_unlock(&kmap_lock)
83#define lock_kmap_any(flags) \
84 do { spin_lock(&kmap_lock); (void)(flags); } while (0)
85#define unlock_kmap_any(flags) \
86 do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
87#endif
88
70static void flush_all_zero_pkmaps(void) 89static void flush_all_zero_pkmaps(void)
71{ 90{
72 int i; 91 int i;
@@ -113,9 +132,9 @@ static void flush_all_zero_pkmaps(void)
113 */ 132 */
114void kmap_flush_unused(void) 133void kmap_flush_unused(void)
115{ 134{
116 spin_lock(&kmap_lock); 135 lock_kmap();
117 flush_all_zero_pkmaps(); 136 flush_all_zero_pkmaps();
118 spin_unlock(&kmap_lock); 137 unlock_kmap();
119} 138}
120 139
121static inline unsigned long map_new_virtual(struct page *page) 140static inline unsigned long map_new_virtual(struct page *page)
@@ -145,10 +164,10 @@ start:
145 164
146 __set_current_state(TASK_UNINTERRUPTIBLE); 165 __set_current_state(TASK_UNINTERRUPTIBLE);
147 add_wait_queue(&pkmap_map_wait, &wait); 166 add_wait_queue(&pkmap_map_wait, &wait);
148 spin_unlock(&kmap_lock); 167 unlock_kmap();
149 schedule(); 168 schedule();
150 remove_wait_queue(&pkmap_map_wait, &wait); 169 remove_wait_queue(&pkmap_map_wait, &wait);
151 spin_lock(&kmap_lock); 170 lock_kmap();
152 171
153 /* Somebody else might have mapped it while we slept */ 172 /* Somebody else might have mapped it while we slept */
154 if (page_address(page)) 173 if (page_address(page))
@@ -184,29 +203,59 @@ void *kmap_high(struct page *page)
184 * For highmem pages, we can't trust "virtual" until 203 * For highmem pages, we can't trust "virtual" until
185 * after we have the lock. 204 * after we have the lock.
186 */ 205 */
187 spin_lock(&kmap_lock); 206 lock_kmap();
188 vaddr = (unsigned long)page_address(page); 207 vaddr = (unsigned long)page_address(page);
189 if (!vaddr) 208 if (!vaddr)
190 vaddr = map_new_virtual(page); 209 vaddr = map_new_virtual(page);
191 pkmap_count[PKMAP_NR(vaddr)]++; 210 pkmap_count[PKMAP_NR(vaddr)]++;
192 BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2); 211 BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
193 spin_unlock(&kmap_lock); 212 unlock_kmap();
194 return (void*) vaddr; 213 return (void*) vaddr;
195} 214}
196 215
197EXPORT_SYMBOL(kmap_high); 216EXPORT_SYMBOL(kmap_high);
198 217
218#ifdef ARCH_NEEDS_KMAP_HIGH_GET
219/**
220 * kmap_high_get - pin a highmem page into memory
221 * @page: &struct page to pin
222 *
223 * Returns the page's current virtual memory address, or NULL if no mapping
224 * exists. When and only when a non null address is returned then a
225 * matching call to kunmap_high() is necessary.
226 *
227 * This can be called from any context.
228 */
229void *kmap_high_get(struct page *page)
230{
231 unsigned long vaddr, flags;
232
233 lock_kmap_any(flags);
234 vaddr = (unsigned long)page_address(page);
235 if (vaddr) {
236 BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 1);
237 pkmap_count[PKMAP_NR(vaddr)]++;
238 }
239 unlock_kmap_any(flags);
240 return (void*) vaddr;
241}
242#endif
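
The kerneldoc above spells out the pairing rule for the new kmap_high_get():
only a non-NULL return obliges the caller to issue kunmap_high(). A minimal,
hypothetical caller (the function name and its use are invented purely for
illustration):

/*
 * Hypothetical user of kmap_high_get(): pin an existing highmem mapping
 * (callable from any context) and release it only if a mapping existed.
 */
static void touch_if_mapped(struct page *page)
{
	void *vaddr = kmap_high_get(page);

	if (!vaddr)
		return;			/* not mapped: no kunmap_high() owed */

	memset(vaddr, 0, PAGE_SIZE);	/* use the pinned virtual address */
	kunmap_high(page);
}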
243
199/** 244/**
200 * kunmap_high - unmap a highmem page 245 * kunmap_high - unmap a highmem page
201 * @page: &struct page to unmap 246 * @page: &struct page to unmap
247 *
248 * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called
249 * only from user context.
202 */ 250 */
203void kunmap_high(struct page *page) 251void kunmap_high(struct page *page)
204{ 252{
205 unsigned long vaddr; 253 unsigned long vaddr;
206 unsigned long nr; 254 unsigned long nr;
255 unsigned long flags;
207 int need_wakeup; 256 int need_wakeup;
208 257
209 spin_lock(&kmap_lock); 258 lock_kmap_any(flags);
210 vaddr = (unsigned long)page_address(page); 259 vaddr = (unsigned long)page_address(page);
211 BUG_ON(!vaddr); 260 BUG_ON(!vaddr);
212 nr = PKMAP_NR(vaddr); 261 nr = PKMAP_NR(vaddr);
@@ -232,7 +281,7 @@ void kunmap_high(struct page *page)
232 */ 281 */
233 need_wakeup = waitqueue_active(&pkmap_map_wait); 282 need_wakeup = waitqueue_active(&pkmap_map_wait);
234 } 283 }
235 spin_unlock(&kmap_lock); 284 unlock_kmap_any(flags);
236 285
237 /* do wake-up, if needed, race-free outside of the spin lock */ 286 /* do wake-up, if needed, race-free outside of the spin lock */
238 if (need_wakeup) 287 if (need_wakeup)
@@ -373,3 +422,48 @@ void __init page_address_init(void)
373} 422}
374 423
375#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ 424#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
425
426#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT)
427
428void debug_kmap_atomic(enum km_type type)
429{
430 static unsigned warn_count = 10;
431
432 if (unlikely(warn_count == 0))
433 return;
434
435 if (unlikely(in_interrupt())) {
436 if (in_irq()) {
437 if (type != KM_IRQ0 && type != KM_IRQ1 &&
438 type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
439 type != KM_BOUNCE_READ) {
440 WARN_ON(1);
441 warn_count--;
442 }
443 } else if (!irqs_disabled()) { /* softirq */
444 if (type != KM_IRQ0 && type != KM_IRQ1 &&
445 type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
446 type != KM_SKB_SUNRPC_DATA &&
447 type != KM_SKB_DATA_SOFTIRQ &&
448 type != KM_BOUNCE_READ) {
449 WARN_ON(1);
450 warn_count--;
451 }
452 }
453 }
454
455 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
456 type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
457 if (!irqs_disabled()) {
458 WARN_ON(1);
459 warn_count--;
460 }
461 } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
462 if (irq_count() == 0 && !irqs_disabled()) {
463 WARN_ON(1);
464 warn_count--;
465 }
466 }
467}
468
469#endif
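
debug_kmap_atomic() above cross-checks the kmap slot type against the calling
context. As a hedged illustration (assuming the architecture's kmap_atomic()
calls this helper, and using an invented handler), the kind of bug it flags
looks like this:

/*
 * Illustrative misuse: KM_USER0 is not in the set of IRQ-safe slot types
 * checked above, so grabbing it from a hard interrupt handler would hit
 * the WARN_ON() in debug_kmap_atomic().
 */
static irqreturn_t bad_irq_handler(int irq, void *dev_id)
{
	struct page *page = dev_id;			/* invented example */
	void *vaddr = kmap_atomic(page, KM_USER0);	/* wrong slot in IRQ */

	/* ... touch *vaddr ... */
	kunmap_atomic(vaddr, KM_USER0);
	return IRQ_HANDLED;
}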
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 107da3d809a8..28c655ba9353 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -918,7 +918,7 @@ static void return_unused_surplus_pages(struct hstate *h,
918 * an instantiated the change should be committed via vma_commit_reservation. 918 * an instantiated the change should be committed via vma_commit_reservation.
919 * No action is required on failure. 919 * No action is required on failure.
920 */ 920 */
921static int vma_needs_reservation(struct hstate *h, 921static long vma_needs_reservation(struct hstate *h,
922 struct vm_area_struct *vma, unsigned long addr) 922 struct vm_area_struct *vma, unsigned long addr)
923{ 923{
924 struct address_space *mapping = vma->vm_file->f_mapping; 924 struct address_space *mapping = vma->vm_file->f_mapping;
@@ -933,7 +933,7 @@ static int vma_needs_reservation(struct hstate *h,
933 return 1; 933 return 1;
934 934
935 } else { 935 } else {
936 int err; 936 long err;
937 pgoff_t idx = vma_hugecache_offset(h, vma, addr); 937 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
938 struct resv_map *reservations = vma_resv_map(vma); 938 struct resv_map *reservations = vma_resv_map(vma);
939 939
@@ -969,7 +969,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
969 struct page *page; 969 struct page *page;
970 struct address_space *mapping = vma->vm_file->f_mapping; 970 struct address_space *mapping = vma->vm_file->f_mapping;
971 struct inode *inode = mapping->host; 971 struct inode *inode = mapping->host;
972 unsigned int chg; 972 long chg;
973 973
974 /* 974 /*
975 * Processes that did not create the mapping will have no reserves and 975 * Processes that did not create the mapping will have no reserves and
diff --git a/mm/internal.h b/mm/internal.h
index 478223b73a2a..987bb03fbdd8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -63,6 +63,7 @@ static inline unsigned long page_order(struct page *page)
63 return page_private(page); 63 return page_private(page);
64} 64}
65 65
66#ifdef CONFIG_HAVE_MLOCK
66extern long mlock_vma_pages_range(struct vm_area_struct *vma, 67extern long mlock_vma_pages_range(struct vm_area_struct *vma,
67 unsigned long start, unsigned long end); 68 unsigned long start, unsigned long end);
68extern void munlock_vma_pages_range(struct vm_area_struct *vma, 69extern void munlock_vma_pages_range(struct vm_area_struct *vma,
@@ -71,6 +72,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
71{ 72{
72 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); 73 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
73} 74}
75#endif
74 76
75#ifdef CONFIG_UNEVICTABLE_LRU 77#ifdef CONFIG_UNEVICTABLE_LRU
76/* 78/*
@@ -90,7 +92,7 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
90} 92}
91#endif 93#endif
92 94
93#ifdef CONFIG_UNEVICTABLE_LRU 95#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
94/* 96/*
95 * Called only in fault path via page_evictable() for a new page 97 * Called only in fault path via page_evictable() for a new page
96 * to determine if it's being mapped into a LOCKED vma. 98 * to determine if it's being mapped into a LOCKED vma.
@@ -165,7 +167,7 @@ static inline void free_page_mlock(struct page *page)
165 } 167 }
166} 168}
167 169
168#else /* CONFIG_UNEVICTABLE_LRU */ 170#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
169static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 171static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
170{ 172{
171 return 0; 173 return 0;
@@ -175,7 +177,7 @@ static inline void mlock_vma_page(struct page *page) { }
175static inline void mlock_migrate_page(struct page *new, struct page *old) { } 177static inline void mlock_migrate_page(struct page *new, struct page *old) { }
176static inline void free_page_mlock(struct page *page) { } 178static inline void free_page_mlock(struct page *page) { }
177 179
178#endif /* CONFIG_UNEVICTABLE_LRU */ 180#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
179 181
180/* 182/*
181 * Return the mem_map entry representing the 'offset' subpage within 183 * Return the mem_map entry representing the 'offset' subpage within
diff --git a/mm/memory.c b/mm/memory.c
index baa999e87cd2..cf6873e91c6a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1151,6 +1151,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1151 if ((flags & FOLL_WRITE) && 1151 if ((flags & FOLL_WRITE) &&
1152 !pte_dirty(pte) && !PageDirty(page)) 1152 !pte_dirty(pte) && !PageDirty(page))
1153 set_page_dirty(page); 1153 set_page_dirty(page);
1154 /*
1155 * pte_mkyoung() would be more correct here, but atomic care
1156 * is needed to avoid losing the dirty bit: it is easier to use
1157 * mark_page_accessed().
1158 */
1154 mark_page_accessed(page); 1159 mark_page_accessed(page);
1155 } 1160 }
1156unlock: 1161unlock:
@@ -1665,9 +1670,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1665 * behaviour that some programs depend on. We mark the "original" 1670 * behaviour that some programs depend on. We mark the "original"
1666 * un-COW'ed pages by matching them up with "vma->vm_pgoff". 1671 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
1667 */ 1672 */
1668 if (addr == vma->vm_start && end == vma->vm_end) 1673 if (addr == vma->vm_start && end == vma->vm_end) {
1669 vma->vm_pgoff = pfn; 1674 vma->vm_pgoff = pfn;
1670 else if (is_cow_mapping(vma->vm_flags)) 1675 vma->vm_flags |= VM_PFN_AT_MMAP;
1676 } else if (is_cow_mapping(vma->vm_flags))
1671 return -EINVAL; 1677 return -EINVAL;
1672 1678
1673 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; 1679 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
@@ -1679,6 +1685,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1679 * needed from higher level routine calling unmap_vmas 1685 * needed from higher level routine calling unmap_vmas
1680 */ 1686 */
1681 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); 1687 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
1688 vma->vm_flags &= ~VM_PFN_AT_MMAP;
1682 return -EINVAL; 1689 return -EINVAL;
1683 } 1690 }
1684 1691
@@ -1938,6 +1945,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1938 * get_user_pages(.write=1, .force=1). 1945 * get_user_pages(.write=1, .force=1).
1939 */ 1946 */
1940 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 1947 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
1948 struct vm_fault vmf;
1949 int tmp;
1950
1951 vmf.virtual_address = (void __user *)(address &
1952 PAGE_MASK);
1953 vmf.pgoff = old_page->index;
1954 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
1955 vmf.page = old_page;
1956
1941 /* 1957 /*
1942 * Notify the address space that the page is about to 1958 * Notify the address space that the page is about to
1943 * become writable so that it can prohibit this or wait 1959 * become writable so that it can prohibit this or wait
@@ -1949,8 +1965,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1949 page_cache_get(old_page); 1965 page_cache_get(old_page);
1950 pte_unmap_unlock(page_table, ptl); 1966 pte_unmap_unlock(page_table, ptl);
1951 1967
1952 if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) 1968 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
1969 if (unlikely(tmp &
1970 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
1971 ret = tmp;
1953 goto unwritable_page; 1972 goto unwritable_page;
1973 }
1954 1974
1955 /* 1975 /*
1956 * Since we dropped the lock we need to revalidate 1976 * Since we dropped the lock we need to revalidate
@@ -2099,7 +2119,7 @@ oom:
2099 2119
2100unwritable_page: 2120unwritable_page:
2101 page_cache_release(old_page); 2121 page_cache_release(old_page);
2102 return VM_FAULT_SIGBUS; 2122 return ret;
2103} 2123}
2104 2124
2105/* 2125/*
@@ -2433,8 +2453,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2433 count_vm_event(PGMAJFAULT); 2453 count_vm_event(PGMAJFAULT);
2434 } 2454 }
2435 2455
2436 mark_page_accessed(page);
2437
2438 lock_page(page); 2456 lock_page(page);
2439 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2457 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2440 2458
@@ -2643,9 +2661,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2643 * to become writable 2661 * to become writable
2644 */ 2662 */
2645 if (vma->vm_ops->page_mkwrite) { 2663 if (vma->vm_ops->page_mkwrite) {
2664 int tmp;
2665
2646 unlock_page(page); 2666 unlock_page(page);
2647 if (vma->vm_ops->page_mkwrite(vma, page) < 0) { 2667 vmf.flags |= FAULT_FLAG_MKWRITE;
2648 ret = VM_FAULT_SIGBUS; 2668 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
2669 if (unlikely(tmp &
2670 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2671 ret = tmp;
2649 anon = 1; /* no anon but release vmf.page */ 2672 anon = 1; /* no anon but release vmf.page */
2650 goto out_unlocked; 2673 goto out_unlocked;
2651 } 2674 }
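
Both hunks above switch ->page_mkwrite() from taking a bare struct page * to
taking a struct vm_fault *, and propagate its VM_FAULT_* return code instead
of collapsing every failure to VM_FAULT_SIGBUS. A hedged sketch of what an
implementation looks like under the new convention (the function and its
truncate check are illustrative, not taken from any particular filesystem):

/*
 * Sketch of a ->page_mkwrite() callback after this change: the target
 * page now arrives in vmf->page and VM_FAULT_* codes are passed straight
 * back to the fault path.
 */
static int example_page_mkwrite(struct vm_area_struct *vma,
				struct vm_fault *vmf)
{
	struct page *page = vmf->page;

	lock_page(page);
	if (page->mapping != vma->vm_file->f_mapping) {
		unlock_page(page);
		return VM_FAULT_NOPAGE;	/* lost a race with truncate */
	}
	/* ... make the backing storage writable for this page here ... */
	unlock_page(page);
	return 0;			/* 0 == writable, proceed */
}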
diff --git a/mm/mmap.c b/mm/mmap.c
index 00ced3ee49a8..1abb9185a686 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -20,6 +20,7 @@
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/personality.h> 21#include <linux/personality.h>
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/ima.h>
23#include <linux/hugetlb.h> 24#include <linux/hugetlb.h>
24#include <linux/profile.h> 25#include <linux/profile.h>
25#include <linux/module.h> 26#include <linux/module.h>
@@ -1049,6 +1050,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1049 error = security_file_mmap(file, reqprot, prot, flags, addr, 0); 1050 error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
1050 if (error) 1051 if (error)
1051 return error; 1052 return error;
1053 error = ima_file_mmap(file, prot);
1054 if (error)
1055 return error;
1052 1056
1053 return mmap_region(file, addr, len, flags, vm_flags, pgoff); 1057 return mmap_region(file, addr, len, flags, vm_flags, pgoff);
1054} 1058}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 40ba05061a4f..d3b9bac085b5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -55,7 +55,7 @@ static DEFINE_SPINLOCK(zone_scan_lock);
55 55
56unsigned long badness(struct task_struct *p, unsigned long uptime) 56unsigned long badness(struct task_struct *p, unsigned long uptime)
57{ 57{
58 unsigned long points, cpu_time, run_time, s; 58 unsigned long points, cpu_time, run_time;
59 struct mm_struct *mm; 59 struct mm_struct *mm;
60 struct task_struct *child; 60 struct task_struct *child;
61 61
@@ -110,12 +110,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
110 else 110 else
111 run_time = 0; 111 run_time = 0;
112 112
113 s = int_sqrt(cpu_time); 113 if (cpu_time)
114 if (s) 114 points /= int_sqrt(cpu_time);
115 points /= s; 115 if (run_time)
116 s = int_sqrt(int_sqrt(run_time)); 116 points /= int_sqrt(int_sqrt(run_time));
117 if (s)
118 points /= s;
119 117
120 /* 118 /*
121 * Niced processes are most likely less important, so double 119 * Niced processes are most likely less important, so double
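
The rewritten scaling above divides the score by the square root of the
consumed CPU time and by the fourth root of the run time, skipping the
division only when the value is zero. Worked numbers (purely illustrative,
in whatever internal units badness() accumulates):

/*
 *   cpu_time = 10000  ->  points /= int_sqrt(10000)            == 100
 *   run_time = 65536  ->  points /= int_sqrt(int_sqrt(65536))  ==  16
 *
 * so long-running, CPU-heavy tasks score lower than freshly started ones
 * with the same memory footprint, which is the intent of the heuristic.
 */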
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 74dc57c74349..30351f0063ac 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -66,7 +66,7 @@ static inline long sync_writeback_pages(void)
66/* 66/*
67 * Start background writeback (via pdflush) at this percentage 67 * Start background writeback (via pdflush) at this percentage
68 */ 68 */
69int dirty_background_ratio = 5; 69int dirty_background_ratio = 10;
70 70
71/* 71/*
72 * dirty_background_bytes starts at 0 (disabled) so that it is a function of 72 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
@@ -83,7 +83,7 @@ int vm_highmem_is_dirtyable;
83/* 83/*
84 * The generator of dirty data starts writeback at this percentage 84 * The generator of dirty data starts writeback at this percentage
85 */ 85 */
86int vm_dirty_ratio = 10; 86int vm_dirty_ratio = 20;
87 87
88/* 88/*
89 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of 89 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
@@ -92,14 +92,14 @@ int vm_dirty_ratio = 10;
92unsigned long vm_dirty_bytes; 92unsigned long vm_dirty_bytes;
93 93
94/* 94/*
95 * The interval between `kupdate'-style writebacks, in jiffies 95 * The interval between `kupdate'-style writebacks
96 */ 96 */
97int dirty_writeback_interval = 5 * HZ; 97unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
98 98
99/* 99/*
100 * The longest number of jiffies for which data is allowed to remain dirty 100 * The longest time for which data is allowed to remain dirty
101 */ 101 */
102int dirty_expire_interval = 30 * HZ; 102unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
103 103
104/* 104/*
105 * Flag that makes the machine dump writes/reads and block dirtyings. 105 * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -770,9 +770,9 @@ static void wb_kupdate(unsigned long arg)
770 770
771 sync_supers(); 771 sync_supers();
772 772
773 oldest_jif = jiffies - dirty_expire_interval; 773 oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval);
774 start_jif = jiffies; 774 start_jif = jiffies;
775 next_jif = start_jif + dirty_writeback_interval; 775 next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
776 nr_to_write = global_page_state(NR_FILE_DIRTY) + 776 nr_to_write = global_page_state(NR_FILE_DIRTY) +
777 global_page_state(NR_UNSTABLE_NFS) + 777 global_page_state(NR_UNSTABLE_NFS) +
778 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 778 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
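
With the intervals now stored in centiseconds, the conversion back to jiffies
happens at the point of use, as in the hunk above. A quick check that the
default behaviour is unchanged:

/*
 *   dirty_writeback_interval = 5 * 100 centiseconds
 *   msecs_to_jiffies(dirty_writeback_interval * 10)
 *       = msecs_to_jiffies(5000)
 *       = 5 * HZ jiffies
 *
 * i.e. the same 5 second kupdate period as the previous "5 * HZ" default,
 * but the sysctl value no longer depends on HZ.
 */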
@@ -801,9 +801,10 @@ static void wb_kupdate(unsigned long arg)
801int dirty_writeback_centisecs_handler(ctl_table *table, int write, 801int dirty_writeback_centisecs_handler(ctl_table *table, int write,
802 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 802 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
803{ 803{
804 proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); 804 proc_dointvec(table, write, file, buffer, length, ppos);
805 if (dirty_writeback_interval) 805 if (dirty_writeback_interval)
806 mod_timer(&wb_timer, jiffies + dirty_writeback_interval); 806 mod_timer(&wb_timer, jiffies +
807 msecs_to_jiffies(dirty_writeback_interval * 10));
807 else 808 else
808 del_timer(&wb_timer); 809 del_timer(&wb_timer);
809 return 0; 810 return 0;
@@ -905,7 +906,8 @@ void __init page_writeback_init(void)
905{ 906{
906 int shift; 907 int shift;
907 908
908 mod_timer(&wb_timer, jiffies + dirty_writeback_interval); 909 mod_timer(&wb_timer,
910 jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
909 writeback_set_ratelimit(); 911 writeback_set_ratelimit();
910 register_cpu_notifier(&ratelimit_nb); 912 register_cpu_notifier(&ratelimit_nb);
911 913
@@ -1198,6 +1200,20 @@ int __set_page_dirty_no_writeback(struct page *page)
1198} 1200}
1199 1201
1200/* 1202/*
1203 * Helper function for set_page_dirty family.
1204 * NOTE: This relies on being atomic wrt interrupts.
1205 */
1206void account_page_dirtied(struct page *page, struct address_space *mapping)
1207{
1208 if (mapping_cap_account_dirty(mapping)) {
1209 __inc_zone_page_state(page, NR_FILE_DIRTY);
1210 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1211 task_dirty_inc(current);
1212 task_io_account_write(PAGE_CACHE_SIZE);
1213 }
1214}
1215
1216/*
1201 * For address_spaces which do not use buffers. Just tag the page as dirty in 1217 * For address_spaces which do not use buffers. Just tag the page as dirty in
1202 * its radix tree. 1218 * its radix tree.
1203 * 1219 *
@@ -1226,13 +1242,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1226 if (mapping2) { /* Race with truncate? */ 1242 if (mapping2) { /* Race with truncate? */
1227 BUG_ON(mapping2 != mapping); 1243 BUG_ON(mapping2 != mapping);
1228 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); 1244 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
1229 if (mapping_cap_account_dirty(mapping)) { 1245 account_page_dirtied(page, mapping);
1230 __inc_zone_page_state(page, NR_FILE_DIRTY);
1231 __inc_bdi_stat(mapping->backing_dev_info,
1232 BDI_RECLAIMABLE);
1233 task_dirty_inc(current);
1234 task_io_account_write(PAGE_CACHE_SIZE);
1235 }
1236 radix_tree_tag_set(&mapping->page_tree, 1246 radix_tree_tag_set(&mapping->page_tree,
1237 page_index(page), PAGECACHE_TAG_DIRTY); 1247 page_index(page), PAGECACHE_TAG_DIRTY);
1238 } 1248 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a3803ea8c27d..0284e528748d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -922,13 +922,10 @@ static void drain_pages(unsigned int cpu)
922 unsigned long flags; 922 unsigned long flags;
923 struct zone *zone; 923 struct zone *zone;
924 924
925 for_each_zone(zone) { 925 for_each_populated_zone(zone) {
926 struct per_cpu_pageset *pset; 926 struct per_cpu_pageset *pset;
927 struct per_cpu_pages *pcp; 927 struct per_cpu_pages *pcp;
928 928
929 if (!populated_zone(zone))
930 continue;
931
932 pset = zone_pcp(zone, cpu); 929 pset = zone_pcp(zone, cpu);
933 930
934 pcp = &pset->pcp; 931 pcp = &pset->pcp;
@@ -1585,7 +1582,8 @@ nofail_alloc:
1585 reclaim_state.reclaimed_slab = 0; 1582 reclaim_state.reclaimed_slab = 0;
1586 p->reclaim_state = &reclaim_state; 1583 p->reclaim_state = &reclaim_state;
1587 1584
1588 did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); 1585 did_some_progress = try_to_free_pages(zonelist, order,
1586 gfp_mask, nodemask);
1589 1587
1590 p->reclaim_state = NULL; 1588 p->reclaim_state = NULL;
1591 lockdep_clear_current_reclaim_state(); 1589 lockdep_clear_current_reclaim_state();
@@ -1879,10 +1877,7 @@ void show_free_areas(void)
1879 int cpu; 1877 int cpu;
1880 struct zone *zone; 1878 struct zone *zone;
1881 1879
1882 for_each_zone(zone) { 1880 for_each_populated_zone(zone) {
1883 if (!populated_zone(zone))
1884 continue;
1885
1886 show_node(zone); 1881 show_node(zone);
1887 printk("%s per-cpu:\n", zone->name); 1882 printk("%s per-cpu:\n", zone->name);
1888 1883
@@ -1922,12 +1917,9 @@ void show_free_areas(void)
1922 global_page_state(NR_PAGETABLE), 1917 global_page_state(NR_PAGETABLE),
1923 global_page_state(NR_BOUNCE)); 1918 global_page_state(NR_BOUNCE));
1924 1919
1925 for_each_zone(zone) { 1920 for_each_populated_zone(zone) {
1926 int i; 1921 int i;
1927 1922
1928 if (!populated_zone(zone))
1929 continue;
1930
1931 show_node(zone); 1923 show_node(zone);
1932 printk("%s" 1924 printk("%s"
1933 " free:%lukB" 1925 " free:%lukB"
@@ -1967,12 +1959,9 @@ void show_free_areas(void)
1967 printk("\n"); 1959 printk("\n");
1968 } 1960 }
1969 1961
1970 for_each_zone(zone) { 1962 for_each_populated_zone(zone) {
1971 unsigned long nr[MAX_ORDER], flags, order, total = 0; 1963 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1972 1964
1973 if (!populated_zone(zone))
1974 continue;
1975
1976 show_node(zone); 1965 show_node(zone);
1977 printk("%s: ", zone->name); 1966 printk("%s: ", zone->name);
1978 1967
@@ -2784,11 +2773,7 @@ static int __cpuinit process_zones(int cpu)
2784 2773
2785 node_set_state(node, N_CPU); /* this node has a cpu */ 2774 node_set_state(node, N_CPU); /* this node has a cpu */
2786 2775
2787 for_each_zone(zone) { 2776 for_each_populated_zone(zone) {
2788
2789 if (!populated_zone(zone))
2790 continue;
2791
2792 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 2777 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
2793 GFP_KERNEL, node); 2778 GFP_KERNEL, node);
2794 if (!zone_pcp(zone, cpu)) 2779 if (!zone_pcp(zone, cpu))
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 15de509b68fd..118905e3d788 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -191,7 +191,7 @@ static int pdflush(void *dummy)
191 191
192 /* 192 /*
193 * Some configs put our parent kthread in a limited cpuset, 193 * Some configs put our parent kthread in a limited cpuset,
194 * which kthread() overrides, forcing cpus_allowed == CPU_MASK_ALL. 194 * which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
195 * Our needs are more modest - cut back to our cpusets cpus_allowed. 195 * Our needs are more modest - cut back to our cpusets cpus_allowed.
196 * This is needed as pdflush's are dynamically created and destroyed. 196 * This is needed as pdflush's are dynamically created and destroyed.
197 * The boottime pdflush's are easily placed w/o these 2 lines. 197 * The boottime pdflush's are easily placed w/o these 2 lines.
diff --git a/mm/percpu.c b/mm/percpu.c
index bfe6a3afaf45..1aa5d8fbca12 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -46,7 +46,8 @@
46 * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 46 * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
47 * 47 *
48 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate 48 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
49 * regular address to percpu pointer and back 49 * regular address to percpu pointer and back if they need to be
50 * different from the default
50 * 51 *
51 * - use pcpu_setup_first_chunk() during percpu area initialization to 52 * - use pcpu_setup_first_chunk() during percpu area initialization to
52 * setup the first chunk containing the kernel static percpu area 53 * setup the first chunk containing the kernel static percpu area
@@ -67,11 +68,24 @@
67#include <linux/workqueue.h> 68#include <linux/workqueue.h>
68 69
69#include <asm/cacheflush.h> 70#include <asm/cacheflush.h>
71#include <asm/sections.h>
70#include <asm/tlbflush.h> 72#include <asm/tlbflush.h>
71 73
72#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 74#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
73#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 75#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
74 76
77/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
78#ifndef __addr_to_pcpu_ptr
79#define __addr_to_pcpu_ptr(addr) \
80 (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \
81 + (unsigned long)__per_cpu_start)
82#endif
83#ifndef __pcpu_ptr_to_addr
84#define __pcpu_ptr_to_addr(ptr) \
85 (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \
86 - (unsigned long)__per_cpu_start)
87#endif
88
75struct pcpu_chunk { 89struct pcpu_chunk {
76 struct list_head list; /* linked to pcpu_slot lists */ 90 struct list_head list; /* linked to pcpu_slot lists */
77 struct rb_node rb_node; /* key is chunk->vm->addr */ 91 struct rb_node rb_node; /* key is chunk->vm->addr */
@@ -1013,8 +1027,8 @@ EXPORT_SYMBOL_GPL(free_percpu);
1013 * @get_page_fn: callback to fetch page pointer 1027 * @get_page_fn: callback to fetch page pointer
1014 * @static_size: the size of static percpu area in bytes 1028 * @static_size: the size of static percpu area in bytes
1015 * @reserved_size: the size of reserved percpu area in bytes 1029 * @reserved_size: the size of reserved percpu area in bytes
1016 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
1017 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto 1030 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
1031 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
1018 * @base_addr: mapped address, NULL for auto 1032 * @base_addr: mapped address, NULL for auto
1019 * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary 1033 * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
1020 * 1034 *
@@ -1039,14 +1053,14 @@ EXPORT_SYMBOL_GPL(free_percpu);
1039 * limited offset range for symbol relocations to guarantee module 1053 * limited offset range for symbol relocations to guarantee module
1040 * percpu symbols fall inside the relocatable range. 1054 * percpu symbols fall inside the relocatable range.
1041 * 1055 *
1056 * @dyn_size, if non-negative, determines the number of bytes
1057 * available for dynamic allocation in the first chunk. Specifying
1058 * non-negative value makes percpu leave alone the area beyond
1059 * @static_size + @reserved_size + @dyn_size.
1060 *
1042 * @unit_size, if non-negative, specifies unit size and must be 1061 * @unit_size, if non-negative, specifies unit size and must be
1043 * aligned to PAGE_SIZE and equal to or larger than @static_size + 1062 * aligned to PAGE_SIZE and equal to or larger than @static_size +
1044 * @reserved_size + @dyn_size. 1063 * @reserved_size + if non-negative, @dyn_size.
1045 *
1046 * @dyn_size, if non-negative, limits the number of bytes available
1047 * for dynamic allocation in the first chunk. Specifying non-negative
1048 * value make percpu leave alone the area beyond @static_size +
1049 * @reserved_size + @dyn_size.
1050 * 1064 *
1051 * Non-null @base_addr means that the caller already allocated virtual 1065 * Non-null @base_addr means that the caller already allocated virtual
1052 * region for the first chunk and mapped it. percpu must not mess 1066 * region for the first chunk and mapped it. percpu must not mess
@@ -1069,12 +1083,14 @@ EXPORT_SYMBOL_GPL(free_percpu);
1069 */ 1083 */
1070size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, 1084size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1071 size_t static_size, size_t reserved_size, 1085 size_t static_size, size_t reserved_size,
1072 ssize_t unit_size, ssize_t dyn_size, 1086 ssize_t dyn_size, ssize_t unit_size,
1073 void *base_addr, 1087 void *base_addr,
1074 pcpu_populate_pte_fn_t populate_pte_fn) 1088 pcpu_populate_pte_fn_t populate_pte_fn)
1075{ 1089{
1076 static struct vm_struct first_vm; 1090 static struct vm_struct first_vm;
1077 static int smap[2], dmap[2]; 1091 static int smap[2], dmap[2];
1092 size_t size_sum = static_size + reserved_size +
1093 (dyn_size >= 0 ? dyn_size : 0);
1078 struct pcpu_chunk *schunk, *dchunk = NULL; 1094 struct pcpu_chunk *schunk, *dchunk = NULL;
1079 unsigned int cpu; 1095 unsigned int cpu;
1080 int nr_pages; 1096 int nr_pages;
@@ -1085,20 +1101,18 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1085 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); 1101 ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
1086 BUG_ON(!static_size); 1102 BUG_ON(!static_size);
1087 if (unit_size >= 0) { 1103 if (unit_size >= 0) {
1088 BUG_ON(unit_size < static_size + reserved_size + 1104 BUG_ON(unit_size < size_sum);
1089 (dyn_size >= 0 ? dyn_size : 0));
1090 BUG_ON(unit_size & ~PAGE_MASK); 1105 BUG_ON(unit_size & ~PAGE_MASK);
1091 } else { 1106 BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
1092 BUG_ON(dyn_size >= 0); 1107 } else
1093 BUG_ON(base_addr); 1108 BUG_ON(base_addr);
1094 }
1095 BUG_ON(base_addr && populate_pte_fn); 1109 BUG_ON(base_addr && populate_pte_fn);
1096 1110
1097 if (unit_size >= 0) 1111 if (unit_size >= 0)
1098 pcpu_unit_pages = unit_size >> PAGE_SHIFT; 1112 pcpu_unit_pages = unit_size >> PAGE_SHIFT;
1099 else 1113 else
1100 pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, 1114 pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
1101 PFN_UP(static_size + reserved_size)); 1115 PFN_UP(size_sum));
1102 1116
1103 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; 1117 pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
1104 pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; 1118 pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
@@ -1224,3 +1238,89 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
1224 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); 1238 pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
1225 return pcpu_unit_size; 1239 return pcpu_unit_size;
1226} 1240}
1241
1242/*
1243 * Embedding first chunk setup helper.
1244 */
1245static void *pcpue_ptr __initdata;
1246static size_t pcpue_size __initdata;
1247static size_t pcpue_unit_size __initdata;
1248
1249static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
1250{
1251 size_t off = (size_t)pageno << PAGE_SHIFT;
1252
1253 if (off >= pcpue_size)
1254 return NULL;
1255
1256 return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
1257}
1258
1259/**
1260 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
1261 * @static_size: the size of static percpu area in bytes
1262 * @reserved_size: the size of reserved percpu area in bytes
1263 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
1264 * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
1265 *
1266 * This is a helper to ease setting up embedded first percpu chunk and
1267 * can be called where pcpu_setup_first_chunk() is expected.
1268 *
1269 * If this function is used to setup the first chunk, it is allocated
1270 * as a contiguous area using bootmem allocator and used as-is without
1271 * being mapped into vmalloc area. This enables the first chunk to
1272 * piggy back on the linear physical mapping which often uses larger
1273 * page size.
1274 *
1275 * When @dyn_size is positive, dynamic area might be larger than
1276 * specified to fill page alignment. Also, when @dyn_size is auto,
1277 * @dyn_size does not fill the whole first chunk but only what's
1278 * necessary for page alignment after static and reserved areas.
1279 *
1280 * If the needed size is smaller than the minimum or specified unit
1281 * size, the leftover is returned to the bootmem allocator.
1282 *
1283 * RETURNS:
1284 * The determined pcpu_unit_size which can be used to initialize
1285 * percpu access on success, -errno on failure.
1286 */
1287ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1288 ssize_t dyn_size, ssize_t unit_size)
1289{
1290 unsigned int cpu;
1291
1292 /* determine parameters and allocate */
1293 pcpue_size = PFN_ALIGN(static_size + reserved_size +
1294 (dyn_size >= 0 ? dyn_size : 0));
1295 if (dyn_size != 0)
1296 dyn_size = pcpue_size - static_size - reserved_size;
1297
1298 if (unit_size >= 0) {
1299 BUG_ON(unit_size < pcpue_size);
1300 pcpue_unit_size = unit_size;
1301 } else
1302 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
1303
1304 pcpue_ptr = __alloc_bootmem_nopanic(
1305 num_possible_cpus() * pcpue_unit_size,
1306 PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
1307 if (!pcpue_ptr)
1308 return -ENOMEM;
1309
1310 /* return the leftover and copy */
1311 for_each_possible_cpu(cpu) {
1312 void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
1313
1314 free_bootmem(__pa(ptr + pcpue_size),
1315 pcpue_unit_size - pcpue_size);
1316 memcpy(ptr, __per_cpu_load, static_size);
1317 }
1318
1319 /* we're ready, commit */
1320 pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
1321 pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
1322
1323 return pcpu_setup_first_chunk(pcpue_get_page, static_size,
1324 reserved_size, dyn_size,
1325 pcpue_unit_size, pcpue_ptr, NULL);
1326}
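
A hedged sketch of how an architecture might hand its first-chunk setup to
the new helper; the symbols used (__per_cpu_start/__per_cpu_end from
<asm/sections.h>, PERCPU_MODULE_RESERVE from <linux/percpu.h>) are
assumptions about the caller and are not introduced by this patch:

/*
 * Assumed caller, for illustration only: an arch setup_per_cpu_areas()
 * delegating to pcpu_embed_first_chunk() with auto dyn_size/unit_size.
 */
void __init setup_per_cpu_areas(void)
{
	size_t static_size = __per_cpu_end - __per_cpu_start;
	ssize_t unit_size;

	unit_size = pcpu_embed_first_chunk(static_size,
					   PERCPU_MODULE_RESERVE,
					   -1, -1);
	if (unit_size < 0)
		panic("percpu: failed to embed first chunk");

	/* the arch would now derive per-cpu offsets from pcpu_base_addr */
}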
diff --git a/mm/readahead.c b/mm/readahead.c
index bec83c15a78f..9ce303d4b810 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -17,19 +17,6 @@
17#include <linux/pagevec.h> 17#include <linux/pagevec.h>
18#include <linux/pagemap.h> 18#include <linux/pagemap.h>
19 19
20void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
21{
22}
23EXPORT_SYMBOL(default_unplug_io_fn);
24
25struct backing_dev_info default_backing_dev_info = {
26 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
27 .state = 0,
28 .capabilities = BDI_CAP_MAP_COPY,
29 .unplug_io_fn = default_unplug_io_fn,
30};
31EXPORT_SYMBOL_GPL(default_backing_dev_info);
32
33/* 20/*
34 * Initialise a struct file's readahead state. Assumes that the caller has 21 * Initialise a struct file's readahead state. Assumes that the caller has
35 * memset *ra to zero. 22 * memset *ra to zero.
@@ -233,18 +220,6 @@ unsigned long max_sane_readahead(unsigned long nr)
233 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); 220 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
234} 221}
235 222
236static int __init readahead_init(void)
237{
238 int err;
239
240 err = bdi_init(&default_backing_dev_info);
241 if (!err)
242 bdi_register(&default_backing_dev_info, NULL, "default");
243
244 return err;
245}
246subsys_initcall(readahead_init);
247
248/* 223/*
249 * Submit IO for the read-ahead request in file_ra_state. 224 * Submit IO for the read-ahead request in file_ra_state.
250 */ 225 */
diff --git a/mm/shmem.c b/mm/shmem.c
index 4103a239ce84..d94d2e9146bc 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,6 +28,7 @@
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/swap.h> 30#include <linux/swap.h>
31#include <linux/ima.h>
31 32
32static struct vfsmount *shm_mnt; 33static struct vfsmount *shm_mnt;
33 34
@@ -1067,8 +1068,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1067 swap_duplicate(swap); 1068 swap_duplicate(swap);
1068 BUG_ON(page_mapped(page)); 1069 BUG_ON(page_mapped(page));
1069 page_cache_release(page); /* pagecache ref */ 1070 page_cache_release(page); /* pagecache ref */
1070 set_page_dirty(page); 1071 swap_writepage(page, wbc);
1071 unlock_page(page);
1072 if (inode) { 1072 if (inode) {
1073 mutex_lock(&shmem_swaplist_mutex); 1073 mutex_lock(&shmem_swaplist_mutex);
1074 /* move instead of add in case we're racing */ 1074 /* move instead of add in case we're racing */
@@ -2665,6 +2665,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2665 if (IS_ERR(file)) 2665 if (IS_ERR(file))
2666 return PTR_ERR(file); 2666 return PTR_ERR(file);
2667 2667
2668 ima_shm_check(file);
2668 if (vma->vm_file) 2669 if (vma->vm_file)
2669 fput(vma->vm_file); 2670 fput(vma->vm_file);
2670 vma->vm_file = file; 2671 vma->vm_file = file;
diff --git a/mm/slob.c b/mm/slob.c
index 596152926a8d..4dd6516447f2 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -127,9 +127,9 @@ static LIST_HEAD(free_slob_medium);
127static LIST_HEAD(free_slob_large); 127static LIST_HEAD(free_slob_large);
128 128
129/* 129/*
130 * slob_page: True for all slob pages (false for bigblock pages) 130 * is_slob_page: True for all slob pages (false for bigblock pages)
131 */ 131 */
132static inline int slob_page(struct slob_page *sp) 132static inline int is_slob_page(struct slob_page *sp)
133{ 133{
134 return PageSlobPage((struct page *)sp); 134 return PageSlobPage((struct page *)sp);
135} 135}
@@ -144,6 +144,11 @@ static inline void clear_slob_page(struct slob_page *sp)
144 __ClearPageSlobPage((struct page *)sp); 144 __ClearPageSlobPage((struct page *)sp);
145} 145}
146 146
147static inline struct slob_page *slob_page(const void *addr)
148{
149 return (struct slob_page *)virt_to_page(addr);
150}
151
147/* 152/*
148 * slob_page_free: true for pages on free_slob_pages list. 153 * slob_page_free: true for pages on free_slob_pages list.
149 */ 154 */
@@ -231,7 +236,7 @@ static int slob_last(slob_t *s)
231 return !((unsigned long)slob_next(s) & ~PAGE_MASK); 236 return !((unsigned long)slob_next(s) & ~PAGE_MASK);
232} 237}
233 238
234static void *slob_new_page(gfp_t gfp, int order, int node) 239static void *slob_new_pages(gfp_t gfp, int order, int node)
235{ 240{
236 void *page; 241 void *page;
237 242
@@ -248,12 +253,17 @@ static void *slob_new_page(gfp_t gfp, int order, int node)
248 return page_address(page); 253 return page_address(page);
249} 254}
250 255
256static void slob_free_pages(void *b, int order)
257{
258 free_pages((unsigned long)b, order);
259}
260
251/* 261/*
252 * Allocate a slob block within a given slob_page sp. 262 * Allocate a slob block within a given slob_page sp.
253 */ 263 */
254static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) 264static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
255{ 265{
256 slob_t *prev, *cur, *aligned = 0; 266 slob_t *prev, *cur, *aligned = NULL;
257 int delta = 0, units = SLOB_UNITS(size); 267 int delta = 0, units = SLOB_UNITS(size);
258 268
259 for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { 269 for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) {
@@ -350,10 +360,10 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
350 360
351 /* Not enough space: must allocate a new page */ 361 /* Not enough space: must allocate a new page */
352 if (!b) { 362 if (!b) {
353 b = slob_new_page(gfp & ~__GFP_ZERO, 0, node); 363 b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
354 if (!b) 364 if (!b)
355 return 0; 365 return NULL;
356 sp = (struct slob_page *)virt_to_page(b); 366 sp = slob_page(b);
357 set_slob_page(sp); 367 set_slob_page(sp);
358 368
359 spin_lock_irqsave(&slob_lock, flags); 369 spin_lock_irqsave(&slob_lock, flags);
@@ -385,7 +395,7 @@ static void slob_free(void *block, int size)
385 return; 395 return;
386 BUG_ON(!size); 396 BUG_ON(!size);
387 397
388 sp = (struct slob_page *)virt_to_page(block); 398 sp = slob_page(block);
389 units = SLOB_UNITS(size); 399 units = SLOB_UNITS(size);
390 400
391 spin_lock_irqsave(&slob_lock, flags); 401 spin_lock_irqsave(&slob_lock, flags);
@@ -394,10 +404,11 @@ static void slob_free(void *block, int size)
394 /* Go directly to page allocator. Do not pass slob allocator */ 404 /* Go directly to page allocator. Do not pass slob allocator */
395 if (slob_page_free(sp)) 405 if (slob_page_free(sp))
396 clear_slob_page_free(sp); 406 clear_slob_page_free(sp);
407 spin_unlock_irqrestore(&slob_lock, flags);
397 clear_slob_page(sp); 408 clear_slob_page(sp);
398 free_slob_page(sp); 409 free_slob_page(sp);
399 free_page((unsigned long)b); 410 free_page((unsigned long)b);
400 goto out; 411 return;
401 } 412 }
402 413
403 if (!slob_page_free(sp)) { 414 if (!slob_page_free(sp)) {
@@ -466,7 +477,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
466 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 477 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
467 void *ret; 478 void *ret;
468 479
469 lockdep_trace_alloc(flags); 480 lockdep_trace_alloc(gfp);
470 481
471 if (size < PAGE_SIZE - align) { 482 if (size < PAGE_SIZE - align) {
472 if (!size) 483 if (!size)
@@ -485,7 +496,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
485 } else { 496 } else {
486 unsigned int order = get_order(size); 497 unsigned int order = get_order(size);
487 498
488 ret = slob_new_page(gfp | __GFP_COMP, order, node); 499 ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node);
489 if (ret) { 500 if (ret) {
490 struct page *page; 501 struct page *page;
491 page = virt_to_page(ret); 502 page = virt_to_page(ret);
@@ -508,8 +519,8 @@ void kfree(const void *block)
508 if (unlikely(ZERO_OR_NULL_PTR(block))) 519 if (unlikely(ZERO_OR_NULL_PTR(block)))
509 return; 520 return;
510 521
511 sp = (struct slob_page *)virt_to_page(block); 522 sp = slob_page(block);
512 if (slob_page(sp)) { 523 if (is_slob_page(sp)) {
513 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 524 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
514 unsigned int *m = (unsigned int *)(block - align); 525 unsigned int *m = (unsigned int *)(block - align);
515 slob_free(m, *m + align); 526 slob_free(m, *m + align);
@@ -529,8 +540,8 @@ size_t ksize(const void *block)
529 if (unlikely(block == ZERO_SIZE_PTR)) 540 if (unlikely(block == ZERO_SIZE_PTR))
530 return 0; 541 return 0;
531 542
532 sp = (struct slob_page *)virt_to_page(block); 543 sp = slob_page(block);
533 if (slob_page(sp)) { 544 if (is_slob_page(sp)) {
534 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 545 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
535 unsigned int *m = (unsigned int *)(block - align); 546 unsigned int *m = (unsigned int *)(block - align);
536 return SLOB_UNITS(*m) * SLOB_UNIT; 547 return SLOB_UNITS(*m) * SLOB_UNIT;
@@ -593,7 +604,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
593 SLOB_UNITS(c->size) * SLOB_UNIT, 604 SLOB_UNITS(c->size) * SLOB_UNIT,
594 flags, node); 605 flags, node);
595 } else { 606 } else {
596 b = slob_new_page(flags, get_order(c->size), node); 607 b = slob_new_pages(flags, get_order(c->size), node);
597 kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, 608 kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE,
598 _RET_IP_, b, c->size, 609 _RET_IP_, b, c->size,
599 PAGE_SIZE << get_order(c->size), 610 PAGE_SIZE << get_order(c->size),
@@ -612,7 +623,7 @@ static void __kmem_cache_free(void *b, int size)
612 if (size < PAGE_SIZE) 623 if (size < PAGE_SIZE)
613 slob_free(b, size); 624 slob_free(b, size);
614 else 625 else
615 free_pages((unsigned long)b, get_order(size)); 626 slob_free_pages(b, get_order(size));
616} 627}
617 628
618static void kmem_rcu_free(struct rcu_head *head) 629static void kmem_rcu_free(struct rcu_head *head)
diff --git a/mm/slub.c b/mm/slub.c
index 816734ed8aa3..7aaa121d0ea9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -375,14 +375,8 @@ static struct track *get_track(struct kmem_cache *s, void *object,
375static void set_track(struct kmem_cache *s, void *object, 375static void set_track(struct kmem_cache *s, void *object,
376 enum track_item alloc, unsigned long addr) 376 enum track_item alloc, unsigned long addr)
377{ 377{
378 struct track *p; 378 struct track *p = get_track(s, object, alloc);
379
380 if (s->offset)
381 p = object + s->offset + sizeof(void *);
382 else
383 p = object + s->inuse;
384 379
385 p += alloc;
386 if (addr) { 380 if (addr) {
387 p->addr = addr; 381 p->addr = addr;
388 p->cpu = smp_processor_id(); 382 p->cpu = smp_processor_id();
@@ -1336,7 +1330,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1336 n = get_node(s, zone_to_nid(zone)); 1330 n = get_node(s, zone_to_nid(zone));
1337 1331
1338 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1332 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1339 n->nr_partial > n->min_partial) { 1333 n->nr_partial > s->min_partial) {
1340 page = get_partial_node(n); 1334 page = get_partial_node(n);
1341 if (page) 1335 if (page)
1342 return page; 1336 return page;
@@ -1388,7 +1382,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1388 slab_unlock(page); 1382 slab_unlock(page);
1389 } else { 1383 } else {
1390 stat(c, DEACTIVATE_EMPTY); 1384 stat(c, DEACTIVATE_EMPTY);
1391 if (n->nr_partial < n->min_partial) { 1385 if (n->nr_partial < s->min_partial) {
1392 /* 1386 /*
1393 * Adding an empty slab to the partial slabs in order 1387 * Adding an empty slab to the partial slabs in order
1394 * to avoid page allocator overhead. This slab needs 1388 * to avoid page allocator overhead. This slab needs
@@ -1754,7 +1748,7 @@ static __always_inline void slab_free(struct kmem_cache *s,
1754 c = get_cpu_slab(s, smp_processor_id()); 1748 c = get_cpu_slab(s, smp_processor_id());
1755 debug_check_no_locks_freed(object, c->objsize); 1749 debug_check_no_locks_freed(object, c->objsize);
1756 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1750 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1757 debug_check_no_obj_freed(object, s->objsize); 1751 debug_check_no_obj_freed(object, c->objsize);
1758 if (likely(page == c->page && c->node >= 0)) { 1752 if (likely(page == c->page && c->node >= 0)) {
1759 object[c->offset] = c->freelist; 1753 object[c->offset] = c->freelist;
1760 c->freelist = object; 1754 c->freelist = object;
@@ -1876,6 +1870,7 @@ static inline int calculate_order(int size)
1876 int order; 1870 int order;
1877 int min_objects; 1871 int min_objects;
1878 int fraction; 1872 int fraction;
1873 int max_objects;
1879 1874
1880 /* 1875 /*
1881 * Attempt to find best configuration for a slab. This 1876 * Attempt to find best configuration for a slab. This
@@ -1888,6 +1883,9 @@ static inline int calculate_order(int size)
1888 min_objects = slub_min_objects; 1883 min_objects = slub_min_objects;
1889 if (!min_objects) 1884 if (!min_objects)
1890 min_objects = 4 * (fls(nr_cpu_ids) + 1); 1885 min_objects = 4 * (fls(nr_cpu_ids) + 1);
1886 max_objects = (PAGE_SIZE << slub_max_order)/size;
1887 min_objects = min(min_objects, max_objects);
1888
1891 while (min_objects > 1) { 1889 while (min_objects > 1) {
1892 fraction = 16; 1890 fraction = 16;
1893 while (fraction >= 4) { 1891 while (fraction >= 4) {
@@ -1897,7 +1895,7 @@ static inline int calculate_order(int size)
1897 return order; 1895 return order;
1898 fraction /= 2; 1896 fraction /= 2;
1899 } 1897 }
1900 min_objects /= 2; 1898 min_objects --;
1901 } 1899 }
1902 1900
1903 /* 1901 /*
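
The hunks above cap min_objects at the number of objects that fit into the largest allowed slab and make the fallback loop step min_objects down one at a time instead of halving it. A minimal user-space sketch of the clamp arithmetic, assuming PAGE_SIZE is 4096 and slub_max_order is 3 (both values are assumptions for illustration, not taken from this patch):

	#include <stdio.h>

	/* Assumed values for illustration only; the kernel derives both itself. */
	#define PAGE_SIZE      4096UL
	#define SLUB_MAX_ORDER 3

	int main(void)
	{
		unsigned long size = 2048;	/* hypothetical object size in bytes */
		unsigned long max_objects = (PAGE_SIZE << SLUB_MAX_ORDER) / size;

		/* (4096 << 3) / 2048 = 16, so min_objects is clamped to at most 16
		 * before the order search starts. */
		printf("max_objects = %lu\n", max_objects);
		return 0;
	}
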
@@ -1960,17 +1958,6 @@ static void
1960init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) 1958init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
1961{ 1959{
1962 n->nr_partial = 0; 1960 n->nr_partial = 0;
1963
1964 /*
1965 * The larger the object size is, the more pages we want on the partial
1966 * list to avoid pounding the page allocator excessively.
1967 */
1968 n->min_partial = ilog2(s->size);
1969 if (n->min_partial < MIN_PARTIAL)
1970 n->min_partial = MIN_PARTIAL;
1971 else if (n->min_partial > MAX_PARTIAL)
1972 n->min_partial = MAX_PARTIAL;
1973
1974 spin_lock_init(&n->list_lock); 1961 spin_lock_init(&n->list_lock);
1975 INIT_LIST_HEAD(&n->partial); 1962 INIT_LIST_HEAD(&n->partial);
1976#ifdef CONFIG_SLUB_DEBUG 1963#ifdef CONFIG_SLUB_DEBUG
@@ -2213,6 +2200,15 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2213} 2200}
2214#endif 2201#endif
2215 2202
2203static void set_min_partial(struct kmem_cache *s, unsigned long min)
2204{
2205 if (min < MIN_PARTIAL)
2206 min = MIN_PARTIAL;
2207 else if (min > MAX_PARTIAL)
2208 min = MAX_PARTIAL;
2209 s->min_partial = min;
2210}
2211
2216/* 2212/*
2217 * calculate_sizes() determines the order and the distribution of data within 2213 * calculate_sizes() determines the order and the distribution of data within
2218 * a slab object. 2214 * a slab object.
@@ -2351,6 +2347,11 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2351 if (!calculate_sizes(s, -1)) 2347 if (!calculate_sizes(s, -1))
2352 goto error; 2348 goto error;
2353 2349
2350 /*
2351 * The larger the object size is, the more pages we want on the partial
2352 * list to avoid pounding the page allocator excessively.
2353 */
2354 set_min_partial(s, ilog2(s->size));
2354 s->refcount = 1; 2355 s->refcount = 1;
2355#ifdef CONFIG_NUMA 2356#ifdef CONFIG_NUMA
2356 s->remote_node_defrag_ratio = 1000; 2357 s->remote_node_defrag_ratio = 1000;
@@ -3904,6 +3905,26 @@ static ssize_t order_show(struct kmem_cache *s, char *buf)
3904} 3905}
3905SLAB_ATTR(order); 3906SLAB_ATTR(order);
3906 3907
3908static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
3909{
3910 return sprintf(buf, "%lu\n", s->min_partial);
3911}
3912
3913static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
3914 size_t length)
3915{
3916 unsigned long min;
3917 int err;
3918
3919 err = strict_strtoul(buf, 10, &min);
3920 if (err)
3921 return err;
3922
3923 set_min_partial(s, min);
3924 return length;
3925}
3926SLAB_ATTR(min_partial);
3927
3907static ssize_t ctor_show(struct kmem_cache *s, char *buf) 3928static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3908{ 3929{
3909 if (s->ctor) { 3930 if (s->ctor) {
@@ -4219,6 +4240,7 @@ static struct attribute *slab_attrs[] = {
4219 &object_size_attr.attr, 4240 &object_size_attr.attr,
4220 &objs_per_slab_attr.attr, 4241 &objs_per_slab_attr.attr,
4221 &order_attr.attr, 4242 &order_attr.attr,
4243 &min_partial_attr.attr,
4222 &objects_attr.attr, 4244 &objects_attr.attr,
4223 &objects_partial_attr.attr, 4245 &objects_partial_attr.attr,
4224 &total_objects_attr.attr, 4246 &total_objects_attr.attr,
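
Together with set_min_partial(), the min_partial_show/min_partial_store pair above exposes the per-cache partial-list floor as a writable sysfs attribute; values outside [MIN_PARTIAL, MAX_PARTIAL] are silently clamped. A minimal user-space sketch of tuning it, assuming a SLUB kernel with sysfs mounted and using "dentry" purely as an example cache name:

	#include <stdio.h>

	int main(void)
	{
		/* Example path only; any cache directory under /sys/kernel/slab/
		 * gains this attribute once the patch is applied. */
		const char *attr = "/sys/kernel/slab/dentry/min_partial";
		FILE *f = fopen(attr, "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		/* min_partial_store() parses the value with strict_strtoul() and
		 * then clamps it via set_min_partial(). */
		fprintf(f, "10\n");
		return fclose(f) ? 1 : 0;
	}
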
diff --git a/mm/sparse.c b/mm/sparse.c
index 083f5b63e7a8..da432d9f0ae8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -164,9 +164,7 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
164 WARN_ON_ONCE(1); 164 WARN_ON_ONCE(1);
165 *start_pfn = max_sparsemem_pfn; 165 *start_pfn = max_sparsemem_pfn;
166 *end_pfn = max_sparsemem_pfn; 166 *end_pfn = max_sparsemem_pfn;
167 } 167 } else if (*end_pfn > max_sparsemem_pfn) {
168
169 if (*end_pfn > max_sparsemem_pfn) {
170 mminit_dprintk(MMINIT_WARNING, "pfnvalidation", 168 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
171 "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n", 169 "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
172 *start_pfn, *end_pfn, max_sparsemem_pfn); 170 *start_pfn, *end_pfn, max_sparsemem_pfn);
diff --git a/mm/swap.c b/mm/swap.c
index 8adb9feb61e1..6e83084c1f6c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -457,29 +457,6 @@ void pagevec_strip(struct pagevec *pvec)
457} 457}
458 458
459/** 459/**
460 * pagevec_swap_free - try to free swap space from the pages in a pagevec
461 * @pvec: pagevec with swapcache pages to free the swap space of
462 *
463 * The caller needs to hold an extra reference to each page and
464 * not hold the page lock on the pages. This function uses a
465 * trylock on the page lock so it may not always free the swap
466 * space associated with a page.
467 */
468void pagevec_swap_free(struct pagevec *pvec)
469{
470 int i;
471
472 for (i = 0; i < pagevec_count(pvec); i++) {
473 struct page *page = pvec->pages[i];
474
475 if (PageSwapCache(page) && trylock_page(page)) {
476 try_to_free_swap(page);
477 unlock_page(page);
478 }
479 }
480}
481
482/**
483 * pagevec_lookup - gang pagecache lookup 460 * pagevec_lookup - gang pagecache lookup
484 * @pvec: Where the resulting pages are placed 461 * @pvec: Where the resulting pages are placed
485 * @mapping: The address_space to search 462 * @mapping: The address_space to search
diff --git a/mm/util.c b/mm/util.c
index 37eaccdf3054..7c122e49f769 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -70,6 +70,36 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
70EXPORT_SYMBOL(kmemdup); 70EXPORT_SYMBOL(kmemdup);
71 71
72/** 72/**
73 * memdup_user - duplicate memory region from user space
74 *
75 * @src: source address in user space
76 * @len: number of bytes to copy
77 *
78 * Returns an ERR_PTR() on failure.
79 */
80void *memdup_user(const void __user *src, size_t len)
81{
82 void *p;
83
84 /*
85 * Always use GFP_KERNEL, since copy_from_user() can sleep and
86 * cause pagefault, which makes it pointless to use GFP_NOFS
87 * or GFP_ATOMIC.
88 */
89 p = kmalloc_track_caller(len, GFP_KERNEL);
90 if (!p)
91 return ERR_PTR(-ENOMEM);
92
93 if (copy_from_user(p, src, len)) {
94 kfree(p);
95 return ERR_PTR(-EFAULT);
96 }
97
98 return p;
99}
100EXPORT_SYMBOL(memdup_user);
101
102/**
73 * __krealloc - like krealloc() but don't free @p. 103 * __krealloc - like krealloc() but don't free @p.
74 * @p: object to reallocate memory for. 104 * @p: object to reallocate memory for.
75 * @new_size: how many bytes of memory are required. 105 * @new_size: how many bytes of memory are required.
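
memdup_user() folds the usual allocate/copy_from_user/free-on-failure sequence into one helper that returns either the kernel copy or an ERR_PTR(-ENOMEM / -EFAULT). A minimal sketch of a caller, using a hypothetical ioctl-style handler; the struct and function names below are illustrative and not part of this patch:

	#include <linux/err.h>
	#include <linux/slab.h>
	#include <linux/string.h>
	#include <linux/types.h>
	#include <linux/uaccess.h>

	/* Hypothetical argument block copied in from user space. */
	struct foo_args {
		u32 flags;
		u64 addr;
	};

	static long foo_set_args(void __user *uarg)
	{
		struct foo_args *args;

		args = memdup_user(uarg, sizeof(*args));
		if (IS_ERR(args))
			return PTR_ERR(args);	/* -ENOMEM or -EFAULT */

		/* ... act on args->flags / args->addr ... */

		kfree(args);
		return 0;
	}
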
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index af58324c361a..fab19876b4d1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -671,10 +671,7 @@ struct vmap_block {
671 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); 671 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
672 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 672 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
673 union { 673 union {
674 struct { 674 struct list_head free_list;
675 struct list_head free_list;
676 struct list_head dirty_list;
677 };
678 struct rcu_head rcu_head; 675 struct rcu_head rcu_head;
679 }; 676 };
680}; 677};
@@ -741,7 +738,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
741 bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); 738 bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
742 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); 739 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
743 INIT_LIST_HEAD(&vb->free_list); 740 INIT_LIST_HEAD(&vb->free_list);
744 INIT_LIST_HEAD(&vb->dirty_list);
745 741
746 vb_idx = addr_to_vb_idx(va->va_start); 742 vb_idx = addr_to_vb_idx(va->va_start);
747 spin_lock(&vmap_block_tree_lock); 743 spin_lock(&vmap_block_tree_lock);
@@ -772,12 +768,7 @@ static void free_vmap_block(struct vmap_block *vb)
772 struct vmap_block *tmp; 768 struct vmap_block *tmp;
773 unsigned long vb_idx; 769 unsigned long vb_idx;
774 770
775 spin_lock(&vb->vbq->lock); 771 BUG_ON(!list_empty(&vb->free_list));
776 if (!list_empty(&vb->free_list))
777 list_del(&vb->free_list);
778 if (!list_empty(&vb->dirty_list))
779 list_del(&vb->dirty_list);
780 spin_unlock(&vb->vbq->lock);
781 772
782 vb_idx = addr_to_vb_idx(vb->va->va_start); 773 vb_idx = addr_to_vb_idx(vb->va->va_start);
783 spin_lock(&vmap_block_tree_lock); 774 spin_lock(&vmap_block_tree_lock);
@@ -862,11 +853,7 @@ static void vb_free(const void *addr, unsigned long size)
862 853
863 spin_lock(&vb->lock); 854 spin_lock(&vb->lock);
864 bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); 855 bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
865 if (!vb->dirty) { 856
866 spin_lock(&vb->vbq->lock);
867 list_add(&vb->dirty_list, &vb->vbq->dirty);
868 spin_unlock(&vb->vbq->lock);
869 }
870 vb->dirty += 1UL << order; 857 vb->dirty += 1UL << order;
871 if (vb->dirty == VMAP_BBMAP_BITS) { 858 if (vb->dirty == VMAP_BBMAP_BITS) {
872 BUG_ON(vb->free || !list_empty(&vb->free_list)); 859 BUG_ON(vb->free || !list_empty(&vb->free_list));
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 479e46719394..06e72693b458 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -60,8 +60,8 @@ struct scan_control {
60 60
61 int may_writepage; 61 int may_writepage;
62 62
63 /* Can pages be swapped as part of reclaim? */ 63 /* Can mapped pages be reclaimed? */
64 int may_swap; 64 int may_unmap;
65 65
66 /* This context's SWAP_CLUSTER_MAX. If freeing memory for 66 /* This context's SWAP_CLUSTER_MAX. If freeing memory for
67 * suspend, we effectively ignore SWAP_CLUSTER_MAX. 67 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
@@ -78,6 +78,12 @@ struct scan_control {
78 /* Which cgroup do we reclaim from */ 78 /* Which cgroup do we reclaim from */
79 struct mem_cgroup *mem_cgroup; 79 struct mem_cgroup *mem_cgroup;
80 80
81 /*
82 * Nodemask of nodes allowed by the caller. If NULL, all nodes
83 * are scanned.
84 */
85 nodemask_t *nodemask;
86
81 /* Pluggable isolate pages callback */ 87 /* Pluggable isolate pages callback */
82 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, 88 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
83 unsigned long *scanned, int order, int mode, 89 unsigned long *scanned, int order, int mode,
@@ -214,8 +220,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
214 do_div(delta, lru_pages + 1); 220 do_div(delta, lru_pages + 1);
215 shrinker->nr += delta; 221 shrinker->nr += delta;
216 if (shrinker->nr < 0) { 222 if (shrinker->nr < 0) {
217 printk(KERN_ERR "%s: nr=%ld\n", 223 printk(KERN_ERR "shrink_slab: %pF negative objects to "
218 __func__, shrinker->nr); 224 "delete nr=%ld\n",
225 shrinker->shrink, shrinker->nr);
219 shrinker->nr = max_pass; 226 shrinker->nr = max_pass;
220 } 227 }
221 228
@@ -606,7 +613,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
606 if (unlikely(!page_evictable(page, NULL))) 613 if (unlikely(!page_evictable(page, NULL)))
607 goto cull_mlocked; 614 goto cull_mlocked;
608 615
609 if (!sc->may_swap && page_mapped(page)) 616 if (!sc->may_unmap && page_mapped(page))
610 goto keep_locked; 617 goto keep_locked;
611 618
612 /* Double the slab pressure for mapped and swapcache pages */ 619 /* Double the slab pressure for mapped and swapcache pages */
@@ -1298,17 +1305,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1298 } 1305 }
1299 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); 1306 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1300 pgdeactivate += pgmoved; 1307 pgdeactivate += pgmoved;
1301 if (buffer_heads_over_limit) {
1302 spin_unlock_irq(&zone->lru_lock);
1303 pagevec_strip(&pvec);
1304 spin_lock_irq(&zone->lru_lock);
1305 }
1306 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1308 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1307 __count_vm_events(PGDEACTIVATE, pgdeactivate); 1309 __count_vm_events(PGDEACTIVATE, pgdeactivate);
1308 spin_unlock_irq(&zone->lru_lock); 1310 spin_unlock_irq(&zone->lru_lock);
1309 if (vm_swap_full()) 1311 if (buffer_heads_over_limit)
1310 pagevec_swap_free(&pvec); 1312 pagevec_strip(&pvec);
1311
1312 pagevec_release(&pvec); 1313 pagevec_release(&pvec);
1313} 1314}
1314 1315
@@ -1543,7 +1544,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1543 struct zone *zone; 1544 struct zone *zone;
1544 1545
1545 sc->all_unreclaimable = 1; 1546 sc->all_unreclaimable = 1;
1546 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1547 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
1548 sc->nodemask) {
1547 if (!populated_zone(zone)) 1549 if (!populated_zone(zone))
1548 continue; 1550 continue;
1549 /* 1551 /*
@@ -1688,17 +1690,18 @@ out:
1688} 1690}
1689 1691
1690unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 1692unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1691 gfp_t gfp_mask) 1693 gfp_t gfp_mask, nodemask_t *nodemask)
1692{ 1694{
1693 struct scan_control sc = { 1695 struct scan_control sc = {
1694 .gfp_mask = gfp_mask, 1696 .gfp_mask = gfp_mask,
1695 .may_writepage = !laptop_mode, 1697 .may_writepage = !laptop_mode,
1696 .swap_cluster_max = SWAP_CLUSTER_MAX, 1698 .swap_cluster_max = SWAP_CLUSTER_MAX,
1697 .may_swap = 1, 1699 .may_unmap = 1,
1698 .swappiness = vm_swappiness, 1700 .swappiness = vm_swappiness,
1699 .order = order, 1701 .order = order,
1700 .mem_cgroup = NULL, 1702 .mem_cgroup = NULL,
1701 .isolate_pages = isolate_pages_global, 1703 .isolate_pages = isolate_pages_global,
1704 .nodemask = nodemask,
1702 }; 1705 };
1703 1706
1704 return do_try_to_free_pages(zonelist, &sc); 1707 return do_try_to_free_pages(zonelist, &sc);
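
Passing the nodemask down means direct reclaim now skips zones on nodes the allocation is not allowed to use instead of scanning the whole zonelist. A minimal sketch of the expected call-site shape (assumed for illustration; the corresponding page allocator change is not part of this section):

	#include <linux/gfp.h>
	#include <linux/mmzone.h>
	#include <linux/nodemask.h>
	#include <linux/swap.h>

	/* Assumed call-site shape, for illustration only. */
	static unsigned long example_direct_reclaim(gfp_t gfp_mask, int order,
						    struct zonelist *zonelist,
						    nodemask_t *nodemask)
	{
		/*
		 * A NULL nodemask still means "all nodes"; otherwise
		 * shrink_zones() walks the zonelist with
		 * for_each_zone_zonelist_nodemask() and ignores zones on
		 * disallowed nodes.
		 */
		return try_to_free_pages(zonelist, order, gfp_mask, nodemask);
	}
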
@@ -1713,17 +1716,18 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1713{ 1716{
1714 struct scan_control sc = { 1717 struct scan_control sc = {
1715 .may_writepage = !laptop_mode, 1718 .may_writepage = !laptop_mode,
1716 .may_swap = 1, 1719 .may_unmap = 1,
1717 .swap_cluster_max = SWAP_CLUSTER_MAX, 1720 .swap_cluster_max = SWAP_CLUSTER_MAX,
1718 .swappiness = swappiness, 1721 .swappiness = swappiness,
1719 .order = 0, 1722 .order = 0,
1720 .mem_cgroup = mem_cont, 1723 .mem_cgroup = mem_cont,
1721 .isolate_pages = mem_cgroup_isolate_pages, 1724 .isolate_pages = mem_cgroup_isolate_pages,
 1725 .nodemask = NULL, /* we don't care about the placement */
1722 }; 1726 };
1723 struct zonelist *zonelist; 1727 struct zonelist *zonelist;
1724 1728
1725 if (noswap) 1729 if (noswap)
1726 sc.may_swap = 0; 1730 sc.may_unmap = 0;
1727 1731
1728 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 1732 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1729 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 1733 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -1762,7 +1766,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1762 struct reclaim_state *reclaim_state = current->reclaim_state; 1766 struct reclaim_state *reclaim_state = current->reclaim_state;
1763 struct scan_control sc = { 1767 struct scan_control sc = {
1764 .gfp_mask = GFP_KERNEL, 1768 .gfp_mask = GFP_KERNEL,
1765 .may_swap = 1, 1769 .may_unmap = 1,
1766 .swap_cluster_max = SWAP_CLUSTER_MAX, 1770 .swap_cluster_max = SWAP_CLUSTER_MAX,
1767 .swappiness = vm_swappiness, 1771 .swappiness = vm_swappiness,
1768 .order = order, 1772 .order = order,
@@ -2050,22 +2054,19 @@ unsigned long global_lru_pages(void)
2050#ifdef CONFIG_PM 2054#ifdef CONFIG_PM
2051/* 2055/*
2052 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 2056 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
2053 * from LRU lists system-wide, for given pass and priority, and returns the 2057 * from LRU lists system-wide, for given pass and priority.
2054 * number of reclaimed pages
2055 * 2058 *
2056 * For pass > 3 we also try to shrink the LRU lists that contain a few pages 2059 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
2057 */ 2060 */
2058static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, 2061static void shrink_all_zones(unsigned long nr_pages, int prio,
2059 int pass, struct scan_control *sc) 2062 int pass, struct scan_control *sc)
2060{ 2063{
2061 struct zone *zone; 2064 struct zone *zone;
2062 unsigned long ret = 0; 2065 unsigned long nr_reclaimed = 0;
2063 2066
2064 for_each_zone(zone) { 2067 for_each_populated_zone(zone) {
2065 enum lru_list l; 2068 enum lru_list l;
2066 2069
2067 if (!populated_zone(zone))
2068 continue;
2069 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) 2070 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
2070 continue; 2071 continue;
2071 2072
@@ -2084,14 +2085,16 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
2084 2085
2085 zone->lru[l].nr_scan = 0; 2086 zone->lru[l].nr_scan = 0;
2086 nr_to_scan = min(nr_pages, lru_pages); 2087 nr_to_scan = min(nr_pages, lru_pages);
2087 ret += shrink_list(l, nr_to_scan, zone, 2088 nr_reclaimed += shrink_list(l, nr_to_scan, zone,
2088 sc, prio); 2089 sc, prio);
2089 if (ret >= nr_pages) 2090 if (nr_reclaimed >= nr_pages) {
2090 return ret; 2091 sc->nr_reclaimed = nr_reclaimed;
2092 return;
2093 }
2091 } 2094 }
2092 } 2095 }
2093 } 2096 }
2094 return ret; 2097 sc->nr_reclaimed = nr_reclaimed;
2095} 2098}
2096 2099
2097/* 2100/*
@@ -2105,13 +2108,11 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
2105unsigned long shrink_all_memory(unsigned long nr_pages) 2108unsigned long shrink_all_memory(unsigned long nr_pages)
2106{ 2109{
2107 unsigned long lru_pages, nr_slab; 2110 unsigned long lru_pages, nr_slab;
2108 unsigned long ret = 0;
2109 int pass; 2111 int pass;
2110 struct reclaim_state reclaim_state; 2112 struct reclaim_state reclaim_state;
2111 struct scan_control sc = { 2113 struct scan_control sc = {
2112 .gfp_mask = GFP_KERNEL, 2114 .gfp_mask = GFP_KERNEL,
2113 .may_swap = 0, 2115 .may_unmap = 0,
2114 .swap_cluster_max = nr_pages,
2115 .may_writepage = 1, 2116 .may_writepage = 1,
2116 .isolate_pages = isolate_pages_global, 2117 .isolate_pages = isolate_pages_global,
2117 }; 2118 };
@@ -2127,8 +2128,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2127 if (!reclaim_state.reclaimed_slab) 2128 if (!reclaim_state.reclaimed_slab)
2128 break; 2129 break;
2129 2130
2130 ret += reclaim_state.reclaimed_slab; 2131 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2131 if (ret >= nr_pages) 2132 if (sc.nr_reclaimed >= nr_pages)
2132 goto out; 2133 goto out;
2133 2134
2134 nr_slab -= reclaim_state.reclaimed_slab; 2135 nr_slab -= reclaim_state.reclaimed_slab;
@@ -2147,21 +2148,22 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2147 2148
2148 /* Force reclaiming mapped pages in the passes #3 and #4 */ 2149 /* Force reclaiming mapped pages in the passes #3 and #4 */
2149 if (pass > 2) 2150 if (pass > 2)
2150 sc.may_swap = 1; 2151 sc.may_unmap = 1;
2151 2152
2152 for (prio = DEF_PRIORITY; prio >= 0; prio--) { 2153 for (prio = DEF_PRIORITY; prio >= 0; prio--) {
2153 unsigned long nr_to_scan = nr_pages - ret; 2154 unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed;
2154 2155
2155 sc.nr_scanned = 0; 2156 sc.nr_scanned = 0;
2156 ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); 2157 sc.swap_cluster_max = nr_to_scan;
2157 if (ret >= nr_pages) 2158 shrink_all_zones(nr_to_scan, prio, pass, &sc);
2159 if (sc.nr_reclaimed >= nr_pages)
2158 goto out; 2160 goto out;
2159 2161
2160 reclaim_state.reclaimed_slab = 0; 2162 reclaim_state.reclaimed_slab = 0;
2161 shrink_slab(sc.nr_scanned, sc.gfp_mask, 2163 shrink_slab(sc.nr_scanned, sc.gfp_mask,
2162 global_lru_pages()); 2164 global_lru_pages());
2163 ret += reclaim_state.reclaimed_slab; 2165 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2164 if (ret >= nr_pages) 2166 if (sc.nr_reclaimed >= nr_pages)
2165 goto out; 2167 goto out;
2166 2168
2167 if (sc.nr_scanned && prio < DEF_PRIORITY - 2) 2169 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
@@ -2170,21 +2172,23 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2170 } 2172 }
2171 2173
2172 /* 2174 /*
2173 * If ret = 0, we could not shrink LRUs, but there may be something 2175 * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be
2174 * in slab caches 2176 * something in slab caches
2175 */ 2177 */
2176 if (!ret) { 2178 if (!sc.nr_reclaimed) {
2177 do { 2179 do {
2178 reclaim_state.reclaimed_slab = 0; 2180 reclaim_state.reclaimed_slab = 0;
2179 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); 2181 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
2180 ret += reclaim_state.reclaimed_slab; 2182 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2181 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); 2183 } while (sc.nr_reclaimed < nr_pages &&
2184 reclaim_state.reclaimed_slab > 0);
2182 } 2185 }
2183 2186
2187
2184out: 2188out:
2185 current->reclaim_state = NULL; 2189 current->reclaim_state = NULL;
2186 2190
2187 return ret; 2191 return sc.nr_reclaimed;
2188} 2192}
2189#endif 2193#endif
2190 2194
@@ -2290,11 +2294,12 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2290 int priority; 2294 int priority;
2291 struct scan_control sc = { 2295 struct scan_control sc = {
2292 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 2296 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
2293 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), 2297 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
2294 .swap_cluster_max = max_t(unsigned long, nr_pages, 2298 .swap_cluster_max = max_t(unsigned long, nr_pages,
2295 SWAP_CLUSTER_MAX), 2299 SWAP_CLUSTER_MAX),
2296 .gfp_mask = gfp_mask, 2300 .gfp_mask = gfp_mask,
2297 .swappiness = vm_swappiness, 2301 .swappiness = vm_swappiness,
2302 .order = order,
2298 .isolate_pages = isolate_pages_global, 2303 .isolate_pages = isolate_pages_global,
2299 }; 2304 };
2300 unsigned long slab_reclaimable; 2305 unsigned long slab_reclaimable;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 91149746bb8d..9826766f1274 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -27,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
27 27
28 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); 28 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
29 29
30 for_each_cpu_mask_nr(cpu, *cpumask) { 30 for_each_cpu(cpu, cpumask) {
31 struct vm_event_state *this = &per_cpu(vm_event_states, cpu); 31 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
32 32
33 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 33 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
@@ -135,11 +135,7 @@ static void refresh_zone_stat_thresholds(void)
135 int cpu; 135 int cpu;
136 int threshold; 136 int threshold;
137 137
138 for_each_zone(zone) { 138 for_each_populated_zone(zone) {
139
140 if (!zone->present_pages)
141 continue;
142
143 threshold = calculate_threshold(zone); 139 threshold = calculate_threshold(zone);
144 140
145 for_each_online_cpu(cpu) 141 for_each_online_cpu(cpu)
@@ -301,12 +297,9 @@ void refresh_cpu_vm_stats(int cpu)
301 int i; 297 int i;
302 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; 298 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
303 299
304 for_each_zone(zone) { 300 for_each_populated_zone(zone) {
305 struct per_cpu_pageset *p; 301 struct per_cpu_pageset *p;
306 302
307 if (!populated_zone(zone))
308 continue;
309
310 p = zone_pcp(zone, cpu); 303 p = zone_pcp(zone, cpu);
311 304
312 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 305 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)