path: root/mm
author     J. Bruce Fields <bfields@citi.umich.edu>  2009-08-21 11:27:29 -0400
committer  J. Bruce Fields <bfields@citi.umich.edu>  2009-08-21 11:27:29 -0400
commit     e9dc122166b8d863d3057a66ada04838e5548e52 (patch)
tree       749e15bf719b64bf9113db7acd8e043d9742cb26 /mm
parent     560ab42ef923aaf2e4347315bdfcc74b2708972c (diff)
parent     405d8f8b1d936414da2093d4149ff790ff3f84a5 (diff)
Merge branch 'nfs-for-2.6.32' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6 into for-2.6.32-incoming
Conflicts:
	net/sunrpc/cache.c
Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c       7
-rw-r--r--  mm/bootmem.c           6
-rw-r--r--  mm/dmapool.c           2
-rw-r--r--  mm/filemap.c           1
-rw-r--r--  mm/hugetlb.c           2
-rw-r--r--  mm/kmemleak.c        409
-rw-r--r--  mm/memcontrol.c       25
-rw-r--r--  mm/memory.c           37
-rw-r--r--  mm/mempolicy.c        84
-rw-r--r--  mm/mempool.c           4
-rw-r--r--  mm/nommu.c            33
-rw-r--r--  mm/page-writeback.c   13
-rw-r--r--  mm/page_alloc.c       54
-rw-r--r--  mm/percpu.c           24
-rw-r--r--  mm/slab.c              8
-rw-r--r--  mm/slob.c              2
-rw-r--r--  mm/slub.c             12
-rw-r--r--  mm/swapfile.c          4
-rw-r--r--  mm/vmscan.c            8
19 files changed, 431 insertions, 304 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 493b468a5035..c86edd244294 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -283,7 +283,6 @@ static wait_queue_head_t congestion_wqh[2] = {
283 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 283 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
284 }; 284 };
285 285
286
287void clear_bdi_congested(struct backing_dev_info *bdi, int sync) 286void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
288{ 287{
289 enum bdi_state bit; 288 enum bdi_state bit;
@@ -308,18 +307,18 @@ EXPORT_SYMBOL(set_bdi_congested);
308 307
309/** 308/**
310 * congestion_wait - wait for a backing_dev to become uncongested 309 * congestion_wait - wait for a backing_dev to become uncongested
311 * @rw: READ or WRITE 310 * @sync: SYNC or ASYNC IO
312 * @timeout: timeout in jiffies 311 * @timeout: timeout in jiffies
313 * 312 *
314 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit 313 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
315 * write congestion. If no backing_devs are congested then just wait for the 314 * write congestion. If no backing_devs are congested then just wait for the
316 * next write to be completed. 315 * next write to be completed.
317 */ 316 */
318long congestion_wait(int rw, long timeout) 317long congestion_wait(int sync, long timeout)
319{ 318{
320 long ret; 319 long ret;
321 DEFINE_WAIT(wait); 320 DEFINE_WAIT(wait);
322 wait_queue_head_t *wqh = &congestion_wqh[rw]; 321 wait_queue_head_t *wqh = &congestion_wqh[sync];
323 322
324 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 323 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
325 ret = io_schedule_timeout(timeout); 324 ret = io_schedule_timeout(timeout);
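
The @rw parameter becomes @sync because the two congestion wait queues are now
indexed by sync/async rather than READ/WRITE. A minimal caller-side sketch of
the new convention (hypothetical helper, not part of this patch; BLK_RW_ASYNC
comes from the block layer, and a real caller is converted the same way in the
mm/memcontrol.c hunk further down):

    /*
     * Sketch: back off briefly while async (writeback) traffic is congested.
     * Before this series the equivalent call was congestion_wait(WRITE, HZ/10).
     */
    static void wait_for_writeback_congestion(void)
    {
            congestion_wait(BLK_RW_ASYNC, HZ/10);
    }
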
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d2a9ce952768..701740c9e81b 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -12,6 +12,7 @@
12#include <linux/pfn.h> 12#include <linux/pfn.h>
13#include <linux/bootmem.h> 13#include <linux/bootmem.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/kmemleak.h>
15 16
16#include <asm/bug.h> 17#include <asm/bug.h>
17#include <asm/io.h> 18#include <asm/io.h>
@@ -335,6 +336,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
335{ 336{
336 unsigned long start, end; 337 unsigned long start, end;
337 338
339 kmemleak_free_part(__va(physaddr), size);
340
338 start = PFN_UP(physaddr); 341 start = PFN_UP(physaddr);
339 end = PFN_DOWN(physaddr + size); 342 end = PFN_DOWN(physaddr + size);
340 343
@@ -354,6 +357,8 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
354{ 357{
355 unsigned long start, end; 358 unsigned long start, end;
356 359
360 kmemleak_free_part(__va(addr), size);
361
357 start = PFN_UP(addr); 362 start = PFN_UP(addr);
358 end = PFN_DOWN(addr + size); 363 end = PFN_DOWN(addr + size);
359 364
@@ -516,6 +521,7 @@ find_block:
516 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + 521 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
517 start_off); 522 start_off);
518 memset(region, 0, size); 523 memset(region, 0, size);
524 kmemleak_alloc(region, size, 1, 0);
519 return region; 525 return region;
520 } 526 }
521 527
diff --git a/mm/dmapool.c b/mm/dmapool.c
index b1f0885dda22..3df063706f53 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -86,10 +86,12 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf)
86 unsigned pages = 0; 86 unsigned pages = 0;
87 unsigned blocks = 0; 87 unsigned blocks = 0;
88 88
89 spin_lock_irq(&pool->lock);
89 list_for_each_entry(page, &pool->page_list, page_list) { 90 list_for_each_entry(page, &pool->page_list, page_list) {
90 pages++; 91 pages++;
91 blocks += page->in_use; 92 blocks += page->in_use;
92 } 93 }
94 spin_unlock_irq(&pool->lock);
93 95
94 /* per-pool info, no real statistics yet */ 96 /* per-pool info, no real statistics yet */
95 temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n", 97 temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n",
diff --git a/mm/filemap.c b/mm/filemap.c
index 22396713feb9..ccea3b665c12 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2272,6 +2272,7 @@ again:
2272 pagefault_enable(); 2272 pagefault_enable();
2273 flush_dcache_page(page); 2273 flush_dcache_page(page);
2274 2274
2275 mark_page_accessed(page);
2275 status = a_ops->write_end(file, mapping, pos, bytes, copied, 2276 status = a_ops->write_end(file, mapping, pos, bytes, copied,
2276 page, fsdata); 2277 page, fsdata);
2277 if (unlikely(status < 0)) 2278 if (unlikely(status < 0))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d0351e31f474..cafdcee154e8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2370,7 +2370,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2370 long chg = region_truncate(&inode->i_mapping->private_list, offset); 2370 long chg = region_truncate(&inode->i_mapping->private_list, offset);
2371 2371
2372 spin_lock(&inode->i_lock); 2372 spin_lock(&inode->i_lock);
2373 inode->i_blocks -= blocks_per_huge_page(h); 2373 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
2374 spin_unlock(&inode->i_lock); 2374 spin_unlock(&inode->i_lock);
2375 2375
2376 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2376 hugetlb_put_quota(inode->i_mapping, (chg - freed));
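
The hugetlb.c change fixes i_blocks accounting when several huge pages are
released at once: the subtraction now scales with @freed. Assuming the usual
512-byte i_blocks units and 2 MB huge pages (blocks_per_huge_page(h) == 4096),
truncating 4 reserved huge pages should drop i_blocks by 4 * 4096 = 16384;
before this fix it dropped by only 4096, no matter how many pages were freed.
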
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c96f2c8700aa..487267310a84 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -48,10 +48,10 @@
48 * scanned. This list is only modified during a scanning episode when the 48 * scanned. This list is only modified during a scanning episode when the
49 * scan_mutex is held. At the end of a scan, the gray_list is always empty. 49 * scan_mutex is held. At the end of a scan, the gray_list is always empty.
50 * Note that the kmemleak_object.use_count is incremented when an object is 50 * Note that the kmemleak_object.use_count is incremented when an object is
51 * added to the gray_list and therefore cannot be freed 51 * added to the gray_list and therefore cannot be freed. This mutex also
52 * - kmemleak_mutex (mutex): prevents multiple users of the "kmemleak" debugfs 52 * prevents multiple users of the "kmemleak" debugfs file together with
53 * file together with modifications to the memory scanning parameters 53 * modifications to the memory scanning parameters including the scan_thread
54 * including the scan_thread pointer 54 * pointer
55 * 55 *
56 * The kmemleak_object structures have a use_count incremented or decremented 56 * The kmemleak_object structures have a use_count incremented or decremented
57 * using the get_object()/put_object() functions. When the use_count becomes 57 * using the get_object()/put_object() functions. When the use_count becomes
@@ -103,11 +103,10 @@
103 * Kmemleak configuration and common defines. 103 * Kmemleak configuration and common defines.
104 */ 104 */
105#define MAX_TRACE 16 /* stack trace length */ 105#define MAX_TRACE 16 /* stack trace length */
106#define REPORTS_NR 50 /* maximum number of reported leaks */
107#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ 106#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
108#define MSECS_SCAN_YIELD 10 /* CPU yielding period */
109#define SECS_FIRST_SCAN 60 /* delay before the first scan */ 107#define SECS_FIRST_SCAN 60 /* delay before the first scan */
110#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ 108#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
109#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */
111 110
112#define BYTES_PER_POINTER sizeof(void *) 111#define BYTES_PER_POINTER sizeof(void *)
113 112
@@ -159,6 +158,8 @@ struct kmemleak_object {
159#define OBJECT_REPORTED (1 << 1) 158#define OBJECT_REPORTED (1 << 1)
160/* flag set to not scan the object */ 159/* flag set to not scan the object */
161#define OBJECT_NO_SCAN (1 << 2) 160#define OBJECT_NO_SCAN (1 << 2)
161/* flag set on newly allocated objects */
162#define OBJECT_NEW (1 << 3)
162 163
163/* the list of all allocated objects */ 164/* the list of all allocated objects */
164static LIST_HEAD(object_list); 165static LIST_HEAD(object_list);
@@ -186,22 +187,16 @@ static atomic_t kmemleak_error = ATOMIC_INIT(0);
186static unsigned long min_addr = ULONG_MAX; 187static unsigned long min_addr = ULONG_MAX;
187static unsigned long max_addr; 188static unsigned long max_addr;
188 189
189/* used for yielding the CPU to other tasks during scanning */
190static unsigned long next_scan_yield;
191static struct task_struct *scan_thread; 190static struct task_struct *scan_thread;
192static unsigned long jiffies_scan_yield; 191/* used to avoid reporting of recently allocated objects */
193static unsigned long jiffies_min_age; 192static unsigned long jiffies_min_age;
193static unsigned long jiffies_last_scan;
194/* delay between automatic memory scannings */ 194/* delay between automatic memory scannings */
195static signed long jiffies_scan_wait; 195static signed long jiffies_scan_wait;
196/* enables or disables the task stacks scanning */ 196/* enables or disables the task stacks scanning */
197static int kmemleak_stack_scan; 197static int kmemleak_stack_scan = 1;
198/* mutex protecting the memory scanning */ 198/* protects the memory scanning, parameters and debug/kmemleak file access */
199static DEFINE_MUTEX(scan_mutex); 199static DEFINE_MUTEX(scan_mutex);
200/* mutex protecting the access to the /sys/kernel/debug/kmemleak file */
201static DEFINE_MUTEX(kmemleak_mutex);
202
203/* number of leaks reported (for limitation purposes) */
204static int reported_leaks;
205 200
206/* 201/*
207 * Early object allocation/freeing logging. Kmemleak is initialized after the 202 * Early object allocation/freeing logging. Kmemleak is initialized after the
@@ -215,6 +210,7 @@ static int reported_leaks;
215enum { 210enum {
216 KMEMLEAK_ALLOC, 211 KMEMLEAK_ALLOC,
217 KMEMLEAK_FREE, 212 KMEMLEAK_FREE,
213 KMEMLEAK_FREE_PART,
218 KMEMLEAK_NOT_LEAK, 214 KMEMLEAK_NOT_LEAK,
219 KMEMLEAK_IGNORE, 215 KMEMLEAK_IGNORE,
220 KMEMLEAK_SCAN_AREA, 216 KMEMLEAK_SCAN_AREA,
@@ -235,7 +231,7 @@ struct early_log {
235}; 231};
236 232
237/* early logging buffer and current position */ 233/* early logging buffer and current position */
238static struct early_log early_log[200]; 234static struct early_log early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE];
239static int crt_early_log; 235static int crt_early_log;
240 236
241static void kmemleak_disable(void); 237static void kmemleak_disable(void);
@@ -278,13 +274,9 @@ static int color_gray(const struct kmemleak_object *object)
278 return object->min_count != -1 && object->count >= object->min_count; 274 return object->min_count != -1 && object->count >= object->min_count;
279} 275}
280 276
281/* 277static int color_black(const struct kmemleak_object *object)
282 * Objects are considered referenced if their color is gray and they have not
283 * been deleted.
284 */
285static int referenced_object(struct kmemleak_object *object)
286{ 278{
287 return (object->flags & OBJECT_ALLOCATED) && color_gray(object); 279 return object->min_count == -1;
288} 280}
289 281
290/* 282/*
@@ -295,42 +287,28 @@ static int referenced_object(struct kmemleak_object *object)
295static int unreferenced_object(struct kmemleak_object *object) 287static int unreferenced_object(struct kmemleak_object *object)
296{ 288{
297 return (object->flags & OBJECT_ALLOCATED) && color_white(object) && 289 return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
298 time_is_before_eq_jiffies(object->jiffies + jiffies_min_age); 290 time_before_eq(object->jiffies + jiffies_min_age,
291 jiffies_last_scan);
299} 292}
300 293
301/* 294/*
302 * Printing of the (un)referenced objects information, either to the seq file 295 * Printing of the unreferenced objects information to the seq file. The
303 * or to the kernel log. The print_referenced/print_unreferenced functions 296 * print_unreferenced function must be called with the object->lock held.
304 * must be called with the object->lock held.
305 */ 297 */
306#define print_helper(seq, x...) do { \
307 struct seq_file *s = (seq); \
308 if (s) \
309 seq_printf(s, x); \
310 else \
311 pr_info(x); \
312} while (0)
313
314static void print_referenced(struct kmemleak_object *object)
315{
316 pr_info("referenced object 0x%08lx (size %zu)\n",
317 object->pointer, object->size);
318}
319
320static void print_unreferenced(struct seq_file *seq, 298static void print_unreferenced(struct seq_file *seq,
321 struct kmemleak_object *object) 299 struct kmemleak_object *object)
322{ 300{
323 int i; 301 int i;
324 302
325 print_helper(seq, "unreferenced object 0x%08lx (size %zu):\n", 303 seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
326 object->pointer, object->size); 304 object->pointer, object->size);
327 print_helper(seq, " comm \"%s\", pid %d, jiffies %lu\n", 305 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n",
328 object->comm, object->pid, object->jiffies); 306 object->comm, object->pid, object->jiffies);
329 print_helper(seq, " backtrace:\n"); 307 seq_printf(seq, " backtrace:\n");
330 308
331 for (i = 0; i < object->trace_len; i++) { 309 for (i = 0; i < object->trace_len; i++) {
332 void *ptr = (void *)object->trace[i]; 310 void *ptr = (void *)object->trace[i];
333 print_helper(seq, " [<%p>] %pS\n", ptr, ptr); 311 seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
334 } 312 }
335} 313}
336 314
@@ -478,7 +456,7 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
478 INIT_HLIST_HEAD(&object->area_list); 456 INIT_HLIST_HEAD(&object->area_list);
479 spin_lock_init(&object->lock); 457 spin_lock_init(&object->lock);
480 atomic_set(&object->use_count, 1); 458 atomic_set(&object->use_count, 1);
481 object->flags = OBJECT_ALLOCATED; 459 object->flags = OBJECT_ALLOCATED | OBJECT_NEW;
482 object->pointer = ptr; 460 object->pointer = ptr;
483 object->size = size; 461 object->size = size;
484 object->min_count = min_count; 462 object->min_count = min_count;
@@ -546,39 +524,87 @@ out:
546 * Remove the metadata (struct kmemleak_object) for a memory block from the 524 * Remove the metadata (struct kmemleak_object) for a memory block from the
547 * object_list and object_tree_root and decrement its use_count. 525 * object_list and object_tree_root and decrement its use_count.
548 */ 526 */
549static void delete_object(unsigned long ptr) 527static void __delete_object(struct kmemleak_object *object)
550{ 528{
551 unsigned long flags; 529 unsigned long flags;
552 struct kmemleak_object *object;
553 530
554 write_lock_irqsave(&kmemleak_lock, flags); 531 write_lock_irqsave(&kmemleak_lock, flags);
555 object = lookup_object(ptr, 0);
556 if (!object) {
557 kmemleak_warn("Freeing unknown object at 0x%08lx\n",
558 ptr);
559 write_unlock_irqrestore(&kmemleak_lock, flags);
560 return;
561 }
562 prio_tree_remove(&object_tree_root, &object->tree_node); 532 prio_tree_remove(&object_tree_root, &object->tree_node);
563 list_del_rcu(&object->object_list); 533 list_del_rcu(&object->object_list);
564 write_unlock_irqrestore(&kmemleak_lock, flags); 534 write_unlock_irqrestore(&kmemleak_lock, flags);
565 535
566 WARN_ON(!(object->flags & OBJECT_ALLOCATED)); 536 WARN_ON(!(object->flags & OBJECT_ALLOCATED));
567 WARN_ON(atomic_read(&object->use_count) < 1); 537 WARN_ON(atomic_read(&object->use_count) < 2);
568 538
569 /* 539 /*
570 * Locking here also ensures that the corresponding memory block 540 * Locking here also ensures that the corresponding memory block
571 * cannot be freed when it is being scanned. 541 * cannot be freed when it is being scanned.
572 */ 542 */
573 spin_lock_irqsave(&object->lock, flags); 543 spin_lock_irqsave(&object->lock, flags);
574 if (object->flags & OBJECT_REPORTED)
575 print_referenced(object);
576 object->flags &= ~OBJECT_ALLOCATED; 544 object->flags &= ~OBJECT_ALLOCATED;
577 spin_unlock_irqrestore(&object->lock, flags); 545 spin_unlock_irqrestore(&object->lock, flags);
578 put_object(object); 546 put_object(object);
579} 547}
580 548
581/* 549/*
550 * Look up the metadata (struct kmemleak_object) corresponding to ptr and
551 * delete it.
552 */
553static void delete_object_full(unsigned long ptr)
554{
555 struct kmemleak_object *object;
556
557 object = find_and_get_object(ptr, 0);
558 if (!object) {
559#ifdef DEBUG
560 kmemleak_warn("Freeing unknown object at 0x%08lx\n",
561 ptr);
562#endif
563 return;
564 }
565 __delete_object(object);
566 put_object(object);
567}
568
569/*
570 * Look up the metadata (struct kmemleak_object) corresponding to ptr and
571 * delete it. If the memory block is partially freed, the function may create
572 * additional metadata for the remaining parts of the block.
573 */
574static void delete_object_part(unsigned long ptr, size_t size)
575{
576 struct kmemleak_object *object;
577 unsigned long start, end;
578
579 object = find_and_get_object(ptr, 1);
580 if (!object) {
581#ifdef DEBUG
582 kmemleak_warn("Partially freeing unknown object at 0x%08lx "
583 "(size %zu)\n", ptr, size);
584#endif
585 return;
586 }
587 __delete_object(object);
588
589 /*
590 * Create one or two objects that may result from the memory block
591 * split. Note that partial freeing is only done by free_bootmem() and
592 * this happens before kmemleak_init() is called. The path below is
593 * only executed during early log recording in kmemleak_init(), so
594 * GFP_KERNEL is enough.
595 */
596 start = object->pointer;
597 end = object->pointer + object->size;
598 if (ptr > start)
599 create_object(start, ptr - start, object->min_count,
600 GFP_KERNEL);
601 if (ptr + size < end)
602 create_object(ptr + size, end - ptr - size, object->min_count,
603 GFP_KERNEL);
604
605 put_object(object);
606}
607/*
582 * Make a object permanently as gray-colored so that it can no longer be 608 * Make a object permanently as gray-colored so that it can no longer be
583 * reported as a leak. This is used in general to mark a false positive. 609 * reported as a leak. This is used in general to mark a false positive.
584 */ 610 */
@@ -696,7 +722,8 @@ static void log_early(int op_type, const void *ptr, size_t size,
696 struct early_log *log; 722 struct early_log *log;
697 723
698 if (crt_early_log >= ARRAY_SIZE(early_log)) { 724 if (crt_early_log >= ARRAY_SIZE(early_log)) {
699 kmemleak_stop("Early log buffer exceeded\n"); 725 pr_warning("Early log buffer exceeded\n");
726 kmemleak_disable();
700 return; 727 return;
701 } 728 }
702 729
@@ -741,13 +768,28 @@ void kmemleak_free(const void *ptr)
741 pr_debug("%s(0x%p)\n", __func__, ptr); 768 pr_debug("%s(0x%p)\n", __func__, ptr);
742 769
743 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 770 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
744 delete_object((unsigned long)ptr); 771 delete_object_full((unsigned long)ptr);
745 else if (atomic_read(&kmemleak_early_log)) 772 else if (atomic_read(&kmemleak_early_log))
746 log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); 773 log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0);
747} 774}
748EXPORT_SYMBOL_GPL(kmemleak_free); 775EXPORT_SYMBOL_GPL(kmemleak_free);
749 776
750/* 777/*
778 * Partial memory freeing function callback. This function is usually called
779 * from bootmem allocator when (part of) a memory block is freed.
780 */
781void kmemleak_free_part(const void *ptr, size_t size)
782{
783 pr_debug("%s(0x%p)\n", __func__, ptr);
784
785 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
786 delete_object_part((unsigned long)ptr, size);
787 else if (atomic_read(&kmemleak_early_log))
788 log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0);
789}
790EXPORT_SYMBOL_GPL(kmemleak_free_part);
791
792/*
751 * Mark an already allocated memory block as a false positive. This will cause 793 * Mark an already allocated memory block as a false positive. This will cause
752 * the block to no longer be reported as leak and always be scanned. 794 * the block to no longer be reported as leak and always be scanned.
753 */ 795 */
@@ -808,21 +850,6 @@ void kmemleak_no_scan(const void *ptr)
808EXPORT_SYMBOL(kmemleak_no_scan); 850EXPORT_SYMBOL(kmemleak_no_scan);
809 851
810/* 852/*
811 * Yield the CPU so that other tasks get a chance to run. The yielding is
812 * rate-limited to avoid excessive number of calls to the schedule() function
813 * during memory scanning.
814 */
815static void scan_yield(void)
816{
817 might_sleep();
818
819 if (time_is_before_eq_jiffies(next_scan_yield)) {
820 schedule();
821 next_scan_yield = jiffies + jiffies_scan_yield;
822 }
823}
824
825/*
826 * Memory scanning is a long process and it needs to be interruptable. This 853 * Memory scanning is a long process and it needs to be interruptable. This
827 * function checks whether such interrupt condition occured. 854 * function checks whether such interrupt condition occured.
828 */ 855 */
@@ -848,7 +875,7 @@ static int scan_should_stop(void)
848 * found to the gray list. 875 * found to the gray list.
849 */ 876 */
850static void scan_block(void *_start, void *_end, 877static void scan_block(void *_start, void *_end,
851 struct kmemleak_object *scanned) 878 struct kmemleak_object *scanned, int allow_resched)
852{ 879{
853 unsigned long *ptr; 880 unsigned long *ptr;
854 unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); 881 unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
@@ -859,18 +886,11 @@ static void scan_block(void *_start, void *_end,
859 unsigned long pointer = *ptr; 886 unsigned long pointer = *ptr;
860 struct kmemleak_object *object; 887 struct kmemleak_object *object;
861 888
889 if (allow_resched)
890 cond_resched();
862 if (scan_should_stop()) 891 if (scan_should_stop())
863 break; 892 break;
864 893
865 /*
866 * When scanning a memory block with a corresponding
867 * kmemleak_object, the CPU yielding is handled in the calling
868 * code since it holds the object->lock to avoid the block
869 * freeing.
870 */
871 if (!scanned)
872 scan_yield();
873
874 object = find_and_get_object(pointer, 1); 894 object = find_and_get_object(pointer, 1);
875 if (!object) 895 if (!object)
876 continue; 896 continue;
@@ -931,12 +951,12 @@ static void scan_object(struct kmemleak_object *object)
931 goto out; 951 goto out;
932 if (hlist_empty(&object->area_list)) 952 if (hlist_empty(&object->area_list))
933 scan_block((void *)object->pointer, 953 scan_block((void *)object->pointer,
934 (void *)(object->pointer + object->size), object); 954 (void *)(object->pointer + object->size), object, 0);
935 else 955 else
936 hlist_for_each_entry(area, elem, &object->area_list, node) 956 hlist_for_each_entry(area, elem, &object->area_list, node)
937 scan_block((void *)(object->pointer + area->offset), 957 scan_block((void *)(object->pointer + area->offset),
938 (void *)(object->pointer + area->offset 958 (void *)(object->pointer + area->offset
939 + area->length), object); 959 + area->length), object, 0);
940out: 960out:
941 spin_unlock_irqrestore(&object->lock, flags); 961 spin_unlock_irqrestore(&object->lock, flags);
942} 962}
@@ -952,6 +972,10 @@ static void kmemleak_scan(void)
952 struct kmemleak_object *object, *tmp; 972 struct kmemleak_object *object, *tmp;
953 struct task_struct *task; 973 struct task_struct *task;
954 int i; 974 int i;
975 int new_leaks = 0;
976 int gray_list_pass = 0;
977
978 jiffies_last_scan = jiffies;
955 979
956 /* prepare the kmemleak_object's */ 980 /* prepare the kmemleak_object's */
957 rcu_read_lock(); 981 rcu_read_lock();
@@ -970,6 +994,7 @@ static void kmemleak_scan(void)
970#endif 994#endif
971 /* reset the reference count (whiten the object) */ 995 /* reset the reference count (whiten the object) */
972 object->count = 0; 996 object->count = 0;
997 object->flags &= ~OBJECT_NEW;
973 if (color_gray(object) && get_object(object)) 998 if (color_gray(object) && get_object(object))
974 list_add_tail(&object->gray_list, &gray_list); 999 list_add_tail(&object->gray_list, &gray_list);
975 1000
@@ -978,14 +1003,14 @@ static void kmemleak_scan(void)
978 rcu_read_unlock(); 1003 rcu_read_unlock();
979 1004
980 /* data/bss scanning */ 1005 /* data/bss scanning */
981 scan_block(_sdata, _edata, NULL); 1006 scan_block(_sdata, _edata, NULL, 1);
982 scan_block(__bss_start, __bss_stop, NULL); 1007 scan_block(__bss_start, __bss_stop, NULL, 1);
983 1008
984#ifdef CONFIG_SMP 1009#ifdef CONFIG_SMP
985 /* per-cpu sections scanning */ 1010 /* per-cpu sections scanning */
986 for_each_possible_cpu(i) 1011 for_each_possible_cpu(i)
987 scan_block(__per_cpu_start + per_cpu_offset(i), 1012 scan_block(__per_cpu_start + per_cpu_offset(i),
988 __per_cpu_end + per_cpu_offset(i), NULL); 1013 __per_cpu_end + per_cpu_offset(i), NULL, 1);
989#endif 1014#endif
990 1015
991 /* 1016 /*
@@ -1007,7 +1032,7 @@ static void kmemleak_scan(void)
1007 /* only scan if page is in use */ 1032 /* only scan if page is in use */
1008 if (page_count(page) == 0) 1033 if (page_count(page) == 0)
1009 continue; 1034 continue;
1010 scan_block(page, page + 1, NULL); 1035 scan_block(page, page + 1, NULL, 1);
1011 } 1036 }
1012 } 1037 }
1013 1038
@@ -1019,7 +1044,8 @@ static void kmemleak_scan(void)
1019 read_lock(&tasklist_lock); 1044 read_lock(&tasklist_lock);
1020 for_each_process(task) 1045 for_each_process(task)
1021 scan_block(task_stack_page(task), 1046 scan_block(task_stack_page(task),
1022 task_stack_page(task) + THREAD_SIZE, NULL); 1047 task_stack_page(task) + THREAD_SIZE,
1048 NULL, 0);
1023 read_unlock(&tasklist_lock); 1049 read_unlock(&tasklist_lock);
1024 } 1050 }
1025 1051
@@ -1031,9 +1057,10 @@ static void kmemleak_scan(void)
1031 * kmemleak objects cannot be freed from outside the loop because their 1057 * kmemleak objects cannot be freed from outside the loop because their
1032 * use_count was increased. 1058 * use_count was increased.
1033 */ 1059 */
1060repeat:
1034 object = list_entry(gray_list.next, typeof(*object), gray_list); 1061 object = list_entry(gray_list.next, typeof(*object), gray_list);
1035 while (&object->gray_list != &gray_list) { 1062 while (&object->gray_list != &gray_list) {
1036 scan_yield(); 1063 cond_resched();
1037 1064
1038 /* may add new objects to the list */ 1065 /* may add new objects to the list */
1039 if (!scan_should_stop()) 1066 if (!scan_should_stop())
@@ -1048,7 +1075,59 @@ static void kmemleak_scan(void)
1048 1075
1049 object = tmp; 1076 object = tmp;
1050 } 1077 }
1078
1079 if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES)
1080 goto scan_end;
1081
1082 /*
1083 * Check for new objects allocated during this scanning and add them
1084 * to the gray list.
1085 */
1086 rcu_read_lock();
1087 list_for_each_entry_rcu(object, &object_list, object_list) {
1088 spin_lock_irqsave(&object->lock, flags);
1089 if ((object->flags & OBJECT_NEW) && !color_black(object) &&
1090 get_object(object)) {
1091 object->flags &= ~OBJECT_NEW;
1092 list_add_tail(&object->gray_list, &gray_list);
1093 }
1094 spin_unlock_irqrestore(&object->lock, flags);
1095 }
1096 rcu_read_unlock();
1097
1098 if (!list_empty(&gray_list))
1099 goto repeat;
1100
1101scan_end:
1051 WARN_ON(!list_empty(&gray_list)); 1102 WARN_ON(!list_empty(&gray_list));
1103
1104 /*
1105 * If scanning was stopped or new objects were being allocated at a
1106 * higher rate than gray list scanning, do not report any new
1107 * unreferenced objects.
1108 */
1109 if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES)
1110 return;
1111
1112 /*
1113 * Scanning result reporting.
1114 */
1115 rcu_read_lock();
1116 list_for_each_entry_rcu(object, &object_list, object_list) {
1117 spin_lock_irqsave(&object->lock, flags);
1118 if (unreferenced_object(object) &&
1119 !(object->flags & OBJECT_REPORTED)) {
1120 object->flags |= OBJECT_REPORTED;
1121 new_leaks++;
1122 }
1123 spin_unlock_irqrestore(&object->lock, flags);
1124 }
1125 rcu_read_unlock();
1126
1127 if (new_leaks)
1128 pr_info("%d new suspected memory leaks (see "
1129 "/sys/kernel/debug/kmemleak)\n", new_leaks);
1130
1052} 1131}
1053 1132
1054/* 1133/*
@@ -1060,6 +1139,7 @@ static int kmemleak_scan_thread(void *arg)
1060 static int first_run = 1; 1139 static int first_run = 1;
1061 1140
1062 pr_info("Automatic memory scanning thread started\n"); 1141 pr_info("Automatic memory scanning thread started\n");
1142 set_user_nice(current, 10);
1063 1143
1064 /* 1144 /*
1065 * Wait before the first scan to allow the system to fully initialize. 1145 * Wait before the first scan to allow the system to fully initialize.
@@ -1070,36 +1150,12 @@ static int kmemleak_scan_thread(void *arg)
1070 } 1150 }
1071 1151
1072 while (!kthread_should_stop()) { 1152 while (!kthread_should_stop()) {
1073 struct kmemleak_object *object;
1074 signed long timeout = jiffies_scan_wait; 1153 signed long timeout = jiffies_scan_wait;
1075 1154
1076 mutex_lock(&scan_mutex); 1155 mutex_lock(&scan_mutex);
1077
1078 kmemleak_scan(); 1156 kmemleak_scan();
1079 reported_leaks = 0;
1080
1081 rcu_read_lock();
1082 list_for_each_entry_rcu(object, &object_list, object_list) {
1083 unsigned long flags;
1084
1085 if (reported_leaks >= REPORTS_NR)
1086 break;
1087 spin_lock_irqsave(&object->lock, flags);
1088 if (!(object->flags & OBJECT_REPORTED) &&
1089 unreferenced_object(object)) {
1090 print_unreferenced(NULL, object);
1091 object->flags |= OBJECT_REPORTED;
1092 reported_leaks++;
1093 } else if ((object->flags & OBJECT_REPORTED) &&
1094 referenced_object(object)) {
1095 print_referenced(object);
1096 object->flags &= ~OBJECT_REPORTED;
1097 }
1098 spin_unlock_irqrestore(&object->lock, flags);
1099 }
1100 rcu_read_unlock();
1101
1102 mutex_unlock(&scan_mutex); 1157 mutex_unlock(&scan_mutex);
1158
1103 /* wait before the next scan */ 1159 /* wait before the next scan */
1104 while (timeout && !kthread_should_stop()) 1160 while (timeout && !kthread_should_stop())
1105 timeout = schedule_timeout_interruptible(timeout); 1161 timeout = schedule_timeout_interruptible(timeout);
@@ -1112,7 +1168,7 @@ static int kmemleak_scan_thread(void *arg)
1112 1168
1113/* 1169/*
1114 * Start the automatic memory scanning thread. This function must be called 1170 * Start the automatic memory scanning thread. This function must be called
1115 * with the kmemleak_mutex held. 1171 * with the scan_mutex held.
1116 */ 1172 */
1117void start_scan_thread(void) 1173void start_scan_thread(void)
1118{ 1174{
@@ -1127,7 +1183,7 @@ void start_scan_thread(void)
1127 1183
1128/* 1184/*
1129 * Stop the automatic memory scanning thread. This function must be called 1185 * Stop the automatic memory scanning thread. This function must be called
1130 * with the kmemleak_mutex held. 1186 * with the scan_mutex held.
1131 */ 1187 */
1132void stop_scan_thread(void) 1188void stop_scan_thread(void)
1133{ 1189{
@@ -1146,13 +1202,11 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
1146{ 1202{
1147 struct kmemleak_object *object; 1203 struct kmemleak_object *object;
1148 loff_t n = *pos; 1204 loff_t n = *pos;
1205 int err;
1149 1206
1150 if (!n) { 1207 err = mutex_lock_interruptible(&scan_mutex);
1151 kmemleak_scan(); 1208 if (err < 0)
1152 reported_leaks = 0; 1209 return ERR_PTR(err);
1153 }
1154 if (reported_leaks >= REPORTS_NR)
1155 return NULL;
1156 1210
1157 rcu_read_lock(); 1211 rcu_read_lock();
1158 list_for_each_entry_rcu(object, &object_list, object_list) { 1212 list_for_each_entry_rcu(object, &object_list, object_list) {
@@ -1163,7 +1217,6 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
1163 } 1217 }
1164 object = NULL; 1218 object = NULL;
1165out: 1219out:
1166 rcu_read_unlock();
1167 return object; 1220 return object;
1168} 1221}
1169 1222
@@ -1178,17 +1231,13 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1178 struct list_head *n = &prev_obj->object_list; 1231 struct list_head *n = &prev_obj->object_list;
1179 1232
1180 ++(*pos); 1233 ++(*pos);
1181 if (reported_leaks >= REPORTS_NR)
1182 goto out;
1183 1234
1184 rcu_read_lock();
1185 list_for_each_continue_rcu(n, &object_list) { 1235 list_for_each_continue_rcu(n, &object_list) {
1186 next_obj = list_entry(n, struct kmemleak_object, object_list); 1236 next_obj = list_entry(n, struct kmemleak_object, object_list);
1187 if (get_object(next_obj)) 1237 if (get_object(next_obj))
1188 break; 1238 break;
1189 } 1239 }
1190 rcu_read_unlock(); 1240
1191out:
1192 put_object(prev_obj); 1241 put_object(prev_obj);
1193 return next_obj; 1242 return next_obj;
1194} 1243}
@@ -1198,8 +1247,16 @@ out:
1198 */ 1247 */
1199static void kmemleak_seq_stop(struct seq_file *seq, void *v) 1248static void kmemleak_seq_stop(struct seq_file *seq, void *v)
1200{ 1249{
1201 if (v) 1250 if (!IS_ERR(v)) {
1202 put_object(v); 1251 /*
1252 * kmemleak_seq_start may return ERR_PTR if the scan_mutex
1253 * waiting was interrupted, so only release it if !IS_ERR.
1254 */
1255 rcu_read_unlock();
1256 mutex_unlock(&scan_mutex);
1257 if (v)
1258 put_object(v);
1259 }
1203} 1260}
1204 1261
1205/* 1262/*
@@ -1211,11 +1268,8 @@ static int kmemleak_seq_show(struct seq_file *seq, void *v)
1211 unsigned long flags; 1268 unsigned long flags;
1212 1269
1213 spin_lock_irqsave(&object->lock, flags); 1270 spin_lock_irqsave(&object->lock, flags);
1214 if (!unreferenced_object(object)) 1271 if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object))
1215 goto out; 1272 print_unreferenced(seq, object);
1216 print_unreferenced(seq, object);
1217 reported_leaks++;
1218out:
1219 spin_unlock_irqrestore(&object->lock, flags); 1273 spin_unlock_irqrestore(&object->lock, flags);
1220 return 0; 1274 return 0;
1221} 1275}
@@ -1229,43 +1283,15 @@ static const struct seq_operations kmemleak_seq_ops = {
1229 1283
1230static int kmemleak_open(struct inode *inode, struct file *file) 1284static int kmemleak_open(struct inode *inode, struct file *file)
1231{ 1285{
1232 int ret = 0;
1233
1234 if (!atomic_read(&kmemleak_enabled)) 1286 if (!atomic_read(&kmemleak_enabled))
1235 return -EBUSY; 1287 return -EBUSY;
1236 1288
1237 ret = mutex_lock_interruptible(&kmemleak_mutex); 1289 return seq_open(file, &kmemleak_seq_ops);
1238 if (ret < 0)
1239 goto out;
1240 if (file->f_mode & FMODE_READ) {
1241 ret = mutex_lock_interruptible(&scan_mutex);
1242 if (ret < 0)
1243 goto kmemleak_unlock;
1244 ret = seq_open(file, &kmemleak_seq_ops);
1245 if (ret < 0)
1246 goto scan_unlock;
1247 }
1248 return ret;
1249
1250scan_unlock:
1251 mutex_unlock(&scan_mutex);
1252kmemleak_unlock:
1253 mutex_unlock(&kmemleak_mutex);
1254out:
1255 return ret;
1256} 1290}
1257 1291
1258static int kmemleak_release(struct inode *inode, struct file *file) 1292static int kmemleak_release(struct inode *inode, struct file *file)
1259{ 1293{
1260 int ret = 0; 1294 return seq_release(inode, file);
1261
1262 if (file->f_mode & FMODE_READ) {
1263 seq_release(inode, file);
1264 mutex_unlock(&scan_mutex);
1265 }
1266 mutex_unlock(&kmemleak_mutex);
1267
1268 return ret;
1269} 1295}
1270 1296
1271/* 1297/*
@@ -1278,21 +1304,24 @@ static int kmemleak_release(struct inode *inode, struct file *file)
1278 * scan=off - stop the automatic memory scanning thread 1304 * scan=off - stop the automatic memory scanning thread
1279 * scan=... - set the automatic memory scanning period in seconds (0 to 1305 * scan=... - set the automatic memory scanning period in seconds (0 to
1280 * disable it) 1306 * disable it)
1307 * scan - trigger a memory scan
1281 */ 1308 */
1282static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, 1309static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1283 size_t size, loff_t *ppos) 1310 size_t size, loff_t *ppos)
1284{ 1311{
1285 char buf[64]; 1312 char buf[64];
1286 int buf_size; 1313 int buf_size;
1287 1314 int ret;
1288 if (!atomic_read(&kmemleak_enabled))
1289 return -EBUSY;
1290 1315
1291 buf_size = min(size, (sizeof(buf) - 1)); 1316 buf_size = min(size, (sizeof(buf) - 1));
1292 if (strncpy_from_user(buf, user_buf, buf_size) < 0) 1317 if (strncpy_from_user(buf, user_buf, buf_size) < 0)
1293 return -EFAULT; 1318 return -EFAULT;
1294 buf[buf_size] = 0; 1319 buf[buf_size] = 0;
1295 1320
1321 ret = mutex_lock_interruptible(&scan_mutex);
1322 if (ret < 0)
1323 return ret;
1324
1296 if (strncmp(buf, "off", 3) == 0) 1325 if (strncmp(buf, "off", 3) == 0)
1297 kmemleak_disable(); 1326 kmemleak_disable();
1298 else if (strncmp(buf, "stack=on", 8) == 0) 1327 else if (strncmp(buf, "stack=on", 8) == 0)
@@ -1305,18 +1334,24 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1305 stop_scan_thread(); 1334 stop_scan_thread();
1306 else if (strncmp(buf, "scan=", 5) == 0) { 1335 else if (strncmp(buf, "scan=", 5) == 0) {
1307 unsigned long secs; 1336 unsigned long secs;
1308 int err;
1309 1337
1310 err = strict_strtoul(buf + 5, 0, &secs); 1338 ret = strict_strtoul(buf + 5, 0, &secs);
1311 if (err < 0) 1339 if (ret < 0)
1312 return err; 1340 goto out;
1313 stop_scan_thread(); 1341 stop_scan_thread();
1314 if (secs) { 1342 if (secs) {
1315 jiffies_scan_wait = msecs_to_jiffies(secs * 1000); 1343 jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
1316 start_scan_thread(); 1344 start_scan_thread();
1317 } 1345 }
1318 } else 1346 } else if (strncmp(buf, "scan", 4) == 0)
1319 return -EINVAL; 1347 kmemleak_scan();
1348 else
1349 ret = -EINVAL;
1350
1351out:
1352 mutex_unlock(&scan_mutex);
1353 if (ret < 0)
1354 return ret;
1320 1355
1321 /* ignore the rest of the buffer, only one command at a time */ 1356 /* ignore the rest of the buffer, only one command at a time */
1322 *ppos += size; 1357 *ppos += size;
@@ -1340,14 +1375,12 @@ static int kmemleak_cleanup_thread(void *arg)
1340{ 1375{
1341 struct kmemleak_object *object; 1376 struct kmemleak_object *object;
1342 1377
1343 mutex_lock(&kmemleak_mutex); 1378 mutex_lock(&scan_mutex);
1344 stop_scan_thread(); 1379 stop_scan_thread();
1345 mutex_unlock(&kmemleak_mutex);
1346 1380
1347 mutex_lock(&scan_mutex);
1348 rcu_read_lock(); 1381 rcu_read_lock();
1349 list_for_each_entry_rcu(object, &object_list, object_list) 1382 list_for_each_entry_rcu(object, &object_list, object_list)
1350 delete_object(object->pointer); 1383 delete_object_full(object->pointer);
1351 rcu_read_unlock(); 1384 rcu_read_unlock();
1352 mutex_unlock(&scan_mutex); 1385 mutex_unlock(&scan_mutex);
1353 1386
@@ -1411,7 +1444,6 @@ void __init kmemleak_init(void)
1411 int i; 1444 int i;
1412 unsigned long flags; 1445 unsigned long flags;
1413 1446
1414 jiffies_scan_yield = msecs_to_jiffies(MSECS_SCAN_YIELD);
1415 jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); 1447 jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
1416 jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); 1448 jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
1417 1449
@@ -1443,6 +1475,9 @@ void __init kmemleak_init(void)
1443 case KMEMLEAK_FREE: 1475 case KMEMLEAK_FREE:
1444 kmemleak_free(log->ptr); 1476 kmemleak_free(log->ptr);
1445 break; 1477 break;
1478 case KMEMLEAK_FREE_PART:
1479 kmemleak_free_part(log->ptr, log->size);
1480 break;
1446 case KMEMLEAK_NOT_LEAK: 1481 case KMEMLEAK_NOT_LEAK:
1447 kmemleak_not_leak(log->ptr); 1482 kmemleak_not_leak(log->ptr);
1448 break; 1483 break;
@@ -1486,9 +1521,9 @@ static int __init kmemleak_late_init(void)
1486 &kmemleak_fops); 1521 &kmemleak_fops);
1487 if (!dentry) 1522 if (!dentry)
1488 pr_warning("Failed to create the debugfs kmemleak file\n"); 1523 pr_warning("Failed to create the debugfs kmemleak file\n");
1489 mutex_lock(&kmemleak_mutex); 1524 mutex_lock(&scan_mutex);
1490 start_scan_thread(); 1525 start_scan_thread();
1491 mutex_unlock(&kmemleak_mutex); 1526 mutex_unlock(&scan_mutex);
1492 1527
1493 pr_info("Kernel memory leak detector initialized\n"); 1528 pr_info("Kernel memory leak detector initialized\n");
1494 1529
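
With the kmemleak rework above, scan_mutex alone now serializes the scanner,
the debugfs file and the tuning parameters, and kmemleak_write() gains an
on-demand "scan" command. A usage sketch based on the commands listed in the
hunk above (assuming debugfs is mounted at /sys/kernel/debug, the path used in
the patch's own messages):

    # trigger an immediate memory scan, then list any newly reported leaks
    echo scan > /sys/kernel/debug/kmemleak
    cat /sys/kernel/debug/kmemleak
    # set the automatic scan period to 600 seconds (0 disables the scan thread)
    echo scan=600 > /sys/kernel/debug/kmemleak
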
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e2fa20dadf40..fd4529d86de5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1207,6 +1207,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1207 ret = 0; 1207 ret = 0;
1208out: 1208out:
1209 unlock_page_cgroup(pc); 1209 unlock_page_cgroup(pc);
1210 /*
1211 * We charges against "to" which may not have any tasks. Then, "to"
1212 * can be under rmdir(). But in current implementation, caller of
1213 * this function is just force_empty() and it's garanteed that
1214 * "to" is never removed. So, we don't check rmdir status here.
1215 */
1210 return ret; 1216 return ret;
1211} 1217}
1212 1218
@@ -1428,6 +1434,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1428 return; 1434 return;
1429 if (!ptr) 1435 if (!ptr)
1430 return; 1436 return;
1437 cgroup_exclude_rmdir(&ptr->css);
1431 pc = lookup_page_cgroup(page); 1438 pc = lookup_page_cgroup(page);
1432 mem_cgroup_lru_del_before_commit_swapcache(page); 1439 mem_cgroup_lru_del_before_commit_swapcache(page);
1433 __mem_cgroup_commit_charge(ptr, pc, ctype); 1440 __mem_cgroup_commit_charge(ptr, pc, ctype);
@@ -1457,8 +1464,12 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1457 } 1464 }
1458 rcu_read_unlock(); 1465 rcu_read_unlock();
1459 } 1466 }
1460 /* add this page(page_cgroup) to the LRU we want. */ 1467 /*
1461 1468 * At swapin, we may charge account against cgroup which has no tasks.
1469 * So, rmdir()->pre_destroy() can be called while we do this charge.
1470 * In that case, we need to call pre_destroy() again. check it here.
1471 */
1472 cgroup_release_and_wakeup_rmdir(&ptr->css);
1462} 1473}
1463 1474
1464void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 1475void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
@@ -1664,7 +1675,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
1664 1675
1665 if (!mem) 1676 if (!mem)
1666 return; 1677 return;
1667 1678 cgroup_exclude_rmdir(&mem->css);
1668 /* at migration success, oldpage->mapping is NULL. */ 1679 /* at migration success, oldpage->mapping is NULL. */
1669 if (oldpage->mapping) { 1680 if (oldpage->mapping) {
1670 target = oldpage; 1681 target = oldpage;
@@ -1704,6 +1715,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
1704 */ 1715 */
1705 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 1716 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
1706 mem_cgroup_uncharge_page(target); 1717 mem_cgroup_uncharge_page(target);
1718 /*
1719 * At migration, we may charge account against cgroup which has no tasks
1720 * So, rmdir()->pre_destroy() can be called while we do this charge.
1721 * In that case, we need to call pre_destroy() again. check it here.
1722 */
1723 cgroup_release_and_wakeup_rmdir(&mem->css);
1707} 1724}
1708 1725
1709/* 1726/*
@@ -1973,7 +1990,7 @@ try_to_free:
1973 if (!progress) { 1990 if (!progress) {
1974 nr_retries--; 1991 nr_retries--;
1975 /* maybe some writeback is necessary */ 1992 /* maybe some writeback is necessary */
1976 congestion_wait(WRITE, HZ/10); 1993 congestion_wait(BLK_RW_ASYNC, HZ/10);
1977 } 1994 }
1978 1995
1979 } 1996 }
diff --git a/mm/memory.c b/mm/memory.c
index f46ac18ba231..aede2ce3aba4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -135,11 +135,12 @@ void pmd_clear_bad(pmd_t *pmd)
135 * Note: this doesn't free the actual pages themselves. That 135 * Note: this doesn't free the actual pages themselves. That
136 * has been handled earlier when unmapping all the memory regions. 136 * has been handled earlier when unmapping all the memory regions.
137 */ 137 */
138static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) 138static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
139 unsigned long addr)
139{ 140{
140 pgtable_t token = pmd_pgtable(*pmd); 141 pgtable_t token = pmd_pgtable(*pmd);
141 pmd_clear(pmd); 142 pmd_clear(pmd);
142 pte_free_tlb(tlb, token); 143 pte_free_tlb(tlb, token, addr);
143 tlb->mm->nr_ptes--; 144 tlb->mm->nr_ptes--;
144} 145}
145 146
@@ -157,7 +158,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
157 next = pmd_addr_end(addr, end); 158 next = pmd_addr_end(addr, end);
158 if (pmd_none_or_clear_bad(pmd)) 159 if (pmd_none_or_clear_bad(pmd))
159 continue; 160 continue;
160 free_pte_range(tlb, pmd); 161 free_pte_range(tlb, pmd, addr);
161 } while (pmd++, addr = next, addr != end); 162 } while (pmd++, addr = next, addr != end);
162 163
163 start &= PUD_MASK; 164 start &= PUD_MASK;
@@ -173,7 +174,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
173 174
174 pmd = pmd_offset(pud, start); 175 pmd = pmd_offset(pud, start);
175 pud_clear(pud); 176 pud_clear(pud);
176 pmd_free_tlb(tlb, pmd); 177 pmd_free_tlb(tlb, pmd, start);
177} 178}
178 179
179static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 180static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -206,7 +207,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
206 207
207 pud = pud_offset(pgd, start); 208 pud = pud_offset(pgd, start);
208 pgd_clear(pgd); 209 pgd_clear(pgd);
209 pud_free_tlb(tlb, pud); 210 pud_free_tlb(tlb, pud, start);
210} 211}
211 212
212/* 213/*
@@ -1207,8 +1208,8 @@ static inline int use_zero_page(struct vm_area_struct *vma)
1207 1208
1208 1209
1209int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1210int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1210 unsigned long start, int len, int flags, 1211 unsigned long start, int nr_pages, int flags,
1211 struct page **pages, struct vm_area_struct **vmas) 1212 struct page **pages, struct vm_area_struct **vmas)
1212{ 1213{
1213 int i; 1214 int i;
1214 unsigned int vm_flags = 0; 1215 unsigned int vm_flags = 0;
@@ -1217,7 +1218,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1217 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); 1218 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1218 int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); 1219 int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
1219 1220
1220 if (len <= 0) 1221 if (nr_pages <= 0)
1221 return 0; 1222 return 0;
1222 /* 1223 /*
1223 * Require read or write permissions. 1224 * Require read or write permissions.
@@ -1269,7 +1270,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1269 vmas[i] = gate_vma; 1270 vmas[i] = gate_vma;
1270 i++; 1271 i++;
1271 start += PAGE_SIZE; 1272 start += PAGE_SIZE;
1272 len--; 1273 nr_pages--;
1273 continue; 1274 continue;
1274 } 1275 }
1275 1276
@@ -1280,7 +1281,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1280 1281
1281 if (is_vm_hugetlb_page(vma)) { 1282 if (is_vm_hugetlb_page(vma)) {
1282 i = follow_hugetlb_page(mm, vma, pages, vmas, 1283 i = follow_hugetlb_page(mm, vma, pages, vmas,
1283 &start, &len, i, write); 1284 &start, &nr_pages, i, write);
1284 continue; 1285 continue;
1285 } 1286 }
1286 1287
@@ -1357,9 +1358,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1357 vmas[i] = vma; 1358 vmas[i] = vma;
1358 i++; 1359 i++;
1359 start += PAGE_SIZE; 1360 start += PAGE_SIZE;
1360 len--; 1361 nr_pages--;
1361 } while (len && start < vma->vm_end); 1362 } while (nr_pages && start < vma->vm_end);
1362 } while (len); 1363 } while (nr_pages);
1363 return i; 1364 return i;
1364} 1365}
1365 1366
@@ -1368,7 +1369,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1368 * @tsk: task_struct of target task 1369 * @tsk: task_struct of target task
1369 * @mm: mm_struct of target mm 1370 * @mm: mm_struct of target mm
1370 * @start: starting user address 1371 * @start: starting user address
1371 * @len: number of pages from start to pin 1372 * @nr_pages: number of pages from start to pin
1372 * @write: whether pages will be written to by the caller 1373 * @write: whether pages will be written to by the caller
1373 * @force: whether to force write access even if user mapping is 1374 * @force: whether to force write access even if user mapping is
1374 * readonly. This will result in the page being COWed even 1375 * readonly. This will result in the page being COWed even
@@ -1380,7 +1381,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1380 * Or NULL if the caller does not require them. 1381 * Or NULL if the caller does not require them.
1381 * 1382 *
1382 * Returns number of pages pinned. This may be fewer than the number 1383 * Returns number of pages pinned. This may be fewer than the number
1383 * requested. If len is 0 or negative, returns 0. If no pages 1384 * requested. If nr_pages is 0 or negative, returns 0. If no pages
1384 * were pinned, returns -errno. Each page returned must be released 1385 * were pinned, returns -errno. Each page returned must be released
1385 * with a put_page() call when it is finished with. vmas will only 1386 * with a put_page() call when it is finished with. vmas will only
1386 * remain valid while mmap_sem is held. 1387 * remain valid while mmap_sem is held.
@@ -1414,7 +1415,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1414 * See also get_user_pages_fast, for performance critical applications. 1415 * See also get_user_pages_fast, for performance critical applications.
1415 */ 1416 */
1416int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1417int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1417 unsigned long start, int len, int write, int force, 1418 unsigned long start, int nr_pages, int write, int force,
1418 struct page **pages, struct vm_area_struct **vmas) 1419 struct page **pages, struct vm_area_struct **vmas)
1419{ 1420{
1420 int flags = 0; 1421 int flags = 0;
@@ -1424,9 +1425,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1424 if (force) 1425 if (force)
1425 flags |= GUP_FLAGS_FORCE; 1426 flags |= GUP_FLAGS_FORCE;
1426 1427
1427 return __get_user_pages(tsk, mm, 1428 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
1428 start, len, flags,
1429 pages, vmas);
1430} 1429}
1431 1430
1432EXPORT_SYMBOL(get_user_pages); 1431EXPORT_SYMBOL(get_user_pages);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e08e2c4da63a..7dd9d9f80694 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -191,25 +191,27 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
191 * Must be called holding task's alloc_lock to protect task's mems_allowed 191 * Must be called holding task's alloc_lock to protect task's mems_allowed
192 * and mempolicy. May also be called holding the mmap_semaphore for write. 192 * and mempolicy. May also be called holding the mmap_semaphore for write.
193 */ 193 */
194static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) 194static int mpol_set_nodemask(struct mempolicy *pol,
195 const nodemask_t *nodes, struct nodemask_scratch *nsc)
195{ 196{
196 nodemask_t cpuset_context_nmask;
197 int ret; 197 int ret;
198 198
199 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ 199 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
200 if (pol == NULL) 200 if (pol == NULL)
201 return 0; 201 return 0;
202 /* Check N_HIGH_MEMORY */
203 nodes_and(nsc->mask1,
204 cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
202 205
203 VM_BUG_ON(!nodes); 206 VM_BUG_ON(!nodes);
204 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) 207 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
205 nodes = NULL; /* explicit local allocation */ 208 nodes = NULL; /* explicit local allocation */
206 else { 209 else {
207 if (pol->flags & MPOL_F_RELATIVE_NODES) 210 if (pol->flags & MPOL_F_RELATIVE_NODES)
208 mpol_relative_nodemask(&cpuset_context_nmask, nodes, 211 mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
209 &cpuset_current_mems_allowed);
210 else 212 else
211 nodes_and(cpuset_context_nmask, *nodes, 213 nodes_and(nsc->mask2, *nodes, nsc->mask1);
212 cpuset_current_mems_allowed); 214
213 if (mpol_store_user_nodemask(pol)) 215 if (mpol_store_user_nodemask(pol))
214 pol->w.user_nodemask = *nodes; 216 pol->w.user_nodemask = *nodes;
215 else 217 else
@@ -217,8 +219,10 @@ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
217 cpuset_current_mems_allowed; 219 cpuset_current_mems_allowed;
218 } 220 }
219 221
220 ret = mpol_ops[pol->mode].create(pol, 222 if (nodes)
221 nodes ? &cpuset_context_nmask : NULL); 223 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
224 else
225 ret = mpol_ops[pol->mode].create(pol, NULL);
222 return ret; 226 return ret;
223} 227}
224 228
@@ -620,12 +624,17 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
620{ 624{
621 struct mempolicy *new, *old; 625 struct mempolicy *new, *old;
622 struct mm_struct *mm = current->mm; 626 struct mm_struct *mm = current->mm;
627 NODEMASK_SCRATCH(scratch);
623 int ret; 628 int ret;
624 629
625 new = mpol_new(mode, flags, nodes); 630 if (!scratch)
626 if (IS_ERR(new)) 631 return -ENOMEM;
627 return PTR_ERR(new);
628 632
633 new = mpol_new(mode, flags, nodes);
634 if (IS_ERR(new)) {
635 ret = PTR_ERR(new);
636 goto out;
637 }
629 /* 638 /*
630 * prevent changing our mempolicy while show_numa_maps() 639 * prevent changing our mempolicy while show_numa_maps()
631 * is using it. 640 * is using it.
@@ -635,13 +644,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
635 if (mm) 644 if (mm)
636 down_write(&mm->mmap_sem); 645 down_write(&mm->mmap_sem);
637 task_lock(current); 646 task_lock(current);
638 ret = mpol_set_nodemask(new, nodes); 647 ret = mpol_set_nodemask(new, nodes, scratch);
639 if (ret) { 648 if (ret) {
640 task_unlock(current); 649 task_unlock(current);
641 if (mm) 650 if (mm)
642 up_write(&mm->mmap_sem); 651 up_write(&mm->mmap_sem);
643 mpol_put(new); 652 mpol_put(new);
644 return ret; 653 goto out;
645 } 654 }
646 old = current->mempolicy; 655 old = current->mempolicy;
647 current->mempolicy = new; 656 current->mempolicy = new;
@@ -654,7 +663,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
654 up_write(&mm->mmap_sem); 663 up_write(&mm->mmap_sem);
655 664
656 mpol_put(old); 665 mpol_put(old);
657 return 0; 666 ret = 0;
667out:
668 NODEMASK_SCRATCH_FREE(scratch);
669 return ret;
658} 670}
659 671
660/* 672/*
@@ -1014,12 +1026,20 @@ static long do_mbind(unsigned long start, unsigned long len,
1014 if (err) 1026 if (err)
1015 return err; 1027 return err;
1016 } 1028 }
1017 down_write(&mm->mmap_sem); 1029 {
1018 task_lock(current); 1030 NODEMASK_SCRATCH(scratch);
1019 err = mpol_set_nodemask(new, nmask); 1031 if (scratch) {
1020 task_unlock(current); 1032 down_write(&mm->mmap_sem);
1033 task_lock(current);
1034 err = mpol_set_nodemask(new, nmask, scratch);
1035 task_unlock(current);
1036 if (err)
1037 up_write(&mm->mmap_sem);
1038 } else
1039 err = -ENOMEM;
1040 NODEMASK_SCRATCH_FREE(scratch);
1041 }
1021 if (err) { 1042 if (err) {
1022 up_write(&mm->mmap_sem);
1023 mpol_put(new); 1043 mpol_put(new);
1024 return err; 1044 return err;
1025 } 1045 }
@@ -1891,6 +1911,7 @@ restart:
1891 * Install non-NULL @mpol in inode's shared policy rb-tree. 1911 * Install non-NULL @mpol in inode's shared policy rb-tree.
1892 * On entry, the current task has a reference on a non-NULL @mpol. 1912 * On entry, the current task has a reference on a non-NULL @mpol.
1893 * This must be released on exit. 1913 * This must be released on exit.
1914 * This is called at get_inode() calls and we can use GFP_KERNEL.
1894 */ 1915 */
1895void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 1916void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1896{ 1917{
@@ -1902,19 +1923,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1902 if (mpol) { 1923 if (mpol) {
1903 struct vm_area_struct pvma; 1924 struct vm_area_struct pvma;
1904 struct mempolicy *new; 1925 struct mempolicy *new;
1926 NODEMASK_SCRATCH(scratch);
1905 1927
1928 if (!scratch)
1929 return;
1906 /* contextualize the tmpfs mount point mempolicy */ 1930 /* contextualize the tmpfs mount point mempolicy */
1907 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 1931 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1908 if (IS_ERR(new)) { 1932 if (IS_ERR(new)) {
1909 mpol_put(mpol); /* drop our ref on sb mpol */ 1933 mpol_put(mpol); /* drop our ref on sb mpol */
1934 NODEMASK_SCRATCH_FREE(scratch);
1910 return; /* no valid nodemask intersection */ 1935 return; /* no valid nodemask intersection */
1911 } 1936 }
1912 1937
1913 task_lock(current); 1938 task_lock(current);
1914 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); 1939 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
1915 task_unlock(current); 1940 task_unlock(current);
1916 mpol_put(mpol); /* drop our ref on sb mpol */ 1941 mpol_put(mpol); /* drop our ref on sb mpol */
1917 if (ret) { 1942 if (ret) {
1943 NODEMASK_SCRATCH_FREE(scratch);
1918 mpol_put(new); 1944 mpol_put(new);
1919 return; 1945 return;
1920 } 1946 }
@@ -1924,6 +1950,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1924 pvma.vm_end = TASK_SIZE; /* policy covers entire file */ 1950 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
1925 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ 1951 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1926 mpol_put(new); /* drop initial ref */ 1952 mpol_put(new); /* drop initial ref */
1953 NODEMASK_SCRATCH_FREE(scratch);
1927 } 1954 }
1928} 1955}
1929 1956
@@ -2140,13 +2167,18 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2140 err = 1; 2167 err = 1;
2141 else { 2168 else {
2142 int ret; 2169 int ret;
2143 2170 NODEMASK_SCRATCH(scratch);
2144 task_lock(current); 2171 if (scratch) {
2145 ret = mpol_set_nodemask(new, &nodes); 2172 task_lock(current);
2146 task_unlock(current); 2173 ret = mpol_set_nodemask(new, &nodes, scratch);
2147 if (ret) 2174 task_unlock(current);
2175 } else
2176 ret = -ENOMEM;
2177 NODEMASK_SCRATCH_FREE(scratch);
2178 if (ret) {
2148 err = 1; 2179 err = 1;
2149 else if (no_context) { 2180 mpol_put(new);
2181 } else if (no_context) {
2150 /* save for contextualization */ 2182 /* save for contextualization */
2151 new->w.user_nodemask = nodes; 2183 new->w.user_nodemask = nodes;
2152 } 2184 }
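
The mempolicy hunks above all converge on one calling pattern for the new scratch argument: allocate a nodemask_scratch, hand it to mpol_set_nodemask() under task_lock(), and free it on every exit path. A minimal sketch of that pattern, assuming it sits beside the call sites in mm/mempolicy.c (mpol_set_nodemask() is file-local there); the example_ name is hypothetical:

        /* Sketch only: mirrors the call sites above, not code from the patch. */
        static int example_apply_nodemask(struct mempolicy *pol, nodemask_t *nodes)
        {
                int ret = -ENOMEM;
                NODEMASK_SCRATCH(scratch);      /* may kmalloc on large-node configs */

                if (!scratch)
                        return ret;

                task_lock(current);
                ret = mpol_set_nodemask(pol, nodes, scratch);
                task_unlock(current);

                NODEMASK_SCRATCH_FREE(scratch);
                return ret;
        }

Keeping the nodemasks in the scratch buffer rather than on the stack is the point of the series: on large-node configurations a nodemask_t no longer fits comfortably in a kernel stack frame.
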
diff --git a/mm/mempool.c b/mm/mempool.c
index a46eb1b4bb66..32e75d400503 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -303,14 +303,14 @@ EXPORT_SYMBOL(mempool_free_slab);
303 */ 303 */
304void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) 304void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
305{ 305{
306 size_t size = (size_t)(long)pool_data; 306 size_t size = (size_t)pool_data;
307 return kmalloc(size, gfp_mask); 307 return kmalloc(size, gfp_mask);
308} 308}
309EXPORT_SYMBOL(mempool_kmalloc); 309EXPORT_SYMBOL(mempool_kmalloc);
310 310
311void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) 311void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
312{ 312{
313 size_t size = (size_t) pool_data; 313 size_t size = (size_t)pool_data;
314 return kzalloc(size, gfp_mask); 314 return kzalloc(size, gfp_mask);
315} 315}
316EXPORT_SYMBOL(mempool_kzalloc); 316EXPORT_SYMBOL(mempool_kzalloc);
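
The mempool change above is only a cast cleanup, but it shows how mempool_kmalloc()/mempool_kzalloc() are meant to be used: pool_data carries the element size smuggled through a void pointer. A minimal sketch of a caller, assuming a hypothetical pool of eight 256-byte elements; mempool_kfree() is the stock free counterpart:

        #include <linux/mempool.h>
        #include <linux/slab.h>

        /* Sketch only: create a kmalloc-backed pool; pool_data is the size. */
        static mempool_t *example_create_pool(void)
        {
                return mempool_create(8, mempool_kmalloc, mempool_kfree,
                                      (void *)(size_t)256);
        }
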
diff --git a/mm/nommu.c b/mm/nommu.c
index 2fd2ad5da98e..53cab10fece4 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -173,8 +173,8 @@ unsigned int kobjsize(const void *objp)
173} 173}
174 174
175int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 175int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
176 unsigned long start, int len, int flags, 176 unsigned long start, int nr_pages, int flags,
177 struct page **pages, struct vm_area_struct **vmas) 177 struct page **pages, struct vm_area_struct **vmas)
178{ 178{
179 struct vm_area_struct *vma; 179 struct vm_area_struct *vma;
180 unsigned long vm_flags; 180 unsigned long vm_flags;
@@ -189,7 +189,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
189 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 189 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
190 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 190 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
191 191
192 for (i = 0; i < len; i++) { 192 for (i = 0; i < nr_pages; i++) {
193 vma = find_vma(mm, start); 193 vma = find_vma(mm, start);
194 if (!vma) 194 if (!vma)
195 goto finish_or_fault; 195 goto finish_or_fault;
@@ -224,7 +224,7 @@ finish_or_fault:
224 * - don't permit access to VMAs that don't support it, such as I/O mappings 224 * - don't permit access to VMAs that don't support it, such as I/O mappings
225 */ 225 */
226int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 226int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
227 unsigned long start, int len, int write, int force, 227 unsigned long start, int nr_pages, int write, int force,
228 struct page **pages, struct vm_area_struct **vmas) 228 struct page **pages, struct vm_area_struct **vmas)
229{ 229{
230 int flags = 0; 230 int flags = 0;
@@ -234,12 +234,31 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
234 if (force) 234 if (force)
235 flags |= GUP_FLAGS_FORCE; 235 flags |= GUP_FLAGS_FORCE;
236 236
237 return __get_user_pages(tsk, mm, 237 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
238 start, len, flags,
239 pages, vmas);
240} 238}
241EXPORT_SYMBOL(get_user_pages); 239EXPORT_SYMBOL(get_user_pages);
242 240
241/**
242 * follow_pfn - look up PFN at a user virtual address
243 * @vma: memory mapping
244 * @address: user virtual address
245 * @pfn: location to store found PFN
246 *
247 * Only IO mappings and raw PFN mappings are allowed.
248 *
249 * Returns zero and the pfn at @pfn on success, -ve otherwise.
250 */
251int follow_pfn(struct vm_area_struct *vma, unsigned long address,
252 unsigned long *pfn)
253{
254 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
255 return -EINVAL;
256
257 *pfn = address >> PAGE_SHIFT;
258 return 0;
259}
260EXPORT_SYMBOL(follow_pfn);
261
243DEFINE_RWLOCK(vmlist_lock); 262DEFINE_RWLOCK(vmlist_lock);
244struct vm_struct *vmlist; 263struct vm_struct *vmlist;
245 264
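
follow_pfn() above lets !MMU drivers look up the PFN behind a user virtual address in an I/O or raw-PFN mapping; with no page tables to walk, it simply shifts the address. A minimal sketch of a caller, assuming the usual mmap_sem protection around find_vma(); example_addr_to_pfn() is hypothetical:

        #include <linux/mm.h>
        #include <linux/rwsem.h>
        #include <linux/sched.h>

        /* Sketch only: translate a user address in a VM_IO/VM_PFNMAP vma to a PFN. */
        static int example_addr_to_pfn(struct mm_struct *mm, unsigned long addr,
                                       unsigned long *pfn)
        {
                struct vm_area_struct *vma;
                int ret = -EINVAL;

                down_read(&mm->mmap_sem);
                vma = find_vma(mm, addr);
                if (vma && addr >= vma->vm_start)
                        ret = follow_pfn(vma, addr, pfn);
                up_read(&mm->mmap_sem);
                return ret;
        }
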
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7b0dcea4935b..81627ebcd313 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -541,8 +541,11 @@ static void balance_dirty_pages(struct address_space *mapping)
541 * filesystems (i.e. NFS) in which data may have been 541 * filesystems (i.e. NFS) in which data may have been
542 * written to the server's write cache, but has not yet 542 * written to the server's write cache, but has not yet
543 * been flushed to permanent storage. 543 * been flushed to permanent storage.
544 * Only move pages to writeback if this bdi is over its
545 * threshold otherwise wait until the disk writes catch
546 * up.
544 */ 547 */
545 if (bdi_nr_reclaimable) { 548 if (bdi_nr_reclaimable > bdi_thresh) {
546 writeback_inodes(&wbc); 549 writeback_inodes(&wbc);
547 pages_written += write_chunk - wbc.nr_to_write; 550 pages_written += write_chunk - wbc.nr_to_write;
548 get_dirty_limits(&background_thresh, &dirty_thresh, 551 get_dirty_limits(&background_thresh, &dirty_thresh,
@@ -572,7 +575,7 @@ static void balance_dirty_pages(struct address_space *mapping)
572 if (pages_written >= write_chunk) 575 if (pages_written >= write_chunk)
573 break; /* We've done our duty */ 576 break; /* We've done our duty */
574 577
575 congestion_wait(WRITE, HZ/10); 578 congestion_wait(BLK_RW_ASYNC, HZ/10);
576 } 579 }
577 580
578 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 581 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -666,7 +669,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
666 if (global_page_state(NR_UNSTABLE_NFS) + 669 if (global_page_state(NR_UNSTABLE_NFS) +
667 global_page_state(NR_WRITEBACK) <= dirty_thresh) 670 global_page_state(NR_WRITEBACK) <= dirty_thresh)
668 break; 671 break;
669 congestion_wait(WRITE, HZ/10); 672 congestion_wait(BLK_RW_ASYNC, HZ/10);
670 673
671 /* 674 /*
672 * The caller might hold locks which can prevent IO completion 675 * The caller might hold locks which can prevent IO completion
@@ -712,7 +715,7 @@ static void background_writeout(unsigned long _min_pages)
712 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { 715 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
713 /* Wrote less than expected */ 716 /* Wrote less than expected */
714 if (wbc.encountered_congestion || wbc.more_io) 717 if (wbc.encountered_congestion || wbc.more_io)
715 congestion_wait(WRITE, HZ/10); 718 congestion_wait(BLK_RW_ASYNC, HZ/10);
716 else 719 else
717 break; 720 break;
718 } 721 }
@@ -784,7 +787,7 @@ static void wb_kupdate(unsigned long arg)
784 writeback_inodes(&wbc); 787 writeback_inodes(&wbc);
785 if (wbc.nr_to_write > 0) { 788 if (wbc.nr_to_write > 0) {
786 if (wbc.encountered_congestion || wbc.more_io) 789 if (wbc.encountered_congestion || wbc.more_io)
787 congestion_wait(WRITE, HZ/10); 790 congestion_wait(BLK_RW_ASYNC, HZ/10);
788 else 791 else
789 break; /* All the old data is written */ 792 break; /* All the old data is written */
790 } 793 }
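
The writeback hunks above key congestion_wait() by queue direction (BLK_RW_ASYNC here; BLK_RW_SYNC is its synchronous counterpart) instead of READ/WRITE. A minimal sketch of a caller backing off while asynchronous writeback is congested; example_backoff() is hypothetical:

        #include <linux/backing-dev.h>
        #include <linux/jiffies.h>

        /* Sketch only: wait up to ~100ms for async write congestion to clear. */
        static void example_backoff(void)
        {
                congestion_wait(BLK_RW_ASYNC, HZ / 10);
        }
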
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5d714f8fb303..d052abbe3063 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -882,7 +882,7 @@ retry_reserve:
882 */ 882 */
883static int rmqueue_bulk(struct zone *zone, unsigned int order, 883static int rmqueue_bulk(struct zone *zone, unsigned int order,
884 unsigned long count, struct list_head *list, 884 unsigned long count, struct list_head *list,
885 int migratetype) 885 int migratetype, int cold)
886{ 886{
887 int i; 887 int i;
888 888
@@ -901,7 +901,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
901 * merge IO requests if the physical pages are ordered 901 * merge IO requests if the physical pages are ordered
902 * properly. 902 * properly.
903 */ 903 */
904 list_add(&page->lru, list); 904 if (likely(cold == 0))
905 list_add(&page->lru, list);
906 else
907 list_add_tail(&page->lru, list);
905 set_page_private(page, migratetype); 908 set_page_private(page, migratetype);
906 list = &page->lru; 909 list = &page->lru;
907 } 910 }
@@ -1119,7 +1122,8 @@ again:
1119 local_irq_save(flags); 1122 local_irq_save(flags);
1120 if (!pcp->count) { 1123 if (!pcp->count) {
1121 pcp->count = rmqueue_bulk(zone, 0, 1124 pcp->count = rmqueue_bulk(zone, 0,
1122 pcp->batch, &pcp->list, migratetype); 1125 pcp->batch, &pcp->list,
1126 migratetype, cold);
1123 if (unlikely(!pcp->count)) 1127 if (unlikely(!pcp->count))
1124 goto failed; 1128 goto failed;
1125 } 1129 }
@@ -1138,7 +1142,8 @@ again:
1138 /* Allocate more to the pcp list if necessary */ 1142 /* Allocate more to the pcp list if necessary */
1139 if (unlikely(&page->lru == &pcp->list)) { 1143 if (unlikely(&page->lru == &pcp->list)) {
1140 pcp->count += rmqueue_bulk(zone, 0, 1144 pcp->count += rmqueue_bulk(zone, 0,
1141 pcp->batch, &pcp->list, migratetype); 1145 pcp->batch, &pcp->list,
1146 migratetype, cold);
1142 page = list_entry(pcp->list.next, struct page, lru); 1147 page = list_entry(pcp->list.next, struct page, lru);
1143 } 1148 }
1144 1149
@@ -1666,7 +1671,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1666 preferred_zone, migratetype); 1671 preferred_zone, migratetype);
1667 1672
1668 if (!page && gfp_mask & __GFP_NOFAIL) 1673 if (!page && gfp_mask & __GFP_NOFAIL)
1669 congestion_wait(WRITE, HZ/50); 1674 congestion_wait(BLK_RW_ASYNC, HZ/50);
1670 } while (!page && (gfp_mask & __GFP_NOFAIL)); 1675 } while (!page && (gfp_mask & __GFP_NOFAIL));
1671 1676
1672 return page; 1677 return page;
@@ -1740,8 +1745,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1740 * be using allocators in order of preference for an area that is 1745 * be using allocators in order of preference for an area that is
1741 * too large. 1746 * too large.
1742 */ 1747 */
1743 if (WARN_ON_ONCE(order >= MAX_ORDER)) 1748 if (order >= MAX_ORDER) {
1749 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
1744 return NULL; 1750 return NULL;
1751 }
1745 1752
1746 /* 1753 /*
1747 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1754 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1789,6 +1796,10 @@ rebalance:
1789 if (p->flags & PF_MEMALLOC) 1796 if (p->flags & PF_MEMALLOC)
1790 goto nopage; 1797 goto nopage;
1791 1798
1799 /* Avoid allocations with no watermarks from looping endlessly */
1800 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
1801 goto nopage;
1802
1792 /* Try direct reclaim and then allocating */ 1803 /* Try direct reclaim and then allocating */
1793 page = __alloc_pages_direct_reclaim(gfp_mask, order, 1804 page = __alloc_pages_direct_reclaim(gfp_mask, order,
1794 zonelist, high_zoneidx, 1805 zonelist, high_zoneidx,
@@ -1831,7 +1842,7 @@ rebalance:
1831 pages_reclaimed += did_some_progress; 1842 pages_reclaimed += did_some_progress;
1832 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { 1843 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
1833 /* Wait for some write requests to complete then retry */ 1844 /* Wait for some write requests to complete then retry */
1834 congestion_wait(WRITE, HZ/50); 1845 congestion_wait(BLK_RW_ASYNC, HZ/50);
1835 goto rebalance; 1846 goto rebalance;
1836 } 1847 }
1837 1848
@@ -1983,7 +1994,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
1983 unsigned long alloc_end = addr + (PAGE_SIZE << order); 1994 unsigned long alloc_end = addr + (PAGE_SIZE << order);
1984 unsigned long used = addr + PAGE_ALIGN(size); 1995 unsigned long used = addr + PAGE_ALIGN(size);
1985 1996
1986 split_page(virt_to_page(addr), order); 1997 split_page(virt_to_page((void *)addr), order);
1987 while (used < alloc_end) { 1998 while (used < alloc_end) {
1988 free_page(used); 1999 free_page(used);
1989 used += PAGE_SIZE; 2000 used += PAGE_SIZE;
@@ -4032,6 +4043,8 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
4032 int i, nid; 4043 int i, nid;
4033 unsigned long usable_startpfn; 4044 unsigned long usable_startpfn;
4034 unsigned long kernelcore_node, kernelcore_remaining; 4045 unsigned long kernelcore_node, kernelcore_remaining;
4046 /* save the state before borrowing the nodemask */

4047 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
4035 unsigned long totalpages = early_calculate_totalpages(); 4048 unsigned long totalpages = early_calculate_totalpages();
4036 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 4049 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
4037 4050
@@ -4059,7 +4072,7 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
4059 4072
4060 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 4073 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
4061 if (!required_kernelcore) 4074 if (!required_kernelcore)
4062 return; 4075 goto out;
4063 4076
4064 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 4077 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
4065 find_usable_zone_for_movable(); 4078 find_usable_zone_for_movable();
@@ -4158,6 +4171,10 @@ restart:
4158 for (nid = 0; nid < MAX_NUMNODES; nid++) 4171 for (nid = 0; nid < MAX_NUMNODES; nid++)
4159 zone_movable_pfn[nid] = 4172 zone_movable_pfn[nid] =
4160 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 4173 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4174
4175out:
4176 /* restore the node_state */
4177 node_states[N_HIGH_MEMORY] = saved_node_state;
4161} 4178}
4162 4179
4163/* Any regular memory on that node ? */ 4180/* Any regular memory on that node ? */
@@ -4242,11 +4259,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4242 early_node_map[i].start_pfn, 4259 early_node_map[i].start_pfn,
4243 early_node_map[i].end_pfn); 4260 early_node_map[i].end_pfn);
4244 4261
4245 /*
4246 * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init
4247 * that node_mask, clear it at first
4248 */
4249 nodes_clear(node_states[N_HIGH_MEMORY]);
4250 /* Initialise every node */ 4262 /* Initialise every node */
4251 mminit_verify_pageflags_layout(); 4263 mminit_verify_pageflags_layout();
4252 setup_nr_node_ids(); 4264 setup_nr_node_ids();
@@ -4744,8 +4756,10 @@ void *__init alloc_large_system_hash(const char *tablename,
4744 * some pages at the end of hash table which 4756 * some pages at the end of hash table which
4745 * alloc_pages_exact() automatically does 4757 * alloc_pages_exact() automatically does
4746 */ 4758 */
4747 if (get_order(size) < MAX_ORDER) 4759 if (get_order(size) < MAX_ORDER) {
4748 table = alloc_pages_exact(size, GFP_ATOMIC); 4760 table = alloc_pages_exact(size, GFP_ATOMIC);
4761 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
4762 }
4749 } 4763 }
4750 } while (!table && size > PAGE_SIZE && --log2qty); 4764 } while (!table && size > PAGE_SIZE && --log2qty);
4751 4765
@@ -4763,16 +4777,6 @@ void *__init alloc_large_system_hash(const char *tablename,
4763 if (_hash_mask) 4777 if (_hash_mask)
4764 *_hash_mask = (1 << log2qty) - 1; 4778 *_hash_mask = (1 << log2qty) - 1;
4765 4779
4766 /*
4767 * If hashdist is set, the table allocation is done with __vmalloc()
4768 * which invokes the kmemleak_alloc() callback. This function may also
4769 * be called before the slab and kmemleak are initialised when
4770 * kmemleak simply buffers the request to be executed later
4771 * (GFP_ATOMIC flag ignored in this case).
4772 */
4773 if (!hashdist)
4774 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
4775
4776 return table; 4780 return table;
4777} 4781}
4778 4782
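
Among the page allocator changes above, the order >= MAX_ORDER check now only warns when the caller did not pass __GFP_NOWARN, so opportunistic high-order probes can fail silently. A minimal sketch of such a probe; example_try_big() and the exact flag combination are illustrative, not from the patch:

        #include <linux/gfp.h>
        #include <linux/mm.h>

        /* Sketch only: try a large contiguous allocation, stay quiet on failure. */
        static struct page *example_try_big(unsigned int order)
        {
                return alloc_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, order);
        }
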
diff --git a/mm/percpu.c b/mm/percpu.c
index c0b2c1a76e81..b70f2acd8853 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -549,14 +549,14 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
549 * @chunk: chunk of interest 549 * @chunk: chunk of interest
550 * @page_start: page index of the first page to unmap 550 * @page_start: page index of the first page to unmap
551 * @page_end: page index of the last page to unmap + 1 551 * @page_end: page index of the last page to unmap + 1
552 * @flush: whether to flush cache and tlb or not 552 * @flush_tlb: whether to flush tlb or not
553 * 553 *
554 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. 554 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
555 * If @flush is true, vcache is flushed before unmapping and tlb 555 * If @flush is true, vcache is flushed before unmapping and tlb
556 * after. 556 * after.
557 */ 557 */
558static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, 558static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
559 bool flush) 559 bool flush_tlb)
560{ 560{
561 unsigned int last = num_possible_cpus() - 1; 561 unsigned int last = num_possible_cpus() - 1;
562 unsigned int cpu; 562 unsigned int cpu;
@@ -569,9 +569,8 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
569 * the whole region at once rather than doing it for each cpu. 569 * the whole region at once rather than doing it for each cpu.
570 * This could be an overkill but is more scalable. 570 * This could be an overkill but is more scalable.
571 */ 571 */
572 if (flush) 572 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
573 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), 573 pcpu_chunk_addr(chunk, last, page_end));
574 pcpu_chunk_addr(chunk, last, page_end));
575 574
576 for_each_possible_cpu(cpu) 575 for_each_possible_cpu(cpu)
577 unmap_kernel_range_noflush( 576 unmap_kernel_range_noflush(
@@ -579,7 +578,7 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
579 (page_end - page_start) << PAGE_SHIFT); 578 (page_end - page_start) << PAGE_SHIFT);
580 579
581 /* ditto as flush_cache_vunmap() */ 580 /* ditto as flush_cache_vunmap() */
582 if (flush) 581 if (flush_tlb)
583 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), 582 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
584 pcpu_chunk_addr(chunk, last, page_end)); 583 pcpu_chunk_addr(chunk, last, page_end));
585} 584}
@@ -1234,6 +1233,7 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
1234ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, 1233ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1235 ssize_t dyn_size, ssize_t unit_size) 1234 ssize_t dyn_size, ssize_t unit_size)
1236{ 1235{
1236 size_t chunk_size;
1237 unsigned int cpu; 1237 unsigned int cpu;
1238 1238
1239 /* determine parameters and allocate */ 1239 /* determine parameters and allocate */
@@ -1248,11 +1248,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1248 } else 1248 } else
1249 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); 1249 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
1250 1250
1251 pcpue_ptr = __alloc_bootmem_nopanic( 1251 chunk_size = pcpue_unit_size * num_possible_cpus();
1252 num_possible_cpus() * pcpue_unit_size, 1252
1253 PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 1253 pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
1254 if (!pcpue_ptr) 1254 __pa(MAX_DMA_ADDRESS));
1255 if (!pcpue_ptr) {
1256 pr_warning("PERCPU: failed to allocate %zu bytes for "
1257 "embedding\n", chunk_size);
1255 return -ENOMEM; 1258 return -ENOMEM;
1259 }
1256 1260
1257 /* return the leftover and copy */ 1261 /* return the leftover and copy */
1258 for_each_possible_cpu(cpu) { 1262 for_each_possible_cpu(cpu) {
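
The percpu hunk above computes the chunk size once and reports a failed boot-time allocation instead of returning -ENOMEM silently. A minimal sketch of the same report-on-failure shape around __alloc_bootmem_nopanic(); example_boot_alloc() is hypothetical:

        #include <linux/bootmem.h>
        #include <linux/init.h>
        #include <linux/kernel.h>
        #include <asm/dma.h>

        /* Sketch only: no-panic bootmem allocation that says what it wanted. */
        static void * __init example_boot_alloc(size_t size)
        {
                void *p = __alloc_bootmem_nopanic(size, PAGE_SIZE,
                                                  __pa(MAX_DMA_ADDRESS));

                if (!p)
                        pr_warning("example: failed to allocate %zu bytes\n", size);
                return p;
        }
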
diff --git a/mm/slab.c b/mm/slab.c
index e74a16e4ced6..7b5d4deacfcd 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1544,9 +1544,6 @@ void __init kmem_cache_init(void)
1544 } 1544 }
1545 1545
1546 g_cpucache_up = EARLY; 1546 g_cpucache_up = EARLY;
1547
1548 /* Annotate slab for lockdep -- annotate the malloc caches */
1549 init_lock_keys();
1550} 1547}
1551 1548
1552void __init kmem_cache_init_late(void) 1549void __init kmem_cache_init_late(void)
@@ -1563,6 +1560,9 @@ void __init kmem_cache_init_late(void)
1563 /* Done! */ 1560 /* Done! */
1564 g_cpucache_up = FULL; 1561 g_cpucache_up = FULL;
1565 1562
1563 /* Annotate slab for lockdep -- annotate the malloc caches */
1564 init_lock_keys();
1565
1566 /* 1566 /*
1567 * Register a cpu startup notifier callback that initializes 1567 * Register a cpu startup notifier callback that initializes
1568 * cpu_cache_get for all new cpus 1568 * cpu_cache_get for all new cpus
@@ -2547,7 +2547,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
2547 } 2547 }
2548 2548
2549 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2549 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2550 synchronize_rcu(); 2550 rcu_barrier();
2551 2551
2552 __kmem_cache_destroy(cachep); 2552 __kmem_cache_destroy(cachep);
2553 mutex_unlock(&cache_chain_mutex); 2553 mutex_unlock(&cache_chain_mutex);
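
The slab change above (and the matching slob/slub hunks below) switches the destroy path to rcu_barrier(), which waits for pending RCU callbacks to finish rather than merely for a grace period; that is what SLAB_DESTROY_BY_RCU caches need before their backing pages can go away. A minimal sketch of such a cache's lifecycle; the example_ names and the 128-byte object size are hypothetical:

        #include <linux/errno.h>
        #include <linux/slab.h>

        static struct kmem_cache *example_cache;        /* hypothetical cache */

        static int example_init(void)
        {
                example_cache = kmem_cache_create("example", 128, 0,
                                                  SLAB_DESTROY_BY_RCU, NULL);
                return example_cache ? 0 : -ENOMEM;
        }

        static void example_exit(void)
        {
                /* the rcu_barrier() now inside kmem_cache_destroy() makes this safe */
                kmem_cache_destroy(example_cache);
        }
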
diff --git a/mm/slob.c b/mm/slob.c
index c78742defdc6..9641da3d5e58 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -595,6 +595,8 @@ EXPORT_SYMBOL(kmem_cache_create);
595void kmem_cache_destroy(struct kmem_cache *c) 595void kmem_cache_destroy(struct kmem_cache *c)
596{ 596{
597 kmemleak_free(c); 597 kmemleak_free(c);
598 if (c->flags & SLAB_DESTROY_BY_RCU)
599 rcu_barrier();
598 slob_free(c, sizeof(struct kmem_cache)); 600 slob_free(c, sizeof(struct kmem_cache));
599} 601}
600EXPORT_SYMBOL(kmem_cache_destroy); 602EXPORT_SYMBOL(kmem_cache_destroy);
diff --git a/mm/slub.c b/mm/slub.c
index 819f056b39c6..b9f1491a58a1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -21,7 +21,6 @@
21#include <linux/kmemcheck.h> 21#include <linux/kmemcheck.h>
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/cpuset.h> 23#include <linux/cpuset.h>
24#include <linux/kmemleak.h>
25#include <linux/mempolicy.h> 24#include <linux/mempolicy.h>
26#include <linux/ctype.h> 25#include <linux/ctype.h>
27#include <linux/debugobjects.h> 26#include <linux/debugobjects.h>
@@ -2595,6 +2594,8 @@ static inline int kmem_cache_close(struct kmem_cache *s)
2595 */ 2594 */
2596void kmem_cache_destroy(struct kmem_cache *s) 2595void kmem_cache_destroy(struct kmem_cache *s)
2597{ 2596{
2597 if (s->flags & SLAB_DESTROY_BY_RCU)
2598 rcu_barrier();
2598 down_write(&slub_lock); 2599 down_write(&slub_lock);
2599 s->refcount--; 2600 s->refcount--;
2600 if (!s->refcount) { 2601 if (!s->refcount) {
@@ -2833,13 +2834,15 @@ EXPORT_SYMBOL(__kmalloc);
2833static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 2834static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2834{ 2835{
2835 struct page *page; 2836 struct page *page;
2837 void *ptr = NULL;
2836 2838
2837 flags |= __GFP_COMP | __GFP_NOTRACK; 2839 flags |= __GFP_COMP | __GFP_NOTRACK;
2838 page = alloc_pages_node(node, flags, get_order(size)); 2840 page = alloc_pages_node(node, flags, get_order(size));
2839 if (page) 2841 if (page)
2840 return page_address(page); 2842 ptr = page_address(page);
2841 else 2843
2842 return NULL; 2844 kmemleak_alloc(ptr, size, 1, flags);
2845 return ptr;
2843} 2846}
2844 2847
2845#ifdef CONFIG_NUMA 2848#ifdef CONFIG_NUMA
@@ -2924,6 +2927,7 @@ void kfree(const void *x)
2924 page = virt_to_head_page(x); 2927 page = virt_to_head_page(x);
2925 if (unlikely(!PageSlab(page))) { 2928 if (unlikely(!PageSlab(page))) {
2926 BUG_ON(!PageCompound(page)); 2929 BUG_ON(!PageCompound(page));
2930 kmemleak_free(x);
2927 put_page(page); 2931 put_page(page);
2928 return; 2932 return;
2929 } 2933 }
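
The slub hunks above teach kmemleak about page-backed kmallocs, which bypass the slab allocator: the allocation site calls kmemleak_alloc() and kfree() calls kmemleak_free() before the pages are dropped. A minimal sketch of the same pairing for a hypothetical allocator that goes straight to the page allocator; the example_ helpers are not from the patch:

        #include <linux/gfp.h>
        #include <linux/kmemleak.h>
        #include <linux/mm.h>

        /* Sketch only: an allocator outside slab must report to kmemleak itself. */
        static void *example_big_alloc(size_t size, gfp_t flags)
        {
                struct page *page = alloc_pages(flags, get_order(size));
                void *ptr = page ? page_address(page) : NULL;

                kmemleak_alloc(ptr, size, 1, flags);    /* kmemleak tolerates NULL */
                return ptr;
        }

        static void example_big_free(void *ptr, size_t size)
        {
                kmemleak_free(ptr);                     /* tell kmemleak first */
                free_pages((unsigned long)ptr, get_order(size));
        }
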
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d1ade1a48ee7..8ffdc0d23c53 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -753,7 +753,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
753 753
754 if (!bdev) { 754 if (!bdev) {
755 if (bdev_p) 755 if (bdev_p)
756 *bdev_p = bdget(sis->bdev->bd_dev); 756 *bdev_p = bdgrab(sis->bdev);
757 757
758 spin_unlock(&swap_lock); 758 spin_unlock(&swap_lock);
759 return i; 759 return i;
@@ -765,7 +765,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
765 struct swap_extent, list); 765 struct swap_extent, list);
766 if (se->start_block == offset) { 766 if (se->start_block == offset) {
767 if (bdev_p) 767 if (bdev_p)
768 *bdev_p = bdget(sis->bdev->bd_dev); 768 *bdev_p = bdgrab(sis->bdev);
769 769
770 spin_unlock(&swap_lock); 770 spin_unlock(&swap_lock);
771 bdput(bdev); 771 bdput(bdev);
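
swap_type_of() above switches from bdget() to bdgrab(), which appears to pin the already-open device by taking a reference on its inode without the lookup work bdget() can do, keeping the call safe under the spinlock held there. A minimal sketch of the call; example_pin_bdev() is hypothetical:

        #include <linux/fs.h>

        /* Sketch only: pin a block device that is already open. */
        static struct block_device *example_pin_bdev(struct block_device *bdev)
        {
                return bdgrab(bdev);    /* caller drops the reference with bdput() */
        }
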
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 54155268dfca..dea7abd31098 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1104,7 +1104,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1104 */ 1104 */
1105 if (nr_freed < nr_taken && !current_is_kswapd() && 1105 if (nr_freed < nr_taken && !current_is_kswapd() &&
1106 lumpy_reclaim) { 1106 lumpy_reclaim) {
1107 congestion_wait(WRITE, HZ/10); 1107 congestion_wait(BLK_RW_ASYNC, HZ/10);
1108 1108
1109 /* 1109 /*
1110 * The attempt at page out may have made some 1110 * The attempt at page out may have made some
@@ -1721,7 +1721,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1721 1721
1722 /* Take a nap, wait for some writeback to complete */ 1722 /* Take a nap, wait for some writeback to complete */
1723 if (sc->nr_scanned && priority < DEF_PRIORITY - 2) 1723 if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
1724 congestion_wait(WRITE, HZ/10); 1724 congestion_wait(BLK_RW_ASYNC, HZ/10);
1725 } 1725 }
1726 /* top priority shrink_zones still had more to do? don't OOM, then */ 1726 /* top priority shrink_zones still had more to do? don't OOM, then */
1727 if (!sc->all_unreclaimable && scanning_global_lru(sc)) 1727 if (!sc->all_unreclaimable && scanning_global_lru(sc))
@@ -1960,7 +1960,7 @@ loop_again:
1960 * another pass across the zones. 1960 * another pass across the zones.
1961 */ 1961 */
1962 if (total_scanned && priority < DEF_PRIORITY - 2) 1962 if (total_scanned && priority < DEF_PRIORITY - 2)
1963 congestion_wait(WRITE, HZ/10); 1963 congestion_wait(BLK_RW_ASYNC, HZ/10);
1964 1964
1965 /* 1965 /*
1966 * We do this so kswapd doesn't build up large priorities for 1966 * We do this so kswapd doesn't build up large priorities for
@@ -2233,7 +2233,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2233 goto out; 2233 goto out;
2234 2234
2235 if (sc.nr_scanned && prio < DEF_PRIORITY - 2) 2235 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
2236 congestion_wait(WRITE, HZ / 10); 2236 congestion_wait(BLK_RW_ASYNC, HZ / 10);
2237 } 2237 }
2238 } 2238 }
2239 2239