path: root/mm
author     J. Bruce Fields <bfields@citi.umich.edu>  2009-08-21 11:27:29 -0400
committer  J. Bruce Fields <bfields@citi.umich.edu>  2009-08-21 11:27:29 -0400
commit     e9dc122166b8d863d3057a66ada04838e5548e52 (patch)
tree       749e15bf719b64bf9113db7acd8e043d9742cb26 /mm
parent     560ab42ef923aaf2e4347315bdfcc74b2708972c (diff)
parent     405d8f8b1d936414da2093d4149ff790ff3f84a5 (diff)
Merge branch 'nfs-for-2.6.32' of git://git.linux-nfs.org/projects/trondmy/nfs-2.6 into for-2.6.32-incoming
Conflicts:
	net/sunrpc/cache.c
Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c       7
-rw-r--r--  mm/bootmem.c           6
-rw-r--r--  mm/dmapool.c           2
-rw-r--r--  mm/filemap.c           1
-rw-r--r--  mm/hugetlb.c           2
-rw-r--r--  mm/kmemleak.c        409
-rw-r--r--  mm/memcontrol.c       25
-rw-r--r--  mm/memory.c           37
-rw-r--r--  mm/mempolicy.c        84
-rw-r--r--  mm/mempool.c           4
-rw-r--r--  mm/nommu.c            33
-rw-r--r--  mm/page-writeback.c   13
-rw-r--r--  mm/page_alloc.c       54
-rw-r--r--  mm/percpu.c           24
-rw-r--r--  mm/slab.c              8
-rw-r--r--  mm/slob.c              2
-rw-r--r--  mm/slub.c             12
-rw-r--r--  mm/swapfile.c          4
-rw-r--r--  mm/vmscan.c            8
19 files changed, 431 insertions, 304 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 493b468a5035..c86edd244294 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -283,7 +283,6 @@ static wait_queue_head_t congestion_wqh[2] = {
283 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 283 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
284 }; 284 };
285 285
286
287void clear_bdi_congested(struct backing_dev_info *bdi, int sync) 286void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
288{ 287{
289 enum bdi_state bit; 288 enum bdi_state bit;
@@ -308,18 +307,18 @@ EXPORT_SYMBOL(set_bdi_congested);
308 307
309/** 308/**
310 * congestion_wait - wait for a backing_dev to become uncongested 309 * congestion_wait - wait for a backing_dev to become uncongested
311 * @rw: READ or WRITE 310 * @sync: SYNC or ASYNC IO
312 * @timeout: timeout in jiffies 311 * @timeout: timeout in jiffies
313 * 312 *
314 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit 313 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
315 * write congestion. If no backing_devs are congested then just wait for the 314 * write congestion. If no backing_devs are congested then just wait for the
316 * next write to be completed. 315 * next write to be completed.
317 */ 316 */
318long congestion_wait(int rw, long timeout) 317long congestion_wait(int sync, long timeout)
319{ 318{
320 long ret; 319 long ret;
321 DEFINE_WAIT(wait); 320 DEFINE_WAIT(wait);
322 wait_queue_head_t *wqh = &congestion_wqh[rw]; 321 wait_queue_head_t *wqh = &congestion_wqh[sync];
323 322
324 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 323 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
325 ret = io_schedule_timeout(timeout); 324 ret = io_schedule_timeout(timeout);
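
The @rw parameter becomes @sync because the two congestion wait queues are now
indexed by sync/async rather than READ/WRITE. A minimal caller-side sketch of
the new convention (hypothetical helper, not part of this patch; BLK_RW_ASYNC
comes from the block layer, and a real caller is converted the same way in the
mm/memcontrol.c hunk further down):

    /*
     * Sketch: back off briefly while async (writeback) traffic is congested.
     * Before this series the equivalent call was congestion_wait(WRITE, HZ/10).
     */
    static void wait_for_writeback_congestion(void)
    {
            congestion_wait(BLK_RW_ASYNC, HZ/10);
    }
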
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d2a9ce952768..701740c9e81b 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -12,6 +12,7 @@
12#include <linux/pfn.h> 12#include <linux/pfn.h>
13#include <linux/bootmem.h> 13#include <linux/bootmem.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/kmemleak.h>
15 16
16#include <asm/bug.h> 17#include <asm/bug.h>
17#include <asm/io.h> 18#include <asm/io.h>
@@ -335,6 +336,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
335{ 336{
336 unsigned long start, end; 337 unsigned long start, end;
337 338
339 kmemleak_free_part(__va(physaddr), size);
340
338 start = PFN_UP(physaddr); 341 start = PFN_UP(physaddr);
339 end = PFN_DOWN(physaddr + size); 342 end = PFN_DOWN(physaddr + size);
340 343
@@ -354,6 +357,8 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
354{ 357{
355 unsigned long start, end; 358 unsigned long start, end;
356 359
360 kmemleak_free_part(__va(addr), size);
361
357 start = PFN_UP(addr); 362 start = PFN_UP(addr);
358 end = PFN_DOWN(addr + size); 363 end = PFN_DOWN(addr + size);
359 364
@@ -516,6 +521,7 @@ find_block:
516 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + 521 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
517 start_off); 522 start_off);
518 memset(region, 0, size); 523 memset(region, 0, size);
524 kmemleak_alloc(region, size, 1, 0);
519 return region; 525 return region;
520 } 526 }
521 527
diff --git a/mm/dmapool.c b/mm/dmapool.c
index b1f0885dda22..3df063706f53 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -86,10 +86,12 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf)
86 unsigned pages = 0; 86 unsigned pages = 0;
87 unsigned blocks = 0; 87 unsigned blocks = 0;
88 88
89 spin_lock_irq(&pool->lock);
89 list_for_each_entry(page, &pool->page_list, page_list) { 90 list_for_each_entry(page, &pool->page_list, page_list) {
90 pages++; 91 pages++;
91 blocks += page->in_use; 92 blocks += page->in_use;
92 } 93 }
94 spin_unlock_irq(&pool->lock);
93 95
94 /* per-pool info, no real statistics yet */ 96 /* per-pool info, no real statistics yet */
95 temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n", 97 temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n",
diff --git a/mm/filemap.c b/mm/filemap.c
index 22396713feb9..ccea3b665c12 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2272,6 +2272,7 @@ again:
2272 pagefault_enable(); 2272 pagefault_enable();
2273 flush_dcache_page(page); 2273 flush_dcache_page(page);
2274 2274
2275 mark_page_accessed(page);
2275 status = a_ops->write_end(file, mapping, pos, bytes, copied, 2276 status = a_ops->write_end(file, mapping, pos, bytes, copied,
2276 page, fsdata); 2277 page, fsdata);
2277 if (unlikely(status < 0)) 2278 if (unlikely(status < 0))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d0351e31f474..cafdcee154e8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2370,7 +2370,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2370 long chg = region_truncate(&inode->i_mapping->private_list, offset); 2370 long chg = region_truncate(&inode->i_mapping->private_list, offset);
2371 2371
2372 spin_lock(&inode->i_lock); 2372 spin_lock(&inode->i_lock);
2373 inode->i_blocks -= blocks_per_huge_page(h); 2373 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
2374 spin_unlock(&inode->i_lock); 2374 spin_unlock(&inode->i_lock);
2375 2375
2376 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2376 hugetlb_put_quota(inode->i_mapping, (chg - freed));
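
The hugetlb.c change fixes i_blocks accounting when several huge pages are
released at once: the subtraction now scales with @freed. Assuming the usual
512-byte i_blocks units and 2 MB huge pages (blocks_per_huge_page(h) == 4096),
truncating 4 reserved huge pages should drop i_blocks by 4 * 4096 = 16384;
before this fix it dropped by only 4096, no matter how many pages were freed.
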
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c96f2c8700aa..487267310a84 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -48,10 +48,10 @@
48 * scanned. This list is only modified during a scanning episode when the 48 * scanned. This list is only modified during a scanning episode when the
49 * scan_mutex is held. At the end of a scan, the gray_list is always empty. 49 * scan_mutex is held. At the end of a scan, the gray_list is always empty.
50 * Note that the kmemleak_object.use_count is incremented when an object is 50 * Note that the kmemleak_object.use_count is incremented when an object is
51 * added to the gray_list and therefore cannot be freed 51 * added to the gray_list and therefore cannot be freed. This mutex also
52 * - kmemleak_mutex (mutex): prevents multiple users of the "kmemleak" debugfs 52 * prevents multiple users of the "kmemleak" debugfs file together with
53 * file together with modifications to the memory scanning parameters 53 * modifications to the memory scanning parameters including the scan_thread
54 * including the scan_thread pointer 54 * pointer
55 * 55 *
56 * The kmemleak_object structures have a use_count incremented or decremented 56 * The kmemleak_object structures have a use_count incremented or decremented
57 * using the get_object()/put_object() functions. When the use_count becomes 57 * using the get_object()/put_object() functions. When the use_count becomes
@@ -103,11 +103,10 @@
103 * Kmemleak configuration and common defines. 103 * Kmemleak configuration and common defines.
104 */ 104 */
105#define MAX_TRACE 16 /* stack trace length */ 105#define MAX_TRACE 16 /* stack trace length */
106#define REPORTS_NR 50 /* maximum number of reported leaks */
107#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ 106#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */
108#define MSECS_SCAN_YIELD 10 /* CPU yielding period */
109#define SECS_FIRST_SCAN 60 /* delay before the first scan */ 107#define SECS_FIRST_SCAN 60 /* delay before the first scan */
110#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ 108#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */
109#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */
111 110
112#define BYTES_PER_POINTER sizeof(void *) 111#define BYTES_PER_POINTER sizeof(void *)
113 112
@@ -159,6 +158,8 @@ struct kmemleak_object {
159#define OBJECT_REPORTED (1 << 1) 158#define OBJECT_REPORTED (1 << 1)
160/* flag set to not scan the object */ 159/* flag set to not scan the object */
161#define OBJECT_NO_SCAN (1 << 2) 160#define OBJECT_NO_SCAN (1 << 2)
161/* flag set on newly allocated objects */
162#define OBJECT_NEW (1 << 3)
162 163
163/* the list of all allocated objects */ 164/* the list of all allocated objects */
164static LIST_HEAD(object_list); 165static LIST_HEAD(object_list);
@@ -186,22 +187,16 @@ static atomic_t kmemleak_error = ATOMIC_INIT(0);
186static unsigned long min_addr = ULONG_MAX; 187static unsigned long min_addr = ULONG_MAX;
187static unsigned long max_addr; 188static unsigned long max_addr;
188 189
189/* used for yielding the CPU to other tasks during scanning */
190static unsigned long next_scan_yield;
191static struct task_struct *scan_thread; 190static struct task_struct *scan_thread;
192static unsigned long jiffies_scan_yield; 191/* used to avoid reporting of recently allocated objects */
193static unsigned long jiffies_min_age; 192static unsigned long jiffies_min_age;
193static unsigned long jiffies_last_scan;
194/* delay between automatic memory scannings */ 194/* delay between automatic memory scannings */
195static signed long jiffies_scan_wait; 195static signed long jiffies_scan_wait;
196/* enables or disables the task stacks scanning */ 196/* enables or disables the task stacks scanning */
197static int kmemleak_stack_scan; 197static int kmemleak_stack_scan = 1;
198/* mutex protecting the memory scanning */ 198/* protects the memory scanning, parameters and debug/kmemleak file access */
199static DEFINE_MUTEX(scan_mutex); 199static DEFINE_MUTEX(scan_mutex);
200/* mutex protecting the access to the /sys/kernel/debug/kmemleak file */
201static DEFINE_MUTEX(kmemleak_mutex);
202
203/* number of leaks reported (for limitation purposes) */
204static int reported_leaks;
205 200
206/* 201/*
207 * Early object allocation/freeing logging. Kmemleak is initialized after the 202 * Early object allocation/freeing logging. Kmemleak is initialized after the
@@ -215,6 +210,7 @@ static int reported_leaks;
215enum { 210enum {
216 KMEMLEAK_ALLOC, 211 KMEMLEAK_ALLOC,
217 KMEMLEAK_FREE, 212 KMEMLEAK_FREE,
213 KMEMLEAK_FREE_PART,
218 KMEMLEAK_NOT_LEAK, 214 KMEMLEAK_NOT_LEAK,
219 KMEMLEAK_IGNORE, 215 KMEMLEAK_IGNORE,
220 KMEMLEAK_SCAN_AREA, 216 KMEMLEAK_SCAN_AREA,
@@ -235,7 +231,7 @@ struct early_log {
235}; 231};
236 232
237/* early logging buffer and current position */ 233/* early logging buffer and current position */
238static struct early_log early_log[200]; 234static struct early_log early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE];
239static int crt_early_log; 235static int crt_early_log;
240 236
241static void kmemleak_disable(void); 237static void kmemleak_disable(void);
@@ -278,13 +274,9 @@ static int color_gray(const struct kmemleak_object *object)
278 return object->min_count != -1 && object->count >= object->min_count; 274 return object->min_count != -1 && object->count >= object->min_count;
279} 275}
280 276
281/* 277static int color_black(const struct kmemleak_object *object)
282 * Objects are considered referenced if their color is gray and they have not
283 * been deleted.
284 */
285static int referenced_object(struct kmemleak_object *object)
286{ 278{
287 return (object->flags & OBJECT_ALLOCATED) && color_gray(object); 279 return object->min_count == -1;
288} 280}
289 281
290/* 282/*
@@ -295,42 +287,28 @@ static int referenced_object(struct kmemleak_object *object)
295static int unreferenced_object(struct kmemleak_object *object) 287static int unreferenced_object(struct kmemleak_object *object)
296{ 288{
297 return (object->flags & OBJECT_ALLOCATED) && color_white(object) && 289 return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
298 time_is_before_eq_jiffies(object->jiffies + jiffies_min_age); 290 time_before_eq(object->jiffies + jiffies_min_age,
291 jiffies_last_scan);
299} 292}
300 293
301/* 294/*
302 * Printing of the (un)referenced objects information, either to the seq file 295 * Printing of the unreferenced objects information to the seq file. The
303 * or to the kernel log. The print_referenced/print_unreferenced functions 296 * print_unreferenced function must be called with the object->lock held.
304 * must be called with the object->lock held.
305 */ 297 */
306#define print_helper(seq, x...) do { \
307 struct seq_file *s = (seq); \
308 if (s) \
309 seq_printf(s, x); \
310 else \
311 pr_info(x); \
312} while (0)
313
314static void print_referenced(struct kmemleak_object *object)
315{
316 pr_info("referenced object 0x%08lx (size %zu)\n",
317 object->pointer, object->size);
318}
319
320static void print_unreferenced(struct seq_file *seq, 298static void print_unreferenced(struct seq_file *seq,
321 struct kmemleak_object *object) 299 struct kmemleak_object *object)
322{ 300{
323 int i; 301 int i;
324 302
325 print_helper(seq, "unreferenced object 0x%08lx (size %zu):\n", 303 seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
326 object->pointer, object->size); 304 object->pointer, object->size);
327 print_helper(seq, " comm \"%s\", pid %d, jiffies %lu\n", 305 seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n",
328 object->comm, object->pid, object->jiffies); 306 object->comm, object->pid, object->jiffies);
329 print_helper(seq, " backtrace:\n"); 307 seq_printf(seq, " backtrace:\n");
330 308
331 for (i = 0; i < object->trace_len; i++) { 309 for (i = 0; i < object->trace_len; i++) {
332 void *ptr = (void *)object->trace[i]; 310 void *ptr = (void *)object->trace[i];
333 print_helper(seq, " [<%p>] %pS\n", ptr, ptr); 311 seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
334 } 312 }
335} 313}
336 314
@@ -478,7 +456,7 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
478 INIT_HLIST_HEAD(&object->area_list); 456 INIT_HLIST_HEAD(&object->area_list);
479 spin_lock_init(&object->lock); 457 spin_lock_init(&object->lock);
480 atomic_set(&object->use_count, 1); 458 atomic_set(&object->use_count, 1);
481 object->flags = OBJECT_ALLOCATED; 459 object->flags = OBJECT_ALLOCATED | OBJECT_NEW;
482 object->pointer = ptr; 460 object->pointer = ptr;
483 object->size = size; 461 object->size = size;
484 object->min_count = min_count; 462 object->min_count = min_count;
@@ -546,39 +524,87 @@ out:
546 * Remove the metadata (struct kmemleak_object) for a memory block from the 524 * Remove the metadata (struct kmemleak_object) for a memory block from the
547 * object_list and object_tree_root and decrement its use_count. 525 * object_list and object_tree_root and decrement its use_count.
548 */ 526 */
549static void delete_object(unsigned long ptr) 527static void __delete_object(struct kmemleak_object *object)
550{ 528{
551 unsigned long flags; 529 unsigned long flags;
552 struct kmemleak_object *object;
553 530
554 write_lock_irqsave(&kmemleak_lock, flags); 531 write_lock_irqsave(&kmemleak_lock, flags);
555 object = lookup_object(ptr, 0);
556 if (!object) {
557 kmemleak_warn("Freeing unknown object at 0x%08lx\n",
558 ptr);
559 write_unlock_irqrestore(&kmemleak_lock, flags);
560 return;
561 }
562 prio_tree_remove(&object_tree_root, &object->tree_node); 532 prio_tree_remove(&object_tree_root, &object->tree_node);
563 list_del_rcu(&object->object_list); 533 list_del_rcu(&object->object_list);
564 write_unlock_irqrestore(&kmemleak_lock, flags); 534 write_unlock_irqrestore(&kmemleak_lock, flags);
565 535
566 WARN_ON(!(object->flags & OBJECT_ALLOCATED)); 536 WARN_ON(!(object->flags & OBJECT_ALLOCATED));
567 WARN_ON(atomic_read(&object->use_count) < 1); 537 WARN_ON(atomic_read(&object->use_count) < 2);
568 538
569 /* 539 /*
570 * Locking here also ensures that the corresponding memory block 540 * Locking here also ensures that the corresponding memory block
571 * cannot be freed when it is being scanned. 541 * cannot be freed when it is being scanned.
572 */ 542 */
573 spin_lock_irqsave(&object->lock, flags); 543 spin_lock_irqsave(&object->lock, flags);
574 if (object->flags & OBJECT_REPORTED)
575 print_referenced(object);
576 object->flags &= ~OBJECT_ALLOCATED; 544 object->flags &= ~OBJECT_ALLOCATED;
577 spin_unlock_irqrestore(&object->lock, flags); 545 spin_unlock_irqrestore(&object->lock, flags);
578 put_object(object); 546 put_object(object);
579} 547}
580 548
581/* 549/*
550 * Look up the metadata (struct kmemleak_object) corresponding to ptr and
551 * delete it.
552 */
553static void delete_object_full(unsigned long ptr)
554{
555 struct kmemleak_object *object;
556
557 object = find_and_get_object(ptr, 0);
558 if (!object) {
559#ifdef DEBUG
560 kmemleak_warn("Freeing unknown object at 0x%08lx\n",
561 ptr);
562#endif
563 return;
564 }
565 __delete_object(object);
566 put_object(object);
567}
568
569/*
570 * Look up the metadata (struct kmemleak_object) corresponding to ptr and
571 * delete it. If the memory block is partially freed, the function may create
572 * additional metadata for the remaining parts of the block.
573 */
574static void delete_object_part(unsigned long ptr, size_t size)
575{
576 struct kmemleak_object *object;
577 unsigned long start, end;
578
579 object = find_and_get_object(ptr, 1);
580 if (!object) {
581#ifdef DEBUG
582 kmemleak_warn("Partially freeing unknown object at 0x%08lx "
583 "(size %zu)\n", ptr, size);
584#endif
585 return;
586 }
587 __delete_object(object);
588
589 /*
590 * Create one or two objects that may result from the memory block
591 * split. Note that partial freeing is only done by free_bootmem() and
592 * this happens before kmemleak_init() is called. The path below is
593 * only executed during early log recording in kmemleak_init(), so
594 * GFP_KERNEL is enough.
595 */
596 start = object->pointer;
597 end = object->pointer + object->size;
598 if (ptr > start)
599 create_object(start, ptr - start, object->min_count,
600 GFP_KERNEL);
601 if (ptr + size < end)
602 create_object(ptr + size, end - ptr - size, object->min_count,
603 GFP_KERNEL);
604
605 put_object(object);
606}
607/*
582 * Make a object permanently as gray-colored so that it can no longer be 608 * Make a object permanently as gray-colored so that it can no longer be
583 * reported as a leak. This is used in general to mark a false positive. 609 * reported as a leak. This is used in general to mark a false positive.
584 */ 610 */
@@ -696,7 +722,8 @@ static void log_early(int op_type, const void *ptr, size_t size,
696 struct early_log *log; 722 struct early_log *log;
697 723
698 if (crt_early_log >= ARRAY_SIZE(early_log)) { 724 if (crt_early_log >= ARRAY_SIZE(early_log)) {
699 kmemleak_stop("Early log buffer exceeded\n"); 725 pr_warning("Early log buffer exceeded\n");
726 kmemleak_disable();
700 return; 727 return;
701 } 728 }
702 729
@@ -741,13 +768,28 @@ void kmemleak_free(const void *ptr)
741 pr_debug("%s(0x%p)\n", __func__, ptr); 768 pr_debug("%s(0x%p)\n", __func__, ptr);
742 769
743 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) 770 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
744 delete_object((unsigned long)ptr); 771 delete_object_full((unsigned long)ptr);
745 else if (atomic_read(&kmemleak_early_log)) 772 else if (atomic_read(&kmemleak_early_log))
746 log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); 773 log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0);
747} 774}
748EXPORT_SYMBOL_GPL(kmemleak_free); 775EXPORT_SYMBOL_GPL(kmemleak_free);
749 776
750/* 777/*
778 * Partial memory freeing function callback. This function is usually called
779 * from bootmem allocator when (part of) a memory block is freed.
780 */
781void kmemleak_free_part(const void *ptr, size_t size)
782{
783 pr_debug("%s(0x%p)\n", __func__, ptr);
784
785 if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
786 delete_object_part((unsigned long)ptr, size);
787 else if (atomic_read(&kmemleak_early_log))
788 log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0);
789}
790EXPORT_SYMBOL_GPL(kmemleak_free_part);
791
792/*
751 * Mark an already allocated memory block as a false positive. This will cause 793 * Mark an already allocated memory block as a false positive. This will cause
752 * the block to no longer be reported as leak and always be scanned. 794 * the block to no longer be reported as leak and always be scanned.
753 */ 795 */
@@ -808,21 +850,6 @@ void kmemleak_no_scan(const void *ptr)
808EXPORT_SYMBOL(kmemleak_no_scan); 850EXPORT_SYMBOL(kmemleak_no_scan);
809 851
810/* 852/*
811 * Yield the CPU so that other tasks get a chance to run. The yielding is
812 * rate-limited to avoid excessive number of calls to the schedule() function
813 * during memory scanning.
814 */
815static void scan_yield(void)
816{
817 might_sleep();
818
819 if (time_is_before_eq_jiffies(next_scan_yield)) {
820 schedule();
821 next_scan_yield = jiffies + jiffies_scan_yield;
822 }
823}
824
825/*
826 * Memory scanning is a long process and it needs to be interruptable. This 853 * Memory scanning is a long process and it needs to be interruptable. This
827 * function checks whether such interrupt condition occured. 854 * function checks whether such interrupt condition occured.
828 */ 855 */
@@ -848,7 +875,7 @@ static int scan_should_stop(void)
848 * found to the gray list. 875 * found to the gray list.
849 */ 876 */
850static void scan_block(void *_start, void *_end, 877static void scan_block(void *_start, void *_end,
851 struct kmemleak_object *scanned) 878 struct kmemleak_object *scanned, int allow_resched)
852{ 879{
853 unsigned long *ptr; 880 unsigned long *ptr;
854 unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); 881 unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
@@ -859,18 +886,11 @@ static void scan_block(void *_start, void *_end,
859 unsigned long pointer = *ptr; 886 unsigned long pointer = *ptr;
860 struct kmemleak_object *object; 887 struct kmemleak_object *object;
861 888
889 if (allow_resched)
890 cond_resched();
862 if (scan_should_stop()) 891 if (scan_should_stop())
863 break; 892 break;
864 893
865 /*
866 * When scanning a memory block with a corresponding
867 * kmemleak_object, the CPU yielding is handled in the calling
868 * code since it holds the object->lock to avoid the block
869 * freeing.
870 */
871 if (!scanned)
872 scan_yield();
873
874 object = find_and_get_object(pointer, 1); 894 object = find_and_get_object(pointer, 1);
875 if (!object) 895 if (!object)
876 continue; 896 continue;
@@ -931,12 +951,12 @@ static void scan_object(struct kmemleak_object *object)
931 goto out; 951 goto out;
932 if (hlist_empty(&object->area_list)) 952 if (hlist_empty(&object->area_list))
933 scan_block((void *)object->pointer, 953 scan_block((void *)object->pointer,
934 (void *)(object->pointer + object->size), object); 954 (void *)(object->pointer + object->size), object, 0);
935 else 955 else
936 hlist_for_each_entry(area, elem, &object->area_list, node) 956 hlist_for_each_entry(area, elem, &object->area_list, node)
937 scan_block((void *)(object->pointer + area->offset), 957 scan_block((void *)(object->pointer + area->offset),
938 (void *)(object->pointer + area->offset 958 (void *)(object->pointer + area->offset
939 + area->length), object); 959 + area->length), object, 0);
940out: 960out:
941 spin_unlock_irqrestore(&object->lock, flags); 961 spin_unlock_irqrestore(&object->lock, flags);
942} 962}
@@ -952,6 +972,10 @@ static void kmemleak_scan(void)
952 struct kmemleak_object *object, *tmp; 972 struct kmemleak_object *object, *tmp;
953 struct task_struct *task; 973 struct task_struct *task;
954 int i; 974 int i;
975 int new_leaks = 0;
976 int gray_list_pass = 0;
977
978 jiffies_last_scan = jiffies;
955 979
956 /* prepare the kmemleak_object's */ 980 /* prepare the kmemleak_object's */
957 rcu_read_lock(); 981 rcu_read_lock();
@@ -970,6 +994,7 @@ static void kmemleak_scan(void)
970#endif 994#endif
971 /* reset the reference count (whiten the object) */ 995 /* reset the reference count (whiten the object) */
972 object->count = 0; 996 object->count = 0;
997 object->flags &= ~OBJECT_NEW;
973 if (color_gray(object) && get_object(object)) 998 if (color_gray(object) && get_object(object))
974 list_add_tail(&object->gray_list, &gray_list); 999 list_add_tail(&object->gray_list, &gray_list);
975 1000
@@ -978,14 +1003,14 @@ static void kmemleak_scan(void)
978 rcu_read_unlock(); 1003 rcu_read_unlock();
979 1004
980 /* data/bss scanning */ 1005 /* data/bss scanning */
981 scan_block(_sdata, _edata, NULL); 1006 scan_block(_sdata, _edata, NULL, 1);
982 scan_block(__bss_start, __bss_stop, NULL); 1007 scan_block(__bss_start, __bss_stop, NULL, 1);
983 1008
984#ifdef CONFIG_SMP 1009#ifdef CONFIG_SMP
985 /* per-cpu sections scanning */ 1010 /* per-cpu sections scanning */
986 for_each_possible_cpu(i) 1011 for_each_possible_cpu(i)
987 scan_block(__per_cpu_start + per_cpu_offset(i), 1012 scan_block(__per_cpu_start + per_cpu_offset(i),
988 __per_cpu_end + per_cpu_offset(i), NULL); 1013 __per_cpu_end + per_cpu_offset(i), NULL, 1);
989#endif 1014#endif
990 1015
991 /* 1016 /*
@@ -1007,7 +1032,7 @@ static void kmemleak_scan(void)
1007 /* only scan if page is in use */ 1032 /* only scan if page is in use */
1008 if (page_count(page) == 0) 1033 if (page_count(page) == 0)
1009 continue; 1034 continue;
1010 scan_block(page, page + 1, NULL); 1035 scan_block(page, page + 1, NULL, 1);
1011 } 1036 }
1012 } 1037 }
1013 1038
@@ -1019,7 +1044,8 @@ static void kmemleak_scan(void)
1019 read_lock(&tasklist_lock); 1044 read_lock(&tasklist_lock);
1020 for_each_process(task) 1045 for_each_process(task)
1021 scan_block(task_stack_page(task), 1046 scan_block(task_stack_page(task),
1022 task_stack_page(task) + THREAD_SIZE, NULL); 1047 task_stack_page(task) + THREAD_SIZE,
1048 NULL, 0);
1023 read_unlock(&tasklist_lock); 1049 read_unlock(&tasklist_lock);
1024 } 1050 }
1025 1051
@@ -1031,9 +1057,10 @@ static void kmemleak_scan(void)
1031 * kmemleak objects cannot be freed from outside the loop because their 1057 * kmemleak objects cannot be freed from outside the loop because their
1032 * use_count was increased. 1058 * use_count was increased.
1033 */ 1059 */
1060repeat:
1034 object = list_entry(gray_list.next, typeof(*object), gray_list); 1061 object = list_entry(gray_list.next, typeof(*object), gray_list);
1035 while (&object->gray_list != &gray_list) { 1062 while (&object->gray_list != &gray_list) {
1036 scan_yield(); 1063 cond_resched();
1037 1064
1038 /* may add new objects to the list */ 1065 /* may add new objects to the list */
1039 if (!scan_should_stop()) 1066 if (!scan_should_stop())
@@ -1048,7 +1075,59 @@ static void kmemleak_scan(void)
1048 1075
1049 object = tmp; 1076 object = tmp;
1050 } 1077 }
1078
1079 if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES)
1080 goto scan_end;
1081
1082 /*
1083 * Check for new objects allocated during this scanning and add them
1084 * to the gray list.
1085 */
1086 rcu_read_lock();
1087 list_for_each_entry_rcu(object, &object_list, object_list) {
1088 spin_lock_irqsave(&object->lock, flags);
1089 if ((object->flags & OBJECT_NEW) && !color_black(object) &&
1090 get_object(object)) {
1091 object->flags &= ~OBJECT_NEW;
1092 list_add_tail(&object->gray_list, &gray_list);
1093 }
1094 spin_unlock_irqrestore(&object->lock, flags);
1095 }
1096 rcu_read_unlock();
1097
1098 if (!list_empty(&gray_list))
1099 goto repeat;
1100
1101scan_end:
1051 WARN_ON(!list_empty(&gray_list)); 1102 WARN_ON(!list_empty(&gray_list));
1103
1104 /*
1105 * If scanning was stopped or new objects were being allocated at a
1106 * higher rate than gray list scanning, do not report any new
1107 * unreferenced objects.
1108 */
1109 if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES)
1110 return;
1111
1112 /*
1113 * Scanning result reporting.
1114 */
1115 rcu_read_lock();
1116 list_for_each_entry_rcu(object, &object_list, object_list) {
1117 spin_lock_irqsave(&object->lock, flags);
1118 if (unreferenced_object(object) &&
1119 !(object->flags & OBJECT_REPORTED)) {
1120 object->flags |= OBJECT_REPORTED;
1121 new_leaks++;
1122 }
1123 spin_unlock_irqrestore(&object->lock, flags);
1124 }
1125 rcu_read_unlock();
1126
1127 if (new_leaks)
1128 pr_info("%d new suspected memory leaks (see "
1129 "/sys/kernel/debug/kmemleak)\n", new_leaks);
1130
1052} 1131}
1053 1132
1054/* 1133/*
@@ -1060,6 +1139,7 @@ static int kmemleak_scan_thread(void *arg)
1060 static int first_run = 1; 1139 static int first_run = 1;
1061 1140
1062 pr_info("Automatic memory scanning thread started\n"); 1141 pr_info("Automatic memory scanning thread started\n");
1142 set_user_nice(current, 10);
1063 1143
1064 /* 1144 /*
1065 * Wait before the first scan to allow the system to fully initialize. 1145 * Wait before the first scan to allow the system to fully initialize.
@@ -1070,36 +1150,12 @@ static int kmemleak_scan_thread(void *arg)
1070 } 1150 }
1071 1151
1072 while (!kthread_should_stop()) { 1152 while (!kthread_should_stop()) {
1073 struct kmemleak_object *object;
1074 signed long timeout = jiffies_scan_wait; 1153 signed long timeout = jiffies_scan_wait;
1075 1154
1076 mutex_lock(&scan_mutex); 1155 mutex_lock(&scan_mutex);
1077
1078 kmemleak_scan(); 1156 kmemleak_scan();
1079 reported_leaks = 0;
1080
1081 rcu_read_lock();
1082 list_for_each_entry_rcu(object, &object_list, object_list) {
1083 unsigned long flags;
1084
1085 if (reported_leaks >= REPORTS_NR)
1086 break;
1087 spin_lock_irqsave(&object->lock, flags);
1088 if (!(object->flags & OBJECT_REPORTED) &&
1089 unreferenced_object(object)) {
1090 print_unreferenced(NULL, object);
1091 object->flags |= OBJECT_REPORTED;
1092 reported_leaks++;
1093 } else if ((object->flags & OBJECT_REPORTED) &&
1094 referenced_object(object)) {
1095 print_referenced(object);
1096 object->flags &= ~OBJECT_REPORTED;
1097 }
1098 spin_unlock_irqrestore(&object->lock, flags);
1099 }
1100 rcu_read_unlock();
1101
1102 mutex_unlock(&scan_mutex); 1157 mutex_unlock(&scan_mutex);
1158
1103 /* wait before the next scan */ 1159 /* wait before the next scan */
1104 while (timeout && !kthread_should_stop()) 1160 while (timeout && !kthread_should_stop())
1105 timeout = schedule_timeout_interruptible(timeout); 1161 timeout = schedule_timeout_interruptible(timeout);
@@ -1112,7 +1168,7 @@ static int kmemleak_scan_thread(void *arg)
1112 1168
1113/* 1169/*
1114 * Start the automatic memory scanning thread. This function must be called 1170 * Start the automatic memory scanning thread. This function must be called
1115 * with the kmemleak_mutex held. 1171 * with the scan_mutex held.
1116 */ 1172 */
1117void start_scan_thread(void) 1173void start_scan_thread(void)
1118{ 1174{
@@ -1127,7 +1183,7 @@ void start_scan_thread(void)
1127 1183
1128/* 1184/*
1129 * Stop the automatic memory scanning thread. This function must be called 1185 * Stop the automatic memory scanning thread. This function must be called
1130 * with the kmemleak_mutex held. 1186 * with the scan_mutex held.
1131 */ 1187 */
1132void stop_scan_thread(void) 1188void stop_scan_thread(void)
1133{ 1189{
@@ -1146,13 +1202,11 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
1146{ 1202{
1147 struct kmemleak_object *object; 1203 struct kmemleak_object *object;
1148 loff_t n = *pos; 1204 loff_t n = *pos;
1205 int err;
1149 1206
1150 if (!n) { 1207 err = mutex_lock_interruptible(&scan_mutex);
1151 kmemleak_scan(); 1208 if (err < 0)
1152 reported_leaks = 0; 1209 return ERR_PTR(err);
1153 }
1154 if (reported_leaks >= REPORTS_NR)
1155 return NULL;
1156 1210
1157 rcu_read_lock(); 1211 rcu_read_lock();
1158 list_for_each_entry_rcu(object, &object_list, object_list) { 1212 list_for_each_entry_rcu(object, &object_list, object_list) {
@@ -1163,7 +1217,6 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
1163 } 1217 }
1164 object = NULL; 1218 object = NULL;
1165out: 1219out:
1166 rcu_read_unlock();
1167 return object; 1220 return object;
1168} 1221}
1169 1222
@@ -1178,17 +1231,13 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1178 struct list_head *n = &prev_obj->object_list; 1231 struct list_head *n = &prev_obj->object_list;
1179 1232
1180 ++(*pos); 1233 ++(*pos);
1181 if (reported_leaks >= REPORTS_NR)
1182 goto out;
1183 1234
1184 rcu_read_lock();
1185 list_for_each_continue_rcu(n, &object_list) { 1235 list_for_each_continue_rcu(n, &object_list) {
1186 next_obj = list_entry(n, struct kmemleak_object, object_list); 1236 next_obj = list_entry(n, struct kmemleak_object, object_list);
1187 if (get_object(next_obj)) 1237 if (get_object(next_obj))
1188 break; 1238 break;
1189 } 1239 }
1190 rcu_read_unlock(); 1240
1191out:
1192 put_object(prev_obj); 1241 put_object(prev_obj);
1193 return next_obj; 1242 return next_obj;
1194} 1243}
@@ -1198,8 +1247,16 @@ out:
1198 */ 1247 */
1199static void kmemleak_seq_stop(struct seq_file *seq, void *v) 1248static void kmemleak_seq_stop(struct seq_file *seq, void *v)
1200{ 1249{
1201 if (v) 1250 if (!IS_ERR(v)) {
1202 put_object(v); 1251 /*
1252 * kmemleak_seq_start may return ERR_PTR if the scan_mutex
1253 * waiting was interrupted, so only release it if !IS_ERR.
1254 */
1255 rcu_read_unlock();
1256 mutex_unlock(&scan_mutex);
1257 if (v)
1258 put_object(v);
1259 }
1203} 1260}
1204 1261
1205/* 1262/*
@@ -1211,11 +1268,8 @@ static int kmemleak_seq_show(struct seq_file *seq, void *v)
1211 unsigned long flags; 1268 unsigned long flags;
1212 1269
1213 spin_lock_irqsave(&object->lock, flags); 1270 spin_lock_irqsave(&object->lock, flags);
1214 if (!unreferenced_object(object)) 1271 if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object))
1215 goto out; 1272 print_unreferenced(seq, object);
1216 print_unreferenced(seq, object);
1217 reported_leaks++;
1218out:
1219 spin_unlock_irqrestore(&object->lock, flags); 1273 spin_unlock_irqrestore(&object->lock, flags);
1220 return 0; 1274 return 0;
1221} 1275}
@@ -1229,43 +1283,15 @@ static const struct seq_operations kmemleak_seq_ops = {
1229 1283
1230static int kmemleak_open(struct inode *inode, struct file *file) 1284static int kmemleak_open(struct inode *inode, struct file *file)
1231{ 1285{
1232 int ret = 0;
1233
1234 if (!atomic_read(&kmemleak_enabled)) 1286 if (!atomic_read(&kmemleak_enabled))
1235 return -EBUSY; 1287 return -EBUSY;
1236 1288
1237 ret = mutex_lock_interruptible(&kmemleak_mutex); 1289 return seq_open(file, &kmemleak_seq_ops);
1238 if (ret < 0)
1239 goto out;
1240 if (file->f_mode & FMODE_READ) {
1241 ret = mutex_lock_interruptible(&scan_mutex);
1242 if (ret < 0)
1243 goto kmemleak_unlock;
1244 ret = seq_open(file, &kmemleak_seq_ops);
1245 if (ret < 0)
1246 goto scan_unlock;
1247 }
1248 return ret;
1249
1250scan_unlock:
1251 mutex_unlock(&scan_mutex);
1252kmemleak_unlock:
1253 mutex_unlock(&kmemleak_mutex);
1254out:
1255 return ret;
1256} 1290}
1257 1291
1258static int kmemleak_release(struct inode *inode, struct file *file) 1292static int kmemleak_release(struct inode *inode, struct file *file)
1259{ 1293{
1260 int ret = 0; 1294 return seq_release(inode, file);
1261
1262 if (file->f_mode & FMODE_READ) {
1263 seq_release(inode, file);
1264 mutex_unlock(&scan_mutex);
1265 }
1266 mutex_unlock(&kmemleak_mutex);
1267
1268 return ret;
1269} 1295}
1270 1296
1271/* 1297/*
@@ -1278,21 +1304,24 @@ static int kmemleak_release(struct inode *inode, struct file *file)
1278 * scan=off - stop the automatic memory scanning thread 1304 * scan=off - stop the automatic memory scanning thread
1279 * scan=... - set the automatic memory scanning period in seconds (0 to 1305 * scan=... - set the automatic memory scanning period in seconds (0 to
1280 * disable it) 1306 * disable it)
1307 * scan - trigger a memory scan
1281 */ 1308 */
1282static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, 1309static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1283 size_t size, loff_t *ppos) 1310 size_t size, loff_t *ppos)
1284{ 1311{
1285 char buf[64]; 1312 char buf[64];
1286 int buf_size; 1313 int buf_size;
1287 1314 int ret;
1288 if (!atomic_read(&kmemleak_enabled))
1289 return -EBUSY;
1290 1315
1291 buf_size = min(size, (sizeof(buf) - 1)); 1316 buf_size = min(size, (sizeof(buf) - 1));
1292 if (strncpy_from_user(buf, user_buf, buf_size) < 0) 1317 if (strncpy_from_user(buf, user_buf, buf_size) < 0)
1293 return -EFAULT; 1318 return -EFAULT;
1294 buf[buf_size] = 0; 1319 buf[buf_size] = 0;
1295 1320
1321 ret = mutex_lock_interruptible(&scan_mutex);
1322 if (ret < 0)
1323 return ret;
1324
1296 if (strncmp(buf, "off", 3) == 0) 1325 if (strncmp(buf, "off", 3) == 0)
1297 kmemleak_disable(); 1326 kmemleak_disable();
1298 else if (strncmp(buf, "stack=on", 8) == 0) 1327 else if (strncmp(buf, "stack=on", 8) == 0)
@@ -1305,18 +1334,24 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1305 stop_scan_thread(); 1334 stop_scan_thread();
1306 else if (strncmp(buf, "scan=", 5) == 0) { 1335 else if (strncmp(buf, "scan=", 5) == 0) {
1307 unsigned long secs; 1336 unsigned long secs;
1308 int err;
1309 1337
1310 err = strict_strtoul(buf + 5, 0, &secs); 1338 ret = strict_strtoul(buf + 5, 0, &secs);
1311 if (err < 0) 1339 if (ret < 0)
1312 return err; 1340 goto out;
1313 stop_scan_thread(); 1341 stop_scan_thread();
1314 if (secs) { 1342 if (secs) {
1315 jiffies_scan_wait = msecs_to_jiffies(secs * 1000); 1343 jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
1316 start_scan_thread(); 1344 start_scan_thread();
1317 } 1345 }
1318 } else 1346 } else if (strncmp(buf, "scan", 4) == 0)
1319 return -EINVAL; 1347 kmemleak_scan();
1348 else
1349 ret = -EINVAL;
1350
1351out:
1352 mutex_unlock(&scan_mutex);
1353 if (ret < 0)
1354 return ret;
1320 1355
1321 /* ignore the rest of the buffer, only one command at a time */ 1356 /* ignore the rest of the buffer, only one command at a time */
1322 *ppos += size; 1357 *ppos += size;
@@ -1340,14 +1375,12 @@ static int kmemleak_cleanup_thread(void *arg)
1340{ 1375{
1341 struct kmemleak_object *object; 1376 struct kmemleak_object *object;
1342 1377
1343 mutex_lock(&kmemleak_mutex); 1378 mutex_lock(&scan_mutex);
1344 stop_scan_thread(); 1379 stop_scan_thread();
1345 mutex_unlock(&kmemleak_mutex);
1346 1380
1347 mutex_lock(&scan_mutex);
1348 rcu_read_lock(); 1381 rcu_read_lock();
1349 list_for_each_entry_rcu(object, &object_list, object_list) 1382 list_for_each_entry_rcu(object, &object_list, object_list)
1350 delete_object(object->pointer); 1383 delete_object_full(object->pointer);
1351 rcu_read_unlock(); 1384 rcu_read_unlock();
1352 mutex_unlock(&scan_mutex); 1385 mutex_unlock(&scan_mutex);
1353 1386
@@ -1411,7 +1444,6 @@ void __init kmemleak_init(void)
1411 int i; 1444 int i;
1412 unsigned long flags; 1445 unsigned long flags;
1413 1446
1414 jiffies_scan_yield = msecs_to_jiffies(MSECS_SCAN_YIELD);
1415 jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); 1447 jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
1416 jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); 1448 jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
1417 1449
@@ -1443,6 +1475,9 @@ void __init kmemleak_init(void)
1443 case KMEMLEAK_FREE: 1475 case KMEMLEAK_FREE:
1444 kmemleak_free(log->ptr); 1476 kmemleak_free(log->ptr);
1445 break; 1477 break;
1478 case KMEMLEAK_FREE_PART:
1479 kmemleak_free_part(log->ptr, log->size);
1480 break;
1446 case KMEMLEAK_NOT_LEAK: 1481 case KMEMLEAK_NOT_LEAK:
1447 kmemleak_not_leak(log->ptr); 1482 kmemleak_not_leak(log->ptr);
1448 break; 1483 break;
@@ -1486,9 +1521,9 @@ static int __init kmemleak_late_init(void)
1486 &kmemleak_fops); 1521 &kmemleak_fops);
1487 if (!dentry) 1522 if (!dentry)
1488 pr_warning("Failed to create the debugfs kmemleak file\n"); 1523 pr_warning("Failed to create the debugfs kmemleak file\n");
1489 mutex_lock(&kmemleak_mutex); 1524 mutex_lock(&scan_mutex);
1490 start_scan_thread(); 1525 start_scan_thread();
1491 mutex_unlock(&kmemleak_mutex); 1526 mutex_unlock(&scan_mutex);
1492 1527
1493 pr_info("Kernel memory leak detector initialized\n"); 1528 pr_info("Kernel memory leak detector initialized\n");
1494 1529
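
With the kmemleak rework above, scan_mutex alone now serializes the scanner,
the debugfs file and the tuning parameters, and kmemleak_write() gains an
on-demand "scan" command. A usage sketch based on the commands listed in the
hunk above (assuming debugfs is mounted at /sys/kernel/debug, the path used in
the patch's own messages):

    # trigger an immediate memory scan, then list any newly reported leaks
    echo scan > /sys/kernel/debug/kmemleak
    cat /sys/kernel/debug/kmemleak
    # set the automatic scan period to 600 seconds (0 disables the scan thread)
    echo scan=600 > /sys/kernel/debug/kmemleak
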
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e2fa20dadf40..fd4529d86de5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1207,6 +1207,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1207 ret = 0; 1207 ret = 0;
1208out: 1208out:
1209 unlock_page_cgroup(pc); 1209 unlock_page_cgroup(pc);
1210 /*
1211 * We charges against "to" which may not have any tasks. Then, "to"
1212 * can be under rmdir(). But in current implementation, caller of
1213 * this function is just force_empty() and it's garanteed that
1214 * "to" is never removed. So, we don't check rmdir status here.
1215 */
1210 return ret; 1216 return ret;
1211} 1217}
1212 1218
@@ -1428,6 +1434,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1428 return; 1434 return;
1429 if (!ptr) 1435 if (!ptr)
1430 return; 1436 return;
1437 cgroup_exclude_rmdir(&ptr->css);
1431 pc = lookup_page_cgroup(page); 1438 pc = lookup_page_cgroup(page);
1432 mem_cgroup_lru_del_before_commit_swapcache(page); 1439 mem_cgroup_lru_del_before_commit_swapcache(page);
1433 __mem_cgroup_commit_charge(ptr, pc, ctype); 1440 __mem_cgroup_commit_charge(ptr, pc, ctype);
@@ -1457,8 +1464,12 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1457 } 1464 }
1458 rcu_read_unlock(); 1465 rcu_read_unlock();
1459 } 1466 }
1460 /* add this page(page_cgroup) to the LRU we want. */ 1467 /*
1461 1468 * At swapin, we may charge account against cgroup which has no tasks.
1469 * So, rmdir()->pre_destroy() can be called while we do this charge.
1470 * In that case, we need to call pre_destroy() again. check it here.
1471 */
1472 cgroup_release_and_wakeup_rmdir(&ptr->css);
1462} 1473}
1463 1474
1464void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 1475void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
@@ -1664,7 +1675,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
1664 1675
1665 if (!mem) 1676 if (!mem)
1666 return; 1677 return;
1667 1678 cgroup_exclude_rmdir(&mem->css);
1668 /* at migration success, oldpage->mapping is NULL. */ 1679 /* at migration success, oldpage->mapping is NULL. */
1669 if (oldpage->mapping) { 1680 if (oldpage->mapping) {
1670 target = oldpage; 1681 target = oldpage;
@@ -1704,6 +1715,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
1704 */ 1715 */
1705 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 1716 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
1706 mem_cgroup_uncharge_page(target); 1717 mem_cgroup_uncharge_page(target);
1718 /*
1719 * At migration, we may charge account against cgroup which has no tasks
1720 * So, rmdir()->pre_destroy() can be called while we do this charge.
1721 * In that case, we need to call pre_destroy() again. check it here.
1722 */
1723 cgroup_release_and_wakeup_rmdir(&mem->css);
1707} 1724}
1708 1725
1709/* 1726/*
@@ -1973,7 +1990,7 @@ try_to_free:
1973 if (!progress) { 1990 if (!progress) {
1974 nr_retries--; 1991 nr_retries--;
1975 /* maybe some writeback is necessary */ 1992 /* maybe some writeback is necessary */
1976 congestion_wait(WRITE, HZ/10); 1993 congestion_wait(BLK_RW_ASYNC, HZ/10);
1977 } 1994 }
1978 1995
1979 } 1996 }
diff --git a/mm/memory.c b/mm/memory.c
index f46ac18ba231..aede2ce3aba4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -135,11 +135,12 @@ void pmd_clear_bad(pmd_t *pmd)
135 * Note: this doesn't free the actual pages themselves. That 135 * Note: this doesn't free the actual pages themselves. That
136 * has been handled earlier when unmapping all the memory regions. 136 * has been handled earlier when unmapping all the memory regions.
137 */ 137 */
138static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) 138static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
139 unsigned long addr)
139{ 140{
140 pgtable_t token = pmd_pgtable(*pmd); 141 pgtable_t token = pmd_pgtable(*pmd);
141 pmd_clear(pmd); 142 pmd_clear(pmd);
142 pte_free_tlb(tlb, token); 143 pte_free_tlb(tlb, token, addr);
143 tlb->mm->nr_ptes--; 144 tlb->mm->nr_ptes--;
144} 145}
145 146
@@ -157,7 +158,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
157 next = pmd_addr_end(addr, end); 158 next = pmd_addr_end(addr, end);
158 if (pmd_none_or_clear_bad(pmd)) 159 if (pmd_none_or_clear_bad(pmd))
159 continue; 160 continue;
160 free_pte_range(tlb, pmd); 161 free_pte_range(tlb, pmd, addr);
161 } while (pmd++, addr = next, addr != end); 162 } while (pmd++, addr = next, addr != end);
162 163
163 start &= PUD_MASK; 164 start &= PUD_MASK;
@@ -173,7 +174,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
173 174
174 pmd = pmd_offset(pud, start); 175 pmd = pmd_offset(pud, start);
175 pud_clear(pud); 176 pud_clear(pud);
176 pmd_free_tlb(tlb, pmd); 177 pmd_free_tlb(tlb, pmd, start);
177} 178}
178 179
179static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 180static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -206,7 +207,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
206 207
207 pud = pud_offset(pgd, start); 208 pud = pud_offset(pgd, start);
208 pgd_clear(pgd); 209 pgd_clear(pgd);
209 pud_free_tlb(tlb, pud); 210 pud_free_tlb(tlb, pud, start);
210} 211}
211 212
212/* 213/*
@@ -1207,8 +1208,8 @@ static inline int use_zero_page(struct vm_area_struct *vma)
1207 1208
1208 1209
1209int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1210int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1210 unsigned long start, int len, int flags, 1211 unsigned long start, int nr_pages, int flags,
1211 struct page **pages, struct vm_area_struct **vmas) 1212 struct page **pages, struct vm_area_struct **vmas)
1212{ 1213{
1213 int i; 1214 int i;
1214 unsigned int vm_flags = 0; 1215 unsigned int vm_flags = 0;
@@ -1217,7 +1218,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1217 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); 1218 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1218 int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); 1219 int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
1219 1220
1220 if (len <= 0) 1221 if (nr_pages <= 0)
1221 return 0; 1222 return 0;
1222 /* 1223 /*
1223 * Require read or write permissions. 1224 * Require read or write permissions.
@@ -1269,7 +1270,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1269 vmas[i] = gate_vma; 1270 vmas[i] = gate_vma;
1270 i++; 1271 i++;
1271 start += PAGE_SIZE; 1272 start += PAGE_SIZE;
1272 len--; 1273 nr_pages--;
1273 continue; 1274 continue;
1274 } 1275 }
1275 1276
@@ -1280,7 +1281,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1280 1281
1281 if (is_vm_hugetlb_page(vma)) { 1282 if (is_vm_hugetlb_page(vma)) {
1282 i = follow_hugetlb_page(mm, vma, pages, vmas, 1283 i = follow_hugetlb_page(mm, vma, pages, vmas,
1283 &start, &len, i, write); 1284 &start, &nr_pages, i, write);
1284 continue; 1285 continue;
1285 } 1286 }
1286 1287
@@ -1357,9 +1358,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1357 vmas[i] = vma; 1358 vmas[i] = vma;
1358 i++; 1359 i++;
1359 start += PAGE_SIZE; 1360 start += PAGE_SIZE;
1360 len--; 1361 nr_pages--;
1361 } while (len && start < vma->vm_end); 1362 } while (nr_pages && start < vma->vm_end);
1362 } while (len); 1363 } while (nr_pages);
1363 return i; 1364 return i;
1364} 1365}
1365 1366
@@ -1368,7 +1369,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1368 * @tsk: task_struct of target task 1369 * @tsk: task_struct of target task
1369 * @mm: mm_struct of target mm 1370 * @mm: mm_struct of target mm
1370 * @start: starting user address 1371 * @start: starting user address
1371 * @len: number of pages from start to pin 1372 * @nr_pages: number of pages from start to pin
1372 * @write: whether pages will be written to by the caller 1373 * @write: whether pages will be written to by the caller
1373 * @force: whether to force write access even if user mapping is 1374 * @force: whether to force write access even if user mapping is
1374 * readonly. This will result in the page being COWed even 1375 * readonly. This will result in the page being COWed even
@@ -1380,7 +1381,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1380 * Or NULL if the caller does not require them. 1381 * Or NULL if the caller does not require them.
1381 * 1382 *
1382 * Returns number of pages pinned. This may be fewer than the number 1383 * Returns number of pages pinned. This may be fewer than the number
1383 * requested. If len is 0 or negative, returns 0. If no pages 1384 * requested. If nr_pages is 0 or negative, returns 0. If no pages
1384 * were pinned, returns -errno. Each page returned must be released 1385 * were pinned, returns -errno. Each page returned must be released
1385 * with a put_page() call when it is finished with. vmas will only 1386 * with a put_page() call when it is finished with. vmas will only
1386 * remain valid while mmap_sem is held. 1387 * remain valid while mmap_sem is held.
@@ -1414,7 +1415,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1414 * See also get_user_pages_fast, for performance critical applications. 1415 * See also get_user_pages_fast, for performance critical applications.
1415 */ 1416 */
1416int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1417int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1417 unsigned long start, int len, int write, int force, 1418 unsigned long start, int nr_pages, int write, int force,
1418 struct page **pages, struct vm_area_struct **vmas) 1419 struct page **pages, struct vm_area_struct **vmas)
1419{ 1420{
1420 int flags = 0; 1421 int flags = 0;
@@ -1424,9 +1425,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1424 if (force) 1425 if (force)
1425 flags |= GUP_FLAGS_FORCE; 1426 flags |= GUP_FLAGS_FORCE;
1426 1427
1427 return __get_user_pages(tsk, mm, 1428 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
1428 start, len, flags,
1429 pages, vmas);
1430} 1429}
1431 1430
1432EXPORT_SYMBOL(get_user_pages); 1431EXPORT_SYMBOL(get_user_pages);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e08e2c4da63a..7dd9d9f80694 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -191,25 +191,27 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
191 * Must be called holding task's alloc_lock to protect task's mems_allowed 191 * Must be called holding task's alloc_lock to protect task's mems_allowed
192 * and mempolicy. May also be called holding the mmap_semaphore for write. 192 * and mempolicy. May also be called holding the mmap_semaphore for write.
193 */ 193 */
194static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) 194static int mpol_set_nodemask(struct mempolicy *pol,
195 const nodemask_t *nodes, struct nodemask_scratch *nsc)
195{ 196{
196 nodemask_t cpuset_context_nmask;
197 int ret; 197 int ret;
198 198
199 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ 199 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
200 if (pol == NULL) 200 if (pol == NULL)
201 return 0; 201 return 0;
202 /* Check N_HIGH_MEMORY */
203 nodes_and(nsc->mask1,
204 cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
202 205
203 VM_BUG_ON(!nodes); 206 VM_BUG_ON(!nodes);
204 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) 207 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
205 nodes = NULL; /* explicit local allocation */ 208 nodes = NULL; /* explicit local allocation */
206 else { 209 else {
207 if (pol->flags & MPOL_F_RELATIVE_NODES) 210 if (pol->flags & MPOL_F_RELATIVE_NODES)
208 mpol_relative_nodemask(&cpuset_context_nmask, nodes, 211 mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
209 &cpuset_current_mems_allowed);
210 else 212 else
211 nodes_and(cpuset_context_nmask, *nodes, 213 nodes_and(nsc->mask2, *nodes, nsc->mask1);
212 cpuset_current_mems_allowed); 214
213 if (mpol_store_user_nodemask(pol)) 215 if (mpol_store_user_nodemask(pol))
214 pol->w.user_nodemask = *nodes; 216 pol->w.user_nodemask = *nodes;
215 else 217 else
@@ -217,8 +219,10 @@ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
217 cpuset_current_mems_allowed; 219 cpuset_current_mems_allowed;
218 } 220 }
219 221
220 ret = mpol_ops[pol->mode].create(pol, 222 if (nodes)
221 nodes ? &cpuset_context_nmask : NULL); 223 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
224 else
225 ret = mpol_ops[pol->mode].create(pol, NULL);
222 return ret; 226 return ret;
223} 227}
224 228
@@ -620,12 +624,17 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
620{ 624{
621 struct mempolicy *new, *old; 625 struct mempolicy *new, *old;
622 struct mm_struct *mm = current->mm; 626 struct mm_struct *mm = current->mm;
627 NODEMASK_SCRATCH(scratch);
623 int ret; 628 int ret;
624 629
625 new = mpol_new(mode, flags, nodes); 630 if (!scratch)
626 if (IS_ERR(new)) 631 return -ENOMEM;
627 return PTR_ERR(new);
628 632
633 new = mpol_new(mode, flags, nodes);
634 if (IS_ERR(new)) {
635 ret = PTR_ERR(new);
636 goto out;
637 }
629 /* 638 /*
630 * prevent changing our mempolicy while show_numa_maps() 639 * prevent changing our mempolicy while show_numa_maps()
631 * is using it. 640 * is using it.
@@ -635,13 +644,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
635 if (mm) 644 if (mm)
636 down_write(&mm->mmap_sem); 645 down_write(&mm->mmap_sem);
637 task_lock(current); 646 task_lock(current);
638 ret = mpol_set_nodemask(new, nodes); 647 ret = mpol_set_nodemask(new, nodes, scratch);
639 if (ret) { 648 if (ret) {
640 task_unlock(current); 649 task_unlock(current);
641 if (mm) 650 if (mm)
642 up_write(&mm->mmap_sem); 651 up_write(&mm->mmap_sem);
643 mpol_put(new); 652 mpol_put(new);
644 return ret; 653 goto out;
645 } 654 }
646 old = current->mempolicy; 655 old = current->mempolicy;
647 current->mempolicy = new; 656 current->mempolicy = new;
@@ -654,7 +663,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
654 up_write(&mm->mmap_sem); 663 up_write(&mm->mmap_sem);
655 664
656 mpol_put(old); 665 mpol_put(old);
657 return 0; 666 ret = 0;
667out:
668 NODEMASK_SCRATCH_FREE(scratch);
669 return ret;
658} 670}
659 671
660/* 672/*
@@ -1014,12 +1026,20 @@ static long do_mbind(unsigned long start, unsigned long len,
1014 if (err) 1026 if (err)
1015 return err; 1027 return err;
1016 } 1028 }
1017 down_write(&mm->mmap_sem); 1029 {
1018 task_lock(current); 1030 NODEMASK_SCRATCH(scratch);
1019 err = mpol_set_nodemask(new, nmask); 1031 if (scratch) {
1020 task_unlock(current); 1032 down_write(&mm->mmap_sem);
1033 task_lock(current);
1034 err = mpol_set_nodemask(new, nmask, scratch);
1035 task_unlock(current);
1036 if (err)
1037 up_write(&mm->mmap_sem);
1038 } else
1039 err = -ENOMEM;
1040 NODEMASK_SCRATCH_FREE(scratch);
1041 }
1021 if (err) { 1042 if (err) {
1022 up_write(&mm->mmap_sem);
1023 mpol_put(new); 1043 mpol_put(new);
1024 return err; 1044 return err;
1025 } 1045 }
@@ -1891,6 +1911,7 @@ restart:
1891 * Install non-NULL @mpol in inode's shared policy rb-tree. 1911 * Install non-NULL @mpol in inode's shared policy rb-tree.
1892 * On entry, the current task has a reference on a non-NULL @mpol. 1912 * On entry, the current task has a reference on a non-NULL @mpol.
1893 * This must be released on exit. 1913 * This must be released on exit.
1914 * This is called at get_inode() calls and we can use GFP_KERNEL.
1894 */ 1915 */
1895void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 1916void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1896{ 1917{
@@ -1902,19 +1923,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1902 if (mpol) { 1923 if (mpol) {
1903 struct vm_area_struct pvma; 1924 struct vm_area_struct pvma;
1904 struct mempolicy *new; 1925 struct mempolicy *new;
1926 NODEMASK_SCRATCH(scratch);
1905 1927
1928 if (!scratch)
1929 return;
1906 /* contextualize the tmpfs mount point mempolicy */ 1930 /* contextualize the tmpfs mount point mempolicy */
1907 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 1931 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1908 if (IS_ERR(new)) { 1932 if (IS_ERR(new)) {
1909 mpol_put(mpol); /* drop our ref on sb mpol */ 1933 mpol_put(mpol); /* drop our ref on sb mpol */
1934 NODEMASK_SCRATCH_FREE(scratch);
1910 return; /* no valid nodemask intersection */ 1935 return; /* no valid nodemask intersection */
1911 } 1936 }
1912 1937
1913 task_lock(current); 1938 task_lock(current);
1914 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); 1939 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
1915 task_unlock(current); 1940 task_unlock(current);
1916 mpol_put(mpol); /* drop our ref on sb mpol */ 1941 mpol_put(mpol); /* drop our ref on sb mpol */
1917 if (ret) { 1942 if (ret) {
1943 NODEMASK_SCRATCH_FREE(scratch);
1918 mpol_put(new); 1944 mpol_put(new);
1919 return; 1945 return;
1920 } 1946 }
@@ -1924,6 +1950,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1924 pvma.vm_end = TASK_SIZE; /* policy covers entire file */ 1950 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
1925 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ 1951 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1926 mpol_put(new); /* drop initial ref */ 1952 mpol_put(new); /* drop initial ref */
1953 NODEMASK_SCRATCH_FREE(scratch);
1927 } 1954 }
1928} 1955}
1929 1956
@@ -2140,13 +2167,18 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2140 err = 1; 2167 err = 1;
2141 else { 2168 else {
2142 int ret; 2169 int ret;
2143 2170 NODEMASK_SCRATCH(scratch);
2144 task_lock(current); 2171 if (scratch) {
2145 ret = mpol_set_nodemask(new, &nodes); 2172 task_lock(current);
2146 task_unlock(current); 2173 ret = mpol_set_nodemask(new, &nodes, scratch);
2147 if (ret) 2174 task_unlock(current);
2175 } else
2176 ret = -ENOMEM;
2177 NODEMASK_SCRATCH_FREE(scratch);
2178 if (ret) {
2148 err = 1; 2179 err = 1;
2149 else if (no_context) { 2180 mpol_put(new);
2181 } else if (no_context) {
2150 /* save for contextualization */ 2182 /* save for contextualization */
2151 new->w.user_nodemask = nodes; 2183 new->w.user_nodemask = nodes;
2152 } 2184 }
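
The mempolicy hunks above all converge on one calling pattern for the new scratch argument: allocate a nodemask_scratch, hand it to mpol_set_nodemask() under task_lock(), and free it on every exit path. A minimal sketch of that pattern, assuming it sits beside the call sites in mm/mempolicy.c (mpol_set_nodemask() is file-local there); the example_ name is hypothetical:

        /* Sketch only: mirrors the call sites above, not code from the patch. */
        static int example_apply_nodemask(struct mempolicy *pol, nodemask_t *nodes)
        {
                int ret = -ENOMEM;
                NODEMASK_SCRATCH(scratch);      /* may kmalloc on large-node configs */

                if (!scratch)
                        return ret;

                task_lock(current);
                ret = mpol_set_nodemask(pol, nodes, scratch);
                task_unlock(current);

                NODEMASK_SCRATCH_FREE(scratch);
                return ret;
        }

Keeping the nodemasks in the scratch buffer rather than on the stack is the point of the series: on large-node configurations a nodemask_t no longer fits comfortably in a kernel stack frame.
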
diff --git a/mm/mempool.c b/mm/mempool.c
index a46eb1b4bb66..32e75d400503 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -303,14 +303,14 @@ EXPORT_SYMBOL(mempool_free_slab);
303 */ 303 */
304void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) 304void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
305{ 305{
306 size_t size = (size_t)(long)pool_data; 306 size_t size = (size_t)pool_data;
307 return kmalloc(size, gfp_mask); 307 return kmalloc(size, gfp_mask);
308} 308}
309EXPORT_SYMBOL(mempool_kmalloc); 309EXPORT_SYMBOL(mempool_kmalloc);
310 310
311void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) 311void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
312{ 312{
313 size_t size = (size_t) pool_data; 313 size_t size = (size_t)pool_data;
314 return kzalloc(size, gfp_mask); 314 return kzalloc(size, gfp_mask);
315} 315}
316EXPORT_SYMBOL(mempool_kzalloc); 316EXPORT_SYMBOL(mempool_kzalloc);
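
The mempool change above is only a cast cleanup, but it shows how mempool_kmalloc()/mempool_kzalloc() are meant to be used: pool_data carries the element size smuggled through a void pointer. A minimal sketch of a caller, assuming a hypothetical pool of eight 256-byte elements; mempool_kfree() is the stock free counterpart:

        #include <linux/mempool.h>
        #include <linux/slab.h>

        /* Sketch only: create a kmalloc-backed pool; pool_data is the size. */
        static mempool_t *example_create_pool(void)
        {
                return mempool_create(8, mempool_kmalloc, mempool_kfree,
                                      (void *)(size_t)256);
        }
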
diff --git a/mm/nommu.c b/mm/nommu.c
index 2fd2ad5da98e..53cab10fece4 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -173,8 +173,8 @@ unsigned int kobjsize(const void *objp)
173} 173}
174 174
175int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 175int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
176 unsigned long start, int len, int flags, 176 unsigned long start, int nr_pages, int flags,
177 struct page **pages, struct vm_area_struct **vmas) 177 struct page **pages, struct vm_area_struct **vmas)
178{ 178{
179 struct vm_area_struct *vma; 179 struct vm_area_struct *vma;
180 unsigned long vm_flags; 180 unsigned long vm_flags;
@@ -189,7 +189,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
189 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 189 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
190 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 190 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
191 191
192 for (i = 0; i < len; i++) { 192 for (i = 0; i < nr_pages; i++) {
193 vma = find_vma(mm, start); 193 vma = find_vma(mm, start);
194 if (!vma) 194 if (!vma)
195 goto finish_or_fault; 195 goto finish_or_fault;
@@ -224,7 +224,7 @@ finish_or_fault:
224 * - don't permit access to VMAs that don't support it, such as I/O mappings 224 * - don't permit access to VMAs that don't support it, such as I/O mappings
225 */ 225 */
226int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 226int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
227 unsigned long start, int len, int write, int force, 227 unsigned long start, int nr_pages, int write, int force,
228 struct page **pages, struct vm_area_struct **vmas) 228 struct page **pages, struct vm_area_struct **vmas)
229{ 229{
230 int flags = 0; 230 int flags = 0;
@@ -234,12 +234,31 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
234 if (force) 234 if (force)
235 flags |= GUP_FLAGS_FORCE; 235 flags |= GUP_FLAGS_FORCE;
236 236
237 return __get_user_pages(tsk, mm, 237 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
238 start, len, flags,
239 pages, vmas);
240} 238}
241EXPORT_SYMBOL(get_user_pages); 239EXPORT_SYMBOL(get_user_pages);
242 240
241/**
242 * follow_pfn - look up PFN at a user virtual address
243 * @vma: memory mapping
244 * @address: user virtual address
245 * @pfn: location to store found PFN
246 *
247 * Only IO mappings and raw PFN mappings are allowed.
248 *
249 * Returns zero and the pfn at @pfn on success, -ve otherwise.
250 */
251int follow_pfn(struct vm_area_struct *vma, unsigned long address,
252 unsigned long *pfn)
253{
254 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
255 return -EINVAL;
256
257 *pfn = address >> PAGE_SHIFT;
258 return 0;
259}
260EXPORT_SYMBOL(follow_pfn);
261
243DEFINE_RWLOCK(vmlist_lock); 262DEFINE_RWLOCK(vmlist_lock);
244struct vm_struct *vmlist; 263struct vm_struct *vmlist;
245 264
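
follow_pfn() above lets !MMU drivers look up the PFN behind a user virtual address in an I/O or raw-PFN mapping; with no page tables to walk, it simply shifts the address. A minimal sketch of a caller, assuming the usual mmap_sem protection around find_vma(); example_addr_to_pfn() is hypothetical:

        #include <linux/mm.h>
        #include <linux/rwsem.h>
        #include <linux/sched.h>

        /* Sketch only: translate a user address in a VM_IO/VM_PFNMAP vma to a PFN. */
        static int example_addr_to_pfn(struct mm_struct *mm, unsigned long addr,
                                       unsigned long *pfn)
        {
                struct vm_area_struct *vma;
                int ret = -EINVAL;

                down_read(&mm->mmap_sem);
                vma = find_vma(mm, addr);
                if (vma && addr >= vma->vm_start)
                        ret = follow_pfn(vma, addr, pfn);
                up_read(&mm->mmap_sem);
                return ret;
        }
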
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7b0dcea4935b..81627ebcd313 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -541,8 +541,11 @@ static void balance_dirty_pages(struct address_space *mapping)
541 * filesystems (i.e. NFS) in which data may have been 541 * filesystems (i.e. NFS) in which data may have been
542 * written to the server's write cache, but has not yet 542 * written to the server's write cache, but has not yet
543 * been flushed to permanent storage. 543 * been flushed to permanent storage.
544 * Only move pages to writeback if this bdi is over its
545 * threshold otherwise wait until the disk writes catch
546 * up.
544 */ 547 */
545 if (bdi_nr_reclaimable) { 548 if (bdi_nr_reclaimable > bdi_thresh) {
546 writeback_inodes(&wbc); 549 writeback_inodes(&wbc);
547 pages_written += write_chunk - wbc.nr_to_write; 550 pages_written += write_chunk - wbc.nr_to_write;
548 get_dirty_limits(&background_thresh, &dirty_thresh, 551 get_dirty_limits(&background_thresh, &dirty_thresh,
@@ -572,7 +575,7 @@ static void balance_dirty_pages(struct address_space *mapping)
572 if (pages_written >= write_chunk) 575 if (pages_written >= write_chunk)
573 break; /* We've done our duty */ 576 break; /* We've done our duty */
574 577
575 congestion_wait(WRITE, HZ/10); 578 congestion_wait(BLK_RW_ASYNC, HZ/10);
576 } 579 }
577 580
578 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 581 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -666,7 +669,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
666 if (global_page_state(NR_UNSTABLE_NFS) + 669 if (global_page_state(NR_UNSTABLE_NFS) +
667 global_page_state(NR_WRITEBACK) <= dirty_thresh) 670 global_page_state(NR_WRITEBACK) <= dirty_thresh)
668 break; 671 break;
669 congestion_wait(WRITE, HZ/10); 672 congestion_wait(BLK_RW_ASYNC, HZ/10);
670 673
671 /* 674 /*
672 * The caller might hold locks which can prevent IO completion 675 * The caller might hold locks which can prevent IO completion
@@ -712,7 +715,7 @@ static void background_writeout(unsigned long _min_pages)
712 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { 715 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
713 /* Wrote less than expected */ 716 /* Wrote less than expected */
714 if (wbc.encountered_congestion || wbc.more_io) 717 if (wbc.encountered_congestion || wbc.more_io)
715 congestion_wait(WRITE, HZ/10); 718 congestion_wait(BLK_RW_ASYNC, HZ/10);
716 else 719 else
717 break; 720 break;
718 } 721 }
@@ -784,7 +787,7 @@ static void wb_kupdate(unsigned long arg)
784 writeback_inodes(&wbc); 787 writeback_inodes(&wbc);
785 if (wbc.nr_to_write > 0) { 788 if (wbc.nr_to_write > 0) {
786 if (wbc.encountered_congestion || wbc.more_io) 789 if (wbc.encountered_congestion || wbc.more_io)
787 congestion_wait(WRITE, HZ/10); 790 congestion_wait(BLK_RW_ASYNC, HZ/10);
788 else 791 else
789 break; /* All the old data is written */ 792 break; /* All the old data is written */
790 } 793 }
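
The writeback hunks above key congestion_wait() by queue direction (BLK_RW_ASYNC here; BLK_RW_SYNC is its synchronous counterpart) instead of READ/WRITE. A minimal sketch of a caller backing off while asynchronous writeback is congested; example_backoff() is hypothetical:

        #include <linux/backing-dev.h>
        #include <linux/jiffies.h>

        /* Sketch only: wait up to ~100ms for async write congestion to clear. */
        static void example_backoff(void)
        {
                congestion_wait(BLK_RW_ASYNC, HZ / 10);
        }
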
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5d714f8fb303..d052abbe3063 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -882,7 +882,7 @@ retry_reserve:
882 */ 882 */
883static int rmqueue_bulk(struct zone *zone, unsigned int order, 883static int rmqueue_bulk(struct zone *zone, unsigned int order,
884 unsigned long count, struct list_head *list, 884 unsigned long count, struct list_head *list,
885 int migratetype) 885 int migratetype, int cold)
886{ 886{
887 int i; 887 int i;
888 888
@@ -901,7 +901,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
901 * merge IO requests if the physical pages are ordered 901 * merge IO requests if the physical pages are ordered
902 * properly. 902 * properly.
903 */ 903 */
904 list_add(&page->lru, list); 904 if (likely(cold == 0))
905 list_add(&page->lru, list);
906 else
907 list_add_tail(&page->lru, list);
905 set_page_private(page, migratetype); 908 set_page_private(page, migratetype);
906 list = &page->lru; 909 list = &page->lru;
907 } 910 }
@@ -1119,7 +1122,8 @@ again:
1119 local_irq_save(flags); 1122 local_irq_save(flags);
1120 if (!pcp->count) { 1123 if (!pcp->count) {
1121 pcp->count = rmqueue_bulk(zone, 0, 1124 pcp->count = rmqueue_bulk(zone, 0,
1122 pcp->batch, &pcp->list, migratetype); 1125 pcp->batch, &pcp->list,
1126 migratetype, cold);
1123 if (unlikely(!pcp->count)) 1127 if (unlikely(!pcp->count))
1124 goto failed; 1128 goto failed;
1125 } 1129 }
@@ -1138,7 +1142,8 @@ again:
1138 /* Allocate more to the pcp list if necessary */ 1142 /* Allocate more to the pcp list if necessary */
1139 if (unlikely(&page->lru == &pcp->list)) { 1143 if (unlikely(&page->lru == &pcp->list)) {
1140 pcp->count += rmqueue_bulk(zone, 0, 1144 pcp->count += rmqueue_bulk(zone, 0,
1141 pcp->batch, &pcp->list, migratetype); 1145 pcp->batch, &pcp->list,
1146 migratetype, cold);
1142 page = list_entry(pcp->list.next, struct page, lru); 1147 page = list_entry(pcp->list.next, struct page, lru);
1143 } 1148 }
1144 1149
@@ -1666,7 +1671,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1666 preferred_zone, migratetype); 1671 preferred_zone, migratetype);
1667 1672
1668 if (!page && gfp_mask & __GFP_NOFAIL) 1673 if (!page && gfp_mask & __GFP_NOFAIL)
1669 congestion_wait(WRITE, HZ/50); 1674 congestion_wait(BLK_RW_ASYNC, HZ/50);
1670 } while (!page && (gfp_mask & __GFP_NOFAIL)); 1675 } while (!page && (gfp_mask & __GFP_NOFAIL));
1671 1676
1672 return page; 1677 return page;
@@ -1740,8 +1745,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1740 * be using allocators in order of preference for an area that is 1745 * be using allocators in order of preference for an area that is
1741 * too large. 1746 * too large.
1742 */ 1747 */
1743 if (WARN_ON_ONCE(order >= MAX_ORDER)) 1748 if (order >= MAX_ORDER) {
1749 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
1744 return NULL; 1750 return NULL;
1751 }
1745 1752
1746 /* 1753 /*
1747 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1754 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1789,6 +1796,10 @@ rebalance:
1789 if (p->flags & PF_MEMALLOC) 1796 if (p->flags & PF_MEMALLOC)
1790 goto nopage; 1797 goto nopage;
1791 1798
1799 /* Avoid allocations with no watermarks from looping endlessly */
1800 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
1801 goto nopage;
1802
1792 /* Try direct reclaim and then allocating */ 1803 /* Try direct reclaim and then allocating */
1793 page = __alloc_pages_direct_reclaim(gfp_mask, order, 1804 page = __alloc_pages_direct_reclaim(gfp_mask, order,
1794 zonelist, high_zoneidx, 1805 zonelist, high_zoneidx,
@@ -1831,7 +1842,7 @@ rebalance:
1831 pages_reclaimed += did_some_progress; 1842 pages_reclaimed += did_some_progress;
1832 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { 1843 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
1833 /* Wait for some write requests to complete then retry */ 1844 /* Wait for some write requests to complete then retry */
1834 congestion_wait(WRITE, HZ/50); 1845 congestion_wait(BLK_RW_ASYNC, HZ/50);
1835 goto rebalance; 1846 goto rebalance;
1836 } 1847 }
1837 1848
@@ -1983,7 +1994,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
1983 unsigned long alloc_end = addr + (PAGE_SIZE << order); 1994 unsigned long alloc_end = addr + (PAGE_SIZE << order);
1984 unsigned long used = addr + PAGE_ALIGN(size); 1995 unsigned long used = addr + PAGE_ALIGN(size);
1985 1996
1986 split_page(virt_to_page(addr), order); 1997 split_page(virt_to_page((void *)addr), order);
1987 while (used < alloc_end) { 1998 while (used < alloc_end) {
1988 free_page(used); 1999 free_page(used);
1989 used += PAGE_SIZE; 2000 used += PAGE_SIZE;
@@ -4032,6 +4043,8 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
4032 int i, nid; 4043 int i, nid;
4033 unsigned long usable_startpfn; 4044 unsigned long usable_startpfn;
4034 unsigned long kernelcore_node, kernelcore_remaining; 4045 unsigned long kernelcore_node, kernelcore_remaining;
4046 /* save the state before borrowing the nodemask */

4047 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
4035 unsigned long totalpages = early_calculate_totalpages(); 4048 unsigned long totalpages = early_calculate_totalpages();
4036 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 4049 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
4037 4050
@@ -4059,7 +4072,7 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
4059 4072
4060 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 4073 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
4061 if (!required_kernelcore) 4074 if (!required_kernelcore)
4062 return; 4075 goto out;
4063 4076
4064 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 4077 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
4065 find_usable_zone_for_movable(); 4078 find_usable_zone_for_movable();
@@ -4158,6 +4171,10 @@ restart:
4158 for (nid = 0; nid < MAX_NUMNODES; nid++) 4171 for (nid = 0; nid < MAX_NUMNODES; nid++)
4159 zone_movable_pfn[nid] = 4172 zone_movable_pfn[nid] =
4160 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 4173 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4174
4175out:
4176 /* restore the node_state */
4177 node_states[N_HIGH_MEMORY] = saved_node_state;
4161} 4178}
4162 4179
4163/* Any regular memory on that node ? */ 4180/* Any regular memory on that node ? */
@@ -4242,11 +4259,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4242 early_node_map[i].start_pfn, 4259 early_node_map[i].start_pfn,
4243 early_node_map[i].end_pfn); 4260 early_node_map[i].end_pfn);
4244 4261
4245 /*
4246 * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init
4247 * that node_mask, clear it at first
4248 */
4249 nodes_clear(node_states[N_HIGH_MEMORY]);
4250 /* Initialise every node */ 4262 /* Initialise every node */
4251 mminit_verify_pageflags_layout(); 4263 mminit_verify_pageflags_layout();
4252 setup_nr_node_ids(); 4264 setup_nr_node_ids();
@@ -4744,8 +4756,10 @@ void *__init alloc_large_system_hash(const char *tablename,
4744 * some pages at the end of hash table which 4756 * some pages at the end of hash table which
4745 * alloc_pages_exact() automatically does 4757 * alloc_pages_exact() automatically does
4746 */ 4758 */
4747 if (get_order(size) < MAX_ORDER) 4759 if (get_order(size) < MAX_ORDER) {
4748 table = alloc_pages_exact(size, GFP_ATOMIC); 4760 table = alloc_pages_exact(size, GFP_ATOMIC);
4761 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
4762 }
4749 } 4763 }
4750 } while (!table && size > PAGE_SIZE && --log2qty); 4764 } while (!table && size > PAGE_SIZE && --log2qty);
4751 4765
@@ -4763,16 +4777,6 @@ void *__init alloc_large_system_hash(const char *tablename,
4763 if (_hash_mask) 4777 if (_hash_mask)
4764 *_hash_mask = (1 << log2qty) - 1; 4778 *_hash_mask = (1 << log2qty) - 1;
4765 4779
4766 /*
4767 * If hashdist is set, the table allocation is done with __vmalloc()
4768 * which invokes the kmemleak_alloc() callback. This function may also
4769 * be called before the slab and kmemleak are initialised when
4770 * kmemleak simply buffers the request to be executed later
4771 * (GFP_ATOMIC flag ignored in this case).
4772 */
4773 if (!hashdist)
4774 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
4775
4776 return table; 4780 return table;
4777} 4781}
4778 4782
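
Among the page allocator changes above, the order >= MAX_ORDER check now only warns when the caller did not pass __GFP_NOWARN, so opportunistic high-order probes can fail silently. A minimal sketch of such a probe; example_try_big() and the exact flag combination are illustrative, not from the patch:

        #include <linux/gfp.h>
        #include <linux/mm.h>

        /* Sketch only: try a large contiguous allocation, stay quiet on failure. */
        static struct page *example_try_big(unsigned int order)
        {
                return alloc_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, order);
        }
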
diff --git a/mm/percpu.c b/mm/percpu.c
index c0b2c1a76e81..b70f2acd8853 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -549,14 +549,14 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
549 * @chunk: chunk of interest 549 * @chunk: chunk of interest
550 * @page_start: page index of the first page to unmap 550 * @page_start: page index of the first page to unmap
551 * @page_end: page index of the last page to unmap + 1 551 * @page_end: page index of the last page to unmap + 1
552 * @flush: whether to flush cache and tlb or not 552 * @flush_tlb: whether to flush tlb or not
553 * 553 *
554 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. 554 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
555 * If @flush is true, vcache is flushed before unmapping and tlb 555 * If @flush is true, vcache is flushed before unmapping and tlb
556 * after. 556 * after.
557 */ 557 */
558static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, 558static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
559 bool flush) 559 bool flush_tlb)
560{ 560{
561 unsigned int last = num_possible_cpus() - 1; 561 unsigned int last = num_possible_cpus() - 1;
562 unsigned int cpu; 562 unsigned int cpu;
@@ -569,9 +569,8 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
569 * the whole region at once rather than doing it for each cpu. 569 * the whole region at once rather than doing it for each cpu.
570 * This could be an overkill but is more scalable. 570 * This could be an overkill but is more scalable.
571 */ 571 */
572 if (flush) 572 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
573 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), 573 pcpu_chunk_addr(chunk, last, page_end));
574 pcpu_chunk_addr(chunk, last, page_end));
575 574
576 for_each_possible_cpu(cpu) 575 for_each_possible_cpu(cpu)
577 unmap_kernel_range_noflush( 576 unmap_kernel_range_noflush(
@@ -579,7 +578,7 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
579 (page_end - page_start) << PAGE_SHIFT); 578 (page_end - page_start) << PAGE_SHIFT);
580 579
581 /* ditto as flush_cache_vunmap() */ 580 /* ditto as flush_cache_vunmap() */
582 if (flush) 581 if (flush_tlb)
583 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), 582 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
584 pcpu_chunk_addr(chunk, last, page_end)); 583 pcpu_chunk_addr(chunk, last, page_end));
585} 584}
@@ -1234,6 +1233,7 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
1234ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, 1233ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1235 ssize_t dyn_size, ssize_t unit_size) 1234 ssize_t dyn_size, ssize_t unit_size)
1236{ 1235{
1236 size_t chunk_size;
1237 unsigned int cpu; 1237 unsigned int cpu;
1238 1238
1239 /* determine parameters and allocate */ 1239 /* determine parameters and allocate */
@@ -1248,11 +1248,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
1248 } else 1248 } else
1249 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); 1249 pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
1250 1250
1251 pcpue_ptr = __alloc_bootmem_nopanic( 1251 chunk_size = pcpue_unit_size * num_possible_cpus();
1252 num_possible_cpus() * pcpue_unit_size, 1252
1253 PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 1253 pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
1254 if (!pcpue_ptr) 1254 __pa(MAX_DMA_ADDRESS));
1255 if (!pcpue_ptr) {
1256 pr_warning("PERCPU: failed to allocate %zu bytes for "
1257 "embedding\n", chunk_size);
1255 return -ENOMEM; 1258 return -ENOMEM;
1259 }
1256 1260
1257 /* return the leftover and copy */ 1261 /* return the leftover and copy */
1258 for_each_possible_cpu(cpu) { 1262 for_each_possible_cpu(cpu) {
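
The percpu hunk above computes the chunk size once and reports a failed boot-time allocation instead of returning -ENOMEM silently. A minimal sketch of the same report-on-failure shape around __alloc_bootmem_nopanic(); example_boot_alloc() is hypothetical:

        #include <linux/bootmem.h>
        #include <linux/init.h>
        #include <linux/kernel.h>
        #include <asm/dma.h>

        /* Sketch only: no-panic bootmem allocation that says what it wanted. */
        static void * __init example_boot_alloc(size_t size)
        {
                void *p = __alloc_bootmem_nopanic(size, PAGE_SIZE,
                                                  __pa(MAX_DMA_ADDRESS));

                if (!p)
                        pr_warning("example: failed to allocate %zu bytes\n", size);
                return p;
        }
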
diff --git a/mm/slab.c b/mm/slab.c
index e74a16e4ced6..7b5d4deacfcd 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1544,9 +1544,6 @@ void __init kmem_cache_init(void)
1544 } 1544 }
1545 1545
1546 g_cpucache_up = EARLY; 1546 g_cpucache_up = EARLY;
1547
1548 /* Annotate slab for lockdep -- annotate the malloc caches */
1549 init_lock_keys();
1550} 1547}
1551 1548
1552void __init kmem_cache_init_late(void) 1549void __init kmem_cache_init_late(void)
@@ -1563,6 +1560,9 @@ void __init kmem_cache_init_late(void)
1563 /* Done! */ 1560 /* Done! */
1564 g_cpucache_up = FULL; 1561 g_cpucache_up = FULL;
1565 1562
1563 /* Annotate slab for lockdep -- annotate the malloc caches */
1564 init_lock_keys();
1565
1566 /* 1566 /*
1567 * Register a cpu startup notifier callback that initializes 1567 * Register a cpu startup notifier callback that initializes
1568 * cpu_cache_get for all new cpus 1568 * cpu_cache_get for all new cpus
@@ -2547,7 +2547,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
2547 } 2547 }
2548 2548
2549 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2549 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2550 synchronize_rcu(); 2550 rcu_barrier();
2551 2551
2552 __kmem_cache_destroy(cachep); 2552 __kmem_cache_destroy(cachep);
2553 mutex_unlock(&cache_chain_mutex); 2553 mutex_unlock(&cache_chain_mutex);
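
The slab change above (and the matching slob/slub hunks below) switches the destroy path to rcu_barrier(), which waits for pending RCU callbacks to finish rather than merely for a grace period; that is what SLAB_DESTROY_BY_RCU caches need before their backing pages can go away. A minimal sketch of such a cache's lifecycle; the example_ names and the 128-byte object size are hypothetical:

        #include <linux/errno.h>
        #include <linux/slab.h>

        static struct kmem_cache *example_cache;        /* hypothetical cache */

        static int example_init(void)
        {
                example_cache = kmem_cache_create("example", 128, 0,
                                                  SLAB_DESTROY_BY_RCU, NULL);
                return example_cache ? 0 : -ENOMEM;
        }

        static void example_exit(void)
        {
                /* the rcu_barrier() now inside kmem_cache_destroy() makes this safe */
                kmem_cache_destroy(example_cache);
        }
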
diff --git a/mm/slob.c b/mm/slob.c
index c78742defdc6..9641da3d5e58 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -595,6 +595,8 @@ EXPORT_SYMBOL(kmem_cache_create);
595void kmem_cache_destroy(struct kmem_cache *c) 595void kmem_cache_destroy(struct kmem_cache *c)
596{ 596{
597 kmemleak_free(c); 597 kmemleak_free(c);
598 if (c->flags & SLAB_DESTROY_BY_RCU)
599 rcu_barrier();
598 slob_free(c, sizeof(struct kmem_cache)); 600 slob_free(c, sizeof(struct kmem_cache));
599} 601}
600EXPORT_SYMBOL(kmem_cache_destroy); 602EXPORT_SYMBOL(kmem_cache_destroy);
diff --git a/mm/slub.c b/mm/slub.c
index 819f056b39c6..b9f1491a58a1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -21,7 +21,6 @@
21#include <linux/kmemcheck.h> 21#include <linux/kmemcheck.h>
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/cpuset.h> 23#include <linux/cpuset.h>
24#include <linux/kmemleak.h>
25#include <linux/mempolicy.h> 24#include <linux/mempolicy.h>
26#include <linux/ctype.h> 25#include <linux/ctype.h>
27#include <linux/debugobjects.h> 26#include <linux/debugobjects.h>
@@ -2595,6 +2594,8 @@ static inline int kmem_cache_close(struct kmem_cache *s)
2595 */ 2594 */
2596void kmem_cache_destroy(struct kmem_cache *s) 2595void kmem_cache_destroy(struct kmem_cache *s)
2597{ 2596{
2597 if (s->flags & SLAB_DESTROY_BY_RCU)
2598 rcu_barrier();
2598 down_write(&slub_lock); 2599 down_write(&slub_lock);
2599 s->refcount--; 2600 s->refcount--;
2600 if (!s->refcount) { 2601 if (!s->refcount) {
@@ -2833,13 +2834,15 @@ EXPORT_SYMBOL(__kmalloc);
2833static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 2834static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2834{ 2835{
2835 struct page *page; 2836 struct page *page;
2837 void *ptr = NULL;
2836 2838
2837 flags |= __GFP_COMP | __GFP_NOTRACK; 2839 flags |= __GFP_COMP | __GFP_NOTRACK;
2838 page = alloc_pages_node(node, flags, get_order(size)); 2840 page = alloc_pages_node(node, flags, get_order(size));
2839 if (page) 2841 if (page)
2840 return page_address(page); 2842 ptr = page_address(page);
2841 else 2843
2842 return NULL; 2844 kmemleak_alloc(ptr, size, 1, flags);
2845 return ptr;
2843} 2846}
2844 2847
2845#ifdef CONFIG_NUMA 2848#ifdef CONFIG_NUMA
@@ -2924,6 +2927,7 @@ void kfree(const void *x)
2924 page = virt_to_head_page(x); 2927 page = virt_to_head_page(x);
2925 if (unlikely(!PageSlab(page))) { 2928 if (unlikely(!PageSlab(page))) {
2926 BUG_ON(!PageCompound(page)); 2929 BUG_ON(!PageCompound(page));
2930 kmemleak_free(x);
2927 put_page(page); 2931 put_page(page);
2928 return; 2932 return;
2929 } 2933 }
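
The slub hunks above teach kmemleak about page-backed kmallocs, which bypass the slab allocator: the allocation site calls kmemleak_alloc() and kfree() calls kmemleak_free() before the pages are dropped. A minimal sketch of the same pairing for a hypothetical allocator that goes straight to the page allocator; the example_ helpers are not from the patch:

        #include <linux/gfp.h>
        #include <linux/kmemleak.h>
        #include <linux/mm.h>

        /* Sketch only: an allocator outside slab must report to kmemleak itself. */
        static void *example_big_alloc(size_t size, gfp_t flags)
        {
                struct page *page = alloc_pages(flags, get_order(size));
                void *ptr = page ? page_address(page) : NULL;

                kmemleak_alloc(ptr, size, 1, flags);    /* kmemleak tolerates NULL */
                return ptr;
        }

        static void example_big_free(void *ptr, size_t size)
        {
                kmemleak_free(ptr);                     /* tell kmemleak first */
                free_pages((unsigned long)ptr, get_order(size));
        }
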
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d1ade1a48ee7..8ffdc0d23c53 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -753,7 +753,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
753 753
754 if (!bdev) { 754 if (!bdev) {
755 if (bdev_p) 755 if (bdev_p)
756 *bdev_p = bdget(sis->bdev->bd_dev); 756 *bdev_p = bdgrab(sis->bdev);
757 757
758 spin_unlock(&swap_lock); 758 spin_unlock(&swap_lock);
759 return i; 759 return i;
@@ -765,7 +765,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
765 struct swap_extent, list); 765 struct swap_extent, list);
766 if (se->start_block == offset) { 766 if (se->start_block == offset) {
767 if (bdev_p) 767 if (bdev_p)
768 *bdev_p = bdget(sis->bdev->bd_dev); 768 *bdev_p = bdgrab(sis->bdev);
769 769
770 spin_unlock(&swap_lock); 770 spin_unlock(&swap_lock);
771 bdput(bdev); 771 bdput(bdev);
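
swap_type_of() above switches from bdget() to bdgrab(), which appears to pin the already-open device by taking a reference on its inode without the lookup work bdget() can do, keeping the call safe under the spinlock held there. A minimal sketch of the call; example_pin_bdev() is hypothetical:

        #include <linux/fs.h>

        /* Sketch only: pin a block device that is already open. */
        static struct block_device *example_pin_bdev(struct block_device *bdev)
        {
                return bdgrab(bdev);    /* caller drops the reference with bdput() */
        }
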
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 54155268dfca..dea7abd31098 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1104,7 +1104,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1104 */ 1104 */
1105 if (nr_freed < nr_taken && !current_is_kswapd() && 1105 if (nr_freed < nr_taken && !current_is_kswapd() &&
1106 lumpy_reclaim) { 1106 lumpy_reclaim) {
1107 congestion_wait(WRITE, HZ/10); 1107 congestion_wait(BLK_RW_ASYNC, HZ/10);
1108 1108
1109 /* 1109 /*
1110 * The attempt at page out may have made some 1110 * The attempt at page out may have made some
@@ -1721,7 +1721,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1721 1721
1722 /* Take a nap, wait for some writeback to complete */ 1722 /* Take a nap, wait for some writeback to complete */
1723 if (sc->nr_scanned && priority < DEF_PRIORITY - 2) 1723 if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
1724 congestion_wait(WRITE, HZ/10); 1724 congestion_wait(BLK_RW_ASYNC, HZ/10);
1725 } 1725 }
1726 /* top priority shrink_zones still had more to do? don't OOM, then */ 1726 /* top priority shrink_zones still had more to do? don't OOM, then */
1727 if (!sc->all_unreclaimable && scanning_global_lru(sc)) 1727 if (!sc->all_unreclaimable && scanning_global_lru(sc))
@@ -1960,7 +1960,7 @@ loop_again:
1960 * another pass across the zones. 1960 * another pass across the zones.
1961 */ 1961 */
1962 if (total_scanned && priority < DEF_PRIORITY - 2) 1962 if (total_scanned && priority < DEF_PRIORITY - 2)
1963 congestion_wait(WRITE, HZ/10); 1963 congestion_wait(BLK_RW_ASYNC, HZ/10);
1964 1964
1965 /* 1965 /*
1966 * We do this so kswapd doesn't build up large priorities for 1966 * We do this so kswapd doesn't build up large priorities for
@@ -2233,7 +2233,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2233 goto out; 2233 goto out;
2234 2234
2235 if (sc.nr_scanned && prio < DEF_PRIORITY - 2) 2235 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
2236 congestion_wait(WRITE, HZ / 10); 2236 congestion_wait(BLK_RW_ASYNC, HZ / 10);
2237 } 2237 }
2238 } 2238 }
2239 2239