17 files changed, 322 insertions, 164 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 493b468a5035..c86edd244294 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -283,7 +283,6 @@ static wait_queue_head_t congestion_wqh[2] = {
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
        };
 void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
 {
        enum bdi_state bit;
@@ -308,18 +307,18 @@ EXPORT_SYMBOL(set_bdi_congested);
 /**
 * congestion_wait - wait for a backing_dev to become uncongested
- * @rw: READ or WRITE
+ * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
 * write congestion.  If no backing_devs are congested then just wait for the
 * next write to be completed.
 */
-long congestion_wait(int rw, long timeout)
+long congestion_wait(int sync, long timeout)
 {
        long ret;
        DEFINE_WAIT(wait);
-        wait_queue_head_t *wqh = &congestion_wqh[rw];
+        wait_queue_head_t *wqh = &congestion_wqh[sync];
        prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
        ret = io_schedule_timeout(timeout);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d2a9ce952768..701740c9e81b 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -12,6 +12,7 @@
 #include <linux/pfn.h>
 #include <linux/bootmem.h>
 #include <linux/module.h>
+#include <linux/kmemleak.h>
 #include <asm/bug.h>
 #include <asm/io.h>
@@ -335,6 +336,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 {
        unsigned long start, end;
+        kmemleak_free_part(__va(physaddr), size);
        start = PFN_UP(physaddr);
        end = PFN_DOWN(physaddr + size);
@@ -354,6 +357,8 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
 {
        unsigned long start, end;
+        kmemleak_free_part(__va(addr), size);
        start = PFN_UP(addr);
        end = PFN_DOWN(addr + size);
@@ -516,6 +521,7 @@ find_block:
                region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
                                start_off);
                memset(region, 0, size);
+                kmemleak_alloc(region, size, 1, 0);
                return region;
        }
diff --git a/mm/filemap.c b/mm/filemap.c
index 22396713feb9..ccea3b665c12 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2272,6 +2272,7 @@ again:
                pagefault_enable();
                flush_dcache_page(page);
+                mark_page_accessed(page);
                status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                                page, fsdata);
                if (unlikely(status < 0))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d0351e31f474..cafdcee154e8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2370,7 +2370,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
        long chg = region_truncate(&inode->i_mapping->private_list, offset);
        spin_lock(&inode->i_lock);
-        inode->i_blocks -= blocks_per_huge_page(h);
+        inode->i_blocks -= (blocks_per_huge_page(h) * freed);
        spin_unlock(&inode->i_lock);
        hugetlb_put_quota(inode->i_mapping, (chg - freed));
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index e766e1da09d2..487267310a84 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -103,10 +103,10 @@
 * Kmemleak configuration and common defines.
 */
 #define MAX_TRACE               16      /* stack trace length */
-#define REPORTS_NR              50      /* maximum number of reported leaks */
 #define MSECS_MIN_AGE           5000    /* minimum object age for reporting */
 #define SECS_FIRST_SCAN         60      /* delay before the first scan */
 #define SECS_SCAN_WAIT          600     /* subsequent auto scanning delay */
+#define GRAY_LIST_PASSES        25      /* maximum number of gray list scans */
 #define BYTES_PER_POINTER       sizeof(void *)
@@ -158,6 +158,8 @@ struct kmemleak_object {
 #define OBJECT_REPORTED         (1 << 1)
 /* flag set to not scan the object */
 #define OBJECT_NO_SCAN          (1 << 2)
+/* flag set on newly allocated objects */
+#define OBJECT_NEW              (1 << 3)
 /* the list of all allocated objects */
 static LIST_HEAD(object_list);
@@ -196,9 +198,6 @@ static int kmemleak_stack_scan = 1;
 /* protects the memory scanning, parameters and debug/kmemleak file access */
 static DEFINE_MUTEX(scan_mutex);
-/* number of leaks reported (for limitation purposes) */
-static int reported_leaks;
 /*
 * Early object allocation/freeing logging. Kmemleak is initialized after the
 * kernel allocator. However, both the kernel allocator and kmemleak may
@@ -211,6 +210,7 @@ static int reported_leaks;
 enum {
        KMEMLEAK_ALLOC,
        KMEMLEAK_FREE,
+        KMEMLEAK_FREE_PART,
        KMEMLEAK_NOT_LEAK,
        KMEMLEAK_IGNORE,
        KMEMLEAK_SCAN_AREA,
@@ -274,6 +274,11 @@ static int color_gray(const struct kmemleak_object *object)
        return object->min_count != -1 && object->count >= object->min_count;
 }
+static int color_black(const struct kmemleak_object *object)
+{
+        return object->min_count == -1; /* -1 is the min_count value color_gray() rejects */
+}
 /*
 * Objects are considered unreferenced only if their color is white, they have
 * not be deleted and have a minimum age to avoid false positives caused by
@@ -451,7 +456,7 @@ static void create_object(unsigned long ptr, size_t size, int min_count,
        INIT_HLIST_HEAD(&object->area_list);
        spin_lock_init(&object->lock);
        atomic_set(&object->use_count, 1);
-        object->flags = OBJECT_ALLOCATED;
+        object->flags = OBJECT_ALLOCATED | OBJECT_NEW;
        object->pointer = ptr;
        object->size = size;
        object->min_count = min_count;
@@ -519,27 +524,17 @@ out:
 * Remove the metadata (struct kmemleak_object) for a memory block from the
 * object_list and object_tree_root and decrement its use_count.
 */
-static void delete_object(unsigned long ptr)
+static void __delete_object(struct kmemleak_object *object)
 {
        unsigned long flags;
-        struct kmemleak_object *object;
        write_lock_irqsave(&kmemleak_lock, flags);
-        object = lookup_object(ptr, 0);
-        if (!object) {
-#ifdef DEBUG
-                kmemleak_warn("Freeing unknown object at 0x%08lx\n",
-                              ptr);
-#endif
-                write_unlock_irqrestore(&kmemleak_lock, flags);
-                return;
-        }
        prio_tree_remove(&object_tree_root, &object->tree_node);
        list_del_rcu(&object->object_list);
        write_unlock_irqrestore(&kmemleak_lock, flags);
        WARN_ON(!(object->flags & OBJECT_ALLOCATED));
-        WARN_ON(atomic_read(&object->use_count) < 1);
+        WARN_ON(atomic_read(&object->use_count) < 2);
        /*
         * Locking here also ensures that the corresponding memory block
@@ -552,6 +547,64 @@ static void delete_object(unsigned long ptr)
 }
 /*
+ * Look up the metadata (struct kmemleak_object) corresponding to ptr and
+ * delete it.
+ *
+ * The object returned by find_and_get_object() is handed to
+ * __delete_object() and then released with put_object(). If no object is
+ * found for ptr the function returns without deleting anything; a warning is
+ * emitted only when DEBUG is defined.
+ */
+static void delete_object_full(unsigned long ptr)
+{
+        struct kmemleak_object *object;
+        object = find_and_get_object(ptr, 0);
+        if (!object) {
+#ifdef DEBUG
+                kmemleak_warn("Freeing unknown object at 0x%08lx\n",
+                              ptr);
+#endif
+                return;
+        }
+        __delete_object(object);
+        put_object(object);
+}
+/*
+ * Look up the metadata (struct kmemleak_object) corresponding to ptr and
+ * delete it. If the memory block is partially freed, the function may create
+ * additional metadata for the remaining parts of the block.
+ *
+ * ptr and size describe the range being freed. The object found for ptr is
+ * deleted as a whole; the part of it that lies before ptr (if any) and the
+ * part that lies after ptr + size (if any) are each registered again with
+ * create_object(), inheriting the min_count of the deleted object. If no
+ * object is found for ptr the function returns without changing anything; a
+ * warning is emitted only when DEBUG is defined.
+ */
+static void delete_object_part(unsigned long ptr, size_t size)
+{
+        struct kmemleak_object *object;
+        unsigned long start, end;
+        object = find_and_get_object(ptr, 1);
+        if (!object) {
+#ifdef DEBUG
+                kmemleak_warn("Partially freeing unknown object at 0x%08lx "
+                              "(size %zu)\n", ptr, size);
+#endif
+                return;
+        }
+        __delete_object(object);
+        /*
+         * Create one or two objects that may result from the memory block
+         * split. Note that partial freeing is only done by free_bootmem() and
+         * this happens before kmemleak_init() is called. The path below is
+         * only executed during early log recording in kmemleak_init(), so
+         * GFP_KERNEL is enough.
+         */
+        start = object->pointer;
+        end = object->pointer + object->size;
+        if (ptr > start)
+                create_object(start, ptr - start, object->min_count,
+                              GFP_KERNEL);
+        if (ptr + size < end)
+                create_object(ptr + size, end - ptr - size, object->min_count,
+                              GFP_KERNEL);
+        put_object(object);
+}
+/*
 * Make a object permanently as gray-colored so that it can no longer be
 * reported as a leak. This is used in general to mark a false positive.
 */
@@ -715,13 +768,28 @@ void kmemleak_free(const void *ptr)
        pr_debug("%s(0x%p)\n", __func__, ptr);
        if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
-                delete_object((unsigned long)ptr);
+                delete_object_full((unsigned long)ptr);
        else if (atomic_read(&kmemleak_early_log))
                log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0);
 }
 EXPORT_SYMBOL_GPL(kmemleak_free);
 /*
+ * Partial memory freeing function callback. This function is usually called
+ * from the bootmem allocator when (part of) a memory block is freed.
+ *
+ * When kmemleak is enabled and ptr is neither NULL nor an error pointer, the
+ * range of size bytes starting at ptr is passed to delete_object_part().
+ * Otherwise, if early logging is still active, the request is recorded with
+ * log_early() as KMEMLEAK_FREE_PART so that kmemleak_init() can replay it.
+ */
+void kmemleak_free_part(const void *ptr, size_t size)
+{
+        pr_debug("%s(0x%p)\n", __func__, ptr);
+        if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+                delete_object_part((unsigned long)ptr, size);
+        else if (atomic_read(&kmemleak_early_log))
+                log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0);
+}
+EXPORT_SYMBOL_GPL(kmemleak_free_part);
+/*
 * Mark an already allocated memory block as a false positive. This will cause
 * the block to no longer be reported as leak and always be scanned.
 */
@@ -807,7 +875,7 @@ static int scan_should_stop(void)
 * found to the gray list.
 */
 static void scan_block(void *_start, void *_end,
-                       struct kmemleak_object *scanned)
+                       struct kmemleak_object *scanned, int allow_resched)
 {
        unsigned long *ptr;
        unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
@@ -818,6 +886,8 @@ static void scan_block(void *_start, void *_end,
                unsigned long pointer = *ptr;
                struct kmemleak_object *object;
+                if (allow_resched)
+                        cond_resched();
                if (scan_should_stop())
                        break;
@@ -881,12 +951,12 @@ static void scan_object(struct kmemleak_object *object)
                goto out;
        if (hlist_empty(&object->area_list))
                scan_block((void *)object->pointer,
-                           (void *)(object->pointer + object->size), object);
+                           (void *)(object->pointer + object->size), object, 0);
        else
                hlist_for_each_entry(area, elem, &object->area_list, node)
                        scan_block((void *)(object->pointer + area->offset),
                                   (void *)(object->pointer + area->offset
-                                            + area->length), object);
+                                            + area->length), object, 0);
 out:
        spin_unlock_irqrestore(&object->lock, flags);
 }
@@ -903,6 +973,7 @@ static void kmemleak_scan(void)
        struct task_struct *task;
        int i;
        int new_leaks = 0;
+        int gray_list_pass = 0;
        jiffies_last_scan = jiffies;
@@ -923,6 +994,7 @@ static void kmemleak_scan(void)
 #endif
                /* reset the reference count (whiten the object) */
                object->count = 0;
+                object->flags &= ~OBJECT_NEW;
                if (color_gray(object) && get_object(object))
                        list_add_tail(&object->gray_list, &gray_list);
@@ -931,14 +1003,14 @@ static void kmemleak_scan(void)
        rcu_read_unlock();
        /* data/bss scanning */
-        scan_block(_sdata, _edata, NULL);
+        scan_block(_sdata, _edata, NULL, 1);
-        scan_block(__bss_start, __bss_stop, NULL);
+        scan_block(__bss_start, __bss_stop, NULL, 1);
 #ifdef CONFIG_SMP
        /* per-cpu sections scanning */
        for_each_possible_cpu(i)
                scan_block(__per_cpu_start + per_cpu_offset(i),
-                           __per_cpu_end + per_cpu_offset(i), NULL);
+                           __per_cpu_end + per_cpu_offset(i), NULL, 1);
 #endif
        /*
@@ -960,7 +1032,7 @@ static void kmemleak_scan(void)
                        /* only scan if page is in use */
                        if (page_count(page) == 0)
                                continue;
-                        scan_block(page, page + 1, NULL);
+                        scan_block(page, page + 1, NULL, 1);
                }
        }
@@ -972,7 +1044,8 @@ static void kmemleak_scan(void)
                read_lock(&tasklist_lock);
                for_each_process(task)
                        scan_block(task_stack_page(task),
-                                   task_stack_page(task) + THREAD_SIZE, NULL);
+                                   task_stack_page(task) + THREAD_SIZE,
+                                   NULL, 0);
                read_unlock(&tasklist_lock);
        }
@@ -984,6 +1057,7 @@ static void kmemleak_scan(void)
         * kmemleak objects cannot be freed from outside the loop because their
         * use_count was increased.
         */
+repeat:
        object = list_entry(gray_list.next, typeof(*object), gray_list);
        while (&object->gray_list != &gray_list) {
                cond_resched();
@@ -1001,12 +1075,38 @@ static void kmemleak_scan(void)
                object = tmp;
        }
+        if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES)
+                goto scan_end;
+        /*
+         * Check for new objects allocated during this scanning and add them
+         * to the gray list.
+         */
+        rcu_read_lock();
+        list_for_each_entry_rcu(object, &object_list, object_list) {
+                spin_lock_irqsave(&object->lock, flags);
+                if ((object->flags & OBJECT_NEW) && !color_black(object) &&
+                    get_object(object)) {
+                        object->flags &= ~OBJECT_NEW;
+                        list_add_tail(&object->gray_list, &gray_list);
+                }
+                spin_unlock_irqrestore(&object->lock, flags);
+        }
+        rcu_read_unlock();
+        if (!list_empty(&gray_list))
+                goto repeat;
+scan_end:
        WARN_ON(!list_empty(&gray_list));
        /*
-         * If scanning was stopped do not report any new unreferenced objects.
+         * If scanning was stopped or new objects were being allocated at a
+         * higher rate than gray list scanning, do not report any new
+         * unreferenced objects.
         */
-        if (scan_should_stop())
+        if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES)
                return;
        /*
@@ -1039,6 +1139,7 @@ static int kmemleak_scan_thread(void *arg)
        static int first_run = 1;
        pr_info("Automatic memory scanning thread started\n");
+        set_user_nice(current, 10);
        /*
         * Wait before the first scan to allow the system to fully initialize.
@@ -1101,11 +1202,11 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
 {
        struct kmemleak_object *object;
        loff_t n = *pos;
+        int err;
-        if (!n)
+        err = mutex_lock_interruptible(&scan_mutex);
-                reported_leaks = 0;
+        if (err < 0)
-        if (reported_leaks >= REPORTS_NR)
+                return ERR_PTR(err);
-                return NULL;
        rcu_read_lock();
        list_for_each_entry_rcu(object, &object_list, object_list) {
@@ -1116,7 +1217,6 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
        }
        object = NULL;
 out:
-        rcu_read_unlock();
        return object;
 }
@@ -1131,17 +1231,13 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
        struct list_head *n = &prev_obj->object_list;
        ++(*pos);
-        if (reported_leaks >= REPORTS_NR)
-                goto out;
-        rcu_read_lock();
        list_for_each_continue_rcu(n, &object_list) {
                next_obj = list_entry(n, struct kmemleak_object, object_list);
                if (get_object(next_obj))
                        break;
        }
-        rcu_read_unlock();
-out:
        put_object(prev_obj);
        return next_obj;
 }
@@ -1151,8 +1247,16 @@ out:
 */
 static void kmemleak_seq_stop(struct seq_file *seq, void *v)
 {
-        if (v)
+        if (!IS_ERR(v)) {
-                put_object(v);
+                /*
+                 * kmemleak_seq_start may return ERR_PTR if the scan_mutex
+                 * waiting was interrupted, so only release it if !IS_ERR.
+                 */
+                rcu_read_unlock();
+                mutex_unlock(&scan_mutex);
+                if (v)
+                        put_object(v);
+        }
 }
 /*
@@ -1164,10 +1268,8 @@ static int kmemleak_seq_show(struct seq_file *seq, void *v)
        unsigned long flags;
        spin_lock_irqsave(&object->lock, flags);
-        if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object)) {
+        if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object))
                print_unreferenced(seq, object);
-                reported_leaks++;
-        }
        spin_unlock_irqrestore(&object->lock, flags);
        return 0;
 }
@@ -1181,36 +1283,15 @@ static const struct seq_operations kmemleak_seq_ops = {
 static int kmemleak_open(struct inode *inode, struct file *file)
 {
-        int ret = 0;
        if (!atomic_read(&kmemleak_enabled))
                return -EBUSY;
-        ret = mutex_lock_interruptible(&scan_mutex);
+        return seq_open(file, &kmemleak_seq_ops);
-        if (ret < 0)
-                goto out;
-        if (file->f_mode & FMODE_READ) {
-                ret = seq_open(file, &kmemleak_seq_ops);
-                if (ret < 0)
-                        goto scan_unlock;
-        }
-        return ret;
-scan_unlock:
-        mutex_unlock(&scan_mutex);
-out:
-        return ret;
 }
 static int kmemleak_release(struct inode *inode, struct file *file)
 {
-        int ret = 0;
+        return seq_release(inode, file);
-        if (file->f_mode & FMODE_READ)
-                seq_release(inode, file);
-        mutex_unlock(&scan_mutex);
-        return ret;
 }
 /*
@@ -1230,15 +1311,17 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
 {
        char buf[64];
        int buf_size;
+        int ret;
-        if (!atomic_read(&kmemleak_enabled))
-                return -EBUSY;
        buf_size = min(size, (sizeof(buf) - 1));
        if (strncpy_from_user(buf, user_buf, buf_size) < 0)
                return -EFAULT;
        buf[buf_size] = 0;
+        ret = mutex_lock_interruptible(&scan_mutex);
+        if (ret < 0)
+                return ret;
        if (strncmp(buf, "off", 3) == 0)
                kmemleak_disable();
        else if (strncmp(buf, "stack=on", 8) == 0)
@@ -1251,11 +1334,10 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
                stop_scan_thread();
        else if (strncmp(buf, "scan=", 5) == 0) {
                unsigned long secs;
-                int err;
-                err = strict_strtoul(buf + 5, 0, &secs);
+                ret = strict_strtoul(buf + 5, 0, &secs);
-                if (err < 0)
+                if (ret < 0)
-                        return err;
+                        goto out;
                stop_scan_thread();
                if (secs) {
                        jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
@@ -1264,7 +1346,12 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
        } else if (strncmp(buf, "scan", 4) == 0)
                kmemleak_scan();
        else
-                return -EINVAL;
+                ret = -EINVAL;
+out:
+        mutex_unlock(&scan_mutex);
+        if (ret < 0)
+                return ret;
        /* ignore the rest of the buffer, only one command at a time */
        *ppos += size;
@@ -1293,7 +1380,7 @@ static int kmemleak_cleanup_thread(void *arg)
        rcu_read_lock();
        list_for_each_entry_rcu(object, &object_list, object_list)
-                delete_object(object->pointer);
+                delete_object_full(object->pointer);
        rcu_read_unlock();
        mutex_unlock(&scan_mutex);
@@ -1388,6 +1475,9 @@ void __init kmemleak_init(void)
                case KMEMLEAK_FREE:
                        kmemleak_free(log->ptr);
                        break;
+                case KMEMLEAK_FREE_PART:
+                        kmemleak_free_part(log->ptr, log->size);
+                        break;
                case KMEMLEAK_NOT_LEAK:
                        kmemleak_not_leak(log->ptr);
                        break;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e2fa20dadf40..fd4529d86de5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1207,6 +1207,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
        ret = 0;
 out:
        unlock_page_cgroup(pc);
+        /*
+         * We charge against "to", which may not have any tasks, so "to"
+         * can be under rmdir(). But in the current implementation, the only
+         * caller of this function is force_empty() and it is guaranteed that
+         * "to" is never removed. So, we don't check rmdir status here.
+         */
        return ret;
 }
@@ -1428,6 +1434,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
                return;
        if (!ptr)
                return;
+        cgroup_exclude_rmdir(&ptr->css);
        pc = lookup_page_cgroup(page);
        mem_cgroup_lru_del_before_commit_swapcache(page);
        __mem_cgroup_commit_charge(ptr, pc, ctype);
@@ -1457,8 +1464,12 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
                }
                rcu_read_unlock();
        }
-        /* add this page(page_cgroup) to the LRU we want. */
+        /*
+         * At swapin, we may charge against a cgroup which has no tasks.
+         * So, rmdir()->pre_destroy() can be called while we do this charge.
+         * In that case, we need to call pre_destroy() again. Check it here.
+         */
+        cgroup_release_and_wakeup_rmdir(&ptr->css);
 }
 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
@@ -1664,7 +1675,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
        if (!mem)
                return;
+        cgroup_exclude_rmdir(&mem->css);
        /* at migration success, oldpage->mapping is NULL. */
        if (oldpage->mapping) {
                target = oldpage;
@@ -1704,6 +1715,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
         */
        if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
                mem_cgroup_uncharge_page(target);
+        /*
+         * At migration, we may charge against a cgroup which has no tasks.
+         * So, rmdir()->pre_destroy() can be called while we do this charge.
+         * In that case, we need to call pre_destroy() again. Check it here.
+         */
+        cgroup_release_and_wakeup_rmdir(&mem->css);
 }
 /*
@@ -1973,7 +1990,7 @@ try_to_free:
                if (!progress) {
                        nr_retries--;
                        /* maybe some writeback is necessary */
-                        congestion_wait(WRITE, HZ/10);
+                        congestion_wait(BLK_RW_ASYNC, HZ/10);
                }
        }
diff --git a/mm/memory.c b/mm/memory.c
index 65216194eb8d..aede2ce3aba4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -135,11 +135,12 @@ void pmd_clear_bad(pmd_t *pmd)
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
-static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
+static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+                           unsigned long addr)
 {
        pgtable_t token = pmd_pgtable(*pmd);
        pmd_clear(pmd);
-        pte_free_tlb(tlb, token);
+        pte_free_tlb(tlb, token, addr);
        tlb->mm->nr_ptes--;
 }
@@ -157,7 +158,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
-                free_pte_range(tlb, pmd);
+                free_pte_range(tlb, pmd, addr);
        } while (pmd++, addr = next, addr != end);
        start &= PUD_MASK;
@@ -173,7 +174,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
        pmd = pmd_offset(pud, start);
        pud_clear(pud);
-        pmd_free_tlb(tlb, pmd);
+        pmd_free_tlb(tlb, pmd, start);
 }
 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -206,7 +207,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
        pud = pud_offset(pgd, start);
        pgd_clear(pgd);
-        pud_free_tlb(tlb, pud);
+        pud_free_tlb(tlb, pud, start);
 }
 /*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e08e2c4da63a..7dd9d9f80694 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -191,25 +191,27 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 */
-static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
+static int mpol_set_nodemask(struct mempolicy *pol,
+                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 {
-        nodemask_t cpuset_context_nmask;
        int ret;
        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
        if (pol == NULL)
                return 0;
+        /* Check N_HIGH_MEMORY */
+        nodes_and(nsc->mask1,
+                  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
        VM_BUG_ON(!nodes);
        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
                nodes = NULL;   /* explicit local allocation */
        else {
                if (pol->flags & MPOL_F_RELATIVE_NODES)
-                        mpol_relative_nodemask(&cpuset_context_nmask, nodes,
+                        mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
-                                               &cpuset_current_mems_allowed);
                else
-                        nodes_and(cpuset_context_nmask, *nodes,
+                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
-                                  cpuset_current_mems_allowed);
                if (mpol_store_user_nodemask(pol))
                        pol->w.user_nodemask = *nodes;
                else
@@ -217,8 +219,10 @@ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
                                                cpuset_current_mems_allowed;
        }
-        ret = mpol_ops[pol->mode].create(pol,
+        if (nodes)
-                                nodes ? &cpuset_context_nmask : NULL);
+                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
+        else
+                ret = mpol_ops[pol->mode].create(pol, NULL);
        return ret;
 }
@@ -620,12 +624,17 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 {
        struct mempolicy *new, *old;
        struct mm_struct *mm = current->mm;
+        NODEMASK_SCRATCH(scratch);
        int ret;
-        new = mpol_new(mode, flags, nodes);
+        if (!scratch)
-        if (IS_ERR(new))
+                return -ENOMEM;
-                return PTR_ERR(new);
+        new = mpol_new(mode, flags, nodes);
+        if (IS_ERR(new)) {
+                ret = PTR_ERR(new);
+                goto out;
+        }
        /*
         * prevent changing our mempolicy while show_numa_maps()
         * is using it.
@@ -635,13 +644,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
        if (mm)
                down_write(&mm->mmap_sem);
        task_lock(current);
-        ret = mpol_set_nodemask(new, nodes);
+        ret = mpol_set_nodemask(new, nodes, scratch);
        if (ret) {
                task_unlock(current);
                if (mm)
                        up_write(&mm->mmap_sem);
                mpol_put(new);
-                return ret;
+                goto out;
        }
        old = current->mempolicy;
        current->mempolicy = new;
@@ -654,7 +663,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
                up_write(&mm->mmap_sem);
        mpol_put(old);
-        return 0;
+        ret = 0;
+out:
+        NODEMASK_SCRATCH_FREE(scratch);
+        return ret;
 }
 /*
@@ -1014,12 +1026,20 @@ static long do_mbind(unsigned long start, unsigned long len,
                if (err)
                        return err;
        }
-        down_write(&mm->mmap_sem);
+        {
-        task_lock(current);
+                NODEMASK_SCRATCH(scratch);
-        err = mpol_set_nodemask(new, nmask);
+                if (scratch) {
-        task_unlock(current);
+                        down_write(&mm->mmap_sem);
+                        task_lock(current);
+                        err = mpol_set_nodemask(new, nmask, scratch);
+                        task_unlock(current);
+                        if (err)
+                                up_write(&mm->mmap_sem);
+                } else
+                        err = -ENOMEM;
+                NODEMASK_SCRATCH_FREE(scratch);
+        }
        if (err) {
-                up_write(&mm->mmap_sem);
                mpol_put(new);
                return err;
        }
@@ -1891,6 +1911,7 @@ restart:
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
+ * This is called from get_inode() paths, so GFP_KERNEL allocations can be used.
 */
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 {
@@ -1902,19 +1923,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
        if (mpol) {
                struct vm_area_struct pvma;
                struct mempolicy *new;
+                NODEMASK_SCRATCH(scratch);
+                if (!scratch)
+                        return;
                /* contextualize the tmpfs mount point mempolicy */
                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
                if (IS_ERR(new)) {
                        mpol_put(mpol); /* drop our ref on sb mpol */
+                        NODEMASK_SCRATCH_FREE(scratch);
                        return;         /* no valid nodemask intersection */
                }
                task_lock(current);
-                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask);
+                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
                task_unlock(current);
                mpol_put(mpol); /* drop our ref on sb mpol */
                if (ret) {
+                        NODEMASK_SCRATCH_FREE(scratch);
                        mpol_put(new);
                        return;
                }
@@ -1924,6 +1950,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
                mpol_put(new);                  /* drop initial ref */
+                NODEMASK_SCRATCH_FREE(scratch);
        }
 }
@@ -2140,13 +2167,18 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
                err = 1;
        else {
                int ret;
+                NODEMASK_SCRATCH(scratch);
-                task_lock(current);
+                if (scratch) {
-                ret = mpol_set_nodemask(new, &nodes);
+                        task_lock(current);
-                task_unlock(current);
+                        ret = mpol_set_nodemask(new, &nodes, scratch);
-                if (ret)
+                        task_unlock(current);
+                } else
+                        ret = -ENOMEM;
+                NODEMASK_SCRATCH_FREE(scratch);
+                if (ret) {
                        err = 1;
-                else if (no_context) {
+                        mpol_put(new);
+                } else if (no_context) {
                        /* save for contextualization */
                        new->w.user_nodemask = nodes;
                }
diff --git a/mm/mempool.c b/mm/mempool.c
index a46eb1b4bb66..32e75d400503 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -303,14 +303,14 @@ EXPORT_SYMBOL(mempool_free_slab);
 */
 void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
 {
-        size_t size = (size_t)(long)pool_data;
+        size_t size = (size_t)pool_data;
        return kmalloc(size, gfp_mask);
 }
 EXPORT_SYMBOL(mempool_kmalloc);
 void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
 {
-        size_t size = (size_t) pool_data;
+        size_t size = (size_t)pool_data;
        return kzalloc(size, gfp_mask);
 }
 EXPORT_SYMBOL(mempool_kzalloc);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3c7f5e1afe5f..997186c0b519 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -575,7 +575,7 @@ static void balance_dirty_pages(struct address_space *mapping)
                if (pages_written >= write_chunk)
                        break;          /* We've done our duty */
-                congestion_wait(WRITE, HZ/10);
+                congestion_wait(BLK_RW_ASYNC, HZ/10);
        }
        if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
@@ -670,7 +670,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
                if (global_page_state(NR_UNSTABLE_NFS) +
                        global_page_state(NR_WRITEBACK) <= dirty_thresh)
                                break;
-                congestion_wait(WRITE, HZ/10);
+                congestion_wait(BLK_RW_ASYNC, HZ/10);
                /*
                 * The caller might hold locks which can prevent IO completion
@@ -716,7 +716,7 @@ static void background_writeout(unsigned long _min_pages)
                if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
                        /* Wrote less than expected */
                        if (wbc.encountered_congestion || wbc.more_io)
-                                congestion_wait(WRITE, HZ/10);
+                                congestion_wait(BLK_RW_ASYNC, HZ/10);
                        else
                                break;
                }
@@ -788,7 +788,7 @@ static void wb_kupdate(unsigned long arg)
                writeback_inodes(&wbc);
                if (wbc.nr_to_write > 0) {
                        if (wbc.encountered_congestion || wbc.more_io)
-                                congestion_wait(WRITE, HZ/10);
+                                congestion_wait(BLK_RW_ASYNC, HZ/10);
                        else
                                break;  /* All the old data is written */
                }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e0f2cdf9d8b1..d052abbe3063 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -882,7 +882,7 @@ retry_reserve:
 */
 static int rmqueue_bulk(struct zone *zone, unsigned int order, 
                        unsigned long count, struct list_head *list,
-                        int migratetype)
+                        int migratetype, int cold)
 {
        int i;
        
@@ -901,7 +901,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                 * merge IO requests if the physical pages are ordered
                 * properly.
                 */
-                list_add(&page->lru, list);
+                if (likely(cold == 0))
+                        list_add(&page->lru, list);
+                else
+                        list_add_tail(&page->lru, list);
                set_page_private(page, migratetype);
                list = &page->lru;
        }
@@ -1119,7 +1122,8 @@ again:
                local_irq_save(flags);
                if (!pcp->count) {
                        pcp->count = rmqueue_bulk(zone, 0,
-                                        pcp->batch, &pcp->list, migratetype);
+                                        pcp->batch, &pcp->list,
+                                        migratetype, cold);
                        if (unlikely(!pcp->count))
                                goto failed;
                }
@@ -1138,7 +1142,8 @@ again:
                /* Allocate more to the pcp list if necessary */
                if (unlikely(&page->lru == &pcp->list)) {
                        pcp->count += rmqueue_bulk(zone, 0,
-                                        pcp->batch, &pcp->list, migratetype);
+                                        pcp->batch, &pcp->list,
+                                        migratetype, cold);
                        page = list_entry(pcp->list.next, struct page, lru);
                }
@@ -1666,7 +1671,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
                        preferred_zone, migratetype);
                if (!page && gfp_mask & __GFP_NOFAIL)
-                        congestion_wait(WRITE, HZ/50);
+                        congestion_wait(BLK_RW_ASYNC, HZ/50);
        } while (!page && (gfp_mask & __GFP_NOFAIL));
        return page;
@@ -1740,8 +1745,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         * be using allocators in order of preference for an area that is
         * too large.
         */
-        if (WARN_ON_ONCE(order >= MAX_ORDER))
+        if (order >= MAX_ORDER) {
+                WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
                return NULL;
+        }
        /*
         * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1789,6 +1796,10 @@ rebalance:
        if (p->flags & PF_MEMALLOC)
                goto nopage;
+        /* Avoid allocations with no watermarks from looping endlessly */
+        if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
+                goto nopage;
        /* Try direct reclaim and then allocating */
        page = __alloc_pages_direct_reclaim(gfp_mask, order,
                                        zonelist, high_zoneidx,
@@ -1831,7 +1842,7 @@ rebalance:
        pages_reclaimed += did_some_progress;
        if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
                /* Wait for some write requests to complete then retry */
-                congestion_wait(WRITE, HZ/50);
+                congestion_wait(BLK_RW_ASYNC, HZ/50);
                goto rebalance;
        }
@@ -1983,7 +1994,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
                unsigned long alloc_end = addr + (PAGE_SIZE << order);
                unsigned long used = addr + PAGE_ALIGN(size);
-                split_page(virt_to_page(addr), order);
+                split_page(virt_to_page((void *)addr), order);
                while (used < alloc_end) {
                        free_page(used);
                        used += PAGE_SIZE;
@@ -4745,8 +4756,10 @@ void *__init alloc_large_system_hash(const char *tablename,
                         * some pages at the end of hash table which
                         * alloc_pages_exact() automatically does
                         */
-                        if (get_order(size) < MAX_ORDER)
+                        if (get_order(size) < MAX_ORDER) {
                                table = alloc_pages_exact(size, GFP_ATOMIC);
+                                kmemleak_alloc(table, size, 1, GFP_ATOMIC);
+                        }
                }
        } while (!table && size > PAGE_SIZE && --log2qty);
@@ -4764,16 +4777,6 @@ void *__init alloc_large_system_hash(const char *tablename,
        if (_hash_mask)
                *_hash_mask = (1 << log2qty) - 1;
-        /*
-         * If hashdist is set, the table allocation is done with __vmalloc()
-         * which invokes the kmemleak_alloc() callback. This function may also
-         * be called before the slab and kmemleak are initialised when
-         * kmemleak simply buffers the request to be executed later
-         * (GFP_ATOMIC flag ignored in this case).
-         */
-        if (!hashdist)
-                kmemleak_alloc(table, size, 1, GFP_ATOMIC);
        return table;
 }
diff --git a/mm/percpu.c b/mm/percpu.c
index b3d0bcff8c7c..3f9f182f9b44 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1004,7 +1004,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
        chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
        chunk->map[chunk->map_used++] = pcpu_unit_size;
-        chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
+        chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
        if (!chunk->vm) {
                free_pcpu_chunk(chunk);
                return NULL;
@@ -1325,7 +1325,7 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
                int *identity_map;
                /* #units == #cpus, identity mapped */
-                identity_map = alloc_bootmem(num_possible_cpus() *
+                identity_map = alloc_bootmem(nr_cpu_ids *
                                             sizeof(identity_map[0]));
                for_each_possible_cpu(cpu)
@@ -1333,7 +1333,7 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
                pcpu_first_unit_cpu = 0;
                pcpu_last_unit_cpu = pcpu_nr_units - 1;
-                pcpu_nr_units = num_possible_cpus();
+                pcpu_nr_units = nr_cpu_ids;
                pcpu_unit_map = identity_map;
        }
@@ -1464,7 +1464,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
        size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
        unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
-        chunk_size = unit_size * num_possible_cpus();
+        chunk_size = unit_size * nr_cpu_ids;
        base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
                                       __pa(MAX_DMA_ADDRESS));
@@ -1475,11 +1475,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
        }
        /* return the leftover and copy */
-        for_each_possible_cpu(cpu) {
+        for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
                void *ptr = base + cpu * unit_size;
-                free_bootmem(__pa(ptr + size_sum), unit_size - size_sum);
+                if (cpu_possible(cpu)) {
-                memcpy(ptr, __per_cpu_load, static_size);
+                        free_bootmem(__pa(ptr + size_sum),
+                                     unit_size - size_sum);
+                        memcpy(ptr, __per_cpu_load, static_size);
+                } else
+                        free_bootmem(__pa(ptr), unit_size);
        }
        /* we're ready, commit */
@@ -1525,8 +1529,7 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
                                  PCPU_MIN_UNIT_SIZE));
        /* unaligned allocations can't be freed, round up to page size */
-        pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
+        pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0]));
-                               sizeof(pages[0]));
        pages = alloc_bootmem(pages_size);
        /* allocate pages */
@@ -1546,7 +1549,7 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
        /* allocate vm area, map the pages and copy static data */
        vm.flags = VM_ALLOC;
-        vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT;
+        vm.size = nr_cpu_ids * unit_pages << PAGE_SHIFT;
        vm_area_register_early(&vm, PAGE_SIZE);
        for_each_possible_cpu(cpu) {
diff --git a/mm/slab.c b/mm/slab.c
index e74a16e4ced6..7b5d4deacfcd 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1544,9 +1544,6 @@ void __init kmem_cache_init(void)
        }
        g_cpucache_up = EARLY;
-        /* Annotate slab for lockdep -- annotate the malloc caches */
-        init_lock_keys();
 }
 void __init kmem_cache_init_late(void)
@@ -1563,6 +1560,9 @@ void __init kmem_cache_init_late(void)
        /* Done! */
        g_cpucache_up = FULL;
+        /* Annotate slab for lockdep -- annotate the malloc caches */
+        init_lock_keys();
        /*
         * Register a cpu startup notifier callback that initializes
         * cpu_cache_get for all new cpus
@@ -2547,7 +2547,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
        }
        if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
-                synchronize_rcu();
+                rcu_barrier();
        __kmem_cache_destroy(cachep);
        mutex_unlock(&cache_chain_mutex);
diff --git a/mm/slob.c b/mm/slob.c
index c78742defdc6..9641da3d5e58 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -595,6 +595,8 @@ EXPORT_SYMBOL(kmem_cache_create);
 void kmem_cache_destroy(struct kmem_cache *c)
 {
        kmemleak_free(c);
+        if (c->flags & SLAB_DESTROY_BY_RCU)
+                rcu_barrier();
        slob_free(c, sizeof(struct kmem_cache));
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
diff --git a/mm/slub.c b/mm/slub.c
index ffc895cc3a68..dc9765bb49dc 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -21,7 +21,6 @@
 #include <linux/kmemcheck.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
-#include <linux/kmemleak.h>
 #include <linux/mempolicy.h>
 #include <linux/ctype.h>
 #include <linux/debugobjects.h>
@@ -2595,6 +2594,8 @@ static inline int kmem_cache_close(struct kmem_cache *s)
 */
 void kmem_cache_destroy(struct kmem_cache *s)
 {
+        if (s->flags & SLAB_DESTROY_BY_RCU)
+                rcu_barrier();
        down_write(&slub_lock);
        s->refcount--;
        if (!s->refcount) {
@@ -2833,13 +2834,15 @@ EXPORT_SYMBOL(__kmalloc);
 static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
 {
        struct page *page;
+        void *ptr = NULL;
        flags |= __GFP_COMP | __GFP_NOTRACK;
        page = alloc_pages_node(node, flags, get_order(size));
        if (page)
-                return page_address(page);
+                ptr = page_address(page);
-        else
-                return NULL;
+        kmemleak_alloc(ptr, size, 1, flags);
+        return ptr;
 }
 #ifdef CONFIG_NUMA
@@ -2924,6 +2927,7 @@ void kfree(const void *x)
        page = virt_to_head_page(x);
        if (unlikely(!PageSlab(page))) {
                BUG_ON(!PageCompound(page));
+                kmemleak_free(x);
                put_page(page);
                return;
        }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d1ade1a48ee7..8ffdc0d23c53 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -753,7 +753,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
                if (!bdev) {
                        if (bdev_p)
-                                *bdev_p = bdget(sis->bdev->bd_dev);
+                                *bdev_p = bdgrab(sis->bdev);
                        spin_unlock(&swap_lock);
                        return i;
@@ -765,7 +765,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
                                        struct swap_extent, list);
                        if (se->start_block == offset) {
                                if (bdev_p)
-                                        *bdev_p = bdget(sis->bdev->bd_dev);
+                                        *bdev_p = bdgrab(sis->bdev);
                                spin_unlock(&swap_lock);
                                bdput(bdev);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 54155268dfca..dea7abd31098 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1104,7 +1104,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                 */
                if (nr_freed < nr_taken && !current_is_kswapd() &&
                    lumpy_reclaim) {
-                        congestion_wait(WRITE, HZ/10);
+                        congestion_wait(BLK_RW_ASYNC, HZ/10);
                        /*
                         * The attempt at page out may have made some
@@ -1721,7 +1721,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                /* Take a nap, wait for some writeback to complete */
                if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
-                        congestion_wait(WRITE, HZ/10);
+                        congestion_wait(BLK_RW_ASYNC, HZ/10);
        }
        /* top priority shrink_zones still had more to do? don't OOM, then */
        if (!sc->all_unreclaimable && scanning_global_lru(sc))
@@ -1960,7 +1960,7 @@ loop_again:
                 * another pass across the zones.
                 */
                if (total_scanned && priority < DEF_PRIORITY - 2)
-                        congestion_wait(WRITE, HZ/10);
+                        congestion_wait(BLK_RW_ASYNC, HZ/10);
                /*
                 * We do this so kswapd doesn't build up large priorities for
@@ -2233,7 +2233,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
                                goto out;
                        if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
-                                congestion_wait(WRITE, HZ / 10);
+                                congestion_wait(BLK_RW_ASYNC, HZ / 10);
                }
        }