Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig       |  6
-rw-r--r--  mm/hugetlb.c     |  2
-rw-r--r--  mm/kmemleak.c    |  4
-rw-r--r--  mm/memcontrol.c  | 23
-rw-r--r--  mm/mempolicy.c   | 84
-rw-r--r--  mm/mempool.c     |  4
-rw-r--r--  mm/mmap.c        |  3
-rw-r--r--  mm/nommu.c       |  7
-rw-r--r--  mm/oom_kill.c    | 64
-rw-r--r--  mm/page_alloc.c  | 25
-rw-r--r--  mm/percpu.c      | 35
-rw-r--r--  mm/rmap.c        |  1
-rw-r--r--  mm/swapfile.c    |  4
-rw-r--r--  mm/vmscan.c      |  9
14 files changed, 176 insertions, 95 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c948d4ca8bde..fe5f674d7a7d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -225,9 +225,9 @@ config DEFAULT_MMAP_MIN_ADDR
	  For most ia64, ppc64 and x86 users with lots of address space
	  a value of 65536 is reasonable and should cause no problems.
	  On arm and other archs it should not be higher than 32768.
-	  Programs which use vm86 functionality would either need additional
-	  permissions from either the LSM or the capabilities module or have
-	  this protection disabled.
+	  Programs which use vm86 functionality or have some need to map
+	  this low address space will need CAP_SYS_RAWIO or disable this
+	  protection by setting the value to 0.
 
	  This value can be changed after boot using the
	  /proc/sys/vm/mmap_min_addr tunable.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d0351e31f474..cafdcee154e8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2370,7 +2370,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	long chg = region_truncate(&inode->i_mapping->private_list, offset);
 
 	spin_lock(&inode->i_lock);
-	inode->i_blocks -= blocks_per_huge_page(h);
+	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
 	spin_unlock(&inode->i_lock);
 
 	hugetlb_put_quota(inode->i_mapping, (chg - freed));
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 5aabd41ffb8f..487267310a84 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1217,7 +1217,6 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
 	}
 	object = NULL;
 out:
-	rcu_read_unlock();
 	return object;
 }
 
@@ -1233,13 +1232,11 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 	++(*pos);
 
-	rcu_read_lock();
 	list_for_each_continue_rcu(n, &object_list) {
 		next_obj = list_entry(n, struct kmemleak_object, object_list);
 		if (get_object(next_obj))
 			break;
 	}
-	rcu_read_unlock();
 
 	put_object(prev_obj);
 	return next_obj;
@@ -1255,6 +1252,7 @@ static void kmemleak_seq_stop(struct seq_file *seq, void *v)
 	 * kmemleak_seq_start may return ERR_PTR if the scan_mutex
 	 * waiting was interrupted, so only release it if !IS_ERR.
 	 */
+	rcu_read_unlock();
 	mutex_unlock(&scan_mutex);
 	if (v)
 		put_object(v);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e717964cb5a0..fd4529d86de5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1207,6 +1207,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	ret = 0;
 out:
 	unlock_page_cgroup(pc);
+	/*
+	 * We charges against "to" which may not have any tasks. Then, "to"
+	 * can be under rmdir(). But in current implementation, caller of
+	 * this function is just force_empty() and it's garanteed that
+	 * "to" is never removed. So, we don't check rmdir status here.
+	 */
 	return ret;
 }
 
@@ -1428,6 +1434,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 		return;
 	if (!ptr)
 		return;
+	cgroup_exclude_rmdir(&ptr->css);
 	pc = lookup_page_cgroup(page);
 	mem_cgroup_lru_del_before_commit_swapcache(page);
 	__mem_cgroup_commit_charge(ptr, pc, ctype);
@@ -1457,8 +1464,12 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 		}
 		rcu_read_unlock();
 	}
-	/* add this page(page_cgroup) to the LRU we want. */
-
+	/*
+	 * At swapin, we may charge account against cgroup which has no tasks.
+	 * So, rmdir()->pre_destroy() can be called while we do this charge.
+	 * In that case, we need to call pre_destroy() again. check it here.
+	 */
+	cgroup_release_and_wakeup_rmdir(&ptr->css);
 }
 
 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
@@ -1664,7 +1675,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
 
 	if (!mem)
 		return;
-
+	cgroup_exclude_rmdir(&mem->css);
 	/* at migration success, oldpage->mapping is NULL. */
 	if (oldpage->mapping) {
 		target = oldpage;
@@ -1704,6 +1715,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
 	 */
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
 		mem_cgroup_uncharge_page(target);
+	/*
+	 * At migration, we may charge account against cgroup which has no tasks
+	 * So, rmdir()->pre_destroy() can be called while we do this charge.
+	 * In that case, we need to call pre_destroy() again. check it here.
+	 */
+	cgroup_release_and_wakeup_rmdir(&mem->css);
 }
 
 /*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e08e2c4da63a..7dd9d9f80694 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -191,25 +191,27 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
  * Must be called holding task's alloc_lock to protect task's mems_allowed
  * and mempolicy. May also be called holding the mmap_semaphore for write.
  */
-static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
+static int mpol_set_nodemask(struct mempolicy *pol,
+		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 {
-	nodemask_t cpuset_context_nmask;
 	int ret;
 
 	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 	if (pol == NULL)
 		return 0;
+	/* Check N_HIGH_MEMORY */
+	nodes_and(nsc->mask1,
+		  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
 
 	VM_BUG_ON(!nodes);
 	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 		nodes = NULL;	/* explicit local allocation */
 	else {
 		if (pol->flags & MPOL_F_RELATIVE_NODES)
-			mpol_relative_nodemask(&cpuset_context_nmask, nodes,
-					       &cpuset_current_mems_allowed);
+			mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
 		else
-			nodes_and(cpuset_context_nmask, *nodes,
-				  cpuset_current_mems_allowed);
+			nodes_and(nsc->mask2, *nodes, nsc->mask1);
+
 		if (mpol_store_user_nodemask(pol))
 			pol->w.user_nodemask = *nodes;
 		else
@@ -217,8 +219,10 @@ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 			cpuset_current_mems_allowed;
 	}
 
-	ret = mpol_ops[pol->mode].create(pol,
-			nodes ? &cpuset_context_nmask : NULL);
+	if (nodes)
+		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
+	else
+		ret = mpol_ops[pol->mode].create(pol, NULL);
 	return ret;
 }
 
@@ -620,12 +624,17 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 {
 	struct mempolicy *new, *old;
 	struct mm_struct *mm = current->mm;
+	NODEMASK_SCRATCH(scratch);
 	int ret;
 
-	new = mpol_new(mode, flags, nodes);
-	if (IS_ERR(new))
-		return PTR_ERR(new);
+	if (!scratch)
+		return -ENOMEM;
 
+	new = mpol_new(mode, flags, nodes);
+	if (IS_ERR(new)) {
+		ret = PTR_ERR(new);
+		goto out;
+	}
 	/*
 	 * prevent changing our mempolicy while show_numa_maps()
 	 * is using it.
@@ -635,13 +644,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 	if (mm)
 		down_write(&mm->mmap_sem);
 	task_lock(current);
-	ret = mpol_set_nodemask(new, nodes);
+	ret = mpol_set_nodemask(new, nodes, scratch);
 	if (ret) {
 		task_unlock(current);
 		if (mm)
 			up_write(&mm->mmap_sem);
 		mpol_put(new);
-		return ret;
+		goto out;
 	}
646 old = current->mempolicy; 655 old = current->mempolicy;
647 current->mempolicy = new; 656 current->mempolicy = new;
@@ -654,7 +663,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 		up_write(&mm->mmap_sem);
 
 	mpol_put(old);
-	return 0;
+	ret = 0;
+out:
+	NODEMASK_SCRATCH_FREE(scratch);
+	return ret;
 }
 
 /*
@@ -1014,12 +1026,20 @@ static long do_mbind(unsigned long start, unsigned long len,
 		if (err)
 			return err;
 	}
-	down_write(&mm->mmap_sem);
-	task_lock(current);
-	err = mpol_set_nodemask(new, nmask);
-	task_unlock(current);
+	{
+		NODEMASK_SCRATCH(scratch);
+		if (scratch) {
+			down_write(&mm->mmap_sem);
+			task_lock(current);
+			err = mpol_set_nodemask(new, nmask, scratch);
+			task_unlock(current);
+			if (err)
+				up_write(&mm->mmap_sem);
+		} else
+			err = -ENOMEM;
+		NODEMASK_SCRATCH_FREE(scratch);
+	}
 	if (err) {
-		up_write(&mm->mmap_sem);
 		mpol_put(new);
 		return err;
 	}
@@ -1891,6 +1911,7 @@ restart:
  * Install non-NULL @mpol in inode's shared policy rb-tree.
  * On entry, the current task has a reference on a non-NULL @mpol.
  * This must be released on exit.
+ * This is called at get_inode() calls and we can use GFP_KERNEL.
  */
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 {
@@ -1902,19 +1923,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 	if (mpol) {
 		struct vm_area_struct pvma;
 		struct mempolicy *new;
+		NODEMASK_SCRATCH(scratch);
 
+		if (!scratch)
+			return;
 		/* contextualize the tmpfs mount point mempolicy */
 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
 		if (IS_ERR(new)) {
 			mpol_put(mpol);	/* drop our ref on sb mpol */
+			NODEMASK_SCRATCH_FREE(scratch);
 			return;		/* no valid nodemask intersection */
 		}
 
 		task_lock(current);
-		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask);
+		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
 		task_unlock(current);
 		mpol_put(mpol);	/* drop our ref on sb mpol */
 		if (ret) {
+			NODEMASK_SCRATCH_FREE(scratch);
 			mpol_put(new);
 			return;
 		}
@@ -1924,6 +1950,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
 		mpol_put(new);			/* drop initial ref */
+		NODEMASK_SCRATCH_FREE(scratch);
 	}
 }
 
@@ -2140,13 +2167,18 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 			err = 1;
 		else {
 			int ret;
-
-			task_lock(current);
-			ret = mpol_set_nodemask(new, &nodes);
-			task_unlock(current);
-			if (ret)
+			NODEMASK_SCRATCH(scratch);
+			if (scratch) {
+				task_lock(current);
+				ret = mpol_set_nodemask(new, &nodes, scratch);
+				task_unlock(current);
+			} else
+				ret = -ENOMEM;
+			NODEMASK_SCRATCH_FREE(scratch);
+			if (ret) {
 				err = 1;
-			else if (no_context) {
+				mpol_put(new);
+			} else if (no_context) {
 				/* save for contextualization */
 				new->w.user_nodemask = nodes;
 			}
diff --git a/mm/mempool.c b/mm/mempool.c
index a46eb1b4bb66..32e75d400503 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -303,14 +303,14 @@ EXPORT_SYMBOL(mempool_free_slab);
  */
 void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
 {
-	size_t size = (size_t)(long)pool_data;
+	size_t size = (size_t)pool_data;
 	return kmalloc(size, gfp_mask);
 }
 EXPORT_SYMBOL(mempool_kmalloc);
 
 void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
 {
-	size_t size = (size_t) pool_data;
+	size_t size = (size_t)pool_data;
 	return kzalloc(size, gfp_mask);
 }
 EXPORT_SYMBOL(mempool_kzalloc);
diff --git a/mm/mmap.c b/mm/mmap.c
index 34579b23ebd5..8101de490c73 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -88,9 +88,6 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
 struct percpu_counter vm_committed_as;
 
-/* amount of vm to protect from userspace access */
-unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
-
 /*
  * Check that a process has enough memory to allocate a new virtual
  * mapping. 0 means there is enough memory for the allocation to
diff --git a/mm/nommu.c b/mm/nommu.c
index 53cab10fece4..4bde489ec431 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -69,9 +69,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
 int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
 int heap_stack_gap = 0;
 
-/* amount of vm to protect from userspace access */
-unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
-
 atomic_long_t mmap_pages_allocated;
 
 EXPORT_SYMBOL(mem_map);
@@ -922,6 +919,10 @@ static int validate_mmap_request(struct file *file,
 		if (!file->f_op->read)
 			capabilities &= ~BDI_CAP_MAP_COPY;
 
+		/* The file shall have been opened with read permission. */
+		if (!(file->f_mode & FMODE_READ))
+			return -EACCES;
+
 		if (flags & MAP_SHARED) {
 			/* do checks for writing, appending and locking */
 			if ((prot & PROT_WRITE) &&
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 175a67a78a99..a7b2460e922b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,7 +58,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	unsigned long points, cpu_time, run_time;
 	struct mm_struct *mm;
 	struct task_struct *child;
-	int oom_adj;
 
 	task_lock(p);
 	mm = p->mm;
@@ -66,11 +65,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 		task_unlock(p);
 		return 0;
 	}
-	oom_adj = mm->oom_adj;
-	if (oom_adj == OOM_DISABLE) {
-		task_unlock(p);
-		return 0;
-	}
 
 	/*
 	 * The memory size of the process is the basis for the badness.
@@ -154,15 +148,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 		points /= 8;
 
 	/*
-	 * Adjust the score by oom_adj.
+	 * Adjust the score by oomkilladj.
 	 */
-	if (oom_adj) {
-		if (oom_adj > 0) {
+	if (p->oomkilladj) {
+		if (p->oomkilladj > 0) {
 			if (!points)
 				points = 1;
-			points <<= oom_adj;
+			points <<= p->oomkilladj;
 		} else
-			points >>= -(oom_adj);
+			points >>= -(p->oomkilladj);
 	}
 
 #ifdef DEBUG
@@ -257,8 +251,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
 			*ppoints = ULONG_MAX;
 		}
 
+		if (p->oomkilladj == OOM_DISABLE)
+			continue;
+
 		points = badness(p, uptime.tv_sec);
-		if (points > *ppoints) {
+		if (points > *ppoints || !chosen) {
 			chosen = p;
 			*ppoints = points;
 		}
@@ -307,7 +304,8 @@ static void dump_tasks(const struct mem_cgroup *mem)
 		}
 		printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
 		       p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
-		       get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm);
+		       get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj,
+		       p->comm);
 		task_unlock(p);
 	} while_each_thread(g, p);
 }
@@ -325,8 +323,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
 		return;
 	}
 
-	if (!p->mm)
+	if (!p->mm) {
+		WARN_ON(1);
+		printk(KERN_WARNING "tried to kill an mm-less task!\n");
 		return;
+	}
 
 	if (verbose)
 		printk(KERN_ERR "Killed process %d (%s)\n",
@@ -348,13 +349,28 @@ static int oom_kill_task(struct task_struct *p)
 	struct mm_struct *mm;
 	struct task_struct *g, *q;
 
-	task_lock(p);
 	mm = p->mm;
-	if (!mm || mm->oom_adj == OOM_DISABLE) {
-		task_unlock(p);
+
+	/* WARNING: mm may not be dereferenced since we did not obtain its
+	 * value from get_task_mm(p). This is OK since all we need to do is
+	 * compare mm to q->mm below.
+	 *
+	 * Furthermore, even if mm contains a non-NULL value, p->mm may
+	 * change to NULL at any time since we do not hold task_lock(p).
+	 * However, this is of no concern to us.
+	 */
+
+	if (mm == NULL)
 		return 1;
-	}
-	task_unlock(p);
+
+	/*
+	 * Don't kill the process if any threads are set to OOM_DISABLE
+	 */
+	do_each_thread(g, q) {
+		if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
+			return 1;
+	} while_each_thread(g, q);
+
 	__oom_kill_task(p, 1);
 
 	/*
@@ -377,11 +393,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	struct task_struct *c;
 
 	if (printk_ratelimit()) {
-		task_lock(current);
 		printk(KERN_WARNING "%s invoked oom-killer: "
-			"gfp_mask=0x%x, order=%d, oom_adj=%d\n",
-			current->comm, gfp_mask, order,
-			current->mm ? current->mm->oom_adj : OOM_DISABLE);
+			"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
+			current->comm, gfp_mask, order, current->oomkilladj);
+		task_lock(current);
 		cpuset_print_task_mems_allowed(current);
 		task_unlock(current);
 		dump_stack();
@@ -394,9 +409,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
 	 * its children or threads, just set TIF_MEMDIE so it can die quickly
-	 * if its mm is still attached.
 	 */
-	if (p->mm && (p->flags & PF_EXITING)) {
+	if (p->flags & PF_EXITING) {
 		__oom_kill_task(p, 0);
 		return 0;
 	}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index caa92689aac9..5cc986eb9f6f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -882,7 +882,7 @@ retry_reserve:
  */
 static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			unsigned long count, struct list_head *list,
-			int migratetype)
+			int migratetype, int cold)
 {
 	int i;
 
@@ -901,7 +901,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 * merge IO requests if the physical pages are ordered
 		 * properly.
 		 */
-		list_add(&page->lru, list);
+		if (likely(cold == 0))
+			list_add(&page->lru, list);
+		else
+			list_add_tail(&page->lru, list);
 		set_page_private(page, migratetype);
 		list = &page->lru;
 	}
@@ -1119,7 +1122,8 @@ again:
 		local_irq_save(flags);
 		if (!pcp->count) {
 			pcp->count = rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list, migratetype);
+					pcp->batch, &pcp->list,
+					migratetype, cold);
 			if (unlikely(!pcp->count))
 				goto failed;
 		}
@@ -1138,7 +1142,8 @@ again:
 		/* Allocate more to the pcp list if necessary */
 		if (unlikely(&page->lru == &pcp->list)) {
 			pcp->count += rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list, migratetype);
+					pcp->batch, &pcp->list,
+					migratetype, cold);
 			page = list_entry(pcp->list.next, struct page, lru);
 		}
 
@@ -1740,8 +1745,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * be using allocators in order of preference for an area that is
 	 * too large.
 	 */
-	if (WARN_ON_ONCE(order >= MAX_ORDER))
+	if (order >= MAX_ORDER) {
+		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
 		return NULL;
+	}
 
 	/*
 	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1789,6 +1796,10 @@ rebalance:
 	if (p->flags & PF_MEMALLOC)
 		goto nopage;
 
+	/* Avoid allocations with no watermarks from looping endlessly */
+	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
+		goto nopage;
+
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
 					zonelist, high_zoneidx,
@@ -2533,7 +2544,6 @@ static void build_zonelists(pg_data_t *pgdat)
 	prev_node = local_node;
 	nodes_clear(used_mask);
 
-	memset(node_load, 0, sizeof(node_load));
 	memset(node_order, 0, sizeof(node_order));
 	j = 0;
 
@@ -2642,6 +2652,9 @@ static int __build_all_zonelists(void *dummy)
 {
 	int nid;
 
+#ifdef CONFIG_NUMA
+	memset(node_load, 0, sizeof(node_load));
+#endif
 	for_each_online_node(nid) {
 		pg_data_t *pgdat = NODE_DATA(nid);
 
diff --git a/mm/percpu.c b/mm/percpu.c
index b70f2acd8853..5fe37842e0ea 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -8,12 +8,12 @@
  *
  * This is percpu allocator which can handle both static and dynamic
  * areas. Percpu areas are allocated in chunks in vmalloc area. Each
- * chunk is consisted of num_possible_cpus() units and the first chunk
- * is used for static percpu variables in the kernel image (special
- * boot time alloc/init handling necessary as these areas need to be
- * brought up before allocation services are running). Unit grows as
- * necessary and all units grow or shrink in unison. When a chunk is
- * filled up, another chunk is allocated. ie. in vmalloc area
+ * chunk is consisted of nr_cpu_ids units and the first chunk is used
+ * for static percpu variables in the kernel image (special boot time
+ * alloc/init handling necessary as these areas need to be brought up
+ * before allocation services are running). Unit grows as necessary
+ * and all units grow or shrink in unison. When a chunk is filled up,
+ * another chunk is allocated. ie. in vmalloc area
  *
  * c0                 c1                 c2
  * -------------------  -------------------  ------------
@@ -558,7 +558,7 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
 		       bool flush_tlb)
 {
-	unsigned int last = num_possible_cpus() - 1;
+	unsigned int last = nr_cpu_ids - 1;
 	unsigned int cpu;
 
 	/* unmap must not be done on immutable chunk */
@@ -643,7 +643,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
  */
 static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
 {
-	unsigned int last = num_possible_cpus() - 1;
+	unsigned int last = nr_cpu_ids - 1;
 	unsigned int cpu;
 	int err;
 
@@ -749,7 +749,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
 	chunk->map[chunk->map_used++] = pcpu_unit_size;
 	chunk->page = chunk->page_ar;
 
-	chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
+	chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
 	if (!chunk->vm) {
 		free_pcpu_chunk(chunk);
 		return NULL;
@@ -1067,9 +1067,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 				PFN_UP(size_sum));
 
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
-	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
+	pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size;
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
-		+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
+		+ nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *);
 
 	if (dyn_size < 0)
 		dyn_size = pcpu_unit_size - static_size - reserved_size;
@@ -1248,7 +1248,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 	} else
 		pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
 
-	chunk_size = pcpue_unit_size * num_possible_cpus();
+	chunk_size = pcpue_unit_size * nr_cpu_ids;
 
 	pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
 					    __pa(MAX_DMA_ADDRESS));
@@ -1259,12 +1259,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
 	}
 
 	/* return the leftover and copy */
-	for_each_possible_cpu(cpu) {
+	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
 		void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
 
-		free_bootmem(__pa(ptr + pcpue_size),
-			     pcpue_unit_size - pcpue_size);
-		memcpy(ptr, __per_cpu_load, static_size);
+		if (cpu_possible(cpu)) {
+			free_bootmem(__pa(ptr + pcpue_size),
+				     pcpue_unit_size - pcpue_size);
+			memcpy(ptr, __per_cpu_load, static_size);
+		} else
+			free_bootmem(__pa(ptr), pcpue_unit_size);
 	}
 
 	/* we're ready, commit */
diff --git a/mm/rmap.c b/mm/rmap.c
index 836c6c63e1f2..0895b5c7cbff 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -358,6 +358,7 @@ static int page_referenced_one(struct page *page,
 	 */
 	if (vma->vm_flags & VM_LOCKED) {
 		*mapcount = 1;	/* break early from loop */
+		*vm_flags |= VM_LOCKED;
 		goto out_unmap;
 	}
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d1ade1a48ee7..8ffdc0d23c53 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -753,7 +753,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 
 		if (!bdev) {
 			if (bdev_p)
-				*bdev_p = bdget(sis->bdev->bd_dev);
+				*bdev_p = bdgrab(sis->bdev);
 
 			spin_unlock(&swap_lock);
 			return i;
@@ -765,7 +765,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 					struct swap_extent, list);
 			if (se->start_block == offset) {
 				if (bdev_p)
-					*bdev_p = bdget(sis->bdev->bd_dev);
+					*bdev_p = bdgrab(sis->bdev);
 
 				spin_unlock(&swap_lock);
 				bdput(bdev);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dea7abd31098..94e86dd6954c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -630,9 +630,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 		referenced = page_referenced(page, 1,
 						sc->mem_cgroup, &vm_flags);
-		/* In active use or really unfreeable? Activate it. */
+		/*
+		 * In active use or really unfreeable? Activate it.
+		 * If page which have PG_mlocked lost isoltation race,
+		 * try_to_unmap moves it to unevictable list
+		 */
 		if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
-					referenced && page_mapping_inuse(page))
+					referenced && page_mapping_inuse(page)
+					&& !(vm_flags & VM_LOCKED))
 			goto activate_locked;
 
 		/*