Diffstat (limited to 'mm')
-rw-r--r--  mm/fremap.c           2
-rw-r--r--  mm/hugetlb.c         28
-rw-r--r--  mm/memcontrol.c      25
-rw-r--r--  mm/memory.c           2
-rw-r--r--  mm/migrate.c          2
-rw-r--r--  mm/mlock.c           48
-rw-r--r--  mm/mmap.c           109
-rw-r--r--  mm/mprotect.c         5
-rw-r--r--  mm/page-writeback.c  20
-rw-r--r--  mm/page_cgroup.c      3
-rw-r--r--  mm/rmap.c             3
-rw-r--r--  mm/shmem.c            2
-rw-r--r--  mm/slub.c             2
-rw-r--r--  mm/swapfile.c         5
14 files changed, 139 insertions(+), 117 deletions(-)
diff --git a/mm/fremap.c b/mm/fremap.c
index 736ba7f3306a..b6ec85abbb39 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -198,7 +198,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	flags &= MAP_NONBLOCK;
 	get_file(file);
 	addr = mmap_region(file, start, size,
-			flags, vma->vm_flags, pgoff, 1);
+			flags, vma->vm_flags, pgoff);
 	fput(file);
 	if (IS_ERR_VALUE(addr)) {
 		err = addr;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 618e98304080..107da3d809a8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2269,12 +2269,18 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 
 int hugetlb_reserve_pages(struct inode *inode,
 					long from, long to,
-					struct vm_area_struct *vma)
+					struct vm_area_struct *vma,
+					int acctflag)
 {
 	long ret, chg;
 	struct hstate *h = hstate_inode(inode);
 
-	if (vma && vma->vm_flags & VM_NORESERVE)
+	/*
+	 * Only apply hugepage reservation if asked. At fault time, an
+	 * attempt will be made for VM_NORESERVE to allocate a page
+	 * and filesystem quota without using reserves
+	 */
+	if (acctflag & VM_NORESERVE)
 		return 0;
 
 	/*
@@ -2299,13 +2305,31 @@ int hugetlb_reserve_pages(struct inode *inode,
 	if (chg < 0)
 		return chg;
 
+	/* There must be enough filesystem quota for the mapping */
 	if (hugetlb_get_quota(inode->i_mapping, chg))
 		return -ENOSPC;
+
+	/*
+	 * Check enough hugepages are available for the reservation.
+	 * Hand back the quota if there are not
+	 */
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
 		hugetlb_put_quota(inode->i_mapping, chg);
 		return ret;
 	}
+
+	/*
+	 * Account for the reservations made. Shared mappings record regions
+	 * that have reservations as they are shared by multiple VMAs.
+	 * When the last VMA disappears, the region map says how much
+	 * the reservation was and the page cache tells how much of
+	 * the reservation was consumed. Private mappings are per-VMA and
+	 * only the consumed reservations are tracked. When the VMA
+	 * disappears, the original reservation is the VMA size and the
+	 * consumed reservations are stored in the map. Hence, nothing
+	 * else has to be done for private mappings here
+	 */
 	if (!vma || vma->vm_flags & VM_SHARED)
 		region_add(&inode->i_mapping->private_list, from, to);
 	return 0;
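
A minimal userspace sketch of the acquire/unwind order used above: take the filesystem quota first, then attempt the hugepage accounting, and hand the quota back if that second step fails. reserve_quota(), account_pages() and the counters are illustrative stand-ins, not kernel APIs.

#include <stdio.h>

static long quota = 4, pages = 2;

static int reserve_quota(long chg)    { if (quota < chg) return -1; quota -= chg; return 0; }
static void unreserve_quota(long chg) { quota += chg; }
static int account_pages(long chg)    { if (pages < chg) return -1; pages -= chg; return 0; }

static int reserve(long chg)
{
	if (reserve_quota(chg))
		return -1;              /* -ENOSPC in the kernel */
	if (account_pages(chg)) {
		unreserve_quota(chg);   /* undo step one on failure */
		return -1;
	}
	return 0;
}

int main(void)
{
	int ret = reserve(3);	/* fails in step two; quota is restored */
	printf("chg=3: ret=%d, quota now %ld\n", ret, quota);
	return 0;
}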
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4d0ea3ceba6d..8e4be9cb2a6a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -202,6 +202,7 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
 
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
+static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 					 struct page_cgroup *pc,
@@ -1684,7 +1685,7 @@ move_account:
 	/* This is for making all *used* pages to be on LRU. */
 	lru_add_drain_all();
 	ret = 0;
-	for_each_node_state(node, N_POSSIBLE) {
+	for_each_node_state(node, N_HIGH_MEMORY) {
 		for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
 			enum lru_list l;
 			for_each_lru(l) {
@@ -2193,10 +2194,23 @@ static void mem_cgroup_get(struct mem_cgroup *mem)
 
 static void mem_cgroup_put(struct mem_cgroup *mem)
 {
-	if (atomic_dec_and_test(&mem->refcnt))
+	if (atomic_dec_and_test(&mem->refcnt)) {
+		struct mem_cgroup *parent = parent_mem_cgroup(mem);
 		__mem_cgroup_free(mem);
+		if (parent)
+			mem_cgroup_put(parent);
+	}
 }
 
+/*
+ * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
+ */
+static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
+{
+	if (!mem->res.parent)
+		return NULL;
+	return mem_cgroup_from_res_counter(mem->res.parent, res);
+}
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 static void __init enable_swap_cgroup(void)
@@ -2235,6 +2249,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	if (parent && parent->use_hierarchy) {
 		res_counter_init(&mem->res, &parent->res);
 		res_counter_init(&mem->memsw, &parent->memsw);
+		/*
+		 * We increment refcnt of the parent to ensure that we can
+		 * safely access it on res_counter_charge/uncharge.
+		 * This refcnt will be decremented when freeing this
+		 * mem_cgroup (see mem_cgroup_put).
+		 */
+		mem_cgroup_get(parent);
 	} else {
 		res_counter_init(&mem->res, NULL);
 		res_counter_init(&mem->memsw, NULL);
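
The parent pinning above pairs a mem_cgroup_get(parent) at creation with a mem_cgroup_put(parent) once the child's last reference drops, so a parent can never be freed while a child still points at its res_counter. A minimal sketch of that pairing with a toy refcounted node type (not the real mem_cgroup layout):

#include <assert.h>
#include <stdlib.h>

struct node {
	int refcnt;
	struct node *parent;
};

static struct node *node_new(struct node *parent)
{
	struct node *n = calloc(1, sizeof(*n));
	n->refcnt = 1;
	n->parent = parent;
	if (parent)
		parent->refcnt++;	/* pin the parent */
	return n;
}

static void node_put(struct node *n)
{
	if (--n->refcnt == 0) {
		struct node *parent = n->parent;
		free(n);
		if (parent)
			node_put(parent);	/* unpin only after the child is gone */
	}
}

int main(void)
{
	struct node *root = node_new(NULL);
	struct node *child = node_new(root);

	node_put(root);				/* root survives: child still holds a ref */
	assert(child->parent->refcnt == 1);
	node_put(child);			/* frees child, then drops the last ref on root */
	return 0;
}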
diff --git a/mm/memory.c b/mm/memory.c
index 22bfa7a47a0b..baa999e87cd2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1999,7 +1999,7 @@ gotten:
 	 * Don't let another task, with possibly unlocked vma,
 	 * keep the mlocked page.
 	 */
-	if (vma->vm_flags & VM_LOCKED) {
+	if ((vma->vm_flags & VM_LOCKED) && old_page) {
 		lock_page(old_page);	/* for LRU manipulation */
 		clear_page_mlock(old_page);
 		unlock_page(old_page);
diff --git a/mm/migrate.c b/mm/migrate.c
index 2bb4e1d63520..a9eff3f092f6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1129,7 +1129,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
 	struct vm_area_struct *vma;
 	int err = 0;
 
-	for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
+	for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
 		if (vma->vm_ops && vma->vm_ops->migrate) {
 			err = vma->vm_ops->migrate(vma, to, from, flags);
 			if (err)
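
The old condition tested vma->vm_next rather than vma, which both skips the last VMA and dereferences a NULL pointer when the list is empty. The difference in a self-contained toy list:

#include <stdio.h>

struct item { int id; struct item *next; };

int main(void)
{
	struct item c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct item *head = &a, *it;

	/* Buggy form: stops while it->next is NULL, so item 3 is missed
	 * (and an empty list would dereference NULL immediately). */
	for (it = head; it->next; it = it->next)
		printf("buggy: %d\n", it->id);

	/* Fixed form: test the node itself, visiting every element and
	 * handling an empty list safely. */
	for (it = head; it; it = it->next)
		printf("fixed: %d\n", it->id);
	return 0;
}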
diff --git a/mm/mlock.c b/mm/mlock.c
index 2904a347e476..037161d61b4e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -294,14 +294,10 @@ static inline int __mlock_posix_error_return(long retval)
  *
  * return number of pages [> 0] to be removed from locked_vm on success
  * of "special" vmas.
- *
- * return negative error if vma spanning @start-@range disappears while
- * mmap semaphore is dropped.  Unlikely?
  */
 long mlock_vma_pages_range(struct vm_area_struct *vma,
 			unsigned long start, unsigned long end)
 {
-	struct mm_struct *mm = vma->vm_mm;
 	int nr_pages = (end - start) / PAGE_SIZE;
 	BUG_ON(!(vma->vm_flags & VM_LOCKED));
 
@@ -314,20 +310,11 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
 	if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
 			is_vm_hugetlb_page(vma) ||
 			vma == get_gate_vma(current))) {
-		long error;
-		downgrade_write(&mm->mmap_sem);
-
-		error = __mlock_vma_pages_range(vma, start, end, 1);
 
-		up_read(&mm->mmap_sem);
-		/* vma can change or disappear */
-		down_write(&mm->mmap_sem);
-		vma = find_vma(mm, start);
-		/* non-NULL vma must contain @start, but need to check @end */
-		if (!vma || end > vma->vm_end)
-			return -ENOMEM;
+		__mlock_vma_pages_range(vma, start, end, 1);
 
-		return 0;	/* hide other errors from mmap(), et al */
+		/* Hide errors from mmap() and other callers */
+		return 0;
 	}
 
 	/*
@@ -438,41 +425,14 @@ success:
 	vma->vm_flags = newflags;
 
 	if (lock) {
-		/*
-		 * mmap_sem is currently held for write.  Downgrade the write
-		 * lock to a read lock so that other faults, mmap scans, ...
-		 * while we fault in all pages.
-		 */
-		downgrade_write(&mm->mmap_sem);
-
 		ret = __mlock_vma_pages_range(vma, start, end, 1);
 
-		/*
-		 * Need to reacquire mmap sem in write mode, as our callers
-		 * expect this.  We have no support for atomically upgrading
-		 * a sem to write, so we need to check for ranges while sem
-		 * is unlocked.
-		 */
-		up_read(&mm->mmap_sem);
-		/* vma can change or disappear */
-		down_write(&mm->mmap_sem);
-		*prev = find_vma(mm, start);
-		/* non-NULL *prev must contain @start, but need to check @end */
-		if (!(*prev) || end > (*prev)->vm_end)
-			ret = -ENOMEM;
-		else if (ret > 0) {
+		if (ret > 0) {
 			mm->locked_vm -= ret;
 			ret = 0;
 		} else
 			ret = __mlock_posix_error_return(ret); /* translate if needed */
 	} else {
-		/*
-		 * TODO: for unlocking, pages will already be resident, so
-		 * we don't need to wait for allocations/reclaim/pagein, ...
-		 * However, unlocking a very large region can still take a
-		 * while.  Should we downgrade the semaphore for both lock
-		 * AND unlock ?
-		 */
 		__mlock_vma_pages_range(vma, start, end, 0);
 	}
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 8d95902e9a38..00ced3ee49a8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -658,6 +658,9 @@ again: remove_next = 1 + (end > next->vm_end);
 	validate_mm(mm);
 }
 
+/* Flags that can be inherited from an existing mapping when merging */
+#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR)
+
 /*
  * If the vma has a ->close operation then the driver probably needs to release
  * per-vma resources, so we don't attempt to merge those.
@@ -665,7 +668,7 @@ again: remove_next = 1 + (end > next->vm_end);
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
 			struct file *file, unsigned long vm_flags)
 {
-	if (vma->vm_flags != vm_flags)
+	if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS)
 		return 0;
 	if (vma->vm_file != file)
 		return 0;
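
The XOR-and-mask test treats two flag words as mergeable when they differ only in bits covered by VM_MERGEABLE_FLAGS, where the old strict equality rejected them. A small standalone check of the idiom, with illustrative flag values rather than the kernel's:

#include <assert.h>

#define F_READ       0x1UL
#define F_WRITE      0x2UL
#define F_NONLINEAR  0x4UL
#define F_MERGEABLE  (F_NONLINEAR)

int main(void)
{
	unsigned long a = F_READ | F_WRITE | F_NONLINEAR;
	unsigned long b = F_READ | F_WRITE;

	/* Strict equality rejects the pair... */
	assert(a != b);
	/* ...but XOR keeps only the differing bits, and the mask clears
	 * the ones we are allowed to disagree on. */
	assert(((a ^ b) & ~F_MERGEABLE) == 0);

	/* A difference in any other bit still blocks the merge. */
	b |= 0x8UL;
	assert(((a ^ b) & ~F_MERGEABLE) != 0);
	return 0;
}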
@@ -915,7 +918,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	struct inode *inode;
 	unsigned int vm_flags;
 	int error;
-	int accountable = 1;
 	unsigned long reqprot = prot;
 
 	/*
@@ -1016,8 +1018,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 			return -EPERM;
 		vm_flags &= ~VM_MAYEXEC;
 	}
-	if (is_file_hugepages(file))
-		accountable = 0;
 
 	if (!file->f_op || !file->f_op->mmap)
 		return -ENODEV;
@@ -1050,8 +1050,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	if (error)
 		return error;
 
-	return mmap_region(file, addr, len, flags, vm_flags, pgoff,
-			   accountable);
+	return mmap_region(file, addr, len, flags, vm_flags, pgoff);
 }
 EXPORT_SYMBOL(do_mmap_pgoff);
 
@@ -1087,10 +1086,25 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
 		mapping_cap_account_dirty(vma->vm_file->f_mapping);
 }
 
+/*
+ * We account for memory if it's a private writeable mapping,
+ * not hugepages and VM_NORESERVE wasn't set.
+ */
+static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
+{
+	/*
+	 * hugetlb has its own accounting separate from the core VM
+	 * VM_HUGETLB may not be set yet so we cannot check for that flag.
+	 */
+	if (file && is_file_hugepages(file))
+		return 0;
+
+	return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
+}
+
 unsigned long mmap_region(struct file *file, unsigned long addr,
 			  unsigned long len, unsigned long flags,
-			  unsigned int vm_flags, unsigned long pgoff,
-			  int accountable)
+			  unsigned int vm_flags, unsigned long pgoff)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
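
accountable_mapping() boils the old accountable/overcommit logic down to a single predicate: charge only private writable mappings that are not VM_NORESERVE. A userspace restatement with illustrative bit values (the hugetlbfs special case is omitted; the real VM_* constants live in the kernel headers):

#include <assert.h>

#define VM_WRITE     0x2UL
#define VM_SHARED    0x8UL
#define VM_NORESERVE 0x200000UL

static int accountable(unsigned long vm_flags)
{
	return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
}

int main(void)
{
	assert(accountable(VM_WRITE));			/* private writable: charge */
	assert(!accountable(0));			/* private read-only: no charge */
	assert(!accountable(VM_WRITE | VM_SHARED));	/* shared: backed by its object */
	assert(!accountable(VM_WRITE | VM_NORESERVE));	/* explicitly unreserved */
	return 0;
}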
@@ -1114,38 +1128,38 @@ munmap_back:
 	if (!may_expand_vm(mm, len >> PAGE_SHIFT))
 		return -ENOMEM;
 
-	if (flags & MAP_NORESERVE)
-		vm_flags |= VM_NORESERVE;
+	/*
+	 * Set 'VM_NORESERVE' if we should not account for the
+	 * memory use of this mapping.
+	 */
+	if ((flags & MAP_NORESERVE)) {
+		/* We honor MAP_NORESERVE if allowed to overcommit */
+		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+			vm_flags |= VM_NORESERVE;
 
-	if (accountable && (!(flags & MAP_NORESERVE) ||
-			    sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
-		if (vm_flags & VM_SHARED) {
-			/* Check memory availability in shmem_file_setup? */
-			vm_flags |= VM_ACCOUNT;
-		} else if (vm_flags & VM_WRITE) {
-			/*
-			 * Private writable mapping: check memory availability
-			 */
-			charged = len >> PAGE_SHIFT;
-			if (security_vm_enough_memory(charged))
-				return -ENOMEM;
-			vm_flags |= VM_ACCOUNT;
-		}
+		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
+		if (file && is_file_hugepages(file))
+			vm_flags |= VM_NORESERVE;
 	}
 
 	/*
-	 * Can we just expand an old private anonymous mapping?
-	 * The VM_SHARED test is necessary because shmem_zero_setup
-	 * will create the file object for a shared anonymous map below.
+	 * Private writable mapping: check memory availability
 	 */
-	if (!file && !(vm_flags & VM_SHARED)) {
-		vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
-					NULL, NULL, pgoff, NULL);
-		if (vma)
-			goto out;
+	if (accountable_mapping(file, vm_flags)) {
+		charged = len >> PAGE_SHIFT;
+		if (security_vm_enough_memory(charged))
+			return -ENOMEM;
+		vm_flags |= VM_ACCOUNT;
 	}
 
 	/*
+	 * Can we just expand an old mapping?
+	 */
+	vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
+	if (vma)
+		goto out;
+
+	/*
 	 * Determine the object being mapped and call the appropriate
 	 * specific mapper. the address has already been validated, but
 	 * not unmapped, but the maps are removed from the list.
@@ -1186,14 +1200,6 @@ munmap_back:
 		goto free_vma;
 	}
 
-	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
-	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
-	 * that memory reservation must be checked; but that reservation
-	 * belongs to shared memory object, not to vma: so now clear it.
-	 */
-	if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
-		vma->vm_flags &= ~VM_ACCOUNT;
-
 	/* Can addr have changed??
 	 *
 	 * Answer: Yes, several device drivers can do it in their
@@ -1206,17 +1212,8 @@ munmap_back:
 	if (vma_wants_writenotify(vma))
 		vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
 
-	if (file && vma_merge(mm, prev, addr, vma->vm_end,
-			vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
-		mpol_put(vma_policy(vma));
-		kmem_cache_free(vm_area_cachep, vma);
-		fput(file);
-		if (vm_flags & VM_EXECUTABLE)
-			removed_exe_file_vma(mm);
-	} else {
-		vma_link(mm, vma, prev, rb_link, rb_parent);
-		file = vma->vm_file;
-	}
+	vma_link(mm, vma, prev, rb_link, rb_parent);
+	file = vma->vm_file;
 
 	/* Once vma denies write, undo our temporary denial count */
 	if (correct_wcount)
@@ -2087,12 +2084,8 @@ void exit_mmap(struct mm_struct *mm)
 	unsigned long end;
 
 	/* mm's last user has gone, and its about to be pulled down */
-	arch_exit_mmap(mm);
 	mmu_notifier_release(mm);
 
-	if (!mm->mmap)	/* Can happen if dup_mmap() received an OOM */
-		return;
-
 	if (mm->locked_vm) {
 		vma = mm->mmap;
 		while (vma) {
@@ -2101,7 +2094,13 @@ void exit_mmap(struct mm_struct *mm)
 			vma = vma->vm_next;
 		}
 	}
+
+	arch_exit_mmap(mm);
+
 	vma = mm->mmap;
+	if (!vma)	/* Can happen if dup_mmap() received an OOM */
+		return;
+
 	lru_add_drain();
 	flush_cache_mm(mm);
 	tlb = tlb_gather_mmu(mm, 1);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index abe2694e13f4..258197b76fb4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -151,10 +151,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	/*
 	 * If we make a private mapping writable we increase our commit;
 	 * but (without finer accounting) cannot reduce our commit if we
-	 * make it unwritable again.
+	 * make it unwritable again. hugetlb mappings were accounted for
+	 * even if read-only, so there is no need to account for them here
 	 */
 	if (newflags & VM_WRITE) {
-		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|
+		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
 				VM_SHARED|VM_NORESERVE))) {
 			charged = nrpages;
 			if (security_vm_enough_memory(charged))
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b493db7841dc..6106a5c7ed44 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -209,7 +209,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
 		struct file *filp, void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
-	int old_bytes = vm_dirty_bytes;
+	unsigned long old_bytes = vm_dirty_bytes;
 	int ret;
 
 	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
@@ -1051,13 +1051,25 @@ continue_unlock:
 			}
 		}
 
-		if (wbc->sync_mode == WB_SYNC_NONE) {
-			wbc->nr_to_write--;
-			if (wbc->nr_to_write <= 0) {
+		if (nr_to_write > 0) {
+			nr_to_write--;
+			if (nr_to_write == 0 &&
+			    wbc->sync_mode == WB_SYNC_NONE) {
+				/*
+				 * We stop writing back only if we are
+				 * not doing integrity sync. In case of
+				 * integrity sync we have to keep going
+				 * because someone may be concurrently
+				 * dirtying pages, and we might have
+				 * synced a lot of newly appeared dirty
+				 * pages, but have not synced all of the
+				 * old dirty pages.
+				 */
 				done = 1;
 				break;
 			}
 		}
+
 		if (wbc->nonblocking && bdi_write_congested(bdi)) {
 			wbc->encountered_congestion = 1;
 			done = 1;
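
The reworked termination check means an integrity sync (WB_SYNC_ALL) keeps writing after its nr_to_write budget is spent, while a plain background sync stops. A rough userspace model of that loop; the sync modes and the page source are simulated, not the real writeback machinery:

#include <stdio.h>

enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

/* Toy model: "write" pages until the budget runs out, unless this is
 * an integrity sync, which must visit everything that remains. */
static int writeback(enum sync_mode mode, int dirty_pages, long nr_to_write)
{
	int written = 0;

	while (dirty_pages-- > 0) {
		written++;
		if (nr_to_write > 0) {
			nr_to_write--;
			if (nr_to_write == 0 && mode == WB_SYNC_NONE)
				break;	/* budget spent, non-integrity: stop */
		}
	}
	return written;
}

int main(void)
{
	printf("WB_SYNC_NONE: %d pages\n", writeback(WB_SYNC_NONE, 10, 4));	/* 4 */
	printf("WB_SYNC_ALL:  %d pages\n", writeback(WB_SYNC_ALL, 10, 4));	/* 10 */
	return 0;
}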
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 7006a11350c8..ceecfbb143fa 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -114,7 +114,8 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
 	nid = page_to_nid(pfn_to_page(pfn));
 	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
 	if (slab_is_available()) {
-		base = kmalloc_node(table_size, GFP_KERNEL, nid);
+		base = kmalloc_node(table_size,
+				GFP_KERNEL | __GFP_NOWARN, nid);
 		if (!base)
 			base = vmalloc_node(table_size, nid);
 	} else {
diff --git a/mm/rmap.c b/mm/rmap.c
index ac4af8cffbf9..16521664010d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1072,7 +1072,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
 	spin_lock(&mapping->i_mmap_lock);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		if (MLOCK_PAGES && unlikely(unlock)) {
-			if (!(vma->vm_flags & VM_LOCKED))
+			if (!((vma->vm_flags & VM_LOCKED) &&
+			      page_mapped_in_vma(page, vma)))
 				continue;	/* must visit all vmas */
 			ret = SWAP_MLOCK;
 		} else {
diff --git a/mm/shmem.c b/mm/shmem.c
index 5d0de96c9789..19d566ccdeea 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2628,7 +2628,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
 		goto close_file;
 
 #ifdef CONFIG_SHMEM
-	SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
+	SHMEM_I(inode)->flags = (flags & VM_NORESERVE) ? 0 : VM_ACCOUNT;
 #endif
 	d_instantiate(dentry, inode);
 	inode->i_size = size;
diff --git a/mm/slub.c b/mm/slub.c
index 6392ae5cc6b1..bdc9abb08a23 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1996,7 +1996,7 @@ static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
 static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
 {
 	if (c < per_cpu(kmem_cache_cpu, cpu) ||
-			c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
+			c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
 		kfree(c);
 		return;
 	}
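
Valid kmem_cache_cpu slots occupy the half-open range [base, base + NR_KMEM_CACHE_CPU); the old test accepted a pointer exactly one past the end as array-resident instead of kfree()ing it. The same half-open bounds check on a plain array standing in for the per-cpu area:

#include <assert.h>

#define NR 4

int main(void)
{
	int pool[NR];
	int *base = pool;
	int *past_end = pool + NR;	/* one past the last element */

	/* The old "outside the pool" test wrongly classifies the
	 * past-the-end pointer as inside... */
	assert(!(past_end < base || past_end > base + NR));
	/* ...while the fixed test correctly rejects it. */
	assert(past_end < base || past_end >= base + NR);
	return 0;
}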
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f48b831e5e5c..7e6304dfafab 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -698,8 +698,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	pte_t *pte;
 	int ret = 1;
 
-	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr))
+	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
 		ret = -ENOMEM;
+		goto out_nolock;
+	}
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
705 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 707 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
@@ -723,6 +725,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
723 activate_page(page); 725 activate_page(page);
724out: 726out:
725 pte_unmap_unlock(pte, ptl); 727 pte_unmap_unlock(pte, ptl);
728out_nolock:
726 return ret; 729 return ret;
727} 730}
728 731
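
On charge failure the code previously set ret and fell through, taking the page-table lock anyway; the new out_nolock label returns before the lock is ever taken, while out: remains for exits that do hold it. A generic sketch of the two-label exit pattern; do_work() and the lock helpers are hypothetical, not kernel APIs:

#include <stdio.h>

static int locked;

static void lock(void)   { locked = 1; }
static void unlock(void) { locked = 0; }

static int do_work(int charge_ok, int pte_ok)
{
	int ret = 1;

	if (!charge_ok) {
		ret = -1;		/* -ENOMEM in the kernel */
		goto out_nolock;	/* lock not held: skip the unlock */
	}

	lock();
	if (!pte_ok) {
		ret = 0;
		goto out;		/* lock held: must unlock */
	}
	/* ...the actual update would go here... */
out:
	unlock();
out_nolock:
	return ret;
}

int main(void)
{
	int ret;

	ret = do_work(0, 1);
	printf("charge fails: ret=%d, locked=%d\n", ret, locked);
	ret = do_work(1, 0);
	printf("pte mismatch: ret=%d, locked=%d\n", ret, locked);
	return 0;
}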