18 files changed, 165 insertions, 76 deletions
diff --git a/mm/fremap.c b/mm/fremap.c
index 736ba7f3306a..b6ec85abbb39 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -198,7 +198,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                        flags &= MAP_NONBLOCK;
                        get_file(file);
                        addr = mmap_region(file, start, size,
-                                        flags, vma->vm_flags, pgoff, 1);
+                                        flags, vma->vm_flags, pgoff);
                        fput(file);
                        if (IS_ERR_VALUE(addr)) {
                                err = addr;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 618e98304080..107da3d809a8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2269,12 +2269,18 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 int hugetlb_reserve_pages(struct inode *inode,
                                        long from, long to,
-                                        struct vm_area_struct *vma)
+                                        struct vm_area_struct *vma,
+                                        int acctflag)
 {
        long ret, chg;
        struct hstate *h = hstate_inode(inode);
-        if (vma && vma->vm_flags & VM_NORESERVE)
+        /*
+         * Only apply hugepage reservation if asked. For VM_NORESERVE
+         * mappings, an attempt is made at fault time to allocate a page
+         * and filesystem quota without using reserves.
+         */
+        if (acctflag & VM_NORESERVE)
                return 0;
        /*
@@ -2299,13 +2305,31 @@ int hugetlb_reserve_pages(struct inode *inode,
        if (chg < 0)
                return chg;
+        /* There must be enough filesystem quota for the mapping */
        if (hugetlb_get_quota(inode->i_mapping, chg))
                return -ENOSPC;
+        /*
+         * Check enough hugepages are available for the reservation.
+         * Hand back the quota if there are not
+         */
        ret = hugetlb_acct_memory(h, chg);
        if (ret < 0) {
                hugetlb_put_quota(inode->i_mapping, chg);
                return ret;
        }
+        /*
+         * Account for the reservations made. Shared mappings record regions
+         * that have reservations as they are shared by multiple VMAs.
+         * When the last VMA disappears, the region map says how much
+         * the reservation was and the page cache tells how much of
+         * the reservation was consumed. Private mappings are per-VMA and
+         * only the consumed reservations are tracked. When the VMA
+         * disappears, the original reservation is the VMA size and the
+         * consumed reservations are stored in the map. Hence, nothing
+         * else has to be done for private mappings here
+         */
        if (!vma || vma->vm_flags & VM_SHARED)
                region_add(&inode->i_mapping->private_list, from, to);
        return 0;
diff --git a/mm/migrate.c b/mm/migrate.c
index 2bb4e1d63520..a9eff3f092f6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1129,7 +1129,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
        struct vm_area_struct *vma;
        int err = 0;
-        for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) {
+        for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
                if (vma->vm_ops && vma->vm_ops->migrate) {
                        err = vma->vm_ops->migrate(vma, to, from, flags);
                        if (err)
diff --git a/mm/mlock.c b/mm/mlock.c
index 028ec482fdd4..cbe9e0581b75 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -311,7 +311,10 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
                        is_vm_hugetlb_page(vma) ||
                        vma == get_gate_vma(current))) {
-                return __mlock_vma_pages_range(vma, start, end, 1);
+                __mlock_vma_pages_range(vma, start, end, 1);
+                /* Hide errors from mmap() and other callers */
+                return 0;
        }
        /*
@@ -657,7 +660,7 @@ void *alloc_locked_buffer(size_t size)
        return buffer;
 }
-void free_locked_buffer(void *buffer, size_t size)
+void release_locked_buffer(void *buffer, size_t size)
 {
        unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
@@ -667,6 +670,11 @@ void free_locked_buffer(void *buffer, size_t size)
        current->mm->locked_vm -= pgsz;
        up_write(&current->mm->mmap_sem);
+}
+void free_locked_buffer(void *buffer, size_t size)
+{
+        release_locked_buffer(buffer, size);
        kfree(buffer);
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 214b6a258eeb..00ced3ee49a8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -918,7 +918,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
        struct inode *inode;
        unsigned int vm_flags;
        int error;
-        int accountable = 1;
        unsigned long reqprot = prot;
        /*
@@ -1019,8 +1018,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
                                        return -EPERM;
                                vm_flags &= ~VM_MAYEXEC;
                        }
-                        if (is_file_hugepages(file))
-                                accountable = 0;
                        if (!file->f_op || !file->f_op->mmap)
                                return -ENODEV;
@@ -1053,8 +1050,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
        if (error)
                return error;
-        return mmap_region(file, addr, len, flags, vm_flags, pgoff,
+        return mmap_region(file, addr, len, flags, vm_flags, pgoff);
-                           accountable);
 }
 EXPORT_SYMBOL(do_mmap_pgoff);
@@ -1092,17 +1088,23 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
 /*
 * We account for memory if it's a private writeable mapping,
- * and VM_NORESERVE wasn't set.
+ * not hugepages and VM_NORESERVE wasn't set.
 */
-static inline int accountable_mapping(unsigned int vm_flags)
+static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
 {
+        /*
+         * hugetlb has its own accounting, separate from the core VM, so
+         * mappings of hugepage files are never accountable here.
+         * VM_HUGETLB may not be set yet, so we cannot check for that flag;
+         * test the backing file instead.
+         */
+        if (file && is_file_hugepages(file))
+                return 0;
        return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
 }
 unsigned long mmap_region(struct file *file, unsigned long addr,
                          unsigned long len, unsigned long flags,
-                          unsigned int vm_flags, unsigned long pgoff,
+                          unsigned int vm_flags, unsigned long pgoff)
-                          int accountable)
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
@@ -1128,18 +1130,22 @@ munmap_back:
        /*
         * Set 'VM_NORESERVE' if we should not account for the
-         * memory use of this mapping. We only honor MAP_NORESERVE
+         * memory use of this mapping.
-         * if we're allowed to overcommit memory.
         */
-        if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+        if ((flags & MAP_NORESERVE)) {
-                vm_flags |= VM_NORESERVE;
+                /* We honor MAP_NORESERVE if allowed to overcommit */
-        if (!accountable)
+                if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
-                vm_flags |= VM_NORESERVE;
+                        vm_flags |= VM_NORESERVE;
+                /* hugetlb applies strict overcommit unless MAP_NORESERVE */
+                if (file && is_file_hugepages(file))
+                        vm_flags |= VM_NORESERVE;
+        }
        /*
         * Private writable mapping: check memory availability
         */
-        if (accountable_mapping(vm_flags)) {
+        if (accountable_mapping(file, vm_flags)) {
                charged = len >> PAGE_SHIFT;
                if (security_vm_enough_memory(charged))
                        return -ENOMEM;
@@ -2078,12 +2084,8 @@ void exit_mmap(struct mm_struct *mm)
        unsigned long end;
        /* mm's last user has gone, and its about to be pulled down */
-        arch_exit_mmap(mm);
        mmu_notifier_release(mm);
-        if (!mm->mmap)  /* Can happen if dup_mmap() received an OOM */
-                return;
        if (mm->locked_vm) {
                vma = mm->mmap;
                while (vma) {
@@ -2092,7 +2094,13 @@ void exit_mmap(struct mm_struct *mm)
                        vma = vma->vm_next;
                }
        }
+        arch_exit_mmap(mm);
        vma = mm->mmap;
+        if (!vma)       /* Can happen if dup_mmap() received an OOM */
+                return;
        lru_add_drain();
        flush_cache_mm(mm);
        tlb = tlb_gather_mmu(mm, 1);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index abe2694e13f4..258197b76fb4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -151,10 +151,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
        /*
         * If we make a private mapping writable we increase our commit;
         * but (without finer accounting) cannot reduce our commit if we
-         * make it unwritable again.
+         * make it unwritable again. hugetlb mappings were accounted for
+         * even if read-only, so there is no need to account for them here.
         */
        if (newflags & VM_WRITE) {
-                if (!(oldflags & (VM_ACCOUNT|VM_WRITE|
+                if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
                                                VM_SHARED|VM_NORESERVE))) {
                        charged = nrpages;
                        if (security_vm_enough_memory(charged))
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index dc32dae01e5f..74dc57c74349 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -209,7 +209,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
                struct file *filp, void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
-        int old_bytes = vm_dirty_bytes;
+        unsigned long old_bytes = vm_dirty_bytes;
        int ret;
        ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
@@ -240,7 +240,7 @@ void bdi_writeout_inc(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL_GPL(bdi_writeout_inc);
-static inline void task_dirty_inc(struct task_struct *tsk)
+void task_dirty_inc(struct task_struct *tsk)
 {
        prop_inc_single(&vm_dirties, &tsk->dirties);
 }
@@ -1051,20 +1051,23 @@ continue_unlock:
                                }
                        }
-                        if (nr_to_write > 0)
+                        if (nr_to_write > 0) {
                                nr_to_write--;
-                        else if (wbc->sync_mode == WB_SYNC_NONE) {
+                                if (nr_to_write == 0 &&
-                                /*
+                                    wbc->sync_mode == WB_SYNC_NONE) {
-                                 * We stop writing back only if we are not
+                                        /*
-                                 * doing integrity sync. In case of integrity
+                                         * We stop writing back only if we are
-                                 * sync we have to keep going because someone
+                                         * not doing integrity sync. In case of
-                                 * may be concurrently dirtying pages, and we
+                                         * integrity sync we have to keep going
-                                 * might have synced a lot of newly appeared
+                                         * because someone may be concurrently
-                                 * dirty pages, but have not synced all of the
+                                         * dirtying pages, and we might have
-                                 * old dirty pages.
+                                         * synced a lot of newly appeared dirty
-                                 */
+                                         * pages, but have not synced all of the
-                                done = 1;
+                                         * old dirty pages.
-                                break;
+                                         */
+                                        done = 1;
+                                        break;
+                                }
                        }
                        if (wbc->nonblocking && bdi_write_congested(bdi)) {
@@ -1076,7 +1079,7 @@ continue_unlock:
                pagevec_release(&pvec);
                cond_resched();
        }
-        if (!cycled) {
+        if (!cycled && !done) {
                /*
                 * range_cyclic:
                 * We hit the last page and there is more work to be done: wrap
@@ -1227,6 +1230,7 @@ int __set_page_dirty_nobuffers(struct page *page)
                                __inc_zone_page_state(page, NR_FILE_DIRTY);
                                __inc_bdi_stat(mapping->backing_dev_info,
                                                BDI_RECLAIMABLE);
+                                task_dirty_inc(current);
                                task_io_account_write(PAGE_CACHE_SIZE);
                        }
                        radix_tree_tag_set(&mapping->page_tree,
@@ -1259,7 +1263,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage);
 * If the mapping doesn't provide a set_page_dirty a_op, then
 * just fall through and assume that it wants buffer_heads.
 */
-static int __set_page_dirty(struct page *page)
+int set_page_dirty(struct page *page)
 {
        struct address_space *mapping = page_mapping(page);
@@ -1277,14 +1281,6 @@ static int __set_page_dirty(struct page *page)
        }
        return 0;
 }
-int set_page_dirty(struct page *page)
-{
-        int ret = __set_page_dirty(page);
-        if (ret)
-                task_dirty_inc(current);
-        return ret;
-}
 EXPORT_SYMBOL(set_page_dirty);
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5675b3073854..5c44ed49ca93 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2989,7 +2989,7 @@ static int __meminit next_active_region_index_in_nid(int index, int nid)
 * was used and there are no special requirements, this is a convenient
 * alternative
 */
-int __meminit early_pfn_to_nid(unsigned long pfn)
+int __meminit __early_pfn_to_nid(unsigned long pfn)
 {
        int i;
@@ -3000,10 +3000,33 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
                if (start_pfn <= pfn && pfn < end_pfn)
                        return early_node_map[i].nid;
        }
+        /* This is a memory hole */
+        return -1;
+}
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+int __meminit early_pfn_to_nid(unsigned long pfn)
+{
+        int nid;
+        nid = __early_pfn_to_nid(pfn);
+        if (nid >= 0)
+                return nid;
+        /*
+         * __early_pfn_to_nid() returned a negative value (the generic
+         * version returns -1 for a memory hole), so no node is known for
+         * this pfn: fall back to node 0.
+         */
        return 0;
 }
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+#ifdef CONFIG_NODES_SPAN_OTHER_NODES
+bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+{
+        int nid;
+        nid = __early_pfn_to_nid(pfn);
+        if (nid >= 0 && nid != node)
+                return false;   /* pfn is known to belong to another node */
+        return true;            /* pfn maps to @node, or no node was found */
+}
+#endif
 /* Basic iterator support to walk early_node_map[] */
 #define for_each_active_range_index_in_nid(i, nid) \
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 7006a11350c8..ceecfbb143fa 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -114,7 +114,8 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
                nid = page_to_nid(pfn_to_page(pfn));
                table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
                if (slab_is_available()) {
-                        base = kmalloc_node(table_size, GFP_KERNEL, nid);
+                        base = kmalloc_node(table_size,
+                                        GFP_KERNEL | __GFP_NOWARN, nid);
                        if (!base)
                                base = vmalloc_node(table_size, nid);
                } else {
diff --git a/mm/page_io.c b/mm/page_io.c
index dc6ce0afbded..3023c475e041 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -111,7 +111,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
                goto out;
        }
        if (wbc->sync_mode == WB_SYNC_ALL)
-                rw |= (1 << BIO_RW_SYNC);
+                rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
        count_vm_event(PSWPOUT);
        set_page_writeback(page);
        unlock_page(page);
diff --git a/mm/rmap.c b/mm/rmap.c
index ac4af8cffbf9..16521664010d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1072,7 +1072,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration)
        spin_lock(&mapping->i_mmap_lock);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                if (MLOCK_PAGES && unlikely(unlock)) {
-                        if (!(vma->vm_flags & VM_LOCKED))
+                        if (!((vma->vm_flags & VM_LOCKED) &&
+                                                page_mapped_in_vma(page, vma)))
                                continue;       /* must visit all vmas */
                        ret = SWAP_MLOCK;
                } else {
diff --git a/mm/slab.c b/mm/slab.c
index ddc41f337d58..4d00855629c4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4457,3 +4457,4 @@ size_t ksize(const void *objp)
        return obj_size(virt_to_cache(objp));
 }
+EXPORT_SYMBOL(ksize);
diff --git a/mm/slob.c b/mm/slob.c
index bf7e8fc3aed8..52bc8a2bd9ef 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -521,6 +521,7 @@ size_t ksize(const void *block)
        } else
                return sp->page.private;
 }
+EXPORT_SYMBOL(ksize);
 struct kmem_cache {
        unsigned int size, align;
diff --git a/mm/slub.c b/mm/slub.c
index bdc9abb08a23..0280eee6cf37 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2736,6 +2736,7 @@ size_t ksize(const void *object)
         */
        return s->size;
 }
+EXPORT_SYMBOL(ksize);
 void kfree(const void *x)
 {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 7e6304dfafab..312fafe0ab6e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -635,7 +635,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
                if (!bdev) {
                        if (bdev_p)
-                                *bdev_p = sis->bdev;
+                                *bdev_p = bdget(sis->bdev->bd_dev);
                        spin_unlock(&swap_lock);
                        return i;
@@ -647,7 +647,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
                                        struct swap_extent, list);
                        if (se->start_block == offset) {
                                if (bdev_p)
-                                        *bdev_p = sis->bdev;
+                                        *bdev_p = bdget(sis->bdev->bd_dev);
                                spin_unlock(&swap_lock);
                                bdput(bdev);
diff --git a/mm/util.c b/mm/util.c
index cb00b748ce47..37eaccdf3054 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -129,6 +129,26 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
 }
 EXPORT_SYMBOL(krealloc);
+/**
+ * kzfree - like kfree(), but zero the memory first
+ * @p: object whose memory is to be zeroed and freed
+ *
+ * The memory that @p points to is zeroed before it is freed. The number
+ * of bytes cleared is whatever ksize() reports for @p, not a size supplied
+ * by the caller.
+ * If @p fails the ZERO_OR_NULL_PTR() check (e.g. it is %NULL), kzfree()
+ * does nothing.
+ */
+void kzfree(const void *p)
+{
+        size_t ks;
+        void *mem = (void *)p;  /* drop const: the buffer is written below */
+        if (unlikely(ZERO_OR_NULL_PTR(mem)))
+                return;
+        ks = ksize(mem);
+        memset(mem, 0, ks);
+        kfree(mem);
+}
+EXPORT_SYMBOL(kzfree);
 /*
 * strndup_user - duplicate an existing string from user space
 * @s: The string to duplicate
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 366ae9ea6af2..fb6f59935fb2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1196,6 +1196,14 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
 }
 EXPORT_SYMBOL_GPL(__get_vm_area);
+struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
+                                       unsigned long start, unsigned long end,
+                                       void *caller)
+{
+        return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL,
+                                  caller); /* -1 and GFP_KERNEL are fixed */
+}
 /**
 *      get_vm_area  -  reserve a contiguous kernel virtual area
 *      @size:          size of the area
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9a27c44aa327..6177e3bcd66b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2057,31 +2057,31 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
                                      int pass, struct scan_control *sc)
 {
        struct zone *zone;
-        unsigned long nr_to_scan, ret = 0;
+        unsigned long ret = 0;
-        enum lru_list l;
        for_each_zone(zone) {
+                enum lru_list l;
                if (!populated_zone(zone))
                        continue;
                if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
                        continue;
                for_each_evictable_lru(l) {
+                        enum zone_stat_item ls = NR_LRU_BASE + l;
+                        unsigned long lru_pages = zone_page_state(zone, ls);
                        /* For pass = 0, we don't shrink the active list */
-                        if (pass == 0 &&
+                        if (pass == 0 && (l == LRU_ACTIVE_ANON ||
-                                (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE))
+                                                l == LRU_ACTIVE_FILE))
                                continue;
-                        zone->lru[l].nr_scan +=
+                        zone->lru[l].nr_scan += (lru_pages >> prio) + 1;
-                                (zone_page_state(zone, NR_LRU_BASE + l)
-                                                                >> prio) + 1;
                        if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
+                                unsigned long nr_to_scan;
                                zone->lru[l].nr_scan = 0;
-                                nr_to_scan = min(nr_pages,
+                                nr_to_scan = min(nr_pages, lru_pages);
-                                        zone_page_state(zone,
-                                                        NR_LRU_BASE + l));
                                ret += shrink_list(l, nr_to_scan, zone,
                                                                sc, prio);
                                if (ret >= nr_pages)
@@ -2089,7 +2089,6 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
                        }
                }
        }
        return ret;
 }
@@ -2112,7 +2111,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
                .may_swap = 0,
                .swap_cluster_max = nr_pages,
                .may_writepage = 1,
-                .swappiness = vm_swappiness,
                .isolate_pages = isolate_pages_global,
        };
@@ -2146,10 +2144,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
                int prio;
                /* Force reclaiming mapped pages in the passes #3 and #4 */
-                if (pass > 2) {
+                if (pass > 2)
                        sc.may_swap = 1;
-                        sc.swappiness = 100;
-                }
                for (prio = DEF_PRIORITY; prio >= 0; prio--) {
                        unsigned long nr_to_scan = nr_pages - ret;