12 files changed, 194 insertions, 165 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 57971d2ab84..c2b57d81e15 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -225,3 +225,31 @@ config HAVE_MLOCKED_PAGE_BIT
 config MMU_NOTIFIER
        bool
+config NOMMU_INITIAL_TRIM_EXCESS
+        int "Turn on mmap() excess space trimming before booting"
+        depends on !MMU
+        default 1
+        help
+          The NOMMU mmap() frequently needs to allocate large contiguous chunks
+          of memory on which to store mappings, but it can only ask the system
+          allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently
+          more than it requires.  To deal with this, mmap() is able to trim off
+          the excess and return it to the allocator.
+          If trimming is enabled, the excess is trimmed off and returned to the
+          system allocator, which can cause extra fragmentation, particularly
+          if there are a lot of transient processes.
+          If trimming is disabled, the excess is kept, but not used, which for
+          long-term mappings means that the space is wasted.
+          Trimming can be dynamically controlled through a sysctl option
+          (/proc/sys/vm/nr_trim_pages) which specifies the minimum number of
+          excess pages there must be before trimming should occur, or zero if
+          no trimming is to occur.
+          This option specifies the initial value of that sysctl.  The default
+          of 1 says that all excess pages should be trimmed.
+          See Documentation/nommu-mmap.txt for more information.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e44fb0fbb80..01c2d8f1468 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1024,9 +1024,7 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
                return NULL;
        pc = lookup_page_cgroup(page);
-        /*
+        lock_page_cgroup(pc);
-         * Used bit of swapcache is solid under page lock.
-         */
        if (PageCgroupUsed(pc)) {
                mem = pc->mem_cgroup;
                if (mem && !css_tryget(&mem->css))
@@ -1040,6 +1038,7 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
                        mem = NULL;
                rcu_read_unlock();
        }
+        unlock_page_cgroup(pc);
        return mem;
 }
@@ -1618,37 +1617,28 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
 }
 /*
- * A call to try to shrink memory usage under specified resource controller.
+ * A call to try to shrink memory usage on charge failure at shmem's swapin.
- * This is typically used for page reclaiming for shmem for reducing side
+ * Calling hierarchical_reclaim is not enough because we should update
- * effect of page allocation from shmem, which is used by some mem_cgroup.
+ * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
+ * Moreover, considering hierarchy, we should reclaim from the mem_over_limit,
+ * not from the memcg which this page would be charged to.
+ * try_charge_swapin does all of this work properly.
 */
-int mem_cgroup_shrink_usage(struct page *page,
+int mem_cgroup_shmem_charge_fallback(struct page *page,
                            struct mm_struct *mm,
                            gfp_t gfp_mask)
 {
        struct mem_cgroup *mem = NULL;
-        int progress = 0;
+        int ret;
-        int retry = MEM_CGROUP_RECLAIM_RETRIES;
        if (mem_cgroup_disabled())
                return 0;
-        if (page)
-                mem = try_get_mem_cgroup_from_swapcache(page);
-        if (!mem && mm)
-                mem = try_get_mem_cgroup_from_mm(mm);
-        if (unlikely(!mem))
-                return 0;
-        do {
+        ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
-                progress = mem_cgroup_hierarchical_reclaim(mem,
+        if (!ret)
-                                        gfp_mask, true, false);
+                mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
-                progress += mem_cgroup_check_under_limit(mem);
-        } while (!progress && --retry);
-        css_put(&mem->css);
+        return ret;
-        if (!retry)
-                return -ENOMEM;
-        return 0;
 }
 static DEFINE_MUTEX(set_limit_mutex);
diff --git a/mm/memory.c b/mm/memory.c
index cf6873e91c6..4126dd16778 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1971,6 +1971,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                ret = tmp;
                                goto unwritable_page;
                        }
+                        if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+                                lock_page(old_page);
+                                if (!old_page->mapping) {
+                                        ret = 0; /* retry the fault */
+                                        unlock_page(old_page);
+                                        goto unwritable_page;
+                                }
+                        } else
+                                VM_BUG_ON(!PageLocked(old_page));
                        /*
                         * Since we dropped the lock we need to revalidate
@@ -1980,9 +1989,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         */
                        page_table = pte_offset_map_lock(mm, pmd, address,
                                                         &ptl);
-                        page_cache_release(old_page);
+                        if (!pte_same(*page_table, orig_pte)) {
-                        if (!pte_same(*page_table, orig_pte))
+                                unlock_page(old_page);
+                                page_cache_release(old_page);
                                goto unlock;
+                        }
                        page_mkwrite = 1;
                }
@@ -2094,9 +2105,6 @@ gotten:
 unlock:
        pte_unmap_unlock(page_table, ptl);
        if (dirty_page) {
-                if (vma->vm_file)
-                        file_update_time(vma->vm_file);
                /*
                 * Yes, Virginia, this is actually required to prevent a race
                 * with clear_page_dirty_for_io() from clearing the page dirty
@@ -2105,16 +2113,41 @@ unlock:
                 *
                 * do_no_page is protected similarly.
                 */
-                wait_on_page_locked(dirty_page);
+                if (!page_mkwrite) {
-                set_page_dirty_balance(dirty_page, page_mkwrite);
+                        wait_on_page_locked(dirty_page);
+                        set_page_dirty_balance(dirty_page, page_mkwrite);
+                }
                put_page(dirty_page);
+                if (page_mkwrite) {
+                        struct address_space *mapping = dirty_page->mapping;
+                        set_page_dirty(dirty_page);
+                        unlock_page(dirty_page);
+                        page_cache_release(dirty_page);
+                        if (mapping)    {
+                                /*
+                                 * Some device drivers do not set page.mapping
+                                 * but still dirty their pages
+                                 */
+                                balance_dirty_pages_ratelimited(mapping);
+                        }
+                }
+                /* file_update_time outside page_lock */
+                if (vma->vm_file)
+                        file_update_time(vma->vm_file);
        }
        return ret;
 oom_free_new:
        page_cache_release(new_page);
 oom:
-        if (old_page)
+        if (old_page) {
+                if (page_mkwrite) {
+                        unlock_page(old_page);
+                        page_cache_release(old_page);
+                }
                page_cache_release(old_page);
+        }
        return VM_FAULT_OOM;
 unwritable_page:
@@ -2458,8 +2491,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
                ret = VM_FAULT_OOM;
-                unlock_page(page);
+                goto out_page;
-                goto out;
        }
        /*
@@ -2521,6 +2553,7 @@ out:
 out_nomap:
        mem_cgroup_cancel_charge_swapin(ptr);
        pte_unmap_unlock(page_table, ptl);
+out_page:
        unlock_page(page);
        page_cache_release(page);
        return ret;
@@ -2664,27 +2697,22 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                                int tmp;
                                unlock_page(page);
-                                vmf.flags |= FAULT_FLAG_MKWRITE;
+                                vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
                                tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
                                if (unlikely(tmp &
                                          (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
                                        ret = tmp;
-                                        anon = 1; /* no anon but release vmf.page */
+                                        goto unwritable_page;
-                                        goto out_unlocked;
-                                }
-                                lock_page(page);
-                                /*
-                                 * XXX: this is not quite right (racy vs
-                                 * invalidate) to unlock and relock the page
-                                 * like this, however a better fix requires
-                                 * reworking page_mkwrite locking API, which
-                                 * is better done later.
-                                 */
-                                if (!page->mapping) {
-                                        ret = 0;
-                                        anon = 1; /* no anon but release vmf.page */
-                                        goto out;
                                }
+                                if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
+                                        lock_page(page);
+                                        if (!page->mapping) {
+                                                ret = 0; /* retry the fault */
+                                                unlock_page(page);
+                                                goto unwritable_page;
+                                        }
+                                } else
+                                        VM_BUG_ON(!PageLocked(page));
                                page_mkwrite = 1;
                        }
                }
@@ -2736,19 +2764,35 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        pte_unmap_unlock(page_table, ptl);
 out:
-        unlock_page(vmf.page);
+        if (dirty_page) {
-out_unlocked:
+                struct address_space *mapping = page->mapping;
-        if (anon)
-                page_cache_release(vmf.page);
-        else if (dirty_page) {
-                if (vma->vm_file)
-                        file_update_time(vma->vm_file);
-                set_page_dirty_balance(dirty_page, page_mkwrite);
+                if (set_page_dirty(dirty_page))
+                        page_mkwrite = 1;
+                unlock_page(dirty_page);
                put_page(dirty_page);
+                if (page_mkwrite && mapping) {
+                        /*
+                         * Some device drivers do not set page.mapping but still
+                         * dirty their pages
+                         */
+                        balance_dirty_pages_ratelimited(mapping);
+                }
+                /* file_update_time outside page_lock */
+                if (vma->vm_file)
+                        file_update_time(vma->vm_file);
+        } else {
+                unlock_page(vmf.page);
+                if (anon)
+                        page_cache_release(vmf.page);
        }
        return ret;
+unwritable_page:
+        page_cache_release(page);
+        return ret;
 }
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
diff --git a/mm/mmap.c b/mm/mmap.c
index 8a49df4c736..2c1c2cb0e2e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -86,7 +86,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50;       /* default is 50% */
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
+struct percpu_counter vm_committed_as;
 /*
 * Check that a process has enough memory to allocate a new virtual
@@ -180,11 +180,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
        if (mm)
                allowed -= mm->total_vm / 32;
-        /*
+        if (percpu_counter_read_positive(&vm_committed_as) < allowed)
-         * cast `allowed' as a signed long because vm_committed_space
-         * sometimes has a negative value
-         */
-        if (atomic_long_read(&vm_committed_space) < (long)allowed)
                return 0;
 error:
        vm_unacct_memory(pages);
@@ -2491,4 +2487,8 @@ void mm_drop_all_locks(struct mm_struct *mm)
 */
 void __init mmap_init(void)
 {
+        int ret;
+        ret = percpu_counter_init(&vm_committed_as, 0);
+        VM_BUG_ON(ret);
 }
diff --git a/mm/nommu.c b/mm/nommu.c
index 72eda4aee2c..b571ef70742 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -62,11 +62,11 @@ void *high_memory;
 struct page *mem_map;
 unsigned long max_mapnr;
 unsigned long num_physpages;
-atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
+struct percpu_counter vm_committed_as;
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50; /* default is 50% */
 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
-int sysctl_nr_trim_pages = 1; /* page trimming behaviour */
+int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
 int heap_stack_gap = 0;
 atomic_long_t mmap_pages_allocated;
@@ -463,6 +463,10 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 */
 void __init mmap_init(void)
 {
+        int ret;
+        ret = percpu_counter_init(&vm_committed_as, 0);
+        VM_BUG_ON(ret);
        vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
 }
@@ -511,8 +515,6 @@ static void add_nommu_region(struct vm_region *region)
        validate_nommu_regions();
-        BUG_ON(region->vm_start & ~PAGE_MASK);
        parent = NULL;
        p = &nommu_region_tree.rb_node;
        while (*p) {
@@ -1847,12 +1849,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
        if (mm)
                allowed -= mm->total_vm / 32;
-        /*
+        if (percpu_counter_read_positive(&vm_committed_as) < allowed)
-         * cast `allowed' as a signed long because vm_committed_space
-         * sometimes has a negative value
-         */
-        if (atomic_long_read(&vm_committed_space) < (long)allowed)
                return 0;
 error:
        vm_unacct_memory(pages);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2f3166e308d..92bcf1db16b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -514,34 +514,32 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
 */
 static void __out_of_memory(gfp_t gfp_mask, int order)
 {
-        if (sysctl_oom_kill_allocating_task) {
+        struct task_struct *p;
-                oom_kill_process(current, gfp_mask, order, 0, NULL,
+        unsigned long points;
-                                "Out of memory (oom_kill_allocating_task)");
-        } else {
-                unsigned long points;
-                struct task_struct *p;
-retry:
-                /*
-                 * Rambo mode: Shoot down a process and hope it solves whatever
-                 * issues we may have.
-                 */
-                p = select_bad_process(&points, NULL);
-                if (PTR_ERR(p) == -1UL)
+        if (sysctl_oom_kill_allocating_task)
+                if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
+                                "Out of memory (oom_kill_allocating_task)"))
                        return;
+retry:
+        /*
+         * Rambo mode: Shoot down a process and hope it solves whatever
+         * issues we may have.
+         */
+        p = select_bad_process(&points, NULL);
-                /* Found nothing?!?! Either we hang forever, or we panic. */
+        if (PTR_ERR(p) == -1UL)
-                if (!p) {
+                return;
-                        read_unlock(&tasklist_lock);
-                        panic("Out of memory and no killable processes...\n");
-                }
-                if (oom_kill_process(p, gfp_mask, order, points, NULL,
+        /* Found nothing?!?! Either we hang forever, or we panic. */
-                                     "Out of memory"))
+        if (!p) {
-                        goto retry;
+                read_unlock(&tasklist_lock);
+                panic("Out of memory and no killable processes...\n");
        }
+        if (oom_kill_process(p, gfp_mask, order, points, NULL,
+                             "Out of memory"))
+                goto retry;
 }
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e2f26991fff..fe753ecf2aa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2681,6 +2681,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 static int zone_batchsize(struct zone *zone)
 {
+#ifdef CONFIG_MMU
        int batch;
        /*
@@ -2706,9 +2707,26 @@ static int zone_batchsize(struct zone *zone)
         * of pages of one half of the possible page colors
         * and the other with pages of the other colors.
         */
-        batch = (1 << (fls(batch + batch/2)-1)) - 1;
+        batch = rounddown_pow_of_two(batch + batch/2) - 1;
        return batch;
+#else
+        /* The deferral and batching of frees should be suppressed under NOMMU
+         * conditions.
+         *
+         * The problem is that NOMMU needs to be able to allocate large chunks
+         * of contiguous memory as there's no hardware page translation to
+         * assemble apparent contiguous memory from discontiguous pages.
+         *
+         * Queueing large contiguous runs of pages for batching, however,
+         * causes the pages to actually be freed in smaller chunks.  As there
+         * can be a significant delay between the individual batches being
+         * recycled, this leads to the once large chunks of space being
+         * fragmented and becoming unavailable for high-order allocations.
+         */
+        return 0;
+#endif
 }
 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
diff --git a/mm/pdflush.c b/mm/pdflush.c
index f2caf96993f..235ac440c44 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -58,14 +58,6 @@ static DEFINE_SPINLOCK(pdflush_lock);
 int nr_pdflush_threads = 0;
 /*
- * The max/min number of pdflush threads. R/W by sysctl at
- * /proc/sys/vm/nr_pdflush_threads_max/min
- */
-int nr_pdflush_threads_max __read_mostly = MAX_PDFLUSH_THREADS;
-int nr_pdflush_threads_min __read_mostly = MIN_PDFLUSH_THREADS;
-/*
 * The time at which the pdflush thread pool last went empty
 */
 static unsigned long last_empty_jifs;
@@ -76,7 +68,7 @@ static unsigned long last_empty_jifs;
 * Thread pool management algorithm:
 * 
 * - The minimum and maximum number of pdflush instances are bound
- *   by nr_pdflush_threads_min and nr_pdflush_threads_max.
+ *   by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
 * 
 * - If there have been no idle pdflush instances for 1 second, create
 *   a new one.
@@ -142,13 +134,14 @@ static int __pdflush(struct pdflush_work *my_work)
                 * To throttle creation, we reset last_empty_jifs.
                 */
                if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
-                        if (list_empty(&pdflush_list) &&
+                        if (list_empty(&pdflush_list)) {
-                            nr_pdflush_threads < nr_pdflush_threads_max) {
+                                if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
-                                last_empty_jifs = jiffies;
+                                        last_empty_jifs = jiffies;
-                                nr_pdflush_threads++;
+                                        nr_pdflush_threads++;
-                                spin_unlock_irq(&pdflush_lock);
+                                        spin_unlock_irq(&pdflush_lock);
-                                start_one_pdflush_thread();
+                                        start_one_pdflush_thread();
-                                spin_lock_irq(&pdflush_lock);
+                                        spin_lock_irq(&pdflush_lock);
+                                }
                        }
                }
@@ -160,7 +153,7 @@ static int __pdflush(struct pdflush_work *my_work)
                 */
                if (list_empty(&pdflush_list))
                        continue;
-                if (nr_pdflush_threads <= nr_pdflush_threads_min)
+                if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
                        continue;
                pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
                if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
@@ -266,9 +259,9 @@ static int __init pdflush_init(void)
         * Pre-set nr_pdflush_threads...  If we fail to create,
         * the count will be decremented.
         */
-        nr_pdflush_threads = nr_pdflush_threads_min;
+        nr_pdflush_threads = MIN_PDFLUSH_THREADS;
-        for (i = 0; i < nr_pdflush_threads_min; i++)
+        for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
                start_one_pdflush_thread();
        return 0;
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index f9cb20ebb99..b25f95ce3db 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1340,8 +1340,12 @@ repeat:
                        shmem_swp_unmap(entry);
                        spin_unlock(&info->lock);
                        if (error == -ENOMEM) {
-                                /* allow reclaim from this memory cgroup */
+                                /*
-                                error = mem_cgroup_shrink_usage(swappage,
+                                 * reclaim from proper memory cgroup and
+                                 * call memcg's OOM if needed.
+                                 */
+                                error = mem_cgroup_shmem_charge_fallback(
+                                                                swappage,
                                                                current->mm,
                                                                gfp);
                                if (error) {
diff --git a/mm/swap.c b/mm/swap.c
index bede23ce64e..cb29ae5d33a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -491,49 +491,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
 EXPORT_SYMBOL(pagevec_lookup_tag);
-#ifdef CONFIG_SMP
-/*
- * We tolerate a little inaccuracy to avoid ping-ponging the counter between
- * CPUs
- */
-#define ACCT_THRESHOLD  max(16, NR_CPUS * 2)
-static DEFINE_PER_CPU(long, committed_space);
-void vm_acct_memory(long pages)
-{
-        long *local;
-        preempt_disable();
-        local = &__get_cpu_var(committed_space);
-        *local += pages;
-        if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
-                atomic_long_add(*local, &vm_committed_space);
-                *local = 0;
-        }
-        preempt_enable();
-}
-#ifdef CONFIG_HOTPLUG_CPU
-/* Drop the CPU's cached committed space back into the central pool. */
-static int cpu_swap_callback(struct notifier_block *nfb,
-                             unsigned long action,
-                             void *hcpu)
-{
-        long *committed;
-        committed = &per_cpu(committed_space, (long)hcpu);
-        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-                atomic_long_add(*committed, &vm_committed_space);
-                *committed = 0;
-                drain_cpu_pagevecs((long)hcpu);
-        }
-        return NOTIFY_OK;
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-#endif /* CONFIG_SMP */
 /*
 * Perform any setup for the swap system
 */
@@ -554,7 +511,4 @@ void __init swap_setup(void)
         * Right now other parts of the system means that we
         * _really_ don't want to cluster much more
         */
-#ifdef CONFIG_HOTPLUG_CPU
-        hotcpu_notifier(cpu_swap_callback, 0);
-#endif
 }
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fab19876b4d..083716ea38c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -402,6 +402,7 @@ overflow:
                        printk(KERN_WARNING
                                "vmap allocation for size %lu failed: "
                                "use vmalloc=<size> to increase size.\n", size);
+                kfree(va);
                return ERR_PTR(-EBUSY);
        }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eac9577941f..5fa3eda1f03 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1471,7 +1471,7 @@ static void shrink_zone(int priority, struct zone *zone,
        for_each_evictable_lru(l) {
                int file = is_file_lru(l);
-                int scan;
+                unsigned long scan;
                scan = zone_nr_pages(zone, sc, l);
                if (priority) {