Diffstat (limited to 'mm')

 -rw-r--r--  mm/Kconfig       | 28
 -rw-r--r--  mm/madvise.c     |  8
 -rw-r--r--  mm/nommu.c       |  4
 -rw-r--r--  mm/oom_kill.c    | 44
 -rw-r--r--  mm/page_alloc.c  | 20
 -rw-r--r--  mm/pdflush.c     | 31
 -rw-r--r--  mm/vmalloc.c     |  1
 7 files changed, 82 insertions, 54 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 57971d2ab848..c2b57d81e153 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -225,3 +225,31 @@ config HAVE_MLOCKED_PAGE_BIT
 
 config MMU_NOTIFIER
 	bool
+
+config NOMMU_INITIAL_TRIM_EXCESS
+	int "Turn on mmap() excess space trimming before booting"
+	depends on !MMU
+	default 1
+	help
+	  The NOMMU mmap() frequently needs to allocate large contiguous chunks
+	  of memory on which to store mappings, but it can only ask the system
+	  allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently
+	  more than it requires. To deal with this, mmap() is able to trim off
+	  the excess and return it to the allocator.
+
+	  If trimming is enabled, the excess is trimmed off and returned to the
+	  system allocator, which can cause extra fragmentation, particularly
+	  if there are a lot of transient processes.
+
+	  If trimming is disabled, the excess is kept, but not used, which for
+	  long-term mappings means that the space is wasted.
+
+	  Trimming can be dynamically controlled through a sysctl option
+	  (/proc/sys/vm/nr_trim_pages) which specifies the minimum number of
+	  excess pages there must be before trimming should occur, or zero if
+	  no trimming is to occur.
+
+	  This option specifies the initial value of that sysctl. The default
+	  of 1 says that all excess pages should be trimmed.
+
+	  See Documentation/nommu-mmap.txt for more information.
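
The help text above describes the trim policy in prose. As a purely illustrative aid (not kernel code; the helper names and the 4096-byte page size are assumptions), the short userspace C sketch below shows when a power-of-two allocation's excess pages would be returned for a given nr_trim_pages value:

/*
 * Illustration only -- not kernel code.  An mmap() of `len` bytes is
 * satisfied from a 2^N * PAGE_SIZE allocation; pages beyond the request
 * are "excess" and are returned to the allocator only when there are at
 * least nr_trim_pages of them (0 disables trimming entirely).
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumed page size */

static unsigned long pages_for(unsigned long len)
{
	return (len + PAGE_SIZE - 1) / PAGE_SIZE;
}

/* Round a page count up to a power of two, as a 2^N allocator would. */
static unsigned long pow2_pages(unsigned long pages)
{
	unsigned long n = 1;

	while (n < pages)
		n <<= 1;
	return n;
}

int main(void)
{
	unsigned long nr_trim_pages = 1;	/* the Kconfig default */
	unsigned long len = 5 * PAGE_SIZE;	/* a five-page mapping */
	unsigned long want = pages_for(len);
	unsigned long got = pow2_pages(want);	/* allocator hands back 8 pages */
	unsigned long excess = got - want;

	if (nr_trim_pages && excess >= nr_trim_pages)
		printf("trim %lu excess page(s), keep %lu\n", excess, want);
	else
		printf("keep the whole %lu-page block\n", got);
	return 0;
}

With the default of 1, any excess at all is trimmed; a value of 0 keeps the whole block.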
diff --git a/mm/madvise.c b/mm/madvise.c
index 36d6ea2b6340..b9ce574827c8 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -112,14 +112,6 @@ static long madvise_willneed(struct vm_area_struct * vma,
 	if (!file)
 		return -EBADF;
 
-	/*
-	 * Page cache readahead assumes page cache pages are order-0 which
-	 * is not the case for hugetlbfs. Do not give a bad return value
-	 * but ignore the advice.
-	 */
-	if (vma->vm_flags & VM_HUGETLB)
-		return 0;
-
 	if (file->f_mapping->a_ops->get_xip_mem) {
 		/* no bad return value, but ignore advice */
 		return 0;
diff --git a/mm/nommu.c b/mm/nommu.c
index 809998aa7b50..b571ef707428 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -66,7 +66,7 @@ struct percpu_counter vm_committed_as;
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50; /* default is 50% */
 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
-int sysctl_nr_trim_pages = 1; /* page trimming behaviour */
+int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
 int heap_stack_gap = 0;
 
 atomic_long_t mmap_pages_allocated;
@@ -515,8 +515,6 @@ static void add_nommu_region(struct vm_region *region)
 
 	validate_nommu_regions();
 
-	BUG_ON(region->vm_start & ~PAGE_MASK);
-
 	parent = NULL;
 	p = &nommu_region_tree.rb_node;
 	while (*p) {
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2f3166e308d9..92bcf1db16b2 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -514,34 +514,32 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
  */
 static void __out_of_memory(gfp_t gfp_mask, int order)
 {
-	if (sysctl_oom_kill_allocating_task) {
-		oom_kill_process(current, gfp_mask, order, 0, NULL,
-				"Out of memory (oom_kill_allocating_task)");
-
-	} else {
-		unsigned long points;
-		struct task_struct *p;
-
-retry:
-		/*
-		 * Rambo mode: Shoot down a process and hope it solves whatever
-		 * issues we may have.
-		 */
-		p = select_bad_process(&points, NULL);
+	struct task_struct *p;
+	unsigned long points;
 
-		if (PTR_ERR(p) == -1UL)
+	if (sysctl_oom_kill_allocating_task)
+		if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
+				"Out of memory (oom_kill_allocating_task)"))
 			return;
+retry:
+	/*
+	 * Rambo mode: Shoot down a process and hope it solves whatever
+	 * issues we may have.
+	 */
+	p = select_bad_process(&points, NULL);
 
-		/* Found nothing?!?! Either we hang forever, or we panic. */
-		if (!p) {
-			read_unlock(&tasklist_lock);
-			panic("Out of memory and no killable processes...\n");
-		}
+	if (PTR_ERR(p) == -1UL)
+		return;
 
-		if (oom_kill_process(p, gfp_mask, order, points, NULL,
-				"Out of memory"))
-			goto retry;
+	/* Found nothing?!?! Either we hang forever, or we panic. */
+	if (!p) {
+		read_unlock(&tasklist_lock);
+		panic("Out of memory and no killable processes...\n");
 	}
+
+	if (oom_kill_process(p, gfp_mask, order, points, NULL,
+			"Out of memory"))
+		goto retry;
 }
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e2f26991fff1..fe753ecf2aa5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2681,6 +2681,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 
 static int zone_batchsize(struct zone *zone)
 {
+#ifdef CONFIG_MMU
 	int batch;
 
 	/*
@@ -2706,9 +2707,26 @@ static int zone_batchsize(struct zone *zone)
 	 * of pages of one half of the possible page colors
 	 * and the other with pages of the other colors.
 	 */
-	batch = (1 << (fls(batch + batch/2)-1)) - 1;
+	batch = rounddown_pow_of_two(batch + batch/2) - 1;
 
 	return batch;
+
+#else
+	/* The deferral and batching of frees should be suppressed under NOMMU
+	 * conditions.
+	 *
+	 * The problem is that NOMMU needs to be able to allocate large chunks
+	 * of contiguous memory as there's no hardware page translation to
+	 * assemble apparent contiguous memory from discontiguous pages.
+	 *
+	 * Queueing large contiguous runs of pages for batching, however,
+	 * causes the pages to actually be freed in smaller chunks.  As there
+	 * can be a significant delay between the individual batches being
+	 * recycled, this leads to the once large chunks of space being
+	 * fragmented and becoming unavailable for high-order allocations.
+	 */
+	return 0;
+#endif
 }
 
 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
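
For reference, the zone_batchsize() change from the open-coded fls() expression to rounddown_pow_of_two() is cosmetic. The userspace sketch below (fls_demo() and the sample batch sizes are assumptions, not kernel code) evaluates both forms on the same inputs to show they agree:

/*
 * Userspace check -- not kernel code -- that the removed open-coded
 * expression and rounddown_pow_of_two() land on the same value: a
 * (power of two) - 1 no larger than batch + batch/2.
 */
#include <stdio.h>

/* Index of the highest set bit, like the kernel's fls(): fls_demo(12) == 4. */
static unsigned int fls_demo(unsigned int x)
{
	unsigned int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

static unsigned int rounddown_pow_of_two_demo(unsigned int x)
{
	return 1U << (fls_demo(x) - 1);
}

int main(void)
{
	unsigned int batch;

	for (batch = 1; batch <= 64; batch *= 2) {
		unsigned int x = batch + batch / 2;
		unsigned int old_form = (1U << (fls_demo(x) - 1)) - 1;
		unsigned int new_form = rounddown_pow_of_two_demo(x) - 1;

		printf("batch=%2u  old=%2u  new=%2u\n", batch, old_form, new_form);
	}
	return 0;
}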
diff --git a/mm/pdflush.c b/mm/pdflush.c
index f2caf96993f8..235ac440c44e 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -58,14 +58,6 @@ static DEFINE_SPINLOCK(pdflush_lock);
 int nr_pdflush_threads = 0;
 
 /*
- * The max/min number of pdflush threads. R/W by sysctl at
- * /proc/sys/vm/nr_pdflush_threads_max/min
- */
-int nr_pdflush_threads_max __read_mostly = MAX_PDFLUSH_THREADS;
-int nr_pdflush_threads_min __read_mostly = MIN_PDFLUSH_THREADS;
-
-
-/*
  * The time at which the pdflush thread pool last went empty
  */
 static unsigned long last_empty_jifs;
@@ -76,7 +68,7 @@ static unsigned long last_empty_jifs;
  * Thread pool management algorithm:
  *
  * - The minimum and maximum number of pdflush instances are bound
- *   by nr_pdflush_threads_min and nr_pdflush_threads_max.
+ *   by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
 *
 * - If there have been no idle pdflush instances for 1 second, create
 *   a new one.
@@ -142,13 +134,14 @@ static int __pdflush(struct pdflush_work *my_work)
 		 * To throttle creation, we reset last_empty_jifs.
 		 */
 		if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
-			if (list_empty(&pdflush_list) &&
-			    nr_pdflush_threads < nr_pdflush_threads_max) {
-				last_empty_jifs = jiffies;
-				nr_pdflush_threads++;
-				spin_unlock_irq(&pdflush_lock);
-				start_one_pdflush_thread();
-				spin_lock_irq(&pdflush_lock);
+			if (list_empty(&pdflush_list)) {
+				if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
+					last_empty_jifs = jiffies;
+					nr_pdflush_threads++;
+					spin_unlock_irq(&pdflush_lock);
+					start_one_pdflush_thread();
+					spin_lock_irq(&pdflush_lock);
+				}
 			}
 		}
 
@@ -160,7 +153,7 @@ static int __pdflush(struct pdflush_work *my_work)
 		 */
 		if (list_empty(&pdflush_list))
 			continue;
-		if (nr_pdflush_threads <= nr_pdflush_threads_min)
+		if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
 			continue;
 		pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
 		if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
@@ -266,9 +259,9 @@ static int __init pdflush_init(void)
 	 * Pre-set nr_pdflush_threads... If we fail to create,
 	 * the count will be decremented.
 	 */
-	nr_pdflush_threads = nr_pdflush_threads_min;
+	nr_pdflush_threads = MIN_PDFLUSH_THREADS;
 
-	for (i = 0; i < nr_pdflush_threads_min; i++)
+	for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
 		start_one_pdflush_thread();
 	return 0;
 }
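
The "Thread pool management algorithm" comment patched above bounds the pool by MIN_PDFLUSH_THREADS/MAX_PDFLUSH_THREADS and grows or shrinks it on one-second idle/empty heuristics. The toy C model below (all names, thresholds, and sample inputs are invented for illustration; it is not the kernel's logic verbatim) condenses those rules into a single decision function:

/*
 * Toy model -- not kernel code -- of the pool-sizing rules restated in the
 * comment above: the pool stays within [MIN, MAX], grows when no thread has
 * been idle for a second, and shrinks when a thread has sat idle for over a
 * second.  The thresholds and sample inputs are invented for illustration.
 */
#include <stdio.h>

#define MIN_PDFLUSH_THREADS	2
#define MAX_PDFLUSH_THREADS	8

static int resize_pool(int nr_threads, int idle_threads,
		       int secs_pool_empty, int secs_oldest_idle)
{
	if (idle_threads == 0 && secs_pool_empty >= 1 &&
	    nr_threads < MAX_PDFLUSH_THREADS)
		return nr_threads + 1;	/* everyone busy for a while: add one */

	if (idle_threads > 0 && secs_oldest_idle > 1 &&
	    nr_threads > MIN_PDFLUSH_THREADS)
		return nr_threads - 1;	/* a long-idle surplus thread exits */

	return nr_threads;
}

int main(void)
{
	printf("%d\n", resize_pool(2, 0, 1, 0));	/* grows to 3 */
	printf("%d\n", resize_pool(3, 2, 0, 2));	/* shrinks to 2 */
	printf("%d\n", resize_pool(2, 2, 0, 5));	/* pinned at the minimum */
	return 0;
}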
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fab19876b4d1..083716ea38c9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -402,6 +402,7 @@ overflow:
 	printk(KERN_WARNING
 		"vmap allocation for size %lu failed: "
 		"use vmalloc=<size> to increase size.\n", size);
+	kfree(va);
 	return ERR_PTR(-EBUSY);
 }
 