author      Linus Torvalds <torvalds@linux-foundation.org>   2012-01-10 19:42:48 -0500
committer   Linus Torvalds <torvalds@linux-foundation.org>   2012-01-10 19:42:48 -0500
commit      40ba587923ae67090d9f141c1d3c951be5c1420e (patch)
tree        342a72fc0ee13a0d2496ef970b64dfeadf1355d2 /mm
parent      54c2c5761febcca46c8037d3a81612991e6c209a (diff)
parent      6b550f9495947fc279d12c38feaf98500e8d0646 (diff)
Merge branch 'akpm' (aka "Andrew's patch-bomb")
Andrew elucidates:
- First installment of MM. We have a HUGE number of MM patches this
time. It's crazy.
- MAINTAINERS updates
- backlight updates
- leds
- checkpatch updates
- misc ELF stuff
- rtc updates
- reiserfs
- procfs
- some misc other bits
* akpm: (124 commits)
user namespace: make signal.c respect user namespaces
workqueue: make alloc_workqueue() take printf fmt and args for name
procfs: add hidepid= and gid= mount options
procfs: parse mount options
procfs: introduce the /proc/<pid>/map_files/ directory
procfs: make proc_get_link to use dentry instead of inode
signal: add block_sigmask() for adding sigmask to current->blocked
sparc: make SA_NOMASK a synonym of SA_NODEFER
reiserfs: don't lock root inode searching
reiserfs: don't lock journal_init()
reiserfs: delay reiserfs lock until journal initialization
reiserfs: delete comments referring to the BKL
drivers/rtc/interface.c: fix alarm rollover when day or month is out-of-range
drivers/rtc/rtc-twl.c: add DT support for RTC inside twl4030/twl6030
drivers/rtc/: remove redundant spi driver bus initialization
drivers/rtc/rtc-jz4740.c: make jz4740_rtc_driver static
drivers/rtc/rtc-mc13xxx.c: make mc13xxx_rtc_idtable static
rtc: convert drivers/rtc/* to use module_platform_driver()
drivers/rtc/rtc-wm831x.c: convert to devm_kzalloc()
drivers/rtc/rtc-wm831x.c: remove unused period IRQ handler
...
Diffstat (limited to 'mm')
-rw-r--r--   mm/Kconfig.debug        5
-rw-r--r--   mm/bootmem.c           24
-rw-r--r--   mm/compaction.c         4
-rw-r--r--   mm/fadvise.c            3
-rw-r--r--   mm/filemap.c            5
-rw-r--r--   mm/hugetlb.c           19
-rw-r--r--   mm/mempolicy.c         14
-rw-r--r--   mm/mempool.c          104
-rw-r--r--   mm/migrate.c           14
-rw-r--r--   mm/mmap.c              60
-rw-r--r--   mm/mremap.c             9
-rw-r--r--   mm/oom_kill.c           6
-rw-r--r--   mm/page-writeback.c   290
-rw-r--r--   mm/page_alloc.c       253
-rw-r--r--   mm/rmap.c              45
-rw-r--r--   mm/slub.c               3
-rw-r--r--   mm/swap.c              14
-rw-r--r--   mm/swapfile.c           6
-rw-r--r--   mm/vmalloc.c            8
-rw-r--r--   mm/vmscan.c            42
20 files changed, 615 insertions, 313 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 8b1a477162dc..4b2443254de2 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -4,6 +4,7 @@ config DEBUG_PAGEALLOC | |||
4 | depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC | 4 | depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC |
5 | depends on !KMEMCHECK | 5 | depends on !KMEMCHECK |
6 | select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC | 6 | select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC |
7 | select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC | ||
7 | ---help--- | 8 | ---help--- |
8 | Unmap pages from the kernel linear mapping after free_pages(). | 9 | Unmap pages from the kernel linear mapping after free_pages(). |
9 | This results in a large slowdown, but helps to find certain types | 10 | This results in a large slowdown, but helps to find certain types |
@@ -22,3 +23,7 @@ config WANT_PAGE_DEBUG_FLAGS | |||
22 | config PAGE_POISONING | 23 | config PAGE_POISONING |
23 | bool | 24 | bool |
24 | select WANT_PAGE_DEBUG_FLAGS | 25 | select WANT_PAGE_DEBUG_FLAGS |
26 | |||
27 | config PAGE_GUARD | ||
28 | bool | ||
29 | select WANT_PAGE_DEBUG_FLAGS | ||
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 1a77012ecdb3..668e94df8cf2 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -56,7 +56,7 @@ early_param("bootmem_debug", bootmem_debug_setup); | |||
56 | 56 | ||
57 | static unsigned long __init bootmap_bytes(unsigned long pages) | 57 | static unsigned long __init bootmap_bytes(unsigned long pages) |
58 | { | 58 | { |
59 | unsigned long bytes = (pages + 7) / 8; | 59 | unsigned long bytes = DIV_ROUND_UP(pages, 8); |
60 | 60 | ||
61 | return ALIGN(bytes, sizeof(long)); | 61 | return ALIGN(bytes, sizeof(long)); |
62 | } | 62 | } |
@@ -171,7 +171,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) | |||
171 | 171 | ||
172 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | 172 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) |
173 | { | 173 | { |
174 | int aligned; | ||
175 | struct page *page; | 174 | struct page *page; |
176 | unsigned long start, end, pages, count = 0; | 175 | unsigned long start, end, pages, count = 0; |
177 | 176 | ||
@@ -181,14 +180,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
181 | start = bdata->node_min_pfn; | 180 | start = bdata->node_min_pfn; |
182 | end = bdata->node_low_pfn; | 181 | end = bdata->node_low_pfn; |
183 | 182 | ||
184 | /* | 183 | bdebug("nid=%td start=%lx end=%lx\n", |
185 | * If the start is aligned to the machines wordsize, we might | 184 | bdata - bootmem_node_data, start, end); |
186 | * be able to free pages in bulks of that order. | ||
187 | */ | ||
188 | aligned = !(start & (BITS_PER_LONG - 1)); | ||
189 | |||
190 | bdebug("nid=%td start=%lx end=%lx aligned=%d\n", | ||
191 | bdata - bootmem_node_data, start, end, aligned); | ||
192 | 185 | ||
193 | while (start < end) { | 186 | while (start < end) { |
194 | unsigned long *map, idx, vec; | 187 | unsigned long *map, idx, vec; |
@@ -196,12 +189,17 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
196 | map = bdata->node_bootmem_map; | 189 | map = bdata->node_bootmem_map; |
197 | idx = start - bdata->node_min_pfn; | 190 | idx = start - bdata->node_min_pfn; |
198 | vec = ~map[idx / BITS_PER_LONG]; | 191 | vec = ~map[idx / BITS_PER_LONG]; |
199 | 192 | /* | |
200 | if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) { | 193 | * If we have a properly aligned and fully unreserved |
194 | * BITS_PER_LONG block of pages in front of us, free | ||
195 | * it in one go. | ||
196 | */ | ||
197 | if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) { | ||
201 | int order = ilog2(BITS_PER_LONG); | 198 | int order = ilog2(BITS_PER_LONG); |
202 | 199 | ||
203 | __free_pages_bootmem(pfn_to_page(start), order); | 200 | __free_pages_bootmem(pfn_to_page(start), order); |
204 | count += BITS_PER_LONG; | 201 | count += BITS_PER_LONG; |
202 | start += BITS_PER_LONG; | ||
205 | } else { | 203 | } else { |
206 | unsigned long off = 0; | 204 | unsigned long off = 0; |
207 | 205 | ||
@@ -214,8 +212,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
214 | vec >>= 1; | 212 | vec >>= 1; |
215 | off++; | 213 | off++; |
216 | } | 214 | } |
215 | start = ALIGN(start + 1, BITS_PER_LONG); | ||
217 | } | 216 | } |
218 | start += BITS_PER_LONG; | ||
219 | } | 217 | } |
220 | 218 | ||
221 | page = virt_to_page(bdata->node_bootmem_map); | 219 | page = virt_to_page(bdata->node_bootmem_map); |
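For readers skimming the bootmem hunk above, here is a minimal user-space sketch of the reworked bitmap walk, assuming start/end are indices into the node's used-page bitmap; WORD_BITS and count_freeable() are illustrative stand-ins for BITS_PER_LONG and the real __free_pages_bootmem() calls, not kernel code:

#include <limits.h>

#define WORD_BITS (sizeof(unsigned long) * CHAR_BIT)

static unsigned long count_freeable(const unsigned long *used_map,
                                    unsigned long start, unsigned long end)
{
        unsigned long count = 0;

        while (start < end) {
                /* bootmem marks USED pages, so the inverted word is "free" */
                unsigned long vec = ~used_map[start / WORD_BITS];

                if (start % WORD_BITS == 0 && vec == ~0UL) {
                        /* aligned and fully free: take the whole word in one
                         * step (the kernel frees a single page of order
                         * ilog2(BITS_PER_LONG) here) */
                        count += WORD_BITS;
                        start += WORD_BITS;
                } else {
                        unsigned long off = 0;

                        while (vec && off < WORD_BITS) {
                                if (vec & 1)
                                        count++;        /* one 0-order free */
                                vec >>= 1;
                                off++;
                        }
                        /* equivalent of ALIGN(start + 1, BITS_PER_LONG) */
                        start = (start / WORD_BITS + 1) * WORD_BITS;
                }
        }
        return count;
}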
diff --git a/mm/compaction.c b/mm/compaction.c
index 1253d7ac332b..e6670c34eb49 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -365,8 +365,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
365 | nr_isolated++; | 365 | nr_isolated++; |
366 | 366 | ||
367 | /* Avoid isolating too much */ | 367 | /* Avoid isolating too much */ |
368 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) | 368 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { |
369 | ++low_pfn; | ||
369 | break; | 370 | break; |
371 | } | ||
370 | } | 372 | } |
371 | 373 | ||
372 | acct_isolated(zone, cc); | 374 | acct_isolated(zone, cc); |
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 8d723c9e8b75..469491e0af79 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -117,7 +117,8 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
117 | break; | 117 | break; |
118 | case POSIX_FADV_DONTNEED: | 118 | case POSIX_FADV_DONTNEED: |
119 | if (!bdi_write_congested(mapping->backing_dev_info)) | 119 | if (!bdi_write_congested(mapping->backing_dev_info)) |
120 | filemap_flush(mapping); | 120 | __filemap_fdatawrite_range(mapping, offset, endbyte, |
121 | WB_SYNC_NONE); | ||
121 | 122 | ||
122 | /* First and last FULL page! */ | 123 | /* First and last FULL page! */ |
123 | start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; | 124 | start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; |
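The fadvise hunk above only narrows the kernel-side writeback to the advised range. For context, a hedged user-space sketch of the call this affects; drop_cached_range() and the preceding fsync() are an illustrative convention, not part of the patch:

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <sys/types.h>
#include <unistd.h>

/* Flush a byte range to disk, then ask the kernel to drop it from the
 * page cache. posix_fadvise() returns 0 or a positive errno value. */
static int drop_cached_range(int fd, off_t offset, off_t len)
{
        if (fsync(fd) != 0)     /* write dirty pages first */
                return -1;
        return posix_fadvise(fd, offset, len, POSIX_FADV_DONTNEED);
}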
diff --git a/mm/filemap.c b/mm/filemap.c
index a0701e6eec10..c4ee2e918bea 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2351,8 +2351,11 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, | |||
2351 | pgoff_t index, unsigned flags) | 2351 | pgoff_t index, unsigned flags) |
2352 | { | 2352 | { |
2353 | int status; | 2353 | int status; |
2354 | gfp_t gfp_mask; | ||
2354 | struct page *page; | 2355 | struct page *page; |
2355 | gfp_t gfp_notmask = 0; | 2356 | gfp_t gfp_notmask = 0; |
2357 | |||
2358 | gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE; | ||
2356 | if (flags & AOP_FLAG_NOFS) | 2359 | if (flags & AOP_FLAG_NOFS) |
2357 | gfp_notmask = __GFP_FS; | 2360 | gfp_notmask = __GFP_FS; |
2358 | repeat: | 2361 | repeat: |
@@ -2360,7 +2363,7 @@ repeat: | |||
2360 | if (page) | 2363 | if (page) |
2361 | goto found; | 2364 | goto found; |
2362 | 2365 | ||
2363 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); | 2366 | page = __page_cache_alloc(gfp_mask & ~gfp_notmask); |
2364 | if (!page) | 2367 | if (!page) |
2365 | return NULL; | 2368 | return NULL; |
2366 | status = add_to_page_cache_lru(page, mapping, index, | 2369 | status = add_to_page_cache_lru(page, mapping, index, |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7acd12503f73..ea8c3a4cd2ae 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -800,7 +800,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | |||
800 | 800 | ||
801 | if (page && arch_prepare_hugepage(page)) { | 801 | if (page && arch_prepare_hugepage(page)) { |
802 | __free_pages(page, huge_page_order(h)); | 802 | __free_pages(page, huge_page_order(h)); |
803 | return NULL; | 803 | page = NULL; |
804 | } | 804 | } |
805 | 805 | ||
806 | spin_lock(&hugetlb_lock); | 806 | spin_lock(&hugetlb_lock); |
@@ -2315,8 +2315,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2315 | * from page cache lookup which is in HPAGE_SIZE units. | 2315 | * from page cache lookup which is in HPAGE_SIZE units. |
2316 | */ | 2316 | */ |
2317 | address = address & huge_page_mask(h); | 2317 | address = address & huge_page_mask(h); |
2318 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) | 2318 | pgoff = vma_hugecache_offset(h, vma, address); |
2319 | + (vma->vm_pgoff >> PAGE_SHIFT); | ||
2320 | mapping = (struct address_space *)page_private(page); | 2319 | mapping = (struct address_space *)page_private(page); |
2321 | 2320 | ||
2322 | /* | 2321 | /* |
@@ -2349,6 +2348,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2349 | 2348 | ||
2350 | /* | 2349 | /* |
2351 | * Hugetlb_cow() should be called with page lock of the original hugepage held. | 2350 | * Hugetlb_cow() should be called with page lock of the original hugepage held. |
2351 | * Called with hugetlb_instantiation_mutex held and pte_page locked so we | ||
2352 | * cannot race with other handlers or page migration. | ||
2353 | * Keep the pte_same checks anyway to make transition from the mutex easier. | ||
2352 | */ | 2354 | */ |
2353 | static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | 2355 | static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, |
2354 | unsigned long address, pte_t *ptep, pte_t pte, | 2356 | unsigned long address, pte_t *ptep, pte_t pte, |
@@ -2408,7 +2410,14 @@ retry_avoidcopy: | |||
2408 | BUG_ON(page_count(old_page) != 1); | 2410 | BUG_ON(page_count(old_page) != 1); |
2409 | BUG_ON(huge_pte_none(pte)); | 2411 | BUG_ON(huge_pte_none(pte)); |
2410 | spin_lock(&mm->page_table_lock); | 2412 | spin_lock(&mm->page_table_lock); |
2411 | goto retry_avoidcopy; | 2413 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
2414 | if (likely(pte_same(huge_ptep_get(ptep), pte))) | ||
2415 | goto retry_avoidcopy; | ||
2416 | /* | ||
2417 | * race occurs while re-acquiring page_table_lock, and | ||
2418 | * our job is done. | ||
2419 | */ | ||
2420 | return 0; | ||
2412 | } | 2421 | } |
2413 | WARN_ON_ONCE(1); | 2422 | WARN_ON_ONCE(1); |
2414 | } | 2423 | } |
@@ -2630,6 +2639,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2630 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); | 2639 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); |
2631 | struct hstate *h = hstate_vma(vma); | 2640 | struct hstate *h = hstate_vma(vma); |
2632 | 2641 | ||
2642 | address &= huge_page_mask(h); | ||
2643 | |||
2633 | ptep = huge_pte_offset(mm, address); | 2644 | ptep = huge_pte_offset(mm, address); |
2634 | if (ptep) { | 2645 | if (ptep) { |
2635 | entry = huge_ptep_get(ptep); | 2646 | entry = huge_ptep_get(ptep); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c3fdbcb17658..e3d58f088466 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1983,28 +1983,28 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, | |||
1983 | } | 1983 | } |
1984 | 1984 | ||
1985 | /* Slow path of a mempolicy comparison */ | 1985 | /* Slow path of a mempolicy comparison */ |
1986 | int __mpol_equal(struct mempolicy *a, struct mempolicy *b) | 1986 | bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) |
1987 | { | 1987 | { |
1988 | if (!a || !b) | 1988 | if (!a || !b) |
1989 | return 0; | 1989 | return false; |
1990 | if (a->mode != b->mode) | 1990 | if (a->mode != b->mode) |
1991 | return 0; | 1991 | return false; |
1992 | if (a->flags != b->flags) | 1992 | if (a->flags != b->flags) |
1993 | return 0; | 1993 | return false; |
1994 | if (mpol_store_user_nodemask(a)) | 1994 | if (mpol_store_user_nodemask(a)) |
1995 | if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) | 1995 | if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) |
1996 | return 0; | 1996 | return false; |
1997 | 1997 | ||
1998 | switch (a->mode) { | 1998 | switch (a->mode) { |
1999 | case MPOL_BIND: | 1999 | case MPOL_BIND: |
2000 | /* Fall through */ | 2000 | /* Fall through */ |
2001 | case MPOL_INTERLEAVE: | 2001 | case MPOL_INTERLEAVE: |
2002 | return nodes_equal(a->v.nodes, b->v.nodes); | 2002 | return !!nodes_equal(a->v.nodes, b->v.nodes); |
2003 | case MPOL_PREFERRED: | 2003 | case MPOL_PREFERRED: |
2004 | return a->v.preferred_node == b->v.preferred_node; | 2004 | return a->v.preferred_node == b->v.preferred_node; |
2005 | default: | 2005 | default: |
2006 | BUG(); | 2006 | BUG(); |
2007 | return 0; | 2007 | return false; |
2008 | } | 2008 | } |
2009 | } | 2009 | } |
2010 | 2010 | ||
diff --git a/mm/mempool.c b/mm/mempool.c
index e73641b79bb5..d9049811f352 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -27,7 +27,15 @@ static void *remove_element(mempool_t *pool) | |||
27 | return pool->elements[--pool->curr_nr]; | 27 | return pool->elements[--pool->curr_nr]; |
28 | } | 28 | } |
29 | 29 | ||
30 | static void free_pool(mempool_t *pool) | 30 | /** |
31 | * mempool_destroy - deallocate a memory pool | ||
32 | * @pool: pointer to the memory pool which was allocated via | ||
33 | * mempool_create(). | ||
34 | * | ||
35 | * Free all reserved elements in @pool and @pool itself. This function | ||
36 | * only sleeps if the free_fn() function sleeps. | ||
37 | */ | ||
38 | void mempool_destroy(mempool_t *pool) | ||
31 | { | 39 | { |
32 | while (pool->curr_nr) { | 40 | while (pool->curr_nr) { |
33 | void *element = remove_element(pool); | 41 | void *element = remove_element(pool); |
@@ -36,6 +44,7 @@ static void free_pool(mempool_t *pool) | |||
36 | kfree(pool->elements); | 44 | kfree(pool->elements); |
37 | kfree(pool); | 45 | kfree(pool); |
38 | } | 46 | } |
47 | EXPORT_SYMBOL(mempool_destroy); | ||
39 | 48 | ||
40 | /** | 49 | /** |
41 | * mempool_create - create a memory pool | 50 | * mempool_create - create a memory pool |
@@ -86,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, | |||
86 | 95 | ||
87 | element = pool->alloc(GFP_KERNEL, pool->pool_data); | 96 | element = pool->alloc(GFP_KERNEL, pool->pool_data); |
88 | if (unlikely(!element)) { | 97 | if (unlikely(!element)) { |
89 | free_pool(pool); | 98 | mempool_destroy(pool); |
90 | return NULL; | 99 | return NULL; |
91 | } | 100 | } |
92 | add_element(pool, element); | 101 | add_element(pool, element); |
@@ -172,23 +181,6 @@ out: | |||
172 | EXPORT_SYMBOL(mempool_resize); | 181 | EXPORT_SYMBOL(mempool_resize); |
173 | 182 | ||
174 | /** | 183 | /** |
175 | * mempool_destroy - deallocate a memory pool | ||
176 | * @pool: pointer to the memory pool which was allocated via | ||
177 | * mempool_create(). | ||
178 | * | ||
179 | * this function only sleeps if the free_fn() function sleeps. The caller | ||
180 | * has to guarantee that all elements have been returned to the pool (ie: | ||
181 | * freed) prior to calling mempool_destroy(). | ||
182 | */ | ||
183 | void mempool_destroy(mempool_t *pool) | ||
184 | { | ||
185 | /* Check for outstanding elements */ | ||
186 | BUG_ON(pool->curr_nr != pool->min_nr); | ||
187 | free_pool(pool); | ||
188 | } | ||
189 | EXPORT_SYMBOL(mempool_destroy); | ||
190 | |||
191 | /** | ||
192 | * mempool_alloc - allocate an element from a specific memory pool | 184 | * mempool_alloc - allocate an element from a specific memory pool |
193 | * @pool: pointer to the memory pool which was allocated via | 185 | * @pool: pointer to the memory pool which was allocated via |
194 | * mempool_create(). | 186 | * mempool_create(). |
@@ -224,28 +216,40 @@ repeat_alloc: | |||
224 | if (likely(pool->curr_nr)) { | 216 | if (likely(pool->curr_nr)) { |
225 | element = remove_element(pool); | 217 | element = remove_element(pool); |
226 | spin_unlock_irqrestore(&pool->lock, flags); | 218 | spin_unlock_irqrestore(&pool->lock, flags); |
219 | /* paired with rmb in mempool_free(), read comment there */ | ||
220 | smp_wmb(); | ||
227 | return element; | 221 | return element; |
228 | } | 222 | } |
229 | spin_unlock_irqrestore(&pool->lock, flags); | ||
230 | 223 | ||
231 | /* We must not sleep in the GFP_ATOMIC case */ | 224 | /* |
232 | if (!(gfp_mask & __GFP_WAIT)) | 225 | * We use gfp mask w/o __GFP_WAIT or IO for the first round. If |
226 | * alloc failed with that and @pool was empty, retry immediately. | ||
227 | */ | ||
228 | if (gfp_temp != gfp_mask) { | ||
229 | spin_unlock_irqrestore(&pool->lock, flags); | ||
230 | gfp_temp = gfp_mask; | ||
231 | goto repeat_alloc; | ||
232 | } | ||
233 | |||
234 | /* We must not sleep if !__GFP_WAIT */ | ||
235 | if (!(gfp_mask & __GFP_WAIT)) { | ||
236 | spin_unlock_irqrestore(&pool->lock, flags); | ||
233 | return NULL; | 237 | return NULL; |
238 | } | ||
234 | 239 | ||
235 | /* Now start performing page reclaim */ | 240 | /* Let's wait for someone else to return an element to @pool */ |
236 | gfp_temp = gfp_mask; | ||
237 | init_wait(&wait); | 241 | init_wait(&wait); |
238 | prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); | 242 | prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); |
239 | smp_mb(); | ||
240 | if (!pool->curr_nr) { | ||
241 | /* | ||
242 | * FIXME: this should be io_schedule(). The timeout is there | ||
243 | * as a workaround for some DM problems in 2.6.18. | ||
244 | */ | ||
245 | io_schedule_timeout(5*HZ); | ||
246 | } | ||
247 | finish_wait(&pool->wait, &wait); | ||
248 | 243 | ||
244 | spin_unlock_irqrestore(&pool->lock, flags); | ||
245 | |||
246 | /* | ||
247 | * FIXME: this should be io_schedule(). The timeout is there as a | ||
248 | * workaround for some DM problems in 2.6.18. | ||
249 | */ | ||
250 | io_schedule_timeout(5*HZ); | ||
251 | |||
252 | finish_wait(&pool->wait, &wait); | ||
249 | goto repeat_alloc; | 253 | goto repeat_alloc; |
250 | } | 254 | } |
251 | EXPORT_SYMBOL(mempool_alloc); | 255 | EXPORT_SYMBOL(mempool_alloc); |
@@ -265,7 +269,39 @@ void mempool_free(void *element, mempool_t *pool) | |||
265 | if (unlikely(element == NULL)) | 269 | if (unlikely(element == NULL)) |
266 | return; | 270 | return; |
267 | 271 | ||
268 | smp_mb(); | 272 | /* |
273 | * Paired with the wmb in mempool_alloc(). The preceding read is | ||
274 | * for @element and the following @pool->curr_nr. This ensures | ||
275 | * that the visible value of @pool->curr_nr is from after the | ||
276 | * allocation of @element. This is necessary for fringe cases | ||
277 | * where @element was passed to this task without going through | ||
278 | * barriers. | ||
279 | * | ||
280 | * For example, assume @p is %NULL at the beginning and one task | ||
281 | * performs "p = mempool_alloc(...);" while another task is doing | ||
282 | * "while (!p) cpu_relax(); mempool_free(p, ...);". This function | ||
283 | * may end up using curr_nr value which is from before allocation | ||
284 | * of @p without the following rmb. | ||
285 | */ | ||
286 | smp_rmb(); | ||
287 | |||
288 | /* | ||
289 | * For correctness, we need a test which is guaranteed to trigger | ||
290 | * if curr_nr + #allocated == min_nr. Testing curr_nr < min_nr | ||
291 | * without locking achieves that and refilling as soon as possible | ||
292 | * is desirable. | ||
293 | * | ||
294 | * Because curr_nr visible here is always a value after the | ||
295 | * allocation of @element, any task which decremented curr_nr below | ||
296 | * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets | ||
297 | * incremented to min_nr afterwards. If curr_nr gets incremented | ||
298 | * to min_nr after the allocation of @element, the elements | ||
299 | * allocated after that are subject to the same guarantee. | ||
300 | * | ||
301 | * Waiters happen iff curr_nr is 0 and the above guarantee also | ||
302 | * ensures that there will be frees which return elements to the | ||
303 | * pool waking up the waiters. | ||
304 | */ | ||
269 | if (pool->curr_nr < pool->min_nr) { | 305 | if (pool->curr_nr < pool->min_nr) { |
270 | spin_lock_irqsave(&pool->lock, flags); | 306 | spin_lock_irqsave(&pool->lock, flags); |
271 | if (pool->curr_nr < pool->min_nr) { | 307 | if (pool->curr_nr < pool->min_nr) { |
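The new comments above describe a standard publish/consume barrier pairing: in mempool terms the "payload" is pool->curr_nr and the "flag" is the element pointer handed to another task. Below is a stand-alone C11 sketch of that generic pattern (not the mempool code itself), where the release fence plays the role of smp_wmb() in mempool_alloc() and the acquire fence that of smp_rmb() in mempool_free():

#include <stdatomic.h>

static int payload;                  /* role of pool->curr_nr             */
static atomic_int published;         /* role of the handed-over element   */

static void writer(void)             /* cf. the mempool_alloc() side      */
{
        payload = 42;                                       /* A: data    */
        atomic_thread_fence(memory_order_release);          /* ~smp_wmb() */
        atomic_store_explicit(&published, 1, memory_order_relaxed);
}

static int reader(void)              /* cf. the mempool_free() side       */
{
        while (!atomic_load_explicit(&published, memory_order_relaxed))
                ;                                           /* spin       */
        atomic_thread_fence(memory_order_acquire);          /* ~smp_rmb() */
        return payload;              /* guaranteed to see the store at A  */
}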
diff --git a/mm/migrate.c b/mm/migrate.c
index 177aca424a06..89ea0854332e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -39,8 +39,6 @@ | |||
39 | 39 | ||
40 | #include "internal.h" | 40 | #include "internal.h" |
41 | 41 | ||
42 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | ||
43 | |||
44 | /* | 42 | /* |
45 | * migrate_prep() needs to be called before we start compiling a list of pages | 43 | * migrate_prep() needs to be called before we start compiling a list of pages |
46 | * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is | 44 | * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is |
@@ -181,8 +179,6 @@ static void remove_migration_ptes(struct page *old, struct page *new) | |||
181 | * Something used the pte of a page under migration. We need to | 179 | * Something used the pte of a page under migration. We need to |
182 | * get to the page and wait until migration is finished. | 180 | * get to the page and wait until migration is finished. |
183 | * When we return from this function the fault will be retried. | 181 | * When we return from this function the fault will be retried. |
184 | * | ||
185 | * This function is called from do_swap_page(). | ||
186 | */ | 182 | */ |
187 | void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, | 183 | void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, |
188 | unsigned long address) | 184 | unsigned long address) |
@@ -269,12 +265,12 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
269 | 265 | ||
270 | radix_tree_replace_slot(pslot, newpage); | 266 | radix_tree_replace_slot(pslot, newpage); |
271 | 267 | ||
272 | page_unfreeze_refs(page, expected_count); | ||
273 | /* | 268 | /* |
274 | * Drop cache reference from old page. | 269 | * Drop cache reference from old page by unfreezing |
270 | * to one less reference. | ||
275 | * We know this isn't the last reference. | 271 | * We know this isn't the last reference. |
276 | */ | 272 | */ |
277 | __put_page(page); | 273 | page_unfreeze_refs(page, expected_count - 1); |
278 | 274 | ||
279 | /* | 275 | /* |
280 | * If moved to a different zone then also account | 276 | * If moved to a different zone then also account |
@@ -334,9 +330,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
334 | 330 | ||
335 | radix_tree_replace_slot(pslot, newpage); | 331 | radix_tree_replace_slot(pslot, newpage); |
336 | 332 | ||
337 | page_unfreeze_refs(page, expected_count); | 333 | page_unfreeze_refs(page, expected_count - 1); |
338 | |||
339 | __put_page(page); | ||
340 | 334 | ||
341 | spin_unlock_irq(&mapping->tree_lock); | 335 | spin_unlock_irq(&mapping->tree_lock); |
342 | return 0; | 336 | return 0; |
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1603,39 +1603,19 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | |||
1603 | 1603 | ||
1604 | EXPORT_SYMBOL(find_vma); | 1604 | EXPORT_SYMBOL(find_vma); |
1605 | 1605 | ||
1606 | /* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ | 1606 | /* |
1607 | * Same as find_vma, but also return a pointer to the previous VMA in *pprev. | ||
1608 | * Note: pprev is set to NULL when return value is NULL. | ||
1609 | */ | ||
1607 | struct vm_area_struct * | 1610 | struct vm_area_struct * |
1608 | find_vma_prev(struct mm_struct *mm, unsigned long addr, | 1611 | find_vma_prev(struct mm_struct *mm, unsigned long addr, |
1609 | struct vm_area_struct **pprev) | 1612 | struct vm_area_struct **pprev) |
1610 | { | 1613 | { |
1611 | struct vm_area_struct *vma = NULL, *prev = NULL; | 1614 | struct vm_area_struct *vma; |
1612 | struct rb_node *rb_node; | ||
1613 | if (!mm) | ||
1614 | goto out; | ||
1615 | |||
1616 | /* Guard against addr being lower than the first VMA */ | ||
1617 | vma = mm->mmap; | ||
1618 | |||
1619 | /* Go through the RB tree quickly. */ | ||
1620 | rb_node = mm->mm_rb.rb_node; | ||
1621 | |||
1622 | while (rb_node) { | ||
1623 | struct vm_area_struct *vma_tmp; | ||
1624 | vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); | ||
1625 | |||
1626 | if (addr < vma_tmp->vm_end) { | ||
1627 | rb_node = rb_node->rb_left; | ||
1628 | } else { | ||
1629 | prev = vma_tmp; | ||
1630 | if (!prev->vm_next || (addr < prev->vm_next->vm_end)) | ||
1631 | break; | ||
1632 | rb_node = rb_node->rb_right; | ||
1633 | } | ||
1634 | } | ||
1635 | 1615 | ||
1636 | out: | 1616 | vma = find_vma(mm, addr); |
1637 | *pprev = prev; | 1617 | *pprev = vma ? vma->vm_prev : NULL; |
1638 | return prev ? prev->vm_next : vma; | 1618 | return vma; |
1639 | } | 1619 | } |
1640 | 1620 | ||
1641 | /* | 1621 | /* |
@@ -2322,13 +2302,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2322 | struct vm_area_struct *new_vma, *prev; | 2302 | struct vm_area_struct *new_vma, *prev; |
2323 | struct rb_node **rb_link, *rb_parent; | 2303 | struct rb_node **rb_link, *rb_parent; |
2324 | struct mempolicy *pol; | 2304 | struct mempolicy *pol; |
2305 | bool faulted_in_anon_vma = true; | ||
2325 | 2306 | ||
2326 | /* | 2307 | /* |
2327 | * If anonymous vma has not yet been faulted, update new pgoff | 2308 | * If anonymous vma has not yet been faulted, update new pgoff |
2328 | * to match new location, to increase its chance of merging. | 2309 | * to match new location, to increase its chance of merging. |
2329 | */ | 2310 | */ |
2330 | if (!vma->vm_file && !vma->anon_vma) | 2311 | if (unlikely(!vma->vm_file && !vma->anon_vma)) { |
2331 | pgoff = addr >> PAGE_SHIFT; | 2312 | pgoff = addr >> PAGE_SHIFT; |
2313 | faulted_in_anon_vma = false; | ||
2314 | } | ||
2332 | 2315 | ||
2333 | find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); | 2316 | find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); |
2334 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, | 2317 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, |
@@ -2337,9 +2320,24 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2337 | /* | 2320 | /* |
2338 | * Source vma may have been merged into new_vma | 2321 | * Source vma may have been merged into new_vma |
2339 | */ | 2322 | */ |
2340 | if (vma_start >= new_vma->vm_start && | 2323 | if (unlikely(vma_start >= new_vma->vm_start && |
2341 | vma_start < new_vma->vm_end) | 2324 | vma_start < new_vma->vm_end)) { |
2325 | /* | ||
2326 | * The only way we can get a vma_merge with | ||
2327 | * self during an mremap is if the vma hasn't | ||
2328 | * been faulted in yet and we were allowed to | ||
2329 | * reset the dst vma->vm_pgoff to the | ||
2330 | * destination address of the mremap to allow | ||
2331 | * the merge to happen. mremap must change the | ||
2332 | * vm_pgoff linearity between src and dst vmas | ||
2333 | * (in turn preventing a vma_merge) to be | ||
2334 | * safe. It is only safe to keep the vm_pgoff | ||
2335 | * linear if there are no pages mapped yet. | ||
2336 | */ | ||
2337 | VM_BUG_ON(faulted_in_anon_vma); | ||
2342 | *vmap = new_vma; | 2338 | *vmap = new_vma; |
2339 | } else | ||
2340 | anon_vma_moveto_tail(new_vma); | ||
2343 | } else { | 2341 | } else { |
2344 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 2342 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
2345 | if (new_vma) { | 2343 | if (new_vma) { |
diff --git a/mm/mremap.c b/mm/mremap.c
index d6959cb4df58..87bb8393e7d2 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -221,6 +221,15 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
221 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); | 221 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); |
222 | if (moved_len < old_len) { | 222 | if (moved_len < old_len) { |
223 | /* | 223 | /* |
224 | * Before moving the page tables from the new vma to | ||
225 | * the old vma, we need to be sure the old vma is | ||
226 | * queued after new vma in the same_anon_vma list to | ||
227 | * prevent SMP races with rmap_walk (that could lead | ||
228 | * rmap_walk to miss some page table). | ||
229 | */ | ||
230 | anon_vma_moveto_tail(vma); | ||
231 | |||
232 | /* | ||
224 | * On error, move entries back from new area to old, | 233 | * On error, move entries back from new area to old, |
225 | * which will succeed since page tables still there, | 234 | * which will succeed since page tables still there, |
226 | * and then proceed to unmap new area instead of old. | 235 | * and then proceed to unmap new area instead of old. |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eeb27e27dce3..7c122faa05c5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -33,6 +33,10 @@ | |||
33 | #include <linux/security.h> | 33 | #include <linux/security.h> |
34 | #include <linux/ptrace.h> | 34 | #include <linux/ptrace.h> |
35 | #include <linux/freezer.h> | 35 | #include <linux/freezer.h> |
36 | #include <linux/ftrace.h> | ||
37 | |||
38 | #define CREATE_TRACE_POINTS | ||
39 | #include <trace/events/oom.h> | ||
36 | 40 | ||
37 | int sysctl_panic_on_oom; | 41 | int sysctl_panic_on_oom; |
38 | int sysctl_oom_kill_allocating_task; | 42 | int sysctl_oom_kill_allocating_task; |
@@ -55,6 +59,7 @@ void compare_swap_oom_score_adj(int old_val, int new_val) | |||
55 | spin_lock_irq(&sighand->siglock); | 59 | spin_lock_irq(&sighand->siglock); |
56 | if (current->signal->oom_score_adj == old_val) | 60 | if (current->signal->oom_score_adj == old_val) |
57 | current->signal->oom_score_adj = new_val; | 61 | current->signal->oom_score_adj = new_val; |
62 | trace_oom_score_adj_update(current); | ||
58 | spin_unlock_irq(&sighand->siglock); | 63 | spin_unlock_irq(&sighand->siglock); |
59 | } | 64 | } |
60 | 65 | ||
@@ -74,6 +79,7 @@ int test_set_oom_score_adj(int new_val) | |||
74 | spin_lock_irq(&sighand->siglock); | 79 | spin_lock_irq(&sighand->siglock); |
75 | old_val = current->signal->oom_score_adj; | 80 | old_val = current->signal->oom_score_adj; |
76 | current->signal->oom_score_adj = new_val; | 81 | current->signal->oom_score_adj = new_val; |
82 | trace_oom_score_adj_update(current); | ||
77 | spin_unlock_irq(&sighand->siglock); | 83 | spin_unlock_irq(&sighand->siglock); |
78 | 84 | ||
79 | return old_val; | 85 | return old_val; |
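The oom_kill.c hunk pulls in trace/events/oom.h and fires trace_oom_score_adj_update() whenever oom_score_adj changes. As orientation only, a hedged sketch of what such a TRACE_EVENT definition typically looks like; the field list and format string here are assumptions, not a copy of the real header:

TRACE_EVENT(oom_score_adj_update,

        TP_PROTO(struct task_struct *task),

        TP_ARGS(task),

        TP_STRUCT__entry(
                __array(char, comm, TASK_COMM_LEN)
                __field(pid_t, pid)
                __field(int, oom_score_adj)
        ),

        TP_fast_assign(
                memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
                __entry->pid = task->pid;
                __entry->oom_score_adj = task->signal->oom_score_adj;
        ),

        TP_printk("pid=%d comm=%s oom_score_adj=%d",
                __entry->pid, __entry->comm, __entry->oom_score_adj)
);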
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8616ef3025a4..5cdd4f2b0c9d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -130,6 +130,191 @@ unsigned long global_dirty_limit; | |||
130 | static struct prop_descriptor vm_completions; | 130 | static struct prop_descriptor vm_completions; |
131 | 131 | ||
132 | /* | 132 | /* |
133 | * Work out the current dirty-memory clamping and background writeout | ||
134 | * thresholds. | ||
135 | * | ||
136 | * The main aim here is to lower them aggressively if there is a lot of mapped | ||
137 | * memory around. To avoid stressing page reclaim with lots of unreclaimable | ||
138 | * pages. It is better to clamp down on writers than to start swapping, and | ||
139 | * performing lots of scanning. | ||
140 | * | ||
141 | * We only allow 1/2 of the currently-unmapped memory to be dirtied. | ||
142 | * | ||
143 | * We don't permit the clamping level to fall below 5% - that is getting rather | ||
144 | * excessive. | ||
145 | * | ||
146 | * We make sure that the background writeout level is below the adjusted | ||
147 | * clamping level. | ||
148 | */ | ||
149 | |||
150 | /* | ||
151 | * In a memory zone, there is a certain amount of pages we consider | ||
152 | * available for the page cache, which is essentially the number of | ||
153 | * free and reclaimable pages, minus some zone reserves to protect | ||
154 | * lowmem and the ability to uphold the zone's watermarks without | ||
155 | * requiring writeback. | ||
156 | * | ||
157 | * This number of dirtyable pages is the base value of which the | ||
158 | * user-configurable dirty ratio is the effictive number of pages that | ||
159 | * are allowed to be actually dirtied. Per individual zone, or | ||
160 | * globally by using the sum of dirtyable pages over all zones. | ||
161 | * | ||
162 | * Because the user is allowed to specify the dirty limit globally as | ||
163 | * absolute number of bytes, calculating the per-zone dirty limit can | ||
164 | * require translating the configured limit into a percentage of | ||
165 | * global dirtyable memory first. | ||
166 | */ | ||
167 | |||
168 | static unsigned long highmem_dirtyable_memory(unsigned long total) | ||
169 | { | ||
170 | #ifdef CONFIG_HIGHMEM | ||
171 | int node; | ||
172 | unsigned long x = 0; | ||
173 | |||
174 | for_each_node_state(node, N_HIGH_MEMORY) { | ||
175 | struct zone *z = | ||
176 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; | ||
177 | |||
178 | x += zone_page_state(z, NR_FREE_PAGES) + | ||
179 | zone_reclaimable_pages(z) - z->dirty_balance_reserve; | ||
180 | } | ||
181 | /* | ||
182 | * Make sure that the number of highmem pages is never larger | ||
183 | * than the number of the total dirtyable memory. This can only | ||
184 | * occur in very strange VM situations but we want to make sure | ||
185 | * that this does not occur. | ||
186 | */ | ||
187 | return min(x, total); | ||
188 | #else | ||
189 | return 0; | ||
190 | #endif | ||
191 | } | ||
192 | |||
193 | /** | ||
194 | * global_dirtyable_memory - number of globally dirtyable pages | ||
195 | * | ||
196 | * Returns the global number of pages potentially available for dirty | ||
197 | * page cache. This is the base value for the global dirty limits. | ||
198 | */ | ||
199 | unsigned long global_dirtyable_memory(void) | ||
200 | { | ||
201 | unsigned long x; | ||
202 | |||
203 | x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() - | ||
204 | dirty_balance_reserve; | ||
205 | |||
206 | if (!vm_highmem_is_dirtyable) | ||
207 | x -= highmem_dirtyable_memory(x); | ||
208 | |||
209 | return x + 1; /* Ensure that we never return 0 */ | ||
210 | } | ||
211 | |||
212 | /* | ||
213 | * global_dirty_limits - background-writeback and dirty-throttling thresholds | ||
214 | * | ||
215 | * Calculate the dirty thresholds based on sysctl parameters | ||
216 | * - vm.dirty_background_ratio or vm.dirty_background_bytes | ||
217 | * - vm.dirty_ratio or vm.dirty_bytes | ||
218 | * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and | ||
219 | * real-time tasks. | ||
220 | */ | ||
221 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | ||
222 | { | ||
223 | unsigned long background; | ||
224 | unsigned long dirty; | ||
225 | unsigned long uninitialized_var(available_memory); | ||
226 | struct task_struct *tsk; | ||
227 | |||
228 | if (!vm_dirty_bytes || !dirty_background_bytes) | ||
229 | available_memory = global_dirtyable_memory(); | ||
230 | |||
231 | if (vm_dirty_bytes) | ||
232 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); | ||
233 | else | ||
234 | dirty = (vm_dirty_ratio * available_memory) / 100; | ||
235 | |||
236 | if (dirty_background_bytes) | ||
237 | background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); | ||
238 | else | ||
239 | background = (dirty_background_ratio * available_memory) / 100; | ||
240 | |||
241 | if (background >= dirty) | ||
242 | background = dirty / 2; | ||
243 | tsk = current; | ||
244 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { | ||
245 | background += background / 4; | ||
246 | dirty += dirty / 4; | ||
247 | } | ||
248 | *pbackground = background; | ||
249 | *pdirty = dirty; | ||
250 | trace_global_dirty_state(background, dirty); | ||
251 | } | ||
252 | |||
253 | /** | ||
254 | * zone_dirtyable_memory - number of dirtyable pages in a zone | ||
255 | * @zone: the zone | ||
256 | * | ||
257 | * Returns the zone's number of pages potentially available for dirty | ||
258 | * page cache. This is the base value for the per-zone dirty limits. | ||
259 | */ | ||
260 | static unsigned long zone_dirtyable_memory(struct zone *zone) | ||
261 | { | ||
262 | /* | ||
263 | * The effective global number of dirtyable pages may exclude | ||
264 | * highmem as a big-picture measure to keep the ratio between | ||
265 | * dirty memory and lowmem reasonable. | ||
266 | * | ||
267 | * But this function is purely about the individual zone and a | ||
268 | * highmem zone can hold its share of dirty pages, so we don't | ||
269 | * care about vm_highmem_is_dirtyable here. | ||
270 | */ | ||
271 | return zone_page_state(zone, NR_FREE_PAGES) + | ||
272 | zone_reclaimable_pages(zone) - | ||
273 | zone->dirty_balance_reserve; | ||
274 | } | ||
275 | |||
276 | /** | ||
277 | * zone_dirty_limit - maximum number of dirty pages allowed in a zone | ||
278 | * @zone: the zone | ||
279 | * | ||
280 | * Returns the maximum number of dirty pages allowed in a zone, based | ||
281 | * on the zone's dirtyable memory. | ||
282 | */ | ||
283 | static unsigned long zone_dirty_limit(struct zone *zone) | ||
284 | { | ||
285 | unsigned long zone_memory = zone_dirtyable_memory(zone); | ||
286 | struct task_struct *tsk = current; | ||
287 | unsigned long dirty; | ||
288 | |||
289 | if (vm_dirty_bytes) | ||
290 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * | ||
291 | zone_memory / global_dirtyable_memory(); | ||
292 | else | ||
293 | dirty = vm_dirty_ratio * zone_memory / 100; | ||
294 | |||
295 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) | ||
296 | dirty += dirty / 4; | ||
297 | |||
298 | return dirty; | ||
299 | } | ||
300 | |||
301 | /** | ||
302 | * zone_dirty_ok - tells whether a zone is within its dirty limits | ||
303 | * @zone: the zone to check | ||
304 | * | ||
305 | * Returns %true when the dirty pages in @zone are within the zone's | ||
306 | * dirty limit, %false if the limit is exceeded. | ||
307 | */ | ||
308 | bool zone_dirty_ok(struct zone *zone) | ||
309 | { | ||
310 | unsigned long limit = zone_dirty_limit(zone); | ||
311 | |||
312 | return zone_page_state(zone, NR_FILE_DIRTY) + | ||
313 | zone_page_state(zone, NR_UNSTABLE_NFS) + | ||
314 | zone_page_state(zone, NR_WRITEBACK) <= limit; | ||
315 | } | ||
316 | |||
317 | /* | ||
133 | * couple the period to the dirty_ratio: | 318 | * couple the period to the dirty_ratio: |
134 | * | 319 | * |
135 | * period/2 ~ roundup_pow_of_two(dirty limit) | 320 | * period/2 ~ roundup_pow_of_two(dirty limit) |
@@ -141,7 +326,7 @@ static int calc_period_shift(void) | |||
141 | if (vm_dirty_bytes) | 326 | if (vm_dirty_bytes) |
142 | dirty_total = vm_dirty_bytes / PAGE_SIZE; | 327 | dirty_total = vm_dirty_bytes / PAGE_SIZE; |
143 | else | 328 | else |
144 | dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / | 329 | dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) / |
145 | 100; | 330 | 100; |
146 | return 2 + ilog2(dirty_total - 1); | 331 | return 2 + ilog2(dirty_total - 1); |
147 | } | 332 | } |
@@ -196,7 +381,6 @@ int dirty_ratio_handler(struct ctl_table *table, int write, | |||
196 | return ret; | 381 | return ret; |
197 | } | 382 | } |
198 | 383 | ||
199 | |||
200 | int dirty_bytes_handler(struct ctl_table *table, int write, | 384 | int dirty_bytes_handler(struct ctl_table *table, int write, |
201 | void __user *buffer, size_t *lenp, | 385 | void __user *buffer, size_t *lenp, |
202 | loff_t *ppos) | 386 | loff_t *ppos) |
@@ -291,67 +475,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) | |||
291 | } | 475 | } |
292 | EXPORT_SYMBOL(bdi_set_max_ratio); | 476 | EXPORT_SYMBOL(bdi_set_max_ratio); |
293 | 477 | ||
294 | /* | ||
295 | * Work out the current dirty-memory clamping and background writeout | ||
296 | * thresholds. | ||
297 | * | ||
298 | * The main aim here is to lower them aggressively if there is a lot of mapped | ||
299 | * memory around. To avoid stressing page reclaim with lots of unreclaimable | ||
300 | * pages. It is better to clamp down on writers than to start swapping, and | ||
301 | * performing lots of scanning. | ||
302 | * | ||
303 | * We only allow 1/2 of the currently-unmapped memory to be dirtied. | ||
304 | * | ||
305 | * We don't permit the clamping level to fall below 5% - that is getting rather | ||
306 | * excessive. | ||
307 | * | ||
308 | * We make sure that the background writeout level is below the adjusted | ||
309 | * clamping level. | ||
310 | */ | ||
311 | |||
312 | static unsigned long highmem_dirtyable_memory(unsigned long total) | ||
313 | { | ||
314 | #ifdef CONFIG_HIGHMEM | ||
315 | int node; | ||
316 | unsigned long x = 0; | ||
317 | |||
318 | for_each_node_state(node, N_HIGH_MEMORY) { | ||
319 | struct zone *z = | ||
320 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; | ||
321 | |||
322 | x += zone_page_state(z, NR_FREE_PAGES) + | ||
323 | zone_reclaimable_pages(z); | ||
324 | } | ||
325 | /* | ||
326 | * Make sure that the number of highmem pages is never larger | ||
327 | * than the number of the total dirtyable memory. This can only | ||
328 | * occur in very strange VM situations but we want to make sure | ||
329 | * that this does not occur. | ||
330 | */ | ||
331 | return min(x, total); | ||
332 | #else | ||
333 | return 0; | ||
334 | #endif | ||
335 | } | ||
336 | |||
337 | /** | ||
338 | * determine_dirtyable_memory - amount of memory that may be used | ||
339 | * | ||
340 | * Returns the numebr of pages that can currently be freed and used | ||
341 | * by the kernel for direct mappings. | ||
342 | */ | ||
343 | unsigned long determine_dirtyable_memory(void) | ||
344 | { | ||
345 | unsigned long x; | ||
346 | |||
347 | x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); | ||
348 | |||
349 | if (!vm_highmem_is_dirtyable) | ||
350 | x -= highmem_dirtyable_memory(x); | ||
351 | |||
352 | return x + 1; /* Ensure that we never return 0 */ | ||
353 | } | ||
354 | |||
355 | static unsigned long dirty_freerun_ceiling(unsigned long thresh, | 478 | static unsigned long dirty_freerun_ceiling(unsigned long thresh, |
356 | unsigned long bg_thresh) | 479 | unsigned long bg_thresh) |
357 | { | 480 | { |
@@ -363,47 +486,6 @@ static unsigned long hard_dirty_limit(unsigned long thresh) | |||
363 | return max(thresh, global_dirty_limit); | 486 | return max(thresh, global_dirty_limit); |
364 | } | 487 | } |
365 | 488 | ||
366 | /* | ||
367 | * global_dirty_limits - background-writeback and dirty-throttling thresholds | ||
368 | * | ||
369 | * Calculate the dirty thresholds based on sysctl parameters | ||
370 | * - vm.dirty_background_ratio or vm.dirty_background_bytes | ||
371 | * - vm.dirty_ratio or vm.dirty_bytes | ||
372 | * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and | ||
373 | * real-time tasks. | ||
374 | */ | ||
375 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | ||
376 | { | ||
377 | unsigned long background; | ||
378 | unsigned long dirty; | ||
379 | unsigned long uninitialized_var(available_memory); | ||
380 | struct task_struct *tsk; | ||
381 | |||
382 | if (!vm_dirty_bytes || !dirty_background_bytes) | ||
383 | available_memory = determine_dirtyable_memory(); | ||
384 | |||
385 | if (vm_dirty_bytes) | ||
386 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); | ||
387 | else | ||
388 | dirty = (vm_dirty_ratio * available_memory) / 100; | ||
389 | |||
390 | if (dirty_background_bytes) | ||
391 | background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); | ||
392 | else | ||
393 | background = (dirty_background_ratio * available_memory) / 100; | ||
394 | |||
395 | if (background >= dirty) | ||
396 | background = dirty / 2; | ||
397 | tsk = current; | ||
398 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { | ||
399 | background += background / 4; | ||
400 | dirty += dirty / 4; | ||
401 | } | ||
402 | *pbackground = background; | ||
403 | *pdirty = dirty; | ||
404 | trace_global_dirty_state(background, dirty); | ||
405 | } | ||
406 | |||
407 | /** | 489 | /** |
408 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold | 490 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold |
409 | * @bdi: the backing_dev_info to query | 491 | * @bdi: the backing_dev_info to query |
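As a quick sanity check on global_dirty_limits() above: with the ratio-based sysctls the thresholds are plain percentages of dirtyable memory. A minimal sketch, ignoring the vm.dirty_bytes variants and the PF_LESS_THROTTLE/rt_task boost; dirty_limits_sketch() is illustrative, not a kernel function:

/* Example: 1,000,000 dirtyable pages, vm.dirty_ratio=20 and
 * vm.dirty_background_ratio=10 give dirty=200,000, background=100,000. */
static void dirty_limits_sketch(unsigned long dirtyable_pages,
                                unsigned int dirty_ratio,
                                unsigned int background_ratio,
                                unsigned long *pbackground,
                                unsigned long *pdirty)
{
        unsigned long dirty = dirtyable_pages * dirty_ratio / 100;
        unsigned long background = dirtyable_pages * background_ratio / 100;

        if (background >= dirty)        /* keep background below dirty */
                background = dirty / 2;

        *pbackground = background;
        *pdirty = dirty;
}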
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7990ca154d1b..794e6715c226 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@ | |||
57 | #include <linux/ftrace_event.h> | 57 | #include <linux/ftrace_event.h> |
58 | #include <linux/memcontrol.h> | 58 | #include <linux/memcontrol.h> |
59 | #include <linux/prefetch.h> | 59 | #include <linux/prefetch.h> |
60 | #include <linux/page-debug-flags.h> | ||
60 | 61 | ||
61 | #include <asm/tlbflush.h> | 62 | #include <asm/tlbflush.h> |
62 | #include <asm/div64.h> | 63 | #include <asm/div64.h> |
@@ -96,6 +97,14 @@ EXPORT_SYMBOL(node_states); | |||
96 | 97 | ||
97 | unsigned long totalram_pages __read_mostly; | 98 | unsigned long totalram_pages __read_mostly; |
98 | unsigned long totalreserve_pages __read_mostly; | 99 | unsigned long totalreserve_pages __read_mostly; |
100 | /* | ||
101 | * When calculating the number of globally allowed dirty pages, there | ||
102 | * is a certain number of per-zone reserves that should not be | ||
103 | * considered dirtyable memory. This is the sum of those reserves | ||
104 | * over all existing zones that contribute dirtyable memory. | ||
105 | */ | ||
106 | unsigned long dirty_balance_reserve __read_mostly; | ||
107 | |||
99 | int percpu_pagelist_fraction; | 108 | int percpu_pagelist_fraction; |
100 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; | 109 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; |
101 | 110 | ||
@@ -127,6 +136,13 @@ void pm_restrict_gfp_mask(void) | |||
127 | saved_gfp_mask = gfp_allowed_mask; | 136 | saved_gfp_mask = gfp_allowed_mask; |
128 | gfp_allowed_mask &= ~GFP_IOFS; | 137 | gfp_allowed_mask &= ~GFP_IOFS; |
129 | } | 138 | } |
139 | |||
140 | bool pm_suspended_storage(void) | ||
141 | { | ||
142 | if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) | ||
143 | return false; | ||
144 | return true; | ||
145 | } | ||
130 | #endif /* CONFIG_PM_SLEEP */ | 146 | #endif /* CONFIG_PM_SLEEP */ |
131 | 147 | ||
132 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 148 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
@@ -381,6 +397,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | |||
381 | clear_highpage(page + i); | 397 | clear_highpage(page + i); |
382 | } | 398 | } |
383 | 399 | ||
400 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
401 | unsigned int _debug_guardpage_minorder; | ||
402 | |||
403 | static int __init debug_guardpage_minorder_setup(char *buf) | ||
404 | { | ||
405 | unsigned long res; | ||
406 | |||
407 | if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { | ||
408 | printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); | ||
409 | return 0; | ||
410 | } | ||
411 | _debug_guardpage_minorder = res; | ||
412 | printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); | ||
413 | return 0; | ||
414 | } | ||
415 | __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); | ||
416 | |||
417 | static inline void set_page_guard_flag(struct page *page) | ||
418 | { | ||
419 | __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); | ||
420 | } | ||
421 | |||
422 | static inline void clear_page_guard_flag(struct page *page) | ||
423 | { | ||
424 | __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); | ||
425 | } | ||
426 | #else | ||
427 | static inline void set_page_guard_flag(struct page *page) { } | ||
428 | static inline void clear_page_guard_flag(struct page *page) { } | ||
429 | #endif | ||
430 | |||
384 | static inline void set_page_order(struct page *page, int order) | 431 | static inline void set_page_order(struct page *page, int order) |
385 | { | 432 | { |
386 | set_page_private(page, order); | 433 | set_page_private(page, order); |
@@ -438,6 +485,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
438 | if (page_zone_id(page) != page_zone_id(buddy)) | 485 | if (page_zone_id(page) != page_zone_id(buddy)) |
439 | return 0; | 486 | return 0; |
440 | 487 | ||
488 | if (page_is_guard(buddy) && page_order(buddy) == order) { | ||
489 | VM_BUG_ON(page_count(buddy) != 0); | ||
490 | return 1; | ||
491 | } | ||
492 | |||
441 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 493 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
442 | VM_BUG_ON(page_count(buddy) != 0); | 494 | VM_BUG_ON(page_count(buddy) != 0); |
443 | return 1; | 495 | return 1; |
@@ -494,11 +546,19 @@ static inline void __free_one_page(struct page *page, | |||
494 | buddy = page + (buddy_idx - page_idx); | 546 | buddy = page + (buddy_idx - page_idx); |
495 | if (!page_is_buddy(page, buddy, order)) | 547 | if (!page_is_buddy(page, buddy, order)) |
496 | break; | 548 | break; |
497 | 549 | /* | |
498 | /* Our buddy is free, merge with it and move up one order. */ | 550 | * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, |
499 | list_del(&buddy->lru); | 551 | * merge with it and move up one order. |
500 | zone->free_area[order].nr_free--; | 552 | */ |
501 | rmv_page_order(buddy); | 553 | if (page_is_guard(buddy)) { |
554 | clear_page_guard_flag(buddy); | ||
555 | set_page_private(page, 0); | ||
556 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | ||
557 | } else { | ||
558 | list_del(&buddy->lru); | ||
559 | zone->free_area[order].nr_free--; | ||
560 | rmv_page_order(buddy); | ||
561 | } | ||
502 | combined_idx = buddy_idx & page_idx; | 562 | combined_idx = buddy_idx & page_idx; |
503 | page = page + (combined_idx - page_idx); | 563 | page = page + (combined_idx - page_idx); |
504 | page_idx = combined_idx; | 564 | page_idx = combined_idx; |
@@ -632,7 +692,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
632 | int i; | 692 | int i; |
633 | int bad = 0; | 693 | int bad = 0; |
634 | 694 | ||
635 | trace_mm_page_free_direct(page, order); | 695 | trace_mm_page_free(page, order); |
636 | kmemcheck_free_shadow(page, order); | 696 | kmemcheck_free_shadow(page, order); |
637 | 697 | ||
638 | if (PageAnon(page)) | 698 | if (PageAnon(page)) |
@@ -670,32 +730,23 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
670 | local_irq_restore(flags); | 730 | local_irq_restore(flags); |
671 | } | 731 | } |
672 | 732 | ||
673 | /* | ||
674 | * permit the bootmem allocator to evade page validation on high-order frees | ||
675 | */ | ||
676 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | 733 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) |
677 | { | 734 | { |
678 | if (order == 0) { | 735 | unsigned int nr_pages = 1 << order; |
679 | __ClearPageReserved(page); | 736 | unsigned int loop; |
680 | set_page_count(page, 0); | ||
681 | set_page_refcounted(page); | ||
682 | __free_page(page); | ||
683 | } else { | ||
684 | int loop; | ||
685 | |||
686 | prefetchw(page); | ||
687 | for (loop = 0; loop < (1 << order); loop++) { | ||
688 | struct page *p = &page[loop]; | ||
689 | 737 | ||
690 | if (loop + 1 < (1 << order)) | 738 | prefetchw(page); |
691 | prefetchw(p + 1); | 739 | for (loop = 0; loop < nr_pages; loop++) { |
692 | __ClearPageReserved(p); | 740 | struct page *p = &page[loop]; |
693 | set_page_count(p, 0); | ||
694 | } | ||
695 | 741 | ||
696 | set_page_refcounted(page); | 742 | if (loop + 1 < nr_pages) |
697 | __free_pages(page, order); | 743 | prefetchw(p + 1); |
744 | __ClearPageReserved(p); | ||
745 | set_page_count(p, 0); | ||
698 | } | 746 | } |
747 | |||
748 | set_page_refcounted(page); | ||
749 | __free_pages(page, order); | ||
699 | } | 750 | } |
700 | 751 | ||
701 | 752 | ||
@@ -724,6 +775,23 @@ static inline void expand(struct zone *zone, struct page *page, | |||
724 | high--; | 775 | high--; |
725 | size >>= 1; | 776 | size >>= 1; |
726 | VM_BUG_ON(bad_range(zone, &page[size])); | 777 | VM_BUG_ON(bad_range(zone, &page[size])); |
778 | |||
779 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
780 | if (high < debug_guardpage_minorder()) { | ||
781 | /* | ||
782 | * Mark as guard pages (or page), that will allow to | ||
783 | * merge back to allocator when buddy will be freed. | ||
784 | * Corresponding page table entries will not be touched, | ||
785 | * pages will stay not present in virtual address space | ||
786 | */ | ||
787 | INIT_LIST_HEAD(&page[size].lru); | ||
788 | set_page_guard_flag(&page[size]); | ||
789 | set_page_private(&page[size], high); | ||
790 | /* Guard pages are not available for any usage */ | ||
791 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); | ||
792 | continue; | ||
793 | } | ||
794 | #endif | ||
727 | list_add(&page[size].lru, &area->free_list[migratetype]); | 795 | list_add(&page[size].lru, &area->free_list[migratetype]); |
728 | area->nr_free++; | 796 | area->nr_free++; |
729 | set_page_order(&page[size], high); | 797 | set_page_order(&page[size], high); |
@@ -1189,6 +1257,19 @@ out: | |||
1189 | } | 1257 | } |
1190 | 1258 | ||
1191 | /* | 1259 | /* |
1260 | * Free a list of 0-order pages | ||
1261 | */ | ||
1262 | void free_hot_cold_page_list(struct list_head *list, int cold) | ||
1263 | { | ||
1264 | struct page *page, *next; | ||
1265 | |||
1266 | list_for_each_entry_safe(page, next, list, lru) { | ||
1267 | trace_mm_page_free_batched(page, cold); | ||
1268 | free_hot_cold_page(page, cold); | ||
1269 | } | ||
1270 | } | ||
1271 | |||
1272 | /* | ||
1192 | * split_page takes a non-compound higher-order page, and splits it into | 1273 | * split_page takes a non-compound higher-order page, and splits it into |
1193 | * n (1<<order) sub-pages: page[0..n] | 1274 | * n (1<<order) sub-pages: page[0..n] |
1194 | * Each sub-page must be freed individually. | 1275 | * Each sub-page must be freed individually. |
@@ -1435,7 +1516,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1435 | long min = mark; | 1516 | long min = mark; |
1436 | int o; | 1517 | int o; |
1437 | 1518 | ||
1438 | free_pages -= (1 << order) + 1; | 1519 | free_pages -= (1 << order) - 1; |
1439 | if (alloc_flags & ALLOC_HIGH) | 1520 | if (alloc_flags & ALLOC_HIGH) |
1440 | min -= min / 2; | 1521 | min -= min / 2; |
1441 | if (alloc_flags & ALLOC_HARDER) | 1522 | if (alloc_flags & ALLOC_HARDER) |
@@ -1645,6 +1726,35 @@ zonelist_scan: | |||
1645 | if ((alloc_flags & ALLOC_CPUSET) && | 1726 | if ((alloc_flags & ALLOC_CPUSET) && |
1646 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1727 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1647 | continue; | 1728 | continue; |
1729 | /* | ||
1730 | * When allocating a page cache page for writing, we | ||
1731 | * want to get it from a zone that is within its dirty | ||
1732 | * limit, such that no single zone holds more than its | ||
1733 | * proportional share of globally allowed dirty pages. | ||
1734 | * The dirty limits take into account the zone's | ||
1735 | * lowmem reserves and high watermark so that kswapd | ||
1736 | * should be able to balance it without having to | ||
1737 | * write pages from its LRU list. | ||
1738 | * | ||
1739 | * This may look like it could increase pressure on | ||
1740 | * lower zones by failing allocations in higher zones | ||
1741 | * before they are full. But the pages that do spill | ||
1742 | * over are limited as the lower zones are protected | ||
1743 | * by this very same mechanism. It should not become | ||
1744 | * a practical burden to them. | ||
1745 | * | ||
1746 | * XXX: For now, allow allocations to potentially | ||
1747 | * exceed the per-zone dirty limit in the slowpath | ||
1748 | * (ALLOC_WMARK_LOW unset) before going into reclaim, | ||
1749 | * which is important when on a NUMA setup the allowed | ||
1750 | * zones are together not big enough to reach the | ||
1751 | * global limit. The proper fix for these situations | ||
1752 | * will require awareness of zones in the | ||
1753 | * dirty-throttling and the flusher threads. | ||
1754 | */ | ||
1755 | if ((alloc_flags & ALLOC_WMARK_LOW) && | ||
1756 | (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) | ||
1757 | goto this_zone_full; | ||
1648 | 1758 | ||
1649 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | 1759 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); |
1650 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1760 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
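The new block lets the fast path skip zones that already hold their share of dirty page cache when the caller passed `__GFP_WRITE`. The `zone_dirty_ok()` test it relies on is roughly "this zone's dirtyable memory, scaled by the dirty ratio, compared against its current dirty and writeback counts". A hedged sketch of that comparison follows; the struct, its fields, and the function name are invented for the example, not the kernel's types.

```c
#include <stdbool.h>
#include <stdio.h>

/* Invented stand-in for the per-zone counters the real check consults. */
struct toy_zone {
	unsigned long free_pages;
	unsigned long file_pages;		/* reclaimable page cache */
	unsigned long dirty_balance_reserve;	/* watermark + lowmem reserve */
	unsigned long nr_dirty;
	unsigned long nr_writeback;
};

static unsigned int vm_dirty_ratio = 20;	/* percent, like /proc/sys/vm/dirty_ratio */

/*
 * Roughly: a zone is "dirty ok" while its dirty+writeback pages stay
 * under its proportional share of the dirtyable memory it contributes.
 */
static bool toy_zone_dirty_ok(const struct toy_zone *z)
{
	unsigned long dirtyable = z->free_pages + z->file_pages;

	if (dirtyable > z->dirty_balance_reserve)
		dirtyable -= z->dirty_balance_reserve;
	else
		dirtyable = 0;

	return z->nr_dirty + z->nr_writeback <= dirtyable * vm_dirty_ratio / 100;
}

int main(void)
{
	struct toy_zone z = {
		.free_pages = 10000, .file_pages = 40000,
		.dirty_balance_reserve = 5000,
		.nr_dirty = 9500, .nr_writeback = 500,
	};

	/* 45000 dirtyable pages * 20% = 9000 allowed; 10000 dirty -> over the limit. */
	printf("zone_dirty_ok: %s\n", toy_zone_dirty_ok(&z) ? "yes" : "no");
	return 0;
}
```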
@@ -1734,7 +1844,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | |||
1734 | { | 1844 | { |
1735 | unsigned int filter = SHOW_MEM_FILTER_NODES; | 1845 | unsigned int filter = SHOW_MEM_FILTER_NODES; |
1736 | 1846 | ||
1737 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) | 1847 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || |
1848 | debug_guardpage_minorder() > 0) | ||
1738 | return; | 1849 | return; |
1739 | 1850 | ||
1740 | /* | 1851 | /* |
@@ -1773,12 +1884,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | |||
1773 | 1884 | ||
1774 | static inline int | 1885 | static inline int |
1775 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, | 1886 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, |
1887 | unsigned long did_some_progress, | ||
1776 | unsigned long pages_reclaimed) | 1888 | unsigned long pages_reclaimed) |
1777 | { | 1889 | { |
1778 | /* Do not loop if specifically requested */ | 1890 | /* Do not loop if specifically requested */ |
1779 | if (gfp_mask & __GFP_NORETRY) | 1891 | if (gfp_mask & __GFP_NORETRY) |
1780 | return 0; | 1892 | return 0; |
1781 | 1893 | ||
1894 | /* Always retry if specifically requested */ | ||
1895 | if (gfp_mask & __GFP_NOFAIL) | ||
1896 | return 1; | ||
1897 | |||
1898 | /* | ||
1899 | * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim | ||
1900 | * from making forward progress without invoking OOM. Suspend also disables | ||
1901 | * storage devices so kswapd will not help. Bail if we are suspending. | ||
1902 | */ | ||
1903 | if (!did_some_progress && pm_suspended_storage()) | ||
1904 | return 0; | ||
1905 | |||
1782 | /* | 1906 | /* |
1783 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | 1907 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER |
1784 | * means __GFP_NOFAIL, but that may not be true in other | 1908 | * means __GFP_NOFAIL, but that may not be true in other |
@@ -1797,13 +1921,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, | |||
1797 | if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) | 1921 | if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) |
1798 | return 1; | 1922 | return 1; |
1799 | 1923 | ||
1800 | /* | ||
1801 | * Don't let big-order allocations loop unless the caller | ||
1802 | * explicitly requests that. | ||
1803 | */ | ||
1804 | if (gfp_mask & __GFP_NOFAIL) | ||
1805 | return 1; | ||
1806 | |||
1807 | return 0; | 1924 | return 0; |
1808 | } | 1925 | } |
1809 | 1926 | ||
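Taken together, the two hunks above reorder the retry policy: `__GFP_NOFAIL` now short-circuits to "retry" up front instead of being buried after the costly-order check, and a new bail-out stops looping when reclaim made no progress while storage is suspended. A condensed sketch of the resulting decision ladder, with toy flag values standing in for the real gfp bits:

```c
#include <stdbool.h>
#include <stdio.h>

/* Toy flag bits standing in for the real gfp_t flags. */
#define TOY_GFP_NORETRY	0x1
#define TOY_GFP_NOFAIL	0x2
#define TOY_GFP_REPEAT	0x4

#define TOY_COSTLY_ORDER 3

static bool storage_suspended;	/* stand-in for pm_suspended_storage() */

/* Mirrors the order of checks after this patch; returns "should retry?". */
static bool toy_should_alloc_retry(unsigned int gfp, unsigned int order,
				   unsigned long did_some_progress,
				   unsigned long pages_reclaimed)
{
	if (gfp & TOY_GFP_NORETRY)
		return false;		/* caller asked for a single attempt */

	if (gfp & TOY_GFP_NOFAIL)
		return true;		/* caller must not see failure */

	/* Suspend has frozen the disks; reclaim cannot make progress. */
	if (!did_some_progress && storage_suspended)
		return false;

	/* Cheap orders are retried indefinitely... */
	if (order <= TOY_COSTLY_ORDER)
		return true;

	/* ...costly orders only while reclaim keeps paying off. */
	if ((gfp & TOY_GFP_REPEAT) && pages_reclaimed < (1UL << order))
		return true;

	return false;
}

int main(void)
{
	storage_suspended = true;
	printf("order-0, no progress while suspended: %s\n",
	       toy_should_alloc_retry(0, 0, 0, 0) ? "retry" : "give up");
	printf("order-0 with NOFAIL while suspended:  %s\n",
	       toy_should_alloc_retry(TOY_GFP_NOFAIL, 0, 0, 0) ? "retry" : "give up");
	return 0;
}
```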
@@ -2196,7 +2313,8 @@ rebalance: | |||
2196 | 2313 | ||
2197 | /* Check if we should retry the allocation */ | 2314 | /* Check if we should retry the allocation */ |
2198 | pages_reclaimed += did_some_progress; | 2315 | pages_reclaimed += did_some_progress; |
2199 | if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { | 2316 | if (should_alloc_retry(gfp_mask, order, did_some_progress, |
2317 | pages_reclaimed)) { | ||
2200 | /* Wait for some write requests to complete then retry */ | 2318 | /* Wait for some write requests to complete then retry */ |
2201 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2319 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); |
2202 | goto rebalance; | 2320 | goto rebalance; |
@@ -2306,16 +2424,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask) | |||
2306 | } | 2424 | } |
2307 | EXPORT_SYMBOL(get_zeroed_page); | 2425 | EXPORT_SYMBOL(get_zeroed_page); |
2308 | 2426 | ||
2309 | void __pagevec_free(struct pagevec *pvec) | ||
2310 | { | ||
2311 | int i = pagevec_count(pvec); | ||
2312 | |||
2313 | while (--i >= 0) { | ||
2314 | trace_mm_pagevec_free(pvec->pages[i], pvec->cold); | ||
2315 | free_hot_cold_page(pvec->pages[i], pvec->cold); | ||
2316 | } | ||
2317 | } | ||
2318 | |||
2319 | void __free_pages(struct page *page, unsigned int order) | 2427 | void __free_pages(struct page *page, unsigned int order) |
2320 | { | 2428 | { |
2321 | if (put_page_testzero(page)) { | 2429 | if (put_page_testzero(page)) { |
@@ -3385,25 +3493,33 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3385 | if (page_to_nid(page) != zone_to_nid(zone)) | 3493 | if (page_to_nid(page) != zone_to_nid(zone)) |
3386 | continue; | 3494 | continue; |
3387 | 3495 | ||
3388 | /* Blocks with reserved pages will never free, skip them. */ | ||
3389 | block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); | ||
3390 | if (pageblock_is_reserved(pfn, block_end_pfn)) | ||
3391 | continue; | ||
3392 | |||
3393 | block_migratetype = get_pageblock_migratetype(page); | 3496 | block_migratetype = get_pageblock_migratetype(page); |
3394 | 3497 | ||
3395 | /* If this block is reserved, account for it */ | 3498 | /* Only test what is necessary when the reserves are not met */ |
3396 | if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { | 3499 | if (reserve > 0) { |
3397 | reserve--; | 3500 | /* |
3398 | continue; | 3501 | * Blocks with reserved pages will never free, skip |
3399 | } | 3502 | * them. |
3503 | */ | ||
3504 | block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); | ||
3505 | if (pageblock_is_reserved(pfn, block_end_pfn)) | ||
3506 | continue; | ||
3400 | 3507 | ||
3401 | /* Suitable for reserving if this block is movable */ | 3508 | /* If this block is reserved, account for it */ |
3402 | if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { | 3509 | if (block_migratetype == MIGRATE_RESERVE) { |
3403 | set_pageblock_migratetype(page, MIGRATE_RESERVE); | 3510 | reserve--; |
3404 | move_freepages_block(zone, page, MIGRATE_RESERVE); | 3511 | continue; |
3405 | reserve--; | 3512 | } |
3406 | continue; | 3513 | |
3514 | /* Suitable for reserving if this block is movable */ | ||
3515 | if (block_migratetype == MIGRATE_MOVABLE) { | ||
3516 | set_pageblock_migratetype(page, | ||
3517 | MIGRATE_RESERVE); | ||
3518 | move_freepages_block(zone, page, | ||
3519 | MIGRATE_RESERVE); | ||
3520 | reserve--; | ||
3521 | continue; | ||
3522 | } | ||
3407 | } | 3523 | } |
3408 | 3524 | ||
3409 | /* | 3525 | /* |
@@ -4734,8 +4850,19 @@ static void calculate_totalreserve_pages(void) | |||
4734 | if (max > zone->present_pages) | 4850 | if (max > zone->present_pages) |
4735 | max = zone->present_pages; | 4851 | max = zone->present_pages; |
4736 | reserve_pages += max; | 4852 | reserve_pages += max; |
4853 | /* | ||
4854 | * Lowmem reserves are not available to | ||
4855 | * GFP_HIGHUSER page cache allocations and | ||
4856 | * kswapd tries to balance zones to their high | ||
4857 | * watermark. As a result, neither should be | ||
4858 | * regarded as dirtyable memory, to prevent a | ||
4859 | * situation where reclaim has to clean pages | ||
4860 | * in order to balance the zones. | ||
4861 | */ | ||
4862 | zone->dirty_balance_reserve = max; | ||
4737 | } | 4863 | } |
4738 | } | 4864 | } |
4865 | dirty_balance_reserve = reserve_pages; | ||
4739 | totalreserve_pages = reserve_pages; | 4866 | totalreserve_pages = reserve_pages; |
4740 | } | 4867 | } |
4741 | 4868 | ||
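The hunk above records, per zone, the reserve computed earlier in this function (essentially the zone's largest lowmem reserve plus its high watermark, capped at its present pages) as `dirty_balance_reserve`, and sums the per-zone values into a global total; dirty-limit calculations can then exclude memory that kswapd must keep free, or that higher zones may not use for page cache, from dirtyable memory. A toy version of the accumulation, with an invented struct in place of the kernel's zone:

```c
#include <stdio.h>

#define NR_TOY_ZONES 3

/* Illustrative per-zone numbers; not the kernel's struct zone. */
struct toy_zone {
	unsigned long present_pages;
	unsigned long max_lowmem_reserve;	/* largest lowmem_reserve[] entry */
	unsigned long high_wmark;
	unsigned long dirty_balance_reserve;	/* filled in below */
};

static unsigned long toy_dirty_balance_reserve;
static unsigned long toy_totalreserve_pages;

static void toy_calculate_totalreserve_pages(struct toy_zone *zones, int nr)
{
	unsigned long reserve_pages = 0;
	int i;

	for (i = 0; i < nr; i++) {
		struct toy_zone *z = &zones[i];
		/* Pages that must stay free (or unused) in this zone. */
		unsigned long max = z->max_lowmem_reserve + z->high_wmark;

		if (max > z->present_pages)
			max = z->present_pages;

		reserve_pages += max;
		/* Not available to page cache: exclude from dirtyable memory. */
		z->dirty_balance_reserve = max;
	}

	toy_dirty_balance_reserve = reserve_pages;
	toy_totalreserve_pages = reserve_pages;
}

int main(void)
{
	struct toy_zone zones[NR_TOY_ZONES] = {
		{ .present_pages = 4096,   .max_lowmem_reserve = 3000, .high_wmark = 128  },
		{ .present_pages = 225280, .max_lowmem_reserve = 1024, .high_wmark = 4096 },
		{ .present_pages = 32768,  .max_lowmem_reserve = 0,    .high_wmark = 512  },
	};

	toy_calculate_totalreserve_pages(zones, NR_TOY_ZONES);
	printf("totalreserve_pages: %lu, dirty_balance_reserve: %lu\n",
	       toy_totalreserve_pages, toy_dirty_balance_reserve);
	return 0;
}
```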
@@ -272,6 +272,51 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | |||
272 | } | 272 | } |
273 | 273 | ||
274 | /* | 274 | /* |
275 | * Some rmap walks need to find all ptes/hugepmds without false | ||
276 | * negatives (like migrate and split_huge_page) while running | ||
277 | * concurrently with operations that copy or move pagetables (like | ||
278 | * mremap() and fork()). To be safe, they depend on the anon_vma | ||
279 | * "same_anon_vma" list being in a certain order: the dst_vma must | ||
280 | * be placed after the src_vma in the list. This is always guaranteed | ||
281 | * by fork(), but mremap() needs to call this function to enforce it when the | ||
282 | * dst_vma isn't newly allocated and chained with the anon_vma_clone() | ||
283 | * function but just an extension of a pre-existing vma through | ||
284 | * vma_merge. | ||
285 | * | ||
286 | * NOTE: the same_anon_vma list can still be changed by other | ||
287 | * processes while mremap runs because mremap doesn't hold the | ||
288 | * anon_vma mutex to prevent modifications to the list while it | ||
289 | * runs. All we need to enforce is that the relative order of this | ||
290 | process's vmas isn't changing (we don't care about other vmas' | ||
291 | * order). Each vma corresponds to an anon_vma_chain structure so | ||
292 | * there's no risk that other processes calling anon_vma_moveto_tail() | ||
293 | * and changing the same_anon_vma list under mremap() will screw with | ||
294 | the relative order of this process's vmas in the list, because | ||
295 | * they can't alter the order of any vma that belongs to this | ||
296 | * process. And there can't be another anon_vma_moveto_tail() running | ||
297 | * concurrently with mremap() coming from this process because we hold | ||
298 | * the mmap_sem for the whole mremap(). fork() ordering dependency | ||
299 | * also shouldn't be affected because fork() only cares that the | ||
300 | * parent vmas are placed in the list before the child vmas and | ||
301 | * anon_vma_moveto_tail() won't reorder vmas from either the fork() | ||
302 | * parent or child. | ||
303 | */ | ||
304 | void anon_vma_moveto_tail(struct vm_area_struct *dst) | ||
305 | { | ||
306 | struct anon_vma_chain *pavc; | ||
307 | struct anon_vma *root = NULL; | ||
308 | |||
309 | list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) { | ||
310 | struct anon_vma *anon_vma = pavc->anon_vma; | ||
311 | VM_BUG_ON(pavc->vma != dst); | ||
312 | root = lock_anon_vma_root(root, anon_vma); | ||
313 | list_del(&pavc->same_anon_vma); | ||
314 | list_add_tail(&pavc->same_anon_vma, &anon_vma->head); | ||
315 | } | ||
316 | unlock_anon_vma_root(root); | ||
317 | } | ||
318 | |||
319 | /* | ||
275 | * Attach vma to its own anon_vma, as well as to the anon_vmas that | 320 | * Attach vma to its own anon_vma, as well as to the anon_vmas that |
276 | * the corresponding VMA in the parent process is attached to. | 321 | * the corresponding VMA in the parent process is attached to. |
277 | * Returns 0 on success, non-zero on failure. | 322 | * Returns 0 on success, non-zero on failure. |
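The new mm/rmap.c helper above only re-links this process's anon_vma_chain entries to the tail of each same_anon_vma list, so that after an mremap()-driven vma_merge the destination vma is guaranteed to sit after the source vma, which is the order the rmap walkers rely on. A standalone sketch of the "move my entry to the tail" operation on a circular doubly linked list follows; the list type and names are hand-rolled for the example, not the kernel's list_head API.

```c
#include <stdio.h>

/* Minimal circular doubly linked list, standing in for struct list_head. */
struct node {
	struct node *prev, *next;
	const char *name;
};

static void list_init(struct node *head)
{
	head->prev = head->next = head;
}

static void list_add_tail(struct node *n, struct node *head)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

static void list_del(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

/* Re-queue one entry at the tail, as the helper does for each chain. */
static void move_to_tail(struct node *n, struct node *head)
{
	list_del(n);
	list_add_tail(n, head);
}

int main(void)
{
	struct node head, src = { .name = "src_vma" }, dst = { .name = "dst_vma" };
	struct node *p;

	list_init(&head);
	/* Wrong order for an mremap walker: dst before src. */
	list_add_tail(&dst, &head);
	list_add_tail(&src, &head);

	move_to_tail(&dst, &head);	/* enforce "dst after src" */

	for (p = head.next; p != &head; p = p->next)
		printf("%s\n", p->name);
	return 0;
}
```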
@@ -3654,6 +3654,9 @@ void __init kmem_cache_init(void) | |||
3654 | struct kmem_cache *temp_kmem_cache_node; | 3654 | struct kmem_cache *temp_kmem_cache_node; |
3655 | unsigned long kmalloc_size; | 3655 | unsigned long kmalloc_size; |
3656 | 3656 | ||
3657 | if (debug_guardpage_minorder()) | ||
3658 | slub_max_order = 0; | ||
3659 | |||
3657 | kmem_size = offsetof(struct kmem_cache, node) + | 3660 | kmem_size = offsetof(struct kmem_cache, node) + |
3658 | nr_node_ids * sizeof(struct kmem_cache_node *); | 3661 | nr_node_ids * sizeof(struct kmem_cache_node *); |
3659 | 3662 | ||
@@ -585,11 +585,10 @@ int lru_add_drain_all(void) | |||
585 | void release_pages(struct page **pages, int nr, int cold) | 585 | void release_pages(struct page **pages, int nr, int cold) |
586 | { | 586 | { |
587 | int i; | 587 | int i; |
588 | struct pagevec pages_to_free; | 588 | LIST_HEAD(pages_to_free); |
589 | struct zone *zone = NULL; | 589 | struct zone *zone = NULL; |
590 | unsigned long uninitialized_var(flags); | 590 | unsigned long uninitialized_var(flags); |
591 | 591 | ||
592 | pagevec_init(&pages_to_free, cold); | ||
593 | for (i = 0; i < nr; i++) { | 592 | for (i = 0; i < nr; i++) { |
594 | struct page *page = pages[i]; | 593 | struct page *page = pages[i]; |
595 | 594 | ||
@@ -620,19 +619,12 @@ void release_pages(struct page **pages, int nr, int cold) | |||
620 | del_page_from_lru(zone, page); | 619 | del_page_from_lru(zone, page); |
621 | } | 620 | } |
622 | 621 | ||
623 | if (!pagevec_add(&pages_to_free, page)) { | 622 | list_add(&page->lru, &pages_to_free); |
624 | if (zone) { | ||
625 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
626 | zone = NULL; | ||
627 | } | ||
628 | __pagevec_free(&pages_to_free); | ||
629 | pagevec_reinit(&pages_to_free); | ||
630 | } | ||
631 | } | 623 | } |
632 | if (zone) | 624 | if (zone) |
633 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 625 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
634 | 626 | ||
635 | pagevec_free(&pages_to_free); | 627 | free_hot_cold_page_list(&pages_to_free, cold); |
636 | } | 628 | } |
637 | EXPORT_SYMBOL(release_pages); | 629 | EXPORT_SYMBOL(release_pages); |
638 | 630 | ||
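With `__pagevec_free()` gone, release_pages() in mm/swap.c now strings the to-be-freed pages onto a local list while it holds the LRU lock and hands the whole batch to the new free_hot_cold_page_list() after dropping the lock, instead of flushing a fixed-size pagevec batch by batch. The same accumulate-then-drain pattern in a self-contained sketch (a singly linked list and toy item type are used here for brevity; the kernel chains through page->lru):

```c
#include <stdio.h>
#include <stdlib.h>

/* Toy object whose final release we want to batch. */
struct item {
	int id;
	struct item *next;	/* local free-list linkage */
};

/* Stand-in for free_hot_cold_page_list(): drain the batch in one pass. */
static void free_item_list(struct item *head)
{
	while (head) {
		struct item *next = head->next;

		printf("freeing item %d\n", head->id);
		free(head);
		head = next;
	}
}

/* Stand-in for release_pages(): decide under "the lock", free after it. */
static void release_items(struct item **items, int nr)
{
	struct item *to_free = NULL;
	int i;

	/* ...a lock would be taken here while refcounts/LRU state change... */
	for (i = 0; i < nr; i++) {
		items[i]->next = to_free;	/* batch instead of freeing now */
		to_free = items[i];
	}
	/* ...and dropped here, before the potentially long free loop. */

	free_item_list(to_free);
}

int main(void)
{
	struct item *items[3];
	int i;

	for (i = 0; i < 3; i++) {
		items[i] = malloc(sizeof(*items[i]));
		items[i]->id = i;
		items[i]->next = NULL;
	}
	release_items(items, 3);
	return 0;
}
```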
diff --git a/mm/swapfile.c b/mm/swapfile.c index b1cd12060723..9520592d4231 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -667,10 +667,10 @@ int try_to_free_swap(struct page *page) | |||
667 | * original page might be freed under memory pressure, then | 667 | * original page might be freed under memory pressure, then |
668 | * later read back in from swap, now with the wrong data. | 668 | * later read back in from swap, now with the wrong data. |
669 | * | 669 | * |
670 | * Hibernation clears bits from gfp_allowed_mask to prevent | 670 | * Hibernation suspends storage while it is writing the image |
671 | * memory reclaim from writing to disk, so check that here. | 671 | * to disk, so check that here. |
672 | */ | 672 | */ |
673 | if (!(gfp_allowed_mask & __GFP_IO)) | 673 | if (pm_suspended_storage()) |
674 | return 0; | 674 | return 0; |
675 | 675 | ||
676 | delete_from_swap_cache(page); | 676 | delete_from_swap_cache(page); |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 21fdf46ad5aa..877ca046f43d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -256,7 +256,7 @@ struct vmap_area { | |||
256 | struct rb_node rb_node; /* address sorted rbtree */ | 256 | struct rb_node rb_node; /* address sorted rbtree */ |
257 | struct list_head list; /* address sorted list */ | 257 | struct list_head list; /* address sorted list */ |
258 | struct list_head purge_list; /* "lazy purge" list */ | 258 | struct list_head purge_list; /* "lazy purge" list */ |
259 | void *private; | 259 | struct vm_struct *vm; |
260 | struct rcu_head rcu_head; | 260 | struct rcu_head rcu_head; |
261 | }; | 261 | }; |
262 | 262 | ||
@@ -1285,7 +1285,7 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | |||
1285 | vm->addr = (void *)va->va_start; | 1285 | vm->addr = (void *)va->va_start; |
1286 | vm->size = va->va_end - va->va_start; | 1286 | vm->size = va->va_end - va->va_start; |
1287 | vm->caller = caller; | 1287 | vm->caller = caller; |
1288 | va->private = vm; | 1288 | va->vm = vm; |
1289 | va->flags |= VM_VM_AREA; | 1289 | va->flags |= VM_VM_AREA; |
1290 | } | 1290 | } |
1291 | 1291 | ||
@@ -1408,7 +1408,7 @@ static struct vm_struct *find_vm_area(const void *addr) | |||
1408 | 1408 | ||
1409 | va = find_vmap_area((unsigned long)addr); | 1409 | va = find_vmap_area((unsigned long)addr); |
1410 | if (va && va->flags & VM_VM_AREA) | 1410 | if (va && va->flags & VM_VM_AREA) |
1411 | return va->private; | 1411 | return va->vm; |
1412 | 1412 | ||
1413 | return NULL; | 1413 | return NULL; |
1414 | } | 1414 | } |
@@ -1427,7 +1427,7 @@ struct vm_struct *remove_vm_area(const void *addr) | |||
1427 | 1427 | ||
1428 | va = find_vmap_area((unsigned long)addr); | 1428 | va = find_vmap_area((unsigned long)addr); |
1429 | if (va && va->flags & VM_VM_AREA) { | 1429 | if (va && va->flags & VM_VM_AREA) { |
1430 | struct vm_struct *vm = va->private; | 1430 | struct vm_struct *vm = va->vm; |
1431 | 1431 | ||
1432 | if (!(vm->flags & VM_UNLIST)) { | 1432 | if (!(vm->flags & VM_UNLIST)) { |
1433 | struct vm_struct *tmp, **p; | 1433 | struct vm_struct *tmp, **p; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 11adc890ce30..26f4a8a4e0c7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -715,7 +715,13 @@ static enum page_references page_check_references(struct page *page, | |||
715 | */ | 715 | */ |
716 | SetPageReferenced(page); | 716 | SetPageReferenced(page); |
717 | 717 | ||
718 | if (referenced_page) | 718 | if (referenced_page || referenced_ptes > 1) |
719 | return PAGEREF_ACTIVATE; | ||
720 | |||
721 | /* | ||
722 | * Activate file-backed executable pages after first usage. | ||
723 | */ | ||
724 | if (vm_flags & VM_EXEC) | ||
719 | return PAGEREF_ACTIVATE; | 725 | return PAGEREF_ACTIVATE; |
720 | 726 | ||
721 | return PAGEREF_KEEP; | 727 | return PAGEREF_KEEP; |
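The mm/vmscan.c change above broadens activation: a referenced page is promoted not only when its referenced flag was set but also when more than one pte referenced it, and executable file pages are activated after their first use. A compact sketch of the resulting decision (the enum, parameter names, and the simplified fallthrough are invented for the example; the real function further splits the reclaim case):

```c
#include <stdio.h>
#include <stdbool.h>

enum toy_ref { TOY_ACTIVATE, TOY_KEEP, TOY_RECLAIM };

/*
 * Simplified version of the "was it referenced?" branch after this patch.
 * referenced_ptes: how many ptes had their accessed bit set.
 * referenced_page: the page's referenced flag was set.
 * is_exec:         the mapping is executable (VM_EXEC).
 */
static enum toy_ref toy_page_check_references(int referenced_ptes,
					      bool referenced_page,
					      bool is_exec)
{
	if (referenced_ptes) {
		if (referenced_page || referenced_ptes > 1)
			return TOY_ACTIVATE;

		/* Activate file-backed executable pages after first usage. */
		if (is_exec)
			return TOY_ACTIVATE;

		return TOY_KEEP;	/* give it one more trip around the LRU */
	}

	/* The real code distinguishes clean vs. dirty reclaim here. */
	return TOY_RECLAIM;
}

int main(void)
{
	printf("2 ptes, cold page:      %d\n", toy_page_check_references(2, false, false));
	printf("1 pte, executable page: %d\n", toy_page_check_references(1, false, true));
	printf("unreferenced page:      %d\n", toy_page_check_references(0, false, false));
	return 0;
}
```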
@@ -728,24 +734,6 @@ static enum page_references page_check_references(struct page *page, | |||
728 | return PAGEREF_RECLAIM; | 734 | return PAGEREF_RECLAIM; |
729 | } | 735 | } |
730 | 736 | ||
731 | static noinline_for_stack void free_page_list(struct list_head *free_pages) | ||
732 | { | ||
733 | struct pagevec freed_pvec; | ||
734 | struct page *page, *tmp; | ||
735 | |||
736 | pagevec_init(&freed_pvec, 1); | ||
737 | |||
738 | list_for_each_entry_safe(page, tmp, free_pages, lru) { | ||
739 | list_del(&page->lru); | ||
740 | if (!pagevec_add(&freed_pvec, page)) { | ||
741 | __pagevec_free(&freed_pvec); | ||
742 | pagevec_reinit(&freed_pvec); | ||
743 | } | ||
744 | } | ||
745 | |||
746 | pagevec_free(&freed_pvec); | ||
747 | } | ||
748 | |||
749 | /* | 737 | /* |
750 | * shrink_page_list() returns the number of reclaimed pages | 738 | * shrink_page_list() returns the number of reclaimed pages |
751 | */ | 739 | */ |
@@ -1009,7 +997,7 @@ keep_lumpy: | |||
1009 | if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) | 997 | if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) |
1010 | zone_set_flag(zone, ZONE_CONGESTED); | 998 | zone_set_flag(zone, ZONE_CONGESTED); |
1011 | 999 | ||
1012 | free_page_list(&free_pages); | 1000 | free_hot_cold_page_list(&free_pages, 1); |
1013 | 1001 | ||
1014 | list_splice(&ret_pages, page_list); | 1002 | list_splice(&ret_pages, page_list); |
1015 | count_vm_events(PGACTIVATE, pgactivate); | 1003 | count_vm_events(PGACTIVATE, pgactivate); |
@@ -1178,14 +1166,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1178 | * anon page which don't already have a swap slot is | 1166 | * anon page which don't already have a swap slot is |
1179 | * pointless. | 1167 | * pointless. |
1180 | */ | 1168 | */ |
1181 | if (nr_swap_pages <= 0 && PageAnon(cursor_page) && | 1169 | if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) && |
1182 | !PageSwapCache(cursor_page)) | 1170 | !PageSwapCache(cursor_page)) |
1183 | break; | 1171 | break; |
1184 | 1172 | ||
1185 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { | 1173 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { |
1186 | list_move(&cursor_page->lru, dst); | 1174 | list_move(&cursor_page->lru, dst); |
1187 | mem_cgroup_del_lru(cursor_page); | 1175 | mem_cgroup_del_lru(cursor_page); |
1188 | nr_taken += hpage_nr_pages(page); | 1176 | nr_taken += hpage_nr_pages(cursor_page); |
1189 | nr_lumpy_taken++; | 1177 | nr_lumpy_taken++; |
1190 | if (PageDirty(cursor_page)) | 1178 | if (PageDirty(cursor_page)) |
1191 | nr_lumpy_dirty++; | 1179 | nr_lumpy_dirty++; |
@@ -2012,8 +2000,9 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
2012 | * inactive lists are large enough, continue reclaiming | 2000 | * inactive lists are large enough, continue reclaiming |
2013 | */ | 2001 | */ |
2014 | pages_for_compaction = (2UL << sc->order); | 2002 | pages_for_compaction = (2UL << sc->order); |
2015 | inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + | 2003 | inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); |
2016 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | 2004 | if (nr_swap_pages > 0) |
2005 | inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
2017 | if (sc->nr_reclaimed < pages_for_compaction && | 2006 | if (sc->nr_reclaimed < pages_for_compaction && |
2018 | inactive_lru_pages > pages_for_compaction) | 2007 | inactive_lru_pages > pages_for_compaction) |
2019 | return true; | 2008 | return true; |
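The should_continue_reclaim() tweak above stops counting inactive anonymous pages as remaining reclaim candidates when there is no swap space to move them to. A tiny sketch of the adjusted counting (function and parameter names invented):

```c
#include <stdio.h>

/* Pages reclaim could still usefully scan before falling back to compaction. */
static unsigned long toy_reclaimable_inactive(unsigned long inactive_file,
					      unsigned long inactive_anon,
					      long nr_swap_pages)
{
	unsigned long pages = inactive_file;

	/* Anonymous pages only count if there is swap to put them in. */
	if (nr_swap_pages > 0)
		pages += inactive_anon;
	return pages;
}

int main(void)
{
	printf("with swap:    %lu\n", toy_reclaimable_inactive(1000, 4000, 8192));
	printf("without swap: %lu\n", toy_reclaimable_inactive(1000, 4000, 0));
	return 0;
}
```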
@@ -3448,9 +3437,10 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) | |||
3448 | static void warn_scan_unevictable_pages(void) | 3437 | static void warn_scan_unevictable_pages(void) |
3449 | { | 3438 | { |
3450 | printk_once(KERN_WARNING | 3439 | printk_once(KERN_WARNING |
3451 | "The scan_unevictable_pages sysctl/node-interface has been " | 3440 | "%s: The scan_unevictable_pages sysctl/node-interface has been " |
3452 | "disabled for lack of a legitimate use case. If you have " | 3441 | "disabled for lack of a legitimate use case. If you have " |
3453 | "one, please send an email to linux-mm@kvack.org.\n"); | 3442 | "one, please send an email to linux-mm@kvack.org.\n", |
3443 | current->comm); | ||
3454 | } | 3444 | } |
3455 | 3445 | ||
3456 | /* | 3446 | /* |