author		Pekka Enberg <penberg@kernel.org>	2011-03-11 11:10:45 -0500
committer	Pekka Enberg <penberg@kernel.org>	2011-03-11 11:10:45 -0500
commit		c9149556756d56c68451a4a8735c37e7062fd3d7 (patch)
tree		a2dae56b22adaa9a23c8f92f30c3b3ad3b610850 /mm
parent		d71f606f687ef9d0cdddfd3619ca7cb9a0b3fb63 (diff)
parent		5bfe53a77e8a3ffce4a10003c75f464a138e272d (diff)
Merge branch 'slab/rcu' into slab/next
Conflicts: mm/slub.c
Diffstat (limited to 'mm')
-rw-r--r--	mm/Kconfig		|  2
-rw-r--r--	mm/huge_memory.c	| 72
-rw-r--r--	mm/kmemleak-test.c	|  6
-rw-r--r--	mm/kmemleak.c		| 13
-rw-r--r--	mm/memblock.c		|  2
-rw-r--r--	mm/memcontrol.c		| 98
-rw-r--r--	mm/memory-failure.c	| 94
-rw-r--r--	mm/memory.c		| 34
-rw-r--r--	mm/mempolicy.c		| 16
-rw-r--r--	mm/migrate.c		| 15
-rw-r--r--	mm/mlock.c		|  7
-rw-r--r--	mm/mremap.c		|  4
-rw-r--r--	mm/page_alloc.c		| 23
-rw-r--r--	mm/pgtable-generic.c	|  1
-rw-r--r--	mm/slab.c		| 39
-rw-r--r--	mm/slub.c		| 77
-rw-r--r--	mm/swapfile.c		|  2
-rw-r--r--	mm/truncate.c		|  2
-rw-r--r--	mm/vmscan.c		| 39
19 files changed, 356 insertions(+), 190 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 3ad483bdf505..e9c0c61f2ddd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS
 config COMPACTION
 	bool "Allow for memory compaction"
 	select MIGRATION
-	depends on EXPERIMENTAL && HUGETLB_PAGE && MMU
+	depends on MMU
 	help
 	  Allows the compaction of memory for the allocation of huge pages.
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e187454d82f6..dbe99a5f2073 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -650,10 +650,10 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag)
 
 static inline struct page *alloc_hugepage_vma(int defrag,
 					       struct vm_area_struct *vma,
-					       unsigned long haddr)
+					       unsigned long haddr, int nd)
 {
 	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
-			       HPAGE_PMD_ORDER, vma, haddr);
+			       HPAGE_PMD_ORDER, vma, haddr, nd);
 }
 
 #ifndef CONFIG_NUMA
@@ -678,7 +678,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(khugepaged_enter(vma)))
 		return VM_FAULT_OOM;
 	page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-				  vma, haddr);
+				  vma, haddr, numa_node_id());
 	if (unlikely(!page))
 		goto out;
 	if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
@@ -799,8 +799,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	}
 
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
-					  vma, address);
+		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE,
+					       vma, address, page_to_nid(page));
 		if (unlikely(!pages[i] ||
 			     mem_cgroup_newpage_charge(pages[i], mm,
 						       GFP_KERNEL))) {
@@ -902,7 +902,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow())
 		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-					      vma, haddr);
+					      vma, haddr, numa_node_id());
 	else
 		new_page = NULL;
 
@@ -1162,7 +1162,12 @@ static void __split_huge_page_refcount(struct page *page)
 	/* after clearing PageTail the gup refcount can be released */
 	smp_mb();
 
-	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+	/*
+	 * retain hwpoison flag of the poisoned tail page:
+	 * fix for the unsuitable process killed on Guest Machine(KVM)
+	 * by the memory-failure.
+	 */
+	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
 	page_tail->flags |= (page->flags &
 			     ((1L << PG_referenced) |
 			      (1L << PG_swapbacked) |
@@ -1740,7 +1745,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 static void collapse_huge_page(struct mm_struct *mm,
 			       unsigned long address,
 			       struct page **hpage,
-			       struct vm_area_struct *vma)
+			       struct vm_area_struct *vma,
+			       int node)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -1768,7 +1774,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 * mmap_sem in read mode is good idea also to allow greater
 	 * scalability.
 	 */
-	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+				      node);
 	if (unlikely(!new_page)) {
 		up_read(&mm->mmap_sem);
 		*hpage = ERR_PTR(-ENOMEM);
@@ -1806,6 +1813,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	/* VM_PFNMAP vmas may have vm_ops null but vm_file set */
 	if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
 		goto out;
+	if (is_vma_temporary_stack(vma))
+		goto out;
 	VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
 
 	pgd = pgd_offset(mm, address);
@@ -1847,7 +1856,6 @@ static void collapse_huge_page(struct mm_struct *mm,
 		set_pmd_at(mm, address, pmd, _pmd);
 		spin_unlock(&mm->page_table_lock);
 		anon_vma_unlock(vma->anon_vma);
-		mem_cgroup_uncharge_page(new_page);
 		goto out;
 	}
 
@@ -1893,6 +1901,7 @@ out_up_write:
 	return;
 
 out:
+	mem_cgroup_uncharge_page(new_page);
 #ifdef CONFIG_NUMA
 	put_page(new_page);
 #endif
@@ -1912,6 +1921,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	struct page *page;
 	unsigned long _address;
 	spinlock_t *ptl;
+	int node = -1;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -1942,6 +1952,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		page = vm_normal_page(vma, _address, pteval);
 		if (unlikely(!page))
 			goto out_unmap;
+		/*
+		 * Chose the node of the first page. This could
+		 * be more sophisticated and look at more pages,
+		 * but isn't for now.
+		 */
+		if (node == -1)
+			node = page_to_nid(page);
 		VM_BUG_ON(PageCompound(page));
 		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
 			goto out_unmap;
@@ -1958,7 +1975,7 @@ out_unmap:
 	pte_unmap_unlock(pte, ptl);
 	if (ret)
 		/* collapse_huge_page will return with the mmap_sem released */
-		collapse_huge_page(mm, address, hpage, vma);
+		collapse_huge_page(mm, address, hpage, vma, node);
 out:
 	return ret;
 }
@@ -2027,32 +2044,27 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 		if ((!(vma->vm_flags & VM_HUGEPAGE) &&
 		     !khugepaged_always()) ||
 		    (vma->vm_flags & VM_NOHUGEPAGE)) {
+		skip:
 			progress++;
 			continue;
 		}
-
 		/* VM_PFNMAP vmas may have vm_ops null but vm_file set */
-		if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
-			khugepaged_scan.address = vma->vm_end;
-			progress++;
-			continue;
-		}
+		if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+			goto skip;
+		if (is_vma_temporary_stack(vma))
+			goto skip;
+
 		VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
 
 		hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
 		hend = vma->vm_end & HPAGE_PMD_MASK;
-		if (hstart >= hend) {
-			progress++;
-			continue;
-		}
+		if (hstart >= hend)
+			goto skip;
+		if (khugepaged_scan.address > hend)
+			goto skip;
 		if (khugepaged_scan.address < hstart)
 			khugepaged_scan.address = hstart;
-		if (khugepaged_scan.address > hend) {
-			khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
-			progress++;
-			continue;
-		}
-		BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+		VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
 
 		while (khugepaged_scan.address < hend) {
 			int ret;
@@ -2081,7 +2093,7 @@ breakouterloop:
 breakouterloop_mmap_sem:
 
 	spin_lock(&khugepaged_mm_lock);
-	BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+	VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
 	/*
 	 * Release the current mm_slot if this mm is about to die, or
 	 * if we scanned all vmas of this mm.
@@ -2236,9 +2248,9 @@ static int khugepaged(void *none)
 
 	for (;;) {
 		mutex_unlock(&khugepaged_mutex);
-		BUG_ON(khugepaged_thread != current);
+		VM_BUG_ON(khugepaged_thread != current);
 		khugepaged_loop();
-		BUG_ON(khugepaged_thread != current);
+		VM_BUG_ON(khugepaged_thread != current);
 
 		mutex_lock(&khugepaged_mutex);
 		if (!khugepaged_enabled())
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index 177a5169bbde..ff0d9779cec8 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void)
 	 * after the module is removed.
 	 */
 	for (i = 0; i < 10; i++) {
-		elem = kmalloc(sizeof(*elem), GFP_KERNEL);
-		pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem);
+		elem = kzalloc(sizeof(*elem), GFP_KERNEL);
+		pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem);
 		if (!elem)
 			return -ENOMEM;
-		memset(elem, 0, sizeof(*elem));
 		INIT_LIST_HEAD(&elem->list);
-
 		list_add_tail(&elem->list, &test_list);
 	}
 
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index bd9bc214091b..84225f3b7190 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -113,7 +113,9 @@
 #define BYTES_PER_POINTER	sizeof(void *)
 
 /* GFP bitmask for kmemleak internal allocations */
-#define GFP_KMEMLEAK_MASK	(GFP_KERNEL | GFP_ATOMIC)
+#define gfp_kmemleak_mask(gfp)	(((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
+				 __GFP_NORETRY | __GFP_NOMEMALLOC | \
+				 __GFP_NOWARN)
 
 /* scanning area inside a memory block */
 struct kmemleak_scan_area {
@@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
 	struct kmemleak_object *object;
 	struct prio_tree_node *node;
 
-	object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK);
+	object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
 	if (!object) {
-		kmemleak_stop("Cannot allocate a kmemleak_object structure\n");
+		pr_warning("Cannot allocate a kmemleak_object structure\n");
+		kmemleak_disable();
 		return NULL;
 	}
 
@@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
 		return;
 	}
 
-	area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK);
+	area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
 	if (!area) {
-		kmemleak_warn("Cannot allocate a scan area\n");
+		pr_warning("Cannot allocate a scan area\n");
 		goto out;
 	}
 
diff --git a/mm/memblock.c b/mm/memblock.c
index bdba245d8afd..4618fda975a0 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -137,8 +137,6 @@ static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
 
 	BUG_ON(0 == size);
 
-	size = memblock_align_up(size, align);
-
 	/* Pump up max_addr */
 	if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
 		end = memblock.current_limit;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index db76ef726293..da53a252b259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -612,8 +612,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 	/* pagein of a big page is an event. So, ignore page size */
 	if (nr_pages > 0)
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
-	else
+	else {
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
+		nr_pages = -nr_pages; /* for event */
+	}
 
 	__this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
 
@@ -1111,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
 	return false;
 }
 
+/**
+ * mem_cgroup_check_margin - check if the memory cgroup allows charging
+ * @mem: memory cgroup to check
+ * @bytes: the number of bytes the caller intends to charge
+ *
+ * Returns a boolean value on whether @mem can be charged @bytes or
+ * whether this would exceed the limit.
+ */
+static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
+{
+	if (!res_counter_check_margin(&mem->res, bytes))
+		return false;
+	if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
+		return false;
+	return true;
+}
+
 static unsigned int get_swappiness(struct mem_cgroup *memcg)
 {
 	struct cgroup *cgrp = memcg->css.cgroup;
@@ -1832,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
 	if (likely(!ret))
 		return CHARGE_OK;
 
+		res_counter_uncharge(&mem->res, csize);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
 		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 	} else
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
-
-	if (csize > PAGE_SIZE) /* change csize and retry */
+	/*
+	 * csize can be either a huge page (HPAGE_SIZE), a batch of
+	 * regular pages (CHARGE_SIZE), or a single regular page
+	 * (PAGE_SIZE).
+	 *
+	 * Never reclaim on behalf of optional batching, retry with a
+	 * single page instead.
+	 */
+	if (csize == CHARGE_SIZE)
 		return CHARGE_RETRY;
 
 	if (!(gfp_mask & __GFP_WAIT))
 		return CHARGE_WOULDBLOCK;
 
 	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
 					      gfp_mask, flags);
+	if (mem_cgroup_check_margin(mem_over_limit, csize))
+		return CHARGE_RETRY;
 	/*
-	 * try_to_free_mem_cgroup_pages() might not give us a full
-	 * picture of reclaim. Some pages are reclaimed and might be
-	 * moved to swap cache or just unmapped from the cgroup.
-	 * Check the limit again to see if the reclaim reduced the
-	 * current usage of the cgroup before giving up
+	 * Even though the limit is exceeded at this point, reclaim
+	 * may have been able to free some pages.  Retry the charge
+	 * before killing the task.
+	 *
+	 * Only for regular pages, though: huge pages are rather
+	 * unlikely to succeed so close to the limit, and we fall back
+	 * to regular pages anyway in case of failure.
 	 */
-	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+	if (csize == PAGE_SIZE && ret)
 		return CHARGE_RETRY;
 
 	/*
@@ -2144,6 +2175,8 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
 	struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
 	unsigned long flags;
 
+	if (mem_cgroup_disabled())
+		return;
 	/*
 	 * We have no races with charge/uncharge but will have races with
 	 * page state accounting.
@@ -2233,7 +2266,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 {
 	int ret = -EINVAL;
 	unsigned long flags;
-
+	/*
+	 * The page is isolated from LRU. So, collapse function
+	 * will not handle this page. But page splitting can happen.
+	 * Do this check under compound_page_lock(). The caller should
+	 * hold it.
+	 */
 	if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
 		return -EBUSY;
 
@@ -2265,7 +2303,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	struct cgroup *cg = child->css.cgroup;
 	struct cgroup *pcg = cg->parent;
 	struct mem_cgroup *parent;
-	int charge = PAGE_SIZE;
+	int page_size = PAGE_SIZE;
 	unsigned long flags;
 	int ret;
 
@@ -2278,23 +2316,26 @@
 		goto out;
 	if (isolate_lru_page(page))
 		goto put;
-	/* The page is isolated from LRU and we have no race with splitting */
-	charge = PAGE_SIZE << compound_order(page);
+
+	if (PageTransHuge(page))
+		page_size = HPAGE_SIZE;
 
 	parent = mem_cgroup_from_cont(pcg);
-	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, charge);
+	ret = __mem_cgroup_try_charge(NULL, gfp_mask,
+				&parent, false, page_size);
 	if (ret || !parent)
 		goto put_back;
 
-	if (charge > PAGE_SIZE)
+	if (page_size > PAGE_SIZE)
 		flags = compound_lock_irqsave(page);
 
-	ret = mem_cgroup_move_account(pc, child, parent, true, charge);
+	ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
 	if (ret)
-		mem_cgroup_cancel_charge(parent, charge);
-put_back:
-	if (charge > PAGE_SIZE)
+		mem_cgroup_cancel_charge(parent, page_size);
+
+	if (page_size > PAGE_SIZE)
 		compound_unlock_irqrestore(page, flags);
+put_back:
 	putback_lru_page(page);
 put:
 	put_page(page);
@@ -2312,13 +2353,19 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask, enum charge_type ctype)
 {
 	struct mem_cgroup *mem = NULL;
+	int page_size = PAGE_SIZE;
 	struct page_cgroup *pc;
+	bool oom = true;
 	int ret;
-	int page_size = PAGE_SIZE;
 
 	if (PageTransHuge(page)) {
 		page_size <<= compound_order(page);
 		VM_BUG_ON(!PageTransHuge(page));
+		/*
+		 * Never OOM-kill a process for a huge page.  The
+		 * fault handler will fall back to regular pages.
+		 */
+		oom = false;
 	}
 
 	pc = lookup_page_cgroup(page);
@@ -2327,7 +2374,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		return 0;
 	prefetchw(pc);
 
-	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
+	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
 	if (ret || !mem)
 		return ret;
 
@@ -5013,9 +5060,9 @@ struct cgroup_subsys mem_cgroup_subsys = {
 static int __init enable_swap_account(char *s)
 {
 	/* consider enabled if no parameter or 1 is given */
-	if (!s || !strcmp(s, "1"))
+	if (!(*s) || !strcmp(s, "=1"))
 		really_do_swap_account = 1;
-	else if (!strcmp(s, "0"))
+	else if (!strcmp(s, "=0"))
 		really_do_swap_account = 0;
 	return 1;
 }
@@ -5023,7 +5070,8 @@ __setup("swapaccount", enable_swap_account);
 
 static int __init disable_swap_account(char *s)
 {
-	enable_swap_account("0");
+	printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
+	enable_swap_account("=0");
 	return 1;
 }
 __setup("noswapaccount", disable_swap_account);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 548fbd70f026..0207c2f6f8bd 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -233,8 +233,8 @@ void shake_page(struct page *p, int access)
 	}
 
 	/*
-	 * Only all shrink_slab here (which would also
-	 * shrink other caches) if access is not potentially fatal.
+	 * Only call shrink_slab here (which would also shrink other caches) if
+	 * access is not potentially fatal.
 	 */
 	if (access) {
 		int nr;
@@ -386,8 +386,6 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 	struct task_struct *tsk;
 	struct anon_vma *av;
 
-	if (!PageHuge(page) && unlikely(split_huge_page(page)))
-		return;
 	read_lock(&tasklist_lock);
 	av = page_lock_anon_vma(page);
 	if (av == NULL)	/* Not actually mapped anymore */
@@ -856,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	int ret;
 	int kill = 1;
 	struct page *hpage = compound_head(p);
+	struct page *ppage;
 
 	if (PageReserved(p) || PageSlab(p))
 		return SWAP_SUCCESS;
@@ -897,6 +896,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	}
 
 	/*
+	 * ppage: poisoned page
+	 *   if p is regular page(4k page)
+	 *        ppage == real poisoned page;
+	 *   else p is hugetlb or THP, ppage == head page.
+	 */
+	ppage = hpage;
+
+	if (PageTransHuge(hpage)) {
+		/*
+		 * Verify that this isn't a hugetlbfs head page, the check for
+		 * PageAnon is just for avoid tripping a split_huge_page
+		 * internal debug check, as split_huge_page refuses to deal with
+		 * anything that isn't an anon page. PageAnon can't go away fro
+		 * under us because we hold a refcount on the hpage, without a
+		 * refcount on the hpage. split_huge_page can't be safely called
+		 * in the first place, having a refcount on the tail isn't
+		 * enough * to be safe.
+		 */
+		if (!PageHuge(hpage) && PageAnon(hpage)) {
+			if (unlikely(split_huge_page(hpage))) {
+				/*
+				 * FIXME: if splitting THP is failed, it is
+				 * better to stop the following operation rather
+				 * than causing panic by unmapping. System might
+				 * survive if the page is freed later.
+				 */
+				printk(KERN_INFO
+					"MCE %#lx: failed to split THP\n", pfn);
+
+				BUG_ON(!PageHWPoison(p));
+				return SWAP_FAIL;
+			}
+			/* THP is split, so ppage should be the real poisoned page. */
+			ppage = p;
+		}
+	}
+
+	/*
 	 * First collect all the processes that have the page
 	 * mapped in dirty form.  This has to be done before try_to_unmap,
 	 * because ttu takes the rmap data structures down.
@@ -905,12 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * there's nothing that can be done.
 	 */
 	if (kill)
-		collect_procs(hpage, &tokill);
+		collect_procs(ppage, &tokill);
+
+	if (hpage != ppage)
+		lock_page_nosync(ppage);
 
-	ret = try_to_unmap(hpage, ttu);
+	ret = try_to_unmap(ppage, ttu);
 	if (ret != SWAP_SUCCESS)
 		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
-				pfn, page_mapcount(hpage));
+				pfn, page_mapcount(ppage));
+
+	if (hpage != ppage)
+		unlock_page(ppage);
 
 	/*
 	 * Now that the dirty bit has been propagated to the
@@ -921,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * use a more force-full uncatchable kill to prevent
 	 * any accesses to the poisoned memory.
 	 */
-	kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
+	kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
 		      ret != SWAP_SUCCESS, p, pfn);
 
 	return ret;
@@ -1022,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * The check (unnecessarily) ignores LRU pages being isolated and
 	 * walked by the page reclaim code, however that's not a big loss.
 	 */
-	if (!PageLRU(p) && !PageHuge(p))
-		shake_page(p, 0);
-	if (!PageLRU(p) && !PageHuge(p)) {
-		/*
-		 * shake_page could have turned it free.
-		 */
-		if (is_free_buddy_page(p)) {
-			action_result(pfn, "free buddy, 2nd try", DELAYED);
-			return 0;
+	if (!PageHuge(p) && !PageTransCompound(p)) {
+		if (!PageLRU(p))
+			shake_page(p, 0);
+		if (!PageLRU(p)) {
+			/*
+			 * shake_page could have turned it free.
+			 */
+			if (is_free_buddy_page(p)) {
+				action_result(pfn, "free buddy, 2nd try",
+						DELAYED);
+				return 0;
+			}
+			action_result(pfn, "non LRU", IGNORED);
+			put_page(p);
+			return -EBUSY;
 		}
-		action_result(pfn, "non LRU", IGNORED);
-		put_page(p);
-		return -EBUSY;
 	}
 
 	/*
@@ -1064,7 +1110,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * For error on the tail page, we should set PG_hwpoison
 	 * on the head page to show that the hugepage is hwpoisoned
 	 */
-	if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+	if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
 		action_result(pfn, "hugepage already hardware poisoned",
 				IGNORED);
 		unlock_page(hpage);
@@ -1295,7 +1341,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
 	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
 				true);
 	if (ret) {
-		putback_lru_pages(&pagelist);
+		struct page *page1, *page2;
+		list_for_each_entry_safe(page1, page2, &pagelist, lru)
+			put_page(page1);
+
 		pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
 			pfn, ret, page->flags);
 		if (ret > 0)
@@ -1419,6 +1468,7 @@ int soft_offline_page(struct page *page, int flags)
 	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
 				0, true);
 	if (ret) {
+		putback_lru_pages(&pagelist);
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
 			pfn, ret, page->flags);
 		if (ret > 0)
diff --git a/mm/memory.c b/mm/memory.c
index 31250faff390..5823698c2b71 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2219,7 +2219,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				 &ptl);
 	if (!pte_same(*page_table, orig_pte)) {
 		unlock_page(old_page);
-		page_cache_release(old_page);
 		goto unlock;
 	}
 	page_cache_release(old_page);
@@ -2289,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				 &ptl);
 	if (!pte_same(*page_table, orig_pte)) {
 		unlock_page(old_page);
-		page_cache_release(old_page);
 		goto unlock;
 	}
 
@@ -2367,16 +2365,6 @@ gotten:
 	}
 	__SetPageUptodate(new_page);
 
-	/*
-	 * Don't let another task, with possibly unlocked vma,
-	 * keep the mlocked page.
-	 */
-	if ((vma->vm_flags & VM_LOCKED) && old_page) {
-		lock_page(old_page);	/* for LRU manipulation */
-		clear_page_mlock(old_page);
-		unlock_page(old_page);
-	}
-
 	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
 		goto oom_free_new;
 
@@ -2444,10 +2432,20 @@ gotten:
 
 	if (new_page)
 		page_cache_release(new_page);
-	if (old_page)
-		page_cache_release(old_page);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (old_page) {
+		/*
+		 * Don't let another task, with possibly unlocked vma,
+		 * keep the mlocked page.
+		 */
+		if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+			lock_page(old_page);	/* LRU manipulation */
+			munlock_vma_page(old_page);
+			unlock_page(old_page);
+		}
+		page_cache_release(old_page);
+	}
 	return ret;
 oom_free_new:
 	page_cache_release(new_page);
@@ -2650,6 +2648,7 @@ void unmap_mapping_range(struct address_space *mapping,
 	details.last_index = ULONG_MAX;
 	details.i_mmap_lock = &mapping->i_mmap_lock;
 
+	mutex_lock(&mapping->unmap_mutex);
 	spin_lock(&mapping->i_mmap_lock);
 
 	/* Protect against endless unmapping loops */
@@ -2666,6 +2665,7 @@
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
 	spin_unlock(&mapping->i_mmap_lock);
+	mutex_unlock(&mapping->unmap_mutex);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
@@ -3053,12 +3053,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			goto out;
 		}
 		charged = 1;
-		/*
-		 * Don't let another task, with possibly unlocked vma,
-		 * keep the mlocked page.
-		 */
-		if (vma->vm_flags & VM_LOCKED)
-			clear_page_mlock(vmf.page);
 		copy_user_highpage(page, vmf.page, address, vma);
 		__SetPageUptodate(page);
 	} else {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 368fc9d23610..b53ec99f1428 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 }
 
 /* Return a zonelist indicated by gfp for node representing a mempolicy */
-static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
+static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
+	int nd)
 {
-	int nd = numa_node_id();
-
 	switch (policy->mode) {
 	case MPOL_PREFERRED:
 		if (!(policy->flags & MPOL_F_LOCAL))
@@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
 				huge_page_shift(hstate_vma(vma))), gfp_flags);
 	} else {
-		zl = policy_zonelist(gfp_flags, *mpol);
+		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
 		if ((*mpol)->mode == MPOL_BIND)
 			*nodemask = &(*mpol)->v.nodes;
 	}
@@ -1820,7 +1819,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr)
+		unsigned long addr, int node)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
@@ -1830,13 +1829,13 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
 		unsigned nid;
 
-		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
+		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
 		mpol_cond_put(pol);
 		page = alloc_page_interleave(gfp, order, nid);
 		put_mems_allowed();
 		return page;
 	}
-	zl = policy_zonelist(gfp, pol);
+	zl = policy_zonelist(gfp, pol, node);
 	if (unlikely(mpol_needs_cond_ref(pol))) {
 		/*
 		 * slow path: ref counted shared policy
@@ -1892,7 +1891,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
 	else
 		page = __alloc_pages_nodemask(gfp, order,
-			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
+			policy_zonelist(gfp, pol, numa_node_id()),
+			policy_nodemask(gfp, pol));
 	put_mems_allowed();
 	return page;
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 46fe8cc13d67..352de555626c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -772,6 +772,7 @@ uncharge:
 unlock:
 	unlock_page(page);
 
+move_newpage:
 	if (rc != -EAGAIN) {
 		/*
 		 * A page that has been migrated has all references
@@ -785,8 +786,6 @@ unlock:
 		putback_lru_page(page);
 	}
 
-move_newpage:
-
 	/*
 	 * Move the new page to the LRU. If migration was not successful
 	 * then this will free the page.
@@ -888,7 +887,7 @@ out:
  * are movable anymore because to has become empty
  * or no retryable pages exist anymore.
  * Caller should call putback_lru_pages to return pages to the LRU
- * or free list.
+ * or free list only if ret != 0.
  *
  * Return: Number of pages not migrated or error code.
  */
@@ -981,10 +980,6 @@ int migrate_huge_pages(struct list_head *from,
 	}
 	rc = 0;
 out:
-
-	list_for_each_entry_safe(page, page2, from, lru)
-		put_page(page);
-
 	if (rc)
 		return rc;
 
@@ -1292,14 +1287,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
 		return -EPERM;
 
 	/* Find the mm_struct */
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	task = pid ? find_task_by_vpid(pid) : current;
 	if (!task) {
-		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 		return -ESRCH;
 	}
 	mm = get_task_mm(task);
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	if (!mm)
 		return -EINVAL;
diff --git a/mm/mlock.c b/mm/mlock.c
index 13e81ee8be9d..c3924c7f00be 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -178,6 +178,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
 		gup_flags |= FOLL_WRITE;
 
+	/*
+	 * We want mlock to succeed for regions that have any permissions
+	 * other than PROT_NONE.
+	 */
+	if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+		gup_flags |= FOLL_FORCE;
+
 	if (vma->vm_flags & VM_LOCKED)
 		gup_flags |= FOLL_MLOCK;
 
diff --git a/mm/mremap.c b/mm/mremap.c
index 9925b6391b80..1de98d492ddc 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -94,9 +94,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		 */
 		mapping = vma->vm_file->f_mapping;
 		spin_lock(&mapping->i_mmap_lock);
-		if (new_vma->vm_truncate_count &&
-		    new_vma->vm_truncate_count != vma->vm_truncate_count)
-			new_vma->vm_truncate_count = 0;
+		new_vma->vm_truncate_count = 0;
 	}
 
 	/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 90c1439549fd..cdef1d4b4e47 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1088,8 +1088,10 @@ static void drain_pages(unsigned int cpu)
 		pset = per_cpu_ptr(zone->pageset, cpu);
 
 		pcp = &pset->pcp;
-		free_pcppages_bulk(zone, pcp->count, pcp);
-		pcp->count = 0;
+		if (pcp->count) {
+			free_pcppages_bulk(zone, pcp->count, pcp);
+			pcp->count = 0;
+		}
 		local_irq_restore(flags);
 	}
 }
@@ -2034,6 +2036,14 @@ restart:
 	 */
 	alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
+	/*
+	 * Find the true preferred zone if the allocation is unconstrained by
+	 * cpusets.
+	 */
+	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+		first_zones_zonelist(zonelist, high_zoneidx, NULL,
+					&preferred_zone);
+
 	/* This is the last chance, in general, before the goto nopage. */
 	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
 			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2192,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 
 	get_mems_allowed();
 	/* The preferred zone is used for statistics later */
-	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
+	first_zones_zonelist(zonelist, high_zoneidx,
+				nodemask ? : &cpuset_current_mems_allowed,
+				&preferred_zone);
 	if (!preferred_zone) {
 		put_mems_allowed();
 		return NULL;
@@ -5364,10 +5376,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
 	for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
 		unsigned long check = pfn + iter;
 
-		if (!pfn_valid_within(check)) {
-			iter++;
+		if (!pfn_valid_within(check))
 			continue;
-		}
+
 		page = pfn_to_page(check);
 		if (!page_count(page)) {
 			if (PageBuddy(page))
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 0369f5b3ba1b..eb663fb533e0 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -6,6 +6,7 @@
  * Copyright (C) 2010  Linus Torvalds
  */
 
+#include <linux/pagemap.h>
 #include <asm/tlb.h>
 #include <asm-generic/pgtable.h>
 
diff --git a/mm/slab.c b/mm/slab.c
index 4bab2d1a8291..7d92f08b88d7 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -191,22 +191,6 @@ typedef unsigned int kmem_bufctl_t;
 #define SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)
 
 /*
- * struct slab
- *
- * Manages the objs in a slab. Placed either at the beginning of mem allocated
- * for a slab, or allocated from an general cache.
- * Slabs are chained into three list: fully used, partial, fully free slabs.
- */
-struct slab {
-	struct list_head list;
-	unsigned long colouroff;
-	void *s_mem;		/* including colour offset */
-	unsigned int inuse;	/* num of objs active in slab */
-	kmem_bufctl_t free;
-	unsigned short nodeid;
-};
-
-/*
  * struct slab_rcu
  *
  * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
@@ -219,8 +203,6 @@ struct slab {
  *
  * rcu_read_lock before reading the address, then rcu_read_unlock after
  * taking the spinlock within the structure expected at that address.
- *
- * We assume struct slab_rcu can overlay struct slab when destroying.
  */
 struct slab_rcu {
 	struct rcu_head head;
@@ -229,6 +211,27 @@ struct slab_rcu {
 };
 
 /*
+ * struct slab
+ *
+ * Manages the objs in a slab. Placed either at the beginning of mem allocated
+ * for a slab, or allocated from an general cache.
+ * Slabs are chained into three list: fully used, partial, fully free slabs.
+ */
+struct slab {
+	union {
+		struct {
+			struct list_head list;
+			unsigned long colouroff;
+			void *s_mem;		/* including colour offset */
+			unsigned int inuse;	/* num of objs active in slab */
+			kmem_bufctl_t free;
+			unsigned short nodeid;
+		};
+		struct slab_rcu __slab_cover_slab_rcu;
+	};
+};
+
+/*
  * struct array_cache
  *
  * Purpose:
diff --git a/mm/slub.c b/mm/slub.c
index ea6f0390996f..e841d8921c22 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -305,11 +305,16 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
 	return s->size;
 }
 
+static inline int order_objects(int order, unsigned long size, int reserved)
+{
+	return ((PAGE_SIZE << order) - reserved) / size;
+}
+
 static inline struct kmem_cache_order_objects oo_make(int order,
-						unsigned long size)
+		unsigned long size, int reserved)
 {
 	struct kmem_cache_order_objects x = {
-		(order << OO_SHIFT) + (PAGE_SIZE << order) / size
+		(order << OO_SHIFT) + order_objects(order, size, reserved)
 	};
 
 	return x;
@@ -641,7 +646,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
 		return 1;
 
 	start = page_address(page);
-	length = (PAGE_SIZE << compound_order(page));
+	length = (PAGE_SIZE << compound_order(page)) - s->reserved;
 	end = start + length;
 	remainder = length % s->size;
 	if (!remainder)
@@ -722,7 +727,7 @@ static int check_slab(struct kmem_cache *s, struct page *page)
 		return 0;
 	}
 
-	maxobj = (PAGE_SIZE << compound_order(page)) / s->size;
+	maxobj = order_objects(compound_order(page), s->size, s->reserved);
 	if (page->objects > maxobj) {
 		slab_err(s, page, "objects %u > max %u",
 			s->name, page->objects, maxobj);
@@ -772,7 +777,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
 		nr++;
 	}
 
-	max_objects = (PAGE_SIZE << compound_order(page)) / s->size;
+	max_objects = order_objects(compound_order(page), s->size, s->reserved);
 	if (max_objects > MAX_OBJS_PER_PAGE)
 		max_objects = MAX_OBJS_PER_PAGE;
 
@@ -1273,21 +1278,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
 	__free_pages(page, order);
 }
 
+#define need_reserve_slab_rcu						\
+	(sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
+
 static void rcu_free_slab(struct rcu_head *h)
 {
 	struct page *page;
 
-	page = container_of((struct list_head *)h, struct page, lru);
+	if (need_reserve_slab_rcu)
+		page = virt_to_head_page(h);
+	else
+		page = container_of((struct list_head *)h, struct page, lru);
+
 	__free_slab(page->slab, page);
 }
 
 static void free_slab(struct kmem_cache *s, struct page *page)
 {
 	if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
-		/*
-		 * RCU free overloads the RCU head over the LRU
-		 */
-		struct rcu_head *head = (void *)&page->lru;
+		struct rcu_head *head;
+
+		if (need_reserve_slab_rcu) {
+			int order = compound_order(page);
+			int offset = (PAGE_SIZE << order) - s->reserved;
+
+			VM_BUG_ON(s->reserved != sizeof(*head));
+			head = page_address(page) + offset;
+		} else {
+			/*
+			 * RCU free overloads the RCU head over the LRU
+			 */
+			head = (void *)&page->lru;
+		}
 
 		call_rcu(head, rcu_free_slab);
 	} else
@@ -2012,13 +2034,13 @@ static int slub_nomerge;
  * the smallest order which will fit the object.
  */
 static inline int slab_order(int size, int min_objects,
-				int max_order, int fract_leftover)
+				int max_order, int fract_leftover, int reserved)
 {
 	int order;
 	int rem;
 	int min_order = slub_min_order;
 
-	if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE)
+	if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
 		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
 
 	for (order = max(min_order,
@@ -2027,10 +2049,10 @@ static inline int slab_order(int size, int min_objects,
 
 		unsigned long slab_size = PAGE_SIZE << order;
 
-		if (slab_size < min_objects * size)
+		if (slab_size < min_objects * size + reserved)
 			continue;
 
-		rem = slab_size % size;
+		rem = (slab_size - reserved) % size;
 
 		if (rem <= slab_size / fract_leftover)
 			break;
@@ -2040,7 +2062,7 @@ static inline int slab_order(int size, int min_objects,
 	return order;
 }
 
-static inline int calculate_order(int size)
+static inline int calculate_order(int size, int reserved)
 {
 	int order;
 	int min_objects;
@@ -2058,14 +2080,14 @@ static inline int calculate_order(int size)
 	min_objects = slub_min_objects;
 	if (!min_objects)
 		min_objects = 4 * (fls(nr_cpu_ids) + 1);
-	max_objects = (PAGE_SIZE << slub_max_order)/size;
+	max_objects = order_objects(slub_max_order, size, reserved);
 	min_objects = min(min_objects, max_objects);
 
 	while (min_objects > 1) {
 		fraction = 16;
 		while (fraction >= 4) {
 			order = slab_order(size, min_objects,
-					slub_max_order, fraction);
+					slub_max_order, fraction, reserved);
 			if (order <= slub_max_order)
 				return order;
 			fraction /= 2;
@@ -2077,14 +2099,14 @@ static inline int calculate_order(int size)
 	 * We were unable to place multiple objects in a slab. Now
 	 * lets see if we can place a single object there.
 	 */
-	order = slab_order(size, 1, slub_max_order, 1);
+	order = slab_order(size, 1, slub_max_order, 1, reserved);
 	if (order <= slub_max_order)
 		return order;
 
 	/*
 	 * Doh this slab cannot be placed using slub_max_order.
 	 */
-	order = slab_order(size, 1, MAX_ORDER, 1);
+	order = slab_order(size, 1, MAX_ORDER, 1, reserved);
 	if (order < MAX_ORDER)
 		return order;
 	return -ENOSYS;
@@ -2335,7 +2357,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 	if (forced_order >= 0)
 		order = forced_order;
 	else
-		order = calculate_order(size);
+		order = calculate_order(size, s->reserved);
 
 	if (order < 0)
 		return 0;
@@ -2353,8 +2375,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
 	/*
 	 * Determine the number of objects per slab
 	 */
-	s->oo = oo_make(order, size);
-	s->min = oo_make(get_order(size), size);
+	s->oo = oo_make(order, size, s->reserved);
+	s->min = oo_make(get_order(size), size, s->reserved);
 	if (oo_objects(s->oo) > oo_objects(s->max))
 		s->max = s->oo;
 
@@ -2373,6 +2395,10 @@ static int kmem_cache_open(struct kmem_cache *s,
 	s->objsize = size;
 	s->align = align;
 	s->flags = kmem_cache_flags(size, flags, name, ctor);
+	s->reserved = 0;
+
+	if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
+		s->reserved = sizeof(struct rcu_head);
 
 	if (!calculate_sizes(s, -1))
 		goto error;
@@ -4014,6 +4040,12 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
 }
 SLAB_ATTR_RO(destroy_by_rcu);
 
+static ssize_t reserved_show(struct kmem_cache *s, char *buf)
+{
+	return sprintf(buf, "%d\n", s->reserved);
+}
+SLAB_ATTR_RO(reserved);
+
 #ifdef CONFIG_SLUB_DEBUG
 static ssize_t slabs_show(struct kmem_cache *s, char *buf)
 {
@@ -4300,6 +4332,7 @@ static struct attribute *slab_attrs[] = {
 	&reclaim_account_attr.attr,
 	&destroy_by_rcu_attr.attr,
 	&shrink_attr.attr,
+	&reserved_attr.attr,
 #ifdef CONFIG_SLUB_DEBUG
 	&total_objects_attr.attr,
 	&slabs_attr.attr,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 07a458d72fa8..0341c5700e34 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1940,7 +1940,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
 	error = -EINVAL;
 	if (S_ISBLK(inode->i_mode)) {
-		bdev = I_BDEV(inode);
+		bdev = bdgrab(I_BDEV(inode));
 		error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
 				   sys_swapon);
 		if (error < 0) {
diff --git a/mm/truncate.c b/mm/truncate.c
index 49feb46e77b8..d64296be00d3 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -225,6 +225,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	next = start;
 	while (next <= end &&
 	       pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index = page->index;
@@ -247,6 +248,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			unlock_page(page);
 		}
 		pagevec_release(&pvec);
+		mem_cgroup_uncharge_end();
 		cond_resched();
 	}
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f5d90dedebba..6771ea70bfe7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1841,16 +1841,28 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
 		return false;
 
-	/*
-	 * If we failed to reclaim and have scanned the full list, stop.
-	 * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
-	 * faster but obviously would be less likely to succeed
-	 * allocation. If this is desirable, use GFP_REPEAT to decide
-	 * if both reclaimed and scanned should be checked or just
-	 * reclaimed
-	 */
-	if (!nr_reclaimed && !nr_scanned)
-		return false;
+	/* Consider stopping depending on scan and reclaim activity */
+	if (sc->gfp_mask & __GFP_REPEAT) {
+		/*
+		 * For __GFP_REPEAT allocations, stop reclaiming if the
+		 * full LRU list has been scanned and we are still failing
+		 * to reclaim pages. This full LRU scan is potentially
+		 * expensive but a __GFP_REPEAT caller really wants to succeed
+		 */
+		if (!nr_reclaimed && !nr_scanned)
+			return false;
+	} else {
+		/*
+		 * For non-__GFP_REPEAT allocations which can presumably
+		 * fail without consequence, stop if we failed to reclaim
+		 * any pages from the last SWAP_CLUSTER_MAX number of
+		 * pages that were scanned. This will return to the
+		 * caller faster at the risk reclaim/compaction and
+		 * the resulting allocation attempt fails
+		 */
+		if (!nr_reclaimed)
+			return false;
+	}
 
 	/*
 	 * If we have not reclaimed enough pages for compaction and the
@@ -1882,12 +1894,12 @@ static void shrink_zone(int priority, struct zone *zone,
 	unsigned long nr[NR_LRU_LISTS];
 	unsigned long nr_to_scan;
 	enum lru_list l;
-	unsigned long nr_reclaimed;
+	unsigned long nr_reclaimed, nr_scanned;
 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
-	unsigned long nr_scanned = sc->nr_scanned;
 
 restart:
 	nr_reclaimed = 0;
+	nr_scanned = sc->nr_scanned;
 	get_scan_count(zone, sc, nr, priority);
 
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2083,7 +2095,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		struct zone *preferred_zone;
 
 		first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
-					NULL, &preferred_zone);
+					&cpuset_current_mems_allowed,
+					&preferred_zone);
 		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
 	}
 }