author     Sascha Hauer <s.hauer@pengutronix.de>   2011-02-11 02:32:18 -0500
committer  Sascha Hauer <s.hauer@pengutronix.de>   2011-02-11 02:33:14 -0500
commit     f19693a17c6705e197eb24d4618060eaac1b535c (patch)
tree       fc39dc23297c0e6be730cb0dfd74a34d9c0b8bfd /mm
parent     23b120cdfae4f5c29da69de750d545bad719ead4 (diff)
parent     100b33c8bd8a3235fd0b7948338d6cbb3db3c63d (diff)
Merge commit 'v2.6.38-rc4' into imx-for-2.6.39
Conflicts:
	arch/arm/mach-mxs/clock-mx28.c

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              2
-rw-r--r--  mm/compaction.c        11
-rw-r--r--  mm/huge_memory.c       12
-rw-r--r--  mm/kmemleak-test.c      6
-rw-r--r--  mm/kmemleak.c          13
-rw-r--r--  mm/memblock.c           8
-rw-r--r--  mm/memcontrol.c       266
-rw-r--r--  mm/memory-failure.c    94
-rw-r--r--  mm/migrate.c            9
-rw-r--r--  mm/mlock.c              7
-rw-r--r--  mm/page_alloc.c        18
-rw-r--r--  mm/pgtable-generic.c    1
-rw-r--r--  mm/truncate.c          11
-rw-r--r--  mm/vmscan.c             4
14 files changed, 312 insertions, 150 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 3ad483bdf505..e9c0c61f2ddd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS
 config COMPACTION
 	bool "Allow for memory compaction"
 	select MIGRATION
-	depends on EXPERIMENTAL && HUGETLB_PAGE && MMU
+	depends on MMU
 	help
 	  Allows the compaction of memory for the allocation of huge pages.
 
diff --git a/mm/compaction.c b/mm/compaction.c
index 6d592a021072..8be430b812de 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -406,6 +406,10 @@ static int compact_finished(struct zone *zone,
 	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
 		return COMPACT_CONTINUE;
 
+	/*
+	 * order == -1 is expected when compacting via
+	 * /proc/sys/vm/compact_memory
+	 */
 	if (cc->order == -1)
 		return COMPACT_CONTINUE;
 
@@ -454,6 +458,13 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 		return COMPACT_SKIPPED;
 
 	/*
+	 * order == -1 is expected when compacting via
+	 * /proc/sys/vm/compact_memory
+	 */
+	if (order == -1)
+		return COMPACT_CONTINUE;
+
+	/*
 	 * fragmentation index determines if allocation failures are due to
 	 * low memory or external fragmentation
 	 *
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 004c9c2aac78..b6c1ce3c53b5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1162,7 +1162,12 @@ static void __split_huge_page_refcount(struct page *page)
 		/* after clearing PageTail the gup refcount can be released */
 		smp_mb();
 
-		page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+		/*
+		 * retain hwpoison flag of the poisoned tail page:
+		 *   fix for the unsuitable process killed on Guest Machine(KVM)
+		 *   by the memory-failure.
+		 */
+		page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
 		page_tail->flags |= (page->flags &
 				     ((1L << PG_referenced) |
 				      (1L << PG_swapbacked) |
@@ -1203,6 +1208,8 @@ static void __split_huge_page_refcount(struct page *page)
 		BUG_ON(!PageDirty(page_tail));
 		BUG_ON(!PageSwapBacked(page_tail));
 
+		mem_cgroup_split_huge_fixup(page, page_tail);
+
 		lru_add_page_tail(zone, page, page_tail);
 	}
 
@@ -1837,9 +1844,9 @@ static void collapse_huge_page(struct mm_struct *mm,
 	spin_lock(ptl);
 	isolated = __collapse_huge_page_isolate(vma, address, pte);
 	spin_unlock(ptl);
-	pte_unmap(pte);
 
 	if (unlikely(!isolated)) {
+		pte_unmap(pte);
 		spin_lock(&mm->page_table_lock);
 		BUG_ON(!pmd_none(*pmd));
 		set_pmd_at(mm, address, pmd, _pmd);
@@ -1856,6 +1863,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	anon_vma_unlock(vma->anon_vma);
 
 	__collapse_huge_page_copy(pte, new_page, vma, address, ptl);
+	pte_unmap(pte);
 	__SetPageUptodate(new_page);
 	pgtable = pmd_pgtable(_pmd);
 	VM_BUG_ON(page_count(pgtable) != 1);
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index 177a5169bbde..ff0d9779cec8 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void)
 	 * after the module is removed.
 	 */
 	for (i = 0; i < 10; i++) {
-		elem = kmalloc(sizeof(*elem), GFP_KERNEL);
-		pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem);
+		elem = kzalloc(sizeof(*elem), GFP_KERNEL);
+		pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem);
 		if (!elem)
 			return -ENOMEM;
-		memset(elem, 0, sizeof(*elem));
 		INIT_LIST_HEAD(&elem->list);
-
 		list_add_tail(&elem->list, &test_list);
 	}
 
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index bd9bc214091b..84225f3b7190 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -113,7 +113,9 @@
 #define BYTES_PER_POINTER	sizeof(void *)
 
 /* GFP bitmask for kmemleak internal allocations */
-#define GFP_KMEMLEAK_MASK	(GFP_KERNEL | GFP_ATOMIC)
+#define gfp_kmemleak_mask(gfp)	(((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
+				 __GFP_NORETRY | __GFP_NOMEMALLOC | \
+				 __GFP_NOWARN)
 
 /* scanning area inside a memory block */
 struct kmemleak_scan_area {
@@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
 	struct kmemleak_object *object;
 	struct prio_tree_node *node;
 
-	object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK);
+	object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
 	if (!object) {
-		kmemleak_stop("Cannot allocate a kmemleak_object structure\n");
+		pr_warning("Cannot allocate a kmemleak_object structure\n");
+		kmemleak_disable();
 		return NULL;
 	}
 
@@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
 		return;
 	}
 
-	area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK);
+	area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
 	if (!area) {
-		kmemleak_warn("Cannot allocate a scan area\n");
+		pr_warning("Cannot allocate a scan area\n");
 		goto out;
 	}
 
diff --git a/mm/memblock.c b/mm/memblock.c
index 400dc62697d7..bdba245d8afd 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -683,13 +683,13 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
 
 int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
 {
-	int idx = memblock_search(&memblock.reserved, base);
+	int idx = memblock_search(&memblock.memory, base);
 
 	if (idx == -1)
 		return 0;
-	return memblock.reserved.regions[idx].base <= base &&
-		(memblock.reserved.regions[idx].base +
-		 memblock.reserved.regions[idx].size) >= (base + size);
+	return memblock.memory.regions[idx].base <= base &&
+		(memblock.memory.regions[idx].base +
+		 memblock.memory.regions[idx].size) >= (base + size);
 }
 
 int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8ab841031436..da53a252b259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -600,23 +600,24 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
-					 struct page_cgroup *pc,
-					 bool charge)
+					 bool file, int nr_pages)
 {
-	int val = (charge) ? 1 : -1;
-
 	preempt_disable();
 
-	if (PageCgroupCache(pc))
-		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
+	if (file)
+		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
 	else
-		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
+		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
 
-	if (charge)
+	/* pagein of a big page is an event. So, ignore page size */
+	if (nr_pages > 0)
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
-	else
+	else {
 		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
-	__this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
+		nr_pages = -nr_pages; /* for event */
+	}
+
+	__this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
 
 	preempt_enable();
 }
@@ -815,7 +816,8 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
 	 * removed from global LRU.
 	 */
 	mz = page_cgroup_zoneinfo(pc);
-	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+	/* huge page split is done under lru_lock. so, we have no races. */
+	MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
 	if (mem_cgroup_is_root(pc->mem_cgroup))
 		return;
 	VM_BUG_ON(list_empty(&pc->lru));
@@ -836,13 +838,12 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
 		return;
 
 	pc = lookup_page_cgroup(page);
-	/*
-	 * Used bit is set without atomic ops but after smp_wmb().
-	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
-	 */
-	smp_rmb();
 	/* unused or root page is not rotated. */
-	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
+	if (!PageCgroupUsed(pc))
+		return;
+	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+	smp_rmb();
+	if (mem_cgroup_is_root(pc->mem_cgroup))
 		return;
 	mz = page_cgroup_zoneinfo(pc);
 	list_move(&pc->lru, &mz->lists[lru]);
@@ -857,16 +858,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 		return;
 	pc = lookup_page_cgroup(page);
 	VM_BUG_ON(PageCgroupAcctLRU(pc));
-	/*
-	 * Used bit is set without atomic ops but after smp_wmb().
-	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
-	 */
-	smp_rmb();
 	if (!PageCgroupUsed(pc))
 		return;
-
+	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+	smp_rmb();
 	mz = page_cgroup_zoneinfo(pc);
-	MEM_CGROUP_ZSTAT(mz, lru) += 1;
+	/* huge page split is done under lru_lock. so, we have no races. */
+	MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
 	SetPageCgroupAcctLRU(pc);
 	if (mem_cgroup_is_root(pc->mem_cgroup))
 		return;
@@ -1030,14 +1028,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
 		return NULL;
 
 	pc = lookup_page_cgroup(page);
-	/*
-	 * Used bit is set without atomic ops but after smp_wmb().
-	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
-	 */
-	smp_rmb();
 	if (!PageCgroupUsed(pc))
 		return NULL;
-
+	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+	smp_rmb();
 	mz = page_cgroup_zoneinfo(pc);
 	if (!mz)
 		return NULL;
@@ -1119,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
 	return false;
 }
 
+/**
+ * mem_cgroup_check_margin - check if the memory cgroup allows charging
+ * @mem: memory cgroup to check
+ * @bytes: the number of bytes the caller intends to charge
+ *
+ * Returns a boolean value on whether @mem can be charged @bytes or
+ * whether this would exceed the limit.
+ */
+static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
+{
+	if (!res_counter_check_margin(&mem->res, bytes))
+		return false;
+	if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
+		return false;
+	return true;
+}
+
 static unsigned int get_swappiness(struct mem_cgroup *memcg)
 {
 	struct cgroup *cgrp = memcg->css.cgroup;
@@ -1615,7 +1626,7 @@ void mem_cgroup_update_page_stat(struct page *page,
 	if (unlikely(!mem || !PageCgroupUsed(pc)))
 		goto out;
 	/* pc->mem_cgroup is unstable ? */
-	if (unlikely(mem_cgroup_stealed(mem))) {
+	if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
 		/* take a lock against to access pc->mem_cgroup */
 		move_lock_page_cgroup(pc, &flags);
 		need_unlock = true;
@@ -1840,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
 	if (likely(!ret))
 		return CHARGE_OK;
 
+		res_counter_uncharge(&mem->res, csize);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
 		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 	} else
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
-
-	if (csize > PAGE_SIZE) /* change csize and retry */
+	/*
+	 * csize can be either a huge page (HPAGE_SIZE), a batch of
+	 * regular pages (CHARGE_SIZE), or a single regular page
+	 * (PAGE_SIZE).
+	 *
+	 * Never reclaim on behalf of optional batching, retry with a
+	 * single page instead.
+	 */
+	if (csize == CHARGE_SIZE)
 		return CHARGE_RETRY;
 
 	if (!(gfp_mask & __GFP_WAIT))
 		return CHARGE_WOULDBLOCK;
 
 	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
 					      gfp_mask, flags);
+	if (mem_cgroup_check_margin(mem_over_limit, csize))
+		return CHARGE_RETRY;
 	/*
-	 * try_to_free_mem_cgroup_pages() might not give us a full
-	 * picture of reclaim. Some pages are reclaimed and might be
-	 * moved to swap cache or just unmapped from the cgroup.
-	 * Check the limit again to see if the reclaim reduced the
-	 * current usage of the cgroup before giving up
+	 * Even though the limit is exceeded at this point, reclaim
+	 * may have been able to free some pages.  Retry the charge
+	 * before killing the task.
+	 *
+	 * Only for regular pages, though: huge pages are rather
+	 * unlikely to succeed so close to the limit, and we fall back
+	 * to regular pages anyway in case of failure.
 	 */
-	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+	if (csize == PAGE_SIZE && ret)
 		return CHARGE_RETRY;
 
 	/*
@@ -2084,14 +2107,27 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 	return mem;
 }
 
-/*
- * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
- * USED state. If already USED, uncharge and return.
- */
-static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
-					 struct page_cgroup *pc,
-					 enum charge_type ctype)
+static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+				       struct page_cgroup *pc,
+				       enum charge_type ctype,
+				       int page_size)
 {
+	int nr_pages = page_size >> PAGE_SHIFT;
+
+	/* try_charge() can return NULL to *memcg, taking care of it. */
+	if (!mem)
+		return;
+
+	lock_page_cgroup(pc);
+	if (unlikely(PageCgroupUsed(pc))) {
+		unlock_page_cgroup(pc);
+		mem_cgroup_cancel_charge(mem, page_size);
+		return;
+	}
+	/*
+	 * we don't need page_cgroup_lock about tail pages, becase they are not
+	 * accessed by any other context at this point.
+	 */
 	pc->mem_cgroup = mem;
 	/*
 	 * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2115,43 +2151,57 @@ static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
 		break;
 	}
 
-	mem_cgroup_charge_statistics(mem, pc, true);
+	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
+	unlock_page_cgroup(pc);
+	/*
+	 * "charge_statistics" updated event counter. Then, check it.
+	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+	 * if they exceeds softlimit.
+	 */
+	memcg_check_events(mem, pc->page);
 }
 
-static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
-				       struct page_cgroup *pc,
-				       enum charge_type ctype,
-				       int page_size)
-{
-	int i;
-	int count = page_size >> PAGE_SHIFT;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
-	/* try_charge() can return NULL to *memcg, taking care of it. */
-	if (!mem)
-		return;
+#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
+			(1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
+/*
+ * Because tail pages are not marked as "used", set it. We're under
+ * zone->lru_lock, 'splitting on pmd' and compund_lock.
+ */
+void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
+{
+	struct page_cgroup *head_pc = lookup_page_cgroup(head);
+	struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
+	unsigned long flags;
 
-	lock_page_cgroup(pc);
-	if (unlikely(PageCgroupUsed(pc))) {
-		unlock_page_cgroup(pc);
-		mem_cgroup_cancel_charge(mem, page_size);
+	if (mem_cgroup_disabled())
 		return;
-	}
-
 	/*
-	 * we don't need page_cgroup_lock about tail pages, becase they are not
-	 * accessed by any other context at this point.
+	 * We have no races with charge/uncharge but will have races with
+	 * page state accounting.
 	 */
-	for (i = 0; i < count; i++)
-		____mem_cgroup_commit_charge(mem, pc + i, ctype);
+	move_lock_page_cgroup(head_pc, &flags);
 
-	unlock_page_cgroup(pc);
-	/*
-	 * "charge_statistics" updated event counter. Then, check it.
-	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
-	 * if they exceeds softlimit.
-	 */
-	memcg_check_events(mem, pc->page);
+	tail_pc->mem_cgroup = head_pc->mem_cgroup;
+	smp_wmb(); /* see __commit_charge() */
+	if (PageCgroupAcctLRU(head_pc)) {
+		enum lru_list lru;
+		struct mem_cgroup_per_zone *mz;
+
+		/*
+		 * LRU flags cannot be copied because we need to add tail
+		 *.page to LRU by generic call and our hook will be called.
+		 * We hold lru_lock, then, reduce counter directly.
+		 */
+		lru = page_lru(head);
+		mz = page_cgroup_zoneinfo(head_pc);
+		MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+	}
+	tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+	move_unlock_page_cgroup(head_pc, &flags);
 }
+#endif
 
 /**
  * __mem_cgroup_move_account - move account of the page
@@ -2171,8 +2221,11 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
  */
 
 static void __mem_cgroup_move_account(struct page_cgroup *pc,
-	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge,
+	int charge_size)
 {
+	int nr_pages = charge_size >> PAGE_SHIFT;
+
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(pc->page));
 	VM_BUG_ON(!page_is_cgroup_locked(pc));
@@ -2186,14 +2239,14 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
 		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		preempt_enable();
 	}
-	mem_cgroup_charge_statistics(from, pc, false);
+	mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
 	if (uncharge)
 		/* This is not "cancel", but cancel_charge does all we need. */
-		mem_cgroup_cancel_charge(from, PAGE_SIZE);
+		mem_cgroup_cancel_charge(from, charge_size);
 
 	/* caller should have done css_get */
 	pc->mem_cgroup = to;
-	mem_cgroup_charge_statistics(to, pc, true);
+	mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
 	/*
 	 * We charges against "to" which may not have any tasks. Then, "to"
 	 * can be under rmdir(). But in current implementation, caller of
@@ -2208,15 +2261,24 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
  * __mem_cgroup_move_account()
  */
 static int mem_cgroup_move_account(struct page_cgroup *pc,
-	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+	struct mem_cgroup *from, struct mem_cgroup *to,
+	bool uncharge, int charge_size)
 {
 	int ret = -EINVAL;
 	unsigned long flags;
+	/*
+	 * The page is isolated from LRU. So, collapse function
+	 * will not handle this page. But page splitting can happen.
+	 * Do this check under compound_page_lock(). The caller should
+	 * hold it.
+	 */
+	if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
+		return -EBUSY;
 
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
 		move_lock_page_cgroup(pc, &flags);
-		__mem_cgroup_move_account(pc, from, to, uncharge);
+		__mem_cgroup_move_account(pc, from, to, uncharge, charge_size);
 		move_unlock_page_cgroup(pc, &flags);
 		ret = 0;
 	}
@@ -2241,6 +2303,8 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	struct cgroup *cg = child->css.cgroup;
 	struct cgroup *pcg = cg->parent;
 	struct mem_cgroup *parent;
+	int page_size = PAGE_SIZE;
+	unsigned long flags;
 	int ret;
 
 	/* Is ROOT ? */
@@ -2253,15 +2317,24 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	if (isolate_lru_page(page))
 		goto put;
 
+	if (PageTransHuge(page))
+		page_size = HPAGE_SIZE;
+
 	parent = mem_cgroup_from_cont(pcg);
-	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false,
-				      PAGE_SIZE);
+	ret = __mem_cgroup_try_charge(NULL, gfp_mask,
+				      &parent, false, page_size);
 	if (ret || !parent)
 		goto put_back;
 
-	ret = mem_cgroup_move_account(pc, child, parent, true);
+	if (page_size > PAGE_SIZE)
+		flags = compound_lock_irqsave(page);
+
+	ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
 	if (ret)
-		mem_cgroup_cancel_charge(parent, PAGE_SIZE);
+		mem_cgroup_cancel_charge(parent, page_size);
+
+	if (page_size > PAGE_SIZE)
+		compound_unlock_irqrestore(page, flags);
 put_back:
 	putback_lru_page(page);
 put:
@@ -2280,13 +2353,19 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		gfp_t gfp_mask, enum charge_type ctype)
 {
 	struct mem_cgroup *mem = NULL;
+	int page_size = PAGE_SIZE;
 	struct page_cgroup *pc;
+	bool oom = true;
 	int ret;
-	int page_size = PAGE_SIZE;
 
 	if (PageTransHuge(page)) {
 		page_size <<= compound_order(page);
 		VM_BUG_ON(!PageTransHuge(page));
+		/*
+		 * Never OOM-kill a process for a huge page.  The
+		 * fault handler will fall back to regular pages.
+		 */
+		oom = false;
 	}
 
 	pc = lookup_page_cgroup(page);
@@ -2295,7 +2374,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		return 0;
 	prefetchw(pc);
 
-	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
+	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
 	if (ret || !mem)
 		return ret;
 
@@ -2546,7 +2625,6 @@ direct_uncharge:
 static struct mem_cgroup *
 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
-	int i;
 	int count;
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
@@ -2596,8 +2674,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	for (i = 0; i < count; i++)
-		mem_cgroup_charge_statistics(mem, pc + i, false);
+	mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count);
 
 	ClearPageCgroupUsed(pc);
 	/*
@@ -4844,7 +4921,7 @@ retry:
 			goto put;
 		pc = lookup_page_cgroup(page);
 		if (!mem_cgroup_move_account(pc,
-					mc.from, mc.to, false)) {
+					mc.from, mc.to, false, PAGE_SIZE)) {
 			mc.precharge--;
 			/* we uncharge from mc.from later. */
 			mc.moved_charge++;
@@ -4983,9 +5060,9 @@ struct cgroup_subsys mem_cgroup_subsys = {
 static int __init enable_swap_account(char *s)
 {
 	/* consider enabled if no parameter or 1 is given */
-	if (!s || !strcmp(s, "1"))
+	if (!(*s) || !strcmp(s, "=1"))
 		really_do_swap_account = 1;
-	else if (!strcmp(s, "0"))
+	else if (!strcmp(s, "=0"))
 		really_do_swap_account = 0;
 	return 1;
 }
@@ -4993,7 +5070,8 @@ __setup("swapaccount", enable_swap_account);
 
 static int __init disable_swap_account(char *s)
 {
-	enable_swap_account("0");
+	printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
+	enable_swap_account("=0");
 	return 1;
 }
 __setup("noswapaccount", disable_swap_account);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 548fbd70f026..0207c2f6f8bd 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -233,8 +233,8 @@ void shake_page(struct page *p, int access)
 	}
 
 	/*
-	 * Only all shrink_slab here (which would also
-	 * shrink other caches) if access is not potentially fatal.
+	 * Only call shrink_slab here (which would also shrink other caches) if
+	 * access is not potentially fatal.
 	 */
 	if (access) {
 		int nr;
@@ -386,8 +386,6 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 	struct task_struct *tsk;
 	struct anon_vma *av;
 
-	if (!PageHuge(page) && unlikely(split_huge_page(page)))
-		return;
 	read_lock(&tasklist_lock);
 	av = page_lock_anon_vma(page);
 	if (av == NULL)	/* Not actually mapped anymore */
@@ -856,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	int ret;
 	int kill = 1;
 	struct page *hpage = compound_head(p);
+	struct page *ppage;
 
 	if (PageReserved(p) || PageSlab(p))
 		return SWAP_SUCCESS;
@@ -897,6 +896,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	}
 
 	/*
+	 * ppage: poisoned page
+	 *   if p is regular page(4k page)
+	 *        ppage == real poisoned page;
+	 *   else p is hugetlb or THP, ppage == head page.
+	 */
+	ppage = hpage;
+
+	if (PageTransHuge(hpage)) {
+		/*
+		 * Verify that this isn't a hugetlbfs head page, the check for
+		 * PageAnon is just for avoid tripping a split_huge_page
+		 * internal debug check, as split_huge_page refuses to deal with
+		 * anything that isn't an anon page. PageAnon can't go away fro
+		 * under us because we hold a refcount on the hpage, without a
+		 * refcount on the hpage. split_huge_page can't be safely called
+		 * in the first place, having a refcount on the tail isn't
+		 * enough * to be safe.
+		 */
+		if (!PageHuge(hpage) && PageAnon(hpage)) {
+			if (unlikely(split_huge_page(hpage))) {
+				/*
+				 * FIXME: if splitting THP is failed, it is
+				 * better to stop the following operation rather
+				 * than causing panic by unmapping. System might
+				 * survive if the page is freed later.
+				 */
+				printk(KERN_INFO
+					"MCE %#lx: failed to split THP\n", pfn);
+
+				BUG_ON(!PageHWPoison(p));
+				return SWAP_FAIL;
+			}
+			/* THP is split, so ppage should be the real poisoned page. */
+			ppage = p;
+		}
+	}
+
+	/*
 	 * First collect all the processes that have the page
 	 * mapped in dirty form.  This has to be done before try_to_unmap,
 	 * because ttu takes the rmap data structures down.
@@ -905,12 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * there's nothing that can be done.
 	 */
 	if (kill)
-		collect_procs(hpage, &tokill);
+		collect_procs(ppage, &tokill);
+
+	if (hpage != ppage)
+		lock_page_nosync(ppage);
 
-	ret = try_to_unmap(hpage, ttu);
+	ret = try_to_unmap(ppage, ttu);
 	if (ret != SWAP_SUCCESS)
 		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
-				pfn, page_mapcount(hpage));
+				pfn, page_mapcount(ppage));
+
+	if (hpage != ppage)
+		unlock_page(ppage);
 
 	/*
 	 * Now that the dirty bit has been propagated to the
@@ -921,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * use a more force-full uncatchable kill to prevent
 	 * any accesses to the poisoned memory.
 	 */
-	kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
+	kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
 		      ret != SWAP_SUCCESS, p, pfn);
 
 	return ret;
@@ -1022,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * The check (unnecessarily) ignores LRU pages being isolated and
 	 * walked by the page reclaim code, however that's not a big loss.
 	 */
-	if (!PageLRU(p) && !PageHuge(p))
-		shake_page(p, 0);
-	if (!PageLRU(p) && !PageHuge(p)) {
-		/*
-		 * shake_page could have turned it free.
-		 */
-		if (is_free_buddy_page(p)) {
-			action_result(pfn, "free buddy, 2nd try", DELAYED);
-			return 0;
+	if (!PageHuge(p) && !PageTransCompound(p)) {
+		if (!PageLRU(p))
+			shake_page(p, 0);
+		if (!PageLRU(p)) {
+			/*
+			 * shake_page could have turned it free.
+			 */
+			if (is_free_buddy_page(p)) {
+				action_result(pfn, "free buddy, 2nd try",
+						DELAYED);
+				return 0;
+			}
+			action_result(pfn, "non LRU", IGNORED);
+			put_page(p);
+			return -EBUSY;
 		}
-		action_result(pfn, "non LRU", IGNORED);
-		put_page(p);
-		return -EBUSY;
 	}
 
 	/*
@@ -1064,7 +1110,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * For error on the tail page, we should set PG_hwpoison
 	 * on the head page to show that the hugepage is hwpoisoned
 	 */
-	if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+	if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
 		action_result(pfn, "hugepage already hardware poisoned",
 			      IGNORED);
 		unlock_page(hpage);
@@ -1295,7 +1341,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
 	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
 				true);
 	if (ret) {
-		putback_lru_pages(&pagelist);
+		struct page *page1, *page2;
+		list_for_each_entry_safe(page1, page2, &pagelist, lru)
+			put_page(page1);
+
 		pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
 			pfn, ret, page->flags);
 		if (ret > 0)
@@ -1419,6 +1468,7 @@ int soft_offline_page(struct page *page, int flags)
 	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
 								0, true);
 	if (ret) {
+		putback_lru_pages(&pagelist);
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
 			pfn, ret, page->flags);
 		if (ret > 0)
diff --git a/mm/migrate.c b/mm/migrate.c
index 46fe8cc13d67..766115253807 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -772,6 +772,7 @@ uncharge:
 unlock:
 	unlock_page(page);
 
+move_newpage:
 	if (rc != -EAGAIN) {
 		/*
 		 * A page that has been migrated has all references
@@ -785,8 +786,6 @@ unlock:
 		putback_lru_page(page);
 	}
 
-move_newpage:
-
 	/*
 	 * Move the new page to the LRU. If migration was not successful
 	 * then this will free the page.
@@ -888,7 +887,7 @@ out:
  * are movable anymore because to has become empty
  * or no retryable pages exist anymore.
  * Caller should call putback_lru_pages to return pages to the LRU
- * or free list.
+ * or free list only if ret != 0.
  *
  * Return: Number of pages not migrated or error code.
  */
@@ -981,10 +980,6 @@ int migrate_huge_pages(struct list_head *from,
 	}
 	rc = 0;
 out:
-
-	list_for_each_entry_safe(page, page2, from, lru)
-		put_page(page);
-
 	if (rc)
 		return rc;
 
diff --git a/mm/mlock.c b/mm/mlock.c
index 13e81ee8be9d..c3924c7f00be 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -178,6 +178,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
 	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
 		gup_flags |= FOLL_WRITE;
 
+	/*
+	 * We want mlock to succeed for regions that have any permissions
+	 * other than PROT_NONE.
+	 */
+	if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+		gup_flags |= FOLL_FORCE;
+
 	if (vma->vm_flags & VM_LOCKED)
 		gup_flags |= FOLL_MLOCK;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 90c1439549fd..a873e61e312e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1088,8 +1088,10 @@ static void drain_pages(unsigned int cpu)
 		pset = per_cpu_ptr(zone->pageset, cpu);
 
 		pcp = &pset->pcp;
-		free_pcppages_bulk(zone, pcp->count, pcp);
-		pcp->count = 0;
+		if (pcp->count) {
+			free_pcppages_bulk(zone, pcp->count, pcp);
+			pcp->count = 0;
+		}
 		local_irq_restore(flags);
 	}
 }
@@ -2034,6 +2036,14 @@ restart:
 	 */
 	alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
+	/*
+	 * Find the true preferred zone if the allocation is unconstrained by
+	 * cpusets.
+	 */
+	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+		first_zones_zonelist(zonelist, high_zoneidx, NULL,
+					&preferred_zone);
+
 	/* This is the last chance, in general, before the goto nopage. */
 	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
 			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2192,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 
 	get_mems_allowed();
 	/* The preferred zone is used for statistics later */
-	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
+	first_zones_zonelist(zonelist, high_zoneidx,
+				nodemask ? : &cpuset_current_mems_allowed,
+				&preferred_zone);
 	if (!preferred_zone) {
 		put_mems_allowed();
 		return NULL;
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 0369f5b3ba1b..eb663fb533e0 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -6,6 +6,7 @@
  * Copyright (C) 2010  Linus Torvalds
  */
 
+#include <linux/pagemap.h>
 #include <asm/tlb.h>
 #include <asm-generic/pgtable.h>
 
diff --git a/mm/truncate.c b/mm/truncate.c
index 3c2d5ddfa0d4..49feb46e77b8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -549,13 +549,12 @@ EXPORT_SYMBOL(truncate_pagecache);
  * @inode: inode
  * @newsize: new file size
  *
- * truncate_setsize updastes i_size update and performs pagecache
- * truncation (if necessary) for a file size updates. It will be
- * typically be called from the filesystem's setattr function when
- * ATTR_SIZE is passed in.
+ * truncate_setsize updates i_size and performs pagecache truncation (if
+ * necessary) to @newsize. It will be typically be called from the filesystem's
+ * setattr function when ATTR_SIZE is passed in.
  *
- * Must be called with inode_mutex held and after all filesystem
- * specific block truncation has been performed.
+ * Must be called with inode_mutex held and before all filesystem specific
+ * block truncation has been performed.
  */
 void truncate_setsize(struct inode *inode, loff_t newsize)
 {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 47a50962ce81..148c6e630df2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -41,7 +41,6 @@
 #include <linux/memcontrol.h>
 #include <linux/delayacct.h>
 #include <linux/sysctl.h>
-#include <linux/compaction.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -2084,7 +2083,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 			struct zone *preferred_zone;
 
 			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
-							NULL, &preferred_zone);
+						&cpuset_current_mems_allowed,
+						&preferred_zone);
 			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
 		}
 	}