Merge remote-tracking branch 'agust/next' into next

Brings some 52xx updates. Also manually merged tools/perf/perf.h.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
author: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2012-12-17 18:22:27 -0500
committer: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2012-12-17 18:22:27 -0500
commit: 376bddd34433065aeb9b9a140870537feecf90ef (patch)
tree: a40e2b84ad89f4b3ba968de65a4bf7ff6ccae835 /mm
parent: d526e85f60fce9aa2a1432cbd06e3cf20c1644c8 (diff)
parent: 667b504a2c411e4d5915a6e2260a3857ba9f797a (diff)
14 files changed, 165 insertions, 150 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 434be4ae7a04..f468185b3b28 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -198,8 +198,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
                        int order = ilog2(BITS_PER_LONG);
                        __free_pages_bootmem(pfn_to_page(start), order);
-                        fixup_zone_present_pages(page_to_nid(pfn_to_page(start)),
-                                        start, start + BITS_PER_LONG);
                        count += BITS_PER_LONG;
                        start += BITS_PER_LONG;
                } else {
@@ -210,9 +208,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
                                if (vec & 1) {
                                        page = pfn_to_page(start + off);
                                        __free_pages_bootmem(page, 0);
-                                        fixup_zone_present_pages(
-                                                page_to_nid(page),
-                                                start + off, start + off + 1);
                                        count++;
                                }
                                vec >>= 1;
@@ -226,11 +221,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
        pages = bdata->node_low_pfn - bdata->node_min_pfn;
        pages = bootmem_bootmap_pages(pages);
        count += pages;
-        while (pages--) {
+        while (pages--)
-                fixup_zone_present_pages(page_to_nid(page),
-                                page_to_pfn(page), page_to_pfn(page) + 1);
                __free_pages_bootmem(page++, 0);
-        }
        bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
diff --git a/mm/highmem.c b/mm/highmem.c
index d517cd16a6eb..2da13a5c50e2 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -98,7 +98,7 @@ struct page *kmap_to_page(void *vaddr)
 {
        unsigned long addr = (unsigned long)vaddr;
-        if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) {
+        if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
                int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
                return pte_page(pkmap_page_table[i]);
        }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7acf43bf04a2..dd39ba000b31 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1055,12 +1055,24 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
                                      struct mem_cgroup *memcg)
 {
        struct mem_cgroup_per_zone *mz;
+        struct lruvec *lruvec;
-        if (mem_cgroup_disabled())
+        if (mem_cgroup_disabled()) {
-                return &zone->lruvec;
+                lruvec = &zone->lruvec;
+                goto out;
+        }
        mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
-        return &mz->lruvec;
+        lruvec = &mz->lruvec;
+out:
+        /*
+         * Since a node can be onlined after the mem_cgroup was created,
+         * we have to be prepared to initialize lruvec->zone here;
+         * and if offlined then reonlined, we need to reinitialize it.
+         */
+        if (unlikely(lruvec->zone != zone))
+                lruvec->zone = zone;
+        return lruvec;
 }
 /*
@@ -1087,9 +1099,12 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
        struct mem_cgroup_per_zone *mz;
        struct mem_cgroup *memcg;
        struct page_cgroup *pc;
+        struct lruvec *lruvec;
-        if (mem_cgroup_disabled())
+        if (mem_cgroup_disabled()) {
-                return &zone->lruvec;
+                lruvec = &zone->lruvec;
+                goto out;
+        }
        pc = lookup_page_cgroup(page);
        memcg = pc->mem_cgroup;
@@ -1107,7 +1122,16 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
                pc->mem_cgroup = memcg = root_mem_cgroup;
        mz = page_cgroup_zoneinfo(memcg, page);
-        return &mz->lruvec;
+        lruvec = &mz->lruvec;
+out:
+        /*
+         * Since a node can be onlined after the mem_cgroup was created,
+         * we have to be prepared to initialize lruvec->zone here;
+         * and if offlined then reonlined, we need to reinitialize it.
+         */
+        if (unlikely(lruvec->zone != zone))
+                lruvec->zone = zone;
+        return lruvec;
 }
 /**
@@ -1452,17 +1476,26 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
 static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
 {
        u64 limit;
-        u64 memsw;
        limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
-        limit += total_swap_pages << PAGE_SHIFT;
-        memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
        /*
-         * If memsw is finite and limits the amount of swap space available
+         * Do not consider swap space if we cannot swap due to swappiness
-         * to this memcg, return that limit.
         */
-        return min(limit, memsw);
+        if (mem_cgroup_swappiness(memcg)) {
+                u64 memsw;
+                limit += total_swap_pages << PAGE_SHIFT;
+                memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+                /*
+                 * If memsw is finite and limits the amount of swap space
+                 * available to this memcg, return that limit.
+                 */
+                limit = min(limit, memsw);
+        }
+        return limit;
 }
 void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
@@ -3688,17 +3721,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
                                int node, int zid, enum lru_list lru)
 {
-        struct mem_cgroup_per_zone *mz;
+        struct lruvec *lruvec;
        unsigned long flags, loop;
        struct list_head *list;
        struct page *busy;
        struct zone *zone;
        zone = &NODE_DATA(node)->node_zones[zid];
-        mz = mem_cgroup_zoneinfo(memcg, node, zid);
+        lruvec = mem_cgroup_zone_lruvec(zone, memcg);
-        list = &mz->lruvec.lists[lru];
+        list = &lruvec->lists[lru];
-        loop = mz->lru_size[lru];
+        loop = mem_cgroup_get_lru_size(lruvec, lru);
        /* give some margin against EBUSY etc...*/
        loop += 256;
        busy = NULL;
@@ -4736,7 +4769,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
                mz = &pn->zoneinfo[zone];
-                lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
+                lruvec_init(&mz->lruvec);
                mz->usage_in_excess = 0;
                mz->on_tree = false;
                mz->memcg = memcg;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6c5899b9034a..8b20278be6a6 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1476,9 +1476,17 @@ int soft_offline_page(struct page *page, int flags)
 {
        int ret;
        unsigned long pfn = page_to_pfn(page);
+        struct page *hpage = compound_trans_head(page);
        if (PageHuge(page))
                return soft_offline_huge_page(page, flags);
+        if (PageTransHuge(hpage)) {
+                if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
+                        pr_info("soft offline: %#lx: failed to split THP\n",
+                                pfn);
+                        return -EBUSY;
+                }
+        }
        ret = get_any_page(page, pfn, flags);
        if (ret < 0)
diff --git a/mm/memory.c b/mm/memory.c
index fb135ba4aba9..221fc9ffcab1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2527,9 +2527,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        int ret = 0;
        int page_mkwrite = 0;
        struct page *dirty_page = NULL;
-        unsigned long mmun_start;       /* For mmu_notifiers */
+        unsigned long mmun_start = 0;   /* For mmu_notifiers */
-        unsigned long mmun_end;         /* For mmu_notifiers */
+        unsigned long mmun_end = 0;     /* For mmu_notifiers */
-        bool mmun_called = false;       /* For mmu_notifiers */
        old_page = vm_normal_page(vma, address, orig_pte);
        if (!old_page) {
@@ -2708,8 +2707,7 @@ gotten:
                goto oom_free_new;
        mmun_start  = address & PAGE_MASK;
-        mmun_end    = (address & PAGE_MASK) + PAGE_SIZE;
+        mmun_end    = mmun_start + PAGE_SIZE;
-        mmun_called = true;
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
        /*
@@ -2778,7 +2776,7 @@ gotten:
                page_cache_release(new_page);
 unlock:
        pte_unmap_unlock(page_table, ptl);
-        if (mmun_called)
+        if (mmun_end > mmun_start)
                mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
        if (old_page) {
                /*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 56b758ae57d2..e4eeacae2b91 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,7 +106,6 @@ static void get_page_bootmem(unsigned long info,  struct page *page,
 void __ref put_page_bootmem(struct page *page)
 {
        unsigned long type;
-        struct zone *zone;
        type = (unsigned long) page->lru.next;
        BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -117,12 +116,6 @@ void __ref put_page_bootmem(struct page *page)
                set_page_private(page, 0);
                INIT_LIST_HEAD(&page->lru);
                __free_pages_bootmem(page, 0);
-                zone = page_zone(page);
-                zone_span_writelock(zone);
-                zone->present_pages++;
-                zone_span_writeunlock(zone);
-                totalram_pages++;
        }
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 2d942353d681..9a796c41e7d9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -334,8 +334,10 @@ void validate_mm(struct mm_struct *mm)
        struct vm_area_struct *vma = mm->mmap;
        while (vma) {
                struct anon_vma_chain *avc;
+                vma_lock_anon_vma(vma);
                list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                        anon_vma_interval_tree_verify(avc);
+                vma_unlock_anon_vma(vma);
                vma = vma->vm_next;
                i++;
        }
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 3cef80f6ac79..4596d81b89b1 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pfn,
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-void lruvec_init(struct lruvec *lruvec, struct zone *zone)
+void lruvec_init(struct lruvec *lruvec)
 {
        enum lru_list lru;
@@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone)
        for_each_lru(lru)
                INIT_LIST_HEAD(&lruvec->lists[lru]);
-#ifdef CONFIG_MEMCG
-        lruvec->zone = zone;
-#endif
 }
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 714d5d650470..bd82f6b31411 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -116,8 +116,6 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
                return 0;
        __free_pages_memory(start_pfn, end_pfn);
-        fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT),
-                        start_pfn, end_pfn);
        return end_pfn - start_pfn;
 }
@@ -128,7 +126,6 @@ unsigned long __init free_low_memory_core_early(int nodeid)
        phys_addr_t start, end, size;
        u64 i;
-        reset_zone_present_pages();
        for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
                count += __free_memory_core(start, end);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5b74de6702e0..a8f2c87792c3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1405,7 +1405,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
        mt = get_pageblock_migratetype(page);
        if (unlikely(mt != MIGRATE_ISOLATE))
-                __mod_zone_freepage_state(zone, -(1UL << order), mt);
+                __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
        if (alloc_order != order)
                expand(zone, page, alloc_order, order,
@@ -1422,7 +1422,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
                }
        }
-        return 1UL << order;
+        return 1UL << alloc_order;
 }
 /*
@@ -2378,6 +2378,15 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
        return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
 }
+/* Returns true if the allocation is likely for THP */
+static bool is_thp_alloc(gfp_t gfp_mask, unsigned int order)
+{
+        if (order == pageblock_order &&
+            (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
+                return true;
+        return false;
+}
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -2416,7 +2425,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                goto nopage;
 restart:
-        wake_all_kswapd(order, zonelist, high_zoneidx,
+        /* The decision whether to wake kswapd for THP is made later */
+        if (!is_thp_alloc(gfp_mask, order))
+                wake_all_kswapd(order, zonelist, high_zoneidx,
                                        zone_idx(preferred_zone));
        /*
@@ -2487,15 +2498,21 @@ rebalance:
                goto got_pg;
        sync_migration = true;
-        /*
+        if (is_thp_alloc(gfp_mask, order)) {
-         * If compaction is deferred for high-order allocations, it is because
+                /*
-         * sync compaction recently failed. In this is the case and the caller
+                 * If compaction is deferred for high-order allocations, it is
-         * requested a movable allocation that does not heavily disrupt the
+                 * because sync compaction recently failed. If this is the case
-         * system then fail the allocation instead of entering direct reclaim.
+                 * and the caller requested a movable allocation that does not
-         */
+                 * heavily disrupt the system then fail the allocation instead
-        if ((deferred_compaction || contended_compaction) &&
+                 * of entering direct reclaim.
-            (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
+                 */
-                goto nopage;
+                if (deferred_compaction || contended_compaction)
+                        goto nopage;
+                /* If process is willing to reclaim/compact then wake kswapd */
+                wake_all_kswapd(order, zonelist, high_zoneidx,
+                                        zone_idx(preferred_zone));
+        }
        /* Try direct reclaim and then allocating */
        page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -4505,7 +4522,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                zone->zone_pgdat = pgdat;
                zone_pcp_init(zone);
-                lruvec_init(&zone->lruvec, zone);
+                lruvec_init(&zone->lruvec);
                if (!size)
                        continue;
@@ -6098,37 +6115,3 @@ void dump_page(struct page *page)
        dump_page_flags(page->flags);
        mem_cgroup_print_bad_page(page);
 }
-/* reset zone->present_pages */
-void reset_zone_present_pages(void)
-{
-        struct zone *z;
-        int i, nid;
-        for_each_node_state(nid, N_HIGH_MEMORY) {
-                for (i = 0; i < MAX_NR_ZONES; i++) {
-                        z = NODE_DATA(nid)->node_zones + i;
-                        z->present_pages = 0;
-                }
-        }
-}
-/* calculate zone's present pages in buddy system */
-void fixup_zone_present_pages(int nid, unsigned long start_pfn,
-                                unsigned long end_pfn)
-{
-        struct zone *z;
-        unsigned long zone_start_pfn, zone_end_pfn;
-        int i;
-        for (i = 0; i < MAX_NR_ZONES; i++) {
-                z = NODE_DATA(nid)->node_zones + i;
-                zone_start_pfn = z->zone_start_pfn;
-                zone_end_pfn = zone_start_pfn + z->spanned_pages;
-                /* if the two regions intersect */
-                if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn))
-                        z->present_pages += min(end_pfn, zone_end_pfn) -
-                                            max(start_pfn, zone_start_pfn);
-        }
-}
diff --git a/mm/shmem.c b/mm/shmem.c
index 67afba5117f2..89341b658bd0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -643,7 +643,7 @@ static void shmem_evict_inode(struct inode *inode)
                kfree(info->symlink);
        simple_xattrs_free(&info->xattrs);
-        BUG_ON(inode->i_blocks);
+        WARN_ON(inode->i_blocks);
        shmem_free_inode(inode->i_sb);
        clear_inode(inode);
 }
@@ -1145,8 +1145,20 @@ repeat:
                if (!error) {
                        error = shmem_add_to_page_cache(page, mapping, index,
                                                gfp, swp_to_radix_entry(swap));
-                        /* We already confirmed swap, and make no allocation */
+                        /*
-                        VM_BUG_ON(error);
+                         * We already confirmed swap under page lock, and make
+                         * no memory allocation here, so usually no possibility
+                         * of error; but free_swap_and_cache() only trylocks a
+                         * page, so it is just possible that the entry has been
+                         * truncated or holepunched since swap was confirmed.
+                         * shmem_undo_range() will have done some of the
+                         * unaccounting, now delete_from_swap_cache() will do
+                         * the rest (including mem_cgroup_uncharge_swapcache).
+                         * Reset swap.val? No, leave it so "failed" goes back to
+                         * "repeat": reading a hole and writing should succeed.
+                         */
+                        if (error)
+                                delete_from_swap_cache(page);
                }
                if (error)
                        goto failed;
diff --git a/mm/sparse.c b/mm/sparse.c
index fac95f2888f2..a83de2f72b30 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -617,7 +617,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
 {
        return; /* XXX: Not implemented yet */
 }
-static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
 {
 }
 #else
@@ -658,10 +658,11 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
                           get_order(sizeof(struct page) * nr_pages));
 }
-static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
 {
        unsigned long maps_section_nr, removing_section_nr, i;
        unsigned long magic;
+        struct page *page = virt_to_page(memmap);
        for (i = 0; i < nr_pages; i++, page++) {
                magic = (unsigned long) page->lru.next;
@@ -710,13 +711,10 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
         */
        if (memmap) {
-                struct page *memmap_page;
-                memmap_page = virt_to_page(memmap);
                nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
                        >> PAGE_SHIFT;
-                free_map_bootmem(memmap_page, nr_pages);
+                free_map_bootmem(memmap, nr_pages);
        }
 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 71cd288b2001..f91a25547ffe 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1494,9 +1494,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        BUG_ON(!current->mm);
        pathname = getname(specialfile);
-        err = PTR_ERR(pathname);
        if (IS_ERR(pathname))
-                goto out;
+                return PTR_ERR(pathname);
        victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
        err = PTR_ERR(victim);
@@ -1608,6 +1607,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 out_dput:
        filp_close(victim, NULL);
 out:
+        putname(pathname);
        return err;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2624edcfb420..83f4d0e85601 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1760,28 +1760,6 @@ static bool in_reclaim_compaction(struct scan_control *sc)
        return false;
 }
-#ifdef CONFIG_COMPACTION
-/*
- * If compaction is deferred for sc->order then scale the number of pages
- * reclaimed based on the number of consecutive allocation failures
- */
-static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
-                        struct lruvec *lruvec, struct scan_control *sc)
-{
-        struct zone *zone = lruvec_zone(lruvec);
-        if (zone->compact_order_failed <= sc->order)
-                pages_for_compaction <<= zone->compact_defer_shift;
-        return pages_for_compaction;
-}
-#else
-static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
-                        struct lruvec *lruvec, struct scan_control *sc)
-{
-        return pages_for_compaction;
-}
-#endif
 /*
 * Reclaim/compaction is used for high-order allocation requests. It reclaims
 * order-0 pages before compacting the zone. should_continue_reclaim() returns
@@ -1829,9 +1807,6 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
         * inactive lists are large enough, continue reclaiming
         */
        pages_for_compaction = (2UL << sc->order);
-        pages_for_compaction = scale_for_compaction(pages_for_compaction,
-                                                    lruvec, sc);
        inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
        if (nr_swap_pages > 0)
                inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
@@ -2232,9 +2207,12 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 * Throttle direct reclaimers if backing storage is backed by the network
 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
 * depleted. kswapd will continue to make progress and wake the processes
- * when the low watermark is reached
+ * when the low watermark is reached.
+ *
+ * Returns true if a fatal signal was delivered during throttling. If this
+ * happens, the page allocator should not consider triggering the OOM killer.
 */
-static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
                                        nodemask_t *nodemask)
 {
        struct zone *zone;
@@ -2249,13 +2227,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
         * processes to block on log_wait_commit().
         */
        if (current->flags & PF_KTHREAD)
-                return;
+                goto out;
+        /*
+         * If a fatal signal is pending, this process should not throttle.
+         * It should return quickly so it can exit and free its memory
+         */
+        if (fatal_signal_pending(current))
+                goto out;
        /* Check if the pfmemalloc reserves are ok */
        first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
        pgdat = zone->zone_pgdat;
        if (pfmemalloc_watermark_ok(pgdat))
-                return;
+                goto out;
        /* Account for the throttling */
        count_vm_event(PGSCAN_DIRECT_THROTTLE);
@@ -2271,12 +2256,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
        if (!(gfp_mask & __GFP_FS)) {
                wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
                        pfmemalloc_watermark_ok(pgdat), HZ);
-                return;
+                goto check_pending;
        }
        /* Throttle until kswapd wakes the process */
        wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
                pfmemalloc_watermark_ok(pgdat));
+check_pending:
+        if (fatal_signal_pending(current))
+                return true;
+out:
+        return false;
 }
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
@@ -2298,13 +2291,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                .gfp_mask = sc.gfp_mask,
        };
-        throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
        /*
-         * Do not enter reclaim if fatal signal is pending. 1 is returned so
+         * Do not enter reclaim if fatal signal was delivered while throttled.
-         * that the page allocator does not consider triggering OOM
+         * 1 is returned so that the page allocator does not OOM kill at this
+         * point.
         */
-        if (fatal_signal_pending(current))
+        if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
                return 1;
        trace_mm_vmscan_direct_reclaim_begin(order,
@@ -2422,6 +2414,19 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
        } while (memcg);
 }
+static bool zone_balanced(struct zone *zone, int order,
+                          unsigned long balance_gap, int classzone_idx)
+{
+        if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
+                                    balance_gap, classzone_idx, 0))
+                return false;
+        if (COMPACTION_BUILD && order && !compaction_suitable(zone, order))
+                return false;
+        return true;
+}
 /*
 * pgdat_balanced is used when checking if a node is balanced for high-order
 * allocations. Only zones that meet watermarks and are in a zone allowed
@@ -2500,8 +2505,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
                        continue;
                }
-                if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
+                if (!zone_balanced(zone, order, 0, i))
-                                                        i, 0))
                        all_zones_ok = false;
                else
                        balanced += zone->present_pages;
@@ -2610,8 +2614,7 @@ loop_again:
                                break;
                        }
-                        if (!zone_watermark_ok_safe(zone, order,
+                        if (!zone_balanced(zone, order, 0, 0)) {
-                                        high_wmark_pages(zone), 0, 0)) {
                                end_zone = i;
                                break;
                        } else {
@@ -2687,9 +2690,8 @@ loop_again:
                                testorder = 0;
                        if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
-                                    !zone_watermark_ok_safe(zone, testorder,
+                            !zone_balanced(zone, testorder,
-                                        high_wmark_pages(zone) + balance_gap,
+                                           balance_gap, end_zone)) {
-                                        end_zone, 0)) {
                                shrink_zone(zone, &sc);
                                reclaim_state->reclaimed_slab = 0;
@@ -2716,8 +2718,7 @@ loop_again:
                                continue;
                        }
-                        if (!zone_watermark_ok_safe(zone, testorder,
+                        if (!zone_balanced(zone, testorder, 0, end_zone)) {
-                                        high_wmark_pages(zone), end_zone, 0)) {
                                all_zones_ok = 0;
                                /*
                                 * We are still under min water mark.  This
@@ -3017,6 +3018,8 @@ static int kswapd(void *p)
                                                &balanced_classzone_idx);
                }
        }
+        current->reclaim_state = NULL;
        return 0;
 }
author	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2012-12-17 18:22:27 -0500
committer	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2012-12-17 18:22:27 -0500
commit	376bddd34433065aeb9b9a140870537feecf90ef (patch)
tree	a40e2b84ad89f4b3ba968de65a4bf7ff6ccae835 /mm
parent	d526e85f60fce9aa2a1432cbd06e3cf20c1644c8 (diff)
parent	667b504a2c411e4d5915a6e2260a3857ba9f797a (diff)