Diffstat (limited to 'mm')
-rw-r--r--	mm/memory.c	13
-rw-r--r--	mm/mempolicy.c	130
-rw-r--r--	mm/page_alloc.c	27
-rw-r--r--	mm/page_isolation.c	26
-rw-r--r--	mm/shmem.c	4
-rw-r--r--	mm/vmscan.c	111
6 files changed, 137 insertions, 174 deletions
diff --git a/mm/memory.c b/mm/memory.c
index e0a9b0ce4f10..bb1369f7b9b4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -184,10 +184,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
 		return 1;
 	}
 
+	if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
+		return 0;
+
 	batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
 	if (!batch)
 		return 0;
 
+	tlb->batch_count++;
 	batch->next = NULL;
 	batch->nr = 0;
 	batch->max = MAX_GATHER_BATCH;
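
The hunk above, together with the tlb_gather_mmu() hunk below that resets the counter, bounds how many extra page batches a single mmu_gather may chain up, so one large unmap cannot allocate GFP_NOWAIT pages without limit and then free them all in a single soft-lockup-prone burst. A minimal standalone sketch of the capped-batch pattern; the names and sizes here are illustrative, not the kernel's:

#include <stdlib.h>

#define BATCH_SLOTS 64
#define MAX_BATCH_COUNT 8	/* analogue of MAX_GATHER_BATCH_COUNT */

struct batch {
	struct batch *next;
	int nr;
	void *slots[BATCH_SLOTS];
};

struct gather {
	struct batch local;	/* always-available first batch */
	struct batch *active;
	int batch_count;
};

static void gather_init(struct gather *g)
{
	g->local.next = NULL;
	g->local.nr = 0;
	g->active = &g->local;
	g->batch_count = 0;	/* mirrors the tlb_gather_mmu() hunk */
}

/* Returns 1 if another batch is available, 0 if the caller must flush. */
static int next_batch(struct gather *g)
{
	struct batch *b;

	if (g->active->next) {
		g->active = g->active->next;
		return 1;
	}

	if (g->batch_count == MAX_BATCH_COUNT)
		return 0;	/* cap reached: force a flush instead */

	b = calloc(1, sizeof(*b));
	if (!b)
		return 0;

	g->batch_count++;
	g->active->next = b;
	g->active = b;
	return 1;
}
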
@@ -216,6 +220,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
 	tlb->local.nr = 0;
 	tlb->local.max = ARRAY_SIZE(tlb->__pages);
 	tlb->active = &tlb->local;
+	tlb->batch_count = 0;
 
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 	tlb->batch = NULL;
@@ -3706,6 +3711,14 @@ retry:
 	if (pmd_trans_huge(orig_pmd)) {
 		unsigned int dirty = flags & FAULT_FLAG_WRITE;
 
+		/*
+		 * If the pmd is splitting, return and retry the
+		 * fault.  Alternative: wait until the split is
+		 * done, and goto retry.
+		 */
+		if (pmd_trans_splitting(orig_pmd))
+			return 0;
+
 		if (pmd_numa(orig_pmd))
 			return do_huge_pmd_numa_page(mm, vma, address,
 						     orig_pmd, pmd);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d1b315e98627..e2df1c1fb41f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2132,7 +2132,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
  */
 
 /* lookup first element intersecting start-end */
-/* Caller holds sp->mutex */
+/* Caller holds sp->lock */
 static struct sp_node *
 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
 {
@@ -2196,13 +2196,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 
 	if (!sp->root.rb_node)
 		return NULL;
-	mutex_lock(&sp->mutex);
+	spin_lock(&sp->lock);
 	sn = sp_lookup(sp, idx, idx+1);
 	if (sn) {
 		mpol_get(sn->policy);
 		pol = sn->policy;
 	}
-	mutex_unlock(&sp->mutex);
+	spin_unlock(&sp->lock);
 	return pol;
 }
 
@@ -2328,6 +2328,14 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 	sp_free(n);
 }
 
+static void sp_node_init(struct sp_node *node, unsigned long start,
+			unsigned long end, struct mempolicy *pol)
+{
+	node->start = start;
+	node->end = end;
+	node->policy = pol;
+}
+
 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
 				struct mempolicy *pol)
 {
@@ -2344,10 +2352,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
 		return NULL;
 	}
 	newpol->flags |= MPOL_F_SHARED;
-
-	n->start = start;
-	n->end = end;
-	n->policy = newpol;
+	sp_node_init(n, start, end, newpol);
 
 	return n;
 }
@@ -2357,9 +2362,12 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
 				 unsigned long end, struct sp_node *new)
 {
 	struct sp_node *n;
+	struct sp_node *n_new = NULL;
+	struct mempolicy *mpol_new = NULL;
 	int ret = 0;
 
-	mutex_lock(&sp->mutex);
+restart:
+	spin_lock(&sp->lock);
 	n = sp_lookup(sp, start, end);
 	/* Take care of old policies in the same range. */
 	while (n && n->start < end) {
@@ -2372,14 +2380,16 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
 		} else {
 			/* Old policy spanning whole new range. */
 			if (n->end > end) {
-				struct sp_node *new2;
-				new2 = sp_alloc(end, n->end, n->policy);
-				if (!new2) {
-					ret = -ENOMEM;
-					goto out;
-				}
+				if (!n_new)
+					goto alloc_new;
+
+				*mpol_new = *n->policy;
+				atomic_set(&mpol_new->refcnt, 1);
+				sp_node_init(n_new, n->end, end, mpol_new);
+				sp_insert(sp, n_new);
 				n->end = start;
-				sp_insert(sp, new2);
+				n_new = NULL;
+				mpol_new = NULL;
 				break;
 			} else
 				n->end = start;
@@ -2390,9 +2400,27 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
 	}
 	if (new)
 		sp_insert(sp, new);
-out:
-	mutex_unlock(&sp->mutex);
+	spin_unlock(&sp->lock);
+	ret = 0;
+
+err_out:
+	if (mpol_new)
+		mpol_put(mpol_new);
+	if (n_new)
+		kmem_cache_free(sn_cache, n_new);
+
 	return ret;
+
+alloc_new:
+	spin_unlock(&sp->lock);
+	ret = -ENOMEM;
+	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+	if (!n_new)
+		goto err_out;
+	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+	if (!mpol_new)
+		goto err_out;
+	goto restart;
 }
 
 /**
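
The shared_policy_replace() hunk above shows why the sp->mutex to sp->lock conversion needs the restart label: kmem_cache_alloc(..., GFP_KERNEL) may sleep, which is illegal under a spinlock, so the allocations happen with the lock dropped and the lookup is redone from scratch. A userspace sketch of the same unlock/preallocate/retry shape; this hypothetical list-splitting example is an illustration, not the kernel code:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { int start, end; struct node *next; };

static pthread_spinlock_t lock;
static struct node *head;

static int split_front(int start)
{
	struct node *spare = NULL;
	int ret = 0;

restart:
	pthread_spin_lock(&lock);
	if (head && head->start < start && start < head->end) {
		if (!spare)
			goto alloc_new;	/* must not malloc under the lock */
		/* second pass: use the preallocated node directly */
		spare->start = start;
		spare->end = head->end;
		spare->next = head->next;
		head->end = start;
		head->next = spare;
		spare = NULL;
	}
	pthread_spin_unlock(&lock);
out:
	free(spare);			/* free(NULL) is a no-op */
	return ret;

alloc_new:
	pthread_spin_unlock(&lock);	/* drop the lock before sleeping */
	spare = malloc(sizeof(*spare));
	if (!spare) {
		ret = -1;
		goto out;
	}
	goto restart;			/* state may have changed: redo lookup */
}

int main(void)
{
	struct node first = { 0, 100, NULL };

	pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
	head = &first;
	if (split_front(40) == 0)
		printf("[%d,%d) [%d,%d)\n", head->start, head->end,
		       head->next->start, head->next->end);
	return 0;
}
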
@@ -2410,7 +2438,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 	int ret;
 
 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
-	mutex_init(&sp->mutex);
+	spin_lock_init(&sp->lock);
 
 	if (mpol) {
 		struct vm_area_struct pvma;
@@ -2476,14 +2504,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
 
 	if (!p->root.rb_node)
 		return;
-	mutex_lock(&p->mutex);
+	spin_lock(&p->lock);
 	next = rb_first(&p->root);
 	while (next) {
 		n = rb_entry(next, struct sp_node, nd);
 		next = rb_next(&n->nd);
 		sp_delete(p, n);
 	}
-	mutex_unlock(&p->mutex);
+	spin_unlock(&p->lock);
 }
 
 #ifdef CONFIG_NUMA_BALANCING
@@ -2595,8 +2623,7 @@ void numa_default_policy(void)
  */
 
 /*
- * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
- * Used only for mpol_parse_str() and mpol_to_str()
+ * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
  */
 static const char * const policy_modes[] =
 {
@@ -2610,28 +2637,20 @@ static const char * const policy_modes[] =
 
 #ifdef CONFIG_TMPFS
 /**
- * mpol_parse_str - parse string to mempolicy
+ * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
  * @str: string containing mempolicy to parse
  * @mpol: pointer to struct mempolicy pointer, returned on success.
- * @no_context: flag whether to "contextualize" the mempolicy
  *
  * Format of input:
  *	<mode>[=<flags>][:<nodelist>]
  *
- * if @no_context is true, save the input nodemask in w.user_nodemask in
- * the returned mempolicy.  This will be used to "clone" the mempolicy in
- * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
- * mount option.  Note that if 'static' or 'relative' mode flags were
- * specified, the input nodemask will already have been saved.  Saving
- * it again is redundant, but safe.
- *
  * On success, returns 0, else 1
  */
-int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
+int mpol_parse_str(char *str, struct mempolicy **mpol)
 {
 	struct mempolicy *new = NULL;
 	unsigned short mode;
-	unsigned short uninitialized_var(mode_flags);
+	unsigned short mode_flags;
 	nodemask_t nodes;
 	char *nodelist = strchr(str, ':');
 	char *flags = strchr(str, '=');
@@ -2719,24 +2738,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 	if (IS_ERR(new))
 		goto out;
 
-	if (no_context) {
-		/* save for contextualization */
-		new->w.user_nodemask = nodes;
-	} else {
-		int ret;
-		NODEMASK_SCRATCH(scratch);
-		if (scratch) {
-			task_lock(current);
-			ret = mpol_set_nodemask(new, &nodes, scratch);
-			task_unlock(current);
-		} else
-			ret = -ENOMEM;
-		NODEMASK_SCRATCH_FREE(scratch);
-		if (ret) {
-			mpol_put(new);
-			goto out;
-		}
-	}
+	/*
+	 * Save nodes for mpol_to_str() to show the tmpfs mount options
+	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
+	 */
+	if (mode != MPOL_PREFERRED)
+		new->v.nodes = nodes;
+	else if (nodelist)
+		new->v.preferred_node = first_node(nodes);
+	else
+		new->flags |= MPOL_F_LOCAL;
+
+	/*
+	 * Save nodes for contextualization: this will be used to "clone"
+	 * the mempolicy in a specific context [cpuset] at a later time.
+	 */
+	new->w.user_nodemask = nodes;
+
 	err = 0;
 
 out:
@@ -2756,13 +2774,12 @@ out:
  * @buffer: to contain formatted mempolicy string
  * @maxlen: length of @buffer
  * @pol: pointer to mempolicy to be formatted
- * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
  *
  * Convert a mempolicy into a string.
  * Returns the number of characters in buffer (if positive)
  * or an error (negative)
  */
-int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 {
 	char *p = buffer;
 	int l;
@@ -2788,7 +2805,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
 	case MPOL_PREFERRED:
 		nodes_clear(nodes);
 		if (flags & MPOL_F_LOCAL)
-			mode = MPOL_LOCAL;	/* pseudo-policy */
+			mode = MPOL_LOCAL;
 		else
 			node_set(pol->v.preferred_node, nodes);
 		break;
@@ -2796,10 +2813,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
 	case MPOL_BIND:
 		/* Fall through */
 	case MPOL_INTERLEAVE:
-		if (no_context)
-			nodes = pol->w.user_nodemask;
-		else
-			nodes = pol->v.nodes;
+		nodes = pol->v.nodes;
 		break;
 
 	default:
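
The mpol_parse_str() hunks drop the no_context parameter: the tmpfs path was its only remaining user, so the function now always saves the user nodemask and leaves cpuset contextualization for later. Its first step is the string split visible in the context lines (nodelist = strchr(str, ':'); flags = strchr(str, '=')). A self-contained illustration of that split for mount options such as interleave:0-3 or prefer=static:1; this is a sketch of the parsing shape only, not the kernel function:

#include <stdio.h>
#include <string.h>

static void parse_mpol(char *str)
{
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');

	if (nodelist)
		*nodelist++ = '\0';	/* terminate the mode[=flags] part */
	if (flags)
		*flags++ = '\0';	/* terminate the bare mode name */

	printf("mode=%s flags=%s nodes=%s\n",
	       str, flags ? flags : "-", nodelist ? nodelist : "-");
}

int main(void)
{
	char a[] = "interleave:0-3";
	char b[] = "prefer=static:1";

	parse_mpol(a);	/* mode=interleave flags=- nodes=0-3 */
	parse_mpol(b);	/* mode=prefer flags=static nodes=1 */
	return 0;
}
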
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4ba5e37127fc..bc6cc0e913bd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -221,11 +221,6 @@ EXPORT_SYMBOL(nr_online_nodes);
 
 int page_group_by_mobility_disabled __read_mostly;
 
-/*
- * NOTE:
- * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
- * Instead, use {un}set_pageblock_isolate.
- */
 void set_pageblock_migratetype(struct page *page, int migratetype)
 {
 
@@ -1655,20 +1650,6 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return true;
 }
 
-#ifdef CONFIG_MEMORY_ISOLATION
-static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
-{
-	if (unlikely(zone->nr_pageblock_isolate))
-		return zone->nr_pageblock_isolate * pageblock_nr_pages;
-	return 0;
-}
-#else
-static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
-{
-	return 0;
-}
-#endif
-
 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		      int classzone_idx, int alloc_flags)
 {
@@ -1684,14 +1665,6 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
 	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
 		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
 
-	/*
-	 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
-	 * it.  nr_zone_isolate_freepages is never accurate so kswapd might not
-	 * sleep although it could do so.  But this is more desirable for memory
-	 * hotplug than sleeping which can cause a livelock in the direct
-	 * reclaim path.
-	 */
-	free_pages -= nr_zone_isolate_freepages(z);
 	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
 								free_pages);
 }
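
With the nr_zone_isolate_freepages() estimate removed, zone_watermark_ok_safe() is back to its core pattern: trust the cheap global counter unless it has drifted close to the watermark, and only then recount from the per-cpu deltas. A userspace model of that pattern; the fields are illustrative, not the kernel's struct zone:

#include <stdbool.h>

#define NR_CPUS 4

struct zone_stat {
	long global_count;		/* cheap, possibly stale */
	long percpu_diff[NR_CPUS];	/* not yet folded into global */
	long drift_mark;		/* below this, staleness matters */
};

/* Stand-in for zone_page_state_snapshot(): fold in per-cpu deltas. */
static long snapshot_free_pages(const struct zone_stat *z)
{
	long x = z->global_count;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		x += z->percpu_diff[cpu];
	return x < 0 ? 0 : x;
}

static bool watermark_ok_safe(const struct zone_stat *z, long mark)
{
	long free_pages = z->global_count;

	/* Close to the mark the cached count cannot be trusted: recount. */
	if (z->drift_mark && free_pages < z->drift_mark)
		free_pages = snapshot_free_pages(z);

	return free_pages >= mark;
}
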
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 9d2264ea4606..383bdbb98b04 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -8,28 +8,6 @@
 #include <linux/memory.h>
 #include "internal.h"
 
-/* called while holding zone->lock */
-static void set_pageblock_isolate(struct page *page)
-{
-	if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
-		return;
-
-	set_pageblock_migratetype(page, MIGRATE_ISOLATE);
-	page_zone(page)->nr_pageblock_isolate++;
-}
-
-/* called while holding zone->lock */
-static void restore_pageblock_isolate(struct page *page, int migratetype)
-{
-	struct zone *zone = page_zone(page);
-	if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
-		return;
-
-	BUG_ON(zone->nr_pageblock_isolate <= 0);
-	set_pageblock_migratetype(page, migratetype);
-	zone->nr_pageblock_isolate--;
-}
-
 int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
 {
 	struct zone *zone;
@@ -80,7 +58,7 @@ out:
 		unsigned long nr_pages;
 		int migratetype = get_pageblock_migratetype(page);
 
-		set_pageblock_isolate(page);
+		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
 		nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
 
 		__mod_zone_freepage_state(zone, -nr_pages, migratetype);
@@ -103,7 +81,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 		goto out;
 	nr_pages = move_freepages_block(zone, page, migratetype);
 	__mod_zone_freepage_state(zone, nr_pages, migratetype);
-	restore_pageblock_isolate(page, migratetype);
+	set_pageblock_migratetype(page, migratetype);
 out:
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
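
Both page_isolation.c hunks go back to calling set_pageblock_migratetype() directly, dropping the zone->nr_pageblock_isolate bookkeeping; the freepage counters are adjusted by the number of pages move_freepages_block() actually moved. A rough userspace model of the isolate/unisolate pair done under the zone lock; the types and counter names are illustrative, not the kernel's:

#include <pthread.h>

enum { MIGRATE_MOVABLE, MIGRATE_ISOLATE };

struct block {
	int migratetype;
	long nr_free;		/* free pages currently in this block */
};

static pthread_mutex_t zone_lock = PTHREAD_MUTEX_INITIALIZER;
static long zone_free_pages;	/* pages the allocator may hand out */

static void isolate_block(struct block *b)
{
	pthread_mutex_lock(&zone_lock);
	b->migratetype = MIGRATE_ISOLATE;
	zone_free_pages -= b->nr_free;	/* hide them from the allocator */
	pthread_mutex_unlock(&zone_lock);
}

static void unisolate_block(struct block *b, int migratetype)
{
	pthread_mutex_lock(&zone_lock);
	zone_free_pages += b->nr_free;	/* give them back */
	b->migratetype = migratetype;
	pthread_mutex_unlock(&zone_lock);
}
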
diff --git a/mm/shmem.c b/mm/shmem.c
index 5c90d84c2b02..5dd56f6efdbd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -889,7 +889,7 @@ static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
 	if (!mpol || mpol->mode == MPOL_DEFAULT)
 		return;		/* show nothing */
 
-	mpol_to_str(buffer, sizeof(buffer), mpol, 1);
+	mpol_to_str(buffer, sizeof(buffer), mpol);
 
 	seq_printf(seq, ",mpol=%s", buffer);
 }
@@ -2463,7 +2463,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 			if (!gid_valid(sbinfo->gid))
 				goto bad_val;
 		} else if (!strcmp(this_char,"mpol")) {
-			if (mpol_parse_str(value, &sbinfo->mpol, 1))
+			if (mpol_parse_str(value, &sbinfo->mpol))
 				goto bad_val;
 		} else {
 			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
diff --git a/mm/vmscan.c b/mm/vmscan.c
index adc7e9058181..196709f5ee58 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2452,12 +2452,16 @@ static bool zone_balanced(struct zone *zone, int order,
 }
 
 /*
- * pgdat_balanced is used when checking if a node is balanced for high-order
- * allocations. Only zones that meet watermarks and are in a zone allowed
- * by the callers classzone_idx are added to balanced_pages. The total of
- * balanced pages must be at least 25% of the zones allowed by classzone_idx
- * for the node to be considered balanced. Forcing all zones to be balanced
- * for high orders can cause excessive reclaim when there are imbalanced zones.
+ * pgdat_balanced() is used when checking if a node is balanced.
+ *
+ * For order-0, all zones must be balanced!
+ *
+ * For high-order allocations only zones that meet watermarks and are in a
+ * zone allowed by the callers classzone_idx are added to balanced_pages. The
+ * total of balanced pages must be at least 25% of the zones allowed by
+ * classzone_idx for the node to be considered balanced. Forcing all zones to
+ * be balanced for high orders can cause excessive reclaim when there are
+ * imbalanced zones.
  * The choice of 25% is due to
  *	o a 16M DMA zone that is balanced will not balance a zone on any
  *	  reasonable sized machine
@@ -2467,17 +2471,43 @@ static bool zone_balanced(struct zone *zone, int order,
  *	  Similarly, on x86-64 the Normal zone would need to be at least 1G
  *	  to balance a node on its own. These seemed like reasonable ratios.
  */
-static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
-						int classzone_idx)
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 {
 	unsigned long present_pages = 0;
+	unsigned long balanced_pages = 0;
 	int i;
 
-	for (i = 0; i <= classzone_idx; i++)
-		present_pages += pgdat->node_zones[i].present_pages;
-
-	/* A special case here: if zone has no page, we think it's balanced */
-	return balanced_pages >= (present_pages >> 2);
+	/* Check the watermark levels */
+	for (i = 0; i <= classzone_idx; i++) {
+		struct zone *zone = pgdat->node_zones + i;
+
+		if (!populated_zone(zone))
+			continue;
+
+		present_pages += zone->present_pages;
+
+		/*
+		 * A special case here:
+		 *
+		 * balance_pgdat() skips over all_unreclaimable after
+		 * DEF_PRIORITY. Effectively, it considers them balanced so
+		 * they must be considered balanced here as well!
+		 */
+		if (zone->all_unreclaimable) {
+			balanced_pages += zone->present_pages;
+			continue;
+		}
+
+		if (zone_balanced(zone, order, 0, i))
+			balanced_pages += zone->present_pages;
+		else if (!order)
+			return false;
+	}
+
+	if (order)
+		return balanced_pages >= (present_pages >> 2);
+	else
+		return true;
 }
 
 /*
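
The rewritten pgdat_balanced() above folds the old order-0 "all zones ok" test and the high-order 25% test into one helper, which the hunks below then call from prepare_kswapd_sleep() and balance_pgdat(). A standalone model of that decision logic; the struct is illustrative, not the kernel's:

#include <stdbool.h>
#include <stddef.h>

struct zone_info {
	unsigned long present_pages;
	bool populated;
	bool unreclaimable;	/* skipped by reclaim, counted as balanced */
	bool meets_watermark;
};

static bool node_balanced(const struct zone_info *zones, size_t nr_zones,
			  int order)
{
	unsigned long present = 0, balanced = 0;

	for (size_t i = 0; i < nr_zones; i++) {
		const struct zone_info *z = &zones[i];

		if (!z->populated)
			continue;
		present += z->present_pages;

		if (z->unreclaimable || z->meets_watermark)
			balanced += z->present_pages;
		else if (!order)
			return false;	/* order-0: every zone must pass */
	}

	/* high-order: balanced zones must cover at least 25% of the node */
	return order ? balanced >= (present >> 2) : true;
}
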
@@ -2489,10 +2519,6 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
 static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 					int classzone_idx)
 {
-	int i;
-	unsigned long balanced = 0;
-	bool all_zones_ok = true;
-
 	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
 	if (remaining)
 		return false;
@@ -2511,39 +2537,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 		return false;
 	}
 
-	/* Check the watermark levels */
-	for (i = 0; i <= classzone_idx; i++) {
-		struct zone *zone = pgdat->node_zones + i;
-
-		if (!populated_zone(zone))
-			continue;
-
-		/*
-		 * balance_pgdat() skips over all_unreclaimable after
-		 * DEF_PRIORITY. Effectively, it considers them balanced so
-		 * they must be considered balanced here as well if kswapd
-		 * is to sleep
-		 */
-		if (zone->all_unreclaimable) {
-			balanced += zone->present_pages;
-			continue;
-		}
-
-		if (!zone_balanced(zone, order, 0, i))
-			all_zones_ok = false;
-		else
-			balanced += zone->present_pages;
-	}
-
-	/*
-	 * For high-order requests, the balanced zones must contain at least
-	 * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
-	 * must be balanced
-	 */
-	if (order)
-		return pgdat_balanced(pgdat, balanced, classzone_idx);
-	else
-		return all_zones_ok;
+	return pgdat_balanced(pgdat, order, classzone_idx);
 }
 
 /*
@@ -2571,7 +2565,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 							int *classzone_idx)
 {
 	struct zone *unbalanced_zone;
-	unsigned long balanced;
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
 	unsigned long total_scanned;
@@ -2605,7 +2598,6 @@ loop_again:
 		int has_under_min_watermark_zone = 0;
 
 		unbalanced_zone = NULL;
-		balanced = 0;
 
 		/*
 		 * Scan in the highmem->dma direction for the highest
@@ -2761,8 +2753,6 @@ loop_again:
 				 * speculatively avoid congestion waits
 				 */
 				zone_clear_flag(zone, ZONE_CONGESTED);
-				if (i <= *classzone_idx)
-					balanced += zone->present_pages;
 			}
 
 		}
@@ -2776,7 +2766,7 @@ loop_again:
 		    pfmemalloc_watermark_ok(pgdat))
 			wake_up(&pgdat->pfmemalloc_wait);
 
-		if (!unbalanced_zone || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
+		if (pgdat_balanced(pgdat, order, *classzone_idx))
 			break;		/* kswapd: all done */
 		/*
 		 * OK, kswapd is getting into trouble. Take a nap, then take
@@ -2785,7 +2775,7 @@ loop_again:
 		if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
 			if (has_under_min_watermark_zone)
 				count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
-			else
+			else if (unbalanced_zone)
 				wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
 		}
 
@@ -2800,12 +2790,7 @@ loop_again:
 	} while (--sc.priority >= 0);
 out:
 
-	/*
-	 * order-0: All zones must meet high watermark for a balanced node
-	 * high-order: Balanced zones must make up at least 25% of the node
-	 * for the node to be balanced
-	 */
-	if (unbalanced_zone && (!order || !pgdat_balanced(pgdat, balanced, *classzone_idx))) {
+	if (!pgdat_balanced(pgdat, order, *classzone_idx)) {
 		cond_resched();
 
 		try_to_freeze();
@@ -3137,8 +3122,8 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
    not required for correctness.  So if the last cpu in a node goes
    away, we get changed to run anywhere: as the first one comes back,
    restore their cpu bindings. */
-static int __devinit cpu_callback(struct notifier_block *nfb,
-				  unsigned long action, void *hcpu)
+static int cpu_callback(struct notifier_block *nfb, unsigned long action,
+			void *hcpu)
 {
 	int nid;
 