author     Mel Gorman <mgorman@techsingularity.net>        2016-07-28 18:46:50 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-07-28 19:07:41 -0400
commit     e6cbd7f2efb433d717af72aa8510a9db6f7a7e05 (patch)
tree       579fde32463c0885123d14a119dfecf968b26a16
parent     e5146b12e2d02af04608301c958d95b2fc47a0f9 (diff)
mm, page_alloc: remove fair zone allocation policy
The fair zone allocation policy interleaves allocation requests between
zones to avoid an age inversion problem whereby new pages are reclaimed
to balance a zone. Reclaim is now node-based, so this should no longer
be an issue, and the fair zone allocation policy is not free. This patch
removes it.
Link: http://lkml.kernel.org/r/1467970510-21195-30-git-send-email-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
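For context, the removed policy gave every zone a fairness batch (the NR_ALLOC_BATCH counter, sized from high_wmark_pages() - low_wmark_pages()), charged each allocation against it, skipped zones whose batch was depleted, and reset the batches once all candidate zones had been skipped. The following is a minimal user-space sketch of that round-robin batching only, under simplifying assumptions: the zone names and batch sizes are invented, and the local/remote zone handling and the rest of the allocator are left out. It is illustrative, not the kernel implementation.

#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 3

struct fake_zone {
	const char *name;
	long batch;		/* models the per-zone NR_ALLOC_BATCH counter */
	long batch_size;	/* models high_wmark_pages() - low_wmark_pages() */
};

static struct fake_zone zones[NR_ZONES] = {
	{ "DMA32",   0, 4  },
	{ "Normal",  0, 16 },
	{ "Movable", 0, 8  },
};

/* Analogous in spirit to reset_alloc_batches(): refill every batch. */
static void reset_batches(void)
{
	for (int i = 0; i < NR_ZONES; i++)
		zones[i].batch = zones[i].batch_size;
}

/*
 * Pick the first zone whose batch is not depleted and charge one page
 * against it. Once every batch is empty (ZONE_FAIR_DEPLETED in the
 * kernel), reset the batches and retry, so over time each zone serves
 * allocations in proportion to its size and pages age fairly.
 */
static struct fake_zone *alloc_one_page(void)
{
	bool fair_skipped = false;

	for (int i = 0; i < NR_ZONES; i++) {
		if (zones[i].batch <= 0) {
			fair_skipped = true;
			continue;
		}
		zones[i].batch--;
		return &zones[i];
	}
	if (fair_skipped) {
		reset_batches();
		return alloc_one_page();
	}
	return NULL;
}

int main(void)
{
	reset_batches();
	for (int i = 0; i < 32; i++)
		printf("allocation %2d -> %s\n", i, alloc_one_page()->name);
	return 0;
}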
-rw-r--r--  include/linux/mmzone.h  |  5
-rw-r--r--  mm/internal.h           |  1
-rw-r--r--  mm/page_alloc.c         | 75
-rw-r--r--  mm/vmstat.c             |  4
4 files changed, 2 insertions, 83 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e19c081c794e..bd33e6f1bed0 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -110,7 +110,6 @@ struct zone_padding {
 enum zone_stat_item {
 	/* First 128 byte cacheline (assuming 64 bit words) */
 	NR_FREE_PAGES,
-	NR_ALLOC_BATCH,
 	NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
 	NR_ZONE_LRU_ANON = NR_ZONE_LRU_BASE,
 	NR_ZONE_LRU_FILE,
@@ -516,10 +515,6 @@ struct zone {
 	atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
 } ____cacheline_internodealigned_in_smp;
 
-enum zone_flags {
-	ZONE_FAIR_DEPLETED,	/* fair zone policy batch depleted */
-};
-
 enum pgdat_flags {
 	PGDAT_CONGESTED,	/* pgdat has many dirty pages backed by
 				 * a congested BDI
diff --git a/mm/internal.h b/mm/internal.h
index 1e21b2d3838d..28932cd6a195 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -467,7 +467,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
 #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
 #define ALLOC_CMA		0x80 /* allow allocations from CMA areas */
-#define ALLOC_FAIR		0x100 /* fair zone allocation */
 
 enum ttu_flags;
 struct tlbflush_unmap_batch;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c9d1720c58a3..1dd0f1f218db 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2587,7 +2587,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 		else
 			page = list_first_entry(list, struct page, lru);
 
-		__dec_zone_state(zone, NR_ALLOC_BATCH);
 		list_del(&page->lru);
 		pcp->count--;
 
@@ -2613,15 +2612,10 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 		spin_unlock(&zone->lock);
 		if (!page)
 			goto failed;
-		__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
 		__mod_zone_freepage_state(zone, -(1 << order),
 					  get_pcppage_migratetype(page));
 	}
 
-	if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
-	    !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
-		set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
-
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone, gfp_flags);
 	local_irq_restore(flags);
@@ -2832,40 +2826,18 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 }
 
 #ifdef CONFIG_NUMA
-static bool zone_local(struct zone *local_zone, struct zone *zone)
-{
-	return local_zone->node == zone->node;
-}
-
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
 	return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
 				RECLAIM_DISTANCE;
 }
 #else /* CONFIG_NUMA */
-static bool zone_local(struct zone *local_zone, struct zone *zone)
-{
-	return true;
-}
-
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
 	return true;
 }
 #endif /* CONFIG_NUMA */
 
-static void reset_alloc_batches(struct zone *preferred_zone)
-{
-	struct zone *zone = preferred_zone->zone_pgdat->node_zones;
-
-	do {
-		mod_zone_page_state(zone, NR_ALLOC_BATCH,
-			high_wmark_pages(zone) - low_wmark_pages(zone) -
-			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-		clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
-	} while (zone++ != preferred_zone);
-}
-
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
@@ -2876,10 +2848,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 {
 	struct zoneref *z = ac->preferred_zoneref;
 	struct zone *zone;
-	bool fair_skipped = false;
-	bool apply_fair = (alloc_flags & ALLOC_FAIR);
-
-zonelist_scan:
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
@@ -2894,23 +2862,6 @@ zonelist_scan:
 			!__cpuset_zone_allowed(zone, gfp_mask))
 				continue;
 		/*
-		 * Distribute pages in proportion to the individual
-		 * zone size to ensure fair page aging. The zone a
-		 * page was allocated in should have no effect on the
-		 * time the page has in memory before being reclaimed.
-		 */
-		if (apply_fair) {
-			if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
-				fair_skipped = true;
-				continue;
-			}
-			if (!zone_local(ac->preferred_zoneref->zone, zone)) {
-				if (fair_skipped)
-					goto reset_fair;
-				apply_fair = false;
-			}
-		}
-		/*
 		 * When allocating a page cache page for writing, we
 		 * want to get it from a node that is within its dirty
 		 * limit, such that no single node holds more than its
@@ -2981,23 +2932,6 @@ try_this_zone:
 		}
 	}
 
-	/*
-	 * The first pass makes sure allocations are spread fairly within the
-	 * local node. However, the local node might have free pages left
-	 * after the fairness batches are exhausted, and remote zones haven't
-	 * even been considered yet. Try once more without fairness, and
-	 * include remote zones now, before entering the slowpath and waking
-	 * kswapd: prefer spilling to a remote zone over swapping locally.
-	 */
-	if (fair_skipped) {
-reset_fair:
-		apply_fair = false;
-		fair_skipped = false;
-		reset_alloc_batches(ac->preferred_zoneref->zone);
-		z = ac->preferred_zoneref;
-		goto zonelist_scan;
-	}
-
 	return NULL;
 }
 
@@ -3746,7 +3680,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 {
 	struct page *page;
 	unsigned int cpuset_mems_cookie;
-	unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
+	unsigned int alloc_flags = ALLOC_WMARK_LOW;
 	gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
 	struct alloc_context ac = {
 		.high_zoneidx = gfp_zone(gfp_mask),
@@ -5958,9 +5892,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 		zone_seqlock_init(zone);
 		zone_pcp_init(zone);
 
-		/* For bootup, initialized properly in watermark setup */
-		mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
-
 		if (!size)
 			continue;
 
@@ -6808,10 +6739,6 @@ static void __setup_per_zone_wmarks(void)
 		zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
 		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
 
-		__mod_zone_page_state(zone, NR_ALLOC_BATCH,
-			high_wmark_pages(zone) - low_wmark_pages(zone) -
-			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index bc94968400d0..ab7f78995c89 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -921,7 +921,6 @@ int fragmentation_index(struct zone *zone, unsigned int order)
 const char * const vmstat_text[] = {
 	/* enum zone_stat_item countes */
 	"nr_free_pages",
-	"nr_alloc_batch",
 	"nr_zone_anon_lru",
 	"nr_zone_file_lru",
 	"nr_zone_write_pending",
@@ -1632,10 +1631,9 @@ int vmstat_refresh(struct ctl_table *table, int write,
 		val = atomic_long_read(&vm_zone_stat[i]);
 		if (val < 0) {
 			switch (i) {
-			case NR_ALLOC_BATCH:
 			case NR_PAGES_SCANNED:
 				/*
-				 * These are often seen to go negative in
+				 * This is often seen to go negative in
 				 * recent kernels, but not to go permanently
 				 * negative. Whilst it would be nicer not to
 				 * have exceptions, rooting them out would be