author     Mel Gorman <mgorman@suse.de>                    2014-08-06 19:07:22 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-08-06 21:01:20 -0400
commit     4ffeaf3560a52b4a69cc7909873d08c0ef5909d4 (patch)
tree       1ef6cebf30961e85fcb7a8531f92f1a62c107a74 /mm
parent     f7b5d647946aae1647bf5cd26c16b3a793c1ac49 (diff)
mm: page_alloc: reduce cost of the fair zone allocation policy
The fair zone allocation policy round-robins allocations between zones within a node to avoid age inversion problems during reclaim.  If the first allocation fails, the batch counts are reset and a second attempt is made before entering the slow path.

One assumption made with this scheme is that batches expire at roughly the same time and that the resets each time are justified.  This assumption does not hold when zones reach their low watermark, as the batches will be consumed at uneven rates.  Allocation failure due to watermark depletion results in additional zonelist scans for the reset and another watermark check before hitting the slowpath.

On UMA, the benefit is negligible -- around 0.25%.  On a 4-socket NUMA machine it is variable due to the variability of measuring overhead with the vmstat changes.  The system CPU overhead comparison looks like:

               3.16.0-rc3   3.16.0-rc3    3.16.0-rc3
                  vanilla    vmstat-v5  lowercost-v5
User               746.94       774.56        802.00
System           65336.22     32847.27      40852.33
Elapsed          27553.52     27415.04      27368.46

However, it is worth noting that the overall benchmark still completed faster, and intuitively it makes sense to take as few passes as possible through the zonelists.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
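As a rough standalone illustration of the scheme described above -- not part of the patch, and with zone, alloc_batch, fair_depleted and fair_pass() as invented stand-ins for struct zone, NR_ALLOC_BATCH, ZONE_FAIR_DEPLETED and the fair portion of get_page_from_freelist() -- the following userspace C program models how depleted zones are skipped and how the skip count tells the caller whether a batch reset and second pass are worthwhile:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical, simplified stand-ins; none of this is kernel code. */
struct zone {
        long alloc_batch;       /* pages left in this fairness batch */
        bool fair_depleted;     /* set once the batch is used up */
        long free_pages;
};

/* Charge an allocation against the zone's batch and flag depletion. */
static void charge_batch(struct zone *z, int nr_pages)
{
        z->alloc_batch -= nr_pages;
        if (z->alloc_batch <= 0)
                z->fair_depleted = true;   /* cheap flag test on later passes */
}

/*
 * Fair pass: skip depleted zones but count the skips, so the caller only
 * pays for a batch reset and a second scan when a skip actually happened.
 */
static struct zone *fair_pass(struct zone *zones, int nr, int *nr_fair_skipped)
{
        for (int i = 0; i < nr; i++) {
                if (zones[i].fair_depleted) {
                        (*nr_fair_skipped)++;
                        continue;
                }
                if (zones[i].free_pages > 0) {
                        zones[i].free_pages--;
                        charge_batch(&zones[i], 1);
                        return &zones[i];
                }
        }
        return NULL;    /* caller decides whether a rescan is worth it */
}

int main(void)
{
        struct zone zones[2] = {
                { .alloc_batch = 1, .free_pages = 8 },
                { .alloc_batch = 2, .free_pages = 8 },
        };
        int nr_fair_skipped = 0;

        for (int i = 0; i < 4; i++)
                fair_pass(zones, 2, &nr_fair_skipped);

        printf("fair-depleted skips: %d\n", nr_fair_skipped);
        return 0;
}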
Diffstat (limited to 'mm')
-rw-r--r--   mm/page_alloc.c | 101
1 file changed, 53 insertions(+), 48 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e5e8f762532..fb9908148474 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1612,6 +1612,9 @@ again:
         }
 
         __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+        if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
+            !zone_is_fair_depleted(zone))
+                zone_set_flag(zone, ZONE_FAIR_DEPLETED);
 
         __count_zone_vm_events(PGALLOC, zone, 1 << order);
         zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1923,6 +1926,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 
 #endif  /* CONFIG_NUMA */
 
+static void reset_alloc_batches(struct zone *preferred_zone)
+{
+        struct zone *zone = preferred_zone->zone_pgdat->node_zones;
+
+        do {
+                mod_zone_page_state(zone, NR_ALLOC_BATCH,
+                        high_wmark_pages(zone) - low_wmark_pages(zone) -
+                        atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+                zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
+        } while (zone++ != preferred_zone);
+}
+
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
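The new reset_alloc_batches() above walks the node's zone array directly, from node_zones[0] up to and including the preferred zone, instead of iterating the zonelist and filtering with zone_local(). A minimal standalone sketch of that pointer walk -- not from the patch; zone and pglist_data here are pared-down stand-ins for the kernel structures:

#include <stdio.h>

#define MAX_NR_ZONES 4

struct zone { const char *name; };
struct pglist_data { struct zone node_zones[MAX_NR_ZONES]; };

/*
 * Visit node_zones[0] .. preferred_zone inclusive, mirroring the do/while
 * in the new reset_alloc_batches(): no zonelist iteration is needed.
 */
static void walk_local_zones(struct pglist_data *pgdat, struct zone *preferred_zone)
{
        struct zone *zone = pgdat->node_zones;

        do {
                printf("reset batch for %s\n", zone->name);
        } while (zone++ != preferred_zone);  /* post-increment: compare current zone, then advance */
}

int main(void)
{
        struct pglist_data node = { .node_zones = {
                { "DMA" }, { "DMA32" }, { "Normal" }, { "Movable" },
        } };

        /* With Normal preferred, DMA, DMA32 and Normal are visited. */
        walk_local_zones(&node, &node.node_zones[2]);
        return 0;
}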
@@ -1940,8 +1955,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
         int did_zlc_setup = 0;          /* just call zlc_setup() one time */
         bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
                                 (gfp_mask & __GFP_WRITE);
+        int nr_fair_skipped = 0;
+        bool zonelist_rescan;
 
 zonelist_scan:
+        zonelist_rescan = false;
+
         /*
          * Scan zonelist, looking for a zone with enough free.
          * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
@@ -1966,8 +1985,10 @@ zonelist_scan:
                 if (alloc_flags & ALLOC_FAIR) {
                         if (!zone_local(preferred_zone, zone))
                                 break;
-                        if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+                        if (zone_is_fair_depleted(zone)) {
+                                nr_fair_skipped++;
                                 continue;
+                        }
                 }
                 /*
                  * When allocating a page cache page for writing, we
@@ -2073,13 +2094,7 @@ this_zone_full:
                         zlc_mark_zone_full(zonelist, z);
         }
 
-        if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
-                /* Disable zlc cache for second zonelist scan */
-                zlc_active = 0;
-                goto zonelist_scan;
-        }
-
-        if (page)
+        if (page) {
                 /*
                  * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
                  * necessary to allocate the page. The expectation is
@@ -2088,8 +2103,37 @@ this_zone_full:
                  * for !PFMEMALLOC purposes.
                  */
                 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+                return page;
+        }
 
-        return page;
+        /*
+         * The first pass makes sure allocations are spread fairly within the
+         * local node.  However, the local node might have free pages left
+         * after the fairness batches are exhausted, and remote zones haven't
+         * even been considered yet.  Try once more without fairness, and
+         * include remote zones now, before entering the slowpath and waking
+         * kswapd: prefer spilling to a remote zone over swapping locally.
+         */
+        if (alloc_flags & ALLOC_FAIR) {
+                alloc_flags &= ~ALLOC_FAIR;
+                if (nr_fair_skipped) {
+                        zonelist_rescan = true;
+                        reset_alloc_batches(preferred_zone);
+                }
+                if (nr_online_nodes > 1)
+                        zonelist_rescan = true;
+        }
+
+        if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
+                /* Disable zlc cache for second zonelist scan */
+                zlc_active = 0;
+                zonelist_rescan = true;
+        }
+
+        if (zonelist_rescan)
+                goto zonelist_scan;
+
+        return NULL;
 }
 
 /*
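The rescan decision that used to live in the caller now sits at the tail of get_page_from_freelist(), shown in the hunk above. As a hedged, standalone model of just that decision (the struct and helper names below are invented; only the branch structure follows the new code), note how a UMA system that skipped no fair-depleted zones does not pay for a second scan:

#include <stdbool.h>
#include <stdio.h>

/* Inputs that the real function tracks during the zonelist walk. */
struct scan_state {
        bool alloc_fair;        /* ALLOC_FAIR was set on the failed pass */
        int nr_fair_skipped;    /* zones skipped as fair-depleted */
        int nr_online_nodes;
        bool zlc_active;        /* NUMA zonelist cache was consulted */
};

/* Decide, after a failed pass, whether one more zonelist scan is worthwhile. */
static bool should_rescan(struct scan_state *s)
{
        bool rescan = false;

        if (s->alloc_fair) {
                s->alloc_fair = false;          /* retry is not fairness-limited */
                if (s->nr_fair_skipped)
                        rescan = true;          /* batches get reset first */
                if (s->nr_online_nodes > 1)
                        rescan = true;          /* remote zones not tried yet */
        }
        if (s->zlc_active) {
                s->zlc_active = false;          /* retry without the zlc cache */
                rescan = true;
        }
        return rescan;
}

int main(void)
{
        struct scan_state uma = {
                .alloc_fair = true, .nr_fair_skipped = 0,
                .nr_online_nodes = 1, .zlc_active = false,
        };

        /* On UMA with no zones skipped, the failed fair pass is not repeated. */
        printf("rescan: %d\n", should_rescan(&uma));
        return 0;
}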
@@ -2410,28 +2454,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
         return page;
 }
 
-static void reset_alloc_batches(struct zonelist *zonelist,
-                                enum zone_type high_zoneidx,
-                                struct zone *preferred_zone)
-{
-        struct zoneref *z;
-        struct zone *zone;
-
-        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-                /*
-                 * Only reset the batches of zones that were actually
-                 * considered in the fairness pass, we don't want to
-                 * trash fairness information for zones that are not
-                 * actually part of this zonelist's round-robin cycle.
-                 */
-                if (!zone_local(preferred_zone, zone))
-                        continue;
-                mod_zone_page_state(zone, NR_ALLOC_BATCH,
-                        high_wmark_pages(zone) - low_wmark_pages(zone) -
-                        atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-        }
-}
-
 static void wake_all_kswapds(unsigned int order,
                              struct zonelist *zonelist,
                              enum zone_type high_zoneidx,
@@ -2767,29 +2789,12 @@ retry_cpuset:
         if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
                 alloc_flags |= ALLOC_CMA;
 #endif
-retry:
         /* First allocation attempt */
         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                         zonelist, high_zoneidx, alloc_flags,
                         preferred_zone, classzone_idx, migratetype);
         if (unlikely(!page)) {
                 /*
-                 * The first pass makes sure allocations are spread
-                 * fairly within the local node.  However, the local
-                 * node might have free pages left after the fairness
-                 * batches are exhausted, and remote zones haven't
-                 * even been considered yet.  Try once more without
-                 * fairness, and include remote zones now, before
-                 * entering the slowpath and waking kswapd: prefer
-                 * spilling to a remote zone over swapping locally.
-                 */
-                if (alloc_flags & ALLOC_FAIR) {
-                        reset_alloc_batches(zonelist, high_zoneidx,
-                                                preferred_zone);
-                        alloc_flags &= ~ALLOC_FAIR;
-                        goto retry;
-                }
-                /*
                  * Runtime PM, block IO and its error handling path
                  * can deadlock because I/O on the device might not
                  * complete.