path: root/mm
author	Johannes Weiner <hannes@cmpxchg.org>	2013-09-11 17:20:47 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-09-11 18:57:23 -0400
commit	81c0a2bb515fd4daae8cab64352877480792b515 (patch)
tree	5ef326d226fdd14332cd0e5382e6dd2759dd08e3 /mm
parent	e085dbc52fad8d79fa2245339c84bf3ef0b3a802 (diff)
mm: page_alloc: fair zone allocator policy
Each zone that holds userspace pages of one workload must be aged at a speed proportional to the zone size.  Otherwise, the time an individual page gets to stay in memory depends on the zone it happened to be allocated in.  Asymmetry in the zone aging creates rather unpredictable aging behavior and results in the wrong pages being reclaimed, activated, etc.

But exactly this happens right now because of the way the page allocator and kswapd interact.  The page allocator uses per-node lists of all zones in the system, ordered by preference, when allocating a new page.  When the first iteration does not yield any results, kswapd is woken up and the allocator retries.  Because kswapd reclaims zones until they are above the high watermark, while a zone can be allocated from as soon as it is above the low watermark, the allocator may keep kswapd running in a way that lets it keep allocating from the first zone in the zonelist for extended periods of time.  Meanwhile the other zones rarely see new allocations and thus get aged much more slowly in comparison.

The result is that the occasional page placed in lower zones gets relatively more time in memory, and may even get promoted to the active list after its peers have long been evicted.  Meanwhile, the bulk of the working set may be thrashing on the preferred zone even though there may be significant amounts of memory available in the lower zones.

Even the most basic test -- repeatedly reading a file slightly bigger than memory -- shows how broken the zone aging is.  In this scenario, no single page should be able to stay in memory long enough to get referenced twice and activated, but activation happens in spades:

  $ grep active_file /proc/zoneinfo
      nr_inactive_file 0
      nr_active_file 0
      nr_inactive_file 0
      nr_active_file 8
      nr_inactive_file 1582
      nr_active_file 11994
  $ cat data data data data >/dev/null
  $ grep active_file /proc/zoneinfo
      nr_inactive_file 0
      nr_active_file 70
      nr_inactive_file 258753
      nr_active_file 443214
      nr_inactive_file 149793
      nr_active_file 12021

Fix this with a very simple round-robin allocator.  Each zone is allowed a batch of allocations that is proportional to the zone's size, after which it is treated as full.  The batch counters are reset when all zones have been tried and the allocator enters the slowpath and kicks off kswapd reclaim.  Allocation and reclaim are now fairly spread out to all available/allowable zones:

  $ grep active_file /proc/zoneinfo
      nr_inactive_file 0
      nr_active_file 0
      nr_inactive_file 174
      nr_active_file 4865
      nr_inactive_file 53
      nr_active_file 860
  $ cat data data data data >/dev/null
  $ grep active_file /proc/zoneinfo
      nr_inactive_file 0
      nr_active_file 0
      nr_inactive_file 666622
      nr_active_file 4988
      nr_inactive_file 190969
      nr_active_file 937

When zone_reclaim_mode is enabled, allocations will now spread out to all zones on the local node, not just the first preferred zone (which on a 4G node might be a tiny Normal zone).

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Paul Bolle <paul.bollee@gmail.com>
Cc: Zlatko Calusic <zcalusic@bitsync.net>
Tested-by: Kevin Hilman <khilman@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
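The batch policy described above can be illustrated with a minimal, self-contained userspace sketch (not kernel code; the zone names, sizes, and batch divisor are invented for illustration): every zone gets an allocation budget proportional to its size, a zone whose budget is exhausted is treated as full and skipped, and all budgets are replenished once every zone has been tried, mirroring the reset that prepare_slowpath() performs in the patch below.

/*
 * Simplified model of the fair per-zone allocation batch.
 * Not kernel code: zones, sizes, and the batch divisor are made up.
 */
#include <stdio.h>

struct zone {
	const char *name;
	long managed_pages;	/* stand-in for zone->managed_pages */
	long alloc_batch;	/* stand-in for the NR_ALLOC_BATCH counter */
};

static struct zone zones[] = {
	{ "Normal", 700000, 0 },
	{ "DMA32",  300000, 0 },
};

#define NR_ZONES (sizeof(zones) / sizeof(zones[0]))

/* Reset each batch in proportion to the zone's size. */
static void reset_batches(void)
{
	for (unsigned int i = 0; i < NR_ZONES; i++)
		zones[i].alloc_batch = zones[i].managed_pages / 100;
}

/* Fast path: take a page from the first zone with batch left. */
static struct zone *alloc_page_fair(void)
{
	for (unsigned int i = 0; i < NR_ZONES; i++) {
		if (zones[i].alloc_batch <= 0)
			continue;	/* treated as full, try the next zone */
		zones[i].alloc_batch--;
		return &zones[i];
	}
	/* "Slowpath": all batches exhausted, replenish and retry. */
	reset_batches();
	return alloc_page_fair();
}

int main(void)
{
	long counts[NR_ZONES] = { 0 };

	reset_batches();
	for (long i = 0; i < 1000000; i++) {
		struct zone *z = alloc_page_fair();
		counts[z - zones]++;
	}
	/* Allocations end up proportional to the configured zone sizes. */
	for (unsigned int i = 0; i < NR_ZONES; i++)
		printf("%-8s %ld\n", zones[i].name, counts[i]);
	return 0;
}

Run over many allocations, the per-zone counts come out proportional to the zone sizes, which is the fairness property the patch establishes for page aging.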
Diffstat (limited to 'mm')
-rw-r--r--	mm/page_alloc.c	72
-rw-r--r--	mm/vmstat.c	1
2 files changed, 63 insertions, 10 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9884aa0f233a..544d19d681a2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1551,6 +1551,7 @@ again:
 					  get_pageblock_migratetype(page));
 	}
 
+	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone, gfp_flags);
 	local_irq_restore(flags);
@@ -1817,6 +1818,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
 	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
 }
 
+static bool zone_local(struct zone *local_zone, struct zone *zone)
+{
+	return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE;
+}
+
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
 	return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
@@ -1854,6 +1860,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
 {
 }
 
+static bool zone_local(struct zone *local_zone, struct zone *zone)
+{
+	return true;
+}
+
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
 	return true;
@@ -1901,6 +1912,26 @@ zonelist_scan:
 		if (alloc_flags & ALLOC_NO_WATERMARKS)
 			goto try_this_zone;
 		/*
+		 * Distribute pages in proportion to the individual
+		 * zone size to ensure fair page aging.  The zone a
+		 * page was allocated in should have no effect on the
+		 * time the page has in memory before being reclaimed.
+		 *
+		 * When zone_reclaim_mode is enabled, try to stay in
+		 * local zones in the fastpath.  If that fails, the
+		 * slowpath is entered, which will do another pass
+		 * starting with the local zones, but ultimately fall
+		 * back to remote zones that do not partake in the
+		 * fairness round-robin cycle of this zonelist.
+		 */
+		if (alloc_flags & ALLOC_WMARK_LOW) {
+			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+				continue;
+			if (zone_reclaim_mode &&
+			    !zone_local(preferred_zone, zone))
+				continue;
+		}
+		/*
 		 * When allocating a page cache page for writing, we
 		 * want to get it from a zone that is within its dirty
 		 * limit, such that no single zone holds more than its
@@ -2346,16 +2377,30 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 	return page;
 }
 
-static inline
-void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
-						enum zone_type high_zoneidx,
-						enum zone_type classzone_idx)
+static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
+			     struct zonelist *zonelist,
+			     enum zone_type high_zoneidx,
+			     struct zone *preferred_zone)
 {
 	struct zoneref *z;
 	struct zone *zone;
 
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-		wakeup_kswapd(zone, order, classzone_idx);
+	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+		if (!(gfp_mask & __GFP_NO_KSWAPD))
+			wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+		/*
+		 * Only reset the batches of zones that were actually
+		 * considered in the fast path, we don't want to
+		 * thrash fairness information for zones that are not
+		 * actually part of this zonelist's round-robin cycle.
+		 */
+		if (zone_reclaim_mode && !zone_local(preferred_zone, zone))
+			continue;
+		mod_zone_page_state(zone, NR_ALLOC_BATCH,
+				    high_wmark_pages(zone) -
+				    low_wmark_pages(zone) -
+				    zone_page_state(zone, NR_ALLOC_BATCH));
+	}
 }
 
 static inline int
2360 2405
2361static inline int 2406static inline int
@@ -2451,9 +2496,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto nopage;
 
 restart:
-	if (!(gfp_mask & __GFP_NO_KSWAPD))
-		wake_all_kswapd(order, zonelist, high_zoneidx,
-						zone_idx(preferred_zone));
+	prepare_slowpath(gfp_mask, order, zonelist,
+			 high_zoneidx, preferred_zone);
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -4753,8 +4797,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		spin_lock_init(&zone->lru_lock);
 		zone_seqlock_init(zone);
 		zone->zone_pgdat = pgdat;
-
 		zone_pcp_init(zone);
+
+		/* For bootup, initialized properly in watermark setup */
+		mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
+
 		lruvec_init(&zone->lruvec);
 		if (!size)
 			continue;
@@ -5525,6 +5572,11 @@ static void __setup_per_zone_wmarks(void)
 		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
 		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
 
+		__mod_zone_page_state(zone, NR_ALLOC_BATCH,
+				      high_wmark_pages(zone) -
+				      low_wmark_pages(zone) -
+				      zone_page_state(zone, NR_ALLOC_BATCH));
+
 		setup_zone_migrate_reserve(zone);
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ca06e9653827..8a8da1f9b044 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -703,6 +703,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
 const char * const vmstat_text[] = {
 	/* Zoned VM counters */
 	"nr_free_pages",
+	"nr_alloc_batch",
 	"nr_inactive_anon",
 	"nr_active_anon",
 	"nr_inactive_file",