author	Johannes Weiner <hannes@cmpxchg.org>	2016-03-15 17:57:13 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-03-15 19:55:16 -0400
commit	612e44939c3c77245ac80843c0c7876c8cf97282 (patch)
tree	dfa3bff1ff58990699c6f0562c38b71b9ab94808 /mm
parent	162453bfbdf4c0f58cb3058aad9ad8cda1044cda (diff)
mm: workingset: eviction buckets for bigmem/lowbit machines
For per-cgroup thrash detection, we need to store the memcg ID inside the radix tree cookie as well. However, on 32 bit that doesn't leave enough bits for the eviction timestamp to cover the necessary range of recently evicted pages. The radix tree entry would look like this:

[ RADIX_TREE_EXCEPTIONAL(2) | ZONEID(2) | MEMCGID(16) | EVICTION(12) ]

12 bits means 4096 pages, means 16M worth of recently evicted pages. But refaults are actionable up to distances covering half of memory. To not miss refaults, we have to stretch out the range at the cost of how precisely we can tell when a page was evicted.

This way we can shave off lower bits from the eviction timestamp until the necessary range is covered. E.g. grouping evictions into 1M buckets (256 pages) will stretch the longest representable refault distance to 4G.

This patch implements eviction buckets that are automatically sized according to the available bits and the necessary refault range, in preparation for per-cgroup thrash detection.

The maximum actionable distance is currently half of memory, but to support memory hotplug of up to 200% of boot-time memory, we size the buckets to cover double the distance. Beyond that, thrashing won't be detectable anymore.

During boot, the kernel will print out the exact parameters, like so:

[    0.113929] workingset: timestamp_bits=12 max_order=18 bucket_order=6

In this example, there are 12 radix entry bits available for the eviction timestamp, to cover a maximum distance of 2^18 pages (this is a 1G machine). Consequently, evictions must be grouped into buckets of 2^6 pages, or 256K.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
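To make the sizing arithmetic above concrete, here is a minimal standalone C sketch (plain userspace code, not part of the patch) that reproduces the quoted boot message. The timestamp_bits and totalram_pages values are assumptions chosen to match the changelog's 1G example, and order_of() is a hypothetical stand-in for the kernel's fls_long(n - 1) idiom.

/*
 * Illustrative userspace sketch of the eviction bucket sizing.
 * The constants mirror the example quoted above: 12 usable
 * timestamp bits and a 1G machine (2^18 pages of 4K).
 */
#include <stdio.h>

/* Order of the smallest power of two >= n, like fls_long(n - 1). */
static unsigned int order_of(unsigned long n)
{
	unsigned int order = 0;

	n -= 1;
	while (n) {
		n >>= 1;
		order++;
	}
	return order;
}

int main(void)
{
	unsigned int timestamp_bits = 12;		/* assumed: bits left for the timestamp */
	unsigned long totalram_pages = 1UL << 18;	/* assumed: 1G of 4K pages */
	unsigned int max_order = order_of(totalram_pages);
	unsigned int bucket_order = 0;

	/* Only group evictions when the timestamp cannot cover all of RAM. */
	if (max_order > timestamp_bits)
		bucket_order = max_order - timestamp_bits;

	printf("workingset: timestamp_bits=%u max_order=%u bucket_order=%u\n",
	       timestamp_bits, max_order, bucket_order);
	/* Each bucket spans 2^bucket_order pages: 2^6 * 4K = 256K here. */
	return 0;
}

Compiled and run as-is, this prints "workingset: timestamp_bits=12 max_order=18 bucket_order=6", matching the boot message in the changelog.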
Diffstat (limited to 'mm')
-rw-r--r--  mm/workingset.c  30
1 file changed, 29 insertions(+), 1 deletion(-)
diff --git a/mm/workingset.c b/mm/workingset.c
index f874b2c663e3..9a26a60368d2 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -156,8 +156,19 @@
 			 ZONES_SHIFT + NODES_SHIFT)
 #define EVICTION_MASK	(~0UL >> EVICTION_SHIFT)
 
+/*
+ * Eviction timestamps need to be able to cover the full range of
+ * actionable refaults. However, bits are tight in the radix tree
+ * entry, and after storing the identifier for the lruvec there might
+ * not be enough left to represent every single actionable refault. In
+ * that case, we have to sacrifice granularity for distance, and group
+ * evictions into coarser buckets by shaving off lower timestamp bits.
+ */
+static unsigned int bucket_order __read_mostly;
+
 static void *pack_shadow(unsigned long eviction, struct zone *zone)
 {
+	eviction >>= bucket_order;
 	eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
 	eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
 	eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
@@ -178,7 +189,7 @@ static void unpack_shadow(void *shadow, struct zone **zonep,
 	entry >>= NODES_SHIFT;
 
 	*zonep = NODE_DATA(nid)->node_zones + zid;
-	*evictionp = entry;
+	*evictionp = entry << bucket_order;
 }
 
 /**
@@ -400,8 +411,25 @@ static struct lock_class_key shadow_nodes_key;
 
 static int __init workingset_init(void)
 {
+	unsigned int timestamp_bits;
+	unsigned int max_order;
 	int ret;
 
+	BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
+	/*
+	 * Calculate the eviction bucket size to cover the longest
+	 * actionable refault distance, which is currently half of
+	 * memory (totalram_pages/2). However, memory hotplug may add
+	 * some more pages at runtime, so keep working with up to
+	 * double the initial memory by using totalram_pages as-is.
+	 */
+	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+	max_order = fls_long(totalram_pages - 1);
+	if (max_order > timestamp_bits)
+		bucket_order = max_order - timestamp_bits;
+	printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
+	       timestamp_bits, max_order, bucket_order);
+
 	ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
 	if (ret)
 		goto err;
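As a follow-up illustration of what the bucket shift does to a stored timestamp, the sketch below round-trips an eviction counter through simplified pack()/unpack() helpers. These are hypothetical stand-ins for pack_shadow()/unpack_shadow() that keep only the eviction bits and drop the node/zone/flag fields; the constants are assumptions carried over from the 1G example in the changelog.

/*
 * Userspace sketch of the granularity-for-range trade-off: only the
 * upper bits of the eviction counter survive packing, so the
 * recovered timestamp is rounded down to the start of its bucket.
 */
#include <stdio.h>

#define TIMESTAMP_BITS	12
#define TIMESTAMP_MASK	((1UL << TIMESTAMP_BITS) - 1)

static unsigned int bucket_order = 6;	/* assumed, from the 1G example above */

static unsigned long pack(unsigned long eviction)
{
	return (eviction >> bucket_order) & TIMESTAMP_MASK;
}

static unsigned long unpack(unsigned long entry)
{
	return entry << bucket_order;
}

int main(void)
{
	unsigned long eviction = 200005;	/* eviction counter at eviction time */
	unsigned long stored = pack(eviction);
	unsigned long recovered = unpack(stored);

	/* The low 6 bits are lost: 200005 comes back as 200000, a 64-page bucket start. */
	printf("evicted at %lu, recovered %lu (bucket of %lu pages)\n",
	       eviction, recovered, 1UL << bucket_order);
	return 0;
}

The recovered value is off by at most one bucket (64 pages, or 256K here), which is exactly the precision the changelog says is sacrificed so that the 12-bit timestamp can cover the full actionable refault distance.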