author     Johannes Weiner <hannes@cmpxchg.org>            2016-03-15 17:57:16 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-03-15 19:55:16 -0400
commit     23047a96d7cfcfca1a6d026ecaec526ea4803e9e
tree       3c90e27cc6dcb6a386a54c503bbb0860e828509b  /mm/workingset.c
parent     612e44939c3c77245ac80843c0c7876c8cf97282
mm: workingset: per-cgroup cache thrash detection
Cache thrash detection (see a528910e12ec "mm: thrash detection-based file cache sizing" for details) currently only works on the system level, not inside cgroups. Worse, as the refaults are compared to the global number of active cache, cgroups might wrongfully get all their refaults activated when their pages are hotter than those of others.

Move the refault machinery from the zone to the lruvec, and then tag eviction entries with the memcg ID. This makes the thrash detection work correctly inside cgroups.

[sergey.senozhatsky@gmail.com: do not return from workingset_activation() with locked rcu and page]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
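For illustration only, here is a minimal userspace sketch of the shadow-entry idea this patch extends: the memcg ID is packed into the per-page eviction cookie alongside the node, zone and a truncated eviction counter, and unpacked again on refault. The shift widths and the exceptional-entry marker below are hypothetical placeholders (the kernel derives them from MEM_CGROUP_ID_SHIFT, NODES_SHIFT, ZONES_SHIFT and the radix tree flags), and the bucket_order truncation is left out; this is not the kernel code itself.

/*
 * Sketch: round-trip a shadow entry that packs memcg ID, NUMA node,
 * zone index and an eviction counter into one unsigned long, mirroring
 * what pack_shadow()/unpack_shadow() do after this patch.
 * Assumes a 64-bit unsigned long; all shift widths are made up.
 */
#include <assert.h>
#include <stdio.h>

#define EXCEPTIONAL_SHIFT  2   /* stands in for RADIX_TREE_EXCEPTIONAL_SHIFT */
#define ZONES_SHIFT        2
#define NODES_SHIFT        6
#define MEMCG_ID_SHIFT     16  /* stands in for MEM_CGROUP_ID_SHIFT */

static unsigned long pack_shadow(int memcgid, int nid, int zid,
                                 unsigned long eviction)
{
        eviction = (eviction << MEMCG_ID_SHIFT) | memcgid;
        eviction = (eviction << NODES_SHIFT) | nid;
        eviction = (eviction << ZONES_SHIFT) | zid;
        eviction = (eviction << EXCEPTIONAL_SHIFT);
        return eviction | 2;    /* low bits mark an "exceptional" entry */
}

static void unpack_shadow(unsigned long entry, int *memcgidp, int *nidp,
                          int *zidp, unsigned long *evictionp)
{
        entry >>= EXCEPTIONAL_SHIFT;
        *zidp = entry & ((1UL << ZONES_SHIFT) - 1);
        entry >>= ZONES_SHIFT;
        *nidp = entry & ((1UL << NODES_SHIFT) - 1);
        entry >>= NODES_SHIFT;
        *memcgidp = entry & ((1UL << MEMCG_ID_SHIFT) - 1);
        entry >>= MEMCG_ID_SHIFT;
        *evictionp = entry;
}

int main(void)
{
        unsigned long entry = pack_shadow(42, 1, 2, 123456);
        unsigned long eviction;
        int memcgid, nid, zid;

        unpack_shadow(entry, &memcgid, &nid, &zid, &eviction);
        assert(memcgid == 42 && nid == 1 && zid == 2 && eviction == 123456);
        printf("memcg=%d node=%d zone=%d eviction=%lu\n",
               memcgid, nid, zid, eviction);
        return 0;
}

On refault, the stored memcg ID selects the lruvec whose inactive_age and active file size the refault distance is compared against, which is what makes the detection per-cgroup.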
Diffstat (limited to 'mm/workingset.c')
-rw-r--r--  mm/workingset.c  79
1 file changed, 69 insertions(+), 10 deletions(-)
diff --git a/mm/workingset.c b/mm/workingset.c
index 9a26a60368d2..14bc23a7779b 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -153,7 +153,8 @@
  */
 
 #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
-                        ZONES_SHIFT + NODES_SHIFT)
+                        ZONES_SHIFT + NODES_SHIFT + \
+                        MEM_CGROUP_ID_SHIFT)
 #define EVICTION_MASK  (~0UL >> EVICTION_SHIFT)
 
 /*
@@ -166,9 +167,10 @@
  */
 static unsigned int bucket_order __read_mostly;
 
-static void *pack_shadow(unsigned long eviction, struct zone *zone)
+static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction)
 {
        eviction >>= bucket_order;
+       eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
        eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
        eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
        eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
@@ -176,18 +178,21 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone)
        return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
 }
 
-static void unpack_shadow(void *shadow, struct zone **zonep,
+static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
                          unsigned long *evictionp)
 {
        unsigned long entry = (unsigned long)shadow;
-       int zid, nid;
+       int memcgid, nid, zid;
 
        entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
        zid = entry & ((1UL << ZONES_SHIFT) - 1);
        entry >>= ZONES_SHIFT;
        nid = entry & ((1UL << NODES_SHIFT) - 1);
        entry >>= NODES_SHIFT;
+       memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
+       entry >>= MEM_CGROUP_ID_SHIFT;
 
+       *memcgidp = memcgid;
        *zonep = NODE_DATA(nid)->node_zones + zid;
        *evictionp = entry << bucket_order;
 }
@@ -202,11 +207,20 @@ static void unpack_shadow(void *shadow, struct zone **zonep,
  */
 void *workingset_eviction(struct address_space *mapping, struct page *page)
 {
+       struct mem_cgroup *memcg = page_memcg(page);
        struct zone *zone = page_zone(page);
+       int memcgid = mem_cgroup_id(memcg);
        unsigned long eviction;
+       struct lruvec *lruvec;
 
-       eviction = atomic_long_inc_return(&zone->inactive_age);
-       return pack_shadow(eviction, zone);
+       /* Page is fully exclusive and pins page->mem_cgroup */
+       VM_BUG_ON_PAGE(PageLRU(page), page);
+       VM_BUG_ON_PAGE(page_count(page), page);
+       VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+       lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+       eviction = atomic_long_inc_return(&lruvec->inactive_age);
+       return pack_shadow(memcgid, zone, eviction);
 }
 
 /**
@@ -221,13 +235,42 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
 bool workingset_refault(void *shadow)
 {
        unsigned long refault_distance;
+       unsigned long active_file;
+       struct mem_cgroup *memcg;
        unsigned long eviction;
+       struct lruvec *lruvec;
        unsigned long refault;
        struct zone *zone;
+       int memcgid;
 
-       unpack_shadow(shadow, &zone, &eviction);
+       unpack_shadow(shadow, &memcgid, &zone, &eviction);
 
-       refault = atomic_long_read(&zone->inactive_age);
+       rcu_read_lock();
+       /*
+        * Look up the memcg associated with the stored ID. It might
+        * have been deleted since the page's eviction.
+        *
+        * Note that in rare events the ID could have been recycled
+        * for a new cgroup that refaults a shared page. This is
+        * impossible to tell from the available data. However, this
+        * should be a rare and limited disturbance, and activations
+        * are always speculative anyway. Ultimately, it's the aging
+        * algorithm's job to shake out the minimum access frequency
+        * for the active cache.
+        *
+        * XXX: On !CONFIG_MEMCG, this will always return NULL; it
+        * would be better if the root_mem_cgroup existed in all
+        * configurations instead.
+        */
+       memcg = mem_cgroup_from_id(memcgid);
+       if (!mem_cgroup_disabled() && !memcg) {
+               rcu_read_unlock();
+               return false;
+       }
+       lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+       refault = atomic_long_read(&lruvec->inactive_age);
+       active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+       rcu_read_unlock();
 
        /*
         * The unsigned subtraction here gives an accurate distance
@@ -249,7 +292,7 @@ bool workingset_refault(void *shadow)
 
        inc_zone_state(zone, WORKINGSET_REFAULT);
 
-       if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
+       if (refault_distance <= active_file) {
                inc_zone_state(zone, WORKINGSET_ACTIVATE);
                return true;
        }
@@ -262,7 +305,23 @@ bool workingset_refault(void *shadow)
  */
 void workingset_activation(struct page *page)
 {
-       atomic_long_inc(&page_zone(page)->inactive_age);
+       struct mem_cgroup *memcg;
+       struct lruvec *lruvec;
+
+       memcg = lock_page_memcg(page);
+       /*
+        * Filter non-memcg pages here, e.g. unmap can call
+        * mark_page_accessed() on VDSO pages.
+        *
+        * XXX: See workingset_refault() - this should return
+        * root_mem_cgroup even for !CONFIG_MEMCG.
+        */
+       if (!mem_cgroup_disabled() && !memcg)
+               goto out;
+       lruvec = mem_cgroup_zone_lruvec(page_zone(page), memcg);
+       atomic_long_inc(&lruvec->inactive_age);
+out:
+       unlock_page_memcg(memcg);
 }
 
 /*