author		Johannes Weiner <hannes@cmpxchg.org>	2016-03-15 17:57:16 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-03-15 19:55:16 -0400
commit		23047a96d7cfcfca1a6d026ecaec526ea4803e9e (patch)
tree		3c90e27cc6dcb6a386a54c503bbb0860e828509b
parent		612e44939c3c77245ac80843c0c7876c8cf97282 (diff)
mm: workingset: per-cgroup cache thrash detection
Cache thrash detection (see a528910e12ec "mm: thrash detection-based file cache sizing" for details) currently only works on the system level, not inside cgroups. Worse, as the refaults are compared to the global number of active cache, cgroups might wrongfully get all their refaults activated when their pages are hotter than those of others.

Move the refault machinery from the zone to the lruvec, and then tag eviction entries with the memcg ID. This makes the thrash detection work correctly inside cgroups.

[sergey.senozhatsky@gmail.com: do not return from workingset_activation() with locked rcu and page]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
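For orientation before reading the diff, the following is a minimal userspace C sketch (not kernel code) of the mechanism the patch implements: the per-lruvec eviction counter is packed into the shadow entry together with the memcg ID, node and zone of the evicted page, and on refault the counter distance is compared against that cgroup's active file size. The node/zone field widths, the concrete numbers, and the omission of bucket_order and the radix-tree exceptional-entry bits are assumptions made for brevity.

	/*
	 * Illustrative sketch of per-cgroup shadow entries and refault
	 * distance; field widths and values are made-up examples.
	 */
	#include <stdio.h>

	#define MEMCG_ID_SHIFT	16	/* mirrors MEM_CGROUP_ID_SHIFT */
	#define NODES_SHIFT	6	/* assumed for this sketch */
	#define ZONES_SHIFT	2	/* assumed for this sketch */

	static unsigned long long pack_shadow(unsigned int memcgid, unsigned int nid,
					      unsigned int zid, unsigned long long eviction)
	{
		eviction = (eviction << MEMCG_ID_SHIFT) | memcgid;
		eviction = (eviction << NODES_SHIFT) | nid;
		eviction = (eviction << ZONES_SHIFT) | zid;
		return eviction;
	}

	static void unpack_shadow(unsigned long long entry, unsigned int *memcgid,
				  unsigned int *nid, unsigned int *zid,
				  unsigned long long *eviction)
	{
		*zid = entry & ((1ULL << ZONES_SHIFT) - 1);
		entry >>= ZONES_SHIFT;
		*nid = entry & ((1ULL << NODES_SHIFT) - 1);
		entry >>= NODES_SHIFT;
		*memcgid = entry & ((1ULL << MEMCG_ID_SHIFT) - 1);
		entry >>= MEMCG_ID_SHIFT;
		*eviction = entry;
	}

	int main(void)
	{
		/* eviction: the cgroup's inactive_age when the page was evicted */
		unsigned long long shadow = pack_shadow(42, 1, 2, 123456);
		unsigned long long eviction, refault = 130000;	/* inactive_age at refault */
		unsigned long long active_file = 10000;	/* this cgroup's active cache */
		unsigned int memcgid, nid, zid;

		unpack_shadow(shadow, &memcgid, &nid, &zid, &eviction);

		unsigned long long refault_distance = refault - eviction;
		printf("memcg %u node %u zone %u: distance %llu -> %s\n",
		       memcgid, nid, zid, refault_distance,
		       refault_distance <= active_file ? "activate" : "inactive");
		return 0;
	}

Because both the counter and the active-file comparison are taken from the cgroup's own lruvec, a cgroup whose pages are hotter than the rest of the system no longer gets all of its refaults activated against the global numbers.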
-rw-r--r--	include/linux/memcontrol.h	56
-rw-r--r--	include/linux/mmzone.h		13
-rw-r--r--	mm/memcontrol.c			25
-rw-r--r--	mm/vmscan.c			18
-rw-r--r--	mm/workingset.c			79
5 files changed, 134 insertions, 57 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8502fd4144eb..09b449849369 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -89,6 +89,10 @@ enum mem_cgroup_events_target {
 };
 
 #ifdef CONFIG_MEMCG
+
+#define MEM_CGROUP_ID_SHIFT	16
+#define MEM_CGROUP_ID_MAX	USHRT_MAX
+
 struct mem_cgroup_stat_cpu {
 	long count[MEMCG_NR_STAT];
 	unsigned long events[MEMCG_NR_EVENTS];
@@ -265,6 +269,11 @@ struct mem_cgroup {
 
 extern struct mem_cgroup *root_mem_cgroup;
 
+static inline bool mem_cgroup_disabled(void)
+{
+	return !cgroup_subsys_enabled(memory_cgrp_subsys);
+}
+
 /**
  * mem_cgroup_events - count memory events against a cgroup
  * @memcg: the memory cgroup
@@ -312,6 +321,28 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
 				   struct mem_cgroup_reclaim_cookie *);
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
 
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+{
+	if (mem_cgroup_disabled())
+		return 0;
+
+	return memcg->css.id;
+}
+
+/**
+ * mem_cgroup_from_id - look up a memcg from an id
+ * @id: the id to look up
+ *
+ * Caller must hold rcu_read_lock() and use css_tryget() as necessary.
+ */
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+	struct cgroup_subsys_state *css;
+
+	css = css_from_id(id, &memory_cgrp_subsys);
+	return mem_cgroup_from_css(css);
+}
+
 /**
  * parent_mem_cgroup - find the accounting parent of a memcg
  * @memcg: memcg whose parent to find
@@ -353,11 +384,6 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
 ino_t page_cgroup_ino(struct page *page);
 
-static inline bool mem_cgroup_disabled(void)
-{
-	return !cgroup_subsys_enabled(memory_cgrp_subsys);
-}
-
 static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
 {
 	if (mem_cgroup_disabled())
@@ -502,8 +528,17 @@ void mem_cgroup_split_huge_fixup(struct page *head);
 #endif
 
 #else /* CONFIG_MEMCG */
+
+#define MEM_CGROUP_ID_SHIFT	0
+#define MEM_CGROUP_ID_MAX	0
+
 struct mem_cgroup;
 
+static inline bool mem_cgroup_disabled(void)
+{
+	return true;
+}
+
 static inline void mem_cgroup_events(struct mem_cgroup *memcg,
 				     enum mem_cgroup_events_index idx,
 				     unsigned int nr)
@@ -586,9 +621,16 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
 {
 }
 
-static inline bool mem_cgroup_disabled(void)
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 {
-	return true;
+	return 0;
+}
+
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+	WARN_ON_ONCE(id);
+	/* XXX: This should always return root_mem_cgroup */
+	return NULL;
 }
 
 static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9fc23ab550a7..03cbdd906f55 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -212,10 +212,12 @@ struct zone_reclaim_stat {
 };
 
 struct lruvec {
 	struct list_head lists[NR_LRU_LISTS];
 	struct zone_reclaim_stat reclaim_stat;
+	/* Evictions & activations on the inactive file list */
+	atomic_long_t inactive_age;
 #ifdef CONFIG_MEMCG
 	struct zone *zone;
 #endif
 };
 
@@ -490,9 +492,6 @@ struct zone {
 	spinlock_t lru_lock;
 	struct lruvec lruvec;
 
-	/* Evictions & activations on the inactive file list */
-	atomic_long_t inactive_age;
-
 	/*
 	 * When free pages are below this point, additional steps are taken
 	 * when reading the number of free pages to avoid per-cpu counter
@@ -761,6 +760,8 @@ static inline struct zone *lruvec_zone(struct lruvec *lruvec)
 #endif
 }
 
+extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru);
+
 #ifdef CONFIG_HAVE_MEMORY_PRESENT
 void memory_present(int nid, unsigned long start, unsigned long end);
 #else
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 953f0f984392..864e237f32d9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 	return (memcg == root_mem_cgroup);
 }
 
-/*
- * We restrict the id in the range of [1, 65535], so it can fit into
- * an unsigned short.
- */
-#define MEM_CGROUP_ID_MAX	USHRT_MAX
-
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
-{
-	return memcg->css.id;
-}
-
-/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
-{
-	struct cgroup_subsys_state *css;
-
-	css = css_from_id(id, &memory_cgrp_subsys);
-	return mem_cgroup_from_css(css);
-}
-
 #ifndef CONFIG_SLOB
 /*
  * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 08547a7136d3..fd434cc89bea 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -213,7 +213,7 @@ bool zone_reclaimable(struct zone *zone)
 		zone_reclaimable_pages(zone) * 6;
 }
 
-static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
 	if (!mem_cgroup_disabled())
 		return mem_cgroup_get_lru_size(lruvec, lru);
@@ -1923,8 +1923,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec)
 	unsigned long inactive;
 	unsigned long active;
 
-	inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
-	active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
+	inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+	active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
 
 	return active > inactive;
 }
@@ -2063,7 +2063,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * system is under heavy pressure.
 	 */
 	if (!inactive_file_is_low(lruvec) &&
-	    get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
@@ -2089,10 +2089,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * anon in [0], file in [1]
 	 */
 
-	anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
-		get_lru_size(lruvec, LRU_INACTIVE_ANON);
-	file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
-		get_lru_size(lruvec, LRU_INACTIVE_FILE);
+	anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
+		lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
+	file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
+		lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
 
 	spin_lock_irq(&zone->lru_lock);
 	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2130,7 +2130,7 @@ out:
 			unsigned long size;
 			unsigned long scan;
 
-			size = get_lru_size(lruvec, lru);
+			size = lruvec_lru_size(lruvec, lru);
 			scan = size >> sc->priority;
 
 			if (!scan && pass && force_scan)
diff --git a/mm/workingset.c b/mm/workingset.c
index 9a26a60368d2..14bc23a7779b 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -153,7 +153,8 @@
  */
 
 #define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
-			ZONES_SHIFT + NODES_SHIFT)
+			ZONES_SHIFT + NODES_SHIFT + \
+			MEM_CGROUP_ID_SHIFT)
 #define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
 
 /*
@@ -166,9 +167,10 @@
  */
 static unsigned int bucket_order __read_mostly;
 
-static void *pack_shadow(unsigned long eviction, struct zone *zone)
+static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction)
 {
 	eviction >>= bucket_order;
+	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
 	eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
 	eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
 	eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
@@ -176,18 +178,21 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone)
 	return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
 }
 
-static void unpack_shadow(void *shadow, struct zone **zonep,
+static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
 			  unsigned long *evictionp)
 {
 	unsigned long entry = (unsigned long)shadow;
-	int zid, nid;
+	int memcgid, nid, zid;
 
 	entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
 	zid = entry & ((1UL << ZONES_SHIFT) - 1);
 	entry >>= ZONES_SHIFT;
 	nid = entry & ((1UL << NODES_SHIFT) - 1);
 	entry >>= NODES_SHIFT;
+	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
+	entry >>= MEM_CGROUP_ID_SHIFT;
 
+	*memcgidp = memcgid;
 	*zonep = NODE_DATA(nid)->node_zones + zid;
 	*evictionp = entry << bucket_order;
 }
@@ -202,11 +207,20 @@ static void unpack_shadow(void *shadow, struct zone **zonep,
  */
 void *workingset_eviction(struct address_space *mapping, struct page *page)
 {
+	struct mem_cgroup *memcg = page_memcg(page);
 	struct zone *zone = page_zone(page);
+	int memcgid = mem_cgroup_id(memcg);
 	unsigned long eviction;
+	struct lruvec *lruvec;
 
-	eviction = atomic_long_inc_return(&zone->inactive_age);
-	return pack_shadow(eviction, zone);
+	/* Page is fully exclusive and pins page->mem_cgroup */
+	VM_BUG_ON_PAGE(PageLRU(page), page);
+	VM_BUG_ON_PAGE(page_count(page), page);
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+	eviction = atomic_long_inc_return(&lruvec->inactive_age);
+	return pack_shadow(memcgid, zone, eviction);
 }
 
 /**
@@ -221,13 +235,42 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
 bool workingset_refault(void *shadow)
 {
 	unsigned long refault_distance;
+	unsigned long active_file;
+	struct mem_cgroup *memcg;
 	unsigned long eviction;
+	struct lruvec *lruvec;
 	unsigned long refault;
 	struct zone *zone;
+	int memcgid;
 
-	unpack_shadow(shadow, &zone, &eviction);
+	unpack_shadow(shadow, &memcgid, &zone, &eviction);
 
-	refault = atomic_long_read(&zone->inactive_age);
+	rcu_read_lock();
+	/*
+	 * Look up the memcg associated with the stored ID. It might
+	 * have been deleted since the page's eviction.
+	 *
+	 * Note that in rare events the ID could have been recycled
+	 * for a new cgroup that refaults a shared page. This is
+	 * impossible to tell from the available data. However, this
+	 * should be a rare and limited disturbance, and activations
+	 * are always speculative anyway. Ultimately, it's the aging
+	 * algorithm's job to shake out the minimum access frequency
+	 * for the active cache.
+	 *
+	 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
+	 * would be better if the root_mem_cgroup existed in all
+	 * configurations instead.
+	 */
+	memcg = mem_cgroup_from_id(memcgid);
+	if (!mem_cgroup_disabled() && !memcg) {
+		rcu_read_unlock();
+		return false;
+	}
+	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+	refault = atomic_long_read(&lruvec->inactive_age);
+	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+	rcu_read_unlock();
 
 	/*
 	 * The unsigned subtraction here gives an accurate distance
@@ -249,7 +292,7 @@ bool workingset_refault(void *shadow)
 
 	inc_zone_state(zone, WORKINGSET_REFAULT);
 
-	if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
+	if (refault_distance <= active_file) {
 		inc_zone_state(zone, WORKINGSET_ACTIVATE);
 		return true;
 	}
@@ -262,7 +305,23 @@ bool workingset_refault(void *shadow)
  */
 void workingset_activation(struct page *page)
 {
-	atomic_long_inc(&page_zone(page)->inactive_age);
+	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;
+
+	memcg = lock_page_memcg(page);
+	/*
+	 * Filter non-memcg pages here, e.g. unmap can call
+	 * mark_page_accessed() on VDSO pages.
+	 *
+	 * XXX: See workingset_refault() - this should return
+	 * root_mem_cgroup even for !CONFIG_MEMCG.
+	 */
+	if (!mem_cgroup_disabled() && !memcg)
+		goto out;
+	lruvec = mem_cgroup_zone_lruvec(page_zone(page), memcg);
+	atomic_long_inc(&lruvec->inactive_age);
+out:
+	unlock_page_memcg(memcg);
 }
 
 /*