aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJohannes Weiner <hannes@cmpxchg.org>2016-01-14 18:21:32 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2016-01-14 19:00:49 -0500
commit8e8ae645249b85c8ed6c178557f8db8613a6bcc7 (patch)
treee1c347c9b18cad1a979dda026a1dff6f310d8977
parentf7e1cb6ec51b041335b5ad4dd7aefb37a56d79a6 (diff)
mm: memcontrol: hook up vmpressure to socket pressure
Let the networking stack know when a memcg is under reclaim pressure so that it can clamp its transmit windows accordingly. Whenever the reclaim efficiency of a cgroup's LRU lists drops low enough for a MEDIUM or HIGH vmpressure event to occur, assert a pressure state in the socket and tcp memory code that tells it to curb consumption growth from sockets associated with said control group. Traditionally, vmpressure reports for the entire subtree of a memcg under pressure, which drops useful information on the individual groups reclaimed. However, it's too late to change the user interface, so add a second reporting mode that reports on the level of reclaim instead of at the level of pressure, and use that report for sockets. vmpressure events are naturally edge-triggered, so for hysteresis assert socket pressure for a second to allow for subsequent vmpressure events to occur before letting the socket code return to normal. This will likely need fine-tuning for a wider variety of workloads, but for now stick to the vmpressure presets and keep hysteresis simple. Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Acked-by: David S. Miller <davem@davemloft.net> Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/memcontrol.h32
-rw-r--r--include/linux/vmpressure.h7
-rw-r--r--mm/memcontrol.c17
-rw-r--r--mm/vmpressure.c78
-rw-r--r--mm/vmscan.c10
5 files changed, 104 insertions, 40 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a355f61a2ed3..c5a51039df57 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -249,6 +249,10 @@ struct mem_cgroup {
249 struct wb_domain cgwb_domain; 249 struct wb_domain cgwb_domain;
250#endif 250#endif
251 251
252#ifdef CONFIG_INET
253 unsigned long socket_pressure;
254#endif
255
252 /* List of events which userspace want to receive */ 256 /* List of events which userspace want to receive */
253 struct list_head event_list; 257 struct list_head event_list;
254 spinlock_t event_list_lock; 258 spinlock_t event_list_lock;
@@ -290,18 +294,34 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
290 294
291bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); 295bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
292struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); 296struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
293struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
294 297
295static inline 298static inline
296struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ 299struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
297 return css ? container_of(css, struct mem_cgroup, css) : NULL; 300 return css ? container_of(css, struct mem_cgroup, css) : NULL;
298} 301}
299 302
303#define mem_cgroup_from_counter(counter, member) \
304 container_of(counter, struct mem_cgroup, member)
305
300struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, 306struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
301 struct mem_cgroup *, 307 struct mem_cgroup *,
302 struct mem_cgroup_reclaim_cookie *); 308 struct mem_cgroup_reclaim_cookie *);
303void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); 309void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
304 310
311/**
312 * parent_mem_cgroup - find the accounting parent of a memcg
313 * @memcg: memcg whose parent to find
314 *
315 * Returns the parent memcg, or NULL if this is the root or the memory
316 * controller is in legacy no-hierarchy mode.
317 */
318static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
319{
320 if (!memcg->memory.parent)
321 return NULL;
322 return mem_cgroup_from_counter(memcg->memory.parent, memory);
323}
324
305static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, 325static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
306 struct mem_cgroup *root) 326 struct mem_cgroup *root)
307{ 327{
@@ -689,10 +709,14 @@ extern struct static_key memcg_sockets_enabled_key;
689static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) 709static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
690{ 710{
691#ifdef CONFIG_MEMCG_KMEM 711#ifdef CONFIG_MEMCG_KMEM
692 return memcg->tcp_mem.memory_pressure; 712 if (memcg->tcp_mem.memory_pressure)
693#else 713 return true;
694 return false;
695#endif 714#endif
715 do {
716 if (time_before(jiffies, memcg->socket_pressure))
717 return true;
718 } while ((memcg = parent_mem_cgroup(memcg)));
719 return false;
696} 720}
697#else 721#else
698#define mem_cgroup_sockets_enabled 0 722#define mem_cgroup_sockets_enabled 0
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 3e4535876d37..3347cc3ec0ab 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -12,6 +12,9 @@
12struct vmpressure { 12struct vmpressure {
13 unsigned long scanned; 13 unsigned long scanned;
14 unsigned long reclaimed; 14 unsigned long reclaimed;
15
16 unsigned long tree_scanned;
17 unsigned long tree_reclaimed;
15 /* The lock is used to keep the scanned/reclaimed above in sync. */ 18 /* The lock is used to keep the scanned/reclaimed above in sync. */
16 struct spinlock sr_lock; 19 struct spinlock sr_lock;
17 20
@@ -26,7 +29,7 @@ struct vmpressure {
26struct mem_cgroup; 29struct mem_cgroup;
27 30
28#ifdef CONFIG_MEMCG 31#ifdef CONFIG_MEMCG
29extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, 32extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
30 unsigned long scanned, unsigned long reclaimed); 33 unsigned long scanned, unsigned long reclaimed);
31extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); 34extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
32 35
@@ -40,7 +43,7 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg,
40extern void vmpressure_unregister_event(struct mem_cgroup *memcg, 43extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
41 struct eventfd_ctx *eventfd); 44 struct eventfd_ctx *eventfd);
42#else 45#else
43static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, 46static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
44 unsigned long scanned, unsigned long reclaimed) {} 47 unsigned long scanned, unsigned long reclaimed) {}
45static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, 48static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
46 int prio) {} 49 int prio) {}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 60ebc486c2aa..df7f144a5a4b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1113,9 +1113,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1113 return ret; 1113 return ret;
1114} 1114}
1115 1115
1116#define mem_cgroup_from_counter(counter, member) \
1117 container_of(counter, struct mem_cgroup, member)
1118
1119/** 1116/**
1120 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1117 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1121 * @memcg: the memory cgroup 1118 * @memcg: the memory cgroup
@@ -4183,17 +4180,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4183 kfree(memcg); 4180 kfree(memcg);
4184} 4181}
4185 4182
4186/*
4187 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
4188 */
4189struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4190{
4191 if (!memcg->memory.parent)
4192 return NULL;
4193 return mem_cgroup_from_counter(memcg->memory.parent, memory);
4194}
4195EXPORT_SYMBOL(parent_mem_cgroup);
4196
4197static struct cgroup_subsys_state * __ref 4183static struct cgroup_subsys_state * __ref
4198mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 4184mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4199{ 4185{
@@ -4234,6 +4220,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4234#ifdef CONFIG_CGROUP_WRITEBACK 4220#ifdef CONFIG_CGROUP_WRITEBACK
4235 INIT_LIST_HEAD(&memcg->cgwb_list); 4221 INIT_LIST_HEAD(&memcg->cgwb_list);
4236#endif 4222#endif
4223#ifdef CONFIG_INET
4224 memcg->socket_pressure = jiffies;
4225#endif
4237 return &memcg->css; 4226 return &memcg->css;
4238 4227
4239free_out: 4228free_out:
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index c5afd573d7da..506f03e4be47 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -137,14 +137,11 @@ struct vmpressure_event {
137}; 137};
138 138
139static bool vmpressure_event(struct vmpressure *vmpr, 139static bool vmpressure_event(struct vmpressure *vmpr,
140 unsigned long scanned, unsigned long reclaimed) 140 enum vmpressure_levels level)
141{ 141{
142 struct vmpressure_event *ev; 142 struct vmpressure_event *ev;
143 enum vmpressure_levels level;
144 bool signalled = false; 143 bool signalled = false;
145 144
146 level = vmpressure_calc_level(scanned, reclaimed);
147
148 mutex_lock(&vmpr->events_lock); 145 mutex_lock(&vmpr->events_lock);
149 146
150 list_for_each_entry(ev, &vmpr->events, node) { 147 list_for_each_entry(ev, &vmpr->events, node) {
@@ -164,6 +161,7 @@ static void vmpressure_work_fn(struct work_struct *work)
164 struct vmpressure *vmpr = work_to_vmpressure(work); 161 struct vmpressure *vmpr = work_to_vmpressure(work);
165 unsigned long scanned; 162 unsigned long scanned;
166 unsigned long reclaimed; 163 unsigned long reclaimed;
164 enum vmpressure_levels level;
167 165
168 spin_lock(&vmpr->sr_lock); 166 spin_lock(&vmpr->sr_lock);
169 /* 167 /*
@@ -174,19 +172,21 @@ static void vmpressure_work_fn(struct work_struct *work)
174 * here. No need for any locks here since we don't care if 172 * here. No need for any locks here since we don't care if
175 * vmpr->reclaimed is in sync. 173 * vmpr->reclaimed is in sync.
176 */ 174 */
177 scanned = vmpr->scanned; 175 scanned = vmpr->tree_scanned;
178 if (!scanned) { 176 if (!scanned) {
179 spin_unlock(&vmpr->sr_lock); 177 spin_unlock(&vmpr->sr_lock);
180 return; 178 return;
181 } 179 }
182 180
183 reclaimed = vmpr->reclaimed; 181 reclaimed = vmpr->tree_reclaimed;
184 vmpr->scanned = 0; 182 vmpr->tree_scanned = 0;
185 vmpr->reclaimed = 0; 183 vmpr->tree_reclaimed = 0;
186 spin_unlock(&vmpr->sr_lock); 184 spin_unlock(&vmpr->sr_lock);
187 185
186 level = vmpressure_calc_level(scanned, reclaimed);
187
188 do { 188 do {
189 if (vmpressure_event(vmpr, scanned, reclaimed)) 189 if (vmpressure_event(vmpr, level))
190 break; 190 break;
191 /* 191 /*
192 * If not handled, propagate the event upward into the 192 * If not handled, propagate the event upward into the
@@ -199,6 +199,7 @@ static void vmpressure_work_fn(struct work_struct *work)
199 * vmpressure() - Account memory pressure through scanned/reclaimed ratio 199 * vmpressure() - Account memory pressure through scanned/reclaimed ratio
200 * @gfp: reclaimer's gfp mask 200 * @gfp: reclaimer's gfp mask
201 * @memcg: cgroup memory controller handle 201 * @memcg: cgroup memory controller handle
202 * @tree: legacy subtree mode
202 * @scanned: number of pages scanned 203 * @scanned: number of pages scanned
203 * @reclaimed: number of pages reclaimed 204 * @reclaimed: number of pages reclaimed
204 * 205 *
@@ -206,9 +207,16 @@ static void vmpressure_work_fn(struct work_struct *work)
206 * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw 207 * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
207 * pressure index is then further refined and averaged over time. 208 * pressure index is then further refined and averaged over time.
208 * 209 *
210 * If @tree is set, vmpressure is in traditional userspace reporting
211 * mode: @memcg is considered the pressure root and userspace is
212 * notified of the entire subtree's reclaim efficiency.
213 *
214 * If @tree is not set, reclaim efficiency is recorded for @memcg, and
215 * only in-kernel users are notified.
216 *
209 * This function does not return any value. 217 * This function does not return any value.
210 */ 218 */
211void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, 219void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
212 unsigned long scanned, unsigned long reclaimed) 220 unsigned long scanned, unsigned long reclaimed)
213{ 221{
214 struct vmpressure *vmpr = memcg_to_vmpressure(memcg); 222 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
@@ -238,15 +246,47 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
238 if (!scanned) 246 if (!scanned)
239 return; 247 return;
240 248
241 spin_lock(&vmpr->sr_lock); 249 if (tree) {
242 vmpr->scanned += scanned; 250 spin_lock(&vmpr->sr_lock);
243 vmpr->reclaimed += reclaimed; 251 vmpr->tree_scanned += scanned;
244 scanned = vmpr->scanned; 252 vmpr->tree_reclaimed += reclaimed;
245 spin_unlock(&vmpr->sr_lock); 253 scanned = vmpr->scanned;
254 spin_unlock(&vmpr->sr_lock);
246 255
247 if (scanned < vmpressure_win) 256 if (scanned < vmpressure_win)
248 return; 257 return;
249 schedule_work(&vmpr->work); 258 schedule_work(&vmpr->work);
259 } else {
260 enum vmpressure_levels level;
261
262 /* For now, no users for root-level efficiency */
263 if (memcg == root_mem_cgroup)
264 return;
265
266 spin_lock(&vmpr->sr_lock);
267 scanned = vmpr->scanned += scanned;
268 reclaimed = vmpr->reclaimed += reclaimed;
269 if (scanned < vmpressure_win) {
270 spin_unlock(&vmpr->sr_lock);
271 return;
272 }
273 vmpr->scanned = vmpr->reclaimed = 0;
274 spin_unlock(&vmpr->sr_lock);
275
276 level = vmpressure_calc_level(scanned, reclaimed);
277
278 if (level > VMPRESSURE_LOW) {
279 /*
280 * Let the socket buffer allocator know that
281 * we are having trouble reclaiming LRU pages.
282 *
283 * For hysteresis keep the pressure state
284 * asserted for a second in which subsequent
285 * pressure events can occur.
286 */
287 memcg->socket_pressure = jiffies + HZ;
288 }
289 }
250} 290}
251 291
252/** 292/**
@@ -276,7 +316,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
276 * to the vmpressure() basically means that we signal 'critical' 316 * to the vmpressure() basically means that we signal 'critical'
277 * level. 317 * level.
278 */ 318 */
279 vmpressure(gfp, memcg, vmpressure_win, 0); 319 vmpressure(gfp, memcg, true, vmpressure_win, 0);
280} 320}
281 321
282/** 322/**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e36d766dade9..bb0cbd4c9f01 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2396,6 +2396,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2396 memcg = mem_cgroup_iter(root, NULL, &reclaim); 2396 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2397 do { 2397 do {
2398 unsigned long lru_pages; 2398 unsigned long lru_pages;
2399 unsigned long reclaimed;
2399 unsigned long scanned; 2400 unsigned long scanned;
2400 struct lruvec *lruvec; 2401 struct lruvec *lruvec;
2401 int swappiness; 2402 int swappiness;
@@ -2408,6 +2409,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2408 2409
2409 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2410 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2410 swappiness = mem_cgroup_swappiness(memcg); 2411 swappiness = mem_cgroup_swappiness(memcg);
2412 reclaimed = sc->nr_reclaimed;
2411 scanned = sc->nr_scanned; 2413 scanned = sc->nr_scanned;
2412 2414
2413 shrink_lruvec(lruvec, swappiness, sc, &lru_pages); 2415 shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
@@ -2418,6 +2420,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2418 memcg, sc->nr_scanned - scanned, 2420 memcg, sc->nr_scanned - scanned,
2419 lru_pages); 2421 lru_pages);
2420 2422
2423 /* Record the group's reclaim efficiency */
2424 vmpressure(sc->gfp_mask, memcg, false,
2425 sc->nr_scanned - scanned,
2426 sc->nr_reclaimed - reclaimed);
2427
2421 /* 2428 /*
2422 * Direct reclaim and kswapd have to scan all memory 2429 * Direct reclaim and kswapd have to scan all memory
2423 * cgroups to fulfill the overall scan target for the 2430 * cgroups to fulfill the overall scan target for the
@@ -2449,7 +2456,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2449 reclaim_state->reclaimed_slab = 0; 2456 reclaim_state->reclaimed_slab = 0;
2450 } 2457 }
2451 2458
2452 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, 2459 /* Record the subtree's reclaim efficiency */
2460 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
2453 sc->nr_scanned - nr_scanned, 2461 sc->nr_scanned - nr_scanned,
2454 sc->nr_reclaimed - nr_reclaimed); 2462 sc->nr_reclaimed - nr_reclaimed);
2455 2463