 include/linux/memcontrol.h | 32
 include/linux/vmpressure.h |  7
 mm/memcontrol.c            | 17
 mm/vmpressure.c            | 78
 mm/vmscan.c                | 10
 5 files changed, 104 insertions(+), 40 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index a355f61a2ed3..c5a51039df57 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -249,6 +249,10 @@ struct mem_cgroup {
         struct wb_domain cgwb_domain;
 #endif
 
+#ifdef CONFIG_INET
+        unsigned long socket_pressure;
+#endif
+
         /* List of events which userspace want to receive */
         struct list_head event_list;
         spinlock_t event_list_lock;
@@ -290,18 +294,34 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
 
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
-struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
 
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
         return css ? container_of(css, struct mem_cgroup, css) : NULL;
 }
 
+#define mem_cgroup_from_counter(counter, member) \
+        container_of(counter, struct mem_cgroup, member)
+
 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
                                    struct mem_cgroup *,
                                    struct mem_cgroup_reclaim_cookie *);
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
 
+/**
+ * parent_mem_cgroup - find the accounting parent of a memcg
+ * @memcg: memcg whose parent to find
+ *
+ * Returns the parent memcg, or NULL if this is the root or the memory
+ * controller is in legacy no-hierarchy mode.
+ */
+static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
+{
+        if (!memcg->memory.parent)
+                return NULL;
+        return mem_cgroup_from_counter(memcg->memory.parent, memory);
+}
+
 static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
                                 struct mem_cgroup *root)
 {
@@ -689,10 +709,14 @@ extern struct static_key memcg_sockets_enabled_key;
 static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 {
 #ifdef CONFIG_MEMCG_KMEM
-        return memcg->tcp_mem.memory_pressure;
-#else
-        return false;
+        if (memcg->tcp_mem.memory_pressure)
+                return true;
 #endif
+        do {
+                if (time_before(jiffies, memcg->socket_pressure))
+                        return true;
+        } while ((memcg = parent_mem_cgroup(memcg)));
+        return false;
 }
 #else
 #define mem_cgroup_sockets_enabled 0
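
The rewritten mem_cgroup_under_socket_pressure() above makes socket pressure hierarchical: a memcg counts as pressured if it, or any ancestor, still holds an unexpired socket_pressure deadline. Below is a minimal userspace sketch of that walk -- struct memcg and the "now" counter are stand-ins, not the kernel's structures; time_before() is re-derived from its wraparound-safe definition in include/linux/jiffies.h.

#include <stdbool.h>
#include <stdio.h>

/* wraparound-safe "a is before b", as in include/linux/jiffies.h */
#define time_before(a, b) ((long)((a) - (b)) < 0)

struct memcg {
        struct memcg *parent;           /* NULL at the root */
        unsigned long socket_pressure;  /* deadline, in fake jiffies */
};

static bool under_socket_pressure(struct memcg *memcg, unsigned long now)
{
        /* any pressured ancestor throttles the whole subtree */
        do {
                if (time_before(now, memcg->socket_pressure))
                        return true;
        } while ((memcg = memcg->parent));
        return false;
}

int main(void)
{
        struct memcg root = { NULL, 0 };
        struct memcg child = { &root, 0 };

        root.socket_pressure = 1100;    /* pressured until "jiffy" 1100 */
        printf("%d\n", under_socket_pressure(&child, 1000)); /* 1 */
        printf("%d\n", under_socket_pressure(&child, 1200)); /* 0 */
        return 0;
}

The signed subtraction inside time_before() is what keeps the comparison correct when the jiffies counter wraps.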
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index 3e4535876d37..3347cc3ec0ab 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -12,6 +12,9 @@
 struct vmpressure {
         unsigned long scanned;
         unsigned long reclaimed;
+
+        unsigned long tree_scanned;
+        unsigned long tree_reclaimed;
         /* The lock is used to keep the scanned/reclaimed above in sync. */
         struct spinlock sr_lock;
 
@@ -26,7 +29,7 @@ struct vmpressure {
 struct mem_cgroup;
 
 #ifdef CONFIG_MEMCG
-extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
                        unsigned long scanned, unsigned long reclaimed);
 extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
 
@@ -40,7 +43,7 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg,
 extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
                                         struct eventfd_ctx *eventfd);
 #else
-static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
                               unsigned long scanned, unsigned long reclaimed) {}
 static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
                                    int prio) {}
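
With the new bool tree parameter, struct vmpressure carries two counter pairs: tree_scanned/tree_reclaimed feed the asynchronous worker that notifies userspace about the whole subtree, while the existing scanned/reclaimed pair is drained synchronously for in-kernel consumers. A toy model of the synchronous (tree == false) windowing follows; the 512-page window is an assumption mirroring vmpressure_win (SWAP_CLUSTER_MAX << 4) in mm/vmpressure.c.

#include <stdio.h>

#define WIN 512 /* assumption: vmpressure_win = SWAP_CLUSTER_MAX << 4 */

struct vmpr {
        unsigned long scanned, reclaimed;           /* local, drained in place */
        unsigned long tree_scanned, tree_reclaimed; /* subtree, drained by worker */
};

/* tree == false path: accumulate locally, report once per full window */
static int account_local(struct vmpr *v, unsigned long s, unsigned long r)
{
        v->scanned += s;
        v->reclaimed += r;
        if (v->scanned < WIN)
                return 0;               /* window not full yet, no event */
        v->scanned = v->reclaimed = 0;  /* drain synchronously */
        return 1;                       /* caller computes a level now */
}

int main(void)
{
        struct vmpr v = { 0, 0, 0, 0 };
        printf("%d\n", account_local(&v, 300, 300)); /* 0: below window */
        printf("%d\n", account_local(&v, 300, 100)); /* 1: window crossed */
        return 0;
}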
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 60ebc486c2aa..df7f144a5a4b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1113,9 +1113,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
         return ret;
 }
 
-#define mem_cgroup_from_counter(counter, member) \
-        container_of(counter, struct mem_cgroup, member)
-
 /**
  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
  * @memcg: the memory cgroup
@@ -4183,17 +4180,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
         kfree(memcg);
 }
 
-/*
- * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
- */
-struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
-{
-        if (!memcg->memory.parent)
-                return NULL;
-        return mem_cgroup_from_counter(memcg->memory.parent, memory);
-}
-EXPORT_SYMBOL(parent_mem_cgroup);
-
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -4234,6 +4220,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 #ifdef CONFIG_CGROUP_WRITEBACK
         INIT_LIST_HEAD(&memcg->cgwb_list);
 #endif
+#ifdef CONFIG_INET
+        memcg->socket_pressure = jiffies;
+#endif
         return &memcg->css;
 
 free_out:
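
parent_mem_cgroup() and mem_cgroup_from_counter() move out of mm/memcontrol.c into the header because the inline socket-pressure check needs them; both hinge on container_of(), which recovers the enclosing mem_cgroup from a pointer to its embedded page_counter. A self-contained sketch with hypothetical stand-in types:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* stand-ins for struct page_counter / struct mem_cgroup */
struct page_counter { unsigned long usage; struct page_counter *parent; };
struct memcg { int id; struct page_counter memory; };

#define memcg_from_counter(counter, member) \
        container_of(counter, struct memcg, member)

int main(void)
{
        struct memcg parent = { .id = 1 };
        struct memcg child  = { .id = 2,
                                .memory = { .parent = &parent.memory } };

        /* parent_mem_cgroup() in the new header code does exactly this walk */
        struct memcg *p = memcg_from_counter(child.memory.parent, memory);
        printf("parent id = %d\n", p->id); /* 1 */
        return 0;
}

Note also that memcg->socket_pressure is initialized to the current jiffies here, so a fresh cgroup starts with an already-expired deadline and reports no pressure.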
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index c5afd573d7da..506f03e4be47 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -137,14 +137,11 @@ struct vmpressure_event {
 };
 
 static bool vmpressure_event(struct vmpressure *vmpr,
-                             unsigned long scanned, unsigned long reclaimed)
+                             enum vmpressure_levels level)
 {
         struct vmpressure_event *ev;
-        enum vmpressure_levels level;
         bool signalled = false;
 
-        level = vmpressure_calc_level(scanned, reclaimed);
-
         mutex_lock(&vmpr->events_lock);
 
         list_for_each_entry(ev, &vmpr->events, node) {
@@ -164,6 +161,7 @@ static void vmpressure_work_fn(struct work_struct *work)
         struct vmpressure *vmpr = work_to_vmpressure(work);
         unsigned long scanned;
         unsigned long reclaimed;
+        enum vmpressure_levels level;
 
         spin_lock(&vmpr->sr_lock);
         /*
@@ -174,19 +172,21 @@ static void vmpressure_work_fn(struct work_struct *work)
          * here. No need for any locks here since we don't care if
          * vmpr->reclaimed is in sync.
          */
-        scanned = vmpr->scanned;
+        scanned = vmpr->tree_scanned;
         if (!scanned) {
                 spin_unlock(&vmpr->sr_lock);
                 return;
         }
 
-        reclaimed = vmpr->reclaimed;
-        vmpr->scanned = 0;
-        vmpr->reclaimed = 0;
+        reclaimed = vmpr->tree_reclaimed;
+        vmpr->tree_scanned = 0;
+        vmpr->tree_reclaimed = 0;
         spin_unlock(&vmpr->sr_lock);
 
+        level = vmpressure_calc_level(scanned, reclaimed);
+
         do {
-                if (vmpressure_event(vmpr, scanned, reclaimed))
+                if (vmpressure_event(vmpr, level))
                         break;
                 /*
                  * If not handled, propagate the event upward into the
@@ -199,6 +199,7 @@ static void vmpressure_work_fn(struct work_struct *work)
  * vmpressure() - Account memory pressure through scanned/reclaimed ratio
  * @gfp: reclaimer's gfp mask
  * @memcg: cgroup memory controller handle
+ * @tree: legacy subtree mode
  * @scanned: number of pages scanned
  * @reclaimed: number of pages reclaimed
  *
@@ -206,9 +207,16 @@ static void vmpressure_work_fn(struct work_struct *work)
  * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
  * pressure index is then further refined and averaged over time.
  *
+ * If @tree is set, vmpressure is in traditional userspace reporting
+ * mode: @memcg is considered the pressure root and userspace is
+ * notified of the entire subtree's reclaim efficiency.
+ *
+ * If @tree is not set, reclaim efficiency is recorded for @memcg, and
+ * only in-kernel users are notified.
+ *
  * This function does not return any value.
  */
-void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
                 unsigned long scanned, unsigned long reclaimed)
 {
         struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
@@ -238,15 +246,47 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
         if (!scanned)
                 return;
 
-        spin_lock(&vmpr->sr_lock);
-        vmpr->scanned += scanned;
-        vmpr->reclaimed += reclaimed;
-        scanned = vmpr->scanned;
-        spin_unlock(&vmpr->sr_lock);
+        if (tree) {
+                spin_lock(&vmpr->sr_lock);
+                vmpr->tree_scanned += scanned;
+                vmpr->tree_reclaimed += reclaimed;
+                scanned = vmpr->scanned;
+                spin_unlock(&vmpr->sr_lock);
 
-        if (scanned < vmpressure_win)
-                return;
-        schedule_work(&vmpr->work);
+                if (scanned < vmpressure_win)
+                        return;
+                schedule_work(&vmpr->work);
+        } else {
+                enum vmpressure_levels level;
+
+                /* For now, no users for root-level efficiency */
+                if (memcg == root_mem_cgroup)
+                        return;
+
+                spin_lock(&vmpr->sr_lock);
+                scanned = vmpr->scanned += scanned;
+                reclaimed = vmpr->reclaimed += reclaimed;
+                if (scanned < vmpressure_win) {
+                        spin_unlock(&vmpr->sr_lock);
+                        return;
+                }
+                vmpr->scanned = vmpr->reclaimed = 0;
+                spin_unlock(&vmpr->sr_lock);
+
+                level = vmpressure_calc_level(scanned, reclaimed);
+
+                if (level > VMPRESSURE_LOW) {
+                        /*
+                         * Let the socket buffer allocator know that
+                         * we are having trouble reclaiming LRU pages.
+                         *
+                         * For hysteresis keep the pressure state
+                         * asserted for a second in which subsequent
+                         * pressure events can occur.
+                         */
+                        memcg->socket_pressure = jiffies + HZ;
+                }
+        }
 }
 
 /**
@@ -276,7 +316,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
          * to the vmpressure() basically means that we signal 'critical'
          * level.
          */
-        vmpressure(gfp, memcg, vmpressure_win, 0);
+        vmpressure(gfp, memcg, true, vmpressure_win, 0);
 }
 
 /**
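
Only a level above VMPRESSURE_LOW arms memcg->socket_pressure, and the jiffies + HZ deadline keeps the state asserted for roughly a second of hysteresis. The sketch below models the level computation under the assumption that the thresholds match mm/vmpressure.c's defaults of the era (medium at 60%, critical at 95%), with pressure taken as the share of scanned pages that went unreclaimed:

#include <stdio.h>

enum level { LOW, MEDIUM, CRITICAL };

/* assumed thresholds, mirroring vmpressure_level_med/_critical */
#define LEVEL_MED       60
#define LEVEL_CRITICAL  95

static enum level calc_level(unsigned long scanned, unsigned long reclaimed)
{
        unsigned long pressure;

        if (reclaimed >= scanned)       /* clamp; avoids unsigned underflow */
                return LOW;
        pressure = 100 * (scanned - reclaimed) / scanned;
        if (pressure >= LEVEL_CRITICAL)
                return CRITICAL;
        if (pressure >= LEVEL_MED)
                return MEDIUM;
        return LOW;
}

int main(void)
{
        /* only MEDIUM and above sets memcg->socket_pressure in the hunk above */
        printf("%d\n", calc_level(512, 400)); /* 0: LOW, ~21% unreclaimed */
        printf("%d\n", calc_level(512, 100)); /* 1: MEDIUM, ~80% */
        printf("%d\n", calc_level(512, 0));   /* 2: CRITICAL, 100% */
        return 0;
}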
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e36d766dade9..bb0cbd4c9f01 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2396,6 +2396,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
                 memcg = mem_cgroup_iter(root, NULL, &reclaim);
                 do {
                         unsigned long lru_pages;
+                        unsigned long reclaimed;
                         unsigned long scanned;
                         struct lruvec *lruvec;
                         int swappiness;
@@ -2408,6 +2409,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
 
                         lruvec = mem_cgroup_zone_lruvec(zone, memcg);
                         swappiness = mem_cgroup_swappiness(memcg);
+                        reclaimed = sc->nr_reclaimed;
                         scanned = sc->nr_scanned;
 
                         shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
@@ -2418,6 +2420,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
                                     memcg, sc->nr_scanned - scanned,
                                     lru_pages);
 
+                        /* Record the group's reclaim efficiency */
+                        vmpressure(sc->gfp_mask, memcg, false,
+                                   sc->nr_scanned - scanned,
+                                   sc->nr_reclaimed - reclaimed);
+
                         /*
                          * Direct reclaim and kswapd have to scan all memory
                          * cgroups to fulfill the overall scan target for the
@@ -2449,7 +2456,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
                         reclaim_state->reclaimed_slab = 0;
                 }
 
-                vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
+                /* Record the subtree's reclaim efficiency */
+                vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
                            sc->nr_scanned - nr_scanned,
                            sc->nr_reclaimed - nr_reclaimed);
 
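
Both vmpressure() call sites measure reclaim efficiency as deltas of the running totals in struct scan_control: a per-memcg snapshot taken just before shrink_lruvec(), and the pre-existing subtree-wide snapshot taken around the whole memcg iteration. A toy model of that snapshot/delta pattern, with a stand-in shrink step:

#include <stdio.h>

struct scan_control { unsigned long nr_scanned, nr_reclaimed; };

/* stand-in for shrink_lruvec(): pretend each group scans 64, reclaims 16 */
static void shrink_one(struct scan_control *sc)
{
        sc->nr_scanned += 64;
        sc->nr_reclaimed += 16;
}

int main(void)
{
        struct scan_control sc = { 0, 0 };
        unsigned long nr_scanned = sc.nr_scanned;       /* subtree snapshot */
        unsigned long nr_reclaimed = sc.nr_reclaimed;

        for (int memcg = 0; memcg < 2; memcg++) {
                unsigned long scanned = sc.nr_scanned;  /* per-memcg snapshot */
                unsigned long reclaimed = sc.nr_reclaimed;

                shrink_one(&sc);

                /* vmpressure(gfp, memcg, false, ...) gets these deltas */
                printf("memcg %d: scanned %lu reclaimed %lu\n", memcg,
                       sc.nr_scanned - scanned, sc.nr_reclaimed - reclaimed);
        }

        /* vmpressure(gfp, target, true, ...) gets the subtree deltas */
        printf("subtree: scanned %lu reclaimed %lu\n",
               sc.nr_scanned - nr_scanned, sc.nr_reclaimed - nr_reclaimed);
        return 0;
}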