diff options
-rw-r--r-- | include/linux/memcontrol.h | 32 | ||||
-rw-r--r-- | include/linux/vmpressure.h | 7 | ||||
-rw-r--r-- | mm/memcontrol.c | 17 | ||||
-rw-r--r-- | mm/vmpressure.c | 78 | ||||
-rw-r--r-- | mm/vmscan.c | 10 |
5 files changed, 104 insertions, 40 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a355f61a2ed3..c5a51039df57 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -249,6 +249,10 @@ struct mem_cgroup { | |||
249 | struct wb_domain cgwb_domain; | 249 | struct wb_domain cgwb_domain; |
250 | #endif | 250 | #endif |
251 | 251 | ||
252 | #ifdef CONFIG_INET | ||
253 | unsigned long socket_pressure; | ||
254 | #endif | ||
255 | |||
252 | /* List of events which userspace want to receive */ | 256 | /* List of events which userspace want to receive */ |
253 | struct list_head event_list; | 257 | struct list_head event_list; |
254 | spinlock_t event_list_lock; | 258 | spinlock_t event_list_lock; |
@@ -290,18 +294,34 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); | |||
290 | 294 | ||
291 | bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); | 295 | bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); |
292 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); | 296 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); |
293 | struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); | ||
294 | 297 | ||
295 | static inline | 298 | static inline |
296 | struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ | 299 | struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ |
297 | return css ? container_of(css, struct mem_cgroup, css) : NULL; | 300 | return css ? container_of(css, struct mem_cgroup, css) : NULL; |
298 | } | 301 | } |
299 | 302 | ||
303 | #define mem_cgroup_from_counter(counter, member) \ | ||
304 | container_of(counter, struct mem_cgroup, member) | ||
305 | |||
300 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, | 306 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, |
301 | struct mem_cgroup *, | 307 | struct mem_cgroup *, |
302 | struct mem_cgroup_reclaim_cookie *); | 308 | struct mem_cgroup_reclaim_cookie *); |
303 | void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); | 309 | void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); |
304 | 310 | ||
311 | /** | ||
312 | * parent_mem_cgroup - find the accounting parent of a memcg | ||
313 | * @memcg: memcg whose parent to find | ||
314 | * | ||
315 | * Returns the parent memcg, or NULL if this is the root or the memory | ||
316 | * controller is in legacy no-hierarchy mode. | ||
317 | */ | ||
318 | static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | ||
319 | { | ||
320 | if (!memcg->memory.parent) | ||
321 | return NULL; | ||
322 | return mem_cgroup_from_counter(memcg->memory.parent, memory); | ||
323 | } | ||
324 | |||
305 | static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, | 325 | static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, |
306 | struct mem_cgroup *root) | 326 | struct mem_cgroup *root) |
307 | { | 327 | { |
@@ -689,10 +709,14 @@ extern struct static_key memcg_sockets_enabled_key; | |||
689 | static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) | 709 | static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) |
690 | { | 710 | { |
691 | #ifdef CONFIG_MEMCG_KMEM | 711 | #ifdef CONFIG_MEMCG_KMEM |
692 | return memcg->tcp_mem.memory_pressure; | 712 | if (memcg->tcp_mem.memory_pressure) |
693 | #else | 713 | return true; |
694 | return false; | ||
695 | #endif | 714 | #endif |
715 | do { | ||
716 | if (time_before(jiffies, memcg->socket_pressure)) | ||
717 | return true; | ||
718 | } while ((memcg = parent_mem_cgroup(memcg))); | ||
719 | return false; | ||
696 | } | 720 | } |
697 | #else | 721 | #else |
698 | #define mem_cgroup_sockets_enabled 0 | 722 | #define mem_cgroup_sockets_enabled 0 |
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 3e4535876d37..3347cc3ec0ab 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h | |||
@@ -12,6 +12,9 @@ | |||
12 | struct vmpressure { | 12 | struct vmpressure { |
13 | unsigned long scanned; | 13 | unsigned long scanned; |
14 | unsigned long reclaimed; | 14 | unsigned long reclaimed; |
15 | |||
16 | unsigned long tree_scanned; | ||
17 | unsigned long tree_reclaimed; | ||
15 | /* The lock is used to keep the scanned/reclaimed above in sync. */ | 18 | /* The lock is used to keep the scanned/reclaimed above in sync. */ |
16 | struct spinlock sr_lock; | 19 | struct spinlock sr_lock; |
17 | 20 | ||
@@ -26,7 +29,7 @@ struct vmpressure { | |||
26 | struct mem_cgroup; | 29 | struct mem_cgroup; |
27 | 30 | ||
28 | #ifdef CONFIG_MEMCG | 31 | #ifdef CONFIG_MEMCG |
29 | extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, | 32 | extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, |
30 | unsigned long scanned, unsigned long reclaimed); | 33 | unsigned long scanned, unsigned long reclaimed); |
31 | extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); | 34 | extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); |
32 | 35 | ||
@@ -40,7 +43,7 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg, | |||
40 | extern void vmpressure_unregister_event(struct mem_cgroup *memcg, | 43 | extern void vmpressure_unregister_event(struct mem_cgroup *memcg, |
41 | struct eventfd_ctx *eventfd); | 44 | struct eventfd_ctx *eventfd); |
42 | #else | 45 | #else |
43 | static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, | 46 | static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, |
44 | unsigned long scanned, unsigned long reclaimed) {} | 47 | unsigned long scanned, unsigned long reclaimed) {} |
45 | static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, | 48 | static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, |
46 | int prio) {} | 49 | int prio) {} |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 60ebc486c2aa..df7f144a5a4b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -1113,9 +1113,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) | |||
1113 | return ret; | 1113 | return ret; |
1114 | } | 1114 | } |
1115 | 1115 | ||
1116 | #define mem_cgroup_from_counter(counter, member) \ | ||
1117 | container_of(counter, struct mem_cgroup, member) | ||
1118 | |||
1119 | /** | 1116 | /** |
1120 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup | 1117 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup |
1121 | * @memcg: the memory cgroup | 1118 | * @memcg: the memory cgroup |
@@ -4183,17 +4180,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
4183 | kfree(memcg); | 4180 | kfree(memcg); |
4184 | } | 4181 | } |
4185 | 4182 | ||
4186 | /* | ||
4187 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. | ||
4188 | */ | ||
4189 | struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | ||
4190 | { | ||
4191 | if (!memcg->memory.parent) | ||
4192 | return NULL; | ||
4193 | return mem_cgroup_from_counter(memcg->memory.parent, memory); | ||
4194 | } | ||
4195 | EXPORT_SYMBOL(parent_mem_cgroup); | ||
4196 | |||
4197 | static struct cgroup_subsys_state * __ref | 4183 | static struct cgroup_subsys_state * __ref |
4198 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | 4184 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) |
4199 | { | 4185 | { |
@@ -4234,6 +4220,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
4234 | #ifdef CONFIG_CGROUP_WRITEBACK | 4220 | #ifdef CONFIG_CGROUP_WRITEBACK |
4235 | INIT_LIST_HEAD(&memcg->cgwb_list); | 4221 | INIT_LIST_HEAD(&memcg->cgwb_list); |
4236 | #endif | 4222 | #endif |
4223 | #ifdef CONFIG_INET | ||
4224 | memcg->socket_pressure = jiffies; | ||
4225 | #endif | ||
4237 | return &memcg->css; | 4226 | return &memcg->css; |
4238 | 4227 | ||
4239 | free_out: | 4228 | free_out: |
diff --git a/mm/vmpressure.c b/mm/vmpressure.c index c5afd573d7da..506f03e4be47 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c | |||
@@ -137,14 +137,11 @@ struct vmpressure_event { | |||
137 | }; | 137 | }; |
138 | 138 | ||
139 | static bool vmpressure_event(struct vmpressure *vmpr, | 139 | static bool vmpressure_event(struct vmpressure *vmpr, |
140 | unsigned long scanned, unsigned long reclaimed) | 140 | enum vmpressure_levels level) |
141 | { | 141 | { |
142 | struct vmpressure_event *ev; | 142 | struct vmpressure_event *ev; |
143 | enum vmpressure_levels level; | ||
144 | bool signalled = false; | 143 | bool signalled = false; |
145 | 144 | ||
146 | level = vmpressure_calc_level(scanned, reclaimed); | ||
147 | |||
148 | mutex_lock(&vmpr->events_lock); | 145 | mutex_lock(&vmpr->events_lock); |
149 | 146 | ||
150 | list_for_each_entry(ev, &vmpr->events, node) { | 147 | list_for_each_entry(ev, &vmpr->events, node) { |
@@ -164,6 +161,7 @@ static void vmpressure_work_fn(struct work_struct *work) | |||
164 | struct vmpressure *vmpr = work_to_vmpressure(work); | 161 | struct vmpressure *vmpr = work_to_vmpressure(work); |
165 | unsigned long scanned; | 162 | unsigned long scanned; |
166 | unsigned long reclaimed; | 163 | unsigned long reclaimed; |
164 | enum vmpressure_levels level; | ||
167 | 165 | ||
168 | spin_lock(&vmpr->sr_lock); | 166 | spin_lock(&vmpr->sr_lock); |
169 | /* | 167 | /* |
@@ -174,19 +172,21 @@ static void vmpressure_work_fn(struct work_struct *work) | |||
174 | * here. No need for any locks here since we don't care if | 172 | * here. No need for any locks here since we don't care if |
175 | * vmpr->reclaimed is in sync. | 173 | * vmpr->reclaimed is in sync. |
176 | */ | 174 | */ |
177 | scanned = vmpr->scanned; | 175 | scanned = vmpr->tree_scanned; |
178 | if (!scanned) { | 176 | if (!scanned) { |
179 | spin_unlock(&vmpr->sr_lock); | 177 | spin_unlock(&vmpr->sr_lock); |
180 | return; | 178 | return; |
181 | } | 179 | } |
182 | 180 | ||
183 | reclaimed = vmpr->reclaimed; | 181 | reclaimed = vmpr->tree_reclaimed; |
184 | vmpr->scanned = 0; | 182 | vmpr->tree_scanned = 0; |
185 | vmpr->reclaimed = 0; | 183 | vmpr->tree_reclaimed = 0; |
186 | spin_unlock(&vmpr->sr_lock); | 184 | spin_unlock(&vmpr->sr_lock); |
187 | 185 | ||
186 | level = vmpressure_calc_level(scanned, reclaimed); | ||
187 | |||
188 | do { | 188 | do { |
189 | if (vmpressure_event(vmpr, scanned, reclaimed)) | 189 | if (vmpressure_event(vmpr, level)) |
190 | break; | 190 | break; |
191 | /* | 191 | /* |
192 | * If not handled, propagate the event upward into the | 192 | * If not handled, propagate the event upward into the |
@@ -199,6 +199,7 @@ static void vmpressure_work_fn(struct work_struct *work) | |||
199 | * vmpressure() - Account memory pressure through scanned/reclaimed ratio | 199 | * vmpressure() - Account memory pressure through scanned/reclaimed ratio |
200 | * @gfp: reclaimer's gfp mask | 200 | * @gfp: reclaimer's gfp mask |
201 | * @memcg: cgroup memory controller handle | 201 | * @memcg: cgroup memory controller handle |
202 | * @tree: legacy subtree mode | ||
202 | * @scanned: number of pages scanned | 203 | * @scanned: number of pages scanned |
203 | * @reclaimed: number of pages reclaimed | 204 | * @reclaimed: number of pages reclaimed |
204 | * | 205 | * |
@@ -206,9 +207,16 @@ static void vmpressure_work_fn(struct work_struct *work) | |||
206 | * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw | 207 | * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw |
207 | * pressure index is then further refined and averaged over time. | 208 | * pressure index is then further refined and averaged over time. |
208 | * | 209 | * |
210 | * If @tree is set, vmpressure is in traditional userspace reporting | ||
211 | * mode: @memcg is considered the pressure root and userspace is | ||
212 | * notified of the entire subtree's reclaim efficiency. | ||
213 | * | ||
214 | * If @tree is not set, reclaim efficiency is recorded for @memcg, and | ||
215 | * only in-kernel users are notified. | ||
216 | * | ||
209 | * This function does not return any value. | 217 | * This function does not return any value. |
210 | */ | 218 | */ |
211 | void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, | 219 | void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, |
212 | unsigned long scanned, unsigned long reclaimed) | 220 | unsigned long scanned, unsigned long reclaimed) |
213 | { | 221 | { |
214 | struct vmpressure *vmpr = memcg_to_vmpressure(memcg); | 222 | struct vmpressure *vmpr = memcg_to_vmpressure(memcg); |
@@ -238,15 +246,47 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, | |||
238 | if (!scanned) | 246 | if (!scanned) |
239 | return; | 247 | return; |
240 | 248 | ||
241 | spin_lock(&vmpr->sr_lock); | 249 | if (tree) { |
242 | vmpr->scanned += scanned; | 250 | spin_lock(&vmpr->sr_lock); |
243 | vmpr->reclaimed += reclaimed; | 251 | vmpr->tree_scanned += scanned; |
244 | scanned = vmpr->scanned; | 252 | vmpr->tree_reclaimed += reclaimed; |
245 | spin_unlock(&vmpr->sr_lock); | 253 | scanned = vmpr->tree_scanned; |
254 | spin_unlock(&vmpr->sr_lock); | ||
246 | 255 | ||
247 | if (scanned < vmpressure_win) | 256 | if (scanned < vmpressure_win) |
248 | return; | 257 | return; |
249 | schedule_work(&vmpr->work); | 258 | schedule_work(&vmpr->work); |
259 | } else { | ||
260 | enum vmpressure_levels level; | ||
261 | |||
262 | /* For now, no users for root-level efficiency */ | ||
263 | if (memcg == root_mem_cgroup) | ||
264 | return; | ||
265 | |||
266 | spin_lock(&vmpr->sr_lock); | ||
267 | scanned = vmpr->scanned += scanned; | ||
268 | reclaimed = vmpr->reclaimed += reclaimed; | ||
269 | if (scanned < vmpressure_win) { | ||
270 | spin_unlock(&vmpr->sr_lock); | ||
271 | return; | ||
272 | } | ||
273 | vmpr->scanned = vmpr->reclaimed = 0; | ||
274 | spin_unlock(&vmpr->sr_lock); | ||
275 | |||
276 | level = vmpressure_calc_level(scanned, reclaimed); | ||
277 | |||
278 | if (level > VMPRESSURE_LOW) { | ||
279 | /* | ||
280 | * Let the socket buffer allocator know that | ||
281 | * we are having trouble reclaiming LRU pages. | ||
282 | * | ||
283 | * For hysteresis keep the pressure state | ||
284 | * asserted for a second in which subsequent | ||
285 | * pressure events can occur. | ||
286 | */ | ||
287 | memcg->socket_pressure = jiffies + HZ; | ||
288 | } | ||
289 | } | ||
250 | } | 290 | } |
251 | 291 | ||
252 | /** | 292 | /** |
@@ -276,7 +316,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) | |||
276 | * to the vmpressure() basically means that we signal 'critical' | 316 | * to the vmpressure() basically means that we signal 'critical' |
277 | * level. | 317 | * level. |
278 | */ | 318 | */ |
279 | vmpressure(gfp, memcg, vmpressure_win, 0); | 319 | vmpressure(gfp, memcg, true, vmpressure_win, 0); |
280 | } | 320 | } |
281 | 321 | ||
282 | /** | 322 | /** |
diff --git a/mm/vmscan.c b/mm/vmscan.c index e36d766dade9..bb0cbd4c9f01 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -2396,6 +2396,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
2396 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | 2396 | memcg = mem_cgroup_iter(root, NULL, &reclaim); |
2397 | do { | 2397 | do { |
2398 | unsigned long lru_pages; | 2398 | unsigned long lru_pages; |
2399 | unsigned long reclaimed; | ||
2399 | unsigned long scanned; | 2400 | unsigned long scanned; |
2400 | struct lruvec *lruvec; | 2401 | struct lruvec *lruvec; |
2401 | int swappiness; | 2402 | int swappiness; |
@@ -2408,6 +2409,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
2408 | 2409 | ||
2409 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2410 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2410 | swappiness = mem_cgroup_swappiness(memcg); | 2411 | swappiness = mem_cgroup_swappiness(memcg); |
2412 | reclaimed = sc->nr_reclaimed; | ||
2411 | scanned = sc->nr_scanned; | 2413 | scanned = sc->nr_scanned; |
2412 | 2414 | ||
2413 | shrink_lruvec(lruvec, swappiness, sc, &lru_pages); | 2415 | shrink_lruvec(lruvec, swappiness, sc, &lru_pages); |
@@ -2418,6 +2420,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
2418 | memcg, sc->nr_scanned - scanned, | 2420 | memcg, sc->nr_scanned - scanned, |
2419 | lru_pages); | 2421 | lru_pages); |
2420 | 2422 | ||
2423 | /* Record the group's reclaim efficiency */ | ||
2424 | vmpressure(sc->gfp_mask, memcg, false, | ||
2425 | sc->nr_scanned - scanned, | ||
2426 | sc->nr_reclaimed - reclaimed); | ||
2427 | |||
2421 | /* | 2428 | /* |
2422 | * Direct reclaim and kswapd have to scan all memory | 2429 | * Direct reclaim and kswapd have to scan all memory |
2423 | * cgroups to fulfill the overall scan target for the | 2430 | * cgroups to fulfill the overall scan target for the |
@@ -2449,7 +2456,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
2449 | reclaim_state->reclaimed_slab = 0; | 2456 | reclaim_state->reclaimed_slab = 0; |
2450 | } | 2457 | } |
2451 | 2458 | ||
2452 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, | 2459 | /* Record the subtree's reclaim efficiency */ |
2460 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, | ||
2453 | sc->nr_scanned - nr_scanned, | 2461 | sc->nr_scanned - nr_scanned, |
2454 | sc->nr_reclaimed - nr_reclaimed); | 2462 | sc->nr_reclaimed - nr_reclaimed); |
2455 | 2463 | ||