aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/cgroups/memory.txt70
-rw-r--r--include/linux/vmpressure.h47
-rw-r--r--mm/Makefile2
-rw-r--r--mm/memcontrol.c29
-rw-r--r--mm/vmpressure.c374
-rw-r--r--mm/vmscan.c8
6 files changed, 528 insertions, 2 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 8b8c28b9864c..f336ede58e62 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -40,6 +40,7 @@ Features:
40 - soft limit 40 - soft limit
41 - moving (recharging) account at moving a task is selectable. 41 - moving (recharging) account at moving a task is selectable.
42 - usage threshold notifier 42 - usage threshold notifier
43 - memory pressure notifier
43 - oom-killer disable knob and oom-notifier 44 - oom-killer disable knob and oom-notifier
44 - Root cgroup has no limit controls. 45 - Root cgroup has no limit controls.
45 46
@@ -65,6 +66,7 @@ Brief summary of control files.
65 memory.stat # show various statistics 66 memory.stat # show various statistics
66 memory.use_hierarchy # set/show hierarchical account enabled 67 memory.use_hierarchy # set/show hierarchical account enabled
67 memory.force_empty # trigger forced move charge to parent 68 memory.force_empty # trigger forced move charge to parent
69 memory.pressure_level # set memory pressure notifications
68 memory.swappiness # set/show swappiness parameter of vmscan 70 memory.swappiness # set/show swappiness parameter of vmscan
69 (See sysctl's vm.swappiness) 71 (See sysctl's vm.swappiness)
70 memory.move_charge_at_immigrate # set/show controls of moving charges 72 memory.move_charge_at_immigrate # set/show controls of moving charges
@@ -762,7 +764,73 @@ At reading, current status of OOM is shown.
762 under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may 764 under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may
763 be stopped.) 765 be stopped.)
764 766
76511. TODO 76711. Memory Pressure
768
769The pressure level notifications can be used to monitor the memory
770allocation cost; based on the pressure, applications can implement
771different strategies of managing their memory resources. The pressure
772levels are defined as follows:
773
774The "low" level means that the system is reclaiming memory for new
775allocations. Monitoring this reclaiming activity might be useful for
776maintaining cache level. Upon notification, the program (typically
777"Activity Manager") might analyze vmstat and act in advance (e.g.
778prematurely shut down unimportant services).
779
780The "medium" level means that the system is experiencing medium memory
781pressure; the system might be swapping, paging out active file caches,
782etc. Upon this event applications may decide to further analyze
783vmstat/zoneinfo/memcg or internal memory usage statistics and free any
784resources that can be easily reconstructed or re-read from a disk.
785
786The "critical" level means that the system is actively thrashing, it is
787about to run out of memory (OOM) or even the in-kernel OOM killer is on its
788way to trigger. Applications should do whatever they can to help the
789system. It might be too late to consult with vmstat or any other
790statistics, so it's advisable to take an immediate action.
791
792The events are propagated upward until the event is handled, i.e. the
793events are not pass-through. Here is what this means: for example you have
794three cgroups: A->B->C. Now you set up an event listener on cgroups A, B
795and C, and suppose group C experiences some pressure. In this situation,
796only group C will receive the notification, i.e. groups A and B will not
797receive it. This is done to avoid excessive "broadcasting" of messages,
798which disturbs the system and which is especially bad if we are low on
799memory or thrashing. So, organize the cgroups wisely, or propagate the
800events manually (or, ask us to implement the pass-through events,
801explaining why you would need them.)
802
803The file memory.pressure_level is only used to setup an eventfd. To
804register a notification, an application must:
805
806- create an eventfd using eventfd(2);
807- open memory.pressure_level;
808- write a string like "<event_fd> <fd of memory.pressure_level> <level>"
809 to cgroup.event_control.
810
811Application will be notified through eventfd when memory pressure is at
812the specific level (or higher). Read/write operations to
813memory.pressure_level are not implemented.
814
815Test:
816
817 Here is a small script example that makes a new cgroup, sets up a
818 memory limit, sets up a notification in the cgroup and then makes child
819 cgroup experience a critical pressure:
820
821 # cd /sys/fs/cgroup/memory/
822 # mkdir foo
823 # cd foo
824 # cgroup_event_listener memory.pressure_level low &
825 # echo 8000000 > memory.limit_in_bytes
826 # echo 8000000 > memory.memsw.limit_in_bytes
827 # echo $$ > tasks
828 # dd if=/dev/zero | read x
829
830 (Expect a bunch of notifications, and eventually, the oom-killer will
831 trigger.)
832
83312. TODO
766 834
7671. Add support for accounting huge pages (as a separate controller) 8351. Add support for accounting huge pages (as a separate controller)
7682. Make per-cgroup scanner reclaim not-shared pages first 8362. Make per-cgroup scanner reclaim not-shared pages first
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
new file mode 100644
index 000000000000..76be077340ea
--- /dev/null
+++ b/include/linux/vmpressure.h
@@ -0,0 +1,47 @@
1#ifndef __LINUX_VMPRESSURE_H
2#define __LINUX_VMPRESSURE_H
3
4#include <linux/mutex.h>
5#include <linux/list.h>
6#include <linux/workqueue.h>
7#include <linux/gfp.h>
8#include <linux/types.h>
9#include <linux/cgroup.h>
10
11struct vmpressure {
12 unsigned long scanned;
13 unsigned long reclaimed;
14 /* The lock is used to keep the scanned/reclaimed above in sync. */
15 struct mutex sr_lock;
16
17 /* The list of vmpressure_event structs. */
18 struct list_head events;
19 /* Have to grab the lock on events traversal or modifications. */
20 struct mutex events_lock;
21
22 struct work_struct work;
23};
24
25struct mem_cgroup;
26
27#ifdef CONFIG_MEMCG
28extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
29 unsigned long scanned, unsigned long reclaimed);
30extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
31
32extern void vmpressure_init(struct vmpressure *vmpr);
33extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
34extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
35extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css);
36extern int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
37 struct eventfd_ctx *eventfd,
38 const char *args);
39extern void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
40 struct eventfd_ctx *eventfd);
41#else
42static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
43 unsigned long scanned, unsigned long reclaimed) {}
44static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
45 int prio) {}
46#endif /* CONFIG_MEMCG */
47#endif /* __LINUX_VMPRESSURE_H */
diff --git a/mm/Makefile b/mm/Makefile
index 3a4628751f89..72c5acb9345f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -50,7 +50,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
50obj-$(CONFIG_MIGRATION) += migrate.o 50obj-$(CONFIG_MIGRATION) += migrate.o
51obj-$(CONFIG_QUICKLIST) += quicklist.o 51obj-$(CONFIG_QUICKLIST) += quicklist.o
52obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o 52obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
53obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o 53obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
54obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o 54obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
55obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 55obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
56obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 56obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7e5bc43c2d1f..360464f40e96 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -49,6 +49,7 @@
49#include <linux/fs.h> 49#include <linux/fs.h>
50#include <linux/seq_file.h> 50#include <linux/seq_file.h>
51#include <linux/vmalloc.h> 51#include <linux/vmalloc.h>
52#include <linux/vmpressure.h>
52#include <linux/mm_inline.h> 53#include <linux/mm_inline.h>
53#include <linux/page_cgroup.h> 54#include <linux/page_cgroup.h>
54#include <linux/cpu.h> 55#include <linux/cpu.h>
@@ -261,6 +262,9 @@ struct mem_cgroup {
261 */ 262 */
262 struct res_counter res; 263 struct res_counter res;
263 264
265 /* vmpressure notifications */
266 struct vmpressure vmpressure;
267
264 union { 268 union {
265 /* 269 /*
266 * the counter to account for mem+swap usage. 270 * the counter to account for mem+swap usage.
@@ -359,6 +363,7 @@ struct mem_cgroup {
359 atomic_t numainfo_events; 363 atomic_t numainfo_events;
360 atomic_t numainfo_updating; 364 atomic_t numainfo_updating;
361#endif 365#endif
366
362 /* 367 /*
363 * Per cgroup active and inactive list, similar to the 368 * Per cgroup active and inactive list, similar to the
364 * per zone LRU lists. 369 * per zone LRU lists.
@@ -510,6 +515,24 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
510 return container_of(s, struct mem_cgroup, css); 515 return container_of(s, struct mem_cgroup, css);
511} 516}
512 517
518/* Some nice accessors for the vmpressure. */
519struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
520{
521 if (!memcg)
522 memcg = root_mem_cgroup;
523 return &memcg->vmpressure;
524}
525
526struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
527{
528 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
529}
530
531struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
532{
533 return &mem_cgroup_from_css(css)->vmpressure;
534}
535
513static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 536static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
514{ 537{
515 return (memcg == root_mem_cgroup); 538 return (memcg == root_mem_cgroup);
@@ -5907,6 +5930,11 @@ static struct cftype mem_cgroup_files[] = {
5907 .unregister_event = mem_cgroup_oom_unregister_event, 5930 .unregister_event = mem_cgroup_oom_unregister_event,
5908 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 5931 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
5909 }, 5932 },
5933 {
5934 .name = "pressure_level",
5935 .register_event = vmpressure_register_event,
5936 .unregister_event = vmpressure_unregister_event,
5937 },
5910#ifdef CONFIG_NUMA 5938#ifdef CONFIG_NUMA
5911 { 5939 {
5912 .name = "numa_stat", 5940 .name = "numa_stat",
@@ -6188,6 +6216,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6188 memcg->move_charge_at_immigrate = 0; 6216 memcg->move_charge_at_immigrate = 0;
6189 mutex_init(&memcg->thresholds_lock); 6217 mutex_init(&memcg->thresholds_lock);
6190 spin_lock_init(&memcg->move_lock); 6218 spin_lock_init(&memcg->move_lock);
6219 vmpressure_init(&memcg->vmpressure);
6191 6220
6192 return &memcg->css; 6221 return &memcg->css;
6193 6222
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
new file mode 100644
index 000000000000..736a6011c2c8
--- /dev/null
+++ b/mm/vmpressure.c
@@ -0,0 +1,374 @@
1/*
2 * Linux VM pressure
3 *
4 * Copyright 2012 Linaro Ltd.
5 * Anton Vorontsov <anton.vorontsov@linaro.org>
6 *
7 * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
8 * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
9 *
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License version 2 as published
12 * by the Free Software Foundation.
13 */
14
15#include <linux/cgroup.h>
16#include <linux/fs.h>
17#include <linux/log2.h>
18#include <linux/sched.h>
19#include <linux/mm.h>
20#include <linux/vmstat.h>
21#include <linux/eventfd.h>
22#include <linux/swap.h>
23#include <linux/printk.h>
24#include <linux/vmpressure.h>
25
26/*
27 * The window size (vmpressure_win) is the number of scanned pages before
28 * we try to analyze scanned/reclaimed ratio. So the window is used as a
29 * rate-limit tunable for the "low" level notification, and also for
30 * averaging the ratio for medium/critical levels. Using small window
31 * sizes can cause lot of false positives, but too big window size will
32 * delay the notifications.
33 *
34 * As the vmscan reclaimer logic works with chunks which are multiple of
35 * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
36 *
37 * TODO: Make the window size depend on machine size, as we do for vmstat
38 * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
39 */
40static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
41
42/*
43 * These thresholds are used when we account memory pressure through
44 * scanned/reclaimed ratio. The current values were chosen empirically. In
45 * essence, they are percents: the higher the value, the more
46 * unsuccessful reclaims there were.
47 */
48static const unsigned int vmpressure_level_med = 60;
49static const unsigned int vmpressure_level_critical = 95;
50
51/*
52 * When there are too few pages left to scan, vmpressure() may miss the
53 * critical pressure as number of pages will be less than "window size".
54 * However, in that case the vmscan priority will rise fast as the
55 * reclaimer will try to scan LRUs more deeply.
56 *
57 * The vmscan logic considers these special priorities:
58 *
59 * prio == DEF_PRIORITY (12): reclaimer starts with that value
60 * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
61 * prio == 0 : close to OOM, kernel scans every page in an lru
62 *
63 * Any value in this range is acceptable for this tunable (i.e. from 12 to
64 * 0). Current value for the vmpressure_level_critical_prio is chosen
65 * empirically, but the number, in essence, means that we consider
66 * critical level when scanning depth is ~10% of the lru size (vmscan
67 * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
68 * eighth).
69 */
70static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
71
72static struct vmpressure *work_to_vmpressure(struct work_struct *work)
73{
74 return container_of(work, struct vmpressure, work);
75}
76
77static struct vmpressure *cg_to_vmpressure(struct cgroup *cg)
78{
79 return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id));
80}
81
82static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
83{
84 struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup;
85 struct mem_cgroup *memcg = mem_cgroup_from_cont(cg);
86
87 memcg = parent_mem_cgroup(memcg);
88 if (!memcg)
89 return NULL;
90 return memcg_to_vmpressure(memcg);
91}
92
93enum vmpressure_levels {
94 VMPRESSURE_LOW = 0,
95 VMPRESSURE_MEDIUM,
96 VMPRESSURE_CRITICAL,
97 VMPRESSURE_NUM_LEVELS,
98};
99
100static const char * const vmpressure_str_levels[] = {
101 [VMPRESSURE_LOW] = "low",
102 [VMPRESSURE_MEDIUM] = "medium",
103 [VMPRESSURE_CRITICAL] = "critical",
104};
105
106static enum vmpressure_levels vmpressure_level(unsigned long pressure)
107{
108 if (pressure >= vmpressure_level_critical)
109 return VMPRESSURE_CRITICAL;
110 else if (pressure >= vmpressure_level_med)
111 return VMPRESSURE_MEDIUM;
112 return VMPRESSURE_LOW;
113}
114
115static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
116 unsigned long reclaimed)
117{
118 unsigned long scale = scanned + reclaimed;
119 unsigned long pressure;
120
121 /*
122 * We calculate the ratio (in percents) of how many pages were
123 * scanned vs. reclaimed in a given time frame (window). Note that
124 * time is in VM reclaimer's "ticks", i.e. number of pages
125 * scanned. This makes it possible to set desired reaction time
126 * and serves as a ratelimit.
127 */
128 pressure = scale - (reclaimed * scale / scanned);
129 pressure = pressure * 100 / scale;
130
131 pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
132 scanned, reclaimed);
133
134 return vmpressure_level(pressure);
135}
136
137struct vmpressure_event {
138 struct eventfd_ctx *efd;
139 enum vmpressure_levels level;
140 struct list_head node;
141};
142
143static bool vmpressure_event(struct vmpressure *vmpr,
144 unsigned long scanned, unsigned long reclaimed)
145{
146 struct vmpressure_event *ev;
147 enum vmpressure_levels level;
148 bool signalled = false;
149
150 level = vmpressure_calc_level(scanned, reclaimed);
151
152 mutex_lock(&vmpr->events_lock);
153
154 list_for_each_entry(ev, &vmpr->events, node) {
155 if (level >= ev->level) {
156 eventfd_signal(ev->efd, 1);
157 signalled = true;
158 }
159 }
160
161 mutex_unlock(&vmpr->events_lock);
162
163 return signalled;
164}
165
166static void vmpressure_work_fn(struct work_struct *work)
167{
168 struct vmpressure *vmpr = work_to_vmpressure(work);
169 unsigned long scanned;
170 unsigned long reclaimed;
171
172 /*
173 * Several contexts might be calling vmpressure(), so it is
174 * possible that the work was rescheduled again before the old
175 * work context cleared the counters. In that case we will run
176 * just after the old work returns, but then scanned might be zero
177 * here. No need for any locks here since we don't care if
178 * vmpr->reclaimed is in sync.
179 */
180 if (!vmpr->scanned)
181 return;
182
183 mutex_lock(&vmpr->sr_lock);
184 scanned = vmpr->scanned;
185 reclaimed = vmpr->reclaimed;
186 vmpr->scanned = 0;
187 vmpr->reclaimed = 0;
188 mutex_unlock(&vmpr->sr_lock);
189
190 do {
191 if (vmpressure_event(vmpr, scanned, reclaimed))
192 break;
193 /*
194 * If not handled, propagate the event upward into the
195 * hierarchy.
196 */
197 } while ((vmpr = vmpressure_parent(vmpr)));
198}
199
200/**
201 * vmpressure() - Account memory pressure through scanned/reclaimed ratio
202 * @gfp: reclaimer's gfp mask
203 * @memcg: cgroup memory controller handle
204 * @scanned: number of pages scanned
205 * @reclaimed: number of pages reclaimed
206 *
207 * This function should be called from the vmscan reclaim path to account
208 * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
209 * pressure index is then further refined and averaged over time.
210 *
211 * This function does not return any value.
212 */
213void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
214 unsigned long scanned, unsigned long reclaimed)
215{
216 struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
217
218 /*
219 * Here we only want to account pressure that userland is able to
220 * help us with. For example, suppose that DMA zone is under
221 * pressure; if we notify userland about that kind of pressure,
222 * then it will be mostly a waste as it will trigger unnecessary
223 * freeing of memory by userland (since userland is more likely to
224 * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
225 * is why we include only movable, highmem and FS/IO pages.
226 * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
227 * we account it too.
228 */
229 if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
230 return;
231
232 /*
233 * If we got here with no pages scanned, then that is an indicator
234 * that reclaimer was unable to find any shrinkable LRUs at the
235 * current scanning depth. But it does not mean that we should
236 * report the critical pressure, yet. If the scanning priority
237 * (scanning depth) goes too high (deep), we will be notified
238 * through vmpressure_prio(). But so far, keep calm.
239 */
240 if (!scanned)
241 return;
242
243 mutex_lock(&vmpr->sr_lock);
244 vmpr->scanned += scanned;
245 vmpr->reclaimed += reclaimed;
246 scanned = vmpr->scanned;
247 mutex_unlock(&vmpr->sr_lock);
248
249 if (scanned < vmpressure_win || work_pending(&vmpr->work))
250 return;
251 schedule_work(&vmpr->work);
252}
253
254/**
255 * vmpressure_prio() - Account memory pressure through reclaimer priority level
256 * @gfp: reclaimer's gfp mask
257 * @memcg: cgroup memory controller handle
258 * @prio: reclaimer's priority
259 *
260 * This function should be called from the reclaim path every time when
261 * the vmscan's reclaiming priority (scanning depth) changes.
262 *
263 * This function does not return any value.
264 */
265void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
266{
267 /*
268 * We only use prio for accounting critical level. For more info
269 * see comment for vmpressure_level_critical_prio variable above.
270 */
271 if (prio > vmpressure_level_critical_prio)
272 return;
273
274 /*
275 * OK, the prio is below the threshold, updating vmpressure
276 * information before shrinker dives into long shrinking of long
277 * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0
278 * to the vmpressure() basically means that we signal 'critical'
279 * level.
280 */
281 vmpressure(gfp, memcg, vmpressure_win, 0);
282}
283
284/**
285 * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
286 * @cg: cgroup that is interested in vmpressure notifications
287 * @cft: cgroup control files handle
288 * @eventfd: eventfd context to link notifications with
289 * @args: event arguments (used to set up a pressure level threshold)
290 *
291 * This function associates eventfd context with the vmpressure
292 * infrastructure, so that the notifications will be delivered to the
293 * @eventfd. The @args parameter is a string that denotes pressure level
294 * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
295 * "critical").
296 *
297 * This function should not be used directly, just pass it to (struct
298 * cftype).register_event, and then cgroup core will handle everything by
299 * itself.
300 */
301int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
302 struct eventfd_ctx *eventfd, const char *args)
303{
304 struct vmpressure *vmpr = cg_to_vmpressure(cg);
305 struct vmpressure_event *ev;
306 int level;
307
308 for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
309 if (!strcmp(vmpressure_str_levels[level], args))
310 break;
311 }
312
313 if (level >= VMPRESSURE_NUM_LEVELS)
314 return -EINVAL;
315
316 ev = kzalloc(sizeof(*ev), GFP_KERNEL);
317 if (!ev)
318 return -ENOMEM;
319
320 ev->efd = eventfd;
321 ev->level = level;
322
323 mutex_lock(&vmpr->events_lock);
324 list_add(&ev->node, &vmpr->events);
325 mutex_unlock(&vmpr->events_lock);
326
327 return 0;
328}
329
330/**
331 * vmpressure_unregister_event() - Unbind eventfd from vmpressure
332 * @cg: cgroup handle
333 * @cft: cgroup control files handle
334 * @eventfd: eventfd context that was used to link vmpressure with the @cg
335 *
336 * This function does internal manipulations to detach the @eventfd from
337 * the vmpressure notifications, and then frees internal resources
338 * associated with the @eventfd (but the @eventfd itself is not freed).
339 *
340 * This function should not be used directly, just pass it to (struct
341 * cftype).unregister_event, and then cgroup core will handle everything
342 * by itself.
343 */
344void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
345 struct eventfd_ctx *eventfd)
346{
347 struct vmpressure *vmpr = cg_to_vmpressure(cg);
348 struct vmpressure_event *ev;
349
350 mutex_lock(&vmpr->events_lock);
351 list_for_each_entry(ev, &vmpr->events, node) {
352 if (ev->efd != eventfd)
353 continue;
354 list_del(&ev->node);
355 kfree(ev);
356 break;
357 }
358 mutex_unlock(&vmpr->events_lock);
359}
360
361/**
362 * vmpressure_init() - Initialize vmpressure control structure
363 * @vmpr: Structure to be initialized
364 *
365 * This function should be called on every allocated vmpressure structure
366 * before any usage.
367 */
368void vmpressure_init(struct vmpressure *vmpr)
369{
370 mutex_init(&vmpr->sr_lock);
371 mutex_init(&vmpr->events_lock);
372 INIT_LIST_HEAD(&vmpr->events);
373 INIT_WORK(&vmpr->work, vmpressure_work_fn);
374}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e03a00b09da9..e53e49584cf3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/highmem.h> 21#include <linux/highmem.h>
22#include <linux/vmpressure.h>
22#include <linux/vmstat.h> 23#include <linux/vmstat.h>
23#include <linux/file.h> 24#include <linux/file.h>
24#include <linux/writeback.h> 25#include <linux/writeback.h>
@@ -1982,6 +1983,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
1982 } 1983 }
1983 memcg = mem_cgroup_iter(root, memcg, &reclaim); 1984 memcg = mem_cgroup_iter(root, memcg, &reclaim);
1984 } while (memcg); 1985 } while (memcg);
1986
1987 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
1988 sc->nr_scanned - nr_scanned,
1989 sc->nr_reclaimed - nr_reclaimed);
1990
1985 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, 1991 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
1986 sc->nr_scanned - nr_scanned, sc)); 1992 sc->nr_scanned - nr_scanned, sc));
1987} 1993}
@@ -2167,6 +2173,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2167 count_vm_event(ALLOCSTALL); 2173 count_vm_event(ALLOCSTALL);
2168 2174
2169 do { 2175 do {
2176 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
2177 sc->priority);
2170 sc->nr_scanned = 0; 2178 sc->nr_scanned = 0;
2171 aborted_reclaim = shrink_zones(zonelist, sc); 2179 aborted_reclaim = shrink_zones(zonelist, sc);
2172 2180