-rw-r--r--   Documentation/cgroups/memory.txt |  70
-rw-r--r--   include/linux/vmpressure.h       |  47
-rw-r--r--   mm/Makefile                      |   2
-rw-r--r--   mm/memcontrol.c                  |  29
-rw-r--r--   mm/vmpressure.c                  | 374
-rw-r--r--   mm/vmscan.c                      |   8
6 files changed, 528 insertions(+), 2 deletions(-)
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 8b8c28b9864c..f336ede58e62 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -40,6 +40,7 @@ Features:
 - soft limit
 - moving (recharging) account at moving a task is selectable.
 - usage threshold notifier
+- memory pressure notifier
 - oom-killer disable knob and oom-notifier
 - Root cgroup has no limit controls.
 
@@ -65,6 +66,7 @@ Brief summary of control files.
 memory.stat			# show various statistics
 memory.use_hierarchy		# set/show hierarchical account enabled
 memory.force_empty		# trigger forced move charge to parent
+memory.pressure_level		# set memory pressure notifications
 memory.swappiness		# set/show swappiness parameter of vmscan
				(See sysctl's vm.swappiness)
 memory.move_charge_at_immigrate # set/show controls of moving charges
@@ -762,7 +764,73 @@ At reading, current status of OOM is shown.
 under_oom	 0 or 1 (if 1, the memory cgroup is under OOM, tasks may
 		 be stopped.)
 
-11. TODO
+11. Memory Pressure
+
+The pressure level notifications can be used to monitor the memory
+allocation cost; based on the pressure, applications can implement
+different strategies of managing their memory resources. The pressure
+levels are defined as follows:
+
+The "low" level means that the system is reclaiming memory for new
+allocations. Monitoring this reclaiming activity might be useful for
+maintaining the cache level. Upon notification, the program (typically
+"Activity Manager") might analyze vmstat and act in advance (e.g.
+prematurely shut down unimportant services).
+
+The "medium" level means that the system is experiencing medium memory
+pressure: the system might be swapping, paging out active file caches,
+etc. Upon this event, applications may decide to further analyze
+vmstat/zoneinfo/memcg or internal memory usage statistics and free any
+resources that can be easily reconstructed or re-read from disk.
+
+The "critical" level means that the system is actively thrashing; it is
+about to run out of memory (OOM), or the in-kernel OOM killer may even
+be about to trigger. Applications should do whatever they can to help
+the system. It might be too late to consult vmstat or any other
+statistics, so it is advisable to take immediate action.
+
+The events are propagated upward until the event is handled, i.e. the
+events are not pass-through. Here is what this means: for example, you
+have three cgroups, A->B->C. Now you set up an event listener on cgroups
+A, B and C, and suppose group C experiences some pressure. In this
+situation, only group C will receive the notification, i.e. groups A and
+B will not receive it. This is done to avoid excessive "broadcasting" of
+messages, which disturbs the system and which is especially bad if we
+are low on memory or thrashing. So, organize the cgroups wisely, or
+propagate the events manually (or ask us to implement pass-through
+events, explaining why you would need them).
+
+The file memory.pressure_level is only used to set up an eventfd. To
+register a notification, an application must:
+
+- create an eventfd using eventfd(2);
+- open memory.pressure_level;
+- write a string like "<event_fd> <fd of memory.pressure_level> <level>"
+  to cgroup.event_control.
+
+The application will then be notified through the eventfd when memory
+pressure is at the specified level (or higher). Read/write operations on
+memory.pressure_level are not implemented. (A minimal example program
+illustrating this sequence follows this hunk.)
+
+Test:
+
+Here is a small script example that makes a new cgroup, sets up a
+memory limit, sets up a notification in the cgroup, and then makes the
+child cgroup experience critical pressure:
+
+# cd /sys/fs/cgroup/memory/
+# mkdir foo
+# cd foo
+# cgroup_event_listener memory.pressure_level low &
+# echo 8000000 > memory.limit_in_bytes
+# echo 8000000 > memory.memsw.limit_in_bytes
+# echo $$ > tasks
+# dd if=/dev/zero | read x
+
+(Expect a bunch of notifications, and eventually the oom-killer will
+trigger.)
+
+12. TODO
 
 1. Add support for accounting huge pages (as a separate controller)
 2. Make per-cgroup scanner reclaim not-shared pages first
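
For reference, here is a minimal userspace sketch of the registration
sequence documented above. It is illustrative only: the cgroup path
/sys/fs/cgroup/memory/foo mirrors the test script in the hunk, and error
handling is omitted for brevity.

/* Hedged sketch: register for "low" vmpressure events on an assumed
 * cgroup at /sys/fs/cgroup/memory/foo, then block on the eventfd. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	char buf[64];
	uint64_t cnt;
	int efd = eventfd(0, 0);		/* step 1: create an eventfd */
	int pfd = open("/sys/fs/cgroup/memory/foo/memory.pressure_level",
		       O_RDONLY);		/* step 2: open pressure_level */
	int cfd = open("/sys/fs/cgroup/memory/foo/cgroup.event_control",
		       O_WRONLY);

	/* step 3: "<event_fd> <fd of memory.pressure_level> <level>" */
	snprintf(buf, sizeof(buf), "%d %d low", efd, pfd);
	write(cfd, buf, strlen(buf));

	/* Each read blocks until at least one notification arrives. */
	while (read(efd, &cnt, sizeof(cnt)) == sizeof(cnt))
		printf("low (or higher) memory pressure event\n");
	return 0;
}

Note that the process needs sufficient privilege to open the cgroup
control files, and reads on the eventfd block until vmpressure signals
the registered level or higher.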
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
new file mode 100644
index 000000000000..76be077340ea
--- /dev/null
+++ b/include/linux/vmpressure.h
@@ -0,0 +1,47 @@
+#ifndef __LINUX_VMPRESSURE_H
+#define __LINUX_VMPRESSURE_H
+
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/cgroup.h>
+
+struct vmpressure {
+	unsigned long scanned;
+	unsigned long reclaimed;
+	/* The lock is used to keep the scanned/reclaimed above in sync. */
+	struct mutex sr_lock;
+
+	/* The list of vmpressure_event structs. */
+	struct list_head events;
+	/* Have to grab the lock on events traversal or modifications. */
+	struct mutex events_lock;
+
+	struct work_struct work;
+};
+
+struct mem_cgroup;
+
+#ifdef CONFIG_MEMCG
+extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+		       unsigned long scanned, unsigned long reclaimed);
+extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
+
+extern void vmpressure_init(struct vmpressure *vmpr);
+extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
+extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
+extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css);
+extern int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
+				     struct eventfd_ctx *eventfd,
+				     const char *args);
+extern void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
+					struct eventfd_ctx *eventfd);
+#else
+static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+			      unsigned long scanned, unsigned long reclaimed) {}
+static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
+				   int prio) {}
+#endif /* CONFIG_MEMCG */
+#endif /* __LINUX_VMPRESSURE_H */
diff --git a/mm/Makefile b/mm/Makefile
index 3a4628751f89..72c5acb9345f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -50,7 +50,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
-obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7e5bc43c2d1f..360464f40e96 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -49,6 +49,7 @@
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/vmalloc.h>
+#include <linux/vmpressure.h>
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
@@ -261,6 +262,9 @@ struct mem_cgroup {
 	 */
 	struct res_counter res;
 
+	/* vmpressure notifications */
+	struct vmpressure vmpressure;
+
 	union {
 		/*
 		 * the counter to account for mem+swap usage.
@@ -359,6 +363,7 @@ struct mem_cgroup {
 	atomic_t	numainfo_events;
 	atomic_t	numainfo_updating;
 #endif
+
 	/*
 	 * Per cgroup active and inactive list, similar to the
 	 * per zone LRU lists.
@@ -510,6 +515,24 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
 	return container_of(s, struct mem_cgroup, css);
 }
 
+/* Some nice accessors for the vmpressure. */
+struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
+{
+	if (!memcg)
+		memcg = root_mem_cgroup;
+	return &memcg->vmpressure;
+}
+
+struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
+{
+	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
+}
+
+struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
+{
+	return &mem_cgroup_from_css(css)->vmpressure;
+}
+
 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 {
 	return (memcg == root_mem_cgroup);
@@ -5907,6 +5930,11 @@ static struct cftype mem_cgroup_files[] = {
 		.unregister_event = mem_cgroup_oom_unregister_event,
 		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
 	},
+	{
+		.name = "pressure_level",
+		.register_event = vmpressure_register_event,
+		.unregister_event = vmpressure_unregister_event,
+	},
 #ifdef CONFIG_NUMA
 	{
 		.name = "numa_stat",
@@ -6188,6 +6216,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 	memcg->move_charge_at_immigrate = 0;
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
+	vmpressure_init(&memcg->vmpressure);
 
 	return &memcg->css;
 
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
new file mode 100644
index 000000000000..736a6011c2c8
--- /dev/null
+++ b/mm/vmpressure.c
@@ -0,0 +1,374 @@
+/*
+ * Linux VM pressure
+ *
+ * Copyright 2012 Linaro Ltd.
+ *		  Anton Vorontsov <anton.vorontsov@linaro.org>
+ *
+ * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
+ * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/log2.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/eventfd.h>
+#include <linux/swap.h>
+#include <linux/printk.h>
+#include <linux/vmpressure.h>
+
+/*
+ * The window size (vmpressure_win) is the number of scanned pages before
+ * we try to analyze the scanned/reclaimed ratio. So the window is used as
+ * a rate-limit tunable for the "low" level notification, and also for
+ * averaging the ratio for the medium/critical levels. A small window
+ * size can cause a lot of false positives, but a too-big window size
+ * will delay the notifications.
+ *
+ * As the vmscan reclaimer logic works with chunks which are multiples of
+ * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
+ *
+ * TODO: Make the window size depend on machine size, as we do for vmstat
+ * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
+ */
+static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
+
+/*
+ * These thresholds are used when we account memory pressure through the
+ * scanned/reclaimed ratio. The current values were chosen empirically. In
+ * essence, they are percents: the higher the value, the more
+ * unsuccessful reclaims there were.
+ */
+static const unsigned int vmpressure_level_med = 60;
+static const unsigned int vmpressure_level_critical = 95;
+
+/*
+ * When there are too few pages left to scan, vmpressure() may miss the
+ * critical pressure as the number of pages will be less than the "window
+ * size". However, in that case the vmscan priority will rise fast as the
+ * reclaimer will try to scan the LRUs more deeply.
+ *
+ * The vmscan logic considers these special priorities:
+ *
+ * prio == DEF_PRIORITY (12): reclaimer starts with that value
+ * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
+ * prio == 0                : close to OOM, kernel scans every page in an lru
+ *
+ * Any value in this range is acceptable for this tunable (i.e. from 12 to
+ * 0). The current value for vmpressure_level_critical_prio was chosen
+ * empirically, but the number, in essence, means that we consider the
+ * level critical when the scanning depth is ~10% of the lru size (vmscan
+ * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
+ * eighth).
+ */
+static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
+
+static struct vmpressure *work_to_vmpressure(struct work_struct *work)
+{
+	return container_of(work, struct vmpressure, work);
+}
+
+static struct vmpressure *cg_to_vmpressure(struct cgroup *cg)
+{
+	return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id));
+}
+
+static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
+{
+	struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup;
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cg);
+
+	memcg = parent_mem_cgroup(memcg);
+	if (!memcg)
+		return NULL;
+	return memcg_to_vmpressure(memcg);
+}
+
+enum vmpressure_levels {
+	VMPRESSURE_LOW = 0,
+	VMPRESSURE_MEDIUM,
+	VMPRESSURE_CRITICAL,
+	VMPRESSURE_NUM_LEVELS,
+};
+
+static const char * const vmpressure_str_levels[] = {
+	[VMPRESSURE_LOW] = "low",
+	[VMPRESSURE_MEDIUM] = "medium",
+	[VMPRESSURE_CRITICAL] = "critical",
+};
+
+static enum vmpressure_levels vmpressure_level(unsigned long pressure)
+{
+	if (pressure >= vmpressure_level_critical)
+		return VMPRESSURE_CRITICAL;
+	else if (pressure >= vmpressure_level_med)
+		return VMPRESSURE_MEDIUM;
+	return VMPRESSURE_LOW;
+}
+
+static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
+						    unsigned long reclaimed)
+{
+	unsigned long scale = scanned + reclaimed;
+	unsigned long pressure;
+
+	/*
+	 * We calculate the ratio (in percents) of how many pages were
+	 * scanned vs. reclaimed in a given time frame (window). Note that
+	 * time is in VM reclaimer's "ticks", i.e. number of pages
+	 * scanned. This makes it possible to set desired reaction time
+	 * and serves as a ratelimit.
+	 */
+	pressure = scale - (reclaimed * scale / scanned);
+	pressure = pressure * 100 / scale;
+
+	pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
+		 scanned, reclaimed);
+
+	return vmpressure_level(pressure);
+}
+
+struct vmpressure_event {
+	struct eventfd_ctx *efd;
+	enum vmpressure_levels level;
+	struct list_head node;
+};
+
+static bool vmpressure_event(struct vmpressure *vmpr,
+			     unsigned long scanned, unsigned long reclaimed)
+{
+	struct vmpressure_event *ev;
+	enum vmpressure_levels level;
+	bool signalled = false;
+
+	level = vmpressure_calc_level(scanned, reclaimed);
+
+	mutex_lock(&vmpr->events_lock);
+
+	list_for_each_entry(ev, &vmpr->events, node) {
+		if (level >= ev->level) {
+			eventfd_signal(ev->efd, 1);
+			signalled = true;
+		}
+	}
+
+	mutex_unlock(&vmpr->events_lock);
+
+	return signalled;
+}
+
+static void vmpressure_work_fn(struct work_struct *work)
+{
+	struct vmpressure *vmpr = work_to_vmpressure(work);
+	unsigned long scanned;
+	unsigned long reclaimed;
+
+	/*
+	 * Several contexts might be calling vmpressure(), so it is
+	 * possible that the work was rescheduled again before the old
+	 * work context cleared the counters. In that case we will run
+	 * just after the old work returns, but then scanned might be zero
+	 * here. No need for any locks here since we don't care if
+	 * vmpr->reclaimed is in sync.
+	 */
+	if (!vmpr->scanned)
+		return;
+
+	mutex_lock(&vmpr->sr_lock);
+	scanned = vmpr->scanned;
+	reclaimed = vmpr->reclaimed;
+	vmpr->scanned = 0;
+	vmpr->reclaimed = 0;
+	mutex_unlock(&vmpr->sr_lock);
+
+	do {
+		if (vmpressure_event(vmpr, scanned, reclaimed))
+			break;
+		/*
+		 * If not handled, propagate the event upward into the
+		 * hierarchy.
+		 */
+	} while ((vmpr = vmpressure_parent(vmpr)));
+}
+
+/**
+ * vmpressure() - Account memory pressure through scanned/reclaimed ratio
+ * @gfp:	reclaimer's gfp mask
+ * @memcg:	cgroup memory controller handle
+ * @scanned:	number of pages scanned
+ * @reclaimed:	number of pages reclaimed
+ *
+ * This function should be called from the vmscan reclaim path to account
+ * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
+ * pressure index is then further refined and averaged over time.
+ *
+ * This function does not return any value.
+ */
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+		unsigned long scanned, unsigned long reclaimed)
+{
+	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+
+	/*
+	 * Here we only want to account pressure that userland is able to
+	 * help us with. For example, suppose that the DMA zone is under
+	 * pressure; if we notify userland about that kind of pressure,
+	 * then it will be mostly a waste as it will trigger unnecessary
+	 * freeing of memory by userland (since userland is more likely to
+	 * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
+	 * is why we include only movable, highmem and FS/IO pages.
+	 * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
+	 * we account it too.
+	 */
+	if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
+		return;
+
+	/*
+	 * If we got here with no pages scanned, then that is an indicator
+	 * that the reclaimer was unable to find any shrinkable LRUs at the
+	 * current scanning depth. But it does not mean that we should
+	 * report the critical pressure, yet. If the scanning priority
+	 * (scanning depth) goes too high (deep), we will be notified
+	 * through vmpressure_prio(). But so far, keep calm.
+	 */
+	if (!scanned)
+		return;
+
+	mutex_lock(&vmpr->sr_lock);
+	vmpr->scanned += scanned;
+	vmpr->reclaimed += reclaimed;
+	scanned = vmpr->scanned;
+	mutex_unlock(&vmpr->sr_lock);
+
+	if (scanned < vmpressure_win || work_pending(&vmpr->work))
+		return;
+	schedule_work(&vmpr->work);
+}
+
+/**
+ * vmpressure_prio() - Account memory pressure through reclaimer priority level
+ * @gfp:	reclaimer's gfp mask
+ * @memcg:	cgroup memory controller handle
+ * @prio:	reclaimer's priority
+ *
+ * This function should be called from the reclaim path every time the
+ * vmscan reclaiming priority (scanning depth) changes.
+ *
+ * This function does not return any value.
+ */
+void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
+{
+	/*
+	 * We only use prio for accounting the critical level. For more
+	 * info see the comment for the vmpressure_level_critical_prio
+	 * variable above.
+	 */
+	if (prio > vmpressure_level_critical_prio)
+		return;
+
+	/*
+	 * OK, the prio is below the threshold, so update the vmpressure
+	 * information before the shrinker dives into a long shrinking
+	 * pass. Passing scanned = vmpressure_win, reclaimed = 0 to
+	 * vmpressure() basically means that we signal the 'critical'
+	 * level.
+	 */
+	vmpressure(gfp, memcg, vmpressure_win, 0);
+}
+
+/**
+ * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
+ * @cg:		cgroup that is interested in vmpressure notifications
+ * @cft:	cgroup control files handle
+ * @eventfd:	eventfd context to link notifications with
+ * @args:	event arguments (used to set up a pressure level threshold)
+ *
+ * This function associates the eventfd context with the vmpressure
+ * infrastructure, so that the notifications will be delivered to the
+ * @eventfd. The @args parameter is a string that denotes the pressure
+ * level threshold (one of vmpressure_str_levels, i.e. "low", "medium",
+ * or "critical").
+ *
+ * This function should not be used directly, just pass it to (struct
+ * cftype).register_event, and then the cgroup core will handle everything
+ * by itself.
+ */
+int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
+			      struct eventfd_ctx *eventfd, const char *args)
+{
+	struct vmpressure *vmpr = cg_to_vmpressure(cg);
+	struct vmpressure_event *ev;
+	int level;
+
+	for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
+		if (!strcmp(vmpressure_str_levels[level], args))
+			break;
+	}
+
+	if (level >= VMPRESSURE_NUM_LEVELS)
+		return -EINVAL;
+
+	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+	if (!ev)
+		return -ENOMEM;
+
+	ev->efd = eventfd;
+	ev->level = level;
+
+	mutex_lock(&vmpr->events_lock);
+	list_add(&ev->node, &vmpr->events);
+	mutex_unlock(&vmpr->events_lock);
+
+	return 0;
+}
+
+/**
+ * vmpressure_unregister_event() - Unbind eventfd from vmpressure
+ * @cg:		cgroup handle
+ * @cft:	cgroup control files handle
+ * @eventfd:	eventfd context that was used to link vmpressure with the @cg
+ *
+ * This function does internal manipulations to detach the @eventfd from
+ * the vmpressure notifications, and then frees internal resources
+ * associated with the @eventfd (but the @eventfd itself is not freed).
+ *
+ * This function should not be used directly, just pass it to (struct
+ * cftype).unregister_event, and then the cgroup core will handle
+ * everything by itself.
+ */
+void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
+				 struct eventfd_ctx *eventfd)
+{
+	struct vmpressure *vmpr = cg_to_vmpressure(cg);
+	struct vmpressure_event *ev;
+
+	mutex_lock(&vmpr->events_lock);
+	list_for_each_entry(ev, &vmpr->events, node) {
+		if (ev->efd != eventfd)
+			continue;
+		list_del(&ev->node);
+		kfree(ev);
+		break;
+	}
+	mutex_unlock(&vmpr->events_lock);
+}
+
+/**
+ * vmpressure_init() - Initialize vmpressure control structure
+ * @vmpr:	Structure to be initialized
+ *
+ * This function should be called on every allocated vmpressure structure
+ * before any usage.
+ */
+void vmpressure_init(struct vmpressure *vmpr)
+{
+	mutex_init(&vmpr->sr_lock);
+	mutex_init(&vmpr->events_lock);
+	INIT_LIST_HEAD(&vmpr->events);
+	INIT_WORK(&vmpr->work, vmpressure_work_fn);
+}
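
To make the scanned/reclaimed arithmetic in vmpressure_calc_level()
concrete: up to integer rounding, the two divisions reduce to
100 * (scanned - reclaimed) / scanned, i.e. the percentage of scanned
pages that the reclaimer failed to reclaim. The following standalone
userspace re-derivation is a sketch for illustration only; the
thresholds are copied from the file above.

/* Userspace sketch of vmpressure_calc_level(); illustrative only. */
#include <stdio.h>

static const unsigned int level_med = 60;	/* vmpressure_level_med */
static const unsigned int level_critical = 95;	/* vmpressure_level_critical */

static const char *calc_level(unsigned long scanned, unsigned long reclaimed)
{
	unsigned long scale = scanned + reclaimed;
	unsigned long pressure;

	/* Same integer arithmetic as the kernel code above. */
	pressure = scale - (reclaimed * scale / scanned);
	pressure = pressure * 100 / scale;

	if (pressure >= level_critical)
		return "critical";
	if (pressure >= level_med)
		return "medium";
	return "low";
}

int main(void)
{
	/* One 512-page window with varying reclaim success. */
	printf("%s\n", calc_level(512, 256));	/* pressure 50 -> low      */
	printf("%s\n", calc_level(512, 128));	/* pressure 75 -> medium   */
	printf("%s\n", calc_level(512, 16));	/* pressure 96 -> critical */
	return 0;
}

So a window where half of the scanned pages are reclaimed stays "low",
while a window where almost nothing could be reclaimed crosses the 95%
"critical" threshold.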
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e03a00b09da9..e53e49584cf3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/highmem.h>
+#include <linux/vmpressure.h>
 #include <linux/vmstat.h>
 #include <linux/file.h>
 #include <linux/writeback.h>
@@ -1982,6 +1983,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 		}
 		memcg = mem_cgroup_iter(root, memcg, &reclaim);
 	} while (memcg);
+
+	vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
+		   sc->nr_scanned - nr_scanned,
+		   sc->nr_reclaimed - nr_reclaimed);
+
 	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
 					 sc->nr_scanned - nr_scanned, sc));
 }
@@ -2167,6 +2173,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	count_vm_event(ALLOCSTALL);
 
 	do {
+		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
+				sc->priority);
 		sc->nr_scanned = 0;
 		aborted_reclaim = shrink_zones(zonelist, sc);
 