memcg: add memory.pressure_level events

With this patch userland applications that want to maintain the interactivity/memory allocation cost can use the pressure level notifications. The levels are defined like this: The "low" level means that the system is reclaiming memory for new allocations. Monitoring this reclaiming activity might be useful for maintaining cache level. Upon notification, the program (typically "Activity Manager") might analyze vmstat and act in advance (i.e. prematurely shut down unimportant services). The "medium" level means that the system is experiencing medium memory pressure: the system might be swapping, paging out active file caches, etc. Upon this event applications may decide to further analyze vmstat/zoneinfo/memcg or internal memory usage statistics and free any resources that can be easily reconstructed or re-read from a disk. The "critical" level means that the system is actively thrashing, it is about to run out of memory (OOM) or even the in-kernel OOM killer is on its way to trigger. Applications should do whatever they can to help the system. It might be too late to consult with vmstat or any other statistics, so it's advisable to take immediate action. The events are propagated upward until the event is handled, i.e. the events are not pass-through. Here is what this means: for example you have three cgroups: A->B->C. Now you set up an event listener on cgroups A, B and C, and suppose group C experiences some pressure. In this situation, only group C will receive the notification, i.e. groups A and B will not receive it. This is done to avoid excessive "broadcasting" of messages, which disturbs the system and which is especially bad if we are low on memory or thrashing. So, organize the cgroups wisely, or propagate the events manually (or ask us to implement the pass-through events, explaining why you would need them). 
Performance-wise, the memory pressure notifications feature itself is lightweight and does not require much bookkeeping, in contrast to the rest of memcg features. Unfortunately, as of current memcg implementation, pages accounting is an inseparable part and cannot be turned off. The good news is that there are some efforts[1] to improve the situation; plus, implementing the same, fully API-compatible[2] interface for CONFIG_MEMCG=n case (e.g. embedded) is also a viable option, so it will not require any changes on the userland side. [1] http://permalink.gmane.org/gmane.linux.kernel.cgroups/6291 [2] http://lkml.org/lkml/2013/2/21/454 [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix CONFIG_CGROUPS=n warnings] Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org> Acked-by: Kirill A. Shutemov <kirill@shutemov.name> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Tejun Heo <tj@kernel.org> Cc: David Rientjes <rientjes@google.com> Cc: Pekka Enberg <penberg@kernel.org> Cc: Mel Gorman <mgorman@suse.de> Cc: Glauber Costa <glommer@parallels.com> Cc: Michal Hocko <mhocko@suse.cz> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Greg Thelen <gthelen@google.com> Cc: Leonid Moiseichuk <leonid.moiseichuk@nokia.com> Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com> Cc: John Stultz <john.stultz@linaro.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Anton Vorontsov <anton.vorontsov@linaro.org> 2013-04-29 18:08:31 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2013-04-29 18:54:38 -0400
commit: 70ddf637eebe47e61fb2be08a59315581b6d2f38 (patch)
tree: 7fdb9e04da11c191daa225cad2314e440effc176 /mm
parent: 84d96d897671cfb386e722acbefdb3a79e115a8a (diff)
4 files changed, 412 insertions, 1 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 3a4628751f89..72c5acb9345f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -50,7 +50,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
-obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7e5bc43c2d1f..360464f40e96 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -49,6 +49,7 @@
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/vmalloc.h>
+#include <linux/vmpressure.h>
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
@@ -261,6 +262,9 @@ struct mem_cgroup {
         */
        struct res_counter res;
+        /* vmpressure notifications */
+        struct vmpressure vmpressure;
        union {
                /*
                 * the counter to account for mem+swap usage.
@@ -359,6 +363,7 @@ struct mem_cgroup {
        atomic_t        numainfo_events;
        atomic_t        numainfo_updating;
 #endif
        /*
         * Per cgroup active and inactive list, similar to the
         * per zone LRU lists.
@@ -510,6 +515,24 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
        return container_of(s, struct mem_cgroup, css);
 }
+/*
+ * Accessors for the vmpressure state embedded in struct mem_cgroup.
+ *
+ * memcg_to_vmpressure() returns the vmpressure of @memcg; a NULL @memcg
+ * selects the vmpressure of root_mem_cgroup.
+ */
+struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
+{
+        struct mem_cgroup *target = memcg ? memcg : root_mem_cgroup;
+        return &target->vmpressure;
+}
+struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
+{
+        struct mem_cgroup *owner;
+        /* @vmpr is the ->vmpressure member of a mem_cgroup; step back to it. */
+        owner = container_of(vmpr, struct mem_cgroup, vmpressure);
+        return &owner->css;
+}
+struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
+{
+        /* Resolve the css to its mem_cgroup, then hand out the embedded state. */
+        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+        return &memcg->vmpressure;
+}
 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 {
        return (memcg == root_mem_cgroup);
@@ -5907,6 +5930,11 @@ static struct cftype mem_cgroup_files[] = {
                .unregister_event = mem_cgroup_oom_unregister_event,
                .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
        },
+        {
+                .name = "pressure_level",
+                .register_event = vmpressure_register_event,
+                .unregister_event = vmpressure_unregister_event,
+        },
 #ifdef CONFIG_NUMA
        {
                .name = "numa_stat",
@@ -6188,6 +6216,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
        memcg->move_charge_at_immigrate = 0;
        mutex_init(&memcg->thresholds_lock);
        spin_lock_init(&memcg->move_lock);
+        vmpressure_init(&memcg->vmpressure);
        return &memcg->css;
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
new file mode 100644
index 000000000000..736a6011c2c8
--- /dev/null
+++ b/mm/vmpressure.c
@@ -0,0 +1,374 @@
+/*
+ * Linux VM pressure
+ *
+ * Copyright 2012 Linaro Ltd.
+ *                Anton Vorontsov <anton.vorontsov@linaro.org>
+ *
+ * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
+ * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/log2.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/eventfd.h>
+#include <linux/swap.h>
+#include <linux/printk.h>
+#include <linux/vmpressure.h>
+/*
+ * The window size (vmpressure_win) is the number of scanned pages that
+ * must accumulate before we analyze the scanned/reclaimed ratio. The
+ * window therefore acts as a rate-limit tunable for the "low" level
+ * notification, and also as the averaging period of the ratio for the
+ * medium/critical levels. A small window can cause a lot of false
+ * positives, whereas a window that is too big will delay the
+ * notifications.
+ *
+ * As the vmscan reclaimer logic works with chunks which are multiples of
+ * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
+ *
+ * TODO: Make the window size depend on machine size, as we do for vmstat
+ * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
+ */
+static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
+/*
+ * These thresholds are used when we account memory pressure through the
+ * scanned/reclaimed ratio. The current values were chosen empirically. In
+ * essence, they are percentages: the higher the value, the greater the
+ * share of unsuccessful reclaims there was. vmpressure_level() compares
+ * the computed pressure against them with ">=".
+ */
+static const unsigned int vmpressure_level_med = 60;
+static const unsigned int vmpressure_level_critical = 95;
+/*
+ * When there are too few pages left to scan, vmpressure() may miss the
+ * critical pressure as the number of pages will be less than the "window
+ * size". However, in that case the vmscan priority will rise fast as the
+ * reclaimer will try to scan LRUs more deeply.
+ *
+ * The vmscan logic considers these special priorities:
+ *
+ * prio == DEF_PRIORITY (12): reclaimer starts with that value
+ * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
+ * prio == 0                : close to OOM, kernel scans every page in an lru
+ *
+ * Any value in this range is acceptable for this tunable (i.e. from 12 to
+ * 0). The current value of vmpressure_level_critical_prio was chosen
+ * empirically, but the number, in essence, means that we consider the
+ * level critical when the scanning depth is ~10% of the lru size (vmscan
+ * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
+ * eighth).
+ */
+static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
+static struct vmpressure *work_to_vmpressure(struct work_struct *work)
+{
+        /* @work is the ->work member of a struct vmpressure. */
+        struct vmpressure *vmpr = container_of(work, struct vmpressure, work);
+        return vmpr;
+}
+static struct vmpressure *cg_to_vmpressure(struct cgroup *cg)
+{
+        struct cgroup_subsys_state *css;
+        /* Look up the memory controller's css for this cgroup first. */
+        css = cgroup_subsys_state(cg, mem_cgroup_subsys_id);
+        return css_to_vmpressure(css);
+}
+static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
+{
+        struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
+        struct mem_cgroup *parent;
+        parent = parent_mem_cgroup(mem_cgroup_from_cont(css->cgroup));
+        /*
+         * The NULL case must be handled here: memcg_to_vmpressure(NULL)
+         * would hand back the root's vmpressure instead of "no parent".
+         */
+        return parent ? memcg_to_vmpressure(parent) : NULL;
+}
+enum vmpressure_levels {
+        VMPRESSURE_LOW = 0,             /* pressure below vmpressure_level_med */
+        VMPRESSURE_MEDIUM,              /* pressure >= vmpressure_level_med */
+        VMPRESSURE_CRITICAL,            /* pressure >= vmpressure_level_critical */
+        VMPRESSURE_NUM_LEVELS,          /* number of levels above; used as a loop bound */
+};
+static const char * const vmpressure_str_levels[] = {   /* names compared against the register_event @args string */
+        [VMPRESSURE_LOW] = "low",
+        [VMPRESSURE_MEDIUM] = "medium",
+        [VMPRESSURE_CRITICAL] = "critical",
+};
+/*
+ * Classify a pressure value (a percentage, see vmpressure_calc_level())
+ * against the medium and critical thresholds.
+ */
+static enum vmpressure_levels vmpressure_level(unsigned long pressure)
+{
+        enum vmpressure_levels level = VMPRESSURE_LOW;
+        if (pressure >= vmpressure_level_med)
+                level = VMPRESSURE_MEDIUM;
+        /* Checked last so that critical always wins. */
+        if (pressure >= vmpressure_level_critical)
+                level = VMPRESSURE_CRITICAL;
+        return level;
+}
+/*
+ * vmpressure_calc_level() - turn one window's counters into a level
+ * @scanned:    pages scanned during the window
+ * @reclaimed:  pages reclaimed during the window
+ *
+ * Computes the share (in percent) of scanned pages that were not
+ * reclaimed and maps it to a level with vmpressure_level().
+ */
+static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
+                                                    unsigned long reclaimed)
+{
+        unsigned long scale = scanned + reclaimed;
+        unsigned long pressure = 0;
+        /*
+         * Nothing visible here guarantees reclaimed <= scanned. When
+         * reclaimed is the larger one, "reclaimed * scale / scanned"
+         * exceeds scale, the unsigned subtraction below wraps around and
+         * a bogus critical level is reported. Treat reclaimed >= scanned
+         * as "no pressure" instead; this also keeps scanned == 0 away
+         * from the division.
+         */
+        if (reclaimed >= scanned)
+                goto out;
+        /*
+         * We calculate the ratio (in percents) of how many pages were
+         * scanned vs. reclaimed in a given time frame (window). Note that
+         * time is in VM reclaimer's "ticks", i.e. number of pages
+         * scanned. This makes it possible to set desired reaction time
+         * and serves as a ratelimit.
+         */
+        pressure = scale - (reclaimed * scale / scanned);
+        pressure = pressure * 100 / scale;
+out:
+        pr_debug("%s: %3lu  (s: %lu  r: %lu)\n", __func__, pressure,
+                 scanned, reclaimed);
+        return vmpressure_level(pressure);
+}
+struct vmpressure_event {
+        struct eventfd_ctx *efd;                /* eventfd passed to eventfd_signal() on delivery */
+        enum vmpressure_levels level;           /* threshold: signalled when the computed level is >= this */
+        struct list_head node;                  /* entry in the owning vmpressure's ->events list */
+};
+/*
+ * Offer one window of counters to the listeners registered on @vmpr.
+ * Every listener whose threshold is at or below the computed level gets
+ * its eventfd signalled. Returns true if at least one was signalled.
+ */
+static bool vmpressure_event(struct vmpressure *vmpr,
+                             unsigned long scanned, unsigned long reclaimed)
+{
+        enum vmpressure_levels cur = vmpressure_calc_level(scanned, reclaimed);
+        struct vmpressure_event *listener;
+        bool delivered = false;
+        mutex_lock(&vmpr->events_lock);
+        list_for_each_entry(listener, &vmpr->events, node) {
+                /* Listener asked for a higher level than we reached. */
+                if (listener->level > cur)
+                        continue;
+                eventfd_signal(listener->efd, 1);
+                delivered = true;
+        }
+        mutex_unlock(&vmpr->events_lock);
+        return delivered;
+}
+/*
+ * vmpressure_work_fn() - deliver one window of accumulated pressure
+ * @work:       work item embedded in the struct vmpressure to process
+ *
+ * Snapshots and resets the accumulated scanned/reclaimed counters, then
+ * offers them to the listeners of this vmpressure. If nobody was
+ * signalled, the same counters are offered to each ancestor in turn
+ * until some listener is signalled or there is no parent left.
+ */
+static void vmpressure_work_fn(struct work_struct *work)
+{
+        struct vmpressure *vmpr = work_to_vmpressure(work);
+        unsigned long scanned;
+        unsigned long reclaimed;
+        /*
+         * Several contexts might be calling vmpressure(), so it is
+         * possible that the work was rescheduled again before the old
+         * work context cleared the counters. In that case we will run
+         * just after the old work returns, but then scanned might be zero
+         * here. This unlocked peek is only a cheap early exit.
+         */
+        if (!vmpr->scanned)
+                return;
+        mutex_lock(&vmpr->sr_lock);
+        scanned = vmpr->scanned;
+        /*
+         * The peek above is not enough: the counters may have been
+         * cleared between it and taking the lock. Re-check the value we
+         * actually read, since a zero scanned would be used as a divisor
+         * when the ratio is computed.
+         */
+        if (!scanned) {
+                mutex_unlock(&vmpr->sr_lock);
+                return;
+        }
+        reclaimed = vmpr->reclaimed;
+        vmpr->scanned = 0;
+        vmpr->reclaimed = 0;
+        mutex_unlock(&vmpr->sr_lock);
+        do {
+                if (vmpressure_event(vmpr, scanned, reclaimed))
+                        break;
+                /*
+                 * If not handled, propagate the event upward into the
+                 * hierarchy.
+                 */
+        } while ((vmpr = vmpressure_parent(vmpr)));
+}
+/**
+ * vmpressure() - Account memory pressure through scanned/reclaimed ratio
+ * @gfp:        reclaimer's gfp mask
+ * @memcg:      cgroup memory controller handle
+ * @scanned:    number of pages scanned
+ * @reclaimed:  number of pages reclaimed
+ *
+ * Called from the vmscan reclaim path to account "instantaneous" memory
+ * pressure. The counters are accumulated per vmpressure; once the
+ * accumulated number of scanned pages reaches vmpressure_win and no work
+ * is pending, the work that analyzes and reports the ratio is scheduled.
+ *
+ * This function does not return any value.
+ */
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+                unsigned long scanned, unsigned long reclaimed)
+{
+        struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+        unsigned long window;
+        /*
+         * Only account pressure that userland is able to help us with.
+         * Suppose the DMA zone is under pressure: notifying userland
+         * about it would mostly be a waste, as it would trigger
+         * unnecessary freeing of memory by userland (which is more likely
+         * to hold HIGHMEM/MOVABLE pages than the DMA fallback). Hence
+         * only movable, highmem and FS/IO reclaim is counted. Indirect
+         * reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so it is
+         * accounted too.
+         */
+        if ((gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)) == 0)
+                return;
+        /*
+         * Zero scanned pages means the reclaimer found no shrinkable
+         * LRUs at the current scanning depth. That alone is not a reason
+         * to report critical pressure: if the scanning priority goes too
+         * deep, vmpressure_prio() will tell us. Until then, stay quiet.
+         */
+        if (scanned == 0)
+                return;
+        mutex_lock(&vmpr->sr_lock);
+        vmpr->scanned += scanned;
+        vmpr->reclaimed += reclaimed;
+        window = vmpr->scanned;
+        mutex_unlock(&vmpr->sr_lock);
+        /* Kick the worker only for a full window, and only once. */
+        if (window >= vmpressure_win && !work_pending(&vmpr->work))
+                schedule_work(&vmpr->work);
+}
+/**
+ * vmpressure_prio() - Account memory pressure through reclaimer priority level
+ * @gfp:        reclaimer's gfp mask
+ * @memcg:      cgroup memory controller handle
+ * @prio:       reclaimer's priority
+ *
+ * To be called from the reclaim path each time the vmscan reclaiming
+ * priority (scanning depth) changes.
+ *
+ * This function does not return any value.
+ */
+void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
+{
+        /*
+         * The priority is only used for accounting the critical level;
+         * see the comment for vmpressure_level_critical_prio above. Once
+         * prio has dropped to the threshold, update the vmpressure
+         * information before the shrinker dives into a long, deep scan:
+         * passing scanned = vmpressure_win, reclaimed = 0 to vmpressure()
+         * basically means that we signal the 'critical' level.
+         */
+        if (prio <= vmpressure_level_critical_prio)
+                vmpressure(gfp, memcg, vmpressure_win, 0);
+}
+/**
+ * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
+ * @cg:         cgroup that is interested in vmpressure notifications
+ * @cft:        cgroup control files handle
+ * @eventfd:    eventfd context to link notifications with
+ * @args:       event arguments (used to set up a pressure level threshold)
+ *
+ * Links @eventfd to the vmpressure of @cg so that notifications get
+ * delivered to it. @args is a string naming the pressure level threshold
+ * and must be one of vmpressure_str_levels, i.e. "low", "medium", or
+ * "critical".
+ *
+ * Returns 0 on success, -EINVAL if @args names no known level, or
+ * -ENOMEM if the listener could not be allocated.
+ *
+ * This function should not be used directly, just pass it to (struct
+ * cftype).register_event, and then cgroup core will handle everything by
+ * itself.
+ */
+int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
+                              struct eventfd_ctx *eventfd, const char *args)
+{
+        struct vmpressure *vmpr = cg_to_vmpressure(cg);
+        struct vmpressure_event *listener;
+        int level = 0;
+        /* Find the index of the level whose name equals @args. */
+        while (level < VMPRESSURE_NUM_LEVELS &&
+               strcmp(vmpressure_str_levels[level], args))
+                level++;
+        if (level == VMPRESSURE_NUM_LEVELS)
+                return -EINVAL;
+        listener = kzalloc(sizeof(*listener), GFP_KERNEL);
+        if (!listener)
+                return -ENOMEM;
+        listener->level = level;
+        listener->efd = eventfd;
+        mutex_lock(&vmpr->events_lock);
+        list_add(&listener->node, &vmpr->events);
+        mutex_unlock(&vmpr->events_lock);
+        return 0;
+}
+/**
+ * vmpressure_unregister_event() - Unbind eventfd from vmpressure
+ * @cg:         cgroup handle
+ * @cft:        cgroup control files handle
+ * @eventfd:    eventfd context that was used to link vmpressure with the @cg
+ *
+ * Removes the first listener registered with @eventfd from the
+ * vmpressure of @cg and frees it (the @eventfd itself is not freed).
+ * Nothing happens if no listener matches.
+ *
+ * This function should not be used directly, just pass it to (struct
+ * cftype).unregister_event, and then cgroup core will handle everything
+ * by itself.
+ */
+void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
+                                 struct eventfd_ctx *eventfd)
+{
+        struct vmpressure *vmpr = cg_to_vmpressure(cg);
+        struct vmpressure_event *listener;
+        mutex_lock(&vmpr->events_lock);
+        list_for_each_entry(listener, &vmpr->events, node) {
+                if (listener->efd == eventfd) {
+                        list_del(&listener->node);
+                        kfree(listener);
+                        /* The cursor is gone; stop walking the list. */
+                        break;
+                }
+        }
+        mutex_unlock(&vmpr->events_lock);
+}
+/**
+ * vmpressure_init() - Initialize vmpressure control structure
+ * @vmpr:       Structure to be initialized
+ *
+ * Must be called on every allocated vmpressure structure before it is
+ * used in any way.
+ */
+void vmpressure_init(struct vmpressure *vmpr)
+{
+        /* Listener list and the mutex taken around it. */
+        INIT_LIST_HEAD(&vmpr->events);
+        mutex_init(&vmpr->events_lock);
+        /* Mutex for the scanned/reclaimed counters, and the deferred work. */
+        mutex_init(&vmpr->sr_lock);
+        INIT_WORK(&vmpr->work, vmpressure_work_fn);
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e03a00b09da9..e53e49584cf3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/highmem.h>
+#include <linux/vmpressure.h>
 #include <linux/vmstat.h>
 #include <linux/file.h>
 #include <linux/writeback.h>
@@ -1982,6 +1983,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
                        }
                        memcg = mem_cgroup_iter(root, memcg, &reclaim);
                } while (memcg);
+                vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
+                           sc->nr_scanned - nr_scanned,
+                           sc->nr_reclaimed - nr_reclaimed);
        } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
                                         sc->nr_scanned - nr_scanned, sc));
 }
@@ -2167,6 +2173,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                count_vm_event(ALLOCSTALL);
        do {
+                vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
+                                sc->priority);
                sc->nr_scanned = 0;
                aborted_reclaim = shrink_zones(zonelist, sc);
author	Anton Vorontsov <anton.vorontsov@linaro.org>	2013-04-29 18:08:31 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-04-29 18:54:38 -0400
commit	70ddf637eebe47e61fb2be08a59315581b6d2f38 (patch)
tree	7fdb9e04da11c191daa225cad2314e440effc176 /mm
parent	84d96d897671cfb386e722acbefdb3a79e115a8a (diff)

diff --git a/mm/Makefile b/mm/Makefile index 3a4628751f89..72c5acb9345f 100644 --- a/mm/Makefile +++ b/mm/Makefile
@@ -50,7 +50,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
50	obj-$(CONFIG_MIGRATION) += migrate.o	50	obj-$(CONFIG_MIGRATION) += migrate.o
51	obj-$(CONFIG_QUICKLIST) += quicklist.o	51	obj-$(CONFIG_QUICKLIST) += quicklist.o
52	obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o	52	obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
53	obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o	53	obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
54	obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o	54	obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
55	obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o	55	obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
56	obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o	56	obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o


diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7e5bc43c2d1f..360464f40e96 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c
@@ -49,6 +49,7 @@
49	#include <linux/fs.h>	49	#include <linux/fs.h>
50	#include <linux/seq_file.h>	50	#include <linux/seq_file.h>
51	#include <linux/vmalloc.h>	51	#include <linux/vmalloc.h>
		52	#include <linux/vmpressure.h>
52	#include <linux/mm_inline.h>	53	#include <linux/mm_inline.h>
53	#include <linux/page_cgroup.h>	54	#include <linux/page_cgroup.h>
54	#include <linux/cpu.h>	55	#include <linux/cpu.h>
@@ -261,6 +262,9 @@ struct mem_cgroup {
261	*/	262	*/
262	struct res_counter res;	263	struct res_counter res;
263		264
		265	/* vmpressure notifications */
		266	struct vmpressure vmpressure;
		267
264	union {	268	union {
265	/*	269	/*
266	* the counter to account for mem+swap usage.	270	* the counter to account for mem+swap usage.
@@ -359,6 +363,7 @@ struct mem_cgroup {
359	atomic_t numainfo_events;	363	atomic_t numainfo_events;
360	atomic_t numainfo_updating;	364	atomic_t numainfo_updating;
361	#endif	365	#endif
		366
362	/*	367	/*
363	* Per cgroup active and inactive list, similar to the	368	* Per cgroup active and inactive list, similar to the
364	* per zone LRU lists.	369	* per zone LRU lists.
@@ -510,6 +515,24 @@ struct mem_cgroup mem_cgroup_from_css(struct cgroup_subsys_state s)
510	return container_of(s, struct mem_cgroup, css);	515	return container_of(s, struct mem_cgroup, css);
511	}	516	}
512		517
		518	/* Some nice accessors for the vmpressure. */
		519	struct vmpressure memcg_to_vmpressure(struct mem_cgroup memcg)
		520	{
		521	if (!memcg)
		522	memcg = root_mem_cgroup;
		523	return &memcg->vmpressure;
		524	}
		525
		526	struct cgroup_subsys_state vmpressure_to_css(struct vmpressure vmpr)
		527	{
		528	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
		529	}
		530
		531	struct vmpressure css_to_vmpressure(struct cgroup_subsys_state css)
		532	{
		533	return &mem_cgroup_from_css(css)->vmpressure;
		534	}
		535
513	static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)	536	static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
514	{	537	{
515	return (memcg == root_mem_cgroup);	538	return (memcg == root_mem_cgroup);
@@ -5907,6 +5930,11 @@ static struct cftype mem_cgroup_files[] = {
5907	.unregister_event = mem_cgroup_oom_unregister_event,	5930	.unregister_event = mem_cgroup_oom_unregister_event,
5908	.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),	5931	.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
5909	},	5932	},
		5933	{
		5934	.name = "pressure_level",
		5935	.register_event = vmpressure_register_event,
		5936	.unregister_event = vmpressure_unregister_event,
		5937	},
5910	#ifdef CONFIG_NUMA	5938	#ifdef CONFIG_NUMA
5911	{	5939	{
5912	.name = "numa_stat",	5940	.name = "numa_stat",
@@ -6188,6 +6216,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6188	memcg->move_charge_at_immigrate = 0;	6216	memcg->move_charge_at_immigrate = 0;
6189	mutex_init(&memcg->thresholds_lock);	6217	mutex_init(&memcg->thresholds_lock);
6190	spin_lock_init(&memcg->move_lock);	6218	spin_lock_init(&memcg->move_lock);
		6219	vmpressure_init(&memcg->vmpressure);
6191		6220
6192	return &memcg->css;	6221	return &memcg->css;
6193		6222


diff --git a/mm/vmpressure.c b/mm/vmpressure.c new file mode 100644 index 000000000000..736a6011c2c8 --- /dev/null +++ b/mm/vmpressure.c
@@ -0,0 +1,374 @@
		1	/*
		2	* Linux VM pressure
		3	*
		4	* Copyright 2012 Linaro Ltd.
		5	* Anton Vorontsov <anton.vorontsov@linaro.org>
		6	*
		7	* Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
		8	* Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
		9	*
		10	* This program is free software; you can redistribute it and/or modify it
		11	* under the terms of the GNU General Public License version 2 as published
		12	* by the Free Software Foundation.
		13	*/
		14
		15	#include <linux/cgroup.h>
		16	#include <linux/fs.h>
		17	#include <linux/log2.h>
		18	#include <linux/sched.h>
		19	#include <linux/mm.h>
		20	#include <linux/vmstat.h>
		21	#include <linux/eventfd.h>
		22	#include <linux/swap.h>
		23	#include <linux/printk.h>
		24	#include <linux/vmpressure.h>
		25
		26	/*
		27	* The window size (vmpressure_win) is the number of scanned pages before
		28	* we try to analyze scanned/reclaimed ratio. So the window is used as a
		29	* rate-limit tunable for the "low" level notification, and also for
		30	* averaging the ratio for medium/critical levels. Using small window
		31	* sizes can cause lot of false positives, but too big window size will
		32	* delay the notifications.
		33	*
		34	* As the vmscan reclaimer logic works with chunks which are multiple of
		35	* SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
		36	*
		37	* TODO: Make the window size depend on machine size, as we do for vmstat
		38	* thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
		39	*/
		40	static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
		41
		42	/*
		43	* These thresholds are used when we account memory pressure through
		44	* scanned/reclaimed ratio. The current values were chosen empirically. In
		45	* essence, they are percents: the higher the value, the more number
		46	* unsuccessful reclaims there were.
		47	*/
		48	static const unsigned int vmpressure_level_med = 60;
		49	static const unsigned int vmpressure_level_critical = 95;
		50
		51	/*
		52	* When there are too little pages left to scan, vmpressure() may miss the
		53	* critical pressure as number of pages will be less than "window size".
		54	* However, in that case the vmscan priority will raise fast as the
		55	* reclaimer will try to scan LRUs more deeply.
		56	*
		57	* The vmscan logic considers these special priorities:
		58	*
		59	* prio == DEF_PRIORITY (12): reclaimer starts with that value
		60	* prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
		61	* prio == 0 : close to OOM, kernel scans every page in an lru
		62	*
		63	* Any value in this range is acceptable for this tunable (i.e. from 12 to
		64	* 0). Current value for the vmpressure_level_critical_prio is chosen
		65	* empirically, but the number, in essence, means that we consider
		66	* critical level when scanning depth is ~10% of the lru size (vmscan
		67	* scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
		68	* eights).
		69	*/
		70	static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
		71
		72	static struct vmpressure work_to_vmpressure(struct work_struct work)
		73	{
		74	return container_of(work, struct vmpressure, work);
		75	}
		76
		77	static struct vmpressure cg_to_vmpressure(struct cgroup cg)
		78	{
		79	return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id));
		80	}
		81
		82	static struct vmpressure vmpressure_parent(struct vmpressure vmpr)
		83	{
		84	struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup;
		85	struct mem_cgroup *memcg = mem_cgroup_from_cont(cg);
		86
		87	memcg = parent_mem_cgroup(memcg);
		88	if (!memcg)
		89	return NULL;
		90	return memcg_to_vmpressure(memcg);
		91	}
		92
		93	enum vmpressure_levels {
		94	VMPRESSURE_LOW = 0,
		95	VMPRESSURE_MEDIUM,
		96	VMPRESSURE_CRITICAL,
		97	VMPRESSURE_NUM_LEVELS,
		98	};
		99
		100	static const char * const vmpressure_str_levels[] = {
		101	[VMPRESSURE_LOW] = "low",
		102	[VMPRESSURE_MEDIUM] = "medium",
		103	[VMPRESSURE_CRITICAL] = "critical",
		104	};
		105
		106	static enum vmpressure_levels vmpressure_level(unsigned long pressure)
		107	{
		108	if (pressure >= vmpressure_level_critical)
		109	return VMPRESSURE_CRITICAL;
		110	else if (pressure >= vmpressure_level_med)
		111	return VMPRESSURE_MEDIUM;
		112	return VMPRESSURE_LOW;
		113	}
		114
		115	static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
		116	unsigned long reclaimed)
		117	{
		118	unsigned long scale = scanned + reclaimed;
		119	unsigned long pressure;
		120
		121	/*
		122	* We calculate the ratio (in percents) of how many pages were
		123	* scanned vs. reclaimed in a given time frame (window). Note that
		124	* time is in VM reclaimer's "ticks", i.e. number of pages
		125	* scanned. This makes it possible to set desired reaction time
		126	* and serves as a ratelimit.
		127	*/
		128	pressure = scale - (reclaimed * scale / scanned);
		129	pressure = pressure * 100 / scale;
		130
		131	pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
		132	scanned, reclaimed);
		133
		134	return vmpressure_level(pressure);
		135	}
		136
		137	struct vmpressure_event {
		138	struct eventfd_ctx *efd;
		139	enum vmpressure_levels level;
		140	struct list_head node;
		141	};
		142
		143	static bool vmpressure_event(struct vmpressure *vmpr,
		144	unsigned long scanned, unsigned long reclaimed)
		145	{
		146	struct vmpressure_event *ev;
		147	enum vmpressure_levels level;
		148	bool signalled = false;
		149
		150	level = vmpressure_calc_level(scanned, reclaimed);
		151
		152	mutex_lock(&vmpr->events_lock);
		153
		154	list_for_each_entry(ev, &vmpr->events, node) {
		155	if (level >= ev->level) {
		156	eventfd_signal(ev->efd, 1);
		157	signalled = true;
		158	}
		159	}
		160
		161	mutex_unlock(&vmpr->events_lock);
		162
		163	return signalled;
		164	}
		165
		166	static void vmpressure_work_fn(struct work_struct *work)
		167	{
		168	struct vmpressure *vmpr = work_to_vmpressure(work);
		169	unsigned long scanned;
		170	unsigned long reclaimed;
		171
		172	/*
		173	* Several contexts might be calling vmpressure(), so it is
		174	* possible that the work was rescheduled again before the old
		175	* work context cleared the counters. In that case we will run
		176	* just after the old work returns, but then scanned might be zero
		177	* here. No need for any locks here since we don't care if
		178	* vmpr->reclaimed is in sync.
		179	*/
		180	if (!vmpr->scanned)
		181	return;
		182
		183	mutex_lock(&vmpr->sr_lock);
		184	scanned = vmpr->scanned;
		185	reclaimed = vmpr->reclaimed;
		186	vmpr->scanned = 0;
		187	vmpr->reclaimed = 0;
		188	mutex_unlock(&vmpr->sr_lock);
		189
		190	do {
		191	if (vmpressure_event(vmpr, scanned, reclaimed))
		192	break;
		193	/*
		194	* If not handled, propagate the event upward into the
		195	* hierarchy.
		196	*/
		197	} while ((vmpr = vmpressure_parent(vmpr)));
		198	}
		199
		200	/**
		201	* vmpressure() - Account memory pressure through scanned/reclaimed ratio
		202	* @gfp: reclaimer's gfp mask
		203	* @memcg: cgroup memory controller handle
		204	* @scanned: number of pages scanned
		205	* @reclaimed: number of pages reclaimed
		206	*
		207	* This function should be called from the vmscan reclaim path to account
		208	* "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
		209	* pressure index is then further refined and averaged over time.
		210	*
		211	* This function does not return any value.
		212	*/
		213	void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
		214	unsigned long scanned, unsigned long reclaimed)
		215	{
		216	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
		217
		218	/*
		219	* Here we only want to account pressure that userland is able to
		220	* help us with. For example, suppose that DMA zone is under
		221	* pressure; if we notify userland about that kind of pressure,
		222	* then it will be mostly a waste as it will trigger unnecessary
		223	* freeing of memory by userland (since userland is more likely to
		224	* have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
		225	* is why we include only movable, highmem and FS/IO pages.
		226	* Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
		227	* we account it too.
		228	*/
		229	if (!(gfp & (__GFP_HIGHMEM \| __GFP_MOVABLE \| __GFP_IO \| __GFP_FS)))
		230	return;
		231
		232	/*
		233	* If we got here with no pages scanned, then that is an indicator
		234	* that reclaimer was unable to find any shrinkable LRUs at the
		235	* current scanning depth. But it does not mean that we should
		236	* report the critical pressure, yet. If the scanning priority
		237	* (scanning depth) goes too high (deep), we will be notified
		238	* through vmpressure_prio(). But so far, keep calm.
		239	*/
		240	if (!scanned)
		241	return;
		242
		243	mutex_lock(&vmpr->sr_lock);
		244	vmpr->scanned += scanned;
		245	vmpr->reclaimed += reclaimed;
		246	scanned = vmpr->scanned;
		247	mutex_unlock(&vmpr->sr_lock);
		248
		249	if (scanned < vmpressure_win \|\| work_pending(&vmpr->work))
		250	return;
		251	schedule_work(&vmpr->work);
		252	}
		253
		254	/**
		255	* vmpressure_prio() - Account memory pressure through reclaimer priority level
		256	* @gfp: reclaimer's gfp mask
		257	* @memcg: cgroup memory controller handle
		258	* @prio: reclaimer's priority
		259	*
		260	* This function should be called from the reclaim path every time when
		261	* the vmscan's reclaiming priority (scanning depth) changes.
		262	*
		263	* This function does not return any value.
		264	*/
		265	void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
		266	{
		267	/*
		268	* We only use prio for accounting critical level. For more info
		269	* see comment for vmpressure_level_critical_prio variable above.
		270	*/
		271	if (prio > vmpressure_level_critical_prio)
		272	return;
		273
		274	/*
		275	* OK, the prio is below the threshold, updating vmpressure
		276	* information before shrinker dives into long shrinking of long
		277	* range vmscan. Passing scanned = vmpressure_win, reclaimed = 0
		278	* to the vmpressure() basically means that we signal 'critical'
		279	* level.
		280	*/
		281	vmpressure(gfp, memcg, vmpressure_win, 0);
		282	}
		283
		284	/**
		285	* vmpressure_register_event() - Bind vmpressure notifications to an eventfd
		286	* @cg: cgroup that is interested in vmpressure notifications
		287	* @cft: cgroup control files handle
		288	* @eventfd: eventfd context to link notifications with
		289	* @args: event arguments (used to set up a pressure level threshold)
		290	*
		291	* This function associates eventfd context with the vmpressure
		292	* infrastructure, so that the notifications will be delivered to the
		293	* @eventfd. The @args parameter is a string that denotes pressure level
		294	* threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
		295	* "critical").
		296	*
		297	* This function should not be used directly, just pass it to (struct
		298	* cftype).register_event, and then cgroup core will handle everything by
		299	* itself.
		300	*/
		301	int vmpressure_register_event(struct cgroup cg, struct cftype cft,
		302	struct eventfd_ctx eventfd, const char args)
		303	{
		304	struct vmpressure *vmpr = cg_to_vmpressure(cg);
		305	struct vmpressure_event *ev;
		306	int level;
		307
		308	for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
		309	if (!strcmp(vmpressure_str_levels[level], args))
		310	break;
		311	}
		312
		313	if (level >= VMPRESSURE_NUM_LEVELS)
		314	return -EINVAL;
		315
		316	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
		317	if (!ev)
		318	return -ENOMEM;
		319
		320	ev->efd = eventfd;
		321	ev->level = level;
		322
		323	mutex_lock(&vmpr->events_lock);
		324	list_add(&ev->node, &vmpr->events);
		325	mutex_unlock(&vmpr->events_lock);
		326
		327	return 0;
		328	}
		329
		330	/**
		331	* vmpressure_unregister_event() - Unbind eventfd from vmpressure
		332	* @cg: cgroup handle
		333	* @cft: cgroup control files handle
		334	* @eventfd: eventfd context that was used to link vmpressure with the @cg
		335	*
		336	* This function does internal manipulations to detach the @eventfd from
		337	* the vmpressure notifications, and then frees internal resources
		338	* associated with the @eventfd (but the @eventfd itself is not freed).
		339	*
		340	* This function should not be used directly, just pass it to (struct
		341	* cftype).unregister_event, and then cgroup core will handle everything
		342	* by itself.
		343	*/
		344	void vmpressure_unregister_event(struct cgroup cg, struct cftype cft,
		345	struct eventfd_ctx *eventfd)
		346	{
		347	struct vmpressure *vmpr = cg_to_vmpressure(cg);
		348	struct vmpressure_event *ev;
		349
		350	mutex_lock(&vmpr->events_lock);
		351	list_for_each_entry(ev, &vmpr->events, node) {
		352	if (ev->efd != eventfd)
		353	continue;
		354	list_del(&ev->node);
		355	kfree(ev);
		356	break;
		357	}
		358	mutex_unlock(&vmpr->events_lock);
		359	}
		360
		361	/**
		362	* vmpressure_init() - Initialize vmpressure control structure
		363	* @vmpr: Structure to be initialized
		364	*
		365	* This function should be called on every allocated vmpressure structure
		366	* before any usage.
		367	*/
		368	void vmpressure_init(struct vmpressure *vmpr)
		369	{
		370	mutex_init(&vmpr->sr_lock);
		371	mutex_init(&vmpr->events_lock);
		372	INIT_LIST_HEAD(&vmpr->events);
		373	INIT_WORK(&vmpr->work, vmpressure_work_fn);
		374	}


diff --git a/mm/vmscan.c b/mm/vmscan.c
index e03a00b09da9..e53e49584cf3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
19	#include <linux/pagemap.h>	19	#include <linux/pagemap.h>
20	#include <linux/init.h>	20	#include <linux/init.h>
21	#include <linux/highmem.h>	21	#include <linux/highmem.h>
		22	#include <linux/vmpressure.h>
22	#include <linux/vmstat.h>	23	#include <linux/vmstat.h>
23	#include <linux/file.h>	24	#include <linux/file.h>
24	#include <linux/writeback.h>	25	#include <linux/writeback.h>
@@ -1982,6 +1983,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
1982	}	1983	}
1983	memcg = mem_cgroup_iter(root, memcg, &reclaim);	1984	memcg = mem_cgroup_iter(root, memcg, &reclaim);
1984	} while (memcg);	1985	} while (memcg);
		1986
		1987	vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
		1988	sc->nr_scanned - nr_scanned,
		1989	sc->nr_reclaimed - nr_reclaimed);
		1990
1985	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,	1991	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
1986	sc->nr_scanned - nr_scanned, sc));	1992	sc->nr_scanned - nr_scanned, sc));
1987	}	1993	}
@@ -2167,6 +2173,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2167	count_vm_event(ALLOCSTALL);	2173	count_vm_event(ALLOCSTALL);
2168		2174
2169	do {	2175	do {
		2176	vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
		2177	sc->priority);
2170	sc->nr_scanned = 0;	2178	sc->nr_scanned = 0;
2171	aborted_reclaim = shrink_zones(zonelist, sc);	2179	aborted_reclaim = shrink_zones(zonelist, sc);
2172		2180