4 files changed, 412 insertions, 1 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 3a4628751f89..72c5acb9345f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -50,7 +50,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
-obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7e5bc43c2d1f..360464f40e96 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -49,6 +49,7 @@
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/vmalloc.h>
+#include <linux/vmpressure.h>
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
@@ -261,6 +262,9 @@ struct mem_cgroup {
         */
        struct res_counter res;
+        /* vmpressure notifications */
+        struct vmpressure vmpressure;
        union {
                /*
                 * the counter to account for mem+swap usage.
@@ -359,6 +363,7 @@ struct mem_cgroup {
        atomic_t        numainfo_events;
        atomic_t        numainfo_updating;
 #endif
        /*
         * Per cgroup active and inactive list, similar to the
         * per zone LRU lists.
@@ -510,6 +515,24 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
        return container_of(s, struct mem_cgroup, css);
 }
+/* Accessors for the vmpressure state embedded in struct mem_cgroup. */
+/*
+ * Return the vmpressure state of @memcg. A NULL @memcg selects the root
+ * memory cgroup.
+ */
+struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
+{
+        struct mem_cgroup *target = memcg ? memcg : root_mem_cgroup;
+        return &target->vmpressure;
+}
+/*
+ * Map a vmpressure structure back to the css of the mem_cgroup that
+ * embeds it.
+ */
+struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
+{
+        struct mem_cgroup *owner;
+        owner = container_of(vmpr, struct mem_cgroup, vmpressure);
+        return &owner->css;
+}
+/* Return the vmpressure state of the mem_cgroup that owns @css. */
+struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
+{
+        struct mem_cgroup *owner = mem_cgroup_from_css(css);
+        return &owner->vmpressure;
+}
 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 {
        return (memcg == root_mem_cgroup);
@@ -5907,6 +5930,11 @@ static struct cftype mem_cgroup_files[] = {
                .unregister_event = mem_cgroup_oom_unregister_event,
                .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
        },
+        {
+                .name = "pressure_level",
+                .register_event = vmpressure_register_event,
+                .unregister_event = vmpressure_unregister_event,
+        },
 #ifdef CONFIG_NUMA
        {
                .name = "numa_stat",
@@ -6188,6 +6216,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
        memcg->move_charge_at_immigrate = 0;
        mutex_init(&memcg->thresholds_lock);
        spin_lock_init(&memcg->move_lock);
+        vmpressure_init(&memcg->vmpressure);
        return &memcg->css;
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
new file mode 100644
index 000000000000..736a6011c2c8
--- /dev/null
+++ b/mm/vmpressure.c
@@ -0,0 +1,374 @@
+/*
+ * Linux VM pressure
+ *
+ * Copyright 2012 Linaro Ltd.
+ *                Anton Vorontsov <anton.vorontsov@linaro.org>
+ *
+ * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
+ * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/log2.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/eventfd.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/printk.h>
+#include <linux/vmpressure.h>
+/*
+ * The window size (vmpressure_win) is the number of scanned pages that
+ * must accumulate before we analyze the scanned/reclaimed ratio. The
+ * window therefore acts as a rate-limit tunable for the "low" level
+ * notification, and as the averaging interval for the medium/critical
+ * levels. Small window sizes can cause a lot of false positives, while a
+ * window that is too big will delay the notifications.
+ *
+ * As the vmscan reclaimer logic works with chunks which are multiples of
+ * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
+ *
+ * TODO: Make the window size depend on machine size, as we do for vmstat
+ * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
+ */
+static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
+/*
+ * These thresholds are used when we account memory pressure through the
+ * scanned/reclaimed ratio. The current values were chosen empirically. In
+ * essence, they are percentages: the higher the value, the larger the
+ * share of scanned pages that could not be reclaimed.
+ */
+static const unsigned int vmpressure_level_med = 60;
+static const unsigned int vmpressure_level_critical = 95;
+/*
+ * When there are too few pages left to scan, vmpressure() may miss the
+ * critical pressure, as the number of pages will be less than the "window
+ * size". However, in that case the vmscan priority will escalate quickly
+ * (its numeric value drops), as the reclaimer will try to scan the LRUs
+ * more deeply.
+ *
+ * The vmscan logic considers these special priorities:
+ *
+ * prio == DEF_PRIORITY (12): reclaimer starts with that value
+ * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
+ * prio == 0                : close to OOM, kernel scans every page in an lru
+ *
+ * Any value in this range is acceptable for this tunable (i.e. from 12 to
+ * 0). The current value of vmpressure_level_critical_prio was chosen
+ * empirically, but the number, in essence, means that we consider the
+ * level critical when the scanning depth is ~10% of the lru size (vmscan
+ * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
+ * eighth).
+ */
+static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
+/* Recover the vmpressure structure that embeds @work. */
+static struct vmpressure *work_to_vmpressure(struct work_struct *work)
+{
+        struct vmpressure *owner;
+        owner = container_of(work, struct vmpressure, work);
+        return owner;
+}
+/* Look up the vmpressure state behind the memory controller css of @cg. */
+static struct vmpressure *cg_to_vmpressure(struct cgroup *cg)
+{
+        struct cgroup_subsys_state *css;
+        css = cgroup_subsys_state(cg, mem_cgroup_subsys_id);
+        return css_to_vmpressure(css);
+}
+/*
+ * Return the vmpressure state of the parent memory cgroup of @vmpr, or
+ * NULL when parent_mem_cgroup() reports no parent.
+ */
+static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
+{
+        struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
+        struct mem_cgroup *parent;
+        parent = parent_mem_cgroup(mem_cgroup_from_cont(css->cgroup));
+        /*
+         * The NULL case is handled explicitly so that the walk up the
+         * hierarchy terminates instead of passing NULL further down.
+         */
+        return parent ? memcg_to_vmpressure(parent) : NULL;
+}
+/*
+ * Pressure levels, in increasing order of severity. The numeric order is
+ * significant: a listener is signalled when the measured level is greater
+ * than or equal to the level it registered for. The values also index
+ * vmpressure_str_levels[].
+ */
+enum vmpressure_levels {
+        VMPRESSURE_LOW = 0,
+        VMPRESSURE_MEDIUM,
+        VMPRESSURE_CRITICAL,
+        /* Number of levels; not a level itself. */
+        VMPRESSURE_NUM_LEVELS,
+};
+/*
+ * Level names, indexed by enum vmpressure_levels. These are the strings
+ * compared against the argument passed to vmpressure_register_event().
+ */
+static const char * const vmpressure_str_levels[] = {
+        [VMPRESSURE_LOW] = "low",
+        [VMPRESSURE_MEDIUM] = "medium",
+        [VMPRESSURE_CRITICAL] = "critical",
+};
+/*
+ * Classify a pressure value (a percentage) against the medium and
+ * critical thresholds. Anything below the medium threshold is "low".
+ */
+static enum vmpressure_levels vmpressure_level(unsigned long pressure)
+{
+        enum vmpressure_levels level = VMPRESSURE_LOW;
+        if (pressure >= vmpressure_level_med)
+                level = VMPRESSURE_MEDIUM;
+        /* Checked last so that the critical threshold always wins. */
+        if (pressure >= vmpressure_level_critical)
+                level = VMPRESSURE_CRITICAL;
+        return level;
+}
+/*
+ * Convert a scanned/reclaimed sample into a pressure level.
+ *
+ * @scanned:   number of pages scanned in the sample window
+ * @reclaimed: number of pages reclaimed in the sample window
+ *
+ * The pressure is the percentage of scanned pages that were not
+ * reclaimed; it is then mapped to a level by vmpressure_level().
+ */
+static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
+                                                    unsigned long reclaimed)
+{
+        unsigned long scale = scanned + reclaimed;
+        unsigned long pressure = 0;
+        /*
+         * If at least as many pages were reclaimed as scanned there is
+         * no pressure to report. Bail out before doing the arithmetic:
+         * with reclaimed > scanned the unsigned subtraction below would
+         * wrap around and be reported as critical pressure, and with
+         * scanned == 0 the division would fault.
+         */
+        if (reclaimed >= scanned)
+                goto out;
+        /*
+         * We calculate the ratio (in percents) of how many pages were
+         * scanned vs. reclaimed in a given time frame (window). Note that
+         * time is in VM reclaimer's "ticks", i.e. number of pages
+         * scanned. This makes it possible to set desired reaction time
+         * and serves as a ratelimit.
+         */
+        pressure = scale - (reclaimed * scale / scanned);
+        pressure = pressure * 100 / scale;
+out:
+        pr_debug("%s: %3lu  (s: %lu  r: %lu)\n", __func__, pressure,
+                 scanned, reclaimed);
+        return vmpressure_level(pressure);
+}
+/* One registered listener on a vmpressure structure's events list. */
+struct vmpressure_event {
+        /* eventfd context signalled when the threshold is reached */
+        struct eventfd_ctx *efd;
+        /* lowest pressure level this listener wants to be told about */
+        enum vmpressure_levels level;
+        /* link in vmpressure->events, protected by events_lock */
+        struct list_head node;
+};
+/*
+ * Signal every eventfd registered on @vmpr whose threshold level is at or
+ * below the level computed from @scanned and @reclaimed.
+ *
+ * Returns true if at least one eventfd was signalled.
+ */
+static bool vmpressure_event(struct vmpressure *vmpr,
+                             unsigned long scanned, unsigned long reclaimed)
+{
+        enum vmpressure_levels cur = vmpressure_calc_level(scanned, reclaimed);
+        struct vmpressure_event *listener;
+        bool notified = false;
+        mutex_lock(&vmpr->events_lock);
+        list_for_each_entry(listener, &vmpr->events, node) {
+                /* Listener asked for a higher level than we measured. */
+                if (listener->level > cur)
+                        continue;
+                eventfd_signal(listener->efd, 1);
+                notified = true;
+        }
+        mutex_unlock(&vmpr->events_lock);
+        return notified;
+}
+/*
+ * Work handler: take a snapshot of the accumulated scanned/reclaimed
+ * counters, reset them, and deliver the resulting event to the first
+ * level of the hierarchy that has a listener for it.
+ */
+static void vmpressure_work_fn(struct work_struct *work)
+{
+        struct vmpressure *vmpr = work_to_vmpressure(work);
+        unsigned long nr_scanned;
+        unsigned long nr_reclaimed;
+        /*
+         * vmpressure() may be called from several contexts, so the work
+         * can get scheduled again before a previous run has cleared the
+         * counters. That second run then starts right after the first
+         * one returns and may find nothing to report. The check is done
+         * without the lock on purpose: we do not care whether
+         * vmpr->reclaimed is in sync at this point.
+         */
+        if (vmpr->scanned == 0)
+                return;
+        mutex_lock(&vmpr->sr_lock);
+        nr_scanned = vmpr->scanned;
+        nr_reclaimed = vmpr->reclaimed;
+        vmpr->scanned = 0;
+        vmpr->reclaimed = 0;
+        mutex_unlock(&vmpr->sr_lock);
+        /*
+         * If nobody handled the event at this level, propagate it upward
+         * in the hierarchy until someone does or we run out of parents.
+         */
+        while (!vmpressure_event(vmpr, nr_scanned, nr_reclaimed)) {
+                vmpr = vmpressure_parent(vmpr);
+                if (!vmpr)
+                        break;
+        }
+}
+/**
+ * vmpressure() - Account memory pressure through scanned/reclaimed ratio
+ * @gfp:        reclaimer's gfp mask
+ * @memcg:      cgroup memory controller handle
+ * @scanned:    number of pages scanned
+ * @reclaimed:  number of pages reclaimed
+ *
+ * Called from the vmscan reclaim path to feed "instantaneous" memory
+ * pressure samples (scanned/reclaimed ratio) into the per-cgroup
+ * counters. Once a full window of scanned pages has accumulated, the
+ * work item that evaluates the sample and notifies listeners is
+ * scheduled.
+ *
+ * This function does not return any value.
+ */
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+                unsigned long scanned, unsigned long reclaimed)
+{
+        const gfp_t accounted = __GFP_HIGHMEM | __GFP_MOVABLE |
+                                __GFP_IO | __GFP_FS;
+        struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+        unsigned long window_total;
+        /*
+         * Only account pressure that userland is able to help us with.
+         * For example, if the DMA zone is under pressure, notifying
+         * userland would mostly be a waste: it would trigger unnecessary
+         * freeing of memory by userland (which is more likely to hold
+         * HIGHMEM/MOVABLE pages than DMA fallback pages). That is why we
+         * include only movable, highmem and FS/IO pages. Indirect reclaim
+         * (kswapd) sets sc->gfp_mask to GFP_KERNEL, so we account it too.
+         */
+        if (!(gfp & accounted))
+                return;
+        /*
+         * No pages scanned means the reclaimer could not find any
+         * shrinkable LRUs at the current scanning depth. That alone is
+         * not a reason to report critical pressure yet: if the scanning
+         * priority (scanning depth) goes too high (deep), we will be
+         * notified through vmpressure_prio(). But so far, keep calm.
+         */
+        if (!scanned)
+                return;
+        mutex_lock(&vmpr->sr_lock);
+        vmpr->scanned += scanned;
+        vmpr->reclaimed += reclaimed;
+        window_total = vmpr->scanned;
+        mutex_unlock(&vmpr->sr_lock);
+        /* Kick the worker once a full window is in and none is pending. */
+        if (window_total >= vmpressure_win && !work_pending(&vmpr->work))
+                schedule_work(&vmpr->work);
+}
+/**
+ * vmpressure_prio() - Account memory pressure through reclaimer priority level
+ * @gfp:        reclaimer's gfp mask
+ * @memcg:      cgroup memory controller handle
+ * @prio:       reclaimer's priority
+ *
+ * Called from the reclaim path every time the vmscan reclaiming priority
+ * (scanning depth) changes.
+ *
+ * This function does not return any value.
+ */
+void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
+{
+        /*
+         * The priority is only used for accounting the critical level;
+         * see the comment for vmpressure_level_critical_prio. Once the
+         * priority is at or below that threshold, update the vmpressure
+         * information before the shrinker dives into a long, deep scan.
+         * Passing scanned = vmpressure_win and reclaimed = 0 to
+         * vmpressure() basically means that we signal the 'critical'
+         * level.
+         */
+        if (prio <= vmpressure_level_critical_prio)
+                vmpressure(gfp, memcg, vmpressure_win, 0);
+}
+/**
+ * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
+ * @cg:         cgroup that is interested in vmpressure notifications
+ * @cft:        cgroup control files handle
+ * @eventfd:    eventfd context to link notifications with
+ * @args:       event arguments (used to set up a pressure level threshold)
+ *
+ * Associates the eventfd context with the vmpressure infrastructure so
+ * that notifications are delivered to @eventfd. @args is a string naming
+ * the pressure level threshold (one of vmpressure_str_levels, i.e. "low",
+ * "medium", or "critical").
+ *
+ * Returns 0 on success, -EINVAL if @args does not name a known level, or
+ * -ENOMEM if the listener could not be allocated.
+ *
+ * This function should not be used directly, just pass it to (struct
+ * cftype).register_event, and then cgroup core will handle everything by
+ * itself.
+ */
+int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
+                              struct eventfd_ctx *eventfd, const char *args)
+{
+        struct vmpressure *vmpr = cg_to_vmpressure(cg);
+        struct vmpressure_event *listener;
+        int idx = 0;
+        /* Translate the level name into its enum value. */
+        while (idx < VMPRESSURE_NUM_LEVELS &&
+               strcmp(vmpressure_str_levels[idx], args) != 0)
+                idx++;
+        if (idx == VMPRESSURE_NUM_LEVELS)
+                return -EINVAL;
+        listener = kzalloc(sizeof(*listener), GFP_KERNEL);
+        if (!listener)
+                return -ENOMEM;
+        listener->level = idx;
+        listener->efd = eventfd;
+        mutex_lock(&vmpr->events_lock);
+        list_add(&listener->node, &vmpr->events);
+        mutex_unlock(&vmpr->events_lock);
+        return 0;
+}
+/**
+ * vmpressure_unregister_event() - Unbind eventfd from vmpressure
+ * @cg:         cgroup handle
+ * @cft:        cgroup control files handle
+ * @eventfd:    eventfd context that was used to link vmpressure with the @cg
+ *
+ * Detaches the first listener registered with @eventfd from the
+ * vmpressure notifications and frees the internal bookkeeping for it
+ * (the @eventfd itself is not freed).
+ *
+ * This function should not be used directly, just pass it to (struct
+ * cftype).unregister_event, and then cgroup core will handle everything
+ * by itself.
+ */
+void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
+                                 struct eventfd_ctx *eventfd)
+{
+        struct vmpressure *vmpr = cg_to_vmpressure(cg);
+        struct vmpressure_event *cur;
+        mutex_lock(&vmpr->events_lock);
+        list_for_each_entry(cur, &vmpr->events, node) {
+                if (cur->efd == eventfd) {
+                        list_del(&cur->node);
+                        kfree(cur);
+                        /* cur is gone; do not advance the iterator. */
+                        break;
+                }
+        }
+        mutex_unlock(&vmpr->events_lock);
+}
+/**
+ * vmpressure_init() - Initialize vmpressure control structure
+ * @vmpr:       Structure to be initialized
+ *
+ * Sets up the work item, the (empty) listener list and the two mutexes.
+ * Must be called on every allocated vmpressure structure before any
+ * other use.
+ */
+void vmpressure_init(struct vmpressure *vmpr)
+{
+        INIT_WORK(&vmpr->work, vmpressure_work_fn);
+        INIT_LIST_HEAD(&vmpr->events);
+        mutex_init(&vmpr->events_lock);
+        mutex_init(&vmpr->sr_lock);
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e03a00b09da9..e53e49584cf3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/highmem.h>
+#include <linux/vmpressure.h>
 #include <linux/vmstat.h>
 #include <linux/file.h>
 #include <linux/writeback.h>
@@ -1982,6 +1983,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
                        }
                        memcg = mem_cgroup_iter(root, memcg, &reclaim);
                } while (memcg);
+                vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
+                           sc->nr_scanned - nr_scanned,
+                           sc->nr_reclaimed - nr_reclaimed);
        } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
                                         sc->nr_scanned - nr_scanned, sc));
 }
@@ -2167,6 +2173,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                count_vm_event(ALLOCSTALL);
        do {
+                vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
+                                sc->priority);
                sc->nr_scanned = 0;
                aborted_reclaim = shrink_zones(zonelist, sc);

diff --git a/mm/Makefile b/mm/Makefile index 3a4628751f89..72c5acb9345f 100644 --- a/mm/Makefile +++ b/mm/Makefile
@@ -50,7 +50,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
50	obj-$(CONFIG_MIGRATION) += migrate.o	50	obj-$(CONFIG_MIGRATION) += migrate.o
51	obj-$(CONFIG_QUICKLIST) += quicklist.o	51	obj-$(CONFIG_QUICKLIST) += quicklist.o
52	obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o	52	obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
53	obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o	53	obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o
54	obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o	54	obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
55	obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o	55	obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
56	obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o	56	obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o


diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7e5bc43c2d1f..360464f40e96 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c
@@ -49,6 +49,7 @@
49	#include <linux/fs.h>	49	#include <linux/fs.h>
50	#include <linux/seq_file.h>	50	#include <linux/seq_file.h>
51	#include <linux/vmalloc.h>	51	#include <linux/vmalloc.h>
		52	#include <linux/vmpressure.h>
52	#include <linux/mm_inline.h>	53	#include <linux/mm_inline.h>
53	#include <linux/page_cgroup.h>	54	#include <linux/page_cgroup.h>
54	#include <linux/cpu.h>	55	#include <linux/cpu.h>
@@ -261,6 +262,9 @@ struct mem_cgroup {
261	*/	262	*/
262	struct res_counter res;	263	struct res_counter res;
263		264
		265	/* vmpressure notifications */
		266	struct vmpressure vmpressure;
		267
264	union {	268	union {
265	/*	269	/*
266	* the counter to account for mem+swap usage.	270	* the counter to account for mem+swap usage.
@@ -359,6 +363,7 @@ struct mem_cgroup {
359	atomic_t numainfo_events;	363	atomic_t numainfo_events;
360	atomic_t numainfo_updating;	364	atomic_t numainfo_updating;
361	#endif	365	#endif
		366
362	/*	367	/*
363	* Per cgroup active and inactive list, similar to the	368	* Per cgroup active and inactive list, similar to the
364	* per zone LRU lists.	369	* per zone LRU lists.
@@ -510,6 +515,24 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
510	return container_of(s, struct mem_cgroup, css);	515	return container_of(s, struct mem_cgroup, css);
511	}	516	}
512		517
		518	/* Some nice accessors for the vmpressure. */
		519	struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
		520	{
		521	if (!memcg)
		522	memcg = root_mem_cgroup;
		523	return &memcg->vmpressure;
		524	}
		525
		526	struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
		527	{
		528	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
		529	}
		530
		531	struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
		532	{
		533	return &mem_cgroup_from_css(css)->vmpressure;
		534	}
		535
513	static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)	536	static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
514	{	537	{
515	return (memcg == root_mem_cgroup);	538	return (memcg == root_mem_cgroup);
@@ -5907,6 +5930,11 @@ static struct cftype mem_cgroup_files[] = {
5907	.unregister_event = mem_cgroup_oom_unregister_event,	5930	.unregister_event = mem_cgroup_oom_unregister_event,
5908	.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),	5931	.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
5909	},	5932	},
		5933	{
		5934	.name = "pressure_level",
		5935	.register_event = vmpressure_register_event,
		5936	.unregister_event = vmpressure_unregister_event,
		5937	},
5910	#ifdef CONFIG_NUMA	5938	#ifdef CONFIG_NUMA
5911	{	5939	{
5912	.name = "numa_stat",	5940	.name = "numa_stat",
@@ -6188,6 +6216,7 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6188	memcg->move_charge_at_immigrate = 0;	6216	memcg->move_charge_at_immigrate = 0;
6189	mutex_init(&memcg->thresholds_lock);	6217	mutex_init(&memcg->thresholds_lock);
6190	spin_lock_init(&memcg->move_lock);	6218	spin_lock_init(&memcg->move_lock);
		6219	vmpressure_init(&memcg->vmpressure);
6191		6220
6192	return &memcg->css;	6221	return &memcg->css;
6193		6222


diff --git a/mm/vmpressure.c b/mm/vmpressure.c new file mode 100644 index 000000000000..736a6011c2c8 --- /dev/null +++ b/mm/vmpressure.c
@@ -0,0 +1,374 @@
		1	/*
		2	* Linux VM pressure
		3	*
		4	* Copyright 2012 Linaro Ltd.
		5	* Anton Vorontsov <anton.vorontsov@linaro.org>
		6	*
		7	* Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
		8	* Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
		9	*
		10	* This program is free software; you can redistribute it and/or modify it
		11	* under the terms of the GNU General Public License version 2 as published
		12	* by the Free Software Foundation.
		13	*/
		14
		15	#include <linux/cgroup.h>
		16	#include <linux/fs.h>
		17	#include <linux/log2.h>
		18	#include <linux/sched.h>
		19	#include <linux/mm.h>
		20	#include <linux/vmstat.h>
		21	#include <linux/eventfd.h>
		22	#include <linux/swap.h>
		23	#include <linux/printk.h>
		24	#include <linux/vmpressure.h>
		25
		26	/*
		27	* The window size (vmpressure_win) is the number of scanned pages before
		28	* we try to analyze scanned/reclaimed ratio. So the window is used as a
		29	* rate-limit tunable for the "low" level notification, and also for
		30	* averaging the ratio for medium/critical levels. Using small window
		31	* sizes can cause lot of false positives, but too big window size will
		32	* delay the notifications.
		33	*
		34	* As the vmscan reclaimer logic works with chunks which are multiple of
		35	* SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well.
		36	*
		37	* TODO: Make the window size depend on machine size, as we do for vmstat
		38	* thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
		39	*/
		40	static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
		41
		42	/*
		43	* These thresholds are used when we account memory pressure through
		44	* scanned/reclaimed ratio. The current values were chosen empirically. In
		45	* essence, they are percents: the higher the value, the greater the
		46	* number of unsuccessful reclaims there were.
		47	*/
		48	static const unsigned int vmpressure_level_med = 60;
		49	static const unsigned int vmpressure_level_critical = 95;
		50
		51	/*
		52	* When there are too few pages left to scan, vmpressure() may miss the
		53	* critical pressure as the number of pages will be less than "window size".
		54	* However, in that case the vmscan priority will rise fast as the
		55	* reclaimer will try to scan LRUs more deeply.
		56	*
		57	* The vmscan logic considers these special priorities:
		58	*
		59	* prio == DEF_PRIORITY (12): reclaimer starts with that value
		60	* prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
		61	* prio == 0 : close to OOM, kernel scans every page in an lru
		62	*
		63	* Any value in this range is acceptable for this tunable (i.e. from 12 to
		64	* 0). Current value for the vmpressure_level_critical_prio is chosen
		65	* empirically, but the number, in essence, means that we consider
		66	* critical level when scanning depth is ~10% of the lru size (vmscan
		67	* scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
		68	* eighth).
		69	*/
		70	static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
		71
		72	static struct vmpressure *work_to_vmpressure(struct work_struct *work)
		73	{
		74	return container_of(work, struct vmpressure, work);
		75	}
		76
		77	static struct vmpressure *cg_to_vmpressure(struct cgroup *cg)
		78	{
		79	return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id));
		80	}
		81
		82	static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
		83	{
		84	struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup;
		85	struct mem_cgroup *memcg = mem_cgroup_from_cont(cg);
		86
		87	memcg = parent_mem_cgroup(memcg);
		88	if (!memcg)
		89	return NULL;
		90	return memcg_to_vmpressure(memcg);
		91	}
		92
		93	enum vmpressure_levels {
		94	VMPRESSURE_LOW = 0,
		95	VMPRESSURE_MEDIUM,
		96	VMPRESSURE_CRITICAL,
		97	VMPRESSURE_NUM_LEVELS,
		98	};
		99
		100	static const char * const vmpressure_str_levels[] = {
		101	[VMPRESSURE_LOW] = "low",
		102	[VMPRESSURE_MEDIUM] = "medium",
		103	[VMPRESSURE_CRITICAL] = "critical",
		104	};
		105
		106	static enum vmpressure_levels vmpressure_level(unsigned long pressure)
		107	{
		108	if (pressure >= vmpressure_level_critical)
		109	return VMPRESSURE_CRITICAL;
		110	else if (pressure >= vmpressure_level_med)
		111	return VMPRESSURE_MEDIUM;
		112	return VMPRESSURE_LOW;
		113	}
		114
		115	static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
		116	unsigned long reclaimed)
		117	{
		118	unsigned long scale = scanned + reclaimed;
		119	unsigned long pressure;
		120
		121	/*
		122	* We calculate the ratio (in percents) of how many pages were
		123	* scanned vs. reclaimed in a given time frame (window). Note that
		124	* time is in VM reclaimer's "ticks", i.e. number of pages
		125	* scanned. This makes it possible to set desired reaction time
		126	* and serves as a ratelimit.
		127	*/
		128	pressure = scale - (reclaimed * scale / scanned);
		129	pressure = pressure * 100 / scale;
		130
		131	pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
		132	scanned, reclaimed);
		133
		134	return vmpressure_level(pressure);
		135	}
		136
		137	struct vmpressure_event {
		138	struct eventfd_ctx *efd;
		139	enum vmpressure_levels level;
		140	struct list_head node;
		141	};
		142
		143	static bool vmpressure_event(struct vmpressure *vmpr,
		144	unsigned long scanned, unsigned long reclaimed)
		145	{
		146	struct vmpressure_event *ev;
		147	enum vmpressure_levels level;
		148	bool signalled = false;
		149
		150	level = vmpressure_calc_level(scanned, reclaimed);
		151
		152	mutex_lock(&vmpr->events_lock);
		153
		154	list_for_each_entry(ev, &vmpr->events, node) {
		155	if (level >= ev->level) {
		156	eventfd_signal(ev->efd, 1);
		157	signalled = true;
		158	}
		159	}
		160
		161	mutex_unlock(&vmpr->events_lock);
		162
		163	return signalled;
		164	}
		165
		166	static void vmpressure_work_fn(struct work_struct *work)
		167	{
		168	struct vmpressure *vmpr = work_to_vmpressure(work);
		169	unsigned long scanned;
		170	unsigned long reclaimed;
		171
		172	/*
		173	* Several contexts might be calling vmpressure(), so it is
		174	* possible that the work was rescheduled again before the old
		175	* work context cleared the counters. In that case we will run
		176	* just after the old work returns, but then scanned might be zero
		177	* here. No need for any locks here since we don't care if
		178	* vmpr->reclaimed is in sync.
		179	*/
		180	if (!vmpr->scanned)
		181	return;
		182
		183	mutex_lock(&vmpr->sr_lock);
		184	scanned = vmpr->scanned;
		185	reclaimed = vmpr->reclaimed;
		186	vmpr->scanned = 0;
		187	vmpr->reclaimed = 0;
		188	mutex_unlock(&vmpr->sr_lock);
		189
		190	do {
		191	if (vmpressure_event(vmpr, scanned, reclaimed))
		192	break;
		193	/*
		194	* If not handled, propagate the event upward into the
		195	* hierarchy.
		196	*/
		197	} while ((vmpr = vmpressure_parent(vmpr)));
		198	}
		199
		200	/**
		201	* vmpressure() - Account memory pressure through scanned/reclaimed ratio
		202	* @gfp: reclaimer's gfp mask
		203	* @memcg: cgroup memory controller handle
		204	* @scanned: number of pages scanned
		205	* @reclaimed: number of pages reclaimed
		206	*
		207	* This function should be called from the vmscan reclaim path to account
		208	* "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
		209	* pressure index is then further refined and averaged over time.
		210	*
		211	* This function does not return any value.
		212	*/
		213	void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
		214	unsigned long scanned, unsigned long reclaimed)
		215	{
		216	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
		217
		218	/*
		219	* Here we only want to account pressure that userland is able to
		220	* help us with. For example, suppose that DMA zone is under
		221	* pressure; if we notify userland about that kind of pressure,
		222	* then it will be mostly a waste as it will trigger unnecessary
		223	* freeing of memory by userland (since userland is more likely to
		224	* have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
		225	* is why we include only movable, highmem and FS/IO pages.
		226	* Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
		227	* we account it too.
		228	*/
		229	if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
		230	return;
		231
		232	/*
		233	* If we got here with no pages scanned, then that is an indicator
		234	* that reclaimer was unable to find any shrinkable LRUs at the
		235	* current scanning depth. But it does not mean that we should
		236	* report the critical pressure, yet. If the scanning priority
		237	* (scanning depth) goes too high (deep), we will be notified
		238	* through vmpressure_prio(). But so far, keep calm.
		239	*/
		240	if (!scanned)
		241	return;
		242
		243	mutex_lock(&vmpr->sr_lock);
		244	vmpr->scanned += scanned;
		245	vmpr->reclaimed += reclaimed;
		246	scanned = vmpr->scanned;
		247	mutex_unlock(&vmpr->sr_lock);
		248
		249	if (scanned < vmpressure_win || work_pending(&vmpr->work))
		250	return;
		251	schedule_work(&vmpr->work);
		252	}
		253
		254	/**
		255	* vmpressure_prio() - Account memory pressure through reclaimer priority level
		256	* @gfp: reclaimer's gfp mask
		257	* @memcg: cgroup memory controller handle
		258	* @prio: reclaimer's priority
		259	*
		260	* This function should be called from the reclaim path every time when
		261	* the vmscan's reclaiming priority (scanning depth) changes.
		262	*
		263	* This function does not return any value.
		264	*/
		265	void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
		266	{
		267	/*
		268	* We only use prio for accounting critical level. For more info
		269	* see comment for vmpressure_level_critical_prio variable above.
		270	*/
		271	if (prio > vmpressure_level_critical_prio)
		272	return;
		273
		274	/*
		275	* OK, the prio is below the threshold, updating vmpressure
		276	* information before shrinker dives into long shrinking of long
		277	* range vmscan. Passing scanned = vmpressure_win, reclaimed = 0
		278	* to the vmpressure() basically means that we signal 'critical'
		279	* level.
		280	*/
		281	vmpressure(gfp, memcg, vmpressure_win, 0);
		282	}
		283
		284	/**
		285	* vmpressure_register_event() - Bind vmpressure notifications to an eventfd
		286	* @cg: cgroup that is interested in vmpressure notifications
		287	* @cft: cgroup control files handle
		288	* @eventfd: eventfd context to link notifications with
		289	* @args: event arguments (used to set up a pressure level threshold)
		290	*
		291	* This function associates eventfd context with the vmpressure
		292	* infrastructure, so that the notifications will be delivered to the
		293	* @eventfd. The @args parameter is a string that denotes pressure level
		294	* threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
		295	* "critical").
		296	*
		297	* This function should not be used directly, just pass it to (struct
		298	* cftype).register_event, and then cgroup core will handle everything by
		299	* itself.
		300	*/
		301	int vmpressure_register_event(struct cgroup *cg, struct cftype *cft,
		302	struct eventfd_ctx *eventfd, const char *args)
		303	{
		304	struct vmpressure *vmpr = cg_to_vmpressure(cg);
		305	struct vmpressure_event *ev;
		306	int level;
		307
		308	for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
		309	if (!strcmp(vmpressure_str_levels[level], args))
		310	break;
		311	}
		312
		313	if (level >= VMPRESSURE_NUM_LEVELS)
		314	return -EINVAL;
		315
		316	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
		317	if (!ev)
		318	return -ENOMEM;
		319
		320	ev->efd = eventfd;
		321	ev->level = level;
		322
		323	mutex_lock(&vmpr->events_lock);
		324	list_add(&ev->node, &vmpr->events);
		325	mutex_unlock(&vmpr->events_lock);
		326
		327	return 0;
		328	}
		329
		330	/**
		331	* vmpressure_unregister_event() - Unbind eventfd from vmpressure
		332	* @cg: cgroup handle
		333	* @cft: cgroup control files handle
		334	* @eventfd: eventfd context that was used to link vmpressure with the @cg
		335	*
		336	* This function does internal manipulations to detach the @eventfd from
		337	* the vmpressure notifications, and then frees internal resources
		338	* associated with the @eventfd (but the @eventfd itself is not freed).
		339	*
		340	* This function should not be used directly, just pass it to (struct
		341	* cftype).unregister_event, and then cgroup core will handle everything
		342	* by itself.
		343	*/
		344	void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
		345	struct eventfd_ctx *eventfd)
		346	{
		347	struct vmpressure *vmpr = cg_to_vmpressure(cg);
		348	struct vmpressure_event *ev;
		349
		350	mutex_lock(&vmpr->events_lock);
		351	list_for_each_entry(ev, &vmpr->events, node) {
		352	if (ev->efd != eventfd)
		353	continue;
		354	list_del(&ev->node);
		355	kfree(ev);
		356	break;
		357	}
		358	mutex_unlock(&vmpr->events_lock);
		359	}
		360
		361	/**
		362	* vmpressure_init() - Initialize vmpressure control structure
		363	* @vmpr: Structure to be initialized
		364	*
		365	* This function should be called on every allocated vmpressure structure
		366	* before any usage.
		367	*/
		368	void vmpressure_init(struct vmpressure *vmpr)
		369	{
		370	mutex_init(&vmpr->sr_lock);
		371	mutex_init(&vmpr->events_lock);
		372	INIT_LIST_HEAD(&vmpr->events);
		373	INIT_WORK(&vmpr->work, vmpressure_work_fn);
		374	}


diff --git a/mm/vmscan.c b/mm/vmscan.c index e03a00b09da9..e53e49584cf3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c
@@ -19,6 +19,7 @@
19	#include <linux/pagemap.h>	19	#include <linux/pagemap.h>
20	#include <linux/init.h>	20	#include <linux/init.h>
21	#include <linux/highmem.h>	21	#include <linux/highmem.h>
		22	#include <linux/vmpressure.h>
22	#include <linux/vmstat.h>	23	#include <linux/vmstat.h>
23	#include <linux/file.h>	24	#include <linux/file.h>
24	#include <linux/writeback.h>	25	#include <linux/writeback.h>
@@ -1982,6 +1983,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
1982	}	1983	}
1983	memcg = mem_cgroup_iter(root, memcg, &reclaim);	1984	memcg = mem_cgroup_iter(root, memcg, &reclaim);
1984	} while (memcg);	1985	} while (memcg);
		1986
		1987	vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
		1988	sc->nr_scanned - nr_scanned,
		1989	sc->nr_reclaimed - nr_reclaimed);
		1990
1985	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,	1991	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
1986	sc->nr_scanned - nr_scanned, sc));	1992	sc->nr_scanned - nr_scanned, sc));
1987	}	1993	}
@@ -2167,6 +2173,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2167	count_vm_event(ALLOCSTALL);	2173	count_vm_event(ALLOCSTALL);
2168		2174
2169	do {	2175	do {
		2176	vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
		2177	sc->priority);
2170	sc->nr_scanned = 0;	2178	sc->nr_scanned = 0;
2171	aborted_reclaim = shrink_zones(zonelist, sc);	2179	aborted_reclaim = shrink_zones(zonelist, sc);
2172		2180