summaryrefslogtreecommitdiffstats
path: root/mm/compaction.c
diff options
context:
space:
mode:
authorVlastimil Babka <vbabka@suse.cz>2016-03-17 17:18:08 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-03-17 18:09:34 -0400
commit698b1b30642f1ff0ea10ef1de9745ab633031377 (patch)
tree8a522baa63e84d97620fca8a321ed4182bd937a7 /mm/compaction.c
parent81c5857b279e6b18f6ff0d1975e80a07af542cd1 (diff)
mm, compaction: introduce kcompactd
Memory compaction can be currently performed in several contexts: - kswapd balancing a zone after a high-order allocation failure - direct compaction to satisfy a high-order allocation, including THP page fault attempts - khugepaged trying to collapse a hugepage - manually from /proc The purpose of compaction is two-fold. The obvious purpose is to satisfy a (pending or future) high-order allocation, and is easy to evaluate. The other purpose is to keep overall memory fragmentation low and help the anti-fragmentation mechanism. The success wrt the latter purpose is more difficult to evaluate. The current situation wrt the purposes has a few drawbacks: - compaction is invoked only when a high-order page or hugepage is not available (or manually). This might be too late for the purposes of keeping memory fragmentation low. - direct compaction increases latency of allocations. Again, it would be better if compaction was performed asynchronously to keep fragmentation low, before the allocation itself comes. - (a special case of the previous) the cost of compaction during THP page faults can easily offset the benefits of THP. - kswapd compaction appears to be complex, fragile and not working in some scenarios. It could also end up compacting for a high-order allocation request when it should be reclaiming memory for a later order-0 request. To improve the situation, we should be able to benefit from an equivalent of kswapd, but for compaction - i.e. a background thread which responds to fragmentation and the need for high-order allocations (including hugepages) somewhat proactively. One possibility is to extend the responsibilities of kswapd, which could however complicate its design too much. It should be better to let kswapd handle reclaim, as order-0 allocations are often more critical than high-order ones. Another possibility is to extend khugepaged, but this kthread is a single instance and tied to THP configs. 
This patch goes with the option of a new set of per-node kthreads called kcompactd, and lays the foundations, without introducing any new tunables. The lifecycle mimics kswapd kthreads, including the memory hotplug hooks. For compaction, kcompactd uses the standard compaction_suitable() and compact_finished() criteria and the deferred compaction functionality. Unlike direct compaction, it uses only sync compaction, as there's no allocation latency to minimize. This patch doesn't yet add a call to wakeup_kcompactd. The kswapd compact/reclaim loop for high-order pages will be replaced by waking up kcompactd in the next patch with the description of what's wrong with the old approach. Waking up of the kcompactd threads is also tied to kswapd activity and follows these rules: - we don't want to affect any fastpaths, so wake up kcompactd only from the slowpath, as it's done for kswapd - if kswapd is doing reclaim, it's more important than compaction, so don't invoke kcompactd until kswapd goes to sleep - the target order used for kswapd is passed to kcompactd Possible future uses for kcompactd include the ability to wake up kcompactd on demand in special situations, such as when hugepages are not available (currently not done due to __GFP_NO_KSWAPD) or when a fragmentation event (i.e. __rmqueue_fallback()) occurs. It's also possible to perform periodic compaction with kcompactd. [arnd@arndb.de: fix build errors with kcompactd] [paul.gortmaker@windriver.com: don't use modular references for non modular code] Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: "Kirill A. 
Shutemov" <kirill.shutemov@linux.intel.com> Cc: Rik van Riel <riel@redhat.com> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: David Rientjes <rientjes@google.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/compaction.c')
-rw-r--r--mm/compaction.c222
1 files changed, 222 insertions, 0 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 93f71d968098..5b2bfbaa821a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -7,6 +7,7 @@
7 * 7 *
8 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie> 8 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
9 */ 9 */
10#include <linux/cpu.h>
10#include <linux/swap.h> 11#include <linux/swap.h>
11#include <linux/migrate.h> 12#include <linux/migrate.h>
12#include <linux/compaction.h> 13#include <linux/compaction.h>
@@ -17,6 +18,8 @@
17#include <linux/balloon_compaction.h> 18#include <linux/balloon_compaction.h>
18#include <linux/page-isolation.h> 19#include <linux/page-isolation.h>
19#include <linux/kasan.h> 20#include <linux/kasan.h>
21#include <linux/kthread.h>
22#include <linux/freezer.h>
20#include "internal.h" 23#include "internal.h"
21 24
22#ifdef CONFIG_COMPACTION 25#ifdef CONFIG_COMPACTION
@@ -1736,4 +1739,223 @@ void compaction_unregister_node(struct node *node)
1736} 1739}
1737#endif /* CONFIG_SYSFS && CONFIG_NUMA */ 1740#endif /* CONFIG_SYSFS && CONFIG_NUMA */
1738 1741
1742static inline bool kcompactd_work_requested(pg_data_t *pgdat)
1743{
1744 return pgdat->kcompactd_max_order > 0;
1745}
1746
1747static bool kcompactd_node_suitable(pg_data_t *pgdat)
1748{
1749 int zoneid;
1750 struct zone *zone;
1751 enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
1752
1753 for (zoneid = 0; zoneid < classzone_idx; zoneid++) {
1754 zone = &pgdat->node_zones[zoneid];
1755
1756 if (!populated_zone(zone))
1757 continue;
1758
1759 if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
1760 classzone_idx) == COMPACT_CONTINUE)
1761 return true;
1762 }
1763
1764 return false;
1765}
1766
1767static void kcompactd_do_work(pg_data_t *pgdat)
1768{
1769 /*
1770 * With no special task, compact all zones so that a page of requested
1771 * order is allocatable.
1772 */
1773 int zoneid;
1774 struct zone *zone;
1775 struct compact_control cc = {
1776 .order = pgdat->kcompactd_max_order,
1777 .classzone_idx = pgdat->kcompactd_classzone_idx,
1778 .mode = MIGRATE_SYNC_LIGHT,
1779 .ignore_skip_hint = true,
1780
1781 };
1782 bool success = false;
1783
1784 trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
1785 cc.classzone_idx);
1786 count_vm_event(KCOMPACTD_WAKE);
1787
1788 for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) {
1789 int status;
1790
1791 zone = &pgdat->node_zones[zoneid];
1792 if (!populated_zone(zone))
1793 continue;
1794
1795 if (compaction_deferred(zone, cc.order))
1796 continue;
1797
1798 if (compaction_suitable(zone, cc.order, 0, zoneid) !=
1799 COMPACT_CONTINUE)
1800 continue;
1801
1802 cc.nr_freepages = 0;
1803 cc.nr_migratepages = 0;
1804 cc.zone = zone;
1805 INIT_LIST_HEAD(&cc.freepages);
1806 INIT_LIST_HEAD(&cc.migratepages);
1807
1808 status = compact_zone(zone, &cc);
1809
1810 if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
1811 cc.classzone_idx, 0)) {
1812 success = true;
1813 compaction_defer_reset(zone, cc.order, false);
1814 } else if (status == COMPACT_COMPLETE) {
1815 /*
1816 * We use sync migration mode here, so we defer like
1817 * sync direct compaction does.
1818 */
1819 defer_compaction(zone, cc.order);
1820 }
1821
1822 VM_BUG_ON(!list_empty(&cc.freepages));
1823 VM_BUG_ON(!list_empty(&cc.migratepages));
1824 }
1825
1826 /*
1827 * Regardless of success, we are done until woken up next. But remember
1828 * the requested order/classzone_idx in case it was higher/tighter than
1829 * our current ones
1830 */
1831 if (pgdat->kcompactd_max_order <= cc.order)
1832 pgdat->kcompactd_max_order = 0;
1833 if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
1834 pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
1835}
1836
1837void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
1838{
1839 if (!order)
1840 return;
1841
1842 if (pgdat->kcompactd_max_order < order)
1843 pgdat->kcompactd_max_order = order;
1844
1845 if (pgdat->kcompactd_classzone_idx > classzone_idx)
1846 pgdat->kcompactd_classzone_idx = classzone_idx;
1847
1848 if (!waitqueue_active(&pgdat->kcompactd_wait))
1849 return;
1850
1851 if (!kcompactd_node_suitable(pgdat))
1852 return;
1853
1854 trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
1855 classzone_idx);
1856 wake_up_interruptible(&pgdat->kcompactd_wait);
1857}
1858
1859/*
1860 * The background compaction daemon, started as a kernel thread
1861 * from the init process.
1862 */
1863static int kcompactd(void *p)
1864{
1865 pg_data_t *pgdat = (pg_data_t*)p;
1866 struct task_struct *tsk = current;
1867
1868 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1869
1870 if (!cpumask_empty(cpumask))
1871 set_cpus_allowed_ptr(tsk, cpumask);
1872
1873 set_freezable();
1874
1875 pgdat->kcompactd_max_order = 0;
1876 pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
1877
1878 while (!kthread_should_stop()) {
1879 trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
1880 wait_event_freezable(pgdat->kcompactd_wait,
1881 kcompactd_work_requested(pgdat));
1882
1883 kcompactd_do_work(pgdat);
1884 }
1885
1886 return 0;
1887}
1888
1889/*
1890 * This kcompactd start function will be called by init and node-hot-add.
1891 * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added.
1892 */
1893int kcompactd_run(int nid)
1894{
1895 pg_data_t *pgdat = NODE_DATA(nid);
1896 int ret = 0;
1897
1898 if (pgdat->kcompactd)
1899 return 0;
1900
1901 pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
1902 if (IS_ERR(pgdat->kcompactd)) {
1903 pr_err("Failed to start kcompactd on node %d\n", nid);
1904 ret = PTR_ERR(pgdat->kcompactd);
1905 pgdat->kcompactd = NULL;
1906 }
1907 return ret;
1908}
1909
1910/*
1911 * Called by memory hotplug when all memory in a node is offlined. Caller must
1912 * hold mem_hotplug_begin/end().
1913 */
1914void kcompactd_stop(int nid)
1915{
1916 struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
1917
1918 if (kcompactd) {
1919 kthread_stop(kcompactd);
1920 NODE_DATA(nid)->kcompactd = NULL;
1921 }
1922}
1923
1924/*
1925 * It's optimal to keep kcompactd on the same CPUs as their memory, but
1926 * not required for correctness. So if the last cpu in a node goes
1927 * away, we get changed to run anywhere: as the first one comes back,
1928 * restore their cpu bindings.
1929 */
1930static int cpu_callback(struct notifier_block *nfb, unsigned long action,
1931 void *hcpu)
1932{
1933 int nid;
1934
1935 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
1936 for_each_node_state(nid, N_MEMORY) {
1937 pg_data_t *pgdat = NODE_DATA(nid);
1938 const struct cpumask *mask;
1939
1940 mask = cpumask_of_node(pgdat->node_id);
1941
1942 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
1943 /* One of our CPUs online: restore mask */
1944 set_cpus_allowed_ptr(pgdat->kcompactd, mask);
1945 }
1946 }
1947 return NOTIFY_OK;
1948}
1949
/*
 * Start a kcompactd thread on every node that has memory, and register
 * the CPU hotplug notifier that restores their CPU affinity.
 */
static int __init kcompactd_init(void)
{
	int nid;

	for_each_node_state(nid, N_MEMORY)
		kcompactd_run(nid);
	hotcpu_notifier(cpu_callback, 0);
	return 0;
}
subsys_initcall(kcompactd_init)
1960
1739#endif /* CONFIG_COMPACTION */ 1961#endif /* CONFIG_COMPACTION */