author:    Josef Bacik <jbacik@fb.com>  2018-07-03 11:14:55 -0400
committer: Jens Axboe <axboe@kernel.dk>  2018-07-09 11:07:54 -0400
commit:    d09d8df3a29403693d9d20cc34ed101f2c558e2b
tree:      ef13236fd3cab8b7a3d6c27a7484862561afcd32 /block/blk-cgroup.c
parent:    0d3bd88d54f513723602b361dccfc71639f50779
blkcg: add generic throttling mechanism
Since IO can be issued from literally anywhere, it's almost impossible to do throttling without having some sort of adverse effect somewhere else in the system because of locking or other dependencies. The best way to solve this is to do the throttling when we know we aren't holding any other kernel resources. Do this by tracking throttling on a per-blkg basis, and if we require throttling, flag the task so that it checks before it returns to user space and possibly sleeps there.

This is to address the case where a process is doing work that generates IO which can't be throttled, whether that is directly with a lot of REQ_META IO, or indirectly by allocating so much memory that it is swamping the disk with REQ_SWAP. We can't use task_add_work as we don't want to induce a memory allocation in the IO path, so simply saving the request queue in the task and flagging it to do the notify_resume thing achieves the same result without the overhead of a memory allocation.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
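To make the intended flow concrete, here is a rough sketch of the producer side, i.e. how an in-kernel IO controller might drive the two helpers this patch adds (blkcg_add_delay() and blkcg_schedule_throttle()). The function name, the congestion test and the delay amount below are placeholders, not part of the patch:

static void example_policy_account(struct blkcg_gq *blkg,
                                   struct request_queue *q,
                                   bool over_latency_target,
                                   u64 extra_delay_ns)
{
        /* hypothetical congestion signal; only the two calls below are real */
        if (!over_latency_target)
                return;

        /* charge delay to the blkg; blkcg_scale_delay() decays it over time */
        blkcg_add_delay(blkg, ktime_to_ns(ktime_get()), extra_delay_ns);

        /*
         * Flag the issuing task so it checks (and, if needed, sleeps) on its
         * way back to user space, where no kernel locks are held.
         */
        blkcg_schedule_throttle(q, false);
}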
Diffstat (limited to 'block/blk-cgroup.c')
-rw-r--r--  block/blk-cgroup.c | 220
1 file changed, 220 insertions(+), 0 deletions(-)
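The diffstat above is limited to block/blk-cgroup.c. Per the commit message, the task is flagged via the notify_resume path so that blkcg_maybe_throttle_current() runs on the way back to user space; that hook lives outside this file and is not shown in this diff. Purely as an orientation sketch, with the existing body elided:

static inline void tracehook_notify_resume(struct pt_regs *regs)
{
        /* ... existing notify_resume work (task_work_run(), etc.) ... */

        /* sleep off any delay if blkcg_schedule_throttle() flagged this task */
        blkcg_maybe_throttle_current();
}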
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 7dc6f05cc44b..d3310ec96c2a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -27,6 +27,7 @@
 #include <linux/atomic.h>
 #include <linux/ctype.h>
 #include <linux/blk-cgroup.h>
+#include <linux/tracehook.h>
 #include "blk.h"
 
 #define MAX_KEY_LEN 100
@@ -999,6 +1000,14 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
                 if (!blkcg_debug_stats)
                         goto next;
 
+                if (atomic_read(&blkg->use_delay)) {
+                        has_stats = true;
+                        off += scnprintf(buf+off, size-off,
+                                         " use_delay=%d delay_nsec=%llu",
+                                         atomic_read(&blkg->use_delay),
+                                         (unsigned long long)atomic64_read(&blkg->delay_nsec));
+                }
+
                 for (i = 0; i < BLKCG_MAX_POLS; i++) {
                         struct blkcg_policy *pol = blkcg_policy[i];
                         size_t written;
@@ -1326,6 +1335,13 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css)
         mutex_unlock(&blkcg_pol_mutex);
 }
 
+static void blkcg_exit(struct task_struct *tsk)
+{
+        if (tsk->throttle_queue)
+                blk_put_queue(tsk->throttle_queue);
+        tsk->throttle_queue = NULL;
+}
+
 struct cgroup_subsys io_cgrp_subsys = {
         .css_alloc = blkcg_css_alloc,
         .css_offline = blkcg_css_offline,
@@ -1335,6 +1351,7 @@ struct cgroup_subsys io_cgrp_subsys = {
         .dfl_cftypes = blkcg_files,
         .legacy_cftypes = blkcg_legacy_files,
         .legacy_name = "blkio",
+        .exit = blkcg_exit,
 #ifdef CONFIG_MEMCG
         /*
          * This ensures that, if available, memcg is automatically enabled
@@ -1586,5 +1603,208 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
+/*
+ * Scale the accumulated delay based on how long it has been since we updated
+ * the delay. We only call this when we are adding delay, in case it's been a
+ * while since we added delay, and when we are checking to see if we need to
+ * delay a task, to account for any delays that may have occurred.
+ */
+static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
+{
+        u64 old = atomic64_read(&blkg->delay_start);
+
+        /*
+         * We only want to scale down every second. The idea here is that we
+         * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
+         * time window. We only want to throttle tasks for recent delay that
+         * has occurred, in 1 second time windows since that's the maximum
+         * things can be throttled. We save the current delay window in
+         * blkg->last_delay so we know what amount is still left to be charged
+         * to the blkg from this point onward. blkg->last_use keeps track of
+         * the use_delay counter. The idea is if we're unthrottling the blkg we
+         * are ok with whatever is happening now, and we can take away more of
+         * the accumulated delay as we've already throttled enough that
+         * everybody is happy with their IO latencies.
+         */
+        if (time_before64(old + NSEC_PER_SEC, now) &&
+            atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
+                u64 cur = atomic64_read(&blkg->delay_nsec);
+                u64 sub = min_t(u64, blkg->last_delay, now - old);
+                int cur_use = atomic_read(&blkg->use_delay);
+
+                /*
+                 * We've been unthrottled, subtract a larger chunk of our
+                 * accumulated delay.
+                 */
+                if (cur_use < blkg->last_use)
+                        sub = max_t(u64, sub, blkg->last_delay >> 1);
+
+                /*
+                 * This shouldn't happen, but handle it anyway. Our delay_nsec
+                 * should only ever be growing except here where we subtract out
+                 * min(last_delay, 1 second), but lord knows bugs happen and I'd
+                 * rather not end up with negative numbers.
+                 */
+                if (unlikely(cur < sub)) {
+                        atomic64_set(&blkg->delay_nsec, 0);
+                        blkg->last_delay = 0;
+                } else {
+                        atomic64_sub(sub, &blkg->delay_nsec);
+                        blkg->last_delay = cur - sub;
+                }
+                blkg->last_use = cur_use;
+        }
+}
+
+/*
+ * This is called when we want to actually walk up the hierarchy and check to
+ * see if we need to throttle, and then actually throttle if there is some
+ * accumulated delay. This should only be called upon return to user space so
+ * we're not holding some lock that would induce a priority inversion.
+ */
+static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
+{
+        u64 now = ktime_to_ns(ktime_get());
+        u64 exp;
+        u64 delay_nsec = 0;
+        int tok;
+
+        while (blkg->parent) {
+                if (atomic_read(&blkg->use_delay)) {
+                        blkcg_scale_delay(blkg, now);
+                        delay_nsec = max_t(u64, delay_nsec,
+                                           atomic64_read(&blkg->delay_nsec));
+                }
+                blkg = blkg->parent;
+        }
+
+        if (!delay_nsec)
+                return;
+
+        /*
+         * Let's not sleep for all eternity if we've amassed a huge delay.
+         * Swapping or metadata IO can accumulate 10's of seconds worth of
+         * delay, and we want userspace to be able to do _something_ so cap the
+         * delays at 1 second. If there's 10's of seconds worth of delay then
+         * the tasks will be delayed for 1 second for every syscall.
+         */
+        delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
+
+        /*
+         * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
+         * that hasn't landed upstream yet. Once that stuff is in place we need
+         * to do a psi_memstall_enter/leave if memdelay is set.
+         */
+
+        exp = ktime_add_ns(now, delay_nsec);
+        tok = io_schedule_prepare();
+        do {
+                __set_current_state(TASK_KILLABLE);
+                if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
+                        break;
+        } while (!fatal_signal_pending(current));
+        io_schedule_finish(tok);
+}
+
+/**
+ * blkcg_maybe_throttle_current - throttle the current task if it has been marked
+ *
+ * This is only called if we've been marked with set_notify_resume(). Obviously
+ * we can be set_notify_resume() for reasons other than blkcg throttling, so we
+ * check to see if current->throttle_queue is set and if not this doesn't do
+ * anything. This should only ever be called by the resume code, it's not meant
+ * to be called by people willy-nilly as it will actually do the work to
+ * throttle the task if it is setup for throttling.
+ */
+void blkcg_maybe_throttle_current(void)
+{
+        struct request_queue *q = current->throttle_queue;
+        struct cgroup_subsys_state *css;
+        struct blkcg *blkcg;
+        struct blkcg_gq *blkg;
+        bool use_memdelay = current->use_memdelay;
+
+        if (!q)
+                return;
+
+        current->throttle_queue = NULL;
+        current->use_memdelay = false;
+
+        rcu_read_lock();
+        css = kthread_blkcg();
+        if (css)
+                blkcg = css_to_blkcg(css);
+        else
+                blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
+
+        if (!blkcg)
+                goto out;
+        blkg = blkg_lookup(blkcg, q);
+        if (!blkg)
+                goto out;
+        blkg = blkg_try_get(blkg);
+        if (!blkg)
+                goto out;
+        rcu_read_unlock();
+        blk_put_queue(q);
+
+        blkcg_maybe_throttle_blkg(blkg, use_memdelay);
+        blkg_put(blkg);
+        return;
+out:
+        rcu_read_unlock();
+        blk_put_queue(q);
+}
+EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
+
+/**
+ * blkcg_schedule_throttle - this task needs to check for throttling
+ * @q - the request queue IO was submitted on
+ * @use_memdelay - do we charge this to memory delay for PSI
+ *
+ * This is called by the IO controller when we know there's delay accumulated
+ * for the blkg for this task. We do not pass the blkg because there are places
+ * we call this that may not have that information, the swapping code for
+ * instance will only have a request_queue at that point. This sets the
+ * notify_resume for the task to check and see if it requires throttling before
+ * returning to user space.
+ *
+ * We will only schedule once per syscall. You can call this over and over
+ * again and it will only do the check once upon return to user space, and only
+ * throttle once. If the task needs to be throttled again it'll need to be
+ * re-set at the next time we see the task.
+ */
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
+{
+        if (unlikely(current->flags & PF_KTHREAD))
+                return;
+
+        if (!blk_get_queue(q))
+                return;
+
+        if (current->throttle_queue)
+                blk_put_queue(current->throttle_queue);
+        current->throttle_queue = q;
+        if (use_memdelay)
+                current->use_memdelay = use_memdelay;
+        set_notify_resume(current);
+}
+EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
+
+/**
+ * blkcg_add_delay - add delay to this blkg
+ * @now - the current time in nanoseconds
+ * @delta - how many nanoseconds of delay to add
+ *
+ * Charge @delta to the blkg's current delay accumulation. This is used to
+ * throttle tasks if an IO controller thinks we need more throttling.
+ */
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
+{
+        blkcg_scale_delay(blkg, now);
+        atomic64_add(delta, &blkg->delay_nsec);
+}
+EXPORT_SYMBOL_GPL(blkcg_add_delay);
+
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
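To get a feel for the decay performed by blkcg_scale_delay() above, the following stand-alone user-space model reproduces the same bookkeeping with invented names (toy_blkg, toy_scale_delay, toy_add_delay); only the arithmetic mirrors the patch. Once a full second has elapsed since delay_start, min(last_delay, elapsed) is charged off, or at least half of last_delay if use_delay dropped in the meantime:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* Toy, single-threaded model of the per-blkg delay fields used above. */
struct toy_blkg {
        uint64_t delay_start;   /* start of the current 1s window */
        uint64_t delay_nsec;    /* accumulated delay */
        uint64_t last_delay;    /* delay still chargeable from the last window */
        int use_delay;          /* throttle level set by the controller */
        int last_use;
};

static void toy_scale_delay(struct toy_blkg *b, uint64_t now)
{
        uint64_t old = b->delay_start;
        uint64_t cur, sub;

        /* scale down at most once per second, as blkcg_scale_delay() does */
        if (old + NSEC_PER_SEC > now)
                return;

        b->delay_start = now;
        cur = b->delay_nsec;
        sub = b->last_delay < now - old ? b->last_delay : now - old;
        /* use_delay dropped since the last window: shed a bigger chunk */
        if (b->use_delay < b->last_use && b->last_delay / 2 > sub)
                sub = b->last_delay / 2;
        if (cur < sub) {
                b->delay_nsec = 0;
                b->last_delay = 0;
        } else {
                b->delay_nsec = cur - sub;
                b->last_delay = cur - sub;
        }
        b->last_use = b->use_delay;
}

static void toy_add_delay(struct toy_blkg *b, uint64_t now, uint64_t delta)
{
        toy_scale_delay(b, now);
        b->delay_nsec += delta;
}

int main(void)
{
        struct toy_blkg b = { .use_delay = 1, .last_use = 1 };

        toy_add_delay(&b, 0, 3 * NSEC_PER_SEC);  /* 3s of delay charged at t=0 */
        toy_add_delay(&b, 2 * NSEC_PER_SEC, 0);  /* window rolls, last_delay recorded */
        printf("t=2s delay=%llu ns\n", (unsigned long long)b.delay_nsec);
        toy_add_delay(&b, 4 * NSEC_PER_SEC, 0);  /* part of the backlog charged off */
        printf("t=4s delay=%llu ns\n", (unsigned long long)b.delay_nsec);
        return 0;
}

Running it prints 3000000000 ns at t=2s (nothing was chargeable in the first window) and 1000000000 ns at t=4s, after min(last_delay = 3s, elapsed = 2s) was subtracted.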