author:    Josef Bacik <jbacik@fb.com>  2018-07-03 11:14:55 -0400
committer: Jens Axboe <axboe@kernel.dk>  2018-07-09 11:07:54 -0400
commit:    d09d8df3a29403693d9d20cc34ed101f2c558e2b
tree:      ef13236fd3cab8b7a3d6c27a7484862561afcd32 /block/blk-cgroup.c
parent:    0d3bd88d54f513723602b361dccfc71639f50779
blkcg: add generic throttling mechanism
Since IO can be issued from literally anywhere, it's almost impossible to do throttling without having some sort of adverse effect somewhere else in the system because of locking or other dependencies. The best way to solve this is to do the throttling when we know we aren't holding any other kernel resources. Do this by tracking throttling on a per-blkg basis, and if we require throttling, flag the task so that it checks before it returns to user space and possibly sleeps there.

This is to address the case where a process is doing work that generates IO which can't be throttled, whether that is directly with a lot of REQ_META IO, or indirectly by allocating so much memory that it is swamping the disk with REQ_SWAP. We can't use task_add_work as we don't want to induce a memory allocation in the IO path, so simply saving the request queue in the task and flagging it to do the notify_resume thing achieves the same result without the overhead of a memory allocation.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
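To make the intended flow concrete, here is a rough sketch of the producer side, i.e. how an in-kernel IO controller might drive the two helpers this patch adds (blkcg_add_delay() and blkcg_schedule_throttle()). The function name, the congestion test and the delay amount below are placeholders, not part of the patch:

static void example_policy_account(struct blkcg_gq *blkg,
                                   struct request_queue *q,
                                   bool over_latency_target,
                                   u64 extra_delay_ns)
{
        /* hypothetical congestion signal; only the two calls below are real */
        if (!over_latency_target)
                return;

        /* charge delay to the blkg; blkcg_scale_delay() decays it over time */
        blkcg_add_delay(blkg, ktime_to_ns(ktime_get()), extra_delay_ns);

        /*
         * Flag the issuing task so it checks (and, if needed, sleeps) on its
         * way back to user space, where no kernel locks are held.
         */
        blkcg_schedule_throttle(q, false);
}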
Diffstat (limited to 'block/blk-cgroup.c')
-rw-r--r--  block/blk-cgroup.c | 220
1 file changed, 220 insertions(+), 0 deletions(-)
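The diffstat above is limited to block/blk-cgroup.c. Per the commit message, the task is flagged via the notify_resume path so that blkcg_maybe_throttle_current() runs on the way back to user space; that hook lives outside this file and is not shown in this diff. Purely as an orientation sketch, with the existing body elided:

static inline void tracehook_notify_resume(struct pt_regs *regs)
{
        /* ... existing notify_resume work (task_work_run(), etc.) ... */

        /* sleep off any delay if blkcg_schedule_throttle() flagged this task */
        blkcg_maybe_throttle_current();
}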
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 7dc6f05cc44b..d3310ec96c2a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -27,6 +27,7 @@
 #include <linux/atomic.h>
 #include <linux/ctype.h>
 #include <linux/blk-cgroup.h>
+#include <linux/tracehook.h>
 #include "blk.h"
 
 #define MAX_KEY_LEN 100
@@ -999,6 +1000,14 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
                 if (!blkcg_debug_stats)
                         goto next;
 
+                if (atomic_read(&blkg->use_delay)) {
+                        has_stats = true;
+                        off += scnprintf(buf+off, size-off,
+                                         " use_delay=%d delay_nsec=%llu",
+                                         atomic_read(&blkg->use_delay),
+                                         (unsigned long long)atomic64_read(&blkg->delay_nsec));
+                }
+
                 for (i = 0; i < BLKCG_MAX_POLS; i++) {
                         struct blkcg_policy *pol = blkcg_policy[i];
                         size_t written;
@@ -1326,6 +1335,13 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css)
         mutex_unlock(&blkcg_pol_mutex);
 }
 
+static void blkcg_exit(struct task_struct *tsk)
+{
+        if (tsk->throttle_queue)
+                blk_put_queue(tsk->throttle_queue);
+        tsk->throttle_queue = NULL;
+}
+
 struct cgroup_subsys io_cgrp_subsys = {
         .css_alloc = blkcg_css_alloc,
         .css_offline = blkcg_css_offline,
@@ -1335,6 +1351,7 @@ struct cgroup_subsys io_cgrp_subsys = {
         .dfl_cftypes = blkcg_files,
         .legacy_cftypes = blkcg_legacy_files,
         .legacy_name = "blkio",
+        .exit = blkcg_exit,
 #ifdef CONFIG_MEMCG
         /*
          * This ensures that, if available, memcg is automatically enabled
@@ -1586,5 +1603,208 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
+/*
+ * Scale the accumulated delay based on how long it has been since we updated
+ * the delay. We only call this when we are adding delay, in case it's been a
+ * while since we added delay, and when we are checking to see if we need to
+ * delay a task, to account for any delays that may have occurred.
+ */
+static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
+{
+        u64 old = atomic64_read(&blkg->delay_start);
+
+        /*
+         * We only want to scale down every second. The idea here is that we
+         * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
+         * time window. We only want to throttle tasks for recent delay that
+         * has occurred, in 1 second time windows since that's the maximum
+         * things can be throttled. We save the current delay window in
+         * blkg->last_delay so we know what amount is still left to be charged
+         * to the blkg from this point onward. blkg->last_use keeps track of
+         * the use_delay counter. The idea is if we're unthrottling the blkg we
+         * are ok with whatever is happening now, and we can take away more of
+         * the accumulated delay as we've already throttled enough that
+         * everybody is happy with their IO latencies.
+         */
+        if (time_before64(old + NSEC_PER_SEC, now) &&
+            atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
+                u64 cur = atomic64_read(&blkg->delay_nsec);
+                u64 sub = min_t(u64, blkg->last_delay, now - old);
+                int cur_use = atomic_read(&blkg->use_delay);
+
+                /*
+                 * We've been unthrottled, subtract a larger chunk of our
+                 * accumulated delay.
+                 */
+                if (cur_use < blkg->last_use)
+                        sub = max_t(u64, sub, blkg->last_delay >> 1);
+
+                /*
+                 * This shouldn't happen, but handle it anyway. Our delay_nsec
+                 * should only ever be growing except here where we subtract out
+                 * min(last_delay, 1 second), but lord knows bugs happen and I'd
+                 * rather not end up with negative numbers.
+                 */
+                if (unlikely(cur < sub)) {
+                        atomic64_set(&blkg->delay_nsec, 0);
+                        blkg->last_delay = 0;
+                } else {
+                        atomic64_sub(sub, &blkg->delay_nsec);
+                        blkg->last_delay = cur - sub;
+                }
+                blkg->last_use = cur_use;
+        }
+}
+
+/*
+ * This is called when we want to actually walk up the hierarchy and check to
+ * see if we need to throttle, and then actually throttle if there is some
+ * accumulated delay. This should only be called upon return to user space so
+ * we're not holding some lock that would induce a priority inversion.
+ */
+static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
+{
+        u64 now = ktime_to_ns(ktime_get());
+        u64 exp;
+        u64 delay_nsec = 0;
+        int tok;
+
+        while (blkg->parent) {
+                if (atomic_read(&blkg->use_delay)) {
+                        blkcg_scale_delay(blkg, now);
+                        delay_nsec = max_t(u64, delay_nsec,
+                                           atomic64_read(&blkg->delay_nsec));
+                }
+                blkg = blkg->parent;
+        }
+
+        if (!delay_nsec)
+                return;
+
+        /*
+         * Let's not sleep for all eternity if we've amassed a huge delay.
+         * Swapping or metadata IO can accumulate 10's of seconds worth of
+         * delay, and we want userspace to be able to do _something_ so cap the
+         * delays at 1 second. If there's 10's of seconds worth of delay then
+         * the tasks will be delayed for 1 second for every syscall.
+         */
+        delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
+
+        /*
+         * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
+         * that hasn't landed upstream yet. Once that stuff is in place we need
+         * to do a psi_memstall_enter/leave if memdelay is set.
+         */
+
+        exp = ktime_add_ns(now, delay_nsec);
+        tok = io_schedule_prepare();
+        do {
+                __set_current_state(TASK_KILLABLE);
+                if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
+                        break;
+        } while (!fatal_signal_pending(current));
+        io_schedule_finish(tok);
+}
+
+/**
+ * blkcg_maybe_throttle_current - throttle the current task if it has been marked
+ *
+ * This is only called if we've been marked with set_notify_resume(). Obviously
+ * we can be set_notify_resume() for reasons other than blkcg throttling, so we
+ * check to see if current->throttle_queue is set and if not this doesn't do
+ * anything. This should only ever be called by the resume code, it's not meant
+ * to be called by people willy-nilly as it will actually do the work to
+ * throttle the task if it is setup for throttling.
+ */
+void blkcg_maybe_throttle_current(void)
+{
+        struct request_queue *q = current->throttle_queue;
+        struct cgroup_subsys_state *css;
+        struct blkcg *blkcg;
+        struct blkcg_gq *blkg;
+        bool use_memdelay = current->use_memdelay;
+
+        if (!q)
+                return;
+
+        current->throttle_queue = NULL;
+        current->use_memdelay = false;
+
+        rcu_read_lock();
+        css = kthread_blkcg();
+        if (css)
+                blkcg = css_to_blkcg(css);
+        else
+                blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
+
+        if (!blkcg)
+                goto out;
+        blkg = blkg_lookup(blkcg, q);
+        if (!blkg)
+                goto out;
+        blkg = blkg_try_get(blkg);
+        if (!blkg)
+                goto out;
+        rcu_read_unlock();
+        blk_put_queue(q);
+
+        blkcg_maybe_throttle_blkg(blkg, use_memdelay);
+        blkg_put(blkg);
+        return;
+out:
+        rcu_read_unlock();
+        blk_put_queue(q);
+}
+EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
+
+/**
+ * blkcg_schedule_throttle - this task needs to check for throttling
+ * @q - the request queue IO was submitted on
+ * @use_memdelay - do we charge this to memory delay for PSI
+ *
+ * This is called by the IO controller when we know there's delay accumulated
+ * for the blkg for this task. We do not pass the blkg because there are places
+ * we call this that may not have that information, the swapping code for
+ * instance will only have a request_queue at that point. This sets the
+ * notify_resume for the task to check and see if it requires throttling before
+ * returning to user space.
+ *
+ * We will only schedule once per syscall. You can call this over and over
+ * again and it will only do the check once upon return to user space, and only
+ * throttle once. If the task needs to be throttled again it'll need to be
+ * re-set at the next time we see the task.
+ */
+void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
+{
+        if (unlikely(current->flags & PF_KTHREAD))
+                return;
+
+        if (!blk_get_queue(q))
+                return;
+
+        if (current->throttle_queue)
+                blk_put_queue(current->throttle_queue);
+        current->throttle_queue = q;
+        if (use_memdelay)
+                current->use_memdelay = use_memdelay;
+        set_notify_resume(current);
+}
+EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
+
+/**
+ * blkcg_add_delay - add delay to this blkg
+ * @now - the current time in nanoseconds
+ * @delta - how many nanoseconds of delay to add
+ *
+ * Charge @delta to the blkg's current delay accumulation. This is used to
+ * throttle tasks if an IO controller thinks we need more throttling.
+ */
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
+{
+        blkcg_scale_delay(blkg, now);
+        atomic64_add(delta, &blkg->delay_nsec);
+}
+EXPORT_SYMBOL_GPL(blkcg_add_delay);
+
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
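To get a feel for the decay performed by blkcg_scale_delay() above, the following stand-alone user-space model reproduces the same bookkeeping with invented names (toy_blkg, toy_scale_delay, toy_add_delay); only the arithmetic mirrors the patch. Once a full second has elapsed since delay_start, min(last_delay, elapsed) is charged off, or at least half of last_delay if use_delay dropped in the meantime:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* Toy, single-threaded model of the per-blkg delay fields used above. */
struct toy_blkg {
        uint64_t delay_start;   /* start of the current 1s window */
        uint64_t delay_nsec;    /* accumulated delay */
        uint64_t last_delay;    /* delay still chargeable from the last window */
        int use_delay;          /* throttle level set by the controller */
        int last_use;
};

static void toy_scale_delay(struct toy_blkg *b, uint64_t now)
{
        uint64_t old = b->delay_start;
        uint64_t cur, sub;

        /* scale down at most once per second, as blkcg_scale_delay() does */
        if (old + NSEC_PER_SEC > now)
                return;

        b->delay_start = now;
        cur = b->delay_nsec;
        sub = b->last_delay < now - old ? b->last_delay : now - old;
        /* use_delay dropped since the last window: shed a bigger chunk */
        if (b->use_delay < b->last_use && b->last_delay / 2 > sub)
                sub = b->last_delay / 2;
        if (cur < sub) {
                b->delay_nsec = 0;
                b->last_delay = 0;
        } else {
                b->delay_nsec = cur - sub;
                b->last_delay = cur - sub;
        }
        b->last_use = b->use_delay;
}

static void toy_add_delay(struct toy_blkg *b, uint64_t now, uint64_t delta)
{
        toy_scale_delay(b, now);
        b->delay_nsec += delta;
}

int main(void)
{
        struct toy_blkg b = { .use_delay = 1, .last_use = 1 };

        toy_add_delay(&b, 0, 3 * NSEC_PER_SEC);  /* 3s of delay charged at t=0 */
        toy_add_delay(&b, 2 * NSEC_PER_SEC, 0);  /* window rolls, last_delay recorded */
        printf("t=2s delay=%llu ns\n", (unsigned long long)b.delay_nsec);
        toy_add_delay(&b, 4 * NSEC_PER_SEC, 0);  /* part of the backlog charged off */
        printf("t=4s delay=%llu ns\n", (unsigned long long)b.delay_nsec);
        return 0;
}

Running it prints 3000000000 ns at t=2s (nothing was chargeable in the first window) and 1000000000 ns at t=4s, after min(last_delay = 3s, elapsed = 2s) was subtracted.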