about summary refs log tree commit diff stats
path: root/include/linux/blk-cgroup.h
diff options
context:
space:
mode:
authorJosef Bacik <jbacik@fb.com>2018-07-03 11:14:55 -0400
committerJens Axboe <axboe@kernel.dk>2018-07-09 11:07:54 -0400
commitd09d8df3a29403693d9d20cc34ed101f2c558e2b (patch)
treeef13236fd3cab8b7a3d6c27a7484862561afcd32 /include/linux/blk-cgroup.h
parent0d3bd88d54f513723602b361dccfc71639f50779 (diff)
blkcg: add generic throttling mechanism
Since IO can be issued from literally anywhere it's almost impossible to do throttling without having some sort of adverse effect somewhere else in the system because of locking or other dependencies. The best way to solve this is to do the throttling when we know we aren't holding any other kernel resources. Do this by tracking throttling in a per-blkg basis, and if we require throttling flag the task that it needs to check before it returns to user space and possibly sleep there. This is to address the case where a process is doing work that is generating IO that can't be throttled, whether that is directly with a lot of REQ_META IO, or indirectly by allocating so much memory that it is swamping the disk with REQ_SWAP. We can't use task_add_work as we don't want to induce a memory allocation in the IO path, so simply saving the request queue in the task and flagging it to do the notify_resume thing achieves the same result without the overhead of a memory allocation. Signed-off-by: Josef Bacik <jbacik@fb.com> Acked-by: Tejun Heo <tj@kernel.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'include/linux/blk-cgroup.h')
-rw-r--r--  include/linux/blk-cgroup.h  99
1 file changed, 99 insertions, 0 deletions
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index a8f9ba8f33a4..de57de4831d5 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -136,6 +136,12 @@ struct blkcg_gq {
136 struct blkg_policy_data *pd[BLKCG_MAX_POLS]; 136 struct blkg_policy_data *pd[BLKCG_MAX_POLS];
137 137
138 struct rcu_head rcu_head; 138 struct rcu_head rcu_head;
139
140 atomic_t use_delay;
141 atomic64_t delay_nsec;
142 atomic64_t delay_start;
143 u64 last_delay;
144 int last_use;
139}; 145};
140 146
141typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); 147typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
@@ -241,6 +247,26 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
241 return css_to_blkcg(task_css(current, io_cgrp_id)); 247 return css_to_blkcg(task_css(current, io_cgrp_id));
242} 248}
243 249
/*
 * blk_cgroup_congested - check whether the current task's blkcg, or any
 * ancestor cgroup of it, is currently flagged as congested.
 *
 * Picks the relevant css: kthread_blkcg() when that returns one,
 * otherwise the io controller css of @current.  Walks up the hierarchy
 * via css->parent and returns true as soon as any level has a nonzero
 * congestion_count (bumped by blkcg_use_delay()).  The entire walk is
 * under rcu_read_lock(), so no css references need to be taken.
 */
250static inline bool blk_cgroup_congested(void)
251{
252 struct cgroup_subsys_state *css;
253 bool ret = false;
254
255 rcu_read_lock();
256 css = kthread_blkcg();
257 if (!css)
258 css = task_css(current, io_cgrp_id);
259 while (css) {
260 if (atomic_read(&css->cgroup->congestion_count)) {
261 ret = true;
262 break;
263 }
264 css = css->parent;
265 }
266 rcu_read_unlock();
267 return ret;
268}
269
244/** 270/**
245 * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg 271 * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
246 * @return: true if this bio needs to be submitted with the root blkg context. 272 * @return: true if this bio needs to be submitted with the root blkg context.
@@ -374,6 +400,21 @@ static inline void blkg_get(struct blkcg_gq *blkg)
374 atomic_inc(&blkg->refcnt); 400 atomic_inc(&blkg->refcnt);
375} 401}
376 402
403/**
404 * blkg_try_get - try and get a blkg reference
405 * @blkg: blkg to get
406 *
407 * This is for use when doing an RCU lookup of the blkg. We may be in the midst
408 * of freeing this blkg, so we can only use it if the refcnt is not zero.
 *
 * Return: @blkg with its refcnt elevated on success, NULL if the refcnt
 * had already dropped to zero (blkg being freed).  NOTE(review): a
 * successful grab presumably must be balanced with a blkg_put() — confirm
 * against callers.
409 */
410static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg)
411{
 /* atomic_inc_not_zero() refuses to resurrect a zero refcnt */
412 if (atomic_inc_not_zero(&blkg->refcnt))
413 return blkg;
 /* blkg is on its way to being freed; caller must not touch it */
414 return NULL;
415}
416
417
377void __blkg_release_rcu(struct rcu_head *rcu); 418void __blkg_release_rcu(struct rcu_head *rcu);
378 419
379/** 420/**
@@ -734,6 +775,59 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
734 return !throtl; 775 return !throtl;
735} 776}
736 777
/*
 * blkcg_use_delay - add one delay "use" to @blkg.
 *
 * Increments blkg->use_delay; on the 0 -> 1 transition (the return value
 * of atomic_add_return() is exactly 1) it also bumps the owning cgroup's
 * congestion_count, which is what blk_cgroup_congested() tests.
 */
778static inline void blkcg_use_delay(struct blkcg_gq *blkg)
779{
780 if (atomic_add_return(1, &blkg->use_delay) == 1)
781 atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
782}
783
/*
 * blkcg_unuse_delay - drop one delay "use" from @blkg.
 *
 * Counterpart of blkcg_use_delay().  Decrements use_delay with a cmpxchg
 * loop (see the comment below for why a plain atomic_dec won't do); when
 * the observed pre-decrement value was 1, i.e. this was the last use, the
 * cgroup's congestion_count is dropped as well.
 *
 * Return: 1 if a use was actually dropped, 0 if use_delay was already 0.
 */
784static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
785{
786 int old = atomic_read(&blkg->use_delay);
787
 /* fast path: nothing to drop */
788 if (old == 0)
789 return 0;
790
791 /*
792 * We do this song and dance because we can race with somebody else
793 * adding or removing delay. If we just did an atomic_dec we'd end up
794 * negative and we'd already be in trouble. We need to subtract 1 and
795 * then check to see if we were the last delay so we can drop the
796 * congestion count on the cgroup.
797 */
798 while (old) {
799 int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1);
 /* cmpxchg succeeded: 'old' is the value we decremented from */
800 if (cur == old)
801 break;
 /* lost the race; retry with the value somebody else installed */
802 old = cur;
803 }
804
 /* the count hit 0 under us before we could decrement */
805 if (old == 0)
806 return 0;
 /* we performed the 1 -> 0 transition: clear the congestion flag */
807 if (old == 1)
808 atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
809 return 1;
810}
811
/*
 * blkcg_clear_delay - reset @blkg's use_delay count straight to zero.
 *
 * Unlike blkcg_unuse_delay() this discards all outstanding uses at once.
 * congestion_count is decremented exactly once, matching the single
 * increment done on the 0 -> 1 transition in blkcg_use_delay(); the
 * cmpxchg guarantees only one task observes the nonzero -> 0 swap.
 */
812static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
813{
814 int old = atomic_read(&blkg->use_delay);
815 if (!old)
816 return;
817 /* We only want 1 person clearing the congestion count for this blkg. */
818 while (old) {
819 int cur = atomic_cmpxchg(&blkg->use_delay, old, 0);
820 if (cur == old) {
821 atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
822 break;
823 }
 /* raced with another update; retry against the fresh value */
824 old = cur;
825 }
826}
827
828void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
829void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay);
830void blkcg_maybe_throttle_current(void);
737#else /* CONFIG_BLK_CGROUP */ 831#else /* CONFIG_BLK_CGROUP */
738 832
739struct blkcg { 833struct blkcg {
@@ -753,8 +847,13 @@ struct blkcg_policy {
753 847
754#define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) 848#define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
755 849
/* !CONFIG_BLK_CGROUP: throttling is compiled out, so these are no-op stubs. */
850static inline void blkcg_maybe_throttle_current(void) { }
851static inline bool blk_cgroup_congested(void) { return false; }
852
756#ifdef CONFIG_BLOCK 853#ifdef CONFIG_BLOCK
757 854
/* Without CONFIG_BLK_CGROUP there is nothing to throttle; silently ignore. */
855static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { }
856
/* Stubbed queue hooks for !CONFIG_BLK_CGROUP: act as if no blkcg state exists. */
758static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } 857static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
759static inline int blkcg_init_queue(struct request_queue *q) { return 0; } 858static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
760static inline void blkcg_drain_queue(struct request_queue *q) { } 859static inline void blkcg_drain_queue(struct request_queue *q) { }