author		Tejun Heo <tj@kernel.org>	2019-06-27 16:39:52 -0400
committer	Jens Axboe <axboe@kernel.dk>	2019-07-10 11:00:57 -0400
commit		d3f77dfdc71835f8db71ca57d272b1fbec9dfc18 (patch)
tree		ced59cee416b39c3c2f80c647c736a2d378772e3 /block/blk-cgroup.c
parent		653c45c6b90c9659facbef10546d1f3a8e37d0cf (diff)
blkcg: implement REQ_CGROUP_PUNT
When a shared kthread needs to issue a bio for a cgroup, doing so
synchronously can lead to priority inversions as the kthread can be
trapped waiting for that cgroup.  This patch implements the
REQ_CGROUP_PUNT flag, which makes submit_bio() punt the actual issuing
to a dedicated per-blkcg work item to avoid such priority inversions.

This will be used to fix priority inversions in btrfs compression and
should be generally useful as we grow filesystem support for
comprehensive IO control.

Cc: Chris Mason <clm@fb.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
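As a rough illustration of the intended usage (not part of this commit's diff; the issuing function below is hypothetical), a shared worker submitting IO on behalf of a cgroup would tag the bio with REQ_CGROUP_PUNT so that submit_bio() bounces it to the per-blkcg work item instead of issuing it inline:

	/*
	 * Hypothetical caller sketch: a shared kthread issuing a bio that is
	 * charged to some cgroup.  Setting REQ_CGROUP_PUNT asks submit_bio()
	 * to hand the bio to the owning blkg's async_bio_work instead of
	 * issuing it synchronously in this context.
	 */
	static void issue_bio_for_cgroup(struct bio *bio)
	{
		bio->bi_opf |= REQ_CGROUP_PUNT;	/* consumed in __blkcg_punt_bio_submit() */
		submit_bio(bio);		/* may return before the bio is actually issued */
	}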
Diffstat (limited to 'block/blk-cgroup.c')
-rw-r--r--	block/blk-cgroup.c	53
1 file changed, 53 insertions(+), 0 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ad7a91dec934..24ed26957367 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -55,6 +55,7 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */
 
 static bool blkcg_debug_stats = false;
+static struct workqueue_struct *blkcg_punt_bio_wq;
 
 static bool blkcg_policy_enabled(struct request_queue *q,
 				 const struct blkcg_policy *pol)
@@ -89,6 +90,8 @@ static void __blkg_release(struct rcu_head *rcu)
 {
 	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
 
+	WARN_ON(!bio_list_empty(&blkg->async_bios));
+
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
 	if (blkg->parent)
@@ -114,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref)
 	call_rcu(&blkg->rcu_head, __blkg_release);
 }
 
+static void blkg_async_bio_workfn(struct work_struct *work)
+{
+	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
+					     async_bio_work);
+	struct bio_list bios = BIO_EMPTY_LIST;
+	struct bio *bio;
+
+	/* as long as there are pending bios, @blkg can't go away */
+	spin_lock_bh(&blkg->async_bio_lock);
+	bio_list_merge(&bios, &blkg->async_bios);
+	bio_list_init(&blkg->async_bios);
+	spin_unlock_bh(&blkg->async_bio_lock);
+
+	while ((bio = bio_list_pop(&bios)))
+		submit_bio(bio);
+}
+
 /**
  * blkg_alloc - allocate a blkg
  * @blkcg: block cgroup the new blkg is associated with
@@ -142,6 +162,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 
 	blkg->q = q;
 	INIT_LIST_HEAD(&blkg->q_node);
+	spin_lock_init(&blkg->async_bio_lock);
+	bio_list_init(&blkg->async_bios);
+	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
 	blkg->blkcg = blkcg;
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
@@ -1528,6 +1551,25 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
+bool __blkcg_punt_bio_submit(struct bio *bio)
+{
+	struct blkcg_gq *blkg = bio->bi_blkg;
+
+	/* consume the flag first */
+	bio->bi_opf &= ~REQ_CGROUP_PUNT;
+
+	/* never bounce for the root cgroup */
+	if (!blkg->parent)
+		return false;
+
+	spin_lock_bh(&blkg->async_bio_lock);
+	bio_list_add(&blkg->async_bios, bio);
+	spin_unlock_bh(&blkg->async_bio_lock);
+
+	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+	return true;
+}
+
 /*
  * Scale the accumulated delay based on how long it has been since we updated
  * the delay.  We only call this when we are adding delay, in case it's been a
@@ -1729,5 +1771,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
 	atomic64_add(delta, &blkg->delay_nsec);
 }
 
+static int __init blkcg_init(void)
+{
+	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
+					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
+					    WQ_UNBOUND | WQ_SYSFS, 0);
+	if (!blkcg_punt_bio_wq)
+		return -ENOMEM;
+	return 0;
+}
+subsys_initcall(blkcg_init);
+
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
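For context: __blkcg_punt_bio_submit() above is reached from the bio submission path through a small inline check that lives outside this file (the diffstat is limited to block/blk-cgroup.c). A sketch of that wrapper, reconstructed from the function's contract rather than quoted from the commit, would look roughly like:

	/*
	 * Sketch only, not quoted from this file's diff: only bios explicitly
	 * marked REQ_CGROUP_PUNT take the punt path; everything else is
	 * submitted inline as before.
	 */
	static inline bool blkcg_punt_bio_submit(struct bio *bio)
	{
		if (bio->bi_opf & REQ_CGROUP_PUNT)
			return __blkcg_punt_bio_submit(bio);
		else
			return false;
	}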