author		Tejun Heo <tj@kernel.org>	2019-06-27 16:39:52 -0400
committer	Jens Axboe <axboe@kernel.dk>	2019-07-10 11:00:57 -0400
commit		d3f77dfdc71835f8db71ca57d272b1fbec9dfc18 (patch)
tree		ced59cee416b39c3c2f80c647c736a2d378772e3
parent		653c45c6b90c9659facbef10546d1f3a8e37d0cf (diff)
blkcg: implement REQ_CGROUP_PUNT
When a shared kthread needs to issue a bio for a cgroup, doing so
synchronously can lead to priority inversions as the kthread can be trapped
waiting for that cgroup.  This patch implements the REQ_CGROUP_PUNT flag,
which makes submit_bio() punt the actual issuing to a dedicated per-blkcg
work item to avoid such priority inversions.

This will be used to fix priority inversions in btrfs compression and should
be generally useful as we grow filesystem support for comprehensive IO
control.

Cc: Chris Mason <clm@fb.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
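As a rough usage sketch (not part of this patch): a filesystem whose writeback
is performed by a shared worker could request punting either by OR-ing
REQ_CGROUP_PUNT into bio->bi_opf directly or by setting wbc->punt_to_cgroup
and letting wbc_to_write_flags() translate it. The helper below is purely
illustrative; its name and surrounding setup are assumptions, and the bio is
expected to already be associated with the target blkcg (e.g. via
bio_associate_blkg()).

	/* Hypothetical example, not part of this patch. */
	static void example_submit_punted(struct bio *bio,
					  struct writeback_control *wbc)
	{
		wbc->punt_to_cgroup = 1;
		/* wbc_to_write_flags() now also ORs in REQ_CGROUP_PUNT */
		bio->bi_opf |= wbc_to_write_flags(wbc);

		/*
		 * submit_bio() consumes REQ_CGROUP_PUNT: unless the bio
		 * belongs to the root cgroup, it is queued on
		 * blkg->async_bios and reissued by blkg_async_bio_workfn()
		 * from the blkcg_punt_bio workqueue, so this caller is not
		 * blocked by the target cgroup's IO control.
		 */
		submit_bio(bio);
	}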
-rw-r--r--	block/blk-cgroup.c		53
-rw-r--r--	block/blk-core.c		3
-rw-r--r--	include/linux/backing-dev.h	1
-rw-r--r--	include/linux/blk-cgroup.h	16
-rw-r--r--	include/linux/blk_types.h	10
-rw-r--r--	include/linux/writeback.h	13
6 files changed, 92 insertions, 4 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ad7a91dec934..24ed26957367 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -55,6 +55,7 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */
 
 static bool blkcg_debug_stats = false;
+static struct workqueue_struct *blkcg_punt_bio_wq;
 
 static bool blkcg_policy_enabled(struct request_queue *q,
 				 const struct blkcg_policy *pol)
@@ -89,6 +90,8 @@ static void __blkg_release(struct rcu_head *rcu)
 {
 	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
 
+	WARN_ON(!bio_list_empty(&blkg->async_bios));
+
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
 	if (blkg->parent)
@@ -114,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref)
 	call_rcu(&blkg->rcu_head, __blkg_release);
 }
 
+static void blkg_async_bio_workfn(struct work_struct *work)
+{
+	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
+					     async_bio_work);
+	struct bio_list bios = BIO_EMPTY_LIST;
+	struct bio *bio;
+
+	/* as long as there are pending bios, @blkg can't go away */
+	spin_lock_bh(&blkg->async_bio_lock);
+	bio_list_merge(&bios, &blkg->async_bios);
+	bio_list_init(&blkg->async_bios);
+	spin_unlock_bh(&blkg->async_bio_lock);
+
+	while ((bio = bio_list_pop(&bios)))
+		submit_bio(bio);
+}
+
 /**
  * blkg_alloc - allocate a blkg
  * @blkcg: block cgroup the new blkg is associated with
@@ -142,6 +162,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 
 	blkg->q = q;
 	INIT_LIST_HEAD(&blkg->q_node);
+	spin_lock_init(&blkg->async_bio_lock);
+	bio_list_init(&blkg->async_bios);
+	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
 	blkg->blkcg = blkcg;
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
@@ -1528,6 +1551,25 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
+bool __blkcg_punt_bio_submit(struct bio *bio)
+{
+	struct blkcg_gq *blkg = bio->bi_blkg;
+
+	/* consume the flag first */
+	bio->bi_opf &= ~REQ_CGROUP_PUNT;
+
+	/* never bounce for the root cgroup */
+	if (!blkg->parent)
+		return false;
+
+	spin_lock_bh(&blkg->async_bio_lock);
+	bio_list_add(&blkg->async_bios, bio);
+	spin_unlock_bh(&blkg->async_bio_lock);
+
+	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+	return true;
+}
+
 /*
  * Scale the accumulated delay based on how long it has been since we updated
  * the delay. We only call this when we are adding delay, in case it's been a
@@ -1729,5 +1771,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
 	atomic64_add(delta, &blkg->delay_nsec);
 }
 
+static int __init blkcg_init(void)
+{
+	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
+					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
+					    WQ_UNBOUND | WQ_SYSFS, 0);
+	if (!blkcg_punt_bio_wq)
+		return -ENOMEM;
+	return 0;
+}
+subsys_initcall(blkcg_init);
+
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/block/blk-core.c b/block/blk-core.c
index edd009213f5b..260e36a2c343 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1128,6 +1128,9 @@ EXPORT_SYMBOL_GPL(direct_make_request);
  */
 blk_qc_t submit_bio(struct bio *bio)
 {
+	if (blkcg_punt_bio_submit(bio))
+		return BLK_QC_T_NONE;
+
 	/*
 	 * If it's a regular read/write or a barrier with data attached,
 	 * go through the normal accounting stuff before submission.
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index f9b029180241..35b31d176f74 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -48,6 +48,7 @@ extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
 
 extern struct workqueue_struct *bdi_wq;
+extern struct workqueue_struct *bdi_async_bio_wq;
 
 static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
 {
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 33f23a858438..689a58231288 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -132,13 +132,17 @@ struct blkcg_gq {
 
 	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
 
-	struct rcu_head			rcu_head;
+	spinlock_t			async_bio_lock;
+	struct bio_list			async_bios;
+	struct work_struct		async_bio_work;
 
 	atomic_t			use_delay;
 	atomic64_t			delay_nsec;
 	atomic64_t			delay_start;
 	u64				last_delay;
 	int				last_use;
+
+	struct rcu_head			rcu_head;
 };
 
 typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
@@ -701,6 +705,15 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg
 					   struct bio *bio) { return false; }
 #endif
 
+bool __blkcg_punt_bio_submit(struct bio *bio);
+
+static inline bool blkcg_punt_bio_submit(struct bio *bio)
+{
+	if (bio->bi_opf & REQ_CGROUP_PUNT)
+		return __blkcg_punt_bio_submit(bio);
+	else
+		return false;
+}
 
 static inline void blkcg_bio_issue_init(struct bio *bio)
 {
@@ -848,6 +861,7 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
 
+static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
 static inline void blkcg_bio_issue_init(struct bio *bio) { }
 static inline bool blkcg_bio_issue_check(struct request_queue *q,
 					 struct bio *bio) { return true; }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 6a53799c3fe2..feff3fe4467e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -311,6 +311,14 @@ enum req_flag_bits {
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
 	__REQ_BACKGROUND,	/* background IO */
 	__REQ_NOWAIT,		/* Don't wait if request will block */
+	/*
+	 * When a shared kthread needs to issue a bio for a cgroup, doing
+	 * so synchronously can lead to priority inversions as the kthread
+	 * can be trapped waiting for that cgroup.  CGROUP_PUNT flag makes
+	 * submit_bio() punt the actual issuing to a dedicated per-blkcg
+	 * work item to avoid such priority inversions.
+	 */
+	__REQ_CGROUP_PUNT,
 
 	/* command specific flags for REQ_OP_WRITE_ZEROES: */
 	__REQ_NOUNMAP,		/* do not free blocks when zeroing */
@@ -337,6 +345,8 @@ enum req_flag_bits {
 #define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND		(1ULL << __REQ_BACKGROUND)
 #define REQ_NOWAIT		(1ULL << __REQ_NOWAIT)
+#define REQ_CGROUP_PUNT		(1ULL << __REQ_CGROUP_PUNT)
+
 #define REQ_NOUNMAP		(1ULL << __REQ_NOUNMAP)
 #define REQ_HIPRI		(1ULL << __REQ_HIPRI)
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index e056a22075cf..8945aac31392 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -78,6 +78,8 @@ struct writeback_control {
 	 */
 	unsigned no_cgroup_owner:1;
 
+	unsigned punt_to_cgroup:1;	/* cgrp punting, see __REQ_CGROUP_PUNT */
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct bdi_writeback *wb;	/* wb this writeback is issued under */
 	struct inode *inode;		/* inode being written out */
@@ -94,12 +96,17 @@ struct writeback_control {
 
 static inline int wbc_to_write_flags(struct writeback_control *wbc)
 {
+	int flags = 0;
+
+	if (wbc->punt_to_cgroup)
+		flags = REQ_CGROUP_PUNT;
+
 	if (wbc->sync_mode == WB_SYNC_ALL)
-		return REQ_SYNC;
+		flags |= REQ_SYNC;
 	else if (wbc->for_kupdate || wbc->for_background)
-		return REQ_BACKGROUND;
+		flags |= REQ_BACKGROUND;
 
-	return 0;
+	return flags;
 }
 
 static inline struct cgroup_subsys_state *