-rw-r--r--	block/blk-cgroup.c		53
-rw-r--r--	block/blk-core.c		 3
-rw-r--r--	include/linux/backing-dev.h	 1
-rw-r--r--	include/linux/blk-cgroup.h	16
-rw-r--r--	include/linux/blk_types.h	10
-rw-r--r--	include/linux/writeback.h	13
6 files changed, 92 insertions, 4 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ad7a91dec934..24ed26957367 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -55,6 +55,7 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */
 
 static bool blkcg_debug_stats = false;
+static struct workqueue_struct *blkcg_punt_bio_wq;
 
 static bool blkcg_policy_enabled(struct request_queue *q,
 				 const struct blkcg_policy *pol)
@@ -89,6 +90,8 @@ static void __blkg_release(struct rcu_head *rcu)
 {
 	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
 
+	WARN_ON(!bio_list_empty(&blkg->async_bios));
+
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
 	if (blkg->parent)
@@ -114,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref)
 	call_rcu(&blkg->rcu_head, __blkg_release);
 }
 
+static void blkg_async_bio_workfn(struct work_struct *work)
+{
+	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
+					     async_bio_work);
+	struct bio_list bios = BIO_EMPTY_LIST;
+	struct bio *bio;
+
+	/* as long as there are pending bios, @blkg can't go away */
+	spin_lock_bh(&blkg->async_bio_lock);
+	bio_list_merge(&bios, &blkg->async_bios);
+	bio_list_init(&blkg->async_bios);
+	spin_unlock_bh(&blkg->async_bio_lock);
+
+	while ((bio = bio_list_pop(&bios)))
+		submit_bio(bio);
+}
+
 /**
  * blkg_alloc - allocate a blkg
  * @blkcg: block cgroup the new blkg is associated with
@@ -142,6 +162,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 
 	blkg->q = q;
 	INIT_LIST_HEAD(&blkg->q_node);
+	spin_lock_init(&blkg->async_bio_lock);
+	bio_list_init(&blkg->async_bios);
+	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
 	blkg->blkcg = blkcg;
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
@@ -1528,6 +1551,25 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
+bool __blkcg_punt_bio_submit(struct bio *bio)
+{
+	struct blkcg_gq *blkg = bio->bi_blkg;
+
+	/* consume the flag first */
+	bio->bi_opf &= ~REQ_CGROUP_PUNT;
+
+	/* never bounce for the root cgroup */
+	if (!blkg->parent)
+		return false;
+
+	spin_lock_bh(&blkg->async_bio_lock);
+	bio_list_add(&blkg->async_bios, bio);
+	spin_unlock_bh(&blkg->async_bio_lock);
+
+	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+	return true;
+}
+
 /*
  * Scale the accumulated delay based on how long it has been since we updated
  * the delay. We only call this when we are adding delay, in case it's been a
@@ -1729,5 +1771,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
 	atomic64_add(delta, &blkg->delay_nsec);
 }
 
+static int __init blkcg_init(void)
+{
+	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
+					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
+					    WQ_UNBOUND | WQ_SYSFS, 0);
+	if (!blkcg_punt_bio_wq)
+		return -ENOMEM;
+	return 0;
+}
+subsys_initcall(blkcg_init);
+
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
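
Taken together, the blk-cgroup.c hunks above add the complete punt machinery: a rescuer-backed workqueue, a per-blkcg bio list guarded by a bh-safe spinlock, and a work item that drains the list by resubmitting each bio. A minimal sketch of a caller, assuming the helper name and that bio ownership was established beforehand (neither is part of this patch):

static void shared_kthread_issue(struct bio *bio)
{
	/* bio->bi_blkg must already be set, e.g. via bio_associate_blkg() */
	bio->bi_opf |= REQ_CGROUP_PUNT;

	/*
	 * For a bio in a non-root cgroup, submit_bio() queues it on
	 * blkg->async_bios and returns; blkg_async_bio_workfn() later
	 * resubmits it with the flag already cleared, so the second pass
	 * takes the normal path.
	 */
	submit_bio(bio);
}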
diff --git a/block/blk-core.c b/block/blk-core.c
index edd009213f5b..260e36a2c343 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1128,6 +1128,9 @@ EXPORT_SYMBOL_GPL(direct_make_request);
  */
 blk_qc_t submit_bio(struct bio *bio)
 {
+	if (blkcg_punt_bio_submit(bio))
+		return BLK_QC_T_NONE;
+
 	/*
 	 * If it's a regular read/write or a barrier with data attached,
 	 * go through the normal accounting stuff before submission.
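
A caller-visible consequence of the early return above: a punted bio yields no polling cookie. A short sketch (illustrative helper, not from this patch):

static void submit_checking_cookie(struct bio *bio)
{
	blk_qc_t cookie = submit_bio(bio);

	/*
	 * A punted submission always returns BLK_QC_T_NONE, so there is
	 * nothing to hand to blk_poll(); completion arrives through
	 * ->bi_end_io as usual.
	 */
	if (cookie == BLK_QC_T_NONE)
		return;
	/* ... cookie may otherwise be usable for polling ... */
}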
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index f9b029180241..35b31d176f74 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -48,6 +48,7 @@ extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
 
 extern struct workqueue_struct *bdi_wq;
+extern struct workqueue_struct *bdi_async_bio_wq;
 
 static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
 {
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 33f23a858438..689a58231288 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -132,13 +132,17 @@ struct blkcg_gq {
 
 	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
 
-	struct rcu_head			rcu_head;
+	spinlock_t			async_bio_lock;
+	struct bio_list			async_bios;
+	struct work_struct		async_bio_work;
 
 	atomic_t			use_delay;
 	atomic64_t			delay_nsec;
 	atomic64_t			delay_start;
 	u64				last_delay;
 	int				last_use;
+
+	struct rcu_head			rcu_head;
 };
 
 typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
@@ -701,6 +705,15 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg
 					  struct bio *bio) { return false; }
 #endif
 
+bool __blkcg_punt_bio_submit(struct bio *bio);
+
+static inline bool blkcg_punt_bio_submit(struct bio *bio)
+{
+	if (bio->bi_opf & REQ_CGROUP_PUNT)
+		return __blkcg_punt_bio_submit(bio);
+	else
+		return false;
+}
 
 static inline void blkcg_bio_issue_init(struct bio *bio)
 {
@@ -848,6 +861,7 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
 
+static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
 static inline void blkcg_bio_issue_init(struct bio *bio) { }
 static inline bool blkcg_bio_issue_check(struct request_queue *q,
 					 struct bio *bio) { return true; }
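
The header split keeps the common case cheap: unflagged bios cost a single inlined bit test, only flagged ones take the out-of-line call, and the !CONFIG_BLK_CGROUP stub keeps callers free of #ifdefs. A condensed restatement of that pattern (simplified from the hunks above):

#ifdef CONFIG_BLK_CGROUP
bool __blkcg_punt_bio_submit(struct bio *bio);	/* out-of-line slow path */

static inline bool blkcg_punt_bio_submit(struct bio *bio)
{
	/* fast path: one flag test, no function call for unflagged bios */
	return (bio->bi_opf & REQ_CGROUP_PUNT) ?
		__blkcg_punt_bio_submit(bio) : false;
}
#else
/* with cgroups compiled out, the test folds away entirely */
static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
#endif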
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 6a53799c3fe2..feff3fe4467e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -311,6 +311,14 @@ enum req_flag_bits {
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
 	__REQ_BACKGROUND,	/* background IO */
 	__REQ_NOWAIT,		/* Don't wait if request will block */
+	/*
+	 * When a shared kthread needs to issue a bio for a cgroup, doing
+	 * so synchronously can lead to priority inversions as the kthread
+	 * can be trapped waiting for that cgroup.  CGROUP_PUNT flag makes
+	 * submit_bio() punt the actual issuing to a dedicated per-blkcg
+	 * work item to avoid such priority inversions.
+	 */
+	__REQ_CGROUP_PUNT,
 
 	/* command specific flags for REQ_OP_WRITE_ZEROES: */
 	__REQ_NOUNMAP,		/* do not free blocks when zeroing */
@@ -337,6 +345,8 @@ enum req_flag_bits {
 #define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND		(1ULL << __REQ_BACKGROUND)
 #define REQ_NOWAIT		(1ULL << __REQ_NOWAIT)
+#define REQ_CGROUP_PUNT		(1ULL << __REQ_CGROUP_PUNT)
+
 #define REQ_NOUNMAP		(1ULL << __REQ_NOUNMAP)
 #define REQ_HIPRI		(1ULL << __REQ_HIPRI)
 
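
REQ_CGROUP_PUNT is an ordinary bi_opf bit and combines with an op the usual way. Illustrative only (bio_alloc() shown with its two-argument form from this kernel era):

static struct bio *alloc_punted_write_bio(void)
{
	struct bio *bio = bio_alloc(GFP_NOFS, 1);

	if (bio)
		bio->bi_opf = REQ_OP_WRITE | REQ_CGROUP_PUNT;
	return bio;
}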
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index e056a22075cf..8945aac31392 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -78,6 +78,8 @@ struct writeback_control {
 	 */
 	unsigned no_cgroup_owner:1;
 
+	unsigned punt_to_cgroup:1;	/* cgrp punting, see __REQ_CGROUP_PUNT */
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct bdi_writeback *wb;	/* wb this writeback is issued under */
 	struct inode *inode;		/* inode being written out */
@@ -94,12 +96,17 @@ struct writeback_control {
 
 static inline int wbc_to_write_flags(struct writeback_control *wbc)
 {
+	int flags = 0;
+
+	if (wbc->punt_to_cgroup)
+		flags = REQ_CGROUP_PUNT;
+
 	if (wbc->sync_mode == WB_SYNC_ALL)
-		return REQ_SYNC;
+		flags |= REQ_SYNC;
 	else if (wbc->for_kupdate || wbc->for_background)
-		return REQ_BACKGROUND;
+		flags |= REQ_BACKGROUND;
 
-	return 0;
+	return flags;
 }
 
 static inline struct cgroup_subsys_state *
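
With the new bitfield, a writeback path opts in per-wbc and every bio built from that wbc inherits REQ_CGROUP_PUNT. A hypothetical filesystem helper (not part of this patch; names are illustrative):

static void fs_submit_punted_write(struct writeback_control *wbc,
				   struct bio *bio)
{
	wbc->punt_to_cgroup = 1;	/* issue from the owning blkcg's worker */
	bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
	submit_bio(bio);
}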