Diffstat (limited to 'block')
-rw-r--r--  block/bio-integrity.c    4
-rw-r--r--  block/bio.c             77
-rw-r--r--  block/blk-cgroup.c      92
-rw-r--r--  block/blk-cgroup.h      40
-rw-r--r--  block/blk-core.c       143
-rw-r--r--  block/blk-exec.c        10
-rw-r--r--  block/blk-merge.c        3
-rw-r--r--  block/blk-mq-cpumap.c    2
-rw-r--r--  block/blk-mq-tag.c      38
-rw-r--r--  block/blk-mq-tag.h       1
-rw-r--r--  block/blk-mq.c         228
-rw-r--r--  block/blk-sysfs.c        2
-rw-r--r--  block/blk.h              5
-rw-r--r--  block/bounce.c           5
-rw-r--r--  block/cfq-iosched.c    125
-rw-r--r--  block/elevator.c         8
-rw-r--r--  block/genhd.c           13
-rw-r--r--  block/ioctl.c           37
18 files changed, 534 insertions, 299 deletions
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 5cbd5d9ea61d..0436c21db7f2 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -361,7 +361,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
361 361
362 /* Restore original bio completion handler */ 362 /* Restore original bio completion handler */
363 bio->bi_end_io = bip->bip_end_io; 363 bio->bi_end_io = bip->bip_end_io;
364 bio_endio_nodec(bio, error); 364 bio_endio(bio, error);
365} 365}
366 366
367/** 367/**
@@ -388,7 +388,7 @@ void bio_integrity_endio(struct bio *bio, int error)
388 */ 388 */
389 if (error) { 389 if (error) {
390 bio->bi_end_io = bip->bip_end_io; 390 bio->bi_end_io = bip->bip_end_io;
391 bio_endio_nodec(bio, error); 391 bio_endio(bio, error);
392 392
393 return; 393 return;
394 } 394 }
diff --git a/block/bio.c b/block/bio.c
index f66a4eae16ee..259197d97de1 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -270,8 +270,8 @@ void bio_init(struct bio *bio)
270{ 270{
271 memset(bio, 0, sizeof(*bio)); 271 memset(bio, 0, sizeof(*bio));
272 bio->bi_flags = 1 << BIO_UPTODATE; 272 bio->bi_flags = 1 << BIO_UPTODATE;
273 atomic_set(&bio->bi_remaining, 1); 273 atomic_set(&bio->__bi_remaining, 1);
274 atomic_set(&bio->bi_cnt, 1); 274 atomic_set(&bio->__bi_cnt, 1);
275} 275}
276EXPORT_SYMBOL(bio_init); 276EXPORT_SYMBOL(bio_init);
277 277
@@ -292,8 +292,8 @@ void bio_reset(struct bio *bio)
292 __bio_free(bio); 292 __bio_free(bio);
293 293
294 memset(bio, 0, BIO_RESET_BYTES); 294 memset(bio, 0, BIO_RESET_BYTES);
295 bio->bi_flags = flags|(1 << BIO_UPTODATE); 295 bio->bi_flags = flags | (1 << BIO_UPTODATE);
296 atomic_set(&bio->bi_remaining, 1); 296 atomic_set(&bio->__bi_remaining, 1);
297} 297}
298EXPORT_SYMBOL(bio_reset); 298EXPORT_SYMBOL(bio_reset);
299 299
@@ -303,6 +303,17 @@ static void bio_chain_endio(struct bio *bio, int error)
303 bio_put(bio); 303 bio_put(bio);
304} 304}
305 305
306/*
307 * Increment chain count for the bio. Make sure the CHAIN flag update
308 * is visible before the raised count.
309 */
310static inline void bio_inc_remaining(struct bio *bio)
311{
312 bio->bi_flags |= (1 << BIO_CHAIN);
313 smp_mb__before_atomic();
314 atomic_inc(&bio->__bi_remaining);
315}
316
306/** 317/**
307 * bio_chain - chain bio completions 318 * bio_chain - chain bio completions
308 * @bio: the target bio 319 * @bio: the target bio
@@ -320,7 +331,7 @@ void bio_chain(struct bio *bio, struct bio *parent)
320 331
321 bio->bi_private = parent; 332 bio->bi_private = parent;
322 bio->bi_end_io = bio_chain_endio; 333 bio->bi_end_io = bio_chain_endio;
323 atomic_inc(&parent->bi_remaining); 334 bio_inc_remaining(parent);
324} 335}
325EXPORT_SYMBOL(bio_chain); 336EXPORT_SYMBOL(bio_chain);
326 337
@@ -524,13 +535,17 @@ EXPORT_SYMBOL(zero_fill_bio);
524 **/ 535 **/
525void bio_put(struct bio *bio) 536void bio_put(struct bio *bio)
526{ 537{
527 BIO_BUG_ON(!atomic_read(&bio->bi_cnt)); 538 if (!bio_flagged(bio, BIO_REFFED))
528
529 /*
530 * last put frees it
531 */
532 if (atomic_dec_and_test(&bio->bi_cnt))
533 bio_free(bio); 539 bio_free(bio);
540 else {
541 BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
542
543 /*
544 * last put frees it
545 */
546 if (atomic_dec_and_test(&bio->__bi_cnt))
547 bio_free(bio);
548 }
534} 549}
535EXPORT_SYMBOL(bio_put); 550EXPORT_SYMBOL(bio_put);
536 551
@@ -1741,6 +1756,25 @@ void bio_flush_dcache_pages(struct bio *bi)
1741EXPORT_SYMBOL(bio_flush_dcache_pages); 1756EXPORT_SYMBOL(bio_flush_dcache_pages);
1742#endif 1757#endif
1743 1758
1759static inline bool bio_remaining_done(struct bio *bio)
1760{
1761 /*
1762 * If we're not chaining, then ->__bi_remaining is always 1 and
1763 * we always end io on the first invocation.
1764 */
1765 if (!bio_flagged(bio, BIO_CHAIN))
1766 return true;
1767
1768 BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
1769
1770 if (atomic_dec_and_test(&bio->__bi_remaining)) {
1771 clear_bit(BIO_CHAIN, &bio->bi_flags);
1772 return true;
1773 }
1774
1775 return false;
1776}
1777
1744/** 1778/**
1745 * bio_endio - end I/O on a bio 1779 * bio_endio - end I/O on a bio
1746 * @bio: bio 1780 * @bio: bio
@@ -1758,15 +1792,13 @@ EXPORT_SYMBOL(bio_flush_dcache_pages);
1758void bio_endio(struct bio *bio, int error) 1792void bio_endio(struct bio *bio, int error)
1759{ 1793{
1760 while (bio) { 1794 while (bio) {
1761 BUG_ON(atomic_read(&bio->bi_remaining) <= 0);
1762
1763 if (error) 1795 if (error)
1764 clear_bit(BIO_UPTODATE, &bio->bi_flags); 1796 clear_bit(BIO_UPTODATE, &bio->bi_flags);
1765 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 1797 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1766 error = -EIO; 1798 error = -EIO;
1767 1799
1768 if (!atomic_dec_and_test(&bio->bi_remaining)) 1800 if (unlikely(!bio_remaining_done(bio)))
1769 return; 1801 break;
1770 1802
1771 /* 1803 /*
1772 * Need to have a real endio function for chained bios, 1804 * Need to have a real endio function for chained bios,
@@ -1790,21 +1822,6 @@ void bio_endio(struct bio *bio, int error)
1790EXPORT_SYMBOL(bio_endio); 1822EXPORT_SYMBOL(bio_endio);
1791 1823
1792/** 1824/**
1793 * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining
1794 * @bio: bio
1795 * @error: error, if any
1796 *
1797 * For code that has saved and restored bi_end_io; thing hard before using this
1798 * function, probably you should've cloned the entire bio.
1799 **/
1800void bio_endio_nodec(struct bio *bio, int error)
1801{
1802 atomic_inc(&bio->bi_remaining);
1803 bio_endio(bio, error);
1804}
1805EXPORT_SYMBOL(bio_endio_nodec);
1806
1807/**
1808 * bio_split - split a bio 1825 * bio_split - split a bio
1809 * @bio: bio to split 1826 * @bio: bio to split
1810 * @sectors: number of sectors to split from the front of @bio 1827 * @sectors: number of sectors to split from the front of @bio
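
The bio.c hunks above replace the old always-on bi_remaining decrement with a BIO_CHAIN-gated counter: bio_inc_remaining() raises __bi_remaining when a child is chained, and bio_endio() only pays for the atomic decrement when the flag is set. Below is a minimal userspace model of that counting scheme, using C11 atomics and simplified stand-in types rather than the kernel definitions, to illustrate why a chained parent completes only on the last endio call:

/*
 * Illustrative userspace model (not kernel code) of the __bi_remaining /
 * BIO_CHAIN scheme: plain bios complete on the first endio call, chained
 * bios only once every chained completion has arrived.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define BIO_CHAIN 1u

struct model_bio {
	unsigned int flags;
	atomic_int remaining;	/* stands in for __bi_remaining */
};

static void model_inc_remaining(struct model_bio *bio)
{
	bio->flags |= BIO_CHAIN;	/* mark before raising the count */
	atomic_fetch_add(&bio->remaining, 1);
}

static bool model_remaining_done(struct model_bio *bio)
{
	if (!(bio->flags & BIO_CHAIN))
		return true;		/* unchained: done on first call */
	if (atomic_fetch_sub(&bio->remaining, 1) == 1) {
		bio->flags &= ~BIO_CHAIN;
		return true;
	}
	return false;
}

int main(void)
{
	struct model_bio parent = { .flags = 0 };

	atomic_init(&parent.remaining, 1);
	model_inc_remaining(&parent);	/* one chained child */

	printf("after child endio:  %s\n",
	       model_remaining_done(&parent) ? "done" : "pending");
	printf("after parent endio: %s\n",
	       model_remaining_done(&parent) ? "done" : "pending");
	return 0;
}

Running this prints "pending" then "done", mirroring how the parent in the patch is finished only after every chained child has ended, while unchained bios avoid the atomic entirely.
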
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ac817b750db..6e43fa355e71 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -9,6 +9,10 @@
9 * 9 *
10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> 10 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11 * Nauman Rafique <nauman@google.com> 11 * Nauman Rafique <nauman@google.com>
12 *
13 * For policy-specific per-blkcg data:
14 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
15 * Arianna Avanzini <avanzini.arianna@gmail.com>
12 */ 16 */
13#include <linux/ioprio.h> 17#include <linux/ioprio.h>
14#include <linux/kdev_t.h> 18#include <linux/kdev_t.h>
@@ -26,8 +30,7 @@
26 30
27static DEFINE_MUTEX(blkcg_pol_mutex); 31static DEFINE_MUTEX(blkcg_pol_mutex);
28 32
29struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT, 33struct blkcg blkcg_root;
30 .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
31EXPORT_SYMBOL_GPL(blkcg_root); 34EXPORT_SYMBOL_GPL(blkcg_root);
32 35
33static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; 36static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
@@ -823,6 +826,8 @@ static struct cgroup_subsys_state *
823blkcg_css_alloc(struct cgroup_subsys_state *parent_css) 826blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
824{ 827{
825 struct blkcg *blkcg; 828 struct blkcg *blkcg;
829 struct cgroup_subsys_state *ret;
830 int i;
826 831
827 if (!parent_css) { 832 if (!parent_css) {
828 blkcg = &blkcg_root; 833 blkcg = &blkcg_root;
@@ -830,17 +835,49 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
830 } 835 }
831 836
832 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); 837 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
833 if (!blkcg) 838 if (!blkcg) {
834 return ERR_PTR(-ENOMEM); 839 ret = ERR_PTR(-ENOMEM);
840 goto free_blkcg;
841 }
842
843 for (i = 0; i < BLKCG_MAX_POLS ; i++) {
844 struct blkcg_policy *pol = blkcg_policy[i];
845 struct blkcg_policy_data *cpd;
846
847 /*
848 * If the policy hasn't been attached yet, wait for it
849 * to be attached before doing anything else. Otherwise,
850 * check if the policy requires any specific per-cgroup
851 * data: if it does, allocate and initialize it.
852 */
853 if (!pol || !pol->cpd_size)
854 continue;
855
856 BUG_ON(blkcg->pd[i]);
857 cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
858 if (!cpd) {
859 ret = ERR_PTR(-ENOMEM);
860 goto free_pd_blkcg;
861 }
862 blkcg->pd[i] = cpd;
863 cpd->plid = i;
864 pol->cpd_init_fn(blkcg);
865 }
835 866
836 blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
837 blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
838done: 867done:
839 spin_lock_init(&blkcg->lock); 868 spin_lock_init(&blkcg->lock);
840 INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); 869 INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
841 INIT_HLIST_HEAD(&blkcg->blkg_list); 870 INIT_HLIST_HEAD(&blkcg->blkg_list);
842 871
843 return &blkcg->css; 872 return &blkcg->css;
873
874free_pd_blkcg:
875 for (i--; i >= 0; i--)
876 kfree(blkcg->pd[i]);
877
878free_blkcg:
879 kfree(blkcg);
880 return ret;
844} 881}
845 882
846/** 883/**
@@ -958,8 +995,10 @@ int blkcg_activate_policy(struct request_queue *q,
958 const struct blkcg_policy *pol) 995 const struct blkcg_policy *pol)
959{ 996{
960 LIST_HEAD(pds); 997 LIST_HEAD(pds);
998 LIST_HEAD(cpds);
961 struct blkcg_gq *blkg, *new_blkg; 999 struct blkcg_gq *blkg, *new_blkg;
962 struct blkg_policy_data *pd, *n; 1000 struct blkg_policy_data *pd, *nd;
1001 struct blkcg_policy_data *cpd, *cnd;
963 int cnt = 0, ret; 1002 int cnt = 0, ret;
964 bool preloaded; 1003 bool preloaded;
965 1004
@@ -1003,7 +1042,10 @@ int blkcg_activate_policy(struct request_queue *q,
1003 1042
1004 spin_unlock_irq(q->queue_lock); 1043 spin_unlock_irq(q->queue_lock);
1005 1044
1006 /* allocate policy_data for all existing blkgs */ 1045 /*
1046 * Allocate per-blkg and per-blkcg policy data
1047 * for all existing blkgs.
1048 */
1007 while (cnt--) { 1049 while (cnt--) {
1008 pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); 1050 pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
1009 if (!pd) { 1051 if (!pd) {
@@ -1011,26 +1053,50 @@ int blkcg_activate_policy(struct request_queue *q,
1011 goto out_free; 1053 goto out_free;
1012 } 1054 }
1013 list_add_tail(&pd->alloc_node, &pds); 1055 list_add_tail(&pd->alloc_node, &pds);
1056
1057 if (!pol->cpd_size)
1058 continue;
1059 cpd = kzalloc_node(pol->cpd_size, GFP_KERNEL, q->node);
1060 if (!cpd) {
1061 ret = -ENOMEM;
1062 goto out_free;
1063 }
1064 list_add_tail(&cpd->alloc_node, &cpds);
1014 } 1065 }
1015 1066
1016 /* 1067 /*
1017 * Install the allocated pds. With @q bypassing, no new blkg 1068 * Install the allocated pds and cpds. With @q bypassing, no new blkg
1018 * should have been created while the queue lock was dropped. 1069 * should have been created while the queue lock was dropped.
1019 */ 1070 */
1020 spin_lock_irq(q->queue_lock); 1071 spin_lock_irq(q->queue_lock);
1021 1072
1022 list_for_each_entry(blkg, &q->blkg_list, q_node) { 1073 list_for_each_entry(blkg, &q->blkg_list, q_node) {
1023 if (WARN_ON(list_empty(&pds))) { 1074 if (WARN_ON(list_empty(&pds)) ||
1075 WARN_ON(pol->cpd_size && list_empty(&cpds))) {
1024 /* umm... this shouldn't happen, just abort */ 1076 /* umm... this shouldn't happen, just abort */
1025 ret = -ENOMEM; 1077 ret = -ENOMEM;
1026 goto out_unlock; 1078 goto out_unlock;
1027 } 1079 }
1080 cpd = list_first_entry(&cpds, struct blkcg_policy_data,
1081 alloc_node);
1082 list_del_init(&cpd->alloc_node);
1028 pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); 1083 pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
1029 list_del_init(&pd->alloc_node); 1084 list_del_init(&pd->alloc_node);
1030 1085
1031 /* grab blkcg lock too while installing @pd on @blkg */ 1086 /* grab blkcg lock too while installing @pd on @blkg */
1032 spin_lock(&blkg->blkcg->lock); 1087 spin_lock(&blkg->blkcg->lock);
1033 1088
1089 if (!pol->cpd_size)
1090 goto no_cpd;
1091 if (!blkg->blkcg->pd[pol->plid]) {
1092 /* Per-policy per-blkcg data */
1093 blkg->blkcg->pd[pol->plid] = cpd;
1094 cpd->plid = pol->plid;
1095 pol->cpd_init_fn(blkg->blkcg);
1096 } else { /* must free it as it has already been extracted */
1097 kfree(cpd);
1098 }
1099no_cpd:
1034 blkg->pd[pol->plid] = pd; 1100 blkg->pd[pol->plid] = pd;
1035 pd->blkg = blkg; 1101 pd->blkg = blkg;
1036 pd->plid = pol->plid; 1102 pd->plid = pol->plid;
@@ -1045,8 +1111,10 @@ out_unlock:
1045 spin_unlock_irq(q->queue_lock); 1111 spin_unlock_irq(q->queue_lock);
1046out_free: 1112out_free:
1047 blk_queue_bypass_end(q); 1113 blk_queue_bypass_end(q);
1048 list_for_each_entry_safe(pd, n, &pds, alloc_node) 1114 list_for_each_entry_safe(pd, nd, &pds, alloc_node)
1049 kfree(pd); 1115 kfree(pd);
1116 list_for_each_entry_safe(cpd, cnd, &cpds, alloc_node)
1117 kfree(cpd);
1050 return ret; 1118 return ret;
1051} 1119}
1052EXPORT_SYMBOL_GPL(blkcg_activate_policy); 1120EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1087,6 +1155,8 @@ void blkcg_deactivate_policy(struct request_queue *q,
1087 1155
1088 kfree(blkg->pd[pol->plid]); 1156 kfree(blkg->pd[pol->plid]);
1089 blkg->pd[pol->plid] = NULL; 1157 blkg->pd[pol->plid] = NULL;
1158 kfree(blkg->blkcg->pd[pol->plid]);
1159 blkg->blkcg->pd[pol->plid] = NULL;
1090 1160
1091 spin_unlock(&blkg->blkcg->lock); 1161 spin_unlock(&blkg->blkcg->lock);
1092 } 1162 }
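
The blk-cgroup.c changes above teach the cgroup side to allocate policy-specific per-blkcg data for every registered policy that declares a cpd_size, call the policy's cpd_init_fn, and unwind the partial allocations on failure. A rough userspace sketch of that allocate/init/unwind pattern follows; the names are generic stand-ins and plain calloc/free replace the kernel allocators:

/* Toy model (not kernel code) of per-group policy-data allocation. */
#include <stdio.h>
#include <stdlib.h>

#define MAX_POLS 4

struct policy {
	const char *name;
	size_t cpd_size;			/* 0: no per-group data */
	void (*cpd_init)(void *cpd);
};

static int alloc_group_data(const struct policy *pols, void *pd[MAX_POLS])
{
	int i;

	for (i = 0; i < MAX_POLS; i++) {
		if (!pols[i].name || !pols[i].cpd_size)
			continue;		/* policy absent or needs nothing */
		pd[i] = calloc(1, pols[i].cpd_size);
		if (!pd[i])
			goto unwind;
		if (pols[i].cpd_init)
			pols[i].cpd_init(pd[i]);
	}
	return 0;

unwind:		/* mirrors free_pd_blkcg: free what was already allocated */
	for (i--; i >= 0; i--) {
		free(pd[i]);
		pd[i] = NULL;
	}
	return -1;
}

static void weight_init(void *cpd)
{
	*(unsigned int *)cpd = 500;	/* e.g. a default weight */
}

int main(void)
{
	struct policy pols[MAX_POLS] = {
		{ .name = "weights", .cpd_size = sizeof(unsigned int),
		  .cpd_init = weight_init },
	};
	void *pd[MAX_POLS] = { NULL };
	int i;

	if (!alloc_group_data(pols, pd))
		printf("weights policy default: %u\n", *(unsigned int *)pd[0]);
	for (i = 0; i < MAX_POLS; i++)
		free(pd[i]);
	return 0;
}

The backward-walking unwind is the same shape as the free_pd_blkcg label added in blkcg_css_alloc() above.
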
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index c567865b5f1d..74296a78bba1 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -23,11 +23,6 @@
23/* Max limits for throttle policy */ 23/* Max limits for throttle policy */
24#define THROTL_IOPS_MAX UINT_MAX 24#define THROTL_IOPS_MAX UINT_MAX
25 25
26/* CFQ specific, out here for blkcg->cfq_weight */
27#define CFQ_WEIGHT_MIN 10
28#define CFQ_WEIGHT_MAX 1000
29#define CFQ_WEIGHT_DEFAULT 500
30
31#ifdef CONFIG_BLK_CGROUP 26#ifdef CONFIG_BLK_CGROUP
32 27
33enum blkg_rwstat_type { 28enum blkg_rwstat_type {
@@ -50,9 +45,7 @@ struct blkcg {
50 struct blkcg_gq *blkg_hint; 45 struct blkcg_gq *blkg_hint;
51 struct hlist_head blkg_list; 46 struct hlist_head blkg_list;
52 47
53 /* TODO: per-policy storage in blkcg */ 48 struct blkcg_policy_data *pd[BLKCG_MAX_POLS];
54 unsigned int cfq_weight; /* belongs to cfq */
55 unsigned int cfq_leaf_weight;
56}; 49};
57 50
58struct blkg_stat { 51struct blkg_stat {
@@ -87,6 +80,24 @@ struct blkg_policy_data {
87 struct list_head alloc_node; 80 struct list_head alloc_node;
88}; 81};
89 82
83/*
84 * Policies that need to keep per-blkcg data which is independent
85 * from any request_queue associated to it must specify its size
86 * with the cpd_size field of the blkcg_policy structure and
87 * embed a blkcg_policy_data in it. blkcg core allocates
88 * policy-specific per-blkcg structures lazily the first time
89 * they are actually needed, so it handles them together with
90 * blkgs. cpd_init() is invoked to let each policy handle
91 * per-blkcg data.
92 */
93struct blkcg_policy_data {
94 /* the policy id this per-policy data belongs to */
95 int plid;
96
97 /* used during policy activation */
98 struct list_head alloc_node;
99};
100
90/* association between a blk cgroup and a request queue */ 101/* association between a blk cgroup and a request queue */
91struct blkcg_gq { 102struct blkcg_gq {
92 /* Pointer to the associated request_queue */ 103 /* Pointer to the associated request_queue */
@@ -112,6 +123,7 @@ struct blkcg_gq {
112 struct rcu_head rcu_head; 123 struct rcu_head rcu_head;
113}; 124};
114 125
126typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
115typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); 127typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
116typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); 128typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
117typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); 129typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
@@ -122,10 +134,13 @@ struct blkcg_policy {
122 int plid; 134 int plid;
123 /* policy specific private data size */ 135 /* policy specific private data size */
124 size_t pd_size; 136 size_t pd_size;
137 /* policy specific per-blkcg data size */
138 size_t cpd_size;
125 /* cgroup files for the policy */ 139 /* cgroup files for the policy */
126 struct cftype *cftypes; 140 struct cftype *cftypes;
127 141
128 /* operations */ 142 /* operations */
143 blkcg_pol_init_cpd_fn *cpd_init_fn;
129 blkcg_pol_init_pd_fn *pd_init_fn; 144 blkcg_pol_init_pd_fn *pd_init_fn;
130 blkcg_pol_online_pd_fn *pd_online_fn; 145 blkcg_pol_online_pd_fn *pd_online_fn;
131 blkcg_pol_offline_pd_fn *pd_offline_fn; 146 blkcg_pol_offline_pd_fn *pd_offline_fn;
@@ -218,6 +233,12 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
218 return blkg ? blkg->pd[pol->plid] : NULL; 233 return blkg ? blkg->pd[pol->plid] : NULL;
219} 234}
220 235
236static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
237 struct blkcg_policy *pol)
238{
239 return blkcg ? blkcg->pd[pol->plid] : NULL;
240}
241
221/** 242/**
222 * pdata_to_blkg - get blkg associated with policy private data 243 * pdata_to_blkg - get blkg associated with policy private data
223 * @pd: policy private data of interest 244 * @pd: policy private data of interest
@@ -564,6 +585,9 @@ struct blkcg;
564struct blkg_policy_data { 585struct blkg_policy_data {
565}; 586};
566 587
588struct blkcg_policy_data {
589};
590
567struct blkcg_gq { 591struct blkcg_gq {
568}; 592};
569 593
diff --git a/block/blk-core.c b/block/blk-core.c
index fd154b94447a..f6ab750060fe 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(blk_rq_init);
117static void req_bio_endio(struct request *rq, struct bio *bio, 117static void req_bio_endio(struct request *rq, struct bio *bio,
118 unsigned int nbytes, int error) 118 unsigned int nbytes, int error)
119{ 119{
120 if (error) 120 if (error && !(rq->cmd_flags & REQ_CLONE))
121 clear_bit(BIO_UPTODATE, &bio->bi_flags); 121 clear_bit(BIO_UPTODATE, &bio->bi_flags);
122 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 122 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
123 error = -EIO; 123 error = -EIO;
@@ -128,7 +128,8 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
128 bio_advance(bio, nbytes); 128 bio_advance(bio, nbytes);
129 129
130 /* don't actually finish bio if it's part of flush sequence */ 130 /* don't actually finish bio if it's part of flush sequence */
131 if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) 131 if (bio->bi_iter.bi_size == 0 &&
132 !(rq->cmd_flags & (REQ_FLUSH_SEQ|REQ_CLONE)))
132 bio_endio(bio, error); 133 bio_endio(bio, error);
133} 134}
134 135
@@ -285,6 +286,7 @@ inline void __blk_run_queue_uncond(struct request_queue *q)
285 q->request_fn(q); 286 q->request_fn(q);
286 q->request_fn_active--; 287 q->request_fn_active--;
287} 288}
289EXPORT_SYMBOL_GPL(__blk_run_queue_uncond);
288 290
289/** 291/**
290 * __blk_run_queue - run a single device queue 292 * __blk_run_queue - run a single device queue
@@ -552,6 +554,8 @@ void blk_cleanup_queue(struct request_queue *q)
552 q->queue_lock = &q->__queue_lock; 554 q->queue_lock = &q->__queue_lock;
553 spin_unlock_irq(lock); 555 spin_unlock_irq(lock);
554 556
557 bdi_destroy(&q->backing_dev_info);
558
555 /* @q is and will stay empty, shutdown and put */ 559 /* @q is and will stay empty, shutdown and put */
556 blk_put_queue(q); 560 blk_put_queue(q);
557} 561}
@@ -732,6 +736,8 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
732} 736}
733EXPORT_SYMBOL(blk_init_queue_node); 737EXPORT_SYMBOL(blk_init_queue_node);
734 738
739static void blk_queue_bio(struct request_queue *q, struct bio *bio);
740
735struct request_queue * 741struct request_queue *
736blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, 742blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
737 spinlock_t *lock) 743 spinlock_t *lock)
@@ -1521,7 +1527,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
1521 * Caller must ensure !blk_queue_nomerges(q) beforehand. 1527 * Caller must ensure !blk_queue_nomerges(q) beforehand.
1522 */ 1528 */
1523bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 1529bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1524 unsigned int *request_count) 1530 unsigned int *request_count,
1531 struct request **same_queue_rq)
1525{ 1532{
1526 struct blk_plug *plug; 1533 struct blk_plug *plug;
1527 struct request *rq; 1534 struct request *rq;
@@ -1541,8 +1548,16 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1541 list_for_each_entry_reverse(rq, plug_list, queuelist) { 1548 list_for_each_entry_reverse(rq, plug_list, queuelist) {
1542 int el_ret; 1549 int el_ret;
1543 1550
1544 if (rq->q == q) 1551 if (rq->q == q) {
1545 (*request_count)++; 1552 (*request_count)++;
1553 /*
1554 * Only blk-mq multiple hardware queues case checks the
1555 * rq in the same queue, there should be only one such
1556 * rq in a queue
1557 **/
1558 if (same_queue_rq)
1559 *same_queue_rq = rq;
1560 }
1546 1561
1547 if (rq->q != q || !blk_rq_merge_ok(rq, bio)) 1562 if (rq->q != q || !blk_rq_merge_ok(rq, bio))
1548 continue; 1563 continue;
@@ -1576,7 +1591,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
1576 blk_rq_bio_prep(req->q, req, bio); 1591 blk_rq_bio_prep(req->q, req, bio);
1577} 1592}
1578 1593
1579void blk_queue_bio(struct request_queue *q, struct bio *bio) 1594static void blk_queue_bio(struct request_queue *q, struct bio *bio)
1580{ 1595{
1581 const bool sync = !!(bio->bi_rw & REQ_SYNC); 1596 const bool sync = !!(bio->bi_rw & REQ_SYNC);
1582 struct blk_plug *plug; 1597 struct blk_plug *plug;
@@ -1607,7 +1622,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
1607 * any locks. 1622 * any locks.
1608 */ 1623 */
1609 if (!blk_queue_nomerges(q) && 1624 if (!blk_queue_nomerges(q) &&
1610 blk_attempt_plug_merge(q, bio, &request_count)) 1625 blk_attempt_plug_merge(q, bio, &request_count, NULL))
1611 return; 1626 return;
1612 1627
1613 spin_lock_irq(q->queue_lock); 1628 spin_lock_irq(q->queue_lock);
@@ -1684,7 +1699,6 @@ out_unlock:
1684 spin_unlock_irq(q->queue_lock); 1699 spin_unlock_irq(q->queue_lock);
1685 } 1700 }
1686} 1701}
1687EXPORT_SYMBOL_GPL(blk_queue_bio); /* for device mapper only */
1688 1702
1689/* 1703/*
1690 * If bio->bi_dev is a partition, remap the location 1704 * If bio->bi_dev is a partition, remap the location
@@ -1715,8 +1729,6 @@ static void handle_bad_sector(struct bio *bio)
1715 bio->bi_rw, 1729 bio->bi_rw,
1716 (unsigned long long)bio_end_sector(bio), 1730 (unsigned long long)bio_end_sector(bio),
1717 (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); 1731 (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
1718
1719 set_bit(BIO_EOF, &bio->bi_flags);
1720} 1732}
1721 1733
1722#ifdef CONFIG_FAIL_MAKE_REQUEST 1734#ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -2901,95 +2913,22 @@ int blk_lld_busy(struct request_queue *q)
2901} 2913}
2902EXPORT_SYMBOL_GPL(blk_lld_busy); 2914EXPORT_SYMBOL_GPL(blk_lld_busy);
2903 2915
2904/** 2916void blk_rq_prep_clone(struct request *dst, struct request *src)
2905 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
2906 * @rq: the clone request to be cleaned up
2907 *
2908 * Description:
2909 * Free all bios in @rq for a cloned request.
2910 */
2911void blk_rq_unprep_clone(struct request *rq)
2912{
2913 struct bio *bio;
2914
2915 while ((bio = rq->bio) != NULL) {
2916 rq->bio = bio->bi_next;
2917
2918 bio_put(bio);
2919 }
2920}
2921EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
2922
2923/*
2924 * Copy attributes of the original request to the clone request.
2925 * The actual data parts (e.g. ->cmd, ->sense) are not copied.
2926 */
2927static void __blk_rq_prep_clone(struct request *dst, struct request *src)
2928{ 2917{
2929 dst->cpu = src->cpu; 2918 dst->cpu = src->cpu;
2930 dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; 2919 dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK);
2920 dst->cmd_flags |= REQ_NOMERGE | REQ_CLONE;
2931 dst->cmd_type = src->cmd_type; 2921 dst->cmd_type = src->cmd_type;
2932 dst->__sector = blk_rq_pos(src); 2922 dst->__sector = blk_rq_pos(src);
2933 dst->__data_len = blk_rq_bytes(src); 2923 dst->__data_len = blk_rq_bytes(src);
2934 dst->nr_phys_segments = src->nr_phys_segments; 2924 dst->nr_phys_segments = src->nr_phys_segments;
2935 dst->ioprio = src->ioprio; 2925 dst->ioprio = src->ioprio;
2936 dst->extra_len = src->extra_len; 2926 dst->extra_len = src->extra_len;
2937} 2927 dst->bio = src->bio;
2938 2928 dst->biotail = src->biotail;
2939/** 2929 dst->cmd = src->cmd;
2940 * blk_rq_prep_clone - Helper function to setup clone request 2930 dst->cmd_len = src->cmd_len;
2941 * @rq: the request to be setup 2931 dst->sense = src->sense;
2942 * @rq_src: original request to be cloned
2943 * @bs: bio_set that bios for clone are allocated from
2944 * @gfp_mask: memory allocation mask for bio
2945 * @bio_ctr: setup function to be called for each clone bio.
2946 * Returns %0 for success, non %0 for failure.
2947 * @data: private data to be passed to @bio_ctr
2948 *
2949 * Description:
2950 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
2951 * The actual data parts of @rq_src (e.g. ->cmd, ->sense)
2952 * are not copied, and copying such parts is the caller's responsibility.
2953 * Also, pages which the original bios are pointing to are not copied
2954 * and the cloned bios just point same pages.
2955 * So cloned bios must be completed before original bios, which means
2956 * the caller must complete @rq before @rq_src.
2957 */
2958int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
2959 struct bio_set *bs, gfp_t gfp_mask,
2960 int (*bio_ctr)(struct bio *, struct bio *, void *),
2961 void *data)
2962{
2963 struct bio *bio, *bio_src;
2964
2965 if (!bs)
2966 bs = fs_bio_set;
2967
2968 __rq_for_each_bio(bio_src, rq_src) {
2969 bio = bio_clone_fast(bio_src, gfp_mask, bs);
2970 if (!bio)
2971 goto free_and_out;
2972
2973 if (bio_ctr && bio_ctr(bio, bio_src, data))
2974 goto free_and_out;
2975
2976 if (rq->bio) {
2977 rq->biotail->bi_next = bio;
2978 rq->biotail = bio;
2979 } else
2980 rq->bio = rq->biotail = bio;
2981 }
2982
2983 __blk_rq_prep_clone(rq, rq_src);
2984
2985 return 0;
2986
2987free_and_out:
2988 if (bio)
2989 bio_put(bio);
2990 blk_rq_unprep_clone(rq);
2991
2992 return -ENOMEM;
2993} 2932}
2994EXPORT_SYMBOL_GPL(blk_rq_prep_clone); 2933EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
2995 2934
@@ -3031,21 +2970,20 @@ void blk_start_plug(struct blk_plug *plug)
3031{ 2970{
3032 struct task_struct *tsk = current; 2971 struct task_struct *tsk = current;
3033 2972
2973 /*
2974 * If this is a nested plug, don't actually assign it.
2975 */
2976 if (tsk->plug)
2977 return;
2978
3034 INIT_LIST_HEAD(&plug->list); 2979 INIT_LIST_HEAD(&plug->list);
3035 INIT_LIST_HEAD(&plug->mq_list); 2980 INIT_LIST_HEAD(&plug->mq_list);
3036 INIT_LIST_HEAD(&plug->cb_list); 2981 INIT_LIST_HEAD(&plug->cb_list);
3037
3038 /* 2982 /*
3039 * If this is a nested plug, don't actually assign it. It will be 2983 * Store ordering should not be needed here, since a potential
3040 * flushed on its own. 2984 * preempt will imply a full memory barrier
3041 */ 2985 */
3042 if (!tsk->plug) { 2986 tsk->plug = plug;
3043 /*
3044 * Store ordering should not be needed here, since a potential
3045 * preempt will imply a full memory barrier
3046 */
3047 tsk->plug = plug;
3048 }
3049} 2987}
3050EXPORT_SYMBOL(blk_start_plug); 2988EXPORT_SYMBOL(blk_start_plug);
3051 2989
@@ -3192,10 +3130,11 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3192 3130
3193void blk_finish_plug(struct blk_plug *plug) 3131void blk_finish_plug(struct blk_plug *plug)
3194{ 3132{
3133 if (plug != current->plug)
3134 return;
3195 blk_flush_plug_list(plug, false); 3135 blk_flush_plug_list(plug, false);
3196 3136
3197 if (plug == current->plug) 3137 current->plug = NULL;
3198 current->plug = NULL;
3199} 3138}
3200EXPORT_SYMBOL(blk_finish_plug); 3139EXPORT_SYMBOL(blk_finish_plug);
3201 3140
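
The blk_start_plug()/blk_finish_plug() rework above moves the nesting check to the front: a nested start no longer initializes anything, and a finish that does not match current->plug is ignored, so only the outermost plug is ever installed and flushed. A toy model of that pairing logic, with an ordinary global standing in for current->plug and no request flushing, is sketched below:

/* Minimal model (not kernel code) of nested plug start/finish pairing. */
#include <stdio.h>

struct plug { int id; };
static struct plug *current_plug;	/* stands in for current->plug */

static void start_plug(struct plug *p)
{
	if (current_plug)		/* nested: leave the outer plug alone */
		return;
	current_plug = p;
}

static void finish_plug(struct plug *p)
{
	if (p != current_plug)		/* not the installed (outer) plug */
		return;
	/* a real finish would flush the plugged requests here */
	current_plug = NULL;
}

int main(void)
{
	struct plug outer = { 1 }, inner = { 2 };

	start_plug(&outer);
	start_plug(&inner);		/* ignored */
	finish_plug(&inner);		/* ignored */
	printf("installed after inner finish: %d\n",
	       current_plug ? current_plug->id : 0);	/* still 1 */
	finish_plug(&outer);
	printf("installed after outer finish: %d\n",
	       current_plug ? current_plug->id : 0);	/* 0 */
	return 0;
}
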
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 9924725fa50d..3fec8a29d0fa 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -53,7 +53,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
53 rq_end_io_fn *done) 53 rq_end_io_fn *done)
54{ 54{
55 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 55 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
56 bool is_pm_resume;
57 56
58 WARN_ON(irqs_disabled()); 57 WARN_ON(irqs_disabled());
59 WARN_ON(rq->cmd_type == REQ_TYPE_FS); 58 WARN_ON(rq->cmd_type == REQ_TYPE_FS);
@@ -70,12 +69,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
70 return; 69 return;
71 } 70 }
72 71
73 /*
74 * need to check this before __blk_run_queue(), because rq can
75 * be freed before that returns.
76 */
77 is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME;
78
79 spin_lock_irq(q->queue_lock); 72 spin_lock_irq(q->queue_lock);
80 73
81 if (unlikely(blk_queue_dying(q))) { 74 if (unlikely(blk_queue_dying(q))) {
@@ -88,9 +81,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
88 81
89 __elv_add_request(q, rq, where); 82 __elv_add_request(q, rq, where);
90 __blk_run_queue(q); 83 __blk_run_queue(q);
91 /* the queue is stopped so it won't be run */
92 if (is_pm_resume)
93 __blk_run_queue_uncond(q);
94 spin_unlock_irq(q->queue_lock); 84 spin_unlock_irq(q->queue_lock);
95} 85}
96EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); 86EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index fd3fee81c23c..30a0d9f89017 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -589,7 +589,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
589 !blk_write_same_mergeable(rq->bio, bio)) 589 !blk_write_same_mergeable(rq->bio, bio))
590 return false; 590 return false;
591 591
592 if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) { 592 /* Only check gaps if the bio carries data */
593 if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && bio_has_data(bio)) {
593 struct bio_vec *bprev; 594 struct bio_vec *bprev;
594 595
595 bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1]; 596 bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1];
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 5f13f4d0bcce..1e28ddb656b8 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -24,7 +24,7 @@ static int get_first_sibling(unsigned int cpu)
24{ 24{
25 unsigned int ret; 25 unsigned int ret;
26 26
27 ret = cpumask_first(topology_thread_cpumask(cpu)); 27 ret = cpumask_first(topology_sibling_cpumask(cpu));
28 if (ret < nr_cpu_ids) 28 if (ret < nr_cpu_ids)
29 return ret; 29 return ret;
30 30
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index be3290cc0644..9b6e28830b82 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -438,6 +438,39 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx,
438 } 438 }
439} 439}
440 440
441static void bt_tags_for_each(struct blk_mq_tags *tags,
442 struct blk_mq_bitmap_tags *bt, unsigned int off,
443 busy_tag_iter_fn *fn, void *data, bool reserved)
444{
445 struct request *rq;
446 int bit, i;
447
448 if (!tags->rqs)
449 return;
450 for (i = 0; i < bt->map_nr; i++) {
451 struct blk_align_bitmap *bm = &bt->map[i];
452
453 for (bit = find_first_bit(&bm->word, bm->depth);
454 bit < bm->depth;
455 bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
456 rq = blk_mq_tag_to_rq(tags, off + bit);
457 fn(rq, data, reserved);
458 }
459
460 off += (1 << bt->bits_per_word);
461 }
462}
463
464void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
465 void *priv)
466{
467 if (tags->nr_reserved_tags)
468 bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true);
469 bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
470 false);
471}
472EXPORT_SYMBOL(blk_mq_all_tag_busy_iter);
473
441void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, 474void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
442 void *priv) 475 void *priv)
443{ 476{
@@ -580,6 +613,11 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
580 if (!tags) 613 if (!tags)
581 return NULL; 614 return NULL;
582 615
616 if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) {
617 kfree(tags);
618 return NULL;
619 }
620
583 tags->nr_tags = total_tags; 621 tags->nr_tags = total_tags;
584 tags->nr_reserved_tags = reserved_tags; 622 tags->nr_reserved_tags = reserved_tags;
585 623
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 90767b370308..75893a34237d 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -44,6 +44,7 @@ struct blk_mq_tags {
44 struct list_head page_list; 44 struct list_head page_list;
45 45
46 int alloc_policy; 46 int alloc_policy;
47 cpumask_var_t cpumask;
47}; 48};
48 49
49 50
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ade8a2d1b0aa..f53779692c77 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -89,7 +89,8 @@ static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
89 return -EBUSY; 89 return -EBUSY;
90 90
91 ret = wait_event_interruptible(q->mq_freeze_wq, 91 ret = wait_event_interruptible(q->mq_freeze_wq,
92 !q->mq_freeze_depth || blk_queue_dying(q)); 92 !atomic_read(&q->mq_freeze_depth) ||
93 blk_queue_dying(q));
93 if (blk_queue_dying(q)) 94 if (blk_queue_dying(q))
94 return -ENODEV; 95 return -ENODEV;
95 if (ret) 96 if (ret)
@@ -112,13 +113,10 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref)
112 113
113void blk_mq_freeze_queue_start(struct request_queue *q) 114void blk_mq_freeze_queue_start(struct request_queue *q)
114{ 115{
115 bool freeze; 116 int freeze_depth;
116 117
117 spin_lock_irq(q->queue_lock); 118 freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
118 freeze = !q->mq_freeze_depth++; 119 if (freeze_depth == 1) {
119 spin_unlock_irq(q->queue_lock);
120
121 if (freeze) {
122 percpu_ref_kill(&q->mq_usage_counter); 120 percpu_ref_kill(&q->mq_usage_counter);
123 blk_mq_run_hw_queues(q, false); 121 blk_mq_run_hw_queues(q, false);
124 } 122 }
@@ -143,13 +141,11 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
143 141
144void blk_mq_unfreeze_queue(struct request_queue *q) 142void blk_mq_unfreeze_queue(struct request_queue *q)
145{ 143{
146 bool wake; 144 int freeze_depth;
147 145
148 spin_lock_irq(q->queue_lock); 146 freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
149 wake = !--q->mq_freeze_depth; 147 WARN_ON_ONCE(freeze_depth < 0);
150 WARN_ON_ONCE(q->mq_freeze_depth < 0); 148 if (!freeze_depth) {
151 spin_unlock_irq(q->queue_lock);
152 if (wake) {
153 percpu_ref_reinit(&q->mq_usage_counter); 149 percpu_ref_reinit(&q->mq_usage_counter);
154 wake_up_all(&q->mq_freeze_wq); 150 wake_up_all(&q->mq_freeze_wq);
155 } 151 }
@@ -677,8 +673,11 @@ static void blk_mq_rq_timer(unsigned long priv)
677 data.next = blk_rq_timeout(round_jiffies_up(data.next)); 673 data.next = blk_rq_timeout(round_jiffies_up(data.next));
678 mod_timer(&q->timeout, data.next); 674 mod_timer(&q->timeout, data.next);
679 } else { 675 } else {
680 queue_for_each_hw_ctx(q, hctx, i) 676 queue_for_each_hw_ctx(q, hctx, i) {
681 blk_mq_tag_idle(hctx); 677 /* the hctx may be unmapped, so check it here */
678 if (blk_mq_hw_queue_mapped(hctx))
679 blk_mq_tag_idle(hctx);
680 }
682 } 681 }
683} 682}
684 683
@@ -855,6 +854,16 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
855 spin_lock(&hctx->lock); 854 spin_lock(&hctx->lock);
856 list_splice(&rq_list, &hctx->dispatch); 855 list_splice(&rq_list, &hctx->dispatch);
857 spin_unlock(&hctx->lock); 856 spin_unlock(&hctx->lock);
857 /*
858 * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but
859 * it's possible the queue is stopped and restarted again
860 * before this. Queue restart will dispatch requests. And since
861 * requests in rq_list aren't added into hctx->dispatch yet,
862 * the requests in rq_list might get lost.
863 *
864 * blk_mq_run_hw_queue() already checks the STOPPED bit
865 **/
866 blk_mq_run_hw_queue(hctx, true);
858 } 867 }
859} 868}
860 869
@@ -1224,6 +1233,38 @@ static struct request *blk_mq_map_request(struct request_queue *q,
1224 return rq; 1233 return rq;
1225} 1234}
1226 1235
1236static int blk_mq_direct_issue_request(struct request *rq)
1237{
1238 int ret;
1239 struct request_queue *q = rq->q;
1240 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q,
1241 rq->mq_ctx->cpu);
1242 struct blk_mq_queue_data bd = {
1243 .rq = rq,
1244 .list = NULL,
1245 .last = 1
1246 };
1247
1248 /*
1249 * For OK queue, we are done. For error, kill it. Any other
1250 * error (busy), just add it to our list as we previously
1251 * would have done
1252 */
1253 ret = q->mq_ops->queue_rq(hctx, &bd);
1254 if (ret == BLK_MQ_RQ_QUEUE_OK)
1255 return 0;
1256 else {
1257 __blk_mq_requeue_request(rq);
1258
1259 if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1260 rq->errors = -EIO;
1261 blk_mq_end_request(rq, rq->errors);
1262 return 0;
1263 }
1264 return -1;
1265 }
1266}
1267
1227/* 1268/*
1228 * Multiple hardware queue variant. This will not use per-process plugs, 1269 * Multiple hardware queue variant. This will not use per-process plugs,
1229 * but will attempt to bypass the hctx queueing if we can go straight to 1270 * but will attempt to bypass the hctx queueing if we can go straight to
@@ -1235,6 +1276,9 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1235 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 1276 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1236 struct blk_map_ctx data; 1277 struct blk_map_ctx data;
1237 struct request *rq; 1278 struct request *rq;
1279 unsigned int request_count = 0;
1280 struct blk_plug *plug;
1281 struct request *same_queue_rq = NULL;
1238 1282
1239 blk_queue_bounce(q, &bio); 1283 blk_queue_bounce(q, &bio);
1240 1284
@@ -1243,6 +1287,10 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1243 return; 1287 return;
1244 } 1288 }
1245 1289
1290 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1291 blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1292 return;
1293
1246 rq = blk_mq_map_request(q, bio, &data); 1294 rq = blk_mq_map_request(q, bio, &data);
1247 if (unlikely(!rq)) 1295 if (unlikely(!rq))
1248 return; 1296 return;
@@ -1253,38 +1301,42 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1253 goto run_queue; 1301 goto run_queue;
1254 } 1302 }
1255 1303
1304 plug = current->plug;
1256 /* 1305 /*
1257 * If the driver supports defer issued based on 'last', then 1306 * If the driver supports defer issued based on 'last', then
1258 * queue it up like normal since we can potentially save some 1307 * queue it up like normal since we can potentially save some
1259 * CPU this way. 1308 * CPU this way.
1260 */ 1309 */
1261 if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { 1310 if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
1262 struct blk_mq_queue_data bd = { 1311 !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
1263 .rq = rq, 1312 struct request *old_rq = NULL;
1264 .list = NULL,
1265 .last = 1
1266 };
1267 int ret;
1268 1313
1269 blk_mq_bio_to_request(rq, bio); 1314 blk_mq_bio_to_request(rq, bio);
1270 1315
1271 /* 1316 /*
1272 * For OK queue, we are done. For error, kill it. Any other 1317 * we do limited pluging. If bio can be merged, do merge.
1273 * error (busy), just add it to our list as we previously 1318 * Otherwise the existing request in the plug list will be
1274 * would have done 1319 * issued. So the plug list will have one request at most
1275 */ 1320 */
1276 ret = q->mq_ops->queue_rq(data.hctx, &bd); 1321 if (plug) {
1277 if (ret == BLK_MQ_RQ_QUEUE_OK) 1322 /*
1278 goto done; 1323 * The plug list might get flushed before this. If that
1279 else { 1324 * happens, same_queue_rq is invalid and plug list is empty
1280 __blk_mq_requeue_request(rq); 1325 **/
1281 1326 if (same_queue_rq && !list_empty(&plug->mq_list)) {
1282 if (ret == BLK_MQ_RQ_QUEUE_ERROR) { 1327 old_rq = same_queue_rq;
1283 rq->errors = -EIO; 1328 list_del_init(&old_rq->queuelist);
1284 blk_mq_end_request(rq, rq->errors);
1285 goto done;
1286 } 1329 }
1287 } 1330 list_add_tail(&rq->queuelist, &plug->mq_list);
1331 } else /* is_sync */
1332 old_rq = rq;
1333 blk_mq_put_ctx(data.ctx);
1334 if (!old_rq)
1335 return;
1336 if (!blk_mq_direct_issue_request(old_rq))
1337 return;
1338 blk_mq_insert_request(old_rq, false, true, true);
1339 return;
1288 } 1340 }
1289 1341
1290 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { 1342 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1297,7 +1349,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1297run_queue: 1349run_queue:
1298 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); 1350 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1299 } 1351 }
1300done:
1301 blk_mq_put_ctx(data.ctx); 1352 blk_mq_put_ctx(data.ctx);
1302} 1353}
1303 1354
@@ -1309,16 +1360,11 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
1309{ 1360{
1310 const int is_sync = rw_is_sync(bio->bi_rw); 1361 const int is_sync = rw_is_sync(bio->bi_rw);
1311 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 1362 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1312 unsigned int use_plug, request_count = 0; 1363 struct blk_plug *plug;
1364 unsigned int request_count = 0;
1313 struct blk_map_ctx data; 1365 struct blk_map_ctx data;
1314 struct request *rq; 1366 struct request *rq;
1315 1367
1316 /*
1317 * If we have multiple hardware queues, just go directly to
1318 * one of those for sync IO.
1319 */
1320 use_plug = !is_flush_fua && !is_sync;
1321
1322 blk_queue_bounce(q, &bio); 1368 blk_queue_bounce(q, &bio);
1323 1369
1324 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 1370 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
@@ -1326,8 +1372,8 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
1326 return; 1372 return;
1327 } 1373 }
1328 1374
1329 if (use_plug && !blk_queue_nomerges(q) && 1375 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1330 blk_attempt_plug_merge(q, bio, &request_count)) 1376 blk_attempt_plug_merge(q, bio, &request_count, NULL))
1331 return; 1377 return;
1332 1378
1333 rq = blk_mq_map_request(q, bio, &data); 1379 rq = blk_mq_map_request(q, bio, &data);
@@ -1345,21 +1391,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
1345 * utilize that to temporarily store requests until the task is 1391 * utilize that to temporarily store requests until the task is
1346 * either done or scheduled away. 1392 * either done or scheduled away.
1347 */ 1393 */
1348 if (use_plug) { 1394 plug = current->plug;
1349 struct blk_plug *plug = current->plug; 1395 if (plug) {
1350 1396 blk_mq_bio_to_request(rq, bio);
1351 if (plug) { 1397 if (list_empty(&plug->mq_list))
1352 blk_mq_bio_to_request(rq, bio); 1398 trace_block_plug(q);
1353 if (list_empty(&plug->mq_list)) 1399 else if (request_count >= BLK_MAX_REQUEST_COUNT) {
1354 trace_block_plug(q); 1400 blk_flush_plug_list(plug, false);
1355 else if (request_count >= BLK_MAX_REQUEST_COUNT) { 1401 trace_block_plug(q);
1356 blk_flush_plug_list(plug, false);
1357 trace_block_plug(q);
1358 }
1359 list_add_tail(&rq->queuelist, &plug->mq_list);
1360 blk_mq_put_ctx(data.ctx);
1361 return;
1362 } 1402 }
1403 list_add_tail(&rq->queuelist, &plug->mq_list);
1404 blk_mq_put_ctx(data.ctx);
1405 return;
1363 } 1406 }
1364 1407
1365 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { 1408 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1495,7 +1538,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1495 i++; 1538 i++;
1496 } 1539 }
1497 } 1540 }
1498
1499 return tags; 1541 return tags;
1500 1542
1501fail: 1543fail:
@@ -1571,22 +1613,6 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
1571 return NOTIFY_OK; 1613 return NOTIFY_OK;
1572} 1614}
1573 1615
1574static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
1575{
1576 struct request_queue *q = hctx->queue;
1577 struct blk_mq_tag_set *set = q->tag_set;
1578
1579 if (set->tags[hctx->queue_num])
1580 return NOTIFY_OK;
1581
1582 set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
1583 if (!set->tags[hctx->queue_num])
1584 return NOTIFY_STOP;
1585
1586 hctx->tags = set->tags[hctx->queue_num];
1587 return NOTIFY_OK;
1588}
1589
1590static int blk_mq_hctx_notify(void *data, unsigned long action, 1616static int blk_mq_hctx_notify(void *data, unsigned long action,
1591 unsigned int cpu) 1617 unsigned int cpu)
1592{ 1618{
@@ -1594,12 +1620,16 @@ static int blk_mq_hctx_notify(void *data, unsigned long action,
1594 1620
1595 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) 1621 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
1596 return blk_mq_hctx_cpu_offline(hctx, cpu); 1622 return blk_mq_hctx_cpu_offline(hctx, cpu);
1597 else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) 1623
1598 return blk_mq_hctx_cpu_online(hctx, cpu); 1624 /*
1625 * In case of CPU online, tags may be reallocated
1626 * in blk_mq_map_swqueue() after mapping is updated.
1627 */
1599 1628
1600 return NOTIFY_OK; 1629 return NOTIFY_OK;
1601} 1630}
1602 1631
1632/* hctx->ctxs will be freed in queue's release handler */
1603static void blk_mq_exit_hctx(struct request_queue *q, 1633static void blk_mq_exit_hctx(struct request_queue *q,
1604 struct blk_mq_tag_set *set, 1634 struct blk_mq_tag_set *set,
1605 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 1635 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
@@ -1618,7 +1648,6 @@ static void blk_mq_exit_hctx(struct request_queue *q,
1618 1648
1619 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1649 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1620 blk_free_flush_queue(hctx->fq); 1650 blk_free_flush_queue(hctx->fq);
1621 kfree(hctx->ctxs);
1622 blk_mq_free_bitmap(&hctx->ctx_map); 1651 blk_mq_free_bitmap(&hctx->ctx_map);
1623} 1652}
1624 1653
@@ -1775,6 +1804,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
1775 unsigned int i; 1804 unsigned int i;
1776 struct blk_mq_hw_ctx *hctx; 1805 struct blk_mq_hw_ctx *hctx;
1777 struct blk_mq_ctx *ctx; 1806 struct blk_mq_ctx *ctx;
1807 struct blk_mq_tag_set *set = q->tag_set;
1778 1808
1779 queue_for_each_hw_ctx(q, hctx, i) { 1809 queue_for_each_hw_ctx(q, hctx, i) {
1780 cpumask_clear(hctx->cpumask); 1810 cpumask_clear(hctx->cpumask);
@@ -1791,6 +1821,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
1791 1821
1792 hctx = q->mq_ops->map_queue(q, i); 1822 hctx = q->mq_ops->map_queue(q, i);
1793 cpumask_set_cpu(i, hctx->cpumask); 1823 cpumask_set_cpu(i, hctx->cpumask);
1824 cpumask_set_cpu(i, hctx->tags->cpumask);
1794 ctx->index_hw = hctx->nr_ctx; 1825 ctx->index_hw = hctx->nr_ctx;
1795 hctx->ctxs[hctx->nr_ctx++] = ctx; 1826 hctx->ctxs[hctx->nr_ctx++] = ctx;
1796 } 1827 }
@@ -1803,16 +1834,20 @@ static void blk_mq_map_swqueue(struct request_queue *q)
1803 * disable it and free the request entries. 1834 * disable it and free the request entries.
1804 */ 1835 */
1805 if (!hctx->nr_ctx) { 1836 if (!hctx->nr_ctx) {
1806 struct blk_mq_tag_set *set = q->tag_set;
1807
1808 if (set->tags[i]) { 1837 if (set->tags[i]) {
1809 blk_mq_free_rq_map(set, set->tags[i], i); 1838 blk_mq_free_rq_map(set, set->tags[i], i);
1810 set->tags[i] = NULL; 1839 set->tags[i] = NULL;
1811 hctx->tags = NULL;
1812 } 1840 }
1841 hctx->tags = NULL;
1813 continue; 1842 continue;
1814 } 1843 }
1815 1844
1845 /* unmapped hw queue can be remapped after CPU topo changed */
1846 if (!set->tags[i])
1847 set->tags[i] = blk_mq_init_rq_map(set, i);
1848 hctx->tags = set->tags[i];
1849 WARN_ON(!hctx->tags);
1850
1816 /* 1851 /*
1817 * Set the map size to the number of mapped software queues. 1852 * Set the map size to the number of mapped software queues.
1818 * This is more accurate and more efficient than looping 1853 * This is more accurate and more efficient than looping
@@ -1886,8 +1921,12 @@ void blk_mq_release(struct request_queue *q)
1886 unsigned int i; 1921 unsigned int i;
1887 1922
1888 /* hctx kobj stays in hctx */ 1923 /* hctx kobj stays in hctx */
1889 queue_for_each_hw_ctx(q, hctx, i) 1924 queue_for_each_hw_ctx(q, hctx, i) {
1925 if (!hctx)
1926 continue;
1927 kfree(hctx->ctxs);
1890 kfree(hctx); 1928 kfree(hctx);
1929 }
1891 1930
1892 kfree(q->queue_hw_ctx); 1931 kfree(q->queue_hw_ctx);
1893 1932
@@ -2047,7 +2086,7 @@ void blk_mq_free_queue(struct request_queue *q)
2047/* Basically redo blk_mq_init_queue with queue frozen */ 2086/* Basically redo blk_mq_init_queue with queue frozen */
2048static void blk_mq_queue_reinit(struct request_queue *q) 2087static void blk_mq_queue_reinit(struct request_queue *q)
2049{ 2088{
2050 WARN_ON_ONCE(!q->mq_freeze_depth); 2089 WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2051 2090
2052 blk_mq_sysfs_unregister(q); 2091 blk_mq_sysfs_unregister(q);
2053 2092
@@ -2090,9 +2129,16 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
2090 */ 2129 */
2091 list_for_each_entry(q, &all_q_list, all_q_node) 2130 list_for_each_entry(q, &all_q_list, all_q_node)
2092 blk_mq_freeze_queue_start(q); 2131 blk_mq_freeze_queue_start(q);
2093 list_for_each_entry(q, &all_q_list, all_q_node) 2132 list_for_each_entry(q, &all_q_list, all_q_node) {
2094 blk_mq_freeze_queue_wait(q); 2133 blk_mq_freeze_queue_wait(q);
2095 2134
2135 /*
2136 * timeout handler can't touch hw queue during the
2137 * reinitialization
2138 */
2139 del_timer_sync(&q->timeout);
2140 }
2141
2096 list_for_each_entry(q, &all_q_list, all_q_node) 2142 list_for_each_entry(q, &all_q_list, all_q_node)
2097 blk_mq_queue_reinit(q); 2143 blk_mq_queue_reinit(q);
2098 2144
@@ -2157,6 +2203,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2157 return 0; 2203 return 0;
2158} 2204}
2159 2205
2206struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
2207{
2208 return tags->cpumask;
2209}
2210EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
2211
2160/* 2212/*
2161 * Alloc a tag set to be associated with one or more request queues. 2213 * Alloc a tag set to be associated with one or more request queues.
2162 * May fail with EINVAL for various error conditions. May adjust the 2214 * May fail with EINVAL for various error conditions. May adjust the
@@ -2218,8 +2270,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2218 int i; 2270 int i;
2219 2271
2220 for (i = 0; i < set->nr_hw_queues; i++) { 2272 for (i = 0; i < set->nr_hw_queues; i++) {
2221 if (set->tags[i]) 2273 if (set->tags[i]) {
2222 blk_mq_free_rq_map(set, set->tags[i], i); 2274 blk_mq_free_rq_map(set, set->tags[i], i);
2275 free_cpumask_var(set->tags[i]->cpumask);
2276 }
2223 } 2277 }
2224 2278
2225 kfree(set->tags); 2279 kfree(set->tags);
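
The freeze/unfreeze hunks above drop the queue_lock around mq_freeze_depth and turn it into an atomic counter, so only the 0 -> 1 and 1 -> 0 transitions do real work (killing or reinitializing the percpu usage counter). A small C11-atomics sketch of that reference-count style, with printf placeholders for the real work, is shown below:

/* Sketch (not kernel code) of lockless freeze-depth counting. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int freeze_depth;

static void freeze_start(void)
{
	if (atomic_fetch_add(&freeze_depth, 1) + 1 == 1)
		printf("first freezer: kill usage counter, run queues\n");
}

static void unfreeze(void)
{
	int depth = atomic_fetch_sub(&freeze_depth, 1) - 1;

	if (depth < 0)
		printf("WARN: unbalanced unfreeze\n");
	else if (depth == 0)
		printf("last unfreeze: reinit usage counter, wake waiters\n");
}

int main(void)
{
	freeze_start();
	freeze_start();		/* nested freeze: no extra work */
	unfreeze();
	unfreeze();		/* last one wakes the waiters */
	return 0;
}

Because the counter itself needs no lock, blk_mq_queue_reinit() above can simply assert that the depth is non-zero with atomic_read() instead of taking queue_lock.
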
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index faaf36ade7eb..2b8fd302f677 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -522,8 +522,6 @@ static void blk_release_queue(struct kobject *kobj)
522 522
523 blk_trace_shutdown(q); 523 blk_trace_shutdown(q);
524 524
525 bdi_destroy(&q->backing_dev_info);
526
527 ida_simple_remove(&blk_queue_ida, q->id); 525 ida_simple_remove(&blk_queue_ida, q->id);
528 call_rcu(&q->rcu_head, blk_free_queue_rcu); 526 call_rcu(&q->rcu_head, blk_free_queue_rcu);
529} 527}
diff --git a/block/blk.h b/block/blk.h
index 43b036185712..026d9594142b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -78,7 +78,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
78bool bio_attempt_back_merge(struct request_queue *q, struct request *req, 78bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
79 struct bio *bio); 79 struct bio *bio);
80bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 80bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
81 unsigned int *request_count); 81 unsigned int *request_count,
82 struct request **same_queue_rq);
82 83
83void blk_account_io_start(struct request *req, bool new_io); 84void blk_account_io_start(struct request *req, bool new_io);
84void blk_account_io_completion(struct request *req, unsigned int bytes); 85void blk_account_io_completion(struct request *req, unsigned int bytes);
@@ -193,8 +194,6 @@ int blk_try_merge(struct request *rq, struct bio *bio);
193 194
194void blk_queue_congestion_threshold(struct request_queue *q); 195void blk_queue_congestion_threshold(struct request_queue *q);
195 196
196void __blk_run_queue_uncond(struct request_queue *q);
197
198int blk_dev_init(void); 197int blk_dev_init(void);
199 198
200 199
diff --git a/block/bounce.c b/block/bounce.c
index ab21ba203d5c..3ab0bce1c947 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -128,9 +128,6 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
128 struct bio_vec *bvec, *org_vec; 128 struct bio_vec *bvec, *org_vec;
129 int i; 129 int i;
130 130
131 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
132 set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
133
134 /* 131 /*
135 * free up bounce indirect pages used 132 * free up bounce indirect pages used
136 */ 133 */
@@ -221,8 +218,8 @@ bounce:
221 if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) 218 if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
222 continue; 219 continue;
223 220
224 inc_zone_page_state(to->bv_page, NR_BOUNCE);
225 to->bv_page = mempool_alloc(pool, q->bounce_gfp); 221 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
222 inc_zone_page_state(to->bv_page, NR_BOUNCE);
226 223
227 if (rw == WRITE) { 224 if (rw == WRITE) {
228 char *vto, *vfrom; 225 char *vto, *vfrom;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5da8e6e9ab4b..d8ad45ccd8fa 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -67,6 +67,11 @@ static struct kmem_cache *cfq_pool;
67#define sample_valid(samples) ((samples) > 80) 67#define sample_valid(samples) ((samples) > 80)
68#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) 68#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
69 69
70/* blkio-related constants */
71#define CFQ_WEIGHT_MIN 10
72#define CFQ_WEIGHT_MAX 1000
73#define CFQ_WEIGHT_DEFAULT 500
74
70struct cfq_ttime { 75struct cfq_ttime {
71 unsigned long last_end_request; 76 unsigned long last_end_request;
72 77
@@ -212,6 +217,15 @@ struct cfqg_stats {
212#endif /* CONFIG_CFQ_GROUP_IOSCHED */ 217#endif /* CONFIG_CFQ_GROUP_IOSCHED */
213}; 218};
214 219
220/* Per-cgroup data */
221struct cfq_group_data {
222 /* must be the first member */
223 struct blkcg_policy_data pd;
224
225 unsigned int weight;
226 unsigned int leaf_weight;
227};
228
215/* This is per cgroup per device grouping structure */ 229/* This is per cgroup per device grouping structure */
216struct cfq_group { 230struct cfq_group {
217 /* must be the first member */ 231 /* must be the first member */
@@ -446,16 +460,6 @@ CFQ_CFQQ_FNS(deep);
446CFQ_CFQQ_FNS(wait_busy); 460CFQ_CFQQ_FNS(wait_busy);
447#undef CFQ_CFQQ_FNS 461#undef CFQ_CFQQ_FNS
448 462
449static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
450{
451 return pd ? container_of(pd, struct cfq_group, pd) : NULL;
452}
453
454static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
455{
456 return pd_to_blkg(&cfqg->pd);
457}
458
459#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) 463#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
460 464
461/* cfqg stats flags */ 465/* cfqg stats flags */
@@ -600,6 +604,22 @@ static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
600 604
601#ifdef CONFIG_CFQ_GROUP_IOSCHED 605#ifdef CONFIG_CFQ_GROUP_IOSCHED
602 606
607static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
608{
609 return pd ? container_of(pd, struct cfq_group, pd) : NULL;
610}
611
612static struct cfq_group_data
613*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
614{
615 return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
616}
617
618static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
619{
620 return pd_to_blkg(&cfqg->pd);
621}
622
603static struct blkcg_policy blkcg_policy_cfq; 623static struct blkcg_policy blkcg_policy_cfq;
604 624
605static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) 625static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
@@ -607,6 +627,11 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
607 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); 627 return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
608} 628}
609 629
630static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
631{
632 return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
633}
634
610static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) 635static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
611{ 636{
612 struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; 637 struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
@@ -1544,13 +1569,28 @@ static void cfqg_stats_init(struct cfqg_stats *stats)
1544#endif 1569#endif
1545} 1570}
1546 1571
1572static void cfq_cpd_init(const struct blkcg *blkcg)
1573{
1574 struct cfq_group_data *cgd =
1575 cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]);
1576
1577 if (blkcg == &blkcg_root) {
1578 cgd->weight = 2 * CFQ_WEIGHT_DEFAULT;
1579 cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
1580 } else {
1581 cgd->weight = CFQ_WEIGHT_DEFAULT;
1582 cgd->leaf_weight = CFQ_WEIGHT_DEFAULT;
1583 }
1584}
1585
1547static void cfq_pd_init(struct blkcg_gq *blkg) 1586static void cfq_pd_init(struct blkcg_gq *blkg)
1548{ 1587{
1549 struct cfq_group *cfqg = blkg_to_cfqg(blkg); 1588 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
1589 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg);
1550 1590
1551 cfq_init_cfqg_base(cfqg); 1591 cfq_init_cfqg_base(cfqg);
1552 cfqg->weight = blkg->blkcg->cfq_weight; 1592 cfqg->weight = cgd->weight;
1553 cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight; 1593 cfqg->leaf_weight = cgd->leaf_weight;
1554 cfqg_stats_init(&cfqg->stats); 1594 cfqg_stats_init(&cfqg->stats);
1555 cfqg_stats_init(&cfqg->dead_stats); 1595 cfqg_stats_init(&cfqg->dead_stats);
1556} 1596}
@@ -1673,13 +1713,27 @@ static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
1673 1713
1674static int cfq_print_weight(struct seq_file *sf, void *v) 1714static int cfq_print_weight(struct seq_file *sf, void *v)
1675{ 1715{
1676 seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight); 1716 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1717 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1718 unsigned int val = 0;
1719
1720 if (cgd)
1721 val = cgd->weight;
1722
1723 seq_printf(sf, "%u\n", val);
1677 return 0; 1724 return 0;
1678} 1725}
1679 1726
1680static int cfq_print_leaf_weight(struct seq_file *sf, void *v) 1727static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
1681{ 1728{
1682 seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight); 1729 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1730 struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
1731 unsigned int val = 0;
1732
1733 if (cgd)
1734 val = cgd->leaf_weight;
1735
1736 seq_printf(sf, "%u\n", val);
1683 return 0; 1737 return 0;
1684} 1738}
1685 1739
@@ -1690,6 +1744,7 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
1690 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 1744 struct blkcg *blkcg = css_to_blkcg(of_css(of));
1691 struct blkg_conf_ctx ctx; 1745 struct blkg_conf_ctx ctx;
1692 struct cfq_group *cfqg; 1746 struct cfq_group *cfqg;
1747 struct cfq_group_data *cfqgd;
1693 int ret; 1748 int ret;
1694 1749
1695 ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); 1750 ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
@@ -1698,17 +1753,22 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
1698 1753
1699 ret = -EINVAL; 1754 ret = -EINVAL;
1700 cfqg = blkg_to_cfqg(ctx.blkg); 1755 cfqg = blkg_to_cfqg(ctx.blkg);
1756 cfqgd = blkcg_to_cfqgd(blkcg);
1757 if (!cfqg || !cfqgd)
1758 goto err;
1759
1701 if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { 1760 if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
1702 if (!is_leaf_weight) { 1761 if (!is_leaf_weight) {
1703 cfqg->dev_weight = ctx.v; 1762 cfqg->dev_weight = ctx.v;
1704 cfqg->new_weight = ctx.v ?: blkcg->cfq_weight; 1763 cfqg->new_weight = ctx.v ?: cfqgd->weight;
1705 } else { 1764 } else {
1706 cfqg->dev_leaf_weight = ctx.v; 1765 cfqg->dev_leaf_weight = ctx.v;
1707 cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight; 1766 cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight;
1708 } 1767 }
1709 ret = 0; 1768 ret = 0;
1710 } 1769 }
1711 1770
1771err:
1712 blkg_conf_finish(&ctx); 1772 blkg_conf_finish(&ctx);
1713 return ret ?: nbytes; 1773 return ret ?: nbytes;
1714} 1774}
@@ -1730,16 +1790,23 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
1730{ 1790{
1731 struct blkcg *blkcg = css_to_blkcg(css); 1791 struct blkcg *blkcg = css_to_blkcg(css);
1732 struct blkcg_gq *blkg; 1792 struct blkcg_gq *blkg;
1793 struct cfq_group_data *cfqgd;
1794 int ret = 0;
1733 1795
1734 if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) 1796 if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
1735 return -EINVAL; 1797 return -EINVAL;
1736 1798
1737 spin_lock_irq(&blkcg->lock); 1799 spin_lock_irq(&blkcg->lock);
1800 cfqgd = blkcg_to_cfqgd(blkcg);
1801 if (!cfqgd) {
1802 ret = -EINVAL;
1803 goto out;
1804 }
1738 1805
1739 if (!is_leaf_weight) 1806 if (!is_leaf_weight)
1740 blkcg->cfq_weight = val; 1807 cfqgd->weight = val;
1741 else 1808 else
1742 blkcg->cfq_leaf_weight = val; 1809 cfqgd->leaf_weight = val;
1743 1810
1744 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { 1811 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
1745 struct cfq_group *cfqg = blkg_to_cfqg(blkg); 1812 struct cfq_group *cfqg = blkg_to_cfqg(blkg);
@@ -1749,15 +1816,16 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
1749 1816
1750 if (!is_leaf_weight) { 1817 if (!is_leaf_weight) {
1751 if (!cfqg->dev_weight) 1818 if (!cfqg->dev_weight)
1752 cfqg->new_weight = blkcg->cfq_weight; 1819 cfqg->new_weight = cfqgd->weight;
1753 } else { 1820 } else {
1754 if (!cfqg->dev_leaf_weight) 1821 if (!cfqg->dev_leaf_weight)
1755 cfqg->new_leaf_weight = blkcg->cfq_leaf_weight; 1822 cfqg->new_leaf_weight = cfqgd->leaf_weight;
1756 } 1823 }
1757 } 1824 }
1758 1825
1826out:
1759 spin_unlock_irq(&blkcg->lock); 1827 spin_unlock_irq(&blkcg->lock);
1760 return 0; 1828 return ret;
1761} 1829}
1762 1830
1763static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, 1831static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -4477,6 +4545,18 @@ out_free:
4477 return ret; 4545 return ret;
4478} 4546}
4479 4547
4548static void cfq_registered_queue(struct request_queue *q)
4549{
4550 struct elevator_queue *e = q->elevator;
4551 struct cfq_data *cfqd = e->elevator_data;
4552
4553 /*
4554 * Default to IOPS mode with no idling for SSDs
4555 */
4556 if (blk_queue_nonrot(q))
4557 cfqd->cfq_slice_idle = 0;
4558}
4559
4480/* 4560/*
4481 * sysfs parts below --> 4561 * sysfs parts below -->
4482 */ 4562 */
@@ -4592,6 +4672,7 @@ static struct elevator_type iosched_cfq = {
4592 .elevator_may_queue_fn = cfq_may_queue, 4672 .elevator_may_queue_fn = cfq_may_queue,
4593 .elevator_init_fn = cfq_init_queue, 4673 .elevator_init_fn = cfq_init_queue,
4594 .elevator_exit_fn = cfq_exit_queue, 4674 .elevator_exit_fn = cfq_exit_queue,
4675 .elevator_registered_fn = cfq_registered_queue,
4595 }, 4676 },
4596 .icq_size = sizeof(struct cfq_io_cq), 4677 .icq_size = sizeof(struct cfq_io_cq),
4597 .icq_align = __alignof__(struct cfq_io_cq), 4678 .icq_align = __alignof__(struct cfq_io_cq),
@@ -4603,8 +4684,10 @@ static struct elevator_type iosched_cfq = {
4603#ifdef CONFIG_CFQ_GROUP_IOSCHED 4684#ifdef CONFIG_CFQ_GROUP_IOSCHED
4604static struct blkcg_policy blkcg_policy_cfq = { 4685static struct blkcg_policy blkcg_policy_cfq = {
4605 .pd_size = sizeof(struct cfq_group), 4686 .pd_size = sizeof(struct cfq_group),
4687 .cpd_size = sizeof(struct cfq_group_data),
4606 .cftypes = cfq_blkcg_files, 4688 .cftypes = cfq_blkcg_files,
4607 4689
4690 .cpd_init_fn = cfq_cpd_init,
4608 .pd_init_fn = cfq_pd_init, 4691 .pd_init_fn = cfq_pd_init,
4609 .pd_offline_fn = cfq_pd_offline, 4692 .pd_offline_fn = cfq_pd_offline,
4610 .pd_reset_stats_fn = cfq_pd_reset_stats, 4693 .pd_reset_stats_fn = cfq_pd_reset_stats,
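
Taken together, the cfq-iosched.c hunks move CFQ's per-cgroup weight and leaf_weight out of struct blkcg and into a policy-private struct cfq_group_data, which the blkcg core allocates via .cpd_size and initializes via .cpd_init_fn; the cpd_to_cfqgd()/blkcg_to_cfqgd() helpers then recover that wrapper with container_of(). Below is a stand-alone sketch of the embed-and-recover pattern using pared-down stand-ins for the kernel types, not the real definitions:

#include <stddef.h>
#include <stdio.h>

/* User-space stand-in for the kernel's container_of() macro. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct blkcg_policy_data {		/* generic per-cgroup policy data */
	int plid;
};

struct cfq_group_data {			/* policy-specific wrapper */
	struct blkcg_policy_data pd;	/* must be the first member */
	unsigned int weight;
	unsigned int leaf_weight;
};

static struct cfq_group_data *cpd_to_cfqgd(struct blkcg_policy_data *cpd)
{
	return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
}

int main(void)
{
	struct cfq_group_data cgd = { .weight = 500, .leaf_weight = 500 };
	struct blkcg_policy_data *cpd = &cgd.pd;

	/* Recover the wrapper from the embedded generic struct. */
	printf("weight=%u leaf_weight=%u\n",
	       cpd_to_cfqgd(cpd)->weight, cpd_to_cfqgd(cpd)->leaf_weight);
	return 0;
}
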
diff --git a/block/elevator.c b/block/elevator.c
index 59794d0d38e3..942579d04128 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -157,7 +157,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
157 157
158 eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node); 158 eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
159 if (unlikely(!eq)) 159 if (unlikely(!eq))
160 goto err; 160 return NULL;
161 161
162 eq->type = e; 162 eq->type = e;
163 kobject_init(&eq->kobj, &elv_ktype); 163 kobject_init(&eq->kobj, &elv_ktype);
@@ -165,10 +165,6 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
165 hash_init(eq->hash); 165 hash_init(eq->hash);
166 166
167 return eq; 167 return eq;
168err:
169 kfree(eq);
170 elevator_put(e);
171 return NULL;
172} 168}
173EXPORT_SYMBOL(elevator_alloc); 169EXPORT_SYMBOL(elevator_alloc);
174 170
@@ -810,6 +806,8 @@ int elv_register_queue(struct request_queue *q)
810 } 806 }
811 kobject_uevent(&e->kobj, KOBJ_ADD); 807 kobject_uevent(&e->kobj, KOBJ_ADD);
812 e->registered = 1; 808 e->registered = 1;
809 if (e->type->ops.elevator_registered_fn)
810 e->type->ops.elevator_registered_fn(q);
813 } 811 }
814 return error; 812 return error;
815} 813}
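
The elevator.c hunks simplify the allocation-failure path in elevator_alloc() and make elv_register_queue() invoke the new, optional elevator_registered_fn hook, which is what lets CFQ switch a freshly registered non-rotational queue into IOPS mode. A compilable user-space sketch of that optional-callback pattern, with purely illustrative types and names:

#include <stdio.h>

struct mock_queue {
	int nonrot;			/* 1 if the device is non-rotational */
};

struct elv_ops_sketch {
	void (*elevator_registered_fn)(struct mock_queue *q);
};

/* Stand-in for cfq_registered_queue(): disable idling on SSDs. */
static void cfq_registered_sketch(struct mock_queue *q)
{
	if (q->nonrot)
		printf("non-rotational queue: idling disabled (IOPS mode)\n");
}

/* Stand-in for elv_register_queue(): the hook is optional. */
static void register_elevator_sketch(struct mock_queue *q,
				     const struct elv_ops_sketch *ops)
{
	/* ... kobject/sysfs registration would happen here ... */
	if (ops->elevator_registered_fn)
		ops->elevator_registered_fn(q);
}

int main(void)
{
	struct mock_queue q = { .nonrot = 1 };
	struct elv_ops_sketch ops = {
		.elevator_registered_fn = cfq_registered_sketch,
	};

	register_elevator_sketch(&q, &ops);
	return 0;
}
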
diff --git a/block/genhd.c b/block/genhd.c
index 0a536dc05f3b..ea982eadaf63 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -422,9 +422,9 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
422 /* allocate ext devt */ 422 /* allocate ext devt */
423 idr_preload(GFP_KERNEL); 423 idr_preload(GFP_KERNEL);
424 424
425 spin_lock(&ext_devt_lock); 425 spin_lock_bh(&ext_devt_lock);
426 idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT); 426 idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT);
427 spin_unlock(&ext_devt_lock); 427 spin_unlock_bh(&ext_devt_lock);
428 428
429 idr_preload_end(); 429 idr_preload_end();
430 if (idx < 0) 430 if (idx < 0)
@@ -449,9 +449,9 @@ void blk_free_devt(dev_t devt)
449 return; 449 return;
450 450
451 if (MAJOR(devt) == BLOCK_EXT_MAJOR) { 451 if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
452 spin_lock(&ext_devt_lock); 452 spin_lock_bh(&ext_devt_lock);
453 idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); 453 idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
454 spin_unlock(&ext_devt_lock); 454 spin_unlock_bh(&ext_devt_lock);
455 } 455 }
456} 456}
457 457
@@ -653,7 +653,6 @@ void del_gendisk(struct gendisk *disk)
653 disk->flags &= ~GENHD_FL_UP; 653 disk->flags &= ~GENHD_FL_UP;
654 654
655 sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); 655 sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
656 bdi_unregister(&disk->queue->backing_dev_info);
657 blk_unregister_queue(disk); 656 blk_unregister_queue(disk);
658 blk_unregister_region(disk_devt(disk), disk->minors); 657 blk_unregister_region(disk_devt(disk), disk->minors);
659 658
@@ -691,13 +690,13 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
691 } else { 690 } else {
692 struct hd_struct *part; 691 struct hd_struct *part;
693 692
694 spin_lock(&ext_devt_lock); 693 spin_lock_bh(&ext_devt_lock);
695 part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); 694 part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
696 if (part && get_disk(part_to_disk(part))) { 695 if (part && get_disk(part_to_disk(part))) {
697 *partno = part->partno; 696 *partno = part->partno;
698 disk = part_to_disk(part); 697 disk = part_to_disk(part);
699 } 698 }
700 spin_unlock(&ext_devt_lock); 699 spin_unlock_bh(&ext_devt_lock);
701 } 700 }
702 701
703 return disk; 702 return disk;
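
The genhd.c hunks switch every ext_devt_lock acquisition to the _bh spinlock variants, presumably because the devt IDR can also be reached from softirq context; disabling local bottom halves while the lock is held keeps a softirq on the same CPU from spinning forever against a process-context holder. A kernel-style fragment of that pattern around an IDR, illustrative only and not a buildable module on its own:

static DEFINE_SPINLOCK(example_devt_lock);
static DEFINE_IDR(example_devt_idr);

/*
 * Safe to call from process context even if lookups also happen in
 * softirq context: local bottom halves stay disabled while the lock
 * is held, so a softirq cannot preempt the holder and deadlock.
 */
static void *example_devt_lookup(unsigned long id)
{
	void *part;

	spin_lock_bh(&example_devt_lock);
	part = idr_find(&example_devt_idr, id);
	spin_unlock_bh(&example_devt_lock);

	return part;
}
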
diff --git a/block/ioctl.c b/block/ioctl.c
index 7d8befde2aca..8061eba42887 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -150,21 +150,48 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
150 } 150 }
151} 151}
152 152
153static int blkdev_reread_part(struct block_device *bdev) 153/*
154	 * This is an exported API for block drivers; it does not	 154	 * This is an exported API for block drivers; it does not
155	 * acquire bd_mutex. Use it only when the caller already	 155	 * acquire bd_mutex. Use it only when the caller already
156	 * holds bd_mutex.	 156	 * holds bd_mutex.
157 */
158int __blkdev_reread_part(struct block_device *bdev)
154{ 159{
155 struct gendisk *disk = bdev->bd_disk; 160 struct gendisk *disk = bdev->bd_disk;
156 int res;
157 161
158 if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) 162 if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains)
159 return -EINVAL; 163 return -EINVAL;
160 if (!capable(CAP_SYS_ADMIN)) 164 if (!capable(CAP_SYS_ADMIN))
161 return -EACCES; 165 return -EACCES;
162 if (!mutex_trylock(&bdev->bd_mutex)) 166
163 return -EBUSY; 167 lockdep_assert_held(&bdev->bd_mutex);
164 res = rescan_partitions(disk, bdev); 168
169 return rescan_partitions(disk, bdev);
170}
171EXPORT_SYMBOL(__blkdev_reread_part);
172
173/*
174	 * This is an exported API for block drivers; it acquires	 174	 * This is an exported API for block drivers; it acquires
175	 * bd_mutex itself. If bd_mutex is already held in the current	 175	 * bd_mutex itself. If bd_mutex is already held in the current
176	 * context, call __blkdev_reread_part() instead.	 176	 * context, call __blkdev_reread_part() instead.
177	 *	 177	 *
178	 * Make sure no lock held by the current context is also needed	 178	 * Make sure no lock held by the current context is also needed
179	 * in the open()/close() handlers or in the I/O path, otherwise	 179	 * in the open()/close() handlers or in the I/O path, otherwise
180	 * an ABBA deadlock is possible:	 180	 * an ABBA deadlock is possible:
181	 * - bd_mutex is held before calling the block driver's	 181	 * - bd_mutex is held before calling the block driver's
182	 *   open/close handlers	 182	 *   open/close handlers
183 */
184int blkdev_reread_part(struct block_device *bdev)
185{
186 int res;
187
188 mutex_lock(&bdev->bd_mutex);
189 res = __blkdev_reread_part(bdev);
165 mutex_unlock(&bdev->bd_mutex); 190 mutex_unlock(&bdev->bd_mutex);
191
166 return res; 192 return res;
167} 193}
194EXPORT_SYMBOL(blkdev_reread_part);
168 195
169static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, 196static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
170 uint64_t len, int secure) 197 uint64_t len, int secure)
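
blkdev_reread_part() is split above into a wrapper that takes bd_mutex and a __blkdev_reread_part() core that requires the caller to already hold it (documented with lockdep_assert_held()), so drivers that hold bd_mutex can rescan partitions without the old mutex_trylock() dance. A small user-space sketch of that locked/unlocked API pairing, with purely illustrative names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t dev_mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Core helper: the caller must already hold dev_mutex (the kernel
 * version documents and checks this with lockdep_assert_held()).
 */
static int __reread_part_sketch(void)
{
	printf("rescanning partition table\n");
	return 0;
}

/*
 * Convenience wrapper: takes dev_mutex itself, so it must not be
 * called from a context that already holds it.
 */
static int reread_part_sketch(void)
{
	int res;

	pthread_mutex_lock(&dev_mutex);
	res = __reread_part_sketch();
	pthread_mutex_unlock(&dev_mutex);

	return res;
}

int main(void)
{
	return reread_part_sketch();
}
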