author     Linus Torvalds <torvalds@linux-foundation.org>  2010-10-22 20:00:32 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2010-10-22 20:00:32 -0400
commit     e9dd2b6837e26fe202708cce5ea4bb4ee3e3482e (patch)
tree       f42fd892495bfc4cbb740d06b016d267c9c42d00
parent     4f3a29dadaf999a273f1e7fe2476595d0283eef3 (diff)
parent     b4627321e18582dcbdeb45d77df29d3177107c65 (diff)
Merge branch 'for-2.6.37/core' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.37/core' of git://git.kernel.dk/linux-2.6-block: (39 commits)
  cfq-iosched: Fix a gcc 4.5 warning and put some comments
  block: Turn bvec_k{un,}map_irq() into static inline functions
  block: fix accounting bug on cross partition merges
  block: Make the integrity mapped property a bio flag
  block: Fix double free in blk_integrity_unregister
  block: Ensure physical block size is unsigned int
  blkio-throttle: Fix possible multiplication overflow in iops calculations
  blkio-throttle: limit max iops value to UINT_MAX
  blkio-throttle: There is no need to convert jiffies to milli seconds
  blkio-throttle: Fix link failure failure on i386
  blkio: Recalculate the throttled bio dispatch time upon throttle limit change
  blkio: Add root group to td->tg_list
  blkio: deletion of a cgroup was causes oops
  blkio: Do not export throttle files if CONFIG_BLK_DEV_THROTTLING=n
  block: set the bounce_pfn to the actual DMA limit rather than to max memory
  block: revert bad fix for memory hotplug causing bounces
  Fix compile error in blk-exec.c for !CONFIG_DETECT_HUNG_TASK
  block: set the bounce_pfn to the actual DMA limit rather than to max memory
  block: Prevent hang_check firing during long I/O
  cfq: improve fsync performance for small files
  ...

Fix up trivial conflicts due to __rcu sparse annotation in include/linux/genhd.h
-rw-r--r--  Documentation/cgroups/blkio-controller.txt  106
-rw-r--r--  block/Kconfig  12
-rw-r--r--  block/Makefile  1
-rw-r--r--  block/blk-cgroup.c  804
-rw-r--r--  block/blk-cgroup.h  87
-rw-r--r--  block/blk-core.c  53
-rw-r--r--  block/blk-exec.c  9
-rw-r--r--  block/blk-integrity.c  94
-rw-r--r--  block/blk-map.c  5
-rw-r--r--  block/blk-merge.c  25
-rw-r--r--  block/blk-settings.c  12
-rw-r--r--  block/blk-sysfs.c  11
-rw-r--r--  block/blk-throttle.c  1123
-rw-r--r--  block/blk.h  12
-rw-r--r--  block/cfq-iosched.c  39
-rw-r--r--  block/cfq.h  2
-rw-r--r--  block/genhd.c  30
-rw-r--r--  block/ioctl.c  2
-rw-r--r--  drivers/block/drbd/drbd_receiver.c  1
-rw-r--r--  drivers/md/dm-snap.c  2
-rw-r--r--  drivers/md/dm-table.c  5
-rw-r--r--  drivers/s390/scsi/zfcp_scsi.c  1
-rw-r--r--  drivers/scsi/hosts.c  1
-rw-r--r--  drivers/scsi/scsi_lib.c  26
-rw-r--r--  drivers/scsi/scsi_sysfs.c  2
-rw-r--r--  drivers/scsi/sd_dif.c  11
-rw-r--r--  drivers/scsi/sg.c  2
-rw-r--r--  fs/jbd/commit.c  2
-rw-r--r--  fs/jbd2/commit.c  2
-rw-r--r--  fs/partitions/check.c  35
-rw-r--r--  fs/partitions/check.h  3
-rw-r--r--  fs/partitions/efi.c  25
-rw-r--r--  include/linux/bio.h  15
-rw-r--r--  include/linux/blk_types.h  6
-rw-r--r--  include/linux/blkdev.h  66
-rw-r--r--  include/linux/elevator.h  2
-rw-r--r--  include/linux/genhd.h  54
-rw-r--r--  include/linux/kernel.h  10
-rw-r--r--  include/linux/sched.h  3
-rw-r--r--  include/scsi/scsi.h  6
-rw-r--r--  include/scsi/scsi_host.h  7
-rw-r--r--  init/Kconfig  9
-rw-r--r--  init/do_mounts.c  70
43 files changed, 2494 insertions, 299 deletions
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 6919d62591d9..d6da611f8f63 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -8,12 +8,17 @@ both at leaf nodes as well as at intermediate nodes in a storage hierarchy.
8Plan is to use the same cgroup based management interface for blkio controller 8Plan is to use the same cgroup based management interface for blkio controller
9and based on user options switch IO policies in the background. 9and based on user options switch IO policies in the background.
10 10
11In the first phase, this patchset implements proportional weight time based 11Currently two IO control policies are implemented. First one is proportional
12division of disk policy. It is implemented in CFQ. Hence this policy takes 12weight time based division of disk policy. It is implemented in CFQ. Hence
13effect only on leaf nodes when CFQ is being used. 13this policy takes effect only on leaf nodes when CFQ is being used. The second
14one is a throttling policy which can be used to specify upper IO rate limits
15on devices. This policy is implemented in the generic block layer and can be
16used on leaf nodes as well as higher level logical devices like device mapper.
14 17
15HOWTO 18HOWTO
16===== 19=====
20Proportional Weight division of bandwidth
21-----------------------------------------
17You can do a very simple testing of running two dd threads in two different 22You can do a very simple testing of running two dd threads in two different
18cgroups. Here is what you can do. 23cgroups. Here is what you can do.
19 24
@@ -55,6 +60,35 @@ cgroups. Here is what you can do.
55 group dispatched to the disk. We provide fairness in terms of disk time, so 60 group dispatched to the disk. We provide fairness in terms of disk time, so
56 ideally io.disk_time of cgroups should be in proportion to the weight. 61 ideally io.disk_time of cgroups should be in proportion to the weight.
57 62
63Throttling/Upper Limit policy
64-----------------------------
65- Enable Block IO controller
66 CONFIG_BLK_CGROUP=y
67
68- Enable throttling in block layer
69 CONFIG_BLK_DEV_THROTTLING=y
70
71- Mount blkio controller
72 mount -t cgroup -o blkio none /cgroup/blkio
73
74- Specify a bandwidth rate on a particular device for the root group. The
75 format of the policy is "<major>:<minor> <bytes_per_second>".
76
77 echo "8:16 1048576" > /cgroup/blkio/blkio.read_bps_device
78
79 The above puts a limit of 1MB/second on reads for the root group on the
80 device with major/minor number 8:16.
81
82- Run dd to read a file and see if rate is throttled to 1MB/s or not.
83
84 # dd if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
85 # iflag=direct
86 1024+0 records in
87 1024+0 records out
88 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
89
90 Limits for writes can be put using blkio.write_bps_device file.
91
58Various user visible config options 92Various user visible config options
59=================================== 93===================================
60CONFIG_BLK_CGROUP 94CONFIG_BLK_CGROUP
@@ -68,8 +102,13 @@ CONFIG_CFQ_GROUP_IOSCHED
68 - Enables group scheduling in CFQ. Currently only 1 level of group 102 - Enables group scheduling in CFQ. Currently only 1 level of group
69 creation is allowed. 103 creation is allowed.
70 104
105CONFIG_BLK_DEV_THROTTLING
106 - Enable block device throttling support in block layer.
107
71Details of cgroup files 108Details of cgroup files
72======================= 109=======================
110Proportional weight policy files
111--------------------------------
73- blkio.weight 112- blkio.weight
74 - Specifies per cgroup weight. This is default weight of the group 113 - Specifies per cgroup weight. This is default weight of the group
75 on all the devices until and unless overridden by per device rule. 114 on all the devices until and unless overridden by per device rule.
@@ -210,6 +249,67 @@ Details of cgroup files
210 and minor number of the device and third field specifies the number 249 and minor number of the device and third field specifies the number
211 of times a group was dequeued from a particular device. 250 of times a group was dequeued from a particular device.
212 251
252Throttling/Upper limit policy files
253-----------------------------------
254- blkio.throttle.read_bps_device
255 - Specifies upper limit on READ rate from the device. IO rate is
256 specified in bytes per second. Rules are per device. Following is
257 the format.
258
259 echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.read_bps_device
260
261- blkio.throttle.write_bps_device
262 - Specifies upper limit on WRITE rate to the device. IO rate is
263 specified in bytes per second. Rules are per device. Following is
264 the format.
265
266 echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.write_bps_device
267
268- blkio.throttle.read_iops_device
269 - Specifies upper limit on READ rate from the device. IO rate is
270 specified in IO per second. Rules are per device. Following is
271 the format.
272
273 echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.read_iops_device
274
275- blkio.throttle.write_iops_device
276 - Specifies upper limit on WRITE rate to the device. IO rate is
277 specified in IO per second. Rules are per device. Following is
278 the format.
279
280 echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.write_iops_device
281
282Note: If both BW and IOPS rules are specified for a device, then IO is
283 subject to both constraints.
284
285- blkio.throttle.io_serviced
286 - Number of IOs (bio) completed to/from the disk by the group (as
287 seen by throttling policy). These are further divided by the type
288 of operation - read or write, sync or async. First two fields specify
289 the major and minor number of the device, third field specifies the
290 operation type and the fourth field specifies the number of IOs.
291
292 blkio.io_serviced does accounting as seen by CFQ and counts are in
293 number of requests (struct request). On the other hand,
294 blkio.throttle.io_serviced counts the number of IOs in terms of the
295 number of bios as seen by the throttling policy. These bios can later
296 be merged by the elevator, so the total number of completed requests
297 can be smaller.
298
299- blkio.throttle.io_service_bytes
300 - Number of bytes transferred to/from the disk by the group. These
301 are further divided by the type of operation - read or write, sync
302 or async. First two fields specify the major and minor number of the
303 device, third field specifies the operation type and the fourth field
304 specifies the number of bytes.
305
306 These numbers should roughly be same as blkio.io_service_bytes as
307 updated by CFQ. The difference between two is that
308 blkio.io_service_bytes will not be updated if CFQ is not operating
309 on request queue.
310
311Common files among various policies
312-----------------------------------
213- blkio.reset_stats 313- blkio.reset_stats
214 - Writing an int to this file will result in resetting all the stats 314 - Writing an int to this file will result in resetting all the stats
215 for that cgroup. 315 for that cgroup.
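The HOWTO and file descriptions above can be exercised end to end with a short shell session. This is only a sketch: it assumes a kernel built with CONFIG_BLK_CGROUP=y and CONFIG_BLK_DEV_THROTTLING=y, the /cgroup/blkio mount point used in the HOWTO, and a test disk at major/minor 8:16 holding a readable file; the file names follow the blkio.throttle.* cftype entries added in block/blk-cgroup.c, and the device numbers and paths are examples only.

    # Mount the blkio controller
    mount -t cgroup -o blkio none /cgroup/blkio

    # Cap reads from device 8:16 at 1 MB/s for the root group
    echo "8:16 1048576" > /cgroup/blkio/blkio.throttle.read_bps_device

    # O_DIRECT reads from that device should now complete at roughly 1.0 MB/s
    dd if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 iflag=direct

    # Per-device counters as seen by the throttling policy (bios, not requests)
    cat /cgroup/blkio/blkio.throttle.io_serviced
    cat /cgroup/blkio/blkio.throttle.io_service_bytes

Write limits work the same way through blkio.throttle.write_bps_device, and IOPS limits through the corresponding *_iops_device files.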
diff --git a/block/Kconfig b/block/Kconfig
index 9be0b56eaee1..6c9213ef15a1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,6 +77,18 @@ config BLK_DEV_INTEGRITY
77 T10/SCSI Data Integrity Field or the T13/ATA External Path 77 T10/SCSI Data Integrity Field or the T13/ATA External Path
78 Protection. If in doubt, say N. 78 Protection. If in doubt, say N.
79 79
80config BLK_DEV_THROTTLING
81 bool "Block layer bio throttling support"
82 depends on BLK_CGROUP=y && EXPERIMENTAL
83 default n
84 ---help---
85 Block layer bio throttling support. It can be used to limit
86 the IO rate to a device. IO rate policies are per cgroup and
87 one needs to mount and use blkio cgroup controller for creating
88 cgroups and specifying per device IO rate policies.
89
90 See Documentation/cgroups/blkio-controller.txt for more information.
91
80endif # BLOCK 92endif # BLOCK
81 93
82config BLOCK_COMPAT 94config BLOCK_COMPAT
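The new Kconfig entry only builds the throttling code; as its dependency line shows, the blkio controller itself must also be enabled. A quick way to confirm both options on a running kernel, assuming the config is exposed via CONFIG_IKCONFIG_PROC (otherwise check the installed /boot/config-* file):

    zcat /proc/config.gz | grep -E 'CONFIG_BLK_CGROUP|CONFIG_BLK_DEV_THROTTLING'
    # or, with a config file shipped alongside the kernel image:
    grep -E 'CONFIG_BLK_CGROUP|CONFIG_BLK_DEV_THROTTLING' /boot/config-$(uname -r)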
diff --git a/block/Makefile b/block/Makefile
index 0bb499a739cd..c850d5ef80a2 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
9 9
10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
11obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o 11obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
12obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
12obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o 13obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
13obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o 14obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
14obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o 15obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2fef1ef931a0..b1febd0f6d2a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -37,6 +37,12 @@ static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
37static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); 37static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
38static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); 38static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
39 39
40/* for encoding cft->private value on file */
41#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val))
42/* What policy owns the file, proportional or throttle */
43#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff)
44#define BLKIOFILE_ATTR(val) ((val) & 0xffff)
45
40struct cgroup_subsys blkio_subsys = { 46struct cgroup_subsys blkio_subsys = {
41 .name = "blkio", 47 .name = "blkio",
42 .create = blkiocg_create, 48 .create = blkiocg_create,
@@ -59,6 +65,27 @@ static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
59 list_add(&pn->node, &blkcg->policy_list); 65 list_add(&pn->node, &blkcg->policy_list);
60} 66}
61 67
68static inline bool cftype_blkg_same_policy(struct cftype *cft,
69 struct blkio_group *blkg)
70{
71 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
72
73 if (blkg->plid == plid)
74 return 1;
75
76 return 0;
77}
78
79/* Determines if policy node matches cgroup file being accessed */
80static inline bool pn_matches_cftype(struct cftype *cft,
81 struct blkio_policy_node *pn)
82{
83 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
84 int fileid = BLKIOFILE_ATTR(cft->private);
85
86 return (plid == pn->plid && fileid == pn->fileid);
87}
88
62/* Must be called with blkcg->lock held */ 89/* Must be called with blkcg->lock held */
63static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) 90static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
64{ 91{
@@ -67,12 +94,13 @@ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
67 94
68/* Must be called with blkcg->lock held */ 95/* Must be called with blkcg->lock held */
69static struct blkio_policy_node * 96static struct blkio_policy_node *
70blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) 97blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
98 enum blkio_policy_id plid, int fileid)
71{ 99{
72 struct blkio_policy_node *pn; 100 struct blkio_policy_node *pn;
73 101
74 list_for_each_entry(pn, &blkcg->policy_list, node) { 102 list_for_each_entry(pn, &blkcg->policy_list, node) {
75 if (pn->dev == dev) 103 if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
76 return pn; 104 return pn;
77 } 105 }
78 106
@@ -86,6 +114,67 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
86} 114}
87EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); 115EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
88 116
117static inline void
118blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
119{
120 struct blkio_policy_type *blkiop;
121
122 list_for_each_entry(blkiop, &blkio_list, list) {
123 /* If this policy does not own the blkg, do not send updates */
124 if (blkiop->plid != blkg->plid)
125 continue;
126 if (blkiop->ops.blkio_update_group_weight_fn)
127 blkiop->ops.blkio_update_group_weight_fn(blkg->key,
128 blkg, weight);
129 }
130}
131
132static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
133 int fileid)
134{
135 struct blkio_policy_type *blkiop;
136
137 list_for_each_entry(blkiop, &blkio_list, list) {
138
139 /* If this policy does not own the blkg, do not send updates */
140 if (blkiop->plid != blkg->plid)
141 continue;
142
143 if (fileid == BLKIO_THROTL_read_bps_device
144 && blkiop->ops.blkio_update_group_read_bps_fn)
145 blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
146 blkg, bps);
147
148 if (fileid == BLKIO_THROTL_write_bps_device
149 && blkiop->ops.blkio_update_group_write_bps_fn)
150 blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
151 blkg, bps);
152 }
153}
154
155static inline void blkio_update_group_iops(struct blkio_group *blkg,
156 unsigned int iops, int fileid)
157{
158 struct blkio_policy_type *blkiop;
159
160 list_for_each_entry(blkiop, &blkio_list, list) {
161
162 /* If this policy does not own the blkg, do not send updates */
163 if (blkiop->plid != blkg->plid)
164 continue;
165
166 if (fileid == BLKIO_THROTL_read_iops_device
167 && blkiop->ops.blkio_update_group_read_iops_fn)
168 blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
169 blkg, iops);
170
171 if (fileid == BLKIO_THROTL_write_iops_device
172 && blkiop->ops.blkio_update_group_write_iops_fn)
173 blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
174 blkg,iops);
175 }
176}
177
89/* 178/*
90 * Add to the appropriate stat variable depending on the request type. 179 * Add to the appropriate stat variable depending on the request type.
91 * This should be called with the blkg->stats_lock held. 180 * This should be called with the blkg->stats_lock held.
@@ -341,7 +430,8 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
341EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); 430EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
342 431
343void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 432void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
344 struct blkio_group *blkg, void *key, dev_t dev) 433 struct blkio_group *blkg, void *key, dev_t dev,
434 enum blkio_policy_id plid)
345{ 435{
346 unsigned long flags; 436 unsigned long flags;
347 437
@@ -350,6 +440,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
350 rcu_assign_pointer(blkg->key, key); 440 rcu_assign_pointer(blkg->key, key);
351 blkg->blkcg_id = css_id(&blkcg->css); 441 blkg->blkcg_id = css_id(&blkcg->css);
352 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); 442 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
443 blkg->plid = plid;
353 spin_unlock_irqrestore(&blkcg->lock, flags); 444 spin_unlock_irqrestore(&blkcg->lock, flags);
354 /* Need to take css reference ? */ 445 /* Need to take css reference ? */
355 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); 446 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
@@ -408,51 +499,6 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
408} 499}
409EXPORT_SYMBOL_GPL(blkiocg_lookup_group); 500EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
410 501
411#define SHOW_FUNCTION(__VAR) \
412static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \
413 struct cftype *cftype) \
414{ \
415 struct blkio_cgroup *blkcg; \
416 \
417 blkcg = cgroup_to_blkio_cgroup(cgroup); \
418 return (u64)blkcg->__VAR; \
419}
420
421SHOW_FUNCTION(weight);
422#undef SHOW_FUNCTION
423
424static int
425blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
426{
427 struct blkio_cgroup *blkcg;
428 struct blkio_group *blkg;
429 struct hlist_node *n;
430 struct blkio_policy_type *blkiop;
431 struct blkio_policy_node *pn;
432
433 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
434 return -EINVAL;
435
436 blkcg = cgroup_to_blkio_cgroup(cgroup);
437 spin_lock(&blkio_list_lock);
438 spin_lock_irq(&blkcg->lock);
439 blkcg->weight = (unsigned int)val;
440
441 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
442 pn = blkio_policy_search_node(blkcg, blkg->dev);
443
444 if (pn)
445 continue;
446
447 list_for_each_entry(blkiop, &blkio_list, list)
448 blkiop->ops.blkio_update_group_weight_fn(blkg,
449 blkcg->weight);
450 }
451 spin_unlock_irq(&blkcg->lock);
452 spin_unlock(&blkio_list_lock);
453 return 0;
454}
455
456static int 502static int
457blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) 503blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
458{ 504{
@@ -593,52 +639,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
593 return disk_total; 639 return disk_total;
594} 640}
595 641
596#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \
597static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
598 struct cftype *cftype, struct cgroup_map_cb *cb) \
599{ \
600 struct blkio_cgroup *blkcg; \
601 struct blkio_group *blkg; \
602 struct hlist_node *n; \
603 uint64_t cgroup_total = 0; \
604 \
605 if (!cgroup_lock_live_group(cgroup)) \
606 return -ENODEV; \
607 \
608 blkcg = cgroup_to_blkio_cgroup(cgroup); \
609 rcu_read_lock(); \
610 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
611 if (blkg->dev) { \
612 spin_lock_irq(&blkg->stats_lock); \
613 cgroup_total += blkio_get_stat(blkg, cb, \
614 blkg->dev, type); \
615 spin_unlock_irq(&blkg->stats_lock); \
616 } \
617 } \
618 if (show_total) \
619 cb->fill(cb, "Total", cgroup_total); \
620 rcu_read_unlock(); \
621 cgroup_unlock(); \
622 return 0; \
623}
624
625SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
626SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
627SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
628SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
629SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
630SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
631SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
632SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
633#ifdef CONFIG_DEBUG_BLK_CGROUP
634SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
635SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
636SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
637SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
638SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
639#endif
640#undef SHOW_FUNCTION_PER_GROUP
641
642static int blkio_check_dev_num(dev_t dev) 642static int blkio_check_dev_num(dev_t dev)
643{ 643{
644 int part = 0; 644 int part = 0;
@@ -652,13 +652,14 @@ static int blkio_check_dev_num(dev_t dev)
652} 652}
653 653
654static int blkio_policy_parse_and_set(char *buf, 654static int blkio_policy_parse_and_set(char *buf,
655 struct blkio_policy_node *newpn) 655 struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
656{ 656{
657 char *s[4], *p, *major_s = NULL, *minor_s = NULL; 657 char *s[4], *p, *major_s = NULL, *minor_s = NULL;
658 int ret; 658 int ret;
659 unsigned long major, minor, temp; 659 unsigned long major, minor, temp;
660 int i = 0; 660 int i = 0;
661 dev_t dev; 661 dev_t dev;
662 u64 bps, iops;
662 663
663 memset(s, 0, sizeof(s)); 664 memset(s, 0, sizeof(s));
664 665
@@ -705,12 +706,47 @@ static int blkio_policy_parse_and_set(char *buf,
705 if (s[1] == NULL) 706 if (s[1] == NULL)
706 return -EINVAL; 707 return -EINVAL;
707 708
708 ret = strict_strtoul(s[1], 10, &temp); 709 switch (plid) {
709 if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || 710 case BLKIO_POLICY_PROP:
710 temp > BLKIO_WEIGHT_MAX) 711 ret = strict_strtoul(s[1], 10, &temp);
711 return -EINVAL; 712 if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
713 temp > BLKIO_WEIGHT_MAX)
714 return -EINVAL;
712 715
713 newpn->weight = temp; 716 newpn->plid = plid;
717 newpn->fileid = fileid;
718 newpn->val.weight = temp;
719 break;
720 case BLKIO_POLICY_THROTL:
721 switch(fileid) {
722 case BLKIO_THROTL_read_bps_device:
723 case BLKIO_THROTL_write_bps_device:
724 ret = strict_strtoull(s[1], 10, &bps);
725 if (ret)
726 return -EINVAL;
727
728 newpn->plid = plid;
729 newpn->fileid = fileid;
730 newpn->val.bps = bps;
731 break;
732 case BLKIO_THROTL_read_iops_device:
733 case BLKIO_THROTL_write_iops_device:
734 ret = strict_strtoull(s[1], 10, &iops);
735 if (ret)
736 return -EINVAL;
737
738 if (iops > THROTL_IOPS_MAX)
739 return -EINVAL;
740
741 newpn->plid = plid;
742 newpn->fileid = fileid;
743 newpn->val.iops = (unsigned int)iops;
744 break;
745 }
746 break;
747 default:
748 BUG();
749 }
714 750
715 return 0; 751 return 0;
716} 752}
@@ -720,26 +756,180 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
720{ 756{
721 struct blkio_policy_node *pn; 757 struct blkio_policy_node *pn;
722 758
723 pn = blkio_policy_search_node(blkcg, dev); 759 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
760 BLKIO_PROP_weight_device);
724 if (pn) 761 if (pn)
725 return pn->weight; 762 return pn->val.weight;
726 else 763 else
727 return blkcg->weight; 764 return blkcg->weight;
728} 765}
729EXPORT_SYMBOL_GPL(blkcg_get_weight); 766EXPORT_SYMBOL_GPL(blkcg_get_weight);
730 767
768uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
769{
770 struct blkio_policy_node *pn;
771
772 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
773 BLKIO_THROTL_read_bps_device);
774 if (pn)
775 return pn->val.bps;
776 else
777 return -1;
778}
779
780uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
781{
782 struct blkio_policy_node *pn;
783 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
784 BLKIO_THROTL_write_bps_device);
785 if (pn)
786 return pn->val.bps;
787 else
788 return -1;
789}
790
791unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
792{
793 struct blkio_policy_node *pn;
794
795 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
796 BLKIO_THROTL_read_iops_device);
797 if (pn)
798 return pn->val.iops;
799 else
800 return -1;
801}
802
803unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
804{
805 struct blkio_policy_node *pn;
806 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
807 BLKIO_THROTL_write_iops_device);
808 if (pn)
809 return pn->val.iops;
810 else
811 return -1;
812}
813
814/* Checks whether user asked for deleting a policy rule */
815static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
816{
817 switch(pn->plid) {
818 case BLKIO_POLICY_PROP:
819 if (pn->val.weight == 0)
820 return 1;
821 break;
822 case BLKIO_POLICY_THROTL:
823 switch(pn->fileid) {
824 case BLKIO_THROTL_read_bps_device:
825 case BLKIO_THROTL_write_bps_device:
826 if (pn->val.bps == 0)
827 return 1;
828 break;
829 case BLKIO_THROTL_read_iops_device:
830 case BLKIO_THROTL_write_iops_device:
831 if (pn->val.iops == 0)
832 return 1;
833 }
834 break;
835 default:
836 BUG();
837 }
838
839 return 0;
840}
841
842static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
843 struct blkio_policy_node *newpn)
844{
845 switch(oldpn->plid) {
846 case BLKIO_POLICY_PROP:
847 oldpn->val.weight = newpn->val.weight;
848 break;
849 case BLKIO_POLICY_THROTL:
850 switch(newpn->fileid) {
851 case BLKIO_THROTL_read_bps_device:
852 case BLKIO_THROTL_write_bps_device:
853 oldpn->val.bps = newpn->val.bps;
854 break;
855 case BLKIO_THROTL_read_iops_device:
856 case BLKIO_THROTL_write_iops_device:
857 oldpn->val.iops = newpn->val.iops;
858 }
859 break;
860 default:
861 BUG();
862 }
863}
864
865/*
866 * Some rules/values in blkg have changed. Propagate those to respective
867 * policies.
868 */
869static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
870 struct blkio_group *blkg, struct blkio_policy_node *pn)
871{
872 unsigned int weight, iops;
873 u64 bps;
874
875 switch(pn->plid) {
876 case BLKIO_POLICY_PROP:
877 weight = pn->val.weight ? pn->val.weight :
878 blkcg->weight;
879 blkio_update_group_weight(blkg, weight);
880 break;
881 case BLKIO_POLICY_THROTL:
882 switch(pn->fileid) {
883 case BLKIO_THROTL_read_bps_device:
884 case BLKIO_THROTL_write_bps_device:
885 bps = pn->val.bps ? pn->val.bps : (-1);
886 blkio_update_group_bps(blkg, bps, pn->fileid);
887 break;
888 case BLKIO_THROTL_read_iops_device:
889 case BLKIO_THROTL_write_iops_device:
890 iops = pn->val.iops ? pn->val.iops : (-1);
891 blkio_update_group_iops(blkg, iops, pn->fileid);
892 break;
893 }
894 break;
895 default:
896 BUG();
897 }
898}
899
900/*
901 * A policy node rule has been updated. Propagate this update to all the
902 * block groups which might be affected by this update.
903 */
904static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
905 struct blkio_policy_node *pn)
906{
907 struct blkio_group *blkg;
908 struct hlist_node *n;
909
910 spin_lock(&blkio_list_lock);
911 spin_lock_irq(&blkcg->lock);
912
913 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
914 if (pn->dev != blkg->dev || pn->plid != blkg->plid)
915 continue;
916 blkio_update_blkg_policy(blkcg, blkg, pn);
917 }
918
919 spin_unlock_irq(&blkcg->lock);
920 spin_unlock(&blkio_list_lock);
921}
731 922
732static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, 923static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
733 const char *buffer) 924 const char *buffer)
734{ 925{
735 int ret = 0; 926 int ret = 0;
736 char *buf; 927 char *buf;
737 struct blkio_policy_node *newpn, *pn; 928 struct blkio_policy_node *newpn, *pn;
738 struct blkio_cgroup *blkcg; 929 struct blkio_cgroup *blkcg;
739 struct blkio_group *blkg;
740 int keep_newpn = 0; 930 int keep_newpn = 0;
741 struct hlist_node *n; 931 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
742 struct blkio_policy_type *blkiop; 932 int fileid = BLKIOFILE_ATTR(cft->private);
743 933
744 buf = kstrdup(buffer, GFP_KERNEL); 934 buf = kstrdup(buffer, GFP_KERNEL);
745 if (!buf) 935 if (!buf)
@@ -751,7 +941,7 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
751 goto free_buf; 941 goto free_buf;
752 } 942 }
753 943
754 ret = blkio_policy_parse_and_set(buf, newpn); 944 ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
755 if (ret) 945 if (ret)
756 goto free_newpn; 946 goto free_newpn;
757 947
@@ -759,9 +949,9 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
759 949
760 spin_lock_irq(&blkcg->lock); 950 spin_lock_irq(&blkcg->lock);
761 951
762 pn = blkio_policy_search_node(blkcg, newpn->dev); 952 pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
763 if (!pn) { 953 if (!pn) {
764 if (newpn->weight != 0) { 954 if (!blkio_delete_rule_command(newpn)) {
765 blkio_policy_insert_node(blkcg, newpn); 955 blkio_policy_insert_node(blkcg, newpn);
766 keep_newpn = 1; 956 keep_newpn = 1;
767 } 957 }
@@ -769,33 +959,17 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
769 goto update_io_group; 959 goto update_io_group;
770 } 960 }
771 961
772 if (newpn->weight == 0) { 962 if (blkio_delete_rule_command(newpn)) {
773 /* weight == 0 means deleteing a specific weight */
774 blkio_policy_delete_node(pn); 963 blkio_policy_delete_node(pn);
775 spin_unlock_irq(&blkcg->lock); 964 spin_unlock_irq(&blkcg->lock);
776 goto update_io_group; 965 goto update_io_group;
777 } 966 }
778 spin_unlock_irq(&blkcg->lock); 967 spin_unlock_irq(&blkcg->lock);
779 968
780 pn->weight = newpn->weight; 969 blkio_update_policy_rule(pn, newpn);
781 970
782update_io_group: 971update_io_group:
783 /* update weight for each cfqg */ 972 blkio_update_policy_node_blkg(blkcg, newpn);
784 spin_lock(&blkio_list_lock);
785 spin_lock_irq(&blkcg->lock);
786
787 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
788 if (newpn->dev == blkg->dev) {
789 list_for_each_entry(blkiop, &blkio_list, list)
790 blkiop->ops.blkio_update_group_weight_fn(blkg,
791 newpn->weight ?
792 newpn->weight :
793 blkcg->weight);
794 }
795 }
796
797 spin_unlock_irq(&blkcg->lock);
798 spin_unlock(&blkio_list_lock);
799 973
800free_newpn: 974free_newpn:
801 if (!keep_newpn) 975 if (!keep_newpn)
@@ -805,23 +979,256 @@ free_buf:
805 return ret; 979 return ret;
806} 980}
807 981
808static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, 982static void
809 struct seq_file *m) 983blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
810{ 984{
811 struct blkio_cgroup *blkcg; 985 switch(pn->plid) {
812 struct blkio_policy_node *pn; 986 case BLKIO_POLICY_PROP:
987 if (pn->fileid == BLKIO_PROP_weight_device)
988 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
989 MINOR(pn->dev), pn->val.weight);
990 break;
991 case BLKIO_POLICY_THROTL:
992 switch(pn->fileid) {
993 case BLKIO_THROTL_read_bps_device:
994 case BLKIO_THROTL_write_bps_device:
995 seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
996 MINOR(pn->dev), pn->val.bps);
997 break;
998 case BLKIO_THROTL_read_iops_device:
999 case BLKIO_THROTL_write_iops_device:
1000 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1001 MINOR(pn->dev), pn->val.iops);
1002 break;
1003 }
1004 break;
1005 default:
1006 BUG();
1007 }
1008}
813 1009
814 seq_printf(m, "dev\tweight\n"); 1010/* cgroup files which read their data from policy nodes end up here */
1011static void blkio_read_policy_node_files(struct cftype *cft,
1012 struct blkio_cgroup *blkcg, struct seq_file *m)
1013{
1014 struct blkio_policy_node *pn;
815 1015
816 blkcg = cgroup_to_blkio_cgroup(cgrp);
817 if (!list_empty(&blkcg->policy_list)) { 1016 if (!list_empty(&blkcg->policy_list)) {
818 spin_lock_irq(&blkcg->lock); 1017 spin_lock_irq(&blkcg->lock);
819 list_for_each_entry(pn, &blkcg->policy_list, node) { 1018 list_for_each_entry(pn, &blkcg->policy_list, node) {
820 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), 1019 if (!pn_matches_cftype(cft, pn))
821 MINOR(pn->dev), pn->weight); 1020 continue;
1021 blkio_print_policy_node(m, pn);
822 } 1022 }
823 spin_unlock_irq(&blkcg->lock); 1023 spin_unlock_irq(&blkcg->lock);
824 } 1024 }
1025}
1026
1027static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
1028 struct seq_file *m)
1029{
1030 struct blkio_cgroup *blkcg;
1031 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1032 int name = BLKIOFILE_ATTR(cft->private);
1033
1034 blkcg = cgroup_to_blkio_cgroup(cgrp);
1035
1036 switch(plid) {
1037 case BLKIO_POLICY_PROP:
1038 switch(name) {
1039 case BLKIO_PROP_weight_device:
1040 blkio_read_policy_node_files(cft, blkcg, m);
1041 return 0;
1042 default:
1043 BUG();
1044 }
1045 break;
1046 case BLKIO_POLICY_THROTL:
1047 switch(name){
1048 case BLKIO_THROTL_read_bps_device:
1049 case BLKIO_THROTL_write_bps_device:
1050 case BLKIO_THROTL_read_iops_device:
1051 case BLKIO_THROTL_write_iops_device:
1052 blkio_read_policy_node_files(cft, blkcg, m);
1053 return 0;
1054 default:
1055 BUG();
1056 }
1057 break;
1058 default:
1059 BUG();
1060 }
1061
1062 return 0;
1063}
1064
1065static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
1066 struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type,
1067 bool show_total)
1068{
1069 struct blkio_group *blkg;
1070 struct hlist_node *n;
1071 uint64_t cgroup_total = 0;
1072
1073 rcu_read_lock();
1074 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
1075 if (blkg->dev) {
1076 if (!cftype_blkg_same_policy(cft, blkg))
1077 continue;
1078 spin_lock_irq(&blkg->stats_lock);
1079 cgroup_total += blkio_get_stat(blkg, cb, blkg->dev,
1080 type);
1081 spin_unlock_irq(&blkg->stats_lock);
1082 }
1083 }
1084 if (show_total)
1085 cb->fill(cb, "Total", cgroup_total);
1086 rcu_read_unlock();
1087 return 0;
1088}
1089
1090/* All map kind of cgroup file get serviced by this function */
1091static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1092 struct cgroup_map_cb *cb)
1093{
1094 struct blkio_cgroup *blkcg;
1095 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1096 int name = BLKIOFILE_ATTR(cft->private);
1097
1098 blkcg = cgroup_to_blkio_cgroup(cgrp);
1099
1100 switch(plid) {
1101 case BLKIO_POLICY_PROP:
1102 switch(name) {
1103 case BLKIO_PROP_time:
1104 return blkio_read_blkg_stats(blkcg, cft, cb,
1105 BLKIO_STAT_TIME, 0);
1106 case BLKIO_PROP_sectors:
1107 return blkio_read_blkg_stats(blkcg, cft, cb,
1108 BLKIO_STAT_SECTORS, 0);
1109 case BLKIO_PROP_io_service_bytes:
1110 return blkio_read_blkg_stats(blkcg, cft, cb,
1111 BLKIO_STAT_SERVICE_BYTES, 1);
1112 case BLKIO_PROP_io_serviced:
1113 return blkio_read_blkg_stats(blkcg, cft, cb,
1114 BLKIO_STAT_SERVICED, 1);
1115 case BLKIO_PROP_io_service_time:
1116 return blkio_read_blkg_stats(blkcg, cft, cb,
1117 BLKIO_STAT_SERVICE_TIME, 1);
1118 case BLKIO_PROP_io_wait_time:
1119 return blkio_read_blkg_stats(blkcg, cft, cb,
1120 BLKIO_STAT_WAIT_TIME, 1);
1121 case BLKIO_PROP_io_merged:
1122 return blkio_read_blkg_stats(blkcg, cft, cb,
1123 BLKIO_STAT_MERGED, 1);
1124 case BLKIO_PROP_io_queued:
1125 return blkio_read_blkg_stats(blkcg, cft, cb,
1126 BLKIO_STAT_QUEUED, 1);
1127#ifdef CONFIG_DEBUG_BLK_CGROUP
1128 case BLKIO_PROP_dequeue:
1129 return blkio_read_blkg_stats(blkcg, cft, cb,
1130 BLKIO_STAT_DEQUEUE, 0);
1131 case BLKIO_PROP_avg_queue_size:
1132 return blkio_read_blkg_stats(blkcg, cft, cb,
1133 BLKIO_STAT_AVG_QUEUE_SIZE, 0);
1134 case BLKIO_PROP_group_wait_time:
1135 return blkio_read_blkg_stats(blkcg, cft, cb,
1136 BLKIO_STAT_GROUP_WAIT_TIME, 0);
1137 case BLKIO_PROP_idle_time:
1138 return blkio_read_blkg_stats(blkcg, cft, cb,
1139 BLKIO_STAT_IDLE_TIME, 0);
1140 case BLKIO_PROP_empty_time:
1141 return blkio_read_blkg_stats(blkcg, cft, cb,
1142 BLKIO_STAT_EMPTY_TIME, 0);
1143#endif
1144 default:
1145 BUG();
1146 }
1147 break;
1148 case BLKIO_POLICY_THROTL:
1149 switch(name){
1150 case BLKIO_THROTL_io_service_bytes:
1151 return blkio_read_blkg_stats(blkcg, cft, cb,
1152 BLKIO_STAT_SERVICE_BYTES, 1);
1153 case BLKIO_THROTL_io_serviced:
1154 return blkio_read_blkg_stats(blkcg, cft, cb,
1155 BLKIO_STAT_SERVICED, 1);
1156 default:
1157 BUG();
1158 }
1159 break;
1160 default:
1161 BUG();
1162 }
1163
1164 return 0;
1165}
1166
1167static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
1168{
1169 struct blkio_group *blkg;
1170 struct hlist_node *n;
1171 struct blkio_policy_node *pn;
1172
1173 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
1174 return -EINVAL;
1175
1176 spin_lock(&blkio_list_lock);
1177 spin_lock_irq(&blkcg->lock);
1178 blkcg->weight = (unsigned int)val;
1179
1180 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1181 pn = blkio_policy_search_node(blkcg, blkg->dev,
1182 BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
1183 if (pn)
1184 continue;
1185
1186 blkio_update_group_weight(blkg, blkcg->weight);
1187 }
1188 spin_unlock_irq(&blkcg->lock);
1189 spin_unlock(&blkio_list_lock);
1190 return 0;
1191}
1192
1193static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
1194 struct blkio_cgroup *blkcg;
1195 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1196 int name = BLKIOFILE_ATTR(cft->private);
1197
1198 blkcg = cgroup_to_blkio_cgroup(cgrp);
1199
1200 switch(plid) {
1201 case BLKIO_POLICY_PROP:
1202 switch(name) {
1203 case BLKIO_PROP_weight:
1204 return (u64)blkcg->weight;
1205 }
1206 break;
1207 default:
1208 BUG();
1209 }
1210 return 0;
1211}
1212
1213static int
1214blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1215{
1216 struct blkio_cgroup *blkcg;
1217 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1218 int name = BLKIOFILE_ATTR(cft->private);
1219
1220 blkcg = cgroup_to_blkio_cgroup(cgrp);
1221
1222 switch(plid) {
1223 case BLKIO_POLICY_PROP:
1224 switch(name) {
1225 case BLKIO_PROP_weight:
1226 return blkio_weight_write(blkcg, val);
1227 }
1228 break;
1229 default:
1230 BUG();
1231 }
825 1232
826 return 0; 1233 return 0;
827} 1234}
@@ -829,71 +1236,151 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
829struct cftype blkio_files[] = { 1236struct cftype blkio_files[] = {
830 { 1237 {
831 .name = "weight_device", 1238 .name = "weight_device",
832 .read_seq_string = blkiocg_weight_device_read, 1239 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
833 .write_string = blkiocg_weight_device_write, 1240 BLKIO_PROP_weight_device),
1241 .read_seq_string = blkiocg_file_read,
1242 .write_string = blkiocg_file_write,
834 .max_write_len = 256, 1243 .max_write_len = 256,
835 }, 1244 },
836 { 1245 {
837 .name = "weight", 1246 .name = "weight",
838 .read_u64 = blkiocg_weight_read, 1247 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
839 .write_u64 = blkiocg_weight_write, 1248 BLKIO_PROP_weight),
1249 .read_u64 = blkiocg_file_read_u64,
1250 .write_u64 = blkiocg_file_write_u64,
840 }, 1251 },
841 { 1252 {
842 .name = "time", 1253 .name = "time",
843 .read_map = blkiocg_time_read, 1254 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1255 BLKIO_PROP_time),
1256 .read_map = blkiocg_file_read_map,
844 }, 1257 },
845 { 1258 {
846 .name = "sectors", 1259 .name = "sectors",
847 .read_map = blkiocg_sectors_read, 1260 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1261 BLKIO_PROP_sectors),
1262 .read_map = blkiocg_file_read_map,
848 }, 1263 },
849 { 1264 {
850 .name = "io_service_bytes", 1265 .name = "io_service_bytes",
851 .read_map = blkiocg_io_service_bytes_read, 1266 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1267 BLKIO_PROP_io_service_bytes),
1268 .read_map = blkiocg_file_read_map,
852 }, 1269 },
853 { 1270 {
854 .name = "io_serviced", 1271 .name = "io_serviced",
855 .read_map = blkiocg_io_serviced_read, 1272 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1273 BLKIO_PROP_io_serviced),
1274 .read_map = blkiocg_file_read_map,
856 }, 1275 },
857 { 1276 {
858 .name = "io_service_time", 1277 .name = "io_service_time",
859 .read_map = blkiocg_io_service_time_read, 1278 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1279 BLKIO_PROP_io_service_time),
1280 .read_map = blkiocg_file_read_map,
860 }, 1281 },
861 { 1282 {
862 .name = "io_wait_time", 1283 .name = "io_wait_time",
863 .read_map = blkiocg_io_wait_time_read, 1284 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1285 BLKIO_PROP_io_wait_time),
1286 .read_map = blkiocg_file_read_map,
864 }, 1287 },
865 { 1288 {
866 .name = "io_merged", 1289 .name = "io_merged",
867 .read_map = blkiocg_io_merged_read, 1290 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1291 BLKIO_PROP_io_merged),
1292 .read_map = blkiocg_file_read_map,
868 }, 1293 },
869 { 1294 {
870 .name = "io_queued", 1295 .name = "io_queued",
871 .read_map = blkiocg_io_queued_read, 1296 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1297 BLKIO_PROP_io_queued),
1298 .read_map = blkiocg_file_read_map,
872 }, 1299 },
873 { 1300 {
874 .name = "reset_stats", 1301 .name = "reset_stats",
875 .write_u64 = blkiocg_reset_stats, 1302 .write_u64 = blkiocg_reset_stats,
876 }, 1303 },
1304#ifdef CONFIG_BLK_DEV_THROTTLING
1305 {
1306 .name = "throttle.read_bps_device",
1307 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1308 BLKIO_THROTL_read_bps_device),
1309 .read_seq_string = blkiocg_file_read,
1310 .write_string = blkiocg_file_write,
1311 .max_write_len = 256,
1312 },
1313
1314 {
1315 .name = "throttle.write_bps_device",
1316 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1317 BLKIO_THROTL_write_bps_device),
1318 .read_seq_string = blkiocg_file_read,
1319 .write_string = blkiocg_file_write,
1320 .max_write_len = 256,
1321 },
1322
1323 {
1324 .name = "throttle.read_iops_device",
1325 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1326 BLKIO_THROTL_read_iops_device),
1327 .read_seq_string = blkiocg_file_read,
1328 .write_string = blkiocg_file_write,
1329 .max_write_len = 256,
1330 },
1331
1332 {
1333 .name = "throttle.write_iops_device",
1334 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1335 BLKIO_THROTL_write_iops_device),
1336 .read_seq_string = blkiocg_file_read,
1337 .write_string = blkiocg_file_write,
1338 .max_write_len = 256,
1339 },
1340 {
1341 .name = "throttle.io_service_bytes",
1342 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1343 BLKIO_THROTL_io_service_bytes),
1344 .read_map = blkiocg_file_read_map,
1345 },
1346 {
1347 .name = "throttle.io_serviced",
1348 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1349 BLKIO_THROTL_io_serviced),
1350 .read_map = blkiocg_file_read_map,
1351 },
1352#endif /* CONFIG_BLK_DEV_THROTTLING */
1353
877#ifdef CONFIG_DEBUG_BLK_CGROUP 1354#ifdef CONFIG_DEBUG_BLK_CGROUP
878 { 1355 {
879 .name = "avg_queue_size", 1356 .name = "avg_queue_size",
880 .read_map = blkiocg_avg_queue_size_read, 1357 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1358 BLKIO_PROP_avg_queue_size),
1359 .read_map = blkiocg_file_read_map,
881 }, 1360 },
882 { 1361 {
883 .name = "group_wait_time", 1362 .name = "group_wait_time",
884 .read_map = blkiocg_group_wait_time_read, 1363 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1364 BLKIO_PROP_group_wait_time),
1365 .read_map = blkiocg_file_read_map,
885 }, 1366 },
886 { 1367 {
887 .name = "idle_time", 1368 .name = "idle_time",
888 .read_map = blkiocg_idle_time_read, 1369 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1370 BLKIO_PROP_idle_time),
1371 .read_map = blkiocg_file_read_map,
889 }, 1372 },
890 { 1373 {
891 .name = "empty_time", 1374 .name = "empty_time",
892 .read_map = blkiocg_empty_time_read, 1375 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1376 BLKIO_PROP_empty_time),
1377 .read_map = blkiocg_file_read_map,
893 }, 1378 },
894 { 1379 {
895 .name = "dequeue", 1380 .name = "dequeue",
896 .read_map = blkiocg_dequeue_read, 1381 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1382 BLKIO_PROP_dequeue),
1383 .read_map = blkiocg_file_read_map,
897 }, 1384 },
898#endif 1385#endif
899}; 1386};
@@ -932,13 +1419,14 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
932 /* 1419 /*
933 * This blkio_group is being unlinked as associated cgroup is 1420 * This blkio_group is being unlinked as associated cgroup is
934 * going away. Let all the IO controlling policies know about 1421 * going away. Let all the IO controlling policies know about
935 * this event. Currently this is static call to one io 1422 * this event.
936 * controlling policy. Once we have more policies in place, we
937 * need some dynamic registration of callback function.
938 */ 1423 */
939 spin_lock(&blkio_list_lock); 1424 spin_lock(&blkio_list_lock);
940 list_for_each_entry(blkiop, &blkio_list, list) 1425 list_for_each_entry(blkiop, &blkio_list, list) {
1426 if (blkiop->plid != blkg->plid)
1427 continue;
941 blkiop->ops.blkio_unlink_group_fn(key, blkg); 1428 blkiop->ops.blkio_unlink_group_fn(key, blkg);
1429 }
942 spin_unlock(&blkio_list_lock); 1430 spin_unlock(&blkio_list_lock);
943 } while (1); 1431 } while (1);
944 1432
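Taken together, the blk-cgroup.c changes tag every cgroup file with the policy that owns it (via BLKIOFILE_PRIVATE) and key rules by device, policy and file, so the proportional-weight and throttle interfaces coexist under one mount. A rough illustration of the resulting interface, assuming the /cgroup/blkio mount from the documentation and an example device 8:16:

    # Both policies publish their files side by side
    ls /cgroup/blkio/ | grep -E '^blkio\.(weight|throttle)'

    # Add an IOPS rule for device 8:16; values above UINT_MAX are rejected
    echo "8:16 100" > /cgroup/blkio/blkio.throttle.read_iops_device

    # Reading the file lists one "major:minor value" rule per line
    cat /cgroup/blkio/blkio.throttle.read_iops_device

    # Writing a value of 0 for the same device deletes the rule
    echo "8:16 0" > /cgroup/blkio/blkio.throttle.read_iops_device

The delete-on-zero behaviour comes from blkio_delete_rule_command() above, which treats a zero weight, bps or iops value as a request to remove the per-device rule.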
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 2b866ec1dcea..ea4861bdd549 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -15,6 +15,14 @@
15 15
16#include <linux/cgroup.h> 16#include <linux/cgroup.h>
17 17
18enum blkio_policy_id {
19 BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */
20 BLKIO_POLICY_THROTL, /* Throttling */
21};
22
23/* Max limits for throttle policy */
24#define THROTL_IOPS_MAX UINT_MAX
25
18#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) 26#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
19 27
20#ifndef CONFIG_BLK_CGROUP 28#ifndef CONFIG_BLK_CGROUP
@@ -65,6 +73,35 @@ enum blkg_state_flags {
65 BLKG_empty, 73 BLKG_empty,
66}; 74};
67 75
76/* cgroup files owned by proportional weight policy */
77enum blkcg_file_name_prop {
78 BLKIO_PROP_weight = 1,
79 BLKIO_PROP_weight_device,
80 BLKIO_PROP_io_service_bytes,
81 BLKIO_PROP_io_serviced,
82 BLKIO_PROP_time,
83 BLKIO_PROP_sectors,
84 BLKIO_PROP_io_service_time,
85 BLKIO_PROP_io_wait_time,
86 BLKIO_PROP_io_merged,
87 BLKIO_PROP_io_queued,
88 BLKIO_PROP_avg_queue_size,
89 BLKIO_PROP_group_wait_time,
90 BLKIO_PROP_idle_time,
91 BLKIO_PROP_empty_time,
92 BLKIO_PROP_dequeue,
93};
94
95/* cgroup files owned by throttle policy */
96enum blkcg_file_name_throtl {
97 BLKIO_THROTL_read_bps_device,
98 BLKIO_THROTL_write_bps_device,
99 BLKIO_THROTL_read_iops_device,
100 BLKIO_THROTL_write_iops_device,
101 BLKIO_THROTL_io_service_bytes,
102 BLKIO_THROTL_io_serviced,
103};
104
68struct blkio_cgroup { 105struct blkio_cgroup {
69 struct cgroup_subsys_state css; 106 struct cgroup_subsys_state css;
70 unsigned int weight; 107 unsigned int weight;
@@ -112,6 +149,8 @@ struct blkio_group {
112 char path[128]; 149 char path[128];
113 /* The device MKDEV(major, minor), this group has been created for */ 150 /* The device MKDEV(major, minor), this group has been created for */
114 dev_t dev; 151 dev_t dev;
152 /* policy which owns this blk group */
153 enum blkio_policy_id plid;
115 154
116 /* Need to serialize the stats in the case of reset/update */ 155 /* Need to serialize the stats in the case of reset/update */
117 spinlock_t stats_lock; 156 spinlock_t stats_lock;
@@ -121,24 +160,60 @@ struct blkio_group {
121struct blkio_policy_node { 160struct blkio_policy_node {
122 struct list_head node; 161 struct list_head node;
123 dev_t dev; 162 dev_t dev;
124 unsigned int weight; 163 /* This node belongs to max bw policy or proportional weight policy */
164 enum blkio_policy_id plid;
165 /* cgroup file to which this rule belongs */
166 int fileid;
167
168 union {
169 unsigned int weight;
170 /*
171 * Rate read/write in terms of bytes per second
172 * Whether this rate represents read or write is determined
173 * by file type "fileid".
174 */
175 u64 bps;
176 unsigned int iops;
177 } val;
125}; 178};
126 179
127extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, 180extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
128 dev_t dev); 181 dev_t dev);
182extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
183 dev_t dev);
184extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
185 dev_t dev);
186extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg,
187 dev_t dev);
188extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg,
189 dev_t dev);
129 190
130typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); 191typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
131typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, 192
132 unsigned int weight); 193typedef void (blkio_update_group_weight_fn) (void *key,
194 struct blkio_group *blkg, unsigned int weight);
195typedef void (blkio_update_group_read_bps_fn) (void * key,
196 struct blkio_group *blkg, u64 read_bps);
197typedef void (blkio_update_group_write_bps_fn) (void *key,
198 struct blkio_group *blkg, u64 write_bps);
199typedef void (blkio_update_group_read_iops_fn) (void *key,
200 struct blkio_group *blkg, unsigned int read_iops);
201typedef void (blkio_update_group_write_iops_fn) (void *key,
202 struct blkio_group *blkg, unsigned int write_iops);
133 203
134struct blkio_policy_ops { 204struct blkio_policy_ops {
135 blkio_unlink_group_fn *blkio_unlink_group_fn; 205 blkio_unlink_group_fn *blkio_unlink_group_fn;
136 blkio_update_group_weight_fn *blkio_update_group_weight_fn; 206 blkio_update_group_weight_fn *blkio_update_group_weight_fn;
207 blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn;
208 blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
209 blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn;
210 blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn;
137}; 211};
138 212
139struct blkio_policy_type { 213struct blkio_policy_type {
140 struct list_head list; 214 struct list_head list;
141 struct blkio_policy_ops ops; 215 struct blkio_policy_ops ops;
216 enum blkio_policy_id plid;
142}; 217};
143 218
144/* Blkio controller policy registration */ 219/* Blkio controller policy registration */
@@ -212,7 +287,8 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
212extern struct blkio_cgroup blkio_root_cgroup; 287extern struct blkio_cgroup blkio_root_cgroup;
213extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); 288extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
214extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 289extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
215 struct blkio_group *blkg, void *key, dev_t dev); 290 struct blkio_group *blkg, void *key, dev_t dev,
291 enum blkio_policy_id plid);
216extern int blkiocg_del_blkio_group(struct blkio_group *blkg); 292extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
217extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, 293extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
218 void *key); 294 void *key);
@@ -234,7 +310,8 @@ static inline struct blkio_cgroup *
234cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } 310cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
235 311
236static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 312static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
237 struct blkio_group *blkg, void *key, dev_t dev) {} 313 struct blkio_group *blkg, void *key, dev_t dev,
314 enum blkio_policy_id plid) {}
238 315
239static inline int 316static inline int
240blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } 317blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
diff --git a/block/blk-core.c b/block/blk-core.c
index 32a1c123dfb3..500eb859886e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,13 +64,15 @@ static void drive_stat_acct(struct request *rq, int new_io)
64 return; 64 return;
65 65
66 cpu = part_stat_lock(); 66 cpu = part_stat_lock();
67 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
68 67
69 if (!new_io) 68 if (!new_io) {
69 part = rq->part;
70 part_stat_inc(cpu, part, merges[rw]); 70 part_stat_inc(cpu, part, merges[rw]);
71 else { 71 } else {
72 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
72 part_round_stats(cpu, part); 73 part_round_stats(cpu, part);
73 part_inc_in_flight(part, rw); 74 part_inc_in_flight(part, rw);
75 rq->part = part;
74 } 76 }
75 77
76 part_stat_unlock(); 78 part_stat_unlock();
@@ -128,6 +130,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
128 rq->ref_count = 1; 130 rq->ref_count = 1;
129 rq->start_time = jiffies; 131 rq->start_time = jiffies;
130 set_start_time_ns(rq); 132 set_start_time_ns(rq);
133 rq->part = NULL;
131} 134}
132EXPORT_SYMBOL(blk_rq_init); 135EXPORT_SYMBOL(blk_rq_init);
133 136
@@ -382,6 +385,7 @@ void blk_sync_queue(struct request_queue *q)
382 del_timer_sync(&q->unplug_timer); 385 del_timer_sync(&q->unplug_timer);
383 del_timer_sync(&q->timeout); 386 del_timer_sync(&q->timeout);
384 cancel_work_sync(&q->unplug_work); 387 cancel_work_sync(&q->unplug_work);
388 throtl_shutdown_timer_wq(q);
385} 389}
386EXPORT_SYMBOL(blk_sync_queue); 390EXPORT_SYMBOL(blk_sync_queue);
387 391
@@ -459,6 +463,8 @@ void blk_cleanup_queue(struct request_queue *q)
459 if (q->elevator) 463 if (q->elevator)
460 elevator_exit(q->elevator); 464 elevator_exit(q->elevator);
461 465
466 blk_throtl_exit(q);
467
462 blk_put_queue(q); 468 blk_put_queue(q);
463} 469}
464EXPORT_SYMBOL(blk_cleanup_queue); 470EXPORT_SYMBOL(blk_cleanup_queue);
@@ -515,6 +521,11 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
515 return NULL; 521 return NULL;
516 } 522 }
517 523
524 if (blk_throtl_init(q)) {
525 kmem_cache_free(blk_requestq_cachep, q);
526 return NULL;
527 }
528
518 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, 529 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
519 laptop_mode_timer_fn, (unsigned long) q); 530 laptop_mode_timer_fn, (unsigned long) q);
520 init_timer(&q->unplug_timer); 531 init_timer(&q->unplug_timer);
@@ -796,11 +807,16 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
796 rl->starved[is_sync] = 0; 807 rl->starved[is_sync] = 0;
797 808
798 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 809 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
799 if (priv) 810 if (priv) {
800 rl->elvpriv++; 811 rl->elvpriv++;
801 812
802 if (blk_queue_io_stat(q)) 813 /*
803 rw_flags |= REQ_IO_STAT; 814 * Don't do stats for non-priv requests
815 */
816 if (blk_queue_io_stat(q))
817 rw_flags |= REQ_IO_STAT;
818 }
819
804 spin_unlock_irq(q->queue_lock); 820 spin_unlock_irq(q->queue_lock);
805 821
806 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); 822 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
@@ -1522,6 +1538,15 @@ static inline void __generic_make_request(struct bio *bio)
1522 goto end_io; 1538 goto end_io;
1523 } 1539 }
1524 1540
1541 blk_throtl_bio(q, &bio);
1542
1543 /*
1544 * If bio = NULL, bio has been throttled and will be submitted
1545 * later.
1546 */
1547 if (!bio)
1548 break;
1549
1525 trace_block_bio_queue(q, bio); 1550 trace_block_bio_queue(q, bio);
1526 1551
1527 ret = q->make_request_fn(q, bio); 1552 ret = q->make_request_fn(q, bio);
@@ -1612,11 +1637,12 @@ void submit_bio(int rw, struct bio *bio)
1612 1637
1613 if (unlikely(block_dump)) { 1638 if (unlikely(block_dump)) {
1614 char b[BDEVNAME_SIZE]; 1639 char b[BDEVNAME_SIZE];
1615 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", 1640 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
1616 current->comm, task_pid_nr(current), 1641 current->comm, task_pid_nr(current),
1617 (rw & WRITE) ? "WRITE" : "READ", 1642 (rw & WRITE) ? "WRITE" : "READ",
1618 (unsigned long long)bio->bi_sector, 1643 (unsigned long long)bio->bi_sector,
1619 bdevname(bio->bi_bdev, b)); 1644 bdevname(bio->bi_bdev, b),
1645 count);
1620 } 1646 }
1621 } 1647 }
1622 1648
@@ -1759,7 +1785,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
1759 int cpu; 1785 int cpu;
1760 1786
1761 cpu = part_stat_lock(); 1787 cpu = part_stat_lock();
1762 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 1788 part = req->part;
1763 part_stat_add(cpu, part, sectors[rw], bytes >> 9); 1789 part_stat_add(cpu, part, sectors[rw], bytes >> 9);
1764 part_stat_unlock(); 1790 part_stat_unlock();
1765 } 1791 }
@@ -1779,7 +1805,7 @@ static void blk_account_io_done(struct request *req)
1779 int cpu; 1805 int cpu;
1780 1806
1781 cpu = part_stat_lock(); 1807 cpu = part_stat_lock();
1782 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 1808 part = req->part;
1783 1809
1784 part_stat_inc(cpu, part, ios[rw]); 1810 part_stat_inc(cpu, part, ios[rw]);
1785 part_stat_add(cpu, part, ticks[rw], duration); 1811 part_stat_add(cpu, part, ticks[rw], duration);
@@ -2579,6 +2605,13 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
2579} 2605}
2580EXPORT_SYMBOL(kblockd_schedule_work); 2606EXPORT_SYMBOL(kblockd_schedule_work);
2581 2607
2608int kblockd_schedule_delayed_work(struct request_queue *q,
2609 struct delayed_work *dwork, unsigned long delay)
2610{
2611 return queue_delayed_work(kblockd_workqueue, dwork, delay);
2612}
2613EXPORT_SYMBOL(kblockd_schedule_delayed_work);
2614
2582int __init blk_dev_init(void) 2615int __init blk_dev_init(void)
2583{ 2616{
2584 BUILD_BUG_ON(__REQ_NR_BITS > 8 * 2617 BUILD_BUG_ON(__REQ_NR_BITS > 8 *
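
The blk-core.c hunks above do three related things: they cache the partition a request was accounted against in rq->part so completion-time accounting no longer has to re-map the sector, they add a kblockd_schedule_delayed_work() helper, and they hook blk_throtl_bio() into __generic_make_request(), where the hook may take ownership of the bio and hand back NULL. Below is a minimal user-space sketch of that consume-or-pass-through convention; the struct, the budget rule and all names are invented for illustration and are not kernel API.

#include <stdio.h>

struct fake_bio { unsigned int size; };

static unsigned int budget = 4096;          /* pretend per-round byte budget */

/* Leaves *biop == NULL when the bio was taken over (throttled). */
static void throttle_hook(struct fake_bio **biop)
{
        struct fake_bio *bio = *biop;

        if (bio->size <= budget) {
                budget -= bio->size;        /* within rate: pass it through */
                return;
        }
        *biop = NULL;                       /* over budget: queued for later */
}

int main(void)
{
        struct fake_bio a = { .size = 1024 }, b = { .size = 8192 };
        struct fake_bio *bio;

        bio = &a;
        throttle_hook(&bio);
        printf("bio a %s\n", bio ? "dispatched" : "throttled");

        bio = &b;
        throttle_hook(&bio);
        printf("bio b %s\n", bio ? "dispatched" : "throttled");
        return 0;
}
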
diff --git a/block/blk-exec.c b/block/blk-exec.c
index e1672f14840e..cf1456a02acd 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -80,6 +80,7 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
80 DECLARE_COMPLETION_ONSTACK(wait); 80 DECLARE_COMPLETION_ONSTACK(wait);
81 char sense[SCSI_SENSE_BUFFERSIZE]; 81 char sense[SCSI_SENSE_BUFFERSIZE];
82 int err = 0; 82 int err = 0;
83 unsigned long hang_check;
83 84
84 /* 85 /*
85 * we need an extra reference to the request, so we can look at 86 * we need an extra reference to the request, so we can look at
@@ -95,7 +96,13 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
95 96
96 rq->end_io_data = &wait; 97 rq->end_io_data = &wait;
97 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); 98 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
98 wait_for_completion(&wait); 99
100 /* Prevent hang_check timer from firing at us during very long I/O */
101 hang_check = sysctl_hung_task_timeout_secs;
102 if (hang_check)
103 while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2)));
104 else
105 wait_for_completion(&wait);
99 106
100 if (rq->errors) 107 if (rq->errors)
101 err = -EIO; 108 err = -EIO;
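
The blk-exec.c change above replaces one open-ended wait_for_completion() with a loop of waits bounded to hang_check * (HZ/2) jiffies, i.e. half the hung-task watchdog period, so a legitimately long I/O no longer looks like a hang. A user-space sketch of the same pattern, with a POSIX semaphore standing in for the kernel completion and all names invented:

#include <errno.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static sem_t done;

static void *slow_io(void *arg)
{
        (void)arg;
        sleep(3);                   /* pretend this is a very long request */
        sem_post(&done);            /* the "completion" fires */
        return NULL;
}

int main(void)
{
        pthread_t t;
        unsigned long hang_check = 2;       /* watchdog period, in seconds */
        unsigned long chunk = hang_check / 2 ? hang_check / 2 : 1;

        sem_init(&done, 0, 0);
        pthread_create(&t, NULL, slow_io, NULL);

        for (;;) {                  /* like hang_check * (HZ/2) above */
                struct timespec ts;

                clock_gettime(CLOCK_REALTIME, &ts);
                ts.tv_sec += chunk;
                if (sem_timedwait(&done, &ts) == 0)
                        break;      /* request completed */
                if (errno != ETIMEDOUT)
                        break;      /* unexpected error */
                /* timed out: wait again; the watchdog never sees one long sleep */
        }

        pthread_join(t, NULL);
        puts("done");
        return 0;
}
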
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index edce1ef7933d..54bcba6c02a7 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -32,24 +32,37 @@ static struct kmem_cache *integrity_cachep;
32 32
33/** 33/**
34 * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements 34 * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements
35 * @rq: request with integrity metadata attached 35 * @q: request queue
36 * @bio: bio with integrity metadata attached
36 * 37 *
37 * Description: Returns the number of elements required in a 38 * Description: Returns the number of elements required in a
38 * scatterlist corresponding to the integrity metadata in a request. 39 * scatterlist corresponding to the integrity metadata in a bio.
39 */ 40 */
40int blk_rq_count_integrity_sg(struct request *rq) 41int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio)
41{ 42{
42 struct bio_vec *iv, *ivprv; 43 struct bio_vec *iv, *ivprv = NULL;
43 struct req_iterator iter; 44 unsigned int segments = 0;
44 unsigned int segments; 45 unsigned int seg_size = 0;
46 unsigned int i = 0;
45 47
46 ivprv = NULL; 48 bio_for_each_integrity_vec(iv, bio, i) {
47 segments = 0;
48 49
49 rq_for_each_integrity_segment(iv, rq, iter) { 50 if (ivprv) {
51 if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv))
52 goto new_segment;
53
54 if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv))
55 goto new_segment;
50 56
51 if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) 57 if (seg_size + iv->bv_len > queue_max_segment_size(q))
58 goto new_segment;
59
60 seg_size += iv->bv_len;
61 } else {
62new_segment:
52 segments++; 63 segments++;
64 seg_size = iv->bv_len;
65 }
53 66
54 ivprv = iv; 67 ivprv = iv;
55 } 68 }
@@ -60,30 +73,34 @@ EXPORT_SYMBOL(blk_rq_count_integrity_sg);
60 73
61/** 74/**
62 * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist 75 * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
63 * @rq: request with integrity metadata attached 76 * @q: request queue
77 * @bio: bio with integrity metadata attached
64 * @sglist: target scatterlist 78 * @sglist: target scatterlist
65 * 79 *
66 * Description: Map the integrity vectors in request into a 80 * Description: Map the integrity vectors in request into a
67 * scatterlist. The scatterlist must be big enough to hold all 81 * scatterlist. The scatterlist must be big enough to hold all
68 * elements. I.e. sized using blk_rq_count_integrity_sg(). 82 * elements. I.e. sized using blk_rq_count_integrity_sg().
69 */ 83 */
70int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) 84int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio,
85 struct scatterlist *sglist)
71{ 86{
72 struct bio_vec *iv, *ivprv; 87 struct bio_vec *iv, *ivprv = NULL;
73 struct req_iterator iter; 88 struct scatterlist *sg = NULL;
74 struct scatterlist *sg; 89 unsigned int segments = 0;
75 unsigned int segments; 90 unsigned int i = 0;
76
77 ivprv = NULL;
78 sg = NULL;
79 segments = 0;
80 91
81 rq_for_each_integrity_segment(iv, rq, iter) { 92 bio_for_each_integrity_vec(iv, bio, i) {
82 93
83 if (ivprv) { 94 if (ivprv) {
84 if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) 95 if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv))
85 goto new_segment; 96 goto new_segment;
86 97
98 if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv))
99 goto new_segment;
100
101 if (sg->length + iv->bv_len > queue_max_segment_size(q))
102 goto new_segment;
103
87 sg->length += iv->bv_len; 104 sg->length += iv->bv_len;
88 } else { 105 } else {
89new_segment: 106new_segment:
@@ -162,6 +179,40 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2)
162} 179}
163EXPORT_SYMBOL(blk_integrity_compare); 180EXPORT_SYMBOL(blk_integrity_compare);
164 181
182int blk_integrity_merge_rq(struct request_queue *q, struct request *req,
183 struct request *next)
184{
185 if (blk_integrity_rq(req) != blk_integrity_rq(next))
186 return -1;
187
188 if (req->nr_integrity_segments + next->nr_integrity_segments >
189 q->limits.max_integrity_segments)
190 return -1;
191
192 return 0;
193}
194EXPORT_SYMBOL(blk_integrity_merge_rq);
195
196int blk_integrity_merge_bio(struct request_queue *q, struct request *req,
197 struct bio *bio)
198{
199 int nr_integrity_segs;
200 struct bio *next = bio->bi_next;
201
202 bio->bi_next = NULL;
203 nr_integrity_segs = blk_rq_count_integrity_sg(q, bio);
204 bio->bi_next = next;
205
206 if (req->nr_integrity_segments + nr_integrity_segs >
207 q->limits.max_integrity_segments)
208 return -1;
209
210 req->nr_integrity_segments += nr_integrity_segs;
211
212 return 0;
213}
214EXPORT_SYMBOL(blk_integrity_merge_bio);
215
165struct integrity_sysfs_entry { 216struct integrity_sysfs_entry {
166 struct attribute attr; 217 struct attribute attr;
167 ssize_t (*show)(struct blk_integrity *, char *); 218 ssize_t (*show)(struct blk_integrity *, char *);
@@ -381,7 +432,6 @@ void blk_integrity_unregister(struct gendisk *disk)
381 kobject_uevent(&bi->kobj, KOBJ_REMOVE); 432 kobject_uevent(&bi->kobj, KOBJ_REMOVE);
382 kobject_del(&bi->kobj); 433 kobject_del(&bi->kobj);
383 kobject_put(&bi->kobj); 434 kobject_put(&bi->kobj);
384 kmem_cache_free(integrity_cachep, bi);
385 disk->integrity = NULL; 435 disk->integrity = NULL;
386} 436}
387EXPORT_SYMBOL(blk_integrity_unregister); 437EXPORT_SYMBOL(blk_integrity_unregister);
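
The rework above makes the integrity scatterlist helpers operate on a bio and respect the queue's segment-boundary and max-segment-size limits, and it adds merge-time checks against a new max_integrity_segments limit. A stand-alone sketch of the counting rule, with plain (address, length) pairs standing in for bio_vecs and invented numbers:

#include <stdio.h>

struct vec { unsigned long addr; unsigned int len; };

static unsigned int count_segments(const struct vec *v, int nr,
                                   unsigned int max_seg_size)
{
        unsigned int segments = 0, seg_size = 0;
        int i;

        for (i = 0; i < nr; i++) {
                int contiguous = i > 0 &&
                        v[i - 1].addr + v[i - 1].len == v[i].addr;

                if (contiguous && seg_size + v[i].len <= max_seg_size) {
                        seg_size += v[i].len;   /* merge into current segment */
                } else {
                        segments++;             /* start a new segment */
                        seg_size = v[i].len;
                }
        }
        return segments;
}

int main(void)
{
        struct vec v[] = {
                { 0x1000, 512 }, { 0x1200, 512 },       /* contiguous pair */
                { 0x8000, 512 },                        /* gap: new segment */
        };

        printf("segments: %u\n", count_segments(v, 3, 4096));   /* prints 2 */
        return 0;
}
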
diff --git a/block/blk-map.c b/block/blk-map.c
index ade0a08c9099..d4a586d8691e 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -54,7 +54,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
54 * direct dma. else, set up kernel bounce buffers 54 * direct dma. else, set up kernel bounce buffers
55 */ 55 */
56 uaddr = (unsigned long) ubuf; 56 uaddr = (unsigned long) ubuf;
57 if (blk_rq_aligned(q, ubuf, len) && !map_data) 57 if (blk_rq_aligned(q, uaddr, len) && !map_data)
58 bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); 58 bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask);
59 else 59 else
60 bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); 60 bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask);
@@ -288,6 +288,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
288 unsigned int len, gfp_t gfp_mask) 288 unsigned int len, gfp_t gfp_mask)
289{ 289{
290 int reading = rq_data_dir(rq) == READ; 290 int reading = rq_data_dir(rq) == READ;
291 unsigned long addr = (unsigned long) kbuf;
291 int do_copy = 0; 292 int do_copy = 0;
292 struct bio *bio; 293 struct bio *bio;
293 int ret; 294 int ret;
@@ -297,7 +298,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
297 if (!len || !kbuf) 298 if (!len || !kbuf)
298 return -EINVAL; 299 return -EINVAL;
299 300
300 do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf); 301 do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf);
301 if (do_copy) 302 if (do_copy)
302 bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); 303 bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
303 else 304 else
diff --git a/block/blk-merge.c b/block/blk-merge.c
index eafc94f68d79..0a2fd8a48a38 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -205,12 +205,11 @@ static inline int ll_new_hw_segment(struct request_queue *q,
205{ 205{
206 int nr_phys_segs = bio_phys_segments(q, bio); 206 int nr_phys_segs = bio_phys_segments(q, bio);
207 207
208 if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { 208 if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
209 req->cmd_flags |= REQ_NOMERGE; 209 goto no_merge;
210 if (req == q->last_merge) 210
211 q->last_merge = NULL; 211 if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio))
212 return 0; 212 goto no_merge;
213 }
214 213
215 /* 214 /*
216 * This will form the start of a new hw segment. Bump both 215 * This will form the start of a new hw segment. Bump both
@@ -218,6 +217,12 @@ static inline int ll_new_hw_segment(struct request_queue *q,
218 */ 217 */
219 req->nr_phys_segments += nr_phys_segs; 218 req->nr_phys_segments += nr_phys_segs;
220 return 1; 219 return 1;
220
221no_merge:
222 req->cmd_flags |= REQ_NOMERGE;
223 if (req == q->last_merge)
224 q->last_merge = NULL;
225 return 0;
221} 226}
222 227
223int ll_back_merge_fn(struct request_queue *q, struct request *req, 228int ll_back_merge_fn(struct request_queue *q, struct request *req,
@@ -301,6 +306,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
301 if (total_phys_segments > queue_max_segments(q)) 306 if (total_phys_segments > queue_max_segments(q))
302 return 0; 307 return 0;
303 308
309 if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next))
310 return 0;
311
304 /* Merge is OK... */ 312 /* Merge is OK... */
305 req->nr_phys_segments = total_phys_segments; 313 req->nr_phys_segments = total_phys_segments;
306 return 1; 314 return 1;
@@ -343,7 +351,7 @@ static void blk_account_io_merge(struct request *req)
343 int cpu; 351 int cpu;
344 352
345 cpu = part_stat_lock(); 353 cpu = part_stat_lock();
346 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 354 part = req->part;
347 355
348 part_round_stats(cpu, part); 356 part_round_stats(cpu, part);
349 part_dec_in_flight(part, rq_data_dir(req)); 357 part_dec_in_flight(part, rq_data_dir(req));
@@ -384,9 +392,6 @@ static int attempt_merge(struct request_queue *q, struct request *req,
384 || next->special) 392 || next->special)
385 return 0; 393 return 0;
386 394
387 if (blk_integrity_rq(req) != blk_integrity_rq(next))
388 return 0;
389
390 /* 395 /*
391 * If we are allowed to merge, then append bio list 396 * If we are allowed to merge, then append bio list
392 * from next to rq and release next. merge_requests_fn 397 * from next to rq and release next. merge_requests_fn
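
The blk-merge.c changes above fold the two failure paths into one no_merge label and move the integrity check from a blunt "same integrity profile?" test in attempt_merge() to a per-merge segment-count check. A tiny sketch of that admission rule, with an invented struct and invented limits:

#include <stdbool.h>
#include <stdio.h>

struct req { unsigned int phys_segs, integrity_segs; bool nomerge; };

static bool try_merge(struct req *r, unsigned int new_phys,
                      unsigned int new_integrity,
                      unsigned int max_phys, unsigned int max_integrity)
{
        if (r->phys_segs + new_phys > max_phys ||
            r->integrity_segs + new_integrity > max_integrity) {
                r->nomerge = true;              /* the shared "no_merge" tail */
                return false;
        }
        r->phys_segs += new_phys;
        r->integrity_segs += new_integrity;
        return true;
}

int main(void)
{
        struct req r = { .phys_segs = 126, .integrity_segs = 1 };

        printf("%d\n", try_merge(&r, 2, 0, 128, 1));    /* 1: fits exactly */
        printf("%d\n", try_merge(&r, 1, 0, 128, 1));    /* 0: over the limit */
        return 0;
}
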
diff --git a/block/blk-settings.c b/block/blk-settings.c
index a234f4bf1d6f..315b88c8cbbb 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -111,6 +111,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
111void blk_set_default_limits(struct queue_limits *lim) 111void blk_set_default_limits(struct queue_limits *lim)
112{ 112{
113 lim->max_segments = BLK_MAX_SEGMENTS; 113 lim->max_segments = BLK_MAX_SEGMENTS;
114 lim->max_integrity_segments = 0;
114 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; 115 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
115 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; 116 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
116 lim->max_sectors = BLK_DEF_MAX_SECTORS; 117 lim->max_sectors = BLK_DEF_MAX_SECTORS;
@@ -213,7 +214,7 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
213 */ 214 */
214 if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) 215 if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
215 dma = 1; 216 dma = 1;
216 q->limits.bounce_pfn = max_low_pfn; 217 q->limits.bounce_pfn = max(max_low_pfn, b_pfn);
217#else 218#else
218 if (b_pfn < blk_max_low_pfn) 219 if (b_pfn < blk_max_low_pfn)
219 dma = 1; 220 dma = 1;
@@ -343,7 +344,7 @@ EXPORT_SYMBOL(blk_queue_logical_block_size);
343 * hardware can operate on without reverting to read-modify-write 344 * hardware can operate on without reverting to read-modify-write
344 * operations. 345 * operations.
345 */ 346 */
346void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) 347void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
347{ 348{
348 q->limits.physical_block_size = size; 349 q->limits.physical_block_size = size;
349 350
@@ -455,11 +456,6 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
455} 456}
456EXPORT_SYMBOL(blk_queue_io_opt); 457EXPORT_SYMBOL(blk_queue_io_opt);
457 458
458/*
459 * Returns the minimum that is _not_ zero, unless both are zero.
460 */
461#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
462
463/** 459/**
464 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers 460 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
465 * @t: the stacking driver (top) 461 * @t: the stacking driver (top)
@@ -514,6 +510,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
514 b->seg_boundary_mask); 510 b->seg_boundary_mask);
515 511
516 t->max_segments = min_not_zero(t->max_segments, b->max_segments); 512 t->max_segments = min_not_zero(t->max_segments, b->max_segments);
513 t->max_integrity_segments = min_not_zero(t->max_integrity_segments,
514 b->max_integrity_segments);
517 515
518 t->max_segment_size = min_not_zero(t->max_segment_size, 516 t->max_segment_size = min_not_zero(t->max_segment_size,
519 b->max_segment_size); 517 b->max_segment_size);
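
blk-settings.c above adds max_integrity_segments to the default and stacked limits and drops the local min_not_zero() macro (presumably in favour of a shared definition elsewhere). The stacking rule matters because a zero means "no limit configured", not "a limit of zero". A small sketch of that combination rule with invented values:

#include <stdio.h>

static unsigned int min_not_zero(unsigned int l, unsigned int r)
{
        if (l == 0)
                return r;
        if (r == 0)
                return l;
        return l < r ? l : r;
}

int main(void)
{
        unsigned int top_max_integrity_segs = 0;        /* unset on the stacked device */
        unsigned int bottom_max_integrity_segs = 64;    /* set by the underlying HBA */

        /* prints 64, not 0: an unset value never clamps the stack */
        printf("stacked limit: %u\n",
               min_not_zero(top_max_integrity_segs, bottom_max_integrity_segs));
        return 0;
}
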
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 0749b89c6885..da8a8a40cd4c 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -112,6 +112,11 @@ static ssize_t queue_max_segments_show(struct request_queue *q, char *page)
112 return queue_var_show(queue_max_segments(q), (page)); 112 return queue_var_show(queue_max_segments(q), (page));
113} 113}
114 114
115static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
116{
117 return queue_var_show(q->limits.max_integrity_segments, (page));
118}
119
115static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) 120static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
116{ 121{
117 if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) 122 if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags))
@@ -288,6 +293,11 @@ static struct queue_sysfs_entry queue_max_segments_entry = {
288 .show = queue_max_segments_show, 293 .show = queue_max_segments_show,
289}; 294};
290 295
296static struct queue_sysfs_entry queue_max_integrity_segments_entry = {
297 .attr = {.name = "max_integrity_segments", .mode = S_IRUGO },
298 .show = queue_max_integrity_segments_show,
299};
300
291static struct queue_sysfs_entry queue_max_segment_size_entry = { 301static struct queue_sysfs_entry queue_max_segment_size_entry = {
292 .attr = {.name = "max_segment_size", .mode = S_IRUGO }, 302 .attr = {.name = "max_segment_size", .mode = S_IRUGO },
293 .show = queue_max_segment_size_show, 303 .show = queue_max_segment_size_show,
@@ -375,6 +385,7 @@ static struct attribute *default_attrs[] = {
375 &queue_max_hw_sectors_entry.attr, 385 &queue_max_hw_sectors_entry.attr,
376 &queue_max_sectors_entry.attr, 386 &queue_max_sectors_entry.attr,
377 &queue_max_segments_entry.attr, 387 &queue_max_segments_entry.attr,
388 &queue_max_integrity_segments_entry.attr,
378 &queue_max_segment_size_entry.attr, 389 &queue_max_segment_size_entry.attr,
379 &queue_iosched_entry.attr, 390 &queue_iosched_entry.attr,
380 &queue_hw_sector_size_entry.attr, 391 &queue_hw_sector_size_entry.attr,
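
The sysfs hunk above exports the new limit as a read-only queue attribute, so on a kernel with this change it should appear as /sys/block/<disk>/queue/max_integrity_segments. A trivial reader, with the disk name only as a placeholder:

#include <stdio.h>

int main(void)
{
        char buf[64];
        FILE *f = fopen("/sys/block/sda/queue/max_integrity_segments", "r");

        if (!f) {
                perror("open");
                return 1;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("max_integrity_segments: %s", buf);
        fclose(f);
        return 0;
}
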
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
new file mode 100644
index 000000000000..56ad4531b412
--- /dev/null
+++ b/block/blk-throttle.c
@@ -0,0 +1,1123 @@
1/*
2 * Interface for controlling IO bandwidth on a request queue
3 *
4 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
5 */
6
7#include <linux/module.h>
8#include <linux/slab.h>
9#include <linux/blkdev.h>
10#include <linux/bio.h>
11#include <linux/blktrace_api.h>
12#include "blk-cgroup.h"
13
14/* Max dispatch from a group in 1 round */
15static int throtl_grp_quantum = 8;
16
17/* Total max dispatch from all groups in one round */
18static int throtl_quantum = 32;
19
20/* Throttling is performed over a 100ms slice and after that the slice is renewed */
21static unsigned long throtl_slice = HZ/10; /* 100 ms */
22
23struct throtl_rb_root {
24 struct rb_root rb;
25 struct rb_node *left;
26 unsigned int count;
27 unsigned long min_disptime;
28};
29
30#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
31 .count = 0, .min_disptime = 0}
32
33#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
34
35struct throtl_grp {
37 /* List of throtl groups on the request queue */
37 struct hlist_node tg_node;
38
39 /* active throtl group service_tree member */
40 struct rb_node rb_node;
41
42 /*
43 * Dispatch time in jiffies. This is the estimated time when the group
44 * will unthrottle and be ready to dispatch more bios. It is used as the
45 * key to sort active groups in the service tree.
46 */
47 unsigned long disptime;
48
49 struct blkio_group blkg;
50 atomic_t ref;
51 unsigned int flags;
52
53 /* Two lists for READ and WRITE */
54 struct bio_list bio_lists[2];
55
56 /* Number of queued bios on READ and WRITE lists */
57 unsigned int nr_queued[2];
58
59 /* bytes per second rate limits */
60 uint64_t bps[2];
61
62 /* IOPS limits */
63 unsigned int iops[2];
64
65 /* Number of bytes dispatched in current slice */
66 uint64_t bytes_disp[2];
67 /* Number of bios dispatched in current slice */
68 unsigned int io_disp[2];
69
70 /* When did we start a new slice */
71 unsigned long slice_start[2];
72 unsigned long slice_end[2];
73
74 /* Some throttle limits got updated for the group */
75 bool limits_changed;
76};
77
78struct throtl_data
79{
80 /* List of throtl groups */
81 struct hlist_head tg_list;
82
83 /* service tree for active throtl groups */
84 struct throtl_rb_root tg_service_tree;
85
86 struct throtl_grp root_tg;
87 struct request_queue *queue;
88
89 /* Total Number of queued bios on READ and WRITE lists */
90 unsigned int nr_queued[2];
91
92 /*
93 * number of total undestroyed groups
94 */
95 unsigned int nr_undestroyed_grps;
96
97 /* Work for dispatching throttled bios */
98 struct delayed_work throtl_work;
99
100 atomic_t limits_changed;
101};
102
103enum tg_state_flags {
104 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */
105};
106
107#define THROTL_TG_FNS(name) \
108static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \
109{ \
110 (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \
111} \
112static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \
113{ \
114 (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \
115} \
116static inline int throtl_tg_##name(const struct throtl_grp *tg) \
117{ \
118 return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \
119}
120
121THROTL_TG_FNS(on_rr);
122
123#define throtl_log_tg(td, tg, fmt, args...) \
124 blk_add_trace_msg((td)->queue, "throtl %s " fmt, \
125 blkg_path(&(tg)->blkg), ##args); \
126
127#define throtl_log(td, fmt, args...) \
128 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
129
130static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
131{
132 if (blkg)
133 return container_of(blkg, struct throtl_grp, blkg);
134
135 return NULL;
136}
137
138static inline int total_nr_queued(struct throtl_data *td)
139{
140 return (td->nr_queued[0] + td->nr_queued[1]);
141}
142
143static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
144{
145 atomic_inc(&tg->ref);
146 return tg;
147}
148
149static void throtl_put_tg(struct throtl_grp *tg)
150{
151 BUG_ON(atomic_read(&tg->ref) <= 0);
152 if (!atomic_dec_and_test(&tg->ref))
153 return;
154 kfree(tg);
155}
156
157static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
158 struct cgroup *cgroup)
159{
160 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
161 struct throtl_grp *tg = NULL;
162 void *key = td;
163 struct backing_dev_info *bdi = &td->queue->backing_dev_info;
164 unsigned int major, minor;
165
166 /*
167 * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
168 * tree of blkg (instead of traversing through hash list all
169 * the time).
170 */
171 tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
172
173 /* Fill in device details for root group */
174 if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
175 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
176 tg->blkg.dev = MKDEV(major, minor);
177 goto done;
178 }
179
180 if (tg)
181 goto done;
182
183 tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
184 if (!tg)
185 goto done;
186
187 INIT_HLIST_NODE(&tg->tg_node);
188 RB_CLEAR_NODE(&tg->rb_node);
189 bio_list_init(&tg->bio_lists[0]);
190 bio_list_init(&tg->bio_lists[1]);
191
192 /*
193 * Take the initial reference that will be released on destroy.
194 * This can be thought of as a joint reference by the cgroup and the
195 * request queue which will be dropped by either request queue
196 * exit or cgroup deletion path depending on who is exiting first.
197 */
198 atomic_set(&tg->ref, 1);
199
200 /* Add group onto cgroup list */
201 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
202 blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
203 MKDEV(major, minor), BLKIO_POLICY_THROTL);
204
205 tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
206 tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
207 tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
208 tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
209
210 hlist_add_head(&tg->tg_node, &td->tg_list);
211 td->nr_undestroyed_grps++;
212done:
213 return tg;
214}
215
216static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
217{
218 struct cgroup *cgroup;
219 struct throtl_grp *tg = NULL;
220
221 rcu_read_lock();
222 cgroup = task_cgroup(current, blkio_subsys_id);
223 tg = throtl_find_alloc_tg(td, cgroup);
224 if (!tg)
225 tg = &td->root_tg;
226 rcu_read_unlock();
227 return tg;
228}
229
230static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
231{
232 /* Service tree is empty */
233 if (!root->count)
234 return NULL;
235
236 if (!root->left)
237 root->left = rb_first(&root->rb);
238
239 if (root->left)
240 return rb_entry_tg(root->left);
241
242 return NULL;
243}
244
245static void rb_erase_init(struct rb_node *n, struct rb_root *root)
246{
247 rb_erase(n, root);
248 RB_CLEAR_NODE(n);
249}
250
251static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
252{
253 if (root->left == n)
254 root->left = NULL;
255 rb_erase_init(n, &root->rb);
256 --root->count;
257}
258
259static void update_min_dispatch_time(struct throtl_rb_root *st)
260{
261 struct throtl_grp *tg;
262
263 tg = throtl_rb_first(st);
264 if (!tg)
265 return;
266
267 st->min_disptime = tg->disptime;
268}
269
270static void
271tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
272{
273 struct rb_node **node = &st->rb.rb_node;
274 struct rb_node *parent = NULL;
275 struct throtl_grp *__tg;
276 unsigned long key = tg->disptime;
277 int left = 1;
278
279 while (*node != NULL) {
280 parent = *node;
281 __tg = rb_entry_tg(parent);
282
283 if (time_before(key, __tg->disptime))
284 node = &parent->rb_left;
285 else {
286 node = &parent->rb_right;
287 left = 0;
288 }
289 }
290
291 if (left)
292 st->left = &tg->rb_node;
293
294 rb_link_node(&tg->rb_node, parent, node);
295 rb_insert_color(&tg->rb_node, &st->rb);
296}
297
298static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
299{
300 struct throtl_rb_root *st = &td->tg_service_tree;
301
302 tg_service_tree_add(st, tg);
303 throtl_mark_tg_on_rr(tg);
304 st->count++;
305}
306
307static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
308{
309 if (!throtl_tg_on_rr(tg))
310 __throtl_enqueue_tg(td, tg);
311}
312
313static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
314{
315 throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
316 throtl_clear_tg_on_rr(tg);
317}
318
319static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
320{
321 if (throtl_tg_on_rr(tg))
322 __throtl_dequeue_tg(td, tg);
323}
324
325static void throtl_schedule_next_dispatch(struct throtl_data *td)
326{
327 struct throtl_rb_root *st = &td->tg_service_tree;
328
329 /*
330 * If there are more bios pending, schedule more work.
331 */
332 if (!total_nr_queued(td))
333 return;
334
335 BUG_ON(!st->count);
336
337 update_min_dispatch_time(st);
338
339 if (time_before_eq(st->min_disptime, jiffies))
340 throtl_schedule_delayed_work(td->queue, 0);
341 else
342 throtl_schedule_delayed_work(td->queue,
343 (st->min_disptime - jiffies));
344}
345
346static inline void
347throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
348{
349 tg->bytes_disp[rw] = 0;
350 tg->io_disp[rw] = 0;
351 tg->slice_start[rw] = jiffies;
352 tg->slice_end[rw] = jiffies + throtl_slice;
353 throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
354 rw == READ ? 'R' : 'W', tg->slice_start[rw],
355 tg->slice_end[rw], jiffies);
356}
357
358static inline void throtl_extend_slice(struct throtl_data *td,
359 struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
360{
361 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
362 throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
363 rw == READ ? 'R' : 'W', tg->slice_start[rw],
364 tg->slice_end[rw], jiffies);
365}
366
367/* Determine if previously allocated or extended slice is complete or not */
368static bool
369throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
370{
371 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
372 return 0;
373
374 return 1;
375}
376
377/* Trim the used slices and adjust slice start accordingly */
378static inline void
379throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
380{
381 unsigned long nr_slices, time_elapsed, io_trim;
382 u64 bytes_trim, tmp;
383
384 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
385
386 /*
387 * If bps are unlimited (-1), then the time slice doesn't get
388 * renewed. Don't try to trim the slice if the slice is used up. A new
389 * slice will start when appropriate.
390 */
391 if (throtl_slice_used(td, tg, rw))
392 return;
393
394 time_elapsed = jiffies - tg->slice_start[rw];
395
396 nr_slices = time_elapsed / throtl_slice;
397
398 if (!nr_slices)
399 return;
400 tmp = tg->bps[rw] * throtl_slice * nr_slices;
401 do_div(tmp, HZ);
402 bytes_trim = tmp;
403
404 io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
405
406 if (!bytes_trim && !io_trim)
407 return;
408
409 if (tg->bytes_disp[rw] >= bytes_trim)
410 tg->bytes_disp[rw] -= bytes_trim;
411 else
412 tg->bytes_disp[rw] = 0;
413
414 if (tg->io_disp[rw] >= io_trim)
415 tg->io_disp[rw] -= io_trim;
416 else
417 tg->io_disp[rw] = 0;
418
419 tg->slice_start[rw] += nr_slices * throtl_slice;
420
421 throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
422 " start=%lu end=%lu jiffies=%lu",
423 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
424 tg->slice_start[rw], tg->slice_end[rw], jiffies);
425}
426
427static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
428 struct bio *bio, unsigned long *wait)
429{
430 bool rw = bio_data_dir(bio);
431 unsigned int io_allowed;
432 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
433 u64 tmp;
434
435 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
436
437 /* Slice has just started. Consider one slice interval */
438 if (!jiffy_elapsed)
439 jiffy_elapsed_rnd = throtl_slice;
440
441 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
442
443 /*
444 * jiffy_elapsed_rnd should not be a big value: the minimum iops can be
445 * 1, so at most the elapsed jiffies should be equivalent to 1 second, as
446 * we will allow dispatch after 1 second and after that the slice should
447 * have been trimmed.
448 */
449
450 tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
451 do_div(tmp, HZ);
452
453 if (tmp > UINT_MAX)
454 io_allowed = UINT_MAX;
455 else
456 io_allowed = tmp;
457
458 if (tg->io_disp[rw] + 1 <= io_allowed) {
459 if (wait)
460 *wait = 0;
461 return 1;
462 }
463
464 /* Calc approx time to dispatch */
465 jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
466
467 if (jiffy_wait > jiffy_elapsed)
468 jiffy_wait = jiffy_wait - jiffy_elapsed;
469 else
470 jiffy_wait = 1;
471
472 if (wait)
473 *wait = jiffy_wait;
474 return 0;
475}
476
477static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
478 struct bio *bio, unsigned long *wait)
479{
480 bool rw = bio_data_dir(bio);
481 u64 bytes_allowed, extra_bytes, tmp;
482 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
483
484 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
485
486 /* Slice has just started. Consider one slice interval */
487 if (!jiffy_elapsed)
488 jiffy_elapsed_rnd = throtl_slice;
489
490 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
491
492 tmp = tg->bps[rw] * jiffy_elapsed_rnd;
493 do_div(tmp, HZ);
494 bytes_allowed = tmp;
495
496 if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
497 if (wait)
498 *wait = 0;
499 return 1;
500 }
501
502 /* Calc approx time to dispatch */
503 extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
504 jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
505
506 if (!jiffy_wait)
507 jiffy_wait = 1;
508
509 /*
510 * This wait time does not take into consideration the rounding
511 * up we did. Add that time as well.
512 */
513 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
514 if (wait)
515 *wait = jiffy_wait;
516 return 0;
517}
518
519/*
520 * Returns whether one can dispatch a bio or not. Also returns the approximate
521 * number of jiffies to wait before this bio is within the IO rate and can be dispatched.
522 */
523static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
524 struct bio *bio, unsigned long *wait)
525{
526 bool rw = bio_data_dir(bio);
527 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
528
529 /*
530 * Currently the whole state machine of the group depends on the first
531 * bio queued in the group's bio list. So one should not be calling
532 * this function with a different bio if there are other bios
533 * queued.
534 */
535 BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
536
537 /* If tg->bps = -1, then BW is unlimited */
538 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
539 if (wait)
540 *wait = 0;
541 return 1;
542 }
543
544 /*
545 * If previous slice expired, start a new one otherwise renew/extend
546 * existing slice to make sure it is at least throtl_slice interval
547 * long since now.
548 */
549 if (throtl_slice_used(td, tg, rw))
550 throtl_start_new_slice(td, tg, rw);
551 else {
552 if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
553 throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
554 }
555
556 if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
557 && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
558 if (wait)
559 *wait = 0;
560 return 1;
561 }
562
563 max_wait = max(bps_wait, iops_wait);
564
565 if (wait)
566 *wait = max_wait;
567
568 if (time_before(tg->slice_end[rw], jiffies + max_wait))
569 throtl_extend_slice(td, tg, rw, jiffies + max_wait);
570
571 return 0;
572}
573
574static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
575{
576 bool rw = bio_data_dir(bio);
577 bool sync = bio->bi_rw & REQ_SYNC;
578
579 /* Charge the bio to the group */
580 tg->bytes_disp[rw] += bio->bi_size;
581 tg->io_disp[rw]++;
582
583 /*
584 * TODO: This will take blkg->stats_lock. Figure out a way
585 * to avoid this cost.
586 */
587 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
588}
589
590static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
591 struct bio *bio)
592{
593 bool rw = bio_data_dir(bio);
594
595 bio_list_add(&tg->bio_lists[rw], bio);
596 /* Take a bio reference on tg */
597 throtl_ref_get_tg(tg);
598 tg->nr_queued[rw]++;
599 td->nr_queued[rw]++;
600 throtl_enqueue_tg(td, tg);
601}
602
603static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
604{
605 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
606 struct bio *bio;
607
608 if ((bio = bio_list_peek(&tg->bio_lists[READ])))
609 tg_may_dispatch(td, tg, bio, &read_wait);
610
611 if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
612 tg_may_dispatch(td, tg, bio, &write_wait);
613
614 min_wait = min(read_wait, write_wait);
615 disptime = jiffies + min_wait;
616
617 /* Update dispatch time */
618 throtl_dequeue_tg(td, tg);
619 tg->disptime = disptime;
620 throtl_enqueue_tg(td, tg);
621}
622
623static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
624 bool rw, struct bio_list *bl)
625{
626 struct bio *bio;
627
628 bio = bio_list_pop(&tg->bio_lists[rw]);
629 tg->nr_queued[rw]--;
630 /* Drop bio reference on tg */
631 throtl_put_tg(tg);
632
633 BUG_ON(td->nr_queued[rw] <= 0);
634 td->nr_queued[rw]--;
635
636 throtl_charge_bio(tg, bio);
637 bio_list_add(bl, bio);
638 bio->bi_rw |= REQ_THROTTLED;
639
640 throtl_trim_slice(td, tg, rw);
641}
642
643static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
644 struct bio_list *bl)
645{
646 unsigned int nr_reads = 0, nr_writes = 0;
647 unsigned int max_nr_reads = throtl_grp_quantum*3/4;
648 unsigned int max_nr_writes = throtl_grp_quantum - nr_reads;
649 struct bio *bio;
650
651 /* Try to dispatch 75% READS and 25% WRITES */
652
653 while ((bio = bio_list_peek(&tg->bio_lists[READ]))
654 && tg_may_dispatch(td, tg, bio, NULL)) {
655
656 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
657 nr_reads++;
658
659 if (nr_reads >= max_nr_reads)
660 break;
661 }
662
663 while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
664 && tg_may_dispatch(td, tg, bio, NULL)) {
665
666 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
667 nr_writes++;
668
669 if (nr_writes >= max_nr_writes)
670 break;
671 }
672
673 return nr_reads + nr_writes;
674}
675
676static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
677{
678 unsigned int nr_disp = 0;
679 struct throtl_grp *tg;
680 struct throtl_rb_root *st = &td->tg_service_tree;
681
682 while (1) {
683 tg = throtl_rb_first(st);
684
685 if (!tg)
686 break;
687
688 if (time_before(jiffies, tg->disptime))
689 break;
690
691 throtl_dequeue_tg(td, tg);
692
693 nr_disp += throtl_dispatch_tg(td, tg, bl);
694
695 if (tg->nr_queued[0] || tg->nr_queued[1]) {
696 tg_update_disptime(td, tg);
697 throtl_enqueue_tg(td, tg);
698 }
699
700 if (nr_disp >= throtl_quantum)
701 break;
702 }
703
704 return nr_disp;
705}
706
707static void throtl_process_limit_change(struct throtl_data *td)
708{
709 struct throtl_grp *tg;
710 struct hlist_node *pos, *n;
711
712 /*
713 * Make sure the atomic_inc() effects from the
714 * throtl_update_blkio_group_read_bps() group of functions are
715 * visible.
716 * Is this required, or was the smp_mb__after_atomic_inc() after
717 * the atomic_inc() sufficient?
718 */
719 smp_rmb();
720 if (!atomic_read(&td->limits_changed))
721 return;
722
723 throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed));
724
725 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
726 /*
727 * Do I need an smp_rmb() here to make sure the tg->limits_changed
728 * update is visible? I am relying on the smp_rmb() at the
729 * beginning of the function and not putting a new one here.
730 */
731
732 if (throtl_tg_on_rr(tg) && tg->limits_changed) {
733 throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
734 " riops=%u wiops=%u", tg->bps[READ],
735 tg->bps[WRITE], tg->iops[READ],
736 tg->iops[WRITE]);
737 tg_update_disptime(td, tg);
738 tg->limits_changed = false;
739 }
740 }
741
742 smp_mb__before_atomic_dec();
743 atomic_dec(&td->limits_changed);
744 smp_mb__after_atomic_dec();
745}
746
747/* Dispatch throttled bios. Should be called without queue lock held. */
748static int throtl_dispatch(struct request_queue *q)
749{
750 struct throtl_data *td = q->td;
751 unsigned int nr_disp = 0;
752 struct bio_list bio_list_on_stack;
753 struct bio *bio;
754
755 spin_lock_irq(q->queue_lock);
756
757 throtl_process_limit_change(td);
758
759 if (!total_nr_queued(td))
760 goto out;
761
762 bio_list_init(&bio_list_on_stack);
763
764 throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u",
765 total_nr_queued(td), td->nr_queued[READ],
766 td->nr_queued[WRITE]);
767
768 nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
769
770 if (nr_disp)
771 throtl_log(td, "bios disp=%u", nr_disp);
772
773 throtl_schedule_next_dispatch(td);
774out:
775 spin_unlock_irq(q->queue_lock);
776
777 /*
778 * If we dispatched some requests, unplug the queue to make sure of
779 * immediate dispatch.
780 */
781 if (nr_disp) {
782 while((bio = bio_list_pop(&bio_list_on_stack)))
783 generic_make_request(bio);
784 blk_unplug(q);
785 }
786 return nr_disp;
787}
788
789void blk_throtl_work(struct work_struct *work)
790{
791 struct throtl_data *td = container_of(work, struct throtl_data,
792 throtl_work.work);
793 struct request_queue *q = td->queue;
794
795 throtl_dispatch(q);
796}
797
798/* Call with queue lock held */
799void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay)
800{
801
802 struct throtl_data *td = q->td;
803 struct delayed_work *dwork = &td->throtl_work;
804
805 if (total_nr_queued(td) > 0) {
806 /*
807 * We might have a work item scheduled to be executed in the future.
808 * Cancel that and schedule a new one.
809 */
810 __cancel_delayed_work(dwork);
811 kblockd_schedule_delayed_work(q, dwork, delay);
812 throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
813 delay, jiffies);
814 }
815}
816EXPORT_SYMBOL(throtl_schedule_delayed_work);
817
818static void
819throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
820{
821 /* Something is wrong if we are trying to remove the same group twice */
822 BUG_ON(hlist_unhashed(&tg->tg_node));
823
824 hlist_del_init(&tg->tg_node);
825
826 /*
827 * Put the reference taken at the time of creation so that when all
828 * queues are gone, the group can be destroyed.
829 */
830 throtl_put_tg(tg);
831 td->nr_undestroyed_grps--;
832}
833
834static void throtl_release_tgs(struct throtl_data *td)
835{
836 struct hlist_node *pos, *n;
837 struct throtl_grp *tg;
838
839 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
840 /*
841 * If the cgroup removal path got to the blkio_group first and removed
842 * it from the cgroup list, then it will take care of destroying
843 * the group as well.
844 */
845 if (!blkiocg_del_blkio_group(&tg->blkg))
846 throtl_destroy_tg(td, tg);
847 }
848}
849
850static void throtl_td_free(struct throtl_data *td)
851{
852 kfree(td);
853}
854
855/*
856 * Blk cgroup controller notification saying that the blkio_group object is
857 * being unlinked as the associated cgroup object is going away. That also
858 * means that no new IO will come into this group. So get rid of this group
859 * as soon as any pending IO in the group is finished.
860 *
861 * This function is called under rcu_read_lock(). key is the rcu protected
862 * pointer. That means "key" is a valid throtl_data pointer as long as we
863 * hold the rcu read lock.
864 *
865 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
866 * it should not be NULL, as even if the queue was going away, the cgroup
867 * deletion path got to it first.
868 */
869void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
870{
871 unsigned long flags;
872 struct throtl_data *td = key;
873
874 spin_lock_irqsave(td->queue->queue_lock, flags);
875 throtl_destroy_tg(td, tg_of_blkg(blkg));
876 spin_unlock_irqrestore(td->queue->queue_lock, flags);
877}
878
879/*
880 * For all update functions, key should be a valid pointer because these
881 * update functions are called under blkcg_lock, which means that blkg is
882 * valid and in turn key is valid. The queue exit path cannot race because
883 * of blkcg_lock.
884 *
885 * We cannot take the queue lock in the update functions, as the queue lock
886 * under blkcg_lock is not allowed. On other paths we take blkcg_lock under queue_lock.
887 */
888static void throtl_update_blkio_group_read_bps(void *key,
889 struct blkio_group *blkg, u64 read_bps)
890{
891 struct throtl_data *td = key;
892
893 tg_of_blkg(blkg)->bps[READ] = read_bps;
894 /* Make sure read_bps is updated before setting limits_changed */
895 smp_wmb();
896 tg_of_blkg(blkg)->limits_changed = true;
897
898 /* Make sure tg->limits_changed is updated before td->limits_changed */
899 smp_mb__before_atomic_inc();
900 atomic_inc(&td->limits_changed);
901 smp_mb__after_atomic_inc();
902
903 /* Schedule a work now to process the limit change */
904 throtl_schedule_delayed_work(td->queue, 0);
905}
906
907static void throtl_update_blkio_group_write_bps(void *key,
908 struct blkio_group *blkg, u64 write_bps)
909{
910 struct throtl_data *td = key;
911
912 tg_of_blkg(blkg)->bps[WRITE] = write_bps;
913 smp_wmb();
914 tg_of_blkg(blkg)->limits_changed = true;
915 smp_mb__before_atomic_inc();
916 atomic_inc(&td->limits_changed);
917 smp_mb__after_atomic_inc();
918 throtl_schedule_delayed_work(td->queue, 0);
919}
920
921static void throtl_update_blkio_group_read_iops(void *key,
922 struct blkio_group *blkg, unsigned int read_iops)
923{
924 struct throtl_data *td = key;
925
926 tg_of_blkg(blkg)->iops[READ] = read_iops;
927 smp_wmb();
928 tg_of_blkg(blkg)->limits_changed = true;
929 smp_mb__before_atomic_inc();
930 atomic_inc(&td->limits_changed);
931 smp_mb__after_atomic_inc();
932 throtl_schedule_delayed_work(td->queue, 0);
933}
934
935static void throtl_update_blkio_group_write_iops(void *key,
936 struct blkio_group *blkg, unsigned int write_iops)
937{
938 struct throtl_data *td = key;
939
940 tg_of_blkg(blkg)->iops[WRITE] = write_iops;
941 smp_wmb();
942 tg_of_blkg(blkg)->limits_changed = true;
943 smp_mb__before_atomic_inc();
944 atomic_inc(&td->limits_changed);
945 smp_mb__after_atomic_inc();
946 throtl_schedule_delayed_work(td->queue, 0);
947}
948
949void throtl_shutdown_timer_wq(struct request_queue *q)
950{
951 struct throtl_data *td = q->td;
952
953 cancel_delayed_work_sync(&td->throtl_work);
954}
955
956static struct blkio_policy_type blkio_policy_throtl = {
957 .ops = {
958 .blkio_unlink_group_fn = throtl_unlink_blkio_group,
959 .blkio_update_group_read_bps_fn =
960 throtl_update_blkio_group_read_bps,
961 .blkio_update_group_write_bps_fn =
962 throtl_update_blkio_group_write_bps,
963 .blkio_update_group_read_iops_fn =
964 throtl_update_blkio_group_read_iops,
965 .blkio_update_group_write_iops_fn =
966 throtl_update_blkio_group_write_iops,
967 },
968 .plid = BLKIO_POLICY_THROTL,
969};
970
971int blk_throtl_bio(struct request_queue *q, struct bio **biop)
972{
973 struct throtl_data *td = q->td;
974 struct throtl_grp *tg;
975 struct bio *bio = *biop;
976 bool rw = bio_data_dir(bio), update_disptime = true;
977
978 if (bio->bi_rw & REQ_THROTTLED) {
979 bio->bi_rw &= ~REQ_THROTTLED;
980 return 0;
981 }
982
983 spin_lock_irq(q->queue_lock);
984 tg = throtl_get_tg(td);
985
986 if (tg->nr_queued[rw]) {
987 /*
988 * There is already another bio queued in the same direction. No
989 * need to update the dispatch time.
990 * Still update the disptime if the rate limits on this group
991 * were changed.
992 */
993 if (!tg->limits_changed)
994 update_disptime = false;
995 else
996 tg->limits_changed = false;
997
998 goto queue_bio;
999 }
1000
1001 /* Bio is within the rate limit of the group */
1002 if (tg_may_dispatch(td, tg, bio, NULL)) {
1003 throtl_charge_bio(tg, bio);
1004 goto out;
1005 }
1006
1007queue_bio:
1008 throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu"
1009 " iodisp=%u iops=%u queued=%d/%d",
1010 rw == READ ? 'R' : 'W',
1011 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
1012 tg->io_disp[rw], tg->iops[rw],
1013 tg->nr_queued[READ], tg->nr_queued[WRITE]);
1014
1015 throtl_add_bio_tg(q->td, tg, bio);
1016 *biop = NULL;
1017
1018 if (update_disptime) {
1019 tg_update_disptime(td, tg);
1020 throtl_schedule_next_dispatch(td);
1021 }
1022
1023out:
1024 spin_unlock_irq(q->queue_lock);
1025 return 0;
1026}
1027
1028int blk_throtl_init(struct request_queue *q)
1029{
1030 struct throtl_data *td;
1031 struct throtl_grp *tg;
1032
1033 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1034 if (!td)
1035 return -ENOMEM;
1036
1037 INIT_HLIST_HEAD(&td->tg_list);
1038 td->tg_service_tree = THROTL_RB_ROOT;
1039 atomic_set(&td->limits_changed, 0);
1040
1041 /* Init root group */
1042 tg = &td->root_tg;
1043 INIT_HLIST_NODE(&tg->tg_node);
1044 RB_CLEAR_NODE(&tg->rb_node);
1045 bio_list_init(&tg->bio_lists[0]);
1046 bio_list_init(&tg->bio_lists[1]);
1047
1048 /* Practically unlimited BW */
1049 tg->bps[0] = tg->bps[1] = -1;
1050 tg->iops[0] = tg->iops[1] = -1;
1051
1052 /*
1053 * Set root group reference to 2. One reference will be dropped when
1054 * all groups on tg_list are being deleted during queue exit. The other
1055 * reference will remain, as we don't want to delete this group:
1056 * it is statically allocated and gets destroyed when throtl_data
1057 * goes away.
1058 */
1059 atomic_set(&tg->ref, 2);
1060 hlist_add_head(&tg->tg_node, &td->tg_list);
1061 td->nr_undestroyed_grps++;
1062
1063 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1064
1065 rcu_read_lock();
1066 blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td,
1067 0, BLKIO_POLICY_THROTL);
1068 rcu_read_unlock();
1069
1070 /* Attach throtl data to request queue */
1071 td->queue = q;
1072 q->td = td;
1073 return 0;
1074}
1075
1076void blk_throtl_exit(struct request_queue *q)
1077{
1078 struct throtl_data *td = q->td;
1079 bool wait = false;
1080
1081 BUG_ON(!td);
1082
1083 throtl_shutdown_timer_wq(q);
1084
1085 spin_lock_irq(q->queue_lock);
1086 throtl_release_tgs(td);
1087
1088 /* If there are other groups */
1089 if (td->nr_undestroyed_grps > 0)
1090 wait = true;
1091
1092 spin_unlock_irq(q->queue_lock);
1093
1094 /*
1095 * Wait for tg->blkg->key accessors to exit their grace periods.
1096 * Do this wait only if there are other undestroyed groups out
1097 * there (other than root group). This can happen if cgroup deletion
1098 * path claimed the responsibility of cleaning up a group before the
1099 * queue cleanup code got to the group.
1100 *
1101 * Do not call synchronize_rcu() unconditionally as there are drivers
1102 * which create/delete request queue hundreds of times during scan/boot
1103 * and synchronize_rcu() can take significant time and slow down boot.
1104 */
1105 if (wait)
1106 synchronize_rcu();
1107
1108 /*
1109 * Just to be safe: if, after the previous flush, somebody updated the
1110 * limits through the cgroup and another work item got queued, cancel
1111 * it.
1112 */
1113 throtl_shutdown_timer_wq(q);
1114 throtl_td_free(td);
1115}
1116
1117static int __init throtl_init(void)
1118{
1119 blkio_policy_register(&blkio_policy_throtl);
1120 return 0;
1121}
1122
1123module_init(throtl_init);
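
The new blk-throttle.c above drives everything off 100 ms slices: within a slice a group accrues a byte budget from its bps limit and an IO budget from its iops limit, a bio is charged and dispatched immediately if it fits, and otherwise the group goes on the service tree with an estimated wakeup time. A rough user-space rendering of the bandwidth half of that check (tg_with_in_bps_limit() above), with seconds standing in for jiffies/HZ and invented numbers:

#include <stdio.h>

struct verdict { int dispatch; double wait_secs; };

static struct verdict bps_check(unsigned long long bps,
                                unsigned long long bytes_disp,
                                unsigned int bio_size,
                                double elapsed_secs)
{
        struct verdict v = { 0, 0.0 };
        unsigned long long allowed = (unsigned long long)(bps * elapsed_secs);

        if (bytes_disp + bio_size <= allowed) {
                v.dispatch = 1;                 /* within the rate: charge and go */
                return v;
        }
        /* approximate time until enough budget has accrued for this bio */
        v.wait_secs = (double)(bytes_disp + bio_size - allowed) / (double)bps;
        return v;
}

int main(void)
{
        /* 1 MB/s limit, 900 KB already dispatched, 0.9 s into the slice */
        struct verdict v = bps_check(1024 * 1024, 900 * 1024, 256 * 1024, 0.9);

        if (v.dispatch)
                puts("dispatch now");
        else
                printf("throttle: wait ~%.2f s\n", v.wait_secs);
        return 0;
}
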
diff --git a/block/blk.h b/block/blk.h
index d6b911ac002c..f864012ec300 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -110,10 +110,6 @@ void blk_queue_congestion_threshold(struct request_queue *q);
110 110
111int blk_dev_init(void); 111int blk_dev_init(void);
112 112
113void elv_quiesce_start(struct request_queue *q);
114void elv_quiesce_end(struct request_queue *q);
115
116
117/* 113/*
118 * Return the threshold (number of used requests) at which the queue is 114 * Return the threshold (number of used requests) at which the queue is
119 * considered to be congested. It include a little hysteresis to keep the 115 * considered to be congested. It include a little hysteresis to keep the
@@ -132,14 +128,6 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
132 return q->nr_congestion_off; 128 return q->nr_congestion_off;
133} 129}
134 130
135#if defined(CONFIG_BLK_DEV_INTEGRITY)
136
137#define rq_for_each_integrity_segment(bvl, _rq, _iter) \
138 __rq_for_each_bio(_iter.bio, _rq) \
139 bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i)
140
141#endif /* BLK_DEV_INTEGRITY */
142
143static inline int blk_cpu_to_group(int cpu) 131static inline int blk_cpu_to_group(int cpu)
144{ 132{
145 int group = NR_CPUS; 133 int group = NR_CPUS;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9eba291eb6fd..4cd59b0d7c15 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -160,6 +160,7 @@ enum wl_prio_t {
160 BE_WORKLOAD = 0, 160 BE_WORKLOAD = 0,
161 RT_WORKLOAD = 1, 161 RT_WORKLOAD = 1,
162 IDLE_WORKLOAD = 2, 162 IDLE_WORKLOAD = 2,
163 CFQ_PRIO_NR,
163}; 164};
164 165
165/* 166/*
@@ -184,10 +185,19 @@ struct cfq_group {
184 /* number of cfqq currently on this group */ 185 /* number of cfqq currently on this group */
185 int nr_cfqq; 186 int nr_cfqq;
186 187
187 /* Per group busy queues average. Useful for workload slice calc. */
188 unsigned int busy_queues_avg[2];
189 /* 188 /*
190 * rr lists of queues with requests, one rr for each priority class. 189 * Per group busy queues average. Useful for workload slice calc. We
190 * create the array for each prio class but at run time it is used
191 * only for RT and BE class and slot for IDLE class remains unused.
192 * This is primarily done to avoid confusion and a gcc warning.
193 */
194 unsigned int busy_queues_avg[CFQ_PRIO_NR];
195 /*
196 * rr lists of queues with requests. We maintain service trees for
197 * RT and BE classes. These trees are subdivided in subclasses
198 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
199 * class there is no subclassification and all the cfq queues go on
200 * a single tree service_tree_idle.
191 * Counts are embedded in the cfq_rb_root 201 * Counts are embedded in the cfq_rb_root
192 */ 202 */
193 struct cfq_rb_root service_trees[2][3]; 203 struct cfq_rb_root service_trees[2][3];
@@ -221,7 +231,6 @@ struct cfq_data {
221 enum wl_type_t serving_type; 231 enum wl_type_t serving_type;
222 unsigned long workload_expires; 232 unsigned long workload_expires;
223 struct cfq_group *serving_group; 233 struct cfq_group *serving_group;
224 bool noidle_tree_requires_idle;
225 234
226 /* 235 /*
227 * Each priority tree is sorted by next_request position. These 236 * Each priority tree is sorted by next_request position. These
@@ -977,8 +986,8 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
977 return NULL; 986 return NULL;
978} 987}
979 988
980void 989void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
981cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) 990 unsigned int weight)
982{ 991{
983 cfqg_of_blkg(blkg)->weight = weight; 992 cfqg_of_blkg(blkg)->weight = weight;
984} 993}
@@ -2180,7 +2189,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2180 slice = max_t(unsigned, slice, CFQ_MIN_TT); 2189 slice = max_t(unsigned, slice, CFQ_MIN_TT);
2181 cfq_log(cfqd, "workload slice:%d", slice); 2190 cfq_log(cfqd, "workload slice:%d", slice);
2182 cfqd->workload_expires = jiffies + slice; 2191 cfqd->workload_expires = jiffies + slice;
2183 cfqd->noidle_tree_requires_idle = false;
2184} 2192}
2185 2193
2186static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) 2194static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
@@ -3177,7 +3185,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3177 if (cfqq->queued[0] + cfqq->queued[1] >= 4) 3185 if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3178 cfq_mark_cfqq_deep(cfqq); 3186 cfq_mark_cfqq_deep(cfqq);
3179 3187
3180 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3188 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3189 enable_idle = 0;
3190 else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
3181 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3191 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3182 enable_idle = 0; 3192 enable_idle = 0;
3183 else if (sample_valid(cic->ttime_samples)) { 3193 else if (sample_valid(cic->ttime_samples)) {
@@ -3494,17 +3504,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3494 cfq_slice_expired(cfqd, 1); 3504 cfq_slice_expired(cfqd, 1);
3495 else if (sync && cfqq_empty && 3505 else if (sync && cfqq_empty &&
3496 !cfq_close_cooperator(cfqd, cfqq)) { 3506 !cfq_close_cooperator(cfqd, cfqq)) {
3497 cfqd->noidle_tree_requires_idle |= 3507 cfq_arm_slice_timer(cfqd);
3498 !(rq->cmd_flags & REQ_NOIDLE);
3499 /*
3500 * Idling is enabled for SYNC_WORKLOAD.
3501 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
3502 * only if we processed at least one !REQ_NOIDLE request
3503 */
3504 if (cfqd->serving_type == SYNC_WORKLOAD
3505 || cfqd->noidle_tree_requires_idle
3506 || cfqq->cfqg->nr_cfqq == 1)
3507 cfq_arm_slice_timer(cfqd);
3508 } 3508 }
3509 } 3509 }
3510 3510
@@ -4090,6 +4090,7 @@ static struct blkio_policy_type blkio_policy_cfq = {
4090 .blkio_unlink_group_fn = cfq_unlink_blkio_group, 4090 .blkio_unlink_group_fn = cfq_unlink_blkio_group,
4091 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, 4091 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
4092 }, 4092 },
4093 .plid = BLKIO_POLICY_PROP,
4093}; 4094};
4094#else 4095#else
4095static struct blkio_policy_type blkio_policy_cfq; 4096static struct blkio_policy_type blkio_policy_cfq;
diff --git a/block/cfq.h b/block/cfq.h
index 93448e5a2e41..54a6d90f8e8c 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -69,7 +69,7 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
69 69
70static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 70static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
71 struct blkio_group *blkg, void *key, dev_t dev) { 71 struct blkio_group *blkg, void *key, dev_t dev) {
72 blkiocg_add_blkio_group(blkcg, blkg, key, dev); 72 blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP);
73} 73}
74 74
75static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) 75static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
diff --git a/block/genhd.c b/block/genhd.c
index 59a2db6fecef..8313834596db 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -541,13 +541,15 @@ void add_disk(struct gendisk *disk)
541 disk->major = MAJOR(devt); 541 disk->major = MAJOR(devt);
542 disk->first_minor = MINOR(devt); 542 disk->first_minor = MINOR(devt);
543 543
544 /* Register BDI before referencing it from bdev */
545 bdi = &disk->queue->backing_dev_info;
546 bdi_register_dev(bdi, disk_devt(disk));
547
544 blk_register_region(disk_devt(disk), disk->minors, NULL, 548 blk_register_region(disk_devt(disk), disk->minors, NULL,
545 exact_match, exact_lock, disk); 549 exact_match, exact_lock, disk);
546 register_disk(disk); 550 register_disk(disk);
547 blk_register_queue(disk); 551 blk_register_queue(disk);
548 552
549 bdi = &disk->queue->backing_dev_info;
550 bdi_register_dev(bdi, disk_devt(disk));
551 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, 553 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
552 "bdi"); 554 "bdi");
553 WARN_ON(retval); 555 WARN_ON(retval);
@@ -642,6 +644,7 @@ void __init printk_all_partitions(void)
642 struct hd_struct *part; 644 struct hd_struct *part;
643 char name_buf[BDEVNAME_SIZE]; 645 char name_buf[BDEVNAME_SIZE];
644 char devt_buf[BDEVT_SIZE]; 646 char devt_buf[BDEVT_SIZE];
647 u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1];
645 648
646 /* 649 /*
647 * Don't show empty devices or things that have been 650 * Don't show empty devices or things that have been
@@ -660,10 +663,14 @@ void __init printk_all_partitions(void)
660 while ((part = disk_part_iter_next(&piter))) { 663 while ((part = disk_part_iter_next(&piter))) {
661 bool is_part0 = part == &disk->part0; 664 bool is_part0 = part == &disk->part0;
662 665
663 printk("%s%s %10llu %s", is_part0 ? "" : " ", 666 uuid[0] = 0;
667 if (part->info)
668 part_unpack_uuid(part->info->uuid, uuid);
669
670 printk("%s%s %10llu %s %s", is_part0 ? "" : " ",
664 bdevt_str(part_devt(part), devt_buf), 671 bdevt_str(part_devt(part), devt_buf),
665 (unsigned long long)part->nr_sects >> 1, 672 (unsigned long long)part->nr_sects >> 1,
666 disk_name(disk, part->partno, name_buf)); 673 disk_name(disk, part->partno, name_buf), uuid);
667 if (is_part0) { 674 if (is_part0) {
668 if (disk->driverfs_dev != NULL && 675 if (disk->driverfs_dev != NULL &&
669 disk->driverfs_dev->driver != NULL) 676 disk->driverfs_dev->driver != NULL)
@@ -925,8 +932,15 @@ static void disk_free_ptbl_rcu_cb(struct rcu_head *head)
925{ 932{
926 struct disk_part_tbl *ptbl = 933 struct disk_part_tbl *ptbl =
927 container_of(head, struct disk_part_tbl, rcu_head); 934 container_of(head, struct disk_part_tbl, rcu_head);
935 struct gendisk *disk = ptbl->disk;
936 struct request_queue *q = disk->queue;
937 unsigned long flags;
928 938
929 kfree(ptbl); 939 kfree(ptbl);
940
941 spin_lock_irqsave(q->queue_lock, flags);
942 elv_quiesce_end(q);
943 spin_unlock_irqrestore(q->queue_lock, flags);
930} 944}
931 945
932/** 946/**
@@ -944,11 +958,17 @@ static void disk_replace_part_tbl(struct gendisk *disk,
944 struct disk_part_tbl *new_ptbl) 958 struct disk_part_tbl *new_ptbl)
945{ 959{
946 struct disk_part_tbl *old_ptbl = disk->part_tbl; 960 struct disk_part_tbl *old_ptbl = disk->part_tbl;
961 struct request_queue *q = disk->queue;
947 962
948 rcu_assign_pointer(disk->part_tbl, new_ptbl); 963 rcu_assign_pointer(disk->part_tbl, new_ptbl);
949 964
950 if (old_ptbl) { 965 if (old_ptbl) {
951 rcu_assign_pointer(old_ptbl->last_lookup, NULL); 966 rcu_assign_pointer(old_ptbl->last_lookup, NULL);
967
968 spin_lock_irq(q->queue_lock);
969 elv_quiesce_start(q);
970 spin_unlock_irq(q->queue_lock);
971
952 call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); 972 call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
953 } 973 }
954} 974}
@@ -989,6 +1009,7 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
989 return -ENOMEM; 1009 return -ENOMEM;
990 1010
991 new_ptbl->len = target; 1011 new_ptbl->len = target;
1012 new_ptbl->disk = disk;
992 1013
993 for (i = 0; i < len; i++) 1014 for (i = 0; i < len; i++)
994 rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); 1015 rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
@@ -1004,6 +1025,7 @@ static void disk_release(struct device *dev)
1004 kfree(disk->random); 1025 kfree(disk->random);
1005 disk_replace_part_tbl(disk, NULL); 1026 disk_replace_part_tbl(disk, NULL);
1006 free_part_stats(&disk->part0); 1027 free_part_stats(&disk->part0);
1028 free_part_info(&disk->part0);
1007 kfree(disk); 1029 kfree(disk);
1008} 1030}
1009struct class block_class = { 1031struct class block_class = {
diff --git a/block/ioctl.c b/block/ioctl.c
index d8052f0dabd3..2c15fe0912c4 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -62,7 +62,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
62 62
63 /* all seems OK */ 63 /* all seems OK */
64 part = add_partition(disk, partno, start, length, 64 part = add_partition(disk, partno, start, length,
65 ADDPART_FLAG_NONE); 65 ADDPART_FLAG_NONE, NULL);
66 mutex_unlock(&bdev->bd_mutex); 66 mutex_unlock(&bdev->bd_mutex);
67 return IS_ERR(part) ? PTR_ERR(part) : 0; 67 return IS_ERR(part) ? PTR_ERR(part) : 0;
68 case BLKPG_DEL_PARTITION: 68 case BLKPG_DEL_PARTITION:
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 081522d3c742..484ecbb6b772 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2972,7 +2972,6 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
2972 * we still need to figure out whether we accept that. */ 2972 * we still need to figure out whether we accept that. */
2973 mdev->p_size = p_size; 2973 mdev->p_size = p_size;
2974 2974
2975#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
2976 if (get_ldev(mdev)) { 2975 if (get_ldev(mdev)) {
2977 warn_if_differ_considerably(mdev, "lower level device sizes", 2976 warn_if_differ_considerably(mdev, "lower level device sizes",
2978 p_size, drbd_get_max_capacity(mdev->ldev)); 2977 p_size, drbd_get_max_capacity(mdev->ldev));
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 5974d3094d97..f30f6e8d594e 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -706,8 +706,6 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
706 return 0; 706 return 0;
707} 707}
708 708
709#define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min(l, r)))
710
711/* 709/*
712 * Return a minimum chunk size of all snapshots that have the specified origin. 710 * Return a minimum chunk size of all snapshots that have the specified origin.
713 * Return zero if the origin has no snapshots. 711 * Return zero if the origin has no snapshots.
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index f9fc07d7a4b9..90267f8d64ee 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -486,11 +486,6 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
486 return 0; 486 return 0;
487} 487}
488 488
489/*
490 * Returns the minimum that is _not_ zero, unless both are zero.
491 */
492#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
493
494int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, 489int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
495 sector_t start, sector_t len, void *data) 490 sector_t start, sector_t len, void *data)
496{ 491{
diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c
index cb000c9833bb..208256e39def 100644
--- a/drivers/s390/scsi/zfcp_scsi.c
+++ b/drivers/s390/scsi/zfcp_scsi.c
@@ -681,6 +681,7 @@ void zfcp_scsi_set_prot(struct zfcp_adapter *adapter)
681 adapter->adapter_features & FSF_FEATURE_DIX_PROT_TCPIP) { 681 adapter->adapter_features & FSF_FEATURE_DIX_PROT_TCPIP) {
682 mask |= SHOST_DIX_TYPE1_PROTECTION; 682 mask |= SHOST_DIX_TYPE1_PROTECTION;
683 scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP); 683 scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP);
684 shost->sg_prot_tablesize = ZFCP_QDIO_MAX_SBALES_PER_REQ / 2;
684 shost->sg_tablesize = ZFCP_QDIO_MAX_SBALES_PER_REQ / 2; 685 shost->sg_tablesize = ZFCP_QDIO_MAX_SBALES_PER_REQ / 2;
685 shost->max_sectors = ZFCP_QDIO_MAX_SBALES_PER_REQ * 8 / 2; 686 shost->max_sectors = ZFCP_QDIO_MAX_SBALES_PER_REQ * 8 / 2;
686 } 687 }
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 8a8f803439e1..10478153641b 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -376,6 +376,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
376 shost->this_id = sht->this_id; 376 shost->this_id = sht->this_id;
377 shost->can_queue = sht->can_queue; 377 shost->can_queue = sht->can_queue;
378 shost->sg_tablesize = sht->sg_tablesize; 378 shost->sg_tablesize = sht->sg_tablesize;
379 shost->sg_prot_tablesize = sht->sg_prot_tablesize;
379 shost->cmd_per_lun = sht->cmd_per_lun; 380 shost->cmd_per_lun = sht->cmd_per_lun;
380 shost->unchecked_isa_dma = sht->unchecked_isa_dma; 381 shost->unchecked_isa_dma = sht->unchecked_isa_dma;
381 shost->use_clustering = sht->use_clustering; 382 shost->use_clustering = sht->use_clustering;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index ee02d3838a0a..8041fe1ab179 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -968,11 +968,13 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
968 */ 968 */
969int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask) 969int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
970{ 970{
971 int error = scsi_init_sgtable(cmd->request, &cmd->sdb, gfp_mask); 971 struct request *rq = cmd->request;
972
973 int error = scsi_init_sgtable(rq, &cmd->sdb, gfp_mask);
972 if (error) 974 if (error)
973 goto err_exit; 975 goto err_exit;
974 976
975 if (blk_bidi_rq(cmd->request)) { 977 if (blk_bidi_rq(rq)) {
976 struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc( 978 struct scsi_data_buffer *bidi_sdb = kmem_cache_zalloc(
977 scsi_sdb_cache, GFP_ATOMIC); 979 scsi_sdb_cache, GFP_ATOMIC);
978 if (!bidi_sdb) { 980 if (!bidi_sdb) {
@@ -980,28 +982,28 @@ int scsi_init_io(struct scsi_cmnd *cmd, gfp_t gfp_mask)
980 goto err_exit; 982 goto err_exit;
981 } 983 }
982 984
983 cmd->request->next_rq->special = bidi_sdb; 985 rq->next_rq->special = bidi_sdb;
984 error = scsi_init_sgtable(cmd->request->next_rq, bidi_sdb, 986 error = scsi_init_sgtable(rq->next_rq, bidi_sdb, GFP_ATOMIC);
985 GFP_ATOMIC);
986 if (error) 987 if (error)
987 goto err_exit; 988 goto err_exit;
988 } 989 }
989 990
990 if (blk_integrity_rq(cmd->request)) { 991 if (blk_integrity_rq(rq)) {
991 struct scsi_data_buffer *prot_sdb = cmd->prot_sdb; 992 struct scsi_data_buffer *prot_sdb = cmd->prot_sdb;
992 int ivecs, count; 993 int ivecs, count;
993 994
994 BUG_ON(prot_sdb == NULL); 995 BUG_ON(prot_sdb == NULL);
995 ivecs = blk_rq_count_integrity_sg(cmd->request); 996 ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);
996 997
997 if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) { 998 if (scsi_alloc_sgtable(prot_sdb, ivecs, gfp_mask)) {
998 error = BLKPREP_DEFER; 999 error = BLKPREP_DEFER;
999 goto err_exit; 1000 goto err_exit;
1000 } 1001 }
1001 1002
1002 count = blk_rq_map_integrity_sg(cmd->request, 1003 count = blk_rq_map_integrity_sg(rq->q, rq->bio,
1003 prot_sdb->table.sgl); 1004 prot_sdb->table.sgl);
1004 BUG_ON(unlikely(count > ivecs)); 1005 BUG_ON(unlikely(count > ivecs));
1006 BUG_ON(unlikely(count > queue_max_integrity_segments(rq->q)));
1005 1007
1006 cmd->prot_sdb = prot_sdb; 1008 cmd->prot_sdb = prot_sdb;
1007 cmd->prot_sdb->table.nents = count; 1009 cmd->prot_sdb->table.nents = count;
@@ -1625,6 +1627,14 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
1625 blk_queue_max_segments(q, min_t(unsigned short, shost->sg_tablesize, 1627 blk_queue_max_segments(q, min_t(unsigned short, shost->sg_tablesize,
1626 SCSI_MAX_SG_CHAIN_SEGMENTS)); 1628 SCSI_MAX_SG_CHAIN_SEGMENTS));
1627 1629
1630 if (scsi_host_prot_dma(shost)) {
1631 shost->sg_prot_tablesize =
1632 min_not_zero(shost->sg_prot_tablesize,
1633 (unsigned short)SCSI_MAX_PROT_SG_SEGMENTS);
1634 BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize);
1635 blk_queue_max_integrity_segments(q, shost->sg_prot_tablesize);
1636 }
1637
1628 blk_queue_max_hw_sectors(q, shost->max_sectors); 1638 blk_queue_max_hw_sectors(q, shost->max_sectors);
1629 blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost)); 1639 blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
1630 blk_queue_segment_boundary(q, shost->dma_boundary); 1640 blk_queue_segment_boundary(q, shost->dma_boundary);
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index c3f67373a4f8..20ad59dff730 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -251,6 +251,7 @@ shost_rd_attr(host_busy, "%hu\n");
251shost_rd_attr(cmd_per_lun, "%hd\n"); 251shost_rd_attr(cmd_per_lun, "%hd\n");
252shost_rd_attr(can_queue, "%hd\n"); 252shost_rd_attr(can_queue, "%hd\n");
253shost_rd_attr(sg_tablesize, "%hu\n"); 253shost_rd_attr(sg_tablesize, "%hu\n");
254shost_rd_attr(sg_prot_tablesize, "%hu\n");
254shost_rd_attr(unchecked_isa_dma, "%d\n"); 255shost_rd_attr(unchecked_isa_dma, "%d\n");
255shost_rd_attr(prot_capabilities, "%u\n"); 256shost_rd_attr(prot_capabilities, "%u\n");
256shost_rd_attr(prot_guard_type, "%hd\n"); 257shost_rd_attr(prot_guard_type, "%hd\n");
@@ -262,6 +263,7 @@ static struct attribute *scsi_sysfs_shost_attrs[] = {
262 &dev_attr_cmd_per_lun.attr, 263 &dev_attr_cmd_per_lun.attr,
263 &dev_attr_can_queue.attr, 264 &dev_attr_can_queue.attr,
264 &dev_attr_sg_tablesize.attr, 265 &dev_attr_sg_tablesize.attr,
266 &dev_attr_sg_prot_tablesize.attr,
265 &dev_attr_unchecked_isa_dma.attr, 267 &dev_attr_unchecked_isa_dma.attr,
266 &dev_attr_proc_name.attr, 268 &dev_attr_proc_name.attr,
267 &dev_attr_scan.attr, 269 &dev_attr_scan.attr,
diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c
index 84be62149c6c..0cb39ff21171 100644
--- a/drivers/scsi/sd_dif.c
+++ b/drivers/scsi/sd_dif.c
@@ -375,21 +375,20 @@ int sd_dif_prepare(struct request *rq, sector_t hw_sector, unsigned int sector_s
375 unsigned int i, j; 375 unsigned int i, j;
376 u32 phys, virt; 376 u32 phys, virt;
377 377
378 /* Already remapped? */
379 if (rq->cmd_flags & REQ_INTEGRITY)
380 return 0;
381
382 sdkp = rq->bio->bi_bdev->bd_disk->private_data; 378 sdkp = rq->bio->bi_bdev->bd_disk->private_data;
383 379
384 if (sdkp->protection_type == SD_DIF_TYPE3_PROTECTION) 380 if (sdkp->protection_type == SD_DIF_TYPE3_PROTECTION)
385 return 0; 381 return 0;
386 382
387 rq->cmd_flags |= REQ_INTEGRITY;
388 phys = hw_sector & 0xffffffff; 383 phys = hw_sector & 0xffffffff;
389 384
390 __rq_for_each_bio(bio, rq) { 385 __rq_for_each_bio(bio, rq) {
391 struct bio_vec *iv; 386 struct bio_vec *iv;
392 387
388 /* Already remapped? */
389 if (bio_flagged(bio, BIO_MAPPED_INTEGRITY))
390 break;
391
393 virt = bio->bi_integrity->bip_sector & 0xffffffff; 392 virt = bio->bi_integrity->bip_sector & 0xffffffff;
394 393
395 bip_for_each_vec(iv, bio->bi_integrity, i) { 394 bip_for_each_vec(iv, bio->bi_integrity, i) {
@@ -408,6 +407,8 @@ int sd_dif_prepare(struct request *rq, sector_t hw_sector, unsigned int sector_s
408 407
409 kunmap_atomic(sdt, KM_USER0); 408 kunmap_atomic(sdt, KM_USER0);
410 } 409 }
410
411 bio->bi_flags |= BIO_MAPPED_INTEGRITY;
411 } 412 }
412 413
413 return 0; 414 return 0;
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 58ec8f4efcc2..5428d53f5a13 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1660,7 +1660,7 @@ static int sg_start_req(Sg_request *srp, unsigned char *cmd)
1660 if (sg_allow_dio && hp->flags & SG_FLAG_DIRECT_IO && 1660 if (sg_allow_dio && hp->flags & SG_FLAG_DIRECT_IO &&
1661 dxfer_dir != SG_DXFER_UNKNOWN && !iov_count && 1661 dxfer_dir != SG_DXFER_UNKNOWN && !iov_count &&
1662 !sfp->parentdp->device->host->unchecked_isa_dma && 1662 !sfp->parentdp->device->host->unchecked_isa_dma &&
1663 blk_rq_aligned(q, hp->dxferp, dxfer_len)) 1663 blk_rq_aligned(q, (unsigned long)hp->dxferp, dxfer_len))
1664 md = NULL; 1664 md = NULL;
1665 else 1665 else
1666 md = &map_data; 1666 md = &map_data;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 95d8c11c929e..3f030e9efea6 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -318,7 +318,7 @@ void journal_commit_transaction(journal_t *journal)
318 int first_tag = 0; 318 int first_tag = 0;
319 int tag_flag; 319 int tag_flag;
320 int i; 320 int i;
321 int write_op = WRITE; 321 int write_op = WRITE_SYNC;
322 322
323 /* 323 /*
324 * First job: lock down the current transaction and wait for 324 * First job: lock down the current transaction and wait for
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7c068c189d80..80910f51d4b4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -360,7 +360,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
360 int tag_bytes = journal_tag_bytes(journal); 360 int tag_bytes = journal_tag_bytes(journal);
361 struct buffer_head *cbh = NULL; /* For transactional checksums */ 361 struct buffer_head *cbh = NULL; /* For transactional checksums */
362 __u32 crc32_sum = ~0; 362 __u32 crc32_sum = ~0;
363 int write_op = WRITE; 363 int write_op = WRITE_SYNC;
364 364
365 /* 365 /*
366 * First job: lock down the current transaction and wait for 366 * First job: lock down the current transaction and wait for
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 79fbf3f390f0..30f46c2cb9d5 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -352,6 +352,7 @@ static void part_release(struct device *dev)
352{ 352{
353 struct hd_struct *p = dev_to_part(dev); 353 struct hd_struct *p = dev_to_part(dev);
354 free_part_stats(p); 354 free_part_stats(p);
355 free_part_info(p);
355 kfree(p); 356 kfree(p);
356} 357}
357 358
@@ -364,17 +365,25 @@ struct device_type part_type = {
364static void delete_partition_rcu_cb(struct rcu_head *head) 365static void delete_partition_rcu_cb(struct rcu_head *head)
365{ 366{
366 struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); 367 struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
368 struct gendisk *disk = part_to_disk(part);
369 struct request_queue *q = disk->queue;
370 unsigned long flags;
367 371
368 part->start_sect = 0; 372 part->start_sect = 0;
369 part->nr_sects = 0; 373 part->nr_sects = 0;
370 part_stat_set_all(part, 0); 374 part_stat_set_all(part, 0);
371 put_device(part_to_dev(part)); 375 put_device(part_to_dev(part));
376
377 spin_lock_irqsave(q->queue_lock, flags);
378 elv_quiesce_end(q);
379 spin_unlock_irqrestore(q->queue_lock, flags);
372} 380}
373 381
374void delete_partition(struct gendisk *disk, int partno) 382void delete_partition(struct gendisk *disk, int partno)
375{ 383{
376 struct disk_part_tbl *ptbl = disk->part_tbl; 384 struct disk_part_tbl *ptbl = disk->part_tbl;
377 struct hd_struct *part; 385 struct hd_struct *part;
386 struct request_queue *q = disk->queue;
378 387
379 if (partno >= ptbl->len) 388 if (partno >= ptbl->len)
380 return; 389 return;
@@ -389,6 +398,10 @@ void delete_partition(struct gendisk *disk, int partno)
389 kobject_put(part->holder_dir); 398 kobject_put(part->holder_dir);
390 device_del(part_to_dev(part)); 399 device_del(part_to_dev(part));
391 400
401 spin_lock_irq(q->queue_lock);
402 elv_quiesce_start(q);
403 spin_unlock_irq(q->queue_lock);
404
392 call_rcu(&part->rcu_head, delete_partition_rcu_cb); 405 call_rcu(&part->rcu_head, delete_partition_rcu_cb);
393} 406}
394 407
@@ -401,7 +414,8 @@ static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
401 whole_disk_show, NULL); 414 whole_disk_show, NULL);
402 415
403struct hd_struct *add_partition(struct gendisk *disk, int partno, 416struct hd_struct *add_partition(struct gendisk *disk, int partno,
404 sector_t start, sector_t len, int flags) 417 sector_t start, sector_t len, int flags,
418 struct partition_meta_info *info)
405{ 419{
406 struct hd_struct *p; 420 struct hd_struct *p;
407 dev_t devt = MKDEV(0, 0); 421 dev_t devt = MKDEV(0, 0);
@@ -438,6 +452,14 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
438 p->partno = partno; 452 p->partno = partno;
439 p->policy = get_disk_ro(disk); 453 p->policy = get_disk_ro(disk);
440 454
455 if (info) {
456 struct partition_meta_info *pinfo = alloc_part_info(disk);
457 if (!pinfo)
458 goto out_free_stats;
459 memcpy(pinfo, info, sizeof(*info));
460 p->info = pinfo;
461 }
462
441 dname = dev_name(ddev); 463 dname = dev_name(ddev);
442 if (isdigit(dname[strlen(dname) - 1])) 464 if (isdigit(dname[strlen(dname) - 1]))
443 dev_set_name(pdev, "%sp%d", dname, partno); 465 dev_set_name(pdev, "%sp%d", dname, partno);
@@ -451,7 +473,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
451 473
452 err = blk_alloc_devt(p, &devt); 474 err = blk_alloc_devt(p, &devt);
453 if (err) 475 if (err)
454 goto out_free_stats; 476 goto out_free_info;
455 pdev->devt = devt; 477 pdev->devt = devt;
456 478
457 /* delay uevent until 'holders' subdir is created */ 479 /* delay uevent until 'holders' subdir is created */
@@ -481,6 +503,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
481 503
482 return p; 504 return p;
483 505
506out_free_info:
507 free_part_info(p);
484out_free_stats: 508out_free_stats:
485 free_part_stats(p); 509 free_part_stats(p);
486out_free: 510out_free:
@@ -642,6 +666,7 @@ rescan:
642 /* add partitions */ 666 /* add partitions */
643 for (p = 1; p < state->limit; p++) { 667 for (p = 1; p < state->limit; p++) {
644 sector_t size, from; 668 sector_t size, from;
669 struct partition_meta_info *info = NULL;
645 670
646 size = state->parts[p].size; 671 size = state->parts[p].size;
647 if (!size) 672 if (!size)
@@ -675,8 +700,12 @@ rescan:
675 size = get_capacity(disk) - from; 700 size = get_capacity(disk) - from;
676 } 701 }
677 } 702 }
703
704 if (state->parts[p].has_info)
705 info = &state->parts[p].info;
678 part = add_partition(disk, p, from, size, 706 part = add_partition(disk, p, from, size,
679 state->parts[p].flags); 707 state->parts[p].flags,
708 &state->parts[p].info);
680 if (IS_ERR(part)) { 709 if (IS_ERR(part)) {
681 printk(KERN_ERR " %s: p%d could not be added: %ld\n", 710 printk(KERN_ERR " %s: p%d could not be added: %ld\n",
682 disk->disk_name, p, -PTR_ERR(part)); 711 disk->disk_name, p, -PTR_ERR(part));
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 8e4e103ba216..d68bf4dc3bc2 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -1,5 +1,6 @@
1#include <linux/pagemap.h> 1#include <linux/pagemap.h>
2#include <linux/blkdev.h> 2#include <linux/blkdev.h>
3#include <linux/genhd.h>
3 4
4/* 5/*
5 * add_gd_partition adds a partitions details to the devices partition 6 * add_gd_partition adds a partitions details to the devices partition
@@ -12,6 +13,8 @@ struct parsed_partitions {
12 sector_t from; 13 sector_t from;
13 sector_t size; 14 sector_t size;
14 int flags; 15 int flags;
16 bool has_info;
17 struct partition_meta_info info;
15 } parts[DISK_MAX_PARTS]; 18 } parts[DISK_MAX_PARTS];
16 int next; 19 int next;
17 int limit; 20 int limit;
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index dbb44d4bb8a7..ac0ccb5026a2 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -94,6 +94,7 @@
94 * 94 *
95 ************************************************************/ 95 ************************************************************/
96#include <linux/crc32.h> 96#include <linux/crc32.h>
97#include <linux/ctype.h>
97#include <linux/math64.h> 98#include <linux/math64.h>
98#include <linux/slab.h> 99#include <linux/slab.h>
99#include "check.h" 100#include "check.h"
@@ -604,6 +605,7 @@ int efi_partition(struct parsed_partitions *state)
604 gpt_entry *ptes = NULL; 605 gpt_entry *ptes = NULL;
605 u32 i; 606 u32 i;
606 unsigned ssz = bdev_logical_block_size(state->bdev) / 512; 607 unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
608 u8 unparsed_guid[37];
607 609
608 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { 610 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
609 kfree(gpt); 611 kfree(gpt);
@@ -614,6 +616,9 @@ int efi_partition(struct parsed_partitions *state)
614 pr_debug("GUID Partition Table is valid! Yea!\n"); 616 pr_debug("GUID Partition Table is valid! Yea!\n");
615 617
616 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { 618 for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
619 struct partition_meta_info *info;
620 unsigned label_count = 0;
621 unsigned label_max;
617 u64 start = le64_to_cpu(ptes[i].starting_lba); 622 u64 start = le64_to_cpu(ptes[i].starting_lba);
618 u64 size = le64_to_cpu(ptes[i].ending_lba) - 623 u64 size = le64_to_cpu(ptes[i].ending_lba) -
619 le64_to_cpu(ptes[i].starting_lba) + 1ULL; 624 le64_to_cpu(ptes[i].starting_lba) + 1ULL;
@@ -627,6 +632,26 @@ int efi_partition(struct parsed_partitions *state)
627 if (!efi_guidcmp(ptes[i].partition_type_guid, 632 if (!efi_guidcmp(ptes[i].partition_type_guid,
628 PARTITION_LINUX_RAID_GUID)) 633 PARTITION_LINUX_RAID_GUID))
629 state->parts[i + 1].flags = ADDPART_FLAG_RAID; 634 state->parts[i + 1].flags = ADDPART_FLAG_RAID;
635
636 info = &state->parts[i + 1].info;
637 /* Instead of doing a manual swap to big endian, reuse the
638 * common ASCII hex format as the interim.
639 */
640 efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid);
641 part_pack_uuid(unparsed_guid, info->uuid);
642
643 /* Naively convert UTF16-LE to 7 bits. */
644 label_max = min(sizeof(info->volname) - 1,
645 sizeof(ptes[i].partition_name));
646 info->volname[label_max] = 0;
647 while (label_count < label_max) {
648 u8 c = ptes[i].partition_name[label_count] & 0xff;
649 if (c && !isprint(c))
650 c = '!';
651 info->volname[label_count] = c;
652 label_count++;
653 }
654 state->parts[i + 1].has_info = true;
630 } 655 }
631 kfree(ptes); 656 kfree(ptes);
632 kfree(gpt); 657 kfree(gpt);
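For readers following the GPT hunk above: the volume-label handling simply keeps the low byte of each UTF-16LE code unit and substitutes '!' for anything unprintable, as the "Naively convert UTF16-LE to 7 bits" comment says. A minimal userspace sketch of that conversion (illustrative only, not kernel code; the standard isprint() plays the role of the kernel's ctype helper):

/* Sketch of the naive UTF-16LE -> 7-bit label conversion used for
 * GPT partition names above. Userspace illustration, not kernel code. */
#include <ctype.h>
#include <stdio.h>

int main(void)
{
	/* "boot" as it would appear in a GPT entry (UTF-16LE code units) */
	unsigned char name_utf16le[] = { 'b', 0, 'o', 0, 'o', 0, 't', 0, 0, 0 };
	char volname[8] = "";
	size_t i, max = sizeof(volname) - 1;

	for (i = 0; i < max && i * 2 < sizeof(name_utf16le); i++) {
		unsigned char c = name_utf16le[i * 2];	/* keep the low byte only */
		if (c && !isprint(c))
			c = '!';			/* mask unprintable characters */
		volname[i] = c;
	}
	printf("%s\n", volname);	/* prints "boot" */
	return 0;
}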
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5274103434ad..ba679992d39b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -346,8 +346,15 @@ static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
346} 346}
347 347
348#else 348#else
349#define bvec_kmap_irq(bvec, flags) (page_address((bvec)->bv_page) + (bvec)->bv_offset) 349static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
350#define bvec_kunmap_irq(buf, flags) do { *(flags) = 0; } while (0) 350{
351 return page_address(bvec->bv_page) + bvec->bv_offset;
352}
353
354static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
355{
356 *flags = 0;
357}
351#endif 358#endif
352 359
353static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, 360static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx,
@@ -496,6 +503,10 @@ static inline struct bio *bio_list_get(struct bio_list *bl)
496#define bip_for_each_vec(bvl, bip, i) \ 503#define bip_for_each_vec(bvl, bip, i) \
497 __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) 504 __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx)
498 505
506#define bio_for_each_integrity_vec(_bvl, _bio, _iter) \
507 for_each_bio(_bio) \
508 bip_for_each_vec(_bvl, _bio->bi_integrity, _iter)
509
499#define bio_integrity(bio) (bio->bi_integrity != NULL) 510#define bio_integrity(bio) (bio->bi_integrity != NULL)
500 511
501extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); 512extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index ca83a97c9715..d36629620a4f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -97,6 +97,7 @@ struct bio {
97#define BIO_NULL_MAPPED 9 /* contains invalid user pages */ 97#define BIO_NULL_MAPPED 9 /* contains invalid user pages */
98#define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */ 98#define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */
99#define BIO_QUIET 11 /* Make BIO Quiet */ 99#define BIO_QUIET 11 /* Make BIO Quiet */
100#define BIO_MAPPED_INTEGRITY 12/* integrity metadata has been remapped */
100#define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) 101#define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
101 102
102/* 103/*
@@ -130,6 +131,8 @@ enum rq_flag_bits {
130 /* bio only flags */ 131 /* bio only flags */
131 __REQ_UNPLUG, /* unplug the immediately after submission */ 132 __REQ_UNPLUG, /* unplug the immediately after submission */
132 __REQ_RAHEAD, /* read ahead, can fail anytime */ 133 __REQ_RAHEAD, /* read ahead, can fail anytime */
134 __REQ_THROTTLED, /* This bio has already been subjected to
135 * throttling rules. Don't do it again. */
133 136
134 /* request only flags */ 137 /* request only flags */
135 __REQ_SORTED, /* elevator knows about this request */ 138 __REQ_SORTED, /* elevator knows about this request */
@@ -146,7 +149,6 @@ enum rq_flag_bits {
146 __REQ_ORDERED_COLOR, /* is before or after barrier */ 149 __REQ_ORDERED_COLOR, /* is before or after barrier */
147 __REQ_ALLOCED, /* request came from our alloc pool */ 150 __REQ_ALLOCED, /* request came from our alloc pool */
148 __REQ_COPY_USER, /* contains copies of user pages */ 151 __REQ_COPY_USER, /* contains copies of user pages */
149 __REQ_INTEGRITY, /* integrity metadata has been remapped */
150 __REQ_FLUSH, /* request for cache flush */ 152 __REQ_FLUSH, /* request for cache flush */
151 __REQ_IO_STAT, /* account I/O stat */ 153 __REQ_IO_STAT, /* account I/O stat */
152 __REQ_MIXED_MERGE, /* merge of different types, fail separately */ 154 __REQ_MIXED_MERGE, /* merge of different types, fail separately */
@@ -172,6 +174,7 @@ enum rq_flag_bits {
172 174
173#define REQ_UNPLUG (1 << __REQ_UNPLUG) 175#define REQ_UNPLUG (1 << __REQ_UNPLUG)
174#define REQ_RAHEAD (1 << __REQ_RAHEAD) 176#define REQ_RAHEAD (1 << __REQ_RAHEAD)
177#define REQ_THROTTLED (1 << __REQ_THROTTLED)
175 178
176#define REQ_SORTED (1 << __REQ_SORTED) 179#define REQ_SORTED (1 << __REQ_SORTED)
177#define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) 180#define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER)
@@ -187,7 +190,6 @@ enum rq_flag_bits {
187#define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR) 190#define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR)
188#define REQ_ALLOCED (1 << __REQ_ALLOCED) 191#define REQ_ALLOCED (1 << __REQ_ALLOCED)
189#define REQ_COPY_USER (1 << __REQ_COPY_USER) 192#define REQ_COPY_USER (1 << __REQ_COPY_USER)
190#define REQ_INTEGRITY (1 << __REQ_INTEGRITY)
191#define REQ_FLUSH (1 << __REQ_FLUSH) 193#define REQ_FLUSH (1 << __REQ_FLUSH)
192#define REQ_IO_STAT (1 << __REQ_IO_STAT) 194#define REQ_IO_STAT (1 << __REQ_IO_STAT)
193#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) 195#define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2c54906f678f..16f7f1be1acf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -115,6 +115,7 @@ struct request {
115 void *elevator_private3; 115 void *elevator_private3;
116 116
117 struct gendisk *rq_disk; 117 struct gendisk *rq_disk;
118 struct hd_struct *part;
118 unsigned long start_time; 119 unsigned long start_time;
119#ifdef CONFIG_BLK_CGROUP 120#ifdef CONFIG_BLK_CGROUP
120 unsigned long long start_time_ns; 121 unsigned long long start_time_ns;
@@ -124,6 +125,9 @@ struct request {
124 * physical address coalescing is performed. 125 * physical address coalescing is performed.
125 */ 126 */
126 unsigned short nr_phys_segments; 127 unsigned short nr_phys_segments;
128#if defined(CONFIG_BLK_DEV_INTEGRITY)
129 unsigned short nr_integrity_segments;
130#endif
127 131
128 unsigned short ioprio; 132 unsigned short ioprio;
129 133
@@ -243,6 +247,7 @@ struct queue_limits {
243 247
244 unsigned short logical_block_size; 248 unsigned short logical_block_size;
245 unsigned short max_segments; 249 unsigned short max_segments;
250 unsigned short max_integrity_segments;
246 251
247 unsigned char misaligned; 252 unsigned char misaligned;
248 unsigned char discard_misaligned; 253 unsigned char discard_misaligned;
@@ -367,6 +372,11 @@ struct request_queue
367#if defined(CONFIG_BLK_DEV_BSG) 372#if defined(CONFIG_BLK_DEV_BSG)
368 struct bsg_class_device bsg_dev; 373 struct bsg_class_device bsg_dev;
369#endif 374#endif
375
376#ifdef CONFIG_BLK_DEV_THROTTLING
377 /* Throttle data */
378 struct throtl_data *td;
379#endif
370}; 380};
371 381
372#define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */ 382#define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */
@@ -851,7 +861,7 @@ extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
851extern void blk_queue_max_discard_sectors(struct request_queue *q, 861extern void blk_queue_max_discard_sectors(struct request_queue *q,
852 unsigned int max_discard_sectors); 862 unsigned int max_discard_sectors);
853extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); 863extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
854extern void blk_queue_physical_block_size(struct request_queue *, unsigned short); 864extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
855extern void blk_queue_alignment_offset(struct request_queue *q, 865extern void blk_queue_alignment_offset(struct request_queue *q,
856 unsigned int alignment); 866 unsigned int alignment);
857extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); 867extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
@@ -1004,7 +1014,7 @@ static inline unsigned int queue_physical_block_size(struct request_queue *q)
1004 return q->limits.physical_block_size; 1014 return q->limits.physical_block_size;
1005} 1015}
1006 1016
1007static inline int bdev_physical_block_size(struct block_device *bdev) 1017static inline unsigned int bdev_physical_block_size(struct block_device *bdev)
1008{ 1018{
1009 return queue_physical_block_size(bdev_get_queue(bdev)); 1019 return queue_physical_block_size(bdev_get_queue(bdev));
1010} 1020}
@@ -1093,11 +1103,11 @@ static inline int queue_dma_alignment(struct request_queue *q)
1093 return q ? q->dma_alignment : 511; 1103 return q ? q->dma_alignment : 511;
1094} 1104}
1095 1105
1096static inline int blk_rq_aligned(struct request_queue *q, void *addr, 1106static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
1097 unsigned int len) 1107 unsigned int len)
1098{ 1108{
1099 unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask; 1109 unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask;
1100 return !((unsigned long)addr & alignment) && !(len & alignment); 1110 return !(addr & alignment) && !(len & alignment);
1101} 1111}
1102 1112
1103/* assumes size > 256 */ 1113/* assumes size > 256 */
@@ -1127,6 +1137,7 @@ static inline void put_dev_sector(Sector p)
1127 1137
1128struct work_struct; 1138struct work_struct;
1129int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); 1139int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
1140int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay);
1130 1141
1131#ifdef CONFIG_BLK_CGROUP 1142#ifdef CONFIG_BLK_CGROUP
1132/* 1143/*
@@ -1170,6 +1181,24 @@ static inline uint64_t rq_io_start_time_ns(struct request *req)
1170} 1181}
1171#endif 1182#endif
1172 1183
1184#ifdef CONFIG_BLK_DEV_THROTTLING
1185extern int blk_throtl_init(struct request_queue *q);
1186extern void blk_throtl_exit(struct request_queue *q);
1187extern int blk_throtl_bio(struct request_queue *q, struct bio **bio);
1188extern void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay);
1189extern void throtl_shutdown_timer_wq(struct request_queue *q);
1190#else /* CONFIG_BLK_DEV_THROTTLING */
1191static inline int blk_throtl_bio(struct request_queue *q, struct bio **bio)
1192{
1193 return 0;
1194}
1195
1196static inline int blk_throtl_init(struct request_queue *q) { return 0; }
1197static inline int blk_throtl_exit(struct request_queue *q) { return 0; }
1198static inline void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay) {}
1199static inline void throtl_shutdown_timer_wq(struct request_queue *q) {}
1200#endif /* CONFIG_BLK_DEV_THROTTLING */
1201
1173#define MODULE_ALIAS_BLOCKDEV(major,minor) \ 1202#define MODULE_ALIAS_BLOCKDEV(major,minor) \
1174 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) 1203 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
1175#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ 1204#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
@@ -1213,8 +1242,13 @@ struct blk_integrity {
1213extern int blk_integrity_register(struct gendisk *, struct blk_integrity *); 1242extern int blk_integrity_register(struct gendisk *, struct blk_integrity *);
1214extern void blk_integrity_unregister(struct gendisk *); 1243extern void blk_integrity_unregister(struct gendisk *);
1215extern int blk_integrity_compare(struct gendisk *, struct gendisk *); 1244extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
1216extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); 1245extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
1217extern int blk_rq_count_integrity_sg(struct request *); 1246 struct scatterlist *);
1247extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
1248extern int blk_integrity_merge_rq(struct request_queue *, struct request *,
1249 struct request *);
1250extern int blk_integrity_merge_bio(struct request_queue *, struct request *,
1251 struct bio *);
1218 1252
1219static inline 1253static inline
1220struct blk_integrity *bdev_get_integrity(struct block_device *bdev) 1254struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
@@ -1235,16 +1269,32 @@ static inline int blk_integrity_rq(struct request *rq)
1235 return bio_integrity(rq->bio); 1269 return bio_integrity(rq->bio);
1236} 1270}
1237 1271
1272static inline void blk_queue_max_integrity_segments(struct request_queue *q,
1273 unsigned int segs)
1274{
1275 q->limits.max_integrity_segments = segs;
1276}
1277
1278static inline unsigned short
1279queue_max_integrity_segments(struct request_queue *q)
1280{
1281 return q->limits.max_integrity_segments;
1282}
1283
1238#else /* CONFIG_BLK_DEV_INTEGRITY */ 1284#else /* CONFIG_BLK_DEV_INTEGRITY */
1239 1285
1240#define blk_integrity_rq(rq) (0) 1286#define blk_integrity_rq(rq) (0)
1241#define blk_rq_count_integrity_sg(a) (0) 1287#define blk_rq_count_integrity_sg(a, b) (0)
1242#define blk_rq_map_integrity_sg(a, b) (0) 1288#define blk_rq_map_integrity_sg(a, b, c) (0)
1243#define bdev_get_integrity(a) (0) 1289#define bdev_get_integrity(a) (0)
1244#define blk_get_integrity(a) (0) 1290#define blk_get_integrity(a) (0)
1245#define blk_integrity_compare(a, b) (0) 1291#define blk_integrity_compare(a, b) (0)
1246#define blk_integrity_register(a, b) (0) 1292#define blk_integrity_register(a, b) (0)
1247#define blk_integrity_unregister(a) do { } while (0); 1293#define blk_integrity_unregister(a) do { } while (0);
1294#define blk_queue_max_integrity_segments(a, b) do { } while (0);
1295#define queue_max_integrity_segments(a) (0)
1296#define blk_integrity_merge_rq(a, b, c) (0)
1297#define blk_integrity_merge_bio(a, b, c) (0)
1248 1298
1249#endif /* CONFIG_BLK_DEV_INTEGRITY */ 1299#endif /* CONFIG_BLK_DEV_INTEGRITY */
1250 1300
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 4fd978e7eb83..80a0ece8f7e4 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -122,6 +122,8 @@ extern void elv_completed_request(struct request_queue *, struct request *);
122extern int elv_set_request(struct request_queue *, struct request *, gfp_t); 122extern int elv_set_request(struct request_queue *, struct request *, gfp_t);
123extern void elv_put_request(struct request_queue *, struct request *); 123extern void elv_put_request(struct request_queue *, struct request *);
124extern void elv_drain_elevator(struct request_queue *); 124extern void elv_drain_elevator(struct request_queue *);
125extern void elv_quiesce_start(struct request_queue *);
126extern void elv_quiesce_end(struct request_queue *);
125 127
126/* 128/*
127 * io scheduler registration 129 * io scheduler registration
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index af3f06b41dc1..557c3927e70f 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -12,6 +12,7 @@
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/kdev_t.h> 13#include <linux/kdev_t.h>
14#include <linux/rcupdate.h> 14#include <linux/rcupdate.h>
15#include <linux/slab.h>
15 16
16#ifdef CONFIG_BLOCK 17#ifdef CONFIG_BLOCK
17 18
@@ -86,7 +87,15 @@ struct disk_stats {
86 unsigned long io_ticks; 87 unsigned long io_ticks;
87 unsigned long time_in_queue; 88 unsigned long time_in_queue;
88}; 89};
89 90
91#define PARTITION_META_INFO_VOLNAMELTH 64
92#define PARTITION_META_INFO_UUIDLTH 16
93
94struct partition_meta_info {
95 u8 uuid[PARTITION_META_INFO_UUIDLTH]; /* always big endian */
96 u8 volname[PARTITION_META_INFO_VOLNAMELTH];
97};
98
90struct hd_struct { 99struct hd_struct {
91 sector_t start_sect; 100 sector_t start_sect;
92 sector_t nr_sects; 101 sector_t nr_sects;
@@ -95,6 +104,7 @@ struct hd_struct {
95 struct device __dev; 104 struct device __dev;
96 struct kobject *holder_dir; 105 struct kobject *holder_dir;
97 int policy, partno; 106 int policy, partno;
107 struct partition_meta_info *info;
98#ifdef CONFIG_FAIL_MAKE_REQUEST 108#ifdef CONFIG_FAIL_MAKE_REQUEST
99 int make_it_fail; 109 int make_it_fail;
100#endif 110#endif
@@ -130,6 +140,7 @@ struct disk_part_tbl {
130 struct rcu_head rcu_head; 140 struct rcu_head rcu_head;
131 int len; 141 int len;
132 struct hd_struct __rcu *last_lookup; 142 struct hd_struct __rcu *last_lookup;
143 struct gendisk *disk;
133 struct hd_struct __rcu *part[]; 144 struct hd_struct __rcu *part[];
134}; 145};
135 146
@@ -181,6 +192,30 @@ static inline struct gendisk *part_to_disk(struct hd_struct *part)
181 return NULL; 192 return NULL;
182} 193}
183 194
195static inline void part_pack_uuid(const u8 *uuid_str, u8 *to)
196{
197 int i;
198 for (i = 0; i < 16; ++i) {
199 *to++ = (hex_to_bin(*uuid_str) << 4) |
200 (hex_to_bin(*(uuid_str + 1)));
201 uuid_str += 2;
202 switch (i) {
203 case 3:
204 case 5:
205 case 7:
206 case 9:
207 uuid_str++;
208 continue;
209 }
210 }
211}
212
213static inline char *part_unpack_uuid(const u8 *uuid, char *out)
214{
215 sprintf(out, "%pU", uuid);
216 return out;
217}
218
184static inline int disk_max_parts(struct gendisk *disk) 219static inline int disk_max_parts(struct gendisk *disk)
185{ 220{
186 if (disk->flags & GENHD_FL_EXT_DEVT) 221 if (disk->flags & GENHD_FL_EXT_DEVT)
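A quick illustration of the packing convention implemented by part_pack_uuid()/part_unpack_uuid() above: two hex characters per byte, with the '-' separator skipped after bytes 3, 5, 7 and 9, and the reverse direction printed via the "%pU" format. The following is a hedged userspace sketch of the same scheme; hex_nibble() is only a stand-in for the kernel's hex_to_bin(), and the plain "%02x" loop stands in for "%pU":

/* Userspace sketch of the part_pack_uuid() packing convention. */
#include <stdio.h>

static int hex_nibble(char c)
{
	if (c >= '0' && c <= '9') return c - '0';
	if (c >= 'a' && c <= 'f') return c - 'a' + 10;
	if (c >= 'A' && c <= 'F') return c - 'A' + 10;
	return -1;
}

static void pack_uuid(const char *s, unsigned char *to)
{
	int i;

	for (i = 0; i < 16; ++i) {
		*to++ = (hex_nibble(s[0]) << 4) | hex_nibble(s[1]);
		s += 2;
		if (i == 3 || i == 5 || i == 7 || i == 9)
			s++;	/* skip the '-' after bytes 3, 5, 7 and 9 */
	}
}

int main(void)
{
	const char *txt = "00112233-4455-6677-8899-aabbccddeeff";
	unsigned char uuid[16];
	int i;

	pack_uuid(txt, uuid);
	for (i = 0; i < 16; ++i)
		printf("%02x", uuid[i]);	/* 00112233445566778899aabbccddeeff */
	printf("\n");
	return 0;
}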
@@ -342,6 +377,19 @@ static inline int part_in_flight(struct hd_struct *part)
342 return part->in_flight[0] + part->in_flight[1]; 377 return part->in_flight[0] + part->in_flight[1];
343} 378}
344 379
380static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk)
381{
382 if (disk)
383 return kzalloc_node(sizeof(struct partition_meta_info),
384 GFP_KERNEL, disk->node_id);
385 return kzalloc(sizeof(struct partition_meta_info), GFP_KERNEL);
386}
387
388static inline void free_part_info(struct hd_struct *part)
389{
390 kfree(part->info);
391}
392
345/* block/blk-core.c */ 393/* block/blk-core.c */
346extern void part_round_stats(int cpu, struct hd_struct *part); 394extern void part_round_stats(int cpu, struct hd_struct *part);
347 395
@@ -533,7 +581,9 @@ extern int disk_expand_part_tbl(struct gendisk *disk, int target);
533extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev); 581extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev);
534extern struct hd_struct * __must_check add_partition(struct gendisk *disk, 582extern struct hd_struct * __must_check add_partition(struct gendisk *disk,
535 int partno, sector_t start, 583 int partno, sector_t start,
536 sector_t len, int flags); 584 sector_t len, int flags,
585 struct partition_meta_info
586 *info);
537extern void delete_partition(struct gendisk *, int); 587extern void delete_partition(struct gendisk *, int);
538extern void printk_all_partitions(void); 588extern void printk_all_partitions(void);
539 589
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 1759ba5adce8..edef168a0406 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -652,6 +652,16 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
652 _max1 > _max2 ? _max1 : _max2; }) 652 _max1 > _max2 ? _max1 : _max2; })
653 653
654/** 654/**
655 * min_not_zero - return the minimum that is _not_ zero, unless both are zero
656 * @x: value1
657 * @y: value2
658 */
659#define min_not_zero(x, y) ({ \
660 typeof(x) __x = (x); \
661 typeof(y) __y = (y); \
662 __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
663
664/**
655 * clamp - return a value clamped to a given range with strict typechecking 665 * clamp - return a value clamped to a given range with strict typechecking
656 * @val: current value 666 * @val: current value
657 * @min: minimum allowable value 667 * @min: minimum allowable value
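The min_not_zero() helper added here replaces the three per-driver copies removed earlier in this diff (drbd, dm-snap, dm-table). A small userspace illustration of its semantics, which come up when combining limits where zero means "no limit"; note that this simplified version, unlike the typeof-based kernel macro above, may evaluate its arguments more than once:

/* Userspace illustration of min_not_zero() semantics. Simplified: the
 * kernel macro above also avoids double evaluation via typeof. */
#include <stdio.h>

#define min(a, b) ((a) < (b) ? (a) : (b))
#define min_not_zero(x, y) \
	((x) == 0 ? (y) : ((y) == 0 ? (x) : min((x), (y))))

int main(void)
{
	printf("%u\n", min_not_zero(0u, 512u));    /* 512: zero means no limit */
	printf("%u\n", min_not_zero(4096u, 0u));   /* 4096 */
	printf("%u\n", min_not_zero(4096u, 512u)); /* 512: both set, take the smaller */
	printf("%u\n", min_not_zero(0u, 0u));      /* 0: both unlimited */
	return 0;
}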
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0383601a927c..56154bbb8da9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -336,6 +336,9 @@ extern unsigned long sysctl_hung_task_warnings;
336extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, 336extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
337 void __user *buffer, 337 void __user *buffer,
338 size_t *lenp, loff_t *ppos); 338 size_t *lenp, loff_t *ppos);
339#else
340/* Avoid need for ifdefs elsewhere in the code */
341enum { sysctl_hung_task_timeout_secs = 0 };
339#endif 342#endif
340 343
341/* Attach to any functions which should be ignored in wchan output. */ 344/* Attach to any functions which should be ignored in wchan output. */
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index 8fcb6e0e9e72..d63533a4a59e 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -32,6 +32,12 @@ struct scsi_cmnd;
32#endif 32#endif
33 33
34/* 34/*
35 * DIX-capable adapters effectively support infinite chaining for the
36 * protection information scatterlist
37 */
38#define SCSI_MAX_PROT_SG_SEGMENTS 0xFFFF
39
40/*
35 * Special value for scanning to specify scanning or rescanning of all 41 * Special value for scanning to specify scanning or rescanning of all
36 * possible channels, (target) ids, or luns on a given shost. 42 * possible channels, (target) ids, or luns on a given shost.
37 */ 43 */
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index b7bdecb7b76e..d0a6a845f204 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -388,6 +388,7 @@ struct scsi_host_template {
388 * of scatter-gather. 388 * of scatter-gather.
389 */ 389 */
390 unsigned short sg_tablesize; 390 unsigned short sg_tablesize;
391 unsigned short sg_prot_tablesize;
391 392
392 /* 393 /*
393 * Set this if the host adapter has limitations beside segment count. 394 * Set this if the host adapter has limitations beside segment count.
@@ -599,6 +600,7 @@ struct Scsi_Host {
599 int can_queue; 600 int can_queue;
600 short cmd_per_lun; 601 short cmd_per_lun;
601 short unsigned int sg_tablesize; 602 short unsigned int sg_tablesize;
603 short unsigned int sg_prot_tablesize;
602 short unsigned int max_sectors; 604 short unsigned int max_sectors;
603 unsigned long dma_boundary; 605 unsigned long dma_boundary;
604 /* 606 /*
@@ -823,6 +825,11 @@ static inline unsigned int scsi_host_get_prot(struct Scsi_Host *shost)
823 return shost->prot_capabilities; 825 return shost->prot_capabilities;
824} 826}
825 827
828static inline int scsi_host_prot_dma(struct Scsi_Host *shost)
829{
830 return shost->prot_capabilities >= SHOST_DIX_TYPE0_PROTECTION;
831}
832
826static inline unsigned int scsi_host_dif_capable(struct Scsi_Host *shost, unsigned int target_type) 833static inline unsigned int scsi_host_dif_capable(struct Scsi_Host *shost, unsigned int target_type)
827{ 834{
828 static unsigned char cap[] = { 0, 835 static unsigned char cap[] = { 0,
diff --git a/init/Kconfig b/init/Kconfig
index be85a0ab1b82..bd125a795374 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -661,11 +661,14 @@ config BLK_CGROUP
661 661
662 Currently, CFQ IO scheduler uses it to recognize task groups and 662 Currently, CFQ IO scheduler uses it to recognize task groups and
663 control disk bandwidth allocation (proportional time slice allocation) 663 control disk bandwidth allocation (proportional time slice allocation)
 664 to such task groups. 664 to such task groups. It is also used by the bio throttling logic in
 665 the block layer to implement an upper limit on IO rates on a device.
665 666
666 This option only enables generic Block IO controller infrastructure. 667 This option only enables generic Block IO controller infrastructure.
667 One needs to also enable actual IO controlling logic in CFQ for it 668 One needs to also enable actual IO controlling logic/policy. For
 668 to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y). 669 enabling proportional weight division of disk bandwidth in CFQ set
670 CONFIG_CFQ_GROUP_IOSCHED=y and for enabling throttling policy set
 671 CONFIG_BLK_DEV_THROTTLING=y.
669 672
670 See Documentation/cgroups/blkio-controller.txt for more information. 673 See Documentation/cgroups/blkio-controller.txt for more information.
671 674
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 02e3ca4fc527..42db0551c3aa 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -58,6 +58,62 @@ static int __init readwrite(char *str)
58__setup("ro", readonly); 58__setup("ro", readonly);
59__setup("rw", readwrite); 59__setup("rw", readwrite);
60 60
61#ifdef CONFIG_BLOCK
62/**
63 * match_dev_by_uuid - callback for finding a partition using its uuid
64 * @dev: device passed in by the caller
65 * @data: opaque pointer to a 36 byte char array with a UUID
66 *
67 * Returns 1 if the device matches, and 0 otherwise.
68 */
69static int match_dev_by_uuid(struct device *dev, void *data)
70{
71 u8 *uuid = data;
72 struct hd_struct *part = dev_to_part(dev);
73
74 if (!part->info)
75 goto no_match;
76
77 if (memcmp(uuid, part->info->uuid, sizeof(part->info->uuid)))
78 goto no_match;
79
80 return 1;
81no_match:
82 return 0;
83}
84
85
86/**
87 * devt_from_partuuid - looks up the dev_t of a partition by its UUID
88 * @uuid: 36 byte char array containing a hex ascii UUID
89 *
90 * The function will return the first partition which contains a matching
91 * UUID value in its partition_meta_info struct. This does not search
92 * by filesystem UUIDs.
93 *
94 * Returns the matching dev_t on success or 0 on failure.
95 */
96static dev_t __init devt_from_partuuid(char *uuid_str)
97{
98 dev_t res = 0;
99 struct device *dev = NULL;
100 u8 uuid[16];
101
102 /* Pack the requested UUID in the expected format. */
103 part_pack_uuid(uuid_str, uuid);
104
105 dev = class_find_device(&block_class, NULL, uuid, &match_dev_by_uuid);
106 if (!dev)
107 goto done;
108
109 res = dev->devt;
110 put_device(dev);
111
112done:
113 return res;
114}
115#endif
116
61/* 117/*
62 * Convert a name into device number. We accept the following variants: 118 * Convert a name into device number. We accept the following variants:
63 * 119 *
@@ -68,6 +124,8 @@ __setup("rw", readwrite);
68 * of partition - device number of disk plus the partition number 124 * of partition - device number of disk plus the partition number
69 * 5) /dev/<disk_name>p<decimal> - same as the above, that form is 125 * 5) /dev/<disk_name>p<decimal> - same as the above, that form is
70 * used when disk name of partitioned disk ends on a digit. 126 * used when disk name of partitioned disk ends on a digit.
127 * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the
128 * unique id of a partition if the partition table provides it.
71 * 129 *
72 * If name doesn't have fall into the categories above, we return (0,0). 130 * If name doesn't have fall into the categories above, we return (0,0).
73 * block_class is used to check if something is a disk name. If the disk 131 * block_class is used to check if something is a disk name. If the disk
@@ -82,6 +140,18 @@ dev_t name_to_dev_t(char *name)
82 dev_t res = 0; 140 dev_t res = 0;
83 int part; 141 int part;
84 142
143#ifdef CONFIG_BLOCK
144 if (strncmp(name, "PARTUUID=", 9) == 0) {
145 name += 9;
146 if (strlen(name) != 36)
147 goto fail;
148 res = devt_from_partuuid(name);
149 if (!res)
150 goto fail;
151 goto done;
152 }
153#endif
154
85 if (strncmp(name, "/dev/", 5) != 0) { 155 if (strncmp(name, "/dev/", 5) != 0) {
86 unsigned maj, min; 156 unsigned maj, min;
87 157
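Taken together, the do_mounts.c additions allow the root device to be named by partition UUID on the kernel command line, e.g. root=PARTUUID=00112233-4455-6677-8899-aabbccddeeff. A hedged userspace sketch of the end-to-end lookup (prefix check, 36-character validation, packing, then a memcmp against each candidate's stored 16-byte UUID); pack_uuid() and the candidate table are illustrative stand-ins for part_pack_uuid() and the class_find_device() walk over block_class:

/* Userspace sketch of the PARTUUID= lookup flow added above. */
#include <stdio.h>
#include <string.h>

static int hex_nibble(char c)
{
	if (c >= '0' && c <= '9') return c - '0';
	if (c >= 'a' && c <= 'f') return c - 'a' + 10;
	if (c >= 'A' && c <= 'F') return c - 'A' + 10;
	return -1;
}

/* Same convention as part_pack_uuid(): two hex chars per byte, skipping
 * the '-' after bytes 3, 5, 7 and 9. */
static void pack_uuid(const char *s, unsigned char *to)
{
	int i;

	for (i = 0; i < 16; ++i) {
		*to++ = (hex_nibble(s[0]) << 4) | hex_nibble(s[1]);
		s += 2;
		if (i == 3 || i == 5 || i == 7 || i == 9)
			s++;
	}
}

struct fake_part {		/* stand-in for hd_struct + partition_meta_info */
	const char *dev;
	unsigned char uuid[16];
};

static const struct fake_part parts[] = {
	{ "/dev/sda1", { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
			 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff } },
};

int main(void)
{
	const char *name = "PARTUUID=00112233-4455-6677-8899-aabbccddeeff";
	unsigned char want[16];
	size_t i;

	if (strncmp(name, "PARTUUID=", 9) != 0)
		return 1;
	name += 9;
	if (strlen(name) != 36)	/* same sanity check name_to_dev_t() applies */
		return 1;
	pack_uuid(name, want);

	/* match_dev_by_uuid() does the equivalent memcmp per partition */
	for (i = 0; i < sizeof(parts) / sizeof(parts[0]); i++)
		if (!memcmp(want, parts[i].uuid, sizeof(parts[i].uuid))) {
			printf("matched %s\n", parts[i].dev);
			return 0;
		}
	return 1;
}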