aboutsummaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2010-10-22 20:00:32 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2010-10-22 20:00:32 -0400
commite9dd2b6837e26fe202708cce5ea4bb4ee3e3482e (patch)
treef42fd892495bfc4cbb740d06b016d267c9c42d00 /block
parent4f3a29dadaf999a273f1e7fe2476595d0283eef3 (diff)
parentb4627321e18582dcbdeb45d77df29d3177107c65 (diff)
Merge branch 'for-2.6.37/core' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.37/core' of git://git.kernel.dk/linux-2.6-block: (39 commits) cfq-iosched: Fix a gcc 4.5 warning and put some comments block: Turn bvec_k{un,}map_irq() into static inline functions block: fix accounting bug on cross partition merges block: Make the integrity mapped property a bio flag block: Fix double free in blk_integrity_unregister block: Ensure physical block size is unsigned int blkio-throttle: Fix possible multiplication overflow in iops calculations blkio-throttle: limit max iops value to UINT_MAX blkio-throttle: There is no need to convert jiffies to milli seconds blkio-throttle: Fix link failure failure on i386 blkio: Recalculate the throttled bio dispatch time upon throttle limit change blkio: Add root group to td->tg_list blkio: deletion of a cgroup was causes oops blkio: Do not export throttle files if CONFIG_BLK_DEV_THROTTLING=n block: set the bounce_pfn to the actual DMA limit rather than to max memory block: revert bad fix for memory hotplug causing bounces Fix compile error in blk-exec.c for !CONFIG_DETECT_HUNG_TASK block: set the bounce_pfn to the actual DMA limit rather than to max memory block: Prevent hang_check firing during long I/O cfq: improve fsync performance for small files ... Fix up trivial conflicts due to __rcu sparse annotation in include/linux/genhd.h
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig12
-rw-r--r--block/Makefile1
-rw-r--r--block/blk-cgroup.c804
-rw-r--r--block/blk-cgroup.h87
-rw-r--r--block/blk-core.c53
-rw-r--r--block/blk-exec.c9
-rw-r--r--block/blk-integrity.c94
-rw-r--r--block/blk-map.c5
-rw-r--r--block/blk-merge.c25
-rw-r--r--block/blk-settings.c12
-rw-r--r--block/blk-sysfs.c11
-rw-r--r--block/blk-throttle.c1123
-rw-r--r--block/blk.h12
-rw-r--r--block/cfq-iosched.c39
-rw-r--r--block/cfq.h2
-rw-r--r--block/genhd.c30
-rw-r--r--block/ioctl.c2
17 files changed, 2069 insertions, 252 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 9be0b56eaee1..6c9213ef15a1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -77,6 +77,18 @@ config BLK_DEV_INTEGRITY
77 T10/SCSI Data Integrity Field or the T13/ATA External Path 77 T10/SCSI Data Integrity Field or the T13/ATA External Path
78 Protection. If in doubt, say N. 78 Protection. If in doubt, say N.
79 79
80config BLK_DEV_THROTTLING
81 bool "Block layer bio throttling support"
82 depends on BLK_CGROUP=y && EXPERIMENTAL
83 default n
84 ---help---
85 Block layer bio throttling support. It can be used to limit
86 the IO rate to a device. IO rate policies are per cgroup and
87 one needs to mount and use blkio cgroup controller for creating
88 cgroups and specifying per device IO rate policies.
89
90 See Documentation/cgroups/blkio-controller.txt for more information.
91
80endif # BLOCK 92endif # BLOCK
81 93
82config BLOCK_COMPAT 94config BLOCK_COMPAT
diff --git a/block/Makefile b/block/Makefile
index 0bb499a739cd..c850d5ef80a2 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
9 9
10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 10obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
11obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o 11obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
12obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
12obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o 13obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
13obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o 14obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
14obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o 15obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2fef1ef931a0..b1febd0f6d2a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -37,6 +37,12 @@ static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
37static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); 37static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
38static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); 38static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
39 39
40/* for encoding cft->private value on file */
41#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val))
42/* What policy owns the file, proportional or throttle */
43#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff)
44#define BLKIOFILE_ATTR(val) ((val) & 0xffff)
45
40struct cgroup_subsys blkio_subsys = { 46struct cgroup_subsys blkio_subsys = {
41 .name = "blkio", 47 .name = "blkio",
42 .create = blkiocg_create, 48 .create = blkiocg_create,
@@ -59,6 +65,27 @@ static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
59 list_add(&pn->node, &blkcg->policy_list); 65 list_add(&pn->node, &blkcg->policy_list);
60} 66}
61 67
68static inline bool cftype_blkg_same_policy(struct cftype *cft,
69 struct blkio_group *blkg)
70{
71 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
72
73 if (blkg->plid == plid)
74 return 1;
75
76 return 0;
77}
78
79/* Determines if policy node matches cgroup file being accessed */
80static inline bool pn_matches_cftype(struct cftype *cft,
81 struct blkio_policy_node *pn)
82{
83 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
84 int fileid = BLKIOFILE_ATTR(cft->private);
85
86 return (plid == pn->plid && fileid == pn->fileid);
87}
88
62/* Must be called with blkcg->lock held */ 89/* Must be called with blkcg->lock held */
63static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) 90static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
64{ 91{
@@ -67,12 +94,13 @@ static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
67 94
68/* Must be called with blkcg->lock held */ 95/* Must be called with blkcg->lock held */
69static struct blkio_policy_node * 96static struct blkio_policy_node *
70blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) 97blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev,
98 enum blkio_policy_id plid, int fileid)
71{ 99{
72 struct blkio_policy_node *pn; 100 struct blkio_policy_node *pn;
73 101
74 list_for_each_entry(pn, &blkcg->policy_list, node) { 102 list_for_each_entry(pn, &blkcg->policy_list, node) {
75 if (pn->dev == dev) 103 if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid)
76 return pn; 104 return pn;
77 } 105 }
78 106
@@ -86,6 +114,67 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
86} 114}
87EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); 115EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
88 116
117static inline void
118blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight)
119{
120 struct blkio_policy_type *blkiop;
121
122 list_for_each_entry(blkiop, &blkio_list, list) {
123 /* If this policy does not own the blkg, do not send updates */
124 if (blkiop->plid != blkg->plid)
125 continue;
126 if (blkiop->ops.blkio_update_group_weight_fn)
127 blkiop->ops.blkio_update_group_weight_fn(blkg->key,
128 blkg, weight);
129 }
130}
131
132static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps,
133 int fileid)
134{
135 struct blkio_policy_type *blkiop;
136
137 list_for_each_entry(blkiop, &blkio_list, list) {
138
139 /* If this policy does not own the blkg, do not send updates */
140 if (blkiop->plid != blkg->plid)
141 continue;
142
143 if (fileid == BLKIO_THROTL_read_bps_device
144 && blkiop->ops.blkio_update_group_read_bps_fn)
145 blkiop->ops.blkio_update_group_read_bps_fn(blkg->key,
146 blkg, bps);
147
148 if (fileid == BLKIO_THROTL_write_bps_device
149 && blkiop->ops.blkio_update_group_write_bps_fn)
150 blkiop->ops.blkio_update_group_write_bps_fn(blkg->key,
151 blkg, bps);
152 }
153}
154
155static inline void blkio_update_group_iops(struct blkio_group *blkg,
156 unsigned int iops, int fileid)
157{
158 struct blkio_policy_type *blkiop;
159
160 list_for_each_entry(blkiop, &blkio_list, list) {
161
162 /* If this policy does not own the blkg, do not send updates */
163 if (blkiop->plid != blkg->plid)
164 continue;
165
166 if (fileid == BLKIO_THROTL_read_iops_device
167 && blkiop->ops.blkio_update_group_read_iops_fn)
168 blkiop->ops.blkio_update_group_read_iops_fn(blkg->key,
169 blkg, iops);
170
171 if (fileid == BLKIO_THROTL_write_iops_device
172 && blkiop->ops.blkio_update_group_write_iops_fn)
173 blkiop->ops.blkio_update_group_write_iops_fn(blkg->key,
174 blkg,iops);
175 }
176}
177
89/* 178/*
90 * Add to the appropriate stat variable depending on the request type. 179 * Add to the appropriate stat variable depending on the request type.
91 * This should be called with the blkg->stats_lock held. 180 * This should be called with the blkg->stats_lock held.
@@ -341,7 +430,8 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
341EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); 430EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
342 431
343void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 432void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
344 struct blkio_group *blkg, void *key, dev_t dev) 433 struct blkio_group *blkg, void *key, dev_t dev,
434 enum blkio_policy_id plid)
345{ 435{
346 unsigned long flags; 436 unsigned long flags;
347 437
@@ -350,6 +440,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
350 rcu_assign_pointer(blkg->key, key); 440 rcu_assign_pointer(blkg->key, key);
351 blkg->blkcg_id = css_id(&blkcg->css); 441 blkg->blkcg_id = css_id(&blkcg->css);
352 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); 442 hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
443 blkg->plid = plid;
353 spin_unlock_irqrestore(&blkcg->lock, flags); 444 spin_unlock_irqrestore(&blkcg->lock, flags);
354 /* Need to take css reference ? */ 445 /* Need to take css reference ? */
355 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); 446 cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
@@ -408,51 +499,6 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key)
408} 499}
409EXPORT_SYMBOL_GPL(blkiocg_lookup_group); 500EXPORT_SYMBOL_GPL(blkiocg_lookup_group);
410 501
411#define SHOW_FUNCTION(__VAR) \
412static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \
413 struct cftype *cftype) \
414{ \
415 struct blkio_cgroup *blkcg; \
416 \
417 blkcg = cgroup_to_blkio_cgroup(cgroup); \
418 return (u64)blkcg->__VAR; \
419}
420
421SHOW_FUNCTION(weight);
422#undef SHOW_FUNCTION
423
424static int
425blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
426{
427 struct blkio_cgroup *blkcg;
428 struct blkio_group *blkg;
429 struct hlist_node *n;
430 struct blkio_policy_type *blkiop;
431 struct blkio_policy_node *pn;
432
433 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
434 return -EINVAL;
435
436 blkcg = cgroup_to_blkio_cgroup(cgroup);
437 spin_lock(&blkio_list_lock);
438 spin_lock_irq(&blkcg->lock);
439 blkcg->weight = (unsigned int)val;
440
441 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
442 pn = blkio_policy_search_node(blkcg, blkg->dev);
443
444 if (pn)
445 continue;
446
447 list_for_each_entry(blkiop, &blkio_list, list)
448 blkiop->ops.blkio_update_group_weight_fn(blkg,
449 blkcg->weight);
450 }
451 spin_unlock_irq(&blkcg->lock);
452 spin_unlock(&blkio_list_lock);
453 return 0;
454}
455
456static int 502static int
457blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) 503blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
458{ 504{
@@ -593,52 +639,6 @@ static uint64_t blkio_get_stat(struct blkio_group *blkg,
593 return disk_total; 639 return disk_total;
594} 640}
595 641
596#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \
597static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
598 struct cftype *cftype, struct cgroup_map_cb *cb) \
599{ \
600 struct blkio_cgroup *blkcg; \
601 struct blkio_group *blkg; \
602 struct hlist_node *n; \
603 uint64_t cgroup_total = 0; \
604 \
605 if (!cgroup_lock_live_group(cgroup)) \
606 return -ENODEV; \
607 \
608 blkcg = cgroup_to_blkio_cgroup(cgroup); \
609 rcu_read_lock(); \
610 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
611 if (blkg->dev) { \
612 spin_lock_irq(&blkg->stats_lock); \
613 cgroup_total += blkio_get_stat(blkg, cb, \
614 blkg->dev, type); \
615 spin_unlock_irq(&blkg->stats_lock); \
616 } \
617 } \
618 if (show_total) \
619 cb->fill(cb, "Total", cgroup_total); \
620 rcu_read_unlock(); \
621 cgroup_unlock(); \
622 return 0; \
623}
624
625SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
626SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
627SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
628SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
629SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
630SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
631SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
632SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
633#ifdef CONFIG_DEBUG_BLK_CGROUP
634SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
635SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
636SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
637SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
638SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
639#endif
640#undef SHOW_FUNCTION_PER_GROUP
641
642static int blkio_check_dev_num(dev_t dev) 642static int blkio_check_dev_num(dev_t dev)
643{ 643{
644 int part = 0; 644 int part = 0;
@@ -652,13 +652,14 @@ static int blkio_check_dev_num(dev_t dev)
652} 652}
653 653
654static int blkio_policy_parse_and_set(char *buf, 654static int blkio_policy_parse_and_set(char *buf,
655 struct blkio_policy_node *newpn) 655 struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid)
656{ 656{
657 char *s[4], *p, *major_s = NULL, *minor_s = NULL; 657 char *s[4], *p, *major_s = NULL, *minor_s = NULL;
658 int ret; 658 int ret;
659 unsigned long major, minor, temp; 659 unsigned long major, minor, temp;
660 int i = 0; 660 int i = 0;
661 dev_t dev; 661 dev_t dev;
662 u64 bps, iops;
662 663
663 memset(s, 0, sizeof(s)); 664 memset(s, 0, sizeof(s));
664 665
@@ -705,12 +706,47 @@ static int blkio_policy_parse_and_set(char *buf,
705 if (s[1] == NULL) 706 if (s[1] == NULL)
706 return -EINVAL; 707 return -EINVAL;
707 708
708 ret = strict_strtoul(s[1], 10, &temp); 709 switch (plid) {
709 if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || 710 case BLKIO_POLICY_PROP:
710 temp > BLKIO_WEIGHT_MAX) 711 ret = strict_strtoul(s[1], 10, &temp);
711 return -EINVAL; 712 if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
713 temp > BLKIO_WEIGHT_MAX)
714 return -EINVAL;
712 715
713 newpn->weight = temp; 716 newpn->plid = plid;
717 newpn->fileid = fileid;
718 newpn->val.weight = temp;
719 break;
720 case BLKIO_POLICY_THROTL:
721 switch(fileid) {
722 case BLKIO_THROTL_read_bps_device:
723 case BLKIO_THROTL_write_bps_device:
724 ret = strict_strtoull(s[1], 10, &bps);
725 if (ret)
726 return -EINVAL;
727
728 newpn->plid = plid;
729 newpn->fileid = fileid;
730 newpn->val.bps = bps;
731 break;
732 case BLKIO_THROTL_read_iops_device:
733 case BLKIO_THROTL_write_iops_device:
734 ret = strict_strtoull(s[1], 10, &iops);
735 if (ret)
736 return -EINVAL;
737
738 if (iops > THROTL_IOPS_MAX)
739 return -EINVAL;
740
741 newpn->plid = plid;
742 newpn->fileid = fileid;
743 newpn->val.iops = (unsigned int)iops;
744 break;
745 }
746 break;
747 default:
748 BUG();
749 }
714 750
715 return 0; 751 return 0;
716} 752}
@@ -720,26 +756,180 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
720{ 756{
721 struct blkio_policy_node *pn; 757 struct blkio_policy_node *pn;
722 758
723 pn = blkio_policy_search_node(blkcg, dev); 759 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP,
760 BLKIO_PROP_weight_device);
724 if (pn) 761 if (pn)
725 return pn->weight; 762 return pn->val.weight;
726 else 763 else
727 return blkcg->weight; 764 return blkcg->weight;
728} 765}
729EXPORT_SYMBOL_GPL(blkcg_get_weight); 766EXPORT_SYMBOL_GPL(blkcg_get_weight);
730 767
768uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
769{
770 struct blkio_policy_node *pn;
771
772 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
773 BLKIO_THROTL_read_bps_device);
774 if (pn)
775 return pn->val.bps;
776 else
777 return -1;
778}
779
780uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev)
781{
782 struct blkio_policy_node *pn;
783 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
784 BLKIO_THROTL_write_bps_device);
785 if (pn)
786 return pn->val.bps;
787 else
788 return -1;
789}
790
791unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev)
792{
793 struct blkio_policy_node *pn;
794
795 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
796 BLKIO_THROTL_read_iops_device);
797 if (pn)
798 return pn->val.iops;
799 else
800 return -1;
801}
802
803unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev)
804{
805 struct blkio_policy_node *pn;
806 pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL,
807 BLKIO_THROTL_write_iops_device);
808 if (pn)
809 return pn->val.iops;
810 else
811 return -1;
812}
813
814/* Checks whether user asked for deleting a policy rule */
815static bool blkio_delete_rule_command(struct blkio_policy_node *pn)
816{
817 switch(pn->plid) {
818 case BLKIO_POLICY_PROP:
819 if (pn->val.weight == 0)
820 return 1;
821 break;
822 case BLKIO_POLICY_THROTL:
823 switch(pn->fileid) {
824 case BLKIO_THROTL_read_bps_device:
825 case BLKIO_THROTL_write_bps_device:
826 if (pn->val.bps == 0)
827 return 1;
828 break;
829 case BLKIO_THROTL_read_iops_device:
830 case BLKIO_THROTL_write_iops_device:
831 if (pn->val.iops == 0)
832 return 1;
833 }
834 break;
835 default:
836 BUG();
837 }
838
839 return 0;
840}
841
842static void blkio_update_policy_rule(struct blkio_policy_node *oldpn,
843 struct blkio_policy_node *newpn)
844{
845 switch(oldpn->plid) {
846 case BLKIO_POLICY_PROP:
847 oldpn->val.weight = newpn->val.weight;
848 break;
849 case BLKIO_POLICY_THROTL:
850 switch(newpn->fileid) {
851 case BLKIO_THROTL_read_bps_device:
852 case BLKIO_THROTL_write_bps_device:
853 oldpn->val.bps = newpn->val.bps;
854 break;
855 case BLKIO_THROTL_read_iops_device:
856 case BLKIO_THROTL_write_iops_device:
857 oldpn->val.iops = newpn->val.iops;
858 }
859 break;
860 default:
861 BUG();
862 }
863}
864
865/*
866 * Some rules/values in blkg have changed. Propogate those to respective
867 * policies.
868 */
869static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg,
870 struct blkio_group *blkg, struct blkio_policy_node *pn)
871{
872 unsigned int weight, iops;
873 u64 bps;
874
875 switch(pn->plid) {
876 case BLKIO_POLICY_PROP:
877 weight = pn->val.weight ? pn->val.weight :
878 blkcg->weight;
879 blkio_update_group_weight(blkg, weight);
880 break;
881 case BLKIO_POLICY_THROTL:
882 switch(pn->fileid) {
883 case BLKIO_THROTL_read_bps_device:
884 case BLKIO_THROTL_write_bps_device:
885 bps = pn->val.bps ? pn->val.bps : (-1);
886 blkio_update_group_bps(blkg, bps, pn->fileid);
887 break;
888 case BLKIO_THROTL_read_iops_device:
889 case BLKIO_THROTL_write_iops_device:
890 iops = pn->val.iops ? pn->val.iops : (-1);
891 blkio_update_group_iops(blkg, iops, pn->fileid);
892 break;
893 }
894 break;
895 default:
896 BUG();
897 }
898}
899
900/*
901 * A policy node rule has been updated. Propogate this update to all the
902 * block groups which might be affected by this update.
903 */
904static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg,
905 struct blkio_policy_node *pn)
906{
907 struct blkio_group *blkg;
908 struct hlist_node *n;
909
910 spin_lock(&blkio_list_lock);
911 spin_lock_irq(&blkcg->lock);
912
913 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
914 if (pn->dev != blkg->dev || pn->plid != blkg->plid)
915 continue;
916 blkio_update_blkg_policy(blkcg, blkg, pn);
917 }
918
919 spin_unlock_irq(&blkcg->lock);
920 spin_unlock(&blkio_list_lock);
921}
731 922
732static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, 923static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft,
733 const char *buffer) 924 const char *buffer)
734{ 925{
735 int ret = 0; 926 int ret = 0;
736 char *buf; 927 char *buf;
737 struct blkio_policy_node *newpn, *pn; 928 struct blkio_policy_node *newpn, *pn;
738 struct blkio_cgroup *blkcg; 929 struct blkio_cgroup *blkcg;
739 struct blkio_group *blkg;
740 int keep_newpn = 0; 930 int keep_newpn = 0;
741 struct hlist_node *n; 931 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
742 struct blkio_policy_type *blkiop; 932 int fileid = BLKIOFILE_ATTR(cft->private);
743 933
744 buf = kstrdup(buffer, GFP_KERNEL); 934 buf = kstrdup(buffer, GFP_KERNEL);
745 if (!buf) 935 if (!buf)
@@ -751,7 +941,7 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
751 goto free_buf; 941 goto free_buf;
752 } 942 }
753 943
754 ret = blkio_policy_parse_and_set(buf, newpn); 944 ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid);
755 if (ret) 945 if (ret)
756 goto free_newpn; 946 goto free_newpn;
757 947
@@ -759,9 +949,9 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
759 949
760 spin_lock_irq(&blkcg->lock); 950 spin_lock_irq(&blkcg->lock);
761 951
762 pn = blkio_policy_search_node(blkcg, newpn->dev); 952 pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid);
763 if (!pn) { 953 if (!pn) {
764 if (newpn->weight != 0) { 954 if (!blkio_delete_rule_command(newpn)) {
765 blkio_policy_insert_node(blkcg, newpn); 955 blkio_policy_insert_node(blkcg, newpn);
766 keep_newpn = 1; 956 keep_newpn = 1;
767 } 957 }
@@ -769,33 +959,17 @@ static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
769 goto update_io_group; 959 goto update_io_group;
770 } 960 }
771 961
772 if (newpn->weight == 0) { 962 if (blkio_delete_rule_command(newpn)) {
773 /* weight == 0 means deleteing a specific weight */
774 blkio_policy_delete_node(pn); 963 blkio_policy_delete_node(pn);
775 spin_unlock_irq(&blkcg->lock); 964 spin_unlock_irq(&blkcg->lock);
776 goto update_io_group; 965 goto update_io_group;
777 } 966 }
778 spin_unlock_irq(&blkcg->lock); 967 spin_unlock_irq(&blkcg->lock);
779 968
780 pn->weight = newpn->weight; 969 blkio_update_policy_rule(pn, newpn);
781 970
782update_io_group: 971update_io_group:
783 /* update weight for each cfqg */ 972 blkio_update_policy_node_blkg(blkcg, newpn);
784 spin_lock(&blkio_list_lock);
785 spin_lock_irq(&blkcg->lock);
786
787 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
788 if (newpn->dev == blkg->dev) {
789 list_for_each_entry(blkiop, &blkio_list, list)
790 blkiop->ops.blkio_update_group_weight_fn(blkg,
791 newpn->weight ?
792 newpn->weight :
793 blkcg->weight);
794 }
795 }
796
797 spin_unlock_irq(&blkcg->lock);
798 spin_unlock(&blkio_list_lock);
799 973
800free_newpn: 974free_newpn:
801 if (!keep_newpn) 975 if (!keep_newpn)
@@ -805,23 +979,256 @@ free_buf:
805 return ret; 979 return ret;
806} 980}
807 981
808static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, 982static void
809 struct seq_file *m) 983blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn)
810{ 984{
811 struct blkio_cgroup *blkcg; 985 switch(pn->plid) {
812 struct blkio_policy_node *pn; 986 case BLKIO_POLICY_PROP:
987 if (pn->fileid == BLKIO_PROP_weight_device)
988 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
989 MINOR(pn->dev), pn->val.weight);
990 break;
991 case BLKIO_POLICY_THROTL:
992 switch(pn->fileid) {
993 case BLKIO_THROTL_read_bps_device:
994 case BLKIO_THROTL_write_bps_device:
995 seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev),
996 MINOR(pn->dev), pn->val.bps);
997 break;
998 case BLKIO_THROTL_read_iops_device:
999 case BLKIO_THROTL_write_iops_device:
1000 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
1001 MINOR(pn->dev), pn->val.iops);
1002 break;
1003 }
1004 break;
1005 default:
1006 BUG();
1007 }
1008}
813 1009
814 seq_printf(m, "dev\tweight\n"); 1010/* cgroup files which read their data from policy nodes end up here */
1011static void blkio_read_policy_node_files(struct cftype *cft,
1012 struct blkio_cgroup *blkcg, struct seq_file *m)
1013{
1014 struct blkio_policy_node *pn;
815 1015
816 blkcg = cgroup_to_blkio_cgroup(cgrp);
817 if (!list_empty(&blkcg->policy_list)) { 1016 if (!list_empty(&blkcg->policy_list)) {
818 spin_lock_irq(&blkcg->lock); 1017 spin_lock_irq(&blkcg->lock);
819 list_for_each_entry(pn, &blkcg->policy_list, node) { 1018 list_for_each_entry(pn, &blkcg->policy_list, node) {
820 seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), 1019 if (!pn_matches_cftype(cft, pn))
821 MINOR(pn->dev), pn->weight); 1020 continue;
1021 blkio_print_policy_node(m, pn);
822 } 1022 }
823 spin_unlock_irq(&blkcg->lock); 1023 spin_unlock_irq(&blkcg->lock);
824 } 1024 }
1025}
1026
1027static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft,
1028 struct seq_file *m)
1029{
1030 struct blkio_cgroup *blkcg;
1031 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1032 int name = BLKIOFILE_ATTR(cft->private);
1033
1034 blkcg = cgroup_to_blkio_cgroup(cgrp);
1035
1036 switch(plid) {
1037 case BLKIO_POLICY_PROP:
1038 switch(name) {
1039 case BLKIO_PROP_weight_device:
1040 blkio_read_policy_node_files(cft, blkcg, m);
1041 return 0;
1042 default:
1043 BUG();
1044 }
1045 break;
1046 case BLKIO_POLICY_THROTL:
1047 switch(name){
1048 case BLKIO_THROTL_read_bps_device:
1049 case BLKIO_THROTL_write_bps_device:
1050 case BLKIO_THROTL_read_iops_device:
1051 case BLKIO_THROTL_write_iops_device:
1052 blkio_read_policy_node_files(cft, blkcg, m);
1053 return 0;
1054 default:
1055 BUG();
1056 }
1057 break;
1058 default:
1059 BUG();
1060 }
1061
1062 return 0;
1063}
1064
1065static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg,
1066 struct cftype *cft, struct cgroup_map_cb *cb, enum stat_type type,
1067 bool show_total)
1068{
1069 struct blkio_group *blkg;
1070 struct hlist_node *n;
1071 uint64_t cgroup_total = 0;
1072
1073 rcu_read_lock();
1074 hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {
1075 if (blkg->dev) {
1076 if (!cftype_blkg_same_policy(cft, blkg))
1077 continue;
1078 spin_lock_irq(&blkg->stats_lock);
1079 cgroup_total += blkio_get_stat(blkg, cb, blkg->dev,
1080 type);
1081 spin_unlock_irq(&blkg->stats_lock);
1082 }
1083 }
1084 if (show_total)
1085 cb->fill(cb, "Total", cgroup_total);
1086 rcu_read_unlock();
1087 return 0;
1088}
1089
1090/* All map kind of cgroup file get serviced by this function */
1091static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft,
1092 struct cgroup_map_cb *cb)
1093{
1094 struct blkio_cgroup *blkcg;
1095 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1096 int name = BLKIOFILE_ATTR(cft->private);
1097
1098 blkcg = cgroup_to_blkio_cgroup(cgrp);
1099
1100 switch(plid) {
1101 case BLKIO_POLICY_PROP:
1102 switch(name) {
1103 case BLKIO_PROP_time:
1104 return blkio_read_blkg_stats(blkcg, cft, cb,
1105 BLKIO_STAT_TIME, 0);
1106 case BLKIO_PROP_sectors:
1107 return blkio_read_blkg_stats(blkcg, cft, cb,
1108 BLKIO_STAT_SECTORS, 0);
1109 case BLKIO_PROP_io_service_bytes:
1110 return blkio_read_blkg_stats(blkcg, cft, cb,
1111 BLKIO_STAT_SERVICE_BYTES, 1);
1112 case BLKIO_PROP_io_serviced:
1113 return blkio_read_blkg_stats(blkcg, cft, cb,
1114 BLKIO_STAT_SERVICED, 1);
1115 case BLKIO_PROP_io_service_time:
1116 return blkio_read_blkg_stats(blkcg, cft, cb,
1117 BLKIO_STAT_SERVICE_TIME, 1);
1118 case BLKIO_PROP_io_wait_time:
1119 return blkio_read_blkg_stats(blkcg, cft, cb,
1120 BLKIO_STAT_WAIT_TIME, 1);
1121 case BLKIO_PROP_io_merged:
1122 return blkio_read_blkg_stats(blkcg, cft, cb,
1123 BLKIO_STAT_MERGED, 1);
1124 case BLKIO_PROP_io_queued:
1125 return blkio_read_blkg_stats(blkcg, cft, cb,
1126 BLKIO_STAT_QUEUED, 1);
1127#ifdef CONFIG_DEBUG_BLK_CGROUP
1128 case BLKIO_PROP_dequeue:
1129 return blkio_read_blkg_stats(blkcg, cft, cb,
1130 BLKIO_STAT_DEQUEUE, 0);
1131 case BLKIO_PROP_avg_queue_size:
1132 return blkio_read_blkg_stats(blkcg, cft, cb,
1133 BLKIO_STAT_AVG_QUEUE_SIZE, 0);
1134 case BLKIO_PROP_group_wait_time:
1135 return blkio_read_blkg_stats(blkcg, cft, cb,
1136 BLKIO_STAT_GROUP_WAIT_TIME, 0);
1137 case BLKIO_PROP_idle_time:
1138 return blkio_read_blkg_stats(blkcg, cft, cb,
1139 BLKIO_STAT_IDLE_TIME, 0);
1140 case BLKIO_PROP_empty_time:
1141 return blkio_read_blkg_stats(blkcg, cft, cb,
1142 BLKIO_STAT_EMPTY_TIME, 0);
1143#endif
1144 default:
1145 BUG();
1146 }
1147 break;
1148 case BLKIO_POLICY_THROTL:
1149 switch(name){
1150 case BLKIO_THROTL_io_service_bytes:
1151 return blkio_read_blkg_stats(blkcg, cft, cb,
1152 BLKIO_STAT_SERVICE_BYTES, 1);
1153 case BLKIO_THROTL_io_serviced:
1154 return blkio_read_blkg_stats(blkcg, cft, cb,
1155 BLKIO_STAT_SERVICED, 1);
1156 default:
1157 BUG();
1158 }
1159 break;
1160 default:
1161 BUG();
1162 }
1163
1164 return 0;
1165}
1166
1167static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val)
1168{
1169 struct blkio_group *blkg;
1170 struct hlist_node *n;
1171 struct blkio_policy_node *pn;
1172
1173 if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
1174 return -EINVAL;
1175
1176 spin_lock(&blkio_list_lock);
1177 spin_lock_irq(&blkcg->lock);
1178 blkcg->weight = (unsigned int)val;
1179
1180 hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
1181 pn = blkio_policy_search_node(blkcg, blkg->dev,
1182 BLKIO_POLICY_PROP, BLKIO_PROP_weight_device);
1183 if (pn)
1184 continue;
1185
1186 blkio_update_group_weight(blkg, blkcg->weight);
1187 }
1188 spin_unlock_irq(&blkcg->lock);
1189 spin_unlock(&blkio_list_lock);
1190 return 0;
1191}
1192
1193static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
1194 struct blkio_cgroup *blkcg;
1195 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1196 int name = BLKIOFILE_ATTR(cft->private);
1197
1198 blkcg = cgroup_to_blkio_cgroup(cgrp);
1199
1200 switch(plid) {
1201 case BLKIO_POLICY_PROP:
1202 switch(name) {
1203 case BLKIO_PROP_weight:
1204 return (u64)blkcg->weight;
1205 }
1206 break;
1207 default:
1208 BUG();
1209 }
1210 return 0;
1211}
1212
1213static int
1214blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1215{
1216 struct blkio_cgroup *blkcg;
1217 enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private);
1218 int name = BLKIOFILE_ATTR(cft->private);
1219
1220 blkcg = cgroup_to_blkio_cgroup(cgrp);
1221
1222 switch(plid) {
1223 case BLKIO_POLICY_PROP:
1224 switch(name) {
1225 case BLKIO_PROP_weight:
1226 return blkio_weight_write(blkcg, val);
1227 }
1228 break;
1229 default:
1230 BUG();
1231 }
825 1232
826 return 0; 1233 return 0;
827} 1234}
@@ -829,71 +1236,151 @@ static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
829struct cftype blkio_files[] = { 1236struct cftype blkio_files[] = {
830 { 1237 {
831 .name = "weight_device", 1238 .name = "weight_device",
832 .read_seq_string = blkiocg_weight_device_read, 1239 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
833 .write_string = blkiocg_weight_device_write, 1240 BLKIO_PROP_weight_device),
1241 .read_seq_string = blkiocg_file_read,
1242 .write_string = blkiocg_file_write,
834 .max_write_len = 256, 1243 .max_write_len = 256,
835 }, 1244 },
836 { 1245 {
837 .name = "weight", 1246 .name = "weight",
838 .read_u64 = blkiocg_weight_read, 1247 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
839 .write_u64 = blkiocg_weight_write, 1248 BLKIO_PROP_weight),
1249 .read_u64 = blkiocg_file_read_u64,
1250 .write_u64 = blkiocg_file_write_u64,
840 }, 1251 },
841 { 1252 {
842 .name = "time", 1253 .name = "time",
843 .read_map = blkiocg_time_read, 1254 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1255 BLKIO_PROP_time),
1256 .read_map = blkiocg_file_read_map,
844 }, 1257 },
845 { 1258 {
846 .name = "sectors", 1259 .name = "sectors",
847 .read_map = blkiocg_sectors_read, 1260 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1261 BLKIO_PROP_sectors),
1262 .read_map = blkiocg_file_read_map,
848 }, 1263 },
849 { 1264 {
850 .name = "io_service_bytes", 1265 .name = "io_service_bytes",
851 .read_map = blkiocg_io_service_bytes_read, 1266 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1267 BLKIO_PROP_io_service_bytes),
1268 .read_map = blkiocg_file_read_map,
852 }, 1269 },
853 { 1270 {
854 .name = "io_serviced", 1271 .name = "io_serviced",
855 .read_map = blkiocg_io_serviced_read, 1272 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1273 BLKIO_PROP_io_serviced),
1274 .read_map = blkiocg_file_read_map,
856 }, 1275 },
857 { 1276 {
858 .name = "io_service_time", 1277 .name = "io_service_time",
859 .read_map = blkiocg_io_service_time_read, 1278 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1279 BLKIO_PROP_io_service_time),
1280 .read_map = blkiocg_file_read_map,
860 }, 1281 },
861 { 1282 {
862 .name = "io_wait_time", 1283 .name = "io_wait_time",
863 .read_map = blkiocg_io_wait_time_read, 1284 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1285 BLKIO_PROP_io_wait_time),
1286 .read_map = blkiocg_file_read_map,
864 }, 1287 },
865 { 1288 {
866 .name = "io_merged", 1289 .name = "io_merged",
867 .read_map = blkiocg_io_merged_read, 1290 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1291 BLKIO_PROP_io_merged),
1292 .read_map = blkiocg_file_read_map,
868 }, 1293 },
869 { 1294 {
870 .name = "io_queued", 1295 .name = "io_queued",
871 .read_map = blkiocg_io_queued_read, 1296 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1297 BLKIO_PROP_io_queued),
1298 .read_map = blkiocg_file_read_map,
872 }, 1299 },
873 { 1300 {
874 .name = "reset_stats", 1301 .name = "reset_stats",
875 .write_u64 = blkiocg_reset_stats, 1302 .write_u64 = blkiocg_reset_stats,
876 }, 1303 },
1304#ifdef CONFIG_BLK_DEV_THROTTLING
1305 {
1306 .name = "throttle.read_bps_device",
1307 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1308 BLKIO_THROTL_read_bps_device),
1309 .read_seq_string = blkiocg_file_read,
1310 .write_string = blkiocg_file_write,
1311 .max_write_len = 256,
1312 },
1313
1314 {
1315 .name = "throttle.write_bps_device",
1316 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1317 BLKIO_THROTL_write_bps_device),
1318 .read_seq_string = blkiocg_file_read,
1319 .write_string = blkiocg_file_write,
1320 .max_write_len = 256,
1321 },
1322
1323 {
1324 .name = "throttle.read_iops_device",
1325 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1326 BLKIO_THROTL_read_iops_device),
1327 .read_seq_string = blkiocg_file_read,
1328 .write_string = blkiocg_file_write,
1329 .max_write_len = 256,
1330 },
1331
1332 {
1333 .name = "throttle.write_iops_device",
1334 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1335 BLKIO_THROTL_write_iops_device),
1336 .read_seq_string = blkiocg_file_read,
1337 .write_string = blkiocg_file_write,
1338 .max_write_len = 256,
1339 },
1340 {
1341 .name = "throttle.io_service_bytes",
1342 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1343 BLKIO_THROTL_io_service_bytes),
1344 .read_map = blkiocg_file_read_map,
1345 },
1346 {
1347 .name = "throttle.io_serviced",
1348 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL,
1349 BLKIO_THROTL_io_serviced),
1350 .read_map = blkiocg_file_read_map,
1351 },
1352#endif /* CONFIG_BLK_DEV_THROTTLING */
1353
877#ifdef CONFIG_DEBUG_BLK_CGROUP 1354#ifdef CONFIG_DEBUG_BLK_CGROUP
878 { 1355 {
879 .name = "avg_queue_size", 1356 .name = "avg_queue_size",
880 .read_map = blkiocg_avg_queue_size_read, 1357 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1358 BLKIO_PROP_avg_queue_size),
1359 .read_map = blkiocg_file_read_map,
881 }, 1360 },
882 { 1361 {
883 .name = "group_wait_time", 1362 .name = "group_wait_time",
884 .read_map = blkiocg_group_wait_time_read, 1363 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1364 BLKIO_PROP_group_wait_time),
1365 .read_map = blkiocg_file_read_map,
885 }, 1366 },
886 { 1367 {
887 .name = "idle_time", 1368 .name = "idle_time",
888 .read_map = blkiocg_idle_time_read, 1369 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1370 BLKIO_PROP_idle_time),
1371 .read_map = blkiocg_file_read_map,
889 }, 1372 },
890 { 1373 {
891 .name = "empty_time", 1374 .name = "empty_time",
892 .read_map = blkiocg_empty_time_read, 1375 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1376 BLKIO_PROP_empty_time),
1377 .read_map = blkiocg_file_read_map,
893 }, 1378 },
894 { 1379 {
895 .name = "dequeue", 1380 .name = "dequeue",
896 .read_map = blkiocg_dequeue_read, 1381 .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
1382 BLKIO_PROP_dequeue),
1383 .read_map = blkiocg_file_read_map,
897 }, 1384 },
898#endif 1385#endif
899}; 1386};
@@ -932,13 +1419,14 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
932 /* 1419 /*
933 * This blkio_group is being unlinked as associated cgroup is 1420 * This blkio_group is being unlinked as associated cgroup is
934 * going away. Let all the IO controlling policies know about 1421 * going away. Let all the IO controlling policies know about
935 * this event. Currently this is static call to one io 1422 * this event.
936 * controlling policy. Once we have more policies in place, we
937 * need some dynamic registration of callback function.
938 */ 1423 */
939 spin_lock(&blkio_list_lock); 1424 spin_lock(&blkio_list_lock);
940 list_for_each_entry(blkiop, &blkio_list, list) 1425 list_for_each_entry(blkiop, &blkio_list, list) {
1426 if (blkiop->plid != blkg->plid)
1427 continue;
941 blkiop->ops.blkio_unlink_group_fn(key, blkg); 1428 blkiop->ops.blkio_unlink_group_fn(key, blkg);
1429 }
942 spin_unlock(&blkio_list_lock); 1430 spin_unlock(&blkio_list_lock);
943 } while (1); 1431 } while (1);
944 1432
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 2b866ec1dcea..ea4861bdd549 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -15,6 +15,14 @@
15 15
16#include <linux/cgroup.h> 16#include <linux/cgroup.h>
17 17
18enum blkio_policy_id {
19 BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */
20 BLKIO_POLICY_THROTL, /* Throttling */
21};
22
23/* Max limits for throttle policy */
24#define THROTL_IOPS_MAX UINT_MAX
25
18#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) 26#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
19 27
20#ifndef CONFIG_BLK_CGROUP 28#ifndef CONFIG_BLK_CGROUP
@@ -65,6 +73,35 @@ enum blkg_state_flags {
65 BLKG_empty, 73 BLKG_empty,
66}; 74};
67 75
76/* cgroup files owned by proportional weight policy */
77enum blkcg_file_name_prop {
78 BLKIO_PROP_weight = 1,
79 BLKIO_PROP_weight_device,
80 BLKIO_PROP_io_service_bytes,
81 BLKIO_PROP_io_serviced,
82 BLKIO_PROP_time,
83 BLKIO_PROP_sectors,
84 BLKIO_PROP_io_service_time,
85 BLKIO_PROP_io_wait_time,
86 BLKIO_PROP_io_merged,
87 BLKIO_PROP_io_queued,
88 BLKIO_PROP_avg_queue_size,
89 BLKIO_PROP_group_wait_time,
90 BLKIO_PROP_idle_time,
91 BLKIO_PROP_empty_time,
92 BLKIO_PROP_dequeue,
93};
94
95/* cgroup files owned by throttle policy */
96enum blkcg_file_name_throtl {
97 BLKIO_THROTL_read_bps_device,
98 BLKIO_THROTL_write_bps_device,
99 BLKIO_THROTL_read_iops_device,
100 BLKIO_THROTL_write_iops_device,
101 BLKIO_THROTL_io_service_bytes,
102 BLKIO_THROTL_io_serviced,
103};
104
68struct blkio_cgroup { 105struct blkio_cgroup {
69 struct cgroup_subsys_state css; 106 struct cgroup_subsys_state css;
70 unsigned int weight; 107 unsigned int weight;
@@ -112,6 +149,8 @@ struct blkio_group {
112 char path[128]; 149 char path[128];
113 /* The device MKDEV(major, minor), this group has been created for */ 150 /* The device MKDEV(major, minor), this group has been created for */
114 dev_t dev; 151 dev_t dev;
152 /* policy which owns this blk group */
153 enum blkio_policy_id plid;
115 154
116 /* Need to serialize the stats in the case of reset/update */ 155 /* Need to serialize the stats in the case of reset/update */
117 spinlock_t stats_lock; 156 spinlock_t stats_lock;
@@ -121,24 +160,60 @@ struct blkio_group {
121struct blkio_policy_node { 160struct blkio_policy_node {
122 struct list_head node; 161 struct list_head node;
123 dev_t dev; 162 dev_t dev;
124 unsigned int weight; 163 /* This node belongs to max bw policy or porportional weight policy */
164 enum blkio_policy_id plid;
165 /* cgroup file to which this rule belongs to */
166 int fileid;
167
168 union {
169 unsigned int weight;
170 /*
171 * Rate read/write in terms of byptes per second
172 * Whether this rate represents read or write is determined
173 * by file type "fileid".
174 */
175 u64 bps;
176 unsigned int iops;
177 } val;
125}; 178};
126 179
127extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, 180extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
128 dev_t dev); 181 dev_t dev);
182extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
183 dev_t dev);
184extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
185 dev_t dev);
186extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg,
187 dev_t dev);
188extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg,
189 dev_t dev);
129 190
130typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); 191typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
131typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, 192
132 unsigned int weight); 193typedef void (blkio_update_group_weight_fn) (void *key,
194 struct blkio_group *blkg, unsigned int weight);
195typedef void (blkio_update_group_read_bps_fn) (void * key,
196 struct blkio_group *blkg, u64 read_bps);
197typedef void (blkio_update_group_write_bps_fn) (void *key,
198 struct blkio_group *blkg, u64 write_bps);
199typedef void (blkio_update_group_read_iops_fn) (void *key,
200 struct blkio_group *blkg, unsigned int read_iops);
201typedef void (blkio_update_group_write_iops_fn) (void *key,
202 struct blkio_group *blkg, unsigned int write_iops);
133 203
134struct blkio_policy_ops { 204struct blkio_policy_ops {
135 blkio_unlink_group_fn *blkio_unlink_group_fn; 205 blkio_unlink_group_fn *blkio_unlink_group_fn;
136 blkio_update_group_weight_fn *blkio_update_group_weight_fn; 206 blkio_update_group_weight_fn *blkio_update_group_weight_fn;
207 blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn;
208 blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn;
209 blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn;
210 blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn;
137}; 211};
138 212
139struct blkio_policy_type { 213struct blkio_policy_type {
140 struct list_head list; 214 struct list_head list;
141 struct blkio_policy_ops ops; 215 struct blkio_policy_ops ops;
216 enum blkio_policy_id plid;
142}; 217};
143 218
144/* Blkio controller policy registration */ 219/* Blkio controller policy registration */
@@ -212,7 +287,8 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
212extern struct blkio_cgroup blkio_root_cgroup; 287extern struct blkio_cgroup blkio_root_cgroup;
213extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); 288extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
214extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 289extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
215 struct blkio_group *blkg, void *key, dev_t dev); 290 struct blkio_group *blkg, void *key, dev_t dev,
291 enum blkio_policy_id plid);
216extern int blkiocg_del_blkio_group(struct blkio_group *blkg); 292extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
217extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, 293extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
218 void *key); 294 void *key);
@@ -234,7 +310,8 @@ static inline struct blkio_cgroup *
234cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } 310cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
235 311
236static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 312static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
237 struct blkio_group *blkg, void *key, dev_t dev) {} 313 struct blkio_group *blkg, void *key, dev_t dev,
314 enum blkio_policy_id plid) {}
238 315
239static inline int 316static inline int
240blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } 317blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
diff --git a/block/blk-core.c b/block/blk-core.c
index 32a1c123dfb3..500eb859886e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,13 +64,15 @@ static void drive_stat_acct(struct request *rq, int new_io)
64 return; 64 return;
65 65
66 cpu = part_stat_lock(); 66 cpu = part_stat_lock();
67 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
68 67
69 if (!new_io) 68 if (!new_io) {
69 part = rq->part;
70 part_stat_inc(cpu, part, merges[rw]); 70 part_stat_inc(cpu, part, merges[rw]);
71 else { 71 } else {
72 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
72 part_round_stats(cpu, part); 73 part_round_stats(cpu, part);
73 part_inc_in_flight(part, rw); 74 part_inc_in_flight(part, rw);
75 rq->part = part;
74 } 76 }
75 77
76 part_stat_unlock(); 78 part_stat_unlock();
@@ -128,6 +130,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
128 rq->ref_count = 1; 130 rq->ref_count = 1;
129 rq->start_time = jiffies; 131 rq->start_time = jiffies;
130 set_start_time_ns(rq); 132 set_start_time_ns(rq);
133 rq->part = NULL;
131} 134}
132EXPORT_SYMBOL(blk_rq_init); 135EXPORT_SYMBOL(blk_rq_init);
133 136
@@ -382,6 +385,7 @@ void blk_sync_queue(struct request_queue *q)
382 del_timer_sync(&q->unplug_timer); 385 del_timer_sync(&q->unplug_timer);
383 del_timer_sync(&q->timeout); 386 del_timer_sync(&q->timeout);
384 cancel_work_sync(&q->unplug_work); 387 cancel_work_sync(&q->unplug_work);
388 throtl_shutdown_timer_wq(q);
385} 389}
386EXPORT_SYMBOL(blk_sync_queue); 390EXPORT_SYMBOL(blk_sync_queue);
387 391
@@ -459,6 +463,8 @@ void blk_cleanup_queue(struct request_queue *q)
459 if (q->elevator) 463 if (q->elevator)
460 elevator_exit(q->elevator); 464 elevator_exit(q->elevator);
461 465
466 blk_throtl_exit(q);
467
462 blk_put_queue(q); 468 blk_put_queue(q);
463} 469}
464EXPORT_SYMBOL(blk_cleanup_queue); 470EXPORT_SYMBOL(blk_cleanup_queue);
@@ -515,6 +521,11 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
515 return NULL; 521 return NULL;
516 } 522 }
517 523
524 if (blk_throtl_init(q)) {
525 kmem_cache_free(blk_requestq_cachep, q);
526 return NULL;
527 }
528
518 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, 529 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
519 laptop_mode_timer_fn, (unsigned long) q); 530 laptop_mode_timer_fn, (unsigned long) q);
520 init_timer(&q->unplug_timer); 531 init_timer(&q->unplug_timer);
@@ -796,11 +807,16 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
796 rl->starved[is_sync] = 0; 807 rl->starved[is_sync] = 0;
797 808
798 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 809 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
799 if (priv) 810 if (priv) {
800 rl->elvpriv++; 811 rl->elvpriv++;
801 812
802 if (blk_queue_io_stat(q)) 813 /*
803 rw_flags |= REQ_IO_STAT; 814 * Don't do stats for non-priv requests
815 */
816 if (blk_queue_io_stat(q))
817 rw_flags |= REQ_IO_STAT;
818 }
819
804 spin_unlock_irq(q->queue_lock); 820 spin_unlock_irq(q->queue_lock);
805 821
806 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); 822 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
@@ -1522,6 +1538,15 @@ static inline void __generic_make_request(struct bio *bio)
1522 goto end_io; 1538 goto end_io;
1523 } 1539 }
1524 1540
1541 blk_throtl_bio(q, &bio);
1542
1543 /*
1544 * If bio = NULL, bio has been throttled and will be submitted
1545 * later.
1546 */
1547 if (!bio)
1548 break;
1549
1525 trace_block_bio_queue(q, bio); 1550 trace_block_bio_queue(q, bio);
1526 1551
1527 ret = q->make_request_fn(q, bio); 1552 ret = q->make_request_fn(q, bio);
@@ -1612,11 +1637,12 @@ void submit_bio(int rw, struct bio *bio)
1612 1637
1613 if (unlikely(block_dump)) { 1638 if (unlikely(block_dump)) {
1614 char b[BDEVNAME_SIZE]; 1639 char b[BDEVNAME_SIZE];
1615 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", 1640 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
1616 current->comm, task_pid_nr(current), 1641 current->comm, task_pid_nr(current),
1617 (rw & WRITE) ? "WRITE" : "READ", 1642 (rw & WRITE) ? "WRITE" : "READ",
1618 (unsigned long long)bio->bi_sector, 1643 (unsigned long long)bio->bi_sector,
1619 bdevname(bio->bi_bdev, b)); 1644 bdevname(bio->bi_bdev, b),
1645 count);
1620 } 1646 }
1621 } 1647 }
1622 1648
@@ -1759,7 +1785,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
1759 int cpu; 1785 int cpu;
1760 1786
1761 cpu = part_stat_lock(); 1787 cpu = part_stat_lock();
1762 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 1788 part = req->part;
1763 part_stat_add(cpu, part, sectors[rw], bytes >> 9); 1789 part_stat_add(cpu, part, sectors[rw], bytes >> 9);
1764 part_stat_unlock(); 1790 part_stat_unlock();
1765 } 1791 }
@@ -1779,7 +1805,7 @@ static void blk_account_io_done(struct request *req)
1779 int cpu; 1805 int cpu;
1780 1806
1781 cpu = part_stat_lock(); 1807 cpu = part_stat_lock();
1782 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 1808 part = req->part;
1783 1809
1784 part_stat_inc(cpu, part, ios[rw]); 1810 part_stat_inc(cpu, part, ios[rw]);
1785 part_stat_add(cpu, part, ticks[rw], duration); 1811 part_stat_add(cpu, part, ticks[rw], duration);
@@ -2579,6 +2605,13 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
2579} 2605}
2580EXPORT_SYMBOL(kblockd_schedule_work); 2606EXPORT_SYMBOL(kblockd_schedule_work);
2581 2607
2608int kblockd_schedule_delayed_work(struct request_queue *q,
2609 struct delayed_work *dwork, unsigned long delay)
2610{
2611 return queue_delayed_work(kblockd_workqueue, dwork, delay);
2612}
2613EXPORT_SYMBOL(kblockd_schedule_delayed_work);
2614
2582int __init blk_dev_init(void) 2615int __init blk_dev_init(void)
2583{ 2616{
2584 BUILD_BUG_ON(__REQ_NR_BITS > 8 * 2617 BUILD_BUG_ON(__REQ_NR_BITS > 8 *
diff --git a/block/blk-exec.c b/block/blk-exec.c
index e1672f14840e..cf1456a02acd 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -80,6 +80,7 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
80 DECLARE_COMPLETION_ONSTACK(wait); 80 DECLARE_COMPLETION_ONSTACK(wait);
81 char sense[SCSI_SENSE_BUFFERSIZE]; 81 char sense[SCSI_SENSE_BUFFERSIZE];
82 int err = 0; 82 int err = 0;
83 unsigned long hang_check;
83 84
84 /* 85 /*
85 * we need an extra reference to the request, so we can look at 86 * we need an extra reference to the request, so we can look at
@@ -95,7 +96,13 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
95 96
96 rq->end_io_data = &wait; 97 rq->end_io_data = &wait;
97 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); 98 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
98 wait_for_completion(&wait); 99
100 /* Prevent hang_check timer from firing at us during very long I/O */
101 hang_check = sysctl_hung_task_timeout_secs;
102 if (hang_check)
103 while (!wait_for_completion_timeout(&wait, hang_check * (HZ/2)));
104 else
105 wait_for_completion(&wait);
99 106
100 if (rq->errors) 107 if (rq->errors)
101 err = -EIO; 108 err = -EIO;
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index edce1ef7933d..54bcba6c02a7 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -32,24 +32,37 @@ static struct kmem_cache *integrity_cachep;
32 32
33/** 33/**
34 * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements 34 * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements
35 * @rq: request with integrity metadata attached 35 * @q: request queue
36 * @bio: bio with integrity metadata attached
36 * 37 *
37 * Description: Returns the number of elements required in a 38 * Description: Returns the number of elements required in a
38 * scatterlist corresponding to the integrity metadata in a request. 39 * scatterlist corresponding to the integrity metadata in a bio.
39 */ 40 */
40int blk_rq_count_integrity_sg(struct request *rq) 41int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio)
41{ 42{
42 struct bio_vec *iv, *ivprv; 43 struct bio_vec *iv, *ivprv = NULL;
43 struct req_iterator iter; 44 unsigned int segments = 0;
44 unsigned int segments; 45 unsigned int seg_size = 0;
46 unsigned int i = 0;
45 47
46 ivprv = NULL; 48 bio_for_each_integrity_vec(iv, bio, i) {
47 segments = 0;
48 49
49 rq_for_each_integrity_segment(iv, rq, iter) { 50 if (ivprv) {
51 if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv))
52 goto new_segment;
53
54 if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv))
55 goto new_segment;
50 56
51 if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) 57 if (seg_size + iv->bv_len > queue_max_segment_size(q))
58 goto new_segment;
59
60 seg_size += iv->bv_len;
61 } else {
62new_segment:
52 segments++; 63 segments++;
64 seg_size = iv->bv_len;
65 }
53 66
54 ivprv = iv; 67 ivprv = iv;
55 } 68 }
@@ -60,30 +73,34 @@ EXPORT_SYMBOL(blk_rq_count_integrity_sg);
60 73
61/** 74/**
62 * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist 75 * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
63 * @rq: request with integrity metadata attached 76 * @q: request queue
77 * @bio: bio with integrity metadata attached
64 * @sglist: target scatterlist 78 * @sglist: target scatterlist
65 * 79 *
66 * Description: Map the integrity vectors in request into a 80 * Description: Map the integrity vectors in request into a
67 * scatterlist. The scatterlist must be big enough to hold all 81 * scatterlist. The scatterlist must be big enough to hold all
68 * elements. I.e. sized using blk_rq_count_integrity_sg(). 82 * elements. I.e. sized using blk_rq_count_integrity_sg().
69 */ 83 */
70int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) 84int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio,
85 struct scatterlist *sglist)
71{ 86{
72 struct bio_vec *iv, *ivprv; 87 struct bio_vec *iv, *ivprv = NULL;
73 struct req_iterator iter; 88 struct scatterlist *sg = NULL;
74 struct scatterlist *sg; 89 unsigned int segments = 0;
75 unsigned int segments; 90 unsigned int i = 0;
76
77 ivprv = NULL;
78 sg = NULL;
79 segments = 0;
80 91
81 rq_for_each_integrity_segment(iv, rq, iter) { 92 bio_for_each_integrity_vec(iv, bio, i) {
82 93
83 if (ivprv) { 94 if (ivprv) {
84 if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) 95 if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv))
85 goto new_segment; 96 goto new_segment;
86 97
98 if (!BIOVEC_SEG_BOUNDARY(q, ivprv, iv))
99 goto new_segment;
100
101 if (sg->length + iv->bv_len > queue_max_segment_size(q))
102 goto new_segment;
103
87 sg->length += iv->bv_len; 104 sg->length += iv->bv_len;
88 } else { 105 } else {
89new_segment: 106new_segment:
@@ -162,6 +179,40 @@ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2)
162} 179}
163EXPORT_SYMBOL(blk_integrity_compare); 180EXPORT_SYMBOL(blk_integrity_compare);
164 181
182int blk_integrity_merge_rq(struct request_queue *q, struct request *req,
183 struct request *next)
184{
185 if (blk_integrity_rq(req) != blk_integrity_rq(next))
186 return -1;
187
188 if (req->nr_integrity_segments + next->nr_integrity_segments >
189 q->limits.max_integrity_segments)
190 return -1;
191
192 return 0;
193}
194EXPORT_SYMBOL(blk_integrity_merge_rq);
195
196int blk_integrity_merge_bio(struct request_queue *q, struct request *req,
197 struct bio *bio)
198{
199 int nr_integrity_segs;
200 struct bio *next = bio->bi_next;
201
202 bio->bi_next = NULL;
203 nr_integrity_segs = blk_rq_count_integrity_sg(q, bio);
204 bio->bi_next = next;
205
206 if (req->nr_integrity_segments + nr_integrity_segs >
207 q->limits.max_integrity_segments)
208 return -1;
209
210 req->nr_integrity_segments += nr_integrity_segs;
211
212 return 0;
213}
214EXPORT_SYMBOL(blk_integrity_merge_bio);
215
165struct integrity_sysfs_entry { 216struct integrity_sysfs_entry {
166 struct attribute attr; 217 struct attribute attr;
167 ssize_t (*show)(struct blk_integrity *, char *); 218 ssize_t (*show)(struct blk_integrity *, char *);
@@ -381,7 +432,6 @@ void blk_integrity_unregister(struct gendisk *disk)
381 kobject_uevent(&bi->kobj, KOBJ_REMOVE); 432 kobject_uevent(&bi->kobj, KOBJ_REMOVE);
382 kobject_del(&bi->kobj); 433 kobject_del(&bi->kobj);
383 kobject_put(&bi->kobj); 434 kobject_put(&bi->kobj);
384 kmem_cache_free(integrity_cachep, bi);
385 disk->integrity = NULL; 435 disk->integrity = NULL;
386} 436}
387EXPORT_SYMBOL(blk_integrity_unregister); 437EXPORT_SYMBOL(blk_integrity_unregister);
diff --git a/block/blk-map.c b/block/blk-map.c
index ade0a08c9099..d4a586d8691e 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -54,7 +54,7 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
54 * direct dma. else, set up kernel bounce buffers 54 * direct dma. else, set up kernel bounce buffers
55 */ 55 */
56 uaddr = (unsigned long) ubuf; 56 uaddr = (unsigned long) ubuf;
57 if (blk_rq_aligned(q, ubuf, len) && !map_data) 57 if (blk_rq_aligned(q, uaddr, len) && !map_data)
58 bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask); 58 bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask);
59 else 59 else
60 bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask); 60 bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask);
@@ -288,6 +288,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
288 unsigned int len, gfp_t gfp_mask) 288 unsigned int len, gfp_t gfp_mask)
289{ 289{
290 int reading = rq_data_dir(rq) == READ; 290 int reading = rq_data_dir(rq) == READ;
291 unsigned long addr = (unsigned long) kbuf;
291 int do_copy = 0; 292 int do_copy = 0;
292 struct bio *bio; 293 struct bio *bio;
293 int ret; 294 int ret;
@@ -297,7 +298,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
297 if (!len || !kbuf) 298 if (!len || !kbuf)
298 return -EINVAL; 299 return -EINVAL;
299 300
300 do_copy = !blk_rq_aligned(q, kbuf, len) || object_is_on_stack(kbuf); 301 do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf);
301 if (do_copy) 302 if (do_copy)
302 bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); 303 bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
303 else 304 else
diff --git a/block/blk-merge.c b/block/blk-merge.c
index eafc94f68d79..0a2fd8a48a38 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -205,12 +205,11 @@ static inline int ll_new_hw_segment(struct request_queue *q,
205{ 205{
206 int nr_phys_segs = bio_phys_segments(q, bio); 206 int nr_phys_segs = bio_phys_segments(q, bio);
207 207
208 if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { 208 if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
209 req->cmd_flags |= REQ_NOMERGE; 209 goto no_merge;
210 if (req == q->last_merge) 210
211 q->last_merge = NULL; 211 if (bio_integrity(bio) && blk_integrity_merge_bio(q, req, bio))
212 return 0; 212 goto no_merge;
213 }
214 213
215 /* 214 /*
216 * This will form the start of a new hw segment. Bump both 215 * This will form the start of a new hw segment. Bump both
@@ -218,6 +217,12 @@ static inline int ll_new_hw_segment(struct request_queue *q,
218 */ 217 */
219 req->nr_phys_segments += nr_phys_segs; 218 req->nr_phys_segments += nr_phys_segs;
220 return 1; 219 return 1;
220
221no_merge:
222 req->cmd_flags |= REQ_NOMERGE;
223 if (req == q->last_merge)
224 q->last_merge = NULL;
225 return 0;
221} 226}
222 227
223int ll_back_merge_fn(struct request_queue *q, struct request *req, 228int ll_back_merge_fn(struct request_queue *q, struct request *req,
@@ -301,6 +306,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
301 if (total_phys_segments > queue_max_segments(q)) 306 if (total_phys_segments > queue_max_segments(q))
302 return 0; 307 return 0;
303 308
309 if (blk_integrity_rq(req) && blk_integrity_merge_rq(q, req, next))
310 return 0;
311
304 /* Merge is OK... */ 312 /* Merge is OK... */
305 req->nr_phys_segments = total_phys_segments; 313 req->nr_phys_segments = total_phys_segments;
306 return 1; 314 return 1;
@@ -343,7 +351,7 @@ static void blk_account_io_merge(struct request *req)
343 int cpu; 351 int cpu;
344 352
345 cpu = part_stat_lock(); 353 cpu = part_stat_lock();
346 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 354 part = req->part;
347 355
348 part_round_stats(cpu, part); 356 part_round_stats(cpu, part);
349 part_dec_in_flight(part, rq_data_dir(req)); 357 part_dec_in_flight(part, rq_data_dir(req));
@@ -384,9 +392,6 @@ static int attempt_merge(struct request_queue *q, struct request *req,
384 || next->special) 392 || next->special)
385 return 0; 393 return 0;
386 394
387 if (blk_integrity_rq(req) != blk_integrity_rq(next))
388 return 0;
389
390 /* 395 /*
391 * If we are allowed to merge, then append bio list 396 * If we are allowed to merge, then append bio list
392 * from next to rq and release next. merge_requests_fn 397 * from next to rq and release next. merge_requests_fn
diff --git a/block/blk-settings.c b/block/blk-settings.c
index a234f4bf1d6f..315b88c8cbbb 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -111,6 +111,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
111void blk_set_default_limits(struct queue_limits *lim) 111void blk_set_default_limits(struct queue_limits *lim)
112{ 112{
113 lim->max_segments = BLK_MAX_SEGMENTS; 113 lim->max_segments = BLK_MAX_SEGMENTS;
114 lim->max_integrity_segments = 0;
114 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; 115 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
115 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; 116 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
116 lim->max_sectors = BLK_DEF_MAX_SECTORS; 117 lim->max_sectors = BLK_DEF_MAX_SECTORS;
@@ -213,7 +214,7 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
213 */ 214 */
214 if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) 215 if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
215 dma = 1; 216 dma = 1;
216 q->limits.bounce_pfn = max_low_pfn; 217 q->limits.bounce_pfn = max(max_low_pfn, b_pfn);
217#else 218#else
218 if (b_pfn < blk_max_low_pfn) 219 if (b_pfn < blk_max_low_pfn)
219 dma = 1; 220 dma = 1;
@@ -343,7 +344,7 @@ EXPORT_SYMBOL(blk_queue_logical_block_size);
343 * hardware can operate on without reverting to read-modify-write 344 * hardware can operate on without reverting to read-modify-write
344 * operations. 345 * operations.
345 */ 346 */
346void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) 347void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
347{ 348{
348 q->limits.physical_block_size = size; 349 q->limits.physical_block_size = size;
349 350
@@ -455,11 +456,6 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
455} 456}
456EXPORT_SYMBOL(blk_queue_io_opt); 457EXPORT_SYMBOL(blk_queue_io_opt);
457 458
458/*
459 * Returns the minimum that is _not_ zero, unless both are zero.
460 */
461#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
462
463/** 459/**
464 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers 460 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
465 * @t: the stacking driver (top) 461 * @t: the stacking driver (top)
@@ -514,6 +510,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
514 b->seg_boundary_mask); 510 b->seg_boundary_mask);
515 511
516 t->max_segments = min_not_zero(t->max_segments, b->max_segments); 512 t->max_segments = min_not_zero(t->max_segments, b->max_segments);
513 t->max_integrity_segments = min_not_zero(t->max_integrity_segments,
514 b->max_integrity_segments);
517 515
518 t->max_segment_size = min_not_zero(t->max_segment_size, 516 t->max_segment_size = min_not_zero(t->max_segment_size,
519 b->max_segment_size); 517 b->max_segment_size);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 0749b89c6885..da8a8a40cd4c 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -112,6 +112,11 @@ static ssize_t queue_max_segments_show(struct request_queue *q, char *page)
112 return queue_var_show(queue_max_segments(q), (page)); 112 return queue_var_show(queue_max_segments(q), (page));
113} 113}
114 114
115static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
116{
117 return queue_var_show(q->limits.max_integrity_segments, (page));
118}
119
115static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) 120static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
116{ 121{
117 if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) 122 if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags))
@@ -288,6 +293,11 @@ static struct queue_sysfs_entry queue_max_segments_entry = {
288 .show = queue_max_segments_show, 293 .show = queue_max_segments_show,
289}; 294};
290 295
296static struct queue_sysfs_entry queue_max_integrity_segments_entry = {
297 .attr = {.name = "max_integrity_segments", .mode = S_IRUGO },
298 .show = queue_max_integrity_segments_show,
299};
300
291static struct queue_sysfs_entry queue_max_segment_size_entry = { 301static struct queue_sysfs_entry queue_max_segment_size_entry = {
292 .attr = {.name = "max_segment_size", .mode = S_IRUGO }, 302 .attr = {.name = "max_segment_size", .mode = S_IRUGO },
293 .show = queue_max_segment_size_show, 303 .show = queue_max_segment_size_show,
@@ -375,6 +385,7 @@ static struct attribute *default_attrs[] = {
375 &queue_max_hw_sectors_entry.attr, 385 &queue_max_hw_sectors_entry.attr,
376 &queue_max_sectors_entry.attr, 386 &queue_max_sectors_entry.attr,
377 &queue_max_segments_entry.attr, 387 &queue_max_segments_entry.attr,
388 &queue_max_integrity_segments_entry.attr,
378 &queue_max_segment_size_entry.attr, 389 &queue_max_segment_size_entry.attr,
379 &queue_iosched_entry.attr, 390 &queue_iosched_entry.attr,
380 &queue_hw_sector_size_entry.attr, 391 &queue_hw_sector_size_entry.attr,
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
new file mode 100644
index 000000000000..56ad4531b412
--- /dev/null
+++ b/block/blk-throttle.c
@@ -0,0 +1,1123 @@
1/*
2 * Interface for controlling IO bandwidth on a request queue
3 *
4 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
5 */
6
7#include <linux/module.h>
8#include <linux/slab.h>
9#include <linux/blkdev.h>
10#include <linux/bio.h>
11#include <linux/blktrace_api.h>
12#include "blk-cgroup.h"
13
14/* Max dispatch from a group in 1 round */
15static int throtl_grp_quantum = 8;
16
17/* Total max dispatch from all groups in one round */
18static int throtl_quantum = 32;
19
20/* Throttling is performed over 100ms slice and after that slice is renewed */
21static unsigned long throtl_slice = HZ/10; /* 100 ms */
22
23struct throtl_rb_root {
24 struct rb_root rb;
25 struct rb_node *left;
26 unsigned int count;
27 unsigned long min_disptime;
28};
29
30#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
31 .count = 0, .min_disptime = 0}
32
33#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
34
35struct throtl_grp {
36 /* List of throtl groups on the request queue*/
37 struct hlist_node tg_node;
38
39 /* active throtl group service_tree member */
40 struct rb_node rb_node;
41
42 /*
43 * Dispatch time in jiffies. This is the estimated time when group
44 * will unthrottle and is ready to dispatch more bio. It is used as
45 * key to sort active groups in service tree.
46 */
47 unsigned long disptime;
48
49 struct blkio_group blkg;
50 atomic_t ref;
51 unsigned int flags;
52
53 /* Two lists for READ and WRITE */
54 struct bio_list bio_lists[2];
55
56 /* Number of queued bios on READ and WRITE lists */
57 unsigned int nr_queued[2];
58
59 /* bytes per second rate limits */
60 uint64_t bps[2];
61
62 /* IOPS limits */
63 unsigned int iops[2];
64
65 /* Number of bytes disptached in current slice */
66 uint64_t bytes_disp[2];
67 /* Number of bio's dispatched in current slice */
68 unsigned int io_disp[2];
69
70 /* When did we start a new slice */
71 unsigned long slice_start[2];
72 unsigned long slice_end[2];
73
74 /* Some throttle limits got updated for the group */
75 bool limits_changed;
76};
77
78struct throtl_data
79{
80 /* List of throtl groups */
81 struct hlist_head tg_list;
82
83 /* service tree for active throtl groups */
84 struct throtl_rb_root tg_service_tree;
85
86 struct throtl_grp root_tg;
87 struct request_queue *queue;
88
89 /* Total Number of queued bios on READ and WRITE lists */
90 unsigned int nr_queued[2];
91
92 /*
93 * number of total undestroyed groups
94 */
95 unsigned int nr_undestroyed_grps;
96
97 /* Work for dispatching throttled bios */
98 struct delayed_work throtl_work;
99
100 atomic_t limits_changed;
101};
102
103enum tg_state_flags {
104 THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */
105};
106
107#define THROTL_TG_FNS(name) \
108static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \
109{ \
110 (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \
111} \
112static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \
113{ \
114 (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \
115} \
116static inline int throtl_tg_##name(const struct throtl_grp *tg) \
117{ \
118 return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \
119}
120
121THROTL_TG_FNS(on_rr);
122
123#define throtl_log_tg(td, tg, fmt, args...) \
124 blk_add_trace_msg((td)->queue, "throtl %s " fmt, \
125 blkg_path(&(tg)->blkg), ##args); \
126
127#define throtl_log(td, fmt, args...) \
128 blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
129
130static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
131{
132 if (blkg)
133 return container_of(blkg, struct throtl_grp, blkg);
134
135 return NULL;
136}
137
138static inline int total_nr_queued(struct throtl_data *td)
139{
140 return (td->nr_queued[0] + td->nr_queued[1]);
141}
142
143static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
144{
145 atomic_inc(&tg->ref);
146 return tg;
147}
148
149static void throtl_put_tg(struct throtl_grp *tg)
150{
151 BUG_ON(atomic_read(&tg->ref) <= 0);
152 if (!atomic_dec_and_test(&tg->ref))
153 return;
154 kfree(tg);
155}
156
157static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
158 struct cgroup *cgroup)
159{
160 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
161 struct throtl_grp *tg = NULL;
162 void *key = td;
163 struct backing_dev_info *bdi = &td->queue->backing_dev_info;
164 unsigned int major, minor;
165
166 /*
167 * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
168 * tree of blkg (instead of traversing through hash list all
169 * the time.
170 */
171 tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
172
173 /* Fill in device details for root group */
174 if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
175 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
176 tg->blkg.dev = MKDEV(major, minor);
177 goto done;
178 }
179
180 if (tg)
181 goto done;
182
183 tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
184 if (!tg)
185 goto done;
186
187 INIT_HLIST_NODE(&tg->tg_node);
188 RB_CLEAR_NODE(&tg->rb_node);
189 bio_list_init(&tg->bio_lists[0]);
190 bio_list_init(&tg->bio_lists[1]);
191
192 /*
193 * Take the initial reference that will be released on destroy
194 * This can be thought of a joint reference by cgroup and
195 * request queue which will be dropped by either request queue
196 * exit or cgroup deletion path depending on who is exiting first.
197 */
198 atomic_set(&tg->ref, 1);
199
200 /* Add group onto cgroup list */
201 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
202 blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
203 MKDEV(major, minor), BLKIO_POLICY_THROTL);
204
205 tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
206 tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
207 tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
208 tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
209
210 hlist_add_head(&tg->tg_node, &td->tg_list);
211 td->nr_undestroyed_grps++;
212done:
213 return tg;
214}
215
216static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
217{
218 struct cgroup *cgroup;
219 struct throtl_grp *tg = NULL;
220
221 rcu_read_lock();
222 cgroup = task_cgroup(current, blkio_subsys_id);
223 tg = throtl_find_alloc_tg(td, cgroup);
224 if (!tg)
225 tg = &td->root_tg;
226 rcu_read_unlock();
227 return tg;
228}
229
230static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
231{
232 /* Service tree is empty */
233 if (!root->count)
234 return NULL;
235
236 if (!root->left)
237 root->left = rb_first(&root->rb);
238
239 if (root->left)
240 return rb_entry_tg(root->left);
241
242 return NULL;
243}
244
245static void rb_erase_init(struct rb_node *n, struct rb_root *root)
246{
247 rb_erase(n, root);
248 RB_CLEAR_NODE(n);
249}
250
251static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
252{
253 if (root->left == n)
254 root->left = NULL;
255 rb_erase_init(n, &root->rb);
256 --root->count;
257}
258
259static void update_min_dispatch_time(struct throtl_rb_root *st)
260{
261 struct throtl_grp *tg;
262
263 tg = throtl_rb_first(st);
264 if (!tg)
265 return;
266
267 st->min_disptime = tg->disptime;
268}
269
270static void
271tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
272{
273 struct rb_node **node = &st->rb.rb_node;
274 struct rb_node *parent = NULL;
275 struct throtl_grp *__tg;
276 unsigned long key = tg->disptime;
277 int left = 1;
278
279 while (*node != NULL) {
280 parent = *node;
281 __tg = rb_entry_tg(parent);
282
283 if (time_before(key, __tg->disptime))
284 node = &parent->rb_left;
285 else {
286 node = &parent->rb_right;
287 left = 0;
288 }
289 }
290
291 if (left)
292 st->left = &tg->rb_node;
293
294 rb_link_node(&tg->rb_node, parent, node);
295 rb_insert_color(&tg->rb_node, &st->rb);
296}
297
298static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
299{
300 struct throtl_rb_root *st = &td->tg_service_tree;
301
302 tg_service_tree_add(st, tg);
303 throtl_mark_tg_on_rr(tg);
304 st->count++;
305}
306
307static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
308{
309 if (!throtl_tg_on_rr(tg))
310 __throtl_enqueue_tg(td, tg);
311}
312
313static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
314{
315 throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
316 throtl_clear_tg_on_rr(tg);
317}
318
319static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
320{
321 if (throtl_tg_on_rr(tg))
322 __throtl_dequeue_tg(td, tg);
323}
324
325static void throtl_schedule_next_dispatch(struct throtl_data *td)
326{
327 struct throtl_rb_root *st = &td->tg_service_tree;
328
329 /*
330 * If there are more bios pending, schedule more work.
331 */
332 if (!total_nr_queued(td))
333 return;
334
335 BUG_ON(!st->count);
336
337 update_min_dispatch_time(st);
338
339 if (time_before_eq(st->min_disptime, jiffies))
340 throtl_schedule_delayed_work(td->queue, 0);
341 else
342 throtl_schedule_delayed_work(td->queue,
343 (st->min_disptime - jiffies));
344}
345
346static inline void
347throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
348{
349 tg->bytes_disp[rw] = 0;
350 tg->io_disp[rw] = 0;
351 tg->slice_start[rw] = jiffies;
352 tg->slice_end[rw] = jiffies + throtl_slice;
353 throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
354 rw == READ ? 'R' : 'W', tg->slice_start[rw],
355 tg->slice_end[rw], jiffies);
356}
357
358static inline void throtl_extend_slice(struct throtl_data *td,
359 struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
360{
361 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
362 throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
363 rw == READ ? 'R' : 'W', tg->slice_start[rw],
364 tg->slice_end[rw], jiffies);
365}
366
367/* Determine if previously allocated or extended slice is complete or not */
368static bool
369throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
370{
371 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
372 return 0;
373
374 return 1;
375}
376
377/* Trim the used slices and adjust slice start accordingly */
378static inline void
379throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
380{
381 unsigned long nr_slices, time_elapsed, io_trim;
382 u64 bytes_trim, tmp;
383
384 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
385
386 /*
387 * If bps are unlimited (-1), then time slice don't get
388 * renewed. Don't try to trim the slice if slice is used. A new
389 * slice will start when appropriate.
390 */
391 if (throtl_slice_used(td, tg, rw))
392 return;
393
394 time_elapsed = jiffies - tg->slice_start[rw];
395
396 nr_slices = time_elapsed / throtl_slice;
397
398 if (!nr_slices)
399 return;
400 tmp = tg->bps[rw] * throtl_slice * nr_slices;
401 do_div(tmp, HZ);
402 bytes_trim = tmp;
403
404 io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
405
406 if (!bytes_trim && !io_trim)
407 return;
408
409 if (tg->bytes_disp[rw] >= bytes_trim)
410 tg->bytes_disp[rw] -= bytes_trim;
411 else
412 tg->bytes_disp[rw] = 0;
413
414 if (tg->io_disp[rw] >= io_trim)
415 tg->io_disp[rw] -= io_trim;
416 else
417 tg->io_disp[rw] = 0;
418
419 tg->slice_start[rw] += nr_slices * throtl_slice;
420
421 throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
422 " start=%lu end=%lu jiffies=%lu",
423 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
424 tg->slice_start[rw], tg->slice_end[rw], jiffies);
425}
426
427static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
428 struct bio *bio, unsigned long *wait)
429{
430 bool rw = bio_data_dir(bio);
431 unsigned int io_allowed;
432 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
433 u64 tmp;
434
435 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
436
437 /* Slice has just started. Consider one slice interval */
438 if (!jiffy_elapsed)
439 jiffy_elapsed_rnd = throtl_slice;
440
441 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
442
443 /*
444 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
445 * 1 then at max jiffy elapsed should be equivalent of 1 second as we
446 * will allow dispatch after 1 second and after that slice should
447 * have been trimmed.
448 */
449
450 tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
451 do_div(tmp, HZ);
452
453 if (tmp > UINT_MAX)
454 io_allowed = UINT_MAX;
455 else
456 io_allowed = tmp;
457
458 if (tg->io_disp[rw] + 1 <= io_allowed) {
459 if (wait)
460 *wait = 0;
461 return 1;
462 }
463
464 /* Calc approx time to dispatch */
465 jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
466
467 if (jiffy_wait > jiffy_elapsed)
468 jiffy_wait = jiffy_wait - jiffy_elapsed;
469 else
470 jiffy_wait = 1;
471
472 if (wait)
473 *wait = jiffy_wait;
474 return 0;
475}
476
477static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
478 struct bio *bio, unsigned long *wait)
479{
480 bool rw = bio_data_dir(bio);
481 u64 bytes_allowed, extra_bytes, tmp;
482 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
483
484 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
485
486 /* Slice has just started. Consider one slice interval */
487 if (!jiffy_elapsed)
488 jiffy_elapsed_rnd = throtl_slice;
489
490 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
491
492 tmp = tg->bps[rw] * jiffy_elapsed_rnd;
493 do_div(tmp, HZ);
494 bytes_allowed = tmp;
495
496 if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
497 if (wait)
498 *wait = 0;
499 return 1;
500 }
501
502 /* Calc approx time to dispatch */
503 extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
504 jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
505
506 if (!jiffy_wait)
507 jiffy_wait = 1;
508
509 /*
510 * This wait time is without taking into consideration the rounding
511 * up we did. Add that time also.
512 */
513 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
514 if (wait)
515 *wait = jiffy_wait;
516 return 0;
517}
518
519/*
520 * Returns whether one can dispatch a bio or not. Also returns approx number
521 * of jiffies to wait before this bio is with-in IO rate and can be dispatched
522 */
523static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
524 struct bio *bio, unsigned long *wait)
525{
526 bool rw = bio_data_dir(bio);
527 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
528
529 /*
530 * Currently whole state machine of group depends on first bio
531 * queued in the group bio list. So one should not be calling
532 * this function with a different bio if there are other bios
533 * queued.
534 */
535 BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
536
537 /* If tg->bps = -1, then BW is unlimited */
538 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
539 if (wait)
540 *wait = 0;
541 return 1;
542 }
543
544 /*
545 * If previous slice expired, start a new one otherwise renew/extend
546 * existing slice to make sure it is at least throtl_slice interval
547 * long since now.
548 */
549 if (throtl_slice_used(td, tg, rw))
550 throtl_start_new_slice(td, tg, rw);
551 else {
552 if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
553 throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
554 }
555
556 if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
557 && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
558 if (wait)
559 *wait = 0;
560 return 1;
561 }
562
563 max_wait = max(bps_wait, iops_wait);
564
565 if (wait)
566 *wait = max_wait;
567
568 if (time_before(tg->slice_end[rw], jiffies + max_wait))
569 throtl_extend_slice(td, tg, rw, jiffies + max_wait);
570
571 return 0;
572}
573
574static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
575{
576 bool rw = bio_data_dir(bio);
577 bool sync = bio->bi_rw & REQ_SYNC;
578
579 /* Charge the bio to the group */
580 tg->bytes_disp[rw] += bio->bi_size;
581 tg->io_disp[rw]++;
582
583 /*
584 * TODO: This will take blkg->stats_lock. Figure out a way
585 * to avoid this cost.
586 */
587 blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
588}
589
590static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
591 struct bio *bio)
592{
593 bool rw = bio_data_dir(bio);
594
595 bio_list_add(&tg->bio_lists[rw], bio);
596 /* Take a bio reference on tg */
597 throtl_ref_get_tg(tg);
598 tg->nr_queued[rw]++;
599 td->nr_queued[rw]++;
600 throtl_enqueue_tg(td, tg);
601}
602
603static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
604{
605 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
606 struct bio *bio;
607
608 if ((bio = bio_list_peek(&tg->bio_lists[READ])))
609 tg_may_dispatch(td, tg, bio, &read_wait);
610
611 if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
612 tg_may_dispatch(td, tg, bio, &write_wait);
613
614 min_wait = min(read_wait, write_wait);
615 disptime = jiffies + min_wait;
616
617 /* Update dispatch time */
618 throtl_dequeue_tg(td, tg);
619 tg->disptime = disptime;
620 throtl_enqueue_tg(td, tg);
621}
622
623static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
624 bool rw, struct bio_list *bl)
625{
626 struct bio *bio;
627
628 bio = bio_list_pop(&tg->bio_lists[rw]);
629 tg->nr_queued[rw]--;
630 /* Drop bio reference on tg */
631 throtl_put_tg(tg);
632
633 BUG_ON(td->nr_queued[rw] <= 0);
634 td->nr_queued[rw]--;
635
636 throtl_charge_bio(tg, bio);
637 bio_list_add(bl, bio);
638 bio->bi_rw |= REQ_THROTTLED;
639
640 throtl_trim_slice(td, tg, rw);
641}
642
643static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
644 struct bio_list *bl)
645{
646 unsigned int nr_reads = 0, nr_writes = 0;
647 unsigned int max_nr_reads = throtl_grp_quantum*3/4;
648 unsigned int max_nr_writes = throtl_grp_quantum - nr_reads;
649 struct bio *bio;
650
651 /* Try to dispatch 75% READS and 25% WRITES */
652
653 while ((bio = bio_list_peek(&tg->bio_lists[READ]))
654 && tg_may_dispatch(td, tg, bio, NULL)) {
655
656 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
657 nr_reads++;
658
659 if (nr_reads >= max_nr_reads)
660 break;
661 }
662
663 while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
664 && tg_may_dispatch(td, tg, bio, NULL)) {
665
666 tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
667 nr_writes++;
668
669 if (nr_writes >= max_nr_writes)
670 break;
671 }
672
673 return nr_reads + nr_writes;
674}
675
676static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
677{
678 unsigned int nr_disp = 0;
679 struct throtl_grp *tg;
680 struct throtl_rb_root *st = &td->tg_service_tree;
681
682 while (1) {
683 tg = throtl_rb_first(st);
684
685 if (!tg)
686 break;
687
688 if (time_before(jiffies, tg->disptime))
689 break;
690
691 throtl_dequeue_tg(td, tg);
692
693 nr_disp += throtl_dispatch_tg(td, tg, bl);
694
695 if (tg->nr_queued[0] || tg->nr_queued[1]) {
696 tg_update_disptime(td, tg);
697 throtl_enqueue_tg(td, tg);
698 }
699
700 if (nr_disp >= throtl_quantum)
701 break;
702 }
703
704 return nr_disp;
705}
706
707static void throtl_process_limit_change(struct throtl_data *td)
708{
709 struct throtl_grp *tg;
710 struct hlist_node *pos, *n;
711
712 /*
713 * Make sure atomic_inc() effects from
714 * throtl_update_blkio_group_read_bps(), group of functions are
715 * visible.
716 * Is this required or smp_mb__after_atomic_inc() was suffcient
717 * after the atomic_inc().
718 */
719 smp_rmb();
720 if (!atomic_read(&td->limits_changed))
721 return;
722
723 throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed));
724
725 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
726 /*
727 * Do I need an smp_rmb() here to make sure tg->limits_changed
728 * update is visible. I am relying on smp_rmb() at the
729 * beginning of function and not putting a new one here.
730 */
731
732 if (throtl_tg_on_rr(tg) && tg->limits_changed) {
733 throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
734 " riops=%u wiops=%u", tg->bps[READ],
735 tg->bps[WRITE], tg->iops[READ],
736 tg->iops[WRITE]);
737 tg_update_disptime(td, tg);
738 tg->limits_changed = false;
739 }
740 }
741
742 smp_mb__before_atomic_dec();
743 atomic_dec(&td->limits_changed);
744 smp_mb__after_atomic_dec();
745}
746
747/* Dispatch throttled bios. Should be called without queue lock held. */
748static int throtl_dispatch(struct request_queue *q)
749{
750 struct throtl_data *td = q->td;
751 unsigned int nr_disp = 0;
752 struct bio_list bio_list_on_stack;
753 struct bio *bio;
754
755 spin_lock_irq(q->queue_lock);
756
757 throtl_process_limit_change(td);
758
759 if (!total_nr_queued(td))
760 goto out;
761
762 bio_list_init(&bio_list_on_stack);
763
764 throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u",
765 total_nr_queued(td), td->nr_queued[READ],
766 td->nr_queued[WRITE]);
767
768 nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
769
770 if (nr_disp)
771 throtl_log(td, "bios disp=%u", nr_disp);
772
773 throtl_schedule_next_dispatch(td);
774out:
775 spin_unlock_irq(q->queue_lock);
776
777 /*
778 * If we dispatched some requests, unplug the queue to make sure
779 * immediate dispatch
780 */
781 if (nr_disp) {
782 while((bio = bio_list_pop(&bio_list_on_stack)))
783 generic_make_request(bio);
784 blk_unplug(q);
785 }
786 return nr_disp;
787}
788
789void blk_throtl_work(struct work_struct *work)
790{
791 struct throtl_data *td = container_of(work, struct throtl_data,
792 throtl_work.work);
793 struct request_queue *q = td->queue;
794
795 throtl_dispatch(q);
796}
797
798/* Call with queue lock held */
799void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay)
800{
801
802 struct throtl_data *td = q->td;
803 struct delayed_work *dwork = &td->throtl_work;
804
805 if (total_nr_queued(td) > 0) {
806 /*
807 * We might have a work scheduled to be executed in future.
808 * Cancel that and schedule a new one.
809 */
810 __cancel_delayed_work(dwork);
811 kblockd_schedule_delayed_work(q, dwork, delay);
812 throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
813 delay, jiffies);
814 }
815}
816EXPORT_SYMBOL(throtl_schedule_delayed_work);
817
818static void
819throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
820{
821 /* Something wrong if we are trying to remove same group twice */
822 BUG_ON(hlist_unhashed(&tg->tg_node));
823
824 hlist_del_init(&tg->tg_node);
825
826 /*
827 * Put the reference taken at the time of creation so that when all
828 * queues are gone, group can be destroyed.
829 */
830 throtl_put_tg(tg);
831 td->nr_undestroyed_grps--;
832}
833
834static void throtl_release_tgs(struct throtl_data *td)
835{
836 struct hlist_node *pos, *n;
837 struct throtl_grp *tg;
838
839 hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
840 /*
841 * If cgroup removal path got to blk_group first and removed
842 * it from cgroup list, then it will take care of destroying
843 * cfqg also.
844 */
845 if (!blkiocg_del_blkio_group(&tg->blkg))
846 throtl_destroy_tg(td, tg);
847 }
848}
849
850static void throtl_td_free(struct throtl_data *td)
851{
852 kfree(td);
853}
854
855/*
856 * Blk cgroup controller notification saying that blkio_group object is being
857 * delinked as associated cgroup object is going away. That also means that
858 * no new IO will come in this group. So get rid of this group as soon as
859 * any pending IO in the group is finished.
860 *
861 * This function is called under rcu_read_lock(). key is the rcu protected
862 * pointer. That means "key" is a valid throtl_data pointer as long as we are
863 * rcu read lock.
864 *
865 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
866 * it should not be NULL as even if queue was going away, cgroup deltion
867 * path got to it first.
868 */
869void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
870{
871 unsigned long flags;
872 struct throtl_data *td = key;
873
874 spin_lock_irqsave(td->queue->queue_lock, flags);
875 throtl_destroy_tg(td, tg_of_blkg(blkg));
876 spin_unlock_irqrestore(td->queue->queue_lock, flags);
877}
878
879/*
880 * For all update functions, key should be a valid pointer because these
881 * update functions are called under blkcg_lock, that means, blkg is
882 * valid and in turn key is valid. queue exit path can not race becuase
883 * of blkcg_lock
884 *
885 * Can not take queue lock in update functions as queue lock under blkcg_lock
886 * is not allowed. Under other paths we take blkcg_lock under queue_lock.
887 */
888static void throtl_update_blkio_group_read_bps(void *key,
889 struct blkio_group *blkg, u64 read_bps)
890{
891 struct throtl_data *td = key;
892
893 tg_of_blkg(blkg)->bps[READ] = read_bps;
894 /* Make sure read_bps is updated before setting limits_changed */
895 smp_wmb();
896 tg_of_blkg(blkg)->limits_changed = true;
897
898 /* Make sure tg->limits_changed is updated before td->limits_changed */
899 smp_mb__before_atomic_inc();
900 atomic_inc(&td->limits_changed);
901 smp_mb__after_atomic_inc();
902
903 /* Schedule a work now to process the limit change */
904 throtl_schedule_delayed_work(td->queue, 0);
905}
906
907static void throtl_update_blkio_group_write_bps(void *key,
908 struct blkio_group *blkg, u64 write_bps)
909{
910 struct throtl_data *td = key;
911
912 tg_of_blkg(blkg)->bps[WRITE] = write_bps;
913 smp_wmb();
914 tg_of_blkg(blkg)->limits_changed = true;
915 smp_mb__before_atomic_inc();
916 atomic_inc(&td->limits_changed);
917 smp_mb__after_atomic_inc();
918 throtl_schedule_delayed_work(td->queue, 0);
919}
920
921static void throtl_update_blkio_group_read_iops(void *key,
922 struct blkio_group *blkg, unsigned int read_iops)
923{
924 struct throtl_data *td = key;
925
926 tg_of_blkg(blkg)->iops[READ] = read_iops;
927 smp_wmb();
928 tg_of_blkg(blkg)->limits_changed = true;
929 smp_mb__before_atomic_inc();
930 atomic_inc(&td->limits_changed);
931 smp_mb__after_atomic_inc();
932 throtl_schedule_delayed_work(td->queue, 0);
933}
934
935static void throtl_update_blkio_group_write_iops(void *key,
936 struct blkio_group *blkg, unsigned int write_iops)
937{
938 struct throtl_data *td = key;
939
940 tg_of_blkg(blkg)->iops[WRITE] = write_iops;
941 smp_wmb();
942 tg_of_blkg(blkg)->limits_changed = true;
943 smp_mb__before_atomic_inc();
944 atomic_inc(&td->limits_changed);
945 smp_mb__after_atomic_inc();
946 throtl_schedule_delayed_work(td->queue, 0);
947}
948
949void throtl_shutdown_timer_wq(struct request_queue *q)
950{
951 struct throtl_data *td = q->td;
952
953 cancel_delayed_work_sync(&td->throtl_work);
954}
955
956static struct blkio_policy_type blkio_policy_throtl = {
957 .ops = {
958 .blkio_unlink_group_fn = throtl_unlink_blkio_group,
959 .blkio_update_group_read_bps_fn =
960 throtl_update_blkio_group_read_bps,
961 .blkio_update_group_write_bps_fn =
962 throtl_update_blkio_group_write_bps,
963 .blkio_update_group_read_iops_fn =
964 throtl_update_blkio_group_read_iops,
965 .blkio_update_group_write_iops_fn =
966 throtl_update_blkio_group_write_iops,
967 },
968 .plid = BLKIO_POLICY_THROTL,
969};
970
971int blk_throtl_bio(struct request_queue *q, struct bio **biop)
972{
973 struct throtl_data *td = q->td;
974 struct throtl_grp *tg;
975 struct bio *bio = *biop;
976 bool rw = bio_data_dir(bio), update_disptime = true;
977
978 if (bio->bi_rw & REQ_THROTTLED) {
979 bio->bi_rw &= ~REQ_THROTTLED;
980 return 0;
981 }
982
983 spin_lock_irq(q->queue_lock);
984 tg = throtl_get_tg(td);
985
986 if (tg->nr_queued[rw]) {
987 /*
988 * There is already another bio queued in same dir. No
989 * need to update dispatch time.
990 * Still update the disptime if rate limits on this group
991 * were changed.
992 */
993 if (!tg->limits_changed)
994 update_disptime = false;
995 else
996 tg->limits_changed = false;
997
998 goto queue_bio;
999 }
1000
1001 /* Bio is with-in rate limit of group */
1002 if (tg_may_dispatch(td, tg, bio, NULL)) {
1003 throtl_charge_bio(tg, bio);
1004 goto out;
1005 }
1006
1007queue_bio:
1008 throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu"
1009 " iodisp=%u iops=%u queued=%d/%d",
1010 rw == READ ? 'R' : 'W',
1011 tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
1012 tg->io_disp[rw], tg->iops[rw],
1013 tg->nr_queued[READ], tg->nr_queued[WRITE]);
1014
1015 throtl_add_bio_tg(q->td, tg, bio);
1016 *biop = NULL;
1017
1018 if (update_disptime) {
1019 tg_update_disptime(td, tg);
1020 throtl_schedule_next_dispatch(td);
1021 }
1022
1023out:
1024 spin_unlock_irq(q->queue_lock);
1025 return 0;
1026}
1027
1028int blk_throtl_init(struct request_queue *q)
1029{
1030 struct throtl_data *td;
1031 struct throtl_grp *tg;
1032
1033 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
1034 if (!td)
1035 return -ENOMEM;
1036
1037 INIT_HLIST_HEAD(&td->tg_list);
1038 td->tg_service_tree = THROTL_RB_ROOT;
1039 atomic_set(&td->limits_changed, 0);
1040
1041 /* Init root group */
1042 tg = &td->root_tg;
1043 INIT_HLIST_NODE(&tg->tg_node);
1044 RB_CLEAR_NODE(&tg->rb_node);
1045 bio_list_init(&tg->bio_lists[0]);
1046 bio_list_init(&tg->bio_lists[1]);
1047
1048 /* Practically unlimited BW */
1049 tg->bps[0] = tg->bps[1] = -1;
1050 tg->iops[0] = tg->iops[1] = -1;
1051
1052 /*
1053 * Set root group reference to 2. One reference will be dropped when
1054 * all groups on tg_list are being deleted during queue exit. Other
1055 * reference will remain there as we don't want to delete this group
1056 * as it is statically allocated and gets destroyed when throtl_data
1057 * goes away.
1058 */
1059 atomic_set(&tg->ref, 2);
1060 hlist_add_head(&tg->tg_node, &td->tg_list);
1061 td->nr_undestroyed_grps++;
1062
1063 INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
1064
1065 rcu_read_lock();
1066 blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td,
1067 0, BLKIO_POLICY_THROTL);
1068 rcu_read_unlock();
1069
1070 /* Attach throtl data to request queue */
1071 td->queue = q;
1072 q->td = td;
1073 return 0;
1074}
1075
1076void blk_throtl_exit(struct request_queue *q)
1077{
1078 struct throtl_data *td = q->td;
1079 bool wait = false;
1080
1081 BUG_ON(!td);
1082
1083 throtl_shutdown_timer_wq(q);
1084
1085 spin_lock_irq(q->queue_lock);
1086 throtl_release_tgs(td);
1087
1088 /* If there are other groups */
1089 if (td->nr_undestroyed_grps > 0)
1090 wait = true;
1091
1092 spin_unlock_irq(q->queue_lock);
1093
1094 /*
1095 * Wait for tg->blkg->key accessors to exit their grace periods.
1096 * Do this wait only if there are other undestroyed groups out
1097 * there (other than root group). This can happen if cgroup deletion
1098 * path claimed the responsibility of cleaning up a group before
1099 * queue cleanup code get to the group.
1100 *
1101 * Do not call synchronize_rcu() unconditionally as there are drivers
1102 * which create/delete request queue hundreds of times during scan/boot
1103 * and synchronize_rcu() can take significant time and slow down boot.
1104 */
1105 if (wait)
1106 synchronize_rcu();
1107
1108 /*
1109 * Just being safe to make sure after previous flush if some body did
1110 * update limits through cgroup and another work got queued, cancel
1111 * it.
1112 */
1113 throtl_shutdown_timer_wq(q);
1114 throtl_td_free(td);
1115}
1116
1117static int __init throtl_init(void)
1118{
1119 blkio_policy_register(&blkio_policy_throtl);
1120 return 0;
1121}
1122
1123module_init(throtl_init);
diff --git a/block/blk.h b/block/blk.h
index d6b911ac002c..f864012ec300 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -110,10 +110,6 @@ void blk_queue_congestion_threshold(struct request_queue *q);
110 110
111int blk_dev_init(void); 111int blk_dev_init(void);
112 112
113void elv_quiesce_start(struct request_queue *q);
114void elv_quiesce_end(struct request_queue *q);
115
116
117/* 113/*
118 * Return the threshold (number of used requests) at which the queue is 114 * Return the threshold (number of used requests) at which the queue is
119 * considered to be congested. It include a little hysteresis to keep the 115 * considered to be congested. It include a little hysteresis to keep the
@@ -132,14 +128,6 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
132 return q->nr_congestion_off; 128 return q->nr_congestion_off;
133} 129}
134 130
135#if defined(CONFIG_BLK_DEV_INTEGRITY)
136
137#define rq_for_each_integrity_segment(bvl, _rq, _iter) \
138 __rq_for_each_bio(_iter.bio, _rq) \
139 bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i)
140
141#endif /* BLK_DEV_INTEGRITY */
142
143static inline int blk_cpu_to_group(int cpu) 131static inline int blk_cpu_to_group(int cpu)
144{ 132{
145 int group = NR_CPUS; 133 int group = NR_CPUS;
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9eba291eb6fd..4cd59b0d7c15 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -160,6 +160,7 @@ enum wl_prio_t {
160 BE_WORKLOAD = 0, 160 BE_WORKLOAD = 0,
161 RT_WORKLOAD = 1, 161 RT_WORKLOAD = 1,
162 IDLE_WORKLOAD = 2, 162 IDLE_WORKLOAD = 2,
163 CFQ_PRIO_NR,
163}; 164};
164 165
165/* 166/*
@@ -184,10 +185,19 @@ struct cfq_group {
184 /* number of cfqq currently on this group */ 185 /* number of cfqq currently on this group */
185 int nr_cfqq; 186 int nr_cfqq;
186 187
187 /* Per group busy queus average. Useful for workload slice calc. */
188 unsigned int busy_queues_avg[2];
189 /* 188 /*
190 * rr lists of queues with requests, onle rr for each priority class. 189 * Per group busy queus average. Useful for workload slice calc. We
190 * create the array for each prio class but at run time it is used
191 * only for RT and BE class and slot for IDLE class remains unused.
192 * This is primarily done to avoid confusion and a gcc warning.
193 */
194 unsigned int busy_queues_avg[CFQ_PRIO_NR];
195 /*
196 * rr lists of queues with requests. We maintain service trees for
197 * RT and BE classes. These trees are subdivided in subclasses
198 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
199 * class there is no subclassification and all the cfq queues go on
200 * a single tree service_tree_idle.
191 * Counts are embedded in the cfq_rb_root 201 * Counts are embedded in the cfq_rb_root
192 */ 202 */
193 struct cfq_rb_root service_trees[2][3]; 203 struct cfq_rb_root service_trees[2][3];
@@ -221,7 +231,6 @@ struct cfq_data {
221 enum wl_type_t serving_type; 231 enum wl_type_t serving_type;
222 unsigned long workload_expires; 232 unsigned long workload_expires;
223 struct cfq_group *serving_group; 233 struct cfq_group *serving_group;
224 bool noidle_tree_requires_idle;
225 234
226 /* 235 /*
227 * Each priority tree is sorted by next_request position. These 236 * Each priority tree is sorted by next_request position. These
@@ -977,8 +986,8 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
977 return NULL; 986 return NULL;
978} 987}
979 988
980void 989void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
981cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) 990 unsigned int weight)
982{ 991{
983 cfqg_of_blkg(blkg)->weight = weight; 992 cfqg_of_blkg(blkg)->weight = weight;
984} 993}
@@ -2180,7 +2189,6 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
2180 slice = max_t(unsigned, slice, CFQ_MIN_TT); 2189 slice = max_t(unsigned, slice, CFQ_MIN_TT);
2181 cfq_log(cfqd, "workload slice:%d", slice); 2190 cfq_log(cfqd, "workload slice:%d", slice);
2182 cfqd->workload_expires = jiffies + slice; 2191 cfqd->workload_expires = jiffies + slice;
2183 cfqd->noidle_tree_requires_idle = false;
2184} 2192}
2185 2193
2186static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) 2194static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
@@ -3177,7 +3185,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3177 if (cfqq->queued[0] + cfqq->queued[1] >= 4) 3185 if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3178 cfq_mark_cfqq_deep(cfqq); 3186 cfq_mark_cfqq_deep(cfqq);
3179 3187
3180 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3188 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3189 enable_idle = 0;
3190 else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
3181 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3191 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3182 enable_idle = 0; 3192 enable_idle = 0;
3183 else if (sample_valid(cic->ttime_samples)) { 3193 else if (sample_valid(cic->ttime_samples)) {
@@ -3494,17 +3504,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
3494 cfq_slice_expired(cfqd, 1); 3504 cfq_slice_expired(cfqd, 1);
3495 else if (sync && cfqq_empty && 3505 else if (sync && cfqq_empty &&
3496 !cfq_close_cooperator(cfqd, cfqq)) { 3506 !cfq_close_cooperator(cfqd, cfqq)) {
3497 cfqd->noidle_tree_requires_idle |= 3507 cfq_arm_slice_timer(cfqd);
3498 !(rq->cmd_flags & REQ_NOIDLE);
3499 /*
3500 * Idling is enabled for SYNC_WORKLOAD.
3501 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
3502 * only if we processed at least one !REQ_NOIDLE request
3503 */
3504 if (cfqd->serving_type == SYNC_WORKLOAD
3505 || cfqd->noidle_tree_requires_idle
3506 || cfqq->cfqg->nr_cfqq == 1)
3507 cfq_arm_slice_timer(cfqd);
3508 } 3508 }
3509 } 3509 }
3510 3510
@@ -4090,6 +4090,7 @@ static struct blkio_policy_type blkio_policy_cfq = {
4090 .blkio_unlink_group_fn = cfq_unlink_blkio_group, 4090 .blkio_unlink_group_fn = cfq_unlink_blkio_group,
4091 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, 4091 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
4092 }, 4092 },
4093 .plid = BLKIO_POLICY_PROP,
4093}; 4094};
4094#else 4095#else
4095static struct blkio_policy_type blkio_policy_cfq; 4096static struct blkio_policy_type blkio_policy_cfq;
diff --git a/block/cfq.h b/block/cfq.h
index 93448e5a2e41..54a6d90f8e8c 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -69,7 +69,7 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
69 69
70static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, 70static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
71 struct blkio_group *blkg, void *key, dev_t dev) { 71 struct blkio_group *blkg, void *key, dev_t dev) {
72 blkiocg_add_blkio_group(blkcg, blkg, key, dev); 72 blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP);
73} 73}
74 74
75static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) 75static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
diff --git a/block/genhd.c b/block/genhd.c
index 59a2db6fecef..8313834596db 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -541,13 +541,15 @@ void add_disk(struct gendisk *disk)
541 disk->major = MAJOR(devt); 541 disk->major = MAJOR(devt);
542 disk->first_minor = MINOR(devt); 542 disk->first_minor = MINOR(devt);
543 543
544 /* Register BDI before referencing it from bdev */
545 bdi = &disk->queue->backing_dev_info;
546 bdi_register_dev(bdi, disk_devt(disk));
547
544 blk_register_region(disk_devt(disk), disk->minors, NULL, 548 blk_register_region(disk_devt(disk), disk->minors, NULL,
545 exact_match, exact_lock, disk); 549 exact_match, exact_lock, disk);
546 register_disk(disk); 550 register_disk(disk);
547 blk_register_queue(disk); 551 blk_register_queue(disk);
548 552
549 bdi = &disk->queue->backing_dev_info;
550 bdi_register_dev(bdi, disk_devt(disk));
551 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, 553 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
552 "bdi"); 554 "bdi");
553 WARN_ON(retval); 555 WARN_ON(retval);
@@ -642,6 +644,7 @@ void __init printk_all_partitions(void)
642 struct hd_struct *part; 644 struct hd_struct *part;
643 char name_buf[BDEVNAME_SIZE]; 645 char name_buf[BDEVNAME_SIZE];
644 char devt_buf[BDEVT_SIZE]; 646 char devt_buf[BDEVT_SIZE];
647 u8 uuid[PARTITION_META_INFO_UUIDLTH * 2 + 1];
645 648
646 /* 649 /*
647 * Don't show empty devices or things that have been 650 * Don't show empty devices or things that have been
@@ -660,10 +663,14 @@ void __init printk_all_partitions(void)
660 while ((part = disk_part_iter_next(&piter))) { 663 while ((part = disk_part_iter_next(&piter))) {
661 bool is_part0 = part == &disk->part0; 664 bool is_part0 = part == &disk->part0;
662 665
663 printk("%s%s %10llu %s", is_part0 ? "" : " ", 666 uuid[0] = 0;
667 if (part->info)
668 part_unpack_uuid(part->info->uuid, uuid);
669
670 printk("%s%s %10llu %s %s", is_part0 ? "" : " ",
664 bdevt_str(part_devt(part), devt_buf), 671 bdevt_str(part_devt(part), devt_buf),
665 (unsigned long long)part->nr_sects >> 1, 672 (unsigned long long)part->nr_sects >> 1,
666 disk_name(disk, part->partno, name_buf)); 673 disk_name(disk, part->partno, name_buf), uuid);
667 if (is_part0) { 674 if (is_part0) {
668 if (disk->driverfs_dev != NULL && 675 if (disk->driverfs_dev != NULL &&
669 disk->driverfs_dev->driver != NULL) 676 disk->driverfs_dev->driver != NULL)
@@ -925,8 +932,15 @@ static void disk_free_ptbl_rcu_cb(struct rcu_head *head)
925{ 932{
926 struct disk_part_tbl *ptbl = 933 struct disk_part_tbl *ptbl =
927 container_of(head, struct disk_part_tbl, rcu_head); 934 container_of(head, struct disk_part_tbl, rcu_head);
935 struct gendisk *disk = ptbl->disk;
936 struct request_queue *q = disk->queue;
937 unsigned long flags;
928 938
929 kfree(ptbl); 939 kfree(ptbl);
940
941 spin_lock_irqsave(q->queue_lock, flags);
942 elv_quiesce_end(q);
943 spin_unlock_irqrestore(q->queue_lock, flags);
930} 944}
931 945
932/** 946/**
@@ -944,11 +958,17 @@ static void disk_replace_part_tbl(struct gendisk *disk,
944 struct disk_part_tbl *new_ptbl) 958 struct disk_part_tbl *new_ptbl)
945{ 959{
946 struct disk_part_tbl *old_ptbl = disk->part_tbl; 960 struct disk_part_tbl *old_ptbl = disk->part_tbl;
961 struct request_queue *q = disk->queue;
947 962
948 rcu_assign_pointer(disk->part_tbl, new_ptbl); 963 rcu_assign_pointer(disk->part_tbl, new_ptbl);
949 964
950 if (old_ptbl) { 965 if (old_ptbl) {
951 rcu_assign_pointer(old_ptbl->last_lookup, NULL); 966 rcu_assign_pointer(old_ptbl->last_lookup, NULL);
967
968 spin_lock_irq(q->queue_lock);
969 elv_quiesce_start(q);
970 spin_unlock_irq(q->queue_lock);
971
952 call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb); 972 call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
953 } 973 }
954} 974}
@@ -989,6 +1009,7 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
989 return -ENOMEM; 1009 return -ENOMEM;
990 1010
991 new_ptbl->len = target; 1011 new_ptbl->len = target;
1012 new_ptbl->disk = disk;
992 1013
993 for (i = 0; i < len; i++) 1014 for (i = 0; i < len; i++)
994 rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); 1015 rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
@@ -1004,6 +1025,7 @@ static void disk_release(struct device *dev)
1004 kfree(disk->random); 1025 kfree(disk->random);
1005 disk_replace_part_tbl(disk, NULL); 1026 disk_replace_part_tbl(disk, NULL);
1006 free_part_stats(&disk->part0); 1027 free_part_stats(&disk->part0);
1028 free_part_info(&disk->part0);
1007 kfree(disk); 1029 kfree(disk);
1008} 1030}
1009struct class block_class = { 1031struct class block_class = {
diff --git a/block/ioctl.c b/block/ioctl.c
index d8052f0dabd3..2c15fe0912c4 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -62,7 +62,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
62 62
63 /* all seems OK */ 63 /* all seems OK */
64 part = add_partition(disk, partno, start, length, 64 part = add_partition(disk, partno, start, length,
65 ADDPART_FLAG_NONE); 65 ADDPART_FLAG_NONE, NULL);
66 mutex_unlock(&bdev->bd_mutex); 66 mutex_unlock(&bdev->bd_mutex);
67 return IS_ERR(part) ? PTR_ERR(part) : 0; 67 return IS_ERR(part) ? PTR_ERR(part) : 0;
68 case BLKPG_DEL_PARTITION: 68 case BLKPG_DEL_PARTITION: